Fixed erroneous timing in reported Mailgun metrics

- we have calls to the metrics library so we can measure the time it
  takes the Mailgun API to return a response
- however, there's a bug in the code whereby if the `batchHandler`
  takes a long time and then throws an error, this time will be reported
  to metrics
- this is misleading because it looks like Mailgun is taking a long time
  if the databases are slow
- this pulls the specific SDK call out into a function so it's easier to
  wrap with timing code
This commit is contained in:
Daniel Lockyer 2023-04-19 08:58:23 +01:00 committed by Daniel Lockyer
parent eff4662af3
commit 9bb2bc82bd

View File

@ -110,6 +110,29 @@ module.exports = class MailgunClient {
}
}
/**
* @param {import('mailgun.js').default} mailgunInstance
* @param {Object} mailgunConfig
* @param {Object} mailgunOptions
*/
async getEventsFromMailgun(mailgunInstance, mailgunConfig, mailgunOptions) {
const startTime = Date.now();
try {
const page = await mailgunInstance.events.get(mailgunConfig.domain, mailgunOptions);
metrics.metric('mailgun-get-events', {
value: Date.now() - startTime,
statusCode: 200
});
return page;
} catch (error) {
metrics.metric('mailgun-get-events', {
value: Date.now() - startTime,
statusCode: error.status
});
throw error;
}
}
/**
* Fetches events from Mailgun
* @param {Object} mailgunOptions
@ -127,14 +150,11 @@ module.exports = class MailgunClient {
debug(`fetchEvents: starting fetching first events page`);
const mailgunConfig = this.#getConfig();
let startTime = Date.now();
const startDate = new Date();
try {
let page = await mailgunInstance.events.get(mailgunConfig.domain, mailgunOptions);
metrics.metric('mailgun-get-events', {
value: Date.now() - startTime,
statusCode: 200
});
let page = await this.getEventsFromMailgun(mailgunInstance, mailgunConfig, mailgunOptions);
// By limiting the processed events to ones created before this job started we cancel early ready for the next job run.
// Avoids chance of events being missed in long job runs due to mailgun's eventual-consistency creating events outside of our 30min sliding re-check window
let events = (page?.items?.map(this.normalizeEvent) || []).filter(e => !!e && e.timestamp <= startDate);
@ -153,26 +173,17 @@ module.exports = class MailgunClient {
const nextPageId = page.pages.next.page;
debug(`fetchEvents: starting fetching next page ${nextPageId}`);
startTime = Date.now();
page = await mailgunInstance.events.get(mailgunConfig.domain, {
page = await this.getEventsFromMailgun(mailgunInstance, mailgunConfig, {
page: nextPageId,
...mailgunOptions
});
metrics.metric('mailgun-get-events', {
value: Date.now() - startTime,
statusCode: 200
});
// We need to cap events at the time we started fetching them (see comment above)
events = (page?.items?.map(this.normalizeEvent) || []).filter(e => !!e && e.timestamp <= startDate);
debug(`fetchEvents: finished fetching next page with ${events.length} events`);
}
} catch (error) {
// Log and re-throw Mailgun errors
logging.error(error);
metrics.metric('mailgun-get-events', {
value: Date.now() - startTime,
statusCode: error.status
});
throw error;
}
}