// @openactive/harvesting-utils
// Utils library for harvesting RPDE feeds
const { default: axios } = require('axios');
const { performance } = require('perf_hooks');
const { FeedPageChecker } = require('@openactive/rpde-validator');
const sleep = require('util').promisify(setTimeout);
const { createFeedContext } = require('./feed-context-utils');
const { jsonParseAllowingBigInts } = require('./utils');
// Default pause, in milliseconds, before polling again once the end of the feed has been reached.
// Used when `config.howLongToSleepAtFeedEnd` is not supplied.
const DEFAULT_HOW_LONG_TO_SLEEP_AT_FEED_END = 500;
/**
* @typedef {import('./models/FeedContext').FeedContext} FeedContext
* @typedef {import('./models/RpdePageProcessor').RpdePageProcessor} RpdePageProcessor
* @typedef {import('./models/HarvestRpde').HarvestRpdeArgs} HarvestRpdeArgs
*/
/**
 * Harvests an RPDE feed page by page, starting at `baseUrl` and following each page's `next` link.
 *
 * Each iteration optionally waits (`optionallyWaitBeforeNextRequest`), fetches the current URL with
 * the headers returned by `headers()`, validates the page with `FeedPageChecker`, and then either:
 * - calls `onReachedEndOfFeed` and sleeps, when the page's `next` equals the requested URL and the
 *   page has no items (the same URL is then polled again); or
 * - updates the feed context, calls `processPage`, and moves on to `next`.
 *
 * The loop never finishes successfully: the function only returns when a fatal condition occurs,
 * in which case the result's `error.type` is one of:
 * - `unexpected-non-http-error`: the response body was not a JSON object, or a non-axios error was
 *   thrown while fetching/processing the page;
 * - `rpde-validation-error`: `FeedPageChecker` reported validation errors for the page;
 * - `feed-not-found`: the server responded with 404 or 410;
 * - `retry-limit-exceeded-for-http-error`: any other axios error persisted after 12 retries.
 *
 * Errors thrown by `optionallyWaitBeforeNextRequest()` or `headers()` are not caught here and
 * reject the returned promise.
 *
 * @param {HarvestRpdeArgs} args
 * @param {boolean} [isLosslessMode=false] If true, the page handed to `processPage` is the
 *   high-fidelity parse (`jsonParseAllowingBigInts`, with every item's `modified` converted to a
 *   string); otherwise it is the plain `JSON.parse` result. Validation always uses the latter.
 *
 * @returns {Promise<import('./models/HarvestRpde').HarvestRpdeResponse>} Only returns if there is a fatal error.
 */
async function baseHarvestRPDE({
  baseUrl,
  feedContextIdentifier,
  headers,
  processPage,
  onReachedEndOfFeed,
  onRetryDueToHttpError,
  optionallyWaitBeforeNextRequest,
  isOrdersFeed,
  overrideContext,
  // Defaults are applied per property so that a partially-specified `loggingFns` still works
  loggingFns: {
    log = console.log,
    // eslint-disable-next-line no-unused-vars
    logError = console.error,
    logErrorDuringHarvest = x => console.error(`\n\n${x}\n\n\n\n\n\n\n\n\n`), // Ensure messages are displayed above the multibar output
  } = {},
  config: {
    howLongToSleepAtFeedEnd,
    REQUEST_LOGGING_ENABLED = false,
  } = {},
}, isLosslessMode = false) {
  const pageDescriptiveIdentifier = (url, thisHeaders) => `RPDE feed ${feedContextIdentifier} page "${url}" (request headers: ${JSON.stringify(thisHeaders)})`;
  let isInitialHarvestComplete = false;
  let numberOfRetries = 0;

  const context = overrideContext || createFeedContext(baseUrl);

  let url = baseUrl;

  // One instance of FeedPageChecker per feed, as it maintains state relating to the feed
  const feedChecker = new FeedPageChecker();

  // Harvest forever, until a fatal error (e.g. a 404) is encountered
  for (; ;) {
    if (optionallyWaitBeforeNextRequest) await optionallyWaitBeforeNextRequest();

    const headersForThisRequest = await headers();

    try {
      const axiosConfig = {
        headers: headersForThisRequest,
        // Parse the raw body twice: once with JSON.parse (for validation) and once preserving
        // BigInts (for lossless mode). Both results are returned side by side.
        transformResponse: (data) => {
          let lowFidelityData;
          try {
            lowFidelityData = JSON.parse(data);
          } catch (error) {
            // Return the original data if parsing fails
            return data;
          }
          const highFidelityData = /** @type {any} */(jsonParseAllowingBigInts(data));
          // Store `modified`s as strings if 'items' exists
          const rpdeItemsWithStringModifieds = highFidelityData.items?.map(item => ({
            ...item,
            modified: String(item.modified),
          }));
          return {
            lowFidelityData,
            highFidelityData: { ...highFidelityData, items: rpdeItemsWithStringModifieds },
          };
        },
      };

      const timerStart = performance.now();
      // Axios will throw an error if the status code is not 2xx
      const response = await axios.get(url, axiosConfig);
      const timerEnd = performance.now();
      const responseTime = timerEnd - timerStart;

      // `response.data` is only an object containing an object `lowFidelityData` when the body
      // parsed as a JSON object/array (see `transformResponse` above)
      if (!response.data || typeof response.data !== 'object' || !response.data.lowFidelityData || typeof response.data.lowFidelityData !== 'object') {
        const error = new Error(`${pageDescriptiveIdentifier(url, headersForThisRequest)} with response code ${response.status} is not valid JSON:\n\nResponse: ${response.data}`);
        return {
          error: {
            type: 'unexpected-non-http-error',
            reqUrl: url,
            reqHeaders: headersForThisRequest,
            error,
            message: error.message,
          },
        };
      }

      // Low fidelity data must be used for validation as validator cannot deal with BigInts.
      // High fidelity data contains modified as a string that contains a BigInt. This breaks RPDE validation rules that
      // state that modified can't be a string representation of a number.
      const validationJson = response.data.lowFidelityData;

      // Validate RPDE page using RPDE Validator, noting that for non-2xx responses that we want to
      // retry, axios will have already thrown an error above
      const rpdeValidationErrors = feedChecker.validateRpdePage({
        url,
        json: validationJson,
        pageIndex: context.pageIndex,
        contentType: response.headers['content-type'],
        cacheControl: response.headers['cache-control'],
        status: response.status,
        isInitialHarvestComplete,
        isOrdersFeed,
      });

      if (rpdeValidationErrors.length > 0) {
        return {
          error: {
            type: 'rpde-validation-error',
            reqUrl: url,
            reqHeaders: headersForThisRequest,
            rpdeValidationErrors,
          },
        };
      }

      const json = isLosslessMode ? response.data.highFidelityData : response.data.lowFidelityData;

      context.currentPage = url;
      if (json.next === url && json.items.length === 0) {
        // End of feed: `url` is left unchanged, so the same page is polled again next iteration
        await onReachedEndOfFeed({
          lastPageUrl: url,
          isInitialHarvestComplete,
          responseTime,
        });
        isInitialHarvestComplete = true;
        // Potentially poll more slowly at the end of the feed
        await sleep(
          howLongToSleepAtFeedEnd
            ? howLongToSleepAtFeedEnd()
            : DEFAULT_HOW_LONG_TO_SLEEP_AT_FEED_END,
        );
      } else {
        context.responseTimes.push(responseTime);
        // Maintain a buffer of the last 5 items
        if (context.responseTimes.length > 5) context.responseTimes.shift();
        context.pageIndex += 1;
        context.items += json.items.length;
        delete context.sleepMode;
        if (REQUEST_LOGGING_ENABLED) {
          const kind = json.items && json.items[0] && json.items[0].kind;
          log(
            `RPDE kind: ${kind}, page: ${context.pageIndex}, length: ${json.items.length}, next: '${json.next}'`,
          );
        }
        await processPage({
          rpdePage: json,
          feedContextIdentifier,
          // Passed as a function so the processor can read the latest value
          // eslint-disable-next-line no-loop-func
          isInitialHarvestComplete: () => isInitialHarvestComplete,
          reqUrl: url,
          responseTime,
        });
        url = json.next;
      }
      // A page was handled successfully, so any earlier failures no longer count towards the limit
      numberOfRetries = 0;
    } catch (error) {
      if (!error.isAxiosError) {
        return {
          error: {
            type: 'unexpected-non-http-error',
            reqUrl: url,
            reqHeaders: headersForThisRequest,
            error,
            message: `Error for ${pageDescriptiveIdentifier(url, headersForThisRequest)}: ${error.message}\n${error.stack}`,
          },
        };
      }

      // `error.response` is absent when no response was received at all (e.g. connection refused
      // or timeout), so the status code must always be read defensively.
      const resStatusCode = error.response?.status;

      if (resStatusCode === 404 || resStatusCode === 410) {
        // As per
        // https://openactive.io/realtime-paged-data-exchange/#http-status-codes,
        // consider this endpoint in an error state and do not retry. If 404,
        // simply stop polling feed
        return {
          error: {
            type: 'feed-not-found',
            reqUrl: url,
            reqHeaders: headersForThisRequest,
            resStatusCode,
            error,
          },
        };
      }

      // Otherwise, it's an HTTP error, for which we'll attempt to retry
      let responseDetails = '';
      if (error.response) {
        // Error responses also pass through `transformResponse`, so a JSON body is found under
        // `lowFidelityData`; anything else is the raw body
        const responseBody = typeof error.response.data?.lowFidelityData === 'object'
          ? JSON.stringify(error.response.data.lowFidelityData, null, 2)
          : error.response.data;
        responseDetails = `\n\nResponse: ${responseBody}`;
      }
      logErrorDuringHarvest(`Error ${resStatusCode ?? 'without response'} for ${pageDescriptiveIdentifier(url, headersForThisRequest)} (attempt ${numberOfRetries}): ${error.message}.${responseDetails}`);

      // Force retry, after a delay, up to 12 times
      if (numberOfRetries < 12) {
        numberOfRetries += 1;
        const delay = 5000;
        if (onRetryDueToHttpError) {
          await onRetryDueToHttpError(
            url,
            headersForThisRequest,
            resStatusCode, // undefined when no response was received
            error,
            delay,
            numberOfRetries,
          );
        }
        await sleep(delay);
      } else {
        return {
          error: {
            type: 'retry-limit-exceeded-for-http-error',
            reqUrl: url,
            reqHeaders: headersForThisRequest,
            resStatusCode, // undefined when no response was received
            error,
            numberOfRetries,
          },
        };
      }
    }
  }
}
/**
 * Harvests an RPDE feed in standard mode, i.e. delegates to `baseHarvestRPDE` with
 * `isLosslessMode` set to `false`.
 *
 * @param {HarvestRpdeArgs} args
 *
 * @returns {Promise<import('./models/HarvestRpde').HarvestRpdeResponse>} Only returns if there is a fatal error.
 */
function harvestRPDE(args) {
  const isLosslessMode = false;
  return baseHarvestRPDE(args, isLosslessMode);
}
/**
 * Harvests an RPDE feed in lossless mode, i.e. delegates to `baseHarvestRPDE` with
 * `isLosslessMode` set to `true`.
 *
 * @param {HarvestRpdeArgs} args
 *
 * @returns {Promise<import('./models/HarvestRpde').HarvestRpdeResponse>} Only returns if there is a fatal error.
 */
function harvestRPDELossless(args) {
  const isLosslessMode = true;
  return baseHarvestRPDE(args, isLosslessMode);
}
// Public API: only the two mode-specific entry points are exported; `baseHarvestRPDE` is not.
module.exports = { harvestRPDE, harvestRPDELossless };