/*
 * @crawlee/core
 * Version:
 * The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs with (but not limited to) headless Chrome and Puppeteer.
 * 148 lines • 5.39 kB
 * JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.ErrorSnapshotter = void 0;
const tslib_1 = require("tslib");
const node_crypto_1 = tslib_1.__importDefault(require("node:crypto"));
/**
 * ErrorSnapshotter captures a screenshot of the page and a snapshot of the HTML
 * when an error occurs during web crawling, and stores them in a key-value store.
 *
 * This functionality is opt-in, and can be enabled via the crawler options:
 *
 * ```ts
 * const crawler = new BasicCrawler({
 *     // ...
 *     statisticsOptions: {
 *         saveErrorSnapshots: true,
 *     },
 * });
 * ```
 */
class ErrorSnapshotter {
    /**
     * Capture a snapshot of the error context.
     *
     * This is best-effort and never throws. It resolves to an empty object when:
     * - the context is missing or yields no key-value store,
     * - the context has neither a `page` nor a `body`,
     * - anything throws while capturing.
     *
     * With a `page`, `context.saveSnapshot` is tried first; if that yields no HTML
     * file name, the HTML is read via `page.content()` and stored directly.
     * Without a `page`, the `body` is stored only when it is a string; any other
     * truthy body produces a result whose fields are all `undefined`.
     *
     * @param {*} error The thrown value; used only to derive the snapshot file name.
     * @param {object} [context] Crawling context. The properties read here are
     *   `page`, `body`, `getKeyValueStore()` and (via `contextCaptureSnapshot`) `saveSnapshot()`.
     * @returns {Promise<object>} `{ screenshotFileName, screenshotFileUrl, htmlFileName, htmlFileUrl }`
     *   (each possibly `undefined`), or `{}`.
     */
    async captureSnapshot(error, context) {
        try {
            const page = context?.page;
            const body = context?.body;
            const keyValueStore = await context?.getKeyValueStore();
            // If the key-value store is not available, or the body and page are not available, return empty filenames
            if (!keyValueStore || (!body && !page)) {
                return {};
            }
            const fileName = this.generateFilename(error);
            let screenshotFileName;
            let htmlFileName;
            if (page) {
                const capturedFiles = await this.contextCaptureSnapshot(context, fileName);
                if (capturedFiles) {
                    screenshotFileName = capturedFiles.screenshotFileName;
                    htmlFileName = capturedFiles.htmlFileName;
                }
                // If the snapshot for browsers failed to capture the HTML, try to capture it from the page content
                if (!htmlFileName) {
                    const html = await page.content();
                    htmlFileName = html ? await this.saveHTMLSnapshot(html, keyValueStore, fileName) : undefined;
                }
            } else if (typeof body === 'string') {
                // for non-browser contexts
                htmlFileName = await this.saveHTMLSnapshot(body, keyValueStore, fileName);
            }
            // `await` is a no-op for stores whose `getPublicUrl` is synchronous, and unwraps
            // the URL for stores that resolve it asynchronously. Without it such stores would
            // leak pending Promises into the result, and a rejection would bypass the catch below.
            return {
                screenshotFileName,
                screenshotFileUrl: screenshotFileName && (await keyValueStore.getPublicUrl(screenshotFileName)),
                htmlFileName,
                htmlFileUrl: htmlFileName && (await keyValueStore.getPublicUrl(htmlFileName)),
            };
        } catch {
            // Snapshotting must never turn into a second failure for the caller.
            return {};
        }
    }
    /**
     * Captures a snapshot of the current page using the `context.saveSnapshot` function.
     * This function is applicable for browser contexts only.
     *
     * @param {object} context Crawling context exposing `saveSnapshot({ key })`.
     * @param {string} fileName Base name (without extension) passed as the snapshot key.
     * @returns {Promise<object|undefined>} `{ screenshotFileName, htmlFileName }` built as
     *   `<fileName>.jpg` / `<fileName>.html`, or `undefined` when `saveSnapshot` throws.
     *   NOTE(review): the extensions are assumed to match what `saveSnapshot` writes — confirm.
     */
    async contextCaptureSnapshot(context, fileName) {
        try {
            await context.saveSnapshot({ key: fileName });
            return {
                screenshotFileName: `${fileName}.jpg`,
                htmlFileName: `${fileName}.html`,
            };
        } catch {
            return undefined;
        }
    }
    /**
     * Save the HTML snapshot of the page, and return the fileName with the extension.
     *
     * NOTE(review): the record is stored under the bare `fileName` key while the returned
     * name carries the `.html` suffix (and is later passed to `getPublicUrl`). Presumably the
     * store derives the extension from the content type — verify against the store in use.
     *
     * @param {string} html HTML markup to store.
     * @param {object} keyValueStore Store exposing `setValue(key, value, options)`.
     * @param {string} fileName Base name (without extension) used as the record key.
     * @returns {Promise<string|undefined>} `<fileName>.html`, or `undefined` when storing fails.
     */
    async saveHTMLSnapshot(html, keyValueStore, fileName) {
        try {
            await keyValueStore.setValue(fileName, html, { contentType: 'text/html' });
            return `${fileName}.html`;
        } catch {
            return undefined;
        }
    }
    /**
     * Generate a fileName for an error snapshot, of the form `<prefix>_<hash>_<message>`:
     * - `<hash>` is the SHA-1 hex digest of the stack (or the message, or an empty string),
     *   cut to `MAX_HASH_LENGTH` characters;
     * - `<message>` is the first `MAX_ERROR_CHARACTERS` characters of the message
     *   (`BASE_MESSAGE` when the message is missing or falsy).
     *
     * Every run of non-word characters becomes a single dash and the result is cut to
     * `MAX_FILENAME_LENGTH` characters. Errors with the same stack and message share a name.
     *
     * @param {*} error The thrown value. `null`/`undefined` and non-string `stack`/`message`
     *   values are tolerated.
     * @returns {string} The sanitized file name, without extension.
     */
    generateFilename(error) {
        const { SNAPSHOT_PREFIX, BASE_MESSAGE, MAX_HASH_LENGTH, MAX_ERROR_CHARACTERS, MAX_FILENAME_LENGTH } =
            ErrorSnapshotter;
        // Anything can be thrown in JavaScript, so do not assume an Error instance:
        // optional chaining tolerates null/undefined, String() tolerates non-string fields.
        const stack = error?.stack;
        const message = error?.message;
        // Create a hash of the error stack trace
        const errorStackHash = node_crypto_1.default
            .createHash('sha1')
            .update(String(stack || message || ''))
            .digest('hex')
            .slice(0, MAX_HASH_LENGTH);
        const errorMessagePrefix = String(message || BASE_MESSAGE).slice(0, MAX_ERROR_CHARACTERS).trim();
        /**
         * Remove non-word characters from the start and end of a string.
         */
        const sanitizeString = (str) => {
            return str.replace(/^\W+|\W+$/g, '');
        };
        // Generate fileName and remove disallowed characters
        const fileName = `${SNAPSHOT_PREFIX}_${sanitizeString(errorStackHash)}_${sanitizeString(errorMessagePrefix)}`
            .replace(/\W+/g, '-') // Replace non-word characters with a dash
            .slice(0, MAX_FILENAME_LENGTH);
        return fileName;
    }
}
// Publish the class on the CommonJS exports object.
exports.ErrorSnapshotter = ErrorSnapshotter;
// Static configuration read by `ErrorSnapshotter.generateFilename`. Each entry is attached
// as an enumerable, configurable and writable data property, in the order listed.
for (const [staticName, staticValue] of [
    // Maximum number of characters of the error message used in the file name.
    ['MAX_ERROR_CHARACTERS', 30],
    // Maximum number of hex characters kept from the hash.
    ['MAX_HASH_LENGTH', 30],
    // Upper bound on the length of the whole generated file name.
    ['MAX_FILENAME_LENGTH', 250],
    // Message used when the error carries no message of its own.
    ['BASE_MESSAGE', 'An error occurred'],
    // Leading segment of every generated file name.
    ['SNAPSHOT_PREFIX', 'ERROR_SNAPSHOT'],
]) {
    Object.defineProperty(ErrorSnapshotter, staticName, {
        enumerable: true,
        configurable: true,
        writable: true,
        value: staticValue,
    });
}
//# sourceMappingURL=error_snapshotter.js.map