@crawlee/utils
A set of shared utilities that can be used by crawlers
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Sitemap = void 0;
exports.parseSitemap = parseSitemap;
const tslib_1 = require("tslib");
const node_crypto_1 = require("node:crypto");
const node_stream_1 = require("node:stream");
const node_string_decoder_1 = require("node:string_decoder");
const node_zlib_1 = require("node:zlib");
const sax_1 = tslib_1.__importDefault(require("sax"));
const whatwg_mimetype_1 = tslib_1.__importDefault(require("whatwg-mimetype"));
const log_1 = tslib_1.__importDefault(require("@apify/log"));
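/**
 * Transform stream that parses plain-text sitemaps: every non-empty line is emitted
 * as a `{ type: 'url', loc }` record.
 */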
class SitemapTxtParser extends node_stream_1.Transform {
constructor() {
super({
readableObjectMode: true,
transform: (chunk, _encoding, callback) => {
this.processBuffer(this.decoder.write(chunk), false);
callback();
},
flush: (callback) => {
this.processBuffer(this.decoder.end(), true);
callback();
},
});
Object.defineProperty(this, "decoder", {
enumerable: true,
configurable: true,
writable: true,
value: new node_string_decoder_1.StringDecoder('utf8')
});
Object.defineProperty(this, "buffer", {
enumerable: true,
configurable: true,
writable: true,
value: ''
});
}
processBuffer(input, finalize) {
this.buffer += input;
if (finalize || this.buffer.includes('\n')) {
const parts = this.buffer
.split('\n')
.map((part) => part.trim())
.filter((part) => part.length > 0);
if (finalize) {
for (const url of parts) {
this.push({ type: 'url', loc: url });
}
this.buffer = '';
}
else if (parts.length > 0) {
for (const url of parts.slice(0, -1)) {
this.push({ type: 'url', loc: url });
}
this.buffer = parts.at(-1);
}
}
}
}
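/**
 * Transform stream that parses XML sitemaps and sitemap index files with a streaming SAX parser.
 * Emits `{ type: 'url', ... }` records for `<urlset>` entries and `{ type: 'sitemapUrl', url }`
 * records for sitemaps referenced from a `<sitemapindex>`.
 */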
class SitemapXmlParser extends node_stream_1.Transform {
constructor() {
super({
readableObjectMode: true,
transform: (chunk, _encoding, callback) => {
this.parser.write(this.decoder.write(chunk));
callback();
},
flush: (callback) => {
const rest = this.decoder.end();
if (rest.length > 0) {
this.parser.write(rest);
}
this.parser.end();
callback();
},
});
Object.defineProperty(this, "decoder", {
enumerable: true,
configurable: true,
writable: true,
value: new node_string_decoder_1.StringDecoder('utf8')
});
Object.defineProperty(this, "parser", {
enumerable: true,
configurable: true,
writable: true,
value: new sax_1.default.SAXParser(true)
});
Object.defineProperty(this, "rootTagName", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
Object.defineProperty(this, "currentTag", {
enumerable: true,
configurable: true,
writable: true,
value: undefined
});
Object.defineProperty(this, "url", {
enumerable: true,
configurable: true,
writable: true,
value: {}
});
this.parser.onopentag = this.onOpenTag.bind(this);
this.parser.onclosetag = this.onCloseTag.bind(this);
this.parser.ontext = this.onText.bind(this);
this.parser.oncdata = this.onText.bind(this);
this.parser.onerror = this.destroy.bind(this);
}
onOpenTag(node) {
if (this.rootTagName !== undefined) {
if (node.name === 'loc' ||
node.name === 'lastmod' ||
node.name === 'priority' ||
node.name === 'changefreq') {
this.currentTag = node.name;
}
}
if (node.name === 'urlset') {
this.rootTagName = 'urlset';
}
if (node.name === 'sitemapindex') {
this.rootTagName = 'sitemapindex';
}
}
onCloseTag(name) {
if (name === 'loc' || name === 'lastmod' || name === 'priority' || name === 'changefreq') {
this.currentTag = undefined;
}
if (name === 'url' && this.url.loc !== undefined) {
this.push({ type: 'url', ...this.url, loc: this.url.loc });
this.url = {};
}
}
onText(text) {
if (this.currentTag === 'loc') {
if (this.rootTagName === 'sitemapindex') {
this.push({ type: 'sitemapUrl', url: text.trim() });
}
if (this.rootTagName === 'urlset') {
this.url ?? (this.url = {});
this.url.loc = text.trim();
}
}
text = text.trim();
if (this.currentTag === 'lastmod') {
this.url.lastmod = new Date(text);
}
if (this.currentTag === 'priority') {
this.url.priority = Number(text);
}
if (this.currentTag === 'changefreq') {
if (['always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never'].includes(text)) {
this.url.changefreq = text;
}
}
}
}
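/**
 * Fetches and parses the given sitemap sources (URLs or raw XML content), following references
 * to nested sitemaps up to `maxDepth`, and yields the discovered URL records one by one.
 */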
async function* parseSitemap(initialSources, proxyUrl, options) {
const { gotScraping } = await import('got-scraping');
const { fileTypeStream } = await import('file-type');
const { emitNestedSitemaps = false, maxDepth = Infinity, sitemapRetries = 3, networkTimeouts, reportNetworkErrors = true, } = options ?? {};
const sources = [...initialSources];
const visitedSitemapUrls = new Set();
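    // Pick a parser based on the Content-Type header, falling back to the URL's file extension.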
const createParser = (contentType = '', url) => {
let mimeType;
try {
mimeType = new whatwg_mimetype_1.default(contentType);
}
catch {
mimeType = null;
}
if (mimeType?.isXML() || url?.pathname.endsWith('.xml')) {
return new SitemapXmlParser();
}
if (mimeType?.essence === 'text/plain' || url?.pathname.endsWith('.txt')) {
return new SitemapTxtParser();
}
throw new Error(`Unsupported sitemap content type (contentType = ${contentType}, url = ${url?.toString()})`);
};
while (sources.length > 0) {
const source = sources.shift();
if ((source?.depth ?? 0) > maxDepth) {
log_1.default.debug(`Skipping sitemap ${source.type === 'url' ? source.url : ''} because it reached max depth ${maxDepth}.`);
continue;
}
let items = null;
if (source.type === 'url') {
const sitemapUrl = new URL(source.url);
visitedSitemapUrls.add(sitemapUrl.toString());
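            // Attempt to download and parse the sitemap, retrying up to `sitemapRetries` times on failure.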
let retriesLeft = sitemapRetries + 1;
while (retriesLeft-- > 0) {
try {
const sitemapStream = await new Promise((resolve, reject) => {
const request = gotScraping.stream({
url: sitemapUrl,
proxyUrl,
method: 'GET',
timeout: networkTimeouts,
headers: {
accept: 'text/plain, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8',
},
});
request.on('response', () => resolve(request));
request.on('error', reject);
});
let error = null;
if (sitemapStream.response.statusCode >= 200 && sitemapStream.response.statusCode < 300) {
let contentType = sitemapStream.response.headers['content-type'];
const streamWithType = await fileTypeStream(sitemapStream);
if (streamWithType.fileType !== undefined) {
contentType = streamWithType.fileType.mime;
}
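                        // Gzipped sitemaps are decompressed below; the `.gz` suffix is stripped so that
                        // the parser can be chosen based on the remaining file extension.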
let isGzipped = false;
if (contentType !== undefined
? contentType === 'application/gzip'
: sitemapUrl.pathname.endsWith('.gz')) {
isGzipped = true;
if (sitemapUrl.pathname.endsWith('.gz')) {
sitemapUrl.pathname = sitemapUrl.pathname.substring(0, sitemapUrl.pathname.length - 3);
}
}
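                        // Stream the (possibly gzipped) response body through the selected parser;
                        // parser errors are captured in `error` and handled after the pipeline is set up.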
items = (0, node_stream_1.pipeline)(streamWithType, isGzipped ? (0, node_zlib_1.createGunzip)() : new node_stream_1.PassThrough(), createParser(contentType, sitemapUrl), (e) => {
if (e !== undefined && e !== null) {
error = { type: 'parser', error: e };
}
});
}
else {
error = {
type: 'fetch',
error: new Error(`Failed to fetch sitemap: ${sitemapUrl}, status code: ${sitemapStream.response.statusCode}`),
};
}
if (error !== null) {
const shouldIgnoreError = error.type === 'fetch' && !reportNetworkErrors;
if (!shouldIgnoreError) {
throw error.error;
}
}
else {
break;
}
}
catch (e) {
log_1.default.warning(`Malformed sitemap content: ${sitemapUrl}, ${retriesLeft === 0 ? 'no retries left.' : 'retrying...'} (${e})`);
}
}
}
else if (source.type === 'raw') {
items = (0, node_stream_1.pipeline)(node_stream_1.Readable.from([source.content]), createParser('text/xml'), (error) => {
if (error !== undefined) {
log_1.default.warning(`Malformed sitemap content: ${error}`);
}
});
}
if (items === null) {
continue;
}
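        // Queue any not-yet-visited nested sitemaps and yield the URL records together with their origin sitemap.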
for await (const item of items) {
if (item.type === 'sitemapUrl' && !visitedSitemapUrls.has(item.url)) {
sources.push({ type: 'url', url: item.url, depth: (source.depth ?? 0) + 1 });
if (emitNestedSitemaps) {
yield { loc: item.url, originSitemapUrl: null };
}
}
if (item.type === 'url') {
yield {
...item,
originSitemapUrl: source.type === 'url'
? source.url
: `raw://${(0, node_crypto_1.createHash)('sha256').update(source.content).digest('base64')}`,
};
}
}
}
}
/**
 * Loads one or more sitemaps from the given URLs, following references in sitemap index files, and exposes the contained URLs.
*
* **Example usage:**
* ```javascript
* // Load a sitemap
* const sitemap = await Sitemap.load(['https://example.com/sitemap.xml', 'https://example.com/sitemap_2.xml.gz']);
*
 * // Enqueue all the contained URLs (including those from sub-sitemaps referenced by sitemap index files)
* await crawler.addRequests(sitemap.urls);
* ```
*/
class Sitemap {
constructor(urls) {
Object.defineProperty(this, "urls", {
enumerable: true,
configurable: true,
writable: true,
value: urls
});
}
/**
     * Try to load the sitemap from the most common locations: `/sitemap.xml` and `/sitemap.txt`.
* For loading based on `Sitemap` entries in `robots.txt`, the {@link RobotsTxtFile} class should be used.
* @param url The domain URL to fetch the sitemap for.
* @param proxyUrl A proxy to be used for fetching the sitemap file.
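     * @example
     * A minimal usage sketch (the domain is hypothetical):
     * ```javascript
     * const sitemap = await Sitemap.tryCommonNames('https://example.com');
     * console.log(sitemap.urls);
     * ```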
*/
static async tryCommonNames(url, proxyUrl) {
const sitemapUrls = [];
const sitemapUrl = new URL(url);
sitemapUrl.search = '';
sitemapUrl.pathname = '/sitemap.xml';
sitemapUrls.push(sitemapUrl.toString());
sitemapUrl.pathname = '/sitemap.txt';
sitemapUrls.push(sitemapUrl.toString());
return Sitemap.load(sitemapUrls, proxyUrl, { reportNetworkErrors: false });
}
/**
     * Fetch sitemap content from the given URL or URLs and return the URLs of referenced pages.
     * @param urls Sitemap URL(s).
     * @param proxyUrl URL of a proxy to be used for fetching sitemap contents.
     * @param parseSitemapOptions Options passed on to {@link parseSitemap}.
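     * @example
     * A minimal usage sketch (the URL is hypothetical); `maxDepth: 1` limits how deep nested sitemaps are followed:
     * ```javascript
     * const sitemap = await Sitemap.load('https://example.com/sitemap_index.xml', undefined, { maxDepth: 1 });
     * console.log(sitemap.urls);
     * ```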
*/
static async load(urls, proxyUrl, parseSitemapOptions) {
return await this.parse((Array.isArray(urls) ? urls : [urls]).map((url) => ({ type: 'url', url })), proxyUrl, parseSitemapOptions);
}
/**
     * Parse XML sitemap content from a string and return the URLs of referenced pages. If the sitemap references other sitemaps, they will be loaded via HTTP.
* @param content XML sitemap content
* @param proxyUrl URL of a proxy to be used for fetching sitemap contents
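     * @example
     * A minimal usage sketch with inline XML content:
     * ```javascript
     * const sitemap = await Sitemap.fromXmlString(
     *     '<?xml version="1.0" encoding="UTF-8"?>' +
     *         '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' +
     *         '<url><loc>https://example.com/</loc></url>' +
     *         '</urlset>',
     * );
     * console.log(sitemap.urls); // ['https://example.com/']
     * ```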
*/
static async fromXmlString(content, proxyUrl) {
return await this.parse([{ type: 'raw', content }], proxyUrl);
}
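    /**
     * Collects page URLs from the given sitemap sources into a `Sitemap` instance.
     * Any failure while parsing results in an empty `Sitemap` being returned.
     */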
static async parse(sources, proxyUrl, parseSitemapOptions) {
const urls = [];
try {
for await (const item of parseSitemap(sources, proxyUrl, parseSitemapOptions)) {
urls.push(item.loc);
}
}
catch {
return new Sitemap([]);
}
return new Sitemap(urls);
}
}
exports.Sitemap = Sitemap;
//# sourceMappingURL=sitemap.js.map