UNPKG

chen-crawler

Version:

Web Crawler Provider for Chen Framework

168 lines 6.15 kB
"use strict";
// TypeScript-emitted helper: runs a generator function as a promise-returning
// coroutine, so every `yield <promise>` inside it behaves like an `await`.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator.throw(value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : new P(function (resolve) { resolve(result.value); }).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments)).next());
    });
};
const base_1 = require('./base');
const cheerio = require('cheerio');
const zlib = require('zlib');
const cluster = require('cluster');
const numCPUs = require('os').cpus().length;
/**
 * Fork one worker per CPU core except one (the calling process keeps
 * crawling itself). Must only be called from the cluster master.
 */
function forkWorkers() {
    for (let i = 1; i < numCPUs; i++) {
        cluster.fork();
    }
}
/**
 * Crawler that seeds its URL queue from an XML sitemap (`urlset`) or a
 * sitemap index (`sitemapindex`) and then drains that queue, spreading the
 * work over cluster worker processes.
 */
class SitemapCrawler extends base_1.Crawler {
    /**
     * SitemapCrawler constructor; all arguments are forwarded unchanged to
     * the base Crawler.
     * @param {*} storage
     * @param {string} name
     * @param {string} startingUrl
     * @param {HttpClientOptions} config
     */
    constructor(storage, name, startingUrl, config) {
        super(storage, name, startingUrl, config);
        /**
         * Whether follow anchor tag links
         * @type {boolean}
         */
        this.followHtmlLinks = false;
        // Label for this process; handed to crawlUrl() for every crawled URL.
        this.worker = cluster.isMaster
            ? `Master`
            : `Worker ${cluster.worker.process.pid}`;
    }
    /**
     * Load xml to cheerio
     * @param {string} content
     * @return {XmlSelector}
     */
    loadXml(content) {
        return cheerio.load(content, { xmlMode: true });
    }
    /**
     * Extract page urls from a `urlset` sitemap document.
     * The text of the first `<loc>` of every `urlset > url` entry is
     * collected; surrounding whitespace is trimmed and entries without a
     * usable `<loc>` are skipped.
     * @param {string} content
     * @return {string[]}
     */
    extractUrlsFromSitemap(content) {
        const $ = this.loadXml(content);
        const urls = [];
        $('urlset > url').each((index, element) => {
            const loc = $(element).find('loc').first().text().trim();
            if (loc) {
                urls.push(loc);
            }
        });
        return urls;
    }
    /**
     * Extract urls from gzip content
     * @param {Buffer} body gzip-compressed sitemap
     * @return {Promise<string[]>} rejects when the body cannot be gunzipped
     */
    extractUrlsFromGzip(body) {
        return __awaiter(this, void 0, void 0, function* () {
            const content = yield new Promise((resolve, reject) => {
                zlib.gunzip(body, (err, result) => {
                    if (err) {
                        reject(err);
                        return;
                    }
                    resolve(result.toString());
                });
            });
            // TODO: check if content is text file or RSS
            return content ? this.extractUrlsFromSitemap(content) : [];
        });
    }
    /**
     * Crawl sitemap url: download one sitemap, extract its urls, filter them
     * and add them to the queue. The url is registered in the in-process
     * list for the duration of the call.
     * @param {string} sitemapUrl
     * @return {Promise<void>}
     */
    crawlSitemapUrl(sitemapUrl) {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.inProcessList.add(sitemapUrl);
            try {
                // NOTE(review): `encoding: null` presumably makes the client
                // return the raw body as a Buffer - confirm against httpClient.
                const response = yield this.httpClient.get(sitemapUrl, { encoding: null });
                const body = response.body;
                // The header may be absent; treat that as "not declared gzip".
                const contentType = String(response.info.headers['content-type'] || '');
                // Servers often mislabel .gz files, so also sniff the gzip
                // magic number (0x1f 0x8b) at the start of the body.
                const looksGzipped = Boolean(body) && body[0] === 0x1f && body[1] === 0x8b;
                let extractedUrls = [];
                if (contentType.indexOf('gzip') !== -1 || looksGzipped) {
                    extractedUrls = yield this.extractUrlsFromGzip(body);
                }
                else if (body) {
                    // Uncompressed sitemap: parse the body as XML directly.
                    extractedUrls = this.extractUrlsFromSitemap(body.toString());
                }
                if (extractedUrls && extractedUrls.length) {
                    extractedUrls = yield this.filterExtractedUrls(extractedUrls);
                    yield this.addToQueue(extractedUrls);
                }
            }
            finally {
                // Always release the entry, otherwise a failed download would
                // leave the sitemap flagged as in-process.
                yield this.inProcessList.remove(sitemapUrl);
            }
        });
    }
    /**
     * Extract urls from sitemap then add to queue
     * @param {string} sitemapContent
     * @return {Promise<void>}
     */
    extractUrlsFromSitemapAndAddToQueue(sitemapContent) {
        return __awaiter(this, void 0, void 0, function* () {
            const extractedUrls = this.extractUrlsFromSitemap(sitemapContent);
            const filteredUrls = yield this.filterExtractedUrls(extractedUrls);
            yield this.addToQueue(filteredUrls);
        });
    }
    /**
     * Crawl url data: take urls off the queue one at a time and crawl each,
     * until `queue.shift()` yields a falsy value.
     * @return {Promise<void>}
     */
    crawlUrlData() {
        // `return` and `__awaiter(` must stay on the same line; a line break
        // here makes the method return undefined via semicolon insertion.
        return __awaiter(this, void 0, void 0, function* () {
            let url;
            while ((url = yield this.queue.shift())) {
                yield this.crawlUrl(url, this.worker);
            }
        });
    }
    /**
     * Start crawling.
     *
     * Loads the starting url and dispatches on the document's root element:
     * - `sitemapindex`: the master crawls every listed sitemap, forking the
     *   workers after the first one; workers only drain the queue.
     * - `urlset`: the master enqueues the urls and forks the workers; every
     *   process then drains the queue.
     * Any other (or missing) root element is ignored.
     * @return {Promise<void>}
     */
    crawl() {
        return __awaiter(this, void 0, void 0, function* () {
            const sitemapXml = yield this.loadUrl(this.getStartingUrl());
            const $ = this.loadXml(sitemapXml.body);
            // .get(0) is undefined for an empty document; cheerio's .first()
            // is always truthy and cannot be used for this check.
            const rootElement = $.root().children().get(0);
            if (!rootElement) {
                return;
            }
            if (rootElement.tagName === 'sitemapindex') {
                if (!cluster.isMaster) {
                    yield this.crawlUrlData();
                    return;
                }
                const sitemapUrls = $('sitemapindex > sitemap')
                    .toArray()
                    .map((elem) => $(elem).find('loc').first().text().trim())
                    // Skip entries without a <loc> instead of aborting the list.
                    .filter(Boolean);
                let crawlStarted = false;
                for (const sitemapUrl of sitemapUrls) {
                    if (!(yield this.inProcessList.has(sitemapUrl))) {
                        yield this.crawlSitemapUrl(sitemapUrl);
                    }
                    if (!crawlStarted) {
                        crawlStarted = true;
                        forkWorkers();
                        // Registered exactly once: a listener per sitemap would
                        // fork several replacements for every worker exit.
                        // NOTE(review): this re-forks on every exit, including
                        // clean ones - confirm that is intended.
                        cluster.on('exit', () => {
                            cluster.fork();
                        });
                    }
                    yield this.crawlUrlData();
                }
            }
            else if (rootElement.tagName === 'urlset') {
                // Only the master may enqueue and fork: cluster.fork() is not
                // available inside workers, and re-enqueueing from each
                // worker would duplicate the whole sitemap.
                if (cluster.isMaster) {
                    yield this.extractUrlsFromSitemapAndAddToQueue(sitemapXml.body);
                    forkWorkers();
                }
                yield this.crawlUrlData();
            }
        });
    }
}
exports.SitemapCrawler = SitemapCrawler;
//# sourceMappingURL=sitemap.js.map