UNPKG

chen-crawler

Version:

Web Crawler Provider for Chen Framework

501 lines 15.4 kB
"use strict";
const core_1 = require('chen/core');
const web_1 = require('chen/web');
const queue_1 = require('../queue');
const child_process_1 = require('child_process');
const cheerio = require('cheerio');
const events = require('events');
const urllib = require('url');
const pathlib = require('path');
/**
 * Abstract Crawler class.
 *
 * Walks a site starting from `startingUrl`, persists crawled pages via a
 * storage service and keeps the pending-URL queue and the in-process list
 * durable (both namespaced by the crawler name). Subclasses provide the
 * crawl loop itself (`this.crawl()` is called by `start()` but defined by
 * the concrete crawler).
 *
 * Emitted events: 'start', 'stop', 'fetchStart', 'fetchComplete',
 * 'fetchError', 'error'.
 */
class Crawler extends events.EventEmitter {
    /**
     * Abstract crawler constructor
     * @param {Storage} storage - backing storage for the queue and processing list
     * @param {string} name - crawler name, also used to namespace queue storage
     * @param {string} startingUrl - first url to crawl
     * @param {HttpClientOptions} config - http client configuration
     */
    constructor(storage, name, startingUrl, config) {
        super();
        this.name = name;
        this.startingUrl = startingUrl;
        this.config = config;
        /**
         * Flag to determine if crawler is currently running
         * @type {boolean}
         */
        this.running = false;
        /**
         * Flag (or per-url predicate) to determine if pages are rendered via
         * a headless browser instead of the plain http client
         * @type {boolean | HeadlessBrowserEnabler}
         */
        this.headlessBrowserEnabled = false;
        /**
         * Crawler http client
         * @type {HttpClient}
         */
        this.httpClient = new web_1.HttpClient();
        /**
         * Flag whether to crawl anchor tag links found on fetched pages
         * @type {boolean}
         */
        this.followHtmlLinks = true;
        /**
         * True while the (single) headless browser is processing a page
         * @type {boolean}
         */
        this.browserBusy = false;
        this.queue = new queue_1.Queue(storage, name);
        this.inProcessList = new queue_1.ProcessingList(storage, name);
    }
    /**
     * Get name
     * @return {string}
     */
    getName() {
        return this.name;
    }
    /**
     * Get starting url
     * @return {string}
     */
    getStartingUrl() {
        return this.startingUrl;
    }
    /**
     * Get http client configuration (a defensive copy)
     * @return {HttpClientOptions}
     */
    getConfig() {
        return core_1._.clone(this.config);
    }
    /**
     * Set URL Queue Filter
     * @param {QueueFilter} filter
     * @return {this}
     */
    setQueueFilter(filter) {
        this.queueFilter = filter;
        return this;
    }
    /**
     * Set content filter to be applied before saving to the database
     * @param {CrawledContentFilter} filter
     * @return {this}
     */
    setContentFilter(filter) {
        this.crawledContentFilter = filter;
        return this;
    }
    /**
     * Filter url; urls rejected here are never queued.
     * Defaults to accepting everything when no filter is set.
     * @param {urllib.Url} url
     * @return {boolean}
     */
    filterQueue(url) {
        if (typeof this.queueFilter != 'function')
            return true;
        return this.queueFilter(url);
    }
    /**
     * Filter content; content rejected here is fetched but not persisted.
     * Defaults to accepting everything when no filter is set.
     * @param {urllib.Url} url
     * @param {CrawledContent} data
     * @return {boolean}
     */
    filterContent(url, data) {
        if (typeof this.crawledContentFilter != 'function')
            return true;
        return this.crawledContentFilter(url, data);
    }
    /**
     * Check if the given url should be fetched with the headless browser
     * @param {urllib.Url | string} url
     * @return {boolean}
     */
    isHeadlessBrowserEnabled(url) {
        if (typeof this.headlessBrowserEnabled == 'function') {
            if (typeof url == 'string') {
                url = urllib.parse(url);
            }
            return this.headlessBrowserEnabled(url);
        }
        return this.headlessBrowserEnabled === true;
    }
    /**
     * Flag for enabling crawler for ajax rendered content
     * @param {boolean | HeadlessBrowserEnabler = true} enable
     * @return {this}
     */
    useHeadlessBrowser(enable = true) {
        this.headlessBrowserEnabled = enable;
        return this;
    }
    /**
     * Get cheerio instance for the given html body
     * @param {string} body
     * @return {HtmlSelector}
     */
    loadHtml(body) {
        return cheerio.load(body);
    }
    /**
     * Load url via the http client
     * @param {string} url
     * @return {Promise<HttpClientResponse>}
     */
    async loadUrl(url) {
        return await this.httpClient.get(url, this.config);
    }
    /**
     * Listen on fetch start event
     * @param {(urllib.Url, worker) => void} fn
     * @return {this}
     */
    onFetchStart(fn) {
        this.on('fetchStart', fn);
        return this;
    }
    /**
     * Listen on fetch complete event
     * @param {(HtmlSelector) => void} fn
     * @return {this}
     */
    onFetchComplete(fn) {
        this.on('fetchComplete', fn);
        return this;
    }
    /**
     * On fetch error (4xx / 5xx responses)
     * @param {(urllib.Url, HttpClientResponse, worker) => void} fn
     * @return {this}
     */
    onFetchError(fn) {
        this.on('fetchError', fn);
        return this;
    }
    /**
     * Listen on error event
     * @param {(err) => void} fn
     * @return {this}
     */
    onError(fn) {
        this.on('error', fn);
        return this;
    }
    /**
     * Listen on start event
     * @param {() => void} fn
     * @return {this}
     */
    onStart(fn) {
        this.on('start', fn);
        return this;
    }
    /**
     * Listen on stop event
     * @param {() => void} fn
     * @return {this}
     */
    onStop(fn) {
        this.on('stop', fn);
        return this;
    }
    /**
     * Format url with additional filtering (drops the fragment so urls that
     * differ only by hash are treated as one)
     * @param {URL} urlInfo
     * @return {string}
     */
    formatFromParsedUrl(urlInfo) {
        delete urlInfo.hash;
        return urllib.format(urlInfo);
    }
    /**
     * Remove unnecessary segments in url like hash
     * @param {string} url
     * @return {string}
     * @throws {Exception} when the url has no host
     */
    cleanUrl(url) {
        url = core_1._.trim(url);
        let urlInfo = urllib.parse(url);
        if (urlInfo && !urlInfo.host) {
            throw new core_1.Exception(`Invalid url for crawling: ${url}`);
        }
        return this.formatFromParsedUrl(urlInfo);
    }
    /**
     * Set storage service
     * @param {StorageService<Model>} service
     * @return {this}
     */
    setStorageService(service) {
        this.storage = service;
        return this;
    }
    /**
     * Save crawled data (url, <title> text and utf8-encoded html) unless the
     * content filter rejects it.
     * @param {urllib.Url | string} url
     * @param {HtmlSelector} select
     * @return {Promise<Model>} saved model, or null when filtered out
     */
    async saveContent(url, select) {
        if (typeof url == 'string') {
            url = urllib.parse(url);
        }
        let data = {
            url: url.href,
            title: core_1._.trim(select('title').text()),
            content: core_1._.utf8Encode(core_1._.trim(select.html()))
        };
        let model = null;
        if (this.filterContent(url, data)) {
            model = await this.insertData(data);
        }
        return model;
    }
    /**
     * Save crawled data to storage
     * @param {CrawledContent} data
     * @return {Promise<Model>}
     */
    async insertData(data) {
        return await this.storage.create(data);
    }
    /**
     * Check url is already crawled and saved to storage
     * @param {string} url
     * @return {Promise<Model>}
     */
    async getProcessed(url) {
        return await this.storage.findOne({ 'url': url });
    }
    /**
     * Extract links from the page then add the filtered set to the queue
     * @param {string} url
     * @param {HtmlSelector} select
     * @return {Promise<void>}
     */
    async extractUrlsFromHtmlAndAddToQueue(url, select) {
        await this.addToQueue(await this.filterExtractedUrls(this.extractUrlsFromHtml(url, select)));
    }
    /**
     * Add urls to queue, skipping falsy entries
     * @param {string[]} urls
     * @return {Promise<void>}
     */
    async addToQueue(urls) {
        for (let url of urls) {
            if (!url)
                continue;
            await this.queue.push(url);
        }
    }
    /**
     * Extract same-host urls from the anchors of the given cheerio instance.
     * Relative hrefs are resolved against `baseUrl`; mailto: links and
     * off-site hosts are dropped.
     * @param {string} baseUrl
     * @param {HtmlSelector} htmlSelector
     * @return {string[]}
     */
    extractUrlsFromHtml(baseUrl, htmlSelector) {
        let urls = [];
        let baseUrlInfo = urllib.parse(baseUrl);
        htmlSelector('a').each((index, anchor) => {
            let href = htmlSelector(anchor).attr('href');
            if (typeof href == 'string') {
                href = urllib.resolve(baseUrl, href);
                let hrefInfo = urllib.parse(href);
                if (hrefInfo && hrefInfo.protocol != 'mailto:' && core_1._.endsWith(hrefInfo.host, baseUrlInfo.host)) {
                    urls.push(this.formatFromParsedUrl(hrefInfo));
                }
            }
        });
        return urls;
    }
    /**
     * Filter extracted urls: dedupe, apply the queue filter, then drop urls
     * already saved in storage.
     * @param {string[]} extractedUrls
     * @return {Promise<string[]>}
     */
    async filterExtractedUrls(extractedUrls) {
        // Set-based dedupe avoids the O(n^2) indexOf-in-a-loop of the old code.
        let seen = new Set();
        let filteredUrls = [];
        for (let url of extractedUrls) {
            if (!url)
                continue;
            let urlInfo = urllib.parse(url);
            if (!seen.has(url) && this.filterQueue(urlInfo)) {
                seen.add(url);
                filteredUrls.push(url);
            }
        }
        let alreadyCrawledUrls = (await this.storage.query(q => q.where('url', 'in', filteredUrls)).get()).pluck('url');
        if (!alreadyCrawledUrls.length) {
            return filteredUrls;
        }
        let crawledSet = new Set(alreadyCrawledUrls);
        return filteredUrls.filter(url => !crawledSet.has(url));
    }
    /**
     * Load Url via the phantomjs headless browser. The rendered page is
     * expected on stdout between '--BOUNDARY' markers.
     * @param {string} url
     * @return {Promise<string>}
     */
    loadUrlFromBrowser(url) {
        return new Promise((resolve, reject) => {
            let extractorPath = pathlib.dirname(pathlib.dirname(__dirname));
            // SECURITY FIX: execFile passes the url as a discrete argv entry, so a
            // crafted url (e.g. containing a quote) can no longer break out of the
            // shell command the old exec + string interpolation built.
            child_process_1.execFile('phantomjs', [`${extractorPath}/extractor.js`, url], (error, stdout, stderr) => {
                if (error) {
                    reject(error);
                    return;
                }
                if (stderr) {
                    reject(stderr);
                    return;
                }
                let content = stdout.split('--BOUNDARY');
                resolve(content[1]);
            });
        });
    }
    /**
     * Crawl given url via the http client. 4xx/5xx responses emit
     * 'fetchError'; other responses are saved and (optionally) mined for
     * further links. Exceptions are reported via the 'error' event.
     * @param {string} url
     * @param {string} worker
     * @return {Promise<void>}
     */
    async crawlUrlViaHttpClient(url, worker) {
        try {
            url = this.cleanUrl(url);
            let urlInfo = urllib.parse(url);
            this.emit('fetchStart', urlInfo, worker);
            await this.inProcessList.add(url);
            try {
                let urlResponse = await this.loadUrl(url);
                if (urlResponse.info.statusCode >= 400 && urlResponse.info.statusCode <= 599) {
                    this.emit('fetchError', urlInfo, urlResponse, worker);
                }
                else {
                    let select = this.loadHtml(urlResponse.body);
                    let model = await this.saveContent(urlInfo, select);
                    this.emit('fetchComplete', urlInfo, select, model, worker);
                    if (this.followHtmlLinks) {
                        await this.extractUrlsFromHtmlAndAddToQueue(url, select);
                    }
                }
            }
            finally {
                // BUGFIX: previously skipped when fetching/saving threw, leaving the
                // url stuck in the processing list so it was never retried.
                await this.inProcessList.remove(url);
            }
        }
        catch (ex) {
            this.emit('error', ex);
        }
    }
    /**
     * Crawl via the headless browser. Only one browser fetch runs at a time;
     * when busy, the url is pushed back onto the queue for later.
     * @param {string} url
     * @param {string} worker
     * @return {Promise<void>}
     */
    async crawlUrlViaHeadlessBrowser(url, worker) {
        try {
            url = this.cleanUrl(url);
            let urlInfo = urllib.parse(url);
            if (this.browserBusy) {
                await this.queue.push(url);
                return;
            }
            this.emit('fetchStart', urlInfo, worker);
            this.browserBusy = true;
            await this.inProcessList.add(url);
            try {
                let content;
                try {
                    content = await this.loadUrlFromBrowser(url);
                }
                finally {
                    // BUGFIX: the flag was only cleared on success, so one phantomjs
                    // failure permanently disabled headless crawling.
                    this.browserBusy = false;
                }
                if (content) {
                    let select = this.loadHtml(content);
                    let model = await this.saveContent(urlInfo, select);
                    this.emit('fetchComplete', urlInfo, select, model, worker);
                    if (this.followHtmlLinks) {
                        await this.extractUrlsFromHtmlAndAddToQueue(urlInfo.href, select);
                    }
                }
            }
            finally {
                // BUGFIX: previously skipped on error, leaving the url stuck in the
                // processing list so it was never retried.
                await this.inProcessList.remove(url);
            }
        }
        catch (ex) {
            this.emit('error', ex);
        }
    }
    /**
     * Start crawler (no-op if already running). Delegates the crawl loop to
     * the subclass-provided `crawl()` method.
     * @return {Promise<void>}
     */
    async start() {
        if (this.running)
            return;
        this.running = true;
        this.emit('start');
        await this.crawl();
    }
    /**
     * Check if already in process
     * @param {string} url
     * @return {Promise<boolean>}
     */
    async inProcess(url) {
        return await this.inProcessList.has(url);
    }
    /**
     * Check whether to use headless browser or not then crawl the url.
     * Skips urls already saved to storage or currently being processed.
     * @param {string} url
     * @param {string} worker
     * @return {Promise<void>}
     */
    async crawlUrl(url, worker) {
        if (!(await this.getProcessed(url)) && !(await this.inProcessList.has(url))) {
            if (this.isHeadlessBrowserEnabled(url)) {
                // BUGFIX: was fired as a floating promise; now awaited like the
                // http-client path so callers observe completion consistently.
                await this.crawlUrlViaHeadlessBrowser(url, worker);
            }
            else {
                await this.crawlUrlViaHttpClient(url, worker);
            }
        }
    }
}
exports.Crawler = Crawler;
//# sourceMappingURL=base.js.map