chen-crawler
Web Crawler Provider for Chen Framework
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator.throw(value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : new P(function (resolve) { resolve(result.value); }).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments)).next());
});
};
const core_1 = require('chen/core');
const web_1 = require('chen/web');
const queue_1 = require('../queue');
const child_process_1 = require('child_process');
const cheerio = require('cheerio');
const events = require('events');
const urllib = require('url');
const pathlib = require('path');
/**
* Abstract Crawler class
*/
class Crawler extends events.EventEmitter {
/**
* Abstract crawler constructor
* @param {Storage} storage
* @param {string} name
* @param {string} startingUrl
* @param {HttpClientOptions} config
*/
constructor(storage, name, startingUrl, config) {
super();
this.name = name;
this.startingUrl = startingUrl;
this.config = config;
/**
* Flag to determine if crawler is currently running
* @type {boolean}
*/
this.running = false;
/**
* Flag or predicate marking pages as ajax-rendered (crawled via headless browser)
* @type {boolean | HeadlessBrowserEnabler}
*/
this.headlessBrowserEnabled = false;
/**
* Crawler http client
* @type {HttpClient}
*/
this.httpClient = new web_1.HttpClient();
/**
* Flag whether to follow and crawl anchor tag links
* @type {boolean}
*/
this.followHtmlLinks = true;
/**
* Flag indicating the headless browser is currently processing a page
* @type {boolean}
*/
this.browserBusy = false;
this.queue = new queue_1.Queue(storage, name);
this.inProcessList = new queue_1.ProcessingList(storage, name);
}
/**
* Get name
* @return {string}
*/
getName() {
return this.name;
}
/**
* Get starting url
* @return {string}
*/
getStartingUrl() {
return this.startingUrl;
}
/**
* Get http client configuration
* @return {HttpClientOptions}
*/
getConfig() {
return core_1._.clone(this.config);
}
/**
* Set URL Queue Filter
* @param {QueueFilter} filter
* @return {this}
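* @example
* // Sketch (crawler instance assumed): only queue urls under /blog;
* // the filter receives a parsed url, as filterQueue below shows
* crawler.setQueueFilter(urlInfo => (urlInfo.pathname || '').indexOf('/blog') === 0);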
*/
setQueueFilter(filter) {
this.queueFilter = filter;
return this;
}
/**
* Set a filter deciding which crawled content is saved to storage
* @param {CrawledContentFilter} filter
* @return {this}
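* @example
* // Sketch (crawler instance assumed): skip untitled pages; the filter receives
* // the parsed url and the {url, title, content} record built in saveContent below
* crawler.setContentFilter((urlInfo, data) => data.title.length > 0);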
*/
setContentFilter(filter) {
this.crawledContentFilter = filter;
return this;
}
/**
* Filter url
* @param {urllib.Url} url
* @return {boolean}
*/
filterQueue(url) {
if (typeof this.queueFilter != 'function')
return true;
return this.queueFilter(url);
}
/**
* Filter content
* @param {urllib.Url} url
* @param {CrawledContent} data
* @return {boolean}
*/
filterContent(url, data) {
if (typeof this.crawledContentFilter != 'function')
return true;
return this.crawledContentFilter(url, data);
}
/**
* Check if the url matches the headless browser (ajax) filter
* @param {urllib.Url | string} url
* @return {boolean}
*/
isHeadlessBrowserEnabled(url) {
if (typeof this.headlessBrowserEnabled == 'function') {
if (typeof url == 'string') {
url = urllib.parse(url);
}
return this.headlessBrowserEnabled(url);
}
return this.headlessBrowserEnabled === true;
}
/**
* Enable headless browser crawling for ajax-rendered content
* @param {boolean | HeadlessBrowserEnabler} [enable=true]
* @return {this}
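* @example
* // Sketch (crawler instance assumed): render only /app pages in the headless
* // browser; calling useHeadlessBrowser() with no argument enables it for every url
* crawler.useHeadlessBrowser(urlInfo => (urlInfo.pathname || '').indexOf('/app') === 0);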
*/
useHeadlessBrowser(enable = true) {
this.headlessBrowserEnabled = enable;
return this;
}
/**
* Get cheerio instance
* @param {string} body
* @return {HtmlSelector}
*/
loadHtml(body) {
return cheerio.load(body);
}
/**
* Load url
* @param {string} url
* @return {Promise<HttpClientResponse>}
*/
loadUrl(url) {
return __awaiter(this, void 0, void 0, function* () {
return yield this.httpClient.get(url, this.config);
});
}
/**
* Listen on fetch start event
* @param {(url: urllib.Url, worker: string) => void} fn
* @return {this}
*/
onFetchStart(fn) {
this.on('fetchStart', fn);
return this;
}
/**
* Listen on fetch complete event
* @param {(url: urllib.Url, select: HtmlSelector, model: Model, worker: string) => void} fn
* @return {this}
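* @example
* // Sketch (crawler instance assumed): log crawled titles; the handler
* // arguments mirror the 'fetchComplete' emit in crawlUrlViaHttpClient below
* crawler.onFetchComplete((urlInfo, select, model, worker) => {
*     console.log(worker, urlInfo.href, select('title').text());
* });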
*/
onFetchComplete(fn) {
this.on('fetchComplete', fn);
return this;
}
/**
* On fetch error
* @param {(url: urllib.Url, response: HttpClientResponse, worker: string) => void} fn
* @return {this}
*/
onFetchError(fn) {
this.on('fetchError', fn);
return this;
}
/**
* Listen on error event
* @param {(err: Error) => void} fn
* @return {this}
*/
onError(fn) {
this.on('error', fn);
return this;
}
/**
* Listen on start event
* @param {() => void} fn
* @return {this}
*/
onStart(fn) {
this.on('start', fn);
return this;
}
/**
* Listen on stop event
* @param {() => void} fn
* @return {this}
*/
onStop(fn) {
this.on('stop', fn);
return this;
}
/**
* Format a parsed url, stripping the hash fragment
* @param {urllib.Url} urlInfo
* @return {string}
*/
formatFromParsedUrl(urlInfo) {
delete urlInfo.hash;
return urllib.format(urlInfo);
}
/**
* Remove unnecessary url segments such as the hash fragment
* @param {string} url
* @return {string}
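* @example
* // Hash fragments are dropped, so both forms map to one queue entry:
* // cleanUrl('https://example.com/page#section') -> 'https://example.com/page'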
*/
cleanUrl(url) {
url = core_1._.trim(url);
let urlInfo = urllib.parse(url);
if (urlInfo && !urlInfo.host) {
throw new core_1.Exception(`Invalid url for crawling: ${url}`);
}
return this.formatFromParsedUrl(urlInfo);
}
/**
* Set storage service
* @param {StorageService<Model>} service
* @return {this}
*/
setStorageService(service) {
this.storage = service;
return this;
}
/**
* Save crawled data
* @param {urllib.Url | string} url
* @param {HtmlSelector} select
* @return {Promise<Model>}
*/
saveContent(url, select) {
return __awaiter(this, void 0, void 0, function* () {
if (typeof url == 'string') {
url = urllib.parse(url);
}
let data = {
url: url.href,
title: core_1._.trim(select('title').text()),
content: core_1._.utf8Encode(core_1._.trim(select.html()))
};
let model = null;
if (this.filterContent(url, data)) {
model = yield this.insertData(data);
}
return model;
});
}
/**
* Save crawled data to storage
* @param {CrawledContent} data
* @return {Promise<Model>}
*/
insertData(data) {
return __awaiter(this, void 0, void 0, function* () {
return yield this.storage.create(data);
});
}
/**
* Check whether a url has already been crawled and saved to storage
* @param {string} url
* @return {Promise<Model>}
*/
getProcessed(url) {
return __awaiter(this, void 0, void 0, function* () {
return yield this.storage.findOne({ 'url': url });
});
}
/**
* Extract urls from the page, filter them, then add them to the queue
* @param {string} url
* @param {HtmlSelector} select
* @return {Promise<void>}
*/
extractUrlsFromHtmlAndAddToQueue(url, select) {
return __awaiter(this, void 0, void 0, function* () {
yield this.addToQueue(yield this.filterExtractedUrls(this.extractUrlsFromHtml(url, select)));
});
}
/**
* Add to queue
* @param {string[]} urls
* @return {Promise<void>}
*/
addToQueue(urls) {
return __awaiter(this, void 0, void 0, function* () {
for (let url of urls) {
if (!url)
continue;
yield this.queue.push(url);
}
});
}
/**
* Extract urls from given cheerio instance
* @param {string} baseUrl
* @param {HtmlSelector} htmlSelector
* @return {string[]}
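* @example
* // Illustrative: with baseUrl 'https://example.com/docs/', an anchor
* // href="../pricing" resolves to 'https://example.com/pricing' and is kept,
* // while mailto: links and links to other hosts are dropped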
*/
extractUrlsFromHtml(baseUrl, htmlSelector) {
let urls = [];
let baseUrlInfo = urllib.parse(baseUrl);
htmlSelector('a').each((index, anchor) => {
let href = htmlSelector(anchor).attr('href');
if (typeof href == 'string') {
href = urllib.resolve(baseUrl, href);
let hrefInfo = urllib.parse(href);
// keep same-host links and subdomains only; a bare endsWith would also
// match unrelated hosts such as "notexample.com" for base "example.com"
if (hrefInfo && hrefInfo.protocol != 'mailto:' && (hrefInfo.host == baseUrlInfo.host || core_1._.endsWith(hrefInfo.host, '.' + baseUrlInfo.host))) {
urls.push(this.formatFromParsedUrl(hrefInfo));
}
}
});
return urls;
}
/**
* Filter extracted urls
* @param {string[]} extractedUrls
* @return {Promise<string[]>}
*/
filterExtractedUrls(extractedUrls) {
return __awaiter(this, void 0, void 0, function* () {
let filteredUrls = [];
for (let url of extractedUrls) {
if (!url)
continue;
let urlInfo = urllib.parse(url);
if (filteredUrls.indexOf(url) == -1 && this.filterQueue(urlInfo)) {
filteredUrls.push(url);
}
}
let fineUrls = [];
// let alreadyCrawledUrls = (await this.storage.find({'url': filteredUrls})).pluck('url');
let alreadyCrawledUrls = (yield this.storage.query(q => q.where('url', 'in', filteredUrls)).get()).pluck('url');
if (alreadyCrawledUrls.length) {
for (let url of filteredUrls) {
if (alreadyCrawledUrls.indexOf(url) == -1) {
fineUrls.push(url);
}
}
}
else {
fineUrls = filteredUrls;
}
return fineUrls;
});
}
/**
* Load Url via browser
* @param {string} url
* @return {Promise<string>}
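* Note: shells out to a `phantomjs` binary that must be on the PATH; the sibling
* extractor.js script appears to print the rendered html between --BOUNDARY
* markers (inferred from the parsing below, not documented in this file).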
*/
loadUrlFromBrowser(url) {
return new Promise((resolve, reject) => {
let extractorPath = pathlib.dirname(pathlib.dirname(__dirname));
// execFile passes the url as an argument without shell interpolation,
// so quotes or metacharacters in the url cannot break the command
child_process_1.execFile('phantomjs', [`${extractorPath}/extractor.js`, url], (error, stdout, stderr) => {
if (error) {
reject(error);
return;
}
if (stderr) {
reject(stderr);
return;
}
let content = stdout.split('--BOUNDARY');
if (content.length < 2) {
reject(new core_1.Exception(`Unexpected extractor output for: ${url}`));
return;
}
resolve(content[1]);
});
});
}
/**
* Crawl given url
* @param {string} url
* @param {string} worker
* @return {Promise<void>}
*/
crawlUrlViaHttpClient(url, worker) {
return __awaiter(this, void 0, void 0, function* () {
try {
url = this.cleanUrl(url);
let urlInfo = urllib.parse(url);
this.emit('fetchStart', urlInfo, worker);
yield this.inProcessList.add(url);
let urlResponse = yield this.loadUrl(url);
if (urlResponse.info.statusCode >= 400 && urlResponse.info.statusCode <= 599) {
this.emit('fetchError', urlInfo, urlResponse, worker);
}
else {
let select = this.loadHtml(urlResponse.body);
let model = yield this.saveContent(urlInfo, select);
this.emit('fetchComplete', urlInfo, select, model, worker);
if (this.followHtmlLinks) {
yield this.extractUrlsFromHtmlAndAddToQueue(url, select);
}
}
yield this.inProcessList.remove(url);
}
catch (ex) {
// drop the url from the processing list so a failed fetch can be retried later
yield this.inProcessList.remove(url);
this.emit('error', ex);
}
});
}
/**
* Crawl via the headless browser, re-queueing the url while the browser is busy
* @param {string} url
* @param {string} worker
* @return {Promise<void>}
*/
crawlUrlViaHeadlessBrowser(url, worker) {
return __awaiter(this, void 0, void 0, function* () {
try {
url = this.cleanUrl(url);
let urlInfo = urllib.parse(url);
if (this.browserBusy) {
yield this.queue.push(url);
return;
}
this.emit('fetchStart', urlInfo, worker);
this.browserBusy = true;
yield this.inProcessList.add(url);
let content = yield this.loadUrlFromBrowser(url);
this.browserBusy = false;
if (content) {
let select = this.loadHtml(content);
let model = yield this.saveContent(urlInfo, select);
this.emit('fetchComplete', urlInfo, select, model, worker);
if (this.followHtmlLinks) {
yield this.extractUrlsFromHtmlAndAddToQueue(urlInfo.href, select);
}
}
yield this.inProcessList.remove(url);
}
catch (ex) {
// release the browser lock (otherwise a failed render blocks headless crawling
// forever) and drop the url from the processing list so it can be retried
this.browserBusy = false;
yield this.inProcessList.remove(url);
this.emit('error', ex);
}
});
}
/**
* Start crawler
* @return {Promise<void>}
*/
start() {
return __awaiter(this, void 0, void 0, function* () {
if (this.running)
return;
this.running = true;
this.emit('start');
// crawl() is left to subclasses: this file never defines the main crawl loop
yield this.crawl();
});
}
/**
* Check whether a url is already being processed
* @param {string} url
* @return {Promise<boolean>}
*/
inProcess(url) {
return __awaiter(this, void 0, void 0, function* () {
return yield this.inProcessList.has(url);
});
}
/**
* Decide whether to crawl the url via the headless browser or the http client
* @param {string} url
* @param {string} worker
* @return {Promise<void>}
*/
crawlUrl(url, worker) {
return __awaiter(this, void 0, void 0, function* () {
if (!(yield this.getProcessed(url)) && !(yield this.inProcessList.has(url))) {
if (this.isHeadlessBrowserEnabled(url)) {
// not awaited: the browserBusy flag plus re-queueing inside the method
// already serializes access to the single browser instance
this.crawlUrlViaHeadlessBrowser(url, worker);
}
else {
yield this.crawlUrlViaHttpClient(url, worker);
}
}
});
}
}
exports.Crawler = Crawler;
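/*
 * Usage sketch (hypothetical): `Crawler` is abstract, so `crawl()` comes from a
 * subclass. The `SiteCrawler` class, the `queue.pop()` method, and the `storage`
 * and `storageService` objects below are illustrative assumptions, not part of
 * this file.
 *
 * class SiteCrawler extends Crawler {
 *     async crawl() {
 *         await this.addToQueue([this.getStartingUrl()]);
 *         while (this.running) {
 *             const url = await this.queue.pop(); // assumed Queue api
 *             if (!url) break;
 *             await this.crawlUrl(url, 'worker-1');
 *         }
 *         this.emit('stop');
 *     }
 * }
 *
 * const crawler = new SiteCrawler(storage, 'docs', 'https://example.com/', {});
 * crawler.setStorageService(storageService)
 *     .setQueueFilter(urlInfo => urlInfo.protocol === 'https:')
 *     .onError(err => console.error(err));
 * crawler.start();
 */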
//# sourceMappingURL=base.js.map