// chen-crawler — Web Crawler Provider for Chen Framework
// (compiled JavaScript output: sitemap.js)
;
// TypeScript-emitted async/await helper (see tslib's __awaiter): drives the
// compiler-generated generator so that each `yield`ed value is awaited as a
// Promise, and the whole generator body settles as a single Promise.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the fulfillment value / rejection reason of the
// previously awaited Promise; any synchronous throw rejects the outer Promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator.throw(value)); } catch (e) { reject(e); } }
// When the generator is done, resolve with its return value; otherwise adopt
// the yielded value via the Promise constructor (which unwraps thenables) and
// continue stepping once it settles.
function step(result) { result.done ? resolve(result.value) : new P(function (resolve) { resolve(result.value); }).then(fulfilled, rejected); }
// Kick off the generator and take the first step.
step((generator = generator.apply(thisArg, _arguments)).next());
});
};
const base_1 = require('./base');
const cheerio = require('cheerio');
const zlib = require('zlib');
const cluster = require('cluster');
const numCPUs = require('os').cpus().length;
/**
* Sitemap class
*/
/**
 * Sitemap-driven web crawler.
 *
 * Reads a sitemap (or sitemap index) from the starting URL, extracts the
 * page URLs into a shared queue, and forks one worker process per spare
 * CPU to drain that queue.
 */
class SitemapCrawler extends base_1.Crawler {
    /**
     * @param {object} storage      Shared storage backend (queue / in-process list) — passed to the base Crawler.
     * @param {string} name         Crawler name.
     * @param {string} startingUrl  Sitemap or sitemap-index URL.
     * @param {HttpClientOptions} config  HTTP client configuration.
     */
    constructor(storage, name, startingUrl, config) {
        super(storage, name, startingUrl, config);
        /**
         * Whether to follow anchor-tag links; sitemap crawling relies on
         * <loc> entries only, so HTML link following is disabled.
         * @type {boolean}
         */
        this.followHtmlLinks = false;
        // Label identifying which cluster process performed a crawl.
        this.worker = (cluster.isMaster) ? `Master` : `Worker ${cluster.worker.process.pid}`;
    }
    /**
     * Load XML content into cheerio (xmlMode preserves case-sensitive tags).
     * @param {string} content
     * @return {XmlSelector}
     */
    loadXml(content) {
        return cheerio.load(content, { xmlMode: true });
    }
    /**
     * Extract page URLs from a <urlset> sitemap document.
     * @param {string} content  Raw sitemap XML.
     * @return {string[]}  Text of every <urlset> > <url> > <loc> entry
     *                     (empty when the document is not a urlset).
     */
    extractUrlsFromSitemap(content) {
        const $ = this.loadXml(content);
        const urls = [];
        $('urlset > url').each((index, element) => {
            urls.push($(element).find('loc').first().text());
        });
        return urls;
    }
    /**
     * Gunzip a compressed sitemap body and extract its URLs.
     * @param {Buffer} body  Gzip-compressed sitemap payload.
     * @return {Promise<string[]>}  Rejects if decompression fails.
     */
    extractUrlsFromGzip(body) {
        return __awaiter(this, void 0, void 0, function* () {
            const content = yield new Promise((resolve, reject) => {
                zlib.gunzip(body, (err, result) => {
                    if (err) {
                        reject(err);
                        return;
                    }
                    resolve(result.toString());
                });
            });
            // TODO: check whether content is a plain text file or RSS feed.
            return content ? this.extractUrlsFromSitemap(content) : [];
        });
    }
    /**
     * Fetch one sitemap URL (from a sitemap index), extract its page URLs,
     * filter them, and push them onto the crawl queue. The URL is tracked in
     * the in-process list for the duration of the fetch.
     * @param {string} sitemapUrl
     */
    crawlSitemapUrl(sitemapUrl) {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.inProcessList.add(sitemapUrl);
            try {
                // encoding: null => body is returned as a raw Buffer.
                const response = yield this.httpClient.get(sitemapUrl, { encoding: null });
                // BUG FIX: guard against a missing Content-Type header — the
                // original dereferenced it unconditionally and could throw.
                const contentType = response.info.headers['content-type'] || '';
                let extractedUrls;
                // TODO: more accurate content-type detection.
                if (contentType.indexOf('gzip') !== -1) {
                    extractedUrls = yield this.extractUrlsFromGzip(response.body);
                }
                else {
                    // Previously non-gzip sitemaps were silently skipped; parse
                    // the body as plain XML instead (yields [] if not a urlset).
                    extractedUrls = this.extractUrlsFromSitemap(response.body.toString());
                }
                if (extractedUrls && extractedUrls.length) {
                    extractedUrls = yield this.filterExtractedUrls(extractedUrls);
                    yield this.addToQueue(extractedUrls);
                }
            }
            finally {
                // BUG FIX: always release the URL, even when the fetch or the
                // extraction throws, so it is not stuck in the in-process list.
                yield this.inProcessList.remove(sitemapUrl);
            }
        });
    }
    /**
     * Extract URLs from a sitemap document, filter them, and enqueue them.
     * @param {string} sitemapContent  Raw <urlset> XML.
     * @return {Promise<void>}
     */
    extractUrlsFromSitemapAndAddToQueue(sitemapContent) {
        return __awaiter(this, void 0, void 0, function* () {
            yield this.addToQueue(yield this.filterExtractedUrls(this.extractUrlsFromSitemap(sitemapContent)));
        });
    }
    /**
     * Drain the shared queue, crawling each URL under this process's label.
     */
    crawlUrlData() {
        return __awaiter(this, void 0, void 0, function* () {
            let url;
            while (url = yield this.queue.shift()) {
                yield this.crawlUrl(url, this.worker);
            }
        });
    }
    /**
     * Start crawling from the starting URL. Handles both a <sitemapindex>
     * root (master fetches each child sitemap, workers drain the queue) and
     * a plain <urlset> root (URLs are enqueued directly).
     */
    crawl() {
        return __awaiter(this, void 0, void 0, function* () {
            const sitemapXml = yield this.loadUrl(this.getStartingUrl());
            const $ = this.loadXml(sitemapXml.body);
            const root = $.root().children();
            // BUG FIX: `root.first()` returns a (possibly empty) cheerio
            // wrapper, which is always truthy — the original guard never
            // fired and `root.get(0)` could be undefined below. Check the
            // match count instead.
            if (root.length === 0)
                return;
            const rootTag = root.get(0).tagName;
            if (rootTag === 'sitemapindex') {
                if (cluster.isMaster) {
                    const sitemapUrls = $('sitemapindex > sitemap').toArray().map((elem) => {
                        return $(elem).find('loc').first().text();
                    });
                    // BUG FIX: register the re-fork handler ONCE — the original
                    // added a new 'exit' listener on every loop iteration,
                    // spawning multiple replacements per dead worker.
                    cluster.on('exit', function (worker, code, signal) {
                        cluster.fork();
                    });
                    let workersForked = false;
                    let sitemapUrl;
                    while (sitemapUrl = sitemapUrls.shift()) {
                        if (!(yield this.inProcessList.has(sitemapUrl))) {
                            yield this.crawlSitemapUrl(sitemapUrl);
                        }
                        // Fork one worker per spare CPU after the first sitemap
                        // has seeded the queue.
                        if (!workersForked) {
                            for (let i = 1; i < numCPUs; i++) {
                                cluster.fork();
                            }
                            workersForked = true;
                        }
                        yield this.crawlUrlData();
                    }
                }
                else {
                    // Worker process: just drain the shared queue.
                    yield this.crawlUrlData();
                }
            }
            // BUG FIX: the original used assignment (`tagName = 'urlset'`),
            // which clobbered the tag name and made this branch execute for
            // ANY non-sitemapindex root.
            else if (rootTag === 'urlset') {
                yield this.extractUrlsFromSitemapAndAddToQueue(sitemapXml.body);
                for (let i = 1; i < numCPUs; i++) {
                    cluster.fork();
                }
                yield this.crawlUrlData();
            }
        });
    }
}
// CommonJS export of the crawler class (compiled TypeScript module output).
exports.SitemapCrawler = SitemapCrawler;
//# sourceMappingURL=sitemap.js.map