UNPKG

@nodefony/monitoring-bundle

Version:
202 lines (194 loc) 5.73 kB
// var request = require('request'); const cheerio = require("cheerio"); const http = require("http"); const https = require("https"); const async = require("async"); const makeRequestHttp = function (link, context, callback) { this.log(`REQUEST : ${link}`, "DEBUG"); const myurl = url.parse(link); // cookie session const headers = {}; if (context.session) { headers.Cookie = `${context.session.name}=${context.session.id}`; } const options = { hostname: myurl.hostname, port: myurl.port, path: myurl.path, method: "GET", headers }; let wrapper = http.request; // console.log(options) let keepAliveAgent = null; // https if (myurl.protocol === "https:") { // keepalive if multiple request in same socket keepAliveAgent = new https.Agent({ keepAlive: true }); // certificat nodefony.extend(options, { key: this.serverHttps.key, cert: this.serverHttps.cert, rejectUnauthorized: false, requestCert: true, agent: keepAliveAgent }); wrapper = https.request; } else { // keepalive keepAliveAgent = new http.Agent({ keepAlive: true }); options.agent = keepAliveAgent; } const req = wrapper(options, (res) => { let bodyRaw = ""; res.setEncoding("utf8"); res.on("data", (chunk) => { // this.log( chunk, "DEBUG"); bodyRaw += chunk; }); res.on("end", () => { parseLink.call(this, link, bodyRaw, callback); }); }); req.on("error", (e) => { this.log(`Problem with request: ${e.message}`, "ERROR"); }); req.end(); }; const parseLink = function (crawlUrl, body, callback) { const pageObject = {}; pageObject.links = []; if ((/^\//).test(crawlUrl)) { pageObject.url = this.protocol + this.base + crawlUrl; } else { pageObject.url = crawlUrl; } const $ = cheerio.load(body, { ignoreWhitespace: true }); pageObject.title = $("title").text(); pageObject.selector = $; // find link $("a").each((i, elem) => { // console.log(elem.attribs.href) if (elem.attribs.href === "#" || elem.attribs.href === "/") { return; } let href = null; if ((/^\//).test(elem.attribs.href)) { href = url.parse(this.protocol + this.base + elem.attribs.href); } else if (elem.attribs.href) { href = url.parse(elem.attribs.href); } else { href = null; } if (href) { pageObject.links.push({ linkText: $(elem).text(), linkUrl: href }); } }); callback(null, pageObject); }; const myLoop = function (link, context, finish, recurse) { if (this.crawled[link]) { if (this.crawled[link].page) { finish(null, this.crawled); return; } } makeRequestHttp.call(this, link, context, (error, pageObject) => { if (error) { return; } this.crawled[pageObject.url] = []; this.crawled[pageObject.url].page = pageObject; async.eachSeries(pageObject.links, (item, cb) => { if (item.linkUrl) { // test if the url actually points to the same domain if (item.linkUrl.host === this.base) { if (!item.linkUrl.hash) { this.crawled[pageObject.url].push(item.linkUrl.href); } } } cb(null); }, (error) => { if (!error) { for (let i = 0; i < this.crawled[pageObject.url].length; i++) { // console.log( this.crawled[pageObject.url] ) if (this.crawled[pageObject.url][i] in this.crawled) { continue; } else { recurse++; this.crawled[this.crawled[pageObject.url][i]] = []; myLoop.call(this, this.crawled[pageObject.url][i], context, () => { recurse--; if (recurse === 0) { // console.log("FINISH") finish(error, this.crawled); } }, 0); } } } if (recurse === 0) { // console.log( "FINISH 2" ) finish(error, this.crawled); } }); }); }; module.exports = class webCrawler extends nodefony.Service { constructor (container, kernel) { super("WEBCRAWLER", container, container.get("notificationsCenter")); this.kernel = kernel; this.crawled = {}; this.elastic = null; this.serverHttps = this.get("httpsServer"); this.once("onReady", () => { this.elastic = this.kernel.getBundle("documentation").elastic; }); } siteAll (urlBase, search, context, callback) { const recurse = 0; const Link = url.parse(urlBase); this.base = Link.host; this.protocol = Link.protocol ? `${Link.protocol}//` : "http://"; if (this.elastic) { myLoop.call(this, urlBase, context, (/* error, crawled*/) => {}); } else { myLoop.call(this, urlBase, context, (error, crawled) => { // console.log(crawled) const obj = {}; try { for (const page in crawled) { if (crawled && crawled[page] && crawled[page].page && crawled[page].page.selector) { const text = crawled[page].page.selector("body").text(); if (!text) { continue; } // var index = text.indexOf(search) ; const reg = new RegExp(search, "gi"); const index = text.search(reg); if (index !== -1) { obj[crawled[page].page.url] = { text: `...${text.substring(index - 100, index + 100)}...`, title: crawled[page].page.title }; } } } } catch (e) { this.log(e, "ERROR"); } callback(obj); }, recurse); } } };