UNPKG

ts-webcrawler

Version:

A typescript webcrawler library for downloading and parsing webpages

316 lines (315 loc) 13.3 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.BaseCrawler = void 0; const Page_1 = require("../Class/Page"); const Asset_1 = require("../Class/Asset"); const Logger_1 = require("../Class/Logger"); class BaseCrawler { constructor(settings) { this._pages = []; this._assets = []; this._pagesCrawled = []; this._assetsCrawled = []; this._pagesToFollow = []; this._running = false; this._activeThreads = 0; this.hooks = {}; this._settings = settings; } toString() { return this._pagesCrawled.map((page) => page.getUrl().toString()).join('\n'); } getCrawledPages() { return this._pagesCrawled; } getCrawledAssets() { return this._assetsCrawled; } getPages() { return this._pages; } getAssets() { return this._assets; } run(hooks) { const startingPage = new Page_1.Page(this._settings.startUrl); this._pages.push(startingPage); this._running = true; this.hooks = hooks; // Start crawling if (this._settings.maxPages || 0 > 0) { this.crawlingPageRunner().then(() => { if (this.hooks.onAllPagesLoaded) this.hooks.onAllPagesLoaded({ crawler: this }); }); } else { if (this.hooks.onAllPagesLoaded) this.hooks.onAllPagesLoaded({ crawler: this }); } if (this._settings.maxAssets || 0 > 0) { this.crawlingAssetRunner().then(() => { if (this.hooks.onAllAssetsLoaded) this.hooks.onAllAssetsLoaded({ crawler: this }); }); } else { if (this.hooks.onAllAssetsLoaded) this.hooks.onAllAssetsLoaded({ crawler: this }); } if (this._settings.downloadRobotsTxt || this._settings.downloadSitemapXml) { this.crawlingMeta(startingPage); } //onPageLoaded({ page: new Page(this._settings.startUrl), crawler: this }) //onAssetLoaded({ asset: new Asset(this._settings.startUrl), crawler: this }) //onQueueEmpty({ enqueuePage: this.enquequePage, enqueueAsset: this.enquequeAsset, crawler: this }) } /** * Crawls the starting page's robots.txt and sitemap.xml files. * Calls the appropriate hooks when they are loaded. */ crawlingMeta(startingPage) { var _a, _b, _c, _d, _e; return __awaiter(this, void 0, void 0, function* () { // Get the robots.txt and sitemap.xml URLs based on the starting page's origin. const robotsTxtUrl = startingPage.getUrlObject().getOrigin() + "/robots.txt"; const sitemapXmlUrl = startingPage.getUrlObject().getOrigin() + "/sitemap.xml"; let foundSitemap = false; // Load the robots.txt file. const robotsAsset = new Asset_1.Asset(robotsTxtUrl); yield robotsAsset.load(); const responseCode = (_a = robotsAsset.getCode()) !== null && _a !== void 0 ? _a : 404; // Call the onRobotsTxtLoaded hook with the appropriate parameters. if (this.hooks.onRobotsTxtLoaded) { this.hooks.onRobotsTxtLoaded({ crawler: this, success: responseCode < 400, asset: robotsAsset, }); } // If we're not downloading the sitemap.xml file, return early. if (!this._settings.downloadSitemapXml) { return; } // Parse the sitemap URLs from the robots.txt file. const sitemaps = Asset_1.Asset.parseSitemapUrl((_c = (_b = robotsAsset.getData()) === null || _b === void 0 ? void 0 : _b.toString()) !== null && _c !== void 0 ? _c : ''); if (sitemaps.length === 0) { sitemaps.push(sitemapXmlUrl); } // Loop through the sitemap URLs and load the first one that returns a success response code. for (let i = 0; i < sitemaps.length && i < ((_d = this._settings.maxSitemaps) !== null && _d !== void 0 ? _d : 1); i++) { const sitemapUrl = sitemaps[i]; const sitemapAsset = new Asset_1.Asset(sitemapUrl); yield sitemapAsset.load(); const responseCode = (_e = sitemapAsset.getCode()) !== null && _e !== void 0 ? _e : 404; if (responseCode < 400) { // Call the onSitemapXmlLoaded hook with the appropriate parameters. if (this.hooks.onSitemapXmlLoaded) { this.hooks.onSitemapXmlLoaded({ crawler: this, success: true, asset: sitemapAsset, }); } foundSitemap = true; } } // Call the onSitemapXmlLoaded hook with the appropriate parameters. if (this.hooks.onSitemapXmlLoaded && !foundSitemap) { this.hooks.onSitemapXmlLoaded({ crawler: this, success: false, asset: null, }); } }); } crawlingPageRunner() { return __awaiter(this, void 0, void 0, function* () { if (this._pagesCrawled.length >= (this._settings.maxPages || 100)) return Promise.resolve(); this._pagesToFollow.forEach((page) => { if (this._settings.followInternal) { page.getInternalLinks().forEach((url) => { this.enquequePage(url); }); } if (this._settings.followExternal) { page.getExternalLinks().forEach((url) => { this.enquequePage(url); }); } }); while ((this._settings.maxThreads || 5) > this._activeThreads && this.canPageCrawl()) { this.startPageThread().then((page) => { if (!page) return Promise.resolve(); if (this.hooks.onPageLoaded) this.hooks.onPageLoaded({ page, crawler: this }); this.crawlingPageRunner(); //console.log(page) }).catch((error) => { if (this.hooks.onPageError) this.hooks.onPageError(error.page, error.error); console.log(error); }); } Promise.resolve(); }); } crawlingAssetRunner() { return __awaiter(this, void 0, void 0, function* () { while ((this._settings.maxThreads || 5) > this._activeThreads && this.canAssetCrawl()) { this.startAssetThread().then((asset) => { if (!asset) return Promise.resolve(); if (this.hooks.onAssetLoaded) this.hooks.onAssetLoaded({ asset, crawler: this }); this.crawlingAssetRunner(); //console.log(asset) }).catch((error) => { if (this.hooks.onAssetError) this.hooks.onAssetError(error.asset, error.error); console.log(error); }); } return Promise.resolve(); }); } startPageThread() { return __awaiter(this, void 0, void 0, function* () { // Check if crawler can run if (!this.canPageCrawl()) return Promise.resolve(); // Increment active threads this._activeThreads++; // Get next page const page = this._pages.shift(); // Check if page exists if (!page) return Promise.resolve(); // Add page to crawled pages this._pagesCrawled.push(page); // Debug start if (this._settings.debug) Logger_1.Logger.log("Starts new Page thread", "Threads: " + this._activeThreads); try { // Load page yield page.load(); // Add page to pages to follow this._pagesToFollow.push(page); // Decrement active threads this._activeThreads--; if (this._settings.debug) Logger_1.Logger.log("Page loaded", page.getUrl().toString() + " (TTFB: " + page.getTtfb() + ")", "info" /* LogLevel.Info */); if (this._settings.debug) Logger_1.Logger.log("End Page thread", "Threads: " + this._activeThreads); return Promise.resolve(page); } catch (error) { // Decrement active threads this._activeThreads--; return Promise.reject({ error, page }); } }); } startAssetThread() { return __awaiter(this, void 0, void 0, function* () { // Check if crawler can run if (!this.canAssetCrawl()) return Promise.resolve(); // Increment active threads this._activeThreads++; // Get next asset const asset = this._assets.shift(); // Debug message if (this._settings.debug) Logger_1.Logger.log("Starts new Asset thread", "Threads: " + this._activeThreads); // Check if asset exists if (!asset) return Promise.resolve(); try { // Load asset yield asset.load(); // Add asset to crawled assets this._assetsCrawled.push(asset); // Decrement active threads this._activeThreads--; if (this._settings.debug) Logger_1.Logger.log("Asset loaded", asset.getUrl().toString(), "info" /* LogLevel.Info */); if (this._settings.debug) Logger_1.Logger.log("End Asset thread", "Threads: " + this._activeThreads); // Return asset return Promise.resolve(asset); } catch (error) { // Decrement active threads this._activeThreads--; // Return error return Promise.reject({ error, asset }); } }); } canPageCrawl() { if (!this.canRun()) return false; if (this._pagesCrawled.length >= (this._settings.maxPages || 100)) return false; return true; } canAssetCrawl() { if (!this.canRun()) return false; if (this._assetsCrawled.length >= (this._settings.maxAssets || 100)) return false; return true; } canRun() { if (!this._running) return false; if (this._pages.length + this._assets.length === 0) return false; return true; } /** * Kill the crawler (stop all activity) */ kill() { this._running = false; } /** * Enqueue a page to be crawled * @param url Url to enqueue */ enquequePage(url) { const comparable = url.getComparable(); if (this._pagesCrawled.find((page) => page.getUrlObject().getComparable() === comparable)) return; if (this._pages.find((page) => page.getUrlObject().getComparable() === comparable)) return; if (this._pagesToFollow.find((page) => page.getUrlObject().getComparable() === comparable)) return; this._pages.push(new Page_1.Page(url)); } /** * Enqueue an asset to be crawled * @param url Url to enqueue */ enquequeAsset(url) { if (this._assetsCrawled.find((asset) => asset.getUrl().toString() === url.toString())) return; if (this._assets.find((asset) => asset.getUrl().toString() === url.toString())) return; this._assets.push(new Asset_1.Asset(url)); } } exports.BaseCrawler = BaseCrawler;