ts-webcrawler
Version:
A TypeScript web crawler library for downloading and parsing web pages
316 lines (315 loc) • 13.3 kB
JavaScript
"use strict";
/**
 * Drives a generator function as if it were an async function and returns a
 * promise for the generator's final return value.
 * (This looks like the helper the TypeScript compiler emits for `async`
 * functions when targeting runtimes without native async/await.)
 *
 * An already-defined `this.__awaiter` is reused when present.
 *
 * @param thisArg    `this` value the generator function is invoked with.
 * @param _arguments Arguments passed to the generator function (defaults to []).
 * @param P          Promise constructor to use; falls back to the global Promise.
 * @param generator  Generator function whose yielded values are awaited.
 * @returns A promise settled with the generator's return value, or rejected
 *          with any error it throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P promise unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the fulfilled value of the awaited promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw the rejection reason into the generator so its try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done; otherwise wait for the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and run it up to its first yield (synchronously).
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// Flag this CommonJS module as having been compiled from an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder for the export; the real value is assigned after the class definition.
exports.BaseCrawler = void 0;
// Project-local dependencies: page model, asset model and debug logger.
const Page_1 = require("../Class/Page");
const Asset_1 = require("../Class/Asset");
const Logger_1 = require("../Class/Logger");
/**
 * Queue-based crawler that loads pages and assets with a bounded number of
 * concurrent "threads" (in-flight loads) and reports progress through hooks.
 *
 * Settings read by this class: `startUrl`, `maxPages`, `maxAssets`,
 * `maxThreads`, `maxSitemaps`, `followInternal`, `followExternal`,
 * `downloadRobotsTxt`, `downloadSitemapXml`, `debug`.
 *
 * Hooks invoked by this class: `onPageLoaded`, `onPageError`,
 * `onAllPagesLoaded`, `onAssetLoaded`, `onAssetError`, `onAllAssetsLoaded`,
 * `onRobotsTxtLoaded`, `onSitemapXmlLoaded`.
 */
class BaseCrawler {
    /**
     * @param settings Crawler configuration (see the class description for the
     *                 fields that are read).
     */
    constructor(settings) {
        this._pages = []; // pages waiting to be loaded
        this._assets = []; // assets waiting to be loaded
        this._pagesCrawled = []; // pages whose load has been started
        this._assetsCrawled = []; // assets that finished loading
        this._pagesToFollow = []; // loaded pages whose links are not enqueued yet
        this._running = false;
        this._activeThreads = 0; // in-flight loads, shared by pages and assets
        this.hooks = {};
        this._settings = settings;
    }
    /**
     * @returns The URLs of all crawled pages, one per line.
     */
    toString() {
        return this._pagesCrawled.map((page) => page.getUrl().toString()).join('\n');
    }
    /** @returns Pages whose load has been started (including failed ones). */
    getCrawledPages() {
        return this._pagesCrawled;
    }
    /** @returns Assets that were loaded successfully. */
    getCrawledAssets() {
        return this._assetsCrawled;
    }
    /** @returns Pages still waiting in the queue. */
    getPages() {
        return this._pages;
    }
    /** @returns Assets still waiting in the queue. */
    getAssets() {
        return this._assets;
    }
    /**
     * Starts the crawl from `settings.startUrl`.
     *
     * `onAllPagesLoaded` / `onAllAssetsLoaded` fire once the corresponding
     * runner has no more work in flight, or immediately when the corresponding
     * `maxPages` / `maxAssets` limit is not a positive number.
     *
     * @param hooks Optional hook callbacks; omitted hooks are simply skipped.
     */
    run(hooks) {
        const startingPage = new Page_1.Page(this._settings.startUrl);
        this._pages.push(startingPage);
        this._running = true;
        // Tolerate a missing hooks argument instead of failing on the first hook lookup.
        this.hooks = hooks || {};
        const notifyAllPagesLoaded = () => {
            if (this.hooks.onAllPagesLoaded)
                this.hooks.onAllPagesLoaded({ crawler: this });
        };
        const notifyAllAssetsLoaded = () => {
            if (this.hooks.onAllAssetsLoaded)
                this.hooks.onAllAssetsLoaded({ crawler: this });
        };
        // Parenthesised on purpose: `x || 0 > 0` parses as `x || (0 > 0)`.
        if ((this._settings.maxPages || 0) > 0) {
            this.crawlingPageRunner().then(notifyAllPagesLoaded);
        }
        else {
            notifyAllPagesLoaded();
        }
        if ((this._settings.maxAssets || 0) > 0) {
            this.crawlingAssetRunner().then(notifyAllAssetsLoaded);
        }
        else {
            notifyAllAssetsLoaded();
        }
        if (this._settings.downloadRobotsTxt || this._settings.downloadSitemapXml) {
            this.crawlingMeta(startingPage);
        }
    }
    /**
     * Loads the starting page's robots.txt and, when `downloadSitemapXml` is
     * set, up to `maxSitemaps` (default 1) sitemap files, calling
     * `onRobotsTxtLoaded` / `onSitemapXmlLoaded` with the outcome.
     *
     * robots.txt is loaded even when only the sitemap was requested, because
     * the sitemap URLs are parsed from it.
     *
     * @param startingPage Page whose origin is used to build the meta URLs.
     * @returns Promise resolved once all meta files have been processed.
     */
    crawlingMeta(startingPage) {
        return __awaiter(this, void 0, void 0, function* () {
            const origin = startingPage.getUrlObject().getOrigin();
            const robotsTxtUrl = origin + "/robots.txt";
            const sitemapXmlUrl = origin + "/sitemap.xml";
            const robots = yield this._loadMetaAsset(robotsTxtUrl);
            if (this.hooks.onRobotsTxtLoaded) {
                this.hooks.onRobotsTxtLoaded({
                    crawler: this,
                    success: robots.success,
                    asset: robots.asset,
                });
            }
            if (!this._settings.downloadSitemapXml) {
                return;
            }
            // Sitemap URLs announced in robots.txt win; otherwise use the conventional location.
            const robotsData = robots.asset.getData();
            const sitemaps = Asset_1.Asset.parseSitemapUrl(robotsData == null ? '' : robotsData.toString());
            if (sitemaps.length === 0) {
                sitemaps.push(sitemapXmlUrl);
            }
            const maxSitemaps = this._settings.maxSitemaps != null ? this._settings.maxSitemaps : 1;
            let foundSitemap = false;
            // Load up to maxSitemaps sitemap files and report every one that succeeds.
            for (let i = 0; i < sitemaps.length && i < maxSitemaps; i++) {
                const sitemap = yield this._loadMetaAsset(sitemaps[i]);
                if (!sitemap.success) {
                    continue;
                }
                foundSitemap = true;
                if (this.hooks.onSitemapXmlLoaded) {
                    this.hooks.onSitemapXmlLoaded({
                        crawler: this,
                        success: true,
                        asset: sitemap.asset,
                    });
                }
            }
            // Report a single failure when none of the candidates could be loaded.
            if (this.hooks.onSitemapXmlLoaded && !foundSitemap) {
                this.hooks.onSitemapXmlLoaded({
                    crawler: this,
                    success: false,
                    asset: null,
                });
            }
        });
    }
    /**
     * Loads a meta file (robots.txt / sitemap) without letting a rejected load
     * escape as an unhandled rejection.
     *
     * @param url URL of the meta file.
     * @returns Promise of `{ asset, success }`; `success` is false when the
     *          load rejected or the response code is missing or >= 400.
     */
    _loadMetaAsset(url) {
        return __awaiter(this, void 0, void 0, function* () {
            const asset = new Asset_1.Asset(url);
            try {
                yield asset.load();
            }
            catch (error) {
                if (this._settings.debug)
                    Logger_1.Logger.log("Meta file failed to load", url + " (" + error + ")");
                return { asset, success: false };
            }
            const code = asset.getCode();
            // A missing response code is treated as 404.
            return { asset, success: (code != null ? code : 404) < 400 };
        });
    }
    /**
     * Enqueues the links of freshly loaded pages and starts page threads while
     * slots are free. Every settled thread re-invokes the runner so the queue
     * keeps draining.
     *
     * @returns Promise resolved when every thread started here, and every
     *          runner chained from those threads, has finished.
     */
    crawlingPageRunner() {
        return __awaiter(this, void 0, void 0, function* () {
            if (this._pagesCrawled.length >= (this._settings.maxPages || 100))
                return;
            // Drain the follow list so each page's links are processed only once
            // (those pages stay in _pagesCrawled, which keeps de-duplication intact).
            for (const page of this._pagesToFollow.splice(0)) {
                if (this._settings.followInternal) {
                    page.getInternalLinks().forEach((url) => this.enquequePage(url));
                }
                if (this._settings.followExternal) {
                    page.getExternalLinks().forEach((url) => this.enquequePage(url));
                }
            }
            const threads = [];
            while ((this._settings.maxThreads || 5) > this._activeThreads && this.canPageCrawl()) {
                threads.push(this.startPageThread().then((page) => {
                    if (!page)
                        return undefined;
                    if (this.hooks.onPageLoaded)
                        this.hooks.onPageLoaded({ page, crawler: this });
                    return this.crawlingPageRunner();
                }).catch((error) => {
                    if (this.hooks.onPageError)
                        this.hooks.onPageError(error.page, error.error);
                    console.log(error);
                    // Keep crawling: a failed page must not stall the remaining queue.
                    return this.crawlingPageRunner();
                }));
            }
            // Waiting here is what lets run() fire onAllPagesLoaded only after the crawl ends.
            yield Promise.all(threads);
        });
    }
    /**
     * Starts asset threads while slots are free. Every settled thread
     * re-invokes the runner so the queue keeps draining.
     *
     * NOTE(review): within this class the asset runner is only started from
     * run() and from its own threads; assets enqueued while no asset thread is
     * active are not picked up unless something else calls this method.
     *
     * @returns Promise resolved when every thread started here, and every
     *          runner chained from those threads, has finished.
     */
    crawlingAssetRunner() {
        return __awaiter(this, void 0, void 0, function* () {
            const threads = [];
            while ((this._settings.maxThreads || 5) > this._activeThreads && this.canAssetCrawl()) {
                threads.push(this.startAssetThread().then((asset) => {
                    if (!asset)
                        return undefined;
                    if (this.hooks.onAssetLoaded)
                        this.hooks.onAssetLoaded({ asset, crawler: this });
                    return this.crawlingAssetRunner();
                }).catch((error) => {
                    if (this.hooks.onAssetError)
                        this.hooks.onAssetError(error.asset, error.error);
                    console.log(error);
                    // Keep crawling: a failed asset must not stall the remaining queue.
                    return this.crawlingAssetRunner();
                }));
            }
            yield Promise.all(threads);
        });
    }
    /**
     * Takes the next page from the queue and loads it.
     *
     * @returns Promise of the loaded page, or of `undefined` when nothing can
     *          be crawled. Rejects with `{ error, page }` when the load fails.
     */
    startPageThread() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.canPageCrawl())
                return undefined;
            // Dequeue before claiming a slot so an empty queue cannot leak a thread.
            const page = this._pages.shift();
            if (!page)
                return undefined;
            this._activeThreads++;
            // Recorded before loading so the page counts towards maxPages and de-duplication.
            this._pagesCrawled.push(page);
            if (this._settings.debug)
                Logger_1.Logger.log("Starts new Page thread", "Threads: " + this._activeThreads);
            try {
                yield page.load();
            }
            catch (error) {
                throw { error, page };
            }
            finally {
                // Release the slot exactly once, whether the load succeeded or not.
                this._activeThreads--;
            }
            this._pagesToFollow.push(page);
            if (this._settings.debug) {
                Logger_1.Logger.log("Page loaded", page.getUrl().toString() + " (TTFB: " + page.getTtfb() + ")", "info" /* LogLevel.Info */);
                Logger_1.Logger.log("End Page thread", "Threads: " + this._activeThreads);
            }
            return page;
        });
    }
    /**
     * Takes the next asset from the queue and loads it.
     *
     * @returns Promise of the loaded asset, or of `undefined` when nothing can
     *          be crawled. Rejects with `{ error, asset }` when the load fails.
     */
    startAssetThread() {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.canAssetCrawl())
                return undefined;
            // Dequeue before claiming a slot so an empty queue cannot leak a thread.
            const asset = this._assets.shift();
            if (!asset)
                return undefined;
            this._activeThreads++;
            if (this._settings.debug)
                Logger_1.Logger.log("Starts new Asset thread", "Threads: " + this._activeThreads);
            try {
                yield asset.load();
            }
            catch (error) {
                throw { error, asset };
            }
            finally {
                // Release the slot exactly once, whether the load succeeded or not.
                this._activeThreads--;
            }
            // Unlike pages, assets are recorded only after a successful load.
            this._assetsCrawled.push(asset);
            if (this._settings.debug) {
                Logger_1.Logger.log("Asset loaded", asset.getUrl().toString(), "info" /* LogLevel.Info */);
                Logger_1.Logger.log("End Asset thread", "Threads: " + this._activeThreads);
            }
            return asset;
        });
    }
    /**
     * @returns True when the crawler is running, a page is queued and the
     *          `maxPages` limit (default 100) has not been reached.
     */
    canPageCrawl() {
        if (!this.canRun())
            return false;
        // canRun() also passes when only assets are queued; require a queued page.
        if (this._pages.length === 0)
            return false;
        return this._pagesCrawled.length < (this._settings.maxPages || 100);
    }
    /**
     * @returns True when the crawler is running, an asset is queued and the
     *          `maxAssets` limit (default 100) has not been reached.
     */
    canAssetCrawl() {
        if (!this.canRun())
            return false;
        // canRun() also passes when only pages are queued; require a queued asset.
        if (this._assets.length === 0)
            return false;
        return this._assetsCrawled.length < (this._settings.maxAssets || 100);
    }
    /**
     * @returns True when the crawler is running and at least one page or asset
     *          is queued.
     */
    canRun() {
        if (!this._running)
            return false;
        return this._pages.length + this._assets.length !== 0;
    }
    /**
     * Kill the crawler: no new threads are started; loads already in flight
     * are not interrupted.
     */
    kill() {
        this._running = false;
    }
    /**
     * Enqueue a page to be crawled unless a page with the same comparable URL
     * is already queued, crawled or waiting to be followed.
     * @param url Url to enqueue
     */
    enquequePage(url) {
        const comparable = url.getComparable();
        const isSamePage = (page) => page.getUrlObject().getComparable() === comparable;
        if (this._pagesCrawled.some(isSamePage) ||
            this._pages.some(isSamePage) ||
            this._pagesToFollow.some(isSamePage))
            return;
        this._pages.push(new Page_1.Page(url));
    }
    /**
     * Enqueue an asset to be crawled unless an asset with the same URL string
     * is already queued or crawled.
     * @param url Url to enqueue
     */
    enquequeAsset(url) {
        const target = url.toString();
        const isSameAsset = (asset) => asset.getUrl().toString() === target;
        if (this._assetsCrawled.some(isSameAsset) || this._assets.some(isSameAsset))
            return;
        this._assets.push(new Asset_1.Asset(url));
    }
}
// Publish the class as the module's named export.
exports.BaseCrawler = BaseCrawler;