UNPKG

ts-webcrawler

Version:

A TypeScript web crawler library for downloading and parsing web pages

68 lines (67 loc) 2.83 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const Page_1 = require("./Class/Page"); const BaseCrawler_1 = require("./Crawlers/BaseCrawler"); const Seo_1 = require("./Class/Seo"); const bootstrap = () => { const crawler = new BaseCrawler_1.BaseCrawler({ startUrl: 'https://www.archiweb.cz/vladimir-balda-architekt', maxPages: 1, maxAssets: 50, maxDepth: 2, followExternal: false, followInternal: true, assetFolder: 'assets', debug: true, downloadRobotsTxt: true, downloadSitemapXml: true, maxSitemaps: 2, }); crawler.run({ onPageLoaded: ({ page, crawler }) => { //console.log(`Page loaded: ${page.getUrl()}`) //Logger.log("Page Title", page.getTitleTag() || "None", LogLevel.Info) //Logger.log("Page Description", page.getMetaTag("description") || "None", LogLevel.Info) console.log(Page_1.Page.extractWords(page.getData() || "")); console.log(Seo_1.Seo.extractHeadingTags(page.getData() || "")); console.log(Seo_1.Seo.extractImagesWithoutAlt(page.getData() || "")); }, onAllPagesLoaded: ({ crawler }) => { console.log(`All pages loaded`); }, onAssetLoaded: ({ asset, crawler }) => { console.log(`Asset loaded: ${asset.getUrl()}`); }, onAllAssetsLoaded: ({ crawler }) => { console.log(`All assets loaded`); }, onPageError: (url, error) => { console.log(`Error loading page: ${url}`); }, onAssetError: (url, error) => { console.log(`Error loading asset: ${url}`); }, onRobotsTxtLoaded: ({ asset, crawler, success }) => { var _a; if (success) { console.log(`Robots.txt loaded: ${(_a = asset === null || asset === void 0 ? void 0 : asset.getUrl()) !== null && _a !== void 0 ? _a : 'None'}`); } else { console.log(`Robots.txt not loaded`); } }, onSitemapXmlLoaded: ({ asset, crawler, success }) => { var _a, _b; if (success) { console.log(`Sitemap.xml loaded: ${(_a = asset === null || asset === void 0 ? void 0 : asset.getUrl()) !== null && _a !== void 0 ? _a : 'None'} Code: ${(_b = asset === null || asset === void 0 ? 
void 0 : asset.getCode()) !== null && _b !== void 0 ? _b : 'None'}`); } else { console.log(`Sitemap.xml not loaded`); } }, /*onQueueEmpty: ({ enqueuePage, enqueueAsset, crawler}) => { console.log(`Queue empty`) }*/ }); }; bootstrap();