ts-webcrawler
Version:
A TypeScript web crawler library for downloading and parsing web pages
68 lines (67 loc) • 2.83 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
const Page_1 = require("./Class/Page");
const BaseCrawler_1 = require("./Crawlers/BaseCrawler");
const Seo_1 = require("./Class/Seo");
/**
 * Builds a `BaseCrawler` with a fixed, hard-coded configuration and starts it
 * with callbacks that print progress and per-page analysis to the console.
 *
 * Takes no arguments and returns nothing; whatever `crawler.run` returns is
 * discarded.
 */
const bootstrap = () => {
  // Yields 'None' when the asset itself, or the value read from it, is
  // null/undefined; any other value (including 0 or '') is passed through.
  const readOrNone = (asset, read) => {
    if (asset == null) {
      return 'None';
    }
    const value = read(asset);
    return value == null ? 'None' : value;
  };

  const crawler = new BaseCrawler_1.BaseCrawler({
    startUrl: 'https://www.archiweb.cz/vladimir-balda-architekt',
    maxPages: 1,
    maxAssets: 50,
    maxDepth: 2,
    followExternal: false,
    followInternal: true,
    assetFolder: 'assets',
    debug: true,
    downloadRobotsTxt: true,
    downloadSitemapXml: true,
    maxSitemaps: 2,
  });

  crawler.run({
    // Prints the extracted words, heading tags and alt-less images of a page.
    onPageLoaded: ({ page }) => {
      // Read the page data once; a falsy result is treated as an empty document.
      const html = page.getData() || '';
      console.log(Page_1.Page.extractWords(html));
      console.log(Seo_1.Seo.extractHeadingTags(html));
      console.log(Seo_1.Seo.extractImagesWithoutAlt(html));
    },
    onAllPagesLoaded: () => {
      console.log(`All pages loaded`);
    },
    onAssetLoaded: ({ asset }) => {
      console.log(`Asset loaded: ${asset.getUrl()}`);
    },
    onAllAssetsLoaded: () => {
      console.log(`All assets loaded`);
    },
    // The error value is logged next to the URL so the failure cause is not lost.
    onPageError: (url, error) => {
      console.log(`Error loading page: ${url}`, error);
    },
    onAssetError: (url, error) => {
      console.log(`Error loading asset: ${url}`, error);
    },
    onRobotsTxtLoaded: ({ asset, success }) => {
      if (!success) {
        console.log(`Robots.txt not loaded`);
        return;
      }
      console.log(`Robots.txt loaded: ${readOrNone(asset, (a) => a.getUrl())}`);
    },
    onSitemapXmlLoaded: ({ asset, success }) => {
      if (!success) {
        console.log(`Sitemap.xml not loaded`);
        return;
      }
      const url = readOrNone(asset, (a) => a.getUrl());
      const code = readOrNone(asset, (a) => a.getCode());
      console.log(`Sitemap.xml loaded: ${url} Code: ${code}`);
    },
    // NOTE(review): an `onQueueEmpty({ enqueuePage, enqueueAsset, crawler })`
    // handler that logged `Queue empty` was previously stubbed out here; it is
    // not registered.
  });
};
// Start the crawl immediately when this module is evaluated.
bootstrap();