UNPKG

ts-webcrawler

Version:

A TypeScript web crawler library for downloading and parsing webpages

github.com/standahorvath/webcrawler

standahorvath/webcrawler

220 lines (219 loc) • 9.12 kB

JavaScript

"use strict";
// TypeScript-emitted helper: runs a generator function as a promise-returning
// coroutine (the down-levelled form of async/await).
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// TypeScript-emitted helper: wraps a CommonJS module so `.default` is always available.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Page = void 0;
const Url_1 = require("./Url");
const node_fetch_1 = __importDefault(require("node-fetch"));
const Regex_1 = require("../Constants/Regex");

/**
 * Removes the <head>, <script> and <style> blocks from an HTML string so that
 * link extraction only looks at the remaining markup.
 * @param {string} html Raw HTML
 * @returns {string} HTML without head, script and style blocks
 */
function stripNonContentMarkup(html) {
    return html
        .replace(/<head(?:\s+[^>]*?)?>[\s\S]*?<\/head>/gi, '')
        .replace(/<script(?:\s+[^>]*?)?(?:\s+type=(['"])(text\/javascript|application\/javascript)\1)?[\s\S]*?<\/script>/gi, '')
        .replace(/<style(?:\s+[^>]*?)?(?:\s+type=(['"])(text\/css)\1|\s+id=['"]\w+['"])?[\s\S]*?<\/style>/gi, '');
}

/**
 * Reads the quoted value of one attribute from the text of a single tag.
 * The closing quote must be the same character as the opening quote, so a
 * value such as content="It's fine" is returned whole.
 * @param {string} tag Text of one tag, e.g. `<meta name="a" content="b">`
 * @param {string} attribute Attribute name; callers pass fixed literals only
 * @returns {string} Attribute value, or '' when the attribute is absent
 */
function readAttribute(tag, attribute) {
    // The leading group keeps `name` from matching inside e.g. `data-name`.
    const pattern = new RegExp(`(?:^|[^\\w-])${attribute}\\s*=\\s*(["'])(.*?)\\1`, 'i');
    const found = tag.match(pattern);
    return found == null ? '' : found[2];
}

/**
 * A single web page: holds its URL, downloads the body on demand and exposes
 * the links, files and a few pieces of metadata parsed from the HTML.
 */
class Page {
    /**
     * @param {Url|string} url Page address; anything that is not already a Url
     *   instance is passed to the Url constructor.
     */
    constructor(url) {
        this.url = url instanceof Url_1.Url ? url : new Url_1.Url(url);
        this.data = null;
        this.code = null;
        this.loaded = false;
        this.links = [];
        this.files = [];
        this._loadingStart = 0;
        this._loadingEnd = 0;
        this.ttfb = 0;
    }

    /** @returns {string} The page URL as a string */
    getUrl() {
        return this.url.toString();
    }

    /** @returns {Url} The page URL object */
    getUrlObject() {
        return this.url;
    }

    /** @returns {string|null} Response body, or null before a successful load */
    getData() {
        return this.data;
    }

    /** @returns {number|null} HTTP status code, or null before a response arrived */
    getCode() {
        return this.code;
    }

    /** @returns {Url[]} Page links found by the last successful load */
    getLinks() {
        return this.links;
    }

    /** @returns {Url[]} Asset links found by the last successful load */
    getFiles() {
        return this.files;
    }

    /**
     * Downloads the page (once) and parses its links and files.
     * Failures are logged and leave the page in its not-loaded state; the
     * returned promise still resolves with this page.
     * @param param0 {onload: (page: Page) => void} Callback invoked with this
     *   page after a successful load
     * @returns {Promise<Page>} Promise resolving to this page
     */
    load({ onload = (page) => { } } = {}) {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.loaded) {
                this._loadingStart = Date.now();
                try {
                    const response = yield (0, node_fetch_1.default)(this.url.toString());
                    this.code = response.status;
                    this.data = yield response.text();
                    this.loaded = true;
                    // NOTE(review): the clock stops after the whole body has been read,
                    // so `ttfb` is the full download time rather than time to first byte.
                    this._loadingEnd = Date.now();
                    this.ttfb = this._loadingEnd - this._loadingStart;
                    const processedData = this.processData() || { links: [], files: [] };
                    this.links = processedData.links;
                    this.files = processedData.files;
                    onload(this);
                }
                catch (error) {
                    console.log(error);
                }
            }
            return this;
        });
    }

    /** @returns {Url[]} Links whose host equals this page's host */
    getInternalLinks() {
        return this.links.filter((url) => url.getHost() === this.url.getHost());
    }

    /** @returns {Url[]} Links whose host differs from this page's host */
    getExternalLinks() {
        return this.links.filter((url) => url.getHost() !== this.url.getHost());
    }

    /** @returns {number} Milliseconds measured by the last load (0 before any load) */
    getTtfb() {
        return this.ttfb;
    }

    /**
     * @returns {string|null} Value of the `lang` attribute on the <html> tag,
     *   or null when the page is not loaded or has no such attribute
     */
    getLang() {
        if (!this.loaded || this.data == null)
            return null;
        const match = this.data.match(/<html(?:\s+[^>]*?)?\s+lang=["']([\w-]+)["']/i);
        if (match == null)
            return null;
        return match[1];
    }

    /**
     * @returns {string|null} Text of the first match of the title regex with
     *   all tags removed, or null when not loaded or not found
     */
    getTitleTag() {
        if (!this.loaded || this.data == null)
            return null;
        const match = this.data.match(Regex_1.titleTag);
        if (match == null)
            return null;
        return match[0].replace(/<[^>]*>/g, '');
    }

    /**
     * @param {string} name Value of the meta tag's `name` attribute
     * @returns {string|null} The tag's `content`, or null when no tag has that name
     */
    getMetaTag(name) {
        const metaTags = this.getMetaTags();
        const metaTag = metaTags.find((candidate) => candidate.name === name);
        return metaTag == null ? null : metaTag.content;
    }

    /**
     * Collects the name, content and property attributes of every meta tag.
     * @returns {{name: string, content: string, property: string}[]} One entry
     *   per matched tag; missing attributes are ''. Empty when not loaded.
     */
    getMetaTags() {
        if (!this.loaded || this.data == null)
            return [];
        const metaTags = [];
        for (const match of this.data.matchAll(Regex_1.metaTag)) {
            const tag = match[0];
            metaTags.push({
                name: readAttribute(tag, 'name'),
                content: readAttribute(tag, 'content'),
                property: readAttribute(tag, 'property')
            });
        }
        return metaTags;
    }

    /**
     * Extracts every link from the loaded data and splits them by kind.
     * @returns {{links: Url[], files: Url[]}|null} Valid page links and valid
     *   asset links, or null when there is no data
     */
    processData() {
        if (this.data == null)
            return null;
        const absoluteLinks = this.processAbsoluteLinks() || [];
        const relativeLinks = this.processRelativeLinks() || [];
        const allLinks = [...absoluteLinks, ...relativeLinks];
        return {
            links: allLinks.filter((url) => url.isValid && url.isPage),
            files: allLinks.filter((url) => url.isValid && url.isAsset)
        };
    }

    /**
     * Finds absolute URLs outside the head, script and style blocks.
     * @returns {Url[]|null} One Url per match, or null when there is no data
     */
    processAbsoluteLinks() {
        if (this.data == null)
            return null;
        const clearedData = stripNonContentMarkup(this.data);
        return [...clearedData.matchAll(Regex_1.absoluteUrl)].map((match) => new Url_1.Url(match[0]));
    }

    /**
     * Finds relative URLs outside the head, script and style blocks and
     * resolves them against this page's origin and folder.
     * @returns {Url[]|null} One Url per kept match, or null when there is no data
     */
    processRelativeLinks() {
        if (this.data == null)
            return null;
        const clearedData = stripNonContentMarkup(this.data);
        // Prefixes that mark a match as something other than a real link.
        const ignoredPrefixes = ['#', 'javascript:', 'mailto:', 'tel:', 'data:'];
        return [...clearedData.matchAll(Regex_1.relativeUrl)]
            .filter((match) => !ignoredPrefixes.some((prefix) => match[1].startsWith(prefix)))
            .map((match) => {
                // match[1] is the file path, e.g. /path/to/file
                let file = match[1];
                if (file.startsWith('./'))
                    file = file.replace('./', '');
                let path = '';
                if (!file.startsWith('/')) {
                    // Relative to the current folder.
                    file = '/' + file;
                    const folder = this.url.getFolder();
                    path = (folder !== null && folder !== void 0 ? folder : '') + file;
                }
                else {
                    // Already rooted at the origin.
                    path = file;
                }
                return new Url_1.Url(this.url.getOrigin() + path.replace(/\/\//, '/'));
            });
    }

    /**
     * Reduces an HTML document to its visible text.
     * Head, script and style blocks and HTML comments are dropped before the
     * remaining tags are stripped, so their contents never reach the output.
     * @param {string} html Raw HTML
     * @returns {string} Text with tags and entities removed and whitespace collapsed
     */
    static extractWords(html) {
        // Remove whole blocks first; once the tags are gone they cannot be found.
        let cleanHtml = html
            .replace(/<head\b[\s\S]*?<\/head>/gi, ' ')
            .replace(/<script[\s\S]*?<\/script>/gi, ' ')
            .replace(/<style[\s\S]*?<\/style>/gi, ' ')
            .replace(/<!--[\s\S]*?-->/g, ' ');
        // Remove all remaining tags and attributes
        cleanHtml = cleanHtml.replace(/<[^>]+>/g, '');
        // Remove leftover style attributes and comment terminators
        cleanHtml = cleanHtml.replace(/style\s*=\s*"[^"]*"/gi, ' ').replace(/-->/g, ' ');
        // Remove character entities only; a bare "&" followed by text is kept
        cleanHtml = cleanHtml.replace(/&(?:#\d+|#x[\da-f]+|[a-z][\da-z]*);/gi, '');
        // Collapse whitespace last so earlier replacements cannot leave gaps
        return cleanHtml.replace(/\s+/g, ' ').trim();
    }
}
exports.Page = Page;