// ts-webcrawler
// Version:
// A TypeScript web crawler library for downloading and parsing web pages.
// 220 lines (219 loc) • 9.12 kB
// JavaScript
;
// Runtime helper that drives a generator function as if it were an async function
// (this matches the helper the TypeScript compiler emits for down-levelled async code).
// An already-defined `this.__awaiter` is reused when present.
//   thisArg, _arguments - forwarded to `generator.apply`
//   P                   - promise constructor to use; falls back to the global Promise
//   generator           - generator function whose yielded values are awaited
// Returns a promise that settles with the generator's return value or its uncaught error.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in a P promise unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with a fulfilled value; an exception escaping the generator rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw the rejection reason into the generator so its own try/catch can observe it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Resolve with the final value once the generator is done; otherwise wait on the yielded value.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator with the given `this`/arguments and run it to its first yield.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
// Interop helper for default imports: a module flagged with `__esModule` is
// returned untouched, anything else is wrapped so that it is reachable as `.default`.
// An already-defined `this.__importDefault` is reused when present.
var __importDefault = (this && this.__importDefault) || function (imported) {
    if (imported && imported.__esModule) {
        return imported;
    }
    return { "default": imported };
};
// Flag the exports object with `__esModule` (the same flag __importDefault checks on imported modules).
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the Page export up front; the class itself is assigned to it at the end of the file.
exports.Page = void 0;
// Project-local Url class used for the page address and for every extracted link.
const Url_1 = require("./Url");
// HTTP client used by Page.load to download the page.
const node_fetch_1 = __importDefault(require("node-fetch"));
// Shared regular expressions; this file uses titleTag, metaTag, absoluteUrl and relativeUrl.
const Regex_1 = require("../Constants/Regex");
/**
 * Returns the value of a quoted attribute inside a single HTML tag string.
 * The closing quote must be the same character as the opening one, so a value
 * such as "It's here" is returned whole. The attribute name must not be the
 * tail of a longer name (e.g. `name` does not match inside `data-name`).
 * @param {string} tag Source text of one tag, e.g. `<meta name="x" content="y">`
 * @param {string} attribute Attribute name (plain identifier, matched case-insensitively)
 * @returns {string} Attribute value, or '' when the attribute is absent
 */
function readQuotedAttribute(tag, attribute) {
    const pattern = new RegExp(`(?:^|[^\\w-])${attribute}\\s*=\\s*(["'])([\\s\\S]*?)\\1`, 'i');
    const found = tag.match(pattern);
    return found == null ? '' : found[2];
}
/**
 * Removes the <head> section and every <script> / <style> element from the
 * markup so that link extraction only sees body content.
 * The script/style patterns match from the opening tag name up to the first
 * closing tag, which is the same span the previous, longer patterns matched.
 * @param {string} html Raw page markup
 * @returns {string} Markup without head, script and style sections
 */
function stripNonContentMarkup(html) {
    return html
        .replace(/<head(?:\s+[^>]*?)?>[\s\S]*?<\/head>/gi, '')
        .replace(/<script[\s\S]*?<\/script>/gi, '')
        .replace(/<style[\s\S]*?<\/style>/gi, '');
}
/**
 * A single web page: downloads it, keeps the response body and status code,
 * and exposes the links, files and metadata found in the markup.
 */
class Page {
    /**
     * @param url Page address; a Url instance is stored as-is, any other value is passed to the Url constructor
     */
    constructor(url) {
        this.url = url instanceof Url_1.Url ? url : new Url_1.Url(url);
        this.data = null;
        this.code = null;
        this.loaded = false;
        this.links = [];
        this.files = [];
        this._loadingStart = 0;
        this._loadingEnd = 0;
        this.ttfb = 0;
    }
    /** @returns Page address as a string */
    getUrl() {
        return this.url.toString();
    }
    /** @returns The underlying Url object */
    getUrlObject() {
        return this.url;
    }
    /** @returns Response body, or null before a successful load */
    getData() {
        return this.data;
    }
    /** @returns HTTP status code, or null before a response was received */
    getCode() {
        return this.code;
    }
    /** @returns Page links collected by the last successful load */
    getLinks() {
        return this.links;
    }
    /** @returns Asset links collected by the last successful load */
    getFiles() {
        return this.files;
    }
    /**
     * Method loads page and returns promise with loaded page.
     * A page that is already loaded is not fetched again and `onload` is not called.
     * Fetch/processing errors are logged and the (unloaded) page is still returned.
     * @param param0 {onload: (page: Page) => void} Callback function that is called when page is loaded
     * @returns Promise resolving to this page
     */
    load({ onload = (page) => { } } = {}) {
        return __awaiter(this, void 0, void 0, function* () {
            if (this.loaded)
                return this;
            this._loadingStart = Date.now();
            try {
                const response = yield (0, node_fetch_1.default)(this.url.toString());
                // The fetch promise settles as soon as the response headers are in,
                // before the body is read - that is the moment "time to first byte"
                // refers to, so it is recorded here rather than after text().
                this.ttfb = Date.now() - this._loadingStart;
                this.code = response.status;
                this.data = yield response.text();
                this.loaded = true;
                this._loadingEnd = Date.now();
                const processedData = this.processData() || { links: [], files: [] };
                this.links = processedData.links;
                this.files = processedData.files;
                onload(this);
            }
            catch (error) {
                // Best effort: a failed download must not reject the returned promise.
                console.log(error);
            }
            return this;
        });
    }
    /** @returns Links whose host equals the host of this page */
    getInternalLinks() {
        return this.links.filter((url) => url.getHost() === this.url.getHost());
    }
    /** @returns Links whose host differs from the host of this page */
    getExternalLinks() {
        return this.links.filter((url) => url.getHost() !== this.url.getHost());
    }
    /** @returns Milliseconds between starting the request and receiving the response (0 before load) */
    getTtfb() {
        return this.ttfb;
    }
    /** @returns Value of the `lang` attribute of the <html> tag, or null when unavailable */
    getLang() {
        if (!this.loaded || this.data == null)
            return null;
        const match = this.data.match(/<html(?:\s+[^>]*?)?\s+lang=["']([\w-]+)["']/i);
        return match == null ? null : match[1];
    }
    /** @returns Text of the first match of the shared titleTag pattern with tags stripped, or null */
    getTitleTag() {
        if (!this.loaded || this.data == null)
            return null;
        const match = this.data.match(Regex_1.titleTag);
        return match == null ? null : match[0].replace(/<[^>]*>/g, '');
    }
    /**
     * @param name Value of the `name` attribute to look for
     * @returns `content` of the first meta tag with that name, or null when there is none
     */
    getMetaTag(name) {
        const metaTag = this.getMetaTags().find((candidate) => candidate.name === name);
        return metaTag == null ? null : metaTag.content;
    }
    /**
     * Method returns every meta tag of the loaded page.
     * Missing attributes are reported as empty strings.
     * @returns {{name: string, content: string, property: string}[]}
     */
    getMetaTags() {
        if (!this.loaded || this.data == null)
            return [];
        // matchAll always yields an iterator (never null), so no null check is needed.
        return [...this.data.matchAll(Regex_1.metaTag)].map((match) => ({
            name: readQuotedAttribute(match[0], 'name'),
            content: readQuotedAttribute(match[0], 'content'),
            property: readQuotedAttribute(match[0], 'property'),
        }));
    }
    /**
     * Method collects absolute and relative links and splits them into pages and assets
     * @returns {{links: Url[], files: Url[]} | null} Null when no data has been downloaded
     */
    processData() {
        if (this.data == null)
            return null;
        const absoluteLinks = this.processAbsoluteLinks() || [];
        const relativeLinks = this.processRelativeLinks() || [];
        const allLinks = [...absoluteLinks, ...relativeLinks];
        // NOTE(review): isValid / isPage / isAsset are read as plain properties here;
        // presumably boolean fields on Url - confirm against the Url class.
        return {
            links: allLinks.filter((url) => url.isValid && url.isPage),
            files: allLinks.filter((url) => url.isValid && url.isAsset)
        };
    }
    /**
     * Method returns Url objects for every absolute URL found outside head/script/style
     * @returns {Url[] | null} Null when no data has been downloaded
     */
    processAbsoluteLinks() {
        if (this.data == null)
            return null;
        const content = stripNonContentMarkup(this.data);
        return [...content.matchAll(Regex_1.absoluteUrl)].map((match) => new Url_1.Url(match[0]));
    }
    /**
     * Method returns Url objects for every relative URL found outside head/script/style,
     * resolved against the origin (and folder) of this page
     * @returns {Url[] | null} Null when no data has been downloaded
     */
    processRelativeLinks() {
        if (this.data == null)
            return null;
        // Targets that are not navigable documents or files
        const ignoredPrefixes = ['#', 'javascript:', 'mailto:', 'tel:', 'data:'];
        const content = stripNonContentMarkup(this.data);
        return [...content.matchAll(Regex_1.relativeUrl)]
            // match[1] is file path, e.g. /path/to/file
            .map((match) => match[1])
            .filter((file) => !ignoredPrefixes.some((prefix) => file.startsWith(prefix)))
            .map((rawFile) => {
                let file = rawFile.startsWith('./') ? rawFile.replace('./', '') : rawFile;
                let path = '';
                if (!file.startsWith('/')) {
                    // Document-relative path: resolve against the folder of this page
                    const folder = this.url.getFolder();
                    file = '/' + file;
                    path = (folder !== null && folder !== void 0 ? folder : '') + file;
                }
                else {
                    // Root-relative path: used as-is
                    path = file;
                }
                // Collapse the first doubled slash (e.g. folder ending in "/" joined with "/file")
                return new Url_1.Url(this.url.getOrigin() + path.replace(/\/\//, '/'));
            });
    }
    /**
     * Method reduces markup to its visible text, with words separated by single spaces.
     * Comments, the head section, scripts and style blocks are dropped together with
     * their content BEFORE the remaining tags are stripped, so CSS rules and commented-out
     * markup do not leak into the result. Tags and character references are replaced by
     * a space so that neighbouring words do not merge.
     * @param {string} html Markup to clean (null/undefined gives an empty string)
     * @returns {string} Plain text
     */
    static extractWords(html) {
        if (html == null)
            return '';
        return String(html)
            // Whole blocks whose content is not visible text
            .replace(/<!--[\s\S]*?-->/g, ' ')
            .replace(/<head\b[\s\S]*?<\/head>/gi, ' ')
            .replace(/<script\b[\s\S]*?<\/script>/gi, ' ')
            .replace(/<style\b[\s\S]*?<\/style>/gi, ' ')
            // Remaining tags (their attributes, including style="...", go with them)
            .replace(/<[^>]+>/g, ' ')
            // Leftover comment terminators from unbalanced comments
            .replace(/-->/g, ' ')
            // Character references only (&name; &#123; &#x1F;), not arbitrary "& ... ;" text
            .replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/gi, ' ')
            // Normalise whitespace last so the replacements above cannot leave double spaces
            .replace(/\s+/g, ' ')
            .trim();
    }
}
exports.Page = Page;