UNPKG

@ynetlabo/htmlanalyzer

Version:

A tool, distributed as an npm package, for easily analyzing an HTML document.

101 lines 4.7 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.DateRegexFormat = exports.HTMLAnalyzer = exports.AnalyzerOptions = void 0; const axios_1 = __importDefault(require("axios")); const AnalyzedDoc_1 = require("./AnalyzedDoc"); const luxon_1 = require("luxon"); const node_html_parser_1 = require("node-html-parser"); class AnalyzerOptions { constructor(selectQuery) { this.selectQuery = selectQuery; } } exports.AnalyzerOptions = AnalyzerOptions; class HTMLAnalyzer { static async getAnalyzedDocByUrl(targetUrl, options, postDocAdapter) { let analyzedDoc = await this.getHtmlDocByUrl(targetUrl); analyzedDoc.parsedHtml = this.getParsedHtml(analyzedDoc.siteData); analyzedDoc.selectedElements = analyzedDoc.parsedHtml.querySelectorAll(options.selectQuery); return postDocAdapter(analyzedDoc.selectedElements, options); } static DlTagAdapter(elements) { let resDocs = new Array(); elements.forEach((elDl) => { let resDoc; elDl.childNodes.forEach((definition) => { if (definition.rawTagName == "dt") { resDoc = new AnalyzedDoc_1.ArticleDoc(definition.innerText.trim(), "", HTMLAnalyzer.guessDate(definition.innerText)); } else if (definition.rawTagName == "dd") { resDoc.description = definition.innerText.trim(); resDocs.push(resDoc); } }); }); return resDocs; } static CustomBlockAdapter(elements, options) { let resDocs = new Array(); elements.forEach((el) => { var _a, _b, _c, _d, _e, _f, _g, _h, _j; let title = (_c = (_b = el.querySelector((_a = options.titleSelector) !== null && _a !== void 0 ? _a : "")) === null || _b === void 0 ? void 0 : _b.innerText.trim()) !== null && _c !== void 0 ? _c : ""; let description = (_f = (_e = el.querySelector((_d = options.descriptionSelector) !== null && _d !== void 0 ? _d : "")) === null || _e === void 0 ? void 0 : _e.innerText.trim()) !== null && _f !== void 0 ? 
_f : ""; let date = HTMLAnalyzer.guessDate((_j = (_h = el.querySelector((_g = options.dateSelector) !== null && _g !== void 0 ? _g : "")) === null || _h === void 0 ? void 0 : _h.innerText) !== null && _j !== void 0 ? _j : ""); resDocs.push(new AnalyzedDoc_1.ArticleDoc(title, description, date)); }); return resDocs; } static guessDate(sentence) { var _a; const dateRegexFormats = [ new DateRegexFormat("[0-9]{4}/[0-9]{1,2}/[0-9]{1,2}", "yyyy/M/d"), new DateRegexFormat("[0-9]{4}.[0-9]{1,2}.[0-9]{1,2}", "yyyy.M.d"), new DateRegexFormat("[0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日", "yyyy年M月d日"), new DateRegexFormat("[0-9]{1,2}月[0-9]{1,2}日", "M月d日"), ]; // let matchStr: string = ""; let res = luxon_1.DateTime.fromISO("2020-01-01"); for (const dateRegexFormat of dateRegexFormats) { let regex = new RegExp(dateRegexFormat.regex); let finds = (_a = regex.exec(sentence)) !== null && _a !== void 0 ? _a : []; for (const find of finds) { res = luxon_1.DateTime.fromFormat(find, dateRegexFormat.dateFormat); if (res.isValid) { return res.toJSDate(); } } } return luxon_1.DateTime.fromISO("2020-01-01").toJSDate(); } static async getHtmlDocByUrl(targetUrl) { var _a; let res = new AnalyzedDoc_1.AnalyzedDoc(targetUrl); let result = await axios_1.default.get(targetUrl); res.statusCode = (_a = result.status) !== null && _a !== void 0 ? _a : -1; res.siteData = result.data; try { res.documentDate = new Date(result.headers.date); } catch (e) { } return res; } static getParsedHtml(doc) { let parsedHtml = (0, node_html_parser_1.parse)(doc, { lowerCaseTagName: false, // convert tag name to lower case (hurts performance heavily) comment: false, // retrieve comments (hurts performance slightly) }); return parsedHtml; } } exports.HTMLAnalyzer = HTMLAnalyzer; class DateRegexFormat { constructor(regex, dateFormat) { this.regex = regex; this.dateFormat = dateFormat; } } exports.DateRegexFormat = DateRegexFormat; //# sourceMappingURL=HTMLAnalyzer.js.map