@ynetlabo/htmlanalyzer
Version:
An npm package that makes it easy to analyze an HTML document.
101 lines • 4.7 kB
JavaScript
;
// Interop helper for default imports: reuses an `__importDefault` already
// present on `this` when there is one; otherwise returns `mod` unchanged if it
// is flagged `__esModule`, or wraps it as `{ default: mod }` so that
// `.default` can be read from it (as done with `axios_1.default` in this file).
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.DateRegexFormat = exports.HTMLAnalyzer = exports.AnalyzerOptions = void 0;
const axios_1 = __importDefault(require("axios"));
const AnalyzedDoc_1 = require("./AnalyzedDoc");
const luxon_1 = require("luxon");
const node_html_parser_1 = require("node-html-parser");
/**
 * Options object handed to the HTMLAnalyzer entry points.
 *
 * `selectQuery` is the selector that `HTMLAnalyzer.getAnalyzedDocByUrl` passes
 * to `querySelectorAll` on the parsed page. The adapters in this file also read
 * `titleSelector`, `descriptionSelector` and `dateSelector` from the options
 * object; those are not set by this constructor.
 */
class AnalyzerOptions {
    /**
     * @param selectQuery Value stored unchanged on the `selectQuery` property.
     */
    constructor(selectQuery) {
        Object.assign(this, { selectQuery });
    }
}
exports.AnalyzerOptions = AnalyzerOptions;
/**
 * Static helpers that download an HTML page, select elements from it and turn
 * the selected elements into article records.
 */
class HTMLAnalyzer {
    /**
     * Downloads `targetUrl`, parses the response body, selects the elements
     * matching `options.selectQuery` and hands them to `postDocAdapter`.
     *
     * @param targetUrl URL requested through axios.
     * @param options Object providing `selectQuery`; forwarded unchanged to the adapter.
     * @param postDocAdapter Callback invoked as `(selectedElements, options)`,
     *        e.g. `HTMLAnalyzer.DlTagAdapter` or `HTMLAnalyzer.CustomBlockAdapter`.
     * @returns Whatever `postDocAdapter` returns.
     */
    static async getAnalyzedDocByUrl(targetUrl, options, postDocAdapter) {
        const analyzedDoc = await this.getHtmlDocByUrl(targetUrl);
        analyzedDoc.parsedHtml = this.getParsedHtml(analyzedDoc.siteData);
        analyzedDoc.selectedElements = analyzedDoc.parsedHtml.querySelectorAll(options.selectQuery);
        return postDocAdapter(analyzedDoc.selectedElements, options);
    }
    /**
     * Converts definition lists into article records: each `dt` child starts a
     * record (trimmed text as first constructor argument, date guessed from the
     * same text) and a following `dd` child supplies its `description`.
     *
     * A record is emitted once, when its first `dd` is seen; further `dd`
     * children of the same `dt` only overwrite the description. A `dd` that
     * appears before any `dt` is ignored, and a `dt` without a `dd` produces no
     * record.
     *
     * @param elements Collection (with `forEach`) of nodes exposing `childNodes`.
     * @returns Array of `ArticleDoc` instances in document order.
     */
    static DlTagAdapter(elements) {
        const resDocs = [];
        elements.forEach((elDl) => {
            let resDoc;
            let alreadyPushed = false;
            elDl.childNodes.forEach((definition) => {
                // The parser is configured with lowerCaseTagName: false, so tag
                // names keep their source casing; compare case-insensitively.
                // Nodes without a string rawTagName are treated as non-elements.
                const tagName = typeof definition.rawTagName === "string"
                    ? definition.rawTagName.toLowerCase()
                    : "";
                if (tagName === "dt") {
                    resDoc = new AnalyzedDoc_1.ArticleDoc(definition.innerText.trim(), "", HTMLAnalyzer.guessDate(definition.innerText));
                    alreadyPushed = false;
                }
                else if (tagName === "dd" && resDoc !== undefined) {
                    resDoc.description = definition.innerText.trim();
                    if (!alreadyPushed) {
                        resDocs.push(resDoc);
                        alreadyPushed = true;
                    }
                }
            });
        });
        return resDocs;
    }
    /**
     * Builds one article record per element, reading title, description and
     * date text from descendants located with `options.titleSelector`,
     * `options.descriptionSelector` and `options.dateSelector`.
     *
     * A missing selector or a selector without a match yields an empty string
     * (and therefore the fallback date from `guessDate`).
     *
     * @param elements Collection (with `forEach`) of nodes exposing `querySelector`.
     * @param options Object carrying the three optional selectors.
     * @returns Array of `ArticleDoc` instances, one per element.
     */
    static CustomBlockAdapter(elements, options) {
        const resDocs = [];
        elements.forEach((el) => {
            const title = HTMLAnalyzer.selectInnerText(el, options.titleSelector).trim();
            const description = HTMLAnalyzer.selectInnerText(el, options.descriptionSelector).trim();
            const date = HTMLAnalyzer.guessDate(HTMLAnalyzer.selectInnerText(el, options.dateSelector));
            resDocs.push(new AnalyzedDoc_1.ArticleDoc(title, description, date));
        });
        return resDocs;
    }
    /**
     * Returns the `innerText` of the first descendant of `el` matching
     * `selector`, or `""` when the selector is missing/empty or nothing matches.
     *
     * An empty selector is never forwarded to `querySelector`.
     * NOTE(review): how the parser treats an empty selector string is not
     * visible from this file, hence the guard.
     */
    static selectInnerText(el, selector) {
        if (selector === null || selector === undefined || selector === "") {
            return "";
        }
        const found = el.querySelector(selector);
        if (found === null || found === undefined) {
            return "";
        }
        const text = found.innerText;
        return text === null || text === undefined ? "" : text;
    }
    /**
     * Extracts the first parsable date from `sentence`.
     *
     * Patterns are tried in priority order (`yyyy/M/d`, `yyyy.M.d`,
     * `yyyy年M月d日`, `M月d日`); within a pattern every occurrence is tried until
     * luxon accepts one, so an impossible candidate such as `2021/13/45` does
     * not hide a valid date later in the text.
     * NOTE(review): for the year-less `M月d日` format the year is whatever luxon
     * picks by default — presumably the current year; confirm.
     *
     * @param sentence Text to scan.
     * @returns The parsed date as a JS `Date`, or 2020-01-01 (as produced by
     *          `DateTime.fromISO`) when no pattern yields a valid date.
     */
    static guessDate(sentence) {
        const dateRegexFormats = [
            new DateRegexFormat("[0-9]{4}/[0-9]{1,2}/[0-9]{1,2}", "yyyy/M/d"),
            // Dots are escaped: unescaped they match any character, which let
            // strings like "1234-56-78" shadow a real dotted date.
            new DateRegexFormat("[0-9]{4}\\.[0-9]{1,2}\\.[0-9]{1,2}", "yyyy.M.d"),
            new DateRegexFormat("[0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日", "yyyy年M月d日"),
            new DateRegexFormat("[0-9]{1,2}月[0-9]{1,2}日", "M月d日"),
        ];
        for (const dateRegexFormat of dateRegexFormats) {
            // Global flag so repeated exec() calls walk through every occurrence.
            const regex = new RegExp(dateRegexFormat.regex, "g");
            let match = regex.exec(sentence);
            while (match !== null) {
                const parsed = luxon_1.DateTime.fromFormat(match[0], dateRegexFormat.dateFormat);
                if (parsed.isValid) {
                    return parsed.toJSDate();
                }
                match = regex.exec(sentence);
            }
        }
        return luxon_1.DateTime.fromISO("2020-01-01").toJSDate();
    }
    /**
     * Fetches `targetUrl` with axios and records the response on a new
     * `AnalyzedDoc`: `statusCode` (or -1 when the response has no status),
     * `siteData` (the response body) and, when available, `documentDate`.
     *
     * `documentDate` is only assigned when the response's `date` header parses
     * to a valid date; otherwise the value set by the `AnalyzedDoc` constructor
     * is left untouched instead of being replaced by an Invalid Date.
     *
     * @param targetUrl URL to request.
     * @returns The populated `AnalyzedDoc`. Errors from the axios request propagate.
     */
    static async getHtmlDocByUrl(targetUrl) {
        const res = new AnalyzedDoc_1.AnalyzedDoc(targetUrl);
        const result = await axios_1.default.get(targetUrl);
        res.statusCode = result.status !== null && result.status !== undefined ? result.status : -1;
        res.siteData = result.data;
        const rawDate = result.headers ? result.headers.date : undefined;
        if (typeof rawDate === "string" && rawDate !== "") {
            const headerDate = new Date(rawDate);
            if (!Number.isNaN(headerDate.getTime())) {
                res.documentDate = headerDate;
            }
        }
        return res;
    }
    /**
     * Parses an HTML string with node-html-parser.
     *
     * @param doc HTML source text.
     * @returns The parsed root node returned by `parse`.
     */
    static getParsedHtml(doc) {
        return (0, node_html_parser_1.parse)(doc, {
            lowerCaseTagName: false, // do not lower-case tag names (enabling it hurts performance heavily)
            comment: false, // do not retrieve comments (enabling it hurts performance slightly)
        });
    }
}
exports.HTMLAnalyzer = HTMLAnalyzer;
/**
 * Pairs a regular-expression source with the date format string used to parse
 * whatever that expression matches (see `HTMLAnalyzer.guessDate`, which feeds
 * `regex` to `new RegExp` and `dateFormat` to luxon's `DateTime.fromFormat`).
 */
class DateRegexFormat {
    /**
     * @param regex Pattern stored unchanged on the `regex` property.
     * @param dateFormat Format string stored unchanged on the `dateFormat` property.
     */
    constructor(regex, dateFormat) {
        Object.assign(this, { regex, dateFormat });
    }
}
exports.DateRegexFormat = DateRegexFormat;
//# sourceMappingURL=HTMLAnalyzer.js.map