UNPKG

@xcrap/parser

Version:

Xcrap Parser is a package of the Xcrap framework. It was developed to handle data extraction from text files (currently only HTML and JSON are supported) using declarative models.

72 lines (71 loc) 2.49 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.HtmlParser = exports.nodeHtmlParserOptions = void 0; const node_html_parser_1 = __importDefault(require("node-html-parser")); const errors_1 = require("../errors"); const parser_1 = require("../parser"); exports.nodeHtmlParserOptions = { blockTextElements: { script: true, noscript: true, style: true, code: true } }; class HtmlParser extends parser_1.Parser { constructor(source, options = exports.nodeHtmlParserOptions) { super(source); this.source = source; this.root = node_html_parser_1.default.parse(source, options); } async parseMany({ query, extractor, limit }) { const elements = this.root.querySelectorAll(query); let items = []; for (const element of elements) { if (limit != undefined && items.length >= limit) break; const data = await extractor(element); items.push(data); } return items; } async parseFirst({ query, extractor, default: default_ }) { let data; if (query) { const element = this.root.querySelector(query); if (!element) { if (default_ !== undefined) { return default_; } throw new errors_1.HTMLElementNotFoundError(query); } data = await extractor(element); } else { data = await extractor(this.root); } return data !== null && data !== void 0 ? data : default_; } async extractFirst({ model, query }) { const element = query ? this.root.querySelector(query) : this.root; if (!element) { throw new errors_1.HTMLElementNotFoundError(query); } return await model.parse(element.outerHTML); } async extractMany({ model, query, limit }) { const elements = this.root.querySelectorAll(query); let dataList = []; for (const element of elements) { if (limit != undefined && dataList.length >= limit) break; const data = await model.parse(element.outerHTML); dataList.push(data); } return dataList; } } exports.HtmlParser = HtmlParser;