UNPKG

html-table-to-dataframe

Version:
71 lines (70 loc) 2.77 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.BaseDataFrame = void 0; const jsdom_1 = require("jsdom"); class BaseDataFrame { constructor(html, options) { this.html = html; this.dom = new jsdom_1.JSDOM(html); this.document = this.dom.window.document; this.options = options; } validateHtml() { if (!this.html || this.html === '') { throw new Error('HTML cannot be empty'); } } /** * Validates the provided headers against the number of columns in the table. * Throws an error if the lengths do not match. * * @param headers - The headers provided by the user. * @param document - The HTML document containing the table. */ validateHeaders(headers) { const columnCount = this.document.querySelectorAll('table thead th, , table thead td').length; if (headers.length !== columnCount) { throw new Error(`The number of provided headers (${headers.length}) does not match the number of columns in the table (${columnCount}).`); } } /** * Generates an array of header names from the table's thead section. * If a header element's text content is empty, it will be replaced with a unique identifier * in the format 'unknownX', where X is the number of missing headers encountered so far. * * @returns {string[]} - An array of header names, with empty headers replaced by 'unknownX'. */ generateHeaders() { // Select both <th> and <td> elements within <thead> const headerElements = Array.from(this.document.querySelectorAll('table thead th, table thead td')); let unknownCount = 0; return headerElements.map((element) => { // Clean the text content of the header element const text = this.cleanHeaderText(element.textContent || ''); if (text && text !== '') { return text; } else { return `Unknown${unknownCount++}`; } }); } /** * Cleans header text by replacing newlines and multiple spaces with a single space. * @param text - The raw text extracted from the header element (th or td). * @returns A cleaned string with normalized spaces and no line breaks. */ // function cleanHeaderText(text) { return text.replace(/\s+/g, ' ').trim(); } buildData(rows, headers) { // Build the array of data const tableData = rows.map((row) => row.reduce((rowData, cell, index) => { rowData[headers[index]] = cell; return rowData; }, {})); return tableData; } } exports.BaseDataFrame = BaseDataFrame;