html-tables-to-json
Version:
Parser for HTML to JSON
97 lines • 3.52 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.CheerioTableParser = void 0;
const cheerio = require("cheerio");
// TODO: TableParser should be an interface
/** Largest `colspan` honoured; the HTML table model caps column spans at 1000. */
const MAX_COLSPAN = 1000;
/** Largest `rowspan` honoured; the HTML table model caps row spans at 65534. */
const MAX_ROWSPAN = 65534;
/**
 * Converts a raw `colspan`/`rowspan` attribute value into a usable span count.
 *
 * Missing, empty, non-numeric, zero and negative values all resolve to 1 so
 * that a malformed attribute can never make a cell vanish from its row, and
 * oversized values are clamped to `max` so that a single attribute cannot
 * force an arbitrarily large allocation.
 *
 * @param raw attribute value as read from the element, possibly undefined
 * @param max largest span to return
 * @returns integer between 1 and `max` inclusive
 */
function parseSpanAttribute(raw, max) {
    const span = Number.parseInt(raw || "1", 10);
    if (Number.isNaN(span) || span < 1) {
        return 1;
    }
    return Math.min(span, max);
}
/**
 * Extracts the tables of an HTML document as grids of cell text, using
 * Cheerio to build and query the DOM tree.
 */
class CheerioTableParser {
    /**
     * Loads `doc` with Cheerio and converts every `<table>` element into a
     * grid of cell text. The Cheerio root is kept on `this.$` because the
     * other methods use it to wrap raw elements.
     *
     * Every `<table>` in the document is matched, including tables nested
     * inside a cell; each one produces its own entry in the result.
     *
     * @param doc HTML markup to parse
     * @returns one entry per table; each entry is an array of rows and each
     * row is an array of trimmed cell text
     * @throws Error with message "No tables found" when the document
     * contains no `<table>` element
     */
    parse(doc) {
        this.$ = cheerio.load(doc);
        const $tables = this.$("table");
        if ($tables.length === 0) {
            throw new Error("No tables found");
        }
        const result = [];
        $tables.each((_, table) => {
            result.push(this.expandColspanRowspan(this.parseTr(this.$(table))));
        });
        return result;
    }
    /**
     * Given a list of <tr>s, return a list of text rows.
     *
     * Requires `this.$` to be set, which `parse` does.
     *
     * @see Any cell with `rowspan` or `colspan` will have its contents copied
     * to subsequent cells. Invalid or non-positive span values count as 1,
     * and spans are capped at 1000 columns / 65534 rows.
     * @param $rows list of cheerio instances representing rows
     * @returns array of array, each returned row is a list of string text.
     */
    expandColspanRowspan($rows) {
        const allTexts = [];
        // Cells still spanning downwards from earlier rows:
        // [column index, text, number of further rows to fill].
        let remainder = [];
        $rows.each((_, tr) => {
            const texts = [];
            const nextRemainder = [];
            // Copies one carried-over cell into this row and re-queues it
            // while it still has rows left to cover.
            const carryOver = ([prevIndex, prevText, prevRowspan]) => {
                texts.push(prevText);
                if (prevRowspan > 1) {
                    nextRemainder.push([prevIndex, prevText, prevRowspan - 1]);
                }
            };
            let index = 0; // next column to be filled in this row
            let pending = 0; // first entry of `remainder` not yet emitted
            this.parseTd(this.$(tr)).each((__, td) => {
                // Emit the carried-over cells that sit at or before the
                // column this cell would otherwise occupy.
                while (pending < remainder.length &&
                    remainder[pending][0] <= index) {
                    carryOver(remainder[pending]);
                    pending++;
                    index++;
                }
                const $td = this.$(td);
                const text = $td.text().trim();
                const rowspan = parseSpanAttribute($td.attr("rowspan"), MAX_ROWSPAN);
                const colspan = parseSpanAttribute($td.attr("colspan"), MAX_COLSPAN);
                // Repeat the cell's text once per spanned column.
                for (let i = 0; i < colspan; i++) {
                    texts.push(text);
                    if (rowspan > 1) {
                        nextRemainder.push([index, text, rowspan - 1]);
                    }
                    index++;
                }
            });
            // Carried-over cells that were not reached while walking the
            // cells of this row are appended at the end.
            for (; pending < remainder.length; pending++) {
                carryOver(remainder[pending]);
            }
            allTexts.push(texts);
            remainder = nextRemainder;
        });
        return allTexts;
    }
    /**
     * Return the list of row elements from the parsed table element.
     *
     * Rows that belong to a table nested inside `$table` are excluded, so a
     * nested table does not leak its rows into the enclosing one.
     *
     * @param $table the table to parse
     * @returns cheerio selection of the `<tr>` elements owned by the table
     */
    parseTr($table) {
        const $nestedRows = $table.find("table").find("tr");
        return $table.find("tr").not($nestedRows);
    }
    /**
     * Return the cells of a row.
     *
     * Only direct children are returned, so cells of a table nested inside
     * one of the row's cells are not mistaken for cells of this row.
     *
     * @param $row the row to parse
     * @returns cheerio selection of the `<td>`/`<th>` children of the row
     */
    parseTd($row) {
        return $row.children("td, th");
    }
}
exports.CheerioTableParser = CheerioTableParser;
//# sourceMappingURL=parser.js.map