UNPKG

@felisdiligens/md-table-tools

Version:

MultiMarkdown table tools

426 lines (366 loc) 16.3 kB
import * as cheerio from 'cheerio';
import * as htmlparser2 from 'htmlparser2';
import TurndownService from 'turndown';
import { getTurndownService, removeInvisibleCharacters } from "./common.js";
import { Table, TableCaption, TableCaptionPosition, TableCell, TableCellMerge, TableRow, TextAlignment } from "./table.js";
import { ParsingError, TableParser } from "./tableParser.js";
import { TableRenderer } from "./tableRenderer.js";

/**
 * Escapes pipe characters so that cell content cannot be mistaken for a
 * Markdown table column separator.
 */
function escapeMarkdown(mdStr: string): string {
    return mdStr.replace(/\|/g, "\\|");
}

/**
 * Converts a small subset of Markdown to HTML using regular expressions.
 *
 * Handled constructs: images, links, fenced and inline code, strikethrough,
 * bold/italic emphasis, backslash escapes and (in block mode) `> ` blockquotes
 * and paragraphs. The input is NOT HTML-escaped, so embedded HTML passes through.
 *
 * @param markdown Markdown source text.
 * @param inline When `true` (default), line breaks become `<br>`. When `false`,
 *               the text is treated as block content: blockquotes are detected
 *               and the result is wrapped in `<p>` paragraphs.
 * @returns The generated HTML fragment.
 */
function mdToHtml(markdown: string, inline = true): string {
    let html = markdown.trim(); // escape(markdown);

    // Blockquote:
    if (!inline) {
        const lines: string[] = [];
        let quoted = false;
        for (const line of html.split(/\r?\n/)) {
            if (line.startsWith("> ")) {
                if (!quoted)
                    lines.push("<blockquote>");
                quoted = true;
                lines.push(`<p>${mdToHtml(line.substring(2))}</p>`);
            } else {
                if (quoted)
                    lines.push("</blockquote>");
                quoted = false;
                lines.push(line);
            }
        }
        // Close a blockquote that runs until the end of the input. This has to
        // happen once, after the loop: closing inside the loop emitted a
        // "</blockquote>" after every quoted line and produced unbalanced HTML.
        if (quoted)
            lines.push("</blockquote>");

        html = lines.join("\n");
        if (!html.startsWith("<blockquote>"))
            html = `<p>${html}</p>`;
    }

    // Image:
    html = html.replace(/!\[([^\[\]]+)\]\(([^\(\)]+)\)/g, "<img src=\"$2\" alt=\"$1\">");

    // Links:
    html = html.replace(/\[([^\[\]]+)\]\(([^\(\)]+)\)/g, "<a href=\"$2\">$1</a>");

    // Block code:
    html = html.replace(/```[a-z]*?\n(.*?)\n```/g, "<code>$1</code>");

    // Inline code:
    html = html.replace(/`(.*?)`/g, "<code>$1</code>");

    // Strikethrough:
    html = html.replace(/~~(.*?)~~/g, "<del>$1</del>");

    // Oblique (bold + italic) has to be replaced before bold and italic:
    html = html.replace(/___(.*?)___/g, "<em><strong>$1</strong></em>");
    html = html.replace(/\*\*\*(.*?)\*\*\*/g, "<em><strong>$1</strong></em>");

    // Bold:
    html = html.replace(/__(.*?)__/g, "<strong>$1</strong>");
    html = html.replace(/\*\*(.*?)\*\*/g, "<strong>$1</strong>");

    // Italic:
    html = html.replace(/_(.*?)_/g, "<em>$1</em>");
    html = html.replace(/\*(.*?)\*/g, "<em>$1</em>");

    // Escaped characters:
    html = html.replace(/\\([#\.\|\*_\s`\[\]\-])/g, "$1");

    // Newlines:
    if (inline) {
        html = html.replace(/\r?\n/g, "<br>");
    } else {
        // A blank line separates paragraphs; any other single newline is folded
        // into a space unless it sits directly between "</p>" and "<p>".
        // (The lookbehind used to test for the literal "<\p>", which never occurs.)
        html = html
            .replace(/(\r?\n){2}/g, "</p>\n<p>")
            .replace(/(?<!<\/p>)\r?\n(?!<p>)/g, " ");
    }

    // Remove unnecessary whitespace:
    html = html.replace(/[ \t]{2,}/g, " ");

    return html;
}

/** Converts an HTML fragment to Markdown using the given Turndown instance. */
function htmlToMd(html: string, turndownService: TurndownService) {
    return turndownService.turndown(html);
}

/** Maps a {@link TextAlignment} to the CSS declaration used in a cell's `style` attribute. */
function textAlignToCSS(textAlign: TextAlignment) {
    switch (textAlign) {
        case TextAlignment.left:
            return "text-align: left";
        case TextAlignment.right:
            return "text-align: right";
        case TextAlignment.center:
            return "text-align: center";
        case TextAlignment.default:
        default:
            return "text-align: start";
    }
}

/**
 * Reads the `text-align` CSS property of an element and maps it to a
 * {@link TextAlignment}. Missing or unrecognized values yield `TextAlignment.default`.
 */
function cssToTextAlign(element: cheerio.Cheerio): TextAlignment {
    const value = element.css("text-align"); // Might return 'undefined'
    if (!value)
        return TextAlignment.default;

    switch (value.toLowerCase()) {
        case "left":
            return TextAlignment.left;
        case "center":
            return TextAlignment.center;
        case "right":
            return TextAlignment.right;
        default:
            return TextAlignment.default;
    }
}

/** changes the behavior of HTMLTableParser */
export enum HTMLTableParserMode {
    /** uses only text (`Cheerio.text()`) */
    StripHTMLElements,
    /** uses the HTML code (`Cheerio.html()`) without any converting */
    PreserveHTMLElements,
    /** uses the HTML code (`Cheerio.html()`) and converts to Markdown using Turndown if possible (default) */
    ConvertHTMLElements,
}

/**
 * Parses the first `<table>` element found in an HTML string into a {@link Table}.
 */
export class HTMLTableParser implements TableParser {
    /**
     * @param mode Controls how the content of each cell is extracted.
     * @param turndownService Turndown instance used for HTML-to-Markdown conversion.
     */
    public constructor(
        public mode: HTMLTableParserMode = HTMLTableParserMode.ConvertHTMLElements,
        public turndownService: TurndownService = getTurndownService()) {}

    /**
     * Parses an HTML string containing a table.
     *
     * Only the first `<table>` element is converted. Markup before the opening
     * tag and after the closing tag is converted to Markdown and stored in
     * `beforeTable` / `afterTable`.
     *
     * @param table HTML source containing at least one `<table>` element.
     * @returns The parsed table, after calling its `update()` method.
     * @throws ParsingError if no `<table>` element is found.
     */
    public parse(table: string): Table {
        /*
            Parse the html string and find our <table> tag to start:
        */
        const dom = htmlparser2.parseDocument(table, {
            xmlMode: false,
            lowerCaseTags: true,
            lowerCaseAttributeNames: true,
            decodeEntities: true,
        });
        const $ = cheerio.load(dom as any);
        const $tables = $("table");
        if ($tables.length === 0) {
            throw new ParsingError("Couldn't find <table> tag in DOM.");
        }
        const $table = $($tables[0]);

        /*
            Converting table to Markdown:
        */
        const parsedTable = new Table();
        let hasSections = false;
        const tableTextAlign = cssToTextAlign($table);

        // Get everything before <table>:
        let m = table.match(/((.|\n)*)<\s*[tT][aA][bB][lL][eE][^<>]*>/m);
        if (m) {
            parsedTable.beforeTable = htmlToMd(m[1], this.turndownService);
        }

        // Get everything after </table>:
        m = table.match(/<\/\s*[tT][aA][bB][lL][eE]\s*>((.|\n)*)/m);
        if (m) {
            parsedTable.afterTable = htmlToMd(m[1], this.turndownService);
        }

        // Parse <thead> tag in <table>:
        const $theads = $table.find("thead");
        const hasTHead = $theads.length !== 0;
        if (hasTHead) {
            const sectionTextAlign = cssToTextAlign($theads);
            this.parseSection(
                $,
                parsedTable,
                $theads.find("tr"),
                (sectionTextAlign !== TextAlignment.default ? sectionTextAlign : tableTextAlign),
                true);
            hasSections = true;
        }

        // Parse <tbody> tags in <table>:
        const $tbodies = $table.find("tbody");
        if ($tbodies.length > 0) {
            $tbodies.each((i: number, element: cheerio.Element) => {
                const domTBody = $(element);
                const sectionTextAlign = cssToTextAlign(domTBody);
                this.parseSection(
                    $,
                    parsedTable,
                    domTBody.find("tr"),
                    (sectionTextAlign !== TextAlignment.default ? sectionTextAlign : tableTextAlign),
                    false,
                    // Only guess header rows from <th> cells when there is no explicit <thead>.
                    // `find()` always returns a collection (never null), so the length has to be tested.
                    !hasTHead,
                    i > 0);
            });
            hasSections = true;
        }

        // No <thead> or <tbody> tags?
        if (!hasSections) {
            // Parse table that doesn't have thead or tbody tags as one section with no header:
            this.parseSection(
                $,
                parsedTable,
                $table.find("tr"),
                tableTextAlign,
                false,
                true,
                false);
        }

        // Parse <caption> tag in <table>:
        const $captions = $table.find("caption");
        if ($captions.length !== 0) {
            const $caption = $($captions);
            const caption = new TableCaption();

            caption.text = htmlToMd($caption.html() || "", this.turndownService)
                .replace(/(\r?\n)/g, "")
                .trim(); // domCaption.innerText.replace(/(\r?\n|\[|\])/g, "").trim();

            // Only store an explicit label if the id differs from what getLabel() already returns:
            const captionId = $caption.attr('id');
            if (captionId && caption.getLabel() != captionId)
                caption.label = captionId.replace(/(\r?\n|\[|\])/g, "").trim();

            const captionSide = $caption.css("caption-side");
            switch (captionSide && captionSide.toLowerCase()) {
                case "bottom":
                    caption.position = TableCaptionPosition.bottom;
                    break;
                case "top":
                default:
                    caption.position = TableCaptionPosition.top;
            }

            parsedTable.caption = caption;
        }

        return parsedTable.update();
    }

    /**
     * Parses the rows of one table section and appends them to `table`.
     *
     * @param $ The loaded cheerio document.
     * @param table Table object that receives the rows, columns and cells.
     * @param $rows The `<tr>` elements of the section.
     * @param defaultTextAlign Alignment for cells that don't specify their own.
     * @param isHeader Whether the rows of this section are header rows.
     * @param allowHeaderDetection If `true` (and `isHeader` is `false`), a row is
     *                             marked as header when all of its cells are `<th>`.
     * @param firstRowStartsNewSection Value assigned to `startsNewSection` of the section's first row.
     */
    private parseSection($: cheerio.Root, table: Table, $rows: cheerio.Cheerio, defaultTextAlign: TextAlignment, isHeader: boolean = false, allowHeaderDetection: boolean = false, firstRowStartsNewSection: boolean = false) {
        // HTML skips "ghost" cells that are overshadowed by other cells that have a rowspan > 1.
        // We'll memorize them:
        const rowspanGhostCells: { row: number; col: number; }[] = [];

        // Remember how many rows we already have:
        const rowOffset = table.rowCount();

        // Iterate over each row (<tr>) of the HTML table:
        $rows.each((domRowIndex: number, element: cheerio.Element) => {
            const rowIndex = domRowIndex + rowOffset;
            let row = table.getRow(rowIndex);
            if (!row)
                row = table.addRow();
            row.isHeader = isHeader;
            if (domRowIndex === 0)
                row.startsNewSection = firstRowStartsNewSection;

            // Memorize an offset (colspan):
            let colOffset = 0;

            // Iterate over each cell (<td> or <th>) of the HTML table row:
            const $row = $(element);
            const $cells = $row.find("td, th");
            let allCellsAreTH = true;
            $cells.each((domColIndex: number, element: cheerio.Element) => {
                const $cell = $(element);

                // Get the TableColumn of our Table object, taking the memorized rowspans and colOffset into account:
                let colIndex = domColIndex + colOffset;
                while (rowspanGhostCells.some(ghost => ghost.row === rowIndex && ghost.col === colIndex)) {
                    colIndex = domColIndex + ++colOffset;
                }
                let column = table.getColumn(colIndex);
                if (!column)
                    column = table.addColumn();

                // Add cell to our Table object:
                const cellContent = this.parseCell($cell);
                const ownTextAlign = cssToTextAlign($cell);
                const wrappable = $cell.hasClass("extend");
                const textAlign = ownTextAlign !== TextAlignment.default ? ownTextAlign : defaultTextAlign;
                const cellIsHeader = $cell.prop("tagName").toLowerCase() === "th";
                allCellsAreTH = allCellsAreTH && cellIsHeader;

                const cell = new TableCell(table, row, column);
                cell.setText(cellContent);
                cell.textAlign = textAlign;
                cell.isHeader = cellIsHeader;
                // The first explicitly aligned cell decides the alignment of the whole column:
                if (column.textAlign === TextAlignment.default) {
                    column.textAlign = textAlign;
                }
                column.wrappable = wrappable;
                table.addCell(cell);

                // Take "colspan" into account:
                const colspan = $cell.prop("colspan");
                if (colspan > 1) {
                    // Add empty cells to our Table object:
                    for (let i = 1; i < colspan; i++) {
                        let nextColumn = table.getColumn(colIndex + i);
                        if (!nextColumn)
                            nextColumn = table.addColumn();
                        const mergedCell = table.getCell(row, nextColumn);
                        mergedCell.merged = TableCellMerge.left;
                    }

                    // Add colspan to colOffset:
                    colOffset += colspan - 1;
                }

                // Take "rowspan" into account:
                // NOTE(review): ghost cells are only recorded for `colIndex`; a cell that has
                // both colspan > 1 and rowspan > 1 does not reserve its extra columns in the
                // following rows - confirm whether that combination needs to be supported.
                const rowspan = $cell.prop("rowspan");
                if (rowspan > 1) {
                    // Add empty cells to our Table object:
                    for (let i = 1; i < rowspan; i++) {
                        let nextRow = table.getRow(rowIndex + i);
                        if (!nextRow)
                            nextRow = table.addRow();
                        nextRow.isHeader = isHeader;
                        const mergedCell = table.getCell(nextRow, column);
                        mergedCell.merged = TableCellMerge.above;

                        // Memorize "ghost" cells:
                        rowspanGhostCells.push({ "row": rowIndex + i, "col": colIndex });
                    }
                }
            });

            // Detect headers:
            if (allowHeaderDetection && !isHeader) {
                row.isHeader = allCellsAreTH;
            }
        });
    }

    /**
     * Extracts the content of a cell according to {@link HTMLTableParser.mode},
     * escapes pipe characters and removes invisible characters.
     */
    private parseCell($cell: cheerio.Cheerio): string {
        switch (this.mode) {
            case HTMLTableParserMode.PreserveHTMLElements:
                return removeInvisibleCharacters(escapeMarkdown($cell.html() || ""));
            case HTMLTableParserMode.StripHTMLElements:
                return removeInvisibleCharacters(escapeMarkdown($cell.text()));
            case HTMLTableParserMode.ConvertHTMLElements:
            default:
                return removeInvisibleCharacters(escapeMarkdown(htmlToMd($cell.html() || "", this.turndownService)));
        }
    }
}

/**
 * Renders a {@link Table} as an HTML `<table>` element.
 */
export class HTMLTableRenderer implements TableRenderer {
    /**
     * @param prettify If `true`, the output is indented and split into lines.
     * @param indent String used for one level of indentation (only with `prettify`).
     * @param renderOutsideTable If `true`, `beforeTable` and `afterTable` are rendered as well.
     */
    public constructor(
        public prettify = true,
        public indent = " ",
        public renderOutsideTable = true) { }

    /**
     * Renders the table, optionally surrounded by the content before/after it.
     *
     * @param table The table to render.
     * @returns The HTML code, joined by newlines if `prettify` is enabled.
     */
    public render(table: Table): string {
        const result: string[] = [];

        if (this.renderOutsideTable && table.beforeTable.trim() !== "")
            result.push(mdToHtml(table.beforeTable, false));

        result.push("<table>");

        const headerRows = table.getHeaderRows();
        const normalRows = table.getNormalRows();

        if (headerRows.length > 0) {
            result.push(this.indentString("<thead>", 1));
            for (const row of headerRows)
                result.push(...this.renderRow(table, row));
            result.push(this.indentString("</thead>", 1));
        }

        if (normalRows.length > 0) {
            result.push(this.indentString("<tbody>", 1));
            for (const row of normalRows) {
                // A row that starts a new section closes the current <tbody> and opens another one:
                if (row.startsNewSection)
                    result.push(this.indentString("</tbody>", 1), this.indentString("<tbody>", 1));
                result.push(...this.renderRow(table, row));
            }
            result.push(this.indentString("</tbody>", 1));
        }

        if (table.caption && table.caption.text.length > 0)
            result.push(this.indentString(`<caption id="${table.caption.getLabel()}" style="caption-side: ${table.caption.position}">${mdToHtml(table.caption.text)}</caption>`, 1));

        result.push("</table>");

        if (this.renderOutsideTable && table.afterTable.trim() !== "")
            result.push(mdToHtml(table.afterTable, false));

        return result.join(this.prettify ? "\n" : "");
    }

    /** Renders one `<tr>` element; cells that render to an empty string (merged cells) are skipped. */
    private renderRow(table: Table, row: TableRow): string[] {
        const result: string[] = [];
        result.push(this.indentString("<tr>", 2));
        for (const cell of table.getCellsInRow(row)) {
            const renderedCell = this.indentString(this.renderCell(cell), 3);
            if (renderedCell.trim() !== "")
                result.push(renderedCell);
        }
        result.push(this.indentString("</tr>", 2));
        return result;
    }

    /**
     * Renders a single `<td>`/`<th>` element including colspan, rowspan,
     * alignment and the "extend" class.
     *
     * @returns The HTML of the cell, or an empty string if the cell is merged into another one.
     */
    private renderCell(cell: TableCell): string {
        if (cell.merged !== TableCellMerge.none)
            return "";

        const colspan = cell.getColspan();
        const rowspan = cell.getRowspan();
        const textAlign = cell.getTextAlignment();

        const cellProps =
            (colspan > 1 ? ` colspan="${colspan}"` : "") +
            (rowspan > 1 ? ` rowspan="${rowspan}"` : "") +
            (textAlign !== TextAlignment.default ? ` style="${textAlignToCSS(textAlign)}"` : "") + // ` align="${cell.getTextAlignment()}"`
            (cell.column.wrappable ? ` class="extend"` : "");
        const cellTag = cell.isHeaderCell() ? "th" : "td";

        return ["<", cellTag, cellProps, ">", mdToHtml(cell.text), "</", cellTag, ">"].join(""); // (markdown-it) mdIt.renderInline(cell.text)
    }

    /** Prefixes `str` with `indentLevel` repetitions of `indent` when `prettify` is enabled. */
    private indentString(str: string, indentLevel: number = 0): string {
        if (this.prettify)
            return this.indent.repeat(indentLevel) + str;
        else
            return str;
    }
}