UNPKG

html-to-text

Version:
416 lines (381 loc) 13.8 kB
// eslint-disable-next-line no-unused-vars
const { Picker } = require('selderee');

const { trimCharacter } = require('./helper');
// eslint-disable-next-line no-unused-vars
const { StackItem, BlockStackItem, TableCellStackItem, TableRowStackItem, TableStackItem, TransformerStackItem }
  = require('./stack-item');
const { tableToString } = require('./table-printer');
const { WhitespaceProcessor } = require('./whitespace-processor');

// eslint-disable-next-line import/no-unassigned-import
require('./typedefs');


/**
 * Helps to build text from inline and block elements.
 *
 * Keeps a linked stack of items (blocks, tables, table rows, table cells);
 * `this._stackItem` is the item currently being built
 * and each item refers to its parent through `next`.
 *
 * @class BlockTextBuilder
 */
class BlockTextBuilder {

  /**
   * Creates an instance of BlockTextBuilder.
   *
   * @param { Options } options HtmlToText options.
   * @param { Picker<DomNode, TagDefinition> } picker Selectors decision tree picker.
   */
  constructor (options, picker) {
    this.options = options;
    this.picker = picker;
    // NOTE: the property name is misspelled ("whitepace") but it is kept as is,
    // because it is reachable from outside of the class.
    this.whitepaceProcessor = new WhitespaceProcessor(options);
    /** @type { StackItem } */
    this._stackItem = new BlockStackItem(options);
    /** @type { TransformerStackItem } */
    this._wordTransformer = undefined;
  }

  /**
   * Put a word-by-word transform function onto the transformations stack.
   *
   * Mainly used for uppercasing. Can be bypassed to add unformatted text such as URLs.
   *
   * Word transformations applied before wrapping.
   *
   * @param { (str: string) => string } wordTransform Word transformation function.
   */
  pushWordTransform (wordTransform) {
    this._wordTransformer = new TransformerStackItem(this._wordTransformer, wordTransform);
  }

  /**
   * Remove a function from the word transformations stack.
   *
   * @returns { (str: string) => string } A function that was removed
   * (`undefined` when the stack is empty).
   */
  popWordTransform () {
    if (!this._wordTransformer) { return undefined; }
    const transform = this._wordTransformer.transform;
    this._wordTransformer = this._wordTransformer.next;
    return transform;
  }

  /**
   * Combine all word transformers currently on the stack into a single function.
   *
   * Transformers are applied starting from the top of the stack
   * (the most recently pushed one) and following `next` links.
   * The stack is read at the moment the returned function is called.
   *
   * @returns { (str: string) => string }
   */
  _getCombinedWordTransformer () {
    const applyTransformer = (str, transformer) =>
      ((transformer) ? applyTransformer(transformer.transform(str), transformer.next) : str);
    return (str) => applyTransformer(str, this._wordTransformer);
  }

  /**
   * Remove the current item from the stack and make its parent (`next`) current.
   *
   * @returns { StackItem } The item that was removed.
   */
  _popStackItem () {
    const item = this._stackItem;
    this._stackItem = item.next;
    return item;
  }

  /**
   * Add a line break into currently built block.
   *
   * Does nothing when the current stack item is neither a block nor a table cell.
   */
  addLineBreak () {
    if (!(
      this._stackItem instanceof BlockStackItem
      || this._stackItem instanceof TableCellStackItem
    )) { return; }
    if (this._stackItem.isPre) {
      this._stackItem.rawText += '\n';
    } else {
      this._stackItem.inlineTextBuilder.startNewLine();
    }
  }

  /**
   * Allow to break line in case directly following text will not fit.
   *
   * Does nothing when the current stack item is neither a block nor a table cell.
   */
  addWordBreakOpportunity () {
    if (
      this._stackItem instanceof BlockStackItem
      || this._stackItem instanceof TableCellStackItem
    ) {
      this._stackItem.inlineTextBuilder.wordBreakOpportunity = true;
    }
  }

  /**
   * Add a node inline into the currently built block.
   *
   * @param { string } str
   * Text content of a node to add.
   *
   * @param { object | boolean } [ optionsObjectOrNoWordTransform ]
   * Object holding the parameters of the operation.
   *
   * Boolean value is deprecated.
   *
   * @param { boolean } [ optionsObjectOrNoWordTransform.noWordTransform = false ]
   * Ignore word transformers if there are any.
   */
  addInline (str, optionsObjectOrNoWordTransform = {}) {
    if (typeof optionsObjectOrNoWordTransform === 'object') {
      // `typeof null` is 'object' too and parameter defaults don't apply to null -
      // substitute an empty options object so that destructuring doesn't throw.
      this._addInline(str, optionsObjectOrNoWordTransform || {});
    } else {
      this._addInline(str, { noWordTransform: optionsObjectOrNoWordTransform });
    }
  }

  /**
   * Implementation of `addInline` that only accepts the options object.
   *
   * Inside a `isPre` item the string is appended to the raw text verbatim.
   * Otherwise it goes through the whitespace processor into the inline text builder.
   *
   * @param { string } str Text content of a node to add.
   * @param { object } [param1] Object holding the parameters of the operation.
   * @param { boolean } [param1.noWordTransform = false] Ignore word transformers if there are any.
   */
  _addInline (str, { noWordTransform = false } = {}) {
    if (!(
      this._stackItem instanceof BlockStackItem
      || this._stackItem instanceof TableCellStackItem
    )) { return; }

    if (this._stackItem.isPre) {
      this._stackItem.rawText += str;
      return;
    }

    if (
      this.whitepaceProcessor.testContainsWords(str) || // There are words to add;
      (str.length && !this._stackItem.stashedLineBreaks) // or at least spaces to take into account.
    ) {
      if (this._stackItem.stashedLineBreaks) {
        this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks);
      }
      this.whitepaceProcessor.shrinkWrapAdd(
        str,
        this._stackItem.inlineTextBuilder,
        (this._wordTransformer && !noWordTransform) ? this._getCombinedWordTransformer() : undefined
      );
      this._stackItem.stashedLineBreaks = 0; // inline text doesn't introduce line breaks
    }
  }

  /**
   * Start building a new block.
   *
   * @param { object | number } [optionsObjectOrLeadingLineBreaks]
   * Object holding the parameters of the block.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrLeadingLineBreaks.leadingLineBreaks = 1]
   * This block should have at least this number of line breaks to separate it from any preceding block.
   *
   * @param { number }  [optionsObjectOrLeadingLineBreaks.reservedLineLength = 0]
   * Reserve this number of characters on each line for block markup.
   *
   * @param { boolean } [optionsObjectOrLeadingLineBreaks.isPre = false]
   * Should HTML whitespace be preserved inside this block.
   *
   * @param { number } [reservedLineLength]
   * Deprecated.
   *
   * @param { boolean } [isPre]
   * Deprecated.
   */
  openBlock (optionsObjectOrLeadingLineBreaks = {}, reservedLineLength = undefined, isPre = undefined) {
    if (typeof optionsObjectOrLeadingLineBreaks === 'object') {
      // null passes the 'object' check - treat it as "no options".
      this._openBlock(optionsObjectOrLeadingLineBreaks || {});
    } else {
      this._openBlock({
        isPre: isPre,
        leadingLineBreaks: optionsObjectOrLeadingLineBreaks,
        reservedLineLength: reservedLineLength,
      });
    }
  }

  /**
   * Implementation of `openBlock` that only accepts the options object.
   *
   * The new block gets the line length of the current item
   * reduced by `reservedLineLength`, but never less than 20.
   *
   * @param { object } [param0] Object holding the parameters of the block.
   * @param { number } [param0.leadingLineBreaks = 1] Minimum number of line breaks before the block.
   * @param { number } [param0.reservedLineLength = 0] Number of characters reserved on each line.
   * @param { boolean } [param0.isPre = false] Should HTML whitespace be preserved inside this block.
   */
  _openBlock ({ leadingLineBreaks = 1, reservedLineLength = 0, isPre = false } = {}) {
    const maxLineLength = Math.max(20, this._stackItem.inlineTextBuilder.maxLineLength - reservedLineLength);
    this._stackItem = new BlockStackItem(
      this.options,
      this._stackItem,
      leadingLineBreaks,
      maxLineLength
    );
    if (isPre) { this._stackItem.isPre = true; }
  }

  /**
   * Finalize currently built block, add its content to the parent block.
   *
   * @param { object | number } [optionsObjectOrTrailingLineBreaks]
   * Object holding the parameters of the block.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrTrailingLineBreaks.trailingLineBreaks = 1]
   * This block should have at least this number of line breaks to separate it from any following block.
   *
   * @param { (str: string) => string } [optionsObjectOrTrailingLineBreaks.blockTransform = undefined]
   * A function to transform the block text before adding to the parent block.
   * This happens after word wrap and should be used in combination with reserved line length
   * in order to keep line lengths correct.
   * Used for whole block markup.
   *
   * @param { (str: string) => string } [blockTransform]
   * Deprecated.
   */
  closeBlock (optionsObjectOrTrailingLineBreaks = {}, blockTransform = undefined) {
    if (typeof optionsObjectOrTrailingLineBreaks === 'object') {
      // null passes the 'object' check - treat it as "no options".
      this._closeBlock(optionsObjectOrTrailingLineBreaks || {});
    } else {
      this._closeBlock({
        trailingLineBreaks: optionsObjectOrTrailingLineBreaks,
        blockTransform: blockTransform,
      });
    }
  }

  /**
   * Implementation of `closeBlock` that only accepts the options object.
   *
   * @param { object } [param0] Object holding the parameters of the block.
   * @param { number } [param0.trailingLineBreaks = 1] Minimum number of line breaks after the block.
   * @param { (str: string) => string } [param0.blockTransform = undefined]
   * A function to transform the block text before adding to the parent block.
   */
  _closeBlock ({ trailingLineBreaks = 1, blockTransform = undefined } = {}) {
    const block = this._popStackItem();
    const blockText = (blockTransform) ? blockTransform(getText(block)) : getText(block);
    // Line breaks stashed inside the closed block are carried over to the parent
    // when they exceed the requested number of trailing line breaks.
    addText(this._stackItem, blockText, block.leadingLineBreaks, Math.max(block.stashedLineBreaks, trailingLineBreaks));
  }

  /**
   * Start building a table.
   */
  openTable () {
    this._stackItem = new TableStackItem(this._stackItem);
  }

  /**
   * Start building a table row.
   *
   * @throws { Error } When the current stack item is not a table.
   */
  openTableRow () {
    if (!(this._stackItem instanceof TableStackItem)) {
      throw new Error('Can\'t add table row to something that is not a table! Check the formatter.');
    }
    this._stackItem = new TableRowStackItem(this._stackItem);
  }

  /**
   * Start building a table cell.
   *
   * @param { object | number } [optionsObjectOrMaxColumnWidth = undefined]
   * Object holding the parameters of the cell.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrMaxColumnWidth.maxColumnWidth = undefined]
   * Wrap cell content to this width. Fall back to global wordwrap value if undefined.
   *
   * @throws { Error } When the current stack item is not a table row.
   */
  openTableCell (optionsObjectOrMaxColumnWidth = {}) {
    if (typeof optionsObjectOrMaxColumnWidth === 'object') {
      // null passes the 'object' check - treat it as "no options".
      this._openTableCell(optionsObjectOrMaxColumnWidth || {});
    } else {
      this._openTableCell({ maxColumnWidth: optionsObjectOrMaxColumnWidth });
    }
  }

  /**
   * Implementation of `openTableCell` that only accepts the options object.
   *
   * @param { object } [param0] Object holding the parameters of the cell.
   * @param { number } [param0.maxColumnWidth = undefined] Wrap cell content to this width.
   *
   * @throws { Error } When the current stack item is not a table row.
   */
  _openTableCell ({ maxColumnWidth = undefined } = {}) {
    if (!(this._stackItem instanceof TableRowStackItem)) {
      throw new Error('Can\'t add table cell to something that is not a table row! Check the formatter.');
    }
    this._stackItem = new TableCellStackItem(this.options, this._stackItem, maxColumnWidth);
  }

  /**
   * Finalize currently built table cell and add it to parent table row's cells.
   *
   * @param { object | number } [optionsObjectOrColspan]
   * Object holding the parameters of the cell.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrColspan.colspan = 1] How many columns this cell should occupy.
   * @param { number } [optionsObjectOrColspan.rowspan = 1] How many rows this cell should occupy.
   *
   * @param { number } [rowspan] Deprecated.
   */
  closeTableCell (optionsObjectOrColspan = {}, rowspan = undefined) {
    if (typeof optionsObjectOrColspan === 'object') {
      // null passes the 'object' check - treat it as "no options".
      this._closeTableCell(optionsObjectOrColspan || {});
    } else {
      this._closeTableCell({
        colspan: optionsObjectOrColspan,
        rowspan: rowspan,
      });
    }
  }

  /**
   * Implementation of `closeTableCell` that only accepts the options object.
   *
   * Line break characters are trimmed from the cell text (via `trimCharacter`)
   * before the cell is pushed to the `cells` of the parent item.
   *
   * @param { object } [param0] Object holding the parameters of the cell.
   * @param { number } [param0.colspan = 1] How many columns this cell should occupy.
   * @param { number } [param0.rowspan = 1] How many rows this cell should occupy.
   */
  _closeTableCell ({ colspan = 1, rowspan = 1 } = {}) {
    const cell = this._popStackItem();
    const text = trimCharacter(getText(cell), '\n');
    cell.next.cells.push({ colspan: colspan, rowspan: rowspan, text: text });
  }

  /**
   * Finalize currently built table row and add it to parent table's rows.
   */
  closeTableRow () {
    const row = this._popStackItem();
    row.next.rows.push(row.cells);
  }

  /**
   * Finalize currently built table and add the rendered text to the parent block.
   *
   * @param { object | number } [optionsObjectOrColSpacing]
   * Object holding the parameters of the table.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrColSpacing.colSpacing = 3]
   * Number of spaces between table columns.
   *
   * @param { number } [optionsObjectOrColSpacing.rowSpacing = 0]
   * Number of empty lines between table rows.
   *
   * @param { number } [optionsObjectOrColSpacing.leadingLineBreaks = 2]
   * This table should have at least this number of line breaks to separate it from any preceding block.
   *
   * @param { number } [optionsObjectOrColSpacing.trailingLineBreaks = 2]
   * This table should have at least this number of line breaks to separate it from any following block.
   *
   * @param { number } [rowSpacing]
   * Deprecated.
   *
   * @param { number } [leadingLineBreaks]
   * Deprecated.
   *
   * @param { number } [trailingLineBreaks]
   * Deprecated.
   */
  closeTable (
    optionsObjectOrColSpacing = {},
    rowSpacing = undefined,
    leadingLineBreaks = undefined,
    trailingLineBreaks = undefined
  ) {
    if (typeof optionsObjectOrColSpacing === 'object') {
      // null passes the 'object' check - treat it as "no options".
      this._closeTable(optionsObjectOrColSpacing || {});
    } else {
      this._closeTable({
        colSpacing: optionsObjectOrColSpacing,
        leadingLineBreaks: leadingLineBreaks,
        rowSpacing: rowSpacing,
        trailingLineBreaks: trailingLineBreaks
      });
    }
  }

  /**
   * Implementation of `closeTable` that only accepts the options object.
   *
   * Nothing is added to the parent when the rendered table is an empty string.
   *
   * @param { object } [param0] Object holding the parameters of the table.
   * @param { number } [param0.colSpacing = 3] Number of spaces between table columns.
   * @param { number } [param0.rowSpacing = 0] Number of empty lines between table rows.
   * @param { number } [param0.leadingLineBreaks = 2] Minimum number of line breaks before the table.
   * @param { number } [param0.trailingLineBreaks = 2] Minimum number of line breaks after the table.
   */
  _closeTable ({ colSpacing = 3, rowSpacing = 0, leadingLineBreaks = 2, trailingLineBreaks = 2 } = {}) {
    const table = this._popStackItem();
    const output = tableToString(table.rows, rowSpacing, colSpacing);
    if (output) {
      addText(this._stackItem, output, leadingLineBreaks, trailingLineBreaks);
    }
  }

  /**
   * Return the rendered text content of this builder.
   *
   * @returns { string }
   */
  toString () {
    return getText(this._stackItem.getRoot());
    // There should only be the root item if everything is closed properly.
  }

}

/**
 * Get the text accumulated in a block or a table cell:
 * the raw text followed by the contents of the inline text builder (when it is not empty).
 *
 * @param { StackItem } stackItem A block or a table cell.
 * @returns { string }
 * @throws { Error } When the item is neither a block nor a table cell.
 */
function getText (stackItem) {
  if (!(
    stackItem instanceof BlockStackItem
    || stackItem instanceof TableCellStackItem
  )) {
    throw new Error('Only blocks and table cells can be requested for text contents.');
  }
  return (stackItem.inlineTextBuilder.isEmpty())
    ? stackItem.rawText
    : stackItem.rawText + stackItem.inlineTextBuilder.toString();
}

/**
 * Append finalized text to a block or a table cell.
 *
 * Whatever the item has accumulated so far (raw text and inline text) is folded into its raw text,
 * and the inline text builder is cleared.
 *
 * @param { StackItem } stackItem A block or a table cell to add the text to.
 * @param { string } text Text to add.
 * @param { number } leadingLineBreaks Minimum number of line breaks before the added text.
 * @param { number } trailingLineBreaks Number of line breaks to stash after the added text.
 * @throws { Error } When the item is neither a block nor a table cell.
 */
function addText (stackItem, text, leadingLineBreaks, trailingLineBreaks) {
  if (!(
    stackItem instanceof BlockStackItem
    || stackItem instanceof TableCellStackItem
  )) {
    throw new Error('Only blocks and table cells can contain text.');
  }
  const parentText = getText(stackItem);
  const lineBreaks = Math.max(stackItem.stashedLineBreaks, leadingLineBreaks);
  stackItem.inlineTextBuilder.clear();
  if (parentText) {
    stackItem.rawText = parentText + '\n'.repeat(lineBreaks) + text;
  } else {
    // Nothing to separate from inside this item yet -
    // the line breaks become leading line breaks of the item itself.
    stackItem.rawText = text;
    stackItem.leadingLineBreaks = lineBreaks;
  }
  stackItem.stashedLineBreaks = trailingLineBreaks;
}

module.exports = { BlockTextBuilder: BlockTextBuilder };