// html-to-text
// Version:
// Advanced html to plain text converter
// 416 lines (381 loc) • 13.8 kB
// JavaScript
// eslint-disable-next-line no-unused-vars
const { Picker } = require('selderee');
const { trimCharacter } = require('./helper');
// eslint-disable-next-line no-unused-vars
const { StackItem, BlockStackItem, TableCellStackItem, TableRowStackItem, TableStackItem, TransformerStackItem }
= require('./stack-item');
const { tableToString } = require('./table-printer');
const { WhitespaceProcessor } = require('./whitespace-processor');
// eslint-disable-next-line import/no-unassigned-import
require('./typedefs');
/**
 * Helps to build text from inline and block elements.
 *
 * The builder keeps a stack of items (blocks, tables, table rows, table cells)
 * linked through their `next` property. Opening an element pushes a new item,
 * closing it pops the item and merges its result into the new top of the stack.
 *
 * Public methods that take an options object also accept the deprecated
 * positional form; `null` in place of the options object means "use defaults".
 *
 * @class BlockTextBuilder
 */
class BlockTextBuilder {

  /**
   * Creates an instance of BlockTextBuilder.
   *
   * @param { Options } options HtmlToText options.
   * @param { Picker<DomNode, TagDefinition> } picker Selectors decision tree picker.
   */
  constructor (options, picker) {
    this.options = options;
    this.picker = picker;
    // The property name is misspelled ("whitepace"). It is kept as is
    // because code outside of this class may already read it.
    this.whitepaceProcessor = new WhitespaceProcessor(options);
    /** @type { StackItem } */
    this._stackItem = new BlockStackItem(options);
    /** @type { TransformerStackItem } */
    this._wordTransformer = undefined;
  }

  /**
   * Put a word-by-word transform function onto the transformations stack.
   *
   * Mainly used for uppercasing. Can be bypassed to add unformatted text such as URLs.
   *
   * Word transformations applied before wrapping.
   *
   * @param { (str: string) => string } wordTransform Word transformation function.
   */
  pushWordTransform (wordTransform) {
    this._wordTransformer = new TransformerStackItem(this._wordTransformer, wordTransform);
  }

  /**
   * Remove a function from the word transformations stack.
   *
   * @returns { (str: string) => string } A function that was removed,
   * or `undefined` when the stack was already empty.
   */
  popWordTransform () {
    if (!this._wordTransformer) { return undefined; }
    const transform = this._wordTransformer.transform;
    this._wordTransformer = this._wordTransformer.next;
    return transform;
  }

  /**
   * Build a single function that runs a string through every transformer
   * currently on the stack, starting from the top and following `next`.
   *
   * The stack is read when the returned function is called, not when it is created.
   *
   * @returns { (str: string) => string }
   */
  _getCombinedWordTransformer () {
    const applyTransformer = (str, transformer) =>
      ((transformer) ? applyTransformer(transformer.transform(str), transformer.next) : str);
    return (str) => applyTransformer(str, this._wordTransformer);
  }

  /**
   * Remove the current item from the stack and make its `next` item current.
   *
   * @returns { StackItem } The item that was removed.
   */
  _popStackItem () {
    const item = this._stackItem;
    this._stackItem = item.next;
    return item;
  }

  /**
   * Add a line break into currently built block.
   *
   * Does nothing when the current stack item is neither a block nor a table cell.
   */
  addLineBreak () {
    if (!(
      this._stackItem instanceof BlockStackItem
      || this._stackItem instanceof TableCellStackItem
    )) { return; }
    if (this._stackItem.isPre) {
      // Preformatted text bypasses the inline builder and is stored verbatim.
      this._stackItem.rawText += '\n';
    } else {
      this._stackItem.inlineTextBuilder.startNewLine();
    }
  }

  /**
   * Allow to break line in case directly following text will not fit.
   *
   * Does nothing when the current stack item is neither a block nor a table cell.
   */
  addWordBreakOpportunity () {
    if (
      this._stackItem instanceof BlockStackItem
      || this._stackItem instanceof TableCellStackItem
    ) {
      this._stackItem.inlineTextBuilder.wordBreakOpportunity = true;
    }
  }

  /**
   * Add a node inline into the currently built block.
   *
   * @param { string } str
   * Text content of a node to add.
   *
   * @param { object | boolean | null } [ optionsObjectOrNoWordTransform ]
   * Object holding the parameters of the operation. `null` means defaults.
   *
   * Boolean value is deprecated.
   *
   * @param { boolean } [ optionsObjectOrNoWordTransform.noWordTransform = false ]
   * Ignore word transformers if there are any.
   */
  addInline (str, optionsObjectOrNoWordTransform = {}) {
    this._addInline(str, resolveOptionsArgument(
      optionsObjectOrNoWordTransform,
      (noWordTransform) => ({ noWordTransform: noWordTransform })
    ));
  }

  /**
   * Implementation of `addInline` working with a normalized options object.
   *
   * @param { string } str Text content of a node to add.
   * @param { object } [options] Parameters of the operation.
   * @param { boolean } [options.noWordTransform = false] Ignore word transformers if there are any.
   */
  _addInline (str, { noWordTransform = false } = {}) {
    if (!(
      this._stackItem instanceof BlockStackItem
      || this._stackItem instanceof TableCellStackItem
    )) { return; }

    if (this._stackItem.isPre) {
      // Preformatted text is stored verbatim: no whitespace processing, no word transforms.
      this._stackItem.rawText += str;
      return;
    }

    if (
      this.whitepaceProcessor.testContainsWords(str) || // There are words to add;
      (str.length && !this._stackItem.stashedLineBreaks) // or at least spaces to take into account.
    ) {
      if (this._stackItem.stashedLineBreaks) {
        // Line breaks requested by previously closed child content are emitted
        // only now, when there is something to put after them.
        this._stackItem.inlineTextBuilder.startNewLine(this._stackItem.stashedLineBreaks);
      }
      this.whitepaceProcessor.shrinkWrapAdd(
        str,
        this._stackItem.inlineTextBuilder,
        (this._wordTransformer && !noWordTransform) ? this._getCombinedWordTransformer() : undefined
      );
      this._stackItem.stashedLineBreaks = 0; // inline text doesn't introduce line breaks
    }
  }

  /**
   * Start building a new block.
   *
   * @param { object | number | null } [optionsObjectOrLeadingLineBreaks]
   * Object holding the parameters of the block. `null` means defaults.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrLeadingLineBreaks.leadingLineBreaks = 1]
   * This block should have at least this number of line breaks to separate it from any preceding block.
   *
   * @param { number } [optionsObjectOrLeadingLineBreaks.reservedLineLength = 0]
   * Reserve this number of characters on each line for block markup.
   *
   * @param { boolean } [optionsObjectOrLeadingLineBreaks.isPre = false]
   * Should HTML whitespace be preserved inside this block.
   *
   * @param { number } [reservedLineLength]
   * Deprecated. Only used together with a numeric first argument.
   *
   * @param { boolean } [isPre]
   * Deprecated. Only used together with a numeric first argument.
   */
  openBlock (optionsObjectOrLeadingLineBreaks = {}, reservedLineLength = undefined, isPre = undefined) {
    this._openBlock(resolveOptionsArgument(
      optionsObjectOrLeadingLineBreaks,
      (leadingLineBreaks) => ({
        isPre: isPre,
        leadingLineBreaks: leadingLineBreaks,
        reservedLineLength: reservedLineLength,
      })
    ));
  }

  /**
   * Implementation of `openBlock` working with a normalized options object.
   *
   * @param { object } [options] Parameters of the block.
   * @param { number } [options.leadingLineBreaks = 1] Minimum number of line breaks before the block.
   * @param { number } [options.reservedLineLength = 0] Characters reserved on each line for block markup.
   * @param { boolean } [options.isPre = false] Should HTML whitespace be preserved inside this block.
   */
  _openBlock ({ leadingLineBreaks = 1, reservedLineLength = 0, isPre = false } = {}) {
    // The available width shrinks by the reserved length but never goes below 20 characters.
    const maxLineLength = Math.max(20, this._stackItem.inlineTextBuilder.maxLineLength - reservedLineLength);
    this._stackItem = new BlockStackItem(
      this.options,
      this._stackItem,
      leadingLineBreaks,
      maxLineLength
    );
    // The flag is only ever switched on here; a falsy value leaves
    // whatever the new stack item was constructed with.
    if (isPre) { this._stackItem.isPre = true; }
  }

  /**
   * Finalize currently built block, add its content to the parent block.
   *
   * @param { object | number | null } [optionsObjectOrTrailingLineBreaks]
   * Object holding the parameters of the block. `null` means defaults.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrTrailingLineBreaks.trailingLineBreaks = 1]
   * This block should have at least this number of line breaks to separate it from any following block.
   *
   * @param { (str: string) => string } [optionsObjectOrTrailingLineBreaks.blockTransform = undefined]
   * A function to transform the block text before adding to the parent block.
   * This happens after word wrap and should be used in combination with reserved line length
   * in order to keep line lengths correct.
   * Used for whole block markup.
   *
   * @param { (str: string) => string } [blockTransform]
   * Deprecated. Only used together with a numeric first argument.
   */
  closeBlock (optionsObjectOrTrailingLineBreaks = {}, blockTransform = undefined) {
    this._closeBlock(resolveOptionsArgument(
      optionsObjectOrTrailingLineBreaks,
      (trailingLineBreaks) => ({
        trailingLineBreaks: trailingLineBreaks,
        blockTransform: blockTransform,
      })
    ));
  }

  /**
   * Implementation of `closeBlock` working with a normalized options object.
   *
   * @param { object } [options] Parameters of the block.
   * @param { number } [options.trailingLineBreaks = 1] Minimum number of line breaks after the block.
   * @param { (str: string) => string } [options.blockTransform = undefined] Whole block text transformation.
   */
  _closeBlock ({ trailingLineBreaks = 1, blockTransform = undefined } = {}) {
    const block = this._popStackItem();
    const blockText = (blockTransform) ? blockTransform(getText(block)) : getText(block);
    // Line breaks still stashed inside the block compete with the requested trailing ones - the bigger wins.
    addText(this._stackItem, blockText, block.leadingLineBreaks, Math.max(block.stashedLineBreaks, trailingLineBreaks));
  }

  /**
   * Start building a table.
   */
  openTable () {
    this._stackItem = new TableStackItem(this._stackItem);
  }

  /**
   * Start building a table row.
   *
   * @throws { Error } When the current stack item is not a table.
   */
  openTableRow () {
    if (!(this._stackItem instanceof TableStackItem)) {
      throw new Error('Can\'t add table row to something that is not a table! Check the formatter.');
    }
    this._stackItem = new TableRowStackItem(this._stackItem);
  }

  /**
   * Start building a table cell.
   *
   * @param { object | number | null } [optionsObjectOrMaxColumnWidth]
   * Object holding the parameters of the cell. `null` means defaults.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrMaxColumnWidth.maxColumnWidth = undefined]
   * Wrap cell content to this width. Fall back to global wordwrap value if undefined.
   *
   * @throws { Error } When the current stack item is not a table row.
   */
  openTableCell (optionsObjectOrMaxColumnWidth = {}) {
    this._openTableCell(resolveOptionsArgument(
      optionsObjectOrMaxColumnWidth,
      (maxColumnWidth) => ({ maxColumnWidth: maxColumnWidth })
    ));
  }

  /**
   * Implementation of `openTableCell` working with a normalized options object.
   *
   * @param { object } [options] Parameters of the cell.
   * @param { number } [options.maxColumnWidth = undefined] Wrap cell content to this width.
   * @throws { Error } When the current stack item is not a table row.
   */
  _openTableCell ({ maxColumnWidth = undefined } = {}) {
    if (!(this._stackItem instanceof TableRowStackItem)) {
      throw new Error('Can\'t add table cell to something that is not a table row! Check the formatter.');
    }
    this._stackItem = new TableCellStackItem(this.options, this._stackItem, maxColumnWidth);
  }

  /**
   * Finalize currently built table cell and add it to parent table row's cells.
   *
   * @param { object | number | null } [optionsObjectOrColspan]
   * Object holding the parameters of the cell. `null` means defaults.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrColspan.colspan = 1] How many columns this cell should occupy.
   * @param { number } [optionsObjectOrColspan.rowspan = 1] How many rows this cell should occupy.
   *
   * @param { number } [rowspan] Deprecated. Only used together with a numeric first argument.
   */
  closeTableCell (optionsObjectOrColspan = {}, rowspan = undefined) {
    this._closeTableCell(resolveOptionsArgument(
      optionsObjectOrColspan,
      (colspan) => ({
        colspan: colspan,
        rowspan: rowspan,
      })
    ));
  }

  /**
   * Implementation of `closeTableCell` working with a normalized options object.
   *
   * @param { object } [options] Parameters of the cell.
   * @param { number } [options.colspan = 1] How many columns this cell should occupy.
   * @param { number } [options.rowspan = 1] How many rows this cell should occupy.
   */
  _closeTableCell ({ colspan = 1, rowspan = 1 } = {}) {
    const cell = this._popStackItem();
    // Line breaks at the edges of the cell text are dropped before the cell is stored.
    const text = trimCharacter(getText(cell), '\n');
    cell.next.cells.push({ colspan: colspan, rowspan: rowspan, text: text });
  }

  /**
   * Finalize currently built table row and add it to parent table's rows.
   */
  closeTableRow () {
    const row = this._popStackItem();
    row.next.rows.push(row.cells);
  }

  /**
   * Finalize currently built table and add the rendered text to the parent block.
   *
   * @param { object | number | null } [optionsObjectOrColSpacing]
   * Object holding the parameters of the table. `null` means defaults.
   *
   * Number value is deprecated.
   *
   * @param { number } [optionsObjectOrColSpacing.colSpacing = 3]
   * Number of spaces between table columns.
   *
   * @param { number } [optionsObjectOrColSpacing.rowSpacing = 0]
   * Number of empty lines between table rows.
   *
   * @param { number } [optionsObjectOrColSpacing.leadingLineBreaks = 2]
   * This table should have at least this number of line breaks to separate it from any preceding block.
   *
   * @param { number } [optionsObjectOrColSpacing.trailingLineBreaks = 2]
   * This table should have at least this number of line breaks to separate it from any following block.
   *
   * @param { number } [rowSpacing]
   * Deprecated. Only used together with a numeric first argument.
   *
   * @param { number } [leadingLineBreaks]
   * Deprecated. Only used together with a numeric first argument.
   *
   * @param { number } [trailingLineBreaks]
   * Deprecated. Only used together with a numeric first argument.
   */
  closeTable (
    optionsObjectOrColSpacing = {},
    rowSpacing = undefined,
    leadingLineBreaks = undefined,
    trailingLineBreaks = undefined
  ) {
    this._closeTable(resolveOptionsArgument(
      optionsObjectOrColSpacing,
      (colSpacing) => ({
        colSpacing: colSpacing,
        leadingLineBreaks: leadingLineBreaks,
        rowSpacing: rowSpacing,
        trailingLineBreaks: trailingLineBreaks
      })
    ));
  }

  /**
   * Implementation of `closeTable` working with a normalized options object.
   *
   * @param { object } [options] Parameters of the table.
   * @param { number } [options.colSpacing = 3] Number of spaces between table columns.
   * @param { number } [options.rowSpacing = 0] Number of empty lines between table rows.
   * @param { number } [options.leadingLineBreaks = 2] Minimum number of line breaks before the table.
   * @param { number } [options.trailingLineBreaks = 2] Minimum number of line breaks after the table.
   */
  _closeTable ({ colSpacing = 3, rowSpacing = 0, leadingLineBreaks = 2, trailingLineBreaks = 2 } = {}) {
    const table = this._popStackItem();
    const output = tableToString(table.rows, rowSpacing, colSpacing);
    // A table that renders to nothing leaves the parent block untouched.
    if (output) {
      addText(this._stackItem, output, leadingLineBreaks, trailingLineBreaks);
    }
  }

  /**
   * Return the rendered text content of this builder.
   *
   * @returns { string }
   */
  toString () {
    return getText(this._stackItem.getRoot());
    // There should only be the root item if everything is closed properly.
  }

}

/**
 * Normalize the first argument of a builder method that accepts
 * either an options object or a deprecated positional value.
 *
 * @param { any } value
 * The argument as received by the public method.
 *
 * @param { (legacyValue: any) => object } fromLegacyValue
 * Builds an options object out of a deprecated positional value.
 *
 * @returns { object } An object that is safe to destructure.
 */
function resolveOptionsArgument (value, fromLegacyValue) {
  // `typeof null` is 'object', but destructuring null throws - treat it as "no options given".
  if (value === null || value === undefined) { return {}; }
  return (typeof value === 'object') ? value : fromLegacyValue(value);
}
/**
 * Read the text collected so far by a block or a table cell.
 *
 * The result is the item's raw text followed by whatever its inline text
 * builder currently holds (nothing is appended when the builder is empty).
 *
 * @param { StackItem } stackItem A block or a table cell stack item.
 * @returns { string } Text content of the item.
 * @throws { Error } When the item is neither a block nor a table cell.
 */
function getText (stackItem) {
  const holdsText = stackItem instanceof BlockStackItem || stackItem instanceof TableCellStackItem;
  if (!holdsText) {
    throw new Error('Only blocks and table cells can be requested for text contents.');
  }

  const { rawText, inlineTextBuilder } = stackItem;
  if (inlineTextBuilder.isEmpty()) {
    return rawText;
  }
  return rawText + inlineTextBuilder.toString();
}
/**
 * Append already rendered text (of a child block or a table) to a block or a table cell.
 *
 * Everything the target item holds so far is folded into its raw text and its
 * inline text builder is cleared. When the target already has text, the new
 * text is separated from it by the larger of the target's stashed line breaks
 * and `leadingLineBreaks`. When the target is empty, the new text becomes its
 * raw text and that number is recorded as the target's own leading line breaks.
 *
 * @param { StackItem } stackItem A block or a table cell stack item to add the text to.
 * @param { string } text Text to add.
 * @param { number } leadingLineBreaks Minimum number of line breaks before the text.
 * @param { number } trailingLineBreaks Number of line breaks to stash on the target after the text.
 * @throws { Error } When the item is neither a block nor a table cell.
 */
function addText (stackItem, text, leadingLineBreaks, trailingLineBreaks) {
  const canHoldText = stackItem instanceof BlockStackItem || stackItem instanceof TableCellStackItem;
  if (!canHoldText) {
    throw new Error('Only blocks and table cells can contain text.');
  }

  // Snapshot the existing content before the inline builder gets wiped.
  const existingText = getText(stackItem);
  const gap = Math.max(stackItem.stashedLineBreaks, leadingLineBreaks);
  stackItem.inlineTextBuilder.clear();

  if (!existingText) {
    // Nothing to separate from yet - the line breaks become the item's own leading ones.
    stackItem.leadingLineBreaks = gap;
  }
  stackItem.rawText = (existingText)
    ? existingText + '\n'.repeat(gap) + text
    : text;
  stackItem.stashedLineBreaks = trailingLineBreaks;
}
// Only the builder class is exported; `getText` and `addText` stay private to this module.
module.exports = { BlockTextBuilder: BlockTextBuilder };