@types/html-to-text
Version:
TypeScript definitions for html-to-text
407 lines (390 loc) • 12.8 kB
TypeScript
import { BlockTextBuilder } from "./lib/block-text-builder";
/**
 * Converter function produced by `compile`.
 *
 * Accepts the HTML string and, optionally, a metadata value (the same kind of
 * trailing argument that `htmlToText` accepts) and returns plain text.
 * The `metadata` parameter is optional, so existing one-argument usages
 * remain valid.
 */
export type compiledFunction = (str: string, metadata?: metaData) => string;
/**
 * Type of the optional `metadata` argument of `htmlToText` and of the `meta`
 * argument received by `FormatOptions.pathRewrite`.
 *
 * Intentionally untyped: its shape is chosen by the caller.
 */
export type metaData = any;
/**
 * Preprocess options, compile selectors into a decision tree,
 * and return a function intended for batch processing.
 *
 * @param options Conversion options; may be omitted.
 * @returns A {@link compiledFunction} built from the preprocessed options.
 */
export function compile(options?: HtmlToTextOptions): compiledFunction;
/**
 * Convert given HTML content to plain text string.
 *
 * @param html HTML content to convert.
 * @param options Conversion options; may be omitted.
 * @param metadata Optional value of type {@link metaData} (currently `any`).
 * @returns The resulting plain text.
 *
 * @example
 * const { htmlToText } = require('html-to-text');
 * const text = htmlToText('<h1>Hello World</h1>', {
 *   wordwrap: 130
 * });
 * console.log(text); // HELLO WORLD
 */
export function htmlToText(html: string, options?: HtmlToTextOptions, metadata?: metaData): string;
/** `convert` is a second export name for {@link htmlToText}. */
export { htmlToText as convert };
/**
 * Options accepted by `htmlToText` and `compile`.
 */
export interface HtmlToTextOptions {
    /**
     * Narrows the conversion down to the informative parts of the HTML document.
     */
    baseElements?: BaseElementsOptions | undefined;

    /**
     * When `true`, HTML entities found in the input HTML are decoded.
     * Otherwise they are preserved in the output text.
     */
    decodeEntities?: boolean | undefined;

    /**
     * Dictionary of characters that should be replaced in the output text,
     * mapped to their corresponding escape sequences.
     */
    encodeCharacters?: Record<string, string> | undefined;

    /**
     * Dictionary of custom formatting functions for specific kinds of elements.
     *
     * Each key is a custom string identifier; each value is the callback.
     */
    formatters?: Record<string, FormatCallback> | undefined;

    /**
     * Limits for handling complex documents and capping the output size.
     */
    limits?: LimitsOptions | undefined;

    /**
     * How long words are wrapped.
     */
    longWordSplit?: LongWordSplitOptions | undefined;

    /**
     * Newlines (`\n`) from the input HTML are dropped by default.
     *
     * Set to `true` to preserve them in the output.
     */
    preserveNewlines?: boolean | undefined;

    /**
     * Rendering instructions for HTML elements, keyed by matched selectors.
     *
     * Use this to define options for new tags or redefine them for
     * already supported ones.
     */
    selectors?: SelectorDefinition[] | undefined;

    /**
     * Every character that counts as whitespace.
     * The default follows the HTML specifications.
     */
    whitespaceCharacters?: string | undefined;

    /**
     * Number of characters after which a line break should follow in `p` elements.
     *
     * `null` or `false` disables word-wrapping.
     */
    wordwrap?: number | false | null | undefined;

    // The options below are deprecated. See the documentation.

    /**
     * @deprecated Use baseElements.selectors instead.
     */
    baseElement?: string | string[] | undefined;

    /**
     * @deprecated Use baseElements instead.
     */
    returnDomByDefault?: boolean | undefined;

    /**
     * @deprecated Use selectors with `format: 'dataTable'` instead.
     */
    tables?: string[] | boolean | undefined;

    /**
     * @deprecated Use selectors instead.
     */
    tags?: TagDefinitions | undefined;
}
/**
 * Settings that narrow the conversion down to the informative parts of an HTML document.
 */
export interface BaseElementsOptions {
    /**
     * Selectors of the elements whose text content makes up the resulting text output.
     */
    selectors?: string[] | undefined;

    /**
     * With multiple selectors set, decides whether the order of the selectors
     * has to be reflected in the output text.
     *
     * - `'selectors'` (default): matches for the first selector appear first, and so on.
     * - `'occurrence'`: all bases appear in the same order as in the input HTML.
     */
    orderBy?: "selectors" | "occurrence" | undefined;

    /**
     * Fall back to the entire document when none of the provided selectors matched.
     */
    returnDomByDefault?: boolean | undefined;
}
/**
 * Limits for handling complex documents and capping the output size.
 */
export interface LimitsOptions {
    /**
     * String inserted in place of skipped content.
     *
     * NOTE(review): the previous comment carried a truncated default marker;
     * the default is presumably `'...'` - confirm against the library README.
     */
    ellipsis?: string | undefined;

    /**
     * Once this many base elements have been found, stop looking for more.
     *
     * Unlimited if undefined.
     */
    maxBaseElements?: number | undefined;

    /**
     * Maximum number of child nodes of a single node that are added to the output.
     *
     * Unlimited if undefined.
     */
    maxChildNodes?: number | undefined;

    /**
     * Descend only to this depth, starting from `Options.baseElement`.
     *
     * Deeper nodes are replaced with ellipsis.
     *
     * No depth limit if undefined.
     */
    maxDepth?: number | undefined;

    /**
     * An input string longer than this value is truncated,
     * and a message is sent to `stderr`.
     *
     * Ellipsis is not used in this case.
     */
    maxInputLength?: number | undefined;
}
/**
 * Settings that control how long words are wrapped.
 */
export interface LongWordSplitOptions {
    /**
     * When there are no characters to wrap on, break long words on the
     * `Options.wordwrap` limit.
     */
    forceWrapOnLimit?: boolean | undefined;

    /**
     * Array of the characters that may be wrapped on.
     */
    wrapCharacters?: string[] | undefined;
}
/**
 * Tells the converter how to handle the tags matched by one selector.
 */
export interface SelectorDefinition {
    /**
     * CSS selector. See the README for notes on which selectors are supported.
     */
    selector: string;

    /**
     * Identifier of a {@link FormatCallback}: either a built-in one or one
     * provided in the `Options.formatters` dictionary.
     */
    format?: string | undefined;

    /**
     * Options that customize the formatter for this tag.
     */
    options?: FormatOptions | undefined;
}
/**
 * Tells the converter how to handle a single tag.
 */
export interface TagDefinition {
    /**
     * Identifier of a {@link FormatCallback}: either a built-in one or one
     * provided in the `Options.formatters` dictionary.
     */
    format?: string | undefined;

    /**
     * Options that customize the formatter for this tag.
     */
    options?: FormatOptions | undefined;
}
/**
 * Options specific to the different formatters ({@link FormatCallback}).
 *
 * This is an umbrella type definition: each formatter supports its own subset of options.
 */
export interface FormatOptions {
    /**
     * Number of line breaks separating the previous block from this one.
     *
     * N empty lines require N+1 line breaks.
     */
    leadingLineBreaks?: number | undefined;

    /**
     * Number of line breaks separating this block from the next one.
     *
     * N empty lines require N+1 line breaks.
     */
    trailingLineBreaks?: number | undefined;

    /**
     * (Only for: `anchor` and `image` formatters.) Server host for link `href`
     * attributes and image `src` attributes that are relative to the root
     * (the ones that start with `/`).
     *
     * For example, with `baseUrl = 'http://asdf.com'` and `<a href='/dir/subdir'>...</a>`
     * the link in the text will be `http://asdf.com/dir/subdir`.
     *
     * Note that `baseUrl` should not end with a `/`.
     */
    baseUrl?: string | undefined;

    /**
     * Brackets to surround links with.
     *
     * Set to `false` or `['', '']` to disable.
     * @default ['[', ']']
     */
    linkBrackets?: [string, string] | false | undefined;

    /**
     * (Only for: `anchor` and `image` formatters.) Function that rewrites link
     * `href` attributes and image `src` attributes. It is applied before `baseUrl`.
     */
    pathRewrite?: ((path: string, meta: metaData) => string) | undefined;

    /**
     * (Only for: `anchor` formatter.) Links are translated in the following way by default:
     *
     * `<a href='link'>text</a>` => becomes => `text [link]`.
     *
     * With this option set to `true`, `[link]` is omitted and only `text`
     * is present whenever `link` and `text` are the same.
     */
    hideLinkHrefIfSameAsText?: boolean | undefined;

    /**
     * (Only for: `anchor` formatter.) Ignore all links and only process the
     * internal text of anchor tags.
     */
    ignoreHref?: boolean | undefined;

    /**
     * (Only for: `anchor` formatter.) Ignore anchor links (where `href='#...'`).
     */
    noAnchorUrl?: boolean | undefined;

    /**
     * (Only for: `unorderedList` formatter.) String prefix for each list item.
     */
    itemPrefix?: string | undefined;

    /**
     * (Only for: `heading` formatter.) Headings (`<h1>`, `<h2>`, etc) are uppercased by default.
     *
     * Set this to `false` to leave headings as they are.
     */
    uppercase?: boolean | undefined;

    /**
     * (Only for: `horizontalLine` formatter.) Length of the `<hr/>` line.
     *
     * A numeric value provided here is used first.
     * Otherwise, the global `wordwrap` number is used if it is provided.
     * If neither is available, the fallback value of 40 is used.
     */
    length?: number | undefined;

    /**
     * (Only for: `blockquote` formatter.) Trim empty lines from blockquote.
     */
    trimEmptyLines?: boolean | undefined;

    /**
     * (Only for: `table`, `dataTable` formatter.) Heading cells (`<th>`) are uppercased by default.
     *
     * Set this to `false` to leave heading cells as they are.
     */
    uppercaseHeaderCells?: boolean | undefined;

    /**
     * (Only for: `table`, `dataTable` formatter.) Data table cell content is wrapped
     * to fit this width instead of the global `wordwrap` limit.
     *
     * Set to `undefined` to fall back to the `wordwrap` limit.
     */
    maxColumnWidth?: number | undefined;

    /**
     * (Only for: `table`, `dataTable` formatter.) Number of spaces between data table columns.
     */
    colSpacing?: number | undefined;

    /**
     * (Only for: `table`, `dataTable` formatter.) Number of empty lines between data table rows.
     */
    rowSpacing?: number | undefined;

    /**
     * (Only for: `blockString`, `inlineString` formatters.) String inserted in place of a tag.
     */
    string?: string | undefined;

    /**
     * (Only for: `inlineSurround` formatter.) String prefix inserted before inline tag contents.
     */
    prefix?: string | undefined;

    /**
     * (Only for: `inlineSurround` formatter.) String suffix inserted after inline tag contents.
     */
    suffix?: string | undefined;

    /**
     * User defined values are supported.
     */
    [key: string]: any;

    /**
     * (Only for: `anchor` formatter.) Don't print brackets around links.
     *
     * @deprecated Use linkBrackets instead.
     */
    noLinkBrackets?: boolean | undefined;
}
/**
 * Simplified definition of the [htmlparser2](https://github.com/fb55/htmlparser2) Node type.
 *
 * Elements (tags) and data nodes are not distinguished here (good enough for now).
 */
export interface DomNode {
    /**
     * Node type: "text", "tag", "comment", "script", etc.
     */
    type: string;

    /**
     * Content of a data node.
     */
    data?: string | undefined;

    /**
     * Name of the tag.
     */
    name?: string | undefined;

    /**
     * Dictionary of tag attributes.
     */
    attribs?: any;

    /**
     * Child nodes.
     * Declared as required (not optional) for TypeScript use.
     */
    children: DomNode[];

    /**
     * Parent node.
     */
    parent?: DomNode | undefined;
}
/**
 * Function that stringifies a DOM node.
 *
 * It receives the node (`elem`), a {@link RecursiveCallback} (`walk`),
 * the `BlockTextBuilder` (`builder`) and the {@link FormatOptions} that
 * apply to the formatter (`formatOptions`). The return value is ignored.
 */
export type FormatCallback = (
    elem: DomNode,
    walk: RecursiveCallback,
    builder: BlockTextBuilder,
    formatOptions: FormatOptions,
) => void;
/**
 * Function that processes child nodes.
 *
 * A {@link FormatCallback} receives it as one of its arguments.
 */
export type RecursiveCallback = (nodes: DomNode[], builder: BlockTextBuilder) => void;
/**
 * Shape of the object passed as `tags` in the options.
 *
 * Every key is optional and maps to a {@link TagDefinition}.
 *
 * NOTE(review): the empty-string key presumably applies to tags that have no
 * entry of their own - confirm against the library documentation.
 */
export interface TagDefinitions {
    ""?: TagDefinition | undefined;
    a?: TagDefinition | undefined;
    article?: TagDefinition | undefined;
    aside?: TagDefinition | undefined;
    blockquote?: TagDefinition | undefined;
    br?: TagDefinition | undefined;
    div?: TagDefinition | undefined;
    footer?: TagDefinition | undefined;
    form?: TagDefinition | undefined;
    h1?: TagDefinition | undefined;
    h2?: TagDefinition | undefined;
    h3?: TagDefinition | undefined;
    h4?: TagDefinition | undefined;
    h5?: TagDefinition | undefined;
    h6?: TagDefinition | undefined;
    header?: TagDefinition | undefined;
    hr?: TagDefinition | undefined;
    img?: TagDefinition | undefined;
    main?: TagDefinition | undefined;
    nav?: TagDefinition | undefined;
    ol?: TagDefinition | undefined;
    p?: TagDefinition | undefined;
    pre?: TagDefinition | undefined;
    table?: TagDefinition | undefined;
    ul?: TagDefinition | undefined;
    wbr?: TagDefinition | undefined;
}