UNPKG

tag-soup

Version:

The fastest pure JS SAX/DOM XML/HTML parser.

187 lines (186 loc) 5.72 kB
import { type ResolvedTokenizerOptions, type TokenCallback } from './tokenizeMarkup.js';

/**
 * Options of the {@link createTokenizer} that are applied depending on the enclosing context.
 *
 * The type is recursive: {@link foreignTags} maps a tag name to another set of contextual options.
 *
 * @group Tokenizer
 */
export interface ContextualTokenizerOptions {
  /**
   * The map from a foreign tag name to the tokenizer options applied to the children of that tag.
   */
  foreignTags?: Record<string, ContextualTokenizerOptions>;

  /**
   * If `true` then self-closing tags are recognized, otherwise they are treated as start tags.
   *
   * @default false
   */
  areSelfClosingTagsRecognized?: boolean;

  /**
   * If `true` then CDATA sections are recognized.
   *
   * @default false
   */
  areCDATASectionsRecognized?: boolean;

  /**
   * If `true` then processing instructions are recognized.
   *
   * @default false
   */
  areProcessingInstructionsRecognized?: boolean;
}

/**
 * Options of the {@link createTokenizer}.
 *
 * Extends {@link ContextualTokenizerOptions}, so all contextual options can also be set at the top level.
 *
 * @group Tokenizer
 */
export interface TokenizerOptions extends ContextualTokenizerOptions {
  /**
   * The list of tags that can't have any contents (since there's no end tag, no content can be put between the start
   * tag and the end tag).
   *
   * @example
   * ['link', 'meta']
   * @see [HTML5 Void Elements](https://www.w3.org/TR/2010/WD-html5-20101019/syntax.html#void-elements)
   */
  voidTags?: string[];

  /**
   * The list of tags whose content is interpreted as plain text.
   *
   * @example
   * ['script', 'style']
   * @see [HTML5 Raw Text Elements](https://www.w3.org/TR/2010/WD-html5-20101019/syntax.html#raw-text-elements)
   */
  rawTextTags?: string[];

  /**
   * The map from a tag (A) to a list of tags that must be closed if tag (A) is opened.
   *
   * For example, in HTML `p` and `h1` tags have the following semantics:
   *
   * ```html
   * <p><h1> → <p></p><h1></h1>
   *              ^^^^ p is implicitly closed by h1
   * ```
   *
   * To achieve this behavior, set this option to:
   *
   * ```ts
   * // h1 implicitly closes p
   * { h1: ['p'] }
   * ```
   *
   * Use in conjunction with {@link areUnbalancedStartTagsImplicitlyClosed}.
   */
  implicitlyClosedTags?: Record<string, string[]>;

  /**
   * The list of tags for which a start tag is inserted if an unbalanced end tag is met. Otherwise,
   * a {@link ParserError} is thrown.
   *
   * You can ignore unbalanced end tags with {@link areUnbalancedEndTagsIgnored}.
   *
   * For example, in HTML `p` and `br` tags follow these semantics:
   *
   * ```html
   * </p> → <p></p>
   *        ^^^ p is implicitly opened
   *
   * </br> → <br/>
   *         ^ br is implicitly opened
   * ```
   *
   * To achieve this behavior, set this option to:
   *
   * ```ts
   * ['p', 'br']
   * ```
   *
   * @see {@link areUnbalancedEndTagsIgnored}
   */
  implicitlyOpenedTags?: string[];

  /**
   * If `true` then ASCII alpha characters are case-insensitive in tag names.
   *
   * @default false
   */
  areTagNamesCaseInsensitive?: boolean;

  /**
   * If `true` then unbalanced start tags are forcefully closed. Otherwise, a {@link ParserError} is thrown.
   *
   * Use in conjunction with {@link areUnbalancedEndTagsIgnored}.
   *
   * ```html
   * <a><b></a> → <a><b></b></a>
   *                    ^^^^ b is implicitly closed
   * ```
   *
   * @default false
   */
  areUnbalancedStartTagsImplicitlyClosed?: boolean;

  /**
   * If `true` then end tags that don't have a corresponding start tag are ignored. Otherwise,
   * a {@link ParserError} is thrown.
   *
   * Use in conjunction with {@link areUnbalancedStartTagsImplicitlyClosed}.
   *
   * ```html
   * <a></b></a> → <a></a>
   *    ^^^^ b is ignored
   * ```
   *
   * @default false
   */
  areUnbalancedEndTagsIgnored?: boolean;

  /**
   * If `true` then tag names and attributes are processed with XML constraints.
   *
   * @default false
   */
  isStrict?: boolean;
}

/**
 * A tokenizer that reads tokens from text and returns them by invoking a callback.
 *
 * @see {@link createTokenizer}
 * @group Tokenizer
 */
export interface Tokenizer {
  /**
   * Reads tokens from text and returns them by invoking a callback.
   *
   * NOTE(review): this declaration does not show how this method differs from {@link tokenizeFragment};
   * presumably `text` is treated as a complete document. Confirm against the implementation.
   *
   * @param text The text string to read tokens from.
   * @param callback The callback that is invoked when a token is read.
   */
  tokenizeDocument(text: string, callback: TokenCallback): void;

  /**
   * Reads tokens from text and returns them by invoking a callback.
   *
   * NOTE(review): this declaration does not show how this method differs from {@link tokenizeDocument};
   * presumably `text` is treated as a fragment of markup. Confirm against the implementation.
   *
   * @param text The text string to read tokens from.
   * @param callback The callback that is invoked when a token is read.
   */
  tokenizeFragment(text: string, callback: TokenCallback): void;
}

/**
 * Creates a tokenizer that reads tokens from text and returns them by invoking a callback.
 *
 * Tokens are _guaranteed_ to be returned in correct order. Missing tokens are inserted to restore the correct order if
 * needed.
 *
 * @example
 * import { createTokenizer, htmlTokenizerOptions } from 'tag-soup';
 *
 * const tokenizer = createTokenizer(htmlTokenizerOptions);
 *
 * tokenizer.tokenizeFragment(
 *   'Hello, <b>Bob</b>!',
 *   (token, startIndex, endIndex) => {
 *     // Handle token here
 *   },
 * );
 *
 * @param options Tokenizer options. May be omitted.
 * @returns A {@link Tokenizer} exposing `tokenizeDocument` and `tokenizeFragment`.
 * @group Tokenizer
 */
export declare function createTokenizer(options?: TokenizerOptions): Tokenizer;

/**
 * Converts human-readable tokenizer options into options consumed by {@link tokenizeMarkup}.
 *
 * @param options The human-readable tokenizer options to convert.
 * @returns The resolved options, in the form declared by `ResolvedTokenizerOptions` in `./tokenizeMarkup.js`.
 */
export declare function resolveTokenizerOptions(options: TokenizerOptions): ResolvedTokenizerOptions;