UNPKG

tag-soup

Version:

The fastest pure JS SAX/DOM XML/HTML parser.

70 lines (69 loc) 3.35 kB
import {
  getCaseInsensitiveHashCode,
  getCaseSensitiveHashCode,
  tokenizeMarkup,
} from './tokenizeMarkup.js';

/**
 * Reads tokens from text and returns them by invoking a callback.
 *
 * Tokens are _guaranteed_ to be returned in correct order. Missing tokens are inserted to restore the correct order if
 * needed.
 *
 * The options are resolved once when the tokenizer is created. The returned object exposes two methods that both
 * delegate to {@link tokenizeMarkup}: `tokenizeDocument` uses the resolved options as is (`isFragment: false`), and
 * `tokenizeFragment` uses a copy of them with `isFragment: true`.
 *
 * @example
 * import { createTokenizer, htmlTokenizerOptions } from 'tag-soup';
 *
 * const tokenizer = createTokenizer(htmlTokenizerOptions);
 *
 * tokenizer.tokenizeFragment(
 *   'Hello, <b>Bob</b>!',
 *   (token, startIndex, endIndex) => {
 *     // Handle token here
 *   },
 * );
 *
 * @param options Tokenizer options.
 * @returns An object with `tokenizeDocument(text, callback)` and `tokenizeFragment(text, callback)` methods.
 * @group Tokenizer
 */
export function createTokenizer(options = {}) {
  const documentOptions = resolveTokenizerOptions(options);

  // Same resolved options, except that the text is treated as a fragment.
  const fragmentOptions = Object.assign({}, documentOptions, { isFragment: true });

  return {
    tokenizeDocument(text, callback) {
      return tokenizeMarkup(text, callback, documentOptions);
    },

    tokenizeFragment(text, callback) {
      return tokenizeMarkup(text, callback, fragmentOptions);
    },
  };
}

/**
 * Converts human-readable tokenizer options into options consumed by {@link tokenizeMarkup}.
 *
 * Tag names are replaced with their hash codes: tag name arrays become `Set`s of hash codes, and the
 * `implicitlyClosedTags` record becomes a `Map` from a hash code to a `Set` of hash codes. The hash function is
 * chosen by `areTagNamesCaseInsensitive` and is also exposed on the result as `readTag`.
 *
 * Tag collections that are not provided stay as they were passed (for example `undefined`).
 *
 * @param options Tokenizer options.
 * @returns The resolved options with `isFragment` set to `false`.
 */
export function resolveTokenizerOptions(options) {
  const {
    voidTags,
    rawTextTags,
    implicitlyClosedTags,
    implicitlyOpenedTags,
    areTagNamesCaseInsensitive,
    areUnbalancedStartTagsImplicitlyClosed,
    areUnbalancedEndTagsIgnored,
    isStrict,
  } = options;

  const getHashCode = areTagNamesCaseInsensitive ? getCaseInsensitiveHashCode : getCaseSensitiveHashCode;

  // Hashes a whole tag name.
  const toHashCode = (str) => getHashCode(str, 0, str.length);

  const toHashCodeSet = (tagNames) => new Set(tagNames.map(toHashCode));

  const contextualOptions = resolveContextualTokenReaderOptions(options, undefined, toHashCode, new Map());

  return Object.assign({}, contextualOptions, {
    readTag: getHashCode,
    voidTags: voidTags && toHashCodeSet(voidTags),
    rawTextTags: rawTextTags && toHashCodeSet(rawTextTags),
    implicitlyClosedTags:
      implicitlyClosedTags &&
      new Map(
        Object.entries(implicitlyClosedTags).map(([tagName, closedTagNames]) => [
          toHashCode(tagName),
          toHashCodeSet(closedTagNames),
        ])
      ),
    implicitlyOpenedTags: implicitlyOpenedTags && toHashCodeSet(implicitlyOpenedTags),
    isFragment: false,
    areUnbalancedStartTagsImplicitlyClosed,
    areUnbalancedEndTagsIgnored,
    isStrict,
  });
}

/**
 * Resolves the options that depend on the current parsing context, recursing into `foreignTags`.
 *
 * Each distinct options object is resolved at most once: the result is stored in `resolvedOptionsCache` keyed by the
 * options object and is returned as is on later requests. On a cache hit the `parentOptions` argument is ignored, so
 * the cached result keeps the parent it was first resolved with.
 *
 * @param options Options to resolve.
 * @param parentOptions Resolved options of the enclosing context, or `undefined` for the root.
 * @param toHashCode Converts a tag name to its hash code.
 * @param resolvedOptionsCache Maps already visited options objects to their resolved counterparts.
 * @returns The resolved options with `foreignTags` keyed by tag name hash codes.
 */
function resolveContextualTokenReaderOptions(options, parentOptions, toHashCode, resolvedOptionsCache) {
  const cachedOptions = resolvedOptionsCache.get(options);

  if (cachedOptions !== undefined) {
    return cachedOptions;
  }

  const {
    foreignTags,
    areSelfClosingTagsRecognized = false,
    areCDATASectionsRecognized = false,
    areProcessingInstructionsRecognized = false,
  } = options;

  const resolvedOptions = {
    foreignTags: undefined,
    parentOptions,
    areSelfClosingTagsRecognized,
    areCDATASectionsRecognized,
    areProcessingInstructionsRecognized,
  };

  // Register before recursing, so options reachable from their own foreignTags resolve to this same object instead
  // of recursing endlessly.
  resolvedOptionsCache.set(options, resolvedOptions);

  if (foreignTags === undefined) {
    return resolvedOptions;
  }

  resolvedOptions.foreignTags = new Map(
    Object.entries(foreignTags).map(([tagName, foreignOptions]) => [
      toHashCode(tagName),
      resolveContextualTokenReaderOptions(foreignOptions, resolvedOptions, toHashCode, resolvedOptionsCache),
    ])
  );

  return resolvedOptions;
}