/*
 * tag-soup
 * Version:
 * The fastest pure JS SAX/DOM XML/HTML parser.
 * 70 lines (69 loc) • 3.35 kB
 * JavaScript
 */
import { getCaseInsensitiveHashCode, getCaseSensitiveHashCode, tokenizeMarkup, } from './tokenizeMarkup.js';
/**
 * Creates a tokenizer that reads tokens from text and returns them by invoking a callback.
 *
 * The options are resolved once, when the tokenizer is created; both methods of the returned object reuse the
 * resolved options on every call.
 *
 * Tokens are _guaranteed_ to be returned in correct order. Missing tokens are inserted to restore the correct order if
 * needed.
 *
 * @example
 * import { createTokenizer, htmlTokenizerOptions } from 'tag-soup';
 *
 * const tokenizer = createTokenizer(htmlTokenizerOptions);
 *
 * tokenizer.tokenizeFragment(
 *   'Hello, <b>Bob</b>!',
 *   (token, startIndex, endIndex) => {
 *     // Handle token here
 *   },
 * );
 *
 * @param options Tokenizer options. Defaults to an empty object.
 * @returns An object with two methods, `tokenizeDocument(text, callback)` and `tokenizeFragment(text, callback)`.
 * Both forward to {@link tokenizeMarkup} and return its result; `tokenizeFragment` passes the same resolved options
 * with `isFragment` set to `true`.
 * @group Tokenizer
 */
export function createTokenizer(options = {}) {
  // Resolve the human-readable options once so that tokenize calls don't repeat the work.
  const documentOptions = resolveTokenizerOptions(options);

  // A separate object is created for fragments, documentOptions is left untouched.
  const fragmentOptions = Object.assign({}, documentOptions, { isFragment: true });

  return {
    tokenizeDocument(text, callback) {
      return tokenizeMarkup(text, callback, documentOptions);
    },

    tokenizeFragment(text, callback) {
      return tokenizeMarkup(text, callback, fragmentOptions);
    },
  };
}
/**
 * Converts human-readable tokenizer options into options consumed by {@link tokenizeMarkup}.
 *
 * Tag names are replaced with their hash codes: the case-insensitive hash function is used if
 * `areTagNamesCaseInsensitive` is truthy, and the case-sensitive one otherwise. The selected hash function is also
 * exposed as `readTag` in the result.
 *
 * Tag lists that are omitted (falsy) are passed through as is, no empty collections are created for them.
 *
 * @param options Human-readable tokenizer options. Defaults to an empty object.
 * @returns A new options object in which `voidTags`, `rawTextTags` and `implicitlyOpenedTags` are sets of hash codes,
 * `implicitlyClosedTags` is a map from a hash code to a set of hash codes, and `isFragment` is `false`.
 */
export function resolveTokenizerOptions(options = {}) {
  const {
    voidTags,
    rawTextTags,
    implicitlyClosedTags,
    implicitlyOpenedTags,
    areTagNamesCaseInsensitive,
    areUnbalancedStartTagsImplicitlyClosed,
    areUnbalancedEndTagsIgnored,
    isStrict,
  } = options;

  const getHashCode = areTagNamesCaseInsensitive ? getCaseInsensitiveHashCode : getCaseSensitiveHashCode;

  // Hashes the whole tag name.
  const toHashCode = (str) => getHashCode(str, 0, str.length);

  // An omitted (falsy) list is returned as is.
  const toHashCodeSet = (tagNames) => tagNames && new Set(tagNames.map(toHashCode));

  const contextualOptions = resolveContextualTokenReaderOptions(options, undefined, toHashCode, new Map());

  // Shallow copy: the object returned by resolveContextualTokenReaderOptions is not mutated.
  return Object.assign({}, contextualOptions, {
    readTag: getHashCode,
    voidTags: toHashCodeSet(voidTags),
    rawTextTags: toHashCodeSet(rawTextTags),
    implicitlyClosedTags:
      implicitlyClosedTags &&
      new Map(
        Object.entries(implicitlyClosedTags).map(([tagName, closedTagNames]) => [
          toHashCode(tagName),
          new Set(closedTagNames.map(toHashCode)),
        ])
      ),
    implicitlyOpenedTags: toHashCodeSet(implicitlyOpenedTags),
    isFragment: false,
    areUnbalancedStartTagsImplicitlyClosed,
    areUnbalancedEndTagsIgnored,
    isStrict,
  });
}
/**
 * Recursively resolves the options that can differ between the markup and its foreign tags.
 *
 * Each distinct options object is resolved at most once per cache. The resolved object is put into the cache
 * _before_ its foreign tags are processed, so options objects that reference each other through `foreignTags`
 * resolve to the same objects instead of recursing forever.
 *
 * @param options Options to resolve. The flags that are omitted default to `false`.
 * @param parentOptions Resolved options of the enclosing context, stored as `parentOptions` of the result. Ignored
 * if `options` are already in the cache.
 * @param toHashCode Converts a tag name to the key used in the resolved `foreignTags` map.
 * @param resolvedOptionsCache Map from an options object to its resolved counterpart.
 * @returns The resolved options, where `foreignTags` is either `undefined` or a map from a hashed tag name to the
 * resolved options of that foreign tag.
 */
function resolveContextualTokenReaderOptions(options, parentOptions, toHashCode, resolvedOptionsCache) {
  const cachedOptions = resolvedOptionsCache.get(options);

  if (cachedOptions !== undefined) {
    return cachedOptions;
  }

  const {
    foreignTags,
    areSelfClosingTagsRecognized = false,
    areCDATASectionsRecognized = false,
    areProcessingInstructionsRecognized = false,
  } = options;

  const contextOptions = {
    foreignTags: undefined,
    parentOptions,
    areSelfClosingTagsRecognized,
    areCDATASectionsRecognized,
    areProcessingInstructionsRecognized,
  };

  // Must precede the recursion below, otherwise cyclic foreign tags would never terminate.
  resolvedOptionsCache.set(options, contextOptions);

  if (foreignTags !== undefined) {
    const foreignTagOptions = new Map();

    for (const [tagName, tagOptions] of Object.entries(foreignTags)) {
      const hashCode = toHashCode(tagName);

      foreignTagOptions.set(
        hashCode,
        resolveContextualTokenReaderOptions(tagOptions, contextOptions, toHashCode, resolvedOptionsCache)
      );
    }

    // Assigned only after all foreign tags are resolved.
    contextOptions.foreignTags = foreignTagOptions;
  }

  return contextOptions;
}