UNPKG

tag-soup

Version:

The fastest pure JS SAX/DOM XML/HTML parser.

121 lines (120 loc) 3.33 kB
import { __assign } from "tslib";
import { createSaxParser } from './createSaxParser';
import { decodeHtml } from 'speedy-entities';

/**
 * Creates a pre-configured HTML SAX parser.
 *
 * The effective options are a shallow merge of {@link htmlParserOptions} and `options`; keys present in `options`
 * take precedence.
 *
 * @param handler The parsing handler.
 * @param options Options that override the defaults.
 * @returns Whatever `createSaxParser` returns for the given handler and merged options.
 */
export function createHtmlSaxParser(handler, options) {
  return createSaxParser(handler, __assign(__assign({}, htmlParserOptions), options));
}

/**
 * Tags reported as void by {@link checkVoidTag}.
 */
const voidTags = toSet(
  'area base basefont br col command embed frame hr img input isindex keygen link meta param source track wbr'
);

/**
 * Tags reported as CDATA tags by {@link checkCdataTag}.
 */
const cdataTags = toSet('script style textarea');

/**
 * Ancestors looked up when a form-control tag starts (shared by several `implicitEndMap` entries).
 *
 * The literal must stay on a single line with single-space separators: a raw line break inside a quoted string is a
 * syntax error, and a doubled separator would make `toSet` add an empty tag name.
 */
const formTags = toSet('input option optgroup select button datalist textarea');

/**
 * Ancestors looked up when a block-level tag starts (shared by several `implicitEndMap` entries).
 */
const pTags = toSet('p');

/**
 * Maps the name of a starting tag to the set of ancestor tag names that {@link endsAncestorAt} searches for.
 *
 * NOTE(review): all names here are lower case; this presumably relies on `renameTag` having been applied to
 * `token.name` before the lookup — confirm in `createSaxParser`.
 */
const implicitEndMap = toMap({
  tr: toSet('tr th td'),
  th: toSet('th'),
  td: toSet('thead th td'),
  body: toSet('head link script'),
  li: toSet('li'),
  option: toSet('option'),
  optgroup: toSet('optgroup option'),
  dd: toSet('dt dd'),
  dt: toSet('dt dd'),
  select: formTags,
  input: formTags,
  output: formTags,
  button: formTags,
  datalist: formTags,
  textarea: formTags,
  p: pTags,
  h1: pTags,
  h2: pTags,
  h3: pTags,
  h4: pTags,
  h5: pTags,
  h6: pTags,
  address: pTags,
  article: pTags,
  aside: pTags,
  blockquote: pTags,
  details: pTags,
  div: pTags,
  dl: pTags,
  fieldset: pTags,
  figcaption: pTags,
  figure: pTags,
  footer: pTags,
  form: pTags,
  header: pTags,
  hr: pTags,
  main: pTags,
  nav: pTags,
  ol: pTags,
  pre: pTags,
  section: pTags,
  table: pTags,
  ul: pTags,
  rt: toSet('rt rp'),
  rp: toSet('rt rp'),
  tbody: toSet('thead tbody'),
  tfoot: toSet('thead tbody'),
});

/**
 * The default HTML parser options:
 * - CDATA sections and processing instructions are treated as comments;
 * - Self-closing tags are treated as start tags;
 * - Tags like `p`, `li`, `td` and others follow implicit end rules, so `<p>foo<p>bar` is parsed as
 * `<p>foo</p><p>bar</p>`;
 * - Tag and attribute names are converted to lower case;
 * - Legacy HTML entities are decoded in text and attribute values. To decode all known HTML entities use:
 *
 * ```ts
 * import {decodeHtml} from 'speedy-entities/lib/full';
 *
 * createHtmlSaxParser({
 *   decodeText: decodeHtml,
 *   decodeAttribute: decodeHtml,
 * });
 * ```
 *
 * @see {@link https://github.com/smikhalevski/speedy-entities decodeHtml}
 */
export const htmlParserOptions = {
  decodeText: decodeHtml,
  decodeAttribute: decodeHtml,
  renameTag: toLowerCase,
  renameAttribute: toLowerCase,
  checkCdataTag: checkCdataTag,
  checkVoidTag: checkVoidTag,
  endsAncestorAt: endsAncestorAt,
};

/**
 * Converts a tag or attribute name to lower case.
 *
 * @param name The name to convert.
 * @returns The lower-cased name.
 */
function toLowerCase(name) {
  return name.toLowerCase();
}

/**
 * Checks whether the name of the token is one of the CDATA tag names (`script`, `style`, `textarea`).
 *
 * @param token The token whose `name` is tested.
 * @returns `true` if `token.name` is in `cdataTags`.
 */
function checkCdataTag(token) {
  return cdataTags.has(token.name);
}

/**
 * Checks whether the name of the token is one of the void tag names.
 *
 * @param token The token whose `name` is tested.
 * @returns `true` if `token.name` is in `voidTags`.
 */
function checkVoidTag(token) {
  return voidTags.has(token.name);
}

/**
 * Finds the ancestor that is matched by the implicit end rules of the given token.
 *
 * @param ancestors The list of ancestor tokens; each one is tested by its `name`.
 * @param token The token whose `name` selects the set of ancestor names to look for.
 * @returns The index of the matching ancestor closest to the end of `ancestors`, or -1 if `token.name` has no
 * implicit end rules or no ancestor matches.
 */
function endsAncestorAt(ancestors, token) {
  const tagNames = implicitEndMap.get(token.name);

  if (tagNames) {
    // Scan from the end so that the last matching ancestor in the list wins.
    for (let i = ancestors.length - 1; i >= 0; --i) {
      if (tagNames.has(ancestors[i].name)) {
        return i;
      }
    }
  }
  return -1;
}

/**
 * Creates a set from a string of single-space-separated values.
 *
 * @param data The values separated by exactly one space character.
 * @returns The set of values.
 */
function toSet(data) {
  return new Set(data.split(' '));
}

/**
 * Creates a map from the own enumerable string-keyed properties of an object.
 *
 * @param rec The object to convert.
 * @returns The map with the same keys and values as `rec`.
 */
function toMap(rec) {
  return new Map(Object.entries(rec));
}