tag-soup
Version:
The fastest pure JS SAX/DOM XML/HTML parser.
121 lines (120 loc) • 3.33 kB
JavaScript
import { __assign } from "tslib";
import { createSaxParser } from './createSaxParser';
import { decodeHtml } from 'speedy-entities';
/**
 * Creates a SAX parser that is pre-configured for HTML.
 *
 * The effective options are {@link htmlParserOptions} overlaid with the given `options`, so any key present in
 * `options` replaces the corresponding HTML default.
 *
 * @param handler The parsing handler.
 * @param options Options that override the defaults.
 * @returns The value returned by `createSaxParser` for the handler and the merged options.
 */
export function createHtmlSaxParser(handler, options) {
  // Sources are applied left to right: defaults first, then the caller's overrides. A fresh object is used as the
  // target so that neither `htmlParserOptions` nor `options` is mutated.
  var mergedOptions = __assign({}, htmlParserOptions, options);
  return createSaxParser(handler, mergedOptions);
}
/**
* The default HTML parser options:
* - CDATA sections and processing instructions are treated as comments;
* - Self-closing tags are treated as start tags;
* - Tags like `p`, `li`, `td` and others follow implicit end rules, so `<p>foo<p>bar` is parsed as
* `<p>foo</p><p>bar</p>`;
* - Tag and attribute names are converted to lower case;
* - Legacy HTML entities are decoded in text and attribute values. To decode all known HTML entities use:
*
* ```ts
* import {decodeHtml} from 'speedy-entities/lib/full';
*
* createHtmlSaxParser(handler, {
* decodeText: decodeHtml,
* decodeAttribute: decodeHtml,
* });
* ```
*
* @see {@link https://github.com/smikhalevski/speedy-entities decodeHtml}
*/
export var htmlParserOptions = {
  // Text and attribute values are both decoded with `decodeHtml` imported from speedy-entities.
  decodeText: decodeHtml,
  decodeAttribute: decodeHtml,
  // Tag and attribute names are both converted to lower case.
  renameTag: toLowerCase,
  renameAttribute: toLowerCase,
  // The remaining callbacks are module-level functions that share their names with the option keys.
  checkCdataTag,
  checkVoidTag,
  endsAncestorAt,
};
/**
 * Converts a name to lower case.
 *
 * @param name The name to convert.
 * @returns The lower-cased name.
 */
function toLowerCase(name) {
  var lowerCasedName = name.toLowerCase();
  return lowerCasedName;
}
/**
 * Checks whether the name of the token is listed in `cdataTags`.
 *
 * @param token The token to check; only its `name` is read.
 * @returns `true` if `cdataTags` contains the token name, `false` otherwise.
 */
function checkCdataTag(token) {
  var tagName = token.name;
  return cdataTags.has(tagName);
}
/**
 * Checks whether the name of the token is listed in `voidTags`.
 *
 * @param token The token to check; only its `name` is read.
 * @returns `true` if `voidTags` contains the token name, `false` otherwise.
 */
function checkVoidTag(token) {
  var tagName = token.name;
  return voidTags.has(tagName);
}
/**
 * Finds the ancestor that is implicitly ended by the given token.
 *
 * The token name is looked up in `implicitEndMap`. If there is an entry, the ancestors are scanned from the last
 * one to the first, and the index of the first ancestor whose name is in the mapped set is returned.
 *
 * @param ancestors The list of ancestor tokens; only `length` and the `name` of each element are read.
 * @param token The token to check; only its `name` is read.
 * @returns The index in `ancestors` of the matched ancestor, or -1 if the token name has no entry in
 * `implicitEndMap` or no ancestor matches.
 */
function endsAncestorAt(ancestors, token) {
  var endedNames = implicitEndMap.get(token.name);

  // Tags without an implicit end rule never end an ancestor.
  if (!endedNames) {
    return -1;
  }

  // Scan backwards so that the nearest matching ancestor wins.
  for (var index = ancestors.length - 1; index >= 0; index--) {
    var ancestor = ancestors[index];

    if (endedNames.has(ancestor.name)) {
      return index;
    }
  }
  return -1;
}
// Tag names for which checkVoidTag returns true.
var voidTags = toSet('area base basefont br col command embed frame hr img input isindex keygen link meta param source track wbr');
// Tag names for which checkCdataTag returns true.
var cdataTags = toSet('script style textarea');
// Form-related tag names; a single set instance that is shared by several keys of implicitEndMap.
var formTags = toSet('input option optgroup select button datalist textarea');
// A set that contains only `p`; a single set instance that is shared by many keys of implicitEndMap.
var pTags = toSet('p');
/**
 * The implicit end rules that are consulted by endsAncestorAt.
 *
 * Each key is the name of an incoming tag, and each value is the set of ancestor tag names that endsAncestorAt
 * searches for when a token with that name is processed. For example, the entry `p: pTags` is what makes
 * endsAncestorAt return the index of an open `p` ancestor when another `p` token is checked.
 *
 * Tag names that have no entry here never match, and endsAncestorAt returns -1 for them.
 */
var implicitEndMap = toMap({
// Table rows and cells.
tr: toSet('tr th td'),
th: toSet('th'),
td: toSet('thead th td'),
body: toSet('head link script'),
// List items, options and definition list entries.
li: toSet('li'),
option: toSet('option'),
optgroup: toSet('optgroup option'),
dd: toSet('dt dd'),
dt: toSet('dt dd'),
// Form controls; all of these keys share the same formTags set.
select: formTags,
input: formTags,
output: formTags,
button: formTags,
datalist: formTags,
textarea: formTags,
// Tags that match an open `p` ancestor; all of these keys share the same pTags set.
p: pTags,
h1: pTags,
h2: pTags,
h3: pTags,
h4: pTags,
h5: pTags,
h6: pTags,
address: pTags,
article: pTags,
aside: pTags,
blockquote: pTags,
details: pTags,
div: pTags,
dl: pTags,
fieldset: pTags,
figcaption: pTags,
figure: pTags,
footer: pTags,
form: pTags,
header: pTags,
hr: pTags,
main: pTags,
nav: pTags,
ol: pTags,
pre: pTags,
section: pTags,
table: pTags,
ul: pTags,
// Ruby annotations.
rt: toSet('rt rp'),
rp: toSet('rt rp'),
// Table sections.
tbody: toSet('thead tbody'),
tfoot: toSet('thead tbody'),
});
/**
 * Builds a set of names from a whitespace-separated list.
 *
 * Names are the maximal runs of non-whitespace characters in `data`. Leading, trailing and repeated whitespace is
 * ignored, so an empty or blank string produces an empty set instead of a set that contains an empty name.
 *
 * @param data The whitespace-separated list of names.
 * @returns The set of names, in the order of their first occurrence.
 */
function toSet(data) {
  // `match` returns null when there are no names at all, which is treated as an empty list.
  return new Set(data.match(/\S+/g) || []);
}
/**
 * Converts a plain object to a map.
 *
 * @param rec The object whose own enumerable string-keyed properties become the map entries.
 * @returns A new map with one entry per property, in the property order of the object.
 */
function toMap(rec) {
  var map = new Map();

  Object.keys(rec).forEach(function (key) {
    map.set(key, rec[key]);
  });
  return map;
}