tag-soup
Version:
The fastest pure JS SAX/DOM XML/HTML parser.
142 lines (141 loc) • 5.59 kB
JavaScript
import { resolveTokenizerOptions } from './createTokenizer.js';
import { tokenizeMarkup } from './tokenizeMarkup.js';
/**
 * Creates a SAX parser that reports markup tokens to handler callbacks.
 *
 * The tokenizer options are resolved once, when the parser is created, and are then shared by every
 * `parseDocument` and `parseFragment` call. Fragment parsing uses the same options with `isFragment` enabled.
 *
 * @example
 * import { createSAXParser, htmlTokenizerOptions } from 'tag-soup';
 *
 * const parser = createSAXParser(htmlTokenizerOptions);
 *
 * parser.parseFragment('Hello, <b>Bob</b>!', {
 *   onStartTagOpening(tagName) {
 *     // Handle <b> tag
 *   },
 * });
 *
 * @param options Parser options. `decodeText` is carried over as-is next to the resolved tokenizer options.
 * @returns An object with the `parseDocument(input, handler)` and `parseFragment(input, handler)` methods.
 * @group SAX
 */
export function createSAXParser(options = {}) {
  const textDecoder = options.decodeText;

  // The decoder is applied after the resolved options so that it always takes precedence.
  const documentOptions = Object.assign({}, resolveTokenizerOptions(options), { decodeText: textDecoder });
  const fragmentOptions = Object.assign({}, documentOptions, { isFragment: true });

  return {
    parseDocument: (input, handler) => parseSAX(input, handler, documentOptions),
    parseFragment: (input, handler) => parseSAX(input, handler, fragmentOptions),
  };
}
/**
 * Parses text as a stream of tokens and reports them to the callbacks of the handler.
 *
 * Every handler callback is optional. Substrings of the input are only extracted for tokens that have a
 * callback (or that are needed to build the attributes object passed to `onStartTag`).
 *
 * @param input The text to parse.
 * @param handler The token handler.
 * @param options Parser options. `decodeText` is applied to text and attribute values and defaults to the
 * identity function; the whole options object is forwarded to `tokenizeMarkup`.
 * @returns Nothing. All results are delivered through the handler callbacks.
 */
export function parseSAX(input, handler, options = {}) {
  const { decodeText = identity } = options;

  // Names of the tags that were opened and not yet closed. Names are pushed only when `onStartTag` or
  // `onEndTag` is present, because these are the only callbacks that read them.
  const tagNameStack = ['', '', '', '', '', '', '', '', ''];
  let tagNameStackCursor = -1;

  // Attribute names and processing instruction targets arrive as separate tokens before their value or data,
  // so their positions are remembered until the follow-up token is received.
  let attributeNameStartIndex = 0;
  let attributeNameEndIndex = 0;
  let piTargetStartIndex = 0;
  let piTargetEndIndex = 0;

  // Attributes collected for the start tag that is being read; defined only while `onStartTag` needs them.
  let attributes;

  const tokenCallback = (token, startIndex, endIndex) => {
    switch (token) {
      case 'TEXT':
        if (handler.onText !== undefined) {
          handler.onText(decodeText(input.substring(startIndex, endIndex)));
        }
        break;

      case 'START_TAG_NAME': {
        let tagName;

        if (handler.onStartTagOpening !== undefined) {
          tagName = input.substring(startIndex, endIndex);
          handler.onStartTagOpening(tagName);
        }
        if (handler.onEndTag !== undefined || handler.onStartTag !== undefined) {
          // Reuse the substring extracted above, if any.
          tagNameStack[++tagNameStackCursor] = tagName !== undefined ? tagName : input.substring(startIndex, endIndex);
        }
        if (handler.onStartTag !== undefined) {
          attributes = {};
        }
        break;
      }

      case 'START_TAG_CLOSING':
        if (handler.onStartTagClosing !== undefined) {
          handler.onStartTagClosing();
        }
        if (handler.onStartTag !== undefined) {
          handler.onStartTag(tagNameStack[tagNameStackCursor], attributes, false);
          attributes = undefined;
        }
        break;

      case 'START_TAG_SELF_CLOSING':
        if (handler.onStartTagSelfClosing !== undefined) {
          handler.onStartTagSelfClosing();
        }
        if (handler.onStartTag !== undefined) {
          handler.onStartTag(tagNameStack[tagNameStackCursor], attributes, true);
          attributes = undefined;
        }
        // A self-closing tag never receives an end tag token, so its name is released right away. The
        // condition mirrors the push in START_TAG_NAME to prevent the cursor from drifting below the stack.
        if (handler.onEndTag !== undefined || handler.onStartTag !== undefined) {
          --tagNameStackCursor;
        }
        break;

      case 'END_TAG_NAME':
        if (handler.onEndTag !== undefined) {
          handler.onEndTag(tagNameStack[tagNameStackCursor--]);
        } else if (handler.onStartTag !== undefined && tagNameStackCursor > -1) {
          // The name was pushed for the sake of `onStartTag` only. Discard it, otherwise the stack would grow
          // with every tag in the input.
          --tagNameStackCursor;
        }
        break;

      case 'ATTRIBUTE_NAME':
        attributeNameStartIndex = startIndex;
        attributeNameEndIndex = endIndex;
        break;

      case 'ATTRIBUTE_VALUE': {
        if (attributes === undefined && handler.onAttribute === undefined) {
          // Nobody consumes attributes, skip the substring extraction and decoding.
          break;
        }

        const attributeName = input.substring(attributeNameStartIndex, attributeNameEndIndex);
        const attributeValue = decodeText(input.substring(startIndex, endIndex));

        if (attributes !== undefined) {
          attributes[attributeName] = attributeValue;
        }
        if (handler.onAttribute !== undefined) {
          handler.onAttribute(attributeName, attributeValue);
        }
        break;
      }

      case 'CDATA_SECTION':
        if (handler.onCDATASection !== undefined) {
          handler.onCDATASection(input.substring(startIndex, endIndex));
        }
        break;

      case 'COMMENT':
        if (handler.onComment !== undefined) {
          handler.onComment(input.substring(startIndex, endIndex));
        }
        break;

      case 'DOCTYPE_NAME':
        if (handler.onDoctype !== undefined) {
          handler.onDoctype(input.substring(startIndex, endIndex));
        }
        break;

      case 'PROCESSING_INSTRUCTION_TARGET':
        piTargetStartIndex = startIndex;
        piTargetEndIndex = endIndex;
        break;

      case 'PROCESSING_INSTRUCTION_DATA':
        if (handler.onProcessingInstruction !== undefined) {
          handler.onProcessingInstruction(
            input.substring(piTargetStartIndex, piTargetEndIndex),
            input.substring(startIndex, endIndex)
          );
        }
        break;
    }
  };

  tokenizeMarkup(input, tokenCallback, options);
}
/**
 * Returns the given value as is. Serves as the default `decodeText` implementation, so that text is left
 * untouched when no decoder is provided.
 *
 * @param value The value to return.
 * @returns The same value that was passed in.
 */
function identity(value) {
  return value;
}