UNPKG

tag-soup

Version:

The fastest pure JS SAX/DOM XML/HTML parser.

142 lines (141 loc) 5.59 kB
import { resolveTokenizerOptions } from './createTokenizer.js';
import { tokenizeMarkup } from './tokenizeMarkup.js';

/**
 * Creates a SAX parser that reports markup tokens to a handler.
 *
 * @example
 * import { createSAXParser, htmlTokenizerOptions } from 'tag-soup';
 *
 * const parser = createSAXParser(htmlTokenizerOptions);
 *
 * parser.parseFragment('Hello, <b>Bob</b>!', {
 *   onStartTagOpening(tagName) {
 *     // Handle <b> tag
 *   },
 * });
 *
 * @param options Parser options.
 * @returns An object with `parseDocument(input, handler)` and `parseFragment(input, handler)` methods.
 * @group SAX
 */
export function createSAXParser(options = {}) {
  const { decodeText } = options;

  // Tokenizer options are resolved once here and reused by every parse call.
  const documentOptions = Object.assign({}, resolveTokenizerOptions(options), { decodeText });
  const fragmentOptions = Object.assign({}, documentOptions, { isFragment: true });

  return {
    parseDocument(input, handler) {
      return parseSAX(input, handler, documentOptions);
    },
    parseFragment(input, handler) {
      return parseSAX(input, handler, fragmentOptions);
    },
  };
}

/**
 * Parses text as a stream of tokens and forwards them to the handler.
 *
 * Every handler callback is optional. The callbacks invoked by this function are:
 * `onText(text)`, `onStartTagOpening(tagName)`, `onStartTagClosing()`, `onStartTagSelfClosing()`,
 * `onStartTag(tagName, attributes, isSelfClosing)`, `onEndTag(tagName)`, `onAttribute(name, value)`,
 * `onCDATASection(data)`, `onComment(data)`, `onDoctype(name)` and `onProcessingInstruction(target, data)`.
 *
 * Text and attribute values are passed through `options.decodeText` (identity by default) before they are
 * reported. This function does not return a value.
 *
 * @param input The text to parse.
 * @param handler The token handler.
 * @param options Parser options.
 */
export function parseSAX(input, handler, options = {}) {
  const { decodeText = identity } = options;

  // Names of the currently opened tags; only maintained when onStartTag or onEndTag needs them.
  const tagNameStack = ['', '', '', '', '', '', '', '', ''];
  let tagNameStackCursor = -1;

  // Range of the most recently seen attribute name, consumed by the following ATTRIBUTE_VALUE token.
  let attributeNameStartIndex = 0;
  let attributeNameEndIndex = 0;

  // Range of the most recently seen processing instruction target, consumed by the following data token.
  let piTargetStartIndex = 0;
  let piTargetEndIndex = 0;

  // Attributes collected for the start tag that is being read; undefined unless onStartTag is provided.
  let attributes;

  // The stack is pushed and popped under the same condition so it stays balanced for any handler shape.
  const isTagNameStackUsed = () => handler.onEndTag !== undefined || handler.onStartTag !== undefined;

  const tokenCallback = (token, startIndex, endIndex) => {
    switch (token) {
      case 'TEXT':
        if (handler.onText !== undefined) {
          handler.onText(decodeText(input.substring(startIndex, endIndex)));
        }
        break;

      case 'START_TAG_NAME': {
        let tagName;

        if (handler.onStartTagOpening !== undefined) {
          tagName = input.substring(startIndex, endIndex);
          handler.onStartTagOpening(tagName);
        }
        if (isTagNameStackUsed()) {
          // Reuse the substring that was already extracted for onStartTagOpening, if any.
          tagNameStack[++tagNameStackCursor] = tagName !== undefined ? tagName : input.substring(startIndex, endIndex);
        }
        if (handler.onStartTag !== undefined) {
          attributes = {};
        }
        break;
      }

      case 'START_TAG_CLOSING':
        if (handler.onStartTagClosing !== undefined) {
          handler.onStartTagClosing();
        }
        if (handler.onStartTag !== undefined) {
          handler.onStartTag(tagNameStack[tagNameStackCursor], attributes, false);
          attributes = undefined;
        }
        break;

      case 'START_TAG_SELF_CLOSING':
        if (handler.onStartTagSelfClosing !== undefined) {
          handler.onStartTagSelfClosing();
        }
        if (handler.onStartTag !== undefined) {
          handler.onStartTag(tagNameStack[tagNameStackCursor], attributes, true);
          attributes = undefined;
        }
        // A self-closing tag gets no END_TAG_NAME handling here, so its name is dropped right away.
        if (isTagNameStackUsed()) {
          --tagNameStackCursor;
        }
        break;

      case 'END_TAG_NAME':
        if (handler.onEndTag !== undefined) {
          handler.onEndTag(tagNameStack[tagNameStackCursor--]);
        } else if (handler.onStartTag !== undefined) {
          // The name was pushed for onStartTag; pop it so the stack tracks nesting depth
          // instead of growing with every tag in the input.
          --tagNameStackCursor;
        }
        break;

      case 'ATTRIBUTE_NAME':
        attributeNameStartIndex = startIndex;
        attributeNameEndIndex = endIndex;
        break;

      case 'ATTRIBUTE_VALUE': {
        if (attributes === undefined && handler.onAttribute === undefined) {
          // Nobody consumes attributes, so skip substring extraction and decoding.
          break;
        }

        const attributeName = input.substring(attributeNameStartIndex, attributeNameEndIndex);
        const attributeValue = decodeText(input.substring(startIndex, endIndex));

        if (attributes !== undefined) {
          attributes[attributeName] = attributeValue;
        }
        if (handler.onAttribute !== undefined) {
          handler.onAttribute(attributeName, attributeValue);
        }
        break;
      }

      case 'CDATA_SECTION':
        if (handler.onCDATASection !== undefined) {
          handler.onCDATASection(input.substring(startIndex, endIndex));
        }
        break;

      case 'COMMENT':
        if (handler.onComment !== undefined) {
          handler.onComment(input.substring(startIndex, endIndex));
        }
        break;

      case 'DOCTYPE_NAME':
        if (handler.onDoctype !== undefined) {
          handler.onDoctype(input.substring(startIndex, endIndex));
        }
        break;

      case 'PROCESSING_INSTRUCTION_TARGET':
        piTargetStartIndex = startIndex;
        piTargetEndIndex = endIndex;
        break;

      case 'PROCESSING_INSTRUCTION_DATA':
        if (handler.onProcessingInstruction !== undefined) {
          handler.onProcessingInstruction(
            input.substring(piTargetStartIndex, piTargetEndIndex),
            input.substring(startIndex, endIndex)
          );
        }
        break;
    }
  };

  tokenizeMarkup(input, tokenCallback, options);
}

/**
 * Returns the value as is; the default `decodeText` implementation.
 */
function identity(value) {
  return value;
}