tag-soup
Version:
The fastest pure JS SAX/DOM XML/HTML parser.
90 lines (89 loc) • 3.4 kB
JavaScript
import { resolveTokenizerOptions } from './createTokenizer.js';
import { ParserError, tokenizeMarkup } from './tokenizeMarkup.js';
import { CDATASection, Comment, Document, DocumentFragment, DocumentType, Element, ProcessingInstruction, Text, } from 'flyweight-dom';
/**
* Parses text as a DOM.
*
* @example
* import { createDOMParser, htmlTokenizerOptions } from 'tag-soup';
*
* const parser = createDOMParser(htmlTokenizerOptions);
*
* parser.parseFragment('Hello, <b>Bob</b>!');
* // ⮕ DocumentFragment
*
* @param options Parser options.
* @group DOM
*/
export function createDOMParser(options = {}) {
const { decodeText } = options;
const documentOptions = Object.assign(Object.assign({}, resolveTokenizerOptions(options)), { decodeText });
const fragmentOptions = Object.assign(Object.assign({}, documentOptions), { isFragment: true });
return {
parseDocument(input) {
return parseDOM(input, documentOptions);
},
parseFragment(input) {
return parseDOM(input, fragmentOptions);
},
};
}
/**
* Parses text as a DOM.
*
* @param input The text to parse.
* @param options Parser options.
* @returns The document or document fragment node.
*/
export function parseDOM(input, options = {}) {
const { isStrict, isFragment, decodeText = identity } = options;
const root = isFragment ? new DocumentFragment() : new Document();
let parent = root;
let attributeName;
let piTarget;
const tokenCallback = (token, startIndex, endIndex) => {
switch (token) {
case 'TEXT':
parent.appendChild(new Text(decodeText(input.substring(startIndex, endIndex))));
break;
case 'START_TAG_NAME':
parent.appendChild((parent = new Element(input.substring(startIndex, endIndex))));
break;
case 'START_TAG_CLOSING':
break;
case 'START_TAG_SELF_CLOSING':
case 'END_TAG_NAME':
parent = parent.parentNode;
break;
case 'ATTRIBUTE_NAME':
attributeName = input.substring(startIndex, endIndex);
break;
case 'ATTRIBUTE_VALUE':
parent.setAttribute(attributeName, decodeText(input.substring(startIndex, endIndex)));
break;
case 'CDATA_SECTION':
parent.appendChild(new CDATASection(input.substring(startIndex, endIndex)));
break;
case 'COMMENT':
parent.appendChild(new Comment(decodeText(input.substring(startIndex, endIndex))));
break;
case 'DOCTYPE_NAME':
parent.appendChild(new DocumentType(input.substring(startIndex, endIndex)));
break;
case 'PROCESSING_INSTRUCTION_TARGET':
piTarget = input.substring(startIndex, endIndex);
break;
case 'PROCESSING_INSTRUCTION_DATA':
parent.appendChild(new ProcessingInstruction(piTarget, input.substring(startIndex, endIndex)));
break;
}
};
tokenizeMarkup(input, tokenCallback, options);
if (isStrict && parent !== root) {
throw new ParserError('Expected an end tag.', input, input.length);
}
return root;
}
function identity(value) {
return value;
}