/*
 * tag-soup
 * The fastest pure JS SAX/DOM XML/HTML parser.
 */
import { all, char, seq, text, until } from 'tokenizer-dsl';
// https://www.w3.org/TR/xml/#NT-S
var isSpaceChar = function (charCode) {
return charCode === 32 /* ' ' */
|| charCode === 9 /* '\t' */
|| charCode === 13 /* '\r' */
|| charCode === 10 /* '\n' */;
};
// https://www.w3.org/TR/xml/#NT-NameStartChar
var isTagNameStartChar = function (charCode) {
return charCode >= 97 /* 'a' */ && charCode <= 122 /* 'z' */
|| charCode >= 65 /* 'A' */ && charCode <= 90 /* 'Z' */
|| charCode === 95 /* '_' */
|| charCode === 58 /* ':' */
|| charCode >= 0xc0 && charCode <= 0xd6
|| charCode >= 0xd8 && charCode <= 0xf6
|| charCode >= 0xf8 && charCode <= 0x2ff
|| charCode >= 0x370 && charCode <= 0x37d
|| charCode >= 0x37f && charCode <= 0x1fff
|| charCode >= 0x200c && charCode <= 0x200d
|| charCode >= 0x2070 && charCode <= 0x218f
|| charCode >= 0x2c00 && charCode <= 0x2fef
|| charCode >= 0x3001 && charCode <= 0xd7ff
|| charCode >= 0xf900 && charCode <= 0xfdcf
|| charCode >= 0xfdf0 && charCode <= 0xfffd
|| charCode >= 0x10000 && charCode <= 0xeffff;
};
/**
* Check if char should be treated as a whitespace inside a tag.
*/
var isTagSpaceChar = function (charCode) {
// isSpaceChar(charCode)
return charCode === 32 /* ' ' */
|| charCode === 9 /* '\t' */
|| charCode === 13 /* '\r' */
|| charCode === 10 /* '\n' */
//
|| charCode === 47 /* '/' */;
};
var isNotTagNameChar = function (charCode) {
// isSpaceChar(charCode)
return charCode === 32 /* ' ' */
|| charCode === 9 /* '\t' */
|| charCode === 13 /* '\r' */
|| charCode === 10 /* '\n' */
//
|| charCode === 47 /* '/' */
|| charCode === 62 /* '>' */;
};
var isNotAttributeNameChar = function (charCode) {
// isSpaceChar(charCode)
return charCode === 32 /* ' ' */
|| charCode === 9 /* '\t' */
|| charCode === 13 /* '\r' */
|| charCode === 10 /* '\n' */
//
|| charCode === 47 /* '/' */
|| charCode === 62 /* '>' */
|| charCode === 61 /* '=' */;
};
var isNotUnquotedValueChar = function (charCode) {
//isSpaceChar(charCode)
return charCode === 32 /* ' ' */
|| charCode === 9 /* '\t' */
|| charCode === 13 /* '\r' */
|| charCode === 10 /* '\n' */
//
|| charCode === 62 /* '>' */;
};
var takeText = until(text('<'));
var takeUntilGt = until(text('>'), { inclusive: true });
var takeTagNameStartChar = char(isTagNameStartChar);
var takeTagNameChars = until(char(isNotTagNameChar), { openEnded: true, endOffset: 1 });
// <…
var takeStartTagOpening = seq(text('<'), takeTagNameStartChar, takeTagNameChars);
// </…
var takeEndTagOpening = seq(text('</'), takeTagNameStartChar, takeTagNameChars);
var takeAttributeName = until(char(isNotAttributeNameChar), { openEnded: true });
var takeTagSpace = all(char(isTagSpaceChar));
var takeSpace = all(char(isSpaceChar));
// =
var takeEq = seq(takeSpace, text('='), takeSpace);
// "…"
var takeQuotValue = seq(text('"'), until(text('"'), { inclusive: true, openEnded: true, endOffset: 1 }));
// '…'
var takeAposValue = seq(text('\''), until(text('\''), { inclusive: true, openEnded: true, endOffset: 1 }));
// okay
var takeUnquotedValue = until(char(isNotUnquotedValueChar), { openEnded: true });
// <!-- … -->
var takeComment = seq(text('<!--'), until(text('-->'), { inclusive: true, openEnded: true, endOffset: 3 }));
// <! … >
var takeDtd = seq(text('<!'), until(text('>'), { inclusive: true, openEnded: true, endOffset: 1 }));
// <? … ?>
var takeProcessingInstruction = seq(text('<?'), until(text('?>'), { inclusive: true, openEnded: true, endOffset: 2 }));
// <![CDATA[ … ]]>
var takeCdata = seq(text('<![CDATA['), until(text(']]>'), { inclusive: true, openEnded: true, endOffset: 3 }));
// <!DOCTYPE … >
var takeDoctype = seq(text('<!DOCTYPE', { caseInsensitive: true }), until(text('>'), { inclusive: true, openEnded: true, endOffset: 1 }));
/**
* Reads attributes from the source.
*
* @param chunk The string to read attributes from.
* @param index The index in `chunk` from which to start reading.
* @param chunkOffset The offset of the `chunk` in scope of the whole input.
* @param attributes An array-like object to which {@link IAttributeToken} objects are added.
* @param options Tokenization options.
* @param parserOptions Parsing options.
* @returns The index in `chunk` at which reading was completed.
*/
export function tokenizeAttributes(chunk, index, chunkOffset, attributes, options, parserOptions) {
var attributeTokenPool = options.attributeTokenPool;
var decodeAttribute = parserOptions.decodeAttribute, renameAttribute = parserOptions.renameAttribute;
var charCount = chunk.length;
var attributeCount = 0;
while (index < charCount) {
var k = takeTagSpace(chunk, index);
var j = takeAttributeName(chunk, k);
// No attributes are available
if (j === k) {
break;
}
var token = attributes[attributeCount] = attributeTokenPool.take();
var rawName = chunk.substring(k, j);
token.rawName = rawName;
token.name = renameAttribute != null ? renameAttribute(rawName) : rawName;
token.nameStart = token.start = chunkOffset + k;
token.nameEnd = chunkOffset + j;
k = j;
j = takeEq(chunk, k);
var rawValue = void 0;
var value = void 0;
var valueStart = -1;
var valueEnd = -1;
var quoted = false;
// Equals sign presents, so there may be a value
if (j !== -1 /* NO_MATCH */) {
k = j;
rawValue = value = null;
// Quoted value
j = takeQuotValue(chunk, k);
if (j === -1 /* NO_MATCH */) {
j = takeAposValue(chunk, k);
}
if (j !== -1 /* NO_MATCH */) {
valueStart = k + 1;
valueEnd = j - 1;
quoted = true;
k = Math.min(j, charCount);
}
else {
// Unquoted value
j = takeUnquotedValue(chunk, k);
if (j !== k) {
valueStart = k;
valueEnd = j;
k = j;
}
}
if (valueStart !== -1) {
rawValue = chunk.substring(valueStart, valueEnd);
value = decodeAttribute != null ? decodeAttribute(rawValue) : rawValue;
valueStart += chunkOffset;
valueEnd += chunkOffset;
}
}
token.rawValue = rawValue;
token.value = value;
token.valueStart = valueStart;
token.valueEnd = valueEnd;
token.quoted = quoted;
token.end = chunkOffset + k;
++attributeCount;
index = k;
}
// Clean up array-like object
for (var i = attributeCount; i < attributes.length; ++i) {
attributes[i] = undefined;
}
attributes.length = attributeCount;
return index;
}
/**
* Reads markup tokens from the string.
*
* **Note:** Tokenizer doesn't return allocated tokens back to pools.
*
* @param chunk The chunk of the input to read tokens from.
* @param streaming If set to `true` then tokenizer stops when an ambiguous char sequence is met.
* @param chunkOffset The offset of the `chunk` in scope of the whole input.
* @param options Tokenization options.
* @param parserOptions Parsing options.
* @param handler SAX handler that is notified about parsed tokens.
* @returns The index in `chunk` right after the last parsed character.
*/
export function tokenize(chunk, streaming, chunkOffset, options, parserOptions, handler) {
var startTagTokenPool = options.startTagTokenPool, endTagToken = options.endTagToken, dataToken = options.dataToken;
var cdataEnabled = parserOptions.cdataEnabled, processingInstructionsEnabled = parserOptions.processingInstructionsEnabled, selfClosingEnabled = parserOptions.selfClosingEnabled, decodeText = parserOptions.decodeText, renameTag = parserOptions.renameTag, checkCdataTag = parserOptions.checkCdataTag;
var startTagCallback = handler.startTag, endTagCallback = handler.endTag, textCallback = handler.text, commentCallback = handler.comment, processingInstructionCallback = handler.processingInstruction, cdataCallback = handler.cdata, doctypeCallback = handler.doctype;
var textStart = -1;
var textEnd = 0;
var tagParsingEnabled = true;
var startTagName;
var charCount = chunk.length;
var i = 0;
var j;
// This function is inlined by Terser
var triggerTextCallback = function () {
if (textStart !== -1) {
triggerDataCallback(chunk, chunkOffset, 3 /* TEXT */, dataToken, textCallback, textStart, textEnd, 0, 0, decodeText);
textStart = -1;
}
};
while (i < charCount) {
// Text
if (textStart === -1) {
var k = takeText(chunk, i);
if (k === -1 /* NO_MATCH */ && (k = charCount) && streaming) {
break;
}
if (k !== i) {
textStart = i;
textEnd = i = k;
continue;
}
}
if (tagParsingEnabled) {
// Start tag
j = takeStartTagOpening(chunk, i);
if (j !== -1 /* NO_MATCH */) {
var token = startTagTokenPool.take();
var attributes = token.attributes;
var nameStart = i + 1;
var nameEnd = j;
var rawTagName = chunk.substring(nameStart, nameEnd);
var tagName = renameTag != null ? renameTag(rawTagName) : rawTagName;
j = tokenizeAttributes(chunk, j, chunkOffset, attributes, options, parserOptions);
// Skip malformed content and excessive whitespaces
var k = takeUntilGt(chunk, j);
if (k === -1 /* NO_MATCH */) {
// Unterminated start tag
return i;
}
var selfClosing = selfClosingEnabled && k - j >= 2 && chunk.charCodeAt(k - 2) === 47 /* '/' */ || false;
/*@__INLINE__*/
triggerTextCallback();
token.rawName = rawTagName;
token.name = tagName;
token.selfClosing = selfClosing;
token.start = chunkOffset + i;
token.end = chunkOffset + k;
token.nameStart = chunkOffset + nameStart;
token.nameEnd = chunkOffset + nameEnd;
if (!selfClosing) {
startTagName = tagName;
tagParsingEnabled = !(checkCdataTag === null || checkCdataTag === void 0 ? void 0 : checkCdataTag(token));
}
i = k;
startTagCallback === null || startTagCallback === void 0 ? void 0 : startTagCallback(token);
// Start tag token and its attributes must be returned to the pool owner
continue;
}
}
// End tag
j = takeEndTagOpening(chunk, i);
if (j !== -1 /* NO_MATCH */) {
var nameStart = i + 2;
var nameEnd = j;
var rawTagName = chunk.substring(nameStart, nameEnd);
var tagName = renameTag != null ? renameTag(rawTagName) : rawTagName;
if (tagParsingEnabled || startTagName === tagName) {
// Resume tag parsing if CDATA content tag has ended
tagParsingEnabled = true;
// Skip malformed content and excessive whitespaces
var k = takeUntilGt(chunk, j);
if (k === -1 /* NO_MATCH */) {
// Unterminated end tag
return i;
}
/*@__INLINE__*/
triggerTextCallback();
if (endTagCallback) {
endTagToken.rawName = rawTagName;
endTagToken.name = tagName;
endTagToken.start = chunkOffset + i;
endTagToken.end = chunkOffset + k;
endTagToken.nameStart = chunkOffset + nameStart;
endTagToken.nameEnd = chunkOffset + nameEnd;
endTagCallback(endTagToken);
}
i = k;
continue;
}
}
if (tagParsingEnabled) {
var k = void 0;
// Comment
k = j = takeComment(chunk, i);
if (j !== -1 /* NO_MATCH */) {
if (j > charCount && streaming) {
return i;
}
/*@__INLINE__*/
triggerTextCallback();
i = triggerDataCallback(chunk, chunkOffset, 8 /* COMMENT */, dataToken, commentCallback, i, j, 4, 3, decodeText);
continue;
}
// Doctype
k = j = takeDoctype(chunk, i);
if (j !== -1 /* NO_MATCH */) {
if (j > charCount && streaming) {
return i;
}
/*@__INLINE__*/
triggerTextCallback();
i = triggerDataCallback(chunk, chunkOffset, 10 /* DOCTYPE */, dataToken, doctypeCallback, i, j, 9, 1);
continue;
}
// CDATA section
j = takeCdata(chunk, i);
if (j !== -1 /* NO_MATCH */) {
if (j > charCount && streaming) {
return i;
}
/*@__INLINE__*/
triggerTextCallback();
if (cdataEnabled) {
i = triggerDataCallback(chunk, chunkOffset, 4 /* CDATA_SECTION */, dataToken, cdataCallback, i, j, 9, 3);
}
else {
i = triggerDataCallback(chunk, chunkOffset, 8 /* COMMENT */, dataToken, commentCallback, i, j, 2, 1);
}
continue;
}
// Processing instruction
j = takeProcessingInstruction(chunk, i);
if (j !== -1 /* NO_MATCH */) {
if (j > charCount && streaming) {
return i;
}
/*@__INLINE__*/
triggerTextCallback();
if (processingInstructionsEnabled) {
i = triggerDataCallback(chunk, chunkOffset, 7 /* PROCESSING_INSTRUCTION */, dataToken, processingInstructionCallback, i, j, 2, 2);
}
else {
i = triggerDataCallback(chunk, chunkOffset, 8 /* COMMENT */, dataToken, commentCallback, i, j, 1, 1);
}
continue;
}
// DTD
j = takeDtd(chunk, i);
if (j !== -1 /* NO_MATCH */) {
if (j > charCount && streaming) {
return i;
}
/*@__INLINE__*/
triggerTextCallback();
if (cdataEnabled) {
i = Math.min(j, charCount);
}
else {
i = triggerDataCallback(chunk, chunkOffset, 8 /* COMMENT */, dataToken, commentCallback, i, j, 2, 1, decodeText);
}
continue;
}
}
// Concat with existing text
if (textStart === -1) {
textStart = i;
}
textEnd = takeText(chunk, i + 1);
if (textEnd === -1) {
textEnd = charCount;
break;
}
i = textEnd;
}
if (streaming) {
if (textStart !== -1) {
return textStart;
}
return i;
}
/*@__INLINE__*/
triggerTextCallback();
return i;
}
/**
* Populates `dataToken` and passes it to `dataCallback`.
*/
function triggerDataCallback(chunk, chunkOffset, tokenType, dataToken, dataCallback, start, end, offsetStart, offsetEnd, decodeData) {
var charCount = chunk.length;
var index = Math.min(end, charCount);
if (!dataCallback) {
return index;
}
var dataStart = start + offsetStart;
var dataEnd = Math.min(end - offsetEnd, charCount);
var rawData = chunk.substring(dataStart, dataEnd);
dataToken.tokenType = tokenType;
dataToken.rawData = rawData;
dataToken.data = decodeData != null ? decodeData(rawData) : rawData;
dataToken.start = chunkOffset + start;
dataToken.end = chunkOffset + index;
dataToken.dataStart = chunkOffset + dataStart;
dataToken.dataEnd = chunkOffset + dataEnd;
dataCallback(dataToken);
return index;
}