// fortissimo-html
// Version:
// Fortissimo HTML - Flexible, Forgiving, Formatting HTML Parser
// 950 lines • 39.1 kB
// JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.HtmlParser = exports.State = exports.ParseResults = void 0;
const platform_specifics_1 = require("./platform-specifics");
const elements_1 = require("./elements");
const characters_1 = require("./characters");
const dom_1 = require("./dom");
/**
 * Outcome of one parse run.
 *
 * The counters start at zero and are filled in by HtmlParser while it runs;
 * `domRoot` is attached by the parser when parsing starts.
 */
class ParseResults {
    constructor() {
        Object.assign(this, {
            characters: 0,
            errors: 0,
            implicitlyClosedTags: 0,
            lines: 0,
            stopped: false,
            totalTime: 0,
            unclosedTags: 0,
        });
    }

    /**
     * @returns {string} The string form of `domRoot`, or '' when there is no
     *   root (or the root renders as an empty/falsy value).
     */
    toString() {
        if (!this.domRoot)
            return '';
        return this.domRoot.toString() || '';
    }
}
exports.ParseResults = ParseResults;
// Default for parseAsync()'s `yieldTime` argument: how long one parseLoop() slice may run, as measured by
// processMillis(), before the loop returns so the next slice can be scheduled with setTimeout.
// NOTE(review): presumably milliseconds -- confirm against platform-specifics.
const DEFAULT_YIELD_TIME = 50;
/**
 * Parser states, exposed as a two-way (name <-> number) enum object.
 *
 * The numeric order is significant: parseLoop() compares states against
 * AT_COMMENT_START, and parseLoopWrapUp() against AT_ATTRIBUTE_VALUE.
 */
var State;
(function (State) {
    const STATE_NAMES = [
        'OUTSIDE_MARKUP',
        'AT_ATTRIBUTE_ASSIGNMENT',
        'AT_ATTRIBUTE_START',
        'AT_ATTRIBUTE_VALUE',
        'AT_END_TAG_START',
        'AT_MARKUP_START',
        'AT_START_TAG_START',
        'IN_END_TAG',
        // States below this point are subject to special unexpected EOF handling
        'AT_COMMENT_START',
        'AT_DECLARATION_START',
        'AT_PROCESSING_START',
        'IN_SCRIPT_ELEMENT',
        'IN_STYLE_ELEMENT',
        'IN_TEXT_AREA_ELEMENT',
    ];
    STATE_NAMES.forEach((name, value) => {
        State[name] = value;
        State[value] = name;
    });
})(State = exports.State || (exports.State = {}));
// Multi-character matchers handed to getChar() in normal (non-fast) mode. None of them will match across a
// tab, a line break or a surrogate code unit; getChar() consumes those one at a time so that it can maintain
// line/column counts and join surrogate pairs.
const RE_WHITESPACE = /^([ \f]+)/;
const RE_TEXT = /^([^<\t\n\r\uD800-\uDFFF]+)/;
const RE_ATTRIB_NAME = /^([^=/>\s\uD800-\uDFFF]+)/;
const RE_COMMENT = /^([^->\t\n\r\uD800-\uDFFF]+)/;
const RE_DECLARATION = /^([^>\t\n\r\uD800-\uDFFF]+)/;
// Attribute-value matchers, keyed by the quote character in use ('' for an unquoted value).
const RE_ATTRIB_VALUE = {
'"': /^([^"\t\n\r\uD800-\uDFFF]+)/,
"'": /^([^'\t\n\r\uD800-\uDFFF]+)/,
'': /^([^=/>\s\uD800-\uDFFF]+)/,
};
// Looser matchers selected by adjustOptions() when the `fast` option is set.
const RE_WHITESPACE_FAST = /^([ \t\n\f\r]+)/;
const RE_TEXT_FAST = /^([^<]+)/;
const RE_COMMENT_FAST = /^([^->]+)/;
const RE_DECLARATION_FAST = /^([^>]+)/;
// Used by handleFullTag() to grab the remainder of a start tag, through its closing `>`, in one step.
const RE_TAG_FAST = /^([^>]+>)/;
// Events which callback() will forward to a 'generic' handler when no event-specific handler is registered.
const CAN_BE_HANDLED_GENERICALLY = new Set(['attribute', 'cdata', 'comment', 'declaration', 'end-tag', 'error',
'processing', 'start-tag-end', 'start-tag-start', 'text']);
class HtmlParser {
constructor(options) {
this.attribute = '';
this.charset = '';
this.callbacks = new Map();
this.checkingCharset = false;
this.collectedSpace = '';
this.column = 0;
this.columnIncrement = 1;
this.contentType = false;
this.currentTag = '';
this.currentTagLc = '';
this.dom = new dom_1.DomModel();
this.fast = HtmlParser.DEFAULT_OPTIONS.fast;
this.leadingSpace = '';
this.line = 1;
this.parserRunning = false;
this.pendingCharset = '';
this.pendingSource = '';
this.pendingReset = false;
this.preEqualsSpace = '';
this.putBacks = [];
this.reAttribValue = RE_ATTRIB_VALUE;
this.state = State.OUTSIDE_MARKUP;
this.stopped = false;
this.tabSize = HtmlParser.DEFAULT_OPTIONS.tabSize;
this.xmlMode = false;
this.yieldTime = 0;
this.options = {};
Object.assign(this.options, HtmlParser.DEFAULT_OPTIONS);
if (options && options.fast)
this.options.eol = false;
Object.assign(this.options, options);
this.adjustOptions();
this.xmlMode = this.options.xmlMode;
}
on(event, callback) {
if (!callback)
this.callbacks.delete(event);
else
this.callbacks.set(event, callback);
return this;
}
off(event) {
return this.on(event, null);
}
callback(event, ...args) {
if (!this.parserRunning && event !== 'completion')
return false;
let cb = this.callbacks.get(event);
if (cb)
return cb(...args);
cb = this.callbacks.get('generic');
if (!cb || !CAN_BE_HANDLED_GENERICALLY.has(event))
return;
switch (event) {
case 'attribute': return cb(-1, args[0] + args[1] + args[2] + (0, dom_1.OQ)(args[4]) + args[3] + (0, dom_1.CQ)(args[4]));
case 'cdata': return cb(args[0], '<![CDATA[' + args[1] + (args[2] ? ']]>' : ''));
case 'comment': return cb(args[0], '<!--' + args[1] + (args[2] ? '-->' : ''));
case 'declaration': return cb(args[0], '<!' + args[1] + (args[2] ? '>' : ''));
case 'end-tag': return cb(args[0], '</' + args[1] + args[2]);
case 'error': return cb(-1, args[3] || '');
case 'processing': return cb(args[0], '<?' + args[1] + (args[2] ? '>' : ''));
case 'start-tag-end': return cb(args[0], args[1] + args[2]);
case 'start-tag-start': return cb(args[0], '<' + args[1]);
case 'text': return cb(args[0], args[1]);
}
}
stop() {
this.charset = '';
this.checkingCharset = false;
this.columnIncrement = 1;
this.htmlSource = '';
this.parserRunning = false;
this.pendingCharset = '';
this.putBacks = [];
this.stopped = true;
}
reset() {
this.charset = '';
this.checkingCharset = false;
this.collectedSpace = '';
this.column = 0;
this.columnIncrement = 1;
this.contentType = false;
this.dom = new dom_1.DomModel();
this.htmlSource = '';
this.leadingSpace = '';
this.line = 1;
this.parseResults = undefined;
this.parserRunning = false;
this.pendingCharset = '';
this.pendingReset = false;
this.pendingSource = '';
this.putBacks = [];
this.stopped = false;
this.state = State.OUTSIDE_MARKUP;
this.xmlMode = this.options.xmlMode;
}
startParsing(source) {
this.startTime = (0, platform_specifics_1.processMillis)();
if (this.fast && this.options.eol)
source = source.replace(/\r\n|\r|\n/g, this.options.eol);
this.parserRunning = true;
this.htmlSource = source || '';
this.pendingSource = '';
this.putBacks = [];
this.state = State.OUTSIDE_MARKUP;
this.dom = new dom_1.DomModel();
this.parseResults = new ParseResults();
this.parseResults.domRoot = this.dom.getRoot();
this.checkEncoding(this.htmlSource);
}
parse(source) {
this.startParsing(source);
this.yieldTime = 0;
if (this.parseResults)
this.parseLoop();
return this.parseResults;
}
async parseAsync(source, yieldTime = DEFAULT_YIELD_TIME) {
this.startParsing(source);
this.yieldTime = yieldTime;
if (!this.parserRunning)
return undefined;
return new Promise(resolve => {
const parse = () => {
this.parseLoop();
if (this.pendingReset) {
this.reset();
this.callback('completion', null);
resolve(undefined);
}
else if (this.stopped) {
this.callback('completion', this.parseResults);
resolve(this.parseResults);
}
else if (!this.parserRunning)
resolve(this.parseResults);
else
setTimeout(parse);
};
parse();
});
}
checkEncoding(firstChars) {
let encoding;
if (/^(\x00\x00\xFE\xFF|\x00\x00\x00[\x01-\xFF]\x00\x00\x00[\x01-\xFF])/.test(firstChars))
encoding = 'UTF-32BE';
else if (/^(\xFF\xFE\x00\x00|[\x01-\xFF]\x00\x00\x00[\x01-\xFF]\x00\x00\x00)/.test(firstChars))
encoding = 'UTF-32LE';
else if (/^(\xFE\xFF|\x00[\x01-\xFF]\x00[\x01-\xFF])/.test(firstChars))
encoding = 'UTF-16BE';
else if (/^(\xFF\xFE|[\x01-\xFF]\x00[\x01-\xFF]\x00)/.test(firstChars))
encoding = 'UTF-16LE';
if (encoding) {
const bailout = this.callback('encoding', encoding, encoding.toLowerCase().replace('-', ''), false);
if (bailout)
this.reset();
}
}
/**
 * Main state machine. Pulls characters from the source and dispatches on `this.state` until the input is
 * exhausted (then finishes with parseLoopWrapUp()), until the time slice given by `yieldTime` is used up, or
 * until an 'encoding' handler requests a bail-out. In the latter two cases it returns WITHOUT wrapping up;
 * parseAsync() decides what happens next.
 */
parseLoop() {
const loopStartTime = (0, platform_specifics_1.processMillis)();
let ch;
let content;
let terminated;
let isCData;
let endTag;
// States at or beyond AT_COMMENT_START get one more pass with an empty `ch` at end of input, so that their
// handlers can report the unterminated construct.
while ((ch = this.getChar()) || this.state >= State.AT_COMMENT_START) {
if (ch) {
// Remember where text begins, before any leading whitespace is gathered.
if (HtmlParser.TEXT_STARTERS.has(this.state)) {
this.textLine = this.line;
this.textColumn = this.column;
}
// Leading whitespace accumulates in this.collectedSpace; `ch` becomes the first non-whitespace character
// (or '' if the input ran out).
ch = this.gatherWhitespace(ch);
}
if (!ch && this.state < State.AT_COMMENT_START)
break;
switch (this.state) {
case State.OUTSIDE_MARKUP:
this.putBack(ch);
this.handleText(this.collectedSpace + this.gatherText());
break;
case State.AT_MARKUP_START:
this.handleMarkupStart(ch);
break;
case State.AT_START_TAG_START:
// Fast mode consumes the whole start tag, attributes included, in one step.
if (this.fast)
this.handleFullTag(ch);
else {
this.gatherTagName(ch);
this.handleStartTagStart();
}
break;
case State.AT_END_TAG_START:
// `</>`: an end tag with no name.
if (ch === '>') {
this.currentTag = this.currentTagLc = '';
this.putBack(ch);
}
else {
this.gatherTagName(ch);
this.collectedSpace = '';
}
this.state = State.IN_END_TAG;
break;
case State.IN_END_TAG:
const invalidEnding = this.handleEndTag(ch);
if (invalidEnding) {
// Skip the junk up to the next `>` and close the element anyway, keeping the junk as the end tag's text.
this.gatherInvalidEndTagEnding();
this.pop(this.currentTagLc, `</${this.currentTag}${this.pendingSource}`);
this.doEndTagCallback(this.currentTag, this.pendingSource);
}
break;
case State.AT_ATTRIBUTE_START:
let end = '>';
// A `/` only ends the tag when immediately followed by `>`; otherwise it is handed on as a stray `/`.
if (ch === '/') {
end = '/>';
ch = this.getChar();
if (ch !== '>') {
this.putBack(ch);
ch = '/';
}
}
const getAttribName = this.handleAttributeStart(ch, end);
if (getAttribName)
this.gatherAttributeName(ch);
break;
case State.AT_ATTRIBUTE_ASSIGNMENT:
this.handleAttributeAssignment(ch);
break;
case State.AT_ATTRIBUTE_VALUE:
// `quote` is undefined when the tag ended right after `=`; otherwise '"', "'" or '' (unquoted value).
const quote = this.handleAttributeValueStepOne(ch);
if (quote !== undefined) {
let value;
[value, terminated] = this.gatherAttributeValue(quote, quote ? '' : ch);
// A true result means an 'encoding' handler asked to bail out: leave at once, skipping wrap-up.
if (this.handleAttributeValueStepTwo(quote, value, terminated))
return;
}
this.state = State.AT_ATTRIBUTE_START;
break;
case State.AT_DECLARATION_START:
// Step one returns false when `<!--` was recognized and the state switched to AT_COMMENT_START.
if (this.handleDeclarationStartStepOne(ch)) {
[content, terminated, isCData] = this.gatherDeclarationOrProcessing(this.collectedSpace + ch, this.dom.shouldParseCData());
this.handleDeclarationStartStepTwo(content, terminated, isCData);
}
break;
case State.AT_PROCESSING_START:
[content, terminated] = this.gatherDeclarationOrProcessing(this.collectedSpace + ch);
this.handleProcessingStart(content, terminated);
break;
case State.AT_COMMENT_START:
[content, terminated] = this.gatherComment(this.collectedSpace + ch);
this.handleCommentStart(content, terminated);
break;
case State.IN_STYLE_ELEMENT:
case State.IN_SCRIPT_ELEMENT:
case State.IN_TEXT_AREA_ELEMENT:
// Content of these elements is taken as raw text up to the matching end tag.
const tag = HtmlParser.tagForState[this.state];
if (ch === '<') {
this.markupLine = this.line;
this.markupColumn = this.column;
}
[content, endTag, terminated] = this.gatherUntilEndTag(tag, ch);
this.handleTextBlockElements(tag, content, endTag, terminated);
break;
}
// Time slice used up (only when yieldTime is non-zero): return so the caller can schedule the next slice.
if (this.yieldTime && (0, platform_specifics_1.processMillis)() >= loopStartTime + this.yieldTime)
return;
}
this.parseLoopWrapUp();
}
parseLoopWrapUp() {
if (this.state !== State.OUTSIDE_MARKUP) {
++this.parseResults.errors;
if (this.state <= State.AT_ATTRIBUTE_VALUE) {
if (this.state === State.AT_ATTRIBUTE_ASSIGNMENT) {
this.dom.addAttribute(this.attribute, '', this.leadingSpace, '', '');
this.doAttributeCallback('', '', '');
}
else if (this.state === State.AT_ATTRIBUTE_VALUE) {
const equals = this.preEqualsSpace + '=';
this.dom.addAttribute(this.attribute, '', this.leadingSpace, equals, '');
this.doAttributeCallback(equals, '', '');
}
this.dom.getCurrentNode().badTerminator = '';
this.callback('error', `Unexpected end of <${this.currentTag}> tag`, this.line, this.column);
}
else if (this.state === State.AT_END_TAG_START || this.state === State.IN_END_TAG) {
this.callback('error', 'Unexpected end of file in end tag', this.line, this.column, this.pendingSource);
this.dom.addChild(new dom_1.UnmatchedClosingTag(this.pendingSource, this.line, this.column));
this.collectedSpace = '';
}
else
this.callback('error', 'Unexpected end of file', this.line, this.column, this.pendingSource);
}
if (!this.parseResults) // In case parser reset while running.
return;
if (this.collectedSpace) {
this.dom.addChild(new dom_1.TextElement(this.collectedSpace, this.textLine, this.textColumn, true));
this.callback('text', this.dom.getDepth() + 1, this.collectedSpace);
}
[this.parseResults.unclosedTags, this.parseResults.implicitlyClosedTags] =
this.dom.getRoot().countUnclosed();
this.parseResults.lines = this.line;
this.parseResults.stopped = this.stopped;
this.parseResults.totalTime = (0, platform_specifics_1.processMillis)() - this.startTime;
this.callback('completion', this.parseResults);
this.parserRunning = false;
}
handleText(text) {
if (text) {
this.dom.addChild(new dom_1.TextElement(text, this.textLine, this.textColumn, true));
this.pendingSource = this.atEOF() && this.putBacks.length === 0 ? '' : '<';
this.callback('text', this.dom.getDepth() + 1, text, true);
}
this.collectedSpace = '';
this.currentTag = this.currentTagLc = '';
this.state = State.AT_MARKUP_START;
}
handleMarkupStart(ch) {
this.markupLine = this.line;
this.markupColumn = this.column - 1;
switch (ch) {
case '/':
this.state = State.AT_END_TAG_START;
break;
case '!':
case '?':
this.state = (ch === '!' ? State.AT_DECLARATION_START : State.AT_PROCESSING_START);
this.collectedSpace = '';
break;
default:
this.state = State.AT_START_TAG_START;
this.putBack(ch);
}
}
handleFullTag(init) {
let fullTag = init + this.getChar(RE_TAG_FAST);
const end = (/(\/>|>)$/.exec(fullTag) || [''])[0];
if (end)
fullTag = fullTag.substr(0, fullTag.length - end.length);
let tag, attribs;
[tag, attribs, this.collectedSpace] = /^(\S+)((?:.|\s)*?)(\s*)$/.exec(fullTag).slice(1);
this.currentTag = tag;
this.currentTagLc = tag.toLowerCase();
const node = new dom_1.DomNode(this.currentTagLc, 0, 0);
this.dom.prePush(node);
this.dom.addChild(node);
this.dom.push(node);
this.callback('start-tag-start', this.dom.getDepth(), tag);
const attribMatcher = /(\s+)([^=\s]+)(?:(\s*=\s*)("[^"]*"?|'[^']*'?|\S*)?)?/g;
let $;
while (($ = attribMatcher.exec(attribs))) {
let [lead, attrib, equals, value] = $.slice(1);
let quote;
equals = equals || '';
value = value || '';
if (value.startsWith('"')) {
quote = value.endsWith('"') ? '"' : '_"';
value = value.replace(/"/g, '');
}
else if (value.startsWith("'")) {
quote = value.endsWith("'") ? "'" : "_'";
value = value.replace(/'/g, '');
}
this.dom.addAttribute(attrib, value, lead, equals, quote);
this.attribute = attrib;
this.doAttributeCallback(equals, value, quote);
}
this.handleAttributeStart('>', end);
}
handleStartTagStart() {
const node = new dom_1.DomNode(this.currentTag, this.markupLine, this.markupColumn);
this.dom.prePush(node);
this.dom.addChild(node);
this.dom.push(node);
this.callback('start-tag-start', this.dom.getDepth(), this.currentTag);
this.checkingCharset = (!this.charset && this.currentTagLc === 'meta');
this.collectedSpace = '';
this.pendingSource = '';
this.state = State.AT_ATTRIBUTE_START;
}
handleEndTag(ch) {
let invalidEnding = false;
if (ch !== '>') {
if (this.xmlMode) {
this.putBack(ch);
this.pop(this.currentTagLc, this.pendingSource);
this.reportError('Syntax error in end tag');
}
else {
if (this.atEOF())
return false;
++this.parseResults.errors;
this.callback('error', 'Syntax error in end tag', this.line, this.column, '');
this.pendingSource = this.collectedSpace + ch;
invalidEnding = true;
}
}
else if (!this.currentTag) {
++this.parseResults.errors;
this.callback('error', 'Empty end tag', this.line, this.column, this.pendingSource);
this.dom.addChild(new dom_1.UnmatchedClosingTag(this.pendingSource, this.line, this.column));
this.collectedSpace = '';
this.pendingSource = '';
this.state = State.OUTSIDE_MARKUP;
}
else {
this.pop(this.currentTagLc, `</${this.currentTag}${this.collectedSpace}>`);
this.doEndTagCallback(this.currentTag, this.collectedSpace + '>');
}
return invalidEnding;
}
/**
 * Handles the character found where an attribute may begin inside a start tag: either the tag's closing
 * `>`, a stray `/`, the first character of an attribute name, or a character that is invalid here.
 *
 * @param ch Character to examine; `>` means the tag is ending.
 * @param end Text which closes the tag, `'>'` or `'/>'` (fast mode may pass `''` for an unterminated tag).
 * @returns true when the caller should go on to gather an attribute name starting with `ch`.
 */
handleAttributeStart(ch, end) {
let getAttribName = false;
if (ch !== '>') {
if (ch === '/' && !this.xmlMode) {
// Most browsers seem to simply ignore stray slashes in tags which aren't followed by `>`.
// Here we turn it into its own valueless attribute.
this.attribute = '/';
this.leadingSpace = this.collectedSpace;
this.collectedSpace = '';
this.dom.addAttribute('/', '', this.leadingSpace, '', '');
this.doAttributeCallback('', '', '');
this.state = State.AT_ATTRIBUTE_START;
}
else if ((0, characters_1.isAttributeNameChar)(ch, !this.xmlMode)) {
this.leadingSpace = this.collectedSpace;
this.collectedSpace = '';
this.state = State.AT_ATTRIBUTE_ASSIGNMENT;
getAttribName = true;
}
else {
// Invalid character: record it as the tag's terminator and report; reportError() returns the parser to
// the outside-markup state.
this.dom.addInnerWhitespace(this.collectedSpace);
this.dom.getCurrentNode().badTerminator = ch;
this.reportError(`Syntax error in <${this.currentTag}>`);
}
}
else {
// End of the start tag.
this.dom.addInnerWhitespace(this.collectedSpace);
this.callback('start-tag-end', this.dom.getDepth(), this.collectedSpace, end);
this.collectedSpace = '';
this.pendingSource = '';
this.checkingCharset = false;
this.contentType = false;
this.pendingCharset = '';
// Self-closed (`/>`) tags, and void elements outside of XML mode, are closed immediately.
if (end.length > 1 || (!this.xmlMode && elements_1.VOID_ELEMENTS.has(this.currentTagLc))) {
this.pop(end.length > 1 ? null : undefined);
this.state = State.OUTSIDE_MARKUP;
}
// The content of these elements is gathered as raw text up to the matching end tag.
else if (this.currentTagLc === 'script')
this.state = State.IN_SCRIPT_ELEMENT;
else if (this.currentTagLc === 'style')
this.state = State.IN_STYLE_ELEMENT;
else if (this.currentTagLc === 'textarea')
this.state = State.IN_TEXT_AREA_ELEMENT;
else
this.state = State.OUTSIDE_MARKUP;
}
return getAttribName;
}
handleAttributeAssignment(ch) {
if (ch === '=') {
this.preEqualsSpace = this.collectedSpace;
this.collectedSpace = '';
this.state = State.AT_ATTRIBUTE_VALUE;
}
else {
this.dom.addAttribute(this.attribute, '', this.leadingSpace, '', '');
this.doAttributeCallback('', '', '');
this.putBack(ch);
this.state = State.AT_ATTRIBUTE_START;
}
}
handleAttributeValueStepOne(ch) {
if (ch === '>') {
const equals = this.preEqualsSpace + '=';
this.dom.addAttribute(this.attribute, '', this.leadingSpace, equals, '');
this.doAttributeCallback(equals, '', '');
this.putBack(ch);
return undefined;
}
return (ch === '"' || ch === "'") ? ch : '';
}
/**
 * Records a gathered attribute value and, while a charset-declaring <meta> tag is being examined, watches
 * for a character encoding declaration.
 *
 * @param quote Quote character which opened the value ('' if unquoted).
 * @param value Attribute value, without quotes.
 * @param terminated false if the input ended before the closing quote was found.
 * @returns true if an 'encoding' handler asked for parsing to be abandoned, in which case the parser is
 *   marked as not running with a reset pending; otherwise false.
 */
handleAttributeValueStepTwo(quote, value, terminated) {
const equals = this.preEqualsSpace + '=' + this.collectedSpace;
// A leading `_` marks a quote which was never closed.
quote = (terminated ? '' : '_') + quote;
this.dom.addAttribute(this.attribute, value, this.leadingSpace, equals, quote);
this.doAttributeCallback(equals, value, quote);
this.collectedSpace = '';
if (this.checkingCharset) {
const attribLc = this.attribute.toLowerCase();
// <meta charset="...">
if (attribLc === 'charset')
this.charset = value.trim();
// <meta http-equiv="content-type" content="...; charset=...">, with the two attributes in either order:
// a charset seen before http-equiv waits in pendingCharset until http-equiv confirms it.
else if (attribLc === 'http-equiv' && value.toLowerCase() === 'content-type') {
this.contentType = true;
this.charset = this.pendingCharset;
}
else if (attribLc === 'content') {
// `charset` is undefined when the content value names no charset.
const charset = (/\bcharset[ \n\r\t\f]*=[ \n\r\t\f]*([\w\-]+)\b/i.exec(value) || [])[1];
if (this.contentType)
this.charset = charset;
else
this.pendingCharset = charset;
}
if (this.charset && this.parserRunning && this.callbacks.has('encoding')) {
// The handler also receives a normalized name: lower case, with a trailing `:dddd` and all characters
// other than digits and a-z removed.
const bailout = this.callback('encoding', this.charset, this.charset.toLowerCase().replace(/:\d{4}$|[^0-9a-z]/g, ''), true);
if (bailout) {
this.parserRunning = false;
this.pendingReset = true;
return true;
}
}
}
return false;
}
handleDeclarationStartStepOne(ch) {
if (this.collectedSpace.length === 0 && ch === '-') {
const ch2 = this.getChar();
if (ch2 === '-') {
this.state = State.AT_COMMENT_START;
return false;
}
else
this.putBack(ch2);
}
return true;
}
handleDeclarationStartStepTwo(content, terminated, isCData) {
if (isCData) {
this.dom.addChild(new dom_1.CData(content, this.markupLine, this.markupColumn, terminated));
if (!terminated)
this.reportError('File ended in unterminated CDATA', false);
this.callback('cdata', this.dom.getDepth() + 1, content, terminated);
}
else if (/^doctype\b/i.test(content)) {
const docType = new dom_1.DocType(content, this.markupLine, this.markupColumn, terminated);
this.dom.addChild(docType);
if (!terminated)
this.reportError('File ended in unterminated doctype', false);
if (this.parserRunning && this.callbacks.has('doctype'))
this.callback('doctype', docType, terminated);
else
this.callback('declaration', this.dom.getDepth() + 1, content, terminated);
this.xmlMode = (docType.type === 'xhtml');
this.dom.setXmlMode(this.xmlMode);
}
else {
this.dom.addChild(new dom_1.DeclarationElement(content, this.markupLine, this.markupColumn, terminated));
if (!terminated)
this.reportError('File ended in unterminated declaration', false);
this.callback('declaration', this.dom.getDepth() + 1, content, terminated);
}
this.collectedSpace = '';
this.pendingSource = '';
this.leadingSpace = '';
this.state = State.OUTSIDE_MARKUP;
}
handleProcessingStart(content, terminated) {
this.dom.addChild(new dom_1.ProcessingElement(content, this.markupLine, this.markupColumn, terminated));
if (!terminated)
this.reportError('File ended in unterminated processing instruction', false);
this.callback('processing', this.dom.getDepth() + 1, content, terminated);
if (content.startsWith('xml ') && this.dom.canDoXmlMode()) {
this.xmlMode = true;
this.dom.setXmlMode(true);
}
this.collectedSpace = '';
this.pendingSource = '';
this.leadingSpace = '';
this.state = State.OUTSIDE_MARKUP;
}
handleCommentStart(content, terminated) {
this.dom.addChild(new dom_1.CommentElement(content, this.markupLine, this.markupColumn, terminated));
if (!terminated)
this.reportError('File ended in unterminated comment', false);
this.callback('comment', this.dom.getDepth() + 1, content, terminated);
this.collectedSpace = '';
this.pendingSource = '';
this.leadingSpace = '';
this.state = State.OUTSIDE_MARKUP;
}
handleTextBlockElements(tag, content, endTag, terminated) {
if (!terminated) {
this.reportError(`File ended in unterminated <${tag}> section`, false);
this.dom.getCurrentNode().closureState = dom_1.ClosureState.UNCLOSED;
}
if (this.collectedSpace || content) {
content = this.collectedSpace + content;
this.dom.addChild(new dom_1.TextElement(content, this.textLine, this.textColumn, tag === 'textarea'));
this.callback('text', this.dom.getDepth() + 1, content, tag === 'textarea');
this.collectedSpace = '';
this.pendingSource = '';
}
if (terminated) {
const $$ = new RegExp('^<\\/(' + tag + ')([ \\n\\r\\t\\f]*)>$', 'i').exec(endTag);
this.pop(tag, `</${$$[1]}${$$[2]}>`);
this.doEndTagCallback($$[1], $$[2] + '>');
}
this.state = State.OUTSIDE_MARKUP;
}
pop(tagLc, endTagText = '') {
if (!this.dom.pop(tagLc, endTagText, this.markupLine, this.markupColumn)) {
++this.parseResults.errors;
this.callback('error', `Unmatched closing tag </${tagLc}>`, this.line, this.column, '');
}
}
reportError(message, reportPending = true) {
++this.parseResults.errors;
this.callback('error', message, this.line, this.column, reportPending ? this.pendingSource : '');
this.state = State.OUTSIDE_MARKUP;
if (reportPending) {
this.collectedSpace = '';
this.pendingSource = '';
}
}
doEndTagCallback(tag, trailingContent) {
this.callback('end-tag', this.dom.getDepth() + 1, tag, trailingContent);
this.state = State.OUTSIDE_MARKUP;
this.collectedSpace = '';
this.pendingSource = '';
}
doAttributeCallback(equalSign, value, quote) {
this.callback('attribute', this.leadingSpace, this.attribute, equalSign, value, quote);
this.pendingSource = '';
}
atEOF() {
return !this.htmlSource;
}
getChar(multi) {
let ch;
if (this.putBacks.length > 0) {
ch = this.putBacks.pop();
this.pendingSource += ch;
if (!this.fast) {
if ((0, characters_1.isEol)(ch)) {
++this.line;
this.column = 0;
}
else {
this.column += this.columnIncrement;
this.columnIncrement = (ch === '\t' ? this.tabSize - (this.column - 1) % this.tabSize : 1);
}
}
return ch;
}
if (this.htmlSource.length === 0)
return '';
else if (multi) {
const $ = multi.exec(this.htmlSource);
if ($) {
this.htmlSource = this.htmlSource.slice($[1].length);
this.parseResults.characters += $[1].length;
this.column += this.columnIncrement + $[1].length - 1;
return $[1];
}
}
let skip = 1;
++this.parseResults.characters;
ch = this.htmlSource.charAt(0);
if (!this.fast && ch === '\r') {
const ch2 = this.htmlSource.charAt(1);
if (ch2 === '\n') {
++this.parseResults.characters;
ch += '\n';
skip = 2;
}
}
if (!this.fast) {
if ((0, characters_1.isEol)(ch)) {
++this.line;
this.column = 0;
if (this.options.eol)
ch = this.options.eol;
}
else {
const cp = ch.charCodeAt(0);
this.column += this.columnIncrement;
this.columnIncrement = (ch === '\t' ? this.tabSize - (this.column - 1) % this.tabSize : 1);
// Test for high surrogate
if (0xD800 <= cp && cp <= 0xDBFF) {
const ch2 = this.htmlSource.charAt(1);
if (ch2) {
const cp2 = ch2.charCodeAt(0);
// Test for low surrogate
if (0xDC00 <= cp2 && cp2 <= 0xDFFF) {
ch += ch2;
skip = 2;
}
}
}
}
}
this.pendingSource += ch;
this.htmlSource = this.htmlSource.slice(skip);
return ch;
}
putBack(ch) {
this.putBacks.push(ch);
this.pendingSource = this.pendingSource.substr(0, this.pendingSource.length - ch.length);
if ((0, characters_1.isEol)(ch))
--this.line;
else
--this.column;
}
gatherWhitespace(ch) {
while (ch.length > 1 || (0, characters_1.isWhitespace)(ch)) {
this.collectedSpace += ch;
ch = this.getChar(this.reWhitespace);
}
return ch;
}
/**
 * Gathers text up to the next `<` which really does begin markup. A `<` not followed by a markup-start
 * character is kept as ordinary text.
 *
 * On return, the `<` which ended the text has been consumed, and the character(s) following it have been
 * put back for the markup states to examine.
 *
 * @returns The gathered text ('' if there was none).
 */
gatherText() {
const text = [];
let ch;
this.pendingSource = '';
while ((ch = this.getChar(this.reText))) {
if (ch === '<') {
const ch2 = this.getChar();
// With the emptyEndTag option off, `</` only counts as markup when a markup-start character other than
// `/` follows; otherwise all three characters are kept as text.
if (ch2 === '/' && !this.options.emptyEndTag) {
const ch3 = this.getChar();
if (ch3 !== '/' && (0, characters_1.isMarkupStart)(ch3)) {
// Put back in reverse order, so that ch2 is read first.
this.putBack(ch3);
this.putBack(ch2);
break;
}
else
text.push(ch + ch2 + ch3);
}
else if ((0, characters_1.isMarkupStart)(ch2)) {
this.putBack(ch2);
break;
}
else
text.push(ch + ch2);
}
else
text.push(ch);
}
return text.join('');
}
gatherTagName(init = '') {
const tag = [init];
let ch;
while ((0, characters_1.isPCENChar)(ch = this.getChar(), !this.xmlMode))
tag.push(ch);
this.currentTag = tag.join('');
this.currentTagLc = this.xmlMode ? this.currentTag : this.currentTag.toLowerCase();
this.putBack(ch);
}
gatherInvalidEndTagEnding() {
let ch;
while ((ch = this.getChar()) && ch !== '>') { }
}
gatherAttributeName(init = '') {
this.attribute = init;
let ch;
while ((0, characters_1.isAttributeNameChar)(ch = this.getChar(RE_ATTRIB_NAME), !this.xmlMode))
this.attribute += ch;
this.putBack(ch);
}
/**
 * Gathers an attribute value.
 *
 * A quoted value runs to the matching quote, which is consumed. An unquoted value runs to whitespace, `>`,
 * or end of input; the character which ended it is put back.
 *
 * @param quote Opening quote character, or '' for an unquoted value.
 * @param init Character(s) of the value which have already been read.
 * @returns `[value, terminated]`, where `terminated` is false only for a quoted value whose closing quote
 *   was never found.
 */
gatherAttributeValue(quote, init = '') {
let value = init;
let ch;
// Tracks whether the most recently gathered piece of the value was a lone `/`.
let afterSlash = false;
while ((ch = this.getChar(this.reAttribValue[quote])) &&
ch !== quote && (quote || (!(0, characters_1.isWhitespace)(ch) && ch !== '>'))) {
value += ch;
afterSlash = ch === '/';
}
if (!quote) {
this.putBack(ch);
// A `/` at the very end of an unquoted value is handed back rather than kept as part of the value, so
// that the attribute-start state can see it (e.g. as the `/` of `/>`).
if (afterSlash) {
this.putBack('/');
value = value.substr(0, value.length - 1);
}
}
return [value, !quote || ch === quote];
}
gatherComment(init = '') {
const comment = [init];
let stage = (init.endsWith('-') ? 1 : 0);
let ch;
while ((ch = this.getChar(stage === 0 ? this.reComment : undefined))) {
comment.push(ch);
if (stage === 0 && ch === '-')
stage = 1;
else if (stage === 1 && ch === '-')
stage = 2;
else if (stage === 2 && ch === '>') {
const cmt = comment.join('');
return [cmt.substr(0, cmt.length - 3), true];
}
else
stage = 0;
}
return [comment.join(''), false];
}
/**
 * Gathers the body of a declaration (`<!...>`) or processing instruction (`<?...>`) up to its closing `>`,
 * optionally recognizing a CDATA section, which instead ends only at `]]>`.
 *
 * @param init Text which has already been read (possibly just the closing `>` itself).
 * @param checkForCData When true, the first seven characters are compared with `[CDATA[`.
 * @returns `[content, terminated, isCData]`. For CDATA, `content` excludes the `[CDATA[` prefix and the
 *   closing `]]`. `terminated` is false if the input ended first.
 */
gatherDeclarationOrProcessing(init = '', checkForCData) {
// Nothing at all between the opening and the `>`.
if (init === '>')
return ['', true, false];
let content = init;
let ch;
let cdataDetected = false;
// While CDATA is still a possibility, read one character at a time so that the seven-character check
// below gets the chance to see a content length of exactly 7.
while ((ch = this.getChar(checkForCData ? undefined : this.reDeclaration))) {
if (checkForCData && content.length === 7) {
cdataDetected = (content === '[CDATA[');
checkForCData = false;
}
// Inside CDATA a `>` only terminates when it completes `]]>`.
if (ch === '>' && (!cdataDetected || content.endsWith(']]')))
return [cdataDetected ? content.substring(7, content.length - 2) : content, true, cdataDetected];
content += ch;
}
return [cdataDetected ? content.substr(7) : content, false, cdataDetected];
}
gatherUntilEndTag(endTag, init = '') {
const ender = '</' + endTag;
const len = ender.length;
let content = init;
let endStage = ender.startsWith(init) ? init.length : 0;
let ch;
while ((ch = this.getChar(endStage === 0 ? this.reText : undefined))) {
content += ch;
if (endStage >= len && ch === '>')
return [content.substr(0, content.length - endStage - 1), content.substr(content.length - endStage - 1), true];
else if (endStage >= len && (0, characters_1.isWhitespace)(ch))
++endStage;
else if (endStage < len && ch.toLowerCase() === ender.charAt(endStage)) {
if (endStage === 0) {
this.markupLine = this.line;
this.markupColumn = this.column;
}
++endStage;
}
else
endStage = 0;
}
return [content, '', false];
}
adjustOptions() {
if (this.options.eol) {
switch (this.options.eol) {
case true:
case '\n':
case 'n':
case 'lf':
this.options.eol = '\n';
break;
case '\r':
case 'r':
case 'cr':
this.options.eol = '\r';
break;
case '\r\n':
case 'rn':
case 'crlf':
this.options.eol = '\r\n';
break;
default: this.options.eol = false;
}
}
this.fast = this.options.fast;
this.tabSize = this.options.tabSize;
if (this.fast) {
this.reWhitespace = RE_WHITESPACE_FAST;
this.reText = RE_TEXT_FAST;
this.reComment = RE_COMMENT_FAST;
this.reDeclaration = RE_DECLARATION_FAST;
}
else {
this.reWhitespace = RE_WHITESPACE;
this.reText = RE_TEXT;
this.reComment = RE_COMMENT;
this.reDeclaration = RE_DECLARATION;
}
}
}
exports.HtmlParser = HtmlParser;
// States in which the next character read may begin a run of text; parseLoop() records the text's start
// position whenever it reads a character in one of these states.
HtmlParser.TEXT_STARTERS = new Set([State.OUTSIDE_MARKUP, State.IN_SCRIPT_ELEMENT, State.IN_STYLE_ELEMENT, State.IN_TEXT_AREA_ELEMENT]);
// Option values used for anything the caller of the constructor doesn't specify. (The constructor changes
// the `eol` default to false when the caller requests `fast`.)
HtmlParser.DEFAULT_OPTIONS = {
emptyEndTag: true,
eol: '\n',
fast: false,
tabSize: 8,
xmlMode: false
};
// Element name for each of the raw-text states, as needed by parseLoop() to find the matching end tag.
HtmlParser.tagForState = {
[State.IN_SCRIPT_ELEMENT]: 'script',
[State.IN_STYLE_ELEMENT]: 'style',
[State.IN_TEXT_AREA_ELEMENT]: 'textarea',
};
//# sourceMappingURL=html-parser.js.map