UNPKG

substance

Version:

Substance is a JavaScript library for web-based content editing. It provides building blocks for realizing custom text editors and web-based publishing system. It is developed to power our online editing platform [Substance](http://substance.io).

1,340 lines (1,171 loc) 35.4 kB
import { decode_codepoint } from './entities'; import { entities } from './entities'; import { legacy } from './entities'; import { xml } from './entities'; var Tokenizer_1 = Tokenizer; var i = 0, TEXT = i++, BEFORE_TAG_NAME = i++, //after < IN_TAG_NAME = i++, IN_SELF_CLOSING_TAG = i++, BEFORE_CLOSING_TAG_NAME = i++, IN_CLOSING_TAG_NAME = i++, AFTER_CLOSING_TAG_NAME = i++, //attributes BEFORE_ATTRIBUTE_NAME = i++, IN_ATTRIBUTE_NAME = i++, AFTER_ATTRIBUTE_NAME = i++, BEFORE_ATTRIBUTE_VALUE = i++, IN_ATTRIBUTE_VALUE_DQ = i++, // " IN_ATTRIBUTE_VALUE_SQ = i++, // ' IN_ATTRIBUTE_VALUE_NQ = i++, //declarations BEFORE_DECLARATION = i++, // ! IN_DECLARATION = i++, //processing instructions IN_PROCESSING_INSTRUCTION = i++, // ? //comments BEFORE_COMMENT = i++, IN_COMMENT = i++, AFTER_COMMENT_1 = i++, AFTER_COMMENT_2 = i++, //cdata BEFORE_CDATA_1 = i++, // [ BEFORE_CDATA_2 = i++, // C BEFORE_CDATA_3 = i++, // D BEFORE_CDATA_4 = i++, // A BEFORE_CDATA_5 = i++, // T BEFORE_CDATA_6 = i++, // A IN_CDATA = i++, // [ AFTER_CDATA_1 = i++, // ] AFTER_CDATA_2 = i++, // ] //special tags BEFORE_SPECIAL = i++, //S BEFORE_SPECIAL_END = i++, //S BEFORE_SCRIPT_1 = i++, //C BEFORE_SCRIPT_2 = i++, //R BEFORE_SCRIPT_3 = i++, //I BEFORE_SCRIPT_4 = i++, //P BEFORE_SCRIPT_5 = i++, //T AFTER_SCRIPT_1 = i++, //C AFTER_SCRIPT_2 = i++, //R AFTER_SCRIPT_3 = i++, //I AFTER_SCRIPT_4 = i++, //P AFTER_SCRIPT_5 = i++, //T BEFORE_STYLE_1 = i++, //T BEFORE_STYLE_2 = i++, //Y BEFORE_STYLE_3 = i++, //L BEFORE_STYLE_4 = i++, //E AFTER_STYLE_1 = i++, //T AFTER_STYLE_2 = i++, //Y AFTER_STYLE_3 = i++, //L AFTER_STYLE_4 = i++, //E BEFORE_ENTITY = i++, //& BEFORE_NUMERIC_ENTITY = i++, //# IN_NAMED_ENTITY = i++, IN_NUMERIC_ENTITY = i++, IN_HEX_ENTITY = i++, //X j = 0, SPECIAL_NONE = j++, SPECIAL_SCRIPT = j++, SPECIAL_STYLE = j++; function whitespace(c){ return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r"; } function characterState(char, SUCCESS){ return function(c){ if(c === char) this._state = SUCCESS; }; } function ifElseState(upper, SUCCESS, FAILURE){ var lower = upper.toLowerCase(); if(upper === lower){ return function(c){ if(c === lower){ this._state = SUCCESS; } else { this._state = FAILURE; this._index--; } }; } else { return function(c){ if(c === lower || c === upper){ this._state = SUCCESS; } else { this._state = FAILURE; this._index--; } }; } } function consumeSpecialNameChar(upper, NEXT_STATE){ var lower = upper.toLowerCase(); return function(c){ if(c === lower || c === upper){ this._state = NEXT_STATE; } else { this._state = IN_TAG_NAME; this._index--; //consume the token again } }; } function Tokenizer(options, cbs){ this._state = TEXT; this._buffer = ""; this._sectionStart = 0; this._index = 0; this._bufferOffset = 0; //chars removed from _buffer this._baseState = TEXT; this._special = SPECIAL_NONE; this._cbs = cbs; this._running = true; this._ended = false; this._xmlMode = !!(options && options.xmlMode); this._decodeEntities = !!(options && options.decodeEntities); } Tokenizer.prototype._stateText = function(c){ if(c === "<"){ if(this._index > this._sectionStart){ this._cbs.ontext(this._getSection()); } this._state = BEFORE_TAG_NAME; this._sectionStart = this._index; } else if(this._decodeEntities && this._special === SPECIAL_NONE && c === "&"){ if(this._index > this._sectionStart){ this._cbs.ontext(this._getSection()); } this._baseState = TEXT; this._state = BEFORE_ENTITY; this._sectionStart = this._index; } }; Tokenizer.prototype._stateBeforeTagName = function(c){ if(c === "/"){ this._state = BEFORE_CLOSING_TAG_NAME; } else if(c === "<"){ this._cbs.ontext(this._getSection()); this._sectionStart = this._index; } else if(c === ">" || this._special !== SPECIAL_NONE || whitespace(c)) { this._state = TEXT; } else if(c === "!"){ this._state = BEFORE_DECLARATION; this._sectionStart = this._index + 1; } else if(c === "?"){ this._state = IN_PROCESSING_INSTRUCTION; this._sectionStart = this._index + 1; } else { this._state = (!this._xmlMode && (c === "s" || c === "S")) ? BEFORE_SPECIAL : IN_TAG_NAME; this._sectionStart = this._index; } }; Tokenizer.prototype._stateInTagName = function(c){ if(c === "/" || c === ">" || whitespace(c)){ this._emitToken("onopentagname"); this._state = BEFORE_ATTRIBUTE_NAME; this._index--; } }; Tokenizer.prototype._stateBeforeCloseingTagName = function(c){ if(whitespace(c)); else if(c === ">"){ this._state = TEXT; } else if(this._special !== SPECIAL_NONE){ if(c === "s" || c === "S"){ this._state = BEFORE_SPECIAL_END; } else { this._state = TEXT; this._index--; } } else { this._state = IN_CLOSING_TAG_NAME; this._sectionStart = this._index; } }; Tokenizer.prototype._stateInCloseingTagName = function(c){ if(c === ">" || whitespace(c)){ this._emitToken("onclosetag"); this._state = AFTER_CLOSING_TAG_NAME; this._index--; } }; Tokenizer.prototype._stateAfterCloseingTagName = function(c){ //skip everything until ">" if(c === ">"){ this._state = TEXT; this._sectionStart = this._index + 1; } }; Tokenizer.prototype._stateBeforeAttributeName = function(c){ if(c === ">"){ this._cbs.onopentagend(); this._state = TEXT; this._sectionStart = this._index + 1; } else if(c === "/"){ this._state = IN_SELF_CLOSING_TAG; } else if(!whitespace(c)){ this._state = IN_ATTRIBUTE_NAME; this._sectionStart = this._index; } }; Tokenizer.prototype._stateInSelfClosingTag = function(c){ if(c === ">"){ this._cbs.onselfclosingtag(); this._state = TEXT; this._sectionStart = this._index + 1; } else if(!whitespace(c)){ this._state = BEFORE_ATTRIBUTE_NAME; this._index--; } }; Tokenizer.prototype._stateInAttributeName = function(c){ if(c === "=" || c === "/" || c === ">" || whitespace(c)){ this._cbs.onattribname(this._getSection()); this._sectionStart = -1; this._state = AFTER_ATTRIBUTE_NAME; this._index--; } }; Tokenizer.prototype._stateAfterAttributeName = function(c){ if(c === "="){ this._state = BEFORE_ATTRIBUTE_VALUE; } else if(c === "/" || c === ">"){ this._cbs.onattribend(); this._state = BEFORE_ATTRIBUTE_NAME; this._index--; } else if(!whitespace(c)){ this._cbs.onattribend(); this._state = IN_ATTRIBUTE_NAME; this._sectionStart = this._index; } }; Tokenizer.prototype._stateBeforeAttributeValue = function(c){ if(c === "\""){ this._state = IN_ATTRIBUTE_VALUE_DQ; this._sectionStart = this._index + 1; } else if(c === "'"){ this._state = IN_ATTRIBUTE_VALUE_SQ; this._sectionStart = this._index + 1; } else if(!whitespace(c)){ this._state = IN_ATTRIBUTE_VALUE_NQ; this._sectionStart = this._index; this._index--; //reconsume token } }; Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function(c){ if(c === "\""){ this._emitToken("onattribdata"); this._cbs.onattribend(); this._state = BEFORE_ATTRIBUTE_NAME; } else if(this._decodeEntities && c === "&"){ this._emitToken("onattribdata"); this._baseState = this._state; this._state = BEFORE_ENTITY; this._sectionStart = this._index; } }; Tokenizer.prototype._stateInAttributeValueSingleQuotes = function(c){ if(c === "'"){ this._emitToken("onattribdata"); this._cbs.onattribend(); this._state = BEFORE_ATTRIBUTE_NAME; } else if(this._decodeEntities && c === "&"){ this._emitToken("onattribdata"); this._baseState = this._state; this._state = BEFORE_ENTITY; this._sectionStart = this._index; } }; Tokenizer.prototype._stateInAttributeValueNoQuotes = function(c){ if(whitespace(c) || c === ">"){ this._emitToken("onattribdata"); this._cbs.onattribend(); this._state = BEFORE_ATTRIBUTE_NAME; this._index--; } else if(this._decodeEntities && c === "&"){ this._emitToken("onattribdata"); this._baseState = this._state; this._state = BEFORE_ENTITY; this._sectionStart = this._index; } }; Tokenizer.prototype._stateBeforeDeclaration = function(c){ this._state = c === "[" ? BEFORE_CDATA_1 : c === "-" ? BEFORE_COMMENT : IN_DECLARATION; }; Tokenizer.prototype._stateInDeclaration = function(c){ if(c === ">"){ this._cbs.ondeclaration(this._getSection()); this._state = TEXT; this._sectionStart = this._index + 1; } }; Tokenizer.prototype._stateInProcessingInstruction = function(c){ if(c === ">"){ this._cbs.onprocessinginstruction(this._getSection()); this._state = TEXT; this._sectionStart = this._index + 1; } }; Tokenizer.prototype._stateBeforeComment = function(c){ if(c === "-"){ this._state = IN_COMMENT; this._sectionStart = this._index + 1; } else { this._state = IN_DECLARATION; } }; Tokenizer.prototype._stateInComment = function(c){ if(c === "-") this._state = AFTER_COMMENT_1; }; Tokenizer.prototype._stateAfterComment1 = function(c){ if(c === "-"){ this._state = AFTER_COMMENT_2; } else { this._state = IN_COMMENT; } }; Tokenizer.prototype._stateAfterComment2 = function(c){ if(c === ">"){ //remove 2 trailing chars this._cbs.oncomment(this._buffer.substring(this._sectionStart, this._index - 2)); this._state = TEXT; this._sectionStart = this._index + 1; } else if(c !== "-"){ this._state = IN_COMMENT; } // else: stay in AFTER_COMMENT_2 (`--->`) }; Tokenizer.prototype._stateBeforeCdata1 = ifElseState("C", BEFORE_CDATA_2, IN_DECLARATION); Tokenizer.prototype._stateBeforeCdata2 = ifElseState("D", BEFORE_CDATA_3, IN_DECLARATION); Tokenizer.prototype._stateBeforeCdata3 = ifElseState("A", BEFORE_CDATA_4, IN_DECLARATION); Tokenizer.prototype._stateBeforeCdata4 = ifElseState("T", BEFORE_CDATA_5, IN_DECLARATION); Tokenizer.prototype._stateBeforeCdata5 = ifElseState("A", BEFORE_CDATA_6, IN_DECLARATION); Tokenizer.prototype._stateBeforeCdata6 = function(c){ if(c === "["){ this._state = IN_CDATA; this._sectionStart = this._index + 1; } else { this._state = IN_DECLARATION; this._index--; } }; Tokenizer.prototype._stateInCdata = function(c){ if(c === "]") this._state = AFTER_CDATA_1; }; Tokenizer.prototype._stateAfterCdata1 = characterState("]", AFTER_CDATA_2); Tokenizer.prototype._stateAfterCdata2 = function(c){ if(c === ">"){ //remove 2 trailing chars this._cbs.oncdata(this._buffer.substring(this._sectionStart, this._index - 2)); this._state = TEXT; this._sectionStart = this._index + 1; } else if(c !== "]") { this._state = IN_CDATA; } //else: stay in AFTER_CDATA_2 (`]]]>`) }; Tokenizer.prototype._stateBeforeSpecial = function(c){ if(c === "c" || c === "C"){ this._state = BEFORE_SCRIPT_1; } else if(c === "t" || c === "T"){ this._state = BEFORE_STYLE_1; } else { this._state = IN_TAG_NAME; this._index--; //consume the token again } }; Tokenizer.prototype._stateBeforeSpecialEnd = function(c){ if(this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")){ this._state = AFTER_SCRIPT_1; } else if(this._special === SPECIAL_STYLE && (c === "t" || c === "T")){ this._state = AFTER_STYLE_1; } else this._state = TEXT; }; Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar("R", BEFORE_SCRIPT_2); Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar("I", BEFORE_SCRIPT_3); Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar("P", BEFORE_SCRIPT_4); Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar("T", BEFORE_SCRIPT_5); Tokenizer.prototype._stateBeforeScript5 = function(c){ if(c === "/" || c === ">" || whitespace(c)){ this._special = SPECIAL_SCRIPT; } this._state = IN_TAG_NAME; this._index--; //consume the token again }; Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT); Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT); Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT); Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT); Tokenizer.prototype._stateAfterScript5 = function(c){ if(c === ">" || whitespace(c)){ this._special = SPECIAL_NONE; this._state = IN_CLOSING_TAG_NAME; this._sectionStart = this._index - 6; this._index--; //reconsume the token } else this._state = TEXT; }; Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar("Y", BEFORE_STYLE_2); Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar("L", BEFORE_STYLE_3); Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar("E", BEFORE_STYLE_4); Tokenizer.prototype._stateBeforeStyle4 = function(c){ if(c === "/" || c === ">" || whitespace(c)){ this._special = SPECIAL_STYLE; } this._state = IN_TAG_NAME; this._index--; //consume the token again }; Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT); Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT); Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT); Tokenizer.prototype._stateAfterStyle4 = function(c){ if(c === ">" || whitespace(c)){ this._special = SPECIAL_NONE; this._state = IN_CLOSING_TAG_NAME; this._sectionStart = this._index - 5; this._index--; //reconsume the token } else this._state = TEXT; }; Tokenizer.prototype._stateBeforeEntity = ifElseState("#", BEFORE_NUMERIC_ENTITY, IN_NAMED_ENTITY); Tokenizer.prototype._stateBeforeNumericEntity = ifElseState("X", IN_HEX_ENTITY, IN_NUMERIC_ENTITY); //for entities terminated with a semicolon Tokenizer.prototype._parseNamedEntityStrict = function(){ //offset = 1 if(this._sectionStart + 1 < this._index){ var entity = this._buffer.substring(this._sectionStart + 1, this._index), map = this._xmlMode ? xml : entities; if(map.hasOwnProperty(entity)){ this._emitPartial(map[entity]); this._sectionStart = this._index + 1; } } }; //parses legacy entities (without trailing semicolon) Tokenizer.prototype._parseLegacyEntity = function(){ var start = this._sectionStart + 1, limit = this._index - start; if(limit > 6) limit = 6; //the max length of legacy entities is 6 while(limit >= 2){ //the min length of legacy entities is 2 var entity = this._buffer.substr(start, limit); if(legacy.hasOwnProperty(entity)){ this._emitPartial(legacy[entity]); this._sectionStart += limit + 1; return; } else { limit--; } } }; Tokenizer.prototype._stateInNamedEntity = function(c){ if(c === ";"){ this._parseNamedEntityStrict(); if(this._sectionStart + 1 < this._index && !this._xmlMode){ this._parseLegacyEntity(); } this._state = this._baseState; } else if((c < "a" || c > "z") && (c < "A" || c > "Z") && (c < "0" || c > "9")){ if(this._xmlMode); else if(this._sectionStart + 1 === this._index); else if(this._baseState !== TEXT){ if(c !== "="){ this._parseNamedEntityStrict(); } } else { this._parseLegacyEntity(); } this._state = this._baseState; this._index--; } }; Tokenizer.prototype._decodeNumericEntity = function(offset, base){ var sectionStart = this._sectionStart + offset; if(sectionStart !== this._index){ //parse entity var entity = this._buffer.substring(sectionStart, this._index); var parsed = parseInt(entity, base); this._emitPartial(decode_codepoint(parsed)); this._sectionStart = this._index; } else { this._sectionStart--; } this._state = this._baseState; }; Tokenizer.prototype._stateInNumericEntity = function(c){ if(c === ";"){ this._decodeNumericEntity(2, 10); this._sectionStart++; } else if(c < "0" || c > "9"){ if(!this._xmlMode){ this._decodeNumericEntity(2, 10); } else { this._state = this._baseState; } this._index--; } }; Tokenizer.prototype._stateInHexEntity = function(c){ if(c === ";"){ this._decodeNumericEntity(3, 16); this._sectionStart++; } else if((c < "a" || c > "f") && (c < "A" || c > "F") && (c < "0" || c > "9")){ if(!this._xmlMode){ this._decodeNumericEntity(3, 16); } else { this._state = this._baseState; } this._index--; } }; Tokenizer.prototype._cleanup = function (){ if(this._sectionStart < 0){ this._buffer = ""; this._bufferOffset += this._index; this._index = 0; } else if(this._running){ if(this._state === TEXT){ if(this._sectionStart !== this._index){ this._cbs.ontext(this._buffer.substr(this._sectionStart)); } this._buffer = ""; this._bufferOffset += this._index; this._index = 0; } else if(this._sectionStart === this._index){ //the section just started this._buffer = ""; this._bufferOffset += this._index; this._index = 0; } else { //remove everything unnecessary this._buffer = this._buffer.substr(this._sectionStart); this._index -= this._sectionStart; this._bufferOffset += this._sectionStart; } this._sectionStart = 0; } }; //TODO make events conditional Tokenizer.prototype.write = function(chunk){ if(this._ended) this._cbs.onerror(Error(".write() after done!")); this._buffer += chunk; this._parse(); }; Tokenizer.prototype._parse = function(){ while(this._index < this._buffer.length && this._running){ var c = this._buffer.charAt(this._index); if(this._state === TEXT) { this._stateText(c); } else if(this._state === BEFORE_TAG_NAME){ this._stateBeforeTagName(c); } else if(this._state === IN_TAG_NAME) { this._stateInTagName(c); } else if(this._state === BEFORE_CLOSING_TAG_NAME){ this._stateBeforeCloseingTagName(c); } else if(this._state === IN_CLOSING_TAG_NAME){ this._stateInCloseingTagName(c); } else if(this._state === AFTER_CLOSING_TAG_NAME){ this._stateAfterCloseingTagName(c); } else if(this._state === IN_SELF_CLOSING_TAG){ this._stateInSelfClosingTag(c); } /* * attributes */ else if(this._state === BEFORE_ATTRIBUTE_NAME){ this._stateBeforeAttributeName(c); } else if(this._state === IN_ATTRIBUTE_NAME){ this._stateInAttributeName(c); } else if(this._state === AFTER_ATTRIBUTE_NAME){ this._stateAfterAttributeName(c); } else if(this._state === BEFORE_ATTRIBUTE_VALUE){ this._stateBeforeAttributeValue(c); } else if(this._state === IN_ATTRIBUTE_VALUE_DQ){ this._stateInAttributeValueDoubleQuotes(c); } else if(this._state === IN_ATTRIBUTE_VALUE_SQ){ this._stateInAttributeValueSingleQuotes(c); } else if(this._state === IN_ATTRIBUTE_VALUE_NQ){ this._stateInAttributeValueNoQuotes(c); } /* * declarations */ else if(this._state === BEFORE_DECLARATION){ this._stateBeforeDeclaration(c); } else if(this._state === IN_DECLARATION){ this._stateInDeclaration(c); } /* * processing instructions */ else if(this._state === IN_PROCESSING_INSTRUCTION){ this._stateInProcessingInstruction(c); } /* * comments */ else if(this._state === BEFORE_COMMENT){ this._stateBeforeComment(c); } else if(this._state === IN_COMMENT){ this._stateInComment(c); } else if(this._state === AFTER_COMMENT_1){ this._stateAfterComment1(c); } else if(this._state === AFTER_COMMENT_2){ this._stateAfterComment2(c); } /* * cdata */ else if(this._state === BEFORE_CDATA_1){ this._stateBeforeCdata1(c); } else if(this._state === BEFORE_CDATA_2){ this._stateBeforeCdata2(c); } else if(this._state === BEFORE_CDATA_3){ this._stateBeforeCdata3(c); } else if(this._state === BEFORE_CDATA_4){ this._stateBeforeCdata4(c); } else if(this._state === BEFORE_CDATA_5){ this._stateBeforeCdata5(c); } else if(this._state === BEFORE_CDATA_6){ this._stateBeforeCdata6(c); } else if(this._state === IN_CDATA){ this._stateInCdata(c); } else if(this._state === AFTER_CDATA_1){ this._stateAfterCdata1(c); } else if(this._state === AFTER_CDATA_2){ this._stateAfterCdata2(c); } /* * special tags */ else if(this._state === BEFORE_SPECIAL){ this._stateBeforeSpecial(c); } else if(this._state === BEFORE_SPECIAL_END){ this._stateBeforeSpecialEnd(c); } /* * script */ else if(this._state === BEFORE_SCRIPT_1){ this._stateBeforeScript1(c); } else if(this._state === BEFORE_SCRIPT_2){ this._stateBeforeScript2(c); } else if(this._state === BEFORE_SCRIPT_3){ this._stateBeforeScript3(c); } else if(this._state === BEFORE_SCRIPT_4){ this._stateBeforeScript4(c); } else if(this._state === BEFORE_SCRIPT_5){ this._stateBeforeScript5(c); } else if(this._state === AFTER_SCRIPT_1){ this._stateAfterScript1(c); } else if(this._state === AFTER_SCRIPT_2){ this._stateAfterScript2(c); } else if(this._state === AFTER_SCRIPT_3){ this._stateAfterScript3(c); } else if(this._state === AFTER_SCRIPT_4){ this._stateAfterScript4(c); } else if(this._state === AFTER_SCRIPT_5){ this._stateAfterScript5(c); } /* * style */ else if(this._state === BEFORE_STYLE_1){ this._stateBeforeStyle1(c); } else if(this._state === BEFORE_STYLE_2){ this._stateBeforeStyle2(c); } else if(this._state === BEFORE_STYLE_3){ this._stateBeforeStyle3(c); } else if(this._state === BEFORE_STYLE_4){ this._stateBeforeStyle4(c); } else if(this._state === AFTER_STYLE_1){ this._stateAfterStyle1(c); } else if(this._state === AFTER_STYLE_2){ this._stateAfterStyle2(c); } else if(this._state === AFTER_STYLE_3){ this._stateAfterStyle3(c); } else if(this._state === AFTER_STYLE_4){ this._stateAfterStyle4(c); } /* * entities */ else if(this._state === BEFORE_ENTITY){ this._stateBeforeEntity(c); } else if(this._state === BEFORE_NUMERIC_ENTITY){ this._stateBeforeNumericEntity(c); } else if(this._state === IN_NAMED_ENTITY){ this._stateInNamedEntity(c); } else if(this._state === IN_NUMERIC_ENTITY){ this._stateInNumericEntity(c); } else if(this._state === IN_HEX_ENTITY){ this._stateInHexEntity(c); } else { this._cbs.onerror(Error("unknown _state"), this._state); } this._index++; } this._cleanup(); }; Tokenizer.prototype.pause = function(){ this._running = false; }; Tokenizer.prototype.resume = function(){ this._running = true; if(this._index < this._buffer.length){ this._parse(); } if(this._ended){ this._finish(); } }; Tokenizer.prototype.end = function(chunk){ if(this._ended) this._cbs.onerror(Error(".end() after done!")); if(chunk) this.write(chunk); this._ended = true; if(this._running) this._finish(); }; Tokenizer.prototype._finish = function(){ //if there is remaining data, emit it in a reasonable way if(this._sectionStart < this._index){ this._handleTrailingData(); } this._cbs.onend(); }; Tokenizer.prototype._handleTrailingData = function(){ var data = this._buffer.substr(this._sectionStart); if(this._state === IN_CDATA || this._state === AFTER_CDATA_1 || this._state === AFTER_CDATA_2){ this._cbs.oncdata(data); } else if(this._state === IN_COMMENT || this._state === AFTER_COMMENT_1 || this._state === AFTER_COMMENT_2){ this._cbs.oncomment(data); } else if(this._state === IN_NAMED_ENTITY && !this._xmlMode){ this._parseLegacyEntity(); if(this._sectionStart < this._index){ this._state = this._baseState; this._handleTrailingData(); } } else if(this._state === IN_NUMERIC_ENTITY && !this._xmlMode){ this._decodeNumericEntity(2, 10); if(this._sectionStart < this._index){ this._state = this._baseState; this._handleTrailingData(); } } else if(this._state === IN_HEX_ENTITY && !this._xmlMode){ this._decodeNumericEntity(3, 16); if(this._sectionStart < this._index){ this._state = this._baseState; this._handleTrailingData(); } } else if( this._state !== IN_TAG_NAME && this._state !== BEFORE_ATTRIBUTE_NAME && this._state !== BEFORE_ATTRIBUTE_VALUE && this._state !== AFTER_ATTRIBUTE_NAME && this._state !== IN_ATTRIBUTE_NAME && this._state !== IN_ATTRIBUTE_VALUE_SQ && this._state !== IN_ATTRIBUTE_VALUE_DQ && this._state !== IN_ATTRIBUTE_VALUE_NQ && this._state !== IN_CLOSING_TAG_NAME ){ this._cbs.ontext(data); } //else, ignore remaining data //TODO add a way to remove current tag }; Tokenizer.prototype.reset = function(){ Tokenizer.call(this, {xmlMode: this._xmlMode, decodeEntities: this._decodeEntities}, this._cbs); }; Tokenizer.prototype.getAbsoluteIndex = function(){ return this._bufferOffset + this._index; }; Tokenizer.prototype._getSection = function(){ return this._buffer.substring(this._sectionStart, this._index); }; Tokenizer.prototype._emitToken = function(name){ let section = this._getSection(); switch (name) { case 'onattribute': this._cbs.onattribute(section); break case 'onattribdata': this._cbs.onattribdata(section); break case 'oncdatastart': this._cbs.oncdatastart(section); break case 'oncdataend': this._cbs.oncdataend(section); break case 'ontext': this._cbs.ontext(section); break case 'onprocessinginstruction': this._cbs.onprocessinginstruction(section); break case 'oncomment': this._cbs.oncomment(section); break case 'oncommentend': this._cbs.oncommentend(section); break case 'onclosetag': this._cbs.onclosetag(section); break case 'onopentag': this._cbs.onopentag(section); break case 'onopentagname': this._cbs.onopentagname(section); break case 'onerror': this._cbs.onerror(section); break case 'onend': this._cbs.onend(section); break default: throw new Error('Unsupported event: ' + name) } this._sectionStart = -1; }; Tokenizer.prototype._emitPartial = function(value){ if(this._baseState !== TEXT){ this._cbs.onattribdata(value); //TODO implement the new event } else { this._cbs.ontext(value); } }; var Tokenizer$1 = Tokenizer_1; /* Options: xmlMode: Disables the special behavior for script/style tags (false by default) lowerCaseAttributeNames: call .toLowerCase for each attribute name (true if xmlMode is `false`) lowerCaseTags: call .toLowerCase for each tag name (true if xmlMode is `false`) */ /* Callbacks: oncdataend, oncdatastart, onclosetag, oncomment, oncommentend, onerror, onopentag, onprocessinginstruction, onreset, ontext */ var formTags = new Set([ 'input', 'option', 'optgroup', 'select', 'button', 'datalist', 'textarea' ]); var openImpliesClose = new Map([ ['tr' , new Set('tr','th','td')], ['th' , new Set('th')], ['td' , new Set('thead','th','td')], ['body' , new Set('head','link','script')], ['li' , new Set('li')], ['p' , new Set('p')], ['h1' , new Set('p')], ['h2' , new Set('p')], ['h3' , new Set('p')], ['h4' , new Set('p')], ['h5' , new Set('p')], ['h6' , new Set('p')], ['select' , formTags], ['input' , formTags], ['output' , formTags], ['button' , formTags], ['datalist', formTags], ['textarea', formTags], ['option' , new Set('option')], ['optgroup', new Set('optgroup')] ]); var voidElements = new Set([ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr', //common self closing svg elements 'path', 'circle', 'ellipse', 'line', 'rect', 'use', 'stop', 'polyline', 'polygon' ]); var re_nameEnd = /\s|\//; function Parser(cbs, options){ this._options = options || {}; this._cbs = cbs || {}; this._tagname = ""; this._attribname = ""; this._attribvalue = ""; this._attribs = null; this._stack = []; this.startIndex = 0; this.endIndex = null; this._lowerCaseTagNames = this._options.lowerCaseTags || !this._options.xmlMode; this._lowerCaseAttributeNames = this._options.lowerCaseAttributeNames || !this._options.xmlMode; if(this._options.Tokenizer) { Tokenizer$1 = this._options.Tokenizer; } this._tokenizer = new Tokenizer$1(this._options, this); if(this._cbs.onparserinit) this._cbs.onparserinit(this); } Parser.prototype._updatePosition = function(initialOffset){ if(this.endIndex === null){ if(this._tokenizer._sectionStart <= initialOffset){ this.startIndex = 0; } else { this.startIndex = this._tokenizer._sectionStart - initialOffset; } } else this.startIndex = this.endIndex + 1; this.endIndex = this._tokenizer.getAbsoluteIndex(); }; //Tokenizer event handlers Parser.prototype.ontext = function(data){ this._updatePosition(1); this.endIndex--; if(this._cbs.ontext) this._cbs.ontext(data); }; Parser.prototype.onopentagname = function(name){ if(this._lowerCaseTagNames){ name = name.toLowerCase(); } this._tagname = name; if(!this._options.xmlMode && openImpliesClose.has(name)) { while (this._stack.length > 0) { var el = this._stack[this._stack.length - 1]; if (openImpliesClose.get(name).has(el)) { this.onclosetag(el); } else { break } } } if(this._options.xmlMode || !(voidElements.has(name))){ this._stack.push(name); } if(this._cbs.onopentagname) this._cbs.onopentagname(name); if(this._cbs.onopentag) this._attribs = {}; }; Parser.prototype.onopentagend = function(){ this._updatePosition(1); if(this._attribs){ if(this._cbs.onopentag) this._cbs.onopentag(this._tagname, this._attribs); this._attribs = null; } if(!this._options.xmlMode && this._cbs.onclosetag && voidElements.has(this._tagname)){ this._cbs.onclosetag(this._tagname); } this._tagname = ""; }; Parser.prototype.onclosetag = function(name){ this._updatePosition(1); if(this._lowerCaseTagNames){ name = name.toLowerCase(); } // NOTE: the original implementation allows 'voidElements' in XML, such as in `<input>`, which // IMO is not valid. Only self-closing tags are allowd, such as in `<foo />`, but actually only // if it is declared as EMPTY in the XSD if(this._options.xmlMode) { const stack = this._stack; let last = stack.pop(); while(last !== name) { if(this._cbs.onerror) { this._cbs.onerror("Unclosed tag <"+last+">"); } last = stack.pop(); } this.onopentagend(); if(this._cbs.onclosetag) { this._cbs.onclosetag(last); } } else { if(this._stack.length && (!(voidElements.has(name)))) { let pos = this._stack.lastIndexOf(name); if(pos !== -1){ if(this._cbs.onclosetag){ pos = this._stack.length - pos; while(pos--) this._cbs.onclosetag(this._stack.pop()); } else this._stack.length = pos; } else if(name === "p"){ this.onopentagname(name); this._closeCurrentTag(); } } else if(name === "br" || name === "p"){ this.onopentagname(name); this._closeCurrentTag(); } } }; Parser.prototype.onselfclosingtag = function(){ if(this._options.xmlMode || this._options.recognizeSelfClosing){ this._closeCurrentTag(); } else { this.onopentagend(); } }; Parser.prototype._closeCurrentTag = function(){ var name = this._tagname; this.onopentagend(); //self-closing tags will be on the top of the stack //(cheaper check than in onclosetag) if(this._stack[this._stack.length - 1] === name){ if(this._cbs.onclosetag){ this._cbs.onclosetag(name); } this._stack.pop(); } }; Parser.prototype.onattribname = function(name){ if(this._lowerCaseAttributeNames){ name = name.toLowerCase(); } this._attribname = name; }; Parser.prototype.onattribdata = function(value){ this._attribvalue += value; }; Parser.prototype.onattribend = function(){ if(this._cbs.onattribute) this._cbs.onattribute(this._attribname, this._attribvalue); if( this._attribs && !Object.prototype.hasOwnProperty.call(this._attribs, this._attribname) ){ this._attribs[this._attribname] = this._attribvalue; } this._attribname = ""; this._attribvalue = ""; }; Parser.prototype._getInstructionName = function(value){ var idx = value.search(re_nameEnd), name = idx < 0 ? value : value.substr(0, idx); if(this._lowerCaseTagNames){ name = name.toLowerCase(); } return name; }; Parser.prototype.ondeclaration = function(value){ if(this._cbs.onprocessinginstruction){ var name = this._getInstructionName(value); this._cbs.onprocessinginstruction("!" + name, "!" + value); } }; Parser.prototype.onprocessinginstruction = function(value){ if(this._cbs.onprocessinginstruction){ var name = this._getInstructionName(value); this._cbs.onprocessinginstruction("?" + name, "?" + value); } }; Parser.prototype.oncomment = function(value){ this._updatePosition(4); if(this._cbs.oncomment) this._cbs.oncomment(value); if(this._cbs.oncommentend) this._cbs.oncommentend(); }; Parser.prototype.oncdata = function(value){ this._updatePosition(1); if(this._options.xmlMode || this._options.recognizeCDATA){ if(this._cbs.oncdatastart) this._cbs.oncdatastart(); if(this._cbs.ontext) this._cbs.ontext(value); if(this._cbs.oncdataend) this._cbs.oncdataend(); } else { this.oncomment("[CDATA[" + value + "]]"); } }; Parser.prototype.onerror = function(err){ if(this._cbs.onerror) this._cbs.onerror(err); }; Parser.prototype.onend = function(){ if(this._cbs.onclosetag){ for( var i = this._stack.length; i > 0; this._cbs.onclosetag(this._stack[--i]) ); } if(this._cbs.onend) this._cbs.onend(); }; //Resets the parser to a blank state, ready to parse a new HTML document Parser.prototype.reset = function(){ if(this._cbs.onreset) this._cbs.onreset(); this._tokenizer.reset(); this._tagname = ""; this._attribname = ""; this._attribs = null; this._stack = []; if(this._cbs.onparserinit) this._cbs.onparserinit(this); }; //Parses a complete HTML document and pushes it to the handler Parser.prototype.parseComplete = function(data){ this.reset(); this.end(data); }; Parser.prototype.write = function(chunk){ this._tokenizer.write(chunk); }; Parser.prototype.end = function(chunk){ this._tokenizer.end(chunk); }; Parser.prototype.pause = function(){ this._tokenizer.pause(); }; Parser.prototype.resume = function(){ this._tokenizer.resume(); }; //alias for backwards compat Parser.prototype.parseChunk = Parser.prototype.write; Parser.prototype.done = Parser.prototype.end; var Parser_1 = Parser; // monkey patching Parser_1.prototype.ondeclaration = function(value){ if(this._cbs.ondeclaration){ this._cbs.ondeclaration(value); } else if(this._cbs.onprocessinginstruction){ var name = this._getInstructionName(value); this._cbs.onprocessinginstruction("!" + name, "!" + value); } }; Parser_1.prototype.oncdata = function(value){ this._updatePosition(1); if(this._options.xmlMode || this._options.recognizeCDATA){ if(this._cbs.oncdatastart) this._cbs.oncdatastart(value); // we don't want `ontext` getting called here if(this._cbs.oncdataend) this._cbs.oncdataend(); } else { this.oncomment("[CDATA[" + value + "]]"); } }; export default Parser_1;