wcc.js
Version:
Compiler for wxml and wxss files.
1,267 lines (1,143 loc) • 39.2 kB
JavaScript
/**
这部分逻辑, 改造了https: //github.com/fb55/htmlparser2/blob/master/lib/Tokenizer.js
的token解析逻辑,基于有穷状态的自动机,解析token十分高效!
_emitToken:
onopentagname: 开始标签标签名
onclosetag: 结束标签标签名
onattribdata: 属性值
cbs:
ontext: 文本
onerror: 出错
onopentagend: 开始标签结束
onselfclosingtag: 自闭合的标签结束
onattribname: 属性名
onattribend: 某个属性解析完毕
oncomment: 注释
onend
onattribdata
*/
module.exports = Tokenizer;
// var decodeCodePoint = require("entities/lib/decode_codepoint.js");
// var entityMap = require("entities/maps/entities.json");
// var legacyMap = require("entities/maps/legacy.json");
// var xmlMap = require("entities/maps/xml.json");
const util = require("./util.js");
const error = require("./error.js");
var TEXT = 'TEXT';
var BEFORE_TAG_NAME = 'BEFORE_TAG_NAME'; //after <
var IN_TAG_NAME = 'IN_TAG_NAME';
var IN_SELF_CLOSING_TAG = 'IN_SELF_CLOSING_TAG';
var BEFORE_CLOSING_TAG_NAME = 'BEFORE_CLOSING_TAG_NAME';
var IN_CLOSING_TAG_NAME = 'IN_CLOSING_TAG_NAME';
var AFTER_CLOSING_TAG_NAME = 'AFTER_CLOSING_TAG_NAME';
//attributes
var BEFORE_ATTRIBUTE_NAME = 'BEFORE_ATTRIBUTE_NAME';
var IN_ATTRIBUTE_NAME = 'IN_ATTRIBUTE_NAME';
var AFTER_ATTRIBUTE_NAME = 'AFTER_ATTRIBUTE_NAME';
var BEFORE_ATTRIBUTE_VALUE = 'BEFORE_ATTRIBUTE_VALUE';
var IN_ATTRIBUTE_VALUE_DQ = 'IN_ATTRIBUTE_VALUE_DQ'; // "
var IN_ATTRIBUTE_VALUE_SQ = 'IN_ATTRIBUTE_VALUE_SQ'; // '
var IN_ATTRIBUTE_VALUE_NQ = 'IN_ATTRIBUTE_VALUE_NQ';
//declarations
var BEFORE_DECLARATION = 'BEFORE_DECLARATION'; // !
var IN_DECLARATION = 'IN_DECLARATION';
//processing instructions
var IN_PROCESSING_INSTRUCTION = 'IN_PROCESSING_INSTRUCTION'; // ?
//comments
var BEFORE_COMMENT = 'BEFORE_COMMENT';
var IN_COMMENT = 'IN_COMMENT';
var AFTER_COMMENT_1 = 'AFTER_COMMENT_1';
var AFTER_COMMENT_2 = 'AFTER_COMMENT_2';
//cdata
var BEFORE_CDATA_1 = 'BEFORE_CDATA_1'; // [
var BEFORE_CDATA_2 = 'BEFORE_CDATA_2'; // C
var BEFORE_CDATA_3 = 'BEFORE_CDATA_3'; // D
var BEFORE_CDATA_4 = 'BEFORE_CDATA_4'; // A
var BEFORE_CDATA_5 = 'BEFORE_CDATA_5'; // T
var BEFORE_CDATA_6 = 'BEFORE_CDATA_6'; // A
var IN_CDATA = 'IN_CDATA'; // [
var AFTER_CDATA_1 = 'AFTER_CDATA_1'; // ]
var AFTER_CDATA_2 = 'AFTER_CDATA_2'; // ]
//special tags
var BEFORE_SPECIAL = 'BEFORE_SPECIAL'; //S
var BEFORE_SPECIAL_END = 'BEFORE_SPECIAL_END'; //S
var BEFORE_SCRIPT_1 = 'BEFORE_SCRIPT_1'; //C
var BEFORE_SCRIPT_2 = 'BEFORE_SCRIPT_2'; //R
var BEFORE_SCRIPT_3 = 'BEFORE_SCRIPT_3'; //I
var BEFORE_SCRIPT_4 = 'BEFORE_SCRIPT_4'; //P
var BEFORE_SCRIPT_5 = 'BEFORE_SCRIPT_5'; //T
var AFTER_SCRIPT_1 = 'AFTER_SCRIPT_1'; //C
var AFTER_SCRIPT_2 = 'AFTER_SCRIPT_2'; //R
var AFTER_SCRIPT_3 = 'AFTER_SCRIPT_3'; //I
var AFTER_SCRIPT_4 = 'AFTER_SCRIPT_4'; //P
var AFTER_SCRIPT_5 = 'AFTER_SCRIPT_5'; //T
var BEFORE_STYLE_1 = 'BEFORE_STYLE_1'; //T
var BEFORE_STYLE_2 = 'BEFORE_STYLE_2'; //Y
var BEFORE_STYLE_3 = 'BEFORE_STYLE_3'; //L
var BEFORE_STYLE_4 = 'BEFORE_STYLE_4'; //E
var AFTER_STYLE_1 = 'AFTER_STYLE_1'; //T
var AFTER_STYLE_2 = 'AFTER_STYLE_2'; //Y
var AFTER_STYLE_3 = 'AFTER_STYLE_3'; //L
var AFTER_STYLE_4 = 'AFTER_STYLE_4'; //E
var BEFORE_ENTITY = 'BEFORE_ENTITY'; //&
var BEFORE_NUMERIC_ENTITY = 'BEFORE_NUMERIC_ENTITY'; //#
var IN_NAMED_ENTITY = 'IN_NAMED_ENTITY';
var IN_NUMERIC_ENTITY = 'IN_NUMERIC_ENTITY';
var IN_HEX_ENTITY = 'IN_HEX_ENTITY'; //X
var j = 0;
var SPECIAL_NONE = 'SPECIAL_NONE';
var SPECIAL_SCRIPT = 'SPECIAL_SCRIPT';
var SPECIAL_STYLE = 'SPECIAL_STYLE';
function whitespace(c) {
return c === " " || c === "\n" || c === "\t" || c === "\f" || c === "\r";
}
const LEGAL_START = '_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'.split('');
function legalStart(c) {
return whitespace(c) || (LEGAL_START.indexOf(c) !== -1)
}
function ifElseState(upper, SUCCESS, FAILURE) {
var lower = upper.toLowerCase();
if (upper === lower) {
return function (cItem) {
let c = cItem.c;
if (c === lower) {
this._state = SUCCESS;
} else {
this._state = FAILURE;
this._index--;
}
};
} else {
return function (cItem) {
let c = cItem.c;
if (c === lower || c === upper) {
this._state = SUCCESS;
} else {
this._state = FAILURE;
this._index--;
}
};
}
}
function consumeSpecialNameChar(upper, NEXT_STATE) {
var lower = upper.toLowerCase();
return function (cItem) {
let c = cItem.c;
if (c === lower || c === upper) {
this._state = NEXT_STATE;
} else {
this._state = IN_TAG_NAME;
this._index--; //consume the token again
}
};
}
function Tokenizer(options, cbs) {
this._options = options;
this._state = TEXT;
this._buffer = [];
this._sectionStart = 0;
this._index = 0;
this._bufferOffset = 0; //chars removed from _buffer
this._baseState = TEXT;
this._special = SPECIAL_NONE;
this._cbs = cbs;
this._running = true;
this._ended = false;
this._xmlMode = !!(options && options.xmlMode);
this._decodeEntities = !!(options && options.decodeEntities);
}
Tokenizer.prototype.isDQ = function (index) {
let cItem = this._buffer[index];
let preCItem = this._buffer[index - 1];
if (preCItem) {
if (preCItem.c !== '\\' && cItem.c === '\"') {
return true;
}
} else if (cItem.c === '\"') {
return true;
}
return false;
};
Tokenizer.prototype.isSQ = function (index) {
let cItem = this._buffer[index];
let preCItem = this._buffer[index - 1];
if (preCItem) {
if (preCItem.c !== '\\' && cItem.c === '\'') {
return true;
}
} else if (cItem.c === '\'') {
return true;
}
return false;
};
Tokenizer.prototype._stateText = function (cItem) {
let c = cItem.c;
if (c === "<") {
if (this._index > this._sectionStart) {
this._cbs.ontext(this._getSection());
}
this._state = BEFORE_TAG_NAME;
this._sectionStart = this._index;
} else if (
this._decodeEntities &&
this._special === SPECIAL_NONE &&
c === "&"
) {
if (this._index > this._sectionStart) {
this._cbs.ontext(this._getSection());
}
this._baseState = TEXT;
this._state = BEFORE_ENTITY;
this._sectionStart = this._index;
}
};
Tokenizer.prototype._stateBeforeTagName = function (cItem) {
let c = cItem.c;
if (c === "/") {
this._state = BEFORE_CLOSING_TAG_NAME;
} else if (c === ">" || this._special !== SPECIAL_NONE) {
this._state = TEXT;
} else if (whitespace(c)) {
//<空格,保持为BEFORE_TAG_NAME
} else if (c === "!") {
this._state = BEFORE_DECLARATION;
this._sectionStart = this._index + 1;
} else if (c === "?") {
this._state = IN_PROCESSING_INSTRUCTION;
this._sectionStart = this._index + 1;
} else if (!legalStart(c)) {
//开标签里面出现<或数字
this._errored = true;
let message = `${this._options.path}:${cItem.loc.line}:${cItem.loc.col}: unexpected character \`${c}\`\n`;
this._cbs.onerror(
new error.WccError(
error.CODE.ML_PARSE.UNEXPECTED_LEFT_ANGLE_BRACKET,
message
)
);
// this._cbs.ontext(this._getSection());
// this._sectionStart = this._index;
} else {
this._state = !this._xmlMode && (c === "s" || c === "S") ? BEFORE_SPECIAL : IN_TAG_NAME;
this._sectionStart = this._index;
}
};
Tokenizer.prototype._stateInTagName = function (cItem) {
let c = cItem.c;
if (c === "/" || c === ">" || whitespace(c)) {
this._emitToken("onopentagname");
this._state = BEFORE_ATTRIBUTE_NAME;
this._index--;
} else if (c === '<') {
//标签名带了<
this._errored = true;
let message = `${this._options.path}:${cItem.loc.line}:${cItem.loc.col}: unexpected character \`${c}\`\n`;
this._cbs.onerror(
new error.WccError(
error.CODE.ML_PARSE.UNEXPECTED_LEFT_ANGLE_BRACKET,
message
)
);
}
};
Tokenizer.prototype._stateBeforeClosingTagName = function (cItem) {
let c = cItem.c;
if (whitespace(c));
else if (c === ">") {
this._state = TEXT;
} else if (this._special !== SPECIAL_NONE) {
if (c === "s" || c === "S") {
this._state = BEFORE_SPECIAL_END;
} else {
this._state = TEXT;
this._index--;
}
} else {
this._state = IN_CLOSING_TAG_NAME;
this._sectionStart = this._index;
}
};
Tokenizer.prototype._stateInClosingTagName = function (cItem) {
let c = cItem.c;
if (c === ">" || whitespace(c)) {
this._emitToken("onclosetag");
this._state = AFTER_CLOSING_TAG_NAME;
this._index--;
}
};
Tokenizer.prototype._stateAfterClosingTagName = function (cItem) {
let c = cItem.c;
//skip everything until ">"
if (c === ">") {
this._state = TEXT;
this._sectionStart = this._index + 1;
} else if (!whitespace(c)) {
//闭合标签不能有属性
this._errored = true;
let message = `${this._options.path}:${cItem.loc.line}:${cItem.loc.col}: unexpected character \`${c}\`\n`;
this._cbs.onerror(
new error.WccError(
error.CODE.ML_PARSE.UNEXPECTED_LEFT_ANGLE_BRACKET,
message
)
);
}
};
Tokenizer.prototype._stateBeforeAttributeName = function (cItem) {
let c = cItem.c;
if (c === ">") {
this._cbs.onopentagend();
this._state = TEXT;
this._sectionStart = this._index + 1;
} else if (c === "/") {
this._state = IN_SELF_CLOSING_TAG;
} else if (c === "\'" || c === "\"") {
this._errored = true;
let message = `${this._options.path}:${cItem.loc.line}:${cItem.loc.col}: unexpected attribute name, near \`${c}\`\n`;
this._cbs.onerror(
new error.WccError(
error.CODE.ML_PARSE.UNEXPECTED_LEFT_ANGLE_BRACKET,
message
)
);
} else if (!legalStart(c)) {
//开标签里面不能有<或者数字开头
this._errored = true;
let message = `${this._options.path}:${cItem.loc.line}:${cItem.loc.col}: unexpected character \`${c}\`\n`;
this._cbs.onerror(
new error.WccError(
error.CODE.ML_PARSE.UNEXPECTED_LEFT_ANGLE_BRACKET,
message
)
);
} else if (!whitespace(c)) {
this._state = IN_ATTRIBUTE_NAME;
this._sectionStart = this._index;
}
};
Tokenizer.prototype._stateInSelfClosingTag = function (cItem) {
let c = cItem.c;
if (c === ">") {
this._cbs.onselfclosingtag();
this._state = TEXT;
this._sectionStart = this._index + 1;
} else if (!whitespace(c)) {
this._state = BEFORE_ATTRIBUTE_NAME;
this._index--;
}
};
Tokenizer.prototype._stateInAttributeName = function (cItem) {
let c = cItem.c;
if (c === "=" || c === "/" || c === ">" || whitespace(c)) {
this._cbs.onattribname(this._getSection());
this._sectionStart = -1;
this._state = AFTER_ATTRIBUTE_NAME;
this._index--;
} else if (c === '<' || c === '\'' || c === '"') {
//属性名不能有< ' "
this._errored = true;
let message = `${this._options.path}:${cItem.loc.line}:${cItem.loc.col}: unexpected attribute name, near \`${c}\`\n`;
this._cbs.onerror(
new error.WccError(
error.CODE.ML_PARSE.UNEXPECTED_LEFT_ANGLE_BRACKET,
message
)
);
}
};
Tokenizer.prototype._stateAfterAttributeName = function (cItem) {
let c = cItem.c;
if (c === "=") {
this._state = BEFORE_ATTRIBUTE_VALUE;
} else if (c === "/" || c === ">") {
this._cbs.onattribend();
this._state = BEFORE_ATTRIBUTE_NAME;
this._index--;
} else if (!whitespace(c)) {
this._cbs.onattribend();
this._state = IN_ATTRIBUTE_NAME;
this._sectionStart = this._index;
}
};
Tokenizer.prototype._stateBeforeAttributeValue = function (cItem) {
let c = cItem.c;
if (c === '"') {
this._state = IN_ATTRIBUTE_VALUE_DQ;
this._sectionStart = this._index + 1;
} else if (c === "'") {
this._state = IN_ATTRIBUTE_VALUE_SQ;
this._sectionStart = this._index + 1;
} else if (whitespace(c)) {
//skip whitespace
while (whitespace(c)) {
this._index++;
cItem = this._buffer[this._index];
c = cItem.c;
}
this._index--;
// this._state = BEFORE_ATTRIBUTE_NAME;
} else if (!whitespace(c)) {
let lastCItem = this._buffer[this._index - 1];
if (lastCItem && lastCItem.c === '=') {
//属性值必须由"和'包起来
this._errored = true;
let message = `${this._options.path}:${cItem.loc.line}:${cItem.loc.col}: unexpected character \`${c}\`\n`;
this._cbs.onerror(
new error.WccError(
error.CODE.ML_PARSE.UNEXPECTED_LEFT_ANGLE_BRACKET,
message
)
);
} else {
this._state = BEFORE_ATTRIBUTE_NAME;
this._index--;
this._cbs.onattribnameNoValue();
}
// this._state = IN_ATTRIBUTE_VALUE_NQ;
// this._sectionStart = this._index;
// this._index--; //reconsume token
}
};
Tokenizer.prototype._stateInAttributeValueDoubleQuotes = function (cItem) {
let c = cItem.c;
if (c === '"') {
this._emitToken("onattribdata");
this._cbs.onattribend();
this._state = BEFORE_ATTRIBUTE_NAME;
} else if (this._decodeEntities && c === "&") {
this._emitToken("onattribdata");
this._baseState = this._state;
this._state = BEFORE_ENTITY;
this._sectionStart = this._index;
} else if (c === "\\") {
//转义字符,跳过转移字符
this._index++;
if (this._index < this._buffer.length) {
this._index++;
}
}
};
Tokenizer.prototype._stateInAttributeValueSingleQuotes = function (cItem) {
let c = cItem.c;
if (c === "'") {
this._emitToken("onattribdata");
this._cbs.onattribend();
this._state = BEFORE_ATTRIBUTE_NAME;
} else if (this._decodeEntities && c === "&") {
this._emitToken("onattribdata");
this._baseState = this._state;
this._state = BEFORE_ENTITY;
this._sectionStart = this._index;
} else if (c === "\\") {
//转义字符
this._index++;
if (this._index < this._buffer.length) {
this._index++;
}
}
};
Tokenizer.prototype._stateInAttributeValueNoQuotes = function (cItem) {
let c = cItem.c;
if (whitespace(c) || c === ">") {
this._emitToken("onattribdata");
this._cbs.onattribend();
this._state = BEFORE_ATTRIBUTE_NAME;
this._index--;
} else if (this._decodeEntities && c === "&") {
this._emitToken("onattribdata");
this._baseState = this._state;
this._state = BEFORE_ENTITY;
this._sectionStart = this._index;
}
};
Tokenizer.prototype._stateBeforeDeclaration = function (cItem) {
let c = cItem.c;
this._state =
c === "[" ? BEFORE_CDATA_1 : c === "-" ? BEFORE_COMMENT : IN_DECLARATION;
};
Tokenizer.prototype._stateInDeclaration = function (cItem) {
let c = cItem.c;
if (c === ">") {
this._cbs.ondeclaration(this._getSection());
this._state = TEXT;
this._sectionStart = this._index + 1;
}
};
Tokenizer.prototype._stateInProcessingInstruction = function (cItem) {
let c = cItem.c;
if (c === ">") {
this._cbs.onprocessinginstruction(this._getSection());
this._state = TEXT;
this._sectionStart = this._index + 1;
}
};
Tokenizer.prototype._stateBeforeComment = function (cItem) {
let c = cItem.c;
if (c === "-") {
this._state = IN_COMMENT;
this._sectionStart = this._index + 1;
} else {
this._state = IN_DECLARATION;
}
};
Tokenizer.prototype._stateInComment = function (cItem) {
let c = cItem.c;
if (c === "-") this._state = AFTER_COMMENT_1;
};
Tokenizer.prototype._stateAfterComment1 = function (cItem) {
let c = cItem.c;
if (c === "-") {
this._state = AFTER_COMMENT_2;
} else {
this._state = IN_COMMENT;
}
};
Tokenizer.prototype._stateAfterComment2 = function (cItem) {
let c = cItem.c;
if (c === ">") {
//remove 2 trailing chars
this._cbs.oncomment(
this._getSection({
sectionStart: this._sectionStart,
sectionEnd: this._index - 3
})
);
this._state = TEXT;
this._sectionStart = this._index + 1;
} else if (c !== "-") {
this._state = IN_COMMENT;
}
// else: stay in AFTER_COMMENT_2 (`--->`)
};
Tokenizer.prototype._stateBeforeCdata1 = ifElseState(
"C",
BEFORE_CDATA_2,
IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata2 = ifElseState(
"D",
BEFORE_CDATA_3,
IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata3 = ifElseState(
"A",
BEFORE_CDATA_4,
IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata4 = ifElseState(
"T",
BEFORE_CDATA_5,
IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata5 = ifElseState(
"A",
BEFORE_CDATA_6,
IN_DECLARATION
);
Tokenizer.prototype._stateBeforeCdata6 = function (cItem) {
let c = cItem.c;
if (c === "[") {
this._state = IN_CDATA;
this._sectionStart = this._index + 1;
} else {
this._state = IN_DECLARATION;
this._index--;
}
};
Tokenizer.prototype._stateInCdata = function (cItem) {
let c = cItem.c;
if (c === "]") this._state = AFTER_CDATA_1;
};
Tokenizer.prototype._stateAfterCdata1 = function (cItem) {
let c = cItem.c;
if (c === "]") this._state = AFTER_CDATA_2;
else this._state = IN_CDATA;
};
Tokenizer.prototype._stateAfterCdata2 = function (cItem) {
let c = cItem.c;
if (c === ">") {
//remove 2 trailing chars
this._cbs.oncdata(
this._getSection({
sectionStart: this._sectionStart,
sectionEnd: this._index - 2
})
);
this._state = TEXT;
this._sectionStart = this._index + 1;
} else if (c !== "]") {
this._state = IN_CDATA;
}
//else: stay in AFTER_CDATA_2 (`]]]>`)
};
Tokenizer.prototype._stateBeforeSpecial = function (cItem) {
let c = cItem.c;
if (c === "c" || c === "C") {
this._state = BEFORE_SCRIPT_1;
} else if (c === "t" || c === "T") {
this._state = BEFORE_STYLE_1;
} else {
this._state = IN_TAG_NAME;
this._index--; //consume the token again
}
};
Tokenizer.prototype._stateBeforeSpecialEnd = function (cItem) {
let c = cItem.c;
if (this._special === SPECIAL_SCRIPT && (c === "c" || c === "C")) {
this._state = AFTER_SCRIPT_1;
} else if (this._special === SPECIAL_STYLE && (c === "t" || c === "T")) {
this._state = AFTER_STYLE_1;
} else this._state = TEXT;
};
Tokenizer.prototype._stateBeforeScript1 = consumeSpecialNameChar(
"R",
BEFORE_SCRIPT_2
);
Tokenizer.prototype._stateBeforeScript2 = consumeSpecialNameChar(
"I",
BEFORE_SCRIPT_3
);
Tokenizer.prototype._stateBeforeScript3 = consumeSpecialNameChar(
"P",
BEFORE_SCRIPT_4
);
Tokenizer.prototype._stateBeforeScript4 = consumeSpecialNameChar(
"T",
BEFORE_SCRIPT_5
);
Tokenizer.prototype._stateBeforeScript5 = function (cItem) {
let c = cItem.c;
if (c === "/" || c === ">" || whitespace(c)) {
this._special = SPECIAL_SCRIPT;
}
this._state = IN_TAG_NAME;
this._index--; //consume the token again
};
Tokenizer.prototype._stateAfterScript1 = ifElseState("R", AFTER_SCRIPT_2, TEXT);
Tokenizer.prototype._stateAfterScript2 = ifElseState("I", AFTER_SCRIPT_3, TEXT);
Tokenizer.prototype._stateAfterScript3 = ifElseState("P", AFTER_SCRIPT_4, TEXT);
Tokenizer.prototype._stateAfterScript4 = ifElseState("T", AFTER_SCRIPT_5, TEXT);
Tokenizer.prototype._stateAfterScript5 = function (cItem) {
let c = cItem.c;
if (c === ">" || whitespace(c)) {
this._special = SPECIAL_NONE;
this._state = IN_CLOSING_TAG_NAME;
this._sectionStart = this._index - 6;
this._index--; //reconsume the token
} else this._state = TEXT;
};
Tokenizer.prototype._stateBeforeStyle1 = consumeSpecialNameChar(
"Y",
BEFORE_STYLE_2
);
Tokenizer.prototype._stateBeforeStyle2 = consumeSpecialNameChar(
"L",
BEFORE_STYLE_3
);
Tokenizer.prototype._stateBeforeStyle3 = consumeSpecialNameChar(
"E",
BEFORE_STYLE_4
);
Tokenizer.prototype._stateBeforeStyle4 = function (cItem) {
let c = cItem.c;
if (c === "/" || c === ">" || whitespace(c)) {
this._special = SPECIAL_STYLE;
}
this._state = IN_TAG_NAME;
this._index--; //consume the token again
};
Tokenizer.prototype._stateAfterStyle1 = ifElseState("Y", AFTER_STYLE_2, TEXT);
Tokenizer.prototype._stateAfterStyle2 = ifElseState("L", AFTER_STYLE_3, TEXT);
Tokenizer.prototype._stateAfterStyle3 = ifElseState("E", AFTER_STYLE_4, TEXT);
Tokenizer.prototype._stateAfterStyle4 = function (cItem) {
let c = cItem.c;
if (c === ">" || whitespace(c)) {
this._special = SPECIAL_NONE;
this._state = IN_CLOSING_TAG_NAME;
this._sectionStart = this._index - 5;
this._index--; //reconsume the token
} else this._state = TEXT;
};
Tokenizer.prototype._stateBeforeEntity = ifElseState(
"#",
BEFORE_NUMERIC_ENTITY,
IN_NAMED_ENTITY
);
Tokenizer.prototype._stateBeforeNumericEntity = ifElseState(
"X",
IN_HEX_ENTITY,
IN_NUMERIC_ENTITY
);
//for entities terminated with a semicolon
Tokenizer.prototype._parseNamedEntityStrict = function () {
//offset = 1
if (this._sectionStart + 1 < this._index) {
var entity = this._buffer.substring(this._sectionStart + 1, this._index),
map = this._xmlMode ? xmlMap : entityMap;
if (map.hasOwnProperty(entity)) {
this._emitPartial(map[entity]);
this._sectionStart = this._index + 1;
}
}
};
//parses legacy entities (without trailing semicolon)
Tokenizer.prototype._parseLegacyEntity = function () {
var start = this._sectionStart + 1,
limit = this._index - start;
if (limit > 6) limit = 6; //the max length of legacy entities is 6
while (limit >= 2) {
//the min length of legacy entities is 2
var entity = this._buffer.substr(start, limit);
if (legacyMap.hasOwnProperty(entity)) {
this._emitPartial(legacyMap[entity]);
this._sectionStart += limit + 1;
return;
} else {
limit--;
}
}
};
Tokenizer.prototype._stateInNamedEntity = function (cItem) {
let c = cItem.c;
if (c === ";") {
this._parseNamedEntityStrict();
if (this._sectionStart + 1 < this._index && !this._xmlMode) {
this._parseLegacyEntity();
}
this._state = this._baseState;
} else if (
(c < "a" || c > "z") &&
(c < "A" || c > "Z") &&
(c < "0" || c > "9")
) {
if (this._xmlMode);
else if (this._sectionStart + 1 === this._index);
else if (this._baseState !== TEXT) {
if (c !== "=") {
this._parseNamedEntityStrict();
}
} else {
this._parseLegacyEntity();
}
this._state = this._baseState;
this._index--;
}
};
Tokenizer.prototype._decodeNumericEntity = function (offset, base) {
var sectionStart = this._sectionStart + offset;
if (sectionStart !== this._index) {
//parse entity
var entity = this._buffer.substring(sectionStart, this._index);
var parsed = parseInt(entity, base);
this._emitPartial(decodeCodePoint(parsed));
this._sectionStart = this._index;
} else {
this._sectionStart--;
}
this._state = this._baseState;
};
Tokenizer.prototype._stateInNumericEntity = function (cItem) {
let c = cItem.c;
if (c === ";") {
this._decodeNumericEntity(2, 10);
this._sectionStart++;
} else if (c < "0" || c > "9") {
if (!this._xmlMode) {
this._decodeNumericEntity(2, 10);
} else {
this._state = this._baseState;
}
this._index--;
}
};
Tokenizer.prototype._stateInHexEntity = function (cItem) {
let c = cItem.c;
if (c === ";") {
this._decodeNumericEntity(3, 16);
this._sectionStart++;
} else if (
(c < "a" || c > "f") &&
(c < "A" || c > "F") &&
(c < "0" || c > "9")
) {
if (!this._xmlMode) {
this._decodeNumericEntity(3, 16);
} else {
this._state = this._baseState;
}
this._index--;
}
};
Tokenizer.prototype._cleanup = function () {
if (this._sectionStart < 0) {
this._buffer = [];
this._bufferOffset += this._index;
this._index = 0;
} else if (this._running && !this._errored) {
if (this._state === TEXT) {
if (this._sectionStart !== this._index) {
this._cbs.ontext(
this._getSection({
sectionStart: this._sectionStart,
sectionEnd: this._buffer.length - 1
})
);
}
this._buffer = [];
this._bufferOffset += this._index;
this._index = 0;
} else if (this._sectionStart === this._index) {
//the section just started
this._buffer = [];
this._bufferOffset += this._index;
this._index = 0;
} else {
//remove everything unnecessary
//FIXIT _buffer
// this._buffer = this._buffer.slice(this._sectionStart);
// this._index -= this._sectionStart;
// this._bufferOffset += this._sectionStart;
let data = this._getSection({
sectionStart: this._sectionStart,
sectionEnd: this._buffer.length - 1
});
if (data.str) {
this._errored = true;
let message = `${this._options.path}:${data.start.loc.line}:${data.start.loc.col}: unexpected end \`${data.str}\`\n`;
this._cbs.onerror(
new error.WccError(
error.CODE.ML_PARSE.UNEXPECTED_LEFT_ANGLE_BRACKET,
message
)
);
}
}
this._sectionStart = 0;
}
};
//TODO make events conditional
Tokenizer.prototype.write = function (chunk = "") {
if (this._ended) {
this._errored = true;
this._cbs.onerror(Error(".write() after done!"));
}
let baseLine = (this._buffer && this._buffer[0] && this._buffer[0].line) || 1;
let baseCol = 1;
let newBuffer = [];
let beginIdx = this._buffer.length;
for (let i = 0; i < chunk.length; ++i) {
newBuffer.push({
c: chunk[i],
loc: {
line: baseLine,
col: baseCol
},
idx: beginIdx + i
});
if (chunk[i] === "\n") {
baseCol = 0;
baseLine++;
}
baseCol++;
}
this._buffer = this._buffer.concat(newBuffer);
this._parse();
};
Tokenizer.prototype._parse = function () {
while (this._index < this._buffer.length && this._running && !this._errored) {
var cItem = this._buffer[this._index];
//console.log('_state:', this._state);
//console.log('cItem:', cItem);
if (this._state === TEXT) {
this._stateText(cItem);
} else if (this._state === BEFORE_TAG_NAME) {
this._stateBeforeTagName(cItem);
} else if (this._state === IN_TAG_NAME) {
this._stateInTagName(cItem);
} else if (this._state === BEFORE_CLOSING_TAG_NAME) {
this._stateBeforeClosingTagName(cItem);
} else if (this._state === IN_CLOSING_TAG_NAME) {
this._stateInClosingTagName(cItem);
} else if (this._state === AFTER_CLOSING_TAG_NAME) {
this._stateAfterClosingTagName(cItem);
} else if (this._state === IN_SELF_CLOSING_TAG) {
this._stateInSelfClosingTag(cItem);
} else if (this._state === BEFORE_ATTRIBUTE_NAME) {
/*
* attributes
*/
this._stateBeforeAttributeName(cItem);
} else if (this._state === IN_ATTRIBUTE_NAME) {
this._stateInAttributeName(cItem);
} else if (this._state === AFTER_ATTRIBUTE_NAME) {
this._stateAfterAttributeName(cItem);
} else if (this._state === BEFORE_ATTRIBUTE_VALUE) {
this._stateBeforeAttributeValue(cItem);
} else if (this._state === IN_ATTRIBUTE_VALUE_DQ) {
this._stateInAttributeValueDoubleQuotes(cItem);
} else if (this._state === IN_ATTRIBUTE_VALUE_SQ) {
this._stateInAttributeValueSingleQuotes(cItem);
} else if (this._state === IN_ATTRIBUTE_VALUE_NQ) {
this._stateInAttributeValueNoQuotes(cItem);
} else if (this._state === BEFORE_DECLARATION) {
/*
* declarations
*/
this._stateBeforeDeclaration(cItem);
} else if (this._state === IN_DECLARATION) {
this._stateInDeclaration(cItem);
} else if (this._state === IN_PROCESSING_INSTRUCTION) {
/*
* processing instructions
*/
this._stateInProcessingInstruction(cItem);
} else if (this._state === BEFORE_COMMENT) {
/*
* comments
*/
this._stateBeforeComment(cItem);
} else if (this._state === IN_COMMENT) {
this._stateInComment(cItem);
} else if (this._state === AFTER_COMMENT_1) {
this._stateAfterComment1(cItem);
} else if (this._state === AFTER_COMMENT_2) {
this._stateAfterComment2(cItem);
} else if (this._state === BEFORE_CDATA_1) {
/*
* cdata
*/
this._stateBeforeCdata1(cItem);
} else if (this._state === BEFORE_CDATA_2) {
this._stateBeforeCdata2(cItem);
} else if (this._state === BEFORE_CDATA_3) {
this._stateBeforeCdata3(cItem);
} else if (this._state === BEFORE_CDATA_4) {
this._stateBeforeCdata4(cItem);
} else if (this._state === BEFORE_CDATA_5) {
this._stateBeforeCdata5(cItem);
} else if (this._state === BEFORE_CDATA_6) {
this._stateBeforeCdata6(cItem);
} else if (this._state === IN_CDATA) {
this._stateInCdata(cItem);
} else if (this._state === AFTER_CDATA_1) {
this._stateAfterCdata1(cItem);
} else if (this._state === AFTER_CDATA_2) {
this._stateAfterCdata2(cItem);
} else if (this._state === BEFORE_SPECIAL) {
/*
* special tags
*/
this._stateBeforeSpecial(cItem);
} else if (this._state === BEFORE_SPECIAL_END) {
this._stateBeforeSpecialEnd(cItem);
} else if (this._state === BEFORE_SCRIPT_1) {
/*
* script
*/
this._stateBeforeScript1(cItem);
} else if (this._state === BEFORE_SCRIPT_2) {
this._stateBeforeScript2(cItem);
} else if (this._state === BEFORE_SCRIPT_3) {
this._stateBeforeScript3(cItem);
} else if (this._state === BEFORE_SCRIPT_4) {
this._stateBeforeScript4(cItem);
} else if (this._state === BEFORE_SCRIPT_5) {
this._stateBeforeScript5(cItem);
} else if (this._state === AFTER_SCRIPT_1) {
this._stateAfterScript1(cItem);
} else if (this._state === AFTER_SCRIPT_2) {
this._stateAfterScript2(cItem);
} else if (this._state === AFTER_SCRIPT_3) {
this._stateAfterScript3(cItem);
} else if (this._state === AFTER_SCRIPT_4) {
this._stateAfterScript4(cItem);
} else if (this._state === AFTER_SCRIPT_5) {
this._stateAfterScript5(cItem);
} else if (this._state === BEFORE_STYLE_1) {
/*
* style
*/
this._stateBeforeStyle1(cItem);
} else if (this._state === BEFORE_STYLE_2) {
this._stateBeforeStyle2(cItem);
} else if (this._state === BEFORE_STYLE_3) {
this._stateBeforeStyle3(cItem);
} else if (this._state === BEFORE_STYLE_4) {
this._stateBeforeStyle4(cItem);
} else if (this._state === AFTER_STYLE_1) {
this._stateAfterStyle1(cItem);
} else if (this._state === AFTER_STYLE_2) {
this._stateAfterStyle2(cItem);
} else if (this._state === AFTER_STYLE_3) {
this._stateAfterStyle3(cItem);
} else if (this._state === AFTER_STYLE_4) {
this._stateAfterStyle4(cItem);
} else if (this._state === BEFORE_ENTITY) {
/*
* entities
*/
this._stateBeforeEntity(cItem);
} else if (this._state === BEFORE_NUMERIC_ENTITY) {
this._stateBeforeNumericEntity(cItem);
} else if (this._state === IN_NAMED_ENTITY) {
this._stateInNamedEntity(cItem);
} else if (this._state === IN_NUMERIC_ENTITY) {
this._stateInNumericEntity(cItem);
} else if (this._state === IN_HEX_ENTITY) {
this._stateInHexEntity(cItem);
} else {
this._errored = true;
let message = `unknow state:${this._state}\n at ${
this._options.path
}:${cItem.loc.line}:${cItem.loc.col}`;
this._cbs.onerror(
new error.WccError(error.CODE.ML_PARSE.UNKNOW_STATE, message)
);
}
this._index++;
}
this._cleanup();
};
Tokenizer.prototype.pause = function () {
this._running = false;
};
Tokenizer.prototype.resume = function () {
this._running = true;
if (this._index < this._buffer.length) {
this._parse();
}
if (this._ended) {
this._finish();
}
};
Tokenizer.prototype.end = function (chunk) {
if (this._ended) {
this._errored = true;
this._cbs.onerror(Error(".end() after done!"));
return;
}
if (chunk) this.write(chunk);
this._ended = true;
if (this._running && !this._errored) this._finish();
};
Tokenizer.prototype._finish = function () {
//if there is remaining data, emit it in a reasonable way
if (this._sectionStart < this._index) {
this._handleTrailingData();
}
this._cbs.onend();
};
Tokenizer.prototype._handleTrailingData = function () {
let data = this._getSection({
sectionStart: this._sectionStart,
sectionEnd: this._buffer.length - 1
});
if (
this._state === IN_CDATA ||
this._state === AFTER_CDATA_1 ||
this._state === AFTER_CDATA_2
) {
this._cbs.oncdata(data);
} else if (
this._state === IN_COMMENT ||
this._state === AFTER_COMMENT_1 ||
this._state === AFTER_COMMENT_2
) {
this._cbs.oncomment(data);
} else if (this._state === IN_NAMED_ENTITY && !this._xmlMode) {
this._parseLegacyEntity();
if (this._sectionStart < this._index) {
this._state = this._baseState;
this._handleTrailingData();
}
} else if (this._state === IN_NUMERIC_ENTITY && !this._xmlMode) {
this._decodeNumericEntity(2, 10);
if (this._sectionStart < this._index) {
this._state = this._baseState;
this._handleTrailingData();
}
} else if (this._state === IN_HEX_ENTITY && !this._xmlMode) {
this._decodeNumericEntity(3, 16);
if (this._sectionStart < this._index) {
this._state = this._baseState;
this._handleTrailingData();
}
} else if (this._state === TEXT) {
this._cbs.ontext(data);
} else if (
this._state !== IN_TAG_NAME &&
this._state !== BEFORE_ATTRIBUTE_NAME &&
this._state !== BEFORE_ATTRIBUTE_VALUE &&
this._state !== AFTER_ATTRIBUTE_NAME &&
this._state !== IN_ATTRIBUTE_NAME &&
this._state !== IN_ATTRIBUTE_VALUE_SQ &&
this._state !== IN_ATTRIBUTE_VALUE_DQ &&
this._state !== IN_ATTRIBUTE_VALUE_NQ &&
this._state !== IN_CLOSING_TAG_NAME
) {
this._cbs.ontext(data);
}
//else, ignore remaining data
//TODO add a way to remove current tag
};
Tokenizer.prototype.reset = function () {
Tokenizer.call(
this, {
xmlMode: this._xmlMode,
decodeEntities: this._decodeEntities
},
this._cbs
);
};
Tokenizer.prototype.getAbsoluteIndex = function () {
return this._bufferOffset + this._index;
};
Tokenizer.prototype._getSection = function (opts = {}) {
let sectionStart = 0;
let sectionEnd = 0;
if (typeof opts.sectionStart === "undefined") {
sectionStart = this._sectionStart;
} else {
sectionStart = opts.sectionStart;
}
if (typeof opts.sectionEnd === "undefined") {
sectionEnd = this._index - 1;
} else {
sectionEnd = opts.sectionEnd;
}
if (sectionStart < 0) {
sectionStart = 0;
} else if (sectionStart >= this._buffer.length) {
sectionStart = this._buffer.length - 1;
}
if (sectionEnd < 0) {
sectionEnd = 0;
} else if (sectionEnd >= this._buffer.length) {
sectionEnd = this._buffer.length - 1;
}
let start = this._buffer[sectionStart];
let end = this._buffer[sectionEnd];
let setion = {
start: start,
end: end
};
if (this._buffer.length) {
let data = [];
for (let i = sectionStart; i <= sectionEnd; ++i) {
data.push(this._buffer[i].c);
}
setion.str = data.join("");
}
return setion;
};
Tokenizer.prototype._emitToken = function (name) {
this._cbs[name](this._getSection());
this._sectionStart = -1;
};
Tokenizer.prototype._emitPartial = function (value) {
if (this._baseState !== TEXT) {
this._cbs.onattribdata(value); //TODO implement the new event
} else {
this._cbs.ontext(value);
}
};