html5parser
Version:
A super fast & tiny HTML5 parser
500 lines (472 loc) • 12 kB
text/typescript
/*!
*
* Copyright 2017 - acrazing
*
* @author acrazing joking.young@gmail.com
* @since 2017-08-19 00:54:29
* @version 1.0.0
* @desc tokenize.ts
*/
const enum State {
Literal,
BeforeOpenTag,
OpeningTag,
AfterOpenTag,
InValueNq,
InValueSq,
InValueDq,
ClosingOpenTag,
OpeningSpecial,
OpeningDoctype,
OpeningNormalComment,
InNormalComment,
InShortComment,
ClosingNormalComment,
ClosingTag,
}
export const enum TokenKind {
Literal,
OpenTag, // trim leading '<'
OpenTagEnd, // trim tailing '>', only could be '/' or ''
CloseTag, // trim leading '</' and tailing '>'
Whitespace, // the whitespace between attributes
AttrValueEq,
AttrValueNq,
AttrValueSq,
AttrValueDq,
}
export interface IToken {
start: number;
end: number;
value: string;
type: TokenKind;
}
let state: State;
let buffer: string;
let bufSize: number;
let sectionStart: number;
let index: number;
let tokens: IToken[];
let char: number;
let inScript: boolean;
let inStyle: boolean;
let offset: number;
function makeCodePoints(input: string) {
return {
lower: input
.toLowerCase()
.split('')
.map((c) => c.charCodeAt(0)),
upper: input
.toUpperCase()
.split('')
.map((c) => c.charCodeAt(0)),
length: input.length,
};
}
const doctype = makeCodePoints('!doctype');
const style = makeCodePoints('style');
const script = makeCodePoints('script');
const enum Chars {
_S = 32, // ' '
_N = 10, // \n
_T = 9, // \t
_R = 13, // \r
_F = 12, // \f
Lt = 60, // <
Ep = 33, // !
Cl = 45, // -
Sl = 47, // /
Gt = 62, // >
Qm = 63, // ?
La = 97, // a
Lz = 122, // z
Ua = 65, // A
Uz = 90, // Z
Eq = 61, // =
Sq = 39, // '
Dq = 34, // "
Ld = 100, // d
Ud = 68, //D
}
function isWhiteSpace() {
return (
char === Chars._S ||
char === Chars._N ||
char === Chars._T ||
char === Chars._T ||
char === Chars._R ||
char === Chars._F
);
}
function init(input: string) {
state = State.Literal;
buffer = input;
bufSize = input.length;
sectionStart = 0;
index = 0;
tokens = [];
inScript = false;
inStyle = false;
offset = 0;
}
export function tokenize(input: string): IToken[] {
init(input);
while (index < bufSize) {
char = buffer.charCodeAt(index);
switch (state) {
case State.Literal:
parseLiteral();
break;
case State.BeforeOpenTag:
parseBeforeOpenTag();
break;
case State.OpeningTag:
parseOpeningTag();
break;
case State.AfterOpenTag:
parseAfterOpenTag();
break;
case State.InValueNq:
parseInValueNq();
break;
case State.InValueSq:
parseInValueSq();
break;
case State.InValueDq:
parseInValueDq();
break;
case State.ClosingOpenTag:
parseClosingOpenTag();
break;
case State.OpeningSpecial:
parseOpeningSpecial();
break;
case State.OpeningDoctype:
parseOpeningDoctype();
break;
case State.OpeningNormalComment:
parseOpeningNormalComment();
break;
case State.InNormalComment:
parseNormalComment();
break;
case State.InShortComment:
parseShortComment();
break;
case State.ClosingNormalComment:
parseClosingNormalComment();
break;
case State.ClosingTag:
parseClosingTag();
break;
default:
unexpected();
break;
}
index++;
}
switch (state) {
case State.Literal:
case State.BeforeOpenTag:
case State.InValueNq:
case State.InValueSq:
case State.InValueDq:
case State.ClosingOpenTag:
case State.InNormalComment:
case State.InShortComment:
case State.ClosingNormalComment:
emitToken(TokenKind.Literal);
break;
case State.OpeningTag:
emitToken(TokenKind.OpenTag);
break;
case State.AfterOpenTag:
break;
case State.OpeningSpecial:
emitToken(TokenKind.OpenTag, State.InShortComment);
break;
case State.OpeningDoctype:
if (index - sectionStart === doctype.length) {
emitToken(TokenKind.OpenTag);
} else {
emitToken(TokenKind.OpenTag, void 0, sectionStart + 1);
emitToken(TokenKind.Literal);
}
break;
case State.OpeningNormalComment:
if (index - sectionStart === 2) {
emitToken(TokenKind.OpenTag);
} else {
emitToken(TokenKind.OpenTag, void 0, sectionStart + 1);
emitToken(TokenKind.Literal);
}
break;
case State.ClosingTag:
emitToken(TokenKind.CloseTag);
break;
default:
break;
}
const _tokens = tokens;
init('');
return _tokens;
}
function emitToken(kind: TokenKind, newState = state, end = index) {
let value = buffer.substring(sectionStart, end);
if (kind === TokenKind.OpenTag || kind === TokenKind.CloseTag) {
value = value.toLowerCase();
}
if (kind === TokenKind.OpenTag) {
if (value === 'script') {
inScript = true;
} else if (value === 'style') {
inStyle = true;
}
}
if (kind === TokenKind.CloseTag) {
inScript = inStyle = false;
}
if (!((kind === TokenKind.Literal || kind === TokenKind.Whitespace) && end === sectionStart)) {
// empty literal should be ignored
tokens.push({ type: kind, start: sectionStart, end, value });
}
if (kind === TokenKind.OpenTagEnd || kind === TokenKind.CloseTag) {
sectionStart = end + 1;
state = State.Literal;
} else {
sectionStart = end;
state = newState;
}
}
function parseLiteral() {
if (char === Chars.Lt) {
// <
emitToken(TokenKind.Literal, State.BeforeOpenTag);
}
}
function parseBeforeOpenTag() {
if (inScript || inStyle) {
if (char === Chars.Sl) {
state = State.ClosingTag;
sectionStart = index + 1;
} else {
state = State.Literal;
}
return;
}
if ((char >= Chars.La && char <= Chars.Lz) || (char >= Chars.Ua && char <= Chars.Uz)) {
// <d
state = State.OpeningTag;
sectionStart = index;
} else if (char === Chars.Sl) {
// </
state = State.ClosingTag;
sectionStart = index + 1;
} else if (char === Chars.Lt) {
// <<
emitToken(TokenKind.Literal);
} else if (char === Chars.Ep) {
// <!
state = State.OpeningSpecial;
sectionStart = index;
} else if (char === Chars.Qm) {
// <?
// treat as short comment
sectionStart = index;
emitToken(TokenKind.OpenTag, State.InShortComment);
} else {
// <>
// any other chars covert to normal state
state = State.Literal;
}
}
function parseOpeningTag() {
if (isWhiteSpace()) {
// <div ...
emitToken(TokenKind.OpenTag, State.AfterOpenTag);
} else if (char === Chars.Gt) {
// <div>
emitToken(TokenKind.OpenTag);
emitToken(TokenKind.OpenTagEnd);
} else if (char === Chars.Sl) {
// <div/
emitToken(TokenKind.OpenTag, State.ClosingOpenTag);
}
}
function parseAfterOpenTag() {
if (char === Chars.Gt) {
// <div >
emitToken(TokenKind.Whitespace);
emitToken(TokenKind.OpenTagEnd);
} else if (char === Chars.Sl) {
// <div /
emitToken(TokenKind.Whitespace, State.ClosingOpenTag);
} else if (char === Chars.Eq) {
// <div ...=...
emitToken(TokenKind.Whitespace);
emitToken(TokenKind.AttrValueEq, void 0, index + 1);
} else if (char === Chars.Sq) {
// <div ...'...
emitToken(TokenKind.Whitespace, State.InValueSq);
} else if (char === Chars.Dq) {
// <div ..."...
emitToken(TokenKind.Whitespace, State.InValueDq);
} else if (!isWhiteSpace()) {
// <div ...name...
emitToken(TokenKind.Whitespace, State.InValueNq);
}
}
function parseInValueNq() {
if (char === Chars.Gt) {
// <div xxx>
emitToken(TokenKind.AttrValueNq);
emitToken(TokenKind.OpenTagEnd);
} else if (char === Chars.Sl) {
// <div xxx/
emitToken(TokenKind.AttrValueNq, State.ClosingOpenTag);
} else if (char === Chars.Eq) {
// <div xxx=
emitToken(TokenKind.AttrValueNq);
emitToken(TokenKind.AttrValueEq, State.AfterOpenTag, index + 1);
} else if (isWhiteSpace()) {
// <div xxx ...
emitToken(TokenKind.AttrValueNq, State.AfterOpenTag);
}
}
function parseInValueSq() {
if (char === Chars.Sq) {
// <div 'xxx'
emitToken(TokenKind.AttrValueSq, State.AfterOpenTag, index + 1);
}
}
function parseInValueDq() {
if (char === Chars.Dq) {
// <div "xxx", problem same to Sq
emitToken(TokenKind.AttrValueDq, State.AfterOpenTag, index + 1);
}
}
function parseClosingOpenTag() {
if (char === Chars.Gt) {
// <div />
emitToken(TokenKind.OpenTagEnd);
} else {
// <div /...>
emitToken(TokenKind.AttrValueNq, State.AfterOpenTag);
parseAfterOpenTag();
}
}
function parseOpeningSpecial() {
switch (char) {
case Chars.Cl: // <!-
state = State.OpeningNormalComment;
break;
case Chars.Ld: // <!d
case Chars.Ud: // <!D
state = State.OpeningDoctype;
break;
default:
emitToken(TokenKind.OpenTag, State.InShortComment);
break;
}
}
function parseOpeningDoctype() {
offset = index - sectionStart;
if (offset === doctype.length) {
// <!d, <!d , start: 0, index: 2
if (isWhiteSpace()) {
emitToken(TokenKind.OpenTag, State.AfterOpenTag);
} else {
unexpected();
}
} else if (char === Chars.Gt) {
// <!DOCT>
emitToken(TokenKind.OpenTag, void 0, sectionStart + 1);
emitToken(TokenKind.Literal);
emitToken(TokenKind.OpenTagEnd);
} else if (doctype.lower[offset] !== char && doctype.upper[offset] !== char) {
// <!DOCX...
emitToken(TokenKind.OpenTag, State.InShortComment, sectionStart + 1);
}
}
function parseOpeningNormalComment() {
if (char === Chars.Cl) {
// <!--
emitToken(TokenKind.OpenTag, State.InNormalComment, index + 1);
} else {
emitToken(TokenKind.OpenTag, State.InShortComment, sectionStart + 1);
}
}
function parseNormalComment() {
if (char === Chars.Cl) {
// <!-- ... -
emitToken(TokenKind.Literal, State.ClosingNormalComment);
}
}
function parseShortComment() {
if (char === Chars.Gt) {
// <! ... >
emitToken(TokenKind.Literal);
emitToken(TokenKind.OpenTagEnd);
}
}
function parseClosingNormalComment() {
offset = index - sectionStart;
if (offset === 2) {
if (char === Chars.Gt) {
// <!-- xxx -->
emitToken(TokenKind.OpenTagEnd);
} else if (char === Chars.Cl) {
// <!-- xxx ---
emitToken(TokenKind.Literal, void 0, sectionStart + 1);
} else {
// <!-- xxx --x
state = State.InNormalComment;
}
} else if (char !== Chars.Cl) {
// <!-- xxx - ...
state = State.InNormalComment;
}
}
function parseClosingTag() {
offset = index - sectionStart;
if (inStyle) {
if (char === Chars.Lt) {
sectionStart -= 2;
emitToken(TokenKind.Literal, State.BeforeOpenTag);
} else if (offset < style.length) {
if (style.lower[offset] !== char && style.upper[offset] !== char) {
sectionStart -= 2;
state = State.Literal;
}
} else if (char === Chars.Gt) {
emitToken(TokenKind.CloseTag);
} else if (!isWhiteSpace()) {
sectionStart -= 2;
state = State.Literal;
}
} else if (inScript) {
if (char === Chars.Lt) {
sectionStart -= 2;
emitToken(TokenKind.Literal, State.BeforeOpenTag);
} else if (offset < script.length) {
if (script.lower[offset] !== char && script.upper[offset] !== char) {
sectionStart -= 2;
state = State.Literal;
}
} else if (char === Chars.Gt) {
emitToken(TokenKind.CloseTag);
} else if (!isWhiteSpace()) {
sectionStart -= 2;
state = State.Literal;
}
} else if (char === Chars.Gt) {
// </ xxx >
emitToken(TokenKind.CloseTag);
}
}
function unexpected() {
throw new SyntaxError(
`Unexpected token "${buffer.charAt(index)}" at ${index} when parse ${state}`,
);
}