const log = console.log.bind (console)
const { defineProperty:define } = Object
const StringsInto = map => new Proxy ({}, { get:($,k) => (map [k] = k, k) })
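// An illustrative sketch (not executed here): destructuring from this proxy
// yields each requested key back as its own string value, and registers it
// in the given map as a side effect.
//
//   const demo = {}
//   const { foo, bar } = StringsInto (demo)
//   // foo === 'foo', bar === 'bar', and demo is { foo:'foo', bar:'bar' }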
// A tiny-lexer based tokenizer for HTML5
// =======================================
// ### The tokens
const tokenTypes = {}, states = {}
const T = tokenTypes
const {
attributeName,
attributeAssign,
attributeValueStart,
attributeValueData,
attributeValueEnd,
tagSpace, // whitespace and/or slashes within tags
commentStart,
commentStartBogus,
commentData,
commentEnd,
commentEndBogus,
startTagStart,
endTagStart,
tagEnd,
tagEndClose,
charRefDecimal,
charRefHex,
charRefNamed,
unescaped,
data,
space,
newline,
rcdata,
rawtext,
plaintext,
} = StringsInto (tokenTypes)
// Some of the token types double as states.
// The additional states are as follows.
const {
beforeAtt,
bogusComment,
afterAttName,
afterAssign,
attValue,
comment,
doubleQuoted,
singleQuoted,
unquoted,
charRef
} = StringsInto (states)
// ### The grammar
const STARTTAG_START = '<[a-zA-Z][^>/\t\r\n\f ]*'
const ENDTAG_START = '</[a-zA-Z][^>/\t\r\n\f ]*'
const CHARREF_DEC = '&#[0-9]+;?'
const CHARREF_HEX = '&#[xX][0-9A-Fa-f]+;?'
const CHARREF_NAMED = '&[A-Za-z][A-Za-z0-9]*;'
const ATTNAME = '.[^>/\t\r\n\f =]*' /* '[^>/\t\n\f ][^>/\t\n\f =]*' */
const ATT_UNQUOT = '[^&>\t\r\n\f ]+'
const UNQUOT_END = '(?=[>\t\r\n\f ])'
const DOCTYPE_START = '<![Dd][Oo][Cc][Tt][Yy][Pp][Ee]'
// The two patterns below are generated by preprocessing the list of named
// character references. Legacy charrefs may occur without a terminating
// semicolon, but not as a prefix of a known named reference; CHARREF_CONTD
// matches the references that have a legacy ref as a prefix.
const CHARREF_CONTD = '&(?:copysr|centerdot|divideontimes|[gl]t(?:quest|dot|cir|cc)|[gl]trPar|gtr(?:dot|less|eqqless|eqless|approx|arr|sim)|ltr(?:i|if|ie|mes)|ltlarr|lthree|notin(?:dot|E|v[abc])?|notni(?:v[abc])?|parallel|times(?:bar|d|b));'
const CHARREF_LEGACY = '&(?:[AEIOUYaeiouy]?acute|[AEIOUaeiou](?:grave|circ|uml)|y?uml|[ANOano]tilde|[Aa]ring|[Oo]slash|[Cc]?cedil|brvbar|curren|divide|frac(?:12|14|34)|iquest|middot|plusmn|(?:AE|ae|sz)lig|[lr]aquo|iexcl|micro|pound|THORN|thorn|times|COPY|copy|cent|macr|nbsp|ord[fm]|para|QUOT|quot|sect|sup[123]|AMP|amp|ETH|eth|REG|reg|deg|not|shy|yen|GT|gt|LT|lt);?'
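// For illustration: CHARREF_LEGACY matches '&amp' and '&amp;' alike, whereas
// '&copysr' only matches together with its semicolon, via CHARREF_CONTD;
// without it, CHARREF_LEGACY picks up just the legacy prefix '&copy'.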
const grammar =
{ data: [
[ STARTTAG_START, startTagStart, startTag ],
[ ENDTAG_START, endTagStart, beforeAtt ],
//[ DOCTYPE_START, doctype_start, 'beforeName' ], // before doctype name
[ '<!--', commentStart, commentStart ],
[ '<[/!?]', commentStartBogus, bogusComment ],
[ '[\t\f ]+', space ],
[ '\r?\n|\r', addNewline ],
[ '[^<&\t\f\r\n ]+', data ],
[ '<', unescaped ],
[ '', data, charRefIn ]],
rawtext: [
[ ENDTAG_START, maybeEndTagT, maybeEndTag ],
[ '\r?\n|\r', addNewline ],
[ '.[^<\r\n]*', rawtext ]],
rcdata: [
[ ENDTAG_START, maybeEndTagT, maybeEndTag ],
[ '<', unescaped ],
[ '[^<&\r\n]+', rcdata ],
[ '\r?\n|\r', addNewline ],
[ '', rcdata, charRefIn ]],
plaintext: [
[ '\r?\n|\r', addNewline ],
[ '.[^\r\n]*', plaintext ]],
charRef: [
[ CHARREF_DEC, charRefDecimal, context ],
[ CHARREF_HEX, charRefHex, context ],
[ CHARREF_CONTD, charRefNamed, context ],
[ CHARREF_LEGACY, legacyCharRefT, context ],
[ CHARREF_NAMED, charRefNamed, context ],
[ '&', unescaped, context ]],
beforeAtt: [
[ '>', tagEnd, content ],
[ '/>', tagEndClose, content ],
[ '\r?\n|\r', addNewline ],
[ '[\t\f ]+', tagSpace ],
[ '/+(?!>)', tagSpace ],
[ ATTNAME, attributeName, afterAttName ]],
afterAttName: [
[ '>', tagEnd, content ],
[ '/>', tagEndClose, content ],
[ '\r?\n|\r', addNewline ],
[ '[\t\f ]+', tagSpace ],
[ '=', attributeAssign, afterAssign ],
[ '/+(?!>)', tagSpace, beforeAtt ],
[ ATTNAME, attributeName ]],
afterAssign: [
[ '\r?\n|\r', addNewline ],
[ '[\t\f ]+', tagSpace ],
[ '', tagSpace, attValue ]],
attValue: [ // the afterAssign state has already consumed any whitespace
[ '>' , tagEnd, content ],
[ '"' , attributeValueStart, doubleQuoted ],
[ "'" , attributeValueStart, singleQuoted ],
[ '', attributeValueStart, unquoted ]],
unquoted: [
[ ATT_UNQUOT, attributeValueData ],
[ UNQUOT_END, attributeValueEnd, beforeAtt ],
[ '', attributeValueData, charRefIn ]],
doubleQuoted: [
[ '\r?\n|\r', addNewline ],
[ '[^\r\n"&]+', attributeValueData ],
[ '"', attributeValueEnd, beforeAtt ],
[ '', attributeValueData, charRefIn ]],
singleQuoted: [
[ '\r?\n|\r', addNewline ],
[ "[^\r\n'&]+", attributeValueData ],
[ "'", attributeValueEnd, beforeAtt ],
[ '', attributeValueData, charRefIn ]],
bogusComment: [
[ '\r?\n|\r', addNewline ],
[ '[^\r\n>]+', commentData, bogusComment ],
[ '>', commentEndBogus, content ]],
commentStart: [
[ '-?>', commentEnd, content ],
[ '--!?>', commentEnd, content ],
[ '--!', commentData, comment ],
[ '--?', commentData, comment ],
[ '[^>-][^-]*', commentData, comment ]],
comment: [
[ '--!?>', commentEnd, content ],
[ '--!' , commentData ],
[ '--?' , commentData ],
[ '\r?\n|\r', addNewline ],
[ '[^\r\n-]+', commentData ]]
}
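// Each rule above is a triple [ pattern, emit, goto ]: a regex source string,
// a token type (or a function computing one), and an optional next state (or
// a function computing one), defaulting to the current state. For example,
// the first 'data' rule tokenizes '<div' as a startTagStart token and then
// transfers control to the startTag handler below.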
// Additional state management, to
// supplement the grammar/state machine.
const content_map =
{ style: rawtext
, script: rawtext
, xmp: rawtext
, iframe: rawtext
, noembed: rawtext
, noframes: rawtext
, textarea: rcdata
, title: rcdata
, plaintext: plaintext
//, noscript: rawtext // if scripting is enabled in a UA
}
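// For illustration: after '<script>' the lexer continues in the rawtext
// state, so markup-like text is not tokenized until the matching end tag:
//
//   [...lexemes ('<script>a<b</script>')]
//   // yields rawtext chunks 'a' and '<b' rather than a startTagStart
//   // token for '<b'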
function startTag (_, chunk) {
// log ('startTag', this)
let tagName = chunk.substr (1)
this.tagName = tagName
this.content = tagName in content_map ? content_map[tagName] : data
return beforeAtt
}
function addNewline (_, chunk) {
this.lastnl = this.position + chunk.length
this.line++
return newline
}
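// For illustration: newline tokens drive the line/column bookkeeping;
// after consuming 'a\nbc' the lexer state reports line 2 and col 2.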
function content () {
return this.content
}
function context () {
return this.context
}
// From the spec;
// "If the character reference was consumed as part of an attribute,
// and the last character matched is not a U+003B SEMICOLON character (;),
// and the next input character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
// then, for historical reasons, flush code points consumed as a character reference and switch to the return state."
function legacyCharRefT (_, chunk) {
// NOTE this has changed after the runtime rewrite and is brittle:
// it is unclear when the position is updated.
const x = this.context, c = this.input [this.position + chunk.length]
if ((x === unquoted || x === doubleQuoted || x === singleQuoted) && chunk.substr(-1) !== ';' && /[a-zA-Z0-9=]/.test(c)) {
return attributeValueData
}
return charRefNamed
}
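// For illustration, per the spec rule quoted above: in '<a x="&not=1">' the
// chunk '&not' is followed by '=' and thus emitted as plain
// attributeValueData, whereas in '<a x="&not;1">' it is a charRefNamed token.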
function maybeEndTagT (_, chunk) {
if (chunk.substr (2) === this.tagName) {
this.content = data
return endTagStart
}
else return this.content // TODO careful, this is a token type, not a state!
}
function maybeEndTag (symbol, chunk) {
if (chunk.substr (2) === this.tagName) {
this.content = data
return beforeAtt
}
else return symbol
}
function charRefIn (symbol, chunk) {
this.context = symbol
return charRef
}
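// For illustration: upon '&', the zero-width charRefIn rules save the
// current state in this.context and detour into the charRef state; each
// charRef rule then returns to the saved state via the context handler.
// So lexing 'a&amp;b' yields data chunks 'a' and 'b' around a single
// charRefNamed token '&amp;' (plus zero-length bookkeeping data chunks).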
// Lexer
// -----
const _compiled =
compile (grammar)
class LexerState {
constructor (input) {
this.symbol = data // the current lexer state
this.content = data // one of { data, rcdata, rawtext, plaintext }
this.context = data // one of { data, rcdata, unquoted, doubleQuoted, singleQuoted }
this.position = 0
this.line = 1
this.lastnl = 0
define (this, 'tagName', { value:'', enumerable:false, writable:true }) // hidden; the name of the last seen start tag
define (this, 'input', { value:String (input) }) // hidden
}
get col () {
return this.position - this.lastnl
}
*tokens () {
do {
const { input, position, symbol } = this
const { regex, edges } = _compiled [symbol]
const match = (regex.lastIndex = position, regex.exec (input))
if (!match) return
let i = 1; while (match [i] == null) i++
const edge = edges [i-1]
const token = edge.emit.call (this, symbol, match[i])
this.symbol = edge.goto.call (this, symbol, match[i])
this.position = regex.lastIndex
yield token
}
while (this.position <= this.input.length)
}
}
function lexemes (input) {
const lexer = new LexerState (input)
const stream = lexer.tokens ()
stream.state = lexer
return stream
}
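// A minimal usage sketch (not executed here), with the zero-length
// bookkeeping chunks elided from the output:
//
//   for (const [type, chunk] of lexemes ('<a href=x>Hi</a>'))
//     log (type, JSON.stringify (chunk))
//
//   // startTagStart "<a", tagSpace " ", attributeName "href",
//   // attributeAssign "=", attributeValueData "x", tagEnd ">",
//   // data "Hi", endTagStart "</a", tagEnd ">"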
// The grammar compiler
// --------------------
function State (table, name) {
this.name = name
this.regex = new RegExp ('(' + table.map (fst) .join (')|(') + ')', 'sgy')
this.edges = table.map (compileRow (name))
}
function compile (grammar) {
const compiled = {}
for (let state_name in grammar)
compiled [state_name] = new State (grammar [state_name], state_name)
return compiled
}
function fst (row) { return row [0] }
function compileRow (symbol) {
return (row) => {
const [r = '.{0}', emit, goto = symbol] = row
const g = typeof goto === 'function' ? goto : (symbol, data) => goto
const e = typeof emit === 'function' ? wrapEmit (emit) : (symbol, data) => [emit, data]
return { emit:e, goto:g }
}
}
function wrapEmit (fn) {
return function (type, data) {
return [fn.call (this, type, data), data]
}
}
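// For illustration: compiling a one-state grammar such as
//
//   { main: [ ['a+', 'letters'], ['[0-9]+', 'digits'] ] }
//
// produces a State whose regex is /(a+)|([0-9]+)/sgy and whose edges hold
// the wrapped emit and goto functions; the index of the matched group then
// selects the corresponding edge in the tokens () loop above.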
// Exports
// -------
export { lexemes, lexemes as chunks, tokenTypes }