const log = console.log.bind (console)
const { defineProperty:define } = Object
const StringsInto = map => new Proxy ({}, { get:($,k) => (map [k] = k, k) })
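// An illustrative sketch (not executed here): destructuring from this proxy
// yields each requested key back as its own string value, and registers it
// in the given map as a side effect.
//
//   const demo = {}
//   const { foo, bar } = StringsInto (demo)
//   // foo === 'foo', bar === 'bar', and demo is { foo:'foo', bar:'bar' }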
// A tiny-lexer based tokenizer for HTML5
// =======================================
// ### The tokens
const tokenTypes = {}, states = {}
const T = tokenTypes
const {
attributeName,
attributeAssign,
attributeValueStart,
attributeValueData,
attributeValueEnd,
tagSpace, // whitespace and/or slashes within tags
commentStart,
commentStartBogus,
commentData,
commentEnd,
commentEndBogus,
startTagStart,
endTagStart,
tagEnd,
tagEndClose,
charRefDecimal,
charRefHex,
charRefNamed,
unescaped,
data,
space,
newline,
rcdata,
rawtext,
plaintext,
} = StringsInto (tokenTypes)
// Some of the token types double as states.
// The additional states are as follows.
const {
beforeAtt,
bogusComment,
afterAttName,
afterAssign,
attValue,
comment,
doubleQuoted,
singleQuoted,
unquoted,
charRef
} = StringsInto (states)
// ### The grammar
const STARTTAG_START = '<[a-zA-Z][^>/\t\r\n\f ]*'
const ENDTAG_START = '</[a-zA-Z][^>/\t\r\n\f ]*'
const CHARREF_DEC = '&#[0-9]+;?'
const CHARREF_HEX = '&#[xX][0-9A-Fa-f]+;?'
const CHARREF_NAMED = '&[A-Za-z][A-Za-z0-9]*;'
const ATTNAME = '.[^>/\t\r\n\f =]*' /* '[^>/\t\n\f ][^>/\t\n\f =]*' */
const ATT_UNQUOT = '[^&>\t\r\n\f ]+'
const UNQUOT_END = '(?=[>\t\r\n\f ])'
const DOCTYPE_START = '<![Dd][Oo][Cc][Tt][Yy][Pp][Ee]'
// The two patterns below are generated by preprocessing the list of named
// character references. Legacy charrefs may occur without a terminating
// semicolon, but not as a prefix of a known named reference; CHARREF_CONTD
// matches the references that have a legacy ref as a prefix.
const CHARREF_CONTD = '&(?:copysr|centerdot|divideontimes|[gl]t(?:quest|dot|cir|cc)|[gl]trPar|gtr(?:dot|less|eqqless|eqless|approx|arr|sim)|ltr(?:i|if|ie|mes)|ltlarr|lthree|notin(?:dot|E|v[abc])?|notni(?:v[abc])?|parallel|times(?:bar|d|b));'
const CHARREF_LEGACY = '&(?:[AEIOUYaeiouy]?acute|[AEIOUaeiou](?:grave|circ|uml)|y?uml|[ANOano]tilde|[Aa]ring|[Oo]slash|[Cc]?cedil|brvbar|curren|divide|frac(?:12|14|34)|iquest|middot|plusmn|(?:AE|ae|sz)lig|[lr]aquo|iexcl|micro|pound|THORN|thorn|times|COPY|copy|cent|macr|nbsp|ord[fm]|para|QUOT|quot|sect|sup[123]|AMP|amp|ETH|eth|REG|reg|deg|not|shy|yen|GT|gt|LT|lt);?'
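// For illustration: CHARREF_LEGACY matches '&amp' and '&amp;' alike, whereas
// '&copysr' only matches together with its semicolon, via CHARREF_CONTD;
// without it, CHARREF_LEGACY picks up just the legacy prefix '&copy'.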
const grammar =
{ data: [
[ STARTTAG_START, startTagStart, startTag ],
[ ENDTAG_START, endTagStart, beforeAtt ],
//[ DOCTYPE_START, doctype_start, 'beforeName' ], // before doctype name
[ '<!--', commentStart, commentStart ],
[ '<[/!?]', commentStartBogus, bogusComment ],
[ '[\t\f ]+', space ],
[ '\r?\n|\r', addNewline ],
[ '[^<&\t\f\r\n ]+', data ],
[ '<', unescaped ],
[ '', data, charRefIn ]],
rawtext: [
[ ENDTAG_START, maybeEndTagT, maybeEndTag ],
[ '\r?\n|\r', addNewline ],
[ '.[^<\r\n]*', rawtext ]],
rcdata: [
[ ENDTAG_START, maybeEndTagT, maybeEndTag ],
[ '<', unescaped ],
[ '[^<&\r\n]+', rcdata ],
[ '\r?\n|\r', addNewline ],
[ '', rcdata, charRefIn ]],
plaintext: [
[ '\r?\n|\r', addNewline ],
[ '.[^\r\n]*', plaintext ]],
charRef: [
[ CHARREF_DEC, charRefDecimal, context ],
[ CHARREF_HEX, charRefHex, context ],
[ CHARREF_CONTD, charRefNamed, context ],
[ CHARREF_LEGACY, legacyCharRefT, context ],
[ CHARREF_NAMED, charRefNamed, context ],
[ '&', unescaped, context ]],
beforeAtt: [
[ '>', tagEnd, content ],
[ '/>', tagEndClose, content ],
[ '\r?\n|\r', addNewline ],
[ '[\t\f ]+', tagSpace ],
[ '/+(?!>)', tagSpace ],
[ ATTNAME, attributeName, afterAttName ]],
afterAttName: [
[ '>', tagEnd, content ],
[ '/>', tagEndClose, content ],
[ '\r?\n|\r', addNewline ],
[ '[\t\f ]+', tagSpace ],
[ '=', attributeAssign, afterAssign ],
[ '/+(?!>)', tagSpace, beforeAtt ],
[ ATTNAME, attributeName ]],
afterAssign: [
[ '\r?\n|\r', addNewline ],
[ '[\t\f ]+', tagSpace ],
[ '', tagSpace, attValue ]],
attValue: [ // the afterAssign state has already consumed any whitespace
[ '>' , tagEnd, content ],
[ '"' , attributeValueStart, doubleQuoted ],
[ "'" , attributeValueStart, singleQuoted ],
[ '', attributeValueStart, unquoted ]],
unquoted: [
[ ATT_UNQUOT, attributeValueData ],
[ UNQUOT_END, attributeValueEnd, beforeAtt ],
[ '', attributeValueData, charRefIn ]],
doubleQuoted: [
[ '\r?\n|\r', addNewline ],
[ '[^\r\n"&]+', attributeValueData ],
[ '"', attributeValueEnd, beforeAtt ],
[ '', attributeValueData, charRefIn ]],
singleQuoted: [
[ '\r?\n|\r', addNewline ],
[ "[^\r\n'&]+", attributeValueData ],
[ "'", attributeValueEnd, beforeAtt ],
[ '', attributeValueData, charRefIn ]],
bogusComment: [
[ '\r?\n|\r', addNewline ],
[ '[^\r\n>]+', commentData, bogusComment ],
[ '>', commentEndBogus, content ]],
commentStart: [
[ '-?>', commentEnd, content ],
[ '--!?>', commentEnd, content ],
[ '--!', commentData, comment ],
[ '--?', commentData, comment ],
[ '[^>-][^-]*', commentData, comment ]],
comment: [
[ '--!?>', commentEnd, content ],
[ '--!' , commentData ],
[ '--?' , commentData ],
[ '\r?\n|\r', addNewline ],
[ '[^\r\n-]+', commentData ]]
}
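// Each rule above is a triple [ pattern, emit, goto ]: a regex source string,
// a token type (or a function computing one), and an optional next state (or
// a function computing one), defaulting to the current state. For example,
// the first 'data' rule tokenizes '<div' as a startTagStart token and then
// transfers control to the startTag handler below.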
// Additional state management, to
// supplement the grammar/state machine.
const content_map =
{ style: rawtext
, script: rawtext
, xmp: rawtext
, iframe: rawtext
, noembed: rawtext
, noframes: rawtext
, textarea: rcdata
, title: rcdata
, plaintext: plaintext
//, noscript: rawtext // if scripting is enabled in a UA
}
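// For illustration: after '<script>' the lexer continues in the rawtext
// state, so markup-like text is not tokenized until the matching end tag:
//
//   [...lexemes ('<script>a<b</script>')]
//   // yields rawtext chunks 'a' and '<b' rather than a startTagStart
//   // token for '<b'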
function startTag (_, chunk) {
// log ('startTag', this)
let tagName = chunk.substr (1)
this.tagName = tagName
this.content = tagName in content_map ? content_map[tagName] : data
return beforeAtt
}
function addNewline (_, chunk) {
this.lastnl = this.position + chunk.length
this.line++
return newline
}
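// For illustration: newline tokens drive the line/column bookkeeping;
// after consuming 'a\nbc' the lexer state reports line 2 and col 2.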
function content () {
return this.content
}
function context () {
return this.context
}
// From the spec;
// "If the character reference was consumed as part of an attribute,
// and the last character matched is not a U+003B SEMICOLON character (;),
// and the next input character is either a U+003D EQUALS SIGN character (=) or an ASCII alphanumeric,
// then, for historical reasons, flush code points consumed as a character reference and switch to the return state."
function legacyCharRefT (_, chunk) {
// NOTE this has changed after the runtime rewrite and is brittle:
// it is unclear when the position is updated.
const x = this.context, c = this.input [this.position + chunk.length]
if ((x === unquoted || x === doubleQuoted || x === singleQuoted) && chunk.substr(-1) !== ';' && /[a-zA-Z0-9=]/.test(c)) {
return attributeValueData
}
return charRefNamed
}
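// For illustration, per the spec rule quoted above: in '<a x="&not=1">' the
// chunk '&not' is followed by '=' and thus emitted as plain
// attributeValueData, whereas in '<a x="&not;1">' it is a charRefNamed token.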
function maybeEndTagT (_, chunk) {
if (chunk.substr (2) === this.tagName) {
this.content = data
return endTagStart
}
else return this.content // TODO careful, this is a token type, not a state!
}
function maybeEndTag (symbol, chunk) {
if (chunk.substr (2) === this.tagName) {
this.content = data
return beforeAtt
}
else return symbol
}
function charRefIn (symbol, chunk) {
this.context = symbol
return charRef
}
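// For illustration: upon '&', the zero-width charRefIn rules save the
// current state in this.context and detour into the charRef state; each
// charRef rule then returns to the saved state via the context handler.
// So lexing 'a&amp;b' yields data chunks 'a' and 'b' around a single
// charRefNamed token '&amp;' (plus zero-length bookkeeping data chunks).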
// Lexer
// -----
const _compiled =
compile (grammar)
class LexerState {
constructor (input) {
this.symbol = data // the current lexer state
this.content = data // one of { data, rcdata, rawtext, plaintext }
this.context = data // one of { data, rcdata, unquoted, doubleQuoted, singleQuoted }
this.position = 0
this.line = 1
this.lastnl = 0
define (this, 'tagName', { value:'', enumerable:false, writable:true }) // hidden; the name of the last seen start tag
define (this, 'input', { value:String (input) }) // hidden
}
get col () {
return this.position - this.lastnl
}
*tokens () {
do {
const { input, position, symbol } = this
const { regex, edges } = _compiled [symbol]
const match = (regex.lastIndex = position, regex.exec (input))
if (!match) return
let i = 1; while (match [i] == null) i++
const edge = edges [i-1]
const token = edge.emit.call (this, symbol, match[i])
this.symbol = edge.goto.call (this, symbol, match[i])
this.position = regex.lastIndex
yield token
}
while (this.position <= this.input.length)
}
}
function lexemes (input) {
const lexer = new LexerState (input)
const stream = lexer.tokens ()
stream.state = lexer
return stream
}
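// A minimal usage sketch (not executed here), with the zero-length
// bookkeeping chunks elided from the output:
//
//   for (const [type, chunk] of lexemes ('<a href=x>Hi</a>'))
//     log (type, JSON.stringify (chunk))
//
//   // startTagStart "<a", tagSpace " ", attributeName "href",
//   // attributeAssign "=", attributeValueData "x", tagEnd ">",
//   // data "Hi", endTagStart "</a", tagEnd ">"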
// The grammar compiler
// --------------------
function State (table, name) {
this.name = name
this.regex = new RegExp ('(' + table.map (fst) .join (')|(') + ')', 'sgy')
this.edges = table.map (compileRow (name))
}
function compile (grammar) {
const compiled = {}
for (let state_name in grammar)
compiled [state_name] = new State (grammar [state_name], state_name)
return compiled
}
function fst (row) { return row [0] }
function compileRow (symbol) {
return (row) => {
const [r = '.{0}', emit, goto = symbol] = row
const g = typeof goto === 'function' ? goto : (symbol, data) => goto
const e = typeof emit === 'function' ? wrapEmit (emit) : (symbol, data) => [emit, data]
return { emit:e, goto:g }
}
}
function wrapEmit (fn) {
return function (type, data) {
return [fn.call (this, type, data), data]
}
}
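// For illustration: compiling a one-state grammar such as
//
//   { main: [ ['a+', 'letters'], ['[0-9]+', 'digits'] ] }
//
// produces a State whose regex is /(a+)|([0-9]+)/sgy and whose edges hold
// the wrapped emit and goto functions; the index of the matched group then
// selects the corresponding edge in the tokens () loop above.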
// Exports
// -------
export { lexemes, lexemes as chunks, tokenTypes }