UNPKG

html-lexer

Version:

An HTML5 lexer

github.com/alwinb/html-lexer

alwinb/html-lexer

255 lines (208 loc) • 8.6 kB

JavaScript

const log = console.log.bind (console)
const { defineProperties:define } = Object

// Imports - DFA
// -------------

import DFA from './dfa.mjs'
const { eqClass, defaultClass, tokens:T, states:S, initialState, table, minAccepts } = DFA
const FAIL = 0
const errorToken = 0

// TokenTypes
// ----------

// This maps the DFA tokenTypes from ints to strings;
// It renames some of the token-types to maintain some
// compatibility with previous versions of html-lexer.

const names = []
for (const k in T) names[T[k]] = k

names [T.unquoted]       = 'attributeValueData'
names [T.quoted]         = 'attributeValueData'
names [T.squoted]        = 'attributeValueData'
names [T.attributeSep]   = 'tagSpace'
names [T.valueStartApos] = 'attributeValueStart'
names [T.valueStartQuot] = 'attributeValueStart'
names [T.valueEnd]       = 'attributeValueEnd'
names [T.bogusStart]     = 'commentStartBogus'
names [T.bogusData]      = 'commentData'
names [T.bogusEnd]       = 'commentEndBogus'
names [T.lt]             = 'lessThanSign'
names [T.ampersand]      = 'uncodedAmpersand'

// The public set of token-type names, as a name => name map.
// Array holes (if the DFA token ids are not contiguous) are skipped,
// so that no 'undefined' key ends up in the exported object.
const tokenTypes = {}
for (const x of names)
  if (x !== undefined) tokenTypes[x] = x

delete tokenTypes.errorToken
delete tokenTypes.mDeclStart // never emitted as such; see emit ()


// Lexer / Push Parser
// -------------------

/**
 * Incremental (push) HTML lexer. Must be called with `new`.
 *
 * Input is pushed in via `write (chunk)` and terminated with `end (chunk)`
 * (alias `parse`). Tokens are handed to the delegate as two-element arrays.
 *
 * @param {{ write: (token: [string, string]) => void, end: () => void }} delegate
 *   Receives `[tokenTypeName, text]` for every token, and an `end ()` call
 *   once the input has been closed.
 * @throws {SyntaxError} (from write/end) when the DFA yields the error token.
 */
function Lexer (delegate) {

  // State
  let buffer = ''
  let closed = false               // true after end() call
  let line = 1, lastnl = 0         // line counter; lastnl is a buffer index
  let counted = 0, prevChar = 0    // newline scan high-water mark, and the char before it
  let anchor = 0, end = 0, pos = 0 // lexer position
  let entry = S.Main               // lexer (entry) state-id
  let lastTagType = 0
  let lastStartTagName = ''

  // API
  // NB defineProperties defaults to non-enumerable, non-writable properties.
  return define (this, {
    position: { get: getPosition },
    write:    { value: write },
    end:      { value: writeEOF },
    parse:    { value: writeEOF }
  })

  // Public methods

  /**
   * Appends `input` to the internal buffer and emits every token that
   * is known to be complete. A token that reaches the end of the buffer
   * is held back until more input arrives, or until the lexer is closed.
   * @param {string} input
   */
  function write (input) {
    buffer += input
    const length = buffer.length
    while (pos < length) {
      let state = entry
      let exit = entry < minAccepts ? FAIL : entry
      do {
        const c = buffer.charCodeAt (pos++)
        state = table [state] [c <= 0x7a ? eqClass[c] : defaultClass]
        if (minAccepts <= state) { exit = state; end = pos }

        // Newline counter.
        // The scanner rewinds `pos` after each emit and when a token is
        // held back, so characters are visited more than once; only count
        // characters beyond the high-water mark. CR, LF and CRLF each
        // count as one line break.
        if (pos > counted) {
          if (c === 0xA) {
            if (prevChar !== 0xD) line++
            lastnl = pos
          }
          else if (c === 0xD) {
            line++
            lastnl = pos
          }
          prevChar = c
          counted = pos
        }
      } while (state && pos < length)

      if (end < length || closed)
        emit (table [exit] [0], anchor, end)

      else {
        // The token may continue in the next chunk: rewind and wait.
        pos = end = anchor
        break
      }
    }
    // Drop the consumed prefix and rebase all buffer-relative indices,
    // so that `column` stays correct across chunks.
    lastnl -= end
    counted -= end
    buffer = buffer.slice (end)
    anchor = pos = end = 0
  }

  /**
   * Closes the lexer: writes the (optional) final chunk, flushes all
   * remaining tokens and then calls `delegate.end ()`.
   * @param {string} [input]
   */
  function writeEOF (input = '') {
    closed = true
    write (input)
    delegate.end ()
  }

  // Private methods

  /**
   * @returns {{ line: number, column: number }} the current scan position.
   * NOTE(review): `line` tracks the furthest character scanned whereas
   * `column` is computed from `pos`, so the two may briefly disagree
   * while the scanner is re-reading lookahead characters.
   */
  function getPosition () {
    return { line, column: pos-lastnl }
  }

  /**
   * Hands the token `buffer [anchor, end_)` to the delegate, selects the
   * entry state for the next token and advances `anchor` and `pos` to `end_`.
   * @param {number} type - DFA token-type id.
   * @param {number} anchor_ - start index (unused; the closure's `anchor` is read).
   * @param {number} end_ - end index (exclusive) of the token.
   * @throws {SyntaxError} if `type` is the error token.
   */
  function emit (type, anchor_, end_) {
    // log ('emit', {buffer, l:buffer.length, anchor_, end_, closed })

    switch (type) {

      case errorToken: {
        const message = `Lexer error at line ${line}:${pos-lastnl}`
        throw new SyntaxError (message)
      }

      case T.startTagStart: {
        const tagName = buffer.substring (anchor+1, end_)
        lastTagType = type
        lastStartTagName = tagName.toLowerCase ()
        delegate.write (['startTagStart', '<'])
        delegate.write (['tagName', tagName])
        entry = S.BeforeAttribute
        return anchor = pos = end_ // NB returns
      }

      case T.endTagStart: {
        const tagName = buffer.substring (anchor+2, end_)
        if (entry === S.Main || lastStartTagName === tagName.toLowerCase ()) {
          lastTagType = type
          entry = S.BeforeAttribute
          delegate.write (['endTagStart', '</'])
          delegate.write (['tagName', tagName])
        }
        else {
          // Outside of Main, an end tag that does not match the last
          // start tag is not a tag: emit it as plain rcdata / rawtext
          // and remain in the current content state.
          const dataType = entry === S.RcData ? T.rcdata : T.rawtext
          delegate.write ([names [dataType], buffer.substring (anchor, end_)])
        }
        return anchor = pos = end_ // NB returns
      }

      case T.mDeclStart: {
        // Markup declarations are reported as bogus comments.
        entry = S.Bogus
        delegate.write ([names[T.bogusStart], '<!'])
        delegate.write ([names[T.bogusData], buffer.substring (anchor+2, end_)])
        return anchor = pos = end_ // NB returns
      }

      case T.tagEnd: {
        const xmlIsh = false // needs the feedback // TODO support SVG / MathML
        entry = lastTagType === T.startTagStart && !xmlIsh
          ? contentMap [lastStartTagName] || S.Main
          : S.Main
        const ttype = buffer[end_ - 2] === '/' ? 'tagEndAutoclose' : 'tagEnd'
        delegate.write ([ttype, buffer.substring (anchor, end_)])
        return anchor = pos = end_ // NB returns
      }

      case T.charRefNamed:
      case T.charRefLegacy: {
        const nextChar = buffer[end_] // FIXME case at buffer end ? need to back up...
        const parts = splitCharRef (buffer.substring (anchor, end_), entry, nextChar)
        for (const item of parts) delegate.write (item)
        return anchor = pos = end_ // NB returns
      }

      // The remaining token types only select the next entry state
      // and are then emitted by the generic code below.
      case T.attributeSep:    entry = S.BeforeAttribute;   break
      case T.attributeName:   entry = S.BeforeAssign;      break
      case T.attributeAssign: entry = S.BeforeValue;       break
      case T.valueStartQuot:  entry = S.ValueQuoted;       break
      case T.valueStartApos:  entry = S.ValueAposed;       break
      case T.valueEnd:        entry = S.BeforeAttribute;   break
      case T.unquoted:        entry = S.ValueUnquoted;     break
      case T.commentStart:    entry = S.BeforeCommentData; break
      case T.commentData:     entry = S.InCommentData;     break
      case T.commentEnd:      entry = S.Main;              break
      case T.bogusStart:      entry = S.Bogus;             break
      case T.bogusData:       entry = S.Bogus;             break
      case T.bogusEnd:        entry = S.Main;              break
      // case T.newline: entry = entry; break
    }

    const name = names [type]
    delegate.write ([name, buffer.substring (anchor, end_)])
    anchor = pos = end_
  }
}


// The contentMap defines the lexer state to use
// immediately _after_ specific html start-tags.
// It has a null prototype, so that tag names such as 'constructor'
// do not resolve to properties inherited from Object.prototype.

const contentMap = Object.assign (Object.create (null), {
  style: S.RawText,
  script: S.RawText,
  xmp: S.RawText,
  iframe: S.RawText,
  noembed: S.RawText,
  noframes: S.RawText,
  textarea: S.RcData,
  title: S.RcData,
  plaintext: S.PlainText
  // noscript: scriptingEnabled ? S.RawText : S.Main
})


// Legacy Character References
// ---------------------------

// Legacy character references are named character references that
// may occur without a terminating semicolon.

// `LEGACY` and `PREFIXED` result from preprocessing the table of all
// entity names in the HTML5 specification, specifically, by selecting
// 1. The names that may occur without a terminating semicolon.
// 2. Semicolon terminated names that have a legacy name as a prefix.

const LEGACY = /^&([AEIOUYaeiouy]?acute|[AEIOUaeiou](?:grave|circ|uml)|y?uml|[ANOano]tilde|[Aa]ring|[Oo]slash|[Cc]?cedil|brvbar|curren|divide|frac(?:12|14|34)|iquest|middot|plusmn|(?:AE|ae|sz)lig|[lr]aquo|iexcl|micro|pound|THORN|thorn|times|COPY|copy|cent|macr|nbsp|ord[fm]|para|QUOT|quot|sect|sup[123]|AMP|amp|ETH|eth|REG|reg|deg|not|shy|yen|GT|gt|LT|lt)(;|.*)$/ const PREFIXED = /^&(?:copysr|centerdot|divideontimes|[gl]t(?:quest|dot|cir|cc)|[gl]trPar|gtr(?:dot|less|eqqless|eqless|approx|arr|sim)|ltr(?:i|if|ie|mes)|ltlarr|lthree|notin(?:dot|E|v[abc])?|notni(?:v[abc])?|parallel|times(?:bar|d|b));$/ function splitCharRef (string, entry, nextChar) { // A semicolon-terminated, known charref if (PREFIXED.test (string)) return [['charRefNamed', string]] // Test legacy charrefs (terminated or nonterminated) const r = LEGACY.exec (string) const terminated = string[string.length-1] === ';' const dataTokenType = entry === S.Main ? 'data' : entry === S.RcData ? 'rcdata' : 'attributeValueData' // Not a special charref, nor one with trailing alphanums if (!r) return (terminated ? [['charRefNamed', string]] : [[dataTokenType, string]]) // A semicolon terminated legacy charref if (r[2] === ';') return [['charRefNamed', '&'+r[1]+';']] const inAttribute = entry === S.BeforeValue || entry === S.ValueQuoted || entry === S.ValueAposed || entry === S.ValueUnquoted // A nonterminated legacy charref (exact match) if (r[2] === '') return (!inAttribute || nextChar !== '=') ? [['charRefLegacy', string]] // And also a parse error : [[dataTokenType, string]] // A nonterminated legacy charref with trailing alphanums else return (!inAttribute) ? [['charRefLegacy', '&'+r[1]], [dataTokenType, r[2]]] : [[dataTokenType, string]] } // Exports // ------- export { DFA, tokenTypes, Lexer }