// html-lexer
// Version:
// An HTML5 lexer
// 272 lines (241 loc) • 10.9 kB
// JavaScript
const samples =
[ '\n'
, '<h2>Legacy Named Character References</h2>'
, '<xmp title=& >&</xmp> <span>&</span>'
, '<xmp title=& >&</xmp> <span>&</span>'
, '<xmp title=&o >&o</xmp> <span>&o</span>'
, '<xmp title=&* >&*</xmp> <span>&*</span>'
, '<xmp title="& " >& </xmp> <span>& </span>'
, '<xmp title=&= >&=</xmp> <span>&=</span>'
, '<xmp title=¬in >¬in</xmp> <span>¬in</span>'
, '<xmp title=¬it >¬it</xmp> <span>¬it</span>'
, '<xmp title=¬ina >¬ina</xmp> <span>¬ina</span>'
, '<xmp title=¬ita >¬ita</xmp> <span>¬ita</span>'
, '<xmp title=∉ >∉</xmp> <span>∉</span>'
, '<xmp title=¬it; >¬it;</xmp> <span>¬it;</span>'
, '<xmp title=∉a >∉a</xmp> <span>∉a</span>'
, '<xmp title=¬it;a >¬it;a</xmp> <span>¬it;a</span>'
, '<xmp title=¬in= >¬in=</xmp> <span>¬in=</span>'
, '<xmp title=¬it= >¬it=</xmp> <span>¬it=</span>'
, '<xmp title=∉= >∉=</xmp> <span>∉=</span>'
, '<xmp title=¬it;= >¬it;=</xmp> <span>¬it;=</span>' // REVIEW (¬it; is not a named charref; but neither is &foo;)
, '<xmp title=&foo;= >&foo;=</xmp> <span>&foo;=</span>'
, '<xmp title=&foo; >&foo;</xmp> <span>&foo;</span>'
, '<a href="/foo&=a&¬in¬it&∉¬it;">Link</a>'
, '\n'
, '<h2>Legacy character references in rcdata</h2>'
, `<textarea>Test &foo`
, `<textarea>Test ¬it;foo`
, `<textarea>Test ∉foo`
, '\n'
, '<h2>Named Character References</h2>'
, 'charref: uncoded ampersand & in data'
, 'charref: <span title="uncoded ampersand & in attribute">'
, 'charref: named & in data'
, 'charref: named non-terminated & in data'
, 'charref: named non-terminated &a in data'
, 'charref: hexadecimal ೌ in data'
, 'charref: hexadecimal non-terminated ೌ in data'
, 'charref: decimal ф in data'
, 'charref: decimal non-terminated n in data'
, 'charref: special <input value=asda¬*=c></input>'
, 'charref: special <input value=asda¬=c></input>'
, 'charref: special <input value="asda¬it; I tell you"></input>'
, 'charref: non-special <input value=asda¬in*=c></input>'
, 'charref: non-special <input value=asda¬in=c></input>'
, 'charref: non-special <input value=asda∉=c></input>'
, 'charref: special ¬*=c in data'
, 'charref: special ¬=c in data'
, 'charref: special ¬it; I tell you, in data'
, 'charref: special ∉ I tell you, in data'
, 'charref: non-special ¬in*=c in data'
, 'charref: non-special ¬in=c in data'
, 'charref: non-special ∉=c in data'
, 'charref: named <input value="you & me"/> in attribute'
, 'charref: named <input value=\'you & me\'/> in attribute'
, 'charref: named <input value=youme /> in attribute'
, 'charref: named <input value=&me /> in attribute'
, 'charref: named <input value=& attr=val /> in attribute'
, 'charref: named <input value=&o attr=val /> in attribute'
, 'charref: bogus <input value="you &# am me"/> in attribute'
, 'charref: bogus <input value=\'you &# amp me\'/> in attribute'
, 'charref: bogus <input value=you&x ampme /> in attribute'
, 'charref: ampHash &# such'
, '\n'
, 'rcdata <textarea> asdf & & <textareaNot </textarea> and more'
, 'rcdata2 <textarea> asdf & & </textarea( and not ending> it'
, 'rcdata3 <textarea> asdf & & </textarea/ and ending> it, see <span>yes</span>'
, 'rcdata5 <textarea/> asdf & & and NOT ending < it, see <span>yes</span>'
, 'rcdata4 <textarea> asdf & & </textarea and ending> it'
, 'rawtext <script> asdf & <span> </scriptNot </script> and more'
, 'rawtext2 <script> asdf & <span> </script( and> not ending it <span>'
, 'rawtext3 <script> asdf & <span> </script/ and> ending it <span> see'
, 'rawtext4 <script> asdf & <span> </script and ending it <span> see'
, 'script hello <script><!-- asdf</script> thus'
, 'nonalpha tag This is not a <ém attr>tag</ém>'
, 'double open tag A double less than sign <<div attr>content</div>'
, 'bad end tag <div style=color:blue> This is blue </ div> And this too!'
, 'closePlaintext hi <plaintext>asd<as &ap, </plaintext> cannot be ended'
, '\n'
, 'comment: and such'
, 'comment: <!> and such'
, 'comment: <?> and such'
, 'comment: </> and such'
, 'comment: <!-> and such'
, 'comment: <?-> and such'
, 'comment: <!-> and such'
, 'comment: <!--> and such'
, 'comment: <?--> and such'
, 'comment: <!--> and such'
, 'comment: <!--!> part of the comment --> and such'
, 'comment: <!---!> part of the comment --> and such'
, 'comment: <!----!> and such'
, 'comment: <!-> and such'
, 'comment: <!-- with -> within --> and subsequent data'
, 'comment: <!-- with bogus end -> part of the comment --> and subsequent data'
, 'comment: <!-- Comment with -- double dash within --> and subsequent data'
, 'comment: <!-- Comment with --!- weird stuff within --> and subsequent data'
, 'comment: <!-- Comment with strange end --!> and subsequent data'
, 'bogus comment: <! with end !@> and subsequent data'
, 'bogus comment: </ with end !@> and subsequent data'
, 'bogus comment: <? with end !@> and subsequent data'
, 'bogus comment: <!- with end -> and subsequent data'
, '\n'
, '<!doctype foo>'
, `<!ba>`
, `<! xos >`
, '\n'
, 'missing space attribues connected <div name="a"name="b" >'
, 'nonalpha attribute weird template tag <div {name="a" name="b" >'
, 'normalHtml This is <span class = "s1">html</span> Yeah!'
, 'unescaped ampersand data & such'
, 'unescaped ampersand Hash data &# such'
, 'unescaped ampersand HashEx data &#x such'
, 'unescaped ampersand HashExZed data &#xz such'
, '\n'
, 'slashes: <span/>'
, 'slashes: <span name=foo//>'
, 'slashes: <div//>'
, 'slashes: <div/foo/bar//>'
, 'slashes: <span//>'
, 'slashes: <span />'
, 'slashes: <span <>'
, 'slashes: <span //>'
, 'slashes: <span / />'
, 'slashes: <span/////>'
, 'slashes: <span/////name////=/blabla>'
, 'slashes: <span / attr >foo bar</span>'
, 'slashes: <span name=/ >asdf'
, 'slashes: <span name=/>asdf'
, 'slashes: <span name=// />asdf'
, 'slashes: <span name= / />asdf'
, '\n'
, 'weirdEquals <span attr = / asd >content</span>'
, 'weirdEquals2 <span attr = @ asd >content</span>'
, 'weirdEquals3 <span attr /= asd >content</span>'
, 'weirdEquals4 <span attr @= asd >content</span>'
, 'missingValue <span name=>asdf'
, 'invalidAttributeValue1 <div class= =at >'
, 'invalidAttributeValue2 <div class= <at >'
, 'invalidAttributeValue3 <div class= `at >'
]
// EOF samples
// -----------
// Each entry starts with a label (mostly a state name followed by "state")
// and then a fragment that stops abruptly. Presumably the input is meant to
// end while the lexer is in the labelled state; confirm against the lexer,
// which is not visible in this file.
//
// NOTE(review): the 'hexDigits' and 'decimalCharRef' entries end right after
// "data " with nothing visible following; possibly a character reference was
// lost upstream. Verify against the canonical copy of this file.
const EOFSamples = [
  'data state eof in da',
  'tagOpen state eof in <',
  'tagName state eof in <d',
  'selfClosingStartTag state in <div /',
  'endTagOpen state in </a',
  'beforeAttributeName state <div ',
  'attributeName state <div at',
  'afterAttributeName state <div attr ',
  'beforeAttributeValue state <div attr =',
  'attributeValueDoubleQuoted state <div attr="te',
  'attributeValueSingleQuoted state <div attr=\'te',
  'attributeValueUnquoted state <div attr=te',
  'afterAttributeValueQuoted state <div attr="test"',
  'markupDeclarationOpen state a markup decl <!',
  'selfClosingTag state An eof after a / <span /',
  'commentStart state a comment start <!--',
  'commentStartDash state a comment start dash <!---',
  'comment state a comment <!-- hello th',
  'commentEndDash state a comment end dash <!-- hello th -',
  'commentEnd state a comment end <!-- hello th --',
  'commentEndBang state a comment end bang <!-- hello th --!',
  'bogusComment state <! bogus comment',
  'charRefIn_ state data &',
  'numericCharRef state data &#',
  'hexadecimalCharRef state data &#x',
  'hexDigits state data ',
  'decimalCharRef state data ',
  'namedCharRef state data &name',
  'namedCharRefInAttr state <span attr="asd&a&b c">text</span>',
  'namedCharRefInData state named charref in data asd&a&b cde',
  'rawtext state eof in raw text <script> funct',
  'plaintext state eof in raw text <plaintext> asdf',
  'rawtextLessThanSign state eof in raw text less than sign <script> if (i<',
  'rawtextEndTagOpen state eof in raw text end tag open <script> asdf </',
]
// Samples, second set
// -------------------
// Further fragments: a handful of miscellaneous tags, then the same
// "Test & Line1 … Line5" text repeated in different contexts, first with
// newline characters (\n, \r\r, \r\n) and then with NUL characters (\0)
// between the "lines".
const samples2 = [
  '<span a=& b>',
  '<table><input type=hidden type=still-hidden>',

  // Disabled samples, kept as in the original:
  // '</ tttt>',
  // '<table><input type = hidden /// / type= still-hidden&foo >foo',
  // '<script type=hidden ///type=still-hidden&foo >foo</x>bae',
  // '<!doctype script type = hidden ///type= still-hidden&foo >foo</x>bae',
  // '<!--> <!---> <!-----> bae',
  // `<test val = unq&ed bar="foo" bee='buzz'> bae`,
  // `<plaintext = unq&ed bar="foo" bee='buzz'> baeasas </plaintext > `,
  // `<test val = unq&ed b // >`,

  '<script a =\n b>foo bar </script>',
  '<h1>Hello, World</h1>',
  '<!namas >',
  '<foo/>',

  // Newline characters
  // ------------------

  // ... in data
  `Test & Line1\nLine2\r\rLine4\r\nLine5`,
  `Test & Line1 \nLine2 \r\r Line4 \r\nLine5`,

  // ... in rcdata (<textarea>)
  `<textarea>Test & Line1\nLine2\r\rLine4\r\nLine5`,
  `<textarea>Test & Line1<\nLine2<\r\rLine4<\r\nLine5`,
  `<textarea>Test & Line1</\nLine2</\r\rLine4</\r\nLine5`,

  // ... in double- and single-quoted attribute values
  `<div title="Test & Line1\nLine2\r\rLine4\r\nLine5" foo >`,
  `<div title='Test & Line1\nLine2\r\rLine4\r\nLine5' foo >`,
  `<div title="Test & Line1 \nLine2 \r\rLine4 \r\nLine5" foo >`,
  `<div title='Test & Line1 \nLine2 \r\rLine4 \r\nLine5' foo >`,

  // ... in rawtext (<style>)
  `<style>Test & Line1\nLine2\r\rLine4\r\nLine5`,
  `<style>Test & Line1<\nLine2<\r\rLine4<\r\nLine5`,
  `<style>Test & Line1</\nLine2</\r\rLine4</\r\nLine5`,

  // ... in comments (`<!--` and `<!?` openers)
  `<!-- Test & Line1\nLine2\r\rLine4\r\nLine5`,
  `<!? Test & Line1\nLine2\r\rLine4\r\nLine5`,

  // ... after <plaintext> (original note: "No newlines in plaintext then")
  `<plaintext>Test & Line1\nLine2\r\rLine4\r\nLine5`,

  // NUL characters
  // --------------

  // ... in data
  `Test & Line1\0Line2\0\0Line4\0\0Line5`,
  `Test & Line1 \0Line2 \0\0 Line4 \0\0Line5`,

  // ... in rcdata (<textarea>)
  `<textarea>Test & Line1\0Line2\0\0Line4\0\0Line5`,
  `<textarea>Test & Line1<\0Line2<\0\0Line4<\0\0Line5`,
  `<textarea>Test & Line1</\0Line2</\0\0Line4</\0\0Line5`,

  // ... in double-quoted, single-quoted and unquoted attribute values
  `<div title="Test & Line1\0Line2\0\0Line4\0\0Line5" foo >`,
  `<div title='Test & Line1\0Line2\0\0Line4\0\0Line5' foo >`,
  `<div title=Line1\0Line2\0\0Line4\0\0Line5 foo >`,

  // ... in rawtext (<style>)
  `<style>Test & Line1\0Line2\0\0Line4\0\0Line5`,
  `<style>Test & Line1<\0Line2<\0\0Line4<\0\0Line5`,
  `<style>Test & Line1</\0Line2</\0\0Line4</\0\0Line5`,

  // ... in comments (`<!--` and `<!?` openers)
  `<!-- Test & Line1\0Line2\0\0Line4\0\0Line5`,
  `<!? Test & Line1\0Line2\0\0Line4\0\0Line5`,

  // ... after <plaintext> (original note: "No NULs in plaintext then")
  `<plaintext>Test & Line1\0Line2\0\0Line4\0\0Line5`,
]
// Exports
// -------
// Named exports of the three sample arrays declared in this module.
export { samples, samples2, EOFSamples }