/*
 * @dolphinweex/himalaya
 * Version:
 * HTML to JSON parser
 * 776 lines (752 loc) • 18.6 kB
 * JavaScript
 */
import test from 'ava'
import parser from '../lib/parser'
import lexer from '../lib/lexer'
/**
 * Builds the position record used in the expected trees below.
 * `line` is fixed at 0 and `column` mirrors `index`.
 *
 * @param {number} index - character offset to record
 * @returns {{index: number, line: number, column: number}}
 */
function ps (index) {
  const column = index
  return { index: index, line: 0, column: column }
}
// Options handed to lexer() by the tests in this file: an empty `childlessTags` list.
const lexerOptions = {
  childlessTags: []
}

// Baseline parser() options: every list/map is empty. Tests that need
// specific behavior pass their own options object instead.
const parserOptions = { voidTags: [], closingTags: [], closingTagAncestorBreakers: {} }
// Smoke test: one <h1> element wrapping a single text node.
test('parser() should return nodes', t => {
  const html = '<h1>Hello world</h1>'
  const actual = parser(lexer(html, lexerOptions), parserOptions)

  const greeting = {
    type: 'text',
    content: 'Hello world',
    position: { start: ps(4), end: ps(15) }
  }
  const heading = {
    type: 'element',
    tagName: 'h1',
    attributes: [],
    children: [greeting],
    // The element spans the whole input.
    position: { start: ps(0), end: ps(html.length) }
  }

  t.deepEqual(actual, [heading])
})
// The text following <img/> must be a sibling of the image inside <div>,
// not a child of the image.
test('parser() should not nest within void tags', t => {
  const str = '<div>abc<img/>def</div>'
  const tokens = lexer(str, lexerOptions)
  // Pass a complete options object with `voidTags` as a list of tag names,
  // matching the shape every other parser() call in this file uses
  // (a bare string 'img' is not a list of tag names).
  const nodes = parser(tokens, {
    voidTags: ['img'],
    closingTags: [],
    closingTagAncestorBreakers: {}
  })
  t.deepEqual(nodes, [
    {
      type: 'element',
      tagName: 'div',
      attributes: [],
      children: [
        {
          type: 'text',
          content: 'abc',
          position: {
            start: ps(5),
            end: ps(8)
          }
        },
        {
          type: 'element',
          tagName: 'img',
          attributes: [],
          // The image owns no children; 'def' follows it as a sibling.
          children: [],
          position: {
            start: ps(8),
            end: ps(14)
          }
        },
        {
          type: 'text',
          content: 'def',
          position: {
            start: ps(14),
            end: ps(17)
          }
        }
      ],
      position: {
        start: ps(0),
        end: ps(str.length)
      }
    }
  ])
})
// Tags listed in `closingTags` may be left unclosed; the expected trees show
// the earlier element ending where the next one starts.
test('parser() should handle optional-close tags', t => {
  // Case 1: a second <p> opens while the first is still open.
  {
    const html = '<p>This is one<p>This is two</p>'
    const options = {
      voidTags: [],
      closingTags: ['p'],
      closingTagAncestorBreakers: {}
    }
    const actual = parser(lexer(html, lexerOptions), options)

    const firstParagraph = {
      type: 'element',
      tagName: 'p',
      attributes: [],
      children: [
        {
          type: 'text',
          content: 'This is one',
          position: { start: ps(3), end: ps(14) }
        }
      ],
      position: { start: ps(0), end: ps(14) }
    }
    const secondParagraph = {
      type: 'element',
      tagName: 'p',
      attributes: [],
      children: [
        {
          type: 'text',
          content: 'This is two',
          position: { start: ps(17), end: ps(28) }
        }
      ],
      position: { start: ps(14), end: ps(html.length) }
    }

    t.deepEqual(actual, [firstParagraph, secondParagraph])
  }

  // Case 2: the second <p> opens inside an unclosed <span> nested in the
  // first <p>; both the span and the first paragraph end at offset 25.
  {
    const html = '<p>This is one <span>okay<p>This is two</p>'
    const options = {
      voidTags: [],
      closingTags: ['p', 'span'],
      closingTagAncestorBreakers: {}
    }
    const actual = parser(lexer(html, lexerOptions), options)

    const span = {
      type: 'element',
      tagName: 'span',
      attributes: [],
      children: [
        {
          type: 'text',
          content: 'okay',
          position: { start: ps(21), end: ps(25) }
        }
      ],
      position: { start: ps(15), end: ps(25) }
    }
    const firstParagraph = {
      type: 'element',
      tagName: 'p',
      attributes: [],
      children: [
        {
          type: 'text',
          content: 'This is one ',
          position: { start: ps(3), end: ps(15) }
        },
        span
      ],
      position: { start: ps(0), end: ps(25) }
    }
    const secondParagraph = {
      type: 'element',
      tagName: 'p',
      attributes: [],
      children: [
        {
          type: 'text',
          content: 'This is two',
          position: { start: ps(28), end: ps(39) }
        }
      ],
      position: { start: ps(25), end: ps(43) }
    }

    t.deepEqual(actual, [firstParagraph, secondParagraph])
  }
})
// <b> and <span> are never closed in the input; the expected tree ends both
// at offset 30, where </div> begins.
test('parser() should auto-close unmatched child tags', t => {
  const options = {
    voidTags: [],
    closingTags: [],
    closingTagAncestorBreakers: {}
  }
  const html = '<div>This is <b>one <span>okay</div>'
  const actual = parser(lexer(html, lexerOptions), options)

  const span = {
    type: 'element',
    tagName: 'span',
    attributes: [],
    position: { start: ps(20), end: ps(30) },
    children: [
      {
        type: 'text',
        content: 'okay',
        position: { start: ps(26), end: ps(30) }
      }
    ]
  }
  const bold = {
    type: 'element',
    tagName: 'b',
    attributes: [],
    position: { start: ps(13), end: ps(30) },
    children: [
      {
        type: 'text',
        content: 'one ',
        position: { start: ps(16), end: ps(20) }
      },
      span
    ]
  }
  const div = {
    type: 'element',
    tagName: 'div',
    attributes: [],
    position: { start: ps(0), end: ps(36) },
    children: [
      {
        type: 'text',
        content: 'This is ',
        position: { start: ps(5), end: ps(13) }
      },
      bold
    ]
  }

  t.deepEqual(actual, [div])
})
// An empty token list is expected to produce an empty node list.
test('parser() should handle empty token arrays', t => {
  const actual = parser([], parserOptions)
  t.deepEqual(actual, [])
})
// Attributes are expected as raw source strings, in source order, including
// the value-less `disabled`.
test('parser() should report the element attributes', t => {
  const html = '<div class="cake" data-key="abc" disabled></div>'
  const actual = parser(lexer(html, lexerOptions), parserOptions)

  const expectedAttributes = ['class="cake"', 'data-key="abc"', 'disabled']
  const div = {
    type: 'element',
    tagName: 'div',
    attributes: expectedAttributes,
    position: { start: ps(0), end: ps(48) },
    children: []
  }

  t.deepEqual(actual, [div])
})
// The input ends without </div>; both the element and its text child are
// expected to end at the end of the input.
test('parser() should handle unclosed elements', t => {
  const html = '<div>abc'
  const actual = parser(lexer(html, lexerOptions), parserOptions)

  const endOfInput = html.length
  const text = {
    type: 'text',
    content: 'abc',
    position: { start: ps(5), end: ps(endOfInput) }
  }
  const div = {
    type: 'element',
    tagName: 'div',
    attributes: [],
    position: { start: ps(0), end: ps(endOfInput) },
    children: [text]
  }

  t.deepEqual(actual, [div])
})
// The reported tagName must keep the exact casing used in the source.
test('parser() should preserve case-sensitive tag names', t => {
  const html = '<You-Know-8>'
  const actual = parser(lexer(html, lexerOptions), parserOptions)

  const customElement = {
    type: 'element',
    tagName: 'You-Know-8',
    attributes: [],
    position: { start: ps(0), end: ps(html.length) },
    children: []
  }

  t.deepEqual(actual, [customElement])
})
// </DIV> is expected to close <div>, so the trailing 'def' ends up as a
// top-level sibling rather than a child of the div.
test('parser() should match by case-insensitive tags', t => {
  const html = '<div>abc</DIV>def'
  const actual = parser(lexer(html, lexerOptions), parserOptions)

  const div = {
    type: 'element',
    tagName: 'div',
    attributes: [],
    position: { start: ps(0), end: ps(14) },
    children: [
      {
        type: 'text',
        content: 'abc',
        position: { start: ps(5), end: ps(8) }
      }
    ]
  }
  const trailingText = {
    type: 'text',
    content: 'def',
    position: { start: ps(14), end: ps(17) }
  }

  t.deepEqual(actual, [div, trailingText])
})
test('parser() should handle ancestor breaker special case (#39)', t => {
  /*
    Summary of the special case: a <ul|ol|menu> can appear inside an <li>.
    Normally an <li>, being a closing tag, rewinds the stack and auto-closes
    the previous <li>. But an <li> may legitimately contain a <ul|ol|menu>
    before it is closed, so the stack must not be rewound in that situation.
    The same edge case applies to <dt|dd> inside <dl>s.
  */

  // A fresh options object is built for every parse.
  const listOptions = () => ({
    voidTags: [],
    closingTags: ['li'],
    closingTagAncestorBreakers: {
      li: ['ul']
    }
  })

  // Local builders for the expected nodes.
  const text = (content, from, to) => ({
    type: 'text',
    content,
    position: { start: ps(from), end: ps(to) }
  })
  const element = (tagName, from, to, children) => ({
    type: 'element',
    tagName,
    attributes: [],
    position: { start: ps(from), end: ps(to) },
    children
  })

  // Case 1: a nested list directly inside an <li>.
  {
    const html = '<ul><li>abc<ul><li>def</li></ul></li></ul>'
    const actual = parser(lexer(html, lexerOptions), listOptions())
    t.deepEqual(actual, [
      element('ul', 0, 42, [
        element('li', 4, 37, [
          text('abc', 8, 11),
          element('ul', 11, 32, [
            element('li', 15, 27, [
              text('def', 19, 22)
            ])
          ])
        ])
      ])
    ])
  }

  // Case 2: a <span> sits between the nested <ul> and its <li>.
  {
    const html = '<ul><li>abc<ul><span><li>def</li></span></ul></li></ul>'
    const actual = parser(lexer(html, lexerOptions), listOptions())
    t.deepEqual(actual, [
      element('ul', 0, 55, [
        element('li', 4, 50, [
          text('abc', 8, 11),
          element('ul', 11, 45, [
            element('span', 15, 40, [
              element('li', 21, 33, [
                text('def', 25, 28)
              ])
            ])
          ])
        ])
      ])
    ])
  }

  // Case 3: inside the nested list the first <li> is left unclosed, so the
  // second <li> ends it at offset 22 without disturbing the outer <li>.
  {
    const html = '<ul><li>abc<ul><li>def<li>ghi</li></ul></li></ul>'
    const actual = parser(lexer(html, lexerOptions), listOptions())
    t.deepEqual(actual, [
      element('ul', 0, 49, [
        element('li', 4, 44, [
          text('abc', 8, 11),
          element('ul', 11, 39, [
            element('li', 15, 22, [
              text('def', 19, 22)
            ]),
            element('li', 22, 34, [
              text('ghi', 26, 29)
            ])
          ])
        ])
      ])
    ])
  }
})
// A table nested in a <td> of another table: with 'table' listed as the
// ancestor breaker for tbody/tr/td, the inner <tbody> is expected to nest
// inside the inner table instead of closing the outer <tbody>.
test('parser() should handle nested tables', t => {
  const html =
    '<table><tbody><tr><td><table><tbody><tr><td></td></tr></tbody></table></td></tr></tbody></table>'
  const tableOptions = {
    voidTags: [],
    closingTags: ['tbody'],
    closingTagAncestorBreakers: {
      tbody: ['table'],
      tr: ['table'],
      td: ['table']
    }
  }
  const actual = parser(lexer(html, lexerOptions), tableOptions)

  // Local builder: every expected node here is an attribute-less element.
  const element = (tagName, from, to, children) => ({
    type: 'element',
    tagName,
    attributes: [],
    position: { start: ps(from), end: ps(to) },
    children
  })

  const innerTable = element('table', 22, 70, [
    element('tbody', 29, 62, [
      element('tr', 36, 54, [
        element('td', 40, 49, [])
      ])
    ])
  ])
  const outerTable = element('table', 0, 96, [
    element('tbody', 7, 88, [
      element('tr', 14, 80, [
        element('td', 18, 75, [innerTable])
      ])
    ])
  ])

  t.deepEqual(actual, [outerTable])
})
test('parser() should ignore unnecessary closing tags', t => {
  /*
    The leading </i> has no matching opening tag. It should produce no
    node in the output and should not interfere with the stack.
  */
  const html = '</i>x'
  const actual = parser(lexer(html, lexerOptions), parserOptions)

  const onlyText = {
    type: 'text',
    content: 'x',
    position: { start: ps(4), end: ps(html.length) }
  }

  t.deepEqual(actual, [onlyText])
})