/**
 * micro-mdx-parser — parser tests.
 * A tiny parser to convert markdown or HTML into JSON.
 */
const { inspect } = require("util")
const { test } = require("uvu")
const assert = require("uvu/assert")
const { lexer } = require("./lexer")
const { parser } = require("./parser")
const { getTextBetweenChars } = require('./utils')
/**
 * Pretty-print a value to stdout at unlimited depth (debug helper for
 * inspecting parser output while developing these tests).
 * @param {*} obj - Any value to dump.
 */
function deepLog(obj) {
  const rendered = inspect(obj, { showHidden: false, depth: null, colors: true })
  console.log(rendered)
}
/**
 * Build a position stub for a character index. Every test input here is a
 * single line, so `line` is always 1 and `column` is simply `index + 1`.
 * @param {number} index - Zero-based character offset into the source string.
 * @returns {{index: number, line: number, column: number}}
 */
function ps(index) {
  const column = index + 1
  return { index, line: 1, column }
}
// Baseline lexer options shared by every test: no tags are treated as
// childless (raw-content) tags.
const lexerOptions = { childlessTags: [] }
// Baseline parser options: no void tags, no optional-close tags, and no
// ancestor-breaker rules. Individual tests override these where needed.
const parserOptions = {
  voidTags: [],
  closingTags: [],
  closingTagAncestorBreakers: {},
}
test("parser() should return nodes", (t) => {
  const str = "<h1>Hello world</h1>"
  const nodes = parser(lexer(str, lexerOptions), parserOptions)
  // A single <h1> element wrapping one text node; the element's position
  // spans the whole input string.
  const answer = [
    {
      type: "element",
      tagName: "h1",
      props: {},
      propsRaw: "",
      children: [
        {
          type: "text",
          content: "Hello world",
          position: {
            start: ps(4),
            end: ps(15),
          },
        },
      ],
      position: {
        start: ps(0),
        end: ps(str.length),
      },
    },
  ]
  assert.equal(nodes, answer)
})
test("parser() should not nest within void tags", (t) => {
  const str = "<div>abc<img/>def</div>"
  const tokens = lexer(str, lexerOptions)
  // voidTags was previously passed as the bare string "img"; every other test
  // (and the shared parserOptions above) uses an array, and the options object
  // was missing closingTagAncestorBreakers. Normalized for consistency —
  // behavior for this input is unchanged.
  const nodes = parser(tokens, {
    voidTags: ["img"],
    closingTags: [],
    closingTagAncestorBreakers: {},
  })
  // The self-closing <img/> must appear as a childless sibling between the
  // two text nodes, flagged with isSelfClosing.
  const answer = [
    {
      type: "element",
      tagName: "div",
      props: {},
      propsRaw: "",
      children: [
        {
          type: "text",
          content: "abc",
          position: {
            start: ps(5),
            end: ps(8),
          },
        },
        {
          type: "element",
          tagName: "img",
          props: {},
          propsRaw: "",
          children: [],
          position: {
            start: ps(8),
            end: ps(14),
          },
          isSelfClosing: true,
        },
        {
          type: "text",
          content: "def",
          position: {
            start: ps(14),
            end: ps(17),
          },
        },
      ],
      position: {
        start: ps(0),
        end: ps(str.length),
      },
    },
  ]
  assert.equal(nodes, answer)
})
test("parser() should handle optional-close tags", (t) => {
  // Case 1: a second <p> implicitly closes the previous open <p>, so the
  // first <p>'s end position is where the second <p> begins (index 14).
  {
    const parserOptions = {
      voidTags: [],
      closingTags: ["p"],
      closingTagAncestorBreakers: {},
    }
    const str = "<p>This is one<p>This is two</p>"
    const tokens = lexer(str, lexerOptions)
    const nodes = parser(tokens, parserOptions)
    assert.equal(nodes, [
      {
        type: "element",
        tagName: "p",
        props: {},
        propsRaw: "",
        children: [
          {
            type: "text",
            content: "This is one",
            position: {
              start: ps(3),
              end: ps(14),
            },
          },
        ],
        position: {
          start: ps(0),
          end: ps(14),
        },
      },
      {
        type: "element",
        tagName: "p",
        props: {},
        propsRaw: "",
        children: [
          {
            type: "text",
            content: "This is two",
            position: {
              start: ps(17),
              end: ps(28),
            },
          },
        ],
        position: {
          start: ps(14),
          end: ps(str.length),
        },
      },
    ])
  }
  // Case 2: the implicit close cascades — the second <p> closes both the
  // open <span> and the open <p>, so both end at index 25 where it starts.
  {
    const parserOptions = {
      voidTags: [],
      closingTags: ["p", "span"],
      closingTagAncestorBreakers: {},
    }
    const str = "<p>This is one <span>okay<p>This is two</p>"
    const tokens = lexer(str, lexerOptions)
    const nodes = parser(tokens, parserOptions)
    assert.equal(nodes, [
      {
        type: "element",
        tagName: "p",
        props: {},
        propsRaw: "",
        children: [
          {
            type: "text",
            content: "This is one ",
            position: {
              start: ps(3),
              end: ps(15),
            },
          },
          {
            type: "element",
            tagName: "span",
            props: {},
            propsRaw: "",
            children: [
              {
                type: "text",
                content: "okay",
                position: {
                  start: ps(21),
                  end: ps(25),
                },
              },
            ],
            position: {
              start: ps(15),
              end: ps(25),
            },
          },
        ],
        position: {
          start: ps(0),
          end: ps(25),
        },
      },
      {
        type: "element",
        tagName: "p",
        props: {},
        propsRaw: "",
        children: [
          {
            type: "text",
            content: "This is two",
            position: {
              start: ps(28),
              end: ps(39),
            },
          },
        ],
        position: {
          start: ps(25),
          end: ps(43),
        },
      },
    ])
  }
})
test("parser() should auto-close unmatched child tags", (t) => {
  const parserOptions = {
    voidTags: [],
    closingTags: [],
    closingTagAncestorBreakers: {},
  }
  // <b> and <span> are never explicitly closed; the closing </div> should
  // auto-close both of them, so both end at index 30 where </div> starts.
  const str = "<div>This is <b>one <span>okay</div>"
  const tokens = lexer(str, lexerOptions)
  const nodes = parser(tokens, parserOptions)
  assert.equal(nodes, [
    {
      type: "element",
      tagName: "div",
      props: {},
      propsRaw: "",
      position: {
        start: ps(0),
        end: ps(36),
      },
      children: [
        {
          type: "text",
          content: "This is ",
          position: {
            start: ps(5),
            end: ps(13),
          },
        },
        {
          type: "element",
          tagName: "b",
          props: {},
          propsRaw: "",
          position: {
            start: ps(13),
            end: ps(30),
          },
          children: [
            {
              type: "text",
              content: "one ",
              position: {
                start: ps(16),
                end: ps(20),
              },
            },
            {
              type: "element",
              tagName: "span",
              props: {},
              propsRaw: "",
              position: {
                start: ps(20),
                end: ps(30),
              },
              children: [
                {
                  type: "text",
                  content: "okay",
                  position: {
                    start: ps(26),
                    end: ps(30),
                  },
                },
              ],
            },
          ],
        },
      ],
    },
  ])
})
test("parser() should handle empty token arrays", (t) => {
  // No tokens in, no nodes out.
  const nodes = parser([], parserOptions)
  assert.equal(nodes, [])
})
test("parser() should report the element attributes", (t) => {
  const str = '<div class="cake" data-key="abc" disabled></div>'
  const nodes = parser(lexer(str, lexerOptions), parserOptions)
  // Attributes surface twice: parsed into a `props` object (boolean
  // attributes like `disabled` become `true`) and verbatim in `propsRaw`.
  const answer = [
    {
      type: "element",
      tagName: "div",
      props: { class: 'cake', 'data-key': 'abc', disabled: true },
      propsRaw: ' class="cake" data-key="abc" disabled',
      position: {
        start: ps(0),
        end: ps(48),
      },
      children: [],
    },
  ]
  assert.equal(nodes, answer)
})
test("parser() should handle unclosed elements", (t) => {
  const str = "<div>abc"
  const nodes = parser(lexer(str, lexerOptions), parserOptions)
  // A dangling open tag is auto-closed at end of input, so both the element
  // and its text child end at str.length.
  const answer = [
    {
      type: "element",
      tagName: "div",
      props: {},
      propsRaw: "",
      position: {
        start: ps(0),
        end: ps(str.length),
      },
      children: [
        {
          type: "text",
          content: "abc",
          position: {
            start: ps(5),
            end: ps(str.length),
          },
        },
      ],
    },
  ]
  assert.equal(nodes, answer)
})
test("parser() should preserve case-sensitive tag names", (t) => {
  // Custom-element-style names keep their exact casing in the output.
  const str = "<You-Know-8>"
  const nodes = parser(lexer(str, lexerOptions), parserOptions)
  const answer = [
    {
      type: "element",
      tagName: "You-Know-8",
      props: {},
      propsRaw: "",
      position: {
        start: ps(0),
        end: ps(str.length),
      },
      children: [],
    },
  ]
  assert.equal(nodes, answer)
})
test("parser() should match by case-insensitive tags", (t) => {
  // </DIV> closes <div> despite the case mismatch; the trailing "def"
  // becomes a top-level text node after the element.
  const str = "<div>abc</DIV>def"
  const nodes = parser(lexer(str, lexerOptions), parserOptions)
  const answer = [
    {
      type: "element",
      tagName: "div",
      props: {},
      propsRaw: "",
      position: {
        start: ps(0),
        end: ps(14),
      },
      children: [
        {
          type: "text",
          content: "abc",
          position: {
            start: ps(5),
            end: ps(8),
          },
        },
      ],
    },
    {
      type: "text",
      content: "def",
      position: {
        start: ps(14),
        end: ps(17),
      },
    },
  ]
  assert.equal(nodes, answer)
})
test("parser() should handle ancestor breaker special case (#39)", (t) => {
  /*
    To summarize, this special case is where a <ul|ol|menu> is
    encountered within an <li>. The default behavior for <li>s
    as closing tags is to rewind up and auto-close the previous
    <li>. However, <li> may contain <ul|ol|menu> before being
    closed so we should not rewind the stack in those cases.
    This edge-case also applies to <dt|dd> in <dl>s.
  */
  // Case 1: inner <ul><li> nests inside the outer <li> rather than closing it,
  // because the breaker { li: ["ul"] } stops the rewind at the inner <ul>.
  {
    const str = "<ul><li>abc<ul><li>def</li></ul></li></ul>"
    const tokens = lexer(str, lexerOptions)
    const nodes = parser(tokens, {
      voidTags: [],
      closingTags: ["li"],
      closingTagAncestorBreakers: {
        li: ["ul"],
      },
    })
    assert.equal(nodes, [
      {
        type: "element",
        tagName: "ul",
        props: {},
        propsRaw: "",
        position: {
          start: ps(0),
          end: ps(42),
        },
        children: [
          {
            type: "element",
            tagName: "li",
            props: {},
            propsRaw: "",
            position: {
              start: ps(4),
              end: ps(37),
            },
            children: [
              {
                type: "text",
                content: "abc",
                position: {
                  start: ps(8),
                  end: ps(11),
                },
              },
              {
                type: "element",
                tagName: "ul",
                props: {},
                propsRaw: "",
                position: {
                  start: ps(11),
                  end: ps(32),
                },
                children: [
                  {
                    type: "element",
                    tagName: "li",
                    props: {},
                    propsRaw: "",
                    position: {
                      start: ps(15),
                      end: ps(27),
                    },
                    children: [
                      {
                        type: "text",
                        content: "def",
                        position: {
                          start: ps(19),
                          end: ps(22),
                        },
                      },
                    ],
                  },
                ],
              },
            ],
          },
        ],
      },
    ])
  }
  // Case 2: same shape, but with a <span> wrapper between the inner <ul> and
  // the inner <li>; the breaker search must still stop at the <ul> ancestor.
  {
    const str = "<ul><li>abc<ul><span><li>def</li></span></ul></li></ul>"
    const tokens = lexer(str, lexerOptions)
    const nodes = parser(tokens, {
      voidTags: [],
      closingTags: ["li"],
      closingTagAncestorBreakers: {
        li: ["ul"],
      },
    })
    assert.equal(nodes, [
      {
        type: "element",
        tagName: "ul",
        props: {},
        propsRaw: "",
        position: {
          start: ps(0),
          end: ps(55),
        },
        children: [
          {
            type: "element",
            tagName: "li",
            props: {},
            propsRaw: "",
            position: {
              start: ps(4),
              end: ps(50),
            },
            children: [
              {
                type: "text",
                content: "abc",
                position: {
                  start: ps(8),
                  end: ps(11),
                },
              },
              {
                type: "element",
                tagName: "ul",
                props: {},
                propsRaw: "",
                position: {
                  start: ps(11),
                  end: ps(45),
                },
                children: [
                  {
                    type: "element",
                    tagName: "span",
                    props: {},
                    propsRaw: "",
                    position: {
                      start: ps(15),
                      end: ps(40),
                    },
                    children: [
                      {
                        type: "element",
                        tagName: "li",
                        props: {},
                        propsRaw: "",
                        position: {
                          start: ps(21),
                          end: ps(33),
                        },
                        children: [
                          {
                            type: "text",
                            content: "def",
                            position: {
                              start: ps(25),
                              end: ps(28),
                            },
                          },
                        ],
                      },
                    ],
                  },
                ],
              },
            ],
          },
        ],
      },
    ])
  }
  // Case 3: inside the inner <ul>, the second <li> still implicitly closes
  // the first inner <li> (the breaker only protects ancestors above the <ul>).
  {
    const str = "<ul><li>abc<ul><li>def<li>ghi</li></ul></li></ul>"
    const tokens = lexer(str, lexerOptions)
    const nodes = parser(tokens, {
      voidTags: [],
      closingTags: ["li"],
      closingTagAncestorBreakers: {
        li: ["ul"],
      },
    })
    assert.equal(nodes, [
      {
        type: "element",
        tagName: "ul",
        props: {},
        propsRaw: "",
        position: {
          start: ps(0),
          end: ps(49),
        },
        children: [
          {
            type: "element",
            tagName: "li",
            props: {},
            propsRaw: "",
            position: {
              start: ps(4),
              end: ps(44),
            },
            children: [
              {
                type: "text",
                content: "abc",
                position: {
                  start: ps(8),
                  end: ps(11),
                },
              },
              {
                type: "element",
                tagName: "ul",
                props: {},
                propsRaw: "",
                position: {
                  start: ps(11),
                  end: ps(39),
                },
                children: [
                  {
                    type: "element",
                    tagName: "li",
                    props: {},
                    propsRaw: "",
                    position: {
                      start: ps(15),
                      end: ps(22),
                    },
                    children: [
                      {
                        type: "text",
                        content: "def",
                        position: {
                          start: ps(19),
                          end: ps(22),
                        },
                      },
                    ],
                  },
                  {
                    type: "element",
                    tagName: "li",
                    props: {},
                    propsRaw: "",
                    position: {
                      start: ps(22),
                      end: ps(34),
                    },
                    children: [
                      {
                        type: "text",
                        content: "ghi",
                        position: {
                          start: ps(26),
                          end: ps(29),
                        },
                      },
                    ],
                  },
                ],
              },
            ],
          },
        ],
      },
    ])
  }
})
test("parser() should handle nested tables", (t) => {
  // A complete table nested inside a <td> of an outer table. The ancestor
  // breakers stop the tbody/tr/td implicit-close rewind at the nearest
  // <table>, so the inner table nests instead of closing the outer one.
  const str =
    "<table><tbody><tr><td><table><tbody><tr><td></td></tr></tbody></table></td></tr></tbody></table>"
  const tokens = lexer(str, lexerOptions)
  const nodes = parser(tokens, {
    voidTags: [],
    closingTags: ["tbody"],
    closingTagAncestorBreakers: {
      tbody: ["table"],
      tr: ["table"],
      td: ["table"],
    },
  })
  const answer = [
    {
      type: "element",
      tagName: "table",
      props: {},
      propsRaw: "",
      position: {
        start: ps(0),
        end: ps(96),
      },
      children: [
        {
          type: "element",
          tagName: "tbody",
          props: {},
          propsRaw: "",
          position: {
            start: ps(7),
            end: ps(88),
          },
          children: [
            {
              type: "element",
              tagName: "tr",
              props: {},
              propsRaw: "",
              position: {
                start: ps(14),
                end: ps(80),
              },
              children: [
                {
                  type: "element",
                  tagName: "td",
                  props: {},
                  propsRaw: "",
                  position: {
                    start: ps(18),
                    end: ps(75),
                  },
                  children: [
                    {
                      type: "element",
                      tagName: "table",
                      props: {},
                      propsRaw: "",
                      position: {
                        start: ps(22),
                        end: ps(70),
                      },
                      children: [
                        {
                          type: "element",
                          tagName: "tbody",
                          props: {},
                          propsRaw: "",
                          position: {
                            start: ps(29),
                            end: ps(62),
                          },
                          children: [
                            {
                              type: "element",
                              tagName: "tr",
                              props: {},
                              propsRaw: "",
                              position: {
                                start: ps(36),
                                end: ps(54),
                              },
                              children: [
                                {
                                  type: "element",
                                  tagName: "td",
                                  props: {},
                                  propsRaw: "",
                                  position: {
                                    start: ps(40),
                                    end: ps(49),
                                  },
                                  children: [],
                                },
                              ],
                            },
                          ],
                        },
                      ],
                    },
                  ],
                },
              ],
            },
          ],
        },
      ],
    },
  ]
  assert.equal(nodes, answer)
})
test("parser() should ignore unnecessary closing tags", (t) => {
  /*
    In this case the </i> bit is unnecessary and should
    not be represented in the output nor interfere with the stack.
  */
  const str = "</i>x"
  const tokens = lexer(str, lexerOptions)
  const nodes = parser(tokens, parserOptions)
  // The stray </i> is dropped entirely; only the trailing text node remains.
  const answer = [
    {
      type: "text",
      content: "x",
      position: {
        start: ps(4),
        end: ps(str.length),
      },
    },
  ]
  // The expected value was previously duplicated verbatim inside the assert
  // call; compare against the single `answer` definition instead.
  assert.equal(nodes, answer)
})
// Execute all tests registered above (uvu requires an explicit run call).
test.run()