// micro-mdx-parser: a tiny parser to convert markdown or HTML into JSON.
// Test suite for the lexer module.
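// The uvu tests below self-register and run when test.run() is called at the
// bottom, so this file can be executed directly with `node`.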
const { inspect } = require('util')
const { test } = require('uvu')
const assert = require('uvu/assert')
const {
lexer,
lexText,
lexComment,
lexTag,
lexTagName,
lexTagAttributes,
lexSkipTag,
findTextEnd,
isWhitespaceChar
} = require('./lexer')
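// Debug helper: pretty-print deeply nested token arrays with colors.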
function deepLog(obj) {
console.log(inspect(obj, {showHidden: false, depth: null, colors: true}))
}
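// Build a position record for a given string index. Every fixture in this
// file is a single line, so line is always 1 and column is index + 1.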
function ps (index) {
return { index, line: 1, column: index + 1 }
}
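// Strip the raw `src` slice from a token (mutating it) so assertions compare
// only the fields under test.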
function deleteSrc(obj) {
delete obj.src
return obj
}
// Remove the raw `src` slice from every token in an array.
function normalizeArray(tokens) {
  return tokens.map(deleteSrc)
}
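// Expected token shapes, as exercised by the fixtures below: tag-start,
// tag-end, text, and comment tokens carry a `position`; `tag` tokens carry
// only `content`; `attribute` tokens carry `content` plus a raw `src` slice.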
test('lexer should return tokens', t => {
const str = '<h1>Test case</h1>'
const options = {childlessTags: []}
const tokens = lexer(str, options)
const answer = [
{type: 'tag-start', close: false, position: {start: ps(0)}},
{type: 'tag', content: 'h1'},
{type: 'tag-end', close: false, position: {end: ps(4)}},
{type: 'text', content: 'Test case', position: {start: ps(4), end: ps(13)}},
{type: 'tag-start', close: true, position: {start: ps(13)}},
{type: 'tag', content: 'h1'},
{type: 'tag-end', close: false, position: {end: ps(str.length)}}
]
assert.equal(tokens, answer)
})
test('lexer should only open a tag when "<" is followed by a tag name', t => {
{
const str = '2 <= 4 >'
const options = {childlessTags: []}
const tokens = lexer(str, options)
    assert.equal(tokens, [
      {type: 'text', content: '2 <= 4 >', position: {start: ps(0), end: ps(str.length)}}
    ], '"<=" is lexed as plain text')
}
{
const str = '2 <a 4 >'
const options = {childlessTags: []}
const tokens = lexer(str, options)
const answer = [
{type: 'text', content: '2 ', position: {start: ps(0), end: ps(2)}},
{type: 'tag-start', close: false, position: {start: ps(2)}},
{type: 'tag', content: 'a'},
{type: 'attribute', content: '4', src: ' 4 '},
{type: 'tag-end', close: false, position: {end: ps(str.length)}}
]
    assert.equal(tokens, answer, '"<a" is lexed as a tag start')
}
})
test('lexer should skip lexing the content of childless tags', t => {
const str = '<template>Hello <img/></template>'
const options = {childlessTags: ['template']}
const tokens = lexer(str, options)
assert.equal(tokens, [
{type: 'tag-start', close: false, position: {start: ps(0)}},
{type: 'tag', content: 'template'},
{type: 'tag-end', close: false, position: {end: ps(10)}},
{type: 'text', content: 'Hello <img/>', position: {start: ps(10), end: ps(22)}},
{type: 'tag-start', close: true, position: {start: ps(22)}},
{type: 'tag', content: 'template'},
{type: 'tag-end', close: false, position: {end: ps(str.length)}}
])
})
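// findTextEnd returns the index at which the current text run ends (the next
// "<" that actually begins or closes a tag), or -1 when the rest of the
// string is plain text, as the assertions below show.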
test('findTextEnd should find the end of the text segment', t => {
assert.is(findTextEnd('</end', 0), 0)
assert.is(findTextEnd('<= 4', 0), -1)
assert.is(findTextEnd('a<b', 0), 1)
assert.is(findTextEnd('<= <= <=', 0), -1)
})
test('lexText should tokenize the next text segment', t => {
const str = 'text that ends<x>'
const finish = str.indexOf('<')
const state = {str, position: ps(0), tokens: []}
lexText(state)
assert.is(state.position.index, finish)
const token = state.tokens[0]
assert.equal(token, {
type: 'text',
content: 'text that ends',
position: {
start: ps(0),
end: ps(14)
}
})
})
test('lexText should tokenize from the current position', t => {
const str = 'abcdtext that ends<x>'
const finish = str.indexOf('<')
const state = {str, position: ps(4), tokens: []}
lexText(state)
assert.is(state.position.index, finish)
const token = state.tokens[0]
assert.equal(token, {
type: 'text',
content: 'text that ends',
position: {
start: ps(4),
end: ps(18)
}
})
})
test('lexText should tokenize safely to string end', t => {
const str = 'text that does not end'
const finish = str.length
const state = {str, position: ps(0), tokens: []}
lexText(state)
assert.is(state.position.index, finish)
const token = state.tokens[0]
assert.equal(token, {
type: 'text',
content: 'text that does not end',
position: {
start: ps(0),
end: ps(str.length)
}
})
})
test('lexText should not add a token for empty text', t => {
const str = ' <x>never reach here</x>'
const start = 2
const finish = 2
const state = {str, position: ps(start), tokens: []}
lexText(state)
assert.is(state.position.index, finish)
assert.is(state.tokens.length, 0)
})
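// lexComment captures the text between "<!--" and "-->" verbatim, spaces
// included; an unterminated comment runs to the end of the input.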
test('lexComment should tokenize the next comment', t => {
const str = '<!-- this is a comment -->abcd'
const finish = str.indexOf('abcd')
const state = {str, position: ps(0), tokens: []}
lexComment(state)
assert.is(state.position.index, finish)
assert.equal(state.tokens[0], {
type: 'comment',
content: ' this is a comment ',
position: {
start: ps(0),
end: ps(finish)
}
})
})
test('lexComment should tokenize safely to string end', t => {
const str = '<!-- this is a comment'
const finish = str.length
const state = {str, position: ps(0), tokens: []}
lexComment(state)
assert.is(state.position.index, finish)
assert.equal(state.tokens[0], {
type: 'comment',
content: ' this is a comment',
position: {
start: ps(0),
end: ps(finish)
}
})
})
test('lexComment should tokenize from current position', t => {
const str = 'abcd<!-- comment text --><x>'
const finish = str.indexOf('<x>')
const state = {str, position: ps(4), tokens: []}
lexComment(state)
assert.is(state.position.index, finish)
assert.equal(state.tokens[0], {
type: 'comment',
content: ' comment text ',
position: {
start: ps(4),
end: ps(finish)
}
})
})
test('lexComment should add a token for an empty comment', t => {
const str = '<!---->'
const finish = str.length
const state = {str, position: ps(0), tokens: []}
lexComment(state)
assert.is(state.position.index, finish)
assert.equal(state.tokens[0], {
type: 'comment',
content: '',
position: {
start: ps(0),
end: ps(finish)
}
})
})
test('lexTag should tokenize the next tag', t => {
const str = '<img/>abcd'
const finish = str.indexOf('abcd')
const state = {str, position: ps(0), tokens: []}
lexTag(state)
assert.is(state.position.index, finish)
assert.equal(state.tokens, [
{type: 'tag-start', close: false, position: {start: ps(0)}},
    {type: 'tag', content: 'img'}, // tag name token; covered by the lexTagName tests
    {type: 'tag-end', close: true, position: {end: ps(finish)}, isSelfClosing: true}
])
})
test('lexTagName should tokenize the next tag name', t => {
const str = 'h1 id="title"> test'
const finish = 2
const state = {str, position: ps(0), tokens: []}
lexTagName(state)
assert.is(state.position.index, finish)
assert.equal(state.tokens[0], {
type: 'tag',
content: 'h1'
})
})
test('lexTagName should ignore leading non-tag-name characters', t => {
const str = '>/ div'
const state = {str, position: ps(0), tokens: []}
lexTagName(state)
assert.is(state.position.index, str.length)
assert.equal(state.tokens[0], {
type: 'tag',
content: 'div'
})
})
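// lexTagAttributes collapses whitespace around "=" (so `yes = "no"` becomes
// the single token `yes="no"`) and keeps the raw slice in each token's `src`,
// which normalizeArray strips before comparison.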
test('lexTagAttributes should tokenize attributes until tag end', t => {
const str = 'yes="no" maybe data-type="array">abcd'
const finish = str.indexOf('>abcd')
const state = {str, position: ps(0), tokens: []}
lexTagAttributes(state)
assert.is(state.position.index, finish)
assert.equal(normalizeArray(state.tokens), [
{type: 'attribute', content: 'yes="no"'},
{type: 'attribute', content: 'maybe'},
{type: 'attribute', content: 'data-type="array"'}
])
})
test('lexTagAttributes should tokenize regardless of whitespace around "="', t => {
const str = 'yes = "no" maybe data-type= "array" key ="value" >abcd'
const finish = str.indexOf('>abcd')
const state = {str, position: ps(0), tokens: []}
lexTagAttributes(state)
assert.is(state.position.index, finish)
assert.equal(normalizeArray(state.tokens), [
{type: 'attribute', content: 'yes="no"'},
{type: 'attribute', content: 'maybe'},
{type: 'attribute', content: 'data-type="array"'},
{type: 'attribute', content: 'key="value"'}
])
})
test('lexTagAttributes should handle an attribute with a missing value', t => {
const str = '<div foo= bar="baz"></div>'
const state = {str, position: ps(4), tokens: []}
lexTagAttributes(state)
assert.is(state.position.index, str.indexOf('></div>'))
assert.equal(normalizeArray(state.tokens), [
{type: 'attribute', content: 'foo'},
{type: 'attribute', content: 'bar="baz"'}
])
})
test('lexTagAttributes should handle newline separated attributes', t => {
const str = '<div foo="bar"\nbaz="bat"></div>'
const state = {str, position: ps(4), tokens: []}
lexTagAttributes(state)
assert.is(state.position.index, str.indexOf('></div>'))
assert.equal(normalizeArray(state.tokens), [
{type: 'attribute', content: 'foo="bar"'},
{type: 'attribute', content: 'baz="bat"'}
])
})
test('lexTagAttributes should handle tab separated attributes', t => {
const str = '<div foo="bar"\tbaz="bat"></div>'
const state = {str, position: ps(4), tokens: []}
lexTagAttributes(state)
assert.is(state.position.index, str.indexOf('></div>'))
assert.equal(normalizeArray(state.tokens), [
{type: 'attribute', content: 'foo="bar"'},
{type: 'attribute', content: 'baz="bat"'}
])
})
test('lexTagAttributes should handle leading whitespace', t => {
const str = ' \n\tyes="no">abcd'
const finish = str.indexOf('>abcd')
const state = {str, position: ps(0), tokens: []}
lexTagAttributes(state)
assert.is(state.position.index, finish)
assert.equal(normalizeArray(state.tokens), [
{type: 'attribute', content: 'yes="no"'}
])
})
test('lexTagAttributes should handle unquoted one-word values', t => {
const str = 'num=8 ham = steak>abcd'
const finish = str.indexOf('>abcd')
const state = {str, position: ps(0), tokens: []}
lexTagAttributes(state)
assert.is(state.position.index, finish)
assert.equal(normalizeArray(state.tokens), [
{type: 'attribute', content: 'num=8'},
{type: 'attribute', content: 'ham=steak'}
])
})
test('lexTagAttributes should handle incomplete attributes', t => {
const str = 'x = >abcd'
const finish = str.indexOf('>abcd')
const state = {str, position: ps(0), tokens: []}
lexTagAttributes(state)
assert.is(state.position.index, finish)
assert.equal(normalizeArray(state.tokens), [
{type: 'attribute', content: 'x'}
])
})
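// lexSkipTag consumes everything up to the matching close tag as a single raw
// text token. This is how childless tags such as <script> and <template> keep
// their contents from being lexed.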
test('lexSkipTag should tokenize as text until the matching tag name', t => {
const str = 'abcd<test><h1>Test case</h1></test><x>'
const finish = str.indexOf('<x>')
const state = {str, position: ps(10), tokens: []}
lexSkipTag('test', state)
assert.is(state.position.index, finish)
assert.equal(normalizeArray(state.tokens), [
{type: 'text', content: '<h1>Test case</h1>', position: {start: ps(10), end: ps(28)}},
{type: 'tag-start', close: true, position: {start: ps(28)}},
{type: 'tag', content: 'test'},
{type: 'tag-end', close: false, position: {end: ps(finish)}}
])
})
test('lexSkipTag should stop at the case-insensitive matching tag name', t => {
const str = '<tEsT>proving <???> the point</TeSt><x>'
const finish = str.indexOf('<x>')
const state = {str, position: ps(6), tokens: []}
lexSkipTag('tEsT', state)
assert.is(state.position.index, finish)
assert.equal(normalizeArray(state.tokens), [
{type: 'text', content: 'proving <???> the point', position: {start: ps(6), end: ps(29)}},
{type: 'tag-start', close: true, position: {start: ps(29)}},
{type: 'tag', content: 'TeSt'},
{type: 'tag-end', close: false, position: {end: ps(finish)}}
])
})
test('lexSkipTag should auto-close if the end tag is not found', t => {
const str = '<script>This never ends'
const state = {str, position: ps(8), tokens: []}
lexSkipTag('script', state)
assert.is(state.position.index, str.length)
assert.equal(normalizeArray(state.tokens), [
{type: 'text', content: 'This never ends', position: {start: ps(8), end: ps(str.length)}}
])
})
test('lexSkipTag should skip past a stray "</" that does not match the tag name', t => {
const str = '<script>proving </nothing></script>'
const state = {str, position: ps(8), tokens: []}
lexSkipTag('script', state)
assert.is(state.position.index, str.length)
assert.equal(normalizeArray(state.tokens), [
{type: 'text', content: 'proving </nothing>', position: {start: ps(8), end: ps(26)}},
{type: 'tag-start', close: true, position: {start: ps(26)}},
{type: 'tag', content: 'script'},
{type: 'tag-end', close: false, position: {end: ps(str.length)}}
])
})
test('lexSkipTag should not add an empty inner text node', t => {
const str = '<script></script>'
const state = {str, position: ps(8), tokens: []}
lexSkipTag('script', state)
assert.is(state.position.index, str.length)
assert.equal(normalizeArray(state.tokens), [
{type: 'tag-start', close: true, position: {start: ps(8)}},
{type: 'tag', content: 'script'},
{type: 'tag-end', close: false, position: {end: ps(str.length)}}
])
})
test('isWhitespaceChar should work', t => {
assert.is(isWhitespaceChar(' '), true)
assert.is(isWhitespaceChar('\t'), true)
assert.is(isWhitespaceChar('x'), false)
// assert.is(isWhitespaceChar('\n'), true)
})
test.run()