micro-mdx-parser
Version:
A tiny parser to convert markdown or html into JSON
470 lines (430 loc) • 11.7 kB
JavaScript
const {
startsWith,
endsWith,
stringIncludes,
arrayIncludes,
getTextBetweenChars
} = require('./utils')
const {
REP_SYMBOL,
REP_SYMBOL_PATTERN,
fixOpenBracket,
ARROW_SYMBOL,
ARROW_SYMBOL_PATTERN
} = require('./utils/find-code')
const { CLOSE_ELEMENT_SYMBOL_PATTERN } = require('./utils/find-components')
const { CLOSE_BRACKET_PATTERN, OPEN_BRACKET_PATTERN } = require('./utils/find-inline-arrow-fn')
/**
 * Advance a position object in place by `len` characters of `str`.
 *
 * `position.index` is moved forward by `len`. For each consumed character a
 * newline increments `position.line` and resets `position.column` to 0; any
 * other character increments `position.column`.
 *
 * @param {{index: number, line: number, column: number}} position - cursor to mutate
 * @param {string} str - text being scanned
 * @param {number} len - number of characters to consume
 * @return {void}
 */
function feedPosition(position, str, len) {
  const from = position.index
  const to = from + len
  position.index = to
  let cursor = from
  while (cursor < to) {
    const isNewline = str.charAt(cursor) === '\n'
    if (isNewline) {
      position.line++
      position.column = 0
    } else {
      position.column++
    }
    cursor++
  }
}
/**
 * Advance a position object in place up to the absolute index `end`.
 *
 * Delegates to `feedPosition` with the distance between `end` and the
 * current `position.index`.
 *
 * @param {{index: number, line: number, column: number}} position - cursor to mutate
 * @param {string} str - text being scanned
 * @param {number} end - absolute index to move the cursor to
 * @return {void} whatever `feedPosition` returns
 */
function jumpPosition (position, str, end) {
  return feedPosition(position, str, end - position.index)
}
/**
 * Create the cursor used at the very beginning of the input.
 *
 * Index is zero-based; line and column both start at 1.
 *
 * @return {{index: number, column: number, line: number}} a fresh position object
 */
function makeInitialPosition () {
  const origin = { index: 0, column: 1, line: 1 }
  return origin
}
/**
 * Snapshot a position for use as a token's start.
 *
 * A falsy column (e.g. 0 right after a newline) is reported as `column + 1`;
 * any other column is copied unchanged.
 *
 * @param {{index: number, line: number, column: number}} position - live cursor
 * @return {{index: number, line: number, column: number}} detached copy
 */
function copyPositionStart(position) {
  const { index, line, column } = position
  return { index, line, column: column ? column : column + 1 }
}
/**
 * Snapshot a position for use as a token's end.
 *
 * The index and line are copied as-is. A falsy column (e.g. 0 right after a
 * newline) is reported as `column + 1`; any other column is copied unchanged.
 *
 * @param {{index: number, line: number, column: number}} position - live cursor
 * @return {{index: number, line: number, column: number}} detached copy
 */
function copyPositionEnd(position) {
  const snapshot = {
    index: position.index,
    line: position.line,
    column: position.column
  }
  if (!snapshot.column) {
    snapshot.column = snapshot.column + 1
  }
  return snapshot
}
// Plain copy without any column adjustment.
// Original note: old offset was index/column was 1 off in some cases.
/**
 * Copy the `index`, `line` and `column` of a position verbatim.
 *
 * @param {{index: number, line: number, column: number}} position - live cursor
 * @return {{index: number, line: number, column: number}} detached copy
 */
function copyPosition(position) {
  const { index, line, column } = position
  return { index, line, column }
}
/**
 * Tokenize a source string.
 *
 * Builds a fresh lexer state (input, options, starting position, empty token
 * list), runs `lex` over it and returns the collected tokens.
 *
 * @param {string} str - source text to tokenize
 * @param {object} options - lexer options; `lex` reads `childlessTags` from it
 * @return {Array<object>} tokens in source order
 */
function lexer (str, options) {
  const tokens = []
  const position = makeInitialPosition()
  lex({ str, options, position, tokens })
  return tokens
}
/**
 * Main lexing loop: consume `state.str` from `state.position` to the end,
 * appending tokens to `state.tokens`.
 *
 * Each iteration first tries to lex a run of text. If the cursor did not move,
 * the cursor is sitting on a "<" that opens either a comment ("<!--") or a
 * tag. After a tag whose lower-cased name is listed in
 * `options.childlessTags`, the tag's content is skipped via `lexSkipTag`.
 *
 * A missing `str` is treated as empty input, and a missing `options` or
 * `options.childlessTags` is treated as "no childless tags" instead of
 * throwing.
 *
 * @param {{str: string, options: ?{childlessTags: ?Array<string>}, position: object, tokens: Array<object>}} state
 *   mutable lexer state
 * @return {void}
 */
function lex (state) {
  const { str, options } = state
  // Fall back to an empty list so callers that pass no options do not crash.
  const childlessTags = (options && options.childlessTags) || []
  const len = (str && str.length) ? str.length : 0
  while (state.position.index < len) {
    const start = state.position.index
    lexText(state)
    if (state.position.index !== start) {
      // Text was consumed; look for more on the next pass.
      continue
    }
    const isComment = startsWith(str, '!--', start + 1)
    if (isComment) {
      lexComment(state)
      continue
    }
    const tagName = lexTag(state)
    const safeTag = tagName.toLowerCase()
    if (arrayIncludes(childlessTags, safeTag)) {
      lexSkipTag(tagName, state)
    }
  }
}
const alphanumeric = /[A-Za-z0-9]/
/**
 * Find where the current run of plain text ends.
 *
 * Scans forward from `index` for a "<" that is immediately followed by "/",
 * "!" or an ASCII letter/digit. Any other "<" is treated as part of the text.
 *
 * @param {string} str - text being scanned
 * @param {number} index - position to start searching from
 * @return {number} index of the qualifying "<", or -1 when there is none
 */
function findTextEnd (str, index) {
  let candidate = str.indexOf('<', index)
  while (candidate !== -1) {
    const next = str.charAt(candidate + 1)
    const opensMarkup = next === '/' || next === '!' || alphanumeric.test(next)
    if (opensMarkup) {
      return candidate
    }
    candidate = str.indexOf('<', candidate + 1)
  }
  return -1
}
/**
 * Lex a run of plain text starting at the current position.
 *
 * Does nothing when the cursor already sits on the start of markup. Otherwise
 * the text up to the next markup start (or the end of input) is consumed and
 * pushed as a `text` token with start/end positions.
 *
 * The token content is passed through `fixOpenBracket` and has the
 * close-element and bracket placeholder patterns turned back into "/>", "}"
 * and "{" (presumably undoing substitutions made before lexing — confirm
 * against the preprocessing helpers).
 *
 * @param {{str: string, position: object, tokens: Array<object>}} state - mutable lexer state
 * @return {void}
 */
function lexText (state) {
  const { str, position } = state
  const found = findTextEnd(str, position.index)
  if (found === position.index) return

  // No further markup: the text runs to the end of the input.
  const textEnd = found === -1 ? str.length : found

  const start = copyPositionStart(position)
  const raw = str.slice(position.index, textEnd)
  jumpPosition(position, str, textEnd)
  const end = copyPositionEnd(position)

  const content = fixOpenBracket(raw)
    .replace(CLOSE_ELEMENT_SYMBOL_PATTERN, '/>')
    .replace(CLOSE_BRACKET_PATTERN, '}')
    .replace(OPEN_BRACKET_PATTERN, '{')

  state.tokens.push({ type: 'text', content, position: { start, end } })
}
/**
 * Lex an HTML comment whose "<!--" begins at the current position.
 *
 * The comment body runs up to the next "-->"; when no terminator exists the
 * rest of the input is taken as the comment. Pushes a `comment` token whose
 * content excludes the delimiters and whose position spans the whole comment.
 *
 * @param {{str: string, position: object, tokens: Array<object>}} state - mutable lexer state
 * @return {void}
 */
function lexComment (state) {
  const { str, position } = state
  const start = copyPositionStart(position)
  feedPosition(position, str, 4) // skip over "<!--"

  const terminator = str.indexOf('-->', position.index)
  const isTerminated = terminator !== -1
  const contentEnd = isTerminated ? terminator : str.length
  const commentEnd = isTerminated ? terminator + 3 : str.length // 3 === "-->".length

  const content = str.slice(position.index, contentEnd)
  jumpPosition(position, str, commentEnd)
  const end = copyPositionEnd(position)

  state.tokens.push({ type: 'comment', content, position: { start, end } })
}
/**
 * Lex one complete tag ("<name ...>", "</name>" or "<name ... />") starting
 * at the current position.
 *
 * Emits, in order: a `tag-start` token, the `tag` name token (via
 * `lexTagName`), any attribute tokens (via `lexTagAttributes`) and a `tag-end`
 * token. The `tag-end` token gets `isSelfClosing: true` when the tag's source
 * text, as returned by `getTextBetweenChars`, contains "/>".
 *
 * Side effect: stores the tag's starting index on `state._tagStart`.
 *
 * @param {{str: string, position: object, tokens: Array<object>}} state - mutable lexer state
 * @return {string} the tag name exactly as written in the source
 */
function lexTag (state) {
  const { str, position } = state

  // Opening delimiter: "<" or "</".
  const isClosingTag = str.charAt(position.index + 1) === '/'
  const start = copyPositionStart(position)
  feedPosition(position, str, isClosingTag ? 2 : 1)
  state.tokens.push({ type: 'tag-start', close: isClosingTag, position: { start } })
  /* persist start for the tag source lookup below */
  state._tagStart = start.index

  const tagName = lexTagName(state)
  lexTagAttributes(state, tagName)

  // Closing delimiter: ">" or "/>".
  const endsWithSlash = str.charAt(position.index) === '/'
  feedPosition(position, str, endsWithSlash ? 2 : 1)
  const end = copyPositionEnd(position)
  const tagSource = getTextBetweenChars(str, state._tagStart, end.index)
  const endToken = {
    type: 'tag-end',
    close: endsWithSlash,
    position: { end }
  }
  if (tagSource.indexOf('/>') > -1) {
    endToken.isSelfClosing = true
  }
  state.tokens.push(endToken)

  return tagName
}
// See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#special-white-space
// Matches any `\s` whitespace character except carriage return.
// const whitespace = /\s/ // old stripped newlines
const whitespace = /[^\S\r]/
/**
 * Tell whether `char` contains a whitespace character other than "\r".
 *
 * @param {string} char - character to test
 * @return {boolean} true when the whitespace pattern matches
 */
function isWhitespaceChar(char) {
  const match = whitespace.exec(char)
  return match !== null
}
/**
 * Lex the tag name that follows "<" or "</".
 *
 * Leading whitespace, "/" and ">" characters are skipped; the name then runs
 * until the next whitespace, "/" or ">". The cursor is moved past the name and
 * a `tag` token carrying the name is pushed.
 *
 * @param {{str: string, position: object, tokens: Array<object>}} state - mutable lexer state
 * @return {string} the tag name as written in the source
 */
function lexTagName (state) {
  const { str, position } = state
  const total = str.length
  // A name character is anything except whitespace, "/" and ">".
  const isNameChar = (ch) => !(isWhitespaceChar(ch) || ch === '/' || ch === '>')

  let nameStart = position.index
  while (nameStart < total && !isNameChar(str.charAt(nameStart))) {
    nameStart++
  }

  // The character at nameStart is always taken as part of the name.
  let nameEnd = nameStart + 1
  while (nameEnd < total && isNameChar(str.charAt(nameEnd))) {
    nameEnd++
  }

  jumpPosition(position, str, nameEnd)
  const tagName = str.slice(nameStart, nameEnd)
  state.tokens.push({ type: 'tag', content: tagName })
  return tagName
}
/**
 * Lex the attribute section of a tag, from the current position up to (but
 * not including) the "/" or ">" that ends the tag.
 *
 * Phase 1 splits the source into "words" on whitespace, treating anything
 * inside quotes (' " `) or inside a brace-delimited value ({ ... }) as part of
 * the current word. A brace-delimited value only ends on a "}" that leaves the
 * word bracket-balanced (see `isBalanced`), so nested braces such as
 * `style={{ a: 1 }}` stay in one word.
 *
 * Phase 2 re-joins words that were split around "=" ("key = value",
 * "key =value", "key= value") and pushes one `attribute` token per resulting
 * word. Every token also carries `src`, the raw attribute source text.
 *
 * @param {{str: string, position: object, tokens: Array<object>}} state - mutable lexer state
 * @param {string} tagName - name of the tag being lexed (currently unused)
 * @return {void}
 */
function lexTagAttributes (state, tagName) {
  const {str, position, tokens} = state
  const attrsStart = position.index
  let cursor = attrsStart
  let quote = null // null, a quote character (' " `), or '{'
  let wordBegin = cursor // index of word start
  const words = [] // "key", "key=value", "key='value'", etc
  const len = str.length
  // True once a character has been consumed inside a quoted/braced value and
  // no newline has been handled since.
  let prevWasClose = false
  while (cursor < len) {
    const char = str.charAt(cursor)
    /* If opening bracket is brackets {}. Ensure balance */
    // TODO harden to access string values with unbalanced brackets
    if (quote === '{' && char === '}') {
      const isQuoteEnd = isBalanced(str.slice(wordBegin, cursor + 1))
      if (isQuoteEnd) {
        quote = null
      }
      cursor = cursor + 1
      prevWasClose = true
      continue
    }
    if (quote) {
      // Only real quote characters close on a matching character. A nested "{"
      // inside a braced value must not end it; that is decided by the balance
      // check above.
      const isQuoteEnd = quote !== '{' && char === quote
      if (isQuoteEnd) {
        quote = null
      }
      cursor++
      prevWasClose = true
      continue
    }
    /* After a quoted/braced value, a newline ends the current word */
    if (prevWasClose && char === '\n') {
      if (cursor !== wordBegin) {
        words.push(str.slice(wordBegin, cursor))
      }
      wordBegin = cursor + 1
      cursor++
      prevWasClose = false
      continue
    }
    const isTagEnd = char === '/' || char === '>'
    if (isTagEnd) {
      if (cursor !== wordBegin) {
        words.push(str.slice(wordBegin, cursor))
      }
      break
    }
    const isWordEnd = isWhitespaceChar(char)
    if (isWordEnd) {
      if (cursor !== wordBegin) {
        words.push(str.slice(wordBegin, cursor))
      }
      wordBegin = cursor + 1
      cursor++
      continue
    }
    const isQuoteStart = char === '\'' || char === '"' || char === '`'
    if (isQuoteStart) {
      quote = char
      cursor++
      continue
    }
    const isBracketStart = char === '{'
    if (isBracketStart) {
      quote = char
      cursor++
      continue
    }
    cursor++
  }
  jumpPosition(position, str, cursor)
  // Raw attribute source: every scanned character including the terminating
  // "/" or ">" (if any), with one trailing "/", ">" or "/>" stripped.
  const src = str.slice(attrsStart, cursor + 1).replace(/\/?>?$/, '')
  const wLen = words.length
  const type = 'attribute'
  for (let i = 0; i < wLen; i++) {
    const word = words[i]
    const isNotPair = word.indexOf('=') === -1
    if (isNotPair) {
      const secondWord = words[i + 1]
      if (secondWord && startsWith(secondWord, '=')) {
        if (secondWord.length > 1) {
          // "key" followed by "=value"
          const newWord = word + secondWord
          tokens.push({
            type,
            content: newWord,
            src
          })
          i += 1
          continue
        }
        // "key" followed by a lone "=": skip the "=" and join with the next word
        const thirdWord = words[i + 2]
        i += 1
        if (thirdWord) {
          const newWord = word + '=' + thirdWord
          tokens.push({
            type,
            content: newWord,
            src
          })
          i += 1
          continue
        }
      }
    }
    if (endsWith(word, '=')) {
      const secondWord = words[i + 1]
      if (secondWord && !stringIncludes(secondWord, '=')) {
        // "key=" followed by "value"
        const newWord = word + secondWord
        tokens.push({
          type,
          content: newWord,
          src
        })
        i += 1
        continue
      }
      // Dangling "key=" with no usable value: emit the bare key
      const newWord = word.slice(0, -1)
      tokens.push({
        type,
        content: newWord,
        src
      })
      continue
    }
    if (word !== '\n') {
      tokens.push({
        type,
        // Turn the arrow placeholder back into " => " when present
        content: (word.indexOf(ARROW_SYMBOL) === -1) ? word : word.replace(ARROW_SYMBOL_PATTERN, ' => '),
        src
      })
    }
  }
}
/**
 * Verify brackets are balanced.
 *
 * Only the counts are compared: every "(", "{" or "[" adds one and every ")",
 * "}" or "]" subtracts one from a single shared counter. Bracket kind and
 * ordering are not checked.
 *
 * @param {string} str - string with code
 * @return {Boolean} true when openers and closers cancel out
 */
function isBalanced(str) {
  const chars = str.split('')
  let depth = 0
  for (let i = 0; i < chars.length; i++) {
    switch (chars[i]) {
      case '(':
      case '{':
      case '[':
        depth++
        break
      case ')':
      case '}':
      case ']':
        depth--
        break
      default:
        break
    }
  }
  return depth === 0
}
const push = [].push
/**
 * Skip over the content of a childless tag (one whose body must not be lexed
 * as markup) up to its matching closing tag.
 *
 * Searches forward for "</" candidates and lexes each one on a scratch state.
 * A candidate whose name (case-insensitively) differs from `tagName` is
 * ignored and the search continues after it. On a match, the text in between
 * is pushed as one `text` token (with bracket placeholders restored), followed
 * by the closing tag's tokens, and the cursor is moved past the closing tag.
 * When no "</" remains, the rest is handed to `lexText`.
 *
 * @param {string} tagName - name of the tag whose content is being skipped
 * @param {{str: string, position: object, tokens: Array<object>}} state - mutable lexer state
 * @return {void}
 */
function lexSkipTag (tagName, state) {
  const { str, position, tokens } = state
  const wanted = tagName.toLowerCase()
  const total = str.length

  for (let searchFrom = position.index; searchFrom < total;) {
    const closeAt = str.indexOf('</', searchFrom)
    if (closeAt === -1) {
      lexText(state)
      return
    }

    // Lex the candidate closing tag on a scratch state so nothing is
    // committed unless the name matches.
    const probePosition = copyPositionStart(position)
    jumpPosition(probePosition, str, closeAt)
    const probe = { str, position: probePosition, tokens: [] }
    const candidateName = lexTag(probe)

    if (wanted !== candidateName.toLowerCase()) {
      searchFrom = probe.position.index
      continue
    }

    if (closeAt !== position.index) {
      const textStart = copyPositionStart(position)
      jumpPosition(position, str, closeAt)
      const content = str.slice(textStart.index, closeAt)
        .replace(CLOSE_BRACKET_PATTERN, '}')
        .replace(OPEN_BRACKET_PATTERN, '{')
      const textEnd = copyPositionEnd(position)
      tokens.push({
        type: 'text',
        content,
        position: { start: textStart, end: textEnd }
      })
    }

    push.apply(tokens, probe.tokens)
    jumpPosition(position, str, probe.position.index)
    return
  }
}
// Functions exposed by this module: the position helpers, the `lexer` entry
// point, and the individual lexing steps it is built from.
module.exports = {
feedPosition,
jumpPosition,
makeInitialPosition,
copyPosition,
lexer,
lex,
findTextEnd,
lexText,
lexTag,
lexComment,
isWhitespaceChar,
lexTagName,
lexTagAttributes,
lexSkipTag,
}