/**
 * micro-mdx-parser
 * A tiny parser to convert markdown or html into JSON
 */

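/*
  Usage sketch (illustrative only). The require path './lexer' and the exact
  token shapes shown below are assumptions based on the exports at the bottom
  of this file, not documented behavior.

    const { lexer } = require('./lexer')

    const tokens = lexer('<h1 className="title">Hello</h1>', {
      childlessTags: ['script', 'style', 'template']
    })

    // tokens is roughly:
    // [
    //   { type: 'tag-start', close: false, position: { start: { index: 0, line: 1, column: 1 } } },
    //   { type: 'tag', content: 'h1' },
    //   { type: 'attribute', content: 'className="title"', src: ' className="title"' },
    //   { type: 'tag-end', close: false, position: { end: { ... } } },
    //   { type: 'text', content: 'Hello', position: { start: { ... }, end: { ... } } },
    //   { type: 'tag-start', close: true, position: { start: { ... } } },
    //   { type: 'tag', content: 'h1' },
    //   { type: 'tag-end', close: false, position: { end: { ... } } }
    // ]
*/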
const { startsWith, endsWith, stringIncludes, arrayIncludes, getTextBetweenChars } = require('./utils')
const { REP_SYMBOL, REP_SYMBOL_PATTERN, fixOpenBracket, ARROW_SYMBOL, ARROW_SYMBOL_PATTERN } = require('./utils/find-code')
const { CLOSE_ELEMENT_SYMBOL_PATTERN } = require('./utils/find-components')
const { CLOSE_BRACKET_PATTERN, OPEN_BRACKET_PATTERN } = require('./utils/find-inline-arrow-fn')

function feedPosition(position, str, len) {
  const start = position.index
  const end = position.index = start + len
  for (let i = start; i < end; i++) {
    const char = str.charAt(i)
    if (char === '\n') {
      position.line++
      position.column = 0
    } else {
      position.column++
    }
  }
}

function jumpPosition (position, str, end) {
  const len = end - position.index
  return feedPosition(position, str, len)
}

function makeInitialPosition () {
  return {
    index: 0,
    column: 1,
    line: 1, // Start at line 1
    // line: 0
  }
}

function copyPositionStart(position) {
  return {
    index: position.index,
    line: position.line,
    // column: position.column + 1, // start at char 1
    column: (!position.column) ? position.column + 1 : position.column
  }
}

function copyPositionEnd(position) {
  return {
    // index: position.index + 1,
    index: position.index,
    line: position.line,
    // column: position.column + 1
    column: (!position.column) ? position.column + 1 : position.column
  }
}

// Old offset was index/column was 1 off in some cases
function copyPosition(position) {
  return {
    index: position.index,
    line: position.line,
    column: position.column
  }
}

function lexer (str, options) {
  const state = {
    str,
    options,
    position: makeInitialPosition(),
    tokens: []
  }
  lex(state)
  return state.tokens
}

function lex (state) {
  const {str, options: {childlessTags}} = state
  const len = (str && str.length) ? str.length : 0
  while (state.position.index < len) {
    const start = state.position.index
    lexText(state)
    if (state.position.index === start) {
      const isComment = startsWith(str, '!--', start + 1)
      if (isComment) {
        lexComment(state)
      } else {
        const tagName = lexTag(state)
        const safeTag = tagName.toLowerCase()
        if (arrayIncludes(childlessTags, safeTag)) {
          lexSkipTag(tagName, state)
        }
      }
    }
  }
}

const alphanumeric = /[A-Za-z0-9]/

function findTextEnd (str, index) {
  while (true) {
    const textEnd = str.indexOf('<', index)
    if (textEnd === -1) {
      return textEnd
    }
    const char = str.charAt(textEnd + 1)
    if (char === '/' || char === '!' || alphanumeric.test(char)) {
      return textEnd
    }
    index = textEnd + 1
  }
}

function lexText (state) {
  const {str, position} = state
  let textEnd = findTextEnd(str, position.index)
  if (textEnd === position.index) return
  if (textEnd === -1) {
    textEnd = str.length
  }
  const start = copyPositionStart(position)
  const content = str.slice(position.index, textEnd)
  jumpPosition(position, str, textEnd)
  const end = copyPositionEnd(position)
  state.tokens.push({
    type: 'text',
    /* Fix inline code blocks if they were tweaked for parser */
    content: fixOpenBracket(content)
      .replace(CLOSE_ELEMENT_SYMBOL_PATTERN, '/>')
      .replace(CLOSE_BRACKET_PATTERN, '}')
      .replace(OPEN_BRACKET_PATTERN, '{'),
    position: { start, end }
  })
}

function lexComment (state) {
  const {str, position} = state
  const start = copyPositionStart(position)
  feedPosition(position, str, 4) // "<!--".length
  let contentEnd = str.indexOf('-->', position.index)
  let commentEnd = contentEnd + 3 // "-->".length
  if (contentEnd === -1) {
    contentEnd = commentEnd = str.length
  }
  const content = str.slice(position.index, contentEnd)
  jumpPosition(position, str, commentEnd)
  state.tokens.push({
    type: 'comment',
    content,
    position: {
      start,
      end: copyPositionEnd(position)
    }
  })
}

function lexTag (state) {
  const {str, position} = state
  {
    const secondChar = str.charAt(position.index + 1)
    const close = secondChar === '/'
    const start = copyPositionStart(position, 0)
    feedPosition(position, str, close ? 2 : 1)
    state.tokens.push({type: 'tag-start', close, position: {start}})
    /* persist start for tagOpen lookup below */
    state._tagStart = start.index
  }
  const tagName = lexTagName(state)
  lexTagAttributes(state, tagName)
  {
    const firstChar = str.charAt(position.index)
    const close = firstChar === '/'
    feedPosition(position, str, close ? 2 : 1)
    const end = copyPositionEnd(position)
    const tagOpen = getTextBetweenChars(str, state._tagStart, end.index)
    // console.log('Opening text match', tagOpen)
    const endToken = {
      type: 'tag-end',
      close,
      position: {end},
      // tagOpen,
      // name: state.currentTagName,
    }
    if (tagOpen.indexOf('/>') > -1) {
      endToken.isSelfClosing = true
    }
    state.tokens.push(endToken)
  }
  return tagName
}

// See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#special-white-space
// const whitespace = /\s/ // old stripped newlines
const whitespace = /[^\S\r]/

function isWhitespaceChar(char) {
  return whitespace.test(char)
}

function lexTagName (state) {
  const {str, position} = state
  const len = str.length
  let start = position.index
  while (start < len) {
    const char = str.charAt(start)
    const isTagChar = !(isWhitespaceChar(char) || char === '/' || char === '>')
    if (isTagChar) break
    start++
  }
  let end = start + 1
  while (end < len) {
    const char = str.charAt(end)
    const isTagChar = !(isWhitespaceChar(char) || char === '/' || char === '>')
    if (!isTagChar) break
    end++
  }
  jumpPosition(position, str, end)
  const tagName = str.slice(start, end)
  // state.currentTagName = tagName // for debugging
  state.tokens.push({
    type: 'tag',
    content: tagName
  })
  return tagName
}

function lexTagAttributes (state, tagName) {
  let rawAttrs = ''
  // console.log('tagName', tagName)
  // console.log('state', state)
  const {str, position, tokens} = state
  // console.log(`xxxx ${tagName}`, tokens)
  let cursor = position.index
  let quote = null // null, single-, or double-quote
  let wordBegin = cursor // index of word start
  const words = [] // "key", "key=value", "key='value'", etc
  const len = str.length
  let prevWasClose = false
  while (cursor < len) {
    const char = str.charAt(cursor)
    rawAttrs += char
    /* If opening bracket is brackets {}. Ensure balance */
    // TODO harden to access string values with unbalanced brackets
    if (quote === '{' && char === '}') {
      const isQuoteEnd = isBalanced(str.slice(wordBegin, cursor + 1))
      if (isQuoteEnd) {
        quote = null
      }
      cursor = cursor + 1
      prevWasClose = true
      continue
    }
    if (quote) {
      const isQuoteEnd = char === quote
      if (isQuoteEnd) {
        quote = null
      }
      cursor++
      prevWasClose = true
      continue
    }
    /* If closed quote & char is newline, skip. Else keep newlines in attributes */
    if (prevWasClose && char === '\n') {
      if (cursor !== wordBegin) {
        words.push(str.slice(wordBegin, cursor))
      }
      wordBegin = cursor + 1
      cursor++
      prevWasClose = false
      continue
    }
    const isTagEnd = char === '/' || char === '>'
    if (isTagEnd) {
      if (cursor !== wordBegin) {
        words.push(str.slice(wordBegin, cursor))
      }
      break
    }
    // console.log('char', char)
    const isWordEnd = isWhitespaceChar(char)
    if (isWordEnd) {
      if (cursor !== wordBegin) {
        words.push(str.slice(wordBegin, cursor))
      }
      wordBegin = cursor + 1
      cursor++
      continue
    }
    const isQuoteStart = char === '\'' || char === '"' || char === '`'
    if (isQuoteStart) {
      quote = char
      cursor++
      continue
    }
    const isBracketStart = char === '{'
    if (isBracketStart) {
      quote = char
      cursor++
      continue
    }
    cursor++
  }
  jumpPosition(position, str, cursor)
  // Raw attribute source
  // console.log('rawAttrs', rawAttrs)
  const src = rawAttrs.replace(/\/?>?$/, '')
  // console.log("src", src)
  const wLen = words.length
  const type = 'attribute'
  for (let i = 0; i < wLen; i++) {
    const word = words[i]
    const isNotPair = word.indexOf('=') === -1
    if (isNotPair) {
      const secondWord = words[i + 1]
      if (secondWord && startsWith(secondWord, '=')) {
        if (secondWord.length > 1) {
          const newWord = word + secondWord
          tokens.push({ type, content: newWord, src })
          i += 1
          continue
        }
        const thirdWord = words[i + 2]
        i += 1
        if (thirdWord) {
          const newWord = word + '=' + thirdWord
          tokens.push({ type, content: newWord, src })
          i += 1
          continue
        }
      }
    }
    if (endsWith(word, '=')) {
      const secondWord = words[i + 1]
      if (secondWord && !stringIncludes(secondWord, '=')) {
        const newWord = word + secondWord
        tokens.push({ type, content: newWord, src })
        i += 1
        continue
      }
      const newWord = word.slice(0, -1)
      tokens.push({ type, content: newWord, src })
      continue
    }
    if (word !== '\n') {
      // console.log(`word to add ${type}`, word)
      tokens.push({
        type,
        content: (word.indexOf(ARROW_SYMBOL) === -1) ? word : word.replace(ARROW_SYMBOL_PATTERN, ' => '),
        src
      })
    }
  }
}

/**
 * Verify brackets are balanced
 * @param {string} str - string with code
 * @return {Boolean}
 */
function isBalanced(str) {
  return !str.split('').reduce((uptoPrevChar, thisChar) => {
    if (thisChar === '(' || thisChar === '{' || thisChar === '[') {
      return ++uptoPrevChar
    } else if (thisChar === ')' || thisChar === '}' || thisChar === ']') {
      return --uptoPrevChar
    }
    return uptoPrevChar
  }, 0)
}

const push = [].push

function lexSkipTag (tagName, state) {
  const {str, position, tokens} = state
  const safeTagName = tagName.toLowerCase()
  const len = str.length
  let index = position.index
  while (index < len) {
    const nextTag = str.indexOf('</', index)
    if (nextTag === -1) {
      lexText(state)
      break
    }
    const tagStartPosition = copyPositionStart(position)
    jumpPosition(tagStartPosition, str, nextTag)
    const tagState = {str, position: tagStartPosition, tokens: []}
    const name = lexTag(tagState)
    if (safeTagName !== name.toLowerCase()) {
      index = tagState.position.index
      continue
    }
    if (nextTag !== position.index) {
      const textStart = copyPositionStart(position)
      jumpPosition(position, str, nextTag)
      tokens.push({
        type: 'text',
        content: str.slice(textStart.index, nextTag)
          .replace(CLOSE_BRACKET_PATTERN, '}')
          .replace(OPEN_BRACKET_PATTERN, '{'),
        position: {
          start: textStart,
          end: copyPositionEnd(position)
        }
      })
    }
    push.apply(tokens, tagState.tokens)
    jumpPosition(position, str, tagState.position.index)
    break
  }
}

module.exports = {
  feedPosition,
  jumpPosition,
  makeInitialPosition,
  copyPosition,
  lexer,
  lex,
  findTextEnd,
  lexText,
  lexTag,
  lexComment,
  isWhitespaceChar,
  lexTagName,
  lexTagAttributes,
  lexSkipTag,
}
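/*
  Note on isBalanced (illustrative only; it is internal and not exported):
  lexTagAttributes uses it to decide when a `{ ... }` attribute value has
  actually closed, so nested braces in expression-style attribute values do
  not end the value early. For example:

    isBalanced('{() => ({ a: 1 })}')  // true  -> the brace "quote" can close
    isBalanced('{() => ({ a: 1 }')    // false -> keep consuming characters
*/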