UNPKG

very-small-parser

Version:

A very small Markdown, HTML, and CSS parser.

507 lines (506 loc) 19.7 kB
import { toPlainText } from '../toPlainText';

/**
 * Converts every child of a HAST node with `toMdastInline`.
 *
 * Children for which the conversion yields a falsy value are omitted. A node
 * without a `children` list is treated as having no children.
 *
 * @param {{children?: object[]}} node - HAST parent node.
 * @returns {object[]} Converted inline nodes, in document order.
 */
const toMdastInlineChildren = ({ children }) => {
  const res = [];
  for (const child of children ?? []) {
    const node = toMdastInline(child);
    if (node) res.push(node);
  }
  return res;
};

/**
 * Builds an inline parent node of the given type whose children are the
 * inline conversion of the element's children.
 *
 * @param {string} type - Node type to emit (e.g. `'strong'`).
 * @param {object} element - HAST element supplying the children.
 * @returns {{type: string, children: object[]}} The new node.
 */
const createSimpleInlineNode = (type, element) => ({
  type,
  children: toMdastInlineChildren(element),
});

/** Tag names that `toMdastInline` hands over to the block-level converter. */
const BLOCK_TAGS_REGEX = /^(blockquote|div|h1|h2|h3|h4|h5|h6|hr|ol|p|pre|table|ul)$/;

/**
 * Converts a single HAST node found in inline position.
 *
 * Recognised elements are mapped to their inline counterpart. Elements whose
 * tag matches `BLOCK_TAGS_REGEX` are delegated to `toMdast0`. Any other
 * element, and every text node, is returned unchanged.
 *
 * @param {object} node - HAST node.
 * @returns {object|undefined} The converted node. `undefined` is returned for
 *   an `<a>` without an `href` and for node types other than `element` and
 *   `text`; the child-list helpers in this module skip such results.
 */
const toMdastInline = (node) => {
  const { type } = node;
  switch (type) {
    case 'element': {
      const { tagName } = node;
      switch (tagName) {
        case 'code':
        case 'pre': {
          const attr = node.properties;
          const isMath = attr?.class?.includes('math') || attr?.['data-lang'] === 'math';
          if (isMath) {
            return {
              type: 'inlineMath',
              value: toPlainText(node),
            };
          }
          return {
            type: 'inlineCode',
            value: toPlainText(node),
            wrap: '`',
          };
        }
        case 'b':
        case 'strong':
          return createSimpleInlineNode('strong', node);
        case 'i':
        case 'em':
          return createSimpleInlineNode('emphasis', node);
        case 'del':
          return createSimpleInlineNode('delete', node);
        case 'spoiler':
          return createSimpleInlineNode('spoiler', node);
        case 'sup': {
          // A <sup data-node="footnote"> wrapping an <a href="#id"> is a
          // footnote reference; any other <sup> is plain superscript.
          const attr = node.properties;
          const isFootnoteReference = attr?.['data-node'] === 'footnote';
          if (isFootnoteReference) {
            const anchor = node.children?.[0];
            if (anchor && anchor.type === 'element' && anchor.tagName === 'a') {
              const anchorAttr = anchor.properties;
              const href = anchorAttr?.href || '';
              if (href[0] === '#' && href.length > 1) {
                const identifier = href.slice(1);
                const label = toPlainText(anchor);
                return {
                  type: 'footnoteReference',
                  identifier,
                  label,
                };
              }
            }
          }
          return createSimpleInlineNode('sup', node);
        }
        case 'sub':
          return createSimpleInlineNode('sub', node);
        case 'mark':
          return createSimpleInlineNode('mark', node);
        case 'u':
          return createSimpleInlineNode('underline', node);
        case 'acronym': {
          const attr = node.properties;
          const emoji = attr?.['data-icon'];
          if (emoji) {
            return {
              type: 'icon',
              emoji: emoji,
            };
          }
          break;
        }
        case 'a': {
          const attr = node.properties || {};
          const href = attr.href;
          if (href) {
            const isAnchor = href[0] === '#';
            if (isAnchor) {
              // "#id" links are references; data-ref="img" marks an image one.
              const isImageAnchor = attr['data-ref'] === 'img';
              const identifier = href.slice(1);
              if (isImageAnchor) {
                const alt = toPlainText(node) || null;
                return {
                  type: 'imageReference',
                  identifier,
                  alt,
                  referenceType: alt ? 'full' : 'collapsed',
                };
              }
              const text = toPlainText(node).trim();
              return {
                type: 'linkReference',
                identifier,
                referenceType: text ? 'full' : 'collapsed',
                children: toMdastInlineChildren(node),
              };
            }
            const title = attr.title;
            // An untitled link whose only child is its own "http..." URL as
            // text is emitted as a bare inline link.
            const isBareUrl =
              !title &&
              node.children?.length === 1 &&
              node.children[0].type === 'text' &&
              node.children[0].value === href &&
              href.startsWith('http');
            if (isBareUrl) {
              return {
                type: 'inlineLink',
                value: href,
              };
            }
            return {
              type: 'link',
              url: href,
              children: toMdastInlineChildren(node),
              title,
            };
          }
          // NOTE(review): an anchor without href is dropped together with its
          // content, unlike the other unmatched cases which pass the element
          // through. Kept as-is; confirm this is intended.
          return;
        }
        case 'img': {
          const attr = node.properties || {};
          const src = attr.src;
          if (src) {
            const title = attr.title || '';
            const alt = attr.alt || '';
            return {
              type: 'image',
              url: src,
              title,
              alt,
            };
          }
          break;
        }
        case 'cite': {
          // A <cite> holding a single text child that starts with '#', '~'
          // or '@' becomes a handle node.
          const children = node.children;
          if (children?.length === 1 && children[0].type === 'text') {
            const text = children[0].value || '';
            const prefix = text[0];
            if (prefix === '#' || prefix === '~' || prefix === '@') {
              return {
                type: 'handle',
                prefix,
                value: text.slice(1),
              };
            }
          }
          break;
        }
        case 'br': {
          return {
            type: 'break',
          };
        }
      }
      if (BLOCK_TAGS_REGEX.test(tagName)) {
        return toMdast0(node);
      }
      return node;
    }
    case 'text':
      return node;
  }
};

/**
 * Converts every child of a HAST node with `toMdast0`.
 *
 * Children for which the conversion yields a falsy value are omitted. A node
 * without a `children` list is treated as having no children.
 *
 * @param {{children?: object[]}} node - HAST parent node.
 * @returns {object[]} Converted nodes, in document order.
 */
const toMdastChildren = ({ children }) => {
  const res = [];
  for (const child of children ?? []) {
    const node = toMdast0(child);
    if (node) res.push(node);
  }
  return res;
};

/** Values of a cell's `align` property that are copied into `table.align`. */
const validAlignAttr = new Set(['left', 'center', 'right']);

/**
 * Converts a HAST node, or an array of HAST nodes, to an MDAST-like tree
 * without any structural fix-up.
 *
 * An array is treated as the children of a root node. Elements that are not
 * handled here are delegated to `toMdastInline`. Nodes that are neither
 * `element` nor `root` are returned unchanged.
 *
 * @param {object|object[]} node - HAST node or list of HAST nodes.
 * @returns {object|undefined} The converted node (see `toMdastInline` for the
 *   cases that yield `undefined`).
 */
export const toMdast0 = (node) => {
  if (Array.isArray(node)) return toMdast0({ type: 'root', children: node });
  switch (node.type) {
    case 'element': {
      const { tagName, properties } = node;
      switch (tagName) {
        case 'p': {
          return {
            type: 'paragraph',
            children: toMdastInlineChildren(node),
          };
        }
        case 'blockquote': {
          const blockquote = {
            type: 'blockquote',
            children: toMdastChildren(node),
          };
          if (properties?.['data-spoiler'] === 'true') blockquote.spoiler = true;
          return blockquote;
        }
        case 'code':
        case 'pre': {
          // For <pre><code ...>, the nested <code> element's properties
          // override the outer ones. Merge into a fresh object so the input
          // node's `properties` are never mutated.
          const firstChild = (node.children || [])[0];
          const nestedProperties =
            firstChild && firstChild.type === 'element' && firstChild.tagName === 'code'
              ? firstChild.properties
              : undefined;
          const attr = { ...properties, ...nestedProperties };
          const isMath = attr['data-math'] === 'true';
          if (isMath) {
            const mdastNode = {
              type: 'math',
              value: toPlainText(node),
            };
            return mdastNode;
          }
          const lang = attr['data-lang'] || '';
          const meta = attr['data-meta'] || '';
          const mdastNode = {
            type: 'code',
            value: toPlainText(node),
            lang,
          };
          if (meta) mdastNode.meta = meta;
          return mdastNode;
        }
        case 'h1':
        case 'h2':
        case 'h3':
        case 'h4':
        case 'h5':
        case 'h6': {
          // The second character of the tag name is the heading level.
          const depth = Number.parseInt(tagName[1], 10);
          const headingNode = {
            type: 'heading',
            depth,
            children: toMdastInlineChildren(node),
          };
          return headingNode;
        }
        case 'ul':
        case 'ol': {
          const children = node.children || [];
          const ordered = tagName === 'ol';
          const list = {
            type: 'list',
            ordered,
            children: [],
          };
          if (ordered) {
            // Missing or unparsable `start` falls back to 1; an explicit 0
            // (string or number) is preserved.
            const start = Number.parseInt(properties?.start ?? '', 10);
            list.start = Number.isNaN(start) ? 1 : start;
          }
          for (const child of children) {
            // Only <li> elements become list items; anything else is skipped.
            if (child.type !== 'element' || child.tagName !== 'li') continue;
            const dataChecked = child.properties?.['data-checked'];
            const checked = dataChecked ? dataChecked === 'true' : null;
            const item = {
              type: 'listItem',
              checked,
              children: toMdastChildren(child),
            };
            list.children.push(item);
          }
          return list;
        }
        case 'hr':
          return { type: 'thematicBreak' };
        case 'table': {
          const table = {
            type: 'table',
            align: [],
            children: [],
          };
          // Column alignment is read from the cells of the first row only.
          let firstRow = true;
          const processRow = (hastRow) => {
            const row = {
              type: 'tableRow',
              children: [],
            };
            for (const child of hastRow.children || []) {
              if (child.type !== 'element') continue;
              if (firstRow) {
                let align = null;
                const alignAttr = child.properties?.align;
                if (validAlignAttr.has(alignAttr)) align = alignAttr;
                table.align.push(align);
              }
              const cell = {
                type: 'tableCell',
                children: toMdastInlineChildren(child),
              };
              row.children.push(cell);
            }
            table.children.push(row);
            firstRow = false;
          };
          const processRows = (hastSection) => {
            for (const child of hastSection.children || []) {
              if (child.type === 'element' && child.tagName === 'tr') processRow(child);
            }
          };
          for (const tableChild of node.children || []) {
            if (tableChild.type !== 'element') continue;
            switch (tableChild.tagName) {
              case 'thead':
              case 'tbody': {
                processRows(tableChild);
                break;
              }
              case 'tr': {
                processRow(tableChild);
                break;
              }
            }
          }
          return table;
        }
        case 'div': {
          const attr = node.properties || {};
          const nodeType = attr['data-node'];
          switch (nodeType) {
            case 'definition': {
              const label = attr['data-label'];
              const identifier = attr['data-id'];
              const url = attr['data-url'];
              // An incomplete definition falls through to the generic <div>
              // handling below.
              if (!label || !identifier || !url) break;
              const definitionNode = {
                type: 'definition',
                label,
                identifier,
                url,
              };
              const title = attr['data-title'];
              if (title) definitionNode.title = title;
              return definitionNode;
            }
            case 'footnoteDefinition': {
              const label = attr['data-label'];
              const identifier = attr['data-id'];
              const children = toMdastChildren(node);
              const footnoteDefinitionNode = {
                type: 'footnoteDefinition',
                label,
                identifier,
                children,
              };
              return footnoteDefinitionNode;
            }
          }
          // A plain <div> is a transparent container: it becomes a nested
          // root, which ensureChildrenAreBlockNodes later splices into its
          // parent.
          return {
            type: 'root',
            children: toMdastChildren(node),
          };
        }
        case '': {
          return {
            type: 'root',
            children: toMdastChildren(node),
          };
        }
        default: {
          return toMdastInline(node);
        }
      }
    }
    case 'root': {
      return {
        type: 'root',
        children: toMdastChildren(node),
      };
    }
  }
  return node;
};

/**
 * Tells whether a converted node is one of the block-level node types.
 *
 * @param {{type: string}} node - Converted node.
 * @returns {boolean} `true` for block node types, `false` otherwise.
 */
const isBlock = (node) => {
  switch (node.type) {
    case 'paragraph':
    case 'heading':
    case 'blockquote':
    case 'list':
    case 'code':
    case 'definition':
    case 'thematicBreak':
    case 'table':
    case 'math':
    case 'footnoteDefinition':
      return true;
  }
  return false;
};

/**
 * Returns the children of `node` with every block-level child recursively
 * replaced by its own (flattened) children, so no block node remains.
 *
 * @param {{children?: object[]}} node - Node whose children are flattened.
 * @returns {object[]} A new array of non-block nodes.
 */
const flattenInlineChildren = (node) => {
  let result = [];
  for (const child of node.children ?? []) {
    if (isBlock(child)) {
      result = result.concat(flattenInlineChildren(child));
    } else {
      result.push(child);
    }
  }
  return result;
};

/**
 * Rewrites `node.children` in place so that every immediate child is a block
 * node, recursing into blockquotes, list items and footnote definitions.
 *
 * @param {{children?: object[]}} node - Container node to normalise.
 * @returns {void}
 */
const ensureChildrenAreBlockNodes = (node) => {
  // Ensure that immediate children of the root node are always block nodes.
  let lastBlockNode;
  let children = node.children ?? [];
  const newChildren = [];
  for (let i = 0; i < children.length; i++) {
    const child = children[i];
    if (child.type === 'root') {
      // Splice the nested root's children in place of the root itself, then
      // revisit the same index so the first spliced child is processed.
      const head = children.slice(0, i);
      const tail = children.slice(i + 1);
      const mid = child.children || [];
      children = head.concat(mid).concat(tail);
      i--;
      continue;
    }
    if (isBlock(child)) {
      lastBlockNode = child;
      newChildren.push(child);
    } else {
      // Non-block nodes are appended to the preceding paragraph, or to a new
      // paragraph when the preceding block is not one.
      if (!lastBlockNode || lastBlockNode.type !== 'paragraph') {
        lastBlockNode = {
          type: 'paragraph',
          children: [],
        };
        newChildren.push(lastBlockNode);
      }
      if (!lastBlockNode.children) lastBlockNode.children = [];
      lastBlockNode.children.push(child);
    }
    switch (child.type) {
      case 'blockquote':
      case 'footnoteDefinition':
        ensureChildrenAreBlockNodes(child);
        break;
      case 'list': {
        const items = child.children;
        if (items) {
          for (const item of items) ensureChildrenAreBlockNodes(item);
        }
        break;
      }
      case 'paragraph':
      case 'heading':
        // Paragraphs and headings must not contain block nodes.
        child.children = flattenInlineChildren(child);
        break;
    }
  }
  node.children = newChildren;
};

/**
 * Normalises a converted tree: wraps a non-root node in a root and makes sure
 * the root's immediate children are block nodes.
 *
 * @param {object} node - Converted node, typically the result of `toMdast0`.
 * @returns {object} A root node. The input is modified in place when it is
 *   already a root.
 */
export const fixupMdast = (node) => {
  // Ensure the root node is always a root node.
  const root =
    node.type !== 'root'
      ? {
          type: 'root',
          children: [node],
        }
      : node;
  ensureChildrenAreBlockNodes(root);
  return root;
};

/**
 * Converts a HAST node (or array of nodes) to a normalised tree by running
 * `toMdast0` followed by `fixupMdast`.
 *
 * @param {object|object[]} node - HAST node or list of HAST nodes.
 * @returns {object} The normalised root node.
 */
export const toMdast = (node) => fixupMdast(toMdast0(node));