very-small-parser
Version:
A very small Markdown, HTML, and CSS parser.
507 lines (506 loc) • 19.7 kB
JavaScript
import { toPlainText } from '../toPlainText';
/**
 * Converts each child of the given node with `toMdastInline` and collects the
 * results, skipping children for which the conversion yields nothing.
 *
 * @param {{ children?: Array<object> | null }} parent - Node whose children are converted.
 * @returns {Array<object>} Converted nodes, in source order.
 */
const toMdastInlineChildren = ({ children }) => {
  const res = [];
  // A childless element may have no `children` array at all. Treat that as an
  // empty list (the fallback the list and table conversions in this file also
  // use) instead of throwing on `undefined.length`.
  for (const child of children ?? []) {
    const node = toMdastInline(child);
    if (node) {
      res.push(node);
    }
  }
  return res;
};
/**
 * Builds an inline container node: the requested `type` plus the element's
 * children converted by `toMdastInlineChildren`.
 *
 * @param {string} type - Node type to assign to the result.
 * @param {object} element - Source element whose children are converted.
 * @returns {{ type: string, children: Array<object> }} The new container node.
 */
const createSimpleInlineNode = (type, element) => {
  const children = toMdastInlineChildren(element);
  return { type, children };
};
// Whole-string match for the tag names that `toMdastInline` hands over to
// `toMdast0` when none of its own tag cases produced a node.
const BLOCK_TAGS_REGEX = /^(blockquote|div|h1|h2|h3|h4|h5|h6|hr|ol|p|pre|table|ul)$/;
/** Tag names that map one-to-one onto an inline container node type. */
const INLINE_CONTAINER_TYPES = new Map([
  ['b', 'strong'],
  ['strong', 'strong'],
  ['i', 'emphasis'],
  ['em', 'emphasis'],
  ['del', 'delete'],
  ['spoiler', 'spoiler'],
  ['sub', 'sub'],
  ['mark', 'mark'],
  ['u', 'underline'],
]);

/** `code` / `pre` in inline position: inline math when flagged as math, inline code otherwise. */
const inlineCodeFromElement = (element) => {
  const props = element.properties;
  const isMath = props?.class?.includes('math') || props?.['data-lang'] === 'math';
  const value = toPlainText(element);
  return isMath ? { type: 'inlineMath', value } : { type: 'inlineCode', value, wrap: '`' };
};

/** `sup` marked `data-node="footnote"` whose first child is an `a` pointing at `#id`. */
const footnoteReferenceFromSup = (sup) => {
  if (sup.properties?.['data-node'] !== 'footnote') {
    return undefined;
  }
  const link = sup.children?.[0];
  if (!link || link.type !== 'element' || link.tagName !== 'a') {
    return undefined;
  }
  const target = link.properties?.href || '';
  if (target[0] === '#' && target.length > 1) {
    return {
      type: 'footnoteReference',
      identifier: target.slice(1),
      label: toPlainText(link),
    };
  }
  return undefined;
};

/** `a` element: reference nodes for `#fragment` targets, otherwise an inline link or a link. */
const nodeFromAnchor = (anchor) => {
  const props = anchor.properties || {};
  const { href } = props;
  if (!href) {
    // No target: the anchor converts to nothing.
    return undefined;
  }
  if (href[0] === '#') {
    const identifier = href.slice(1);
    if (props['data-ref'] === 'img') {
      const alt = toPlainText(anchor) || null;
      return {
        type: 'imageReference',
        identifier,
        alt,
        referenceType: alt ? 'full' : 'collapsed',
      };
    }
    const visibleText = toPlainText(anchor).trim();
    return {
      type: 'linkReference',
      identifier,
      referenceType: visibleText ? 'full' : 'collapsed',
      children: toMdastInlineChildren(anchor),
    };
  }
  const { title } = props;
  const kids = anchor.children;
  // An untitled link whose only content is its own http(s) target text.
  const isBareUrl =
    !title &&
    kids?.length === 1 &&
    kids[0].type === 'text' &&
    kids[0].value === href &&
    href.startsWith('http');
  if (isBareUrl) {
    return { type: 'inlineLink', value: href };
  }
  return {
    type: 'link',
    url: href,
    children: toMdastInlineChildren(anchor),
    title,
  };
};

/** `acronym` carrying a `data-icon` value. */
const iconFromAcronym = (acronym) => {
  const emoji = acronym.properties?.['data-icon'];
  return emoji ? { type: 'icon', emoji } : undefined;
};

/** `img` with a `src`; `title` and `alt` default to the empty string. */
const imageFromImg = (img) => {
  const props = img.properties || {};
  if (!props.src) {
    return undefined;
  }
  return {
    type: 'image',
    url: props.src,
    title: props.title || '',
    alt: props.alt || '',
  };
};

/** `cite` holding a single text child that starts with `#`, `~` or `@`. */
const handleFromCite = (cite) => {
  const kids = cite.children;
  if (kids?.length !== 1 || kids[0].type !== 'text') {
    return undefined;
  }
  const text = kids[0].value || '';
  const prefix = text[0];
  if (prefix === '#' || prefix === '~' || prefix === '@') {
    return { type: 'handle', prefix, value: text.slice(1) };
  }
  return undefined;
};

/**
 * Converts one node in inline position.
 *
 * - `text` nodes are returned unchanged.
 * - Elements are converted by tag name. When a tag-specific conversion does not
 *   apply, block-level tags (per `BLOCK_TAGS_REGEX`) go through `toMdast0` and
 *   every other element is returned unchanged.
 * - An `a` element without `href`, and any node that is neither `text` nor
 *   `element`, yields `undefined`.
 *
 * @param {object} node - Node to convert.
 * @returns {object | undefined} The converted node, the input node, or `undefined`.
 */
const toMdastInline = (node) => {
  if (node.type === 'text') {
    return node;
  }
  if (node.type !== 'element') {
    return undefined;
  }
  const { tagName } = node;
  if (tagName === 'code' || tagName === 'pre') {
    return inlineCodeFromElement(node);
  }
  const containerType = INLINE_CONTAINER_TYPES.get(tagName);
  if (containerType) {
    return createSimpleInlineNode(containerType, node);
  }
  if (tagName === 'sup') {
    return footnoteReferenceFromSup(node) ?? createSimpleInlineNode('sup', node);
  }
  if (tagName === 'a') {
    return nodeFromAnchor(node);
  }
  if (tagName === 'br') {
    return { type: 'break' };
  }
  let converted;
  if (tagName === 'acronym') {
    converted = iconFromAcronym(node);
  } else if (tagName === 'img') {
    converted = imageFromImg(node);
  } else if (tagName === 'cite') {
    converted = handleFromCite(node);
  }
  if (converted) {
    return converted;
  }
  return BLOCK_TAGS_REGEX.test(tagName) ? toMdast0(node) : node;
};
/**
 * Converts each child of the given node with `toMdast0` and collects the
 * results, skipping children for which the conversion yields nothing.
 *
 * @param {{ children?: Array<object> | null }} parent - Node whose children are converted.
 * @returns {Array<object>} Converted nodes, in source order.
 */
const toMdastChildren = ({ children }) => {
  const res = [];
  // Tolerate a missing `children` array the same way the list and table
  // conversions in this file do, rather than throwing on `undefined.length`.
  for (const child of children ?? []) {
    const node = toMdast0(child);
    if (node) {
      res.push(node);
    }
  }
  return res;
};
// Cell `align` values that are copied into a table's `align` list; any other
// value is recorded as `null`.
const validAlignAttr = new Set(['left', 'center', 'right']);
/**
 * Converts an element-tree node (or an array of such nodes) into an
 * mdast-style node, without the post-processing done by `fixupMdast`.
 *
 * Dispatch is by `node.type`, then by `tagName` for elements:
 * - an array, or a `root` node, becomes `{ type: 'root' }` with converted children;
 * - `p`, `blockquote`, `pre`/`code`, `h1`-`h6`, `ul`/`ol`, `hr` and `table`
 *   become the matching block node;
 * - `div` becomes a `definition` or `footnoteDefinition` when its `data-node`
 *   property says so, otherwise a nested `root` (as does an empty tag name);
 * - every other element is delegated to `toMdastInline`;
 * - every other node type is returned unchanged.
 *
 * This function itself never writes to the nodes it is given.
 *
 * @param {object | Array<object>} node - Node, or list of nodes, to convert.
 * @returns {object | undefined} The converted node; `undefined` when
 *   `toMdastInline` produces nothing for the element.
 */
export const toMdast0 = (node) => {
  if (Array.isArray(node)) {
    return toMdast0({ type: 'root', children: node });
  }
  switch (node.type) {
    case 'element': {
      const { tagName, properties } = node;
      switch (tagName) {
        case 'p': {
          return {
            type: 'paragraph',
            children: toMdastInlineChildren(node),
          };
        }
        case 'blockquote': {
          const blockquote = {
            type: 'blockquote',
            children: toMdastChildren(node),
          };
          if (properties?.['data-spoiler'] === 'true') {
            blockquote.spoiler = true;
          }
          return blockquote;
        }
        case 'code':
        case 'pre': {
          // When the first child is a nested `code` element, its properties take
          // precedence over the outer element's.
          const firstChild = node.children?.[0];
          const hasNestedCode = firstChild?.type === 'element' && firstChild.tagName === 'code';
          // Merge into a fresh object: assigning onto `properties` directly would
          // leak the child's attributes into the caller's input tree.
          const attr = {
            ...properties,
            ...(hasNestedCode ? firstChild.properties : undefined),
          };
          if (attr['data-math'] === 'true') {
            return {
              type: 'math',
              value: toPlainText(node),
            };
          }
          const lang = attr['data-lang'] || '';
          const meta = attr['data-meta'] || '';
          const mdastNode = {
            type: 'code',
            value: toPlainText(node),
            lang,
          };
          if (meta) {
            mdastNode.meta = meta;
          }
          return mdastNode;
        }
        case 'h1':
        case 'h2':
        case 'h3':
        case 'h4':
        case 'h5':
        case 'h6': {
          // The digit after "h" is the heading depth.
          return {
            type: 'heading',
            depth: Number.parseInt(tagName[1], 10),
            children: toMdastInlineChildren(node),
          };
        }
        case 'ul':
        case 'ol': {
          const ordered = tagName === 'ol';
          const list = {
            type: 'list',
            ordered,
            children: [],
          };
          if (ordered) {
            // `??` keeps a numeric 0; an unparsable value falls back to 1
            // instead of leaking NaN into the tree.
            const start = Number.parseInt(properties?.start ?? '1', 10);
            list.start = Number.isNaN(start) ? 1 : start;
          }
          for (const child of node.children || []) {
            // Only `li` elements become list items; anything else is skipped.
            if (child.type !== 'element' || child.tagName !== 'li') {
              continue;
            }
            const dataChecked = child.properties?.['data-checked'];
            list.children.push({
              type: 'listItem',
              // `null` when the item carries no `data-checked` value.
              checked: dataChecked ? dataChecked === 'true' : null,
              children: toMdastChildren(child),
            });
          }
          return list;
        }
        case 'hr':
          return { type: 'thematicBreak' };
        case 'table': {
          const table = {
            type: 'table',
            align: [],
            children: [],
          };
          // Column alignment is read from the cells of the first row only.
          let firstRow = true;
          const processRow = (hastRow) => {
            const row = {
              type: 'tableRow',
              children: [],
            };
            for (const child of hastRow.children || []) {
              if (child.type !== 'element') {
                continue;
              }
              if (firstRow) {
                const alignAttr = child.properties?.align;
                table.align.push(validAlignAttr.has(alignAttr) ? alignAttr : null);
              }
              row.children.push({
                type: 'tableCell',
                children: toMdastInlineChildren(child),
              });
            }
            table.children.push(row);
            firstRow = false;
          };
          const processRows = (section) => {
            for (const child of section.children || []) {
              if (child.type === 'element' && child.tagName === 'tr') {
                processRow(child);
              }
            }
          };
          for (const tableChild of node.children || []) {
            if (tableChild.type !== 'element') {
              continue;
            }
            switch (tableChild.tagName) {
              case 'thead':
              case 'tbody':
                processRows(tableChild);
                break;
              case 'tr':
                processRow(tableChild);
                break;
            }
          }
          return table;
        }
        case 'div': {
          const attr = properties || {};
          switch (attr['data-node']) {
            case 'definition': {
              const label = attr['data-label'];
              const identifier = attr['data-id'];
              const url = attr['data-url'];
              // An incomplete definition is treated as a plain `div` below.
              if (!label || !identifier || !url) {
                break;
              }
              const definitionNode = {
                type: 'definition',
                label,
                identifier,
                url,
              };
              const title = attr['data-title'];
              if (title) {
                definitionNode.title = title;
              }
              return definitionNode;
            }
            case 'footnoteDefinition': {
              return {
                type: 'footnoteDefinition',
                label: attr['data-label'],
                identifier: attr['data-id'],
                children: toMdastChildren(node),
              };
            }
          }
          // A plain `div` is a transparent container: a nested root.
          return {
            type: 'root',
            children: toMdastChildren(node),
          };
        }
        case '': {
          return {
            type: 'root',
            children: toMdastChildren(node),
          };
        }
        default: {
          return toMdastInline(node);
        }
      }
    }
    case 'root': {
      return {
        type: 'root',
        children: toMdastChildren(node),
      };
    }
  }
  return node;
};
/** Node types that count as block-level content. */
const BLOCK_NODE_TYPES = new Set([
  'paragraph',
  'heading',
  'blockquote',
  'list',
  'code',
  'definition',
  'thematicBreak',
  'table',
  'math',
  'footnoteDefinition',
]);

/**
 * Tells whether a node is a block-level node, judged by its `type` alone.
 *
 * @param {{ type?: string }} node - Node to classify.
 * @returns {boolean} `true` for block types, `false` for everything else.
 */
const isBlock = (node) => BLOCK_NODE_TYPES.has(node.type);
/**
 * Collects the non-block descendants of `node`, in document order, looking
 * through any block nodes on the way. The block nodes themselves are dropped,
 * so a block without children contributes nothing.
 *
 * @param {{ children?: Array<object> | null }} node - Node whose children are flattened.
 * @returns {Array<object>} A new array holding the non-block nodes.
 */
const flattenInlineChildren = (node) => {
  const inlineNodes = [];
  // Depth-first walk that appends into one shared accumulator.
  const collect = (parent) => {
    for (const child of parent.children ?? []) {
      if (isBlock(child)) {
        collect(child);
      } else {
        inlineNodes.push(child);
      }
    }
  };
  collect(node);
  return inlineNodes;
};
/**
 * Rewrites `node.children` in place so that every immediate child is a block node.
 *
 * - Nested `root` children are replaced by their own children.
 * - Block children are kept as they are.
 * - Non-block children are appended to the most recent block when that block
 *   is a paragraph; otherwise a new paragraph is opened for them.
 * - Kept blocks are normalised too: `blockquote` and `footnoteDefinition`
 *   recursively, each child of a `list` recursively, and `paragraph` / `heading`
 *   by flattening their children with `flattenInlineChildren`.
 *
 * @param {{ children?: Array<object> | null }} node - Container to normalise (mutated).
 * @returns {void}
 */
const ensureChildrenAreBlockNodes = (node) => {
  const blocks = [];
  // Most recently emitted block; inline content may only join it if it is a paragraph.
  let openBlock;
  const place = (child) => {
    if (child.type === 'root') {
      // Splice a nested root open: its children take its place, in order.
      for (const inner of child.children || []) {
        place(inner);
      }
      return;
    }
    if (isBlock(child)) {
      openBlock = child;
      blocks.push(child);
    } else {
      if (openBlock?.type !== 'paragraph') {
        openBlock = { type: 'paragraph', children: [] };
        blocks.push(openBlock);
      }
      if (!openBlock.children) {
        openBlock.children = [];
      }
      openBlock.children.push(child);
    }
    const { type } = child;
    if (type === 'blockquote' || type === 'footnoteDefinition') {
      ensureChildrenAreBlockNodes(child);
    } else if (type === 'list') {
      for (const item of child.children || []) {
        ensureChildrenAreBlockNodes(item);
      }
    } else if (type === 'paragraph' || type === 'heading') {
      child.children = flattenInlineChildren(child);
    }
  };
  for (const child of node.children ?? []) {
    place(child);
  }
  node.children = blocks;
};
/**
 * Normalises a converted tree: the result is always a `root` node whose
 * immediate children are block nodes (see `ensureChildrenAreBlockNodes`).
 *
 * @param {object | null | undefined} node - Output of `toMdast0`. A `root` node
 *   is normalised in place; any other node is wrapped in a new root.
 * @returns {{ type: 'root', children: Array<object> }} The normalised root.
 */
export const fixupMdast = (node) => {
  // `toMdast0` yields `undefined` when the input converts to nothing (for
  // example an `a` element without `href`). Return an empty document instead of
  // throwing on `node.type`.
  if (node == null) {
    return { type: 'root', children: [] };
  }
  const root = node.type === 'root' ? node : { type: 'root', children: [node] };
  ensureChildrenAreBlockNodes(root);
  return root;
};
/**
 * Converts a node (or an array of nodes) with `toMdast0` and then normalises
 * the result with `fixupMdast`.
 *
 * @param {object | Array<object>} node - Input accepted by `toMdast0`.
 * @returns {object} Whatever `fixupMdast` returns for the converted tree.
 */
export const toMdast = (node) => {
  const converted = toMdast0(node);
  return fixupMdast(converted);
};