// slimdown-js
// Version:
// A regex-based Markdown parser.
// 652 lines (571 loc) • 19.9 kB
// text/typescript
/**
 * Replacement callback accepted by the rule tables: it receives the matched
 * substring followed by any further arguments and returns the replacement text.
 */
export type RegexReplacer = (substring: string, ...args: any[]) => string;
/**
* Slimdown - A very basic regex-based Markdown parser. Supports the
* following elements (and can be extended via addRule()):
*
* - Headers
* - Images
* - Links
* - Bold
* - Emphasis
* - Deletions
* - Quotes
* - Inline code
* - Code blocks
* - Blockquotes
* - Ordered/unordered lists (nested; two spaces of indentation per level)
* - Horizontal rules
* - Superscript and subscript (`a^2^` or `z~1~`)
*
* Original author: Johnny Broadway <johnny@johnnybroadway.com>
* Website: https://gist.github.com/jbroadway/2836900
* Inspiration:
* - https://gist.github.com/plugnburn/f0d12e38b6416a77c098
* - https://github.com/Chalarangelo/parse-md-js/blob/master/parsemd.js
* - https://gist.github.com/plugnburn/f0d12e38b6416a77c098
*
* Author: Erik Vullings <erik.vullings@gmail.com>
* Conversion from PHP to TypeScript, applying fixes and tests, adding more elements, and publishing to npm:
* Website: https://github.com/erikvullings/slimdown-js
* License: MIT
*/
// Module-level stores used while rendering.
// Fenced code blocks and inline code spans are parked here so that the
// Markdown rules never touch their contents.
const codeBlocks: Array<string> = [];
const inlineCode: Array<string> = [];
// Block and inline math expressions are parked the same way.
const mathBlocks: Array<string> = [];
const inlineMath: Array<string> = [];
// Collected footnote definitions as [id, text] pairs.
const footnotes: [id: string, text: string][] = [];
/**
 * Characters that must not appear raw in HTML text, mapped to the entity
 * that replaces them.
 */
const escapeMap: Record<string, string> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;',
};
// Character class built from the map keys; none of them is special inside [...].
const escRegex = new RegExp(`[${Object.keys(escapeMap).join('')}]`, 'g');
/**
 * HTML-escape a string.
 *
 * @param s Raw text.
 * @returns `s` with `&`, `<`, `>`, `"` and `'` replaced by their HTML entities.
 */
const esc = (s: string): string =>
  s.replace(escRegex, (match) => escapeMap[match]);
/**
 * Replacer for single lines: wraps the line in a paragraph unless it is blank
 * or already starts with one of the recognised block-level tags.
 *
 * @param _ Full regex match (unused).
 * @param line The captured line.
 * @returns The line surrounded by newlines, optionally wrapped in `<p>`.
 */
const para = (_: string, line: string) => {
  const body = line.trim();
  const startsWithBlockTag =
    /^<\/?(ul|ol|li|h|p|bl|table|tbody|tr|td|th|caption)/i.test(body);
  if (startsWithBlockTag || body === '') {
    // Leave block markup and blank lines untouched (original spacing kept).
    return `\n${line}\n`;
  }
  return `\n<p>\n${body}\n</p>\n`;
};
/**
 * Replacer for unordered list lines: emits a `{{LISTITEM:ul:…}}` placeholder
 * line carrying the nesting level and the trimmed item text.
 *
 * @param _text Full regex match (unused).
 * @param indent Leading spaces; every two spaces count as one nesting level.
 * @param _bullet The bullet character (unused).
 * @param item Item text.
 */
const ulList = (
  _text: string,
  indent: string,
  _bullet: string,
  item: string,
) => {
  const depth = Math.floor(indent.length / 2);
  const body = item.trim();
  return '\n{{LISTITEM:ul:' + depth + ':' + body + '}}\n';
};
/**
 * Replacer for ordered list lines: emits a `{{LISTITEM:ol:…}}` placeholder
 * line carrying the nesting level and the trimmed item text.
 *
 * @param _text Full regex match (unused).
 * @param indent Leading spaces; every two spaces count as one nesting level.
 * @param _bullet The numeric marker (unused).
 * @param item Item text.
 */
const olList = (
  _text: string,
  indent: string,
  _bullet: string,
  item: string,
) => {
  const depth = Math.floor(indent.length / 2);
  const body = item.trim();
  return '\n{{LISTITEM:ol:' + depth + ':' + body + '}}\n';
};
/**
 * Replacer for quoted lines: wraps the trimmed text in `<blockquote>`.
 *
 * @param _ Full regex match (unused).
 * @param __ The quote marker (unused).
 * @param item Quoted text; defaults to the empty string.
 */
const blockquote = (_: string, __: string, item = '') => {
  const quoted = item.trim();
  return '\n<blockquote>' + quoted + '</blockquote>';
};
/**
 * Replacer for task-list lines: emits an unordered `{{LISTITEM:ul:…}}`
 * placeholder whose content starts with a disabled checkbox.
 *
 * @param _text Full regex match (unused).
 * @param indent Leading spaces; every two spaces count as one nesting level.
 * @param checkboxState Character found between the brackets; `x`/`X` means checked.
 * @param item Item text.
 */
const taskList = (
  _text: string,
  indent: string,
  checkboxState: string,
  item: string,
) => {
  const depth = Math.floor(indent.length / 2);
  const checkedAttr = checkboxState.toLowerCase() === 'x' ? ' checked' : '';
  const box = `<input type="checkbox"${checkedAttr} disabled>`;
  return `\n{{LISTITEM:ul:${depth}:${box} ${item.trim()}}}\n`;
};
/**
 * Replacer for `Term : Definition` lines: emits a one-entry `<dl>`.
 *
 * @param _ Full regex match (unused).
 * @param term Term text (trimmed before use).
 * @param definition Definition text (trimmed before use).
 */
const definitionList = (_: string, term: string, definition: string) => {
  const dt = `<dt>${term.trim()}</dt>`;
  const dd = `<dd>${definition.trim()}</dd>`;
  return '\n<dl>' + dt + dd + '</dl>\n';
};
/**
 * Turn runs of `{{LISTITEM:type:level:content}}` placeholder lines into nested
 * list HTML.
 *
 * A run ("group") is a sequence of placeholder lines; blank lines inside it do
 * not end the group, any other non-list line does. The first line of each
 * group is replaced by the HTML from `buildNestedList`, the remaining
 * placeholder lines become empty lines.
 *
 * The output is rebuilt line by line rather than via `String.replace` with a
 * string replacement, so `$$`, `$&` and similar sequences inside item content
 * are inserted literally, and identical item lines cannot be confused with
 * each other.
 *
 * @param markdown Text that may contain list-item placeholder lines.
 * @returns The text with every group replaced by its list HTML.
 */
const processListItems = (markdown: string): string => {
  if (!markdown.includes('{{LISTITEM:')) return markdown;

  type ListItem = {
    type: 'ul' | 'ol';
    level: number;
    content: string;
    originalLine: string;
  };

  const output: string[] = [];
  let group: ListItem[] = [];
  // Index in `output` reserved for the HTML of the group being collected.
  let groupStart = -1;

  const flushGroup = () => {
    if (group.length === 0) return;
    output[groupStart] = buildNestedList(group);
    group = [];
  };

  for (const line of markdown.split('\n')) {
    const listMatch = line.match(/\{\{LISTITEM:([^:]+):([^:]+):(.+)\}\}/);
    if (listMatch) {
      if (group.length === 0) groupStart = output.length;
      group.push({
        type: listMatch[1] as 'ul' | 'ol',
        level: parseInt(listMatch[2], 10),
        content: listMatch[3],
        originalLine: line,
      });
      // Placeholder lines collapse to empty lines; the first one of a group
      // is overwritten with the group's HTML when the group is flushed.
      output.push('');
      continue;
    }
    // Only non-empty, non-list lines end the current group.
    if (line.trim() !== '') flushGroup();
    output.push(line);
  }
  flushGroup();

  return output.join('\n');
};
/**
 * Build nested `<ul>`/`<ol>` HTML from a flat, ordered group of list items.
 *
 * Nesting follows each item's `level`: a deeper item opens a new list inside
 * the still-open `<li>` of its predecessor, a shallower item closes the deeper
 * lists first, and a change of list type at the same level closes the current
 * list and opens a new one.
 *
 * @param listItems Items in document order.
 * @returns The HTML for the whole group, or `''` for an empty group.
 */
const buildNestedList = (
  listItems: Array<{
    type: 'ul' | 'ol';
    level: number;
    content: string;
    originalLine: string;
  }>,
): string => {
  if (listItems.length === 0) return '';

  type OpenList = { type: 'ul' | 'ol'; level: number; hasOpenLi: boolean };
  const openLists: OpenList[] = [];
  const parts: string[] = [];

  const innermost = (): OpenList => openLists[openLists.length - 1];
  // Close the innermost list, closing its pending <li> first when there is one.
  const closeInnermost = () => {
    const closed = openLists.pop()!;
    if (closed.hasOpenLi) parts.push('</li>');
    parts.push(`</${closed.type}>`);
  };

  listItems.forEach((item, position) => {
    const following = listItems[position + 1];

    // Leave every list that is nested deeper than this item.
    while (openLists.length > 0 && innermost().level > item.level) {
      closeInnermost();
    }

    // Same depth but a different list type: end the current list.
    if (
      openLists.length > 0 &&
      innermost().level === item.level &&
      innermost().type !== item.type
    ) {
      closeInnermost();
    }

    // Start a list when none is open at this depth.
    if (openLists.length === 0 || innermost().level < item.level) {
      parts.push(`<${item.type}>`);
      openLists.push({ type: item.type, level: item.level, hasOpenLi: false });
    }

    const current = innermost();

    // A sibling left open for nested content is closed before the next item.
    if (current.hasOpenLi && current.level === item.level) {
      parts.push('</li>');
      current.hasOpenLi = false;
    }

    parts.push(`<li>${item.content}`);
    current.hasOpenLi = true;

    // Keep the <li> open only when the next item nests inside it.
    if (!following || following.level <= item.level) {
      parts.push('</li>');
      current.hasOpenLi = false;
    }
  });

  while (openLists.length > 0) {
    closeInnermost();
  }

  return parts.join('');
};
/**
 * Replacer for footnote references such as `[^1]`: produces a superscript
 * link pointing at the `fn:<id>` anchor, itself anchored as `fnref:<id>`.
 *
 * @param _match Full regex match (unused).
 * @param id Footnote identifier.
 */
const footnoteReferenceReplacer = (_match: string, id: string) => {
  const link = `<a href="#fn:${id}">[${id}]</a>`;
  return `<sup id="fnref:${id}">` + link + '</sup>';
};
/**
 * Replacer for footnote definitions such as `[^1]: Footnote text`: records
 * the footnote in the module-level `footnotes` store and removes the
 * definition from the running text.
 *
 * @param _match Full regex match (unused).
 * @param id Footnote identifier.
 * @param text Footnote text (stored trimmed).
 * @returns Always the empty string.
 */
const footnoteDefinitionReplacer = (
  _match: string,
  id: string,
  text: string,
) => {
  const entry: [id: string, text: string] = [id, text.trim()];
  footnotes.push(entry);
  return '';
};
/**
 * Render the collected footnotes as a `<div class="footnotes">` block with an
 * ordered list; each entry carries a back-link to its `fnref:<id>` anchor.
 *
 * @returns The footnotes HTML, or `''` when no footnotes were collected.
 */
const generateFootnotesSection = () => {
  if (footnotes.length === 0) return '';

  const entries: string[] = [];
  for (const [id, text] of footnotes) {
    // The leading '' yields the newline that precedes every <li>.
    entries.push(
      [
        '',
        `<li id="fn:${id}">`,
        text,
        `<sup><a href="#fnref:${id}">↩</a></sup>`,
        '</li>',
      ].join('\n'),
    );
  }

  return [
    '',
    '<div class="footnotes">',
    '<hr>',
    '<ol>',
    entries.join('\n'),
    '</ol>',
    '</div>',
  ].join('\n');
};
/**
 * Replacer for pipe tables: renders the header row, the alignment row and the
 * body rows as `<table><tbody>…</tbody></table>`.
 *
 * Colspan rule (same for header and body cells): a non-empty cell followed by
 * a run of empty cells spans the whole run, but only when at least one cell
 * in that run is truly empty (written as `||`). Cells that merely contain
 * whitespace stay individual empty cells.
 *
 * @param _ Full regex match (unused).
 * @param headers Pipe-delimited header row.
 * @param format Pipe-delimited alignment row (`:--`, `--:`, `:-:`).
 * @param content Pipe-delimited body rows separated by newlines.
 * @returns Table HTML surrounded by newlines.
 */
const table = (
  _: string,
  headers: string,
  format: string,
  content: string = '',
) => {
  // Per-column alignment; the first and last split parts lie outside the outer pipes.
  const align = format
    .split('|')
    .slice(1, -1)
    .map((col) => {
      if (/:-+:/.test(col)) return 'center';
      if (/-+:/.test(col)) return 'right';
      if (/:-+/.test(col)) return 'left';
      return '';
    });

  // Attribute string for a column (empty when the column has no alignment).
  const td = (col: number) => {
    const a = align[col];
    return a ? ` align="${a}"` : '';
  };

  // Render one row of raw (untrimmed) cells as a <tr> of <th> or <td> elements.
  const renderRow = (rawCells: string[], tag: 'th' | 'td'): string => {
    const cells = rawCells.map((cell) => cell.trim());
    const rendered: string[] = [];
    let i = 0;
    while (i < cells.length) {
      const text = cells[i];
      if (text.length === 0) {
        // Empty cell that was not absorbed by a preceding colspan.
        rendered.push(`<${tag}${td(i)}></${tag}>`);
        i++;
        continue;
      }
      // Scan the run of empty cells that follows this one.
      let runEnd = i + 1;
      let hasTrulyEmptyCell = false;
      while (runEnd < cells.length && cells[runEnd].length === 0) {
        if (rawCells[runEnd] === '') hasTrulyEmptyCell = true;
        runEnd++;
      }
      const spanCount = hasTrulyEmptyCell ? runEnd - i : 1;
      rendered.push(
        spanCount > 1
          ? `<${tag}${td(i)} colspan="${spanCount}">${text}</${tag}>`
          : `<${tag}${td(i)}>${text}</${tag}>`,
      );
      // Skip the cells covered by the colspan.
      i += spanCount;
    }
    return `<tr>\n ${rendered.join('\n ')}\n</tr>\n`;
  };

  // slice(1, -1) drops the parts before the first and after the last pipe.
  const h = renderRow(headers.split('|').slice(1, -1), 'th');

  const c = content
    .split('\n')
    .map((row) => row.trim())
    .filter((row) => row.length > 0)
    .map((row) => renderRow(row.split('|').slice(1, -1), 'td'))
    .join('');

  // Surrounding newlines are kept so later line-based rules see the table on its own line.
  return `\n<table><tbody>${h}${c}</tbody></table>\n`;
};
/**
 * Replacer for tables preceded by a `[caption]` line: renders the table via
 * `table` and inserts a `<caption>` right after the opening `<table>` tag.
 *
 * The caption is inserted through a replacer function so that `$$`, `$&` and
 * similar sequences in the caption text are kept literally instead of being
 * interpreted as replacement patterns.
 *
 * @param _ Full regex match (passed through to `table`).
 * @param caption Caption text (trimmed before use).
 * @param headers Pipe-delimited header row.
 * @param format Pipe-delimited alignment row.
 * @param content Pipe-delimited body rows.
 */
const tableWithCaption = (
  _: string,
  caption: string,
  headers: string,
  format: string,
  content: string = '',
) => {
  const tableHtml = table(_, headers, format, content);
  const opening = `<table><caption>${caption.trim()}</caption>`;
  return tableHtml.replace('<table>', () => opening);
};
/**
 * Undo emphasis markup inside a URL: every `<em>` or `</em>` tag is turned
 * back into the underscore it was generated from.
 */
const cleanUpUrl = (link: string) => link.split(/<\/?em>/).join('_');
/**
 * Replacer for header lines: the number of `#` characters selects the
 * heading level.
 *
 * @param _ Full regex match (unused).
 * @param match The run of `#` characters.
 * @param h Heading text; defaults to the empty string.
 */
const header = (_: string, match: string, h = '') => {
  const tag = `h${match.length}`;
  return `<${tag}>${h.trim()}</${tag}>`;
};
/**
 * Move fenced code blocks into the `codeBlocks` store and leave a
 * `<pre>{{CODEBLOCKPH<n>}}</pre>` placeholder behind, so no Markdown rule
 * touches the code.
 *
 * The newline after the closing fence is only looked at, not consumed: it has
 * to stay available as the leading newline of a directly following fence,
 * otherwise the second of two code blocks separated by one blank line is not
 * recognised. For a single block the output is the same as when the newline
 * is consumed and re-emitted.
 *
 * @param markdown Text that starts and ends with a newline.
 * @returns The text with every fenced block replaced by its placeholder.
 */
const extractCodeBlocks = (markdown: string): string => {
  return markdown.replace(
    /\n\s*```\w*\n([^]*?)\n\s*```\s*(?=\n)/g,
    (_match, code) => {
      codeBlocks.push(code);
      return `\n<pre>{{CODEBLOCKPH${codeBlocks.length - 1}}}</pre>`;
    },
  );
};
/**
 * Move backtick-delimited inline code into the `inlineCode` store and leave
 * an `{{INLINECODEPH<n>}}` placeholder behind.
 */
const extractInlineCode = (markdown: string): string =>
  markdown.replace(/`([^`]+)`/g, (_match, code) => {
    // push() returns the new length, so the stored item sits at length - 1.
    const slot = inlineCode.push(code) - 1;
    return `{{INLINECODEPH${slot}}}`;
  });
/**
 * Replace every `<pre>{{CODEBLOCKPH<n>}}</pre>` placeholder with the stored
 * code block, HTML-escaped and wrapped in `<pre>`.
 *
 * A placeholder whose index has no stored block (for example literal
 * placeholder text in the input) is left unchanged instead of throwing.
 */
const restoreCodeBlocks = (markdown: string): string => {
  return markdown.replace(
    /<pre>{{CODEBLOCKPH(\d+)}}<\/pre>/g,
    (match, index) => {
      const code = codeBlocks[parseInt(index, 10)];
      return code === undefined ? match : `<pre>${esc(code)}</pre>`;
    },
  );
};
/**
 * Replace every `{{INLINECODEPH<n>}}` placeholder with the stored inline
 * code, HTML-escaped and wrapped in `<code>`.
 *
 * A placeholder whose index has no stored entry (for example literal
 * placeholder text in the input) is left unchanged instead of throwing.
 */
const restoreInlineCode = (markdown: string): string => {
  return markdown.replace(/{{INLINECODEPH(\d+)}}/g, (match, index) => {
    const code = inlineCode[parseInt(index, 10)];
    return code === undefined ? match : `<code>${esc(code)}</code>`;
  });
};
/**
 * Move `$$ … $$` math blocks into the `mathBlocks` store (trimmed) and leave
 * a `{{MATHBLOCKPH<n>}}` placeholder line behind.
 *
 * The newline after the closing `$$` is only looked at, not consumed, so it
 * remains available as the leading newline of a directly following math
 * block. For a single block the output is the same as when the newline is
 * consumed and re-emitted.
 *
 * @param markdown Text that starts and ends with a newline.
 */
const extractMathBlocks = (markdown: string): string => {
  return markdown.replace(/\n\s*\$\$([^]*?)\$\$\s*(?=\n)/g, (_match, math) => {
    mathBlocks.push(math.trim());
    return `\n{{MATHBLOCKPH${mathBlocks.length - 1}}}`;
  });
};
/**
 * Move single-line `$…$` inline math into the `inlineMath` store and leave an
 * `{{INLINEMATHPH<n>}}` placeholder behind.
 */
const extractInlineMath = (markdown: string): string =>
  markdown.replace(/\$([^$\n]+)\$/g, (_match, math) => {
    // push() returns the new length, so the stored item sits at length - 1.
    const slot = inlineMath.push(math) - 1;
    return `{{INLINEMATHPH${slot}}}`;
  });
/**
 * Replace every `{{MATHBLOCKPH<n>}}` placeholder with the stored math block,
 * HTML-escaped and wrapped in `<div class="math-block">`.
 *
 * A placeholder whose index has no stored entry (for example literal
 * placeholder text in the input) is left unchanged instead of throwing.
 */
const restoreMathBlocks = (markdown: string): string => {
  return markdown.replace(/{{MATHBLOCKPH(\d+)}}/g, (match, index) => {
    const math = mathBlocks[parseInt(index, 10)];
    return math === undefined
      ? match
      : `<div class="math-block">${esc(math)}</div>`;
  });
};
/**
 * Replace every `{{INLINEMATHPH<n>}}` placeholder with the stored inline
 * math, HTML-escaped and wrapped in `<span class="math-inline">`.
 *
 * A placeholder whose index has no stored entry (for example literal
 * placeholder text in the input) is left unchanged instead of throwing.
 */
const restoreInlineMath = (markdown: string): string => {
  return markdown.replace(/{{INLINEMATHPH(\d+)}}/g, (match, index) => {
    const math = inlineMath[parseInt(index, 10)];
    return math === undefined
      ? match
      : `<span class="math-inline">${esc(math)}</span>`;
  });
};
/**
 * Pre-paragraph rules (everything except paragraph processing).
 *
 * Each entry is `[pattern, replacement, repeat?]`; the rules are applied in
 * array order, so the position of a rule matters. The optional third element
 * is the number of times the rule is applied (the renderer defaults it to 1).
 */
const preParaRules: Array<[RegExp, RegexReplacer | string, number?]> = [
  [/\r\n/g, '\n'], // normalize CRLF line endings to LF
  [/\n(#+)(.*)/g, header], // headers
  [/!\[([^\[]+)\]\((?:javascript:)?([^\)]+)\)/g, '<img src="$2" alt="$1">'], // images, invoked before links
  [/\[([^\[]+)\]\((?:javascript:)?([^\)]+)\)/g, '<a href="$2">$1</a>'], // links
  [/([^\\])(\*\*|__)(.*?(_|\*)?)\2/g, '$1<strong>$3</strong>'], // bold
  [/([^\\])(\*|_)(.*?)\2/g, '$1<em>$3</em>'], // emphasis
  [/\\_/g, '_'], // underscores part 1: unescape \_ once emphasis has been handled
  [/\~\~(.*?)\~\~/g, '<del>$1</del>'], // del
  [/\:\"(.*?)\"\:/g, '<q>$1</q>'], // quote
  // Horizontal rule. Must run before the list rules: the unordered-list rule
  // matches any line starting with '-', so a later hr rule could never match.
  [/\n-{5,}/g, '\n<hr />'],
  [/\n( *)[-*+] \[([xX ])\](.*)/g, taskList], // task lists with checkboxes (must come before regular ul lists)
  [/\n( *)(\*|-|\+)(.*)/g, ulList], // ul lists using +, - or * to denote an entry
  [/\n( *)([0-9]+\.) (.*)/g, olList], // ol lists
  [/\n(>|\>)(.*)/g, blockquote], // blockquotes
  [/(\^)(.*?)\1/g, '<sup>$2</sup>'], // superscript
  [/(\~)(.*?)\1/g, '<sub>$2</sub>'], // subscript
  [
    /\n\[(.+?)\]\n( *\|[^\n]+\|\r?\n)((?: *\|:?[ -]+:?)+ *\|)(\n(?: *\|[^\n]+\|\r?\n?)*)?/g,
    tableWithCaption,
  ], // tables with captions
  [
    /( *\|[^\n]+\|\r?\n)((?: *\|:?[ -]+:?)+ *\|)(\n(?: *\|[^\n]+\|\r?\n?)*)?/g,
    table,
  ], // regular tables
  [/\[\^([^\]]+)\](?!:)/g, footnoteReferenceReplacer], // footnote references
  [/\[\^([^\]]+)\]:\s*((?:[^\n]*\n?)*)/g, footnoteDefinitionReplacer], // footnote definitions (captures everything up to the end of the input)
  [/\n([A-Z][A-Za-z\s]*?)\s:\s*([A-Z][^\n]*)/g, definitionList], // definition lists (Capitalized Term : Capitalized Definition)
];
/**
 * Post-paragraph rules (cleanup rules that run after paragraph processing).
 *
 * Each entry is `[pattern, replacement, repeat?]`; the optional third element
 * is the number of times the rule is applied (the renderer defaults it to 1).
 */
const postParaRules: Array<[RegExp, RegexReplacer | string, number?]> = [
  // Merge adjacent lists. The backreference restricts the merge to lists of
  // the same type: removing a `</ul><ol>` pair would leave a list opened as
  // <ul> and closed as </ol>.
  [/\s?<\/([ou]l)>\s?<\1>/g, '', 3],
  [/<\/blockquote>\n<blockquote>/g, '<br>\n'], // fix extra blockquote
  [/https?:\/\/[^"']*/g, cleanUpUrl], // fix em in links
  // NOTE(review): replaces "_" with "_", i.e. currently a no-op; presumably
  // meant to decode an entity produced by "underscores part 1" - confirm.
  [/_/g, '_'], // underscores part 2
];
/**
 * Render Markdown text into HTML.
 *
 * Code and math are first swapped for placeholders, then the pre-paragraph
 * rules, list building, paragraph wrapping and post-paragraph rules run, and
 * finally the placeholders are restored (HTML-escaped) and the footnotes
 * section is appended.
 *
 * @param markdown Markdown text
 * @param removeParagraphs If true (default false), remove the \<p\>...\</p\> around paragraphs
 * @param externalLinks If true (default false), replace \<a href...\> with \<a target="_blank" href...\>
 * to open them in a new page
 * @returns The rendered HTML
 */
export const render = (
  markdown: string,
  removeParagraphs = false,
  externalLinks = false,
) => {
  // Start every render with empty placeholder and footnote stores.
  for (const store of [
    codeBlocks,
    inlineCode,
    mathBlocks,
    inlineMath,
    footnotes,
  ]) {
    store.length = 0;
  }

  // Apply each rule `repeat` times (once when no repeat count is given).
  const applyRules = (
    text: string,
    rules: ReadonlyArray<readonly [RegExp, RegexReplacer | string, number?]>,
  ): string => {
    let result = text;
    for (const [regex, subst, repeat = 1] of rules) {
      for (let pass = 0; pass < repeat; pass++) {
        result =
          typeof subst === 'string'
            ? result.replace(regex, subst)
            : result.replace(regex, subst);
      }
    }
    return result;
  };

  // Park code and math before any rule runs; the input is padded with
  // newlines because the line-based patterns expect them on both sides.
  let html = extractCodeBlocks(`\n${markdown}\n`);
  html = extractMathBlocks(html);
  html = extractInlineCode(html);
  html = extractInlineMath(html);

  html = applyRules(html, preParaRules);
  // Turn the collected list-item placeholders into nested lists.
  html = processListItems(html);
  // Wrap remaining plain lines in paragraphs.
  html = html.replace(/\n([^\n]+)\n/g, para);
  html = applyRules(html, postParaRules);

  // Bring the parked code and math back, escaped.
  html = restoreCodeBlocks(html);
  html = restoreMathBlocks(html);
  html = restoreInlineCode(html);
  html = restoreInlineMath(html);

  html = html.trim() + generateFootnotesSection();

  if (removeParagraphs) {
    html = html.replace(/^<p>(.*)<\/p>$/s, '$1');
  }
  if (externalLinks) {
    html = html.replace(/<a href="/g, '<a target="_blank" href="');
  }
  return html;
};
/**
 * Register an additional rule; it is appended to the pre-paragraph rules and
 * therefore runs after the built-in ones.
 * The regex should be global and not use multiline mode.
 *
 * @param regex Pattern to search for.
 * @param replacement Replacement string or replacer function.
 */
export const addRule = (regex: RegExp, replacement: RegexReplacer | string) => {
  const rule: [RegExp, RegexReplacer | string] = [regex, replacement];
  preParaRules.push(rule);
};