UNPKG

@kayvan/markdown-tree-parser

Version:

A powerful JavaScript library and CLI tool for parsing and manipulating markdown files as tree structures using the remark/unified ecosystem

405 lines (365 loc) 12 kB
import { unified } from 'unified';
import remarkParse from 'remark-parse';
import remarkStringify from 'remark-stringify';
import { visit } from 'unist-util-visit';
import { selectAll, select } from 'unist-util-select';
import { find } from 'unist-util-find';

/**
 * Deep-copy a JSON-serialisable value (AST nodes or arrays of nodes) so that
 * extracted trees never share node references with the tree they came from.
 * @param {*} value - JSON-serialisable value to copy
 * @returns {*} A structurally identical copy
 */
const deepClone = (value) => JSON.parse(JSON.stringify(value));

/**
 * A powerful markdown parser that treats markdown as a manipulable tree structure
 * Built on top of the remark/unified ecosystem
 */
export class MarkdownTreeParser {
  /**
   * @param {Object} options - remark-stringify options; merged over the
   *   defaults (`bullet`, `emphasis` and `strong` all set to '*')
   */
  constructor(options = {}) {
    this.options = {
      // Default remark-stringify options
      bullet: '*',
      emphasis: '*',
      strong: '*',
      ...options,
    };

    this.processor = unified()
      .use(remarkParse)
      .use(remarkStringify, this.options);
  }

  /**
   * Parse markdown text into an Abstract Syntax Tree (AST)
   * @param {string} markdownText - The markdown content to parse
   * @returns {Promise<Object>} The parsed AST tree
   */
  async parse(markdownText) {
    // parse() yields the tree directly; running the whole process() pipeline
    // would additionally stringify the document only to discard the output.
    return this.processor.parse(markdownText);
  }

  /**
   * Convert AST back to markdown text
   * @param {Object} tree - The AST tree to stringify
   * @returns {Promise<string>} The markdown string
   */
  async stringify(tree) {
    // Work on a deep copy so the caller's tree is never touched.
    const treeCopy = deepClone(tree);
    // Reuse the processor configured in the constructor instead of building
    // a new one on every call.
    return this.processor.stringify(treeCopy);
  }

  /**
   * Find a specific heading and return it with all its content until the next
   * heading of the same or a higher level. Only top-level children of `tree`
   * are considered. Matching is case-insensitive; the first exact match wins,
   * otherwise the first heading whose text contains `headingText` is used.
   * @param {Object} tree - The AST tree to search
   * @param {string} headingText - Text to search for in headings
   * @param {number|null} level - Optional specific heading level to match
   * @returns {Object|null} New tree containing the section, or null if not found
   */
  extractSection(tree, headingText, level = null) {
    const nodes = tree.children;
    const needle = headingText.toLowerCase();
    let startIndex = -1;

    for (let i = 0; i < nodes.length; i++) {
      const node = nodes[i];
      if (node.type !== 'heading') {
        continue;
      }
      if (level !== null && node.depth !== level) {
        continue;
      }

      const text = this.getHeadingText(node).toLowerCase();
      if (text === needle) {
        // An exact match beats any partial match, and the first one wins.
        startIndex = i;
        break;
      }
      if (startIndex === -1 && text.includes(needle)) {
        // Remember the first partial match as a fallback.
        startIndex = i;
      }
    }

    if (startIndex === -1) {
      return null;
    }

    // The section ends at the next heading of the same or a higher level.
    const targetDepth = nodes[startIndex].depth;
    let endIndex = nodes.length;
    for (let i = startIndex + 1; i < nodes.length; i++) {
      const node = nodes[i];
      if (node.type === 'heading' && node.depth <= targetDepth) {
        endIndex = i;
        break;
      }
    }

    return {
      type: 'root',
      children: deepClone(nodes.slice(startIndex, endIndex)),
    };
  }

  /**
   * Extract all sections at a specific heading level. A section starts at a
   * top-level heading of exactly `level` and ends at the next heading of the
   * same or a higher level (or at the end of the document).
   * @param {Object} tree - The AST tree to process
   * @param {number} level - Heading level to extract (default: 2)
   * @returns {Array} Array of section objects with heading, tree and headingText properties
   */
  extractAllSections(tree, level = 2) {
    const nodes = tree.children;
    const sections = [];
    let startIndex = -1;

    // Emit the currently open section, covering [startIndex, endIndex).
    const closeSection = (endIndex) => {
      const headingNode = nodes[startIndex];
      sections.push({
        heading: deepClone(headingNode),
        tree: {
          type: 'root',
          children: deepClone(nodes.slice(startIndex, endIndex)),
        },
        headingText: this.getHeadingText(headingNode),
      });
    };

    for (let i = 0; i < nodes.length; i++) {
      const node = nodes[i];
      // Only headings at `level` or above can open or close a section.
      if (node.type !== 'heading' || node.depth > level) {
        continue;
      }
      if (startIndex !== -1) {
        closeSection(i);
      }
      // A heading at exactly `level` opens the next section; a higher-level
      // heading merely terminates the current one.
      startIndex = node.depth === level ? i : -1;
    }

    // Don't forget the last section
    if (startIndex !== -1) {
      closeSection(nodes.length);
    }

    return sections;
  }

  /**
   * Search for a single node using CSS-like selectors
   * @param {Object} tree - The AST tree to search
   * @param {string} selector - CSS-like selector string
   * @returns {Object|null} First matching node, or the empty result reported
   *   by unist-util-select when nothing matches
   */
  select(tree, selector) {
    return select(selector, tree);
  }

  /**
   * Search for all nodes using CSS-like selectors
   * @param {Object} tree - The AST tree to search
   * @param {string} selector - CSS-like selector string
   * @returns {Array} Array of matching nodes
   */
  selectAll(tree, selector) {
    return selectAll(selector, tree);
  }

  /**
   * Find the first node matching a custom condition
   * @param {Object} tree - The AST tree to search
   * @param {string|function|Object} condition - Search condition, forwarded to unist-util-find
   * @returns {Object|null} First matching node; null when `condition` is of an
   *   unsupported type (including null); otherwise whatever unist-util-find
   *   reports when nothing matches
   */
  findNode(tree, condition) {
    const kind = typeof condition;
    // `typeof null` is 'object', so null has to be excluded explicitly;
    // otherwise it would be forwarded to find() as if it were a real condition.
    const isSupported =
      kind === 'string' ||
      kind === 'function' ||
      (kind === 'object' && condition !== null);

    if (!isSupported) {
      return null;
    }
    return find(tree, condition);
  }

  /**
   * Get plain text content from a heading node by concatenating the values of
   * all `text` nodes beneath it
   * @param {Object} headingNode - The heading AST node
   * @returns {string} Plain text content of the heading
   */
  getHeadingText(headingNode) {
    let text = '';
    visit(headingNode, 'text', (node) => {
      text += node.value;
    });
    return text;
  }

  /**
   * Transform tree by visiting nodes with a custom function.
   * The tree is modified in place by whatever the visitor does.
   * @param {Object} tree - The AST tree to transform
   * @param {function} visitor - Visitor function to apply to nodes
   * @returns {Object} The same tree instance that was passed in
   */
  transform(tree, visitor) {
    visit(tree, visitor);
    return tree;
  }

  /**
   * Get a flat list of all headings with their levels and text
   * @param {Object} tree - The AST tree to analyze
   * @returns {Array} Array of heading objects with level, text, and node properties
   */
  getHeadingsList(tree) {
    const headings = [];
    visit(tree, 'heading', (node) => {
      headings.push({
        level: node.depth,
        text: this.getHeadingText(node),
        node,
      });
    });
    return headings;
  }

  /**
   * Extract a heading together with its nested subsections, optionally
   * limited to a number of heading levels below the matched heading.
   *
   * The first top-level heading whose text contains `headingText`
   * (case-insensitive) starts the section; it ends at the next heading of the
   * same or a higher level. Subsections deeper than the limit are omitted
   * together with their content.
   * @param {Object} tree - The AST tree to search
   * @param {string} headingText - Heading text to search for
   * @param {number|null} maxDepth - Maximum depth to include (relative to the
   *   found heading); null or 0 means no limit
   * @returns {Object|null} New tree with nested section or null if not found
   */
  extractNestedSection(tree, headingText, maxDepth = null) {
    const nodes = tree.children;
    const needle = headingText.toLowerCase();

    const startIndex = nodes.findIndex(
      (node) =>
        node.type === 'heading' &&
        this.getHeadingText(node).toLowerCase().includes(needle)
    );
    if (startIndex === -1) {
      return null;
    }

    const baseDepth = nodes[startIndex].depth;
    const deepestAllowed = maxDepth ? baseDepth + maxDepth : Infinity;
    const sectionNodes = [nodes[startIndex]];
    // True while inside a subsection that lies below the depth limit.
    let skipping = false;

    // Walk siblings only: each collected node already carries its own
    // children, so descending into them would duplicate content.
    for (let i = startIndex + 1; i < nodes.length; i++) {
      const node = nodes[i];
      if (node.type === 'heading') {
        if (node.depth <= baseDepth) {
          // Same or higher level heading: the section is over.
          break;
        }
        skipping = node.depth > deepestAllowed;
      }
      if (!skipping) {
        sectionNodes.push(node);
      }
    }

    return {
      type: 'root',
      children: deepClone(sectionNodes),
    };
  }

  /**
   * Get document statistics
   * @param {Object} tree - The AST tree to analyze
   * @returns {Object} Counts of headings (total and per level), paragraphs,
   *   code blocks, lists, links and images, plus a whitespace-delimited word
   *   count over all `text` nodes
   */
  getStats(tree) {
    const stats = {
      headings: { total: 0, byLevel: {} },
      paragraphs: 0,
      codeBlocks: 0,
      lists: 0,
      links: 0,
      images: 0,
      wordCount: 0,
    };

    visit(tree, (node) => {
      switch (node.type) {
        case 'heading':
          stats.headings.total++;
          stats.headings.byLevel[node.depth] =
            (stats.headings.byLevel[node.depth] || 0) + 1;
          break;
        case 'paragraph':
          stats.paragraphs++;
          break;
        case 'code':
          stats.codeBlocks++;
          break;
        case 'list':
          stats.lists++;
          break;
        case 'link':
          stats.links++;
          break;
        case 'image':
          stats.images++;
          break;
        case 'text':
          stats.wordCount += node.value
            .trim()
            .split(/\s+/)
            .filter((word) => word.length > 0).length;
          break;
      }
    });

    return stats;
  }

  /**
   * Create a table of contents from the document
   * @param {Object} tree - The AST tree to process
   * @param {number} maxLevel - Maximum heading level to include (default: 3)
   * @returns {string} Markdown table of contents, or '' when no heading qualifies
   */
  generateTableOfContents(tree, maxLevel = 3) {
    const headings = this.getHeadingsList(tree).filter(
      (heading) => heading.level <= maxLevel
    );
    if (headings.length === 0) {
      return '';
    }

    // Indent relative to the shallowest listed heading: absolute indentation
    // would push entries to 4+ spaces (an indented code block in markdown)
    // when a document has no top-level headings.
    const topLevel = headings.reduce(
      (min, heading) => Math.min(min, heading.level),
      Infinity
    );

    const lines = headings.map(({ level, text }) => {
      // Two spaces per level: the minimum that nests an item under "- ".
      const indent = '  '.repeat(level - topLevel);
      const link = text
        .toLowerCase()
        .replace(/[^a-z0-9\s-]/g, '')
        .replace(/\s+/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-|-$/g, '');
      return `${indent}- [${text}](#${link})`;
    });

    return `## Table of Contents\n\n${lines.join('\n')}\n`;
  }
}