UNPKG

llmxml

Version:

Convert between markdown and LLM-friendly pseudo-XML

238 lines (222 loc) 6.46 kB
import { unified } from 'unified';
import remarkParse from 'remark-parse';
import type { Root, RootContent, List, ListItem } from 'mdast';
import type { ASTNode, SourceLocation } from '../types/ast';
import { LLMXMLError } from '../errors';
import logger from '../utils/logger';
import { visit } from 'unist-util-visit';

/** Minimal structural view of a unist point (all fields may be missing). */
interface PointLike {
  offset?: number | undefined;
  line?: number | undefined;
  column?: number | undefined;
}

/** Minimal structural view of a unist position. */
interface PositionLike {
  start: PointLike;
  end: PointLike;
}

/**
 * Convert a remark position into our `SourceLocation`.
 *
 * @param pos - Position attached to a remark node, if any
 * @returns The converted location, or `undefined` when the node has no position
 */
function convertPosition(pos: PositionLike | null | undefined): SourceLocation | undefined {
  if (!pos) return undefined;

  // Lines and columns are defaulted to 1 and offsets to 0 when absent.
  // `||` is kept for line/column so that a 0 is also replaced by 1.
  const convertPoint = (point: PointLike) => ({
    offset: point.offset ?? 0,
    line: point.line || 1,
    column: point.column || 1
  });

  return {
    start: convertPoint(pos.start),
    end: convertPoint(pos.end)
  };
}

/**
 * Read the emphasis marker recorded on a node by `remarkEmphasisMarkers`.
 *
 * @param node - An emphasis or strong node
 * @param fallback - Marker to use when none was recorded
 * @returns The recorded marker, or `fallback`
 */
function recordedMarker(node: { data?: unknown }, fallback: string): string {
  const marker = (node.data as { marker?: unknown } | undefined)?.marker;
  return typeof marker === 'string' && marker ? marker : fallback;
}

/**
 * Custom remark plugin to capture emphasis markers.
 *
 * For every `emphasis` / `strong` node, the transformer looks at the source
 * text covered by the node and stores the marker that was used (`*`, `_`,
 * `**` or `__`) in `node.data.marker`.
 *
 * The source text is taken from the file handed to the transformer, so the
 * processor must be run with the original input as its file.
 */
function remarkEmphasisMarkers() {
  return (tree: Root, file: { toString(): string }): void => {
    const source = String(file);

    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- node.data is extended with a custom `marker` field
    visit(tree, ['emphasis', 'strong'], (node: any) => {
      const start = node.position?.start?.offset;
      const end = node.position?.end?.offset;
      if (typeof start !== 'number' || typeof end !== 'number') {
        return;
      }

      const raw = source.slice(start, end);
      if (raw) {
        const marker = raw.startsWith('_') ? '_' : '*';
        node.data = node.data || {};
        node.data.marker = marker.repeat(node.type === 'strong' ? 2 : 1);
      }
    });
  };
}

/**
 * Markdown parser implementation
 */
export class MarkdownParser {
  /**
   * Parse Markdown content into an AST
   *
   * @param input - The Markdown content to parse
   * @returns The parsed AST
   * @throws LLMXMLError with code `PARSE_ERROR` on any failure; the underlying
   *   error (including a validation error) is attached as `error` in the details
   */
  public async parseToAST(input: string): Promise<ASTNode[]> {
    try {
      const processor = unified()
        .use(remarkParse)
        .use(remarkEmphasisMarkers);

      const mdast = processor.parse(input);

      // `parse()` only performs the parse phase. Transformers registered with
      // `use()` are executed by `run()`, so it must be called explicitly for
      // the emphasis markers to be recorded. The input is passed as the file
      // so the transformer can read the raw source text.
      await processor.run(mdast, input);

      // Validate the AST for unclosed code blocks
      this.validateAST(mdast);

      return this.transformNodes(mdast.children);
    } catch (error) {
      logger.error('Failed to parse Markdown', { error });
      throw new LLMXMLError(
        'Failed to parse Markdown',
        'PARSE_ERROR',
        { error }
      );
    }
  }

  /**
   * Validate the Markdown AST for common issues.
   *
   * Walks the whole tree and rejects any `code` node with an empty value.
   *
   * NOTE(review): this is a heuristic. A properly closed but empty fence
   * presumably also has an empty value and would be rejected, while an
   * unclosed fence that has content would pass — confirm whether that is
   * the intended contract.
   *
   * @param root - Root of the remark tree
   * @throws LLMXMLError with code `PARSE_ERROR` when an empty code node is found
   */
  private validateAST(root: Root): void {
    const validate = (node: { type: string; value?: unknown; children?: unknown }): void => {
      if (node.type === 'code' && !node.value) {
        throw new LLMXMLError(
          'Unclosed code block',
          'PARSE_ERROR',
          { node }
        );
      }
      if (Array.isArray(node.children)) {
        node.children.forEach(validate);
      }
    };

    root.children.forEach(validate);
  }

  /**
   * Transform remark AST nodes into our AST format
   *
   * Nodes of unsupported types are dropped.
   *
   * @param nodes - Array of remark AST nodes
   * @returns Array of transformed AST nodes
   */
  private transformNodes(nodes: RootContent[]): ASTNode[] {
    return nodes
      .map(node => this.transformNode(node))
      .filter((node): node is ASTNode => node !== null);
  }

  /**
   * Transform a single remark AST node into our AST format
   *
   * @param node - The remark AST node to transform
   * @returns The transformed AST node, or `null` (after logging a warning)
   *   when the node type is not supported
   */
  private transformNode(node: RootContent): ASTNode | null {
    switch (node.type) {
      case 'heading':
        // Headings are flattened to their plain text; children are not kept.
        return {
          type: 'heading',
          text: this.getNodeText(node),
          depth: node.depth,
          children: [],
          location: convertPosition(node.position)
        };
      case 'text':
        return {
          type: 'text',
          value: node.value,
          location: convertPosition(node.position)
        };
      case 'code':
        return {
          type: 'code',
          value: node.value,
          lang: node.lang || undefined,
          location: convertPosition(node.position)
        };
      case 'list':
        return {
          type: 'list',
          ordered: node.ordered || false,
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'listItem':
        return {
          type: 'listItem',
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'paragraph':
        return {
          type: 'paragraph',
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'link':
        return {
          type: 'link',
          url: node.url,
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'emphasis':
        return {
          type: 'emphasis',
          marker: recordedMarker(node, '*'),
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'strong':
        return {
          type: 'strong',
          marker: recordedMarker(node, '**'),
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      default:
        logger.warn('Unknown node type during transformation', { type: node.type });
        return null;
    }
  }

  /**
   * Get the text content of a node
   *
   * @param node - The remark node to read
   * @returns The node's own `value` if it has one, otherwise the concatenated
   *   text of its children, otherwise an empty string
   */
  private getNodeText(node: RootContent): string {
    if ('value' in node) {
      return node.value;
    }
    if ('children' in node) {
      return node.children.map(child => this.getNodeText(child)).join('');
    }
    return '';
  }

  /**
   * Serialize a list node to text
   *
   * Ordered lists are numbered from the list's own `start` value (1 when the
   * list does not specify one); unordered lists use `-`. Items that serialize
   * to an empty string are omitted.
   *
   * @param node - The list node to serialize
   * @param indent - Indentation prefix for every item of this list
   * @returns One line per item, joined with newlines
   */
  private serializeList(node: List, indent: string = ''): string {
    if (!Array.isArray(node.children)) {
      return '';
    }

    const firstNumber = node.start ?? 1;

    return node.children
      .map((item: ListItem, index: number) => {
        const marker = node.ordered ? `${firstNumber + index}.` : '-';
        const itemText = this.serializeListItem(item, `${indent}  `);
        return itemText ? `${indent}${marker} ${itemText}` : '';
      })
      .filter(Boolean)
      .join('\n');
  }

  /**
   * Serialize a list item node to text
   *
   * Paragraph children contribute their plain text; nested lists are
   * serialized on a new line using `indent`. Other child types are ignored.
   *
   * @param node - The list item node to serialize
   * @param indent - Indentation prefix for nested lists
   * @returns The serialized item text
   */
  private serializeListItem(node: ListItem, indent: string): string {
    if (!Array.isArray(node.children)) {
      return '';
    }

    return node.children
      .map(child => {
        if (child.type === 'list') {
          return '\n' + this.serializeList(child as List, indent);
        }
        if (child.type === 'paragraph') {
          return this.getNodeText(child);
        }
        return '';
      })
      .filter(Boolean)
      .join('');
  }
}

// Export a singleton instance
export const markdownParser = new MarkdownParser();