UNPKG

llmxml

Version:

Convert between markdown and LLM-friendly pseudo-XML

227 lines (209 loc) 6.42 kB
import type { ASTNode, TagNode, TextNode, SourceLocation } from '../types/ast';
import logger from '../utils/logger';

/**
 * Loose view of a Markdown AST node exposing every field this transformer
 * reads. All fields except `type` are optional because which ones are
 * populated depends on the node type.
 */
interface ExtendedASTNode {
  type: string;
  location?: SourceLocation;
  depth?: number;
  text?: string;
  lang?: string;
  value?: string;
  children?: ExtendedASTNode[];
  ordered?: boolean;
  url?: string;
}

/**
 * An entry on the stack of currently open sections: the tag created for a
 * heading together with that heading's level.
 */
interface OpenSection {
  depth: number;
  tag: TagNode;
}

/**
 * Transforms our internal Markdown AST into LLM-XML AST format
 */
export class MarkdownToLLMTransformer {
  /**
   * Convert a Markdown AST to LLM-XML AST
   *
   * Headings become tag nodes. A heading is nested inside the closest
   * preceding heading with a strictly smaller level; all other content is
   * appended to the most recently seen heading, or to the root when no
   * heading has been seen yet.
   *
   * @param nodes - Array of Markdown AST nodes
   * @returns Array of LLM-XML AST nodes
   */
  public transform(nodes: ASTNode[]): ASTNode[] {
    const result: ASTNode[] = [];
    // Stack of parent sections, from root to current
    const sectionStack: OpenSection[] = [];

    for (const node of nodes) {
      if (node.type === 'heading') {
        const heading = node as ExtendedASTNode;
        const tagNode = this.createTagNode(heading);
        const currentLevel = heading.depth || 1;

        // Pop every section at the same or a deeper level: such a section
        // is closed by this heading and cannot be its parent.
        let top = sectionStack[sectionStack.length - 1];
        while (top !== undefined && top.depth >= currentLevel) {
          sectionStack.pop();
          top = sectionStack[sectionStack.length - 1];
        }

        this.appendNode(top === undefined ? undefined : top.tag, result, tagNode);

        // This section is now the innermost open one
        sectionStack.push({ depth: currentLevel, tag: tagNode });
      } else {
        // Non-heading content belongs to the most recently seen section
        const current = sectionStack[sectionStack.length - 1];
        this.appendNode(
          current === undefined ? undefined : current.tag,
          result,
          this.transformContent(node as ExtendedASTNode)
        );
      }
    }

    return result;
  }

  /**
   * Append a node to a parent section, or to the root list when there is no
   * parent.
   *
   * @param parent - The section to append to, or undefined for the root
   * @param root - The top-level result array
   * @param child - The node to append
   */
  private appendNode(
    parent: TagNode | undefined,
    root: ASTNode[],
    child: TagNode | TextNode
  ): void {
    if (parent === undefined) {
      root.push(child);
      return;
    }
    parent.children = parent.children || [];
    parent.children.push(child);
  }

  /**
   * Create a tag node from a heading node
   *
   * The tag name is derived from the heading text; the original text and
   * level are preserved in the `title` and `hlevel` attributes.
   *
   * @param heading - The heading node to convert
   * @returns A new tag node
   */
  private createTagNode(heading: ExtendedASTNode): TagNode {
    const name = this.generateTagName(heading.text || '');
    logger.debug('Creating tag node', { text: heading.text, name });

    return {
      type: 'tag',
      name,
      attributes: {
        title: heading.text || name,
        hlevel: String(heading.depth || 1),
      },
      children: [],
      depth: heading.depth || 1, // Keep track of depth for nesting
      location: heading.location
    } as TagNode;
  }

  /**
   * Transform non-heading content into text nodes
   *
   * Code blocks are re-fenced, lists and paragraphs are serialized back to
   * Markdown text, and plain text is passed through. Any other node type is
   * logged and replaced by an empty text node.
   *
   * @param node - The content node to convert
   * @returns A text node carrying the serialized content
   */
  private transformContent(node: ExtendedASTNode): TextNode {
    switch (node.type) {
      case 'code':
        return {
          type: 'text',
          // Fall back to an empty body so a value-less code node does not
          // emit the literal string "undefined" inside the fence.
          value: '```' + (node.lang || '') + '\n' + (node.value ?? '') + '\n```',
          textType: 'code',
          language: node.lang,
          location: node.location
        } as TextNode;
      case 'list':
        return {
          type: 'text',
          value: this.serializeList(node),
          location: node.location
        } as TextNode;
      case 'paragraph':
        return {
          type: 'text',
          value: this.serializeParagraph(node),
          location: node.location
        } as TextNode;
      case 'text':
        return {
          type: 'text',
          value: node.value || '',
          location: node.location
        } as TextNode;
      default:
        logger.warn('Unknown node type during content transformation', { type: node.type });
        return {
          type: 'text',
          value: '',
          location: node.location
        } as TextNode;
    }
  }

  /**
   * Generate a tag name from heading text
   *
   * Markdown formatting characters are stripped, the text is split on runs
   * of non-alphanumeric characters, and each word is capitalised and joined,
   * e.g. "Getting *started* now" becomes "GettingStartedNow". Text with no
   * ASCII alphanumeric characters yields an empty string.
   *
   * @param text - The heading text
   * @returns A PascalCase tag name
   */
  private generateTagName(text: string): string {
    return text
      // Remove markdown formatting
      .replace(/[`*_\[\]]/g, '')
      // Split on non-word characters
      .split(/[^a-zA-Z0-9]+/)
      // Filter out empty strings
      .filter(Boolean)
      // Capitalise the first letter of each word, lower-case the rest
      .map((word) => {
        const normalized = word.toLowerCase();
        return normalized.charAt(0).toUpperCase() + normalized.slice(1);
      })
      .join('');
  }

  /**
   * Serialize a list node to text
   *
   * Each item is prefixed with `-` (unordered) or `N.` (ordered, numbered by
   * position). Lines after the first line of an item, including nested
   * lists, are indented to the item's content column so the nesting is kept
   * in the output. Items that serialize to nothing are omitted.
   *
   * @param node - The list node
   * @returns The Markdown text of the list, or '' when it has no children
   */
  private serializeList(node: ExtendedASTNode): string {
    if (!Array.isArray(node.children)) {
      return '';
    }

    const ordered = node.ordered === true;
    return node.children
      .map((item: ExtendedASTNode, index: number) => {
        const marker = ordered ? `${index + 1}.` : '-';
        const itemText = this.serializeListItem(item);
        if (!itemText) {
          return '';
        }

        // Continuation lines must start at the item's content column
        // (marker width plus one space) to remain part of this item.
        const continuationIndent = ' '.repeat(marker.length + 1);
        const body = itemText
          .split('\n')
          .map((line, lineIndex) =>
            lineIndex === 0 || line === '' ? line : continuationIndent + line
          )
          .join('\n');
        return `${marker} ${body}`;
      })
      .filter(Boolean)
      .join('\n');
  }

  /**
   * Serialize a list item node to text
   *
   * Paragraph children are serialized inline; a nested list child starts on
   * a new line. Other child types are ignored.
   *
   * @param node - The list item node
   * @returns The item's text without its marker
   */
  private serializeListItem(node: ExtendedASTNode): string {
    if (!Array.isArray(node.children)) {
      return '';
    }

    return node.children
      .map((child: ExtendedASTNode) => {
        if (child.type === 'list') {
          return '\n' + this.serializeList(child);
        }
        if (child.type === 'paragraph') {
          return this.serializeParagraph(child);
        }
        return '';
      })
      .filter(Boolean)
      .join('');
  }

  /**
   * Serialize a paragraph node to text
   *
   * Also used recursively for the children of emphasis, strong and link
   * nodes. Child types other than text, emphasis, strong and link
   * contribute nothing to the output.
   *
   * @param node - The paragraph (or inline container) node
   * @returns The Markdown text of the node's inline children
   */
  private serializeParagraph(node: ExtendedASTNode): string {
    if (!Array.isArray(node.children)) {
      return '';
    }

    return node.children
      .map((child: ExtendedASTNode) => {
        switch (child.type) {
          case 'text':
            return child.value || '';
          case 'emphasis':
            return `*${this.serializeParagraph(child)}*`;
          case 'strong':
            return `**${this.serializeParagraph(child)}**`;
          case 'link':
            // A missing url yields an empty target rather than "(undefined)"
            return `[${this.serializeParagraph(child)}](${child.url ?? ''})`;
          default:
            return '';
        }
      })
      .join('');
  }
}

// Export a singleton instance
export const markdownToLLM = new MarkdownToLLMTransformer();