UNPKG

llmxml

Version:

Convert between markdown and LLM-friendly pseudo-XML

366 lines (324 loc) 11 kB
import type { Warning, LLMXMLOptions } from './types';
import { markdownParser } from './parser/markdown';
import { llmParser } from './parser/llm';
import { markdownToLLM } from './transformer/md-to-llm';
import { llmToMarkdown } from './transformer/llm-to-md';
import { sectionExtractor } from './extractor';
import logger, { events, configureLogger } from './utils/logger';
import { markdownSerializer } from './serializer/markdown';
import { LLMXMLError } from './errors';
import type { ASTNode, TagNode } from './types/ast';
import { LLMSerializer } from './serializer/llm';

/**
 * Options for section extraction
 */
export interface GetSectionOptions {
  /** Only match headers at this level */
  level?: number;
  /** Require exact matches */
  exact?: boolean;
  /** Include subsections in result */
  includeNested?: boolean;
  /** Minimum fuzzy match score (0-1) */
  fuzzyThreshold?: number;
}

/**
 * Heading information from document
 */
export interface HeadingInfo {
  /** Heading title */
  title: string;
  /** Heading level (1-6) */
  level: number;
  /** Hierarchical path to this heading */
  path: string[];
}

/**
 * Main LLMXML class for converting between Markdown and LLM-friendly pseudo-XML
 */
export class LLMXML {
  private options: LLMXMLOptions;
  private llmSerializer: LLMSerializer;
  private warningHandlers: ((warning: Warning) => void)[] = [];

  /**
   * Creates a new LLMXML instance
   *
   * @param options - Configuration options; merged over the built-in defaults
   */
  constructor(options: LLMXMLOptions = {}) {
    this.options = {
      defaultFuzzyThreshold: 0.7,
      warningLevel: 'all',
      includeHlevel: false,
      includeTitle: false,
      verbose: false,
      tagFormat: 'PascalCase',
      ...options,
    };

    this.llmSerializer = new LLMSerializer(this.options);

    // Configure logger based on options
    configureLogger(this.options);

    // Log initialization only when not silenced
    if (this.options.warningLevel !== 'none' && this.options.verbose) {
      logger.info('LLMXML instance created', { options: this.options });
    }

    // Forward every warning published on the shared event bus to this
    // instance's handlers. Filtering by `warningLevel` lives in emitWarning so
    // the rule is defined in exactly one place.
    // NOTE: this listener is never unregistered, so it stays attached to
    // `events` for as long as `events` itself lives.
    events.on('warning', (warning: Warning) => {
      this.emitWarning(warning);
    });
  }

  /**
   * Convert Markdown to LLM-XML format
   *
   * @param markdown - Markdown source text
   * @returns The serialized LLM-XML document
   */
  public async toXML(markdown: string): Promise<string> {
    const mdAst = await markdownParser.parseToAST(markdown);
    logger.debug('Parsed Markdown AST', { ast: mdAst });

    const xmlAst = markdownToLLM.transform(mdAst);
    logger.debug('Transformed to LLM-XML AST', { ast: xmlAst });

    return this.llmSerializer.serialize(xmlAst);
  }

  /**
   * Convert LLM-XML to Markdown format
   *
   * @param xml - LLM-XML source text
   * @returns The serialized Markdown document
   */
  public async toMarkdown(xml: string): Promise<string> {
    const xmlAst = await llmParser.parseToAST(xml);
    logger.debug('Parsed LLM-XML AST', { ast: xmlAst });

    const mdAst = llmToMarkdown.transform(xmlAst);
    logger.debug('Transformed to Markdown AST', { ast: mdAst });

    return markdownSerializer.serialize(mdAst);
  }

  /**
   * Extract a single section from the document
   *
   * @param content - Markdown or LLM-XML content
   * @param title - Title of the section to look for
   * @param options - Matching options
   * @returns The first matching section, in the same format as `content`
   * @throws LLMXMLError with code `SECTION_NOT_FOUND` (details carry the
   *   available headings and closest matches) when nothing matches
   */
  public async getSection(
    content: string,
    title: string,
    options: GetSectionOptions = {}
  ): Promise<string> {
    try {
      const [first] = await this.getSections(content, title, options);
      if (first === undefined) {
        // An empty match list must not leak `undefined` through a
        // `Promise<string>`; report it like any other missing section so the
        // catch block below enriches it.
        throw new LLMXMLError(
          `Section "${title}" not found in document`,
          'SECTION_NOT_FOUND',
          { title }
        );
      }
      return first;
    } catch (error) {
      if (error instanceof LLMXMLError && error.code === 'SECTION_NOT_FOUND') {
        // Enhance error with available headings
        const headings = await this.getHeadings(content);
        throw new LLMXMLError(
          `Section "${title}" not found in document`,
          'SECTION_NOT_FOUND',
          {
            title,
            availableHeadings: headings,
            closestMatches: sectionExtractor.findClosestMatches(
              title,
              headings,
              this.resolveThreshold(options)
            )
          }
        );
      }
      throw error;
    }
  }

  /**
   * Extract multiple matching sections from the document
   *
   * @param content - Markdown or LLM-XML content
   * @param title - Title of the section(s) to look for
   * @param options - Matching options
   * @returns Every matching section, serialized in the same format as `content`
   * @throws LLMXMLError with code `INVALID_LEVEL` when `options.level` is
   *   outside 1-6; `SECTION_NOT_FOUND` errors raised during extraction are
   *   rethrown with available headings and closest matches added to `details`
   */
  public async getSections(
    content: string,
    title: string,
    options: GetSectionOptions = {}
  ): Promise<string[]> {
    // Validate section options
    if (options.level !== undefined && (options.level < 1 || options.level > 6)) {
      throw new LLMXMLError(
        'Invalid heading level. Must be between 1 and 6.',
        'INVALID_LEVEL',
        { level: options.level }
      );
    }

    const { ast, isXML } = await this.parseContent(content);
    const threshold = this.resolveThreshold(options);

    // No per-call listener on `events` is needed here: the listener installed
    // in the constructor already forwards bus warnings to the registered
    // handlers, and a second forwarding listener would deliver each of those
    // warnings twice.
    try {
      const matches = sectionExtractor.findSections(ast, {
        title,
        level: options.level,
        threshold,
        fuzzyMatches: !options.exact,
        includeSubsections: options.includeNested
      });

      // More than one match means the request was ambiguous; tell the caller.
      if (matches.length > 1) {
        const warning: Warning = {
          code: 'AMBIGUOUS_MATCH',
          message: `Multiple potential matches found for section "${title}"`,
          details: {
            matches: matches.map(match => ({
              title: match.attributes.title,
              score: match.attributes.score
            }))
          }
        };
        this.emitWarning(warning);
      }

      // Convert matches back to original format
      return matches.map(match => {
        if (isXML) {
          return this.llmSerializer.serialize([match]);
        } else {
          return markdownSerializer.serialize(llmToMarkdown.transform([match]));
        }
      });
    } catch (error) {
      if (error instanceof LLMXMLError && error.code === 'SECTION_NOT_FOUND') {
        // Enhance error with available headings; details already present on
        // the error take precedence over the ones computed here.
        const headings = this.extractHeadingsFromAST(ast);
        error.details = {
          availableHeadings: headings,
          closestMatches: sectionExtractor.findClosestMatches(
            title,
            headings.map(h => ({ title: h.title, level: h.level })),
            threshold
          ),
          ...(error.details as Record<string, unknown> || {})
        };
      }
      throw error;
    }
  }

  /**
   * Extract all headings from the document
   *
   * @param content - Markdown or LLM-XML content
   * @returns Array of heading information
   */
  public async getHeadings(content: string): Promise<HeadingInfo[]> {
    const { ast } = await this.parseContent(content);
    return this.extractHeadingsFromAST(ast);
  }

  /**
   * Parse content into an LLM-XML AST, detecting the input format.
   *
   * Content whose first non-whitespace character is `<` is parsed as LLM-XML;
   * anything else is parsed as Markdown and transformed.
   *
   * @param content - Markdown or LLM-XML content
   * @returns The AST together with the detected-format flag
   */
  private async parseContent(
    content: string
  ): Promise<{ ast: ASTNode[]; isXML: boolean }> {
    const isXML = content.trim().startsWith('<');
    if (isXML) {
      const ast: ASTNode[] = await llmParser.parseToAST(content);
      return { ast, isXML };
    }

    const mdAst = await markdownParser.parseToAST(content);
    const ast: ASTNode[] = markdownToLLM.transform(mdAst);
    return { ast, isXML };
  }

  /**
   * Pick the fuzzy threshold for a call: the per-call value when given,
   * otherwise the instance default.
   *
   * Uses `??` rather than `||` so an explicit threshold of `0` is honored
   * instead of being replaced by the default.
   */
  private resolveThreshold(options: GetSectionOptions) {
    return options.fuzzyThreshold ?? this.options.defaultFuzzyThreshold;
  }

  /**
   * Extract headings from AST
   *
   * @param ast - Document AST
   * @returns Array of heading information, in traversal (pre-order) order
   */
  private extractHeadingsFromAST(ast: ASTNode[]): HeadingInfo[] {
    const headings: HeadingInfo[] = [];

    const traverse = (nodes: ASTNode[], currentPath: string[] = []) => {
      for (const node of nodes) {
        if (node.type !== 'tag') {
          continue;
        }

        const tagNode = node as TagNode;
        // A missing/empty hlevel attribute is treated as level 1; a missing
        // title falls back to the tag name.
        const level = parseInt(tagNode.attributes.hlevel || '1', 10);
        const title = tagNode.attributes.title || tagNode.name;

        // Adjust current path based on level: drop entries at this level or
        // deeper so the path ends at this heading's parent.
        while (currentPath.length >= level) {
          currentPath.pop();
        }

        // Add this node to the path
        currentPath.push(title);

        // Record heading (copy the path; currentPath keeps being mutated)
        headings.push({ title, level, path: [...currentPath] });

        // Process children with their own copy of the updated path
        if (tagNode.children) {
          traverse(tagNode.children, [...currentPath]);
        }
      }
    };

    traverse(ast);
    return headings;
  }

  /**
   * Register a warning handler
   *
   * @param handler - Callback invoked for every warning that passes the
   *   instance's `warningLevel` filter
   */
  public onWarning(handler: (warning: Warning) => void): void {
    this.warningHandlers.push(handler);
  }

  /**
   * Remove a previously registered warning handler
   *
   * @param _event - The event type ('warning'); not otherwise used
   * @param handler - The handler to remove
   */
  public off(_event: 'warning', handler: (warning: Warning) => void): void {
    this.warningHandlers = this.warningHandlers.filter(h => h !== handler);
  }

  /**
   * Emit a warning to all registered handlers
   *
   * Nothing is delivered when `warningLevel` is 'none'; with 'ambiguous-only'
   * only `AMBIGUOUS_MATCH` warnings are delivered.
   *
   * @param warning - The warning to emit
   */
  protected emitWarning(warning: Warning): void {
    if (
      this.options.warningLevel === 'none' ||
      (this.options.warningLevel === 'ambiguous-only' && warning.code !== 'AMBIGUOUS_MATCH')
    ) {
      return;
    }

    for (const handler of this.warningHandlers) {
      handler(warning);
    }
  }

  /**
   * Convert Markdown to XML and back to Markdown, preserving all structure
   *
   * @param markdown - Markdown source text
   * @returns Markdown produced by the Markdown -> LLM-XML -> Markdown round trip
   */
  public async roundTrip(markdown: string): Promise<string> {
    const mdAst = await markdownParser.parseToAST(markdown);
    logger.debug('Parsed Markdown AST', { ast: mdAst });

    const xmlAst = markdownToLLM.transform(mdAst);
    logger.debug('Transformed to LLM-XML AST', { ast: xmlAst });

    // Create a new serializer with hlevel enabled to preserve heading levels
    const serializer = new LLMSerializer({ ...this.options, includeHlevel: true });
    const xml = serializer.serialize(xmlAst);

    return this.toMarkdown(xml);
  }

  /**
   * Normalize markdown formatting with consistent spacing
   *
   * @param markdown - The markdown content to normalize
   * @returns Normalized markdown with consistent spacing
   */
  public async normalizeMarkdown(markdown: string): Promise<string> {
    // Parse the markdown into an AST
    const mdAst = await markdownParser.parseToAST(markdown);
    logger.debug('Parsed Markdown AST for normalization', { ast: mdAst });

    // Serialize it back to markdown with proper spacing
    return markdownSerializer.serialize(mdAst);
  }
}

/**
 * Create a new LLMXML instance with the given options
 *
 * @param options - Configuration options
 * @returns A new LLMXML instance
 */
export function createLLMXML(options?: LLMXMLOptions): LLMXML {
  return new LLMXML(options);
}

export * from './types';
export * from './errors';