llmxml

Version:

Convert between markdown and LLM-friendly pseudo-XML

119 lines (105 loc) • 3.25 kB

text/typescript

import peggy from 'peggy';
import { readFileSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import type {
  ParseOptions,
  ParseResult,
  ASTNode,
  TagNode,
  TextNode,
  SourceLocation
} from '../types/ast';
import { ParseError } from '../types/ast';
import logger from '../utils/logger';

// ESM path resolution: derive this module's file and directory from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Candidate grammar locations, tried in order. The first readable one wins.
// NOTE(review): presumably these cover different source/build layouts — confirm.
const grammarPaths = [
  join(__dirname, 'grammar', 'llmxml.pegjs'),
  join(__dirname, '..', 'grammar', 'llmxml.pegjs'),
  join(__dirname, '..', '..', 'grammar', 'llmxml.pegjs')
];

/**
 * Read the first file in `paths` that can be read.
 *
 * @param paths - Candidate file paths, tried in order.
 * @returns The file contents, or `undefined` when no candidate is readable.
 */
function readFirstAvailable(paths: readonly string[]): string | undefined {
  for (const candidate of paths) {
    try {
      return readFileSync(candidate, 'utf-8');
    } catch {
      // Deliberate best-effort: an unreadable candidate just means "try the next one".
    }
  }
  return undefined;
}

// Load the grammar; a missing or empty grammar is fatal at module load time.
const grammar = readFirstAvailable(grammarPaths);

if (!grammar) {
  throw new Error('Failed to load grammar file. This might be a build configuration issue.');
}

/**
 * Build an AST node from its type, location and extra fields.
 *
 * `data` is spread last, so any `type` or `location` key it carries overrides
 * the explicit arguments.
 *
 * @param type - Node discriminator.
 * @param data - Additional node fields.
 * @param location - Source range the node covers.
 * @returns The assembled node.
 */
function createNode(
  type: 'tag' | 'text',
  data: Partial<TagNode | TextNode>,
  location: SourceLocation
): ASTNode {
  return { type, location, ...data } as ASTNode;
}

// Compile the grammar once; only the `Document` rule is exposed as a start rule.
const parser = peggy.generate(grammar, {
  allowedStartRules: ['Document']
} as peggy.ParserBuildOptions);

// Opening or closing tag whose name starts with an uppercase letter.
// Group 1 is "/" for a closing tag, group 2 is the tag name.
const TAG_PATTERN = /<(\/?)([A-Z][A-Za-z0-9]*)(?:\s+[^>]*)?>/g;

/**
 * Find the tag most likely responsible for an "end of input" parse failure.
 *
 * Scans opening and closing tags with a stack and returns the innermost tag
 * that is still open at the end of `input`. If the scan finds nothing open,
 * falls back to the first opening tag seen, which is what earlier versions
 * of this module always reported.
 *
 * @param input - The text that failed to parse.
 * @returns A tag name, or `undefined` when `input` has no opening tag.
 */
function findUnclosedTag(input: string): string | undefined {
  const openTags: string[] = [];
  let firstOpened: string | undefined;

  for (const match of input.matchAll(TAG_PATTERN)) {
    const name = match[2];
    if (!name) {
      continue;
    }

    if (match[1] !== '/') {
      if (firstOpened === undefined) {
        firstOpened = name;
      }
      openTags.push(name);
      continue;
    }

    // Closing tag: discard the matching opener and everything nested inside it.
    // A closer with no matching opener is ignored.
    const openerIndex = openTags.lastIndexOf(name);
    if (openerIndex !== -1) {
      openTags.length = openerIndex;
    }
  }

  return openTags.length > 0 ? openTags[openTags.length - 1] : firstOpened;
}

/**
 * Location spanning the whole input, used when a parse error carries none.
 *
 * @param input - The text that failed to parse.
 * @returns A range from offset 0 to the end of `input`, with 1-based line and column.
 */
function wholeInputLocation(input: string): SourceLocation {
  const lastNewline = input.lastIndexOf('\n');
  return {
    start: { offset: 0, line: 1, column: 1 },
    end: {
      offset: input.length,
      line: input.split('\n').length,
      // With no newline, lastNewline is -1 and this is input.length + 1.
      column: input.length - lastNewline
    }
  };
}

/**
 * Parse `input` into an AST, collecting parse failures instead of throwing.
 *
 * When the generated parser throws an `Error`, it is converted into a
 * `ParseError`, logged, and returned in `errors`. The returned `ast` is then
 * either a single text node holding the raw input (the failure mentions
 * "end of input" and the input contains `<` but no `</`) or an empty array.
 *
 * @param input - Text to parse.
 * @param options - Parse options; `filename` is forwarded as peggy's `grammarSource`.
 * @returns The AST together with any collected parse errors.
 * @throws Whatever the parser threw, if it is not an `Error` instance.
 */
export function parse(input: string, options: ParseOptions = {}): ParseResult {
  const errors: ParseError[] = [];
  let ast: ASTNode[];

  try {
    ast = parser.parse(input, {
      grammarSource: options.filename,
      createNode // Handed to the parser as a parse option
    }) as ASTNode[];
  } catch (error) {
    if (!(error instanceof Error)) {
      throw error;
    }

    // Use the error's own location when it has one, else span the whole input.
    const location: SourceLocation =
      (error as Error & { location?: SourceLocation }).location ?? wholeInputLocation(input);
    const hitEndOfInput = error.message.includes('end of input');

    // Create a more descriptive message for truncated input.
    let errorMessage = error.message;
    if (hitEndOfInput) {
      const unclosedTag = findUnclosedTag(input);
      errorMessage =
        unclosedTag !== undefined
          ? `Mismatched tags: unclosed tag <${unclosedTag}>`
          : 'Unexpected end of input while parsing tag structure';
    }

    const parseError = new ParseError(errorMessage, location);
    errors.push(parseError);
    logger.error('Parse error', { error: parseError });

    // Unclosed tag with no closing tag anywhere: preserve the content as text.
    // Any other failure yields an empty AST; the error itself is still reported.
    const preserveAsText = hitEndOfInput && input.includes('<') && !input.includes('</');
    ast = preserveAsText ? [createNode('text', { value: input }, location)] : [];
  }

  return { ast, errors };
}

/**
 * Singleton facade over the parser.
 *
 * `parse` is the error-collecting function above. `parseToAST` is an async
 * variant that rejects with the first collected error and otherwise resolves
 * to the AST.
 */
export const llmParser = {
  parse,
  parseToAST: async (input: string, options: ParseOptions = {}) => {
    const { ast, errors } = parse(input, options);
    if (errors.length > 0) {
      throw errors[0];
    }
    return ast;
  }
};