llmxml
Version:
Convert between markdown and LLM-friendly pseudo-XML
119 lines (105 loc) • 3.25 kB
text/typescript
import peggy from 'peggy';
import { readFileSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import type {
ParseOptions,
ParseResult,
ASTNode,
TagNode,
TextNode,
SourceLocation
} from '../types/ast';
import { ParseError } from '../types/ast';
import logger from '../utils/logger';
// ES modules do not provide __filename / __dirname, so derive both from
// import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Candidate locations of the grammar file, in lookup order: next to this
// module, one directory up, and two directories up.
const grammarPaths = [
  [__dirname],
  [__dirname, '..'],
  [__dirname, '..', '..']
].map((baseSegments) => join(...baseSegments, 'grammar', 'llmxml.pegjs'));

/**
 * Returns the UTF-8 contents of the first candidate that can be read.
 * Candidates that fail to read are skipped on purpose; `undefined` means
 * none of them could be read.
 */
function readFirstReadable(candidates: readonly string[]): string | undefined {
  for (const candidate of candidates) {
    try {
      return readFileSync(candidate, 'utf-8');
    } catch {
      // Not readable at this location; try the next candidate.
    }
  }
  return undefined;
}

// Load the grammar source once at module load. An empty file counts as a
// failure as well, because the check below is a truthiness test.
let grammar: string | undefined = readFirstReadable(grammarPaths);
if (!grammar) {
  throw new Error('Failed to load grammar file. This might be a build configuration issue.');
}
/**
 * Assembles an AST node from its kind, its source location and any extra
 * fields.
 *
 * `data` is spread last, so a `type` or `location` key present in `data`
 * overrides the corresponding argument.
 *
 * @param type - Node kind, either `'tag'` or `'text'`.
 * @param data - Additional node fields to copy onto the result.
 * @param location - Source span the node was parsed from.
 * @returns The combined object, asserted (not validated) to be an `ASTNode`.
 */
function createNode(
  type: 'tag' | 'text',
  data: Partial<TagNode | TextNode>,
  location: SourceLocation
): ASTNode {
  const node = { type, location, ...data };
  return node as ASTNode;
}
// Compile the grammar into a parser once, at module load. `Document` is the
// only rule allowed as an entry point. createNode is not bound here; it is
// handed to the parser as an option on each parse call.
const parserBuildOptions = {
  allowedStartRules: ['Document']
} as peggy.ParserBuildOptions;
const parser = peggy.generate(grammar, parserBuildOptions);
/**
 * Builds the location used when a parser error carries none: a span covering
 * the whole input. The end line/column are derived from the newlines in
 * `input` (1-based, column restarting after every `\n`), so the span is also
 * correct for multi-line input.
 */
function wholeInputLocation(input: string): SourceLocation {
  let line = 1;
  for (let at = input.indexOf('\n'); at !== -1; at = input.indexOf('\n', at + 1)) {
    line += 1;
  }
  // lastIndexOf returns -1 when there is no newline, which yields length + 1.
  const column = input.length - input.lastIndexOf('\n');
  return {
    start: { offset: 0, line: 1, column: 1 },
    end: { offset: input.length, line, column }
  };
}

/**
 * Best-effort scan for the earliest opening tag that has no matching closing
 * tag. Tags are recognised textually (capitalised alphanumeric name, optional
 * attributes after whitespace); the grammar itself is not consulted.
 *
 * @returns The tag name, or `undefined` when every recognised opening tag is
 * closed or no opening tag is found.
 */
function findUnclosedTag(input: string): string | undefined {
  const tagPattern = /<(\/?)([A-Z][A-Za-z0-9]*)(?:\s+[^>]*)?>/g;
  const openTags: string[] = [];
  let match: RegExpExecArray | null;
  while ((match = tagPattern.exec(input)) !== null) {
    const [, slash, name] = match;
    if (name === undefined) {
      continue;
    }
    if (slash === '/') {
      // A closing tag cancels the nearest still-open tag of the same name.
      const nearest = openTags.lastIndexOf(name);
      if (nearest !== -1) {
        openTags.splice(nearest, 1);
      }
    } else {
      openTags.push(name);
    }
  }
  return openTags[0];
}

/**
 * Parses `input` with the compiled grammar and reports failures as data
 * instead of throwing.
 *
 * On a parser failure the error is converted to a `ParseError`, logged, and
 * returned in `errors`. If the failure message mentions "end of input", the
 * message is replaced by a friendlier one naming the earliest unclosed tag
 * when one can be found. The returned `ast` is then either a single text node
 * holding the raw input (end-of-input failure where the input contains `<`
 * but no `</`) or an empty array.
 *
 * @param input - Source text to parse.
 * @param options - Parse options; `options.filename` is forwarded to the
 * parser as `grammarSource`.
 * @returns The parsed nodes plus any collected errors (empty on success).
 * @throws Re-throws anything thrown by the parser that is not an `Error`.
 */
export function parse(input: string, options: ParseOptions = {}): ParseResult {
  const errors: ParseError[] = [];
  let ast: ASTNode[];
  try {
    ast = parser.parse(input, {
      grammarSource: options.filename,
      createNode // Made available to the grammar as a parse option
    }) as ASTNode[];
  } catch (error: unknown) {
    if (!(error instanceof Error)) {
      throw error;
    }

    // Use the location attached to the error when there is one; otherwise
    // fall back to a span covering the whole input.
    const location =
      (error as Error & { location?: SourceLocation }).location ?? wholeInputLocation(input);

    const hitEndOfInput = error.message.includes('end of input');

    // Replace the generic end-of-input message with a more descriptive one.
    let errorMessage = error.message;
    if (hitEndOfInput) {
      const unclosedTag = findUnclosedTag(input);
      errorMessage =
        unclosedTag !== undefined
          ? `Mismatched tags: unclosed tag <${unclosedTag}>`
          : 'Unexpected end of input while parsing tag structure';
    }

    const parseError = new ParseError(errorMessage, location);
    errors.push(parseError);
    logger.error('Parse error', { error: parseError });

    if (hitEndOfInput && input.includes('<') && !input.includes('</')) {
      // Unclosed tag with no closing tag anywhere: keep the content as text.
      ast = [createNode('text', { value: input }, location)];
    } else {
      // Any other failure: empty AST, the error itself is still reported.
      ast = [];
    }
  }
  return { ast, errors };
}
/**
 * Shared parser facade.
 *
 * - `parse` is the synchronous entry point and returns `{ ast, errors }`.
 * - `parseToAST` is an async wrapper around `parse`: it resolves with the AST
 *   and rejects with the first collected error when there is at least one.
 */
export const llmParser = {
  parse,
  parseToAST: async (input: string, options: ParseOptions = {}) => {
    const result = parse(input, options);
    if (result.errors.length > 0) {
      throw result.errors[0];
    }
    return result.ast;
  }
};