llmxml
Version:
Convert between markdown and LLM-friendly pseudo-XML
366 lines (324 loc) • 11 kB
text/typescript
import type { Warning, LLMXMLOptions } from './types';
import { markdownParser } from './parser/markdown';
import { llmParser } from './parser/llm';
import { markdownToLLM } from './transformer/md-to-llm';
import { llmToMarkdown } from './transformer/llm-to-md';
import { sectionExtractor } from './extractor';
import logger, { events, configureLogger } from './utils/logger';
import { markdownSerializer } from './serializer/markdown';
import { LLMXMLError } from './errors';
import type { ASTNode, TagNode } from './types/ast';
import { LLMSerializer } from './serializer/llm';
/**
 * Per-call settings accepted by `getSection` / `getSections`.
 *
 * Every field is optional; an empty object is a valid value.
 */
export interface GetSectionOptions {
  /**
   * Lowest fuzzy-match score (0-1) a heading must reach to count as a match.
   * When omitted, the instance's `defaultFuzzyThreshold` is used instead.
   */
  fuzzyThreshold?: number;
  /** When true, fuzzy matching is switched off and only exact title matches count */
  exact?: boolean;
  /** Restrict matching to headers of this level; values outside 1-6 are rejected */
  level?: number;
  /** When true, the returned section also contains its subsections */
  includeNested?: boolean;
}
/**
 * Describes one heading found in a document.
 */
export interface HeadingInfo {
  /** Depth of the heading (1-6) */
  level: number;
  /** Text of the heading */
  title: string;
  /** Titles of the enclosing headings, outermost first, ending with this heading's own title */
  path: Array<string>;
}
/**
 * Main LLMXML class for converting between Markdown and LLM-friendly pseudo-XML.
 *
 * Besides the two conversion directions it offers section extraction by
 * (fuzzy) title, heading listing, and a warning-subscription mechanism.
 *
 * Each instance subscribes to the shared `events` emitter on construction;
 * call {@link LLMXML.dispose} when an instance is no longer needed so that
 * subscription is released.
 */
export class LLMXML {
  private options: LLMXMLOptions;
  private llmSerializer: LLMSerializer;
  private warningHandlers: ((warning: Warning) => void)[] = [];
  /**
   * Listener attached to the shared `events` emitter. Kept as a field so the
   * exact same function reference can be detached again in `dispose()`.
   */
  private readonly globalWarningListener = (warning: Warning): void => {
    this.emitWarning(warning);
  };

  /**
   * Creates a new LLMXML instance
   *
   * @param options - Configuration options; any field left out falls back to
   *   the defaults listed in the constructor body
   */
  constructor(options: LLMXMLOptions = {}) {
    this.options = {
      defaultFuzzyThreshold: 0.7,
      warningLevel: 'all',
      includeHlevel: false,
      includeTitle: false,
      verbose: false,
      tagFormat: 'PascalCase',
      ...options,
    };
    this.llmSerializer = new LLMSerializer(this.options);
    // Configure logger based on options
    configureLogger(this.options);
    // Log initialization only when not silenced
    if (this.options.warningLevel !== 'none' && this.options.verbose) {
      logger.info('LLMXML instance created', { options: this.options });
    }
    // Forward warnings from the shared emitter to this instance's handlers.
    // Filtering by warningLevel happens in emitWarning, so it lives in one place.
    events.on('warning', this.globalWarningListener);
  }

  /**
   * Detach this instance from the shared `events` emitter.
   *
   * Without this, every constructed instance leaves a listener behind on the
   * shared emitter. After disposal, warnings emitted through `events` are no
   * longer forwarded to handlers registered via `onWarning`.
   */
  public dispose(): void {
    events.off('warning', this.globalWarningListener);
  }

  /**
   * Convert Markdown to LLM-XML format
   *
   * @param markdown - Markdown source text
   * @returns The serialized LLM-XML document
   */
  public async toXML(markdown: string): Promise<string> {
    const mdAst = await markdownParser.parseToAST(markdown);
    logger.debug('Parsed Markdown AST', { ast: mdAst });
    const xmlAst = markdownToLLM.transform(mdAst);
    logger.debug('Transformed to LLM-XML AST', { ast: xmlAst });
    return this.llmSerializer.serialize(xmlAst);
  }

  /**
   * Convert LLM-XML to Markdown format
   *
   * @param xml - LLM-XML source text
   * @returns The serialized Markdown document
   */
  public async toMarkdown(xml: string): Promise<string> {
    const xmlAst = await llmParser.parseToAST(xml);
    logger.debug('Parsed LLM-XML AST', { ast: xmlAst });
    const mdAst = llmToMarkdown.transform(xmlAst);
    logger.debug('Transformed to Markdown AST', { ast: mdAst });
    return markdownSerializer.serialize(mdAst);
  }

  /**
   * Extract a single section from the document
   *
   * @param content - Markdown or LLM-XML content
   * @param title - Section title to look for
   * @param options - Matching options
   * @returns The first matching section, in the same format as `content`
   * @throws LLMXMLError with code `SECTION_NOT_FOUND` (carrying the available
   *   headings and closest matches) when nothing matches
   */
  public async getSection(
    content: string,
    title: string,
    options: GetSectionOptions = {}
  ): Promise<string> {
    try {
      const [firstSection] = await this.getSections(content, title, options);
      if (firstSection === undefined) {
        // An empty result must not leak out as `undefined` behind a
        // Promise<string> signature; route it through the not-found path.
        throw new LLMXMLError(
          `Section "${title}" not found in document`,
          'SECTION_NOT_FOUND',
          { title }
        );
      }
      return firstSection;
    } catch (error) {
      if (error instanceof LLMXMLError && error.code === 'SECTION_NOT_FOUND') {
        // Enhance error with available headings
        const headings = await this.getHeadings(content);
        throw new LLMXMLError(
          `Section "${title}" not found in document`,
          'SECTION_NOT_FOUND',
          {
            title,
            availableHeadings: headings,
            // `??` rather than `||` so an explicit threshold of 0 is honoured
            closestMatches: sectionExtractor.findClosestMatches(
              title,
              headings,
              options.fuzzyThreshold ?? this.options.defaultFuzzyThreshold
            )
          }
        );
      }
      throw error;
    }
  }

  /**
   * Extract multiple matching sections from the document
   *
   * @param content - Markdown or LLM-XML content
   * @param title - Section title to look for
   * @param options - Matching options
   * @returns Every matching section, serialized in the same format as `content`
   * @throws LLMXMLError with code `INVALID_LEVEL` when `options.level` is not
   *   an integer between 1 and 6
   * @throws LLMXMLError with code `SECTION_NOT_FOUND` is re-thrown after being
   *   enriched with the available headings and closest matches
   */
  public async getSections(
    content: string,
    title: string,
    options: GetSectionOptions = {}
  ): Promise<string[]> {
    // Validate section options. The integer check also rejects NaN, which the
    // plain range comparison would let through.
    if (
      options.level !== undefined &&
      (!Number.isInteger(options.level) || options.level < 1 || options.level > 6)
    ) {
      throw new LLMXMLError(
        'Invalid heading level. Must be between 1 and 6.',
        'INVALID_LEVEL',
        { level: options.level }
      );
    }
    const { ast, isXML } = await this.parseContent(content);
    // `??` rather than `||` so an explicit threshold of 0 is honoured
    const threshold = options.fuzzyThreshold ?? this.options.defaultFuzzyThreshold;
    try {
      const matches = sectionExtractor.findSections(ast, {
        title,
        level: options.level,
        threshold,
        fuzzyMatches: !options.exact,
        includeSubsections: options.includeNested
      });
      // More than one match: tell subscribers the request was ambiguous.
      // No extra listener is attached to `events` here: the constructor's
      // listener already forwards emitter warnings, and a second one would
      // deliver each of them to the handlers twice.
      if (matches.length > 1) {
        const warning: Warning = {
          code: 'AMBIGUOUS_MATCH',
          message: `Multiple potential matches found for section "${title}"`,
          details: {
            matches: matches.map(match => ({
              title: match.attributes.title,
              score: match.attributes.score
            }))
          }
        };
        this.emitWarning(warning);
      }
      // Convert matches back to original format
      return matches.map(match =>
        isXML
          ? this.llmSerializer.serialize([match])
          : markdownSerializer.serialize(llmToMarkdown.transform([match]))
      );
    } catch (error) {
      if (error instanceof LLMXMLError && error.code === 'SECTION_NOT_FOUND') {
        // Enhance error with available headings; details already present on
        // the error win over the computed ones (they are spread last).
        const headings = this.extractHeadingsFromAST(ast);
        error.details = {
          availableHeadings: headings,
          closestMatches: sectionExtractor.findClosestMatches(
            title,
            headings.map(h => ({ title: h.title, level: h.level })),
            threshold
          ),
          ...(error.details as Record<string, unknown> || {})
        };
      }
      throw error;
    }
  }

  /**
   * Extract all headings from the document
   *
   * @param content - Markdown or LLM-XML content
   * @returns Array of heading information
   */
  public async getHeadings(content: string): Promise<HeadingInfo[]> {
    const { ast } = await this.parseContent(content);
    return this.extractHeadingsFromAST(ast);
  }

  /**
   * Parse a document into the LLM-XML AST, detecting its format.
   *
   * Content whose trimmed text starts with `<` is treated as LLM-XML;
   * anything else is parsed as Markdown and then transformed.
   *
   * @private
   * @param content - Markdown or LLM-XML content
   * @returns The AST together with the detected format flag
   */
  private async parseContent(content: string): Promise<{ ast: ASTNode[]; isXML: boolean }> {
    const isXML = content.trim().startsWith('<');
    if (isXML) {
      const ast: ASTNode[] = await llmParser.parseToAST(content);
      return { ast, isXML };
    }
    const mdAst = await markdownParser.parseToAST(content);
    const ast: ASTNode[] = markdownToLLM.transform(mdAst);
    return { ast, isXML };
  }

  /**
   * Extract headings from AST
   *
   * @private
   * @param ast - Document AST
   * @returns Array of heading information
   */
  private extractHeadingsFromAST(ast: ASTNode[]): HeadingInfo[] {
    const headings: HeadingInfo[] = [];
    const traverse = (nodes: ASTNode[], currentPath: string[] = []) => {
      for (const node of nodes) {
        if (node.type !== 'tag') {
          continue;
        }
        const tagNode = node as TagNode;
        const parsedLevel = parseInt(tagNode.attributes.hlevel || '1', 10);
        // Fall back to level 1 for non-numeric, zero or negative values.
        // A level below 1 would keep the trimming loop below spinning on an
        // already-empty path forever.
        const level = Number.isInteger(parsedLevel) && parsedLevel >= 1 ? parsedLevel : 1;
        const title = tagNode.attributes.title || tagNode.name;
        // Adjust current path based on level
        while (currentPath.length >= level) {
          currentPath.pop();
        }
        // Add this node to the path
        currentPath.push(title);
        // Record heading
        headings.push({
          title,
          level,
          path: [...currentPath]
        });
        // Process children with a copy so sibling traversal is unaffected
        if (tagNode.children) {
          traverse(tagNode.children, [...currentPath]);
        }
      }
    };
    traverse(ast);
    return headings;
  }

  /**
   * Register a warning handler
   *
   * @param handler - Called with every warning that passes the configured
   *   `warningLevel` filter
   */
  public onWarning(handler: (warning: Warning) => void): void {
    this.warningHandlers.push(handler);
  }

  /**
   * Remove a previously registered warning handler
   *
   * @param _event - The event type ('warning'); not otherwise used
   * @param handler - The handler to remove
   */
  public off(_event: 'warning', handler: (warning: Warning) => void): void {
    this.warningHandlers = this.warningHandlers.filter(h => h !== handler);
  }

  /**
   * Emit a warning to all registered handlers
   *
   * Nothing is delivered when `warningLevel` is `'none'`; with
   * `'ambiguous-only'` only `AMBIGUOUS_MATCH` warnings are delivered.
   *
   * @param warning - The warning to emit
   */
  protected emitWarning(warning: Warning): void {
    if (
      this.options.warningLevel === 'none' ||
      (this.options.warningLevel === 'ambiguous-only' && warning.code !== 'AMBIGUOUS_MATCH')
    ) {
      return;
    }
    for (const handler of this.warningHandlers) {
      handler(warning);
    }
  }

  /**
   * Convert Markdown to XML and back to Markdown, preserving all structure
   *
   * @param markdown - Markdown source text
   * @returns Markdown produced by converting to LLM-XML and back
   */
  public async roundTrip(markdown: string): Promise<string> {
    const mdAst = await markdownParser.parseToAST(markdown);
    logger.debug('Parsed Markdown AST', { ast: mdAst });
    const xmlAst = markdownToLLM.transform(mdAst);
    logger.debug('Transformed to LLM-XML AST', { ast: xmlAst });
    // Create a new serializer with hlevel enabled to preserve heading levels
    const serializer = new LLMSerializer({
      ...this.options,
      includeHlevel: true
    });
    const xml = serializer.serialize(xmlAst);
    return this.toMarkdown(xml);
  }

  /**
   * Normalize markdown formatting with consistent spacing
   *
   * @param markdown - The markdown content to normalize
   * @returns Normalized markdown with consistent spacing
   */
  public async normalizeMarkdown(markdown: string): Promise<string> {
    // Parse the markdown into an AST
    const mdAst = await markdownParser.parseToAST(markdown);
    logger.debug('Parsed Markdown AST for normalization', { ast: mdAst });
    // Serialize it back to markdown with proper spacing
    return markdownSerializer.serialize(mdAst);
  }
}
/**
 * Factory wrapper around the `LLMXML` constructor.
 *
 * @param options - Configuration options, passed to the constructor unchanged
 * @returns The freshly constructed LLMXML instance
 */
export function createLLMXML(options?: LLMXMLOptions): LLMXML {
  const instance = new LLMXML(options);
  return instance;
}
export * from './types';
export * from './errors';