llmxml
Version:
Convert between markdown and LLM-friendly pseudo-XML
238 lines (222 loc) • 6.46 kB
text/typescript
import { unified } from 'unified';
import remarkParse from 'remark-parse';
import type { Root, RootContent, List, ListItem } from 'mdast';
import type { ASTNode, SourceLocation } from '../types/ast';
import { LLMXMLError } from '../errors';
import logger from '../utils/logger';
import { visit } from 'unist-util-visit';
// Maps a remark position object onto our SourceLocation shape.
// Returns undefined when no position is supplied. Falsy point fields fall
// back to offset 0, line 1 and column 1.
function convertPosition(pos: any): SourceLocation | undefined {
  if (!pos) {
    return undefined;
  }

  // Both endpoints use identical defaults, so normalise them the same way.
  const toPoint = (point: any) => ({
    offset: point.offset || 0,
    line: point.line || 1,
    column: point.column || 1
  });

  const start = toPoint(pos.start);
  const end = toPoint(pos.end);
  return { start, end };
}
/**
 * Custom remark plugin to capture emphasis markers.
 *
 * For every `emphasis` and `strong` node, looks at the raw Markdown the node
 * was parsed from and stores the delimiter that was used on
 * `node.data.marker`: `_` or `*` for emphasis, `__` or `**` for strong.
 *
 * The syntax tree does not keep a reference to the source text (positions
 * only carry start/end points), so the raw Markdown is taken from the file
 * that is passed to the transformer as its second argument. When no textual
 * file value is available the tree is left untouched.
 *
 * @returns A transformer that annotates the tree in place
 */
function remarkEmphasisMarkers() {
  return (tree: Root, file?: { value?: unknown }) => {
    const source = typeof file?.value === 'string' ? file.value : undefined;
    if (source === undefined) {
      // Without the original text the marker cannot be determined.
      return;
    }

    visit(tree, ['emphasis', 'strong'], (node: any) => {
      const startOffset = node.position?.start?.offset;
      const endOffset = node.position?.end?.offset;
      if (typeof startOffset !== 'number' || typeof endOffset !== 'number') {
        // Nodes without offsets cannot be mapped back to source.
        return;
      }

      // Check the raw source text to determine the marker used
      const raw = source.slice(startOffset, endOffset);
      if (raw) {
        const marker = raw.startsWith('_') ? '_' : '*';
        node.data = node.data || {};
        // Strong emphasis doubles the delimiter character.
        node.data.marker = marker.repeat(node.type === 'strong' ? 2 : 1);
      }
    });
  };
}
/**
 * Markdown parser implementation.
 *
 * Parses Markdown with remark and converts the resulting tree into this
 * project's ASTNode representation.
 */
export class MarkdownParser {
  /**
   * Parse Markdown content into an AST
   *
   * @param input - The Markdown content to parse
   * @returns The parsed AST
   * @throws LLMXMLError with code `PARSE_ERROR` when parsing, validation or
   *   transformation fails; the underlying error is attached as `error`
   */
  public async parseToAST(input: string): Promise<ASTNode[]> {
    try {
      const processor = unified()
        .use(remarkParse)
        .use(remarkEmphasisMarkers);

      // parse() is synchronous and only builds the syntax tree; transformer
      // plugins are executed by run(). Run them explicitly and pass the
      // source text as the file so transformers can inspect the original
      // Markdown.
      const mdast = (await processor.run(processor.parse(input), input)) as Root;

      // Validate the AST for unclosed code blocks
      this.validateAST(mdast);

      return this.transformNodes(mdast.children);
    } catch (error) {
      logger.error('Failed to parse Markdown', { error });
      throw new LLMXMLError(
        'Failed to parse Markdown',
        'PARSE_ERROR',
        { error }
      );
    }
  }

  /**
   * Validate the Markdown AST for common issues
   *
   * Walks the whole tree and rejects any `code` node whose value is empty.
   * NOTE(review): a fence that is properly closed but has no content
   * presumably also has an empty value and would be reported as unclosed —
   * confirm this is intended.
   *
   * @param root - The remark root node to check
   * @throws LLMXMLError with code `PARSE_ERROR` for an empty code node
   */
  private validateAST(root: Root): void {
    const validate = (node: any): void => {
      if (node.type === 'code' && !node.value) {
        throw new LLMXMLError(
          'Unclosed code block',
          'PARSE_ERROR',
          { node }
        );
      }
      if (Array.isArray(node.children)) {
        node.children.forEach(validate);
      }
    };
    root.children.forEach(validate);
  }

  /**
   * Transform remark AST nodes into our AST format
   *
   * Nodes that have no equivalent in our format are dropped.
   *
   * @param nodes - Array of remark AST nodes
   * @returns Array of transformed AST nodes
   */
  private transformNodes(nodes: RootContent[]): ASTNode[] {
    return nodes
      .map(node => this.transformNode(node))
      .filter((converted): converted is ASTNode => converted !== null);
  }

  /**
   * Transform a single remark AST node into our AST format
   *
   * @param node - The remark AST node to transform
   * @returns The transformed AST node, or null (after logging a warning) when
   *   the node type is not supported
   */
  private transformNode(node: RootContent): ASTNode | null {
    switch (node.type) {
      case 'heading':
        // Headings are flattened: their inline content becomes plain text
        // and no children are carried over.
        return {
          type: 'heading',
          text: this.getNodeText(node),
          depth: node.depth,
          children: [],
          location: convertPosition(node.position)
        };
      case 'text':
        return {
          type: 'text',
          value: node.value,
          location: convertPosition(node.position)
        };
      case 'code':
        return {
          type: 'code',
          value: node.value,
          // Normalise a missing/empty language to undefined.
          lang: node.lang || undefined,
          location: convertPosition(node.position)
        };
      case 'list':
        return {
          type: 'list',
          ordered: node.ordered || false,
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'listItem':
        return {
          type: 'listItem',
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'paragraph':
        return {
          type: 'paragraph',
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'link':
        return {
          type: 'link',
          url: node.url,
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'emphasis':
        return {
          type: 'emphasis',
          // Use the marker recorded on the node, defaulting to '*'.
          marker: (node as any).data?.marker || '*',
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      case 'strong':
        return {
          type: 'strong',
          // Use the marker recorded on the node, defaulting to '**'.
          marker: (node as any).data?.marker || '**',
          children: this.transformNodes(node.children),
          location: convertPosition(node.position)
        };
      default:
        logger.warn('Unknown node type during transformation', { type: node.type });
        return null;
    }
  }

  /**
   * Get the text content of a node
   *
   * Returns the node's own `value` when it has one, otherwise the
   * concatenated text of its children, otherwise an empty string.
   *
   * @param node - The remark AST node to read
   * @returns The plain text contained in the node
   */
  private getNodeText(node: RootContent): string {
    if ('value' in node) {
      return node.value;
    }
    if ('children' in node) {
      return node.children.map(child => this.getNodeText(child)).join('');
    }
    return '';
  }

  /**
   * Serialize a list node to text
   *
   * Unordered items are prefixed with `-`. Ordered items are numbered
   * consecutively, beginning at the list's `start` value when it has one and
   * at 1 otherwise. Items that serialize to nothing are omitted.
   *
   * @param node - The list node to serialize
   * @param indent - Prefix placed in front of every item of this list
   * @returns One line per non-empty item, joined with newlines
   */
  private serializeList(node: List, indent: string = ''): string {
    if (!Array.isArray(node.children)) {
      return '';
    }

    // Honour the list's own starting number instead of always counting from 1.
    const firstNumber = node.start ?? 1;

    return node.children
      .map((item: ListItem, index: number) => {
        const marker = node.ordered ? `${firstNumber + index}.` : '-';
        const itemText = this.serializeListItem(item, `${indent} `);
        return itemText ? `${indent}${marker} ${itemText}` : '';
      })
      .filter(Boolean)
      .join('\n');
  }

  /**
   * Serialize a list item node to text
   *
   * Paragraph children contribute their plain text; nested lists are placed
   * on a new line and serialized with the given indent. Any other child is
   * ignored.
   *
   * @param node - The list item node to serialize
   * @param indent - Indent applied to lists nested inside this item
   * @returns The concatenated text of the item
   */
  private serializeListItem(node: ListItem, indent: string): string {
    if (!Array.isArray(node.children)) {
      return '';
    }
    return node.children
      .map(child => {
        if (child.type === 'list') {
          return '\n' + this.serializeList(child as List, indent);
        }
        if (child.type === 'paragraph') {
          return this.getNodeText(child);
        }
        return '';
      })
      .filter(Boolean)
      .join('');
  }
}
// Export a singleton instance: a shared, ready-made MarkdownParser for
// callers that do not want to construct their own.
export const markdownParser = new MarkdownParser();