llmxml
Version:
Convert between markdown and LLM-friendly pseudo-XML
227 lines (209 loc) • 6.42 kB
text/typescript
import type { ASTNode, TagNode, TextNode, SourceLocation } from '../types/ast';
import logger from '../utils/logger';
/**
 * Loose, catch-all view of a Markdown AST node as consumed by the transformer
 * in this file. Every field except `type` is optional because each node kind
 * only populates the subset it needs; the transformer casts incoming nodes to
 * this shape before reading kind-specific fields.
 */
interface ExtendedASTNode {
// Node kind; the transformer switches on values such as 'heading', 'code',
// 'list', 'paragraph', 'text', 'emphasis', 'strong' and 'link'.
type: string;
// Source position; copied through unchanged onto the generated nodes.
location?: SourceLocation;
// Heading level, read for 'heading' nodes (treated as 1 when missing or 0).
depth?: number;
// Heading text, used for the tag name and the `title` attribute.
text?: string;
// Language label of a 'code' node.
lang?: string;
// Literal content of 'code' and 'text' nodes.
value?: string;
// Child nodes of containers ('list', list items, 'paragraph', inline wrappers).
children?: ExtendedASTNode[];
// Read on 'list' nodes: truthy selects numbered markers, otherwise '-'.
ordered?: boolean;
// Link target, read on 'link' nodes.
url?: string;
}
/**
 * Transforms our internal Markdown AST into LLM-XML AST format.
 *
 * Headings become tag nodes that are nested according to heading depth; every
 * other node is flattened to a text node and attached to the nearest preceding
 * heading (or to the root when no heading has been seen yet).
 */
export class MarkdownToLLMTransformer {
  /**
   * Convert a Markdown AST to LLM-XML AST
   *
   * @param nodes - Array of Markdown AST nodes, in document order
   * @returns Array of root-level LLM-XML AST nodes; nested sections and their
   *          content live in the `children` of the returned tag nodes
   */
  public transform(nodes: ASTNode[]): ASTNode[] {
    const result: ASTNode[] = [];
    // Open sections from outermost to innermost. The depth is stored next to
    // the tag so levels can be compared without casting the tag node.
    const sectionStack: Array<{ depth: number; tag: TagNode }> = [];

    for (const node of nodes) {
      if (node.type === 'heading') {
        const heading = node as ExtendedASTNode;
        const tag = this.createTagNode(heading);
        const level = heading.depth || 1;

        // Close every open section at the same or a deeper level, so that the
        // remaining top of the stack (if any) is the new section's parent.
        let parent = sectionStack[sectionStack.length - 1];
        while (parent !== undefined && parent.depth >= level) {
          sectionStack.pop();
          parent = sectionStack[sectionStack.length - 1];
        }

        if (parent !== undefined) {
          parent.tag.children = parent.tag.children || [];
          parent.tag.children.push(tag);
        } else {
          result.push(tag);
        }

        sectionStack.push({ depth: level, tag });
      } else {
        const content = this.transformContent(node as ExtendedASTNode);
        const current = sectionStack[sectionStack.length - 1];

        if (current !== undefined) {
          // Content belongs to the most recently opened section
          current.tag.children = current.tag.children || [];
          current.tag.children.push(content);
        } else {
          // No heading seen yet: content stays at the root
          result.push(content);
        }
      }
    }

    return result;
  }

  /**
   * Create a tag node from a heading node
   *
   * @param heading - The heading node to convert
   * @returns A new tag node with `title` and `hlevel` attributes and no children
   */
  private createTagNode(heading: ExtendedASTNode): TagNode {
    const name = this.generateTagName(heading.text || '');
    logger.debug('Creating tag node', { text: heading.text, name });
    return {
      type: 'tag',
      name,
      attributes: {
        title: heading.text || name,
        hlevel: String(heading.depth || 1),
      },
      children: [],
      depth: heading.depth || 1, // Keep track of depth for nesting
      location: heading.location
    } as TagNode;
  }

  /**
   * Transform non-heading content into text nodes
   *
   * @param node - A code, list, paragraph or text node
   * @returns A text node holding the Markdown serialization of `node`; unknown
   *          node types are logged and produce an empty text node
   */
  private transformContent(node: ExtendedASTNode): TextNode {
    switch (node.type) {
      case 'code':
        return {
          type: 'text',
          // A code node without a value yields an empty fence rather than the
          // literal text "undefined".
          value: '```' + (node.lang || '') + '\n' + (node.value ?? '') + '\n```',
          textType: 'code',
          language: node.lang,
          location: node.location
        } as TextNode;
      case 'list':
        return {
          type: 'text',
          value: this.serializeList(node),
          location: node.location
        } as TextNode;
      case 'paragraph':
        return {
          type: 'text',
          value: this.serializeParagraph(node),
          location: node.location
        } as TextNode;
      case 'text':
        return {
          type: 'text',
          value: node.value || '',
          location: node.location
        } as TextNode;
      default:
        logger.warn('Unknown node type during content transformation', { type: node.type });
        return {
          type: 'text',
          value: '',
          location: node.location
        } as TextNode;
    }
  }

  /**
   * Generate a tag name from heading text
   *
   * Markdown formatting characters are stripped, the rest is split on runs of
   * non-alphanumeric ASCII characters, and each word is capitalized and joined.
   *
   * @param text - The heading text
   * @returns A PascalCase tag name (e.g. "Getting started" -> "GettingStarted").
   *          The result is an empty string when `text` contains no ASCII
   *          letters or digits.
   */
  private generateTagName(text: string): string {
    return text
      // Remove markdown formatting
      .replace(/[`*_\[\]]/g, '')
      // Split on non-word characters
      .split(/[^a-zA-Z0-9]+/)
      // Filter out empty strings
      .filter(Boolean)
      // Capitalize each word
      .map((word) => {
        const normalized = word.toLowerCase();
        return normalized.charAt(0).toUpperCase() + normalized.slice(1);
      })
      .join('');
  }

  /**
   * Serialize a list node to text
   *
   * @param node - The list node; `ordered` selects "1." style markers over "-"
   * @returns One line per non-empty item, joined with newlines; '' when the
   *          node has no children array
   */
  private serializeList(node: ExtendedASTNode): string {
    if (!Array.isArray(node.children)) {
      return '';
    }
    const ordered = Boolean(node.ordered);
    return node.children
      .map((item: ExtendedASTNode, index: number) => {
        const marker = ordered ? `${index + 1}.` : '-';
        // Nested lists are indented to the column where this item's text
        // starts (marker width plus the following space).
        const itemText = this.serializeListItem(item, ' '.repeat(marker.length + 1));
        return itemText ? `${marker} ${itemText}` : '';
      })
      .filter(Boolean)
      .join('\n');
  }

  /**
   * Serialize a list item node to text
   *
   * @param node - The list item; only paragraph and list children are emitted
   * @param nestedIndent - Prefix added to every line of a nested list so that
   *                       it stays visibly subordinate to this item
   * @returns The item text without its marker; '' when nothing is serializable
   */
  private serializeListItem(node: ExtendedASTNode, nestedIndent = '  '): string {
    if (!Array.isArray(node.children)) {
      return '';
    }
    return node.children
      .map((child: ExtendedASTNode) => {
        if (child.type === 'list') {
          const nested = this.serializeList(child);
          // An empty nested list contributes nothing (not even a newline)
          if (!nested) {
            return '';
          }
          const indented = nested
            .split('\n')
            .map((line) => nestedIndent + line)
            .join('\n');
          return '\n' + indented;
        }
        if (child.type === 'paragraph') {
          return this.serializeParagraph(child);
        }
        return '';
      })
      .filter(Boolean)
      .join('');
  }

  /**
   * Serialize a paragraph node to text
   *
   * Also used recursively for inline containers (emphasis, strong, link).
   *
   * @param node - Node whose inline children are rendered back to Markdown
   * @returns The concatenated inline text; unsupported child types render as ''
   */
  private serializeParagraph(node: ExtendedASTNode): string {
    if (!Array.isArray(node.children)) {
      return '';
    }
    return node.children
      .map((child: ExtendedASTNode) => {
        switch (child.type) {
          case 'text':
            return child.value || '';
          case 'emphasis':
            return `*${this.serializeParagraph(child)}*`;
          case 'strong':
            return `**${this.serializeParagraph(child)}**`;
          case 'link':
            // A link without a url renders an empty target instead of the
            // literal text "undefined".
            return `[${this.serializeParagraph(child)}](${child.url ?? ''})`;
          default:
            return '';
        }
      })
      .join('');
  }
}
// Shared, ready-to-use transformer instance exported for callers that do not
// want to construct their own MarkdownToLLMTransformer.
export const markdownToLLM = new MarkdownToLLMTransformer();