// llmxml
// Version: (unspecified)
// Convert between Markdown and LLM-friendly pseudo-XML
// 228 lines (202 loc) • 7.34 kB
// text/typescript
import { ASTNode, TagNode, TextNode, ParagraphNode, ListNode, ListItemNode, CodeBlockNode } from '../types';
import logger from '../utils/logger';
/**
 * Serializes an LLM-XML AST back to text.
 *
 * Tag nodes become `<Name ...>children</Name>` elements; text, paragraph,
 * list and code nodes are rendered as markdown-like text inside them.
 * Serialization is best-effort: a node that throws or has an unknown type
 * is logged and rendered as an empty string instead of aborting the run.
 */
export class LLMSerializer {
  /**
   * @param options Serializer settings. `includeTitle`, `includeHlevel` and
   *   `verbose` default to `false`; values supplied by the caller win.
   */
  constructor(private options: LLMXMLOptions = {}) {
    this.options = {
      includeTitle: false,
      includeHlevel: false,
      verbose: false,
      ...options
    };
  }

  /**
   * Turns a free-form heading into a tag name in the configured `tagFormat`.
   *
   * @param name Raw name, possibly containing markdown formatting.
   * @returns The formatted name (PascalCase when `tagFormat` is unset or
   *   unrecognized), or `''` when no alphanumeric words remain.
   */
  private formatTagName(name: string): string {
    if (!name) return '';
    // Split into words and clean up
    const words = name
      .replace(/[`*_\[\]]/g, '') // Remove markdown formatting
      .replace(/([a-z])([A-Z])/g, '$1 $2') // Split camelCase
      .split(/[^a-zA-Z0-9]+/) // Split on non-word chars
      .filter(Boolean)
      .map(word => word.toLowerCase());
    if (words.length === 0) return '';
    switch (this.options.tagFormat) {
      case 'snake_case':
        return words.join('_');
      case 'SCREAMING_SNAKE':
        return words.join('_').toUpperCase();
      case 'camelCase':
        return words[0] + words.slice(1).map(w => w.charAt(0).toUpperCase() + w.slice(1)).join('');
      case 'UPPERCASE':
        return words.join('').toUpperCase();
      case 'PascalCase':
      default:
        return words.map(w => w.charAt(0).toUpperCase() + w.slice(1)).join('');
    }
  }

  /**
   * Convert AST to LLM-XML text.
   *
   * @param ast Top-level nodes; `null`/`undefined` entries are skipped.
   * @returns The non-empty serialized nodes joined by a blank line.
   */
  public serialize(ast: ASTNode[]): string {
    // Filter out null/undefined nodes
    const validNodes = ast.filter((node): node is ASTNode => node != null);
    return validNodes.map(node => this.serializeNode(node)).filter(Boolean).join('\n\n');
  }

  /**
   * Dispatches a node to the serializer for its `type`.
   *
   * @param node Node to render.
   * @param indent Prefix passed through to the type-specific serializer.
   * @returns The rendered text, or `''` for unknown types and for nodes
   *   whose serialization threw (both cases are logged).
   */
  private serializeNode(node: ASTNode, indent: string = ''): string {
    try {
      switch (node.type) {
        case 'tag':
          return this.serializeTag(node as TagNode, indent);
        case 'text':
          return this.serializeText(node as TextNode, indent);
        case 'paragraph':
          return this.serializeParagraph(node as ParagraphNode, indent);
        case 'list':
          return this.serializeList(node as ListNode, indent);
        case 'code':
          return this.serializeCode(node as CodeBlockNode, indent);
        default:
          logger.warn('Unknown node type during serialization', { type: node.type });
          return '';
      }
    } catch (error) {
      logger.error('Error serializing node', { node, error });
      return '';
    }
  }

  /**
   * Renders a tag node as an opening tag, its children, and a closing tag.
   *
   * `title` is emitted when `verbose` or `includeTitle` is set; `hlevel` is
   * emitted when `verbose` or `includeHlevel` is set and its value is not
   * `'1'`. When at least one child renders to non-empty text, the children
   * are placed on their own lines and the closing tag goes on a new line at
   * `indent`; otherwise the element is rendered on a single line.
   *
   * @returns The element text, or `''` if serialization threw (logged).
   */
  private serializeTag(node: TagNode, indent: string): string {
    try {
      const { name, attributes = {}, children = [] } = node;
      // Build opening tag with attributes
      const formattedName = this.formatTagName(name);
      let tag = `${indent}<${this.escapeXml(formattedName)}`;
      // Include attributes based on options
      const shouldIncludeTitle = this.options.verbose || this.options.includeTitle;
      const shouldIncludeHlevel = this.options.verbose || this.options.includeHlevel;
      if (shouldIncludeTitle && attributes.title) {
        tag += ` title="${this.escapeXml(attributes.title)}"`;
      }
      if (shouldIncludeHlevel && attributes.hlevel && attributes.hlevel !== '1') {
        tag += ` hlevel="${this.escapeXml(attributes.hlevel)}"`;
      }
      tag += '>';
      // Add children with proper indentation
      if (children.length > 0) {
        // NOTE(review): the step is a single space as written; confirm a
        // wider indent was not intended.
        const childIndent = indent + ' ';
        const validChildren = children
          .filter((child): child is ASTNode => child != null)
          .map(child => this.serializeNode(child, childIndent))
          .filter(Boolean);
        if (validChildren.length > 0) {
          // trimEnd() strips any trailing newline the last child carried, so
          // the line break before the closing tag must always be re-added;
          // otherwise the closing tag is glued onto the last content line.
          tag += '\n' + validChildren.join('\n').trimEnd() + '\n' + indent;
        }
      }
      // Add closing tag
      return tag + `</${this.escapeXml(formattedName)}>`;
    } catch (error) {
      logger.error('Error serializing tag', { node, error });
      return '';
    }
  }

  /**
   * Renders a text node.
   *
   * Text whose `textType` is `'code'` is returned verbatim. Any other text
   * is right-trimmed, XML-escaped, and has embedded JSON pretty-printed.
   *
   * @param indent Accepted for signature parity with the other serializers;
   *   not applied to text.
   */
  private serializeText(node: TextNode, indent: string): string {
    if (!node.value) return '';
    // Skip JSON detection for code blocks - preserve them exactly as is
    if (node.textType === 'code') {
      return node.value;
    }
    // Always preserve and pretty-print JSON structures in regular text
    return this.preserveJsonInText(node.value.trimEnd());
  }

  /**
   * Preserves JSON structures within text content and pretty-prints them.
   *
   * Candidates are the shortest `{...}` / `[...]` spans, so a nested
   * structure is not matched as a whole and ends up escaped like plain text.
   * Everything that is not valid JSON, including the text between
   * candidates, is XML-escaped.
   *
   * @param text The text content that may contain JSON structures
   * @returns Text with JSON structures pretty-printed and the rest escaped
   */
  private preserveJsonInText(text: string): string {
    // The capturing group makes split() return the candidates too: even
    // indices are the surrounding text, odd indices are the candidates.
    const jsonPattern = /(\{[\s\S]*?\}|\[[\s\S]*?\])/;
    return text
      .split(jsonPattern)
      .map((segment, index) =>
        index % 2 === 1 ? this.formatJsonCandidate(segment) : this.escapeXml(segment)
      )
      .join('');
  }

  /**
   * Pretty-prints `candidate` with 2-space indentation when it parses as
   * JSON; otherwise returns it XML-escaped like normal text.
   */
  private formatJsonCandidate(candidate: string): string {
    try {
      return JSON.stringify(JSON.parse(candidate), null, 2);
    } catch (error) {
      return this.escapeXml(candidate);
    }
  }

  /**
   * Renders a paragraph as `indent` followed by its children concatenated
   * without separators. Children are serialized with no indent of their own.
   *
   * @returns The paragraph text, or `''` when `children` is not an array.
   */
  private serializeParagraph(node: ParagraphNode, indent: string): string {
    if (!Array.isArray(node.children)) {
      logger.warn('Malformed paragraph node', { node });
      return '';
    }
    return indent + node.children
      .map(child => this.serializeNode(child, ''))
      .filter(Boolean)
      .join('');
  }

  /**
   * Renders a list, one item per line, using `N.` markers for ordered lists
   * and `-` otherwise. Items that serialize to empty text are dropped.
   *
   * @returns The list text, or `''` when `children` is not an array.
   */
  private serializeList(node: ListNode, indent: string): string {
    if (!Array.isArray(node.children)) {
      logger.warn('Malformed list node', { node });
      return '';
    }
    return node.children
      .map((item: ListItemNode, index: number) => {
        const marker = node.ordered ? `${index + 1}.` : '-';
        // Nested lists inside the item are indented one step further.
        const itemText = this.serializeListItem(item, `${indent} `);
        return itemText ? `${indent}${marker} ${itemText}` : '';
      })
      .filter(Boolean)
      .join('\n');
  }

  /**
   * Renders the content of one list item. Nested lists start on a new line
   * at `indent`; every other child is serialized inline with no indent.
   *
   * @returns The item text, or `''` when `children` is not an array.
   */
  private serializeListItem(node: ListItemNode, indent: string): string {
    if (!Array.isArray(node.children)) {
      logger.warn('Malformed list item', { node });
      return '';
    }
    return node.children
      .map(child => {
        if (child.type === 'list') {
          return '\n' + this.serializeList(child as ListNode, indent);
        }
        return this.serializeNode(child, '');
      })
      .filter(Boolean)
      .join('');
  }

  /**
   * Renders a fenced code block. `indent` is applied to the opening fence
   * only; `value` is emitted verbatim and the closing fence follows it
   * directly.
   *
   * @returns The fenced block, or `''` (with a warning) when `value` is empty.
   */
  private serializeCode(node: CodeBlockNode, indent: string): string {
    const { lang = '', value = '' } = node;
    if (!value) {
      logger.warn('Empty code block', { node });
      return '';
    }
    // NOTE(review): presumably `value` already ends with a newline; confirm
    // against the parser, otherwise the closing fence lands on the last line.
    return `${indent}\`\`\`${lang}\n${value}\`\`\``;
  }

  /**
   * Escapes the five XML special characters. `&` is replaced first so the
   * ampersands of the entities introduced afterwards are not re-escaped.
   */
  private escapeXml(str: string): string {
    return str
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&apos;');
  }
}
/**
 * Shared serializer instance, constructed with `title` and `hlevel`
 * attribute output switched on.
 */
export const llmSerializer = new LLMSerializer({ includeHlevel: true, includeTitle: true });