UNPKG

llmxml

Version:

Convert between markdown and LLM-friendly pseudo-XML

228 lines (202 loc) 7.34 kB
import { ASTNode, TagNode, TextNode, ParagraphNode, ListNode, ListItemNode, CodeBlockNode } from '../types';
import logger from '../utils/logger';

// NOTE(review): `LLMXMLOptions` is referenced below but is not among the names
// imported in this file as shown — confirm it is declared globally or add the
// missing import.

/**
 * Serializes LLM-XML AST back to text
 */
export class LLMSerializer {
  /**
   * @param options Serializer options. `includeTitle`, `includeHlevel` and
   *   `verbose` default to `false`; any value supplied by the caller wins.
   */
  constructor(private options: LLMXMLOptions = {}) {
    this.options = {
      includeTitle: false,
      includeHlevel: false,
      verbose: false,
      ...options
    };
  }

  /**
   * Turns a free-form name into a tag name in the casing selected by
   * `options.tagFormat` (PascalCase when unset or unrecognised).
   *
   * @param name Raw name; may contain markdown punctuation or mixed casing.
   * @returns The formatted name, or `''` when `name` is empty or contains no
   *   alphanumeric characters.
   */
  private formatTagName(name: string): string {
    if (!name) return '';

    // Split into words and clean up
    const words = name
      .replace(/[`*_\[\]]/g, '') // Remove markdown formatting
      .replace(/([a-z])([A-Z])/g, '$1 $2') // Split camelCase
      .split(/[^a-zA-Z0-9]+/) // Split on non-word chars
      .filter(Boolean)
      .map(word => word.toLowerCase());

    if (words.length === 0) return '';

    switch (this.options.tagFormat) {
      case 'snake_case':
        return words.join('_');
      case 'SCREAMING_SNAKE':
        return words.join('_').toUpperCase();
      case 'camelCase':
        return words[0] + words.slice(1).map(w => w.charAt(0).toUpperCase() + w.slice(1)).join('');
      case 'UPPERCASE':
        return words.join('').toUpperCase();
      case 'PascalCase':
      default:
        return words.map(w => w.charAt(0).toUpperCase() + w.slice(1)).join('');
    }
  }

  /**
   * Convert AST to LLM-XML text
   *
   * @param ast Top-level nodes; `null`/`undefined` entries are skipped.
   * @returns The serialized nodes joined by a blank line. Nodes that
   *   serialize to an empty string are dropped.
   */
  public serialize(ast: ASTNode[]): string {
    // Filter out null/undefined nodes
    const validNodes = ast.filter((node): node is ASTNode => node != null);
    return validNodes
      .map(node => this.serializeNode(node))
      .filter(Boolean)
      .join('\n\n');
  }

  /**
   * Dispatches a node to the serializer for its `type`.
   *
   * @param node Node to serialize.
   * @param indent Prefix handed to the type-specific serializer.
   * @returns The serialized text, or `''` for an unknown node type or when
   *   the type-specific serializer throws (the problem is logged either way).
   */
  private serializeNode(node: ASTNode, indent: string = ''): string {
    try {
      switch (node.type) {
        case 'tag':
          return this.serializeTag(node as TagNode, indent);
        case 'text':
          return this.serializeText(node as TextNode, indent);
        case 'paragraph':
          return this.serializeParagraph(node as ParagraphNode, indent);
        case 'list':
          return this.serializeList(node as ListNode, indent);
        case 'code':
          return this.serializeCode(node as CodeBlockNode, indent);
        default:
          logger.warn('Unknown node type during serialization', { type: node.type });
          return '';
      }
    } catch (error) {
      logger.error('Error serializing node', { node, error });
      return '';
    }
  }

  /**
   * Serializes a tag node as `<Name …>children</Name>`.
   *
   * The `title` attribute is emitted when `verbose` or `includeTitle` is set
   * and the node has a title. The `hlevel` attribute is emitted when `verbose`
   * or `includeHlevel` is set and the node's hlevel is present and not `'1'`.
   *
   * @param node Tag node to serialize.
   * @param indent Prefix placed before the opening and closing tags.
   * @returns The serialized element, or `''` if serialization throws.
   */
  private serializeTag(node: TagNode, indent: string): string {
    try {
      const { name, attributes = {}, children = [] } = node;

      // Build opening tag with attributes
      const formattedName = this.formatTagName(name);
      let tag = `${indent}<${this.escapeXml(formattedName)}`;

      // Include attributes based on options
      const shouldIncludeTitle = this.options.verbose || this.options.includeTitle;
      const shouldIncludeHlevel = this.options.verbose || this.options.includeHlevel;

      if (shouldIncludeTitle && attributes.title) {
        tag += ` title="${this.escapeXml(attributes.title)}"`;
      }
      if (shouldIncludeHlevel && attributes.hlevel && attributes.hlevel !== '1') {
        tag += ` hlevel="${this.escapeXml(attributes.hlevel)}"`;
      }
      tag += '>';

      // Add children with proper indentation
      if (children.length > 0) {
        const childIndent = indent + ' ';
        const validChildren = children
          .filter((child): child is ASTNode => child != null)
          .map(child => this.serializeNode(child, childIndent))
          .filter(Boolean);

        if (validChildren.length > 0) {
          // The joined content is trimmed, so it never ends with a newline.
          // Always add one so the closing tag sits on its own line, even when
          // the last child's text ended with '\n' (that newline was trimmed).
          tag += '\n' + validChildren.join('\n').trimEnd() + '\n' + indent;
        }
      }

      // Add closing tag
      return tag + `</${this.escapeXml(formattedName)}>`;
    } catch (error) {
      logger.error('Error serializing tag', { node, error });
      return '';
    }
  }

  /**
   * Serializes a text node.
   *
   * @param node Text node to serialize.
   * @param indent Accepted for signature parity with the other serializers;
   *   not applied to text.
   * @returns `''` for an empty value; the value verbatim when
   *   `textType === 'code'`; otherwise the right-trimmed value with embedded
   *   JSON pretty-printed and all other text XML-escaped.
   */
  private serializeText(node: TextNode, indent: string): string {
    if (!node.value) return '';

    // Skip JSON detection for code blocks - preserve them exactly as is
    if (node.textType === 'code') {
      return node.value;
    }

    // Always preserve and pretty-print JSON structures in regular text
    return this.preserveJsonInText(node.value.trimEnd());
  }

  /**
   * Preserves JSON structures within text content and pretty-prints them.
   *
   * Every `{` or `[` is treated as a possible start of a JSON value. If the
   * balanced span starting there parses as JSON it is emitted pretty-printed
   * (2-space indentation) and unescaped; all remaining text is XML-escaped,
   * whether or not the text contains any JSON.
   *
   * @param text The text content that may contain JSON structures
   * @returns Text with JSON structures preserved and pretty-printed
   */
  private preserveJsonInText(text: string): string {
    let output = '';
    let plainStart = 0; // start of the pending run of non-JSON text
    let pos = 0;

    while (pos < text.length) {
      const ch = text[pos];
      if (ch === '{' || ch === '[') {
        const end = this.findBalancedEnd(text, pos);
        if (end !== -1) {
          const pretty = this.tryPrettyPrintJson(text.slice(pos, end));
          if (pretty !== undefined) {
            // Flush the plain text before the JSON (escaped), then the JSON.
            output += this.escapeXml(text.slice(plainStart, pos)) + pretty;
            pos = end;
            plainStart = end;
            continue;
          }
        }
      }
      // Not the start of valid JSON: the character stays in the plain run.
      pos++;
    }

    return output + this.escapeXml(text.slice(plainStart));
  }

  /**
   * Finds the end of the bracket-balanced span that starts at `start`.
   * Brackets inside double-quoted strings are ignored.
   *
   * @param text Text being scanned.
   * @param start Index of an opening `{` or `[`.
   * @returns Index just past the matching closer, or `-1` when the brackets
   *   are mismatched or never close.
   */
  private findBalancedEnd(text: string, start: number): number {
    const expectedClosers: string[] = [];
    let inString = false;

    for (let i = start; i < text.length; i++) {
      const ch = text[i];

      if (inString) {
        if (ch === '\\') {
          i++; // skip the escaped character
        } else if (ch === '"') {
          inString = false;
        }
        continue;
      }

      switch (ch) {
        case '"':
          inString = true;
          break;
        case '{':
          expectedClosers.push('}');
          break;
        case '[':
          expectedClosers.push(']');
          break;
        case '}':
        case ']':
          if (expectedClosers.pop() !== ch) return -1;
          if (expectedClosers.length === 0) return i + 1;
          break;
        default:
          break;
      }
    }

    return -1;
  }

  /**
   * Pretty-prints `candidate` if it is valid JSON.
   *
   * @param candidate Text that may be a JSON object or array.
   * @returns The JSON re-serialized with 2-space indentation, or `undefined`
   *   when `candidate` does not parse.
   */
  private tryPrettyPrintJson(candidate: string): string | undefined {
    try {
      const parsed: unknown = JSON.parse(candidate);
      return JSON.stringify(parsed, null, 2);
    } catch {
      return undefined;
    }
  }

  /**
   * Serializes a paragraph by concatenating its serialized children.
   *
   * @param node Paragraph node to serialize.
   * @param indent Prefix placed before the paragraph text.
   * @returns The indented paragraph text, or `''` (with a warning) when
   *   `children` is not an array.
   */
  private serializeParagraph(node: ParagraphNode, indent: string): string {
    if (!Array.isArray(node.children)) {
      logger.warn('Malformed paragraph node', { node });
      return '';
    }

    return indent + node.children
      .map(child => this.serializeNode(child, ''))
      .filter(Boolean)
      .join('');
  }

  /**
   * Serializes a list, one item per line.
   *
   * Ordered lists use `1.`, `2.`, … based on the item's position; unordered
   * lists use `-`. Items that serialize to an empty string are dropped.
   *
   * @param node List node to serialize.
   * @param indent Prefix placed before each marker.
   * @returns The list text, or `''` (with a warning) when `children` is not
   *   an array.
   */
  private serializeList(node: ListNode, indent: string): string {
    if (!Array.isArray(node.children)) {
      logger.warn('Malformed list node', { node });
      return '';
    }

    return node.children
      .map((item: ListItemNode, index: number) => {
        const marker = node.ordered ? `${index + 1}.` : '-';
        const itemText = this.serializeListItem(item, `${indent} `);
        return itemText ? `${indent}${marker} ${itemText}` : '';
      })
      .filter(Boolean)
      .join('\n');
  }

  /**
   * Serializes the content of a single list item.
   *
   * @param node List item to serialize.
   * @param indent Indentation passed to nested lists, which start on a new
   *   line; other children are serialized inline without indentation.
   * @returns The item text, or `''` (with a warning) when `children` is not
   *   an array.
   */
  private serializeListItem(node: ListItemNode, indent: string): string {
    if (!Array.isArray(node.children)) {
      logger.warn('Malformed list item', { node });
      return '';
    }

    return node.children
      .map(child => {
        if (child.type === 'list') {
          return '\n' + this.serializeList(child as ListNode, indent);
        }
        return this.serializeNode(child, '');
      })
      .filter(Boolean)
      .join('');
  }

  /**
   * Serializes a code block as a fenced block.
   *
   * @param node Code block node; `lang` and `value` default to `''`.
   * @param indent Prefix placed before the opening fence.
   * @returns The fenced block, or `''` (with a warning) when the block has no
   *   content.
   */
  private serializeCode(node: CodeBlockNode, indent: string): string {
    const { lang = '', value = '' } = node;
    if (!value) {
      logger.warn('Empty code block', { node });
      return '';
    }

    // The closing fence must start its own line; add the newline only when
    // the code does not already end with one.
    const body = value.endsWith('\n') ? value : `${value}\n`;
    return `${indent}\`\`\`${lang}\n${body}\`\`\``;
  }

  /**
   * Escapes the five XML special characters (`&`, `<`, `>`, `"`, `'`).
   *
   * @param str Raw text.
   * @returns Text with those characters replaced by their entities.
   */
  private escapeXml(str: string): string {
    return str
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&apos;');
  }
}

// Export a singleton instance
export const llmSerializer = new LLMSerializer({ includeTitle: true, includeHlevel: true });