UNPKG

llm-stream-parser

Version:

A TypeScript library for parsing and processing structured data from LLM streaming responses with custom tag definitions and event-driven architecture

569 lines (485 loc) 16.1 kB
/** * Main Stream Parser - Core parsing engine * Supports both flat and nested XML parsing modes */ import { EventEmitter } from 'events'; import { BaseTag, NestedTag, ParserState, ParserStats, TagMatch } from '../types/base'; import { ParserConfig, mergeConfig, RequiredParserConfig } from '../types/config'; import { ParserError, ParserErrorCode } from '../types/errors'; import { TagDefinition } from '../types/schema'; import { BufferManager } from './buffer-manager'; import { TagMatcher } from './tag-matcher'; import { TagValidator } from './validator'; import { TagTransformer } from './transformer'; /** * Stack entry for tracking open tags in nested mode */ interface TagStackEntry { tag: NestedTag; startIndex: number; depth: number; path: string; } /** * Main stream parser with support for both flat and nested XML parsing */ export class StreamParser<T extends BaseTag = BaseTag> extends EventEmitter { private readonly config: RequiredParserConfig; private readonly bufferManager: BufferManager; private readonly tagMatcher: TagMatcher; private readonly tagRegistry = new Map<string, TagDefinition<T>>(); private state: ParserState = ParserState.IDLE; private stats: ParserStats; // Nested parsing state private tagStack: TagStackEntry[] = []; private currentDepth = 0; private currentPath = ''; constructor(config: ParserConfig = {}) { super(); this.config = mergeConfig(config); this.bufferManager = new BufferManager(this.config.maxBufferSize); this.tagMatcher = new TagMatcher(this.config.caseSensitive); this.stats = this.initializeStats(); } /** * Register a tag definition */ defineTag(definition: TagDefinition<T>): this { this.tagRegistry.set(definition.tagName, definition); this.updateStats(); return this; } /** * Remove a tag definition */ removeTag(tagName: string): boolean { const deleted = this.tagRegistry.delete(tagName); if (deleted) { this.updateStats(); } return deleted; } /** * Check if a tag is registered */ hasTag(tagName: string): boolean { return this.tagRegistry.has(tagName); } /** * Get all registered tag names */ getRegisteredTags(): readonly string[] { return Array.from(this.tagRegistry.keys()); } /** * Parse a chunk of streaming data */ parse(chunk: string): void { try { this.state = ParserState.PARSING; this.bufferManager.append(chunk); // Choose parsing strategy based on configuration if (this.config.enableNested) { this.processBufferNested(); } else { this.processBufferFlat(); } this.state = ParserState.COMPLETED; } catch (error) { this.state = ParserState.ERROR; this.stats = { ...this.stats, errorCount: this.stats.errorCount + 1 }; this.emitError( error instanceof ParserError ? error : new ParserError( `Unexpected error: ${error instanceof Error ? error.message : String(error)}`, ParserErrorCode.INVALID_TAG_FORMAT ), chunk ); } } /** * Process buffer for flat parsing mode */ private processBufferFlat(): void { const buffer = this.bufferManager.getContent(); const completeTags = this.tagMatcher.findCompleteTags(buffer); const parsedTags: T[] = []; for (const match of completeTags) { try { const tag = this.processTagFlat(match); if (tag) { parsedTags.push(tag); this.stats = { ...this.stats, totalTagsParsed: this.stats.totalTagsParsed + 1 }; } } catch (error) { this.emitError( error instanceof ParserError ? error : new ParserError('Failed to process tag', ParserErrorCode.TRANSFORMATION_FAILED), match ); } } // Remove processed content from buffer this.removeProcessedContent(completeTags); if (parsedTags.length > 0) { this.emit('parsing_complete', parsedTags); this.emit('document_completed', parsedTags); } this.updateStats(); } /** * Process buffer for nested parsing mode */ private processBufferNested(): void { const buffer = this.bufferManager.getContent(); let lastProcessedIndex = 0; while (lastProcessedIndex < buffer.length) { const nextTag = this.tagMatcher.findNextTag(buffer, lastProcessedIndex); if (!nextTag) break; // Check if the tag is registered BEFORE processing const isRegistered = this.tagRegistry.has(nextTag.tagName); if (!isRegistered) { // If tag is not registered, treat the entire tag (including content) as text content const unregisteredTagContent = this.tagMatcher.extractTextContent( buffer, lastProcessedIndex, nextTag.endIndex ); this.handleTextContent(unregisteredTagContent); lastProcessedIndex = nextTag.endIndex; continue; } // Extract text content between tags if (nextTag.startIndex > lastProcessedIndex) { const textContent = this.tagMatcher.extractTextContent( buffer, lastProcessedIndex, nextTag.startIndex ); this.handleTextContent(textContent); } // Process the tag this.processTagNested(nextTag); lastProcessedIndex = nextTag.endIndex; } // Update buffer to remaining content if (lastProcessedIndex > 0) { this.bufferManager.consume(lastProcessedIndex); } } /** * Process a single tag match in flat mode */ private processTagFlat(match: TagMatch): T | null { const definition = this.tagRegistry.get(match.tagName); if (!definition) { throw ParserError.fromUnknownTag(match.tagName); } // Emit tag started event this.emit('tag_started', match.tagName as T['tagName'], match.attributes as T['attributes']); definition.onStart?.(match.tagName as T['tagName'], match.attributes as T['attributes']); // Create tag object const tag: T = { tagName: match.tagName, content: this.config.trimWhitespace ? match.content.trim() : match.content, attributes: match.attributes, } as T; // Apply defaults TagTransformer.applyDefaults(tag, definition); // Validate and transform TagValidator.validate(tag, definition); TagTransformer.transform(tag, definition); // Emit completion events this.emit('tag_completed', tag); definition.onComplete?.(tag); return tag; } /** * Process a tag in nested mode */ private processTagNested(tagMatch: TagMatch): void { switch (tagMatch.type) { case 'opening': this.handleOpeningTag(tagMatch); break; case 'closing': this.handleClosingTag(tagMatch); break; case 'self-closing': this.handleSelfClosingTag(tagMatch); break; } } /** * Handle opening tag in nested mode */ private handleOpeningTag(tagMatch: TagMatch): void { // Check depth limits if (this.currentDepth >= this.config.maxDepth) { throw ParserError.fromMaxDepth(this.config.maxDepth, tagMatch.path); } const definition = this.tagRegistry.get(tagMatch.tagName); // Create new nested tag const newTag: NestedTag = { tagName: tagMatch.tagName, content: definition?.defaultContent || '', children: [], attributes: { ...definition?.defaultAttributes, ...tagMatch.attributes }, parent: this.getCurrentParent(), path: this.buildPath(tagMatch.tagName), depth: this.currentDepth + 1, isSelfClosing: false, }; // Add to current parent's children if we have a parent const currentParent = this.getCurrentParent(); if (currentParent && currentParent.children) { currentParent.children.push(newTag); } // Update state this.currentDepth++; this.currentPath = newTag.path || ''; this.stats = { ...this.stats, maxDepthReached: Math.max(this.stats.maxDepthReached || 0, this.currentDepth), totalNestedTags: (this.stats.totalNestedTags || 0) + 1, }; // Push to stack this.tagStack.push({ tag: newTag, startIndex: tagMatch.startIndex, depth: this.currentDepth, path: newTag.path || '', }); // Emit events this.emit('tag_opened', newTag as unknown as Partial<T>, this.currentDepth, newTag.path || ''); this.emit('tag_started', newTag.tagName as T['tagName'], newTag.attributes as T['attributes']); definition?.onStart?.(newTag.tagName as T['tagName'], newTag.attributes as T['attributes']); } /** * Handle closing tag in nested mode */ private handleClosingTag(tagMatch: TagMatch): void { if (this.tagStack.length === 0) { throw new ParserError( `Unexpected closing tag: ${tagMatch.tagName}`, ParserErrorCode.MISMATCHED_CLOSING_TAG ); } const currentEntry = this.tagStack[this.tagStack.length - 1]!; // Check if closing tag matches the most recent opening tag if (currentEntry.tag.tagName !== tagMatch.tagName) { if (this.config.autoCloseUnclosed) { // Auto-close intervening tags while ( this.tagStack.length > 0 && this.tagStack[this.tagStack.length - 1]!.tag.tagName !== tagMatch.tagName ) { this.autoCloseTag(); } } else { throw ParserError.fromMismatchedClosing(currentEntry.tag.tagName, tagMatch.tagName); } } // Pop from stack and complete the tag const completedEntry = this.tagStack.pop()!; this.completeTag(completedEntry.tag); // Update state this.currentDepth--; this.currentPath = this.tagStack.length > 0 ? this.tagStack[this.tagStack.length - 1]!.path : ''; } /** * Auto-close unclosed tag */ private autoCloseTag(): void { if (this.tagStack.length === 0) return; const entry = this.tagStack.pop()!; this.completeTag(entry.tag); this.currentDepth--; this.currentPath = this.tagStack.length > 0 ? this.tagStack[this.tagStack.length - 1]!.path : ''; } /** * Complete a tag with validation and transformation */ private completeTag(tag: NestedTag): void { const definition = this.tagRegistry.get(tag.tagName); if (definition) { TagValidator.validate(tag, definition); TagTransformer.transform(tag, definition); definition.onComplete?.(tag as unknown as T); } this.stats = { ...this.stats, totalTagsParsed: this.stats.totalTagsParsed + 1 }; this.emit('tag_closed', tag as unknown as T, tag.depth || 0, tag.path || ''); this.emit('tag_completed', tag as unknown as T); // Emit subtree_completed if tag has children (indicating a completed subtree) if (tag.children && tag.children.length > 0) { this.emit('subtree_completed', tag as unknown as T, tag.depth || 0); } } /** * Handle self-closing tag in nested mode */ private handleSelfClosingTag(tagMatch: TagMatch): void { const definition = this.tagRegistry.get(tagMatch.tagName); // Create self-closing tag const tag: NestedTag = { tagName: tagMatch.tagName, content: definition?.defaultContent || '', children: [], attributes: { ...definition?.defaultAttributes, ...tagMatch.attributes }, parent: this.getCurrentParent(), path: this.buildPath(tagMatch.tagName), depth: this.currentDepth + 1, isSelfClosing: true, }; // Add to current parent's children if we have a parent const currentParent = this.getCurrentParent(); if (currentParent && currentParent.children) { currentParent.children.push(tag); } // Complete the tag immediately this.completeTag(tag); } /** * Get current parent tag from stack */ private getCurrentParent(): NestedTag | undefined { return this.tagStack.length > 0 ? this.tagStack[this.tagStack.length - 1]!.tag : undefined; } /** * Build path string for current tag */ private buildPath(tagName: string): string { return this.currentPath ? `${this.currentPath}/${tagName}` : tagName; } /** * Handle text content between tags */ private handleTextContent(textContent: string): void { if (!textContent || textContent.trim() === '') return; const currentParent = this.getCurrentParent(); if (currentParent) { // Add text content to current parent if (currentParent.content) { currentParent.content += textContent; } else { currentParent.content = textContent; } // Emit with correct parameters: tagName and partialContent this.emit('tag_content_update', currentParent.tagName as T['tagName'], textContent); } } /** * Remove processed content from buffer */ private removeProcessedContent(matches: TagMatch[]): void { if (matches.length === 0) return; // Find the end of the last processed tag const lastMatch = matches[matches.length - 1]!; this.bufferManager.consume(lastMatch.endIndex); } /** * Initialize parser statistics */ private initializeStats(): ParserStats { return { totalTagsParsed: 0, totalBytesProcessed: 0, errorCount: 0, bufferSize: 0, state: ParserState.IDLE, registeredTagsCount: 0, maxDepthReached: 0, totalNestedTags: 0, }; } /** * Update parser statistics */ private updateStats(): void { this.stats = { ...this.stats, totalBytesProcessed: this.bufferManager.getTotalProcessed(), bufferSize: this.bufferManager.getSize(), registeredTagsCount: this.tagRegistry.size, }; this.emit('stats_updated', this.stats); } /** * Emit error event with context */ private emitError(error: ParserError, context: unknown): void { this.emit('parse_error', error, context); } /** * Get current parser state */ getState(): ParserState { return this.state; } /** * Get parser statistics */ getStats(): Readonly<ParserStats> { return { ...this.stats }; } /** * Get current parsing depth (nested mode only) */ getCurrentDepth(): number { return this.currentDepth; } /** * Get current parsing path (nested mode only) */ getCurrentPath(): string { return this.currentPath; } /** * Get buffer size */ getBufferSize(): number { return this.bufferManager.getSize(); } /** * Clear buffer and reset parser state */ reset(): void { this.bufferManager.clear(); this.emit('buffer_cleared'); this.tagStack = []; this.currentDepth = 0; this.currentPath = ''; this.state = ParserState.IDLE; this.stats = this.initializeStats(); this.emit('parser_reset'); } /** * Finalize parsing and auto-close any remaining open tags */ finalize(): void { // Collect root tags before auto-closing const rootTags: T[] = []; for (const entry of this.tagStack) { if (entry.depth === 1) { // Root level tags rootTags.push(entry.tag as unknown as T); } } // Auto-close any remaining open tags in nested mode while (this.tagStack.length > 0) { this.autoCloseTag(); } this.state = ParserState.COMPLETED; this.updateStats(); // Emit document completed with root tags if (rootTags.length > 0) { this.emit('document_completed', rootTags); } this.emit('parsing_finalized', this.stats); } }