UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

393 lines (392 loc) 13.1 kB
/** * MDocument - Main Document Processing Class * * Provides a fluent interface for document processing using the Factory + Registry pattern. * Supports various document types, chunking strategies, and metadata extraction. * * @example * ```typescript * const doc = await MDocument.fromText(content); * const chunks = await doc.chunk({ * strategy: 'recursive', * config: { maxSize: 1000, overlap: 200 } * }); * const enriched = await doc.extractMetadata({ * title: true, * summary: true, * keywords: true * }); * ``` */ import { randomUUID } from "crypto"; import { logger } from "../../utils/logger.js"; import { ChunkerRegistry } from "../chunking/chunkerRegistry.js"; import { LLMMetadataExtractor } from "../metadata/metadataExtractor.js"; /** * MDocument class for comprehensive document processing * * Provides a chainable API for: * - Loading documents from various sources * - Chunking with multiple strategies * - Metadata extraction using LLMs * - Embedding generation */ export class MDocument { state; documentId; /** * Create a new MDocument instance * @param content - Document content * @param config - Document configuration */ constructor(content, config) { this.documentId = randomUUID(); this.state = { content, type: config?.type ?? 
"text", metadata: { ...config?.metadata, documentId: this.documentId, createdAt: new Date().toISOString(), }, chunks: [], embeddings: [], history: ["created"], }; } // ============================================================================ // Static Factory Methods // ============================================================================ /** * Create MDocument from plain text * @param text - Plain text content * @param metadata - Optional metadata * @returns MDocument instance */ static fromText(text, metadata) { return new MDocument(text, { type: "text", metadata }); } /** * Create MDocument from markdown content * @param markdown - Markdown content * @param metadata - Optional metadata * @returns MDocument instance */ static fromMarkdown(markdown, metadata) { return new MDocument(markdown, { type: "markdown", metadata }); } /** * Create MDocument from HTML content * @param html - HTML content * @param metadata - Optional metadata * @returns MDocument instance */ static fromHTML(html, metadata) { return new MDocument(html, { type: "html", metadata }); } /** * Create MDocument from JSON content * @param json - JSON string or object * @param metadata - Optional metadata * @returns MDocument instance */ static fromJSONContent(json, metadata) { const content = typeof json === "string" ? 
json : JSON.stringify(json, null, 2); return new MDocument(content, { type: "json", metadata }); } /** * Create MDocument from LaTeX content * @param latex - LaTeX content * @param metadata - Optional metadata * @returns MDocument instance */ static fromLaTeX(latex, metadata) { return new MDocument(latex, { type: "latex", metadata }); } /** * Create MDocument from CSV content * @param csv - CSV content * @param metadata - Optional metadata * @returns MDocument instance */ static fromCSV(csv, metadata) { return new MDocument(csv, { type: "csv", metadata }); } // ============================================================================ // Core Processing Methods // ============================================================================ /** * Chunk the document using specified strategy * @param params - Chunking parameters * @returns This MDocument instance (for chaining) */ async chunk(params) { const { strategy = this.getDefaultStrategy(), config = {} } = params || {}; logger.debug("[MDocument] Chunking document", { documentId: this.documentId, strategy, contentLength: this.state.content.length, }); const chunker = ChunkerRegistry.get(strategy); // Merge document metadata into chunk config const chunkConfig = { ...config, metadata: { ...config.metadata, source: this.state.metadata.source, documentType: this.state.type, }, }; this.state.chunks = await chunker.chunk(this.state.content, chunkConfig); this.state.history.push(`chunked:${strategy}`); logger.info("[MDocument] Document chunked", { documentId: this.documentId, strategy, chunkCount: this.state.chunks.length, }); return this; } /** * Extract metadata from chunks using LLM * @param params - Extraction parameters * @param options - Extractor options * @returns This MDocument instance (for chaining) */ async extractMetadata(params, options) { if (this.state.chunks.length === 0) { logger.warn("[MDocument] No chunks to extract metadata from. 
Call chunk() first."); return this; } logger.debug("[MDocument] Extracting metadata", { documentId: this.documentId, chunkCount: this.state.chunks.length, params: Object.keys(params), }); const extractor = new LLMMetadataExtractor(options); const results = await extractor.extract(this.state.chunks, params); // Merge extraction results into chunk metadata for (let i = 0; i < this.state.chunks.length && i < results.length; i++) { const result = results[i]; if (result.title) { this.state.chunks[i].metadata.title = result.title; } if (result.summary) { this.state.chunks[i].metadata.summary = result.summary; } if (result.keywords) { this.state.chunks[i].metadata.keywords = result.keywords; } if (result.custom) { this.state.chunks[i].metadata.custom = { ...(this.state.chunks[i].metadata.custom || {}), ...result.custom, }; } } this.state.history.push(`metadata:${Object.keys(params).join(",")}`); logger.info("[MDocument] Metadata extracted", { documentId: this.documentId, extractedFields: Object.keys(params), }); return this; } /** * Generate embeddings for all chunks * @param provider - Embedding provider name * @param modelName - Embedding model name * @returns This MDocument instance (for chaining) */ async embed(provider = "openai", modelName = "text-embedding-3-small") { if (this.state.chunks.length === 0) { logger.warn("[MDocument] No chunks to embed. 
Call chunk() first."); return this; } // Lazy import to avoid circular dependencies const { ProviderFactory } = await import("../../factories/providerFactory.js"); logger.debug("[MDocument] Generating embeddings", { documentId: this.documentId, chunkCount: this.state.chunks.length, provider, model: modelName, }); const embeddingProvider = await ProviderFactory.createProvider(provider, modelName); if (typeof embeddingProvider.embed !== "function") { throw new Error(`Provider ${provider} does not support embeddings`); } this.state.embeddings = []; for (const chunk of this.state.chunks) { const embedding = await embeddingProvider.embed(chunk.text); this.state.embeddings.push(embedding); chunk.embedding = embedding; } this.state.history.push(`embedded:${provider}:${modelName}`); logger.info("[MDocument] Embeddings generated", { documentId: this.documentId, embeddingCount: this.state.embeddings.length, dimension: this.state.embeddings[0]?.length, }); return this; } // ============================================================================ // Accessor Methods // ============================================================================ /** * Get document ID */ getId() { return this.documentId; } /** * Get raw document content */ getContent() { return this.state.content; } /** * Get document type */ getType() { return this.state.type; } /** * Get document metadata */ getMetadata() { return { ...this.state.metadata }; } /** * Get processed chunks */ getChunks() { return [...this.state.chunks]; } /** * Get chunk embeddings */ getEmbeddings() { return [...this.state.embeddings]; } /** * Get processing history */ getHistory() { return [...this.state.history]; } /** * Check if document has been chunked */ isChunked() { return this.state.chunks.length > 0; } /** * Check if document has embeddings */ hasEmbeddings() { return this.state.embeddings.length > 0; } /** * Get chunk count */ getChunkCount() { return this.state.chunks.length; } // 
============================================================================ // Transformation Methods // ============================================================================ /** * Set document metadata * @param key - Metadata key * @param value - Metadata value * @returns This MDocument instance (for chaining) */ setMetadata(key, value) { this.state.metadata[key] = value; return this; } /** * Merge metadata into document * @param metadata - Metadata to merge * @returns This MDocument instance (for chaining) */ mergeMetadata(metadata) { this.state.metadata = { ...this.state.metadata, ...metadata }; return this; } /** * Filter chunks based on predicate * @param predicate - Filter function * @returns New MDocument with filtered chunks */ filterChunks(predicate) { const doc = new MDocument(this.state.content, { type: this.state.type, metadata: this.state.metadata, }); doc.state.chunks = this.state.chunks.filter(predicate); doc.state.embeddings = this.state.embeddings.filter((_, i) => predicate(this.state.chunks[i])); doc.state.history = [...this.state.history, "filtered"]; return doc; } /** * Map transformation over chunks * @param transform - Transform function * @returns New MDocument with transformed chunks */ mapChunks(transform) { const doc = new MDocument(this.state.content, { type: this.state.type, metadata: this.state.metadata, }); doc.state.chunks = this.state.chunks.map(transform); doc.state.embeddings = [...this.state.embeddings]; doc.state.history = [...this.state.history, "mapped"]; return doc; } // ============================================================================ // Serialization Methods // ============================================================================ /** * Convert to plain object for serialization */ toJSON() { return { id: this.documentId, content: this.state.content, type: this.state.type, metadata: this.state.metadata, chunks: this.state.chunks, history: this.state.history, }; } /** * Create MDocument from serialized 
JSON * @param json - Serialized document data * @returns MDocument instance */ static fromJSON(json) { const doc = new MDocument(json.content, { type: json.type, metadata: json.metadata, }); if (json.id) { doc.documentId = json.id; } if (json.chunks) { doc.state.chunks = json.chunks; } if (json.history) { doc.state.history = json.history; } return doc; } // ============================================================================ // Private Helper Methods // ============================================================================ /** * Get default chunking strategy based on document type */ getDefaultStrategy() { return ChunkerRegistry.getRecommendedStrategy(this.state.type); } }