
@chainlink/mcp-server

"use strict"; /** * @fileoverview Document Processing Pipeline for Vector Database Ingestion * * Provides document processing capabilities for converting raw text documents * into vector embeddings suitable for storage in the vector database. Handles * text chunking, embedding generation, and batch processing of document * collections using LlamaIndex and configurable embedding providers. */ var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.DocumentProcessor = void 0; const llamaindex_1 = require("llamaindex"); const logger_1 = require("../utils/logger"); const embedding_provider_1 = require("./embedding-provider"); const promises_1 = __importDefault(require("fs/promises")); const path_1 = __importDefault(require("path")); /** * Document processor for converting text documents into vector embeddings * * Orchestrates the complete document processing pipeline including text * chunking, embedding generation, and database entry creation. Provides * both single document and batch directory processing capabilities with * configurable chunking parameters and multiple embedding provider support. * * Features: * - Intelligent text chunking with configurable overlap * - Multiple embedding provider support (OpenAI, Ollama) * - Batch processing for entire directories * - Metadata preservation * - JSON serialization for intermediate storage * - Comprehensive error handling and logging * * Processing workflow: * 1. Read document content from filesystem * 2. Split text into chunks using SentenceSplitter * 3. Generate vector embeddings for each chunk * 4. Create DbDocEntry objects with metadata * 5. Return entries ready for database insertion * * @class DocumentProcessor */ class DocumentProcessor { embedManager; chunkSize = 700; chunkOverlap = 50; /** * Initialize the document processor with embedding manager */ constructor() { this.embedManager = (0, embedding_provider_1.createEmbeddingManager)(); } /** * Initialize the document processor and embedding provider * * Sets up the embedding provider and validates configuration. * Must be called before processing any documents. * * @returns Promise that resolves when initialization is complete */ async initialize() { await this.embedManager.initialize(); logger_1.Logger.log("info", `Document processor initialized with ${this.embedManager.getProvider()} embeddings`); } /** * Process a single document into vector database entries * * Reads the specified document, splits it into chunks using LlamaIndex, * generates embeddings for each chunk, and creates database entries with * appropriate metadata. 
    /**
     * Process a single document into vector database entries
     *
     * Reads the specified document, splits it into chunks using LlamaIndex,
     * generates embeddings for each chunk, and creates database entries with
     * appropriate metadata.
     *
     * @param filePath - Path to the document file to process
     * @param product - Chainlink product identifier (default: "ccip") // TODO @dev: Expand beyond CCIP
     * @returns Promise resolving to array of database entries ready for insertion
     * @throws {Error} When file reading, chunking, or embedding generation fails
     */
    async processDocument(filePath, product = "ccip" // TODO @dev: Default to CCIP for now, expand to support other products
    ) {
        try {
            logger_1.Logger.log("info", `Processing document: ${filePath}`);
            // Read the document
            const content = await promises_1.default.readFile(filePath, "utf-8");
            const filename = path_1.default.basename(filePath);
            // Infer product from filename if possible (e.g., *_CRE Documentation.md)
            const lower = filename.toLowerCase();
            let inferredProduct = product;
            if (lower.includes("_cre documentation")) {
                inferredProduct = "cre";
            }
            else if (lower.includes("data feeds documentation") || lower.includes("_data feeds documentation")) {
                inferredProduct = "data-feeds";
            }
            else if (lower.includes("ccip")) {
                inferredProduct = "ccip";
            }
            // Create Document object
            const document = new llamaindex_1.Document({
                text: content,
                metadata: {
                    source: filename,
                    product: inferredProduct,
                },
            });
            // Create ingestion pipeline for text splitting only
            const pipeline = new llamaindex_1.IngestionPipeline({
                transformations: [
                    new llamaindex_1.SentenceSplitter({
                        chunkSize: this.chunkSize,
                        chunkOverlap: this.chunkOverlap,
                    }),
                ],
            });
            logger_1.Logger.log("info", "Running ingestion pipeline...");
            console.time("Pipeline Run Time");
            const nodes = await pipeline.run({ documents: [document] });
            console.timeEnd("Pipeline Run Time");
            logger_1.Logger.log("info", `Generated ${nodes.length} chunks from ${filename}`);
            // Generate embeddings for each chunk
            logger_1.Logger.log("info", `Generating embeddings with ${this.embedManager.getProvider()}...`);
            const entries = [];
            for (let index = 0; index < nodes.length; index++) {
                const node = nodes[index];
                if (!node)
                    continue;
                const chunkText = node.text || "";
                // Generate embedding for this chunk
                const embedding = await this.embedManager.getEmbedding(chunkText);
                entries.push({
                    chunkId: node.id_ || `${filename}_chunk_${index}`,
                    metadata: {
                        sourceDocId: filename,
                        sourceDocFilename: filename,
                        sourceDocHash: node.hash || "",
                        product: inferredProduct,
                    },
                    vector: embedding,
                    chunkType: node.type || "text",
                    chunkText,
                    chunkHash: node.hash || "",
                });
            }
            logger_1.Logger.log("info", `Generated embeddings for ${entries.length} chunks`);
            return entries;
        }
        catch (error) {
            logger_1.Logger.log("error", `Failed to process document ${filePath}: ${error}`);
            throw error;
        }
    }
    /**
     * Process all documents in a directory into vector database entries
     *
     * Scans the specified directory for markdown and text files, processes
     * each file through the document pipeline, and returns combined results.
     * Ideal for batch processing of scraped documentation.
     *
     * @param directoryPath - Path to directory containing documents to process
     * @param product - Chainlink product identifier (default: "ccip") // TODO @dev: Expand beyond CCIP
     * @returns Promise resolving to array of all database entries from the directory
     * @throws {Error} When directory reading or document processing fails
     */
    async processDirectory(directoryPath, product = "ccip" // TODO @dev: Default to CCIP for now, expand to support other products
    ) {
        try {
            logger_1.Logger.log("info", `Processing directory: ${directoryPath}`);
            const files = await promises_1.default.readdir(directoryPath);
            const allEntries = [];
            for (const file of files) {
                const filePath = path_1.default.join(directoryPath, file);
                const stat = await promises_1.default.stat(filePath);
                if (stat.isFile() && (file.endsWith(".md") || file.endsWith(".txt"))) {
                    const entries = await this.processDocument(filePath, product);
                    allEntries.push(...entries);
                }
            }
            logger_1.Logger.log("info", `Processed ${files.length} files, generated ${allEntries.length} total chunks`);
            return allEntries;
        }
        catch (error) {
            logger_1.Logger.log("error", `Failed to process directory ${directoryPath}: ${error}`);
            throw error;
        }
    }
    /**
     * Save processed database entries to JSON file
     *
     * Serializes the database entries array to JSON format for intermediate
     * storage or debugging purposes. Useful for caching processed results
     * before database insertion.
     *
     * @param entries - Array of database entries to save
     * @param outputPath - File path where JSON should be written
     * @returns Promise that resolves when file is written
     * @throws {Error} When file writing fails
     */
    async saveEntriesToJson(entries, outputPath) {
        try {
            await promises_1.default.writeFile(outputPath, JSON.stringify(entries, null, 2));
            logger_1.Logger.log("info", `Saved ${entries.length} entries to ${outputPath}`);
        }
        catch (error) {
            logger_1.Logger.log("error", `Failed to save entries to ${outputPath}: ${error}`);
            throw error;
        }
    }
    /**
     * Load processed database entries from JSON file
     *
     * Deserializes database entries from JSON format, useful for loading
     * previously processed results or importing data from external sources.
     *
     * @param inputPath - File path to read JSON from
     * @returns Promise resolving to array of loaded database entries
     * @throws {Error} When file reading or JSON parsing fails
     */
    async loadEntriesFromJson(inputPath) {
        try {
            const content = await promises_1.default.readFile(inputPath, "utf-8");
            const entries = JSON.parse(content);
            logger_1.Logger.log("info", `Loaded ${entries.length} entries from ${inputPath}`);
            return entries;
        }
        catch (error) {
            logger_1.Logger.log("error", `Failed to load entries from ${inputPath}: ${error}`);
            throw error;
        }
    }
}
exports.DocumentProcessor = DocumentProcessor;
//# sourceMappingURL=pipeline.js.map
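
A minimal usage sketch of the class above, not part of the package file: the require path, directory, and output path are illustrative assumptions, while the method calls (initialize, processDirectory, saveEntriesToJson) mirror the workflow documented in the class JSDoc.

// Hypothetical driver script assumed to sit next to the compiled pipeline.js.
const { DocumentProcessor } = require("./pipeline");

async function main() {
    const processor = new DocumentProcessor();
    // Must run before any processing; sets up the configured embedding provider.
    await processor.initialize();
    // Batch-process every .md/.txt file in an (assumed) docs directory into embedding entries.
    const entries = await processor.processDirectory("./docs/ccip", "ccip");
    // Optionally cache the entries as JSON before inserting them into the vector database.
    await processor.saveEntriesToJson(entries, "./entries.json");
    console.log(`Generated ${entries.length} chunks`);
}

main().catch((err) => {
    console.error(err);
    process.exit(1);
});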