@chainlink/mcp-server
Prototype MCP Server for CLL
"use strict";
/**
* @fileoverview Document Processing Pipeline for Vector Database Ingestion
*
* Provides document processing capabilities for converting raw text documents
* into vector embeddings suitable for storage in the vector database. Handles
* text chunking, embedding generation, and batch processing of document
* collections using LlamaIndex and configurable embedding providers.
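*
* End-to-end usage sketch (illustrative only; the module path and file paths
* below are placeholders, not values shipped with this package):
*
* @example
* // Inside an async context:
* const { DocumentProcessor } = require("./pipeline");
* const processor = new DocumentProcessor();
* await processor.initialize();
* // Chunk and embed every .md/.txt file in a directory of scraped docs
* const entries = await processor.processDirectory("./scraped-docs", "ccip");
* // Optionally cache the results before inserting them into the vector database
* await processor.saveEntriesToJson(entries, "./ccip-entries.json");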
*/
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.DocumentProcessor = void 0;
const llamaindex_1 = require("llamaindex");
const logger_1 = require("../utils/logger");
const embedding_provider_1 = require("./embedding-provider");
const promises_1 = __importDefault(require("fs/promises"));
const path_1 = __importDefault(require("path"));
/**
* Document processor for converting text documents into vector embeddings
*
* Orchestrates the complete document processing pipeline including text
* chunking, embedding generation, and database entry creation. Provides
* both single document and batch directory processing capabilities with
* configurable chunking parameters and multiple embedding provider support.
*
* Features:
* - Intelligent text chunking with configurable overlap
* - Multiple embedding provider support (OpenAI, Ollama)
* - Batch processing for entire directories
* - Metadata preservation
* - JSON serialization for intermediate storage
* - Comprehensive error handling and logging
*
* Processing workflow:
* 1. Read document content from filesystem
* 2. Split text into chunks using SentenceSplitter
* 3. Generate vector embeddings for each chunk
* 4. Create DbDocEntry objects with metadata
* 5. Return entries ready for database insertion
*
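* Example of a produced entry (values are illustrative; the authoritative
* DbDocEntry type lives in the database layer, this only mirrors the object
* literal built in processDocument below):
*
* @example
* // {
* //   chunkId: "guide.md_chunk_0",
* //   metadata: {
* //     sourceDocId: "guide.md",
* //     sourceDocFilename: "guide.md",
* //     sourceDocHash: "<llamaindex node hash>",
* //     product: "ccip",
* //   },
* //   vector: [0.0123, -0.0456, ...],
* //   chunkType: "text",
* //   chunkText: "First chunk of guide.md ...",
* //   chunkHash: "<llamaindex node hash>",
* // }
*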
* @class DocumentProcessor
*/
class DocumentProcessor {
embedManager;
chunkSize = 700;
chunkOverlap = 50;
/**
* Initialize the document processor with an embedding manager
*/
constructor() {
this.embedManager = (0, embedding_provider_1.createEmbeddingManager)();
}
/**
* Initialize the document processor and embedding provider
*
* Sets up the embedding provider and validates configuration.
* Must be called before processing any documents.
*
* @returns Promise that resolves when initialization is complete
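*
* @example
* // Typical startup (sketch):
* const processor = new DocumentProcessor();
* await processor.initialize(); // must resolve before processDocument/processDirectory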
*/
async initialize() {
await this.embedManager.initialize();
logger_1.Logger.log("info", `Document processor initialized with ${this.embedManager.getProvider()} embeddings`);
}
/**
* Process a single document into vector database entries
*
* Reads the specified document, splits it into chunks using LlamaIndex,
* generates embeddings for each chunk, and creates database entries with
* appropriate metadata.
*
* @param filePath - Path to the document file to process
* @param product - Chainlink product identifier (default: "ccip") // TODO @dev: Expand beyond CCIP
* @returns Promise resolving to array of database entries ready for insertion
* @throws {Error} When file reading, chunking, or embedding generation fails
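*
* Usage sketch (the file path is a placeholder):
* @example
* // After initialize() has resolved:
* const entries = await processor.processDocument("./scraped-docs/ccip-overview.md", "ccip");
* // Each entry pairs the chunk text with its embedding vector and source metadata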
*/
// TODO @dev: Default to CCIP for now, expand to support other products
async processDocument(filePath, product = "ccip") {
try {
logger_1.Logger.log("info", `Processing document: ${filePath}`);
// Read the document
const content = await promises_1.default.readFile(filePath, "utf-8");
const filename = path_1.default.basename(filePath);
// Infer product from filename if possible (e.g., *_CRE Documentation.md)
const lower = filename.toLowerCase();
let inferredProduct = product;
if (lower.includes("_cre documentation")) {
inferredProduct = "cre";
}
else if (lower.includes("data feeds documentation") ||
lower.includes("_data feeds documentation")) {
inferredProduct = "data-feeds";
}
else if (lower.includes("ccip")) {
inferredProduct = "ccip";
}
// Create Document object
const document = new llamaindex_1.Document({
text: content,
metadata: {
source: filename,
product: inferredProduct,
},
});
// Create ingestion pipeline for text splitting only
const pipeline = new llamaindex_1.IngestionPipeline({
transformations: [
new llamaindex_1.SentenceSplitter({
chunkSize: this.chunkSize,
chunkOverlap: this.chunkOverlap,
}),
],
});
logger_1.Logger.log("info", "Running ingestion pipeline...");
console.time("Pipeline Run Time");
const nodes = await pipeline.run({ documents: [document] });
console.timeEnd("Pipeline Run Time");
logger_1.Logger.log("info", `Generated ${nodes.length} chunks from ${filename}`);
// Generate embeddings for each chunk
logger_1.Logger.log("info", `Generating embeddings with ${this.embedManager.getProvider()}...`);
const entries = [];
for (let index = 0; index < nodes.length; index++) {
const node = nodes[index];
if (!node)
continue;
const chunkText = node.text || "";
// Generate embedding for this chunk
const embedding = await this.embedManager.getEmbedding(chunkText);
entries.push({
chunkId: node.id_ || `${filename}_chunk_${index}`,
metadata: {
sourceDocId: filename,
sourceDocFilename: filename,
sourceDocHash: node.hash || "",
product: inferredProduct,
},
vector: embedding,
chunkType: node.type || "text",
chunkText,
chunkHash: node.hash || "",
});
}
logger_1.Logger.log("info", `Generated embeddings for ${entries.length} chunks`);
return entries;
}
catch (error) {
logger_1.Logger.log("error", `Failed to process document ${filePath}: ${error}`);
throw error;
}
}
/**
* Process all documents in a directory into vector database entries
*
* Scans the specified directory for markdown and text files, processes
* each file through the document pipeline, and returns combined results.
* Ideal for batch processing of scraped documentation.
*
* @param directoryPath - Path to directory containing documents to process
* @param product - Chainlink product identifier (default: "ccip") // TODO @dev: Expand beyond CCIP
* @returns Promise resolving to array of all database entries from the directory
* @throws {Error} When directory reading or document processing fails
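*
* Usage sketch (the directory path is a placeholder):
* @example
* // After initialize() has resolved:
* const entries = await processor.processDirectory("./scraped-docs", "ccip");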
*/
// TODO @dev: Default to CCIP for now, expand to support other products
async processDirectory(directoryPath, product = "ccip") {
try {
logger_1.Logger.log("info", `Processing directory: ${directoryPath}`);
const files = await promises_1.default.readdir(directoryPath);
const allEntries = [];
// Count only files that are actually chunked; directories and other file types are skipped
let processedFiles = 0;
for (const file of files) {
const filePath = path_1.default.join(directoryPath, file);
const stat = await promises_1.default.stat(filePath);
if (stat.isFile() && (file.endsWith(".md") || file.endsWith(".txt"))) {
const entries = await this.processDocument(filePath, product);
allEntries.push(...entries);
processedFiles++;
}
}
logger_1.Logger.log("info", `Processed ${processedFiles} files, generated ${allEntries.length} total chunks`);
return allEntries;
}
catch (error) {
logger_1.Logger.log("error", `Failed to process directory ${directoryPath}: ${error}`);
throw error;
}
}
/**
* Save processed database entries to JSON file
*
* Serializes the database entries array to JSON format for intermediate
* storage or debugging purposes. Useful for caching processed results
* before database insertion.
*
* @param entries - Array of database entries to save
* @param outputPath - File path where JSON should be written
* @returns Promise that resolves when file is written
* @throws {Error} When file writing fails
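*
* @example
* // Cache a processed run before database insertion (path is illustrative):
* await processor.saveEntriesToJson(entries, "./ccip-entries.json");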
*/
async saveEntriesToJson(entries, outputPath) {
try {
await promises_1.default.writeFile(outputPath, JSON.stringify(entries, null, 2));
logger_1.Logger.log("info", `Saved ${entries.length} entries to ${outputPath}`);
}
catch (error) {
logger_1.Logger.log("error", `Failed to save entries to ${outputPath}: ${error}`);
throw error;
}
}
/**
* Load processed database entries from JSON file
*
* Deserializes database entries from JSON format, useful for loading
* previously processed results or importing data from external sources.
*
* @param inputPath - File path to read JSON from
* @returns Promise resolving to array of loaded database entries
* @throws {Error} When file reading or JSON parsing fails
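*
* @example
* // Reload a previously cached run instead of re-chunking and re-embedding
* // (path is illustrative):
* const entries = await processor.loadEntriesFromJson("./ccip-entries.json");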
*/
async loadEntriesFromJson(inputPath) {
try {
const content = await promises_1.default.readFile(inputPath, "utf-8");
const entries = JSON.parse(content);
logger_1.Logger.log("info", `Loaded ${entries.length} entries from ${inputPath}`);
return entries;
}
catch (error) {
logger_1.Logger.log("error", `Failed to load entries from ${inputPath}: ${error}`);
throw error;
}
}
}
exports.DocumentProcessor = DocumentProcessor;
//# sourceMappingURL=pipeline.js.map