UNPKG

rag-system-pgvector

Version:

A complete Retrieval-Augmented Generation system using pgvector, LangChain, and LangGraph for Node.js applications with dynamic embedding and model providers, structured data queries, and chat history — supports OpenAI, Anthropic, HuggingFace, Azure, Google, and more.

627 lines (519 loc) 19.2 kB
import { PGVectorStore } from '@langchain/community/vectorstores/pgvector';
import { Document } from '@langchain/core/documents';
import { v4 as uuidv4 } from 'uuid';

/**
 * Document store backed by a LangChain PGVectorStore (pgvector on PostgreSQL).
 *
 * Responsibilities:
 *  - Persist document metadata in a `documents` table (plain SQL via a pg pool).
 *  - Persist/search document chunks through a LangChain vector store table
 *    (default `document_chunks_vector`).
 *  - Provide similarity search (by embedding or by text), metadata-filtered
 *    retrieval, and maintenance helpers (HNSW index, delete, sessions).
 *
 * The database is optional: without `config.database.pool` the instance runs
 * in "in-memory mode" and every database-backed method throws a descriptive
 * error instead of failing with an undefined reference.
 */
class DocumentStoreLangChain {
  /**
   * @param {object} [config]
   * @param {{pool: import('pg').Pool}} [config.database] - Optional pg pool; omit for in-memory mode.
   * @param {object} config.embeddings - LangChain Embeddings instance used by the vector store.
   * @param {object} [config.vectorStore] - Table/column name overrides for the vector store.
   */
  constructor(config = {}) {
    this.config = config;
    // Database is optional
    this.pool = config.database?.pool || null;
    this.embeddings = config.embeddings;
    this.hasDatabase = !!this.pool;

    // Only create vectorStoreConfig if we have a database
    if (this.hasDatabase) {
      this.vectorStoreConfig = {
        pool: this.pool,
        tableName: config.vectorStore?.tableName || 'document_chunks_vector',
        columns: {
          idColumnName: 'id',
          vectorColumnName: config.vectorStore?.vectorColumnName || 'embedding',
          contentColumnName: config.vectorStore?.contentColumnName || 'content',
          metadataColumnName: config.vectorStore?.metadataColumnName || 'metadata',
        },
        distanceStrategy: 'cosine',
      };
    } else {
      this.vectorStoreConfig = null;
      console.log('⚠️ DocumentStore initialized without database. In-memory mode only.');
    }

    this.vectorStore = null;
  }

  /**
   * Guard for database-backed operations.
   * @param {string} action - Verb phrase used in the error message, e.g. "save document".
   * @throws {Error} When no database pool was configured.
   */
  #requireDatabase(action) {
    if (!this.hasDatabase) {
      throw new Error(`Cannot ${action} without database configuration. Please provide database config when creating RAGSystem.`);
    }
  }

  /**
   * Lazily initialize the underlying PGVectorStore (idempotent).
   * @returns {Promise<PGVectorStore>}
   * @throws {Error} When no database is configured.
   */
  async initializeVectorStore() {
    if (!this.vectorStore) {
      this.#requireDatabase('initialize vector store');
      // BUGFIX: the table name was previously hard-coded to
      // 'document_chunks_vector' here, silently overriding the
      // config.vectorStore.tableName option the constructor accepts.
      // We now honor the configured name (default is unchanged).
      console.log('Vector store config:', this.vectorStoreConfig);
      this.vectorStore = await PGVectorStore.initialize(
        this.embeddings,
        this.vectorStoreConfig
      );
    }
    return this.vectorStore;
  }

  /**
   * Eagerly initialize the vector store. Returns `this` for chaining.
   * @returns {Promise<DocumentStoreLangChain>}
   */
  async initialize() {
    console.log('Initializing LangChain vector store...');
    await this.initializeVectorStore();
    console.log('✓ LangChain vector store initialized');
    return this;
  }

  /**
   * Save document metadata (transactional) and then its chunks (separate step).
   * @param {{title: string, content: string, filePath: string, fileType: string,
   *          metadata: object, chunks: Array<{content: string, index: number, metadata?: object}>}} documentData
   * @returns {Promise<{documentId: string, success: boolean}>}
   * @throws {Error} On missing database config or on any SQL failure (transaction is rolled back).
   */
  async saveDocument(documentData) {
    this.#requireDatabase('save document');
    const client = await this.pool.connect();
    try {
      await client.query('BEGIN');
      const documentId = uuidv4();
      const documentQuery = `
        INSERT INTO documents (id, title, content, file_path, file_type, metadata)
        VALUES ($1, $2, $3, $4, $5, $6)
        RETURNING id;
      `;
      await client.query(documentQuery, [
        documentId,
        documentData.title,
        documentData.content,
        documentData.filePath,
        documentData.fileType,
        documentData.metadata,
      ]);
      await client.query('COMMIT');
      console.log('✓ Document metadata saved to documents table');

      // Now add chunks to the LangChain vector store (separate transaction)
      await this.addDocumentChunks(documentData, documentId);

      return { documentId, success: true };
    } catch (error) {
      await client.query('ROLLBACK');
      console.error('Error saving document:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Convert chunks to LangChain Documents and insert them into the vector store.
   * Metadata precedence: chunk metadata < document metadata < standard keys.
   * @returns {Promise<{chunksAdded: number, success: boolean}>}
   */
  async addDocumentChunks(documentData, documentId) {
    try {
      await this.initializeVectorStore();

      const documents = documentData.chunks.map((chunk) => {
        return new Document({
          pageContent: chunk.content,
          metadata: {
            // Preserve original chunk metadata
            ...chunk.metadata,
            // Add document-level metadata for filtering
            ...documentData.metadata,
            // Add standard metadata
            documentId: documentId,
            chunkIndex: chunk.index,
            title: documentData.title,
            filePath: documentData.filePath,
            fileType: documentData.fileType,
          },
        });
      });

      const chunkIds = documentData.chunks.map(() => uuidv4());
      await this.vectorStore.addDocuments(documents, { ids: chunkIds });
      console.log(`✓ Added ${documents.length} chunks to vector store`);

      return { chunksAdded: documents.length, success: true };
    } catch (error) {
      console.error('Error adding document chunks to vector store:', error);
      throw error;
    }
  }

  /**
   * Similarity search using a precomputed embedding vector.
   * @param {number[]} queryEmbedding
   * @param {number} [limit=10]
   * @param {number} [threshold=0.5] - Minimum score; lower-scoring hits are dropped.
   * @returns {Promise<Array<object>>} Formatted chunk records with a `similarity` field.
   */
  async searchSimilarChunks(queryEmbedding, limit = 10, threshold = 0.5) {
    try {
      await this.initializeVectorStore();
      const results = await this.vectorStore.similaritySearchVectorWithScore(
        queryEmbedding,
        limit
      );
      return results
        .filter(([, score]) => score >= threshold)
        .map(([doc, score]) => ({
          id: doc.metadata.id || uuidv4(),
          content: doc.pageContent,
          chunk_index: doc.metadata.chunkIndex,
          metadata: doc.metadata,
          title: doc.metadata.title,
          file_path: doc.metadata.filePath,
          file_type: doc.metadata.fileType,
          similarity: score,
        }));
    } catch (error) {
      console.error('Error searching similar chunks:', error);
      throw error;
    }
  }

  /**
   * Similarity search from raw query text (embedding computed by the store).
   * Same result shape and threshold semantics as {@link searchSimilarChunks}.
   */
  async searchSimilarChunksByText(queryText, limit = 10, threshold = 0.5) {
    try {
      await this.initializeVectorStore();
      const results = await this.vectorStore.similaritySearchWithScore(
        queryText,
        limit
      );
      return results
        .filter(([, score]) => score >= threshold)
        .map(([doc, score]) => ({
          id: doc.metadata.id || uuidv4(),
          content: doc.pageContent,
          chunk_index: doc.metadata.chunkIndex,
          metadata: doc.metadata,
          title: doc.metadata.title,
          file_path: doc.metadata.filePath,
          file_type: doc.metadata.fileType,
          similarity: score,
        }));
    } catch (error) {
      console.error('Error searching similar chunks by text:', error);
      throw error;
    }
  }

  /**
   * Text search with a composable metadata filter.
   * `userId` / `knowledgebotId` shortcuts and any extra option keys are merged
   * into the filter object, then delegated to {@link retrieveDocuments}.
   */
  async searchSimilarChunksByTextWithFilter(queryText, options = {}) {
    try {
      await this.initializeVectorStore();

      const {
        limit = 10,
        threshold = 0.5,
        filter = {},
        userId = null,
        knowledgebotId = null,
        ...additionalFilters
      } = options;

      const searchFilter = { ...filter, ...additionalFilters };
      if (userId) {
        searchFilter.userId = userId;
      }
      if (knowledgebotId) {
        searchFilter.knowledgebotId = knowledgebotId;
      }

      const results = await this.retrieveDocuments(queryText, {
        k: limit,
        filter: searchFilter,
        scoreThreshold: threshold,
      });

      console.log(`✓ Filtered search returned ${results.length} chunks for query: "${queryText}"`);
      if (Object.keys(searchFilter).length > 0) {
        console.log(`✓ Applied filters:`, searchFilter);
      }
      return results;
    } catch (error) {
      console.error('Error searching with filter:', error);
      throw error;
    }
  }

  /** Convenience: filtered search scoped to a single user. */
  async searchByUserId(queryText, userId, options = {}) {
    return this.searchSimilarChunksByTextWithFilter(queryText, {
      ...options,
      userId: userId,
    });
  }

  /** Convenience: filtered search scoped to a single knowledgebot. */
  async searchByKnowledgebotId(queryText, knowledgebotId, options = {}) {
    return this.searchSimilarChunksByTextWithFilter(queryText, {
      ...options,
      knowledgebotId: knowledgebotId,
    });
  }

  /** Convenience: filtered search with an arbitrary filter object. */
  async searchWithMultipleFilters(queryText, filters = {}, options = {}) {
    return this.searchSimilarChunksByTextWithFilter(queryText, {
      ...options,
      ...filters,
    });
  }

  /**
   * List chunks belonging to a user via a metadata-filtered similarity search.
   * NOTE(review): uses an empty query string to "get all" — whether an empty
   * query embeds meaningfully depends on the embedding provider; confirm.
   */
  async getDocumentsByUserId(userId, options = {}) {
    try {
      await this.initializeVectorStore();
      const { limit = 100 } = options;

      const results = await this.vectorStore.similaritySearch(
        '', // Empty query to get all
        limit,
        { userId: userId }
      );

      const formattedResults = results.map((doc, index) => ({
        id: doc.metadata.id || uuidv4(),
        content: doc.pageContent,
        chunk_index: doc.metadata.chunkIndex || index,
        metadata: doc.metadata,
        title: doc.metadata.title || 'Unknown',
        file_path: doc.metadata.filePath || '',
        file_type: doc.metadata.fileType || '',
        documentId: doc.metadata.documentId,
      }));

      console.log(`✓ Retrieved ${formattedResults.length} documents for userId: ${userId}`);
      return formattedResults;
    } catch (error) {
      console.error('Error getting documents by userId:', error);
      throw error;
    }
  }

  /**
   * List chunks belonging to a knowledgebot. Same caveat as
   * {@link getDocumentsByUserId} regarding the empty query string.
   */
  async getDocumentsByKnowledgebotId(knowledgebotId, options = {}) {
    try {
      await this.initializeVectorStore();
      const { limit = 100 } = options;

      const results = await this.vectorStore.similaritySearch(
        '', // Empty query to get all
        limit,
        { knowledgebotId: knowledgebotId }
      );

      const formattedResults = results.map((doc, index) => ({
        id: doc.metadata.id || uuidv4(),
        content: doc.pageContent,
        chunk_index: doc.metadata.chunkIndex || index,
        metadata: doc.metadata,
        title: doc.metadata.title || 'Unknown',
        file_path: doc.metadata.filePath || '',
        file_type: doc.metadata.fileType || '',
        documentId: doc.metadata.documentId,
      }));

      console.log(`✓ Retrieved ${formattedResults.length} documents for knowledgebotId: ${knowledgebotId}`);
      return formattedResults;
    } catch (error) {
      console.error('Error getting documents by knowledgebotId:', error);
      throw error;
    }
  }

  /**
   * Retriever-based document fetch (LangChain `asRetriever`).
   * @param {string} queryText
   * @param {{k?: number, filter?: object, searchType?: string, scoreThreshold?: number}} [options]
   * @returns {Promise<Array<object>>}
   */
  async retrieveDocuments(queryText, options = {}) {
    try {
      await this.initializeVectorStore();

      const {
        k = 10,
        filter = {},
        searchType = 'similarity',
        scoreThreshold = 0.1,
      } = options;

      const retriever = this.vectorStore.asRetriever({
        k: k,
        filter: filter,
        searchType: searchType,
        searchKwargs: { scoreThreshold: scoreThreshold },
      });

      const documents = await retriever.invoke(queryText);
      console.log(`✓ Retrieved ${documents.length} documents using retriever for query: "${queryText}"`);

      return documents.map((doc, index) => ({
        id: doc.metadata.id || uuidv4(),
        content: doc.pageContent,
        chunk_index: doc.metadata.chunkIndex || index,
        metadata: doc.metadata,
        title: doc.metadata.title || 'Unknown',
        file_path: doc.metadata.filePath || '',
        file_type: doc.metadata.fileType || '',
        similarity: doc.metadata.score || 1.0, // Retriever doesn't always return scores
      }));
    } catch (error) {
      console.error('Error retrieving documents:', error);
      throw error;
    }
  }

  /**
   * Retriever fetch with type/source filters plus an in-process date-range
   * filter. Over-fetches (2*k) before post-filtering, then trims to k.
   */
  async retrieveWithFilters(queryText, options = {}) {
    try {
      await this.initializeVectorStore();

      const {
        k = 10,
        documentTypes = [], // Filter by document types
        dateRange = null, // Filter by date range
        sources = [], // Filter by specific sources
        scoreThreshold = 0.1,
      } = options;

      const filter = {};
      if (documentTypes.length > 0) {
        filter.fileType = { in: documentTypes };
      }
      if (sources.length > 0) {
        filter.title = { in: sources };
      }

      const retriever = this.vectorStore.asRetriever({
        k: k * 2, // Get more results initially for better filtering
        filter: filter,
        searchType: 'similarity_score_threshold',
        searchKwargs: { scoreThreshold: scoreThreshold },
      });

      const documents = await retriever.invoke(queryText);

      let processedResults = documents.map((doc, index) => ({
        id: doc.metadata.id || uuidv4(),
        content: doc.pageContent,
        chunk_index: doc.metadata.chunkIndex || index,
        metadata: doc.metadata,
        title: doc.metadata.title || 'Unknown',
        file_path: doc.metadata.filePath || '',
        file_type: doc.metadata.fileType || '',
        similarity: doc.metadata.score || 1.0,
      }));

      if (dateRange) {
        processedResults = processedResults.filter((result) => {
          const createdAt = new Date(result.metadata.createdAt || 0);
          return createdAt >= dateRange.start && createdAt <= dateRange.end;
        });
      }

      processedResults = processedResults.slice(0, k);
      console.log(`✓ Retrieved ${processedResults.length} filtered documents using enhanced retriever`);
      return processedResults;
    } catch (error) {
      console.error('Error in enhanced retrieval:', error);
      throw error;
    }
  }

  /**
   * Fetch a document row plus its chunks from the vector store.
   * BUGFIX: previously called `pool.connect()` on an undefined global `pool`;
   * now uses `this.pool` and guards against missing database config.
   * @returns {Promise<object|null>} null when the id does not exist.
   */
  async getDocumentById(documentId) {
    this.#requireDatabase('get document');
    const client = await this.pool.connect();
    try {
      const documentQuery = `
        SELECT * FROM documents WHERE id = $1;
      `;
      const documentResult = await client.query(documentQuery, [documentId]);
      if (documentResult.rows.length === 0) {
        return null;
      }

      await this.initializeVectorStore();
      const chunks = await this.vectorStore.similaritySearch(
        documentResult.rows[0].title, // Use title as query to find related chunks
        100, // Get many chunks
        { documentId: documentId } // Filter by document ID
      );

      return {
        ...documentResult.rows[0],
        chunks: chunks.map((doc) => ({
          content: doc.pageContent,
          metadata: doc.metadata,
          chunk_index: doc.metadata.chunkIndex,
        })),
      };
    } catch (error) {
      console.error('Error getting document:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * List all documents with their chunk counts.
   * BUGFIX: uses `this.pool` instead of an undefined global `pool`.
   */
  async getAllDocuments() {
    this.#requireDatabase('get all documents');
    const client = await this.pool.connect();
    try {
      const query = `
        SELECT d.*,
               COUNT(CASE WHEN dcv.metadata->>'documentId' = d.id::text THEN 1 END) as chunk_count
        FROM documents d
        LEFT JOIN document_chunks_vector dcv ON dcv.metadata->>'documentId' = d.id::text
        GROUP BY d.id
        ORDER BY d.created_at DESC;
      `;
      const result = await client.query(query);
      return result.rows;
    } catch (error) {
      console.error('Error getting all documents:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Delete a document and its vector chunks (single SQL transaction).
   * BUGFIX: uses `this.pool` instead of an undefined global `pool`.
   * @returns {Promise<object|undefined>} The deleted document row, if any.
   */
  async deleteDocument(documentId) {
    this.#requireDatabase('delete document');
    const client = await this.pool.connect();
    try {
      await client.query('BEGIN');

      await this.initializeVectorStore();
      // Find all chunks for this document and delete them
      const vectorResults = await this.vectorStore.similaritySearch(
        '', // Empty query
        1000, // High limit
        { documentId: documentId }
      );

      if (vectorResults.length > 0) {
        // Get the IDs of chunks to delete (this is a limitation - we might need to track IDs separately)
        // For now, we'll delete from the table directly
        await client.query(
          'DELETE FROM document_chunks_vector WHERE metadata->>\'documentId\' = $1',
          [documentId]
        );
      }

      const result = await client.query(
        'DELETE FROM documents WHERE id = $1 RETURNING *',
        [documentId]
      );
      await client.query('COMMIT');
      return result.rows[0];
    } catch (error) {
      await client.query('ROLLBACK');
      console.error('Error deleting document:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Persist a search session (query + results) and return its id.
   * BUGFIX: uses `this.pool` instead of an undefined global `pool`.
   */
  async saveSearchSession(query, results) {
    this.#requireDatabase('save search session');
    const client = await this.pool.connect();
    try {
      const searchQuery = `
        INSERT INTO search_sessions (query, results)
        VALUES ($1, $2)
        RETURNING id;
      `;
      const result = await client.query(searchQuery, [query, results]);
      return result.rows[0].id;
    } catch (error) {
      console.error('Error saving search session:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Create an HNSW index on the vector column for faster ANN search.
   * Best-effort: failures are logged, not thrown, since the index is optional.
   */
  async createHnswIndex() {
    try {
      await this.initializeVectorStore();
      await this.vectorStore.createHnswIndex({
        dimensions: this.config.openai?.embeddingDimensions || 1536,
        efConstruction: 64,
        m: 16,
      });
      console.log('✓ HNSW index created for better performance');
    } catch (error) {
      console.error('Error creating HNSW index:', error);
      // Don't throw error as this is optional
    }
  }

  /** Release the vector store's resources, if it was initialized. */
  async close() {
    if (this.vectorStore) {
      await this.vectorStore.end();
    }
  }
}

export { DocumentStoreLangChain };
export default DocumentStoreLangChain;