rag-system-pgvector
Version:
A complete Retrieval-Augmented Generation system using pgvector, LangChain, and LangGraph for Node.js applications with dynamic embedding and model providers, structured data queries, and chat history — supports OpenAI, Anthropic, HuggingFace, Azure, and Google providers.
627 lines (519 loc) • 19.2 kB
JavaScript
import { PGVectorStore } from '@langchain/community/vectorstores/pgvector';
import { Document } from '@langchain/core/documents';
import { v4 as uuidv4 } from 'uuid';
/**
 * Document store backed by Postgres + pgvector through LangChain's PGVectorStore.
 *
 * Responsibilities:
 *  - persists document metadata in the `documents` table,
 *  - stores/searches embedded chunks in the `document_chunks_vector` table,
 *  - exposes similarity search, retriever-based search, and metadata filtering.
 *
 * The database is optional at construction time: without `config.database.pool`
 * the instance can be created, but every persistence method throws a clear
 * error instead of failing with an undefined-variable ReferenceError.
 *
 * Fixes vs. previous revision:
 *  - getDocumentById / getAllDocuments / deleteDocument / saveSearchSession
 *    referenced an undefined global `pool`; they now use `this.pool` and guard
 *    against the no-database configuration.
 *  - deleteDocument no longer issues an embeddings call just to detect chunk
 *    existence; the DELETE statements are idempotent and run in one transaction.
 *  - `doc.metadata.score || 1.0` became `?? 1.0` so a valid score of 0 is kept.
 */
class DocumentStoreLangChain {
  /**
   * @param {object} [config]
   * @param {object} [config.database] - `{ pool }` holding a `pg` Pool. Optional.
   * @param {object} config.embeddings - LangChain embeddings implementation.
   * @param {object} [config.vectorStore] - table/column name overrides.
   */
  constructor(config = {}) {
    this.config = config;
    // Database is optional; `hasDatabase` gates every persistence method.
    this.pool = config.database?.pool ?? null;
    this.embeddings = config.embeddings;
    this.hasDatabase = !!this.pool;
    // Only create vectorStoreConfig if we have a database.
    if (this.hasDatabase) {
      this.vectorStoreConfig = {
        pool: this.pool,
        tableName: config.vectorStore?.tableName || 'document_chunks_vector',
        columns: {
          idColumnName: 'id',
          vectorColumnName: config.vectorStore?.vectorColumnName || 'embedding',
          contentColumnName: config.vectorStore?.contentColumnName || 'content',
          metadataColumnName: config.vectorStore?.metadataColumnName || 'metadata',
        },
        distanceStrategy: 'cosine',
      };
    } else {
      this.vectorStoreConfig = null;
      console.log('⚠️ DocumentStore initialized without database. In-memory mode only.');
    }
    // Lazily created by initializeVectorStore().
    this.vectorStore = null;
  }

  /**
   * Checks out a pg client, throwing a clear error when no database was
   * configured. Callers must `client.release()` in a `finally` block.
   * @returns {Promise<object>} a connected pg client.
   */
  async #checkoutClient() {
    if (!this.hasDatabase) {
      throw new Error('This operation requires database configuration. Please provide database config when creating RAGSystem.');
    }
    return this.pool.connect();
  }

  /**
   * Flat result shape for a scored [Document, score] pair from a
   * similarity-with-score search.
   */
  #scoredResult([doc, score]) {
    return {
      id: doc.metadata.id || uuidv4(),
      content: doc.pageContent,
      chunk_index: doc.metadata.chunkIndex,
      metadata: doc.metadata,
      title: doc.metadata.title,
      file_path: doc.metadata.filePath,
      file_type: doc.metadata.fileType,
      similarity: score,
    };
  }

  /**
   * Common flat result fields shared by the document/retriever formatters,
   * with defensive defaults for missing metadata.
   */
  #baseResult(doc, index) {
    return {
      id: doc.metadata.id || uuidv4(),
      content: doc.pageContent,
      chunk_index: doc.metadata.chunkIndex || index,
      metadata: doc.metadata,
      title: doc.metadata.title || 'Unknown',
      file_path: doc.metadata.filePath || '',
      file_type: doc.metadata.fileType || '',
    };
  }

  /**
   * Lazily initializes the LangChain PGVectorStore over document_chunks_vector.
   * @throws {Error} when no database was configured.
   * @returns {Promise<PGVectorStore>} the (cached) vector store instance.
   */
  async initializeVectorStore() {
    if (!this.vectorStore) {
      if (!this.hasDatabase) {
        throw new Error('Cannot initialize vector store without database configuration. Please provide database config when creating RAGSystem.');
      }
      // Be very explicit about the table configuration.
      const config = {
        ...this.vectorStoreConfig,
        tableName: 'document_chunks_vector', // Force the correct table name
      };
      console.log('Vector store config:', config);
      this.vectorStore = await PGVectorStore.initialize(this.embeddings, config);
    }
    return this.vectorStore;
  }

  /**
   * Eagerly initializes the vector store so later calls don't pay the cost.
   * @returns {Promise<this>}
   */
  async initialize() {
    console.log('Initializing LangChain vector store...');
    await this.initializeVectorStore();
    console.log('✓ LangChain vector store initialized');
    return this;
  }

  /**
   * Persists document metadata and then embeds/stores its chunks.
   * The metadata INSERT is its own transaction; chunk insertion happens after
   * COMMIT (via the vector store) so a chunk failure doesn't roll back metadata.
   * @param {object} documentData - { title, content, filePath, fileType, metadata, chunks }
   * @returns {Promise<{documentId: string, success: boolean}>}
   */
  async saveDocument(documentData) {
    if (!this.hasDatabase) {
      throw new Error('Cannot save document without database configuration. Please provide database config when creating RAGSystem.');
    }
    const client = await this.pool.connect();
    try {
      await client.query('BEGIN');
      const documentId = uuidv4();
      const documentQuery = `
        INSERT INTO documents (id, title, content, file_path, file_type, metadata)
        VALUES ($1, $2, $3, $4, $5, $6)
        RETURNING id;
      `;
      await client.query(documentQuery, [
        documentId,
        documentData.title,
        documentData.content,
        documentData.filePath,
        documentData.fileType,
        documentData.metadata,
      ]);
      await client.query('COMMIT');
      console.log('✓ Document metadata saved to documents table');
      // Chunks go through the vector store in a separate transaction.
      await this.addDocumentChunks(documentData, documentId);
      return { documentId, success: true };
    } catch (error) {
      await client.query('ROLLBACK');
      console.error('Error saving document:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Embeds and stores a document's chunks, enriching each chunk's metadata
   * with document-level fields so searches can filter on them.
   * @returns {Promise<{chunksAdded: number, success: boolean}>}
   */
  async addDocumentChunks(documentData, documentId) {
    try {
      await this.initializeVectorStore();
      const documents = documentData.chunks.map((chunk) => new Document({
        pageContent: chunk.content,
        metadata: {
          // Preserve original chunk metadata.
          ...chunk.metadata,
          // Add document-level metadata for filtering.
          ...documentData.metadata,
          // Standard fields used by the formatters below.
          documentId: documentId,
          chunkIndex: chunk.index,
          title: documentData.title,
          filePath: documentData.filePath,
          fileType: documentData.fileType,
        },
      }));
      // Stable UUIDs for the vector-store rows.
      const chunkIds = documentData.chunks.map(() => uuidv4());
      await this.vectorStore.addDocuments(documents, { ids: chunkIds });
      console.log(`✓ Added ${documents.length} chunks to vector store`);
      return { chunksAdded: documents.length, success: true };
    } catch (error) {
      console.error('Error adding document chunks to vector store:', error);
      throw error;
    }
  }

  /**
   * Similarity search from a precomputed query embedding.
   * NOTE(review): with the cosine distance strategy, PGVectorStore scores are
   * distance-like (lower = closer); verify that `score >= threshold` matches
   * the intended similarity direction before tuning `threshold`.
   * @param {number[]} queryEmbedding
   * @param {number} [limit=10]
   * @param {number} [threshold=0.5]
   */
  async searchSimilarChunks(queryEmbedding, limit = 10, threshold = 0.5) {
    try {
      await this.initializeVectorStore();
      const results = await this.vectorStore.similaritySearchVectorWithScore(
        queryEmbedding,
        limit
      );
      return results
        .filter(([, score]) => score >= threshold)
        .map((pair) => this.#scoredResult(pair));
    } catch (error) {
      console.error('Error searching similar chunks:', error);
      throw error;
    }
  }

  /**
   * Similarity search from raw query text (embedding is computed internally).
   * See the score-direction NOTE on searchSimilarChunks.
   */
  async searchSimilarChunksByText(queryText, limit = 10, threshold = 0.5) {
    try {
      await this.initializeVectorStore();
      const results = await this.vectorStore.similaritySearchWithScore(
        queryText,
        limit
      );
      return results
        .filter(([, score]) => score >= threshold)
        .map((pair) => this.#scoredResult(pair));
    } catch (error) {
      console.error('Error searching similar chunks by text:', error);
      throw error;
    }
  }

  /**
   * Text search with metadata filtering. `userId` / `knowledgebotId` and any
   * extra option keys are merged into the metadata filter passed to the
   * retriever.
   * @param {string} queryText
   * @param {object} [options] - { limit, threshold, filter, userId, knowledgebotId, ...extraFilters }
   */
  async searchSimilarChunksByTextWithFilter(queryText, options = {}) {
    try {
      await this.initializeVectorStore();
      const {
        limit = 10,
        threshold = 0.5,
        filter = {},
        userId = null,
        knowledgebotId = null,
        ...additionalFilters
      } = options;
      // Merge explicit filter, loose extra keys, and the two convenience ids.
      const searchFilter = { ...filter, ...additionalFilters };
      if (userId) {
        searchFilter.userId = userId;
      }
      if (knowledgebotId) {
        searchFilter.knowledgebotId = knowledgebotId;
      }
      const results = await this.retrieveDocuments(queryText, {
        k: limit,
        filter: searchFilter,
        scoreThreshold: threshold,
      });
      console.log(`✓ Filtered search returned ${results.length} chunks for query: "${queryText}"`);
      if (Object.keys(searchFilter).length > 0) {
        console.log(`✓ Applied filters:`, searchFilter);
      }
      return results;
    } catch (error) {
      console.error('Error searching with filter:', error);
      throw error;
    }
  }

  /** Convenience wrapper: filtered search scoped to one user. */
  async searchByUserId(queryText, userId, options = {}) {
    return this.searchSimilarChunksByTextWithFilter(queryText, {
      ...options,
      userId: userId,
    });
  }

  /** Convenience wrapper: filtered search scoped to one knowledgebot. */
  async searchByKnowledgebotId(queryText, knowledgebotId, options = {}) {
    return this.searchSimilarChunksByTextWithFilter(queryText, {
      ...options,
      knowledgebotId: knowledgebotId,
    });
  }

  /** Convenience wrapper: filtered search with an arbitrary filter object. */
  async searchWithMultipleFilters(queryText, filters = {}, options = {}) {
    return this.searchSimilarChunksByTextWithFilter(queryText, {
      ...options,
      ...filters,
    });
  }

  /**
   * Shared implementation for the two "list everything matching a metadata
   * filter" methods. Uses an empty query string so ranking is irrelevant.
   * @param {object} filter - metadata filter for the vector store.
   * @param {string} label - human-readable description for the log line.
   */
  async #getDocumentsByFilter(filter, label, options = {}) {
    await this.initializeVectorStore();
    const { limit = 100 } = options;
    const results = await this.vectorStore.similaritySearch('', limit, filter);
    const formattedResults = results.map((doc, index) => ({
      ...this.#baseResult(doc, index),
      documentId: doc.metadata.documentId,
    }));
    console.log(`✓ Retrieved ${formattedResults.length} documents for ${label}`);
    return formattedResults;
  }

  /** Gets all stored chunks belonging to a specific user. */
  async getDocumentsByUserId(userId, options = {}) {
    try {
      return await this.#getDocumentsByFilter({ userId: userId }, `userId: ${userId}`, options);
    } catch (error) {
      console.error('Error getting documents by userId:', error);
      throw error;
    }
  }

  /** Gets all stored chunks belonging to a specific knowledgebot. */
  async getDocumentsByKnowledgebotId(knowledgebotId, options = {}) {
    try {
      return await this.#getDocumentsByFilter(
        { knowledgebotId: knowledgebotId },
        `knowledgebotId: ${knowledgebotId}`,
        options
      );
    } catch (error) {
      console.error('Error getting documents by knowledgebotId:', error);
      throw error;
    }
  }

  /**
   * Retriever-based search (LangChain `asRetriever`) with filter and
   * score-threshold support.
   * @param {string} queryText
   * @param {object} [options] - { k, filter, searchType, scoreThreshold }
   */
  async retrieveDocuments(queryText, options = {}) {
    try {
      await this.initializeVectorStore();
      const {
        k = 10,
        filter = {},
        searchType = 'similarity',
        scoreThreshold = 0.1,
      } = options;
      const retriever = this.vectorStore.asRetriever({
        k: k,
        filter: filter,
        searchType: searchType,
        searchKwargs: { scoreThreshold: scoreThreshold },
      });
      const documents = await retriever.invoke(queryText);
      console.log(`✓ Retrieved ${documents.length} documents using retriever for query: "${queryText}"`);
      return documents.map((doc, index) => ({
        ...this.#baseResult(doc, index),
        // Retrievers don't always surface scores; `??` keeps a valid 0 score
        // instead of collapsing it to 1.0 like the old `||` did.
        similarity: doc.metadata.score ?? 1.0,
      }));
    } catch (error) {
      console.error('Error retrieving documents:', error);
      throw error;
    }
  }

  /**
   * Retriever search with structured filters: document types, sources, and an
   * optional client-side date range. Over-fetches (2k) so post-filtering can
   * still return k results.
   * @param {object} [options] - { k, documentTypes, dateRange: {start, end}, sources, scoreThreshold }
   */
  async retrieveWithFilters(queryText, options = {}) {
    try {
      await this.initializeVectorStore();
      const {
        k = 10,
        documentTypes = [],
        dateRange = null,
        sources = [],
        scoreThreshold = 0.1,
      } = options;
      const filter = {};
      if (documentTypes.length > 0) {
        filter.fileType = { in: documentTypes };
      }
      if (sources.length > 0) {
        filter.title = { in: sources };
      }
      const retriever = this.vectorStore.asRetriever({
        k: k * 2, // over-fetch: date-range filtering happens after retrieval
        filter: filter,
        searchType: 'similarity_score_threshold',
        searchKwargs: { scoreThreshold: scoreThreshold },
      });
      const documents = await retriever.invoke(queryText);
      let processedResults = documents.map((doc, index) => ({
        ...this.#baseResult(doc, index),
        similarity: doc.metadata.score ?? 1.0,
      }));
      // Date range cannot be expressed in the metadata filter, so apply it here.
      if (dateRange) {
        processedResults = processedResults.filter((result) => {
          const createdAt = new Date(result.metadata.createdAt || 0);
          return createdAt >= dateRange.start && createdAt <= dateRange.end;
        });
      }
      processedResults = processedResults.slice(0, k);
      console.log(`✓ Retrieved ${processedResults.length} filtered documents using enhanced retriever`);
      return processedResults;
    } catch (error) {
      console.error('Error in enhanced retrieval:', error);
      throw error;
    }
  }

  /**
   * Loads one document row plus its chunks from the vector store.
   * Fix: previously called `pool.connect()` on an undefined global `pool`.
   * @returns {Promise<object|null>} document row with `chunks`, or null.
   */
  async getDocumentById(documentId) {
    const client = await this.#checkoutClient();
    try {
      const documentResult = await client.query(
        'SELECT * FROM documents WHERE id = $1;',
        [documentId]
      );
      if (documentResult.rows.length === 0) {
        return null;
      }
      await this.initializeVectorStore();
      // Chunks are keyed to the document via metadata.documentId; the title is
      // only used as a query string because the API requires one.
      const chunks = await this.vectorStore.similaritySearch(
        documentResult.rows[0].title,
        100,
        { documentId: documentId }
      );
      return {
        ...documentResult.rows[0],
        chunks: chunks.map((doc) => ({
          content: doc.pageContent,
          metadata: doc.metadata,
          chunk_index: doc.metadata.chunkIndex,
        })),
      };
    } catch (error) {
      console.error('Error getting document:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Lists every document with its chunk count (joined via metadata.documentId).
   * Fix: previously called `pool.connect()` on an undefined global `pool`.
   */
  async getAllDocuments() {
    const client = await this.#checkoutClient();
    try {
      const query = `
        SELECT
          d.*,
          COUNT(CASE WHEN dcv.metadata->>'documentId' = d.id::text THEN 1 END) as chunk_count
        FROM documents d
        LEFT JOIN document_chunks_vector dcv ON dcv.metadata->>'documentId' = d.id::text
        GROUP BY d.id
        ORDER BY d.created_at DESC;
      `;
      const result = await client.query(query);
      return result.rows;
    } catch (error) {
      console.error('Error getting all documents:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Deletes a document and all of its chunks in one transaction.
   * Fixes: previously referenced an undefined global `pool`, and made an
   * embeddings-backed similaritySearch('') call just to decide whether to run
   * a DELETE that is already idempotent. Both DELETEs now run unconditionally.
   * @returns {Promise<object|undefined>} the deleted document row, if it existed.
   */
  async deleteDocument(documentId) {
    const client = await this.#checkoutClient();
    try {
      await client.query('BEGIN');
      await client.query(
        "DELETE FROM document_chunks_vector WHERE metadata->>'documentId' = $1",
        [documentId]
      );
      const result = await client.query(
        'DELETE FROM documents WHERE id = $1 RETURNING *',
        [documentId]
      );
      await client.query('COMMIT');
      return result.rows[0];
    } catch (error) {
      await client.query('ROLLBACK');
      console.error('Error deleting document:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Records a search query and its results for auditing/history.
   * Fix: previously called `pool.connect()` on an undefined global `pool`.
   * @returns {Promise<string>} the new search_sessions row id.
   */
  async saveSearchSession(query, results) {
    const client = await this.#checkoutClient();
    try {
      const searchQuery = `
        INSERT INTO search_sessions (query, results)
        VALUES ($1, $2)
        RETURNING id;
      `;
      const result = await client.query(searchQuery, [query, results]);
      return result.rows[0].id;
    } catch (error) {
      console.error('Error saving search session:', error);
      throw error;
    } finally {
      client.release();
    }
  }

  /**
   * Creates an HNSW index on the embedding column for faster ANN search.
   * Best-effort: failures are logged, not thrown, since the index is optional.
   */
  async createHnswIndex() {
    try {
      await this.initializeVectorStore();
      await this.vectorStore.createHnswIndex({
        dimensions: this.config.openai?.embeddingDimensions || 1536,
        efConstruction: 64,
        m: 16,
      });
      console.log('✓ HNSW index created for better performance');
    } catch (error) {
      console.error('Error creating HNSW index:', error);
      // Intentionally swallowed: the index is a performance optimization only.
    }
  }

  /**
   * Releases the vector store's resources. The pg pool itself is owned by the
   * caller (it was passed in via config) and is not closed here.
   */
  async close() {
    if (this.vectorStore) {
      await this.vectorStore.end();
    }
  }
}
// Named export for `import { DocumentStoreLangChain } from ...` consumers,
// plus a default export so both import styles work.
export { DocumentStoreLangChain };
export default DocumentStoreLangChain;