universal-ai-brain
Version:
🧠 UNIVERSAL AI BRAIN 3.3 - The world's most advanced cognitive architecture with 24 specialized systems, MongoDB 8.1 $rankFusion hybrid search, latest Voyage 3.5 embeddings, and framework-agnostic design. Works with Mastra, Vercel AI, LangChain, OpenAI A
615 lines (554 loc) • 17.1 kB
text/typescript
/**
* @file MongoVectorStore - Production-ready MongoDB Atlas Vector Search implementation
*
* Based on MongoDB's official documentation and production RAG implementation.
* This provides the core vector search capabilities for the Universal AI Brain.
*
* Features:
* - Atlas Vector Search with proper indexing
* - Hybrid search (vector + text)
* - Automatic embedding generation
* - Performance optimization
* - Error handling and fallbacks
*/
import { Collection, Db, ObjectId, Document } from 'mongodb';
import { MongoConnection } from '../persistance/MongoConnection';
/**
 * Shape of a document persisted in the vector collection.
 */
export interface VectorDocument {
/** MongoDB document id; absent on documents that have not been inserted yet. */
_id?: ObjectId;
/** Raw text the embedding was generated from. */
text: string;
/** Embedding vector for `text`; this is the field vector search runs against. */
embedding: number[];
/** Free-form metadata; the store adds an `indexed_at` date when it inserts a document. */
metadata: Record<string, any>;
/** Origin label of the content; the store writes 'unknown' when the caller gives none. */
source: string;
/** Insertion time, set by the store when the document is written. */
timestamp: Date;
/** Not written by the store itself — presumably the position of a chunk within its parent document (confirm with the chunking code). */
chunkIndex?: number;
/** Not written by the store itself — presumably the id of the document this chunk was split from (confirm with the chunking code). */
parentDocumentId?: string;
/** Rough token estimate for `text`; the store derives it from the text length (about 4 characters per token). */
tokenCount?: number;
}
/**
 * Options accepted by the search methods of MongoVectorStore.
 * Not every method reads every option.
 */
export interface VectorSearchOptions {
/** Maximum number of results to return; the search methods fall back to 10 when omitted. */
limit?: number;
/** `numCandidates` value passed to `$vectorSearch`; vector search falls back to 50 when omitted. */
numCandidates?: number;
/** Query filter restricting which documents may match. */
filter?: Record<string, any>;
/** Minimum vector-search score a result must reach; vector search falls back to 0.7 when omitted. */
minScore?: number;
/** Name of the vector index to query; defaults to the index name given to the store's constructor. */
index?: string;
/** When not truthy, vector search removes the `embedding` field from the returned documents. */
includeEmbeddings?: boolean;
/** Search mode label. hybridSearch sets it to 'vector' when delegating; no method in this file reads it. */
searchType?: 'vector' | 'hybrid' | 'text';
}
/**
 * A stored document returned from a search, together with its relevance score.
 */
export interface VectorSearchResult extends VectorDocument {
/**
 * Relevance score: the `vectorSearchScore` for vector search, the `searchScore`
 * for text search, or a weighted combination of both for hybrid search.
 */
score: number;
}
/**
 * Description of an Atlas vector search index, as produced by
 * MongoVectorStore.getVectorIndexDefinition().
 */
export interface VectorIndexDefinition {
/** Name of the index. */
name: string;
/** Index kind; always 'vectorSearch'. */
type: 'vectorSearch';
definition: {
/** Indexed fields: 'vector' entries describe an embedding field, 'filter' entries a filterable field. */
fields: Array<{
type: 'vector' | 'filter';
/** Document path of the indexed field (for example 'embedding' or 'metadata.type'). */
path: string;
/** Length of the embedding vectors; set on 'vector' entries. */
numDimensions?: number;
/** Similarity function; set on 'vector' entries. */
similarity?: 'euclidean' | 'cosine' | 'dotProduct';
/** Optional vector quantization mode. */
quantization?: 'none' | 'scalar' | 'binary';
}>;
};
}
/**
 * Abstraction over the embedding model used by MongoVectorStore.
 */
export interface EmbeddingProvider {
/** Resolves to the embedding vector for `text`; used for both stored documents and text queries. */
generateEmbedding(text: string): Promise<number[]>;
/** Length of the vectors returned by generateEmbedding (not called by the store in this file). */
getDimensions(): number;
/** Identifier of the embedding model; reported by the store's getStats(). */
getModel(): string;
}
/**
 * MongoVectorStore - MongoDB Atlas Vector Search backed document store.
 *
 * Wraps a single collection and provides:
 * - storage of text documents together with their embeddings,
 * - `$vectorSearch` similarity queries, `$search` text queries and a weighted
 *   hybrid of the two,
 * - maintenance helpers (stats, lookup by id/metadata, cleanup, health check).
 *
 * `initialize()` must be called with an embedding provider before any method
 * that stores documents or runs a vector search.
 */
export class MongoVectorStore {
  /** Weight applied to vector-search scores when merging hybrid results. */
  private static readonly VECTOR_WEIGHT = 0.7;
  /** Weight applied to normalized text-search scores when merging hybrid results. */
  private static readonly TEXT_WEIGHT = 0.3;

  private readonly collection: Collection<VectorDocument>;
  private readonly db: Db;
  private readonly vectorIndexName: string;
  private readonly textIndexName: string;
  private embeddingProvider: EmbeddingProvider | null = null;
  private isInitialized: boolean = false;

  /**
   * @param mongoConnection Connection wrapper that supplies the database handle.
   * @param collectionName Collection holding the embedded documents.
   * @param vectorIndexName Name of the Atlas vector search index to query.
   * @param textIndexName Name used for the text index / `$search` index.
   */
  constructor(
    mongoConnection: MongoConnection,
    collectionName: string = 'embedded_content',
    vectorIndexName: string = 'vector_index',
    textIndexName: string = 'text_index'
  ) {
    this.db = mongoConnection.getDb();
    this.collection = this.db.collection<VectorDocument>(collectionName);
    this.vectorIndexName = vectorIndexName;
    this.textIndexName = textIndexName;
  }

  /**
   * Registers the embedding provider and makes sure the supporting indexes exist.
   * Calling it again after a successful initialization is a no-op (the provider
   * passed to the later call is ignored).
   */
  async initialize(embeddingProvider: EmbeddingProvider): Promise<void> {
    if (this.isInitialized) {
      return;
    }
    this.embeddingProvider = embeddingProvider;
    // Index creation is best-effort: failures are logged, never thrown.
    await this.ensureIndexes();
    this.isInitialized = true;
    console.log('✅ MongoVectorStore initialized successfully');
  }

  /**
   * Stores one document together with its embedding.
   *
   * @param text Text to store.
   * @param metadata Caller metadata; an `indexed_at` date is added.
   * @param source Origin label of the content.
   * @param embedding Precomputed embedding; generated from `text` when omitted.
   * @returns The inserted document id as a string.
   * @throws Error if the store is not initialized, or if embedding/insert fails.
   */
  async storeDocument(
    text: string,
    metadata: Record<string, any> = {},
    source: string = 'unknown',
    embedding?: number[]
  ): Promise<string> {
    this.ensureInitialized();
    try {
      const vectorEmbedding = embedding ?? (await this.generateEmbedding(text));
      const document: VectorDocument = {
        text,
        embedding: vectorEmbedding,
        metadata: {
          ...metadata,
          indexed_at: new Date()
        },
        source,
        timestamp: new Date(),
        tokenCount: this.estimateTokenCount(text)
      };
      const result = await this.collection.insertOne(document);
      return result.insertedId.toString();
    } catch (error) {
      console.error('Error storing document:', error);
      throw new Error(`Failed to store document: ${MongoVectorStore.describeError(error)}`);
    }
  }

  /**
   * Stores several documents with a single `insertMany`.
   * Missing embeddings are generated concurrently, one request per document.
   *
   * @returns The inserted ids as strings, in input order; `[]` for an empty input.
   * @throws Error if the store is not initialized, or if embedding/insert fails.
   */
  async storeDocuments(
    documents: Array<{
      text: string;
      metadata?: Record<string, any>;
      source?: string;
      embedding?: number[];
    }>
  ): Promise<string[]> {
    this.ensureInitialized();
    // insertMany rejects an empty batch, so there is nothing to do for [].
    if (documents.length === 0) {
      return [];
    }
    try {
      const vectorDocuments: VectorDocument[] = await Promise.all(
        documents.map(async (doc) => ({
          text: doc.text,
          embedding: doc.embedding ?? (await this.generateEmbedding(doc.text)),
          metadata: {
            ...doc.metadata,
            indexed_at: new Date()
          },
          source: doc.source || 'unknown',
          timestamp: new Date(),
          tokenCount: this.estimateTokenCount(doc.text)
        }))
      );
      const result = await this.collection.insertMany(vectorDocuments);
      return Object.values(result.insertedIds).map(id => id.toString());
    } catch (error) {
      console.error('Error storing documents:', error);
      throw new Error(`Failed to store documents: ${MongoVectorStore.describeError(error)}`);
    }
  }

  /**
   * Runs a `$vectorSearch` query.
   *
   * @param query Text (embedded via the provider) or a ready-made query vector.
   * @param options `limit` (default 10), `numCandidates` (default 50, raised to
   *   `limit` when smaller), `filter`, `minScore` (default 0.7), `index` and
   *   `includeEmbeddings`.
   * @returns Matching documents with a `score` field, without `embedding`
   *   unless `includeEmbeddings` is set.
   * @throws Error if the store is not initialized or the query fails.
   */
  async vectorSearch(
    query: string | number[],
    options: VectorSearchOptions = {}
  ): Promise<VectorSearchResult[]> {
    this.ensureInitialized();
    const {
      limit = 10,
      filter = {},
      minScore = 0.7,
      index = this.vectorIndexName
    } = options;
    // $vectorSearch rejects numCandidates < limit, so never send a smaller value.
    const numCandidates = Math.max(options.numCandidates ?? 50, limit);
    try {
      const queryEmbedding = typeof query === 'string'
        ? await this.generateEmbedding(query)
        : query;

      const vectorStage: Record<string, unknown> = {
        index,
        queryVector: queryEmbedding,
        path: 'embedding',
        limit,
        numCandidates
      };
      // Only send a pre-filter when there is something to filter on.
      if (filter && Object.keys(filter).length > 0) {
        vectorStage.filter = filter;
      }

      const pipeline: Record<string, unknown>[] = [
        { $vectorSearch: vectorStage },
        { $addFields: { score: { $meta: 'vectorSearchScore' } } },
        { $match: { score: { $gte: minScore } } }
      ];
      // Embeddings are large; drop them unless the caller asked for them.
      if (!options.includeEmbeddings) {
        pipeline.push({ $project: { embedding: 0 } });
      }

      return await this.collection.aggregate<VectorSearchResult>(pipeline).toArray();
    } catch (error) {
      console.error('Error in vector search:', error);
      throw new Error(`Vector search failed: ${MongoVectorStore.describeError(error)}`);
    }
  }

  /**
   * Combines vector search and text search into one ranked list.
   *
   * Vector scores are weighted by 0.7 and text scores by 0.3. Text scores are
   * first divided by `max(1, highest text score)` so that they cannot exceed
   * the range of vector scores; scores already within [0, 1] are left as they
   * are. A document found by both searches gets the sum of both parts (capped
   * at 1.0) and `metadata.searchType = 'hybrid'`.
   *
   * If anything fails, a plain vector search is attempted instead; an error
   * from that fallback propagates to the caller.
   */
  async hybridSearch(
    query: string,
    options: VectorSearchOptions = {}
  ): Promise<VectorSearchResult[]> {
    this.ensureInitialized();
    try {
      // The two searches are independent, so run them concurrently.
      // textSearch never rejects: it resolves to [] when the query fails.
      const [vectorResults, textResults] = await Promise.all([
        this.vectorSearch(query, { ...options, searchType: 'vector' }),
        this.textSearch(query, options)
      ]);

      // Text relevance scores are unbounded; scale them into [0, 1].
      const textScale = textResults.reduce(
        (highest, result) => Math.max(highest, result.score),
        1
      );

      const combinedResults = new Map<string, VectorSearchResult>();
      for (const result of vectorResults) {
        combinedResults.set(String(result._id), {
          ...result,
          score: result.score * MongoVectorStore.VECTOR_WEIGHT,
          metadata: { ...result.metadata, searchType: 'vector' }
        });
      }
      for (const result of textResults) {
        const id = String(result._id);
        const weightedTextScore = (result.score / textScale) * MongoVectorStore.TEXT_WEIGHT;
        const existing = combinedResults.get(id);
        if (existing) {
          // Found by both searches: boost the score and mark as hybrid.
          existing.score = Math.min(existing.score + weightedTextScore, 1.0);
          existing.metadata.searchType = 'hybrid';
        } else {
          combinedResults.set(id, {
            ...result,
            score: weightedTextScore,
            metadata: { ...result.metadata, searchType: 'text' }
          });
        }
      }

      return Array.from(combinedResults.values())
        .sort((a, b) => b.score - a.score)
        .slice(0, options.limit || 10);
    } catch (error) {
      console.error('Error in hybrid search:', error);
      // Fallback to vector search only
      return this.vectorSearch(query, options);
    }
  }

  /**
   * Runs a `$search` text query over `text`, `metadata.title` and
   * `metadata.description`, using the text index name given to the constructor.
   *
   * Honors `filter`, `limit` (default 10) and `includeEmbeddings`.
   * Never throws: on failure a warning is logged and `[]` is returned.
   */
  async textSearch(
    query: string,
    options: VectorSearchOptions = {}
  ): Promise<VectorSearchResult[]> {
    try {
      // $search has to be the first stage of the pipeline.
      const pipeline: Record<string, unknown>[] = [
        {
          $search: {
            index: this.textIndexName,
            text: {
              query: query,
              path: ['text', 'metadata.title', 'metadata.description']
            }
          }
        }
      ];
      if (options.filter && Object.keys(options.filter).length > 0) {
        pipeline.push({ $match: options.filter });
      }
      pipeline.push(
        { $addFields: { score: { $meta: 'searchScore' } } },
        { $limit: options.limit || 10 }
      );
      // Same convention as vectorSearch: embeddings only on request.
      if (!options.includeEmbeddings) {
        pipeline.push({ $project: { embedding: 0 } });
      }
      return await this.collection.aggregate<VectorSearchResult>(pipeline).toArray();
    } catch (error) {
      console.warn('Text search failed, this is normal if text index is not created:', MongoVectorStore.describeError(error));
      return [];
    }
  }

  /**
   * Finds documents whose embeddings are similar to that of a stored document.
   * The reference document itself is excluded through an `_id` filter, which a
   * caller-supplied `filter._id` would override.
   *
   * @throws Error if the document does not exist or the search fails.
   */
  async findSimilar(
    documentId: string,
    options: VectorSearchOptions = {}
  ): Promise<VectorSearchResult[]> {
    this.ensureInitialized();
    try {
      const referenceId = new ObjectId(documentId);
      const document = await this.collection.findOne({ _id: referenceId });
      if (!document) {
        throw new Error(`Document with ID ${documentId} not found`);
      }
      return await this.vectorSearch(document.embedding, {
        ...options,
        filter: {
          _id: { $ne: referenceId }, // Exclude the original document
          ...options.filter
        }
      });
    } catch (error) {
      console.error('Error finding similar documents:', error);
      throw new Error(`Failed to find similar documents: ${MongoVectorStore.describeError(error)}`);
    }
  }

  /** Extracts a printable message from an arbitrary thrown value. */
  private static describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /** Throws unless initialize() has completed. */
  private ensureInitialized(): void {
    if (!this.isInitialized) {
      throw new Error('MongoVectorStore not initialized. Call initialize() first.');
    }
  }

  /** Delegates to the configured embedding provider. */
  private async generateEmbedding(text: string): Promise<number[]> {
    if (!this.embeddingProvider) {
      throw new Error('Embedding provider not configured');
    }
    return this.embeddingProvider.generateEmbedding(text);
  }

  /** Rough estimation: ~4 characters per token. */
  private estimateTokenCount(text: string): number {
    return Math.ceil(text.length / 4);
  }

  /**
   * Creates the supporting indexes. Each index is attempted independently so
   * that one failure (for example a conflicting existing index) does not
   * prevent the remaining ones from being created. Failures are logged only.
   *
   * NOTE(review): this creates a regular text index, whereas textSearch()
   * queries through `$search`, which is served by an Atlas Search index.
   * Confirm that a search index named like `textIndexName` is provisioned.
   */
  private async ensureIndexes(): Promise<void> {
    const indexTasks: Array<() => Promise<unknown>> = [
      // Text index over the searchable fields
      () => this.collection.createIndex(
        {
          text: 'text',
          'metadata.title': 'text',
          'metadata.description': 'text'
        },
        {
          name: this.textIndexName,
          background: true
        }
      ),
      // Indexes used for filtering and sorting
      () => this.collection.createIndex({ source: 1, timestamp: -1 }),
      () => this.collection.createIndex({ 'metadata.type': 1 }),
      () => this.collection.createIndex({ timestamp: -1 })
    ];

    let failures = 0;
    for (const createIndex of indexTasks) {
      try {
        await createIndex();
      } catch (error) {
        failures += 1;
        console.warn('⚠️ Some indexes may already exist:', MongoVectorStore.describeError(error));
      }
    }
    if (failures === 0) {
      console.log('✅ MongoDB indexes created successfully');
    }
  }

  /**
   * Builds the vector search index definition for Atlas.
   * The index itself needs to be created in the Atlas UI or via the Atlas CLI;
   * this method only describes it.
   *
   * @param dimensions Length of the embedding vectors.
   */
  getVectorIndexDefinition(dimensions: number = 1536): VectorIndexDefinition {
    return {
      name: this.vectorIndexName,
      type: 'vectorSearch',
      definition: {
        fields: [
          {
            type: 'vector',
            path: 'embedding',
            numDimensions: dimensions,
            similarity: 'cosine'
          },
          {
            type: 'filter',
            path: 'source'
          },
          {
            type: 'filter',
            path: 'metadata.type'
          },
          {
            type: 'filter',
            path: 'timestamp'
          }
        ]
      }
    };
  }

  /**
   * Collects collection statistics, the document count and one sample document
   * (without its embedding). Never throws: on failure it resolves to
   * `{ error: <message> }`.
   */
  async getStats(): Promise<any> {
    try {
      const [count, sampleDoc] = await Promise.all([
        this.collection.countDocuments(),
        this.collection.findOne({}, { projection: { embedding: 0 } })
      ]);
      const stats = await this.db.command({ collStats: this.collection.collectionName });
      return {
        collectionStats: stats,
        documentCount: count,
        sampleDocument: sampleDoc,
        indexName: this.vectorIndexName,
        isInitialized: this.isInitialized,
        embeddingProvider: this.embeddingProvider?.getModel() || 'none'
      };
    } catch (error) {
      console.error('Error getting vector store stats:', error);
      return { error: MongoVectorStore.describeError(error) };
    }
  }

  /**
   * Deletes every document matching `filter` and returns how many were removed.
   * An empty filter matches the whole collection.
   *
   * @throws Error if the delete fails.
   */
  async deleteDocuments(filter: Record<string, any>): Promise<number> {
    try {
      const result = await this.collection.deleteMany(filter);
      return result.deletedCount;
    } catch (error) {
      console.error('Error deleting documents:', error);
      throw new Error(`Failed to delete documents: ${MongoVectorStore.describeError(error)}`);
    }
  }

  /**
   * Replaces the `metadata` sub-document of a stored document with `metadata`
   * plus an `updated_at` date. Keys that are not part of `metadata` (including
   * `indexed_at`) are not preserved.
   *
   * @returns true if a document was modified.
   * @throws Error if the id is invalid or the update fails.
   */
  async updateDocumentMetadata(
    documentId: string,
    metadata: Record<string, any>
  ): Promise<boolean> {
    try {
      const result = await this.collection.updateOne(
        { _id: new ObjectId(documentId) },
        {
          $set: {
            metadata: {
              ...metadata,
              updated_at: new Date()
            }
          }
        }
      );
      return result.modifiedCount > 0;
    } catch (error) {
      console.error('Error updating document metadata:', error);
      throw new Error(`Failed to update document: ${MongoVectorStore.describeError(error)}`);
    }
  }

  /**
   * Loads a document by id, without its embedding unless `includeEmbedding`
   * is true. Resolves to null when the document is missing or when the lookup
   * fails (the failure is logged).
   */
  async getDocument(documentId: string, includeEmbedding: boolean = false): Promise<VectorDocument | null> {
    try {
      const projection = includeEmbedding ? {} : { embedding: 0 };
      return await this.collection.findOne(
        { _id: new ObjectId(documentId) },
        { projection }
      );
    } catch (error) {
      console.error('Error getting document:', error);
      return null;
    }
  }

  /**
   * Finds documents with a regular query filter (no vector search).
   * Embeddings are never returned.
   *
   * @param options `limit` (default 10) and `sort` (default newest first).
   * @throws Error if the query fails.
   */
  async searchByMetadata(
    filter: Record<string, any>,
    options: { limit?: number; sort?: Record<string, 1 | -1> } = {}
  ): Promise<VectorDocument[]> {
    try {
      const { limit = 10, sort = { timestamp: -1 } } = options;
      return await this.collection
        .find(filter, { projection: { embedding: 0 } })
        .sort(sort)
        .limit(limit)
        .toArray();
    } catch (error) {
      console.error('Error searching by metadata:', error);
      throw new Error(`Failed to search by metadata: ${MongoVectorStore.describeError(error)}`);
    }
  }

  /**
   * Returns the newest documents, optionally restricted to one source.
   */
  async getRecentDocuments(
    limit: number = 10,
    source?: string
  ): Promise<VectorDocument[]> {
    const filter = source ? { source } : {};
    return this.searchByMetadata(filter, {
      limit,
      sort: { timestamp: -1 }
    });
  }

  /**
   * Deletes documents whose `timestamp` is more than `olderThanDays` days in
   * the past, optionally restricted to one source.
   *
   * @returns Number of deleted documents.
   * @throws Error if `olderThanDays` is negative or not a finite number (a
   *   negative value would move the cutoff into the future and delete
   *   everything), or if the delete fails.
   */
  async cleanupOldDocuments(
    olderThanDays: number,
    source?: string
  ): Promise<number> {
    if (!Number.isFinite(olderThanDays) || olderThanDays < 0) {
      throw new Error(`olderThanDays must be a non-negative finite number, received ${olderThanDays}`);
    }
    const cutoffDate = new Date();
    cutoffDate.setDate(cutoffDate.getDate() - olderThanDays);
    const filter: Record<string, any> = {
      timestamp: { $lt: cutoffDate }
    };
    if (source) {
      filter.source = source;
    }
    return this.deleteDocuments(filter);
  }

  /**
   * Probes the store by collecting stats and running a one-result vector search.
   *
   * `isHealthy` is true when the probe query succeeds. A failure to collect
   * stats does not fail the check, but is reported as `details.statsError`.
   * Never throws.
   */
  async healthCheck(): Promise<{ isHealthy: boolean; details: any }> {
    try {
      // getStats never throws; a failure shows up as an `error` property.
      const stats = await this.getStats();
      const testQuery = await this.vectorSearch('test query', { limit: 1 });
      const details: Record<string, any> = {
        isInitialized: this.isInitialized,
        documentCount: stats.documentCount,
        embeddingProvider: stats.embeddingProvider,
        canQuery: Array.isArray(testQuery)
      };
      if (stats.error !== undefined) {
        details.statsError = stats.error;
      }
      return { isHealthy: true, details };
    } catch (error) {
      return {
        isHealthy: false,
        details: { error: MongoVectorStore.describeError(error) }
      };
    }
  }
}