universal-ai-brain
Version:
🧠 UNIVERSAL AI BRAIN 3.3 - The world's most advanced cognitive architecture with 24 specialized systems, MongoDB 8.1 $rankFusion hybrid search, latest Voyage 3.5 embeddings, and framework-agnostic design. Works with Mastra, Vercel AI, LangChain, OpenAI A
753 lines (696 loc) • 23.7 kB
text/typescript
/**
* 🚀 MONGODB ATLAS HYBRID SEARCH ENGINE
*
* ✅ PERFECTLY ALIGNED with MongoDB Atlas 2025 Documentation
* ✅ Uses $rankFusion with reciprocal rank fusion (MongoDB 8.1+)
* ✅ Automatic fallback for older MongoDB versions
* ✅ Supports both vector and full-text search with optimal weighting
*
* Key Features:
* - Native MongoDB $rankFusion implementation
* - Reciprocal rank fusion with rank_constant = 60 (MongoDB default)
* - Named pipeline structure: vectorPipeline + fullTextPipeline
* - Proper combination.weights syntax
* - MongoDB version detection and compatibility
* - Production-ready with Voyage AI and OpenAI embedding providers
*
* MongoDB Requirements:
* - MongoDB Atlas 8.1+ for $rankFusion support
* - Vector Search Index on embedding.values field
* - Atlas Search Index on content.text and content.summary fields
*/
import { Collection, Db, Document } from 'mongodb';
import { MongoEmbeddingProvider } from '../persistance/MongoEmbeddingProvider';
import { OpenAIEmbeddingProvider } from '../embeddings/OpenAIEmbeddingProvider';
import { VoyageAIEmbeddingProvider } from '../embeddings/VoyageAIEmbeddingProvider';
// Embedding provider interface for flexibility
export interface HybridSearchEmbeddingProvider {
generateEmbedding(text: string): Promise<number[]>;
}
// Fallback embedding provider (mock implementation for development/testing)
export class DefaultEmbeddingProvider implements HybridSearchEmbeddingProvider {
async generateEmbedding(text: string): Promise<number[]> {
console.warn(`Using fallback mock embedding provider for: ${text.substring(0, 50)}...`);
console.warn('WARNING: This is a mock implementation. For production, configure a real embedding provider.');
// Mock implementation - generates consistent but meaningless embeddings
return Array(1536).fill(0).map(() => Math.random() * 2 - 1); // Random values between -1 and 1
}
}
// Search result interface
export interface HybridSearchResult {
_id: string;
embedding_id: string;
content: {
text: string;
summary?: string;
};
metadata: Record<string, any>;
scores: {
vector_score: number;
text_score: number;
combined_score: number;
};
relevance_explanation: string;
}
// Search filters interface
export interface SearchFilters {
source_type?: string;
agent_id?: string;
created_after?: Date;
created_before?: Date;
metadata_filters?: Record<string, any>;
min_confidence?: number;
}
// Search options interface
export interface SearchOptions {
limit?: number;
vector_weight?: number;
text_weight?: number;
vector_index?: string;
text_index?: string;
include_embeddings?: boolean;
explain_relevance?: boolean;
}
/**
* Advanced Hybrid Search Engine
* Combines vector similarity search with full-text search for optimal relevance
*/
export class HybridSearchEngine {
private db: Db;
private embeddingProvider: HybridSearchEmbeddingProvider;
private embeddingStore: MongoEmbeddingProvider<Document>;
constructor(
db: Db,
embeddingProvider?: HybridSearchEmbeddingProvider,
collectionName: string = 'vector_embeddings'
) {
this.db = db;
// Use production-ready OpenAI embedding provider by default
this.embeddingProvider = embeddingProvider || this.createDefaultEmbeddingProvider();
this.embeddingStore = new MongoEmbeddingProvider(db, collectionName, 'vector_search_index');
}
/**
* Perform hybrid search combining vector and text search
*/
async search(
query: string,
filters: SearchFilters = {},
options: SearchOptions = {}
): Promise<HybridSearchResult[]> {
const {
limit = 20,
vector_weight = 0.7,
text_weight = 0.3,
vector_index = 'vector_search_index',
text_index = 'text_search_index',
include_embeddings = false,
explain_relevance = true
} = options;
try {
// Generate query embedding
const queryEmbedding = await this.embeddingProvider.generateEmbedding(query);
// Build filter conditions
const filterConditions = this.buildFilterConditions(filters);
// Check MongoDB version and use appropriate hybrid search method
const mongoVersion = await this.getMongoDBVersion();
const supportsRankFusion = this.isRankFusionSupported(mongoVersion);
let results: HybridSearchResult[];
if (supportsRankFusion) {
console.log(`🚀 Using MongoDB Atlas $rankFusion (MongoDB ${mongoVersion}) for optimal hybrid search`);
results = await this.executeHybridSearchWithRankFusion(
query,
queryEmbedding,
filterConditions,
{
limit,
vector_weight,
text_weight,
vector_index,
text_index,
include_embeddings,
explain_relevance
}
);
} else {
console.log(`⚠️ MongoDB ${mongoVersion} detected - $rankFusion requires 8.1+, using manual hybrid search`);
results = await this.executeHybridSearchPipeline(
query,
queryEmbedding,
filterConditions,
{
limit,
vector_weight,
text_weight,
vector_index,
text_index,
include_embeddings,
explain_relevance
}
);
}
return results;
} catch (error) {
console.error('Hybrid search failed:', error);
// Fallback to text-only search
return await this.fallbackTextSearch(query, filters, options);
}
}
/**
* Execute the hybrid search aggregation pipeline
*/
private async executeHybridSearchPipeline(
query: string,
queryEmbedding: number[],
filterConditions: Record<string, any>,
options: Required<SearchOptions>
): Promise<HybridSearchResult[]> {
const collection = this.db.collection('vector_embeddings');
const pipeline: any[] = [
// Stage 1: Vector similarity search
{
$vectorSearch: {
index: options.vector_index,
queryVector: queryEmbedding,
path: 'embedding.values',
numCandidates: Math.max(options.limit * 10, 150),
limit: Math.max(options.limit * 2, 50),
filter: filterConditions,
},
},
{
$addFields: {
vector_score: { $meta: 'vectorSearchScore' },
},
},
// Stage 2: Text search (if text index exists)
{
$search: {
index: options.text_index,
compound: {
must: [
{
text: {
query: query,
path: ['content.text', 'content.summary'],
},
},
],
filter: [filterConditions],
},
},
},
{
$addFields: {
text_score: { $meta: 'searchScore' },
},
},
// Stage 3: Combine scores with weights
{
$addFields: {
combined_score: {
$add: [
{ $multiply: ['$vector_score', options.vector_weight] },
{ $multiply: ['$text_score', options.text_weight] },
],
},
},
},
// Stage 4: Sort by combined score
{ $sort: { combined_score: -1 } },
// Stage 5: Limit results
{ $limit: options.limit },
// Stage 6: Project final results
{
$project: {
_id: 1,
embedding_id: 1,
content: 1,
metadata: 1,
vector_score: 1,
text_score: 1,
combined_score: 1,
...(options.include_embeddings && { 'embedding.values': 1 }),
...(options.explain_relevance && {
relevance_explanation: {
$concat: [
'Vector similarity: ', { $toString: { $round: ['$vector_score', 3] } },
', Text relevance: ', { $toString: { $round: ['$text_score', 3] } },
', Combined score: ', { $toString: { $round: ['$combined_score', 3] } }
]
}
})
},
},
];
const results = await collection.aggregate(pipeline).toArray();
return results.map(doc => ({
_id: doc._id.toString(),
embedding_id: doc.embedding_id,
content: doc.content,
metadata: doc.metadata,
scores: {
vector_score: doc.vector_score || 0,
text_score: doc.text_score || 0,
combined_score: doc.combined_score || 0,
},
relevance_explanation: doc.relevance_explanation || 'No explanation available'
}));
}
/**
* Execute hybrid search using MongoDB Atlas $rankFusion (MongoDB 8.1+)
* EXACTLY following the official MongoDB 2025 documentation
* Uses reciprocal rank fusion with rank_constant = 60 (MongoDB default)
*/
private async executeHybridSearchWithRankFusion(
query: string,
queryEmbedding: number[],
filterConditions: Record<string, any>,
options: Required<SearchOptions>
): Promise<HybridSearchResult[]> {
const collection = this.db.collection('vector_embeddings');
try {
// EXACT MongoDB Atlas $rankFusion syntax from 2025 documentation
const pipeline: any[] = [
{
$rankFusion: {
input: {
pipelines: {
// Named pipeline for vector search (EXACT docs format)
vectorPipeline: [
{
$vectorSearch: {
index: options.vector_index,
path: 'embedding.values',
queryVector: queryEmbedding,
numCandidates: Math.max(options.limit * 5, 100),
limit: options.limit,
// Add filters if provided (MongoDB Atlas format)
...(Object.keys(filterConditions).length > 0 && { filter: filterConditions })
}
}
],
// Named pipeline for full-text search (EXACT docs format)
fullTextPipeline: [
// Use compound query structure if filters are present
...(Object.keys(filterConditions).length > 0 ? [
{
$search: {
index: options.text_index,
compound: {
must: [
{
text: {
query: query,
path: ['content.text', 'content.summary']
}
}
],
filter: [filterConditions]
}
}
}
] : [
// Simple text search when no filters
{
$search: {
index: options.text_index,
text: {
query: query,
path: ['content.text', 'content.summary']
}
}
}
]),
{ $limit: options.limit }
]
}
},
// EXACT combination syntax from MongoDB docs
combination: {
weights: {
vectorPipeline: options.vector_weight,
fullTextPipeline: options.text_weight
}
},
// Enable score details for debugging (optional)
scoreDetails: options.explain_relevance
}
},
// Project results with score details from $meta
{
$project: {
_id: 1,
embedding_id: 1,
content: 1,
metadata: 1,
// Get the reciprocal rank fusion score
combined_score: { $meta: 'rankFusionScore' },
// Get detailed scores if available
...(options.explain_relevance && {
scoreDetails: { $meta: 'scoreDetails' }
}),
...(options.include_embeddings && { 'embedding.values': 1 })
}
},
// Final limit (MongoDB $rankFusion handles internal ranking)
{ $limit: options.limit }
];
const results = await collection.aggregate(pipeline).toArray();
return results.map(doc => {
// Extract individual pipeline scores from scoreDetails if available
const vectorScore = doc.scoreDetails?.vectorPipeline?.score || 0;
const textScore = doc.scoreDetails?.fullTextPipeline?.score || 0;
return {
_id: doc._id.toString(),
embedding_id: doc.embedding_id,
content: doc.content,
metadata: doc.metadata,
scores: {
vector_score: vectorScore,
text_score: textScore,
combined_score: doc.combined_score || 0,
},
relevance_explanation: options.explain_relevance
? `MongoDB RankFusion (RRF): Vector=${vectorScore.toFixed(3)}, Text=${textScore.toFixed(3)}, Combined=${(doc.combined_score || 0).toFixed(3)}`
: 'MongoDB Atlas Hybrid Search with Reciprocal Rank Fusion'
};
});
} catch (error) {
console.error('MongoDB Atlas RankFusion failed (requires MongoDB 8.1+), falling back to manual approach:', error);
// Fallback to the existing manual approach for older MongoDB versions
return await this.executeHybridSearchPipeline(query, queryEmbedding, filterConditions, options);
}
}
/**
* Fallback to text-only search when vector search fails
*/
private async fallbackTextSearch(
query: string,
filters: SearchFilters,
options: SearchOptions
): Promise<HybridSearchResult[]> {
console.log('Falling back to text-only search');
const collection = this.db.collection('vector_embeddings');
const filterConditions = this.buildFilterConditions(filters);
try {
const pipeline = [
{
$search: {
index: options.text_index || 'text_search_index',
compound: {
must: [
{
text: {
query: query,
path: ['content.text', 'content.summary'],
},
},
],
filter: [filterConditions],
},
},
},
{
$addFields: {
text_score: { $meta: 'searchScore' },
},
},
{ $sort: { text_score: -1 } },
{ $limit: options.limit || 20 },
{
$project: {
_id: 1,
embedding_id: 1,
content: 1,
metadata: 1,
text_score: 1,
},
},
];
const results = await collection.aggregate(pipeline).toArray();
return results.map(doc => ({
_id: doc._id.toString(),
embedding_id: doc.embedding_id,
content: doc.content,
metadata: doc.metadata,
scores: {
vector_score: 0,
text_score: doc.text_score || 0,
combined_score: doc.text_score || 0,
},
relevance_explanation: `Text-only search (vector search unavailable): ${doc.text_score?.toFixed(3) || 'N/A'}`
}));
} catch (error) {
console.error('Text search also failed:', error);
return [];
}
}
/**
* Build MongoDB filter conditions from search filters
*/
private buildFilterConditions(filters: SearchFilters): Record<string, any> {
const conditions: Record<string, any> = {};
if (filters.source_type) {
conditions.source_type = filters.source_type;
}
if (filters.agent_id) {
conditions.agent_id = filters.agent_id;
}
if (filters.created_after || filters.created_before) {
conditions.created_at = {};
if (filters.created_after) {
conditions.created_at.$gte = filters.created_after;
}
if (filters.created_before) {
conditions.created_at.$lte = filters.created_before;
}
}
if (filters.min_confidence) {
conditions['content.confidence'] = { $gte: filters.min_confidence };
}
if (filters.metadata_filters) {
for (const [key, value] of Object.entries(filters.metadata_filters)) {
conditions[`metadata.${key}`] = value;
}
}
return conditions;
}
/**
* Semantic search using only vector similarity
*/
async semanticSearch(
query: string,
filters: SearchFilters = {},
limit: number = 20
): Promise<HybridSearchResult[]> {
try {
const queryEmbedding = await this.embeddingProvider.generateEmbedding(query);
const filterConditions = this.buildFilterConditions(filters);
const collection = this.db.collection('vector_embeddings');
const pipeline = [
{
$vectorSearch: {
index: 'vector_search_index',
queryVector: queryEmbedding,
path: 'embedding.values',
numCandidates: Math.max(limit * 10, 150),
limit,
filter: filterConditions,
},
},
{
$addFields: {
vector_score: { $meta: 'vectorSearchScore' },
},
},
{
$project: {
_id: 1,
embedding_id: 1,
content: 1,
metadata: 1,
vector_score: 1,
},
},
];
const results = await collection.aggregate(pipeline).toArray();
return results.map(doc => ({
_id: doc._id.toString(),
embedding_id: doc.embedding_id,
content: doc.content,
metadata: doc.metadata,
scores: {
vector_score: doc.vector_score || 0,
text_score: 0,
combined_score: doc.vector_score || 0,
},
relevance_explanation: `Semantic similarity: ${doc.vector_score?.toFixed(3) || 'N/A'}`
}));
} catch (error) {
console.error('Semantic search failed:', error);
return [];
}
}
/**
* Full-text search using only text matching
*/
async textSearch(
query: string,
filters: SearchFilters = {},
limit: number = 20
): Promise<HybridSearchResult[]> {
return await this.fallbackTextSearch(query, filters, { limit });
}
/**
* Get search suggestions based on query
*/
async getSuggestions(
partialQuery: string,
limit: number = 5
): Promise<string[]> {
try {
const collection = this.db.collection('vector_embeddings');
const pipeline = [
{
$search: {
index: 'text_search_index',
autocomplete: {
query: partialQuery,
path: 'content.text',
},
},
},
{ $limit: limit },
{
$project: {
suggestion: { $substr: ['$content.text', 0, 100] },
},
},
];
const results = await collection.aggregate(pipeline).toArray();
return results.map(doc => doc.suggestion);
} catch (error) {
console.error('Failed to get suggestions:', error);
return [];
}
}
/**
* Analyze search performance and provide insights
*/
async analyzeSearchPerformance(
query: string,
filters: SearchFilters = {}
): Promise<{
query: string;
total_candidates: number;
vector_results: number;
text_results: number;
hybrid_results: number;
performance_ms: number;
recommendations: string[];
}> {
const startTime = Date.now();
try {
const [vectorResults, textResults, hybridResults] = await Promise.all([
this.semanticSearch(query, filters, 100),
this.textSearch(query, filters, 100),
this.search(query, filters, { limit: 100 })
]);
const performance_ms = Date.now() - startTime;
const recommendations: string[] = [];
if (vectorResults.length === 0) {
recommendations.push('Consider improving embedding quality or expanding vector index');
}
if (textResults.length === 0) {
recommendations.push('Consider improving text content or expanding text index');
}
if (hybridResults.length < Math.max(vectorResults.length, textResults.length)) {
recommendations.push('Hybrid search may need weight adjustment');
}
if (performance_ms > 1000) {
recommendations.push('Search performance is slow - consider index optimization');
}
return {
query,
total_candidates: Math.max(vectorResults.length, textResults.length),
vector_results: vectorResults.length,
text_results: textResults.length,
hybrid_results: hybridResults.length,
performance_ms,
recommendations
};
} catch (error) {
console.error('Search performance analysis failed:', error);
return {
query,
total_candidates: 0,
vector_results: 0,
text_results: 0,
hybrid_results: 0,
performance_ms: Date.now() - startTime,
recommendations: ['Search analysis failed - check index configuration']
};
}
}
/**
* Get MongoDB version to determine $rankFusion support
*/
private async getMongoDBVersion(): Promise<string> {
try {
const admin = this.db.admin();
const buildInfo = await admin.buildInfo();
return buildInfo.version;
} catch (error) {
console.warn('Could not determine MongoDB version:', error);
return '7.0.0'; // Assume older version if detection fails
}
}
/**
* Check if MongoDB version supports $rankFusion (requires 8.1+)
*/
private isRankFusionSupported(version: string): boolean {
try {
const [major, minor] = version.split('.').map(Number);
return major > 8 || (major === 8 && minor >= 1);
} catch (error) {
console.warn('Could not parse MongoDB version:', version);
return false; // Assume not supported if parsing fails
}
}
/**
* Create default embedding provider with fallback to mock
* Priority: Voyage AI > OpenAI > Mock
*/
private createDefaultEmbeddingProvider(): HybridSearchEmbeddingProvider {
// Try Voyage AI first (preferred for better retrieval performance)
const voyageApiKey = process.env.VOYAGE_API_KEY;
if (voyageApiKey && voyageApiKey.trim() !== '') {
try {
console.log('🚀 Using Voyage AI embedding provider for state-of-the-art embeddings');
return VoyageAIEmbeddingProvider.forGeneralPurpose(voyageApiKey);
} catch (error) {
console.warn('Failed to initialize Voyage AI embedding provider:', error);
console.warn('Falling back to OpenAI...');
}
}
// Fallback to OpenAI if available
const openaiApiKey = process.env.OPENAI_API_KEY;
if (openaiApiKey && openaiApiKey.trim() !== '') {
try {
console.log('Using OpenAI embedding provider for production-ready embeddings');
return new OpenAIEmbeddingProvider({
apiKey: openaiApiKey,
model: 'text-embedding-3-small'
});
} catch (error) {
console.warn('Failed to initialize OpenAI embedding provider:', error);
console.warn('Falling back to mock embedding provider');
}
} else {
console.warn('No VOYAGE_API_KEY or OPENAI_API_KEY found in environment variables');
console.warn('Using mock embedding provider - not suitable for production');
}
// Fallback to mock provider
return new DefaultEmbeddingProvider();
}
}