@andrejs1979/document
Version:
MongoDB-compatible document database for NoSQL
601 lines • 24.5 kB
JavaScript
/**
* NoSQL - Hybrid Document+Vector Search
* Advanced search combining text, metadata, and vector similarity
*/
import { DocumentError } from '../types';
/**
 * Hybrid search engine combining document queries with vector similarity.
 *
 * Every public search method builds a query object of the shape
 * `{ text?, vector?, filter?, weights?, options?: { limit?, threshold? } }`
 * and funnels it through `hybridSearch`. Failures are re-thrown as
 * `DocumentError` carrying a method-specific error code.
 */
export class HybridSearchEngine {
    documentStorage;
    queryEngine;
    config;
    // Cached search results keyed by `getSearchCacheKey`.
    searchCache = new Map();
    // Wall-clock insertion time (ms since epoch) of each `searchCache` entry, same keys.
    searchCacheTimestamps = new Map();
    /**
     * @param documentStorage Storage backend; this class calls `find(collection, filter, options)`
     *   and `findOne(collection, filter)` on it.
     * @param queryEngine Stored on the instance; not used by any method in this class.
     * @param config Engine configuration. Fields read here: `enableQueryCache`,
     *   `queryCacheTTL` (seconds, default 300) and `vectorConfig.defaultDimensions` (default 1536).
     */
    constructor(documentStorage, queryEngine, config) {
        this.documentStorage = documentStorage;
        this.queryEngine = queryEngine;
        this.config = config;
    }
    /**
     * Perform hybrid search combining text, vector, and metadata filters.
     *
     * @param database Database name (used for the cache key).
     * @param collection Collection name.
     * @param query `{ text?, vector?, filter?, weights?, options? }`.
     * @returns `{ documents, scores, metadata }` where `scores[i]` belongs to `documents[i]`
     *   and `metadata.executionTime` is the elapsed time in milliseconds.
     * @throws DocumentError with code `HYBRID_SEARCH_ERROR` on any failure.
     */
    async hybridSearch(database, collection, query) {
        const startTime = Date.now();
        try {
            const cacheEnabled = Boolean(this.config.enableQueryCache);
            // Only pay for key serialization when the cache is actually in use.
            const cacheKey = cacheEnabled ? this.getSearchCacheKey(database, collection, query) : null;
            if (cacheEnabled && this.searchCache.has(cacheKey)) {
                const ttlMs = (this.config.queryCacheTTL ?? 300) * 1000;
                // Freshness is measured from the insertion timestamp; `metadata.executionTime`
                // is a duration and cannot be used for this.
                const cachedAt = this.searchCacheTimestamps.get(cacheKey) ?? 0;
                if (Date.now() - cachedAt < ttlMs) {
                    return this.searchCache.get(cacheKey);
                }
                this.searchCache.delete(cacheKey);
                this.searchCacheTimestamps.delete(cacheKey);
            }
            // Determine search strategy based on query components
            const searchType = this.determineSearchType(query);
            let result;
            switch (searchType) {
                case 'text':
                    result = await this.performTextSearch(database, collection, query);
                    break;
                case 'vector':
                    result = await this.performVectorSearch(database, collection, query);
                    break;
                case 'hybrid':
                    result = await this.performHybridSearch(database, collection, query);
                    break;
                default:
                    throw new DocumentError('Invalid search type', 'INVALID_SEARCH_TYPE');
            }
            result.metadata.executionTime = Date.now() - startTime;
            if (cacheEnabled) {
                this.searchCache.set(cacheKey, result);
                this.searchCacheTimestamps.set(cacheKey, Date.now());
            }
            return result;
        }
        catch (error) {
            throw new DocumentError(`Hybrid search failed: ${error.message}`, 'HYBRID_SEARCH_ERROR');
        }
    }
    /**
     * Search documents by text content with optional vector boost.
     *
     * @param searchText Text to search for.
     * @param options `{ filters?, vectorBoost?, limit? (default 10), threshold? (default 0) }`.
     * @returns Documents extended with `_textScore`, `_vectorScore` and `_hybridScore`.
     * @throws DocumentError with code `TEXT_SEARCH_ERROR`.
     */
    async textSearch(database, collection, searchText, options = {}) {
        try {
            const weights = options.vectorBoost
                ? { text: 0.7, vector: 0.3, metadata: 0.0 }
                : { text: 1.0, vector: 0.0, metadata: 0.0 };
            const query = {
                text: searchText,
                filter: options.filters,
                weights,
                options: {
                    limit: options.limit || 10,
                    threshold: options.threshold || 0.0
                }
            };
            const result = await this.hybridSearch(database, collection, query);
            return this.convertToVectorDocumentResults(result);
        }
        catch (error) {
            throw new DocumentError(`Text search failed: ${error.message}`, 'TEXT_SEARCH_ERROR');
        }
    }
    /**
     * Search documents by vector similarity with optional text boost.
     *
     * @param queryVector `Float32Array`, or anything accepted by the `Float32Array` constructor.
     * @param options `{ filters?, textBoost?, limit? (default 10), threshold? (default 0) }`.
     * @returns Documents extended with `_textScore`, `_vectorScore` and `_hybridScore`.
     * @throws DocumentError with code `VECTOR_SEARCH_ERROR`.
     */
    async vectorSearch(database, collection, queryVector, options = {}) {
        try {
            const weights = options.textBoost
                ? { text: 0.3, vector: 0.7, metadata: 0.0 }
                : { text: 0.0, vector: 1.0, metadata: 0.0 };
            const query = {
                vector: queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector),
                filter: options.filters,
                weights,
                options: {
                    limit: options.limit || 10,
                    threshold: options.threshold || 0.0
                }
            };
            const result = await this.hybridSearch(database, collection, query);
            return this.convertToVectorDocumentResults(result);
        }
        catch (error) {
            throw new DocumentError(`Vector search failed: ${error.message}`, 'VECTOR_SEARCH_ERROR');
        }
    }
    /**
     * Semantic search combining text embedding and vector similarity.
     *
     * @param searchText Text that is both searched for and embedded.
     * @param options `{ embeddingModel?, filters?, textWeight? (default 0.3),
     *   vectorWeight? (default 0.7), limit? (default 10), threshold? (default 0) }`.
     *   An explicit weight of 0 is honoured.
     * @returns Documents extended with `_textScore`, `_vectorScore` and `_hybridScore`.
     * @throws DocumentError with code `SEMANTIC_SEARCH_ERROR`.
     */
    async semanticSearch(database, collection, searchText, options = {}) {
        try {
            const embedding = await this.generateEmbedding(searchText, options.embeddingModel);
            const query = {
                text: searchText,
                vector: embedding,
                filter: options.filters,
                weights: {
                    // `??` rather than `||` so a caller can switch a component off with 0.
                    text: options.textWeight ?? 0.3,
                    vector: options.vectorWeight ?? 0.7,
                    metadata: 0.0
                },
                options: {
                    limit: options.limit || 10,
                    threshold: options.threshold || 0.0
                }
            };
            const result = await this.hybridSearch(database, collection, query);
            return this.convertToVectorDocumentResults(result);
        }
        catch (error) {
            throw new DocumentError(`Semantic search failed: ${error.message}`, 'SEMANTIC_SEARCH_ERROR');
        }
    }
    /**
     * Multi-modal search across different content types.
     *
     * @param queries `{ text?, image? }`; each present modality is embedded and the
     *   embeddings are averaged into a single query vector.
     * @param options `{ filters?, weights?, limit? (default 10), threshold? (default 0) }`.
     * @returns Documents extended with `_textScore`, `_vectorScore` and `_hybridScore`.
     * @throws DocumentError with code `MULTIMODAL_SEARCH_ERROR`.
     */
    async multiModalSearch(database, collection, queries, options = {}) {
        try {
            const embeddings = [];
            if (queries.text) {
                embeddings.push(await this.generateEmbedding(queries.text));
            }
            if (queries.image) {
                embeddings.push(await this.generateImageEmbedding(queries.image));
            }
            const query = {
                text: queries.text,
                vector: this.combineEmbeddings(embeddings),
                filter: options.filters,
                weights: options.weights || { text: 0.3, vector: 0.7, metadata: 0.0 },
                options: {
                    limit: options.limit || 10,
                    threshold: options.threshold || 0.0
                }
            };
            const result = await this.hybridSearch(database, collection, query);
            return this.convertToVectorDocumentResults(result);
        }
        catch (error) {
            throw new DocumentError(`Multi-modal search failed: ${error.message}`, 'MULTIMODAL_SEARCH_ERROR');
        }
    }
    /**
     * Similar document finder using a stored document as the query.
     *
     * @param documentId `_id` of the reference document; it is always excluded from results.
     * @param options `{ useText?, useVector?, useMetadata? (each on unless `false`),
     *   filters?, limit? (default 10), threshold? (default 0) }`. A caller-supplied
     *   `filters._id` constraint is kept and combined with the self-exclusion.
     * @returns Documents extended with `_textScore`, `_vectorScore` and `_hybridScore`.
     * @throws DocumentError with code `SIMILAR_DOCS_ERROR` (also when the reference is missing).
     */
    async findSimilarDocuments(database, collection, documentId, options = {}) {
        try {
            const referenceDoc = await this.documentStorage.findOne(collection, { _id: documentId });
            if (!referenceDoc) {
                throw new DocumentError(`Reference document ${documentId} not found`, 'DOCUMENT_NOT_FOUND');
            }
            const useText = options.useText !== false;
            const useVector = options.useVector !== false;
            const useMetadata = options.useMetadata !== false;
            const searchText = useText ? this.extractSearchableText(referenceDoc) : undefined;
            const vector = useVector && referenceDoc._vector ? referenceDoc._vector.data : undefined;
            const metadataFilters = useMetadata ? this.extractMetadataFilters(referenceDoc) : {};
            // Pull the caller's `_id` constraint out so it can be merged instead of overwritten.
            const { _id: callerIdFilter, ...callerFilters } = options.filters || {};
            const combinedFilters = {
                ...metadataFilters,
                ...callerFilters,
                _id: this.excludeIdFromFilter(callerIdFilter, documentId)
            };
            const query = {
                text: searchText,
                vector,
                filter: combinedFilters,
                weights: {
                    text: useText ? 0.3 : 0.0,
                    vector: useVector ? 0.6 : 0.0,
                    metadata: useMetadata ? 0.1 : 0.0
                },
                options: {
                    limit: options.limit || 10,
                    threshold: options.threshold || 0.0
                }
            };
            const result = await this.hybridSearch(database, collection, query);
            return this.convertToVectorDocumentResults(result);
        }
        catch (error) {
            throw new DocumentError(`Similar documents search failed: ${error.message}`, 'SIMILAR_DOCS_ERROR');
        }
    }
    /**
     * Recommendation engine based on user interaction history.
     *
     * Candidates come from documents similar to the last 5 liked documents and from
     * semantic searches for the last 3 search queries; viewed and liked documents are
     * never recommended. The candidate pool is then passed through the diversity filter.
     *
     * @param userHistory `{ viewedDocuments?, likedDocuments?, searchQueries? }`.
     * @param options `{ filters?, limit?, diversityFactor? (default 0.7; 0 disables the filter) }`.
     *   The pool is capped at `limit || 20` and the returned list at `limit || 10`.
     * @returns Recommended documents (with the score fields added by the searches).
     * @throws DocumentError with code `RECOMMENDATIONS_ERROR`.
     */
    async getRecommendations(database, collection, userHistory, options = {}) {
        try {
            const recommendations = [];
            const likedDocuments = userHistory.likedDocuments || [];
            const searchQueries = userHistory.searchQueries || [];
            const seenDocuments = new Set([
                ...(userHistory.viewedDocuments || []),
                ...likedDocuments
            ]);
            const poolLimit = options.limit || 20;
            const poolIsFull = () => recommendations.length >= poolLimit;
            // Rebuilt per search because `seenDocuments` grows as candidates are accepted.
            const buildFilters = () => ({
                ...options.filters,
                _id: { $nin: Array.from(seenDocuments) }
            });
            const collect = (candidates) => {
                for (const doc of candidates) {
                    if (poolIsFull()) {
                        return;
                    }
                    if (!seenDocuments.has(doc._id)) {
                        recommendations.push(doc);
                        seenDocuments.add(doc._id);
                    }
                }
            };
            // Recommendations based on the last 5 liked documents
            for (const docId of likedDocuments.slice(-5)) {
                if (poolIsFull()) {
                    break; // nothing more could be added, so skip the remaining searches
                }
                const similarDocs = await this.findSimilarDocuments(database, collection, docId, {
                    filters: buildFilters(),
                    limit: 5,
                    threshold: 0.5
                });
                collect(similarDocs);
            }
            // Recommendations based on the last 3 search queries
            for (const searchQuery of searchQueries.slice(-3)) {
                if (poolIsFull()) {
                    break;
                }
                const searchResults = await this.semanticSearch(database, collection, searchQuery, {
                    filters: buildFilters(),
                    limit: 5,
                    threshold: 0.3
                });
                collect(searchResults);
            }
            // `??` so an explicit 0 reaches the filter, which treats <= 0 as "disabled".
            const diverseRecommendations = this.applyDiversityFilter(recommendations, options.diversityFactor ?? 0.7);
            return diverseRecommendations.slice(0, options.limit || 10);
        }
        catch (error) {
            throw new DocumentError(`Recommendations failed: ${error.message}`, 'RECOMMENDATIONS_ERROR');
        }
    }
    // ===============================
    // Private Methods
    // ===============================
    /**
     * Pick the search strategy: 'hybrid' when both text and vector are present,
     * 'vector' for vector only, otherwise 'text' (which also serves filter-only queries).
     */
    determineSearchType(query) {
        const hasText = !!query.text;
        const hasVector = !!query.vector;
        if (hasText && hasVector) {
            return 'hybrid';
        }
        return hasVector ? 'vector' : 'text';
    }
    /**
     * Build an `_id` filter that keeps the caller's constraint and also excludes `excludedId`.
     *
     * @param idFilter Caller-supplied `_id` constraint (operator object, literal, RegExp) or undefined.
     * @param excludedId Id that must not match.
     * @returns Operator object to use as the `_id` filter.
     */
    excludeIdFromFilter(idFilter, excludedId) {
        if (idFilter === undefined) {
            return { $ne: excludedId };
        }
        if (idFilter instanceof RegExp) {
            return { $regex: idFilter, $ne: excludedId };
        }
        const isOperatorObject = idFilter !== null
            && typeof idFilter === 'object'
            && !Array.isArray(idFilter)
            && Object.keys(idFilter).some((key) => key.startsWith('$'));
        if (!isOperatorObject) {
            // Literal id value (string, number, id object, ...): pin it explicitly.
            return { $eq: idFilter, $ne: excludedId };
        }
        if (!('$ne' in idFilter)) {
            return { ...idFilter, $ne: excludedId };
        }
        // The `$ne` slot is already taken, so fold both exclusions into `$nin`.
        const { $ne: existingNe, $nin: existingNin = [], ...rest } = idFilter;
        return { ...rest, $nin: [...existingNin, existingNe, excludedId] };
    }
    /**
     * Run a `$text` query against storage and score each hit with `calculateTextScore`.
     * When `query.text` is null/undefined the `$text` clause is omitted, so the call
     * degrades to a filter-only lookup whose scores are all 0.
     *
     * @returns `{ documents, scores, metadata }` with `metadata.searchType === 'text'`.
     */
    async performTextSearch(database, collection, query) {
        const hasText = query.text != null;
        const filter = hasText
            ? { ...query.filter, $text: { $search: query.text } }
            : { ...query.filter };
        const documents = await this.documentStorage.find(collection, filter, query.options);
        const scores = documents.map((doc) => {
            const textScore = hasText ? this.calculateTextScore(doc, query.text) : 0;
            return { text: textScore, vector: 0, combined: textScore };
        });
        return {
            documents,
            scores,
            metadata: {
                totalMatches: documents.length,
                searchType: 'text',
                executionTime: 0 // Will be updated by caller
            }
        };
    }
    /**
     * Fetch documents that have a `_vector` field (5x the requested limit as candidates),
     * rank them by cosine similarity to `query.vector`, drop those below the threshold
     * and keep the top `limit` (default 10).
     *
     * @returns `{ documents, scores, metadata }` with `metadata.searchType === 'vector'`.
     */
    async performVectorSearch(database, collection, query) {
        const limit = query.options?.limit || 10;
        const threshold = query.options?.threshold || 0;
        const vectorFilter = {
            ...query.filter,
            _vector: { $exists: true }
        };
        const candidateDocuments = await this.documentStorage.find(collection, vectorFilter, {
            ...query.options,
            limit: limit * 5 // Get more candidates for better results
        });
        const ranked = [];
        for (const doc of candidateDocuments) {
            if (!doc._vector) {
                continue;
            }
            const similarity = this.calculateVectorSimilarity(query.vector, doc._vector.data);
            if (similarity >= threshold) {
                ranked.push({ document: doc, score: similarity });
            }
        }
        ranked.sort((a, b) => b.score - a.score);
        const topResults = ranked.slice(0, limit);
        return {
            documents: topResults.map((entry) => entry.document),
            scores: topResults.map((entry) => ({
                text: 0,
                vector: entry.score,
                combined: entry.score
            })),
            metadata: {
                totalMatches: topResults.length,
                searchType: 'vector',
                executionTime: 0
            }
        };
    }
    /**
     * Run the text and vector searches, merge them by weighted score and cap the merged
     * list at `query.options.limit` when a positive limit is given.
     *
     * @returns `{ documents, scores, metadata }` with `metadata.searchType === 'hybrid'`.
     */
    async performHybridSearch(database, collection, query) {
        // The two sub-searches do not depend on each other, so run them together.
        const [textResults, vectorResults] = await Promise.all([
            query.text
                ? this.performTextSearch(database, collection, { ...query, vector: undefined })
                : null,
            query.vector
                ? this.performVectorSearch(database, collection, { ...query, text: undefined })
                : null
        ]);
        const merged = this.mergeSearchResults(textResults, vectorResults, query.weights);
        // Each sub-search may return up to `limit` documents, so the union can hold up to
        // twice that many; trim it back to what the caller asked for.
        const limit = query.options?.limit;
        const hasLimit = typeof limit === 'number' && limit > 0;
        const documents = hasLimit ? merged.documents.slice(0, limit) : merged.documents;
        const scores = hasLimit ? merged.scores.slice(0, limit) : merged.scores;
        return {
            documents,
            scores,
            metadata: {
                totalMatches: documents.length,
                searchType: 'hybrid',
                executionTime: 0
            }
        };
    }
    /**
     * Union two result sets by `_id` and rank by `text * w.text + vector * w.vector`.
     * A document missing from one side scores 0 for that side. Missing weight fields
     * fall back to 0.5 / 0.5; the `metadata` weight is not part of the score.
     *
     * @param textResults Result of `performTextSearch`, or null.
     * @param vectorResults Result of `performVectorSearch`, or null.
     * @param weights Optional `{ text?, vector?, metadata? }`.
     * @returns `{ documents, scores }` sorted by descending combined score.
     */
    mergeSearchResults(textResults, vectorResults, weights) {
        const w = {
            text: weights?.text ?? 0.5,
            vector: weights?.vector ?? 0.5
        };
        const textDocMap = new Map();
        const vectorDocMap = new Map();
        if (textResults) {
            textResults.documents.forEach((doc, index) => {
                textDocMap.set(doc._id, { doc, score: textResults.scores[index].text });
            });
        }
        if (vectorResults) {
            vectorResults.documents.forEach((doc, index) => {
                vectorDocMap.set(doc._id, { doc, score: vectorResults.scores[index].vector });
            });
        }
        const allDocIds = new Set([...textDocMap.keys(), ...vectorDocMap.keys()]);
        const combinedResults = [];
        for (const docId of allDocIds) {
            const textResult = textDocMap.get(docId);
            const vectorResult = vectorDocMap.get(docId);
            const textScore = textResult?.score || 0;
            const vectorScore = vectorResult?.score || 0;
            // Use the document from whichever source has it
            const document = textResult?.doc || vectorResult?.doc;
            if (document) {
                combinedResults.push({
                    document,
                    textScore,
                    vectorScore,
                    combinedScore: (textScore * w.text) + (vectorScore * w.vector)
                });
            }
        }
        combinedResults.sort((a, b) => b.combinedScore - a.combinedScore);
        return {
            documents: combinedResults.map((r) => r.document),
            scores: combinedResults.map((r) => ({
                text: r.textScore,
                vector: r.vectorScore,
                combined: r.combinedScore
            }))
        };
    }
    /**
     * Simplified term-frequency score in [0, 1].
     *
     * The search text is lower-cased and split on whitespace; each term contributes
     * (number of literal, non-overlapping occurrences in the document's searchable text)
     * divided by the number of terms, and the sum is capped at 1. Terms are matched as
     * plain substrings, so characters such as `+` or `(` are safe. Empty or missing
     * search text scores 0.
     */
    calculateTextScore(document, searchText) {
        const searchTerms = String(searchText ?? '')
            .toLowerCase()
            .split(/\s+/)
            .filter((term) => term.length > 0);
        if (searchTerms.length === 0) {
            return 0;
        }
        const documentText = this.extractSearchableText(document).toLowerCase();
        let score = 0;
        for (const term of searchTerms) {
            // split() yields one more piece than there are occurrences of the separator.
            const matches = documentText.split(term).length - 1;
            score += matches / searchTerms.length;
        }
        return Math.min(score, 1.0); // Normalize to 0-1
    }
    /**
     * Cosine similarity of two numeric vectors.
     *
     * @returns 0 when either vector is missing, the lengths differ, or either norm is 0.
     */
    calculateVectorSimilarity(vector1, vector2) {
        if (!vector1 || !vector2 || vector1.length !== vector2.length) {
            return 0;
        }
        let dotProduct = 0;
        let norm1 = 0;
        let norm2 = 0;
        for (let i = 0; i < vector1.length; i++) {
            dotProduct += vector1[i] * vector2[i];
            norm1 += vector1[i] * vector1[i];
            norm2 += vector2[i] * vector2[i];
        }
        if (norm1 === 0 || norm2 === 0) {
            return 0;
        }
        return dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2));
    }
    /**
     * Collect string values stored under well-known text keys (or any key containing
     * 'text'), descending into nested plain objects (arrays are skipped, nesting is
     * bounded), and join them with single spaces.
     */
    extractSearchableText(document) {
        const searchableFields = ['title', 'content', 'description', 'text', 'name', 'summary'];
        const texts = [];
        const extractText = (obj, depth = 0) => {
            if (depth > 3) {
                return;
            }
            for (const [key, value] of Object.entries(obj)) {
                if (typeof value === 'string' && (searchableFields.includes(key) || key.includes('text'))) {
                    texts.push(value);
                }
                else if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
                    extractText(value, depth + 1);
                }
            }
        };
        extractText(document);
        return texts.join(' ');
    }
    /**
     * Derive equality / `$in` filters from the reference document's
     * category, type, status, tags and author fields.
     * Falsy values and empty arrays are skipped: `{ $in: [] }` cannot match anything.
     */
    extractMetadataFilters(document) {
        const filters = {};
        const metadataFields = ['category', 'type', 'status', 'tags', 'author'];
        for (const field of metadataFields) {
            const value = document[field];
            if (!value) {
                continue;
            }
            if (Array.isArray(value)) {
                if (value.length > 0) {
                    // For arrays, find documents with any matching elements
                    filters[field] = { $in: value };
                }
            }
            else {
                // For simple values, find exact matches
                filters[field] = value;
            }
        }
        return filters;
    }
    /**
     * Fill a `Float32Array` of the configured dimension with `sin(seed + i) * 0.1`.
     * Shared by the mock text and image embedders.
     */
    buildMockEmbedding(seed) {
        const dimensions = this.config.vectorConfig?.defaultDimensions || 1536;
        const embedding = new Float32Array(dimensions);
        for (let i = 0; i < dimensions; i++) {
            embedding[i] = Math.sin(seed + i) * 0.1;
        }
        return embedding;
    }
    /**
     * Mock text embedding derived from a 32-bit hash of the text's UTF-16 code units.
     * This would integrate with the vector module's embedding pipeline.
     *
     * @param text Text to embed.
     * @param model Accepted for API compatibility; ignored by the mock.
     */
    async generateEmbedding(text, model) {
        let hash = 0;
        for (let i = 0; i < text.length; i++) {
            hash = ((hash << 5) - hash) + text.charCodeAt(i);
            hash |= 0; // Convert to 32-bit integer
        }
        return this.buildMockEmbedding(hash);
    }
    /**
     * Mock image embedding derived from a hash of at most the first 1000 elements.
     * NOTE(review): assumes `imageData` is an indexable sequence of numbers
     * (e.g. a Uint8Array) — confirm against callers.
     */
    async generateImageEmbedding(imageData) {
        let hash = 0;
        const sampleLength = Math.min(imageData.length, 1000);
        for (let i = 0; i < sampleLength; i++) {
            hash = ((hash << 5) - hash) + imageData[i];
            hash |= 0;
        }
        return this.buildMockEmbedding(hash);
    }
    /**
     * Element-wise average of the given embeddings. Returns a zero vector of the
     * configured dimension for an empty list and the sole embedding (same instance)
     * for a single-element list. The output length is that of the first embedding.
     */
    combineEmbeddings(embeddings) {
        if (embeddings.length === 0) {
            return new Float32Array(this.config.vectorConfig?.defaultDimensions || 1536);
        }
        if (embeddings.length === 1) {
            return embeddings[0];
        }
        const dimensions = embeddings[0].length;
        const combined = new Float32Array(dimensions);
        for (let i = 0; i < dimensions; i++) {
            let sum = 0;
            for (const embedding of embeddings) {
                sum += embedding[i];
            }
            combined[i] = sum / embeddings.length;
        }
        return combined;
    }
    /**
     * Flatten a search result into an array of document copies carrying their scores
     * as `_textScore`, `_vectorScore` and `_hybridScore`.
     */
    convertToVectorDocumentResults(result) {
        return result.documents.map((doc, index) => {
            const score = result.scores[index];
            return {
                ...doc,
                _textScore: score.text,
                _vectorScore: score.vector,
                _hybridScore: score.combined
            };
        });
    }
    /**
     * Greedy diversity filter: walk the list in order and keep a candidate only when its
     * similarity to every already-kept document is below `diversityFactor`. The first
     * item is always kept. A factor <= 0 (or a list of at most one item) disables filtering.
     */
    applyDiversityFilter(recommendations, diversityFactor) {
        if (diversityFactor <= 0 || recommendations.length <= 1) {
            return recommendations;
        }
        const diverse = [recommendations[0]]; // Always include the top result
        for (const candidate of recommendations.slice(1)) {
            // One near-duplicate among the kept documents is enough to reject the candidate.
            const tooSimilar = diverse.some((selected) => this.calculateDocumentSimilarity(candidate, selected) >= diversityFactor);
            if (!tooSimilar) {
                diverse.push(candidate);
            }
        }
        return diverse;
    }
    /**
     * Jaccard similarity of the two documents' `tags` arrays; 0 when neither has tags.
     */
    calculateDocumentSimilarity(doc1, doc2) {
        const tags1 = new Set(doc1.tags || []);
        const tags2 = new Set(doc2.tags || []);
        if (tags1.size === 0 && tags2.size === 0) {
            return 0;
        }
        const intersection = new Set([...tags1].filter((tag) => tags2.has(tag)));
        const union = new Set([...tags1, ...tags2]);
        return intersection.size / union.size;
    }
    /**
     * Cache key: database, collection and the JSON-serialized query joined by ':'.
     */
    getSearchCacheKey(database, collection, query) {
        return `${database}:${collection}:${JSON.stringify(query)}`;
    }
    /**
     * Clear search cache
     */
    clearSearchCache() {
        this.searchCache.clear();
        this.searchCacheTimestamps.clear();
    }
    /**
     * Get search cache statistics
     *
     * @returns `{ size, memoryUsage }` where `memoryUsage` is the character length of the
     *   JSON-serialized cache entries (an approximation, not a byte count).
     */
    getSearchCacheStats() {
        return {
            size: this.searchCache.size,
            memoryUsage: JSON.stringify([...this.searchCache.entries()]).length
        };
    }
}
//# sourceMappingURL=hybrid-search.js.map