UNPKG

@andrejs1979/document

Version:

MongoDB-compatible document database for NoSQL

601 lines 24.5 kB
/**
 * NoSQL - Hybrid Document+Vector Search
 * Advanced search combining text, metadata, and vector similarity
 */
import { DocumentError } from '../types';

/**
 * Hybrid search engine combining document queries with vector similarity.
 *
 * All storage access goes through `documentStorage.find` / `documentStorage.findOne`.
 * Results of `hybridSearch` are optionally cached in memory when
 * `config.enableQueryCache` is truthy.
 */
export class HybridSearchEngine {
  documentStorage;
  queryEngine;
  config;
  /** Cache of search results keyed by `getSearchCacheKey(...)`. */
  searchCache = new Map();
  /** Insertion time (ms since epoch) of each `searchCache` entry, same keys. */
  searchCacheTimestamps = new Map();

  /**
   * @param {object} documentStorage - Storage backend exposing `find` and `findOne`.
   * @param {object} queryEngine - Stored on the instance; not used by this class itself.
   * @param {object} config - Reads `enableQueryCache`, `queryCacheTTL` (seconds) and
   *   `vectorConfig.defaultDimensions`.
   */
  constructor(documentStorage, queryEngine, config) {
    this.documentStorage = documentStorage;
    this.queryEngine = queryEngine;
    this.config = config;
  }

  /**
   * Perform hybrid search combining text, vector, and metadata filters.
   *
   * @param {string} database - Only used as part of the cache key.
   * @param {string} collection - Collection passed to the storage backend.
   * @param {object} query - `{ text?, vector?, filter?, weights?, options? }`.
   * @returns {Promise<{documents: object[], scores: object[], metadata: object}>}
   * @throws {DocumentError} code `HYBRID_SEARCH_ERROR` wrapping any failure.
   */
  async hybridSearch(database, collection, query) {
    const startTime = Date.now();
    try {
      const cacheKey = this.getSearchCacheKey(database, collection, query);

      if (this.config.enableQueryCache && this.searchCache.has(cacheKey)) {
        // Freshness is judged by when the entry was stored, not by how long the
        // original search took (metadata.executionTime is a duration).
        const cachedAt = this.searchCacheTimestamps.get(cacheKey);
        const ttlMs = (this.config.queryCacheTTL ?? 300) * 1000;
        if (cachedAt !== undefined && Date.now() - cachedAt < ttlMs) {
          return this.searchCache.get(cacheKey);
        }
        this.searchCache.delete(cacheKey);
        this.searchCacheTimestamps.delete(cacheKey);
      }

      // Determine search strategy based on query components
      const searchType = this.determineSearchType(query);
      let result;
      switch (searchType) {
        case 'text':
          result = await this.performTextSearch(database, collection, query);
          break;
        case 'vector':
          result = await this.performVectorSearch(database, collection, query);
          break;
        case 'hybrid':
          result = await this.performHybridSearch(database, collection, query);
          break;
        default:
          throw new DocumentError('Invalid search type', 'INVALID_SEARCH_TYPE');
      }

      result.metadata.executionTime = Date.now() - startTime;

      if (this.config.enableQueryCache) {
        this.searchCache.set(cacheKey, result);
        this.searchCacheTimestamps.set(cacheKey, Date.now());
      }
      return result;
    } catch (error) {
      throw new DocumentError(`Hybrid search failed: ${error.message}`, 'HYBRID_SEARCH_ERROR');
    }
  }

  /**
   * Search documents by text content with optional vector boost.
   *
   * @param {string} database
   * @param {string} collection
   * @param {string} searchText
   * @param {object} [options] - `{ filters?, vectorBoost?, limit?, threshold? }`.
   * @returns {Promise<object[]>} Documents annotated with `_textScore`,
   *   `_vectorScore` and `_hybridScore`.
   * @throws {DocumentError} code `TEXT_SEARCH_ERROR`.
   */
  async textSearch(database, collection, searchText, options = {}) {
    try {
      const query = {
        text: searchText,
        filter: options.filters,
        weights: options.vectorBoost
          ? { text: 0.7, vector: 0.3, metadata: 0.0 }
          : { text: 1.0, vector: 0.0, metadata: 0.0 },
        options: this.resolveSearchOptions(options),
      };
      const result = await this.hybridSearch(database, collection, query);
      return this.convertToVectorDocumentResults(result);
    } catch (error) {
      throw new DocumentError(`Text search failed: ${error.message}`, 'TEXT_SEARCH_ERROR');
    }
  }

  /**
   * Search documents by vector similarity with optional text boost.
   *
   * @param {string} database
   * @param {string} collection
   * @param {Float32Array|number[]} queryVector - Converted to `Float32Array` if needed.
   * @param {object} [options] - `{ filters?, textBoost?, limit?, threshold? }`.
   * @returns {Promise<object[]>} Score-annotated documents.
   * @throws {DocumentError} code `VECTOR_SEARCH_ERROR`.
   */
  async vectorSearch(database, collection, queryVector, options = {}) {
    try {
      const query = {
        vector: queryVector instanceof Float32Array ? queryVector : new Float32Array(queryVector),
        filter: options.filters,
        weights: options.textBoost
          ? { text: 0.3, vector: 0.7, metadata: 0.0 }
          : { text: 0.0, vector: 1.0, metadata: 0.0 },
        options: this.resolveSearchOptions(options),
      };
      const result = await this.hybridSearch(database, collection, query);
      return this.convertToVectorDocumentResults(result);
    } catch (error) {
      throw new DocumentError(`Vector search failed: ${error.message}`, 'VECTOR_SEARCH_ERROR');
    }
  }

  /**
   * Semantic search combining text embedding and vector similarity.
   *
   * @param {string} database
   * @param {string} collection
   * @param {string} searchText
   * @param {object} [options] - `{ filters?, embeddingModel?, textWeight?,
   *   vectorWeight?, limit?, threshold? }`. An explicit weight of `0` is honoured.
   * @returns {Promise<object[]>} Score-annotated documents.
   * @throws {DocumentError} code `SEMANTIC_SEARCH_ERROR`.
   */
  async semanticSearch(database, collection, searchText, options = {}) {
    try {
      const embedding = await this.generateEmbedding(searchText, options.embeddingModel);
      const query = {
        text: searchText,
        vector: embedding,
        filter: options.filters,
        weights: {
          // `??` rather than `||` so a caller can switch a component off with 0.
          text: options.textWeight ?? 0.3,
          vector: options.vectorWeight ?? 0.7,
          metadata: 0.0,
        },
        options: this.resolveSearchOptions(options),
      };
      const result = await this.hybridSearch(database, collection, query);
      return this.convertToVectorDocumentResults(result);
    } catch (error) {
      throw new DocumentError(`Semantic search failed: ${error.message}`, 'SEMANTIC_SEARCH_ERROR');
    }
  }

  /**
   * Multi-modal search across different content types.
   *
   * @param {string} database
   * @param {string} collection
   * @param {object} queries - `{ text?, image? }`; one embedding is generated per
   *   provided modality and the embeddings are averaged.
   * @param {object} [options] - `{ filters?, weights?, limit?, threshold? }`.
   * @returns {Promise<object[]>} Score-annotated documents.
   * @throws {DocumentError} code `MULTIMODAL_SEARCH_ERROR`.
   */
  async multiModalSearch(database, collection, queries, options = {}) {
    try {
      const embeddings = [];
      if (queries.text) {
        embeddings.push(await this.generateEmbedding(queries.text));
      }
      if (queries.image) {
        embeddings.push(await this.generateImageEmbedding(queries.image));
      }

      // Combine embeddings (simple average for now)
      const combinedEmbedding = this.combineEmbeddings(embeddings);
      const query = {
        text: queries.text,
        vector: combinedEmbedding,
        filter: options.filters,
        weights: options.weights || { text: 0.3, vector: 0.7, metadata: 0.0 },
        options: this.resolveSearchOptions(options),
      };
      const result = await this.hybridSearch(database, collection, query);
      return this.convertToVectorDocumentResults(result);
    } catch (error) {
      throw new DocumentError(`Multi-modal search failed: ${error.message}`, 'MULTIMODAL_SEARCH_ERROR');
    }
  }

  /**
   * Similar document finder using a stored document as the query.
   *
   * @param {string} database
   * @param {string} collection
   * @param {*} documentId - `_id` of the reference document; it is excluded from results.
   * @param {object} [options] - `{ useText?, useVector?, useMetadata?, filters?,
   *   limit?, threshold? }`. Each `use*` flag defaults to enabled.
   * @returns {Promise<object[]>} Score-annotated documents.
   * @throws {DocumentError} code `SIMILAR_DOCS_ERROR` (also when the reference
   *   document does not exist).
   */
  async findSimilarDocuments(database, collection, documentId, options = {}) {
    try {
      const referenceDoc = await this.documentStorage.findOne(collection, { _id: documentId });
      if (!referenceDoc) {
        throw new DocumentError(`Reference document ${documentId} not found`, 'DOCUMENT_NOT_FOUND');
      }

      const useText = options.useText !== false;
      const useVector = options.useVector !== false;
      const useMetadata = options.useMetadata !== false;

      const searchText = useText ? this.extractSearchableText(referenceDoc) : undefined;
      const vector = useVector && referenceDoc._vector ? referenceDoc._vector.data : undefined;
      const metadataFilters = useMetadata ? this.extractMetadataFilters(referenceDoc) : {};

      const combinedFilters = {
        ...metadataFilters,
        ...options.filters,
        // Exclude the reference document itself while keeping any `_id`
        // operators the caller supplied (e.g. `$nin` from getRecommendations).
        _id: this.excludeDocumentId(options.filters?._id, documentId),
      };

      const query = {
        text: searchText,
        vector,
        filter: combinedFilters,
        weights: {
          text: useText ? 0.3 : 0.0,
          vector: useVector ? 0.6 : 0.0,
          metadata: useMetadata ? 0.1 : 0.0,
        },
        options: this.resolveSearchOptions(options),
      };
      const result = await this.hybridSearch(database, collection, query);
      return this.convertToVectorDocumentResults(result);
    } catch (error) {
      throw new DocumentError(`Similar documents search failed: ${error.message}`, 'SIMILAR_DOCS_ERROR');
    }
  }

  /**
   * Recommendation engine based on user interaction history.
   *
   * Uses the last 5 liked documents (via `findSimilarDocuments`) and the last 3
   * search queries (via `semanticSearch`), skips anything already viewed or
   * liked, then applies the diversity filter.
   *
   * @param {string} database
   * @param {string} collection
   * @param {object} userHistory - `{ viewedDocuments?, likedDocuments?, searchQueries? }`.
   * @param {object} [options] - `{ filters?, limit?, diversityFactor? }`. A
   *   `diversityFactor` of `0` disables the diversity filter.
   * @returns {Promise<object[]>} At most `options.limit || 10` documents.
   * @throws {DocumentError} code `RECOMMENDATIONS_ERROR`.
   */
  async getRecommendations(database, collection, userHistory, options = {}) {
    try {
      const recommendations = [];
      const seenDocuments = new Set([
        ...(userHistory.viewedDocuments || []),
        ...(userHistory.likedDocuments || []),
      ]);
      const maxCandidates = options.limit || 20;

      // Append unseen documents until the candidate cap is reached.
      const collect = (docs) => {
        for (const doc of docs) {
          if (!seenDocuments.has(doc._id) && recommendations.length < maxCandidates) {
            recommendations.push(doc);
            seenDocuments.add(doc._id);
          }
        }
      };
      const exclusionFilters = () => ({
        ...options.filters,
        _id: { $nin: Array.from(seenDocuments) },
      });

      // Recommendations based on liked documents
      if (userHistory.likedDocuments && userHistory.likedDocuments.length > 0) {
        for (const docId of userHistory.likedDocuments.slice(-5)) {
          const similarDocs = await this.findSimilarDocuments(database, collection, docId, {
            filters: exclusionFilters(),
            limit: 5,
            threshold: 0.5,
          });
          collect(similarDocs);
        }
      }

      // Recommendations based on search queries
      if (userHistory.searchQueries && userHistory.searchQueries.length > 0) {
        for (const searchQuery of userHistory.searchQueries.slice(-3)) {
          const searchResults = await this.semanticSearch(database, collection, searchQuery, {
            filters: exclusionFilters(),
            limit: 5,
            threshold: 0.3,
          });
          collect(searchResults);
        }
      }

      // `??` so that an explicit 0 reaches applyDiversityFilter, which treats it
      // as "filter disabled".
      const diverseRecommendations = this.applyDiversityFilter(
        recommendations,
        options.diversityFactor ?? 0.7,
      );
      return diverseRecommendations.slice(0, options.limit || 10);
    } catch (error) {
      throw new DocumentError(`Recommendations failed: ${error.message}`, 'RECOMMENDATIONS_ERROR');
    }
  }

  // ===============================
  // Private Methods
  // ===============================

  /**
   * Normalise caller options into the `{ limit, threshold }` pair placed on a query.
   * Falsy values fall back to limit 10 and threshold 0.
   */
  resolveSearchOptions(options) {
    return {
      limit: options.limit || 10,
      threshold: options.threshold || 0.0,
    };
  }

  /**
   * Build an `_id` condition that excludes `documentId`.
   * If `idFilter` is a plain object made only of `$` operators, those operators
   * are kept and `$ne` is set to `documentId`; any other value is replaced.
   */
  excludeDocumentId(idFilter, documentId) {
    const isOperatorObject =
      typeof idFilter === 'object' &&
      idFilter !== null &&
      !Array.isArray(idFilter) &&
      Object.keys(idFilter).length > 0 &&
      Object.keys(idFilter).every((key) => key.startsWith('$'));
    return isOperatorObject ? { ...idFilter, $ne: documentId } : { $ne: documentId };
  }

  /**
   * Choose 'hybrid', 'vector' or 'text' from which of `query.text` /
   * `query.vector` are truthy. A query with neither falls back to 'text'.
   */
  determineSearchType(query) {
    const hasText = !!query.text;
    const hasVector = !!query.vector;
    if (hasText && hasVector) return 'hybrid';
    if (hasVector) return 'vector';
    return 'text'; // Default to text search
  }

  /**
   * Run a text search through the storage backend and score each hit with
   * `calculateTextScore`. When the query carries no text, the `$text` clause is
   * omitted and every score is 0 (filter-only search).
   */
  async performTextSearch(database, collection, query) {
    const filter = query.text
      ? { ...query.filter, $text: { $search: query.text } }
      : { ...query.filter };
    const documents = await this.documentStorage.find(collection, filter, query.options);

    // Calculate text scores (simplified); each score is computed once.
    const scores = documents.map((doc) => {
      const textScore = this.calculateTextScore(doc, query.text);
      return { text: textScore, vector: 0, combined: textScore };
    });

    return {
      documents,
      scores,
      metadata: {
        totalMatches: documents.length,
        searchType: 'text',
        executionTime: 0, // Will be updated by caller
      },
    };
  }

  /**
   * Fetch documents that have a `_vector` field, rank them by cosine similarity
   * to `query.vector`, drop those below the threshold and keep the top `limit`.
   */
  async performVectorSearch(database, collection, query) {
    const limit = query.options?.limit || 10;
    const threshold = query.options?.threshold || 0;

    const vectorFilter = { ...query.filter, _vector: { $exists: true } };
    const candidateDocuments = await this.documentStorage.find(collection, vectorFilter, {
      ...query.options,
      limit: limit * 5, // Get more candidates for better results
    });

    const vectorResults = [];
    for (const doc of candidateDocuments) {
      if (!doc._vector) continue;
      const similarity = this.calculateVectorSimilarity(query.vector, doc._vector.data);
      if (similarity >= threshold) {
        vectorResults.push({ document: doc, score: similarity });
      }
    }

    vectorResults.sort((a, b) => b.score - a.score);
    const topResults = vectorResults.slice(0, limit);

    return {
      documents: topResults.map((r) => r.document),
      scores: topResults.map((r) => ({ text: 0, vector: r.score, combined: r.score })),
      metadata: {
        totalMatches: topResults.length,
        searchType: 'vector',
        executionTime: 0,
      },
    };
  }

  /**
   * Run the text and vector searches separately, merge them by weighted score
   * and truncate the merged list to the requested limit. `totalMatches` reports
   * the merged count before truncation.
   */
  async performHybridSearch(database, collection, query) {
    const textResults = query.text
      ? await this.performTextSearch(database, collection, { ...query, vector: undefined })
      : null;
    const vectorResults = query.vector
      ? await this.performVectorSearch(database, collection, { ...query, text: undefined })
      : null;

    const merged = this.mergeSearchResults(textResults, vectorResults, query.weights);

    // Each sub-search is limited on its own, so the union can hold up to twice
    // the requested number of documents; cut it back to the limit.
    const limit = query.options?.limit || 10;
    return {
      documents: merged.documents.slice(0, limit),
      scores: merged.scores.slice(0, limit),
      metadata: {
        totalMatches: merged.documents.length,
        searchType: 'hybrid',
        executionTime: 0,
      },
    };
  }

  /**
   * Union two result sets by `_id` and rank by
   * `textScore * weights.text + vectorScore * weights.vector`.
   * Missing weights fall back to 0.5 each. `weights.metadata` is not used here.
   */
  mergeSearchResults(textResults, vectorResults, weights) {
    const textWeight = weights?.text ?? 0.5;
    const vectorWeight = weights?.vector ?? 0.5;

    // Create maps for efficient lookup
    const textDocMap = new Map();
    const vectorDocMap = new Map();
    if (textResults) {
      textResults.documents.forEach((doc, index) => {
        textDocMap.set(doc._id, { doc, score: textResults.scores[index].text });
      });
    }
    if (vectorResults) {
      vectorResults.documents.forEach((doc, index) => {
        vectorDocMap.set(doc._id, { doc, score: vectorResults.scores[index].vector });
      });
    }

    const allDocIds = new Set([...textDocMap.keys(), ...vectorDocMap.keys()]);
    const combinedResults = [];
    for (const docId of allDocIds) {
      const textResult = textDocMap.get(docId);
      const vectorResult = vectorDocMap.get(docId);
      const textScore = textResult?.score || 0;
      const vectorScore = vectorResult?.score || 0;
      // Use the document from whichever source has it
      const document = textResult?.doc || vectorResult?.doc;
      if (document) {
        combinedResults.push({
          document,
          textScore,
          vectorScore,
          combinedScore: textScore * textWeight + vectorScore * vectorWeight,
        });
      }
    }

    combinedResults.sort((a, b) => b.combinedScore - a.combinedScore);
    return {
      documents: combinedResults.map((r) => r.document),
      scores: combinedResults.map((r) => ({
        text: r.textScore,
        vector: r.vectorScore,
        combined: r.combinedScore,
      })),
    };
  }

  /**
   * Simplified term-frequency score in the range 0-1.
   *
   * Search terms are matched as literal, case-insensitive substrings, so
   * characters such as `(` or `+` in user input cannot break the match.
   * Returns 0 when `searchText` is missing or contains no non-blank term.
   */
  calculateTextScore(document, searchText) {
    if (!searchText) {
      return 0;
    }
    const searchTerms = String(searchText).toLowerCase().split(/\s+/).filter(Boolean);
    if (searchTerms.length === 0) {
      return 0;
    }
    const documentText = this.extractSearchableText(document).toLowerCase();

    let score = 0;
    for (const term of searchTerms) {
      // split() counts non-overlapping literal occurrences without a regex.
      const matches = documentText.split(term).length - 1;
      score += matches / searchTerms.length;
    }
    return Math.min(score, 1.0); // Normalize to 0-1
  }

  /**
   * Cosine similarity of two equal-length vectors.
   * Returns 0 for a length mismatch or when either vector has zero norm.
   */
  calculateVectorSimilarity(vector1, vector2) {
    if (vector1.length !== vector2.length) {
      return 0;
    }
    let dotProduct = 0;
    let norm1 = 0;
    let norm2 = 0;
    for (let i = 0; i < vector1.length; i++) {
      dotProduct += vector1[i] * vector2[i];
      norm1 += vector1[i] * vector1[i];
      norm2 += vector2[i] * vector2[i];
    }
    if (norm1 === 0 || norm2 === 0) {
      return 0;
    }
    return dotProduct / (Math.sqrt(norm1) * Math.sqrt(norm2));
  }

  /**
   * Join the string values of searchable fields into one space-separated string.
   * A field qualifies when its key is in the fixed list below or contains
   * 'text'. Nested plain objects are walked up to depth 3; arrays are skipped.
   */
  extractSearchableText(document) {
    const searchableFields = ['title', 'content', 'description', 'text', 'name', 'summary'];
    const texts = [];
    const extractText = (obj, depth = 0) => {
      if (depth > 3) return;
      for (const [key, value] of Object.entries(obj)) {
        if (typeof value === 'string' && (searchableFields.includes(key) || key.includes('text'))) {
          texts.push(value);
        } else if (typeof value === 'object' && value !== null && !Array.isArray(value)) {
          extractText(value, depth + 1);
        }
      }
    };
    extractText(document);
    return texts.join(' ');
  }

  /**
   * Build equality / `$in` filters from the document's truthy
   * `category`, `type`, `status`, `tags` and `author` fields.
   */
  extractMetadataFilters(document) {
    const metadataFields = ['category', 'type', 'status', 'tags', 'author'];
    const filters = {};
    for (const field of metadataFields) {
      const value = document[field];
      if (!value) continue;
      // Arrays match documents sharing any element; scalars match exactly.
      filters[field] = Array.isArray(value) ? { $in: value } : value;
    }
    return filters;
  }

  /**
   * Fill a Float32Array of the configured dimension with `sin(hash + i) * 0.1`.
   * Shared by the two mock embedding generators.
   */
  mockEmbeddingFromHash(hash) {
    const dimensions = this.config.vectorConfig?.defaultDimensions || 1536;
    const embedding = new Float32Array(dimensions);
    for (let i = 0; i < dimensions; i++) {
      embedding[i] = Math.sin(hash + i) * 0.1;
    }
    return embedding;
  }

  /**
   * Mock text embedding: a deterministic, hash-based vector.
   * This would integrate with the vector module's embedding pipeline;
   * the `model` argument is currently ignored.
   */
  async generateEmbedding(text, model) {
    let hash = 0;
    for (let i = 0; i < text.length; i++) {
      const char = text.charCodeAt(i);
      hash = ((hash << 5) - hash) + char;
      hash = hash & hash; // Convert to 32-bit integer
    }
    return this.mockEmbeddingFromHash(hash);
  }

  /**
   * Mock image embedding: a deterministic, hash-based vector built from at most
   * the first 1000 elements of `imageData`.
   */
  async generateImageEmbedding(imageData) {
    let hash = 0;
    for (let i = 0; i < Math.min(imageData.length, 1000); i++) {
      hash = ((hash << 5) - hash) + imageData[i];
      hash = hash & hash;
    }
    return this.mockEmbeddingFromHash(hash);
  }

  /**
   * Element-wise average of the given embeddings.
   * Returns a zero vector of the configured dimension for an empty list and the
   * sole embedding unchanged for a single-element list.
   */
  combineEmbeddings(embeddings) {
    if (embeddings.length === 0) {
      return new Float32Array(this.config.vectorConfig?.defaultDimensions || 1536);
    }
    if (embeddings.length === 1) {
      return embeddings[0];
    }
    const dimensions = embeddings[0].length;
    const combined = new Float32Array(dimensions);
    for (let i = 0; i < dimensions; i++) {
      let sum = 0;
      for (const embedding of embeddings) {
        sum += embedding[i];
      }
      combined[i] = sum / embeddings.length;
    }
    return combined;
  }

  /**
   * Flatten a search result into documents carrying `_textScore`,
   * `_vectorScore` and `_hybridScore`.
   */
  convertToVectorDocumentResults(result) {
    return result.documents.map((doc, index) => {
      const score = result.scores[index];
      return {
        ...doc,
        _textScore: score.text,
        _vectorScore: score.vector,
        _hybridScore: score.combined,
      };
    });
  }

  /**
   * Greedy diversity filter: keep the first item, then keep each later item
   * only if its similarity to every already-kept item is below
   * `diversityFactor`. A factor `<= 0` returns the input unchanged.
   */
  applyDiversityFilter(recommendations, diversityFactor) {
    if (diversityFactor <= 0 || recommendations.length <= 1) {
      return recommendations;
    }
    const diverse = [recommendations[0]]; // Always include the top result
    for (let i = 1; i < recommendations.length; i++) {
      const candidate = recommendations[i];
      // The closest already-selected document decides; comparing against the
      // least similar one would let near-duplicates through.
      let maxSimilarity = 0;
      for (const selected of diverse) {
        maxSimilarity = Math.max(maxSimilarity, this.calculateDocumentSimilarity(candidate, selected));
      }
      if (maxSimilarity < diversityFactor) {
        diverse.push(candidate);
      }
    }
    return diverse;
  }

  /**
   * Jaccard similarity of the two documents' `tags` arrays (0 when both are empty).
   */
  calculateDocumentSimilarity(doc1, doc2) {
    const tags1 = new Set(doc1.tags || []);
    const tags2 = new Set(doc2.tags || []);
    if (tags1.size === 0 && tags2.size === 0) return 0;
    const intersection = new Set([...tags1].filter((tag) => tags2.has(tag)));
    const union = new Set([...tags1, ...tags2]);
    return intersection.size / union.size;
  }

  /** Cache key: database, collection and the JSON-serialised query. */
  getSearchCacheKey(database, collection, query) {
    return `${database}:${collection}:${JSON.stringify(query)}`;
  }

  /**
   * Clear search cache
   */
  clearSearchCache() {
    this.searchCache.clear();
    this.searchCacheTimestamps.clear();
  }

  /**
   * Get search cache statistics
   *
   * @returns {{size: number, memoryUsage: number}} `memoryUsage` is the length
   *   of the JSON serialisation of all cache entries.
   */
  getSearchCacheStats() {
    return {
      size: this.searchCache.size,
      memoryUsage: JSON.stringify([...this.searchCache.entries()]).length,
    };
  }
}
//# sourceMappingURL=hybrid-search.js.map