UNPKG

@knath2000/codebase-indexing-mcp

Version: (not captured in this page snapshot)

MCP server for codebase indexing with Voyage AI embeddings and Qdrant vector storage

204 lines 7.56 kB
export class HybridSearchService { constructor(config) { this.enabled = config.enableHybridSearch; this.alpha = config.hybridSearchAlpha; } /** * Check if hybrid search is enabled */ isEnabled() { return this.enabled; } /** * Perform hybrid search combining dense and sparse retrieval */ async hybridSearch(_query, denseResults, sparseResults) { if (!this.enabled || !sparseResults) { // Return dense-only results if hybrid is disabled or sparse unavailable return { denseResults, sparseResults: sparseResults || [], combinedResults: denseResults, alpha: 1.0 }; } console.log(`🔀 [HybridSearch] Combining ${denseResults.length} dense + ${sparseResults.length} sparse results`); try { // Combine and score results const combinedResults = this.combineResults(denseResults, sparseResults, this.alpha); console.log(`✅ [HybridSearch] Combined to ${combinedResults.length} results with α=${this.alpha}`); return { denseResults, sparseResults, combinedResults, alpha: this.alpha }; } catch (error) { console.error(`❌ [HybridSearch] Hybrid search failed:`, error); // Fallback to dense results only return { denseResults, sparseResults: sparseResults || [], combinedResults: denseResults, alpha: 1.0 }; } } /** * Combine dense and sparse results using weighted scoring */ combineResults(denseResults, sparseResults, alpha) { // Create maps for efficient lookup const denseMap = new Map(); const sparseMap = new Map(); // Normalize scores to 0-1 range const maxDenseScore = Math.max(...denseResults.map(r => r.score), 0.01); const maxSparseScore = Math.max(...sparseResults.map(r => r.score), 0.01); // Index dense results denseResults.forEach(result => { const normalizedResult = { ...result, score: result.score / maxDenseScore }; denseMap.set(result.id, normalizedResult); }); // Index sparse results sparseResults.forEach(result => { const normalizedResult = { ...result, score: result.score / maxSparseScore }; sparseMap.set(result.id, normalizedResult); }); // Get all unique result IDs const allIds = 
new Set([...denseMap.keys(), ...sparseMap.keys()]); // Combine scores for each result const combinedResults = []; for (const id of allIds) { const denseResult = denseMap.get(id); const sparseResult = sparseMap.get(id); // Calculate hybrid score: α * dense + (1-α) * sparse const denseScore = denseResult?.score || 0; const sparseScore = sparseResult?.score || 0; const hybridScore = alpha * denseScore + (1 - alpha) * sparseScore; // Use the result with more complete data (prefer dense, fallback to sparse) const baseResult = denseResult || sparseResult; if (!baseResult) continue; const combinedResult = { ...baseResult, score: hybridScore, hybridScore: { dense: denseScore, sparse: sparseScore, combined: hybridScore } }; combinedResults.push(combinedResult); } // Sort by combined score (descending) combinedResults.sort((a, b) => b.score - a.score); return combinedResults; } /** * Generate sparse vector representation for BM25-style search * This is a simplified implementation - in production, you'd use a proper BM25 library */ generateSparseVector(text, vocabulary) { const terms = this.tokenize(text); const termFreq = new Map(); // Count term frequencies terms.forEach(term => { termFreq.set(term, (termFreq.get(term) || 0) + 1); }); const indices = []; const values = []; // Convert to sparse vector format termFreq.forEach((freq, term) => { const termId = vocabulary.get(term); if (termId !== undefined) { indices.push(termId); values.push(freq); // Could apply TF-IDF weighting here } }); return { indices, values }; } /** * Simple tokenization for sparse vector generation */ tokenize(text) { return text .toLowerCase() .replace(/[^\w\s]/g, ' ') .split(/\s+/) .filter(token => token.length > 1) .filter(token => !this.isStopWord(token)); } /** * Check if a word is a stop word */ isStopWord(word) { const stopWords = new Set([ 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 
'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those' ]); return stopWords.has(word); } /** * Adjust the alpha parameter for different query types */ adaptiveAlpha(query) { let adaptedAlpha = this.alpha; // Boost dense search for semantic queries if (this.isSemanticQuery(query.query)) { adaptedAlpha = Math.min(1.0, this.alpha + 0.1); } // Boost sparse search for exact matches and identifier searches if (this.isExactMatchQuery(query.query)) { adaptedAlpha = Math.max(0.0, this.alpha - 0.2); } return adaptedAlpha; } /** * Detect if query is semantic in nature */ isSemanticQuery(query) { const semanticIndicators = [ 'how to', 'what is', 'explain', 'implement', 'create', 'build', 'algorithm', 'pattern', 'similar to', 'like', 'example' ]; const lowerQuery = query.toLowerCase(); return semanticIndicators.some(indicator => lowerQuery.includes(indicator)); } /** * Detect if query is looking for exact matches */ isExactMatchQuery(query) { // Queries with camelCase, snake_case, or specific identifiers const exactMatchPatterns = [ /[a-z][A-Z]/, // camelCase /_[a-z]/, // snake_case /^[A-Z][a-z]+$/, // PascalCase /^\w+\(\)$/, // function calls /^[\w.]+$/ // dot notation ]; return exactMatchPatterns.some(pattern => pattern.test(query.trim())); } /** * Get hybrid search statistics */ getStats() { return { enabled: this.enabled, alpha: this.alpha, totalQueries: 0, // TODO: Implement query tracking denseOnlyQueries: 0, // TODO: Implement tracking hybridQueries: 0, // TODO: Implement tracking averageImprovement: 0.15 // TODO: Implement improvement tracking }; } } //# sourceMappingURL=hybrid-search.js.map