@knath2000/codebase-indexing-mcp
MCP server for codebase indexing with Voyage AI embeddings and Qdrant vector storage
JavaScript
export class HybridSearchService {
    constructor(config) {
        this.enabled = config.enableHybridSearch;
        this.alpha = config.hybridSearchAlpha;
    }
    /**
     * Check if hybrid search is enabled
     */
    isEnabled() {
        return this.enabled;
    }
    /**
     * Perform hybrid search combining dense and sparse retrieval
     */
    async hybridSearch(_query, denseResults, sparseResults) {
        if (!this.enabled || !sparseResults) {
            // Return dense-only results if hybrid is disabled or sparse unavailable
            return {
                denseResults,
                sparseResults: sparseResults || [],
                combinedResults: denseResults,
                alpha: 1.0
            };
        }
        console.log(`🔀 [HybridSearch] Combining ${denseResults.length} dense + ${sparseResults.length} sparse results`);
        try {
            // Combine and score results
            const combinedResults = this.combineResults(denseResults, sparseResults, this.alpha);
            console.log(`✅ [HybridSearch] Combined to ${combinedResults.length} results with α=${this.alpha}`);
            return {
                denseResults,
                sparseResults,
                combinedResults,
                alpha: this.alpha
            };
        }
        catch (error) {
            console.error(`❌ [HybridSearch] Hybrid search failed:`, error);
            // Fallback to dense results only
            return {
                denseResults,
                sparseResults: sparseResults || [],
                combinedResults: denseResults,
                alpha: 1.0
            };
        }
    }
    /**
     * Combine dense and sparse results using weighted scoring
     */
    combineResults(denseResults, sparseResults, alpha) {
        // Create maps for efficient lookup
        const denseMap = new Map();
        const sparseMap = new Map();
        // Normalize scores to 0-1 range
        const maxDenseScore = Math.max(...denseResults.map(r => r.score), 0.01);
        const maxSparseScore = Math.max(...sparseResults.map(r => r.score), 0.01);
        // Index dense results
        denseResults.forEach(result => {
            const normalizedResult = {
                ...result,
                score: result.score / maxDenseScore
            };
            denseMap.set(result.id, normalizedResult);
        });
        // Index sparse results
        sparseResults.forEach(result => {
            const normalizedResult = {
                ...result,
                score: result.score / maxSparseScore
            };
            sparseMap.set(result.id, normalizedResult);
        });
        // Get all unique result IDs
        const allIds = new Set([...denseMap.keys(), ...sparseMap.keys()]);
        // Combine scores for each result
        const combinedResults = [];
        for (const id of allIds) {
            const denseResult = denseMap.get(id);
            const sparseResult = sparseMap.get(id);
            // Calculate hybrid score: α * dense + (1-α) * sparse
            const denseScore = denseResult?.score || 0;
            const sparseScore = sparseResult?.score || 0;
            const hybridScore = alpha * denseScore + (1 - alpha) * sparseScore;
            // Use the result with more complete data (prefer dense, fallback to sparse)
            const baseResult = denseResult || sparseResult;
            if (!baseResult)
                continue;
            const combinedResult = {
                ...baseResult,
                score: hybridScore,
                hybridScore: {
                    dense: denseScore,
                    sparse: sparseScore,
                    combined: hybridScore
                }
            };
            combinedResults.push(combinedResult);
        }
        // Sort by combined score (descending)
        combinedResults.sort((a, b) => b.score - a.score);
        return combinedResults;
    }
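    // Worked example of the weighted combination above (illustrative numbers, not
    // taken from the package): with α = 0.7, a chunk with normalized dense score
    // 0.9 and sparse score 0.5 scores 0.7 * 0.9 + 0.3 * 0.5 = 0.78, while a
    // dense-only chunk at 0.8 scores 0.7 * 0.8 + 0.3 * 0 = 0.56, so the first
    // ranks higher after the descending sort.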
    /**
     * Generate sparse vector representation for BM25-style search
     * This is a simplified implementation - in production, you'd use a proper BM25 library
     */
    generateSparseVector(text, vocabulary) {
        const terms = this.tokenize(text);
        const termFreq = new Map();
        // Count term frequencies
        terms.forEach(term => {
            termFreq.set(term, (termFreq.get(term) || 0) + 1);
        });
        const indices = [];
        const values = [];
        // Convert to sparse vector format
        termFreq.forEach((freq, term) => {
            const termId = vocabulary.get(term);
            if (termId !== undefined) {
                indices.push(termId);
                values.push(freq); // Could apply TF-IDF weighting here
            }
        });
        return { indices, values };
    }
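    // Illustrative example (hypothetical text and vocabulary, not part of the
    // package): tokenize("Parse the JSON config") yields ["parse", "json", "config"]
    // ("the" is a stop word). With a vocabulary Map of "json" -> 2 and "config" -> 7,
    // generateSparseVector returns { indices: [2, 7], values: [1, 1] }; "parse" is
    // dropped because it has no vocabulary entry.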
    /**
     * Simple tokenization for sparse vector generation
     */
    tokenize(text) {
        return text
            .toLowerCase()
            .replace(/[^\w\s]/g, ' ')
            .split(/\s+/)
            .filter(token => token.length > 1)
            .filter(token => !this.isStopWord(token));
    }
    /**
     * Check if a word is a stop word
     */
    isStopWord(word) {
        const stopWords = new Set([
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
            'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could',
            'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'
        ]);
        return stopWords.has(word);
    }
    /**
     * Adjust the alpha parameter for different query types
     */
    adaptiveAlpha(query) {
        let adaptedAlpha = this.alpha;
        // Boost dense search for semantic queries
        if (this.isSemanticQuery(query.query)) {
            adaptedAlpha = Math.min(1.0, this.alpha + 0.1);
        }
        // Boost sparse search for exact matches and identifier searches
        if (this.isExactMatchQuery(query.query)) {
            adaptedAlpha = Math.max(0.0, this.alpha - 0.2);
        }
        return adaptedAlpha;
    }
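    // Illustrative behavior (assuming a configured α of 0.7): "how to implement
    // rate limiting" contains a semantic indicator, so adaptiveAlpha returns 0.8
    // (more weight on dense retrieval), while "getUserById" matches the camelCase
    // pattern and returns 0.5 (more weight on sparse retrieval).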
    /**
     * Detect if query is semantic in nature
     */
    isSemanticQuery(query) {
        const semanticIndicators = [
            'how to', 'what is', 'explain', 'implement', 'create', 'build',
            'algorithm', 'pattern', 'similar to', 'like', 'example'
        ];
        const lowerQuery = query.toLowerCase();
        return semanticIndicators.some(indicator => lowerQuery.includes(indicator));
    }
    /**
     * Detect if query is looking for exact matches
     */
    isExactMatchQuery(query) {
        // Queries with camelCase, snake_case, or specific identifiers
        const exactMatchPatterns = [
            /[a-z][A-Z]/, // camelCase
            /_[a-z]/, // snake_case
            /^[A-Z][a-z]+$/, // PascalCase
            /^\w+\(\)$/, // function calls
            /^[\w.]+$/ // dot notation
        ];
        return exactMatchPatterns.some(pattern => pattern.test(query.trim()));
    }
    /**
     * Get hybrid search statistics
     */
    getStats() {
        return {
            enabled: this.enabled,
            alpha: this.alpha,
            totalQueries: 0, // TODO: Implement query tracking
            denseOnlyQueries: 0, // TODO: Implement tracking
            hybridQueries: 0, // TODO: Implement tracking
            averageImprovement: 0.15 // TODO: Implement improvement tracking
        };
    }
}
//# sourceMappingURL=hybrid-search.js.map
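
A minimal usage sketch (illustrative, not part of the package): the config keys enableHybridSearch and hybridSearchAlpha are the ones read by the constructor above, the result objects carry only the id and score fields the code relies on, and the relative import path is an assumption.

import { HybridSearchService } from './hybrid-search.js'; // path assumed for illustration

const hybrid = new HybridSearchService({
    enableHybridSearch: true,
    hybridSearchAlpha: 0.7 // α: weight given to dense (embedding) scores
});

// Stand-in results; in the server these would come from Qdrant (dense, via
// Voyage AI embeddings) and a BM25-style sparse index.
const denseResults = [
    { id: 'chunk-1', score: 0.92 },
    { id: 'chunk-2', score: 0.74 }
];
const sparseResults = [
    { id: 'chunk-2', score: 8.1 },
    { id: 'chunk-3', score: 5.4 }
];

const { combinedResults, alpha } = await hybrid.hybridSearch(
    { query: 'getUserById' }, // first argument is currently unused (_query)
    denseResults,
    sparseResults
);
console.log(alpha, combinedResults.map(r => ({ id: r.id, score: r.score })));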