codevault
Version:
AI-powered semantic code search via Model Context Protocol
542 lines • 22.3 kB
JavaScript
import fs from 'fs';
import path from 'path';
import { createEmbeddingProvider } from '../providers/index.js';
import { Database } from '../database/db.js';
import { readCodemap } from '../codemap/io.js';
import { normalizeScopeFilters, applyScope } from '../search/scope.js';
import { BM25Index } from '../search/bm25.js';
import { reciprocalRankFusion } from '../search/hybrid.js';
import { rerankWithAPI } from '../ranking/api-reranker.js';
import { applySymbolBoost } from '../ranking/symbol-boost.js';
import { readChunkFromDisk } from '../storage/encrypted-chunks.js';
// Cache size limits keep the module-level caches bounded in long-running processes.
/**
 * Reads an integer setting from the environment.
 *
 * @param {string} name - Environment variable to read.
 * @param {number} fallback - Value used when the variable is unset, empty, or
 *   does not start with a parsable integer.
 * @returns {number} The parsed integer, or `fallback`.
 */
function readIntEnv(name, fallback) {
  const parsed = Number.parseInt(process.env[name] || String(fallback), 10);
  // A NaN limit would make every `size >= limit` check false and silently
  // disable eviction, so fall back to the default instead.
  return Number.isNaN(parsed) ? fallback : parsed;
}
const MAX_BM25_CACHE_SIZE = readIntEnv('CODEVAULT_MAX_BM25_CACHE', 10);
const MAX_CHUNK_TEXT_CACHE_SIZE = readIntEnv('CODEVAULT_MAX_CHUNK_CACHE', 1000);
// Both caches map a string key to an entry object that carries a `lastAccess` timestamp.
const bm25IndexCache = new Map();
const chunkTextCache = new Map();
// Upper bound on how many candidates are handed to the API reranker.
const RERANKER_MAX_CANDIDATES = readIntEnv('CODEVAULT_RERANKER_MAX', 50);
/**
 * Makes room in the BM25 index cache before a new entry is inserted.
 *
 * When the cache has reached MAX_BM25_CACHE_SIZE, the entry with the smallest
 * `lastAccess` timestamp is removed (the first such entry wins on ties).
 * At most one entry is removed per call.
 */
function evictOldestBm25Index() {
  // Written as a negated `>=` so a NaN limit keeps its original "never evict" meaning.
  if (!(bm25IndexCache.size >= MAX_BM25_CACHE_SIZE)) {
    return;
  }
  let stalest = { key: null, stamp: Infinity };
  bm25IndexCache.forEach((entry, key) => {
    if (entry.lastAccess < stalest.stamp) {
      stalest = { key, stamp: entry.lastAccess };
    }
  });
  if (stalest.key) {
    bm25IndexCache.delete(stalest.key);
  }
}
/**
 * Makes room in the chunk text cache before a new entry is inserted.
 *
 * Once the cache holds MAX_CHUNK_TEXT_CACHE_SIZE entries, the one whose
 * `lastAccess` timestamp is lowest is dropped (earliest inserted wins a tie).
 * A single call removes no more than one entry.
 */
function evictOldestChunkText() {
  const isFull = chunkTextCache.size >= MAX_CHUNK_TEXT_CACHE_SIZE;
  if (!isFull) {
    return;
  }
  let victimKey = null;
  let lowestStamp = Infinity;
  for (const [cacheKey, { lastAccess }] of chunkTextCache) {
    if (lastAccess < lowestStamp) {
      victimKey = cacheKey;
      lowestStamp = lastAccess;
    }
  }
  if (victimKey) {
    chunkTextCache.delete(victimKey);
  }
}
/**
 * Empties both module-level search caches (BM25 indexes and chunk text).
 * Exposed so long-running processes can release the cached data on demand.
 */
export function clearSearchCaches() {
  [bm25IndexCache, chunkTextCache].forEach((cache) => {
    cache.clear();
  });
}
/**
 * Produces a canonical form of a search query: lower-cased, question marks
 * (`?` and `¿`) removed, whitespace runs collapsed to one space, and no
 * leading or trailing whitespace.
 *
 * @param {string} query - Raw query text.
 * @returns {string} The normalized query.
 */
function normalizeQuery(query) {
  return query
    .toLowerCase()
    .replace(/[¿?]/g, '')
    .replace(/\s+/g, ' ')
    // Trim last: removing a question mark can expose whitespace at either end
    // (e.g. "what is this ?"), which an earlier trim would miss.
    .trim();
}
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} The cosine similarity, or 0 when the vectors differ in
 *   length or the similarity is undefined (empty or zero-magnitude vectors,
 *   or non-finite components).
 */
function cosineSimilarity(a, b) {
  if (a.length !== b.length) {
    return 0;
  }
  let dotProduct = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
  // A zero-magnitude vector yields 0/0 = NaN, which would poison score
  // clamping and sort comparators downstream; treat it as "no similarity".
  return Number.isFinite(similarity) ? similarity : 0;
}
/**
 * Builds the BM25 index cache key for one project/provider/dimension combination.
 *
 * @param {string} basePath - Project root the index belongs to.
 * @param {string} providerName - Embedding provider name.
 * @param {number} dimensions - Embedding dimensionality.
 * @returns {string} The three parts joined with `::`.
 */
function getBm25CacheKey(basePath, providerName, dimensions) {
  const projectPart = `${basePath}`;
  const providerPart = `${providerName}`;
  const dimensionPart = `${dimensions}`;
  return `${projectPart}::${providerPart}::${dimensionPart}`;
}
/**
 * Builds the chunk text cache key for a chunk within a project.
 *
 * @param {string} basePath - Project root the chunk belongs to.
 * @param {string} sha - Chunk hash.
 * @returns {string} `basePath` and `sha` joined with `::`.
 */
function getChunkCacheKey(basePath, sha) {
  const projectPart = `${basePath}`;
  const shaPart = `${sha}`;
  return `${projectPart}::${shaPart}`;
}
/**
 * Returns the code text of a chunk, going through the chunk text cache.
 *
 * A cache hit refreshes the entry's `lastAccess` timestamp. On a miss the
 * chunk is read with readChunkFromDisk and the outcome is cached, including
 * the `null` stored when the read throws or returns nothing, so a failed read
 * is not retried while its entry stays cached.
 *
 * @param {string} sha - Chunk hash; a falsy value short-circuits to `null`.
 * @param {string} chunkDir - Directory handed to readChunkFromDisk.
 * @param {string} basePath - Project root, used only to build the cache key.
 * @returns {string|null} The chunk's code, or `null` when unavailable.
 */
function readChunkTextCached(sha, chunkDir, basePath) {
  if (!sha) {
    return null;
  }
  const cacheKey = getChunkCacheKey(basePath, sha);
  const hit = chunkTextCache.get(cacheKey);
  if (hit) {
    hit.lastAccess = Date.now();
    return hit.text;
  }
  let text;
  try {
    const loaded = readChunkFromDisk({ chunkDir, sha });
    text = loaded ? loaded.code : null;
  }
  catch (error) {
    // Read failures are deliberately recorded as "no text" rather than propagated.
    text = null;
  }
  evictOldestChunkText();
  chunkTextCache.set(cacheKey, { text, lastAccess: Date.now() });
  return text;
}
/**
 * Assembles the text that represents a chunk in the BM25 index.
 *
 * The chunk's symbol, file path, description and intent are followed by the
 * code text; any field that is not a string or is blank is skipped, and the
 * remaining ones are joined with newlines.
 *
 * @param {object|null|undefined} chunk - Chunk metadata record.
 * @param {string|null|undefined} codeText - Source text of the chunk, if available.
 * @returns {string} The combined document, or '' for a missing chunk.
 */
function buildBm25Document(chunk, codeText) {
  if (!chunk) {
    return '';
  }
  const fields = [
    chunk.symbol,
    chunk.file_path,
    chunk.codevault_description,
    chunk.codevault_intent,
    codeText
  ];
  const lines = [];
  for (const field of fields) {
    if (typeof field !== 'string') {
      continue;
    }
    if (field.trim().length === 0) {
      continue;
    }
    lines.push(field);
  }
  return lines.join('\n');
}
/**
 * Returns the cached BM25 index for a project/provider/dimension combination,
 * creating it if needed and adding any of the given chunks it has not seen yet.
 *
 * Every chunk with an id is recorded as seen after its first visit, even when
 * its document text turns out to be blank, so later calls do not process it again.
 *
 * @param {string} basePath - Project root; part of the cache key.
 * @param {string} chunkDir - Directory the chunk text is read from.
 * @param {string} providerName - Embedding provider name; part of the cache key.
 * @param {number} dimensions - Embedding dimensionality; part of the cache key.
 * @param {Array<object>} chunks - Chunk records (`id`, `sha` and text fields are read).
 * @returns {object|null} The BM25 index, or `null` when `chunks` is not a non-empty array.
 */
function ensureBm25IndexForChunks(basePath, chunkDir, providerName, dimensions, chunks) {
  const hasChunks = Array.isArray(chunks) && chunks.length > 0;
  if (!hasChunks) {
    return null;
  }
  const cacheKey = getBm25CacheKey(basePath, providerName, dimensions);
  let cacheEntry = bm25IndexCache.get(cacheKey);
  if (cacheEntry) {
    // Existing entry: only refresh its access time.
    cacheEntry.lastAccess = Date.now();
  }
  else {
    // New entry: make room first, then register an empty index.
    evictOldestBm25Index();
    cacheEntry = { index: new BM25Index(), added: new Set(), lastAccess: Date.now() };
    bm25IndexCache.set(cacheKey, cacheEntry);
  }
  const { index, added } = cacheEntry;
  const pendingDocs = [];
  for (const chunk of chunks) {
    const isNew = Boolean(chunk) && Boolean(chunk.id) && !added.has(chunk.id);
    if (!isNew) {
      continue;
    }
    const text = buildBm25Document(chunk, readChunkTextCached(chunk.sha, chunkDir, basePath));
    if (text && text.trim().length > 0) {
      pendingDocs.push({ id: chunk.id, text });
    }
    added.add(chunk.id);
  }
  if (pendingDocs.length > 0) {
    index.addDocuments(pendingDocs);
  }
  index.consolidate();
  return index;
}
/**
 * Searches the indexed chunks of a project for a query.
 *
 * Each in-scope chunk is scored by cosine similarity between its stored
 * embedding and the query embedding, plus small boosts for intent/tag matches
 * and documentation-like file paths. Depending on the scope options the
 * candidates are then optionally fused with BM25 results, adjusted by the
 * symbol boost, and reranked through the API reranker.
 *
 * @param {string} query - Search text; a blank query returns getOverview() instead.
 * @param {number} [limit=10] - Maximum number of results.
 * @param {string} [provider='auto'] - Embedding provider, unless the scope overrides it.
 * @param {string} [workingPath='.'] - Project root containing `.codevault/`.
 * @param {object} [scopeOptions={}] - Passed to normalizeScopeFilters; the normalized
 *   `provider`, `hybrid`, `bm25`, `symbol_boost` and `reranker` fields are read here.
 * @returns {Promise<object>} A result object with `success`, `results` and
 *   diagnostic fields. Failures inside the search are reported as
 *   `success: false` with an `error` code rather than thrown.
 */
export async function searchCode(query, limit = 10, provider = 'auto', workingPath = '.', scopeOptions = {}) {
  const basePath = path.resolve(workingPath);
  const dbPath = path.join(basePath, '.codevault/codevault.db');
  const chunkDir = path.join(basePath, '.codevault/chunks');
  const codemapPath = path.join(basePath, 'codevault.codemap.json');
  // A blank query has nothing to rank against; return the overview listing.
  if (!query || !query.trim()) {
    return getOverview(limit, workingPath);
  }
  const normalizedScope = normalizeScopeFilters(scopeOptions);
  const effectiveProvider = normalizedScope.provider || provider;
  // Each feature stays on unless the scope explicitly sets it to `false`.
  const hybridEnabled = normalizedScope.hybrid !== false;
  const bm25Enabled = normalizedScope.bm25 !== false;
  const symbolBoostEnabled = normalizedScope.symbol_boost !== false;
  const embeddingProvider = createEmbeddingProvider(effectiveProvider);
  // Declared outside the try so the finally block can close it on every path.
  let db = null;
  try {
    if (!fs.existsSync(dbPath)) {
      return {
        success: false,
        error: 'database_not_found',
        message: `Database not found at ${dbPath}. Project needs to be indexed first.`,
        suggestion: `Run index_project on directory: ${workingPath}`,
        provider: embeddingProvider.getName(),
        scope: normalizedScope,
        hybrid: { enabled: hybridEnabled, bm25Enabled },
        symbolBoost: { enabled: symbolBoostEnabled, boosted: false },
        reranker: normalizedScope.reranker,
        results: []
      };
    }
    db = new Database(dbPath);
    const chunks = await db.getChunks(embeddingProvider.getName(), embeddingProvider.getDimensions());
    const codemapData = readCodemap(codemapPath);
    if (chunks.length === 0) {
      return {
        success: false,
        error: 'no_chunks_found',
        message: `No indexed chunks found with ${embeddingProvider.getName()} in ${basePath}`,
        suggestion: `Run: codevault index --provider ${effectiveProvider} from ${basePath}`,
        provider: embeddingProvider.getName(),
        scope: normalizedScope,
        hybrid: { enabled: hybridEnabled, bm25Enabled },
        // Included so this failure has the same shape as the other failure responses.
        symbolBoost: { enabled: symbolBoostEnabled, boosted: false },
        reranker: normalizedScope.reranker,
        results: []
      };
    }
    const scopedChunks = applyScope(chunks, normalizedScope);
    const chunkInfoById = new Map();
    const results = [];
    let queryEmbedding = null;
    // Only pay for an embedding call when there is something to compare against.
    if (scopedChunks.length > 0) {
      if (embeddingProvider.init) {
        await embeddingProvider.init();
      }
      queryEmbedding = await embeddingProvider.generateEmbedding(query);
    }
    // Loop-invariant: lower-case the query once for the intent/tag checks.
    const queryLower = query.toLowerCase();
    for (const chunk of scopedChunks) {
      const embedding = JSON.parse(chunk.embedding.toString());
      const vectorSimilarity = queryEmbedding ? cosineSimilarity(queryEmbedding, embedding) : 0;
      let boostScore = 0;
      // +0.2 when the chunk's intent text appears verbatim in the query.
      if (chunk.codevault_intent && queryLower.includes(chunk.codevault_intent.toLowerCase())) {
        boostScore += 0.2;
      }
      // +0.1 for every tag that appears in the query.
      if (chunk.codevault_tags) {
        try {
          const tags = JSON.parse(chunk.codevault_tags || '[]');
          tags.forEach((tag) => {
            if (typeof tag === 'string' && queryLower.includes(tag.toLowerCase())) {
              boostScore += 0.1;
            }
          });
        }
        catch (error) {
          // Ignore tag parsing errors
        }
      }
      // Documentation-like paths receive a flat +0.15.
      let docBoost = 0;
      const filePath = chunk.file_path.toLowerCase();
      if (filePath.includes('readme') ||
        filePath.includes('/docs/') ||
        filePath.startsWith('docs/') ||
        filePath.includes('changelog') ||
        filePath.includes('contributing') ||
        filePath.endsWith('.md')) {
        docBoost = 0.15;
      }
      const finalScore = Math.min(vectorSimilarity + boostScore + docBoost, 1.0);
      const info = {
        id: chunk.id,
        file_path: chunk.file_path,
        symbol: chunk.symbol,
        sha: chunk.sha,
        lang: chunk.lang,
        chunk_type: chunk.chunk_type,
        codevault_intent: chunk.codevault_intent,
        codevault_description: chunk.codevault_description,
        score: finalScore,
        vectorScore: vectorSimilarity,
        boostScore: boostScore
      };
      chunkInfoById.set(chunk.id, info);
      results.push(info);
    }
    if (symbolBoostEnabled) {
      try {
        // Operates on `results` in place; the `symbolBoost` / `symbolBoostSources`
        // fields read further down are expected to come from this call.
        applySymbolBoost(results, { query, codemap: codemapData });
      }
      catch (error) {
        // Symbol boost fails silently
      }
    }
    const sortedResults = results.sort((a, b) => b.score - a.score);
    const remainingSlots = limit;
    let vectorResults = [];
    let bm25Fused = false;
    let bm25CandidateCount = 0;
    if (remainingSlots > 0) {
      // Consider at least 60 candidates so fusion/reranking has a pool to draw on.
      const selectionBudget = Math.max(remainingSlots, 60);
      const vectorPool = sortedResults.slice(0, selectionBudget);
      if (hybridEnabled && bm25Enabled) {
        const bm25Index = ensureBm25IndexForChunks(basePath, chunkDir, embeddingProvider.getName(), embeddingProvider.getDimensions(), scopedChunks);
        if (bm25Index) {
          // The cached index can hold chunks outside the current scope; filter them out.
          const allowedIds = new Set(scopedChunks.map((chunk) => chunk.id));
          const bm25RawResults = bm25Index.search(query, selectionBudget);
          const bm25Results = bm25RawResults.filter(result => allowedIds.has(result.id));
          bm25CandidateCount = bm25Results.length;
          if (bm25Results.length > 0) {
            const fused = reciprocalRankFusion({
              vectorResults: vectorPool.map((item) => ({ id: item.id, score: item.score })),
              bm25Results: bm25Results.map(item => ({ id: item.id, score: item.score })),
              limit: selectionBudget,
              k: 60
            });
            if (fused.length > 0) {
              bm25Fused = true;
              vectorResults = fused
                .map(entry => {
                  const info = chunkInfoById.get(entry.id);
                  if (!info) {
                    return null;
                  }
                  info.hybridScore = entry.score;
                  info.bm25Score = entry.bm25Score;
                  info.bm25Rank = entry.bm25Rank;
                  info.vectorRank = entry.vectorRank;
                  return info;
                })
                .filter(Boolean);
            }
          }
        }
      }
      // Without usable fused results, fall back to the plain vector ranking.
      if (vectorResults.length === 0) {
        vectorResults = vectorPool;
      }
      const hasSymbolBoost = symbolBoostEnabled && vectorResults.some((candidate) => typeof candidate.symbolBoost === 'number' && candidate.symbolBoost > 0);
      if (hasSymbolBoost && vectorResults.length > 1) {
        // Order by score, then symbol boost, then hybrid score.
        vectorResults.sort((a, b) => {
          const scoreA = typeof a.score === 'number' ? a.score : 0;
          const scoreB = typeof b.score === 'number' ? b.score : 0;
          if (scoreB !== scoreA) {
            return scoreB - scoreA;
          }
          const boostA = typeof a.symbolBoost === 'number' ? a.symbolBoost : 0;
          const boostB = typeof b.symbolBoost === 'number' ? b.symbolBoost : 0;
          if (boostB !== boostA) {
            return boostB - boostA;
          }
          const hybridA = typeof a.hybridScore === 'number' ? a.hybridScore : Number.NEGATIVE_INFINITY;
          const hybridB = typeof b.hybridScore === 'number' ? b.hybridScore : Number.NEGATIVE_INFINITY;
          return hybridB - hybridA;
        });
      }
      vectorResults = vectorResults.slice(0, remainingSlots);
      if (vectorResults.length > 1 && normalizedScope.reranker === 'api') {
        try {
          const reranked = await rerankWithAPI(query, vectorResults, {
            max: Math.min(RERANKER_MAX_CANDIDATES, vectorResults.length),
            getText: (candidate) => {
              const codeText = readChunkTextCached(candidate.sha, chunkDir, basePath) || '';
              return buildBm25Document(candidate, codeText);
            }
          });
          // Accept the reranked list only if it did not drop or add candidates.
          if (Array.isArray(reranked) && reranked.length === vectorResults.length) {
            vectorResults = reranked;
          }
        }
        catch (error) {
          // Silent fallback when reranker is unavailable
        }
      }
    }
    const vectorSearchType = bm25Fused ? 'hybrid' : 'vector';
    const combinedResults = vectorResults.map((result) => {
      const rawScore = typeof result.score === 'number' ? result.score : 0;
      const meta = {
        id: result.id,
        symbol: result.symbol,
        score: Math.min(1, rawScore),
        intent: result.codevault_intent,
        description: result.codevault_description,
        searchType: vectorSearchType,
        vectorScore: result.vectorScore
      };
      // Optional diagnostics are copied only when they were actually produced.
      if (typeof result.hybridScore === 'number') {
        meta.hybridScore = result.hybridScore;
      }
      if (typeof result.bm25Score === 'number') {
        meta.bm25Score = result.bm25Score;
      }
      if (typeof result.bm25Rank === 'number') {
        meta.bm25Rank = result.bm25Rank;
      }
      if (typeof result.vectorRank === 'number') {
        meta.vectorRank = result.vectorRank;
      }
      if (typeof result.rerankerScore === 'number') {
        meta.rerankerScore = result.rerankerScore;
      }
      if (typeof result.rerankerRank === 'number') {
        meta.rerankerRank = result.rerankerRank;
      }
      if (typeof result.symbolBoost === 'number' && result.symbolBoost > 0) {
        meta.symbolBoost = result.symbolBoost;
        if (Array.isArray(result.symbolBoostSources) && result.symbolBoostSources.length > 0) {
          meta.symbolBoostSources = result.symbolBoostSources;
        }
      }
      // `score` is clamped to 1; keep the unclamped value when it was higher.
      if (typeof rawScore === 'number' && rawScore > 1) {
        meta.scoreRaw = rawScore;
      }
      return {
        type: 'code',
        lang: result.lang,
        path: result.file_path,
        sha: result.sha,
        data: null,
        meta
      };
    });
    // Reranker scores decide the order when both sides have one; otherwise `score` does.
    combinedResults.sort((a, b) => {
      const hasRerankerA = typeof a.meta?.rerankerScore === 'number';
      const hasRerankerB = typeof b.meta?.rerankerScore === 'number';
      if (hasRerankerA && hasRerankerB) {
        return b.meta.rerankerScore - a.meta.rerankerScore;
      }
      const scoreA = a.meta?.score ?? 0;
      const scoreB = b.meta?.score ?? 0;
      return scoreB - scoreA;
    });
    if (combinedResults.length === 0) {
      return {
        success: false,
        error: 'no_relevant_matches',
        message: `No relevant matches found for "${query}"`,
        suggestion: 'Try broader search terms or check if the project is properly indexed',
        provider: embeddingProvider.getName(),
        scope: normalizedScope,
        hybrid: { enabled: hybridEnabled, bm25Enabled },
        symbolBoost: { enabled: symbolBoostEnabled, boosted: false },
        reranker: normalizedScope.reranker,
        results: []
      };
    }
    // Usage statistics are best-effort: a failed write (for example a read-only
    // database) must not throw away results that were already computed.
    try {
      if (symbolBoostEnabled && combinedResults.length > 0 && combinedResults[0].meta.score > 0.8) {
        await db.recordIntention(normalizeQuery(query), query, combinedResults[0].sha, combinedResults[0].meta.score);
      }
      // Replace specific identifiers with placeholders before recording the query pattern.
      const pattern = query
        .toLowerCase()
        .replace(/\b[\w-]+Session\b/gi, '[SESSION]')
        .replace(/\bstripe\b/gi, '[PAYMENT_PROVIDER]')
        .replace(/\b\w+Service\b/gi, '[SERVICE]')
        .replace(/\b\w+Controller\b/gi, '[CONTROLLER]')
        .trim();
      await db.recordQueryPattern(pattern);
    }
    catch (statsError) {
      console.error('Failed to record search statistics:', statsError);
    }
    return {
      success: true,
      query,
      searchType: bm25Fused ? 'hybrid' : 'vector',
      vectorResults: vectorResults.length,
      provider: embeddingProvider.getName(),
      scope: normalizedScope,
      reranker: normalizedScope.reranker,
      hybrid: {
        enabled: hybridEnabled,
        bm25Enabled,
        fused: bm25Fused,
        bm25Candidates: bm25CandidateCount
      },
      symbolBoost: {
        enabled: symbolBoostEnabled,
        boosted: symbolBoostEnabled && vectorResults.some((result) => typeof result.symbolBoost === 'number' && result.symbolBoost > 0)
      },
      results: combinedResults
    };
  }
  catch (error) {
    console.error('Error in searchCode:', error);
    return {
      success: false,
      error: 'search_error',
      message: error.message,
      provider: embeddingProvider.getName(),
      scope: normalizedScope,
      hybrid: { enabled: hybridEnabled, bm25Enabled },
      symbolBoost: { enabled: symbolBoostEnabled, boosted: false },
      reranker: normalizedScope.reranker,
      results: []
    };
  }
  finally {
    // Runs after every return above, so the connection is released on all paths.
    if (db) {
      try {
        db.close();
      }
      catch (closeError) {
        // Ignore close errors during cleanup
      }
    }
  }
}
/**
 * Lists indexed chunks of a project without ranking them against a query.
 *
 * @param {number} [limit=20] - Maximum number of chunks requested from the database.
 * @param {string} [workingPath='.'] - Project root containing `.codevault/`.
 * @returns {Promise<object>} On success `{ success: true, provider: 'overview', results }`,
 *   where every result carries a fixed score of 1.0. Failures are reported as
 *   `success: false` with an `error` code (`database_not_found` or
 *   `overview_error`) instead of being thrown.
 */
export async function getOverview(limit = 20, workingPath = '.') {
  const basePath = path.resolve(workingPath);
  const dbPath = path.join(basePath, '.codevault/codevault.db');
  // Declared outside the try so the finally block can close it on every path.
  let db = null;
  try {
    if (!fs.existsSync(dbPath)) {
      return {
        success: false,
        error: 'database_not_found',
        message: `Database not found at ${dbPath}. Project needs to be indexed first.`,
        suggestion: `Run index_project on directory: ${workingPath}`,
        provider: 'unknown',
        results: []
      };
    }
    db = new Database(dbPath);
    const chunks = await db.getOverviewChunks(limit);
    const results = chunks.map(chunk => ({
      type: 'code',
      lang: chunk.lang,
      path: chunk.file_path,
      sha: chunk.sha,
      data: null,
      meta: {
        id: chunk.id,
        symbol: chunk.symbol,
        score: 1.0
      }
    }));
    return {
      success: true,
      provider: 'overview',
      results
    };
  }
  catch (error) {
    return {
      success: false,
      error: 'overview_error',
      message: error.message,
      provider: 'overview',
      results: []
    };
  }
  finally {
    // Release the connection even when the query above rejected; previously the
    // handle leaked on that path because close() was only reached on success.
    if (db) {
      try {
        db.close();
      }
      catch (closeError) {
        // Ignore close errors during cleanup
      }
    }
  }
}
/**
 * Loads the code of a single chunk from `<workingPath>/.codevault/chunks`.
 *
 * @param {string} sha - Hash identifying the chunk.
 * @param {string} [workingPath='.'] - Project root containing `.codevault/`.
 * @returns {Promise<object>} `{ success: true, code }` when the chunk was read,
 *   otherwise `{ success: false, error }` with a message. Errors are never thrown
 *   for a missing, unreadable, or encrypted chunk.
 */
export async function getChunk(sha, workingPath = '.') {
  const chunkDir = path.join(path.resolve(workingPath), '.codevault/chunks');
  try {
    const chunk = readChunkFromDisk({ chunkDir, sha });
    if (chunk) {
      return { success: true, code: chunk.code };
    }
    // Name both candidate files so the caller can see where the lookup went.
    const plainPath = path.join(chunkDir, `${sha}.gz`);
    const encryptedPath = path.join(chunkDir, `${sha}.gz.enc`);
    throw new Error(`Chunk ${sha} not found at ${plainPath} or ${encryptedPath}`);
  }
  catch (error) {
    const needsKey = Boolean(error) && error.code === 'ENCRYPTION_KEY_REQUIRED';
    const message = needsKey
      ? `Chunk ${sha} is encrypted. Configure CODEVAULT_ENCRYPTION_KEY to decrypt.`
      : error.message;
    return { success: false, error: message };
  }
}
//# sourceMappingURL=search.js.map