hikma-engine
Version:
Code Knowledge Graph Indexer - A sophisticated TypeScript-based indexer that transforms Git repositories into multi-dimensional knowledge stores for AI agents
440 lines (439 loc) • 18.6 kB
JavaScript
/**
* @file Enhanced search functionality specifically designed for the embedding_nodes table.
* Provides semantic vector search and metadata-based queries using the unified embedding storage.
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.EnhancedSearchService = void 0;
const embedding_service_1 = require("./embedding-service");
const db_clients_1 = require("../persistence/db-clients");
const logger_1 = require("../utils/logger");
const error_handling_1 = require("../utils/error-handling");
/**
 * Search service over the `embedding_nodes` SQLite table.
 *
 * Offers vector-similarity search (through the `vec_distance_cosine` SQL function),
 * a LIKE-based text fallback, metadata filtering, and a hybrid of semantic and
 * metadata search. Every search drops results whose file path looks like a test
 * file (see `filterOutTestFiles`). That filtering happens after the SQL LIMIT has
 * been applied, so a search can return fewer than `limit` results and the `rank`
 * values (1-based position in the SQL result) can contain gaps.
 */
class EnhancedSearchService {
  /**
   * @param config Configuration object. Must expose `getDatabaseConfig()` returning
   *   an object with `sqlite.path`; it is also handed to the EmbeddingService.
   */
  constructor(config) {
    this.logger = (0, logger_1.getLogger)('EnhancedSearchService');
    this.isInitialized = false;
    this.config = config;
    const dbConfig = config.getDatabaseConfig();
    this.embeddingService = new embedding_service_1.EmbeddingService(config);
    this.sqliteClient = new db_clients_1.SQLiteClient(dbConfig.sqlite.path);
  }

  /**
   * Loads the embedding model, connects to SQLite and verifies the table layout.
   * Does nothing when the service is already initialized.
   *
   * @throws Rethrows any failure from model loading, connecting or verification.
   */
  async initialize() {
    if (this.isInitialized) {
      this.logger.debug('Enhanced search service already initialized');
      return;
    }
    // `logger.operation` returns a callback that is invoked once the operation ends.
    const operation = this.logger.operation('Initializing enhanced search service');
    try {
      this.logger.info('Loading embedding model...');
      await this.embeddingService.loadModel();
      this.logger.info('Connecting to SQLite database...');
      await this.sqliteClient.connect();
      // Verify embedding_nodes table exists
      await this.verifyEmbeddingTable();
      this.isInitialized = true;
      this.logger.info('Enhanced search service initialized successfully');
    }
    catch (error) {
      this.logger.error('Failed to initialize enhanced search service', { error: (0, error_handling_1.getErrorMessage)(error) });
      throw error;
    }
    finally {
      operation();
    }
  }

  /**
   * Performs semantic search using vector embeddings on embedding_nodes table.
   * Falls back to `textBasedSearch` when the SQLite client reports that vector
   * search is unavailable.
   *
   * @param query Free-text query; it is embedded with the embedding service.
   * @param options Optional settings: `limit` (default 10), `nodeTypes`, `filePaths`,
   *   `minSimilarity` (default 0.1) and `includeEmbedding` (default false).
   * @returns Array of `{ node, similarity, rank }`, closest match first.
   * @throws Rethrows failures from initialization, embedding or the SQL query.
   */
  async semanticSearch(query, options = {}) {
    if (!this.isInitialized) {
      await this.initialize();
    }
    const { limit = 10, nodeTypes, filePaths } = options;
    const operation = this.logger.operation(`Semantic search on embeddings: "${query}"`);
    try {
      this.logger.info('Performing semantic search on embedding_nodes', {
        query: query.substring(0, 100),
        limit,
        nodeTypes,
        filePaths
      });
      // Check if vector search is available
      const vectorAvailable = await this.sqliteClient.isVectorSearchAvailable();
      if (!vectorAvailable) {
        this.logger.warn('Vector search not available, falling back to text-based search');
        return await this.textBasedSearch(query, options);
      }
      // Generate embedding for the query, then reuse the shared vector-search path
      // so that the SQL (including the optional embedding column) lives in one place.
      const queryEmbedding = await this.embeddingService.embedQuery(query);
      const searchResults = await this.vectorSearchWithEmbedding(queryEmbedding, options);
      this.logger.info('Semantic search completed', {
        query: query.substring(0, 50),
        resultsFound: searchResults.length,
        topSimilarity: searchResults[0]?.similarity || 0
      });
      return searchResults;
    }
    catch (error) {
      this.logger.error('Semantic search failed', { error: (0, error_handling_1.getErrorMessage)(error) });
      throw error;
    }
    finally {
      // Runs on success, on failure and on the text-search fallback path.
      operation();
    }
  }

  /**
   * Performs text-based search when vector search is not available.
   * Matches rows whose `source_text` contains the query (SQL LIKE) and orders
   * them by ascending text length.
   *
   * @param query Substring to look for in `source_text`.
   * @param options Optional settings: `limit` (default 10), `nodeTypes`, `filePaths`.
   * @returns Matching results with a fixed similarity of 0.8; an empty array if
   *   the query fails (the error is logged, not rethrown).
   */
  async textBasedSearch(query, options = {}) {
    const { limit = 10, nodeTypes, filePaths } = options;
    try {
      const filters = this.buildNodeFilters(nodeTypes, filePaths);
      let sql = `
        SELECT id, node_id, node_type, file_path, source_text
        FROM embedding_nodes
        WHERE source_text LIKE ?
      `;
      sql += filters.clause;
      sql += ` ORDER BY LENGTH(source_text) ASC LIMIT ?`;
      const params = [`%${query}%`, ...filters.params, limit];
      const rows = this.sqliteClient.all(sql, params);
      // Default similarity for text search
      return this.filterOutTestFiles(this.mapRowsToResults(rows, () => 0.8));
    }
    catch (error) {
      this.logger.error('Text-based search failed', { error: (0, error_handling_1.getErrorMessage)(error) });
      return [];
    }
  }

  /**
   * Performs metadata-based search on embedding_nodes.
   *
   * @param filters Optional criteria, combined with AND: `nodeType` (exact match),
   *   `filePath` (substring), `fileExtension` (path ends with `.<ext>`),
   *   `sourceTextContains` (substring). Falsy criteria are ignored.
   * @param options Optional settings: `limit` (default 100).
   * @returns Matching results ordered by `id`, each with a fixed similarity of 0.9.
   * @throws Rethrows failures from initialization or the SQL query.
   */
  async metadataSearch(filters = {}, options = {}) {
    if (!this.isInitialized) {
      await this.initialize();
    }
    const { limit = 100 } = options;
    const operation = this.logger.operation('Metadata search on embeddings');
    try {
      this.logger.info('Performing metadata search on embedding_nodes', { filters });
      let sql = 'SELECT id, node_id, node_type, file_path, source_text FROM embedding_nodes WHERE 1=1';
      const params = [];
      if (filters.nodeType) {
        sql += ' AND node_type = ?';
        params.push(filters.nodeType);
      }
      if (filters.filePath) {
        sql += ' AND file_path LIKE ?';
        params.push(`%${filters.filePath}%`);
      }
      if (filters.fileExtension) {
        sql += ' AND file_path LIKE ?';
        params.push(`%.${filters.fileExtension}`);
      }
      if (filters.sourceTextContains) {
        sql += ' AND source_text LIKE ?';
        params.push(`%${filters.sourceTextContains}%`);
      }
      sql += ' ORDER BY id LIMIT ?';
      params.push(limit);
      const rows = this.sqliteClient.all(sql, params);
      // High similarity for metadata matches
      const searchResults = this.filterOutTestFiles(this.mapRowsToResults(rows, () => 0.9));
      this.logger.info('Metadata search completed', {
        filters,
        resultsFound: searchResults.length
      });
      return searchResults;
    }
    catch (error) {
      this.logger.error('Metadata search failed', { error: (0, error_handling_1.getErrorMessage)(error) });
      throw error;
    }
    finally {
      operation();
    }
  }

  /**
   * Performs hybrid search combining semantic and metadata filters.
   * Requests roughly 70% of `limit` from semantic search and 30% from metadata
   * search, de-duplicates by row id (keeping the higher similarity), sorts by
   * similarity descending and re-ranks.
   *
   * @param query Free-text query for the semantic part.
   * @param filters Metadata filters; `nodeType` / `filePath` also narrow the semantic part.
   * @param options Search options; `limit` defaults to 20.
   * @returns At most `limit` results with contiguous ranks starting at 1.
   * @throws Rethrows failures from either underlying search.
   */
  async hybridSearch(query, filters = {}, options = {}) {
    const { limit = 20 } = options;
    try {
      // Perform semantic search with metadata filters
      const semanticResults = await this.semanticSearch(query, {
        ...options,
        limit: Math.ceil(limit * 0.7), // 70% from semantic search
        nodeTypes: filters.nodeType ? [filters.nodeType] : options.nodeTypes,
        filePaths: filters.filePath ? [filters.filePath] : options.filePaths
      });
      // Perform metadata search
      const metadataResults = await this.metadataSearch(filters, {
        ...options,
        limit: Math.ceil(limit * 0.3) // 30% from metadata search
      });
      // Combine and deduplicate results, keeping the best-scoring entry per row id
      const bestById = new Map();
      for (const result of [...semanticResults, ...metadataResults]) {
        const existing = bestById.get(result.node.id);
        if (existing === undefined || existing.similarity < result.similarity) {
          bestById.set(result.node.id, result);
        }
      }
      // Sort by similarity and limit
      return Array.from(bestById.values())
        .sort((a, b) => b.similarity - a.similarity)
        .slice(0, limit)
        .map((result, index) => ({ ...result, rank: index + 1 }));
    }
    catch (error) {
      this.logger.error('Hybrid search failed', { error: (0, error_handling_1.getErrorMessage)(error) });
      throw error;
    }
  }

  /**
   * Gets statistics about the embedding_nodes table.
   *
   * @returns `{ totalNodes, nodeTypeBreakdown, filePathBreakdown, embeddingCoverage }`
   *   where `filePathBreakdown` covers the 20 file paths with the most rows and
   *   `embeddingCoverage` is the fraction of rows with a non-NULL embedding.
   * @throws Rethrows failures from initialization or the SQL queries.
   */
  async getEmbeddingStats() {
    try {
      // Same lazy-initialization guard as the search methods: the client must be
      // connected before it is queried.
      if (!this.isInitialized) {
        await this.initialize();
      }
      const totalNodes = this.sqliteClient.get('SELECT COUNT(*) as count FROM embedding_nodes')?.count || 0;
      const nodeTypeResults = this.sqliteClient.all(`
        SELECT node_type, COUNT(*) as count
        FROM embedding_nodes
        GROUP BY node_type
        ORDER BY count DESC
      `);
      const filePathResults = this.sqliteClient.all(`
        SELECT file_path, COUNT(*) as count
        FROM embedding_nodes
        GROUP BY file_path
        ORDER BY count DESC
        LIMIT 20
      `);
      const embeddedCount = this.sqliteClient.get('SELECT COUNT(*) as count FROM embedding_nodes WHERE embedding IS NOT NULL')?.count || 0;
      const nodeTypeBreakdown = Object.fromEntries(nodeTypeResults.map((row) => [row.node_type, row.count]));
      const filePathBreakdown = Object.fromEntries(filePathResults.map((row) => [row.file_path, row.count]));
      return {
        totalNodes,
        nodeTypeBreakdown,
        filePathBreakdown,
        embeddingCoverage: totalNodes > 0 ? embeddedCount / totalNodes : 0
      };
    }
    catch (error) {
      this.logger.error('Failed to get embedding stats', { error: (0, error_handling_1.getErrorMessage)(error) });
      throw error;
    }
  }

  /**
   * Finds similar nodes to a given node ID.
   * Looks up the stored embedding for `nodeId` and runs a vector search with it.
   * Unless `options.nodeTypes` is given, results are restricted to the source
   * node's own type. The source row itself is not excluded from the query.
   *
   * @param nodeId Value of the `node_id` column identifying the source node.
   * @param options Same options as `vectorSearchWithEmbedding`.
   * @returns Array of `{ node, similarity, rank }`, closest match first.
   * @throws Error when the node does not exist or has no embedding; rethrows
   *   failures from initialization or the SQL queries.
   */
  async findSimilarNodes(nodeId, options = {}) {
    try {
      if (!this.isInitialized) {
        await this.initialize();
      }
      // Get the embedding for the source node
      const sourceNode = this.sqliteClient.get('SELECT embedding, node_type, file_path FROM embedding_nodes WHERE node_id = ?', [nodeId]);
      if (!sourceNode || !sourceNode.embedding) {
        throw new Error(`Node ${nodeId} not found or has no embedding`);
      }
      const sourceEmbedding = this.deserializeEmbedding(sourceNode.embedding);
      // Perform vector search using the source embedding
      return await this.vectorSearchWithEmbedding(sourceEmbedding, {
        ...options,
        nodeTypes: options.nodeTypes || [sourceNode.node_type]
      });
    }
    catch (error) {
      this.logger.error('Failed to find similar nodes', { nodeId, error: (0, error_handling_1.getErrorMessage)(error) });
      throw error;
    }
  }

  /**
   * Performs vector search with a given embedding.
   *
   * @param embedding Array of numbers; serialized as a Float32 blob for SQLite.
   * @param options Optional settings: `limit` (default 10), `nodeTypes`, `filePaths`,
   *   `minSimilarity` (default 0.1; values <= 0 disable the threshold) and
   *   `includeEmbedding` (default false; when true each node carries its
   *   deserialized `embedding`).
   * @returns Array of `{ node, similarity, rank }` ordered by ascending cosine
   *   distance, where `similarity` is `1 - distance`.
   */
  async vectorSearchWithEmbedding(embedding, options = {}) {
    const { limit = 10, nodeTypes, minSimilarity = 0.1, filePaths, includeEmbedding = false } = options;
    const embeddingBlob = Buffer.from(new Float32Array(embedding).buffer);
    const filters = this.buildNodeFilters(nodeTypes, filePaths);
    // The embedding column is only fetched on request; without it in the SELECT
    // list there would be nothing to deserialize for `includeEmbedding`.
    let sql = `
      SELECT id, node_id, node_type, file_path, source_text${includeEmbedding ? ', embedding' : ''},
             vec_distance_cosine(embedding, ?) as distance
      FROM embedding_nodes
      WHERE embedding IS NOT NULL
    `;
    const params = [embeddingBlob];
    // Add filters
    sql += filters.clause;
    params.push(...filters.params);
    // Add similarity threshold (expressed as a maximum cosine distance)
    if (minSimilarity > 0) {
      const distanceThreshold = 1 - minSimilarity;
      sql += ` AND vec_distance_cosine(embedding, ?) <= ?`;
      params.push(embeddingBlob, distanceThreshold);
    }
    sql += ` ORDER BY distance ASC LIMIT ?`;
    params.push(limit);
    const rows = this.sqliteClient.all(sql, params);
    // Convert distance to similarity
    const allResults = this.mapRowsToResults(rows, (row) => 1 - row.distance, includeEmbedding);
    // Filter out test files
    return this.filterOutTestFiles(allResults);
  }

  /**
   * Builds the optional `node_type` / `file_path` SQL conditions shared by the
   * text and vector searches.
   *
   * @param nodeTypes Optional array of node types (exact match, IN list).
   * @param filePaths Optional array of path fragments (substring match, OR-ed).
   * @returns `{ clause, params }`: SQL text starting with ` AND ...` (or an empty
   *   string) and the bound parameters in placeholder order.
   */
  buildNodeFilters(nodeTypes, filePaths) {
    let clause = '';
    const params = [];
    if (nodeTypes && nodeTypes.length > 0) {
      const placeholders = nodeTypes.map(() => '?').join(',');
      clause += ` AND node_type IN (${placeholders})`;
      params.push(...nodeTypes);
    }
    if (filePaths && filePaths.length > 0) {
      const fileConditions = filePaths.map(() => 'file_path LIKE ?').join(' OR ');
      clause += ` AND (${fileConditions})`;
      params.push(...filePaths.map((path) => `%${path}%`));
    }
    return { clause, params };
  }

  /**
   * Converts database rows into search results.
   *
   * @param rows Rows with `id`, `node_id`, `node_type`, `file_path`, `source_text`
   *   (and `embedding` when `includeEmbedding` is true).
   * @param similarityOf Function returning the similarity score for a row.
   * @param includeEmbedding When true, attaches the deserialized embedding to each node.
   * @returns Array of `{ node, similarity, rank }` with `rank` = 1-based row position.
   */
  mapRowsToResults(rows, similarityOf, includeEmbedding = false) {
    return rows.map((row, index) => {
      const node = {
        id: row.id,
        nodeId: row.node_id,
        nodeType: row.node_type,
        filePath: row.file_path,
        sourceText: row.source_text
      };
      if (includeEmbedding) {
        node.embedding = this.deserializeEmbedding(row.embedding);
      }
      return { node, similarity: similarityOf(row), rank: index + 1 };
    });
  }

  /**
   * Verifies that the embedding_nodes table exists and has the expected structure.
   *
   * @throws Error when the table is missing or lacks one of the required columns.
   */
  async verifyEmbeddingTable() {
    try {
      const tableInfo = this.sqliteClient.all("PRAGMA table_info(embedding_nodes)");
      if (tableInfo.length === 0) {
        throw new Error('embedding_nodes table does not exist');
      }
      const requiredColumns = ['id', 'node_id', 'embedding', 'source_text', 'node_type', 'file_path'];
      const existingColumns = tableInfo.map((col) => col.name);
      const missing = requiredColumns.find((col) => !existingColumns.includes(col));
      if (missing !== undefined) {
        throw new Error(`Required column '${missing}' missing from embedding_nodes table`);
      }
      this.logger.info('embedding_nodes table structure verified', { columns: existingColumns });
    }
    catch (error) {
      this.logger.error('Failed to verify embedding_nodes table', { error: (0, error_handling_1.getErrorMessage)(error) });
      throw error;
    }
  }

  /**
   * Deserializes embedding from database blob format.
   *
   * @param blob Buffer / Uint8Array holding consecutive 32-bit floats.
   * @returns Plain array of numbers; trailing bytes that do not form a full
   *   float are ignored.
   */
  deserializeEmbedding(blob) {
    const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
    const floatCount = Math.floor(blob.byteLength / bytesPerFloat);
    if (blob.byteOffset % bytesPerFloat === 0) {
      return Array.from(new Float32Array(blob.buffer, blob.byteOffset, floatCount));
    }
    // A Float32Array view must start on a 4-byte boundary, but a Buffer may sit
    // at any offset inside its backing ArrayBuffer (pooled or sliced buffers).
    // Copy the bytes into a fresh, aligned buffer before reinterpreting them.
    const aligned = new Uint8Array(floatCount * bytesPerFloat);
    aligned.set(new Uint8Array(blob.buffer, blob.byteOffset, aligned.byteLength));
    return Array.from(new Float32Array(aligned.buffer));
  }

  /**
   * Filter out test files from search results.
   * A path counts as a test file when it contains a `test`, `tests` or
   * `__tests__` directory segment (with `/` or `\` separators, also at the very
   * start of a relative path) or a `.test.` / `.spec.` infix in any letter case.
   *
   * @param results Array of search results with `node.filePath`.
   * @returns New array without the results that point at test files.
   */
  filterOutTestFiles(results) {
    const testDirectory = /(^|[\\/])(tests?|__tests__)[\\/]/;
    const testFileInfix = /\.(test|spec)\./i;
    return results.filter((result) => {
      // Missing paths are treated as non-test files and kept.
      const filePath = String(result.node.filePath ?? '');
      return !(testDirectory.test(filePath) || testFileInfix.test(filePath));
    });
  }

  /**
   * Disconnects from the database and marks the service as uninitialized.
   *
   * @throws Rethrows any failure reported by the SQLite client.
   */
  async disconnect() {
    try {
      // Awaited so that a rejection from an asynchronous client is caught here
      // instead of being left unhandled; harmless if the call is synchronous.
      await this.sqliteClient.disconnect();
      this.isInitialized = false;
      this.logger.info('Enhanced search service disconnected');
    }
    catch (error) {
      this.logger.error('Failed to disconnect enhanced search service', { error: (0, error_handling_1.getErrorMessage)(error) });
      throw error;
    }
  }
}
// Expose the service class on the CommonJS exports object.
exports.EnhancedSearchService = EnhancedSearchService;
;