UNPKG

hikma-engine

Version:

Code Knowledge Graph Indexer - A sophisticated TypeScript-based indexer that transforms Git repositories into multi-dimensional knowledge stores for AI agents

440 lines (439 loc) 18.6 kB
"use strict";
/**
 * @file Enhanced search functionality specifically designed for the embedding_nodes table.
 * Provides semantic vector search and metadata-based queries using the unified embedding storage.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.EnhancedSearchService = void 0;
const embedding_service_1 = require("./embedding-service");
const db_clients_1 = require("../persistence/db-clients");
const logger_1 = require("../utils/logger");
const error_handling_1 = require("../utils/error-handling");

/** Columns of embedding_nodes that every search result exposes. */
const RESULT_COLUMNS = 'id, node_id, node_type, file_path, source_text';

/** Metadata filter keys that metadataSearch() turns into SQL conditions. */
const METADATA_FILTER_KEYS = ['nodeType', 'filePath', 'fileExtension', 'sourceTextContains'];

/**
 * Path patterns that mark a file as a test file.
 * None of them uses the `g`/`y` flag, so `.test()` keeps no state between calls.
 */
const TEST_FILE_PATTERNS = [
    /\/tests?\//, // .../test/... or .../tests/...
    /\/__tests__\//, // .../__tests__/...
    /\.(test|spec)\./i // foo.test.ts, foo.SPEC.js, ...
];

/**
 * Enhanced search service specifically for embedding_nodes table.
 *
 * Every search method resolves to an array of `{ node, similarity, rank }`
 * objects where `node` is `{ id, nodeId, nodeType, filePath, sourceText }`
 * (plus `embedding` when requested) and `rank` is the 1-based position in the
 * returned array.
 */
class EnhancedSearchService {
    /**
     * @param config Configuration object; must provide `getDatabaseConfig()`
     *   returning an object with `sqlite.path`. It is also handed to the
     *   EmbeddingService constructor.
     */
    constructor(config) {
        this.logger = (0, logger_1.getLogger)('EnhancedSearchService');
        this.isInitialized = false;
        this.config = config;
        const dbConfig = config.getDatabaseConfig();
        this.embeddingService = new embedding_service_1.EmbeddingService(config);
        this.sqliteClient = new db_clients_1.SQLiteClient(dbConfig.sqlite.path);
    }

    /**
     * Initializes the enhanced search service: loads the embedding model,
     * connects to SQLite and verifies the embedding_nodes table.
     * Calling it again after a successful initialization is a no-op.
     *
     * @throws Re-throws any failure from model loading, connecting or table verification.
     */
    async initialize() {
        if (this.isInitialized) {
            this.logger.debug('Enhanced search service already initialized');
            return;
        }
        // logger.operation() returns a callback that is invoked once the operation ends.
        const operation = this.logger.operation('Initializing enhanced search service');
        try {
            this.logger.info('Loading embedding model...');
            await this.embeddingService.loadModel();
            this.logger.info('Connecting to SQLite database...');
            await this.sqliteClient.connect();
            // Verify embedding_nodes table exists
            await this.verifyEmbeddingTable();
            this.isInitialized = true;
            this.logger.info('Enhanced search service initialized successfully');
        }
        catch (error) {
            this.logger.error('Failed to initialize enhanced search service', { error: (0, error_handling_1.getErrorMessage)(error) });
            throw error;
        }
        finally {
            operation();
        }
    }

    /**
     * Runs initialize() unless the service is already initialized.
     */
    async ensureInitialized() {
        if (!this.isInitialized) {
            await this.initialize();
        }
    }

    /**
     * Performs semantic search using vector embeddings on embedding_nodes table.
     * Falls back to textBasedSearch() when the SQLite client reports that
     * vector search is unavailable.
     *
     * @param {string} query Natural-language query text.
     * @param options
     * @param {number} [options.limit=10] Maximum number of rows fetched from the database.
     * @param {string[]} [options.nodeTypes] Restrict results to these node types.
     * @param {number} [options.minSimilarity=0.1] Minimum similarity (1 - cosine distance); 0 disables the threshold.
     * @param {string[]} [options.filePaths] Restrict results to paths containing any of these substrings.
     * @param {boolean} [options.includeEmbedding=false] Attach the stored embedding to each result node.
     * @returns Ranked search results with test files removed.
     * @throws Re-throws failures of the embedding service or of the vector query.
     */
    async semanticSearch(query, options = {}) {
        await this.ensureInitialized();
        const { limit = 10, nodeTypes, minSimilarity = 0.1, filePaths, includeEmbedding = false } = options;
        const operation = this.logger.operation(`Semantic search on embeddings: "${query}"`);
        try {
            this.logger.info('Performing semantic search on embedding_nodes', {
                query: query.substring(0, 100),
                limit,
                nodeTypes,
                filePaths
            });
            // Check if vector search is available
            const vectorAvailable = await this.sqliteClient.isVectorSearchAvailable();
            if (!vectorAvailable) {
                this.logger.warn('Vector search not available, falling back to text-based search');
                return await this.textBasedSearch(query, options);
            }
            // Generate embedding for the query
            const queryEmbedding = await this.embeddingService.embedQuery(query);
            const searchResults = await this.vectorSearchWithEmbedding(queryEmbedding, {
                limit,
                nodeTypes,
                minSimilarity,
                filePaths,
                includeEmbedding
            });
            this.logger.info('Semantic search completed', {
                query: query.substring(0, 50),
                resultsFound: searchResults.length,
                topSimilarity: searchResults[0]?.similarity || 0
            });
            return searchResults;
        }
        catch (error) {
            this.logger.error('Semantic search failed', { error: (0, error_handling_1.getErrorMessage)(error) });
            throw error;
        }
        finally {
            // Runs on every exit path, including the text-search fallback.
            operation();
        }
    }

    /**
     * Performs text-based search when vector search is not available.
     * Matches rows whose source_text contains `query` and orders them by
     * ascending source_text length. Every hit gets the fixed similarity 0.8.
     *
     * This is a best-effort fallback: database errors are logged and an empty
     * array is returned instead of throwing.
     *
     * @param {string} query Substring to look for in source_text.
     * @param options Supports `limit` (default 10), `nodeTypes` and `filePaths`.
     * @returns Ranked search results with test files removed.
     */
    async textBasedSearch(query, options = {}) {
        const { limit = 10, nodeTypes, filePaths } = options;
        try {
            const filters = this.buildNodeFilters(nodeTypes, filePaths);
            const sql = `
        SELECT ${RESULT_COLUMNS}
        FROM embedding_nodes
        WHERE source_text LIKE ?${filters.clause}
        ORDER BY LENGTH(source_text) ASC LIMIT ?`;
            const params = [`%${query}%`, ...filters.params, limit];
            const rows = this.sqliteClient.all(sql, params);
            // Default similarity for text search
            return this.toSearchResults(rows, () => 0.8);
        }
        catch (error) {
            this.logger.error('Text-based search failed', { error: (0, error_handling_1.getErrorMessage)(error) });
            return [];
        }
    }

    /**
     * Performs metadata-based search on embedding_nodes.
     * Every hit gets the fixed similarity 0.9; rows are ordered by id.
     *
     * @param filters
     * @param {string} [filters.nodeType] Exact node type.
     * @param {string} [filters.filePath] Substring of the file path.
     * @param {string} [filters.fileExtension] File extension without the leading dot.
     * @param {string} [filters.sourceTextContains] Substring of the source text.
     * @param options Supports `limit` (default 100).
     * @returns Ranked search results with test files removed.
     * @throws Re-throws database failures.
     */
    async metadataSearch(filters, options = {}) {
        await this.ensureInitialized();
        const { limit = 100 } = options;
        const operation = this.logger.operation('Metadata search on embeddings');
        try {
            this.logger.info('Performing metadata search on embedding_nodes', { filters });
            let sql = `SELECT ${RESULT_COLUMNS} FROM embedding_nodes WHERE 1=1`;
            const params = [];
            if (filters.nodeType) {
                sql += ' AND node_type = ?';
                params.push(filters.nodeType);
            }
            if (filters.filePath) {
                sql += ' AND file_path LIKE ?';
                params.push(`%${filters.filePath}%`);
            }
            if (filters.fileExtension) {
                sql += ' AND file_path LIKE ?';
                params.push(`%.${filters.fileExtension}`);
            }
            if (filters.sourceTextContains) {
                sql += ' AND source_text LIKE ?';
                params.push(`%${filters.sourceTextContains}%`);
            }
            sql += ' ORDER BY id LIMIT ?';
            params.push(limit);
            const rows = this.sqliteClient.all(sql, params);
            // High similarity for metadata matches
            const searchResults = this.toSearchResults(rows, () => 0.9);
            this.logger.info('Metadata search completed', {
                filters,
                resultsFound: searchResults.length
            });
            return searchResults;
        }
        catch (error) {
            this.logger.error('Metadata search failed', { error: (0, error_handling_1.getErrorMessage)(error) });
            throw error;
        }
        finally {
            operation();
        }
    }

    /**
     * Performs hybrid search combining semantic and metadata filters.
     *
     * When at least one metadata filter is set, about 70% of `limit` is
     * requested from semanticSearch() and about 30% from metadataSearch().
     * When no metadata filter is set, the metadata leg is skipped: it would
     * match every row and add unrelated nodes at similarity 0.9. The semantic
     * leg then gets the whole `limit`.
     *
     * Results are de-duplicated by node id (keeping the higher similarity),
     * sorted by descending similarity, truncated to `limit` and re-ranked.
     *
     * @param {string} query Natural-language query text.
     * @param filters Same shape as the `filters` of metadataSearch().
     * @param options Forwarded to both searches; `limit` defaults to 20.
     * @returns Combined, re-ranked search results.
     * @throws Re-throws failures of either underlying search.
     */
    async hybridSearch(query, filters = {}, options = {}) {
        const { limit = 20 } = options;
        try {
            const hasMetadataFilters = METADATA_FILTER_KEYS.some((key) => Boolean(filters[key]));
            // Perform semantic search with metadata filters
            const semanticResults = await this.semanticSearch(query, {
                ...options,
                limit: hasMetadataFilters ? Math.ceil(limit * 0.7) : limit,
                nodeTypes: filters.nodeType ? [filters.nodeType] : options.nodeTypes,
                filePaths: filters.filePath ? [filters.filePath] : options.filePaths
            });
            // Perform metadata search only when there is something to filter on
            const metadataResults = hasMetadataFilters
                ? await this.metadataSearch(filters, {
                    ...options,
                    limit: Math.ceil(limit * 0.3) // 30% from metadata search
                })
                : [];
            // Combine and deduplicate results, keeping the best-scoring entry per node
            const bestById = new Map();
            for (const result of [...semanticResults, ...metadataResults]) {
                const current = bestById.get(result.node.id);
                if (current === undefined || current.similarity < result.similarity) {
                    bestById.set(result.node.id, result);
                }
            }
            // Sort by similarity and limit
            return Array.from(bestById.values())
                .sort((a, b) => b.similarity - a.similarity)
                .slice(0, limit)
                .map((result, index) => ({ ...result, rank: index + 1 }));
        }
        catch (error) {
            this.logger.error('Hybrid search failed', { error: (0, error_handling_1.getErrorMessage)(error) });
            throw error;
        }
    }

    /**
     * Gets statistics about the embedding_nodes table.
     *
     * @returns `{ totalNodes, nodeTypeBreakdown, filePathBreakdown, embeddingCoverage }`
     *   where the breakdowns map a node type / file path to its row count
     *   (file paths: the 20 with the most rows) and `embeddingCoverage` is the
     *   fraction of rows with a non-NULL embedding (0 for an empty table).
     * @throws Re-throws initialization or database failures.
     */
    async getEmbeddingStats() {
        try {
            await this.ensureInitialized();
            const totalNodes = this.sqliteClient.get('SELECT COUNT(*) as count FROM embedding_nodes')?.count || 0;
            const nodeTypeResults = this.sqliteClient.all(`
        SELECT node_type, COUNT(*) as count
        FROM embedding_nodes
        GROUP BY node_type
        ORDER BY count DESC
      `);
            const filePathResults = this.sqliteClient.all(`
        SELECT file_path, COUNT(*) as count
        FROM embedding_nodes
        GROUP BY file_path
        ORDER BY count DESC
        LIMIT 20
      `);
            const embeddedCount = this.sqliteClient.get('SELECT COUNT(*) as count FROM embedding_nodes WHERE embedding IS NOT NULL')?.count || 0;
            const nodeTypeBreakdown = Object.fromEntries(nodeTypeResults.map((row) => [row.node_type, row.count]));
            const filePathBreakdown = Object.fromEntries(filePathResults.map((row) => [row.file_path, row.count]));
            return {
                totalNodes,
                nodeTypeBreakdown,
                filePathBreakdown,
                embeddingCoverage: totalNodes > 0 ? embeddedCount / totalNodes : 0
            };
        }
        catch (error) {
            this.logger.error('Failed to get embedding stats', { error: (0, error_handling_1.getErrorMessage)(error) });
            throw error;
        }
    }

    /**
     * Finds similar nodes to a given node ID by running a vector search with
     * that node's stored embedding. Unless `options.nodeTypes` is given, the
     * search is restricted to the source node's own node type.
     *
     * NOTE(review): the source node itself is not excluded from the query, so
     * it can appear in the results.
     *
     * @param {string} nodeId Value of the node_id column of the source node.
     * @param options Forwarded to vectorSearchWithEmbedding().
     * @returns Ranked search results with test files removed.
     * @throws {Error} When the node does not exist or has no embedding; also
     *   re-throws initialization and database failures.
     */
    async findSimilarNodes(nodeId, options = {}) {
        try {
            await this.ensureInitialized();
            // Get the embedding for the source node
            const sourceNode = this.sqliteClient.get('SELECT embedding, node_type, file_path FROM embedding_nodes WHERE node_id = ?', [nodeId]);
            if (!sourceNode || !sourceNode.embedding) {
                throw new Error(`Node ${nodeId} not found or has no embedding`);
            }
            const sourceEmbedding = this.deserializeEmbedding(sourceNode.embedding);
            // Perform vector search using the source embedding
            return await this.vectorSearchWithEmbedding(sourceEmbedding, {
                ...options,
                nodeTypes: options.nodeTypes || [sourceNode.node_type]
            });
        }
        catch (error) {
            this.logger.error('Failed to find similar nodes', { nodeId, error: (0, error_handling_1.getErrorMessage)(error) });
            throw error;
        }
    }

    /**
     * Performs vector search with a given embedding.
     * Rows are ordered by ascending `vec_distance_cosine` distance and each
     * result's similarity is `1 - distance`.
     *
     * @param {number[]} embedding Query vector; it is stored as 32-bit floats for the query.
     * @param options
     * @param {number} [options.limit=10] Maximum number of rows fetched from the database.
     * @param {string[]} [options.nodeTypes] Restrict results to these node types.
     * @param {number} [options.minSimilarity=0.1] Minimum similarity; 0 disables the threshold.
     * @param {string[]} [options.filePaths] Restrict results to paths containing any of these substrings.
     * @param {boolean} [options.includeEmbedding=false] Attach the stored embedding to each result node.
     * @returns Ranked search results with test files removed.
     */
    async vectorSearchWithEmbedding(embedding, options = {}) {
        const { limit = 10, nodeTypes, minSimilarity = 0.1, filePaths, includeEmbedding = false } = options;
        // The same blob is bound to both vec_distance_cosine() placeholders.
        const vectorBlob = Buffer.from(new Float32Array(embedding).buffer);
        // The embedding column is selected only on request, so that the row
        // mapper has a blob to deserialize.
        const columns = includeEmbedding ? `${RESULT_COLUMNS}, embedding` : RESULT_COLUMNS;
        let sql = `
        SELECT ${columns}, vec_distance_cosine(embedding, ?) as distance
        FROM embedding_nodes
        WHERE embedding IS NOT NULL`;
        const params = [vectorBlob];
        // Add filters
        const filters = this.buildNodeFilters(nodeTypes, filePaths);
        sql += filters.clause;
        params.push(...filters.params);
        // Add similarity threshold
        if (minSimilarity > 0) {
            sql += ' AND vec_distance_cosine(embedding, ?) <= ?';
            params.push(vectorBlob, 1 - minSimilarity);
        }
        sql += ' ORDER BY distance ASC LIMIT ?';
        params.push(limit);
        const rows = this.sqliteClient.all(sql, params);
        // Convert distance to similarity
        return this.toSearchResults(rows, (row) => 1 - row.distance, includeEmbedding);
    }

    /**
     * Builds the optional node-type / file-path conditions shared by the
     * vector and text searches.
     *
     * @param {string[]} [nodeTypes] Allowed node types; ignored when empty or missing.
     * @param {string[]} [filePaths] Path substrings, OR-combined; ignored when empty or missing.
     * @returns {{clause: string, params: string[]}} SQL fragment (empty, or
     *   starting with " AND") and its bound values in placeholder order.
     */
    buildNodeFilters(nodeTypes, filePaths) {
        let clause = '';
        const params = [];
        if (nodeTypes && nodeTypes.length > 0) {
            const placeholders = nodeTypes.map(() => '?').join(',');
            clause += ` AND node_type IN (${placeholders})`;
            params.push(...nodeTypes);
        }
        if (filePaths && filePaths.length > 0) {
            const fileConditions = filePaths.map(() => 'file_path LIKE ?').join(' OR ');
            clause += ` AND (${fileConditions})`;
            params.push(...filePaths.map((filePath) => `%${filePath}%`));
        }
        return { clause, params };
    }

    /**
     * Converts database rows into search results, removes test files and
     * assigns 1-based ranks.
     *
     * Ranks are assigned after filtering so that they are contiguous. Test
     * files are removed after the SQL LIMIT was applied, so fewer results
     * than the requested limit can be returned.
     *
     * @param {object[]} rows Rows containing the RESULT_COLUMNS (and `embedding` when requested).
     * @param {(row: object) => number} similarityOf Computes the similarity of a row.
     * @param {boolean} [includeEmbedding=false] Deserialize `row.embedding` into `node.embedding`.
     * @returns Ranked search results.
     */
    toSearchResults(rows, similarityOf, includeEmbedding = false) {
        const unranked = rows.map((row) => ({
            node: {
                id: row.id,
                nodeId: row.node_id,
                nodeType: row.node_type,
                filePath: row.file_path,
                sourceText: row.source_text,
                ...(includeEmbedding && { embedding: this.deserializeEmbedding(row.embedding) })
            },
            similarity: similarityOf(row)
        }));
        return this.filterOutTestFiles(unranked)
            .map((result, index) => ({ ...result, rank: index + 1 }));
    }

    /**
     * Verifies that the embedding_nodes table exists and has the expected structure.
     *
     * @throws {Error} When the table is missing or lacks a required column.
     */
    async verifyEmbeddingTable() {
        try {
            const tableInfo = this.sqliteClient.all("PRAGMA table_info(embedding_nodes)");
            // The code treats an empty table_info result as "table does not exist".
            if (tableInfo.length === 0) {
                throw new Error('embedding_nodes table does not exist');
            }
            const requiredColumns = ['id', 'node_id', 'embedding', 'source_text', 'node_type', 'file_path'];
            const existingColumns = tableInfo.map((col) => col.name);
            const missingColumn = requiredColumns.find((col) => !existingColumns.includes(col));
            if (missingColumn !== undefined) {
                throw new Error(`Required column '${missingColumn}' missing from embedding_nodes table`);
            }
            this.logger.info('embedding_nodes table structure verified', { columns: existingColumns });
        }
        catch (error) {
            this.logger.error('Failed to verify embedding_nodes table', { error: (0, error_handling_1.getErrorMessage)(error) });
            throw error;
        }
    }

    /**
     * Deserializes embedding from database blob format (a sequence of 32-bit floats).
     *
     * @param {Buffer|Uint8Array} blob Raw bytes of the embedding column.
     * @returns {number[]} The decoded vector; trailing bytes that do not form a whole float are ignored.
     */
    deserializeEmbedding(blob) {
        const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
        const floatCount = Math.floor(blob.byteLength / bytesPerFloat);
        // A Float32Array view needs a byte offset that is a multiple of 4.
        // If the blob sits at an unaligned offset inside its backing
        // ArrayBuffer, copy the bytes out first.
        const view = blob.byteOffset % bytesPerFloat === 0
            ? new Float32Array(blob.buffer, blob.byteOffset, floatCount)
            : new Float32Array(blob.buffer.slice(blob.byteOffset, blob.byteOffset + floatCount * bytesPerFloat));
        return Array.from(view);
    }

    /**
     * Filter out test files from search results.
     * A result is dropped when its `node.filePath` matches any of TEST_FILE_PATTERNS.
     *
     * @param {object[]} results Search results to filter.
     * @returns {object[]} The results that do not belong to test files.
     */
    filterOutTestFiles(results) {
        return results.filter((result) => {
            const filePath = result.node.filePath;
            return !TEST_FILE_PATTERNS.some((pattern) => pattern.test(filePath));
        });
    }

    /**
     * Disconnects from the database and marks the service as uninitialized,
     * so that the next search initializes it again.
     *
     * @throws Re-throws failures of the SQLite client's disconnect().
     */
    async disconnect() {
        try {
            // Awaiting is harmless if disconnect() is synchronous and keeps a
            // rejected promise inside this try/catch if it is not.
            await this.sqliteClient.disconnect();
            this.isInitialized = false;
            this.logger.info('Enhanced search service disconnected');
        }
        catch (error) {
            this.logger.error('Failed to disconnect enhanced search service', { error: (0, error_handling_1.getErrorMessage)(error) });
            throw error;
        }
    }
}
exports.EnhancedSearchService = EnhancedSearchService;