hikma-engine


Code Knowledge Graph Indexer - a TypeScript-based indexer that transforms Git repositories into multi-dimensional knowledge stores for AI agents.

"use strict"; /** * @file Responsible for generating vector embeddings for various node types. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.EmbeddingService = void 0; const logger_1 = require("../utils/logger"); const error_handling_1 = require("../utils/error-handling"); const transformers_1 = require("@xenova/transformers"); const embedding_py_1 = require("./embedding-py"); class EmbeddingService { /** * Initializes the Embedding Service. * @param {ConfigManager} config - Configuration manager instance. */ constructor(config) { this.logger = (0, logger_1.getLogger)('EmbeddingService'); this.model = null; this.config = config; this.logger.info('Initializing EmbeddingService'); // Set environment for transformers // env.allowLocalModels = false; // Removed as per edit hint } /** * Loads the pre-trained embedding model. */ async loadModel() { if (this.model) { this.logger.debug('Model already loaded, skipping'); return; } const operation = this.logger.operation('Loading embedding model'); try { const aiConfig = this.config.getAIConfig(); // Log only relevant fields to avoid confusion when switching providers this.logger.info('Loading embedding model', { model: aiConfig.embedding.model, batchSize: aiConfig.embedding.batchSize, provider: aiConfig.embedding.provider }); if (aiConfig.embedding.provider === 'local') { // Test connection to LM Studio await this.testLMStudioConnection(); this.model = { provider: 'local', endpoint: aiConfig.embedding.localEndpoint, }; } else if (aiConfig.embedding.provider === 'transformers') { // Configure transformers.js to use local models if needed transformers_1.env.allowRemoteModels = true; transformers_1.env.allowLocalModels = true; // Load the transformers pipeline for feature extraction (embeddings) this.logger.info('Loading transformers.js embedding model', { model: aiConfig.embedding.model, }); this.model = await (0, transformers_1.pipeline)('feature-extraction', aiConfig.embedding.model); this.logger.info('Transformers.js embedding model loaded successfully'); } else if (aiConfig.embedding.provider === 'python') { // For Python provider, we don't pre-load the model // The Python script handles model loading this.logger.info('Using Python embedding provider', { model: aiConfig.embedding.model, }); this.model = { provider: 'python', model: aiConfig.embedding.model, }; this.logger.info('Python embedding provider configured successfully'); } else if (aiConfig.embedding.provider === 'server') { // For Server provider, validate configuration if (!aiConfig.embedding.server?.apiUrl) { throw new Error('Server embedding provider requires apiUrl configuration'); } if (!aiConfig.embedding.server?.model) { throw new Error('Server embedding provider requires model configuration'); } const normalizedApiUrl = this.normalizeOpenAIBaseUrl(aiConfig.embedding.server.apiUrl); this.logger.info('Using Server embedding provider', { apiUrl: normalizedApiUrl, model: aiConfig.embedding.server.model, }); this.model = { provider: 'server', apiUrl: normalizedApiUrl, apiKey: aiConfig.embedding.server.apiKey, model: aiConfig.embedding.server.model, }; this.logger.info('Server embedding provider configured successfully'); } else { throw new Error(`Unsupported embedding provider: ${aiConfig.embedding.provider}. 
Supported providers: 'local', 'transformers', 'python', 'server'`); } this.logger.info('Embedding model loaded successfully'); operation(); } catch (error) { this.logger.error('Failed to load embedding model', { error: (0, error_handling_1.getErrorMessage)(error), }); operation(); throw error; } } /** * Extracts meaningful text content from a node for embedding generation. * @param {BaseNode} node - The node to extract text from. * @returns {string} The extracted text content. */ getTextForNode(node) { switch (node.type) { case 'CodeNode': { const codeNode = node; const parts = [ codeNode.properties.name, codeNode.properties.signature || '', codeNode.properties.docstring || '', ].filter((part) => part.trim() !== ''); return parts.join(' '); } case 'FileNode': { const fileNode = node; const parts = [ fileNode.properties.fileName, fileNode.properties.aiSummary || '', (fileNode.properties.imports || []).join(' '), (fileNode.properties.exports || []).join(' '), ].filter((part) => part.trim() !== ''); return parts.join(' '); } case 'RepositoryNode': { const repoNode = node; const parts = [ repoNode.properties.repoName, repoNode.properties.repoPath, ].filter((part) => part.trim() !== ''); return parts.join(' '); } case 'CommitNode': { const commitNode = node; const parts = [ commitNode.properties.message, commitNode.properties.author, commitNode.properties.diffSummary || '', ].filter((part) => part.trim() !== ''); return parts.join(' '); } case 'TestNode': { const testNode = node; const parts = [ testNode.properties.name, testNode.properties.framework || '', ].filter((part) => part.trim() !== ''); return parts.join(' '); } case 'PullRequestNode': { const prNode = node; const parts = [ prNode.properties.title, prNode.properties.body || '', prNode.properties.author, ].filter((part) => part.trim() !== ''); return parts.join(' '); } // FunctionNode case 'FunctionNode': { const functionNode = node; const parts = [ functionNode.properties.name, functionNode.properties.signature || '', functionNode.properties.body || '', ].filter((part) => part.trim() !== ''); return parts.join(' '); } default: this.logger.warn(`Unknown node type for text extraction:`, node); return `${node.type} ${node.id}`; } } /** * Generates a vector embedding for a given text. * @param {string} text - The text to embed. * @param {boolean} isQuery - Whether this text is a search query (requires special prompt for some models). * @returns {Promise<number[]>} The generated embedding vector. 
*/ async generateEmbedding(text, isQuery = false) { const aiConfig = this.config.getAIConfig(); if (aiConfig.embedding.provider === 'local' && this.model && typeof this.model === 'object' && 'provider' in this.model && this.model.provider === 'local') { return await this.generateLMStudioEmbedding(text); } else if (aiConfig.embedding.provider === 'transformers' && this.model && typeof this.model === 'function') { return await this.generateTransformersEmbedding(text, isQuery); } else if (aiConfig.embedding.provider === 'python' && this.model && typeof this.model === 'object' && 'provider' in this.model && this.model.provider === 'python') { return await this.generatePythonEmbedding(text, isQuery); } else if (aiConfig.embedding.provider === 'server' && this.model && typeof this.model === 'object' && 'provider' in this.model && this.model.provider === 'server') { return await this.generateOpenAIEmbedding(text, isQuery); } else { // Simple fallback embedding - hash-based approach this.logger.warn('Using fallback hash-based embedding generation', { provider: aiConfig.embedding.provider, modelType: typeof this.model, modelLoaded: !!this.model, }); const hash = this.simpleHash(text); // Use the correct dimensions for the configured model const stats = await this.getStats(); const dimensions = stats.dimensions; return Array.from({ length: dimensions }, (_, i) => (hash[i % hash.length] / 255) * 2 - 1); } } simpleHash(text) { const hash = []; for (let i = 0; i < text.length; i++) { hash.push(text.charCodeAt(i)); } return hash; } /** * Normalizes user-provided OpenAI-compatible API URL to a base URL. * Accepts inputs like base or full paths and returns a clean base such as http://localhost:11434 */ normalizeOpenAIBaseUrl(rawUrl) { try { let url = (rawUrl || '').trim(); url = url.replace(/\/+$/, ''); url = url.replace(/\/v1\/embeddings$/i, ''); url = url.replace(/\/api\/embeddings$/i, ''); url = url.replace(/\/v1$/i, ''); url = url.replace(/\/+$/, ''); return url; } catch { return rawUrl; } } /** * Tests connection to LM Studio server. */ async testLMStudioConnection() { const aiConfig = this.config.getAIConfig(); const endpoint = aiConfig.embedding.localEndpoint; if (!endpoint) { throw new Error('LM Studio endpoint not configured'); } try { this.logger.debug('Testing LM Studio connection', { endpoint }); // Test with a simple health check or models endpoint const response = await fetch(`${endpoint}/v1/models`, { method: 'GET', headers: { 'Content-Type': 'application/json', }, signal: AbortSignal.timeout(5000), // 5 second timeout }); if (!response.ok) { throw new Error(`LM Studio server responded with status: ${response.status}`); } const result = await response.json(); // Check if any models are loaded if (!result.data || result.data.length === 0) { this.logger.warn('LM Studio server is running but no models are loaded', { endpoint }); this.logger.info('Please load an embedding model in LM Studio before proceeding'); } else { this.logger.info('LM Studio connection successful', { endpoint, modelsLoaded: result.data.length, }); } } catch (error) { this.logger.error('Failed to connect to LM Studio', { endpoint, error: (0, error_handling_1.getErrorMessage)(error), }); throw new Error(`Cannot connect to LM Studio at ${endpoint}: ${(0, error_handling_1.getErrorMessage)(error)}`); } } /** * Generates embedding using transformers.js pipeline. * @param {string} text - The text to generate an embedding for. * @param {boolean} isQuery - Whether this text is a search query. 
* @returns {Promise<number[]>} The generated embedding vector. */ async generateTransformersEmbedding(text, isQuery = false) { try { const aiConfig = this.config.getAIConfig(); // Apply query prompt for specific models that require it let processedText = text; if (isQuery && aiConfig.embedding.model === 'mixedbread-ai/mxbai-embed-large-v1') { processedText = `Represent this sentence for searching relevant passages: ${text}`; } this.logger.debug('Generating embedding via transformers.js', { textLength: processedText.length, isQuery, hasPrompt: processedText !== text, }); // Generate embedding using the loaded pipeline // Use 'cls' pooling for mixedbread-ai model as recommended in their docs const poolingStrategy = aiConfig.embedding.model === 'mixedbread-ai/mxbai-embed-large-v1' ? 'cls' : 'mean'; const result = await this.model(processedText, { pooling: poolingStrategy, normalize: true, }); // Extract the embedding vector from the result let embedding; if (result.data) { embedding = Array.from(result.data); } else if (Array.isArray(result)) { embedding = result; } else { throw new Error('Unexpected embedding result format'); } this.logger.debug('Transformers.js embedding generated successfully', { embeddingLength: embedding.length, }); return embedding; } catch (error) { this.logger.error('Failed to generate embedding via transformers.js', { error: (0, error_handling_1.getErrorMessage)(error), textLength: text.length, }); throw new Error(`Transformers.js embedding generation failed: ${(0, error_handling_1.getErrorMessage)(error)}`); } } /** * Generates embedding using Python script. * @param {string} text - The text to generate an embedding for. * @param {boolean} isQuery - Whether this text is a search query. * @returns {Promise<number[]>} The generated embedding vector. */ async generatePythonEmbedding(text, isQuery = false) { try { this.logger.debug('Generating embedding via Python script', { textLength: text.length, isQuery, }); const embedding = await (0, embedding_py_1.getCodeEmbedding)(text, isQuery); this.logger.debug('Python embedding generated successfully', { embeddingLength: embedding.length, }); return embedding; } catch (error) { this.logger.error('Failed to generate embedding via Python script', { error: (0, error_handling_1.getErrorMessage)(error), textLength: text.length, }); throw new Error(`Python embedding generation failed: ${(0, error_handling_1.getErrorMessage)(error)}`); } } /** * Generates embedding using LM Studio server. * @param {string} text - The text to generate an embedding for. * @returns {Promise<number[]>} The generated embedding vector. */ async generateLMStudioEmbedding(text) { const aiConfig = this.config.getAIConfig(); const endpoint = aiConfig.embedding.localEndpoint; if (!endpoint) { throw new Error('LM Studio endpoint not configured'); } try { this.logger.debug('Generating embedding via LM Studio', { endpoint, textLength: text.length, }); const response = await fetch(`${endpoint}/v1/embeddings`, { method: 'POST', headers: { 'Content-Type': 'application/json', }, body: JSON.stringify({ input: text, model: aiConfig.embedding.model || 'default', }), signal: AbortSignal.timeout(30000), // 30 second timeout }); if (!response.ok) { const errorText = await response.text(); this.logger.error('LM Studio API error', { endpoint, status: response.status, error: errorText, }); // Provide helpful error message for common issues if (response.status === 404 && errorText.includes('No models loaded')) { throw new Error('No embedding model loaded in LM Studio. 
Please load an embedding model (e.g., nomic-ai/nomic-embed-text-v1.5) in LM Studio first.'); } throw new Error(`LM Studio API error (${response.status}): ${errorText}`); } const result = await response.json(); if (!result.data || !Array.isArray(result.data) || result.data.length === 0) { throw new Error('Invalid response format from LM Studio'); } const embedding = result.data[0].embedding; if (!Array.isArray(embedding) || embedding.length === 0) { throw new Error('Invalid embedding format from LM Studio'); } this.logger.debug('LM Studio embedding generated successfully', { dimensions: embedding.length, textLength: text.length, }); return embedding; } catch (error) { this.logger.error('Failed to generate LM Studio embedding', { endpoint, error: (0, error_handling_1.getErrorMessage)(error), }); throw error; } } /** * Generates embedding using OpenAI-compatible API (like Ollama). * @param {string} text - The text to generate an embedding for. * @param {boolean} isQuery - Whether this is a query embedding. * @returns {Promise<number[]>} The generated embedding vector. */ async generateOpenAIEmbedding(text, isQuery = false) { if (!this.model || this.model.provider !== 'server') { throw new Error('Server embedding model not configured'); } try { this.logger.debug('Generating embedding via OpenAI API', { apiUrl: this.model.apiUrl, model: this.model.model, textLength: text.length, isQuery, }); const headers = { 'Content-Type': 'application/json', }; // Add API key if provided if (this.model.apiKey) { headers['Authorization'] = `Bearer ${this.model.apiKey}`; } const response = await fetch(`${this.model.apiUrl}/v1/embeddings`, { method: 'POST', headers, body: JSON.stringify({ input: text, model: this.model.model, }), signal: AbortSignal.timeout(30000), // 30 second timeout }); if (!response.ok) { const errorText = await response.text(); this.logger.error('OpenAI API error', { apiUrl: this.model.apiUrl, status: response.status, error: errorText, }); throw new Error(`OpenAI API error (${response.status}): ${errorText}`); } const result = await response.json(); if (!result.data || !Array.isArray(result.data) || result.data.length === 0) { throw new Error('Invalid response format from OpenAI API'); } const embedding = result.data[0].embedding; if (!Array.isArray(embedding) || embedding.length === 0) { throw new Error('Invalid embedding format from OpenAI API'); } this.logger.debug('OpenAI embedding generated successfully', { dimensions: embedding.length, textLength: text.length, }); return embedding; } catch (error) { this.logger.error('Failed to generate OpenAI embedding', { apiUrl: this.model.apiUrl, error: (0, error_handling_1.getErrorMessage)(error), }); throw error; } } /** * Processes nodes in batches to generate embeddings efficiently. * @param {BaseNode[]} nodes - Array of nodes to embed. * @returns {Promise<NodeWithEmbedding[]>} Array of nodes with embeddings. 
*/ async processBatch(nodes) { const aiConfig = this.config.getAIConfig(); const batchSize = aiConfig.embedding.batchSize; const results = []; for (let i = 0; i < nodes.length; i += batchSize) { const batch = nodes.slice(i, i + batchSize); this.logger.info(`Processing embedding batch ${Math.floor(i / batchSize) + 1}/${Math.ceil(nodes.length / batchSize)}`); const batchPromises = batch.map(async (node) => { const text = this.getTextForNode(node); const embedding = await this.generateEmbedding(text); return { ...node, embedding, sourceText: text, }; }); const batchResults = await Promise.all(batchPromises); results.push(...batchResults); } return results; } /** * Generates embeddings for all provided nodes. * @param {BaseNode[]} nodes - Array of nodes to generate embeddings for. * @returns {Promise<NodeWithEmbedding[]>} Array of nodes with embeddings attached. */ async embedNodes(nodes) { const operation = this.logger.operation(`Generating embeddings for ${nodes.length} nodes`); try { this.logger.info(`Starting embedding generation for ${nodes.length} nodes`); if (nodes.length === 0) { this.logger.info('No nodes to embed'); operation(); return []; } // Ensure model is loaded if (!this.model) { await this.loadModel(); } // Process nodes in batches const nodesWithEmbeddings = await this.processBatch(nodes); // Validate embeddings const validEmbeddings = nodesWithEmbeddings.filter((node) => node.embedding && node.embedding.length > 0); if (validEmbeddings.length !== nodesWithEmbeddings.length) { this.logger.warn(`Some embeddings failed to generate`, { total: nodesWithEmbeddings.length, valid: validEmbeddings.length, failed: nodesWithEmbeddings.length - validEmbeddings.length, }); } this.logger.info('Embedding generation completed', { totalNodes: nodes.length, successfulEmbeddings: validEmbeddings.length, embeddingDimensions: validEmbeddings[0]?.embedding?.length || 0, nodeTypes: this.getNodeTypeStats(validEmbeddings), }); operation(); return validEmbeddings; } catch (error) { this.logger.error('Embedding generation failed', { error: (0, error_handling_1.getErrorMessage)(error), }); operation(); throw error; } } /** * Generates an embedding for a single text query (useful for search). * @param {string} query - The query text to embed. * @returns {Promise<number[]>} The embedding vector for the query. */ async embedQuery(query) { this.logger.debug('Generating embedding for query', { queryLength: query.length, }); if (!this.model) { await this.loadModel(); } return await this.generateEmbedding(query, true); } /** * Calculates cosine similarity between two embedding vectors. * @param {number[]} embedding1 - First embedding vector. * @param {number[]} embedding2 - Second embedding vector. * @returns {number} Cosine similarity score between -1 and 1. */ calculateSimilarity(embedding1, embedding2) { if (embedding1.length !== embedding2.length) { throw new Error('Embedding vectors must have the same dimensions'); } let dotProduct = 0; let norm1 = 0; let norm2 = 0; for (let i = 0; i < embedding1.length; i++) { dotProduct += embedding1[i] * embedding2[i]; norm1 += embedding1[i] * embedding1[i]; norm2 += embedding2[i] * embedding2[i]; } const magnitude = Math.sqrt(norm1) * Math.sqrt(norm2); return magnitude === 0 ? 0 : dotProduct / magnitude; } /** * Finds the most similar nodes to a query embedding. * @param {number[]} queryEmbedding - The query embedding vector. * @param {NodeWithEmbedding[]} nodes - Array of nodes with embeddings. * @param {number} topK - Number of top similar nodes to return. 
* @returns {Array<{node: NodeWithEmbedding, similarity: number}>} Top similar nodes with similarity scores. */ findSimilarNodes(queryEmbedding, nodes, topK = 10) { const similarities = nodes.map((node) => ({ node, similarity: this.calculateSimilarity(queryEmbedding, node.embedding), })); return similarities .sort((a, b) => b.similarity - a.similarity) .slice(0, topK); } /** * Gets statistics about the embedded nodes by type. * @param {NodeWithEmbedding[]} nodes - Array of nodes with embeddings. * @returns {Record<string, number>} Node type statistics. */ getNodeTypeStats(nodes) { const stats = {}; for (const node of nodes) { stats[node.type] = (stats[node.type] || 0) + 1; } return stats; } /** * Generates an embedding for a search query. * @param {string} query - The search query text. * @returns {Promise<number[]>} The generated embedding vector. */ async generateQueryEmbedding(query) { return await this.generateEmbedding(query, true); } /** * Generates an embedding for document content. * @param {string} text - The document text. * @returns {Promise<number[]>} The generated embedding vector. */ async generateDocumentEmbedding(text) { return await this.generateEmbedding(text, false); } /** * Gets embedding service statistics. * @returns {Promise<{modelLoaded: boolean, model: string, dimensions: number}>} */ async getStats() { const aiConfig = this.config.getAIConfig(); // Determine dimensions based on the model name // Most common embedding models and their dimensions const modelDimensions = { 'Xenova/all-MiniLM-L6-v2': 384, 'Xenova/all-mpnet-base-v2': 768, 'Xenova/distilbert-base-uncased': 768, 'sentence-transformers/all-MiniLM-L6-v2': 384, 'sentence-transformers/all-mpnet-base-v2': 768, 'mixedbread-ai/mxbai-embed-large-v1': 1024, 'jinaai/jina-embeddings-v2-base-code': 768, }; const modelName = aiConfig.embedding.model; const dimensions = modelDimensions[modelName] || 384; // Default to 384 if unknown return { modelLoaded: !!this.model, model: modelName, dimensions, }; } } exports.EmbeddingService = EmbeddingService;
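
For orientation, here is a minimal usage sketch of the class above. It is not taken from the package's documentation: the config object is a stub standing in for ConfigManager (EmbeddingService only ever calls getAIConfig()), and the node literals, model choice, and import path are illustrative assumptions consistent with the code shown here.

// const { EmbeddingService } = require('./embedding-service'); // adjust to the actual build path

// Stub config: getAIConfig() must return the embedding settings the service reads.
const config = {
    getAIConfig: () => ({
        embedding: {
            provider: 'transformers',
            model: 'Xenova/all-MiniLM-L6-v2',
            batchSize: 32,
        },
    }),
};

async function main() {
    const service = new EmbeddingService(config);
    await service.loadModel();

    // Hypothetical nodes shaped like the cases handled by getTextForNode().
    const nodes = [
        { id: 'fn-1', type: 'FunctionNode', properties: { name: 'parseFile', signature: 'parseFile(path)', body: '' } },
        { id: 'file-1', type: 'FileNode', properties: { fileName: 'parser.ts', imports: ['fs'], exports: ['parseFile'] } },
    ];

    // Embed the nodes, embed a search query, then rank nodes by cosine similarity.
    const embedded = await service.embedNodes(nodes);
    const queryVector = await service.embedQuery('function that parses a source file');
    const top = service.findSimilarNodes(queryVector, embedded, 5);
    console.log(top.map((t) => ({ id: t.node.id, similarity: t.similarity })));
}

main().catch(console.error);

With the 'transformers' provider everything runs in-process via @xenova/transformers; switching to the 'local', 'python', or 'server' provider only changes the config object, since generateEmbedding dispatches on the configured provider and the loaded model handle.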