UNPKG

@hpbyte/h-codex-core

Version:

Core indexing and search functionality for h-codex

108 lines 6 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.chunkEmbeddingsRepository = exports.ChunkEmbeddingsRepository = void 0;
const drizzle_orm_1 = require("drizzle-orm");
const embedding_1 = require("../config/embedding");
const projects_repository_1 = require("./projects.repository");
const schemas_1 = require("./schemas");
const connection_1 = require("./connection");

const { and, cosineDistance, eq, inArray, notInArray, sql } = drizzle_orm_1;

// Number of source lines that one unit of `contextLines` widens the context
// window by (applied on each side of the matched chunk).
const LINES_PER_CONTEXT_UNIT = 10;

// Results whose diversity-adjusted score is not above this value are dropped.
const MIN_ADJUSTED_SCORE = 0.1;

/**
 * Data-access layer for code chunks and their vector embeddings.
 */
class ChunkEmbeddingsRepository {
    /**
     * Inserts code chunk rows, skipping rows that hit a conflict.
     *
     * @param {Array<object>} chunks - Rows for the `codeChunks` table.
     * @returns {Promise<Array<object>>} The result of the insert's `returning()`;
     *   an empty array when `chunks` is empty.
     */
    async insertCodeChunks(chunks) {
        // Drizzle's `.values([])` throws, so an empty batch is answered directly.
        if (chunks.length === 0) {
            return [];
        }
        return connection_1.db
            .insert(schemas_1.codeChunks)
            .values(chunks)
            .onConflictDoNothing()
            .returning();
    }

    /**
     * Inserts one embedding row per code chunk, pairing them by index.
     *
     * @param {Array<{id: *}>} codeChunks - Chunks whose `id` becomes `chunkId`.
     * @param {Array<*>} embeddings - Embedding values, parallel to `codeChunks`.
     * @returns {Promise<Array<object>>} The result of the insert's `returning()`;
     *   an empty array when either input is empty.
     * @throws {Error} When both inputs are non-empty but differ in length.
     */
    async insertEmbeddings(codeChunks, embeddings) {
        // NOTE: the empty check runs first, so "some chunks, zero embeddings"
        // returns [] rather than throwing. Kept as-is for existing callers.
        if (codeChunks.length === 0 || embeddings.length === 0) {
            return [];
        }
        if (codeChunks.length !== embeddings.length) {
            throw new Error('Code chunks and embeddings must have the same length');
        }
        const chunkEmbeddings = codeChunks.map((chunk, index) => ({
            chunkId: chunk.id,
            embedding: embeddings[index],
        }));
        return connection_1.db
            .insert(schemas_1.embeddings)
            .values(chunkEmbeddings)
            .onConflictDoNothing()
            .returning();
    }

    /**
     * Finds the chunks whose embeddings are closest (by cosine distance) to
     * `queryEmbedding`, applies the optional filters, then diversifies the
     * result across files.
     *
     * @param {*} queryEmbedding - Embedding to compare against.
     * @param {object} [options]
     * @param {number} [options.limit] - Maximum results returned; defaults to
     *   `embeddingsConfig.searchResultsLimit`.
     * @param {number} [options.threshold] - Similarity floor; defaults to
     *   `embeddingsConfig.similarityThreshold`.
     * @param {number} [options.minSimilarity=0.2] - Second similarity floor; the
     *   larger of `threshold` and `minSimilarity` is the one applied.
     * @param {number} [options.diversityFactor=0.8] - See `diversifyResults`.
     * @param {string[]} [options.projects] - Project names to restrict to.
     * @param {string[]} [options.filePaths] - File paths to restrict to.
     * @param {string[]} [options.excludeFilePaths] - File paths to exclude.
     * @param {string[]} [options.languages] - Languages to restrict to.
     * @param {string[]} [options.nodeTypes] - Node types to restrict to.
     * @returns {Promise<Array<{chunk: object, similarity: *}>>} At most `limit`
     *   results, most similar first.
     */
    async findSimilarChunks(queryEmbedding, options = {}) {
        const {
            limit = embedding_1.embeddingsConfig.searchResultsLimit,
            threshold = embedding_1.embeddingsConfig.similarityThreshold,
            filePaths,
            languages,
            nodeTypes,
            excludeFilePaths,
            minSimilarity = 0.2,
            diversityFactor = 0.8,
            projects,
        } = options;

        const distance = cosineDistance(schemas_1.embeddings.embedding, queryEmbedding);
        const similarity = sql `1 - (${distance})`;
        const conditions = [
            sql `${similarity} > ${Math.max(threshold, minSimilarity)}`,
        ];

        if (projects && projects.length > 0) {
            const projectRecords = await Promise.all(
                projects.map(name => projects_repository_1.projectsRepository.get(name)));
            const projectIds = projectRecords
                .filter(record => record != null)
                .map(record => record.id);
            // A project filter was requested but nothing matched it. Dropping the
            // filter here would leak results from every other project.
            if (projectIds.length === 0) {
                return [];
            }
            conditions.push(inArray(schemas_1.codeChunks.projectId, projectIds));
        }
        if (filePaths && filePaths.length > 0) {
            conditions.push(inArray(schemas_1.codeChunks.filePath, filePaths));
        }
        if (excludeFilePaths && excludeFilePaths.length > 0) {
            conditions.push(notInArray(schemas_1.codeChunks.filePath, excludeFilePaths));
        }
        if (languages && languages.length > 0) {
            conditions.push(inArray(schemas_1.codeChunks.language, languages));
        }
        if (nodeTypes && nodeTypes.length > 0) {
            conditions.push(inArray(schemas_1.codeChunks.nodeType, nodeTypes));
        }

        // Over-fetch (3x, capped at 100) so diversification has spare candidates.
        const retrievalLimit = Math.min(limit * 3, 100);
        const similarChunks = await connection_1.db
            .select({
                chunk: schemas_1.codeChunks,
                similarity,
                distance,
            })
            .from(schemas_1.embeddings)
            .innerJoin(schemas_1.codeChunks, eq(schemas_1.embeddings.chunkId, schemas_1.codeChunks.id))
            .where(and(...conditions))
            .orderBy(distance)
            .limit(retrievalLimit);

        const candidates = similarChunks.map(row => ({
            chunk: row.chunk,
            similarity: row.similarity,
        }));
        return this.diversifyResults(candidates, diversityFactor).slice(0, limit);
    }

    /**
     * Runs `findSimilarChunks` and attaches neighbouring chunks from the same
     * file and project to every hit.
     *
     * A neighbour is any chunk whose `startLine` lies between
     * `chunk.startLine - contextLines * 10` and `chunk.endLine + contextLines * 10`;
     * the hit itself is excluded from its own context.
     *
     * @param {*} queryEmbedding - Embedding to compare against.
     * @param {object} [options] - `findSimilarChunks` options plus:
     * @param {number} [options.contextLines=3] - Context window size unit.
     * @returns {Promise<Array<{chunk: object, similarity: *, context: object[]}>>}
     */
    async findSimilarChunksWithContext(queryEmbedding, options = {}) {
        const { contextLines = 3, ...searchOptions } = options;
        const results = await this.findSimilarChunks(queryEmbedding, searchOptions);
        const lineMargin = contextLines * LINES_PER_CONTEXT_UNIT;

        return Promise.all(results.map(async ({ chunk, similarity }) => {
            const contextChunks = await connection_1.db
                .select()
                .from(schemas_1.codeChunks)
                .where(and(
                    eq(schemas_1.codeChunks.filePath, chunk.filePath),
                    eq(schemas_1.codeChunks.projectId, chunk.projectId),
                    sql `${schemas_1.codeChunks.startLine} BETWEEN ${chunk.startLine - lineMargin} AND ${chunk.endLine + lineMargin}`))
                .orderBy(schemas_1.codeChunks.startLine);
            return {
                chunk,
                similarity,
                context: contextChunks.filter(candidate => candidate.id !== chunk.id),
            };
        }));
    }

    /**
     * Deletes every `codeChunks` row belonging to a project.
     *
     * NOTE(review): only `codeChunks` is deleted here; presumably the related
     * embeddings go away via a cascading foreign key — confirm against the schema.
     *
     * @param {*} projectId - Value matched against `codeChunks.projectId`.
     * @returns {Promise<*>} The result of the delete query.
     */
    async clearChunkEmbeddings(projectId) {
        return connection_1.db
            .delete(schemas_1.codeChunks)
            .where(eq(schemas_1.codeChunks.projectId, projectId));
    }

    /**
     * Limits how many results a single file can contribute.
     *
     * Each result's similarity is scaled by
     * `1 - alreadyKeptFromSameFile * (1 - diversityFactor)`; it is kept only if
     * the scaled score exceeds 0.1. Kept results retain their original
     * similarity and are returned sorted by it, highest first.
     *
     * @param {Array<{chunk: {filePath: string}, similarity: number}>} results
     * @param {number} diversityFactor - `>= 1` disables diversification and
     *   returns `results` untouched.
     * @returns {Array<{chunk: object, similarity: number}>}
     */
    diversifyResults(results, diversityFactor) {
        if (diversityFactor >= 1.0) {
            return results;
        }
        const diversified = [];
        // A Map (not a plain object) so paths such as "constructor" or
        // "toString" do not pick up inherited properties as bogus counts.
        const filePathCounts = new Map();
        for (const result of results) {
            const filePath = result.chunk.filePath;
            const currentCount = filePathCounts.get(filePath) || 0;
            const diversityPenalty = currentCount * (1 - diversityFactor);
            const adjustedScore = result.similarity * (1 - diversityPenalty);
            if (adjustedScore > MIN_ADJUSTED_SCORE) {
                diversified.push(result);
                filePathCounts.set(filePath, currentCount + 1);
            }
        }
        return diversified.sort((a, b) => b.similarity - a.similarity);
    }
}
exports.ChunkEmbeddingsRepository = ChunkEmbeddingsRepository;
exports.chunkEmbeddingsRepository = new ChunkEmbeddingsRepository();
//# sourceMappingURL=chunk-embeddings.repository.js.map