@hpbyte/h-codex-core
Version:
Core indexing and search functionality for h-codex
108 lines • 6 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.chunkEmbeddingsRepository = exports.ChunkEmbeddingsRepository = void 0;
const drizzle_orm_1 = require("drizzle-orm");
const embedding_1 = require("../config/embedding");
const projects_repository_1 = require("./projects.repository");
const schemas_1 = require("./schemas");
const connection_1 = require("./connection");
/**
 * Data-access layer for code chunks and their vector embeddings.
 *
 * All queries go through the shared `connection_1.db` drizzle instance and the
 * `codeChunks` / `embeddings` tables from `./schemas`.
 */
class ChunkEmbeddingsRepository {
  /**
   * Inserts code chunk rows, skipping rows that hit a conflict.
   *
   * @param {Array<object>} chunks - Rows to insert into `codeChunks`.
   * @returns {Promise<Array<object>>} The rows returned by the insert
   *   (`returning()`); empty when `chunks` is empty.
   */
  async insertCodeChunks(chunks) {
    // Same contract as insertEmbeddings: an empty batch is a no-op instead of
    // an INSERT with no rows (drizzle's `values()` rejects an empty array).
    if (chunks.length === 0) {
      return [];
    }
    return connection_1.db
      .insert(schemas_1.codeChunks)
      .values(chunks)
      .onConflictDoNothing()
      .returning();
  }

  /**
   * Inserts one embedding per code chunk, pairing the two arrays by index.
   *
   * @param {Array<{id: *}>} codeChunks - Chunks whose `id` becomes `chunkId`.
   * @param {Array<*>} embeddings - Embedding values, same order as `codeChunks`.
   * @returns {Promise<Array<object>>} The rows returned by the insert; empty
   *   when either input array is empty.
   * @throws {Error} When both arrays are non-empty but differ in length.
   */
  async insertEmbeddings(codeChunks, embeddings) {
    if (codeChunks.length === 0 || embeddings.length === 0) {
      return [];
    }
    if (codeChunks.length !== embeddings.length) {
      throw new Error('Code chunks and embeddings must have the same length');
    }
    const chunkEmbeddings = codeChunks.map((chunk, index) => ({
      chunkId: chunk.id,
      embedding: embeddings[index],
    }));
    return connection_1.db
      .insert(schemas_1.embeddings)
      .values(chunkEmbeddings)
      .onConflictDoNothing()
      .returning();
  }

  /**
   * Finds the chunks whose embeddings are closest to `queryEmbedding`.
   *
   * Similarity is computed as `1 - cosineDistance`. Only rows whose similarity
   * exceeds `max(threshold, minSimilarity)` are considered. Up to
   * `min(limit * 3, 100)` candidates are fetched, passed through
   * `diversifyResults`, and the first `limit` survivors are returned.
   *
   * @param {*} queryEmbedding - Embedding to compare against.
   * @param {object} [options]
   * @param {number} [options.limit] - Max results; defaults to
   *   `embeddingsConfig.searchResultsLimit`.
   * @param {number} [options.threshold] - Defaults to
   *   `embeddingsConfig.similarityThreshold`.
   * @param {number} [options.minSimilarity=0.2] - Lower bound applied together
   *   with `threshold`.
   * @param {number} [options.diversityFactor=0.8] - See `diversifyResults`.
   * @param {string[]} [options.projects] - Project names to restrict to.
   * @param {string[]} [options.filePaths] - Only these file paths.
   * @param {string[]} [options.excludeFilePaths] - Never these file paths.
   * @param {string[]} [options.languages] - Only these languages.
   * @param {string[]} [options.nodeTypes] - Only these node types.
   * @returns {Promise<Array<{chunk: object, similarity: *}>>} Matching chunks;
   *   empty when `projects` was given but none of the names resolved.
   */
  async findSimilarChunks(queryEmbedding, options = {}) {
    const {
      limit = embedding_1.embeddingsConfig.searchResultsLimit,
      threshold = embedding_1.embeddingsConfig.similarityThreshold,
      filePaths,
      languages,
      nodeTypes,
      excludeFilePaths,
      minSimilarity = 0.2,
      diversityFactor = 0.8,
      projects,
    } = options;

    // Resolve project names first: if the caller asked for specific projects
    // and none exist, the honest answer is "no results" rather than silently
    // searching every project.
    let projectIds = null;
    if (projects && projects.length > 0) {
      const projectRecords = await Promise.all(
        projects.map(name => projects_repository_1.projectsRepository.get(name)),
      );
      // `!= null` also drops `undefined`, so a missing lookup never reaches `.id`.
      projectIds = projectRecords.filter(p => p != null).map(p => p.id);
      if (projectIds.length === 0) {
        return [];
      }
    }

    const { and, cosineDistance, eq, inArray, notInArray, sql } = drizzle_orm_1;

    // Build the distance expression once and reuse it for SELECT/WHERE/ORDER BY.
    const distance = cosineDistance(schemas_1.embeddings.embedding, queryEmbedding);
    const similarity = sql `1 - (${distance})`;

    const conditions = [
      sql `${similarity} > ${Math.max(threshold, minSimilarity)}`,
    ];
    if (projectIds !== null) {
      conditions.push(inArray(schemas_1.codeChunks.projectId, projectIds));
    }
    if (filePaths && filePaths.length > 0) {
      conditions.push(inArray(schemas_1.codeChunks.filePath, filePaths));
    }
    if (excludeFilePaths && excludeFilePaths.length > 0) {
      conditions.push(notInArray(schemas_1.codeChunks.filePath, excludeFilePaths));
    }
    if (languages && languages.length > 0) {
      conditions.push(inArray(schemas_1.codeChunks.language, languages));
    }
    if (nodeTypes && nodeTypes.length > 0) {
      conditions.push(inArray(schemas_1.codeChunks.nodeType, nodeTypes));
    }

    // Over-fetch (3x, capped at 100) so diversification has candidates to drop.
    const retrievalLimit = Math.min(limit * 3, 100);
    const similarChunks = await connection_1.db
      .select({
        chunk: schemas_1.codeChunks,
        similarity,
        distance,
      })
      .from(schemas_1.embeddings)
      .innerJoin(schemas_1.codeChunks, eq(schemas_1.embeddings.chunkId, schemas_1.codeChunks.id))
      .where(and(...conditions))
      .orderBy(distance)
      .limit(retrievalLimit);

    const diversifiedResults = this.diversifyResults(
      similarChunks.map(row => ({
        chunk: row.chunk,
        similarity: row.similarity,
      })),
      diversityFactor,
    );
    return diversifiedResults.slice(0, limit);
  }

  /**
   * Runs `findSimilarChunks` and attaches neighbouring chunks to each hit.
   *
   * For every hit, chunks from the same file and project whose `startLine`
   * lies between `chunk.startLine - contextLines * 10` and
   * `chunk.endLine + contextLines * 10` are loaded, ordered by `startLine`,
   * with the hit itself removed.
   *
   * @param {*} queryEmbedding - Embedding to compare against.
   * @param {object} [options] - `findSimilarChunks` options plus `contextLines`.
   * @param {number} [options.contextLines=3] - Scales the line window (x10).
   * @returns {Promise<Array<{chunk: object, similarity: *, context: object[]}>>}
   */
  async findSimilarChunksWithContext(queryEmbedding, options = {}) {
    const { contextLines = 3, ...searchOptions } = options;
    const { and, eq, sql } = drizzle_orm_1;
    const results = await this.findSimilarChunks(queryEmbedding, searchOptions);
    const lineWindow = contextLines * 10;

    // One query per hit, issued concurrently.
    return Promise.all(results.map(async ({ chunk, similarity }) => {
      const contextChunks = await connection_1.db
        .select()
        .from(schemas_1.codeChunks)
        .where(and(
          eq(schemas_1.codeChunks.filePath, chunk.filePath),
          eq(schemas_1.codeChunks.projectId, chunk.projectId),
          sql `${schemas_1.codeChunks.startLine} BETWEEN ${chunk.startLine - lineWindow} AND ${chunk.endLine + lineWindow}`,
        ))
        .orderBy(schemas_1.codeChunks.startLine);
      return {
        chunk,
        similarity,
        context: contextChunks.filter(c => c.id !== chunk.id),
      };
    }));
  }

  /**
   * Deletes every `codeChunks` row belonging to the given project.
   *
   * NOTE(review): only `codeChunks` is deleted here; the matching `embeddings`
   * rows are presumably removed by a cascade defined in the schema - confirm.
   *
   * @param {*} projectId - Value matched against `codeChunks.projectId`.
   * @returns {Promise<*>} Whatever the drizzle delete query resolves to.
   */
  async clearChunkEmbeddings(projectId) {
    return connection_1.db
      .delete(schemas_1.codeChunks)
      .where((0, drizzle_orm_1.eq)(schemas_1.codeChunks.projectId, projectId));
  }

  /**
   * Limits how many results a single file can contribute.
   *
   * Results are visited in input order. Each result already accepted from the
   * same file adds a penalty of `1 - diversityFactor`; a candidate is kept only
   * while `similarity * (1 - penalty)` stays above 0.1. Kept results are
   * returned sorted by raw similarity, highest first.
   *
   * @param {Array<{chunk: {filePath: string}, similarity: number}>} results
   * @param {number} diversityFactor - `>= 1` disables diversification and
   *   returns `results` unchanged.
   * @returns {Array<{chunk: object, similarity: number}>}
   */
  diversifyResults(results, diversityFactor) {
    if (diversityFactor >= 1.0) {
      return results;
    }
    const diversified = [];
    // A Map (not a plain object) so file paths such as "constructor" or
    // "toString" cannot collide with inherited Object.prototype members.
    const filePathCounts = new Map();
    for (const result of results) {
      const filePath = result.chunk.filePath;
      const currentCount = filePathCounts.get(filePath) || 0;
      const diversityPenalty = currentCount * (1 - diversityFactor);
      const adjustedScore = result.similarity * (1 - diversityPenalty);
      if (adjustedScore > 0.1) {
        diversified.push(result);
        filePathCounts.set(filePath, currentCount + 1);
      }
    }
    return diversified.sort((a, b) => b.similarity - a.similarity);
  }
}
// Export the class constructor.
exports.ChunkEmbeddingsRepository = ChunkEmbeddingsRepository;
// Export a single instance, created once when this module is first loaded.
exports.chunkEmbeddingsRepository = new ChunkEmbeddingsRepository();
//# sourceMappingURL=chunk-embeddings.repository.js.map