@alvinveroy/codecompass
Version:
AI-powered MCP server for codebase navigation and LLM prompt optimization
709 lines (708 loc) • 44.3 kB
JavaScript
;
// TypeScript-emitted interop helper: exposes property `k` of module object `m` on object `o`
// under the name `k2` (which defaults to `k`). An already-defined `this.__createBinding` is reused.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Replace the descriptor with an enumerable live getter when it is missing, when it is an accessor
// on a module not flagged __esModule, or when it is a writable/configurable data property.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Variant chosen when Object.create is falsy: copies the current value by plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
/**
 * TypeScript interop helper: stores `value` on `target` as its "default" member.
 * An already-defined `this.__setModuleDefault` is reused. Otherwise the property is defined
 * as enumerable via Object.defineProperty, or assigned directly when Object.create is falsy.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            Object.defineProperty(target, "default", { enumerable: true, value: value });
        };
    }
    return function (target, value) {
        target["default"] = value;
    };
})();
// TypeScript-emitted interop helper behind `import * as ns` in the compiled output.
// Modules flagged __esModule are returned unchanged; any other value is wrapped in a fresh
// object that re-exports its own non-"default" properties and carries the value itself as "default".
var __importStar = (this && this.__importStar) || (function () {
// Lists an object's own property names. On its first call it replaces itself with
// Object.getOwnPropertyNames, or with a for-in + hasOwnProperty fallback when that is unavailable.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own property except "default"; the default slot is set separately below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
/**
 * TypeScript interop helper behind default imports in the compiled output.
 * Values flagged `__esModule` are returned as-is; anything else (including null and
 * undefined) is wrapped as `{ default: value }`.
 */
var __importDefault = (this && this.__importDefault) || function (moduleValue) {
    if (moduleValue && moduleValue.__esModule) {
        return moduleValue;
    }
    return { "default": moduleValue };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getGlobalIndexingStatus = getGlobalIndexingStatus;
exports.validateGitRepository = validateGitRepository;
exports.indexRepository = indexRepository;
exports.getRepositoryDiff = getRepositoryDiff;
exports.getCommitHistoryWithChanges = getCommitHistoryWithChanges;
const git = __importStar(require("isomorphic-git")); // Use namespace import
const promises_1 = __importDefault(require("fs/promises"));
const path_1 = __importDefault(require("path"));
const child_process_1 = require("child_process"); // Import exec
const util_1 = require("util"); // To promisify exec
const text_utils_1 = require("../utils/text-utils");
const Diff = __importStar(require("diff"));
// import { Buffer } from 'buffer'; // Buffer is global in Node.js
const config_service_1 = require("./config-service");
// import { generateEmbedding } from "./ollama"; // We will use llmProvider.generateEmbedding() instead.
const uuid_1 = require("uuid"); // Import uuidv4
const fs_1 = __importDefault(require("fs")); // Standard fs for isomorphic-git functions requiring it
const qdrant_1 = require("./qdrant");
// Module-level indexing status shared by the functions in this file. It starts out as 'idle';
// indexRepository replaces and mutates it while it runs, and getGlobalIndexingStatus returns a copy.
let currentIndexingStatus = {
status: 'idle',
message: 'Indexing not started.',
overallProgress: 0,
// Timestamp of module load; later updates overwrite it with the time of each status change.
lastUpdatedAt: new Date().toISOString(),
};
/**
 * Returns a shallow copy of the module-level indexing status.
 * The copy's `lastUpdatedAt` is always set to the time of this call; the shared
 * status object itself is not modified.
 * @returns {object} Snapshot of the current indexing status.
 */
function getGlobalIndexingStatus() {
    const snapshot = Object.assign({}, currentIndexingStatus);
    snapshot.lastUpdatedAt = new Date().toISOString();
    return snapshot;
}
/**
 * Checks whether `repoPath` looks like a usable Git repository.
 * The check passes when `<repoPath>/.git` is accessible and `HEAD` can be resolved.
 * Any failure is logged as a warning and reported as `false`.
 * @param {string} repoPath - Path of the working directory to check.
 * @returns {Promise<boolean>} true when both checks succeed, false otherwise.
 */
async function validateGitRepository(repoPath) {
    let isValid = false;
    try {
        const gitdir = path_1.default.join(repoPath, ".git");
        await promises_1.default.access(gitdir);
        await git.resolveRef({ fs: fs_1.default, dir: repoPath, gitdir, ref: "HEAD" });
        isValid = true;
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        config_service_1.logger.warn(`Git repository validation failed for ${repoPath}`, { error: reason });
    }
    return isValid;
}
// Index Repository
/** Merges `patch` into the shared indexing status and refreshes its `lastUpdatedAt` timestamp. */
function patchIndexingStatus(patch) {
    Object.assign(currentIndexingStatus, patch, { lastUpdatedAt: new Date().toISOString() });
}
/** Keeps tracked files that have a known code extension and are not under node_modules/ or dist/. */
function selectIndexableFiles(files) {
    const codeExtensions = ['.ts', '.js', '.tsx', '.jsx', '.json', '.md', '.html', '.css', '.scss', '.py', '.java', '.c', '.cpp', '.go', '.rs', '.php', '.rb'];
    return files.filter((file) => {
        const ext = path_1.default.extname(file).toLowerCase();
        return codeExtensions.includes(ext) && !file.includes('node_modules/') && !file.includes('dist/');
    });
}
/**
 * Scrolls through the whole collection and deletes `file_chunk` points whose `filepath`
 * is not among `indexableFiles`. Points of other data types are left alone.
 */
async function removeStaleFileChunks(qdrantClient, indexableFiles) {
    const { logger } = config_service_1;
    const collectionName = config_service_1.configService.COLLECTION_NAME;
    const currentFilePathsInRepo = new Set(indexableFiles);
    // IDs are kept in the type Qdrant returned them in (number or string). Stringifying a
    // numeric ID would produce a value that is neither an unsigned integer nor a UUID.
    const stalePointIds = [];
    const scrollLimit = 250; // Number of points to fetch per scroll request
    let nextOffset = undefined;
    logger.debug(`Starting scroll operation to fetch all indexed filepaths from collection: ${collectionName}`);
    do {
        const scrollResult = await qdrantClient.scroll(collectionName, {
            with_payload: true, // The payload is needed to read dataType and filepath
            with_vector: false,
            limit: scrollLimit,
            offset: nextOffset,
        });
        if (scrollResult.points.length > 0) {
            logger.debug(`Scrolled ${scrollResult.points.length} points from Qdrant.`);
        }
        for (const point of scrollResult.points) {
            const pointId = point.id;
            const payload = point.payload;
            if (!payload || payload.dataType !== 'file_chunk') {
                // Only file_chunk points are checked by filepath here; commit_info and diff_chunk are skipped.
                logger.debug(`Point ID ${pointId} is not a 'file_chunk' or lacks expected payload structure. Skipping filepath-based stale check.`);
                continue;
            }
            if (!payload.filepath) {
                logger.warn(`Found file_chunk point in Qdrant (ID: ${pointId}) without a 'filepath' in its payload. Skipping stale check for this point.`);
                continue;
            }
            if (!currentFilePathsInRepo.has(payload.filepath)) {
                stalePointIds.push(pointId);
                logger.debug(`Marking stale file_chunk entry for deletion: ${payload.filepath} (ID: ${pointId})`);
            }
        }
        // Only string or number offsets are followed; null, undefined or an object ends pagination.
        const rawNextOffset = scrollResult.next_page_offset;
        nextOffset = (typeof rawNextOffset === 'string' || typeof rawNextOffset === 'number') ? rawNextOffset : undefined;
    } while (nextOffset);
    if (stalePointIds.length === 0) {
        logger.info("No stale entries found in Qdrant index.");
        return;
    }
    logger.info(`Found ${stalePointIds.length} stale entries to remove from Qdrant.`);
    await qdrantClient.delete(collectionName, { points: stalePointIds });
    logger.info(`Successfully removed ${stalePointIds.length} stale entries from Qdrant.`);
}
/**
 * Reads one file, splits it into chunks, embeds each non-empty chunk and upserts the points.
 * NOTE(review): every run creates points with fresh UUIDs and existing chunks of the same file
 * are not removed here, so re-indexing appears to accumulate duplicates — confirm whether the
 * collection is reset elsewhere.
 * @returns {Promise<boolean>} true when at least one chunk was upserted for the file.
 */
async function indexSingleFile(qdrantClient, repoPath, filepath, llmProvider) {
    const { logger, configService } = config_service_1;
    const fullPath = path_1.default.join(repoPath, filepath);
    const content = await promises_1.default.readFile(fullPath, "utf8");
    const last_modified = (await promises_1.default.stat(fullPath)).mtime.toISOString();
    if (!content.trim()) {
        logger.info(`Skipping ${filepath}: empty file`);
        return false;
    }
    currentIndexingStatus.currentFile = filepath;
    // Preprocess before chunking so the stored chunk text matches what is embedded.
    const processedContent = (0, text_utils_1.preprocessText)(content);
    const contentChunks = (0, text_utils_1.chunkText)(processedContent, configService.FILE_INDEXING_CHUNK_SIZE_CHARS, configService.FILE_INDEXING_CHUNK_OVERLAP_CHARS);
    if (contentChunks.length === 0) {
        logger.warn(`File ${filepath} was processed but produced 0 chunks (original content length: ${content.length}).`);
        return false;
    }
    logger.info(`Indexing ${filepath} in ${contentChunks.length} chunks.`);
    const pointsToUpsert = [];
    for (let i = 0; i < contentChunks.length; i++) {
        const chunkContent = contentChunks[i];
        if (!chunkContent.trim()) {
            logger.debug(`Skipping empty chunk ${i + 1}/${contentChunks.length} for ${filepath}`);
            continue;
        }
        const embedding = await llmProvider.generateEmbedding(chunkContent);
        pointsToUpsert.push({
            id: (0, uuid_1.v4)(),
            vector: embedding,
            payload: {
                dataType: 'file_chunk',
                filepath,
                file_content_chunk: chunkContent,
                last_modified,
                chunk_index: i,
                total_chunks: contentChunks.length,
                repositoryPath: repoPath,
            },
        });
    }
    if (pointsToUpsert.length === 0) {
        logger.warn(`File ${filepath} produced 0 valid chunks after processing.`);
        return false;
    }
    await (0, qdrant_1.batchUpsertVectors)(qdrantClient, configService.COLLECTION_NAME, pointsToUpsert, configService.QDRANT_BATCH_UPSERT_SIZE);
    logger.info(`Successfully indexed ${pointsToUpsert.length} chunks for ${filepath}`);
    return true;
}
/**
 * Indexes a Git repository into Qdrant: validates the repository, lists and filters tracked
 * files, removes stale file chunks, indexes file contents, then indexes commits and diffs.
 * Progress is published through the module-level `currentIndexingStatus`.
 *
 * Failures for individual files and in stale-entry cleanup are logged and do not abort the run.
 * A failure in commit/diff indexing leaves the status at 'error'.
 *
 * @param {object} qdrantClient - Client exposing `scroll` and `delete`; also passed to batchUpsertVectors.
 * @param {string} repoPath - Path of the repository working directory.
 * @param {object} llmProvider - Provider exposing `generateEmbedding(text)`.
 * @returns {Promise<void>}
 */
async function indexRepository(qdrantClient, repoPath, llmProvider) {
    const { logger } = config_service_1;
    // A fresh object drops fields left over from a previous run (errorDetails, counters, ...).
    currentIndexingStatus = {
        status: 'initializing',
        message: `Starting repository indexing for: ${repoPath}`,
        overallProgress: 0,
        lastUpdatedAt: new Date().toISOString(),
    };
    logger.info(currentIndexingStatus.message);
    const isGitRepo = await validateGitRepository(repoPath);
    if (!isGitRepo) {
        logger.warn(`Skipping repository indexing: ${repoPath} is not a valid Git repository`);
        currentIndexingStatus = {
            status: 'error',
            message: `Repository path ${repoPath} is not a valid Git repository.`,
            errorDetails: `Validation failed for ${repoPath}.`,
            overallProgress: 0,
            lastUpdatedAt: new Date().toISOString(),
        };
        return;
    }
    patchIndexingStatus({
        status: 'validating_repo',
        message: 'Repository validated. Listing files...',
        overallProgress: 5,
    });
    const files = await git.listFiles({ fs: fs_1.default, dir: repoPath, gitdir: path_1.default.join(repoPath, ".git"), ref: "HEAD" });
    logger.info(`Found ${files.length} files in repository`);
    if (!files.length) {
        logger.warn("No files to index in repository.");
        // Publish a terminal state; otherwise pollers would see 'validating_repo' forever.
        patchIndexingStatus({
            status: 'completed',
            message: 'Repository contains no tracked files. Nothing to index.',
            overallProgress: 100,
        });
        return;
    }
    patchIndexingStatus({
        status: 'listing_files',
        message: `Found ${files.length} total files. Filtering for code files...`,
        overallProgress: 7,
    });
    const filteredFiles = selectIndexableFiles(files);
    logger.info(`Filtered to ${filteredFiles.length} code files for indexing`);
    patchIndexingStatus({
        message: `Found ${filteredFiles.length} code files to process.`,
        totalFilesToIndex: filteredFiles.length,
        filesIndexed: 0,
        overallProgress: 10,
    });
    if (filteredFiles.length === 0) {
        logger.warn("No code files found to index after filtering.");
    }
    // Clean up stale entries from Qdrant
    try {
        patchIndexingStatus({
            status: 'cleaning_stale_entries',
            message: 'Checking for and removing stale entries from Qdrant index...',
            overallProgress: 15,
        });
        logger.info("Checking for stale entries in Qdrant index...");
        await removeStaleFileChunks(qdrantClient, filteredFiles);
    }
    catch (error) {
        logger.error("Error during stale entry cleanup in Qdrant. Indexing of current files will proceed.", {
            message: error instanceof Error ? error.message : String(error),
            stack: error instanceof Error ? error.stack : undefined
        });
        // The 'error' status is overwritten by the phases below; errorDetails stays set.
        patchIndexingStatus({
            status: 'error',
            message: 'Error during stale entry cleanup in Qdrant.',
            errorDetails: error instanceof Error ? error.message : String(error),
        });
    }
    let successCount = 0;
    let errorCount = 0;
    if (filteredFiles.length > 0) {
        patchIndexingStatus({
            message: 'Stale entry cleanup complete. Starting file content indexing.',
            status: 'indexing_file_content',
            overallProgress: 20,
        });
    }
    for (const filepath of filteredFiles) {
        logger.info(`[DEBUG] indexRepository: Processing file: ${filepath}`);
        try {
            const wasIndexed = await indexSingleFile(qdrantClient, repoPath, filepath, llmProvider);
            if (!wasIndexed) {
                continue;
            }
            if (currentIndexingStatus.filesIndexed !== undefined && currentIndexingStatus.totalFilesToIndex && currentIndexingStatus.totalFilesToIndex > 0) {
                currentIndexingStatus.filesIndexed++;
                const fileProgressContribution = 50; // File indexing spans 20% to 70% of overall progress
                currentIndexingStatus.overallProgress = 20 + Math.round((currentIndexingStatus.filesIndexed / currentIndexingStatus.totalFilesToIndex) * fileProgressContribution);
                currentIndexingStatus.lastUpdatedAt = new Date().toISOString();
            }
            successCount++;
        }
        catch (error) {
            logger.error(`[DEBUG] indexRepository: Error processing file ${filepath}`, { /* ... */});
            logger.error(`Failed to index ${filepath}`, {
                message: error instanceof Error ? error.message : String(error)
            });
            errorCount++;
        }
    }
    patchIndexingStatus({
        status: 'indexing_commits_diffs',
        message: 'File content indexing complete. Starting commit and diff indexing.',
        currentFile: undefined,
        overallProgress: 70, // Files done, moving to commits
    });
    try {
        logger.info(`Starting indexing of commit history and diffs for ${repoPath}`);
        await indexCommitsAndDiffs(qdrantClient, repoPath, llmProvider);
    }
    catch (commitIndexError) {
        patchIndexingStatus({
            status: 'error',
            message: 'Failed to index commit history and diffs.',
            errorDetails: commitIndexError instanceof Error ? commitIndexError.message : String(commitIndexError),
        });
        logger.error(`Failed to index commit history and diffs for ${repoPath}`, {
            message: commitIndexError instanceof Error ? commitIndexError.message : String(commitIndexError),
            stack: commitIndexError instanceof Error ? commitIndexError.stack : undefined,
        });
    }
    if (currentIndexingStatus.status !== 'error') {
        patchIndexingStatus({
            status: 'completed',
            message: `Repository indexing complete. ${successCount} files indexed. ${errorCount} errors during file indexing.`,
            overallProgress: 100,
            currentCommit: undefined,
        });
        logger.info(currentIndexingStatus.message);
    }
    else {
        logger.error(`Indexing finished with an error state: ${currentIndexingStatus.message} - ${currentIndexingStatus.errorDetails}`);
    }
    logger.info(`[DEBUG] indexRepository: Finished for repoPath: ${repoPath}`);
}
// Get Repository Diff
// Promise-returning form of child_process.exec, used to run the `git diff` command
const execAsync = util_1.promisify(child_process_1.exec);
// Upper bound, in characters, on the diff text returned by getRepositoryDiff
const MAX_DIFF_LENGTH = 10000;
/**
 * Returns the textual `git diff` between the two most recent commits of a repository.
 *
 * The result is always a string: the (possibly truncated) diff, or a human-readable message
 * when the path is not a repository, there are fewer than two commits, the diff is empty,
 * or the command fails. Errors are logged and never thrown to the caller.
 *
 * @param {string} repoPath - Path of the repository working directory.
 * @param {Function} [validatorFunc] - Optional replacement for validateGitRepository, for tests.
 * @returns {Promise<string>} Diff text or a status message.
 */
async function getRepositoryDiff(repoPath,
// Optional validator override so tests can bypass the real repository check
validatorFunc) {
    const { logger } = config_service_1;
    let isGitRepo;
    if (validatorFunc) {
        isGitRepo = await validatorFunc(repoPath);
    }
    else {
        isGitRepo = await validateGitRepository(repoPath);
    }
    if (!isGitRepo) {
        logger.warn(`Cannot get repository diff: ${repoPath} is not a valid Git repository`);
        return "No Git repository found";
    }
    try {
        const recentCommits = await git.log({ fs: fs_1.default, dir: repoPath, depth: 2, gitdir: path_1.default.join(repoPath, ".git") });
        if (recentCommits.length < 2) {
            logger.info(`Not enough commits in ${repoPath} to generate a diff (found ${recentCommits.length}).`);
            return "No previous commits to compare";
        }
        const [newerCommit, olderCommit] = recentCommits;
        // The textual diff comes from the git CLI, run inside the repository directory
        const command = `git diff ${olderCommit.oid} ${newerCommit.oid}`;
        logger.info(`Executing diff command: ${command} in ${repoPath}`);
        const { stdout: rawDiff, stderr: diffWarnings } = await execAsync(command, { cwd: repoPath, maxBuffer: 1024 * 1024 * 5 }); // 5MB buffer
        if (diffWarnings) {
            // stderr output alone is only logged; a failing command rejects instead
            logger.warn(`Git diff command produced stderr: ${diffWarnings}`);
        }
        const trimmedDiff = rawDiff.trim();
        if (!trimmedDiff) {
            return "No textual changes found between last two commits.";
        }
        if (trimmedDiff.length <= MAX_DIFF_LENGTH) {
            return trimmedDiff;
        }
        logger.info(`Diff output is too long (${trimmedDiff.length} chars), truncating to ${MAX_DIFF_LENGTH} chars.`);
        return trimmedDiff.substring(0, MAX_DIFF_LENGTH) + "\n... (diff truncated)";
    }
    catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));
        const errorDetails = {
            message: err.message,
            stack: err.stack
        };
        // Copy stderr and exit code when the thrown value carries them (exec-style errors)
        const isObjectLike = error !== null && typeof error === 'object';
        if (isObjectLike && 'stderr' in error && typeof error.stderr === 'string') {
            errorDetails.stderr = error.stderr;
        }
        if (isObjectLike && 'code' in error && (typeof error.code === 'number' || typeof error.code === 'string')) {
            errorDetails.code = error.code;
        }
        logger.error(`Error retrieving git diff for ${repoPath}: ${err.message}`, errorDetails);
        let errorMessage;
        if (err && typeof err.message === 'string') {
            errorMessage = err.message;
        }
        else {
            errorMessage = String(err);
        }
        return `Failed to retrieve diff for ${repoPath}: ${errorMessage}`;
    }
}
/**
 * Reads commit history and, for every commit, the files it changed relative to its first parent.
 *
 * Each changed file is reported as `{ path, type, oldOid, newOid, diffText? }` where `type` is
 * 'add', 'modify', 'delete' or 'typechange'. `diffText` is a unified patch built from the blob
 * contents decoded as UTF-8; it is omitted for 'typechange' and when patch generation fails.
 * Commits without parents report every file in their own tree as 'add'.
 *
 * @param {string} repoPath - Path of the repository working directory.
 * @param {{count?: number, since?: *, ref?: string}} [options] - Forwarded to `git.log` as
 *   `depth`, `since` and `ref` respectively when set.
 * @returns {Promise<Array<object>>} One entry per commit with `oid`, `message`, `author`,
 *   `committer`, `changedFiles` and `parents`.
 * @throws {Error} Logs and rethrows any failure while reading history (non-Error values are wrapped).
 */
async function getCommitHistoryWithChanges(repoPath, options) {
    const gitdir = path_1.default.join(repoPath, ".git");
    const detailedCommits = [];
    // Reads a blob as UTF-8 text; a null OID stands for "no file on this side" and yields ''.
    const readBlobText = async (oid) => {
        if (!oid) {
            return '';
        }
        const { blob } = await git.readBlob({ fs: fs_1.default, dir: repoPath, gitdir, oid });
        return Buffer.from(blob).toString('utf8');
    };
    const buildPatch = (filepath, before, after) => Diff.createPatch(filepath, before, after, '', '', { context: config_service_1.configService.DIFF_LINES_OF_CONTEXT });
    try {
        const logOptions = {
            fs: fs_1.default,
            dir: repoPath,
            gitdir,
        };
        if (options?.count) {
            logOptions.depth = options.count;
        }
        if (options?.since) {
            logOptions.since = options.since;
        }
        if (options?.ref) {
            logOptions.ref = options.ref;
        }
        const commits = await git.log(logOptions);
        for (const commitEntry of commits) {
            // Read the full commit to obtain its parent list.
            const commitData = await git.readCommit({
                fs: fs_1.default,
                dir: repoPath,
                gitdir,
                oid: commitEntry.oid,
            });
            const parentOids = commitData.commit.parent || []; // Always an array
            const changedFiles = [];
            if (parentOids.length > 0) {
                // Compare the first parent's tree with this commit's tree.
                await git.walk({
                    fs: fs_1.default,
                    dir: repoPath,
                    gitdir,
                    trees: [git.TREE({ ref: parentOids[0] }), git.TREE({ ref: commitEntry.oid })],
                    map: async function (filepath, [parentEntry, currentEntry]) {
                        // walk() stops descending when map returns null for a tree entry, so the root
                        // and directories return undefined to keep the traversal going.
                        if (filepath === '.') {
                            return undefined;
                        }
                        const oldType = parentEntry ? await parentEntry.type() : null;
                        const oldOid = parentEntry ? await parentEntry.oid() : null;
                        const newType = currentEntry ? await currentEntry.type() : null;
                        const newOid = currentEntry ? await currentEntry.oid() : null;
                        if (oldType === 'tree' && newType === 'tree' && oldOid === newOid) {
                            return null; // Identical subtree: nothing below it can have changed
                        }
                        if (oldType !== 'blob' && newType !== 'blob') {
                            return undefined; // Only file changes are reported
                        }
                        let changeEntry = null;
                        if (!parentEntry && currentEntry) {
                            changeEntry = { path: filepath, type: 'add', oldOid: null, newOid };
                        }
                        else if (parentEntry && !currentEntry) {
                            changeEntry = { path: filepath, type: 'delete', oldOid, newOid: null };
                        }
                        else if (parentEntry && currentEntry) {
                            if (oldOid !== newOid) {
                                changeEntry = { path: filepath, type: 'modify', oldOid, newOid };
                            }
                            else if (oldType !== newType) {
                                changeEntry = { path: filepath, type: 'typechange', oldOid, newOid };
                            }
                            // Mode-only changes (same OID and type) are not reported.
                        }
                        if (!changeEntry) {
                            return undefined;
                        }
                        if (changeEntry.type !== 'typechange') {
                            try {
                                const contentA = await readBlobText(changeEntry.oldOid);
                                const contentB = await readBlobText(changeEntry.newOid);
                                changeEntry.diffText = buildPatch(filepath, contentA, contentB);
                            }
                            catch (diffError) {
                                // The change is still reported, just without diffText.
                                config_service_1.logger.warn(`Could not generate diff for ${filepath} in commit ${commitEntry.oid}`, { error: diffError instanceof Error ? diffError.message : String(diffError) });
                            }
                        }
                        changedFiles.push(changeEntry);
                        return undefined;
                    },
                });
            }
            else {
                // Commit without parents: every file in its own tree is an 'add'.
                // The walker is pinned to this commit; a bare TREE() would walk HEAD instead.
                await git.walk({
                    fs: fs_1.default,
                    dir: repoPath,
                    gitdir,
                    trees: [git.TREE({ ref: commitEntry.oid })],
                    map: async function (filepath, [entry]) {
                        // Never return null here: that would prevent descending into directories.
                        if (filepath === '.' || !entry || await entry.type() !== 'blob') {
                            return undefined;
                        }
                        const oid = await entry.oid();
                        const changeEntry = { path: filepath, type: 'add', oldOid: null, newOid: oid };
                        try {
                            // The patch is computed against an empty file.
                            changeEntry.diffText = buildPatch(filepath, '', await readBlobText(oid));
                        }
                        catch (diffError) {
                            config_service_1.logger.warn(`Could not generate diff for added file ${filepath} in initial commit ${commitEntry.oid}`, { error: diffError instanceof Error ? diffError.message : String(diffError) });
                        }
                        changedFiles.push(changeEntry);
                        return undefined;
                    },
                });
            }
            detailedCommits.push({
                oid: commitEntry.oid,
                message: commitEntry.commit.message,
                author: commitEntry.commit.author,
                committer: commitEntry.commit.committer,
                changedFiles,
                parents: parentOids,
            });
        }
        config_service_1.logger.info(`Retrieved ${detailedCommits.length} commits with changes for ${repoPath}`);
        return detailedCommits;
    }
    catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));
        config_service_1.logger.error(`Failed to get commit history with changes for ${repoPath}: ${err.message}`, { stack: err.stack });
        throw err;
    }
}
/**
 * Embeds commit metadata and per-file diff chunks and upserts them into Qdrant.
 *
 * For every commit returned by getCommitHistoryWithChanges one `commit_info` point is created,
 * plus one `diff_chunk` point per non-empty chunk of each changed file's diff. Points are sent
 * in batches once the buffer reaches QDRANT_BATCH_UPSERT_SIZE, with a final flush at the end.
 * A commit whose embedding fails is skipped together with its diffs; a failing diff chunk is
 * skipped on its own. Progress (70% to 95%) is published via `currentIndexingStatus`.
 *
 * @param {object} qdrantClient - Client passed through to batchUpsertVectors.
 * @param {string} repoPath - Path of the repository working directory.
 * @param {object} llmProvider - Provider exposing `generateEmbedding(text)`.
 * @returns {Promise<void>}
 */
async function indexCommitsAndDiffs(qdrantClient, repoPath, llmProvider) {
    const logger = config_service_1.logger;
    const settings = config_service_1.configService;
    logger.info(`Indexing commit history and diffs for repository: ${repoPath}`);
    currentIndexingStatus.message = 'Fetching commit history...';
    currentIndexingStatus.totalCommitsToIndex = 0;
    currentIndexingStatus.commitsIndexed = 0;
    currentIndexingStatus.lastUpdatedAt = new Date().toISOString();
    const historyOptions = {};
    if (settings.COMMIT_HISTORY_MAX_COUNT_FOR_INDEXING > 0) {
        historyOptions.count = settings.COMMIT_HISTORY_MAX_COUNT_FOR_INDEXING;
    }
    const commits = await getCommitHistoryWithChanges(repoPath, historyOptions);
    if (!commits || commits.length === 0) {
        logger.info(`No commit history found or processed for ${repoPath}. Skipping commit/diff indexing.`);
        return;
    }
    currentIndexingStatus.totalCommitsToIndex = commits.length;
    currentIndexingStatus.message = `Found ${commits.length} commits to process for diffs and history.`;
    currentIndexingStatus.lastUpdatedAt = new Date().toISOString();
    const pointsToUpsert = [];
    // Sends everything buffered so far. splice() hands over a detached array and empties the buffer.
    const flushPoints = async (batchLabel) => {
        logger.info(`Upserting ${batchLabel} of ${pointsToUpsert.length} commit/diff points...`);
        const batch = pointsToUpsert.splice(0, pointsToUpsert.length);
        await (0, qdrant_1.batchUpsertVectors)(qdrantClient, settings.COLLECTION_NAME, batch, settings.QDRANT_BATCH_UPSERT_SIZE);
    };
    for (const commit of commits) {
        // 1. Index commit info
        currentIndexingStatus.currentCommit = commit.oid;
        currentIndexingStatus.lastUpdatedAt = new Date().toISOString();
        const changedFilesSummary = commit.changedFiles.map((cf) => `${cf.type.charAt(0).toUpperCase()}: ${cf.path}`);
        // author.timestamp is multiplied by 1000 before being handed to Date
        const commitDate = new Date(commit.author.timestamp * 1000).toISOString();
        const commitTextToEmbed = (0, text_utils_1.preprocessText)(`Commit: ${commit.oid}\nAuthor: ${commit.author.name} <${commit.author.email}>\nDate: ${commitDate}\nMessage: ${commit.message}\nParents: ${commit.parents.join(', ')}\nChanges: ${changedFilesSummary.join('; ')}`);
        try {
            if (!commit.oid || typeof commit.oid !== 'string') {
                logger.error(`Skipping commit due to invalid OID for ID generation. Commit: ${JSON.stringify(commit)}`);
                continue;
            }
            const commitPayload = {
                dataType: 'commit_info',
                commit_oid: commit.oid,
                commit_message: commit.message,
                commit_author_name: commit.author.name,
                commit_author_email: commit.author.email,
                commit_date: commitDate,
                changed_files_summary: changedFilesSummary,
                parent_oids: commit.parents,
                repositoryPath: repoPath,
            };
            pointsToUpsert.push({
                id: (0, uuid_1.v4)(),
                vector: await llmProvider.generateEmbedding(commitTextToEmbed),
                payload: commitPayload,
            });
        }
        catch (embedError) {
            logger.error(`Failed to process or generate embedding for commit ${commit.oid}`, { error: embedError instanceof Error ? embedError.message : String(embedError) });
            continue; // Skip this commit, including its diffs
        }
        // 2. Index diffs for each changed file in the commit
        for (const changedFile of commit.changedFiles) {
            const hasIndexableDiff = changedFile.diffText && (changedFile.type === 'add' || changedFile.type === 'modify' || changedFile.type === 'delete');
            if (!hasIndexableDiff) {
                continue;
            }
            const processedDiffText = (0, text_utils_1.preprocessText)(changedFile.diffText);
            const diffChunks = (0, text_utils_1.chunkText)(processedDiffText, settings.DIFF_CHUNK_SIZE_CHARS, settings.DIFF_CHUNK_OVERLAP_CHARS);
            for (let i = 0; i < diffChunks.length; i++) {
                const diffChunk = diffChunks[i];
                if (!diffChunk.trim()) {
                    continue;
                }
                // The embedded text carries commit and path context alongside the diff chunk
                const diffContextualText = (0, text_utils_1.preprocessText)(`Diff for ${changedFile.path} in commit ${commit.oid} (type: ${changedFile.type}):\n${diffChunk}`);
                try {
                    if (!commit.oid || typeof commit.oid !== 'string' || !changedFile.path || typeof changedFile.path !== 'string') {
                        logger.error(`Skipping diff chunk due to invalid commit OID or changed file path for ID generation. Commit OID: ${commit.oid}, FilePath: ${changedFile.path}`);
                        continue;
                    }
                    const diffPayload = {
                        dataType: 'diff_chunk',
                        commit_oid: commit.oid,
                        filepath: changedFile.path,
                        diff_content_chunk: diffChunk,
                        chunk_index: i,
                        total_chunks: diffChunks.length,
                        change_type: changedFile.type,
                        repositoryPath: repoPath,
                    };
                    pointsToUpsert.push({
                        id: (0, uuid_1.v4)(),
                        vector: await llmProvider.generateEmbedding(diffContextualText),
                        payload: diffPayload,
                    });
                }
                catch (embedError) {
                    // Only this chunk is lost; continue with the next chunk/file
                    logger.error(`Failed to process or generate embedding for diff chunk of ${changedFile.path} in commit ${commit.oid}`, { error: embedError instanceof Error ? embedError.message : String(embedError) });
                }
            }
        }
        // Batch upsert periodically
        if (pointsToUpsert.length >= settings.QDRANT_BATCH_UPSERT_SIZE) {
            await flushPoints('batch');
        }
        // Per-commit progress. This has to live inside the loop, otherwise the counter would
        // only ever advance once for the whole run.
        if (currentIndexingStatus.commitsIndexed !== undefined && currentIndexingStatus.totalCommitsToIndex && currentIndexingStatus.totalCommitsToIndex > 0) {
            currentIndexingStatus.commitsIndexed++;
            const commitProgressContribution = 25; // Commits span 70% to 95% of overall progress
            currentIndexingStatus.overallProgress = 70 + Math.round((currentIndexingStatus.commitsIndexed / currentIndexingStatus.totalCommitsToIndex) * commitProgressContribution);
            currentIndexingStatus.lastUpdatedAt = new Date().toISOString();
        }
    }
    // Upsert any remaining points
    if (pointsToUpsert.length > 0) {
        await flushPoints('final batch');
    }
    currentIndexingStatus.message = `Commit and diff indexing phase complete. Finalizing...`;
    currentIndexingStatus.currentCommit = undefined;
    currentIndexingStatus.overallProgress = Math.min(99, currentIndexingStatus.overallProgress || 95); // Cap at 99 before final completion
    currentIndexingStatus.lastUpdatedAt = new Date().toISOString();
    logger.info(`Finished indexing ${commits.length} commits and their diffs for ${repoPath}`);
}