@alvinveroy/codecompass

import * as git from "isomorphic-git"; // Use namespace import
import fs from "fs/promises";
import path from "path";
import { exec } from "child_process"; // Import exec
import { promisify } from "util"; // To promisify exec
import { QdrantClient } from "@qdrant/js-client-rest";
import { LLMProvider } from './llm-provider'; // Assuming path, adjust if necessary
import {
  QdrantPoint,
  FileChunkPayload,
  CommitInfoPayload,
  DiffChunkPayload,
} from './types';
import { preprocessText, chunkText } from '../utils/text-utils';
import * as Diff from 'diff';
// import { Buffer } from 'buffer'; // Buffer is global in Node.js
import { configService, logger } from "./config-service";
// import { generateEmbedding } from "./ollama"; // We will use llmProvider.generateEmbedding() instead.
import { v4 as uuidv4 } from 'uuid'; // Import uuidv4
import nodeFs from 'fs'; // Standard fs for isomorphic-git functions requiring it
import { batchUpsertVectors } from './qdrant';

/**
 * Progress snapshot of the most recent `indexRepository` run.
 * `overallProgress` is a percentage (0-100); `lastUpdatedAt` is an ISO-8601 timestamp.
 */
export interface IndexingStatusReport {
  status:
    | 'idle'
    | 'initializing'
    | 'validating_repo'
    | 'listing_files'
    | 'cleaning_stale_entries'
    | 'indexing_file_content'
    | 'indexing_commits_diffs'
    | 'completed'
    | 'error';
  message: string;
  totalFilesToIndex?: number;
  filesIndexed?: number;
  totalCommitsToIndex?: number;
  commitsIndexed?: number;
  currentFile?: string;
  currentCommit?: string;
  errorDetails?: string;
  overallProgress?: number;
  lastUpdatedAt: string;
}

// Module-level status shared by every indexing run in this process.
let currentIndexingStatus: IndexingStatusReport = {
  status: 'idle',
  message: 'Indexing not started.',
  overallProgress: 0,
  lastUpdatedAt: new Date().toISOString(),
};

/** A single file-level change introduced by a commit (relative to its first parent). */
export interface CommitChange {
  path: string;
  type: 'equal' | 'modify' | 'add' | 'delete' | 'typechange';
  oldOid?: string | null; // OID of the blob before the change
  newOid?: string | null; // OID of the blob after the change
  diffText?: string; // Textual diff for 'modify', 'add', 'delete'
}

/** Commit metadata together with the list of files it changed. */
export interface CommitDetail {
  oid: string;
  message: string;
  author: { name: string; email: string; timestamp: number; timezoneOffset: number };
  committer: { name: string; email: string; timestamp: number; timezoneOffset: number };
  parents: string[]; // Parent OIDs (empty for an initial commit)
  changedFiles: CommitChange[];
}

// File extensions that are considered indexable source/content files.
const CODE_FILE_EXTENSIONS: ReadonlySet<string> = new Set([
  '.ts', '.js', '.tsx', '.jsx', '.json', '.md', '.html', '.css', '.scss',
  '.py', '.java', '.c', '.cpp', '.go', '.rs', '.php', '.rb',
]);

// Number of points fetched per Qdrant scroll request during stale-entry cleanup.
const STALE_SCROLL_LIMIT = 250;

/** Extracts a printable message from an arbitrary thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Merges `patch` into the shared status object and refreshes its timestamp. */
function updateIndexingStatus(patch: Partial<IndexingStatusReport>): void {
  Object.assign(currentIndexingStatus, patch, { lastUpdatedAt: new Date().toISOString() });
}

/** Upserts `points` into the configured collection using the configured batch size. */
async function upsertPoints(qdrantClient: QdrantClient, points: QdrantPoint[]): Promise<void> {
  // batchUpsertVectors takes a generic record payload; the typed payloads are structurally compatible.
  const simplePoints = points.map(p => ({ ...p, payload: p.payload as unknown as Record<string, unknown> }));
  await batchUpsertVectors(
    qdrantClient,
    configService.COLLECTION_NAME,
    simplePoints,
    configService.QDRANT_BATCH_UPSERT_SIZE
  );
}

/**
 * Returns a copy of the current indexing status.
 * `lastUpdatedAt` in the returned copy is set to the time of the call.
 */
export function getGlobalIndexingStatus(): IndexingStatusReport {
  return { ...currentIndexingStatus, lastUpdatedAt: new Date().toISOString() };
}

/**
 * Checks that `repoPath` contains a `.git` directory whose `HEAD` can be resolved.
 *
 * @param repoPath - Working-tree root of the repository.
 * @returns `true` when the repository is usable; `false` (after logging a warning) otherwise.
 */
export async function validateGitRepository(repoPath: string): Promise<boolean> {
  try {
    const gitdir = path.join(repoPath, ".git");
    await fs.access(gitdir);
    await git.resolveRef({ fs: nodeFs, dir: repoPath, gitdir, ref: "HEAD" });
    // logger.info(`Valid Git repository at: ${repoPath}`);
    return true;
  } catch (error: unknown) {
    logger.warn(`Git repository validation failed for ${repoPath}`, { error: describeError(error) });
    return false;
  }
}

/**
 * Deletes `file_chunk` points whose `filepath` is no longer part of the repository.
 * Points of other data types (commit_info, diff_chunk) are left untouched.
 * Errors from Qdrant propagate to the caller.
 */
async function removeStaleFileChunks(
  qdrantClient: QdrantClient,
  currentFilePathsInRepo: ReadonlySet<string>
): Promise<void> {
  const pointsToDelete: string[] = [];
  let nextOffset: string | number | undefined = undefined;

  logger.debug(`Starting scroll operation to fetch all indexed filepaths from collection: ${configService.COLLECTION_NAME}`);
  do {
    const scrollResult = await qdrantClient.scroll(configService.COLLECTION_NAME, {
      with_payload: true, // Fetch the whole payload to check dataType
      with_vector: false,
      limit: STALE_SCROLL_LIMIT,
      offset: nextOffset,
    });

    if (scrollResult.points.length > 0) {
      logger.debug(`Scrolled ${scrollResult.points.length} points from Qdrant.`);
    }

    for (const point of scrollResult.points) {
      const pointId = point.id;
      const payload = point.payload as Partial<FileChunkPayload | CommitInfoPayload | DiffChunkPayload>;
      if (payload && payload.dataType === 'file_chunk') {
        const fileChunkPayload = payload as FileChunkPayload;
        if (!fileChunkPayload.filepath) {
          logger.warn(`Found file_chunk point in Qdrant (ID: ${pointId}) without a 'filepath' in its payload. Skipping stale check for this point.`);
        } else if (!currentFilePathsInRepo.has(fileChunkPayload.filepath)) {
          pointsToDelete.push(String(pointId));
          logger.debug(`Marking stale file_chunk entry for deletion: ${fileChunkPayload.filepath} (ID: ${pointId})`);
        }
      } else {
        // Only file_chunk points are checked by filepath here; other data types are managed elsewhere.
        logger.debug(`Point ID ${pointId} is not a 'file_chunk' or lacks expected payload structure. Skipping filepath-based stale check.`);
      }
    }

    // Only string/number cursors are followed; null, undefined or object cursors end pagination.
    const rawNextOffset = scrollResult.next_page_offset;
    nextOffset =
      typeof rawNextOffset === 'string' || typeof rawNextOffset === 'number' ? rawNextOffset : undefined;
    // Compare against undefined explicitly so a numeric offset of 0 does not end pagination early.
  } while (nextOffset !== undefined);

  if (pointsToDelete.length === 0) {
    logger.info("No stale entries found in Qdrant index.");
    return;
  }
  logger.info(`Found ${pointsToDelete.length} stale entries to remove from Qdrant.`);
  await qdrantClient.delete(configService.COLLECTION_NAME, { points: pointsToDelete });
  logger.info(`Successfully removed ${pointsToDelete.length} stale entries from Qdrant.`);
}

/**
 * Reads, chunks, embeds and upserts a single repository file.
 *
 * @returns `true` when at least one chunk was upserted, `false` when the file was skipped.
 * Read, embedding and upsert errors propagate to the caller.
 */
async function indexFileChunks(
  qdrantClient: QdrantClient,
  repoPath: string,
  filepath: string,
  llmProvider: LLMProvider
): Promise<boolean> {
  const fullPath = path.join(repoPath, filepath);
  const content = await fs.readFile(fullPath, "utf8");
  const last_modified = (await fs.stat(fullPath)).mtime.toISOString();
  if (!content.trim()) {
    logger.info(`Skipping ${filepath}: empty file`);
    return false;
  }
  currentIndexingStatus.currentFile = filepath;

  // Preprocess before chunking so chunk boundaries are computed on the normalized text.
  const processedContent = preprocessText(content);
  const contentChunks = chunkText(
    processedContent,
    configService.FILE_INDEXING_CHUNK_SIZE_CHARS,
    configService.FILE_INDEXING_CHUNK_OVERLAP_CHARS
  );
  if (contentChunks.length === 0) {
    logger.warn(`File ${filepath} was processed but produced 0 chunks (original content length: ${content.length}).`);
    return false;
  }

  logger.info(`Indexing ${filepath} in ${contentChunks.length} chunks.`);
  const pointsToUpsert: QdrantPoint[] = [];
  for (let i = 0; i < contentChunks.length; i++) {
    const chunkContent = contentChunks[i];
    if (!chunkContent.trim()) {
      logger.debug(`Skipping empty chunk ${i + 1}/${contentChunks.length} for ${filepath}`);
      continue;
    }
    const embedding = await llmProvider.generateEmbedding(chunkContent);
    const payload: FileChunkPayload = {
      dataType: 'file_chunk',
      filepath,
      file_content_chunk: chunkContent,
      last_modified,
      chunk_index: i,
      total_chunks: contentChunks.length,
      repositoryPath: repoPath, // Optional: useful for multi-repo scenarios
    };
    pointsToUpsert.push({ id: uuidv4(), vector: embedding, payload });
  }

  if (pointsToUpsert.length === 0) {
    logger.warn(`File ${filepath} produced 0 valid chunks after processing.`);
    return false;
  }
  await upsertPoints(qdrantClient, pointsToUpsert);
  logger.info(`Successfully indexed ${pointsToUpsert.length} chunks for ${filepath}`);
  return true;
}

/**
 * Indexes a git repository into Qdrant: removes stale file chunks, embeds the content of
 * every code file tracked at HEAD, then embeds commit metadata and diffs.
 *
 * Progress is published through the module-level status (see `getGlobalIndexingStatus`).
 * Per-file failures are logged and counted; they do not abort the run.
 *
 * @param qdrantClient - Client used for scroll/delete/upsert operations.
 * @param repoPath - Working-tree root of the repository to index.
 * @param llmProvider - Provider used to generate embeddings.
 */
export async function indexRepository(qdrantClient: QdrantClient, repoPath: string, llmProvider: LLMProvider): Promise<void> {
  currentIndexingStatus = {
    status: 'initializing',
    message: `Starting repository indexing for: ${repoPath}`,
    overallProgress: 0,
    lastUpdatedAt: new Date().toISOString(),
  };
  logger.info(currentIndexingStatus.message);

  const isGitRepo = await validateGitRepository(repoPath);
  if (!isGitRepo) {
    logger.warn(`Skipping repository indexing: ${repoPath} is not a valid Git repository`);
    currentIndexingStatus = {
      status: 'error',
      message: `Repository path ${repoPath} is not a valid Git repository.`,
      errorDetails: `Validation failed for ${repoPath}.`,
      overallProgress: 0,
      lastUpdatedAt: new Date().toISOString(),
    };
    return;
  }

  updateIndexingStatus({
    status: 'validating_repo',
    message: 'Repository validated. Listing files...',
    overallProgress: 5,
  });

  const files = await git.listFiles({ fs: nodeFs, dir: repoPath, gitdir: path.join(repoPath, ".git"), ref: "HEAD" });
  logger.info(`Found ${files.length} files in repository`);

  if (!files.length) {
    logger.warn("No files to index in repository.");
    // Publish a terminal state; otherwise pollers would see 'validating_repo' forever.
    currentIndexingStatus = {
      status: 'completed',
      message: 'No files to index in repository.',
      totalFilesToIndex: 0,
      filesIndexed: 0,
      overallProgress: 100,
      lastUpdatedAt: new Date().toISOString(),
    };
    return;
  }

  updateIndexingStatus({
    status: 'listing_files',
    message: `Found ${files.length} total files. Filtering for code files...`,
    overallProgress: 7,
  });

  const filteredFiles = files.filter(file => {
    const ext = path.extname(file).toLowerCase();
    return CODE_FILE_EXTENSIONS.has(ext) && !file.includes('node_modules/') && !file.includes('dist/');
  });
  logger.info(`Filtered to ${filteredFiles.length} code files for indexing`);

  updateIndexingStatus({
    message: `Found ${filteredFiles.length} code files to process.`,
    totalFilesToIndex: filteredFiles.length,
    filesIndexed: 0,
    overallProgress: 10,
  });

  if (filteredFiles.length === 0) {
    logger.warn("No code files found to index after filtering.");
  }

  // Clean up stale entries from Qdrant. A failure here is logged but does not stop indexing.
  try {
    updateIndexingStatus({
      status: 'cleaning_stale_entries',
      message: 'Checking for and removing stale entries from Qdrant index...',
      overallProgress: 15,
    });
    logger.info("Checking for stale entries in Qdrant index...");
    await removeStaleFileChunks(qdrantClient, new Set(filteredFiles));
  } catch (error) {
    logger.error("Error during stale entry cleanup in Qdrant. Indexing of current files will proceed.", {
      message: describeError(error),
      stack: error instanceof Error ? error.stack : undefined,
    });
    updateIndexingStatus({
      status: 'error',
      message: 'Error during stale entry cleanup in Qdrant.',
      errorDetails: describeError(error),
    });
    // Continue with indexing current files despite stale cleanup error
  }

  let successCount = 0;
  let errorCount = 0;

  if (filteredFiles.length > 0) {
    updateIndexingStatus({
      message: 'Stale entry cleanup complete. Starting file content indexing.',
      status: 'indexing_file_content',
      overallProgress: 20,
    });
  }

  for (const filepath of filteredFiles) {
    logger.info(`[DEBUG] indexRepository: Processing file: ${filepath}`);
    try {
      const indexed = await indexFileChunks(qdrantClient, repoPath, filepath, llmProvider);
      if (!indexed) {
        continue;
      }
      if (
        currentIndexingStatus.filesIndexed !== undefined &&
        currentIndexingStatus.totalFilesToIndex &&
        currentIndexingStatus.totalFilesToIndex > 0
      ) {
        currentIndexingStatus.filesIndexed++;
        const fileProgressContribution = 50; // File indexing spans 20% to 70% of total work
        updateIndexingStatus({
          overallProgress:
            20 +
            Math.round(
              (currentIndexingStatus.filesIndexed / currentIndexingStatus.totalFilesToIndex) * fileProgressContribution
            ),
        });
      }
      successCount++;
    } catch (error: unknown) {
      logger.error(`[DEBUG] indexRepository: Error processing file ${filepath}`, {});
      logger.error(`Failed to index ${filepath}`, { message: describeError(error) });
      errorCount++;
    }
  }

  updateIndexingStatus({
    status: 'indexing_commits_diffs',
    message: 'File content indexing complete. Starting commit and diff indexing.',
    currentFile: undefined,
    overallProgress: 70, // Files done, moving to commits
  });

  try {
    logger.info(`Starting indexing of commit history and diffs for ${repoPath}`);
    await indexCommitsAndDiffs(qdrantClient, repoPath, llmProvider);
  } catch (commitIndexError) {
    updateIndexingStatus({
      status: 'error',
      message: 'Failed to index commit history and diffs.',
      errorDetails: describeError(commitIndexError),
    });
    logger.error(`Failed to index commit history and diffs for ${repoPath}`, {
      message: describeError(commitIndexError),
      stack: commitIndexError instanceof Error ? commitIndexError.stack : undefined,
    });
  }

  if (currentIndexingStatus.status !== 'error') {
    updateIndexingStatus({
      status: 'completed',
      message: `Repository indexing complete. ${successCount} files indexed. ${errorCount} errors during file indexing.`,
      overallProgress: 100,
      currentCommit: undefined,
    });
    logger.info(currentIndexingStatus.message);
  } else {
    logger.error(`Indexing finished with an error state: ${currentIndexingStatus.message} - ${currentIndexingStatus.errorDetails}`);
  }
  logger.info(`[DEBUG] indexRepository: Finished for repoPath: ${repoPath}`);
}

// Get Repository Diff
const execAsync = promisify(exec); // Promisify exec for async/await usage
const MAX_DIFF_LENGTH = 10000; // Max characters for diff output

/**
 * Returns the textual `git diff` between the two most recent commits of the repository.
 *
 * This function never rejects: every failure is reported through the returned string.
 *
 * @param repoPath - Working-tree root of the repository.
 * @param validatorFunc - Optional replacement for `validateGitRepository` (used by tests).
 * @returns The diff (truncated to `MAX_DIFF_LENGTH` characters), or a human-readable
 *          explanation when no diff is available or the command failed.
 */
export async function getRepositoryDiff(
  repoPath: string,
  validatorFunc?: (p: string) => Promise<boolean>
): Promise<string> {
  const isGitRepo = validatorFunc
    ? await validatorFunc(repoPath)
    : await validateGitRepository(repoPath);

  if (!isGitRepo) {
    logger.warn(`Cannot get repository diff: ${repoPath} is not a valid Git repository`);
    return "No Git repository found";
  }

  try {
    const commits = await git.log({ fs: nodeFs, dir: repoPath, depth: 2, gitdir: path.join(repoPath, ".git") });
    if (commits.length < 2) {
      logger.info(`Not enough commits in ${repoPath} to generate a diff (found ${commits.length}).`);
      return "No previous commits to compare";
    }
    const [latest, previous] = commits;

    // Both OIDs come from git.log (hex object ids), so interpolating them into the command is safe.
    const command = `git diff ${previous.oid} ${latest.oid}`;
    logger.info(`Executing diff command: ${command} in ${repoPath}`);
    const { stdout, stderr } = await execAsync(command, { cwd: repoPath, maxBuffer: 1024 * 1024 * 5 }); // 5MB buffer

    if (stderr) {
      // stderr without a thrown error is treated as a warning; fatal errors reject instead.
      logger.warn(`Git diff command produced stderr: ${stderr}`);
    }

    let diffOutput = stdout.trim();
    if (!diffOutput) {
      return "No textual changes found between last two commits.";
    }
    if (diffOutput.length > MAX_DIFF_LENGTH) {
      logger.info(`Diff output is too long (${diffOutput.length} chars), truncating to ${MAX_DIFF_LENGTH} chars.`);
      diffOutput = diffOutput.substring(0, MAX_DIFF_LENGTH) + "\n... (diff truncated)";
    }
    return diffOutput;
  } catch (error: unknown) {
    const err = error instanceof Error ? error : new Error(String(error));
    const errorDetails: { message: string; stack?: string; stderr?: string; code?: number | string } = {
      message: err.message,
      stack: err.stack,
    };
    // exec rejections carry `stderr` and `code`; copy them into the log context when present.
    if (typeof error === 'object' && error !== null) {
      if ('stderr' in error && typeof (error as { stderr?: unknown }).stderr === 'string') {
        errorDetails.stderr = (error as { stderr: string }).stderr;
      }
      if (
        'code' in error &&
        (typeof (error as { code?: unknown }).code === 'number' || typeof (error as { code?: unknown }).code === 'string')
      ) {
        errorDetails.code = (error as { code: number | string }).code;
      }
    }
    logger.error(`Error retrieving git diff for ${repoPath}: ${err.message}`, errorDetails);
    const errorMessage = err && typeof err.message === 'string' ? err.message : String(err);
    return `Failed to retrieve diff for ${repoPath}: ${errorMessage}`;
  }
}

/** Reads the blob `oid` and decodes it as UTF-8 text. */
async function readBlobAsText(repoPath: string, gitdir: string, oid: string): Promise<string> {
  const { blob } = await git.readBlob({ fs: nodeFs, dir: repoPath, gitdir, oid });
  return Buffer.from(blob).toString('utf8');
}

/**
 * Lists the file changes between two trees by walking them side by side.
 * `commitOid` is only used for log messages.
 */
async function collectChangesBetweenTrees(
  repoPath: string,
  gitdir: string,
  parentTreeOid: string,
  treeOid: string,
  commitOid: string
): Promise<CommitChange[]> {
  const changedFiles: CommitChange[] = [];
  await git.walk({
    fs: nodeFs,
    dir: repoPath,
    gitdir,
    // A tree OID passed as `ref` is resolved directly to that tree.
    trees: [git.TREE({ ref: parentTreeOid }), git.TREE({ ref: treeOid })],
    map: async function (filepath, entries) {
      // In isomorphic-git, returning null from `map` prunes the entry's children.
      // Return undefined instead so the walk keeps descending.
      if (filepath === '.') return undefined;

      const [entry1, entry2] = entries; // entry from parent tree, entry from current tree
      const type1 = entry1 ? await entry1.type() : null;
      const oid1 = entry1 ? await entry1.oid() : null;
      const type2 = entry2 ? await entry2.type() : null;
      const oid2 = entry2 ? await entry2.oid() : null;

      if (type1 === 'tree' && type2 === 'tree' && oid1 === oid2) {
        return null; // Identical subtrees contain no changes: skip them entirely.
      }
      if (type1 !== 'blob' && type2 !== 'blob') {
        return undefined; // Directory (or other non-file) entry: nothing to record, keep walking.
      }

      let changeEntry: CommitChange | null = null;
      if (!entry1 && entry2) {
        changeEntry = { path: filepath, type: 'add', oldOid: null, newOid: oid2 };
      } else if (entry1 && !entry2) {
        changeEntry = { path: filepath, type: 'delete', oldOid: oid1, newOid: null };
      } else if (entry1 && entry2) {
        if (oid1 !== oid2) {
          changeEntry = { path: filepath, type: 'modify', oldOid: oid1, newOid: oid2 };
        } else if (type1 !== type2) {
          changeEntry = { path: filepath, type: 'typechange', oldOid: oid1, newOid: oid2 };
        }
        // Mode-only changes are not captured if OIDs are identical and types are same.
      }
      if (!changeEntry) return undefined;

      if (changeEntry.type === 'add' || changeEntry.type === 'modify' || changeEntry.type === 'delete') {
        try {
          const contentA = changeEntry.oldOid ? await readBlobAsText(repoPath, gitdir, changeEntry.oldOid) : '';
          const contentB = changeEntry.newOid ? await readBlobAsText(repoPath, gitdir, changeEntry.newOid) : '';
          changeEntry.diffText = Diff.createPatch(filepath, contentA, contentB, '', '', {
            context: configService.DIFF_LINES_OF_CONTEXT,
          });
        } catch (diffError) {
          // Keep the change entry without diffText if diff generation fails.
          logger.warn(`Could not generate diff for ${filepath} in commit ${commitOid}`, { error: describeError(diffError) });
        }
      }
      changedFiles.push(changeEntry);
      return undefined;
    },
  });
  return changedFiles;
}

/** Lists every file of a parentless commit as an 'add' change (diffed against empty content). */
async function collectInitialCommitChanges(
  repoPath: string,
  gitdir: string,
  commitOid: string
): Promise<CommitChange[]> {
  const changedFiles: CommitChange[] = [];
  await git.walk({
    fs: nodeFs,
    dir: repoPath,
    gitdir,
    // Walk the tree of THIS commit. A bare TREE() would walk HEAD, which is only
    // the same tree when the repository has a single commit.
    trees: [git.TREE({ ref: commitOid })],
    map: async function (filepath, [entry]) {
      if (filepath === '.') return undefined;
      // Returning undefined (not null) for directories keeps the walk descending into them.
      if (!entry || (await entry.type()) !== 'blob') return undefined;

      const oid = await entry.oid();
      const changeEntry: CommitChange = { path: filepath, type: 'add', oldOid: null, newOid: oid };
      try {
        const contentB = await readBlobAsText(repoPath, gitdir, oid);
        changeEntry.diffText = Diff.createPatch(filepath, '', contentB, '', '', {
          context: configService.DIFF_LINES_OF_CONTEXT,
        });
      } catch (diffError) {
        logger.warn(`Could not generate diff for added file ${filepath} in initial commit ${commitOid}`, { error: describeError(diffError) });
      }
      changedFiles.push(changeEntry);
      return undefined;
    },
  });
  return changedFiles;
}

/**
 * Retrieves commit history together with the per-file changes of each commit.
 * Commits with parents are compared against their first parent; a parentless commit
 * reports every file in its tree as added.
 *
 * @param repoPath - Working-tree root of the repository.
 * @param options - Optional limits: `count` (max commits), `since` (oldest date), `ref` (start ref).
 * @returns Commits in the order returned by `git.log`.
 * @throws Error when the history or any commit object cannot be read.
 */
export async function getCommitHistoryWithChanges(
  repoPath: string,
  options?: { since?: Date; count?: number; ref?: string }
): Promise<CommitDetail[]> {
  const gitdir = path.join(repoPath, ".git");
  const detailedCommits: CommitDetail[] = [];

  try {
    const logOptions: {
      fs: typeof nodeFs;
      dir: string;
      gitdir: string;
      depth?: number;
      since?: Date;
      ref?: string;
    } = {
      fs: nodeFs,
      dir: repoPath,
      gitdir,
    };
    if (options?.count) {
      logOptions.depth = options.count;
    }
    if (options?.since) {
      logOptions.since = options.since;
    }
    if (options?.ref) {
      logOptions.ref = options.ref;
    }

    const commits = await git.log(logOptions);

    for (const commitEntry of commits) {
      // Read the full commit to get tree and parent info reliably.
      const commitData = await git.readCommit({ fs: nodeFs, dir: repoPath, gitdir, oid: commitEntry.oid });
      const parentOids = commitData.commit.parent || []; // Ensure parents is always an array

      let changedFiles: CommitChange[];
      if (parentOids.length > 0) {
        // Not an initial commit: compare with the first parent.
        const parentCommitData = await git.readCommit({ fs: nodeFs, dir: repoPath, gitdir, oid: parentOids[0] });
        changedFiles = await collectChangesBetweenTrees(
          repoPath,
          gitdir,
          parentCommitData.commit.tree,
          commitData.commit.tree,
          commitEntry.oid
        );
      } else {
        changedFiles = await collectInitialCommitChanges(repoPath, gitdir, commitEntry.oid);
      }

      detailedCommits.push({
        oid: commitEntry.oid,
        message: commitEntry.commit.message,
        author: commitEntry.commit.author,
        committer: commitEntry.commit.committer,
        changedFiles,
        parents: parentOids,
      });
    }
    logger.info(`Retrieved ${detailedCommits.length} commits with changes for ${repoPath}`);
    return detailedCommits;
  } catch (error: unknown) {
    const err = error instanceof Error ? error : new Error(String(error));
    logger.error(`Failed to get commit history with changes for ${repoPath}: ${err.message}`, { stack: err.stack });
    throw err;
  }
}

/**
 * Builds the Qdrant points for one commit: a `commit_info` point plus one `diff_chunk`
 * point per non-empty diff chunk of each changed file.
 *
 * Returns an empty array when the commit itself cannot be embedded; a failing diff chunk
 * is logged and skipped without affecting the other chunks.
 */
async function buildCommitPoints(
  commit: CommitDetail,
  repoPath: string,
  llmProvider: LLMProvider
): Promise<QdrantPoint[]> {
  const points: QdrantPoint[] = [];

  const changedFilesSummary = commit.changedFiles.map(
    (cf) => `${cf.type.charAt(0).toUpperCase()}: ${cf.path}`
  );
  // Git stores author timestamps in seconds; Date expects milliseconds.
  const commitDate = new Date(commit.author.timestamp * 1000).toISOString();
  const commitTextToEmbed = preprocessText(
    `Commit: ${commit.oid}\nAuthor: ${commit.author.name} <${commit.author.email}>\nDate: ${commitDate}\nMessage: ${commit.message}\nParents: ${commit.parents.join(', ')}\nChanges: ${changedFilesSummary.join('; ')}`
  );

  // 1. Commit info
  try {
    if (!commit.oid || typeof commit.oid !== 'string') {
      logger.error(`Skipping commit due to invalid OID for ID generation. Commit: ${JSON.stringify(commit)}`);
      return points;
    }
    const commitPayload: CommitInfoPayload = {
      dataType: 'commit_info',
      commit_oid: commit.oid,
      commit_message: commit.message,
      commit_author_name: commit.author.name,
      commit_author_email: commit.author.email,
      commit_date: commitDate,
      changed_files_summary: changedFilesSummary,
      parent_oids: commit.parents,
      repositoryPath: repoPath, // Optional
    };
    const commitVector = await llmProvider.generateEmbedding(commitTextToEmbed);
    points.push({ id: uuidv4(), vector: commitVector, payload: commitPayload });
  } catch (embedError) {
    logger.error(`Failed to process or generate embedding for commit ${commit.oid}`, { error: describeError(embedError) });
    return points; // Skip this commit (including its diffs) if embedding fails
  }

  // 2. Diffs for each changed file in the commit
  for (const changedFile of commit.changedFiles) {
    const hasIndexableDiff =
      changedFile.diffText &&
      (changedFile.type === 'add' || changedFile.type === 'modify' || changedFile.type === 'delete');
    if (!hasIndexableDiff || !changedFile.diffText) continue;

    const diffChunks = chunkText(
      preprocessText(changedFile.diffText),
      configService.DIFF_CHUNK_SIZE_CHARS,
      configService.DIFF_CHUNK_OVERLAP_CHARS
    );

    for (let i = 0; i < diffChunks.length; i++) {
      const diffChunk = diffChunks[i];
      if (!diffChunk.trim()) continue;

      // Commit context is embedded together with the diff for better searchability.
      const diffContextualText = preprocessText(`Diff for ${changedFile.path} in commit ${commit.oid} (type: ${changedFile.type}):\n${diffChunk}`);
      try {
        if (!commit.oid || typeof commit.oid !== 'string' || !changedFile.path || typeof changedFile.path !== 'string') {
          logger.error(`Skipping diff chunk due to invalid commit OID or changed file path for ID generation. Commit OID: ${commit.oid}, FilePath: ${changedFile.path}`);
          continue;
        }
        const diffPayload: DiffChunkPayload = {
          dataType: 'diff_chunk',
          commit_oid: commit.oid,
          filepath: changedFile.path,
          diff_content_chunk: diffChunk,
          chunk_index: i,
          total_chunks: diffChunks.length,
          change_type: changedFile.type as 'modify' | 'add' | 'delete' | 'typechange', // Ensure type compatibility
          repositoryPath: repoPath, // Optional
        };
        const diffVector = await llmProvider.generateEmbedding(diffContextualText);
        points.push({ id: uuidv4(), vector: diffVector, payload: diffPayload });
      } catch (embedError) {
        logger.error(`Failed to process or generate embedding for diff chunk of ${changedFile.path} in commit ${commit.oid}`, { error: describeError(embedError) });
        // Continue to next chunk/file
      }
    }
  }
  return points;
}

/**
 * Embeds commit metadata and diffs for the repository history and upserts them into Qdrant,
 * flushing whenever the pending batch reaches `QDRANT_BATCH_UPSERT_SIZE`.
 *
 * Progress moves from 70% to 95% as commits are processed (skipped commits count as processed).
 * Errors from history retrieval or from Qdrant upserts propagate to the caller.
 */
async function indexCommitsAndDiffs(
  qdrantClient: QdrantClient,
  repoPath: string,
  llmProvider: LLMProvider,
): Promise<void> {
  logger.info(`Indexing commit history and diffs for repository: ${repoPath}`);
  updateIndexingStatus({
    message: 'Fetching commit history...',
    totalCommitsToIndex: 0,
    commitsIndexed: 0,
  });

  const historyOptions: { count?: number } = {};
  if (configService.COMMIT_HISTORY_MAX_COUNT_FOR_INDEXING > 0) {
    historyOptions.count = configService.COMMIT_HISTORY_MAX_COUNT_FOR_INDEXING;
  }

  const commits = await getCommitHistoryWithChanges(repoPath, historyOptions);
  if (!commits || commits.length === 0) {
    logger.info(`No commit history found or processed for ${repoPath}. Skipping commit/diff indexing.`);
    return;
  }

  updateIndexingStatus({
    totalCommitsToIndex: commits.length,
    message: `Found ${commits.length} commits to process for diffs and history.`,
  });

  const commitProgressContribution = 25; // Commits span 70% to 95% of total work
  const pointsToUpsert: QdrantPoint[] = [];

  for (const commit of commits) {
    updateIndexingStatus({ currentCommit: commit.oid });

    pointsToUpsert.push(...(await buildCommitPoints(commit, repoPath, llmProvider)));

    // Batch upsert periodically
    if (pointsToUpsert.length >= configService.QDRANT_BATCH_UPSERT_SIZE) {
      logger.info(`Upserting batch of ${pointsToUpsert.length} commit/diff points...`);
      await upsertPoints(qdrantClient, pointsToUpsert);
      pointsToUpsert.length = 0; // Clear the array
    }

    // Advance progress once per commit so the reported percentage tracks the loop.
    if (
      currentIndexingStatus.commitsIndexed !== undefined &&
      currentIndexingStatus.totalCommitsToIndex &&
      currentIndexingStatus.totalCommitsToIndex > 0
    ) {
      currentIndexingStatus.commitsIndexed++;
      updateIndexingStatus({
        overallProgress:
          70 +
          Math.round(
            (currentIndexingStatus.commitsIndexed / currentIndexingStatus.totalCommitsToIndex) * commitProgressContribution
          ),
      });
    }
  }

  // Upsert any remaining points
  if (pointsToUpsert.length > 0) {
    logger.info(`Upserting final batch of ${pointsToUpsert.length} commit/diff points...`);
    await upsertPoints(qdrantClient, pointsToUpsert);
  }

  updateIndexingStatus({
    message: `Commit and diff indexing phase complete. Finalizing...`,
    currentCommit: undefined,
    overallProgress: Math.min(99, currentIndexingStatus.overallProgress || 95), // Cap at 99 before final completion
  });
  logger.info(`Finished indexing ${commits.length} commits and their diffs for ${repoPath}`);
}