// ai-index: AI-powered local code indexing and search system for any codebase
import { globby } from 'globby';
import fs from 'fs/promises';
import path from 'path';
import crypto from 'crypto';
import { execSync } from 'child_process';
import { fileURLToPath } from 'url';
import { loadConfig } from './config.js';
import { createLocalEmbedder } from './local-embedder.js';
import { createLocalVectorStore } from './local-vector-store.js';
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
async function getVersion() {
try {
const packageJsonPath = path.join(__dirname, '..', 'package.json');
const content = await fs.readFile(packageJsonPath, 'utf-8');
const pkg = JSON.parse(content);
return pkg.version;
} catch {
return 'unknown';
}
}
// Parse command line arguments
const args = process.argv.slice(2);
const targetFolder = args[0] || process.cwd();
const REPO_ROOT = path.resolve(targetFolder);
const FORCE_REINDEX = args.includes('--force');
// Generate index name from folder path
const folderName = path.basename(REPO_ROOT);
const INDEX_NAME = folderName.replace(/[^a-zA-Z0-9_-]/g, '_');
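// Usage sketch (entry-point filename assumed; '--force' may appear anywhere):
//   node ai-index.js                         # index the current working directory
//   node ai-index.js /path/to/repo           # index a folder incrementally
//   node ai-index.js /path/to/repo --force   # re-embed every file, ignoring saved hashes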
// Load configuration and initialize local components
const config = await loadConfig();
console.log('🏠 Running in LOCAL mode');
console.log(`Indexing folder: ${REPO_ROOT}`);
console.log(`Index key: ${INDEX_NAME}`);
const embedder = await createLocalEmbedder(config);
const vectorStore = await createLocalVectorStore(config, INDEX_NAME);
const EMBED_DIM = embedder.getDimensions();
async function embed(text) {
return await embedder.embed(text);
}
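// The embedder and vector store are initialized eagerly above, so in local
// mode the two lifecycle hooks below are effectively no-op stubs.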
async function createIndex() {
console.log('Local vector store initialized');
}
async function createPipeline() {
// Not needed for local mode
}
function getFileArea(filePath) {
// Prefix with '/' so top-level directories also match the '/x/' patterns below
// (globby returns relative paths like 'app/models/user.rb' with no leading slash)
const p = '/' + filePath.toLowerCase();
// Backend code (API routes, models, jobs, workers, server entry points)
if (
p.includes('/app/api/') || p.includes('/app/models/') || p.includes('/app/helpers/') ||
p.includes('/app/jobs/') || p.includes('/app/worker') || p.includes('/app/server')
) {
return 'backend';
}
// Frontend app code (also covers common src-based layouts)
if (
p.includes('/app/components/') || p.includes('/app/pages/') || p.includes('/app/data/') || p.includes('/app/public/') ||
p.includes('/src/components/') || p.includes('/src/pages/') || p.includes('/src/hooks/') || p.includes('/src/context') ||
p.endsWith('/router.tsx') || p.endsWith('/router.jsx')
) {
return 'frontend';
}
// Infra/config & PWA related
if (
p.includes('/terraform/') || p.includes('/k8s/') || p.includes('/docker') || p.includes('dockerfile') ||
p.endsWith('vite.config.ts') || p.endsWith('vite.config.js') ||
p.includes('workbox') || p.includes('service-worker') ||
p.endsWith('manifest.ts') || p.endsWith('manifest.json') || p.endsWith('/index.html') ||
p.includes('/scripts/')
) {
return 'infra';
}
// Docs
if (p.includes('/docs/') || p.includes('readme') || p.includes('documentation')) {
return 'docs';
}
return 'other';
}
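// Illustrative classifications (hypothetical paths):
//   'app/models/user.rb'        -> 'backend'
//   'src/components/Button.tsx' -> 'frontend'
//   'terraform/main.tf'         -> 'infra'
//   'docs/setup.md'             -> 'docs'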
function getLanguage(filePath) {
const ext = path.extname(filePath).toLowerCase();
const langMap = {
'.js': 'javascript',
'.mjs': 'javascript',
'.jsx': 'javascript',
'.ts': 'typescript',
'.tsx': 'typescript',
'.py': 'python',
'.go': 'go',
'.tf': 'terraform',
'.yml': 'yaml',
'.yaml': 'yaml',
'.json': 'json',
'.md': 'markdown',
'.scss': 'scss',
'.css': 'css',
'.sql': 'sql',
'.sh': 'bash'
};
return langMap[ext] || 'unknown';
}
function getFileType(filePath) {
// Prefix with '/' so top-level directories also match the patterns below
const p = '/' + filePath.toLowerCase();
if (
p.endsWith('.d.ts') || p.includes('/types/') || p.includes('/@types/')
) return 'types';
if (
p.endsWith('vite.config.ts') || p.endsWith('vite.config.js') || p.includes('workbox') ||
p.includes('service-worker') || p.endsWith('manifest.ts') || p.endsWith('manifest.json') ||
p.endsWith('/index.html') || p.includes('/scripts/')
) return 'config';
if (p.endsWith('.md')) return 'docs';
return 'code';
}
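// Illustrative classifications (hypothetical paths):
//   'src/types/api.d.ts' -> 'types'
//   'vite.config.ts'     -> 'config'
//   'README.md'          -> 'docs'
//   'src/app/server.ts'  -> 'code'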
async function chunkFile(filePath, content) {
const chunks = [];
const lines = content.split('\n');
const language = getLanguage(filePath);
const area = getFileArea(filePath);
const fileType = getFileType(filePath);
// Tighter chunks for config and type definition files to sharpen lexical signals
const maxLinesPerChunk =
(language === 'markdown' || area === 'docs') ? 50 :
(fileType === 'config' || fileType === 'types') ? 18 :
30;
const overlap = (fileType === 'config' || fileType === 'types') ? 3 : 5;
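// The loop below advances by (maxLinesPerChunk - overlap) lines, so for
// regular code (30/5) consecutive chunks share 5 lines of context and
// windows start at lines 0, 25, 50, ...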
for (let i = 0; i < lines.length; i += maxLinesPerChunk - overlap) {
const chunkLines = lines.slice(i, i + maxLinesPerChunk);
const chunkContent = chunkLines.join('\n');
if (chunkContent.trim().length > 50) {
const chunkId = crypto.createHash('md5')
.update(`${filePath}:${i}`)
.digest('hex');
chunks.push({
id: chunkId,
repo_path: filePath,
content: chunkContent,
language,
area,
file_type: fileType,
parent_id: filePath,
start_line: i,
end_line: Math.min(i + maxLinesPerChunk, lines.length)
});
}
}
return chunks;
}
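// A produced chunk looks roughly like this (values illustrative):
//   { id: '<md5 of "src/components/Button.tsx:0">',
//     repo_path: 'src/components/Button.tsx', content: '...',
//     language: 'typescript', area: 'frontend', file_type: 'code',
//     parent_id: 'src/components/Button.tsx', start_line: 0, end_line: 30 }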
async function indexDocuments(documents) {
if (documents.length === 0) return;
await vectorStore.addDocuments(documents);
}
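// Incremental-indexing state lives in ai_index/file_hashes.json, keyed by
// relative path (shape illustrative):
//   { "src/components/Button.tsx": "<sha256 hex of file contents>" }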
async function loadFileHashes() {
const hashFilePath = path.join(REPO_ROOT, 'ai_index/file_hashes.json');
try {
const content = await fs.readFile(hashFilePath, 'utf-8');
return JSON.parse(content);
} catch (err) {
if (err.code === 'ENOENT') {
return {};
}
throw err;
}
}
async function saveFileHashes(hashes) {
const hashFilePath = path.join(REPO_ROOT, 'ai_index/file_hashes.json');
await fs.mkdir(path.dirname(hashFilePath), { recursive: true });
await fs.writeFile(hashFilePath, JSON.stringify(hashes, null, 2));
}
function calculateFileHash(content) {
return crypto.createHash('sha256').update(content, 'utf-8').digest('hex');
}
async function processFiles() {
const patterns = [
'**/*.{js,mjs,jsx,ts,tsx}',
'**/*.{json,yml,yaml}',
'**/*.md',
'**/*.{py,go,java,scala,rs,cpp,c,h}',
'**/*.tf',
// match bare Dockerfiles (no extension) as well as named variants
'**/Dockerfile*',
'**/*.Dockerfile',
'**/*.{sql,sh,bash}'
];
const ignorePatterns = [
'**/node_modules/**',
'**/dist/**',
'**/build/**',
'**/*.min.js',
'**/package-lock.json',
'**/public/fonts/**',
'**/public/icons/**',
'**/public/flags/**'
];
const files = await globby(patterns, {
cwd: REPO_ROOT,
ignore: ignorePatterns,
absolute: false
});
console.log(`Found ${files.length} files to index`);
const previousHashes = await loadFileHashes();
const currentHashes = {};
const filesToProcess = [];
console.log('Checking file changes...');
for (const file of files) {
const fullPath = path.join(REPO_ROOT, file);
try {
const content = await fs.readFile(fullPath, 'utf-8');
const currentHash = calculateFileHash(content);
currentHashes[file] = currentHash;
if (FORCE_REINDEX || previousHashes[file] !== currentHash) {
filesToProcess.push({ file, content });
}
} catch (err) {
console.error(`Error reading ${file}:`, err.message);
}
}
const skippedCount = files.length - filesToProcess.length;
if (FORCE_REINDEX) {
console.log(`Files to process: ${filesToProcess.length} (force reindex enabled)`);
} else {
console.log(`Files to process: ${filesToProcess.length} (${skippedCount} unchanged files skipped)`);
}
if (filesToProcess.length === 0 && !FORCE_REINDEX) {
console.log('No files need reindexing. Index is up to date.');
return;
}
let sha = 'unknown';
try {
sha = execSync('git rev-parse HEAD', { cwd: REPO_ROOT })
.toString().trim();
} catch (err) {
console.log('Warning: Not a git repository, using fallback SHA');
sha = `local-${Date.now()}`;
}
const chunkMap = [];
const MAX_BULK_SIZE = 50;
let totalChunks = 0;
let pendingDocuments = [];
console.log('Processing changed files and creating embeddings...');
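// Per changed file: drop its stale chunks from the store, re-chunk, embed
// each chunk, and flush to the vector store in batches of MAX_BULK_SIZE.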
for (let i = 0; i < filesToProcess.length; i++) {
const { file, content } = filesToProcess[i];
try {
if (previousHashes[file]) {
const removedCount = await vectorStore.removeDocumentsByFile(file);
if (removedCount > 0) {
console.log(`Removed ${removedCount} outdated chunks for ${file}`);
}
}
const chunks = await chunkFile(file, content);
for (const chunk of chunks) {
const embedding = await embed(chunk.content);
pendingDocuments.push({
...chunk,
sha,
embedding
});
chunkMap.push({
chunk_id: chunk.id,
file: chunk.repo_path,
start: chunk.start_line,
end: chunk.end_line,
parent_id: chunk.parent_id,
area: chunk.area
});
if (pendingDocuments.length >= MAX_BULK_SIZE) {
console.log(`Bulk indexing ${pendingDocuments.length} documents...`);
await indexDocuments(pendingDocuments);
totalChunks += pendingDocuments.length;
console.log(`✅ Indexed ${totalChunks} chunks total`);
pendingDocuments = [];
}
}
if ((i + 1) % 10 === 0) {
console.log(`Processed ${i + 1}/${filesToProcess.length} changed files...`);
}
} catch (err) {
console.error(`Error processing ${file}:`, err.message);
}
}
if (pendingDocuments.length > 0) {
console.log(`Bulk indexing final ${pendingDocuments.length} documents...`);
await indexDocuments(pendingDocuments);
totalChunks += pendingDocuments.length;
console.log(`✅ Indexed ${totalChunks} chunks total`);
}
const deletedFiles = Object.keys(previousHashes).filter(file => !currentHashes[file]);
if (deletedFiles.length > 0) {
console.log(`Removing ${deletedFiles.length} deleted files from index...`);
for (const file of deletedFiles) {
const removedCount = await vectorStore.removeDocumentsByFile(file);
if (removedCount > 0) {
console.log(`Removed ${removedCount} chunks for deleted file: ${file}`);
}
}
}
await saveFileHashes(currentHashes);
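// The chunk map is rewritten each run and only records chunks processed in
// this run. Each chunkmap.jsonl line maps a chunk id back to its source
// span, e.g. (values illustrative):
//   {"chunk_id":"<md5>","file":"src/components/Button.tsx","start":0,"end":30,"parent_id":"src/components/Button.tsx","area":"frontend"}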
const chunkMapPath = path.join(REPO_ROOT, 'ai_index/search/chunkmap.jsonl');
await fs.mkdir(path.dirname(chunkMapPath), { recursive: true });
await fs.writeFile(
chunkMapPath,
chunkMap.map(c => JSON.stringify(c)).join('\n')
);
const manifest = {
mode: 'local',
index: INDEX_NAME,
folder: REPO_ROOT,
embed_model: config.EMBED_MODEL,
dim: EMBED_DIM,
last_built_at: new Date().toISOString(),
total_chunks: totalChunks,
total_files: files.length,
processed_files: filesToProcess.length,
skipped_files: skippedCount,
sha
};
await fs.writeFile(
path.join(REPO_ROOT, 'ai_index/manifest.json'),
JSON.stringify(manifest, null, 2)
);
if (FORCE_REINDEX) {
console.log(`\nIndexing complete! Indexed ${totalChunks} chunks from ${filesToProcess.length} files (force reindex)`);
} else {
console.log(`\nIndexing complete! Processed ${filesToProcess.length} changed files, indexed ${totalChunks} new chunks (${skippedCount} files unchanged)`);
}
}
async function generateSymbolIndex() {
console.log('\nGenerating symbol indexes...');
try {
const scipOutput = path.join(REPO_ROOT, 'ai_index/scip/index.scip');
await fs.mkdir(path.dirname(scipOutput), { recursive: true });
execSync(`npx @sourcegraph/scip-typescript index --project-root "${REPO_ROOT}" --output "${scipOutput}" --infer-tsconfig`, {
cwd: REPO_ROOT,
stdio: 'inherit'
});
console.log('SCIP index generated');
} catch (err) {
console.error('Failed to generate SCIP index:', err.message);
}
try {
const tagsOutput = path.join(REPO_ROOT, 'ai_index/tags');
execSync(`ctags -R --languages=JavaScript,TypeScript --fields=+n --extras=+q -f "${tagsOutput}" "${REPO_ROOT}"`, {
cwd: REPO_ROOT,
stdio: 'inherit'
});
console.log('ctags index generated');
} catch (err) {
console.error('Failed to generate ctags index:', err.message);
}
}
async function main() {
const version = await getVersion();
console.log(`Starting AI index build... (v${version})`);
console.log(`Embedding dimension: ${EMBED_DIM}`);
try {
// Check if target folder exists
try {
await fs.access(REPO_ROOT);
} catch {
console.error(`Error: Folder does not exist: ${REPO_ROOT}`);
process.exit(1);
}
await createIndex();
await createPipeline();
await processFiles();
await generateSymbolIndex();
console.log('\nBuild complete!');
const stats = await vectorStore.getStats();
console.log(`📊 Vector store stats:`, stats);
} catch (err) {
console.error('Build failed:', err);
process.exit(1);
}
}
main();