seta-indexer
Vector database indexer for documentation
JavaScript
import { processDocumentationFiles, chunkDocument } from "./documentProcessor.js";
import { initializeEmbeddingModel, generateEmbeddingForChunk } from "./embeddingUtils.js";
import { connectToLanceDb, openOrCreateTable, addChunksToTable, deleteChunksBySourceFile } from "./lanceDbUtils.js";
import { loadIndexState, saveIndexState, hasFileChanged, hashFile } from "./fileUtils.js";
import path from "path";
import fs from "fs-extra";
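/**
 * Index a folder of documentation into a LanceDB vector table, embedding each
 * chunk and tracking per-file state so unchanged files are skipped.
 *
 * @param {string} folder  Root directory containing the documentation files.
 * @param {object} options verbose, force, chunkSize, chunkOverlap, include,
 *   exclude, and dbPath (defaults to <folder>/.seta_lancedb).
 */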
export async function runIndexingWorkflow(folder, options) {
const verbose = options.verbose;
const force = options.force;
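// Chunk size and overlap are forwarded to chunkDocument below; defaults are
// 1000 and 200 (the units depend on the chunker implementation).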
const chunkSize = parseInt(options.chunkSize || "1000", 10);
const chunkOverlap = parseInt(options.chunkOverlap || "200", 10);
const dbPath = options.dbPath || path.join(folder, ".seta_lancedb");
if (verbose) {
console.log(`Starting indexing process for: ${folder}`);
console.log(`Documentation path: ${folder}`);
console.log(`Database path: ${dbPath}`);
console.log("Loading existing index state...");
}
// If force, remove DB and state
if (force) {
if (verbose)
console.log("๐งน Removing previous database and state (force reindex)...");
await fs.remove(dbPath);
}
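// The index state maps absolute file paths to { hash, lastModified, chunkCount },
// which lets unchanged files be skipped in the processing loop below.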
let indexState = await loadIndexState(dbPath);
if (verbose)
console.log("๐ Scanning documentation files...");
const documentFiles = await processDocumentationFiles(folder, {
include: options.include,
exclude: options.exclude,
});
if (verbose)
console.log(`Found ${documentFiles.length} documentation files`);
if (verbose)
console.log("๐ง Initializing embedding model...");
const model = await initializeEmbeddingModel();
if (verbose)
console.log("๐ Connecting to LanceDB...");
const db = await connectToLanceDb(dbPath);
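// Open the chunks table, creating it on a fresh database (first run, or after a force reindex).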
const table = await openOrCreateTable(db);
// Handle deleted files (incremental mode only)
if (!force) {
const currentFiles = new Set(documentFiles.map((f) => f.filePath));
const previouslyIndexed = Object.keys(indexState.files);
for (const oldFile of previouslyIndexed) {
if (!currentFiles.has(oldFile)) {
if (verbose)
console.log(` Removing deleted file from DB: ${oldFile}`);
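// Stored chunks are keyed by a folder-relative, forward-slash path (matching
// file.relativePath), so normalize the absolute path before deleting.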
const relPath = oldFile.startsWith(folder)
? oldFile.slice(folder.length + 1).replace(/\\/g, "/")
: oldFile;
await deleteChunksBySourceFile(table, relPath);
delete indexState.files[oldFile];
}
}
}
console.log(`Processing ${documentFiles.length} documentation files...`);
let processedCount = 0;
let skippedCount = 0;
let totalChunks = 0;
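// Main pass: skip files hasFileChanged reports as unchanged, then re-chunk,
// re-embed, and re-insert the rest.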
for (const file of documentFiles) {
if (!force && !(await hasFileChanged(file.filePath, indexState))) {
if (verbose)
console.log(` Skipping unchanged: ${file.relativePath}`);
skippedCount++;
continue;
}
if (verbose)
console.log(` Processing: ${file.relativePath}`);
const chunks = await chunkDocument(file, chunkSize, chunkOverlap, verbose);
if (chunks.length === 0) {
if (verbose)
console.log(` No chunks generated for: ${file.relativePath}`);
continue;
}
if (verbose)
console.log(` Generated ${chunks.length} chunks`);
const embeddedChunks = [];
for (const chunk of chunks) {
const embedding = await generateEmbeddingForChunk(chunk.text, model);
embeddedChunks.push({ ...chunk, embedding });
}
if (verbose)
console.log(` Generated ${embeddedChunks.length} embedded chunks`);
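// Delete-then-insert keeps re-indexing idempotent: stale chunks for this file
// are removed before the fresh ones are added.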
await deleteChunksBySourceFile(table, file.relativePath);
await addChunksToTable(table, embeddedChunks);
// Update index state
const stat = await fs.stat(file.filePath);
const hash = await hashFile(file.filePath);
indexState.files[file.filePath] = {
hash: hash,
lastModified: stat.mtime.toISOString(),
chunkCount: embeddedChunks.length,
};
processedCount++;
totalChunks += embeddedChunks.length;
if (!verbose && processedCount % 5 === 0) {
console.log(` Processed ${processedCount}/${documentFiles.length} files...`);
}
}
if (!verbose) {
console.log(`Processed ${processedCount} files, skipped ${skippedCount} unchanged`);
}
indexState.lastUpdated = new Date().toISOString();
await saveIndexState(dbPath, indexState);
// Show final stats
console.log("โ
Indexing completed");
console.log("\n๐ Final Statistics:");
const all = await table.query().limit(100000).toArray();
const libraries = new Set(all.map((c) => c.libraryId));
const topics = new Set(all.map((c) => c.topicName));
console.log(` - Total chunks: ${all.length}`);
console.log(` - Unique libraries: ${libraries.size}`);
console.log(` - Unique topics: ${topics.size}`);
if (verbose) {
console.log(` - Database path: ${dbPath}`);
}
}
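// Example invocation (a minimal sketch; "./indexer.js" is an assumed module
// path, and the option names mirror the ones read at the top of the function):
//
//   import { runIndexingWorkflow } from "./indexer.js";
//
//   await runIndexingWorkflow("./docs", {
//     verbose: true,
//     force: false,
//     chunkSize: "1000",
//     chunkOverlap: "200",
//     // dbPath is optional and defaults to <folder>/.seta_lancedb
//   });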