seta-indexer
Version:
Vector database indexer for documentation
113 lines (112 loc) • 4.33 kB
JavaScript
import { SUPPORTED_EXTENSIONS, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, } from "./config.js";
import fs from "fs-extra";
import { globby } from "globby";
import path from "path";
import { extractTextFromPdf } from "./pdfUtils.js";
/**
 * Splits text into chunks of at most `chunkSize` characters (before overlap),
 * preferring natural boundaries.
 *
 * Strategy, in order:
 *   1. Paragraphs (separated by one or more blank lines) that fit are kept whole.
 *   2. Larger paragraphs are packed sentence by sentence.
 *   3. A single sentence that is still too long is cut into `chunkSize`-sized pieces.
 *
 * Lengths are measured with `String.prototype.length` (UTF-16 code units).
 *
 * @param {string} text - Full document text.
 * @param {number} chunkSize - Maximum chunk length before overlap is added.
 * @param {number} chunkOverlap - Number of trailing characters of the previous
 *   chunk to prepend to each following chunk; `0` (or less) disables overlap.
 * @returns {string[]} Chunks in document order. Blank paragraphs produce no chunk,
 *   so empty or whitespace-only input yields an empty array.
 */
function splitTextSmart(text, chunkSize, chunkOverlap) {
    // Hard cutting needs a positive integer step; otherwise it is disabled so an
    // invalid chunkSize can never cause an endless loop.
    const hardLimit = Number.isFinite(chunkSize) && chunkSize >= 1 ? Math.floor(chunkSize) : 0;
    const chunks = [];
    for (const para of text.split(/\n{2,}/)) {
        // Blank paragraphs carry no content and would only create empty chunks.
        if (para.trim() === "") {
            continue;
        }
        if (para.length <= chunkSize) {
            chunks.push(para);
            continue;
        }
        // Partition the paragraph into sentences without losing any character:
        // the first alternative takes text up to and including its terminators
        // (also covering leading punctuation), the second takes trailing text
        // that has no terminator at all.
        const sentences = para.match(/[^.!?\n]*[.!?\n]+|[^.!?\n]+$/g) || [para];
        let buf = "";
        for (const sent of sentences) {
            if (buf.length + sent.length > chunkSize) {
                if (buf) {
                    chunks.push(buf);
                }
                buf = sent;
                // A single sentence longer than the limit is cut into fixed-size
                // pieces; the remainder stays buffered so later sentences can join it.
                if (hardLimit > 0) {
                    while (buf.length > hardLimit) {
                        chunks.push(buf.slice(0, hardLimit));
                        buf = buf.slice(hardLimit);
                    }
                }
            }
            else {
                buf += sent;
            }
        }
        if (buf) {
            chunks.push(buf);
        }
    }
    // Overlap: every chunk after the first is prefixed with the tail of the previous
    // (un-overlapped) chunk, so it may exceed chunkSize by up to chunkOverlap.
    if (chunkOverlap > 0 && chunks.length > 1) {
        return chunks.map((chunk, i) => (i === 0 ? chunk : chunks[i - 1].slice(-chunkOverlap) + chunk));
    }
    return chunks;
}
/**
 * Finds documentation files under `folder` and labels each with a library and
 * topic derived from its path relative to `folder`.
 *
 * @param {string} folder - Directory used as the glob working directory.
 * @param {{ include?: string[], exclude?: string[] }} [options] - `include`
 *   replaces the default patterns (one `**\/*<ext>` per entry of
 *   SUPPORTED_EXTENSIONS) when non-empty; `exclude` is passed to globby as `ignore`.
 * @returns {Promise<Array<{ filePath: string, relativePath: string, libraryId: string, topicName: (string|undefined) }>>}
 *   One record per matched file. `libraryId` is the first segment of the relative
 *   path (the file name itself for a file directly inside `folder`); `topicName`
 *   is the second segment, set only when the path has more than two segments.
 */
export async function processDocumentationFiles(folder, options = {}) {
    let patterns;
    if (options.include && options.include.length > 0) {
        patterns = options.include;
    }
    else {
        patterns = SUPPORTED_EXTENSIONS.map((ext) => `**/*${ext}`);
    }
    const globOptions = {
        cwd: folder,
        absolute: true,
        ignore: options.exclude || [],
    };
    const matches = await globby(patterns, globOptions);
    return matches.map((filePath) => {
        // Normalise Windows separators so the segments split the same on every platform.
        const relativePath = path.relative(folder, filePath).replace(/\\/g, "/");
        const segments = relativePath.split("/");
        const libraryId = segments[0];
        const topicName = segments.length > 2 ? segments[1] : undefined;
        return { filePath, relativePath, libraryId, topicName };
    });
}
/**
 * Loads one documentation file and splits its text into chunk records.
 *
 * Files whose path ends in ".pdf" (any letter case) go through
 * `extractTextFromPdf`; every other file is read as UTF-8 text. A failure while
 * reading or extracting is logged and produces an empty array instead of a
 * rejection.
 *
 * @param {{ filePath: string, relativePath: string, libraryId: string, topicName?: string }} file
 *   File record; `relativePath` is used in log output and in chunk ids.
 * @param {number} [chunkSize=DEFAULT_CHUNK_SIZE] - Passed to `splitTextSmart`.
 * @param {number} [chunkOverlap=DEFAULT_CHUNK_OVERLAP] - Passed to `splitTextSmart`.
 * @param {boolean} [verbose=false] - When true, progress messages are logged too.
 * @returns {Promise<Array<{ id: string, libraryId: string, topicName: (string|undefined), originalFilePath: string, text: string, order: number, metadata: object }>>}
 *   One record per chunk, with id `<relativePath>::<index>`.
 */
export async function chunkDocument(file, chunkSize = DEFAULT_CHUNK_SIZE, chunkOverlap = DEFAULT_CHUNK_OVERLAP, verbose = false) {
    // Case-insensitive on purpose: "manual.PDF" must take the PDF path rather than
    // being read as UTF-8 text, which would index binary garbage.
    const isPdf = file.filePath.toLowerCase().endsWith(".pdf");
    let text = "";
    if (isPdf) {
        if (verbose) {
            console.log(` 📄 Processing PDF: ${file.relativePath}`);
        }
        try {
            text = await extractTextFromPdf(file.filePath);
            if (verbose) {
                console.log(` ✅ PDF processed successfully: ${file.relativePath}`);
            }
        }
        catch (error) {
            // The failure itself is always reported, even when not verbose.
            console.log(` ⚠️ PDF processing failed: ${file.relativePath} - ${error instanceof Error ? error.message : String(error)}`);
            if (verbose) {
                console.log(` 📝 Skipping PDF file due to processing error`);
            }
            return []; // Return empty chunks for failed PDFs
        }
    }
    else {
        if (verbose) {
            const ext = path.extname(file.filePath).toLowerCase();
            const fileType = ext === ".md" ? "Markdown" : ext === ".txt" ? "Text" : "Document";
            console.log(` 📄 Processing ${fileType}: ${file.relativePath}`);
        }
        try {
            text = await fs.readFile(file.filePath, "utf8");
            if (verbose) {
                console.log(` ✅ File processed successfully: ${file.relativePath}`);
            }
        }
        catch (error) {
            // The failure itself is always reported, even when not verbose.
            console.log(` ⚠️ File processing failed: ${file.relativePath} - ${error instanceof Error ? error.message : String(error)}`);
            if (verbose) {
                console.log(` 📝 Skipping file due to processing error`);
            }
            return []; // Return empty chunks for failed files
        }
    }
    const chunks = splitTextSmart(text, chunkSize, chunkOverlap);
    return chunks.map((chunk, i) => ({
        id: `${file.relativePath}::${i}`,
        libraryId: file.libraryId,
        topicName: file.topicName,
        originalFilePath: file.relativePath,
        text: chunk,
        order: i,
        metadata: {},
    }));
}