@coworker-agency/rag
Version:
Retrieval Augmented Generation (RAG) library for document indexing, vector storage, and AI-powered question answering
380 lines (316 loc) • 12.8 kB
JavaScript
/**
* Document Indexer
*
* This module handles indexing documents from Supabase storage into a vector database.
* It extracts text from various file formats, generates context-aware vectors,
* and stores them in Supabase's vector store.
*/
import { createClient } from '@supabase/supabase-js';
import { getContextAwareVectors, extractRelevantContext } from '../cav/index.js';
import { extractTextFromPDF, extractTextFromCSV, extractTextFromJSON } from './extractors.js';
import { generateDocumentMetadata, getFileExtension } from './metadata.js';
/**
 * Process documents from Supabase storage and index them into the vector store.
 *
 * Every page of the bucket's top-level listing is fetched, folders and
 * unsupported file types are skipped, and each remaining file is indexed inside
 * its own try/catch so that one failing file does not abort the whole run.
 *
 * @param {string} supabaseUrl - Supabase project URL
 * @param {string} supabaseSecretKey - Supabase service role key
 * @param {string} bucket - Supabase storage bucket name
 * @param {Object} [options] - Optional parameters; the whole object is also forwarded to the per-file indexer
 * @param {string} [options.openaiApiKey=process.env.OPENAI_API_KEY] - OpenAI API key
 * @param {string} [options.tableName='vector_documents'] - Vector store table name
 * @param {boolean} [options.skipExisting=true] - Skip files that already have rows in the table
 * @returns {Promise<{processed: number, failed: number, files: string[]}>} Processing results
 * @throws Rethrows the storage error when the bucket cannot be listed
 */
export async function processDocuments(supabaseUrl, supabaseSecretKey, bucket, options = {}) {
  const {
    openaiApiKey = process.env.OPENAI_API_KEY,
    tableName = 'vector_documents',
    skipExisting = true,
  } = options;

  // Initialize Supabase client; no session needs to be persisted for a batch job
  const supabase = createClient(supabaseUrl, supabaseSecretKey, {
    auth: { persistSession: false },
  });

  // Track processing statistics
  const stats = {
    processed: 0,
    failed: 0,
    files: [],
  };

  try {
    console.log(`Listing files in bucket: ${bucket}`);
    const files = await listAllBucketFiles(supabase, bucket);
    console.log(`Found ${files.length} files in bucket ${bucket}`);

    // Process files one by one
    for (const file of files) {
      try {
        // Skip folders (listed with a null id) and non-supported files
        if (file.id === null || !isSupportedFile(file.name)) {
          console.log(`Skipping unsupported file: ${file.name}`);
          continue;
        }

        // Only pay for the lookup when its result can actually cause a skip
        if (skipExisting && (await isFileAlreadyIndexed(supabase, tableName, file.name))) {
          console.log(`Skipping already indexed file: ${file.name}`);
          continue;
        }

        console.log(`Processing file: ${file.name}`);
        await indexSupabaseFile(supabase, bucket, file, tableName, openaiApiKey, options);

        // Update stats
        stats.processed++;
        stats.files.push(file.name);
        console.log(`Successfully processed: ${file.name}`);
      } catch (fileError) {
        console.error(`Error processing file ${file.name}:`, fileError);
        stats.failed++;
      }
    }

    return stats;
  } catch (error) {
    console.error('Error processing documents:', error);
    throw error;
  }
}

/**
 * List every entry at the top level of a bucket, following pagination.
 * A single list() call returns at most one page, so larger buckets would
 * otherwise be silently truncated.
 * @param {Object} supabase - Supabase client
 * @param {string} bucket - Storage bucket name
 * @returns {Promise<Object[]>} All listed entries
 * @throws Rethrows the storage error returned by list()
 */
async function listAllBucketFiles(supabase, bucket) {
  const pageSize = 100;
  const allFiles = [];

  for (let offset = 0; ; offset += pageSize) {
    const { data: page, error: listError } = await supabase.storage
      .from(bucket)
      .list('', { limit: pageSize, offset });

    if (listError) {
      console.error('Error listing files:', listError);
      throw listError;
    }

    const entries = page || [];
    allFiles.push(...entries);

    // A short page means the listing is exhausted
    if (entries.length < pageSize) {
      return allFiles;
    }
  }
}

/**
 * Check whether at least one vector row already exists for a file.
 * @param {Object} supabase - Supabase client
 * @param {string} tableName - Vector store table name
 * @param {string} fileName - File name stored under metadata.fileName
 * @returns {Promise<boolean>} True when a matching row exists
 */
async function isFileAlreadyIndexed(supabase, tableName, fileName) {
  // limit(1) is required: an indexed file normally has many rows, and
  // maybeSingle() reports an error (with null data) when more than one matches.
  const { data: existingDoc, error: lookupError } = await supabase
    .from(tableName)
    .select('id')
    .eq('metadata->>fileName', fileName)
    .limit(1)
    .maybeSingle();

  if (lookupError) {
    // Best effort: when the lookup fails, fall back to (re)indexing the file
    console.warn(`Could not check whether ${fileName} is already indexed:`, lookupError);
    return false;
  }

  return Boolean(existingDoc);
}
/**
 * Index a single file from Supabase storage.
 *
 * The file is downloaded and its text extracted first; only then are the
 * file's previous vector rows removed and the new ones generated. Doing the
 * removal after extraction means a failed download or parse leaves the
 * existing index for that file untouched.
 *
 * @param {Object} supabase - Supabase client
 * @param {string} bucket - Storage bucket name
 * @param {Object} file - File object (its `name` is used as the storage path)
 * @param {string} tableName - Vector store table name
 * @param {string} openaiApiKey - OpenAI API key
 * @param {Object} [options] - Additional options, forwarded to large-file processing
 * @param {number} [options.largeFileThreshold=100000] - Text length (chars) above which the file is processed in parts
 * @returns {Promise<void>}
 * @throws Rethrows download, extraction, vector generation and storage errors
 */
async function indexSupabaseFile(supabase, bucket, file, tableName, openaiApiKey, options = {}) {
  const { largeFileThreshold = 100000 } = options;

  // Get file metadata
  const documentMetadata = generateDocumentMetadata(file);

  // Download file
  const { data: fileData, error: downloadError } = await supabase.storage
    .from(bucket)
    .download(file.name);

  if (downloadError) {
    console.error(`Error downloading file ${file.name}:`, downloadError);
    throw downloadError;
  }

  // Extract text based on file type
  const textContent = await extractFileText(fileData, file.name);

  // The new content is in hand, so the old records can now be replaced safely
  await removeExistingDocumentRecords(supabase, tableName, file.name);

  // Check for empty content
  if (!textContent || textContent.trim() === '') {
    console.warn(`File ${file.name} contains no extractable text content`);
    return;
  }

  // Large texts are split into parts to keep memory usage bounded
  if (textContent.length > largeFileThreshold) {
    console.log(`Large file detected (${textContent.length} chars): ${file.name}`);
    await processLargeFile(textContent, documentMetadata, supabase, tableName, openaiApiKey, options);
    return;
  }

  // Process normal sized file in one go
  console.log(`Generating vectors for: ${file.name}`);
  const vectors = await getContextAwareVectors(textContent, openaiApiKey);

  // Store vectors in database
  await storeVectors(vectors, documentMetadata, supabase, tableName);
}

/**
 * Extract plain text from downloaded file data using the extractor that matches
 * the file extension; any other extension is read directly as text.
 * @param {Object} fileData - Downloaded file data (must expose text() for plain-text files)
 * @param {string} fileName - File name used to pick the extractor
 * @returns {Promise<string>} Extracted text
 */
async function extractFileText(fileData, fileName) {
  switch (getFileExtension(fileName)) {
    case 'pdf':
      console.log(`Extracting text from PDF: ${fileName}`);
      return extractTextFromPDF(fileData);
    case 'csv':
      console.log(`Extracting text from CSV: ${fileName}`);
      return extractTextFromCSV(fileData);
    case 'json':
      console.log(`Extracting text from JSON: ${fileName}`);
      return extractTextFromJSON(fileData);
    default:
      // Handle text files directly
      return fileData.text();
  }
}
/**
 * Process a large file by splitting it into manageable parts.
 *
 * The text is cut into consecutive, non-overlapping parts of roughly
 * `partSize` characters (preferring paragraph boundaries); vectors are
 * generated and stored for one part at a time.
 *
 * @param {string} textContent - Full text content
 * @param {Object} documentMetadata - Document metadata
 * @param {Object} supabase - Supabase client
 * @param {string} tableName - Vector store table name
 * @param {string} openaiApiKey - OpenAI API key
 * @param {Object} [options] - Additional options
 * @param {number} [options.partSize=50000] - Target part size in characters
 * @param {number} [options.maxLlmBatchSize=5] - Batch size passed to vector generation
 * @param {number} [options.skipLlmRefinementThreshold=500000] - Above this total length, LLM refinement is skipped and only an excerpt of each part is vectorized
 * @returns {Promise<void>}
 * @throws {RangeError} When partSize is not a positive number
 */
async function processLargeFile(textContent, documentMetadata, supabase, tableName, openaiApiKey, options = {}) {
  const {
    partSize = 50000,
    maxLlmBatchSize = 5,
    skipLlmRefinementThreshold = 500000, // Skip LLM refinement for files larger than 500K chars
  } = options;

  const parts = splitTextIntoParts(textContent, partSize);
  console.log(`Split large file into ${parts.length} parts for memory-efficient processing`);

  // Depends only on the total size, so decide once instead of once per part
  const isVeryLarge = textContent.length > skipLlmRefinementThreshold;

  // Process each part separately with memory-efficient options
  for (let i = 0; i < parts.length; i++) {
    const partMetadata = {
      ...documentMetadata,
      partNumber: i + 1,
      totalParts: parts.length,
    };
    console.log(`Processing part ${i + 1}/${parts.length}`);

    const cavOptions = {
      batchSize: maxLlmBatchSize, // Use smaller batch size for large files
      skipLlmRefinement: isVeryLarge, // Skip LLM refinement for very large files
      isLargeDocument: true,
    };

    // For very large content, only an excerpt of the part is vectorized
    let contentToProcess;
    if (isVeryLarge) {
      console.log('Using memory-efficient processing for very large content');
      contentToProcess = extractRelevantContext(parts[i], 10000);
    } else {
      contentToProcess = parts[i];
    }

    // Generate context-aware vectors with memory-efficient options
    const vectors = await getContextAwareVectors(contentToProcess, openaiApiKey, cavOptions);

    // Store vectors
    await storeVectors(vectors, partMetadata, supabase, tableName);

    // Help garbage collection between parts (only available with --expose-gc)
    if (global.gc) {
      console.log('Running garbage collection between file parts');
      global.gc();
    }
  }
}

/**
 * Split text into consecutive parts of about `partSize` characters.
 * Each cut is moved to the first paragraph break ('\n\n') found within
 * 1000 characters of the target position when there is one. The next part
 * always starts exactly where the previous one ended, so joining the parts
 * reproduces the input.
 * @param {string} text - Text to split
 * @param {number} partSize - Target part size in characters
 * @returns {string[]} Parts in document order
 * @throws {RangeError} When partSize is not a positive number
 */
function splitTextIntoParts(text, partSize) {
  if (!(partSize > 0)) {
    // A non-positive step could never advance through the text
    throw new RangeError(`partSize must be a positive number, got ${partSize}`);
  }

  const boundaryWindow = 1000;
  const parts = [];
  let start = 0;

  while (start < text.length) {
    let end = Math.min(start + partSize, text.length);

    // Try to end the part at a paragraph break near the target position
    if (end < text.length) {
      // Never search before the current part, so the cut always moves forward
      const searchFrom = Math.max(start, end - boundaryWindow);
      const paragraphEnd = text.indexOf('\n\n', searchFrom);
      if (paragraphEnd !== -1 && paragraphEnd < end + boundaryWindow) {
        end = paragraphEnd + 2;
      }
    }

    parts.push(text.substring(start, end));
    // Continue from the actual cut, not from start + partSize
    start = end;
  }

  return parts;
}
/**
 * Persist generated vectors as rows of the vector table.
 *
 * Rows are inserted in groups of 50. Each row carries the chunk content, the
 * embedding, and the document metadata extended with the chunk's context.
 *
 * @param {Array<{context: string, content: string, vector: number[]}>} vectors - Vectors to store
 * @param {Object} documentMetadata - Document metadata copied into every row
 * @param {Object} supabase - Supabase client
 * @param {string} tableName - Vector store table name
 * @returns {Promise<Array>} IDs of the inserted rows, in insertion order
 * @throws Rethrows the error reported by a failed insert
 */
async function storeVectors(vectors, documentMetadata, supabase, tableName) {
  console.log(`Storing ${vectors.length} vectors in table: ${tableName}`);

  // Small groups keep each request (and its memory footprint) bounded
  const groupSize = 50;
  const groupCount = Math.ceil(vectors.length / groupSize);
  const storedIds = [];

  try {
    let groupNumber = 0;
    for (let offset = 0; offset < vectors.length; offset += groupSize) {
      groupNumber += 1;
      const group = vectors.slice(offset, offset + groupSize);

      const rows = group.map((item) => {
        // The context is kept in the metadata for reference
        const metadata = { ...documentMetadata, context: item.context };
        return { content: item.content, metadata, embedding: item.vector };
      });

      const insertResult = await supabase.from(tableName).insert(rows).select('id');
      if (insertResult.error) {
        console.error(`Error inserting batch ${groupNumber}/${groupCount}:`, insertResult.error);
        throw insertResult.error;
      }

      // Remember which rows were created
      for (const row of insertResult.data || []) {
        storedIds.push(row.id);
      }
      console.log(`Inserted batch ${groupNumber}/${groupCount} with ${group.length} vectors`);
    }

    console.log(`Successfully stored ${storedIds.length} vectors in the database`);
  } catch (error) {
    console.error('Error storing vectors:', error);
    throw error;
  }

  return storedIds;
}
/**
 * Tell whether a file can be indexed, judged by its extension.
 * Accepted extensions: pdf, txt, md, html, csv, json.
 * @param {string} fileName - File name
 * @returns {boolean} True when the extension is one of the accepted ones
 */
function isSupportedFile(fileName) {
  switch (getFileExtension(fileName)) {
    case 'pdf':
    case 'txt':
    case 'md':
    case 'html':
    case 'csv':
    case 'json':
      return true;
    default:
      return false;
  }
}
/**
 * Remove all existing vector records for a document.
 *
 * Best effort: query and delete failures are logged and never thrown, so the
 * caller continues either way. Success is only reported when every delete
 * batch went through.
 *
 * @param {Object} supabase - Supabase client
 * @param {string} tableName - Vector store table name
 * @param {string} fileName - File name matched against metadata.fileName
 * @returns {Promise<void>}
 */
async function removeExistingDocumentRecords(supabase, tableName, fileName) {
  try {
    console.log(`Removing existing records for file: ${fileName}`);

    // Find records with matching filename in metadata
    // NOTE(review): a single select may be capped by the API's max-rows
    // setting, which would leave some rows behind for very large documents — confirm.
    const { data: existingRecords, error: queryError } = await supabase
      .from(tableName)
      .select('id')
      .eq('metadata->>fileName', fileName);

    if (queryError) {
      console.error('Error querying existing records:', queryError);
      return;
    }

    if (!existingRecords || existingRecords.length === 0) {
      console.log('No existing records found');
      return;
    }

    console.log(`Found ${existingRecords.length} existing records to remove`);

    // Delete records in batches to avoid RPC timeouts
    const batchSize = 100;
    const batches = Math.ceil(existingRecords.length / batchSize);
    let failedBatches = 0;

    for (let i = 0; i < batches; i++) {
      const batchIds = existingRecords
        .slice(i * batchSize, (i + 1) * batchSize)
        .map((record) => record.id);

      const { error: deleteError } = await supabase
        .from(tableName)
        .delete()
        .in('id', batchIds);

      if (deleteError) {
        console.error(`Error deleting batch ${i + 1}/${batches}:`, deleteError);
        // Keep going with the remaining batches, but remember the failure
        failedBatches += 1;
      }
    }

    if (failedBatches === 0) {
      console.log('Successfully removed existing records');
    } else {
      console.warn(
        `Failed to delete ${failedBatches}/${batches} batches of existing records for file: ${fileName}`
      );
    }
  } catch (error) {
    console.error('Error removing existing document records:', error);
  }
}
// processDocuments is already exported at its declaration (`export async function`).
// Listing it again in an `export { ... }` clause is a duplicate export, which is a SyntaxError at module load.