UNPKG

@coworker-agency/rag

Version:

Retrieval Augmented Generation (RAG) library for document indexing, vector storage, and AI-powered question answering

380 lines (316 loc) 12.8 kB
/**
 * Document Indexer
 *
 * This module handles indexing documents from Supabase storage into a vector database.
 * It extracts text from various file formats, generates context-aware vectors,
 * and stores them in Supabase's vector store.
 */

import { createClient } from '@supabase/supabase-js';
import { getContextAwareVectors, extractRelevantContext } from '../cav/index.js';
import { extractTextFromPDF, extractTextFromCSV, extractTextFromJSON } from './extractors.js';
import { generateDocumentMetadata, getFileExtension } from './metadata.js';

/** File extensions accepted by the indexer. */
const SUPPORTED_EXTENSIONS = ['pdf', 'txt', 'md', 'html', 'csv', 'json'];

/** Number of storage objects requested per `list()` call while paging through a bucket. */
const STORAGE_LIST_PAGE_SIZE = 100;

/** Texts longer than this many characters go through `processLargeFile`. */
const LARGE_FILE_THRESHOLD = 100000;

/** How far (in characters) around a nominal split point a paragraph break is looked for. */
const PARAGRAPH_SEARCH_WINDOW = 1000;

/** Rows per insert request in `storeVectors`. */
const INSERT_BATCH_SIZE = 50;

/** Ids per delete request in `removeExistingDocumentRecords`. */
const DELETE_BATCH_SIZE = 100;

/**
 * Process documents from Supabase storage and index them into the vector store.
 *
 * Files are handled one at a time; a failure on one file is logged and counted
 * in `failed` without aborting the run. Only a failure to list the bucket
 * rejects the returned promise.
 *
 * @param {string} supabaseUrl - Supabase project URL
 * @param {string} supabaseSecretKey - Supabase service role key
 * @param {string} bucket - Supabase storage bucket name
 * @param {Object} options - Optional parameters
 * @param {string} [options.openaiApiKey=process.env.OPENAI_API_KEY] - Key passed to vector generation
 * @param {string} [options.tableName='vector_documents'] - Vector store table name
 * @param {boolean} [options.skipExisting=true] - Skip files that already have rows in the table
 * @param {number} [options.partSize] - Forwarded to large-file processing (see `processLargeFile`)
 * @param {number} [options.maxLlmBatchSize] - Forwarded to large-file processing
 * @param {number} [options.skipLlmRefinementThreshold] - Forwarded to large-file processing
 * @returns {Promise<{processed: number, failed: number, files: string[]}>} Processing results
 *
 * NOTE: `batchSize`, `maxFileSize` and `maxTextLength` may be passed in `options`
 * but nothing in this module reads them, so they are currently not enforced.
 */
export async function processDocuments(supabaseUrl, supabaseSecretKey, bucket, options = {}) {
  const {
    openaiApiKey = process.env.OPENAI_API_KEY,
    tableName = 'vector_documents',
    skipExisting = true,
  } = options;

  // Initialize Supabase client
  const supabase = createClient(supabaseUrl, supabaseSecretKey, {
    auth: { persistSession: false },
  });

  // Track processing statistics
  const stats = { processed: 0, failed: 0, files: [] };

  try {
    console.log(`Listing files in bucket: ${bucket}`);
    const files = await listAllFiles(supabase, bucket);
    console.log(`Found ${files.length} files in bucket ${bucket}`);

    // Process files one by one
    for (const file of files) {
      try {
        // Skip folders and non-supported files
        if (file.id === null || !isSupportedFile(file.name)) {
          console.log(`Skipping unsupported file: ${file.name}`);
          continue;
        }

        // The lookup is only needed (and only performed) when skipping is enabled.
        if (skipExisting && (await isAlreadyIndexed(supabase, tableName, file.name))) {
          console.log(`Skipping already indexed file: ${file.name}`);
          continue;
        }

        console.log(`Processing file: ${file.name}`);
        await indexSupabaseFile(supabase, bucket, file, tableName, openaiApiKey, options);

        // Update stats
        stats.processed++;
        stats.files.push(file.name);
        console.log(`Successfully processed: ${file.name}`);
      } catch (fileError) {
        console.error(`Error processing file ${file.name}:`, fileError);
        stats.failed++;
      }
    }

    return stats;
  } catch (error) {
    console.error('Error processing documents:', error);
    throw error;
  }
}

/**
 * List every object at the root of a bucket, paging through the results.
 *
 * A single `list()` call returns at most `limit` entries, so the bucket is
 * read page by page until a short page signals the end.
 *
 * @param {Object} supabase - Supabase client
 * @param {string} bucket - Storage bucket name
 * @returns {Promise<Object[]>} All listed entries
 * @throws The storage error if any page fails to load
 */
async function listAllFiles(supabase, bucket) {
  const files = [];

  for (let offset = 0; ; offset += STORAGE_LIST_PAGE_SIZE) {
    const { data: page, error: listError } = await supabase.storage
      .from(bucket)
      .list('', { limit: STORAGE_LIST_PAGE_SIZE, offset });

    if (listError) {
      console.error('Error listing files:', listError);
      throw listError;
    }

    const entries = page || [];
    files.push(...entries);

    // A page shorter than the requested size is the last one.
    if (entries.length < STORAGE_LIST_PAGE_SIZE) {
      return files;
    }
  }
}

/**
 * Check whether the vector table already holds at least one row for a file.
 *
 * @param {Object} supabase - Supabase client
 * @param {string} tableName - Vector store table name
 * @param {string} fileName - File name stored under `metadata.fileName`
 * @returns {Promise<boolean>} True when a matching row exists
 */
async function isAlreadyIndexed(supabase, tableName, fileName) {
  // `limit(1)` is required: an indexed file normally has many rows, and
  // `maybeSingle()` reports an error when more than one row comes back.
  const { data: existingDoc, error: lookupError } = await supabase
    .from(tableName)
    .select('id')
    .eq('metadata->>fileName', fileName)
    .limit(1)
    .maybeSingle();

  if (lookupError) {
    // Best effort: a failed lookup means "not known to be indexed", so the
    // caller goes on to (re-)index the file.
    console.warn(`Could not check index status for ${fileName}:`, lookupError);
    return false;
  }

  return Boolean(existingDoc);
}

/**
 * Index a single file from Supabase storage
 * @param {Object} supabase - Supabase client
 * @param {string} bucket - Storage bucket name
 * @param {Object} file - File object
 * @param {string} tableName - Vector store table name
 * @param {string} openaiApiKey - OpenAI API key
 * @param {Object} options - Additional options
 * @returns {Promise<void>}
 * @throws The storage error if the download fails, or any extraction / vector / insert error
 */
async function indexSupabaseFile(supabase, bucket, file, tableName, openaiApiKey, options = {}) {
  // Get file metadata
  const documentMetadata = generateDocumentMetadata(file);

  // Download file
  const { data: fileData, error: downloadError } = await supabase.storage
    .from(bucket)
    .download(file.name);

  if (downloadError) {
    console.error(`Error downloading file ${file.name}:`, downloadError);
    throw downloadError;
  }

  const textContent = await extractText(fileData, file.name);

  // Remove existing records only after download and extraction succeeded, so a
  // file that cannot be read does not lose the rows it already has.
  await removeExistingDocumentRecords(supabase, tableName, file.name);

  // Check for empty content
  if (!textContent || textContent.trim() === '') {
    console.warn(`File ${file.name} contains no extractable text content`);
    return;
  }

  // Large texts are split into parts to keep memory usage bounded.
  if (textContent.length > LARGE_FILE_THRESHOLD) {
    console.log(`Large file detected (${textContent.length} chars): ${file.name}`);
    await processLargeFile(textContent, documentMetadata, supabase, tableName, openaiApiKey, options);
    return;
  }

  // Process normal sized file
  console.log(`Generating vectors for: ${file.name}`);
  const vectors = await getContextAwareVectors(textContent, openaiApiKey);

  // Store vectors in database
  await storeVectors(vectors, documentMetadata, supabase, tableName);
}

/**
 * Extract text from downloaded file data using the extractor for its extension.
 *
 * @param {Object} fileData - Downloaded file data (must expose `text()` for plain-text formats)
 * @param {string} fileName - File name, used to pick the extractor
 * @returns {Promise<string>} Extracted text
 */
async function extractText(fileData, fileName) {
  switch (getFileExtension(fileName)) {
    case 'pdf':
      console.log(`Extracting text from PDF: ${fileName}`);
      return extractTextFromPDF(fileData);
    case 'csv':
      console.log(`Extracting text from CSV: ${fileName}`);
      return extractTextFromCSV(fileData);
    case 'json':
      console.log(`Extracting text from JSON: ${fileName}`);
      return extractTextFromJSON(fileData);
    default:
      // Handle text files directly
      return fileData.text();
  }
}

/**
 * Split text into consecutive parts of roughly `partSize` characters.
 *
 * Each part starts exactly where the previous one ended, so concatenating the
 * parts reproduces the input. A split point is moved to just after a blank
 * line (`\n\n`) when one is found near the nominal position.
 *
 * @param {string} textContent - Text to split
 * @param {number} partSize - Nominal part length in characters (must be >= 1)
 * @returns {string[]} Parts in document order
 * @throws {RangeError} If `partSize` is not a number >= 1
 */
function splitIntoParts(textContent, partSize) {
  const size = Math.floor(partSize);
  if (!(size >= 1)) {
    throw new RangeError(`partSize must be a number >= 1, got ${partSize}`);
  }

  const parts = [];
  let start = 0;

  while (start < textContent.length) {
    let end = Math.min(start + size, textContent.length);

    // Prefer ending the part at a paragraph boundary.
    if (end < textContent.length) {
      // Never search behind the part's own start, otherwise `end` could move
      // backwards past `start`.
      const searchFrom = Math.max(start, end - PARAGRAPH_SEARCH_WINDOW);
      const paragraphBreak = textContent.indexOf('\n\n', searchFrom);
      if (paragraphBreak !== -1 && paragraphBreak < end + PARAGRAPH_SEARCH_WINDOW) {
        end = paragraphBreak + 2;
      }
    }

    parts.push(textContent.substring(start, end));
    // Continue from the adjusted end so no text is skipped or repeated.
    start = end;
  }

  return parts;
}

/**
 * Process a large file by splitting it into manageable parts
 * Memory-efficient processing to avoid heap out of memory errors
 * @param {string} textContent - Full text content
 * @param {Object} documentMetadata - Document metadata
 * @param {Object} supabase - Supabase client
 * @param {string} tableName - Vector store table name
 * @param {string} openaiApiKey - OpenAI API key
 * @param {Object} options - Additional options
 * @param {number} [options.partSize=50000] - Nominal characters per part
 * @param {number} [options.maxLlmBatchSize=5] - Batch size passed to vector generation
 * @param {number} [options.skipLlmRefinementThreshold=500000] - Total text length above which
 *   LLM refinement is skipped and only an excerpt of each part is vectorized
 * @returns {Promise<void>}
 * @throws {RangeError} If `options.partSize` is not a number >= 1
 */
async function processLargeFile(textContent, documentMetadata, supabase, tableName, openaiApiKey, options = {}) {
  const {
    partSize = 50000,
    maxLlmBatchSize = 5,
    skipLlmRefinementThreshold = 500000, // Skip LLM refinement for files larger than 500K chars
  } = options;

  const parts = splitIntoParts(textContent, partSize);
  console.log(`Split large file into ${parts.length} parts for memory-efficient processing`);

  // Both values depend only on the whole document, not on the individual part.
  const isVeryLarge = textContent.length > skipLlmRefinementThreshold;
  const cavOptions = {
    batchSize: maxLlmBatchSize, // Use smaller batch size for large files
    skipLlmRefinement: isVeryLarge, // Skip LLM refinement for very large files
    isLargeDocument: true,
  };

  // Process each part separately with memory-efficient options
  for (let i = 0; i < parts.length; i++) {
    const partMetadata = {
      ...documentMetadata,
      partNumber: i + 1,
      totalParts: parts.length,
    };

    console.log(`Processing part ${i + 1}/${parts.length}`);

    // For very large content, only pass an excerpt to context-aware vector generation
    let contentToProcess = parts[i];
    if (isVeryLarge) {
      console.log('Using memory-efficient processing for very large content');
      contentToProcess = extractRelevantContext(parts[i], 10000);
    }

    // Generate context-aware vectors with memory-efficient options
    const vectors = await getContextAwareVectors(contentToProcess, openaiApiKey, cavOptions);

    // Store vectors
    await storeVectors(vectors, partMetadata, supabase, tableName);

    // Help garbage collection between parts (`gc` only exists when the
    // runtime exposes it, e.g. node --expose-gc).
    if (globalThis.gc) {
      console.log('Running garbage collection between file parts');
      globalThis.gc();
    }
  }
}

/**
 * Store vectors in the database
 * @param {Array<{context: string, content: string, vector: number[]}>} vectors - Vectors to store
 * @param {Object} documentMetadata - Document metadata
 * @param {Object} supabase - Supabase client
 * @param {string} tableName - Vector store table name
 * @returns {Promise<Array>} Ids of the inserted rows
 * @throws The insert error of the first batch that fails (earlier batches stay inserted)
 */
async function storeVectors(vectors, documentMetadata, supabase, tableName) {
  console.log(`Storing ${vectors.length} vectors in table: ${tableName}`);

  // Keep track of inserted IDs
  const insertedIds = [];

  try {
    // Insert vectors in small batches to avoid memory issues
    const batches = Math.ceil(vectors.length / INSERT_BATCH_SIZE);

    for (let batchIndex = 0; batchIndex < batches; batchIndex++) {
      const start = batchIndex * INSERT_BATCH_SIZE;
      const currentBatch = vectors.slice(start, start + INSERT_BATCH_SIZE);

      const records = currentBatch.map((vector) => ({
        content: vector.content,
        metadata: {
          ...documentMetadata,
          context: vector.context, // Store context in metadata for reference
        },
        embedding: vector.vector,
      }));

      // Insert batch into Supabase
      const { data, error } = await supabase
        .from(tableName)
        .insert(records)
        .select('id');

      if (error) {
        console.error(`Error inserting batch ${batchIndex + 1}/${batches}:`, error);
        throw error;
      }

      insertedIds.push(...(data || []).map((record) => record.id));
      console.log(`Inserted batch ${batchIndex + 1}/${batches} with ${currentBatch.length} vectors`);
    }

    console.log(`Successfully stored ${insertedIds.length} vectors in the database`);
  } catch (error) {
    console.error('Error storing vectors:', error);
    throw error;
  }

  return insertedIds;
}

/**
 * Check if file type is supported
 * @param {string} fileName - File name
 * @returns {boolean} Is supported
 */
function isSupportedFile(fileName) {
  return SUPPORTED_EXTENSIONS.includes(getFileExtension(fileName));
}

/**
 * Remove all existing vector records for a document.
 *
 * Best effort: query and delete errors are logged and never thrown, so the
 * caller continues even when old rows could not be removed.
 *
 * @param {Object} supabase - Supabase client
 * @param {string} tableName - Vector store table name
 * @param {string} fileName - File name
 * @returns {Promise<void>}
 */
async function removeExistingDocumentRecords(supabase, tableName, fileName) {
  try {
    console.log(`Removing existing records for file: ${fileName}`);

    // Find records with matching filename in metadata
    // NOTE(review): this select is not paginated; if the API caps the number of
    // rows per response, a document with very many rows may not be fully
    // removed in one call - confirm against the project's settings.
    const { data: existingRecords, error: queryError } = await supabase
      .from(tableName)
      .select('id')
      .eq('metadata->>fileName', fileName);

    if (queryError) {
      console.error('Error querying existing records:', queryError);
      return;
    }

    if (!existingRecords || existingRecords.length === 0) {
      console.log('No existing records found');
      return;
    }

    console.log(`Found ${existingRecords.length} existing records to remove`);

    // Delete records in batches to avoid RPC timeouts
    const batches = Math.ceil(existingRecords.length / DELETE_BATCH_SIZE);
    let failedBatches = 0;

    for (let i = 0; i < batches; i++) {
      const batchIds = existingRecords
        .slice(i * DELETE_BATCH_SIZE, (i + 1) * DELETE_BATCH_SIZE)
        .map((record) => record.id);

      const { error: deleteError } = await supabase
        .from(tableName)
        .delete()
        .in('id', batchIds);

      if (deleteError) {
        console.error(`Error deleting batch ${i + 1}/${batches}:`, deleteError);
        failedBatches++;
      }
    }

    // Only report success when every batch was actually deleted.
    if (failedBatches === 0) {
      console.log('Successfully removed existing records');
    } else {
      console.warn(`Failed to remove ${failedBatches}/${batches} batches of existing records`);
    }
  } catch (error) {
    console.error('Error removing existing document records:', error);
  }
}

// `processDocuments` is exported at its declaration above. It must not be
// exported a second time here: duplicate export names are a SyntaxError.