UNPKG

@coworker-agency/rag

Version:

Retrieval Augmented Generation (RAG) library for document indexing, vector storage, and AI-powered question answering

168 lines (140 loc) 5.68 kB
/**
 * Uses LLM to improve chunk quality and provide context for each chunk
 */

import OpenAI from 'openai';

/** Number of leading characters of a chunk used as its preview. */
const CHUNK_PREVIEW_LENGTH = 150;

/** Maximum number of document-excerpt characters embedded in each prompt. */
const MAX_EXCERPT_LENGTH = 2000;

/** Batch size used when the caller does not supply a usable one. */
const DEFAULT_BATCH_SIZE = 10;

/** Matches a `#`-style heading line, or a line underlined with `=`/`-` characters. */
const HEADING_PATTERN = /^#{1,6}\s+(.+)$|^(.+)\n[=-]+\n/m;

/** Matches the placeholders that appear in the prompt templates. */
const PLACEHOLDER_PATTERN = /\{\{(DOCUMENT_EXCERPT|CHUNK_CONTENT)\}\}/g;

/**
 * Extract a basic summary from a chunk without using LLM
 *
 * The summary is the first 150 characters of the chunk (trimmed) followed by
 * "...", prefixed with the first heading found in the chunk, if any.
 *
 * @param {string} chunk - Text chunk (non-string values are coerced; null/undefined become '')
 * @param {string} documentContext - Optional document context. Accepted for
 *   API compatibility; it does not currently influence the result.
 * @returns {string} Chunk summary
 */
export function extractChunkSummary(chunk, documentContext = '') {
  // This function is the fallback when the LLM path fails, so it must never
  // throw on odd input: coerce instead of assuming a string.
  const text = chunk == null ? '' : String(chunk);

  const chunkPreview = text.substring(0, CHUNK_PREVIEW_LENGTH).trim();

  // Extract potential headings or titles
  const headingMatch = text.match(HEADING_PATTERN);
  const heading = headingMatch ? headingMatch[1] || headingMatch[2] : '';

  return heading ? `${heading}: ${chunkPreview}...` : `${chunkPreview}...`;
}

/**
 * Get detailed prompt template for chunk processing
 * @returns {string} Prompt template containing the {{DOCUMENT_EXCERPT}} and
 *   {{CHUNK_CONTENT}} placeholders
 */
function getDetailedChunkPrompt() {
  return `You are processing a document chunk for a RAG system. The chunk is part of a larger document.

Document excerpt for context:
{{DOCUMENT_EXCERPT}}

Current chunk content:
{{CHUNK_CONTENT}}

Please perform two tasks:
1. Create a brief contextual summary of this chunk (1-2 sentences)
2. Clean up the chunk content if needed (fix formatting, remove artifacts)

Return your result as a JSON object with:
- "context": The brief contextual summary
- "content": The cleaned chunk content

The context should help understand where this chunk fits in the document.`;
}

/**
 * Get compact prompt template for chunk processing (used for large documents)
 * @returns {string} Prompt template containing the {{DOCUMENT_EXCERPT}} and
 *   {{CHUNK_CONTENT}} placeholders
 */
function getCompactChunkPrompt() {
  return `Document excerpt:
{{DOCUMENT_EXCERPT}}

Chunk:
{{CHUNK_CONTENT}}

Return JSON with:
- "context": Brief summary (1-2 sentences)
- "content": Cleaned chunk`;
}

/**
 * Substitute the template placeholders in a single pass.
 *
 * A replacer function is used on purpose: with a plain string replacement,
 * sequences such as "$&" or "$$" in document text would be interpreted as
 * substitution patterns. The single pass also guarantees that placeholder-like
 * text inside an inserted value is never substituted again.
 *
 * @param {string} template - Prompt template
 * @param {{DOCUMENT_EXCERPT: string, CHUNK_CONTENT: string}} values - Placeholder values
 * @returns {string} Filled prompt
 */
function fillPromptTemplate(template, values) {
  return template.replace(PLACEHOLDER_PATTERN, (placeholder, key) => values[key]);
}

/**
 * Return `value` when it is a non-blank string, otherwise `fallback`.
 * @param {*} value - Candidate value taken from the parsed LLM response
 * @param {string} fallback - Value to use when the candidate is unusable
 * @returns {string}
 */
function nonEmptyStringOr(value, fallback) {
  return typeof value === 'string' && value.trim() !== '' ? value : fallback;
}

/**
 * Ask the LLM to summarise and clean one chunk; fall back to the original
 * chunk with a basic summary when the request or the response parsing fails.
 *
 * @param {OpenAI} openai - OpenAI client
 * @param {string} chunk - Chunk to process
 * @param {number} chunkIndex - Index of the chunk in the full chunk list (used in warnings)
 * @param {{modelName: string, maxTokensPerRequest: number, promptTemplate: string, excerpt: string}} settings
 * @returns {Promise<{context: string, content: string}>}
 */
async function refineChunk(openai, chunk, chunkIndex, settings) {
  const { modelName, maxTokensPerRequest, promptTemplate, excerpt } = settings;

  try {
    const prompt = fillPromptTemplate(promptTemplate, {
      DOCUMENT_EXCERPT: excerpt,
      CHUNK_CONTENT: String(chunk),
    });

    const response = await openai.chat.completions.create({
      model: modelName,
      messages: [
        {
          role: "system",
          content: "You are a document processing assistant that extracts context and fixes text chunks."
        },
        { role: "user", content: prompt }
      ],
      temperature: 0.3,
      max_tokens: maxTokensPerRequest,
      response_format: { type: "json_object" }
    });

    // Parse response and add fixed chunk
    try {
      const responseContent = response.choices[0].message.content;
      const parsedResponse = JSON.parse(responseContent);
      return {
        // Only accept non-blank strings so the documented return shape holds
        // even when the model returns objects, numbers or empty values.
        context: nonEmptyStringOr(parsedResponse.context, extractChunkSummary(chunk)),
        content: nonEmptyStringOr(parsedResponse.content, chunk)
      };
    } catch (parseError) {
      console.warn(`Error parsing LLM response for chunk ${chunkIndex}, using original chunk:`, parseError);
      return { context: extractChunkSummary(chunk), content: chunk };
    }
  } catch (error) {
    console.warn(`Error processing chunk ${chunkIndex}, using original chunk:`, error);
    return { context: extractChunkSummary(chunk), content: chunk };
  }
}

/**
 * Fix chunks and add context using LLM
 *
 * Chunks are processed in batches; the requests of one batch run concurrently
 * and batches run one after another. A chunk whose request or response
 * parsing fails is returned unchanged with a basic (non-LLM) summary.
 *
 * @param {Array<string>} chunks - Document chunks
 * @param {string} documentExcerpt - Excerpt of full document for context
 *   (only the first 2000 characters are sent with each request)
 * @param {string} openaiApiKey - OpenAI API key (not needed when
 *   `options.skipLlmRefinement` is true)
 * @param {Object} options - Additional options
 * @param {string} [options.modelName='gpt-4o'] - Chat model to use
 * @param {number} [options.batchSize=10] - Chunks processed concurrently per
 *   batch; values below 1 or non-numeric values fall back to the default
 * @param {boolean} [options.skipLlmRefinement=false] - Skip the LLM entirely
 *   and use `extractChunkSummary` for every chunk
 * @param {number} [options.maxTokensPerRequest=4000] - `max_tokens` per request
 * @param {boolean} [options.isLargeDocument=false] - Use the compact prompt
 *   and request garbage collection between batches when `gc` is exposed
 * @returns {Promise<Array<{context: string, content: string}>>} Array of chunks with context
 * @throws {TypeError} If `chunks` is not an array
 */
export async function fixChunks(chunks, documentExcerpt, openaiApiKey, options = {}) {
  const {
    modelName = 'gpt-4o',
    batchSize = DEFAULT_BATCH_SIZE,
    skipLlmRefinement = false,
    maxTokensPerRequest = 4000,
    isLargeDocument = false
  } = options;

  if (!Array.isArray(chunks)) {
    throw new TypeError('fixChunks: chunks must be an array of strings');
  }

  // Handle the LLM-free path before touching the OpenAI client so that it
  // works without an API key.
  if (skipLlmRefinement) {
    console.log('Skipping LLM refinement for large document, using basic context extraction');
    return chunks.map((chunk) => ({
      context: extractChunkSummary(chunk, documentExcerpt),
      content: chunk
    }));
  }

  if (chunks.length === 0) {
    return [];
  }

  const openai = new OpenAI({ apiKey: openaiApiKey });

  // Normalise the batch size: 0, negative or NaN would otherwise loop forever
  // or skip every chunk, and Infinity would yield zero batches.
  const requestedBatchSize = Number(batchSize);
  const chunksPerBatch = requestedBatchSize >= 1
    ? Math.min(Math.floor(requestedBatchSize), chunks.length)
    : DEFAULT_BATCH_SIZE;
  const batchCount = Math.ceil(chunks.length / chunksPerBatch);

  // Loop-invariant request settings, computed once.
  const excerptText = documentExcerpt == null ? '' : String(documentExcerpt);
  const settings = {
    modelName,
    maxTokensPerRequest,
    // For large documents, use a more succinct prompt to reduce tokens
    promptTemplate: isLargeDocument ? getCompactChunkPrompt() : getDetailedChunkPrompt(),
    excerpt: `${excerptText.substring(0, MAX_EXCERPT_LENGTH)}...`
  };

  const fixedChunks = [];

  for (let batchIndex = 0; batchIndex < batchCount; batchIndex++) {
    const batchStart = batchIndex * chunksPerBatch;
    const currentBatch = chunks.slice(batchStart, batchStart + chunksPerBatch);

    console.log(`Processing batch ${batchIndex + 1}/${batchCount} with ${currentBatch.length} chunks`);

    // refineChunk never rejects (it falls back on failure), so Promise.all
    // resolves with one result per chunk, in input order.
    const batchResults = await Promise.all(
      currentBatch.map((chunk, index) => refineChunk(openai, chunk, batchStart + index, settings))
    );
    fixedChunks.push(...batchResults);

    // Explicitly run garbage collection between batches for very large
    // documents; `gc` is only defined when the runtime exposes it.
    if (isLargeDocument && typeof globalThis.gc === 'function') {
      globalThis.gc();
    }
  }

  return fixedChunks;
}