@coworker-agency/rag
Version:
Retrieval Augmented Generation (RAG) library for document indexing, vector storage, and AI-powered question answering
168 lines (140 loc) • 5.68 kB
JavaScript
/**
* Uses LLM to improve chunk quality and provide context for each chunk
*/
import OpenAI from 'openai';
/**
 * Build a short summary of a chunk without calling an LLM.
 *
 * The summary is the first 150 characters of the chunk (trimmed) followed by
 * "...". When the chunk contains a Markdown heading - ATX style ("# Title") or
 * setext style (a line underlined with "=" or "-") - the heading text is
 * prepended as "<heading>: ".
 *
 * @param {string} chunk - Text chunk. null/undefined is treated as an empty
 *   string; any other non-string value is stringified instead of throwing.
 * @param {string} documentContext - Optional document context. Accepted for
 *   API compatibility; it is not used when building the summary.
 * @returns {string} Chunk summary
 */
export function extractChunkSummary(chunk, documentContext = '') {
  // Normalise the input so a missing or non-string chunk cannot crash indexing.
  const text = chunk == null ? '' : String(chunk);

  // Leading 150 characters serve as the body of the summary.
  const chunkPreview = text.substring(0, 150).trim();

  // First ATX heading (group 1) or setext heading (group 2) anywhere in the chunk.
  const headingMatch = text.match(/^#{1,6}\s+(.+)$|^(.+)\n[=-]+\n/m);
  const heading = headingMatch ? headingMatch[1] || headingMatch[2] : '';

  return heading
    ? `${heading}: ${chunkPreview}...`
    : `${chunkPreview}...`;
}
/**
 * Produce the verbose prompt template used when refining a chunk.
 * The template carries the {{DOCUMENT_EXCERPT}} and {{CHUNK_CONTENT}}
 * placeholders, which the caller substitutes before sending the prompt.
 * @returns {string} Prompt template
 */
function getDetailedChunkPrompt() {
  // One entry per line of the prompt; joined with newlines below.
  const templateLines = [
    'You are processing a document chunk for a RAG system. The chunk is part of a larger document.',
    'Document excerpt for context:',
    '{{DOCUMENT_EXCERPT}}',
    'Current chunk content:',
    '{{CHUNK_CONTENT}}',
    'Please perform two tasks:',
    '1. Create a brief contextual summary of this chunk (1-2 sentences)',
    '2. Clean up the chunk content if needed (fix formatting, remove artifacts)',
    'Return your result as a JSON object with:',
    '- "context": The brief contextual summary',
    '- "content": The cleaned chunk content',
    'The context should help understand where this chunk fits in the document.'
  ];
  return templateLines.join('\n');
}
/**
 * Produce the short prompt template, selected for large documents.
 * It carries the same {{DOCUMENT_EXCERPT}} and {{CHUNK_CONTENT}} placeholders
 * as the detailed template, with fewer instructions.
 * @returns {string} Prompt template
 */
function getCompactChunkPrompt() {
  // One entry per line of the prompt; joined with newlines below.
  const templateLines = [
    'Document excerpt:',
    '{{DOCUMENT_EXCERPT}}',
    'Chunk:',
    '{{CHUNK_CONTENT}}',
    'Return JSON with:',
    '- "context": Brief summary (1-2 sentences)',
    '- "content": Cleaned chunk'
  ];
  return templateLines.join('\n');
}
/**
 * Substitute the document excerpt and chunk into a prompt template.
 *
 * A replacer function is used instead of a replacement string so that
 * `$`-sequences in the inserted text (e.g. "$&", "$$", "$'") are copied
 * literally. Both placeholders are filled in one pass, so placeholder-like
 * text inside the excerpt is never re-scanned and replaced.
 *
 * @param {string} promptTemplate - Template containing {{DOCUMENT_EXCERPT}} and {{CHUNK_CONTENT}}
 * @param {string} excerpt - Text inserted for {{DOCUMENT_EXCERPT}}
 * @param {string} chunk - Text inserted for {{CHUNK_CONTENT}}
 * @returns {string} Prompt with both placeholders filled
 */
function buildChunkPrompt(promptTemplate, excerpt, chunk) {
  const values = { DOCUMENT_EXCERPT: excerpt, CHUNK_CONTENT: chunk };
  return promptTemplate.replace(
    /\{\{(DOCUMENT_EXCERPT|CHUNK_CONTENT)\}\}/g,
    (_match, key) => values[key]
  );
}

/**
 * Fix chunks and add context using LLM
 *
 * Chunks are sent to the chat completions API in batches; requests within a
 * batch run concurrently and batches run one after another. Any chunk whose
 * request or response parsing fails is returned unchanged with a summary from
 * extractChunkSummary, so the result always has one entry per input chunk, in
 * input order.
 *
 * @param {Array<string>} chunks - Document chunks
 * @param {string} documentExcerpt - Excerpt of full document for context (only the first 2000 characters are sent)
 * @param {string} openaiApiKey - OpenAI API key (not needed when skipLlmRefinement is true)
 * @param {Object} options - Additional options
 * @param {string} [options.modelName='gpt-4o'] - Chat model to use
 * @param {number} [options.batchSize=10] - Chunks processed concurrently per batch; values that are not a finite number >= 1 fall back to 10
 * @param {boolean} [options.skipLlmRefinement=false] - Skip the LLM and use extractChunkSummary only
 * @param {number} [options.maxTokensPerRequest=4000] - max_tokens passed to each request
 * @param {boolean} [options.isLargeDocument=false] - Use the compact prompt and request GC between batches when available
 * @returns {Promise<Array<{context: string, content: string}>>} Array of chunks with context
 */
export async function fixChunks(chunks, documentExcerpt, openaiApiKey, options = {}) {
  const {
    modelName = 'gpt-4o',
    batchSize = 10,
    skipLlmRefinement = false,
    maxTokensPerRequest = 4000,
    isLargeDocument = false
  } = options;

  // Decide this before creating the client: the no-LLM path must not depend on
  // client construction or on an API key it never uses.
  if (skipLlmRefinement) {
    console.log('Skipping LLM refinement for large document, using basic context extraction');
    return chunks.map((chunk) => ({
      context: extractChunkSummary(chunk, documentExcerpt),
      content: chunk
    }));
  }

  // A batch size of 0 would loop forever and a negative/NaN one would silently
  // drop every chunk, so anything unusable falls back to the default.
  const effectiveBatchSize = Number.isFinite(batchSize) && batchSize >= 1
    ? Math.floor(batchSize)
    : 10;

  const openai = new OpenAI({ apiKey: openaiApiKey });

  // Loop-invariant pieces of the prompt.
  // For large documents, use a more succinct prompt to reduce tokens.
  const promptTemplate = isLargeDocument
    ? getCompactChunkPrompt()
    : getDetailedChunkPrompt();
  const excerptText = documentExcerpt == null ? '' : String(documentExcerpt);
  const promptExcerpt = `${excerptText.substring(0, 2000)}...`;

  const isNonEmptyString = (value) => typeof value === 'string' && value.length > 0;
  const fallbackFor = (chunk) => ({
    context: extractChunkSummary(chunk),
    content: chunk
  });

  const fixedChunks = [];
  const batchCount = Math.ceil(chunks.length / effectiveBatchSize);

  for (let batchIndex = 0; batchIndex < batchCount; batchIndex++) {
    const batchStart = batchIndex * effectiveBatchSize;
    const batchEnd = Math.min(batchStart + effectiveBatchSize, chunks.length);
    const currentBatch = chunks.slice(batchStart, batchEnd);
    console.log(`Processing batch ${batchIndex + 1}/${batchCount} with ${currentBatch.length} chunks`);

    // Process each chunk in the current batch concurrently.
    const batchPromises = currentBatch.map(async (chunk, index) => {
      const chunkNumber = batchStart + index;

      let response;
      try {
        response = await openai.chat.completions.create({
          model: modelName,
          messages: [
            { role: "system", content: "You are a document processing assistant that extracts context and fixes text chunks." },
            { role: "user", content: buildChunkPrompt(promptTemplate, promptExcerpt, chunk) }
          ],
          temperature: 0.3,
          max_tokens: maxTokensPerRequest,
          response_format: { type: "json_object" }
        });
      } catch (error) {
        console.warn(`Error processing chunk ${chunkNumber}, using original chunk:`, error);
        return fallbackFor(chunk);
      }

      try {
        const parsedResponse = JSON.parse(response.choices[0].message.content);
        // Only accept non-empty strings; anything else (missing field, object,
        // array, number) falls back so the documented return shape holds.
        return {
          context: isNonEmptyString(parsedResponse.context)
            ? parsedResponse.context
            : extractChunkSummary(chunk),
          content: isNonEmptyString(parsedResponse.content)
            ? parsedResponse.content
            : chunk
        };
      } catch (parseError) {
        console.warn(`Error parsing LLM response for chunk ${chunkNumber}, using original chunk:`, parseError);
        return fallbackFor(chunk);
      }
    });

    // Wait for all chunks in this batch to be processed
    const batchResults = await Promise.all(batchPromises);
    fixedChunks.push(...batchResults);

    // Request garbage collection between batches for large documents, when
    // the runtime exposes a gc() function on the global object.
    if (isLargeDocument && typeof globalThis.gc === 'function') {
      globalThis.gc();
    }
  }

  return fixedChunks;
}