rag-system-pgvector

A complete Retrieval-Augmented Generation system using pgvector, LangChain, and LangGraph for Node.js applications, with dynamic embedding and model providers, structured data queries, and chat history. Supports OpenAI, Anthropic, HuggingFace, Azure, and Google.

import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
import pdf from 'pdf-parse';
import mammoth from 'mammoth';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import https from 'https';
import http from 'http';
import { URL } from 'url';

class DocumentProcessor {
  constructor(config = {}) {
    this.config = config;
    this.embeddings = config.embeddings;
    this.textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: config.processing?.chunkSize || 1000,
      chunkOverlap: config.processing?.chunkOverlap || 200,
      separators: ['\n\n', '\n', '. ', ' ', ''],
    });
  }

  async extractTextFromFile(filePath, fileType) {
    try {
      switch (fileType.toLowerCase()) {
        case 'pdf':
          return await this.extractTextFromPDF(filePath);
        case 'docx':
        case 'doc':
          return await this.extractTextFromWord(filePath);
        case 'txt':
          return await this.extractTextFromTxt(filePath);
        case 'html':
        case 'htm':
          return await this.extractTextFromHTML(filePath);
        case 'md':
        case 'markdown':
          return await this.extractTextFromMarkdown(filePath);
        case 'json':
          return await this.extractTextFromJSONFile(filePath);
        default:
          throw new Error(`Unsupported file type: ${fileType}`);
      }
    } catch (error) {
      console.error(`Error extracting text from ${filePath}:`, error);
      throw error;
    }
  }

  async extractTextFromPDF(filePath) {
    const dataBuffer = await fs.readFile(filePath);
    const data = await pdf(dataBuffer);
    return data.text;
  }

  async extractTextFromWord(filePath) {
    const result = await mammoth.extractRawText({ path: filePath });
    return result.value;
  }

  async extractTextFromTxt(filePath) {
    return await fs.readFile(filePath, 'utf-8');
  }

  async extractTextFromHTML(filePath) {
    const htmlContent = await fs.readFile(filePath, 'utf-8');
    const $ = cheerio.load(htmlContent);
    // Remove script and style elements
    $('script, style').remove();
    // Extract text content
    return $('body').text().replace(/\s+/g, ' ').trim();
  }

  async extractTextFromMarkdown(filePath) {
    const content = await fs.readFile(filePath, 'utf-8');
    // Basic markdown processing - remove common markdown syntax
    return content
      .replace(/#{1,6}\s+/g, '') // Remove headers
      .replace(/\*\*(.*?)\*\*/g, '$1') // Remove bold
      .replace(/\*(.*?)\*/g, '$1') // Remove italic
      .replace(/\[(.*?)\]\(.*?\)/g, '$1') // Remove links, keep text
      .replace(/`{1,3}[^`]*`{1,3}/g, '') // Remove code blocks
      .replace(/^\s*[-*+]\s+/gm, '') // Remove bullet points
      .replace(/^\s*\d+\.\s+/gm, '') // Remove numbered lists
      .trim();
  }

  async extractTextFromJSONFile(filePath) {
    const content = await fs.readFile(filePath, 'utf-8');
    return this.extractTextFromJSONBuffer(Buffer.from(content, 'utf-8'));
  }

  async extractTextFromBuffer(buffer, fileType, filename = 'buffer') {
    try {
      switch (fileType.toLowerCase()) {
        case 'pdf':
          return await this.extractTextFromPDFBuffer(buffer);
        case 'docx':
        case 'doc':
          return await this.extractTextFromWordBuffer(buffer);
        case 'txt':
          return await this.extractTextFromTxtBuffer(buffer);
        case 'html':
        case 'htm':
          return await this.extractTextFromHTMLBuffer(buffer);
        case 'md':
        case 'markdown':
          return await this.extractTextFromMarkdownBuffer(buffer);
        case 'json':
          return await this.extractTextFromJSONBuffer(buffer);
        default:
          throw new Error(`Unsupported file type for buffer processing: ${fileType}`);
      }
    } catch (error) {
      console.error(`Error extracting text from buffer (${filename}):`, error);
      throw error;
    }
  }

  async extractTextFromPDFBuffer(buffer) {
    const data = await pdf(buffer);
    return data.text;
  }

  async extractTextFromWordBuffer(buffer) {
    const result = await mammoth.extractRawText({ buffer });
    return result.value;
  }

  async extractTextFromTxtBuffer(buffer) {
    return buffer.toString('utf-8');
  }

  async extractTextFromHTMLBuffer(buffer) {
    const htmlContent = buffer.toString('utf-8');
    const $ = cheerio.load(htmlContent);
    // Remove script and style elements
    $('script, style').remove();
    // Extract text content
    return $('body').text().replace(/\s+/g, ' ').trim();
  }

  async extractTextFromMarkdownBuffer(buffer) {
    const content = buffer.toString('utf-8');
    // Basic markdown processing - remove common markdown syntax
    return content
      .replace(/#{1,6}\s+/g, '') // Remove headers
      .replace(/\*\*(.*?)\*\*/g, '$1') // Remove bold
      .replace(/\*(.*?)\*/g, '$1') // Remove italic
      .replace(/\[(.*?)\]\(.*?\)/g, '$1') // Remove links, keep text
      .replace(/`{1,3}[^`]*`{1,3}/g, '') // Remove code blocks
      .replace(/^\s*[-*+]\s+/gm, '') // Remove bullet points
      .replace(/^\s*\d+\.\s+/gm, '') // Remove numbered lists
      .trim();
  }

  async extractTextFromJSONBuffer(buffer) {
    try {
      const jsonContent = JSON.parse(buffer.toString('utf-8'));
      // Extract text from JSON recursively
      const extractTextFromJSON = (obj, depth = 0) => {
        if (depth > 10) return ''; // Prevent infinite recursion
        if (typeof obj === 'string') {
          return obj + ' ';
        } else if (typeof obj === 'number' || typeof obj === 'boolean') {
          return obj.toString() + ' ';
        } else if (Array.isArray(obj)) {
          return obj.map(item => extractTextFromJSON(item, depth + 1)).join('');
        } else if (typeof obj === 'object' && obj !== null) {
          return Object.values(obj).map(value => extractTextFromJSON(value, depth + 1)).join('');
        }
        return '';
      };
      return extractTextFromJSON(jsonContent).trim();
    } catch (error) {
      throw new Error(`Invalid JSON format: ${error.message}`);
    }
  }

  async splitTextIntoChunks(text, metadata = {}) {
    const chunks = await this.textSplitter.splitText(text);
    return chunks.map((chunk, index) => ({
      content: chunk.trim(),
      index,
      metadata: {
        ...metadata,
        chunkIndex: index,
        chunkLength: chunk.length,
      },
    }));
  }

  async generateEmbeddings(texts) {
    try {
      const embeddings = await this.embeddings.embedDocuments(texts);
      return embeddings;
    } catch (error) {
      console.error('Error generating embeddings:', error);
      throw error;
    }
  }

  async generateSingleEmbedding(text) {
    try {
      const embedding = await this.embeddings.embedQuery(text);
      return embedding;
    } catch (error) {
      console.error('Error generating single embedding:', error);
      throw error;
    }
  }

  getFileType(filePath) {
    return path.extname(filePath).slice(1).toLowerCase();
  }

  getFileTypeFromUrl(url) {
    try {
      const urlObj = new URL(url);
      const pathname = urlObj.pathname;
      const extension = path.extname(pathname).slice(1).toLowerCase();
      // If no extension found, try to determine from Content-Type header later
      return extension || null;
    } catch (error) {
      return null;
    }
  }

  async downloadFile(url, options = {}) {
    const {
      timeout = 30000,
      maxSize = 50 * 1024 * 1024, // 50MB limit
      userAgent = 'DocumentProcessor/1.0'
    } = options;

    return new Promise((resolve, reject) => {
      const urlObj = new URL(url);
      const protocol = urlObj.protocol === 'https:' ? https : http;

      const request = protocol.get(url, {
        timeout,
        headers: {
          'User-Agent': userAgent,
          'Accept': '*/*'
        }
      }, (response) => {
        // Check response status
        if (response.statusCode < 200 || response.statusCode >= 300) {
          reject(new Error(`HTTP ${response.statusCode}: ${response.statusMessage}`));
          return;
        }

        // Check content length
        const contentLength = parseInt(response.headers['content-length']);
        if (contentLength && contentLength > maxSize) {
          reject(new Error(`File too large: ${contentLength} bytes (max: ${maxSize})`));
          return;
        }

        const chunks = [];
        let totalSize = 0;

        response.on('data', (chunk) => {
          totalSize += chunk.length;
          if (totalSize > maxSize) {
            reject(new Error(`File too large: ${totalSize} bytes (max: ${maxSize})`));
            return;
          }
          chunks.push(chunk);
        });

        response.on('end', () => {
          const buffer = Buffer.concat(chunks);
          const contentType = response.headers['content-type'] || '';
          resolve({
            buffer,
            contentType,
            size: totalSize,
            filename: this.getFilenameFromUrl(url) || 'downloaded-file',
            url
          });
        });

        response.on('error', (error) => {
          reject(error);
        });
      });

      request.on('timeout', () => {
        request.destroy();
        reject(new Error(`Download timeout after ${timeout}ms`));
      });

      request.on('error', (error) => {
        reject(error);
      });

      request.setTimeout(timeout);
    });
  }

  getFilenameFromUrl(url) {
    try {
      const urlObj = new URL(url);
      const pathname = urlObj.pathname;
      return path.basename(pathname) || null;
    } catch (error) {
      return null;
    }
  }

  getFileTypeFromContentType(contentType) {
    const contentTypeMap = {
      'application/pdf': 'pdf',
      'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
      'application/msword': 'doc',
      'text/plain': 'txt',
      'text/html': 'html',
      'text/markdown': 'md',
      'application/json': 'json',
      'text/csv': 'csv'
    };
    const baseType = contentType.split(';')[0].trim().toLowerCase();
    return contentTypeMap[baseType] || null;
  }

  async processDocumentFromUrl(url, options = {}) {
    try {
      console.log(`Processing document from URL: ${url}`);
      const {
        title = null,
        downloadOptions = {},
        tempDir = './temp'
      } = options;

      // Download the file
      const downloadResult = await this.downloadFile(url, downloadOptions);

      // Determine file type
      let fileType = this.getFileTypeFromUrl(url);
      if (!fileType) {
        fileType = this.getFileTypeFromContentType(downloadResult.contentType);
      }
      if (!fileType) {
        throw new Error(`Cannot determine file type for URL: ${url}`);
      }

      // Create temp directory if it doesn't exist
      try {
        await fs.mkdir(tempDir, { recursive: true });
      } catch (error) {
        // Directory might already exist
      }

      // Create temporary file
      const filename = downloadResult.filename || `temp-${Date.now()}.${fileType}`;
      const tempFilePath = path.join(tempDir, filename);

      try {
        // Write buffer to temporary file
        await fs.writeFile(tempFilePath, downloadResult.buffer);

        // Process the temporary file
        const documentTitle = title || filename.replace(/\.[^/.]+$/, '');
        const result = await this.processDocument(tempFilePath, documentTitle);

        // Add URL metadata
        result.metadata = {
          ...result.metadata,
          sourceUrl: url,
          downloadedAt: new Date().toISOString(),
          contentType: downloadResult.contentType,
          downloadSize: downloadResult.size
        };

        // Update filePath to show it came from URL
        result.filePath = url;

        console.log(`✓ Processed document from URL: ${url}`);
        return result;
      } finally {
        // Clean up temporary file
        try {
          await fs.unlink(tempFilePath);
        } catch (error) {
          console.warn(`Could not delete temporary file: ${tempFilePath}`);
        }
      }
    } catch (error) {
      console.error(`Error processing document from URL ${url}:`, error);
      throw error;
    }
  }

  async processDocumentsFromUrls(urls, options = {}) {
    try {
      const {
        concurrency = 2, // Lower concurrency for URL downloads
        skipErrors = true,
        onProgress = null,
        downloadOptions = {},
        tempDir = './temp',
        metadata = {}
      } = options;

      console.log(`Processing ${urls.length} documents from URLs...`);
      const results = [];
      let processed = 0;

      // Process URLs in batches
      for (let i = 0; i < urls.length; i += concurrency) {
        const batch = urls.slice(i, i + concurrency);
        const batchPromises = batch.map(async (url, batchIndex) => {
          try {
            const globalIndex = i + batchIndex;
            const title = options.titles?.[globalIndex] || null;
            const result = await this.processDocumentFromUrl(url, {
              title,
              downloadOptions,
              tempDir
            });

            // Merge additional metadata
            result.metadata = {
              ...result.metadata,
              ...metadata,
              batchIndex: Math.floor(globalIndex / concurrency),
              globalIndex
            };

            processed++;
            if (onProgress) {
              onProgress({
                processed,
                total: urls.length,
                current: url,
                progress: (processed / urls.length) * 100
              });
            }
            return { success: true, url, result };
          } catch (error) {
            const errorInfo = {
              success: false,
              url,
              error: error.message,
              stack: error.stack
            };
            if (skipErrors) {
              console.warn(`⚠️ Skipping ${url}: ${error.message}`);
              processed++;
              if (onProgress) {
                onProgress({
                  processed,
                  total: urls.length,
                  current: url,
                  progress: (processed / urls.length) * 100,
                  error: error.message
                });
              }
              return errorInfo;
            } else {
              throw error;
            }
          }
        });

        const batchResults = await Promise.all(batchPromises);
        results.push(...batchResults);
      }

      const successful = results.filter(r => r.success).map(r => r.result);
      const failed = results.filter(r => !r.success);

      console.log(`✓ Successfully processed ${successful.length}/${urls.length} documents from URLs`);
      if (failed.length > 0) {
        console.log(`⚠️ Failed to process ${failed.length} URLs`);
      }

      return {
        documents: successful,
        errors: failed,
        summary: {
          total: urls.length,
          successful: successful.length,
          failed: failed.length,
          totalChunks: successful.reduce((sum, doc) => sum + doc.chunks.length, 0)
        }
      };
    } catch (error) {
      console.error('Error processing multiple documents from URLs:', error);
      throw error;
    }
  }

  async processDocument(filePath, title = null) {
    try {
      console.log(`Processing document: ${filePath}`);
      const fileType = this.getFileType(filePath);
      const documentTitle = title || path.basename(filePath, path.extname(filePath));

      // Extract text from file
      const text = await this.extractTextFromFile(filePath, fileType);
      if (!text || text.trim().length === 0) {
        throw new Error('No text content found in document');
      }

      // Split text into chunks
      const chunks = await this.splitTextIntoChunks(text, {
        fileName: path.basename(filePath),
        fileType,
        filePath,
      });
      if (chunks.length === 0) {
        throw new Error('No chunks generated from document');
      }

      // Generate embeddings for all chunks
      const chunkTexts = chunks.map(chunk => chunk.content);
      const embeddings = await this.generateEmbeddings(chunkTexts);

      // Combine chunks with their embeddings
      const processedChunks = chunks.map((chunk, index) => ({
        ...chunk,
        embedding: embeddings[index],
      }));

      console.log(`✓ Processed ${chunks.length} chunks from ${filePath}`);
      return {
        title: documentTitle,
        content: text,
        filePath,
        fileType,
        chunks: processedChunks,
        metadata: {
          originalLength: text.length,
          chunkCount: chunks.length,
          fileType,
          processedAt: new Date().toISOString(),
        },
      };
    } catch (error) {
      console.error(`Error processing document ${filePath}:`, error);
      throw error;
    }
  }

  async processDocumentFromBuffer(buffer, fileType, options = {}) {
    try {
      const {
        title = null,
        filename = `buffer-${Date.now()}`,
        metadata = {}
      } = options;

      console.log(`Processing document from buffer (${filename}, type: ${fileType})`);

      if (!buffer || buffer.length === 0) {
        throw new Error('Buffer is empty or null');
      }
      if (!fileType) {
        throw new Error('File type must be specified for buffer processing');
      }

      // Extract text from buffer
      const text = await this.extractTextFromBuffer(buffer, fileType, filename);
      if (!text || text.trim().length === 0) {
        throw new Error('No text content found in buffer');
      }

      // Split text into chunks
      const chunks = await this.splitTextIntoChunks(text, {
        fileName: filename,
        fileType: fileType.toLowerCase(),
        source: 'buffer',
        ...metadata
      });
      if (chunks.length === 0) {
        throw new Error('No chunks generated from buffer');
      }

      // Generate embeddings for all chunks
      const chunkTexts = chunks.map(chunk => chunk.content);
      const embeddings = await this.generateEmbeddings(chunkTexts);

      // Combine chunks with their embeddings
      const processedChunks = chunks.map((chunk, index) => ({
        ...chunk,
        embedding: embeddings[index],
      }));

      const documentTitle = title || filename.replace(/\.[^/.]+$/, '');
      console.log(`✓ Processed ${chunks.length} chunks from buffer (${filename})`);

      return {
        title: documentTitle,
        content: text,
        filePath: filename,
        fileType: fileType.toLowerCase(),
        chunks: processedChunks,
        metadata: {
          originalLength: text.length,
          chunkCount: chunks.length,
          fileType: fileType.toLowerCase(),
          source: 'buffer',
          processedAt: new Date().toISOString(),
          bufferSize: buffer.length,
          ...metadata
        },
      };
    } catch (error) {
      console.error(`Error processing buffer (${options.filename || 'unknown'}):`, error);
      throw error;
    }
  }

  async processDocumentsFromBuffers(buffers, options = {}) {
    try {
      const {
        concurrency = 3,
        skipErrors = true,
        onProgress = null,
        metadata = {}
      } = options;

      console.log(`Processing ${buffers.length} documents from buffers...`);
      const results = [];
      let processed = 0;

      // Validate input
      if (!Array.isArray(buffers)) {
        throw new Error('Buffers must be an array');
      }

      // Process buffers in batches
      for (let i = 0; i < buffers.length; i += concurrency) {
        const batch = buffers.slice(i, i + concurrency);
        const batchPromises = batch.map(async (bufferInfo, batchIndex) => {
          // Declared outside the try block so the catch block can reference it
          const globalIndex = i + batchIndex;
          try {
            // Extract buffer info
            let buffer, fileType, title, filename, bufferMetadata;
            if (Buffer.isBuffer(bufferInfo)) {
              // Simple buffer without metadata
              throw new Error(`Buffer at index ${globalIndex} requires fileType. Use {buffer, fileType} format.`);
            } else if (typeof bufferInfo === 'object') {
              // Buffer with metadata
              buffer = bufferInfo.buffer;
              fileType = bufferInfo.fileType;
              title = bufferInfo.title;
              filename = bufferInfo.filename || `buffer-${globalIndex}`;
              bufferMetadata = bufferInfo.metadata || {};
            } else {
              throw new Error(`Invalid buffer format at index ${globalIndex}`);
            }

            if (!Buffer.isBuffer(buffer)) {
              throw new Error(`Invalid buffer at index ${globalIndex}`);
            }
            if (!fileType) {
              throw new Error(`FileType required for buffer at index ${globalIndex}`);
            }

            const result = await this.processDocumentFromBuffer(buffer, fileType, {
              title,
              filename,
              metadata: {
                ...metadata,
                ...bufferMetadata,
                batchIndex: Math.floor(globalIndex / concurrency),
                globalIndex
              }
            });

            processed++;
            if (onProgress) {
              onProgress({
                processed,
                total: buffers.length,
                current: filename,
                progress: (processed / buffers.length) * 100
              });
            }
            return { success: true, filename, result };
          } catch (error) {
            const errorInfo = {
              success: false,
              filename: bufferInfo.filename || `buffer-${globalIndex}`,
              error: error.message,
              stack: error.stack
            };
            if (skipErrors) {
              console.warn(`⚠️ Skipping buffer ${errorInfo.filename}: ${error.message}`);
              processed++;
              if (onProgress) {
                onProgress({
                  processed,
                  total: buffers.length,
                  current: errorInfo.filename,
                  progress: (processed / buffers.length) * 100,
                  error: error.message
                });
              }
              return errorInfo;
            } else {
              throw error;
            }
          }
        });

        const batchResults = await Promise.all(batchPromises);
        results.push(...batchResults);
      }

      const successful = results.filter(r => r.success).map(r => r.result);
      const failed = results.filter(r => !r.success);

      console.log(`✓ Successfully processed ${successful.length}/${buffers.length} documents from buffers`);
      if (failed.length > 0) {
        console.log(`⚠️ Failed to process ${failed.length} buffers`);
      }

      return {
        documents: successful,
        errors: failed,
        summary: {
          total: buffers.length,
          successful: successful.length,
          failed: failed.length,
          totalChunks: successful.reduce((sum, doc) => sum + doc.chunks.length, 0)
        }
      };
    } catch (error) {
      console.error('Error processing multiple documents from buffers:', error);
      throw error;
    }
  }

  async processDocuments(filePaths, options = {}) {
    try {
      const {
        concurrency = 3, // Process up to 3 documents concurrently
        skipErrors = true, // Continue processing if individual files fail
        onProgress = null, // Progress callback function
        metadata = {} // Additional metadata for all documents
      } = options;

      console.log(`Processing ${filePaths.length} documents...`);
      const results = [];
      const errors = [];
      let processed = 0;

      // Process documents in batches to control concurrency
      for (let i = 0; i < filePaths.length; i += concurrency) {
        const batch = filePaths.slice(i, i + concurrency);
        const batchPromises = batch.map(async (filePath, batchIndex) => {
          try {
            const globalIndex = i + batchIndex;
            const title = options.titles?.[globalIndex] || null;

            // Add global metadata to document-specific metadata
            const docMetadata = {
              ...metadata,
              batchIndex: Math.floor(globalIndex / concurrency),
              globalIndex
            };

            const result = await this.processDocument(filePath, title);

            // Merge additional metadata
            result.metadata = {
              ...result.metadata,
              ...docMetadata
            };

            processed++;
            // Call progress callback if provided
            if (onProgress) {
              onProgress({
                processed,
                total: filePaths.length,
                current: filePath,
                progress: (processed / filePaths.length) * 100
              });
            }
            return { success: true, filePath, result };
          } catch (error) {
            const errorInfo = {
              success: false,
              filePath,
              error: error.message,
              stack: error.stack
            };
            if (skipErrors) {
              console.warn(`⚠️ Skipping ${filePath}: ${error.message}`);
              errors.push(errorInfo);
              processed++;
              if (onProgress) {
                onProgress({
                  processed,
                  total: filePaths.length,
                  current: filePath,
                  progress: (processed / filePaths.length) * 100,
                  error: error.message
                });
              }
              return errorInfo;
            } else {
              throw error;
            }
          }
        });

        const batchResults = await Promise.all(batchPromises);
        results.push(...batchResults);
      }

      // Separate successful results from errors
      const successful = results.filter(r => r.success).map(r => r.result);
      const failed = results.filter(r => !r.success);

      console.log(`✓ Successfully processed ${successful.length}/${filePaths.length} documents`);
      if (failed.length > 0) {
        console.log(`⚠️ Failed to process ${failed.length} documents`);
      }

      return {
        documents: successful,
        errors: failed,
        summary: {
          total: filePaths.length,
          successful: successful.length,
          failed: failed.length,
          totalChunks: successful.reduce((sum, doc) => sum + doc.chunks.length, 0)
        }
      };
    } catch (error) {
      console.error('Error processing multiple documents:', error);
      throw error;
    }
  }
}

export { DocumentProcessor };
export default DocumentProcessor;
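
A minimal usage sketch, not part of the package source: it assumes this module is saved as documentProcessor.js, that OPENAI_API_KEY is set for the OpenAIEmbeddings provider, and illustrative file paths and URLs. Any embeddings object exposing embedDocuments() and embedQuery() (the interface the class calls above) can be substituted.

import { OpenAIEmbeddings } from '@langchain/openai';
import { DocumentProcessor } from './documentProcessor.js'; // illustrative path

// The constructor only needs an embeddings provider plus optional chunking config.
const processor = new DocumentProcessor({
  embeddings: new OpenAIEmbeddings(), // reads OPENAI_API_KEY from the environment
  processing: { chunkSize: 1000, chunkOverlap: 200 },
});

// Local file: returns { title, content, chunks: [{ content, embedding, metadata }], metadata }.
const doc = await processor.processDocument('./reports/q3.pdf', 'Q3 Report');
console.log(doc.metadata.chunkCount);

// Remote files: downloaded to a temp dir, type inferred from the URL extension or Content-Type.
const { documents, errors, summary } = await processor.processDocumentsFromUrls(
  ['https://example.com/handbook.pdf'],
  { concurrency: 2, onProgress: (p) => console.log(`${p.progress.toFixed(0)}%`) }
);
console.log(summary);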