rag-system-pgvector
A complete Retrieval-Augmented Generation (RAG) system using pgvector, LangChain, and LangGraph for Node.js applications, with dynamic embedding and model providers, structured data queries, and chat history. Supports OpenAI, Anthropic, HuggingFace, Azure, and Google providers.
import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters';
import pdf from 'pdf-parse';
import mammoth from 'mammoth';
import * as cheerio from 'cheerio';
import fs from 'fs/promises';
import path from 'path';
import https from 'https';
import http from 'http';
import { URL } from 'url';
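// DocumentProcessor extracts text from local files, remote URLs, or in-memory
// buffers, splits it into overlapping chunks, and embeds each chunk so the
// result can be stored in pgvector.
//
// Construction sketch (an assumption, not the only option): any LangChain-style
// embeddings object exposing embedDocuments()/embedQuery() works; OpenAIEmbeddings
// is shown purely as an example provider.
//
//   import { OpenAIEmbeddings } from '@langchain/openai';
//   const processor = new DocumentProcessor({
//     embeddings: new OpenAIEmbeddings({ model: 'text-embedding-3-small' }),
//     processing: { chunkSize: 1000, chunkOverlap: 200 },
//   });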
class DocumentProcessor {
constructor(config = {}) {
this.config = config;
this.embeddings = config.embeddings;
this.textSplitter = new RecursiveCharacterTextSplitter({
chunkSize: config.processing?.chunkSize || 1000,
chunkOverlap: config.processing?.chunkOverlap || 200,
separators: ['\n\n', '\n', '. ', ' ', ''],
});
}
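// Dispatch on file extension. Supported types: pdf, docx/doc, txt, csv,
// html/htm, md/markdown, json; anything else throws rather than silently
// returning empty text.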
async extractTextFromFile(filePath, fileType) {
try {
switch (fileType.toLowerCase()) {
case 'pdf':
return await this.extractTextFromPDF(filePath);
case 'docx':
case 'doc':
return await this.extractTextFromWord(filePath);
case 'csv': // treat CSV as plain text (getFileTypeFromContentType can yield 'csv')
case 'txt':
return await this.extractTextFromTxt(filePath);
case 'html':
case 'htm':
return await this.extractTextFromHTML(filePath);
case 'md':
case 'markdown':
return await this.extractTextFromMarkdown(filePath);
case 'json':
return await this.extractTextFromJSONFile(filePath);
default:
throw new Error(`Unsupported file type: ${fileType}`);
}
} catch (error) {
console.error(`Error extracting text from ${filePath}:`, error);
throw error;
}
}
async extractTextFromPDF(filePath) {
const dataBuffer = await fs.readFile(filePath);
const data = await pdf(dataBuffer);
return data.text;
}
async extractTextFromWord(filePath) {
const result = await mammoth.extractRawText({ path: filePath });
return result.value;
}
async extractTextFromTxt(filePath) {
return await fs.readFile(filePath, 'utf-8');
}
async extractTextFromHTML(filePath) {
const htmlContent = await fs.readFile(filePath, 'utf-8');
const $ = cheerio.load(htmlContent);
// Remove script and style elements
$('script, style').remove();
// Extract text content
return $('body').text().replace(/\s+/g, ' ').trim();
}
async extractTextFromMarkdown(filePath) {
const content = await fs.readFile(filePath, 'utf-8');
// Basic markdown processing - remove common markdown syntax
return content
.replace(/#{1,6}\s+/g, '') // Remove headers
.replace(/\*\*(.*?)\*\*/g, '$1') // Remove bold
.replace(/\*(.*?)\*/g, '$1') // Remove italic
.replace(/\[(.*?)\]\(.*?\)/g, '$1') // Remove links, keep text
.replace(/`{1,3}[^`]*`{1,3}/g, '') // Remove code blocks
.replace(/^\s*[-*+]\s+/gm, '') // Remove bullet points
.replace(/^\s*\d+\.\s+/gm, '') // Remove numbered lists
.trim();
}
async extractTextFromJSONFile(filePath) {
const content = await fs.readFile(filePath, 'utf-8');
return this.extractTextFromJSONBuffer(Buffer.from(content, 'utf-8'));
}
async extractTextFromBuffer(buffer, fileType, filename = 'buffer') {
try {
switch (fileType.toLowerCase()) {
case 'pdf':
return await this.extractTextFromPDFBuffer(buffer);
case 'docx':
case 'doc':
return await this.extractTextFromWordBuffer(buffer);
case 'csv': // keep parity with the file-path variant above
case 'txt':
return await this.extractTextFromTxtBuffer(buffer);
case 'html':
case 'htm':
return await this.extractTextFromHTMLBuffer(buffer);
case 'md':
case 'markdown':
return await this.extractTextFromMarkdownBuffer(buffer);
case 'json':
return await this.extractTextFromJSONBuffer(buffer);
default:
throw new Error(`Unsupported file type for buffer processing: ${fileType}`);
}
} catch (error) {
console.error(`Error extracting text from buffer (${filename}):`, error);
throw error;
}
}
async extractTextFromPDFBuffer(buffer) {
const data = await pdf(buffer);
return data.text;
}
async extractTextFromWordBuffer(buffer) {
const result = await mammoth.extractRawText({ buffer });
return result.value;
}
async extractTextFromTxtBuffer(buffer) {
return buffer.toString('utf-8');
}
async extractTextFromHTMLBuffer(buffer) {
const htmlContent = buffer.toString('utf-8');
const $ = cheerio.load(htmlContent);
// Remove script and style elements
$('script, style').remove();
// Extract text content
return $('body').text().replace(/\s+/g, ' ').trim();
}
async extractTextFromMarkdownBuffer(buffer) {
const content = buffer.toString('utf-8');
// Basic markdown processing - remove common markdown syntax
return content
.replace(/#{1,6}\s+/g, '') // Remove headers
.replace(/\*\*(.*?)\*\*/g, '$1') // Remove bold
.replace(/\*(.*?)\*/g, '$1') // Remove italic
.replace(/\[(.*?)\]\(.*?\)/g, '$1') // Remove links, keep text
.replace(/`{1,3}[^`]*`{1,3}/g, '') // Remove code blocks
.replace(/^\s*[-*+]\s+/gm, '') // Remove bullet points
.replace(/^\s*\d+\.\s+/gm, '') // Remove numbered lists
.trim();
}
async extractTextFromJSONBuffer(buffer) {
try {
const jsonContent = JSON.parse(buffer.toString('utf-8'));
// Extract text from JSON recursively
const extractTextFromJSON = (obj, depth = 0) => {
if (depth > 10) return ''; // Cap recursion depth for very deeply nested JSON
if (typeof obj === 'string') {
return obj + ' ';
} else if (typeof obj === 'number' || typeof obj === 'boolean') {
return obj.toString() + ' ';
} else if (Array.isArray(obj)) {
return obj.map(item => extractTextFromJSON(item, depth + 1)).join('');
} else if (typeof obj === 'object' && obj !== null) {
return Object.values(obj).map(value => extractTextFromJSON(value, depth + 1)).join('');
}
return '';
};
return extractTextFromJSON(jsonContent).trim();
} catch (error) {
throw new Error(`Invalid JSON format: ${error.message}`);
}
}
async splitTextIntoChunks(text, metadata = {}) {
const chunks = await this.textSplitter.splitText(text);
return chunks.map((chunk, index) => ({
content: chunk.trim(),
index,
metadata: {
...metadata,
chunkIndex: index,
chunkLength: chunk.length,
},
}));
}
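// The two embedding helpers below delegate to the injected provider. Expected
// contract (the LangChain Embeddings interface):
//   embedDocuments(texts: string[]) -> Promise<number[][]>  // batched, for chunks
//   embedQuery(text: string)        -> Promise<number[]>    // single, for queries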
async generateEmbeddings(texts) {
try {
const embeddings = await this.embeddings.embedDocuments(texts);
return embeddings;
} catch (error) {
console.error('Error generating embeddings:', error);
throw error;
}
}
async generateSingleEmbedding(text) {
try {
const embedding = await this.embeddings.embedQuery(text);
return embedding;
} catch (error) {
console.error('Error generating single embedding:', error);
throw error;
}
}
getFileType(filePath) {
return path.extname(filePath).slice(1).toLowerCase();
}
getFileTypeFromUrl(url) {
try {
const urlObj = new URL(url);
const pathname = urlObj.pathname;
const extension = path.extname(pathname).slice(1).toLowerCase();
// If no extension found, try to determine from Content-Type header later
return extension || null;
} catch (error) {
return null;
}
}
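// Minimal https/http download with size and timeout guards; redirects are not
// followed. Usage sketch with assumed values:
//
//   const { buffer, contentType, size } = await processor.downloadFile(
//     'https://example.com/report.pdf',
//     { timeout: 15000, maxSize: 10 * 1024 * 1024 }
//   );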
async downloadFile(url, options = {}) {
const {
timeout = 30000,
maxSize = 50 * 1024 * 1024, // 50MB limit
userAgent = 'DocumentProcessor/1.0'
} = options;
return new Promise((resolve, reject) => {
const urlObj = new URL(url);
const protocol = urlObj.protocol === 'https:' ? https : http;
const request = protocol.get(url, {
timeout,
headers: {
'User-Agent': userAgent,
'Accept': '*/*'
}
}, (response) => {
// Check response status (3xx redirects are not followed)
if (response.statusCode < 200 || response.statusCode >= 300) {
reject(new Error(`HTTP ${response.statusCode}: ${response.statusMessage}`));
return;
}
// Check content length
const contentLength = parseInt(response.headers['content-length'], 10);
if (contentLength && contentLength > maxSize) {
reject(new Error(`File too large: ${contentLength} bytes (max: ${maxSize})`));
return;
}
const chunks = [];
let totalSize = 0;
response.on('data', (chunk) => {
totalSize += chunk.length;
if (totalSize > maxSize) {
// Abort the transfer; without this the download keeps streaming after reject
request.destroy();
reject(new Error(`File too large: ${totalSize} bytes (max: ${maxSize})`));
return;
}
chunks.push(chunk);
});
response.on('end', () => {
const buffer = Buffer.concat(chunks);
const contentType = response.headers['content-type'] || '';
resolve({
buffer,
contentType,
size: totalSize,
filename: this.getFilenameFromUrl(url) || 'downloaded-file',
url
});
});
response.on('error', (error) => {
reject(error);
});
});
request.on('timeout', () => {
request.destroy();
reject(new Error(`Download timeout after ${timeout}ms`));
});
request.on('error', (error) => {
reject(error);
});
request.setTimeout(timeout);
});
}
getFilenameFromUrl(url) {
try {
const urlObj = new URL(url);
const pathname = urlObj.pathname;
return path.basename(pathname) || null;
} catch (error) {
return null;
}
}
getFileTypeFromContentType(contentType) {
const contentTypeMap = {
'application/pdf': 'pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
'application/msword': 'doc',
'text/plain': 'txt',
'text/html': 'html',
'text/markdown': 'md',
'application/json': 'json',
'text/csv': 'csv'
};
const baseType = contentType.split(';')[0].trim().toLowerCase();
return contentTypeMap[baseType] || null;
}
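// Usage sketch (the URL is an assumed example): downloads the file, infers the
// type from the URL extension or the Content-Type header, processes it via a
// temp file, and cleans up afterwards.
//
//   const doc = await processor.processDocumentFromUrl(
//     'https://example.com/whitepaper.pdf',
//     { title: 'Whitepaper', tempDir: './temp' }
//   );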
async processDocumentFromUrl(url, options = {}) {
try {
console.log(`Processing document from URL: ${url}`);
const {
title = null,
downloadOptions = {},
tempDir = './temp'
} = options;
// Download the file
const downloadResult = await this.downloadFile(url, downloadOptions);
// Determine file type
let fileType = this.getFileTypeFromUrl(url);
if (!fileType) {
fileType = this.getFileTypeFromContentType(downloadResult.contentType);
}
if (!fileType) {
throw new Error(`Cannot determine file type for URL: ${url}`);
}
// Ensure the temp directory exists (recursive mkdir succeeds if it already does)
await fs.mkdir(tempDir, { recursive: true });
// Create the temporary file; make sure it carries the detected extension,
// since processDocument re-derives the file type from the path
let filename = downloadResult.filename || `temp-${Date.now()}.${fileType}`;
if (this.getFileType(filename) !== fileType) {
filename = `${filename}.${fileType}`;
}
const tempFilePath = path.join(tempDir, filename);
try {
// Write buffer to temporary file
await fs.writeFile(tempFilePath, downloadResult.buffer);
// Process the temporary file
const documentTitle = title || filename.replace(/\.[^/.]+$/, '');
const result = await this.processDocument(tempFilePath, documentTitle);
// Add URL metadata
result.metadata = {
...result.metadata,
sourceUrl: url,
downloadedAt: new Date().toISOString(),
contentType: downloadResult.contentType,
downloadSize: downloadResult.size
};
// Update filePath to show it came from URL
result.filePath = url;
console.log(`✓ Processed document from URL: ${url}`);
return result;
} finally {
// Clean up temporary file
try {
await fs.unlink(tempFilePath);
} catch (error) {
console.warn(`Could not delete temporary file: ${tempFilePath}`);
}
}
} catch (error) {
console.error(`Error processing document from URL ${url}:`, error);
throw error;
}
}
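// Batch URL variant. A sketch of the call shape, including the onProgress
// payload this method emits:
//
//   const { documents, errors, summary } = await processor.processDocumentsFromUrls(
//     ['https://example.com/a.pdf', 'https://example.com/b.html'],
//     {
//       concurrency: 2,
//       onProgress: ({ processed, total, current }) =>
//         console.log(`${processed}/${total}: ${current}`),
//     }
//   );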
async processDocumentsFromUrls(urls, options = {}) {
try {
const {
concurrency = 2, // Lower concurrency for URL downloads
skipErrors = true,
onProgress = null,
downloadOptions = {},
tempDir = './temp',
metadata = {}
} = options;
console.log(`Processing ${urls.length} documents from URLs...`);
const results = [];
let processed = 0;
// Process URLs in batches
for (let i = 0; i < urls.length; i += concurrency) {
const batch = urls.slice(i, i + concurrency);
const batchPromises = batch.map(async (url, batchIndex) => {
try {
const globalIndex = i + batchIndex;
const title = options.titles?.[globalIndex] || null;
const result = await this.processDocumentFromUrl(url, {
title,
downloadOptions,
tempDir
});
// Merge additional metadata
result.metadata = {
...result.metadata,
...metadata,
batchIndex: Math.floor(globalIndex / concurrency),
globalIndex
};
processed++;
if (onProgress) {
onProgress({
processed,
total: urls.length,
current: url,
progress: (processed / urls.length) * 100
});
}
return { success: true, url, result };
} catch (error) {
const errorInfo = {
success: false,
url,
error: error.message,
stack: error.stack
};
if (skipErrors) {
console.warn(`⚠️ Skipping ${url}: ${error.message}`);
processed++;
if (onProgress) {
onProgress({
processed,
total: urls.length,
current: url,
progress: (processed / urls.length) * 100,
error: error.message
});
}
return errorInfo;
} else {
throw error;
}
}
});
const batchResults = await Promise.all(batchPromises);
results.push(...batchResults);
}
const successful = results.filter(r => r.success).map(r => r.result);
const failed = results.filter(r => !r.success);
console.log(`✓ Successfully processed ${successful.length}/${urls.length} documents from URLs`);
if (failed.length > 0) {
console.log(`⚠️ Failed to process ${failed.length} URLs`);
}
return {
documents: successful,
errors: failed,
summary: {
total: urls.length,
successful: successful.length,
failed: failed.length,
totalChunks: successful.reduce((sum, doc) => sum + doc.chunks.length, 0)
}
};
} catch (error) {
console.error('Error processing multiple documents from URLs:', error);
throw error;
}
}
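// Core single-file pipeline: extract text -> split into chunks -> embed each
// chunk -> return a document record. The returned shape (title, content,
// chunks[{ content, embedding, metadata }], metadata) is what a pgvector
// persistence layer would store.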
async processDocument(filePath, title = null) {
try {
console.log(`Processing document: ${filePath}`);
const fileType = this.getFileType(filePath);
const documentTitle = title || path.basename(filePath, path.extname(filePath));
// Extract text from file
const text = await this.extractTextFromFile(filePath, fileType);
if (!text || text.trim().length === 0) {
throw new Error('No text content found in document');
}
// Split text into chunks
const chunks = await this.splitTextIntoChunks(text, {
fileName: path.basename(filePath),
fileType,
filePath,
});
if (chunks.length === 0) {
throw new Error('No chunks generated from document');
}
// Generate embeddings for all chunks
const chunkTexts = chunks.map(chunk => chunk.content);
const embeddings = await this.generateEmbeddings(chunkTexts);
// Combine chunks with their embeddings
const processedChunks = chunks.map((chunk, index) => ({
...chunk,
embedding: embeddings[index],
}));
console.log(`✓ Processed ${chunks.length} chunks from ${filePath}`);
return {
title: documentTitle,
content: text,
filePath,
fileType,
chunks: processedChunks,
metadata: {
originalLength: text.length,
chunkCount: chunks.length,
fileType,
processedAt: new Date().toISOString(),
},
};
} catch (error) {
console.error(`Error processing document ${filePath}:`, error);
throw error;
}
}
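// Buffer variant for in-memory sources such as HTTP uploads. Sketch assuming
// an Express + multer upload handler; req.file and its fields come from
// multer, not from this class:
//
//   const doc = await processor.processDocumentFromBuffer(
//     req.file.buffer,
//     processor.getFileType(req.file.originalname),
//     { filename: req.file.originalname }
//   );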
async processDocumentFromBuffer(buffer, fileType, options = {}) {
try {
const {
title = null,
filename = `buffer-${Date.now()}`,
metadata = {}
} = options;
console.log(`Processing document from buffer (${filename}, type: ${fileType})`);
if (!buffer || buffer.length === 0) {
throw new Error('Buffer is empty or null');
}
if (!fileType) {
throw new Error('File type must be specified for buffer processing');
}
// Extract text from buffer
const text = await this.extractTextFromBuffer(buffer, fileType, filename);
if (!text || text.trim().length === 0) {
throw new Error('No text content found in buffer');
}
// Split text into chunks
const chunks = await this.splitTextIntoChunks(text, {
fileName: filename,
fileType: fileType.toLowerCase(),
source: 'buffer',
...metadata
});
if (chunks.length === 0) {
throw new Error('No chunks generated from buffer');
}
// Generate embeddings for all chunks
const chunkTexts = chunks.map(chunk => chunk.content);
const embeddings = await this.generateEmbeddings(chunkTexts);
// Combine chunks with their embeddings
const processedChunks = chunks.map((chunk, index) => ({
...chunk,
embedding: embeddings[index],
}));
const documentTitle = title || filename.replace(/\.[^/.]+$/, '');
console.log(`✓ Processed ${chunks.length} chunks from buffer (${filename})`);
return {
title: documentTitle,
content: text,
filePath: filename,
fileType: fileType.toLowerCase(),
chunks: processedChunks,
metadata: {
originalLength: text.length,
chunkCount: chunks.length,
fileType: fileType.toLowerCase(),
source: 'buffer',
processedAt: new Date().toISOString(),
bufferSize: buffer.length,
...metadata
},
};
} catch (error) {
console.error(`Error processing buffer (${options.filename || 'unknown'}):`, error);
throw error;
}
}
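// Batch buffer variant. Every element must be an object carrying its own
// fileType; bare Buffers are rejected because the type cannot be inferred:
//
//   await processor.processDocumentsFromBuffers([
//     { buffer: bufA, fileType: 'pdf', filename: 'a.pdf' },
//     { buffer: bufB, fileType: 'md', title: 'Notes' },
//   ], { concurrency: 3, skipErrors: true });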
async processDocumentsFromBuffers(buffers, options = {}) {
try {
const {
concurrency = 3,
skipErrors = true,
onProgress = null,
metadata = {}
} = options;
// Validate input before touching buffers.length
if (!Array.isArray(buffers)) {
throw new Error('Buffers must be an array');
}
console.log(`Processing ${buffers.length} documents from buffers...`);
const results = [];
let processed = 0;
// Process buffers in batches
for (let i = 0; i < buffers.length; i += concurrency) {
const batch = buffers.slice(i, i + concurrency);
const batchPromises = batch.map(async (bufferInfo, batchIndex) => {
// Declared outside try so the catch block below can reference it
const globalIndex = i + batchIndex;
try {
// Extract buffer info
let buffer, fileType, title, filename, bufferMetadata;
if (Buffer.isBuffer(bufferInfo)) {
// Bare buffers carry no type information, so they are rejected outright
throw new Error(`Buffer at index ${globalIndex} requires fileType. Use {buffer, fileType} format.`);
} else if (typeof bufferInfo === 'object') {
// Buffer with metadata
buffer = bufferInfo.buffer;
fileType = bufferInfo.fileType;
title = bufferInfo.title;
filename = bufferInfo.filename || `buffer-${globalIndex}`;
bufferMetadata = bufferInfo.metadata || {};
} else {
throw new Error(`Invalid buffer format at index ${globalIndex}`);
}
if (!Buffer.isBuffer(buffer)) {
throw new Error(`Invalid buffer at index ${globalIndex}`);
}
if (!fileType) {
throw new Error(`FileType required for buffer at index ${globalIndex}`);
}
const result = await this.processDocumentFromBuffer(buffer, fileType, {
title,
filename,
metadata: {
...metadata,
...bufferMetadata,
batchIndex: Math.floor(globalIndex / concurrency),
globalIndex
}
});
processed++;
if (onProgress) {
onProgress({
processed,
total: buffers.length,
current: filename,
progress: (processed / buffers.length) * 100
});
}
return { success: true, filename, result };
} catch (error) {
const errorInfo = {
success: false,
filename: bufferInfo?.filename || `buffer-${globalIndex}`,
error: error.message,
stack: error.stack
};
if (skipErrors) {
console.warn(`⚠️ Skipping buffer ${errorInfo.filename}: ${error.message}`);
processed++;
if (onProgress) {
onProgress({
processed,
total: buffers.length,
current: errorInfo.filename,
progress: (processed / buffers.length) * 100,
error: error.message
});
}
return errorInfo;
} else {
throw error;
}
}
});
const batchResults = await Promise.all(batchPromises);
results.push(...batchResults);
}
const successful = results.filter(r => r.success).map(r => r.result);
const failed = results.filter(r => !r.success);
console.log(`✓ Successfully processed ${successful.length}/${buffers.length} documents from buffers`);
if (failed.length > 0) {
console.log(`⚠️ Failed to process ${failed.length} buffers`);
}
return {
documents: successful,
errors: failed,
summary: {
total: buffers.length,
successful: successful.length,
failed: failed.length,
totalChunks: successful.reduce((sum, doc) => sum + doc.chunks.length, 0)
}
};
} catch (error) {
console.error('Error processing multiple documents from buffers:', error);
throw error;
}
}
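// Batch file variant. `titles` aligns by index with `filePaths`; omit it (or
// pass null entries) to fall back to each file's base name.
//
//   const { documents, summary } = await processor.processDocuments(
//     ['./docs/a.pdf', './docs/b.md'],
//     { titles: ['Doc A', null], concurrency: 3 }
//   );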
async processDocuments(filePaths, options = {}) {
try {
const {
concurrency = 3, // Process up to 3 documents concurrently
skipErrors = true, // Continue processing if individual files fail
onProgress = null, // Progress callback function
metadata = {} // Additional metadata for all documents
} = options;
console.log(`Processing ${filePaths.length} documents...`);
const results = [];
const errors = [];
let processed = 0;
// Process documents in batches to control concurrency
for (let i = 0; i < filePaths.length; i += concurrency) {
const batch = filePaths.slice(i, i + concurrency);
const batchPromises = batch.map(async (filePath, batchIndex) => {
try {
const globalIndex = i + batchIndex;
const title = options.titles?.[globalIndex] || null;
// Add global metadata to document-specific metadata
const docMetadata = {
...metadata,
batchIndex: Math.floor(globalIndex / concurrency),
globalIndex
};
const result = await this.processDocument(filePath, title);
// Merge additional metadata
result.metadata = {
...result.metadata,
...docMetadata
};
processed++;
// Call progress callback if provided
if (onProgress) {
onProgress({
processed,
total: filePaths.length,
current: filePath,
progress: (processed / filePaths.length) * 100
});
}
return { success: true, filePath, result };
} catch (error) {
const errorInfo = {
success: false,
filePath,
error: error.message,
stack: error.stack
};
if (skipErrors) {
console.warn(`⚠️ Skipping ${filePath}: ${error.message}`);
errors.push(errorInfo);
processed++;
if (onProgress) {
onProgress({
processed,
total: filePaths.length,
current: filePath,
progress: (processed / filePaths.length) * 100,
error: error.message
});
}
return errorInfo;
} else {
throw error;
}
}
});
const batchResults = await Promise.all(batchPromises);
results.push(...batchResults);
}
// Separate successful results from errors
const successful = results.filter(r => r.success).map(r => r.result);
const failed = results.filter(r => !r.success);
console.log(`✓ Successfully processed ${successful.length}/${filePaths.length} documents`);
if (failed.length > 0) {
console.log(`⚠️ Failed to process ${failed.length} documents`);
}
return {
documents: successful,
errors: failed,
summary: {
total: filePaths.length,
successful: successful.length,
failed: failed.length,
totalChunks: successful.reduce((sum, doc) => sum + doc.chunks.length, 0)
}
};
} catch (error) {
console.error('Error processing multiple documents:', error);
throw error;
}
}
}
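// End-to-end sketch tying the processor to pgvector. Everything here is an
// assumption for illustration: the `pg` Pool, the `chunks` table, and its
// vector column are not defined in this file.
//
//   const { documents } = await processor.processDocuments(['./docs/a.pdf']);
//   for (const doc of documents) {
//     for (const chunk of doc.chunks) {
//       await pool.query(
//         'INSERT INTO chunks (content, embedding, metadata) VALUES ($1, $2, $3)',
//         // pgvector accepts the '[1,2,3]' text form, which JSON.stringify produces
//         [chunk.content, JSON.stringify(chunk.embedding), chunk.metadata]
//       );
//     }
//   }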
export { DocumentProcessor };
export default DocumentProcessor;