UNPKG

@coworker-agency/rag

Version:

Retrieval Augmented Generation (RAG) library for document indexing, vector storage, and AI-powered question answering

87 lines (73 loc) 2.42 kB
/**
 * Document Text Extractors
 *
 * Tools for extracting text content from different file formats
 */

import * as pdfjs from 'pdfjs-dist';

/**
 * Extract text from a PDF file
 *
 * Pages are read sequentially. The text items of each page are joined with a
 * single space, and every page (including the last) is followed by a blank
 * line ("\n\n") in the returned string.
 *
 * @param {Blob} pdfBlob - PDF file as a Blob object
 * @returns {Promise<string>} Extracted text content
 * @throws Re-throws (after logging) any error raised while reading the blob
 *   or while loading/parsing the PDF.
 */
export async function extractTextFromPDF(pdfBlob) {
  // Declared outside the try block so the finally clause can release it.
  let loadingTask;

  try {
    // Convert blob to ArrayBuffer
    const arrayBuffer = await pdfBlob.arrayBuffer();

    // Supply a default worker only when the host application has not
    // configured one, so an existing workerSrc is never clobbered. The URL is
    // explicit https because a protocol-relative URL cannot resolve outside
    // http(s) pages (e.g. file://).
    if (!pdfjs.GlobalWorkerOptions.workerSrc) {
      pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.min.js`;
    }

    // Load PDF document
    loadingTask = pdfjs.getDocument({ data: arrayBuffer });
    const pdf = await loadingTask.promise;

    console.log(`PDF loaded with ${pdf.numPages} pages`);

    // Extract text from each page (pdf.js page numbers are 1-based)
    const pageTexts = [];

    for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const content = await page.getTextContent();

      // Concatenate text items with appropriate spacing
      pageTexts.push(content.items.map((item) => item.str).join(' '));

      // Progress report every 10 pages
      if (pageNum % 10 === 0) {
        console.log(`Extracted text from ${pageNum}/${pdf.numPages} pages`);
      }
    }

    // Each page is terminated by a blank line.
    return pageTexts.map((pageText) => `${pageText}\n\n`).join('');
  } catch (error) {
    console.error('Error extracting text from PDF:', error);
    throw error;
  } finally {
    // Release the document and its worker whether extraction succeeded or
    // failed. A cleanup failure is logged rather than thrown so it cannot
    // replace the extracted text or mask the original error.
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        console.warn('Error releasing PDF resources:', cleanupError);
      }
    }
  }
}

/**
 * Extract text from a CSV file
 *
 * The CSV content is returned verbatim; no parsing or reformatting is done.
 *
 * @param {Blob} csvBlob - CSV file as a Blob object
 * @returns {Promise<string>} Extracted text content
 * @throws Re-throws (after logging) any error raised while reading the blob.
 */
export async function extractTextFromCSV(csvBlob) {
  try {
    // Simple CSV to text conversion
    return await csvBlob.text();
  } catch (error) {
    console.error('Error extracting text from CSV:', error);
    throw error;
  }
}

/**
 * Extract text from a JSON file
 *
 * @param {Blob} jsonBlob - JSON file as a Blob object
 * @returns {Promise<string>} The JSON content re-serialized with 2-space
 *   indentation
 * @throws Re-throws (after logging) any error raised while reading the blob,
 *   including the SyntaxError from JSON.parse when the content is not valid
 *   JSON.
 */
export async function extractTextFromJSON(jsonBlob) {
  try {
    const text = await jsonBlob.text();

    // Parse and re-stringify to ensure it's valid JSON
    // Also provides some basic formatting
    const jsonObj = JSON.parse(text);
    return JSON.stringify(jsonObj, null, 2);
  } catch (error) {
    console.error('Error extracting text from JSON:', error);
    throw error;
  }
}