@coworker-agency/rag
Version:
Retrieval Augmented Generation (RAG) library for document indexing, vector storage, and AI-powered question answering
87 lines (73 loc) • 2.42 kB
JavaScript
/**
* Document Text Extractors
*
* Tools for extracting text content from different file formats
*/
import * as pdfjs from 'pdfjs-dist';
/**
 * Extract the text content of every page of a PDF file.
 *
 * The text items of a page are joined with a single space, and each page is
 * followed by a blank line (`\n\n`). Progress is logged every 10 pages.
 *
 * @param {Blob} pdfBlob - PDF file as a Blob object
 * @returns {Promise<string>} Extracted text content
 * @throws Rethrows (after logging) any error raised while reading the blob or
 *   while loading/parsing the document.
 */
export async function extractTextFromPDF(pdfBlob) {
  // Declared outside the try block so the finally clause can release it.
  let loadingTask;
  try {
    // Convert blob to ArrayBuffer
    const arrayBuffer = await pdfBlob.arrayBuffer();

    // Only supply a default worker location when the host application has not
    // configured one; overwriting it on every call would clobber that setting.
    // An explicit https: scheme is used because a protocol-relative URL does
    // not resolve to the CDN when the page itself is not served over http(s).
    // NOTE(review): newer pdf.js releases appear to ship the worker as
    // `pdf.worker.min.mjs` — confirm this path matches the installed version.
    if (!pdfjs.GlobalWorkerOptions.workerSrc) {
      pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.min.js`;
    }

    // Load PDF document
    loadingTask = pdfjs.getDocument({ data: arrayBuffer });
    const pdf = await loadingTask.promise;
    console.log(`PDF loaded with ${pdf.numPages} pages`);

    // Pages are fetched sequentially (pdf.js page numbers are 1-based) and
    // collected in an array instead of growing one string inside the loop.
    const pageTexts = [];
    for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const content = await page.getTextContent();
      pageTexts.push(content.items.map((item) => item.str).join(' '));

      if (pageNum % 10 === 0) {
        console.log(`Extracted text from ${pageNum}/${pdf.numPages} pages`);
      }
    }

    return pageTexts.map((pageText) => `${pageText}\n\n`).join('');
  } catch (error) {
    console.error('Error extracting text from PDF:', error);
    throw error;
  } finally {
    // Release the document and its worker whether extraction succeeded or
    // failed. A cleanup failure is logged but must not replace the real
    // result or the original error.
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        console.warn('Error releasing PDF resources:', cleanupError);
      }
    }
  }
}
/**
 * Read a CSV file and hand back its contents as plain text.
 *
 * No parsing is performed: the blob's text is returned unchanged.
 *
 * @param {Blob} csvBlob - CSV file as a Blob object
 * @returns {Promise<string>} Extracted text content
 * @throws Rethrows (after logging) any error raised while reading the blob.
 */
export async function extractTextFromCSV(csvBlob) {
  let rawCsv;
  try {
    // The raw file text is the extraction result; rows are not interpreted.
    rawCsv = await csvBlob.text();
  } catch (err) {
    console.error('Error extracting text from CSV:', err);
    throw err;
  }
  return rawCsv;
}
/**
 * Read a JSON file and return it as pretty-printed text.
 *
 * The content is parsed (which rejects malformed JSON) and serialized again
 * with a 2-space indent.
 *
 * @param {Blob} jsonBlob - JSON file as a Blob object
 * @returns {Promise<string>} Extracted text content
 * @throws Rethrows (after logging) read errors and the SyntaxError raised for
 *   invalid JSON.
 */
export async function extractTextFromJSON(jsonBlob) {
  try {
    const rawJson = await jsonBlob.text();
    // Round-tripping through parse/stringify both validates and normalizes
    // the formatting.
    return JSON.stringify(JSON.parse(rawJson), null, 2);
  } catch (err) {
    console.error('Error extracting text from JSON:', err);
    throw err;
  }
}