UNPKG

vexify

Version:

Portable vector database with in-process ONNX embeddings. Zero-config semantic search via SQLite. No external servers required.

281 lines (239 loc) 7.54 kB
'use strict'; const { getDocumentProxy, extractText } = require('unpdf'); const { createWorker } = require('tesseract.js'); const fs = require('fs'); const { execSync } = require('child_process'); const path = require('path'); const os = require('os'); const originalWarn = console.warn; console.warn = function(...args) { const msg = args.join(' '); if (msg.includes('TT: undefined function')) { return; } originalWarn.apply(console, args); }; class PDFReader { constructor(options = {}) { this.document = null; this.pdfPath = null; this.useOCR = options.useOCR !== false; this.ocrLanguage = options.ocrLanguage || 'eng'; this.ocrWorker = null; this.parallelPages = options.parallelPages || 20; } async load(pdfPath) { this.pdfPath = pdfPath; const buffer = fs.readFileSync(pdfPath); const uint8Array = new Uint8Array(buffer); try { this.document = await getDocumentProxy(uint8Array, { standardFontDataUrl: 'https://unpkg.com/pdfjs-dist@3.11.174/standard_fonts/', verbosity: 0 }); } catch (error) { try { this.document = await getDocumentProxy(uint8Array, { standardFontDataUrl: 'https://unpkg.com/pdfjs-dist@3.11.174/standard_fonts/', verbosity: 0, stopAtErrors: false, isEvalSupported: false, disableFontFace: true }); } catch (retryError) { this.useFallbackExtraction = true; } } return this; } async loadFromBuffer(buffer) { const uint8Array = buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer); try { this.document = await getDocumentProxy(uint8Array, { standardFontDataUrl: 'https://unpkg.com/pdfjs-dist@3.11.174/standard_fonts/', verbosity: 0 }); } catch (error) { try { this.document = await getDocumentProxy(uint8Array, { standardFontDataUrl: 'https://unpkg.com/pdfjs-dist@3.11.174/standard_fonts/', verbosity: 0, stopAtErrors: false, isEvalSupported: false, disableFontFace: true }); } catch (retryError) { throw new Error(`Invalid PDF structure: ${retryError.message}`); } } return this; } getPageCount() { if (this.useFallbackExtraction) { return 1; } if (!this.document) { throw new Error('No PDF loaded. Call load() or loadFromBuffer() first.'); } return this.document.numPages; } async extractPage(pageNumber) { if (!this.document) { throw new Error('No PDF loaded. Call load() or loadFromBuffer() first.'); } if (pageNumber < 1 || pageNumber > this.document.numPages) { throw new Error(`Page ${pageNumber} out of range. PDF has ${this.document.numPages} pages.`); } const page = await this.document.getPage(pageNumber); const textContent = await page.getTextContent(); let text = textContent.items .map(item => item.str) .join(' ') .replace(/\s+/g, ' ') .trim(); let usedOCR = false; const useOCRFallback = this.useOCR && this.pdfPath && text.length < 200; if (useOCRFallback) { const ocrText = await this.extractPageWithOCR(pageNumber); if (ocrText.length > text.length) { text = ocrText; usedOCR = true; } } return { pageNumber, text, metadata: { width: page.view[2], height: page.view[3], ocr: usedOCR } }; } async getOCRWorker() { if (!this.ocrWorker) { this.ocrWorker = await createWorker(this.ocrLanguage); } return this.ocrWorker; } async extractPageWithOCR(pageNumber) { if (!this.pdfPath) { return ''; } const tempDir = os.tmpdir(); const baseName = path.basename(this.pdfPath, '.pdf'); const outputPrefix = path.join(tempDir, `${baseName}-page${pageNumber}`); try { execSync(`pdftoppm -f ${pageNumber} -l ${pageNumber} -r 150 "${this.pdfPath}" "${outputPrefix}"`, { stdio: ['pipe', 'pipe', 'pipe'] }); const files = fs.readdirSync(tempDir); const imageFile = files.find(f => f.startsWith(`${baseName}-page${pageNumber}-`) && f.endsWith('.ppm')); if (!imageFile) { return ''; } const imagePath = path.join(tempDir, imageFile); const worker = await this.getOCRWorker(); const { data: { text } } = await worker.recognize(imagePath); try { fs.unlinkSync(imagePath); } catch (e) {} return text.trim(); } catch (error) { console.error(`OCR error for page ${pageNumber}:`, error.message); try { const files = fs.readdirSync(tempDir); const imageFiles = files.filter(f => f.startsWith(`${baseName}-page${pageNumber}-`) && f.endsWith('.ppm')); imageFiles.forEach(f => { try { fs.unlinkSync(path.join(tempDir, f)); } catch (e) {} }); } catch (e) {} return ''; } } async extractAllPages() { if (this.useFallbackExtraction) { return await this.extractWithFallback(); } if (!this.document) { throw new Error('No PDF loaded. Call load() or loadFromBuffer() first.'); } const numPages = this.document.numPages; const pages = new Array(numPages); for (let i = 0; i < numPages; i += this.parallelPages) { const batch = []; for (let j = 0; j < this.parallelPages && i + j < numPages; j++) { batch.push(this.extractPage(i + j + 1)); } const results = await Promise.all(batch); for (let j = 0; j < results.length; j++) { pages[i + j] = results[j]; } } if (this.ocrWorker) { await this.ocrWorker.terminate(); this.ocrWorker = null; } return pages; } async extractText() { if (!this.document) { throw new Error('No PDF loaded. Call load() or loadFromBuffer() first.'); } const text = await extractText(this.document); return text; } async toMarkdown() { const pages = await this.extractAllPages(); let markdown = ''; for (const page of pages) { if (page.text.trim()) { markdown += `## Page ${page.pageNumber}\n\n`; markdown += `${page.text}\n\n`; markdown += '---\n\n'; } } return markdown.trim(); } async extractPageRange(startPage, endPage) { if (!this.document) { throw new Error('No PDF loaded. Call load() or loadFromBuffer() first.'); } if (startPage < 1 || endPage > this.document.numPages || startPage > endPage) { throw new Error(`Invalid page range: ${startPage}-${endPage}. PDF has ${this.document.numPages} pages.`); } const pages = []; for (let i = startPage; i <= endPage; i++) { const pageData = await this.extractPage(i); pages.push(pageData); } return pages; } async extractWithFallback() { if (!this.pdfPath) { throw new Error('Fallback extraction requires a file path'); } try { const text = execSync(`pdftotext "${this.pdfPath}" -`, { encoding: 'utf8', maxBuffer: 50 * 1024 * 1024, timeout: 30000 }); return [{ pageNumber: 1, text: text.trim(), metadata: { extractedWith: 'pdftotext', width: 0, height: 0, ocr: false } }]; } catch (error) { throw new Error(`Fallback extraction failed: ${error.message}`); } } } module.exports = { PDFReader };