UNPKG

pdf-ocr-cli

Version:

A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification

179 lines (178 loc) 7.78 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.performOcr = performOcr; const mistralai_1 = require("@mistralai/mistralai"); const dotenv_1 = __importDefault(require("dotenv")); // Load environment variables dotenv_1.default.config(); /** * Sleep for a specified number of milliseconds * @param ms - Milliseconds to sleep */ const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms)); /** * Performs OCR on a PDF using Mistral API * @param pdfBuffer - Buffer containing the PDF data * @param options - OCR processing options * @returns Extracted text from the PDF * @throws Error if OCR fails or API key is missing */ async function performOcr(pdfBuffer, options = {}) { // Set default options const opts = { maxRetries: options.maxRetries ?? 3, retryDelay: options.retryDelay ?? 1000, verbose: options.verbose ?? false, timeout: options.timeout ?? 30000, verifyContent: options.verifyContent ?? false, contentVerificationOptions: options.contentVerificationOptions ?? {}, }; // Check if API key is set if (!process.env.MISTRAL_API_KEY) { throw new Error('MISTRAL_API_KEY environment variable is not set'); } // Initialize Mistral client const mistral = new mistralai_1.Mistral({ apiKey: process.env.MISTRAL_API_KEY, }); // Implement retry logic let lastError = null; for (let attempt = 1; attempt <= opts.maxRetries; attempt++) { try { if (opts.verbose) { console.log(`OCR attempt ${attempt}/${opts.maxRetries}...`); } // Convert the PDF buffer to base64 const base64Pdf = pdfBuffer.toString('base64'); // Use the OCR API directly with the base64 encoded PDF if (opts.verbose) { console.log('Processing OCR with base64 encoded PDF...'); } const result = await mistral.ocr.process({ model: 'mistral-ocr-latest', document: { type: 'document_url', documentUrl: `data:application/pdf;base64,${base64Pdf}`, }, }); // Extract text from the result let extractedText = ''; // Check if the result has pages with markdown content if (result.pages && Array.isArray(result.pages)) { // Concatenate markdown from all pages extractedText = result.pages .map(page => page.markdown || '') .filter(text => text.length > 0) .join('\n\n'); } else { // Fallback to content or text fields extractedText = result.content || result.text || ''; } if (opts.verbose) { console.log(`OCR successful on attempt ${attempt}`); console.log('OCR result structure:', JSON.stringify(result, null, 2)); console.log('Extracted text length:', extractedText.length); if (extractedText.length > 0) { console.log('First 200 characters of extracted text:', extractedText.substring(0, 200)); } else { console.log('No text was extracted from the PDF'); } } // Verify and improve the extracted text if enabled if (opts.verifyContent && extractedText.length > 0) { if (opts.verbose) { console.log('Verifying and improving OCR text...'); } try { // Dynamically import the content verification module const { verifyContent } = await Promise.resolve().then(() => __importStar(require('./contentVerification'))); // Pass the verbose option from OCR options to content verification options const contentOpts = { ...opts.contentVerificationOptions, verbose: opts.verbose, }; // Verify and improve the extracted text const verifiedText = await verifyContent(extractedText, contentOpts); if (opts.verbose) { console.log('Content verification complete'); if (verifiedText !== extractedText) { console.log('Text was improved by content verification'); } else { console.log('No changes were made by content verification'); } } return verifiedText; } catch (verifyError) { // If content verification fails, log the error and return the original text if (opts.verbose) { console.error('Content verification failed:', verifyError instanceof Error ? verifyError.message : String(verifyError)); console.log('Returning original OCR text'); } } } return extractedText; } catch (error) { lastError = error instanceof Error ? error : new Error('Unknown error'); if (opts.verbose) { console.error(`OCR attempt ${attempt} failed: ${lastError.message}`); } // If this is not the last attempt, wait before retrying if (attempt < opts.maxRetries) { // Exponential backoff: increase delay with each retry const delay = opts.retryDelay * Math.pow(2, attempt - 1); if (opts.verbose) { console.log(`Retrying in ${delay}ms...`); } await sleep(delay); } } } // If we've exhausted all retries, throw the last error if (lastError) { throw new Error(`OCR failed after ${opts.maxRetries} attempts: ${lastError.message}`); } throw new Error(`OCR failed after ${opts.maxRetries} attempts: Unknown error`); }