UNPKG

pdf-ocr-cli

Version:

A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification

github.com/luandro/pdf-ocr

luandro/pdf-ocr

139 lines (138 loc) • 6.56 kB

JavaScript

#!/usr/bin/env node
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.processPdf = processPdf;
exports.createCli = createCli;
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const commander_1 = require("commander");
const ocr_1 = require("./ocr");
const textToPdf_1 = require("./textToPdf");
const splitPdf_1 = require("./splitPdf");
const mergePdfs_1 = require("./mergePdfs");

/**
 * Sleep for a specified number of milliseconds.
 * @param {number} ms - Milliseconds to sleep
 * @returns {Promise<void>} Promise that resolves once the timer fires
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Parse a CLI option value as a base-10 integer.
 * Rejects non-numeric input so that NaN never reaches the OCR pipeline
 * (e.g. a NaN sleep time would make setTimeout fire almost immediately).
 * @param {string} value - Raw option value from the command line
 * @returns {number} The parsed integer
 * @throws {commander_1.InvalidArgumentError} If the value is not an integer
 */
function parseIntegerOption(value) {
    const parsed = Number.parseInt(value, 10);
    if (Number.isNaN(parsed)) {
        throw new commander_1.InvalidArgumentError('Not a number.');
    }
    return parsed;
}

/**
 * Parse a CLI option value as a floating-point number.
 * @param {string} value - Raw option value from the command line
 * @returns {number} The parsed number
 * @throws {commander_1.InvalidArgumentError} If the value is not a number
 */
function parseFloatOption(value) {
    const parsed = Number.parseFloat(value);
    if (Number.isNaN(parsed)) {
        throw new commander_1.InvalidArgumentError('Not a number.');
    }
    return parsed;
}

/**
 * Process a PDF file through the OCR pipeline: split the input into pages,
 * OCR each page in order, convert each page's text back to a PDF, merge the
 * results and write them to disk.
 *
 * Pages are processed strictly one after another, with a pause between
 * consecutive pages.
 *
 * @param {string} inputPath - Path to the input PDF file
 * @param {string} outputPath - Path to save the output PDF file
 * @param {number} [concurrency=2] - Number of pages to process in parallel (not used)
 * @param {number} [maxPages] - Maximum number of pages to process (forwarded to splitPdf)
 * @param {object} [ocrOptions] - Options forwarded to performOcr; `verbose` also
 *   enables progress logging in this function
 * @param {number} [sleepTime=5000] - Time to sleep between processing pages in milliseconds
 * @returns {Promise<void>} Resolves once the output file has been written
 * @throws Any error raised while reading, splitting, OCR-ing, converting,
 *   merging or writing is propagated unchanged to the caller
 */
async function processPdf(inputPath, outputPath, concurrency = 2, maxPages, ocrOptions, sleepTime = 5000) {
    const verbose = Boolean(ocrOptions?.verbose);

    // Read the input PDF
    const inputPdfBuffer = fs_1.default.readFileSync(inputPath);

    // Split the PDF into individual pages
    const pdfPages = await (0, splitPdf_1.splitPdf)(inputPdfBuffer, maxPages);
    if (verbose) {
        console.log(`PDF split into ${pdfPages.length} pages`);
    }

    // Process each page individually
    const processedPages = [];
    for (let i = 0; i < pdfPages.length; i++) {
        if (verbose) {
            console.log(`Processing page ${i + 1}/${pdfPages.length}...`);
        }
        try {
            // Perform OCR on the current page
            const ocrText = await (0, ocr_1.performOcr)(pdfPages[i], ocrOptions);
            // Convert OCR text back to PDF
            const pdfBuffer = await (0, textToPdf_1.textToPdf)(ocrText);
            // Add the processed page to the result
            processedPages.push(pdfBuffer);
            if (verbose) {
                console.log(`Page ${i + 1} processed successfully`);
            }
            // Sleep between pages (except after the last page)
            if (i < pdfPages.length - 1) {
                if (verbose) {
                    console.log(`Sleeping for ${sleepTime}ms before processing next page...`);
                }
                await sleep(sleepTime);
            }
        }
        catch (error) {
            // Log which page failed (verbose only), then let the original
            // error propagate so the caller sees it unchanged.
            if (verbose) {
                console.error(`Error processing page ${i + 1}: ${error instanceof Error ? error.message : String(error)}`);
            }
            throw error;
        }
    }

    // Merge the processed pages back into a single PDF
    const outputPdfBuffer = await (0, mergePdfs_1.mergePdfs)(processedPages);

    // Write the output PDF
    fs_1.default.writeFileSync(outputPath, outputPdfBuffer);
}

/**
 * Create the CLI program.
 *
 * Numeric options are validated while parsing; a non-numeric value makes
 * commander report an invalid-argument error instead of passing NaN along.
 *
 * @returns {commander_1.Command} The commander program instance
 */
function createCli() {
    const program = new commander_1.Command();
    program
        .name('pdf-ocr')
        .description('OCR a PDF file using Mistral API with optional LLM verification')
        .version(require('../package.json').version)
        .requiredOption('-i, --input <path>', 'Input PDF file path')
        .requiredOption('-o, --output <path>', 'Output PDF file path')
        .option('-c, --concurrency <number>', 'Number of pages to process in parallel', parseIntegerOption, 2)
        .option('-m, --max-pages <number>', 'Maximum number of pages to process', parseIntegerOption)
        .option('-r, --retries <number>', 'Maximum number of OCR retry attempts', parseIntegerOption, 3)
        .option('-d, --retry-delay <number>', 'Delay between OCR retries in milliseconds', parseIntegerOption, 1000)
        .option('-t, --timeout <number>', 'Timeout for OCR API requests in milliseconds', parseIntegerOption, 30000)
        .option('-s, --sleep <number>', 'Time to sleep between processing pages in milliseconds', parseIntegerOption, 5000)
        .option('-v, --verbose', 'Enable verbose logging for OCR process')
        .option('--verify', 'Verify and improve OCR text using LLM')
        .option('--max-tokens <number>', 'Maximum number of tokens for LLM verification', parseIntegerOption, 1000)
        .option('--temperature <number>', 'Temperature for LLM verification', parseFloatOption, 0.7)
        .option('--top-p <number>', 'Top-p for LLM verification', parseFloatOption, 0.9)
        .action(async (options) => {
            try {
                // Resolve paths to absolute paths
                const inputPath = path_1.default.resolve(options.input);
                const outputPath = path_1.default.resolve(options.output);
                const verbose = options.verbose || false;

                // Create OCR options from CLI options
                const ocrOptions = {
                    maxRetries: options.retries,
                    retryDelay: options.retryDelay,
                    timeout: options.timeout,
                    verbose,
                    verifyContent: options.verify,
                    contentVerificationOptions: {
                        maxTokens: options.maxTokens,
                        temperature: options.temperature,
                        topP: options.topP,
                        verbose
                    }
                };

                console.log(`Processing ${inputPath}...`);

                // Process the PDF
                await processPdf(inputPath, outputPath, options.concurrency, options.maxPages, ocrOptions, options.sleep);

                console.log(`OCR complete! Output saved to ${outputPath}`);
            }
            catch (error) {
                console.error('Error:', error instanceof Error ? error.message : String(error));
                process.exit(1);
            }
        });
    return program;
}

// Only run the CLI if this file is executed directly
if (require.main === module) {
    // The action handler is async, so use parseAsync and handle a rejected
    // promise explicitly instead of leaving it floating.
    createCli()
        .parseAsync(process.argv)
        .catch((error) => {
            console.error('Error:', error instanceof Error ? error.message : String(error));
            process.exit(1);
        });
}