pdf-ocr-cli
Version:
A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification
139 lines (138 loc) • 6.56 kB
JavaScript
;
// Compiler-emitted interop helper: reuse an already-installed `__importDefault`
// when `this` carries one, otherwise fall back to the local implementation.
// A module flagged with `__esModule` is returned as-is; anything else is
// wrapped so that it can be reached through a `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.processPdf = processPdf;
exports.createCli = createCli;
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const commander_1 = require("commander");
const ocr_1 = require("./ocr");
const textToPdf_1 = require("./textToPdf");
const splitPdf_1 = require("./splitPdf");
const mergePdfs_1 = require("./mergePdfs");
/**
 * Pause asynchronously for the given duration.
 * @param ms - How long to wait, in milliseconds
 * @returns A promise that resolves (with no value) once the timer fires
 */
const sleep = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
});
/**
 * Run a PDF through the OCR pipeline: read the file, split it into pages,
 * OCR each page in order, turn each page's text back into a PDF, merge the
 * results, and write the merged document to disk.
 *
 * Pages are handled strictly one at a time, with a pause between consecutive
 * pages. Any failure (reading, splitting, OCR, conversion, merging, writing)
 * rejects the returned promise with the original error; nothing is written
 * in that case.
 *
 * @param inputPath - Path to the input PDF file
 * @param outputPath - Path to save the output PDF file
 * @param concurrency - Accepted for interface compatibility; not used by this function
 * @param maxPages - Maximum number of pages to process (forwarded to splitPdf)
 * @param ocrOptions - Options forwarded to performOcr; `verbose` also enables progress logging here
 * @param sleepTime - Time to sleep between processing pages in milliseconds
 */
async function processPdf(inputPath, outputPath, concurrency = 2, maxPages, ocrOptions, sleepTime = 5000) {
    // Re-read the flag on every use, exactly as each individual check would.
    const isVerbose = () => Boolean(ocrOptions?.verbose);

    const sourceBuffer = fs_1.default.readFileSync(inputPath);
    const pdfPages = await (0, splitPdf_1.splitPdf)(sourceBuffer, maxPages);
    const totalPages = pdfPages.length;
    if (isVerbose()) {
        console.log(`PDF split into ${totalPages} pages`);
    }

    const processedPages = [];
    for (let index = 0; index < totalPages; index += 1) {
        const pageNumber = index + 1;
        const hasNextPage = pageNumber < totalPages;
        if (isVerbose()) {
            console.log(`Processing page ${pageNumber}/${totalPages}...`);
        }
        try {
            // OCR the page, then render the recognised text as a one-page PDF.
            const recognizedText = await (0, ocr_1.performOcr)(pdfPages[index], ocrOptions);
            const renderedPage = await (0, textToPdf_1.textToPdf)(recognizedText);
            processedPages.push(renderedPage);
            if (isVerbose()) {
                console.log(`Page ${pageNumber} processed successfully`);
            }
            // No pause is needed once the final page is done.
            if (!hasNextPage) {
                continue;
            }
            if (isVerbose()) {
                console.log(`Sleeping for ${sleepTime}ms before processing next page...`);
            }
            await sleep(sleepTime);
        }
        catch (error) {
            if (isVerbose()) {
                const reason = error instanceof Error ? error.message : String(error);
                console.error(`Error processing page ${pageNumber}: ${reason}`);
            }
            // Abort the whole run; the caller decides how to report the failure.
            throw error;
        }
    }

    // Stitch the processed pages together and persist the result.
    const mergedBuffer = await (0, mergePdfs_1.mergePdfs)(processedPages);
    fs_1.default.writeFileSync(outputPath, mergedBuffer);
}
/**
 * Parse a command-line option value as a base-10 integer.
 * @param value - Raw option text supplied by the user
 * @returns The parsed integer
 * @throws commander's InvalidArgumentError when the text does not start with a number
 */
function parseIntegerOption(value) {
    const parsed = Number.parseInt(value, 10);
    if (Number.isNaN(parsed)) {
        // Reject up front so NaN never reaches the OCR pipeline.
        throw new commander_1.InvalidArgumentError('Not a number.');
    }
    return parsed;
}
/**
 * Parse a command-line option value as a floating-point number.
 * @param value - Raw option text supplied by the user
 * @returns The parsed number
 * @throws commander's InvalidArgumentError when the text does not start with a number
 */
function parseFloatOption(value) {
    const parsed = Number.parseFloat(value);
    if (Number.isNaN(parsed)) {
        throw new commander_1.InvalidArgumentError('Not a number.');
    }
    return parsed;
}
/**
 * Create the CLI program.
 *
 * Declares every command-line option and registers an async action that
 * resolves the input/output paths, builds the OCR options object from the
 * parsed flags, and runs processPdf. If processing fails, the action prints
 * the error message and exits the process with status 1.
 *
 * @returns The commander program instance (not yet parsed)
 */
function createCli() {
    const program = new commander_1.Command();
    program
        .name('pdf-ocr')
        .description('OCR a PDF file using Mistral API with optional LLM verification')
        .version(require('../package.json').version)
        .requiredOption('-i, --input <path>', 'Input PDF file path')
        .requiredOption('-o, --output <path>', 'Output PDF file path')
        .option('-c, --concurrency <number>', 'Number of pages to process in parallel', parseIntegerOption, 2)
        .option('-m, --max-pages <number>', 'Maximum number of pages to process', parseIntegerOption)
        .option('-r, --retries <number>', 'Maximum number of OCR retry attempts', parseIntegerOption, 3)
        .option('-d, --retry-delay <number>', 'Delay between OCR retries in milliseconds', parseIntegerOption, 1000)
        .option('-t, --timeout <number>', 'Timeout for OCR API requests in milliseconds', parseIntegerOption, 30000)
        .option('-s, --sleep <number>', 'Time to sleep between processing pages in milliseconds', parseIntegerOption, 5000)
        .option('-v, --verbose', 'Enable verbose logging for OCR process')
        .option('--verify', 'Verify and improve OCR text using LLM')
        .option('--max-tokens <number>', 'Maximum number of tokens for LLM verification', parseIntegerOption, 1000)
        .option('--temperature <number>', 'Temperature for LLM verification', parseFloatOption, 0.7)
        .option('--top-p <number>', 'Top-p for LLM verification', parseFloatOption, 0.9)
        .action(async (options) => {
            try {
                // Resolve paths to absolute paths
                const inputPath = path_1.default.resolve(options.input);
                const outputPath = path_1.default.resolve(options.output);
                // The flag is undefined when omitted; normalise it to a boolean.
                const verbose = options.verbose || false;
                // Create OCR options from CLI options
                const ocrOptions = {
                    maxRetries: options.retries,
                    retryDelay: options.retryDelay,
                    timeout: options.timeout,
                    verbose,
                    verifyContent: options.verify,
                    contentVerificationOptions: {
                        maxTokens: options.maxTokens,
                        temperature: options.temperature,
                        topP: options.topP,
                        verbose
                    }
                };
                console.log(`Processing ${inputPath}...`);
                // Process the PDF
                await processPdf(inputPath, outputPath, options.concurrency, options.maxPages, ocrOptions, options.sleep);
                console.log(`OCR complete! Output saved to ${outputPath}`);
            }
            catch (error) {
                console.error('Error:', error instanceof Error ? error.message : String(error));
                process.exit(1);
            }
        });
    return program;
}
// Only run the CLI if this file is executed directly
if (require.main === module) {
    // The registered action handler is async, so parse with parseAsync() and
    // handle the returned promise instead of leaving it floating.
    createCli()
        .parseAsync(process.argv)
        .catch((error) => {
            console.error('Error:', error instanceof Error ? error.message : String(error));
            process.exit(1);
        });
}