pdf-ocr-cli
Version:
A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification
179 lines (178 loc) • 7.78 kB
JavaScript
;
// Interop helper: exposes property `key` of `source` on `target` under the
// name `alias` (or `key` when no alias is given). An already-installed
// `this.__createBinding` takes precedence when present.
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        var exposedName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // Decide whether the source descriptor can be reused as-is or has to
        // be replaced by a forwarding getter.
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !source.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                }
            };
        }
        Object.defineProperty(target, exposedName, descriptor);
    }
    : function (target, source, key, alias) {
        // Fallback used when Object.create is unavailable: plain value copy.
        var exposedName = alias === undefined ? key : alias;
        target[exposedName] = source[key];
    });
// Interop helper: stores `value` as the `default` member of `target`.
// An already-installed `this.__setModuleDefault` takes precedence when present.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function defineDefault(target, value) {
        // Enumerable data property; writable/configurable keep their
        // defineProperty defaults (false).
        var descriptor = { enumerable: true, value: value };
        Object.defineProperty(target, "default", descriptor);
    }
    : function assignDefault(target, value) {
        target["default"] = value;
    });
// Interop helper: returns `mod` unchanged when it is flagged `__esModule`;
// otherwise builds a namespace-like object that re-exposes every own key of
// `mod` except "default", and sets `default` to `mod` itself.
var __importStar = (this && this.__importStar) || (function () {
    // Resolved lazily: the first call swaps in the concrete key lister
    // (Object.getOwnPropertyNames, or a for-in fallback) and delegates to it.
    var listOwnKeys = function (subject) {
        listOwnKeys = Object.getOwnPropertyNames || function (obj) {
            var names = [];
            for (var prop in obj) {
                if (Object.prototype.hasOwnProperty.call(obj, prop)) {
                    names[names.length] = prop;
                }
            }
            return names;
        };
        return listOwnKeys(subject);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            var keys = listOwnKeys(mod);
            for (var idx = 0; idx < keys.length; idx++) {
                if (keys[idx] === "default") {
                    continue;
                }
                __createBinding(namespace, mod, keys[idx]);
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
// Interop helper: modules flagged `__esModule` are returned as-is; anything
// else is wrapped so that it is reachable through a `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Tag the exports object with `__esModule: true` — the same flag the
// __importStar/__importDefault helpers in this file test for. Only `value` is
// given, so the property is non-enumerable and read-only.
Object.defineProperty(exports, "__esModule", { value: true });
// `performOcr` is a function declaration later in this file, so it is hoisted
// and can already be assigned to the exports object here.
exports.performOcr = performOcr;
// Third-party module providing the `Mistral` client class used by performOcr.
const mistralai_1 = require("@mistralai/mistralai");
// Wrapped with __importDefault so the module is reachable as `.default`
// whether or not it carries the `__esModule` flag.
const dotenv_1 = __importDefault(require("dotenv"));
// Load environment variables
// (runs once at module load; performOcr later reads process.env.MISTRAL_API_KEY
// — presumably populated from a .env file by dotenv, confirm against its docs)
dotenv_1.default.config();
/**
 * Returns a promise that settles (with no value) once the given delay has
 * elapsed; used to pause between OCR retry attempts.
 * @param ms - Delay in milliseconds, handed to setTimeout
 */
const sleep = (ms) => new Promise((done) => {
    setTimeout(done, ms);
});
/**
 * Extracts the recognised text from an OCR response object.
 *
 * When the response has a `pages` array, the non-empty `markdown` values of
 * its pages are joined with a blank line between them. Otherwise the
 * `content` field, then the `text` field, then an empty string is used.
 * @param result - Response object returned by the OCR call
 * @returns The extracted text (may be empty)
 */
function extractOcrText(result) {
    if (Array.isArray(result.pages)) {
        // Concatenate markdown from all pages, skipping pages without any
        return result.pages
            .map((page) => page.markdown || '')
            .filter((text) => text.length > 0)
            .join('\n\n');
    }
    // Fallback to content or text fields
    return result.content || result.text || '';
}
/**
 * Performs OCR on a PDF using Mistral API
 *
 * The PDF is sent as a base64 `data:` URL. Failed attempts are retried with
 * exponential backoff (`retryDelay`, doubled after every failure). When
 * `verifyContent` is enabled and text was extracted, the text is passed
 * through `verifyContent` from `./contentVerification`; if that step throws,
 * the unverified OCR text is returned instead.
 *
 * @param pdfBuffer - Buffer containing the PDF data
 * @param options - OCR processing options
 * @param options.maxRetries - Total number of OCR attempts (default 3); values
 *   below 1 or non-numeric values are treated as 1 so the request is always
 *   tried at least once
 * @param options.retryDelay - Base delay in ms before the first retry (default 1000)
 * @param options.verbose - Log progress and results to the console (default false)
 * @param options.timeout - Defaults to 30000; see the review note in the body
 * @param options.verifyContent - Run the content verification step (default false)
 * @param options.contentVerificationOptions - Extra options forwarded to
 *   `verifyContent`; `verbose` is always overridden with this call's value
 * @returns Extracted text from the PDF
 * @throws Error if the MISTRAL_API_KEY environment variable is not set
 * @throws Error if every attempt fails; the last underlying error is attached
 *   as `cause`
 */
async function performOcr(pdfBuffer, options = {}) {
    // The parameter default only covers `undefined`; also tolerate `null`.
    const given = options ?? {};
    // Set default options
    const opts = {
        maxRetries: given.maxRetries ?? 3,
        retryDelay: given.retryDelay ?? 1000,
        verbose: given.verbose ?? false,
        // NOTE(review): `timeout` is accepted and defaulted but nothing below
        // applies it to the OCR request — confirm whether it should be enforced.
        timeout: given.timeout ?? 30000,
        verifyContent: given.verifyContent ?? false,
        contentVerificationOptions: given.contentVerificationOptions ?? {},
    };
    // `maxRetries` counts attempts. Without this floor a value of 0 (or a
    // negative/NaN value) would skip the loop and fail without ever calling
    // the API.
    opts.maxRetries = Math.max(1, Math.trunc(Number(opts.maxRetries)) || 1);
    // Check if API key is set
    if (!process.env.MISTRAL_API_KEY) {
        throw new Error('MISTRAL_API_KEY environment variable is not set');
    }
    // Initialize Mistral client
    const mistral = new mistralai_1.Mistral({
        apiKey: process.env.MISTRAL_API_KEY,
    });
    // Convert the PDF buffer to base64 once; the payload is identical for
    // every attempt, so there is no reason to re-encode it inside the loop.
    const base64Pdf = pdfBuffer.toString('base64');
    // Implement retry logic
    let lastError = null;
    for (let attempt = 1; attempt <= opts.maxRetries; attempt++) {
        try {
            if (opts.verbose) {
                console.log(`OCR attempt ${attempt}/${opts.maxRetries}...`);
                console.log('Processing OCR with base64 encoded PDF...');
            }
            // Use the OCR API directly with the base64 encoded PDF
            const result = await mistral.ocr.process({
                model: 'mistral-ocr-latest',
                document: {
                    type: 'document_url',
                    documentUrl: `data:application/pdf;base64,${base64Pdf}`,
                },
            });
            // Extract text from the result
            const extractedText = extractOcrText(result);
            if (opts.verbose) {
                console.log(`OCR successful on attempt ${attempt}`);
                console.log('OCR result structure:', JSON.stringify(result, null, 2));
                console.log('Extracted text length:', extractedText.length);
                if (extractedText.length > 0) {
                    console.log('First 200 characters of extracted text:', extractedText.substring(0, 200));
                }
                else {
                    console.log('No text was extracted from the PDF');
                }
            }
            // Verify and improve the extracted text if enabled
            if (opts.verifyContent && extractedText.length > 0) {
                if (opts.verbose) {
                    console.log('Verifying and improving OCR text...');
                }
                try {
                    // Dynamically import the content verification module
                    const { verifyContent } = await Promise.resolve().then(() => __importStar(require('./contentVerification')));
                    // Pass the verbose option from OCR options to content verification options
                    const contentOpts = {
                        ...opts.contentVerificationOptions,
                        verbose: opts.verbose,
                    };
                    // Verify and improve the extracted text
                    const verifiedText = await verifyContent(extractedText, contentOpts);
                    if (opts.verbose) {
                        console.log('Content verification complete');
                        if (verifiedText !== extractedText) {
                            console.log('Text was improved by content verification');
                        }
                        else {
                            console.log('No changes were made by content verification');
                        }
                    }
                    return verifiedText;
                }
                catch (verifyError) {
                    // Verification is best-effort: on failure, report it (verbose
                    // mode only) and fall through to return the original OCR text.
                    if (opts.verbose) {
                        console.error('Content verification failed:', verifyError instanceof Error ? verifyError.message : String(verifyError));
                        console.log('Returning original OCR text');
                    }
                }
            }
            return extractedText;
        }
        catch (error) {
            // Keep whatever was thrown; non-Error values are stringified so
            // their content still reaches the final message.
            lastError = error instanceof Error
                ? error
                : new Error(String(error));
            if (opts.verbose) {
                console.error(`OCR attempt ${attempt} failed: ${lastError.message}`);
            }
            // If this is not the last attempt, wait before retrying
            if (attempt < opts.maxRetries) {
                // Exponential backoff: increase delay with each retry
                const delay = opts.retryDelay * 2 ** (attempt - 1);
                if (opts.verbose) {
                    console.log(`Retrying in ${delay}ms...`);
                }
                await sleep(delay);
            }
        }
    }
    // All attempts failed: surface the last error's message and keep the
    // original error (and its stack) reachable through `cause`.
    throw new Error(`OCR failed after ${opts.maxRetries} attempts: ${lastError?.message ?? 'Unknown error'}`, { cause: lastError ?? undefined });
}