/*
 * pdf-ocr-cli
 * Version: (not captured)
 * A CLI tool for OCR processing of PDF files using the Mistral API, with optional LLM verification.
 * 71 lines (70 loc) • 2.68 kB
 * JavaScript
 */
;
// Module interop helper: reuses an implementation already attached to `this`
// when one exists; otherwise returns values flagged `__esModule` unchanged and
// wraps everything else as `{ default: value }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.renderPdfToPng = renderPdfToPng;
const pdf_lib_1 = require("pdf-lib");
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const os_1 = __importDefault(require("os"));
const pdf2pic_1 = require("pdf2pic");
const uuid_1 = require("uuid");
/**
 * Renders a single-page PDF to a PNG image.
 *
 * The PDF is first loaded with pdf-lib to verify it has exactly one page, then
 * page 1 is converted by pdf2pic into a uniquely named temporary directory under
 * the OS temp dir. The resulting file is read into memory, and the temporary
 * directory is removed whether rendering succeeded or failed.
 *
 * @param {Buffer} pdfBuffer - Buffer containing a single-page PDF
 * @param {number} [resolution=150] - DPI resolution for rendering (passed to pdf2pic as `density`)
 * @returns {Promise<Buffer>} Resolves with a Buffer containing the PNG image
 * @throws {Error} 'Input PDF must be a single page' if the page count is not 1;
 *   any other failure (loading, rendering, reading the output) is rethrown as
 *   `Invalid PDF: <reason>` with the underlying error attached as `cause`
 */
async function renderPdfToPng(pdfBuffer, resolution = 150) {
    const SINGLE_PAGE_MESSAGE = 'Input PDF must be a single page';
    // Set only once the temp path has been chosen, so the `finally` block knows
    // whether there is anything to clean up.
    let tempDir;
    try {
        // Load the PDF document to check page count
        const pdfDoc = await pdf_lib_1.PDFDocument.load(pdfBuffer);
        // Ensure it's a single-page PDF
        if (pdfDoc.getPageCount() !== 1) {
            throw new Error(SINGLE_PAGE_MESSAGE);
        }
        // Create a temporary directory with a unique name to avoid collisions
        tempDir = path_1.default.join(os_1.default.tmpdir(), `pdf-ocr-${(0, uuid_1.v4)()}`);
        fs_1.default.mkdirSync(tempDir, { recursive: true });
        // Configure pdf2pic
        // NOTE(review): the original comments call width/height "max" values; confirm
        // how pdf2pic applies them (and whether aspect ratio is preserved).
        const pdf2picOptions = {
            density: resolution,
            savePath: tempDir,
            format: "png",
            width: 2000,
            height: 2000
        };
        // Convert page 1 of the PDF to PNG
        const converter = (0, pdf2pic_1.fromBuffer)(pdfBuffer, pdf2picOptions);
        const result = await converter(1);
        if (!result || !result.path) {
            throw new Error('Failed to render PDF to PNG');
        }
        // Read the PNG into memory; the file itself is removed in `finally`
        return fs_1.default.readFileSync(result.path);
    }
    catch (error) {
        if (error instanceof Error) {
            if (error.message === SINGLE_PAGE_MESSAGE) {
                throw error; // Re-throw our own error untouched
            }
            // Keep the historical message format, but preserve the original error
            // (and its stack) for debugging.
            throw new Error(`Invalid PDF: ${error.message}`, { cause: error });
        }
        throw new Error('Invalid PDF: Unknown error', { cause: error });
    }
    finally {
        // Always remove the temp directory, including on failure paths. Cleanup is
        // best-effort: a failure here is logged and never masks the real outcome.
        if (tempDir !== undefined) {
            try {
                fs_1.default.rmSync(tempDir, { recursive: true, force: true });
            }
            catch (cleanupError) {
                console.warn('Failed to clean up temporary files:', cleanupError);
            }
        }
    }
}