UNPKG

pdf-ocr-cli

Version:

A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification

71 lines (70 loc) 2.68 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.renderPdfToPng = renderPdfToPng;
const pdf_lib_1 = require("pdf-lib");
const fs_1 = __importDefault(require("fs"));
const path_1 = __importDefault(require("path"));
const os_1 = __importDefault(require("os"));
const pdf2pic_1 = require("pdf2pic");
const uuid_1 = require("uuid");
/**
 * Best-effort removal of the scratch directory used for rendering.
 *
 * Removes the directory together with anything inside it. A failure here is
 * only logged: it must never mask the render result or the render error.
 *
 * @param tempDir - Absolute path of the scratch directory to delete
 */
function removeTempDir(tempDir) {
    try {
        // `force: true` makes a missing directory a no-op (e.g. when mkdir
        // itself failed); `recursive: true` also removes the rendered file.
        fs_1.default.rmSync(tempDir, { recursive: true, force: true });
    }
    catch (cleanupError) {
        console.warn('Failed to clean up temporary files:', cleanupError);
    }
}
/**
 * Renders a single-page PDF to a PNG image.
 *
 * The PDF is first loaded with pdf-lib to validate it and check the page
 * count, then page 1 is rendered by pdf2pic into a uniquely named scratch
 * directory under the OS temp dir. The resulting file is read back into
 * memory and the scratch directory is removed whether rendering succeeded
 * or failed.
 *
 * @param pdfBuffer - Buffer containing a single-page PDF
 * @param resolution - DPI resolution for rendering (default: 150)
 * @returns Buffer containing the PNG image
 * @throws Error 'Input PDF must be a single page' if the page count is not 1
 * @throws Error 'Invalid PDF: <reason>' for any other failure (load, render,
 *   or file access); the underlying error is attached as `cause`
 */
async function renderPdfToPng(pdfBuffer, resolution = 150) {
    // Set only once the scratch path has been chosen, so `finally` knows
    // whether there is anything to clean up.
    let tempDir;
    try {
        // Load the PDF document to validate it and check the page count
        const pdfDoc = await pdf_lib_1.PDFDocument.load(pdfBuffer);
        if (pdfDoc.getPageCount() !== 1) {
            throw new Error('Input PDF must be a single page');
        }
        // Create a uniquely named scratch directory for pdf2pic's output
        tempDir = path_1.default.join(os_1.default.tmpdir(), `pdf-ocr-${(0, uuid_1.v4)()}`);
        fs_1.default.mkdirSync(tempDir, { recursive: true });
        // Configure pdf2pic
        const pdf2picOptions = {
            density: resolution,
            savePath: tempDir,
            format: "png",
            width: 2000, // Max width
            height: 2000 // Max height
        };
        // Convert page 1 of the PDF to a PNG file inside tempDir
        const converter = (0, pdf2pic_1.fromBuffer)(pdfBuffer, pdf2picOptions);
        const result = await converter(1);
        if (!result || !result.path) {
            throw new Error('Failed to render PDF to PNG');
        }
        // Read the PNG into memory; the file itself is removed in `finally`
        return fs_1.default.readFileSync(result.path);
    }
    catch (error) {
        if (error instanceof Error) {
            if (error.message === 'Input PDF must be a single page') {
                throw error; // Re-throw our own error untouched
            }
            // Keep the historical message, but preserve the original error
            // (and its stack) for debugging via `cause`.
            throw new Error(`Invalid PDF: ${error.message}`, { cause: error });
        }
        throw new Error('Invalid PDF: Unknown error', { cause: error });
    }
    finally {
        // Runs on success and on every failure path, so a failed render no
        // longer leaks the scratch directory.
        if (tempDir !== undefined) {
            removeTempDir(tempDir);
        }
    }
}