UNPKG

pdf-ocr-cli

Version:

A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification

github.com/luandro/pdf-ocr

luandro/pdf-ocr

46 lines (45 loc) • 1.77 kB

JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.splitPdf = splitPdf;
const pdf_lib_1 = require("pdf-lib");
/**
 * Splits a PDF buffer into individual single-page PDF documents.
 *
 * Pages are processed one at a time, in document order, starting from the
 * first page.
 *
 * @param pdfBuffer - Buffer containing the PDF data
 * @param maxPages - Maximum number of pages to extract (optional). Any falsy
 *   value (undefined, 0, NaN) means "no limit", so every page is extracted.
 *   A value larger than the page count is clamped to the page count.
 * @returns Promise resolving to an array of Buffers, each containing a
 *   single-page PDF, in the same order as the source pages
 * @throws Error with a message prefixed by "Invalid PDF: " if loading,
 *   copying or saving fails; the originally thrown value is attached as the
 *   error's `cause` so its stack trace is not lost
 */
async function splitPdf(pdfBuffer, maxPages) {
    try {
        // Load the PDF document
        const pdfDoc = await pdf_lib_1.PDFDocument.load(pdfBuffer);
        // Get the total number of pages
        const pageCount = pdfDoc.getPageCount();
        // Determine how many pages to process. The truthiness check is kept on
        // purpose: a falsy maxPages (including 0) has always meant "all pages".
        const pagesToProcess = maxPages ? Math.min(pageCount, maxPages) : pageCount;
        // Create an array to store the individual page buffers
        const pageBuffers = [];
        // Process each page sequentially so that only one single-page document
        // is being built at any given time
        for (let i = 0; i < pagesToProcess; i++) {
            // Create a new document for this page
            const newPdfDoc = await pdf_lib_1.PDFDocument.create();
            // Copy the page from the original document
            const [copiedPage] = await newPdfDoc.copyPages(pdfDoc, [i]);
            newPdfDoc.addPage(copiedPage);
            // Save the new document and convert the resulting bytes to a Buffer
            const newPdfBytes = await newPdfDoc.save();
            pageBuffers.push(Buffer.from(newPdfBytes));
        }
        return pageBuffers;
    }
    catch (error) {
        // Wrap the failure in a uniform message, but keep the original thrown
        // value reachable through `cause` instead of discarding it.
        const reason = error instanceof Error ? error.message : 'Unknown error';
        throw new Error(`Invalid PDF: ${reason}`, { cause: error });
    }
}