pdf-ocr-cli
Version:
A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification
46 lines (45 loc) • 1.77 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.splitPdf = splitPdf;
const pdf_lib_1 = require("pdf-lib");
/**
 * Splits a PDF into one single-page PDF per page.
 *
 * The input is loaded with pdf-lib; for each selected page a fresh document
 * is created, the page is copied into it, and the result is serialized.
 * Pages are processed sequentially, in document order, starting at page 0.
 *
 * @param {Buffer} pdfBuffer - Buffer containing the PDF data
 * @param {number} [maxPages] - Maximum number of pages to extract. Any falsy
 *   value (omitted, 0, NaN) means "no limit"; values larger than the page
 *   count are capped at the page count.
 * @returns {Promise<Buffer[]>} One Buffer per extracted page, each holding a
 *   single-page PDF.
 * @throws {Error} With message `Invalid PDF: <reason>` when loading, copying
 *   or saving fails. The originally thrown value is attached as `cause` so
 *   its stack and type are not lost.
 */
async function splitPdf(pdfBuffer, maxPages) {
  try {
    const pdfDoc = await pdf_lib_1.PDFDocument.load(pdfBuffer);
    const pageCount = pdfDoc.getPageCount();
    // Truthiness check is deliberate: a falsy maxPages selects every page.
    const pagesToProcess = maxPages ? Math.min(pageCount, maxPages) : pageCount;
    const pageBuffers = [];
    for (let i = 0; i < pagesToProcess; i++) {
      // Each page gets its own document so it can be saved independently.
      const newPdfDoc = await pdf_lib_1.PDFDocument.create();
      const [copiedPage] = await newPdfDoc.copyPages(pdfDoc, [i]);
      newPdfDoc.addPage(copiedPage);
      const newPdfBytes = await newPdfDoc.save();
      // save() output is wrapped in a Buffer to match the declared return type.
      pageBuffers.push(Buffer.from(newPdfBytes));
    }
    return pageBuffers;
  } catch (error) {
    // Keep the historical message format, but chain the original failure via
    // `cause` instead of discarding it.
    const detail = error instanceof Error ? error.message : 'Unknown error';
    throw new Error(`Invalid PDF: ${detail}`, { cause: error });
  }
}