UNPKG

pdf-ocr-cli

Version:

A CLI tool for OCR processing of PDF files using Mistral API with optional LLM verification

118 lines (117 loc) 4.62 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.textToPdf = textToPdf;
const pdf_lib_1 = require("pdf-lib");

/**
 * Measures the width of `text` at `fontSize`, tolerating text the font cannot measure.
 *
 * If measuring the whole string throws, the string is measured one character at a
 * time; any character that still cannot be measured is counted as `fontSize / 2`.
 * This is the same approximation `drawLine` uses when it skips a character, so
 * wrapping decisions stay consistent with what is actually drawn.
 *
 * @param font - Embedded font exposing `widthOfTextAtSize(text, size)`
 * @param text - Text to measure
 * @param fontSize - Font size in points
 * @returns Width of the text in points (approximate when the fallback is used)
 */
function measureTextWidth(font, text, fontSize) {
    try {
        return font.widthOfTextAtSize(text, fontSize);
    }
    catch (error) {
        // Deliberate best-effort: fall back to per-character measurement.
        let total = 0;
        for (const char of text) {
            try {
                total += font.widthOfTextAtSize(char, fontSize);
            }
            catch (charError) {
                total += fontSize / 2; // Approximate width for an unmeasurable character
            }
        }
        return total;
    }
}

/**
 * Splits text into lines that fit within `maxWidth`.
 *
 * Newlines start a new paragraph; whitespace-only paragraphs become a single empty
 * line. Words are never split: a word wider than `maxWidth` is placed on a line of
 * its own and is allowed to overflow.
 *
 * @param text - Text to wrap
 * @param font - Embedded font used for measuring
 * @param fontSize - Font size in points
 * @param maxWidth - Maximum line width in points
 * @returns Array of lines; always contains at least one entry
 */
function wrapTextToLines(text, font, fontSize, maxWidth) {
    const lines = [];
    for (const paragraph of text.split('\n')) {
        // Preserve blank lines between paragraphs
        if (paragraph.trim() === '') {
            lines.push('');
            continue;
        }
        let currentLine = '';
        for (const word of paragraph.split(/\s+/)) {
            // Leading/trailing whitespace produces empty tokens; skip them
            if (word === '') {
                continue;
            }
            const candidate = currentLine ? `${currentLine} ${word}` : word;
            if (measureTextWidth(font, candidate, fontSize) <= maxWidth) {
                currentLine = candidate;
                continue;
            }
            // Only flush a non-empty line: when the very first word is already too
            // wide, flushing would insert a spurious blank line before it.
            if (currentLine) {
                lines.push(currentLine);
            }
            currentLine = word;
        }
        // Add the last line of the paragraph if it's not empty
        if (currentLine) {
            lines.push(currentLine);
        }
    }
    return lines;
}

/**
 * Draws one line of text on a page, falling back to character-by-character drawing
 * when the line cannot be drawn as a whole.
 *
 * @param page - Page to draw on
 * @param line - Text of the line
 * @param x - Left position of the line in points
 * @param y - Vertical position passed to `drawText` in points
 * @param font - Embedded font
 * @param fontSize - Font size in points
 */
function drawLine(page, line, x, y, font, fontSize) {
    const color = (0, pdf_lib_1.rgb)(0, 0, 0);
    try {
        page.drawText(line, { x, y, size: fontSize, font, color });
    }
    catch (error) {
        // If we can't draw the text (e.g., due to unsupported characters),
        // try to draw each character individually
        let xPos = x;
        for (const char of line) {
            try {
                const charWidth = font.widthOfTextAtSize(char, fontSize);
                page.drawText(char, { x: xPos, y, size: fontSize, font, color });
                xPos += charWidth;
            }
            catch (charError) {
                // Skip characters that can't be drawn, leaving an approximate gap
                xPos += fontSize / 2;
            }
        }
    }
}

/**
 * Converts text to a PDF document
 *
 * The text is word-wrapped to the page width and flowed across as many pages as
 * needed. At least one page is always produced, even for empty input.
 *
 * @param text - Text to convert to PDF (null/undefined is treated as empty text)
 * @param options - PDF creation options: `width` (default 612), `height` (default 792),
 *   `fontSize` (default 12), `lineHeight` multiplier (default 1.2), `margin` (default 72)
 * @returns Buffer containing the PDF document
 */
async function textToPdf(text, options) {
    // Set default options
    const {
        width = 612, // Letter width (8.5 inches)
        height = 792, // Letter height (11 inches)
        fontSize = 12,
        lineHeight = 1.2,
        margin = 72, // 1 inch margin
    } = options || {};
    // Treat missing text as empty instead of crashing on `.split`
    const source = text == null ? '' : String(text);
    // Create a new PDF document
    const pdfDoc = await pdf_lib_1.PDFDocument.create();
    // Embed the standard Times-Roman font.
    // NOTE(review): the fallbacks in measureTextWidth/drawLine exist because some
    // characters apparently cannot be encoded by this font — confirm, and consider
    // embedding a custom font if full Unicode output is required.
    const font = await pdfDoc.embedFont(pdf_lib_1.StandardFonts.TimesRoman);
    // Calculate text layout parameters
    const effectiveWidth = width - 2 * margin;
    const lineHeightInPoints = fontSize * lineHeight;
    const fittingLines = Math.floor((height - 2 * margin) / lineHeightInPoints);
    // Always advance by at least one line per page; a zero, negative or NaN step
    // (e.g. margins larger than the page) would otherwise stall the paging loop.
    const linesPerPage = fittingLines >= 1 ? fittingLines : 1;
    // Handle newlines and split text into lines that fit within the page width
    const lines = wrapTextToLines(source, font, fontSize, effectiveWidth);
    // Split lines into pages. `lines` is never empty (empty text yields one empty
    // line), so this always adds at least one page.
    for (let i = 0; i < lines.length; i += linesPerPage) {
        const pageLines = lines.slice(i, i + linesPerPage);
        const page = pdfDoc.addPage([width, height]);
        // Add text to the page, top to bottom
        for (let j = 0; j < pageLines.length; j++) {
            const y = height - margin - j * lineHeightInPoints;
            drawLine(page, pageLines[j], margin, y, font, fontSize);
        }
    }
    // Save the PDF to a buffer
    const pdfBytes = await pdfDoc.save();
    return Buffer.from(pdfBytes);
}