UNPKG

mindee

Version:

Mindee Client Library for Node.js

261 lines (260 loc) 11 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.compressPdf = compressPdf; const logger_1 = require("../logger"); const tmp_1 = __importDefault(require("tmp")); const pdfUtils_1 = require("./pdfUtils"); const fs = __importStar(require("node:fs")); const node_poppler_1 = require("node-poppler"); const pdf_lib_1 = require("@cantoo/pdf-lib"); const imageOperations_1 = require("../imageOperations"); /** * Compresses each page of a provided PDF buffer. * @param pdfData The input PDF as a Buffer. * @param imageQuality Compression quality (70-100 for most JPG images). * @param forceSourceTextCompression If true, attempts to re-write detected text. * @param disableSourceText If true, doesn't re-apply source text to the output PDF. * @returns A Promise resolving to the compressed PDF as a Buffer. */ async function compressPdf(pdfData, imageQuality = 85, forceSourceTextCompression = false, disableSourceText = true) { handleCompressionWarnings(forceSourceTextCompression, disableSourceText); if (await (0, pdfUtils_1.hasSourceText)(pdfData)) { if (forceSourceTextCompression) { if (!disableSourceText) { logger_1.logger.warn("Re-writing PDF source-text is an EXPERIMENTAL feature."); } else { logger_1.logger.warn("Source file contains text, but disable_source_text flag. " + "is set to false. Resulting file will not contain any embedded text."); } } else { logger_1.logger.warn("Found text inside of the provided PDF file. Compression operation aborted since disableSourceText " + "is set to 'true'."); return pdfData; } } const extractedText = disableSourceText ? await (0, pdfUtils_1.extractTextFromPdf)(pdfData) : null; const extractedPdfInfo = await (0, pdfUtils_1.extractTextFromPdf)(pdfData); const compressedPages = await compressPdfPages(pdfData, extractedPdfInfo, imageQuality, disableSourceText, extractedText); if (!compressedPages) { logger_1.logger.warn("Could not compress PDF to a smaller size. Returning original PDF."); return pdfData; } return createNewPdfFromCompressedPages(compressedPages); } /** * Handles compression warnings based on the provided parameters. * @param forceSourceTextCompression If true, attempts to re-write detected text. * @param disableSourceText If true, doesn't re-apply source text to the output PDF. */ function handleCompressionWarnings(forceSourceTextCompression, disableSourceText) { if (forceSourceTextCompression) { if (!disableSourceText) { logger_1.logger.warn("Re-writing PDF source-text is an EXPERIMENTAL feature."); } else { logger_1.logger.warn("Source file contains text, but the disable_source_text is set to false. " + "Resulting file will not contain any embedded text."); } } } /** * Compresses PDF pages and returns an array of compressed page buffers. * @param pdfData The input PDF as a Buffer. * @param extractedPdfInfo Extracted PDF information. * @param imageQuality Initial compression quality. * @param disableSourceText If true, doesn't re-apply source text to the output PDF. * @param extractedText Extracted text from the PDF. * @returns A Promise resolving to an array of compressed page buffers, or null if compression fails. */ async function compressPdfPages(pdfData, extractedPdfInfo, imageQuality, disableSourceText, extractedText) { const originalSize = pdfData.length; const MIN_QUALITY = 1; let imageQualityLoop = imageQuality; while (imageQualityLoop >= MIN_QUALITY) { const compressedPages = await compressPagesWithQuality(pdfData, extractedPdfInfo, imageQualityLoop, disableSourceText, extractedText); const totalCompressedSize = calculateTotalCompressedSize(compressedPages); if (isCompressionSuccessful(totalCompressedSize, originalSize, imageQuality)) { return compressedPages; } imageQualityLoop -= Math.round(lerp(1, 10, imageQualityLoop / 100)); } return null; } /** * Compresses pages with a specific quality. * @param pdfData The input PDF as a Buffer. * @param extractedPdfInfo Extracted PDF information. * @param imageQuality Compression quality. * @param disableSourceText If true, doesn't re-apply source text to the output PDF. * @param extractedText Extracted text from the PDF. * @returns A Promise resolving to an array of compressed page buffers. */ async function compressPagesWithQuality(pdfData, extractedPdfInfo, imageQuality, disableSourceText, extractedText) { const pdfDoc = await pdf_lib_1.PDFDocument.load(pdfData, { ignoreEncryption: true, password: "" }); const compressedPages = []; for (let i = 0; i < extractedPdfInfo.pages.length; i++) { const page = pdfDoc.getPages()[i]; const rasterizedPage = await rasterizePage(pdfData, i + 1, imageQuality); const compressedImage = await (0, imageOperations_1.compressImage)(Buffer.from(rasterizedPage, "binary"), imageQuality); if (!disableSourceText) { await addTextToPdfPage(page, extractedText); } compressedPages.push(compressedImage); } return compressedPages; } /** * Calculates the total size of compressed pages. * @param compressedPages Array of compressed page buffers. * @returns The total size of compressed pages. */ function calculateTotalCompressedSize(compressedPages) { return compressedPages.reduce((sum, page) => sum + page.length, 0); } /** * Checks if the compression was successful based on the compressed size and original size. * Note: Not quite sure how or why the rasterization quality ratio is correlated with the overhead generated by the * image's inclusion into the pdf data, but this makes the following lerp() necessary if we want consistency during * compression. * * @param totalCompressedSize Total size of compressed pages. * @param originalSize Original PDF size. * @param imageQuality Compression quality. * @returns True if compression was successful, false otherwise. */ function isCompressionSuccessful(totalCompressedSize, originalSize, imageQuality) { const overhead = lerp(0.54, 0.18, imageQuality / 100); return totalCompressedSize + totalCompressedSize * overhead < originalSize; } /** * Creates a new PDF document from compressed page buffers. * @param compressedPages Array of compressed page buffers. * @returns A Promise resolving to the new PDF as a Buffer. */ async function createNewPdfFromCompressedPages(compressedPages) { const newPdfDoc = await pdf_lib_1.PDFDocument.create(); for (const compressedPage of compressedPages) { const image = await newPdfDoc.embedJpg(compressedPage); const newPage = newPdfDoc.addPage([image.width, image.height]); newPage.drawImage(image, { x: 0, y: 0, width: image.width, height: image.height, }); } const compressedPdfBytes = await newPdfDoc.save(); return Buffer.from(compressedPdfBytes); } async function addTextToPdfPage(page, textInfo) { if (textInfo === null) { return; } for (const textPages of textInfo.pages) { for (const textPage of textPages.content) { page.drawText(textPage.str, { x: textPage.x, y: textPage.y, size: textPage.height, color: (0, pdf_lib_1.rgb)(0, 0, 0), font: await getFontFromName(textPage.fontName) }); } } } async function getFontFromName(fontName) { const pdfDoc = await pdf_lib_1.PDFDocument.create(); let font; if (Object.values(pdf_lib_1.StandardFonts).map(value => value.toString()).includes(fontName)) { font = await pdfDoc.embedFont(fontName); } else { font = await pdfDoc.embedFont(pdf_lib_1.StandardFonts.Helvetica); } return font; } /** * Rasterizes a PDF page. * * @param pdfData Buffer representation of the entire PDF file. * @param index Index of the page to rasterize. * @param quality Quality to apply during rasterization. */ async function rasterizePage(pdfData, index, quality = 85) { const poppler = new node_poppler_1.Poppler(); const tmpPdf = tmp_1.default.fileSync(); const tempPdfPath = tmpPdf.name; const antialiasOption = "best"; try { await fs.promises.writeFile(tempPdfPath, pdfData); const options = { antialias: antialiasOption, firstPageToConvert: index, lastPageToConvert: index, jpegFile: true, jpegOptions: `quality=${quality}`, singleFile: true }; const jpegBuffer = await poppler.pdfToCairo(tempPdfPath, undefined, options); await fs.promises.unlink(tempPdfPath); return jpegBuffer; } catch (error) { logger_1.logger.error("Error rasterizing PDF:", error); throw error; } finally { tmpPdf.removeCallback(); } } /** * Performs linear interpolation between two numbers. * @param start The starting value. * @param end The ending value. * @param t The interpolation factor (0 to 1). * @returns The interpolated value. */ function lerp(start, end, t) { return start * (1 - t) + end * t; }