// mindee
// Version:
// Mindee Client Library for Node.js
// 261 lines (260 loc) • 11 kB
// JavaScript
;
/**
 * Interop helper that exposes property `k` of module object `m` on `o` under the name `k2`
 * (`k2` defaults to `k`). An existing `this.__createBinding` is reused when present.
 *
 * When `Object.create` is available the property is installed with `Object.defineProperty`;
 * otherwise the current value is copied with a plain assignment.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Fall back to a live getter when `m` has no own descriptor for `k`, when the descriptor is an
// accessor on a module not flagged `__esModule`, or when it is a writable/configurable data property.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
// No descriptor support: copy the value as it is right now.
o[k2] = m[k];
}));
/**
 * Interop helper that stores `v` as the "default" member of `o`.
 * An existing `this.__setModuleDefault` is reused when present.
 *
 * With `Object.create` available the member is defined through `Object.defineProperty`
 * (enumerable, value only); otherwise it is set with a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
  if (Object.create) {
    return function (target, value) {
      var descriptor = { enumerable: true, value: value };
      Object.defineProperty(target, "default", descriptor);
    };
  }
  return function (target, value) {
    target["default"] = value;
  };
})();
/**
 * Interop helper that returns a namespace-style object for `mod`.
 * An existing `this.__importStar` is reused when present.
 *
 * Modules flagged `__esModule` are returned unchanged. Otherwise every own key except "default"
 * is bound onto a fresh object via `__createBinding`, and `mod` itself is attached as that
 * object's default via `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || (function () {
// On first use, picks `Object.getOwnPropertyNames` (or a `for...in` + `hasOwnProperty` fallback)
// and overwrites itself with that choice, so later calls go straight to the selected function.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` skips both null and undefined; "default" is handled separately below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
/**
 * Interop helper that guarantees the returned object has a "default" member.
 * An existing `this.__importDefault` is reused when present.
 *
 * Modules flagged `__esModule` are returned as-is; anything else is wrapped as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
  var isEsModule = mod && mod.__esModule;
  if (isEsModule) {
    return mod;
  }
  return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.compressPdf = compressPdf;
const logger_1 = require("../logger");
const tmp_1 = __importDefault(require("tmp"));
const pdfUtils_1 = require("./pdfUtils");
const fs = __importStar(require("node:fs"));
const node_poppler_1 = require("node-poppler");
const pdf_lib_1 = require("@cantoo/pdf-lib");
const imageOperations_1 = require("../imageOperations");
/**
 * Compresses each page of a provided PDF buffer.
 *
 * When the PDF contains source text and `forceSourceTextCompression` is false, nothing is
 * compressed and the input buffer is returned as-is. The input buffer is also returned when no
 * attempted quality yields a smaller result.
 * @param pdfData The input PDF as a Buffer.
 * @param imageQuality Compression quality (70-100 for most JPG images).
 * @param forceSourceTextCompression If true, compresses the PDF even when it contains source text.
 * @param disableSourceText If true, doesn't re-apply source text to the output PDF.
 * @returns A Promise resolving to the compressed PDF as a Buffer, or to `pdfData` itself when
 * compression is skipped or does not reduce the size.
 */
async function compressPdf(pdfData, imageQuality = 85, forceSourceTextCompression = false, disableSourceText = true) {
  if (await (0, pdfUtils_1.hasSourceText)(pdfData)) {
    if (!forceSourceTextCompression) {
      // The abort is driven by forceSourceTextCompression, so that is the flag the message names.
      logger_1.logger.warn("Found text inside of the provided PDF file. Compression operation aborted since "
        + "forceSourceTextCompression is set to 'false'.");
      return pdfData;
    }
    // Emit the source-text warnings exactly once, and only for documents that really contain text.
    handleCompressionWarnings(forceSourceTextCompression, disableSourceText);
  }
  // One extraction serves both purposes: page enumeration and (optionally) text re-application.
  const extractedPdfInfo = await (0, pdfUtils_1.extractTextFromPdf)(pdfData);
  // Only hand the text over when re-applying it is actually requested.
  const extractedText = disableSourceText ? null : extractedPdfInfo;
  const compressedPages = await compressPdfPages(pdfData, extractedPdfInfo, imageQuality, disableSourceText, extractedText);
  if (!compressedPages) {
    logger_1.logger.warn("Could not compress PDF to a smaller size. Returning original PDF.");
    return pdfData;
  }
  return createNewPdfFromCompressedPages(compressedPages);
}
/**
 * Logs the warning that matches the requested source-text handling.
 * Nothing is logged unless `forceSourceTextCompression` is set.
 * @param forceSourceTextCompression If true, attempts to re-write detected text.
 * @param disableSourceText If true, doesn't re-apply source text to the output PDF.
 */
function handleCompressionWarnings(forceSourceTextCompression, disableSourceText) {
  if (!forceSourceTextCompression) {
    return;
  }
  if (!disableSourceText) {
    logger_1.logger.warn("Re-writing PDF source-text is an EXPERIMENTAL feature.");
    return;
  }
  // This branch runs when disableSourceText is true, so the message must say so.
  logger_1.logger.warn("Source file contains text, but the disable_source_text is set to true. "
    + "Resulting file will not contain any embedded text.");
}
/**
 * Searches for an image quality at which the compressed pages are smaller than the original PDF.
 *
 * Starts at `imageQuality` and lowers it after every failed attempt (by a step interpolated
 * between 1 and 10 from the current quality) until an attempt succeeds or the quality falls
 * below 1. Every attempt re-compresses all pages.
 * @param pdfData The input PDF as a Buffer.
 * @param extractedPdfInfo Extracted PDF information.
 * @param imageQuality Initial compression quality.
 * @param disableSourceText If true, doesn't re-apply source text to the output PDF.
 * @param extractedText Extracted text from the PDF.
 * @returns A Promise resolving to an array of compressed page buffers, or null if compression fails.
 */
async function compressPdfPages(pdfData, extractedPdfInfo, imageQuality, disableSourceText, extractedText) {
  const MIN_QUALITY = 1;
  const originalSize = pdfData.length;
  let imageQualityLoop = imageQuality;
  while (imageQualityLoop >= MIN_QUALITY) {
    const compressedPages = await compressPagesWithQuality(pdfData, extractedPdfInfo, imageQualityLoop, disableSourceText, extractedText);
    const totalCompressedSize = calculateTotalCompressedSize(compressedPages);
    // Judge the attempt with the quality that actually produced these pages, not the initial one.
    if (isCompressionSuccessful(totalCompressedSize, originalSize, imageQualityLoop)) {
      return compressedPages;
    }
    imageQualityLoop -= Math.round(lerp(1, 10, imageQualityLoop / 100));
  }
  return null;
}
/**
 * Rasterizes and compresses every page of the PDF at one fixed quality.
 *
 * The number of pages processed is taken from `extractedPdfInfo.pages`; page numbers handed to
 * the rasterizer are 1-based.
 * NOTE(review): when text re-application is enabled, the text is drawn onto the document loaded
 * here, while only the compressed images are returned — confirm whether the drawn text is
 * expected to reach the final output.
 * @param pdfData The input PDF as a Buffer.
 * @param extractedPdfInfo Extracted PDF information.
 * @param imageQuality Compression quality.
 * @param disableSourceText If true, doesn't re-apply source text to the output PDF.
 * @param extractedText Extracted text from the PDF.
 * @returns A Promise resolving to an array of compressed page buffers.
 */
async function compressPagesWithQuality(pdfData, extractedPdfInfo, imageQuality, disableSourceText, extractedText) {
  const loadOptions = { ignoreEncryption: true, password: "" };
  const sourceDocument = await pdf_lib_1.PDFDocument.load(pdfData, loadOptions);
  const shouldReapplyText = !disableSourceText;
  const results = [];
  let pageIndex = 0;
  while (pageIndex < extractedPdfInfo.pages.length) {
    const sourcePage = sourceDocument.getPages()[pageIndex];
    const rasterOutput = await rasterizePage(pdfData, pageIndex + 1, imageQuality);
    // The rasterizer output is converted to a Buffer using the "binary" encoding.
    const rasterBuffer = Buffer.from(rasterOutput, "binary");
    const compressed = await (0, imageOperations_1.compressImage)(rasterBuffer, imageQuality);
    if (shouldReapplyText) {
      await addTextToPdfPage(sourcePage, extractedText);
    }
    results.push(compressed);
    pageIndex += 1;
  }
  return results;
}
/**
 * Adds up the `length` of every compressed page.
 * @param compressedPages Array of compressed page buffers.
 * @returns The summed length of all pages (0 for an empty array).
 */
function calculateTotalCompressedSize(compressedPages) {
  let totalBytes = 0;
  compressedPages.forEach((pageBuffer) => {
    totalBytes += pageBuffer.length;
  });
  return totalBytes;
}
/**
 * Decides whether a compression attempt beats the original file size.
 *
 * The compressed size is inflated by an overhead factor before the comparison. That factor is
 * interpolated from 0.54 (quality 0) to 0.18 (quality 100); per the original author's note, the
 * link between rasterization quality and the overhead of embedding the image into the PDF is
 * empirical, and the interpolation is there to keep results consistent.
 *
 * @param totalCompressedSize Total size of compressed pages.
 * @param originalSize Original PDF size.
 * @param imageQuality Compression quality.
 * @returns True if the inflated compressed size is strictly smaller than the original size.
 */
function isCompressionSuccessful(totalCompressedSize, originalSize, imageQuality) {
  const qualityRatio = imageQuality / 100;
  const overheadFactor = lerp(0.54, 0.18, qualityRatio);
  const estimatedOverhead = totalCompressedSize * overheadFactor;
  return totalCompressedSize + estimatedOverhead < originalSize;
}
/**
 * Builds a fresh PDF in which every page is one embedded JPEG.
 * Each page is sized to its image and the image is drawn from the page origin at full size.
 * @param compressedPages Array of compressed page buffers.
 * @returns A Promise resolving to the new PDF as a Buffer.
 */
async function createNewPdfFromCompressedPages(compressedPages) {
  const outputDocument = await pdf_lib_1.PDFDocument.create();
  for (const jpegBytes of compressedPages) {
    const embeddedImage = await outputDocument.embedJpg(jpegBytes);
    const { width, height } = embeddedImage;
    const targetPage = outputDocument.addPage([width, height]);
    targetPage.drawImage(embeddedImage, { x: 0, y: 0, width, height });
  }
  const savedBytes = await outputDocument.save();
  return Buffer.from(savedBytes);
}
/**
 * Draws extracted text fragments onto a PDF page.
 *
 * Every fragment of every entry in `textInfo.pages` is drawn in black at its recorded position,
 * using the fragment's height as the font size. Fonts are resolved once per distinct font name
 * instead of once per fragment.
 * NOTE(review): fragments from all entries of `textInfo.pages` are drawn onto the single `page`
 * passed in — confirm this is intended rather than only the matching page's text.
 * @param page The page to draw on.
 * @param textInfo Extracted text information; nothing is drawn when it is null or undefined.
 */
async function addTextToPdfPage(page, textInfo) {
  if (textInfo == null) {
    return;
  }
  // Font resolution is asynchronous and allocates, so reuse the result for repeated font names.
  const fontsByName = new Map();
  for (const textPages of textInfo.pages) {
    for (const textPage of textPages.content) {
      if (!fontsByName.has(textPage.fontName)) {
        fontsByName.set(textPage.fontName, await getFontFromName(textPage.fontName));
      }
      page.drawText(textPage.str, {
        x: textPage.x,
        y: textPage.y,
        size: textPage.height,
        color: (0, pdf_lib_1.rgb)(0, 0, 0),
        font: fontsByName.get(textPage.fontName),
      });
    }
  }
}
/**
 * Resolves a font name to an embedded font.
 *
 * A name that matches one of the `StandardFonts` values is embedded as-is; any other name falls
 * back to Helvetica.
 * NOTE(review): the font is embedded into a document created inside this function, not into the
 * document owning the page that will draw with it — confirm the PDF library accepts this.
 * @param fontName Name of the font to look up.
 * @returns A Promise resolving to the embedded font.
 */
async function getFontFromName(fontName) {
  const scratchDocument = await pdf_lib_1.PDFDocument.create();
  const isStandardFont = Object.values(pdf_lib_1.StandardFonts)
    .some((candidate) => candidate.toString() === fontName);
  const fontToEmbed = isStandardFont ? fontName : pdf_lib_1.StandardFonts.Helvetica;
  return scratchDocument.embedFont(fontToEmbed);
}
/**
 * Rasterizes a single PDF page to JPEG.
 *
 * The PDF is written to a temporary file, converted with Poppler's pdfToCairo restricted to the
 * requested page, and the temporary file is released afterwards whether or not the conversion
 * succeeded.
 *
 * @param pdfData Buffer representation of the entire PDF file.
 * @param index Number of the page to rasterize, passed to Poppler as both first and last page.
 * @param quality Quality to apply during rasterization.
 * @returns A Promise resolving to the pdfToCairo output for the page.
 * @throws Re-throws any error raised while writing the temporary file or converting the page,
 * after logging it.
 */
async function rasterizePage(pdfData, index, quality = 85) {
  const poppler = new node_poppler_1.Poppler();
  const tmpPdf = tmp_1.default.fileSync();
  const tempPdfPath = tmpPdf.name;
  const antialiasOption = "best";
  try {
    await fs.promises.writeFile(tempPdfPath, pdfData);
    const options = {
      antialias: antialiasOption,
      firstPageToConvert: index,
      lastPageToConvert: index,
      jpegFile: true,
      jpegOptions: `quality=${quality}`,
      singleFile: true
    };
    // No explicit unlink here: removeCallback() in `finally` is the single cleanup point, so a
    // failed delete can no longer discard an otherwise successful conversion.
    return await poppler.pdfToCairo(tempPdfPath, undefined, options);
  }
  catch (error) {
    logger_1.logger.error("Error rasterizing PDF:", error);
    throw error;
  }
  finally {
    tmpPdf.removeCallback();
  }
}
/**
 * Linearly blends two numbers.
 * @param start Value returned when `t` is 0.
 * @param end Value returned when `t` is 1.
 * @param t The interpolation factor (0 to 1).
 * @returns `start` weighted by `1 - t` plus `end` weighted by `t`.
 */
function lerp(start, end, t) {
  const startShare = start * (1 - t);
  const endShare = end * t;
  return startShare + endShare;
}