mindee
Version:
Mindee Client Library for Node.js
74 lines (73 loc) • 3.12 kB
JavaScript
// Flag the exports object with `__esModule` and expose `extractReceipts`.
// The function is declared further down; function declarations are hoisted,
// so the assignment is valid here.
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractReceipts = extractReceipts;
const pdf_lib_1 = require("@cantoo/pdf-lib");
const errors_1 = require("../../errors");
const extractedMultiReceiptImage_1 = require("./extractedMultiReceiptImage");
const common_1 = require("../common");
/**
 * Cuts every receipt delimited by `boundingBoxes` out of a single page and wraps each result in an
 * ExtractedMultiReceiptImage.
 *
 * @param pdfPage PDF Page to extract from.
 * @param boundingBoxes A set of coordinates delimiting the position of each receipt.
 * @param pageId Id of the page the receipts are extracted from. Caution: this starts at 0, unlike the
 * numbering in PDF pages.
 * @returns One ExtractedMultiReceiptImage per raw extraction, in the order the extractions were returned.
 */
async function extractReceiptsFromPage(pdfPage, boundingBoxes, pageId) {
  const rawExtractions = await (0, common_1.extractFromPage)(pdfPage, boundingBoxes);
  // The position of an extraction within the page doubles as its receipt id.
  return Array.from(
    rawExtractions,
    (rawExtraction, receiptId) =>
      new extractedMultiReceiptImage_1.ExtractedMultiReceiptImage(rawExtraction, pageId, receiptId)
  );
}
/**
 * Builds a PDF document from the input file, whatever its supported type.
 *
 * PDF inputs are loaded as-is (with encryption ignored and an empty password); PNG/JPEG inputs are
 * embedded into a freshly created single-page document whose page has the dimensions of the image.
 *
 * @param inputFile Input to load; its `mimeType`, `isPdf()` and `fileObject` members are read.
 * @returns The loaded or newly created PDF document.
 * @throws MindeeMimeTypeError when the MIME type is not one of the supported image/PDF types.
 */
async function loadPdfDoc(inputFile) {
  const supportedMimeTypes = ["image/jpeg", "image/jpg", "image/png", "application/pdf"];
  if (!supportedMimeTypes.includes(inputFile.mimeType)) {
    throw new errors_1.MindeeMimeTypeError('Unsupported file type "' +
      inputFile.mimeType +
      '" Currently supported types are .png, .jpg and .pdf');
  }

  if (inputFile.isPdf()) {
    return await pdf_lib_1.PDFDocument.load(inputFile.fileObject, {
      ignoreEncryption: true,
      password: ""
    });
  }

  // Image input: wrap it in a one-page PDF so the rest of the pipeline only deals with PDF pages.
  const imageDoc = await pdf_lib_1.PDFDocument.create();
  const embeddedImage = inputFile.mimeType === "image/png"
    ? await imageDoc.embedPng(inputFile.fileObject)
    : await imageDoc.embedJpg(inputFile.fileObject);
  const { width, height } = embeddedImage.scale(1);
  imageDoc.addPage([width, height]).drawImage(embeddedImage);
  return imageDoc;
}
/**
 * Extracts individual receipts from multi-receipts documents.
 *
 * Each page of the input is matched by index with the page-level predictions in `inference.pages`.
 * Pages that have no page-level receipts prediction (for instance when the inference covers fewer
 * pages than the file contains) are skipped instead of failing the whole extraction.
 *
 * @param inputFile File to extract sub-receipts from.
 * @param inference Results of the inference.
 * @returns Individual extracted receipts as an array of ExtractedMultiReceiptImage.
 * @throws MindeeError when the document-level prediction holds no receipts candidates.
 */
async function extractReceipts(inputFile, inference) {
  if (!inference.prediction.receipts) {
    throw new errors_1.MindeeError("No possible receipts candidates found for MultiReceipts extraction.");
  }
  const pagePredictions = inference.pages || [];
  const images = [];
  const pdfDoc = await loadPdfDoc(inputFile);
  for (let pageId = 0; pageId < pdfDoc.getPageCount(); pageId++) {
    const pageInference = pagePredictions[pageId];
    const pageReceipts = pageInference && pageInference.prediction && pageInference.prediction.receipts;
    if (!pageReceipts) {
      // Nothing was predicted for this page: there is nothing to cut out of it.
      continue;
    }
    const [page] = await pdfDoc.copyPages(pdfDoc, [pageId]);
    const receiptPositions = pageReceipts.map((receipt) => receipt.boundingBox);
    const extractedReceipts = await extractReceiptsFromPage(page, receiptPositions, pageId);
    images.push(...extractedReceipts);
  }
  return images;
}