UNPKG

mindee

Version:

Mindee Client Library for Node.js

56 lines (55 loc) 1.84 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractTextFromPdf = extractTextFromPdf;
exports.hasSourceText = hasSourceText;
const pdf_js_extract_1 = require("pdf.js-extract");
const mindeeError_1 = require("../errors/mindeeError");

/**
 * Joins the `str` field of every content item of every page into one string.
 *
 * Items are taken in page order, then in item order within each page, and are
 * separated by a single space.
 *
 * @param pages Array of page objects, each exposing a `content` array of items with a `str` field.
 * @returns The concatenated text; an empty string when there are no items.
 */
function getConcatenatedText(pages) {
  return pages.flatMap(page => page.content.map(item => item.str)).join(" ");
}

/**
 * Extracts text from a full PDF document.
 *
 * @param pdfBuffer PDF handle, as a buffer.
 * @returns A Promise resolving to an object with:
 *   - `pages`: one entry per page, holding a 1-based `pageNumber` and a `content`
 *     array of text items (`str`, `x`, `y`, `width`, `height`, `fontName`);
 *   - `getConcatenatedText()`: returns all item strings joined with spaces.
 * @throws Rejects with the error reported by the extractor, unchanged, or with a
 *   `MindeePdfError` when the extractor reports no error but yields no result.
 */
async function extractTextFromPdf(pdfBuffer) {
  const pdfExtract = new pdf_js_extract_1.PDFExtract();
  const options = {};
  // Adapt the callback-style extractBuffer API to a Promise.
  const pdf = await new Promise((resolve, reject) => {
    pdfExtract.extractBuffer(pdfBuffer, options, (err, result) => {
      if (err) {
        // Stop here: without the return, control would fall through and build
        // an unrelated MindeePdfError on an already-rejected promise.
        reject(err);
        return;
      }
      // `== null` covers both undefined and null, so a null result fails with a
      // meaningful error instead of a TypeError on `pdf.pages` below.
      if (result == null) {
        reject(new mindeeError_1.MindeePdfError("Couldn't process result."));
        return;
      }
      resolve(result);
    });
  });
  // Copy only the fields we expose, and number the pages starting from 1.
  const pages = pdf.pages.map((page, index) => ({
    pageNumber: index + 1,
    content: page.content.map(item => ({
      str: item.str,
      x: item.x,
      y: item.y,
      width: item.width,
      height: item.height,
      fontName: item.fontName,
    })),
  }));
  return {
    pages,
    getConcatenatedText: () => getConcatenatedText(pages),
  };
}

/**
 * Checks if a PDF contains source text.
 *
 * @param pdfData Buffer representing the content of the PDF file.
 *
 * @returns A Promise containing a boolean: `true` when the concatenated text of the
 *   document, once trimmed of surrounding whitespace, is not empty.
 */
async function hasSourceText(pdfData) {
  const text = await extractTextFromPdf(pdfData);
  return text.getConcatenatedText().trim().length > 0;
}