UNPKG

pomljs

Version:

Prompt Orchestration Markup Language

79 lines (73 loc) 2.56 kB
'use strict';

const PDFJS = require('pdfjs-dist/legacy/build/pdf.js');

/**
 * Build a frozen, prototype-less namespace object for a CommonJS export.
 *
 * Every own key of `e` except `default` is re-exposed as an enumerable
 * property (reusing the original accessor when the source property already
 * has a getter, otherwise installing a live getter that reads `e[k]`).
 * The `default` property always points at `e` itself.
 *
 * @param {object} e - The value returned by `require`.
 * @returns {object} A frozen namespace object.
 */
function _interopNamespaceDefault(e) {
  const n = Object.create(null);
  if (e) {
    Object.keys(e).forEach(function (k) {
      if (k !== 'default') {
        const d = Object.getOwnPropertyDescriptor(e, k);
        Object.defineProperty(
          n,
          k,
          d.get
            ? d
            : {
                enumerable: true,
                get: function () {
                  return e[k];
                },
              },
        );
      }
    });
  }
  n.default = e;
  return Object.freeze(n);
}

const PDFJS__namespace = /*#__PURE__*/ _interopNamespaceDefault(PDFJS);

// Temporarily filter console.log so that messages matching the
// "Cannot polyfill `DOMMatrix`/`Path2D`" pattern are dropped while pdfjs is
// being configured. Everything else is forwarded to the original logger.
// NOTE(review): the `require` above has already executed by the time the
// filter is installed, so only messages logged inside this window are
// suppressed -- confirm that this is the intended scope.
const originalLog = console.log;
let pdfjs;
console.log = (message, ...rest) =>
  /Cannot polyfill `(DOMMatrix|Path2D)`/.test(message) ? null : originalLog(message, ...rest);
try {
  // When `GlobalWorkerOptions` is missing from the namespace, the real API
  // lives on `.default` (ESM interop); otherwise the namespace is used
  // directly (CommonJS).
  pdfjs =
    PDFJS__namespace.GlobalWorkerOptions === undefined
      ? PDFJS__namespace.default
      : PDFJS__namespace;
  pdfjs.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/legacy/build/pdf.worker.js';
} finally {
  // Always restore the original console.log, even if the setup above throws.
  console.log = originalLog;
}

/**
 * Count the pages of a PDF document.
 *
 * @param {ArrayBuffer|ArrayLike<number>} pdfBuffer - Raw PDF bytes; passed
 *   through `new Uint8Array(...)` before being handed to pdfjs.
 * @returns {Promise<number>} The document's `numPages` value.
 */
async function getNumPages(pdfBuffer) {
  const loadingTask = pdfjs.getDocument({ data: new Uint8Array(pdfBuffer) });
  try {
    const pdfDocument = await loadingTask.promise;
    return pdfDocument.numPages;
  } finally {
    // Release the document/worker resources held by the loading task.
    await loadingTask.destroy();
  }
}

/**
 * Extract the text of a PDF, page by page, joined by blank lines.
 *
 * @param {ArrayBuffer|ArrayLike<number>} pdfBuffer - Raw PDF bytes; passed
 *   through `new Uint8Array(...)` before being handed to pdfjs.
 * @param {number} [maxPages] - Upper bound on the number of pages to read,
 *   starting from page 1. When `null`/`undefined`, every page is read; larger
 *   values are clamped to the document's page count.
 * @returns {Promise<string>} The per-page texts joined with `'\n\n'`.
 */
async function pdfParse(pdfBuffer, maxPages) {
  const loadingTask = pdfjs.getDocument({ data: new Uint8Array(pdfBuffer) });
  try {
    const pdfDocument = await loadingTask.promise;
    const pageLimit =
      maxPages == null ? pdfDocument.numPages : Math.min(maxPages, pdfDocument.numPages);

    const fullTexts = [];
    // Pages are processed one at a time, in order (pdfjs pages are 1-based).
    for (let pageNum = 1; pageNum <= pageLimit; pageNum++) {
      const page = await pdfDocument.getPage(pageNum);
      fullTexts.push(await extractTextFromPage(page));
    }
    return fullTexts.join('\n\n');
  } finally {
    // Release the document/worker resources held by the loading task.
    await loadingTask.destroy();
  }
}

/**
 * Concatenate the text items of a single page.
 *
 * Consecutive items whose `transform[5]` value (tracked here as the item's Y
 * value) is equal are appended directly; a change in that value starts a new
 * line.
 *
 * @param {object} page - A pdfjs page object exposing `getTextContent()`.
 * @returns {Promise<string>} The page text with `'\n'` between lines.
 */
async function extractTextFromPage(page) {
  const textContent = await page.getTextContent();
  let lastY;
  let text = '';
  for (const item of textContent.items) {
    const y = item.transform[5];
    // Compare against `undefined` explicitly: a previous Y of 0 is a valid
    // value and must still trigger a line break when the next Y differs.
    if (lastY === undefined || lastY === y) {
      text += item.str;
    } else {
      text += `\n${item.str}`;
    }
    lastY = y;
  }
  return text;
}

exports.getNumPages = getNumPages;
exports.pdfParse = pdfParse;
//# sourceMappingURL=pdf.cjs.map