UNPKG

pomljs

Version:

Prompt Orchestration Markup Language

57 lines (54 loc) 1.93 kB
import * as PDFJS from 'pdfjs-dist/legacy/build/pdf.js';

// Temporarily filter the "Cannot polyfill `DOMMatrix`/`Path2D`" messages out of
// console.log while pdf.js is being configured; the original function is
// restored right after the worker source is set.
// NOTE(review): static imports are evaluated before this statement runs, so
// this filter only covers messages logged by the statements below — confirm
// that it still suppresses the warning it was written for.
const log = console.log;
console.log = (m, ...a) => /Cannot polyfill `(DOMMatrix|Path2D)`/.test(m) ? null : log(m, ...a);

// When the namespace object does not expose GlobalWorkerOptions directly (ESM
// interop), the library lives on its `default` export; otherwise (CommonJS)
// the namespace itself is the library.
const pdfjs = PDFJS.GlobalWorkerOptions === undefined ? PDFJS.default : PDFJS;
pdfjs.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/legacy/build/pdf.worker.js';
console.log = log; // restore original console.log

/**
 * Loads a PDF, runs `action` against the loaded document, and always releases
 * the loading task afterwards so repeated calls do not accumulate documents.
 *
 * @param {*} pdfBuffer - PDF bytes; copied into a fresh Uint8Array, so any
 *   value accepted by the Uint8Array constructor works.
 * @param {function(object): (Promise<*>|*)} action - Receives the loaded
 *   pdf.js document; its result becomes the return value.
 * @returns {Promise<*>} Whatever `action` resolves to.
 */
async function withPdfDocument(pdfBuffer, action) {
    const loadingTask = pdfjs.getDocument({ data: new Uint8Array(pdfBuffer) });
    try {
        const pdfDocument = await loadingTask.promise;
        return await action(pdfDocument);
    }
    finally {
        // Destroying the loading task also tears down the document it produced.
        await loadingTask.destroy();
    }
}

/**
 * Returns the number of pages in a PDF.
 *
 * @param {*} pdfBuffer - PDF bytes (see `withPdfDocument`).
 * @returns {Promise<number>} The document's page count.
 */
async function getNumPages(pdfBuffer) {
    return withPdfDocument(pdfBuffer, (pdfDocument) => pdfDocument.numPages);
}

/**
 * Extracts the text of a PDF, page by page, joining pages with a blank line.
 *
 * @param {*} pdfBuffer - PDF bytes (see `withPdfDocument`).
 * @param {number} [maxPages] - Upper bound on the number of pages to read,
 *   starting from page 1. When omitted (or null) every page is read; values
 *   larger than the page count are clamped to it.
 * @returns {Promise<string>} Page texts joined with '\n\n'.
 */
async function pdfParse(pdfBuffer, maxPages) {
    return withPdfDocument(pdfBuffer, async (pdfDocument) => {
        const pageLimit = maxPages == null
            ? pdfDocument.numPages
            : Math.min(maxPages, pdfDocument.numPages);
        const fullTexts = [];
        // Pages are 1-indexed in pdf.js and are read sequentially to keep order.
        for (let pageNum = 1; pageNum <= pageLimit; pageNum++) {
            const page = await pdfDocument.getPage(pageNum);
            fullTexts.push(await extractTextFromPage(page));
        }
        return fullTexts.join('\n\n');
    });
}

/**
 * Concatenates the text items of one page, starting a new line whenever the
 * vertical position of an item differs from that of the previous item.
 *
 * @param {object} page - A pdf.js page exposing `getTextContent()`.
 * @returns {Promise<string>} The page text.
 */
async function extractTextFromPage(page) {
    const textContent = await page.getTextContent();
    let lastY;
    let text = '';
    for (const item of textContent.items) {
        // transform[5] is used as the item's vertical position.
        const y = item.transform[5];
        // Compare against undefined explicitly: a previous position of exactly
        // 0 is a real value and must still trigger a line break when y changes.
        if (lastY !== undefined && lastY !== y) {
            text += '\n';
        }
        text += item.str;
        lastY = y;
    }
    return text;
}

export { getNumPages, pdfParse };
//# sourceMappingURL=pdf.js.map