pomljs
Version:
Prompt Orchestration Markup Language
57 lines (54 loc) • 1.93 kB
JavaScript
import * as PDFJS from 'pdfjs-dist/legacy/build/pdf.js';
// Keep a handle on the real console.log so it can be put back after setup.
const log = console.log;
// While setup runs, drop log lines matching "Cannot polyfill `DOMMatrix`" /
// "Cannot polyfill `Path2D`" (presumably pdf.js notices under Node — confirm);
// every other message is forwarded unchanged to the original logger.
// NOTE(review): static imports are evaluated before this module body, so this
// filter only covers the statements below, not import-time output — confirm
// that this is the intended scope.
console.log = (m, ...a) => /Cannot polyfill `(DOMMatrix|Path2D)`/.test(m) ? null : log(m, ...a);
// Under ESM interop the library API hangs off the namespace's `default`
// export; under CommonJS interop it is the namespace object itself.
const pdfjs = PDFJS.GlobalWorkerOptions === undefined ? PDFJS.default : PDFJS;
try {
    pdfjs.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/legacy/build/pdf.worker.js';
}
finally {
    // Always restore the original console.log, even if configuring the
    // worker throws, so a failed setup never leaves the global patched.
    console.log = log;
}
/**
 * Counts the pages of a PDF document.
 *
 * @param {ArrayBuffer|ArrayLike<number>} pdfBuffer - Raw PDF bytes; anything
 *   accepted by `new Uint8Array(...)`.
 * @returns {Promise<number>} The document's `numPages`.
 * @throws Rejects with whatever error the loading task rejects with.
 */
async function getNumPages(pdfBuffer) {
    // The bytes are always wrapped in a fresh Uint8Array before being handed
    // to pdf.js, whatever the input type is.
    const loadingTask = pdfjs.getDocument({ data: new Uint8Array(pdfBuffer) });
    try {
        const pdfDocument = await loadingTask.promise;
        return pdfDocument.numPages;
    }
    finally {
        // Release the document and its worker resources once the page count
        // has been read, or if loading failed.
        await loadingTask.destroy();
    }
}
/**
 * Extracts the text of a PDF document, page by page.
 *
 * @param {ArrayBuffer|ArrayLike<number>} pdfBuffer - Raw PDF bytes; anything
 *   accepted by `new Uint8Array(...)`.
 * @param {number} [maxPages] - Upper bound on the number of pages to read,
 *   counted from page 1. When `null`/`undefined`, every page is read; larger
 *   values are clamped to the document's page count.
 * @returns {Promise<string>} The text of each processed page, joined with a
 *   blank line (`'\n\n'`) between pages.
 * @throws Rejects with whatever error loading or page extraction rejects with.
 */
async function pdfParse(pdfBuffer, maxPages) {
    // The bytes are always wrapped in a fresh Uint8Array before being handed
    // to pdf.js, whatever the input type is.
    const loadingTask = pdfjs.getDocument({ data: new Uint8Array(pdfBuffer) });
    try {
        const pdfDocument = await loadingTask.promise;
        // `== null` deliberately matches both null and undefined.
        const pageLimit = maxPages == null
            ? pdfDocument.numPages
            : Math.min(maxPages, pdfDocument.numPages);
        const pageTexts = [];
        // Pages are 1-indexed and processed sequentially to keep their order.
        for (let pageNum = 1; pageNum <= pageLimit; pageNum++) {
            const page = await pdfDocument.getPage(pageNum);
            pageTexts.push(await extractTextFromPage(page));
        }
        return pageTexts.join('\n\n');
    }
    finally {
        // Release the document and its worker resources whether extraction
        // succeeded or failed part-way through.
        await loadingTask.destroy();
    }
}
/**
 * Flattens the text items of one page into a single string.
 *
 * Consecutive items whose `transform[5]` value (vertical offset) is equal are
 * concatenated directly; a change in that value starts a new line.
 *
 * @param {{ getTextContent: function(): Promise<{ items: Array<{ str: string, transform: number[] }> }> }} page
 *   Page object exposing `getTextContent()`.
 * @returns {Promise<string>} The page text with `'\n'` between lines; the
 *   empty string when the page has no text items.
 */
async function extractTextFromPage(page) {
    const textContent = await page.getTextContent();
    let lastY;
    let text = '';
    for (const item of textContent.items) {
        const y = item.transform[5];
        // Compare against `undefined` explicitly: a previous item sitting at
        // y = 0 is a real position, not "no previous item", so a falsy check
        // would wrongly glue the next line onto it.
        const startsNewLine = lastY !== undefined && lastY !== y;
        text += startsNewLine ? '\n' + item.str : item.str;
        lastY = y;
    }
    return text;
}
export { getNumPages, pdfParse };
//# sourceMappingURL=pdf.js.map