UNPKG

@rpidanny/pdf2md

Version:
128 lines (113 loc) 3.84 kB
const path = require("path"); const pdfjsPromise = import("pdfjs-dist/legacy/build/pdf.mjs"); pdfjsPromise.then((pdfjs) => { pdfjs.GlobalWorkerOptions.workerSrc = `pdfjs-dist/legacy/build/pdf.worker.mjs`; }); const { findPageNumbers, findFirstPage, removePageNumber, } = require("../../lib/util/page-number-functions"); const TextItem = require("../models/TextItem"); const Page = require("../models/Page"); const NO_OP = () => {}; exports.parse = async function parse(docOptions, callbacks) { const { metadataParsed, pageParsed, fontParsed, documentParsed } = { metadataParsed: NO_OP, pageParsed: NO_OP, fontParsed: NO_OP, documentParsed: NO_OP, ...(callbacks || {}), }; const fontDataPath = path.join( path.resolve(require.resolve("pdfjs-dist"), "../../standard_fonts"), "/" ); if (typeof docOptions === "string" || docOptions instanceof URL) { docOptions = { url: docOptions }; } else if ( docOptions instanceof ArrayBuffer || ArrayBuffer.isView(docOptions) ) { docOptions = { data: docOptions }; } const pdfjs = await pdfjsPromise; const pdfDocument = await pdfjs.getDocument({ ...docOptions, standardFontDataUrl: fontDataPath, }).promise; const metadata = await pdfDocument.getMetadata(); metadataParsed(metadata); const pages = [...Array(pdfDocument.numPages).keys()].map( (index) => new Page({ index }) ); documentParsed(pdfDocument, pages); const fonts = { ids: new Set(), map: new Map(), }; let pageIndexNumMap = {}; let firstPage; for (let j = 1; j <= pdfDocument.numPages; j++) { const page = await pdfDocument.getPage(j); const textContent = await page.getTextContent(); if (Object.keys(pageIndexNumMap).length < 10) { pageIndexNumMap = findPageNumbers( pageIndexNumMap, page.pageNumber - 1, textContent.items ); } else { firstPage = findFirstPage(pageIndexNumMap); break; } } let pageNum = firstPage ? firstPage.pageNum : 0; for (let j = 1; j <= pdfDocument.numPages; j++) { const page = await pdfDocument.getPage(j); // Trigger the font retrieval for the page await page.getOperatorList(); const scale = 1.0; const viewport = page.getViewport({ scale }); let textContent = await page.getTextContent(); if (firstPage && page.pageIndex >= firstPage.pageIndex) { textContent = removePageNumber(textContent, pageNum); pageNum++; } const textItems = textContent.items.map((item) => { const tx = pdfjs.Util.transform(viewport.transform, item.transform); const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]); const dividedHeight = item.height / fontHeight; return new TextItem({ x: Math.round(item.transform[4]), y: Math.round(item.transform[5]), width: Math.round(item.width), height: Math.round(dividedHeight <= 1 ? item.height : dividedHeight), text: item.str, font: item.fontName, }); }); pages[page.pageNumber - 1].items = textItems; pageParsed(pages); const fontIds = new Set(textItems.map((t) => t.font)); for (const fontId of fontIds) { if (!fonts.ids.has(fontId) && fontId.startsWith("g_d")) { // Depending on which build of pdfjs-dist is used, the // WorkerTransport containing the font objects is either transport or _transport const transport = pdfDocument.transport || pdfDocument._transport; // eslint-disable-line no-underscore-dangle const font = await new Promise((resolve) => transport.commonObjs.get(fontId, resolve) ); fonts.ids.add(fontId); fonts.map.set(fontId, font); fontParsed(fonts); } } } return { fonts, metadata, pages, pdfDocument, }; };