@rpidanny/pdf2md
Version:
A PDF to Markdown Converter
128 lines (113 loc) • 3.84 kB
JavaScript
const path = require("path");
const pdfjsPromise = import("pdfjs-dist/legacy/build/pdf.mjs");
pdfjsPromise.then((pdfjs) => {
pdfjs.GlobalWorkerOptions.workerSrc = `pdfjs-dist/legacy/build/pdf.worker.mjs`;
});
const {
findPageNumbers,
findFirstPage,
removePageNumber,
} = require("../../lib/util/page-number-functions");
const TextItem = require("../models/TextItem");
const Page = require("../models/Page");
const NO_OP = () => {};
exports.parse = async function parse(docOptions, callbacks) {
const { metadataParsed, pageParsed, fontParsed, documentParsed } = {
metadataParsed: NO_OP,
pageParsed: NO_OP,
fontParsed: NO_OP,
documentParsed: NO_OP,
...(callbacks || {}),
};
const fontDataPath = path.join(
path.resolve(require.resolve("pdfjs-dist"), "../../standard_fonts"),
"/"
);
if (typeof docOptions === "string" || docOptions instanceof URL) {
docOptions = { url: docOptions };
} else if (
docOptions instanceof ArrayBuffer ||
ArrayBuffer.isView(docOptions)
) {
docOptions = { data: docOptions };
}
const pdfjs = await pdfjsPromise;
const pdfDocument = await pdfjs.getDocument({
...docOptions,
standardFontDataUrl: fontDataPath,
}).promise;
const metadata = await pdfDocument.getMetadata();
metadataParsed(metadata);
const pages = [...Array(pdfDocument.numPages).keys()].map(
(index) => new Page({ index })
);
documentParsed(pdfDocument, pages);
const fonts = {
ids: new Set(),
map: new Map(),
};
let pageIndexNumMap = {};
let firstPage;
for (let j = 1; j <= pdfDocument.numPages; j++) {
const page = await pdfDocument.getPage(j);
const textContent = await page.getTextContent();
if (Object.keys(pageIndexNumMap).length < 10) {
pageIndexNumMap = findPageNumbers(
pageIndexNumMap,
page.pageNumber - 1,
textContent.items
);
} else {
firstPage = findFirstPage(pageIndexNumMap);
break;
}
}
let pageNum = firstPage ? firstPage.pageNum : 0;
for (let j = 1; j <= pdfDocument.numPages; j++) {
const page = await pdfDocument.getPage(j);
// Trigger the font retrieval for the page
await page.getOperatorList();
const scale = 1.0;
const viewport = page.getViewport({ scale });
let textContent = await page.getTextContent();
if (firstPage && page.pageIndex >= firstPage.pageIndex) {
textContent = removePageNumber(textContent, pageNum);
pageNum++;
}
const textItems = textContent.items.map((item) => {
const tx = pdfjs.Util.transform(viewport.transform, item.transform);
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
const dividedHeight = item.height / fontHeight;
return new TextItem({
x: Math.round(item.transform[4]),
y: Math.round(item.transform[5]),
width: Math.round(item.width),
height: Math.round(dividedHeight <= 1 ? item.height : dividedHeight),
text: item.str,
font: item.fontName,
});
});
pages[page.pageNumber - 1].items = textItems;
pageParsed(pages);
const fontIds = new Set(textItems.map((t) => t.font));
for (const fontId of fontIds) {
if (!fonts.ids.has(fontId) && fontId.startsWith("g_d")) {
// Depending on which build of pdfjs-dist is used, the
// WorkerTransport containing the font objects is either transport or _transport
const transport = pdfDocument.transport || pdfDocument._transport; // eslint-disable-line no-underscore-dangle
const font = await new Promise((resolve) =>
transport.commonObjs.get(fontId, resolve)
);
fonts.ids.add(fontId);
fonts.map.set(fontId, font);
fontParsed(fonts);
}
}
}
return {
fonts,
metadata,
pages,
pdfDocument,
};
};