/*
 * afpp — another f*cking pdf parser
 * Version: (not captured in this listing)
 * Listing metadata: 141 lines • 6.03 kB • JavaScript
 */
;
// Interop helper for default imports: a module flagged with `__esModule` is
// handed back untouched, anything else is wrapped as `{ default: mod }`.
// An `__importDefault` already present on `this` takes precedence.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Flag the module as transpiled ES-module output for interop-aware importers.
Object.defineProperty(exports, "__esModule", { value: true });
// Export slots: PROCESSING_TYPE is a placeholder until the enum object is
// created below; parsePdfFile is a hoisted function declaration.
exports.PROCESSING_TYPE = void 0;
exports.parsePdfFile = parsePdfFile;
// String enum of the processing modes that parsePdfFile compares `type` against.
var PROCESSING_TYPE;
((modes) => {
    modes.IMAGE = "IMAGE";
    modes.MIXED = "MIXED";
    modes.TEXT = "TEXT";
})(PROCESSING_TYPE || (exports.PROCESSING_TYPE = PROCESSING_TYPE = {}));
const promises_1 = require("node:fs/promises");
const p_limit_1 = __importDefault(require("p-limit"));
const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
/**
 * Processes one page in MIXED mode: pages that yield text items are reported
 * as text, pages without any text items are rendered and reported as an
 * encoded image.
 *
 * @param {object} page - Page object; must provide `getTextContent`,
 *   `getViewport` and `render`.
 * @param {object} canvasFactory - Factory providing `create(width, height)`
 *   and `destroy(canvasAndContext)`.
 * @param {number} pageNumber - Page number forwarded to `callback`.
 * @param {number} pageCount - Total page count forwarded to `callback`.
 * @param {number} scale - Scale passed to `page.getViewport`.
 * @param {string} encoding - Format passed to `canvas.encode`.
 * @param {Function} callback - Invoked as
 *   `callback(content, pageNumber, pageCount)` where `content` is either the
 *   page text or the encoded image.
 * @returns {Promise<*>} Whatever `callback` returns.
 */
const processPdfPageTypeMixed = async (page, canvasFactory, pageNumber, pageCount, scale, encoding, callback) => {
    const { items } = await page.getTextContent({
        includeMarkedContent: false,
    });
    if (items.length > 0) {
        const pageText = items.map((item) => item.str || '').join(' ');
        return callback(pageText, pageNumber, pageCount);
    }
    // No text items on this page: fall back to rendering it as an image.
    const viewport = page.getViewport({ scale });
    const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
    let imageBuffer;
    try {
        await page.render({
            canvas: canvasAndContext.canvas,
            canvasContext: canvasAndContext.context,
            viewport,
        }).promise;
        // `encode` comes from the canvas object produced by the factory.
        imageBuffer = await canvasAndContext.canvas.encode(encoding);
    }
    finally {
        // Release the canvas even when rendering or encoding fails.
        canvasFactory.destroy(canvasAndContext);
    }
    return callback(imageBuffer, pageNumber, pageCount);
};
/**
 * Extracts the text of a single page.
 *
 * @param {object} page - Page object; must provide `getTextContent`.
 * @returns {Promise<string>} The `str` of every text item joined with a
 *   single space (items without a `str` contribute an empty string), or ''
 *   when the page has no text items.
 */
const processPdfPageTypeText = async (page) => {
    const { items: textItems } = await page.getTextContent({ includeMarkedContent: false });
    return textItems.length > 0
        ? textItems.map(({ str }) => str || '').join(' ')
        : '';
};
/**
 * Renders one page onto a canvas and returns it encoded as an image.
 *
 * @param {object} page - Page object; must provide `getViewport` and `render`.
 * @param {object} canvasFactory - Factory providing `create(width, height)`
 *   and `destroy(canvasAndContext)`.
 * @param {number} pageNumber - Unused here; kept so the signature parallels
 *   the MIXED page processor.
 * @param {number} pageCount - Unused here; see `pageNumber`.
 * @param {number} scale - Scale passed to `page.getViewport`.
 * @param {string} encoding - Format passed to `canvas.encode`.
 * @returns {Promise<*>} The value produced by `canvas.encode(encoding)`.
 */
const processPdfPageTypeImage = async (page, canvasFactory, pageNumber, pageCount, scale, encoding) => {
    const viewport = page.getViewport({ scale });
    const canvasAndContext = canvasFactory.create(viewport.width, viewport.height);
    try {
        await page.render({
            canvas: canvasAndContext.canvas,
            canvasContext: canvasAndContext.context,
            viewport,
        }).promise;
        // `encode` comes from the canvas object produced by the factory.
        return await canvasAndContext.canvas.encode(encoding);
    }
    finally {
        // Release the canvas even when rendering or encoding fails.
        canvasFactory.destroy(canvasAndContext);
    }
};
/**
 * Normalizes the PDF source and the user options into the values the parser
 * needs.
 *
 * Source handling, checked in this order (a Buffer is also a Uint8Array, so
 * the Buffer check has to come first):
 *  - string: treated as a file path and read from disk;
 *  - Buffer: copied into a plain Uint8Array;
 *  - Uint8Array: passed through as-is;
 *  - URL: passed through as `url`.
 *
 * @param {string|Buffer|Uint8Array|URL} input - PDF source.
 * @param {object} [options] - Optional `password`, `scale` (default 2.0),
 *   `concurrency` (default 1) and `imageEncoding` (default 'png').
 * @returns {Promise<{concurrency: *, documentInitParameters: object, encoding: *, scale: *}>}
 * @throws {Error} When `input` is of an unsupported type or `imageEncoding`
 *   is not one of 'avif', 'jpeg', 'png', 'webp'.
 */
const validateParameters = async (input, options) => {
    const documentInitParameters = {};
    if (typeof input === 'string') {
        const { readFile } = promises_1;
        documentInitParameters.data = new Uint8Array(await readFile(input));
    }
    else if (Buffer.isBuffer(input)) {
        documentInitParameters.data = new Uint8Array(input);
    }
    else if (input instanceof Uint8Array) {
        // NOTE(review): the caller's array is handed over without copying —
        // confirm whether the PDF library takes ownership of it.
        documentInitParameters.data = input;
    }
    else if (input instanceof URL) {
        documentInitParameters.url = input;
    }
    else {
        throw new Error(`Invalid source type: ${typeof input}`);
    }
    documentInitParameters.password = options?.password;
    documentInitParameters.verbosity = pdf_mjs_1.VerbosityLevel.ERRORS;
    const scale = options?.scale ?? 2.0;
    const concurrency = options?.concurrency ?? 1;
    const encoding = options?.imageEncoding ?? 'png';
    const supportedEncodings = new Set(['avif', 'jpeg', 'png', 'webp']);
    if (!supportedEncodings.has(encoding)) {
        throw new Error(`Unsupported image encoding format: '${encoding}'`);
    }
    return { concurrency, documentInitParameters, encoding, scale };
};
/**
 * Parses a PDF and returns one result per page, in page order.
 *
 * @param {string} type - One of the PROCESSING_TYPE values:
 *   - TEXT: each result is the page text;
 *   - IMAGE: each result is the encoded page image;
 *   - MIXED: each result is whatever `callback` returns for that page.
 * @param {string|Buffer|Uint8Array|URL} input - PDF source, validated by
 *   `validateParameters`.
 * @param {object} [options] - Forwarded to `validateParameters`.
 * @param {Function} [callback] - Required for MIXED; invoked as
 *   `callback(content, pageNumber, pageCount)`.
 * @returns {Promise<Array>} Per-page results indexed by `pageNumber - 1`.
 * @throws {Error} On an invalid source or option, a missing MIXED callback,
 *   or an unknown `type`.
 */
async function parsePdfFile(type, input, options, callback) {
    const { concurrency, documentInitParameters, encoding, scale } = await validateParameters(input, options);
    // Pick the per-page processor up front so that a bad `type` or a missing
    // callback is rejected before the document is loaded.
    let processPage;
    if (type === PROCESSING_TYPE.MIXED) {
        if (typeof callback !== 'function') {
            throw new Error(`Invalid callback type: ${typeof callback}`);
        }
        processPage = (page, canvasFactory, pageNum, numPages) => processPdfPageTypeMixed(page, canvasFactory, pageNum, numPages, scale, encoding, callback);
    }
    else if (type === PROCESSING_TYPE.TEXT) {
        processPage = (page) => processPdfPageTypeText(page);
    }
    else if (type === PROCESSING_TYPE.IMAGE) {
        processPage = (page, canvasFactory, pageNum, numPages) => processPdfPageTypeImage(page, canvasFactory, pageNum, numPages, scale, encoding);
    }
    else {
        throw new Error('Invalid PROCESSING_TYPE');
    }
    const limit = (0, p_limit_1.default)(concurrency);
    const loadingTask = (0, pdf_mjs_1.getDocument)(documentInitParameters);
    try {
        const pdfDocument = await loadingTask.promise;
        const { numPages } = pdfDocument;
        const pageTasks = Array.from({ length: numPages }, (_, index) => limit(async () => {
            const pageNum = index + 1;
            const page = await pdfDocument.getPage(pageNum);
            try {
                return await processPage(page, pdfDocument.canvasFactory, pageNum, numPages);
            }
            finally {
                // Drop per-page resources as soon as the page is done.
                page.cleanup();
            }
        }));
        // Promise.all keeps results in task (page) order.
        return await Promise.all(pageTasks);
    }
    finally {
        // Release the loading task (and the document it produced) on both
        // success and failure.
        await loadingTask.destroy();
    }
}
//# sourceMappingURL=core.js.map