afpp
Version:
Async Fast PDF Parser for Node.js — dependency-light, TypeScript-first, production-ready.
244 lines • 10.3 kB
JavaScript
;
// Interop shim for default imports: a module flagged with `__esModule` is
// returned untouched; any other value is wrapped so it is reachable as `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    const isEsModule = mod && mod.__esModule;
    if (isEsModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.PROCESSING_TYPE = void 0;
exports.parsePdfFile = parsePdfFile;
exports.streamPdfFile = streamPdfFile;
const promises_1 = require("node:fs/promises");
const node_os_1 = require("node:os");
const canvas_1 = require("@napi-rs/canvas");
const p_limit_1 = __importDefault(require("p-limit"));
const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
// String-valued enum of the supported page-processing modes. Every member's
// value equals its own key; the same object is published as `exports.PROCESSING_TYPE`.
var PROCESSING_TYPE;
(function (modes) {
    for (const mode of ["IMAGE", "MIXED", "TEXT"]) {
        modes[mode] = mode;
    }
})(PROCESSING_TYPE || (exports.PROCESSING_TYPE = PROCESSING_TYPE = {}));
/**
 * Canvas factory that recycles `{ canvas, context }` pairs instead of
 * allocating a fresh canvas for every request.
 *
 * It exposes the `create` / `reset` / `destroy` trio, where `destroy` returns
 * the pair to an idle pool (bounded by `maxSize`) rather than tearing it down.
 */
class PooledCanvasFactory {
    /** Idle `{ canvas, context }` pairs waiting to be handed out again. */
    pool = [];
    /** Upper bound on the number of idle pairs kept in `pool`. */
    maxSize;
    /**
     * @param {number} maxSize - Maximum number of idle pairs to retain.
     */
    constructor(maxSize) {
        this.maxSize = maxSize;
    }
    /**
     * Hands out a canvas of the requested size, reusing an idle one if possible.
     *
     * @param {number} width - Canvas width.
     * @param {number} height - Canvas height.
     * @returns {{canvas: object, context: object}} A pooled pair resized to
     *   `width` x `height`, or a newly created canvas with its 2D context.
     */
    create(width, height) {
        const recycled = this.pool.pop();
        if (recycled) {
            this.reset(recycled, width, height);
            return recycled;
        }
        const canvas = (0, canvas_1.createCanvas)(width, height);
        const context = canvas.getContext('2d');
        return { canvas, context };
    }
    /**
     * Resizes an existing pair in place.
     *
     * @param {{canvas: object, context: object}} canvasAndContext - Pair to resize.
     * @param {number} width - New canvas width.
     * @param {number} height - New canvas height.
     */
    reset(canvasAndContext, width, height) {
        canvasAndContext.canvas.width = width;
        canvasAndContext.canvas.height = height;
        // Resizing implicitly clears the canvas in @napi-rs/canvas
    }
    /**
     * Releases a pair back to the pool.
     *
     * Pairs without a canvas are ignored because they could never be resized
     * by a later `create`. A pair that is already idle is ignored as well:
     * storing it twice would let two later `create` calls receive the very
     * same canvas.
     *
     * @param {{canvas: object, context: object}} canvasAndContext - Pair to release.
     */
    destroy(canvasAndContext) {
        if (!canvasAndContext?.canvas) {
            return;
        }
        if (this.pool.includes(canvasAndContext)) {
            return;
        }
        if (this.pool.length < this.maxSize) {
            this.pool.push(canvasAndContext);
        }
        // Otherwise let it GC — pool is at capacity
    }
}
/**
 * Builds a single string from a sequence of text items.
 *
 * @param {Iterable<{str?: string}>} items - Items whose `str` values are collected.
 * @returns {string} Every truthy `str`, in order, separated by single spaces;
 *   items whose `str` is missing or empty are skipped.
 */
const extractText = (items) => {
    const fragments = [...items]
        .map((item) => item.str)
        .filter(Boolean);
    return fragments.join(' ');
};
/**
 * Processes one page in "mixed" mode and reports the outcome through `callback`.
 *
 * A page whose text content has items is reported as text. A page with no
 * text items is rendered onto a canvas, encoded, and reported as image data.
 *
 * @param {object} page - Page object providing `getTextContent`, `getViewport` and `render`.
 * @param {object} canvasFactory - Provider of `create(width, height)` / `destroy(pair)`.
 * @param {number} pageNumber - Forwarded to `callback` as its second argument.
 * @param {number} pageCount - Forwarded to `callback` as its third argument.
 * @param {number} scale - Scale passed to `page.getViewport`.
 * @param {string} encoding - Format passed to the canvas `encode` call.
 * @param {Function} callback - Invoked as `callback(content, pageNumber, pageCount)`.
 * @returns {Promise<*>} Whatever `callback` returns.
 */
const processPdfPageTypeMixed = async (page, canvasFactory, pageNumber, pageCount, scale, encoding, callback) => {
    const { items } = await page.getTextContent({ includeMarkedContent: false });
    // Text route: the page carries at least one text item.
    if (items.length !== 0) {
        return callback(extractText(items), pageNumber, pageCount);
    }
    // Image route: nothing to extract, so rasterize the page instead.
    const viewport = page.getViewport({ scale });
    const surface = canvasFactory.create(viewport.width, viewport.height);
    try {
        const renderTask = page.render({
            canvas: surface.canvas,
            canvasContext: surface.context,
            viewport,
        });
        await renderTask.promise;
        const encoded = await surface.canvas.encode(encoding);
        return callback(encoded, pageNumber, pageCount);
    }
    finally {
        // The surface goes back to the factory whether or not rendering succeeded.
        canvasFactory.destroy(surface);
    }
};
/**
 * Extracts the text of a single page.
 *
 * @param {object} page - Page object providing `getTextContent`.
 * @returns {Promise<string>} The page's text items combined by `extractText`.
 */
const processPdfPageTypeText = async (page) => {
    const { items } = await page.getTextContent({ includeMarkedContent: false });
    return extractText(items);
};
/**
 * Renders a single page onto a canvas and returns the encoded image.
 *
 * @param {object} page - Page object providing `getViewport` and `render`.
 * @param {object} canvasFactory - Provider of `create(width, height)` / `destroy(pair)`.
 * @param {number} pageNumber - Accepted for signature parity; not used here.
 * @param {number} pageCount - Accepted for signature parity; not used here.
 * @param {number} scale - Scale passed to `page.getViewport`.
 * @param {string} encoding - Format passed to the canvas `encode` call.
 * @returns {Promise<*>} The value produced by `canvas.encode(encoding)`.
 */
const processPdfPageTypeImage = async (page, canvasFactory, pageNumber, pageCount, scale, encoding) => {
    const viewport = page.getViewport({ scale });
    const surface = canvasFactory.create(viewport.width, viewport.height);
    try {
        const renderTask = page.render({
            canvas: surface.canvas,
            canvasContext: surface.context,
            viewport,
        });
        await renderTask.promise;
        return await surface.canvas.encode(encoding);
    }
    finally {
        // The surface goes back to the factory whether or not rendering succeeded.
        canvasFactory.destroy(surface);
    }
};
/**
 * Normalizes the PDF source and validates the user-supplied options.
 *
 * Source handling:
 *  - string: treated as a file path and read from disk;
 *  - Buffer: copied into a new `Uint8Array`;
 *  - Uint8Array: used as-is;
 *  - URL: passed through as `url`.
 *
 * Options are checked strictly: `scale` must be an actual number in
 * [0.1, 10], `concurrency` must be a positive integer or the string `'auto'`,
 * and `imageEncoding` must be one of the supported formats. Values of the
 * wrong type (for example the strings `'2'` or `'4'`) are rejected here
 * rather than being passed further down.
 *
 * @param {string|Buffer|Uint8Array|URL} input - PDF source.
 * @param {object} [options] - Optional settings: `password`, `scale`,
 *   `concurrency`, `imageEncoding`.
 * @returns {Promise<{concurrency: number, documentInitParameters: object, encoding: string, scale: number}>}
 * @throws {Error} On an unsupported source type or an invalid option value.
 */
const validateParameters = async (input, options) => {
    const documentInitParameters = {};
    if (typeof input === 'string') {
        documentInitParameters.data = new Uint8Array(await (0, promises_1.readFile)(input));
    }
    else if (Buffer.isBuffer(input)) {
        documentInitParameters.data = new Uint8Array(input);
    }
    else if (input instanceof Uint8Array) {
        documentInitParameters.data = input;
    }
    else if (input instanceof URL) {
        documentInitParameters.url = input;
    }
    else {
        throw new Error(`Invalid source type: ${typeof input}`);
    }
    const scale = options?.scale ?? 1.0;
    // The typeof guard matters: Number.isNaN does not coerce, and comparing a
    // non-numeric value with a number is always false, so without it values
    // such as 'abc' or {} would pass the range check.
    if (typeof scale !== 'number' || Number.isNaN(scale) || scale < 0.1 || scale > 10) {
        throw new Error(`Invalid scale value: ${String(scale)}. Must be a number between 0.1 and 10.`);
    }
    const concurrency = options?.concurrency === 'auto'
        ? Math.min((0, node_os_1.availableParallelism)(), 8)
        : (options?.concurrency ?? 1);
    // Number.isInteger is false for every non-number, so strings and other
    // stray types are rejected together with fractions and NaN.
    if (!Number.isInteger(concurrency) || concurrency < 1) {
        throw new Error(`Invalid concurrency value: ${String(concurrency)}. Must be a positive integer or 'auto'.`);
    }
    const encoding = options?.imageEncoding ?? 'png';
    if (!['avif', 'jpeg', 'png', 'webp'].includes(encoding)) {
        throw new Error(`Unsupported image encoding format: '${encoding}'`);
    }
    documentInitParameters.password = options?.password;
    documentInitParameters.verbosity = pdf_mjs_1.VerbosityLevel.ERRORS;
    // Performance settings: no prefetching, streaming or range requests.
    // For in-memory sources the full data is already available.
    documentInitParameters.disableAutoFetch = true;
    documentInitParameters.disableStream = true;
    documentInitParameters.disableRange = true;
    return { concurrency, documentInitParameters, encoding, scale };
};
/**
 * Runs `processPage` once for every page of `pdfDocument`, scheduling the
 * work through `limit`, and collects the results in page order.
 *
 * @param {object} pdfDocument - Loaded document exposing `numPages` and `getPage`.
 * @param {Function} limit - Concurrency limiter wrapping each page task.
 * @param {Function} processPage - Called as `processPage(page, pageNumber, pageCount)`.
 * @returns {Promise<Array>} One entry per page; index 0 holds page 1.
 */
const runOnEveryPage = async (pdfDocument, limit, processPage) => {
    const { numPages } = pdfDocument;
    const results = Array.from({ length: numPages });
    const pageTasks = Array.from({ length: numPages }, (_, index) => limit(async () => {
        const pageNumber = index + 1;
        const page = await pdfDocument.getPage(pageNumber);
        try {
            results[index] = await processPage(page, pageNumber, numPages);
        }
        finally {
            // Release per-page resources even when processing this page failed.
            page.cleanup();
        }
    }));
    await Promise.all(pageTasks);
    return results;
};
/**
 * Parses every page of a PDF and returns the per-page results in page order.
 *
 * - `PROCESSING_TYPE.TEXT`: each entry is the page's extracted text.
 * - `PROCESSING_TYPE.IMAGE`: each entry is the page rendered and encoded as an image.
 * - `PROCESSING_TYPE.MIXED`: each entry is the value returned by `callback`,
 *   which receives either text or image data for the page.
 *
 * The processing type and the callback are validated before the document is
 * loaded, so an invalid call fails without parsing the PDF first.
 *
 * @param {string} type - One of the `PROCESSING_TYPE` values.
 * @param {string|Buffer|Uint8Array|URL} input - PDF source.
 * @param {object} [options] - See `validateParameters`.
 * @param {Function} [callback] - Required for MIXED; called as
 *   `callback(content, pageNumber, pageCount)`.
 * @returns {Promise<Array>} One result per page.
 * @throws {Error} On invalid parameters, an invalid processing type, a
 *   missing MIXED callback, or any loading/processing failure.
 */
async function parsePdfFile(type, input, options, callback) {
    const { concurrency, documentInitParameters, encoding, scale } = await validateParameters(input, options);
    if (!Object.values(PROCESSING_TYPE).includes(type)) {
        throw new Error('Invalid PROCESSING_TYPE');
    }
    if (type === PROCESSING_TYPE.MIXED && typeof callback !== 'function') {
        throw new Error(`Invalid callback type: ${typeof callback}`);
    }
    // Only modes that may render need canvases.
    const pooledFactory = type === PROCESSING_TYPE.TEXT
        ? undefined
        : new PooledCanvasFactory(concurrency);
    if (pooledFactory) {
        documentInitParameters.canvasFactory = pooledFactory;
    }
    const limit = (0, p_limit_1.default)(concurrency);
    const loadingTask = (0, pdf_mjs_1.getDocument)(documentInitParameters);
    let pdfDocument;
    try {
        // Awaited inside the try so that a failed load (wrong password,
        // corrupt file, ...) still reaches the finally block below.
        pdfDocument = await loadingTask.promise;
        let processPage;
        if (type === PROCESSING_TYPE.TEXT) {
            processPage = (page) => processPdfPageTypeText(page);
        }
        else if (type === PROCESSING_TYPE.IMAGE) {
            processPage = (page, pageNumber, pageCount) => processPdfPageTypeImage(page, pooledFactory, pageNumber, pageCount, scale, encoding);
        }
        else {
            processPage = (page, pageNumber, pageCount) => processPdfPageTypeMixed(page, pooledFactory, pageNumber, pageCount, scale, encoding, callback);
        }
        return await runOnEveryPage(pdfDocument, limit, processPage);
    }
    finally {
        // Clean up pdfjs resources to prevent memory leaks
        if (pdfDocument) {
            pdfDocument.cleanup();
            await pdfDocument.destroy();
        }
        // Always tear down the loading task, including when loading itself failed.
        await loadingTask.destroy();
    }
}
/**
 * Streams a PDF page by page, yielding each page's result as soon as it is ready.
 *
 * With `PROCESSING_TYPE.IMAGE` every page is rendered and encoded as an
 * image; any other `type` value is processed as text.
 *
 * Pages are handled strictly one at a time. Each page's resources are released
 * before its result is yielded, so nothing page-related stays pending while
 * the consumer works on the result or if it stops iterating early.
 *
 * @param {string} type - One of the `PROCESSING_TYPE` values.
 * @param {string|Buffer|Uint8Array|URL} input - PDF source.
 * @param {object} [options] - See `validateParameters`.
 * @yields {{data: *, pageCount: number, pageNumber: number}} Result for one page.
 * @throws {Error} On invalid parameters or any loading/processing failure.
 */
async function* streamPdfFile(type, input, options) {
    const { documentInitParameters, encoding, scale } = await validateParameters(input, options);
    const isImageMode = type === PROCESSING_TYPE.IMAGE;
    // Sequential processing never needs more than one canvas at a time.
    const pooledFactory = isImageMode ? new PooledCanvasFactory(1) : undefined;
    if (pooledFactory) {
        documentInitParameters.canvasFactory = pooledFactory;
    }
    const loadingTask = (0, pdf_mjs_1.getDocument)(documentInitParameters);
    let pdfDocument;
    try {
        // Awaited inside the try so that a failed load (wrong password,
        // corrupt file, ...) still reaches the finally block below.
        pdfDocument = await loadingTask.promise;
        const { numPages } = pdfDocument;
        for (let pageNum = 1; pageNum <= numPages; pageNum++) {
            const page = await pdfDocument.getPage(pageNum);
            let data;
            try {
                data = isImageMode
                    ? await processPdfPageTypeImage(page, pooledFactory, pageNum, numPages, scale, encoding)
                    : await processPdfPageTypeText(page);
            }
            finally {
                // Release per-page resources even when processing this page failed.
                page.cleanup();
            }
            yield { data, pageCount: numPages, pageNumber: pageNum };
        }
    }
    finally {
        if (pdfDocument) {
            pdfDocument.cleanup();
            await pdfDocument.destroy();
        }
        // Always tear down the loading task, including when loading itself failed.
        await loadingTask.destroy();
    }
}
//# sourceMappingURL=core.js.map