@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications
411 lines (410 loc) • 15.7 kB
JavaScript
/**
* PDF Processor with Image Fallback Support
*
* Handles PDF processing for all providers:
* - Native PDF support for providers that accept PDF directly (Google AI, Vertex, OpenAI, Anthropic, Bedrock)
* - PDF → Image conversion for providers that don't support native PDF (Azure, Mistral, Ollama)
*
* The conversion uses pdf-to-img package (MuPDF-based) for high-quality conversion.
*/
import { PDF_LIMITS } from "../core/constants.js";
import { ErrorFactory } from "./errorHandling.js";
import { logger } from "./logger.js";
/**
 * PDF handling rules for each known provider id (keys are lower-case).
 *
 * Fields on every entry:
 * - maxSizeMB:         largest PDF accepted for the provider, in megabytes
 * - maxPages:          largest (estimated) page count accepted
 * - supportsNative:    true  = send the PDF as a FilePart (mimeType: application/pdf)
 *                      false = convert PDF pages to PNG images and send as ImageParts
 * - requiresCitations: false everywhere except bedrock, which uses "auto"
 * - apiType:           "document" or "files-api"
 *
 * The table is assembled from a few shared presets; every provider still gets
 * its own object (spread copy), and the key order below is the order reported
 * in error messages and provider listings.
 */
const PDF_PROVIDER_CONFIGS = (() => {
    // 5 MB, native PDF, "document" style API.
    const documentApi = {
        maxSizeMB: 5,
        maxPages: 100,
        supportsNative: true,
        requiresCitations: false,
        apiType: "document",
    };
    // 10 MB, native PDF, "files-api" style API.
    const filesApi = {
        maxSizeMB: 10,
        maxPages: 100,
        supportsNative: true,
        requiresCitations: false,
        apiType: "files-api",
    };
    // Same as filesApi but with a 2000 MB ceiling (Google AI Studio family).
    const largeFilesApi = { ...filesApi, maxSizeMB: 2000 };
    // Same as filesApi but the PDF must be converted to images first.
    const imageFallback = { ...filesApi, supportsNative: false };
    return {
        anthropic: { ...documentApi },
        bedrock: { ...documentApi, requiresCitations: "auto" },
        "google-vertex": { ...documentApi },
        vertex: { ...documentApi },
        "google-ai-studio": { ...largeFilesApi },
        gemini: { ...largeFilesApi },
        "google-ai": { ...largeFilesApi },
        openai: { ...filesApi },
        // Azure does NOT support native PDF - must convert to images
        azure: { ...imageFallback },
        "azure-openai": { ...imageFallback },
        litellm: { ...filesApi },
        // NOTE(review): this entry was previously annotated with a rationale about
        // LiteLLM being a proxy whose underlying model may not support native PDF,
        // yet `litellm` above is marked native - confirm both flags are intended.
        "openai-compatible": { ...imageFallback },
        mistral: { ...imageFallback },
        "hugging-face": { ...filesApi },
        huggingface: { ...filesApi },
    };
})();
/**
 * Static helpers for validating PDFs against per-provider limits and for
 * converting PDF pages to base64 PNG images when a provider cannot take the
 * PDF directly. The class holds no instance state; every member is static.
 */
export class PDFProcessor {
    // PDF magic bytes: %PDF-
    static PDF_SIGNATURE = Buffer.from("%PDF-", "ascii");
    // ============================================================================
    // Internal helpers
    // ============================================================================
    /**
     * Look up the config for a provider name, case-insensitively.
     *
     * Only own keys of PDF_PROVIDER_CONFIGS count: a plain `table[name]` read
     * would also resolve inherited names such as "constructor" or "toString"
     * to non-config values, which callers would then mistake for a config.
     *
     * @param provider - Provider name (any casing)
     * @returns The provider's config object, or null when there is none
     */
    static #findConfig(provider) {
        if (typeof provider !== "string") {
            return null;
        }
        const key = provider.toLowerCase();
        return Object.hasOwn(PDF_PROVIDER_CONFIGS, key)
            ? PDF_PROVIDER_CONFIGS[key]
            : null;
    }
    /**
     * View binary input as a Buffer without copying.
     *
     * @param data - Buffer, typed array or DataView
     * @returns A Buffer over the same bytes, or null when `data` is not binary
     */
    static #asBuffer(data) {
        if (Buffer.isBuffer(data)) {
            return data;
        }
        if (ArrayBuffer.isView(data)) {
            return Buffer.from(data.buffer, data.byteOffset, data.byteLength);
        }
        return null;
    }
    // ============================================================================
    // PDF Validation & Processing
    // ============================================================================
    /**
     * Validate a PDF against the limits configured for a provider.
     *
     * Checks run in this order: %PDF- header, provider has a config, size limit,
     * estimated page limit. The content itself is returned untouched.
     *
     * @param content - PDF bytes (Buffer or Uint8Array)
     * @param options - Optional settings:
     *   - provider: provider name, matched case-insensitively (default "unknown")
     *   - enforceLimits: pass false to log a warning instead of throwing when the
     *     estimated page count exceeds the provider's maxPages
     *   - bedrockApiMode: "converse" logs an informational note for bedrock
     * @returns Promise of `{ type: "pdf", content, mimeType: "application/pdf", metadata }`
     * @throws Error when the header is invalid, the provider has no config, or
     *   the size limit is exceeded; the error built by
     *   ErrorFactory.pdfPageLimitExceeded when the page limit is enforced and exceeded
     */
    static async process(content, options) {
        const provider = String(options?.provider || "unknown").toLowerCase();
        if (!PDFProcessor.isValidPDF(content)) {
            throw new Error("Invalid PDF file format. File must start with %PDF- header.");
        }
        const config = PDFProcessor.#findConfig(provider);
        if (!config) {
            const supportedProviders = Object.keys(PDF_PROVIDER_CONFIGS).join(", ");
            throw new Error(`PDF files are not configured for ${provider} provider.\n` +
                `Configured providers: ${supportedProviders}\n` +
                `Current provider: ${provider}\n\n` +
                `Options:\n` +
                `1. Switch to a configured provider (--provider openai or --provider vertex)\n` +
                `2. Contact support to add ${provider} PDF configuration`);
        }
        // isValidPDF() passed, so content is binary and this is never null.
        const bytes = PDFProcessor.#asBuffer(content);
        const sizeMB = bytes.length / (1024 * 1024);
        if (sizeMB > config.maxSizeMB) {
            throw new Error(`PDF size ${sizeMB.toFixed(2)}MB exceeds ${config.maxSizeMB}MB limit for ${provider}`);
        }
        const metadata = PDFProcessor.extractBasicMetadata(bytes);
        if (metadata.estimatedPages && metadata.estimatedPages > config.maxPages) {
            // Limits are enforced unless the caller explicitly passes enforceLimits: false.
            const enforceLimits = options?.enforceLimits !== false;
            if (enforceLimits) {
                throw ErrorFactory.pdfPageLimitExceeded(metadata.estimatedPages, config.maxPages, provider);
            }
            logger.warn(`[PDF] ⚠️ LIMIT BYPASS: Processing ${metadata.estimatedPages}-page PDF despite ${config.maxPages}-page limit for ${provider}. ` +
                `This may cause API rejection, token errors, or unexpected costs. ` +
                `Consider splitting the PDF or using a different provider.`);
        }
        if (provider === "bedrock" && options?.bedrockApiMode === "converse") {
            logger.info("[PDF] Using Bedrock Converse API. " +
                "Visual PDF analysis requires citations enabled. " +
                "Text-only mode: ~1,000 tokens/3 pages. " +
                "Visual mode: ~7,000 tokens/3 pages.");
        }
        logger.info("[PDF] ✅ Validated PDF file", {
            provider,
            size: `${sizeMB.toFixed(2)}MB`,
            version: metadata.version,
            estimatedPages: metadata.estimatedPages,
            apiType: config.apiType,
            supportsNative: config.supportsNative,
        });
        return {
            type: "pdf",
            content,
            mimeType: "application/pdf",
            metadata: {
                confidence: 100,
                size: bytes.length,
                ...metadata,
                provider,
                apiType: config.apiType,
            },
        };
    }
    /**
     * Check if a provider supports native PDF input
     * @param provider - Provider name (matched case-insensitively)
     * @returns true if provider can accept PDF directly; false if it requires
     *   image conversion, has no config, or the name is missing / not a string
     */
    static supportsNativePDF(provider) {
        return PDFProcessor.#findConfig(provider)?.supportsNative ?? false;
    }
    /**
     * Get the PDF config for a provider.
     * @param provider - Provider name (matched case-insensitively, like process())
     * @returns The config object, or null when the provider has none
     */
    static getProviderConfig(provider) {
        return PDFProcessor.#findConfig(provider);
    }
    /**
     * Check whether binary data starts with the %PDF- signature.
     * @param buffer - Buffer or typed-array view to inspect
     * @returns true only for binary input of at least 5 bytes beginning with
     *   "%PDF-"; null, strings and other non-binary values yield false
     */
    static isValidPDF(buffer) {
        const bytes = PDFProcessor.#asBuffer(buffer);
        const signature = PDFProcessor.PDF_SIGNATURE;
        if (bytes === null || bytes.length < signature.length) {
            return false;
        }
        return bytes.subarray(0, signature.length).equals(signature);
    }
    /**
     * Read the PDF version and a rough page estimate from the start of the file.
     *
     * Only the first 10000 bytes are scanned, so `estimatedPages` counts the
     * "/Type /Page" markers visible in that window, not the true page count.
     *
     * @param buffer - PDF bytes
     * @returns `{ version, estimatedPages, filename }` where version is "unknown"
     *   when no %PDF-x.y header is found, estimatedPages is null when no page
     *   marker is found, and filename is always undefined
     */
    static extractBasicMetadata(buffer) {
        // Typed-array views are read as raw bytes; anything else is used as given.
        const source = PDFProcessor.#asBuffer(buffer) ?? buffer;
        const headerSize = Math.min(10000, source.length);
        const header = source.toString("utf-8", 0, headerSize);
        const versionMatch = header.match(/%PDF-(\d\.\d)/);
        const version = versionMatch ? versionMatch[1] : "unknown";
        // [^s] keeps "/Type /Pages" (the page-tree node) out of the count.
        const pageMatches = header.match(/\/Type\s*\/Page[^s]/g);
        const estimatedPages = pageMatches ? pageMatches.length : null;
        return {
            version,
            estimatedPages,
            filename: undefined,
        };
    }
    /**
     * Estimate token usage for a PDF from its page count.
     * @param pageCount - Number of pages
     * @param mode - "text-only" uses 1000 tokens per 3 pages; any other value
     *   (default "visual") uses 7000 tokens per 3 pages
     * @returns Estimated tokens, rounded up
     */
    static estimateTokens(pageCount, mode = "visual") {
        const tokensPerThreePages = mode === "text-only" ? 1000 : 7000;
        return Math.ceil((pageCount / 3) * tokensPerThreePages);
    }
    // ============================================================================
    // PDF → Image Conversion (for providers without native PDF support)
    // ============================================================================
    /**
     * Convert a PDF buffer to an array of base64 PNG images
     *
     * Intended for providers that do not accept PDF input directly but do accept
     * images: each page is rendered by the pdf-to-img package and returned as a
     * base64 string.
     *
     * @param pdfBuffer - PDF file content as Buffer
     * @param options - Conversion options: `scale` (default 2), `maxPages`
     *   (default PDF_LIMITS.DEFAULT_MAX_PAGES), `format` (only "png")
     * @returns Promise of `{ images, pageCount, conversionTimeMs, warnings }`;
     *   `warnings` is undefined when there are none
     * @throws Error when the format is not "png", the buffer is empty, too small,
     *   lacks the %PDF- header or exceeds PDF_LIMITS.MAX_SIZE_MB; conversion
     *   failures are rethrown as "PDF to image conversion failed: ..." with the
     *   original error attached as `cause`
     *
     * @example
     * ```typescript
     * // Check if conversion is needed
     * if (!PDFProcessor.supportsNativePDF('azure')) {
     *   const result = await PDFProcessor.convertToImages(pdfBuffer, {
     *     scale: 2,
     *     maxPages: 10
     *   });
     *   // Use images in LLM input instead of PDF
     *   options.input.images = result.images;
     * }
     * ```
     */
    static async convertToImages(pdfBuffer, options) {
        const startTime = Date.now();
        const { scale = 2, maxPages = PDF_LIMITS.DEFAULT_MAX_PAGES, format = "png", } = options || {};
        const images = [];
        const warnings = [];
        // ============================================================================
        // INPUT VALIDATION (Security: Prevent malformed/malicious PDF processing)
        // ============================================================================
        // 0. Only the exact, lower-case string "png" is accepted
        if (format !== "png") {
            throw new Error(`Invalid format: "${format}". Only "png" format is currently supported.`);
        }
        // 1. Validate buffer is not empty or too small
        if (!pdfBuffer || pdfBuffer.length < 5) {
            throw new Error("Invalid PDF: Buffer is too small or empty. " +
                "A valid PDF must be at least 5 bytes (PDF header).");
        }
        // 2. Validate PDF magic bytes (%PDF-)
        if (!PDFProcessor.isValidPDF(pdfBuffer)) {
            throw new Error("Invalid PDF: File must start with %PDF- header. " +
                "The provided buffer does not appear to be a valid PDF file.");
        }
        // 3. Validate maximum buffer size to prevent memory exhaustion
        const sizeMB = pdfBuffer.length / (1024 * 1024);
        if (sizeMB > PDF_LIMITS.MAX_SIZE_MB) {
            throw new Error(`PDF too large for image conversion: ${sizeMB.toFixed(2)}MB exceeds ${PDF_LIMITS.MAX_SIZE_MB}MB limit. ` +
                "Consider splitting the PDF or using a provider with native PDF support.");
        }
        logger.debug("[PDF→Image] ✅ PDF validation passed", {
            bufferSize: pdfBuffer.length,
            sizeMB: sizeMB.toFixed(2),
            maxPages,
        });
        try {
            // Dynamic import so the dependency is only loaded when a conversion
            // is actually requested
            const pdfToImgModule = await import("pdf-to-img");
            const pdf = pdfToImgModule.pdf;
            logger.debug("[PDF→Image] Starting PDF to image conversion", {
                bufferSize: pdfBuffer.length,
                scale,
                maxPages: maxPages || "all",
            });
            // Create PDF document iterator
            const document = await pdf(pdfBuffer, { scale });
            let pageIndex = 0;
            // Iterate through pages and convert to base64
            for await (const page of document) {
                // Stop once the limit is reached; the warning is only recorded when
                // the document really has more pages than were converted.
                // NOTE(review): maxPages of 0 or null stops before the first page and
                // then surfaces below as "PDF has 0 pages", although the debug log
                // above reports such values as "all" - confirm the intended meaning.
                if (maxPages !== undefined && pageIndex >= maxPages) {
                    warnings.push(`Stopped at page ${pageIndex} (maxPages limit: ${maxPages})`);
                    break;
                }
                // Convert PNG buffer to base64
                const base64Image = page.toString("base64");
                images.push(base64Image);
                pageIndex++;
                logger.debug(`[PDF→Image] Converted page ${pageIndex}`, {
                    imageSizeBytes: page.length,
                    base64Length: base64Image.length,
                });
            }
            // Check for empty PDF (0 pages)
            if (images.length === 0) {
                throw new Error("PDF has 0 pages. Cannot convert empty PDF to images.");
            }
            const conversionTimeMs = Date.now() - startTime;
            logger.info("[PDF→Image] ✅ PDF conversion completed", {
                pageCount: images.length,
                conversionTimeMs,
                // Sum of base64 string lengths, not of the decoded PNG sizes.
                totalImageBytes: images.reduce((sum, img) => sum + img.length, 0),
            });
            return {
                images,
                pageCount: images.length,
                conversionTimeMs,
                warnings: warnings.length > 0 ? warnings : undefined,
            };
        }
        catch (error) {
            const conversionTimeMs = Date.now() - startTime;
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error("[PDF→Image] ❌ PDF conversion failed", {
                error: errorMessage,
                conversionTimeMs,
            });
            throw new Error(`PDF to image conversion failed: ${errorMessage}`, {
                cause: error,
            });
        }
    }
    /**
     * Convert a PDF file path to an array of base64 PNG images
     *
     * Reads the whole file into memory and delegates to convertToImages().
     *
     * @param pdfPath - Path to the PDF file
     * @param options - Conversion options (see convertToImages)
     * @returns Promise with conversion result
     */
    static async convertFromPath(pdfPath, options) {
        const fs = await import("fs/promises");
        const pdfBuffer = await fs.readFile(pdfPath);
        return PDFProcessor.convertToImages(pdfBuffer, options);
    }
    /**
     * Check if PDF to image conversion is available
     * Useful for feature detection
     *
     * @returns true if the pdf-to-img package can be imported, false otherwise
     */
    static async isImageConversionAvailable() {
        try {
            await import("pdf-to-img");
            return true;
        }
        catch {
            // Any import failure simply means the feature is unavailable.
            return false;
        }
    }
    /**
     * Get estimated memory usage for converting a PDF
     *
     * Rough estimation: twice the PDF size for processing, plus an output
     * budget of 2 MB per page multiplied by the scale factor.
     *
     * @param pdfSizeBytes - Size of PDF file in bytes
     * @param pageCount - Estimated number of pages
     * @param scale - Scale factor (default 2)
     * @returns Estimated memory usage in MB, rounded up
     */
    static estimateConversionMemoryUsage(pdfSizeBytes, pageCount, scale = 2) {
        const pdfProcessingMB = (pdfSizeBytes / (1024 * 1024)) * 2;
        const outputImagesMB = pageCount * 2 * scale;
        return Math.ceil(pdfProcessingMB + outputImagesMB);
    }
    /**
     * Get list of providers that require PDF → Image conversion
     * @returns Names of configured providers whose supportsNative flag is falsy,
     *   in config-table order
     */
    static getImageFallbackProviders() {
        return Object.entries(PDF_PROVIDER_CONFIGS)
            .filter(([, config]) => !config.supportsNative)
            .map(([name]) => name);
    }
    /**
     * Get list of providers that support native PDF
     * @returns Names of configured providers whose supportsNative flag is truthy,
     *   in config-table order
     */
    static getNativePDFProviders() {
        return Object.entries(PDF_PROVIDER_CONFIGS)
            .filter(([, config]) => config.supportsNative)
            .map(([name]) => name);
    }
}
// Backward-compatibility alias: PDFImageConverter is the very same class object
// as PDFProcessor, so every static method is reachable under either name.
export const PDFImageConverter = PDFProcessor;