@wonderwhy-er/desktop-commander
Version:
MCP server for terminal operations and file editing
133 lines (132 loc) • 5.81 kB
JavaScript
import { getDocumentProxy, extractImages } from 'unpdf';
/**
 * Extract embedded images from a PDF using unpdf's built-in `extractImages`
 * and re-encode each one as base64 via `convertRawImageToBase64`.
 *
 * Pages are processed in parallel batches. Page numbers outside
 * `1..numPages` and pages whose extraction fails map to an empty array;
 * images whose conversion throws or yields no result are skipped.
 *
 * @param pdfBuffer PDF file as Uint8Array
 * @param pageNumbers Optional array of specific 1-based page numbers to process; all pages when omitted
 * @param compressionOptions Image compression settings, forwarded unchanged to `convertRawImageToBase64`
 * @returns Record of page numbers to extracted images
 */
export async function extractImagesFromPdf(pdfBuffer, pageNumbers, compressionOptions = {}) {
    const pdfDocument = await getDocumentProxy(pdfBuffer);
    const pagesToProcess = pageNumbers || Array.from({ length: pdfDocument.numPages }, (_, i) => i + 1);
    const pageResults = {};
    // Upper bound on the number of pages extracted concurrently
    const batchSize = 5;
    try {
        for (let start = 0; start < pagesToProcess.length; start += batchSize) {
            const batch = pagesToProcess.slice(start, start + batchSize);
            // Finish the whole batch before starting the next one
            const batchResults = await Promise.all(batch.map((pageNum) => extractPageImages(pdfDocument, pageNum, compressionOptions)));
            for (const { pageNum, images } of batchResults) {
                pageResults[pageNum] = images;
            }
        }
    }
    finally {
        // Best-effort teardown: a failure here must not mask the extraction
        // result (or the error) that is already on its way to the caller.
        try {
            if (typeof pdfDocument.cleanup === 'function') {
                await pdfDocument.cleanup(false);
            }
        }
        catch { /* Ignore cleanup errors */ }
        try {
            if (typeof pdfDocument.destroy === 'function') {
                await pdfDocument.destroy();
            }
        }
        catch { /* Ignore cleanup errors */ }
    }
    return pageResults;
}
/**
 * Extract and convert every image on a single page.
 * Extraction and conversion failures are logged with console.warn and
 * swallowed, so one bad page or image does not abort the whole batch.
 *
 * @param pdfDocument Document proxy returned by `getDocumentProxy`
 * @param pageNum 1-based page number
 * @param compressionOptions Image compression settings
 * @returns `{ pageNum, images }` where `images` may be empty
 */
async function extractPageImages(pdfDocument, pageNum, compressionOptions) {
    if (pageNum < 1 || pageNum > pdfDocument.numPages) {
        return { pageNum, images: [] };
    }
    try {
        const extractedImages = await extractImages(pdfDocument, pageNum);
        const images = [];
        for (let index = 0; index < extractedImages.length; index++) {
            const img = extractedImages[index];
            const originalSize = img.data.length;
            try {
                const compressionResult = await convertRawImageToBase64(img.data, img.width, img.height, img.channels, compressionOptions);
                if (compressionResult) {
                    images.push({
                        objId: index, // Position of the image on the page, used as its identifier
                        width: img.width,
                        height: img.height,
                        data: compressionResult.data,
                        mimeType: compressionResult.mimeType,
                        originalSize,
                        // Exact decoded size: unlike `length * 0.75`, this accounts for '=' padding
                        compressedSize: Buffer.byteLength(compressionResult.data, 'base64')
                    });
                }
            }
            catch (err) {
                // A single unconvertible image is skipped rather than failing the page
                console.warn(`Failed to convert image ${index} on page ${pageNum}:`, err instanceof Error ? err.message : String(err));
            }
        }
        return { pageNum, images };
    }
    catch (error) {
        console.warn(`Failed to extract images from page ${pageNum}:`, error instanceof Error ? error.message : String(error));
        return { pageNum, images: [] };
    }
}
/**
 * Convert raw image data to compressed base64 using sharp.
 *
 * Images whose width or height exceeds `maxDimension` are scaled down
 * proportionally so the longer side equals `maxDimension`; smaller images
 * keep their size. sharp is imported dynamically, so a missing install
 * results in a `null` return instead of a load-time failure.
 *
 * @param data Raw pixel data (anything `Buffer.from` accepts)
 * @param width Source width in pixels
 * @param height Source height in pixels
 * @param channels Number of channels in the raw data
 * @param options `{ format = 'webp', quality = 85, maxDimension = 1200 }`;
 *   any `format` other than 'jpeg' is encoded as webp
 * @returns `{ data, mimeType }` with base64 `data`, or `null` when conversion fails
 */
async function convertRawImageToBase64(data, width, height, channels, options = {}) {
    const { format = 'webp', quality = 85, maxDimension = 1200 } = options;
    // Smart resizing - only resize large images
    let targetWidth = width;
    let targetHeight = height;
    if (width > maxDimension || height > maxDimension) {
        const scale = maxDimension / Math.max(width, height);
        const scaleDimension = (value) => {
            const scaled = Math.round(value * scale);
            // Very elongated images can round the short side down to 0, which
            // sharp's resize rejects; keep at least one pixel. Results from a
            // non-positive scale (invalid maxDimension) are passed through so
            // they still fail as before.
            return scaled === 0 && scale > 0 ? 1 : scaled;
        };
        targetWidth = scaleDimension(width);
        targetHeight = scaleDimension(height);
    }
    try {
        // Try to dynamically import sharp
        const sharp = (await import('sharp')).default;
        // sharp takes Buffer, Uint8Array, etc.
        // unpdf returns Uint8ClampedArray, which works with Buffer.from()
        let pipeline = sharp(Buffer.from(data), {
            raw: { width, height, channels }
        });
        if (targetWidth !== width || targetHeight !== height) {
            pipeline = pipeline.resize(targetWidth, targetHeight);
        }
        const isJpeg = format === 'jpeg';
        // Default to webp for every format other than jpeg
        const encoder = isJpeg ? pipeline.jpeg({ quality }) : pipeline.webp({ quality });
        const outputBuffer = await encoder.toBuffer();
        return {
            data: outputBuffer.toString('base64'),
            mimeType: isJpeg ? 'image/jpeg' : 'image/webp'
        };
    }
    catch (error) {
        console.warn('Image conversion failed (likely missing sharp or invalid data):', error instanceof Error ? error.message : String(error));
        return null;
    }
}