@smythos/sdk
Version:
199 lines (176 loc) • 10.1 kB
text/typescript
import { DocParser, TDocumentParseSettings, TParsedDocument } from '../DocParser.class';
import { readFile } from 'fs/promises';
import path from 'path';
/**
 * Document parser for PDF files, built on the `pdfjs-dist` legacy build.
 *
 * For every page it extracts the text content and inspects the page's operator list for
 * embedded images (`paintImageXObject`). Images whose decoded bytes carry a real JPEG/PNG
 * file signature are returned as base64 data URIs; anything else is reported as a
 * placeholder entry with metadata only.
 */
export class PDFParser extends DocParser {
    protected supportedMimeTypes: string[] = ['application/pdf'];
    protected supportedExtensions: string[] = ['pdf'];

    /** Substrings identifying the canvas/rendering warnings that are silenced while parsing. */
    private static readonly SUPPRESSED_WARNING_MARKERS: readonly string[] = [
        '@napi-rs/canvas',
        'Cannot polyfill',
        'DOMMatrix',
        'ImageData',
        'Path2D',
        'rendering may be broken',
    ];

    /** Number of `parse()` calls currently relying on the filtered `console.warn`. */
    private static activeWarnSuppressions = 0;

    /** The `console.warn` implementation that was installed before the filter was applied. */
    private static unfilteredConsoleWarn: typeof console.warn | undefined;

    /**
     * Installs a filtering `console.warn` that drops canvas-related warnings.
     * Ref-counted so that overlapping `parse()` calls share a single patch and the
     * original function is restored exactly once, by the last call to finish.
     */
    private static suppressCanvasWarnings(): void {
        PDFParser.activeWarnSuppressions += 1;
        if (PDFParser.activeWarnSuppressions > 1) return;

        const unfiltered = console.warn;
        PDFParser.unfilteredConsoleWarn = unfiltered;
        console.warn = (...args: unknown[]) => {
            const message = args.join(' ');
            if (PDFParser.SUPPRESSED_WARNING_MARKERS.some((marker) => message.includes(marker))) {
                return; // Suppress these warnings
            }
            unfiltered.apply(console, args);
        };
    }

    /** Undoes one `suppressCanvasWarnings()` call; restores `console.warn` when none remain. */
    private static restoreCanvasWarnings(): void {
        PDFParser.activeWarnSuppressions -= 1;
        if (PDFParser.activeWarnSuppressions === 0 && PDFParser.unfilteredConsoleWarn) {
            console.warn = PDFParser.unfilteredConsoleWarn;
            PDFParser.unfilteredConsoleWarn = undefined;
        }
    }

    /**
     * Detects whether `bytes` starts with a complete JPEG or PNG file signature.
     * The full signatures are checked (not just the first two bytes) so that raw pixel
     * buffers which happen to begin with `ff d8` or `89 50` are not mistaken for image files.
     *
     * @returns The matching MIME type, or `undefined` when no signature matches.
     */
    private static sniffImageMimeType(bytes: Uint8Array): 'image/jpeg' | 'image/png' | undefined {
        const startsWith = (signature: readonly number[]): boolean =>
            bytes.length >= signature.length && signature.every((byte, idx) => bytes[idx] === byte);

        if (startsWith([0xff, 0xd8, 0xff])) return 'image/jpeg';
        if (startsWith([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a])) return 'image/png';
        return undefined;
    }

    /**
     * Parses the PDF at `source` into per-page content plus document metadata.
     *
     * @param source - Filesystem path of the PDF file to read.
     * @param params - Parse settings; accepted for interface compatibility, not used here.
     * @returns The parsed document: title (PDF `Title` or the file name without extension),
     *          metadata (uri, author, date, tags from `Keywords`) and one entry per page.
     * @throws Re-throws any file read or PDF parsing error after logging it.
     */
    async parse(source: string, params?: TDocumentParseSettings): Promise<TParsedDocument> {
        // Suppress canvas-related warnings since we only need text extraction.
        // The matching restore happens in the outer `finally`, on success and on failure.
        PDFParser.suppressCanvasWarnings();
        try {
            // Lazy-load pdfjs-dist to reduce initial bundle size
            const pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');

            // Set up the worker for pdfjs-dist (legacy build for Node.js)
            const workerSrc = 'pdfjs-dist/legacy/build/pdf.worker.mjs';
            if (pdfjsLib.GlobalWorkerOptions.workerSrc !== workerSrc) {
                pdfjsLib.GlobalWorkerOptions.workerSrc = workerSrc;
            }

            try {
                const dataBuffer = await readFile(source);
                const fileNameWithoutExtension = path.basename(source, path.extname(source));

                // Use pdfjs-dist for text extraction and metadata
                const loadingTask = pdfjsLib.getDocument({
                    data: new Uint8Array(dataBuffer),
                    fontExtraProperties: true,
                } as any);

                try {
                    const pdfDocument = await loadingTask.promise;
                    const pages = [];

                    for (let pageNumber = 1; pageNumber <= pdfDocument.numPages; pageNumber++) {
                        const page = await pdfDocument.getPage(pageNumber);
                        const textContent = await page.getTextContent();
                        const operatorList = await page.getOperatorList();
                        const content: any[] = [];

                        // 1. Extract text content
                        const pageText = textContent.items.map((item: any) => item.str).join(' ');
                        if (pageText.trim().length > 0) {
                            content.push({
                                type: 'text' as const,
                                data: pageText,
                                text: pageText,
                            });
                        }

                        // 2. Collect the unique image ids painted on this page
                        const imageNames = new Set<string>();
                        for (let opIndex = 0; opIndex < operatorList.fnArray.length; opIndex++) {
                            if (operatorList.fnArray[opIndex] === pdfjsLib.OPS.paintImageXObject) {
                                imageNames.add(operatorList.argsArray[opIndex][0]);
                            }
                        }

                        // 3. Try to extract each unique image
                        for (const imageName of imageNames) {
                            try {
                                // pdf.js keeps document-wide ("g_"-prefixed) objects in commonObjs
                                // and page-local ones in objs.
                                const objectStore = imageName.startsWith('g_') ? page.commonObjs : page.objs;

                                // Only wait when the object has not been resolved yet; a synchronous
                                // get() on an unresolved object throws and is handled below.
                                if (!objectStore.has(imageName)) {
                                    await new Promise<void>((resolve) => setTimeout(resolve, 100));
                                }

                                const imageObj = objectStore.get(imageName);
                                if (!imageObj) {
                                    console.warn(`Image object ${imageName} not found`);
                                    continue;
                                }

                                console.log(`Found image object ${imageName}:`, {
                                    hasData: !!imageObj.data,
                                    dataType: typeof imageObj.data,
                                    dataLength: imageObj.data?.length,
                                    width: imageObj.width,
                                    height: imageObj.height,
                                    kind: imageObj.kind,
                                    keys: Object.keys(imageObj),
                                });

                                if (!(imageObj.data && (imageObj.data instanceof Uint8Array || Buffer.isBuffer(imageObj.data)))) {
                                    console.warn(`Image ${imageName} has no extractable data`);
                                    continue;
                                }

                                const dataArray: Uint8Array =
                                    imageObj.data instanceof Uint8Array ? imageObj.data : new Uint8Array(imageObj.data);
                                const mimeType = PDFParser.sniffImageMimeType(dataArray);

                                if (mimeType) {
                                    console.log(`Found valid ${mimeType === 'image/jpeg' ? 'JPEG' : 'PNG'} file ${imageName}`);
                                    const base64Image = Buffer.from(dataArray).toString('base64');
                                    content.push({
                                        type: 'image' as const,
                                        data: `data:${mimeType};base64,${base64Image}`,
                                        text: `[Embedded Image: ${imageObj.width || 'unknown'}x${imageObj.height || 'unknown'}]`,
                                    });
                                    console.log(`Successfully extracted ${mimeType} image ${imageName} from page ${pageNumber}`);
                                } else {
                                    // Raw pixel data - provide metadata without invalid base64
                                    const header = Buffer.from(dataArray.subarray(0, 4)).toString('hex');
                                    console.log(`Image ${imageName} header: ${header} - appears to be raw pixel data`);
                                    console.log(`Image ${imageName} contains raw pixel data, providing metadata only`);
                                    content.push({
                                        type: 'image' as const,
                                        data: '', // Empty data to avoid invalid base64
                                        text: `[Image Placeholder: ${imageObj.width || 'unknown'}x${
                                            imageObj.height || 'unknown'
                                        } - Raw pixel data not extractable via PDF.js]`,
                                        metadata: {
                                            imageName: imageName,
                                            width: imageObj.width,
                                            height: imageObj.height,
                                            kind: imageObj.kind,
                                            dataLength: dataArray.length,
                                            note: 'This image exists in the PDF but is stored as raw pixel data. Consider using specialized PDF image extraction tools for full image recovery.',
                                        },
                                    });
                                }
                            } catch (error: unknown) {
                                // Image extraction is best-effort: a failure must not abort the page.
                                const reason = error instanceof Error ? error.message : String(error);
                                console.warn(`Could not extract image ${imageName} from page ${pageNumber}: ${reason}`);
                            }
                        }

                        pages.push({
                            content: content,
                            metadata: { pageNumber: pageNumber },
                        });
                    }

                    // Extract metadata
                    const metadata = await pdfDocument.getMetadata();
                    const info = (metadata.info as any) || {};
                    // Keywords is only split when it really is a string.
                    const keywords: string = typeof info.Keywords === 'string' ? info.Keywords : '';

                    return {
                        title: info.Title || fileNameWithoutExtension || '',
                        metadata: {
                            uri: source,
                            author: info.Author || '',
                            date: info.CreationDate || '',
                            tags: keywords
                                .split(',')
                                .map((k: string) => k.trim())
                                .filter(Boolean),
                        },
                        pages: pages,
                    };
                } finally {
                    // Release the document and its worker resources; everything returned above
                    // is already copied into plain strings/numbers.
                    try {
                        await loadingTask.destroy();
                    } catch (cleanupError: unknown) {
                        // A cleanup failure must not mask the parse result or the original error.
                        console.warn('Failed to release PDF document resources:', cleanupError);
                    }
                }
            } catch (error) {
                console.error('PDF parsing error:', error);
                throw error;
            }
        } finally {
            PDFParser.restoreCanvasWarnings();
        }
    }
}