UNPKG

file2md

Version:

A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX, HWP, HWPX) into Markdown with image and layout preservation

147 lines 5.44 kB
import fs from 'node:fs'; import path from 'node:path'; import { ImageExtractionError } from '../types/errors.js'; export class ImageExtractor { outputDir; imageCounter = 0; extractedImages = new Map(); constructor(outputDir = 'images') { this.outputDir = outputDir; // Reset counter to ensure fresh start this.reset(); // Create images directory if it doesn't exist if (!fs.existsSync(this.outputDir)) { fs.mkdirSync(this.outputDir, { recursive: true }); } } /** * Extract images from a ZIP archive (DOCX, XLSX, PPTX) */ async extractImagesFromZip(zip, basePath = '') { const images = []; zip.forEach((relativePath, file) => { // Check for image files in common locations if (this.isImageFile(relativePath)) { images.push({ path: relativePath, file, basePath }); } }); const extractedImages = []; for (const img of images) { try { const imageData = await img.file.async('nodebuffer'); const savedPath = await this.saveImage(imageData, img.path, img.basePath); if (savedPath) { extractedImages.push({ originalPath: img.path, savedPath, basePath: img.basePath, format: this.getImageFormat(img.path), size: imageData.length }); } } catch (error) { console.warn(`Failed to extract image ${img.path}:`, error instanceof Error ? error.message : 'Unknown error'); } } return extractedImages; } /** * Save an image buffer to disk */ async saveImage(buffer, originalPath, basePath = '') { this.imageCounter++; const ext = path.extname(originalPath) || '.png'; const filename = `image_${this.imageCounter}${ext}`; const fullPath = path.join(this.outputDir, filename); try { console.log(`[DEBUG] Saving image: ${filename} (counter: ${this.imageCounter})`); console.log(`[DEBUG] Original path: ${originalPath}, Base path: ${basePath}`); console.log(`[DEBUG] Full output path: ${fullPath}`); console.log(`[DEBUG] Image buffer size: ${buffer.length} bytes`); fs.writeFileSync(fullPath, buffer); // Store mapping for reference lookup const key = basePath + originalPath; this.extractedImages.set(key, filename); console.log(`[DEBUG] Successfully saved image and stored mapping: ${key} -> ${filename}`); // Return the full absolute path, not just the filename return path.resolve(fullPath); } catch (error) { const message = error instanceof Error ? error.message : 'Unknown error'; console.error(`[DEBUG] Failed to save image ${filename}: ${message}`); throw new ImageExtractionError(`Failed to save image ${filename}: ${message}`, error); } } /** * Check if a file path represents an image */ isImageFile(filePath) { const imageExtensions = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.svg', '.emf', '.wmf']; const ext = path.extname(filePath).toLowerCase(); return imageExtensions.includes(ext) || filePath.includes('/media/') || filePath.includes('/images/') || filePath.includes('/media/') || filePath.includes('/images/'); } /** * Get image format from file extension */ getImageFormat(filePath) { const ext = path.extname(filePath).toLowerCase(); return ext.startsWith('.') ? ext.slice(1) : 'unknown'; } /** * Get markdown reference for an image by its original path */ getImageReference(originalPath, basePath = '') { const key = basePath + originalPath; const savedFilename = this.extractedImages.get(key); if (savedFilename) { return `![Image](${this.outputDir}/${savedFilename})`; } return null; } /** * Create markdown image reference using HTML img tag for better compatibility */ getImageMarkdown(description = 'Image', imagePath) { if (imagePath) { // Use relative path - just the directory name, not the full path const relativePath = `./images/${imagePath}`; return `<img src="${relativePath}" alt="${description}" style="max-width:100%;height:auto" />`; } return `<img src="./image-not-found" alt="${description}" />`; } /** * Reset the image counter and clear extracted images map */ reset() { this.imageCounter = 0; this.extractedImages.clear(); } /** * Get the output directory for images */ get imageDirectory() { return this.outputDir; } /** * Get the current image counter */ get currentImageCount() { return this.imageCounter; } /** * Get all extracted image mappings */ get extractedImageMappings() { return this.extractedImages; } } //# sourceMappingURL=image-extractor.js.map