UNPKG

docxmcp

Version:

A Model Context Protocol (MCP) server for processing .docx files into markdown with image extraction

106 lines (88 loc) 3.17 kB
import mammoth from 'mammoth';
import { promises as fs } from 'fs';
import { dirname } from 'path';
import { fileURLToPath } from 'url';
import { PROJECT_DIR } from './constants.js';
import path from 'path';
import { spawn } from 'child_process';

const __dirname = dirname(fileURLToPath(import.meta.url));

// Create images directory if it doesn't exist. A failure here is only logged;
// it will resurface as a write error the first time an image is saved.
const imagesDir = path.join(PROJECT_DIR, 'tmp', 'images');
await fs.mkdir(imagesDir, { recursive: true }).catch(console.error);

// File extension for each supported image MIME type. A Map is used instead of
// an object literal so that a content type such as "constructor" or
// "toString" cannot resolve to an inherited Object.prototype member.
const IMAGE_EXTENSIONS = new Map([
  ['image/png', 'png'],
  ['image/jpeg', 'jpg'],
  ['image/jpg', 'jpg'],
  ['image/gif', 'gif'],
  ['image/bmp', 'bmp'],
  ['image/tiff', 'tiff'],
  ['image/webp', 'webp'],
]);

// Extension used when the content type is missing or not in the table above.
const DEFAULT_IMAGE_EXTENSION = 'png';

/**
 * Convert a .docx file to markdown, extracting every embedded image.
 *
 * Each image is written to the shared images directory as `image-N.<ext>`,
 * base64-encoded into the returned metadata, and represented in the markdown
 * by a plain `[Image N]` placeholder.
 *
 * @param {string} filePath - Path of the .docx file to read.
 * @returns {Promise<{markdown: string, images: Array<{path: string, filename: string, contentType: string, base64Data: string, alt: string, placeholder: string}>, messages: Array}>}
 *   The markdown text, one metadata entry per extracted image, and the
 *   messages reported by mammoth.
 * @throws {Error} "Failed to process document: ..." when reading or converting
 *   fails; the underlying error is attached as `cause`.
 */
export async function processDocx(filePath) {
  try {
    const buffer = await fs.readFile(filePath);
    let imageCounter = 0;
    const images = [];

    // Configure mammoth to handle images
    const options = {
      convertImage: mammoth.images.imgElement(async (image) => {
        // Claim this image's number before the first await so the filename,
        // alt text and placeholder always agree, even if another handler
        // invocation bumps the shared counter while this one is suspended.
        imageCounter += 1;
        const index = imageCounter;

        const imageBuffer = await image.read();
        const extension = getImageExtension(image.contentType);
        const filename = `image-${index}.${extension}`;
        const imagePath = path.join(imagesDir, filename);

        // Save the image temporarily (for fallback/debugging)
        await fs.writeFile(imagePath, imageBuffer);

        const alt = `Image ${index}`;
        const placeholder = `[${alt}]`;

        // Store image info with base64 data
        images.push({
          path: imagePath,
          filename,
          contentType: image.contentType,
          base64Data: imageBuffer.toString('base64'),
          alt,
          placeholder,
        });

        // Return a placeholder instead of a base64 data URL
        return { src: placeholder, alt };
      }),
    };

    // Extract text with image placeholders
    const result = await mammoth.convertToMarkdown({ buffer }, options);

    // Collapse the generated "![alt]([Image N])" image syntax down to the
    // bare "[Image N]" placeholder.
    const markdown = result.value.replace(
      /!\[([^\]]*)\]\(\[Image (\d+)\]\)/g,
      '[Image $2]',
    );

    return { markdown, images, messages: result.messages };
  } catch (error) {
    // Keep the original error (and its stack) reachable through `cause`.
    throw new Error(`Failed to process document: ${error.message}`, { cause: error });
  }
}

/**
 * Map an image MIME type to the file extension used when saving it.
 *
 * The lookup ignores letter case, surrounding whitespace and any parameters
 * after a ";".
 *
 * @param {string} [contentType] - MIME type reported for the image.
 * @returns {string} The matching extension without a leading dot, or "png"
 *   when the type is missing or unrecognised.
 */
function getImageExtension(contentType) {
  if (typeof contentType !== 'string') {
    return DEFAULT_IMAGE_EXTENSION;
  }

  const mimeType = contentType.split(';', 1)[0].trim().toLowerCase();
  return IMAGE_EXTENSIONS.get(mimeType) ?? DEFAULT_IMAGE_EXTENSION;
}

/**
 * Delete the temporary files behind a list of extracted images.
 *
 * Cleanup is best-effort: a failure on one file is logged and the remaining
 * files are still processed. Entries without a `path` are skipped, and a file
 * that no longer exists counts as already cleaned up.
 *
 * @param {Array<{path?: string}>} images - Image entries as returned by
 *   processDocx; any non-array value is ignored.
 * @returns {Promise<void>}
 */
export async function cleanupImages(images) {
  if (!Array.isArray(images)) return;

  for (const image of images) {
    if (!image?.path) continue;

    try {
      await fs.unlink(image.path);
    } catch (error) {
      // The file being gone already is the desired end state, not a failure.
      if (error?.code === 'ENOENT') continue;
      console.error(`Failed to cleanup image ${image.path}:`, error);
    }
  }
}