docxmcp
Version:
A Model Context Protocol (MCP) server for processing .docx files into markdown with image extraction
106 lines (88 loc) • 3.17 kB
JavaScript
import mammoth from 'mammoth';
import { promises as fs } from 'fs';
import { dirname } from 'path';
import { fileURLToPath } from 'url';
import { PROJECT_DIR } from './constants.js';
import path from 'path';
import { spawn } from 'child_process';
// ES modules have no built-in __dirname, so derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Extracted images are written below <PROJECT_DIR>/tmp/images.
const imagesDir = path.join(PROJECT_DIR, 'tmp', 'images');
// Best effort at load time: make sure the directory exists; a failure is
// only logged so that importing this module never throws because of it.
try {
  await fs.mkdir(imagesDir, { recursive: true });
} catch (mkdirError) {
  console.error(mkdirError);
}
/**
 * Convert a .docx file to markdown and extract its embedded images.
 *
 * Every embedded image is written to `imagesDir` as `image-<n>.<ext>`, kept in
 * memory as a base64 string, and represented in the returned markdown by the
 * plain-text placeholder `[Image <n>]`.
 *
 * NOTE(review): numbering restarts at 1 on every call and all calls share one
 * directory, so overlapping calls can overwrite each other's files — confirm
 * that callers serialize conversions or clean up between them.
 *
 * @param {string} filePath - Path of the document; passed straight to `fs.readFile`.
 * @returns {Promise<{markdown: string, images: Array<{path: string, filename: string, contentType: string, base64Data: string, alt: string, placeholder: string}>, messages: Array}>}
 *   The markdown with placeholders, one record per extracted image, and the
 *   messages reported by mammoth.
 * @throws {Error} `Failed to process document: …` when reading, converting or
 *   writing fails; the underlying error is attached as `cause`.
 */
export async function processDocx(filePath) {
  try {
    const buffer = await fs.readFile(filePath);
    let imageCounter = 0;
    const images = [];

    // Configure mammoth to hand every embedded image to our callback.
    const options = {
      convertImage: mammoth.images.imgElement(async (image) => {
        // Take this image's number before the first await so the filename,
        // placeholder and alt text always agree, even if another image
        // callback bumps the shared counter while this one is suspended.
        imageCounter += 1;
        const imageNumber = imageCounter;

        const imageBuffer = await image.read();
        const extension = getImageExtension(image.contentType);
        const filename = `image-${imageNumber}.${extension}`;
        const imagePath = path.join(imagesDir, filename);

        // Save the image temporarily (for fallback/debugging).
        await fs.writeFile(imagePath, imageBuffer);

        const alt = `Image ${imageNumber}`;
        const placeholder = `[Image ${imageNumber}]`;

        images.push({
          path: imagePath,
          filename,
          contentType: image.contentType,
          base64Data: imageBuffer.toString('base64'),
          alt,
          placeholder,
        });

        // Return the placeholder as the image source instead of a base64 data URL.
        return { src: placeholder, alt };
      }),
    };

    // Extract text with image placeholders.
    const result = await mammoth.convertToMarkdown({ buffer }, options);

    // Collapse markdown image syntax `![alt]([Image n])` to the bare `[Image n]`.
    const markdown = result.value.replace(
      /!\[([^\]]*)\]\(\[Image (\d+)\]\)/g,
      '[Image $2]'
    );

    return {
      markdown,
      images,
      messages: result.messages,
    };
  } catch (error) {
    // Keep the original error (and its stack) reachable through `cause`.
    throw new Error(`Failed to process document: ${error.message}`, { cause: error });
  }
}
/**
 * Map an image MIME type to the file extension used for the extracted file.
 *
 * The lookup ignores letter case, surrounding whitespace and any `;`
 * parameters, because MIME types are case-insensitive. A `Map` is used so that
 * strings such as `constructor` cannot resolve to members inherited from
 * `Object.prototype`.
 *
 * @param {string} contentType - MIME type of the image, e.g. `image/jpeg`.
 * @returns {string} The matching extension without a dot; `png` for unknown,
 *   missing or non-string input.
 */
function getImageExtension(contentType) {
  const extensionsByType = new Map([
    ['image/png', 'png'],
    ['image/jpeg', 'jpg'],
    ['image/jpg', 'jpg'],
    ['image/gif', 'gif'],
    ['image/bmp', 'bmp'],
    ['image/tiff', 'tiff'],
    ['image/webp', 'webp'],
  ]);

  if (typeof contentType !== 'string') {
    return 'png';
  }

  // Drop parameters ("image/png; charset=binary") and normalize the rest.
  const mediaType = contentType.split(';', 1)[0].trim().toLowerCase();
  return extensionsByType.get(mediaType) ?? 'png';
}
/**
 * Delete the on-disk copies of previously extracted images (best effort).
 *
 * Entries without a truthy `path` are skipped. A file that no longer exists is
 * treated as already cleaned up; any other failure is logged with
 * `console.error` and the loop moves on, so this function never rejects
 * because of a single file.
 *
 * @param {Array<{path?: string}>} images - Image records to remove; any
 *   non-array value makes the call a no-op.
 * @returns {Promise<void>}
 */
export async function cleanupImages(images) {
  if (!Array.isArray(images)) return;

  for (const image of images) {
    const imagePath = image?.path;
    if (!imagePath) continue;

    try {
      await fs.unlink(imagePath);
    } catch (error) {
      // A missing file is the state we want, so repeated or overlapping
      // cleanups stay quiet.
      if (error?.code === 'ENOENT') continue;
      console.error(`Failed to cleanup image ${imagePath}:`, error);
    }
  }
}