file2md
Version:
A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX, HWP, HWPX) into Markdown with image and layout preservation
408 lines • 21.6 kB
JavaScript
import JSZip from 'jszip';
import { parseStringPromise } from 'xml2js';
import path from 'node:path';
import { PptxVisualParser } from '../utils/pptx-visual-parser.js';
import { ParseError } from '../types/errors.js';
/**
* Parse PPTX buffer and convert to markdown with layout preservation
*/
export async function parsePptx(buffer, imageExtractor, chartExtractor, options = {}) {
try {
return await parsePptxToMarkdown(buffer, imageExtractor, chartExtractor, options);
}
catch (error) {
const message = error instanceof Error ? error.message : 'Unknown error';
throw new ParseError('PPTX', message, error);
}
}
/**
* Parse PPTX to markdown without image generation
*/
async function parsePptxToMarkdown(buffer, imageExtractor, chartExtractor, options) {
const zip = await JSZip.loadAsync(buffer);
// Enhanced visual parsing if requested (for text extraction)
let visualLayouts;
if (options.useVisualParser !== false) {
try {
const visualParser = new PptxVisualParser();
const readonlyLayouts = await visualParser.parseVisualElements(buffer);
visualLayouts = [...readonlyLayouts];
console.log(`Visual parser extracted ${visualLayouts.length} slide layouts`);
}
catch (visualError) {
console.warn('Visual parsing failed, continuing with standard processing:', visualError);
}
}
// Extract embedded images metadata only (not for slide screenshots)
const extractedImages = options.extractImages !== false
? await imageExtractor.extractImagesFromZip(zip, 'ppt/')
: [];
// Extract charts if enabled
const extractedCharts = options.extractCharts !== false
? await chartExtractor.extractChartsFromZip(zip, 'ppt/')
: [];
const slideFiles = [];
zip.forEach((relativePath, file) => {
if (relativePath.startsWith('ppt/slides/slide') && relativePath.endsWith('.xml')) {
slideFiles.push({
path: relativePath,
file
});
}
});
slideFiles.sort((a, b) => {
const aNum = parseInt(a.path.match(/slide(\d+)\.xml/)?.[1] || '0', 10);
const bNum = parseInt(b.path.match(/slide(\d+)\.xml/)?.[1] || '0', 10);
return aNum - bNum;
});
// Extract title from PPTX metadata
const title = await extractPptxTitle(buffer);
let markdown = '';
if (title) {
markdown += `# ${title}\n\n`;
}
// Use visual layouts for enhanced text extraction if available
if (visualLayouts && visualLayouts.length === slideFiles.length) {
for (let i = 0; i < slideFiles.length; i++) {
const slideFile = slideFiles[i];
const slideNumber = i + 1;
const layout = visualLayouts[i];
if (layout?.title) {
markdown += `## Slide ${slideNumber}: ${layout.title}\n\n`;
}
else {
markdown += `## Slide ${slideNumber}\n\n`;
}
// Extract text from visual layout
const textElements = layout.elements.filter(e => e.type === 'text');
if (textElements.length > 0) {
textElements.forEach((element) => {
if (element.type === 'text' && element.content?.text) {
const textContent = element.content.text.trim();
if (textContent) {
markdown += `${textContent}\n\n`;
}
}
});
}
else {
// Fallback to XML extraction if no text in visual layout
const xmlContent = await slideFile.file.async('string');
const slideContent = await extractSlideTextContent(xmlContent);
if (slideContent.trim()) {
markdown += `${slideContent}\n\n`;
}
else {
markdown += '*No content*\n\n';
}
}
// Debug: Log all elements found in this slide
console.log(`[DEBUG] Slide ${slideNumber} elements:`, layout.elements.map(e => e.type));
// Process image elements and generate markdown references
const imageElements = layout.elements.filter(e => e.type === 'image');
console.log(`[DEBUG] Slide ${slideNumber} found ${imageElements.length} image elements`);
if (imageElements.length > 0) {
markdown += '### Images\n\n';
imageElements.forEach((element, index) => {
if (element.type === 'image' && element.content) {
const imageContent = element.content;
const imagePath = imageContent.imagePath;
console.log(`[DEBUG] Processing image ${index + 1}: ${imagePath}`);
// Try to get markdown reference using the image extractor
const imageRef = imageExtractor.getImageReference(imagePath, 'ppt/');
console.log(`[DEBUG] Image reference from extractor: ${imageRef}`);
if (imageRef) {
markdown += `${imageRef}\n\n`;
}
else {
// Fallback: try to find the image by matching the path
const matchingImage = extractedImages.find(img => img.originalPath === imagePath ||
img.originalPath.endsWith(imagePath.split('/').pop() || ''));
console.log(`[DEBUG] Matching image found: ${matchingImage ? 'yes' : 'no'}`);
if (matchingImage) {
const imageName = path.basename(matchingImage.savedPath);
markdown += `\n\n`;
console.log(`[DEBUG] Generated image reference: `);
}
else if (extractedImages.length > 0) {
// Try to use any available image from extracted images
const availableImage = extractedImages[Math.min(index, extractedImages.length - 1)];
const imageName = path.basename(availableImage.savedPath);
markdown += `\n\n`;
console.log(`[DEBUG] Generated image reference using available image: `);
}
else {
// Last resort: generic reference
markdown += `\n\n`;
console.log(`[DEBUG] Generated fallback image reference`);
}
}
}
});
}
// Process chart elements and potentially embedded images
const chartElements = layout.elements.filter(e => e.type === 'chart');
console.log(`[DEBUG] Slide ${slideNumber} found ${chartElements.length} chart elements`);
// Since visual parser misidentifies images as charts, let's embed images inline for slides with charts
if (chartElements.length > 0) {
// Calculate which images belong to this slide
const imagesPerSlide = Math.ceil(extractedImages.length / slideFiles.length);
const startIndex = (slideNumber - 1) * imagesPerSlide;
const endIndex = Math.min(startIndex + imagesPerSlide, extractedImages.length);
const slideImages = extractedImages.slice(startIndex, endIndex);
console.log(`[DEBUG] Slide ${slideNumber} embedding ${slideImages.length} images inline with charts (from index ${startIndex} to ${endIndex})`);
// Embed images directly in the slide content, not in a separate section
slideImages.forEach((image, index) => {
const imageName = path.basename(image.savedPath);
markdown += `\n\n`;
console.log(`[DEBUG] Embedded inline image reference: `);
});
// Process actual charts if any chart data is available
chartElements.forEach((element, index) => {
if (element.type === 'chart' && element.content) {
const chartContent = element.content;
const chartType = chartContent.chartType || 'unknown';
// Try to find matching extracted chart data
const matchingChart = extractedCharts.find(chart => chart.data.type === chartType || chart.data.title.includes(`Slide ${slideNumber}`));
if (matchingChart) {
// Use the chart extractor's formatChartAsMarkdown method for proper formatting
const chartMarkdown = chartExtractor.formatChartAsMarkdown(matchingChart.data);
markdown += chartMarkdown;
}
else {
// Try to use any available chart from the slide
const slideChart = extractedCharts[index] || extractedCharts[0];
if (slideChart) {
const chartMarkdown = chartExtractor.formatChartAsMarkdown(slideChart.data);
markdown += chartMarkdown;
}
// If no actual chart data, don't show anything since images are already embedded
}
}
});
}
// Process table elements
const tableElements = layout.elements.filter(e => e.type === 'table');
console.log(`[DEBUG] Slide ${slideNumber} found ${tableElements.length} table elements`);
if (tableElements.length > 0) {
markdown += '### Tables\n\n';
tableElements.forEach((element, index) => {
if (element.type === 'table' && element.content) {
markdown += `**Table ${index + 1}**\n\n`;
markdown += `*Table content extraction not yet implemented*\n\n`;
}
});
}
// Show "Other Elements" only for non-chart/image/text/table elements
const processedElements = imageElements.length + chartElements.length + textElements.length + tableElements.length;
const totalElements = layout.elements.length;
const otherElements = totalElements - processedElements;
// Only show other elements if there are some unprocessed ones (like shapes or groups)
if (otherElements > 0) {
// Count different types of other elements
const shapeElements = layout.elements.filter(e => e.type === 'shape').length;
const groupElements = layout.elements.filter(e => e.type === 'group').length;
const unknownElements = otherElements - shapeElements - groupElements;
// Only show "Other Elements" if there are actually non-image/chart elements
if (shapeElements > 0 || groupElements > 0 || unknownElements > 0) {
markdown += '### Other Elements\n\n';
if (shapeElements > 0) {
markdown += `- ${shapeElements} shape(s)\n`;
}
if (groupElements > 0) {
markdown += `- ${groupElements} group(s)\n`;
}
if (unknownElements > 0) {
markdown += `- ${unknownElements} other element(s)\n`;
}
markdown += '\n';
}
}
}
}
else {
// Standard text extraction without visual layouts
for (let i = 0; i < slideFiles.length; i++) {
const slideFile = slideFiles[i];
const slideNumber = i + 1;
markdown += `## Slide ${slideNumber}\n\n`;
const xmlContent = await slideFile.file.async('string');
const slideContent = await extractSlideTextContent(xmlContent);
if (slideContent.trim()) {
markdown += `${slideContent}\n\n`;
}
else {
markdown += '*No content*\n\n';
}
// Debug: Log when visual parser is not available
console.log(`[DEBUG] Slide ${slideNumber} using fallback XML parsing (no visual layouts)`);
// Embed images inline for this slide
if (extractedImages.length > 0) {
const imagesPerSlide = Math.ceil(extractedImages.length / slideFiles.length);
const startIndex = (slideNumber - 1) * imagesPerSlide;
const endIndex = Math.min(startIndex + imagesPerSlide, extractedImages.length);
const fallbackSlideImages = extractedImages.slice(startIndex, endIndex);
console.log(`[DEBUG] Slide ${slideNumber} XML fallback: embedding ${fallbackSlideImages.length} images inline (from index ${startIndex} to ${endIndex})`);
fallbackSlideImages.forEach((image, index) => {
const imageName = path.basename(image.savedPath);
markdown += `\n\n`;
console.log(`[DEBUG] Generated XML fallback inline image reference: `);
});
}
// Add fallback chart processing for when visual parser is not available
if (extractedCharts.length > 0) {
extractedCharts.forEach((chart) => {
const chartMarkdown = chartExtractor.formatChartAsMarkdown(chart.data);
markdown += chartMarkdown;
});
}
}
}
return {
markdown: markdown.trim(),
images: extractedImages,
charts: extractedCharts.map(chart => chart.data),
slideCount: slideFiles.length,
metadata: {
totalSlides: slideFiles.length,
hasImages: extractedImages.length > 0,
hasCharts: extractedCharts.length > 0,
renderMethod: 'text-extraction',
hasVisualLayouts: visualLayouts !== undefined
}
};
}
/**
* Extract PPTX title from document properties
*/
async function extractPptxTitle(buffer) {
try {
const zip = await JSZip.loadAsync(buffer);
const corePropsFile = zip.file('docProps/core.xml');
if (corePropsFile) {
const corePropsContent = await corePropsFile.async('string');
const result = await parseStringPromise(corePropsContent);
// Try to extract title from core properties
const title = result?.['cp:coreProperties']?.[0]?.['dc:title']?.[0];
if (title && typeof title === 'string' && title.trim()) {
return title.trim();
}
}
}
catch {
// Ignore errors and return undefined
}
return undefined;
}
/**
* Extract text content from slide XML
*/
async function extractSlideTextContent(xmlContent) {
try {
const result = await parseStringPromise(xmlContent);
// Simple text extraction function
function extractText(obj) {
let text = '';
if (typeof obj === 'object' && obj !== null) {
if (Array.isArray(obj)) {
for (const item of obj) {
text += extractText(item);
}
}
else {
// Extract text content
if (obj['a:t']) {
if (Array.isArray(obj['a:t'])) {
for (const textItem of obj['a:t']) {
if (typeof textItem === 'string') {
text += `${textItem} `;
}
else if (textItem && typeof textItem === 'object' && '_' in textItem) {
text += `${textItem._} `;
}
}
}
}
// Recursively process nested objects
for (const key in obj) {
if (key !== 'a:t') {
text += extractText(obj[key]);
}
}
}
}
return text;
}
// Extract all text content
const textContent = extractText(result).trim();
return textContent;
}
catch (error) {
const message = error instanceof Error ? error.message : 'Unknown error';
throw new ParseError('PPTX', `Failed to extract text content from slide: ${message}`, error);
}
}
/**
* Extract image references from slide XML when visual parser is not available
*/
async function extractSlideImages(xmlContent, slideNumber, imageExtractor, extractedImages) {
try {
const result = await parseStringPromise(xmlContent);
const imageRefs = [];
console.log(`[DEBUG] extractSlideImages: Processing slide ${slideNumber}`);
console.log(`[DEBUG] extractSlideImages: Found ${extractedImages.length} extracted images total`);
// Extract picture elements from slide XML
const spTree = result?.['p:sld']?.['p:cSld']?.[0]?.['p:spTree']?.[0];
const pics = spTree?.['p:pic'];
console.log(`[DEBUG] extractSlideImages: Found ${pics?.length || 0} picture elements in XML`);
if (pics && Array.isArray(pics)) {
pics.forEach((pic, index) => {
try {
console.log(`[DEBUG] extractSlideImages: Processing picture ${index + 1}`);
// Extract image reference from picture element
const blipFill = pic?.['p:blipFill']?.[0];
const blip = blipFill?.['a:blip']?.[0];
const rEmbed = blip?.$?.['r:embed'];
console.log(`[DEBUG] extractSlideImages: rEmbed = ${rEmbed}`);
if (rEmbed) {
// Try to get markdown reference using the image extractor
// Note: Without visual parser, we don't have the actual image path, so we use a generic approach
const matchingImage = extractedImages.find(img => img.originalPath.includes(rEmbed) ||
img.originalPath.includes(`slide${slideNumber}`));
console.log(`[DEBUG] extractSlideImages: Matching image found: ${matchingImage ? 'yes' : 'no'}`);
if (matchingImage) {
const imageName = path.basename(matchingImage.savedPath);
const imageRef = ``;
imageRefs.push(imageRef);
console.log(`[DEBUG] extractSlideImages: Generated image reference: ${imageRef}`);
}
else if (extractedImages.length > 0) {
// Try to use any available image
const availableImage = extractedImages[Math.min(index, extractedImages.length - 1)];
const imageName = path.basename(availableImage.savedPath);
const imageRef = ``;
imageRefs.push(imageRef);
console.log(`[DEBUG] extractSlideImages: Generated image reference using available image: ${imageRef}`);
}
else {
// Fallback: use sequential image naming
const imageRef = ``;
imageRefs.push(imageRef);
console.log(`[DEBUG] extractSlideImages: Generated fallback image reference: ${imageRef}`);
}
}
}
catch (error) {
console.warn(`Error extracting image ${index} from slide ${slideNumber}:`, error);
}
});
}
console.log(`[DEBUG] extractSlideImages: Returning ${imageRefs.length} image references`);
return imageRefs;
}
catch (error) {
console.warn(`Error extracting images from slide ${slideNumber}:`, error);
return [];
}
}
//# sourceMappingURL=pptx-parser.js.map