UNPKG

file2md

Version:

A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX, HWP, HWPX) into Markdown with image and layout preservation

647 lines 22.3 kB
import JSZip from 'jszip';
import { parseStringPromise } from 'xml2js';
import { ParseError } from '../types/errors.js';

// Fallback slide size in EMUs (English Metric Units): the standard 16:9 slide.
const DEFAULT_SLIDE_WIDTH_EMU = 12192000;
const DEFAULT_SLIDE_HEIGHT_EMU = 6858000;

/**
 * Extracts per-slide visual information (text, shapes, groups, pictures and
 * graphic frames, with position and size) from a PPTX package.
 *
 * The instance keeps per-document state (`zip`, `relationships`, `themes`, ...)
 * that is reset at the start of every `parseVisualElements` call.
 */
export class PptxVisualParser {
    zip = null;
    slideCount = 0;
    relationships = new Map();
    themes = new Map();
    // Slide size read from presentation.xml, or null when it is absent/invalid.
    slideSize = null;
    // Maps the ordinal slide number to the key under which that slide's
    // relationships were stored (derived from the slide's file name).
    slideRelationshipKeys = new Map();

    /**
     * Parse PPTX buffer to extract comprehensive visual information.
     *
     * A slide that fails to parse is replaced by an empty placeholder slide so
     * that one bad slide does not abort the whole document.
     *
     * @param {*} pptxBuffer - PPTX data in any form accepted by `JSZip.loadAsync`.
     * @returns {Promise<Array<object>>} One entry per slide, in presentation order.
     * @throws {ParseError} When the package or presentation.xml cannot be read.
     */
    async parseVisualElements(pptxBuffer) {
        try {
            // Drop state left over from a previously parsed document.
            this.relationships.clear();
            this.themes.clear();
            this.slideRelationshipKeys.clear();
            this.slideSize = null;

            // Load PPTX as ZIP
            this.zip = await JSZip.loadAsync(pptxBuffer);

            // Parse presentation structure
            const presentationXml = await this.getXmlContent('ppt/presentation.xml');
            const presentation = await parseStringPromise(presentationXml);
            this.slideSize = this.readPresentationSlideSize(presentation);

            // Extract slide references
            const slideIds = this.extractSlideReferences(presentation);
            this.slideCount = slideIds.length;
            console.log(`Found ${this.slideCount} slides to parse`);

            // Relationships and themes must be fully loaded before slides are
            // parsed, because picture parsing looks relationships up.
            await this.loadRelationships();
            await this.loadThemes();

            // Parse each slide
            const slides = [];
            for (const [i, slideId] of slideIds.entries()) {
                try {
                    const slide = await this.parseSlide(slideId.id, slideId.rId, i + 1);
                    slides.push(slide);
                    console.log(`Parsed slide ${i + 1}: ${slide.elements.length} elements`);
                } catch (slideError) {
                    console.warn(`Failed to parse slide ${i + 1}:`, slideError);
                    // Create a placeholder slide
                    slides.push(this.createPlaceholderSlide(slideId.id, i + 1));
                }
            }
            return slides;
        } catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error';
            throw new ParseError('PptxVisualParser', `Visual parsing failed: ${message}`, error);
        }
    }

    /**
     * Read the slide size declared by `p:sldSz` in presentation.xml.
     *
     * @param {object} presentation - Parsed presentation.xml document.
     * @returns {{width: number, height: number, units: string} | null}
     *   The declared size, or null when it is missing or not a positive number.
     */
    readPresentationSlideSize(presentation) {
        const attrs = presentation?.['p:presentation']?.['p:sldSz']?.[0]?.$;
        const width = Number.parseInt(attrs?.cx ?? '', 10);
        const height = Number.parseInt(attrs?.cy ?? '', 10);
        if (Number.isFinite(width) && Number.isFinite(height) && width > 0 && height > 0) {
            return { width, height, units: 'EMU' };
        }
        return null;
    }

    /**
     * Extract slide references from presentation.xml.
     *
     * @param {object} presentation - Parsed presentation.xml document.
     * @returns {Array<{id: string, rId: string}>} Entries of `p:sldIdLst` that
     *   carry both an `id` and an `r:id` attribute, in document order.
     */
    extractSlideReferences(presentation) {
        const slideIds = [];
        try {
            const slideIdList = presentation?.['p:presentation']?.['p:sldIdLst']?.[0]?.['p:sldId'];
            if (slideIdList && Array.isArray(slideIdList)) {
                for (const slide of slideIdList) {
                    const id = slide.$?.id;
                    const rId = slide.$?.['r:id'];
                    if (id && rId) {
                        slideIds.push({ id, rId });
                    }
                }
            }
        } catch (error) {
            console.warn('Error extracting slide references:', error);
        }
        return slideIds;
    }

    /**
     * Load relationships from _rels files.
     *
     * Resolves only after every slide relationship file has been read and
     * stored. Failures are logged and never thrown.
     *
     * @returns {Promise<void>}
     */
    async loadRelationships() {
        try {
            // Load presentation relationships
            const presRels = await this.getXmlContent('ppt/_rels/presentation.xml.rels');
            if (presRels) {
                const relsDoc = await parseStringPromise(presRels);
                this.relationships.set('presentation', relsDoc);
            }

            // Load slide relationships. The reads are collected and awaited so
            // the map is complete when this method returns.
            const pending = [];
            this.zip?.forEach((relativePath, file) => {
                if (relativePath.includes('ppt/slides/_rels/') && relativePath.endsWith('.rels')) {
                    pending.push(this.loadSlideRelationshipFile(relativePath, file));
                }
            });
            await Promise.all(pending);
        } catch (error) {
            console.warn('Error loading relationships:', error);
        }
    }

    /**
     * Read one slide relationship file and store it under `slide<N>`, where N
     * is the number in the file name. Errors are logged, not thrown.
     *
     * @param {string} relativePath - Path of the entry inside the archive.
     * @param {object} file - Archive entry exposing `async('string')`.
     * @returns {Promise<void>}
     */
    async loadSlideRelationshipFile(relativePath, file) {
        try {
            const content = await file.async('string');
            const relsDoc = await parseStringPromise(content);
            const slideId = relativePath.match(/slide(\d+)\.xml\.rels/)?.[1];
            if (slideId) {
                this.relationships.set(`slide${slideId}`, relsDoc);
            }
        } catch (error) {
            console.warn(`Failed to parse relationships for ${relativePath}:`, error);
        }
    }

    /**
     * Load theme information.
     *
     * Resolves only after every theme file has been read and stored. Failures
     * are logged and never thrown.
     *
     * @returns {Promise<void>}
     */
    async loadThemes() {
        try {
            const pending = [];
            this.zip?.forEach((relativePath, file) => {
                if (relativePath.includes('ppt/theme/') && relativePath.endsWith('.xml')) {
                    pending.push(this.loadThemeFile(relativePath, file));
                }
            });
            await Promise.all(pending);
        } catch (error) {
            console.warn('Error loading themes:', error);
        }
    }

    /**
     * Read one theme file and store it under the number found in its file
     * name ('1' when the name carries no number). Errors are logged, not thrown.
     *
     * @param {string} relativePath - Path of the entry inside the archive.
     * @param {object} file - Archive entry exposing `async('string')`.
     * @returns {Promise<void>}
     */
    async loadThemeFile(relativePath, file) {
        try {
            const content = await file.async('string');
            const themeDoc = await parseStringPromise(content);
            const themeId = relativePath.match(/theme(\d+)\.xml/)?.[1] || '1';
            this.themes.set(themeId, themeDoc);
        } catch (error) {
            console.warn(`Failed to parse theme ${relativePath}:`, error);
        }
    }

    /**
     * Parse individual slide.
     *
     * @param {string} _slideId - Slide id from presentation.xml; echoed in the result.
     * @param {string} rId - Relationship id used to locate the slide file.
     * @param {number} slideNumber - 1-based position of the slide in the deck.
     * @returns {Promise<object>} Slide record with title, background, elements and dimensions.
     * @throws {Error} When the slide file is missing or its XML cannot be parsed.
     */
    async parseSlide(_slideId, rId, slideNumber) {
        // Get slide path from relationships
        const slideRel = this.findRelationshipTarget('presentation', rId);
        const slidePath = slideRel
            ? this.resolvePresentationTarget(slideRel.target)
            : `ppt/slides/slide${slideNumber}.xml`;

        // Slide relationships are stored by file number, which need not equal
        // the slide's position in the deck; remember the mapping for lookups.
        const fileNumber = slidePath.match(/slide(\d+)\.xml$/)?.[1];
        this.slideRelationshipKeys.set(slideNumber, `slide${fileNumber ?? slideNumber}`);

        // Load slide XML
        const slideXml = await this.getXmlContent(slidePath);
        const slideDoc = await parseStringPromise(slideXml);

        const dimensions = this.extractSlideDimensions();
        const background = this.extractSlideBackground(slideDoc);
        const title = this.extractSlideTitle(slideDoc);
        const elements = await this.parseSlideElements(slideDoc, slideNumber);

        return {
            slideId: _slideId,
            slideNumber,
            title,
            background,
            elements,
            dimensions
        };
    }

    /**
     * Turn a relationship target from presentation.xml.rels into an archive path.
     *
     * @param {string} target - `Target` attribute of the relationship.
     * @returns {string} Targets with a leading '/' are taken as package-absolute;
     *   all others are resolved against the `ppt/` folder.
     */
    resolvePresentationTarget(target) {
        const text = String(target);
        return text.startsWith('/') ? text.slice(1) : `ppt/${text}`;
    }

    /**
     * Return the key under which relationships for the given slide are stored.
     *
     * @param {number} slideNumber - 1-based position of the slide in the deck.
     * @returns {string} The key recorded by `parseSlide`, or `slide<slideNumber>`
     *   when none was recorded.
     */
    relationshipKeyForSlide(slideNumber) {
        return this.slideRelationshipKeys.get(slideNumber) ?? `slide${slideNumber}`;
    }

    /**
     * Extract slide dimensions.
     *
     * @returns {{width: number, height: number, units: string}} The size read
     *   from presentation.xml when available, otherwise the standard 16:9
     *   default of 12192000 x 6858000 EMUs.
     */
    extractSlideDimensions() {
        if (this.slideSize) {
            return { ...this.slideSize };
        }
        return {
            width: DEFAULT_SLIDE_WIDTH_EMU,
            height: DEFAULT_SLIDE_HEIGHT_EMU,
            units: 'EMU'
        };
    }

    /**
     * Extract slide background information.
     *
     * Simplified: whenever the slide declares a `p:bg` element, a solid white
     * background is reported regardless of the element's content.
     *
     * @param {object} slideDoc - Parsed slide XML document.
     * @returns {{type: string, color: string} | undefined}
     */
    extractSlideBackground(slideDoc) {
        try {
            const bg = slideDoc?.['p:sld']?.['p:cSld']?.[0]?.['p:bg'];
            if (bg) {
                return {
                    type: 'solid',
                    color: '#FFFFFF'
                };
            }
        } catch {
            // No background or parsing failed
        }
        return undefined;
    }

    /**
     * Extract slide title.
     *
     * @param {object} slideDoc - Parsed slide XML document.
     * @returns {string | undefined} Text of the first shape whose placeholder
     *   type is `title` or `ctrTitle` and that has a text body.
     */
    extractSlideTitle(slideDoc) {
        try {
            const shapes = slideDoc?.['p:sld']?.['p:cSld']?.[0]?.['p:spTree']?.[0]?.['p:sp'];
            if (shapes && Array.isArray(shapes)) {
                for (const shape of shapes) {
                    const nvSpPr = shape?.['p:nvSpPr']?.[0];
                    const ph = nvSpPr?.['p:nvPr']?.[0]?.['p:ph']?.[0];
                    const phType = ph?.$?.type;
                    // Title slides use the centered-title placeholder instead of 'title'.
                    if (phType === 'title' || phType === 'ctrTitle') {
                        const textBody = shape?.['p:txBody']?.[0];
                        if (textBody) {
                            return this.extractTextFromBody(textBody);
                        }
                    }
                }
            }
        } catch {
            // Title extraction failed
        }
        return undefined;
    }

    /**
     * Parse all visual elements in a slide.
     *
     * Elements are returned grouped by kind (shapes, groups, pictures, graphic
     * frames), not in their original z-order.
     *
     * @param {object} slideDoc - Parsed slide XML document.
     * @param {number} slideNumber - 1-based position of the slide in the deck.
     * @returns {Promise<Array<object>>} Parsed elements; empty on failure.
     */
    async parseSlideElements(slideDoc, slideNumber) {
        const elements = [];
        try {
            const spTree = slideDoc?.['p:sld']?.['p:cSld']?.[0]?.['p:spTree']?.[0];
            if (!spTree) {
                return elements;
            }

            // Parse shapes
            const shapes = spTree['p:sp'];
            if (shapes && Array.isArray(shapes)) {
                for (const shape of shapes) {
                    const element = await this.parseShape(shape, slideNumber);
                    if (element) {
                        elements.push(element);
                    }
                }
            }

            // Parse groups
            const groups = spTree['p:grpSp'];
            if (groups && Array.isArray(groups)) {
                for (const group of groups) {
                    const element = await this.parseGroup(group, slideNumber);
                    if (element) {
                        elements.push(element);
                    }
                }
            }

            // Parse pictures
            const pics = spTree['p:pic'];
            if (pics && Array.isArray(pics)) {
                for (const pic of pics) {
                    const element = await this.parsePicture(pic, slideNumber);
                    if (element) {
                        elements.push(element);
                    }
                }
            }

            // Parse charts. Every p:graphicFrame is reported as a chart; its
            // content is not inspected.
            const charts = spTree['p:graphicFrame'];
            if (charts && Array.isArray(charts)) {
                for (const chart of charts) {
                    const element = await this.parseChart(chart);
                    if (element) {
                        elements.push(element);
                    }
                }
            }
        } catch (error) {
            console.warn(`Error parsing slide ${slideNumber} elements:`, error);
        }
        return elements;
    }

    /**
     * Parse a shape element.
     *
     * @param {object} shape - Parsed `p:sp` node.
     * @param {number} _slideNumber - Unused.
     * @returns {Promise<object | null>} A 'text' element when the shape has a
     *   text body, otherwise a 'shape' element; null when parsing fails.
     */
    async parseShape(shape, _slideNumber) {
        try {
            const cNvPr = shape?.['p:nvSpPr']?.[0]?.['p:cNvPr']?.[0];
            const id = cNvPr?.$?.id || 'unknown';
            const name = cNvPr?.$?.name || 'shape';

            // Extract position and size
            const spPr = shape?.['p:spPr']?.[0];
            const xfrm = spPr?.['a:xfrm']?.[0];
            const position = this.extractPosition(xfrm);
            const size = this.extractSize(xfrm);

            // Check if it has text content
            const textBody = shape?.['p:txBody']?.[0];
            if (textBody) {
                const text = this.extractTextFromBody(textBody);
                const paragraphs = this.extractParagraphsFromBody(textBody);
                return {
                    id: `${id}_${name}`,
                    type: 'text',
                    position,
                    size,
                    content: {
                        text,
                        paragraphs
                    },
                    style: this.extractElementStyle(spPr, textBody)
                };
            }
            return {
                id: `${id}_${name}`,
                type: 'shape',
                position,
                size,
                content: {
                    shapeType: this.extractShapeType(spPr),
                    geometry: this.extractShapeGeometry(spPr)
                },
                style: this.extractElementStyle(spPr)
            };
        } catch (error) {
            console.warn('Error parsing shape:', error);
            return null;
        }
    }

    /**
     * Parse a group element.
     *
     * Only the group's direct `p:sp` children are collected; pictures and
     * nested groups inside the group are not.
     *
     * @param {object} group - Parsed `p:grpSp` node.
     * @param {number} slideNumber - 1-based position of the slide in the deck.
     * @returns {Promise<object | null>} A 'group' element, or null when parsing fails.
     */
    async parseGroup(group, slideNumber) {
        try {
            const cNvPr = group?.['p:nvGrpSpPr']?.[0]?.['p:cNvPr']?.[0];
            const id = cNvPr?.$?.id || 'unknown';
            const name = cNvPr?.$?.name || 'group';

            // Extract position and size
            const grpSpPr = group?.['p:grpSpPr']?.[0];
            const xfrm = grpSpPr?.['a:xfrm']?.[0];
            const position = this.extractPosition(xfrm);
            const size = this.extractSize(xfrm);

            // Parse child shapes
            const children = [];
            const shapes = group['p:sp'];
            if (shapes && Array.isArray(shapes)) {
                for (const shape of shapes) {
                    const child = await this.parseShape(shape, slideNumber);
                    if (child) {
                        children.push(child);
                    }
                }
            }
            return {
                id: `${id}_${name}`,
                type: 'group',
                position,
                size,
                content: {},
                children
            };
        } catch (error) {
            console.warn('Error parsing group:', error);
            return null;
        }
    }

    /**
     * Parse a picture element.
     *
     * @param {object} pic - Parsed `p:pic` node.
     * @param {number} slideNumber - 1-based position of the slide in the deck;
     *   used to find the slide's relationships.
     * @returns {Promise<object | null>} An 'image' element whose `imagePath` is
     *   the raw relationship target ('' when unresolved); null when parsing fails.
     */
    async parsePicture(pic, slideNumber) {
        try {
            const cNvPr = pic?.['p:nvPicPr']?.[0]?.['p:cNvPr']?.[0];
            const id = cNvPr?.$?.id || 'unknown';
            const name = cNvPr?.$?.name || 'image';

            // Extract position and size
            const spPr = pic?.['p:spPr']?.[0];
            const xfrm = spPr?.['a:xfrm']?.[0];
            const position = this.extractPosition(xfrm);
            const size = this.extractSize(xfrm);

            // Extract image reference
            const blip = pic?.['p:blipFill']?.[0]?.['a:blip']?.[0];
            const rEmbed = blip?.$?.['r:embed'];
            let imagePath = '';
            if (rEmbed) {
                const rel = this.findRelationshipTarget(this.relationshipKeyForSlide(slideNumber), rEmbed);
                if (rel) {
                    imagePath = rel.target;
                }
            }
            return {
                id: `${id}_${name}`,
                type: 'image',
                position,
                size,
                content: {
                    imagePath,
                    originalSize: size,
                    // NaN or Infinity when the picture has no height.
                    aspectRatio: size.width / size.height
                },
                style: this.extractElementStyle(spPr)
            };
        } catch (error) {
            console.warn('Error parsing picture:', error);
            return null;
        }
    }

    /**
     * Parse a chart element.
     *
     * Only identity, position and size are read; chart type, data and series
     * are placeholders.
     *
     * @param {object} chart - Parsed `p:graphicFrame` node.
     * @returns {Promise<object | null>} A 'chart' element, or null when parsing fails.
     */
    async parseChart(chart) {
        try {
            const cNvPr = chart?.['p:nvGraphicFramePr']?.[0]?.['p:cNvPr']?.[0];
            const id = cNvPr?.$?.id || 'unknown';
            const name = cNvPr?.$?.name || 'chart';

            // Extract position and size
            const xfrm = chart?.['p:xfrm']?.[0];
            const position = this.extractPosition(xfrm);
            const size = this.extractSize(xfrm);
            return {
                id: `${id}_${name}`,
                type: 'chart',
                position,
                size,
                content: {
                    chartType: 'unknown', // Would need deeper parsing
                    data: null,
                    series: []
                }
            };
        } catch (error) {
            console.warn('Error parsing chart:', error);
            return null;
        }
    }

    // Helper methods

    /**
     * Read the offset of a transform node.
     *
     * @param {object} [xfrm] - Parsed transform node containing `a:off`.
     * @returns {{x: number, y: number, z: number}} Offset; missing values are 0
     *   and `z` is always 0.
     */
    extractPosition(xfrm) {
        try {
            const off = xfrm?.['a:off']?.[0];
            return {
                x: parseInt(off?.$?.x || '0', 10),
                y: parseInt(off?.$?.y || '0', 10),
                z: 0
            };
        } catch {
            return { x: 0, y: 0, z: 0 };
        }
    }

    /**
     * Read the extent of a transform node.
     *
     * @param {object} [xfrm] - Parsed transform node containing `a:ext`.
     * @returns {{width: number, height: number}} Extent; missing values are 0.
     */
    extractSize(xfrm) {
        try {
            const ext = xfrm?.['a:ext']?.[0];
            return {
                width: parseInt(ext?.$?.cx || '0', 10),
                height: parseInt(ext?.$?.cy || '0', 10)
            };
        } catch {
            return { width: 0, height: 0 };
        }
    }

    /**
     * Build a style record for an element.
     *
     * Simplified: any `a:solidFill` is reported as '#000000'; font information
     * comes only from the text body's level-1 list style defaults.
     *
     * @param {object} [spPr] - Parsed shape properties node.
     * @param {object} [textBody] - Parsed `p:txBody` node, when the element has text.
     * @returns {object} Style with optional `fill` and `font`.
     */
    extractElementStyle(spPr, textBody) {
        const style = {};
        try {
            // Extract fill color
            const solidFill = spPr?.['a:solidFill']?.[0];
            if (solidFill) {
                style.fill = '#000000';
            }

            // Extract font information from text body
            if (textBody) {
                const defRPr = textBody?.['a:lstStyle']?.[0]?.['a:lvl1pPr']?.[0]?.['a:defRPr']?.[0];
                if (defRPr) {
                    style.font = this.extractFontInfo(defRPr);
                }
            }
        } catch {
            // Style extraction failed
        }
        return style;
    }

    /**
     * Read font information from run properties.
     *
     * Never throws: a missing node, or a node without attributes, yields the
     * defaults (Arial, 18pt, not bold/italic/underlined).
     *
     * @param {object} [rPr] - Parsed run-properties node.
     * @returns {{family: string, size: number, color: string, bold: boolean,
     *   italic: boolean, underline: boolean}}
     */
    extractFontInfo(rPr) {
        // `$` is absent when the element has no attributes.
        const attrs = rPr?.$ ?? {};
        // Boolean attributes may be written as '1' or 'true'.
        const isOn = (value) => value === '1' || value === 'true';
        return {
            family: rPr?.['a:latin']?.[0]?.$?.typeface || 'Arial',
            size: parseInt(attrs.sz || '1800', 10) / 100, // Convert from hundredths of points
            color: '#000000', // Simplified
            bold: isOn(attrs.b),
            italic: isOn(attrs.i),
            // u="none" explicitly switches underlining off.
            underline: attrs.u !== undefined && attrs.u !== 'none'
        };
    }

    /**
     * Read the text of a single run.
     *
     * @param {object} run - Parsed `a:r` node.
     * @returns {string} The run's text, or '' when it has none. An `a:t` node
     *   that carries attributes is parsed as an object whose text is under `_`.
     */
    extractRunText(run) {
        const node = run?.['a:t']?.[0];
        if (typeof node === 'string') {
            return node;
        }
        if (node && typeof node._ === 'string') {
            return node._;
        }
        return '';
    }

    /**
     * Join the text of all non-empty paragraphs of a text body with newlines.
     *
     * @param {object} textBody - Parsed `p:txBody` node.
     * @returns {string} Combined text; '' when there is none or extraction fails.
     */
    extractTextFromBody(textBody) {
        try {
            const paragraphs = textBody?.['a:p'];
            if (paragraphs && Array.isArray(paragraphs)) {
                return paragraphs
                    .map((p) => this.extractTextFromParagraph(p))
                    .filter((text) => text.length > 0)
                    .join('\n');
            }
        } catch {
            // Text extraction failed
        }
        return '';
    }

    /**
     * Build a record for every non-empty paragraph of a text body.
     *
     * @param {object} textBody - Parsed `p:txBody` node.
     * @returns {Array<{text: string, runs: Array<object>, alignment: string,
     *   level: number}>}
     */
    extractParagraphsFromBody(textBody) {
        const paragraphs = [];
        try {
            const pArray = textBody?.['a:p'];
            if (pArray && Array.isArray(pArray)) {
                for (const p of pArray) {
                    const text = this.extractTextFromParagraph(p);
                    if (text) {
                        paragraphs.push({
                            text,
                            runs: this.extractTextRuns(p),
                            alignment: this.extractAlignment(p),
                            level: parseInt(p?.['a:pPr']?.[0]?.$?.lvl || '0', 10)
                        });
                    }
                }
            }
        } catch {
            // Paragraph extraction failed
        }
        return paragraphs;
    }

    /**
     * Concatenate the text of all `a:r` runs of a paragraph.
     *
     * @param {object} p - Parsed `a:p` node.
     * @returns {string} Paragraph text; '' when there are no runs or extraction fails.
     */
    extractTextFromParagraph(p) {
        try {
            const runs = p?.['a:r'] || [];
            return runs.map((run) => this.extractRunText(run)).join('');
        } catch {
            return '';
        }
    }

    /**
     * Build a record (text plus font) for every non-empty run of a paragraph.
     *
     * @param {object} p - Parsed `a:p` node.
     * @returns {Array<{text: string, font: object}>}
     */
    extractTextRuns(p) {
        const runs = [];
        try {
            const runArray = p?.['a:r'];
            if (runArray && Array.isArray(runArray)) {
                for (const run of runArray) {
                    const text = this.extractRunText(run);
                    if (text) {
                        runs.push({
                            text,
                            font: this.extractFontInfo(run?.['a:rPr']?.[0])
                        });
                    }
                }
            }
        } catch {
            // Run extraction failed
        }
        return runs;
    }

    /**
     * Map a paragraph's `algn` attribute to an alignment name.
     *
     * @param {object} p - Parsed `a:p` node.
     * @returns {string} 'center', 'right' or 'justify'; 'left' for anything else.
     */
    extractAlignment(p) {
        try {
            const algn = p?.['a:pPr']?.[0]?.$?.algn;
            switch (algn) {
                case 'ctr': return 'center';
                case 'r': return 'right';
                case 'just': return 'justify';
                default: return 'left';
            }
        } catch {
            return 'left';
        }
    }

    /**
     * Read the preset geometry name of a shape.
     *
     * @param {object} [spPr] - Parsed shape properties node.
     * @returns {string} The `prst` value, or 'unknown'.
     */
    extractShapeType(spPr) {
        try {
            const prstGeom = spPr?.['a:prstGeom']?.[0];
            return prstGeom?.$?.prst || 'unknown';
        } catch {
            return 'unknown';
        }
    }

    /**
     * Read the geometry of a shape.
     *
     * @param {object} [spPr] - Parsed shape properties node.
     * @returns {object | null} The raw `a:custGeom` node when present, otherwise
     *   `{ preset }` for a preset geometry, otherwise null.
     */
    extractShapeGeometry(spPr) {
        try {
            const custGeom = spPr?.['a:custGeom']?.[0];
            if (custGeom) {
                return custGeom;
            }
            const prstGeom = spPr?.['a:prstGeom']?.[0];
            if (prstGeom) {
                return { preset: prstGeom.$.prst };
            }
        } catch {
            // Geometry extraction failed
        }
        return null;
    }

    /**
     * Look up a relationship by id in a previously loaded relationships document.
     *
     * @param {string} sourceId - Key in `relationships` ('presentation' or `slide<N>`).
     * @param {string} relationshipId - Relationship `Id` to find.
     * @returns {{target: string, type: string} | null} The match, or null.
     */
    findRelationshipTarget(sourceId, relationshipId) {
        try {
            const rels = this.relationships.get(sourceId);
            if (rels?.Relationships?.Relationship) {
                const relationships = Array.isArray(rels.Relationships.Relationship)
                    ? rels.Relationships.Relationship
                    : [rels.Relationships.Relationship];
                for (const rel of relationships) {
                    // Skip entries without attributes instead of aborting the search.
                    if (rel?.$?.Id === relationshipId) {
                        return {
                            target: rel.$.Target,
                            type: rel.$.Type
                        };
                    }
                }
            }
        } catch {
            // Relationship lookup failed
        }
        return null;
    }

    /**
     * Build an empty stand-in for a slide that could not be parsed.
     *
     * @param {string} slideId - Slide id from presentation.xml.
     * @param {number} slideNumber - 1-based position of the slide in the deck.
     * @returns {object} Slide record titled `Slide <n>` with no elements.
     */
    createPlaceholderSlide(slideId, slideNumber) {
        return {
            slideId,
            slideNumber,
            title: `Slide ${slideNumber}`,
            elements: [],
            dimensions: this.extractSlideDimensions()
        };
    }

    /**
     * Read an archive entry as a string.
     *
     * @param {string} path - Path of the entry inside the archive.
     * @returns {Promise<string>} The entry's content.
     * @throws {Error} When no archive is loaded or the entry does not exist.
     */
    async getXmlContent(path) {
        const file = this.zip?.file(path);
        if (!file) {
            throw new Error(`File not found: ${path}`);
        }
        return await file.async('string');
    }
}
//# sourceMappingURL=pptx-visual-parser.js.map