UNPKG

file2md

Version:

A TypeScript library for converting various document types (PDF, DOCX, XLSX, PPTX, HWP, HWPX) into Markdown with image and layout preservation

258 lines 10.3 kB
import { parseStringPromise } from 'xml2js';
import { ChartExtractionError } from '../types/errors.js';

/**
 * Matches the chart style / colour parts that Office writes into the same
 * `charts/` folder as the chart parts. They are XML but hold no chart data.
 */
const AUXILIARY_CHART_PART = /\/(?:style|colors)\d*\.xml$/i;

/**
 * Largest `idx` attribute that is honoured when placing a cached point.
 * The XML is untrusted input; a huge index would otherwise make us allocate
 * a huge array. Larger indices fall back to document order.
 */
const MAX_POINT_INDEX = 1000000;

/**
 * Plot-area elements understood by the extractor, paired with the chart type
 * they are reported as. The first four entries keep the lookup priority of the
 * original implementation; the 3D / doughnut variants share the same series
 * layout and are mapped onto the existing type names.
 */
const PLOT_ELEMENTS = [
    ['c:barChart', 'bar'],
    ['c:lineChart', 'line'],
    ['c:pieChart', 'pie'],
    ['c:scatterChart', 'scatter'],
    ['c:bar3DChart', 'bar'],
    ['c:line3DChart', 'line'],
    ['c:pie3DChart', 'pie'],
    ['c:doughnutChart', 'pie'],
];

/** Wrap a value in an array unless it already is one; nullish becomes `[]`. */
const asArray = (value) => {
    if (value == null) {
        return [];
    }
    return Array.isArray(value) ? value : [value];
};

/** First element of an xml2js child list (or the value itself if not a list). */
const first = (value) => asArray(value)[0];

/**
 * Text content of a parsed node. xml2js yields a plain string for text-only
 * elements and an object with the text under `_` when attributes are present.
 */
const textOf = (node) => {
    if (node == null) {
        return '';
    }
    if (typeof node === 'object') {
        return typeof node._ === 'string' ? node._ : '';
    }
    return String(node);
};

/** Locate the point cache of a data source (`c:cat`, `c:val`, `c:xVal`, `c:yVal`). */
const findPointCache = (source) =>
    first(first(source?.['c:strRef'])?.['c:strCache']) ??
    first(first(source?.['c:numRef'])?.['c:numCache']) ??
    first(source?.['c:strLit']) ??
    first(source?.['c:numLit']);

/**
 * Read the `c:pt` entries of a cache into a dense array.
 * Points are placed at their `idx` attribute when it is present and sane, so
 * that caches which omit empty cells stay aligned across series; otherwise
 * document order is used. Gaps are filled with `filler`.
 */
const readPoints = (cache, convert, filler) => {
    const slots = [];
    asArray(cache?.['c:pt']).forEach((point, position) => {
        const idx = Number.parseInt(point?.$?.idx, 10);
        const usable = Number.isInteger(idx) && idx >= 0 && idx <= MAX_POINT_INDEX;
        slots[usable ? idx : position] = convert(textOf(first(point?.['c:v'])));
    });
    return Array.from(slots, (value) => value ?? filler);
};

/** Unparseable numbers are reported as 0, as in the original implementation. */
const toNumber = (text) => Number.parseFloat(text) || 0;

const identity = (text) => text;

/** Make a value safe for use inside a Markdown table cell. */
const escapeTableCell = (value) => String(value).replace(/\r?\n/g, ' ').replace(/\|/g, '\\|');

/** Render one Markdown table row from a list of cell values. */
const tableRow = (cells) => `| ${cells.map(escapeTableCell).join(' | ')} |\n`;

/**
 * Extracts chart data from the chart parts of an Office archive and renders it
 * as Markdown.
 *
 * The XML is parsed with xml2js default options: the root element is a plain
 * object, while every child element is an array of occurrences.
 * Element names are matched with the conventional `c:` / `a:` prefixes only.
 */
export class ChartExtractor {
    /** Stored for the owner of this instance; not used by any method in this class. */
    imageExtractor;
    /** Number of charts formatted since construction or the last `reset()`. */
    chartCounter = 0;

    /**
     * @param {*} imageExtractor - Collaborator kept on the instance as-is.
     */
    constructor(imageExtractor) {
        this.imageExtractor = imageExtractor;
    }

    /**
     * Extract charts from a ZIP archive (DOCX, XLSX, PPTX)
     *
     * Every `*.xml` entry below a `/charts/` folder is parsed, except the chart
     * style and colour parts, which would otherwise show up as empty "unknown"
     * charts. A chart that fails to parse is logged and skipped.
     *
     * @param {*} zip - Archive exposing `forEach((relativePath, file) => ...)`;
     *   presumably a JSZip instance (confirm against the caller).
     * @param {string} [basePath=''] - Copied verbatim onto every result.
     * @returns {Promise<Array<{originalPath: string, data: object, basePath: string}>>}
     */
    async extractChartsFromZip(zip, basePath = '') {
        const charts = [];
        zip.forEach((relativePath, file) => {
            // Look for chart files
            const isChartPart = relativePath.includes('/charts/') &&
                relativePath.endsWith('.xml') &&
                !AUXILIARY_CHART_PART.test(relativePath);
            if (isChartPart) {
                charts.push({ path: relativePath, file, basePath });
            }
        });
        const extractedCharts = [];
        // Parsed one at a time so results and warnings keep archive order.
        for (const chart of charts) {
            try {
                const chartData = await this.parseChart(chart.file);
                if (chartData) {
                    extractedCharts.push({
                        originalPath: chart.path,
                        data: chartData,
                        basePath
                    });
                }
            }
            catch (error) {
                console.warn(`Failed to extract chart ${chart.path}:`, error instanceof Error ? error.message : 'Unknown error');
            }
        }
        return extractedCharts;
    }

    /**
     * Parse a chart XML file
     *
     * @param {*} chartFile - Archive entry exposing `async('string')`.
     * @returns {Promise<{type: string, title: string, series: Array, categories: Array}>}
     *   `type` stays `'unknown'` and the lists stay empty when the document has
     *   no recognised chart element.
     * @throws {ChartExtractionError} When reading or parsing the XML fails.
     */
    async parseChart(chartFile) {
        try {
            const xmlContent = await chartFile.async('string');
            const result = await parseStringPromise(xmlContent);
            const chartData = {
                type: 'unknown',
                title: '',
                series: [],
                categories: []
            };
            // The root element is NOT wrapped in an array by xml2js; `first`
            // accepts both shapes so a differently configured parser also works.
            const chartSpace = first(result?.['c:chartSpace']);
            const chart = first(chartSpace?.['c:chart']);
            if (!chart) {
                return chartData;
            }
            // Extract title
            const titleSource = first(first(chart['c:title'])?.['c:tx']);
            if (titleSource !== undefined) {
                chartData.title = this.extractTextFromTitle(titleSource);
            }
            // Extract plot area: the first recognised chart element wins
            const plotArea = first(chart['c:plotArea']);
            const extractorsByType = {
                bar: (node) => this.extractBarChartData(node),
                line: (node) => this.extractLineChartData(node),
                pie: (node) => this.extractPieChartData(node),
                scatter: (node) => this.extractScatterChartData(node),
            };
            for (const [element, type] of PLOT_ELEMENTS) {
                if (!plotArea?.[element]) {
                    continue;
                }
                const { series, categories } = extractorsByType[type](first(plotArea[element]));
                chartData.type = type;
                chartData.series = series;
                chartData.categories = categories;
                break;
            }
            return chartData;
        }
        catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error';
            throw new ChartExtractionError(`Failed to parse chart: ${message}`, error);
        }
    }

    /**
     * Collect the visible text of a title's `c:tx` node.
     *
     * All runs of a paragraph are concatenated (a title split into several
     * runs, e.g. by formatting, is no longer truncated to its first run) and
     * paragraphs are joined with a single space. When there is no rich text the
     * cached value of a cell reference (`c:strRef`) is used instead.
     *
     * @param {*} titleData - Parsed `c:tx` element.
     * @returns {string} Trimmed title text, or `''` when none is found.
     */
    extractTextFromTitle(titleData) {
        const richText = first(titleData?.['c:rich']);
        const paragraphs = asArray(richText?.['a:p'])
            .map((paragraph) => asArray(paragraph?.['a:r'])
                .map((run) => textOf(first(run?.['a:t'])))
                .join(''))
            .filter((text) => text !== '');
        const richTitle = paragraphs.join(' ').trim();
        if (richTitle !== '') {
            return richTitle;
        }
        const cachedTitle = readPoints(first(first(titleData?.['c:strRef'])?.['c:strCache']), identity, '');
        return cachedTitle.join(' ').trim();
    }

    /** Extract series/categories of a bar chart element. */
    extractBarChartData(barChart) {
        return this.extractGenericChartData(barChart);
    }

    /** Extract series/categories of a line chart element. */
    extractLineChartData(lineChart) {
        return this.extractGenericChartData(lineChart);
    }

    /** Extract series/categories of a pie chart element. */
    extractPieChartData(pieChart) {
        return this.extractGenericChartData(pieChart);
    }

    /** Extract series/categories of a scatter chart element (x values become categories). */
    extractScatterChartData(scatterChart) {
        return this.extractGenericChartData(scatterChart);
    }

    /**
     * Read every `c:ser` child of a chart element.
     *
     * Values come from `c:val`, or `c:yVal` for scatter-style series; labels
     * come from `c:cat`, or `c:xVal`. Both referenced caches and literal lists,
     * string or numeric, are accepted.
     *
     * @param {*} chartData - Parsed chart-type element (e.g. `c:barChart`).
     * @returns {{series: Array<{name: string, values: number[], categories: (string[]|undefined)}>, categories: string[]}}
     *   `categories` is the label list of the first series that has one.
     */
    extractGenericChartData(chartData) {
        const series = [];
        let allCategories = [];
        for (const seriesData of asArray(chartData?.['c:ser'])) {
            const seriesInfo = {
                name: '',
                values: [],
                categories: undefined
            };
            // Extract series name: cached cell reference first, literal text second
            const nameSource = first(seriesData?.['c:tx']);
            const namePoint = first(first(first(nameSource?.['c:strRef'])?.['c:strCache'])?.['c:pt']);
            seriesInfo.name = textOf(first(namePoint?.['c:v'])) || textOf(first(nameSource?.['c:v']));
            // Extract values
            const valueSource = first(seriesData?.['c:val']) ?? first(seriesData?.['c:yVal']);
            seriesInfo.values = readPoints(findPointCache(valueSource), toNumber, 0);
            // Extract categories for this series
            const categorySource = first(seriesData?.['c:cat']) ?? first(seriesData?.['c:xVal']);
            const categories = readPoints(findPointCache(categorySource), identity, '');
            if (categories.length > 0) {
                seriesInfo.categories = categories;
                if (allCategories.length === 0) {
                    allCategories = categories;
                }
            }
            series.push(seriesInfo);
        }
        return { series, categories: allCategories };
    }

    /**
     * Format chart data as markdown
     *
     * Increments the chart counter on every call, including for charts without data.
     *
     * @param {{type: string, title: string, series: Array, categories: Array}} chartData
     * @returns {string} A level-4 heading followed by the rendered chart body.
     */
    formatChartAsMarkdown(chartData) {
        this.chartCounter++;
        let markdown = `#### Chart ${this.chartCounter}: ${chartData.title || `${chartData.type.toUpperCase()} Chart`}\n\n`;
        if (chartData.series.length === 0) {
            return `${markdown}*No chart data available*\n\n`;
        }
        switch (chartData.type) {
            case 'bar':
            case 'line':
                markdown += this.formatBarLineChart(chartData);
                break;
            case 'pie':
                markdown += this.formatPieChart(chartData);
                break;
            default:
                markdown += this.formatGenericChart(chartData);
        }
        return `${markdown}\n`;
    }

    /**
     * Render bar/line data as a table with one row per category and one column per series.
     *
     * The row count also considers the number of values, so a series without
     * category labels still produces rows (labelled `Item N`).
     *
     * @param {{series: Array, categories: Array}} chartData
     * @returns {string} Markdown table.
     */
    formatBarLineChart(chartData) {
        const sharedCategories = chartData.categories ?? [];
        const headers = ['Category', ...chartData.series.map((series) => series.name || 'Series')];
        let markdown = tableRow(headers);
        // Add separator
        markdown += `|${' --- |'.repeat(headers.length)}\n`;
        // Find the number of rows needed to show every label and every value
        const rowCount = Math.max(sharedCategories.length, ...chartData.series.map((series) => Math.max(series.categories?.length ?? 0, series.values.length)));
        // Add data rows
        for (let i = 0; i < rowCount; i++) {
            // `||` on purpose: an empty label falls through to the next source.
            const category = sharedCategories[i] || chartData.series[0]?.categories?.[i] || `Item ${i + 1}`;
            const values = chartData.series.map((series) => series.values[i] || 0);
            markdown += tableRow([category, ...values]);
        }
        return markdown;
    }

    /**
     * Render the first series of a pie chart with each slice's share of the total.
     *
     * Every value gets a row, so the percentages shown add up to the total they
     * were computed from; slices without a label are called `Item N`.
     *
     * @param {{series: Array, categories: Array}} chartData
     * @returns {string} Markdown table, or a placeholder when there is no series.
     */
    formatPieChart(chartData) {
        const series = chartData.series[0];
        if (!series) {
            return '*No pie chart data*\n';
        }
        let markdown = '| Category | Value | Percentage |\n';
        markdown += '| --- | --- | --- |\n';
        const total = series.values.reduce((sum, val) => sum + val, 0);
        const categories = series.categories || chartData.categories || [];
        const rowCount = Math.max(categories.length, series.values.length);
        for (let i = 0; i < rowCount; i++) {
            const category = categories[i] || `Item ${i + 1}`;
            const value = series.values[i] || 0;
            const percentage = total > 0 ? ((value / total) * 100).toFixed(1) : '0';
            markdown += tableRow([category, value, `${percentage}%`]);
        }
        return markdown;
    }

    /**
     * Render any other chart type as a plain listing of each series' values and categories.
     *
     * @param {{type: string, series: Array}} chartData
     * @returns {string} Markdown text.
     */
    formatGenericChart(chartData) {
        let markdown = `*${chartData.type.toUpperCase()} chart with ${chartData.series.length} series*\n\n`;
        for (const [i, series] of chartData.series.entries()) {
            markdown += `**Series ${i + 1}: ${series.name}**\n`;
            markdown += `Values: ${series.values.join(', ')}\n`;
            if (series.categories && series.categories.length > 0) {
                markdown += `Categories: ${series.categories.join(', ')}\n`;
            }
            markdown += '\n';
        }
        return markdown;
    }

    /**
     * Reset internal counters
     */
    reset() {
        this.chartCounter = 0;
    }

    /**
     * Get current chart counter
     */
    get currentChartCount() {
        return this.chartCounter;
    }
}
//# sourceMappingURL=chart-extractor.js.map