UNPKG

@knowcode/convert-to-markdown

Version:

Convert Excel, PDF, and Word documents to clean, AI-ready formats like Markdown and JSON

206 lines (170 loc) 6.58 kB
/** * Excel converter functions * Handles conversion of Excel files to JSON and Markdown formats */ const XLSX = require('xlsx'); const ExcelJS = require('exceljs'); const { dataToMarkdownTable, isRowEmpty, cleanRowData, createMetadataHeader, calculateStats } = require('../utils/common'); /** * Converts Excel buffer to JSON format * @param {Buffer} buffer - Excel file buffer * @param {Object} options - Conversion options * @param {string} options.filename - Original filename * @param {string} options.sheetPrefix - Optional sheet name prefix filter * @returns {Promise<Object>} Conversion result with content and statistics */ async function convertExcelToJson(buffer, options = {}) { const { filename = 'document.xlsx', sheetPrefix } = options; // Read the workbook const workbook = new ExcelJS.Workbook(); await workbook.xlsx.load(buffer); const statistics = { fileName: filename, fileSize: { bytes: buffer.length, KB: (buffer.length / 1024).toFixed(2), MB: (buffer.length / (1024 * 1024)).toFixed(2) }, sheets: [] }; const allSheets = {}; let totalCells = 0; let totalNonEmptyCells = 0; // Process each worksheet for (const worksheet of workbook.worksheets) { const sheetName = worksheet.name; // Skip if sheetPrefix is specified and doesn't match if (sheetPrefix && !sheetName.startsWith(sheetPrefix)) { continue; } const sheetData = []; let cellCount = 0; let nonEmptyCellCount = 0; // Get headers from the first row const headers = []; const firstRow = worksheet.getRow(1); firstRow.eachCell({ includeEmpty: false }, (cell, colNumber) => { headers[colNumber - 1] = cell.value || `Column${colNumber}`; }); // Process data rows worksheet.eachRow({ includeEmpty: false }, (row, rowNumber) => { if (rowNumber === 1) return; // Skip header row const rowData = {}; let hasData = false; row.eachCell({ includeEmpty: true }, (cell, colNumber) => { cellCount++; const header = headers[colNumber - 1] || `Column${colNumber}`; let value = cell.value; // Handle different cell types if (cell.type === ExcelJS.ValueType.Date) { value = cell.value.toISOString().split('T')[0]; } else if (cell.type === ExcelJS.ValueType.Formula) { // Get the calculated result value = cell.result || cell.value; } else if (cell.type === ExcelJS.ValueType.RichText) { value = cell.value.richText.map(rt => rt.text).join(''); } if (value !== null && value !== undefined && value !== '') { nonEmptyCellCount++; hasData = true; } rowData[header] = value; }); if (hasData) { sheetData.push(rowData); } }); if (sheetData.length > 0) { allSheets[sheetName] = sheetData; } totalCells += cellCount; totalNonEmptyCells += nonEmptyCellCount; statistics.sheets.push({ name: sheetName, rowCount: sheetData.length, columnCount: headers.length, cellCount: cellCount, nonEmptyCells: nonEmptyCellCount, emptyRate: cellCount > 0 ? ((cellCount - nonEmptyCellCount) / cellCount).toFixed(2) : 0 }); } // Calculate token estimation based on stringified content const content = JSON.stringify(allSheets, null, 2); const { estimatedTokens } = calculateStats(content); statistics.totalSheets = Object.keys(allSheets).length; statistics.totalCells = totalCells; statistics.totalNonEmptyCells = totalNonEmptyCells; statistics.overallEmptyRate = totalCells > 0 ? ((totalCells - totalNonEmptyCells) / totalCells).toFixed(2) : 0; statistics.estimatedTokens = estimatedTokens; return { content, statistics }; } /** * Converts Excel buffer to Markdown format * @param {Buffer} buffer - Excel file buffer * @param {Object} options - Conversion options * @param {string} options.filename - Original filename * @returns {Promise<Object>} Conversion result with markdown and statistics */ async function convertExcelToMarkdown(buffer, options = {}) { const { filename = 'document.xlsx' } = options; const workbook = XLSX.read(buffer); let markdownContent = []; const sheetStats = {}; // Add metadata section const metadata = { title: filename.replace(/\.[^/.]+$/, ''), type: 'markdown', source: 'xlsx', sheets: workbook.SheetNames }; markdownContent.push(createMetadataHeader(metadata)); // Process each sheet workbook.SheetNames.forEach(sheetName => { const sheet = workbook.Sheets[sheetName]; const sheetData = XLSX.utils.sheet_to_json(sheet, { defval: '', raw: false, dateNF: 'yyyy-mm-dd' }); // Clean and filter the data const cleanedData = sheetData .map(cleanRowData) .filter(row => !isRowEmpty(row)); // Add sheet header markdownContent.push(`## Sheet: ${sheetName}\n`); // Convert sheet to markdown table const tableMarkdown = dataToMarkdownTable(cleanedData); markdownContent.push(tableMarkdown + '\n\n'); // Calculate sheet statistics sheetStats[sheetName] = { rowCount: cleanedData.length, columnCount: cleanedData.length > 0 ? Object.keys(cleanedData[0]).length : 0 }; }); const finalMarkdown = markdownContent.join('\n'); // Calculate statistics const stats = calculateStats(finalMarkdown, { numberOfSheets: workbook.SheetNames.length, sheets: sheetStats, numberOfTables: workbook.SheetNames.length, estimatedTokensPerSheet: Math.ceil(calculateStats(finalMarkdown).estimatedTokens / workbook.SheetNames.length) }); return { document: finalMarkdown, markdown: finalMarkdown, stats }; } module.exports = { convertExcelToJson, convertExcelToMarkdown };