// @knowcode/convert-to-markdown
// Version:
// Convert Excel, PDF, and Word documents to clean, AI-ready formats like Markdown and JSON
// 206 lines (170 loc) • 6.58 kB
// JavaScript
/**
* Excel converter functions
* Handles conversion of Excel files to JSON and Markdown formats
*/
const XLSX = require('xlsx');
const ExcelJS = require('exceljs');
const {
dataToMarkdownTable,
isRowEmpty,
cleanRowData,
createMetadataHeader,
calculateStats
} = require('../utils/common');
/**
 * Formats a Date as a calendar date (YYYY-MM-DD) taken from its ISO string.
 * @param {Date} date - Date to format
 * @returns {string|null} Date string, or null when the Date is invalid
 * @private
 */
function formatExcelDate(date) {
  // toISOString() throws a RangeError on an invalid Date; report it as empty instead
  // of aborting the whole conversion.
  return Number.isNaN(date.getTime()) ? null : date.toISOString().split('T')[0];
}

/**
 * Reduces a worksheet cell to the plain value that should be serialised.
 * @param {Object} cell - Cell exposing `type`, `value` and (for formulas) `result`
 * @returns {*} Date string for date cells, cached result for formula cells,
 *   concatenated text for rich-text cells, otherwise the raw cell value
 * @private
 */
function extractCellValue(cell) {
  switch (cell.type) {
    case ExcelJS.ValueType.Date:
      return formatExcelDate(cell.value);
    case ExcelJS.ValueType.Formula:
      // Only fall back to the raw formula value when there is no result at all;
      // a truthiness test would throw away legitimate results such as 0, false or ''.
      return cell.result !== undefined && cell.result !== null ? cell.result : cell.value;
    case ExcelJS.ValueType.RichText:
      return cell.value.richText.map((run) => run.text).join('');
    default:
      return cell.value;
  }
}

/**
 * Derives the property name used for a column from its header cell.
 * @param {Object} cell - Header cell from the first row
 * @param {number} colNumber - 1-based column number, used for the fallback name
 * @returns {string} Header text, or `Column<N>` when the cell yields no usable text
 * @private
 */
function resolveHeaderName(cell, colNumber) {
  let label = extractCellValue(cell);
  if (label instanceof Date) {
    label = formatExcelDate(label);
  }
  if (label !== null && typeof label === 'object') {
    // Object values would otherwise be coerced to "[object Object]" when used as a key,
    // making every such column collide. Keep a plain `text` field if there is one.
    label = typeof label.text === 'string' ? label.text : null;
  }
  return label ? String(label) : `Column${colNumber}`;
}

/**
 * Converts Excel buffer to JSON format.
 *
 * The first row of every worksheet is treated as the header row; each following row
 * that contains at least one non-empty value becomes one object keyed by header name.
 *
 * @param {Buffer} buffer - Excel file buffer
 * @param {Object} options - Conversion options
 * @param {string} options.filename - Original filename (default: 'document.xlsx')
 * @param {string} options.sheetPrefix - Optional prefix; worksheets whose name does not
 *   start with it are skipped entirely (data and statistics)
 * @returns {Promise<Object>} `{ content, statistics }` where `content` is the
 *   pretty-printed JSON string of `{ [sheetName]: rows[] }` and `statistics` holds file
 *   size, per-sheet counts and totals. `statistics.sheets` lists every processed
 *   worksheet, while `statistics.totalSheets` counts only those with at least one data
 *   row. Empty rates are `toFixed(2)` strings, or the number 0 when no cells were seen.
 */
async function convertExcelToJson(buffer, options = {}) {
  const { filename = 'document.xlsx', sheetPrefix } = options;

  // Read the workbook
  const workbook = new ExcelJS.Workbook();
  await workbook.xlsx.load(buffer);

  const statistics = {
    fileName: filename,
    fileSize: {
      bytes: buffer.length,
      KB: (buffer.length / 1024).toFixed(2),
      MB: (buffer.length / (1024 * 1024)).toFixed(2)
    },
    sheets: []
  };
  const allSheets = {};
  let totalCells = 0;
  let totalNonEmptyCells = 0;

  // Process each worksheet
  for (const worksheet of workbook.worksheets) {
    const sheetName = worksheet.name;

    // Skip if sheetPrefix is specified and doesn't match
    if (sheetPrefix && !sheetName.startsWith(sheetPrefix)) {
      continue;
    }

    const sheetData = [];
    let cellCount = 0;
    let nonEmptyCellCount = 0;

    // Get headers from the first row (indexed by column, so the array may be sparse)
    const headers = [];
    worksheet.getRow(1).eachCell({ includeEmpty: false }, (cell, colNumber) => {
      headers[colNumber - 1] = resolveHeaderName(cell, colNumber);
    });

    // Process data rows
    worksheet.eachRow({ includeEmpty: false }, (row, rowNumber) => {
      if (rowNumber === 1) return; // Skip header row

      const rowData = {};
      let hasData = false;

      row.eachCell({ includeEmpty: true }, (cell, colNumber) => {
        cellCount++;
        const header = headers[colNumber - 1] || `Column${colNumber}`;
        const value = extractCellValue(cell);

        if (value !== null && value !== undefined && value !== '') {
          nonEmptyCellCount++;
          hasData = true;
        }
        rowData[header] = value;
      });

      if (hasData) {
        sheetData.push(rowData);
      }
    });

    if (sheetData.length > 0) {
      allSheets[sheetName] = sheetData;
    }

    totalCells += cellCount;
    totalNonEmptyCells += nonEmptyCellCount;
    statistics.sheets.push({
      name: sheetName,
      rowCount: sheetData.length,
      columnCount: headers.length,
      cellCount: cellCount,
      nonEmptyCells: nonEmptyCellCount,
      emptyRate: cellCount > 0 ? ((cellCount - nonEmptyCellCount) / cellCount).toFixed(2) : 0
    });
  }

  // Calculate token estimation based on stringified content
  const content = JSON.stringify(allSheets, null, 2);
  const { estimatedTokens } = calculateStats(content);

  statistics.totalSheets = Object.keys(allSheets).length;
  statistics.totalCells = totalCells;
  statistics.totalNonEmptyCells = totalNonEmptyCells;
  statistics.overallEmptyRate = totalCells > 0 ? ((totalCells - totalNonEmptyCells) / totalCells).toFixed(2) : 0;
  statistics.estimatedTokens = estimatedTokens;

  return {
    content,
    statistics
  };
}
/**
 * Converts Excel buffer to Markdown format.
 *
 * Emits a metadata header followed by one `## Sheet: <name>` section per worksheet,
 * each containing the sheet's non-empty rows rendered as a Markdown table.
 *
 * @param {Buffer} buffer - Excel file buffer
 * @param {Object} options - Conversion options
 * @param {string} options.filename - Original filename (default: 'document.xlsx');
 *   its extension is stripped to form the metadata title
 * @returns {Promise<Object>} `{ document, markdown, stats }`; `document` and `markdown`
 *   hold the same Markdown string, `stats` is the result of `calculateStats` called with
 *   the sheet counts, per-sheet row/column counts and an average token estimate per sheet
 */
async function convertExcelToMarkdown(buffer, options = {}) {
  const { filename = 'document.xlsx' } = options;

  // NOTE(review): the workbook is read with default options; confirm against the
  // SheetJS docs that `dateNF` below is honoured without `cellDates` being enabled.
  const workbook = XLSX.read(buffer);
  const sheetNames = workbook.SheetNames;
  const sheetCount = sheetNames.length;

  const markdownContent = [];
  const sheetStats = {};

  // Add metadata section
  const metadata = {
    title: filename.replace(/\.[^/.]+$/, ''),
    type: 'markdown',
    source: 'xlsx',
    sheets: sheetNames
  };
  markdownContent.push(createMetadataHeader(metadata));

  // Process each sheet
  sheetNames.forEach((sheetName) => {
    const sheet = workbook.Sheets[sheetName];
    const sheetData = XLSX.utils.sheet_to_json(sheet, {
      defval: '',
      raw: false,
      dateNF: 'yyyy-mm-dd'
    });

    // Clean and filter the data
    const cleanedData = sheetData
      .map(cleanRowData)
      .filter((row) => !isRowEmpty(row));

    // Add sheet header
    markdownContent.push(`## Sheet: ${sheetName}\n`);

    // Convert sheet to markdown table
    const tableMarkdown = dataToMarkdownTable(cleanedData);
    markdownContent.push(tableMarkdown + '\n\n');

    // Calculate sheet statistics (column count is taken from the first cleaned row)
    sheetStats[sheetName] = {
      rowCount: cleanedData.length,
      columnCount: cleanedData.length > 0 ? Object.keys(cleanedData[0]).length : 0
    };
  });

  const finalMarkdown = markdownContent.join('\n');

  // Calculate statistics
  const { estimatedTokens } = calculateStats(finalMarkdown);
  const stats = calculateStats(finalMarkdown, {
    numberOfSheets: sheetCount,
    sheets: sheetStats,
    numberOfTables: sheetCount,
    // Guard the average: dividing by zero sheets would yield NaN or Infinity.
    estimatedTokensPerSheet: sheetCount > 0 ? Math.ceil(estimatedTokens / sheetCount) : 0
  });

  return {
    document: finalMarkdown,
    markdown: finalMarkdown,
    stats
  };
}
// Public API of this module: the Excel-to-JSON and Excel-to-Markdown converters.
module.exports = {
convertExcelToJson,
convertExcelToMarkdown
};