UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

315 lines 12.5 kB
"use strict";
/**
 * Excel Parser Implementation
 * Supports .xlsx, .xls, and .xlsm formats using ExcelJS
 */
// Module-interop helpers in the shape emitted by the TypeScript compiler.
// They let `require()`d CommonJS modules be consumed as namespace imports.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.ExcelParser = exports.ExcelDetector = void 0;
exports.createExcelParser = createExcelParser;
const fs_1 = require("fs");
const path = __importStar(require("path"));
const ExcelJS = __importStar(require("exceljs"));
const data_parser_1 = require("./base/data-parser");
const types_1 = require("../core/types");
const logger_1 = require("../utils/logger");
/**
 * Excel Format Detector
 *
 * Scores how likely a file is to be an Excel workbook by combining an
 * extension check with an attempt to open the workbook and list its sheets.
 */
class ExcelDetector {
    /**
     * @returns {string[]} File extensions (lower-case, with leading dot) this detector accepts.
     */
    getSupportedExtensions() {
        // NOTE(review): the workbook is always opened through `workbook.xlsx.readFile`;
        // legacy binary `.xls` files presumably fail there and end up with confidence 0 -
        // confirm against the ExcelJS version in use.
        return ['.xlsx', '.xls', '.xlsm'];
    }
    /**
     * @returns {string} The format identifier reported in detection results.
     */
    getFormatName() {
        return 'excel';
    }
    /**
     * Detect whether `filePath` is a readable Excel workbook.
     *
     * Never throws: any failure is logged as a warning and reported as a
     * result with `confidence: 0` and the error message in `metadata.error`.
     *
     * @param {string} filePath - Path of the file to inspect.
     * @returns {Promise<object>} Detection result with `format`, `confidence`, `metadata`
     *   and, when sheets were found, `estimatedRows`, `estimatedColumns` and `suggestedOptions`.
     */
    async detect(filePath) {
        try {
            // Check extension first
            const ext = path.extname(filePath).toLowerCase();
            const extensionScore = this.getSupportedExtensions().includes(ext) ? 0.3 : 0;
            if (extensionScore === 0) {
                return {
                    format: 'excel',
                    confidence: 0,
                    metadata: { reason: 'Unsupported extension' },
                };
            }
            // Open the workbook to confirm it really is an Excel file and to list its sheets
            const metadata = await this.readExcelMetadata(filePath);
            if (metadata.sheetCount > 0) {
                const confidence = Math.min(0.95, extensionScore + 0.7);
                // Suggest the first sheet that actually contains data; fall back to the
                // first sheet so that `sheetIndex` and `sheetName` never disagree.
                const firstSheetWithData = metadata.sheets.find((s) => s.hasData);
                return {
                    format: 'excel',
                    confidence,
                    metadata,
                    estimatedRows: metadata.sheets.reduce((total, sheet) => total + sheet.rowCount, 0),
                    estimatedColumns: Math.max(...metadata.sheets.map((sheet) => sheet.columnCount)),
                    suggestedOptions: {
                        sheetIndex: firstSheetWithData?.index ?? 0,
                        sheetName: firstSheetWithData?.name,
                    },
                };
            }
            return {
                format: 'excel',
                confidence: extensionScore,
                metadata: { reason: 'No sheets with data found' },
            };
        }
        catch (error) {
            logger_1.logger.warn(`Excel detection failed: ${error.message}`);
            return {
                format: 'excel',
                confidence: 0,
                metadata: { error: error.message },
            };
        }
    }
    /**
     * Load the workbook and summarise its worksheets.
     *
     * @param {string} filePath - Path of the workbook to open.
     * @returns {Promise<{sheetCount: number, sheets: Array<{name: string, index: number, rowCount: number, columnCount: number, hasData: boolean}>}>}
     *   `index` is the zero-based position of the sheet in `workbook.worksheets`.
     * @throws {DataPilotError} `EXCEL_METADATA_ERROR` when the workbook cannot be read.
     */
    async readExcelMetadata(filePath) {
        const workbook = new ExcelJS.Workbook();
        try {
            await workbook.xlsx.readFile(filePath);
            const sheets = workbook.worksheets.map((worksheet, index) => ({
                name: worksheet.name,
                index,
                rowCount: worksheet.rowCount,
                columnCount: worksheet.columnCount,
                hasData: worksheet.rowCount > 0 && worksheet.columnCount > 0,
            }));
            return {
                sheetCount: sheets.length,
                sheets,
            };
        }
        catch (error) {
            throw new types_1.DataPilotError(`Failed to read Excel metadata: ${error.message}`, 'EXCEL_METADATA_ERROR', types_1.ErrorSeverity.MEDIUM, types_1.ErrorCategory.PARSING);
        }
    }
}
exports.ExcelDetector = ExcelDetector;
/**
 * Excel Parser Implementation
 *
 * Reads one worksheet of a workbook and yields its rows as arrays of strings,
 * normalised to the width of the header row.
 */
class ExcelParser extends data_parser_1.BaseParser {
    /** Column names of the most recently parsed worksheet. */
    headers = [];
    /**
     * @returns {string[]} File extensions (lower-case, with leading dot) this parser accepts.
     */
    getSupportedExtensions() {
        return ['.xlsx', '.xls', '.xlsm'];
    }
    /**
     * @returns {string} The format identifier of this parser.
     */
    getFormatName() {
        return 'excel';
    }
    /**
     * Run format detection for `filePath` using a fresh {@link ExcelDetector}.
     *
     * @param {string} filePath - Path of the file to inspect.
     * @returns {Promise<object>} The detector's result.
     */
    async detect(filePath) {
        const detector = new ExcelDetector();
        return detector.detect(filePath);
    }
    /**
     * Parse a workbook and yield the rows of the selected worksheet.
     *
     * @param {string} filePath - Path of the workbook to parse.
     * @param {object} [options] - Per-call options, merged over `this.options`.
     *   Options read here and in the helpers: `sheetName`, `sheetIndex`, `maxRows`, `hasHeader`.
     * @yields {{index: number, data: string[], raw: string, metadata: object}} One entry per data row.
     * @throws {DataPilotError} `EXCEL_PARSING_ERROR` for any failure. Errors raised by
     *   `selectWorksheet` are re-wrapped too, so their specific codes are not visible here.
     */
    async *parse(filePath, options) {
        const mergedOptions = { ...this.options, ...options };
        try {
            const workbook = new ExcelJS.Workbook();
            await workbook.xlsx.readFile(filePath);
            // Get file size for stats
            const fileStats = await fs_1.promises.stat(filePath);
            this.updateStats(fileStats.size, 0);
            // Select worksheet
            const worksheet = this.selectWorksheet(workbook, mergedOptions);
            if (!worksheet) {
                throw new types_1.DataPilotError('No valid worksheet found or specified sheet does not exist', 'EXCEL_WORKSHEET_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
            }
            logger_1.logger.info(`Parsing Excel sheet: ${worksheet.name} (${worksheet.rowCount} rows, ${worksheet.columnCount} columns)`);
            yield* this.parseWorksheet(worksheet, mergedOptions);
        }
        catch (error) {
            throw new types_1.DataPilotError(`Excel parsing failed: ${error.message}`, 'EXCEL_PARSING_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING);
        }
    }
    /**
     * Choose the worksheet to parse.
     *
     * Priority: explicit `sheetName`, then explicit `sheetIndex`, then the first
     * worksheet with data, then the first worksheet.
     *
     * @param {object} workbook - A loaded ExcelJS workbook.
     * @param {object} options - Merged parse options (`sheetName`, `sheetIndex`).
     * @returns {object|null} The chosen worksheet, or `null` when the workbook has none.
     * @throws {DataPilotError} `EXCEL_SHEET_NOT_FOUND` / `EXCEL_SHEET_INDEX_ERROR` when an
     *   explicitly requested sheet does not exist.
     */
    selectWorksheet(workbook, options) {
        // Priority 1: Specific sheet name
        if (options.sheetName) {
            const worksheet = workbook.getWorksheet(options.sheetName);
            if (worksheet) {
                return worksheet;
            }
            throw new types_1.DataPilotError(`Sheet "${options.sheetName}" not found. Available sheets: ${workbook.worksheets.map((ws) => ws.name).join(', ')}`, 'EXCEL_SHEET_NOT_FOUND', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
        }
        // Priority 2: Specific sheet index
        if (options.sheetIndex !== undefined && options.sheetIndex !== null) {
            // Index positionally into `workbook.worksheets`. `getWorksheet(number)` looks a
            // sheet up by its id, which need not match its position (e.g. after sheets were
            // deleted or reordered), whereas the error message below and `getWorksheets()`
            // both describe zero-based positions.
            const worksheet = workbook.worksheets[options.sheetIndex];
            if (worksheet) {
                return worksheet;
            }
            throw new types_1.DataPilotError(`Sheet index ${options.sheetIndex} not found. Available sheets: 0-${workbook.worksheets.length - 1}`, 'EXCEL_SHEET_INDEX_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
        }
        // Priority 3: First worksheet with data
        const worksheetWithData = workbook.worksheets.find((ws) => ws.rowCount > 0 && ws.columnCount > 0);
        if (worksheetWithData) {
            return worksheetWithData;
        }
        // Priority 4: First worksheet (even if empty)
        return workbook.worksheets[0] || null;
    }
    /**
     * Convert the non-empty rows of a worksheet into parsed-row records.
     *
     * With `hasHeader` (default `true`) the first non-empty row becomes `this.headers`
     * and is not yielded; blank header cells are named `Column_<position>`. Without a
     * header row, `Column_1..Column_N` names are generated before the first row is
     * normalised, where N is the wider of the sheet's column count and that first row.
     *
     * @param {object} worksheet - ExcelJS worksheet to read.
     * @param {object} options - Merged parse options (`maxRows`, `hasHeader`).
     * @yields {{index: number, data: string[], raw: string, metadata: object}}
     */
    async *parseWorksheet(worksheet, options) {
        const maxRows = options.maxRows || worksheet.rowCount;
        const hasHeader = options.hasHeader ?? true;
        // Start clean so headers from an earlier parse on this instance cannot leak in
        this.headers = [];
        let headerConsumed = false;
        let dataRowIndex = 0;
        // Collect rows first since eachRow doesn't support async iteration
        const rows = [];
        worksheet.eachRow({ includeEmpty: false }, (row, rowNumber) => {
            if (this.aborted || dataRowIndex >= maxRows) {
                return;
            }
            // `row.values` is 1-based and sparse. Array.from (unlike map) visits the
            // holes, so unset cells become '' instead of staying as gaps.
            const cellData = Array.from(row.values.slice(1), (cell) => this.formatCellValue(cell));
            // Handle header row
            if (hasHeader && !headerConsumed) {
                headerConsumed = true;
                // Use the cell's own position for the fallback name so several blank
                // header cells do not all receive the same generated name.
                this.headers = cellData.map((cell, position) => String(cell || `Column_${position + 1}`));
                return; // Skip yielding header row
            }
            // No header row (or an empty one): generate names before normalising,
            // otherwise every row would be truncated to zero columns.
            if (this.headers.length === 0) {
                const columnCount = Math.max(worksheet.columnCount || 0, cellData.length);
                this.headers = Array.from({ length: columnCount }, (_, i) => `Column_${i + 1}`);
            }
            // Ensure consistent column count with headers
            const normalizedData = this.normalizeRowData(cellData, this.headers.length);
            rows.push({
                index: dataRowIndex++,
                data: normalizedData,
                raw: normalizedData.join('\t'), // Use tab as delimiter for Excel
                metadata: {
                    originalType: 'excel',
                    sheetName: worksheet.name,
                    excelRow: rowNumber,
                    columnCount: normalizedData.length,
                },
            });
            this.updateStats(0, 1);
        });
        // Yield collected rows
        for (const row of rows) {
            yield row;
        }
    }
    /**
     * Convert an ExcelJS cell value into its string representation.
     *
     * @param {*} cell - Raw value from `row.values` (primitive, Date, or an ExcelJS value object).
     * @returns {string} Text for the cell; `''` for null/undefined, invalid dates and
     *   formulas without a cached result. Dates are rendered as `YYYY-MM-DD` (UTC).
     */
    formatCellValue(cell) {
        if (cell === null || cell === undefined) {
            return '';
        }
        // Handle Date objects
        if (cell instanceof Date) {
            // toISOString() throws on an invalid date; treat it as an empty cell instead
            if (Number.isNaN(cell.getTime())) {
                return '';
            }
            return cell.toISOString().split('T')[0]; // Return date as YYYY-MM-DD
        }
        // Handle ExcelJS cell objects
        if (typeof cell === 'object') {
            // Rich text cell
            if (cell.richText) {
                return cell.richText.map((rt) => rt.text || '').join('');
            }
            // Formula cell (regular or shared). The cached result may itself be a Date,
            // null or an error object, so format it with the same rules.
            if (cell.formula || cell.sharedFormula) {
                return cell.result !== undefined ? this.formatCellValue(cell.result) : '';
            }
            // Hyperlink cell
            if (cell.hyperlink) {
                return cell.text || cell.hyperlink.text || '';
            }
            // Regular cell with text property
            if (cell.text !== undefined) {
                return String(cell.text);
            }
            // Cell with value property
            if (cell.value !== undefined) {
                return String(cell.value);
            }
            // Error value such as { error: '#N/A' }
            if (cell.error !== undefined) {
                return String(cell.error);
            }
        }
        // Handle primitive values
        return String(cell);
    }
    /**
     * Pad or truncate a row so it has exactly `expectedLength` cells.
     *
     * @param {string[]} data - Cell values of one row (not modified).
     * @param {number} expectedLength - Target number of cells.
     * @returns {string[]} A new array of length `expectedLength`, padded with `''`.
     */
    normalizeRowData(data, expectedLength) {
        const normalized = [...data];
        // Pad with empty strings if row is shorter
        while (normalized.length < expectedLength) {
            normalized.push('');
        }
        // Truncate if row is longer
        if (normalized.length > expectedLength) {
            normalized.length = expectedLength;
        }
        return normalized;
    }
    /**
     * Get detected headers for column mapping
     *
     * @returns {string[]} A copy of the headers from the most recent parse.
     */
    getHeaders() {
        return [...this.headers];
    }
    /**
     * Get available worksheets in the file
     *
     * @param {string} filePath - Path of the workbook to open.
     * @returns {Promise<Array<{name: string, index: number, rowCount: number, columnCount: number}>>}
     *   One entry per worksheet; `index` is its zero-based position in `workbook.worksheets`.
     * @throws {DataPilotError} `EXCEL_WORKSHEETS_ERROR` when the workbook cannot be read.
     */
    async getWorksheets(filePath) {
        try {
            const workbook = new ExcelJS.Workbook();
            await workbook.xlsx.readFile(filePath);
            return workbook.worksheets.map((worksheet, index) => ({
                name: worksheet.name,
                index,
                rowCount: worksheet.rowCount,
                columnCount: worksheet.columnCount,
            }));
        }
        catch (error) {
            throw new types_1.DataPilotError(`Failed to read Excel worksheets: ${error.message}`, 'EXCEL_WORKSHEETS_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING);
        }
    }
}
exports.ExcelParser = ExcelParser;
/**
 * Factory function to create Excel parser
 *
 * @param {object} [options] - Options forwarded to the `ExcelParser` constructor.
 * @returns {ExcelParser} A new parser instance.
 */
function createExcelParser(options) {
    return new ExcelParser(options);
}
//# sourceMappingURL=excel-parser.js.map