UNPKG

cleanifix

Version:

Intelligent data cleaning CLI with natural language support - Docker-powered Python engine

222 lines 8.63 kB
"use strict";
// ---------------------------------------------------------------------------
// Module-interop helpers. They look compiler-generated (see the source-map
// reference at the end of the file) and are intentionally left untouched.
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.cleanCommand = cleanCommand;
const fs_1 = require("fs");
const path = __importStar(require("path"));
const logger_1 = require("../utils/logger");
const python_bridge_1 = require("../services/python-bridge");
const spinner_1 = require("../utils/spinner");

/**
 * Run the `clean` command: validate paths, optionally back up the input,
 * hand the job to the Python bridge and optionally write a JSON report.
 *
 * Any error is logged and the process exits with status 1 — but only after
 * the Python bridge has been given the chance to shut down.
 *
 * @param {string} input - Path of the file to clean (resolved against cwd).
 * @param {object} options - Parsed CLI options. Fields read here:
 *   `output` (required), `config`, `rules`, `backup`, `dryRun`, `report`,
 *   `duplicates`, `strategy`, `subset`, `aggregationRules`, `missing`,
 *   `fillValue`, `columns`, `threshold`.
 * @returns {Promise<void>} Resolves when cleaning and bridge shutdown finish.
 *   Rejects only if the run succeeded but `pythonBridge.shutdown()` threw.
 */
async function cleanCommand(input, options) {
    const spin = (0, spinner_1.spinner)();
    let pythonBridge = null;
    let failed = false;
    try {
        // Initialize Python bridge
        pythonBridge = new python_bridge_1.PythonBridge();
        await pythonBridge.initialize();

        // Validate input file exists (stat rejects if the path is missing)
        const inputPath = path.resolve(input);
        const inputStats = await fs_1.promises.stat(inputPath);
        if (!inputStats.isFile()) {
            throw new Error(`Input must be a file, not a directory: ${input}`);
        }

        // Get output path from options
        if (!options.output) {
            throw new Error('Output path is required. Use --output <path>');
        }

        // Ensure output directory exists
        const outputPath = path.resolve(options.output);
        const outputDir = path.dirname(outputPath);
        await fs_1.promises.mkdir(outputDir, { recursive: true });

        // Parse cleaning rules. The resulting list is not forwarded to the
        // bridge in this version; the call is kept because it still fails
        // fast on an unreadable/invalid config file and warns about unknown
        // inline rule names.
        await loadCleaningRules(options);

        // Create backup if requested
        if (options.backup && !options.dryRun) {
            const backupPath = `${inputPath}.backup`;
            await fs_1.promises.copyFile(inputPath, backupPath);
            logger_1.logger.info(`Created backup: ${backupPath}`);
        }

        spin.start('Loading data...');
        const cleaningOptions = buildCleaningOptions(options);

        // Clean the data
        spin.start('Cleaning data...');
        const cleanResult = await pythonBridge.clean(inputPath, outputPath, cleaningOptions);
        if (!cleanResult.success) {
            throw new Error(`Failed to clean data: ${cleanResult.error?.message}`);
        }
        spin.stop();

        // Display message from result if available
        if (cleanResult.message) {
            logger_1.logger.info(cleanResult.message);
        }
        // Check for warnings in the result
        if (cleanResult.warning) {
            logger_1.logger.warn(cleanResult.warning);
            console.log(cleanResult.warning); // Also output to stdout for test
        }

        logger_1.logger.info(`✅ Data cleaning completed successfully!`);
        if (!options.dryRun) {
            logger_1.logger.info(`📁 Cleaned data saved to: ${outputPath}`);
        }
        else {
            logger_1.logger.info(`🔍 Dry run completed - no files were modified`);
        }

        // Generate report if requested
        if (options.report) {
            const reportPath = `${outputPath}.report.json`;
            const report = {
                timestamp: new Date().toISOString(),
                input_file: inputPath,
                output_file: outputPath,
                dry_run: options.dryRun || false,
                success: true,
                message: 'Data cleaning completed successfully'
            };
            await fs_1.promises.writeFile(reportPath, JSON.stringify(report, null, 2));
            logger_1.logger.info(`📄 Cleaning report saved to: ${reportPath}`);
        }
    }
    catch (error) {
        spin.stop();
        logger_1.logger.error('Cleaning failed:', error);
        // Do NOT call process.exit() here: it terminates the process
        // synchronously, so the `finally` block below would never run and
        // the Python bridge would be left without a shutdown.
        failed = true;
    }
    finally {
        // Always cleanup Python bridge
        if (pythonBridge) {
            try {
                await pythonBridge.shutdown();
            }
            catch (shutdownError) {
                // On the success path keep the previous contract (reject).
                if (!failed) {
                    throw shutdownError;
                }
                // On the failure path the original error was already
                // reported; a shutdown problem must not mask the exit code.
                logger_1.logger.warn(`Python bridge shutdown failed: ${shutdownError?.message ?? shutdownError}`);
            }
        }
    }
    if (failed) {
        process.exit(1);
    }
}

/**
 * Resolve the list of cleaning rules from the CLI options.
 *
 * Precedence: `options.config` (JSON file with a `rules` array) over
 * `options.rules` (comma-separated inline names) over the built-in defaults.
 *
 * @param {object} options - Parsed CLI options (`config`, `rules`).
 * @returns {Promise<Array<{type: string, config: object}>>} Rule descriptors.
 * @throws If the config file cannot be read or is not valid JSON.
 */
async function loadCleaningRules(options) {
    if (options.config) {
        // Load rules from config file
        const configPath = path.resolve(options.config);
        const configContent = await fs_1.promises.readFile(configPath, 'utf-8');
        const config = JSON.parse(configContent);
        return config.rules || [];
    }
    if (options.rules) {
        // Parse inline rules
        return parseInlineRules(options.rules);
    }
    // Default rules if none specified
    return [
        { type: 'missing_values', config: { strategy: 'drop' } },
        { type: 'duplicates', config: { keep: 'first' } },
        { type: 'standardize', config: { columns: 'all' } }
    ];
}

/**
 * Translate CLI options into the options object passed to
 * `pythonBridge.clean()`.
 *
 * @param {object} options - Parsed CLI options.
 * @returns {object} Options using the snake_case keys sent to the bridge.
 * @throws {Error} If `options.threshold` is given but is not a number.
 */
function buildCleaningOptions(options) {
    const cleaningOptions = {
        backup: options.backup,
        dry_run: options.dryRun
    };

    // Handle duplicate cleaning
    if (options.duplicates) {
        cleaningOptions.duplicates = true;
        cleaningOptions.strategy = options.strategy || 'first';
        if (options.subset) {
            cleaningOptions.subset = options.subset.split(',').map(c => c.trim());
        }
        if (options.aggregationRules) {
            try {
                cleaningOptions.aggregation_rules = JSON.parse(options.aggregationRules);
            }
            catch (e) {
                // Deliberate best-effort: bad JSON is reported and skipped.
                logger_1.logger.warn('Invalid aggregation rules JSON, ignoring');
            }
        }
    }

    // Handle missing value cleaning (can be combined with duplicates)
    if (options.missing) {
        cleaningOptions.missing = true;
        // NOTE(review): when both --duplicates and --missing are set without
        // an explicit strategy, this 'drop' default overwrites the 'first'
        // default chosen above. Behavior kept as-is; confirm it is intended.
        cleaningOptions.strategy = options.strategy || 'drop';
        // Null-check rather than truthiness so that 0 and '' remain usable
        // fill values.
        if (options.fillValue != null) {
            cleaningOptions.fill_value = options.fillValue;
        }
        if (options.columns) {
            cleaningOptions.columns = options.columns.split(',').map(c => c.trim());
        }
        if (options.threshold != null) {
            const threshold = parseFloat(options.threshold);
            if (Number.isNaN(threshold)) {
                throw new Error(`Invalid threshold value: ${options.threshold}`);
            }
            cleaningOptions.threshold = threshold;
        }
    }
    return cleaningOptions;
}

/**
 * Convert a comma-separated list of rule names (with aliases) into rule
 * descriptors. Unknown names are reported with a warning and skipped.
 *
 * @param {string} rulesString - e.g. `"missing, dedup, trim"`.
 * @returns {Array<{type: string, config: object}>} Rules in input order.
 */
function parseInlineRules(rulesString) {
    const rules = [];
    const ruleNames = rulesString.split(',').map(r => r.trim());
    for (const ruleName of ruleNames) {
        switch (ruleName) {
            case 'missing':
            case 'missing_values':
                rules.push({ type: 'missing_values', config: { strategy: 'drop' } });
                break;
            case 'duplicates':
            case 'dedup':
                rules.push({ type: 'duplicates', config: { keep: 'first' } });
                break;
            case 'standardize':
            case 'format':
                rules.push({ type: 'standardize', config: { columns: 'all' } });
                break;
            case 'trim':
            case 'whitespace':
                rules.push({ type: 'trim_whitespace', config: { columns: 'all' } });
                break;
            case 'outliers':
                rules.push({ type: 'outliers', config: { method: 'iqr', threshold: 1.5 } });
                break;
            default:
                logger_1.logger.warn(`Unknown cleaning rule: ${ruleName}`);
        }
    }
    return rules;
}
// Simplified version - advanced cleaning rules will be implemented later
//# sourceMappingURL=clean.js.map