cleanifix
Version:
Intelligent data cleaning CLI with natural language support - Docker-powered Python engine
222 lines • 8.63 kB
JavaScript
;
/**
 * TypeScript interop helper: re-exports property `key` of `source` on `target`
 * (optionally under the name `alias`).
 *
 * When property descriptors are available the source descriptor is reused
 * only if it is an accessor on an `__esModule` object, or a data property that
 * is neither writable nor configurable; in every other case a fresh enumerable
 * getter is installed so the binding reads through to `source[key]`.
 * Without `Object.create` the value is simply copied.
 */
var __createBinding = (this && this.__createBinding) || (Object.create
    ? function (target, source, key, alias) {
        const exportName = alias === undefined ? key : alias;
        let descriptor = Object.getOwnPropertyDescriptor(source, key);
        let reusable;
        if (!descriptor) {
            reusable = false;
        }
        else if ("get" in descriptor) {
            // Accessor descriptors are only trusted on real ES modules.
            reusable = Boolean(source.__esModule);
        }
        else {
            // A data property that can still change needs a read-through getter.
            reusable = !(descriptor.writable || descriptor.configurable);
        }
        if (!reusable) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                }
            };
        }
        Object.defineProperty(target, exportName, descriptor);
    }
    : function (target, source, key, alias) {
        const exportName = alias === undefined ? key : alias;
        target[exportName] = source[key];
    });
/**
 * TypeScript interop helper: stores `value` as the `default` export of
 * `target`. Uses an enumerable property definition when `Object.create` is
 * available, otherwise a plain assignment.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create
    ? function (target, value) {
        const defaultDescriptor = { enumerable: true, value: value };
        Object.defineProperty(target, "default", defaultDescriptor);
    }
    : function (target, value) {
        target["default"] = value;
    });
/**
 * TypeScript interop helper backing `import * as ns from "..."`.
 *
 * Objects flagged `__esModule` are returned unchanged. Anything else is
 * wrapped in a new namespace object: every own property except "default" is
 * bound via `__createBinding`, and the original value becomes the namespace's
 * `default` via `__setModuleDefault`.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use, then replaces itself with the chosen implementation.
    var listOwnKeys = function (subject) {
        listOwnKeys = Object.getOwnPropertyNames || function (target) {
            var names = [];
            for (var name in target) {
                if (Object.prototype.hasOwnProperty.call(target, name)) {
                    names.push(name);
                }
            }
            return names;
        };
        return listOwnKeys(subject);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            for (const key of listOwnKeys(mod)) {
                if (key !== "default") {
                    __createBinding(namespace, mod, key);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.cleanCommand = cleanCommand;
const fs_1 = require("fs");
const path = __importStar(require("path"));
const logger_1 = require("../utils/logger");
const python_bridge_1 = require("../services/python-bridge");
const spinner_1 = require("../utils/spinner");
/**
 * CLI handler for the `clean` command.
 *
 * Starts the Python bridge, validates the input file and output path, resolves
 * the requested cleaning rules, optionally backs up the input, asks the bridge
 * to clean the data and optionally writes a JSON report next to the output.
 *
 * On any failure the error is logged, the Python bridge is shut down and the
 * process exits with status 1.
 *
 * @param {string} input - Path to the file to clean.
 * @param {object} options - Parsed CLI options.
 * @param {string} options.output - Destination path (required).
 * @param {string} [options.config] - JSON file whose `rules` array lists cleaning rules.
 * @param {string} [options.rules] - Comma-separated inline rule names.
 * @param {boolean} [options.backup] - Copy the input to `<input>.backup` first (skipped on dry run).
 * @param {boolean} [options.dryRun] - Forwarded to the bridge as `dry_run`.
 * @param {boolean} [options.duplicates] - Enable duplicate cleaning.
 * @param {boolean} [options.missing] - Enable missing-value cleaning.
 * @param {string} [options.strategy] - Strategy forwarded to the bridge.
 * @param {string} [options.subset] - Comma-separated columns for duplicate detection.
 * @param {string} [options.aggregationRules] - JSON string of aggregation rules.
 * @param {*} [options.fillValue] - Fill value for missing data.
 * @param {string} [options.columns] - Comma-separated columns for missing-value cleaning.
 * @param {string|number} [options.threshold] - Numeric threshold for missing-value cleaning.
 * @param {boolean} [options.report] - Write `<output>.report.json` after cleaning.
 * @returns {Promise<void>}
 */
async function cleanCommand(input, options) {
    const spin = (0, spinner_1.spinner)();
    let pythonBridge = null;
    let failed = false;
    try {
        // Initialize Python bridge
        pythonBridge = new python_bridge_1.PythonBridge();
        await pythonBridge.initialize();
        // Validate input file exists
        const inputPath = path.resolve(input);
        const inputStats = await fs_1.promises.stat(inputPath);
        if (!inputStats.isFile()) {
            throw new Error(`Input must be a file, not a directory: ${input}`);
        }
        // Get output path from options
        if (!options.output) {
            throw new Error('Output path is required. Use --output <path>');
        }
        // Ensure output directory exists
        const outputPath = path.resolve(options.output);
        await fs_1.promises.mkdir(path.dirname(outputPath), { recursive: true });
        // NOTE(review): the resolved rules are not forwarded to the bridge yet;
        // resolving them still surfaces unreadable/invalid config files and
        // unknown inline rule names before any work is done.
        await resolveCleaningRules(options);
        // Create backup if requested
        if (options.backup && !options.dryRun) {
            const backupPath = `${inputPath}.backup`;
            await fs_1.promises.copyFile(inputPath, backupPath);
            logger_1.logger.info(`Created backup: ${backupPath}`);
        }
        spin.start('Loading data...');
        const cleaningOptions = buildCleaningOptions(options);
        // Clean the data
        spin.start('Cleaning data...');
        const cleanResult = await pythonBridge.clean(inputPath, outputPath, cleaningOptions);
        if (!cleanResult.success) {
            // Avoid printing "undefined" when the bridge gives no error details.
            throw new Error(`Failed to clean data: ${cleanResult.error?.message ?? 'unknown error'}`);
        }
        spin.stop();
        // Display message from result if available
        if (cleanResult.message) {
            logger_1.logger.info(cleanResult.message);
        }
        // Check for warnings in the result
        if (cleanResult.warning) {
            logger_1.logger.warn(cleanResult.warning);
            console.log(cleanResult.warning); // Also output to stdout for test
        }
        logger_1.logger.info(`✅ Data cleaning completed successfully!`);
        if (!options.dryRun) {
            logger_1.logger.info(`📁 Cleaned data saved to: ${outputPath}`);
        }
        else {
            logger_1.logger.info(`🔍 Dry run completed - no files were modified`);
        }
        // Generate report if requested
        if (options.report) {
            const reportPath = `${outputPath}.report.json`;
            const report = {
                timestamp: new Date().toISOString(),
                input_file: inputPath,
                output_file: outputPath,
                dry_run: options.dryRun || false,
                success: true,
                message: 'Data cleaning completed successfully'
            };
            await fs_1.promises.writeFile(reportPath, JSON.stringify(report, null, 2));
            logger_1.logger.info(`📄 Cleaning report saved to: ${reportPath}`);
        }
    }
    catch (error) {
        spin.stop();
        logger_1.logger.error('Cleaning failed:', error);
        // Do not exit here: process.exit() terminates immediately and would
        // skip the bridge shutdown in the finally block below.
        failed = true;
    }
    finally {
        try {
            // Always cleanup Python bridge
            if (pythonBridge) {
                await pythonBridge.shutdown();
            }
        }
        finally {
            // Exit only after cleanup was attempted, even if shutdown itself threw.
            if (failed) {
                process.exit(1);
            }
        }
    }
}
/**
 * Resolves the cleaning rules for a run: from the JSON config file when
 * `options.config` is set, else from the inline `options.rules` string, else a
 * default set (drop missing values, drop duplicates, standardize all columns).
 *
 * @param {object} options - Parsed CLI options.
 * @returns {Promise<Array<object>>} The rule descriptors.
 * @throws If the config file cannot be read or is not valid JSON.
 */
async function resolveCleaningRules(options) {
    if (options.config) {
        // Load rules from config file
        const configPath = path.resolve(options.config);
        const configContent = await fs_1.promises.readFile(configPath, 'utf-8');
        const config = JSON.parse(configContent);
        return config.rules || [];
    }
    if (options.rules) {
        // Parse inline rules
        return parseInlineRules(options.rules);
    }
    // Default rules if none specified
    return [
        { type: 'missing_values', config: { strategy: 'drop' } },
        { type: 'duplicates', config: { keep: 'first' } },
        { type: 'standardize', config: { columns: 'all' } }
    ];
}
/**
 * Translates CLI options into the options object passed to the Python bridge.
 *
 * @param {object} options - Parsed CLI options.
 * @returns {object} Bridge options (`backup`, `dry_run`, plus the duplicate
 *   and missing-value settings that were requested).
 */
function buildCleaningOptions(options) {
    const splitColumns = (list) => list.split(',').map((c) => c.trim());
    const cleaningOptions = {
        backup: options.backup,
        dry_run: options.dryRun
    };
    // Handle duplicate cleaning
    if (options.duplicates) {
        cleaningOptions.duplicates = true;
        cleaningOptions.strategy = options.strategy || 'first';
        if (options.subset) {
            cleaningOptions.subset = splitColumns(options.subset);
        }
        if (options.aggregationRules) {
            try {
                cleaningOptions.aggregation_rules = JSON.parse(options.aggregationRules);
            }
            catch {
                logger_1.logger.warn('Invalid aggregation rules JSON, ignoring');
            }
        }
    }
    // Handle missing value cleaning (can be combined with duplicates)
    if (options.missing) {
        cleaningOptions.missing = true;
        // NOTE(review): this overwrites the strategy chosen for duplicates when
        // both flags are set — confirm that is what the Python side expects.
        cleaningOptions.strategy = options.strategy || 'drop';
        // Null-check instead of truthiness so a fill value of 0 is not dropped.
        if (options.fillValue != null) {
            cleaningOptions.fill_value = options.fillValue;
        }
        if (options.columns) {
            cleaningOptions.columns = splitColumns(options.columns);
        }
        if (options.threshold != null) {
            const threshold = Number.parseFloat(options.threshold);
            if (Number.isNaN(threshold)) {
                // Same policy as invalid aggregation rules: warn and skip.
                logger_1.logger.warn('Invalid threshold value, ignoring');
            }
            else {
                cleaningOptions.threshold = threshold;
            }
        }
    }
    return cleaningOptions;
}
/**
 * Converts a comma-separated list of rule names into rule descriptors.
 *
 * Recognised names (aliases grouped): `missing`/`missing_values`,
 * `duplicates`/`dedup`, `standardize`/`format`, `trim`/`whitespace`,
 * `outliers`. Names are trimmed before lookup; anything unrecognised is
 * reported through the logger and skipped.
 *
 * @param {string} rulesString - Comma-separated rule names.
 * @returns {Array<{type: string, config: object}>} Descriptors in input order.
 */
function parseInlineRules(rulesString) {
    // Factories (not shared objects) so each occurrence yields a fresh descriptor.
    const missingValues = () => ({ type: 'missing_values', config: { strategy: 'drop' } });
    const duplicates = () => ({ type: 'duplicates', config: { keep: 'first' } });
    const standardize = () => ({ type: 'standardize', config: { columns: 'all' } });
    const trimWhitespace = () => ({ type: 'trim_whitespace', config: { columns: 'all' } });
    const outliers = () => ({ type: 'outliers', config: { method: 'iqr', threshold: 1.5 } });
    // A Map keeps lookups free of inherited keys such as "constructor".
    const factoriesByName = new Map([
        ['missing', missingValues],
        ['missing_values', missingValues],
        ['duplicates', duplicates],
        ['dedup', duplicates],
        ['standardize', standardize],
        ['format', standardize],
        ['trim', trimWhitespace],
        ['whitespace', trimWhitespace],
        ['outliers', outliers]
    ]);
    return rulesString.split(',').reduce((collected, rawName) => {
        const ruleName = rawName.trim();
        const createRule = factoriesByName.get(ruleName);
        if (createRule === undefined) {
            logger_1.logger.warn(`Unknown cleaning rule: ${ruleName}`);
        }
        else {
            collected.push(createRule());
        }
        return collected;
    }, []);
}
// Simplified version - advanced cleaning rules will be implemented later
//# sourceMappingURL=clean.js.map