UNPKG

cleanifix

Version:

Intelligent data cleaning CLI with natural language support - Docker-powered Python engine

373 lines 13.6 kB
"use strict"; /** * Natural language command parser for Cleanifix CLI. * Converts user-friendly commands into structured operations. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.naturalLanguageParser = exports.NaturalLanguageParser = void 0; class NaturalLanguageParser { commandPatterns; constructor() { this.commandPatterns = this.initializePatterns(); } /** * Parse a natural language command into structured format */ parse(input) { const normalizedInput = this.normalizeInput(input); // Try each pattern until we find a match for (const pattern of this.commandPatterns) { const match = normalizedInput.match(pattern.pattern); if (match) { const result = pattern.handler(match); return { command: result.command || 'unknown', operation: result.operation, targets: result.targets || [], options: result.options || {}, confidence: this.calculateConfidence(normalizedInput, pattern) }; } } // If no pattern matches, try to extract basic intent return this.extractBasicIntent(normalizedInput); } /** * Initialize command patterns */ initializePatterns() { return [ // Analyze commands { pattern: /^(analyze|check|inspect|examine)\s+(.*?)\s+for\s+(missing values?|duplicates?|format issues?|quality issues?|problems?)$/i, handler: (match) => ({ command: 'analyze', targets: [match[2]], operation: this.mapAnalysisType(match[3]) }), examples: [ 'analyze data.csv for missing values', 'check my file for duplicates', 'inspect dataset for format issues' ] }, { pattern: /^(analyze|check|inspect|examine)\s+(.+)$/i, handler: (match) => ({ command: 'analyze', targets: [match[2]], operation: 'all' }), examples: [ 'analyze data.csv', 'check my dataset', 'inspect file.csv' ] }, // Clean commands { pattern: /^(clean|fix|remove|handle)\s+(missing values?|duplicates?|format issues?)\s+(in|from)\s+(.+)$/i, handler: (match) => ({ command: 'clean', operation: this.mapCleaningType(match[2]), targets: [match[4]] }), examples: [ 'clean missing values in data.csv', 'remove duplicates from my file', 'fix format issues in dataset.csv' ] }, { pattern: /^(clean|fix|cleanup)\s+(.+)$/i, handler: (match) => ({ command: 'clean', targets: [match[2]], operation: 'all' }), examples: [ 'clean data.csv', 'fix my dataset', 'cleanup file.csv' ] }, // Specific missing value operations { pattern: /^(fill|impute|replace)\s+missing\s+(values?|data)\s+(in|from)\s+(.+?)\s+with\s+(.+)$/i, handler: (match) => ({ command: 'clean', operation: 'missing', targets: [match[4]], options: { strategy: 'fill', fillValue: match[5] } }), examples: [ 'fill missing values in data.csv with 0', 'replace missing data in file.csv with mean', 'impute missing values in dataset with median' ] }, { pattern: /^(drop|remove|delete)\s+(rows?|records?)\s+with\s+missing\s+(values?|data)\s+(in|from)\s+(.+)$/i, handler: (match) => ({ command: 'clean', operation: 'missing', targets: [match[5]], options: { strategy: 'drop' } }), examples: [ 'drop rows with missing values in data.csv', 'remove records with missing data from file.csv' ] }, // Deduplication operations { pattern: /^(remove|delete|drop)\s+duplicate\s+(rows?|records?)\s+(in|from)\s+(.+?)(?:\s+keeping\s+(first|last))?$/i, handler: (match) => ({ command: 'clean', operation: 'duplicates', targets: [match[4]], options: { keep: match[5] || 'first' } }), examples: [ 'remove duplicate rows from data.csv', 'delete duplicate records in file.csv keeping last', 'drop duplicates from dataset keeping first' ] }, // Format standardization { pattern: /^(standardize|normalize|format)\s+(dates?|phones?|emails?|currency|numbers?)\s+(in|from)\s+(.+?)(?:\s+to\s+(.+))?$/i, handler: (match) => ({ command: 'clean', operation: 'format', targets: [match[4]], options: { type: match[2].replace(/s$/, ''), format: match[5] || 'default' } }), examples: [ 'standardize dates in data.csv to ISO', 'format phone numbers in file.csv', 'normalize emails in dataset.csv' ] }, // Transform commands { pattern: /^(transform|convert|change)\s+(.+?)\s+to\s+(.+)$/i, handler: (match) => ({ command: 'transform', targets: [match[2]], options: { outputFormat: match[3] } }), examples: [ 'transform data.csv to json', 'convert file.xlsx to csv', 'change dataset.json to parquet' ] }, // Validate commands { pattern: /^(validate|verify|check)\s+(.+?)\s+(?:for\s+)?(completeness|consistency|quality)$/i, handler: (match) => ({ command: 'validate', targets: [match[2]], operation: match[3] }), examples: [ 'validate data.csv for completeness', 'verify file.csv consistency', 'check dataset quality' ] }, // Init/config commands { pattern: /^(init|initialize|setup|configure)(?:\s+(.+))?$/i, handler: (match) => ({ command: 'init', options: match[2] ? { path: match[2] } : {} }), examples: [ 'init', 'setup my project', 'initialize cleanifix' ] }, // Help commands { pattern: /^(help|how|what)\s*(.*)?$/i, handler: (match) => ({ command: 'help', options: { topic: match[2] || 'general' } }), examples: [ 'help', 'how do I clean missing values', 'what commands are available' ] } ]; } /** * Normalize input for better pattern matching */ normalizeInput(input) { return input .trim() .toLowerCase() .replace(/\s+/g, ' ') // Normalize whitespace .replace(/['']/g, "'") // Normalize quotes .replace(/[""]/g, '"'); } /** * Map analysis type from natural language */ mapAnalysisType(type) { const typeMap = { 'missing values': 'missing', 'missing value': 'missing', 'duplicates': 'duplicates', 'duplicate': 'duplicates', 'format issues': 'format', 'format issue': 'format', 'quality issues': 'quality', 'quality issue': 'quality', 'problems': 'all', 'problem': 'all' }; return typeMap[type.toLowerCase()] || 'all'; } /** * Map cleaning type from natural language */ mapCleaningType(type) { const typeMap = { 'missing values': 'missing', 'missing value': 'missing', 'duplicates': 'duplicates', 'duplicate': 'duplicates', 'format issues': 'format', 'format issue': 'format' }; return typeMap[type.toLowerCase()] || 'all'; } /** * Calculate confidence score for a match */ calculateConfidence(input, pattern) { // Base confidence from pattern match let confidence = 0.8; // Boost confidence if input closely matches an example for (const example of pattern.examples) { const similarity = this.calculateSimilarity(input, example.toLowerCase()); if (similarity > 0.9) { confidence = Math.max(confidence, 0.95); break; } else if (similarity > 0.7) { confidence = Math.max(confidence, 0.9); } } return confidence; } /** * Calculate similarity between two strings */ calculateSimilarity(str1, str2) { const words1 = str1.split(' '); const words2 = str2.split(' '); const commonWords = words1.filter(word => words2.includes(word)); return commonWords.length / Math.max(words1.length, words2.length); } /** * Extract basic intent when no pattern matches */ extractBasicIntent(input) { // Look for file paths const fileMatch = input.match(/\b[\w\-]+\.(csv|json|xlsx?|parquet|tsv)\b/i); const file = fileMatch ? fileMatch[0] : undefined; // Look for command keywords const commands = ['analyze', 'clean', 'transform', 'validate', 'init', 'help', 'config']; let detectedCommand = 'unknown'; for (const cmd of commands) { if (input.includes(cmd)) { detectedCommand = cmd; break; } } // Look for operation keywords const operations = ['missing', 'duplicate', 'format', 'quality']; let detectedOperation = undefined; for (const op of operations) { if (input.includes(op)) { detectedOperation = op; break; } } return { command: detectedCommand, operation: detectedOperation, targets: file ? [file] : [], options: {}, confidence: 0.3 }; } /** * Get suggestions for incomplete commands */ getSuggestions(partialInput) { const normalized = this.normalizeInput(partialInput); const suggestions = []; // Collect all examples that start with the input for (const pattern of this.commandPatterns) { for (const example of pattern.examples) { if (example.toLowerCase().startsWith(normalized)) { suggestions.push(example); } } } // If no exact matches, look for partial matches if (suggestions.length === 0) { const words = normalized.split(' '); const lastWord = words[words.length - 1]; for (const pattern of this.commandPatterns) { for (const example of pattern.examples) { if (example.toLowerCase().includes(lastWord)) { suggestions.push(example); } } } } // Remove duplicates and limit to 5 suggestions return [...new Set(suggestions)].slice(0, 5); } /** * Get all available command examples */ getExamples() { const examples = {}; for (const pattern of this.commandPatterns) { const command = pattern.examples[0].split(' ')[0]; if (!examples[command]) { examples[command] = []; } examples[command].push(...pattern.examples); } return examples; } } exports.NaturalLanguageParser = NaturalLanguageParser; // Export singleton instance exports.naturalLanguageParser = new NaturalLanguageParser(); //# sourceMappingURL=natural-language.js.map