cleanifix
Version:
Intelligent data cleaning CLI with natural language support - Docker-powered Python engine
373 lines • 13.6 kB
JavaScript
;
/**
* Natural language command parser for Cleanifix CLI.
* Converts user-friendly commands into structured operations.
*/
// Define the `__esModule` marker property (value `true`) on the CommonJS exports object.
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise both exported bindings to `undefined`; the real values are assigned
// after the class declaration further down in this file.
exports.naturalLanguageParser = exports.NaturalLanguageParser = void 0;
class NaturalLanguageParser {
    /**
     * Ordered list of `{ pattern, handler, examples }` entries.
     * `parse` walks this list top to bottom and the first matching pattern wins,
     * so more specific patterns must come before the generic ones they overlap with.
     */
    commandPatterns;
    constructor() {
        this.commandPatterns = this.initializePatterns();
    }
    /**
     * Parse a natural language command into structured format.
     *
     * Matching is case-insensitive, but file targets and paths keep the exact
     * casing the user typed (file names are case-sensitive on many systems).
     * Keyword-like values (operation, format, fill value, help topic) are
     * returned in lower case.
     *
     * @param {string} input - Raw command text. Non-string values are coerced
     *   with `String()`; `null`/`undefined` are treated as an empty command.
     * @returns {{command: string, operation: (string|undefined), targets: string[], options: Object, confidence: number}}
     *   Structured command. When no pattern matches, the keyword-based fallback
     *   from `extractBasicIntent` is returned (confidence 0.3).
     */
    parse(input) {
        const text = this.normalizeText(input);
        // Confidence scoring compares against lower-cased examples.
        const lowered = text.toLowerCase();
        // Try each pattern until we find a match
        for (const pattern of this.commandPatterns) {
            const match = text.match(pattern.pattern);
            if (!match) {
                continue;
            }
            const result = pattern.handler(match);
            return {
                command: result.command || 'unknown',
                operation: result.operation,
                targets: result.targets || [],
                options: result.options || {},
                confidence: this.calculateConfidence(lowered, pattern)
            };
        }
        // If no pattern matches, try to extract basic intent
        return this.extractBasicIntent(text);
    }
    /**
     * Initialize command patterns.
     *
     * Every pattern carries the `i` flag because `parse` matches against
     * case-preserved text. Handlers lower-case keyword captures themselves and
     * pass target/path captures through untouched.
     *
     * @returns {Array<{pattern: RegExp, handler: Function, examples: string[]}>}
     */
    initializePatterns() {
        return [
            // Analyze commands (specific issue type)
            {
                pattern: /^(analyze|check|inspect|examine)\s+(.*?)\s+for\s+(missing values?|duplicates?|format issues?|quality issues?|problems?)$/i,
                handler: (match) => ({
                    command: 'analyze',
                    targets: [match[2]],
                    operation: this.mapAnalysisType(match[3])
                }),
                examples: [
                    'analyze data.csv for missing values',
                    'check my file for duplicates',
                    'inspect dataset for format issues'
                ]
            },
            // Validate commands. Must precede the generic analyze pattern, which
            // would otherwise swallow every "check ..." command.
            {
                pattern: /^(validate|verify|check)\s+(.+?)\s+(?:for\s+)?(completeness|consistency|quality)$/i,
                handler: (match) => ({
                    command: 'validate',
                    targets: [match[2]],
                    operation: match[3].toLowerCase()
                }),
                examples: [
                    'validate data.csv for completeness',
                    'verify file.csv consistency',
                    'check dataset quality'
                ]
            },
            // Analyze commands (generic)
            {
                pattern: /^(analyze|check|inspect|examine)\s+(.+)$/i,
                handler: (match) => ({
                    command: 'analyze',
                    targets: [match[2]],
                    operation: 'all'
                }),
                examples: [
                    'analyze data.csv',
                    'check my dataset',
                    'inspect file.csv'
                ]
            },
            // Deduplication operations. Must precede the specific clean pattern so
            // that a trailing "keeping first|last" is parsed as an option instead
            // of becoming part of the target. Accepts "duplicates" as well as
            // "duplicate rows/records".
            {
                pattern: /^(remove|delete|drop)\s+duplicates?(?:\s+(?:rows?|records?))?\s+(in|from)\s+(.+?)(?:\s+keeping\s+(first|last))?$/i,
                handler: (match) => ({
                    command: 'clean',
                    operation: 'duplicates',
                    targets: [match[3]],
                    options: {
                        keep: (match[4] || 'first').toLowerCase()
                    }
                }),
                examples: [
                    'remove duplicate rows from data.csv',
                    'delete duplicate records in file.csv keeping last',
                    'drop duplicates from dataset keeping first'
                ]
            },
            // Clean commands (specific issue type)
            {
                pattern: /^(clean|fix|remove|handle)\s+(missing values?|duplicates?|format issues?)\s+(in|from)\s+(.+)$/i,
                handler: (match) => ({
                    command: 'clean',
                    operation: this.mapCleaningType(match[2]),
                    targets: [match[4]]
                }),
                examples: [
                    'clean missing values in data.csv',
                    'remove duplicates from my file',
                    'fix format issues in dataset.csv'
                ]
            },
            // Clean commands (generic)
            {
                pattern: /^(clean|fix|cleanup)\s+(.+)$/i,
                handler: (match) => ({
                    command: 'clean',
                    targets: [match[2]],
                    operation: 'all'
                }),
                examples: [
                    'clean data.csv',
                    'fix my dataset',
                    'cleanup file.csv'
                ]
            },
            // Specific missing value operations
            {
                pattern: /^(fill|impute|replace)\s+missing\s+(values?|data)\s+(in|from)\s+(.+?)\s+with\s+(.+)$/i,
                handler: (match) => ({
                    command: 'clean',
                    operation: 'missing',
                    targets: [match[4]],
                    options: {
                        strategy: 'fill',
                        fillValue: match[5].toLowerCase()
                    }
                }),
                examples: [
                    'fill missing values in data.csv with 0',
                    'replace missing data in file.csv with mean',
                    'impute missing values in dataset with median'
                ]
            },
            {
                pattern: /^(drop|remove|delete)\s+(rows?|records?)\s+with\s+missing\s+(values?|data)\s+(in|from)\s+(.+)$/i,
                handler: (match) => ({
                    command: 'clean',
                    operation: 'missing',
                    targets: [match[5]],
                    options: {
                        strategy: 'drop'
                    }
                }),
                examples: [
                    'drop rows with missing values in data.csv',
                    'remove records with missing data from file.csv'
                ]
            },
            // Format standardization ("phone numbers" is accepted as a synonym of "phones")
            {
                pattern: /^(standardize|normalize|format)\s+(dates?|phone\s+numbers?|phones?|emails?|currency|numbers?)\s+(in|from)\s+(.+?)(?:\s+to\s+(.+))?$/i,
                handler: (match) => ({
                    command: 'clean',
                    operation: 'format',
                    targets: [match[4]],
                    options: {
                        type: this.mapFormatType(match[2]),
                        format: match[5] ? match[5].toLowerCase() : 'default'
                    }
                }),
                examples: [
                    'standardize dates in data.csv to ISO',
                    'format phone numbers in file.csv',
                    'normalize emails in dataset.csv'
                ]
            },
            // Transform commands
            {
                pattern: /^(transform|convert|change)\s+(.+?)\s+to\s+(.+)$/i,
                handler: (match) => ({
                    command: 'transform',
                    targets: [match[2]],
                    options: {
                        outputFormat: match[3].toLowerCase()
                    }
                }),
                examples: [
                    'transform data.csv to json',
                    'convert file.xlsx to csv',
                    'change dataset.json to parquet'
                ]
            },
            // Init/config commands
            {
                pattern: /^(init|initialize|setup|configure)(?:\s+(.+))?$/i,
                handler: (match) => ({
                    command: 'init',
                    options: match[2] ? { path: match[2] } : {}
                }),
                examples: [
                    'init',
                    'setup my project',
                    'initialize cleanifix'
                ]
            },
            // Help commands. The word boundary keeps words that merely start with
            // a help keyword (e.g. "whatever", "however") from matching.
            {
                pattern: /^(help|how|what)\b\s*(.*)$/i,
                handler: (match) => ({
                    command: 'help',
                    options: {
                        topic: (match[2] || 'general').toLowerCase()
                    }
                }),
                examples: [
                    'help',
                    'how do I clean missing values',
                    'what commands are available'
                ]
            }
        ];
    }
    /**
     * Clean up raw input without changing its case: coerce to string, trim,
     * collapse whitespace runs to a single space and replace curly quotes with
     * their straight ASCII equivalents.
     *
     * @param {*} input - Raw user input; `null`/`undefined` become `''`.
     * @returns {string} Cleaned, case-preserved text.
     */
    normalizeText(input) {
        return String(input ?? '')
            .trim()
            .replace(/\s+/g, ' ') // Normalize whitespace
            .replace(/[\u2018\u2019]/g, "'") // Curly single quotes -> '
            .replace(/[\u201C\u201D]/g, '"'); // Curly double quotes -> "
    }
    /**
     * Normalize input for better pattern matching: same clean-up as
     * `normalizeText`, then lower-cased.
     *
     * @param {*} input - Raw user input.
     * @returns {string} Cleaned, lower-cased text.
     */
    normalizeInput(input) {
        return this.normalizeText(input).toLowerCase();
    }
    /**
     * Map analysis type from natural language.
     *
     * @param {string} type - Phrase captured from the command (any case).
     * @returns {string} Operation key; `'all'` for unrecognised phrases.
     */
    mapAnalysisType(type) {
        const typeMap = {
            'missing values': 'missing',
            'missing value': 'missing',
            'duplicates': 'duplicates',
            'duplicate': 'duplicates',
            'format issues': 'format',
            'format issue': 'format',
            'quality issues': 'quality',
            'quality issue': 'quality',
            'problems': 'all',
            'problem': 'all'
        };
        return typeMap[type.toLowerCase()] || 'all';
    }
    /**
     * Map cleaning type from natural language.
     *
     * @param {string} type - Phrase captured from the command (any case).
     * @returns {string} Operation key; `'all'` for unrecognised phrases.
     */
    mapCleaningType(type) {
        const typeMap = {
            'missing values': 'missing',
            'missing value': 'missing',
            'duplicates': 'duplicates',
            'duplicate': 'duplicates',
            'format issues': 'format',
            'format issue': 'format'
        };
        return typeMap[type.toLowerCase()] || 'all';
    }
    /**
     * Map the data-type phrase of a format command to its singular key,
     * e.g. `dates` -> `date`, `phone numbers` -> `phone`.
     *
     * @param {string} type - Phrase captured from the command (any case).
     * @returns {string} Lower-cased singular type name.
     */
    mapFormatType(type) {
        const lowered = type.toLowerCase();
        if (lowered.startsWith('phone')) {
            return 'phone';
        }
        return lowered.replace(/s$/, '');
    }
    /**
     * Calculate confidence score for a match.
     *
     * Starts at 0.8 for any pattern match and is raised to 0.9 or 0.95 when the
     * input shares more than 70% / 90% of its words with one of the pattern's
     * examples.
     *
     * @param {string} input - Lower-cased, normalized input.
     * @param {{examples: string[]}} pattern - The pattern entry that matched.
     * @returns {number} Confidence: 0.8, 0.9 or 0.95.
     */
    calculateConfidence(input, pattern) {
        // Base confidence from pattern match
        let confidence = 0.8;
        // Boost confidence if input closely matches an example
        for (const example of pattern.examples) {
            const similarity = this.calculateSimilarity(input, example.toLowerCase());
            if (similarity > 0.9) {
                confidence = Math.max(confidence, 0.95);
                break;
            }
            if (similarity > 0.7) {
                confidence = Math.max(confidence, 0.9);
            }
        }
        return confidence;
    }
    /**
     * Calculate similarity between two strings as the number of words of
     * `str1` that also occur in `str2`, divided by the larger word count.
     *
     * @param {string} str1
     * @param {string} str2
     * @returns {number} Ratio of shared words to the larger word count.
     */
    calculateSimilarity(str1, str2) {
        const words1 = str1.split(' ');
        const words2 = str2.split(' ');
        const commonWords = words1.filter((word) => words2.includes(word));
        return commonWords.length / Math.max(words1.length, words2.length);
    }
    /**
     * Extract basic intent when no pattern matches.
     *
     * Keyword detection is case-insensitive. The detected file keeps the
     * caller's casing and any directory prefix (`data/Sales.csv`). The operation
     * uses the same vocabulary as the pattern handlers (`duplicates`, not
     * `duplicate`).
     *
     * @param {string} input - Command text (any case).
     * @returns {{command: string, operation: (string|undefined), targets: string[], options: Object, confidence: number}}
     *   Best-effort result with a fixed confidence of 0.3.
     */
    extractBasicIntent(input) {
        const text = String(input ?? '');
        const lowered = text.toLowerCase();
        // Look for file paths: optional directory/dotted prefix, then a name
        // ending in one of the supported extensions.
        const fileMatch = text.match(/[\w\-./\\~:]*[\w\-]\.(?:csv|json|xlsx?|parquet|tsv)\b/i);
        const file = fileMatch ? fileMatch[0] : undefined;
        // Look for command keywords (first hit in list order wins)
        const commands = ['analyze', 'clean', 'transform', 'validate', 'init', 'help', 'config'];
        const detectedCommand = commands.find((cmd) => lowered.includes(cmd)) ?? 'unknown';
        // Look for operation keywords: [keyword to search for, operation to report]
        const operations = [
            ['missing', 'missing'],
            ['duplicate', 'duplicates'],
            ['format', 'format'],
            ['quality', 'quality']
        ];
        const operationHit = operations.find(([keyword]) => lowered.includes(keyword));
        return {
            command: detectedCommand,
            operation: operationHit ? operationHit[1] : undefined,
            targets: file ? [file] : [],
            options: {},
            confidence: 0.3
        };
    }
    /**
     * Get suggestions for incomplete commands.
     *
     * Returns examples that start with the (normalized) input; if there are
     * none, falls back to examples containing the last word typed.
     *
     * @param {string} partialInput - What the user has typed so far.
     * @returns {string[]} Up to 5 distinct example commands.
     */
    getSuggestions(partialInput) {
        const normalized = this.normalizeInput(partialInput);
        const suggestions = [];
        // Collect all examples that start with the input
        for (const pattern of this.commandPatterns) {
            for (const example of pattern.examples) {
                if (example.toLowerCase().startsWith(normalized)) {
                    suggestions.push(example);
                }
            }
        }
        // If no exact matches, look for partial matches
        if (suggestions.length === 0) {
            const words = normalized.split(' ');
            const lastWord = words[words.length - 1];
            for (const pattern of this.commandPatterns) {
                for (const example of pattern.examples) {
                    if (example.toLowerCase().includes(lastWord)) {
                        suggestions.push(example);
                    }
                }
            }
        }
        // Remove duplicates and limit to 5 suggestions
        return [...new Set(suggestions)].slice(0, 5);
    }
    /**
     * Get all available command examples, grouped by the first word of each
     * pattern's first example.
     *
     * @returns {Object<string, string[]>} Map from leading verb to examples.
     */
    getExamples() {
        const examples = {};
        for (const pattern of this.commandPatterns) {
            const command = pattern.examples[0].split(' ')[0];
            if (!examples[command]) {
                examples[command] = [];
            }
            examples[command].push(...pattern.examples);
        }
        return examples;
    }
}
// Expose the parser class itself.
exports.NaturalLanguageParser = NaturalLanguageParser;
// Export singleton instance: constructed once, when this module is first loaded.
exports.naturalLanguageParser = new NaturalLanguageParser();
//# sourceMappingURL=natural-language.js.map