@dbclean/cli
Version:
Transform messy CSV data into clean, standardized datasets using AI-powered automation
779 lines (652 loc) • 30 kB
JavaScript
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import csv from 'csv-parser';
import axios from 'axios';
import chalk from 'chalk';
// ES modules do not provide __filename/__dirname, so derive them from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// API Configuration
// Base URL of the remote service; the dedupe endpoint path is appended to it when sending requests.
const API_BASE_URL = 'https://dbclean-api.dbcleandev.workers.dev';
/**
 * Load configuration from config.json.
 *
 * The file is looked up, in order, in the current working directory, in the
 * parent of this module's directory, and in this module's directory. The first
 * existing file is parsed and returned with an extra `_configPath` property
 * recording where it was found.
 *
 * Any failure (file missing everywhere, unreadable, invalid JSON) is reported
 * as a warning on stdout and a built-in default configuration is returned.
 *
 * @returns {object} The parsed configuration, or the default configuration.
 */
function loadConfig() {
  const fallbackConfig = {
    data_dir: "data",
    data_cleaned_file_path: "data_cleaned.csv",
    data_deduped_file_path: "data_deduped.csv",
    outputs_dir: "outputs"
  };

  try {
    const candidates = [
      path.join(process.cwd(), 'config.json'),
      path.join(__dirname, '..', 'config.json'),
      path.join(__dirname, 'config.json')
    ];

    // Only the first existing candidate is ever read; a parse error there
    // falls through to the defaults rather than trying the next location.
    const foundPath = candidates.find((candidate) => fs.existsSync(candidate));
    if (foundPath === undefined) {
      throw new Error('config.json not found in any expected location');
    }

    const parsed = JSON.parse(fs.readFileSync(foundPath, 'utf-8'));
    parsed._configPath = foundPath;
    return parsed;
  } catch (error) {
    console.log(`⚠️ Warning: Could not load config.json: ${error.message}`);
    return fallbackConfig;
  }
}
// Configuration is resolved once, when the module is first imported.
const config = loadConfig();

// All data/output locations are resolved against the directory the process
// was started from, not against this module's location.
const workingDir = process.cwd();
const dataDir = path.join(workingDir, config.data_dir || 'data');
const outputsDir = path.join(workingDir, config.outputs_dir || 'outputs');

// Create the data and outputs directories up front if they are missing.
for (const requiredDir of [dataDir, outputsDir]) {
  if (!fs.existsSync(requiredDir)) {
    fs.mkdirSync(requiredDir, { recursive: true });
  }
}

// Input/output CSVs live in the data directory; mapping, log and report live in outputs.
const INPUT_CSV_PATH = path.join(dataDir, config.data_cleaned_file_path || 'data_cleaned.csv');
const OUTPUT_CSV_PATH = path.join(dataDir, config.data_deduped_file_path || 'data_deduped.csv');
const COLUMN_MAPPING_PATH = path.join(outputsDir, 'column_mapping.json');
const DEDUPE_LOG_PATH = path.join(outputsDir, 'dedupe_log.txt');
const DEDUPE_REPORT_PATH = path.join(outputsDir, 'dedupe_report.txt');
/**
 * Read and parse the full column mapping JSON file.
 *
 * A missing file, an unreadable file, or invalid JSON is reported on stdout
 * and results in `null` rather than an exception.
 *
 * @returns {object|null} The parsed mapping, or `null` when it is unavailable.
 */
function getColumnMapping() {
  let parsedMapping = null;

  try {
    if (fs.existsSync(COLUMN_MAPPING_PATH)) {
      const rawJson = fs.readFileSync(COLUMN_MAPPING_PATH, 'utf-8');
      parsedMapping = JSON.parse(rawJson);
    } else {
      console.log(`⚠️ Column mapping file not found: ${COLUMN_MAPPING_PATH}`);
    }
  } catch (error) {
    console.log(`⚠️ Error reading column mapping: ${error.message}`);
    parsedMapping = null;
  }

  return parsedMapping;
}
/**
 * Collect the columns that the column mapping flags as unique.
 *
 * Only entries whose `unique` property is strictly `true` are returned. Each
 * result carries the original column name (the mapping key), the mapped name,
 * and whether the column is excluded (defaulting to `false`).
 *
 * @returns {Array<{originalName: string, mappedName: *, isExcluded: *}>}
 *   The unique columns; an empty array when the mapping is unavailable or
 *   an error occurs while inspecting it.
 */
function getUniqueColumns() {
  try {
    const columnMapping = getColumnMapping();
    if (!columnMapping) {
      return [];
    }

    return Object.entries(columnMapping)
      .filter(([, mapping]) => mapping.unique === true)
      .map(([originalName, mapping]) => ({
        originalName,
        mappedName: mapping.name,
        isExcluded: mapping.isExcluded || false
      }));
  } catch (error) {
    console.log(`⚠️ Error reading column mapping: ${error.message}`);
    return [];
  }
}
/**
 * Create mapped headers from original headers using the column mapping.
 *
 * Each original header is replaced by the `name` of its mapping entry. A
 * header is left unchanged when the mapping file is unavailable, when the
 * header has no entry of its own in the mapping, or when the entry carries no
 * usable `name`.
 *
 * @param {string[]} originalHeaders - Header names as read from the CSV.
 * @returns {string[]} Headers in the same order, mapped where possible.
 */
function createMappedHeaders(originalHeaders) {
  const columnMapping = getColumnMapping();
  if (!columnMapping) {
    return originalHeaders; // Fallback to original headers
  }

  return originalHeaders.map((originalHeader) => {
    // Own-property check: a CSV header such as "constructor" or "toString"
    // must not resolve to a member inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(columnMapping, originalHeader)) {
      return originalHeader;
    }

    const mapping = columnMapping[originalHeader];
    // A missing/empty name would otherwise produce an `undefined` header cell.
    return mapping && mapping.name ? mapping.name : originalHeader;
  });
}
/**
 * Quote a CSV header when it contains a comma, a double quote, or a line
 * break (either `\n` or `\r`).
 *
 * Embedded double quotes are escaped by doubling them. Non-string values and
 * strings that need no quoting are returned unchanged.
 *
 * @param {*} header - The header value to emit.
 * @returns {*} The quoted header string, or the input unchanged.
 */
function quoteCsvHeader(header) {
  // A bare carriage return is also a line break for CSV readers, so it must
  // trigger quoting just like "\n".
  if (typeof header === 'string' && /[",\r\n]/.test(header)) {
    return `"${header.replace(/"/g, '""')}"`;
  }
  return header;
}
/**
 * Finds groups of likely-duplicate CSV records by fuzzy-matching the values
 * of the configured unique columns, formats those groups for an AI service,
 * interprets the service's answer, and produces the cleaned data and a report.
 */
class CSVDeduplicator {
  /**
   * @param {object} [options]
   * @param {Array<{originalName: string}>} [options.uniqueColumns] - Columns whose values are compared.
   * @param {number} [options.threshold=0.85] - Minimum similarity (0-1) for two records to be grouped.
   * @param {string} [options.strategy='levenshtein'] - 'levenshtein', 'jaccard' or 'combined'.
   * @param {boolean} [options.showInput=false] - Show formatted input without sending to AI.
   * @param {string|null} [options.email] - Sent as the X-Email header.
   * @param {string|null} [options.apiKey] - Sent as the X-API-Key header.
   * @param {string|null} [options.model] - Model identifier forwarded to the service.
   */
  constructor(options = {}) {
    this.config = {
      uniqueColumns: options.uniqueColumns || [], // Columns to use for deduplication
      threshold: options.threshold || 0.85, // Similarity threshold
      strategy: options.strategy || 'levenshtein', // Matching strategy
      showInput: options.showInput || false, // Show formatted input without sending to AI
      email: options.email || null,
      apiKey: options.apiKey || null,
      model: options.model || null
    };
    this.stats = {
      originalCount: 0,
      duplicateGroups: 0,
      duplicatesRemoved: 0,
      finalCount: 0
    };
  }

  /**
   * Levenshtein edit distance between two strings.
   *
   * Only the previous and current rows of the dynamic-programming table are
   * kept, because each cell depends on nothing older than the previous row.
   *
   * @param {string} str1
   * @param {string} str2
   * @returns {number} Number of single-character edits.
   */
  levenshteinDistance(str1, str2) {
    if (str1 === str2) return 0;
    if (str1.length === 0) return str2.length;
    if (str2.length === 0) return str1.length;

    let previousRow = Array.from({ length: str1.length + 1 }, (_, i) => i);
    for (let j = 1; j <= str2.length; j++) {
      const currentRow = [j];
      for (let i = 1; i <= str1.length; i++) {
        const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
        currentRow[i] = Math.min(
          currentRow[i - 1] + 1, // deletion
          previousRow[i] + 1, // insertion
          previousRow[i - 1] + cost // substitution
        );
      }
      previousRow = currentRow;
    }
    return previousRow[str1.length];
  }

  /**
   * Similarity ratio in [0, 1] derived from the edit distance, normalised by
   * the longer string's length. Two empty strings are fully similar (1).
   */
  similarity(str1, str2) {
    const maxLen = Math.max(str1.length, str2.length);
    if (maxLen === 0) return 1;
    return 1 - (this.levenshteinDistance(str1, str2) / maxLen);
  }

  /**
   * Jaccard similarity over the sets of whitespace-separated words.
   * Returns 0 when neither string contains a word.
   */
  jaccardSimilarity(str1, str2) {
    const words1 = str1.split(/\s+/).filter(w => w.length > 0);
    const words2 = str2.split(/\s+/).filter(w => w.length > 0);
    const set1 = new Set(words1);
    const set2 = new Set(words2);
    const intersection = new Set([...set1].filter(x => set2.has(x)));
    const union = new Set([...set1, ...set2]);
    return union.size === 0 ? 0 : intersection.size / union.size;
  }

  /**
   * Normalise a value for comparison: lower-case, replace every character
   * other than word characters, whitespace, apostrophes and hyphens with a
   * space, collapse whitespace runs, and trim. Falsy input yields ''.
   */
  normalize(str) {
    if (!str) return '';
    return str
      .toString()
      .toLowerCase()
      .trim()
      .replace(/[^\w\s'-]/g, ' ') // Keep apostrophes and hyphens
      .replace(/\s+/g, ' ') // Normalize spaces
      .trim();
  }

  /**
   * Build the comparison string for a record: the normalised, non-empty values
   * of all unique columns joined by single spaces. Returns '' when no unique
   * columns are configured or all values are empty.
   */
  getComparisonString(record) {
    const uniqueColumns = this.config.uniqueColumns;
    if (uniqueColumns.length === 0) {
      return '';
    }
    return uniqueColumns
      .map(col => this.normalize(record[col.originalName] || ''))
      .filter(v => v.length > 0)
      .join(' ');
  }

  /**
   * Score two comparison strings with the configured strategy. 'combined'
   * weights the edit-distance score 0.7 and the Jaccard score 0.3; an unknown
   * strategy falls back to the edit-distance score.
   */
  calculateSimilarity(str1, str2) {
    switch (this.config.strategy) {
      case 'jaccard':
        return this.jaccardSimilarity(str1, str2);
      case 'combined': {
        const levScore = this.similarity(str1, str2);
        const jaccardScore = this.jaccardSimilarity(str1, str2);
        return (levScore * 0.7) + (jaccardScore * 0.3);
      }
      case 'levenshtein':
      default:
        return this.similarity(str1, str2);
    }
  }

  /**
   * Group records whose comparison strings meet the similarity threshold.
   *
   * Records are scanned in order; each not-yet-grouped record becomes a
   * representative and every later, not-yet-grouped record that is similar
   * enough joins its group. Records with an empty comparison string are
   * skipped. Only groups with at least one duplicate are returned.
   *
   * Updates `stats.originalCount` and `stats.duplicateGroups`.
   *
   * @param {object[]} records
   * @returns {object[]} Groups of `{ representative, duplicates, uniqueColumns, comparisonValue }`.
   */
  findPotentialDuplicates(records) {
    const groups = [];
    const used = new Set();
    this.stats.originalCount = records.length;

    // Normalise every record once instead of once per pairwise comparison.
    const comparisonStrings = records.map(record => this.getComparisonString(record));
    const uniqueColumnNames = this.config.uniqueColumns.map(col => col.originalName);

    for (let i = 0; i < records.length; i++) {
      if (used.has(i)) continue;
      const currentStr = comparisonStrings[i];
      if (!currentStr) continue; // Skip empty values

      const duplicates = [];
      for (let j = i + 1; j < records.length; j++) {
        if (used.has(j)) continue;
        const candidateStr = comparisonStrings[j];
        if (!candidateStr) continue;
        const sim = this.calculateSimilarity(currentStr, candidateStr);
        if (sim >= this.config.threshold) {
          duplicates.push({
            record: records[j],
            index: j,
            similarity: sim,
            comparisonValue: candidateStr
          });
          used.add(j);
        }
      }

      if (duplicates.length > 0) {
        groups.push({
          representative: { record: records[i], index: i, similarity: 1.0 },
          duplicates,
          uniqueColumns: [...uniqueColumnNames],
          comparisonValue: currentStr
        });
      }
      used.add(i);
    }

    this.stats.duplicateGroups = groups.length;
    return groups;
  }

  /**
   * Quote a CSV cell when it is a string containing a comma, a double quote
   * or a line break; embedded quotes are doubled. Other values pass through.
   */
  escapeCsvValue(value) {
    if (typeof value === 'string' && /[",\r\n]/.test(value)) {
      return `"${value.replace(/"/g, '""')}"`;
    }
    return value;
  }

  /**
   * Render one group member as a CSV line whose first cell is the 1-based
   * record ID, followed by the record's value for every header.
   *
   * The ID is prepended positionally so that a real CSV column which happens
   * to be called "ID" keeps its own value.
   */
  formatRowWithId(entry, headers) {
    const cells = headers.map(header => this.escapeCsvValue(entry.record[header] ?? ''));
    return [entry.index + 1, ...cells].join(',');
  }

  /**
   * Format duplicate groups as the tagged text block sent to the AI service:
   * a `<potential_duplicates>` wrapper, a header line (ID plus mapped
   * headers), and one `<group_N>` section per group listing the representative
   * followed by its duplicates. Returns '' when there are no groups.
   *
   * @param {object[]} duplicateGroups - Output of findPotentialDuplicates.
   * @param {string[]} originalHeaders - Headers as they appear in the records.
   * @returns {string}
   */
  formatPotentialDuplicatesForAI(duplicateGroups, originalHeaders) {
    if (duplicateGroups.length === 0) {
      return '';
    }

    // Create mapped headers using the column mapping, with ID as first column.
    const mappedHeaders = createMappedHeaders(originalHeaders);
    const headerLine = ['ID', ...mappedHeaders].map(quoteCsvHeader).join(',');

    const lines = ['<potential_duplicates>', headerLine];
    duplicateGroups.forEach((group, groupIndex) => {
      const tag = `group_${groupIndex + 1}`;
      lines.push(`<${tag}>`);
      lines.push(this.formatRowWithId(group.representative, originalHeaders));
      group.duplicates.forEach(dup => {
        lines.push(this.formatRowWithId(dup, originalHeaders));
      });
      lines.push(`</${tag}>`);
    });
    lines.push('</potential_duplicates>');
    return lines.join('\n');
  }

  /**
   * Send the formatted groups to the AI API for deduplication decisions.
   *
   * Never throws: every failure is converted into `{ success: false, error }`.
   *
   * @param {string} potentialDuplicatesXML - Output of formatPotentialDuplicatesForAI.
   * @param {Array<{originalName: string}>} uniqueColumns
   * @returns {Promise<{success: boolean, result?: *, error?: string}>}
   */
  async sendToAI(potentialDuplicatesXML, uniqueColumns) {
    try {
      if (!this.config.email || !this.config.apiKey) {
        throw new Error('Email and API key are required for AI processing');
      }
      const response = await axios.post(`${API_BASE_URL}/api/dedupe/process`, {
        potentialDuplicates: potentialDuplicatesXML,
        uniqueColumns: uniqueColumns.map(col => col.originalName),
        model: this.config.model
      }, {
        headers: {
          'Content-Type': 'application/json',
          'X-Email': this.config.email,
          'X-API-Key': this.config.apiKey
        },
        timeout: 300000 // 5 minute timeout
      });
      if (response.data && response.data.result) {
        return { success: true, result: response.data.result };
      }
      return { success: false, error: 'Invalid response from AI API' };
    } catch (error) {
      // Log only the message: the full axios error object carries the request
      // config, including the X-API-Key header, and must not reach the console.
      console.error('AI API Error:', error.message);
      if (error.response) {
        return {
          success: false,
          error: `AI API Error (${error.response.status}): ${error.response.data?.error || error.response.statusText}`
        };
      }
      if (error.code === 'ECONNREFUSED') {
        return { success: false, error: 'Could not connect to AI API service' };
      }
      // axios reports its own `timeout` expiry as ECONNABORTED by default;
      // ETIMEDOUT covers socket-level timeouts.
      if (error.code === 'ECONNABORTED' || error.code === 'ETIMEDOUT') {
        return { success: false, error: 'AI API request timed out' };
      }
      return { success: false, error: error.message };
    }
  }

  /**
   * Parse the AI response into the 0-based record indices to remove.
   *
   * Primary format: `<group_N>…</group_N>` blocks whose content starts with
   * the ID of the record to KEEP; every other member of that group is removed.
   * A group is left untouched when its block is empty, when it does not start
   * with an ID, or when the ID does not belong to that group (otherwise a
   * wrong ID would delete every record in the group).
   *
   * Fallback format: a JSON array of IDs to remove. Only IDs that were part of
   * a duplicate group are honoured, so rows never shown to the AI are safe.
   *
   * @param {string} aiResponse
   * @param {object[]} duplicateGroups - Output of findPotentialDuplicates.
   * @returns {{success: boolean, indicesToRemove?: number[], error?: string}}
   */
  parseAIResponse(aiResponse, duplicateGroups) {
    try {
      console.log('🔍 Parsing AI response:', aiResponse.substring(0, 500) + (aiResponse.length > 500 ? '...' : ''));
      const groups = Array.isArray(duplicateGroups) ? duplicateGroups : [];
      const memberIndices = group => [
        group.representative.index,
        ...group.duplicates.map(dup => dup.index)
      ];

      // The AI returns the record to KEEP for each group
      // We need to remove all OTHER records in each group
      const indicesToRemove = [];
      const groupMatches = aiResponse.matchAll(/<group_(\d+)>([\s\S]*?)<\/group_\1>/g);
      for (const match of groupMatches) {
        const groupNumber = Number.parseInt(match[1], 10) - 1; // Convert to 0-based group index
        const groupContent = match[2].trim();
        const group = groups[groupNumber];

        if (!groupContent || !group) {
          console.log(`📋 Group ${groupNumber + 1}: Empty or no group data - no duplicates to remove`);
          continue;
        }

        const keepIdMatch = groupContent.match(/^(\d+),/);
        if (!keepIdMatch) {
          continue;
        }
        const keepId = Number.parseInt(keepIdMatch[1], 10);
        const keepIndex = keepId - 1; // Convert to 0-based index
        console.log(`📋 Group ${groupNumber + 1}: AI wants to KEEP ID ${keepId} (index ${keepIndex})`);

        const allIndicesInGroup = memberIndices(group);
        if (!allIndicesInGroup.includes(keepIndex)) {
          console.log(`⚠️ Group ${groupNumber + 1}: ID ${keepId} is not a member of this group - leaving the group untouched`);
          continue;
        }

        // Remove all indices EXCEPT the one the AI wants to keep
        const toRemove = allIndicesInGroup.filter(index => index !== keepIndex);
        indicesToRemove.push(...toRemove);
        console.log(`📋 Group ${groupNumber + 1}: Removing indices [${toRemove.join(', ')}], keeping index ${keepIndex}`);
      }

      if (indicesToRemove.length > 0) {
        // Remove duplicates and sort
        const uniqueIndicesToRemove = [...new Set(indicesToRemove)].sort((a, b) => a - b);
        console.log('📋 Final indices to remove:', uniqueIndicesToRemove);
        return { success: true, indicesToRemove: uniqueIndicesToRemove };
      }

      // Try JSON array as fallback (assume these are IDs to remove directly)
      const jsonMatch = aiResponse.match(/\[[\d\s,]+\]/);
      if (jsonMatch) {
        try {
          const idsToRemove = JSON.parse(jsonMatch[0]);
          console.log('📋 Found JSON array - treating as IDs to remove:', idsToRemove);
          const candidateIndices = new Set(groups.flatMap(memberIndices));
          const fallbackIndices = [...new Set(
            idsToRemove
              .map(id => Number.parseInt(id, 10) - 1)
              .filter(index => candidateIndices.has(index))
          )].sort((a, b) => a - b);
          console.log('📋 Converted to indices:', fallbackIndices);
          return { success: true, indicesToRemove: fallbackIndices };
        } catch (parseError) {
          console.log('⚠️ Failed to parse JSON array:', parseError.message);
        }
      }

      console.log('📋 No records to remove - no duplicates found');
      return { success: true, indicesToRemove: [] };
    } catch (error) {
      console.log('❌ Error parsing AI response:', error.message);
      return { success: false, error: `Error parsing AI response: ${error.message}` };
    }
  }

  /**
   * Return the records whose index is not listed in `indicesToRemove`.
   *
   * Updates `stats.finalCount` and `stats.duplicatesRemoved`; the latter is
   * the number of records actually dropped, so repeated or out-of-range
   * indices do not inflate it.
   */
  generateCleanedData(records, indicesToRemove) {
    const removedSet = new Set(indicesToRemove);
    const cleanedRecords = records.filter((_, index) => !removedSet.has(index));
    this.stats.duplicatesRemoved = records.length - cleanedRecords.length;
    this.stats.finalCount = cleanedRecords.length;
    return cleanedRecords;
  }

  /**
   * Build the plain-text deduplication report: configuration, statistics,
   * every potential duplicate group, the raw AI decision, and the removed
   * indices.
   *
   * @param {object[]} duplicateGroups
   * @param {string} aiResponse
   * @param {number[]} indicesToRemove
   * @returns {string}
   */
  generateReport(duplicateGroups, aiResponse, indicesToRemove) {
    // Avoid printing "NaN%" when no records were counted.
    const dedupeRate = this.stats.originalCount > 0
      ? (this.stats.duplicatesRemoved / this.stats.originalCount) * 100
      : 0;

    let report = 'CSV AI-Powered Deduplication Report\n';
    report += '====================================\n\n';
    report += `Configuration:\n`;
    report += `- Unique Columns: ${this.config.uniqueColumns.map(col => col.originalName).join(', ')}\n`;
    report += `- Threshold: ${this.config.threshold}\n`;
    report += `- Strategy: ${this.config.strategy}\n`;
    report += `- AI Model: ${this.config.model || 'default'}\n`;
    report += `- Show Input Only: ${this.config.showInput}\n\n`;
    report += `Statistics:\n`;
    report += `- Original Records: ${this.stats.originalCount}\n`;
    report += `- Potential Duplicate Groups Found: ${duplicateGroups.length}\n`;
    report += `- Records Removed by AI: ${this.stats.duplicatesRemoved}\n`;
    report += `- Final Record Count: ${this.stats.finalCount}\n`;
    report += `- Deduplication Rate: ${dedupeRate.toFixed(2)}%\n\n`;

    if (duplicateGroups.length > 0) {
      report += 'Potential Duplicate Groups Found:\n';
      report += '==================================\n\n';
      duplicateGroups.forEach((group, groupIndex) => {
        report += `Group ${groupIndex + 1}:\n`;
        report += `Representative: ${JSON.stringify(group.representative.record)}\n`;
        report += `Potential Duplicates:\n`;
        group.duplicates.forEach((dup, dupIndex) => {
          report += ` ${dupIndex + 1}. ${JSON.stringify(dup.record)} (Similarity: ${(dup.similarity * 100).toFixed(2)}%)\n`;
        });
        report += '\n';
      });
    }

    report += '\nAI Decision:\n';
    report += '============\n';
    report += aiResponse + '\n\n';
    report += `Records Removed: ${indicesToRemove.join(', ')}\n`;
    return report;
  }

  /**
   * Convert records back to CSV text (header line plus one line per record,
   * joined by "\n"). Headers and values are quoted when they contain a comma,
   * a double quote or a line break. Returns '' for an empty record list.
   *
   * @param {object[]} records
   * @param {string[]} headers - Column order; also used as the record keys.
   * @returns {string}
   */
  toCSV(records, headers) {
    if (records.length === 0) return '';
    const csvLines = [headers.map(header => this.escapeCsvValue(header)).join(',')];
    records.forEach(record => {
      const line = headers
        .map(header => this.escapeCsvValue(record[header] ?? ''))
        .join(',');
      csvLines.push(line);
    });
    return csvLines.join('\n');
  }
}
/**
 * Read a CSV file and collect its rows and header names.
 *
 * @param {string} filePath - Path of the CSV file to read.
 * @returns {Promise<{data: object[], headers: string[]|null}>} Resolves with
 *   every parsed row and the header list (`null` if no `headers` event was
 *   emitted). Rejects when either reading the file or parsing it fails.
 */
function readCSV(filePath) {
  return new Promise((resolve, reject) => {
    const results = [];
    let headers = null;

    const source = fs.createReadStream(filePath);
    // pipe() does not forward errors from the source to the destination, so a
    // failed read (missing file, permissions) needs its own listener; without
    // it the error is unhandled and the promise never settles.
    source.on('error', reject);

    source
      .pipe(csv({ skipEmptyLines: true }))
      .on('headers', (headerList) => {
        headers = headerList;
      })
      .on('data', (data) => {
        results.push(data);
      })
      .on('end', () => {
        resolve({ data: results, headers });
      })
      .on('error', (error) => {
        // Stop reading once parsing has failed so the file handle is released.
        source.destroy();
        reject(error);
      });
  });
}
/**
 * Main deduplication function.
 *
 * Workflow:
 *  1. Read the unique columns from the column mapping; skip when there are none.
 *  2. Read the input CSV and group potential duplicates.
 *  3. With no groups: write the report and copy the data to the output CSV.
 *  4. In show-input mode: print the formatted AI input and return without
 *     calling the AI or writing files.
 *  5. Otherwise send the groups to the AI, write the log, apply the decision,
 *     and write the report and the deduplicated CSV.
 *
 * @param {object} [options] - Forwarded to the CSVDeduplicator constructor
 *   (`uniqueColumns` is always overridden with the mapping's unique columns).
 * @returns {Promise<object>} Result with `success`, `skipped`, `stats`,
 *   `uniqueColumns`, `duplicateGroups`, `outputPath`, `reportPath`, plus
 *   `reason` when skipped and `showInput` in show-input mode.
 * @throws {Error} Any failure, wrapped with an "AI-powered deduplication
 *   failed" prefix; the original error is attached as `cause`.
 */
async function main(options = {}) {
  try {
    // Get unique columns from column mapping
    const uniqueColumns = getUniqueColumns();

    // Check if there are any unique columns
    if (uniqueColumns.length === 0) {
      console.log('ℹ️ No highly identifiable columns found in column mapping - skipping deduplication');
      console.log('💡 To enable deduplication, mark columns as unique in the architect output using ```UNIQUE``` prefix');
      return {
        success: true,
        skipped: true,
        reason: 'No unique columns found',
        stats: null,
        uniqueColumns: [],
        duplicateGroups: 0,
        outputPath: null,
        reportPath: null
      };
    }

    const uniqueColumnNames = uniqueColumns.map(col => col.originalName);

    // Log which columns will be used for deduplication
    console.log(`🔍 Found ${uniqueColumns.length} unique column(s) for AI-powered deduplication:`);
    uniqueColumns.forEach(col => {
      const status = col.isExcluded ? ' (excluded from final output)' : '';
      console.log(` - ${col.originalName} → ${col.mappedName}${status}`);
    });

    // Ensure output directory exists
    if (!fs.existsSync(outputsDir)) {
      fs.mkdirSync(outputsDir, { recursive: true });
    }

    // Check if input file exists
    if (!fs.existsSync(INPUT_CSV_PATH)) {
      throw new Error(`Input CSV file not found: ${INPUT_CSV_PATH}`);
    }

    // Read CSV data
    const { data: records, headers } = await readCSV(INPUT_CSV_PATH);
    if (records.length === 0) {
      throw new Error('No data found in CSV file');
    }

    // Initialize deduplicator with unique columns
    const deduplicator = new CSVDeduplicator({
      ...options,
      uniqueColumns: uniqueColumns
    });

    // Find potential duplicates
    const duplicateGroups = deduplicator.findPotentialDuplicates(records);
    // Keep the stats object (returned and logged below) in sync with the result.
    deduplicator.stats.duplicateGroups = duplicateGroups.length;

    if (duplicateGroups.length === 0) {
      console.log('✅ No potential duplicates found with current settings!');

      // Populate the final/removed counts so the report does not claim a
      // final record count of 0 when nothing was removed.
      deduplicator.generateCleanedData(records, []);

      // Write empty report
      const reportContent = deduplicator.generateReport([], 'No potential duplicates found.', []);
      fs.writeFileSync(DEDUPE_REPORT_PATH, reportContent, 'utf-8');

      // Copy input to output since no changes needed
      const originalCSV = deduplicator.toCSV(records, headers);
      fs.writeFileSync(OUTPUT_CSV_PATH, originalCSV, 'utf-8');

      return {
        success: true,
        skipped: false,
        stats: {
          originalCount: records.length,
          duplicateGroups: 0,
          duplicatesRemoved: 0,
          finalCount: records.length
        },
        uniqueColumns: uniqueColumnNames,
        duplicateGroups: 0,
        outputPath: OUTPUT_CSV_PATH,
        reportPath: DEDUPE_REPORT_PATH
      };
    }

    console.log(`🤖 Found ${duplicateGroups.length} potential duplicate groups.`);

    // Format for AI processing
    const potentialDuplicatesXML = deduplicator.formatPotentialDuplicatesForAI(duplicateGroups, headers);

    // If show-input mode, display the formatted input and exit
    if (deduplicator.config.showInput) {
      console.log(chalk.bold.cyan('\n📋 Formatted Input for AI:'));
      console.log(chalk.gray('='.repeat(80)));
      console.log(potentialDuplicatesXML);
      console.log(chalk.gray('='.repeat(80)));
      console.log(chalk.yellow('\n👀 This is the input that would be sent to the AI for deduplication decisions.'));
      console.log(chalk.cyan('Run without --show-input to perform actual AI analysis.'));
      return {
        success: true,
        skipped: false,
        showInput: true,
        stats: {
          originalCount: records.length,
          duplicateGroups: duplicateGroups.length,
          duplicatesRemoved: 0,
          finalCount: records.length
        },
        uniqueColumns: uniqueColumnNames,
        duplicateGroups: duplicateGroups.length,
        outputPath: null,
        reportPath: null
      };
    }

    // Writes the run log; called on every outcome after the AI call.
    const writeLog = (response, indices, error = null) => {
      const logContent = [
        '=== AI-POWERED DEDUPLICATION CONFIGURATION ===',
        JSON.stringify({
          ...deduplicator.config,
          // Never persist the credential to disk in plain text.
          apiKey: deduplicator.config.apiKey ? '[REDACTED]' : null,
          uniqueColumns: uniqueColumns
        }, null, 2),
        '\n=== STATISTICS ===',
        JSON.stringify(deduplicator.stats, null, 2),
        '\n=== UNIQUE COLUMNS USED ===',
        JSON.stringify(uniqueColumns, null, 2),
        '\n=== AI INPUT (FORMATTED XML) ===',
        potentialDuplicatesXML,
        '\n=== AI RESPONSE ===',
        response || 'No response received',
        '\n=== INDICES TO REMOVE ===',
        JSON.stringify(indices || []),
        error ? '\n=== ERROR ===\n' + error : ''
      ].join('\n');
      fs.writeFileSync(DEDUPE_LOG_PATH, logContent, 'utf-8');
    };

    console.log('🚀 Sending to AI for analysis...');

    // Send to AI for decision making
    const aiResult = await deduplicator.sendToAI(potentialDuplicatesXML, uniqueColumns);
    if (!aiResult.success) {
      writeLog('AI processing failed', [], aiResult.error);
      throw new Error(`AI processing failed: ${aiResult.error}`);
    }

    console.log('🧠 AI analysis complete. Processing decisions...');

    // Parse AI response to get records to remove
    const parseResult = deduplicator.parseAIResponse(aiResult.result, duplicateGroups);
    if (!parseResult.success) {
      writeLog(aiResult.result, [], parseResult.error);
      throw new Error(`Failed to parse AI response: ${parseResult.error}`);
    }
    const indicesToRemove = parseResult.indicesToRemove || [];

    // Write log with successful results
    writeLog(aiResult.result, indicesToRemove);

    // Generate cleaned data
    const cleanedRecords = deduplicator.generateCleanedData(records, indicesToRemove);

    // Generate and write report
    const reportContent = deduplicator.generateReport(duplicateGroups, aiResult.result, indicesToRemove);
    fs.writeFileSync(DEDUPE_REPORT_PATH, reportContent, 'utf-8');

    // Write cleaned CSV
    const cleanedCSV = deduplicator.toCSV(cleanedRecords, headers);
    fs.writeFileSync(OUTPUT_CSV_PATH, cleanedCSV, 'utf-8');

    return {
      success: true,
      skipped: false,
      stats: deduplicator.stats,
      uniqueColumns: uniqueColumnNames,
      duplicateGroups: duplicateGroups.length,
      outputPath: OUTPUT_CSV_PATH,
      reportPath: DEDUPE_REPORT_PATH
    };
  } catch (error) {
    // Keep the original error reachable (stack, code) for callers that need it.
    throw new Error(`AI-powered deduplication failed: ${error.message}`, { cause: error });
  }
}
// Export functions
// Public surface of this module: the pipeline entry point (main), the
// deduplicator class, and the config/CSV/column-mapping helpers defined above.
export {
main,
CSVDeduplicator,
loadConfig,
readCSV,
getUniqueColumns,
getColumnMapping,
createMappedHeaders,
quoteCsvHeader
};