/*
 * @dbclean/cli
 * Version:
 * Transform messy CSV data into clean, standardized datasets using AI-powered automation
 * 377 lines (325 loc) • 13.7 kB
 * JavaScript
 */
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import csv from 'csv-parser';
import { createObjectCsvWriter } from 'csv-writer';
import { program } from 'commander';
// ES modules have no built-in __filename/__dirname, so both are derived from
// this module's own URL. __dirname is used below to locate config.json
// relative to the script.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
 * Locate and parse `config.json`.
 *
 * Candidate locations are probed in order: the current working directory,
 * the parent of this script's directory, and the script's directory itself.
 * The first file that exists is parsed. Any failure (no candidate exists, the
 * file cannot be read, or it is not valid JSON) is reported as a warning and
 * the built-in defaults are returned instead.
 *
 * @returns {object} The parsed configuration, or the default settings.
 */
function loadConfig() {
  try {
    const candidates = [
      path.join(process.cwd(), 'config.json'),
      path.join(__dirname, '..', 'config.json'),
      path.join(__dirname, 'config.json'),
    ];

    const found = candidates.find((candidate) => fs.existsSync(candidate));
    if (found === undefined) {
      throw new Error('config.json not found in any expected location');
    }

    return JSON.parse(fs.readFileSync(found, 'utf-8'));
  } catch (error) {
    console.log(`β οΈ Warning: Could not load config.json: ${error.message}`);

    // Fall back to the default layout when no usable config.json is available.
    const defaults = {
      settings__dir: "settings",
      settings_exclude_columns_file_path: "exclude_columns.txt",
      data_dir: "data",
      data_cleaned_file_path: "data_cleaned.csv",
    };
    return defaults;
  }
}
// Configuration is resolved once, when the module is loaded; main() reads its
// directory and file-name settings from this object.
const config = loadConfig();
/**
 * Read the list of column names to exclude from a plain-text file.
 *
 * The file holds one column name per line. Lines are trimmed; blank lines and
 * lines starting with `#` are ignored.
 *
 * @param {string|null|undefined} excludeFilePath - Path to the exclude file.
 * @returns {Set<string>} The excluded column names. Empty when no path is
 *   given, the path does not exist, or the file cannot be read (a warning is
 *   logged in the last case).
 */
function loadExcludedColumns(excludeFilePath) {
  const fileIsAvailable = Boolean(excludeFilePath) && fs.existsSync(excludeFilePath);
  if (!fileIsAvailable) {
    return new Set();
  }

  try {
    const names = fs
      .readFileSync(excludeFilePath, 'utf-8')
      .split('\n')
      .map((rawLine) => rawLine.trim())
      .filter((entry) => entry !== '' && !entry.startsWith('#'));
    const excluded = new Set(names);

    console.log(`π Loaded ${excluded.size} excluded columns from: ${excludeFilePath}`);
    if (excluded.size > 0) {
      console.log(` π« Excluding columns: ${[...excluded].sort().join(', ')}`);
    }
    return excluded;
  } catch (error) {
    console.log(`β οΈ Warning: Could not load excluded columns from ${excludeFilePath}: ${error.message}`);
    return new Set();
  }
}
/**
 * Typographic characters mapped to their closest printable-ASCII spelling.
 * Keys are written as Unicode escapes so the table cannot be corrupted by an
 * editor or transfer step that mishandles the file's encoding.
 */
const CLEAN_TEXT_REPLACEMENTS = new Map([
  // Quotes and apostrophes
  ['\u2018', "'"], // left single quotation mark
  ['\u2019', "'"], // right single quotation mark
  ['\u201C', '"'], // left double quotation mark
  ['\u201D', '"'], // right double quotation mark
  ['\u2032', "'"], // prime
  ['\u2035', "'"], // reversed prime
  ['`', "'"], // grave accent
  ['\u2033', '"'], // double prime
  ['\u2036', '"'], // reversed double prime
  ['\u2034', '"'], // triple prime
  ['\u2037', '"'], // reversed triple prime
  ['\u2039', '<'], // single left-pointing angle quotation mark
  ['\u203A', '>'], // single right-pointing angle quotation mark
  ['\u00AB', '<<'], // left-pointing double angle quotation mark
  ['\u00BB', '>>'], // right-pointing double angle quotation mark
  // Other special characters
  ['\u2013', '-'], // en dash
  ['\u2014', '-'], // em dash
  ['\u2026', '...'], // ellipsis
  ['\u00B0', ' degrees'], // degree symbol
  ['\u00D7', 'x'], // multiplication sign
  ['\u00F7', '/'], // division sign
  ['\u00B1', '+/-'], // plus-minus sign
  ['\u2264', '<='], // less than or equal
  ['\u2265', '>='], // greater than or equal
  ['\u2260', '!='], // not equal
  ['\u2248', '~'], // approximately equal
  ['\u221E', 'infinity'], // infinity
  ['\u221A', 'sqrt'], // square root
  ['\u00B2', '^2'], // squared
  ['\u00B3', '^3'], // cubed
  // Vulgar fractions
  ['\u00BC', '1/4'],
  ['\u00BD', '1/2'],
  ['\u00BE', '3/4'],
  ['\u2153', '1/3'],
  ['\u2154', '2/3'],
  ['\u2155', '1/5'],
  ['\u2156', '2/5'],
  ['\u2157', '3/5'],
  ['\u2158', '4/5'],
  ['\u2159', '1/6'],
  ['\u215A', '5/6'],
  ['\u2150', '1/7'],
  ['\u215B', '1/8'],
  ['\u215C', '3/8'],
  ['\u215D', '5/8'],
  ['\u215E', '7/8'],
  ['\u2151', '1/9'],
  ['\u2152', '1/10'],
]);

// One alternation over every key, built once at module load instead of one
// RegExp per key on every call. Keys are escaped so any future entry with a
// regex metacharacter is still matched literally.
const CLEAN_TEXT_REPLACEMENT_PATTERN = new RegExp(
  [...CLEAN_TEXT_REPLACEMENTS.keys()]
    .map((key) => key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
    .join('|'),
  'g',
);

/**
 * Normalize a cell or header value to single-line printable ASCII.
 *
 * Steps, in order:
 *  1. collapse every whitespace run (newlines, tabs, ...) to one space and trim;
 *  2. replace typographic characters using CLEAN_TEXT_REPLACEMENTS;
 *  3. collapse runs of `"` or `'` to a single character;
 *  4. drop every character outside the printable ASCII range 0x20-0x7E;
 *  5. collapse and trim spaces again, since steps 2 and 4 can leave doubled or
 *     leading/trailing spaces (e.g. around a removed symbol).
 *
 * @param {*} text - Value to clean. Non-strings are converted with `String()`.
 * @returns {*} The cleaned string; `null`, `undefined` and `''` are returned
 *   unchanged.
 */
function cleanText(text) {
  if (text === null || text === undefined || text === '') {
    return text;
  }

  return String(text)
    .replace(/\s+/g, ' ')
    .trim()
    .replace(CLEAN_TEXT_REPLACEMENT_PATTERN, (match) => CLEAN_TEXT_REPLACEMENTS.get(match))
    .replace(/"{2,}/g, '"')
    .replace(/'{2,}/g, "'")
    .replace(/[^\x20-\x7E]/g, '')
    .replace(/ {2,}/g, ' ')
    .trim();
}
/**
 * Read a CSV file into memory.
 *
 * @param {string} filePath - Path of the CSV file to read.
 * @returns {Promise<{data: object[], headers: (string[]|null)}>} Resolves once
 *   the parser ends, with every emitted row and the header list. `headers`
 *   stays `null` if the parser never emits a `headers` event.
 *   Rejects when either the file stream or the parser reports an error.
 */
function readCSV(filePath) {
  return new Promise((resolve, reject) => {
    const results = [];
    let headers = null;

    const source = fs.createReadStream(filePath);

    // pipe() does not forward errors from the source to the destination, so a
    // missing or unreadable file must be handled on the read stream itself;
    // otherwise it surfaces as an unhandled 'error' event and this promise
    // never settles.
    source.on('error', reject);

    source
      .pipe(csv({ skipEmptyLines: true }))
      .on('headers', (headerList) => {
        headers = headerList;
      })
      .on('data', (row) => {
        results.push(row);
      })
      .on('end', () => {
        resolve({ data: results, headers });
      })
      .on('error', (error) => {
        // Stop reading the file once parsing has failed.
        source.destroy();
        reject(error);
      });
  });
}
/**
 * Write an array of row objects to a CSV file.
 *
 * @param {string} filePath - Destination file path.
 * @param {object[]} data - Rows to write, keyed by header name.
 * @param {string[]} headers - Column names; each one is used both as the
 *   property id looked up on a row and as the column title.
 * @returns {Promise<void>} Settles when the writer has finished.
 */
async function writeCSV(filePath, data, headers) {
  const header = headers.map((name) => ({ id: name, title: name }));
  const writer = createObjectCsvWriter({ path: filePath, header });
  await writer.writeRecords(data);
}
/**
 * Clean a CSV file and write the result to a separate file.
 *
 * Header names and every cell value are passed through `cleanText`. Columns
 * whose cleaned name appears in the exclude file are dropped from the output.
 * The input file is never modified.
 *
 * @param {string} inputPath - CSV file to read.
 * @param {string|null} [outputPath=null] - Destination file. When `null`, the
 *   output is written next to the input as `<name>_cleaned<ext>`.
 * @param {string|null} [excludeFilePath=null] - Text file listing column names
 *   to drop (see `loadExcludedColumns`).
 * @returns {Promise<string|null>} The path that was written, or `null` when
 *   anything failed (the error message is logged, not thrown).
 */
async function cleanCSV(inputPath, outputPath = null, excludeFilePath = null) {
  console.log(`π§ Starting to clean CSV file: ${inputPath}`);

  const excludedColumns = loadExcludedColumns(excludeFilePath);

  try {
    // Resolve the destination first so an attempt to overwrite the input is
    // refused before any reading or cleaning work is done.
    let targetPath = outputPath;
    if (targetPath === null) {
      const inputFile = path.parse(inputPath);
      targetPath = path.join(inputFile.dir, `${inputFile.name}_cleaned${inputFile.ext}`);
    }
    if (path.resolve(targetPath) === path.resolve(inputPath)) {
      throw new Error("Cannot overwrite original file. Please specify a different output path.");
    }

    const { data, headers } = await readCSV(inputPath);
    if (!headers) {
      // readCSV leaves headers as null when the parser reported no header row.
      throw new Error('Input CSV has no header row');
    }
    console.log(`π Loaded CSV with ${data.length} rows and ${headers.length} columns`);

    // Pair every original header with its cleaned name once. Columns are then
    // mapped by position, so the result does not depend on name lookups and is
    // the same whether or not any column is excluded.
    const columns = headers.map((original) => ({ original, cleaned: cleanText(original) }));
    console.log(` β
Cleaned ${columns.length} column headers`);

    const existingExcluded = new Set(
      columns.map(({ cleaned }) => cleaned).filter((name) => excludedColumns.has(name)),
    );
    const hasExclusions = existingExcluded.size > 0;

    if (hasExclusions) {
      console.log(` π« Found ${existingExcluded.size} excluded columns in dataset: ${Array.from(existingExcluded).sort().join(', ')}`);
      console.log(` ποΈ Removing excluded columns from output...`);
    }

    const keptColumns = columns.filter(({ cleaned }) => !existingExcluded.has(cleaned));
    const finalHeaders = keptColumns.map(({ cleaned }) => cleaned);
    const finalData = data.map((row) => {
      const newRow = {};
      for (const { original, cleaned } of keptColumns) {
        newRow[cleaned] = row[original];
      }
      return newRow;
    });

    if (hasExclusions) {
      console.log(` β
Removed ${existingExcluded.size} excluded columns`);
    }

    // Clean the values of every remaining column, reporting per-column counts.
    let cleanedCount = 0;
    for (const column of finalHeaders) {
      let changed = 0;
      for (const row of finalData) {
        const originalValue = row[column];
        const cleanedValue = cleanText(originalValue);
        if (originalValue !== cleanedValue) {
          row[column] = cleanedValue;
          changed++;
        }
      }
      if (changed > 0) {
        console.log(` β
Cleaned column '${column}': ${changed} values modified`);
        cleanedCount += changed;
      }
    }

    await writeCSV(targetPath, finalData, finalHeaders);

    console.log(`πΎ Saved cleaned CSV to: ${targetPath}`);
    console.log(`π Final CSV contains ${finalHeaders.length} columns and ${finalData.length} rows`);
    console.log(`π― Total values cleaned: ${cleanedCount}`);
    if (hasExclusions) {
      console.log(`ποΈ Total columns removed: ${existingExcluded.size}`);
    }
    console.log(`π‘οΈ Original file preserved: ${inputPath}`);

    return targetPath;
  } catch (error) {
    // Deliberate best-effort contract: report the failure and signal it with null.
    console.log(`β Error cleaning CSV: ${error.message}`);
    return null;
  }
}
/**
 * Command-line entry point.
 *
 * Parses `--input`, `--output` and `--exclude`, falls back to paths built
 * from the loaded configuration for any option that is not given, checks that
 * the input file exists, and runs `cleanCSV`.
 *
 * @returns {Promise<boolean>} `true` when a cleaned file was produced,
 *   `false` when the input file is missing or cleaning failed.
 */
async function main() {
  program
    .name('preclean')
    .description('Clean CSV data by removing newlines, replacing special characters, and handling non-UTF8 chars.')
    .version('1.0.0');
  program
    .option('--input <path>', 'Input CSV file path (default: data/data.csv)')
    .option('--output <path>', 'Output CSV file path (default: data/data_cleaned.csv)')
    .option('--exclude <path>', 'Path to text file containing column names to exclude from cleaning (default: settings/exclude_columns.txt)');
  program.parse();

  const { input, output, exclude } = program.opts();

  // Configured locations, each with a hard-coded fallback.
  const dataDir = config.data_dir || 'data';
  const settingsDir = config.settings__dir || 'settings';
  const cleanedFileName = config.data_cleaned_file_path || 'data_cleaned.csv';
  const excludeFileName = config.settings_exclude_columns_file_path || 'exclude_columns.txt';

  // Command-line options take precedence over the configured locations.
  const originalCsvPath = input || path.join(dataDir, 'data.csv');
  const cleanedCsvPath = output || path.join(dataDir, cleanedFileName);
  const excludeFilePath = exclude || path.join(settingsDir, excludeFileName);

  const inputIsMissing = !fs.existsSync(originalCsvPath);
  if (inputIsMissing) {
    console.log(`β Input CSV file not found: ${originalCsvPath}`);
    console.log("Please ensure the data.csv file exists in the data/ directory.");
    return false;
  }

  console.log(`π‘οΈ Original file will be preserved: ${originalCsvPath}`);
  console.log(`π Cleaned file will be created: ${cleanedCsvPath}`);
  console.log(
    fs.existsSync(excludeFilePath)
      ? `π Using exclude file: ${excludeFilePath}`
      : `π No exclude file found at: ${excludeFilePath} (will clean all columns)`,
  );

  // The cleaned data always goes to a separate file.
  const cleanedPath = await cleanCSV(originalCsvPath, cleanedCsvPath, excludeFilePath);
  if (!cleanedPath) {
    console.log("β Failed to clean CSV data.");
    return false;
  }

  console.log(`\nβ
Successfully cleaned CSV data!`);
  console.log(`π Original file (unchanged): ${originalCsvPath}`);
  console.log(`π Cleaned file (new): ${cleanedPath}`);
  console.log("\nπ You can now run architect.js with the cleaned data.");
  return true;
}
// Export functions for use in other modules.
// Only the cleaning and CSV helpers are exported; main and loadConfig are not
// part of this export list.
export {
loadExcludedColumns,
cleanText,
cleanCSV,
readCSV,
writeCSV
};
/**
 * Report whether this module is the script Node was started with.
 *
 * Compares real filesystem paths rather than building a `file://` string by
 * hand: a hand-built URL does not match `import.meta.url` when the path
 * contains characters that get percent-encoded (spaces, for example), on
 * Windows, or when the script is launched through a symlink.
 *
 * @returns {boolean} `true` when `process.argv[1]` resolves to this file;
 *   `false` otherwise, including when there is no script argument or it
 *   cannot be resolved.
 */
function isExecutedDirectly() {
  const entryPoint = process.argv[1];
  if (!entryPoint) {
    return false;
  }
  try {
    const thisFile = fs.realpathSync(fileURLToPath(import.meta.url));
    return fs.realpathSync(entryPoint) === thisFile;
  } catch {
    // realpathSync throws when a path does not exist; treat that as "imported".
    return false;
  }
}

// Run main function if this file is executed directly
if (isExecutedDirectly()) {
  main().catch(console.error);
}