datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
61 lines • 2.23 kB
JavaScript
;
/**
* Shared Data Quality Utilities
* Provides consistent calculations across all sections to prevent inter-section discrepancies
*/
// Flag the exports object with `__esModule` (the marker transpiled ES modules
// conventionally carry for CommonJS interop).
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module. These assignments appear before the function
// bodies; that works because function declarations are hoisted.
exports.calculateUniqueness = calculateUniqueness;
exports.defaultNormalizeValue = defaultNormalizeValue;
exports.calculateColumnUniqueness = calculateColumnUniqueness;
/**
 * Standardized uniqueness calculation used across all sections, so that every
 * section reports the same statistics for the same column.
 *
 * The cell at `columnIndex` of each row is passed through the normalizer.
 * Cells that normalize to `null` or `undefined` are treated as missing and are
 * excluded from every figure. The remaining values are compared by their
 * string form (`String(value)`).
 *
 * @param {Array} data - Rows to scan; each row is indexed by `columnIndex`.
 *   A missing row or cell is handed to the normalizer as `undefined`.
 * @param {number} columnIndex - Position of the column inside each row.
 * @param {function(*): *} [normalizeValue] - Custom normalizer; when omitted
 *   (or falsy) `defaultNormalizeValue` is used.
 * @returns {{uniqueCount: number, uniquePercentage: number, duplicateCount: number, totalNonNullValues: number}}
 *   `uniqueCount` is the number of distinct values, `uniquePercentage` is
 *   distinct / non-missing * 100 rounded to two decimals (0 when there are no
 *   non-missing values), `duplicateCount` is non-missing minus distinct, and
 *   `totalNonNullValues` is the number of non-missing cells.
 */
function calculateUniqueness(data, columnIndex, normalizeValue) {
    // `||` (not a default parameter) so that an explicit `null` still falls back.
    const normalize = normalizeValue || defaultNormalizeValue;
    // Only the number of distinct keys is needed, so a Set is sufficient.
    const distinctKeys = new Set();
    let nonNullCount = 0;
    for (let rowIdx = 0; rowIdx < data.length; rowIdx++) {
        const value = normalize(data[rowIdx]?.[columnIndex]);
        // `== null` matches both null and undefined: a custom normalizer that
        // yields undefined for a missing cell must not be counted as the
        // literal value "undefined".
        if (value == null) {
            continue;
        }
        nonNullCount++;
        distinctKeys.add(String(value));
    }
    const uniqueCount = distinctKeys.size;
    const uniquePercentage = nonNullCount > 0 ? (uniqueCount / nonNullCount) * 100 : 0;
    return {
        uniqueCount,
        // Consistent rounding to 2 decimal places across sections.
        uniquePercentage: Number(uniquePercentage.toFixed(2)),
        duplicateCount: nonNullCount - uniqueCount,
        totalNonNullValues: nonNullCount,
    };
}
/**
 * Default value normalization - consistent across sections.
 *
 * The value is converted to a string and trimmed. `null`, `undefined`, blank
 * strings, and the case-insensitive markers "null" and "na" are all reported
 * as `null`; anything else is returned as the trimmed string.
 *
 * @param {*} value - Raw cell value.
 * @returns {string|null} Trimmed string form, or `null` when treated as missing.
 */
function defaultNormalizeValue(value) {
    if (value == null) {
        return null;
    }
    const text = String(value).trim();
    const lowered = text.toLowerCase();
    const isMissingMarker = text === '' || lowered === 'null' || lowered === 'na';
    return isMissingMarker ? null : text;
}
/**
 * Calculate uniqueness for multiple columns.
 *
 * Calls `calculateUniqueness` once per entry in `headers`, using the entry's
 * position as the column index, and labels each result with its header.
 *
 * @param {Array} data - Rows, forwarded unchanged to `calculateUniqueness`.
 * @param {Array} headers - Column names; index i corresponds to column i.
 * @param {function(*): *} [normalizeValue] - Optional normalizer, forwarded as-is.
 * @returns {Array<Object>} One object per header: `columnName` followed by the
 *   fields returned by `calculateUniqueness`.
 */
function calculateColumnUniqueness(data, headers, normalizeValue) {
    const summarizeColumn = (columnName, colIdx) => {
        const stats = calculateUniqueness(data, colIdx, normalizeValue);
        return { columnName, ...stats };
    };
    return headers.map(summarizeColumn);
}
//# sourceMappingURL=data-quality-utils.js.map