UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

479 lines 20.2 kB
"use strict";
/**
 * Section 2: Validity Dimension Analyzer
 * Validates data types, ranges, patterns, and business rules
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.ValidityAnalyzer = void 0;
const types_1 = require("../../core/types");
const logger_1 = require("../../utils/logger");
const validation_patterns_1 = require("../../utils/validation-patterns");
/**
 * Scores the "validity" quality dimension of a tabular dataset.
 *
 * The analyzer reads a row-major table (`data[rowIdx][colIdx]`) together with
 * the declared type of every column and reports type conformance, numeric
 * range violations, pattern violations, business-rule results and basic
 * file-structure consistency, then folds them into one 0-100 score.
 */
class ValidityAnalyzer {
    data;
    headers;
    columnTypes;
    rowCount;
    columnCount;
    businessRules;
    customPatterns;
    customRanges;
    // Using shared validation patterns for consistency
    // EMAIL_PATTERN, PHONE_PATTERN, URL_PATTERN imported from validation-patterns
    static DATE_PATTERNS = [
        /^\d{4}-\d{2}-\d{2}$/, // YYYY-MM-DD
        /^\d{2}\/\d{2}\/\d{4}$/, // MM/DD/YYYY
        /^\d{2}-\d{2}-\d{4}$/, // DD-MM-YYYY
    ];
    // Maximum number of offending values retained per column by the pattern scan.
    static MAX_PATTERN_EXAMPLES = 10;
    /**
     * @param {object} input - Analysis input.
     * @param {Array<Array<*>>} input.data - Rows of cell values.
     * @param {string[]} input.headers - Column names, index-aligned with each row.
     * @param {Array} input.columnTypes - Declared `DataType` per column.
     * @param {number} input.rowCount - Number of rows to scan.
     * @param {number} input.columnCount - Expected number of columns.
     * @param {Array} [input.businessRules] - Rules to evaluate (defaults to none).
     * @param {object} [input.customPatterns] - Pattern per column name.
     * @param {object} [input.customRanges] - `{ min?, max? }` per column name.
     */
    constructor(input) {
        this.data = input.data;
        this.headers = input.headers;
        this.columnTypes = input.columnTypes;
        this.rowCount = input.rowCount;
        this.columnCount = input.columnCount;
        this.businessRules = input.businessRules || [];
        this.customPatterns = input.customPatterns || {};
        this.customRanges = input.customRanges || {};
    }
    /**
     * Runs every validity check and returns the combined report.
     *
     * @returns {object} `{ typeConformance, rangeConformance, patternConformance,
     *   businessRules, fileStructure, score }`.
     */
    analyze() {
        const start = performance.now();
        // 1. Data type conformance
        const typeConformance = this.analyzeTypeConformance();
        // 2. Range conformance
        const rangeConformance = this.analyzeRangeConformance();
        // 3. Pattern conformance
        const patternConformance = this.analyzePatternConformance();
        // 4. Business rules validation
        const businessRules = this.validateBusinessRules();
        // 5. File structure validation
        const fileStructure = this.analyzeFileStructure();
        // 6. Calculate overall score
        const score = this.calculateValidityScore(typeConformance, rangeConformance, patternConformance, businessRules);
        logger_1.logger.debug(`Validity analysis completed in ${(performance.now() - start).toFixed(2)}ms`, {
            operation: 'validity-analysis',
        });
        return {
            typeConformance,
            rangeConformance,
            patternConformance,
            businessRules,
            fileStructure,
            score,
        };
    }
    /**
     * Compares each column's declared type with the types inferred from its values.
     *
     * @returns {object[]} One conformance record per header.
     */
    analyzeTypeConformance() {
        return this.headers.map((columnName, colIdx) => {
            const expectedType = this.columnTypes[colIdx];
            const actualType = this.inferActualType(colIdx);
            const conformanceResults = this.checkTypeConformance(colIdx, expectedType);
            return {
                columnName,
                expectedType: this.formatDataType(expectedType),
                actualType: this.formatDataType(actualType.type),
                confidence: actualType.confidence,
                nonConformingCount: conformanceResults.nonConformingCount,
                conformancePercentage: conformanceResults.conformancePercentage,
                examples: conformanceResults.examples,
                conversionStrategy: this.suggestConversionStrategy(expectedType, actualType.type, conformanceResults),
            };
        });
    }
    /**
     * Checks every numeric column against its custom or name-inferred range.
     *
     * @returns {object[]} One record per numeric column with the violation count
     *   and up to 10 example outliers.
     */
    analyzeRangeConformance() {
        const numericColumns = this.headers
            .map((header, idx) => ({ header, idx, type: this.columnTypes[idx] }))
            .filter(({ type }) => this.isNumericType(type));
        return numericColumns.map(({ header, idx }) => {
            const range = this.customRanges[header] || this.inferReasonableRange(header);
            const violations = this.findRangeViolations(idx, range);
            return {
                columnName: header,
                expectedRange: this.formatRange(range),
                violationsCount: violations.length,
                outliers: violations.slice(0, 10), // Limit examples
            };
        });
    }
    /**
     * Checks columns that have a custom or inferred pattern against that pattern.
     *
     * @returns {object[]} One record per pattern-bearing column. `violationsCount`
     *   is the full number of offending cells; `examples` holds at most 5 of them.
     */
    analyzePatternConformance() {
        const patternColumns = [];
        this.headers.forEach((columnName, colIdx) => {
            let pattern;
            let patternName;
            // Check custom patterns first
            if (this.customPatterns[columnName]) {
                pattern = this.customPatterns[columnName];
                patternName = 'Custom Pattern';
            }
            else {
                // Infer pattern based on column name and content
                const inference = this.inferPattern(columnName, colIdx);
                if (inference) {
                    pattern = inference.pattern;
                    patternName = inference.name;
                }
            }
            if (pattern && patternName) {
                // Use the uncapped count: the example list is truncated, so its
                // length must not be reported as the number of violations.
                const { count, examples } = this.collectPatternViolations(colIdx, pattern);
                patternColumns.push({
                    columnName,
                    expectedPattern: patternName,
                    violationsCount: count,
                    examples: examples.slice(0, 5),
                });
            }
        });
        return patternColumns;
    }
    /**
     * Evaluates each configured business rule.
     *
     * @returns {object[]} Each rule spread together with `violations`,
     *   `averageDiscrepancy` and up to 5 `examples`.
     */
    validateBusinessRules() {
        return this.businessRules.map((rule) => {
            const violations = this.evaluateBusinessRule(rule);
            return {
                ...rule,
                violations: violations.count,
                averageDiscrepancy: violations.averageDiscrepancy,
                examples: violations.examples.slice(0, 5),
            };
        });
    }
    /**
     * Checks that rows share a column count and that the header count matches
     * the declared column count.
     *
     * @returns {{consistentColumnCount: boolean, headerConformance: boolean,
     *   deviatingRows: (number|undefined)}} Structure report.
     */
    analyzeFileStructure() {
        // Check column count consistency
        const columnCounts = this.data.map((row) => row?.length || 0);
        const modalColumnCount = this.getMostFrequent(columnCounts);
        const deviatingRows = columnCounts.filter((count) => count !== modalColumnCount).length;
        // Header conformance (simplified check)
        const headerConformance = this.headers.length === this.columnCount;
        return {
            consistentColumnCount: deviatingRows === 0,
            headerConformance,
            deviatingRows: deviatingRows > 0 ? deviatingRows : undefined,
        };
    }
    /**
     * Infers the dominant value type of a column by majority vote.
     *
     * @param {number} colIdx - Column index.
     * @returns {{type: *, confidence: number}} Dominant type and the rounded
     *   percentage of non-empty values that have it; UNKNOWN with confidence 0
     *   when the column has no non-empty values.
     */
    inferActualType(colIdx) {
        const typeCounts = {};
        let validValues = 0;
        for (let rowIdx = 0; rowIdx < this.rowCount; rowIdx++) {
            const value = this.data[rowIdx]?.[colIdx];
            if (this.isValidValue(value)) {
                validValues++;
                const inferredType = this.inferValueType(value);
                typeCounts[inferredType] = (typeCounts[inferredType] || 0) + 1;
            }
        }
        if (validValues === 0) {
            return { type: types_1.DataType.UNKNOWN, confidence: 0 };
        }
        // Find most common type
        const dominantType = Object.entries(typeCounts).sort(([, a], [, b]) => b - a)[0];
        const confidence = (dominantType[1] / validValues) * 100;
        return {
            type: dominantType[0],
            confidence: Math.round(confidence),
        };
    }
    /**
     * Counts the non-empty values of a column whose inferred type is neither the
     * expected type nor compatible with it.
     *
     * @param {number} colIdx - Column index.
     * @param {*} expectedType - Declared `DataType` of the column.
     * @returns {{nonConformingCount: number, conformancePercentage: number,
     *   examples: string[]}} Counts plus up to 5 offending values; the percentage
     *   is 0 when the column has no non-empty values.
     */
    checkTypeConformance(colIdx, expectedType) {
        const nonConformingExamples = [];
        let nonConformingCount = 0;
        let validValueCount = 0;
        const expectsNumber = this.isNumericType(expectedType);
        for (let rowIdx = 0; rowIdx < this.rowCount; rowIdx++) {
            const value = this.data[rowIdx]?.[colIdx];
            if (!this.isValidValue(value)) {
                continue;
            }
            validValueCount++;
            const actualType = this.inferValueType(value);
            if (actualType === expectedType || this.isCompatibleType(actualType, expectedType)) {
                continue;
            }
            // inferValueType classifies the literals "0" and "1" as BOOLEAN, but in
            // a numeric column they are ordinary numbers and must not be flagged.
            if (expectsNumber && actualType === types_1.DataType.BOOLEAN && this.isBinaryDigit(value)) {
                continue;
            }
            nonConformingCount++;
            if (nonConformingExamples.length < 5) {
                nonConformingExamples.push(String(value));
            }
        }
        const conformancePercentage = validValueCount > 0 ? ((validValueCount - nonConformingCount) / validValueCount) * 100 : 0;
        return {
            nonConformingCount,
            conformancePercentage,
            examples: nonConformingExamples,
        };
    }
    /**
     * Classifies one cell value as BOOLEAN, INTEGER, FLOAT, DATE, DATETIME or STRING.
     *
     * The value is coerced with `String()` first so that cells which are already
     * numbers or booleans do not throw on `.trim()`.
     *
     * @param {*} value - Non-empty cell value.
     * @returns {*} A `DataType` member.
     */
    inferValueType(value) {
        const trimmed = String(value).trim();
        // Boolean check
        if (['true', 'false', '1', '0', 'yes', 'no', 'y', 'n'].includes(trimmed.toLowerCase())) {
            return types_1.DataType.BOOLEAN;
        }
        // Number checks
        if (/^-?\d+$/.test(trimmed)) {
            return types_1.DataType.INTEGER;
        }
        if (/^-?\d*\.?\d+([eE][+-]?\d+)?$/.test(trimmed)) {
            return types_1.DataType.FLOAT;
        }
        // Date checks
        if (this.looksLikeDate(trimmed)) {
            return this.looksLikeDateTime(trimmed) ? types_1.DataType.DATETIME : types_1.DataType.DATE;
        }
        return types_1.DataType.STRING;
    }
    /**
     * @param {string} value - Trimmed text.
     * @returns {boolean} True when the text matches a known date layout or is
     *   accepted by `Date.parse`.
     */
    looksLikeDate(value) {
        return (ValidityAnalyzer.DATE_PATTERNS.some((pattern) => pattern.test(value)) ||
            !Number.isNaN(Date.parse(value)));
    }
    /**
     * @param {string} value - Text already recognised as a date.
     * @returns {boolean} True when the text also carries a time component:
     *   either a colon or a "T" separator between two digits. Testing for a bare
     *   letter "t" would misclassify dates such as "October 5, 2020".
     */
    looksLikeDateTime(value) {
        return value.includes(':') || /\d[tT]\d/.test(value);
    }
    /**
     * @param {*} actual - Type inferred from a value.
     * @param {*} expected - Declared column type.
     * @returns {boolean} True when `actual` is an accepted stand-in for `expected`.
     */
    isCompatibleType(actual, expected) {
        // Define compatibility rules - include CSV type conversion scenarios
        const compatibilityMap = {
            [types_1.DataType.NUMBER]: [types_1.DataType.INTEGER, types_1.DataType.FLOAT, types_1.DataType.STRING],
            [types_1.DataType.INTEGER]: [types_1.DataType.NUMBER, types_1.DataType.FLOAT, types_1.DataType.STRING],
            [types_1.DataType.FLOAT]: [types_1.DataType.NUMBER, types_1.DataType.INTEGER, types_1.DataType.STRING],
            // CSV files initially parse as STRING, so detecting proper types is good conformance
            [types_1.DataType.STRING]: [
                types_1.DataType.NUMBER,
                types_1.DataType.INTEGER,
                types_1.DataType.FLOAT,
                types_1.DataType.DATE,
                // DATETIME is a detected type like DATE and is accepted for the same reason.
                types_1.DataType.DATETIME,
                types_1.DataType.BOOLEAN,
            ],
        };
        return compatibilityMap[expected]?.includes(actual) || false;
    }
    /**
     * @param {*} type - A `DataType` member.
     * @returns {boolean} True for NUMBER, INTEGER and FLOAT.
     */
    isNumericType(type) {
        return (type === types_1.DataType.NUMBER ||
            type === types_1.DataType.INTEGER ||
            type === types_1.DataType.FLOAT);
    }
    /**
     * @param {*} value - Cell value.
     * @returns {boolean} True when the trimmed text is exactly "0" or "1".
     */
    isBinaryDigit(value) {
        const text = String(value).trim();
        return text === '0' || text === '1';
    }
    /**
     * Picks a human-readable remediation hint for a column.
     *
     * @param {*} expected - Declared column type.
     * @param {*} _actual - Inferred type (currently unused).
     * @param {{conformancePercentage: number}} conformanceResults - Conformance summary.
     * @returns {string} Suggested strategy.
     */
    suggestConversionStrategy(expected, _actual, conformanceResults) {
        if (conformanceResults.conformancePercentage > 95) {
            return 'No conversion needed - high conformance';
        }
        if (conformanceResults.conformancePercentage < 50) {
            return 'Manual review recommended - low conformance rate';
        }
        switch (expected) {
            case types_1.DataType.NUMBER:
            case types_1.DataType.INTEGER:
            case types_1.DataType.FLOAT:
                return 'Attempt numeric conversion, flag non-convertible values';
            case types_1.DataType.DATE:
            case types_1.DataType.DATETIME:
                return 'Parse with multiple date formats, standardise to ISO 8601';
            case types_1.DataType.BOOLEAN:
                return 'Map common boolean representations (Yes/No, 1/0, True/False)';
            default:
                return 'Convert to string with validation';
        }
    }
    /**
     * Guesses a plausible numeric range from keywords in the column name.
     *
     * @param {string} columnName - Column header.
     * @returns {{min?: number, max?: number}} Range; empty object means unconstrained.
     */
    inferReasonableRange(columnName) {
        const lower = columnName.toLowerCase();
        // Age-related columns
        if (lower.includes('age')) {
            return { min: 0, max: 120 };
        }
        // Percentage columns
        if (lower.includes('percent') || lower.includes('rate') || lower.includes('%')) {
            return { min: 0, max: 100 };
        }
        // Price/amount columns
        if (lower.includes('price') || lower.includes('amount') || lower.includes('cost')) {
            return { min: 0 }; // No upper limit for prices
        }
        // Year columns
        if (lower.includes('year')) {
            return { min: 1900, max: new Date().getFullYear() + 10 };
        }
        // Rating columns
        if (lower.includes('rating') || lower.includes('score')) {
            return { min: 0, max: 10 }; // Assuming 0-10 scale
        }
        return {}; // No range constraints
    }
    /**
     * Lists every numeric cell of a column that falls outside `range`.
     *
     * @param {number} colIdx - Column index.
     * @param {{min?: number, max?: number}} range - Inclusive bounds.
     * @returns {Array<{value: number, rowIndex: number}>} All violations found.
     */
    findRangeViolations(colIdx, range) {
        const violations = [];
        for (let rowIdx = 0; rowIdx < this.rowCount; rowIdx++) {
            const value = this.data[rowIdx]?.[colIdx];
            if (!this.isValidValue(value)) {
                continue;
            }
            const numValue = Number(value);
            if (Number.isNaN(numValue)) {
                continue;
            }
            const violatesMin = range.min !== undefined && numValue < range.min;
            const violatesMax = range.max !== undefined && numValue > range.max;
            if (violatesMin || violatesMax) {
                violations.push({ value: numValue, rowIndex: rowIdx });
            }
        }
        return violations;
    }
    /**
     * Chooses a pattern for a column from its name, falling back to a
     * fixed-format pattern derived from sampled values.
     *
     * @param {string} columnName - Column header.
     * @param {number} colIdx - Column index.
     * @returns {{pattern: string, name: string}|null} Pattern source and label,
     *   or null when nothing applies.
     */
    inferPattern(columnName, colIdx) {
        const lower = columnName.toLowerCase();
        // Email patterns
        if (lower.includes('email') || lower.includes('mail')) {
            return { pattern: validation_patterns_1.EMAIL_PATTERN.source, name: 'Email Format' };
        }
        // Phone patterns
        if (lower.includes('phone') || lower.includes('tel') || lower.includes('mobile')) {
            return { pattern: validation_patterns_1.PHONE_PATTERN.source, name: 'Phone Number Format' };
        }
        // URL patterns
        if (lower.includes('url') || lower.includes('website') || lower.includes('link')) {
            return { pattern: validation_patterns_1.URL_PATTERN.source, name: 'URL Format' };
        }
        // Sample values to infer pattern
        const sampleValues = this.getSampleValues(colIdx, 50);
        // Check if all values follow a consistent pattern
        if (sampleValues.length > 10) {
            const patterns = this.detectCommonPatterns(sampleValues);
            if (patterns.length > 0) {
                return { pattern: patterns[0].pattern, name: patterns[0].name };
            }
        }
        return null;
    }
    /**
     * Takes an evenly strided sample of non-empty values from a column.
     *
     * @param {number} colIdx - Column index.
     * @param {number} maxSamples - Maximum number of values to return.
     * @returns {string[]} Trimmed string form of the sampled values.
     */
    getSampleValues(colIdx, maxSamples) {
        const values = [];
        const step = Math.max(1, Math.floor(this.rowCount / maxSamples));
        for (let rowIdx = 0; rowIdx < this.rowCount && values.length < maxSamples; rowIdx += step) {
            const value = this.data[rowIdx]?.[colIdx];
            if (this.isValidValue(value)) {
                // String() keeps non-string cells from throwing on .trim().
                values.push(String(value).trim());
            }
        }
        return values;
    }
    /**
     * Derives a fixed-format pattern when all sampled values share one length
     * greater than 5; the character classes come from the first value only.
     *
     * @param {string[]} values - Sampled values.
     * @returns {Array<{pattern: string, name: string}>} Zero or one pattern.
     */
    detectCommonPatterns(values) {
        const patterns = [];
        // Check for consistent length and character patterns
        const lengths = values.map((v) => v.length);
        const uniqueLengths = [...new Set(lengths)];
        if (uniqueLengths.length === 1 && uniqueLengths[0] > 5) {
            // All values have same length - might be a code pattern
            const firstValue = values[0];
            let pattern = '';
            for (let i = 0; i < firstValue.length; i++) {
                const char = firstValue[i];
                if (/\d/.test(char)) {
                    pattern += '\\d';
                }
                else if (/[a-zA-Z]/.test(char)) {
                    pattern += '[a-zA-Z]';
                }
                else {
                    pattern += `\\${char}`; // Escape special characters
                }
            }
            if (pattern.length > 0) {
                patterns.push({ pattern: `^${pattern}$`, name: 'Fixed Format Code' });
            }
        }
        return patterns;
    }
    /**
     * Scans a column for values that do not match `pattern`.
     *
     * @param {number} colIdx - Column index.
     * @param {string|RegExp} pattern - Pattern passed to `new RegExp`.
     * @returns {{count: number, examples: string[]}} Total number of offending
     *   cells and the first `MAX_PATTERN_EXAMPLES` of them (trimmed).
     */
    collectPatternViolations(colIdx, pattern) {
        const examples = [];
        let count = 0;
        const regex = new RegExp(pattern);
        for (let rowIdx = 0; rowIdx < this.rowCount; rowIdx++) {
            const value = this.data[rowIdx]?.[colIdx];
            if (!this.isValidValue(value)) {
                continue;
            }
            const text = String(value).trim();
            if (regex.test(text)) {
                continue;
            }
            count++;
            if (examples.length < ValidityAnalyzer.MAX_PATTERN_EXAMPLES) {
                examples.push(text);
            }
        }
        return { count, examples };
    }
    /**
     * Returns up to 10 example values of a column that do not match `pattern`.
     * The list is truncated, so its length is not the violation count.
     *
     * @param {number} colIdx - Column index.
     * @param {string|RegExp} pattern - Pattern passed to `new RegExp`.
     * @returns {string[]} Offending values, trimmed.
     */
    findPatternViolations(colIdx, pattern) {
        return this.collectPatternViolations(colIdx, pattern).examples;
    }
    /**
     * Placeholder business-rule evaluator: no rule is parsed or applied yet, so
     * the result is always empty.
     *
     * @param {object} rule - Business rule definition (currently unused).
     * @returns {{count: number, averageDiscrepancy: (number|undefined),
     *   examples: Array}} Always zero violations.
     */
    evaluateBusinessRule(rule) {
        // This is a simplified implementation
        // In practice, business rules would need a more sophisticated parser
        // (e.g. a calculation rule like "Total = Quantity * Price").
        // TODO: implement rule parsing and evaluation.
        const violations = [];
        const totalDiscrepancy = 0;
        // For now, return empty result
        return {
            count: violations.length,
            averageDiscrepancy: violations.length > 0 ? totalDiscrepancy / violations.length : undefined,
            examples: violations,
        };
    }
    /**
     * Combines the individual reports into one score in the range 0-100.
     *
     * Penalties: 40% of the type non-conformance, up to 20 points for range
     * violations, up to 15 for pattern violations and up to 25 for business-rule
     * violations. Empty inputs contribute no penalty instead of producing NaN.
     *
     * @param {object[]} typeConformance - Output of `analyzeTypeConformance`.
     * @param {object[]} rangeConformance - Output of `analyzeRangeConformance`.
     * @param {object[]} patternConformance - Output of `analyzePatternConformance`.
     * @param {object[]} businessRules - Output of `validateBusinessRules`.
     * @returns {{score: number, interpretation: string, details: string}} Score summary.
     */
    calculateValidityScore(typeConformance, rangeConformance, patternConformance, businessRules) {
        // Violations per 100 rows; 0 when there are no rows to avoid dividing by zero.
        const ratePerHundredRows = (total) => (this.rowCount > 0 ? (total / this.rowCount) * 100 : 0);
        let score = 100;
        // Type conformance penalty
        const avgTypeConformance = typeConformance.length > 0
            ? typeConformance.reduce((sum, tc) => sum + tc.conformancePercentage, 0) / typeConformance.length
            : 100; // No columns: nothing was found non-conforming
        score -= (100 - avgTypeConformance) * 0.4; // 40% weight
        // Range violations penalty
        const totalRangeViolations = rangeConformance.reduce((sum, rc) => sum + rc.violationsCount, 0);
        score -= Math.min(20, ratePerHundredRows(totalRangeViolations)); // Max 20 points off
        // Pattern violations penalty
        const totalPatternViolations = patternConformance.reduce((sum, pc) => sum + pc.violationsCount, 0);
        score -= Math.min(15, ratePerHundredRows(totalPatternViolations)); // Max 15 points off
        // Business rule violations penalty
        const totalBusinessRuleViolations = businessRules.reduce((sum, br) => sum + br.violations, 0);
        score -= Math.min(25, ratePerHundredRows(totalBusinessRuleViolations) * 2); // Max 25 points off
        score = Math.max(0, score);
        let interpretation;
        if (score >= 95) {
            interpretation = 'Excellent';
        }
        else if (score >= 85) {
            interpretation = 'Good';
        }
        else if (score >= 70) {
            interpretation = 'Fair';
        }
        else if (score >= 50) {
            interpretation = 'Needs Improvement';
        }
        else {
            interpretation = 'Poor';
        }
        return {
            score: Math.round(score * 100) / 100,
            interpretation,
            details: `${avgTypeConformance.toFixed(1)}% average type conformance, ${totalRangeViolations + totalPatternViolations + totalBusinessRuleViolations} total violations`,
        };
    }
    /**
     * @param {*} type - A `DataType` member.
     * @returns {string} Display label, or 'Unknown' for unrecognised types.
     */
    formatDataType(type) {
        switch (type) {
            case types_1.DataType.STRING:
                return 'String';
            case types_1.DataType.NUMBER:
                return 'Number';
            case types_1.DataType.INTEGER:
                return 'Integer';
            case types_1.DataType.FLOAT:
                return 'Float';
            case types_1.DataType.DATE:
                return 'Date';
            case types_1.DataType.DATETIME:
                return 'DateTime';
            case types_1.DataType.BOOLEAN:
                return 'Boolean';
            default:
                return 'Unknown';
        }
    }
    /**
     * @param {{min?: number, max?: number}} range - Range to describe.
     * @returns {string} Human-readable description of the bounds.
     */
    formatRange(range) {
        if (range.min !== undefined && range.max !== undefined) {
            return `${range.min} to ${range.max}`;
        }
        else if (range.min !== undefined) {
            return `>= ${range.min}`;
        }
        else if (range.max !== undefined) {
            return `<= ${range.max}`;
        }
        return 'No range constraint';
    }
    /**
     * @param {*} value - Cell value.
     * @returns {boolean} True unless the value is null, undefined or blank text.
     */
    isValidValue(value) {
        return value !== null && value !== undefined && String(value).trim() !== '';
    }
    /**
     * @param {Array} array - Items to tally.
     * @returns {*} The most frequent item (first one reached on ties);
     *   undefined for an empty array.
     */
    getMostFrequent(array) {
        const counts = new Map();
        for (const item of array) {
            counts.set(item, (counts.get(item) || 0) + 1);
        }
        let maxCount = 0;
        let mostFrequent = array[0];
        for (const [item, count] of counts.entries()) {
            if (count > maxCount) {
                maxCount = count;
                mostFrequent = item;
            }
        }
        return mostFrequent;
    }
}
exports.ValidityAnalyzer = ValidityAnalyzer;
//# sourceMappingURL=validity-analyzer.js.map