UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

758 lines 33 kB
"use strict";
/**
 * Pattern Validation Engine
 * Implements format validation, regex patterns, and standardization checks
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.PatternValidationEngine = void 0;
const validation_patterns_1 = require("../../utils/validation-patterns");

/** Bucket name used for values that match none of the candidate formats. */
const OTHER_FORMAT_KEY = 'Other/Unrecognized';

/** Maximum number of distinct example values kept per format bucket. */
const MAX_EXAMPLES_PER_FORMAT = 3;

/**
 * Column-name triggers and candidate formats for the unit standardization
 * analysis. Rules are evaluated in this order for every column, and a column
 * may match more than one rule.
 */
const UNIT_STANDARDIZATION_RULES = [
    {
        // Weight/mass units
        columnPattern: /(weight|mass|kg|lb|pound)/i,
        recommendedAction: 'Standardize weight units to kilograms (kg)',
        formats: [
            { pattern: /^\d+(\.\d+)?\s*kg$/i, name: 'Kilograms (kg)' },
            { pattern: /^\d+(\.\d+)?\s*lb$/i, name: 'Pounds (lb)' },
            { pattern: /^\d+(\.\d+)?\s*g$/i, name: 'Grams (g)' },
            { pattern: /^\d+(\.\d+)?\s*oz$/i, name: 'Ounces (oz)' },
            { pattern: /^\d+(\.\d+)?$/, name: 'Numeric only (no unit)' },
        ],
    },
    {
        // Length/distance units
        columnPattern: /(length|height|distance|cm|ft|in|meter)/i,
        recommendedAction: 'Standardize length units to centimeters (cm)',
        formats: [
            { pattern: /^\d+(\.\d+)?\s*cm$/i, name: 'Centimeters (cm)' },
            { pattern: /^\d+(\.\d+)?\s*m$/i, name: 'Meters (m)' },
            { pattern: /^\d+(\.\d+)?\s*ft$/i, name: 'Feet (ft)' },
            { pattern: /^\d+(\.\d+)?\s*in$/i, name: 'Inches (in)' },
            { pattern: /^\d+'\d+"?$/i, name: 'Feet\'Inches"' },
            { pattern: /^\d+(\.\d+)?$/, name: 'Numeric only (no unit)' },
        ],
    },
    {
        // Currency units
        columnPattern: /(price|cost|amount|salary|revenue|\$|£|€)/i,
        recommendedAction: 'Standardize currency format with clear currency symbols',
        formats: [
            { pattern: /^\$\d+(\.\d{2})?$/i, name: 'USD ($123.45)' },
            { pattern: /^£\d+(\.\d{2})?$/i, name: 'GBP (£123.45)' },
            { pattern: /^€\d+(\.\d{2})?$/i, name: 'EUR (€123.45)' },
            { pattern: /^\d+(\.\d{2})?\s*(USD|GBP|EUR)$/i, name: 'Number with currency code' },
            { pattern: /^\d+(\.\d{2})?$/, name: 'Numeric only (no currency)' },
        ],
    },
    {
        // Temperature units
        columnPattern: /(temperature|temp|°|degree)/i,
        recommendedAction: 'Standardize temperature units to Celsius (°C)',
        formats: [
            { pattern: /^\d+(\.\d+)?\s*°?C$/i, name: 'Celsius (°C)' },
            { pattern: /^\d+(\.\d+)?\s*°?F$/i, name: 'Fahrenheit (°F)' },
            { pattern: /^\d+(\.\d+)?\s*K$/i, name: 'Kelvin (K)' },
            { pattern: /^\d+(\.\d+)?$/, name: 'Numeric only (no unit)' },
        ],
    },
];

/**
 * Column-name triggers and candidate formats for the general format
 * consistency analysis (dates, phone numbers, boolean representations).
 * Within one rule the first matching format wins, so order matters.
 */
const FORMAT_CONSISTENCY_RULES = [
    {
        // Date format consistency
        columnPattern: /(date|created|updated|birth|expir)/i,
        formats: [
            { pattern: /^\d{4}-\d{2}-\d{2}/, name: 'ISO 8601 (YYYY-MM-DD)' },
            { pattern: /^\d{2}\/\d{2}\/\d{4}/, name: 'US Format (MM/DD/YYYY)' },
            { pattern: /^\d{2}\/\d{2}\/\d{2}/, name: 'Short US (MM/DD/YY)' },
            { pattern: /^\d{1,2}-\d{1,2}-\d{4}/, name: 'Dash Format (M-D-YYYY)' },
            { pattern: /^\w{3}\s+\d{1,2},?\s+\d{4}/, name: 'Text Format (Mon DD, YYYY)' },
        ],
    },
    {
        // Phone format consistency
        columnPattern: /(phone|tel|mobile|cell)/i,
        formats: [
            { pattern: /^\(\d{3}\)\s\d{3}-\d{4}/, name: '(XXX) XXX-XXXX' },
            { pattern: /^\d{3}-\d{3}-\d{4}/, name: 'XXX-XXX-XXXX' },
            { pattern: /^\d{3}\.\d{3}\.\d{4}/, name: 'XXX.XXX.XXXX' },
            { pattern: /^\+1\s\d{3}\s\d{3}\s\d{4}/, name: '+1 XXX XXX XXXX' },
            { pattern: /^\d{10}/, name: 'XXXXXXXXXX' },
        ],
    },
    {
        // Boolean representation consistency
        columnPattern: /(is|has|can|should|enabled|active|valid)/i,
        formats: [
            { pattern: /^(true|false)$/i, name: 'true/false' },
            { pattern: /^(yes|no)$/i, name: 'yes/no' },
            { pattern: /^(y|n)$/i, name: 'y/n' },
            { pattern: /^(1|0)$/, name: '1/0' },
            { pattern: /^(on|off)$/i, name: 'on/off' },
        ],
    },
];

/** Column names that trigger the casing consistency analysis. */
const CASING_COLUMN_PATTERN = /(name|title|city|company|description)/i;

/**
 * Tests `value` against `regex` without being affected by leftover state.
 *
 * Regexes carrying the `g` or `y` flag remember `lastIndex` between calls to
 * `test()`, which makes repeated checks against different values unreliable.
 * Resetting `lastIndex` first makes every call start from the beginning.
 *
 * @param {RegExp} regex - Pattern to evaluate (may be caller supplied).
 * @param {string} value - Value to test.
 * @returns {boolean} Whether the value matches.
 */
function testPattern(regex, value) {
    regex.lastIndex = 0;
    return regex.test(value);
}

/**
 * Returns the trimmed cell text, or `null` when the cell should be skipped
 * (falsy, not a string, or blank after trimming).
 *
 * @param {*} value - Raw cell value.
 * @returns {string|null} Trimmed text or `null`.
 */
function normalizeCell(value) {
    if (!value || typeof value !== 'string') {
        return null;
    }
    const trimmed = value.trim();
    return trimmed === '' ? null : trimmed;
}

/**
 * Increments the counter for `key` and remembers `value` as an example for
 * that bucket while fewer than MAX_EXAMPLES_PER_FORMAT distinct examples are held.
 *
 * @param {Map<string, number>} counts - Occurrence counts per bucket.
 * @param {Map<string, Set<string>>} examples - Example values per bucket.
 * @param {string} key - Bucket name.
 * @param {string} value - Value that fell into the bucket.
 */
function recordFormatHit(counts, examples, key, value) {
    counts.set(key, (counts.get(key) || 0) + 1);
    let bucket = examples.get(key);
    if (!bucket) {
        bucket = new Set();
        examples.set(key, bucket);
    }
    if (bucket.size < MAX_EXAMPLES_PER_FORMAT) {
        bucket.add(value);
    }
}

/**
 * Converts the bucket counters into report rows sorted by descending count.
 *
 * @param {Map<string, number>} counts - Occurrence counts per bucket.
 * @param {Map<string, Set<string>>} examples - Example values per bucket.
 * @param {number} totalValues - Number of values that were classified.
 * @returns {Array<{format: string, count: number, percentage: string, examples: string[]}>}
 */
function rankFormats(counts, examples, totalValues) {
    return Array.from(counts.entries())
        .sort((a, b) => b[1] - a[1])
        .map(([format, count]) => ({
            format,
            count,
            percentage: ((count / totalValues) * 100).toFixed(1),
            examples: Array.from(examples.get(format) || []),
        }));
}

/**
 * Validates tabular data against named regex patterns selected by column
 * name, and reports columns whose values mix several formats, units or
 * casings.
 *
 * Rows are read as `data[rowIndex][colIndex]`; only non-empty string cells
 * are inspected, everything else is skipped.
 */
class PatternValidationEngine {
    data;
    headers;
    config;
    patterns = [];
    violations = [];

    /**
     * @param {Array<Array<*>>} data - Rows, each indexable by column index.
     * @param {string[]} headers - Column names, index-aligned with each row.
     * @param {object} [config] - Optional overrides. Recognised keys:
     *   `enableBuiltInPatterns` (default true), `maxViolationsPerPattern`
     *   (default 100), `enableFormatStandardization` (default true) and
     *   `customPatterns` (extra pattern definitions; only enabled ones are used).
     */
    constructor(data, headers, config = {}) {
        this.data = data;
        this.headers = headers;
        this.config = {
            enableBuiltInPatterns: true,
            maxViolationsPerPattern: 100,
            enableFormatStandardization: true,
            ...config,
        };
        this.initializePatterns();
    }

    /**
     * Runs every enabled pattern whose `columnPattern` matches a header
     * against that column, then (when enabled) the format and unit
     * consistency analyses. Previously collected violations are discarded.
     *
     * @returns {{patternValidations: object[], formatConsistency: object[], totalViolations: number}}
     */
    validatePatterns() {
        this.violations = [];
        // Validate patterns
        for (let colIndex = 0; colIndex < this.headers.length; colIndex++) {
            const columnName = this.headers[colIndex];
            const applicablePatterns = this.patterns.filter((p) => p.enabled && testPattern(p.columnPattern, columnName));
            for (const pattern of applicablePatterns) {
                this.validateColumnPattern(colIndex, columnName, pattern);
            }
        }
        // Analyze format consistency
        let formatConsistency = [];
        if (this.config.enableFormatStandardization) {
            formatConsistency = this.analyzeFormatConsistency();
            // Add unit standardization analysis
            formatConsistency.push(...this.addUnitStandardizationAnalysis());
        }
        return {
            patternValidations: this.generatePatternReport(),
            formatConsistency,
            totalViolations: this.violations.length,
        };
    }

    /**
     * Populates `this.patterns` with the built-in patterns (when
     * `enableBuiltInPatterns` is set) followed by the enabled custom patterns.
     *
     * Custom patterns are registered regardless of `enableBuiltInPatterns`;
     * disabling the built-in set must not discard caller-supplied patterns.
     */
    initializePatterns() {
        if (this.config.enableBuiltInPatterns) {
            this.addCorePatterns();
            // Additional enhanced patterns
            this.addInternationalPatterns();
            this.addBusinessPatterns();
            this.addSecurityPatterns();
            this.addEducationalPatterns();
        }
        // Add custom patterns if provided
        if (this.config.customPatterns) {
            this.patterns.push(...this.config.customPatterns.filter((p) => p.enabled));
        }
    }

    /**
     * Core built-in patterns (email, US phone/SSN/ZIP/state, credit card, URL,
     * ISO 8601 date, UUID).
     */
    addCorePatterns() {
        this.patterns.push(
        // Email validation
        {
            id: 'email_format',
            name: 'Email Format Validation',
            description: 'Email addresses should follow standard email format',
            columnPattern: /(email|e-mail|mail)/i,
            valuePattern: validation_patterns_1.EMAIL_PATTERN,
            severity: 'high',
            examples: ['user@example.com', 'john.doe+newsletter@company.co.uk'],
            enabled: true,
        }, 
        // Phone number validation (North American)
        {
            id: 'phone_nanp',
            name: 'North American Phone Number',
            description: 'Phone numbers should follow NANP format',
            columnPattern: /(phone|tel|mobile|cell)/i,
            valuePattern: /^(\+1[-.\s]?)?\(?[2-9]\d{2}\)?[-.\s]?[2-9]\d{2}[-.\s]?\d{4}$/,
            severity: 'medium',
            examples: ['(555) 123-4567', '+1-555-123-4567', '555.123.4567'],
            enabled: true,
        }, 
        // SSN validation (US)
        {
            id: 'ssn_us',
            name: 'US Social Security Number',
            description: 'SSN should follow XXX-XX-XXXX format',
            columnPattern: /(ssn|social.*security|tax.*id)/i,
            valuePattern: /^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$/,
            severity: 'critical',
            examples: ['123-45-6789'],
            enabled: true,
        }, 
        // Credit card validation (basic Luhn check would be ideal)
        {
            id: 'credit_card',
            name: 'Credit Card Number',
            description: 'Credit card numbers should be 13-19 digits',
            columnPattern: /(card|credit|payment)/i,
            valuePattern: /^\d{13,19}$/,
            severity: 'critical',
            examples: ['4532123456789012'],
            enabled: true,
        }, 
        // URL validation
        {
            id: 'url_format',
            name: 'URL Format Validation',
            description: 'URLs should follow standard HTTP/HTTPS format',
            columnPattern: /(url|website|link|homepage)/i,
            valuePattern: /^https?:\/\/(?:[-\w.])+(?:\:[0-9]+)?(?:\/(?:[\w\/_.])*(?:\?(?:[\w&=%.])*)?(?:\#(?:[\w.])*)?)?$/,
            severity: 'medium',
            examples: ['https://example.com', 'http://subdomain.example.com/path'],
            enabled: true,
        }, 
        // Date format validation (ISO 8601)
        {
            id: 'date_iso8601',
            name: 'ISO 8601 Date Format',
            description: 'Dates should follow ISO 8601 format (YYYY-MM-DD)',
            columnPattern: /(date|created|updated|birth|expir)/i,
            valuePattern: /^\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}:\d{2}(?:\.\d{3})?(?:Z|[+-]\d{2}:\d{2})?)?$/,
            severity: 'low',
            examples: ['2023-12-31', '2023-12-31T23:59:59Z'],
            enabled: true,
        }, 
        // Postal code validation (US ZIP)
        {
            id: 'zip_us',
            name: 'US ZIP Code',
            description: 'US ZIP codes should be 5 digits or 5+4 format',
            columnPattern: /(zip|postal|postcode)/i,
            valuePattern: /^\d{5}(-\d{4})?$/,
            severity: 'medium',
            examples: ['12345', '12345-6789'],
            enabled: true,
        }, 
        // State code validation (US)
        {
            id: 'state_us',
            name: 'US State Code',
            description: 'US state codes should be 2-letter abbreviations',
            columnPattern: /^state$/i,
            valuePattern: /^(AL|AK|AZ|AR|CA|CO|CT|DE|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|VA|WA|WV|WI|WY)$/i,
            severity: 'medium',
            examples: ['CA', 'NY', 'TX'],
            enabled: true,
        }, 
        // UUID validation
        {
            id: 'uuid_format',
            name: 'UUID Format',
            description: 'UUIDs should follow standard format',
            columnPattern: /(uuid|guid|id)/i,
            valuePattern: /^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i,
            severity: 'medium',
            examples: ['123e4567-e89b-12d3-a456-426614174000'],
            enabled: true,
        });
    }

    /**
     * International format patterns
     */
    addInternationalPatterns() {
        this.patterns.push(
        // International phone numbers (E.164 format)
        {
            id: 'phone_international',
            name: 'International Phone Number (E.164)',
            description: 'International phone numbers should follow E.164 format',
            columnPattern: /(phone|tel|mobile|cell|contact)/i,
            valuePattern: /^\+[1-9]\d{1,14}$/,
            severity: 'medium',
            examples: ['+1234567890123', '+441234567890'],
            enabled: true,
        }, 
        // Canadian postal codes
        {
            id: 'postal_canada',
            name: 'Canadian Postal Code',
            description: 'Canadian postal codes should follow A1A 1A1 format',
            columnPattern: /(postal|postcode|zip)/i,
            valuePattern: /^[A-Z]\d[A-Z][\s-]?\d[A-Z]\d$/i,
            severity: 'medium',
            examples: ['K1A 0A6', 'M5V-3A8'],
            enabled: true,
        }, 
        // UK postal codes
        {
            id: 'postal_uk',
            name: 'UK Postal Code',
            description: 'UK postal codes should follow standard format',
            columnPattern: /(postal|postcode|zip)/i,
            valuePattern: /^[A-Z]{1,2}\d[A-Z\d]?\s?\d[A-Z]{2}$/i,
            severity: 'medium',
            examples: ['SW1A 1AA', 'M1 1AA', 'B33 8TH'],
            enabled: true,
        }, 
        // IBAN validation (basic structure)
        {
            id: 'iban_format',
            name: 'International Bank Account Number (IBAN)',
            description: 'IBAN should follow international standard format',
            columnPattern: /(iban|bank.*account)/i,
            valuePattern: /^[A-Z]{2}\d{2}[A-Z0-9]{4,30}$/i,
            severity: 'high',
            examples: ['GB82WEST12345698765432', 'DE89370400440532013000'],
            enabled: true,
        }, 
        // ISO country codes (2-letter)
        {
            id: 'country_iso2',
            name: 'ISO 3166-1 Alpha-2 Country Code',
            description: 'Country codes should follow ISO 3166-1 alpha-2 standard',
            columnPattern: /(country.*code|ctry|nation.*code)/i,
            valuePattern: /^[A-Z]{2}$/,
            severity: 'medium',
            examples: ['US', 'GB', 'CA', 'DE'],
            enabled: true,
        }, 
        // ISO currency codes
        {
            id: 'currency_iso',
            name: 'ISO 4217 Currency Code',
            description: 'Currency codes should follow ISO 4217 standard',
            columnPattern: /(currency|curr|money.*code)/i,
            valuePattern: /^[A-Z]{3}$/,
            severity: 'medium',
            examples: ['USD', 'EUR', 'GBP', 'CAD'],
            enabled: true,
        });
    }

    /**
     * Business domain patterns
     */
    addBusinessPatterns() {
        this.patterns.push(
        // EIN (US Employer Identification Number)
        {
            id: 'ein_us',
            name: 'US Employer Identification Number (EIN)',
            description: 'EIN should follow XX-XXXXXXX format',
            columnPattern: /(ein|tax.*id|employer.*id)/i,
            valuePattern: /^\d{2}-\d{7}$/,
            severity: 'high',
            examples: ['12-3456789'],
            enabled: true,
        }, 
        // Stock symbols (basic)
        {
            id: 'stock_symbol',
            name: 'Stock Trading Symbol',
            description: 'Stock symbols should be 1-5 uppercase letters',
            columnPattern: /(symbol|ticker|stock)/i,
            valuePattern: /^[A-Z]{1,5}$/,
            severity: 'low',
            examples: ['AAPL', 'MSFT', 'GOOGL'],
            enabled: true,
        }, 
        // SKU patterns (flexible business format)
        {
            id: 'sku_format',
            name: 'Stock Keeping Unit (SKU)',
            description: 'SKU should follow consistent alphanumeric format',
            columnPattern: /(sku|product.*code|item.*code)/i,
            valuePattern: /^[A-Z0-9]{3,20}(-[A-Z0-9]{1,10})*$/i,
            severity: 'medium',
            examples: ['ABC-123', 'PROD001-XL-BLU', 'SKU12345'],
            enabled: true,
        }, 
        // Invoice numbers
        {
            id: 'invoice_number',
            name: 'Invoice Number Format',
            description: 'Invoice numbers should follow consistent format',
            columnPattern: /(invoice|bill|receipt)/i,
            valuePattern: /^(INV|BILL|RCP)[-]?\d{4,10}$/i,
            severity: 'low',
            examples: ['INV-123456', 'BILL0001234', 'RCP-456789'],
            enabled: true,
        }, 
        // Purchase order numbers
        {
            id: 'po_number',
            name: 'Purchase Order Number',
            description: 'PO numbers should follow business format',
            columnPattern: /(po|purchase.*order|order.*number)/i,
            valuePattern: /^(PO|ORD)[-]?\d{4,12}$/i,
            severity: 'low',
            examples: ['PO-123456789', 'ORD001234567'],
            enabled: true,
        });
    }

    /**
     * Security and compliance patterns
     */
    addSecurityPatterns() {
        this.patterns.push(
        // Password complexity (basic)
        {
            id: 'password_complexity',
            name: 'Password Complexity',
            description: 'Passwords should meet basic complexity requirements',
            columnPattern: /(password|passwd|pwd)/i,
            valuePattern: /^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)(?=.*[@$!%*?&])[A-Za-z\d@$!%*?&]{8,}$/,
            severity: 'critical',
            examples: ['Password123!', 'MyS3cur3P@ss'],
            enabled: true,
        }, 
        // API keys (generic pattern)
        {
            id: 'api_key_format',
            name: 'API Key Format',
            description: 'API keys should follow secure format standards',
            columnPattern: /(api.*key|access.*key|secret.*key)/i,
            valuePattern: /^[A-Za-z0-9_-]{20,128}$/,
            severity: 'critical',
            examples: ['sk_test_1234567890abcdef', 'pk_live_abcd1234567890ef'],
            enabled: true,
        }, 
        // JWT tokens (basic structure)
        {
            id: 'jwt_token',
            name: 'JSON Web Token (JWT)',
            description: 'JWT tokens should have three base64 parts separated by dots',
            columnPattern: /(jwt|token|bearer)/i,
            valuePattern: /^[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+$/,
            severity: 'critical',
            examples: ['eyJhbGci.eyJzdWIi.SflKxwRJ'],
            enabled: true,
        }, 
        // Hash values (SHA-256, MD5, etc.)
        {
            id: 'hash_format',
            name: 'Cryptographic Hash',
            description: 'Hash values should be valid hexadecimal strings',
            columnPattern: /(hash|checksum|digest|sha|md5)/i,
            valuePattern: /^[a-fA-F0-9]{32,128}$/,
            severity: 'medium',
            examples: ['d41d8cd98f00b204e9800998ecf8427e', '2cf24dba4f21d4288094e4b5c0d37b16'],
            enabled: true,
        });
    }

    /**
     * Educational domain patterns
     */
    addEducationalPatterns() {
        this.patterns.push(
        // Student ID formats
        {
            id: 'student_id',
            name: 'Student ID Format',
            description: 'Student IDs should follow institutional format',
            columnPattern: /(student.*id|matriculation|enrollment.*id)/i,
            valuePattern: /^(STU|ST)?\d{6,12}$/i,
            severity: 'medium',
            examples: ['STU1234567', '123456789', 'ST001234567'],
            enabled: true,
        }, 
        // Course codes
        {
            id: 'course_code',
            name: 'Course Code Format',
            description: 'Course codes should follow academic format',
            columnPattern: /(course.*code|subject.*code|class.*code)/i,
            valuePattern: /^[A-Z]{2,4}\d{3,4}[A-Z]?$/i,
            severity: 'medium',
            examples: ['CS101', 'MATH2001', 'ENG1101A'],
            enabled: true,
        }, 
        // GPA formats
        {
            id: 'gpa_format',
            name: 'Grade Point Average Format',
            description: 'GPA should be decimal number with appropriate precision',
            columnPattern: /(gpa|grade.*point)/i,
            valuePattern: /^[0-4]\.\d{1,3}$|^[0-5]\.\d{1,3}$|^[0-9]{1,2}(\.\d{1,2})?$/,
            severity: 'medium',
            examples: ['3.85', '4.0', '2.75', '85.5'],
            enabled: true,
        }, 
        // Grade letter formats
        {
            id: 'letter_grade',
            name: 'Letter Grade Format',
            description: 'Letter grades should follow standard format',
            columnPattern: /(grade|letter.*grade|final.*grade)/i,
            valuePattern: /^[A-F][+-]?$|^(HD|D|C|P|F|N)$/i,
            severity: 'low',
            examples: ['A+', 'B-', 'C', 'HD', 'P'],
            enabled: true,
        }, 
        // Academic year formats
        {
            id: 'academic_year',
            name: 'Academic Year Format',
            description: 'Academic years should follow YYYY-YY or YYYY format',
            columnPattern: /(academic.*year|school.*year|year)/i,
            valuePattern: /^(19|20)\d{2}(-\d{2})?$|^(19|20)\d{2}\/(19|20)?\d{2}$/,
            severity: 'low',
            examples: ['2023-24', '2023/24', '2023'],
            enabled: true,
        });
    }

    /**
     * Enhanced format consistency analysis with unit standardization
     *
     * For every column whose name matches a unit rule (weight, length,
     * currency, temperature), classifies the values by unit notation and
     * reports the column when the notations are mixed.
     *
     * @returns {object[]} Reports tagged `analysisType: 'unit_standardization'`.
     */
    addUnitStandardizationAnalysis() {
        const unitAnalysis = [];
        for (let colIndex = 0; colIndex < this.headers.length; colIndex++) {
            const columnName = this.headers[colIndex];
            for (const rule of UNIT_STANDARDIZATION_RULES) {
                if (!testPattern(rule.columnPattern, columnName)) {
                    continue;
                }
                const report = this.analyzeColumnFormatConsistency(colIndex, columnName, rule.formats);
                if (report) {
                    report.analysisType = 'unit_standardization';
                    report.recommendedAction = rule.recommendedAction;
                    unitAnalysis.push(report);
                }
            }
        }
        return unitAnalysis;
    }

    /**
     * Checks every non-empty string cell of one column against one pattern
     * and appends a violation for each mismatch, stopping once
     * `maxViolationsPerPattern` violations were recorded for this call.
     *
     * @param {number} colIndex - Index of the column within each row.
     * @param {string} columnName - Header of the column (stored on violations).
     * @param {object} pattern - Pattern definition with `id`, `name` and `valuePattern`.
     */
    validateColumnPattern(colIndex, columnName, pattern) {
        let violationCount = 0;
        for (let rowIndex = 0; rowIndex < this.data.length; rowIndex++) {
            if (violationCount >= this.config.maxViolationsPerPattern) {
                break;
            }
            // Skip null/empty/non-string values
            const trimmedValue = normalizeCell(this.data[rowIndex]?.[colIndex]);
            if (trimmedValue === null) {
                continue;
            }
            if (!testPattern(pattern.valuePattern, trimmedValue)) {
                this.violations.push({
                    patternId: pattern.id,
                    columnName,
                    value: trimmedValue,
                    rowIndex,
                    issue: `Value '${trimmedValue}' doesn't match ${pattern.name} pattern`,
                });
                violationCount++;
            }
        }
    }

    /**
     * Runs the date, phone and boolean format analyses plus the casing
     * analysis on every column whose name matches the respective trigger.
     *
     * @returns {object[]} One report per column/analysis that is inconsistent.
     */
    analyzeFormatConsistency() {
        const formatAnalysis = [];
        for (let colIndex = 0; colIndex < this.headers.length; colIndex++) {
            const columnName = this.headers[colIndex];
            for (const rule of FORMAT_CONSISTENCY_RULES) {
                if (!testPattern(rule.columnPattern, columnName)) {
                    continue;
                }
                const report = this.analyzeColumnFormatConsistency(colIndex, columnName, rule.formats);
                if (report) {
                    formatAnalysis.push(report);
                }
            }
            // Analyze casing consistency for text fields
            if (testPattern(CASING_COLUMN_PATTERN, columnName)) {
                const casingConsistency = this.analyzeCasingConsistency(colIndex, columnName);
                if (casingConsistency) {
                    formatAnalysis.push(casingConsistency);
                }
            }
        }
        return formatAnalysis;
    }

    /**
     * Classifies each non-empty string cell of a column by the first entry of
     * `formats` it matches (or 'Other/Unrecognized') and reports the column
     * when the dominant format covers less than 90% of the values.
     *
     * @param {number} colIndex - Index of the column within each row.
     * @param {string} columnName - Header of the column.
     * @param {Array<{pattern: RegExp, name: string}>} formats - Candidate formats in priority order.
     * @returns {object|null} Consistency report, or `null` when the column is
     *   consistent enough (or has at most one format).
     */
    analyzeColumnFormatConsistency(colIndex, columnName, formats) {
        const formatCounts = new Map();
        const examples = new Map();
        let totalValues = 0;
        // Count format occurrences
        for (let rowIndex = 0; rowIndex < this.data.length; rowIndex++) {
            const trimmedValue = normalizeCell(this.data[rowIndex]?.[colIndex]);
            if (trimmedValue === null) {
                continue;
            }
            totalValues++;
            const matched = formats.find((format) => testPattern(format.pattern, trimmedValue));
            recordFormatHit(formatCounts, examples, matched ? matched.name : OTHER_FORMAT_KEY, trimmedValue);
        }
        // Only report if there are multiple formats or issues
        if (formatCounts.size <= 1) {
            return null;
        }
        const formatArray = rankFormats(formatCounts, examples, totalValues);
        const dominantFormat = formatArray[0];
        const hasInconsistency = formatArray.length > 1 && dominantFormat.count < totalValues * 0.9;
        if (!hasInconsistency) {
            return null;
        }
        const inconsistencyCount = totalValues - dominantFormat.count;
        return {
            columnName,
            analysisType: 'format_standardization',
            currentFormats: formatArray,
            recommendedAction: `Standardize to ${dominantFormat.format} format`,
            consistency: {
                isConsistent: false,
                dominantFormat: dominantFormat.format,
                inconsistencyCount,
                inconsistencyPercentage: ((inconsistencyCount / totalValues) * 100).toFixed(1),
            },
            score: {
                score: Math.max(0, 100 - (formatArray.length - 1) * 20),
                // Only inconsistent columns reach this point, so the rating is always 'Fair'.
                interpretation: 'Fair',
            },
        };
    }

    /**
     * Classifies each non-empty string cell of a column by letter casing and
     * reports the column when the dominant casing covers less than 80% of the
     * values.
     *
     * @param {number} colIndex - Index of the column within each row.
     * @param {string} columnName - Header of the column.
     * @returns {object|null} Casing report, or `null` when the column is
     *   consistent enough (or has at most one casing style).
     */
    analyzeCasingConsistency(colIndex, columnName) {
        const casingPatterns = new Map();
        const examples = new Map();
        let totalValues = 0;
        for (let rowIndex = 0; rowIndex < this.data.length; rowIndex++) {
            const trimmedValue = normalizeCell(this.data[rowIndex]?.[colIndex]);
            if (trimmedValue === null) {
                continue;
            }
            totalValues++;
            // Checks are ordered; the first style that reproduces the value wins.
            let casingType = 'Mixed/Other';
            if (trimmedValue === trimmedValue.toLowerCase()) {
                casingType = 'lowercase';
            }
            else if (trimmedValue === trimmedValue.toUpperCase()) {
                casingType = 'UPPERCASE';
            }
            else if (trimmedValue === this.toTitleCase(trimmedValue)) {
                casingType = 'Title Case';
            }
            else if (trimmedValue === this.toPascalCase(trimmedValue)) {
                casingType = 'PascalCase';
            }
            else if (trimmedValue === this.toCamelCase(trimmedValue)) {
                casingType = 'camelCase';
            }
            recordFormatHit(casingPatterns, examples, casingType, trimmedValue);
        }
        if (casingPatterns.size <= 1) {
            return null;
        }
        const casingArray = rankFormats(casingPatterns, examples, totalValues);
        const dominantCasing = casingArray[0];
        const hasInconsistency = casingArray.length > 1 && dominantCasing.count < totalValues * 0.8;
        if (!hasInconsistency) {
            return null;
        }
        const inconsistencyCount = totalValues - dominantCasing.count;
        return {
            columnName,
            analysisType: 'casing_consistency',
            currentFormats: casingArray,
            recommendedAction: `Standardize to ${dominantCasing.format}`,
            consistency: {
                isConsistent: false,
                dominantFormat: dominantCasing.format,
                inconsistencyCount,
                inconsistencyPercentage: ((inconsistencyCount / totalValues) * 100).toFixed(1),
            },
            score: {
                score: Math.max(0, 100 - (casingArray.length - 1) * 15),
                // Only inconsistent columns reach this point, so the rating is always 'Fair'.
                interpretation: 'Fair',
            },
        };
    }

    /**
     * Groups the collected violations by pattern and builds one report entry
     * per pattern that had violations (with up to five example values).
     *
     * @returns {object[]} Report entries in first-violation order.
     */
    generatePatternReport() {
        const patternReport = [];
        const violationsByPattern = new Map();
        // Group violations by pattern
        for (const violation of this.violations) {
            if (!violationsByPattern.has(violation.patternId)) {
                violationsByPattern.set(violation.patternId, []);
            }
            violationsByPattern.get(violation.patternId).push(violation);
        }
        // Generate report for each pattern that had violations
        for (const [patternId, violations] of violationsByPattern) {
            const pattern = this.patterns.find((p) => p.id === patternId);
            if (!pattern) {
                continue;
            }
            const affectedColumns = [...new Set(violations.map((v) => v.columnName))];
            const examples = violations.slice(0, 5).map((v) => v.value);
            patternReport.push({
                patternName: pattern.name,
                description: pattern.description,
                affectedColumns,
                violationCount: violations.length,
                examples,
                severity: pattern.severity,
                recommendedAction: `Update values to match ${pattern.name} pattern`,
            });
        }
        return patternReport;
    }

    // Helper methods for casing detection

    /** Upper-cases the first character of each whitespace-delimited word and lower-cases the rest. */
    toTitleCase(str) {
        return str.replace(/\w\S*/g, (txt) => txt.charAt(0).toUpperCase() + txt.slice(1).toLowerCase());
    }

    /** Upper-cases word-initial characters and removes whitespace. */
    toPascalCase(str) {
        return str.replace(/(?:^\w|[A-Z]|\b\w)/g, (word) => word.toUpperCase()).replace(/\s+/g, '');
    }

    /** Like toPascalCase, but lower-cases a match at index 0. */
    toCamelCase(str) {
        return str
            .replace(/(?:^\w|[A-Z]|\b\w)/g, (word, index) => index === 0 ? word.toLowerCase() : word.toUpperCase())
            .replace(/\s+/g, '');
    }

    /**
     * Summarises the violations collected by the last `validatePatterns()` run.
     *
     * @returns {{totalPatternsEvaluated: number, totalViolations: number,
     *   violationsBySeverity: object, mostProblematicColumns: Array<{columnName: string, violationCount: number}>}}
     *   Severity counts plus the (up to) five columns with the most violations.
     */
    getPatternSummary() {
        const violationsBySeverity = {
            critical: 0,
            high: 0,
            medium: 0,
            low: 0,
        };
        // Index severities once; the first pattern registered under an id wins,
        // matching a linear search through this.patterns.
        const severityByPatternId = new Map();
        for (const pattern of this.patterns) {
            if (!severityByPatternId.has(pattern.id)) {
                severityByPatternId.set(pattern.id, pattern.severity);
            }
        }
        const columnViolations = new Map();
        for (const violation of this.violations) {
            if (severityByPatternId.has(violation.patternId)) {
                const severity = severityByPatternId.get(violation.patternId);
                // Custom patterns may use a severity outside the four preset keys;
                // start such counters at zero instead of producing NaN.
                violationsBySeverity[severity] = (violationsBySeverity[severity] ?? 0) + 1;
            }
            columnViolations.set(violation.columnName, (columnViolations.get(violation.columnName) || 0) + 1);
        }
        const mostProblematicColumns = Array.from(columnViolations.entries())
            .sort((a, b) => b[1] - a[1])
            .slice(0, 5)
            .map(([columnName, violationCount]) => ({ columnName, violationCount }));
        return {
            totalPatternsEvaluated: this.patterns.length,
            totalViolations: this.violations.length,
            violationsBySeverity,
            mostProblematicColumns,
        };
    }
}
exports.PatternValidationEngine = PatternValidationEngine;
//# sourceMappingURL=pattern-validation-engine.js.map