UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

565 lines 23.2 kB
"use strict";
/**
 * Enhanced Column Type Detection System
 * Sophisticated type inference for EDA analysis
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.EnhancedTypeDetector = void 0;
const types_1 = require("../eda/types");
const validation_patterns_1 = require("../../utils/validation-patterns");
// Pattern-based tests only inspect this many leading values of a column.
const SAMPLE_LIMIT = 100;
/**
 * Fraction of the inspected sample (the first SAMPLE_LIMIT values) for which
 * `predicate` returns true.
 *
 * @param {string[]} values - Column values.
 * @param {(value: string) => boolean} predicate - Per-value matcher.
 * @returns {number} Ratio in [0, 1]; NaN when `values` is empty (0 / 0), which
 *   fails every `>=` threshold comparison made by the callers.
 */
function sampleMatchRatio(values, predicate) {
    let matches = 0;
    for (const value of values.slice(0, SAMPLE_LIMIT)) {
        if (predicate(value)) {
            matches++;
        }
    }
    return matches / Math.min(values.length, SAMPLE_LIMIT);
}
/**
 * Enhanced Type Detector for sophisticated column type inference.
 *
 * Every `test*` method returns either `null` (no match) or a result object
 * `{ dataType, semanticType, confidence, reasons }`, where `dataType` and
 * `semanticType` are members of `EdaDataType` / `SemanticType` from
 * `../eda/types`, `confidence` is a number and `reasons` is a list of
 * human-readable strings explaining the decision.
 */
class EnhancedTypeDetector {
    // Using shared validation patterns for consistency
    // EMAIL_PATTERN, URL_PATTERN imported from validation-patterns
    // Date patterns (various formats)
    static DATE_PATTERNS = [
        /^\d{4}-\d{2}-\d{2}$/, // YYYY-MM-DD
        /^\d{2}\/\d{2}\/\d{4}$/, // MM/DD/YYYY
        /^\d{2}-\d{2}-\d{4}$/, // MM-DD-YYYY
        /^\d{4}\/\d{2}\/\d{2}$/, // YYYY/MM/DD
        /^\d{1,2}\/\d{1,2}\/\d{2,4}$/, // M/D/YY or MM/DD/YYYY
        /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}/, // ISO DateTime
        /^\d{2}\/\d{2}\/\d{4}\s+\d{1,2}:\d{2}/, // MM/DD/YYYY HH:MM
    ];
    // Boolean patterns
    static BOOLEAN_PATTERNS = [
        /^(true|false)$/i,
        /^(yes|no)$/i,
        /^(y|n)$/i,
        /^(1|0)$/,
        /^(on|off)$/i,
        /^(enabled|disabled)$/i,
        /^(active|inactive)$/i,
    ];
    // Currency patterns
    static CURRENCY_PATTERNS = [
        /^\$[\d,]+\.?\d*$/, // $1,234.56
        /^[\d,]+\.?\d*\s*(USD|EUR|GBP|CAD|AUD)$/i, // 1234.56 USD
        /^(USD|EUR|GBP|CAD|AUD)\s*[\d,]+\.?\d*$/i, // USD 1234.56
    ];
    // Percentage pattern
    static PERCENTAGE_PATTERN = /^[\d.]+%$/;
    /**
     * Detect column types from sample data.
     *
     * @param {Array<{values: Array, columnName: string}>} samples - One entry per column.
     * @returns {Array<object>} One detection result per sample, in input order.
     */
    static detectColumnTypes(samples) {
        return samples.map((sample) => this.detectSingleColumnType(sample));
    }
    /**
     * Detect type for a single column.
     *
     * Runs every detection test and keeps the result with the highest
     * confidence (the earliest test wins ties), then lowers that confidence by
     * up to 0.15 in proportion to the share of missing values.
     *
     * @param {{values: Array, columnName: string}} sample - Raw column values and name.
     * @returns {object} `{ dataType, semanticType, confidence, reasons }`.
     */
    static detectSingleColumnType(sample) {
        const { values } = sample;
        // Every test lower-cases the column name; normalise it once here so a
        // missing name degrades to "no name hints" instead of throwing.
        const columnName = String(sample.columnName ?? '');
        const totalValues = values.length;
        // Normalise to trimmed strings, then drop empties. Trimming happens
        // BEFORE the emptiness check so whitespace-only cells count as missing
        // rather than slipping through as zero-length "valid" values.
        const validValues = values
            .filter((v) => v !== null && v !== undefined)
            .map((v) => String(v).trim())
            .filter((v) => v !== '');
        // Share of usable values, measured before any type test runs.
        const dataQualityRatio = totalValues > 0 ? validValues.length / totalValues : 0;
        if (validValues.length === 0) {
            return {
                dataType: types_1.EdaDataType.TEXT_GENERAL,
                semanticType: types_1.SemanticType.UNKNOWN,
                confidence: 0,
                reasons: ['No valid values found'],
            };
        }
        // Run detection tests in order of specificity.
        // Numerical comes early to prevent numbers being detected as dates.
        const detectionTests = [
            () => this.testNumerical(validValues, columnName),
            () => this.testBoolean(validValues, columnName),
            () => this.testCurrency(validValues, columnName),
            () => this.testPercentage(validValues, columnName),
            () => this.testEmail(validValues, columnName),
            () => this.testURL(validValues, columnName),
            () => this.testDateTime(validValues, columnName),
            () => this.testCategorical(validValues, columnName),
            () => this.testText(validValues, columnName),
        ];
        // Run tests and find the best match
        let bestResult = {
            dataType: types_1.EdaDataType.TEXT_GENERAL,
            semanticType: types_1.SemanticType.UNKNOWN,
            confidence: 0,
            reasons: ['Default fallback'],
        };
        for (const test of detectionTests) {
            const result = test();
            // Strict ">" keeps the earlier (more specific) test on a tie.
            if (result && result.confidence > bestResult.confidence) {
                bestResult = result;
            }
        }
        // Apply data quality penalty to final confidence
        if (dataQualityRatio < 1.0) {
            const qualityPenalty = (1.0 - dataQualityRatio) * 0.15; // Up to 15% penalty for poor data quality
            bestResult.confidence = Math.max(0, bestResult.confidence - qualityPenalty);
            bestResult.reasons.push(`Data quality: ${Math.round(dataQualityRatio * 100)}% valid values`);
        }
        return bestResult;
    }
    /**
     * Test for DateTime columns.
     *
     * Bails out early when the column name or the values look categorical or
     * numeric; otherwise requires 70% (with a date-like name) or 90% (without)
     * of the sampled values to pass `isDateLike`.
     *
     * @param {string[]} values - Trimmed, non-empty string values.
     * @param {string} columnName - Column name used for hints.
     * @returns {object|null} DATE_TIME result, or null when not matched.
     */
    static testDateTime(values, columnName) {
        const lowerName = columnName.toLowerCase();
        const reasons = [];
        // Skip obvious non-date columns to prevent misclassification
        const nonDateNames = ['gender', 'sex', 'type', 'category', 'status', 'class', 'group'];
        if (nonDateNames.some((name) => lowerName.includes(name))) {
            return null;
        }
        // Check for obvious categorical values that shouldn't be dates
        const uniqueValues = new Set(values);
        const commonCategorical = [
            'male',
            'female',
            'yes',
            'no',
            'true',
            'false',
            'good',
            'bad',
            'poor',
            'excellent',
        ];
        const hasCategoricalValues = Array.from(uniqueValues).some((val) => commonCategorical.includes(val.toLowerCase()));
        if (hasCategoricalValues && uniqueValues.size <= 10) {
            return null;
        }
        // Check column name hints - be more specific about date columns
        const nameHints = [
            'date',
            'time',
            'timestamp',
            'created',
            'updated',
            'modified',
            'birth',
            'expir',
        ];
        const nameHasHint = nameHints.some((hint) => lowerName.includes(hint));
        // If column name doesn't suggest dates and has numeric name, probably not a date
        const numericNameHints = [
            'age',
            'rate',
            'pressure',
            'sugar',
            'weight',
            'height',
            'score',
            'count',
            'amount',
            'price',
            'salary',
        ];
        const nameHasNumericHint = numericNameHints.some((hint) => lowerName.includes(hint));
        if (nameHasNumericHint && !nameHasHint) {
            return null; // Don't even try date detection for clearly numeric columns
        }
        if (nameHasHint) {
            reasons.push('Column name suggests datetime');
        }
        // Test values against date patterns - be more restrictive
        const dateRatio = sampleMatchRatio(values, (value) => this.isDateLike(value));
        // Require higher confidence for date detection, especially without name hints
        const requiredRatio = nameHasHint ? 0.7 : 0.9;
        if (dateRatio >= requiredRatio) {
            reasons.push(`${Math.round(dateRatio * 100)}% of values match date patterns`);
            return {
                dataType: types_1.EdaDataType.DATE_TIME,
                semanticType: this.inferDateSemanticType(columnName),
                confidence: Math.min(0.95, 0.5 + dateRatio * 0.3 + (nameHasHint ? 0.15 : 0)),
                reasons,
            };
        }
        return null;
    }
    /**
     * Test for Boolean columns.
     *
     * Matches when the whole column has at most 3 distinct (case-insensitive)
     * values and at least 90% of the sampled values fit a boolean pattern.
     *
     * @param {string[]} values - Trimmed, non-empty string values.
     * @param {string} _columnName - Unused.
     * @returns {object|null} BOOLEAN result, or null when not matched.
     */
    static testBoolean(values, _columnName) {
        const uniqueValues = new Set(values.map((v) => v.toLowerCase()));
        const reasons = [];
        // Check for common boolean patterns
        const booleanRatio = sampleMatchRatio(values, (value) => this.isBooleanLike(value));
        // Additional checks
        if (uniqueValues.size <= 3 && booleanRatio >= 0.9) {
            reasons.push(`Only ${uniqueValues.size} unique values, ${Math.round(booleanRatio * 100)}% match boolean patterns`);
            reasons.push(`Unique values: ${Array.from(uniqueValues).join(', ')}`);
            // Higher confidence for clear boolean patterns
            let confidence = 0.7 + booleanRatio * 0.25;
            // Extra boost for classic binary patterns like 1/0, true/false
            const classicBooleanValues = new Set(['1', '0', 'true', 'false', 'yes', 'no', 'y', 'n']);
            const isClassicBoolean = Array.from(uniqueValues).every((val) => classicBooleanValues.has(val));
            if (isClassicBoolean && uniqueValues.size === 2) {
                confidence += 0.05;
            }
            return {
                dataType: types_1.EdaDataType.BOOLEAN,
                semanticType: types_1.SemanticType.STATUS,
                confidence: Math.min(0.97, confidence),
                reasons,
            };
        }
        return null;
    }
    /**
     * Test for Currency columns.
     *
     * Matches when at least 70% of the sampled values fit a currency pattern.
     *
     * @param {string[]} values - Trimmed, non-empty string values.
     * @param {string} columnName - Column name used for hints.
     * @returns {object|null} NUMERICAL_FLOAT / CURRENCY result, or null.
     */
    static testCurrency(values, columnName) {
        const lowerName = columnName.toLowerCase();
        const reasons = [];
        // Check column name hints
        const nameHints = ['price', 'cost', 'amount', 'salary', 'revenue', 'fee', 'charge'];
        const nameHasHint = nameHints.some((hint) => lowerName.includes(hint));
        if (nameHasHint) {
            reasons.push('Column name suggests currency');
        }
        // Test values against currency patterns
        const currencyRatio = sampleMatchRatio(values, (value) => this.isCurrencyLike(value));
        if (currencyRatio >= 0.7) {
            reasons.push(`${Math.round(currencyRatio * 100)}% of values match currency patterns`);
            return {
                dataType: types_1.EdaDataType.NUMERICAL_FLOAT,
                semanticType: types_1.SemanticType.CURRENCY,
                confidence: Math.min(0.95, 0.5 + currencyRatio * 0.3 + (nameHasHint ? 0.15 : 0)),
                reasons,
            };
        }
        return null;
    }
    /**
     * Test for Percentage columns.
     *
     * Matches when at least 80% of the sampled values fit PERCENTAGE_PATTERN.
     *
     * @param {string[]} values - Trimmed, non-empty string values.
     * @param {string} columnName - Column name used for hints.
     * @returns {object|null} NUMERICAL_FLOAT / PERCENTAGE result, or null.
     */
    static testPercentage(values, columnName) {
        const lowerName = columnName.toLowerCase();
        const reasons = [];
        // Check column name hints
        const nameHints = ['percent', 'rate', 'ratio', '%'];
        const nameHasHint = nameHints.some((hint) => lowerName.includes(hint));
        if (nameHasHint) {
            reasons.push('Column name suggests percentage');
        }
        // Test values against percentage pattern
        const percentageRatio = sampleMatchRatio(values, (value) => this.PERCENTAGE_PATTERN.test(value));
        if (percentageRatio >= 0.8) {
            reasons.push(`${Math.round(percentageRatio * 100)}% of values match percentage pattern`);
            return {
                dataType: types_1.EdaDataType.NUMERICAL_FLOAT,
                semanticType: types_1.SemanticType.PERCENTAGE,
                confidence: Math.min(0.95, 0.6 + percentageRatio * 0.25 + (nameHasHint ? 0.1 : 0)),
                reasons,
            };
        }
        return null;
    }
    /**
     * Test for Email columns.
     *
     * Matches when at least 90% of the sampled values pass the shared
     * EMAIL_PATTERN from validation-patterns.
     *
     * @param {string[]} values - Trimmed, non-empty string values.
     * @param {string} columnName - Column name used for hints.
     * @returns {object|null} TEXT_ADDRESS / IDENTIFIER result, or null.
     */
    static testEmail(values, columnName) {
        const lowerName = columnName.toLowerCase();
        const reasons = [];
        // Check column name hints
        const nameHasHint = lowerName.includes('email') || lowerName.includes('mail');
        if (nameHasHint) {
            reasons.push('Column name suggests email');
        }
        // Test values against email pattern
        const emailRatio = sampleMatchRatio(values, (value) => validation_patterns_1.EMAIL_PATTERN.test(value));
        if (emailRatio >= 0.9) {
            reasons.push(`${Math.round(emailRatio * 100)}% of values match email pattern`);
            return {
                dataType: types_1.EdaDataType.TEXT_ADDRESS,
                semanticType: types_1.SemanticType.IDENTIFIER,
                confidence: Math.min(0.98, 0.7 + emailRatio * 0.25 + (nameHasHint ? 0.03 : 0)),
                reasons,
            };
        }
        return null;
    }
    /**
     * Test for URL columns.
     *
     * Matches when at least 80% of the sampled values pass the shared
     * URL_PATTERN from validation-patterns.
     *
     * @param {string[]} values - Trimmed, non-empty string values.
     * @param {string} columnName - Column name used for hints.
     * @returns {object|null} TEXT_ADDRESS / IDENTIFIER result, or null.
     */
    static testURL(values, columnName) {
        const lowerName = columnName.toLowerCase();
        const reasons = [];
        // Check column name hints
        const nameHasHint = lowerName.includes('url') ||
            lowerName.includes('link') ||
            lowerName.includes('website');
        if (nameHasHint) {
            reasons.push('Column name suggests URL');
        }
        // Test values against URL pattern
        const urlRatio = sampleMatchRatio(values, (value) => validation_patterns_1.URL_PATTERN.test(value));
        if (urlRatio >= 0.8) {
            reasons.push(`${Math.round(urlRatio * 100)}% of values match URL pattern`);
            return {
                dataType: types_1.EdaDataType.TEXT_ADDRESS,
                semanticType: types_1.SemanticType.IDENTIFIER,
                confidence: Math.min(0.95, 0.6 + urlRatio * 0.3 + (nameHasHint ? 0.05 : 0)),
                reasons,
            };
        }
        return null;
    }
    /**
     * Test for Numerical columns.
     *
     * Only plain decimal literals (optional leading minus, optional fraction,
     * no separators) count as numeric, so date-like strings are not swept in.
     * Matches at a 70% numeric ratio with a numeric-looking column name, 85%
     * otherwise; reports NUMERICAL_INTEGER when at least 90% of the numeric
     * values are written without a decimal point.
     *
     * @param {string[]} values - Column values.
     * @param {string} columnName - Column name used for hints.
     * @returns {object|null} NUMERICAL_INTEGER / NUMERICAL_FLOAT result, or null.
     */
    static testNumerical(values, columnName) {
        const lowerName = columnName.toLowerCase();
        let numericCount = 0;
        let integerCount = 0;
        const reasons = [];
        // Enhanced column name hints including medical/scientific terms
        const nameHints = [
            'id',
            'count',
            'number',
            'quantity',
            'amount',
            'size',
            'length',
            'age',
            'rate',
            'pressure',
            'sugar',
            'weight',
            'height',
            'score',
            'price',
            'salary',
            'value',
            'level',
            'measurement',
        ];
        const nameHasHint = nameHints.some((hint) => lowerName.includes(hint));
        if (nameHasHint) {
            reasons.push('Column name suggests numerical data');
        }
        // Pure number: no separators that could be dates
        const plainNumberPattern = /^-?\d*\.?\d+$/;
        for (const value of values.slice(0, SAMPLE_LIMIT)) {
            const trimmedValue = String(value).trim();
            // Skip empty values (they still count in the denominator below)
            if (!trimmedValue) {
                continue;
            }
            if (!plainNumberPattern.test(trimmedValue)) {
                continue;
            }
            const num = Number(trimmedValue);
            // Number.isFinite rejects both NaN and +/-Infinity (very long digit
            // strings overflow to Infinity).
            if (Number.isFinite(num)) {
                numericCount++;
                // "1.0" is integral as a number but was written as a float
                if (Number.isInteger(num) && !trimmedValue.includes('.')) {
                    integerCount++;
                }
            }
        }
        const validSampleSize = Math.min(values.length, SAMPLE_LIMIT);
        const numericRatio = numericCount / validSampleSize;
        const integerRatio = numericCount > 0 ? integerCount / numericCount : 0;
        // Higher confidence for numerical detection, especially with name hints
        const threshold = nameHasHint ? 0.7 : 0.85;
        if (numericRatio >= threshold) {
            const isInteger = integerRatio >= 0.9;
            reasons.push(`${Math.round(numericRatio * 100)}% of values are numeric`);
            if (isInteger) {
                reasons.push(`${Math.round(integerRatio * 100)}% are integers`);
            }
            // Higher confidence scoring with more sensitivity to data quality
            let confidence = 0.5 + numericRatio * 0.35;
            if (nameHasHint) {
                confidence += 0.15;
            }
            if (numericRatio >= 0.95) {
                confidence += 0.1; // Bonus for very clean data
            }
            if (numericRatio < 0.8) {
                confidence -= 0.1; // Penalty for messy data
            }
            return {
                dataType: isInteger
                    ? types_1.EdaDataType.NUMERICAL_INTEGER
                    : types_1.EdaDataType.NUMERICAL_FLOAT,
                semanticType: this.inferNumericalSemanticType(columnName),
                confidence: Math.min(0.98, confidence),
                reasons,
            };
        }
        return null;
    }
    /**
     * Test for Categorical columns.
     *
     * A gender/sex-named column whose distinct values are all known gender
     * labels short-circuits to DEMOGRAPHIC at 0.98 confidence. Otherwise the
     * column is categorical when it has 2-100 distinct values and a low
     * unique-to-total ratio.
     *
     * @param {string[]} values - Trimmed, non-empty string values.
     * @param {string} columnName - Column name used for hints.
     * @returns {object|null} CATEGORICAL result, or null when not matched.
     */
    static testCategorical(values, columnName) {
        const lowerName = columnName.toLowerCase();
        const uniqueValues = new Set(values);
        const uniqueRatio = uniqueValues.size / values.length;
        const reasons = [];
        const valuesArray = Array.from(uniqueValues);
        // Check for gender column specifically (high priority)
        if (lowerName.includes('gender') || lowerName.includes('sex')) {
            // Exact, case-insensitive membership. Substring matching against
            // the one-letter labels 'm' / 'f' would accept any value that
            // merely contains those letters (e.g. "smith", "from").
            const genderValues = new Set(['male', 'female', 'm', 'f', 'man', 'woman']);
            // every() on an empty array is vacuously true, so require at least
            // one value before declaring the column demographic.
            const isGenderLike = valuesArray.length > 0 &&
                valuesArray.every((val) => genderValues.has(val.toLowerCase()));
            if (isGenderLike) {
                reasons.push('Column name and values indicate gender/demographic data');
                return {
                    dataType: types_1.EdaDataType.CATEGORICAL,
                    semanticType: types_1.SemanticType.DEMOGRAPHIC,
                    confidence: 0.98,
                    reasons,
                };
            }
        }
        // Check column name hints
        const nameHints = [
            'category',
            'type',
            'class',
            'group',
            'status',
            'department',
            'gender',
            'education',
            'quality',
            'level',
        ];
        const nameHasHint = nameHints.some((hint) => lowerName.includes(hint));
        if (nameHasHint) {
            reasons.push('Column name suggests categorical data');
        }
        // Categorical if reasonable unique ratio and number of categories
        // More permissive for small samples, stricter for large samples
        const maxAllowedRatio = values.length <= 10 ? 0.8 : 0.5;
        if (uniqueRatio <= maxAllowedRatio && uniqueValues.size >= 2 && uniqueValues.size <= 100) {
            reasons.push(`${uniqueValues.size} unique values (${Math.round(uniqueRatio * 100)}% of total)`);
            reasons.push('Low cardinality suggests categorical data');
            return {
                dataType: types_1.EdaDataType.CATEGORICAL,
                semanticType: this.inferCategoricalSemanticType(columnName),
                confidence: Math.min(0.9, 0.4 + (1 - uniqueRatio) * 0.35 + (nameHasHint ? 0.2 : 0)),
                reasons,
            };
        }
        return null;
    }
    /**
     * Test for Text columns (fallback).
     *
     * Always matches, with a fixed low confidence of 0.3, so that any more
     * specific test that fires will outrank it.
     *
     * @param {string[]} values - Trimmed, non-empty string values.
     * @param {string} _columnName - Unused.
     * @returns {object} TEXT_GENERAL result.
     */
    static testText(values, _columnName) {
        // Guard the division so an empty list reports 0 rather than NaN.
        const totalLength = values.reduce((sum, val) => sum + val.length, 0);
        const avgLength = values.length > 0 ? totalLength / values.length : 0;
        const reasons = [];
        reasons.push(`Average text length: ${Math.round(avgLength)} characters`);
        // Determine if it's general text or could be something else
        const semanticType = avgLength > 50 ? types_1.SemanticType.UNKNOWN : types_1.SemanticType.CATEGORY;
        return {
            dataType: types_1.EdaDataType.TEXT_GENERAL,
            semanticType,
            confidence: 0.3, // Low confidence fallback
            reasons,
        };
    }
    // Helper methods for pattern matching
    /**
     * Whether a string looks like a date or date-time.
     *
     * Accepts anything matching DATE_PATTERNS outright. Otherwise falls back
     * to Date.parse, but only for strings of 4+ characters that are not pure
     * digits, contain a date/time separator, and parse to a year in 1900-2100.
     *
     * @param {string} value - Candidate value.
     * @returns {boolean} True when the value is date-like.
     */
    static isDateLike(value) {
        // First check explicit date patterns
        if (this.DATE_PATTERNS.some((pattern) => pattern.test(value))) {
            return true;
        }
        // Be much more restrictive with Date.parse
        // Only accept if it looks like a real date format and isn't just a number
        if (value.length < 4 || /^\d+$/.test(value)) {
            return false; // Don't accept pure numbers or very short strings
        }
        // Only accept Date.parse results if the string contains date-like separators
        if (!/[-\/\s:T]/.test(value)) {
            return false; // Must contain date/time separators
        }
        const parsed = Date.parse(value);
        if (isNaN(parsed)) {
            return false;
        }
        // Additional sanity check: parsed date should be between 1900 and 2100
        const year = new Date(parsed).getFullYear();
        return year >= 1900 && year <= 2100;
    }
    /**
     * Whether a string matches any of BOOLEAN_PATTERNS.
     *
     * @param {string} value - Candidate value.
     * @returns {boolean}
     */
    static isBooleanLike(value) {
        return this.BOOLEAN_PATTERNS.some((pattern) => pattern.test(value));
    }
    /**
     * Whether a string matches any of CURRENCY_PATTERNS.
     *
     * @param {string} value - Candidate value.
     * @returns {boolean}
     */
    static isCurrencyLike(value) {
        return this.CURRENCY_PATTERNS.some((pattern) => pattern.test(value));
    }
    // Semantic type inference helpers
    /**
     * Semantic type for a date column, derived from its name only.
     *
     * @param {string} columnName - Column name.
     * @returns {*} DATE_TRANSACTION for transaction/payment names, else UNKNOWN.
     */
    static inferDateSemanticType(columnName) {
        const name = columnName.toLowerCase();
        if (name.includes('transaction') || name.includes('payment')) {
            return types_1.SemanticType.DATE_TRANSACTION;
        }
        return types_1.SemanticType.UNKNOWN;
    }
    /**
     * Semantic type for a numeric column, derived from its name only.
     * Checks run in order: percentage, age, identifier, count, rating.
     *
     * @param {string} columnName - Column name.
     * @returns {*} A SemanticType member; UNKNOWN when no hint matches.
     */
    static inferNumericalSemanticType(columnName) {
        const name = columnName.toLowerCase();
        // Add negative checks to avoid misclassification
        if (name.includes('percent') || name.includes('rate') || name.includes('%')) {
            return types_1.SemanticType.PERCENTAGE;
        }
        // More specific age detection to avoid words like "percentage", "average", "usage"
        if (name.includes('age') &&
            !name.includes('percent') &&
            !name.includes('average') &&
            !name.includes('usage') &&
            !name.includes('damage')) {
            return types_1.SemanticType.AGE;
        }
        if (name.includes('id')) {
            return types_1.SemanticType.IDENTIFIER;
        }
        if (name.includes('count') || name.includes('quantity')) {
            return types_1.SemanticType.COUNT;
        }
        if (name.includes('rating') || name.includes('score')) {
            return types_1.SemanticType.RATING;
        }
        return types_1.SemanticType.UNKNOWN;
    }
    /**
     * Semantic type for a categorical column, derived from its name only.
     *
     * @param {string} columnName - Column name.
     * @returns {*} ORGANIZATIONAL_UNIT, STATUS, or CATEGORY as the default.
     */
    static inferCategoricalSemanticType(columnName) {
        const name = columnName.toLowerCase();
        if (name.includes('department') || name.includes('unit')) {
            return types_1.SemanticType.ORGANIZATIONAL_UNIT;
        }
        if (name.includes('status') || name.includes('state')) {
            return types_1.SemanticType.STATUS;
        }
        return types_1.SemanticType.CATEGORY;
    }
}
exports.EnhancedTypeDetector = EnhancedTypeDetector;
//# sourceMappingURL=enhanced-type-detector.js.map