UNPKG

dwh-audit

Version:

Modular CLI tool for auditing data warehouses - extract, analyze, and report on schemas, data quality, and analytics readiness

621 lines (542 loc) 26.6 kB
#!/usr/bin/env node
/**
 * Data warehouse audit analysis.
 *
 * Reads the raw dataset metadata JSON, builds a table relationship graph,
 * scores every table for analytics (Mixpanel) compatibility, annotates each
 * schema field with its detected field types, and writes
 * `reports/dataset_audit.json` and `reports/audit_summary.csv` to the output
 * directory.
 */
import { promises as fs, realpathSync } from "fs";
import path from "path";
import { pathToFileURL } from "url";

// --- Field Pattern Detection Constants ---
import {
  EXCLUDE_AS_JOIN_KEYS,
  VALID_TIMESTAMP_TYPES,
  TIMESTAMP_FIELD_PATTERNS,
  USER_ID_PATTERNS,
  EVENT_NAME_PATTERNS,
  SESSION_PATTERNS,
  PII_PATTERNS,
  ANALYTICS_PATTERNS,
  COMMERCE_PATTERNS,
  COMMUNICATION_PATTERNS,
  CONTENT_PATTERNS,
  EVENT_PROPERTY_PATTERNS,
  FOREIGN_KEY_PATTERNS,
  GEOGRAPHIC_PATTERNS,
  NUMERIC_TIMESTAMP_PATTERNS,
  ORGANIZATIONAL_PATTERNS,
  PRIMARY_KEY_PATTERNS,
  SYSTEM_METADATA_PATTERNS
} from './entities.js';

// --- Configuration ---
const config = {
  inputFile: process.env.RAW_DATA_FILE || process.argv[2] || "./output/reports/dataset_raw.json",
  outputDir: process.argv[3] || "./output"
};

// --- Colors for Terminal Output ---
const colors = {
  red: "\x1b[31m",
  green: "\x1b[32m",
  yellow: "\x1b[33m",
  cyan: "\x1b[36m",
  magenta: "\x1b[35m",
  nc: "\x1b[0m"
};

const MS_PER_DAY = 1000 * 60 * 60 * 24;

// Substrings of a data type that mark a field as complex/nested.
const COMPLEX_TYPE_MARKERS = ['STRUCT', 'RECORD', 'JSON', 'REPEATED', 'ARRAY'];

// Every pattern group tested by detectFieldType, in priority order (when two
// groups yield the same type, the first one wins). Built once at module load
// instead of on every call. The multi-pattern groups (GEOGRAPHIC_PATTERNS,
// COMMERCE_PATTERNS, ...) are used as-is, so they are expected to already be
// arrays of { pattern, type } entries.
const FIELD_PATTERN_GROUPS = [
  { group: 'temporal', patterns: [{ pattern: TIMESTAMP_FIELD_PATTERNS, type: 'timestamp' }] },
  { group: 'temporal', patterns: [{ pattern: NUMERIC_TIMESTAMP_PATTERNS, type: 'numeric_timestamp' }] },
  { group: 'identity', patterns: [{ pattern: USER_ID_PATTERNS, type: 'user_id' }] },
  { group: 'identity', patterns: [{ pattern: SESSION_PATTERNS, type: 'session' }] },
  { group: 'identity', patterns: [{ pattern: PRIMARY_KEY_PATTERNS, type: 'primary_key' }] },
  { group: 'identity', patterns: [{ pattern: FOREIGN_KEY_PATTERNS, type: 'foreign_key' }] },
  { group: 'event', patterns: [{ pattern: EVENT_NAME_PATTERNS, type: 'event_name' }] },
  { group: 'event', patterns: [{ pattern: EVENT_PROPERTY_PATTERNS, type: 'event_properties' }] },
  { group: 'geographic', patterns: GEOGRAPHIC_PATTERNS },
  { group: 'commerce', patterns: COMMERCE_PATTERNS },
  { group: 'system', patterns: SYSTEM_METADATA_PATTERNS },
  { group: 'content', patterns: CONTENT_PATTERNS },
  { group: 'analytics', patterns: ANALYTICS_PATTERNS },
  { group: 'organizational', patterns: ORGANIZATIONAL_PATTERNS },
  { group: 'communication', patterns: COMMUNICATION_PATTERNS },
  // Include PII patterns in the main detection
  { group: 'pii', patterns: PII_PATTERNS.map(p => ({ pattern: p.pattern, type: p.type })) }
];

/**
 * Test a value against a regular expression without carrying state between
 * calls: a /g or /y regex advances `lastIndex` on every `.test()`, which
 * would make repeated checks against a shared pattern unreliable.
 *
 * @param {RegExp} pattern
 * @param {string} value
 * @returns {boolean}
 */
function matchesPattern(pattern, value) {
  if (pattern.global || pattern.sticky) {
    pattern.lastIndex = 0;
  }
  return pattern.test(value);
}

/**
 * Return the schema rows of a table, or an empty array when it has none.
 *
 * @param {object} table
 * @returns {object[]}
 */
function schemaOf(table) {
  return Array.isArray(table.schema) ? table.schema : [];
}

/**
 * Detect every semantic field type that applies to a field.
 *
 * The field name is tested against all pattern groups; the data type adds
 * `timestamp`, `boolean_flag` and `numeric_id` detections on top.
 *
 * @param {string} fieldName - Field name (callers pass it lower-cased).
 * @param {string} dataType - Declared data type of the field.
 * @returns {{type: string, group: string, by_data_type?: boolean}[]}
 *   One entry per distinct detected type, in detection order.
 */
function detectFieldType(fieldName, dataType) {
  const detectedTypes = [];

  // Test field name against all patterns
  for (const { group, patterns } of FIELD_PATTERN_GROUPS) {
    for (const { pattern, type } of patterns) {
      if (matchesPattern(pattern, fieldName)) {
        detectedTypes.push({ type, group });
      }
    }
  }

  // Special handling for data type-based detection
  if (VALID_TIMESTAMP_TYPES.has(dataType)) {
    detectedTypes.push({ type: 'timestamp', group: 'temporal', by_data_type: true });
  }

  // Check for boolean types
  if (/boolean|bool|bit/i.test(dataType) || /^(is_|has_|can_|should_|will_|was_)/i.test(fieldName)) {
    detectedTypes.push({ type: 'boolean_flag', group: 'system' });
  }

  // Check for numeric types that might be IDs
  if (/int|integer|bigint|number|numeric/i.test(dataType) && /_id$/i.test(fieldName)) {
    detectedTypes.push({ type: 'numeric_id', group: 'identity' });
  }

  // Keep the first entry for each type and drop 'unknown'
  const seen = new Set();
  return detectedTypes.filter(({ type }) => {
    if (type === 'unknown' || seen.has(type)) {
      return false;
    }
    seen.add(type);
    return true;
  });
}

/**
 * Bucket a row count into a volume category.
 *
 * @param {number} rowCount
 * @returns {'LARGE'|'MEDIUM'|'SMALL'|'UNKNOWN'}
 */
function classifyVolume(rowCount) {
  if (rowCount > 10000000) return 'LARGE';
  if (rowCount > 100000) return 'MEDIUM';
  if (rowCount > 0) return 'SMALL';
  return 'UNKNOWN';
}

/**
 * Whole days elapsed between a timestamp and now.
 *
 * @param {string|number|Date} timestamp
 * @returns {number|null} null when the timestamp cannot be parsed.
 */
function daysSince(timestamp) {
  const elapsedMs = Date.now() - new Date(timestamp).getTime();
  return Number.isNaN(elapsedMs) ? null : Math.floor(elapsedMs / MS_PER_DAY);
}

/**
 * Compute the 0-10 Mixpanel compatibility score for one analysed table.
 *
 * @param {object} analysis - Table analysis with `table_category`,
 *   `required_fields` and `schema_complexity` already populated.
 * @param {boolean} hasSessionField - Whether THIS table has a session field.
 * @returns {number} Score rounded to one decimal place.
 */
function computeCompatibilityScore(analysis, hasSessionField) {
  const { required_fields: required, schema_complexity: complexity } = analysis;
  const hasRequiredUserId = required.user_id_fields.some(f => !f.nullable);
  let score = 0;

  switch (analysis.table_category) {
    case 'EVENT':
      score = 7; // Start with high score for event tables
      // Bonus points for quality
      if (required.timestamp_fields.some(f => !f.nullable)) score += 1;
      if (hasRequiredUserId) score += 1;
      if (hasSessionField) score += 0.5;
      // Penalty for overly complex schemas
      if (complexity.complex_fields.length > 5) score -= 1;
      if (complexity.nested_depth > 3) score -= 0.5;
      score = Math.max(0, Math.min(10, score));
      break;
    case 'USER':
      score = 5; // Moderate score for user tables
      if (hasRequiredUserId) score += 1;
      if (complexity.complex_fields.length > 3) score -= 0.5;
      break;
    case 'LOOKUP':
      score = 3; // Lower score but still useful
      break;
    default:
      break;
  }

  return Math.round(score * 10) / 10;
}

/**
 * Analyse tables for analytics compatibility based on Mixpanel requirements.
 *
 * Each table is categorised as EVENT (timestamp + user id), USER (user id
 * only), LOOKUP (neither, but has a field flagged `is_potential_join_key`)
 * or UNKNOWN, and given a 0-10 compatibility score. LOOKUP detection relies
 * on the `is_potential_join_key` flags that buildLineageGraph sets on the
 * schema fields, so that function should run first on the same tables.
 *
 * @param {object[]} tables - Raw table metadata objects.
 * @returns {object} Insights: categorised table analyses, the full list in
 *   `data_quality`, and the field names seen per pattern in `field_patterns`
 *   (as arrays, ready for JSON serialisation).
 * @throws Re-throws any error hit while processing a table, after logging it.
 */
function analyzeAnalyticsCompatibility(tables) {
  const insights = {
    event_tables: [], // Tables with TIMESTAMP + USER_ID (core Mixpanel requirement)
    user_tables: [], // Tables with USER_ID (but no timestamp required)
    lookup_tables: [], // Tables with arbitrary join keys, no timestamp
    complex_fields: [], // Tables with complex nested structures
    pii_warnings: [], // Tables with potential PII fields
    data_quality: [], // All table analyses (required by TypeScript definition and UI)
    field_patterns: {
      timestamp_fields: new Set(),
      user_id_fields: new Set(),
      event_name_fields: new Set(),
      session_fields: new Set(),
      complex_fields: new Set(),
      pii_fields: new Set()
    }
  };
  const seenPatterns = insights.field_patterns;

  tables.forEach((table, index) => {
    try {
      const analysis = {
        table_name: table.table_name,
        table_type: table.table_type,
        table_category: 'UNKNOWN', // Will be: EVENT, USER, LOOKUP, or UNKNOWN
        mixpanel_compatibility: 0, // 0-10 score
        row_count: table.row_count || 0,
        size_bytes: table.size_bytes || 0,
        creation_time: table.creation_time,
        required_fields: {
          has_timestamp: false,
          has_user_id: false,
          timestamp_fields: [],
          user_id_fields: []
        },
        event_schema_type: null, // 'MULTI_SCHEMA', 'MONO_SCHEMA', or null
        schema_complexity: {
          total_fields: 0,
          complex_fields: [], // STRUCT, RECORD, JSON, REPEATED fields
          nested_depth: 0,
          total_subfields: 0
        },
        data_quality: {
          potential_pii: [],
          volume_category: 'UNKNOWN', // SMALL, MEDIUM, LARGE based on row count
          freshness: null // Days since table.creation_time
        },
        field_details: {} // Detailed info per field
      };
      const required = analysis.required_fields;
      const complexity = analysis.schema_complexity;

      // Per-table flags. These must not be derived from the global
      // field_patterns sets, which accumulate across ALL tables and would
      // leak an earlier table's event-name/session columns into this one.
      let hasEventName = false;
      let hasSessionField = false;

      // Analyze schema if available
      if (table.schema && table.schema.length > 0) {
        complexity.total_fields = table.schema.length;

        // Process each field individually to preserve nested field structure
        // Don't group by column_name as this loses nested fields like linked_device.language
        table.schema.forEach(field => {
          const fieldPath = field.nested_field_path || field.column_name;
          if (!fieldPath) {
            return; // Nothing to analyse for a field without a name
          }
          const fieldName = fieldPath.toLowerCase();
          const fieldType = field.nested_type || field.data_type || '';
          const nullable = field.is_nullable === 'YES';

          // Check for timestamps
          const isTimestampType = VALID_TIMESTAMP_TYPES.has(fieldType);
          const isTimestampName = matchesPattern(TIMESTAMP_FIELD_PATTERNS, fieldName);
          if (isTimestampType || isTimestampName) {
            required.has_timestamp = true;
            required.timestamp_fields.push({
              name: fieldPath,
              type: fieldType,
              nullable,
              by_type: isTimestampType,
              by_name: isTimestampName
            });
            seenPatterns.timestamp_fields.add(fieldName);
          }

          // Check for user IDs
          if (matchesPattern(USER_ID_PATTERNS, fieldName)) {
            required.has_user_id = true;
            required.user_id_fields.push({ name: fieldPath, type: fieldType, nullable });
            seenPatterns.user_id_fields.add(fieldName);
          }

          // Check for event names (multi-schema indicator)
          if (matchesPattern(EVENT_NAME_PATTERNS, fieldName)) {
            hasEventName = true;
            seenPatterns.event_name_fields.add(fieldName);
          }

          // Check for session fields
          if (matchesPattern(SESSION_PATTERNS, fieldName)) {
            hasSessionField = true;
            seenPatterns.session_fields.add(fieldName);
          }

          // Complex field analysis - check if this field is complex or nested
          const isNested = fieldPath.includes('.');
          const isComplexType = COMPLEX_TYPE_MARKERS.some(marker => fieldType.includes(marker));
          if (isNested || isComplexType) {
            const depth = fieldPath.split('.').length;
            complexity.complex_fields.push({
              name: fieldPath,
              type: fieldType,
              subfield_count: 1, // Each field is counted individually now
              subfields: [{ path: fieldPath, type: fieldType, depth }],
              max_nesting_depth: depth
            });
            complexity.nested_depth = Math.max(complexity.nested_depth, depth);
            complexity.total_subfields += 1;
            seenPatterns.complex_fields.add(fieldName);
          }

          // Enhanced PII detection
          const detectedPII = PII_PATTERNS
            .filter(piiInfo => matchesPattern(piiInfo.pattern, fieldName))
            .map(piiInfo => piiInfo.type);
          if (detectedPII.length > 0) {
            analysis.data_quality.potential_pii.push({ field: fieldPath, types: detectedPII });
            detectedPII.forEach(type => seenPatterns.pii_fields.add(`${fieldName}:${type}`));
          }

          // Store field details using full path as key
          analysis.field_details[fieldPath] = {
            type: fieldType,
            nullable,
            is_partitioning: field.is_partitioning_column,
            is_clustering: field.clustering_ordinal_position != null,
            subfield_count: 1,
            detected_pii: detectedPII,
            detected_field_types: detectFieldType(fieldName, fieldType)
          };
        });
      }

      // Determine table category based on Mixpanel requirements
      const hasTimestamp = required.has_timestamp;
      const hasUserId = required.has_user_id;
      if (hasTimestamp && hasUserId) {
        analysis.table_category = 'EVENT';
        // Determine if multi-schema or mono-schema
        analysis.event_schema_type = hasEventName ? 'MULTI_SCHEMA' : 'MONO_SCHEMA';
      } else if (hasUserId) {
        analysis.table_category = 'USER';
      } else if (!hasTimestamp) {
        // Tables with join keys but no timestamp/user_id are likely lookup tables
        const hasJoinableFields = table.schema?.some(field => field.is_potential_join_key);
        analysis.table_category = hasJoinableFields ? 'LOOKUP' : 'UNKNOWN';
      } else {
        analysis.table_category = 'UNKNOWN';
      }

      analysis.mixpanel_compatibility = computeCompatibilityScore(analysis, hasSessionField);
      analysis.data_quality.volume_category = classifyVolume(analysis.row_count);
      if (table.creation_time) {
        analysis.data_quality.freshness = daysSince(table.creation_time);
      }

      // Categorize into insights
      switch (analysis.table_category) {
        case 'EVENT':
          insights.event_tables.push(analysis);
          break;
        case 'USER':
          insights.user_tables.push(analysis);
          break;
        case 'LOOKUP':
          insights.lookup_tables.push(analysis);
          break;
        default:
          break;
      }
      if (complexity.complex_fields.length > 0) {
        insights.complex_fields.push(analysis);
      }
      if (analysis.data_quality.potential_pii.length > 0) {
        insights.pii_warnings.push(analysis);
      }

      // Add to global data_quality array (required by TypeScript definition and UI)
      insights.data_quality.push(analysis);
    } catch (error) {
      console.error(`Error processing table ${index} (${table.table_name}):`, error.message);
      console.error(`Error stack:`, error.stack);
      throw error;
    }
  });

  // Convert Sets to Arrays for JSON serialization
  for (const key of Object.keys(seenPatterns)) {
    seenPatterns[key] = Array.from(seenPatterns[key] || []);
  }

  return insights;
}

/**
 * Escape a string so it can be embedded literally in a RegExp source.
 *
 * @param {string} text
 * @returns {string}
 */
function escapeRegExp(text) {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Whether lower-cased SQL text mentions an identifier as a whole name, i.e.
 * not as part of a longer identifier (`user` must not match `users`).
 *
 * @param {string} sqlText - Lower-cased SQL text.
 * @param {string} identifier - Lower-cased identifier to look for.
 * @returns {boolean}
 */
function referencesIdentifier(sqlText, identifier) {
  if (!identifier) {
    return false;
  }
  const pattern = new RegExp(`(?<![a-z0-9_])${escapeRegExp(identifier)}(?![a-z0-9_])`);
  return pattern.test(sqlText);
}

/**
 * Order-independent key identifying an (undirected) pair of table names.
 *
 * @param {string} a
 * @param {string} b
 * @returns {string}
 */
function pairKey(a, b) {
  return a <= b ? JSON.stringify([a, b]) : JSON.stringify([b, a]);
}

/**
 * Build the lineage graph from table relationships.
 *
 * Nodes are all tables/views. Edges are `view_dependency` (a TABLE whose
 * name appears in a VIEW's definition) and `join_key` (two tables sharing a
 * column name not listed in EXCLUDE_AS_JOIN_KEYS). At most one edge is
 * created per pair of tables; view dependencies take precedence.
 *
 * Side effect: every schema field whose column name occurs in more than one
 * table gets `is_potential_join_key = true` set on the input object.
 *
 * @param {object[]} tables - Raw table metadata objects (mutated, see above).
 * @returns {{nodes: object[], edges: object[]}}
 */
function buildLineageGraph(tables) {
  const lineageGraph = { nodes: [], edges: [] };

  console.log(`${colors.yellow}Building table relationship graph...${colors.nc}`);

  // Add nodes for all tables and views
  for (const table of tables) {
    lineageGraph.nodes.push({
      id: table.table_name,
      type: String(table.table_type ?? 'unknown').toLowerCase(),
      row_count: table.row_count || 0,
      size_bytes: table.size_bytes || 0,
      analytics_score: 0 // Will be populated by analytics analysis
    });
  }

  // Detect join keys (fields that appear in multiple tables). A Set per field
  // is essential: one table can list the same column_name on several schema
  // rows (one per nested field) and must still count as a single occurrence.
  const tablesByField = new Map();
  for (const table of tables) {
    for (const field of schemaOf(table)) {
      const fieldName = field.column_name?.toLowerCase();
      if (!fieldName || EXCLUDE_AS_JOIN_KEYS.includes(fieldName)) {
        continue;
      }
      if (!tablesByField.has(fieldName)) {
        tablesByField.set(fieldName, new Set());
      }
      tablesByField.get(fieldName).add(table.table_name);
    }
  }

  // Keep only the fields shared by at least two distinct tables
  const joinKeyRelationships = new Map();
  for (const [fieldName, tableNames] of tablesByField) {
    if (tableNames.size > 1) {
      joinKeyRelationships.set(fieldName, [...tableNames]);
    }
  }

  // Mark potential join keys in a single pass over all fields
  for (const table of tables) {
    for (const field of schemaOf(table)) {
      if (field.column_name && joinKeyRelationships.has(field.column_name.toLowerCase())) {
        field.is_potential_join_key = true;
      }
    }
  }

  // Pairs of tables that already have an edge, in either direction
  const connectedPairs = new Set();

  // Extract view dependencies from view definitions
  const baseTables = tables.filter(t => t.table_type === 'TABLE');
  for (const view of tables) {
    if (view.table_type !== 'VIEW' || !view.view_definition) {
      continue;
    }
    const viewDef = view.view_definition.toLowerCase();
    for (const table of baseTables) {
      if (referencesIdentifier(viewDef, table.table_name.toLowerCase())) {
        lineageGraph.edges.push({
          source: table.table_name,
          target: view.table_name,
          type: "view_dependency",
          label: "depends on"
        });
        connectedPairs.add(pairKey(table.table_name, view.table_name));
      }
    }
  }

  // Create edges between all pairs of tables that share a join key
  for (const [joinKey, tableList] of joinKeyRelationships) {
    for (let i = 0; i < tableList.length; i++) {
      for (let j = i + 1; j < tableList.length; j++) {
        const key = pairKey(tableList[i], tableList[j]);
        // Avoid duplicating an existing edge between the same two tables
        if (connectedPairs.has(key)) {
          continue;
        }
        connectedPairs.add(key);
        lineageGraph.edges.push({
          source: tableList[i],
          target: tableList[j],
          type: "join_key",
          label: joinKey,
          bidirectional: true
        });
      }
    }
  }

  console.log(
    `${colors.green}✓ Built relationship graph with ${lineageGraph.nodes.length} nodes and ${lineageGraph.edges.length} relationships.${colors.nc}`
  );

  return lineageGraph;
}

/**
 * Run the full audit: read `config.inputFile`, analyse it, and write
 * `dataset_audit.json` and `audit_summary.csv` under
 * `<config.outputDir>/reports`.
 *
 * Any failure is reported on stderr and ends the process with exit code 1.
 *
 * @returns {Promise<void>}
 */
async function runAudit() {
  console.log(`\n${colors.cyan}=== Running Data Warehouse Audit Analysis ===${colors.nc}`);
  console.log("---------------------------------------------");
  console.log(`${colors.green}▸ Input File:${colors.nc} ${config.inputFile}`);
  console.log(`${colors.green}▸ Output Directory:${colors.nc} ${config.outputDir}`);
  console.log("---------------------------------------------\n");

  try {
    // Read raw data
    console.log(`${colors.yellow}Reading raw dataset metadata...${colors.nc}`);
    const rawData = JSON.parse(await fs.readFile(config.inputFile, 'utf8'));
    if (!Array.isArray(rawData?.tables)) {
      throw new Error(`Invalid raw data in ${config.inputFile}: expected a "tables" array.`);
    }
    console.log(`${colors.green}✓ Loaded raw data with ${rawData.tables.length} tables/views.${colors.nc}`);

    // Ensure output directory exists
    const reportsDir = path.join(config.outputDir, "reports");
    await fs.mkdir(reportsDir, { recursive: true });

    // Build lineage graph (also flags potential join keys on the schema
    // fields, which the analytics analysis below relies on)
    const lineageGraph = buildLineageGraph(rawData.tables);

    // Analytics compatibility analysis
    console.log(`\n${colors.yellow}Analyzing tables for analytics compatibility...${colors.nc}`);
    const analyticsInsights = analyzeAnalyticsCompatibility(rawData.tables);
    console.log(`${colors.green}✓ Analytics analysis complete: ${analyticsInsights.event_tables.length} EVENT tables found.${colors.nc}`);

    // Enhance table schemas with detected field types
    console.log(`${colors.yellow}Enhancing schemas with field type detection...${colors.nc}`);
    for (const table of rawData.tables) {
      for (const field of schemaOf(table)) {
        const fieldPath = field.nested_field_path || field.column_name;
        if (!fieldPath) {
          continue; // Nothing to detect for a field without a name
        }
        const fieldType = field.nested_type || field.data_type || '';
        field.detected_field_types = detectFieldType(fieldPath.toLowerCase(), fieldType);
      }
    }
    console.log(`${colors.green}✓ Field type detection complete.${colors.nc}`);

    // Update lineage graph nodes with analytics scores. The lookup is built
    // once; the first analysis found for a table name wins.
    const scoreByTable = new Map();
    const categorizedAnalyses = [
      ...analyticsInsights.event_tables,
      ...analyticsInsights.user_tables,
      ...analyticsInsights.lookup_tables
    ];
    for (const analysis of categorizedAnalyses) {
      if (!scoreByTable.has(analysis.table_name)) {
        scoreByTable.set(analysis.table_name, analysis.mixpanel_compatibility);
      }
    }
    for (const node of lineageGraph.nodes) {
      if (scoreByTable.has(node.id)) {
        node.analytics_score = scoreByTable.get(node.id);
      }
    }

    // Calculate summary statistics
    const tables = rawData.tables.filter(t => t.table_type === 'TABLE');
    const views = rawData.tables.filter(t => t.table_type === 'VIEW');

    const summary = {
      total_tables: tables.length,
      total_views: views.length,
      total_objects: rawData.tables.length,
      failed_objects: rawData.tables.filter(t => t.has_permission_error).length,
      total_rows_accessible: rawData.tables
        .filter(t => !t.has_permission_error && typeof t.row_count === 'number')
        .reduce((sum, t) => sum + t.row_count, 0)
    };

    // Build final audit result
    const auditResult = {
      audit_metadata: {
        generated_at: new Date().toISOString(),
        analysis_version: "1.0.0",
        source_file: config.inputFile,
        project_id: rawData.extraction_metadata?.project_id,
        dataset_id: rawData.extraction_metadata?.dataset_id,
        ...rawData.audit_metadata // Include original metadata (overrides the keys above)
      },
      tables: rawData.tables, // Include all raw table data
      lineage: lineageGraph,
      analytics: analyticsInsights,
      summary: summary
    };

    // Write audit results
    const outputFile = path.join(reportsDir, "dataset_audit.json");
    await fs.writeFile(outputFile, JSON.stringify(auditResult, null, 2));

    // Write summary CSV
    const auditSummaryCsv = [
      "metric,value",
      `total_tables,${summary.total_tables}`,
      `total_views,${summary.total_views}`,
      `total_objects,${summary.total_objects}`,
      `failed_objects,${summary.failed_objects}`,
      `total_rows_accessible,${summary.total_rows_accessible}`,
      `mixpanel_ready_tables,${analyticsInsights.event_tables?.length || 0}`,
      `event_tables,${analyticsInsights.event_tables?.length || 0}`,
      `user_tables,${analyticsInsights.user_tables?.length || 0}`
    ].join("\n");

    await fs.writeFile(path.join(reportsDir, "audit_summary.csv"), auditSummaryCsv + "\n");

    const countEdges = type => lineageGraph.edges.filter(e => e.type === type).length;

    console.log(`\n${colors.green}✔ Audit analysis complete!${colors.nc}`);
    console.log("==========================================");
    console.log(`${colors.green}▸ Processed:${colors.nc} ${rawData.tables.length} tables/views`);
    console.log(`${colors.green}▸ Event Tables:${colors.nc} ${analyticsInsights.event_tables.length} tables`);
    console.log(`${colors.green}▸ Join Keys Found:${colors.nc} ${countEdges('join_key')} relationships`);
    console.log(`${colors.green}▸ View Dependencies:${colors.nc} ${countEdges('view_dependency')} relationships`);
    console.log("==========================================");
    console.log(`Audit results: ${colors.cyan}${outputFile}${colors.nc}`);
  } catch (error) {
    console.error(`\n${colors.red}Error during audit analysis:${colors.nc}`);
    console.error(error.message);
    process.exit(1);
  }
}

/**
 * Whether this module is the script Node was started with.
 *
 * The entry path is resolved through symlinks and converted with
 * pathToFileURL, because a hand-built `file://${path}` string does not match
 * import.meta.url on Windows or for paths that need URL encoding.
 *
 * @returns {boolean}
 */
function isExecutedDirectly() {
  const entryPath = process.argv[1];
  if (!entryPath) {
    return false;
  }
  try {
    return import.meta.url === pathToFileURL(realpathSync(entryPath)).href;
  } catch {
    // Entry path could not be resolved, so it cannot be this module
    return false;
  }
}

// Run the audit if this file is executed directly
if (isExecutedDirectly()) {
  runAudit().catch(console.error);
}

export { runAudit, analyzeAnalyticsCompatibility, buildLineageGraph };