// dwh-audit
// Version:
// Modular CLI tool for auditing data warehouses - extract, analyze, and report on schemas, data quality, and analytics readiness
// 530 lines (400 loc) • 13.4 kB
// TypeScript
/**
* TypeScript definitions for DWH Audit Tool
*
* This file defines the interfaces and types used throughout the data warehouse audit tool.
* It serves as both documentation and a contract for implementing support for new warehouses.
*/
/**
 * Identifiers of the data warehouse platforms recognised by the tool's types.
 *
 * Exported explicitly, like every other declaration in this file, so that code
 * building an `AuditOptions` value can name the type of its `dwh` field.
 */
export type DataWarehouse = "bigquery" | "snowflake" | "redshift" | "databricks";
// ============================================================================
// CLI and Configuration Types
// ============================================================================
/**
 * Options that configure an audit run.
 * Only `dwh`, `samples` and `output` are mandatory; everything else may be omitted.
 */
export interface AuditOptions {
  /** Which warehouse to audit (at present only 'bigquery' is supported). */
  dwh: DataWarehouse;

  /** Project identifier, for example a BigQuery project ID. */
  project?: string;

  /** Identifier of the dataset to audit. */
  dataset?: string;

  /** Region / location of the warehouse. */
  location?: string;

  /** How many sample rows to pull from each table. */
  samples: number;

  /** Directory into which audit results are written. */
  output: string;

  /** Table filter: a comma-separated list in which glob patterns are allowed. */
  filter?: string;

  /** Path to a credentials file. */
  credentials?: string;
}
// ============================================================================
// Raw Dataset Schema - Output from bigquery.js, input to audit.js
// ============================================================================
/**
 * One column (or nested sub-field) of a table schema.
 * Every warehouse implementation has to produce fields in this shape.
 */
export interface SchemaField {
  /** Name of the column. */
  column_name: string;

  /** 1-based position of the column within the table schema. */
  ordinal_position: number;

  /** 'YES' when the field accepts NULL values, 'NO' otherwise. */
  is_nullable: "NO" | "YES";

  /** Base data type, e.g. 'STRING', 'INTEGER' or 'TIMESTAMP'. */
  data_type: string;

  /** True when the table is partitioned on this field. */
  is_partitioning_column: boolean;

  /** Position within the clustering key; null when the field is not clustered. */
  clustering_ordinal_position: null | number;

  /** Full dotted path for nested fields, e.g. 'user.profile.name'. */
  nested_field_path: string;

  /** Complete type including nested structures, e.g. 'ARRAY<STRUCT<name STRING>>'. */
  nested_type: string;

  /** Set by audit.js: whether the field appears in more than one table. */
  is_potential_join_key?: boolean;
}
/**
 * Metadata for a single table or view as extracted from the warehouse.
 * Every warehouse implementation has to produce records in this shape.
 */
export interface TableMetadata {
  /** Name of the table. */
  table_name: string;

  /** Kind of object: a physical table or a view. */
  table_type: "VIEW" | "TABLE";

  /** ISO timestamp of the table's creation (may be null). */
  creation_time: null | string;

  /** True when extraction ran into permission errors. */
  has_permission_error: boolean;

  /** Error messages collected while extracting this object. */
  error_details: string[];

  /** The complete schema definition. */
  schema: SchemaField[];

  /** Row count of the table. */
  row_count: number;

  /** Size of the table in bytes. */
  size_bytes: number;

  /** Sampled data rows. */
  sample_data: Array<Record<string, any>>;

  /** SQL definition of the view; only populated for views. */
  view_definition: null | string;

  /** Partitioning details; the shape is warehouse-specific. */
  partitioning_info: any;

  /** Clustering details; the shape is warehouse-specific. */
  clustering_info: any;
}
/**
 * Raw dataset output format, i.e. what bigquery.js produces.
 * This shape is the contract for every warehouse extractor.
 */
export interface RawDataset {
  /** Details about the extraction run itself. */
  extraction_metadata: {
    /** ISO timestamp of the extraction. */
    generated_at: string;

    /** Project / workspace ID. */
    project_id: string;

    /** Dataset / database ID. */
    dataset_id: string;

    /** Warehouse region. */
    region: string;

    /**
     * Permission mode detected during extraction.
     *
     * Any string is accepted. The open-ended member is written as
     * `(string & {})` rather than `string` because a plain `string` absorbs
     * the literal members, reducing the whole union to `string` and hiding
     * the known modes from editor tooling. The two spellings are mutually
     * assignable, so existing producers and consumers are unaffected.
     */
    permission_mode: "jobUser" | "dataViewer" | (string & {});

    /** Number of sample rows that was requested. */
    sample_limit: number;

    /** Table filter that was applied; null when none was given. */
    table_filter: string[] | null;

    /** Any additional, warehouse-specific metadata. */
    [key: string]: any;
  };

  /** Every table and view that was extracted. */
  tables: TableMetadata[];

  /** Summary statistics for the extraction. */
  summary: {
    /** Number of tables found. */
    total_tables: number;

    /** Number of views found. */
    total_views: number;

    /** Total number of objects (tables + views). */
    total_objects: number;

    /** Number of objects whose extraction failed. */
    failed_objects: number;

    /** Total accessible row count across all tables. */
    total_rows_accessible: number;
  };
}
// ============================================================================
// Analysis Results Schema - Output from audit.js
// ============================================================================
/**
 * Flags describing which analytics-relevant fields were detected in a table.
 */
export interface AnalyticsFeatures {
  /** A timestamp/datetime field is present. */
  has_timestamp: boolean;

  /** A user identifier field is present. */
  has_user_id: boolean;

  /** An event identifier field is present. */
  has_event_id: boolean;

  /** A session identifier field is present. */
  has_session_id: boolean;

  /** An event name/type field is present. */
  has_event_name: boolean;

  /** The timestamp field is nullable. */
  timestamp_nullable: boolean;

  /** The user ID field is nullable. */
  user_id_nullable: boolean;

  /** The event ID field is nullable. */
  event_id_nullable: boolean;
}
/**
 * Data quality figures for a single table.
 */
export interface DataQualityMetrics {
  /** Estimate of the null rate across the table's fields. */
  null_rate_estimate: number;

  /** Names of fields that look like unique identifiers. */
  unique_fields: Array<string>;

  /** Names of fields that might hold PII. */
  potential_pii: Array<string>;
}
/**
 * Counters that describe how complex a schema is.
 */
export interface SchemaComplexity {
  /** Count of all fields. */
  total_fields: number;

  /** Count of nested fields. */
  nested_fields: number;

  /** Count of STRUCT fields. */
  struct_fields: number;

  /** Count of REPEATED/ARRAY fields. */
  repeated_fields: number;

  /** Deepest level of nesting encountered. */
  max_nesting_depth: number;
}
/**
 * Result of the analytics analysis for one table.
 */
export interface TableAnalysis {
  /** Name of the table. */
  table_name: string;

  /** Kind of object: a physical table or a view. */
  table_type: "VIEW" | "TABLE";

  /** Category assigned from the analytics patterns found in the table. */
  table_category: "EVENT" | "USER" | "LOOKUP" | "UNKNOWN";

  /** Mixpanel compatibility score, from 0 to 10. */
  mixpanel_compatibility: number;

  /** Row count of the table. */
  row_count: number;

  /** Size of the table in bytes. */
  size_bytes: number;

  /**
   * Creation timestamp.
   * NOTE(review): `TableMetadata.creation_time` is `string | null` whereas this
   * is a plain `string` — confirm that a null can never reach this field.
   */
  creation_time: string;

  /** Analysis of the fields required by the different patterns. */
  required_fields: {
    /** The table has timestamp fields. */
    has_timestamp: boolean;

    /** The table has user identifier fields. */
    has_user_id: boolean;

    /** The timestamp fields that were detected. */
    timestamp_fields: {
      name: string;
      type: string;
      nullable: boolean;
      by_type: boolean;
      by_name: boolean;
    }[];

    /** The user ID fields that were detected. */
    user_id_fields: {
      name: string;
      type: string;
      nullable: boolean;
    }[];
  };

  /** Classification of the event schema type. */
  event_schema_type: null | "MULTI_SCHEMA" | "MONO_SCHEMA";

  /** Metrics describing schema complexity. */
  schema_complexity: {
    /** Count of all fields. */
    total_fields: number;

    /** Complex fields (STRUCT, RECORD, JSON, REPEATED). */
    complex_fields: string[];

    /** Deepest level of nesting. */
    nested_depth: number;

    /** Count of all subfields. */
    total_subfields: number;
  };

  /** Assessment of data quality. */
  data_quality: {
    /** Fields that might hold PII. */
    potential_pii: {
      field: string;
      types: string[];
    }[];

    /** Volume bucket derived from the row count. */
    volume_category: "SMALL" | "MEDIUM" | "LARGE" | "UNKNOWN";

    /** Days since the table was created/updated. */
    freshness: null | number;
  };

  /** Per-field details, keyed by string. */
  field_details: {
    [key: string]: {
      type: string;
      nullable: boolean;
      join_key: boolean;
    };
  };
}
/**
 * Analytics insights aggregated over the whole dataset.
 */
export interface AnalyticsInsights {
  /** Tables showing event-like patterns (timestamp + user_id). */
  event_tables: Array<TableAnalysis>;

  /** Tables that have user identifiers but no timestamp. */
  user_tables: Array<TableAnalysis>;

  /** Tables with arbitrary join keys. */
  lookup_tables: Array<TableAnalysis>;

  /** Tables containing complex nested structures. */
  complex_fields: Array<TableAnalysis>;

  /** Tables containing potential PII fields. */
  pii_warnings: Array<TableAnalysis>;

  /** Every table analysis (the UI requires this). */
  data_quality: Array<TableAnalysis>;

  /** Field-name patterns found across the dataset. */
  field_patterns: {
    /** Names of timestamp fields found. */
    timestamp_fields: string[];

    /** Names of user ID fields found. */
    user_id_fields: string[];

    /** Names of event name fields found. */
    event_name_fields: string[];

    /** Names of session fields found. */
    session_fields: string[];

    /** Names of complex fields found. */
    complex_fields: string[];

    /** Names of PII fields found. */
    pii_fields: string[];
  };
}
/**
 * A node of the table relationship graph.
 */
export interface LineageNode {
  /** Name of the table or view. */
  id: string;

  /** Kind of node. */
  type: "view" | "table";

  /** Row count. */
  row_count: number;

  /** Analytics readiness score. */
  analytics_score: number;
}
/**
 * An edge of the table relationship graph.
 */
export interface LineageEdge {
  /** The source table/view. */
  source: string;

  /** The target table/view. */
  target: string;

  /** Kind of relationship. */
  type: "join_key" | "view_dependency";

  /** Label / description of the relationship. */
  label: string;

  /** Optional flag: the relationship is bidirectional. */
  bidirectional?: boolean;
}
/**
 * Graph of the relationships between tables.
 */
export interface LineageGraph {
  /** Nodes: every table and view. */
  nodes: Array<LineageNode>;

  /** Edges: every relationship. */
  edges: Array<LineageEdge>;
}
/**
 * The final audit result, i.e. what audit.js produces.
 */
export interface AuditResult {
  /** Metadata about the analysis. */
  audit_metadata: {
    /** ISO timestamp of the analysis. */
    generated_at: string;

    /** Version of the analysis engine. */
    analysis_version: string;

    /** Path of the raw dataset file used as the source. */
    source_file: string;

    /** The original extraction metadata, passed through. */
    [key: string]: any;
  };

  /** The original metadata of every table. */
  tables: Array<TableMetadata>;

  /** Graph of table relationships. */
  lineage: LineageGraph;

  /** Analytics insights. */
  analytics: AnalyticsInsights;

  /** Summary statistics. */
  summary: {
    /** Count of tables. */
    total_tables: number;

    /** Count of views. */
    total_views: number;

    /** Count of all objects. */
    total_objects: number;

    /** Count of failed objects. */
    failed_objects: number;

    /** Total of accessible rows. */
    total_rows_accessible: number;
  };
}
// ============================================================================
// Warehouse Adapter Interface
// ============================================================================
/**
 * Contract every warehouse adapter has to fulfil.
 * Having it makes it possible to add support for Snowflake, Databricks, etc.
 */
export interface WarehouseAdapter {
  /** Identifier of the warehouse type. */
  readonly warehouseType: string;

  /**
   * Extracts the raw dataset metadata.
   *
   * @param options Configuration options
   * @returns Promise resolving to the raw dataset
   */
  extractRawData(options: AuditOptions): Promise<RawDataset>;

  /**
   * Validates the connection and the permissions.
   *
   * @param options Configuration options
   * @returns Promise resolving to the permission mode string
   */
  testConnection(options: AuditOptions): Promise<string>;
}
// ============================================================================
// Export main functions
// ============================================================================
/**
 * Run the data extraction step.
 *
 * Takes no arguments; where the step obtains its configuration is not visible
 * from this declaration.
 *
 * @returns A promise that resolves with no value.
 */
export function runDataExtraction(): Promise<void>;
/**
 * Run the analytics analysis step.
 *
 * Both parameters are optional; the values used when they are omitted are not
 * visible from this declaration.
 *
 * @param inputFile Optional input file — presumably the raw dataset written by
 *   the extraction step; TODO confirm against the implementation.
 * @param outputDir Optional output directory — presumably where the audit
 *   results are written; TODO confirm against the implementation.
 * @returns A promise that resolves with no value.
 */
export function runAudit(inputFile?: string, outputDir?: string): Promise<void>;
/**
 * Analyze analytics compatibility.
 *
 * @param tables Metadata of the tables to analyze.
 * @returns The analytics insights computed for the given tables.
 */
export function analyzeAnalyticsCompatibility(tables: TableMetadata[]): AnalyticsInsights;
/**
 * Build the table lineage graph.
 *
 * @param tables Metadata of the tables to build the graph from.
 * @returns The lineage graph (nodes and edges) for the given tables.
 */
export function buildLineageGraph(tables: TableMetadata[]): LineageGraph;
/**
 * Generate the HTML report.
 *
 * @param auditResult The audit result to report on.
 * @returns A string. NOTE(review): presumably the HTML markup itself rather
 *   than a file path — confirm against the implementation.
 */
export function generateHtmlReport(auditResult: AuditResult): string;
/**
 * Rebuild the HTML report from existing audit data.
 *
 * Takes no arguments; where the existing audit data is read from is not
 * visible from this declaration.
 *
 * @returns A promise that resolves with no value.
 */
export function rebuildHtmlReport(): Promise<void>;