dwh-audit
Version:
Modular CLI tool for auditing data warehouses - extract, analyze, and report on schemas, data quality, and analytics readiness
723 lines (638 loc) • 25.7 kB
JavaScript
import { BigQuery } from "@google-cloud/bigquery";
import { promises as fs } from "fs";
// HTML report generation moved to rebuild.js step
import path from "path";
import { pathToFileURL } from "url";
let { NODE_ENV = "production" } = process.env;
// --- Configuration ---

/**
 * Parse the sample-limit CLI argument.
 *
 * An explicit `0` is a meaningful value (the extraction loop only samples when
 * the limit is greater than zero), so the fallback is applied only when the
 * argument is missing or not numeric, never merely because it is falsy.
 *
 * @param {string|undefined} raw - Raw CLI argument.
 * @param {number} [fallback=10] - Value used when `raw` cannot be parsed.
 * @returns {number} The parsed integer, or `fallback`.
 */
const parseSampleLimit = (raw, fallback = 10) => {
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? fallback : parsed;
};

// Positional CLI arguments: project, dataset, table filter, location, sample limit, output dir.
const config = {
  projectId: process.argv[2] || "mixpanel-gtm-training",
  datasetId: process.argv[3] || "warehouse_connectors",
  tableFilter: process.argv[4] ? process.argv[4].split(",").map(t => t.trim()) : null, // Comma-separated list of specific tables to audit (supports glob patterns)
  location: process.argv[5] || "US",
  sampleLimit: parseSampleLimit(process.argv[6]),
  outputDir: process.argv[7] || "./output"
};
// Echo the effective configuration at startup so each run's settings are visible in its console output.
// The template literal spans several lines on purpose: its line breaks are part of the printed banner.
console.log(`\n\nRunning BigQuery Data Extraction with configuration:\n
- Project ID: ${config.projectId}
- Dataset ID: ${config.datasetId}
- Table Filter: ${config.tableFilter ? config.tableFilter.join(", ") : "All tables"}
- Location: ${config.location}
- Sample Limit: ${config.sampleLimit}
- Output Directory: ${config.outputDir}
\n`);
// Shared BigQuery client bound to the configured project and location.
// In this file it is used for the connectivity check (getDatasets) and for SELECT-based sampling of views.
const bigquery = new BigQuery({
projectId: config.projectId,
location: config.location
});
// Import required modules for REST API fallback
import https from "https";
import { GoogleAuth } from "google-auth-library";
// Credential helper used by the REST functions below to obtain a bearer token;
// it requests the BigQuery OAuth scope.
const auth = new GoogleAuth({
scopes: ["https://www.googleapis.com/auth/bigquery"]
});
// --- Colors for Terminal Output ---
// ANSI escape sequences keyed by color name; `nc` ("no color") resets the terminal style.
const colors = Object.fromEntries(
  [
    ["red", 31],
    ["green", 32],
    ["yellow", 33],
    ["cyan", 36],
    ["magenta", 35],
    ["nc", 0]
  ].map(([name, code]) => [name, `\x1b[${code}m`])
);
// --- Utility Functions ---
/**
 * Normalize a BigQuery location string.
 *
 * The multi-region names "us" and "eu" (any letter case) become `region-us` /
 * `region-eu`; every other location is returned lower-cased and otherwise unchanged.
 *
 * @param {string} location - Location name, e.g. "US" or "europe-west1".
 * @returns {string} The normalized region identifier.
 */
const formatRegion = location => {
  const normalized = location.toLowerCase();
  const isMultiRegion = ["us", "eu"].includes(normalized);
  return isMultiRegion ? `region-${normalized}` : normalized;
};
/**
 * Test a string against a simple glob pattern.
 *
 * `*` matches any run of characters (including none) and `?` matches exactly
 * one character. Every other character is matched literally. The whole string
 * must match, and matching is case-insensitive.
 *
 * @param {string} str - Candidate string, e.g. a table name.
 * @param {string} pattern - Glob pattern.
 * @returns {boolean} True when `str` matches `pattern`.
 */
const matchesGlob = (str, pattern) => {
  // Characters that carry meaning in a regular expression and must be escaped.
  const regexSpecials = ".+^${}()|[]\\";
  const translated = Array.from(pattern, ch => {
    if (ch === "*") return ".*";
    if (ch === "?") return ".";
    return regexSpecials.includes(ch) ? `\\${ch}` : ch;
  }).join("");
  return new RegExp(`^${translated}$`, "i").test(str);
};
/**
 * Escape a string so it can be embedded between double quotes in JSON text.
 *
 * Delegates to JSON.stringify and strips the surrounding quotes, so double
 * quotes, backslashes and all control characters are escaped the way the JSON
 * grammar requires.
 *
 * @param {*} str - Value to escape.
 * @returns {string} The escaped text, or "" when `str` is not a string.
 */
const jsonEscape = str => {
  if (typeof str !== "string") return "";
  return JSON.stringify(str).slice(1, -1);
};
/**
 * Extract the names of tables in `datasetId` that a view definition references.
 *
 * Recognizes `dataset.table` with or without a leading project and with any
 * of the usual backtick quoting styles (`` `p.ds.t` ``, `ds.t`, `` `ds`.`t` ``).
 * Matching is case-insensitive. This is a textual scan, not a SQL parser, so
 * references inside string literals or comments are reported too.
 *
 * @param {string} ddl - View SQL / DDL text.
 * @param {string} datasetId - Dataset whose tables should be detected.
 * @returns {string[]} Unique table names in first-seen order; [] when `ddl`
 *   is empty or not a string.
 */
const parseViewDependencies = (ddl, datasetId) => {
  if (!ddl || typeof ddl !== "string") return [];

  // The pattern must be built with the RegExp constructor: `${...}` is not
  // interpolated inside a regex literal. The dataset ID is escaped so that it
  // is always matched literally.
  const escapedDataset = String(datasetId).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const reference = new RegExp(`\\b${escapedDataset}\`?\\.\`?(\\w+)\\b`, "gi");

  const dependencies = new Set();
  for (const match of ddl.matchAll(reference)) {
    const tableName = match[1];
    // Skip the case where the captured identifier is the dataset itself
    // (e.g. a project that shares the dataset's name).
    if (tableName && tableName !== datasetId) {
      dependencies.add(tableName);
    }
  }
  return Array.from(dependencies);
};
/**
 * Recursively strip the `{ v: ... }` / `{ f: [...] }` wrappers that the
 * BigQuery REST API puts around cell values.
 *
 * - null / undefined           -> null
 * - arrays                     -> each element unwrapped
 * - objects with a `v` key     -> the unwrapped `v`
 * - objects with an `f` array  -> an object keyed `field_0`, `field_1`, ...
 *                                 (real field names would require the schema)
 * - other objects              -> same keys, values unwrapped
 * - strings that look like a JSON object/array -> the parsed value, or the
 *                                 original string when parsing fails
 * - everything else            -> returned unchanged
 *
 * @param {*} value - Raw value from a REST response.
 * @returns {*} The unwrapped value.
 */
function unwrapBigQueryValue(value) {
  if (value == null) {
    return null;
  }

  // REPEATED fields arrive as arrays of wrapped cells.
  if (Array.isArray(value)) {
    return value.map(entry => {
      const isWrappedCell = entry && typeof entry === 'object' && 'v' in entry;
      return unwrapBigQueryValue(isWrappedCell ? entry.v : entry);
    });
  }

  if (typeof value === 'object') {
    if ('v' in value) {
      return unwrapBigQueryValue(value.v);
    }

    // STRUCT: positional cells under `f`; named by position only.
    if ('f' in value && Array.isArray(value.f)) {
      return value.f.reduce((struct, cell, position) => {
        struct[`field_${position}`] = unwrapBigQueryValue(cell);
        return struct;
      }, {});
    }

    // Plain object: unwrap each property value.
    return Object.entries(value).reduce((result, [key, inner]) => {
      result[key] = unwrapBigQueryValue(inner);
      return result;
    }, {});
  }

  // JSON-looking strings are parsed so sample data displays as structure.
  if (typeof value === 'string' && value.length > 0) {
    const text = value.trim();
    const looksLikeObject = text.startsWith('{') && text.endsWith('}');
    const looksLikeArray = text.startsWith('[') && text.endsWith(']');
    if (looksLikeObject || looksLikeArray) {
      try {
        return JSON.parse(text);
      } catch {
        return value;
      }
    }
  }

  return value;
}
/**
 * Format a value as a single CSV field.
 *
 * The value is stringified; if it contains a comma, a double quote, a line
 * feed or a carriage return it is wrapped in double quotes, with embedded
 * quotes doubled. A bare carriage return must be quoted as well because many
 * CSV readers treat it as a record separator.
 *
 * @param {*} str - Value to format.
 * @returns {string} The CSV-safe field; "" for null or undefined.
 */
const csvEscape = str => {
  if (str === null || str === undefined) return "";
  const s = String(str);
  return /[",\r\n]/.test(s) ? `"${s.replace(/"/g, '""')}"` : s;
};
// --- Main Data Extraction Logic ---
/**
 * Verify that the configured credentials can reach BigQuery by listing datasets.
 *
 * On success a confirmation is logged and the permission-mode label is
 * returned. On failure the error and the gcloud command that grants the
 * dataViewer role are printed to stderr and the process exits with code 1.
 *
 * @returns {Promise<string>} Resolves to "dataViewer" when the check succeeds.
 */
async function testBigQueryAuth() {
  const { red, green, yellow, cyan, nc } = colors;
  console.log(`${yellow}Testing BigQuery authentication and connectivity...${nc}`);

  try {
    // Only success or failure matters here; the dataset list itself is not used.
    await bigquery.getDatasets();
    console.log(`${green}✓ BigQuery authentication successful (dataViewer mode).${nc}\n`);
    return "dataViewer";
  } catch (error) {
    // Breakpoint for local debugging sessions only.
    if (NODE_ENV === "dev") debugger;

    console.error(
      `${red}Fatal Error: BigQuery authentication or connectivity failed. Please check your Google Cloud credentials and network access.${nc}`
    );
    console.error(error.message);
    console.error(`\n${yellow}Required permissions:${nc}`);
    console.error(`${cyan}gcloud projects add-iam-policy-binding ${config.projectId} \\${nc}`);
    console.error(`${cyan} --member='user:your_name@yourdomain.com' \\${nc}`);
    console.error(`${cyan} --role='roles/bigquery.dataViewer'${nc}`);
    process.exit(1);
  }
}
/**
 * Fetch sample rows for a table through the BigQuery REST table-data endpoint.
 *
 * The endpoint returns each row as a positional list of cells (`row.f`), one
 * per top-level column, without column names. Names are therefore taken from
 * `schema`. When `schema` is the flattened form (entries carrying
 * `nested_field_path`), nested sub-fields are skipped so that positions line
 * up with the top-level columns of the row.
 *
 * @param {string} projectId - Project that owns the dataset.
 * @param {string} datasetId - Dataset that owns the table.
 * @param {string} tableName - Table to sample.
 * @param {Array<object>} schema - Column descriptors exposing `column_name`
 *   or `name`, optionally with `nested_field_path`.
 * @param {number} [maxResults=10] - Maximum number of rows to request.
 * @returns {Promise<Array<object>>} One object per row keyed by column name;
 *   [] when the table has no rows.
 * @throws {Error} "REST API authentication failed: ..." when no token can be
 *   obtained. The returned promise rejects with "HTTP <status>: <message>"
 *   for non-2xx responses, and with a descriptive message for API, parse,
 *   network and timeout failures.
 */
async function getSampleDataViaREST(projectId, datasetId, tableName, schema, maxResults = 10) {
  let accessToken;
  try {
    const authClient = await auth.getClient();
    accessToken = await authClient.getAccessToken();
  } catch (error) {
    throw new Error(`REST API authentication failed: ${error.message}`);
  }

  // Keep only top-level columns: a flattened schema lists nested sub-fields
  // right after their parent, which would shift every later column by
  // position. Falsy placeholders are kept so positions stay stable.
  const topLevelColumns = Array.isArray(schema)
    ? schema.filter(col => {
        if (!col || col.nested_field_path == null) return true;
        return col.nested_field_path === (col.column_name || col.name);
      })
    : [];

  // Encode each path segment so identifiers with reserved characters still form a valid URL.
  const url =
    `https://bigquery.googleapis.com/bigquery/v2/projects/${encodeURIComponent(projectId)}` +
    `/datasets/${encodeURIComponent(datasetId)}` +
    `/tables/${encodeURIComponent(tableName)}` +
    `/data?maxResults=${encodeURIComponent(maxResults)}`;

  return new Promise((resolve, reject) => {
    const options = {
      headers: {
        Authorization: `Bearer ${accessToken.token}`,
        "Content-Type": "application/json"
      },
      timeout: 30000 // 30 second timeout
    };

    const req = https.get(url, options, res => {
      // Decode as UTF-8 in the stream so multi-byte characters split across
      // chunks are not corrupted by string concatenation.
      res.setEncoding("utf8");
      let data = "";
      res.on("data", chunk => {
        data += chunk;
      });
      res.on("end", () => {
        // Check for HTTP error status codes
        if (res.statusCode < 200 || res.statusCode >= 300) {
          let detail = res.statusMessage;
          try {
            const errorResponse = JSON.parse(data);
            if (errorResponse.error) {
              detail = errorResponse.error.message;
            }
          } catch {
            // Non-JSON error body: fall back to the HTTP status text.
          }
          reject(new Error(`HTTP ${res.statusCode}: ${detail}`));
          return;
        }

        try {
          const response = JSON.parse(data);
          if (response.error) {
            reject(new Error(`BigQuery API Error: ${response.error.message}`));
            return;
          }
          if (!response.rows || response.rows.length === 0) {
            resolve([]);
            return;
          }

          // Convert the positional cell lists into objects keyed by column name.
          const rows = response.rows.map(row => {
            const obj = {};
            if (row.f && Array.isArray(row.f)) {
              row.f.forEach((field, index) => {
                const column = topLevelColumns[index];
                if (!column) return;
                const fieldName = column.column_name || column.name;
                obj[fieldName] =
                  field.v === null || field.v === undefined ? null : unwrapBigQueryValue(field.v);
              });
            }
            return obj;
          });
          resolve(rows);
        } catch (parseError) {
          reject(new Error(`Failed to parse REST API response: ${parseError.message}`));
        }
      });
    });

    req.on("error", error => {
      reject(new Error(`REST API request failed: ${error.message}`));
    });
    req.on("timeout", () => {
      // Reject first so the timeout message wins over the error that destroy() triggers.
      reject(new Error("REST API request timed out after 30 seconds"));
      req.destroy();
    });
  });
}
/**
 * Fetch a single page of the dataset's table listing.
 *
 * @param {string} projectId - Project that owns the dataset.
 * @param {string} datasetId - Dataset to list.
 * @param {string} token - OAuth bearer token.
 * @param {string} [pageToken] - Continuation token from the previous page.
 * @returns {Promise<object>} The parsed response body for this page.
 */
function fetchTableListPage(projectId, datasetId, token, pageToken) {
  const query = new URLSearchParams({ maxResults: "1000" });
  if (pageToken) {
    query.set("pageToken", pageToken);
  }
  const requestPath =
    `/bigquery/v2/projects/${encodeURIComponent(projectId)}` +
    `/datasets/${encodeURIComponent(datasetId)}/tables?${query}`;

  return new Promise((resolve, reject) => {
    const options = {
      hostname: 'www.googleapis.com',
      path: requestPath,
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${token}`,
        'Content-Type': 'application/json'
      }
    };
    const req = https.request(options, (res) => {
      // Decode in the stream so multi-byte characters split across chunks stay intact.
      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => {
        try {
          const result = JSON.parse(data);
          if (result.error) {
            reject(new Error(`REST API Error: ${result.error.message}`));
            return;
          }
          resolve(result);
        } catch (parseError) {
          // A non-JSON body on a failed request is better reported by its HTTP status.
          const isHttpFailure = res.statusCode < 200 || res.statusCode >= 300;
          reject(new Error(
            isHttpFailure
              ? `HTTP ${res.statusCode}: ${res.statusMessage}`
              : `Failed to parse REST API response: ${parseError.message}`
          ));
        }
      });
    });
    req.on('error', (error) => {
      reject(new Error(`REST API request failed: ${error.message}`));
    });
    req.end();
  });
}

/**
 * List every table and view in a dataset through the BigQuery REST API.
 *
 * Follows `nextPageToken` until the listing is exhausted, so datasets with
 * more objects than fit in one page are returned in full. Each entry is
 * shaped like a row of INFORMATION_SCHEMA.TABLES. Only the API type 'VIEW'
 * is reported as 'VIEW'; every other type is reported as 'TABLE'.
 *
 * @param {string} projectId - Project that owns the dataset.
 * @param {string} datasetId - Dataset to list.
 * @returns {Promise<Array<{table_name: string, table_type: string, creation_time: (string|null)}>>}
 * @throws {Error} "REST API authentication failed: ..." when no token can be
 *   obtained; request, API and parse failures reject with a descriptive message.
 */
async function getTablesViaREST(projectId, datasetId) {
  let accessToken;
  try {
    const authClient = await auth.getClient();
    accessToken = await authClient.getAccessToken();
  } catch (error) {
    throw new Error(`REST API authentication failed: ${error.message}`);
  }

  const tables = [];
  let pageToken;
  do {
    const page = await fetchTableListPage(projectId, datasetId, accessToken.token, pageToken);
    // Transform to match INFORMATION_SCHEMA.TABLES format
    for (const table of page.tables || []) {
      tables.push({
        table_name: table.tableReference.tableId,
        table_type: table.type === 'VIEW' ? 'VIEW' : 'TABLE',
        creation_time: table.creationTime
          ? new Date(Number.parseInt(table.creationTime, 10)).toISOString()
          : null
      });
    }
    pageToken = page.nextPageToken;
  } while (pageToken);

  return tables;
}
/**
 * Fetch the full table resource for one table or view through the BigQuery REST API.
 *
 * The parsed response body is returned as-is. Callers in this file read
 * `schema`, `numRows`, `numBytes`, `timePartitioning`, `clustering` and
 * `view.query` from it.
 *
 * @param {string} projectId - Project that owns the dataset.
 * @param {string} datasetId - Dataset that owns the table.
 * @param {string} tableName - Table or view to describe.
 * @returns {Promise<object>} The parsed table resource.
 * @throws {Error} "REST API authentication failed: ..." when no token can be
 *   obtained; request, API and parse failures reject with a descriptive message.
 */
async function getTableMetadataViaREST(projectId, datasetId, tableName) {
  let accessToken;
  try {
    const authClient = await auth.getClient();
    accessToken = await authClient.getAccessToken();
  } catch (error) {
    throw new Error(`REST API authentication failed: ${error.message}`);
  }

  // Encode each segment: an unescaped space or non-ASCII character in a table
  // name would otherwise produce an invalid request path.
  const requestPath =
    `/bigquery/v2/projects/${encodeURIComponent(projectId)}` +
    `/datasets/${encodeURIComponent(datasetId)}` +
    `/tables/${encodeURIComponent(tableName)}`;

  return new Promise((resolve, reject) => {
    const options = {
      hostname: 'www.googleapis.com',
      path: requestPath,
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${accessToken.token}`,
        'Content-Type': 'application/json'
      }
    };
    const req = https.request(options, (res) => {
      // Decode in the stream so multi-byte characters split across chunks stay intact.
      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('end', () => {
        try {
          const result = JSON.parse(data);
          if (result.error) {
            reject(new Error(`REST API Error: ${result.error.message}`));
            return;
          }
          resolve(result);
        } catch (parseError) {
          // A non-JSON body on a failed request is better reported by its HTTP status.
          const isHttpFailure = res.statusCode < 200 || res.statusCode >= 300;
          reject(new Error(
            isHttpFailure
              ? `HTTP ${res.statusCode}: ${res.statusMessage}`
              : `Failed to parse REST API response: ${parseError.message}`
          ));
        }
      });
    });
    req.on('error', (error) => {
      reject(new Error(`REST API request failed: ${error.message}`));
    });
    req.end();
  });
}
/**
 * Convenience wrapper that returns only the `schema` part of a table's metadata.
 *
 * @param {string} projectId - Project that owns the dataset.
 * @param {string} datasetId - Dataset that owns the table.
 * @param {string} tableName - Table or view to describe.
 * @returns {Promise<object|undefined>} The `schema` property of the table
 *   resource, or undefined when the resource has none.
 */
async function getTableSchemaViaREST(projectId, datasetId, tableName) {
  const { schema } = await getTableMetadataViaREST(projectId, datasetId, tableName);
  return schema;
}
/**
 * Run the full extraction for the configured dataset.
 *
 * Steps, in order:
 *  1. Verify BigQuery connectivity (testBigQueryAuth).
 *  2. Delete and recreate the output directory with schemas/, samples/ and reports/ subfolders.
 *  3. List the dataset's tables and views, optionally narrowed by the glob filters in config.tableFilter.
 *  4. For each object: fetch metadata, flatten its schema, collect sample rows, and write
 *     `<table>_schema.csv` plus (when rows were obtained) `<table>_sample.json`.
 *  5. Write reports/all_schemas_catalog.csv, reports/all_tables_summary.csv and reports/dataset_raw.json.
 *  6. Print a summary and the suggested next commands.
 *
 * Side effects: removes any existing config.outputDir before writing, and calls
 * process.exit(1) when the output directory cannot be prepared or the table list cannot be fetched.
 * Errors for an individual table are recorded on that table's entry and do not stop the run.
 *
 * @returns {Promise<void>} Resolves once all output files have been written.
 */
async function runDataExtraction() {
const permissionMode = await testBigQueryAuth();
console.log(`${colors.cyan}=== BigQuery Data Extraction Starting ===${colors.nc}`);
console.log("-------------------------------------------");
console.log(`${colors.green}▸ Project:${colors.nc} ${config.projectId}`);
console.log(`${colors.green}▸ Dataset:${colors.nc} ${config.datasetId}`);
console.log(`${colors.green}▸ Region:${colors.nc} ${config.location}`);
console.log(`${colors.green}▸ Table Filter:${colors.nc} ${config.tableFilter ? config.tableFilter.join(", ") : "All tables"}`);
console.log(`${colors.green}▸ Sample Limit:${colors.nc} ${config.sampleLimit}`);
console.log(`${colors.green}▸ Permission Mode:${colors.nc} DataViewer (REST API only)`);
console.log(`${colors.green}▸ Output Directory:${colors.nc} ${config.outputDir}`);
console.log("-------------------------------------------\n");
try {
console.log(`${colors.yellow}Setting up output directory structure...${colors.nc}`);
// Start from a clean slate: this removes everything already present under the output directory.
await fs.rm(config.outputDir, { recursive: true, force: true });
await fs.mkdir(path.join(config.outputDir, "schemas"), { recursive: true });
await fs.mkdir(path.join(config.outputDir, "samples"), { recursive: true });
await fs.mkdir(path.join(config.outputDir, "reports"), { recursive: true });
// Copy favicon.ico to reports directory if it exists
// (the source path is relative, so it is resolved against the current working directory)
try {
await fs.copyFile('favicon.ico', path.join(config.outputDir, "reports", "favicon.ico"));
console.log(`${colors.green}✓ Copied favicon.ico to reports directory${colors.nc}`);
} catch (faviconError) {
if (faviconError.code !== 'ENOENT') {
console.log(`${colors.yellow}⚠ Warning: Could not copy favicon.ico: ${faviconError.message}${colors.nc}`);
}
// If favicon doesn't exist, silently continue (not a critical error)
}
console.log(`${colors.green}✓ Output directories created:${colors.nc}`);
console.log(` - ${path.join(config.outputDir, "reports")} (aggregated reports)`);
console.log(` - ${path.join(config.outputDir, "schemas")} (individual table schemas)`);
console.log(` - ${path.join(config.outputDir, "samples")} (sample data files)\n`);
} catch (error) {
console.error(`${colors.red}Fatal Error: Could not manage output directory. ${error.message}${colors.nc}`);
process.exit(1);
}
// NOTE(review): formattedRegion is computed here but not read anywhere else in this function — confirm whether it is still needed.
const formattedRegion = formatRegion(config.location);
// Accumulates one tableData record per processed object, successful or not.
const allTables = [];
let extractionSummary = {
total_tables: 0,
total_views: 0,
failed_objects: 0,
total_rows_accessible: 0
};
// CSV Headers
const allTablesSummaryCsv = [
"table_name",
"table_type",
"row_count",
"column_count",
"size_mb",
"num_partitions",
"creation_time",
"has_permission_error",
"error_details"
];
console.log(`${colors.yellow}Discovering tables and views in dataset...${colors.nc}`);
let tables;
try {
// Use REST API (dataViewer mode)
tables = await getTablesViaREST(config.projectId, config.datasetId);
// Apply table filtering if specified
// A table is kept when it matches at least one of the glob patterns.
if (config.tableFilter && config.tableFilter.length > 0) {
const originalCount = tables.length;
tables = tables.filter(table =>
config.tableFilter.some(filter => matchesGlob(table.table_name, filter))
);
console.log(`${colors.cyan}Filtered to ${tables.length} tables from ${originalCount} total (filter: ${config.tableFilter.join(", ")})${colors.nc}`);
}
console.log(`${colors.green}✓ Found ${tables.length} objects to extract: ${tables.filter(t => t.table_type === 'TABLE').length} tables, ${tables.filter(t => t.table_type === 'VIEW').length} views${colors.nc}\n`);
} catch (error) {
console.error(`${colors.red}Fatal Error: Could not fetch table list. Check permissions or dataset existence.${colors.nc}`);
console.error(error.message);
process.exit(1);
}
// Process each table/view
// Objects are handled one at a time, in listing order.
let processedCount = 0;
for (const tableInfo of tables) {
const tableName = tableInfo.table_name;
processedCount++;
console.log(`${colors.cyan}[${processedCount}/${tables.length}] Processing: ${colors.nc}${tableName} (${tableInfo.table_type})`);
// Record for this object; the defaults below remain in place if metadata retrieval fails.
const tableData = {
table_name: tableName,
table_type: tableInfo.table_type,
creation_time: tableInfo.creation_time,
has_permission_error: false,
error_details: [],
schema: [],
row_count: tableInfo.row_count || 0,
size_bytes: tableInfo.size_bytes || 0,
sample_data: [],
view_definition: null,
partitioning_info: null,
clustering_info: null
};
try {
// Get schema information via REST API
const metadata = await getTableMetadataViaREST(config.projectId, config.datasetId, tableName);
if (metadata.schema && metadata.schema.fields) {
// Recursively flatten nested fields from REST API response
// Each nested field becomes its own entry placed right after its parent: column_name holds
// the field's own name and nested_field_path holds the dotted path from the top level.
const schema = [];
function processFields(fields, parentPath = "") {
fields.forEach(field => {
const fieldPath = parentPath ? `${parentPath}.${field.name}` : field.name;
schema.push({
column_name: field.name,
ordinal_position: schema.length + 1,
// NOTE(review): any mode other than the literal "NULLABLE" (including a missing mode) is reported as "NO" — confirm the API always sends mode.
is_nullable: field.mode === "NULLABLE" ? "YES" : "NO",
data_type: field.type,
// Partitioning/clustering flags are written as fixed placeholders; they are not derived from the metadata here.
is_partitioning_column: false,
clustering_ordinal_position: null,
nested_field_path: fieldPath,
nested_type: field.type
});
// Handle nested fields recursively
if (field.fields) {
processFields(field.fields, fieldPath);
}
});
}
processFields(metadata.schema.fields);
tableData.schema = schema;
}
// Get additional metadata
// numRows and numBytes are converted with parseInt before being stored.
if (metadata.numRows) {
tableData.row_count = parseInt(metadata.numRows);
}
if (metadata.numBytes) {
tableData.size_bytes = parseInt(metadata.numBytes);
}
if (metadata.timePartitioning) {
tableData.partitioning_info = metadata.timePartitioning;
}
if (metadata.clustering) {
tableData.clustering_info = metadata.clustering;
}
// Get sample data via REST API (works for tables) or query (for views/materialized views)
// Sampling is skipped when the limit is 0 or when no schema could be read.
if (config.sampleLimit > 0 && tableData.schema.length > 0) {
try {
if (tableInfo.table_type === 'VIEW') {
// For views, try using a simple SELECT query instead of the table data endpoint
console.log(` ${colors.cyan}→ Attempting view query for sample data...${colors.nc}`);
try {
const query = `SELECT * FROM \`${config.projectId}.${config.datasetId}.${tableName}\` LIMIT ${config.sampleLimit}`;
// NOTE(review): the 1,000,000-byte cap may be below BigQuery's minimum billable size per query — verify, since a cap that low could make every non-empty view query fail.
const [queryResults] = await bigquery.query({
query: query,
location: config.location,
dryRun: false,
maximumBytesBilled: 1000000 // 1MB limit for safety
});
tableData.sample_data = queryResults;
console.log(` ${colors.green}✓ Sample data from view query: ${tableData.sample_data.length} rows${colors.nc}`);
} catch (queryError) {
// A failed view query is recorded on the table entry but is not treated as a failed object.
console.log(` ${colors.yellow}⚠ View query failed: ${queryError.message}${colors.nc}`);
tableData.error_details.push(`View query failed: ${queryError.message}`);
tableData.sample_data = [];
}
} else {
// For tables, use the REST API table data endpoint
// The flattened schema built above (nested fields included) is what gets passed along.
try {
tableData.sample_data = await getSampleDataViaREST(config.projectId, config.datasetId, tableName, tableData.schema, config.sampleLimit);
console.log(` ${colors.green}✓ Sample data: ${tableData.sample_data.length} rows${colors.nc}`);
} catch (restError) {
// If REST API fails with materialized view error, fallback to query approach
// Detection relies on the text of the error message returned by the API.
if (restError.message.includes('Cannot list a table of type MATERIALIZED_VIEW')) {
console.log(` ${colors.cyan}→ Detected materialized view, using query approach...${colors.nc}`);
try {
const query = `SELECT * FROM \`${config.projectId}.${config.datasetId}.${tableName}\` LIMIT ${config.sampleLimit}`;
const [queryResults] = await bigquery.query({
query: query,
location: config.location,
dryRun: false,
maximumBytesBilled: 50000000 // 50MB limit for materialized views
});
tableData.sample_data = queryResults;
console.log(` ${colors.green}✓ Sample data from materialized view query: ${tableData.sample_data.length} rows${colors.nc}`);
} catch (queryError) {
console.log(` ${colors.yellow}⚠ Materialized view query failed: ${queryError.message}${colors.nc}`);
tableData.error_details.push(`Materialized view query failed: ${queryError.message}`);
tableData.sample_data = [];
}
} else {
throw restError; // Re-throw other REST API errors
}
}
}
} catch (sampleError) {
// Sampling problems are non-fatal: the schema is still kept and the error is recorded.
console.log(` ${colors.yellow}⚠ Could not get sample data: ${sampleError.message}${colors.nc}`);
tableData.error_details.push(`Sample data error: ${sampleError.message}`);
}
}
// Get view definition for views via REST API
// The definition is read from the metadata already fetched above; no extra request is made.
if (tableInfo.table_type === 'VIEW') {
try {
if (metadata.view && metadata.view.query) {
tableData.view_definition = metadata.view.query;
}
} catch (viewError) {
console.log(` ${colors.yellow}⚠ Could not get view definition: ${viewError.message}${colors.nc}`);
tableData.error_details.push(`View definition error: ${viewError.message}`);
}
}
// Update counters
// These counters only advance on the success path; an object that lands in the catch below
// is counted in failed_objects and in neither total_tables nor total_views.
if (tableInfo.table_type === 'TABLE') {
extractionSummary.total_tables++;
} else {
extractionSummary.total_views++;
}
extractionSummary.total_rows_accessible += tableData.row_count || 0;
console.log(` ${colors.green}✓ Schema: ${tableData.schema.length} fields, Samples: ${tableData.sample_data.length} rows${colors.nc}`);
} catch (error) {
// Reached for any error not handled above (in practice metadata retrieval or processing);
// the flag is set regardless of whether the cause was actually a permission problem.
tableData.has_permission_error = true;
tableData.error_details.push(error.message);
extractionSummary.failed_objects++;
console.log(` ${colors.red}✗ Error: ${error.message}${colors.nc}`);
}
// Write individual files
// The schema CSV is written even for failed objects (header row only in that case).
await fs.writeFile(
path.join(config.outputDir, "schemas", `${tableName}_schema.csv`),
[
"column_name,ordinal_position,is_nullable,data_type,is_partitioning_column,clustering_ordinal_position,nested_field_path,nested_type",
...tableData.schema.map(col =>
[col.column_name, col.ordinal_position, col.is_nullable, col.data_type, col.is_partitioning_column, col.clustering_ordinal_position, col.nested_field_path, col.nested_type].map(csvEscape).join(",")
)
].join("\n")
);
// The sample file is only created when at least one row was retrieved.
if (tableData.sample_data.length > 0) {
await fs.writeFile(
path.join(config.outputDir, "samples", `${tableName}_sample.json`),
JSON.stringify(tableData.sample_data, null, 2)
);
}
allTables.push(tableData);
}
// Build schema catalog CSV
// Same columns as the per-table schema files, prefixed with the table name.
const schemaCatalogRows = ["table_name,column_name,ordinal_position,is_nullable,data_type,is_partitioning_column,clustering_ordinal_position,nested_field_path,nested_type"];
allTables.forEach(table => {
table.schema.forEach(col => {
schemaCatalogRows.push(
[table.table_name, col.column_name, col.ordinal_position, col.is_nullable, col.data_type, col.is_partitioning_column, col.clustering_ordinal_position, col.nested_field_path, col.nested_type].map(csvEscape).join(",")
);
});
});
// Write consolidated outputs
await fs.writeFile(path.join(config.outputDir, "reports", "all_schemas_catalog.csv"), schemaCatalogRows.join("\n"));
// Write summary CSV
// One row per object; column_count is the length of the flattened schema, so nested fields are included in it.
const summaryRows = [allTablesSummaryCsv.join(",")];
allTables.forEach(table => {
summaryRows.push([
table.table_name,
table.table_type,
table.row_count || 0,
table.schema.length,
Math.round((table.size_bytes || 0) / 1024 / 1024 * 100) / 100, // MB
0, // partitions - would need additional query
table.creation_time || "",
table.has_permission_error,
table.error_details.join("; ")
].map(csvEscape).join(","));
});
await fs.writeFile(path.join(config.outputDir, "reports", "all_tables_summary.csv"), summaryRows.join("\n"));
// Create raw dataset JSON for audit.js processing
// Bundles the run settings, every table record (including sample rows) and the counters.
const rawDataset = {
extraction_metadata: {
generated_at: new Date().toISOString(),
project_id: config.projectId,
dataset_id: config.datasetId,
region: config.location,
permission_mode: permissionMode,
sample_limit: config.sampleLimit,
table_filter: config.tableFilter
},
tables: allTables,
summary: {
...extractionSummary,
total_objects: allTables.length
}
};
// Write raw dataset for audit.js
await fs.writeFile(path.join(config.outputDir, "reports", "dataset_raw.json"), JSON.stringify(rawDataset, null, 2));
console.log(`\n${colors.green}✔ Data extraction complete!${colors.nc}`);
console.log("==========================================");
console.log(`${colors.green}▸ Extracted:${colors.nc} ${allTables.length} objects (${extractionSummary.total_tables} tables, ${extractionSummary.total_views} views)`);
console.log(`${colors.green}▸ Failed:${colors.nc} ${extractionSummary.failed_objects} objects`);
console.log(`${colors.green}▸ Total Rows:${colors.nc} ${extractionSummary.total_rows_accessible.toLocaleString()}`);
console.log("==========================================");
console.log(`Raw dataset: ${colors.cyan}${path.join(config.outputDir, "reports", "dataset_raw.json")}${colors.nc}`);
console.log(`\n${colors.yellow}Next steps:${colors.nc}`);
console.log(`${colors.cyan}node audit.js${colors.nc} - Run analytics and scoring analysis`);
console.log(`${colors.cyan}node rebuild.js${colors.nc} - Generate HTML report (after audit.js)`);
}
// Run extraction only when this file is the process entry point, not when it is imported.
// pathToFileURL builds the same percent-encoded, platform-correct file URL form that
// import.meta.url uses, so the comparison also holds on Windows and for paths containing
// spaces or other characters that need encoding.
if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
  runDataExtraction().catch(error => {
    console.error(error);
    // Report the failure to the calling shell instead of exiting with status 0.
    process.exitCode = 1;
  });
}
export { runDataExtraction };