// hana-cli
// Version:
// HANA Developer Command Line Interface
// 561 lines (517 loc) • 17 kB
// JavaScript
// @ts-check
import * as baseLite from '../utils/base-lite.js'
import dbClientClass from "../utils/database/index.js"
import { buildDocEpilogue } from '../utils/doc-linker.js'
// Name under which this command module is exposed on the command line
export const command = 'dataProfile'
// Alternative names accepted for the same command
export const aliases = ['prof', 'profileData', 'dataStats']
// Description text, looked up in the text bundle under the "dataProfile" key
export const describe = baseLite.bundle.getText("dataProfile")
/**
 * Registers the options, usage example and epilogue of the dataProfile
 * command on the yargs instance it is given.
 * @param {object} yargs - yargs instance to configure
 * @returns {object} result of the chained yargs calls
 */
export const builder = (yargs) => {
  // Shorthand for the text bundle lookup used by every option description
  const text = (key) => baseLite.bundle.getText(key)

  const optionSpec = {
    table: { alias: ['t'], type: 'string', desc: text("dataProfileTable") },
    schema: { alias: ['s'], type: 'string', default: '**CURRENT_SCHEMA**', desc: text("dataProfileSchema") },
    columns: { alias: ['c'], type: 'string', desc: text("dataProfileColumns") },
    output: { alias: ['o'], type: 'string', desc: text("dataProfileOutput") },
    format: {
      alias: ['f'],
      choices: ["json", "csv", "summary"],
      default: "summary",
      type: 'string',
      desc: text("dataProfileFormat")
    },
    nullAnalysis: { alias: ['na'], type: 'boolean', default: true, desc: text("dataProfileNullAnalysis") },
    cardinalityAnalysis: { alias: ['ca'], type: 'boolean', default: true, desc: text("dataProfileCardinalityAnalysis") },
    statisticalAnalysis: { alias: ['sa'], type: 'boolean', default: true, desc: text("dataProfileStatisticalAnalysis") },
    patternAnalysis: { alias: ['pa'], type: 'boolean', default: false, desc: text("dataProfilePatternAnalysis") },
    sampleSize: { alias: ['ss'], type: 'number', default: 10000, desc: text("dataProfileSampleSize") },
    timeout: { alias: ['to'], type: 'number', default: 3600, desc: text("dataProfileTimeout") },
    profile: { alias: ['p'], type: 'string', desc: text("profile") }
  }

  const withOptions = yargs.options(baseLite.getBuilder(optionSpec)).wrap(160)
  const withExample = withOptions.example(
    'hana-cli dataProfile --table myTable --format summary',
    text("dataProfileExample")
  )
  const epilogue = buildDocEpilogue('dataProfile', 'data-tools', ['dataValidator', 'duplicateDetection'])
  return withExample.epilog(epilogue)
}
/**
 * Prompt definitions passed to the prompt handler together with the command
 * arguments. Only `table` is required; every entry built by `silent` carries
 * an `ask` callback that returns false.
 */
export let inputPrompts = (() => {
  // Shorthand for the text bundle lookup used by every description
  const text = (key) => baseLite.bundle.getText(key)

  /**
   * Optional prompt entry whose `ask` callback always returns false.
   * @param {string} textKey - text bundle key of the description
   * @param {string} type - value type of the prompt
   */
  const silent = (textKey, type) => ({
    description: text(textKey),
    type,
    required: false,
    ask: () => false
  })

  return {
    table: {
      description: text("dataProfileTable"),
      type: 'string',
      required: true
    },
    schema: {
      description: text("dataProfileSchema"),
      type: 'string',
      required: false
    },
    columns: silent("dataProfileColumns", 'string'),
    output: silent("dataProfileOutput", 'string'),
    format: silent("dataProfileFormat", 'string'),
    nullAnalysis: silent("dataProfileNullAnalysis", 'boolean'),
    cardinalityAnalysis: silent("dataProfileCardinalityAnalysis", 'boolean'),
    statisticalAnalysis: silent("dataProfileStatisticalAnalysis", 'boolean'),
    patternAnalysis: silent("dataProfilePatternAnalysis", 'boolean'),
    sampleSize: silent("dataProfileSampleSize", 'number'),
    timeout: silent("dataProfileTimeout", 'number'),
    profile: {
      description: text("profile"),
      type: 'string',
      required: false,
      // Unlike the other entries this callback returns undefined, not false
      ask: () => { }
    },
    debug: silent("debug", 'boolean'),
    disableVerbose: silent("disableVerbose", 'boolean'),
    admin: silent("admin", 'boolean'),
    conn: silent("connFile", 'string')
  }
})()
/**
 * Command handler: loads the shared base utilities on demand and lets their
 * prompt handler run dataProfileMain with the prompt definitions of this module.
 * @param {object} argv - Command line arguments from yargs
 * @returns {Promise<void>}
 */
export async function handler(argv) {
  // Imported lazily so the module is only loaded when the command actually runs
  const baseUtils = await import('../utils/base.js')
  await baseUtils.promptHandler(argv, dataProfileMain, inputPrompts, true, false)
}
/**
 * Generate data quality metrics and statistics for one table.
 *
 * Validates the table name, connects to the database, resolves the schema,
 * reads the table's column list, profiles the selected columns and then either
 * prints the result or writes it to the file named by `prompts.output`.
 * Any failure (including the operation timeout) is reported on stderr and
 * ends the process with exit code 1.
 *
 * @param {object} prompts - User prompts
 * @returns {Promise<void>}
 */
export async function dataProfileMain(prompts) {
  const base = await import('../utils/base.js')
  let dbClient = null
  let timeoutHandle = null
  try {
    base.setPrompts(prompts)
    base.debug('dataProfileMain')

    // Reject a missing table name before any connection is opened
    const table = prompts.table
    if (!table) {
      throw new Error('Table name is required (use --table)')
    }

    // Abort the whole operation once it exceeds the configured timeout (seconds);
    // say why before exiting so the failure is not silent
    if (prompts.timeout > 0) {
      timeoutHandle = setTimeout(() => {
        console.error(`Data profile error: operation timed out after ${prompts.timeout} seconds`)
        process.exit(1)
      }, prompts.timeout * 1000)
    }

    // Connect to database
    dbClient = await dbClientClass.getNewClient(prompts)
    await dbClient.connect()
    const dbKind = (dbClient.getKind() || 'hana').toLowerCase()

    // Resolve the **CURRENT_SCHEMA** placeholder (or a missing schema);
    // sqlite is skipped here and keeps the value as given
    let schema = prompts.schema
    if (!schema || schema === '**CURRENT_SCHEMA**') {
      if (dbKind !== 'sqlite') {
        schema = await getCurrentSchema(dbClient, dbKind)
      }
    }

    console.log(`Starting data profile analysis for table: ${table}`)

    // Get table columns
    const columns = await getTableColumns(dbClient, schema, table, dbKind)

    // Restrict to the requested columns, if any were named
    let profileColumns = columns
    if (prompts.columns) {
      const selected = prompts.columns.split(',').map(c => c.trim()).filter(c => c)
      profileColumns = columns.filter(c => selected.includes(c))
      // Tell the user about names that matched nothing instead of dropping them silently
      const unknown = selected.filter(c => !columns.includes(c))
      if (unknown.length > 0) {
        console.error(`Warning: column(s) not found in table ${table}: ${unknown.join(', ')}`)
      }
    }

    // Generate profile
    const profile = await generateDataProfile(
      dbClient,
      schema,
      table,
      profileColumns,
      prompts,
      dbKind
    )

    // Output results
    const formatTypes = ['json', 'csv', 'summary']
    if (prompts.output && formatTypes.includes(prompts.output.toLowerCase())) {
      // --output was given a format name instead of a file path: treat it as the format
      displayProfile(profile, prompts.output.toLowerCase())
    } else if (prompts.output) {
      // --output is a file path
      await outputProfile(prompts.output, profile, prompts.format)
    } else {
      // No file specified, display to console with the requested format
      displayProfile(profile, prompts.format)
    }

    console.log(`Data profile complete for table ${table}. Rows: ${profile.rowCount}, Columns: ${profile.columnCount}`)
    await dbClient.disconnect()
    if (timeoutHandle) clearTimeout(timeoutHandle)
  } catch (error) {
    console.error(`Data profile error: ${error.message}`)
    base.debug(error)
    if (timeoutHandle) clearTimeout(timeoutHandle)
    if (dbClient) {
      try {
        await dbClient.disconnect()
      } catch (disconnectError) {
        // Best effort only: the original failure is what gets reported
        base.debug(disconnectError)
      }
    }
    process.exit(1)
  }
}
/**
 * Look up the column names of a table in the database catalog.
 *
 * HANA reads SYS.TABLE_COLUMNS (table name upper-cased, schema defaulting to
 * 'PUBLIC'); PostgreSQL reads information_schema.columns (table name
 * lower-cased, schema defaulting to 'public'). Any other database kind
 * yields an empty list.
 *
 * @param {object} dbClient - Database client
 * @param {string|null} schema - Schema name
 * @param {string} table - Table name
 * @param {string} dbKind - Database kind
 * @returns {Promise<Array<string>>}
 */
async function getTableColumns(dbClient, schema, table, dbKind) {
  switch (dbKind) {
    case 'hana': {
      const sql = [
        'SELECT COLUMN_NAME FROM SYS.TABLE_COLUMNS',
        'WHERE SCHEMA_NAME = ? AND TABLE_NAME = ?',
        'ORDER BY POSITION'
      ].join('\n')
      const rows = await dbClient.execSQL(sql, [schema || 'PUBLIC', table.toUpperCase()])
      return rows.map((row) => row.COLUMN_NAME)
    }
    case 'postgres': {
      const sql = [
        'SELECT column_name FROM information_schema.columns',
        'WHERE table_schema = ? AND table_name = ?',
        'ORDER BY ordinal_position'
      ].join('\n')
      const rows = await dbClient.execSQL(sql, [schema || 'public', table.toLowerCase()])
      return rows.map((row) => row.column_name)
    }
    default:
      return []
  }
}
/**
 * Generate data profile for table.
 *
 * Counts the table's rows, then profiles every requested column in turn. The
 * row count is handed to the per-column profiling through `options.rowCount`
 * (on a copy; the caller's options object is not modified) so that null
 * percentages are computed against the real table size.
 *
 * @param {object} dbClient - Database client
 * @param {string|null} schema - Schema name
 * @param {string} table - Table name
 * @param {Array<string>} columns - Columns to profile
 * @param {object} options - Profiling options
 * @param {string} dbKind - Database kind
 * @returns {Promise<object>} profile with table, schema, rowCount, columnCount,
 *   per-column results and metadata about the enabled analyses
 * @throws rethrows any error raised while counting rows or profiling columns
 */
async function generateDataProfile(dbClient, schema, table, columns, options, dbKind) {
  const profile = {
    table,
    schema: schema || 'N/A',
    rowCount: 0,
    columnCount: columns.length,
    columns: {},
    metadata: {
      profiledAt: new Date().toISOString(),
      nullAnalysis: options.nullAnalysis,
      cardinalityAnalysis: options.cardinalityAnalysis,
      statisticalAnalysis: options.statisticalAnalysis,
      patternAnalysis: options.patternAnalysis
    }
  }
  try {
    // Get row count
    const tableName = formatQualifiedName(schema, table)
    const countResult = await dbClient.execSQL(`SELECT COUNT(*) as COUNT FROM ${tableName}`)
    const countRow = countResult?.[0]
    // PostgreSQL folds the unquoted alias to lower case and may return the
    // count as a string, so accept both spellings and coerce to a number
    profile.rowCount = Number(countRow?.COUNT ?? countRow?.count ?? 0) || 0

    // Profile each column, passing the row count along for percentage calculations
    const columnOptions = { ...options, rowCount: profile.rowCount }
    for (const column of columns) {
      profile.columns[column] = await profileColumn(
        dbClient,
        schema,
        table,
        column,
        columnOptions,
        dbKind
      )
    }
  } catch (error) {
    baseLite.debug(`Error generating profile: ${error.message}`)
    throw error
  }
  return profile
}
/**
 * Profile individual column.
 *
 * Depending on the enabled options this collects the null count and null
 * percentage, the distinct count, min/max/average values and value-length
 * statistics; the five most frequent non-null values are always collected.
 * Errors are logged through the debug channel and never thrown: whatever was
 * gathered up to that point is returned.
 *
 * @param {object} dbClient - Database client
 * @param {string|null} schema - Schema name
 * @param {string} table - Table name
 * @param {string} column - Column name
 * @param {object} options - Profiling options
 * @param {string} dbKind - Database kind
 * @returns {Promise<object>}
 */
async function profileColumn(dbClient, schema, table, column, options, dbKind) {
  const profile = {
    name: column,
    type: 'UNKNOWN'
  }
  // Read a result field by its upper-case alias, falling back to the lower-case
  // spelling PostgreSQL produces for unquoted aliases. A null value stays null.
  const readField = (row, key) => {
    if (row === null || row === undefined) return undefined
    return key in row ? row[key] : row[key.toLowerCase()]
  }
  try {
    const tableName = formatQualifiedName(schema, table)
    // Double embedded quotes so the name cannot terminate the delimited identifier
    const colQuoted = `"${String(column).replace(/"/g, '""')}"`

    // NULL analysis: total and null rows come from one query, so the
    // percentage does not depend on the caller supplying a row count
    if (options.nullAnalysis) {
      const nullQuery = `SELECT COUNT(*) as TOTAL_COUNT,
        SUM(CASE WHEN ${colQuoted} IS NULL THEN 1 ELSE 0 END) as NULL_COUNT
        FROM ${tableName}`
      const result = await dbClient.execSQL(nullQuery)
      const row = result?.[0]
      const totalRows = Number(readField(row, 'TOTAL_COUNT')) || 0
      // SUM over an empty table yields NULL, which maps to 0 here
      profile.nullCount = Number(readField(row, 'NULL_COUNT')) || 0
      profile.nullPercentage = totalRows > 0 ? (profile.nullCount / totalRows) * 100 : 0
    }

    // Cardinality analysis
    if (options.cardinalityAnalysis) {
      const cardQuery = `SELECT COUNT(DISTINCT ${colQuoted}) as DISTINCT_COUNT FROM ${tableName}`
      const result = await dbClient.execSQL(cardQuery)
      profile.distinctCount = Number(readField(result?.[0], 'DISTINCT_COUNT')) || 0
    }

    // Statistical analysis
    if (options.statisticalAnalysis) {
      const nonNullRows = `FROM ${tableName} WHERE ${colQuoted} IS NOT NULL`
      const fullStatsQuery = `SELECT
        MIN(${colQuoted}) as MIN_VALUE,
        MAX(${colQuoted}) as MAX_VALUE,
        AVG(CAST(${colQuoted} AS NUMERIC)) as AVG_VALUE
        ${nonNullRows}`
      const minMaxQuery = `SELECT
        MIN(${colQuoted}) as MIN_VALUE,
        MAX(${colQuoted}) as MAX_VALUE
        ${nonNullRows}`
      let statsRow
      try {
        statsRow = (await dbClient.execSQL(fullStatsQuery))?.[0]
      } catch (error) {
        // The numeric cast fails for non-numeric data; min and max are still
        // meaningful there, so retry without the average
        baseLite.debug(`Average not available for column ${column}: ${error.message}`)
        try {
          statsRow = (await dbClient.execSQL(minMaxQuery))?.[0]
        } catch (fallbackError) {
          baseLite.debug(`Statistical analysis not available for column: ${column}`)
        }
      }
      if (statsRow) {
        profile.minValue = readField(statsRow, 'MIN_VALUE')
        profile.maxValue = readField(statsRow, 'MAX_VALUE')
        const average = readField(statsRow, 'AVG_VALUE')
        if (average !== undefined) {
          profile.avgValue = average
        }
      }
    }

    // Length analysis of the values rendered as text
    if (options.patternAnalysis) {
      const lenQuery = `SELECT
        MIN(LENGTH(CAST(${colQuoted} AS VARCHAR))) as MIN_LEN,
        MAX(LENGTH(CAST(${colQuoted} AS VARCHAR))) as MAX_LEN,
        AVG(LENGTH(CAST(${colQuoted} AS VARCHAR))) as AVG_LEN
        FROM ${tableName} WHERE ${colQuoted} IS NOT NULL`
      try {
        const lenRow = (await dbClient.execSQL(lenQuery))?.[0]
        if (lenRow) {
          profile.minLength = readField(lenRow, 'MIN_LEN')
          profile.maxLength = readField(lenRow, 'MAX_LEN')
          profile.avgLength = readField(lenRow, 'AVG_LEN')
        }
      } catch (error) {
        baseLite.debug(`Pattern analysis not available for column: ${column}`)
      }
    }

    // Get top values (runs regardless of the analysis switches)
    const topQuery = `SELECT ${colQuoted} as VALUE, COUNT(*) as COUNT FROM ${tableName}
      WHERE ${colQuoted} IS NOT NULL
      GROUP BY ${colQuoted}
      ORDER BY COUNT DESC LIMIT 5`
    try {
      const result = await dbClient.execSQL(topQuery)
      profile.topValues = result.map(r => ({
        value: readField(r, 'VALUE'),
        count: readField(r, 'COUNT')
      }))
    } catch (error) {
      baseLite.debug(`Top values analysis not available for column: ${column}`)
      profile.topValues = []
    }
  } catch (error) {
    baseLite.debug(`Error profiling column ${column}: ${error.message}`)
  }
  return profile
}
/**
 * Display profile in console.
 *
 * 'summary' prints a readable report, 'csv' prints one line per column under
 * the header `column,type,nullCount,distinctCount,minValue,maxValue`, and
 * 'json' prints the whole profile object. Any other format prints nothing.
 *
 * @param {object} profile - Profile data
 * @param {string} format - Display format
 * @returns {void}
 */
function displayProfile(profile, format = 'summary') {
  if (format === 'summary') {
    console.log(`\n${baseLite.colors.cyan('Table Data Profile')}\n`)
    console.log(`Table: ${profile.table}`)
    console.log(`Schema: ${profile.schema}`)
    console.log(`Rows: ${profile.rowCount}`)
    console.log(`Columns: ${profile.columnCount}`)
    console.log(`\n${baseLite.colors.green('Column Profiles:')}`)
    for (const [colName, colProfile] of Object.entries(profile.columns)) {
      console.log(`\n ${colName}:`)
      console.log(` Null Count: ${colProfile.nullCount || 0}`)
      console.log(` Distinct: ${colProfile.distinctCount || 0}`)
      if (colProfile.minValue !== undefined) {
        console.log(` Min: ${colProfile.minValue}`)
      }
      if (colProfile.maxValue !== undefined) {
        console.log(` Max: ${colProfile.maxValue}`)
      }
      if (colProfile.topValues && colProfile.topValues.length > 0) {
        // Only the three most frequent values are shown in the summary
        console.log(` Top Values: ${colProfile.topValues.slice(0, 3).map(v => v.value).join(', ')}`)
      }
    }
  } else if (format === 'csv') {
    // Render one CSV field: empty for null/undefined, quoted (with embedded
    // quotes doubled) when it contains a quote, comma or line break
    const toCsvField = (value) => {
      if (value === null || value === undefined) return ''
      const text = value instanceof Date ? value.toISOString() : String(value)
      return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
    }
    console.log('column,type,nullCount,distinctCount,minValue,maxValue')
    for (const [colName, colProfile] of Object.entries(profile.columns)) {
      // Fill every column the header announces instead of leaving type/min/max blank
      const fields = [
        colName,
        colProfile.type,
        colProfile.nullCount || 0,
        colProfile.distinctCount || 0,
        colProfile.minValue,
        colProfile.maxValue
      ]
      console.log(fields.map(toCsvField).join(','))
    }
  } else if (format === 'json') {
    console.log(JSON.stringify(profile, null, 2))
  }
}
/**
 * Output profile to file.
 *
 * 'csv' writes one line per column under the header
 * `column,nullCount,distinctCount,minValue,maxValue,topValues`; every other
 * format (including 'summary') writes the profile as pretty-printed JSON.
 *
 * @param {string} filePath - Output file path
 * @param {object} profile - Profile data
 * @param {string} format - Output format
 * @returns {Promise<void>}
 */
async function outputProfile(filePath, profile, format = 'json') {
  const fs = await import('fs')
  if (format !== 'csv') {
    await fs.promises.writeFile(filePath, JSON.stringify(profile, null, 2), 'utf8')
    return
  }

  // Render a value as text: empty for null/undefined, ISO string for dates
  const asText = (value) => {
    if (value === null || value === undefined) return ''
    return value instanceof Date ? value.toISOString() : String(value)
  }
  const quoted = (text) => `"${text.replace(/"/g, '""')}"`
  // Quote a field only when it contains a quote, comma or line break
  const csvField = (value) => {
    const text = asText(value)
    return /[",\r\n]/.test(text) ? quoted(text) : text
  }

  const rows = ['column,nullCount,distinctCount,minValue,maxValue,topValues']
  for (const [colName, colProfile] of Object.entries(profile.columns)) {
    const topVals = colProfile.topValues
      ? colProfile.topValues.map(v => asText(v.value)).join(';')
      : ''
    rows.push([
      csvField(colName),
      csvField(colProfile.nullCount || 0),
      csvField(colProfile.distinctCount || 0),
      csvField(colProfile.minValue),
      csvField(colProfile.maxValue),
      // The top-values list is always quoted, as in the previous file layout
      quoted(topVals)
    ].join(','))
  }
  await fs.promises.writeFile(filePath, `${rows.join('\n')}\n`, 'utf8')
}
/**
 * Ask the database for the session's current schema.
 *
 * Only 'hana' and 'postgres' are queried; any other kind, an empty result or
 * a failing query (which is logged through the debug channel) yields null.
 *
 * @param {object} dbClient - Database client
 * @param {string} dbKind - Database kind
 * @returns {Promise<string|null>}
 */
async function getCurrentSchema(dbClient, dbKind) {
  try {
    switch (dbKind) {
      case 'hana': {
        const rows = await dbClient.execSQL("SELECT CURRENT_SCHEMA FROM DUMMY")
        return rows?.[0]?.CURRENT_SCHEMA || null
      }
      case 'postgres': {
        const rows = await dbClient.execSQL("SELECT current_schema()")
        return rows?.[0]?.current_schema || null
      }
      default:
        return null
    }
  } catch (err) {
    baseLite.debug(`Error getting current schema: ${err.message}`)
    return null
  }
}
/**
 * Format qualified table name (schema.table) as delimited SQL identifiers.
 *
 * Each part is wrapped in double quotes and any double quote inside it is
 * doubled, so a name taken from user input cannot end the identifier early
 * and inject further SQL. Without a schema only the quoted table is returned.
 *
 * @param {string|null} schema - Schema name
 * @param {string} table - Table name
 * @returns {string}
 */
function formatQualifiedName(schema, table) {
  const quote = (identifier) => `"${String(identifier).replace(/"/g, '""')}"`
  return schema ? `${quote(schema)}.${quote(table)}` : quote(table)
}