UNPKG

hana-cli

Version:
603 lines (531 loc) 16.7 kB
// @ts-check import * as baseLite from '../utils/base-lite.js' import dbClientClass from "../utils/database/index.js" import { buildDocEpilogue } from '../utils/doc-linker.js' const duplicateDetectionOptions = { table: { alias: ['t'], type: 'string', desc: baseLite.bundle.getText("duplicateDetectionTable") }, schema: { alias: ['s'], type: 'string', default: '**CURRENT_SCHEMA**', desc: baseLite.bundle.getText("duplicateDetectionSchema") }, keyColumns: { alias: ['k'], type: 'string', desc: baseLite.bundle.getText("duplicateDetectionKeyColumns") }, checkColumns: { alias: ['c'], type: 'string', desc: baseLite.bundle.getText("duplicateDetectionCheckColumns") }, excludeColumns: { alias: ['e'], type: 'string', desc: baseLite.bundle.getText("duplicateDetectionExcludeColumns") }, mode: { alias: ['m'], choices: ["exact", "fuzzy", "partial"], default: "exact", type: 'string', desc: baseLite.bundle.getText("duplicateDetectionMode") }, threshold: { alias: ['th'], type: 'number', default: 0.95, desc: baseLite.bundle.getText("duplicateDetectionThreshold") }, output: { alias: ['o'], type: 'string', desc: baseLite.bundle.getText("duplicateDetectionOutput") }, format: { alias: ['f'], choices: ["json", "csv", "summary"], default: "summary", type: 'string', desc: baseLite.bundle.getText("duplicateDetectionFormat") }, limit: { alias: ['l'], type: 'number', default: 10000, desc: baseLite.bundle.getText("duplicateDetectionLimit") }, timeout: { alias: ['to'], type: 'number', default: 3600, desc: baseLite.bundle.getText("duplicateDetectionTimeout") }, profile: { alias: ['p'], type: 'string', desc: baseLite.bundle.getText("profile") } } export const command = 'duplicateDetection' export const aliases = ['dupdetect', 'findDuplicates', 'duplicates'] export const describe = baseLite.bundle.getText("duplicateDetection") export const builder = (yargs) => yargs.options(baseLite.getBuilder(duplicateDetectionOptions)) .example('hana-cli duplicateDetection --table myTable --mode exact --threshold 0.95', baseLite.bundle.getText("duplicateDetectionExample")).wrap(160).epilog(buildDocEpilogue('duplicateDetection', 'data-tools', ['dataProfile', 'dataValidator'])) export let inputPrompts = { table: { description: baseLite.bundle.getText("duplicateDetectionTable"), type: 'string', required: true }, schema: { description: baseLite.bundle.getText("duplicateDetectionSchema"), type: 'string', required: false }, keyColumns: { description: baseLite.bundle.getText("duplicateDetectionKeyColumns"), type: 'string', required: false, ask: () => false }, checkColumns: { description: baseLite.bundle.getText("duplicateDetectionCheckColumns"), type: 'string', required: false, ask: () => false }, mode: { description: baseLite.bundle.getText("duplicateDetectionMode"), type: 'string', required: false, ask: () => false }, output: { description: baseLite.bundle.getText("duplicateDetectionOutput"), type: 'string', required: false, ask: () => false }, limit: { description: baseLite.bundle.getText("duplicateDetectionLimit"), type: 'number', required: false, default: 10000, ask: () => false }, timeout: { description: baseLite.bundle.getText("duplicateDetectionTimeout"), type: 'number', required: false, default: 3600, ask: () => false }, profile: { description: baseLite.bundle.getText("profile"), type: 'string', required: false, ask: () => { } } } /** * Command handler function * @param {object} argv - Command line arguments from yargs * @returns {Promise<void>} */ export async function handler(argv) { const base = await import('../utils/base.js') base.promptHandler(argv, duplicateDetectionMain, inputPrompts, true, true, duplicateDetectionOptions) } /** * Find duplicate records * @param {object} prompts - User prompts * @returns {Promise<void>} */ export async function duplicateDetectionMain(prompts) { const base = await import('../utils/base.js') base.debug('duplicateDetectionMain') try { base.setPrompts(prompts) // Set operation timeout const timeoutHandle = prompts.timeout > 0 ? setTimeout(() => process.exit(1), prompts.timeout * 1000) : null // Connect to database const dbClient = await dbClientClass.getNewClient(prompts) await dbClient.connect() const dbKind = (dbClient.getKind() || 'hana').toLowerCase() // Get schema if not provided let schema = prompts.schema if (schema === '**CURRENT_SCHEMA**') { schema = null } if (!schema && dbKind !== 'sqlite') { schema = await getCurrentSchema(dbClient, dbKind) } const table = prompts.table console.log(baseLite.bundle.getText("info.startingDuplicateDetection", [table])) // Get table columns const tableColumns = await getTableColumns(dbClient, schema, table, dbKind) if (tableColumns.length === 0) { throw new Error(baseLite.bundle.getText("error.noColumnsDetailed", [table, schema || ''])) } // Determine columns to check let checkColumns = tableColumns if (prompts.checkColumns) { const selected = prompts.checkColumns.split(',').map(c => c.trim()).filter(c => c) const missing = selected.filter(c => !checkColumns.includes(c)) if (missing.length > 0) { console.warn(baseLite.colors.yellow(baseLite.bundle.getText( "warning.duplicateDetectionCheckColumnsMissing", [missing.join(', '), checkColumns.join(', ')] ))) } const matched = checkColumns.filter(c => selected.includes(c)) if (selected.length > 0 && matched.length === 0) { console.warn(baseLite.colors.yellow(baseLite.bundle.getText( "warning.duplicateDetectionCheckColumnsNone", [selected.join(', '), checkColumns.join(', ')] ))) } else if (matched.length > 0) { checkColumns = matched } } if (prompts.excludeColumns) { const excluded = prompts.excludeColumns.split(',').map(c => c.trim()).filter(c => c) checkColumns = checkColumns.filter(c => !excluded.includes(c)) } // Get data to check let query = `SELECT * FROM ${formatQualifiedName(schema, table)}` if (prompts.limit > 0) { query += ` LIMIT ${prompts.limit}` } const rows = await dbClient.execSQL(query) // Detect duplicates const results = detectDuplicates(rows, checkColumns, prompts.mode, prompts.threshold) // Output results if (prompts.output) { await outputResults(prompts.output, results, prompts.format) } else { displayResults(results, prompts.format) } console.log(baseLite.bundle.getText("success.duplicateDetectionComplete", [ results.totalRows, results.uniqueRows, results.duplicateGroups, results.totalDuplicates ])) await dbClient.disconnect() if (timeoutHandle) clearTimeout(timeoutHandle) } catch (error) { console.error(baseLite.bundle.getText("error.duplicateDetection", [error.message])) base.debug(error) throw error } } /** * Get current schema * @param {object} dbClient - Database client * @param {string} dbKind - Database kind * @returns {Promise<string>} */ async function getCurrentSchema(dbClient, dbKind) { if (dbKind === 'hana') { const result = await dbClient.execSQL('SELECT CURRENT_SCHEMA FROM DUMMY') return result[0]?.CURRENT_SCHEMA || 'PUBLIC' } else if (dbKind === 'postgres') { const result = await dbClient.execSQL('SELECT current_schema()') return result[0]?.current_schema || 'public' } return 'public' } /** * Get table columns * @param {object} dbClient - Database client * @param {string|null} schema - Schema name * @param {string} table - Table name * @param {string} dbKind - Database kind * @returns {Promise<Array<string>>} */ async function getTableColumns(dbClient, schema, table, dbKind) { let query if (dbKind === 'hana') { query = `SELECT COLUMN_NAME FROM SYS.TABLE_COLUMNS WHERE SCHEMA_NAME = ? AND TABLE_NAME = ? ORDER BY POSITION` const result = await dbClient.execSQL(query, [schema || 'PUBLIC', table.toUpperCase()]) return result.map(r => r.COLUMN_NAME) } else if (dbKind === 'postgres') { query = `SELECT column_name FROM information_schema.columns WHERE table_schema = ? AND table_name = ? ORDER BY ordinal_position` const result = await dbClient.execSQL(query, [schema || 'public', table.toLowerCase()]) return result.map(r => r.column_name) } return [] } /** * Format qualified table name * @param {string|null} schema - Schema name * @param {string} table - Table name * @returns {string} */ function formatQualifiedName(schema, table) { if (schema) { return `"${schema}"."${table}"` } return `"${table}"` } /** * Detect duplicate records * @param {Array<object>} rows - Data rows * @param {Array<string>} columns - Columns to check * @param {string} mode - Detection mode (exact, fuzzy, partial) * @param {number} threshold - Similarity threshold for fuzzy matching * @returns {object} */ function detectDuplicates(rows, columns, mode, threshold) { const results = { totalRows: rows.length, uniqueRows: 0, duplicateGroups: 0, totalDuplicates: 0, duplicates: [], stats: {} } if (mode === 'exact') { results.duplicates = detectExactDuplicates(rows, columns) } else if (mode === 'fuzzy') { results.duplicates = detectFuzzyDuplicates(rows, columns, threshold) } else if (mode === 'partial') { results.duplicates = detectPartialDuplicates(rows, columns) } results.duplicateGroups = results.duplicates.length results.totalDuplicates = results.duplicates.reduce((sum, group) => sum + group.records.length - 1, 0) results.uniqueRows = results.totalRows - results.totalDuplicates // Calculate statistics for (const group of results.duplicates) { const key = group.matchKey || 'unknown' if (!results.stats[key]) { results.stats[key] = 0 } results.stats[key]++ } return results } /** * Detect exact duplicates * @param {Array<object>} rows - Data rows * @param {Array<string>} columns - Columns to check * @returns {Array<object>} */ function detectExactDuplicates(rows, columns) { const groups = new Map() const duplicates = [] for (let i = 0; i < rows.length; i++) { const row = rows[i] const key = columns.map(c => String(row[c] || '')).join('||') if (!groups.has(key)) { groups.set(key, []) } groups.get(key).push(i) } for (const [key, indices] of groups.entries()) { if (indices.length > 1) { duplicates.push({ matchKey: key, matchPercentage: 100, count: indices.length, records: indices.map(idx => ({ rowNumber: idx, data: rows[idx] })) }) } } return duplicates } /** * Detect fuzzy duplicates using Levenshtein distance * @param {Array<object>} rows - Data rows * @param {Array<string>} columns - Columns to check * @param {number} threshold - Similarity threshold (0-1) * @returns {Array<object>} */ function detectFuzzyDuplicates(rows, columns, threshold) { const duplicates = [] const matched = new Set() for (let i = 0; i < rows.length; i++) { if (matched.has(i)) continue const group = [{ rowNumber: i, data: rows[i] }] for (let j = i + 1; j < rows.length; j++) { if (matched.has(j)) continue const similarity = calculateSimilarity(rows[i], rows[j], columns) if (similarity >= threshold) { group.push({ rowNumber: j, data: rows[j] }) matched.add(j) } } if (group.length > 1) { duplicates.push({ matchKey: `fuzzy_group_${duplicates.length}`, matchPercentage: Math.round(threshold * 100), count: group.length, records: group }) } matched.add(i) } return duplicates } /** * Detect partial duplicates * @param {Array<object>} rows - Data rows * @param {Array<string>} columns - Columns to check * @returns {Array<object>} */ function detectPartialDuplicates(rows, columns) { const groups = new Map() const duplicates = [] for (let i = 0; i < rows.length; i++) { const row = rows[i] const key = columns.slice(0, 1).map(c => String(row[c] || '')).join('||') if (!groups.has(key)) { groups.set(key, []) } groups.get(key).push(i) } for (const [key, indices] of groups.entries()) { if (indices.length > 1) { duplicates.push({ matchKey: key, matchPercentage: 50, count: indices.length, records: indices.map(idx => ({ rowNumber: idx, data: rows[idx] })) }) } } return duplicates } /** * Calculate similarity between two rows * @param {object} row1 - First row * @param {object} row2 - Second row * @param {Array<string>} columns - Columns to compare * @returns {number} */ function calculateSimilarity(row1, row2, columns) { let totalSimilarity = 0 for (const col of columns) { const val1 = String(row1[col] || '') const val2 = String(row2[col] || '') const similarity = 1 - (levenshteinDistance(val1, val2) / Math.max(val1.length, val2.length, 1)) totalSimilarity += similarity } return columns.length > 0 ? totalSimilarity / columns.length : 0 } /** * Calculate Levenshtein distance between two strings * @param {string} str1 - First string * @param {string} str2 - Second string * @returns {number} */ function levenshteinDistance(str1, str2) { const track = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null)) for (let i = 0; i <= str1.length; i += 1) { track[0][i] = i } for (let j = 0; j <= str2.length; j += 1) { track[j][0] = j } for (let j = 1; j <= str2.length; j += 1) { for (let i = 1; i <= str1.length; i += 1) { const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1 track[j][i] = Math.min( track[j][i - 1] + 1, track[j - 1][i] + 1, track[j - 1][i - 1] + indicator ) } } return track[str2.length][str1.length] } /** * Output results to file * @param {string} filePath - Output file path * @param {object} results - Detection results * @param {string} format - Output format * @returns {Promise<void>} */ async function outputResults(filePath, results, format) { const fsModule = await import('fs') const fs = fsModule.promises let content if (format === 'json') { content = JSON.stringify(results, null, 2) } else if (format === 'csv') { content = 'Group,Rows,Similarity\n' for (const group of results.duplicates) { content += `"${group.matchKey}","${group.count}","${group.matchPercentage}%"\n` } } else { content = formatSummaryReport(results) } await fs.writeFile(filePath, content) } /** * Format summary report * @param {object} results - Detection results * @returns {string} */ function formatSummaryReport(results) { let report = 'Duplicate Detection Report\n' report += '==========================\n\n' report += `Total Rows: ${results.totalRows}\n` report += `Unique Rows: ${results.uniqueRows}\n` report += `Duplicate Groups: ${results.duplicateGroups}\n` report += `Total Duplicates: ${results.totalDuplicates}\n\n` if (results.duplicates.length > 0) { report += 'Duplicate Groups:\n' for (const group of results.duplicates.slice(0, 20)) { report += ` Group: ${group.matchKey}, Records: ${group.count}, Match: ${group.matchPercentage}%\n` } if (results.duplicates.length > 20) { report += ` ... and ${results.duplicates.length - 20} more groups\n` } } return report } /** * Display results in console * @param {object} results - Detection results * @param {string} format - Display format * @returns {void} */ function displayResults(results, format) { if (format === 'json') { console.log(JSON.stringify(results, null, 2)) } else if (format === 'csv') { console.log('Group,Rows,Similarity') for (const group of results.duplicates) { console.log(`"${group.matchKey}","${group.count}","${group.matchPercentage}%"`) } } else { console.log(formatSummaryReport(results)) } }