UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

344 lines (343 loc) 14 kB
/**
 * Generates semantic-validation SQL for a table description: a validation
 * view, and optionally a quarantine table and a dbt model.
 *
 * Target-specific generators, the template engine and the performance monitor
 * are all created lazily on first use.
 *
 * NOTE(review): this file uses `export` (ES module syntax) together with
 * `require(...)`. In a native Node.js ES module `require` is not defined, so
 * the lazy loaders below only work when a bundler/CommonJS wrapper provides
 * it — confirm against the package's build setup.
 */
export class SQLGenerator {
  /** `performance.now()` reading taken when the latest generation started. */
  startTime = 0;
  /** Target name -> `{ loader, instance }`; `instance` stays null until first use. */
  targetGenerators = new Map();
  templateEngine;
  performanceMonitor;

  constructor() {
    this.loadTargetGenerators();
    this.initializePerformanceOptimizations();
  }

  /** Resets the lazily-created collaborators so they are built on first use. */
  initializePerformanceOptimizations() {
    this.templateEngine = null;
    this.performanceMonitor = null;
  }

  /**
   * Returns the shared template engine, creating it on first call.
   * @returns {object} An `SQLTemplateEngine` from `./sql-templates`.
   */
  getTemplateEngine() {
    if (!this.templateEngine) {
      const { SQLTemplateEngine } = require('./sql-templates');
      this.templateEngine = new SQLTemplateEngine();
    }
    return this.templateEngine;
  }

  /**
   * Returns the shared performance monitor, creating it on first call.
   * @returns {object} A `TemplatePerformanceMonitor` from `./sql-templates`.
   */
  getPerformanceMonitor() {
    if (!this.performanceMonitor) {
      const { TemplatePerformanceMonitor } = require('./sql-templates');
      this.performanceMonitor = new TemplatePerformanceMonitor();
    }
    return this.performanceMonitor;
  }

  /**
   * Main entry point: builds the validation view and, when requested, the
   * quarantine table (`options.enableQuarantine`) and dbt model
   * (`options.dbtCompatible`).
   *
   * @param {object} config - Needs a non-empty `tableName` and a non-empty
   *   `columns` list; may carry extra `validationRules`.
   * @param {object} [options] - Reads `target`, `enableQuarantine`, `dbtCompatible`.
   * @returns {Promise<object>} `{ validationView, metadata, quarantineTable?, dbtModel? }`.
   * @throws {Error} When `config` is missing `tableName`/`columns`, or when
   *   `options.target` is not a registered target.
   */
  async generateSemanticValidation(config, options = {}) {
    const startedAt = performance.now();
    this.startTime = startedAt;

    // Optional chaining keeps a missing config/columns on the descriptive
    // error path instead of surfacing as a TypeError.
    if (!config?.tableName || !config.columns?.length) {
      throw new Error('Invalid configuration: tableName and columns are required');
    }

    const settings = options ?? {};
    const { target } = settings;
    const generator = this.getTargetGenerator(target);
    const validationRules = this.buildValidationRules(config);

    const templateEngine = this.getTemplateEngine();
    const templateString = this.getDialectValidationTemplate(templateEngine, target);
    const compiledTemplate = templateEngine.compile(templateString, `validation_${target}`);
    const validationView = this.generateOptimizedValidationView(
      config,
      validationRules,
      generator,
      compiledTemplate,
      settings,
    );

    const generationTime = performance.now() - startedAt;
    this.getPerformanceMonitor().recordRenderTime(generationTime);

    const result = {
      validationView,
      metadata: {
        generationTime,
        ruleCount: validationRules.length,
        columnCount: config.columns.length,
        targetDialect: target,
      },
    };

    if (settings.enableQuarantine) {
      result.quarantineTable = this.generateOptimizedQuarantineTable(
        config,
        validationRules,
        generator,
        settings,
      );
    }
    if (settings.dbtCompatible) {
      result.dbtModel = this.generateOptimizedDbtModel(
        config,
        validationRules,
        generator,
        settings,
      );
    }
    return result;
  }

  /**
   * Renders the validation view through an already-compiled template.
   * @param {object} config - Table description (`tableName`, `columns`).
   * @param {object[]} rules - Validation rules to turn into cases.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} template - Compiled template exposing `render(context)`.
   * @param {object} options - Unused here; kept for signature compatibility.
   * @returns {*} Whatever `template.render` returns.
   */
  generateOptimizedValidationView(config, rules, generator, template, options) {
    const validationCases = rules.map((rule) => generator.generateValidationCase(rule));
    return template.render({
      tableName: config.tableName,
      viewName: `${config.tableName}_semantic_valid`,
      columns: config.columns.map((col) => col.name),
      validationCases,
    });
  }

  /**
   * Renders the quarantine table through the template engine's quarantine template.
   * @param {object} config - Table description (`tableName`).
   * @param {object[]} rules - Validation rules to turn into cases.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} options - Reads `target` (used in the compile cache key).
   * @returns {*} Whatever the compiled template's `render` returns.
   */
  generateOptimizedQuarantineTable(config, rules, generator, options) {
    const templateEngine = this.getTemplateEngine();
    const templateString = templateEngine.getQuarantineTableTemplate();
    const compiledTemplate = templateEngine.compile(templateString, `quarantine_${options.target}`);
    const validationCases = rules.map((rule) => generator.generateValidationCase(rule));
    return compiledTemplate.render({
      tableName: config.tableName,
      viewName: '',
      columns: ['*'],
      validationCases,
      metadata: {
        generation_id: `${config.tableName}_${Date.now()}`,
      },
    });
  }

  /**
   * Builds the dbt model SQL by delegating to `DbtGenerator` from `./dbt-generator`.
   * @param {object} config - Table description passed through to the dbt generator.
   * @param {object[]} rules - Unused here; kept for signature compatibility.
   * @param {object} generator - Unused here; kept for signature compatibility.
   * @param {object} options - Reads `target` (stored in the model's `meta`).
   * @returns {*} The `sql` property of the generated semantic model.
   */
  generateOptimizedDbtModel(config, rules, generator, options) {
    const { DbtGenerator } = require('./dbt-generator');
    const dbtGenerator = new DbtGenerator();
    const semanticModel = dbtGenerator.generateSemanticValidationModel(config, {
      materialized: 'view',
      tags: ['semantic_validation', 'generated'],
      meta: {
        target_dialect: options.target,
        performance_optimized: true,
      },
    });
    return semanticModel.sql;
  }

  /**
   * Collects all rules for a config: custom `config.validationRules` first,
   * then the rules derived from each column.
   * @param {object} config - Table description.
   * @returns {object[]} Ordered list of validation rules.
   */
  buildValidationRules(config) {
    const rules = [];
    if (config.validationRules) {
      rules.push(...config.validationRules);
    }
    for (const column of config.columns) {
      rules.push(...this.generateSemanticRules(column));
    }
    return rules;
  }

  /**
   * Derives validation rules for one column from its semantic attachment
   * (`semantic_context.semantic_type`) and its CID concept facets.
   *
   * @param {object} column - Reads `name`, `dataType`, `semanticAttachment`, `cidConcept`.
   * @returns {object[]} Rules of shape
   *   `{ ruleType, column, condition, errorCode, description, severity }`.
   */
  generateSemanticRules(column) {
    const rules = [];
    const { name } = column;
    // Both containers are optional, so every nested level is guarded.
    const semanticType = column.semanticAttachment?.semantic_context?.semantic_type;
    const facets = column.cidConcept?.facets;

    if (semanticType === 'identifier') {
      rules.push({
        ruleType: 'null_check',
        column: name,
        condition: `${name} IS NOT NULL`,
        errorCode: 'NULL_IDENTIFIER',
        description: `Identifier column ${name} cannot be null`,
        severity: 'error',
      });
    }

    if (semanticType === 'email_address') {
      rules.push({
        ruleType: 'format_check',
        column: name,
        condition: this.getEmailValidationCondition(name),
        errorCode: 'INVALID_EMAIL_FORMAT',
        description: `Email address ${name} must be valid format`,
        severity: 'error',
      });
    }

    if (semanticType === 'phone_number') {
      rules.push({
        ruleType: 'format_check',
        column: name,
        condition: this.getPhoneValidationCondition(name),
        errorCode: 'INVALID_PHONE_FORMAT',
        description: `Phone number ${name} must be valid format`,
        severity: 'warning',
      });
    }

    if (semanticType === 'monetary_value') {
      rules.push({
        ruleType: 'range_check',
        column: name,
        condition: `${name} >= 0`,
        errorCode: 'NEGATIVE_MONETARY_VALUE',
        description: `Monetary value ${name} cannot be negative`,
        severity: 'warning',
      });
    }

    if (facets?.pii) {
      rules.push({
        ruleType: 'custom',
        column: name,
        condition: `LENGTH(${name}) > 0`,
        errorCode: 'EMPTY_PII_FIELD',
        description: `PII field ${name} cannot be empty`,
        severity: 'error',
      });
    }

    if (facets?.temporal && column.dataType === 'datetime') {
      rules.push({
        ruleType: 'range_check',
        column: name,
        condition: `${name} <= CURRENT_TIMESTAMP`,
        errorCode: 'FUTURE_TIMESTAMP',
        description: `Timestamp ${name} cannot be in the future`,
        severity: 'warning',
      });
    }

    // An empty example list would render `IN ()`, which is not valid SQL,
    // so the rule is only emitted when there is at least one allowed value.
    const examples = column.cidConcept?.examples;
    if (facets?.categorical && Array.isArray(examples) && examples.length > 0) {
      const allowedValues = examples
        .map((value) => `'${String(value).replace(/'/g, "''")}'`) // double single quotes for SQL
        .join(', ');
      rules.push({
        ruleType: 'custom',
        column: name,
        condition: `${name} IN (${allowedValues})`,
        errorCode: 'INVALID_CATEGORICAL_VALUE',
        description: `Categorical value ${name} must be one of allowed values`,
        severity: 'error',
      });
    }

    return rules;
  }

  /**
   * Picks the template engine's validation template for a target; anything
   * other than snowflake/bigquery/duckdb falls back to the postgres template.
   * @param {object} templateEngine - Engine exposing the `get*ValidationTemplate` methods.
   * @param {string} target - Target dialect name.
   * @returns {*} The template returned by the engine.
   */
  getDialectValidationTemplate(templateEngine, target) {
    switch (target) {
      case 'snowflake':
        return templateEngine.getSnowflakeValidationTemplate();
      case 'bigquery':
        return templateEngine.getBigQueryValidationTemplate();
      case 'duckdb':
        return templateEngine.getDuckDBValidationTemplate();
      case 'postgres':
      default:
        return templateEngine.getPostgresValidationTemplate();
    }
  }

  /**
   * Builds the SQL condition that checks a column against an email regex.
   * NOTE(review): `~` is PostgreSQL's regex-match operator; confirm the other
   * target generators translate it for their dialects.
   * @param {string} columnName - Column to validate.
   * @returns {string} SQL boolean expression.
   */
  getEmailValidationCondition(columnName) {
    return `${columnName} ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'`;
  }

  /**
   * Builds the SQL condition that checks a column against a phone-number regex.
   * NOTE(review): same `~` dialect caveat as the email condition.
   * @param {string} columnName - Column to validate.
   * @returns {string} SQL boolean expression.
   */
  getPhoneValidationCondition(columnName) {
    return `${columnName} ~ '^[+]?[1-9]?[0-9]{7,15}$'`;
  }

  /**
   * Substitutes `{{UPPER_SNAKE}}` placeholders in a single pass.
   *
   * A replacer function is used on purpose: with a plain string replacement,
   * `String.prototype.replace` interprets sequences such as `$'` and `$&`,
   * and the regex conditions produced by this class end in `$'`.
   * Placeholders without a matching key are left untouched.
   *
   * @param {string} template - Text containing `{{NAME}}` placeholders.
   * @param {object} values - Placeholder name -> replacement value.
   * @returns {string} The filled-in text.
   */
  fillTemplate(template, values) {
    return template.replace(/\{\{([A-Z_]+)\}\}/g, (placeholder, key) =>
      (Object.prototype.hasOwnProperty.call(values, key) ? String(values[key]) : placeholder));
  }

  /** Joins the generator's validation cases into one SQL fragment. */
  joinValidationCases(rules, generator) {
    return rules.map((rule) => generator.generateValidationCase(rule)).join(',\n ');
  }

  /** Joins the configured column names into a SELECT list fragment. */
  joinSelectColumns(config) {
    return config.columns.map((col) => col.name).join(',\n ');
  }

  /**
   * String-template variant of the validation view.
   * @param {object} config - Table description (`tableName`, `columns`).
   * @param {object[]} rules - Validation rules.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} options - Reads `target`.
   * @returns {Promise<string>} CREATE VIEW statement.
   */
  async generateValidationView(config, rules, generator, options) {
    return this.fillTemplate(this.getValidationTemplate(options.target), {
      TABLE_NAME: config.tableName,
      SELECT_COLUMNS: this.joinSelectColumns(config),
      VALIDATION_CASES: this.joinValidationCases(rules, generator),
      VIEW_NAME: `${config.tableName}_semantic_valid`,
    });
  }

  /**
   * String-template variant of the quarantine table.
   * @param {object} config - Table description (`tableName`, `columns`).
   * @param {object[]} rules - Validation rules.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} options - Reads `target`.
   * @returns {Promise<string>} CREATE TABLE statement.
   */
  async generateQuarantineTable(config, rules, generator, options) {
    return this.fillTemplate(this.getQuarantineTemplate(options.target), {
      TABLE_NAME: config.tableName,
      SELECT_COLUMNS: this.joinSelectColumns(config),
      // The quarantine template contains this placeholder, so it must be filled.
      VALIDATION_CASES: this.joinValidationCases(rules, generator),
      QUARANTINE_TABLE: `${config.tableName}_quarantine`,
    });
  }

  /**
   * String-template variant of the dbt model.
   * @param {object} config - Table description (`tableName`, `columns`).
   * @param {object[]} rules - Validation rules.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} options - Unused here; kept for signature compatibility.
   * @returns {Promise<string>} dbt model source.
   */
  async generateDbtModel(config, rules, generator, options) {
    const dbtConfig = {
      materialized: 'view',
      tags: ['semantic_validation', 'generated'],
      meta: {
        generated_by: 'anchor_semantic_toolkit',
        generation_time: new Date().toISOString(),
        rule_count: rules.length,
      },
    };
    return this.fillTemplate(this.getDbtTemplate(), {
      DBT_CONFIG: JSON.stringify(dbtConfig, null, 2),
      TABLE_NAME: config.tableName,
      SELECT_COLUMNS: this.joinSelectColumns(config),
      VALIDATION_CASES: this.joinValidationCases(rules, generator),
    });
  }

  /**
   * Registers a loader per supported target. Only the loader is stored;
   * the generator itself is created by `getTargetGenerator` on first use.
   */
  loadTargetGenerators() {
    const loaders = {
      snowflake: () => {
        const { SnowflakeSQLGenerator } = require('./targets/snowflake');
        return new SnowflakeSQLGenerator();
      },
      bigquery: () => {
        const { BigQuerySQLGenerator } = require('./targets/bigquery');
        return new BigQuerySQLGenerator();
      },
      duckdb: () => {
        const { DuckDBSQLGenerator } = require('./targets/duckdb');
        return new DuckDBSQLGenerator();
      },
      postgres: () => {
        const { PostgresSQLGenerator } = require('./targets/postgres');
        return new PostgresSQLGenerator();
      },
    };
    for (const [target, loader] of Object.entries(loaders)) {
      this.targetGenerators.set(target, { loader, instance: null });
    }
  }

  /**
   * Returns the generator for a target, instantiating it on first request.
   * @param {string} target - Target dialect name.
   * @returns {object} The target-specific SQL generator.
   * @throws {Error} When no loader is registered for `target`.
   */
  getTargetGenerator(target) {
    const generatorEntry = this.targetGenerators.get(target);
    if (!generatorEntry) {
      throw new Error(`Unsupported SQL target: ${target}`);
    }
    if (!generatorEntry.instance) {
      generatorEntry.instance = generatorEntry.loader();
    }
    return generatorEntry.instance;
  }

  /**
   * Template for the view that keeps only rows whose status is 'PASS'.
   * @param {string} target - Currently ignored; the same template is used for every target.
   * @returns {string} Template with `{{...}}` placeholders.
   */
  getValidationTemplate(target) {
    const baseTemplate = [
      '',
      'CREATE OR REPLACE VIEW {{VIEW_NAME}} AS',
      'WITH semantic_checks AS (',
      'SELECT {{SELECT_COLUMNS}},',
      'CASE {{VALIDATION_CASES}}',
      "ELSE 'PASS'",
      'END as _semantic_status,',
      'CURRENT_TIMESTAMP as _validation_timestamp',
      'FROM {{TABLE_NAME}}',
      ')',
      'SELECT * FROM semantic_checks',
      "WHERE _semantic_status = 'PASS'",
    ].join(' ');
    return baseTemplate;
  }

  /**
   * Template for the table that keeps only rows whose status is not 'PASS'.
   * @param {string} target - Currently ignored; the same template is used for every target.
   * @returns {string} Template with `{{...}}` placeholders.
   */
  getQuarantineTemplate(target) {
    return [
      '',
      'CREATE OR REPLACE TABLE {{QUARANTINE_TABLE}} AS',
      'WITH semantic_checks AS (',
      'SELECT {{SELECT_COLUMNS}},',
      'CASE {{VALIDATION_CASES}}',
      "ELSE 'PASS'",
      'END as _semantic_status,',
      'CURRENT_TIMESTAMP as _quarantine_timestamp',
      'FROM {{TABLE_NAME}}',
      ')',
      'SELECT * FROM semantic_checks',
      "WHERE _semantic_status != 'PASS'",
    ].join(' ');
  }

  /**
   * Template for the dbt model; the `{{ config(...) }}` and `{{ ref(...) }}`
   * wrappers are emitted literally around the filled-in placeholders.
   * @returns {string} Template with `{{...}}` placeholders.
   */
  getDbtTemplate() {
    return [
      '',
      '{{ config({{DBT_CONFIG}}) }}',
      'WITH semantic_checks AS (',
      'SELECT {{SELECT_COLUMNS}},',
      'CASE {{VALIDATION_CASES}}',
      "ELSE 'PASS'",
      'END as _semantic_status,',
      'CURRENT_TIMESTAMP as _validation_timestamp',
      "FROM {{ ref('{{TABLE_NAME}}') }}",
      ')',
      'SELECT * FROM semantic_checks',
      "WHERE _semantic_status = 'PASS'",
    ].join(' ');
  }
}
//# sourceMappingURL=sql-generator.js.map