semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
344 lines (343 loc) • 14 kB
JavaScript
/**
 * Substitutes `{{UPPER_SNAKE}}` placeholders in a template in a single pass.
 *
 * A replacer function is used on purpose: when the replacement is passed to
 * `String.prototype.replace` as a plain string, sequences such as `$'`, `$&`
 * or `$1` are interpreted as replacement patterns. Generated SQL conditions
 * (for example regexes ending in `$'`) routinely contain them.
 *
 * Placeholders with no entry in `values` are left untouched, and substituted
 * text is never re-scanned for further placeholders.
 *
 * @param {string} template - Template text containing `{{NAME}}` placeholders.
 * @param {Object<string, *>} values - Placeholder name to replacement value.
 * @returns {string} The template with known placeholders substituted.
 */
function fillPlaceholders(template, values) {
  return template.replace(/\{\{([A-Z_]+)\}\}/g, (placeholder, key) =>
    (Object.hasOwn(values, key) ? String(values[key]) : placeholder));
}

/**
 * Builds semantic-validation SQL (validation view, optional quarantine table
 * and optional dbt model) for a table description, for one of the registered
 * targets: snowflake, bigquery, duckdb or postgres.
 *
 * Target generators, the template engine and the performance monitor are all
 * loaded lazily on first use.
 */
export class SQLGenerator {
  /** Timestamp (from `performance.now()`) of the most recent generation run. */
  startTime = 0;
  /** Map of target name -> `{ loader, instance }`; `instance` is created on demand. */
  targetGenerators = new Map();
  templateEngine;
  performanceMonitor;

  constructor() {
    this.loadTargetGenerators();
    this.initializePerformanceOptimizations();
  }

  /**
   * Resets the lazily created collaborators so they are loaded on first use.
   */
  initializePerformanceOptimizations() {
    // Lazy load dependencies for better performance
    this.templateEngine = null;
    this.performanceMonitor = null;
  }

  /**
   * Returns the shared template engine, creating it from './sql-templates'
   * on the first call.
   *
   * @returns {object} The cached `SQLTemplateEngine` instance.
   */
  getTemplateEngine() {
    if (!this.templateEngine) {
      const { SQLTemplateEngine } = require('./sql-templates');
      this.templateEngine = new SQLTemplateEngine();
    }
    return this.templateEngine;
  }

  /**
   * Returns the shared performance monitor, creating it from
   * './sql-templates' on the first call.
   *
   * @returns {object} The cached `TemplatePerformanceMonitor` instance.
   */
  getPerformanceMonitor() {
    if (!this.performanceMonitor) {
      const { TemplatePerformanceMonitor } = require('./sql-templates');
      this.performanceMonitor = new TemplatePerformanceMonitor();
    }
    return this.performanceMonitor;
  }

  /**
   * Generates the validation view SQL plus, when requested through
   * `options.enableQuarantine` / `options.dbtCompatible`, a quarantine table
   * and a dbt model.
   *
   * @param {object} config - Table description; reads `tableName`, `columns`
   *   and the optional `validationRules`.
   * @param {object} options - Reads `target`, `enableQuarantine` and
   *   `dbtCompatible`.
   * @returns {Promise<object>} `{ validationView, metadata }` with optional
   *   `quarantineTable` and `dbtModel` properties.
   * @throws {Error} When `tableName` is missing or `columns` is missing/empty,
   *   or when `options.target` is not a registered target.
   */
  async generateSemanticValidation(config, options) {
    const startedAt = performance.now();
    this.startTime = startedAt;
    // Validate inputs early. Optional chaining makes a missing config or a
    // missing columns list fail with the intended message, not a TypeError.
    if (!config?.tableName || !config?.columns?.length) {
      throw new Error('Invalid configuration: tableName and columns are required');
    }
    const generator = this.getTargetGenerator(options.target);
    const validationRules = this.buildValidationRules(config);
    // Use template engine for performance with dialect-aware templates
    const templateEngine = this.getTemplateEngine();
    const templateString = this.getDialectValidationTemplate(templateEngine, options.target);
    const compiledTemplate = templateEngine.compile(templateString, `validation_${options.target}`);
    const validationView = this.generateOptimizedValidationView(
      config,
      validationRules,
      generator,
      compiledTemplate,
      options,
    );
    const generationTime = performance.now() - startedAt;
    // Track performance
    const monitor = this.getPerformanceMonitor();
    monitor.recordRenderTime(generationTime);
    const result = {
      validationView,
      metadata: {
        generationTime,
        ruleCount: validationRules.length,
        columnCount: config.columns.length,
        targetDialect: options.target,
      },
    };
    // Generate additional components if requested
    if (options.enableQuarantine) {
      result.quarantineTable = this.generateOptimizedQuarantineTable(config, validationRules, generator, options);
    }
    if (options.dbtCompatible) {
      result.dbtModel = this.generateOptimizedDbtModel(config, validationRules, generator, options);
    }
    return result;
  }

  /**
   * Renders the validation view through an already compiled template.
   *
   * @param {object} config - Table description (`tableName`, `columns`).
   * @param {Array<object>} rules - Validation rules to turn into cases.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} template - Compiled template exposing `render(context)`.
   * @param {object} options - Currently unused by this method.
   * @returns {*} Whatever `template.render` returns.
   */
  generateOptimizedValidationView(config, rules, generator, template, options) {
    const validationCases = rules.map((rule) => generator.generateValidationCase(rule));
    const context = {
      tableName: config.tableName,
      viewName: `${config.tableName}_semantic_valid`,
      columns: config.columns.map((col) => col.name),
      validationCases,
    };
    return template.render(context);
  }

  /**
   * Renders the quarantine table SQL using the template engine's quarantine
   * template, compiled under the key `quarantine_<target>`.
   *
   * @param {object} config - Table description (`tableName`).
   * @param {Array<object>} rules - Validation rules to turn into cases.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} options - Reads `target`.
   * @returns {*} Whatever the compiled template's `render` returns.
   */
  generateOptimizedQuarantineTable(config, rules, generator, options) {
    // Build dialect-aware quarantine table SQL using the template engine
    const templateEngine = this.getTemplateEngine();
    const templateString = templateEngine.getQuarantineTableTemplate();
    const compiledTemplate = templateEngine.compile(templateString, `quarantine_${options.target}`);
    const validationCases = rules.map((rule) => generator.generateValidationCase(rule));
    const context = {
      tableName: config.tableName,
      viewName: '',
      columns: ['*'],
      validationCases,
      metadata: {
        generation_id: `${config.tableName}_${Date.now()}`,
      },
    };
    return compiledTemplate.render(context);
  }

  /**
   * Delegates dbt model generation to `DbtGenerator` from './dbt-generator'
   * and returns the `sql` property of its result.
   *
   * @param {object} config - Table description passed through to the dbt generator.
   * @param {Array<object>} rules - Currently unused by this method.
   * @param {object} generator - Currently unused by this method.
   * @param {object} options - Reads `target` (recorded in the model meta).
   * @returns {*} The `sql` of the generated semantic validation model.
   */
  generateOptimizedDbtModel(config, rules, generator, options) {
    const { DbtGenerator } = require('./dbt-generator');
    const dbtGenerator = new DbtGenerator();
    const semanticModel = dbtGenerator.generateSemanticValidationModel(config, {
      materialized: 'view',
      tags: ['semantic_validation', 'generated'],
      meta: {
        target_dialect: options.target,
        performance_optimized: true,
      },
    });
    return semanticModel.sql;
  }

  /**
   * Collects all validation rules for a config: caller-supplied
   * `config.validationRules` first, then the rules derived from each column.
   *
   * @param {object} config - Reads `validationRules` (optional) and `columns`.
   * @returns {Array<object>} A new array of rule objects.
   */
  buildValidationRules(config) {
    const rules = [];
    // Add custom rules first
    if (config.validationRules) {
      rules.push(...config.validationRules);
    }
    // Generate semantic rules from attachments and CID concepts
    for (const column of config.columns) {
      rules.push(...this.generateSemanticRules(column));
    }
    return rules;
  }

  /**
   * Derives validation rules for one column from its semantic attachment
   * (`semanticAttachment.semantic_context.semantic_type`) and its CID concept
   * (`cidConcept.facets`, `cidConcept.examples`).
   *
   * Missing intermediate objects are treated as "no information" and yield
   * no rule instead of throwing.
   *
   * @param {object} column - Reads `name`, `dataType`, `semanticAttachment`
   *   and `cidConcept`.
   * @returns {Array<object>} Rule objects with `ruleType`, `column`,
   *   `condition`, `errorCode`, `description` and `severity`.
   */
  generateSemanticRules(column) {
    const rules = [];
    const { name } = column;
    const semanticType = column.semanticAttachment?.semantic_context?.semantic_type;
    const facets = column.cidConcept?.facets;

    // Basic null checks for identifiers
    if (semanticType === 'identifier') {
      rules.push({
        ruleType: 'null_check',
        column: name,
        condition: `${name} IS NOT NULL`,
        errorCode: 'NULL_IDENTIFIER',
        description: `Identifier column ${name} cannot be null`,
        severity: 'error',
      });
    }
    // Email format validation
    if (semanticType === 'email_address') {
      rules.push({
        ruleType: 'format_check',
        column: name,
        condition: this.getEmailValidationCondition(name),
        errorCode: 'INVALID_EMAIL_FORMAT',
        description: `Email address ${name} must be valid format`,
        severity: 'error',
      });
    }
    // Phone number format validation
    if (semanticType === 'phone_number') {
      rules.push({
        ruleType: 'format_check',
        column: name,
        condition: this.getPhoneValidationCondition(name),
        errorCode: 'INVALID_PHONE_FORMAT',
        description: `Phone number ${name} must be valid format`,
        severity: 'warning',
      });
    }
    // Monetary value range checks
    if (semanticType === 'monetary_value') {
      rules.push({
        ruleType: 'range_check',
        column: name,
        condition: `${name} >= 0`,
        errorCode: 'NEGATIVE_MONETARY_VALUE',
        description: `Monetary value ${name} cannot be negative`,
        severity: 'warning',
      });
    }
    // PII sensitivity checks from CID concepts
    if (facets?.pii) {
      rules.push({
        ruleType: 'custom',
        column: name,
        condition: `LENGTH(${name}) > 0`,
        errorCode: 'EMPTY_PII_FIELD',
        description: `PII field ${name} cannot be empty`,
        severity: 'error',
      });
    }
    // Temporal validation
    if (facets?.temporal && column.dataType === 'datetime') {
      rules.push({
        ruleType: 'range_check',
        column: name,
        condition: `${name} <= CURRENT_TIMESTAMP`,
        errorCode: 'FUTURE_TIMESTAMP',
        description: `Timestamp ${name} cannot be in the future`,
        severity: 'warning',
      });
    }
    // Categorical validation. An empty example list is skipped: it would
    // otherwise produce `col IN ()`, which is not valid SQL.
    const examples = column.cidConcept?.examples;
    if (facets?.categorical && Array.isArray(examples) && examples.length > 0) {
      const allowedValues = examples
        // Double embedded single quotes so each value is a valid SQL string literal.
        .map((value) => `'${String(value).replace(/'/g, "''")}'`)
        .join(', ');
      rules.push({
        ruleType: 'custom',
        column: name,
        condition: `${name} IN (${allowedValues})`,
        errorCode: 'INVALID_CATEGORICAL_VALUE',
        description: `Categorical value ${name} must be one of allowed values`,
        severity: 'error',
      });
    }
    return rules;
  }

  /**
   * Picks the validation template for a target from the template engine.
   * Any target other than snowflake, bigquery or duckdb falls back to the
   * Postgres template.
   *
   * @param {object} templateEngine - Engine exposing the `get*ValidationTemplate` methods.
   * @param {string} target - Target dialect name.
   * @returns {*} The template returned by the engine.
   */
  getDialectValidationTemplate(templateEngine, target) {
    switch (target) {
      case 'snowflake':
        return templateEngine.getSnowflakeValidationTemplate();
      case 'bigquery':
        return templateEngine.getBigQueryValidationTemplate();
      case 'duckdb':
        return templateEngine.getDuckDBValidationTemplate();
      case 'postgres':
      default:
        return templateEngine.getPostgresValidationTemplate();
    }
  }

  /**
   * Builds the SQL condition that checks a column against an email regex.
   *
   * NOTE(review): `~` is the PostgreSQL regex-match operator and the same
   * condition is produced for every target — confirm the target generators
   * translate it for dialects that do not support `~`.
   *
   * @param {string} columnName - Column to validate.
   * @returns {string} SQL boolean expression.
   */
  getEmailValidationCondition(columnName) {
    return `${columnName} ~ '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$'`;
  }

  /**
   * Builds the SQL condition that checks a column against a phone-number
   * regex (optional `+`, optional leading 1-9 digit, then 7 to 15 digits).
   *
   * NOTE(review): uses the same `~` operator as the email check — confirm
   * dialect support.
   *
   * @param {string} columnName - Column to validate.
   * @returns {string} SQL boolean expression.
   */
  getPhoneValidationCondition(columnName) {
    return `${columnName} ~ '^[+]?[1-9]?[0-9]{7,15}$'`;
  }

  /**
   * Builds the validation view SQL from the built-in string template.
   *
   * @param {object} config - Table description (`tableName`, `columns`).
   * @param {Array<object>} rules - Validation rules to turn into CASE branches.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} options - Reads `target`.
   * @returns {Promise<string>} The `CREATE OR REPLACE VIEW` statement.
   */
  async generateValidationView(config, rules, generator, options) {
    const template = this.getValidationTemplate(options.target);
    const validationCases = rules
      .map((rule) => generator.generateValidationCase(rule))
      .join(',\n ');
    const selectColumns = config.columns.map((col) => col.name).join(',\n ');
    return fillPlaceholders(template, {
      TABLE_NAME: config.tableName,
      SELECT_COLUMNS: selectColumns,
      VALIDATION_CASES: validationCases,
      VIEW_NAME: `${config.tableName}_semantic_valid`,
    });
  }

  /**
   * Builds the quarantine table SQL from the built-in string template.
   *
   * @param {object} config - Table description (`tableName`, `columns`).
   * @param {Array<object>} rules - Validation rules to turn into CASE branches.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} options - Reads `target`.
   * @returns {Promise<string>} The `CREATE OR REPLACE TABLE` statement.
   */
  async generateQuarantineTable(config, rules, generator, options) {
    const template = this.getQuarantineTemplate(options.target);
    // The quarantine template contains a {{VALIDATION_CASES}} placeholder, so
    // the cases must be rendered here just like in the validation view.
    const validationCases = rules
      .map((rule) => generator.generateValidationCase(rule))
      .join(',\n ');
    const selectColumns = config.columns.map((col) => col.name).join(',\n ');
    return fillPlaceholders(template, {
      TABLE_NAME: config.tableName,
      SELECT_COLUMNS: selectColumns,
      VALIDATION_CASES: validationCases,
      QUARANTINE_TABLE: `${config.tableName}_quarantine`,
    });
  }

  /**
   * Builds a dbt model from the built-in string template. The dbt config
   * block is emitted as pretty-printed JSON and records the rule count and
   * the generation time.
   *
   * @param {object} config - Table description (`tableName`, `columns`).
   * @param {Array<object>} rules - Validation rules to turn into CASE branches.
   * @param {object} generator - Target generator exposing `generateValidationCase(rule)`.
   * @param {object} options - Currently unused by this method.
   * @returns {Promise<string>} The dbt model source.
   */
  async generateDbtModel(config, rules, generator, options) {
    const template = this.getDbtTemplate();
    const validationCases = rules
      .map((rule) => generator.generateValidationCase(rule))
      .join(',\n ');
    const selectColumns = config.columns.map((col) => col.name).join(',\n ');
    const dbtConfig = {
      materialized: 'view',
      tags: ['semantic_validation', 'generated'],
      meta: {
        generated_by: 'anchor_semantic_toolkit',
        generation_time: new Date().toISOString(),
        rule_count: rules.length,
      },
    };
    return fillPlaceholders(template, {
      DBT_CONFIG: JSON.stringify(dbtConfig, null, 2),
      TABLE_NAME: config.tableName,
      SELECT_COLUMNS: selectColumns,
      VALIDATION_CASES: validationCases,
    });
  }

  /**
   * Registers a loader per supported target. Nothing is required or
   * instantiated here; that happens in `getTargetGenerator`.
   */
  loadTargetGenerators() {
    // Lazy load target generators for better startup performance
    const loaders = {
      snowflake: () => {
        const { SnowflakeSQLGenerator } = require('./targets/snowflake');
        return new SnowflakeSQLGenerator();
      },
      bigquery: () => {
        const { BigQuerySQLGenerator } = require('./targets/bigquery');
        return new BigQuerySQLGenerator();
      },
      duckdb: () => {
        const { DuckDBSQLGenerator } = require('./targets/duckdb');
        return new DuckDBSQLGenerator();
      },
      postgres: () => {
        const { PostgresSQLGenerator } = require('./targets/postgres');
        return new PostgresSQLGenerator();
      },
    };
    // Store loaders, not instances, for performance
    for (const [target, loader] of Object.entries(loaders)) {
      this.targetGenerators.set(target, { loader, instance: null });
    }
  }

  /**
   * Returns the generator for a target, instantiating and caching it on
   * first use.
   *
   * @param {string} target - Registered target name.
   * @returns {object} The cached target generator instance.
   * @throws {Error} When the target has not been registered.
   */
  getTargetGenerator(target) {
    const generatorEntry = this.targetGenerators.get(target);
    if (!generatorEntry) {
      throw new Error(`Unsupported SQL target: ${target}`);
    }
    // Lazy instantiate the generator
    if (!generatorEntry.instance) {
      generatorEntry.instance = generatorEntry.loader();
    }
    return generatorEntry.instance;
  }

  /**
   * Returns the built-in validation view template. Rows whose
   * `_semantic_status` is 'PASS' are kept.
   *
   * @param {string} target - Currently unused; one template serves all targets.
   * @returns {string} Template with `{{VIEW_NAME}}`, `{{SELECT_COLUMNS}}`,
   *   `{{VALIDATION_CASES}}` and `{{TABLE_NAME}}` placeholders.
   */
  getValidationTemplate(target) {
    // The leading empty entry reproduces the template's initial newline.
    const baseTemplate = [
      '',
      'CREATE OR REPLACE VIEW {{VIEW_NAME}} AS',
      'WITH semantic_checks AS (',
      'SELECT',
      '{{SELECT_COLUMNS}},',
      'CASE',
      '{{VALIDATION_CASES}}',
      "ELSE 'PASS'",
      'END as _semantic_status,',
      'CURRENT_TIMESTAMP as _validation_timestamp',
      'FROM {{TABLE_NAME}}',
      ')',
      "SELECT * FROM semantic_checks WHERE _semantic_status = 'PASS'",
    ].join('\n');
    return baseTemplate;
  }

  /**
   * Returns the built-in quarantine table template. Rows whose
   * `_semantic_status` is not 'PASS' are kept.
   *
   * @param {string} target - Currently unused; one template serves all targets.
   * @returns {string} Template with `{{QUARANTINE_TABLE}}`,
   *   `{{SELECT_COLUMNS}}`, `{{VALIDATION_CASES}}` and `{{TABLE_NAME}}`
   *   placeholders.
   */
  getQuarantineTemplate(target) {
    // The leading empty entry reproduces the template's initial newline.
    return [
      '',
      'CREATE OR REPLACE TABLE {{QUARANTINE_TABLE}} AS',
      'WITH semantic_checks AS (',
      'SELECT',
      '{{SELECT_COLUMNS}},',
      'CASE',
      '{{VALIDATION_CASES}}',
      "ELSE 'PASS'",
      'END as _semantic_status,',
      'CURRENT_TIMESTAMP as _quarantine_timestamp',
      'FROM {{TABLE_NAME}}',
      ')',
      "SELECT * FROM semantic_checks WHERE _semantic_status != 'PASS'",
    ].join('\n');
  }

  /**
   * Returns the built-in dbt model template. The source table is referenced
   * through dbt's `ref()`.
   *
   * @returns {string} Template with `{{DBT_CONFIG}}`, `{{SELECT_COLUMNS}}`,
   *   `{{VALIDATION_CASES}}` and `{{TABLE_NAME}}` placeholders.
   */
  getDbtTemplate() {
    // The leading empty entry reproduces the template's initial newline.
    return [
      '',
      '{{ config({{DBT_CONFIG}}) }}',
      'WITH semantic_checks AS (',
      'SELECT',
      '{{SELECT_COLUMNS}},',
      'CASE',
      '{{VALIDATION_CASES}}',
      "ELSE 'PASS'",
      'END as _semantic_status,',
      'CURRENT_TIMESTAMP as _validation_timestamp',
      "FROM {{ ref('{{TABLE_NAME}}') }}",
      ')',
      "SELECT * FROM semantic_checks WHERE _semantic_status = 'PASS'",
    ].join('\n');
  }
}
//# sourceMappingURL=sql-generator.js.map