semantic-ds-toolkit
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.PRAnalyzer = void 0;
const path_1 = require("path");
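// Analyzes pull request data (changed files plus their unified diffs) for
// schema, data, and semantic-mapping changes that may affect the semantic
// layer, and derives a risk level plus suggested follow-up actions.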
class PRAnalyzer {
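// Path heuristics for files likely to contain schema definitions
// (SQL DDL, ORM models, migrations).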
SCHEMA_FILE_PATTERNS = [
/\.sql$/i,
/schema.*\.py$/i,
/models\/.*\.py$/i,
/migrations\/.*\.(sql|py)$/i,
/\.ddl$/i,
/create_table.*\.(sql|py)$/i
];
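// Path heuristics for data files and data-producing scripts.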
DATA_FILE_PATTERNS = [
/\.csv$/i,
/\.json$/i,
/\.parquet$/i,
/\.avro$/i,
/\.xlsx?$/i,
/data\/.*\.(py|sql)$/i
];
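// Path heuristics for semantic mapping definition files.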
SEMANTIC_FILE_PATTERNS = [
/semantics\/.*\.ya?ml$/i,
/\.semantic\.ya?ml$/i,
/semantic-mappings\.ya?ml$/i
];
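// Entry point: collects schema, semantic-mapping, and data file changes from
// the PR's files and diff, then derives a risk level, suggested actions, and
// the processing time in milliseconds.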
async analyzePR(prData) {
const startTime = Date.now();
const schemaChanges = this.analyzeSchemaChanges(prData.files, prData.diff);
const semanticFileChanges = this.analyzeSemanticFileChanges(prData.files);
const dataFileChanges = this.analyzeDataFileChanges(prData.files);
const hasSemanticChanges = schemaChanges.length > 0 ||
semanticFileChanges.length > 0 ||
dataFileChanges.length > 0;
const riskLevel = this.calculateRiskLevel(schemaChanges, dataFileChanges);
const suggestedActions = this.generateSuggestedActions(schemaChanges, semanticFileChanges, dataFileChanges);
return {
prNumber: prData.pr.number,
hasSemanticChanges,
schemaChanges,
semanticFileChanges,
dataFileChanges,
processingTime: Date.now() - startTime,
riskLevel,
suggestedActions
};
}
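// Scans schema-related files for column-level changes: SQL DDL in any matched
// file, plus ORM model definitions in Python files.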
analyzeSchemaChanges(files, diff) {
const changes = [];
const schemaFiles = files.filter(file => this.SCHEMA_FILE_PATTERNS.some(pattern => pattern.test(file.filename)));
for (const file of schemaFiles) {
if (!file.patch)
continue;
// SQL DDL changes
const sqlChanges = this.extractSQLSchemaChanges(file.patch, file.filename);
changes.push(...sqlChanges);
// Python model changes (e.g., SQLAlchemy, Django)
if (file.filename.endsWith('.py')) {
const pythonChanges = this.extractPythonSchemaChanges(file.patch, file.filename);
changes.push(...pythonChanges);
}
}
return changes;
}
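// Walks the unified diff line by line and extracts column additions, removals,
// and type changes from SQL DDL. For example, a patch line such as
//   +ALTER TABLE users ADD COLUMN email VARCHAR(255);
// yields a 'column_added' change for `email` on table `users`.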
extractSQLSchemaChanges(patch, filename) {
const changes = [];
const lines = patch.split('\n');
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
// Column additions (ignoring ADD CONSTRAINT/PRIMARY/FOREIGN/UNIQUE/INDEX clauses)
if (line.startsWith('+') && /\bADD\s+\w+/i.test(line)) {
const match = line.match(/\bADD\s+(?:COLUMN\s+)?(?!CONSTRAINT\b|PRIMARY\b|FOREIGN\b|UNIQUE\b|INDEX\b)(\w+)\s+(\w+)/i);
if (match) {
changes.push({
type: 'column_added',
table: this.extractTableName(filename, lines, i),
column: match[1],
after: { type: match[2] },
confidence: 0.9
});
}
}
// Column removals (ignoring DROP TABLE/INDEX/CONSTRAINT statements)
if (line.startsWith('-') && /\bDROP\s+\w+/i.test(line)) {
const match = line.match(/\bDROP\s+(?:COLUMN\s+)?(?!TABLE\b|INDEX\b|CONSTRAINT\b)(\w+)/i);
if (match) {
changes.push({
type: 'column_removed',
table: this.extractTableName(filename, lines, i),
column: match[1],
confidence: 0.9
});
}
}
// Type changes (ALTER COLUMN ... TYPE ..., matched case-insensitively like the other checks)
if (line.startsWith('+') && /ALTER\s+COLUMN/i.test(line)) {
const match = line.match(/ALTER\s+COLUMN\s+(\w+)\s+TYPE\s+(\w+)/i);
if (match) {
changes.push({
type: 'type_changed',
table: this.extractTableName(filename, lines, i),
column: match[1],
after: { type: match[2] },
confidence: 0.85
});
}
}
}
return changes;
}
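// Extracts column additions and removals from Python ORM diffs by matching
// SQLAlchemy-style attribute assignments. For example, a patch line such as
//   +    email = Column(String(255), nullable=True)
// yields a 'column_added' change for `email` on the enclosing model's table.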
extractPythonSchemaChanges(patch, filename) {
const changes = [];
const lines = patch.split('\n');
for (let i = 0; i < lines.length; i++) {
const line = lines[i].trim();
// SQLAlchemy column additions
if (line.startsWith('+') && /Column\(/i.test(line)) {
// Greedy capture keeps nested parentheses (e.g. Column(String(50))) intact
const match = line.match(/(\w+)\s*=\s*Column\((.*)\)/);
if (match) {
changes.push({
type: 'column_added',
table: this.extractPythonTableName(filename, lines, i),
column: match[1],
after: { definition: match[2] },
confidence: 0.8
});
}
}
// Column removals
if (line.startsWith('-') && /Column\(/i.test(line)) {
const match = line.match(/(\w+)\s*=\s*Column\(/);
if (match) {
changes.push({
type: 'column_removed',
table: this.extractPythonTableName(filename, lines, i),
column: match[1],
confidence: 0.8
});
}
}
}
return changes;
}
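// Classifies each semantic mapping file as created, updated, or deleted and
// attaches counts of added, removed, and modified mappings from its patch.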
analyzeSemanticFileChanges(files) {
const changes = [];
const semanticFiles = files.filter(file => this.SEMANTIC_FILE_PATTERNS.some(pattern => pattern.test(file.filename)));
for (const file of semanticFiles) {
let action;
switch (file.status) {
case 'added':
action = 'created';
break;
case 'removed':
action = 'deleted';
break;
default:
action = 'updated';
}
const mappingCounts = this.analyzeMappingChanges(file.patch || '');
changes.push({
file: file.filename,
action,
...mappingCounts
});
}
return changes;
}
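// Counts added and removed mapping-related keys in a YAML diff; paired
// additions and removals are folded into a modification count below.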
analyzeMappingChanges(patch) {
const lines = patch.split('\n');
let mappings_added = 0;
let mappings_removed = 0;
let mappings_modified = 0;
for (const line of lines) {
if (line.startsWith('+') && /semantic_type:|anchor_id:|column:/i.test(line)) {
mappings_added++;
}
else if (line.startsWith('-') && /semantic_type:|anchor_id:|column:/i.test(line)) {
mappings_removed++;
}
}
// Heuristic: paired additions and removals most likely represent the same
// mapping being edited, so count each such pair as a modification.
mappings_modified = Math.min(mappings_added, mappings_removed);
mappings_added -= mappings_modified;
mappings_removed -= mappings_modified;
return { mappings_added, mappings_removed, mappings_modified };
}
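// Data files need no diff analysis; a path match alone marks them as changed.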
analyzeDataFileChanges(files) {
return files.filter(file => this.DATA_FILE_PATTERNS.some(pattern => pattern.test(file.filename)));
}
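// Resolves the table a DDL change belongs to by scanning backwards through the
// patch for a CREATE TABLE or ALTER TABLE statement, falling back to a
// sanitized filename when none is found.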
extractTableName(filename, lines, currentIndex) {
// Look backwards for CREATE TABLE or table references
for (let i = currentIndex; i >= 0; i--) {
const line = lines[i];
const match = line.match(/CREATE TABLE\s+(?:IF NOT EXISTS\s+)?(\w+)/i) ||
line.match(/ALTER TABLE\s+(\w+)/i);
if (match) {
return match[1];
}
}
// Fallback to filename
const parsed = (0, path_1.parse)(filename);
return parsed.name.replace(/[^a-zA-Z0-9_]/g, '_');
}
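// Resolves the owning table for a Python model change by scanning backwards
// for a class definition or __tablename__ assignment, with the same filename
// fallback as above.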
extractPythonTableName(filename, lines, currentIndex) {
// Look for class definition or __tablename__
for (let i = currentIndex; i >= 0; i--) {
const line = lines[i];
const classMatch = line.match(/class\s+(\w+)/);
if (classMatch) {
return classMatch[1].toLowerCase();
}
const tableMatch = line.match(/__tablename__\s*=\s*['"](\w+)['"]/);
if (tableMatch) {
return tableMatch[1];
}
}
const parsed = (0, path_1.parse)(filename);
return parsed.name.replace(/[^a-zA-Z0-9_]/g, '_');
}
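// Risk heuristic: any column removal, type change, or more than five touched
// data files is high risk; more than three total changes is medium; anything
// else is low.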
calculateRiskLevel(schemaChanges, dataFileChanges) {
const totalChanges = schemaChanges.length + dataFileChanges.length;
// High risk indicators
const hasColumnRemovals = schemaChanges.some(change => change.type === 'column_removed');
const hasTypeChanges = schemaChanges.some(change => change.type === 'type_changed');
const hasManyDataFiles = dataFileChanges.length > 5;
if (hasColumnRemovals || hasTypeChanges || hasManyDataFiles) {
return 'high';
}
if (totalChanges > 3) {
return 'medium';
}
return 'low';
}
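// Translates the detected changes into a human-readable checklist of
// follow-up actions for reviewers.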
generateSuggestedActions(schemaChanges, semanticFileChanges, dataFileChanges) {
const actions = [];
if (schemaChanges.length > 0) {
actions.push('Review schema changes for semantic impact');
if (schemaChanges.some(c => c.type === 'column_added')) {
actions.push('Add semantic mappings for new columns');
}
if (schemaChanges.some(c => c.type === 'column_removed')) {
actions.push('Remove corresponding semantic mappings');
}
if (schemaChanges.some(c => c.type === 'type_changed')) {
actions.push('Validate existing mappings still apply');
}
}
if (dataFileChanges.length > 0) {
actions.push('Update column anchors for modified data files');
}
if (semanticFileChanges.length === 0 && (schemaChanges.length > 0 || dataFileChanges.length > 0)) {
actions.push('Consider adding semantic mappings');
}
if (actions.length === 0) {
actions.push('No semantic actions required');
}
return actions;
}
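// Convenience predicates for classifying a single path against the pattern
// lists above.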
isSemanticFile(filename) {
return this.SEMANTIC_FILE_PATTERNS.some(pattern => pattern.test(filename));
}
isSchemaFile(filename) {
return this.SCHEMA_FILE_PATTERNS.some(pattern => pattern.test(filename));
}
isDataFile(filename) {
return this.DATA_FILE_PATTERNS.some(pattern => pattern.test(filename));
}
}
exports.PRAnalyzer = PRAnalyzer;
//# sourceMappingURL=pr-analyzer.js.map
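// Example usage (a minimal sketch: the `prData` shape below is inferred from
// how analyzePR reads it in this file, and the require path and field values
// are illustrative assumptions, not the package's documented API):
//
//   const { PRAnalyzer } = require('semantic-ds-toolkit/dist/pr-analyzer');
//
//   async function main() {
//     const analyzer = new PRAnalyzer();
//     const result = await analyzer.analyzePR({
//       pr: { number: 42 },
//       diff: '',
//       files: [{
//         filename: 'migrations/001_add_email.sql',
//         status: 'modified',
//         patch: '+ALTER TABLE users ADD COLUMN email VARCHAR(255);'
//       }]
//     });
//     // result.schemaChanges -> [{ type: 'column_added', table: 'users', column: 'email', ... }]
//     console.log(result.riskLevel, result.suggestedActions);
//   }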