semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
590 lines • 25.9 kB
JavaScript
Object.defineProperty(exports, "__esModule", { value: true });
exports.PatternDriftDetector = void 0;
class PatternDriftDetector {
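// Anchored regexes for widely used value formats; detectPatternsFromSamples
// tags a column with one of these when more than 10% of its values match.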
COMMON_PATTERNS = {
email: /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/,
phone: /^[\+]?[1-9][\d\-\(\)\s\+\.]{7,15}$/,
uuid: /^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i,
date_iso: /^\d{4}-\d{2}-\d{2}$/,
date_us: /^\d{1,2}\/\d{1,2}\/\d{4}$/,
ssn: /^\d{3}-\d{2}-\d{4}$/,
credit_card: /^\d{4}[\s\-]?\d{4}[\s\-]?\d{4}[\s\-]?\d{4}$/,
ip_address: /^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$/,
url: /^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)$/,
numeric: /^-?\d+(\.\d+)?$/,
alphanumeric: /^[a-zA-Z0-9]+$/,
alpha: /^[a-zA-Z]+$/
};
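// Looser, business-level formats (names, addresses, currency, ...); matched
// with a lower 5% threshold and a reduced confidence weight because they are
// more ambiguous than the common formats above.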
SEMANTIC_PATTERNS = {
person_name: /^[A-Z][a-z]+ [A-Z][a-z]+$/,
company_name: /^[A-Z][a-zA-Z0-9\s&\.,\-\']+$/,
address: /^\d+\s+[A-Za-z0-9\s\.,\-\#]+$/,
postal_code: /^[A-Z0-9\-\s]{3,10}$/,
currency: /^\$?\d{1,3}(,\d{3})*(\.\d{2})?$/,
percentage: /^\d{1,3}(\.\d+)?%$/
};
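/**
 * Compare a historical column fingerprint against the current one and return
 * a drift alert when pattern similarity drops below `similarityThreshold`
 * (default 0.8), or null when the column is considered stable. Severity is
 * taken from the format-stability assessment (format_shift -> critical,
 * major_change -> high, minor_change -> medium, stable -> low) and escalated
 * one level when recognised semantic patterns were lost.
 */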
async detectPatternDrift(historicalFingerprint, currentFingerprint, similarityThreshold = 0.8) {
const analysis = await this.analyzePatternDrift(historicalFingerprint, currentFingerprint);
if (analysis.similarity_score < similarityThreshold) {
let severity = 'low';
switch (analysis.format_stability) {
case 'format_shift':
severity = 'critical';
break;
case 'major_change':
severity = 'high';
break;
case 'minor_change':
severity = 'medium';
break;
case 'stable':
severity = 'low';
break;
}
// Escalate severity based on semantic patterns
const semanticPatternLoss = analysis.lost_patterns.filter(p => p.semantic_type).length;
if (semanticPatternLoss > 0) {
severity = severity === 'low' ? 'medium' : severity === 'medium' ? 'high' : severity;
}
return {
type: 'format',
severity: severity,
metric_value: 1 - analysis.similarity_score,
threshold: 1 - similarityThreshold,
description: `Format drift detected: ${(100 * (1 - analysis.similarity_score)).toFixed(1)}% pattern change`
};
}
return null;
}
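/**
 * Build the full drift analysis: extract patterns from both fingerprints,
 * score their weighted similarity, list new/lost/changed patterns, analyse
 * sample-level drift, and classify overall format stability.
 */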
async analyzePatternDrift(historicalFingerprint, currentFingerprint) {
// Extract patterns from fingerprints
const historicalPatterns = this.extractPatterns(historicalFingerprint.regex_patterns, historicalFingerprint.sample_values);
const currentPatterns = this.extractPatterns(currentFingerprint.regex_patterns, currentFingerprint.sample_values);
// Calculate pattern similarity
const similarityScore = this.calculatePatternSimilarity(historicalPatterns, currentPatterns);
// Identify pattern changes
const newPatterns = this.findNewPatterns(historicalPatterns, currentPatterns);
const lostPatterns = this.findLostPatterns(historicalPatterns, currentPatterns);
const changedPatterns = this.findChangedPatterns(historicalPatterns, currentPatterns);
// Analyze sample-level changes
const sampleAnalysis = this.analyzeSampleDrift(historicalFingerprint.sample_values, currentFingerprint.sample_values);
// Determine format stability
const formatStability = this.assessFormatStability(similarityScore, newPatterns.length, lostPatterns.length, sampleAnalysis);
return {
similarity_score: similarityScore,
new_patterns: newPatterns,
lost_patterns: lostPatterns,
changed_patterns: changedPatterns,
format_stability: formatStability,
sample_analysis: sampleAnalysis
};
}
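/**
 * Merge explicitly supplied regex patterns with patterns detected from the
 * sample values, deduplicate them, and return them sorted by descending
 * match frequency.
 */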
extractPatterns(regexPatterns, sampleValues) {
const patterns = [];
// Add explicit regex patterns
for (const pattern of regexPatterns) {
const patternInfo = this.analyzePattern(pattern, sampleValues);
if (patternInfo) {
patterns.push(patternInfo);
}
}
// Detect common patterns from samples
const detectedPatterns = this.detectPatternsFromSamples(sampleValues);
patterns.push(...detectedPatterns);
// Remove duplicates and sort by frequency
const uniquePatterns = this.deduplicatePatterns(patterns);
return uniquePatterns.sort((a, b) => b.frequency - a.frequency);
}
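/**
 * Evaluate one candidate regex against the samples. Returns pattern metadata
 * (frequency, confidence, up to five example matches, optional semantic type)
 * when at least one sample matches; returns null for invalid regexes or
 * zero-match patterns.
 */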
analyzePattern(pattern, sampleValues) {
try {
const regex = new RegExp(pattern);
const matches = sampleValues.filter(value => regex.test(value));
const frequency = matches.length / sampleValues.length;
if (frequency > 0) {
return {
pattern: pattern,
frequency: frequency,
confidence: this.calculatePatternConfidence(pattern, matches),
examples: matches.slice(0, 5),
semantic_type: this.identifySemanticType(pattern)
};
}
}
catch (error) {
// Invalid regex pattern
}
return null;
}
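/**
 * Infer patterns directly from the sample values: common formats are kept when
 * they match more than 10% of samples, semantic formats when they match more
 * than 5% (with a 0.8 confidence discount), plus structural patterns generated
 * from the samples themselves.
 */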
detectPatternsFromSamples(sampleValues) {
const patterns = [];
// Test against common patterns
for (const [type, regex] of Object.entries(this.COMMON_PATTERNS)) {
const matches = sampleValues.filter(value => regex.test(value));
const frequency = matches.length / sampleValues.length;
if (frequency > 0.1) { // Only include patterns that match at least 10% of samples
patterns.push({
pattern: regex.source,
frequency: frequency,
confidence: frequency,
examples: matches.slice(0, 5),
semantic_type: type
});
}
}
// Test against semantic patterns
for (const [type, regex] of Object.entries(this.SEMANTIC_PATTERNS)) {
const matches = sampleValues.filter(value => regex.test(value));
const frequency = matches.length / sampleValues.length;
if (frequency > 0.05) { // Lower threshold for semantic patterns
patterns.push({
pattern: regex.source,
frequency: frequency,
confidence: frequency * 0.8, // Lower confidence for inferred patterns
examples: matches.slice(0, 5),
semantic_type: type
});
}
}
// Generate structural patterns
const structuralPatterns = this.generateStructuralPatterns(sampleValues);
patterns.push(...structuralPatterns);
return patterns;
}
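/**
 * Derive structural patterns: fixed-length patterns such as ^.{10}$ when the
 * samples use at most three distinct lengths (each covering more than 20% of
 * values), plus character-class patterns inferred from individual samples.
 */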
generateStructuralPatterns(sampleValues) {
const patterns = [];
// Length patterns
const lengths = sampleValues.map(v => v.length);
const uniqueLengths = [...new Set(lengths)];
if (uniqueLengths.length <= 3) {
for (const length of uniqueLengths) {
const matchCount = lengths.filter(l => l === length).length;
const frequency = matchCount / sampleValues.length;
if (frequency > 0.2) {
patterns.push({
pattern: `^.{${length}}$`,
frequency: frequency,
confidence: frequency,
examples: sampleValues.filter(v => v.length === length).slice(0, 3),
semantic_type: 'fixed_length'
});
}
}
}
// Character class patterns
const characterClassPatterns = this.detectCharacterClassPatterns(sampleValues);
patterns.push(...characterClassPatterns);
return patterns;
}
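/**
 * Generalise up to 20 samples into character-class regexes, keep those that
 * match more than 30% of all samples (with a 0.6 confidence discount, since
 * they are inferred structures), and deduplicate the result.
 */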
detectCharacterClassPatterns(sampleValues) {
const patterns = [];
// Sample a few values to generate patterns
const sampleSize = Math.min(sampleValues.length, 20);
const samples = sampleValues.slice(0, sampleSize);
for (const sample of samples) {
const generatedPattern = this.generateCharacterClassPattern(sample);
if (generatedPattern) {
const regex = new RegExp(generatedPattern);
const matches = sampleValues.filter(v => regex.test(v));
const frequency = matches.length / sampleValues.length;
if (frequency > 0.3) {
patterns.push({
pattern: generatedPattern,
frequency: frequency,
confidence: frequency * 0.6,
examples: matches.slice(0, 3),
semantic_type: 'inferred_structure'
});
}
}
}
return this.deduplicatePatterns(patterns);
}
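/**
 * Turn a single value into an anchored character-class regex, collapsing runs
 * of the same class: exact counts up to 3, `+` beyond that. For example,
 * "John-123" becomes ^[A-Z][a-z]{3}.\d{3}$ (the dash falls through to `.`
 * because it is not in the recognised punctuation set).
 */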
generateCharacterClassPattern(sample) {
if (sample.length === 0)
return null;
let pattern = '^';
let i = 0;
while (i < sample.length) {
const char = sample[i];
let charClass = '';
if (/\d/.test(char)) {
charClass = '\\d';
}
else if (/[a-z]/.test(char)) {
charClass = '[a-z]';
}
else if (/[A-Z]/.test(char)) {
charClass = '[A-Z]';
}
else if (/\s/.test(char)) {
charClass = '\\s';
}
else if (/[!@#$%^&*(),.?":{}|<>]/.test(char)) {
charClass = `\\${char}`;
}
else {
charClass = `.`;
}
// Count consecutive characters of the same class
let count = 1;
while (i + count < sample.length && this.matchesCharacterClass(sample[i + count], charClass)) {
count++;
}
if (count === 1) {
pattern += charClass;
}
else if (count <= 3) {
pattern += `${charClass}{${count}}`;
}
else {
pattern += `${charClass}+`;
}
i += count;
}
pattern += '$';
return pattern;
}
matchesCharacterClass(char, charClass) {
switch (charClass) {
case '\\d': return /\d/.test(char);
case '[a-z]': return /[a-z]/.test(char);
case '[A-Z]': return /[A-Z]/.test(char);
case '\\s': return /\s/.test(char);
default: return false;
}
}
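/**
 * Frequency-weighted similarity in [0, 1]: each historical pattern contributes
 * 1 - |histFreq - currFreq|, weighted by the larger of the two frequencies;
 * patterns that only appear in the current set add weight but no similarity,
 * which penalises brand-new formats.
 */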
calculatePatternSimilarity(historicalPatterns, currentPatterns) {
if (historicalPatterns.length === 0 && currentPatterns.length === 0) {
return 1.0;
}
if (historicalPatterns.length === 0 || currentPatterns.length === 0) {
return 0.0;
}
// Create frequency maps
const historicalMap = new Map(historicalPatterns.map(p => [p.pattern, p.frequency]));
const currentMap = new Map(currentPatterns.map(p => [p.pattern, p.frequency]));
// Calculate weighted similarity
let totalSimilarity = 0;
let totalWeight = 0;
// Check overlap and frequency changes
for (const [pattern, historicalFreq] of historicalMap) {
const currentFreq = currentMap.get(pattern) || 0;
const weight = Math.max(historicalFreq, currentFreq);
const similarity = 1 - Math.abs(historicalFreq - currentFreq);
totalSimilarity += similarity * weight;
totalWeight += weight;
}
// Penalize for completely new patterns
for (const [pattern, currentFreq] of currentMap) {
if (!historicalMap.has(pattern)) {
totalWeight += currentFreq;
// New patterns get 0 similarity
}
}
return totalWeight > 0 ? totalSimilarity / totalWeight : 0;
}
findNewPatterns(historicalPatterns, currentPatterns) {
const historicalPatternSet = new Set(historicalPatterns.map(p => p.pattern));
return currentPatterns.filter(p => !historicalPatternSet.has(p.pattern));
}
findLostPatterns(historicalPatterns, currentPatterns) {
const currentPatternSet = new Set(currentPatterns.map(p => p.pattern));
return historicalPatterns.filter(p => !currentPatternSet.has(p.pattern));
}
findChangedPatterns(historicalPatterns, currentPatterns) {
const changes = [];
const currentMap = new Map(currentPatterns.map(p => [p.pattern, p]));
for (const historical of historicalPatterns) {
const current = currentMap.get(historical.pattern);
if (current) {
const frequencyChange = Math.abs(historical.frequency - current.frequency);
const confidenceChange = Math.abs(historical.confidence - current.confidence);
if (frequencyChange > 0.1 || confidenceChange > 0.1) {
changes.push({
old_pattern: historical.pattern,
new_pattern: current.pattern,
similarity: 1 - Math.max(frequencyChange, confidenceChange),
impact_severity: this.assessChangeImpact(frequencyChange, confidenceChange),
transformation_type: this.identifyTransformationType(historical, current)
});
}
}
}
return changes;
}
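/**
 * Sample-level drift: format consistency of the current samples, relative
 * change in average value length, character-set additions/removals, and
 * structural changes (delimiters, casing, prefixes/suffixes).
 */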
analyzeSampleDrift(historicalSamples, currentSamples) {
// Format consistency analysis
const formatConsistency = this.calculateFormatConsistency(currentSamples);
// Length distribution analysis
const lengthDistributionChange = this.calculateLengthDistributionChange(historicalSamples, currentSamples);
// Character set analysis
const characterSetChanges = this.detectCharacterSetChanges(historicalSamples, currentSamples);
// Structural changes
const structuralChanges = this.detectStructuralChanges(historicalSamples, currentSamples);
return {
format_consistency: formatConsistency,
length_distribution_change: lengthDistributionChange,
character_set_changes: characterSetChanges,
structural_changes: structuralChanges
};
}
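/**
 * Consistency = 1 - (distinct inferred structures / sample count): a column
 * where every value shares one structure scores close to 1, a column where
 * every value has its own structure scores 0.
 */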
calculateFormatConsistency(samples) {
if (samples.length === 0)
return 1.0;
const patterns = samples.map(s => this.generateCharacterClassPattern(s));
const uniquePatterns = new Set(patterns.filter(p => p !== null));
return 1 - (uniquePatterns.size / samples.length);
}
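/**
 * Relative change in mean value length, normalised by the larger of the two
 * averages (floored at 1); empty inputs yield 0.
 */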
calculateLengthDistributionChange(historicalSamples, currentSamples) {
// Guard against empty sample sets, which would otherwise produce NaN (0 / 0).
if (historicalSamples.length === 0 || currentSamples.length === 0) {
return 0;
}
const historicalLengths = historicalSamples.map(s => s.length);
const currentLengths = currentSamples.map(s => s.length);
const historicalAvg = historicalLengths.reduce((a, b) => a + b, 0) / historicalLengths.length;
const currentAvg = currentLengths.reduce((a, b) => a + b, 0) / currentLengths.length;
return Math.abs(historicalAvg - currentAvg) / Math.max(historicalAvg, currentAvg, 1);
}
detectCharacterSetChanges(historicalSamples, currentSamples) {
const historicalChars = new Set(historicalSamples.join('').split(''));
const currentChars = new Set(currentSamples.join('').split(''));
const changes = [];
// Check for new character types
const newChars = [...currentChars].filter(c => !historicalChars.has(c));
if (newChars.length > 0) {
changes.push(`Added characters: ${newChars.slice(0, 10).join(', ')}`);
}
// Check for lost character types
const lostChars = [...historicalChars].filter(c => !currentChars.has(c));
if (lostChars.length > 0) {
changes.push(`Removed characters: ${lostChars.slice(0, 10).join(', ')}`);
}
return changes;
}
detectStructuralChanges(historicalSamples, currentSamples) {
const changes = [];
// Detect delimiter changes
const delimiterChange = this.detectDelimiterChanges(historicalSamples, currentSamples);
if (delimiterChange)
changes.push(delimiterChange);
// Detect casing changes
const casingChange = this.detectCasingChanges(historicalSamples, currentSamples);
if (casingChange)
changes.push(casingChange);
// Detect prefix/suffix changes
const affixChanges = this.detectAffixChanges(historicalSamples, currentSamples);
changes.push(...affixChanges);
return changes;
}
detectDelimiterChanges(historicalSamples, currentSamples) {
const delimiters = ['-', '_', '.', '/', ':', ' ', ',', ';'];
const historicalDelims = this.countDelimiters(historicalSamples, delimiters);
const currentDelims = this.countDelimiters(currentSamples, delimiters);
const changes = [];
for (const delim of delimiters) {
const histFreq = historicalDelims[delim] || 0;
const currFreq = currentDelims[delim] || 0;
const change = Math.abs(histFreq - currFreq);
if (change > 0.2) {
changes.push({
before: `${delim}: ${(histFreq * 100).toFixed(1)}%`,
after: `${delim}: ${(currFreq * 100).toFixed(1)}%`
});
}
}
if (changes.length > 0) {
return {
type: 'delimiter_change',
description: `Delimiter usage patterns changed`,
impact: Math.max(...changes.map(c => Math.abs(parseFloat(c.before.split(': ')[1]) - parseFloat(c.after.split(': ')[1])))) / 100,
examples: changes
};
}
return null;
}
detectCasingChanges(historicalSamples, currentSamples) {
const historicalCasing = this.analyzeCasing(historicalSamples);
const currentCasing = this.analyzeCasing(currentSamples);
const changes = [];
for (const [type, histFreq] of Object.entries(historicalCasing)) {
const currFreq = currentCasing[type] || 0;
const change = Math.abs(histFreq - currFreq);
if (change > 0.1) {
changes.push({
before: `${type}: ${(histFreq * 100).toFixed(1)}%`,
after: `${type}: ${(currFreq * 100).toFixed(1)}%`
});
}
}
if (changes.length > 0) {
return {
type: 'casing_change',
description: `Text casing patterns changed`,
impact: Math.max(...changes.map(c => Math.abs(parseFloat(c.before.split(': ')[1]) - parseFloat(c.after.split(': ')[1])))) / 100,
examples: changes
};
}
return null;
}
detectAffixChanges(historicalSamples, currentSamples) {
const changes = [];
// Detect common prefixes/suffixes
const historicalPrefixes = this.extractCommonAffixes(historicalSamples, 'prefix');
const currentPrefixes = this.extractCommonAffixes(currentSamples, 'prefix');
const historicalSuffixes = this.extractCommonAffixes(historicalSamples, 'suffix');
const currentSuffixes = this.extractCommonAffixes(currentSamples, 'suffix');
// Compare prefixes
const prefixChange = this.compareAffixes(historicalPrefixes, currentPrefixes, 'prefix');
if (prefixChange)
changes.push(prefixChange);
// Compare suffixes
const suffixChange = this.compareAffixes(historicalSuffixes, currentSuffixes, 'suffix');
if (suffixChange)
changes.push(suffixChange);
return changes;
}
// Helper methods
deduplicatePatterns(patterns) {
const patternMap = new Map();
for (const pattern of patterns) {
const existing = patternMap.get(pattern.pattern);
if (!existing || pattern.frequency > existing.frequency) {
patternMap.set(pattern.pattern, pattern);
}
}
return Array.from(patternMap.values());
}
calculatePatternConfidence(pattern, matches) {
// Simple confidence based on match consistency and pattern complexity
const baseConfidence = matches.length > 0 ? 0.5 : 0;
const complexityBonus = Math.min(pattern.length / 100, 0.3);
const consistencyBonus = matches.length > 10 ? 0.2 : 0;
return Math.min(1.0, baseConfidence + complexityBonus + consistencyBonus);
}
identifySemanticType(pattern) {
for (const [type, regex] of Object.entries({ ...this.COMMON_PATTERNS, ...this.SEMANTIC_PATTERNS })) {
if (pattern === regex.source) {
return type;
}
}
return undefined;
}
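/**
 * Classify overall stability from three signals: similarity below 0.5 / 0.7 /
 * 0.9, or pattern churn / format inconsistency above 0.5 / 0.3 / 0.1, yield
 * format_shift, major_change or minor_change respectively; otherwise stable.
 */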
assessFormatStability(similarityScore, newPatternsCount, lostPatternsCount, sampleAnalysis) {
const changeScore = (newPatternsCount + lostPatternsCount) / 10;
const formatScore = 1 - sampleAnalysis.format_consistency;
if (similarityScore < 0.5 || changeScore > 0.5 || formatScore > 0.5) {
return 'format_shift';
}
else if (similarityScore < 0.7 || changeScore > 0.3 || formatScore > 0.3) {
return 'major_change';
}
else if (similarityScore < 0.9 || changeScore > 0.1 || formatScore > 0.1) {
return 'minor_change';
}
else {
return 'stable';
}
}
assessChangeImpact(frequencyChange, confidenceChange) {
const maxChange = Math.max(frequencyChange, confidenceChange);
if (maxChange > 0.3)
return 'high';
if (maxChange > 0.15)
return 'medium';
return 'low';
}
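/**
 * Heuristic labelling of a pattern change: a frequency ratio above 2 or below
 * 0.5 is a scale_change, differing semantic types are semantic_drift, a
 * confidence swing above 0.2 is treated as an encoding_change, and anything
 * else is format_evolution.
 */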
identifyTransformationType(historical, current) {
const frequencyRatio = current.frequency / historical.frequency;
if (frequencyRatio > 2 || frequencyRatio < 0.5) {
return 'scale_change';
}
if (historical.semantic_type && current.semantic_type &&
historical.semantic_type !== current.semantic_type) {
return 'semantic_drift';
}
if (Math.abs(historical.confidence - current.confidence) > 0.2) {
return 'encoding_change';
}
return 'format_evolution';
}
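/**
 * Per-character frequency of each delimiter across all samples, used to spot
 * shifts such as a column moving from dash-separated to slash-separated values.
 */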
countDelimiters(samples, delimiters) {
const counts = {};
const totalChars = samples.join('').length;
for (const delim of delimiters) {
let totalCount = 0;
for (const sample of samples) {
totalCount += (sample.match(new RegExp(delim === '.' ? '\\.' : delim, 'g')) || []).length;
}
// Normalise to a per-character frequency so the 0.2 change threshold in
// detectDelimiterChanges is comparable across sample sizes.
counts[delim] = totalChars > 0 ? totalCount / totalChars : 0;
}
return counts;
}
analyzeCasing(samples) {
let uppercase = 0;
let lowercase = 0;
let mixed = 0;
let titlecase = 0;
for (const sample of samples) {
if (sample === sample.toUpperCase()) {
uppercase++;
}
else if (sample === sample.toLowerCase()) {
lowercase++;
}
else if (sample === sample.charAt(0).toUpperCase() + sample.slice(1).toLowerCase()) {
titlecase++;
}
else {
mixed++;
}
}
const total = samples.length || 1; // avoid dividing by zero for empty sample sets
return {
uppercase: uppercase / total,
lowercase: lowercase / total,
titlecase: titlecase / total,
mixed: mixed / total
};
}
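/**
 * Collect prefixes or suffixes up to five characters long and keep only those
 * shared by at least 20% of the samples.
 */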
extractCommonAffixes(samples, type) {
const affixes = new Map();
const maxLength = 5;
for (const sample of samples) {
for (let len = 1; len <= Math.min(maxLength, sample.length - 1); len++) {
const affix = type === 'prefix'
? sample.substring(0, len)
: sample.substring(sample.length - len);
affixes.set(affix, (affixes.get(affix) || 0) + 1);
}
}
// Filter to only common affixes (appearing in at least 20% of samples)
const threshold = samples.length * 0.2;
for (const [affix, count] of affixes) {
if (count < threshold) {
affixes.delete(affix);
}
}
return affixes;
}
compareAffixes(historical, current, type) {
const changes = [];
for (const [affix, histCount] of historical) {
const currCount = current.get(affix) || 0;
const change = Math.abs(histCount - currCount);
if (change > historical.size * 0.1) {
changes.push({
before: `${affix}: ${histCount}`,
after: `${affix}: ${currCount}`
});
}
}
if (changes.length > 0) {
return {
type: 'prefix_suffix_change',
description: `${type} patterns changed`,
impact: changes.length / Math.max(historical.size, current.size),
examples: changes.slice(0, 5)
};
}
return null;
}
}
exports.PatternDriftDetector = PatternDriftDetector;
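// Minimal usage sketch (illustrative only): the detector reads just
// `regex_patterns` and `sample_values` from each fingerprint, so a plain
// object with those two fields is enough; the require path is assumed.
//
//   const { PatternDriftDetector } = require('./pattern-drift');
//   const detector = new PatternDriftDetector();
//   const historical = { regex_patterns: [], sample_values: ['a@x.com', 'b@y.org', 'c@z.io'] };
//   const current = { regex_patterns: [], sample_values: ['+1-555-0100', '+1-555-0101'] };
//   detector.detectPatternDrift(historical, current).then((alert) => {
//     if (alert) {
//       console.log(alert.severity, alert.description);
//     }
//   });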
//# sourceMappingURL=pattern-drift.js.map