semantic-ds-toolkit
Version:
Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
243 lines • 12.7 kB
JavaScript
;
/**
* Semantic Data Science Operators
*
* This module provides comprehensive operators for semantic data processing:
* - Unit conversion with multi-category support (currency, temperature, distance, time, mass)
* - Time alignment with timezone conversion and granularity adjustment
* - Semantic join operations for intelligent data joining
*
* Key Features:
* - Multi-unit conversion with FX rate caching
* - Temporal alignment with statistical preservation
* - Timezone-aware data processing
* - Performance optimized (<50ms per operation)
* - Batch processing support
*
* Performance Targets:
* - <50ms per unit conversion
* - <100ms for timezone batch conversions (1,000 timestamps)
* - <2s for 10K timestamp grain adjustments
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.PERFORMANCE_TARGETS = exports.UNIT_CONVERSION_VERSION = exports.SEMANTIC_JOIN_VERSION = exports.GrainAdjuster = exports.TimezoneHandler = exports.TimeAligner = exports.OfflineMode = exports.FXCache = exports.UnitConverter = exports.SemanticJoinMetrics = exports.SemanticJoinFactory = exports.joinAdapterRegistry = exports.getSupportedJoinTypes = exports.registerJoinAdapter = exports.getJoinAdapter = exports.DataFrameJoinAdapterRegistry = exports.PolarsJoinAdapter = exports.PandasJoinAdapter = exports.JoinConfidenceCalculator = exports.JoinCostModel = exports.SemanticJoinPlanner = exports.SemanticJoinOperator = void 0;
// Core semantic join operator
var semantic_join_1 = require("./semantic-join");
Object.defineProperty(exports, "SemanticJoinOperator", { enumerable: true, get: function () { return semantic_join_1.SemanticJoinOperator; } });
const semantic_join_2 = require("./semantic-join");
const dataframe_join_adapters_1 = require("./dataframe-join-adapters");
// Query optimization and planning
var join_planner_1 = require("./join-planner");
Object.defineProperty(exports, "SemanticJoinPlanner", { enumerable: true, get: function () { return join_planner_1.SemanticJoinPlanner; } });
Object.defineProperty(exports, "JoinCostModel", { enumerable: true, get: function () { return join_planner_1.JoinCostModel; } });
// Confidence scoring system
var join_confidence_1 = require("./join-confidence");
Object.defineProperty(exports, "JoinConfidenceCalculator", { enumerable: true, get: function () { return join_confidence_1.JoinConfidenceCalculator; } });
// DataFrame adapter support
var dataframe_join_adapters_2 = require("./dataframe-join-adapters");
Object.defineProperty(exports, "PandasJoinAdapter", { enumerable: true, get: function () { return dataframe_join_adapters_2.PandasJoinAdapter; } });
Object.defineProperty(exports, "PolarsJoinAdapter", { enumerable: true, get: function () { return dataframe_join_adapters_2.PolarsJoinAdapter; } });
Object.defineProperty(exports, "DataFrameJoinAdapterRegistry", { enumerable: true, get: function () { return dataframe_join_adapters_2.DataFrameJoinAdapterRegistry; } });
Object.defineProperty(exports, "getJoinAdapter", { enumerable: true, get: function () { return dataframe_join_adapters_2.getJoinAdapter; } });
Object.defineProperty(exports, "registerJoinAdapter", { enumerable: true, get: function () { return dataframe_join_adapters_2.registerJoinAdapter; } });
Object.defineProperty(exports, "getSupportedJoinTypes", { enumerable: true, get: function () { return dataframe_join_adapters_2.getSupportedJoinTypes; } });
Object.defineProperty(exports, "joinAdapterRegistry", { enumerable: true, get: function () { return dataframe_join_adapters_2.joinAdapterRegistry; } });
// Utility functions for creating and configuring semantic joins
/**
 * Static helpers for building semantic join operators and for deriving
 * join option presets, either from a named scenario or from the data itself.
 */
class SemanticJoinFactory {
    /**
     * Creates a semantic join operator, optionally pre-configured for speed.
     *
     * @param {*} cidRegistry - Passed unchanged to the SemanticJoinOperator constructor.
     * @param {{enableHighPerformance?: boolean, enableHighAccuracy?: boolean}} [options]
     *   `enableHighPerformance` registers a cheap lower-case/trim normalizer under
     *   the name 'default'. `enableHighAccuracy` is accepted but has no effect yet.
     * @returns {*} The newly constructed join operator.
     */
    static createOptimized(cidRegistry, options = {}) {
        const joinOperator = new semantic_join_2.SemanticJoinOperator(cidRegistry);
        if (options.enableHighPerformance) {
            // Only null, undefined and NaN count as "missing". A plain
            // `value || ''` would also erase legitimate keys such as 0 or
            // false, making them indistinguishable from missing values.
            joinOperator.addNormalizer('default', (value) => {
                const isMissing = value == null || Number.isNaN(value);
                return (isMissing ? '' : String(value)).toLowerCase().trim();
            });
        }
        // NOTE: options.enableHighAccuracy is a placeholder; no additional
        // normalizers are registered for it at the moment.
        return joinOperator;
    }

    /**
     * Returns a fresh join-options object tuned for a named scenario.
     *
     * @param {string} scenario - 'customer_matching', 'product_catalog' or
     *   'general'. Any other value falls back to the 'general' preset.
     * @returns {object} A new options object (safe for the caller to mutate).
     */
    static getDefaultOptions(scenario) {
        switch (scenario) {
            case 'customer_matching':
                return {
                    confidenceThreshold: 0.8,
                    enableFuzzyMatching: true,
                    fuzzyThreshold: 0.7,
                    autoSelectNormalizers: true,
                    cacheNormalizedValues: true,
                    batchSize: 25000,
                };
            case 'product_catalog':
                return {
                    confidenceThreshold: 0.9,
                    enableFuzzyMatching: false, // Exact matches preferred for products
                    autoSelectNormalizers: true,
                    cacheNormalizedValues: true,
                    batchSize: 50000,
                };
            case 'general':
            default:
                return {
                    confidenceThreshold: 0.7,
                    enableFuzzyMatching: true,
                    fuzzyThreshold: 0.8,
                    autoSelectNormalizers: true,
                    cacheNormalizedValues: true,
                    batchSize: 10000,
                };
        }
    }

    /**
     * Inspects two datasets and suggests a join configuration.
     *
     * The suggestion is based on the combined row count (batch size), the
     * uniqueness ratio of each paired join column (fuzzy matching on/off) and
     * the performance hints reported by the adapters.
     *
     * @param {*} left - Left dataset; must be recognised by `getJoinAdapter`.
     * @param {*} right - Right dataset; must be recognised by `getJoinAdapter`.
     * @param {string[]} leftColumns - Join columns of the left dataset.
     * @param {string[]} rightColumns - Join columns of the right dataset, paired
     *   by index with `leftColumns`.
     * @param {*} cidRegistry - Kept for interface compatibility; the analysis
     *   itself does not use it.
     * @returns {Promise<{suggestedOptions: object, confidence: number, reasoning: string[], warnings: string[]}>}
     *   `confidence` is always within [0, 1].
     */
    static async analyzeAndSuggestJoinConfig(left, right, leftColumns, rightColumns, cidRegistry) {
        const { getJoinAdapter } = dataframe_join_adapters_1;
        const leftAdapter = getJoinAdapter(left);
        const rightAdapter = getJoinAdapter(right);
        const leftDf = leftAdapter ? leftAdapter.toDataFrameLike(left) : null;
        const rightDf = rightAdapter ? rightAdapter.toDataFrameLike(right) : null;
        if (!leftDf || !rightDf) {
            return {
                suggestedOptions: this.getDefaultOptions('general'),
                confidence: 0.3,
                reasoning: ['Unable to analyze data structure'],
                warnings: ['Using default configuration due to analysis failure'],
            };
        }

        const reasoning = [];
        const warnings = [];
        let confidence = 0.8;
        const totalRows = leftDf.shape[0] + rightDf.shape[0];
        const suggestedOptions = {
            confidenceThreshold: 0.7,
            enableFuzzyMatching: true,
            fuzzyThreshold: 0.8,
            autoSelectNormalizers: true,
            cacheNormalizedValues: true,
        };

        // Adjust batch size based on data size
        if (totalRows < 10000) {
            // Two empty inputs would otherwise yield a batch size of 0.
            suggestedOptions.batchSize = Math.max(totalRows, 1);
            reasoning.push('Small dataset - using single batch processing');
        }
        else if (totalRows < 100000) {
            suggestedOptions.batchSize = 25000;
            reasoning.push('Medium dataset - using 25K batch size');
        }
        else {
            suggestedOptions.batchSize = 50000;
            reasoning.push('Large dataset - using 50K batch size for optimal performance');
        }

        // Columns are paired by position; surplus columns on either side cannot
        // be analysed, so tell the caller instead of dropping them silently.
        const pairCount = Math.min(leftColumns.length, rightColumns.length);
        if (leftColumns.length !== rightColumns.length) {
            warnings.push(`Join column count mismatch (${leftColumns.length} left vs ${rightColumns.length} right) - only the first ${pairCount} pair(s) were analyzed`);
        }

        // Analyze column types for fuzzy matching recommendations
        for (let i = 0; i < pairCount; i++) {
            const leftCol = leftColumns[i];
            const rightCol = rightColumns[i];
            const leftValues = leftDf.getColumn(leftCol);
            const rightValues = rightDf.getColumn(rightCol);
            // A missing or empty column has no meaningful uniqueness ratio.
            if (!leftValues?.length || !rightValues?.length) {
                warnings.push(`No values available for columns ${leftCol}/${rightCol} - skipping cardinality analysis`);
                continue;
            }
            // Ratio of distinct values to rows; close to 1 suggests identifiers.
            const leftCardinality = new Set(leftValues).size / leftValues.length;
            const rightCardinality = new Set(rightValues).size / rightValues.length;
            if (leftCardinality > 0.9 && rightCardinality > 0.9) {
                suggestedOptions.enableFuzzyMatching = false;
                reasoning.push(`High cardinality columns detected (${leftCol}/${rightCol}) - disabling fuzzy matching`);
                suggestedOptions.confidenceThreshold = 0.9;
                // Several qualifying pairs must not push confidence past 1.
                confidence = Math.min(1, confidence + 0.1);
            }
        }

        // Performance hints from adapters (both adapters are known to exist
        // here, otherwise the early return above would have fired).
        const leftHints = leftAdapter.getPerformanceHints();
        const rightHints = rightAdapter.getPerformanceHints();
        if (leftHints.memoryEfficient && rightHints.memoryEfficient) {
            suggestedOptions.cacheNormalizedValues = true;
            reasoning.push('Memory-efficient adapters detected - enabling value caching');
        }

        return {
            suggestedOptions,
            confidence,
            reasoning,
            warnings,
        };
    }
}
exports.SemanticJoinFactory = SemanticJoinFactory;
/**
* Performance monitoring utilities
*/
class SemanticJoinMetrics {
    /** Recorded metrics keyed by join id; a later record for the same id replaces the earlier one. */
    static metrics = new Map();

    /**
     * Stores the performance figures of one completed join.
     *
     * @param {string} joinId - Key under which the entry is stored.
     * @param {{performance: {totalTime: number}, statistics: {inputRowsLeft: number, inputRowsRight: number, outputRows: number}}} result
     *   Join result; `performance.totalTime` is read as milliseconds.
     */
    static recordJoinPerformance(joinId, result) {
        const { performance, statistics } = result;
        const inputRows = statistics.inputRowsLeft + statistics.inputRowsRight;
        const seconds = performance.totalTime / 1000;
        // A zero or missing duration would produce Infinity/NaN and corrupt
        // every average computed later, so such joins count as 0 rows/second.
        const throughput = Number.isFinite(seconds) && seconds > 0 ? inputRows / seconds : 0;
        this.metrics.set(joinId, {
            timestamp: new Date(),
            performance,
            statistics,
            inputRows,
            outputRows: statistics.outputRows,
            throughput,
        });
    }

    /**
     * Aggregates all recorded joins into per-join averages.
     *
     * @returns {{averageThroughput: number, averageConfidence: number, totalJoins: number, performanceBreakdown: object}}
     *   With no recorded joins every number is 0 and `performanceBreakdown`
     *   is an empty object; otherwise it holds the mean `totalTime`,
     *   `normalizationTime` and `joinTime`.
     */
    static getPerformanceReport() {
        const allMetrics = Array.from(this.metrics.values());
        const totalJoins = allMetrics.length;
        if (totalJoins === 0) {
            return {
                averageThroughput: 0,
                averageConfidence: 0,
                totalJoins: 0,
                performanceBreakdown: {},
            };
        }
        // Arithmetic mean of one numeric field across all recorded joins.
        const mean = (pick) => allMetrics.reduce((sum, entry) => sum + pick(entry), 0) / totalJoins;
        return {
            averageThroughput: mean((m) => m.throughput),
            averageConfidence: mean((m) => m.statistics.confidence.average),
            totalJoins,
            performanceBreakdown: {
                totalTime: mean((m) => m.performance.totalTime),
                normalizationTime: mean((m) => m.performance.normalizationTime),
                joinTime: mean((m) => m.performance.joinTime),
            },
        };
    }

    /** Discards every recorded metric. */
    static clearMetrics() {
        this.metrics.clear();
    }
}
exports.SemanticJoinMetrics = SemanticJoinMetrics;
// Unit Conversion & Time Alignment Operators
var unit_convert_js_1 = require("./unit-convert.js");
Object.defineProperty(exports, "UnitConverter", { enumerable: true, get: function () { return unit_convert_js_1.UnitConverter; } });
var fx_cache_js_1 = require("./fx-cache.js");
Object.defineProperty(exports, "FXCache", { enumerable: true, get: function () { return fx_cache_js_1.FXCache; } });
Object.defineProperty(exports, "OfflineMode", { enumerable: true, get: function () { return fx_cache_js_1.OfflineMode; } });
var align_time_js_1 = require("./align-time.js");
Object.defineProperty(exports, "TimeAligner", { enumerable: true, get: function () { return align_time_js_1.TimeAligner; } });
var timezone_handler_js_1 = require("./timezone-handler.js");
Object.defineProperty(exports, "TimezoneHandler", { enumerable: true, get: function () { return timezone_handler_js_1.TimezoneHandler; } });
var grain_adjuster_js_1 = require("./grain-adjuster.js");
Object.defineProperty(exports, "GrainAdjuster", { enumerable: true, get: function () { return grain_adjuster_js_1.GrainAdjuster; } });
// Version and metadata
// Both version strings are hard-coded here and exported as-is.
exports.SEMANTIC_JOIN_VERSION = '1.0.0';
exports.UNIT_CONVERSION_VERSION = '1.0.0';
/**
 * Numeric performance and quality targets, exported as a plain (mutable) object.
 * Per the inline comments, time values are in milliseconds and the remaining
 * values are fractions (e.g. 0.92 means 92%).
 * NOTE(review): nothing in the visible part of this file reads these values —
 * presumably they are consumed by benchmarks or tests elsewhere; confirm.
 */
exports.PERFORMANCE_TARGETS = {
MAX_TIME_100K_ROWS: 100, // milliseconds
MIN_JOIN_ACCURACY: 0.92, // 92%
TARGET_CACHE_HIT_RATE: 0.90, // 90%
JOIN_FAILURE_REDUCTION: 0.30, // 30% vs standard joins
MAX_UNIT_CONVERSION_TIME: 50, // milliseconds
MAX_TIMEZONE_BATCH_TIME: 100, // milliseconds for 1000 timestamps
MAX_GRAIN_ADJUSTMENT_TIME: 2000 // milliseconds for 10K timestamps
};
//# sourceMappingURL=index.js.map