UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

243 lines 12.7 kB
"use strict"; /** * Semantic Data Science Operators * * This module provides comprehensive operators for semantic data processing: * - Unit conversion with multi-category support (currency, temperature, distance, time, mass) * - Time alignment with timezone conversion and granularity adjustment * - Semantic join operations for intelligent data joining * * Key Features: * - Multi-unit conversion with FX rate caching * - Temporal alignment with statistical preservation * - Timezone-aware data processing * - Performance optimized (<50ms per operation) * - Batch processing support * * Performance Targets: * - <50ms per unit conversion * - <100ms for timezone batch conversions * - <2s for 10K timestamp grain adjustments */ Object.defineProperty(exports, "__esModule", { value: true }); exports.PERFORMANCE_TARGETS = exports.UNIT_CONVERSION_VERSION = exports.SEMANTIC_JOIN_VERSION = exports.GrainAdjuster = exports.TimezoneHandler = exports.TimeAligner = exports.OfflineMode = exports.FXCache = exports.UnitConverter = exports.SemanticJoinMetrics = exports.SemanticJoinFactory = exports.joinAdapterRegistry = exports.getSupportedJoinTypes = exports.registerJoinAdapter = exports.getJoinAdapter = exports.DataFrameJoinAdapterRegistry = exports.PolarsJoinAdapter = exports.PandasJoinAdapter = exports.JoinConfidenceCalculator = exports.JoinCostModel = exports.SemanticJoinPlanner = exports.SemanticJoinOperator = void 0; // Core semantic join operator var semantic_join_1 = require("./semantic-join"); Object.defineProperty(exports, "SemanticJoinOperator", { enumerable: true, get: function () { return semantic_join_1.SemanticJoinOperator; } }); const semantic_join_2 = require("./semantic-join"); const dataframe_join_adapters_1 = require("./dataframe-join-adapters"); // Query optimization and planning var join_planner_1 = require("./join-planner"); Object.defineProperty(exports, "SemanticJoinPlanner", { enumerable: true, get: function () { return join_planner_1.SemanticJoinPlanner; } }); 
Object.defineProperty(exports, "JoinCostModel", { enumerable: true, get: function () { return join_planner_1.JoinCostModel; } });
// Confidence scoring system
var join_confidence_1 = require("./join-confidence");
Object.defineProperty(exports, "JoinConfidenceCalculator", { enumerable: true, get: function () { return join_confidence_1.JoinConfidenceCalculator; } });
// DataFrame adapter support
var dataframe_join_adapters_2 = require("./dataframe-join-adapters");
Object.defineProperty(exports, "PandasJoinAdapter", { enumerable: true, get: function () { return dataframe_join_adapters_2.PandasJoinAdapter; } });
Object.defineProperty(exports, "PolarsJoinAdapter", { enumerable: true, get: function () { return dataframe_join_adapters_2.PolarsJoinAdapter; } });
Object.defineProperty(exports, "DataFrameJoinAdapterRegistry", { enumerable: true, get: function () { return dataframe_join_adapters_2.DataFrameJoinAdapterRegistry; } });
Object.defineProperty(exports, "getJoinAdapter", { enumerable: true, get: function () { return dataframe_join_adapters_2.getJoinAdapter; } });
Object.defineProperty(exports, "registerJoinAdapter", { enumerable: true, get: function () { return dataframe_join_adapters_2.registerJoinAdapter; } });
Object.defineProperty(exports, "getSupportedJoinTypes", { enumerable: true, get: function () { return dataframe_join_adapters_2.getSupportedJoinTypes; } });
Object.defineProperty(exports, "joinAdapterRegistry", { enumerable: true, get: function () { return dataframe_join_adapters_2.joinAdapterRegistry; } });
/**
 * Utility functions for creating and configuring semantic joins.
 */
class SemanticJoinFactory {
    /**
     * Creates a pre-configured semantic join operator optimized for common use cases.
     *
     * @param {*} cidRegistry - Passed straight to the SemanticJoinOperator constructor.
     * @param {{enableHighPerformance?: boolean, enableHighAccuracy?: boolean}} [options]
     * @returns {*} The configured SemanticJoinOperator instance.
     */
    static createOptimized(cidRegistry, options = {}) {
        const joinOperator = new semantic_join_2.SemanticJoinOperator(cidRegistry);
        if (options.enableHighPerformance) {
            // Cheap normalizer: stringify, lowercase, trim.
            // Only null/undefined/NaN count as "missing"; valid falsy keys such
            // as 0 or false must keep their own text so they do not collide
            // with empty values.
            joinOperator.addNormalizer('default', (value) => {
                const isMissing = value == null || Number.isNaN(value);
                return (isMissing ? '' : String(value)).toLowerCase().trim();
            });
        }
        if (options.enableHighAccuracy) {
            // Placeholder: more sophisticated normalizers are not implemented yet.
        }
        return joinOperator;
    }
    /**
     * Creates default join options optimized for specific scenarios.
     *
     * @param {string} scenario - 'customer_matching', 'product_catalog' or 'general';
     *   any other value falls back to the 'general' defaults.
     * @returns {object} A fresh options object.
     */
    static getDefaultOptions(scenario) {
        switch (scenario) {
            case 'customer_matching':
                return {
                    confidenceThreshold: 0.8,
                    enableFuzzyMatching: true,
                    fuzzyThreshold: 0.7,
                    autoSelectNormalizers: true,
                    cacheNormalizedValues: true,
                    batchSize: 25000
                };
            case 'product_catalog':
                return {
                    confidenceThreshold: 0.9,
                    enableFuzzyMatching: false, // Exact matches preferred for products
                    autoSelectNormalizers: true,
                    cacheNormalizedValues: true,
                    batchSize: 50000
                };
            case 'general':
            default:
                return {
                    confidenceThreshold: 0.7,
                    enableFuzzyMatching: true,
                    fuzzyThreshold: 0.8,
                    autoSelectNormalizers: true,
                    cacheNormalizedValues: true,
                    batchSize: 10000
                };
        }
    }
    /**
     * Analyzes two datasets and suggests an optimal join configuration.
     *
     * @param {*} left - Left dataset; resolved to an adapter via getJoinAdapter.
     * @param {*} right - Right dataset; resolved to an adapter via getJoinAdapter.
     * @param {string[]} leftColumns - Join columns on the left, paired by index with rightColumns.
     * @param {string[]} rightColumns - Join columns on the right.
     * @param {*} cidRegistry - Unused by the analysis; kept for signature compatibility.
     * @returns {Promise<{suggestedOptions: object, confidence: number, reasoning: string[], warnings: string[]}>}
     *   `confidence` is always within [0, 1].
     */
    static async analyzeAndSuggestJoinConfig(left, right, leftColumns, rightColumns, cidRegistry) {
        // Get basic adapter info
        const leftAdapter = (0, dataframe_join_adapters_1.getJoinAdapter)(left);
        const rightAdapter = (0, dataframe_join_adapters_1.getJoinAdapter)(right);
        const reasoning = [];
        const warnings = [];
        let confidence = 0.8;
        // Analyze data sizes
        const leftDf = leftAdapter ? leftAdapter.toDataFrameLike(left) : null;
        const rightDf = rightAdapter ? rightAdapter.toDataFrameLike(right) : null;
        if (!leftDf || !rightDf) {
            return {
                suggestedOptions: this.getDefaultOptions('general'),
                confidence: 0.3,
                reasoning: ['Unable to analyze data structure'],
                warnings: ['Using default configuration due to analysis failure']
            };
        }
        const leftRows = leftDf.shape[0];
        const rightRows = rightDf.shape[0];
        const totalRows = leftRows + rightRows;
        const suggestedOptions = {
            confidenceThreshold: 0.7,
            enableFuzzyMatching: true,
            fuzzyThreshold: 0.8,
            autoSelectNormalizers: true,
            cacheNormalizedValues: true
        };
        // Adjust batch size based on data size
        if (totalRows < 10000) {
            // A batch size of 0 is never usable, so empty inputs still get 1.
            suggestedOptions.batchSize = Math.max(totalRows, 1);
            reasoning.push('Small dataset - using single batch processing');
        }
        else if (totalRows < 100000) {
            suggestedOptions.batchSize = 25000;
            reasoning.push('Medium dataset - using 25K batch size');
        }
        else {
            suggestedOptions.batchSize = 50000;
            reasoning.push('Large dataset - using 50K batch size for optimal performance');
        }
        // Columns are paired by position; surplus columns on either side are not analyzed.
        if (leftColumns.length !== rightColumns.length) {
            warnings.push(`Column count mismatch (${leftColumns.length} left vs ${rightColumns.length} right) - only paired columns were analyzed`);
        }
        const pairCount = Math.min(leftColumns.length, rightColumns.length);
        // Share of distinct values in a column (1.0 means every value is unique).
        const distinctRatio = (values) => new Set(values).size / values.length;
        // Analyze column types for fuzzy matching recommendations
        for (let i = 0; i < pairCount; i++) {
            const leftCol = leftColumns[i];
            const rightCol = rightColumns[i];
            const leftValues = leftDf.getColumn(leftCol);
            const rightValues = rightDf.getColumn(rightCol);
            if (leftValues == null || rightValues == null) {
                warnings.push(`Column pair ${leftCol}/${rightCol} could not be read - skipped`);
                continue;
            }
            if (leftValues.length === 0 || rightValues.length === 0) {
                // Cardinality is undefined for an empty column; nothing to infer.
                continue;
            }
            // Check for high cardinality (likely identifiers)
            if (distinctRatio(leftValues) > 0.9 && distinctRatio(rightValues) > 0.9) {
                suggestedOptions.enableFuzzyMatching = false;
                reasoning.push(`High cardinality columns detected (${leftCol}/${rightCol}) - disabling fuzzy matching`);
                suggestedOptions.confidenceThreshold = 0.9;
                // Each qualifying pair raises confidence, but it must stay <= 1.
                confidence = Math.min(1, confidence + 0.1);
            }
        }
        // Performance hints from adapters
        if (leftAdapter && rightAdapter) {
            const leftHints = leftAdapter.getPerformanceHints();
            const rightHints = rightAdapter.getPerformanceHints();
            if (leftHints.memoryEfficient && rightHints.memoryEfficient) {
                suggestedOptions.cacheNormalizedValues = true;
                reasoning.push('Memory-efficient adapters detected - enabling value caching');
            }
        }
        return {
            suggestedOptions,
            confidence,
            reasoning,
            warnings
        };
    }
}
exports.SemanticJoinFactory = SemanticJoinFactory;
/**
 * Performance monitoring utilities.
 *
 * Keeps one metrics record per join id in a static Map; recording the same
 * id again overwrites the earlier record.
 */
class SemanticJoinMetrics {
    static metrics = new Map();
    /**
     * Stores a metrics record derived from a join result.
     *
     * @param {*} joinId - Map key for this record.
     * @param {object} result - Must expose `performance.totalTime` and
     *   `statistics.{inputRowsLeft, inputRowsRight, outputRows}`.
     */
    static recordJoinPerformance(joinId, result) {
        const stats = result.statistics;
        const inputRows = stats.inputRowsLeft + stats.inputRowsRight;
        // totalTime is divided by 1000 before use as the throughput denominator.
        const elapsedSeconds = result.performance.totalTime / 1000;
        this.metrics.set(joinId, {
            timestamp: new Date(),
            performance: result.performance,
            statistics: stats,
            inputRows,
            outputRows: stats.outputRows,
            // A zero/invalid duration would yield Infinity or NaN and poison
            // the averages in getPerformanceReport; report 0 instead.
            throughput: elapsedSeconds > 0 ? inputRows / elapsedSeconds : 0
        });
    }
    /**
     * Aggregates all recorded joins into averages.
     *
     * @returns {{averageThroughput: number, averageConfidence: number, totalJoins: number, performanceBreakdown: object}}
     *   With no recorded joins, all numbers are 0 and the breakdown is empty.
     */
    static getPerformanceReport() {
        const allMetrics = Array.from(this.metrics.values());
        if (allMetrics.length === 0) {
            return {
                averageThroughput: 0,
                averageConfidence: 0,
                totalJoins: 0,
                performanceBreakdown: {}
            };
        }
        // Arithmetic mean of one numeric field across all records.
        const mean = (pick) => allMetrics.reduce((sum, m) => sum + pick(m), 0) / allMetrics.length;
        return {
            averageThroughput: mean((m) => m.throughput),
            averageConfidence: mean((m) => m.statistics.confidence.average),
            totalJoins: allMetrics.length,
            performanceBreakdown: {
                totalTime: mean((m) => m.performance.totalTime),
                normalizationTime: mean((m) => m.performance.normalizationTime),
                joinTime: mean((m) => m.performance.joinTime)
            }
        };
    }
    /** Removes every recorded metrics entry. */
    static clearMetrics() {
        this.metrics.clear();
    }
}
exports.SemanticJoinMetrics = SemanticJoinMetrics;
// Unit Conversion & Time Alignment Operators
var unit_convert_js_1 = require("./unit-convert.js");
Object.defineProperty(exports, "UnitConverter", { enumerable: true, get: function () { return unit_convert_js_1.UnitConverter; } });
var fx_cache_js_1 = require("./fx-cache.js");
Object.defineProperty(exports, "FXCache", { enumerable: true, get: function () { return fx_cache_js_1.FXCache; } }); Object.defineProperty(exports, "OfflineMode", { enumerable: true, get: function () { return fx_cache_js_1.OfflineMode; } }); var align_time_js_1 = require("./align-time.js"); Object.defineProperty(exports, "TimeAligner", { enumerable: true, get: function () { return align_time_js_1.TimeAligner; } }); var timezone_handler_js_1 = require("./timezone-handler.js"); Object.defineProperty(exports, "TimezoneHandler", { enumerable: true, get: function () { return timezone_handler_js_1.TimezoneHandler; } }); var grain_adjuster_js_1 = require("./grain-adjuster.js"); Object.defineProperty(exports, "GrainAdjuster", { enumerable: true, get: function () { return grain_adjuster_js_1.GrainAdjuster; } }); // Version and metadata exports.SEMANTIC_JOIN_VERSION = '1.0.0'; exports.UNIT_CONVERSION_VERSION = '1.0.0'; exports.PERFORMANCE_TARGETS = { MAX_TIME_100K_ROWS: 100, // milliseconds MIN_JOIN_ACCURACY: 0.92, // 92% TARGET_CACHE_HIT_RATE: 0.90, // 90% JOIN_FAILURE_REDUCTION: 0.30, // 30% vs standard joins MAX_UNIT_CONVERSION_TIME: 50, // milliseconds MAX_TIMEZONE_BATCH_TIME: 100, // milliseconds for 1000 timestamps MAX_GRAIN_ADJUSTMENT_TIME: 2000 // milliseconds for 10K timestamps }; //# sourceMappingURL=index.js.map