UNPKG

semantic-ds-toolkit

Version:

Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference

461 lines 16.2 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.joinAdapterRegistry = exports.DataFrameJoinAdapterRegistry = exports.PolarsJoinAdapter = exports.PandasJoinAdapter = void 0;
exports.getJoinAdapter = getJoinAdapter;
exports.registerJoinAdapter = registerJoinAdapter;
exports.getSupportedJoinTypes = getSupportedJoinTypes;

/**
 * Join adapter for pandas-style DataFrame objects.
 *
 * Converts a DataFrame-shaped object into the generic "DataFrame-like" view
 * (columns / dtypes / shape / sample / getColumn) and converts a row-oriented
 * join result back into a column-oriented, pandas-like plain object.
 */
class PandasJoinAdapter {
    name = 'pandas';

    /**
     * Reports whether `obj` looks like a pandas DataFrame: an object whose
     * constructor is named `DataFrame` and that defines `columns` and `dtypes`.
     *
     * @param {*} obj - Candidate value.
     * @returns {boolean} Always a real boolean, also for null/undefined input.
     */
    canHandle(obj) {
        if (!obj || typeof obj !== 'object') {
            return false;
        }
        return Boolean(obj.constructor) &&
            obj.constructor.name === 'DataFrame' &&
            typeof obj.columns !== 'undefined' &&
            typeof obj.dtypes !== 'undefined';
    }

    /**
     * Builds the generic DataFrame-like view over `df`.
     *
     * @param {object} df - DataFrame-shaped object.
     * @returns {{columns: string[], dtypes: Object, shape: number[], sample: Function, getColumn: Function}}
     */
    toDataFrameLike(df) {
        return {
            columns: this.getColumnNames(df),
            dtypes: this.getDataTypes(df),
            shape: this.getShape(df),
            sample: (n = 100) => this.sampleData(df, n),
            getColumn: (name) => this.getColumn(df, name)
        };
    }

    /**
     * Converts a join result (`result.data` is an array of row objects) into a
     * pandas-like structure.
     *
     * For an empty result, the returned frame carries `left_`/`right_` prefixed
     * column names derived from the two inputs plus `_semantic_join_meta`.
     * For a non-empty result, the column set is taken from the keys of the
     * first row.
     *
     * @param {{data: Object[]}} result - Join result.
     * @param {object} originalLeft - Left input frame (used for empty results).
     * @param {object} originalRight - Right input frame (used for empty results).
     * @param {object} options - Not read by this method.
     * @returns {object} Pandas-like plain object.
     */
    fromJoinResult(result, originalLeft, originalRight, options) {
        // For pandas, we construct a new DataFrame-like object from the result
        const data = result.data;
        if (data.length === 0) {
            // Return empty DataFrame with appropriate columns
            const leftCols = this.getColumnNames(originalLeft).map(col => `left_${col}`);
            const rightCols = this.getColumnNames(originalRight).map(col => `right_${col}`);
            const allCols = [...leftCols, ...rightCols, '_semantic_join_meta'];
            return this.createEmptyDataFrame(allCols);
        }
        // Pivot the row objects into column arrays
        const columns = Object.keys(data[0]);
        const pandasData = {};
        for (const col of columns) {
            pandasData[col] = data.map(row => row[col]);
        }
        return this.createDataFrame(pandasData);
    }

    /**
     * Returns a copy of `options` with this adapter's defaults applied:
     * `batchSize` falls back to 50000 when falsy, and `cacheNormalizedValues`
     * is enabled unless it was explicitly set to `false`.
     *
     * @param {object} options - Join options.
     * @returns {object} New options object.
     */
    optimizeForType(options) {
        return {
            ...options,
            batchSize: options.batchSize || 50000,
            cacheNormalizedValues: options.cacheNormalizedValues !== false
        };
    }

    /**
     * @returns {{preferredBatchSize: number, supportsLazyExecution: boolean, supportsParallel: boolean, memoryEfficient: boolean}}
     *   Static performance hints for this adapter.
     */
    getPerformanceHints() {
        return {
            preferredBatchSize: 50000,
            supportsLazyExecution: false,
            supportsParallel: true,
            memoryEfficient: true
        };
    }

    /**
     * Extracts column names from `df.columns` (array, object with `tolist()`,
     * or object with `values`), falling back to the keys of `df.dtypes`.
     *
     * @param {object} df
     * @returns {string[]}
     */
    getColumnNames(df) {
        if (Array.isArray(df.columns)) {
            return df.columns;
        }
        if (df.columns && typeof df.columns.tolist === 'function') {
            return df.columns.tolist();
        }
        if (df.columns && typeof df.columns.values !== 'undefined') {
            return Array.from(df.columns.values);
        }
        return Object.keys(df.dtypes || {});
    }

    /**
     * Maps each key of the `df.dtypes` object to its stringified dtype.
     *
     * @param {object} df
     * @returns {Object<string, string>} Empty when `df.dtypes` is missing.
     */
    getDataTypes(df) {
        const result = {};
        if (df.dtypes) {
            if (typeof df.dtypes === 'object') {
                for (const [col, dtype] of Object.entries(df.dtypes)) {
                    result[col] = String(dtype);
                }
            }
        }
        return result;
    }

    /**
     * Returns `[rows, columns]`, preferring `df.shape` and otherwise deriving
     * it from `df.length` and the column names.
     *
     * @param {object} df
     * @returns {number[]}
     */
    getShape(df) {
        if (Array.isArray(df.shape) && df.shape.length >= 2) {
            return [df.shape[0], df.shape[1]];
        }
        const columns = this.getColumnNames(df);
        const rowCount = df.length || 0;
        return [rowCount, columns.length];
    }

    /**
     * Returns the first `n` values of every column, keyed by column name.
     *
     * @param {object} df
     * @param {number} [n=100]
     * @returns {Object<string, Array>}
     */
    sampleData(df, n = 100) {
        const result = {};
        const columns = this.getColumnNames(df);
        for (const col of columns) {
            result[col] = this.getColumn(df, col).slice(0, n);
        }
        return result;
    }

    /**
     * Returns the values of one column as an array.
     *
     * Looks at `df[columnName]` first (array, object with `tolist()`, object
     * with `values`). If none of those match, falls back to
     * `df.data[columnName]`, which is where `createDataFrame` and
     * `createEmptyDataFrame` keep their column arrays; without that fallback
     * the adapter could not read the frames it produces itself.
     *
     * @param {object} df
     * @param {string} columnName
     * @returns {Array} Empty array when the column cannot be found.
     */
    getColumn(df, columnName) {
        const direct = df[columnName];
        if (Array.isArray(direct)) {
            return direct;
        }
        if (direct && typeof direct.tolist === 'function') {
            return direct.tolist();
        }
        if (direct && direct.values) {
            return Array.from(direct.values);
        }
        const stored = df.data?.[columnName];
        if (Array.isArray(stored)) {
            return stored;
        }
        return [];
    }

    /**
     * Builds an empty mock frame (not a real pandas DataFrame) with the given
     * column names.
     *
     * @param {string[]} columns
     * @returns {{columns: string[], shape: number[], data: Object, empty: boolean}}
     */
    createEmptyDataFrame(columns) {
        const data = {};
        for (const col of columns) {
            data[col] = [];
        }
        return {
            columns,
            shape: [0, columns.length],
            data,
            empty: true
        };
    }

    /**
     * Builds a mock frame (not a real pandas DataFrame) from columnar data.
     * The row count is the length of the first column.
     *
     * @param {Object<string, Array>} data - Column name to values.
     * @returns {{columns: string[], shape: number[], data: Object, dtypes: Object, length: number}}
     */
    createDataFrame(data) {
        const columns = Object.keys(data);
        const rowCount = data[columns[0]]?.length || 0;
        return {
            columns,
            shape: [rowCount, columns.length],
            data,
            dtypes: this.inferDataTypes(data),
            length: rowCount
        };
    }

    /**
     * Infers a pandas-style dtype name per column from its first non-null
     * value: `int64`, `float64`, `bool`, `datetime64[ns]`, otherwise `object`.
     *
     * @param {Object<string, Array>} data
     * @returns {Object<string, string>}
     */
    inferDataTypes(data) {
        const dtypes = {};
        for (const [col, values] of Object.entries(data)) {
            if (values.length === 0) {
                dtypes[col] = 'object';
                continue;
            }
            const firstNonNull = values.find(v => v != null);
            if (firstNonNull == null) {
                dtypes[col] = 'object';
                continue;
            }
            if (typeof firstNonNull === 'number') {
                dtypes[col] = Number.isInteger(firstNonNull) ? 'int64' : 'float64';
            }
            else if (typeof firstNonNull === 'boolean') {
                dtypes[col] = 'bool';
            }
            else if (firstNonNull instanceof Date) {
                dtypes[col] = 'datetime64[ns]';
            }
            else {
                dtypes[col] = 'object';
            }
        }
        return dtypes;
    }
}
exports.PandasJoinAdapter = PandasJoinAdapter;

/**
 * Join adapter for Polars-style DataFrame / LazyFrame objects.
 *
 * Frames whose constructor is named `LazyFrame` are collected (via
 * `collect()`, when present) before being inspected, and join results are
 * returned as a LazyFrame-like wrapper when either input was lazy.
 */
class PolarsJoinAdapter {
    name = 'polars';

    /**
     * Reports whether `obj` looks like a Polars frame: an object whose
     * constructor is named `DataFrame` or `LazyFrame` and that has either a
     * `getColumns` method or a `columns` property.
     *
     * @param {*} obj - Candidate value.
     * @returns {boolean} Always a real boolean, also for null/undefined input.
     */
    canHandle(obj) {
        if (!obj || typeof obj !== 'object') {
            return false;
        }
        const typeName = obj.constructor ? obj.constructor.name : undefined;
        const isFrame = typeName === 'DataFrame' || typeName === 'LazyFrame';
        return isFrame &&
            (typeof obj.getColumns === 'function' || typeof obj.columns !== 'undefined');
    }

    /**
     * Builds the generic DataFrame-like view over `df`, collecting it first
     * if it is a LazyFrame.
     *
     * @param {object} df
     * @returns {{columns: string[], dtypes: Object, shape: number[], sample: Function, getColumn: Function}}
     */
    toDataFrameLike(df) {
        // Handle LazyFrame by collecting if necessary
        const workingDf = this.ensureCollected(df);
        return {
            columns: this.getColumnNames(workingDf),
            dtypes: this.getDataTypes(workingDf),
            shape: this.getShape(workingDf),
            sample: (n = 100) => this.sampleData(workingDf, n),
            getColumn: (name) => this.getColumn(workingDf, name)
        };
    }

    /**
     * Converts a join result (`result.data` is an array of row objects) into a
     * Polars-like structure.
     *
     * For an empty result, the returned frame carries `left_`/`right_` prefixed
     * column names derived from the two inputs plus `_semantic_join_meta`.
     * For a non-empty result, the column set is taken from the keys of the
     * first row, and the frame is wrapped as lazy when either input was lazy.
     *
     * @param {{data: Object[]}} result - Join result.
     * @param {object} originalLeft - Left input frame.
     * @param {object} originalRight - Right input frame.
     * @param {object} options - Not read by this method.
     * @returns {object} Polars-like plain object.
     */
    fromJoinResult(result, originalLeft, originalRight, options) {
        const data = result.data;
        if (data.length === 0) {
            const leftCols = this.getColumnNames(originalLeft).map(col => `left_${col}`);
            const rightCols = this.getColumnNames(originalRight).map(col => `right_${col}`);
            const allCols = [...leftCols, ...rightCols, '_semantic_join_meta'];
            return this.createEmptyDataFrame(allCols);
        }
        // Convert array of row objects to columnar format
        const columns = Object.keys(data[0]);
        const columnarData = {};
        for (const col of columns) {
            columnarData[col] = data.map(row => row[col]);
        }
        return this.createPolarsDataFrame(columnarData, this.isLazyFrame(originalLeft) || this.isLazyFrame(originalRight));
    }

    /**
     * Returns a copy of `options` with this adapter's defaults applied:
     * `batchSize` falls back to 100000 when falsy, and `cacheNormalizedValues`
     * is enabled unless it was explicitly set to `false`.
     *
     * @param {object} options - Join options.
     * @returns {object} New options object.
     */
    optimizeForType(options) {
        return {
            ...options,
            batchSize: options.batchSize || 100000,
            cacheNormalizedValues: options.cacheNormalizedValues !== false
        };
    }

    /**
     * @returns {{preferredBatchSize: number, supportsLazyExecution: boolean, supportsParallel: boolean, memoryEfficient: boolean}}
     *   Static performance hints for this adapter.
     */
    getPerformanceHints() {
        return {
            preferredBatchSize: 100000,
            supportsLazyExecution: true,
            supportsParallel: true,
            memoryEfficient: true
        };
    }

    /**
     * Returns `df.collect()` for a LazyFrame that has a `collect` member,
     * otherwise returns `df` unchanged.
     *
     * @param {object} df
     * @returns {object}
     */
    ensureCollected(df) {
        if (this.isLazyFrame(df)) {
            return df.collect ? df.collect() : df;
        }
        return df;
    }

    /**
     * Truthy when `df.constructor.name` is `LazyFrame`.
     *
     * @param {*} df
     * @returns {*} `true` for a LazyFrame, otherwise a falsy value.
     */
    isLazyFrame(df) {
        return df && df.constructor && df.constructor.name === 'LazyFrame';
    }

    /**
     * Extracts column names via `getColumns()`, then an array-valued
     * `columns` property, then `columnNames()`.
     *
     * @param {object} df
     * @returns {string[]} Empty array when none of the accessors exist.
     */
    getColumnNames(df) {
        if (typeof df.getColumns === 'function') {
            return df.getColumns();
        }
        if (df.columns && Array.isArray(df.columns)) {
            return df.columns;
        }
        if (typeof df.columnNames === 'function') {
            return df.columnNames();
        }
        return [];
    }

    /**
     * Maps column names to stringified dtypes.
     *
     * `dtypes` may be a method returning an array or an array-valued
     * property; entries are paired with the column names by position.
     * When no usable `dtypes` member exists, `schema` is consulted instead,
     * either as a method returning an object or as an object-valued property.
     *
     * @param {object} df
     * @returns {Object<string, string>}
     */
    getDataTypes(df) {
        const result = {};
        const columns = this.getColumnNames(df);
        const dtypesIsMethod = typeof df.dtypes === 'function';
        if (dtypesIsMethod || Array.isArray(df.dtypes)) {
            const dtypes = dtypesIsMethod ? df.dtypes() : df.dtypes;
            if (Array.isArray(dtypes)) {
                columns.forEach((col, i) => {
                    if (dtypes[i]) {
                        result[col] = String(dtypes[i]);
                    }
                });
            }
            return result;
        }
        const schema = typeof df.schema === 'function' ? df.schema() : df.schema;
        // Guard against a missing/null schema so Object.entries cannot throw
        if (schema && typeof schema === 'object') {
            for (const [col, dtype] of Object.entries(schema)) {
                result[col] = String(dtype);
            }
        }
        return result;
    }

    /**
     * Returns `[height, width]`, using numeric `height`/`width` properties
     * when present and otherwise `len()` and the number of column names.
     *
     * @param {object} df
     * @returns {number[]}
     */
    getShape(df) {
        const height = typeof df.height === 'number' ? df.height :
            (typeof df.len === 'function' ? df.len() : 0);
        const width = typeof df.width === 'number' ? df.width :
            this.getColumnNames(df).length;
        return [height, width];
    }

    /**
     * Returns up to `n` values per column, using the first available of
     * `df.sample(n)`, `df.head(n)` or `df.limit(n)`; if none exist the full
     * columns of `df` are returned.
     *
     * @param {object} df
     * @param {number} [n=100]
     * @returns {Object<string, Array>}
     */
    sampleData(df, n = 100) {
        const result = {};
        const columns = this.getColumnNames(df);
        let sampledDf = df;
        if (typeof df.sample === 'function') {
            sampledDf = df.sample(n);
        }
        else if (typeof df.head === 'function') {
            sampledDf = df.head(n);
        }
        else if (typeof df.limit === 'function') {
            sampledDf = df.limit(n);
        }
        for (const col of columns) {
            result[col] = this.getColumn(sampledDf, col);
        }
        return result;
    }

    /**
     * Returns the values of one column as an array, via `df.getColumn(name)`
     * (`toArray()`, `toList()` or a plain array) and otherwise via
     * `df.select(name).toArray()`.
     *
     * @param {object} df
     * @param {string} columnName
     * @returns {Array} Empty array when the column cannot be read.
     */
    getColumn(df, columnName) {
        if (typeof df.getColumn === 'function') {
            const column = df.getColumn(columnName);
            if (column && typeof column.toArray === 'function') {
                return column.toArray();
            }
            if (column && typeof column.toList === 'function') {
                return column.toList();
            }
            if (Array.isArray(column)) {
                return column;
            }
        }
        if (typeof df.select === 'function') {
            try {
                const selected = df.select(columnName);
                if (selected && typeof selected.toArray === 'function') {
                    return selected.toArray();
                }
            }
            catch (e) {
                // Deliberate best-effort: a failed selection yields an empty column
            }
        }
        return [];
    }

    /**
     * Builds an empty mock frame with the given column names. Every column
     * is reported with dtype `Utf8`.
     *
     * @param {string[]} columns
     * @returns {object}
     */
    createEmptyDataFrame(columns) {
        const data = {};
        for (const col of columns) {
            data[col] = [];
        }
        return {
            columns,
            shape: [0, columns.length],
            height: 0,
            width: columns.length,
            data,
            empty: true,
            // Mock Polars methods
            getColumns: () => columns,
            dtypes: () => columns.map(() => 'Utf8')
        };
    }

    /**
     * Builds a mock Polars frame from columnar data. The row count is the
     * length of the first column. When `lazy` is truthy the frame is wrapped
     * so that its `constructor.name` is `LazyFrame` and `collect()` returns
     * the eager frame.
     *
     * @param {Object<string, Array>} data - Column name to values.
     * @param {boolean} [lazy=false]
     * @returns {object}
     */
    createPolarsDataFrame(data, lazy = false) {
        const columns = Object.keys(data);
        const rowCount = data[columns[0]]?.length || 0;
        const df = {
            columns,
            shape: [rowCount, columns.length],
            height: rowCount,
            width: columns.length,
            data,
            // Mock Polars DataFrame interface
            getColumns: () => columns,
            getColumn: (name) => ({
                toArray: () => data[name] || [],
                toList: () => data[name] || []
            }),
            dtypes: () => columns.map(col => this.inferPolarsDataType(data[col])),
            schema: () => {
                const schema = {};
                for (const col of columns) {
                    schema[col] = this.inferPolarsDataType(data[col]);
                }
                return schema;
            },
            // Additional Polars-like methods
            head: (n) => this.createPolarsDataFrame(this.limitData(data, n), lazy),
            limit: (n) => this.createPolarsDataFrame(this.limitData(data, n), lazy),
            sample: (n) => this.createPolarsDataFrame(this.sampleColumnarData(data, n), lazy)
        };
        if (lazy) {
            return {
                ...df,
                constructor: { name: 'LazyFrame' },
                collect: () => df,
            };
        }
        return df;
    }

    /**
     * Returns a copy of `data` with every column truncated to its first `n`
     * values.
     *
     * @param {Object<string, Array>} data
     * @param {number} n
     * @returns {Object<string, Array>}
     */
    limitData(data, n) {
        const limited = {};
        for (const [col, values] of Object.entries(data)) {
            limited[col] = values.slice(0, n);
        }
        return limited;
    }

    /**
     * Returns `n` randomly chosen rows (same row indices for every column).
     * `data` itself is returned when it has no columns or no more than `n`
     * rows.
     *
     * @param {Object<string, Array>} data
     * @param {number} n
     * @returns {Object<string, Array>}
     */
    sampleColumnarData(data, n) {
        const columns = Object.keys(data);
        if (columns.length === 0)
            return data;
        const totalRows = data[columns[0]].length;
        if (totalRows <= n)
            return data;
        const sampled = {};
        const indices = this.getRandomIndices(totalRows, n);
        for (const [col, values] of Object.entries(data)) {
            sampled[col] = indices.map(i => values[i]);
        }
        return sampled;
    }

    /**
     * Picks distinct random indices in `[0, total)`.
     *
     * The request is clamped to the number of available indices: asking for
     * more distinct values than exist could never be satisfied and would
     * otherwise loop forever.
     *
     * @param {number} total - Number of rows; expected to be a non-negative integer.
     * @param {number} count - Number of indices wanted.
     * @returns {number[]} At most `min(count, total)` distinct indices, in draw order.
     */
    getRandomIndices(total, count) {
        const available = Number.isFinite(total) && total > 0 ? Math.floor(total) : 0;
        const wanted = Math.min(count, available);
        const indices = new Set();
        while (indices.size < wanted) {
            indices.add(Math.floor(Math.random() * available));
        }
        return Array.from(indices);
    }

    /**
     * Infers a Polars-style dtype name from the first non-null value:
     * `Int64`, `Float64`, `Boolean`, `Datetime`, otherwise `Utf8`.
     *
     * @param {Array} values
     * @returns {string}
     */
    inferPolarsDataType(values) {
        if (values.length === 0)
            return 'Utf8';
        const firstNonNull = values.find(v => v != null);
        if (firstNonNull == null)
            return 'Utf8';
        if (typeof firstNonNull === 'number') {
            return Number.isInteger(firstNonNull) ? 'Int64' : 'Float64';
        }
        else if (typeof firstNonNull === 'boolean') {
            return 'Boolean';
        }
        else if (firstNonNull instanceof Date) {
            return 'Datetime';
        }
        else {
            return 'Utf8';
        }
    }
}
exports.PolarsJoinAdapter = PolarsJoinAdapter;

/**
 * Ordered collection of join adapters. Lookups return the first registered
 * adapter whose `canHandle` accepts the object, so registration order
 * decides ties. The pandas and polars adapters are registered on
 * construction, in that order.
 */
class DataFrameJoinAdapterRegistry {
    adapters = [];

    constructor() {
        this.registerDefaultAdapters();
    }

    /** Registers the built-in pandas and polars adapters. */
    registerDefaultAdapters() {
        this.register(new PandasJoinAdapter());
        this.register(new PolarsJoinAdapter());
    }

    /**
     * Appends an adapter to the registry.
     *
     * @param {{name: string, canHandle: Function}} adapter
     */
    register(adapter) {
        this.adapters.push(adapter);
    }

    /**
     * @param {*} obj
     * @returns {object|null} First adapter that can handle `obj`, or null.
     */
    findAdapter(obj) {
        for (const adapter of this.adapters) {
            if (adapter.canHandle(obj)) {
                return adapter;
            }
        }
        return null;
    }

    /**
     * @param {string} name
     * @returns {object|null} First adapter with that `name`, or null.
     */
    getAdapterByName(name) {
        return this.adapters.find(adapter => adapter.name === name) || null;
    }

    /** @returns {string[]} Names of all registered adapters, in order. */
    getSupportedTypes() {
        return this.adapters.map(adapter => adapter.name);
    }
}
exports.DataFrameJoinAdapterRegistry = DataFrameJoinAdapterRegistry;

// Module-wide registry shared by the helper functions below
const globalJoinAdapterRegistry = new DataFrameJoinAdapterRegistry();
exports.joinAdapterRegistry = globalJoinAdapterRegistry;

/**
 * Finds an adapter for `obj` in the module-wide registry.
 *
 * @param {*} obj
 * @returns {object|null}
 */
function getJoinAdapter(obj) {
    return globalJoinAdapterRegistry.findAdapter(obj);
}

/**
 * Adds an adapter to the module-wide registry.
 *
 * @param {{name: string, canHandle: Function}} adapter
 */
function registerJoinAdapter(adapter) {
    globalJoinAdapterRegistry.register(adapter);
}

/** @returns {string[]} Names of the adapters in the module-wide registry. */
function getSupportedJoinTypes() {
    return globalJoinAdapterRegistry.getSupportedTypes();
}
//# sourceMappingURL=dataframe-join-adapters.js.map