UNPKG

@flightstream/utils-arrow

Version:

Advanced utilities for working with Arrow data and Flight protocol in FlightStream for Node.js

192 lines (157 loc) 5.04 kB
import { inferType, mapToArrowType, isValidArrowType } from './type-system/index.js';

/**
 * Generic Schema Inference Utilities
 *
 * This module provides utilities for inferring Arrow schemas from various data formats.
 * It supports multiple data types and provides extensible type inference patterns.
 *
 * Key features:
 * 1. Type inference from sample data
 * 2. Support for multiple data formats (CSV, JSON, etc.)
 * 3. Configurable type detection rules
 * 4. Arrow schema generation
 * 5. Schema validation and normalization
 */

// Type reported whenever inference cannot settle on anything more specific.
const FALLBACK_TYPE = 'string';

/**
 * Tell whether a cell counts as "missing" for inference purposes.
 * @param {*} value - Cell value to test
 * @returns {boolean} True for null, undefined and the empty string
 */
function isMissingValue(value) {
  return value === null || value === undefined || value === '';
}

/**
 * Look up a truthy OWN property of a plain lookup table.
 *
 * Inherited members are ignored so that keys such as "constructor" or
 * "toString" do not resolve to Object.prototype functions.
 * @param {Object} table - Lookup table
 * @param {string} key - Key to look up
 * @returns {*} The own value when present and truthy, otherwise undefined
 */
function lookupOwn(table, key) {
  if (Object.prototype.hasOwnProperty.call(table, key) && table[key]) {
    return table[key];
  }
  return undefined;
}

/**
 * Infer schema from a collection of sample data
 *
 * Only the first `sampleSize` records are inspected. Column names are the
 * union of the own enumerable keys of every object record in that window.
 *
 * @param {Array} samples - Array of sample records
 * @param {Object} options - Schema inference options
 * @param {number} [options.sampleSize] - Maximum number of records to inspect
 *   (defaults to min(samples.length, 1000))
 * @param {number} [options.confidenceThreshold=0.8] - Forwarded to inferColumnType
 * @param {number} [options.nullThreshold=0.5] - Forwarded to inferColumnType
 * @returns {Object} Inferred schema mapping column name to type name;
 *   an empty object when there are no samples
 */
export function inferSchema(samples, options = {}) {
  // This guard must run before the options are destructured: the default for
  // `sampleSize` reads `samples.length`, which would throw on null/undefined.
  if (!samples || samples.length === 0) {
    return {};
  }

  const {
    sampleSize = Math.min(samples.length, 1000),
    confidenceThreshold = 0.8,
    nullThreshold = 0.5,
    ...typeOptions
  } = options;

  // Use subset of samples for performance
  const sampleData = samples.slice(0, sampleSize);

  // Extract all column names from samples (non-object records contribute none)
  const columnNames = new Set();
  sampleData.forEach(record => {
    if (record && typeof record === 'object') {
      Object.keys(record).forEach(key => columnNames.add(key));
    }
  });

  const schema = {};

  // Infer type for each column
  for (const columnName of columnNames) {
    // Records lacking the column yield undefined and are dropped here, so
    // absent cells do not count towards the column's null ratio.
    const columnValues = sampleData
      .map(record => record && record[columnName])
      .filter(value => value !== undefined);

    schema[columnName] = inferColumnType(columnValues, {
      confidenceThreshold,
      nullThreshold,
      ...typeOptions
    });
  }

  return schema;
}

/**
 * Infer type for a specific column based on its values
 *
 * Falls back to 'string' when there are no values, when the share of missing
 * values (null, undefined, '') exceeds `nullThreshold`, or when the most
 * common inferred type covers less than `confidenceThreshold` of the
 * non-missing values.
 *
 * @param {Array} values - Column values to analyze
 * @param {Object} options - Type inference options (also passed through to inferType)
 * @param {number} [options.confidenceThreshold=0.8] - Minimum share of
 *   non-missing values that must agree on a type
 * @param {number} [options.nullThreshold=0.5] - Maximum tolerated share of missing values
 * @returns {string} Inferred column type
 */
export function inferColumnType(values, options = {}) {
  const { confidenceThreshold = 0.8, nullThreshold = 0.5 } = options;

  if (!values || values.length === 0) {
    return FALLBACK_TYPE;
  }

  // If too many nulls, default to string
  const nullCount = values.filter(isMissingValue).length;
  if (nullCount / values.length > nullThreshold) {
    return FALLBACK_TYPE;
  }

  // Get non-null values for type inference
  const nonNullValues = values.filter(value => !isMissingValue(value));
  if (nonNullValues.length === 0) {
    return FALLBACK_TYPE;
  }

  // Count how many values map to each inferred type
  const typeCounts = {};
  nonNullValues.forEach(value => {
    const type = inferType(value, options);
    typeCounts[type] = (typeCounts[type] || 0) + 1;
  });

  // Find the most common type (descending by count)
  const [[mostCommonType, count]] = Object.entries(typeCounts)
    .sort(([, a], [, b]) => b - a);

  // Use the most common type only when it is dominant enough
  const confidence = count / nonNullValues.length;
  return confidence >= confidenceThreshold ? mostCommonType : FALLBACK_TYPE;
}

/**
 * Normalize schema by resolving type conflicts and applying rules
 *
 * For each column, in order:
 * 1. a per-column entry in `preferredTypes` replaces the inferred type;
 * 2. an entry in `typeRules` keyed by the ORIGINAL inferred type replaces the
 *    result of step 1 (so type rules take precedence over preferred types);
 * 3. a result rejected by isValidArrowType is reported with console.warn and
 *    replaced by 'string'.
 *
 * Only own properties of `preferredTypes` / `typeRules` are consulted.
 * A `strictMode` option is reserved for future use and currently ignored.
 *
 * @param {Object} schema - Raw inferred schema
 * @param {Object} options - Normalization options
 * @param {Object} [options.preferredTypes={}] - Map of column name to type
 * @param {Object} [options.typeRules={}] - Map of inferred type to replacement type
 * @returns {Object} Normalized schema
 */
export function normalizeSchema(schema, options = {}) {
  const { preferredTypes = {}, typeRules = {} } = options;

  const normalizedSchema = {};

  for (const [columnName, type] of Object.entries(schema)) {
    let normalizedType = type;

    // Apply preferred types
    const preferredType = lookupOwn(preferredTypes, columnName);
    if (preferredType) {
      normalizedType = preferredType;
    }

    // Apply type rules (keyed on the original type, overriding the above)
    const ruleType = lookupOwn(typeRules, type);
    if (ruleType) {
      normalizedType = ruleType;
    }

    // Validate type
    if (!isValidArrowType(normalizedType)) {
      console.warn(`Invalid Arrow type '${normalizedType}' for column '${columnName}', falling back to string`);
      normalizedType = FALLBACK_TYPE;
    }

    normalizedSchema[columnName] = normalizedType;
  }

  return normalizedSchema;
}

/**
 * Generate Arrow schema from inferred schema
 *
 * @param {Object} schema - Inferred schema
 * @param {Object} options - Arrow schema options
 * @param {boolean} [options.nullable=true] - Nullability applied to every field
 * @returns {Object} Arrow schema configuration: `fields` (one
 *   `{ name, type, nullable }` entry per column, with `type` produced by
 *   mapToArrowType) and `metadata` (`inference_timestamp` as an ISO-8601
 *   string and a fixed `source` tag)
 */
export function generateArrowSchema(schema, options = {}) {
  const { nullable = true } = options;

  const fields = Object.entries(schema).map(([columnName, type]) => ({
    name: columnName,
    type: mapToArrowType(type),
    nullable
  }));

  return {
    fields,
    metadata: {
      inference_timestamp: new Date().toISOString(),
      source: 'schema-inference'
    }
  };
}

export default {
  inferSchema,
  inferColumnType,
  normalizeSchema,
  generateArrowSchema
};