UNPKG

@flightstream/adapters-csv

Version:

CSV file adapter for Arrow Flight streaming with automatic schema inference

346 lines (306 loc) 10.3 kB
import * as arrow from 'apache-arrow';
import { ArrowBuilder } from '@flightstream/core-shared';

/**
 * CSV-specific Arrow builder.
 *
 * Extends ArrowBuilder with the CSV-facing pieces:
 *   1. Mapping of CSV column type names ('int64', 'string', ...) to Arrow data types.
 *   2. Construction of an Arrow schema from a `{ columnName: typeName }` object.
 *   3. Parsing of raw CSV lines straight into one array per column (typed arrays
 *      for numeric/boolean/temporal columns, a plain Array for strings) without
 *      building intermediate row objects.
 *
 * Cells that are empty or cannot be parsed are stored as zero values
 * (0, 0n, or '' for strings); this class does not produce nulls.
 *
 * Usage:
 *   const csvSchema = { id: 'int64', name: 'string', price: 'float64' };
 *   const builder = new CSVArrowBuilder(csvSchema);
 *   const vectors = builder.createTypedArraysFromCSVBatch(csvRows, headers, delimiter);
 *   // serializeFromArrays is not defined in this class; presumably inherited
 *   // from ArrowBuilder.
 *   const serialized = builder.serializeFromArrays(vectors);
 */
export class CSVArrowBuilder extends ArrowBuilder {
  /**
   * @param {Object<string, string>} csvSchema - Column name -> CSV type name
   * @param {Object} [options={}] - Passed through unchanged to ArrowBuilder
   */
  constructor(csvSchema, options = {}) {
    super(csvSchema, options);
    this.csvSchema = csvSchema;
  }

  // ===== IMPLEMENTATION OF ABSTRACT METHODS =====

  /**
   * Build the Arrow schema from the CSV schema and store it on `this.arrowSchema`.
   * Every field is declared nullable.
   *
   * Reads `this.sourceSchema` rather than `this.csvSchema`.
   * NOTE(review): `sourceSchema` is presumably assigned by the ArrowBuilder
   * constructor before this method runs - confirm against the base class.
   * @override
   */
  _buildArrowSchema() {
    const fields = [];

    for (const [columnName, csvType] of Object.entries(this.sourceSchema)) {
      const arrowType = this._mapSourceTypeToArrow(csvType);
      fields.push(arrow.Field.new(columnName, arrowType, true)); // nullable = true
    }

    this.arrowSchema = new arrow.Schema(fields);
  }

  /**
   * Map a CSV type name to an Arrow data type.
   * Unknown type names fall back to Utf8, the same as 'string'.
   *
   * @param {string} csvType - CSV type name (e.g., 'int64', 'string', 'float64')
   * @returns {arrow.DataType} Arrow data type
   * @override
   */
  _mapSourceTypeToArrow(csvType) {
    switch (csvType) {
      case 'boolean':
        return new arrow.Bool();
      case 'int32':
        return new arrow.Int32();
      case 'int64':
        return new arrow.Int64();
      case 'float32':
        return new arrow.Float32();
      case 'float64':
        return new arrow.Float64();
      case 'date':
        return new arrow.DateMillisecond();
      case 'timestamp':
        return new arrow.TimestampMillisecond();
      case 'string':
      default:
        return new arrow.Utf8();
    }
  }

  // ===== OPTIMIZED METHODS =====

  /**
   * Parse a batch of CSV lines directly into one array per schema column.
   *
   * Blank lines, non-string entries, and lines whose parsing or conversion
   * throws are skipped. The returned arrays are trimmed to the number of rows
   * actually stored. Headers that have no matching schema field are ignored;
   * schema fields that have no matching header keep their initial contents.
   *
   * @param {Array<string>} csvBatch - Array of CSV lines (excluding headers)
   * @param {Array<string>} headers - Column headers, in file order
   * @param {string} [delimiter=','] - CSV delimiter character
   * @returns {Object} Object with column names as keys and arrays as values
   */
  createTypedArraysFromCSVBatch(csvBatch, headers, delimiter = ',') {
    const typedArrays = {};
    const columnsByName = new Map();

    // Allocate one full-size array per schema field.
    for (const field of this.arrowSchema.fields) {
      const typedArray = this._createEmptyTypedArray(field.type, csvBatch.length);
      typedArrays[field.name] = typedArray;
      columnsByName.set(field.name, { type: field.type, typedArray });
    }

    // Resolve each header position to its column once, so the row loop does
    // no name lookups. Entries are undefined for headers outside the schema.
    const targets = headers.map((header) => columnsByName.get(header));

    let validRowCount = 0;
    for (const line of csvBatch) {
      // Skip blank lines; also skip non-string entries so that one bad entry
      // cannot abort the whole batch.
      if (typeof line !== 'string' || !line.trim()) {
        continue;
      }

      try {
        const values = this._parseCSVLine(line, delimiter);

        for (let i = 0; i < targets.length; i++) {
          const target = targets[i];
          if (!target) {
            continue;
          }
          // Short rows: missing trailing cells are treated as empty strings.
          const value = values[i] ?? '';
          target.typedArray[validRowCount] = this._convertStringToTypedValue(value, target.type);
        }

        validRowCount++;
      } catch {
        // Error isolation: drop the problematic line. Because validRowCount
        // was not advanced, any cells already written for it are overwritten
        // by the next valid row or trimmed off below.
        continue;
      }
    }

    // Trim arrays to the actual valid row count.
    for (const [columnName, typedArray] of Object.entries(typedArrays)) {
      if (validRowCount < typedArray.length) {
        // String columns are plain Arrays, which have no subarray(); calling
        // it would throw, so use slice() for them.
        typedArrays[columnName] = Array.isArray(typedArray)
          ? typedArray.slice(0, validRowCount)
          : typedArray.subarray(0, validRowCount);
      }
    }

    return typedArrays;
  }

  /**
   * Parse a single CSV line into values.
   *
   * Double quotes delimit quoted sections in which the delimiter is literal.
   * Inside a quoted section, two consecutive double quotes produce one literal
   * double quote. Every value is trimmed of surrounding whitespace.
   *
   * @param {string} line - CSV line
   * @param {string} delimiter - CSV delimiter
   * @returns {Array<string>} Array of values
   * @private
   */
  _parseCSVLine(line, delimiter) {
    const values = [];
    const chars = []; // Collected characters of the current value
    let inQuotes = false;
    const lineLength = line.length;

    for (let i = 0; i < lineLength; i++) {
      const char = line[i];

      if (char === '"') {
        if (inQuotes && line[i + 1] === '"') {
          // Escaped quote: emit one literal quote and consume both characters.
          chars.push('"');
          i++;
        } else {
          inQuotes = !inQuotes;
        }
      } else if (char === delimiter && !inQuotes) {
        values.push(chars.join('').trim());
        chars.length = 0; // Reuse the buffer for the next value
      } else {
        chars.push(char);
      }
    }

    // The last value has no trailing delimiter.
    values.push(chars.join('').trim());

    return values;
  }

  /**
   * Create an empty array of the appropriate kind and size for an Arrow type.
   *
   * NOTE(review): DateMillisecond columns use an Int32Array, but
   * _convertStringToDate returns epoch milliseconds, which do not fit in
   * 32 bits and will wrap when stored. Confirm the layout the serializer
   * expects before changing this.
   *
   * @param {arrow.DataType} arrowType - Arrow data type
   * @param {number} size - Array size
   * @returns {TypedArray|Array} Typed array, or a plain Array for other types such as strings
   * @private
   */
  _createEmptyTypedArray(arrowType, size) {
    if (arrowType instanceof arrow.Int32) {
      return new Int32Array(size);
    }
    if (arrowType instanceof arrow.Int64) {
      return new BigInt64Array(size);
    }
    if (arrowType instanceof arrow.Float32) {
      return new Float32Array(size);
    }
    if (arrowType instanceof arrow.Float64) {
      return new Float64Array(size);
    }
    if (arrowType instanceof arrow.Bool) {
      return new Uint8Array(size);
    }
    if (arrowType instanceof arrow.DateMillisecond) {
      return new Int32Array(size);
    }
    if (arrowType instanceof arrow.TimestampMillisecond) {
      return new BigInt64Array(size);
    }
    // For strings (and any other type), use a regular array.
    return new Array(size);
  }

  /**
   * Convert a string value to the value stored for the given Arrow type.
   *
   * @param {string} value - String value
   * @param {arrow.DataType} arrowType - Arrow data type
   * @returns {any} Converted value; the input unchanged for string columns
   * @private
   */
  _convertStringToTypedValue(value, arrowType) {
    if (arrowType instanceof arrow.Int32) {
      return this._convertStringToInt32(value);
    }
    if (arrowType instanceof arrow.Int64) {
      return this._convertStringToInt64(value);
    }
    if (arrowType instanceof arrow.Float32) {
      return this._convertStringToFloat32(value);
    }
    if (arrowType instanceof arrow.Float64) {
      return this._convertStringToFloat64(value);
    }
    if (arrowType instanceof arrow.Bool) {
      return this._convertStringToBoolean(value);
    }
    if (arrowType instanceof arrow.DateMillisecond) {
      return this._convertStringToDate(value);
    }
    if (arrowType instanceof arrow.TimestampMillisecond) {
      return this._convertStringToTimestamp(value);
    }
    // For strings, return as-is.
    return value;
  }

  // ===== DIRECT CONVERSION METHODS =====

  /**
   * Convert a string to an Int32 value (base 10).
   * @param {string} value - String value
   * @returns {number} Parsed integer, or 0 when the string is not numeric
   * @private
   */
  _convertStringToInt32(value) {
    const number = Number.parseInt(value, 10);
    return Number.isNaN(number) ? 0 : number;
  }

  /**
   * Convert a string to an Int64 value.
   * @param {string} value - String value
   * @returns {bigint} Parsed BigInt, or 0n when BigInt() rejects the input
   * @private
   */
  _convertStringToInt64(value) {
    try {
      return BigInt(value);
    } catch {
      // BigInt() throws on non-integer text such as '1.5' or 'abc'.
      return BigInt(0);
    }
  }

  /**
   * Convert a string to a Float32 value.
   * @param {string} value - String value
   * @returns {number} Parsed number, or 0 when the string is not numeric
   * @private
   */
  _convertStringToFloat32(value) {
    const number = Number.parseFloat(value);
    return Number.isNaN(number) ? 0.0 : number;
  }

  /**
   * Convert a string to a Float64 value.
   * @param {string} value - String value
   * @returns {number} Parsed number, or 0 when the string is not numeric
   * @private
   */
  _convertStringToFloat64(value) {
    const number = Number.parseFloat(value);
    return Number.isNaN(number) ? 0.0 : number;
  }

  /**
   * Convert a string to a boolean flag.
   * Only the text 'true' (case-insensitive, surrounding whitespace ignored)
   * is treated as true.
   * @param {string} value - String value
   * @returns {number} Boolean as 0 or 1
   * @private
   */
  _convertStringToBoolean(value) {
    return String(value).trim().toLowerCase() === 'true' ? 1 : 0;
  }

  /**
   * Convert a string to a date expressed as milliseconds since the epoch.
   * @param {string} value - String value
   * @returns {number} Milliseconds since the epoch, or 0 when unparseable
   * @private
   */
  _convertStringToDate(value) {
    try {
      const time = new Date(String(value).trim()).getTime();
      return Number.isNaN(time) ? 0 : time;
    } catch {
      return 0;
    }
  }

  /**
   * Convert a string to a timestamp expressed as milliseconds since the epoch.
   * @param {string} value - String value
   * @returns {bigint} Milliseconds since the epoch as BigInt; 0n when the
   *   input is null, undefined, blank, or unparseable
   * @private
   */
  _convertStringToTimestamp(value) {
    if (value === null || value === undefined || value === '') {
      return BigInt(0);
    }

    const strValue = String(value).trim();
    if (strValue === '') {
      return BigInt(0);
    }

    try {
      const time = new Date(strValue).getTime();
      return BigInt(Number.isNaN(time) ? 0 : time);
    } catch {
      return BigInt(0);
    }
  }
}

export default CSVArrowBuilder;