/*
 * @flightstream/adapters-csv
 * CSV file adapter for Arrow Flight streaming with automatic schema inference.
 */
import * as arrow from 'apache-arrow';
import { ArrowBuilder } from '@flightstream/core-shared';
/**
* Optimized CSV-Specific Arrow Builder
*
* This class extends the streamlined ArrowBuilder to provide CSV-specific data processing.
* It parses CSV lines and converts each cell directly into per-column typed arrays,
* without building intermediate per-row JavaScript objects.
*
* Key features:
* 1. Direct CSV string to typed-array conversion using per-type converters
* 2. CSV type mapping to Arrow data types
* 3. Batch processing without intermediate row objects
* 4. Direct IPC serialization for streaming
*
* Usage:
* const csvSchema = { id: 'int64', name: 'string', price: 'float64' };
* const builder = new CSVArrowBuilder(csvSchema);
* const vectors = builder.createTypedArraysFromCSVBatch(csvRows, headers, delimiter);
* const serialized = builder.serializeFromArrays(vectors);
*/
export class CSVArrowBuilder extends ArrowBuilder {
  /**
   * @param {Object<string, string>} csvSchema - Map of column name to CSV type name
   *   (e.g. { id: 'int64', name: 'string' }).
   * @param {Object} [options={}] - Passed through unchanged to the ArrowBuilder constructor.
   */
  constructor(csvSchema, options = {}) {
    super(csvSchema, options);
    this.csvSchema = csvSchema;
  }

  // ===== IMPLEMENTATION OF ABSTRACT METHODS =====

  /**
   * Build the Arrow schema from the CSV schema.
   *
   * Reads `this.sourceSchema` (presumably assigned by the ArrowBuilder
   * constructor - confirm against the base class) and stores the result in
   * `this.arrowSchema`. Every field is declared nullable.
   * @override
   */
  _buildArrowSchema() {
    const fields = Object.entries(this.sourceSchema).map(([columnName, csvType]) =>
      arrow.Field.new(columnName, this._mapSourceTypeToArrow(csvType), true) // nullable = true
    );
    this.arrowSchema = new arrow.Schema(fields);
  }

  /**
   * Map a CSV type name to the corresponding Arrow data type.
   *
   * @param {string} csvType - CSV type name (e.g., 'int64', 'string', 'float64')
   * @returns {arrow.DataType} Arrow data type; unrecognised names fall back to Utf8
   * @override
   */
  _mapSourceTypeToArrow(csvType) {
    switch (csvType) {
      case 'boolean':
        return new arrow.Bool();
      case 'int32':
        return new arrow.Int32();
      case 'int64':
        return new arrow.Int64();
      case 'float32':
        return new arrow.Float32();
      case 'float64':
        return new arrow.Float64();
      case 'date':
        return new arrow.DateMillisecond();
      case 'timestamp':
        return new arrow.TimestampMillisecond();
      case 'string':
      default:
        return new arrow.Utf8();
    }
  }

  // ===== OPTIMIZED METHODS =====

  /**
   * Create per-column arrays directly from CSV lines.
   *
   * Each non-blank line is split with `_parseCSVLine` and every cell is
   * converted and written straight into its column's array, so no per-row
   * object is created. Blank lines, non-string entries and lines that raise
   * during conversion are skipped. Missing or unparseable cells are stored as
   * the column type's zero value (0, 0n or ''), not as null. Headers that do
   * not match a schema field are ignored.
   *
   * @param {Array<string>} csvBatch - Array of CSV lines (excluding headers)
   * @param {Array<string>} headers - Column headers, in file order
   * @param {string} delimiter - CSV delimiter character
   * @returns {Object} Object keyed by column name. Columns with a numeric,
   *   boolean, date or timestamp type hold typed arrays; all other columns
   *   hold plain arrays of strings. Each array has one entry per accepted row.
   */
  createTypedArraysFromCSVBatch(csvBatch, headers, delimiter = ',') {
    const typedArrays = {};
    const columnsByName = new Map();

    // Allocate one array per schema field, sized for the worst case (every line valid).
    for (const field of this.arrowSchema.fields) {
      const data = this._createEmptyTypedArray(field.type, csvBatch.length);
      typedArrays[field.name] = data;
      columnsByName.set(field.name, { type: field.type, data });
    }

    // Resolve each header position to its column once, instead of once per cell.
    const targets = headers.map((header) => columnsByName.get(header));

    let validRowCount = 0;
    for (const line of csvBatch) {
      if (typeof line !== 'string' || line.trim() === '') {
        continue; // Skip empty or non-string lines
      }
      try {
        const values = this._parseCSVLine(line, delimiter);
        for (let i = 0; i < targets.length; i++) {
          const target = targets[i];
          if (target === undefined) {
            continue; // Header has no matching schema field
          }
          target.data[validRowCount] = this._convertStringToTypedValue(values[i] ?? '', target.type);
        }
        validRowCount++;
      } catch {
        // Best-effort error isolation: the row counter is not advanced, so the
        // next accepted row overwrites whatever this line partially wrote.
        continue;
      }
    }

    // Trim columns to the accepted row count. String columns are plain Arrays,
    // which have no subarray(), so they are sliced instead.
    if (validRowCount < csvBatch.length) {
      for (const [columnName, data] of Object.entries(typedArrays)) {
        typedArrays[columnName] = Array.isArray(data)
          ? data.slice(0, validRowCount)
          : data.subarray(0, validRowCount);
      }
    }
    return typedArrays;
  }

  /**
   * Parse a single CSV line into trimmed string values.
   *
   * Delimiters inside double-quoted sections are treated as data. Inside a
   * quoted section a doubled quote ("") produces one literal quote character;
   * all other quote characters only toggle the quoted state and are dropped.
   * Every value is whitespace-trimmed, including quoted ones.
   *
   * @param {string} line - CSV line
   * @param {string} delimiter - Single-character CSV delimiter
   * @returns {Array<string>} Array of values (always at least one element)
   * @private
   */
  _parseCSVLine(line, delimiter) {
    const values = [];
    const chars = []; // Collected characters of the value currently being read
    let inQuotes = false;

    for (let i = 0; i < line.length; i++) {
      const char = line[i];
      if (char === '"') {
        if (inQuotes && line[i + 1] === '"') {
          // Escaped quote inside a quoted field: emit one quote, consume both.
          chars.push('"');
          i++;
        } else {
          inQuotes = !inQuotes;
        }
      } else if (char === delimiter && !inQuotes) {
        values.push(chars.join('').trim());
        chars.length = 0; // Reuse the buffer for the next value
      } else {
        chars.push(char);
      }
    }

    // The final value has no trailing delimiter.
    values.push(chars.join('').trim());
    return values;
  }

  /**
   * Create an empty array of the appropriate kind and size for an Arrow type.
   *
   * @param {arrow.DataType} arrowType - Arrow data type
   * @param {number} size - Array size
   * @returns {TypedArray|Array} Zero-filled typed array, or a plain Array for
   *   any type not listed below (strings)
   * @private
   */
  _createEmptyTypedArray(arrowType, size) {
    if (arrowType instanceof arrow.Int32) return new Int32Array(size);
    if (arrowType instanceof arrow.Int64) return new BigInt64Array(size);
    if (arrowType instanceof arrow.Float32) return new Float32Array(size);
    if (arrowType instanceof arrow.Float64) return new Float64Array(size);
    if (arrowType instanceof arrow.Bool) return new Uint8Array(size);
    // NOTE(review): an Int32Array element cannot hold a millisecond epoch value
    // for present-day dates (the stored value wraps to 32 bits). Confirm how
    // ArrowBuilder consumes date columns before changing this storage type.
    if (arrowType instanceof arrow.DateMillisecond) return new Int32Array(size);
    if (arrowType instanceof arrow.TimestampMillisecond) return new BigInt64Array(size);
    // For strings, return regular array
    return new Array(size);
  }

  /**
   * Convert a string value to the representation stored for the given Arrow type.
   *
   * @param {string} value - String value
   * @param {arrow.DataType} arrowType - Arrow data type
   * @returns {any} Converted value; the input itself for any unlisted type (strings)
   * @private
   */
  _convertStringToTypedValue(value, arrowType) {
    if (arrowType instanceof arrow.Int32) return this._convertStringToInt32(value);
    if (arrowType instanceof arrow.Int64) return this._convertStringToInt64(value);
    if (arrowType instanceof arrow.Float32) return this._convertStringToFloat32(value);
    if (arrowType instanceof arrow.Float64) return this._convertStringToFloat64(value);
    if (arrowType instanceof arrow.Bool) return this._convertStringToBoolean(value);
    if (arrowType instanceof arrow.DateMillisecond) return this._convertStringToDate(value);
    if (arrowType instanceof arrow.TimestampMillisecond) return this._convertStringToTimestamp(value);
    // For strings, return as-is
    return value;
  }

  // ===== DIRECT CONVERSION METHODS =====

  /**
   * Convert string to an integer using base-10 parseInt semantics.
   * @param {string} value - String value
   * @returns {number} Parsed integer, or 0 when the string has no leading integer
   * @private
   */
  _convertStringToInt32(value) {
    const parsed = Number.parseInt(value, 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  }

  /**
   * Convert string to a BigInt.
   * @param {string} value - String value
   * @returns {bigint} Parsed value, or 0n when BigInt() rejects the input
   * @private
   */
  _convertStringToInt64(value) {
    try {
      return BigInt(value);
    } catch {
      // BigInt() throws on non-integer text such as '1.5' or 'abc'.
      return BigInt(0);
    }
  }

  /**
   * Convert string to a float destined for a Float32 column.
   * @param {string} value - String value
   * @returns {number} Parsed number, or 0 when unparseable
   * @private
   */
  _convertStringToFloat32(value) {
    const parsed = Number.parseFloat(value);
    return Number.isNaN(parsed) ? 0.0 : parsed;
  }

  /**
   * Convert string to a float destined for a Float64 column.
   * @param {string} value - String value
   * @returns {number} Parsed number, or 0 when unparseable
   * @private
   */
  _convertStringToFloat64(value) {
    const parsed = Number.parseFloat(value);
    return Number.isNaN(parsed) ? 0.0 : parsed;
  }

  /**
   * Convert string to a boolean flag.
   * @param {string} value - String value
   * @returns {number} 1 when the trimmed, lower-cased text is exactly 'true', otherwise 0
   * @private
   */
  _convertStringToBoolean(value) {
    return String(value).trim().toLowerCase() === 'true' ? 1 : 0;
  }

  /**
   * Convert string to a date expressed as milliseconds since the epoch.
   * @param {string} value - String value, parsed with the Date constructor
   * @returns {number} Milliseconds since epoch, or 0 when the text is not a valid date
   * @private
   */
  _convertStringToDate(value) {
    try {
      const millis = new Date(String(value).trim()).getTime();
      return Number.isNaN(millis) ? 0 : millis;
    } catch {
      return 0;
    }
  }

  /**
   * Convert string to a timestamp expressed as milliseconds since the epoch.
   *
   * Uses the same parsing as `_convertStringToDate`, so empty, null, undefined
   * and unparseable inputs all yield 0n.
   * @param {string} value - String value
   * @returns {bigint} Timestamp in milliseconds as BigInt
   * @private
   */
  _convertStringToTimestamp(value) {
    return BigInt(this._convertStringToDate(value));
  }
}
// Default export of the same class that is also exported by name above,
// so both `import { CSVArrowBuilder }` and `import CSVArrowBuilder` work.
export default CSVArrowBuilder;