/*
 * @uwdata/flechette
 * Version:
 * Fast, lightweight access to Apache Arrow data.
 * 1,389 lines (1,313 loc) • 191 kB
 * JavaScript
 */
'use strict';
/** Magic number marking the Arrow 'file' format: the ASCII bytes of 'ARROW1'. */
const MAGIC = new Uint8Array([65, 82, 82, 79, 87, 49]);
/** 'End of stream' message bytes: four 0xFF bytes followed by four zero bytes. */
const EOS = new Uint8Array([255, 255, 255, 255, 0, 0, 0, 0]);
/**
* Apache Arrow metadata version codes.
* Each key names a format version and maps to its integer code.
*/
const Version = /** @type {const} */ ({
/** 0.1.0 (October 2016). */
V1: 0,
/** 0.2.0 (February 2017). Non-backwards compatible with V1. */
V2: 1,
/** 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2. */
V3: 2,
/** >= 0.8.0 (December 2017). Non-backwards compatible with V3. */
V4: 3,
/**
* >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
* metadata and IPC messages). Implementations are recommended to provide a
* V4 compatibility mode with V5 format changes disabled.
*
* Incompatible changes between V4 and V5:
* - Union buffer layout has changed.
* In V5, Unions don't have a validity bitmap buffer.
*/
V5: 4
});
/**
* Endianness (byte order) codes for Arrow-encoded data.
*/
const Endianness = /** @type {const} */ ({
/** Little-endian byte order. */
Little: 0,
/** Big-endian byte order. */
Big: 1
});
/**
* Message header type codes.
* Each key names a message kind and maps to its integer code.
*/
const MessageHeader = /** @type {const} */ ({
/** No message header type. */
NONE: 0,
/**
* A Schema describes the columns in a record batch.
*/
Schema: 1,
/**
* For sending dictionary encoding information. Any Field can be
* dictionary-encoded, but in this case none of its children may be
* dictionary-encoded.
* There is one vector / column per dictionary, but that vector / column
* may be spread across multiple dictionary batches by using the isDelta
* flag.
*/
DictionaryBatch: 2,
/**
* A data header describing the shared memory layout of a "record" or "row"
* batch. Some systems call this a "row batch" internally and others a "record
* batch".
*/
RecordBatch: 3,
/**
* EXPERIMENTAL: Metadata for n-dimensional arrays, aka "tensors" or
* "ndarrays". Arrow implementations in general are not required to implement
* this type.
*
* Not currently supported by Flechette.
*/
Tensor: 4,
/**
* EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse
* tensors". Arrow implementations in general are not required to implement
* this type.
*
* Not currently supported by Flechette.
*/
SparseTensor: 5
});
/**
* Field data type ids.
* Only non-negative values ever occur in IPC flatbuffer binary data.
*/
const Type = /** @type {const} */ ({
/**
* Dictionary types compress data by using a set of integer indices to
* look up potentially repeated values in a separate dictionary of values.
*
* This type entry is provided for API convenience; it does not occur
* in actual Arrow IPC binary data.
*/
Dictionary: -1,
/** No data type. Included for flatbuffer compatibility. */
NONE: 0,
/** Null values only. */
Null: 1,
/** Integers, either signed or unsigned, with 8, 16, 32, or 64 bit widths. */
Int: 2,
/** Floating point numbers with 16, 32, or 64 bit precision. */
Float: 3,
/** Opaque binary data. */
Binary: 4,
/** Unicode with UTF-8 encoding. */
Utf8: 5,
/** Booleans, stored compactly in bitmaps with eight values per byte. */
Bool: 6,
/**
* Exact decimal value represented as an integer value in two's complement.
* Integers of 32, 64, 128 (16-byte), or 256 (32-byte) bits may be used.
* The representation uses the endianness indicated in the schema.
*/
Decimal: 7,
/**
* Date is either a 32-bit or 64-bit signed integer type representing an
* elapsed time since UNIX epoch (1970-01-01), stored in either of two units:
* - Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
* leap seconds), where the values are evenly divisible by 86400000
* - Days (32 bits) since the UNIX epoch
*/
Date: 8,
/**
* Time is either a 32-bit or 64-bit signed integer type representing an
* elapsed time since midnight, stored in one of four units: seconds,
* milliseconds, microseconds or nanoseconds.
*
* The integer `bitWidth` depends on the `unit` and must be one of the following:
* - SECOND and MILLISECOND: 32 bits
* - MICROSECOND and NANOSECOND: 64 bits
*
* The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
* (exclusive), adjusted for the time unit (for example, up to 86400000
* exclusive for the MILLISECOND unit).
* This definition doesn't allow for leap seconds. Time values from
* measurements with leap seconds will need to be corrected when ingesting
* into Arrow (for example by replacing the value 86400 with 86399).
*/
Time: 9,
/**
* Timestamp is a 64-bit signed integer representing an elapsed time since a
* fixed epoch, stored in one of four units: seconds, milliseconds,
* microseconds or nanoseconds, and is optionally annotated with a timezone.
*
* Timestamp values do not include any leap seconds (in other words, all
* days are considered 86400 seconds long).
*
* The timezone is an optional string for the name of a timezone, one of:
*
* - As used in the Olson timezone database (the "tz database" or
* "tzdata"), such as "America/New_York".
* - An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
* such as "+07:30".
*
* Whether a timezone string is present indicates different semantics about
* the data.
*/
Timestamp: 10,
/**
* A "calendar" interval which models types that don't necessarily
* have a precise duration without the context of a base timestamp (e.g.
* days can differ in length during daylight saving time transitions).
* All integers in the units below are stored in the endianness indicated
* by the schema.
*
* - YEAR_MONTH - Indicates the number of elapsed whole months, stored as
* 4-byte signed integers.
* - DAY_TIME - Indicates the number of elapsed days and milliseconds (no
* leap seconds), stored as 2 contiguous 32-bit signed integers (8-bytes
* in total). Support of this IntervalUnit is not required for full arrow
* compatibility.
* - MONTH_DAY_NANO - A triple of the number of elapsed months, days, and
* nanoseconds. The values are stored contiguously in 16-byte blocks.
* Months and days are encoded as 32-bit signed integers and nanoseconds
* is encoded as a 64-bit signed integer. Nanoseconds does not allow for
* leap seconds. Each field is independent (e.g. there is no constraint
* that nanoseconds have the same sign as days or that the quantity of
* nanoseconds represents less than a day's worth of time).
*/
Interval: 11,
/**
* List (vector) data supporting variably-sized lists.
* A list has a single child data type for list entries.
*/
List: 12,
/**
* A struct consisting of multiple named child data types.
*/
Struct: 13,
/**
* A union is a complex type with parallel child data types. By default ids
* in the type vector refer to the offsets in the children. Optionally
* typeIds provides an indirection between the child offset and the type id.
* For each child `typeIds[offset]` is the id used in the type vector.
*/
Union: 14,
/**
* Binary data where each entry has the same fixed size.
*/
FixedSizeBinary: 15,
/**
* List (vector) data where every list has the same fixed size.
* A list has a single child data type for list entries.
*/
FixedSizeList: 16,
/**
* A Map is a logical nested type that is represented as
* List<entries: Struct<key: K, value: V>>
*
* In this layout, the keys and values are each respectively contiguous. We do
* not constrain the key and value types, so the application is responsible
* for ensuring that the keys are hashable and unique. Whether the keys are sorted
* may be set in the metadata for this field.
*
* In a field with Map type, the field has a child Struct field, which then
* has two children: the first the key type and the second the value type.
* The names of the child fields may be respectively "entries", "key", and
* "value", but this is not enforced.
*
* Map
* ```text
* - child[0] entries: Struct
* - child[0] key: K
* - child[1] value: V
* ```
* Neither the "entries" field nor the "key" field may be nullable.
*
* The metadata is structured so that Arrow systems without special handling
* for Map can make Map an alias for List. The "layout" attribute for the Map
* field must have the same contents as a List.
*/
Map: 17,
/**
* An absolute length of time unrelated to any calendar artifacts. For the
* purposes of Arrow implementations, adding this value to a Timestamp
* ("t1") naively (i.e. simply summing the two numbers) is acceptable even
* though in some cases the resulting Timestamp (t2) would not account for
* leap-seconds during the elapsed time between "t1" and "t2". Similarly,
* representing the difference between two Unix timestamps is acceptable, but
* would yield a value that is possibly a few seconds off from the true
* elapsed time.
*
* The resolution defaults to millisecond, but can be any of the other
* supported TimeUnit values as with Timestamp and Time types. This type is
* always represented as an 8-byte integer.
*/
Duration: 18,
/**
* Same as Binary, but with 64-bit offsets, allowing representation of
* extremely large data values.
*/
LargeBinary: 19,
/**
* Same as Utf8, but with 64-bit offsets, allowing representation of
* extremely large data values.
*/
LargeUtf8: 20,
/**
* Same as List, but with 64-bit offsets, allowing representation of
* extremely large data values.
*/
LargeList: 21,
/**
* Contains two child arrays, run_ends and values. The run_ends child array
* must be a 16/32/64-bit integer array which encodes the indices at which
* the run with the value in each corresponding index in the values child
* array ends. Like list/struct types, the value array can be of any type.
*/
RunEndEncoded: 22,
/**
* Logically the same as Binary, but the internal representation uses a view
* struct that contains the string length and either the string's entire data
* inline (for small strings) or an inlined prefix, an index of another buffer,
* and an offset pointing to a slice in that buffer (for non-small strings).
*
* Since it uses a variable number of data buffers, each Field with this type
* must have a corresponding entry in `variadicBufferCounts`.
*/
BinaryView: 23,
/**
* Logically the same as Utf8, but the internal representation uses a view
* struct that contains the string length and either the string's entire data
* inline (for small strings) or an inlined prefix, an index of another buffer,
* and an offset pointing to a slice in that buffer (for non-small strings).
*
* Since it uses a variable number of data buffers, each Field with this type
* must have a corresponding entry in `variadicBufferCounts`.
*/
Utf8View: 24,
/**
* Represents the same logical types that List can, but contains offsets and
* sizes allowing for writes in any order and sharing of child values among
* list values.
*/
ListView: 25,
/**
* Same as ListView, but with 64-bit offsets and sizes, allowing
* representation of extremely large data values.
*/
LargeListView: 26
});
/**
* Floating point number precision codes.
*/
const Precision = /** @type {const} */ ({
/** 16-bit floating point number. */
HALF: 0,
/** 32-bit floating point number. */
SINGLE: 1,
/** 64-bit floating point number. */
DOUBLE: 2
});
/**
* Date unit codes.
*/
const DateUnit = /** @type {const} */ ({
/** Days (as 32 bit int) since the UNIX epoch. */
DAY: 0,
/**
* Milliseconds (as 64 bit int) indicating UNIX time elapsed since the epoch
* (no leap seconds), with values evenly divisible by 86400000.
*/
MILLISECOND: 1
});
/**
* Time unit codes, ordered from coarsest (seconds) to finest (nanoseconds).
*/
const TimeUnit = /** @type {const} */ ({
/** Seconds. */
SECOND: 0,
/** Milliseconds. */
MILLISECOND: 1,
/** Microseconds. */
MICROSECOND: 2,
/** Nanoseconds. */
NANOSECOND: 3
});
/**
* Date/time interval unit codes.
*/
const IntervalUnit = /** @type {const} */ ({
/**
* Indicates the number of elapsed whole months, stored as 4-byte signed
* integers.
*/
YEAR_MONTH: 0,
/**
* Indicates the number of elapsed days and milliseconds (no leap seconds),
* stored as 2 contiguous 32-bit signed integers (8-bytes in total). Support
* of this IntervalUnit is not required for full arrow compatibility.
*/
DAY_TIME: 1,
/**
* A triple of the number of elapsed months, days, and nanoseconds.
* The values are stored contiguously in 16-byte blocks. Months and days are
* encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit
* signed integer. Nanoseconds does not allow for leap seconds. Each field is
* independent (e.g. there is no constraint that nanoseconds have the same
* sign as days or that the quantity of nanoseconds represents less than a
* day's worth of time).
*/
MONTH_DAY_NANO: 2
});
/**
* Union type mode codes.
*/
const UnionMode = /** @type {const} */ ({
/** Sparse union layout with full arrays for each sub-type. */
Sparse: 0,
/** Dense union layout with offsets into value arrays. */
Dense: 1
});
/**
* @import { Int64ArrayConstructor, IntArrayConstructor, IntegerArray, TypedArray } from '../types.js'
*/
// Lower-case aliases for the built-in typed array constructors.
// The 64-bit integer aliases refer to the BigInt-based array types.
const {
  Uint8Array: uint8Array,
  Uint16Array: uint16Array,
  Uint32Array: uint32Array,
  BigUint64Array: uint64Array,
  Int8Array: int8Array,
  Int16Array: int16Array,
  Int32Array: int32Array,
  BigInt64Array: int64Array,
  Float32Array: float32Array,
  Float64Array: float64Array
} = globalThis;
/**
 * Look up the typed array constructor that stores integers of the given
 * bit width and signedness.
 * @param {number} bitWidth The integer size in bits.
 * @param {boolean} signed Flag indicating if the integer is signed.
 * @returns {IntArrayConstructor} The matching typed array constructor.
 */
function intArrayType(bitWidth, signed) {
  // candidate constructors ordered by width: 8, 16, 32, 64 bits
  const candidates = signed
    ? [int8Array, int16Array, int32Array, int64Array]
    : [uint8Array, uint16Array, uint32Array, uint64Array];
  // log2 maps widths 8, 16, 32, 64 to 3, 4, 5, 6; subtract 3 to get 0..3
  return candidates[Math.log2(bitWidth) - 3];
}
/** Common parent constructor shared by all typed array types. */
const TypedArray = Reflect.getPrototypeOf(Int8Array);
/**
 * Test whether a value is an instance of any typed array type.
 * @param {*} value The value to check.
 * @returns {value is TypedArray}
 *  True if value is a typed array, false otherwise.
 */
function isTypedArray(value) {
  return (value instanceof TypedArray);
}
/**
 * Test whether a value is array-like in the sense used by this library:
 * either a standard Array or a typed array.
 * @param {*} value The value to check.
 * @returns {value is (Array | TypedArray)}
 *  True if value is an array, false otherwise.
 */
function isArray(value) {
  if (Array.isArray(value)) {
    return true;
  }
  return isTypedArray(value);
}
/**
 * Test whether a value is one of the 64-bit integer array constructors,
 * BigInt64Array or BigUint64Array.
 * @param {*} value The value to check.
 * @returns {value is Int64ArrayConstructor}
 *  True if value is a 64-bit array type, false otherwise.
 */
function isInt64ArrayType(value) {
  if (value === int64Array) {
    return true;
  }
  return value === uint64Array;
}
/**
 * Binary search an offsets array for a full column row index. Returns the
 * position of the first entry that is strictly greater than the index (or
 * the array length if there is none). Assumes offset indices can be
 * manipulated as 32-bit signed integers.
 * @param {IntegerArray} offsets The offsets array.
 * @param {number} index The full column row index.
 * @returns {number} The position of the first offset greater than index.
 */
function bisect(offsets, index) {
  let lo = 0;
  let hi = offsets.length;
  if (hi > 2147483648) { // 2 ** 31
    // slow path: array length exceeds the 32-bit signed integer range,
    // so compute the midpoint with division and truncation
    do {
      const mid = Math.trunc((lo + hi) / 2);
      if (offsets[mid] <= index) {
        lo = mid + 1;
      } else {
        hi = mid;
      }
    } while (lo < hi);
    return lo;
  }
  // fast path: array length fits within a 32-bit signed integer,
  // so an unsigned bit shift can halve the sum
  do {
    const mid = (lo + hi) >>> 1;
    if (offsets[mid] <= index) {
      lo = mid + 1;
    } else {
      hi = mid;
    }
  } while (lo < hi);
  return lo;
}
/**
 * Round an element count up so that its byte size is a multiple of
 * 8 bytes (64 bits).
 * @param {number} length The starting size, in elements.
 * @param {number} bpe Bytes per element.
 * @returns {number} The aligned size, in elements.
 */
function align64(length, bpe = 1) {
  const byteLength = length * bpe;
  // round up to the next multiple of 8 by clearing the low three bits
  const alignedBytes = (byteLength + 7) & ~7;
  return alignedBytes / bpe;
}
/**
 * Produce a version of the array whose length is the 64-bit aligned size
 * for the given element count: a subarray view when the array is longer,
 * a resized copy when it is shorter, or the input itself when it already
 * matches.
 * @template {TypedArray} T
 * @param {T} array The array.
 * @param {number} length The current array length.
 * @returns {T} The aligned array.
 */
function align(array, length = array.length) {
  const target = align64(length, array.BYTES_PER_ELEMENT);
  if (array.length > target) {
    return /** @type {T} */ (array.subarray(0, target));
  }
  if (array.length < target) {
    return resize(array, target);
  }
  return array;
}
/**
 * Resize a typed array to exactly the specified length. A new array of the
 * same type is allocated and the old contents are copied into it.
 * @template {TypedArray} T
 * @param {T} array The array.
 * @param {number} newLength The new length.
 * @param {number} [offset] The offset at which to copy the old array.
 * @returns {T} The resized array.
 * @throws {RangeError} If the old contents do not fit at the given offset.
 */
function resize(array, newLength, offset = 0) {
  // @ts-ignore
  const newArray = new array.constructor(newLength);
  newArray.set(array, offset);
  return newArray;
}
/**
 * Grow a typed array to accommodate a minimum index. The array size is
 * doubled until it exceeds the minimum index. An empty array is first
 * grown to a single element, as doubling a zero length makes no progress.
 * The input array is returned unchanged if it is already large enough.
 * @template {TypedArray} T
 * @param {T} array The array.
 * @param {number} index The minimum index.
 * @param {boolean} [shift] Flag to shift copied bytes to back of array.
 * @returns {T} The resized array.
 */
function grow(array, index, shift) {
  let result = array;
  while (result.length <= index) {
    const oldLength = result.length;
    // Multiply rather than bit-shift: `<< 1` yields 0 for an empty array
    // (looping forever) and wraps to a negative value at 2 ** 30 elements.
    const newLength = Math.max(1, oldLength * 2);
    result = resize(result, newLength, shift ? oldLength : 0);
  }
  return result;
}
/**
* Check if a value is a Date instance, using an `instanceof` test.
* @param {*} value The value to check.
* @returns {value is Date} True if value is a Date, false otherwise.
*/
function isDate(value) {
return value instanceof Date;
}
/**
 * Check if a value is iterable, meaning it exposes a `Symbol.iterator`
 * method. Safe to call with any value, including `null` and `undefined`.
 * @param {*} value The value to check.
 * @returns {value is Iterable} True if value is iterable, false otherwise.
 */
function isIterable(value) {
  // optional chaining avoids a TypeError on null and undefined inputs
  return typeof value?.[Symbol.iterator] === 'function';
}
/**
 * Validate a value against a test function. A passing value is returned
 * as-is; a failing value raises an error whose text comes from the
 * message generator.
 * @template T
 * @param {T} value The value to check.
 * @param {(value: T) => boolean} test The test function.
 * @param {(value: *) => string} message Message generator.
 * @returns {T} The input value.
 * @throws if the value does not pass the test
 */
function check(value, test, message) {
  if (!test(value)) {
    throw new Error(message(value));
  }
  return value;
}
/**
 * Validate that a value is a member of a set of allowed values. The set
 * may be given as an array, or as an object whose property values are
 * the allowed values. A passing value is returned as-is.
 * @template T
 * @param {T} value The value to check.
 * @param {T[] | Record<string,T>} set The set of valid values.
 * @param {(value: *) => string} [message] Message generator.
 * @returns {T} The input value.
 * @throws if the value is not included in the set
 */
function checkOneOf(value, set, message) {
  const allowed = Array.isArray(set) ? set : Object.values(set);
  const isAllowed = (candidate) => allowed.includes(candidate);
  const describe = message ?? (() => `${value} must be one of ${allowed}`);
  return check(value, isAllowed, describe);
}
/**
 * Reverse-lookup a key by value: find the first own enumerable property
 * of the object whose value is strictly equal to the given value.
 * @param {Record<string,any>} object The object to search.
 * @param {any} value The value to lookup.
 * @returns {string} The first matching key, or '<Unknown>' if not found.
 */
function keyFor(object, value) {
  const match = Object.entries(object).find(([, val]) => val === value);
  return match ? match[0] : '<Unknown>';
}
/**
* @import { BinaryType, BinaryViewType, BoolType, DataType, DateType, DateUnit_, DecimalType, DictionaryType, DurationType, Field, FixedSizeBinaryType, FixedSizeListType, FloatType, IntBitWidth, IntervalType, IntervalUnit_, IntType, LargeBinaryType, LargeListType, LargeListViewType, LargeUtf8Type, ListType, ListViewType, MapType, NullType, Precision_, RunEndEncodedType, StructType, TimestampType, TimeType, TimeUnit_, UnionMode_, UnionType, Utf8Type, Utf8ViewType } from './types.js'
*/
/**
* @typedef {Field | DataType} FieldInput
*/
/**
 * Build the error message for an unsupported data type id.
 * @param {number} typeId The data type id.
 * @returns {string} The error message, including the type name and id.
 */
const invalidDataType = (typeId) => {
  const typeName = keyFor(Type, typeId);
  return `Unsupported data type: "${typeName}" (id ${typeId})`;
};
/**
 * Create a field for use in a schema or type definition. A field bundles
 * a name, a data type, a nullability flag, and optional metadata. Fields
 * describe the child types of nested types like List, Struct, and Union.
 * @param {string} name The field name.
 * @param {DataType} type The field data type.
 * @param {boolean} [nullable=true] Flag indicating if the field is nullable
 *  (default `true`).
 * @param {Map<string,string>|null} [metadata=null] Custom field metadata
 *  annotations (default `null`).
 * @returns {Field} The field instance.
 */
const field = (name, type, nullable = true, metadata = null) => {
  return { name, type, nullable, metadata };
};
/**
 * Checks if a value is a field instance: an object with its own `name`
 * property and a `type` property that is a data type. Safe to call with
 * any value, including `null` and `undefined`.
 * @param {any} value The value to check.
 * @returns {value is Field} True if value is a field, false otherwise.
 */
function isField(value) {
  // Object.hasOwn throws a TypeError for null and undefined, so rule those
  // out first and report them as "not a field" instead.
  return value != null
    && Object.hasOwn(value, 'name')
    && isDataType(value.type);
}
/**
 * Checks if a value is a data type instance, identified by a numeric
 * `typeId` property.
 * @param {any} value The value to check.
 * @returns {value is DataType} True if value is a data type, false otherwise.
 */
function isDataType(value) {
  if (value == null) {
    return false;
  }
  return typeof value.typeId === 'number';
}
/**
 * Coerce a field or data type input to a field. An existing field is
 * returned unchanged; a data type is wrapped in a new field using the
 * provided defaults.
 * @param {FieldInput} value
 *  The value to map to a field.
 * @param {string} [defaultName] The default field name.
 * @param {boolean} [defaultNullable=true] The default nullable value.
 * @returns {Field} The field instance.
 * @throws if the value is neither a field nor a data type
 */
function asField(value, defaultName = '', defaultNullable = true) {
  if (isField(value)) {
    return value;
  }
  const type = check(value, isDataType, () => `Data type expected.`);
  return field(defaultName, type, defaultNullable);
}
/////
/**
 * Create a minimal data type object carrying nothing but its type id.
 * @template {typeof Type[keyof typeof Type]} T
 * @param {T} typeId The type id.
 */
const basicType = (typeId) => {
  return { typeId };
};
/**
 * Create a Dictionary data type. A dictionary type pairs a dictionary of
 * values (of any type) with integer indices that reference those values.
 * When values repeat, dictionary encoding can provide substantial space
 * savings. In the IPC format, dictionary indices reside alongside other
 * columns in a record batch, while dictionary values are written to special
 * dictionary batches, linked by a unique dictionary *id*.
 * @param {DataType} type The data type of dictionary
 *  values.
 * @param {IntType} [indexType] The data type of
 *  dictionary indices. Must be an integer type (default `int32`).
 * @param {boolean} [ordered=false] Indicates if dictionary values are
 *  ordered (default `false`).
 * @param {number} [id=-1] The dictionary id. The default value (-1) indicates
 *  the dictionary applies to a single column only. Provide an explicit id in
 *  order to reuse a dictionary across columns when building, in which case
 *  different dictionaries *must* have different unique ids. All dictionary
 *  ids are later resolved (possibly to new values) upon IPC encoding.
 * @returns {DictionaryType}
 */
const dictionary = (type, indexType, ordered = false, id = -1) => {
  // fall back to 32-bit signed integer indices when none are given
  const indices = indexType || int32();
  return {
    typeId: Type.Dictionary,
    id,
    dictionary: type,
    indices,
    ordered
  };
};
/**
* Return a Null data type instance. Null data requires no storage and all
* extracted values are `null`. The returned type carries only its type id.
* @returns {NullType} The null data type.
*/
const nullType = () => basicType(Type.Null);
/**
 * Create an Int data type for signed or unsigned integers of a given width.
 * The result also records the typed array constructor used to store values.
 * @param {IntBitWidth} [bitWidth=32] The integer bit width.
 *  One of `8`, `16`, `32` (default), or `64`.
 * @param {boolean} [signed=true] Flag for signed or unsigned integers
 *  (default `true`).
 * @returns {IntType} The integer data type.
 * @throws if the bit width is not one of the supported values
 */
const int = (bitWidth = 32, signed = true) => {
  // validate first, so an unsupported width fails before the array lookup
  const checkedWidth = checkOneOf(bitWidth, [8, 16, 32, 64]);
  const values = intArrayType(bitWidth, signed);
  return {
    typeId: Type.Int,
    bitWidth: checkedWidth,
    signed,
    values
  };
};
/**
 * Create an Int data type for signed 8-bit integers.
 * @returns {IntType} The integer data type.
 */
const int8 = () => int(8, true);
/**
 * Create an Int data type for signed 16-bit integers.
 * @returns {IntType} The integer data type.
 */
const int16 = () => int(16, true);
/**
 * Create an Int data type for signed 32-bit integers.
 * @returns {IntType} The integer data type.
 */
const int32 = () => int(32, true);
/**
 * Create an Int data type for signed 64-bit integers.
 * @returns {IntType} The integer data type.
 */
const int64 = () => int(64, true);
/**
 * Create an Int data type for unsigned 8-bit integers.
 * @returns {IntType} The integer data type.
 */
const uint8 = () => int(8, false);
/**
 * Create an Int data type for unsigned 16-bit integers.
 * @returns {IntType} The integer data type.
 */
const uint16 = () => int(16, false);
/**
 * Create an Int data type for unsigned 32-bit integers.
 * @returns {IntType} The integer data type.
 */
const uint32 = () => int(32, false);
/**
 * Create an Int data type for unsigned 64-bit integers.
 * @returns {IntType} The integer data type.
 */
const uint64 = () => int(64, false);
/**
 * Create a Float data type for floating point numbers. The result also
 * records the typed array constructor used to store values; half-precision
 * values are stored in a Uint16Array.
 * @param {Precision_} [precision=2] The floating point
 *  precision. One of `Precision.HALF` (16-bit), `Precision.SINGLE` (32-bit)
 *  or `Precision.DOUBLE` (64-bit, default).
 * @returns {FloatType} The floating point data type.
 * @throws if the precision is not a valid Precision value
 */
const float = (precision = 2) => {
  const checkedPrecision = checkOneOf(precision, Precision);
  // storage arrays indexed by precision code: HALF, SINGLE, DOUBLE
  const storage = [uint16Array, float32Array, float64Array];
  return {
    typeId: Type.Float,
    precision: checkedPrecision,
    values: storage[precision]
  };
};
/**
* Return a Float data type instance for half-precision (16 bit) numbers.
* Shorthand for `float(Precision.HALF)`.
* @returns {FloatType} The floating point data type.
*/
const float16 = () => float(Precision.HALF);
/**
* Return a Float data type instance for single-precision (32 bit) numbers.
* Shorthand for `float(Precision.SINGLE)`.
* @returns {FloatType} The floating point data type.
*/
const float32 = () => float(Precision.SINGLE);
/**
* Return a Float data type instance for double-precision (64 bit) numbers.
* Shorthand for `float(Precision.DOUBLE)`.
* @returns {FloatType} The floating point data type.
*/
const float64 = () => float(Precision.DOUBLE);
/**
* Return a Binary data type instance for variably-sized opaque binary data
* with 32-bit offsets (the offsets array type is Int32Array).
* @returns {BinaryType} The binary data type.
*/
const binary = () => ({
typeId: Type.Binary,
offsets: int32Array
});
/**
* Return a Utf8 data type instance for Unicode string data with 32-bit
* offsets (the offsets array type is Int32Array).
* [UTF-8](https://en.wikipedia.org/wiki/UTF-8) code points are stored as
* binary data.
* @returns {Utf8Type} The utf8 data type.
*/
const utf8 = () => ({
typeId: Type.Utf8,
offsets: int32Array
});
/**
* Return a Bool data type instance. Bool values are stored compactly in
* bitmaps with eight values per byte. The returned type carries only its
* type id.
* @returns {BoolType} The bool data type.
*/
const bool = () => basicType(Type.Bool);
/**
 * Create a Decimal data type. Decimal values are represented as 32, 64,
 * 128, or 256 bit integers in two's complement. Decimals are fixed point
 * numbers with a set *precision* (total number of decimal digits) and
 * *scale* (number of fractional digits). For example, the number `35.42`
 * can be represented as `3542` with *precision* ≥ 4 and *scale* = 2.
 * @param {number} precision The decimal precision: the total number of
 *  decimal digits that can be represented.
 * @param {number} scale The number of fractional digits, beyond the
 *  decimal point.
 * @param {32 | 64 | 128 | 256} [bitWidth] The decimal bit width.
 *  One of 32, 64, 128 (default), or 256.
 * @returns {DecimalType} The decimal data type.
 * @throws if the bit width is not one of the supported values
 */
const decimal = (precision, scale, bitWidth = 128) => {
  const checkedWidth = checkOneOf(bitWidth, [32, 64, 128, 256]);
  // 32-bit decimals use Int32Array storage; all wider decimals are
  // stored in a BigUint64Array
  const values = bitWidth === 32 ? int32Array : uint64Array;
  return {
    typeId: Type.Decimal,
    precision,
    scale,
    bitWidth: checkedWidth,
    values
  };
};
/**
* Return a Decimal data type instance with a bit width of 32.
* @param {number} precision The decimal precision: the total number of
* decimal digits that can be represented.
* @param {number} scale The number of fractional digits, beyond the
* decimal point.
* @returns {DecimalType} The decimal data type.
*/
const decimal32 = (precision, scale) => decimal(precision, scale, 32);
/**
* Return a Decimal data type instance with a bit width of 64.
* @param {number} precision The decimal precision: the total number of
* decimal digits that can be represented.
* @param {number} scale The number of fractional digits, beyond the
* decimal point.
* @returns {DecimalType} The decimal data type.
*/
const decimal64 = (precision, scale) => decimal(precision, scale, 64);
/**
* Return a Decimal data type instance with a bit width of 128.
* @param {number} precision The decimal precision: the total number of
* decimal digits that can be represented.
* @param {number} scale The number of fractional digits, beyond the
* decimal point.
* @returns {DecimalType} The decimal data type.
*/
const decimal128 = (precision, scale) => decimal(precision, scale, 128);
/**
* Return a Decimal data type instance with a bit width of 256.
* @param {number} precision The decimal precision: the total number of
* decimal digits that can be represented.
* @param {number} scale The number of fractional digits, beyond the
* decimal point.
* @returns {DecimalType} The decimal data type.
*/
const decimal256 = (precision, scale) => decimal(precision, scale, 256);
/**
 * Create a Date data type. Date values are 32-bit or 64-bit signed
 * integers representing elapsed time since the UNIX epoch (Jan 1, 1970 UTC),
 * either in units of days (32 bits) or milliseconds (64 bits, with values
 * evenly divisible by 86400000).
 * @param {DateUnit_} unit The date unit.
 *  One of `DateUnit.DAY` or `DateUnit.MILLISECOND`.
 * @returns {DateType} The date data type.
 * @throws if the unit is not a valid DateUnit value
 */
const date = (unit) => {
  const checkedUnit = checkOneOf(unit, DateUnit);
  // days are stored as 32-bit integers, milliseconds as 64-bit integers
  const values = unit === DateUnit.DAY ? int32Array : int64Array;
  return {
    typeId: Type.Date,
    unit: checkedUnit,
    values
  };
};
/**
* Return a Date data type instance with units of days.
* Shorthand for `date(DateUnit.DAY)`.
* @returns {DateType} The date data type.
*/
const dateDay = () => date(DateUnit.DAY);
/**
* Return a Date data type instance with units of milliseconds.
* Shorthand for `date(DateUnit.MILLISECOND)`.
* @returns {DateType} The date data type.
*/
const dateMillisecond = () => date(DateUnit.MILLISECOND);
/**
 * Create a Time data type, stored in one of four *unit*s: seconds,
 * milliseconds, microseconds or nanoseconds. The integer *bitWidth* depends
 * on the *unit* and must be 32 bits for seconds and milliseconds or 64 bits
 * for microseconds and nanoseconds. The allowed values are between 0
 * (inclusive) and 86400 (=24*60*60) seconds (exclusive), adjusted for the
 * time unit (for example, up to 86400000 exclusive for the
 * `TimeUnit.MILLISECOND` unit).
 *
 * This definition doesn't allow for leap seconds. Time values from
 * measurements with leap seconds will need to be corrected when ingesting
 * into Arrow (for example by replacing the value 86400 with 86399).
 *
 * The unit and bit width are validated independently; this function does
 * not check that they form one of the pairings described above.
 * @param {TimeUnit_} unit The time unit.
 *  One of `TimeUnit.SECOND`, `TimeUnit.MILLISECOND` (default),
 *  `TimeUnit.MICROSECOND`, or `TimeUnit.NANOSECOND`.
 * @param {32 | 64} bitWidth The time bit width. One of `32` (default, for
 *  seconds and milliseconds) or `64` (for microseconds and nanoseconds).
 * @returns {TimeType} The time data type.
 * @throws if the unit or bit width is not a supported value
 */
const time = (unit = TimeUnit.MILLISECOND, bitWidth = 32) => {
  const checkedUnit = checkOneOf(unit, TimeUnit);
  const checkedWidth = checkOneOf(bitWidth, [32, 64]);
  const values = bitWidth === 32 ? int32Array : int64Array;
  return {
    typeId: Type.Time,
    unit: checkedUnit,
    bitWidth: checkedWidth,
    values
  };
};
/**
* Return a Time data type instance, represented as seconds
* with a 32-bit width.
* @returns {TimeType} The time data type.
*/
const timeSecond = () => time(TimeUnit.SECOND, 32);
/**
* Return a Time data type instance, represented as milliseconds
* with a 32-bit width.
* @returns {TimeType} The time data type.
*/
const timeMillisecond = () => time(TimeUnit.MILLISECOND, 32);
/**
* Return a Time data type instance, represented as microseconds
* with a 64-bit width.
* @returns {TimeType} The time data type.
*/
const timeMicrosecond = () => time(TimeUnit.MICROSECOND, 64);
/**
* Return a Time data type instance, represented as nanoseconds
* with a 64-bit width.
* @returns {TimeType} The time data type.
*/
const timeNanosecond = () => time(TimeUnit.NANOSECOND, 64);
/**
 * Create a Timestamp data type. Timestamp values are 64-bit signed
 * integers representing an elapsed time since a fixed epoch, stored in one
 * of four units: seconds, milliseconds, microseconds or nanoseconds, and are
 * optionally annotated with a timezone. Timestamp values do not include any
 * leap seconds (in other words, all days are considered 86400 seconds long).
 * @param {TimeUnit_} [unit] The time unit.
 *  One of `TimeUnit.SECOND`, `TimeUnit.MILLISECOND` (default),
 *  `TimeUnit.MICROSECOND`, or `TimeUnit.NANOSECOND`.
 * @param {string|null} [timezone=null] An optional string for the name of a
 *  timezone. If provided, the value should either be a string as used in the
 *  Olson timezone database (the "tz database" or "tzdata"), such as
 *  "America/New_York", or an absolute timezone offset of the form "+XX:XX" or
 *  "-XX:XX", such as "+07:30". Whether a timezone string is present indicates
 *  different semantics about the data.
 * @returns {TimestampType} The timestamp data type.
 * @throws if the unit is not a valid TimeUnit value
 */
const timestamp = (unit = TimeUnit.MILLISECOND, timezone = null) => {
  const checkedUnit = checkOneOf(unit, TimeUnit);
  return {
    typeId: Type.Timestamp,
    unit: checkedUnit,
    timezone,
    values: int64Array
  };
};
/**
 * Create an Interval data type. Values represent calendar intervals stored
 * as integers for each date part. The supported *unit*s are year/month,
 * day/time, and month/day/nanosecond intervals.
 *
 * `IntervalUnit.YEAR_MONTH` indicates the number of elapsed whole months,
 * stored as 32-bit signed integers.
 *
 * `IntervalUnit.DAY_TIME` indicates the number of elapsed days and
 * milliseconds (no leap seconds), stored as 2 contiguous 32-bit signed
 * integers (8 bytes in total).
 *
 * `IntervalUnit.MONTH_DAY_NANO` is a triple of the number of elapsed months,
 * days, and nanoseconds. The values are stored contiguously in 16-byte
 * blocks. Months and days are encoded as 32-bit signed integers and
 * nanoseconds as a 64-bit signed integer. Nanoseconds do not allow for leap
 * seconds. Each field is independent (e.g. there is no constraint that
 * nanoseconds have the same sign as days or that the quantity of
 * nanoseconds represents less than a day's worth of time).
 * @param {IntervalUnit_} [unit] The interval unit.
 *  One of `IntervalUnit.YEAR_MONTH`, `IntervalUnit.DAY_TIME`, or
 *  `IntervalUnit.MONTH_DAY_NANO` (default).
 * @returns {IntervalType} The interval data type.
 */
const interval = (unit = IntervalUnit.MONTH_DAY_NANO) => {
  const type = {
    typeId: Type.Interval,
    unit: checkOneOf(unit, IntervalUnit),
    // month/day/nano intervals carry no values array type
    values: undefined
  };
  if (unit !== IntervalUnit.MONTH_DAY_NANO) {
    type.values = int32Array;
  }
  return type;
};
/**
 * Create a List data type for variably-sized lists (arrays) with 32-bit
 * offsets. A list has exactly one child data type describing its entries.
 * Integer offsets mark the extent of each list within a single child array
 * that holds all list values.
 * @param {FieldInput} child The child (list item) field or data type.
 * @returns {ListType} The list data type.
 */
const list = (child) => {
  const typeId = Type.List;
  const item = asField(child);
  return { typeId, children: [item], offsets: int32Array };
};
/**
 * Create a Struct data type. A struct is made up of multiple named child
 * data types. Struct values are stored as parallel child batches, one per
 * child type, and are extracted to standard JavaScript objects.
 * @param {Field[] | Record<string, DataType>} children
 *  An array of property fields, or an object mapping property names to data
 *  types. If an object, the instantiated fields are assumed to be nullable
 *  and have no metadata.
 * @returns {StructType} The struct data type.
 */
const struct = (children) => {
  const typeId = Type.Struct;
  // an array whose first entry is a field is used directly
  if (Array.isArray(children) && isField(children[0])) {
    return { typeId, children: /** @type {Field[]} */ (children) };
  }
  // otherwise treat the input as name -> type entries and build the fields
  const fields = [];
  for (const [name, type] of Object.entries(children)) {
    fields.push(field(name, type));
  }
  return { typeId, children: fields };
};
/**
 * Create a Union data type. A union is a complex type with parallel
 * *children* data types. Union values are stored in either a sparse
 * (`UnionMode.Sparse`) or dense (`UnionMode.Dense`) layout *mode*. In a
 * sparse layout, child types are stored in parallel arrays of equal length,
 * resulting in many unused, empty values. In a dense layout, child types
 * have variable lengths and an offsets array is used to index the
 * appropriate value.
 *
 * By default, ids in the type vector refer to the index in the children
 * array. Optionally, *typeIds* provide an indirection between the child
 * index and the type id: for each child, `typeIds[index]` is the id used in
 * the type vector. The *typeIdForValue* argument provides a lookup function
 * for mapping input data to the proper child type id, and is required if
 * using builder methods.
 * @param {UnionMode_} mode The union mode.
 *  One of `UnionMode.Sparse` or `UnionMode.Dense`.
 * @param {FieldInput[]} children The children fields or data types.
 *  Types are mapped to nullable fields with no metadata.
 * @param {number[]} [typeIds] Children type ids, in the same order as the
 *  children types. Type ids provide a level of indirection over children
 *  types. If not provided, the children indices are used as the type ids.
 * @param {(value: any, index: number) => number} [typeIdForValue]
 *  A function that takes an arbitrary value and a row index and returns a
 *  corresponding union type id. Required by builder methods.
 * @returns {UnionType} The union data type.
 */
const union = (mode, children, typeIds, typeIdForValue) => {
  // default the type ids to the child indices
  if (typeIds == null) {
    typeIds = children.map((_, index) => index);
  }
  const typeId = Type.Union;
  const checkedMode = checkOneOf(mode, UnionMode);
  // reverse lookup: type id -> index into the children array
  const typeMap = typeIds.reduce((lookup, id, index) => {
    lookup[id] = index;
    return lookup;
  }, {});
  const fields = children.map((child, index) => asField(child, `_${index}`));
  return {
    typeId,
    mode: checkedMode,
    typeIds,
    typeMap,
    children: fields,
    typeIdForValue,
    offsets: int32Array
  };
};
/**
 * Create a FixedSizeBinary data type for opaque binary data in which every
 * entry has the same fixed size.
 * @param {number} stride The fixed size in bytes.
 * @returns {FixedSizeBinaryType} The fixed size binary data type.
 */
const fixedSizeBinary = (stride) => {
  const typeId = Type.FixedSizeBinary;
  return { typeId, stride };
};
/**
 * Create a FixedSizeList data type for list (array) data in which every
 * list has the same fixed size. A list has a single child data type for its
 * entries. Fixed size lists are represented as one child array containing
 * all list values, indexed using the known stride.
 * @param {FieldInput} child The list item data type.
 * @param {number} stride The fixed list size.
 * @returns {FixedSizeListType} The fixed size list data type.
 */
const fixedSizeList = (child, stride) => {
  const typeId = Type.FixedSizeList;
  const item = asField(child);
  return { typeId, stride, children: [item] };
};
/**
 * Internal helper that assembles a Map data type from an already prepared
 * child field.
 * @param {boolean} keysSorted Flag indicating if the map keys are sorted.
 * @param {Field} child The child field; it is used as the type's only child.
 * @returns {MapType} The map data type.
 */
const mapType = (keysSorted, child) => {
  const typeId = Type.Map;
  const children = [child];
  return { typeId, keysSorted, children, offsets: int32Array };
};
/**
 * Create a Map data type representing collections of key-value pairs.
 * A Map is a logical nested type that is represented as a list of key-value
 * structs. The key and value types are not constrained, so the application
 * is responsible for ensuring that the keys are hashable and unique, and
 * that keys are properly sorted if *keysSorted* is `true`.
 * @param {FieldInput} keyField The map key field or data type.
 * @param {FieldInput} valueField The map value field or data type.
 * @param {boolean} [keysSorted=false] Flag indicating if the map keys are
 *  sorted (default `false`).
 * @returns {MapType} The map data type.
 */
const map = (keyField, valueField, keysSorted = false) => {
  const key = asField(keyField, 'key', false);
  const value = asField(valueField, 'value');
  // the single child is an 'entries' field wrapping a key/value struct
  const entries = field('entries', struct([key, value]), false);
  return mapType(keysSorted, entries);
};
/**
 * Create a Duration data type. Durations represent an absolute length of
 * time unrelated to any calendar artifacts. The resolution defaults to
 * millisecond, but can be any of the other `TimeUnit` values. This type is
 * always represented as a 64-bit integer.
 * @param {TimeUnit_} [unit] The time unit (default `TimeUnit.MILLISECOND`).
 * @returns {DurationType} The duration data type.
 */
const duration = (unit = TimeUnit.MILLISECOND) => {
  const typeId = Type.Duration;
  const checkedUnit = checkOneOf(unit, TimeUnit);
  return { typeId, unit: checkedUnit, values: int64Array };
};
/**
 * Create a LargeBinary data type for variably-sized opaque binary data with
 * 64-bit offsets, allowing representation of extremely large data values.
 * @returns {LargeBinaryType} The large binary data type.
 */
const largeBinary = () => {
  const typeId = Type.LargeBinary;
  return { typeId, offsets: int64Array };
};
/**
 * Create a LargeUtf8 data type for Unicode string data of variable length
 * with 64-bit offsets, allowing representation of extremely large data
 * values. [UTF-8](https://en.wikipedia.org/wiki/UTF-8) code points are
 * stored as binary data.
 * @returns {LargeUtf8Type} The large utf8 data type.
 */
const largeUtf8 = () => {
  const typeId = Type.LargeUtf8;
  return { typeId, offsets: int64Array };
};
/**
 * Create a LargeList data type for variably-sized lists (arrays) with
 * 64-bit offsets, allowing representation of extremely large data values.
 * A list has a single child data type for its entries. Integer offsets mark
 * the extent of each list within a single child array that holds all list
 * values.
 * @param {FieldInput} child The child (list item) field or data type.
 * @returns {LargeListType} The large list data type.
 */
const largeList = (child) => {
  const typeId = Type.LargeList;
  const item = asField(child);
  return { typeId, children: [item], offsets: int64Array };
};
/**
 * Create a RunEndEncoded data type, which compresses data by representing
 * consecutive repeated values as a run. This data type uses two child
 * arrays, `run_ends` and `values`. The `run_ends` child array must be a 16,
 * 32, or 64 bit integer array which encodes the indices at which the run
 * with the value in each corresponding index in the values child array
 * ends. Like list and struct types, the `values` array can be of any type.
 * @param {FieldInput} runsField The run-ends field or data type.
 * @param {FieldInput} valuesField The values field or data type.
 * @returns {RunEndEncodedType} The run-end encoded data type.
 */
const runEndEncoded = (runsField, valuesField) => {
  const typeId = Type.RunEndEncoded;
  // predicate and message handed to `check` for the run-ends field
  const hasIntegerType = (field) => field.type.typeId === Type.Int;
  const describeFailure = () => 'Run-ends must have an integer type.';
  const runEnds = check(
    asField(runsField, 'run_ends'),
    hasIntegerType,
    describeFailure
  );
  const values = asField(valuesField, 'values');
  return { typeId, children: [runEnds, values] };
};
/**
 * Create a BinaryView data type. BinaryView data is logically the same as
 * the Binary type, but the internal representation uses a view struct that
 * contains the string length and either the string's entire data inline
 * (for small strings) or an inlined prefix, an index of another buffer, and
 * an offset pointing to a slice in that buffer (for non-small strings).
 *
 * Flechette can encode and decode BinaryView data; however, Flechette does
 * not currently support building BinaryView columns from JavaScript values.
 * @returns {BinaryViewType} The binary view data type.
 */
const binaryView = () => {
  const type = basicType(Type.BinaryView);
  return /** @type {BinaryViewType} */ (type);
};
/**
 * Create a Utf8View data type. Utf8View data is logically the same as the
 * Utf8 type, but the internal representation uses a view struct that
 * contains the string length and either the string's entire data inline
 * (for small strings) or an inlined prefix, an index of another buffer, and
 * an offset pointing to a slice in that buffer (for non-small strings).
 *
 * Flechette can encode and decode Utf8View data; however, Flechette does
 * not currently support building Utf8View columns from JavaScript values.
 * @returns {Utf8ViewType} The utf8 view data type.
 */
const utf8View = () => {
  const type = basicType(Type.Utf8View);
  return /** @type {Utf8ViewType} */ (type);
};
/**
 * Create a ListView data type for variably-sized lists (arrays) with 32-bit
 * offsets. ListView data represents the same logical types that List can,
 * but contains both offsets and sizes, allowing for writes in any order and
 * sharing of child values among list values.
 *
 * Flechette can encode and decode ListView data; however, Flechette does
 * not currently support building ListView columns from JavaScript values.
 * @param {FieldInput} child The child (list item) field or data type.
 * @returns {ListViewType} The list view data type.
 */
const listView = (child) => {
  const typeId = Type.ListView;
  const item = asField(child, 'value');
  return { typeId, children: [item], offsets: int32Array };
};
/**
 * Create a LargeListView data type for variably-sized lists (arrays) with
 * 64-bit offsets, allowing representation of extremely large data values.
 * LargeListView data represents the same logical types that LargeList can,
 * but contains both offsets and sizes, allowing for writes in any order and
 * sharing of child values among list values.
 *
 * Flechette can encode and decode LargeListView data; however, Flechette
 * does not currently support building LargeListView columns from JavaScript
 * values.
 * @param {FieldInput} child The child (list item) field or data type.
 * @returns {LargeListViewType} The large list view data type.
 */
const largeListView = (child) => {
  const typeId = Type.LargeListView;
  const item = asField(child, 'value');
  return { typeId, children: [item], offsets: int64Array };
};
/**
* @import { TimeUnit_, TypedArray } from '../types.js';
*/
// Typed arrays over a shared buffer to aid binary conversion. Every view
// below is constructed over the same underlying buffer (`buf`), so data
// written through one view can be read back through another.
// NOTE(review): float64Array, int64Array, uint32Array, int32Array, and
// uint8Array are defined elsewhere in this file; presumably they alias the
// built-in typed array constructors -- confirm.
const f64 = new float64Array(2);
// the buffer backing f64; all remaining views wrap it
const buf = f64.buffer;
const i64 = new int64Array(buf);
const u32 = new uint32Array(buf);
const i32 = new int32Array(buf);
const u8 = new uint8Array(buf);
/**
 * Identity function: hands back its argument untouched.
 * @template T
 * @param {T} value Any value.
 * @returns {T} The very same value that was passed in.
 */
function identity(value) {
  const unchanged = value;
  return unchanged;
}
/**
 * Coerce a value to a BigInt using the built-in `BigInt` function.
 * @param {*} value The value to coerce.
 * @returns {bigint} The resulting BigInt value.
 */
function toBigInt(value) {
  const coerced = BigInt(value);
  return coerced;
}
/**
 * Select the offset conversion method for the given data type.
 * @param {{ offsets: TypedArray}} type The array type.
 * @returns {Function} `toBigInt` when `isInt64ArrayType(type)` holds,
 *  otherwise `identity`.
 */
function toOffset(type) {
  if (isInt64ArrayType(type)) {
    return toBigInt;
  }
  return identity;
}
/**
 * Return the number of whole days since the epoch for a millisecond
 * timestamp. The day count is floored rather than truncated toward zero, so
 * timestamps before the epoch map to the calendar day that contains them
 * (for example, -1 ms is day -1, not day 0).
 * @param {number} value The millisecond timestamp.
 * @returns {number} The number of days, as a 32-bit integer.
 */
function toDateDay(value) {
  // floor first for correct pre-epoch days, then `| 0` coerces to int32
  // (which also keeps NaN mapping to 0)
  return Math.floor(value / 864e5) | 0;
}
/**
 * Return a timestamp conversion method for the given time unit. The
 * returned method maps a millisecond timestamp to a BigInt expressed in the
 * requested unit.
 *
 * For `TimeUnit.SECOND`, the millisecond value is floored to whole seconds
 * before conversion: `BigInt` throws a RangeError for non-integer numbers,
 * so dividing by 1000 without rounding would fail for any timestamp that is
 * not an exact multiple of one second.
 * @param {TimeUnit_} unit The time unit.
 * @returns {(value: number) => bigint} The conversion method.
 */
function toTimestamp(unit) {
  return unit === TimeUnit.SECOND
      ? (value) => toBigInt(Math.floor(value / 1e3))
    : unit === TimeUnit.MILLISECOND ? toBigInt
    : unit === TimeUnit.MICROSECOND ? (value) => toBigInt(value * 1e3)
    // any other unit is treated as nanoseconds
    : (value) => toBigInt(value * 1e6);
}
/**
 * Write a month/day/nanosecond interval into the shared scratch views and
 * return the byte view over them.
 * @param {Array | Float64Array} interval The interval data, in the order
 *  months, days, nanoseconds.
 * @returns {Uint8Array} A byte buffer with the interval data.
 *  The returned buffer is reused across calls, and so should be
 *  copied to a target buffer immediately.
 */
function toMonthDayNanoBytes(interval) {
  const [months, days, nanos] = interval;
  // months and days go to the first two 32-bit slots
  i32[0] = months;
  i32[1] = days;
  // nanoseconds go to the second 64-bit slot, as a BigInt
  i64[1] = toBigInt(nanos);
  return u8;
}
/**
 * Convert a bigint to a number, refusing values that a number cannot
 * represent precisely. An error is thrown when the input is greater than
 * `Number.MAX_SAFE_INTEGER` or less than `Number.MIN_SAFE_INTEGER`.
 * @param {bigint} value The value to check and possibly convert.
 * @returns {number} The converted number value.
 */
function toNumber(value) {
  const { MAX_SAFE_INTEGER: upper, MIN_SAFE_INTEGER: lower } = Number;
  const outOfRange = value > upper || value < lower;
  if (outOfRange) {
    throw new Error(`BigInt exceeds integer number representation: ${value}`);
  }
  return Number(value);
}
/**
 * Divide one BigInt by another and return the result as a number. The
 * integer quotient and the remainder are converted separately and then
 * combined. The division may involve unsafe integers and a loss of
 * precision.
 * @param {bigint} num The numerator.
 * @param {bigint} div The divisor.
 * @returns {number} The result of the division as a floating point number.
 */
function divide(num, div) {
  const whole = Number(num / div);
  const remainder = Number(num % div);
  return whole + remainder / Number(div);
}
/**
 * Create a 32-bit decimal conversion method for the given decimal scale.
 * @param {number} scale The scale mapping fractional digits to integers.
 * @returns {(value: number|bigint) => number} A conversion method. BigInt
 *  inputs are converted to numbers as-is; all other inputs are multiplied
 *  by the scale and truncated to an integer.
 */
function toDecimal32(scale) {
  return (value) => {
    if (typeof value === 'bigint') {
      return Number(value);
    }
    return Math.trunc(value * scale);
  };
}
/**
* Convert a floating point number or bigint to decimal bytes.
* @param {number|bigint} value The number to encode. If a bigint, we assume
* it already represents the decimal in integer form with the correct scale.
* Otherwise, we assume a float that requires scaled integer conversion.
* @param {BigUint64Array} buf The uint64 array to write to.
* @param {number} offset The starting index offset into the array.
* @param {number} stride The stride of an encoded decimal, in 64-bit steps.
* @param {number} scale The scale mapping fractional digits to integers.
*/
function toDecimal(value, buf, offset, stride, scale) {
const v