@uwdata/flechette
Version:
Fast, lightweight access to Apache Arrow data.
679 lines (642 loc) • 26.9 kB
JavaScript
/**
* @import { BinaryType, BinaryViewType, BoolType, DataType, DateType, DateUnit_, DecimalType, DictionaryType, DurationType, Field, FixedSizeBinaryType, FixedSizeListType, FloatType, IntBitWidth, IntervalType, IntervalUnit_, IntType, LargeBinaryType, LargeListType, LargeListViewType, LargeUtf8Type, ListType, ListViewType, MapType, NullType, Precision_, RunEndEncodedType, StructType, TimestampType, TimeType, TimeUnit_, UnionMode_, UnionType, Utf8Type, Utf8ViewType } from './types.js'
*/
import { DateUnit, IntervalUnit, Precision, TimeUnit, Type, UnionMode } from './constants.js';
import { intArrayType, float32Array, float64Array, int32Array, int64Array, uint16Array, uint64Array } from './util/arrays.js';
import { check, checkOneOf, keyFor } from './util/objects.js';
/**
* @typedef {Field | DataType} FieldInput
*/
/**
 * Build the error message used to report an unsupported data type.
 * The message contains the `Type` key looked up for the id via `keyFor`,
 * followed by the raw type id.
 * @param {number} typeId The unsupported type id.
 * @returns {string} The error message text.
 */
export const invalidDataType = (typeId) => {
  const typeName = keyFor(Type, typeId);
  return `Unsupported data type: "${typeName}" (id ${typeId})`;
};
/**
 * Create a new field for use in a schema or type definition. A field bundles
 * a name, a data type, a nullability flag, and optional metadata. Nested
 * types such as List, Struct, and Union use fields to describe their
 * children.
 * @param {string} name The field name.
 * @param {DataType} type The field data type.
 * @param {boolean} [nullable=true] Whether the field may contain nulls
 *  (default `true`).
 * @param {Map<string,string>|null} [metadata=null] Custom metadata
 *  annotations for the field (default `null`).
 * @returns {Field} The field instance.
 */
export const field = (name, type, nullable = true, metadata = null) => {
  const instance = { name, type, nullable, metadata };
  return instance;
};
/**
 * Checks if a value is a field instance: a non-nullish value that has its
 * own `name` property and whose `type` property is a data type.
 * @param {any} value The value to test.
 * @returns {value is Field} `true` if the value is a field, `false`
 *  otherwise (including for `null` and `undefined`).
 */
function isField(value) {
  // Object.hasOwn throws a TypeError for null and undefined, so rule those
  // out first and report them as "not a field" instead of crashing.
  return value != null
    && Object.hasOwn(value, 'name')
    && isDataType(value.type);
}
/**
 * Checks if a value is a data type instance, i.e. whether it carries a
 * numeric `typeId` property.
 * @param {any} value The value to test.
 * @returns {value is DataType} `true` if the value has a numeric `typeId`.
 */
function isDataType(value) {
  if (value == null) {
    return false;
  }
  return typeof value.typeId === 'number';
}
/**
 * Coerce a field or data type input to a field instance. Fields are returned
 * unchanged; data types are wrapped in a new field using the given defaults.
 * Any other input fails the data type check.
 * @param {FieldInput} value The field or data type to map to a field.
 * @param {string} [defaultName] The name to use when wrapping a data type.
 * @param {boolean} [defaultNullable=true] The nullable flag to use when
 *  wrapping a data type.
 * @returns {Field} The field instance.
 */
function asField(value, defaultName = '', defaultNullable = true) {
  if (isField(value)) {
    return value;
  }
  const type = check(value, isDataType, () => `Data type expected.`);
  return field(defaultName, type, defaultNullable);
}
/////
/**
 * Create a minimal data type object that carries nothing but its type id.
 * @template {typeof Type[keyof typeof Type]} T
 * @param {T} typeId The type id.
 */
const basicType = (typeId) => {
  return { typeId };
};
/**
 * Return a Dictionary data type instance. A dictionary type pairs a
 * dictionary of values (of any type) with integer indices that reference
 * those values. When values repeat, dictionary encoding can save substantial
 * space. In the IPC format, the indices live alongside other columns in a
 * record batch, whereas the values are written to special dictionary
 * batches, linked by a unique dictionary *id*.
 * @param {DataType} type The data type of dictionary values.
 * @param {IntType} [indexType] The data type of dictionary indices.
 *  Must be an integer type (default `int32`).
 * @param {boolean} [ordered=false] Indicates if dictionary values are
 *  ordered (default `false`).
 * @param {number} [id=-1] The dictionary id. The default value (-1) indicates
 *  the dictionary applies to a single column only. Provide an explicit id to
 *  reuse a dictionary across columns when building, in which case different
 *  dictionaries *must* have different unique ids. All dictionary ids are
 *  later resolved (possibly to new values) upon IPC encoding.
 * @returns {DictionaryType} The dictionary data type.
 */
export const dictionary = (type, indexType, ordered = false, id = -1) => {
  // Fall back to 32-bit signed integer indices when none are given.
  const indices = indexType || int32();
  return {
    typeId: Type.Dictionary,
    id,
    dictionary: type,
    indices,
    ordered
  };
};
/**
 * Return a Null data type instance. Null data occupies no storage, and every
 * extracted value is `null`.
 * @returns {NullType} The null data type.
 */
export const nullType = () => {
  return basicType(Type.Null);
};
/**
 * Return an Int data type instance.
 * @param {IntBitWidth} [bitWidth=32] The integer bit width.
 *  One of `8`, `16`, `32` (default), or `64`.
 * @param {boolean} [signed=true] Whether the integers are signed
 *  (default `true`) or unsigned.
 * @returns {IntType} The integer data type.
 */
export const int = (bitWidth = 32, signed = true) => {
  // Validate the width before looking up the backing array type.
  const width = checkOneOf(bitWidth, [8, 16, 32, 64]);
  const values = intArrayType(bitWidth, signed);
  return {
    typeId: Type.Int,
    bitWidth: width,
    signed,
    values
  };
};
/**
 * Return an Int data type instance for signed 8-bit integers.
 * @returns {IntType} The integer data type.
 */
export const int8 = () => {
  return int(8);
};
/**
 * Return an Int data type instance for signed 16-bit integers.
 * @returns {IntType} The integer data type.
 */
export const int16 = () => {
  return int(16);
};
/**
 * Return an Int data type instance for signed 32-bit integers.
 * @returns {IntType} The integer data type.
 */
export const int32 = () => {
  return int(32);
};
/**
 * Return an Int data type instance for signed 64-bit integers.
 * @returns {IntType} The integer data type.
 */
export const int64 = () => {
  return int(64);
};
/**
 * Return an Int data type instance for unsigned 8-bit integers.
 * @returns {IntType} The integer data type.
 */
export const uint8 = () => {
  return int(8, false);
};
/**
 * Return an Int data type instance for unsigned 16-bit integers.
 * @returns {IntType} The integer data type.
 */
export const uint16 = () => {
  return int(16, false);
};
/**
 * Return an Int data type instance for unsigned 32-bit integers.
 * @returns {IntType} The integer data type.
 */
export const uint32 = () => {
  return int(32, false);
};
/**
 * Return an Int data type instance for unsigned 64-bit integers.
 * @returns {IntType} The integer data type.
 */
export const uint64 = () => {
  return int(64, false);
};
/**
 * Return a Float data type instance for floating point numbers.
 * @param {Precision_} [precision=2] The floating point precision.
 *  One of `Precision.HALF` (16-bit), `Precision.SINGLE` (32-bit),
 *  or `Precision.DOUBLE` (64-bit, default).
 * @returns {FloatType} The floating point data type.
 */
export const float = (precision = 2) => {
  const checkedPrecision = checkOneOf(precision, Precision);
  // Backing array types, indexed by the precision value.
  const arrayTypes = [uint16Array, float32Array, float64Array];
  return {
    typeId: Type.Float,
    precision: checkedPrecision,
    values: arrayTypes[precision]
  };
};
/**
 * Return a Float data type instance for half-precision (16-bit) numbers.
 * @returns {FloatType} The floating point data type.
 */
export const float16 = () => {
  return float(Precision.HALF);
};
/**
 * Return a Float data type instance for single-precision (32-bit) numbers.
 * @returns {FloatType} The floating point data type.
 */
export const float32 = () => {
  return float(Precision.SINGLE);
};
/**
 * Return a Float data type instance for double-precision (64-bit) numbers.
 * @returns {FloatType} The floating point data type.
 */
export const float64 = () => {
  return float(Precision.DOUBLE);
};
/**
 * Return a Binary data type instance for opaque binary data of variable
 * size, addressed with 32-bit offsets.
 * @returns {BinaryType} The binary data type.
 */
export const binary = () => {
  const offsets = int32Array;
  return { typeId: Type.Binary, offsets };
};
/**
 * Return a Utf8 data type instance for Unicode string data, addressed with
 * 32-bit offsets. [UTF-8](https://en.wikipedia.org/wiki/UTF-8) code points
 * are stored as binary data.
 * @returns {Utf8Type} The utf8 data type.
 */
export const utf8 = () => {
  const offsets = int32Array;
  return { typeId: Type.Utf8, offsets };
};
/**
 * Return a Bool data type instance. Bool values are packed into bitmaps,
 * eight values per byte.
 * @returns {BoolType} The bool data type.
 */
export const bool = () => {
  return basicType(Type.Bool);
};
/**
 * Return a Decimal data type instance. Decimal values are represented as 32,
 * 64, 128, or 256 bit integers in two's complement. Decimals are fixed point
 * numbers with a set *precision* (total number of decimal digits) and
 * *scale* (number of fractional digits). For example, the number `35.42`
 * can be represented as `3542` with *precision* ≥ 4 and *scale* = 2.
 * @param {number} precision The decimal precision: the total number of
 *  decimal digits that can be represented.
 * @param {number} scale The number of fractional digits, beyond the
 *  decimal point.
 * @param {32 | 64 | 128 | 256} [bitWidth] The decimal bit width.
 *  One of 32, 64, 128 (default), or 256.
 * @returns {DecimalType} The decimal data type.
 */
export const decimal = (precision, scale, bitWidth = 128) => {
  const width = checkOneOf(bitWidth, [32, 64, 128, 256]);
  // Only 32-bit decimals use the int32 array type; every wider width
  // uses the uint64 array type.
  const values = bitWidth === 32 ? int32Array : uint64Array;
  return {
    typeId: Type.Decimal,
    precision,
    scale,
    bitWidth: width,
    values
  };
};
/**
 * Return a Decimal data type instance with a bit width of 32.
 * @param {number} precision The decimal precision: the total number of
 *  decimal digits that can be represented.
 * @param {number} scale The number of fractional digits, beyond the
 *  decimal point.
 * @returns {DecimalType} The decimal data type.
 */
export const decimal32 = (precision, scale) => {
  return decimal(precision, scale, 32);
};
/**
 * Return a Decimal data type instance with a bit width of 64.
 * @param {number} precision The decimal precision: the total number of
 *  decimal digits that can be represented.
 * @param {number} scale The number of fractional digits, beyond the
 *  decimal point.
 * @returns {DecimalType} The decimal data type.
 */
export const decimal64 = (precision, scale) => {
  return decimal(precision, scale, 64);
};
/**
 * Return a Decimal data type instance with a bit width of 128.
 * @param {number} precision The decimal precision: the total number of
 *  decimal digits that can be represented.
 * @param {number} scale The number of fractional digits, beyond the
 *  decimal point.
 * @returns {DecimalType} The decimal data type.
 */
export const decimal128 = (precision, scale) => {
  return decimal(precision, scale, 128);
};
/**
 * Return a Decimal data type instance with a bit width of 256.
 * @param {number} precision The decimal precision: the total number of
 *  decimal digits that can be represented.
 * @param {number} scale The number of fractional digits, beyond the
 *  decimal point.
 * @returns {DecimalType} The decimal data type.
 */
export const decimal256 = (precision, scale) => {
  return decimal(precision, scale, 256);
};
/**
 * Return a Date data type instance. Date values are 32-bit or 64-bit signed
 * integers counting elapsed time since the UNIX epoch (Jan 1, 1970 UTC),
 * either in days (32 bits) or in milliseconds (64 bits, with values evenly
 * divisible by 86400000).
 * @param {DateUnit_} unit The date unit.
 *  One of `DateUnit.DAY` or `DateUnit.MILLISECOND`.
 * @returns {DateType} The date data type.
 */
export const date = (unit) => {
  const checkedUnit = checkOneOf(unit, DateUnit);
  // Day-based dates use the int32 array type; otherwise int64 is used.
  const values = unit === DateUnit.DAY ? int32Array : int64Array;
  return {
    typeId: Type.Date,
    unit: checkedUnit,
    values
  };
};
/**
 * Return a Date data type instance measured in days.
 * @returns {DateType} The date data type.
 */
export const dateDay = () => {
  return date(DateUnit.DAY);
};
/**
 * Return a Date data type instance measured in milliseconds.
 * @returns {DateType} The date data type.
 */
export const dateMillisecond = () => {
  return date(DateUnit.MILLISECOND);
};
/**
 * Return a Time data type instance, stored in one of four *unit*s: seconds,
 * milliseconds, microseconds or nanoseconds. The integer *bitWidth* is
 * inferred from the *unit*: 32 bits for seconds and milliseconds, 64 bits
 * for microseconds and nanoseconds. The allowed values are between 0
 * (inclusive) and 86400 (=24*60*60) seconds (exclusive), adjusted for the
 * time unit (for example, up to 86400000 exclusive for the
 * `TimeUnit.MILLISECOND` unit).
 *
 * This definition doesn't allow for leap seconds. Time values from
 * measurements with leap seconds will need to be corrected when ingesting
 * into Arrow (for example by replacing the value 86400 with 86399).
 * @param {TimeUnit_} [unit] The time unit.
 *  One of `TimeUnit.SECOND`, `TimeUnit.MILLISECOND` (default),
 *  `TimeUnit.MICROSECOND`, or `TimeUnit.NANOSECOND`.
 * @returns {TimeType} The time data type.
 */
export const time = (unit = TimeUnit.MILLISECOND) => {
  const checkedUnit = checkOneOf(unit, TimeUnit);
  // Second and millisecond resolutions fit in 32 bits; finer ones need 64.
  const is32Bit = checkedUnit === TimeUnit.SECOND
    || checkedUnit === TimeUnit.MILLISECOND;
  return {
    typeId: Type.Time,
    unit: checkedUnit,
    bitWidth: is32Bit ? 32 : 64,
    values: is32Bit ? int32Array : int64Array
  };
};
/**
 * Return a Time data type instance with a resolution of seconds.
 * @returns {TimeType} The time data type.
 */
export const timeSecond = () => {
  return time(TimeUnit.SECOND);
};
/**
 * Return a Time data type instance with a resolution of milliseconds.
 * @returns {TimeType} The time data type.
 */
export const timeMillisecond = () => {
  return time(TimeUnit.MILLISECOND);
};
/**
 * Return a Time data type instance with a resolution of microseconds.
 * @returns {TimeType} The time data type.
 */
export const timeMicrosecond = () => {
  return time(TimeUnit.MICROSECOND);
};
/**
 * Return a Time data type instance with a resolution of nanoseconds.
 * @returns {TimeType} The time data type.
 */
export const timeNanosecond = () => {
  return time(TimeUnit.NANOSECOND);
};
/**
 * Return a Timestamp data type instance. Timestamp values are 64-bit signed
 * integers representing an elapsed time since a fixed epoch, stored in one
 * of four units: seconds, milliseconds, microseconds or nanoseconds, and are
 * optionally annotated with a timezone. Timestamp values do not include any
 * leap seconds (in other words, all days are considered 86400 seconds long).
 * @param {TimeUnit_} [unit] The time unit.
 *  One of `TimeUnit.SECOND`, `TimeUnit.MILLISECOND` (default),
 *  `TimeUnit.MICROSECOND`, or `TimeUnit.NANOSECOND`.
 * @param {string|null} [timezone=null] An optional string for the name of a
 *  timezone. If provided, the value should either be a string as used in the
 *  Olson timezone database (the "tz database" or "tzdata"), such as
 *  "America/New_York", or an absolute timezone offset of the form "+XX:XX"
 *  or "-XX:XX", such as "+07:30". Whether a timezone string is present
 *  indicates different semantics about the data.
 * @returns {TimestampType} The timestamp data type.
 */
export const timestamp = (unit = TimeUnit.MILLISECOND, timezone = null) => {
  const checkedUnit = checkOneOf(unit, TimeUnit);
  return {
    typeId: Type.Timestamp,
    unit: checkedUnit,
    timezone,
    values: int64Array
  };
};
/**
 * Return an Interval type instance. Values represent calendar intervals
 * stored as integers for each date part. The supported *unit*s are
 * year/month, day/time, and month/day/nanosecond intervals.
 *
 * `IntervalUnit.YEAR_MONTH` indicates the number of elapsed whole months,
 * stored as 32-bit signed integers.
 *
 * `IntervalUnit.DAY_TIME` indicates the number of elapsed days and
 * milliseconds (no leap seconds), stored as 2 contiguous 32-bit signed
 * integers (8 bytes in total).
 *
 * `IntervalUnit.MONTH_DAY_NANO` is a triple of the number of elapsed months,
 * days, and nanoseconds. The values are stored contiguously in 16-byte
 * blocks. Months and days are encoded as 32-bit signed integers and
 * nanoseconds is encoded as a 64-bit signed integer. Nanoseconds does not
 * allow for leap seconds. Each field is independent (e.g. there is no
 * constraint that nanoseconds have the same sign as days or that the
 * quantity of nanoseconds represents less than a day's worth of time).
 * @param {IntervalUnit_} [unit] The interval unit.
 *  One of `IntervalUnit.YEAR_MONTH`, `IntervalUnit.DAY_TIME`, or
 *  `IntervalUnit.MONTH_DAY_NANO` (default).
 * @returns {IntervalType} The interval data type.
 */
export const interval = (unit = IntervalUnit.MONTH_DAY_NANO) => {
  const checkedUnit = checkOneOf(unit, IntervalUnit);
  // The month/day/nano unit is given no values array type; the other
  // units use the int32 array type.
  const values = unit !== IntervalUnit.MONTH_DAY_NANO ? int32Array : undefined;
  return {
    typeId: Type.Interval,
    unit: checkedUnit,
    values
  };
};
/**
 * Return a List data type instance, representing variably-sized lists
 * (arrays) with 32-bit offsets. A list has a single child data type for its
 * entries. Lists are represented using integer offsets that mark list
 * extents within a single child array holding all list values.
 * @param {FieldInput} child The child (list item) field or data type.
 * @returns {ListType} The list data type.
 */
export const list = (child) => {
  const itemField = asField(child);
  return {
    typeId: Type.List,
    children: [itemField],
    offsets: int32Array
  };
};
/**
 * Return a Struct data type instance. A struct consists of multiple named
 * child data types. Struct values are stored as parallel child batches, one
 * per child type, and extracted to standard JavaScript objects.
 * @param {Field[] | Record<string, DataType>} children
 *  An array of property fields, or an object mapping property names to data
 *  types. If an object, the instantiated fields are assumed to be nullable
 *  and have no metadata.
 * @returns {StructType} The struct data type.
 */
export const struct = (children) => {
  // Only the first array element is inspected to decide whether the input
  // is already an array of fields.
  const isFieldArray = Array.isArray(children)
    && children.length > 0
    && isField(children[0]);
  const fields = isFieldArray
    ? /** @type {Field[]} */ (children)
    : Object.entries(children).map((entry) => field(entry[0], entry[1]));
  return { typeId: Type.Struct, children: fields };
};
/**
 * Return a Union type instance. A union is a complex type with parallel
 * *children* data types. Union values are stored in either a sparse
 * (`UnionMode.Sparse`) or dense (`UnionMode.Dense`) layout *mode*. In a
 * sparse layout, child types are stored in parallel arrays with the same
 * lengths, resulting in many unused, empty values. In a dense layout, child
 * types have variable lengths and an offsets array is used to index the
 * appropriate value.
 *
 * By default, ids in the type vector refer to the index in the children
 * array. Optionally, *typeIds* provide an indirection between the child
 * index and the type id. For each child, `typeIds[index]` is the id used
 * in the type vector. The *typeIdForValue* argument provides a lookup
 * function for mapping input data to the proper child type id, and is
 * required if using builder methods.
 * @param {UnionMode_} mode The union mode.
 *  One of `UnionMode.Sparse` or `UnionMode.Dense`.
 * @param {FieldInput[]} children The children fields or data types.
 *  Types are mapped to nullable fields with no metadata.
 * @param {number[]} [typeIds] Children type ids, in the same order as the
 *  children types. Type ids provide a level of indirection over children
 *  types. If not provided, the children indices are used as the type ids.
 * @param {(value: any, index: number) => number} [typeIdForValue]
 *  A function that takes an arbitrary value and a row index and returns a
 *  corresponding union type id. Required by builder methods.
 * @returns {UnionType} The union data type.
 */
export const union = (mode, children, typeIds, typeIdForValue) => {
  // Without explicit ids, each child's index doubles as its type id.
  const ids = typeIds ?? children.map((_, index) => index);
  const checkedMode = checkOneOf(mode, UnionMode);

  // Reverse lookup: type id -> index of the child in the children array.
  const typeMap = {};
  ids.forEach((id, index) => {
    typeMap[id] = index;
  });

  // Children given as bare data types get the positional names _0, _1, ...
  const fields = children.map((child, index) => asField(child, `_${index}`));

  return {
    typeId: Type.Union,
    mode: checkedMode,
    typeIds: ids,
    typeMap,
    children: fields,
    typeIdForValue,
    offsets: int32Array
  };
};
/**
 * Create a FixedSizeBinary data type instance for opaque binary data in
 * which every entry has the same fixed size.
 * @param {number} stride The fixed size in bytes.
 * @returns {FixedSizeBinaryType} The fixed size binary data type.
 */
export const fixedSizeBinary = (stride) => {
  return { typeId: Type.FixedSizeBinary, stride };
};
/**
 * Return a FixedSizeList type instance for list (array) data where every
 * list has the same fixed size. A list has a single child data type for its
 * entries. Fixed size lists are represented as a single child array holding
 * all list values, indexed using the known stride.
 * @param {FieldInput} child The list item field or data type.
 * @param {number} stride The fixed list size.
 * @returns {FixedSizeListType} The fixed size list data type.
 */
export const fixedSizeList = (child, stride) => {
  const itemField = asField(child);
  return {
    typeId: Type.FixedSizeList,
    stride,
    children: [itemField]
  };
};
/**
 * Internal method to create a Map type instance from an already-built
 * child field.
 * @param {boolean} keysSorted Flag indicating if the map keys are sorted.
 * @param {Field} child The single child field of the map type.
 * @returns {MapType} The map data type.
 */
export const mapType = (keysSorted, child) => {
  const children = [child];
  return {
    typeId: Type.Map,
    keysSorted,
    children,
    offsets: int32Array
  };
};
/**
 * Return a Map data type instance representing collections of key-value
 * pairs. A Map is a logical nested type that is represented as a list of
 * key-value structs. The key and value types are not constrained, so the
 * application is responsible for ensuring that the keys are hashable and
 * unique, and that keys are properly sorted if *keysSorted* is `true`.
 * @param {FieldInput} keyField The map key field or data type.
 * @param {FieldInput} valueField The map value field or data type.
 * @param {boolean} [keysSorted=false] Flag indicating if the map keys are
 *  sorted (default `false`).
 * @returns {MapType} The map data type.
 */
export const map = (keyField, valueField, keysSorted = false) => {
  // A key given as a bare data type becomes a non-nullable field.
  const key = asField(keyField, 'key', false);
  const value = asField(valueField, 'value');
  // The key/value struct is wrapped in a non-nullable 'entries' field.
  const entries = field('entries', struct([key, value]), false);
  return mapType(keysSorted, entries);
};
/**
 * Return a Duration data type instance. Durations represent an absolute
 * length of time unrelated to any calendar artifacts. The resolution
 * defaults to millisecond, but can be any of the other `TimeUnit` values.
 * This type is always represented as a 64-bit integer.
 * @param {TimeUnit_} [unit] The time unit (default `TimeUnit.MILLISECOND`).
 * @returns {DurationType} The duration data type.
 */
export const duration = (unit = TimeUnit.MILLISECOND) => {
  const checkedUnit = checkOneOf(unit, TimeUnit);
  return {
    typeId: Type.Duration,
    unit: checkedUnit,
    values: int64Array
  };
};
/**
 * Return a LargeBinary data type instance for opaque binary data of
 * variable size with 64-bit offsets, allowing representation of extremely
 * large data values.
 * @returns {LargeBinaryType} The large binary data type.
 */
export const largeBinary = () => {
  const offsets = int64Array;
  return { typeId: Type.LargeBinary, offsets };
};
/**
 * Return a LargeUtf8 data type instance for Unicode string data of variable
 * length with 64-bit offsets, allowing representation of extremely large
 * data values. [UTF-8](https://en.wikipedia.org/wiki/UTF-8) code points are
 * stored as binary data.
 * @returns {LargeUtf8Type} The large utf8 data type.
 */
export const largeUtf8 = () => {
  const offsets = int64Array;
  return { typeId: Type.LargeUtf8, offsets };
};
/**
 * Return a LargeList data type instance, representing variably-sized lists
 * (arrays) with 64-bit offsets, allowing representation of extremely large
 * data values. A list has a single child data type for its entries. Lists
 * are represented using integer offsets that mark list extents within a
 * single child array holding all list values.
 * @param {FieldInput} child The child (list item) field or data type.
 * @returns {LargeListType} The large list data type.
 */
export const largeList = (child) => {
  const itemField = asField(child);
  return {
    typeId: Type.LargeList,
    children: [itemField],
    offsets: int64Array
  };
};
/**
 * Return a RunEndEncoded data type instance, which compresses data by
 * representing consecutive repeated values as a run. This data type uses two
 * child arrays, `run_ends` and `values`. The `run_ends` child array must be
 * a 16, 32, or 64 bit integer array which encodes the indices at which the
 * run with the value in each corresponding index in the values child array
 * ends. Like list and struct types, the `values` array can be of any type.
 * @param {FieldInput} runsField The run-ends field or data type.
 * @param {FieldInput} valuesField The values field or data type.
 * @returns {RunEndEncodedType} The run-end encoded data type.
 */
export const runEndEncoded = (runsField, valuesField) => {
  // Only the type id is verified here; the bit width is not inspected.
  const runEnds = check(
    asField(runsField, 'run_ends'),
    (candidate) => candidate.type.typeId === Type.Int,
    () => 'Run-ends must have an integer type.'
  );
  const values = asField(valuesField, 'values');
  return {
    typeId: Type.RunEndEncoded,
    children: [runEnds, values]
  };
};
/**
 * Return a BinaryView data type instance. BinaryView data is logically the
 * same as the Binary type, but the internal representation uses a view
 * struct that contains the string length and either the string's entire
 * data inline (for small strings) or an inlined prefix, an index of another
 * buffer, and an offset pointing to a slice in that buffer (for non-small
 * strings).
 *
 * Flechette can encode and decode BinaryView data; however, Flechette does
 * not currently support building BinaryView columns from JavaScript values.
 * @returns {BinaryViewType} The binary view data type.
 */
export const binaryView = () => {
  const type = /** @type {BinaryViewType} */ (basicType(Type.BinaryView));
  return type;
};
/**
 * Return a Utf8View data type instance. Utf8View data is logically the same
 * as the Utf8 type, but the internal representation uses a view struct that
 * contains the string length and either the string's entire data inline
 * (for small strings) or an inlined prefix, an index of another buffer, and
 * an offset pointing to a slice in that buffer (for non-small strings).
 *
 * Flechette can encode and decode Utf8View data; however, Flechette does
 * not currently support building Utf8View columns from JavaScript values.
 * @returns {Utf8ViewType} The utf8 view data type.
 */
export const utf8View = () => {
  const type = /** @type {Utf8ViewType} */ (basicType(Type.Utf8View));
  return type;
};
/**
 * Return a ListView data type instance, representing variably-sized lists
 * (arrays) with 32-bit offsets. ListView data represents the same logical
 * types that List can, but contains both offsets and sizes, allowing for
 * writes in any order and sharing of child values among list values.
 *
 * Flechette can encode and decode ListView data; however, Flechette does not
 * currently support building ListView columns from JavaScript values.
 * @param {FieldInput} child The child (list item) field or data type.
 * @returns {ListViewType} The list view data type.
 */
export const listView = (child) => {
  // A child given as a bare data type is wrapped in a field named 'value'.
  const itemField = asField(child, 'value');
  return {
    typeId: Type.ListView,
    children: [itemField],
    offsets: int32Array
  };
};
/**
 * Return a LargeListView data type instance, representing variably-sized
 * lists (arrays) with 64-bit offsets, allowing representation of extremely
 * large data values. LargeListView data represents the same logical types
 * that LargeList can, but contains both offsets and sizes, allowing for
 * writes in any order and sharing of child values among list values.
 *
 * Flechette can encode and decode LargeListView data; however, Flechette
 * does not currently support building LargeListView columns from JavaScript
 * values.
 * @param {FieldInput} child The child (list item) field or data type.
 * @returns {LargeListViewType} The large list view data type.
 */
export const largeListView = (child) => {
  // A child given as a bare data type is wrapped in a field named 'value'.
  const itemField = asField(child, 'value');
  return {
    typeId: Type.LargeListView,
    children: [itemField],
    offsets: int64Array
  };
};