UNPKG

@uwdata/flechette

Version:

Fast, lightweight access to Apache Arrow data.

github.com/uwdata/flechette

uwdata/flechette

1,403 lines (1,326 loc) • 199 kB

JavaScript

'use strict'; /** Magic bytes 'ARROW1' indicating the Arrow 'file' format. */ const MAGIC = Uint8Array.of(65, 82, 82, 79, 87, 49); /** Bytes for an 'end of stream' message. */ const EOS = Uint8Array.of(255, 255, 255, 255, 0, 0, 0, 0); /** * Apache Arrow version. */ const Version = /** @type {const} */ ({ /** 0.1.0 (October 2016). */ V1: 0, /** 0.2.0 (February 2017). Non-backwards compatible with V1. */ V2: 1, /** 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2. */ V3: 2, /** >= 0.8.0 (December 2017). Non-backwards compatible with V3. */ V4: 3, /** * >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4 * metadata and IPC messages). Implementations are recommended to provide a * V4 compatibility mode with V5 format changes disabled. * * Incompatible changes between V4 and V5: * - Union buffer layout has changed. * In V5, Unions don't have a validity bitmap buffer. */ V5: 4 }); /** * Endianness of Arrow-encoded data. */ const Endianness = /** @type {const} */ ({ Little: 0, Big: 1 }); /** * Message header type codes. */ const MessageHeader = /** @type {const} */ ({ NONE: 0, /** * A Schema describes the columns in a record batch. */ Schema: 1, /** * For sending dictionary encoding information. Any Field can be * dictionary-encoded, but in this case none of its children may be * dictionary-encoded. * There is one vector / column per dictionary, but that vector / column * may be spread across multiple dictionary batches by using the isDelta * flag. */ DictionaryBatch: 2, /** * A data header describing the shared memory layout of a "record" or "row" * batch. Some systems call this a "row batch" internally and others a "record * batch". */ RecordBatch: 3, /** * EXPERIMENTAL: Metadata for n-dimensional arrays, aka "tensors" or * "ndarrays". Arrow implementations in general are not required to implement * this type. * * Not currently supported by Flechette. */ Tensor: 4, /** * EXPERIMENTAL: Metadata for n-dimensional sparse arrays, aka "sparse * tensors". Arrow implementations in general are not required to implement * this type. * * Not currently supported by Flechette. */ SparseTensor: 5 }); /** * Field data type ids. * Only non-negative values ever occur in IPC flatbuffer binary data. */ const Type = /** @type {const} */ ({ /** * Dictionary types compress data by using a set of integer indices to * lookup potentially repeated vales in a separate dictionary of values. * * This type entry is provided for API convenience, it does not occur * in actual Arrow IPC binary data. */ Dictionary: -1, /** No data type. Included for flatbuffer compatibility. */ NONE: 0, /** Null values only. */ Null: 1, /** Integers, either signed or unsigned, with 8, 16, 32, or 64 bit widths. */ Int: 2, /** Floating point numbers with 16, 32, or 64 bit precision. */ Float: 3, /** Opaque binary data. */ Binary: 4, /** Unicode with UTF-8 encoding. */ Utf8: 5, /** Booleans represented as 8 bit bytes. */ Bool: 6, /** * Exact decimal value represented as an integer value in two's complement. * Currently only 128-bit (16-byte) and 256-bit (32-byte) integers are used. * The representation uses the endianness indicated in the schema. */ Decimal: 7, /** * Date is either a 32-bit or 64-bit signed integer type representing an * elapsed time since UNIX epoch (1970-01-01), stored in either of two units: * - Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no * leap seconds), where the values are evenly divisible by 86400000 * - Days (32 bits) since the UNIX epoch */ Date: 8, /** * Time is either a 32-bit or 64-bit signed integer type representing an * elapsed time since midnight, stored in either of four units: seconds, * milliseconds, microseconds or nanoseconds. * * The integer `bitWidth` depends on the `unit` and must be one of the following: * - SECOND and MILLISECOND: 32 bits * - MICROSECOND and NANOSECOND: 64 bits * * The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds * (exclusive), adjusted for the time unit (for example, up to 86400000 * exclusive for the MILLISECOND unit). * This definition doesn't allow for leap seconds. Time values from * measurements with leap seconds will need to be corrected when ingesting * into Arrow (for example by replacing the value 86400 with 86399). */ Time: 9, /** * Timestamp is a 64-bit signed integer representing an elapsed time since a * fixed epoch, stored in either of four units: seconds, milliseconds, * microseconds or nanoseconds, and is optionally annotated with a timezone. * * Timestamp values do not include any leap seconds (in other words, all * days are considered 86400 seconds long). * * The timezone is an optional string for the name of a timezone, one of: * * - As used in the Olson timezone database (the "tz database" or * "tzdata"), such as "America/New_York". * - An absolute timezone offset of the form "+XX:XX" or "-XX:XX", * such as "+07:30". * * Whether a timezone string is present indicates different semantics about * the data. */ Timestamp: 10, /** * A "calendar" interval which models types that don't necessarily * have a precise duration without the context of a base timestamp (e.g. * days can differ in length during day light savings time transitions). * All integers in the units below are stored in the endianness indicated * by the schema. * * - YEAR_MONTH - Indicates the number of elapsed whole months, stored as * 4-byte signed integers. * - DAY_TIME - Indicates the number of elapsed days and milliseconds (no * leap seconds), stored as 2 contiguous 32-bit signed integers (8-bytes * in total). Support of this IntervalUnit is not required for full arrow * compatibility. * - MONTH_DAY_NANO - A triple of the number of elapsed months, days, and * nanoseconds. The values are stored contiguously in 16-byte blocks. * Months and days are encoded as 32-bit signed integers and nanoseconds * is encoded as a 64-bit signed integer. Nanoseconds does not allow for * leap seconds. Each field is independent (e.g. there is no constraint * that nanoseconds have the same sign as days or that the quantity of * nanoseconds represents less than a day's worth of time). */ Interval: 11, /** * List (vector) data supporting variably-sized lists. * A list has a single child data type for list entries. */ List: 12, /** * A struct consisting of multiple named child data types. */ Struct: 13, /** * A union is a complex type with parallel child data types. By default ids * in the type vector refer to the offsets in the children. Optionally * typeIds provides an indirection between the child offset and the type id. * For each child `typeIds[offset]` is the id used in the type vector. */ Union: 14, /** * Binary data where each entry has the same fixed size. */ FixedSizeBinary: 15, /** * List (vector) data where every list has the same fixed size. * A list has a single child data type for list entries. */ FixedSizeList: 16, /** * A Map is a logical nested type that is represented as * List<entries: Struct<key: K, value: V>> * * In this layout, the keys and values are each respectively contiguous. We do * not constrain the key and value types, so the application is responsible * for ensuring that the keys are hashable and unique. Whether the keys are sorted * may be set in the metadata for this field. * * In a field with Map type, the field has a child Struct field, which then * has two children: key type and the second the value type. The names of the * child fields may be respectively "entries", "key", and "value", but this is * not enforced. * * Map * ```text * - child[0] entries: Struct * - child[0] key: K * - child[1] value: V * ``` * Neither the "entries" field nor the "key" field may be nullable. * * The metadata is structured so that Arrow systems without special handling * for Map can make Map an alias for List. The "layout" attribute for the Map * field must have the same contents as a List. */ Map: 17, /** * An absolute length of time unrelated to any calendar artifacts. For the * purposes of Arrow implementations, adding this value to a Timestamp * ("t1") naively (i.e. simply summing the two numbers) is acceptable even * though in some cases the resulting Timestamp (t2) would not account for * leap-seconds during the elapsed time between "t1" and "t2". Similarly, * representing the difference between two Unix timestamp is acceptable, but * would yield a value that is possibly a few seconds off from the true * elapsed time. * * The resolution defaults to millisecond, but can be any of the other * supported TimeUnit values as with Timestamp and Time types. This type is * always represented as an 8-byte integer. */ Duration: 18, /** * Same as Binary, but with 64-bit offsets, allowing representation of * extremely large data values. */ LargeBinary: 19, /** * Same as Utf8, but with 64-bit offsets, allowing representation of * extremely large data values. */ LargeUtf8: 20, /** * Same as List, but with 64-bit offsets, allowing representation of * extremely large data values. */ LargeList: 21, /** * Contains two child arrays, run_ends and values. The run_ends child array * must be a 16/32/64-bit integer array which encodes the indices at which * the run with the value in each corresponding index in the values child * array ends. Like list/struct types, the value array can be of any type. */ RunEndEncoded: 22, /** * Logically the same as Binary, but the internal representation uses a view * struct that contains the string length and either the string's entire data * inline (for small strings) or an inlined prefix, an index of another buffer, * and an offset pointing to a slice in that buffer (for non-small strings). * * Since it uses a variable number of data buffers, each Field with this type * must have a corresponding entry in `variadicBufferCounts`. */ BinaryView: 23, /** * Logically the same as Utf8, but the internal representation uses a view * struct that contains the string length and either the string's entire data * inline (for small strings) or an inlined prefix, an index of another buffer, * and an offset pointing to a slice in that buffer (for non-small strings). * * Since it uses a variable number of data buffers, each Field with this type * must have a corresponding entry in `variadicBufferCounts`. */ Utf8View: 24, /** * Represents the same logical types that List can, but contains offsets and * sizes allowing for writes in any order and sharing of child values among * list values. */ ListView: 25, /** * Same as ListView, but with 64-bit offsets and sizes, allowing to represent * extremely large data values. */ LargeListView: 26 }); /** * Floating point number precision. */ const Precision = /** @type {const} */ ({ /** 16-bit floating point number. */ HALF: 0, /** 32-bit floating point number. */ SINGLE: 1, /** 64-bit floating point number. */ DOUBLE: 2 }); /** * Date units. */ const DateUnit = /** @type {const} */ ({ /* Days (as 32 bit int) since the UNIX epoch. */ DAY: 0, /** * Milliseconds (as 64 bit int) indicating UNIX time elapsed since the epoch * (no leap seconds), with values evenly divisible by 86400000. */ MILLISECOND: 1 }); /** * Time units. */ const TimeUnit = /** @type {const} */ ({ /** Seconds. */ SECOND: 0, /** Milliseconds. */ MILLISECOND: 1, /** Microseconds. */ MICROSECOND: 2, /** Nanoseconds. */ NANOSECOND: 3 }); /** * Date/time interval units. */ const IntervalUnit = /** @type {const} */ ({ /** * Indicates the number of elapsed whole months, stored as 4-byte signed * integers. */ YEAR_MONTH: 0, /** * Indicates the number of elapsed days and milliseconds (no leap seconds), * stored as 2 contiguous 32-bit signed integers (8-bytes in total). Support * of this IntervalUnit is not required for full arrow compatibility. */ DAY_TIME: 1, /** * A triple of the number of elapsed months, days, and nanoseconds. * The values are stored contiguously in 16-byte blocks. Months and days are * encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit * signed integer. Nanoseconds does not allow for leap seconds. Each field is * independent (e.g. there is no constraint that nanoseconds have the same * sign as days or that the quantity of nanoseconds represents less than a * day's worth of time). */ MONTH_DAY_NANO: 2 }); /** * Union type modes. */ const UnionMode = /** @type {const} */ ({ /** Sparse union layout with full arrays for each sub-type. */ Sparse: 0, /** Dense union layout with offsets into value arrays. */ Dense: 1 }); /** * Compression types. */ const CompressionType = /** @type {const} */ ({ /** * LZ4 frame compression. * Not to be confused with "raw" (also called "block") format. */ LZ4_FRAME: 0, /** Zstandard compression. */ ZSTD: 1 }); /** * Body compression methods. * Provided for forward compatibility in case Arrow needs to support * different strategies for compressing the IPC message body (like * whole-body compression rather than buffer-level) in the future. */ const BodyCompressionMethod = /** @type {const} */ ({ /** * Each constituent buffer is first compressed with the indicated * compressor, and then written with the uncompressed length in the first 8 * bytes as a 64-bit little-endian signed integer followed by the compressed * buffer bytes (and then padding as required by the protocol). The * uncompressed length may be set to -1 to indicate that the data that * follows is not compressed, which can be useful for cases where * compression does not yield appreciable savings. */ BUFFER: 0 }); /** * @import { Int64ArrayConstructor, IntArrayConstructor, IntegerArray, TypedArray } from '../types.js' */ const uint8Array = Uint8Array; const uint16Array = Uint16Array; const uint32Array = Uint32Array; const uint64Array = BigUint64Array; const int8Array = Int8Array; const int16Array = Int16Array; const int32Array = Int32Array; const int64Array = BigInt64Array; const float32Array = Float32Array; const float64Array = Float64Array; /** * Check if an input value is an ArrayBuffer or SharedArrayBuffer. * @param {unknown} data * @returns {data is ArrayBufferLike} */ function isArrayBufferLike(data) { return data instanceof ArrayBuffer || ( typeof SharedArrayBuffer !== 'undefined' && data instanceof SharedArrayBuffer ); } /** * Return the appropriate typed array constructor for the given * integer type metadata. * @param {number} bitWidth The integer size in bits. * @param {boolean} signed Flag indicating if the integer is signed. * @returns {IntArrayConstructor} */ function intArrayType(bitWidth, signed) { const i = Math.log2(bitWidth) - 3; return ( signed ? [int8Array, int16Array, int32Array, int64Array] : [uint8Array, uint16Array, uint32Array, uint64Array] )[i]; } /** Shared prototype for typed arrays. */ const TypedArray = Object.getPrototypeOf(Int8Array); /** * Check if a value is a typed array. * @param {*} value The value to check. * @returns {value is TypedArray} * True if value is a typed array, false otherwise. */ function isTypedArray(value) { return value instanceof TypedArray; } /** * Check if a value is either a standard array or typed array. * @param {*} value The value to check. * @returns {value is (Array | TypedArray)} * True if value is an array, false otherwise. */ function isArray(value) { return Array.isArray(value) || isTypedArray(value); } /** * Check if a value is an array type (constructor) for 64-bit integers, * one of BigInt64Array or BigUint64Array. * @param {*} value The value to check. * @returns {value is Int64ArrayConstructor} * True if value is a 64-bit array type, false otherwise. */ function isInt64ArrayType(value) { return value === int64Array || value === uint64Array; } /** * Determine the correct index into an offset array for a given * full column row index. Assumes offset indices can be manipulated * as 32-bit signed integers. * @param {IntegerArray} offsets The offsets array. * @param {number} index The full column row index. */ function bisect(offsets, index) { let a = 0; let b = offsets.length; if (b <= 2147483648) { // 2 ** 31 // fast version, use unsigned bit shift // array length fits within 32-bit signed integer do { const mid = (a + b) >>> 1; if (offsets[mid] <= index) a = mid + 1; else b = mid; } while (a < b); } else { // slow version, use division and truncate // array length exceeds 32-bit signed integer do { const mid = Math.trunc((a + b) / 2); if (offsets[mid] <= index) a = mid + 1; else b = mid; } while (a < b); } return a; } /** * Compute a 64-bit aligned buffer size. * @param {number} length The starting size. * @param {number} bpe Bytes per element. * @returns {number} The aligned size. */ function align64(length, bpe = 1) { return (((length * bpe) + 7) & -8) / bpe; } /** * Return a 64-bit aligned version of the array. * @template {TypedArray} T * @param {T} array The array. * @param {number} length The current array length. * @returns {T} The aligned array. */ function align(array, length = array.length) { const alignedLength = align64(length, array.BYTES_PER_ELEMENT); return array.length > alignedLength ? /** @type {T} */ (array.subarray(0, alignedLength)) : array.length < alignedLength ? resize(array, alignedLength) : array; } /** * Resize a typed array to exactly the specified length. * @template {TypedArray} T * @param {T} array The array. * @param {number} newLength The new length. * @param {number} [offset] The offset at which to copy the old array. * @returns {T} The resized array. */ function resize(array, newLength, offset = 0) { // @ts-ignore const newArray = new array.constructor(newLength); newArray.set(array, offset); return newArray; } /** * Grow a typed array to accommdate a minimum index. The array size is * doubled until it exceeds the minimum index. * @template {TypedArray} T * @param {T} array The array. * @param {number} index The minimum index. * @param {boolean} [shift] Flag to shift copied bytes to back of array. * @returns {T} The resized array. */ function grow(array, index, shift) { while (array.length <= index) { array = resize(array, array.length << 1, shift ? array.length : 0); } return array; } /** * Check if a value is a Date instance * @param {*} value The value to check. * @returns {value is Date} True if value is a Date, false otherwise. */ function isDate(value) { return value instanceof Date; } /** * Check if a value is iterable. * @param {*} value The value to check. * @returns {value is Iterable} True if value is iterable, false otherwise. */ function isIterable(value) { return typeof value[Symbol.iterator] === 'function'; } /** * Return the input value if it passes a test. * Otherwise throw an error using the given message generator. * @template T * @param {T} value The value to check. * @param {(value: T) => boolean} test The test function. * @param {(value: *) => string} message Message generator. * @returns {T} The input value. * @throws if the value does not pass the test */ function check(value, test, message) { if (test(value)) return value; throw new Error(message(value)); } /** * Return the input value if it exists in the provided set. * Otherwise throw an error using the given message generator. * @template T * @param {T} value The value to check. * @param {T[] | Record<string,T>} set The set of valid values. * @param {(value: *) => string} [message] Message generator. * @returns {T} The input value. * @throws if the value is not included in the set */ function checkOneOf(value, set, message) { set = Array.isArray(set) ? set : Object.values(set); return check( value, (value) => set.includes(value), message ?? (() => `${value} must be one of ${set}`) ); } /** * Return the first object key that pairs with the given value. * @param {Record<string,any>} object The object to search. * @param {any} value The value to lookup. * @returns {string} The first matching key, or '<Unknown>' if not found. */ function keyFor(object, value) { for (const [key, val] of Object.entries(object)) { if (val === value) return key; } return '<Unknown>'; } /** * @import { BinaryType, BinaryViewType, BoolType, DataType, DateType, DateUnit_, DecimalType, DictionaryType, DurationType, Field, FixedSizeBinaryType, FixedSizeListType, FloatType, IntBitWidth, IntervalType, IntervalUnit_, IntType, LargeBinaryType, LargeListType, LargeListViewType, LargeUtf8Type, ListType, ListViewType, MapType, NullType, Precision_, RunEndEncodedType, StructType, TimestampType, TimeType, TimeUnit_, UnionMode_, UnionType, Utf8Type, Utf8ViewType } from './types.js' */ /** * @typedef {Field | DataType} FieldInput */ const invalidDataType = (typeId) => `Unsupported data type: "${keyFor(Type, typeId)}" (id ${typeId})`; /** * Return a new field instance for use in a schema or type definition. A field * represents a field name, data type, and additional metadata. Fields are used * to represent child types within nested types like List, Struct, and Union. * @param {string} name The field name. * @param {DataType} type The field data type. * @param {boolean} [nullable=true] Flag indicating if the field is nullable * (default `true`). * @param {Map<string,string>|null} [metadata=null] Custom field metadata * annotations (default `null`). * @returns {Field} The field instance. */ const field = (name, type, nullable = true, metadata = null) => ({ name, type, nullable, metadata }); /** * Checks if a value is a field instance. * @param {any} value * @returns {value is Field} */ function isField(value) { return Object.hasOwn(value, 'name') && isDataType(value.type) } /** * Checks if a value is a data type instance. * @param {any} value * @returns {value is DataType} */ function isDataType(value) { return typeof value?.typeId === 'number'; } /** * Return a field instance from a field or data type input. * @param {FieldInput} value * The value to map to a field. * @param {string} [defaultName] The default field name. * @param {boolean} [defaultNullable=true] The default nullable value. * @returns {Field} The field instance. */ function asField(value, defaultName = '', defaultNullable = true) { return isField(value) ? value : field( defaultName, check(value, isDataType, () => `Data type expected.`), defaultNullable ); } ///// /** * Return a basic type with only a type id. * @template {typeof Type[keyof typeof Type]} T * @param {T} typeId The type id. */ const basicType = (typeId) => ({ typeId }); /** * Return a Dictionary data type instance. A dictionary type consists of a * dictionary of values (which may be of any type) and corresponding integer * indices that reference those values. If values are repeated, a dictionary * encoding can provide substantial space savings. In the IPC format, * dictionary indices reside alongside other columns in a record batch, while * dictionary values are written to special dictionary batches, linked by a * unique dictionary *id*. * @param {DataType} type The data type of dictionary * values. * @param {IntType} [indexType] The data type of * dictionary indices. Must be an integer type (default `int32`). * @param {boolean} [ordered=false] Indicates if dictionary values are * ordered (default `false`). * @param {number} [id=-1] The dictionary id. The default value (-1) indicates * the dictionary applies to a single column only. Provide an explicit id in * order to reuse a dictionary across columns when building, in which case * different dictionaries *must* have different unique ids. All dictionary * ids are later resolved (possibly to new values) upon IPC encoding. * @returns {DictionaryType} */ const dictionary = (type, indexType, ordered = false, id = -1) => ({ typeId: Type.Dictionary, id, dictionary: type, indices: indexType || int32(), ordered }); /** * Return a Null data type instance. Null data requires no storage and all * extracted values are `null`. * @returns {NullType} The null data type. */ const nullType = () => basicType(Type.Null); /** * Return an Int data type instance. * @param {IntBitWidth} [bitWidth=32] The integer bit width. * One of `8`, `16`, `32` (default), or `64`. * @param {boolean} [signed=true] Flag for signed or unsigned integers * (default `true`). * @returns {IntType} The integer data type. */ const int = (bitWidth = 32, signed = true) => ({ typeId: Type.Int, bitWidth: checkOneOf(bitWidth, [8, 16, 32, 64]), signed, values: intArrayType(bitWidth, signed) }); /** * Return an Int data type instance for 8 bit signed integers. * @returns {IntType} The integer data type. */ const int8 = () => int(8); /** * Return an Int data type instance for 16 bit signed integers. * @returns {IntType} The integer data type. */ const int16 = () => int(16); /** * Return an Int data type instance for 32 bit signed integers. * @returns {IntType} The integer data type. */ const int32 = () => int(32); /** * Return an Int data type instance for 64 bit signed integers. * @returns {IntType} The integer data type. */ const int64 = () => int(64); /** * Return an Int data type instance for 8 bit unsigned integers. * @returns {IntType} The integer data type. */ const uint8 = () => int(8, false); /** * Return an Int data type instance for 16 bit unsigned integers. * @returns {IntType} The integer data type. */ const uint16 = () => int(16, false); /** * Return an Int data type instance for 32 bit unsigned integers. * @returns {IntType} The integer data type. */ const uint32 = () => int(32, false); /** * Return an Int data type instance for 64 bit unsigned integers. * @returns {IntType} The integer data type. */ const uint64 = () => int(64, false); /** * Return a Float data type instance for floating point numbers. * @param {Precision_} [precision=2] The floating point * precision. One of `Precision.HALF` (16-bit), `Precision.SINGLE` (32-bit) * or `Precision.DOUBLE` (64-bit, default). * @returns {FloatType} The floating point data type. */ const float = (precision = 2) => ({ typeId: Type.Float, precision: checkOneOf(precision, Precision), values: [uint16Array, float32Array, float64Array][precision] }); /** * Return a Float data type instance for half-precision (16 bit) numbers. * @returns {FloatType} The floating point data type. */ const float16 = () => float(Precision.HALF); /** * Return a Float data type instance for single-precision (32 bit) numbers. * @returns {FloatType} The floating point data type. */ const float32 = () => float(Precision.SINGLE); /** * Return a Float data type instance for double-precision (64 bit) numbers. * @returns {FloatType} The floating point data type. */ const float64 = () => float(Precision.DOUBLE); /** * Return a Binary data type instance for variably-sized opaque binary data * with 32-bit offsets. * @returns {BinaryType} The binary data type. */ const binary = () => ({ typeId: Type.Binary, offsets: int32Array }); /** * Return a Utf8 data type instance for Unicode string data. * [UTF-8](https://en.wikipedia.org/wiki/UTF-8) code points are stored as * binary data. * @returns {Utf8Type} The utf8 data type. */ const utf8 = () => ({ typeId: Type.Utf8, offsets: int32Array }); /** * Return a Bool data type instance. Bool values are stored compactly in * bitmaps with eight values per byte. * @returns {BoolType} The bool data type. */ const bool = () => basicType(Type.Bool); /** * Return a Decimal data type instance. Decimal values are represented as 32, * 64, 128, or 256 bit integers in two's complement. Decimals are fixed point * numbers with a set *precision* (total number of decimal digits) and *scale* * (number of fractional digits). For example, the number `35.42` can be * represented as `3542` with *precision* ≥ 4 and *scale* = 2. * @param {number} precision The decimal precision: the total number of * decimal digits that can be represented. * @param {number} scale The number of fractional digits, beyond the * decimal point. * @param {32 | 64 | 128 | 256} [bitWidth] The decimal bit width. * One of 32, 64, 128 (default), or 256. * @returns {DecimalType} The decimal data type. */ const decimal = (precision, scale, bitWidth = 128) => ({ typeId: Type.Decimal, precision, scale, bitWidth: checkOneOf(bitWidth, [32, 64, 128, 256]), values: bitWidth === 32 ? int32Array : uint64Array }); /** * Return an Decimal data type instance with a bit width of 32. * @param {number} precision The decimal precision: the total number of * decimal digits that can be represented. * @param {number} scale The number of fractional digits, beyond the * decimal point. * @returns {DecimalType} The decimal data type. */ const decimal32 = (precision, scale) => decimal(precision, scale, 32); /** * Return an Decimal data type instance with a bit width of 64. * @param {number} precision The decimal precision: the total number of * decimal digits that can be represented. * @param {number} scale The number of fractional digits, beyond the * decimal point. * @returns {DecimalType} The decimal data type. */ const decimal64 = (precision, scale) => decimal(precision, scale, 64); /** * Return an Decimal data type instance with a bit width of 128. * @param {number} precision The decimal precision: the total number of * decimal digits that can be represented. * @param {number} scale The number of fractional digits, beyond the * decimal point. * @returns {DecimalType} The decimal data type. */ const decimal128 = (precision, scale) => decimal(precision, scale, 128); /** * Return an Decimal data type instance with a bit width of 256. * @param {number} precision The decimal precision: the total number of * decimal digits that can be represented. * @param {number} scale The number of fractional digits, beyond the * decimal point. * @returns {DecimalType} The decimal data type. */ const decimal256 = (precision, scale) => decimal(precision, scale, 256); /** * Return a Date data type instance. Date values are 32-bit or 64-bit signed * integers representing elapsed time since the UNIX epoch (Jan 1, 1970 UTC), * either in units of days (32 bits) or milliseconds (64 bits, with values * evenly divisible by 86400000). * @param {DateUnit_} unit The date unit. * One of `DateUnit.DAY` or `DateUnit.MILLISECOND`. * @returns {DateType} The date data type. */ const date = (unit) => ({ typeId: Type.Date, unit: checkOneOf(unit, DateUnit), values: unit === DateUnit.DAY ? int32Array : int64Array }); /** * Return a Date data type instance with units of days. * @returns {DateType} The date data type. */ const dateDay = () => date(DateUnit.DAY); /** * Return a Date data type instance with units of milliseconds. * @returns {DateType} The date data type. */ const dateMillisecond = () => date(DateUnit.MILLISECOND); /** * Return a Time data type instance, stored in one of four *unit*s: seconds, * milliseconds, microseconds or nanoseconds. The integer *bitWidth* is * inferred from the *unit* and is 32 bits for seconds and milliseconds or * 64 bits for microseconds and nanoseconds. The allowed values are between 0 * (inclusive) and 86400 (=24*60*60) seconds (exclusive), adjusted for the * time unit (for example, up to 86400000 exclusive for the * `DateUnit.MILLISECOND` unit. * * This definition doesn't allow for leap seconds. Time values from * measurements with leap seconds will need to be corrected when ingesting * into Arrow (for example by replacing the value 86400 with 86399). * @param {TimeUnit_} unit The time unit. * One of `TimeUnit.SECOND`, `TimeUnit.MILLISECOND` (default), * `TimeUnit.MICROSECOND`, or `TimeUnit.NANOSECOND`. * @returns {TimeType} The time data type. */ const time = (unit = TimeUnit.MILLISECOND) => { unit = checkOneOf(unit, TimeUnit); const bitWidth = unit === TimeUnit.SECOND || unit === TimeUnit.MILLISECOND ? 32 : 64; return { typeId: Type.Time, unit, bitWidth, values: bitWidth === 32 ? int32Array : int64Array }; }; /** * Return a Time data type instance, represented as seconds. * @returns {TimeType} The time data type. */ const timeSecond = () => time(TimeUnit.SECOND); /** * Return a Time data type instance, represented as milliseconds. * @returns {TimeType} The time data type. */ const timeMillisecond = () => time(TimeUnit.MILLISECOND); /** * Return a Time data type instance, represented as microseconds. * @returns {TimeType} The time data type. */ const timeMicrosecond = () => time(TimeUnit.MICROSECOND); /** * Return a Time data type instance, represented as nanoseconds. * @returns {TimeType} The time data type. */ const timeNanosecond = () => time(TimeUnit.NANOSECOND); /** * Return a Timestamp data type instance. Timestamp values are 64-bit signed * integers representing an elapsed time since a fixed epoch, stored in either * of four units: seconds, milliseconds, microseconds or nanoseconds, and are * optionally annotated with a timezone. Timestamp values do not include any * leap seconds (in other words, all days are considered 86400 seconds long). * @param {TimeUnit_} [unit] The time unit. * One of `TimeUnit.SECOND`, `TimeUnit.MILLISECOND` (default), * `TimeUnit.MICROSECOND`, or `TimeUnit.NANOSECOND`. * @param {string|null} [timezone=null] An optional string for the name of a * timezone. If provided, the value should either be a string as used in the * Olson timezone database (the "tz database" or "tzdata"), such as * "America/New_York", or an absolute timezone offset of the form "+XX:XX" or * "-XX:XX", such as "+07:30".Whether a timezone string is present indicates * different semantics about the data. * @returns {TimestampType} The time data type. */ const timestamp = (unit = TimeUnit.MILLISECOND, timezone = null) => ({ typeId: Type.Timestamp, unit: checkOneOf(unit, TimeUnit), timezone, values: int64Array }); /** * Return an Interval type instance. Values represent calendar intervals stored * as integers for each date part. The supported *unit*s are year/moth, * day/time, and month/day/nanosecond intervals. * * `IntervalUnit.YEAR_MONTH` indicates the number of elapsed whole months, * stored as 32-bit signed integers. * * `IntervalUnit.DAY_TIME` indicates the number of elapsed days and * milliseconds (no leap seconds), stored as 2 contiguous 32-bit signed * integers (8-bytes in total). * * `IntervalUnit.MONTH_DAY_NANO` is a triple of the number of elapsed months, * days, and nanoseconds. The values are stored contiguously in 16-byte blocks. * Months and days are encoded as 32-bit signed integers and nanoseconds is * encoded as a 64-bit signed integer. Nanoseconds does not allow for leap * seconds. Each field is independent (e.g. there is no constraint that * nanoseconds have the same sign as days or that the quantity of nanoseconds * represents less than a day's worth of time). * @param {IntervalUnit_} unit The interval unit. * One of `IntervalUnit.YEAR_MONTH`, `IntervalUnit.DAY_TIME`, or * `IntervalUnit.MONTH_DAY_NANO` (default). * @returns {IntervalType} The interval data type. */ const interval = (unit = IntervalUnit.MONTH_DAY_NANO) => ({ typeId: Type.Interval, unit: checkOneOf(unit, IntervalUnit), values: unit === IntervalUnit.MONTH_DAY_NANO ? undefined : int32Array }); /** * Return a List data type instance, representing variably-sized lists * (arrays) with 32-bit offsets. A list has a single child data type for * list entries. Lists are represented using integer offsets that indicate * list extents within a single child array containing all list values. * @param {FieldInput} child The child (list item) field or data type. * @returns {ListType} The list data type. */ const list = (child) => ({ typeId: Type.List, children: [ asField(child) ], offsets: int32Array }); /** * Return a Struct data type instance. A struct consists of multiple named * child data types. Struct values are stored as parallel child batches, one * per child type, and extracted to standard JavaScript objects. * @param {Field[] | Record<string, DataType>} children * An array of property fields, or an object mapping property names to data * types. If an object, the instantiated fields are assumed to be nullable * and have no metadata. * @returns {StructType} The struct data type. */ const struct = (children) => ({ typeId: Type.Struct, children: Array.isArray(children) && children.length > 0 && isField(children[0]) ? /** @type {Field[]} */ (children) : Object.entries(children).map(([name, type]) => field(name, type)) }); /** * Return a Union type instance. A union is a complex type with parallel * *children* data types. Union values are stored in either a sparse * (`UnionMode.Sparse`) or dense (`UnionMode.Dense`) layout *mode*. In a * sparse layout, child types are stored in parallel arrays with the same * lengths, resulting in many unused, empty values. In a dense layout, child * types have variable lengths and an offsets array is used to index the * appropriate value. * * By default, ids in the type vector refer to the index in the children * array. Optionally, *typeIds* provide an indirection between the child * index and the type id. For each child, `typeIds[index]` is the id used * in the type vector. The *typeIdForValue* argument provides a lookup * function for mapping input data to the proper child type id, and is * required if using builder methods. * @param {UnionMode_} mode The union mode. * One of `UnionMode.Sparse` or `UnionMode.Dense`. * @param {FieldInput[]} children The children fields or data types. * Types are mapped to nullable fields with no metadata. * @param {number[]} [typeIds] Children type ids, in the same order as the * children types. Type ids provide a level of indirection over children * types. If not provided, the children indices are used as the type ids. * @param {(value: any, index: number) => number} [typeIdForValue] * A function that takes an arbitrary value and a row index and returns a * correponding union type id. Required by builder methods. * @returns {UnionType} The union data type. */ const union = (mode, children, typeIds, typeIdForValue) => { typeIds ??= children.map((v, i) => i); return { typeId: Type.Union, mode: checkOneOf(mode, UnionMode), typeIds, typeMap: typeIds.reduce((m, id, i) => ((m[id] = i), m), {}), children: children.map((v, i) => asField(v, `_${i}`)), typeIdForValue, offsets: int32Array, }; }; /** * Create a FixedSizeBinary data type instance for opaque binary data where * each entry has the same fixed size. * @param {number} stride The fixed size in bytes. * @returns {FixedSizeBinaryType} The fixed size binary data type. */ const fixedSizeBinary = (stride) => ({ typeId: Type.FixedSizeBinary, stride }); /** * Return a FixedSizeList type instance for list (array) data where every list * has the same fixed size. A list has a single child data type for list * entries. Fixed size lists are represented as a single child array containing * all list values, indexed using the known stride. * @param {FieldInput} child The list item data type. * @param {number} stride The fixed list size. * @returns {FixedSizeListType} The fixed size list data type. */ const fixedSizeList = (child, stride) => ({ typeId: Type.FixedSizeList, stride, children: [ asField(child) ] }); /** * Internal method to create a Map type instance. * @param {boolean} keysSorted Flag indicating if the map keys are sorted. * @param {Field} child The child fields. * @returns {MapType} The map data type. */ const mapType = (keysSorted, child) => ({ typeId: Type.Map, keysSorted, children: [child], offsets: int32Array }); /** * Return a Map data type instance representing collections of key-value pairs. * A Map is a logical nested type that is represented as a list of key-value * structs. The key and value types are not constrained, so the application is * responsible for ensuring that the keys are hashable and unique, and that * keys are properly sorted if *keysSorted* is `true`. * @param {FieldInput} keyField The map key field or data type. * @param {FieldInput} valueField The map value field or data type. * @param {boolean} [keysSorted=false] Flag indicating if the map keys are * sorted (default `false`). * @returns {MapType} The map data type. */ const map = (keyField, valueField, keysSorted = false) => mapType( keysSorted, field( 'entries', struct([ asField(keyField, 'key', false), asField(valueField, 'value') ]), false ) ); /** * Return a Duration data type instance. Durations represent an absolute length * of time unrelated to any calendar artifacts. The resolution defaults to * millisecond, but can be any of the other `TimeUnit` values. This type is * always represented as a 64-bit integer. * @param {TimeUnit_} unit * @returns {DurationType} The duration data type. */ const duration = (unit = TimeUnit.MILLISECOND) => ({ typeId: Type.Duration, unit: checkOneOf(unit, TimeUnit), values: int64Array }); /** * Return a LargeBinary data type instance for variably-sized opaque binary * data with 64-bit offsets, allowing representation of extremely large data * values. * @returns {LargeBinaryType} The large binary data type. */ const largeBinary = () => ({ typeId: Type.LargeBinary, offsets: int64Array }); /** * Return a LargeUtf8 data type instance for Unicode string data of variable * length with 64-bit offsets, allowing representation of extremely large data * values. [UTF-8](https://en.wikipedia.org/wiki/UTF-8) code points are stored * as binary data. * @returns {LargeUtf8Type} The large utf8 data type. */ const largeUtf8 = () => ({ typeId: Type.LargeUtf8, offsets: int64Array }); /** * Return a LargeList data type instance, representing variably-sized lists * (arrays) with 64-bit offsets, allowing representation of extremely large * data values. A list has a single child data type for list entries. Lists * are represented using integer offsets that indicate list extents within a * single child array containing all list values. * @param {FieldInput} child The child (list item) field or data type. * @returns {LargeListType} The large list data type. */ const largeList = (child) => ({ typeId: Type.LargeList, children: [ asField(child) ], offsets: int64Array }); /** * Return a RunEndEncoded data type instance, which compresses data by * representing consecutive repeated values as a run. This data type uses two * child arrays, `run_ends` and `values`. The `run_ends` child array must be * a 16, 32, or 64 bit integer array which encodes the indices at which the * run with the value in each corresponding index in the values child array * ends. Like list and struct types, the `values` array can be of any type. * @param {FieldInput} runsField The run-ends field or data type. * @param {FieldInput} valuesField The values field or data type. * @returns {RunEndEncodedType} The large list data type. */ const runEndEncoded = (runsField, valuesField) => ({ typeId: Type.RunEndEncoded, children: [ check( asField(runsField, 'run_ends'), (field) => field.type.typeId === Type.Int, () => 'Run-ends must have an integer type.' ), asField(valuesField, 'values') ] }); /** * Return a BinaryView data type instance. BinaryView data is logically the * same as the Binary type, but the internal representation uses a view struct * that contains the string length and either the string's entire data inline * (for small strings) or an inlined prefix, an index of another buffer, and an * offset pointing to a slice in that buffer (for non-small strings). * * Flechette can encode and decode BinaryView data; however, Flechette does * not currently support building BinaryView columns from JavaScript values. * @returns {BinaryViewType} The binary view data type. */ const binaryView = () => /** @type {BinaryViewType} */ (basicType(Type.BinaryView)); /** * Return a Utf8View data type instance. Utf8View data is logically the same as * the Utf8 type, but the internal representation uses a view struct that * contains the string length and either the string's entire data inline (for * small strings) or an inlined prefix, an index of another buffer, and an * offset pointing to a slice in that buffer (for non-small strings). * * Flechette can encode and decode Utf8View data; however, Flechette does * not currently support building Utf8View columns from JavaScript values. * @returns {Utf8ViewType} The utf8 view data type. */ const utf8View = () => /** @type {Utf8ViewType} */ (basicType(Type.Utf8View)); /** * Return a ListView data type instance, representing variably-sized lists * (arrays) with 32-bit offsets. ListView data represents the same logical * types that List can, but contains both offsets and sizes allowing for * writes in any order and sharing of child values among list values. * * Flechette can encode and decode ListView data; however, Flechette does not * currently support building ListView columns from JavaScript values. * @param {FieldInput} child The child (list item) field or data type. * @returns {ListViewType} The list view data type. */ const listView = (child) => ({ typeId: Type.ListView, children: [ asField(child, 'value') ], offsets: int32Array }); /** * Return a LargeListView data type instance, representing variably-sized lists * (arrays) with 64-bit offsets, allowing representation of extremely large * data values. LargeListView data represents the same logical types that * LargeList can, but contains both offsets and sizes allowing for writes * in any order and sharing of child values among list values. * * Flechette can encode and decode LargeListView data; however, Flechette does * not currently support building LargeListView columns from JavaScript values. * @param {FieldInput} child The child (list item) field or data type. * @returns {LargeListViewType} The large list view data type. */ const largeListView = (child) => ({ typeId: Type.LargeListView, children: [ asField(child, 'value') ], offsets: int64Array }); /** * @import { TimeUnit_, TypedArray } from '../types.js'; */ // typed arrays over a shared buffer to aid binary conversion const f64 = new float64Array(2); const buf = f64.buffer; const i64 = new int64Array(buf); const u32 = new uint32Array(buf); const i32 = new int32Array(buf); const u8 = new uint8Array(buf); /** * Return a value unchanged. * @template T * @param {T} value The value. * @returns {T} The value. */ function identity(value) { return value; } /** * Return a value coerced to a BigInt. * @param {*} value The value. * @returns {bigint} The BigInt value. */ function toBigInt(value) { return BigInt(value); } /** * Return an offset conversion method for the given data type. * @param {{ offsets: TypedArray}} type The array type. */ function toOffset(type) { return isInt64ArrayType(type) ? toBigInt : identity; } /** * Return the number of days from a millisecond timestamp. * @param {number} value The millisecond timestamp. * @returns {number} The number of days. */ function toDateDay(value) { return (value / 864e5) | 0; } /** * Return a timestamp conversion method for the given time unit. * @param {TimeUnit_} unit The time unit. * @returns {(value: number) => bigint} The conversion method. */ function toTimestamp(unit) { return unit === TimeUnit.SECOND ? value => toBigInt(value / 1e3) : unit === TimeUnit.MILLISECOND ? toBigInt : unit === TimeUnit.MICROSECOND ? value => toBigInt(value * 1e3) : value => toBigInt(value * 1e6); } /** * Write month/day/nanosecond interval to a byte buffer. * @param {Array | Float64Array} interval The interval data. * @returns {Uint8Array} A byte buffer with the interval data. * The returned buffer is reused across calls, and so should be * copied to a target buffer immediately. */ function toMonthDayNanoBytes([m, d, n]) { i32[0] = m; i32[1] = d; i64[1] = toBigInt(n); return u8; } /** * Coerce a bigint value to a number. Throws an error if the bigint value * lies outside the range of what a number can precisely represent. * @param {bigint} value The value to check and possibly convert. * @returns {number} The converted number value. */ function toNumber(value) { if (value > Number.MAX_SAFE_INTEGER || value < Number.MIN_SAFE_INTEGER) { throw Error(`BigInt exceeds integer number representation: ${value}`); } return Number(value); } /** * Divide one BigInt value by another, and return the result as a number. * The division may involve unsafe integers and a loss of precisio