UNPKG

@uwdata/flechette

Version:

Fast, lightweight access to Apache Arrow data.

886 lines (823 loc) 24.3 kB
import { bisect, float64Array } from './util/arrays.js';
import { divide, fromDecimal128, fromDecimal256, fromDecimal64, toNumber } from './util/numbers.js';
import { decodeBit, readInt32, readInt64 } from './util/read.js';
import { decodeUtf8 } from './util/strings.js';
import { objectFactory, proxyFactory } from './util/struct.js';

/**
 * Check if the input is a batch that supports direct access to
 * binary data in the form of typed arrays.
 * @param {Batch<any>?} batch The data batch to check.
 * @returns {boolean} True if a direct batch, false otherwise.
 */
export function isDirectBatch(batch) {
  return batch instanceof DirectBatch;
}

/**
 * Column values from a single record batch.
 * A column may contain multiple batches.
 * @template T
 */
export class Batch {
  /**
   * The array type to use when extracting data from the batch.
   * A null value indicates that the array type should match
   * the type of the batch's values array.
   * @type {ArrayConstructor | import('./types.js').TypedArrayConstructor | null}
   */
  static ArrayType = null;

  /**
   * Create a new column batch.
   * @param {object} options
   * @param {number} options.length The length of the batch
   * @param {number} options.nullCount The null value count
   * @param {import('./types.js').DataType} options.type The data type.
   * @param {Uint8Array} [options.validity] Validity bitmap buffer
   * @param {import('./types.js').TypedArray} [options.values] Values buffer
   * @param {import('./types.js').OffsetArray} [options.offsets] Offsets buffer
   * @param {import('./types.js').OffsetArray} [options.sizes] Sizes buffer
   * @param {Batch[]} [options.children] Children batches
   */
  constructor({
    length,
    nullCount,
    type,
    validity,
    values,
    offsets,
    sizes,
    children
  }) {
    this.length = length;
    this.nullCount = nullCount;
    this.type = type;
    this.validity = validity;
    this.values = values;
    this.offsets = offsets;
    this.sizes = sizes;
    this.children = children;

    // optimize access if this batch has no null values
    // some types (like union) may have null values in
    // child batches, but no top-level validity buffer
    if (!nullCount || !this.validity) {
      // shadow the prototype method with a per-instance fast path
      /** @type {(index: number) => T | null} */
      this.at = index => this.value(index);
    }
  }

  /**
   * Provide an informative object string tag.
   */
  get [Symbol.toStringTag]() {
    return 'Batch';
  }

  /**
   * Return the value at the given index.
   * @param {number} index The value index.
   * @returns {T | null} The value.
   */
  at(index) {
    return this.isValid(index) ? this.value(index) : null;
  }

  /**
   * Check if a value at the given index is valid (non-null).
   * If the batch has a validity bitmap, the corresponding bit is decoded.
   * If the batch has no validity bitmap (which the constructor explicitly
   * allows), there are no bits to decode; the value is then considered
   * valid when its extracted value is neither null nor undefined.
   * @param {number} index The value index.
   * @returns {boolean} True if valid, false otherwise.
   */
  isValid(index) {
    const { validity } = this;
    return validity
      ? decodeBit(validity, index)
      : this.value(index) != null;
  }

  /**
   * Return the value at the given index. This method does not check the
   * validity bitmap and is intended primarily for internal use. In most
   * cases, callers should use the `at()` method instead.
   * @param {number} index The value index
   * @returns {T} The value, ignoring the validity bitmap.
   */
  value(index) {
    return /** @type {T} */ (this.values[index]);
  }

  /**
   * Extract an array of values within the given index range. Unlike
   * Array.slice, all arguments are required and may not be negative indices.
   * @param {number} start The starting index, inclusive
   * @param {number} end The ending index, exclusive
   * @returns {import('./types.js').ValueArray<T?>} The slice of values
   */
  slice(start, end) {
    const n = end - start;
    const values = Array(n);
    for (let i = 0; i < n; ++i) {
      values[i] = this.at(start + i);
    }
    return values;
  }

  /**
   * Return an iterator over the values in this batch.
   * @returns {Iterator<T?>}
   */
  *[Symbol.iterator]() {
    for (let i = 0; i < this.length; ++i) {
      yield this.at(i);
    }
  }
}

/**
 * A batch whose value buffer can be used directly, without transformation.
 * @template T
 * @extends {Batch<T>}
 */
export class DirectBatch extends Batch {
  /**
   * Create a new column batch with direct value array access.
   * @param {object} options
   * @param {number} options.length The length of the batch
   * @param {number} options.nullCount The null value count
   * @param {import('./types.js').DataType} options.type The data type.
   * @param {Uint8Array} [options.validity] Validity bitmap buffer
   * @param {import('./types.js').TypedArray} options.values Values buffer
   */
  constructor(options) {
    super(options);
    // underlying buffers may be padded, exceeding the logical batch length
    // we trim the values array so we can safely access it directly
    const { length, values } = this;
    this.values = values.subarray(0, length);
  }

  /**
   * Extract an array of values within the given index range. Unlike
   * Array.slice, all arguments are required and may not be negative indices.
   * When feasible, a zero-copy subarray of a typed array is returned.
   * @param {number} start The starting index, inclusive
   * @param {number} end The ending index, exclusive
   * @returns {import('./types.js').ValueArray<T?>} The slice of values
   */
  slice(start, end) {
    // a typed array cannot hold nulls, so only use the zero-copy
    // path when the batch reports no null values
    // @ts-ignore
    return this.nullCount
      ? super.slice(start, end)
      : this.values.subarray(start, end);
  }

  /**
   * Return an iterator over the values in this batch.
   * @returns {Iterator<T?>}
   */
  [Symbol.iterator]() {
    return this.nullCount
      ? super[Symbol.iterator]()
      : /** @type {Iterator<T?>} */ (this.values[Symbol.iterator]());
  }
}

/**
 * A batch whose values are transformed to 64-bit numbers.
 * @extends {Batch<number>}
 */
export class NumberBatch extends Batch {
  static ArrayType = float64Array;
}

/**
 * A batch whose values should be returned in a standard array.
 * @template T
 * @extends {Batch<T>}
 */
export class ArrayBatch extends Batch {
  static ArrayType = Array;
}

/**
 * A batch of null values only.
 * @extends {ArrayBatch<null>}
 */
export class NullBatch extends ArrayBatch {
  /**
   * @param {number} index The value index
   * @returns {null}
   */
  value(index) { // eslint-disable-line no-unused-vars
    return null;
  }
}

/**
 * A batch that coerces BigInt values to 64-bit numbers.
 * @extends {NumberBatch}
 */
export class Int64Batch extends NumberBatch {
  /**
   * @param {number} index The value index
   */
  value(index) {
    return toNumber(/** @type {bigint} */ (this.values[index]));
  }
}

/**
 * A batch of 16-bit floating point numbers, accessed as unsigned
 * 16-bit ints and transformed to 64-bit numbers.
 */
export class Float16Batch extends NumberBatch {
  /**
   * @param {number} index The value index
   */
  value(index) {
    const v = /** @type {number} */ (this.values[index]);
    // bit layout: 1 sign bit, 5 exponent bits, 10 significand bits
    const expo = (v & 0x7C00) >> 10;
    const sigf = (v & 0x03FF) / 1024;
    const sign = (-1) ** ((v & 0x8000) >> 15);
    switch (expo) {
      // all exponent bits set: NaN if the significand is non-zero, else infinity
      case 0x1F: return sign * (sigf ? Number.NaN : 1 / 0);
      // zero exponent: subnormal (scaled by 2^-14 = 6.103515625e-5) or zero
      case 0x00: return sign * (sigf ? 6.103515625e-5 * sigf : 0);
    }
    // normal value: exponent bias of 15 and an implicit leading one
    return sign * (2 ** (expo - 15)) * (1 + sigf);
  }
}

/**
 * A batch of boolean values stored as a bitmap.
 * @extends {ArrayBatch<boolean>}
 */
export class BoolBatch extends ArrayBatch {
  /**
   * @param {number} index The value index
   */
  value(index) {
    return decodeBit(/** @type {Uint8Array} */ (this.values), index);
  }
}

/**
 * A batch of 32-bit decimal numbers, returned as converted 64-bit floating
 * point numbers. Number coercion may be lossy if the decimal precision can
 * not be represented in a 64-bit floating point format.
 * @extends {NumberBatch}
 */
export class Decimal32NumberBatch extends NumberBatch {
  constructor(options) {
    super(options);
    const { scale } = /** @type {import('./types.js').DecimalType} */ (this.type);
    // precompute the divisor applied to every extracted value
    this.scale = 10 ** scale;
  }

  /**
   * @param {number} index The value index
   */
  value(index) {
    return /** @type {number} */(this.values[index]) / this.scale;
  }
}

/**
 * An abstract class for a batch of 64-, 128- or 256-bit decimal numbers,
 * accessed in strided BigUint64Arrays.
 * @template T
 * @extends {Batch<T>}
 */
export class DecimalBatch extends Batch {
  constructor(options) {
    super(options);
    const { bitWidth, scale } = /** @type {import('./types.js').DecimalType} */ (this.type);
    // select the reader for the bit width; any width other than
    // 64 or 128 falls through to the 256-bit reader
    this.decimal = bitWidth === 64 ? fromDecimal64
      : bitWidth === 128 ? fromDecimal128
      : fromDecimal256;
    this.scale = 10n ** BigInt(scale);
  }
}

/**
 * A batch of 64-, 128- or 256-bit decimal numbers, returned as converted
 * 64-bit floating point numbers. Number coercion may be lossy if the decimal
 * precision can not be represented in a 64-bit floating point format.
 * @extends {DecimalBatch<number>}
 */
export class DecimalNumberBatch extends DecimalBatch {
  static ArrayType = float64Array;

  /**
   * @param {number} index The value index
   */
  value(index) {
    return divide(
      this.decimal(/** @type {BigUint64Array} */ (this.values), index),
      this.scale
    );
  }
}

/**
 * A batch of 64-, 128- or 256-bit decimal numbers, returned as scaled
 * bigint values, such that all fractional digits have been shifted
 * to integer places by the decimal type scale factor.
 * @extends {DecimalBatch<bigint>}
 */
export class DecimalBigIntBatch extends DecimalBatch {
  static ArrayType = Array;

  /**
   * @param {number} index The value index
   */
  value(index) {
    return this.decimal(/** @type {BigUint64Array} */ (this.values), index);
  }
}

/**
 * A batch of date or timestamp values that are coerced to UNIX epoch timestamps
 * and returned as JS Date objects. This batch wraps a source batch that provides
 * timestamp values.
 * @extends {ArrayBatch<Date>}
 */
export class DateBatch extends ArrayBatch {
  /**
   * Create a new date batch.
   * @param {Batch<number>} batch A batch of timestamp values.
   */
  constructor(batch) {
    // the source batch doubles as the options object: the base
    // constructor copies its length, nullCount, type, and buffers
    super(batch);
    this.source = batch;
  }

  /**
   * @param {number} index The value index
   */
  value(index) {
    return new Date(this.source.value(index));
  }
}

/**
 * A batch of dates as day counts, coerced to timestamp numbers.
 */
export class DateDayBatch extends NumberBatch {
  /**
   * @param {number} index The value index
   * @returns {number}
   */
  value(index) {
    // epoch days to milliseconds
    return 86400000 * /** @type {number} */ (this.values[index]);
  }
}

/**
 * A batch of dates as millisecond timestamps, coerced to numbers.
 */
export const DateDayMillisecondBatch = Int64Batch;

/**
 * A batch of timestamps in seconds, coerced to millisecond numbers.
 */
export class TimestampSecondBatch extends Int64Batch {
  /**
   * @param {number} index The value index
   */
  value(index) {
    return super.value(index) * 1e3; // seconds to milliseconds
  }
}

/**
 * A batch of timestamps in milliseconds, coerced to numbers.
 */
export const TimestampMillisecondBatch = Int64Batch;

/**
 * A batch of timestamps in microseconds, coerced to millisecond numbers.
 */
export class TimestampMicrosecondBatch extends Int64Batch {
  /**
   * @param {number} index The value index
   */
  value(index) {
    // microseconds to milliseconds
    return divide(/** @type {bigint} */ (this.values[index]), 1000n);
  }
}

/**
 * A batch of timestamps in nanoseconds, coerced to millisecond numbers.
 */
export class TimestampNanosecondBatch extends Int64Batch {
  /**
   * @param {number} index The value index
   */
  value(index) {
    // nanoseconds to milliseconds
    return divide(/** @type {bigint} */ (this.values[index]), 1000000n);
  }
}

/**
 * A batch of day/time intervals, returned as two-element 32-bit int arrays.
 * @extends {ArrayBatch<Int32Array>}
 */
export class IntervalDayTimeBatch extends ArrayBatch {
  /**
   * @param {number} index The value index
   * @returns {Int32Array}
   */
  value(index) {
    const values = /** @type {Int32Array} */ (this.values);
    // two 32-bit ints per entry; multiplication (rather than a bit
    // shift) avoids 32-bit wraparound for very large indices
    const start = 2 * index;
    return values.subarray(start, start + 2);
  }
}

/**
 * A batch of month/day/nanosecond intervals, returned as three-element arrays.
 * @extends {ArrayBatch<Float64Array>}
 */
export class IntervalMonthDayNanoBatch extends ArrayBatch {
  /**
   * @param {number} index The value index
   */
  value(index) {
    const values = /** @type {Uint8Array} */ (this.values);
    // each entry is 16 bytes: two 32-bit ints followed by one 64-bit int;
    // multiplication avoids 32-bit wraparound for very large indices
    const base = index * 16;
    return Float64Array.of(
      readInt32(values, base),
      readInt32(values, base + 4),
      readInt64(values, base + 8)
    );
  }
}

/**
 * Extract the bytes between consecutive 32-bit offsets.
 */
const offset32 = ({values, offsets}, index) => values.subarray(
  offsets[index],
  offsets[index + 1]
);

/**
 * Extract the bytes between consecutive 64-bit offsets,
 * converting the offsets to numbers first.
 */
const offset64 = ({values, offsets}, index) => values.subarray(
  toNumber(offsets[index]),
  toNumber(offsets[index + 1])
);

/**
 * A batch of binary blobs with variable offsets, returned as byte buffers of
 * unsigned 8-bit integers. The offsets are 32-bit ints.
 * @extends {ArrayBatch<Uint8Array>}
 */
export class BinaryBatch extends ArrayBatch {
  /**
   * @param {number} index
   * @returns {Uint8Array}
   */
  value(index) {
    return offset32(this, index);
  }
}

/**
 * A batch of binary blobs with variable offsets, returned as byte buffers of
 * unsigned 8-bit integers. The offsets are 64-bit ints. Value extraction will
 * fail if an offset exceeds `Number.MAX_SAFE_INTEGER`.
 * @extends {ArrayBatch<Uint8Array>}
 */
export class LargeBinaryBatch extends ArrayBatch {
  /**
   * @param {number} index
   * @returns {Uint8Array}
   */
  value(index) {
    return offset64(this, index);
  }
}

/**
 * A batch of UTF-8 strings with variable offsets. The offsets are 32-bit ints.
 * @extends {ArrayBatch<string>}
 */
export class Utf8Batch extends ArrayBatch {
  /**
   * @param {number} index
   */
  value(index) {
    return decodeUtf8(offset32(this, index));
  }
}

/**
 * A batch of UTF-8 strings with variable offsets. The offsets are 64-bit ints.
 * Value extraction will fail if an offset exceeds `Number.MAX_SAFE_INTEGER`.
 * @extends {ArrayBatch<string>}
 */
export class LargeUtf8Batch extends ArrayBatch {
  /**
   * @param {number} index
   */
  value(index) {
    return decodeUtf8(offset64(this, index));
  }
}

/**
 * A batch of list (array) values of variable length. The list offsets are
 * 32-bit ints.
 * @template V
 * @extends {ArrayBatch<import('./types.js').ValueArray<V>>}
 */
export class ListBatch extends ArrayBatch {
  /**
   * @param {number} index
   * @returns {import('./types.js').ValueArray<V>}
   */
  value(index) {
    const offsets = /** @type {Int32Array} */ (this.offsets);
    return this.children[0].slice(offsets[index], offsets[index + 1]);
  }
}

/**
 * A batch of list (array) values of variable length. The list offsets are
 * 64-bit ints. Value extraction will fail if an offset exceeds
 * `Number.MAX_SAFE_INTEGER`.
 * @template V
 * @extends {ArrayBatch<import('./types.js').ValueArray<V>>}
 */
export class LargeListBatch extends ArrayBatch {
  /**
   * @param {number} index
   * @returns {import('./types.js').ValueArray<V>}
   */
  value(index) {
    const offsets = /** @type {BigInt64Array} */ (this.offsets);
    return this.children[0].slice(
      toNumber(offsets[index]),
      toNumber(offsets[index + 1])
    );
  }
}

/**
 * A batch of list (array) values of variable length. The list offsets and
 * sizes are 32-bit ints.
 * @template V
 * @extends {ArrayBatch<import('./types.js').ValueArray<V>>}
 */
export class ListViewBatch extends ArrayBatch {
  /**
   * @param {number} index
   * @returns {import('./types.js').ValueArray<V>}
   */
  value(index) {
    const a = /** @type {number} */ (this.offsets[index]);
    const b = a + /** @type {number} */ (this.sizes[index]);
    return this.children[0].slice(a, b);
  }
}

/**
 * A batch of list (array) values of variable length. The list offsets and
 * sizes are 64-bit ints. Value extraction will fail if an offset or size
 * exceeds `Number.MAX_SAFE_INTEGER`.
 * @template V
 * @extends {ArrayBatch<import('./types.js').ValueArray<V>>}
 */
export class LargeListViewBatch extends ArrayBatch {
  /**
   * @param {number} index
   * @returns {import('./types.js').ValueArray<V>}
   */
  value(index) {
    const a = /** @type {bigint} */ (this.offsets[index]);
    const b = a + /** @type {bigint} */ (this.sizes[index]);
    return this.children[0].slice(toNumber(a), toNumber(b));
  }
}

/**
 * A batch with a fixed stride.
 * @template T
 * @extends {ArrayBatch<T>}
 */
class FixedBatch extends ArrayBatch {
  constructor(options) {
    super(options);
    /** @type {number} */
    // @ts-ignore
    this.stride = this.type.stride;
  }
}

/**
 * A batch of binary blobs of fixed size, returned as byte buffers of unsigned
 * 8-bit integers.
 * @extends {FixedBatch<Uint8Array>}
 */
export class FixedBinaryBatch extends FixedBatch {
  /**
   * @param {number} index
   * @returns {Uint8Array}
   */
  value(index) {
    const { stride, values } = this;
    return /** @type {Uint8Array} */ (values)
      .subarray(index * stride, (index + 1) * stride);
  }
}

/**
 * A batch of list (array) values of fixed length.
 * @template V
 * @extends {FixedBatch<import('./types.js').ValueArray<V>>}
 */
export class FixedListBatch extends FixedBatch {
  /**
   * @param {number} index
   * @returns {import('./types.js').ValueArray<V>}
   */
  value(index) {
    const { children, stride } = this;
    return children[0].slice(index * stride, (index + 1) * stride);
  }
}

/**
 * Extract Map key-value pairs from parallel child batches.
 * The keys and values are the two children of the batch's first child;
 * the offsets delimit the entries that belong to the given index.
 */
function pairs({ children, offsets }, index) {
  const [ keys, vals ] = children[0].children;
  const start = offsets[index];
  const end = offsets[index + 1];
  const entries = [];
  for (let i = start; i < end; ++i) {
    entries.push([keys.at(i), vals.at(i)]);
  }
  return entries;
}

/**
 * A batch of map (key, value) values. The map is represented as a list of
 * key-value structs.
 * @template K, V
 * @extends {ArrayBatch<[K, V][]>}
 */
export class MapEntryBatch extends ArrayBatch {
  /**
   * Return the value at the given index.
   * @param {number} index The value index.
   * @returns {[K, V][]} The map entries as an array of [key, value] arrays.
   */
  value(index) {
    return /** @type {[K, V][]} */ (pairs(this, index));
  }
}

/**
 * A batch of map (key, value) values. The map is represented as a list of
 * key-value structs.
 * @template K, V
 * @extends {ArrayBatch<Map<K, V>>}
 */
export class MapBatch extends ArrayBatch {
  /**
   * Return the value at the given index.
   * @param {number} index The value index.
   * @returns {Map<K, V>} The map value.
   */
  value(index) {
    return new Map(/** @type {[K, V][]} */ (pairs(this, index)));
  }
}

/**
 * A batch of union-type values with a sparse layout, enabling direct
 * lookup from the child value batches.
 * @template T
 * @extends {ArrayBatch<T>}
 */
export class SparseUnionBatch extends ArrayBatch {
  /**
   * Create a new column batch.
   * @param {object} options
   * @param {number} options.length The length of the batch
   * @param {number} options.nullCount The null value count
   * @param {import('./types.js').DataType} options.type The data type.
   * @param {Uint8Array} [options.validity] Validity bitmap buffer
   * @param {Int32Array} [options.offsets] Offsets buffer
   * @param {Batch[]} options.children Children batches
   * @param {Int8Array} options.typeIds Union type ids buffer
   * @param {Record<string, number>} options.map A typeId to children index map
   *  (not read by this constructor; the map in use is `type.typeMap`)
   */
  constructor({ typeIds, ...options }) {
    super(options);
    /** @type {Int8Array} */
    this.typeIds = typeIds;
    /** @type {Record<string, number>} */
    // @ts-ignore
    this.typeMap = this.type.typeMap;
  }

  /**
   * @param {number} index The value index.
   * @param {number} [offset] The index into the selected child batch,
   *  which defaults to the value index.
   */
  value(index, offset = index) {
    const { typeIds, children, typeMap } = this;
    // type id -> child position -> child value (null-aware via at())
    return children[typeMap[typeIds[index]]].at(offset);
  }
}

/**
 * A batch of union-type values with a dense layout, requiring offset
 * lookups from the child value batches.
 * @template T
 * @extends {SparseUnionBatch<T>}
 */
export class DenseUnionBatch extends SparseUnionBatch {
  /**
   * @param {number} index The value index.
   */
  value(index) {
    return super.value(index, /** @type {number} */ (this.offsets[index]));
  }
}

/**
 * A batch of struct values, containing a set of named properties.
 * Struct property values are extracted and returned as JS objects.
 * @extends {ArrayBatch<Record<string, any>>}
 */
export class StructBatch extends ArrayBatch {
  constructor(options, factory = objectFactory) {
    super(options);
    /** @type {string[]} */
    // @ts-ignore
    this.names = this.type.children.map(child => child.name);
    this.factory = factory(this.names, this.children);
  }

  /**
   * @param {number} index The value index.
   * @returns {Record<string, any>}
   */
  value(index) {
    return this.factory(index);
  }
}

/**
 * A batch of struct values, containing a set of named properties.
 * Structs are returned as proxy objects that extract data directly
 * from underlying Arrow batches.
 * @extends {StructBatch}
 */
export class StructProxyBatch extends StructBatch {
  constructor(options) {
    super(options, proxyFactory);
  }
}

/**
 * A batch of run-end-encoded values.
 * @template T
 * @extends {ArrayBatch<T>}
 */
export class RunEndEncodedBatch extends ArrayBatch {
  /**
   * @param {number} index The value index.
   */
  value(index) {
    // first child holds the run ends, second child holds the run values
    const [ { values: runs }, vals ] = this.children;
    return vals.at(
      bisect(/** @type {import('./types.js').IntegerArray} */(runs), index)
    );
  }
}

/**
 * A batch of dictionary-encoded values.
 * @template T
 * @extends {ArrayBatch<T>}
 */
export class DictionaryBatch extends ArrayBatch {
  /**
   * Register the backing dictionary. Dictionaries are added
   * after batch creation as the complete dictionary may not
   * be finished across multiple record batches.
   * @param {import('./column.js').Column<T>} dictionary
   * The dictionary of column values.
   */
  setDictionary(dictionary) {
    this.dictionary = dictionary;
    this.cache = dictionary.cache();
    return this;
  }

  /**
   * @param {number} index The value index.
   */
  value(index) {
    return this.cache[this.key(index)];
  }

  /**
   * @param {number} index The value index.
   * @returns {number} The dictionary key
   */
  key(index) {
    return /** @type {number} */ (this.values[index]);
  }
}

/**
 * A batch whose values are views into separate data buffers.
 * @template T
 * @extends {ArrayBatch<T>}
 */
class ViewBatch extends ArrayBatch {
  /**
   * Create a new view batch.
   * @param {object} options Batch options.
   * @param {number} options.length The length of the batch
   * @param {number} options.nullCount The null value count
   * @param {import('./types.js').DataType} options.type The data type.
   * @param {Uint8Array} [options.validity] Validity bitmap buffer
   * @param {Uint8Array} options.values Values buffer
   * @param {Uint8Array[]} options.data View data buffers
   */
  constructor({ data, ...options }) {
    super(options);
    this.data = data;
  }

  /**
   * Get the binary data at the provided index.
   * @param {number} index The value index.
   * @returns {Uint8Array}
   */
  view(index) {
    const { values, data } = this;
    // each entry is 16 bytes; multiplication (rather than a bit
    // shift) avoids 32-bit wraparound for very large indices
    const offset = index * 16;
    let start = offset + 4;
    let buf = /** @type {Uint8Array} */ (values);
    const length = readInt32(buf, offset);
    if (length > 12) {
      // longer strings are in a data buffer
      start = readInt32(buf, offset + 12);
      buf = data[readInt32(buf, offset + 8)];
    }
    return buf.subarray(start, start + length);
  }
}

/**
 * A batch of binary blobs from variable data buffers, returned as byte
 * buffers of unsigned 8-bit integers.
 * @extends {ViewBatch<Uint8Array>}
 */
export class BinaryViewBatch extends ViewBatch {
  /**
   * @param {number} index The value index.
   */
  value(index) {
    return this.view(index);
  }
}

/**
 * A batch of UTF-8 strings from variable data buffers.
 * @extends {ViewBatch<string>}
 */
export class Utf8ViewBatch extends ViewBatch {
  /**
   * @param {number} index The value index.
   */
  value(index) {
    return decodeUtf8(this.view(index));
  }
}