/**
 * @uwdata/flechette — Fast, lightweight access to Apache Arrow data.
 *
 * Source listing retrieved via UNPKG: 277 lines (253 loc), 8.41 kB.
 * NOTE(review): the package version string was not captured in the scrape
 * ("Version:" was empty) — confirm against package.json if it matters.
 */
/**
 * @import { ArrowData, BodyCompression, ExtractionOptions, Field, RecordBatch, Schema } from '../types.js'
 */
import { batchType } from '../batch-type.js';
import { columnBuilder } from '../column.js';
import { decompressBuffer, getCompressionCodec, missingCodec } from '../compression.js';
import { BodyCompressionMethod, Type, UnionMode, Version } from '../constants.js';
import { invalidDataType } from '../data-types.js';
import { Table } from '../table.js';
import { int8Array } from '../util/arrays.js';
import { decodeIPC } from './decode-ipc.js';

/**
 * Decode [Apache Arrow IPC data][1] and return a new Table. The input binary
 * data may be either an `ArrayBuffer` or `Uint8Array`. For Arrow data in the
 * [IPC 'stream' format][2], an array of `Uint8Array` values is also supported.
 *
 * [1]: https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc
 * [2]: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
 * @param {ArrayBufferLike | Uint8Array | Uint8Array[]} data
 *  The source byte buffer, or an array of buffers. If an array, each byte
 *  array may contain one or more self-contained messages. Messages may NOT
 *  span multiple byte arrays.
 * @param {ExtractionOptions} [options]
 *  Options for controlling how values are transformed when extracted
 *  from an Arrow binary representation.
 * @returns {Table} A Table instance.
 */
export function tableFromIPC(data, options) {
  // decodeIPC parses the raw bytes into { schema, dictionaries, records };
  // createTable then materializes columns from that parsed structure.
  return createTable(decodeIPC(data), options);
}

/**
 * Create a table from parsed IPC data.
 * @param {ArrowData} data
 *  The IPC data, as returned by parseIPC.
 * @param {ExtractionOptions} [options]
 *  Options for controlling how values are transformed when extracted
 *  from an Arrow binary representation.
 * @returns {Table} A Table instance.
 */
export function createTable(data, options = {}) {
  const { schema = { fields: [] }, dictionaries, records } = data;
  const { version, fields } = schema;
  const dictionaryMap = new Map;
  const context = contextGenerator(options, version, dictionaryMap);

  // build dictionary type map: dictionary id -> the dictionary's value type.
  // Dictionary-encoded fields may appear at any nesting depth, so the whole
  // schema tree is walked, not just the top-level fields.
  const dictionaryTypes = new Map;
  visitSchemaFields(schema, field => {
    const type = field.type;
    if (type.typeId === Type.Dictionary) {
      dictionaryTypes.set(type.id, type.dictionary);
    }
  });

  // decode dictionaries, build dictionary column map.
  // Batches must be processed in file order: a non-delta batch replaces any
  // prior content for that id, while a delta batch appends to it.
  const dicts = new Map;
  for (const dict of dictionaries) {
    const { id, data, isDelta, body } = dict;
    const type = dictionaryTypes.get(id);
    const batch = visit(type, context({ ...data, body }));
    if (!dicts.has(id)) {
      // first batch for this id must be a full (non-delta) dictionary
      if (isDelta) {
        throw new Error('Delta update can not be first dictionary batch.');
      }
      dicts.set(id, columnBuilder(type).add(batch));
    } else {
      const dict = dicts.get(id);
      if (!isDelta) dict.clear(); // replacement batch: drop prior values
      dict.add(batch);
    }
  }
  // finalize dictionary columns so record batches can resolve them by id
  dicts.forEach((value, key) => dictionaryMap.set(key, value.done()));

  // decode column fields: one builder per top-level field, fed one batch
  // per record batch in order
  const cols = fields.map(f => columnBuilder(f.type));
  for (const batch of records) {
    const ctx = context(batch);
    fields.forEach((f, i) => cols[i].add(visit(f.type, ctx)));
  }

  return new Table(schema, cols.map(c => c.done()), options.useProxy);
}

/**
 * Visit all fields within a schema, recursing into both dictionary value
 * types and child types so nested fields are included.
 * @param {Schema} schema
 * @param {(field: Field) => void} visitor
 */
function visitSchemaFields(schema, visitor) {
  schema.fields.forEach(function visitField(field) {
    visitor(field);
    // @ts-ignore
    field.type.dictionary?.children?.forEach(visitField);
    // @ts-ignore
    field.type.children?.forEach(visitField);
  });
}

/**
 * Context object generator for field visitation and buffer definition.
 * The returned per-batch context is stateful: each call to node(), buffer(),
 * or variadic() consumes the NEXT entry in the batch's flattened node /
 * buffer-region / variadic-length lists, so visitation order must exactly
 * match IPC serialization order.
 * @param {ExtractionOptions} options value-extraction options, passed through
 * @param {number} version Arrow IPC metadata version
 * @param {Map} dictionaryMap map from dictionary id to decoded column
 */
function contextGenerator(options, version, dictionaryMap) {
  const base = {
    version,
    options,
    dictionary: id => dictionaryMap.get(id),
  };
  /**
   * Return a context generator.
   * @param {RecordBatch} batch
   */
  return batch => {
    const { length, nodes, regions, compression, variadic, body } = batch;
    let nodeIndex = -1;
    let bufferIndex = -1;
    let variadicIndex = -1;
    return {
      ...base,
      length,
      // next { length, nullCount } field node
      node: () => nodes[++nodeIndex],
      // next buffer region, as a typed-array view when ArrayType is given,
      // otherwise as a raw byte subarray
      buffer: (ArrayType) => {
        const { bytes, length, offset } = maybeDecompress(body, regions[++bufferIndex], compression);
        return ArrayType
          ? new ArrayType(bytes.buffer, bytes.byteOffset + offset, length / ArrayType.BYTES_PER_ELEMENT)
          : bytes.subarray(offset, offset + length)
      },
      // next variadic buffer count (used by Utf8View / BinaryView)
      variadic: () => variadic[++variadicIndex],
      // visit child fields against this same (shared, stateful) context
      visit(children) {
        return children.map(f => visit(f.type, this));
      }
    };
  };
}

/**
 * Prepare an arrow buffer for use, potentially decompressing it.
 * When no compression is declared, the region is returned as offsets into
 * the shared batch body; otherwise the region is decompressed with the
 * codec named in the compression metadata.
 * @param {Uint8Array} body
 * @param {{offset: number, length: number}} region
 * @param {BodyCompression} compression
 */
function maybeDecompress(body, region, compression) {
  if (!compression) {
    return { bytes: body, ...region };
  } else if (compression.method !== BodyCompressionMethod.BUFFER) {
    throw new Error(`Unknown compression method (${compression.method})`);
  } else {
    const id = compression.codec;
    const codec = getCompressionCodec(id);
    if (!codec) throw new Error(missingCodec(id));
    return decompressBuffer(body, region, codec);
  }
}

/**
 * Visit a field, instantiating views of buffer regions.
 * Each buffer() / node() / variadic() call consumes the next entry from the
 * batch's stateful context, so the per-type buffer reads below must stay in
 * exact IPC serialization order.
 * @param {*} type the field's data type (carries typeId plus per-type
 *  layout info such as values/offsets array types and children)
 * @param {*} ctx per-batch decoding context from contextGenerator
 * @returns a Batch instance of the type-appropriate BatchType class
 */
function visit(type, ctx) {
  const { typeId } = type;
  const { options, node, buffer, variadic, version } = ctx;
  const BatchType = batchType(type, options);

  // extract the next { length, nullCount } field node - ALL fields have field nodes
  const base = { ...node(), type };

  if (typeId === Type.Null) {
    // null fields have field nodes but no data buffers
    return new BatchType({ ...base, nullCount: base.length });
  }

  switch (typeId) {
    // validity and data value buffers
    case Type.Bool:
    case Type.Int:
    case Type.Time:
    case Type.Duration:
    case Type.Float:
    case Type.Decimal:
    case Type.Date:
    case Type.Timestamp:
    case Type.Interval:
    case Type.FixedSizeBinary:
      return new BatchType({
        ...base,
        validity: buffer(),
        values: buffer(type.values)
      });

    // validity, offset, and value buffers
    case Type.Utf8:
    case Type.LargeUtf8:
    case Type.Binary:
    case Type.LargeBinary:
      return new BatchType({
        ...base,
        validity: buffer(),
        offsets: buffer(type.offsets),
        values: buffer()
      });

    // views with variadic buffers
    case Type.BinaryView:
    case Type.Utf8View:
      return new BatchType({
        ...base,
        validity: buffer(),
        values: buffer(), // views buffer
        data: Array.from({ length: variadic() }, () => buffer()) // data buffers
      });

    // validity, offset, and list child
    case Type.List:
    case Type.LargeList:
    case Type.Map:
      return new BatchType({
        ...base,
        validity: buffer(),
        offsets: buffer(type.offsets),
        children: ctx.visit(type.children)
      });

    // validity, offset, size, and list child
    case Type.ListView:
    case Type.LargeListView:
      return new BatchType({
        ...base,
        validity: buffer(),
        offsets: buffer(type.offsets),
        // sizes share the offsets' array type (both are 32- or 64-bit ints)
        sizes: buffer(type.offsets),
        children: ctx.visit(type.children)
      });

    // validity and children
    case Type.FixedSizeList:
    case Type.Struct:
      return new BatchType({
        ...base,
        validity: buffer(),
        children: ctx.visit(type.children)
      });

    // children only
    case Type.RunEndEncoded:
      return new BatchType({
        ...base,
        children: ctx.visit(type.children)
      });

    // dictionary
    case Type.Dictionary: {
      const { id, indices } = type;
      // the indices buffer is decoded here; the dictionary values were
      // decoded earlier and are resolved by id from the shared map
      return new BatchType({
        ...base,
        validity: buffer(),
        values: buffer(indices.values),
      }).setDictionary(ctx.dictionary(id));
    }

    // union
    case Type.Union: {
      if (version < Version.V5) {
        buffer(); // skip unused null bitmap
      }
      return new BatchType({
        ...base,
        typeIds: buffer(int8Array),
        // sparse unions have no offsets buffer
        offsets: type.mode === UnionMode.Sparse ? null : buffer(type.offsets),
        children: ctx.visit(type.children)
      });
    }

    // unsupported type
    default: throw new Error(invalidDataType(typeId));
  }
}