UNPKG

@uwdata/flechette

Version:

Fast, lightweight access to Apache Arrow data.

320 lines (290 loc) 8.84 kB
/** * @import { Batch, DictionaryBatch } from '../batch.js' * @import { Column } from '../column.js' * @import { Table } from '../table.js' * @import { DataType, RecordBatch, Schema, TypedArray } from '../types.js' * @import { Sink } from './sink.js'; */ import { Type, UnionMode } from '../constants.js'; import { invalidDataType } from '../data-types.js'; import { encodeIPC } from './encode-ipc.js'; /** * Encode an Arrow table into Arrow IPC binary format. * @param {Table} table The Arrow table to encode. * @param {object} options Encoding options. * @param {Sink} [options.sink] IPC byte consumer. * @param {'stream' | 'file'} [options.format] Arrow stream or file format. * @returns {Uint8Array | null} The generated bytes (for an in-memory sink) * or null (if using a sink that writes bytes elsewhere). */ export function tableToIPC(table, options) { // accept a format string option for Arrow-JS compatibility if (typeof options === 'string') { options = { format: options }; } const columns = table.children; checkBatchLengths(columns); const { dictionaries, idMap } = assembleDictionaryBatches(columns); const records = assembleRecordBatches(columns); const schema = assembleSchema(table.schema, idMap); const data = { schema, dictionaries, records }; return encodeIPC(data, options).finish(); } function checkBatchLengths(columns) { const n = columns[0]?.data.map(d => d.length); columns.forEach(({ data }) => { if (data.length !== n.length || data.some((b, i) => b.length !== n[i])) { throw new Error('Columns have inconsistent batch sizes.'); } }); } /** * Create a new assembly context. 
/**
 * Create a new assembly context that accumulates field nodes, buffer
 * regions, raw buffers, and variadic buffer counts for one record batch.
 */
function assembleContext() {
  const nodes = [];
  const regions = [];
  const buffers = [];
  const variadic = [];
  let byteLength = 0;
  return {
    /**
     * Record a field node (length and null count).
     * @param {number} length
     * @param {number} nullCount
     */
    node(length, nullCount) {
      nodes.push({ length, nullCount });
    },
    /**
     * Record a buffer and its region, padding region lengths up to the
     * next multiple of 8 bytes per the Arrow IPC alignment rule.
     * @param {TypedArray} b
     */
    buffer(b) {
      const size = b.byteLength;
      const length = (size + 7) & ~7; // round up to 8-byte alignment
      regions.push({ offset: byteLength, length });
      byteLength += length;
      buffers.push(new Uint8Array(b.buffer, b.byteOffset, size));
    },
    /**
     * Record the number of variadic data buffers for a view batch.
     * @param {number} length
     */
    variadic(length) {
      variadic.push(length);
    },
    /**
     * Recurse into the child batches of a nested type.
     * @param {DataType} type
     * @param {Batch} batch
     */
    children(type, batch) {
      // @ts-ignore
      const fields = type.children;
      fields.forEach((field, index) => visit(field.type, batch.children[index], this));
    },
    /**
     * Finish assembly and return the record batch description.
     * @returns {RecordBatch}
     */
    done() {
      return { byteLength, nodes, regions, variadic, buffers };
    }
  };
}

/**
 * Assemble dictionary batches and their unique ids.
 * @param {Column[]} columns The table columns.
 * @returns {{
 *  dictionaries: DictionaryBatch[],
 *  idMap: Map<DataType, number>
 * }}
 *  The assembled dictionary batches and a map from dictionary value
 *  types to dictionary ids.
 */
function assembleDictionaryBatches(columns) {
  const dictionaries = [];
  const dictMap = new Map; // dictionary column -> assigned id
  const idMap = new Map;   // dictionary value type -> assigned id
  let nextId = -1;

  // register a dictionary column, assigning an id on first encounter
  const visitor = (dictColumn) => {
    let id = dictMap.get(dictColumn);
    if (id === undefined) {
      id = ++nextId;
      dictMap.set(dictColumn, id);
      // emit one dictionary batch per column batch; batches after the
      // first are deltas that extend the dictionary
      dictColumn.data.forEach((_, i) => {
        dictionaries.push({
          id,
          isDelta: i > 0,
          data: assembleRecordBatch([dictColumn], i)
        });
      });
    }
    idMap.set(dictColumn.type, id);
  };

  // recurse through column batches to find dictionaries;
  // visiting the first batch suffices, as all batches of a column
  // share the same dictionary column
  columns.forEach(col => visitDictionaries(col.data[0], visitor));
  return { dictionaries, idMap };
}

/**
 * Traverse column batches to visit dictionary columns.
 * @param {Batch} batch
 * @param {(column: Column) => void} visitor
 */
function visitDictionaries(batch, visitor) {
  if (batch == null) return;
  if (batch.type.typeId === Type.Dictionary) {
    // @ts-ignore - batch has type DictionaryBatch
    const dictionary = batch.dictionary;
    visitor(dictionary);
    // a dictionary's values may themselves be dictionary-encoded
    visitDictionaries(dictionary.data[0], visitor);
  }
  batch.children?.forEach(child => visitDictionaries(child, visitor));
}

/**
 * Assemble a schema with resolved dictionary ids.
 * @param {Schema} schema The schema.
 * @param {Map<DataType, number>} idMap A map
 *  from dictionary value types to dictionary ids.
 * @returns {Schema} A new schema with resolved dictionary ids. If there
 *  are no dictionaries, the input schema is returned unchanged.
 */
function assembleSchema(schema, idMap) {
  // early exit if no dictionaries
  if (idMap.size === 0) return schema;

  // visit a (copied) type, resolving dictionary ids and copying children
  const visitType = (type) => {
    if (type.typeId === Type.Dictionary) {
      type.id = idMap.get(type.dictionary); // look up and set id
      cloneDictValues(type);
    }
    if (type.children) {
      type.children = type.children.slice();
      type.children.forEach(cloneField);
    }
  };

  // replace a field in its array with a copy carrying a copied type,
  // so the input schema is never mutated
  const cloneField = (field, index, array) => {
    const type = { ...field.type };
    array[index] = { ...field, type };
    visitType(type);
  };

  // copy a dictionary's value type before visiting it
  const cloneDictValues = (parentType) => {
    const values = { ...parentType.dictionary };
    parentType.dictionary = values;
    visitType(values);
  };

  const fields = schema.fields.slice();
  fields.forEach(cloneField);
  return { ...schema, fields };
}

/**
 * Assemble record batches with marshalled buffers.
 * @param {Column[]} columns The table columns.
 * @returns {RecordBatch[]} The assembled record batches.
 */
function assembleRecordBatches(columns) {
  const batches = columns[0]?.data ?? [];
  return batches.map((_, index) => assembleRecordBatch(columns, index));
}

/**
 * Assemble a record batch with marshalled buffers.
 * @param {Column[]} columns The table columns.
 * @param {number} batchIndex The batch index.
 * @returns {RecordBatch} The assembled record batch.
 */
function assembleRecordBatch(columns, batchIndex = 0) {
  const ctx = assembleContext();
  for (const column of columns) {
    visit(column.type, column.data[batchIndex], ctx);
  }
  return ctx.done();
}

/**
 * Visit a column batch, assembling buffer data. Buffers are emitted in
 * the order mandated by the Arrow IPC layout for each type.
 * @param {DataType} type The data type.
 * @param {Batch} batch The column batch.
 * @param {ReturnType<assembleContext>} ctx The assembly context.
 */
function visit(type, batch, ctx) {
  const typeId = type.typeId;

  // null type: no field node, no buffers
  if (typeId === Type.Null) return;

  // record field node info
  ctx.node(batch.length, batch.nullCount);

  switch (typeId) {
    // fixed-width types: validity and value buffers
    // (backing dictionaries are assembled elsewhere)
    case Type.Bool:
    case Type.Int:
    case Type.Time:
    case Type.Duration:
    case Type.Float:
    case Type.Date:
    case Type.Timestamp:
    case Type.Decimal:
    case Type.Interval:
    case Type.FixedSizeBinary:
    case Type.Dictionary: // dictionary key values only
      ctx.buffer(batch.validity);
      ctx.buffer(batch.values);
      break;

    // variable-width types: validity, offset, and value buffers
    case Type.Utf8:
    case Type.LargeUtf8:
    case Type.Binary:
    case Type.LargeBinary:
      ctx.buffer(batch.validity);
      ctx.buffer(batch.offsets);
      ctx.buffer(batch.values);
      break;

    // view types: validity, views, and variadic data buffers
    case Type.BinaryView:
    case Type.Utf8View: {
      ctx.buffer(batch.validity);
      ctx.buffer(batch.values);
      // @ts-ignore
      const data = batch.data;
      ctx.variadic(data.length);
      data.forEach(b => ctx.buffer(b));
      break;
    }

    // list types: validity, offsets, and list child
    case Type.List:
    case Type.LargeList:
    case Type.Map:
      ctx.buffer(batch.validity);
      ctx.buffer(batch.offsets);
      ctx.children(type, batch);
      break;

    // list-view types: validity, offsets, sizes, and list child
    case Type.ListView:
    case Type.LargeListView:
      ctx.buffer(batch.validity);
      ctx.buffer(batch.offsets);
      ctx.buffer(batch.sizes);
      ctx.children(type, batch);
      break;

    // validity and children
    case Type.FixedSizeList:
    case Type.Struct:
      ctx.buffer(batch.validity);
      ctx.children(type, batch);
      break;

    // children only
    case Type.RunEndEncoded:
      ctx.children(type, batch);
      break;

    // union: type ids, offsets (dense mode only), and children
    case Type.Union:
      // @ts-ignore
      ctx.buffer(batch.typeIds);
      if (type.mode === UnionMode.Dense) {
        ctx.buffer(batch.offsets);
      }
      ctx.children(type, batch);
      break;

    // unsupported type
    default:
      throw new Error(invalidDataType(typeId));
  }
}