@uwdata/flechette
Fast, lightweight access to Apache Arrow data.
import { Type, UnionMode } from '../constants.js';
import { invalidDataType } from '../data-types.js';
import { encodeIPC } from './encode-ipc.js';

/**
 * Encode an Arrow table into Arrow IPC binary format.
 * @param {import('../table.js').Table} table The Arrow table to encode.
 * @param {object} options Encoding options.
 * @param {import('./sink.js').Sink} [options.sink] IPC byte consumer.
 * @param {'stream' | 'file'} [options.format] Arrow stream or file format.
 * @returns {Uint8Array | null} The generated bytes (for an in-memory sink)
 *  or null (if using a sink that writes bytes elsewhere).
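 * @example
 * // Hypothetical sketch, assuming `table` is an existing flechette Table:
 * // produce in-memory Arrow IPC stream-format bytes (no sink provided).
 * const bytes = tableToIPC(table, { format: 'stream' });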
 */
export function tableToIPC(table, options) {
  // accept a format string option for Arrow-JS compatibility
  if (typeof options === 'string') {
    options = { format: options };
  }
  const columns = table.children;
  checkBatchLengths(columns);
  const { dictionaries, idMap } = assembleDictionaryBatches(columns);
  const records = assembleRecordBatches(columns);
  const schema = assembleSchema(table.schema, idMap);
  const data = { schema, dictionaries, records };
  return encodeIPC(data, options).finish();
}

/**
 * Check that all columns have the same number of batches with matching
 * per-batch lengths, throwing an error otherwise.
 * @param {import('../column.js').Column[]} columns The table columns.
 */
function checkBatchLengths(columns) {
  const n = columns[0]?.data.map(d => d.length);
  columns.forEach(({ data }) => {
    if (data.length !== n.length || data.some((b, i) => b.length !== n[i])) {
      throw new Error('Columns have inconsistent batch sizes.');
    }
  });
}
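
// For example (hypothetical): per-column batch lengths [[5, 3], [5, 3]]
// pass the check above, while [[5, 3], [5, 2]] or [[5, 3], [5]] throw.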

/**
 * Create a new assembly context.
 */
function assembleContext() {
  let byteLength = 0;
  const nodes = [];
  const regions = [];
  const buffers = [];
  const variadic = [];
  return {
    /**
     * @param {number} length
     * @param {number} nullCount
     */
    node(length, nullCount) {
      nodes.push({ length, nullCount });
    },
    /**
     * @param {import('../types.js').TypedArray} b
     */
    buffer(b) {
      const size = b.byteLength;
      // pad the buffer region to an 8-byte boundary, per the Arrow IPC format
      const length = ((size + 7) & ~7);
      regions.push({ offset: byteLength, length });
      byteLength += length;
      buffers.push(new Uint8Array(b.buffer, b.byteOffset, size));
    },
    /**
     * @param {number} length
     */
    variadic(length) {
      variadic.push(length);
    },
    /**
     * @param {import('../types.js').DataType} type
     * @param {import('../batch.js').Batch} batch
     */
    children(type, batch) {
      // @ts-ignore
      type.children.forEach((field, index) => {
        visit(field.type, batch.children[index], this);
      });
    },
    /**
     * @returns {import('../types.js').RecordBatch}
     */
    done() {
      return { byteLength, nodes, regions, variadic, buffers };
    }
  };
}
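
// Hypothetical sketch of the context protocol driven by visit() below:
// const ctx = assembleContext();
// ctx.node(3, 0);                     // field node: 3 values, no nulls
// ctx.buffer(new Uint8Array(0));      // empty validity bitmap
// ctx.buffer(Int32Array.of(1, 2, 3)); // 12 bytes of values, padded to 16
// const { byteLength, nodes, regions, buffers } = ctx.done();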

/**
 * Assemble dictionary batches and their unique ids.
 * @param {import('../column.js').Column[]} columns The table columns.
 * @returns {{
 *   dictionaries: import('../types.js').DictionaryBatch[],
 *   idMap: Map<import('../types.js').DataType, number>
 * }}
 *  The assembled dictionary batches and a map from dictionary value
 *  types to dictionary ids.
 */
function assembleDictionaryBatches(columns) {
  const dictionaries = [];
  const dictMap = new Map;
  const idMap = new Map;
  let id = -1;
  // track dictionaries, key by dictionary column, assign ids
  const visitor = dictionaryColumn => {
    if (!dictMap.has(dictionaryColumn)) {
      dictMap.set(dictionaryColumn, ++id);
      for (let i = 0; i < dictionaryColumn.data.length; ++i) {
        dictionaries.push({
          id,
          isDelta: i > 0,
          data: assembleRecordBatch([dictionaryColumn], i)
        });
      }
      idMap.set(dictionaryColumn.type, id);
    } else {
      idMap.set(dictionaryColumn.type, dictMap.get(dictionaryColumn));
    }
  };
  // recurse through column batches to find dictionaries
  // it is sufficient to visit the first batch only,
  // as all batches have the same dictionary column
  columns.forEach(col => visitDictionaries(col.data[0], visitor));
  return { dictionaries, idMap };
}
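
// For example (hypothetical): a dictionary column whose values arrive in two
// batches yields two dictionary batches sharing a single id, the second
// flagged as a delta batch:
//   { id: 0, isDelta: false, data: <assembled batch 0> }
//   { id: 0, isDelta: true,  data: <assembled batch 1> }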

/**
 * Traverse column batches to visit dictionary columns.
 * @param {import('../batch.js').Batch} batch
 * @param {(column: import('../column.js').Column) => void} visitor
 */
function visitDictionaries(batch, visitor) {
  if (batch?.type.typeId === Type.Dictionary) {
    // @ts-ignore - batch has type DictionaryBatch
    const dictionary = batch.dictionary;
    visitor(dictionary);
    visitDictionaries(dictionary.data[0], visitor);
  }
  batch?.children?.forEach(child => visitDictionaries(child, visitor));
}
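
// For example (hypothetical): given a List<Dictionary<Utf8>> column, the
// traversal above descends into the list child batch, finds the dictionary
// batch there, and passes its backing dictionary column to the visitor.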

/**
 * Assemble a schema with resolved dictionary ids.
 * @param {import('../types.js').Schema} schema The schema.
 * @param {Map<import('../types.js').DataType, number>} idMap A map
 *  from dictionary value types to dictionary ids.
 * @returns {import('../types.js').Schema} A new schema with resolved
 *  dictionary ids. If there are no dictionaries, the input schema is
 *  returned unchanged.
 */
function assembleSchema(schema, idMap) {
  // early exit if no dictionaries
  if (!idMap.size) return schema;
  const visit = type => {
    if (type.typeId === Type.Dictionary) {
      type.id = idMap.get(type.dictionary); // lookup and set id
      visitDictType(type);
    }
    if (type.children) {
      (type.children = type.children.slice()).forEach(visitFields);
    }
  };
  // visit a field in a field array
  const visitFields = (field, index, array) => {
    const type = { ...field.type };
    array[index] = { ...field, type };
    visit(type);
  };
  // visit a dictionary values type
  const visitDictType = (parentType) => {
    const type = { ...parentType.dictionary };
    parentType.dictionary = type;
    visit(type);
  };
  schema = { ...schema, fields: schema.fields.slice() };
  schema.fields.forEach(visitFields);
  return schema;
}
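
// Hypothetical sketch: a dictionary-typed field is copied (so the input
// schema is not mutated) and gains its resolved dictionary id, e.g.
//   before: { typeId: Type.Dictionary, dictionary: valueType, ... }
//   after:  { typeId: Type.Dictionary, dictionary: valueType, id: 0, ... }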

/**
 * Assemble record batches with marshalled buffers.
 * @param {import('../column.js').Column[]} columns The table columns.
 * @returns {import('../types.js').RecordBatch[]} The assembled record batches.
 */
function assembleRecordBatches(columns) {
  return (columns[0]?.data || [])
    .map((_, index) => assembleRecordBatch(columns, index));
}

/**
 * Assemble a record batch with marshalled buffers.
 * @param {import('../column.js').Column[]} columns The table columns.
 * @param {number} batchIndex The batch index.
 * @returns {import('../types.js').RecordBatch} The assembled record batch.
 */
function assembleRecordBatch(columns, batchIndex = 0) {
  const ctx = assembleContext();
  columns.forEach(column => {
    visit(column.type, column.data[batchIndex], ctx);
  });
  return ctx.done();
}
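
// Hypothetical sketch: assemble batch 0 across all table columns, yielding
// the field nodes, buffer regions, and raw bytes for that record batch.
// const { byteLength, nodes, regions, variadic, buffers } =
//   assembleRecordBatch(table.children, 0);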

/**
 * Visit a column batch, assembling buffer data.
 * @param {import('../types.js').DataType} type The data type.
 * @param {import('../batch.js').Batch} batch The column batch.
 * @param {ReturnType<assembleContext>} ctx The assembly context.
 */
function visit(type, batch, ctx) {
  const { typeId } = type;
  // no field node, no buffers
  if (typeId === Type.Null) return;
  // record field node info
  ctx.node(batch.length, batch.nullCount);
  switch (typeId) {
    // validity and value buffers
    // backing dictionaries handled elsewhere
    case Type.Bool:
    case Type.Int:
    case Type.Time:
    case Type.Duration:
    case Type.Float:
    case Type.Date:
    case Type.Timestamp:
    case Type.Decimal:
    case Type.Interval:
    case Type.FixedSizeBinary:
    case Type.Dictionary: // dict key values
      ctx.buffer(batch.validity);
      ctx.buffer(batch.values);
      return;
    // validity, offset, and value buffers
    case Type.Utf8:
    case Type.LargeUtf8:
    case Type.Binary:
    case Type.LargeBinary:
      ctx.buffer(batch.validity);
      ctx.buffer(batch.offsets);
      ctx.buffer(batch.values);
      return;
    // views with variadic buffers
    case Type.BinaryView:
    case Type.Utf8View:
      ctx.buffer(batch.validity);
      ctx.buffer(batch.values);
      // @ts-ignore
      ctx.variadic(batch.data.length);
      // @ts-ignore
      batch.data.forEach(b => ctx.buffer(b));
      return;
    // validity, offset, and list child
    case Type.List:
    case Type.LargeList:
    case Type.Map:
      ctx.buffer(batch.validity);
      ctx.buffer(batch.offsets);
      ctx.children(type, batch);
      return;
    // validity, offset, size, and list child
    case Type.ListView:
    case Type.LargeListView:
      ctx.buffer(batch.validity);
      ctx.buffer(batch.offsets);
      ctx.buffer(batch.sizes);
      ctx.children(type, batch);
      return;
    // validity and children
    case Type.FixedSizeList:
    case Type.Struct:
      ctx.buffer(batch.validity);
      ctx.children(type, batch);
      return;
    // children only
    case Type.RunEndEncoded:
      ctx.children(type, batch);
      return;
    // union
    case Type.Union: {
      // @ts-ignore
      ctx.buffer(batch.typeIds);
      if (type.mode === UnionMode.Dense) {
        ctx.buffer(batch.offsets);
      }
      ctx.children(type, batch);
      return;
    }
    // unsupported type
    default:
      throw new Error(invalidDataType(typeId));
  }
}
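
// For example (hypothetical): visiting a Utf8 batch above records one field
// node plus three buffers (validity bitmap, offsets, and UTF-8 value bytes).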