@uwdata/flechette
Version:
Fast, lightweight access to Apache Arrow data.
128 lines (117 loc) • 4.14 kB
JavaScript
/**
* @import { ArrowData, Version_ } from '../types.js'
*/
import { MAGIC, MessageHeader, Version } from '../constants.js';
import { readInt16, readInt32, readObject } from '../util/read.js';
import { decodeBlocks } from './block.js';
import { decodeMessage } from './message.js';
import { decodeMetadata } from './metadata.js';
import { decodeSchema } from './schema.js';
/**
 * Decode [Apache Arrow IPC data][1], returning the parsed schema along with
 * record batch and dictionary batch definitions. Input may be an
 * `ArrayBuffer`, a `Uint8Array`, or (for the IPC 'stream' format only) an
 * array of `Uint8Array` instances.
 *
 * An `ArrayBuffer` is first wrapped in a `Uint8Array`. A `Uint8Array` that
 * passes `isArrowFileFormat` is handed to `decodeIPCFile`; every other input
 * is handed to `decodeIPCStream`.
 *
 * This method stops short of generating views over field buffers. Use the
 * `createData()` method on the result to enable column data access.
 *
 * [1]: https://arrow.apache.org/docs/format/Columnar.html#serialization-and-interprocess-communication-ipc
 * @param {ArrayBuffer | Uint8Array | Uint8Array[]} data
 *  The source byte buffer, or an array of buffers. If an array, each byte
 *  array may contain one or more self-contained messages. Messages may NOT
 *  span multiple byte arrays.
 * @returns {ArrowData} The decoded schema, dictionaries, records, and
 *  metadata.
 */
export function decodeIPC(data) {
  // normalize a raw ArrayBuffer to a byte view; leave other inputs untouched
  let bytes = data;
  if (data instanceof ArrayBuffer) {
    bytes = new Uint8Array(data);
  }

  // only a single contiguous byte array can be in the 'file' format
  const isFile = bytes instanceof Uint8Array && isArrowFileFormat(bytes);
  if (isFile) {
    return decodeIPCFile(bytes);
  }
  return decodeIPCStream(bytes);
}
/**
 * Test whether a byte buffer begins with the Arrow file magic bytes
 * (`MAGIC`), which is how the IPC 'file' format is told apart from the
 * 'stream' format.
 * @param {Uint8Array} buf The bytes to inspect.
 * @returns {boolean} True if the leading bytes of `buf` equal `MAGIC`,
 *  false otherwise (including when `buf` is missing or too short).
 */
function isArrowFileFormat(buf) {
  // derive both the length guard and the loop bound from MAGIC itself, so
  // the two cannot drift apart
  const size = MAGIC.length;
  if (!buf || buf.length < size) return false;
  for (let i = 0; i < size; ++i) {
    if (MAGIC[i] !== buf[i]) return false;
  }
  return true;
}
/**
 * Decode data in the [Arrow IPC 'stream' format][1].
 *
 * Each buffer is scanned message by message using `decodeMessage`, until it
 * returns `null`. Messages without content are skipped. The first schema
 * message is kept and later ones are ignored; record batch and dictionary
 * batch contents are collected in the order encountered. The returned
 * `metadata` is always `null`.
 *
 * [1]: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
 * @param {Uint8Array | Uint8Array[]} data The source byte buffer, or an
 *  array of buffers. If an array, each byte array may contain one or more
 *  self-contained messages. Messages may NOT span multiple byte arrays.
 * @returns {ArrowData} The decoded schema, dictionaries, and records.
 * @throws {Error} If any buffer is not a `Uint8Array`.
 */
export function decodeIPCStream(data) {
  // treat a single buffer and an array of buffers uniformly
  const buffers = [data].flat();

  let schema;
  const records = [];
  const dictionaries = [];

  for (const bytes of buffers) {
    if (!(bytes instanceof Uint8Array)) {
      throw new Error(`IPC data batch was not a Uint8Array.`);
    }

    // step through the messages of this buffer; a null result ends the scan
    let cursor = 0;
    for (
      let message = decodeMessage(bytes, cursor);
      message !== null;
      message = decodeMessage(bytes, cursor)
    ) {
      // advance past this message before deciding what to do with it
      cursor = message.index;

      const content = message.content;
      if (!content) continue;

      const kind = message.type;
      if (kind === MessageHeader.Schema) {
        // only the first schema message is retained
        if (!schema) schema = content;
      } else if (kind === MessageHeader.RecordBatch) {
        records.push(content);
      } else if (kind === MessageHeader.DictionaryBatch) {
        dictionaries.push(content);
      }
    }
  }

  return /** @type {ArrowData} */ (
    { schema, dictionaries, records, metadata: null }
  );
}
/**
 * Decode data in the [Arrow IPC 'file' format][1].
 *
 * The footer length is read as an int32 located `MAGIC.length + 4` bytes
 * before the end of the buffer, and the footer object is read starting that
 * many bytes earlier. The footer supplies the schema, the custom metadata,
 * and the block locations of dictionary and record batches; each block is
 * then decoded with `decodeMessage` at its recorded offset.
 *
 * [1]: https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format
 * @param {Uint8Array} data The source byte buffer.
 * @returns {ArrowData} The decoded schema, dictionaries, records, and
 *  metadata.
 */
export function decodeIPCFile(data) {
  // the footer length sits just before the trailing magic bytes
  const lengthOffset = data.byteLength - (MAGIC.length + 4);
  const footerLength = readInt32(data, lengthOffset);

  // footer field offsets:
  //  4: version
  //  6: schema
  //  8: dictionaries (vector)
  // 10: batches (vector)
  // 12: metadata
  const field = readObject(data, lengthOffset - footerLength);

  // Version.V1 is supplied as the default when reading the version field
  const version = /** @type {Version_} */
    (field(4, readInt16, Version.V1));
  const dictionaryBlocks = field(8, decodeBlocks, []);
  const recordBlocks = field(10, decodeBlocks, []);

  // decode the message stored at a block's offset and keep only its content
  const contentAt = (block) => decodeMessage(data, block.offset).content;

  const schema = field(6, (buf, index) => decodeSchema(buf, index, version));
  const dictionaries = dictionaryBlocks.map(contentAt);
  const records = recordBlocks.map(contentAt);
  const metadata = field(12, decodeMetadata);

  return /** @type {ArrowData} */ ({ schema, dictionaries, records, metadata });
}