// apache-arrow
// Version:
// Apache Arrow columnar in-memory format
// 193 lines (191 loc) • 9.06 kB
// JavaScript
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import { Visitor } from '../visitor';
import { UnionMode } from '../enum';
import { RecordBatch } from '../recordbatch';
import { rebaseValueOffsets } from '../util/buffer';
import { packBools, truncateBitmap } from '../util/bit';
import { selectVectorChildrenArgs } from '../util/args';
import { BufferRegion, FieldNode } from '../ipc/metadata/message';
import { DataType, } from '../type';
/**
 * Visitor that walks vectors and accumulates, in visit order, the field
 * nodes, the raw buffers, and the 8-byte-aligned buffer regions describing
 * where each buffer will sit in the assembled body.
 * @ignore
 */
export class VectorAssembler extends Visitor {
    constructor() {
        super();
        // Running total of the padded byte lengths of every buffer added so far.
        this._byteLength = 0;
        this._nodes = [];
        this._buffers = [];
        this._bufferRegions = [];
    }
    /** Field nodes (length + null count), one per non-dictionary vector visited. */
    get nodes() { return this._nodes; }
    /** Buffers collected so far, in the order they were added. */
    get buffers() { return this._buffers; }
    /** Sum of the padded lengths of all collected buffers. */
    get byteLength() { return this._byteLength; }
    /** Offset/length regions, parallel to `buffers`. */
    get bufferRegions() { return this._bufferRegions; }
    /**
     * Creates a fresh assembler and runs it over the vectors selected from `args`.
     * Falls back to the (empty) assembler when the first visit result is undefined.
     * @nocollapse
     */
    static assemble(...args) {
        const assembler = new VectorAssembler();
        const targets = selectVectorChildrenArgs(RecordBatch, args);
        const visited = assembler.visitMany(targets);
        const [firstResult = assembler] = visited;
        return firstResult;
    }
    /**
     * Records the validity buffer and field node for `vector` (dictionary
     * vectors are skipped here and handled through `visitDictionary`), then
     * dispatches to the type-specific visitor.
     */
    visit(vector) {
        if (DataType.isDictionary(vector.type)) {
            return super.visit(vector);
        }
        const { data, length, nullCount } = vector;
        if (length > 0x7fffffff) {
            /* istanbul ignore next */
            throw new RangeError('Cannot write arrays larger than 2^31 - 1 in length');
        }
        // Null vectors carry no validity buffer at all.
        if (!DataType.isNull(vector.type)) {
            const validity = nullCount <= 0
                ? new Uint8Array(0) // placeholder validity buffer
                : truncateBitmap(data.offset, length, data.nullBitmap);
            addBuffer.call(this, validity);
        }
        this.nodes.push(new FieldNode(length, nullCount));
        return super.visit(vector);
    }
    /** Null vectors contribute no data buffers. */
    visitNull(_nullV) {
        return this;
    }
    /** Only the indices are assembled here; the dictionary itself is assembled separately. */
    visitDictionary(vector) {
        return this.visit(vector.indices);
    }
}
/**
 * Appends `values` to the assembler bound as `this` and records a region for
 * it that starts at the current running byte length and spans the buffer's
 * byte length rounded up to a multiple of 8.
 * @param values a typed array (anything exposing `byteLength`)
 * @returns the assembler (`this`), for chaining
 * @ignore
 */
function addBuffer(values) {
    const regionStart = this._byteLength;
    // Pad to the next 8-byte boundary.
    const paddedLength = (values.byteLength + 7) & ~7;
    this.buffers.push(values);
    this.bufferRegions.push(new BufferRegion(regionStart, paddedLength));
    this._byteLength = regionStart + paddedLength;
    return this;
}
/**
 * Assembles a Union vector: the typeIds buffer first, then (for Dense unions)
 * the valueOffsets buffer, then the children.
 *
 * Sparse unions and unsliced Dense unions are written like any other nested
 * type. A sliced Dense union needs its valueOffsets re-expressed relative to
 * the portion of each child that the slice actually references, and each
 * child trimmed to that portion.
 *
 * Fixes relative to the previous implementation of the sliced path:
 *  - the per-child shift is taken from the offset of a row that references
 *    the child, not from `offsets[typeId]` (which indexed rows by type id);
 *  - children are sliced using their own absolute offsets, instead of
 *    offsets rebased by the first row's offset (which belongs to one child only);
 *  - the slice end is an end position, not a count;
 *  - children that the slice never references are written as empty slices,
 *    including when their type id exceeds the largest observed id.
 *
 * NOTE(review): assumes `child.slice(begin, end)` follows Array.prototype.slice
 * semantics (end exclusive) — confirm against the Vector implementation.
 *
 * @param vector the Union vector to assemble
 * @returns the assembler (`this`)
 * @ignore
 */
function assembleUnion(vector) {
    const { type, length, typeIds, valueOffsets } = vector;
    // All Union Vectors have a typeIds buffer
    addBuffer.call(this, typeIds);
    // If this is a Sparse Union, treat it like all other Nested types
    if (type.mode === UnionMode.Sparse) {
        return assembleNestedVector.call(this, vector);
    }
    if (type.mode !== UnionMode.Dense) {
        return this;
    }
    if (vector.offset <= 0) {
        // If the Vector hasn't been sliced, write the existing valueOffsets
        addBuffer.call(this, valueOffsets);
        // We can treat this like all other Nested types
        return assembleNestedVector.call(this, vector);
    }
    // Sliced Dense Union. Union typeIds are not necessarily 0-indexed, so size
    // the per-child bookkeeping by the largest id observed in this slice.
    let maxObservedTypeId = -1;
    for (let index = 0; index < length; ++index) {
        if (typeIds[index] > maxObservedTypeId) {
            maxObservedTypeId = typeIds[index];
        }
    }
    // -1 marks "this child is not referenced by any row of the slice".
    const childBegins = new Int32Array(maxObservedTypeId + 1).fill(-1);
    const childLasts = new Int32Array(maxObservedTypeId + 1).fill(-1);
    // Pass 1: find the smallest and largest child offset referenced per type id.
    for (let index = 0; index < length; ++index) {
        const typeId = typeIds[index];
        const offset = valueOffsets[index];
        if (childBegins[typeId] === -1 || offset < childBegins[typeId]) {
            childBegins[typeId] = offset;
        }
        if (offset > childLasts[typeId]) {
            childLasts[typeId] = offset;
        }
    }
    // Pass 2: express every offset relative to the start of its child's slice.
    const shiftedOffsets = new Int32Array(length);
    for (let index = 0; index < length; ++index) {
        shiftedOffsets[index] = valueOffsets[index] - childBegins[typeIds[index]];
    }
    addBuffer.call(this, shiftedOffsets);
    // Slice and visit children accordingly
    for (let childIndex = 0, numChildren = type.children.length; childIndex < numChildren; ++childIndex) {
        const child = vector.getChildAt(childIndex);
        if (child) {
            const typeId = type.typeIds[childIndex];
            const begin = childBegins[typeId];
            // `begin` is -1 (unreferenced) or undefined (id above every observed id)
            // when the slice contains no rows for this child.
            this.visit(begin >= 0 ? child.slice(begin, childLasts[typeId] + 1) : child.slice(0, 0));
        }
    }
    return this;
}
/**
 * Assembles a Bool vector. Bool is a special case among flat vectors because
 * its data buffer has to stay bit-packed.
 * @param vector the Bool vector to assemble
 * @returns the assembler (`this`)
 * @ignore
 */
function assembleBoolVector(vector) {
    // Fastest path: every value is null, so an empty placeholder data buffer suffices.
    if (vector.nullCount >= vector.length) {
        return addBuffer.call(this, new Uint8Array(0));
    }
    const bits = vector.values;
    // Fast path: the values are already a packed Uint8Array, so just truncate the bitmap.
    // Otherwise enumerate the values as bools and re-pack them. That fallback isn't
    // reachable unless someone manipulates the Data internals; it exists only for safety.
    const packed = bits instanceof Uint8Array
        ? truncateBitmap(vector.offset, vector.length, bits)
        : /* istanbul ignore next */ packBools(vector);
    return addBuffer.call(this, packed);
}
/**
 * Assembles a fixed-width vector by adding the leading `length * stride`
 * elements of its values buffer.
 * @param vector the flat vector to assemble
 * @returns the assembler (`this`)
 * @ignore
 */
function assembleFlatVector(vector) {
    const { values, length, stride } = vector;
    const elementCount = length * stride;
    const trimmed = values.subarray(0, elementCount);
    return addBuffer.call(this, trimmed);
}
/**
 * Assembles a variable-width flat vector (Utf8, Binary): the value offsets
 * rebased by the first offset, followed by the matching window of the values buffer.
 * @param vector the vector to assemble
 * @returns the assembler (`this`)
 * @ignore
 */
function assembleFlatListVector(vector) {
    const { length, values, valueOffsets: offsets } = vector;
    const begin = offsets[0];
    const end = offsets[length];
    // Never read past the end of the values buffer, whatever the last offset claims.
    const span = Math.min(end - begin, values.byteLength - begin);
    // Buffers are added in the order FlatList types read them: offsets, then values.
    const rebasedOffsets = rebaseValueOffsets(-begin, length, offsets);
    addBuffer.call(this, rebasedOffsets);
    const window = values.subarray(begin, begin + span);
    addBuffer.call(this, window);
    return this;
}
/**
 * Assembles a List-like vector (List, Map, FixedSizeList): the valueOffsets
 * buffer when the type has one, followed by the values child.
 *
 * When the offsets do not start at zero (a sliced list), they are rebased so
 * the first offset is 0, and the values child is trimmed to the window the
 * offsets reference so the rebased offsets still line up with it. The
 * previous implementation passed `+valueOffsets[0]` to `rebaseValueOffsets`
 * (unlike the `-valueOffsets[0]` used for Utf8/Binary), which moved the
 * offsets further from zero, and it wrote the child untrimmed.
 *
 * NOTE(review): assumes `child.slice(begin, end)` follows Array.prototype.slice
 * semantics (end exclusive) — confirm against the Vector implementation.
 *
 * @param vector the list-like vector to assemble
 * @returns the result of visiting the values child
 * @ignore
 */
function assembleListVector(vector) {
    const { length, valueOffsets } = vector;
    const child = vector.getChildAt(0);
    // FixedSizeList has no valueOffsets: just insert the values child.
    if (!valueOffsets) {
        return this.visit(child);
    }
    const begin = valueOffsets[0];
    const end = valueOffsets[length];
    // If we have valueOffsets (MapVector, ListVector), push that buffer first
    addBuffer.call(this, rebaseValueOffsets(-begin, length, valueOffsets));
    // Leave the child untouched when the offsets already cover exactly all of it.
    const coversWholeChild = begin === 0 && end === child.length;
    return this.visit(coversWholeChild ? child : child.slice(begin, end));
}
/**
 * Assembles a nested vector by visiting each of its non-null children in
 * the order the type declares them.
 *
 * Always returns the assembler. The previous implementation returned the
 * first element of the `visitMany` result, which is `undefined` when the
 * vector has no children; every other visitor in this file returns `this`.
 *
 * @param vector the nested vector whose children should be assembled
 * @returns the assembler (`this`)
 * @ignore
 */
function assembleNestedVector(vector) {
    const children = vector.type.children
        .map((_, i) => vector.getChildAt(i))
        .filter(Boolean);
    this.visitMany(children);
    return this;
}
// Wire each type-specific visitor hook to the assembler routine that matches
// the buffer layout of that type.
Object.assign(VectorAssembler.prototype, {
    visitBool: assembleBoolVector,
    visitInt: assembleFlatVector,
    visitFloat: assembleFlatVector,
    visitUtf8: assembleFlatListVector,
    visitBinary: assembleFlatListVector,
    visitFixedSizeBinary: assembleFlatVector,
    visitDate: assembleFlatVector,
    visitTimestamp: assembleFlatVector,
    visitTime: assembleFlatVector,
    visitDecimal: assembleFlatVector,
    visitList: assembleListVector,
    visitStruct: assembleNestedVector,
    visitUnion: assembleUnion,
    visitInterval: assembleFlatVector,
    visitFixedSizeList: assembleListVector,
    visitMap: assembleListVector,
});
//# sourceMappingURL=vectorassembler.mjs.map