@dobesv/parquets
Version:
TypeScript implementation of the Parquet file format, based on parquet.js
309 lines • 11.2 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.materializeColumn = exports.materializeRecords = exports.shredRecord = exports.ParquetWriteBuffer = exports.TooManyValuesShredError = exports.MissingRequiredFieldShredError = exports.ParquetShredError = void 0;
const Types = require("./types");
const ts_custom_error_1 = require("ts-custom-error");
/**
 * Generic shredding error that carries only a message.
 *
 * NOTE(review): nothing in this file throws or extends this class. The two
 * more specific shred errors defined after it derive from CustomError
 * directly, so an `instanceof ParquetShredError` check will not match them.
 */
class ParquetShredError extends ts_custom_error_1.CustomError {
constructor(message) {
super(message);
}
}
exports.ParquetShredError = ParquetShredError;
/**
 * Error thrown by the shredder when a record supplies no value for a field
 * whose repetition type is REQUIRED.
 *
 * The name of the offending field is kept on the instance as `fieldName`.
 */
class MissingRequiredFieldShredError extends ts_custom_error_1.CustomError {
    constructor(fieldName) {
        const message = `Missing required field: ${fieldName}`;
        super(message);
        this.fieldName = fieldName;
    }
}
exports.MissingRequiredFieldShredError = MissingRequiredFieldShredError;
/**
 * Error thrown by the shredder when a record supplies more than one value
 * for a field whose repetition type is not REPEATED.
 *
 * The name of the offending field is kept on the instance as `fieldName`.
 */
class TooManyValuesShredError extends ts_custom_error_1.CustomError {
    constructor(fieldName) {
        const message = `Multiple values for non-repeated field: ${fieldName}`;
        super(message);
        this.fieldName = fieldName;
    }
}
exports.TooManyValuesShredError = TooManyValuesShredError;
/**
 * Accumulates shredded rows for one schema.
 *
 * `columnData` holds one empty column buffer per non-nested schema field (as
 * built by shredColumnBuffers) and `rowCount` starts at zero.
 */
class ParquetWriteBuffer {
    constructor(schema) {
        const emptyColumns = shredColumnBuffers(schema);
        this.columnData = emptyColumns;
        this.rowCount = 0;
    }
}
exports.ParquetWriteBuffer = ParquetWriteBuffer;
/**
 * Build an empty column buffer for every leaf (non-nested) field of a schema.
 *
 * @param schema Schema whose `fieldList` is scanned
 * @returns Object mapping each leaf field's `key` to a fresh
 *          `{ dLevels, rLevels, values, count }` buffer
 */
const shredColumnBuffers = (schema) => {
    const entries = [];
    for (const field of schema.fieldList) {
        if (field.isNested) {
            // Nested fields hold no values themselves; only their leaves get a column
            continue;
        }
        const emptyColumn = { dLevels: [], rLevels: [], values: [], count: 0 };
        entries.push([field.key, emptyColumn]);
    }
    return Object.fromEntries(entries);
};
/**
 * 'Shred' a record into a list of (value, repetition_level, definition_level)
 * tuples per column using the Google Dremel Algorithm.
 *
 * The buffer argument must point to an object onto which the shredded record
 * will be appended. You may re-use the buffer for repeated calls to this
 * function to append to an existing buffer, as long as the schema is unchanged.
 *
 * The format in which the shredded records are stored in the buffer is as
 * follows:
 *
 *   buffer = {
 *     columnData: {
 *       'my_col': {
 *         dLevels: [d1, d2, .. dN],
 *         rLevels: [r1, r2, .. rN],
 *         values: [v1, v2, .. vM],   // missing values add levels but no value
 *         count: N,
 *       }, ...
 *     },
 *     rowCount: X,
 *   }
 *
 * Shredding is all-or-nothing: if the record is rejected part-way through
 * (for example because a required field is missing), every column is
 * truncated back to its length before the call and rowCount is left
 * untouched, so the buffer stays usable for further records.
 *
 * @param schema Schema whose `fields` describe the record
 * @param record Record to shred
 * @param buffer Output buffer, updated in place
 * @throws Whatever shredRecordFields throws; the error is re-thrown after the
 *         buffer has been rolled back
 */
function shredRecord(schema, record, buffer) {
    const columns = Object.values(buffer.columnData);
    // Remember where every column ended so a failed record can be undone
    const checkpoints = columns.map((column) => ({
        count: column.count,
        dLevels: column.dLevels.length,
        rLevels: column.rLevels.length,
        values: column.values.length,
    }));
    try {
        // Shred the record fields; this may process fields recursively if the
        // record has nested records or arrays in it
        shredRecordFields(schema.fields, record, buffer.columnData, 0, 0);
    }
    catch (err) {
        // Drop whatever the rejected record already appended, otherwise the
        // columns would no longer line up row-for-row
        columns.forEach((column, index) => {
            const checkpoint = checkpoints[index];
            column.count = checkpoint.count;
            column.dLevels.length = checkpoint.dLevels;
            column.rLevels.length = checkpoint.rLevels;
            column.values.length = checkpoint.values;
        });
        throw err;
    }
    // Only count the row once every column has accepted it
    buffer.rowCount += 1;
}
exports.shredRecord = shredRecord;
/**
 * Shred a record or nested object into the output buffer. This updates the
 * data parameter in place.
 *
 * Note that because fields can be optional or repeated, the number of elements
 * pushed onto the arrays in data can vary.
 *
 * @param fields Schema information: field definitions keyed by name
 * @param record Record to shred; a falsy record means the enclosing nested
 *               field was absent, so every leaf below it is marked missing
 * @param data Output buffer: column buffers keyed by field key
 * @param rLevel Current repetition level (used if this is a nested record inside one or more repeated fields)
 * @param dLevel Current definition level (used if this is a nested record inside one or more optional fields)
 * @throws MissingRequiredFieldShredError if a present record has no value for a REQUIRED field
 * @throws TooManyValuesShredError if several values are given for a field that is not REPEATED
 */
function shredRecordFields(fields, record, data, rLevel, dLevel) {
    for (const name in fields) {
        const field = fields[name];
        // Fetch the value(s); undefined and null both count as "missing"
        let values = [];
        if (record &&
            field.name in record &&
            record[field.name] !== undefined &&
            record[field.name] !== null) {
            const value = record[field.name];
            // Array.isArray also recognises arrays created in another realm
            // and Array subclasses, which a `constructor === Array` test
            // would shred as one scalar value
            values = Array.isArray(value) ? value : [value];
        }
        // Check values. A REQUIRED field is only enforced when its parent
        // record is actually present.
        if (values.length === 0 &&
            !!record &&
            field.repetitionType === 'REQUIRED') {
            throw new MissingRequiredFieldShredError(field.name);
        }
        if (values.length > 1 && field.repetitionType !== 'REPEATED') {
            throw new TooManyValuesShredError(field.name);
        }
        // Nothing to emit: record the absence at the current levels
        if (values.length === 0) {
            if (field.isNested) {
                // For a nested object, push a missing marker for all of its leaves
                shredRecordFields(field.fields, null, data, rLevel, dLevel);
            }
            else {
                // For a primitive value, store the levels only (no value)
                const fieldData = data[field.key];
                fieldData.count += 1;
                fieldData.rLevels.push(rLevel);
                fieldData.dLevels.push(dLevel);
            }
            continue;
        }
        // Push values. The first value keeps the incoming repetition level;
        // later ones use this field's own maximum repetition level.
        for (let i = 0; i < values.length; i++) {
            const rlvl = i === 0 ? rLevel : field.rLevelMax;
            if (field.isNested) {
                shredRecordFields(field.fields, values[i], data, rlvl, field.dLevelMax);
            }
            else {
                const fieldData = data[field.key];
                fieldData.count += 1;
                fieldData.rLevels.push(rlvl);
                fieldData.dLevels.push(field.dLevelMax);
                fieldData.values.push(Types.toPrimitive(field.originalType || field.primitiveType, values[i]));
            }
        }
    }
}
/**
 * 'Materialize' a list of (value, repetition_level, definition_level) tuples
 * back to nested records (objects/arrays) using the Google Dremel Algorithm.
 *
 * The buffer argument must point to an object with the following structure
 * (i.e. the same structure that shredRecord fills in):
 *
 *   buffer = {
 *     columnData: {
 *       'my_col': {
 *         dLevels: [d1, d2, .. dN],
 *         rLevels: [r1, r2, .. rN],
 *         values: [v1, v2, .. vM],
 *         count: N,
 *       }, ...
 *     },
 *     rowCount: X,
 *   }
 *
 * @param schema Parquet schema describing the columns
 * @param buffer Shredded column data to rebuild records from
 * @returns Array of `buffer.rowCount` records
 */
function materializeRecords(schema, buffer) {
    const rows = [];
    // Start with one empty object per row; each column then fills in its part
    while (rows.length < buffer.rowCount) {
        rows.push({});
    }
    for (const columnKey in buffer.columnData) {
        materializeColumnIntoRecords(schema, buffer, columnKey, rows);
    }
    return rows;
}
exports.materializeRecords = materializeRecords;
/**
 * Iterate over the values of a single column.
 *
 * For a simple column which is not repeated and not nested in a repeated
 * field, this yields one value (or null) for each row in the input.
 *
 * If the column is repeated or nested in a repeated field, it yields one
 * array for each row in the input. With several levels of repetition the
 * yielded arrays are nested accordingly.
 *
 * @param schema Schema used to look up the column's field definition
 * @param data Column data: `{ dLevels, rLevels, values, count }`
 * @param columnPath Path handed to `schema.findField`
 * @throws Error if the schema has no field for columnPath
 */
function* materializeColumn(schema, data, columnPath) {
    const field = schema.findField(columnPath);
    if (!field) {
        throw new Error(`No field in schema for ${columnPath}`);
    }
    const { dLevelMax, rLevelMax } = field;
    // openArrays[n] is the array currently being filled at nesting depth n
    const openArrays = [];
    const total = data.count;
    let nextValue = 0;
    for (let entry = 0; entry < total; entry += 1) {
        const dLevel = data.dLevels[entry];
        const rLevel = data.rLevels[entry];
        // A repetition level of 0 starts a new row: hand back the finished one
        const startsNextRow = rLevelMax > 0 && rLevel === 0 && entry > 0;
        if (startsNextRow) {
            const finished = openArrays[0];
            yield finished !== null && finished !== undefined ? finished : [];
        }
        // Close every array at depth >= rLevel
        openArrays.length = rLevel;
        if (!(dLevel >= dLevelMax)) {
            // No value at this position; only flat columns report it as null
            if (rLevelMax === 0) {
                yield null;
            }
            continue;
        }
        const value = Types.fromPrimitive(field.originalType || field.primitiveType, data.values[nextValue]);
        nextValue += 1;
        if (!(rLevelMax > 0)) {
            // Flat column: emit the value directly
            yield value;
            continue;
        }
        // Open any arrays that are missing on the way down to the leaf level,
        // attaching each new one to its parent
        for (let depth = 0; depth < rLevelMax; depth += 1) {
            if (openArrays[depth]) {
                continue;
            }
            const fresh = [];
            openArrays[depth] = fresh;
            if (depth > 0) {
                openArrays[depth - 1].push(fresh);
            }
        }
        openArrays[rLevelMax - 1].push(value);
    }
    // Hand back the last row's array for a repeated column that had any entries
    if (rLevelMax > 0 && total > 0) {
        const last = openArrays[0];
        yield last !== null && last !== undefined ? last : [];
    }
}
exports.materializeColumn = materializeColumn;
/**
 * Read values from a column and update the records array with the values that
 * are found.
 *
 * If a column is in a nested record or array this will create the necessary
 * parent objects and arrays leading up to it. The top-level record for each
 * row must already exist in the records array; it is not created here.
 *
 * @param schema Parquet schema
 * @param buffer Data we are parsing
 * @param key Field key for the column we are loading
 * @param records Records to update in place, one per row
 * @throws Error if the column has data but the schema has no field for key
 */
function materializeColumnIntoRecords(schema, buffer, key, records) {
    const data = buffer.columnData[key];
    if (!data.count) {
        return;
    }
    const field = schema.findField(key);
    if (!field) {
        // Same failure mode as materializeColumn instead of an opaque TypeError
        throw new Error(`No field in schema for ${key}`);
    }
    const branch = schema.findFieldBranch(key);
    const repeated = field.repetitionType === 'REPEATED';
    // Position counters: rLevels[0] counts the rows started so far, and
    // rLevels[n] for n > 0 is the current index inside the n-th repeated
    // field on the path to this column
    // tslint:disable-next-line:prefer-array-literal
    const rLevels = new Array(field.rLevelMax + 1).fill(0);
    let vIndex = 0;
    for (let i = 0; i < data.count; i++) {
        const dLevel = data.dLevels[i];
        const rLevel = data.rLevels[i];
        // Advance the counter at the level that repeats and restart all
        // deeper counters
        rLevels[rLevel]++;
        rLevels.fill(0, rLevel + 1);
        let rIndex = 0;
        let record = records[rLevels[rIndex++] - 1];
        // Internal nodes: walk down the branch, creating containers as needed
        for (const step of branch) {
            if (step === field) {
                break;
            }
            if (dLevel < step.dLevelMax) {
                // This ancestor is not defined for the current entry
                break;
            }
            if (step.repetitionType === 'REPEATED') {
                if (!(step.name in record)) {
                    record[step.name] = [];
                }
                const ix = rLevels[rIndex++];
                while (record[step.name].length <= ix) {
                    record[step.name].push({});
                }
                record = record[step.name][ix];
            }
            else {
                record[step.name] = record[step.name] || {};
                record = record[step.name];
            }
        }
        // Leaf node: only fully-defined entries carry a value
        if (dLevel === field.dLevelMax) {
            const value = Types.fromPrimitive(field.originalType || field.primitiveType, data.values[vIndex]);
            vIndex++;
            if (repeated) {
                if (!(field.name in record)) {
                    record[field.name] = [];
                }
                const ix = rLevels[rIndex];
                // Pad with null up to the target index before storing
                while (record[field.name].length <= ix) {
                    record[field.name].push(null);
                }
                record[field.name][ix] = value;
            }
            else {
                record[field.name] = value;
            }
        }
    }
}
//# sourceMappingURL=shred.js.map