UNPKG

@dobesv/parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

github.com/dobesv/parquets

dobesv/parquets

309 lines • 11.2 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.materializeColumn = exports.materializeRecords = exports.shredRecord = exports.ParquetWriteBuffer = exports.TooManyValuesShredError = exports.MissingRequiredFieldShredError = exports.ParquetShredError = void 0; const Types = require("./types"); const ts_custom_error_1 = require("ts-custom-error"); class ParquetShredError extends ts_custom_error_1.CustomError { constructor(message) { super(message); } } exports.ParquetShredError = ParquetShredError; class MissingRequiredFieldShredError extends ts_custom_error_1.CustomError { constructor(fieldName) { super(`Missing required field: ${fieldName}`); this.fieldName = fieldName; } } exports.MissingRequiredFieldShredError = MissingRequiredFieldShredError; class TooManyValuesShredError extends ts_custom_error_1.CustomError { constructor(fieldName) { super(`Multiple values for non-repeated field: ${fieldName}`); this.fieldName = fieldName; } } exports.TooManyValuesShredError = TooManyValuesShredError; class ParquetWriteBuffer { constructor(schema) { this.columnData = shredColumnBuffers(schema); this.rowCount = 0; } } exports.ParquetWriteBuffer = ParquetWriteBuffer; const shredColumnBuffers = (schema) => Object.fromEntries(schema.fieldList .filter(field => !field.isNested) .map(field => [ field.key, { dLevels: [], rLevels: [], values: [], count: 0, }, ])); /** * 'Shred' a record into a list of <value, repetition_level, definition_level> * tuples per column using the Google Dremel Algorithm.. * * The buffer argument must point to an object into which the shredded record * will be returned. You may re-use the buffer for repeated calls to this function * to append to an existing buffer, as long as the schema is unchanged. * * The format in which the shredded records will be stored in the buffer is as * follows: * * buffer = { * columnData: [ * 'my_col': { * dLevels: [d1, d2, .. dN], * rLevels: [r1, r2, .. rN], * values: [v1, v2, .. vN], * }, ... * ], * rowCount: X, * } */ function shredRecord(schema, record, buffer) { // Shred the record fields; this may process fields recursively if the record // has nested records or arrays in it shredRecordFields(schema.fields, record, buffer.columnData, 0, 0); // Increment the row count buffer.rowCount += 1; } exports.shredRecord = shredRecord; /** * Shred a record or nested object into the output buffer. This updates the data parameter in place. * * Note that because fields can be optional or repeated, the number of elements pushed * onto the arrays in data can vary. * * @param fields Schema information * @param record Record to shred * @param data Output buffer * @param rLevel Current repetition level (used if this is a nested record inside one or more repeated fields) * @param dLevel Current definition level (used if this is a ensted record inside one or more optional fields) */ function shredRecordFields(fields, record, data, rLevel, dLevel) { for (const name in fields) { const field = fields[name]; // fetch values let values; if (record && field.name in record && record[field.name] !== undefined && record[field.name] !== null) { const value = record[field.name]; if (value.constructor === Array) { values = value; } else { values = [value]; } } else { // Value missing / null values = []; } // check values if (values.length === 0 && !!record && field.repetitionType === 'REQUIRED') { throw new MissingRequiredFieldShredError(field.name); } if (values.length > 1 && field.repetitionType !== 'REPEATED') { throw new TooManyValuesShredError(field.name); } // Check if there's a value to emit if (values.length === 0) { if (field.isNested) { // If it's a nested object we'll want push null for all its elements shredRecordFields(field.fields, null, data, rLevel, dLevel); } else { // If it's a primitive value, mark it as missing const fieldData = data[field.key]; fieldData.count += 1; fieldData.rLevels.push(rLevel); fieldData.dLevels.push(dLevel); } continue; } // push values for (let i = 0; i < values.length; i++) { const rlvl = i === 0 ? rLevel : field.rLevelMax; if (field.isNested) { shredRecordFields(field.fields, values[i], data, rlvl, field.dLevelMax); } else { const fieldData = data[field.key]; fieldData.count += 1; fieldData.rLevels.push(rlvl); fieldData.dLevels.push(field.dLevelMax); fieldData.values.push(Types.toPrimitive(field.originalType || field.primitiveType, values[i])); } } } } /** * 'Materialize' a list of <value, repetition_level, definition_level> * tuples back to nested records (objects/arrays) using the Google Dremel * Algorithm.. * * The buffer argument must point to an object with the following structure (i.e. * the same structure that is returned by shredRecords): * * buffer = { * columnData: [ * 'my_col': { * dlevels: [d1, d2, .. dN], * rlevels: [r1, r2, .. rN], * values: [v1, v2, .. vN], * }, ... * ], * rowCount: X, * } */ function materializeRecords(schema, buffer) { const records = []; for (let i = 0; i < buffer.rowCount; i++) records.push({}); for (const key in buffer.columnData) { materializeColumnIntoRecords(schema, buffer, key, records); } return records; } exports.materializeRecords = materializeRecords; /** * Support iteration over the values in a single column. * * For a simple column which is not repeated and not nested in a repeated * field, this will give one value for each row in the input. * * If the column is repeated or nested in a repeated column, it will give an * array for each row in the input. * * When there are multiple levels of repetition the iterator will yield * nested arrays. */ function* materializeColumn(schema, data, columnPath) { var _a, _b; const field = schema.findField(columnPath); if (!field) { throw new Error(`No field in schema for ${columnPath}`); } const { dLevelMax, rLevelMax } = field; const rLevelArrays = []; let vIndex = 0; const count = data.count; for (let i = 0; i < count; i++) { const dLevel = data.dLevels[i]; const rLevel = data.rLevels[i]; // Yield back the top-level array if we're moving to the next row if (rLevelMax > 0 && rLevel === 0 && i > 0) { yield (_a = rLevelArrays[0]) !== null && _a !== void 0 ? _a : []; } // Reset arrays for all rLevels >= rLevel rLevelArrays.length = rLevel; // Check if we actually have a value here if (dLevel >= dLevelMax) { const value = Types.fromPrimitive(field.originalType || field.primitiveType, data.values[vIndex]); vIndex++; if (rLevelMax > 0) { // Insert as array element for (let n = 0; n < rLevelMax; n++) { const v = rLevelArrays[n]; if (!v) { const ary = []; rLevelArrays[n] = ary; if (n > 0) { rLevelArrays[n - 1].push(ary); } } } // Push value onto the leaf-level array rLevelArrays[rLevelMax - 1].push(value); } else { // Emit value yield value; } } else if (rLevelMax === 0) { // Emit null yield null; } } // Yield back the top-level array at the end if this was a repeated field (or nested in one) if (rLevelMax > 0 && count > 0) { yield (_b = rLevelArrays[0]) !== null && _b !== void 0 ? _b : []; } } exports.materializeColumn = materializeColumn; /** * Read values from a column and update the records array with the values that are * found. * * If a column is in a nested record or array this will create the necessary parent * objects and arrays leading up to it, as well as creating the actual record if there's * no record at the given position in the records array. * * @param schema Parquet schema * @param buffer Data we are parsing * @param key Field key for the column we are loading * @param records records are added or updated in this array as necessary */ function materializeColumnIntoRecords(schema, buffer, key, records) { const data = buffer.columnData[key]; if (!data.count) return; const field = schema.findField(key); const branch = schema.findFieldBranch(key); const repeated = field.repetitionType === 'REPEATED'; // tslint:disable-next-line:prefer-array-literal const rLevels = new Array(field.rLevelMax + 1).fill(0); let vIndex = 0; for (let i = 0; i < data.count; i++) { const dLevel = data.dLevels[i]; const rLevel = data.rLevels[i]; rLevels[rLevel]++; rLevels.fill(0, rLevel + 1); let rIndex = 0; let record = records[rLevels[rIndex++] - 1]; // Internal nodes for (const step of branch) { if (step === field) break; if (dLevel < step.dLevelMax) break; if (step.repetitionType === 'REPEATED') { if (!(step.name in record)) record[step.name] = []; const ix = rLevels[rIndex++]; while (record[step.name].length <= ix) record[step.name].push({}); record = record[step.name][ix]; } else { record[step.name] = record[step.name] || {}; record = record[step.name]; } } // Leaf node if (dLevel === field.dLevelMax) { const value = Types.fromPrimitive(field.originalType || field.primitiveType, data.values[vIndex]); vIndex++; if (repeated) { if (!(field.name in record)) record[field.name] = []; const ix = rLevels[rIndex]; while (record[field.name].length <= ix) record[field.name].push(null); record[field.name][ix] = value; } else { record[field.name] = value; } } } } //# sourceMappingURL=shred.js.map