parquetjs-lite

A fully asynchronous, pure JavaScript implementation of the Parquet file format.

'use strict';
const parquet_types = require('./types');
const parquet_schema = require('./schema');

/**
 * 'Shred' a record into a list of <value, repetition_level, definition_level>
 * tuples per column using the Google Dremel algorithm.
 *
 * The buffer argument must point to an object into which the shredded record
 * will be returned. You may re-use the buffer for repeated calls to this
 * function to append to an existing buffer, as long as the schema is
 * unchanged.
 *
 * The format in which the shredded records will be stored in the buffer is as
 * follows (each column additionally tracks distinct_values and count, and the
 * buffer carries pages and pageRowCount for page-level bookkeeping):
 *
 *   buffer = {
 *     columnData: {
 *       'my_col': {
 *         dlevels: [d1, d2, .. dN],
 *         rlevels: [r1, r2, .. rN],
 *         values: [v1, v2, .. vN],
 *       }, ...
 *     },
 *     rowCount: X,
 *   }
 */
exports.shredRecord = function(schema, record, buffer) {
  /* shred the record, this may raise an exception */
  const recordShredded = {};
  for (let field of schema.fieldList) {
    recordShredded[field.path] = {
      dlevels: [],
      rlevels: [],
      values: [],
      distinct_values: new Set(),
      count: 0
    };
  }

  shredRecordInternal(schema.fields, record, recordShredded, 0, 0);

  /* if no error during shredding, add the shredded record to the buffer */
  if (!('columnData' in buffer) || !('rowCount' in buffer)) {
    buffer.rowCount = 0;
    buffer.pageRowCount = 0;
    buffer.columnData = {};
    buffer.pages = {};

    for (let field of schema.fieldList) {
      buffer.columnData[field.path] = {
        dlevels: [],
        rlevels: [],
        values: [],
        distinct_values: new Set(),
        count: 0
      };
      buffer.pages[field.path] = [];
    }
  }

  buffer.rowCount += 1;
  buffer.pageRowCount += 1;

  for (let field of schema.fieldList) {
    const record = recordShredded[field.path];
    const column = buffer.columnData[field.path];

    for (let i = 0; i < record.rlevels.length; i++) {
      column.rlevels.push(record.rlevels[i]);
      column.dlevels.push(record.dlevels[i]);
      if (record.values[i] !== undefined) {
        column.values.push(record.values[i]);
      }
    }

    [...record.distinct_values].forEach(
        value => column.distinct_values.add(value));

    column.count += record.count;
  }
};

function shredRecordInternal(fields, record, data, rlvl, dlvl) {
  for (let fieldName in fields) {
    const field = fields[fieldName];
    const fieldType = field.originalType || field.primitiveType;

    // fetch values
    let values = [];
    if (record &&
        (fieldName in record) &&
        record[fieldName] !== undefined &&
        record[fieldName] !== null) {
      if (record[fieldName].constructor === Array) {
        values = record[fieldName];
      } else {
        values.push(record[fieldName]);
      }
    }

    // check values
    if (values.length === 0 && !!record && field.repetitionType === 'REQUIRED') {
      throw 'missing required field: ' + field.name;
    }

    if (values.length > 1 && field.repetitionType !== 'REPEATED') {
      throw 'too many values for field: ' + field.name;
    }

    // push null
    if (values.length === 0) {
      if (field.isNested) {
        shredRecordInternal(
            field.fields,
            null,
            data,
            rlvl,
            dlvl);
      } else {
        data[field.path].rlevels.push(rlvl);
        data[field.path].dlevels.push(dlvl);
        data[field.path].count += 1;
      }
      continue;
    }

    // push values
    for (let i = 0; i < values.length; ++i) {
      // only the first value continues the parent's repetition level;
      // subsequent values repeat at this field's own level
      const rlvl_i = i === 0 ? rlvl : field.rLevelMax;

      if (field.isNested) {
        shredRecordInternal(
            field.fields,
            values[i],
            data,
            rlvl_i,
            field.dLevelMax);
      } else {
        data[field.path].distinct_values.add(values[i]);
        data[field.path].values.push(parquet_types.toPrimitive(fieldType, values[i]));
        data[field.path].rlevels.push(rlvl_i);
        data[field.path].dlevels.push(field.dLevelMax);
        data[field.path].count += 1;
      }
    }
  }
}
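/*
 * A worked example of the level encoding (a sketch; the schema syntax below
 * follows parquetjs-lite's nested field declarations): given a repeated
 * group
 *
 *   colors: { repeated: true, fields: { name: { type: 'UTF8' } } },
 *
 * shredding the record { colors: [{ name: 'red' }, { name: 'green' }] }
 * produces, for the colors.name column (rLevelMax = 1, dLevelMax = 1):
 *
 *   values:  the primitive encodings of 'red' and 'green'
 *   rlevels: [0, 1]   // 0 starts a new row, 1 repeats at the colors level
 *   dlevels: [1, 1]   // both entries are defined down to the leaf
 */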
/**
 * 'Materialize' a list of <value, repetition_level, definition_level>
 * tuples back to nested records (objects/arrays) using the Google Dremel
 * algorithm.
 *
 * The buffer argument must point to an object with the following structure
 * (i.e. the same structure that is produced by shredRecord):
 *
 *   buffer = {
 *     columnData: {
 *       'my_col': {
 *         dlevels: [d1, d2, .. dN],
 *         rlevels: [r1, r2, .. rN],
 *         values: [v1, v2, .. vN],
 *       }, ...
 *     },
 *     rowCount: X,
 *   }
 */
exports.materializeRecords = function(schema, buffer, records) {
  if (!records) {
    records = [];
  }

  for (let k in buffer.columnData) {
    const field = schema.findField(k);
    const fieldBranch = schema.findFieldBranch(k);
    let values = buffer.columnData[k].values[Symbol.iterator]();

    // rLevels[l] counts the entries seen so far at repetition level l;
    // rLevels[0] - 1 is the index of the current top-level record
    let rLevels = new Array(field.rLevelMax + 1);
    rLevels.fill(0);

    for (let i = 0; i < buffer.columnData[k].count; ++i) {
      const dLevel = buffer.columnData[k].dlevels[i];
      const rLevel = buffer.columnData[k].rlevels[i];

      rLevels[rLevel]++;
      rLevels.fill(0, rLevel + 1);

      // a value was stored only if the entry is defined down to the leaf
      let value = null;
      if (dLevel === field.dLevelMax) {
        value = parquet_types.fromPrimitive(
            field.originalType || field.primitiveType,
            values.next().value);
      }

      records[rLevels[0] - 1] = records[rLevels[0] - 1] || {};

      materializeRecordField(
          records[rLevels[0] - 1],
          { values: fieldBranch, cursor: 0 },
          { values: rLevels, cursor: 1 },
          dLevel,
          value);
    }
  }

  return records;
};

function materializeRecordField(record, branch, rLevels, dLevel, value) {
  const node = branch.values[branch.cursor];

  // the entry is null above this node; nothing to materialize here
  if (dLevel < node.dLevelMax) {
    return;
  }

  if (branch.values.length > branch.cursor + 1) {
    // intermediate (nested) node: descend into the child record
    if (node.repetitionType === 'REPEATED') {
      if (!(node.name in record)) {
        record[node.name] = [];
      }

      while (record[node.name].length < rLevels.values[rLevels.cursor] + 1) {
        record[node.name].push({});
      }

      rLevels.cursor++;
      branch.cursor++;
      materializeRecordField(
          record[node.name][rLevels.values[rLevels.cursor - 1]],
          branch,
          rLevels,
          dLevel,
          value);
    } else {
      record[node.name] = record[node.name] || {};

      branch.cursor++;
      materializeRecordField(
          record[node.name],
          branch,
          rLevels,
          dLevel,
          value);
    }
  } else {
    // leaf node: write the value itself
    if (node.repetitionType === 'REPEATED') {
      if (!(node.name in record)) {
        record[node.name] = [];
      }

      while (record[node.name].length < rLevels.values[rLevels.cursor] + 1) {
        record[node.name].push(null);
      }

      record[node.name][rLevels.values[rLevels.cursor]] = value;
    } else {
      record[node.name] = value;
    }
  }
}
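
A minimal round-trip sketch of the two exports above. It assumes parquetjs-lite is installed and that this internal module resolves as parquetjs-lite/lib/shred; that require path is an assumption, not documented public API:

// Round-trip sketch; the lib/shred require path is an assumption.
const parquet = require('parquetjs-lite');
const shred = require('parquetjs-lite/lib/shred');

const schema = new parquet.ParquetSchema({
  name:   { type: 'UTF8' },                   // REQUIRED by default
  scores: { type: 'DOUBLE', repeated: true }  // REPEATED leaf
});

// Shred two rows into one shared buffer (the buffer may be re-used
// across calls as long as the schema stays the same).
const buffer = {};
shred.shredRecord(schema, { name: 'ada', scores: [1.0, 2.5] }, buffer);
shred.shredRecord(schema, { name: 'bob', scores: [] }, buffer);

// Materialize the columnar buffer back into row objects.
const rows = shred.materializeRecords(schema, buffer);
console.log(rows);
// [ { name: 'ada', scores: [ 1, 2.5 ] }, { name: 'bob' } ]
// note: bob's empty scores array does not survive the round trip,
// since an absent repeated field and an empty one shred identically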