// parquetjs-lite
// A fully asynchronous, pure JavaScript implementation of the Parquet file format.
'use strict';
const Int64 = require('node-int64');
const parquet_thrift = require('../gen-nodejs/parquet_types');
const parquet_shredder = require('./shred');
const parquet_util = require('./util');
const parquet_schema = require('./schema');
const parquet_codec = require('./codec');
const parquet_compression = require('./compression');
const parquet_types = require('./types');
const BufferReader = require('./bufferReader');
const exportMetadata = require('./exportMetadata');
/**
* Parquet File Magic String
*/
const PARQUET_MAGIC = 'PAR1';
/**
* Parquet File Format Version
*/
const PARQUET_VERSION = 1;
/**
* Internal type and encoding used for repetition/definition levels
*/
const PARQUET_RDLVL_TYPE = 'INT32';
const PARQUET_RDLVL_ENCODING = 'RLE';
/**
* A parquet cursor is used to retrieve rows from a parquet file in order
*/
class ParquetCursor {
/**
* Create a new parquet cursor from the file metadata and an envelope reader.
* It is usually not recommended to call this constructor directly except for
* advanced and internal use cases. Consider using getCursor() on the
* ParquetReader instead
*/
constructor(metadata, envelopeReader, schema, columnList) {
this.metadata = metadata;
this.envelopeReader = envelopeReader;
this.schema = schema;
this.columnList = columnList;
this.rowGroup = [];
this.rowGroupIndex = 0;
}
/**
* Retrieve the next row from the cursor. Returns a row or NULL if the end
* of the file was reached
*/
async next() {
if (this.rowGroup.length === 0) {
if (this.rowGroupIndex >= this.metadata.row_groups.length) {
return null;
}
let rowBuffer = await this.envelopeReader.readRowGroup(
this.schema,
this.metadata.row_groups[this.rowGroupIndex],
this.columnList);
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer);
this.rowGroupIndex++;
}
return this.rowGroup.shift();
}
/**
* Retrieve the next RowGroup from the cursor (an array of records).
* Returns the array of rows or NULL if the end of the file was reached
*/
async nextRowGroup() {
if (this.rowGroupIndex >= this.metadata.row_groups.length) {
return null;
}
let rowBuffer = await this.envelopeReader.readRowGroup(
this.schema,
this.metadata.row_groups[this.rowGroupIndex],
this.columnList);
this.rowGroup = parquet_shredder.materializeRecords(this.schema, rowBuffer);
this.rowGroupIndex++;
return this.rowGroup;
}
/**
* Rewind the cursor to the beginning of the file
*/
rewind() {
this.rowGroup = [];
this.rowGroupIndex = 0;
}
}
/**
* A parquet reader allows retrieving the rows from a parquet file in order.
* The basic usage is to create a reader and then retrieve a cursor/iterator
* which allows you to consume row after row until all rows have been read. It is
* important that you call close() after you are finished reading the file to
* avoid leaking file descriptors.
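*
* A minimal usage sketch (file and record shape are hypothetical):
*
*   const reader = await ParquetReader.openFile('fruits.parquet');
*   const cursor = reader.getCursor();
*   let record = null;
*   while ((record = await cursor.next())) {
*     console.log(record);
*   }
*   await reader.close();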
*/
class ParquetReader {
/**
* Open the parquet file pointed to by the specified path and return a new
* parquet reader
*/
static async openFile(filePath, options) {
let envelopeReader = await ParquetEnvelopeReader.openFile(filePath, options);
return this.openEnvelopeReader(envelopeReader, options);
}
static async openBuffer(buffer, options) {
let envelopeReader = await ParquetEnvelopeReader.openBuffer(buffer, options);
return this.openEnvelopeReader(envelopeReader, options);
}
/**
* Open the parquet file from S3 using the supplied aws client and params
* The params have to include `Bucket` and `Key` to the file requested
* This function returns a new parquet reader
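*
* Sketch, assuming an AWS SDK v2 S3 client (bucket and key are hypothetical):
*
*   const AWS = require('aws-sdk');
*   const client = new AWS.S3();
*   const reader = await ParquetReader.openS3(client, {Bucket: 'my-bucket', Key: 'data.parquet'});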
*/
static async openS3(client, params, options) {
let envelopeReader = await ParquetEnvelopeReader.openS3(client, params, options);
return this.openEnvelopeReader(envelopeReader, options);
}
/**
* Open the parquet file from a url using the supplied request module
* params should either be a string (url) or an object that includes
* a `url` property.
* This function returns a new parquet reader
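*
* Sketch, assuming the callback-style `request` module (URL is hypothetical):
*
*   const request = require('request');
*   const reader = await ParquetReader.openUrl(request, 'https://example.com/data.parquet');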
*/
static async openUrl(request, params, options) {
let envelopeReader = await ParquetEnvelopeReader.openUrl(request, params, options);
return this.openEnvelopeReader(envelopeReader, options);
}
static async openEnvelopeReader(envelopeReader, opts) {
if (opts && opts.metadata) {
return new ParquetReader(opts.metadata, envelopeReader, opts);
}
try {
await envelopeReader.readHeader();
let metadata = await envelopeReader.readFooter();
return new ParquetReader(metadata, envelopeReader, opts);
} catch (err) {
await envelopeReader.close();
throw err;
}
}
/**
* Create a new parquet reader from the file metadata and an envelope reader.
* It is not recommended to call this constructor directly except for advanced
* and internal use cases. Consider using one of the open{File,Buffer} methods
* instead
*/
constructor(metadata, envelopeReader, opts) {
opts = opts || {};
if (metadata.version != PARQUET_VERSION) {
throw new Error('invalid parquet version');
}
// If the metadata was loaded from JSON, convert serialized INT64 and CTIME values back to their native types
if (metadata.json) {
const convert = (o) => {
if (o && typeof o === 'object') {
Object.keys(o).forEach(key => o[key] = convert(o[key]));
if (o.parquetType === 'CTIME') {
return new Date(o.value);
} else if (o.parquetType === 'INT64') {
return new Int64(Buffer.from(o.value));
}
}
return o;
};
// Go through all PageLocation objects and set the proper prototype
metadata.row_groups.forEach(rowGroup => {
rowGroup.columns.forEach(column => {
if (column.offsetIndex) {
column.offsetIndex.page_locations.forEach(d => {
if (Array.isArray(d)) {
Object.setPrototypeOf(d,parquet_thrift.PageLocation.prototype);
}
});
}
});
});
convert(metadata);
}
this.metadata = envelopeReader.metadata = metadata;
this.envelopeReader = envelopeReader;
this.schema = envelopeReader.schema = new parquet_schema.ParquetSchema(
decodeSchema(
this.metadata.schema.slice(1)));
/* decode any statistics values */
if (this.metadata.row_groups && !this.metadata.json && !opts.rawStatistics) {
this.metadata.row_groups.forEach(row => row.columns.forEach( col => {
const stats = col.meta_data.statistics;
if (stats) {
const field = this.schema.findField(col.meta_data.path_in_schema);
stats.max_value = decodeStatisticsValue(stats.max_value, field);
stats.min_value = decodeStatisticsValue(stats.min_value, field);
stats.min = decodeStatisticsValue(stats.min, field);
stats.max = decodeStatisticsValue(stats.max, field);
}
}));
}
}
/**
* Return a cursor to the file. You may open more than one cursor and use
* them concurrently. All cursors become invalid once close() is called on
* the reader object.
*
* The columnList parameter controls which columns are actually read
* from disk. An empty array or no value implies all columns. A list of column
* names means that only those columns should be loaded from disk.
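*
* Sketch (column names are hypothetical): nested columns are addressed as
* arrays of path components:
*
*   const cursor = reader.getCursor(['name', ['address', 'zip']]);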
*/
getCursor(columnList) {
if (!columnList) {
columnList = [];
}
columnList = columnList.map((x) => Array.isArray(x) ? x : [x]);
return new ParquetCursor(
this.metadata,
this.envelopeReader,
this.schema,
columnList);
}
/**
* Return the number of rows in this file. Note that the number of rows is
* not necessarily equal to the number of values in each column.
*/
getRowCount() {
return this.metadata.num_rows;
}
/**
* Returns the ParquetSchema for this file
*/
getSchema() {
return this.schema;
}
/**
* Returns the user (key/value) metadata for this file
*/
getMetadata() {
let md = {};
for (let kv of this.metadata.key_value_metadata || []) {
md[kv.key] = kv.value;
}
return md;
}
exportMetadata(indent) {
return exportMetadata(this.metadata, indent);
}
/**
* Close this parquet reader. You MUST call this method once you're finished
* reading rows
*/
async close() {
await this.envelopeReader.close();
this.envelopeReader = null;
this.metadata = null;
}
decodePages(buffer, opts) {
return decodePages(buffer,opts);
}
}
/**
* The parquet envelope reader allows direct, unbuffered access to the individual
* sections of the parquet file, namely the header, footer and the row groups.
* This class is intended for advanced/internal users; if you just want to retrieve
* rows from a parquet file, use the ParquetReader instead.
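*
* A minimal sketch of direct use (file name is hypothetical):
*
*   const envelopeReader = await ParquetEnvelopeReader.openFile('data.parquet', {});
*   await envelopeReader.readHeader();
*   const metadata = await envelopeReader.readFooter();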
*/
let ParquetEnvelopeReaderIdCounter = 0;
class ParquetEnvelopeReader {
static async openFile(filePath, options) {
let fileStat = await parquet_util.fstat(filePath);
let fileDescriptor = await parquet_util.fopen(filePath);
let readFn = (offset, length, file) => {
if (file) {
return Promise.reject(new Error('external references are not supported'));
}
return parquet_util.fread(fileDescriptor, offset, length);
};
let closeFn = parquet_util.fclose.bind(undefined, fileDescriptor);
return new ParquetEnvelopeReader(readFn, closeFn, fileStat.size, options);
}
static async openBuffer(buffer, options) {
let readFn = (offset, length, file) => {
if (file) {
return Promise.reject(new Error('external references are not supported'));
}
return Promise.resolve(buffer.slice(offset,offset+length));
};
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, buffer.length, options);
}
static async openS3(client, params, options) {
let fileStat = async () => client.headObject(params).promise().then(d => d.ContentLength);
let readFn = async (offset, length, file) => {
if (file) {
return Promise.reject(new Error('external references are not supported'));
}
let Range = `bytes=${offset}-${offset+length-1}`;
let res = await client.getObject(Object.assign({Range}, params)).promise();
return Promise.resolve(res.Body);
};
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, fileStat, options);
}
static async openUrl(request, params, options) {
if (typeof params === 'string')
params = {url: params};
if (!params.url)
throw new Error('URL missing');
let base = params.url.split('/');
base = base.slice(0, base.length-1).join('/')+'/';
params.encoding = params.encoding || null;
let defaultHeaders = params.headers || {};
let filesize = async () => new Promise( (resolve, reject) => {
let req = request(params);
req.on('response', res => {
req.abort();
resolve(Number(res.headers['content-length']));
});
req.on('error', reject);
});
let readFn = (offset, length, file) => {
let url = file ? base+file : params.url;
let range = `bytes=${offset}-${offset+length-1}`;
let headers = Object.assign({}, defaultHeaders, {range});
let req = Object.assign({}, params, {headers, url});
return new Promise( (resolve, reject) => {
request(req, (err, res) => {
if (err) {
reject(err);
} else {
resolve(res.body);
}
});
});
};
let closeFn = () => ({});
return new ParquetEnvelopeReader(readFn, closeFn, filesize, options);
}
constructor(readFn, closeFn, fileSize, options) {
options = options || {};
this.readFn = readFn;
this.id = ++ParquetEnvelopeReaderIdCounter;
this.close = closeFn;
this.fileSize = fileSize;
this.default_dictionary_size = options.default_dictionary_size || 10000000;
if (options.maxLength || options.maxSpan || options.queueWait) {
const bufferReader = new BufferReader(this, options);
this.read = (offset, length) => bufferReader.read(offset, length);
}
}
read(offset, length, file) {
return this.readFn(offset, length, file);
}
readHeader() {
return this.read(0, PARQUET_MAGIC.length).then(buf => {
if (buf.toString() !== PARQUET_MAGIC) {
throw new Error('not a valid parquet file');
}
});
}
// Helper function to get the column object for a particular path and row_group
getColumn(path, row_group) {
let column;
if (!isNaN(row_group)) {
row_group = this.metadata.row_groups[row_group];
}
if (typeof path === 'string') {
if (!row_group) {
throw new Error(`Missing RowGroup for column ${path}`);
}
column = row_group.columns.find(d => d.meta_data.path_in_schema.join(',') === path);
if (!column) {
throw new Error(`Column ${path} Not Found`);
}
} else {
column = path;
}
return column;
}
readOffsetIndex(path, row_group, opts) {
let column = this.getColumn(path, row_group);
if (column.offsetIndex) {
return Promise.resolve(column.offsetIndex);
} else if (!column.offset_index_offset || !column.offset_index_length) {
return Promise.reject(new Error('Offset Index Missing'));
}
const data = this.read(+column.offset_index_offset, column.offset_index_length).then(data => {
let offset_index = new parquet_thrift.OffsetIndex();
parquet_util.decodeThrift(offset_index, data);
Object.defineProperty(offset_index,'column', {value: column, enumerable: false});
if (opts && opts.cache) {
column.offsetIndex = offset_index;
}
return offset_index;
});
if (opts && opts.cache) {
column.offsetIndex = data;
}
return data;
}
readColumnIndex(path, row_group, opts) {
let column = this.getColumn(path, row_group);
if (column.columnIndex) {
return Promise.resolve(column.columnIndex);
} else if (!column.column_index_offset) {
return Promise.reject(new Error('Column Index Missing'));
}
const data = this.read(+column.column_index_offset, column.column_index_length).then(data => {
let column_index = new parquet_thrift.ColumnIndex();
parquet_util.decodeThrift(column_index, data);
Object.defineProperty(column_index, 'column', { value: column });
// decode the statistics values
const field = this.schema.findField(column.meta_data.path_in_schema);
if (column_index.max_values) {
column_index.max_values = column_index.max_values.map(max_value => decodeStatisticsValue(max_value, field));
}
if (column_index.min_values) {
column_index.min_values = column_index.min_values.map(min_value => decodeStatisticsValue(min_value, field));
}
if (opts && opts.cache) {
column.columnIndex = column_index;
}
return column_index;
});
if (opts && opts.cache) {
column.columnIndex = data;
}
return data;
}
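// Read a single data page from a column chunk and materialize its records into
// the `records` array. `page` is either an index into the offset index or an
// object with explicit `offset` and `compressed_page_size` properties.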
async readPage(column, page, records, opts) {
column = Object.assign({},column);
column.meta_data = Object.assign({},column.meta_data);
if (page.offset !== undefined) {
if (isNaN(page.offset) || isNaN(page.compressed_page_size)) {
throw new Error('page offset and/or size missing');
}
column.meta_data.data_page_offset = page.offset;
column.meta_data.total_compressed_size = page.compressed_page_size;
} else {
const offsetIndex = column.offsetIndex || await this.readOffsetIndex(column, null, opts);
column.meta_data.data_page_offset = offsetIndex.page_locations[page].offset;
column.meta_data.total_compressed_size = offsetIndex.page_locations[page].compressed_page_size;
}
const chunk = await this.readColumnChunk(this.schema, column);
Object.defineProperty(chunk,'column', {value: column});
let data = {
columnData: {[chunk.column.meta_data.path_in_schema.join(',')]: chunk}
};
return parquet_shredder.materializeRecords(this.schema, data, records);
}
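// Read all column chunks of a row group into a shredded record buffer,
// optionally restricted to the columns named in columnList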
async readRowGroup(schema, rowGroup, columnList) {
var buffer = {
rowCount: +rowGroup.num_rows,
columnData: {}
};
for (let colChunk of rowGroup.columns) {
const colMetadata = colChunk.meta_data;
const colKey = colMetadata.path_in_schema;
if (columnList.length > 0 && parquet_util.fieldIndexOf(columnList, colKey) < 0) {
continue;
}
buffer.columnData[colKey] = await this.readColumnChunk(schema, colChunk);
}
return buffer;
}
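// Read and decode a single column chunk; if the chunk has a dictionary page,
// it is read and decoded first so data pages can be resolved against it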
readColumnChunk(schema, colChunk, opts) {
let dictionary = Promise.resolve();
let field = schema.findField(colChunk.meta_data.path_in_schema);
let type = parquet_util.getThriftEnum(
parquet_thrift.Type,
colChunk.meta_data.type);
let compression = parquet_util.getThriftEnum(
parquet_thrift.CompressionCodec,
colChunk.meta_data.codec);
let pagesOffset = +colChunk.meta_data.data_page_offset;
let pagesSize = +colChunk.meta_data.total_compressed_size;
if (!colChunk.file_path) {
pagesSize = Math.min(this.fileSize - pagesOffset, +colChunk.meta_data.total_compressed_size);
}
opts = Object.assign({},opts, {
type: type,
rLevelMax: field.rLevelMax,
dLevelMax: field.dLevelMax,
compression: compression,
column: field,
num_values: colChunk.meta_data.num_values
});
if (colChunk.meta_data.dictionary_page_offset) {
const offset = +colChunk.meta_data.dictionary_page_offset;
const size = Math.min(+this.fileSize - offset, this.default_dictionary_size);
dictionary = this.read(offset, size, colChunk.file_path).then(buffer => decodePage({offset: 0, buffer, size: buffer.length}, opts).dictionary);
}
return dictionary.then(dict => {
opts.dictionary = opts.dictionary || dict;
return this.read(pagesOffset, pagesSize, colChunk.file_path).then(pagesBuf => decodePages(pagesBuf, opts));
});
}
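// The parquet file trailer is laid out as
//   [file metadata (thrift)] [4-byte little-endian metadata length] ['PAR1']
// so the metadata is located by reading the fixed-size trailer first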
async readFooter() {
if (typeof this.fileSize === 'function') {
this.fileSize = await this.fileSize();
}
let trailerLen = PARQUET_MAGIC.length + 4;
let trailerBuf = await this.read(this.fileSize - trailerLen, trailerLen);
if (trailerBuf.slice(4).toString() !== PARQUET_MAGIC) {
throw new Error('not a valid parquet file');
}
let metadataSize = trailerBuf.readUInt32LE(0);
let metadataOffset = this.fileSize - metadataSize - trailerLen;
if (metadataOffset < PARQUET_MAGIC.length) {
throw new Error('invalid metadata size');
}
let metadataBuf = await this.read(metadataOffset, metadataSize);
let metadata = new parquet_thrift.FileMetaData();
parquet_util.decodeThrift(metadata, metadataBuf);
return metadata;
}
}
/**
* Decode a consecutive array of data using one of the parquet encodings
*/
function decodeValues(type, encoding, cursor, count, opts) {
if (!(encoding in parquet_codec)) {
throw new Error('invalid encoding: ' + encoding);
}
return parquet_codec[encoding].decodeValues(type, cursor, count, opts);
}
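/**
* Decode a single raw statistics value into its logical representation
*/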
function decodeStatisticsValue(value, column) {
if (value == null || !value.length) {
return undefined;
}
if (!column.primitiveType.includes('BYTE_ARRAY')) {
value = decodeValues(column.primitiveType,'PLAIN',{buffer: Buffer.from(value), offset: 0}, 1, column);
if (value.length === 1) value = value[0];
}
if (column.originalType) {
value = parquet_types.fromPrimitive(column.originalType, value);
}
return value;
}
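/**
* Decode the min/max statistics of a column chunk or page in place
*/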
function decodeStatistics(statistics, column) {
if (!statistics) {
return;
}
if (statistics.min_value !== null) {
statistics.min_value = decodeStatisticsValue(statistics.min_value, column);
}
if (statistics.max_value !== null) {
statistics.max_value = decodeStatisticsValue(statistics.max_value, column);
}
statistics.min = decodeStatisticsValue(statistics.min, column) || statistics.min_value;
statistics.max = decodeStatisticsValue(statistics.max, column) || statistics.max_value;
return statistics;
}
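/**
* Decode a single page (data page v1/v2 or dictionary page) at the cursor
* position and advance the cursor past it
*/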
function decodePage(cursor, opts) {
opts = opts || {};
let page;
const pageHeader = new parquet_thrift.PageHeader();
const headerOffset = cursor.offset;
const headerSize = parquet_util.decodeThrift(pageHeader, cursor.buffer.slice(cursor.offset));
cursor.offset += headerSize;
const pageType = parquet_util.getThriftEnum(
parquet_thrift.PageType,
pageHeader.type);
switch (pageType) {
case 'DATA_PAGE':
if (!opts.rawStatistics) {
pageHeader.data_page_header.statistics = decodeStatistics(pageHeader.data_page_header.statistics, opts.column);
}
page = decodeDataPage(cursor, pageHeader, opts);
break;
case 'DATA_PAGE_V2':
if (!opts.rawStatistics) {
pageHeader.data_page_header_v2.statistics = decodeStatistics(pageHeader.data_page_header_v2.statistics, opts.column);
}
page = decodeDataPageV2(cursor, pageHeader, opts);
break;
case 'DICTIONARY_PAGE':
page = {
dictionary: decodeDictionaryPage(cursor, pageHeader, opts)
};
break;
default:
throw new Error(`invalid page type: ${pageType}`);
}
pageHeader.offset = headerOffset;
pageHeader.headerSize = headerSize;
page.pageHeader = pageHeader;
return page;
}
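/**
* Decode all pages in a buffer; a dictionary page, if present, is decoded
* first and used to resolve the values of the following data pages
*/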
function decodePages(buffer, opts) {
opts = opts || {};
let cursor = {
buffer: buffer,
offset: 0,
size: buffer.length
};
let data = {
rlevels: [],
dlevels: [],
values: [],
pageHeaders: [],
count: 0
};
while (cursor.offset < cursor.size && (!opts.num_values || data.dlevels.length < opts.num_values)) {
const pageData = decodePage(cursor, opts);
if (pageData.dictionary) {
opts.dictionary = pageData.dictionary;
continue;
}
if (opts.dictionary) {
pageData.values = pageData.values.map(d => opts.dictionary[d]);
}
for (let i = 0; i < pageData.rlevels.length; i++) {
data.rlevels.push(pageData.rlevels[i]);
data.dlevels.push(pageData.dlevels[i]);
let value = pageData.values[i];
if (value !== undefined) {
data.values.push(value);
}
}
data.count += pageData.count;
data.pageHeaders.push(pageData.pageHeader);
}
return data;
}
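/**
* Decode a dictionary page into an array of dictionary values
*/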
function decodeDictionaryPage(cursor, header, opts) {
const cursorEnd = cursor.offset + header.compressed_page_size;
let dictCursor = {
offset: 0,
buffer: cursor.buffer.slice(cursor.offset,cursorEnd),
size: cursorEnd - cursor.offset
};
cursor.offset = cursorEnd;
if (opts.compression && opts.compression !== 'UNCOMPRESSED') {
let valuesBuf = parquet_compression.inflate(
opts.compression,
dictCursor.buffer.slice(dictCursor.offset,cursorEnd));
dictCursor = {
buffer: valuesBuf,
offset: 0,
size: valuesBuf.length
};
}
return decodeValues(opts.column.primitiveType, opts.column.encoding, dictCursor, header.dictionary_page_header.num_values, opts)
.map(d => d.toString());
}
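/**
* Decode a v1 data page: repetition levels, definition levels, then values
*/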
function decodeDataPage(cursor, header, opts) {
const cursorEnd = cursor.offset + header.compressed_page_size;
let valueCount = header.data_page_header.num_values;
let valueEncoding = parquet_util.getThriftEnum(
parquet_thrift.Encoding,
header.data_page_header.encoding);
let valuesBufCursor = cursor;
if (opts.compression && opts.compression !== 'UNCOMPRESSED') {
let valuesBuf = parquet_compression.inflate(
opts.compression,
cursor.buffer.slice(cursor.offset, cursorEnd));
valuesBufCursor = {
buffer: valuesBuf,
offset: 0,
size: valuesBuf.length
};
}
/* read repetition levels */
let rLevelEncoding = parquet_util.getThriftEnum(
parquet_thrift.Encoding,
header.data_page_header.repetition_level_encoding);
let rLevels = new Array(valueCount);
if (opts.rLevelMax > 0) {
rLevels = decodeValues(
PARQUET_RDLVL_TYPE,
rLevelEncoding,
valuesBufCursor,
valueCount,
{ bitWidth: parquet_util.getBitWidth(opts.rLevelMax) });
} else {
rLevels.fill(0);
}
/* read definition levels */
let dLevelEncoding = parquet_util.getThriftEnum(
parquet_thrift.Encoding,
header.data_page_header.definition_level_encoding);
let dLevels = new Array(valueCount);
if (opts.dLevelMax > 0) {
dLevels = decodeValues(
PARQUET_RDLVL_TYPE,
dLevelEncoding,
valuesBufCursor,
valueCount,
{ bitWidth: parquet_util.getBitWidth(opts.dLevelMax) });
} else {
dLevels.fill(0);
}
/* read values */
let valueCountNonNull = 0;
for (let dlvl of dLevels) {
if (dlvl === opts.dLevelMax) {
++valueCountNonNull;
}
}
let values = decodeValues(
opts.type,
valueEncoding,
valuesBufCursor,
valueCountNonNull,
{
typeLength: opts.column.typeLength,
bitWidth: opts.column.typeLength,
disableEnvelope: opts.column.disableEnvelope
});
cursor.offset = cursorEnd;
return {
dlevels: dLevels,
rlevels: rLevels,
values: values,
count: valueCount
};
}
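/**
* Decode a v2 data page; levels are always RLE-encoded without an envelope,
* and only the values section may be compressed
*/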
function decodeDataPageV2(cursor, header, opts) {
const cursorEnd = cursor.offset + header.compressed_page_size;
const valueCount = header.data_page_header_v2.num_values;
const valueCountNonNull = valueCount - header.data_page_header_v2.num_nulls;
const valueEncoding = parquet_util.getThriftEnum(
parquet_thrift.Encoding,
header.data_page_header_v2.encoding);
/* read repetition levels */
let rLevels = new Array(valueCount);
if (opts.rLevelMax > 0) {
rLevels = decodeValues(
PARQUET_RDLVL_TYPE,
PARQUET_RDLVL_ENCODING,
cursor,
valueCount,
{
bitWidth: parquet_util.getBitWidth(opts.rLevelMax),
disableEnvelope: true
});
} else {
rLevels.fill(0);
}
/* read definition levels */
let dLevels = new Array(valueCount);
if (opts.dLevelMax > 0) {
dLevels = decodeValues(
PARQUET_RDLVL_TYPE,
PARQUET_RDLVL_ENCODING,
cursor,
valueCount,
{
bitWidth: parquet_util.getBitWidth(opts.dLevelMax),
disableEnvelope: true
});
} else {
dLevels.fill(0);
}
/* read values */
let valuesBufCursor = cursor;
if (header.data_page_header_v2.is_compressed) {
let valuesBuf = parquet_compression.inflate(
opts.compression,
cursor.buffer.slice(cursor.offset, cursorEnd));
valuesBufCursor = {
buffer: valuesBuf,
offset: 0,
size: valuesBuf.length
};
cursor.offset = cursorEnd;
}
let values = decodeValues(
opts.type,
valueEncoding,
valuesBufCursor,
valueCountNonNull,
{
typeLength: opts.column.typeLength,
bitWidth: opts.column.typeLength
});
return {
dlevels: dLevels,
rlevels: rLevels,
values: values,
count: valueCount
};
}
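/**
* Reconstruct the schema tree from the flat list of thrift schema elements
*/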
function decodeSchema(schemaElements) {
let schema = {};
schemaElements.forEach(schemaElement => {
let repetitionType = parquet_util.getThriftEnum(
parquet_thrift.FieldRepetitionType,
schemaElement.repetition_type);
let optional = false;
let repeated = false;
switch (repetitionType) {
case 'REQUIRED':
break;
case 'OPTIONAL':
optional = true;
break;
case 'REPEATED':
repeated = true;
break;
}
if (schemaElement.num_children > 0) {
schema[schemaElement.name] = {
optional: optional,
repeated: repeated,
fields: Object.create({},{
/* define parent and num_children as non-enumerable */
parent: {
value: schema,
enumerable: false
},
num_children: {
value: schemaElement.num_children,
enumerable: false
}
})
};
/* move the schema pointer to the children */
schema = schema[schemaElement.name].fields;
} else {
let logicalType = parquet_util.getThriftEnum(
parquet_thrift.Type,
schemaElement.type);
if (schemaElement.converted_type != null) {
logicalType = parquet_util.getThriftEnum(
parquet_thrift.ConvertedType,
schemaElement.converted_type);
}
schema[schemaElement.name] = {
type: logicalType,
typeLength: schemaElement.type_length,
optional: optional,
repeated: repeated
};
}
/* if we have processed all children we move schema pointer to parent again */
while (schema.parent && Object.keys(schema).length === schema.num_children) {
schema = schema.parent;
}
});
return schema;
}
module.exports = {
ParquetEnvelopeReader,
ParquetReader,
};