// @dobesv/parquets
// Version: (not recorded)
// TypeScript implementation of the Parquet file format, based on parquet.js
// 294 lines • 11.5 kB
// JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.encodeFooter = exports.encodeRowGroup = exports.PARQUET_RDLVL_ENCODING = exports.PARQUET_RDLVL_TYPE = exports.PARQUET_DEFAULT_ROW_GROUP_SIZE = exports.PARQUET_DEFAULT_PAGE_SIZE = exports.PARQUET_VERSION = exports.PARQUET_MAGIC = void 0;
const codec_1 = require("./codec");
const plain_1 = require("./codec/plain");
const Compression = require("./compression");
const thrift_1 = require("./thrift");
const Util = require("./util");
const Int64 = require("node-int64");
/**
 * Parquet File Magic String
 *
 * encodeFooter writes this string as the final bytes of the footer buffer,
 * directly after the 32-bit little-endian metadata length.
 */
exports.PARQUET_MAGIC = 'PAR1';
/**
 * Parquet Version
 *
 * Stored in the `version` field of the FileMetaData built by encodeFooter.
 */
exports.PARQUET_VERSION = 1;
/**
 * Default Page and Row Group sizes
 *
 * encodeColumnChunk compares the page size against a column's value count
 * (`data.count`), so within this module it is a number of values, not bytes.
 * The row group size is not referenced elsewhere in this module; presumably it
 * is consumed by the writer - confirm its unit there.
 */
exports.PARQUET_DEFAULT_PAGE_SIZE = 8192;
exports.PARQUET_DEFAULT_ROW_GROUP_SIZE = 4096;
/**
 * Repetition and Definition Level encodings
 *
 * The type and encoding names handed to encodeValues when the data page
 * encoders serialize repetition/definition levels. The encoding name is also
 * looked up in the thrift Encoding table for page headers and column metadata.
 */
exports.PARQUET_RDLVL_TYPE = 'INT32';
exports.PARQUET_RDLVL_ENCODING = 'RLE';
/**
 * Encode a consecutive array of data using one of the parquet encodings.
 *
 * @param {string} type - primitive type name, forwarded to the codec
 * @param {string} encoding - key selecting an entry of PARQUET_CODEC
 * @param {Array} values - values to encode
 * @param {object} opts - codec options, forwarded unchanged
 * @returns {*} the selected codec's result (callers in this file treat it as a Buffer)
 * @throws {Error} `invalid encoding: <encoding>` when no usable codec is
 *   registered under that name
 */
function encodeValues(type, encoding, values, opts) {
    // Look the codec up and verify it is callable instead of testing with `in`:
    // `in` also matches inherited keys such as "toString" or "constructor",
    // which would slip past the check and fail later with an opaque TypeError.
    const codec = codec_1.PARQUET_CODEC[encoding];
    if (codec == null || typeof codec.encodeValues !== 'function') {
        throw new Error(`invalid encoding: ${encoding}`);
    }
    return codec.encodeValues(type, values, opts);
}
/**
 * Build a (version 1) parquet data page for one column.
 *
 * The page payload is the repetition levels, the definition levels and the
 * values, concatenated in that order; the payload as a whole is then passed
 * through Compression.deflate using the column's compression setting.
 *
 * @param {object} column - column definition; reads rLevelMax, dLevelMax,
 *   primitiveType, encoding, typeLength and compression
 * @param {object} data - page contents; reads rLevels, dLevels, values and count
 * @returns {{header: object, headerSize: number, page: Buffer}} the PageHeader,
 *   the byte length of its serialized form, and the serialized header followed
 *   by the compressed payload
 */
function encodeDataPage(column, data) {
    /* a level stream is only written when the column's maximum level is non-zero */
    const repetitionBytes = column.rLevelMax > 0
        ? encodeValues(exports.PARQUET_RDLVL_TYPE, exports.PARQUET_RDLVL_ENCODING, data.rLevels, {
            bitWidth: Util.getBitWidth(column.rLevelMax),
        })
        : Buffer.alloc(0);
    const definitionBytes = column.dLevelMax > 0
        ? encodeValues(exports.PARQUET_RDLVL_TYPE, exports.PARQUET_RDLVL_ENCODING, data.dLevels, {
            bitWidth: Util.getBitWidth(column.dLevelMax),
        })
        : Buffer.alloc(0);
    /* the column's typeLength is supplied both as typeLength and as bitWidth */
    const valueBytes = encodeValues(column.primitiveType, column.encoding, data.values, {
        typeLength: column.typeLength,
        bitWidth: column.typeLength,
    });
    const payload = Buffer.concat([repetitionBytes, definitionBytes, valueBytes]);
    const deflated = Compression.deflate(column.compression, payload);
    /* describe the page: both level streams are declared with the same encoding */
    const levelEncoding = thrift_1.Encoding[exports.PARQUET_RDLVL_ENCODING];
    const pageHeader = new thrift_1.PageHeader({
        type: thrift_1.PageType.DATA_PAGE,
        data_page_header: new thrift_1.DataPageHeader({
            num_values: data.count,
            encoding: thrift_1.Encoding[column.encoding],
            definition_level_encoding: levelEncoding,
            repetition_level_encoding: levelEncoding,
        }),
        uncompressed_page_size: payload.length,
        compressed_page_size: deflated.length,
    });
    /* the serialized header goes in front of the compressed payload */
    const headerBytes = Util.serializeThrift(pageHeader);
    return {
        header: pageHeader,
        headerSize: headerBytes.length,
        page: Buffer.concat([headerBytes, deflated]),
    };
}
/**
 * Build a version 2 parquet data page for one column.
 *
 * Unlike the version 1 page, only the encoded values are passed through
 * Compression.deflate; the repetition and definition level streams are placed
 * in the page as encoded, and their byte lengths are recorded in the header.
 *
 * @param {object} column - column definition; reads rLevelMax, dLevelMax,
 *   primitiveType, encoding, typeLength and compression
 * @param {object} data - page contents; reads rLevels, dLevels, values and count
 * @param {number} rowCount - stored as num_rows in the page header
 * @returns {{header: object, headerSize: number, page: Buffer}} the PageHeader,
 *   the byte length of its serialized form, and the full page bytes
 *   (header, repetition levels, definition levels, compressed values)
 */
function encodeDataPageV2(column, data, rowCount) {
    /* values first: they are the only part that gets compressed */
    const valueBytes = encodeValues(column.primitiveType, column.encoding, data.values, {
        typeLength: column.typeLength,
        bitWidth: column.typeLength,
    });
    const deflatedValues = Compression.deflate(column.compression, valueBytes);
    /* level streams are encoded with disableEnvelope set, and only when the max level is non-zero */
    const encodeLevels = (levels, maxLevel) => encodeValues(exports.PARQUET_RDLVL_TYPE, exports.PARQUET_RDLVL_ENCODING, levels, {
        bitWidth: Util.getBitWidth(maxLevel),
        disableEnvelope: true,
    });
    const repetitionBytes = column.rLevelMax > 0
        ? encodeLevels(data.rLevels, column.rLevelMax)
        : Buffer.alloc(0);
    const definitionBytes = column.dLevelMax > 0
        ? encodeLevels(data.dLevels, column.dLevelMax)
        : Buffer.alloc(0);
    /* page sizes count the level bytes plus the raw or compressed values */
    const levelsLength = repetitionBytes.length + definitionBytes.length;
    const pageHeader = new thrift_1.PageHeader({
        type: thrift_1.PageType.DATA_PAGE_V2,
        data_page_header_v2: new thrift_1.DataPageHeaderV2({
            num_values: data.count,
            num_nulls: data.count - data.values.length,
            num_rows: rowCount,
            encoding: thrift_1.Encoding[column.encoding],
            definition_levels_byte_length: definitionBytes.length,
            repetition_levels_byte_length: repetitionBytes.length,
            is_compressed: column.compression !== 'UNCOMPRESSED',
        }),
        uncompressed_page_size: levelsLength + valueBytes.length,
        compressed_page_size: levelsLength + deflatedValues.length,
    });
    /* assemble: header, repetition levels, definition levels, compressed values */
    const headerBytes = Util.serializeThrift(pageHeader);
    return {
        header: pageHeader,
        headerSize: headerBytes.length,
        page: Buffer.concat([headerBytes, repetitionBytes, definitionBytes, deflatedValues]),
    };
}
/**
 * Encode the buffered values of one column into a parquet column chunk.
 *
 * The column's data is written as a single page when its value count does not
 * exceed the page size, and is otherwise cut into pages of roughly `pageSize`
 * entries. The page size is therefore measured in values, not bytes.
 *
 * @param {object} column - column definition; its joined `path` is the key
 *   into `buffer.columnData` and `buffer.statistics`
 * @param {object} buffer - row buffer; reads columnData, statistics and rowCount
 * @param {number} offset - offset of this chunk, added to `opts.baseOffset`
 * @param {object} opts - reads baseOffset, pageSize and useDataPageV2
 * @returns {{body: Buffer, metadata: object, metadataOffset: number}} the page
 *   bytes followed by the serialized ColumnMetaData, the metadata object, and
 *   the offset at which the serialized metadata starts
 */
function encodeColumnChunk(column, buffer, offset, opts) {
    const columnKey = column.path.join();
    const data = buffer.columnData[columnKey];
    const stats = buffer.statistics[columnKey];
    const baseOffset = (opts.baseOffset || 0) + offset;
    const pageSize = opts.pageSize || exports.PARQUET_DEFAULT_PAGE_SIZE;
    const pages = [];
    const totals = { uncompressed: 0, compressed: 0 };
    /* encode one page in the configured format and add header + payload to the running totals */
    const emitPage = (pageData, rowCount) => {
        const { header, headerSize, page } = opts.useDataPageV2
            ? encodeDataPageV2(column, pageData, rowCount)
            : encodeDataPage(column, pageData);
        pages.push(page);
        totals.uncompressed += header.uncompressed_page_size + headerSize;
        totals.compressed += header.compressed_page_size + headerSize;
    };
    if (data.count <= pageSize) {
        emitPage(data, buffer.rowCount);
    }
    else {
        const isRepeated = column.rLevelMax > 0;
        let valuesTaken = 0;
        let start = 0;
        while (start < data.count) {
            let end = Math.min(start + pageSize, data.count);
            if (isRepeated) {
                /* extend the page until the next entry has repetition level 0, so a row is never split */
                while (end < data.count && data.rLevels[end] !== 0) {
                    end += 1;
                }
            }
            const dLevels = data.dLevels.slice(start, end);
            const rLevels = data.rLevels.slice(start, end);
            /* only entries at the maximum definition level have a stored value */
            const valueCount = dLevels.reduce((n, dLevel) => (dLevel === column.dLevelMax ? n + 1 : n), 0);
            const pageData = {
                dLevels,
                rLevels,
                values: data.values.slice(valuesTaken, valuesTaken + valueCount),
                count: end - start,
            };
            valuesTaken += valueCount;
            /* for repeated columns a row begins at each repetition level 0; otherwise one entry per row */
            let pageRowCount = pageData.count;
            if (isRepeated) {
                pageRowCount = rLevels.reduce((n, rLevel) => (rLevel === 0 ? n + 1 : n), 0);
            }
            emitPage(pageData, pageRowCount);
            start = end;
        }
    }
    const pageBuf = Buffer.concat(pages);
    const metadata = new thrift_1.ColumnMetaData({
        path_in_schema: column.path,
        num_values: data.count,
        data_page_offset: baseOffset,
        encodings: [],
        total_uncompressed_size: totals.uncompressed,
        total_compressed_size: totals.compressed,
        type: thrift_1.Type[column.primitiveType],
        codec: thrift_1.CompressionCodec[column.compression],
        statistics: new thrift_1.Statistics({
            min_value: encodeValue(stats.min, column),
            max_value: encodeValue(stats.max, column),
            null_count: new Int64(stats.null_count),
            distinct_count: new Int64(stats.distinct_values.size),
        }),
    });
    /* list the level encoding first, then the column's value encoding */
    metadata.encodings.push(thrift_1.Encoding[exports.PARQUET_RDLVL_ENCODING], thrift_1.Encoding[column.encoding]);
    /* the serialized metadata is appended directly after the pages */
    return {
        body: Buffer.concat([pageBuf, Util.serializeThrift(metadata)]),
        metadata,
        metadataOffset: baseOffset + pageBuf.length,
    };
}
/**
 * Encode a single value with the plain codec, using the column's primitive
 * type and type length. Used for the min/max entries of column statistics.
 *
 * @param {*} value - value to encode; null and undefined are not encoded
 * @param {object} column - column definition; reads primitiveType and typeLength
 * @returns {*} the plain codec's result for the one-element array `[value]`,
 *   or undefined when `value` is null or undefined
 */
function encodeValue(value, column) {
    if (value == null) {
        return undefined;
    }
    /* call the codec function detached from its module object, as the compiled import does */
    const { encodeValues: encodePlain } = plain_1;
    const { primitiveType, typeLength } = column;
    return encodePlain(primitiveType, [value], { typeLength, bitWidth: typeLength });
}
/**
 * Encode a list of column values into a parquet row group.
 *
 * Every non-nested field of the schema is encoded as one column chunk; the
 * chunks are laid out back to back in schema order.
 *
 * @param {object} schema - schema whose `fieldList` is iterated; fields with
 *   `isNested` set are skipped
 * @param {object} data - row buffer; `rowCount` becomes the row group's
 *   num_rows and the buffer is handed to encodeColumnChunk for each field
 * @param {object} opts - encoding options, forwarded to encodeColumnChunk
 * @returns {{body: Buffer, metadata: object}} the concatenated column chunks
 *   and the RowGroup metadata describing them
 */
function encodeRowGroup(schema, data, opts) {
    const metadata = new thrift_1.RowGroup({
        num_rows: data.rowCount,
        columns: [],
        total_byte_size: 0,
    });
    // Collect the chunk buffers and join them once at the end. Growing a single
    // buffer with Buffer.concat on every column re-copies all previously
    // encoded bytes each time, which is quadratic in the row group size.
    const chunkBodies = [];
    let bodyLength = 0;
    for (const field of schema.fieldList) {
        if (field.isNested) {
            continue;
        }
        /* the bytes written so far are this chunk's offset within the row group */
        const cchunkData = encodeColumnChunk(field, data, bodyLength, opts);
        metadata.columns.push(new thrift_1.ColumnChunk({
            file_offset: cchunkData.metadataOffset,
            meta_data: cchunkData.metadata,
        }));
        chunkBodies.push(cchunkData.body);
        bodyLength += cchunkData.body.length;
        metadata.total_byte_size = new Int64(bodyLength);
    }
    return { body: Buffer.concat(chunkBodies, bodyLength), metadata };
}
exports.encodeRowGroup = encodeRowGroup;
/**
 * Encode a parquet file metadata footer.
 *
 * The returned buffer holds the serialized FileMetaData, followed by its
 * length as a 32-bit little-endian integer, followed by the magic string.
 *
 * @param {object} schema - schema; `fields` gives the root's child count and
 *   every entry of `fieldList` becomes one SchemaElement after the root
 * @param {number} rowCount - stored as the file's num_rows
 * @param {Array} rowGroups - row group metadata objects, stored as row_groups
 * @param {object} [userMetadata] - key/value pairs to store as
 *   key_value_metadata; only the object's own enumerable keys are written
 * @returns {Buffer} the encoded footer
 */
function encodeFooter(schema, rowCount, rowGroups, userMetadata) {
    const metadata = new thrift_1.FileMetaData({
        version: exports.PARQUET_VERSION,
        created_by: 'parquets',
        num_rows: rowCount,
        row_groups: rowGroups,
        schema: [],
        key_value_metadata: [],
    });
    // Iterate own keys only: `for...in` would also pick up enumerable
    // properties inherited through the prototype chain and write them into
    // the file. A missing metadata object still yields no entries.
    for (const key of Object.keys(userMetadata || {})) {
        metadata.key_value_metadata.push(new thrift_1.KeyValue({
            key,
            value: userMetadata[key],
        }));
    }
    /* the schema list starts with a synthetic root element */
    metadata.schema.push(new thrift_1.SchemaElement({
        name: 'root',
        num_children: Object.keys(schema.fields).length,
    }));
    for (const field of schema.fieldList) {
        const schemaElem = new thrift_1.SchemaElement({
            name: field.name,
            repetition_type: thrift_1.FieldRepetitionType[field.repetitionType],
        });
        /* nested fields carry a child count, all other fields a primitive type */
        if (field.isNested) {
            schemaElem.num_children = field.fieldCount;
        }
        else {
            schemaElem.type = thrift_1.Type[field.primitiveType];
        }
        if (field.originalType) {
            schemaElem.converted_type = thrift_1.ConvertedType[field.originalType];
        }
        schemaElem.type_length = field.typeLength;
        metadata.schema.push(schemaElem);
    }
    /* layout: metadata bytes, 4-byte little-endian metadata length, magic string */
    const metadataEncoded = Util.serializeThrift(metadata);
    const footerEncoded = Buffer.alloc(metadataEncoded.length + 8);
    metadataEncoded.copy(footerEncoded);
    footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length);
    footerEncoded.write(exports.PARQUET_MAGIC, metadataEncoded.length + 4);
    return footerEncoded;
}
exports.encodeFooter = encodeFooter;
//# sourceMappingURL=encoding.js.map