@dsnp/parquetjs
Version:
fully asynchronous, pure JavaScript implementation of the Parquet file format
311 lines (310 loc) • 11.2 kB
JavaScript
;
// Interop helper (looks compiler-generated — TODO confirm): reuses an existing
// `this.__importDefault` when one is present; otherwise returns `mod` unchanged
// if it is flagged `__esModule`, or wraps it as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
// Flag this module's export object with `__esModule: true`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare both public exports as undefined; the real functions are assigned
// after their definitions further down in this file.
exports.decodeValues = exports.encodeValues = void 0;
const int53_1 = __importDefault(require("int53"));
/**
 * Pack a list of truthy/falsy entries into a bitmap, lowest bit first.
 *
 * @param {Array<*>} values - Entries; each one is tested for truthiness.
 * @returns {Buffer} Buffer of ceil(values.length / 8) bytes in which bit
 *   (i % 8) of byte floor(i / 8) is set when values[i] is truthy.
 */
function encodeValues_BOOLEAN(values) {
  const total = values.length;
  // Buffer.alloc returns zero-filled memory, so only the set bits are written.
  const packed = Buffer.alloc(Math.ceil(total / 8));
  let index = 0;
  while (index < total) {
    if (values[index]) {
      const byteIndex = Math.floor(index / 8);
      const mask = 1 << (index % 8);
      packed[byteIndex] |= mask;
    }
    index += 1;
  }
  return packed;
}
/**
 * Unpack `count` booleans from a bitmap stored lowest bit first.
 *
 * @param {{buffer: Buffer, offset: number}} cursor - Read position; `offset`
 *   is advanced by ceil(count / 8) once all bits have been read.
 * @param {number} count - Number of bits to read.
 * @returns {boolean[]} One boolean per bit, in order.
 */
function decodeValues_BOOLEAN(cursor, count) {
  const flags = [];
  for (let bit = 0; bit < count; bit += 1) {
    const mask = 1 << (bit % 8);
    const byte = cursor.buffer[cursor.offset + Math.floor(bit / 8)];
    flags.push((byte & mask) > 0);
  }
  // The cursor moves only after the loop, by whole bytes.
  cursor.offset += Math.ceil(count / 8);
  return flags;
}
/**
 * Encode values as consecutive little-endian 32-bit signed integers.
 *
 * When the column is flagged DECIMAL (directly on `opts` or on `opts.column`)
 * each value is multiplied by 10^scale and rounded to the nearest integer
 * before being written. Rounding matters because the binary floating-point
 * product is often a hair below the intended integer (0.29 * 100 evaluates to
 * 28.999999999999996), and the buffer write would otherwise drop the fraction.
 *
 * @param {Array<number>} values - Values to encode.
 * @param {object} [opts] - Column options.
 * @param {string} [opts.originalType] - 'DECIMAL' enables scaling.
 * @param {number} [opts.scale] - Decimal scale; falls back to
 *   `opts.column.scale`, then 0.
 * @param {object} [opts.column] - Alternative location of originalType/scale.
 * @returns {Buffer} Buffer of 4 * values.length bytes.
 * @throws {RangeError} From Buffer#writeInt32LE when a value does not fit in
 *   a signed 32-bit integer.
 */
function encodeValues_INT32(values, opts) {
  const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL';
  // The decode side reads the scale from `column` when the DECIMAL flag lives
  // there, so accept it from the same place here.
  const scale = (opts?.scale ?? opts?.column?.scale) || 0;
  const factor = Math.pow(10, scale);
  const buf = Buffer.alloc(4 * values.length);
  for (let i = 0; i < values.length; i++) {
    const toWrite = isDecimal ? Math.round(values[i] * factor) : values[i];
    buf.writeInt32LE(toWrite, i * 4);
  }
  return buf;
}
/**
 * Decode `count` little-endian 32-bit signed integers from the cursor.
 *
 * A column is treated as DECIMAL when either `opts.originalType` or
 * `opts.column.originalType` says so, matching decodeValues_INT64 and the
 * encoders in this file; in that case decoding is delegated to
 * decodeValues_DECIMAL with whichever object carries the DECIMAL settings.
 *
 * @param {{buffer: Buffer, offset: number}} cursor - Read position; advanced
 *   by 4 bytes per plain value.
 * @param {number} count - Number of values to read.
 * @param {object} [opts] - Column options (`originalType`, `name`, `column`, ...).
 * @returns {Array<number>} Decoded values.
 * @throws Re-throws any decoding error after logging the column name.
 */
function decodeValues_INT32(cursor, count, opts) {
  const name = opts?.name || opts?.column?.name || undefined;
  try {
    const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL';
    if (isDecimal) {
      // Prefer the column definition when it is the one flagged with a logical type.
      const columnOptions = opts.column?.originalType ? opts.column : opts;
      return decodeValues_DECIMAL(cursor, count, columnOptions);
    }
    const values = [];
    for (let i = 0; i < count; ++i) {
      values.push(cursor.buffer.readInt32LE(cursor.offset));
      cursor.offset += 4;
    }
    return values;
  }
  catch (e) {
    console.log(`Error thrown for column: ${name}`);
    throw e;
  }
}
/**
 * Encode values as consecutive little-endian 64-bit signed integers.
 *
 * When the column is flagged DECIMAL (directly on `opts` or on `opts.column`)
 * each value is multiplied by 10^scale and rounded to the nearest integer.
 * Rounding (instead of flooring) is required because the floating-point
 * product is frequently just below the intended integer: 0.29 * 100 evaluates
 * to 28.999999999999996, which flooring would store as 28.
 *
 * @param {Array<number|bigint|string>} values - Values to encode; in the
 *   non-decimal case each is passed through BigInt().
 * @param {object} [opts] - Column options.
 * @param {string} [opts.originalType] - 'DECIMAL' enables scaling.
 * @param {number} [opts.scale] - Decimal scale; falls back to
 *   `opts.column.scale`, then 0.
 * @param {object} [opts.column] - Alternative location of originalType/scale.
 * @returns {Buffer} Buffer of 8 * values.length bytes.
 */
function encodeValues_INT64(values, opts) {
  const isDecimal = opts?.originalType === 'DECIMAL' || opts?.column?.originalType === 'DECIMAL';
  // decodeValues_INT64 takes the scale from `column` when the DECIMAL flag
  // lives there, so accept it from the same place here.
  const scale = (opts?.scale ?? opts?.column?.scale) || 0;
  const factor = Math.pow(10, scale);
  const buf = Buffer.alloc(8 * values.length);
  for (let i = 0; i < values.length; i++) {
    const unscaled = isDecimal ? BigInt(Math.round(values[i] * factor)) : BigInt(values[i]);
    buf.writeBigInt64LE(unscaled, i * 8);
  }
  return buf;
}
/**
 * Decode `count` little-endian 64-bit signed integers from the cursor.
 *
 * Plain columns yield BigInt values. DECIMAL columns (flagged on `opts` or on
 * `opts.column`) yield Numbers equal to the stored integer divided by
 * 10^scale. Decimal values are always read as 8 bytes here: the INT64 encoder
 * in this file writes 8 bytes per value whatever the precision, so choosing
 * the read width from the precision (as decodeValues_DECIMAL does) would
 * misread INT64 decimals declared with precision <= 9.
 *
 * @param {{buffer: Buffer, offset: number, size?: number}} cursor - Read
 *   position; advanced by 8 bytes per decoded value.
 * @param {number} count - Number of values to read.
 * @param {object} opts - Column options (`originalType`, `precision`,
 *   `scale`, `name`, `column`).
 * @returns {Array<bigint>|Array<number>} Decoded values.
 * @throws {Error} When a DECIMAL column has no precision; any decoding error
 *   is re-thrown after the column name is logged.
 */
function decodeValues_INT64(cursor, count, opts) {
  const name = opts.name || opts.column?.name || undefined;
  try {
    const values = [];
    if (opts.originalType === 'DECIMAL' || opts.column?.originalType === 'DECIMAL') {
      // Prefer the column definition when it is the one flagged with a logical type.
      const columnOptions = opts.column?.originalType ? opts.column : opts;
      if (!columnOptions.precision) {
        throw new Error(`missing option: precision (required for DECIMAL) for column: ${columnOptions.name || undefined}`);
      }
      // Default scale to 0 per spec
      const divisor = Math.pow(10, columnOptions.scale || 0);
      const bufferSize = cursor.size || 0;
      for (let i = 0; i < count; ++i) {
        // Same guard as decodeValues_DECIMAL: stop once a declared size is reached.
        if (bufferSize !== 0 && cursor.offset >= bufferSize) {
          break;
        }
        values.push(Number(cursor.buffer.readBigInt64LE(cursor.offset)) / divisor);
        cursor.offset += 8;
      }
      return values;
    }
    for (let i = 0; i < count; ++i) {
      values.push(cursor.buffer.readBigInt64LE(cursor.offset));
      cursor.offset += 8;
    }
    return values;
  }
  catch (e) {
    console.log(`Error thrown for column: ${name}`);
    throw e;
  }
}
/**
 * Decode `count` fixed-width integers and rescale them into decimal Numbers.
 *
 * Values are read as 32-bit integers unless the precision exceeds 9 digits,
 * in which case 64-bit integers are read. Each raw integer is divided by
 * 10^scale. When the cursor carries a non-zero `size`, values whose start
 * offset is at or past that size are skipped, so fewer than `count` values
 * may be returned.
 *
 * @param {{buffer: Buffer, offset: number, size?: number}} cursor - Read
 *   position; advanced by 4 or 8 bytes per decoded value.
 * @param {number} count - Maximum number of values to read.
 * @param {{precision: number, scale?: number, name?: string}} opts - Decimal
 *   settings; scale defaults to 0.
 * @returns {number[]} Decoded decimal values.
 * @throws {Error} When `opts.precision` is missing or falsy.
 */
function decodeValues_DECIMAL(cursor, count, opts) {
  const { precision } = opts;
  const divisor = Math.pow(10, opts.scale || 0);
  const columnName = opts.name || undefined;
  if (!precision) {
    throw new Error(`missing option: precision (required for DECIMAL) for column: ${columnName}`);
  }
  // More than 9 digits cannot fit in 32 bits, so switch to the 64-bit reader.
  const isWide = precision > 9;
  const width = isWide ? 8 : 4;
  const readRaw = isWide
    ? (at) => cursor.buffer.readBigInt64LE(at)
    : (at) => cursor.buffer.readInt32LE(at);
  const decoded = [];
  for (let remaining = count; remaining > 0; remaining -= 1) {
    const limit = cursor.size || 0;
    if (limit !== 0 && cursor.offset >= limit) {
      continue;
    }
    decoded.push(Number(readRaw(cursor.offset)) / divisor);
    cursor.offset += width;
  }
  return decoded;
}
/**
 * Encode values as 12-byte little-endian integers with at most 64 bits of
 * actual precision.
 *
 * The low 8 bytes hold the value as a signed 64-bit integer (written through
 * int53); the high 4 bytes are a sign extension: 0 for non-negative values,
 * 0xffffffff for negative ones.
 *
 * Negative values used to be routed through `~-v + 1`, which coerces to a
 * 32-bit integer and therefore corrupted anything below -2^31 (for example
 * -2^40 was written as 0). They are now truncated with Math.trunc, which
 * gives the same result inside the 32-bit range and the correct one outside.
 *
 * @param {Array<number>} values - Values to encode.
 * @returns {Buffer} Buffer of 12 * values.length bytes.
 */
function encodeValues_INT96(values) {
  const buf = Buffer.alloc(12 * values.length);
  for (let i = 0; i < values.length; i++) {
    const base = i * 12;
    if (values[i] >= 0) {
      int53_1.default.writeInt64LE(values[i], buf, base);
      buf.writeUInt32LE(0, base + 8); // truncate to 64 actual precision
    }
    else {
      // `|| 0` maps NaN and -0 to 0, which is what the previous bitwise
      // expression produced for those inputs.
      const whole = Math.trunc(values[i]) || 0;
      int53_1.default.writeInt64LE(whole, buf, base);
      buf.writeUInt32LE(0xffffffff, base + 8); // truncate to 64 actual precision
    }
  }
  return buf;
}
/**
 * Decode `count` 12-byte little-endian INT96 values from the cursor.
 *
 * Each value is read as a signed 64-bit low part (through int53) and an
 * unsigned 32-bit high part. With `opts.treatInt96AsTimestamp === true` the
 * pair is handed to convertInt96ToTimestamp as (high, low). Otherwise the
 * signed low part is returned as is.
 *
 * The non-timestamp negative branch used to return `~-low + 1`, which coerces
 * to a 32-bit integer and corrupted any value outside the int32 range. The
 * low part is already a signed 64-bit read, so it is the value itself.
 *
 * @param {{buffer: Buffer, offset: number}} cursor - Read position; advanced
 *   by 12 bytes per value.
 * @param {number} count - Number of values to read.
 * @param {{treatInt96AsTimestamp?: boolean}} [opts] - Defaults to
 *   non-timestamp decoding for backward compatibility.
 * @returns {Array<number>|Array<Date>} Decoded values.
 */
function decodeValues_INT96(cursor, count, opts) {
  const values = [];
  // Default to false for backward compatibility
  const treatAsTimestamp = opts?.treatInt96AsTimestamp === true;
  for (let i = 0; i < count; ++i) {
    // when treatAsTimestamp is true, low is nanoseconds since midnight
    const low = int53_1.default.readInt64LE(cursor.buffer, cursor.offset);
    // when treatAsTimestamp is true, high is Julian day; it is read in both
    // modes so that a truncated buffer still raises a range error
    const high = cursor.buffer.readUInt32LE(cursor.offset + 8);
    if (treatAsTimestamp) {
      // Convert Julian day and nanoseconds to a timestamp
      values.push(convertInt96ToTimestamp(high, low));
    }
    else {
      // The high word only carries the sign extension of the 64-bit value.
      values.push(low);
    }
    cursor.offset += 12;
  }
  return values;
}
/**
 * Build a Date from the two halves of a Parquet INT96 timestamp.
 *
 * INT96 timestamps are laid out as:
 * - first 8 bytes (low): nanoseconds elapsed within the day
 * - last 4 bytes (high): Julian day number
 *
 * Sub-millisecond nanoseconds are discarded by integer division.
 *
 * @param julianDay Julian day number
 * @param nanosSinceMidnight Nanoseconds since midnight (must be an integer)
 * @returns JavaScript Date object (UTC)
 */
function convertInt96ToTimestamp(julianDay, nanosSinceMidnight) {
  // Julian day of 1970-01-01, the Unix epoch.
  const UNIX_EPOCH_JULIAN_DAY = 2440588;
  const MILLIS_PER_DAY = 86400000;
  const NANOS_PER_MILLI = 1000000n;
  // BigInt division truncates, dropping the sub-millisecond remainder.
  const millisIntoDay = Number(BigInt(nanosSinceMidnight) / NANOS_PER_MILLI);
  const dayStartMillis = (julianDay - UNIX_EPOCH_JULIAN_DAY) * MILLIS_PER_DAY;
  return new Date(dayStartMillis + millisIntoDay);
}
/**
 * Encode values as consecutive little-endian 32-bit floats.
 *
 * @param {Array<number>} values - Values to encode.
 * @returns {Buffer} Buffer of 4 * values.length bytes.
 */
function encodeValues_FLOAT(values) {
  const out = Buffer.alloc(values.length * 4);
  let writeAt = 0;
  for (let k = 0; k < values.length; k += 1) {
    // writeFloatLE returns the offset just past the bytes it wrote.
    writeAt = out.writeFloatLE(values[k], writeAt);
  }
  return out;
}
/**
 * Decode `count` little-endian 32-bit floats from the cursor.
 *
 * @param {{buffer: Buffer, offset: number}} cursor - Read position; advanced
 *   by 4 bytes per value.
 * @param {number} count - Number of values to read.
 * @returns {number[]} Decoded values.
 */
function decodeValues_FLOAT(cursor, count) {
  const floats = [];
  for (let remaining = count; remaining > 0; remaining -= 1) {
    const value = cursor.buffer.readFloatLE(cursor.offset);
    floats.push(value);
    cursor.offset += 4;
  }
  return floats;
}
/**
 * Encode values as consecutive little-endian 64-bit floats.
 *
 * @param {Array<number>} values - Values to encode.
 * @returns {Buffer} Buffer of 8 * values.length bytes.
 */
function encodeValues_DOUBLE(values) {
  const out = Buffer.alloc(values.length * 8);
  let writeAt = 0;
  for (let k = 0; k < values.length; k += 1) {
    // writeDoubleLE returns the offset just past the bytes it wrote.
    writeAt = out.writeDoubleLE(values[k], writeAt);
  }
  return out;
}
/**
 * Decode `count` little-endian 64-bit floats from the cursor.
 *
 * @param {{buffer: Buffer, offset: number}} cursor - Read position; advanced
 *   by 8 bytes per value.
 * @param {number} count - Number of values to read.
 * @returns {number[]} Decoded values.
 */
function decodeValues_DOUBLE(cursor, count) {
  const doubles = [];
  for (let remaining = count; remaining > 0; remaining -= 1) {
    const value = cursor.buffer.readDoubleLE(cursor.offset);
    doubles.push(value);
    cursor.offset += 8;
  }
  return doubles;
}
/**
 * Encode values as length-prefixed byte strings.
 *
 * Every value is converted with Buffer.from and emitted as a 4-byte
 * little-endian unsigned length followed by the bytes themselves.
 *
 * @param {Array<string|Buffer|Uint8Array|Array<number>>} values - Values to
 *   encode; anything Buffer.from accepts.
 * @returns {Buffer} Concatenation of all (length, bytes) pairs.
 */
function encodeValues_BYTE_ARRAY(values) {
  const chunks = [];
  for (let i = 0; i < values.length; i++) {
    chunks.push(Buffer.from(values[i]));
  }
  // Each chunk costs its own bytes plus a 4-byte length prefix.
  const totalBytes = chunks.reduce((sum, chunk) => sum + 4 + chunk.length, 0);
  const out = Buffer.alloc(totalBytes);
  let writeAt = 0;
  for (const chunk of chunks) {
    writeAt = out.writeUInt32LE(chunk.length, writeAt);
    chunk.copy(out, writeAt);
    writeAt += chunk.length;
  }
  return out;
}
/**
 * Decode `count` length-prefixed byte strings from the cursor.
 *
 * Each entry is a 4-byte little-endian unsigned length followed by that many
 * bytes. The returned buffers are subarray views onto `cursor.buffer`, not
 * copies.
 *
 * @param {{buffer: Buffer, offset: number}} cursor - Read position; advanced
 *   past every prefix and payload that is read.
 * @param {number} count - Number of entries to read.
 * @returns {Buffer[]} One view per entry.
 */
function decodeValues_BYTE_ARRAY(cursor, count) {
  const slices = [];
  for (let remaining = count; remaining > 0; remaining -= 1) {
    const payloadLength = cursor.buffer.readUInt32LE(cursor.offset);
    const payloadStart = cursor.offset + 4;
    const payloadEnd = payloadStart + payloadLength;
    slices.push(cursor.buffer.subarray(payloadStart, payloadEnd));
    cursor.offset = payloadEnd;
  }
  return slices;
}
/**
 * Encode values as back-to-back byte strings of one fixed length.
 *
 * @param {Array<string|Buffer|Uint8Array|Array<number>>} values - Values to
 *   encode; anything Buffer.from accepts.
 * @param {{typeLength: number}} opts - `typeLength` is the required byte
 *   length of every value.
 * @returns {Buffer} Concatenation of all values, with no length prefixes.
 * @throws {Error} When `opts.typeLength` is missing or falsy, or when a value
 *   does not convert to exactly `typeLength` bytes.
 */
function encodeValues_FIXED_LEN_BYTE_ARRAY(values, opts) {
  if (!opts.typeLength) {
    throw new Error('missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)');
  }
  const chunks = [];
  for (let i = 0; i < values.length; i++) {
    const chunk = Buffer.from(values[i]);
    const hasExpectedLength = chunk.length === opts.typeLength;
    if (!hasExpectedLength) {
      throw new Error('invalid value for FIXED_LEN_BYTE_ARRAY: ' + chunk);
    }
    chunks.push(chunk);
  }
  return Buffer.concat(chunks);
}
/**
 * Decode `count` fixed-length byte strings from the cursor.
 *
 * The length comes from `opts.typeLength`, falling back to
 * `opts.column.typeLength` when the former is null or undefined. The returned
 * buffers are subarray views onto `cursor.buffer`, not copies.
 *
 * @param {{buffer: Buffer, offset: number}} cursor - Read position; advanced
 *   by `typeLength` bytes per value.
 * @param {number} count - Number of values to read.
 * @param {{typeLength?: number, column?: {typeLength?: number}}} opts -
 *   Source of the fixed length.
 * @returns {Buffer[]} One view per value.
 * @throws {Error} When no non-zero type length can be resolved.
 */
function decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts) {
  const width = opts.typeLength ?? opts.column?.typeLength;
  if (!width) {
    throw new Error('missing option: typeLength (required for FIXED_LEN_BYTE_ARRAY)');
  }
  const slices = [];
  for (let remaining = count; remaining > 0; remaining -= 1) {
    const sliceStart = cursor.offset;
    slices.push(cursor.buffer.subarray(sliceStart, sliceStart + width));
    cursor.offset += width;
  }
  return slices;
}
/**
 * Encode `values` with the encoder matching a Parquet primitive type name.
 *
 * `opts` is forwarded only to the INT32, INT64 and FIXED_LEN_BYTE_ARRAY
 * encoders; the other encoders receive just the values.
 *
 * @param {string} type - One of BOOLEAN, INT32, INT64, INT96, FLOAT, DOUBLE,
 *   BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY.
 * @param {Array<*>} values - Values to encode.
 * @param {object} [opts] - Column options.
 * @returns {Buffer} Whatever the selected encoder returns.
 * @throws {Error} When `type` is not one of the supported names.
 */
const encodeValues = function (type, values, opts) {
  // Thunks keep each encoder call lazy: only the selected one is evaluated.
  const encoders = new Map([
    ['BOOLEAN', () => encodeValues_BOOLEAN(values)],
    ['INT32', () => encodeValues_INT32(values, opts)],
    ['INT64', () => encodeValues_INT64(values, opts)],
    ['INT96', () => encodeValues_INT96(values)],
    ['FLOAT', () => encodeValues_FLOAT(values)],
    ['DOUBLE', () => encodeValues_DOUBLE(values)],
    ['BYTE_ARRAY', () => encodeValues_BYTE_ARRAY(values)],
    ['FIXED_LEN_BYTE_ARRAY', () => encodeValues_FIXED_LEN_BYTE_ARRAY(values, opts)],
  ]);
  const encode = encoders.get(type);
  if (encode === undefined) {
    throw new Error('unsupported type: ' + type);
  }
  return encode();
};
exports.encodeValues = encodeValues;
/**
 * Decode `count` values with the decoder matching a Parquet primitive type
 * name.
 *
 * `opts` is forwarded only to the INT32, INT64, INT96 and
 * FIXED_LEN_BYTE_ARRAY decoders; the other decoders receive just the cursor
 * and the count.
 *
 * @param {string} type - One of BOOLEAN, INT32, INT64, INT96, FLOAT, DOUBLE,
 *   BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY.
 * @param {{buffer: Buffer, offset: number}} cursor - Read position.
 * @param {number} count - Number of values to decode.
 * @param {object} [opts] - Column options.
 * @returns {Array<*>} Whatever the selected decoder returns.
 * @throws {Error} When `type` is not one of the supported names.
 */
const decodeValues = function (type, cursor, count, opts) {
  // Thunks keep each decoder call lazy: only the selected one is evaluated.
  const decoders = new Map([
    ['BOOLEAN', () => decodeValues_BOOLEAN(cursor, count)],
    ['INT32', () => decodeValues_INT32(cursor, count, opts)],
    ['INT64', () => decodeValues_INT64(cursor, count, opts)],
    ['INT96', () => decodeValues_INT96(cursor, count, opts)],
    ['FLOAT', () => decodeValues_FLOAT(cursor, count)],
    ['DOUBLE', () => decodeValues_DOUBLE(cursor, count)],
    ['BYTE_ARRAY', () => decodeValues_BYTE_ARRAY(cursor, count)],
    ['FIXED_LEN_BYTE_ARRAY', () => decodeValues_FIXED_LEN_BYTE_ARRAY(cursor, count, opts)],
  ]);
  const decode = decoders.get(type);
  if (decode === undefined) {
    throw new Error('unsupported type: ' + type);
  }
  return decode();
};
exports.decodeValues = decodeValues;