UNPKG

@dsnp/parquetjs

Version:

fully asynchronous, pure JavaScript implementation of the Parquet file format

569 lines (568 loc) 15.7 kB
'use strict';
Object.defineProperty(exports, "__esModule", { value: true });
exports.getParquetTypeDataObject = getParquetTypeDataObject;
exports.toPrimitive = toPrimitive;
exports.fromPrimitive = fromPrimitive;
// BSON uses top level awaits, so use require for now
// eslint-disable-next-line @typescript-eslint/no-require-imports
const bsonSerialize = require('bson').serialize;
// eslint-disable-next-line @typescript-eslint/no-require-imports
const bsonDeserialize = require('bson').deserialize;

/**
 * Look up the descriptor (primitive type, original type and converters) for a
 * Parquet type name.
 *
 * - 'DECIMAL' is resolved from the field: a defined `typeLength` selects
 *   FIXED_LEN_BYTE_ARRAY, a `precision` above 18 selects BYTE_ARRAY, anything
 *   else falls back to INT64.
 * - Any other type whose field carries `logicalType.TIME` is resolved from the
 *   TIME unit (MILLIS -> INT32, MICROS / NANOS -> INT64).
 * - Everything else is a plain lookup in PARQUET_LOGICAL_TYPE_DATA.
 *
 * @param type the Parquet type name
 * @param field optional field definition (typeLength, precision, logicalType)
 * @returns the descriptor object, or undefined when the name is not in the table
 * @throws {Error} when a TIME logical type has none of MILLIS, MICROS, NANOS set
 */
function getParquetTypeDataObject(type, field) {
  if (type === 'DECIMAL') {
    if (field?.typeLength !== undefined) {
      return {
        primitiveType: 'FIXED_LEN_BYTE_ARRAY',
        originalType: 'DECIMAL',
        typeLength: field.typeLength,
        toPrimitive: toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL,
      };
    }
    if (field?.precision && field.precision > 18) {
      return {
        primitiveType: 'BYTE_ARRAY',
        originalType: 'DECIMAL',
        toPrimitive: toPrimitive_BYTE_ARRAY_DECIMAL,
      };
    }
    return {
      primitiveType: 'INT64',
      originalType: 'DECIMAL',
      toPrimitive: toPrimitive_INT64,
    };
  }

  if (field?.logicalType?.TIME) {
    const unit = field.logicalType.TIME.unit;
    if (unit.MILLIS) {
      return {
        originalType: 'TIME_MILLIS',
        primitiveType: 'INT32',
        toPrimitive: toPrimitive_TIME,
      };
    }
    if (unit.MICROS) {
      return {
        originalType: 'TIME_MICROS',
        primitiveType: 'INT64',
        toPrimitive: toPrimitive_TIME,
      };
    }
    if (unit.NANOS) {
      // No originalType is set for nanosecond precision.
      return {
        primitiveType: 'INT64',
        toPrimitive: toPrimitive_TIME,
      };
    }
    throw new Error('TIME type must have a valid unit (MILLIS, MICROS, NANOS).');
  }

  return PARQUET_LOGICAL_TYPE_DATA[type];
}

/** Every type name accepted by toPrimitive / fromPrimitive. */
const PARQUET_LOGICAL_TYPES = new Set([
  'BOOLEAN',
  'INT32',
  'INT64',
  'INT96',
  'FLOAT',
  'DOUBLE',
  'BYTE_ARRAY',
  'FIXED_LEN_BYTE_ARRAY',
  'UTF8',
  'ENUM',
  'TIME_MILLIS',
  'TIME_MICROS',
  'DATE',
  'TIMESTAMP_MILLIS',
  'TIMESTAMP_MICROS',
  'UINT_8',
  'UINT_16',
  'UINT_32',
  'UINT_64',
  'INT_8',
  'INT_16',
  'INT_32',
  'INT_64',
  'DECIMAL',
  'JSON',
  'BSON',
  'INTERVAL',
  'MAP',
  'LIST',
]);

/**
 * Descriptor table keyed by type name. 'DECIMAL' is intentionally absent: it
 * is resolved per field inside getParquetTypeDataObject. The converter
 * functions referenced here are function declarations further down the file
 * and are therefore hoisted.
 */
const PARQUET_LOGICAL_TYPE_DATA = {
  BOOLEAN: {
    primitiveType: 'BOOLEAN',
    toPrimitive: toPrimitive_BOOLEAN,
    fromPrimitive: fromPrimitive_BOOLEAN,
  },
  INT32: {
    primitiveType: 'INT32',
    toPrimitive: toPrimitive_INT32,
  },
  INT64: {
    primitiveType: 'INT64',
    toPrimitive: toPrimitive_INT64,
  },
  INT96: {
    primitiveType: 'INT96',
    toPrimitive: toPrimitive_INT96,
  },
  FLOAT: {
    primitiveType: 'FLOAT',
    toPrimitive: toPrimitive_FLOAT,
  },
  DOUBLE: {
    primitiveType: 'DOUBLE',
    toPrimitive: toPrimitive_DOUBLE,
  },
  BYTE_ARRAY: {
    primitiveType: 'BYTE_ARRAY',
    toPrimitive: toPrimitive_BYTE_ARRAY,
  },
  FIXED_LEN_BYTE_ARRAY: {
    primitiveType: 'FIXED_LEN_BYTE_ARRAY',
    toPrimitive: toPrimitive_BYTE_ARRAY,
  },
  UTF8: {
    primitiveType: 'BYTE_ARRAY',
    originalType: 'UTF8',
    toPrimitive: toPrimitive_UTF8,
    fromPrimitive: fromPrimitive_UTF8,
  },
  ENUM: {
    primitiveType: 'BYTE_ARRAY',
    originalType: 'UTF8',
    toPrimitive: toPrimitive_UTF8,
    fromPrimitive: fromPrimitive_UTF8,
  },
  TIME_MILLIS: {
    primitiveType: 'INT32',
    originalType: 'TIME_MILLIS',
    toPrimitive: toPrimitive_TIME_MILLIS,
  },
  TIME_MICROS: {
    primitiveType: 'INT64',
    originalType: 'TIME_MICROS',
    toPrimitive: toPrimitive_TIME_MICROS,
  },
  DATE: {
    primitiveType: 'INT32',
    originalType: 'DATE',
    toPrimitive: toPrimitive_DATE,
    fromPrimitive: fromPrimitive_DATE,
  },
  TIMESTAMP_MILLIS: {
    primitiveType: 'INT64',
    originalType: 'TIMESTAMP_MILLIS',
    toPrimitive: toPrimitive_TIMESTAMP_MILLIS,
    fromPrimitive: fromPrimitive_TIMESTAMP_MILLIS,
  },
  TIMESTAMP_MICROS: {
    primitiveType: 'INT64',
    originalType: 'TIMESTAMP_MICROS',
    toPrimitive: toPrimitive_TIMESTAMP_MICROS,
    fromPrimitive: fromPrimitive_TIMESTAMP_MICROS,
  },
  UINT_8: {
    primitiveType: 'INT32',
    originalType: 'UINT_8',
    toPrimitive: toPrimitive_UINT8,
  },
  UINT_16: {
    primitiveType: 'INT32',
    originalType: 'UINT_16',
    toPrimitive: toPrimitive_UINT16,
  },
  UINT_32: {
    primitiveType: 'INT32',
    originalType: 'UINT_32',
    toPrimitive: toPrimitive_UINT32,
  },
  UINT_64: {
    primitiveType: 'INT64',
    originalType: 'UINT_64',
    toPrimitive: toPrimitive_UINT64,
  },
  INT_8: {
    primitiveType: 'INT32',
    originalType: 'INT_8',
    toPrimitive: toPrimitive_INT8,
  },
  INT_16: {
    primitiveType: 'INT32',
    originalType: 'INT_16',
    toPrimitive: toPrimitive_INT16,
  },
  INT_32: {
    primitiveType: 'INT32',
    originalType: 'INT_32',
    toPrimitive: toPrimitive_INT32,
  },
  INT_64: {
    primitiveType: 'INT64',
    originalType: 'INT_64',
    toPrimitive: toPrimitive_INT64,
  },
  JSON: {
    primitiveType: 'BYTE_ARRAY',
    originalType: 'JSON',
    toPrimitive: toPrimitive_JSON,
    fromPrimitive: fromPrimitive_JSON,
  },
  BSON: {
    primitiveType: 'BYTE_ARRAY',
    originalType: 'BSON',
    toPrimitive: toPrimitive_BSON,
    fromPrimitive: fromPrimitive_BSON,
  },
  INTERVAL: {
    primitiveType: 'FIXED_LEN_BYTE_ARRAY',
    originalType: 'INTERVAL',
    typeLength: 12,
    toPrimitive: toPrimitive_INTERVAL,
    fromPrimitive: fromPrimitive_INTERVAL,
  },
  MAP: {
    originalType: 'MAP',
    toPrimitive: toPrimitive_MAP,
  },
  LIST: {
    originalType: 'LIST',
    toPrimitive: toPrimitive_LIST,
  },
};

/**
 * Test if something is a valid Parquet Type
 * @param type the string of the type
 * @returns if type is a valid Parquet Type
 */
function isParquetType(type) {
  return type !== undefined && PARQUET_LOGICAL_TYPES.has(type);
}

/**
 * Convert a value from its native representation to the internal/underlying
 * primitive type.
 *
 * @param type the Parquet type name
 * @param value the native value to convert
 * @param field optional field definition, forwarded to getParquetTypeDataObject
 * @returns the primitive representation produced by the type's converter
 * @throws a string message when `type` is not a known Parquet type, or
 *   whatever the selected converter throws for an invalid value
 */
function toPrimitive(type, value, field) {
  if (!isParquetType(type)) {
    // The message reads "invalid type: undefined" when no type was supplied.
    throw `invalid type: ${type}`;
  }
  return getParquetTypeDataObject(type, field).toPrimitive(value);
}

/**
 * Convert a value from its internal/underlying primitive representation to
 * the native representation. Types without a dedicated `fromPrimitive`
 * converter return the value unchanged.
 *
 * @param type the Parquet type name
 * @param value the primitive value to convert
 * @param field optional field definition, forwarded to getParquetTypeDataObject
 * @returns the native representation
 * @throws a string message when `type` is not a known Parquet type
 */
function fromPrimitive(type, value, field) {
  if (!isParquetType(type)) {
    throw `invalid type: ${type}`;
  }
  const typeFromPrimitive = getParquetTypeDataObject(type, field).fromPrimitive;
  if (typeFromPrimitive === undefined) {
    return value;
  }
  return typeFromPrimitive(value);
}

/** Coerce any value to a boolean. */
function toPrimitive_BOOLEAN(value) {
  return !!value;
}

/** Coerce any value to a boolean. */
function fromPrimitive_BOOLEAN(value) {
  return !!value;
}

/**
 * Accept a number as-is or parse a string with parseFloat.
 * @throws a string message for any other input type
 */
function toPrimitive_FLOAT(value) {
  if (typeof value === 'string') {
    return parseFloat(value);
  }
  if (typeof value === 'number') {
    return value;
  }
  throw `invalid value for FLOAT: ${value}`;
}

/**
 * Accept a number as-is or parse a string with parseFloat.
 * @throws a string message for any other input type
 */
function toPrimitive_DOUBLE(value) {
  if (typeof value === 'string') {
    return parseFloat(value);
  }
  if (typeof value === 'number') {
    return value;
  }
  throw `invalid value for DOUBLE: ${value}`;
}

/**
 * Shared implementation of the integer converters: strings are parsed with
 * BigInt, every other input is passed through, and the result must lie in
 * [lowerRange, upperRange].
 *
 * @param typeName name used in the error message
 * @param lowerRange inclusive lower bound (number or bigint)
 * @param upperRange inclusive upper bound (number or bigint)
 * @param value the value to convert
 * @returns the value (a bigint when the input was a string)
 * @throws a string message when parsing fails or the value is out of range
 */
function toIntegerInRange(typeName, lowerRange, upperRange, value) {
  try {
    const v = typeof value === 'string' ? BigInt(value) : value;
    checkValidValue(lowerRange, upperRange, v);
    return v;
  } catch {
    // Any failure (BigInt parse error or range violation) maps to one message.
    throw `invalid value for ${typeName}: ${value}`;
  }
}

function toPrimitive_INT8(value) {
  return toIntegerInRange('INT8', -0x80, 0x7f, value);
}

function toPrimitive_UINT8(value) {
  return toIntegerInRange('UINT8', 0, 0xff, value);
}

function toPrimitive_INT16(value) {
  return toIntegerInRange('INT16', -0x8000, 0x7fff, value);
}

function toPrimitive_UINT16(value) {
  return toIntegerInRange('UINT16', 0, 0xffff, value);
}

function toPrimitive_INT32(value) {
  return toIntegerInRange('INT32', -0x80000000, 0x7fffffff, value);
}

function toPrimitive_UINT32(value) {
  // Upper bound is 2^32 - 1 (previously a 48-bit constant was used by mistake).
  return toIntegerInRange('UINT32', 0, 0xffffffff, value);
}

const MIN_64 = BigInt('0x8000000000000000') * -1n;
const MAX_64 = BigInt('0x7fffffffffffffff');

function toPrimitive_INT64(value) {
  return toIntegerInRange('INT64', MIN_64, MAX_64, value);
}

const MAX_U64 = BigInt('0xffffffffffffffff');

function toPrimitive_UINT64(value) {
  return toIntegerInRange('UINT64', 0, MAX_U64, value);
}

const MIN_96 = BigInt('0x800000000000000000000000') * -1n;
const MAX_96 = BigInt('0x7fffffffffffffffffffffff');

function toPrimitive_INT96(value) {
  return toIntegerInRange('INT96', MIN_96, MAX_96, value);
}

/** Copy the value into a new Buffer. */
function toPrimitive_FIXED_LEN_BYTE_ARRAY_DECIMAL(value) {
  return Buffer.from(value);
}

/** Copy the value into a new Buffer. */
function toPrimitive_BYTE_ARRAY_DECIMAL(value) {
  return Buffer.from(value);
}

/** MAP values are passed through unchanged. */
function toPrimitive_MAP(value) {
  return value;
}

/** LIST values are passed through unchanged. */
function toPrimitive_LIST(value) {
  return value;
}

/** Copy the value into a new Buffer. */
function toPrimitive_BYTE_ARRAY(value) {
  return Buffer.from(value);
}

/** Encode a string as a UTF-8 Buffer. */
function toPrimitive_UTF8(value) {
  return Buffer.from(value, 'utf8');
}

/** Stringify the value via toString(); null and undefined pass through. */
function fromPrimitive_UTF8(value) {
  return value !== undefined && value !== null ? value.toString() : value;
}

/** Serialize the value with JSON.stringify and wrap the text in a Buffer. */
function toPrimitive_JSON(value) {
  return Buffer.from(JSON.stringify(value));
}

/** Parse the value with JSON.parse (non-string input is coerced to a string). */
function fromPrimitive_JSON(value) {
  return JSON.parse(value);
}

/** Serialize the value with bson and wrap the bytes in a Buffer. */
function toPrimitive_BSON(value) {
  return Buffer.from(bsonSerialize(value));
}

/** Deserialize a BSON document. */
function fromPrimitive_BSON(value) {
  return bsonDeserialize(value);
}

/**
 * Convert a string (base-10) or number into a non-negative number below
 * Number.MAX_SAFE_INTEGER.
 *
 * @param typeName name used in error messages
 * @param value string or number to convert
 * @returns the numeric value
 * @throws a string message for other input types, unparsable strings / NaN,
 *   and out-of-bounds values
 */
function toNumberInternal(typeName, value) {
  let numberValue = 0;
  switch (typeof value) {
    case 'string':
      numberValue = parseInt(value, 10);
      break;
    case 'number':
      numberValue = value;
      break;
    default:
      throw `${typeName} has an invalid type: ${typeof value}`;
  }
  // NaN compares false against every bound, so it must be rejected explicitly;
  // otherwise an unparsable string would be returned as NaN.
  if (Number.isNaN(numberValue)) {
    throw `${typeName} has an invalid value: ${value}`;
  }
  // Year 2255 bug. Should eventually switch to bigint
  if (numberValue < 0 || numberValue >= Number.MAX_SAFE_INTEGER) {
    throw `${typeName} value is out of bounds: ${numberValue}`;
  }
  return numberValue;
}

function toPrimitive_TIME_MILLIS(value) {
  return toNumberInternal('TIME_MILLIS', value);
}

/**
 * Convert to a non-negative bigint.
 * @throws a string message for negative values; BigInt() itself throws for
 *   input it cannot convert
 */
function toPrimitive_TIME_MICROS(value) {
  const v = BigInt(value);
  if (v < 0n) {
    throw `TIME_MICROS value is out of bounds: ${value}`;
  }
  return v;
}

const kMillisPerDay = 86400000;

/**
 * Convert a Date (epoch milliseconds divided by the milliseconds per day) or a
 * string/number day count.
 * NOTE(review): a Date that is not at UTC midnight yields a fractional day
 * count here — confirm whether callers expect it to be floored.
 */
function toPrimitive_DATE(value) {
  /* convert from date */
  if (value instanceof Date) {
    return value.getTime() / kMillisPerDay;
  }
  return toNumberInternal('DATE', value);
}

/** Build a Date from a day count. */
function fromPrimitive_DATE(value) {
  return new Date(+value * kMillisPerDay);
}

/** Convert a Date or a string/number into epoch milliseconds. */
function toPrimitive_TIMESTAMP_MILLIS(value) {
  /* convert from date */
  if (value instanceof Date) {
    return value.getTime();
  }
  return toNumberInternal('TIMESTAMP_MILLIS', value);
}

/** Build a Date from epoch milliseconds. */
function fromPrimitive_TIMESTAMP_MILLIS(value) {
  return new Date(Number(value));
}

/**
 * Convert a Date (milliseconds * 1000) or an integer-like value into a
 * non-negative bigint of microseconds.
 * @throws a string message when the value is negative or not convertible
 */
function toPrimitive_TIMESTAMP_MICROS(value) {
  /* convert from date */
  if (value instanceof Date) {
    return BigInt(value.getTime()) * 1000n;
  }
  /* convert from integer */
  try {
    // Will throw if NaN
    const v = BigInt(value);
    if (v < 0n) {
      throw 'out of bounds';
    }
    return v;
  } catch (_e) {
    throw `TIMESTAMP_MICROS value is out of bounds: ${value}`;
  }
}

/** Build a Date from microseconds (bigint or number). */
function fromPrimitive_TIMESTAMP_MICROS(value) {
  if (typeof value === 'bigint') {
    return new Date(Number(value / 1000n));
  }
  return new Date(value / 1000);
}

/**
 * Pack an interval into 12 bytes: months, days and milliseconds as three
 * little-endian unsigned 32-bit integers.
 *
 * @param value object of the form { months, days, milliseconds }
 * @returns a 12-byte Buffer
 * @throws a string message when the value or any component is missing
 */
function toPrimitive_INTERVAL(value) {
  // Check for presence rather than truthiness so that zero components
  // (e.g. { months: 0, days: 1, milliseconds: 0 }) are accepted.
  if (value == null || value.months == null || value.days == null || value.milliseconds == null) {
    throw 'value for INTERVAL must be object { months: ..., days: ..., milliseconds: ... }';
  }
  const buf = Buffer.alloc(12);
  buf.writeUInt32LE(value.months, 0);
  buf.writeUInt32LE(value.days, 4);
  buf.writeUInt32LE(value.milliseconds, 8);
  return buf;
}

/** Unpack the 12-byte interval layout written by toPrimitive_INTERVAL. */
function fromPrimitive_INTERVAL(value) {
  const buf = Buffer.from(value);
  const months = buf.readUInt32LE(0);
  const days = buf.readUInt32LE(4);
  const millis = buf.readUInt32LE(8);
  return { months: months, days: days, milliseconds: millis };
}

/**
 * Throw when v lies outside the inclusive range [lowerRange, upperRange].
 * Numbers and bigints may be mixed freely in the comparison.
 */
function checkValidValue(lowerRange, upperRange, v) {
  if (v < lowerRange || v > upperRange) {
    throw 'invalid value';
  }
}

/**
 * Convert a TIME logical value.
 *
 * @param time object of the form { value, unit, isAdjustedToUTC } where unit
 *   is 'MILLIS', 'MICROS' or 'NANOS'
 * @returns a number for MILLIS, a bigint otherwise; when the value is not
 *   adjusted to UTC the local timezone offset is subtracted
 * @throws {Error} for an unsupported unit when not adjusted to UTC
 */
function toPrimitive_TIME(time) {
  const { value, unit, isAdjustedToUTC } = time;
  const timeValue = BigInt(value);
  if (isAdjustedToUTC) {
    return unit === 'MILLIS' ? Number(timeValue) : timeValue;
  }
  switch (unit) {
    case 'MILLIS':
      return Number(adjustToLocalTimestamp(timeValue, { MILLIS: true }));
    case 'MICROS':
      return adjustToLocalTimestamp(timeValue, { MICROS: true });
    case 'NANOS':
      return adjustToLocalTimestamp(timeValue, { NANOS: true });
    default:
      throw new Error(`Unsupported time unit: ${unit}`);
  }
}

/**
 * Subtract the current local timezone offset from a timestamp.
 *
 * @param timestamp bigint timestamp in the given unit
 * @param unit object with one of MILLIS, MICROS or NANOS set
 * @returns the adjusted bigint timestamp
 * @throws {Error} when no supported unit flag is set
 */
function adjustToLocalTimestamp(timestamp, unit) {
  const localOffset = BigInt(new Date().getTimezoneOffset()) * 60n * 1000n; // Offset in milliseconds
  if (unit.MILLIS) {
    return timestamp - localOffset;
  }
  if (unit.MICROS) {
    return timestamp - localOffset * 1000n;
  }
  if (unit.NANOS) {
    return timestamp - localOffset * 1000000n;
  }
  throw new Error('Unsupported time unit');
}