/*
 * hyparquet-writer
 * Version:
 * Parquet file writer for JavaScript
 * 185 lines (171 loc) • 6.98 kB
 * JavaScript
 */
import { Encodings, PageTypes } from 'hyparquet/src/constants.js'
import { ByteWriter } from './bytewriter.js'
import { deltaBinaryPack, deltaByteArray, deltaLengthByteArray } from './delta.js'
import { writeRleBitPackedHybrid } from './encoding.js'
import { writePlain } from './plain.js'
import { getMaxRepetitionLevel } from './schema.js'
import { writeByteStreamSplit } from './splitstream.js'
import { serializeTCompactProtocol } from './thrift.js'
/**
 * Encode one page of column values and append it to the writer as a
 * DATA_PAGE_V2: page header first, then the level bytes, then the value bytes.
 *
 * Levels and values are staged in temporary buffers so their byte sizes are
 * known before the header is written. Only the value bytes are handed to the
 * compressor; the level bytes are appended exactly as written.
 *
 * If no compressor is registered for the column codec (or it returns a
 * nullish result) the values are written uncompressed and the header's
 * is_compressed flag is set to false, so the flag always describes the bytes
 * that were actually written.
 *
 * @param {Object} options
 * @param {Writer} options.writer
 * @param {ColumnEncoder} options.column
 * @param {Encoding} options.encoding
 * @param {PageData} options.pageData
 * @throws {Error} if the column has no type, is REPEATED, or the encoding is
 *   unsupported for the column type
 */
export function writeDataPageV2({ writer, column, encoding, pageData }) {
  const { columnName, element, codec, compressors } = column
  const { type, type_length, repetition_type } = element
  if (!type) throw new Error(`column ${columnName} cannot determine type`)
  if (repetition_type === 'REPEATED') throw new Error(`column ${columnName} repeated types not supported`)

  // write levels to temp buffer
  const levelWriter = new ByteWriter()
  const {
    definition_levels_byte_length,
    repetition_levels_byte_length,
    num_nulls,
    num_values,
    num_rows,
  } = writeLevels(levelWriter, column, pageData)

  // TODO: skip nulls while writing instead of filtering
  // Only filter when the levels reported nulls, to avoid copying the array otherwise.
  const nonnull = num_nulls ? pageData.values.filter(v => v !== null && v !== undefined) : pageData.values

  // write page data to temp buffer
  const page = new ByteWriter()
  if (encoding === 'PLAIN') {
    writePlain(page, nonnull, type, type_length)
  } else if (encoding === 'RLE') {
    if (type !== 'BOOLEAN') throw new Error('RLE encoding only supported for BOOLEAN type')
    // Encode into a scratch buffer first because the byte length must precede the data.
    const rleData = new ByteWriter()
    writeRleBitPackedHybrid(rleData, nonnull, 1)
    page.appendUint32(rleData.offset) // prepend byte length
    page.appendBytes(rleData.getBytes())
  } else if (encoding === 'PLAIN_DICTIONARY' || encoding === 'RLE_DICTIONARY') {
    // Values are compared numerically here (presumably dictionary indices —
    // confirm against caller); the largest one determines the bit width.
    let maxValue = 0
    for (const v of nonnull) if (v > maxValue) maxValue = v
    const bitWidth = Math.ceil(Math.log2(maxValue + 1))
    page.appendUint8(bitWidth) // prepend bitWidth
    writeRleBitPackedHybrid(page, nonnull, bitWidth)
  } else if (encoding === 'DELTA_BINARY_PACKED') {
    if (type !== 'INT32' && type !== 'INT64') {
      throw new Error('DELTA_BINARY_PACKED encoding only supported for INT32 and INT64 types')
    }
    deltaBinaryPack(page, nonnull)
  } else if (encoding === 'DELTA_LENGTH_BYTE_ARRAY') {
    if (type !== 'BYTE_ARRAY') {
      throw new Error('DELTA_LENGTH_BYTE_ARRAY encoding only supported for BYTE_ARRAY type')
    }
    deltaLengthByteArray(page, nonnull)
  } else if (encoding === 'DELTA_BYTE_ARRAY') {
    if (type !== 'BYTE_ARRAY') {
      throw new Error('DELTA_BYTE_ARRAY encoding only supported for BYTE_ARRAY type')
    }
    deltaByteArray(page, nonnull)
  } else if (encoding === 'BYTE_STREAM_SPLIT') {
    writeByteStreamSplit(page, nonnull, type, type_length)
  } else {
    throw new Error(`parquet unsupported encoding: ${encoding}`)
  }

  // compress page data
  const pageBytes = page.getBytes()
  const compressed = compressors[codec]?.(pageBytes) ?? undefined
  // The flag must reflect what is written, not what was requested: when no
  // compressor ran, the raw bytes are emitted and the page is marked as
  // uncompressed so readers do not try to decompress them.
  const is_compressed = compressed !== undefined
  const dataBytes = is_compressed ? compressed : pageBytes

  // write page header
  writePageHeader(writer, {
    type: 'DATA_PAGE_V2',
    // both sizes include the level bytes, which are never compressed
    uncompressed_page_size: levelWriter.offset + page.offset,
    compressed_page_size: levelWriter.offset + dataBytes.length,
    data_page_header_v2: {
      num_values,
      num_nulls,
      num_rows,
      encoding,
      definition_levels_byte_length,
      repetition_levels_byte_length,
      is_compressed,
      // is there benefit to page statistics here?
    },
  })

  // write levels
  writer.appendBytes(levelWriter.getBytes())

  // write page data
  writer.appendBytes(dataBytes)
}
/**
 * Serialize a page header to the writer using serializeTCompactProtocol.
 *
 * Page type and encoding names are translated to their positions in the
 * PageTypes / Encodings tables. Only the sub-header objects present on
 * `header` are emitted; absent ones are passed through as undefined.
 *
 * @param {Writer} writer
 * @param {PageHeader} header
 * @throws {Error} if the page type or an encoding name is not found in its
 *   table (previously this silently serialized -1 as the enum value)
 */
export function writePageHeader(writer, header) {
  /**
   * Position of an encoding name in Encodings, rejecting unknown names.
   *
   * @param {Encoding} encoding
   * @returns {number}
   */
  const encodingId = encoding => {
    const id = Encodings.indexOf(encoding)
    if (id < 0) throw new Error(`parquet unsupported encoding: ${encoding}`)
    return id
  }

  const pageTypeId = PageTypes.indexOf(header.type)
  if (pageTypeId < 0) throw new Error(`parquet unsupported page type: ${header.type}`)

  /** @type {ThriftObject} */
  const compact = {
    field_1: pageTypeId,
    field_2: header.uncompressed_page_size,
    field_3: header.compressed_page_size,
    field_4: header.crc,
    field_5: header.data_page_header && {
      field_1: header.data_page_header.num_values,
      field_2: encodingId(header.data_page_header.encoding),
      field_3: encodingId(header.data_page_header.definition_level_encoding),
      field_4: encodingId(header.data_page_header.repetition_level_encoding),
      // field_5: header.data_page_header.statistics,
    },
    field_7: header.dictionary_page_header && {
      field_1: header.dictionary_page_header.num_values,
      field_2: encodingId(header.dictionary_page_header.encoding),
    },
    field_8: header.data_page_header_v2 && {
      field_1: header.data_page_header_v2.num_values,
      field_2: header.data_page_header_v2.num_nulls,
      field_3: header.data_page_header_v2.num_rows,
      field_4: encodingId(header.data_page_header_v2.encoding),
      field_5: header.data_page_header_v2.definition_levels_byte_length,
      field_6: header.data_page_header_v2.repetition_levels_byte_length,
      // is_compressed defaults to true, so only an explicit false is written
      field_7: header.data_page_header_v2.is_compressed ? undefined : false, // default true
    },
  }
  serializeTCompactProtocol(writer, compact)
}
/**
 * Write the repetition levels followed by the definition levels of a page,
 * and derive the page counters from those levels.
 *
 * Counting rules, as implemented below:
 * - num_values is the number of definition levels, or the number of values
 *   when there are no definition levels.
 * - num_rows is the number of repetition levels equal to 0, or the number of
 *   values when there are no repetition levels.
 * - num_nulls is the number of definition levels below maxDefinitionLevel.
 *
 * A level stream is only written when its maximum level is non-zero; the
 * reported byte length is whatever writeRleBitPackedHybrid returns.
 *
 * @import {DecodedArray, Encoding, PageHeader} from 'hyparquet'
 * @import {ColumnEncoder, PageData, ThriftObject, Writer} from '../src/types.js'
 * @param {Writer} writer
 * @param {ColumnEncoder} column
 * @param {PageData} dataPage
 * @returns {{
 *   definition_levels_byte_length: number
 *   repetition_levels_byte_length: number
 *   num_values: number
 *   num_nulls: number
 *   num_rows: number
 * }}
 */
function writeLevels(writer, column, dataPage) {
  const { values, definitionLevels, repetitionLevels, maxDefinitionLevel } = dataPage

  /**
   * Count the entries of a level array that satisfy a predicate.
   *
   * @param {ArrayLike<any>} levels
   * @param {(level: any) => boolean} matches
   * @returns {number}
   */
  const countLevels = (levels, matches) => {
    let count = 0
    for (let idx = 0; idx < levels.length; idx += 1) {
      if (matches(levels[idx])) count += 1
    }
    return count
  }

  /**
   * Bits needed to store any level in the range 0..maxLevel.
   *
   * @param {number} maxLevel
   * @returns {number}
   */
  const bitWidthFor = maxLevel => Math.ceil(Math.log2(maxLevel + 1))

  const num_values = definitionLevels.length || values.length

  // Without repetition levels every value is counted as its own row.
  const num_rows = repetitionLevels.length
    ? countLevels(repetitionLevels, level => level === 0)
    : values.length

  // A definition level below the maximum is counted as a null.
  const num_nulls = countLevels(definitionLevels, level => level < maxDefinitionLevel)

  // Repetition levels are written first, then definition levels.
  const maxRepetitionLevel = getMaxRepetitionLevel(column.schemaPath)
  const repetition_levels_byte_length = maxRepetitionLevel
    ? writeRleBitPackedHybrid(writer, repetitionLevels, bitWidthFor(maxRepetitionLevel))
    : 0

  const definition_levels_byte_length = maxDefinitionLevel
    ? writeRleBitPackedHybrid(writer, definitionLevels, bitWidthFor(maxDefinitionLevel))
    : 0

  return { definition_levels_byte_length, repetition_levels_byte_length, num_values, num_nulls, num_rows }
}