UNPKG

hyparquet-writer

Version:

Parquet file writer for JavaScript

146 lines (132 loc) 4.5 kB
import { unconvert } from './unconvert.js'
import { writePlain } from './plain.js'
import { snappyCompress } from './snappy.js'
import { ByteWriter } from './bytewriter.js'
import { writeDataPageV2, writePageHeader } from './datapage.js'

/**
 * @import {ColumnMetaData, DecodedArray, Encoding, ParquetType, SchemaElement, Statistics} from 'hyparquet'
 * @import {Writer} from '../src/types.js'
 */

/**
 * Write one column chunk to the writer and return its metadata.
 *
 * When useDictionary() yields a dictionary, a dictionary page is written
 * followed by a data page of dictionary indexes (RLE_DICTIONARY). Otherwise
 * the unconverted values are written in a single data page, using RLE for
 * BOOLEAN columns with more than 16 values and PLAIN for everything else.
 *
 * @param {Writer} writer destination; pages are appended at writer.offset
 * @param {SchemaElement[]} schemaPath path of schema elements, the last one being this column
 * @param {DecodedArray} values column values; null/undefined entries are treated as missing
 * @param {boolean} compressed when true, pages are written with the SNAPPY codec
 * @param {boolean} stats when true, min/max/null_count statistics are computed
 * @returns {ColumnMetaData}
 * @throws {Error} if the column's schema element has no type
 */
export function writeColumn(writer, schemaPath, values, compressed, stats) {
  const element = schemaPath[schemaPath.length - 1]
  const { type, type_length } = element
  if (!type) throw new Error(`column ${element.name} cannot determine type`)
  const offsetStart = writer.offset
  const num_values = values.length
  /** @type {Encoding[]} */
  const encodings = []

  // Statistics are computed on the original (not yet unconverted) values
  const statistics = stats ? getStatistics(values) : undefined

  // dictionary encoding
  let dictionary_page_offset
  let data_page_offset = BigInt(writer.offset)
  /** @type {DecodedArray | undefined} */
  const dictionary = useDictionary(values, type)
  if (dictionary) {
    dictionary_page_offset = BigInt(writer.offset)

    // Build the value -> index lookup once. Map uses the same SameValueZero
    // equality as the Set that produced the dictionary, so NaN resolves to its
    // entry (indexOf would return -1), and lookups are O(1) per value.
    /** @type {Map<any, number>} */
    const positions = new Map()
    for (let i = 0; i < dictionary.length; i++) {
      positions.set(dictionary[i], i)
    }

    // replace values with dictionary indices; missing values stay as holes
    const indexes = new Array(values.length)
    for (let i = 0; i < values.length; i++) {
      const value = values[i]
      if (value !== null && value !== undefined) {
        indexes[i] = positions.get(value)
      }
    }

    // write unconverted dictionary page
    const unconverted = unconvert(element, dictionary)
    writeDictionaryPage(writer, unconverted, type, type_length, compressed)

    // write data page with dictionary indexes
    data_page_offset = BigInt(writer.offset)
    writeDataPageV2(writer, indexes, schemaPath, 'RLE_DICTIONARY', compressed)
    encodings.push('RLE_DICTIONARY')
  } else {
    // unconvert values from rich types to simple
    const unconverted = unconvert(element, values)

    // write data page
    const encoding = type === 'BOOLEAN' && unconverted.length > 16 ? 'RLE' : 'PLAIN'
    writeDataPageV2(writer, unconverted, schemaPath, encoding, compressed)
    encodings.push(encoding)
  }

  const bytesWritten = BigInt(writer.offset - offsetStart)
  return {
    type,
    encodings,
    // drops the first schemaPath entry (presumably the schema root -- confirm with caller)
    path_in_schema: schemaPath.slice(1).map(s => s.name),
    codec: compressed ? 'SNAPPY' : 'UNCOMPRESSED',
    num_values: BigInt(num_values),
    total_compressed_size: bytesWritten,
    total_uncompressed_size: bytesWritten, // TODO: currently reports the written (compressed) size
    data_page_offset,
    dictionary_page_offset,
    statistics,
  }
}

/**
 * Decide whether to dictionary-encode a column and, if so, return the
 * dictionary of distinct non-null values in first-seen order.
 *
 * A dictionary is used when there are more than two values per distinct
 * value on average. BOOLEAN columns never use a dictionary.
 *
 * @param {DecodedArray} values
 * @param {ParquetType} type
 * @returns {any[] | undefined} dictionary entries, or undefined to skip dictionary encoding
 */
function useDictionary(values, type) {
  if (type === 'BOOLEAN') return
  const unique = new Set(values)
  unique.delete(undefined)
  unique.delete(null)
  // NOTE(review): for a non-empty column with no non-null values unique.size
  // is 0, the ratio is Infinity, and an empty dictionary is returned. For an
  // empty column the ratio is NaN and no dictionary is used.
  if (values.length / unique.size > 2) {
    // TODO: sort by frequency
    return Array.from(unique)
  }
}

/**
 * Write a PLAIN-encoded dictionary page (header followed by page data).
 *
 * @param {Writer} writer destination for the page header and data
 * @param {DecodedArray} dictionary dictionary entries, already unconverted
 * @param {ParquetType} type
 * @param {number | undefined} fixedLength passed through to writePlain along with type
 * @param {boolean} compressed when true, the page data is snappy-compressed
 */
function writeDictionaryPage(writer, dictionary, type, fixedLength, compressed) {
  const dictionaryPage = new ByteWriter()
  writePlain(dictionaryPage, dictionary, type, fixedLength)

  // compress dictionary page data
  let compressedDictionaryPage = dictionaryPage
  if (compressed) {
    compressedDictionaryPage = new ByteWriter()
    snappyCompress(compressedDictionaryPage, new Uint8Array(dictionaryPage.getBuffer()))
  }

  // write dictionary page header
  writePageHeader(writer, {
    type: 'DICTIONARY_PAGE',
    uncompressed_page_size: dictionaryPage.offset,
    compressed_page_size: compressedDictionaryPage.offset,
    dictionary_page_header: {
      num_values: dictionary.length,
      encoding: 'PLAIN',
    },
  })
  writer.appendBuffer(compressedDictionaryPage.getBuffer())
}

/**
 * Compute min, max and null count over a column's values.
 *
 * null and undefined count as nulls. NaN is excluded from min/max: it never
 * compares less or greater than anything, so if it were accepted as the first
 * candidate it would stick as both min and max for the whole column.
 * min_value and max_value are undefined when no comparable value is present.
 *
 * @param {DecodedArray} values
 * @returns {Statistics}
 */
function getStatistics(values) {
  let min_value = undefined
  let max_value = undefined
  let null_count = 0n
  for (const value of values) {
    if (value === null || value === undefined) {
      null_count++
      continue
    }
    if (Number.isNaN(value)) continue
    if (min_value === undefined || value < min_value) {
      min_value = value
    }
    if (max_value === undefined || value > max_value) {
      max_value = value
    }
  }
  return { min_value, max_value, null_count }
}