icebird
Version:
Apache Iceberg client for javascript
194 lines (179 loc) • 6.6 kB
JavaScript
import { ByteWriter } from 'hyparquet-writer'
/**
* @param {Object} options
* @param {Writer} options.writer
* @param {AvroRecord} options.schema
* @param {Record<string, any>[]} options.records
* @param {number} [options.blockSize]
*/
export function avroWrite({ writer, schema, records, blockSize = 512 }) {
writer.appendUint32(0x016a624f) // Obj\x01
const meta = {
'avro.schema': typeof schema === 'string' ? schema : JSON.stringify(schema),
'avro.codec': 'null',
}
appendZigZag(writer, Object.keys(meta).length)
for (const [key, value] of Object.entries(meta)) {
const kb = new TextEncoder().encode(key)
appendZigZag(writer, kb.length)
writer.appendBytes(kb)
const vb = new TextEncoder().encode(value)
appendZigZag(writer, vb.length)
writer.appendBytes(vb)
}
writer.appendVarInt(0)
const sync = new Uint8Array(16)
for (let i = 0; i < 16; i++) sync[i] = Math.random() * 256 | 0
writer.appendBytes(sync)
for (let i = 0; i < records.length; i += blockSize) {
const block = records.slice(i, i + blockSize)
appendZigZag(writer, block.length) // record count
const blockWriter = new ByteWriter()
for (const record of block) {
for (const { name, type } of schema.fields) {
writeType(blockWriter, type, record[name])
}
}
appendZigZag(writer, blockWriter.offset) // block size
writer.appendBuffer(blockWriter.getBuffer())
writer.appendBytes(sync)
}
if (writer.finish) writer.finish()
}
/**
* @import {Writer} from 'hyparquet-writer/src/types.js'
* @import {AvroRecord, AvroType} from '../../src/types.js'
* @param {Writer} writer
* @param {AvroType} schema
* @param {*} value
*/
function writeType(writer, schema, value) {
if (Array.isArray(schema)) {
// find matching union branch
const unionIndex = schema.findIndex(s => {
if (Array.isArray(s)) throw new Error('nested unions not supported')
// normalise branch to a tag string we can test against
const tag = typeof s === 'string' ? s : 'logicalType' in s ? s.logicalType : s.type
if (value == null) return tag === 'null'
if (tag === 'boolean') return typeof value === 'boolean'
if (tag === 'int') return typeof value === 'number' && Number.isInteger(value)
if (tag === 'long') return typeof value === 'bigint' || typeof value === 'number'
if (tag === 'float' || tag === 'double') return typeof value === 'number'
if (tag === 'string') return typeof value === 'string'
if (tag === 'bytes') return value instanceof Uint8Array
if (tag === 'record') return typeof value === 'object' && value !== null
if (tag === 'array') return Array.isArray(value)
return false
})
if (unionIndex === -1) throw new Error('union branch not found')
appendZigZag(writer, unionIndex)
writeType(writer, schema[unionIndex], value)
} else if (typeof schema === 'string') {
// primitive type
if (schema === 'null') {
// no-op
} else if (schema === 'boolean') {
writer.appendUint8(value ? 1 : 0)
} else if (schema === 'int') {
if (typeof value !== 'number' || !Number.isInteger(value)) {
throw new Error('expected integer value')
}
appendZigZag(writer, value)
} else if (schema === 'long') {
if (typeof value !== 'bigint') throw new Error('expected bigint value')
appendZigZag64(writer, value)
} else if (schema === 'float') {
if (typeof value !== 'number') throw new Error('expected number value')
writer.appendFloat32(value)
} else if (schema === 'double') {
if (typeof value !== 'number') throw new Error('expected number value')
writer.appendFloat64(value)
} else if (schema === 'bytes') {
if (!(value instanceof Uint8Array)) throw new Error('expected Uint8Array value')
appendZigZag(writer, value.length)
writer.appendBytes(value)
} else if (schema === 'string') {
if (typeof value !== 'string') throw new Error('expected string value')
const b = new TextEncoder().encode(value)
appendZigZag(writer, b.length)
writer.appendBytes(b)
}
} else if ('logicalType' in schema) {
if (schema.logicalType === 'date') {
appendZigZag(writer, value instanceof Date ? Math.floor(value.getTime() / 86400000) : value)
} else if (schema.logicalType === 'timestamp-millis') {
appendZigZag64(writer, value instanceof Date ? BigInt(value.getTime()) : BigInt(value))
} else if (schema.logicalType === 'timestamp-micros') {
appendZigZag64(
writer,
value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value)
)
} else if (schema.logicalType === 'decimal') {
const scale = 'scale' in schema ? schema.scale ?? 0 : 0
let u
if (typeof value === 'bigint') {
u = value
} else if (typeof value === 'number') {
u = BigInt(Math.round(value * 10 ** scale))
} else {
throw new Error('decimal value must be bigint or number')
}
const b = bigIntToBytes(u)
writer.appendVarInt(b.length)
writer.appendBytes(b)
} else {
throw new Error(`unknown logical type ${schema.logicalType}`)
}
} else if (schema.type === 'record') {
for (const f of schema.fields) {
writeType(writer, f.type, value[f.name])
}
} else if (schema.type === 'array') {
if (value.length) {
appendZigZag(writer, value.length)
for (const it of value) {
writeType(writer, schema.items, it)
}
}
writer.appendVarInt(0)
} else {
throw new Error(`unknown schema type ${JSON.stringify(schema)}`)
}
}
/**
* @param {Writer} writer
* @param {number} v
*/
function appendZigZag(writer, v) {
writer.appendVarInt(v << 1 ^ v >> 31)
}
/**
* @param {Writer} writer
* @param {bigint} v
*/
function appendZigZag64(writer, v) {
writer.appendVarBigInt(v << 1n ^ v >> 63n)
}
/**
* Convert a signed BigInt into two’s-complement big-endian bytes.
* @param {bigint} value
* @returns {Uint8Array}
*/
function bigIntToBytes(value) {
const neg = value < 0n
let abs = neg ? -value : value
const out = []
while (abs > 0n) { out.unshift(Number(abs & 0xffn)); abs >>= 8n }
if (out.length === 0) out.push(0)
if (neg) {
for (let i = 0; i < out.length; i++) out[i] ^= 0xff
for (let i = out.length - 1; i >= 0; i--) {
out[i] = out[i] + 1 & 0xff
if (out[i]) break
}
if ((out[0] & 0x80) === 0) out.unshift(0xff)
} else if ((out[0] & 0x80) !== 0) {
out.unshift(0)
}
return Uint8Array.from(out)
}