/**
 * icebird — Apache Iceberg client for JavaScript.
 */
import { asyncBufferFromUrl, cachedAsyncBuffer, parquetReadObjects } from 'hyparquet'
import { compressors } from 'hyparquet-compressors'
import { avroRead } from './avro/avro.read.js'
import { avroMetadata } from './avro/avro.metadata.js'
/**
 * Rewrites an s3:// or s3a:// URL into the equivalent virtual-hosted-style
 * HTTPS URL (https://<bucket>.s3.amazonaws.com/<key>). Any other URL is
 * returned unchanged.
 *
 * @param {string} url
 * @returns {string}
 */
export function translateS3Url(url) {
  const scheme = /^s3a?:\/\//.exec(url)
  if (!scheme) return url
  const bucketAndKey = url.slice(scheme[0].length)
  const firstSlash = bucketAndKey.indexOf('/')
  if (firstSlash < 0) {
    throw new Error('Invalid S3 URL, missing "/" after bucket')
  }
  const bucket = bucketAndKey.slice(0, firstSlash)
  const key = bucketAndKey.slice(firstSlash) // keeps the leading '/'
  return `https://${bucket}.s3.amazonaws.com${key}`
}
/**
 * Reads delete files from delete manifests.
 * Position deletes are grouped by target data file.
 * Equality deletes are grouped by sequence number.
 *
 * @import {ManifestEntry} from '../src/types.js'
 * @param {ManifestEntry[]} deleteEntries
 * @param {RequestInit} [requestInit]
 * @returns {Promise<{positionDeletesMap: Map<string, Set<bigint>>, equalityDeletesMap: Map<bigint, Record<string, any>[]>}>}
 */
export async function fetchDeleteMaps(deleteEntries, requestInit) {
  /** @type {Map<string, Set<bigint>>} */
  const positionDeletesMap = new Map()
  /** @type {Map<bigint, Record<string, any>[]>} */
  const equalityDeletesMap = new Map()

  /**
   * Fetch one delete file as parquet and fold its rows into the maps.
   * @param {ManifestEntry} entry
   */
  async function collectDeletes(entry) {
    const { content, file_path, file_size_in_bytes } = entry.data_file
    const asyncBuffer = await asyncBufferFromUrl({
      url: translateS3Url(file_path),
      byteLength: Number(file_size_in_bytes),
      requestInit,
    })
    const file = cachedAsyncBuffer(asyncBuffer)
    const rows = await parquetReadObjects({ file, compressors })
    for (const row of rows) {
      if (content === 1) { // position delete
        // each row names a target data file and a row position within it
        const { file_path: targetPath, pos } = row
        if (!targetPath) throw new Error('position delete missing target file path')
        if (pos === undefined) throw new Error('position delete missing pos')
        // note: pos is relative to the data file's row order
        let positions = positionDeletesMap.get(targetPath)
        if (!positions) {
          positions = new Set()
          positionDeletesMap.set(targetPath, positions)
        }
        positions.add(pos)
      } else if (content === 2) { // equality delete
        // save the entire delete row (restrict this to equalityIds?)
        const { sequence_number } = entry
        if (sequence_number === undefined) {
          throw new Error('equality delete missing sequence number')
        }
        let rowsForSequence = equalityDeletesMap.get(sequence_number)
        if (!rowsForSequence) {
          rowsForSequence = []
          equalityDeletesMap.set(sequence_number, rowsForSequence)
        }
        rowsForSequence.push(row)
      }
    }
  }

  // Fetch and process all delete files in parallel
  await Promise.all(deleteEntries.map(collectDeletes))
  return { positionDeletesMap, equalityDeletesMap }
}
/**
 * Decodes Avro records from a url.
 *
 * @param {string} manifestUrl - The URL of the manifest file
 * @param {RequestInit} [requestInit] - Optional fetch request initialization
 * @returns {Promise<Record<string, any>[]>} The decoded Avro records
 * @throws {Error} if the HTTP request does not return a success status
 */
export async function fetchAvroRecords(manifestUrl, requestInit) {
  const safeUrl = translateS3Url(manifestUrl)
  const res = await fetch(safeUrl, requestInit)
  // fetch only rejects on network failure, not HTTP errors — without this
  // check a 404/403 body would be fed to the Avro decoder as a manifest
  if (!res.ok) {
    throw new Error(`failed to fetch avro file ${safeUrl}: ${res.status} ${res.statusText}`)
  }
  const buffer = await res.arrayBuffer()
  const reader = { view: new DataView(buffer), offset: 0 }
  const { metadata, syncMarker } = await avroMetadata(reader)
  return await avroRead({ reader, metadata, syncMarker })
}