@severo_tests/hyparquet
Version:
Parquet file parser for JavaScript
211 lines (196 loc) • 6.62 kB
JavaScript
import { defaultInitialFetchSize } from './metadata.js'
/**
 * Recursively convert a decoded value into one made only of legal JSON types.
 *
 * Conversions:
 * - undefined becomes null
 * - bigint becomes number (values beyond Number.MAX_SAFE_INTEGER lose precision)
 * - arrays are mapped element by element into a new array
 * - Uint8Array becomes a plain array of byte values
 * - Date becomes its ISO 8601 string
 * - any other object becomes a new plain object built from its own enumerable
 *   keys, with keys whose value is undefined dropped
 * Strings, numbers, booleans and null are returned unchanged.
 *
 * @param {any} obj object to convert
 * @returns {unknown} converted object
 * @throws {RangeError} if a Date holding an invalid time value is encountered
 */
export function toJson(obj) {
  if (obj === undefined) return null
  if (typeof obj === 'bigint') return Number(obj)
  if (Array.isArray(obj)) return obj.map(toJson)
  if (obj instanceof Uint8Array) return Array.from(obj)
  if (obj instanceof Date) return obj.toISOString()
  // `instanceof Object` is false for null-prototype objects such as
  // Object.create(null), so also accept any non-null value of type 'object'.
  const isObjectLike = obj instanceof Object || obj !== null && typeof obj === 'object'
  if (isObjectLike) {
    /** @type {Record<string, unknown>} */
    const newObj = {}
    for (const key of Object.keys(obj)) {
      if (obj[key] === undefined) continue
      newObj[key] = toJson(obj[key])
    }
    return newObj
  }
  return obj
}
/**
 * Append every element of `bbb` to the end of `aaa`, mutating `aaa` in place.
 *
 * Elements are pushed in batches of 10000 rather than with one spread over
 * the whole input, presumably to keep the argument count of each push bounded.
 *
 * @param {any[]} aaa first array, receives the appended elements
 * @param {DecodedArray} bbb second array, left unmodified
 */
export function concat(aaa, bbb) {
  const batchSize = 10000
  let offset = 0
  while (offset < bbb.length) {
    const batch = bbb.slice(offset, offset + batchSize)
    aaa.push(...batch)
    offset += batchSize
  }
}
/**
 * Deep equality comparison.
 *
 * Rules:
 * - values that are `===` are equal
 * - two Uint8Arrays are equal when they hold the same bytes
 * - two Dates are equal when they hold the same time value
 * - two arrays are equal when they have the same length and equal elements
 * - two other objects are equal when they have the same number of own
 *   enumerable keys, every key of `a` is also an own key of `b`, and the
 *   values under each key are equal
 * Anything else is unequal: differing types, a falsy value on one side only,
 * an array versus a non-array, a Date versus a non-Date, and NaN versus NaN.
 *
 * @param {any} a First object to compare
 * @param {any} b Second object to compare
 * @returns {boolean} true if objects are equal
 */
export function equals(a, b) {
  if (a === b) return true
  if (a instanceof Uint8Array && b instanceof Uint8Array) return equals(Array.from(a), Array.from(b))
  if (!a || !b || typeof a !== typeof b) return false
  // only objects can still be equal here; primitives were settled by ===
  if (typeof a !== 'object') return false

  // Dates have no enumerable keys, so they must be compared by time value
  // or any two Dates would look like equal empty objects.
  const aIsDate = a instanceof Date
  const bIsDate = b instanceof Date
  if (aIsDate || bIsDate) {
    return aIsDate && bIsDate && a.getTime() === b.getTime()
  }

  // an array never equals a plain object, even with matching index keys
  const aIsArray = Array.isArray(a)
  if (aIsArray !== Array.isArray(b)) return false
  if (aIsArray) {
    return a.length === b.length && a.every((v, i) => equals(v, b[i]))
  }

  const keys = Object.keys(a)
  if (keys.length !== Object.keys(b).length) return false
  // require the key to exist on b so { x: undefined } does not match { y: undefined }
  return keys.every(k => Object.prototype.hasOwnProperty.call(b, k) && equals(a[k], b[k]))
}
/**
 * Get the byte length of a URL using a HEAD request.
 * If requestInit is provided, it will be passed to fetch; its `method` is
 * always overridden with 'HEAD'.
 *
 * @param {string} url
 * @param {RequestInit} [requestInit] fetch options
 * @param {typeof globalThis.fetch} [customFetch] fetch function to use
 * @returns {Promise<number>} value of the Content-Length response header
 * @throws {Error} if the response is not ok, or the Content-Length header is
 *   missing or is not a non-negative integer
 */
export async function byteLengthFromUrl(url, requestInit, customFetch) {
  const fetch = customFetch ?? globalThis.fetch
  const res = await fetch(url, { ...requestInit, method: 'HEAD' })
  if (!res.ok) throw new Error(`fetch head failed ${res.status}`)
  const length = res.headers.get('Content-Length')
  if (!length) throw new Error('missing content length')
  // explicit radix: Content-Length is always decimal
  const byteLength = Number.parseInt(length, 10)
  // fail here instead of handing NaN to callers as a file size
  if (!Number.isSafeInteger(byteLength) || byteLength < 0) {
    throw new Error(`invalid content length ${length}`)
  }
  return byteLength
}
/**
 * Construct an AsyncBuffer for a URL.
 * If byteLength is not provided (or is 0), will make a HEAD request to get the file size.
 * If fetch is provided, it will be used instead of the global fetch.
 * If requestInit is provided, it will be passed to fetch.
 *
 * Each slice() issues a request with a Range header. If the endpoint answers
 * 200 with the whole object instead of 206, that response is kept and all
 * later slices are served from it without further requests.
 *
 * @param {object} options
 * @param {string} options.url
 * @param {number} [options.byteLength]
 * @param {typeof globalThis.fetch} [options.fetch] fetch function to use
 * @param {RequestInit} [options.requestInit]
 * @returns {Promise<AsyncBuffer>}
 * @throws {Error} if url is missing; slice() rejects if the request fails or
 *   returns a status other than 200 or 206
 */
export async function asyncBufferFromUrl({ url, byteLength, requestInit, fetch: customFetch }) {
  if (!url) throw new Error('missing url')
  const fetch = customFetch ?? globalThis.fetch
  // byte length from HEAD request
  byteLength ||= await byteLengthFromUrl(url, requestInit, fetch)

  /**
   * A promise for the whole buffer, if range requests are not supported.
   * @type {Promise<ArrayBuffer>|undefined}
   */
  let buffer = undefined
  const init = requestInit || {}

  return {
    byteLength,
    async slice(start, end) {
      if (buffer) {
        return buffer.then(whole => whole.slice(start, end))
      }

      const headers = new Headers(init.headers)
      // A negative start with no end asks for the last N bytes. The HTTP form
      // of that suffix range is `bytes=-N`; `bytes=-N-` is not a valid range.
      const isSuffixRange = start < 0 && end === undefined
      // HTTP ranges are inclusive, slice() ends are exclusive
      const endStr = end === undefined ? '' : end - 1
      const range = isSuffixRange ? `bytes=${start}` : `bytes=${start}-${endStr}`
      headers.set('Range', range)

      const res = await fetch(url, { ...init, headers })
      if (!res.ok || !res.body) throw new Error(`fetch failed ${res.status}`)

      if (res.status === 200) {
        // Endpoint does not support range requests and returned the whole object
        buffer = res.arrayBuffer()
        return buffer.then(whole => whole.slice(start, end))
      } else if (res.status === 206) {
        // The endpoint supports range requests and sent us the requested range
        return res.arrayBuffer()
      } else {
        throw new Error(`fetch received unexpected status code ${res.status}`)
      }
    },
  }
}
/**
 * Wrap an AsyncBuffer in a caching layer, so that ranges which are read more
 * than once (possibly over a network) are only fetched once.
 *
 * Files smaller than `minSize` are read in full immediately and every slice is
 * cut from that single read. Larger files keep one cache entry per requested
 * byte range, keyed by the canonical range string from cacheKey.
 *
 * @param {AsyncBuffer} file file-like object to cache
 * @param {{ minSize?: number }} [options]
 * @returns {AsyncBuffer} cached file-like object
 */
export function cachedAsyncBuffer({ byteLength, slice }, { minSize = defaultInitialFetchSize } = {}) {
  if (byteLength < minSize) {
    // Small file: start reading all of it now and serve every slice from that read
    const whole = slice(0, byteLength)
    return {
      byteLength,
      async slice(start, end) {
        const bytes = await whole
        return bytes.slice(start, end)
      },
    }
  }

  /** @type {Map<string, Awaitable<ArrayBuffer>>} */
  const pending = new Map()
  return {
    byteLength,
    /**
     * @param {number} start
     * @param {number} [end]
     * @returns {Awaitable<ArrayBuffer>}
     */
    slice(start, end) {
      const rangeKey = cacheKey(start, end, byteLength)
      let entry = pending.get(rangeKey)
      if (!entry) {
        // cache miss, read from file and remember the in-flight result
        entry = slice(start, end)
        pending.set(rangeKey, entry)
      }
      return entry
    },
  }
}
/**
 * Build the canonical 'start,end' cache key for a byte range.
 * When the file size is known, open-ended ranges and suffix ranges (negative
 * start, no end) are resolved to absolute offsets so that equivalent requests
 * share one key.
 *
 * @import {AsyncBuffer, Awaitable, DecodedArray} from '../src/types.d.ts'
 * @param {number} start start byte of range
 * @param {number} [end] end byte of range, or undefined for suffix range
 * @param {number} [size] size of file, or undefined for suffix range
 * @returns {string}
 */
function cacheKey(start, end, size) {
  const isSuffix = start < 0
  const hasEnd = end !== undefined

  // a suffix range counts back from the end of the file and cannot carry an end
  if (isSuffix && hasEnd) {
    throw new Error(`invalid suffix range [${start}, ${end}]`)
  }

  if (hasEnd) {
    if (start > end) throw new Error(`invalid empty range [${start}, ${end}]`)
    return `${start},${end}`
  }

  // no explicit end from here on: without a size the key stays open-ended
  if (size === undefined) return `${start},`

  const from = isSuffix ? size + start : start
  return `${from},${size}`
}
/**
 * Merge a list of chunks into one flat list.
 *
 * A missing list yields a new empty array. A list holding exactly one chunk
 * returns that chunk itself, not a copy. Otherwise the chunks are appended in
 * order into a new array.
 *
 * @param {DecodedArray[]} [chunks]
 * @returns {DecodedArray}
 */
export function flatten(chunks) {
  if (!chunks) {
    return []
  }
  if (chunks.length === 1) {
    // single chunk: hand back the same object, nothing to merge
    return chunks[0]
  }
  /** @type {any[]} */
  const merged = []
  for (let i = 0; i < chunks.length; i += 1) {
    concat(merged, chunks[i])
  }
  return merged
}