mapshaper
Version:
A tool for editing geospatial data for mapping and GIS.
1,516 lines (1,390 loc) • 454 kB
JavaScript
(function () {
'use strict';
// The currently executing <script> element when a DOM is present, otherwise null.
// NOTE(review): presumably a bundler-generated shim for locating the script URL — confirm where it is read.
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
/** @type {import('../src/types.d.ts').ParquetType[]} */
const ParquetTypes = [
'BOOLEAN',
'INT32',
'INT64',
'INT96', // deprecated
'FLOAT',
'DOUBLE',
'BYTE_ARRAY',
'FIXED_LEN_BYTE_ARRAY',
];
/** @type {import('../src/types.d.ts').Encoding[]} */
const Encodings = [
'PLAIN',
'GROUP_VAR_INT', // deprecated
'PLAIN_DICTIONARY',
'RLE',
'BIT_PACKED', // deprecated
'DELTA_BINARY_PACKED',
'DELTA_LENGTH_BYTE_ARRAY',
'DELTA_BYTE_ARRAY',
'RLE_DICTIONARY',
'BYTE_STREAM_SPLIT',
];
/** @type {import('../src/types.d.ts').FieldRepetitionType[]} */
const FieldRepetitionTypes = [
'REQUIRED',
'OPTIONAL',
'REPEATED',
];
/** @type {import('../src/types.d.ts').ConvertedType[]} */
const ConvertedTypes = [
'UTF8',
'MAP',
'MAP_KEY_VALUE',
'LIST',
'ENUM',
'DECIMAL',
'DATE',
'TIME_MILLIS',
'TIME_MICROS',
'TIMESTAMP_MILLIS',
'TIMESTAMP_MICROS',
'UINT_8',
'UINT_16',
'UINT_32',
'UINT_64',
'INT_8',
'INT_16',
'INT_32',
'INT_64',
'JSON',
'BSON',
'INTERVAL',
];
/** @type {import('../src/types.d.ts').CompressionCodec[]} */
const CompressionCodecs = [
'UNCOMPRESSED',
'SNAPPY',
'GZIP',
'LZO',
'BROTLI',
'LZ4',
'ZSTD',
'LZ4_RAW',
];
/** @type {import('../src/types.d.ts').PageType[]} */
const PageTypes = [
'DATA_PAGE',
'INDEX_PAGE',
'DICTIONARY_PAGE',
'DATA_PAGE_V2',
];
/** @type {import('../src/types.d.ts').BoundaryOrder[]} */
const BoundaryOrders = [
'UNORDERED',
'ASCENDING',
'DESCENDING',
];
/** @type {import('../src/types.d.ts').EdgeInterpolationAlgorithm[]} */
const EdgeInterpolationAlgorithms = [
'SPHERICAL',
'VINCENTY',
'THOMAS',
'ANDOYER',
'KARNEY',
];
/**
 * Decode one WKB (Well-Known Binary) geometry, starting at the reader's
 * current offset, into a GeoJSON-shaped geometry object.
 *
 * The reader's offset is advanced past everything that was consumed.
 *
 * @param {DataReader} reader
 * @returns {Geometry} geometry object
 * @throws {Error} when the geometry type code is not 1 through 7
 */
function wkbToGeojson(reader) {
  const header = getFlags(reader);
  const { type: kind, count: parts } = header;

  // Multi-geometries hold `parts` members, each prefixed by its own header.
  const readMembers = (readOne) => {
    const members = [];
    for (let n = 0; n < parts; n++) {
      members.push(readOne(reader, getFlags(reader)));
    }
    return members
  };

  switch (kind) {
    case 1:
      return { type: 'Point', coordinates: readPosition(reader, header) }
    case 2:
      return { type: 'LineString', coordinates: readLine(reader, header) }
    case 3:
      return { type: 'Polygon', coordinates: readPolygon(reader, header) }
    case 4:
      return { type: 'MultiPoint', coordinates: readMembers(readPosition) }
    case 5:
      return { type: 'MultiLineString', coordinates: readMembers(readLine) }
    case 6:
      return { type: 'MultiPolygon', coordinates: readMembers(readPolygon) }
    case 7: {
      // Collection members are complete geometries, decoded recursively.
      const geometries = [];
      for (let n = 0; n < parts; n++) {
        geometries.push(wkbToGeojson(reader));
      }
      return { type: 'GeometryCollection', geometries }
    }
    default:
      throw new Error(`Unsupported geometry type: ${kind}`)
  }
}
/**
 * Read a WKB geometry header: byte-order marker, type code and, for every
 * type except Point, the element count that follows.
 *
 * The type code is split as `modifier * 1000 + baseType`; a non-zero modifier
 * adds a third coordinate and modifier 3 adds a fourth.
 *
 * @param {DataReader} reader offset is advanced past the header
 * @returns {WkbFlags}
 */
function getFlags(reader) {
  const { view } = reader;
  // a leading byte of 1 selects little-endian for everything that follows
  const littleEndian = view.getUint8(reader.offset++) === 1;
  const code = view.getUint32(reader.offset, littleEndian);
  reader.offset += 4;

  const type = code % 1000;
  const modifier = Math.floor(code / 1000);

  // every type from 2 through 7 carries a uint32 count
  const hasCount = type > 1 && type <= 7;
  let count = 0;
  if (hasCount) {
    count = view.getUint32(reader.offset, littleEndian);
    reader.offset += 4;
  }

  // XY, XYZ, XYM, XYZM
  const dim = modifier === 3 ? 4 : modifier ? 3 : 2;
  return { littleEndian, type, dim, count }
}
/**
 * Read one position: `flags.dim` consecutive float64 coordinates.
 *
 * @param {DataReader} reader offset is advanced by 8 bytes per coordinate
 * @param {WkbFlags} flags
 * @returns {number[]}
 */
function readPosition(reader, flags) {
  const position = [];
  while (position.length < flags.dim) {
    position.push(reader.view.getFloat64(reader.offset, flags.littleEndian));
    reader.offset += 8;
  }
  return position
}
/**
 * Read `flags.count` consecutive positions.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[][]}
 */
function readLine(reader, flags) {
  const line = [];
  while (line.length < flags.count) {
    line.push(readPosition(reader, flags));
  }
  return line
}
/**
 * Read `flags.count` rings. Each ring is a uint32 point count followed by
 * that many positions.
 *
 * @param {DataReader} reader
 * @param {WkbFlags} flags
 * @returns {number[][][]}
 */
function readPolygon(reader, flags) {
  const polygon = [];
  while (polygon.length < flags.count) {
    const ringSize = reader.view.getUint32(reader.offset, flags.littleEndian);
    reader.offset += 4;
    // same byte order and dimension as the polygon, but the ring's own count
    polygon.push(readLine(reader, { ...flags, count: ringSize }));
  }
  return polygon
}
/**
* @typedef {object} WkbFlags
* @property {boolean} littleEndian
* @property {number} type
* @property {number} dim
* @property {number} count
*/
/**
* @import {DataReader, Geometry} from '../src/types.js'
*/
/**
* @import {ColumnDecoder, DecodedArray, Encoding, ParquetParsers} from '../src/types.js'
*/
const decoder$2 = new TextDecoder();
/**
 * Built-in value parsers, used for every parser the caller does not override.
 * Byte-based parsers pass a falsy input straight through, except
 * uuidFromBytes which returns undefined for it.
 * @type ParquetParsers
 */
const DEFAULT_PARSERS = {
  timestampFromMilliseconds: (millis) => new Date(Number(millis)),
  timestampFromMicroseconds: (micros) => new Date(Number(micros / 1000n)),
  timestampFromNanoseconds: (nanos) => new Date(Number(nanos / 1000000n)),
  dateFromDays: (days) => new Date(days * 86400000),
  stringFromBytes: (bytes) => {
    if (!bytes) return bytes
    return decoder$2.decode(bytes)
  },
  geometryFromBytes: (bytes) => {
    if (!bytes) return bytes
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    return wkbToGeojson({ view, offset: 0 })
  },
  geographyFromBytes: (bytes) => {
    if (!bytes) return bytes
    const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength);
    return wkbToGeojson({ view, offset: 0 })
  },
  uuidFromBytes: (bytes) => {
    if (!bytes) return undefined
    const hex = Array.from(bytes, (b) => b.toString(16).padStart(2, '0')).join('');
    // 8-4-4-4-12 grouping
    const groups = [
      hex.slice(0, 8),
      hex.slice(8, 12),
      hex.slice(12, 16),
      hex.slice(16, 20),
      hex.slice(20, 32),
    ];
    return groups.join('-')
  },
};
/**
 * Resolve dictionary indices to dictionary values, or fall back to the
 * plain type conversion when the page is not dictionary-encoded.
 *
 * On the dictionary path the input array is overwritten in place, unless it
 * is a Uint8Array and the dictionary is not; then a new array of the
 * dictionary's own type is allocated.
 *
 * @param {DecodedArray} data series of primitive types
 * @param {DecodedArray | undefined} dictionary
 * @param {Encoding} encoding
 * @param {ColumnDecoder} columnDecoder
 * @returns {DecodedArray} series of rich types
 */
function convertWithDictionary(data, dictionary, encoding, columnDecoder) {
  const isDictionaryEncoded = dictionary && encoding.endsWith('_DICTIONARY');
  if (!isDictionaryEncoded) return convert(data, columnDecoder)

  const needsUpgrade = data instanceof Uint8Array && !(dictionary instanceof Uint8Array);
  // @ts-expect-error upgrade data to match dictionary type with fancy constructor
  const resolved = needsUpgrade ? new dictionary.constructor(data.length) : data;
  let idx = 0;
  while (idx < data.length) {
    resolved[idx] = dictionary[data[idx]];
    idx++;
  }
  return resolved
}
/**
 * Convert known types from primitive to rich.
 *
 * The checks below run in order and the first match wins; when nothing
 * matches, the input array is returned unchanged.
 *
 * @param {DecodedArray} data series of primitive types
 * @param {ColumnDecoder} columnDecoder supplies the schema element, the parsers, the utf8 flag (default true) and the schema path
 * @returns {DecodedArray} series of rich types
 * @throws {Error} for BSON and INTERVAL converted types, which are not supported
 */
function convert(data, columnDecoder) {
const { element, parsers, utf8 = true, schemaPath } = columnDecoder;
const { type, converted_type: ctype, logical_type: ltype } = element;
// anything other than REQUIRED may hold missing values
const nullable = element.repetition_type !== 'REQUIRED';
// Skip utf8 conversion for plain BYTE_ARRAY inside VARIANT
const isVariant = schemaPath?.some(s => s.element.logical_type?.type === 'VARIANT');
if (isVariant && type === 'BYTE_ARRAY' && ctype !== 'UTF8' && ltype?.type !== 'STRING') {
return data
}
if (ctype === 'DECIMAL') {
// unscaled integer times 10^-scale; a missing scale is treated as 0
const scale = element.scale || 0;
const factor = 10 ** -scale;
const arr = new Array(data.length);
for (let i = 0; i < arr.length; i++) {
// byte-array decimals are decoded by parseDecimal; everything else goes through Number()
if (data[i] instanceof Uint8Array) {
arr[i] = parseDecimal(data[i]) * factor;
} else {
arr[i] = Number(data[i]) * factor;
}
}
return arr
}
// INT96 without a converted type is treated as a timestamp
if (!ctype && type === 'INT96') {
return Array.from(data).map(v => parsers.timestampFromNanoseconds(parseInt96Nanos(v)))
}
if (ctype === 'DATE') {
return Array.from(data).map(v => parsers.dateFromDays(v))
}
if (ctype === 'TIMESTAMP_MILLIS') {
return Array.from(data).map(v => parsers.timestampFromMilliseconds(v))
}
if (ctype === 'TIMESTAMP_MICROS') {
return Array.from(data).map(v => parsers.timestampFromMicroseconds(v))
}
if (ctype === 'JSON') {
return data.map(v => JSON.parse(decoder$2.decode(v)))
}
if (ctype === 'BSON') {
throw new Error('parquet bson not supported')
}
if (ctype === 'INTERVAL') {
throw new Error('parquet interval not supported')
}
if (ltype?.type === 'GEOMETRY') {
return data.map(v => parsers.geometryFromBytes(v))
}
if (ltype?.type === 'GEOGRAPHY') {
return data.map(v => parsers.geographyFromBytes(v))
}
if (ltype?.type === 'UUID') {
return data.map(v => parsers.uuidFromBytes(v))
}
// with the utf8 flag set, any remaining BYTE_ARRAY column is decoded as a string
if (ctype === 'UTF8' || ltype?.type === 'STRING' || utf8 && type === 'BYTE_ARRAY') {
return data.map(v => parsers.stringFromBytes(v))
}
if (ctype === 'UINT_64' || ltype?.type === 'INTEGER' && ltype.bitWidth === 64 && !ltype.isSigned) {
// reinterpret the same bytes as unsigned, without copying
if (data instanceof BigInt64Array) return new BigUint64Array(data.buffer, data.byteOffset, data.length)
// nullable columns use a plain array instead of a typed array
const arr = nullable ? new Array(data.length) : new BigUint64Array(data.length);
for (let i = 0; i < arr.length; i++) arr[i] = data[i];
return arr
}
if (ctype === 'UINT_32' || ltype?.type === 'INTEGER' && ltype.bitWidth === 32 && !ltype.isSigned) {
// reinterpret the same bytes as unsigned, without copying
if (data instanceof Int32Array) return new Uint32Array(data.buffer, data.byteOffset, data.length)
// nullable columns use a plain array instead of a typed array
const arr = nullable ? new Array(data.length) : new Uint32Array(data.length);
for (let i = 0; i < arr.length; i++) {
// negative values are shifted up by 2^32
arr[i] = data[i] < 0 ? 4294967296 + data[i] : data[i];
}
return arr
}
if (ltype?.type === 'FLOAT16') {
return Array.from(data).map(parseFloat16)
}
if (ltype?.type === 'TIMESTAMP') {
const { unit } = ltype;
// milliseconds unless the unit is MICROS or NANOS
/** @type {ParquetParsers[keyof ParquetParsers]} */
let parser = parsers.timestampFromMilliseconds;
if (unit === 'MICROS') parser = parsers.timestampFromMicroseconds;
if (unit === 'NANOS') parser = parsers.timestampFromNanoseconds;
const arr = new Array(data.length);
for (let i = 0; i < arr.length; i++) {
arr[i] = parser(data[i]);
}
return arr
}
return data
}
/**
 * Interpret bytes as a big-endian two's-complement integer and return it as
 * a JS number. An empty input yields 0.
 *
 * @param {Uint8Array} bytes
 * @returns {number}
 */
function parseDecimal(bytes) {
  const width = bytes.length;
  if (!width) return 0

  // accumulate most-significant byte first
  let unsigned = 0n;
  for (let i = 0; i < width; i++) {
    unsigned = unsigned * 256n + BigInt(bytes[i]);
  }

  // a set top bit means the value is negative: subtract 2^bits
  const bits = BigInt(width * 8);
  const signBit = 2n ** (bits - 1n);
  const signed = unsigned >= signBit ? unsigned - 2n ** bits : unsigned;
  return Number(signed)
}
/**
 * Converts INT96 date format (hi 32bit days, lo 64bit nanos) to nanos since epoch
 *
 * @param {bigint} value
 * @returns {bigint}
 */
function parseInt96Nanos(value) {
  const nanosPerDay = 86400000000000n;
  // day number subtracted so that the result counts from the epoch
  const epochDay = 2440588n;
  const dayNumber = value >> 64n;
  const nanosOfDay = BigInt.asUintN(64, value);
  return (dayNumber - epochDay) * nanosPerDay + nanosOfDay
}
/**
 * Decode a 2-byte little-endian half-precision float.
 *
 * @param {Uint8Array | undefined} bytes
 * @returns {number | undefined} undefined when no bytes are given
 */
function parseFloat16(bytes) {
  if (!bytes) return undefined
  const bits = bytes[0] | bytes[1] << 8;
  // layout: 1 sign bit, 5 exponent bits, 10 fraction bits
  const sign = bits >> 15 ? -1 : 1;
  const exponent = bits >> 10 & 0x1f;
  const fraction = bits & 0x3ff;
  switch (exponent) {
    case 0:
      // subnormals
      return sign * 2 ** -14 * (fraction / 1024)
    case 0x1f:
      return fraction ? NaN : sign * Infinity
    default:
      return sign * 2 ** (exponent - 15) * (1 + fraction / 1024)
  }
}
/**
 * Build a tree from the flat, depth-first list of schema elements.
 *
 * `count` on each node is the number of flat elements the node spans,
 * itself included, which is how the position of the next sibling is found.
 *
 * @param {SchemaElement[]} schema
 * @param {number} rootIndex index of the root element
 * @param {string[]} path path to the element
 * @returns {SchemaTree} tree of schema elements
 */
function schemaTree(schema, rootIndex, path) {
  const element = schema[rootIndex];
  const wanted = element.num_children;
  const children = [];
  let count = 1;
  // the next child starts right after everything consumed so far
  for (let n = 0; n < wanted; n++) {
    const childIndex = rootIndex + count;
    const childPath = [...path, schema[childIndex].name];
    const subtree = schemaTree(schema, childIndex, childPath);
    count += subtree.count;
    children.push(subtree);
  }
  return { count, element, children, path }
}
/**
 * Walk from the schema root down the given name parts, returning every node
 * visited (root first).
 *
 * @param {SchemaElement[]} schema
 * @param {string[]} name path to the element
 * @returns {SchemaTree[]} list of schema elements
 * @throws {Error} when a name part has no matching child
 */
function getSchemaPath(schema, name) {
  const trail = [schemaTree(schema, 0, [])];
  for (const part of name) {
    const parent = trail[trail.length - 1];
    const match = parent.children.find(({ element }) => element.name === part);
    if (!match) throw new Error(`parquet schema element not found: ${name}`)
    trail.push(match);
  }
  return trail
}
/**
 * List the dot-joined paths of all leaf nodes, in depth-first order.
 *
 * @param {SchemaTree} schemaTree
 * @returns {string[]} list of physical column names
 */
function getPhysicalColumns(schemaTree) {
  /**
   * @param {SchemaTree} node
   * @returns {string[]}
   */
  const leafNames = (node) => {
    if (!node.children.length) return [node.path.join('.')]
    return node.children.flatMap((child) => leafNames(child))
  };
  return leafNames(schemaTree)
}
/**
 * Count the REPEATED elements along a schema path.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {number} max repetition level
 */
function getMaxRepetitionLevel$1(schemaPath) {
  let repeated = 0;
  for (const node of schemaPath) {
    repeated += node.element.repetition_type === 'REPEATED' ? 1 : 0;
  }
  return repeated
}
/**
 * Count the elements along a schema path that are not REQUIRED, ignoring the
 * first entry (the root).
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {number} max definition level
 */
function getMaxDefinitionLevel(schemaPath) {
  const belowRoot = schemaPath.slice(1);
  return belowRoot.reduce(
    (level, { element }) => (element.repetition_type !== 'REQUIRED' ? level + 1 : level),
    0
  )
}
/**
 * Check if a column is list-like: converted type LIST with exactly one child,
 * where that child is REPEATED and has at most one child of its own.
 *
 * A LIST node with no children is reported as not list-like rather than
 * throwing.
 *
 * @param {SchemaTree} schema
 * @returns {boolean} true if list-like
 */
function isListLike(schema) {
  if (!schema) return false
  if (schema.element.converted_type !== 'LIST') return false
  // exactly one child is required; zero would leave nothing to inspect below
  if (schema.children.length !== 1) return false
  const [firstChild] = schema.children;
  if (firstChild.children.length > 1) return false
  if (firstChild.element.repetition_type !== 'REPEATED') return false
  return true
}
/**
 * Check if a column is map-like: converted type MAP with exactly one child,
 * where that child is REPEATED and has exactly two children, and neither the
 * child named `key` nor the child named `value` is REPEATED.
 *
 * A MAP node with no children is reported as not map-like rather than
 * throwing.
 *
 * @param {SchemaTree} schema
 * @returns {boolean} true if map-like
 */
function isMapLike(schema) {
  if (!schema) return false
  if (schema.element.converted_type !== 'MAP') return false
  // exactly one child is required; zero would leave nothing to inspect below
  if (schema.children.length !== 1) return false
  const [firstChild] = schema.children;
  if (firstChild.children.length !== 2) return false
  if (firstChild.element.repetition_type !== 'REPEATED') return false
  const keyChild = firstChild.children.find(child => child.element.name === 'key');
  if (keyChild?.element.repetition_type === 'REPEATED') return false
  const valueChild = firstChild.children.find(child => child.element.name === 'value');
  if (valueChild?.element.repetition_type === 'REPEATED') return false
  return true
}
/**
 * Returns true if a column is non-nested: its path is exactly root plus one
 * node, and that node is neither REPEATED nor a parent of other nodes.
 *
 * @param {SchemaTree[]} schemaPath
 * @returns {boolean}
 */
function isFlatColumn(schemaPath) {
  if (schemaPath.length !== 2) return false
  const leaf = schemaPath[1];
  const isRepeated = leaf.element.repetition_type === 'REPEATED';
  return !isRepeated && !leaf.children.length
}
/**
* @import {SchemaElement, SchemaTree} from '../src/types.js'
*/
/**
* @import {DataReader, ThriftObject, ThriftType} from '../src/types.js'
*/
// TCompactProtocol types
// These codes are matched against the low 4 bits of a field header byte and
// of a list header byte. STOP ends a struct. Any code not listed here makes
// the element reader throw.
const STOP$1 = 0;
const TRUE$1 = 1;
const FALSE$1 = 2;
const BYTE$1 = 3;
const I16 = 4;
const I32$1 = 5;
const I64$1 = 6;
const DOUBLE$1 = 7;
const BINARY$1 = 8;
const LIST$1 = 9;
const STRUCT$1 = 12;
/**
 * Parse one TCompactProtocol struct starting at the reader's offset.
 *
 * Each field is stored under the key `field_<id>`. Parsing ends at a STOP
 * byte or at the end of the view, whichever comes first.
 *
 * @param {DataReader} reader
 * @returns {{ [key: `field_${number}`]: any }}
 */
function deserializeTCompactProtocol(reader) {
  /** @type {ThriftObject} */
  const fields = {};
  let lastId = 0;
  while (reader.offset < reader.view.byteLength) {
    // header byte: high nibble is the field id delta, low nibble the type
    const header = reader.view.getUint8(reader.offset++);
    const type = header & 0x0f;
    if (type === STOP$1) break
    const delta = header >> 4;
    if (delta) {
      lastId += delta;
    } else {
      // a zero delta means the full field id follows as a zigzag varint
      lastId = readZigZag(reader);
    }
    fields[`field_${lastId}`] = readElement(reader, type);
  }
  return fields
}
/**
 * Read a single thrift value of the given compact-protocol type.
 *
 * @param {DataReader} reader offset is advanced past the value
 * @param {number} type
 * @returns {ThriftType}
 * @throws {Error} for any type code that is not handled here
 */
function readElement(reader, type) {
  // booleans carry their value in the type code itself
  if (type === TRUE$1) return true
  if (type === FALSE$1) return false
  if (type === BYTE$1) return reader.view.getInt8(reader.offset++)
  if (type === I16 || type === I32$1) return readZigZag(reader)
  if (type === I64$1) return readZigZagBigInt(reader)
  if (type === DOUBLE$1) {
    const double = reader.view.getFloat64(reader.offset, true);
    reader.offset += 8;
    return double
  }
  if (type === BINARY$1) {
    // length-prefixed bytes, returned as a view onto the same buffer (no copy)
    const byteCount = readVarInt(reader);
    const start = reader.view.byteOffset + reader.offset;
    const bytes = new Uint8Array(reader.view.buffer, start, byteCount);
    reader.offset += byteCount;
    return bytes
  }
  if (type === LIST$1) {
    // header byte: high nibble is the size (15 = varint follows), low nibble the element type
    const header = reader.view.getUint8(reader.offset++);
    const elemType = header & 0x0f;
    let size = header >> 4;
    if (size === 15) {
      size = readVarInt(reader);
    }
    // boolean list elements are stored as one byte each; 1 means true
    const isBoolList = elemType === TRUE$1 || elemType === FALSE$1;
    const readItem = isBoolList
      ? () => readElement(reader, BYTE$1) === 1
      : () => readElement(reader, elemType);
    const items = new Array(size);
    for (let n = 0; n < size; n++) {
      items[n] = readItem();
    }
    return items
  }
  if (type === STRUCT$1) {
    // main function handles struct parsing
    return deserializeTCompactProtocol(reader)
  }
  // MAP, SET, UUID not used by parquet
  throw new Error(`thrift unhandled type: ${type}`)
}
/**
 * Read varint aka Unsigned LEB128.
 *
 * Bits are combined with 32-bit integer operators, so the result is a signed
 * 32-bit number.
 *
 * @param {DataReader} reader offset is advanced past the varint
 * @returns {number}
 */
function readVarInt(reader) {
  let value = 0;
  let shift = 0;
  let byte;
  do {
    // low 7 bits are payload; the high bit says whether another byte follows
    byte = reader.view.getUint8(reader.offset++);
    value |= (byte & 0x7f) << shift;
    shift += 7;
  } while (byte & 0x80);
  return value
}
/**
 * Read a varint as a bigint, with no width limit.
 *
 * @param {DataReader} reader offset is advanced past the varint
 * @returns {bigint}
 */
function readVarBigInt(reader) {
  let value = 0n;
  let shift = 0n;
  let byte;
  do {
    // low 7 bits are payload; the high bit says whether another byte follows
    byte = reader.view.getUint8(reader.offset++);
    value |= BigInt(byte & 0x7f) << shift;
    shift += 7n;
  } while (byte & 0x80);
  return value
}
/**
 * Read a zigzag-encoded varint as a signed number.
 * Zigzag interleaves the integers as 0, -1, 1, -2, 2, ... so that values of
 * small magnitude have small encodings.
 *
 * @param {DataReader} reader
 * @returns {number}
 */
function readZigZag(reader) {
  const encoded = readVarInt(reader);
  const half = encoded >>> 1;
  // the low bit is the sign: odd values decode to -(half + 1)
  return encoded & 1 ? ~half : half
}
/**
 * Read a zigzag-encoded varint as a signed bigint.
 *
 * @param {DataReader} reader
 * @returns {bigint}
 */
function readZigZagBigInt(reader) {
  const encoded = readVarBigInt(reader);
  const half = encoded >> 1n;
  // the low bit is the sign: odd values decode to -(half + 1)
  return encoded & 1n ? ~half : half
}
/**
 * Tag root-level WKB columns named in the GeoParquet `geo` metadata entry
 * with a GEOMETRY or GEOGRAPHY logical type. Mutates `schema` in place.
 *
 * Only leaf elements directly under the root are considered. A group is
 * skipped together with its entire subtree, however deeply it is nested.
 *
 * @param {SchemaElement[]} schema flat, depth-first list of schema elements (index 0 is the root)
 * @param {KeyValue[] | undefined} key_value_metadata
 * @returns {void}
 */
function markGeoColumns(schema, key_value_metadata) {
  // Prepare the list of GeoParquet columns
  /** @type {Map<string, LogicalType>} */
  const columns = new Map();
  const geo = key_value_metadata?.find(({ key }) => key === 'geo')?.value;
  const decodedColumns = (geo && JSON.parse(geo)?.columns) ?? {};
  for (const [name, column] of Object.entries(decodedColumns)) {
    if (column.encoding !== 'WKB') continue
    const type = column.edges === 'spherical' ? 'GEOGRAPHY' : 'GEOMETRY';
    const id = column.crs?.id ?? column.crs?.ids?.[0];
    const crs = id ? `${id.authority}:${id.code.toString()}` : undefined;
    // Note: we can't infer GEOGRAPHY's algorithm from GeoParquet
    columns.set(name, { type, crs });
  }

  // Mark schema elements with logical type
  // Only look at root-level columns of type BYTE_ARRAY without existing logical_type
  for (let i = 1; i < schema.length; i++) { // skip root
    const { logical_type, name, num_children, type } = schema[i];
    if (num_children) {
      // num_children counts direct children only, so walk the flat list
      // until every pending descendant has been passed
      let pending = num_children;
      while (pending > 0 && i + 1 < schema.length) {
        i++;
        pending += (schema[i].num_children || 0) - 1;
      }
      continue // skip the element and its whole subtree
    }
    if (type === 'BYTE_ARRAY' && !logical_type) {
      schema[i].logical_type = columns.get(name);
    }
  }
}
/**
* @import {KeyValue, LogicalType, SchemaElement} from '../src/types.js'
*/
/**
* @import {AsyncBuffer, FileMetaData, KeyValue, LogicalType, MetadataOptions, MinMaxType, ParquetParsers, SchemaElement, SchemaTree, Statistics, TimeUnit} from '../src/types.js'
*/
const defaultInitialFetchSize = 512 * 1024; // 512kb
const decoder$1 = new TextDecoder();
/**
 * Decode bytes to a string; a falsy input is returned unchanged.
 *
 * @param {Uint8Array} value
 */
function decode(value) {
  if (!value) return value
  return decoder$1.decode(value)
}
/**
 * Read parquet metadata from an async buffer.
 *
 * An AsyncBuffer is like an ArrayBuffer, but the slices are loaded
 * asynchronously, possibly over the network.
 *
 * You must provide the byteLength of the buffer, typically from a HEAD request.
 *
 * In theory, you could use suffix-range requests to fetch the end of the file,
 * and save a round trip. But in practice, this doesn't work because chrome
 * deems suffix-range requests as a not-safe-listed header, and will require
 * a pre-flight. So the byteLength is required.
 *
 * To make this efficient, we initially request the last 512kb of the file,
 * which is likely to contain the metadata. If the metadata length exceeds the
 * initial fetch, 512kb, we request the rest of the metadata from the AsyncBuffer.
 *
 * This ensures that we either make one 512kb initial request for the metadata,
 * or a second request for up to the metadata size.
 *
 * The initial fetch is never smaller than the 8-byte trailer (metadata length
 * plus magic number), so a tiny `initialFetchSize` cannot cause an
 * out-of-bounds read.
 *
 * @param {AsyncBuffer} asyncBuffer parquet file contents
 * @param {MetadataOptions & { initialFetchSize?: number }} options initial fetch size in bytes (default 512kb)
 * @returns {Promise<FileMetaData>} parquet metadata object
 * @throws {Error} when the buffer is missing, shorter than 8 bytes, does not end in "PAR1", or declares a metadata length larger than the file
 */
async function parquetMetadataAsync(asyncBuffer, { parsers, initialFetchSize = defaultInitialFetchSize, geoparquet = true } = {}) {
  if (!asyncBuffer || !(asyncBuffer.byteLength >= 0)) throw new Error('parquet expected AsyncBuffer')
  // {metadata}, metadata_length, PAR1: the trailer alone needs 8 bytes
  if (asyncBuffer.byteLength < 8) {
    throw new Error('parquet file is too short')
  }
  // the first fetch must at least cover the trailer
  const fetchSize = Math.max(8, initialFetchSize);

  // fetch last bytes (footer) of the file
  const footerOffset = Math.max(0, asyncBuffer.byteLength - fetchSize);
  const footerBuffer = await asyncBuffer.slice(footerOffset, asyncBuffer.byteLength);

  // Check for parquet magic number "PAR1"
  const footerView = new DataView(footerBuffer);
  if (footerView.getUint32(footerBuffer.byteLength - 4, true) !== 0x31524150) {
    throw new Error('parquet file invalid (footer != PAR1)')
  }

  // Parquet files store metadata at the end of the file
  // Metadata length is 4 bytes before the last PAR1
  const metadataLength = footerView.getUint32(footerBuffer.byteLength - 8, true);
  if (metadataLength > asyncBuffer.byteLength - 8) {
    throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${asyncBuffer.byteLength - 8}`)
  }

  // check if metadata size fits inside the initial fetch
  if (metadataLength + 8 > fetchSize) {
    // fetch the rest of the metadata
    const metadataOffset = asyncBuffer.byteLength - metadataLength - 8;
    const metadataBuffer = await asyncBuffer.slice(metadataOffset, footerOffset);
    // combine initial fetch with the new slice
    const combinedBuffer = new ArrayBuffer(metadataLength + 8);
    const combinedView = new Uint8Array(combinedBuffer);
    combinedView.set(new Uint8Array(metadataBuffer));
    combinedView.set(new Uint8Array(footerBuffer), footerOffset - metadataOffset);
    return parquetMetadata(combinedBuffer, { parsers, geoparquet })
  } else {
    // parse metadata from the footer
    return parquetMetadata(footerBuffer, { parsers, geoparquet })
  }
}
/**
 * Read parquet metadata from a buffer synchronously.
 *
 * The buffer must end with the file trailer: {metadata}, a 4-byte
 * little-endian metadata length, then the magic number "PAR1". The metadata
 * is decoded from TCompactProtocol and its numbered thrift fields are mapped
 * onto named properties.
 *
 * @param {ArrayBuffer} arrayBuffer parquet file footer
 * @param {MetadataOptions} options metadata parsing options: `parsers` override the default parsers; `geoparquet` (default true) marks geo columns from the key-value metadata
 * @returns {FileMetaData} parquet metadata object
 * @throws {Error} when the input is not an ArrayBuffer, is shorter than 8 bytes, does not end in "PAR1", or declares a metadata length larger than the buffer
 */
function parquetMetadata(arrayBuffer, { parsers, geoparquet = true } = {}) {
if (!(arrayBuffer instanceof ArrayBuffer)) throw new Error('parquet expected ArrayBuffer')
const view = new DataView(arrayBuffer);
// Use default parsers if not given
parsers = { ...DEFAULT_PARSERS, ...parsers };
// Validate footer magic number "PAR1"
if (view.byteLength < 8) {
throw new Error('parquet file is too short')
}
if (view.getUint32(view.byteLength - 4, true) !== 0x31524150) {
throw new Error('parquet file invalid (footer != PAR1)')
}
// Parquet files store metadata at the end of the file
// Metadata length is 4 bytes before the last PAR1
const metadataLengthOffset = view.byteLength - 8;
const metadataLength = view.getUint32(metadataLengthOffset, true);
if (metadataLength > view.byteLength - 8) {
// {metadata}, metadata_length, PAR1
throw new Error(`parquet metadata length ${metadataLength} exceeds available buffer ${view.byteLength - 8}`)
}
// the metadata block ends where the length field begins
const metadataOffset = metadataLengthOffset - metadataLength;
const reader = { view, offset: metadataOffset };
const metadata = deserializeTCompactProtocol(reader);
// Parse metadata from thrift data
const version = metadata.field_1;
// numeric enum codes are translated to names through the lookup arrays
/** @type {SchemaElement[]} */
const schema = metadata.field_2.map((/** @type {any} */ field) => ({
type: ParquetTypes[field.field_1],
type_length: field.field_2,
repetition_type: FieldRepetitionTypes[field.field_3],
name: decode(field.field_4),
num_children: field.field_5,
converted_type: ConvertedTypes[field.field_6],
scale: field.field_7,
precision: field.field_8,
field_id: field.field_9,
logical_type: logicalType$1(field.field_10),
}));
// schema element per column index
// (only elements that have a physical type are kept)
const columnSchema = schema.filter(e => e.type);
const num_rows = metadata.field_3;
const row_groups = metadata.field_4.map((/** @type {any} */ rowGroup) => ({
columns: rowGroup.field_1.map((/** @type {any} */ column, /** @type {number} */ columnIndex) => ({
file_path: decode(column.field_1),
file_offset: column.field_2,
// column metadata is optional; when absent the falsy value is passed through
meta_data: column.field_3 && {
type: ParquetTypes[column.field_3.field_1],
encodings: column.field_3.field_2?.map((/** @type {number} */ e) => Encodings[e]),
path_in_schema: column.field_3.field_3.map(decode),
codec: CompressionCodecs[column.field_3.field_4],
num_values: column.field_3.field_5,
total_uncompressed_size: column.field_3.field_6,
total_compressed_size: column.field_3.field_7,
key_value_metadata: column.field_3.field_8?.map((/** @type {any} */ kv) => ({
key: decode(kv.field_1),
value: decode(kv.field_2),
})),
data_page_offset: column.field_3.field_9,
index_page_offset: column.field_3.field_10,
dictionary_page_offset: column.field_3.field_11,
// min/max statistics are decoded using the schema element at the same column position
statistics: convertStats(column.field_3.field_12, columnSchema[columnIndex], parsers),
encoding_stats: column.field_3.field_13?.map((/** @type {any} */ encodingStat) => ({
page_type: PageTypes[encodingStat.field_1],
encoding: Encodings[encodingStat.field_2],
count: encodingStat.field_3,
})),
bloom_filter_offset: column.field_3.field_14,
bloom_filter_length: column.field_3.field_15,
size_statistics: column.field_3.field_16 && {
unencoded_byte_array_data_bytes: column.field_3.field_16.field_1,
repetition_level_histogram: column.field_3.field_16.field_2,
definition_level_histogram: column.field_3.field_16.field_3,
},
geospatial_statistics: column.field_3.field_17 && {
bbox: column.field_3.field_17.field_1 && {
xmin: column.field_3.field_17.field_1.field_1,
xmax: column.field_3.field_17.field_1.field_2,
ymin: column.field_3.field_17.field_1.field_3,
ymax: column.field_3.field_17.field_1.field_4,
zmin: column.field_3.field_17.field_1.field_5,
zmax: column.field_3.field_17.field_1.field_6,
mmin: column.field_3.field_17.field_1.field_7,
mmax: column.field_3.field_17.field_1.field_8,
},
geospatial_types: column.field_3.field_17.field_2,
},
},
offset_index_offset: column.field_4,
offset_index_length: column.field_5,
column_index_offset: column.field_6,
column_index_length: column.field_7,
crypto_metadata: column.field_8,
encrypted_column_metadata: column.field_9,
})),
total_byte_size: rowGroup.field_2,
num_rows: rowGroup.field_3,
sorting_columns: rowGroup.field_4?.map((/** @type {any} */ sortingColumn) => ({
column_idx: sortingColumn.field_1,
descending: sortingColumn.field_2,
nulls_first: sortingColumn.field_3,
})),
file_offset: rowGroup.field_5,
total_compressed_size: rowGroup.field_6,
ordinal: rowGroup.field_7,
}));
/** @type {KeyValue[] | undefined} */
const key_value_metadata = metadata.field_5?.map((/** @type {any} */ kv) => ({
key: decode(kv.field_1),
value: decode(kv.field_2),
}));
const created_by = decode(metadata.field_6);
// mutates `schema` in place before it is returned
if (geoparquet) {
markGeoColumns(schema, key_value_metadata);
}
return {
version,
schema,
num_rows,
row_groups,
key_value_metadata,
created_by,
metadata_length: metadataLength,
}
}
/**
 * Build the schema tree for parquet metadata and return its root node.
 *
 * @param {{schema: SchemaElement[]}} metadata parquet metadata object
 * @returns {SchemaTree} tree of schema elements
 */
function parquetSchema({ schema }) {
  // an empty name path yields a one-element list holding just the root
  const [root] = getSchemaPath(schema, []);
  return root
}
/**
 * Translate a decoded thrift logical-type union into a named logical type.
 *
 * The first populated field, checked in ascending field order, decides the
 * result. When no known field is populated, the input is returned as-is.
 *
 * @param {any} logicalType
 * @returns {LogicalType | undefined}
 */
function logicalType$1(logicalType) {
  if (logicalType == null) return logicalType

  // ordered list of [thrift field, builder]; the builder receives that field's payload
  /** @type {[string, (payload: any) => LogicalType][]} */
  const builders = [
    ['field_1', () => ({ type: 'STRING' })],
    ['field_2', () => ({ type: 'MAP' })],
    ['field_3', () => ({ type: 'LIST' })],
    ['field_4', () => ({ type: 'ENUM' })],
    ['field_5', (decimal) => ({
      type: 'DECIMAL',
      scale: decimal.field_1,
      precision: decimal.field_2,
    })],
    ['field_6', () => ({ type: 'DATE' })],
    ['field_7', (time) => ({
      type: 'TIME',
      isAdjustedToUTC: time.field_1,
      unit: timeUnit$1(time.field_2),
    })],
    ['field_8', (timestamp) => ({
      type: 'TIMESTAMP',
      isAdjustedToUTC: timestamp.field_1,
      unit: timeUnit$1(timestamp.field_2),
    })],
    ['field_10', (integer) => ({
      type: 'INTEGER',
      bitWidth: integer.field_1,
      isSigned: integer.field_2,
    })],
    ['field_11', () => ({ type: 'NULL' })],
    ['field_12', () => ({ type: 'JSON' })],
    ['field_13', () => ({ type: 'BSON' })],
    ['field_14', () => ({ type: 'UUID' })],
    ['field_15', () => ({ type: 'FLOAT16' })],
    ['field_16', (variant) => ({
      type: 'VARIANT',
      specification_version: variant.field_1,
    })],
    ['field_17', (geometry) => ({
      type: 'GEOMETRY',
      crs: decode(geometry.field_1),
    })],
    ['field_18', (geography) => ({
      type: 'GEOGRAPHY',
      crs: decode(geography.field_1),
      algorithm: EdgeInterpolationAlgorithms[geography.field_2],
    })],
  ];

  for (const [fieldName, build] of builders) {
    const payload = logicalType[fieldName];
    if (payload) return build(payload)
  }
  return logicalType
}
/**
 * Translate a decoded thrift time-unit union into its name.
 *
 * @param {any} unit
 * @returns {TimeUnit}
 * @throws {Error} when none of the three unit fields is populated
 */
function timeUnit$1(unit) {
  const name = unit.field_1 ? 'MILLIS'
    : unit.field_2 ? 'MICROS'
      : unit.field_3 ? 'NANOS'
        : undefined;
  if (name === undefined) throw new Error('parquet time unit required')
  return name
}
/**
 * Map decoded thrift column statistics onto named properties, converting the
 * four min/max entries according to the column's schema element.
 *
 * A falsy `stats` is returned unchanged.
 *
 * @param {any} stats
 * @param {SchemaElement} schema
 * @param {ParquetParsers} parsers
 * @returns {Statistics}
 */
function convertStats(stats, schema, parsers) {
  if (!stats) return stats
  /** @param {Uint8Array | undefined} raw */
  const toValue = (raw) => convertMetadata(raw, schema, parsers);
  return {
    max: toValue(stats.field_1),
    min: toValue(stats.field_2),
    null_count: stats.field_3,
    distinct_count: stats.field_4,
    max_value: toValue(stats.field_5),
    min_value: toValue(stats.field_6),
    is_max_value_exact: stats.field_7,
    is_min_value_exact: stats.field_8,
  }
}
/**
 * Convert one raw min/max value from metadata into a typed value, based on
 * the column's physical type and its converted/logical type.
 *
 * When no conversion applies, the raw bytes are returned unchanged.
 *
 * @param {Uint8Array | undefined} value
 * @param {SchemaElement} schema
 * @param {ParquetParsers} parsers
 * @returns {MinMaxType | undefined}
 */
function convertMetadata(value, schema, parsers) {
  const { type, converted_type: ctype, logical_type: ltype } = schema;
  if (value === undefined) return value
  if (type === 'BOOLEAN') return value[0] === 1
  if (type === 'BYTE_ARRAY') return parsers.stringFromBytes(value)

  const view = new DataView(value.buffer, value.byteOffset, value.byteLength);
  const size = view.byteLength;

  // numeric physical types; all values are little-endian
  if (type === 'FLOAT') {
    if (size === 4) return view.getFloat32(0, true)
  } else if (type === 'DOUBLE') {
    if (size === 8) return view.getFloat64(0, true)
  } else if (type === 'INT32') {
    if (ctype === 'DATE') return parsers.dateFromDays(view.getInt32(0, true))
    if (size === 4) return view.getInt32(0, true)
  } else if (type === 'INT64') {
    if (ctype === 'TIMESTAMP_MILLIS') return parsers.timestampFromMilliseconds(view.getBigInt64(0, true))
    if (ctype === 'TIMESTAMP_MICROS') return parsers.timestampFromMicroseconds(view.getBigInt64(0, true))
    if (ltype?.type === 'TIMESTAMP') {
      // milliseconds unless the unit is NANOS or MICROS
      if (ltype.unit === 'NANOS') return parsers.timestampFromNanoseconds(view.getBigInt64(0, true))
      if (ltype.unit === 'MICROS') return parsers.timestampFromMicroseconds(view.getBigInt64(0, true))
      return parsers.timestampFromMilliseconds(view.getBigInt64(0, true))
    }
    if (size === 8) return view.getBigInt64(0, true)
  }

  // reached by any type that did not return above
  if (ctype === 'DECIMAL') return parseDecimal(value) * 10 ** -(schema.scale || 0)
  if (ltype?.type === 'FLOAT16') return parseFloat16(value)
  // FIXED_LEN_BYTE_ARRAY and anything unrecognised: raw bytes
  return value
}
/**
* @import {ColumnIndex, DataReader, OffsetIndex, PageLocation, ParquetParsers, SchemaElement} from '../src/types.js'
*/
/**
 * Read a ColumnIndex structure by deserializing it from the reader with
 * deserializeTCompactProtocol and mapping the numbered thrift fields to
 * named properties. Min and max values are decoded with convertMetadata.
 *
 * @param {DataReader} reader
 * @param {SchemaElement} schema schema element used to decode min/max values
 * @param {ParquetParsers | undefined} parsers overrides merged on top of DEFAULT_PARSERS
 * @returns {ColumnIndex}
 */
function readColumnIndex(reader, schema, parsers = undefined) {
  const mergedParsers = { ...DEFAULT_PARSERS, ...parsers };
  const raw = deserializeTCompactProtocol(reader);
  /**
   * @param {any} bytes
   * @returns {any}
   */
  const decode = bytes => convertMetadata(bytes, schema, mergedParsers);
  const minValues = raw.field_2.map(decode);
  const maxValues = raw.field_3.map(decode);
  return {
    null_pages: raw.field_1,
    min_values: minValues,
    max_values: maxValues,
    boundary_order: BoundaryOrders[raw.field_4],
    null_counts: raw.field_5,
    repetition_level_histograms: raw.field_6,
    definition_level_histograms: raw.field_7,
  }
}
/**
 * Read an OffsetIndex structure by deserializing it from the reader with
 * deserializeTCompactProtocol and mapping the numbered thrift fields to
 * named properties.
 *
 * @param {DataReader} reader
 * @returns {OffsetIndex}
 */
function readOffsetIndex(reader) {
  const raw = deserializeTCompactProtocol(reader);
  /**
   * @param {any} entry one thrift page location struct
   * @returns {PageLocation}
   */
  const toPageLocation = entry => {
    return {
      offset: entry.field_1,
      compressed_page_size: entry.field_2,
      first_row_index: entry.field_3,
    }
  };
  const pageLocations = raw.field_1.map((/** @type {any} */ entry) => toPageLocation(entry));
  return {
    page_locations: pageLocations,
    unencoded_byte_array_data_bytes: raw.field_2,
  }
}
/**
* @import {AsyncBuffer, Awaitable, DecodedArray} from '../src/types.js'
*/
/**
 * Recursively convert a value into something JSON can represent:
 * undefined becomes null, bigint becomes number, -0 becomes 0, Uint8Array
 * becomes a plain array of numbers, and Date becomes an ISO string.
 * Arrays and objects are converted element by element; object properties
 * whose value is undefined are dropped.
 *
 * @param {any} obj object to convert
 * @returns {unknown} converted object
 */
function toJson(obj) {
  if (obj === undefined) return null
  if (typeof obj === 'bigint') return Number(obj)
  if (Object.is(obj, -0)) return 0
  if (Array.isArray(obj)) return obj.map(item => toJson(item))
  if (obj instanceof Uint8Array) return Array.from(obj)
  if (obj instanceof Date) return obj.toISOString()
  // primitives and null pass through untouched
  if (!(obj instanceof Object)) return obj

  /** @type {Record<string, unknown>} */
  const plain = {};
  Object.keys(obj).forEach(name => {
    if (obj[name] !== undefined) plain[name] = toJson(obj[name]);
  });
  return plain
}
/**
 * Append every element of bbb onto aaa, in place.
 * Elements are pushed in batches of 10000 rather than in a single
 * spread call over the whole input.
 *
 * @param {any[]} aaa destination array, mutated
 * @param {DecodedArray} bbb values to append
 */
function concat(aaa, bbb) {
  const batchSize = 10000;
  let offset = 0;
  while (offset < bbb.length) {
    const batch = bbb.slice(offset, offset + batchSize);
    aaa.push(...batch);
    offset += batchSize;
  }
}
/**
 * Deep equality.
 *
 * Primitives are compared with `===` (or `==` when strict is false).
 * Dates are compared by timestamp, Uint8Arrays and arrays element by
 * element, and other objects by their own enumerable keys. Both objects
 * must have the same set of keys, not merely the same number of keys.
 *
 * @param {any} a
 * @param {any} b
 * @param {boolean} [strict] use strict equality for leaf values (default true)
 * @returns {boolean}
 */
function equals(a, b, strict = true) {
  // eslint-disable-next-line eqeqeq
  if (strict ? a === b : a == b) return true
  if (!a || !b || typeof a !== 'object' || typeof b !== 'object') return false
  // Dates expose no enumerable keys, so the generic key comparison below
  // would treat any two dates as equal. Compare their timestamps instead.
  if (a instanceof Date || b instanceof Date) {
    return a instanceof Date && b instanceof Date && a.getTime() === b.getTime()
  }
  if (a instanceof Uint8Array && b instanceof Uint8Array) {
    if (a.length !== b.length) return false
    for (let i = 0; i < a.length; i++) {
      if (a[i] !== b[i]) return false
    }
    return true
  }
  if (Array.isArray(a) && Array.isArray(b)) {
    if (a.length !== b.length) return false
    for (let i = 0; i < a.length; i++) {
      if (!equals(a[i], b[i], strict)) return false
    }
    return true
  }
  const aKeys = Object.keys(a);
  if (aKeys.length !== Object.keys(b).length) return false
  for (const k of aKeys) {
    // A key missing from b must not match an explicit undefined in a
    if (!Object.prototype.hasOwnProperty.call(b, k)) return false
    if (!equals(a[k], b[k], strict)) return false
  }
  return true
}
/**
 * Determine the size of a remote resource with a GET request for its
 * first byte only (`Range: bytes=0-0`).
 *
 * A 206 response yields the total length from the Content-Range header.
 * A 200 response means the Range header was ignored; the request is
 * aborted and Content-Length is used if present.
 *
 * @param {string} url
 * @param {RequestInit} [requestInit] fetch options
 * @param {typeof globalThis.fetch} [fetchFn] fetch function to use
 * @returns {Promise<number>}
 */
async function byteLengthFromUrlUsingGet(url, requestInit = {}, fetchFn = globalThis.fetch) {
  const aborter = new AbortController();
  const rangeHeaders = new Headers(requestInit.headers);
  rangeHeaders.set('Range', 'bytes=0-0');
  const response = await fetchFn(url, {
    ...requestInit,
    headers: rangeHeaders,
    signal: aborter.signal,
  });
  if (!response.ok) throw new Error(`fetch with range failed ${response.status}`)

  switch (response.status) {
    case 206: {
      // Partial Content: total size follows the slash, e.g. "bytes 0-0/9446073"
      const rangeInfo = response.headers.get('Content-Range');
      if (!rangeInfo) throw new Error('missing content-range header')
      const parsed = /bytes \d+-\d+\/(\d+)/.exec(rangeInfo);
      if (!parsed) throw new Error(`invalid content-range header: ${rangeInfo}`)
      return parseInt(parsed[1])
    }
    case 200: {
      // Whole body is being sent: read the header, then stop the download
      const declared = response.headers.get('Content-Length');
      aborter.abort();
      if (declared) return parseInt(declared)
      break
    }
  }
  throw new Error('server does not support range requests and missing content-length')
}
/**
 * Determine the size of a remote resource, preferring a HEAD request.
 * Delegates to byteLengthFromUrlUsingGet when HEAD answers 403
 * (e.g. signed S3 URLs) or succeeds without a Content-Length header.
 * Any other unsuccessful HEAD response is an error.
 *
 * @param {string} url
 * @param {RequestInit} [requestInit] fetch options, passed through to fetch
 * @param {typeof globalThis.fetch} [customFetch] fetch function to use
 * @returns {Promise<number>}
 */
async function byteLengthFromUrl(url, requestInit, customFetch) {
  const doFetch = customFetch ?? globalThis.fetch;
  const head = await doFetch(url, { ...requestInit, method: 'HEAD' });
  const forbidden = head.status === 403;
  if (!forbidden && !head.ok) throw new Error(`fetch head failed ${head.status}`)
  // A forbidden HEAD carries no usable length, so skip straight to the fallback
  const reported = forbidden ? null : head.headers.get('Content-Length');
  if (reported) return parseInt(reported)
  return byteLengthFromUrlUsingGet(url, requestInit, doFetch)
}
/**
 * Build an AsyncBuffer that reads byte ranges of a URL via HTTP Range requests.
 *
 * When byteLength is not given it is looked up with byteLengthFromUrl.
 * If the endpoint answers a range request with 200 (whole object), the full
 * body is kept and all later slices are served from it.
 *
 * @param {object} options
 * @param {string} options.url
 * @param {number} [options.byteLength] known file size, skips the size lookup
 * @param {typeof globalThis.fetch} [options.fetch] fetch function to use instead of the global one
 * @param {RequestInit} [options.requestInit] fetch options, passed to every request
 * @returns {Promise<AsyncBuffer>}
 */
async function asyncBufferFromUrl({ url, byteLength, requestInit, fetch: customFetch }) {
  if (!url) throw new Error('missing url')
  const doFetch = customFetch ?? globalThis.fetch;
  const size = byteLength ?? await byteLengthFromUrl(url, requestInit, doFetch);
  const baseInit = requestInit || {};
  /**
   * Promise for the entire file, set once an endpoint ignores a range request.
   * @type {Promise<ArrayBuffer>|undefined}
   */
  let wholeFile = undefined;
  /**
   * Cut the requested range out of the fully downloaded file.
   * @param {number} start
   * @param {number} [end]
   * @returns {Promise<ArrayBuffer>}
   */
  const cut = (start, end) => wholeFile.then(bytes => bytes.slice(start, end));

  return {
    byteLength: size,
    async slice(start, end) {
      if (wholeFile) return cut(start, end)

      // HTTP ranges are inclusive, slice ends are exclusive
      const lastByte = end === undefined ? '' : end - 1;
      const headers = new Headers(baseInit.headers);
      headers.set('Range', `bytes=${start}-${lastByte}`);
      const res = await doFetch(url, { ...baseInit, headers });
      if (!res.ok || !res.body) throw new Error(`fetch failed ${res.status}`)

      // 206: the endpoint sent exactly the requested range
      if (res.status === 206) return res.arrayBuffer()
      if (res.status !== 200) {
        throw new Error(`fetch received unexpected status code ${res.status}`)
      }
      // 200: range requests unsupported, the whole object was returned
      wholeFile = res.arrayBuffer();
      return cut(start, end)
    },
  }
}
/**
 * Returns a cached layer on top of an AsyncBuffer. For caching slices of a file
 * that are read multiple times, possibly over a network.
 *
 * Files smaller than minSize are fetched once in full and sliced locally.
 * Larger files cache one result per canonical byte range (see cacheKey).
 * Failed reads are not kept: a rejected read is evicted so that a later
 * call for the same range reads from the underlying file again.
 *
 * @param {AsyncBuffer} file file-like object to cache
 * @param {{ minSize?: number }} [options] minSize: files below this size are cached whole
 * @returns {AsyncBuffer} cached file-like object
 */
function cachedAsyncBuffer({ byteLength, slice }, { minSize = defaultInitialFetchSize } = {}) {
  if (byteLength < minSize) {
    // Cache whole file if it's small; the read starts immediately
    /** @type {Awaitable<ArrayBuffer> | undefined} */
    let whole = slice(0, byteLength);
    return {
      byteLength,
      async slice(start, end) {
        // Re-issue the read if a previous attempt failed and was discarded
        const pending = whole ??= slice(0, byteLength);
        try {
          return (await pending).slice(start, end)
        } catch (err) {
          // Drop the failed read so the next call can retry, unless another
          // caller has already replaced it
          if (whole === pending) whole = undefined;
          throw err
        }
      },
    }
  }
  const cache = new Map();
  return {
    byteLength,
    /**
     * @param {number} start
     * @param {number} [end]
     * @returns {Awaitable<ArrayBuffer>}
     */
    slice(start, end) {
      const key = cacheKey(start, end, byteLength);
      const cached = cache.get(key);
      if (cached) return cached
      // cache miss, read from file
      const promise = slice(start, end);
      cache.set(key, promise);
      // Evict on failure so a transient error does not poison this range.
      // The caller still receives the original (rejecting) promise.
      Promise.resolve(promise).catch(() => {
        if (cache.get(key) === promise) cache.delete(key);
      });
      return promise
    },
  }
}
/**
 * Build the canonical cache key 'start,end' for a byte range, so that
 * equivalent requests map to the same key. When the file size is known,
 * a suffix range (negative start) and an open-ended range are both
 * rewritten as absolute 'start,size' keys.
 *
 * @param {number} start start byte of range, negative for a suffix range
 * @param {number} [end] end byte of range, must be undefined for a suffix range
 * @param {number} [size] size of file, if known
 * @returns {string}
 */
function cacheKey(start, end, size) {
  const isSuffix = start < 0;
  if (end !== undefined) {
    // explicit end: only valid for non-suffix, non-inverted ranges
    if (isSuffix) throw new Error(`invalid suffix range [${start}, ${end}]`)
    if (start > end) throw new Error(`invalid empty range [${start}, ${end}]`)
    return `${start},${end}`
  }
  // open-ended from here on; without a size the key cannot be made absolute
  if (size === undefined) return `${start},`
  const from = isSuffix ? size + start : start;
  return `${from},${size}`
}
/**
 * Merge a list of decoded chunks into one list.
 * A missing list yields an empty array; a single chunk is returned as-is
 * (not copied); otherwise all chunks are appended into a new array.
 *
 * @param {DecodedArray[]} [chunks]
 * @returns {DecodedArray}
 */
function flatten(chunks) {
  if (!chunks) return []
  const count = chunks.length;
  if (count === 1) return chunks[0]
  /** @type {any[]} */
  const merged = [];
  for (let i = 0; i < count; i++) {
    concat(merged, chunks[i]);
  }
  return merged
}
/**
* @import {ParquetQueryFilter, RowGroup} from '../src/types.js'
*/
/**
 * Collect the distinct top-level column names a filter refers to.
 * For a `$and`, `$or` or `$nor` filter (checked in that order, value must
 * be an array) the sub-filters are walked recursively. Otherwise the
 * filter's own keys are used, keeping only the part before the first dot.
 *
 * @param {ParquetQueryFilter} [filter]
 * @returns {string[]} column names, de-duplicated in first-seen order
 */
function columnsNeededForFilter(filter) {
  if (!filter) return []
  // First logical operator present with an array value, if any
  const logicalOp = ['$and', '$or', '$nor'].find(op => op in filter && Array.isArray(filter[op]));
  /** @type {string[]} */
  const names = logicalOp
    ? filter[logicalOp].flatMap((/** @type {any} */ sub) => columnsNeededForFilter(sub))
    // Map dot-notation paths to top-level column names
    : Object.keys(filter).map(path => path.split('.')[0]);
  return Array.from(new Set(names))
}
/**
* Match a record against a query filter
*
* @param {Record<string, any>} record
* @param {ParquetQueryFilter} filter
* @param {boolean} [strict]
* @returns {boolean}
*/
function matchFilter(record, filter, strict = true) {
if ('$and' in filter && Array.isArray(filter.$and)) {
return filter.$and.every(subQuery => matchFilter(record, subQuery, strict))
}
if ('$or' in filter && Array.isArray(filter.$or)) {
return filter.$or.some(subQuery => matchFilter(record, subQuery, strict))
}
if ('$nor' in filter && Array.isArray(filter.$nor)) {
return !filter.$nor.some(subQuery => matchFilter(record, subQuery, strict))
}
return Object.entries(filter).every(([field, condition]) => {
const value = resolve(record, field);
// implicit $eq for non-object conditions
if (typeof condition !== 'object' || condition === null || Array.isArray(condition)) {
return equals(value, condition, strict)
}
return Object.entries(condition || {}).every(([operator, target]) => {
if (operator === '$gt') return value > target
if (operator === '$gte') return value >= target
if (operator === '$lt') return value < target
if