UNPKG

@lancedb/lancedb

Version:

LanceDB: A serverless, low-latency vector database for AI applications

427 lines (426 loc) 18.3 kB
"use strict"; // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The LanceDB Authors Object.defineProperty(exports, "__esModule", { value: true }); exports.sanitizeMetadata = sanitizeMetadata; exports.sanitizeInt = sanitizeInt; exports.sanitizeFloat = sanitizeFloat; exports.sanitizeDecimal = sanitizeDecimal; exports.sanitizeDate = sanitizeDate; exports.sanitizeTime = sanitizeTime; exports.sanitizeTimestamp = sanitizeTimestamp; exports.sanitizeTypedTimestamp = sanitizeTypedTimestamp; exports.sanitizeInterval = sanitizeInterval; exports.sanitizeList = sanitizeList; exports.sanitizeStruct = sanitizeStruct; exports.sanitizeUnion = sanitizeUnion; exports.sanitizeTypedUnion = sanitizeTypedUnion; exports.sanitizeFixedSizeBinary = sanitizeFixedSizeBinary; exports.sanitizeFixedSizeList = sanitizeFixedSizeList; exports.sanitizeMap = sanitizeMap; exports.sanitizeDuration = sanitizeDuration; exports.sanitizeDictionary = sanitizeDictionary; exports.sanitizeType = sanitizeType; exports.sanitizeField = sanitizeField; exports.sanitizeSchema = sanitizeSchema; exports.sanitizeTable = sanitizeTable; // The utilities in this file help sanitize data from the user's arrow // library into the types expected by vectordb's arrow library. Node // generally allows for mulitple versions of the same library (and sometimes // even multiple copies of the same version) to be installed at the same // time. However, arrow-js uses instanceof which expected that the input // comes from the exact same library instance. This is not always the case // and so we must sanitize the input to ensure that it is compatible. const apache_arrow_1 = require("apache-arrow"); const arrow_1 = require("./arrow"); function sanitizeMetadata(metadataLike) { if (metadataLike === undefined || metadataLike === null) { return undefined; } if (!(metadataLike instanceof Map)) { throw Error("Expected metadata, if present, to be a Map<string, string>"); } for (const item of metadataLike) { if (!(typeof item[0] === "string" || !(typeof item[1] === "string"))) { throw Error("Expected metadata, if present, to be a Map<string, string> but it had non-string keys or values"); } } return metadataLike; } function sanitizeInt(typeLike) { if (!("bitWidth" in typeLike) || typeof typeLike.bitWidth !== "number" || !("isSigned" in typeLike) || typeof typeLike.isSigned !== "boolean") { throw Error("Expected an Int Type to have a `bitWidth` and `isSigned` property"); } return new arrow_1.Int(typeLike.isSigned, typeLike.bitWidth); } function sanitizeFloat(typeLike) { if (!("precision" in typeLike) || typeof typeLike.precision !== "number") { throw Error("Expected a Float Type to have a `precision` property"); } return new arrow_1.Float(typeLike.precision); } function sanitizeDecimal(typeLike) { if (!("scale" in typeLike) || typeof typeLike.scale !== "number" || !("precision" in typeLike) || typeof typeLike.precision !== "number" || !("bitWidth" in typeLike) || typeof typeLike.bitWidth !== "number") { throw Error("Expected a Decimal Type to have `scale`, `precision`, and `bitWidth` properties"); } return new arrow_1.Decimal(typeLike.scale, typeLike.precision, typeLike.bitWidth); } function sanitizeDate(typeLike) { if (!("unit" in typeLike) || typeof typeLike.unit !== "number") { throw Error("Expected a Date type to have a `unit` property"); } return new arrow_1.Date_(typeLike.unit); } function sanitizeTime(typeLike) { if (!("unit" in typeLike) || typeof typeLike.unit !== "number" || !("bitWidth" in typeLike) || typeof typeLike.bitWidth !== "number") { throw Error("Expected a Time type to have `unit` and `bitWidth` properties"); } return new arrow_1.Time(typeLike.unit, typeLike.bitWidth); } function sanitizeTimestamp(typeLike) { if (!("unit" in typeLike) || typeof typeLike.unit !== "number") { throw Error("Expected a Timestamp type to have a `unit` property"); } let timezone = null; if ("timezone" in typeLike && typeof typeLike.timezone === "string") { timezone = typeLike.timezone; } return new arrow_1.Timestamp(typeLike.unit, timezone); } function sanitizeTypedTimestamp(typeLike, // eslint-disable-next-line @typescript-eslint/naming-convention Datatype) { let timezone = null; if ("timezone" in typeLike && typeof typeLike.timezone === "string") { timezone = typeLike.timezone; } return new Datatype(timezone); } function sanitizeInterval(typeLike) { if (!("unit" in typeLike) || typeof typeLike.unit !== "number") { throw Error("Expected an Interval type to have a `unit` property"); } return new arrow_1.Interval(typeLike.unit); } function sanitizeList(typeLike) { if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { throw Error("Expected a List type to have an array-like `children` property"); } if (typeLike.children.length !== 1) { throw Error("Expected a List type to have exactly one child"); } return new arrow_1.List(sanitizeField(typeLike.children[0])); } function sanitizeStruct(typeLike) { if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { throw Error("Expected a Struct type to have an array-like `children` property"); } return new arrow_1.Struct(typeLike.children.map((child) => sanitizeField(child))); } function sanitizeUnion(typeLike) { if (!("typeIds" in typeLike) || !("mode" in typeLike) || typeof typeLike.mode !== "number") { throw Error("Expected a Union type to have `typeIds` and `mode` properties"); } if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { throw Error("Expected a Union type to have an array-like `children` property"); } return new arrow_1.Union(typeLike.mode, // biome-ignore lint/suspicious/noExplicitAny: skip typeLike.typeIds, typeLike.children.map((child) => sanitizeField(child))); } function sanitizeTypedUnion(typeLike, // eslint-disable-next-line @typescript-eslint/naming-convention UnionType) { if (!("typeIds" in typeLike)) { throw Error("Expected a DenseUnion/SparseUnion type to have a `typeIds` property"); } if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { throw Error("Expected a DenseUnion/SparseUnion type to have an array-like `children` property"); } return new UnionType(typeLike.typeIds, typeLike.children.map((child) => sanitizeField(child))); } function sanitizeFixedSizeBinary(typeLike) { if (!("byteWidth" in typeLike) || typeof typeLike.byteWidth !== "number") { throw Error("Expected a FixedSizeBinary type to have a `byteWidth` property"); } return new arrow_1.FixedSizeBinary(typeLike.byteWidth); } function sanitizeFixedSizeList(typeLike) { if (!("listSize" in typeLike) || typeof typeLike.listSize !== "number") { throw Error("Expected a FixedSizeList type to have a `listSize` property"); } if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { throw Error("Expected a FixedSizeList type to have an array-like `children` property"); } if (typeLike.children.length !== 1) { throw Error("Expected a FixedSizeList type to have exactly one child"); } return new arrow_1.FixedSizeList(typeLike.listSize, sanitizeField(typeLike.children[0])); } function sanitizeMap(typeLike) { if (!("children" in typeLike) || !Array.isArray(typeLike.children)) { throw Error("Expected a Map type to have an array-like `children` property"); } if (!("keysSorted" in typeLike) || typeof typeLike.keysSorted !== "boolean") { throw Error("Expected a Map type to have a `keysSorted` property"); } return new arrow_1.Map_( // biome-ignore lint/suspicious/noExplicitAny: skip typeLike.children.map((field) => sanitizeField(field)), typeLike.keysSorted); } function sanitizeDuration(typeLike) { if (!("unit" in typeLike) || typeof typeLike.unit !== "number") { throw Error("Expected a Duration type to have a `unit` property"); } return new arrow_1.Duration(typeLike.unit); } function sanitizeDictionary(typeLike) { if (!("id" in typeLike) || typeof typeLike.id !== "number") { throw Error("Expected a Dictionary type to have an `id` property"); } if (!("indices" in typeLike) || typeof typeLike.indices !== "object") { throw Error("Expected a Dictionary type to have an `indices` property"); } if (!("dictionary" in typeLike) || typeof typeLike.dictionary !== "object") { throw Error("Expected a Dictionary type to have an `dictionary` property"); } if (!("isOrdered" in typeLike) || typeof typeLike.isOrdered !== "boolean") { throw Error("Expected a Dictionary type to have an `isOrdered` property"); } return new arrow_1.Dictionary(sanitizeType(typeLike.dictionary), sanitizeType(typeLike.indices), typeLike.id, typeLike.isOrdered); } // biome-ignore lint/suspicious/noExplicitAny: skip function sanitizeType(typeLike) { if (typeof typeLike !== "object" || typeLike === null) { throw Error("Expected a Type but object was null/undefined"); } if (!("typeId" in typeLike) || !(typeof typeLike.typeId !== "function" || typeof typeLike.typeId !== "number")) { throw Error("Expected a Type to have a typeId property"); } let typeId; if (typeof typeLike.typeId === "function") { typeId = typeLike.typeId(); } else if (typeof typeLike.typeId === "number") { typeId = typeLike.typeId; } else { throw Error("Type's typeId property was not a function or number"); } switch (typeId) { case arrow_1.Type.NONE: throw Error("Received a Type with a typeId of NONE"); case arrow_1.Type.Null: return new arrow_1.Null(); case arrow_1.Type.Int: return sanitizeInt(typeLike); case arrow_1.Type.Float: return sanitizeFloat(typeLike); case arrow_1.Type.Binary: return new arrow_1.Binary(); case arrow_1.Type.Utf8: return new arrow_1.Utf8(); case arrow_1.Type.Bool: return new arrow_1.Bool(); case arrow_1.Type.Decimal: return sanitizeDecimal(typeLike); case arrow_1.Type.Date: return sanitizeDate(typeLike); case arrow_1.Type.Time: return sanitizeTime(typeLike); case arrow_1.Type.Timestamp: return sanitizeTimestamp(typeLike); case arrow_1.Type.Interval: return sanitizeInterval(typeLike); case arrow_1.Type.List: return sanitizeList(typeLike); case arrow_1.Type.Struct: return sanitizeStruct(typeLike); case arrow_1.Type.Union: return sanitizeUnion(typeLike); case arrow_1.Type.FixedSizeBinary: return sanitizeFixedSizeBinary(typeLike); case arrow_1.Type.FixedSizeList: return sanitizeFixedSizeList(typeLike); case arrow_1.Type.Map: return sanitizeMap(typeLike); case arrow_1.Type.Duration: return sanitizeDuration(typeLike); case arrow_1.Type.Dictionary: return sanitizeDictionary(typeLike); case arrow_1.Type.Int8: return new arrow_1.Int8(); case arrow_1.Type.Int16: return new arrow_1.Int16(); case arrow_1.Type.Int32: return new arrow_1.Int32(); case arrow_1.Type.Int64: return new arrow_1.Int64(); case arrow_1.Type.Uint8: return new arrow_1.Uint8(); case arrow_1.Type.Uint16: return new arrow_1.Uint16(); case arrow_1.Type.Uint32: return new arrow_1.Uint32(); case arrow_1.Type.Uint64: return new arrow_1.Uint64(); case arrow_1.Type.Float16: return new arrow_1.Float16(); case arrow_1.Type.Float32: return new arrow_1.Float32(); case arrow_1.Type.Float64: return new arrow_1.Float64(); case arrow_1.Type.DateMillisecond: return new arrow_1.DateMillisecond(); case arrow_1.Type.DateDay: return new arrow_1.DateDay(); case arrow_1.Type.TimeNanosecond: return new arrow_1.TimeNanosecond(); case arrow_1.Type.TimeMicrosecond: return new arrow_1.TimeMicrosecond(); case arrow_1.Type.TimeMillisecond: return new arrow_1.TimeMillisecond(); case arrow_1.Type.TimeSecond: return new arrow_1.TimeSecond(); case arrow_1.Type.TimestampNanosecond: return sanitizeTypedTimestamp(typeLike, arrow_1.TimestampNanosecond); case arrow_1.Type.TimestampMicrosecond: return sanitizeTypedTimestamp(typeLike, arrow_1.TimestampMicrosecond); case arrow_1.Type.TimestampMillisecond: return sanitizeTypedTimestamp(typeLike, arrow_1.TimestampMillisecond); case arrow_1.Type.TimestampSecond: return sanitizeTypedTimestamp(typeLike, arrow_1.TimestampSecond); case arrow_1.Type.DenseUnion: return sanitizeTypedUnion(typeLike, arrow_1.DenseUnion); case arrow_1.Type.SparseUnion: return sanitizeTypedUnion(typeLike, arrow_1.SparseUnion); case arrow_1.Type.IntervalDayTime: return new arrow_1.IntervalDayTime(); case arrow_1.Type.IntervalYearMonth: return new arrow_1.IntervalYearMonth(); case arrow_1.Type.DurationNanosecond: return new arrow_1.DurationNanosecond(); case arrow_1.Type.DurationMicrosecond: return new arrow_1.DurationMicrosecond(); case arrow_1.Type.DurationMillisecond: return new arrow_1.DurationMillisecond(); case arrow_1.Type.DurationSecond: return new arrow_1.DurationSecond(); default: throw new Error("Unrecoginized type id in schema: " + typeId); } } function sanitizeField(fieldLike) { if (fieldLike instanceof arrow_1.Field) { return fieldLike; } if (typeof fieldLike !== "object" || fieldLike === null) { throw Error("Expected a Field but object was null/undefined"); } if (!("type" in fieldLike) || !("name" in fieldLike) || !("nullable" in fieldLike)) { throw Error("The field passed in is missing a `type`/`name`/`nullable` property"); } const type = sanitizeType(fieldLike.type); const name = fieldLike.name; if (!(typeof name === "string")) { throw Error("The field passed in had a non-string `name` property"); } const nullable = fieldLike.nullable; if (!(typeof nullable === "boolean")) { throw Error("The field passed in had a non-boolean `nullable` property"); } let metadata; if ("metadata" in fieldLike) { metadata = sanitizeMetadata(fieldLike.metadata); } return new arrow_1.Field(name, type, nullable, metadata); } /** * Convert something schemaLike into a Schema instance * * This method is often needed even when the caller is using a Schema * instance because they might be using a different instance of apache-arrow * than lancedb is using. */ function sanitizeSchema(schemaLike) { if (schemaLike instanceof arrow_1.Schema) { return schemaLike; } if (typeof schemaLike !== "object" || schemaLike === null) { throw Error("Expected a Schema but object was null/undefined"); } if (!("fields" in schemaLike)) { throw Error("The schema passed in does not appear to be a schema (no 'fields' property)"); } let metadata; if ("metadata" in schemaLike) { metadata = sanitizeMetadata(schemaLike.metadata); } if (!Array.isArray(schemaLike.fields)) { throw Error("The schema passed in had a 'fields' property but it was not an array"); } const sanitizedFields = schemaLike.fields.map((field) => sanitizeField(field)); return new arrow_1.Schema(sanitizedFields, metadata); } function sanitizeTable(tableLike) { if (tableLike instanceof arrow_1.Table) { return tableLike; } if (typeof tableLike !== "object" || tableLike === null) { throw Error("Expected a Table but object was null/undefined"); } if (!("schema" in tableLike)) { throw Error("The table passed in does not appear to be a table (no 'schema' property)"); } if (!("batches" in tableLike)) { throw Error("The table passed in does not appear to be a table (no 'columns' property)"); } const schema = sanitizeSchema(tableLike.schema); const batches = tableLike.batches.map(sanitizeRecordBatch); return new arrow_1.Table(schema, batches); } function sanitizeRecordBatch(batchLike) { if (batchLike instanceof arrow_1.RecordBatch) { return batchLike; } if (typeof batchLike !== "object" || batchLike === null) { throw Error("Expected a RecordBatch but object was null/undefined"); } if (!("schema" in batchLike)) { throw Error("The record batch passed in does not appear to be a record batch (no 'schema' property)"); } if (!("data" in batchLike)) { throw Error("The record batch passed in does not appear to be a record batch (no 'data' property)"); } const schema = sanitizeSchema(batchLike.schema); const data = sanitizeData(batchLike.data); return new arrow_1.RecordBatch(schema, data); } function sanitizeData(dataLike) { if (dataLike instanceof apache_arrow_1.Data) { return dataLike; } return new apache_arrow_1.Data(dataLike.type, dataLike.offset, dataLike.length, dataLike.nullCount, { [apache_arrow_1.BufferType.OFFSET]: dataLike.valueOffsets, [apache_arrow_1.BufferType.DATA]: dataLike.values, [apache_arrow_1.BufferType.VALIDITY]: dataLike.nullBitmap, [apache_arrow_1.BufferType.TYPE]: dataLike.typeIds, }); }