UNPKG

@dpkit/table

Version:

Data Package implementation in TypeScript.

226 lines (195 loc) 7.22 kB
import type { Field, Schema } from "@dpkit/core"
import type { DataFrame } from "nodejs-polars"
import { col } from "nodejs-polars"
import { getPolarsSchema } from "../schema/index.ts"
import type { Table } from "../table/index.ts"
import type { SchemaOptions } from "./Options.ts"

// TODO: Implement actual options usage for inferring
// TODO: Review default values being {fields: []} vs undefined

/**
 * Options controlling schema inference, in addition to the inherited
 * `SchemaOptions`.
 */
export interface InferSchemaOptions extends SchemaOptions {
  /** Number of leading rows to sample; `inferSchemaFromTable` defaults it to 100. */
  sampleRows?: number
  /**
   * Fraction of sampled rows used to derive how many non-matching values a
   * pattern may have; `inferSchemaFromSample` defaults it to 0.9.
   */
  confidence?: number
  /** Switches the numeric patterns to their comma-as-decimal interpretation. */
  commaDecimal?: boolean
  /** Picks month-first formats for ambiguous `dd/mm/yyyy`-style dates. */
  monthFirst?: boolean
  /** When set, string columns are left as `string` (no pattern refinement). */
  keepStrings?: boolean
}

/**
 * Infers a schema from the first `sampleRows` rows of a table.
 *
 * @param table - Table to sample; its head is collected into a data frame.
 * @param options - Inference options, forwarded to `inferSchemaFromSample`.
 * @returns The inferred schema.
 */
export async function inferSchemaFromTable(
  table: Table,
  options?: InferSchemaOptions,
) {
  const { sampleRows = 100 } = options ?? {}
  const sample = await table.head(sampleRows).collect()

  return inferSchemaFromSample(sample, options)
}

/**
 * Infers a schema from an already collected sample.
 *
 * Each field starts from the type mapped from its polars data type (or the
 * type given in `options.fieldTypes`). String fields without an explicit
 * type are then refined with the patterns from `createRegexMapping`, unless
 * `keepStrings` is set or the sample has no rows.
 *
 * @param sample - Data frame to inspect.
 * @param options - Inference options. `sampleRows` is accepted for
 *   convenience but ignored here, as the sample is already materialized.
 *   (The previous `Exclude<InferSchemaOptions, "sampleRows">` resolved to
 *   this very same type, because `Exclude` filters union members rather than
 *   object keys.)
 * @returns The inferred schema.
 * @throws Error if a name in `options.fieldNames` is not present in the sample.
 */
export function inferSchemaFromSample(
  sample: DataFrame,
  options?: InferSchemaOptions,
) {
  const { confidence = 0.9, fieldTypes, keepStrings } = options ?? {}

  const typeMapping = createTypeMapping()
  const regexMapping = createRegexMapping(options)

  const polarsSchema = getPolarsSchema(sample.schema)
  const fieldNames = options?.fieldNames ?? polarsSchema.fields.map(f => f.name)

  // A pattern is accepted while fewer than `failureThreshold` rows fail to
  // match it. The `|| 1` fallback keeps the threshold from being 0.
  const failureThreshold =
    sample.height - Math.floor(sample.height * confidence) || 1

  // With zero rows every pattern trivially has 0 failures, so the first one
  // ("integer") would always win. Without any data there is no evidence for a
  // narrower type, so string fields must stay strings.
  const hasRows = sample.height > 0

  const schema: Schema = {
    fields: [],
  }

  for (const name of fieldNames) {
    const polarsField = polarsSchema.fields.find(f => f.name === name)
    if (!polarsField) {
      throw new Error(`Field "${name}" not found in the table`)
    }

    // TODO: Remove this workaround once the issue is fixed
    // https://github.com/pola-rs/nodejs-polars/issues/372
    let variant = polarsField.type.variant as string
    if (!typeMapping[variant]) {
      variant = variant.slice(0, -1)
    }

    let type = fieldTypes?.[name] ?? typeMapping[variant] ?? "any"
    if (type === "array" && options?.arrayType === "list") {
      type = "list"
    }

    let field = { name, type }

    if (hasRows && !keepStrings && type === "string" && !fieldTypes?.[name]) {
      // Patterns are tried in declaration order; the first acceptable one wins.
      for (const [regex, patch] of Object.entries(regexMapping)) {
        // Only up to `failureThreshold` failing rows are needed to reject.
        const failures = sample
          .filter(col(name).str.contains(regex).not())
          .head(failureThreshold).height

        if (failures < failureThreshold) {
          field = { ...field, ...patch }
          break
        }
      }
    }

    enhanceField(field, options)
    schema.fields.push(field)
  }

  enhanceSchema(schema, options)
  return schema
}

/**
 * Builds the lookup from polars data type variant names to field types.
 */
function createTypeMapping() {
  const mapping: Record<string, Field["type"]> = {
    Array: "array",
    Bool: "boolean",
    Categorical: "string",
    Date: "date",
    Datetime: "datetime",
    Decimal: "number",
    Float32: "number",
    Float64: "number",
    Int16: "integer",
    Int32: "integer",
    Int64: "integer",
    Int8: "integer",
    List: "array",
    Null: "any",
    Object: "object",
    String: "string",
    Struct: "object",
    Time: "time",
    UInt16: "integer",
    UInt32: "integer",
    UInt64: "integer",
    UInt8: "integer",
    Utf8: "string",
  }

  return mapping
}

/**
 * Builds the ordered pattern table used to refine string fields.
 *
 * Keys are regular expression sources; values are the field properties
 * merged into a field whose values match the pattern. Insertion order
 * matters because the caller stops at the first acceptable pattern.
 *
 * @param options - Only `commaDecimal` and `monthFirst` are read here.
 */
function createRegexMapping(options?: InferSchemaOptions) {
  const { commaDecimal, monthFirst } = options ?? {}

  const mapping: Record<string, Partial<Field>> = {
    // Numeric
    "^\\d+$": { type: "integer" },
    "^\\d{1,3}(,\\d{3})+$": commaDecimal
      ? { type: "number" }
      : { type: "integer", groupChar: "," },
    "^\\d+\\.\\d+$": commaDecimal
      ? { type: "integer", groupChar: "." }
      : { type: "number" },
    "^\\d{1,3}(,\\d{3})+\\.\\d+$": { type: "number", groupChar: "," },
    "^\\d{1,3}(\\.\\d{3})+,\\d+$": {
      type: "number",
      groupChar: ".",
      decimalChar: ",",
    },

    // Boolean
    "^(true|True|TRUE|false|False|FALSE)$": { type: "boolean" },

    // Date
    "^\\d{4}-\\d{2}-\\d{2}$": { type: "date" },
    "^\\d{4}/\\d{2}/\\d{2}$": { type: "date", format: "%Y/%m/%d" },
    "^\\d{2}/\\d{2}/\\d{4}$": monthFirst
      ? { type: "date", format: "%m/%d/%Y" }
      : { type: "date", format: "%d/%m/%Y" },
    "^\\d{2}-\\d{2}-\\d{4}$": monthFirst
      ? { type: "date", format: "%m-%d-%Y" }
      : { type: "date", format: "%d-%m-%Y" },
    "^\\d{2}\\.\\d{2}\\.\\d{4}$": monthFirst
      ? { type: "date", format: "%m.%d.%Y" }
      : { type: "date", format: "%d.%m.%Y" },

    // Time
    "^\\d{2}:\\d{2}:\\d{2}$": { type: "time" },
    "^\\d{2}:\\d{2}$": { type: "time", format: "%H:%M" },
    "^\\d{1,2}:\\d{2}:\\d{2}\\s*(am|pm|AM|PM)$": {
      type: "time",
      format: "%I:%M:%S %p",
    },
    "^\\d{1,2}:\\d{2}\\s*(am|pm|AM|PM)$": { type: "time", format: "%I:%M %p" },
    "^\\d{2}:\\d{2}:\\d{2}[+-]\\d{2}:?\\d{2}$": { type: "time" },

    // Datetime - ISO format
    "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z?$": { type: "datetime" },
    "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}[+-]\\d{2}:?\\d{2}$": {
      type: "datetime",
    },
    "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}$": {
      type: "datetime",
      format: "%Y-%m-%d %H:%M:%S",
    },
    "^\\d{2}/\\d{2}/\\d{4} \\d{2}:\\d{2}$": monthFirst
      ? { type: "datetime", format: "%m/%d/%Y %H:%M" }
      : { type: "datetime", format: "%d/%m/%Y %H:%M" },
    "^\\d{2}/\\d{2}/\\d{4} \\d{2}:\\d{2}:\\d{2}$": monthFirst
      ? { type: "datetime", format: "%m/%d/%Y %H:%M:%S" }
      : { type: "datetime", format: "%d/%m/%Y %H:%M:%S" },

    // Object
    "^\\{": { type: "object" },

    // Array
    "^\\[": { type: "array" },

    // List
    // TODO: Support commaDecimal
    "^\\d+,\\d+$": { type: "list", itemType: "integer" },
    "^[\\d.]+,[\\d.]+$": { type: "list", itemType: "number" },
  }

  return mapping
}

/**
 * Applies type-specific overrides from `options` to a field, in place.
 *
 * For each supported field type, the matching option replaces the field's
 * property when the option is defined; otherwise the existing value is kept.
 *
 * @param field - Field to mutate.
 * @param options - Source of the override values.
 */
function enhanceField(field: Field, options?: InferSchemaOptions) {
  if (field.type === "string") {
    field.format = options?.stringFormat ?? field.format
  } else if (field.type === "integer") {
    field.groupChar = options?.groupChar ?? field.groupChar
    field.bareNumber = options?.bareNumber ?? field.bareNumber
  } else if (field.type === "number") {
    field.decimalChar = options?.decimalChar ?? field.decimalChar
    field.groupChar = options?.groupChar ?? field.groupChar
    field.bareNumber = options?.bareNumber ?? field.bareNumber
  } else if (field.type === "boolean") {
    field.trueValues = options?.trueValues ?? field.trueValues
    field.falseValues = options?.falseValues ?? field.falseValues
  } else if (field.type === "datetime") {
    field.format = options?.datetimeFormat ?? field.format
  } else if (field.type === "date") {
    field.format = options?.dateFormat ?? field.format
  } else if (field.type === "time") {
    field.format = options?.timeFormat ?? field.format
  } else if (field.type === "list") {
    field.delimiter = options?.listDelimiter ?? field.delimiter
    field.itemType = options?.listItemType ?? field.itemType
  } else if (field.type === "geopoint") {
    field.format = options?.geopointFormat ?? field.format
  } else if (field.type === "geojson") {
    field.format = options?.geojsonFormat ?? field.format
  }
}

/**
 * Applies schema-level overrides from `options` to a schema, in place.
 *
 * @param schema - Schema to mutate.
 * @param options - Source of the override values; only `missingValues` is read.
 */
function enhanceSchema(schema: Schema, options?: InferSchemaOptions) {
  schema.missingValues = options?.missingValues ?? schema.missingValues
}