@dpkit/table
Version:
Data Package implementation in TypeScript.
211 lines (178 loc) • 5.15 kB
text/typescript
import type { Schema } from "@dpkit/core"
import { col, lit } from "nodejs-polars"
import type { TableError } from "../error/index.ts"
import { matchField } from "../field/index.ts"
import { inspectField } from "../field/index.ts"
import { inspectRows } from "../row/index.ts"
import { getPolarsSchema } from "../schema/index.ts"
import type { PolarsSchema } from "../schema/index.ts"
import type { Table } from "./Table.ts"
import { processFields } from "./process.ts"
export async function inspectTable(
table: Table,
options?: {
schema?: Schema
sampleRows?: number
invalidRowsLimit?: number
},
) {
const { schema, sampleRows = 100, invalidRowsLimit = 100 } = options ?? {}
const errors: TableError[] = []
if (schema) {
const sample = await table.head(sampleRows).collect()
const polarsSchema = getPolarsSchema(sample.schema)
const matchErrors = inspectFieldsMatch({ schema, polarsSchema })
errors.push(...matchErrors)
const fieldErrors = await inspectFields(
table,
schema,
polarsSchema,
invalidRowsLimit,
)
errors.push(...fieldErrors)
}
return errors
}
function inspectFieldsMatch(props: {
schema: Schema
polarsSchema: PolarsSchema
}) {
const { schema, polarsSchema } = props
const errors: TableError[] = []
const fieldsMatch = schema.fieldsMatch ?? "exact"
const fields = schema.fields
const polarsFields = polarsSchema.fields
const names = fields.map(field => field.name)
const polarsNames = polarsFields.map(field => field.name)
const extraFields = polarsFields.length - fields.length
const missingFields = fields.length - polarsFields.length
const extraNames = arrayDiff(polarsNames, names)
const missingNames = arrayDiff(names, polarsNames)
if (fieldsMatch === "exact") {
if (extraFields > 0) {
errors.push({
type: "fields/extra",
fieldNames: extraNames,
})
}
if (missingFields > 0) {
errors.push({
type: "fields/missing",
fieldNames: missingNames,
})
}
}
if (fieldsMatch === "equal") {
if (extraNames.length > 0) {
errors.push({
type: "fields/extra",
fieldNames: extraNames,
})
}
if (missingNames.length > 0) {
errors.push({
type: "fields/missing",
fieldNames: missingNames,
})
}
}
if (fieldsMatch === "subset") {
if (missingNames.length > 0) {
errors.push({
type: "fields/missing",
fieldNames: missingNames,
})
}
}
if (fieldsMatch === "superset") {
if (extraNames.length > 0) {
errors.push({
type: "fields/extra",
fieldNames: extraNames,
})
}
}
if (fieldsMatch === "partial") {
if (missingNames.length === fields.length) {
errors.push({
type: "fields/missing",
fieldNames: missingNames,
})
}
}
return errors
}
async function inspectFields(
table: Table,
schema: Schema,
polarsSchema: PolarsSchema,
invalidRowsLimit: number,
) {
const errors: TableError[] = []
const targetNames: string[] = []
const sources = Object.entries(
processFields(schema, polarsSchema, { dontParse: true }),
).map(([name, expr]) => {
return expr.alias(`source:${name}`)
})
const targets = Object.entries(
processFields(schema, polarsSchema, { dontParse: false }),
).map(([name, expr]) => {
const targetName = `target:${name}`
targetNames.push(targetName)
return expr.alias(targetName)
})
let errorTable = table
.withRowCount()
.select([
col("row_nr").add(1),
lit(false).alias("error"),
...sources,
...targets,
])
for (const [index, field] of schema.fields.entries()) {
const polarsField = matchField(index, field, schema, polarsSchema)
if (polarsField) {
const fieldResult = inspectField(field, { errorTable, polarsField })
errorTable = fieldResult.errorTable
errors.push(...fieldResult.errors)
}
}
const rowsResult = inspectRows(schema, errorTable)
errorTable = rowsResult.errorTable
errors.push(...rowsResult.errors)
const errorFrame = await errorTable
.filter(col("error").eq(true))
.head(invalidRowsLimit)
.drop(targetNames)
.collect()
for (const record of errorFrame.toRecords() as any[]) {
for (const [key, value] of Object.entries(record)) {
const [kind, type, name] = key.split(":")
if (kind === "error" && value === true && type && name) {
const rowNumber = record.row_nr
// Cell-level errors
if (type.startsWith("cell/")) {
errors.push({
rowNumber,
type: type as any,
fieldName: name as any,
cell: (record[`source:${name}`] ?? "").toString(),
})
}
// Row-level errors
if (type.startsWith("row/")) {
errors.push({
rowNumber,
type: type as any,
fieldNames: name.split(","),
})
}
}
}
}
return errors
}
function arrayDiff(a: string[], b: string[]) {
return a.filter(x => !b.includes(x))
}