@dpkit/table
Version:
Data Package implementation in TypeScript.
157 lines • 21.8 kB
JavaScript
import { col, lit } from "nodejs-polars";
import { matchField } from "../field/index.js";
import { validateField } from "../field/index.js";
import { validateRows } from "../row/index.js";
import { getPolarsSchema } from "../schema/index.js";
import { normalizeFields } from "./normalize.js";
/**
 * Validates a table against an optional schema.
 *
 * When no schema is supplied the table is not touched and the result is
 * always valid. Otherwise the first `sampleRows` rows are collected, a Polars
 * schema is derived from that sample via `getPolarsSchema`, and the errors
 * from `validateFieldsMatch` followed by those from `validateFields` are
 * returned.
 *
 * @param table - Lazy table; must provide `head(n).collect()`.
 * @param options - Optional settings:
 *   `schema` (validation is skipped when falsy),
 *   `sampleRows` (rows sampled to derive the Polars schema, default 100),
 *   `invalidRowsLimit` (forwarded to `validateFields`, default 100).
 * @returns Promise of `{ errors, valid }`, where `valid` is true only when
 *   `errors` is empty.
 */
export async function validateTable(table, options) {
  const { schema, sampleRows = 100, invalidRowsLimit = 100 } = options ?? {};

  // Nothing to check without a schema.
  if (!schema) {
    return { errors: [], valid: true };
  }

  const sampleFrame = await table.head(sampleRows).collect();
  const polarsSchema = getPolarsSchema(sampleFrame.schema);

  // Structural (field presence) errors come first, then cell/row errors.
  const errors = [
    ...validateFieldsMatch({ schema, polarsSchema }),
    ...(await validateFields(table, schema, polarsSchema, invalidRowsLimit)),
  ];

  return { errors, valid: errors.length === 0 };
}
/**
 * Compares the fields declared in the schema with the fields found in the
 * data (the Polars schema) according to `schema.fieldsMatch`.
 *
 * Modes (default `"exact"`):
 * - `"exact"`:    extra error when the data has more fields than the schema,
 *                 missing error when it has fewer (decided by field counts,
 *                 reported with the name differences).
 * - `"equal"`:    extra error for any data field name not in the schema,
 *                 missing error for any required schema field not in the data.
 * - `"subset"`:   missing error for any required schema field not in the data.
 * - `"superset"`: extra error for any data field name not in the schema.
 * - `"partial"`:  missing error only when none of the schema fields are
 *                 present in the data.
 * Any other mode yields no errors.
 *
 * @param props - `{ schema, polarsSchema }`; both must expose a `fields`
 *   array of objects with a `name`.
 * @returns Array of `{ type: "fields/extra" | "fields/missing", fieldNames }`.
 */
function validateFieldsMatch(props) {
  const { schema, polarsSchema } = props;
  const mode = schema.fieldsMatch ?? "exact";

  const declared = schema.fields;
  const actual = polarsSchema.fields;

  const declaredNames = declared.map(({ name }) => name);
  const actualNames = actual.map(({ name }) => name);
  const requiredNames = declared
    .filter(({ constraints }) => constraints?.required)
    .map(({ name }) => name);

  const extraNames = arrayDiff(actualNames, declaredNames);
  const missingNames = arrayDiff(declaredNames, actualNames);
  const missingRequiredNames = arrayDiff(requiredNames, actualNames);

  const extraError = (fieldNames) => ({ type: "fields/extra", fieldNames });
  const missingError = (fieldNames) => ({ type: "fields/missing", fieldNames });

  const found = [];
  switch (mode) {
    case "exact":
      // Decided by counts; the name differences are only used for reporting.
      if (actual.length > declared.length) {
        found.push(extraError(extraNames));
      }
      if (declared.length > actual.length) {
        found.push(missingError(missingNames));
      }
      break;
    case "equal":
      if (extraNames.length > 0) {
        found.push(extraError(extraNames));
      }
      if (missingRequiredNames.length > 0) {
        found.push(missingError(missingRequiredNames));
      }
      break;
    case "subset":
      if (missingRequiredNames.length > 0) {
        found.push(missingError(missingRequiredNames));
      }
      break;
    case "superset":
      if (extraNames.length > 0) {
        found.push(extraError(extraNames));
      }
      break;
    case "partial":
      // Fails only when every declared field is absent from the data.
      if (missingNames.length === declared.length) {
        found.push(missingError(missingNames));
      }
      break;
    default:
      break;
  }
  return found;
}
/**
 * Runs field-level and row-level validation over the table and returns the
 * errors that were found.
 *
 * A lazy "error table" is built with:
 * - `row_nr` incremented by one (presumably to make row numbers 1-based),
 * - a boolean `error` column initialised to `false`,
 * - one `source:<name>` column per entry of `normalizeFields(..., { dontParse: true })`,
 * - one `target:<name>` column per entry of `normalizeFields(..., { dontParse: false })`.
 *
 * Each schema field that `matchField` resolves is passed to `validateField`,
 * and the whole table to `validateRows`; both return an updated error table
 * plus any errors they report directly. Finally, up to `invalidRowsLimit`
 * rows whose `error` column is true are collected (without the `target:*`
 * columns) and their `error:<type>:<name>` flag columns are decoded into
 * error objects.
 *
 * @param table - Lazy table supporting `withRowCount`, `select`, `filter`,
 *   `head`, `drop` and `collect`.
 * @param schema - Table schema; `schema.fields` is iterated by index.
 * @param polarsSchema - Schema derived from the data, forwarded to the helpers.
 * @param invalidRowsLimit - Maximum number of flagged rows to collect.
 * @returns Promise of an array of error objects.
 */
async function validateFields(table, schema, polarsSchema, invalidRowsLimit) {
  const errors = [];
  const targetNames = [];

  const sources = Object.entries(
    normalizeFields(schema, polarsSchema, { dontParse: true }),
  ).map(([name, expr]) => expr.alias(`source:${name}`));

  const targets = Object.entries(
    normalizeFields(schema, polarsSchema, { dontParse: false }),
  ).map(([name, expr]) => {
    const targetName = `target:${name}`;
    // Remembered so the target columns can be dropped before collecting.
    targetNames.push(targetName);
    return expr.alias(targetName);
  });

  let errorTable = table
    .withRowCount()
    .select(col("row_nr").add(1), lit(false).alias("error"), ...sources, ...targets);

  for (const [index, field] of schema.fields.entries()) {
    const polarsField = matchField(index, field, schema, polarsSchema);
    if (polarsField) {
      const fieldResult = validateField(field, { errorTable, polarsField });
      errorTable = fieldResult.errorTable;
      errors.push(...fieldResult.errors);
    }
  }

  const rowsResult = validateRows(schema, errorTable);
  errorTable = rowsResult.errorTable;
  errors.push(...rowsResult.errors);

  const errorFrame = await errorTable
    .filter(col("error").eq(true))
    .head(invalidRowsLimit)
    .drop(targetNames)
    .collect();

  for (const record of errorFrame.toRecords()) {
    errors.push(...collectRecordErrors(record));
  }
  return errors;
}

/**
 * Decodes the `error:<type>:<name>` flag columns of one collected record
 * into cell-level and row-level error objects.
 *
 * @param record - Plain object for one flagged row; must contain `row_nr`.
 * @returns Array of error objects for this row, in column order.
 */
function collectRecordErrors(record) {
  const recordErrors = [];
  // Fields for which a cell/type error has already been seen in this row.
  const typeErrorInFields = [];
  const rowNumber = record.row_nr;

  for (const [key, value] of Object.entries(record)) {
    if (value !== true) {
      continue;
    }
    // Only the first two colons are separators; the remainder is the field
    // name, which may itself contain colons and must not be truncated.
    const [kind, type, ...nameParts] = key.split(":");
    const name = nameParts.join(":");
    if (kind !== "error" || !type || !name) {
      continue;
    }

    // Cell-level errors
    if (type.startsWith("cell/")) {
      if (!typeErrorInFields.includes(name)) {
        recordErrors.push({
          rowNumber,
          type,
          fieldName: name,
          cell: (record[`source:${name}`] ?? "").toString(),
        });
      }
      // Type error is a terminating error for a cell
      if (type === "cell/type") {
        typeErrorInFields.push(name);
      }
    }

    // Row-level errors
    if (type.startsWith("row/")) {
      recordErrors.push({
        rowNumber,
        type,
        fieldNames: name.split(","),
      });
    }
  }
  return recordErrors;
}
/**
 * Returns the elements of `a` that do not occur in `b`, preserving the order
 * (and any duplicates) of `a`.
 */
function arrayDiff(a, b) {
  const isAbsentFromB = (item) => !b.includes(item);
  return a.filter(isAbsentFromB);
}
//# sourceMappingURL=data:application/json;base64,