UNPKG

@dpkit/table

Version:

Data Package implementation in TypeScript.

151 lines 19.9 kB
import { col, lit } from "nodejs-polars";
import { inspectField, matchField } from "../field/index.js";
import { inspectRows } from "../row/index.js";
import { getPolarsSchema } from "../schema/index.js";
import { processFields } from "./process.js";

/**
 * Inspect a table against a schema and collect the validation errors found.
 *
 * When no schema is supplied nothing is inspected and an empty array is
 * returned.
 *
 * @param table - Table to inspect. It must support `head(n).collect()` and
 *   `withRowCount().select(...)`, which is how it is used in this module.
 * @param [options] - Optional settings; `null`/`undefined` means "no options".
 * @param [options.schema] - Schema to validate against.
 * @param [options.sampleRows=100] - Number of leading rows collected to
 *   derive the Polars schema of the table.
 * @param [options.invalidRowsLimit=100] - Maximum number of error-flagged
 *   rows that are collected and turned into row/cell errors.
 * @returns Promise of the list of errors: field-match errors first, then the
 *   errors produced by the field and row inspection.
 */
export async function inspectTable(table, options) {
  const { schema, sampleRows = 100, invalidRowsLimit = 100 } = options ?? {};
  const errors = [];

  if (!schema) {
    return errors;
  }

  // Only a sample is materialized here; it is used solely for its schema.
  const sample = await table.head(sampleRows).collect();
  const polarsSchema = getPolarsSchema(sample.schema);

  errors.push(...inspectFieldsMatch({ schema, polarsSchema }));

  const fieldErrors = await inspectFields(
    table,
    schema,
    polarsSchema,
    invalidRowsLimit,
  );
  errors.push(...fieldErrors);

  return errors;
}

/**
 * Compare the fields declared in the schema with the fields present in the
 * data, according to `schema.fieldsMatch` (defaults to `"exact"`).
 *
 * - `"exact"`: only the field COUNTS are compared; when the counts differ the
 *   reported names are still the name-based differences.
 * - `"equal"`: reports data-only names as extra and schema-only names as
 *   missing.
 * - `"subset"`: reports schema-only names as missing.
 * - `"superset"`: reports data-only names as extra.
 * - `"partial"`: reports missing fields only when none of the schema fields
 *   is present in the data.
 *
 * Any other value produces no errors.
 *
 * @param props - `{ schema, polarsSchema }`; both expose a `fields` array of
 *   objects with a `name`.
 * @returns Array of `fields/extra` / `fields/missing` errors.
 */
function inspectFieldsMatch(props) {
  const { schema, polarsSchema } = props;
  const fieldsMatch = schema.fieldsMatch ?? "exact";

  const fields = schema.fields;
  const polarsFields = polarsSchema.fields;

  const names = fields.map((field) => field.name);
  const polarsNames = polarsFields.map((field) => field.name);

  const extraNames = arrayDiff(polarsNames, names);
  const missingNames = arrayDiff(names, polarsNames);

  const errors = [];
  const reportExtra = () => {
    errors.push({ type: "fields/extra", fieldNames: extraNames });
  };
  const reportMissing = () => {
    errors.push({ type: "fields/missing", fieldNames: missingNames });
  };

  switch (fieldsMatch) {
    case "exact":
      if (polarsFields.length > fields.length) {
        reportExtra();
      }
      if (fields.length > polarsFields.length) {
        reportMissing();
      }
      break;

    case "equal":
      if (extraNames.length > 0) {
        reportExtra();
      }
      if (missingNames.length > 0) {
        reportMissing();
      }
      break;

    case "subset":
      if (missingNames.length > 0) {
        reportMissing();
      }
      break;

    case "superset":
      if (extraNames.length > 0) {
        reportExtra();
      }
      break;

    case "partial":
      // Every schema field is missing, i.e. there is no overlap at all.
      if (missingNames.length === fields.length) {
        reportMissing();
      }
      break;

    default:
      break;
  }

  return errors;
}

/**
 * Run the per-field and per-row inspections and convert the flagged rows
 * into error objects.
 *
 * @param table - Table to inspect.
 * @param schema - Schema whose `fields` are inspected.
 * @param polarsSchema - Polars schema derived from the data sample.
 * @param invalidRowsLimit - Maximum number of error-flagged rows to collect.
 *   This limits rows, not errors: one row may yield several errors.
 * @returns Promise of the errors returned by `inspectField` / `inspectRows`,
 *   followed by the cell- and row-level errors read from the flagged rows.
 */
async function inspectFields(table, schema, polarsSchema, invalidRowsLimit) {
  const errors = [];

  // Unparsed expressions, kept so the offending cell can be reported.
  const sources = Object.entries(
    processFields(schema, polarsSchema, { dontParse: true }),
  ).map(([name, expr]) => expr.alias(`source:${name}`));

  // Parsed expressions; these columns are dropped again before collecting.
  const targetEntries = Object.entries(
    processFields(schema, polarsSchema, { dontParse: false }),
  );
  const targetNames = targetEntries.map(([name]) => `target:${name}`);
  const targets = targetEntries.map(([, expr], index) =>
    expr.alias(targetNames[index]),
  );

  let errorTable = table.withRowCount().select([
    // NOTE(review): presumably `row_nr` is zero-based and `+ 1` makes the
    // reported row numbers one-based - confirm against nodejs-polars.
    col("row_nr").add(1),
    // Row-level flag used by the filter below; it starts out as `false`.
    lit(false).alias("error"),
    ...sources,
    ...targets,
  ]);

  for (const [index, field] of schema.fields.entries()) {
    const polarsField = matchField(index, field, schema, polarsSchema);
    if (!polarsField) {
      continue;
    }

    const fieldResult = inspectField(field, { errorTable, polarsField });
    errorTable = fieldResult.errorTable;
    errors.push(...fieldResult.errors);
  }

  const rowsResult = inspectRows(schema, errorTable);
  errorTable = rowsResult.errorTable;
  errors.push(...rowsResult.errors);

  const errorFrame = await errorTable
    .filter(col("error").eq(true))
    .head(invalidRowsLimit)
    .drop(targetNames)
    .collect();

  for (const record of errorFrame.toRecords()) {
    errors.push(...collectRecordErrors(record));
  }

  return errors;
}

/**
 * Split an `error:<type>:<name>` column key into its type and name.
 *
 * Only the first two colons act as separators, so a name that itself
 * contains `:` is returned intact instead of being cut at its first colon.
 *
 * @param {string} key - Column name taken from a collected record.
 * @returns {{ type: string, name: string } | undefined} The parsed parts, or
 *   `undefined` when the key is not an error column with a non-empty type
 *   and name (for example the plain `error` flag column).
 */
function parseErrorColumn(key) {
  const prefix = "error:";
  if (!key.startsWith(prefix)) {
    return undefined;
  }

  const rest = key.slice(prefix.length);
  const separator = rest.indexOf(":");
  if (separator <= 0) {
    return undefined;
  }

  const type = rest.slice(0, separator);
  const name = rest.slice(separator + 1);
  return name ? { type, name } : undefined;
}

/**
 * Build the cell- and row-level errors flagged on one collected record.
 *
 * @param {object} record - One record of the collected error frame; expected
 *   to carry `row_nr`, `source:<name>` values and boolean `error:<type>:<name>`
 *   flags.
 * @returns {object[]} Errors for every flag that is strictly `true`.
 */
function collectRecordErrors(record) {
  const errors = [];

  for (const [key, value] of Object.entries(record)) {
    if (value !== true) {
      continue;
    }

    const parsed = parseErrorColumn(key);
    if (!parsed) {
      continue;
    }

    const { type, name } = parsed;
    const rowNumber = record.row_nr;

    // Cell-level errors
    if (type.startsWith("cell/")) {
      errors.push({
        rowNumber,
        type,
        fieldName: name,
        cell: String(record[`source:${name}`] ?? ""),
      });
    }

    // Row-level errors
    if (type.startsWith("row/")) {
      errors.push({
        rowNumber,
        type,
        // The name part of a row-level key is a comma-separated field list.
        fieldNames: name.split(","),
      });
    }
  }

  return errors;
}

/**
 * Return the items of `a` that do not occur in `b`, preserving their order.
 *
 * @param {string[]} a - Items to keep from.
 * @param {string[]} b - Items to exclude.
 * @returns {string[]} New array; neither input is modified.
 */
function arrayDiff(a, b) {
  const excluded = new Set(b);
  return a.filter((x) => !excluded.has(x));
}

// NOTE: the compiler-emitted inline source map was removed from this file;
// its mappings described the previous single-line layout of the output.