UNPKG

mongoku

Version:

[![CI](https://github.com/huggingface/Mongoku/actions/workflows/ci.yml/badge.svg)](https://github.com/huggingface/Mongoku/actions/workflows/ci.yml)

500 lines (465 loc) 16.4 kB
import type { MongoClientWithMappings } from "$lib/server/mongo";
import { ReadPreference } from "mongodb";
import { z } from "zod";

/** Validator configuration of a single collection, as reported by `listCollections`. */
export interface CollectionSchemaInfo {
	/** True when a non-empty validator document is configured. */
	hasSchema: boolean;
	validator: Record<string, unknown> | null;
	validationLevel: string | null;
	validationAction: string | null;
}

/** Outcome of {@link auditSchemaCompliance}. */
export interface SchemaAuditResult {
	nrecords: number;
	nInvalidDocuments: number;
	nValidDocuments: number;
	compliancePct: number;
	errors: Array<{
		message: string;
		docId?: unknown;
		/** The full document that failed validation (encoded via JsonEncoder) */
		document?: unknown;
	}>;
	warnings: string[];
	hasSchema: boolean;
	tookMs: number;
}

/**
 * Fetch the JSON Schema validator configuration for a collection.
 *
 * @returns An all-`null` / `hasSchema: false` record when the collection is not
 * listed. Otherwise the configured validator, with `validationLevel` and
 * `validationAction` defaulting to `"strict"` and `"error"` when the server
 * does not report them.
 */
export async function getCollectionSchema(
	client: MongoClientWithMappings,
	dbName: string,
	colName: string,
): Promise<CollectionSchemaInfo> {
	const db = client.db(dbName);
	const collections = await db.listCollections({ name: colName }, { nameOnly: false }).toArray();
	const colInfo = collections[0];

	if (!colInfo) {
		return { hasSchema: false, validator: null, validationLevel: null, validationAction: null };
	}

	const options = (colInfo as { options?: Record<string, unknown> }).options ?? {};
	const validator = (options.validator as Record<string, unknown>) ?? null;
	const validationLevel = (options.validationLevel as string) ?? "strict";
	const validationAction = (options.validationAction as string) ?? "error";

	return {
		hasSchema: !!validator && Object.keys(validator).length > 0,
		validator,
		validationLevel,
		validationAction,
	};
}

/**
 * Extract the inner JSON Schema object from a MongoDB validator document.
 *
 * Validators are typically `{ $jsonSchema: { ... } }`, but the schema may also
 * sit inside an `$and` list next to other operators; the first `$jsonSchema`
 * found (depth-first) is returned. Other combinators such as `$or` are NOT
 * unwrapped: a document failing one `$or` branch may still be valid, so
 * auditing against that branch alone would be misleading.
 *
 * @returns The schema object, or `null` when none can be located.
 */
function extractJsonSchema(validator: Record<string, unknown>): Record<string, unknown> | null {
	// Direct $jsonSchema (most common)
	const direct = validator.$jsonSchema;
	if (direct && typeof direct === "object") {
		return direct as Record<string, unknown>;
	}

	// $and: [{ $jsonSchema: ... }, ...]
	if (Array.isArray(validator.$and)) {
		for (const clause of validator.$and) {
			// Skip malformed clauses instead of dereferencing null / primitives.
			if (typeof clause !== "object" || clause === null) {
				continue;
			}
			const extracted = extractJsonSchema(clause as Record<string, unknown>);
			if (extracted) {
				return extracted;
			}
		}
	}

	return null;
}

/** Standard JSON Schema type names that zod's fromJSONSchema supports. */
const STANDARD_TYPES = new Set(["string", "number", "integer", "boolean", "object", "array", "null"]);

/**
 * MongoDB bsonType aliases that translate 1:1 to a standard JSON Schema type.
 * Kept in a Map (not a plain object) so names such as "constructor" cannot
 * resolve to inherited prototype members.
 */
const BSON_TYPE_MAP: ReadonlyMap<string, string> = new Map([
	["int", "integer"],
	["double", "number"],
	["bool", "boolean"],
]);

/** Schema for an EJSON wrapper object `{ [key]: "<string>" }` as produced by `normalizeBsonValue()`. */
function ejsonWrapperSchema(key: string): Record<string, unknown> {
	return { type: "object", required: [key], properties: { [key]: { type: "string" } } };
}

/**
 * Translate one bsonType alias into the standard JSON Schema alternatives that
 * a value normalised by `normalizeBsonValue()` may take.
 *
 * `decimal`, `long` and `number` accept both a plain JS number and the EJSON
 * wrapper, because `normalizeBsonValue()` rewrites Decimal128 / Long instances
 * into `{ $numberDecimal }` / `{ $numberLong }` objects.
 *
 * @returns `null` when the alias cannot be represented (binData, regex,
 * timestamp, non-string entries, ...).
 */
function bsonTypeAlternatives(bsonType: unknown): Array<Record<string, unknown>> | null {
	if (typeof bsonType !== "string") {
		return null;
	}
	switch (bsonType) {
		case "objectId":
			return [ejsonWrapperSchema("$oid")];
		case "date":
			return [ejsonWrapperSchema("$date")];
		case "decimal":
			return [{ type: "number" }, ejsonWrapperSchema("$numberDecimal")];
		case "long":
			return [{ type: "integer" }, ejsonWrapperSchema("$numberLong")];
		case "number":
			return [{ type: "number" }, ejsonWrapperSchema("$numberDecimal"), ejsonWrapperSchema("$numberLong")];
		default: {
			const standard = BSON_TYPE_MAP.get(bsonType) ?? (STANDARD_TYPES.has(bsonType) ? bsonType : null);
			return standard === null ? null : [{ type: standard }];
		}
	}
}

/**
 * JSON Schema keywords that, when present, give the schema actual semantics.
 * If none of these are set, the schema is `z.any()`-equivalent under zod's
 * `fromJSONSchema`, which means it silently accepts `undefined` even when
 * the parent's `required` list includes the property.
 */
const CONSTRAINT_KEYWORDS = [
	"type",
	"bsonType",
	"enum",
	"const",
	"anyOf",
	"oneOf",
	"allOf",
	"$ref",
	"properties",
	"patternProperties",
	"additionalProperties",
	"items",
	"prefixItems",
	"additionalItems",
	"not",
	"required",
	"propertyNames",
];

/** True when `schema` is an object carrying none of the {@link CONSTRAINT_KEYWORDS}. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function isEffectivelyAnySchema(schema: any): boolean {
	if (typeof schema !== "object" || schema === null) {
		return false;
	}
	return !CONSTRAINT_KEYWORDS.some((k) => schema[k] !== undefined);
}

/**
 * Replacement for empty `{}` sub-schemas under a required property: a union
 * of every JSON-representable type. This lets the value be anything (mirroring
 * the original "any" intent) while still rejecting `undefined`, so the parent
 * object's `required` check fires when the field is missing.
 */
const ANY_NON_UNDEFINED_SCHEMA = {
	anyOf: [
		{ type: "string" },
		{ type: "number" },
		{ type: "boolean" },
		{ type: "object" },
		{ type: "array" },
		{ type: "null" },
	],
};

/**
 * Convert a MongoDB $jsonSchema (which uses `bsonType` instead of `type`) into
 * standard JSON Schema Draft-07 that zod's `fromJSONSchema` can enforce.
 *
 * MongoDB-specific types are mapped to EJSON wrapper shapes so zod can
 * perform precise structural validation:
 *   bsonType: "objectId" → { type: "object", required: ["$oid"], properties: { $oid: { type: "string" } } }
 *   bsonType: "date"     → { type: "object", required: ["$date"], properties: { $date: { type: "string" } } }
 *   bsonType: "decimal"  → number OR { $numberDecimal: string }
 *   bsonType: "long"     → integer OR { $numberLong: string }
 *
 * Documents are likewise normalized via `normalizeBsonValue()` so ObjectId
 * instances become `{ $oid: "hex" }` and Date instances become `{ $date: "ISO" }`
 * before validation.
 *
 * Supports bsonType as a string or array (e.g., `["string", "null"]` for nullable).
 * If ANY listed type cannot be represented (binData, regex, ...), the whole
 * type constraint is dropped: keeping only the representable members would
 * reject values MongoDB itself accepts.
 *
 * The input is never mutated; a converted shallow copy is returned.
 * Non-object inputs are returned unchanged.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function bsonSchemaToStandard(schema: any): any {
	if (typeof schema !== "object" || schema === null) {
		return schema;
	}

	const out: Record<string, unknown> = { ...schema };

	if (out.bsonType) {
		const bsonTypeVal = out.bsonType;
		delete out.bsonType;

		// Handle array of types (nullable fields): bsonType: ["string", "null"]
		const types: unknown[] = Array.isArray(bsonTypeVal) ? bsonTypeVal : [bsonTypeVal];

		const alternatives: Array<Record<string, unknown>> = [];
		let representable = types.length > 0;
		for (const t of types) {
			const alts = bsonTypeAlternatives(t);
			if (alts === null) {
				// One unknown member makes the whole union unenforceable.
				representable = false;
				break;
			}
			alternatives.push(...alts);
		}

		if (representable) {
			const [only] = alternatives;
			if (alternatives.length === 1 && only !== undefined) {
				if ("required" in only) {
					// Single EJSON wrapper (objectId / date): a leaf shape, nothing
					// further to recurse into.
					Object.assign(out, only);
					return out;
				}
				out.type = only.type;
			} else {
				// Multiple alternatives — use anyOf. Don't return early; still need
				// to recurse into properties/items/etc. for any object types in the
				// union. An anyOf authored in the schema is preserved by moving it
				// into allOf so both constraints keep applying.
				if (Array.isArray(out.anyOf)) {
					const existingAllOf = Array.isArray(out.allOf) ? (out.allOf as unknown[]) : [];
					out.allOf = [...existingAllOf, { anyOf: out.anyOf }];
				}
				out.anyOf = alternatives;
			}
		}
		// Otherwise (binData, regex, timestamp, etc.) — no type constraint is
		// emitted, so zod uses z.any()
	}

	if (out.properties) {
		const requiredKeys = new Set(Array.isArray(out.required) ? (out.required as string[]) : []);
		out.properties = Object.fromEntries(
			Object.entries(out.properties as Record<string, unknown>).map(([k, v]) => {
				const converted = bsonSchemaToStandard(v);
				// If a required property has no real constraints, zod's
				// fromJSONSchema collapses it to `z.any()` — which silently
				// accepts `undefined` and defeats the `required` check.
				// Replace with an explicit any-of-any-non-undefined union.
				if (requiredKeys.has(k) && isEffectivelyAnySchema(converted)) {
					return [k, ANY_NON_UNDEFINED_SCHEMA];
				}
				return [k, converted];
			}),
		);
	}

	// Same nested locations that schemaUsesPreciseBsonTypes() walks.
	if (out.patternProperties && typeof out.patternProperties === "object") {
		out.patternProperties = Object.fromEntries(
			Object.entries(out.patternProperties as Record<string, unknown>).map(([pattern, sub]) => [
				pattern,
				bsonSchemaToStandard(sub),
			]),
		);
	}

	if (out.additionalProperties && typeof out.additionalProperties === "object") {
		out.additionalProperties = bsonSchemaToStandard(out.additionalProperties);
	}

	if (out.additionalItems && typeof out.additionalItems === "object") {
		out.additionalItems = bsonSchemaToStandard(out.additionalItems);
	}

	if (Array.isArray(out.oneOf)) {
		out.oneOf = (out.oneOf as Array<unknown>).map((v) => bsonSchemaToStandard(v));
	}

	if (Array.isArray(out.anyOf)) {
		out.anyOf = (out.anyOf as Array<unknown>).map((v) => bsonSchemaToStandard(v));
	}

	if (Array.isArray(out.allOf)) {
		out.allOf = (out.allOf as Array<unknown>).map((v) => bsonSchemaToStandard(v));
	}

	if (out.items) {
		out.items = Array.isArray(out.items)
			? (out.items as Array<unknown>).map((v) => bsonSchemaToStandard(v))
			: bsonSchemaToStandard(out.items);
	}

	return out;
}

/**
 * Recursively normalise BSON types in a document to their EJSON wrapper
 * representations so zod can structurally validate them against a schema
 * that has been converted to expect those wrappers.
 *
 *   ObjectId   → { $oid: "...hex..." }
 *   Date       → { $date: "...ISO..." }
 *   Decimal128 → { $numberDecimal: "...string..." }
 *   Long       → { $numberLong: "...string..." }
 *
 * Primitives, `null` and `undefined` pass through untouched; arrays and other
 * objects are copied with their members normalised.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function normalizeBsonValue(value: any): any {
	if (value === null || value === undefined) {
		return value;
	}
	if (typeof value !== "object") {
		return value;
	}

	if (value instanceof Date) {
		return { $date: value.toISOString() };
	}

	// BSON classes are recognised by constructor name plus the method used to
	// serialise them.
	// ObjectId
	if (value.constructor?.name === "ObjectId" && typeof value.toHexString === "function") {
		return { $oid: value.toHexString() };
	}

	// Decimal128
	if (value.constructor?.name === "Decimal128" && typeof value.toString === "function") {
		return { $numberDecimal: value.toString() };
	}

	// Long
	if (value.constructor?.name === "Long" && typeof value.toString === "function") {
		return { $numberLong: value.toString() };
	}

	if (Array.isArray(value)) {
		return value.map(normalizeBsonValue);
	}

	const out: Record<string, unknown> = {};
	for (const [k, v] of Object.entries(value)) {
		out[k] = normalizeBsonValue(v);
	}
	return out;
}

/**
 * BSON numeric types that have no faithful representation in JavaScript:
 * MongoDB distinguishes int/long/double/decimal at the storage layer, but
 * when a document is returned through the Node driver they all surface as
 * `number` (or wrapped Long/Decimal128 for ranges that don't fit).
 *
 * If a validator constrains a field to one of these, our zod-based audit
 * cannot reliably detect violations — so we use this to produce a more
 * helpful fallback message when we know specific feedback isn't possible.
 */
const PRECISE_BSON_NUMERIC_TYPES = new Set(["int", "long", "double", "decimal"]);

/** Walk the schema tree and check whether any field uses a precise BSON numeric type. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function schemaUsesPreciseBsonTypes(schema: any): boolean {
	if (typeof schema !== "object" || schema === null) {
		return false;
	}

	if (schema.bsonType) {
		const types = Array.isArray(schema.bsonType) ? schema.bsonType : [schema.bsonType];
		if (types.some((t: unknown) => typeof t === "string" && PRECISE_BSON_NUMERIC_TYPES.has(t))) {
			return true;
		}
	}

	// Keyword → map of sub-schemas
	for (const key of ["properties", "patternProperties"]) {
		const props = schema[key];
		if (props && typeof props === "object") {
			for (const v of Object.values(props)) {
				if (schemaUsesPreciseBsonTypes(v)) {
					return true;
				}
			}
		}
	}

	// Keyword → single sub-schema (an `items` array is also reached here via
	// its numeric keys)
	for (const key of ["items", "additionalProperties", "additionalItems"]) {
		const v = schema[key];
		if (v && typeof v === "object" && schemaUsesPreciseBsonTypes(v)) {
			return true;
		}
	}

	// Keyword → list of sub-schemas
	for (const key of ["oneOf", "anyOf", "allOf"]) {
		const arr = schema[key];
		if (Array.isArray(arr) && arr.some((v) => schemaUsesPreciseBsonTypes(v))) {
			return true;
		}
	}

	return false;
}

/** Message reported when zod cannot be used to explain a failure. */
const ZOD_UNAVAILABLE_MESSAGE = "document does not match schema (could not parse schema with zod)";

/**
 * Build a reusable per-document validator from a MongoDB `$jsonSchema`.
 *
 * The conversion + `z.fromJSONSchema` compilation is non-trivial for nested
 * schemas, so we do it once per audit run and reuse the resulting closure
 * for every sampled document.
 *
 * Returns a function that, given a raw MongoDB document, returns an array of
 * human-readable failure messages (empty if the doc matches the schema, or a
 * single fallback message if the validator itself couldn't be built or threw
 * while validating).
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function buildDocumentValidator(schema: Record<string, unknown>): (doc: any) => string[] {
	let validator: ReturnType<typeof z.fromJSONSchema> | null = null;
	try {
		const standardSchema = bsonSchemaToStandard(schema);
		validator = z.fromJSONSchema(standardSchema);
	} catch {
		// Conversion / compilation failed — every doc will get the fallback.
		validator = null;
	}

	// eslint-disable-next-line @typescript-eslint/no-explicit-any
	return (doc: any): string[] => {
		if (!validator) {
			return [ZOD_UNAVAILABLE_MESSAGE];
		}
		try {
			const normalized = normalizeBsonValue(doc);
			const result = validator.safeParse(normalized);
			if (result.success) {
				return [];
			}
			return [z.prettifyError(result.error)];
		} catch {
			return [ZOD_UNAVAILABLE_MESSAGE];
		}
	};
}

/** Maximum number of non-matching documents returned as examples by an audit. */
const AUDIT_SAMPLE_LIMIT = 20;

/**
 * Audit schema compliance for a collection.
 *
 * Uses aggregation with the `$jsonSchema` operator rather than `db.validate()`
 * because validate() does not reliably return `nInvalidDocuments` counts in
 * MongoDB 8.x (always returns 0, only logs a warning to the server log).
 *
 * Only the `$jsonSchema` part of the validator is audited; other operators
 * combined with it are not evaluated.
 *
 * @param opts.readPreference Forwarded to the count and aggregation commands.
 * @param opts.maxTimeMS Forwarded to the count and aggregation commands when truthy.
 * @returns Counts, compliance percentage and up to {@link AUDIT_SAMPLE_LIMIT}
 * sampled failures with an explanatory message each. When the collection has
 * no validator, or no `$jsonSchema` can be extracted from it, a zeroed result
 * is returned (with a warning in the latter case). When counting the
 * non-matching documents fails, a warning is returned and the invalid count
 * is reported as 0.
 * @throws Whatever `countDocuments` or the sampling aggregation throws; only
 * the counting aggregation is best-effort.
 */
export async function auditSchemaCompliance(
	client: MongoClientWithMappings,
	dbName: string,
	colName: string,
	opts?: {
		readPreference?: ReadPreference;
		maxTimeMS?: number;
	},
): Promise<SchemaAuditResult> {
	const coll = client.db(dbName).collection(colName);
	const schemaInfo = await getCollectionSchema(client, dbName, colName);

	if (!schemaInfo.hasSchema || !schemaInfo.validator) {
		return {
			nrecords: 0,
			nInvalidDocuments: 0,
			nValidDocuments: 0,
			compliancePct: 100,
			errors: [],
			warnings: [],
			hasSchema: false,
			tookMs: 0,
		};
	}

	const jsonSchema = extractJsonSchema(schemaInfo.validator);
	if (!jsonSchema) {
		return {
			nrecords: 0,
			nInvalidDocuments: 0,
			nValidDocuments: 0,
			compliancePct: 100,
			errors: [],
			warnings: [
				"Validator is present but could not extract a $jsonSchema for auditing — validator may use non-schema operators",
			],
			hasSchema: true,
			tookMs: 0,
		};
	}

	const start = performance.now();

	const aggOptions: Record<string, unknown> = {};
	if (opts?.readPreference) {
		aggOptions.readPreference = opts.readPreference;
	}
	if (opts?.maxTimeMS) {
		aggOptions.maxTimeMS = opts.maxTimeMS;
	}

	const total = await coll.countDocuments({}, aggOptions);

	// $nor + $jsonSchema identifies docs that don't conform.
	const nonMatchingStage = { $match: { $nor: [{ $jsonSchema: jsonSchema }] } };

	// Count non-matching documents (best-effort: a failure becomes a warning).
	let nInvalidDocuments: number;
	try {
		const counted = await coll.aggregate([nonMatchingStage, { $count: "c" }], aggOptions).next();
		// $count emits no document at all when nothing matched.
		nInvalidDocuments = (counted as { c: number } | null)?.c ?? 0;
	} catch {
		return {
			nrecords: total,
			nInvalidDocuments: 0,
			nValidDocuments: total,
			compliancePct: 100,
			errors: [],
			warnings: ["Unable to count non-matching documents (aggregation failed)"],
			hasSchema: true,
			tookMs: Math.round(performance.now() - start),
		};
	}

	const nValidDocuments = total - nInvalidDocuments;
	const compliancePct = total > 0 ? (nValidDocuments * 100) / total : 100;

	// Sample non-matching documents (up to AUDIT_SAMPLE_LIMIT)
	const sampleDocs = await coll.aggregate([nonMatchingStage, { $limit: AUDIT_SAMPLE_LIMIT }], aggOptions).toArray();

	// Compare each non-matching doc against individual $jsonSchema constraints
	// to produce specific error messages rather than a generic "does not match".
	// When zod can't pinpoint the failure (returns no issues), it usually means
	// the violation is a BSON-specific type distinction that isn't visible from
	// the JS value alone (e.g. `bsonType: "double"` where the doc has an int).
	const fallbackMessage = schemaUsesPreciseBsonTypes(jsonSchema)
		? "The validator uses BSON-specific numeric types (int/long/double/decimal) which cannot be distinguished from a JavaScript value alone — try inspecting the document directly in MongoDB."
		: "Failed to detect validation error";

	// Build the per-document validator once: the schema is identical across
	// every sample, so there's no need to re-run bsonSchemaToStandard /
	// z.fromJSONSchema for each doc.
	const validateOne = buildDocumentValidator(jsonSchema);

	const errors: SchemaAuditResult["errors"] = sampleDocs.map((doc) => {
		const failures = validateOne(doc);
		return {
			message: failures.length > 0 ? failures.join("; ") : fallbackMessage,
			docId: doc._id,
			document: doc,
		};
	});

	const tookMs = Math.round(performance.now() - start);

	return {
		nrecords: total,
		nInvalidDocuments,
		nValidDocuments,
		compliancePct,
		errors,
		warnings: [],
		hasSchema: true,
		tookMs,
	};
}