UNPKG

mongodb-schema

Version:

Infer the probabilistic schema for a MongoDB collection.

361 lines 13.4 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.getCompletedSchemaAnalyzer = exports.verifyStreamSource = exports.SchemaAnalyzer = void 0; const reservoir_1 = __importDefault(require("reservoir")); const bson_1 = require("bson"); const semantic_types_1 = __importDefault(require("./semantic-types")); const util_1 = require("./util"); const defaultSchemaParseOptions = { semanticTypes: false, storeValues: true, storedValuesLengthLimit: 10000 }; function extractStringValueFromBSON(value) { if (value === null || value === void 0 ? void 0 : value._bsontype) { if (['Decimal128', 'Long'].includes(value._bsontype)) { return value.toString(); } if (['Double', 'Int32'].includes(value._bsontype)) { return String(value.value); } } if (typeof value === 'string') { return value; } return String(value); } function fieldComparator(a, b) { const aName = a.name; const bName = b.name; if (aName === '_id') { return -1; } if (bName === '_id') { return 1; } return aName.toLowerCase() < bName.toLowerCase() ? -1 : 1; } function getBSONType(value) { const bsonType = (value === null || value === void 0 ? void 0 : value._bsontype) ? value._bsontype : Object.prototype.toString.call(value).replace(/^\[object (\w+)\]$/, '$1'); if (bsonType === 'Object') { return 'Document'; } if (bsonType === 'Code' && value.scope) { return 'CodeWScope'; } return bsonType; } function isNullType(type) { return type.name === 'Null'; } function isArrayType(type) { return type.name === 'Array'; } function isDocumentType(type) { return type.name === 'Document'; } function schemaToPaths(fields, parent = []) { const paths = []; const sortedFields = Object.values(fields).sort(fieldComparator); for (const field of sortedFields) { const path = [...parent, field.name]; paths.push(path); const doc = Object.values(field.types).find((f) => f.bsonType === 'Document'); if (doc) { paths.push(...schemaToPaths(doc.fields, path)); } const array = Object.values(field.types).find((f) => f.bsonType === 'Array'); if (array) { const arrayDoc = Object.values(array.types).find((f) => f.bsonType === 'Document'); if (arrayDoc) { paths.push(...schemaToPaths(arrayDoc.fields, path)); } } } return paths; } function simplifiedSchema(fields) { function finalizeSchemaFieldTypes(types) { return Object.values(types).sort((a, b) => { return b.count - a.count; }).map((type) => { return { bsonType: type.bsonType, ...(isArrayType(type) ? { types: finalizeSchemaFieldTypes(type.types) } : {}), ...(isDocumentType(type) ? { fields: finalizeDocumentFieldSchema(type.fields) } : {}) }; }); } function finalizeDocumentFieldSchema(fieldMap) { const fieldSchema = Object.create(null); Object.values(fieldMap).forEach((field) => { const fieldTypes = finalizeSchemaFieldTypes(field.types); fieldSchema[field.name] = { types: fieldTypes }; }); return fieldSchema; } return finalizeDocumentFieldSchema(fields); } function cropString(value, limit) { if (limit < 1) return ''; return value.charCodeAt(limit - 1) === value.codePointAt(limit - 1) ? value.slice(0, limit) : value.slice(0, limit - 1); } function getCappedValue(bsonType, value, limit) { if (bsonType === 'String') { return cropString(value, limit); } if (bsonType === 'Binary') { value = value; return value.buffer.length > limit ? new bson_1.Binary(value.buffer.slice(0, limit), value.sub_type) : value; } if (bsonType === 'Code') { value = value; return (value.code.length >= limit) ? new bson_1.Code(cropString(value.code, limit), value.scope) : value; } return value; } function computeHasDuplicatesForType(type, unique) { if (isNullType(type)) { return type.count > 0; } if (!type.values) { return undefined; } return unique !== type.values.length; } function computeUniqueForType(type) { if (isNullType(type)) { return type.count === 0 ? 0 : 1; } if (!type.values) { return undefined; } return new Set(type.values.map(extractStringValueFromBSON)).size; } function finalizeSchema(schemaAnalysis) { function finalizeArrayFieldProperties(type) { const totalCount = Object.values(type.types) .map((v) => v.count) .reduce((p, c) => p + c, 0); const types = finalizeSchemaFieldTypes(type.types, totalCount); return { types, totalCount, lengths: type.lengths, averageLength: totalCount / type.lengths.length }; } function finalizeSchemaFieldTypes(types, parentCount) { return Object.values(types).map((type) => { const unique = computeUniqueForType(type); return { name: type.name, path: type.path, count: type.count, probability: type.count / parentCount, unique, hasDuplicates: computeHasDuplicatesForType(type, unique), values: isNullType(type) ? undefined : type.values, bsonType: type.bsonType, ...(isArrayType(type) ? finalizeArrayFieldProperties(type) : {}), ...(isDocumentType(type) ? { fields: finalizeDocumentFieldSchema(type.fields, type.count) } : {}) }; }).sort((a, b) => b.probability - a.probability); } function finalizeDocumentFieldSchema(fieldMap, parentCount) { return Object.values(fieldMap).map((field) => { const fieldTypes = finalizeSchemaFieldTypes(field.types, parentCount); const undefinedCount = parentCount - field.count; if (undefinedCount > 0) { fieldTypes.push({ name: 'Undefined', bsonType: 'Undefined', unique: undefinedCount > 1 ? 0 : 1, hasDuplicates: undefinedCount > 1, path: field.path, count: undefinedCount, probability: undefinedCount / parentCount }); } return { name: field.name, path: field.path, count: field.count, type: fieldTypes.length === 1 ? fieldTypes[0].name : fieldTypes.map((v) => v.name), probability: field.count / parentCount, hasDuplicates: !!fieldTypes.find((v) => v.hasDuplicates), types: fieldTypes }; }).sort(fieldComparator); } return finalizeDocumentFieldSchema(schemaAnalysis.fields, schemaAnalysis.count); } class SchemaAnalyzer { constructor(options) { this.documentsAnalyzed = 0; this.fieldsCount = 0; this.schemaAnalysisRoot = { fields: Object.create(null), count: 0 }; this.finalized = true; this.schemaResult = { count: 0, fields: [] }; this.fieldAndTypeAnalysisCounter = 0; this.options = { ...defaultSchemaParseOptions, ...options }; this.semanticTypes = { ...semantic_types_1.default }; if (typeof this.options.semanticTypes === 'object') { const enabledTypes = Object.entries(this.options.semanticTypes) .filter(([, v]) => typeof v === 'boolean' && v) .map(([k]) => k.toLowerCase()); this.semanticTypes = { ...Object.entries(this.semanticTypes) .filter(([k]) => enabledTypes.includes(k.toLowerCase())) .reduce((p, [k, v]) => ({ ...p, [k]: v }), Object.create(null)) }; Object.entries(this.options.semanticTypes) .filter(([, v]) => typeof v === 'function') .forEach(([k, v]) => { this.semanticTypes[k] = v; }); } } allowAbortDuringAnalysis() { if (this.fieldAndTypeAnalysisCounter++ % util_1.ALLOW_ABORT_INTERVAL_COUNT === 0) { (0, util_1.allowAbort)(); } } increaseFieldCount() { if (!this.options.distinctFieldsAbortThreshold) return; this.fieldsCount++; if (this.fieldsCount > this.options.distinctFieldsAbortThreshold) { throw new Error(`Schema analysis aborted: Fields count above ${this.options.distinctFieldsAbortThreshold}`); } } getSemanticType(value, path) { const returnValue = Object.entries(this.semanticTypes) .filter(([, v]) => { return v(value, path); }) .map(([k]) => k)[0]; return returnValue; } async analyzeDoc(doc) { this.finalized = false; const addToType = async (path, value, schema) => { var _a, _b, _c; await this.allowAbortDuringAnalysis(); const bsonType = getBSONType(value); const typeName = (this.options.semanticTypes) ? this.getSemanticType(value, path) || bsonType : bsonType; if (!schema[typeName]) { schema[typeName] = { name: typeName, bsonType: bsonType, path, count: 0 }; } const type = schema[typeName]; type.count++; if (isArrayType(type)) { type.types = (_a = type.types) !== null && _a !== void 0 ? _a : Object.create(null); type.lengths = (_b = type.lengths) !== null && _b !== void 0 ? _b : []; type.lengths.push(value.length); for (const v of value) { await addToType(path, v, type.types); } } else if (isDocumentType(type)) { type.fields = (_c = type.fields) !== null && _c !== void 0 ? _c : Object.create(null); for (const [fieldName, v] of Object.entries(value)) { await addToField(fieldName, [...path, fieldName], v, type.fields); } } else if (this.options.storeValues && !isNullType(type)) { if (!type.values) { type.values = ['String', 'Binary', 'Code'].includes(bsonType) ? (0, reservoir_1.default)(100) : (0, reservoir_1.default)(10000); } type.values.pushSome(getCappedValue(type.bsonType, value, this.options.storedValuesLengthLimit)); } }; const addToField = async (fieldName, path, value, schema) => { await this.allowAbortDuringAnalysis(); if (!schema[fieldName]) { schema[fieldName] = { name: fieldName, path: path, count: 0, types: Object.create(null) }; this.increaseFieldCount(); } const field = schema[fieldName]; field.count++; await addToType(path, value, field.types); }; for (const key of Object.keys(doc)) { await addToField(key, [key], doc[key], this.schemaAnalysisRoot.fields); } this.schemaAnalysisRoot.count += 1; } getResult() { if (this.finalized) { return this.schemaResult; } this.schemaResult = { count: this.schemaAnalysisRoot.count, fields: finalizeSchema(this.schemaAnalysisRoot) }; this.finalized = true; return this.schemaResult; } getSchemaPaths() { return schemaToPaths(this.schemaAnalysisRoot.fields); } getSimplifiedSchema() { return simplifiedSchema(this.schemaAnalysisRoot.fields); } } exports.SchemaAnalyzer = SchemaAnalyzer; function verifyStreamSource(source) { if (!(Symbol.iterator in source) && !(Symbol.asyncIterator in source)) { throw new Error('Unknown input type for `docs`. Must be an array, ' + 'stream or MongoDB Cursor.'); } return source; } exports.verifyStreamSource = verifyStreamSource; async function getCompletedSchemaAnalyzer(source, options) { var _a; const analyzer = new SchemaAnalyzer(options); for await (const doc of verifyStreamSource(source)) { if ((_a = options === null || options === void 0 ? void 0 : options.signal) === null || _a === void 0 ? void 0 : _a.aborted) throw options.signal.reason; await analyzer.analyzeDoc(doc); } return analyzer; } exports.getCompletedSchemaAnalyzer = getCompletedSchemaAnalyzer; //# sourceMappingURL=schema-analyzer.js.map