mongodb-schema
Version:
Infer the probabilistic schema for a MongoDB collection.
361 lines • 13.4 kB
JavaScript
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.getCompletedSchemaAnalyzer = exports.verifyStreamSource = exports.SchemaAnalyzer = void 0;
const reservoir_1 = __importDefault(require("reservoir"));
const bson_1 = require("bson");
const semantic_types_1 = __importDefault(require("./semantic-types"));
const util_1 = require("./util");
// Defaults that the SchemaAnalyzer constructor merges underneath the
// caller-supplied options (caller values win).
const defaultSchemaParseOptions = {
// Semantic type detection is off unless the caller enables it.
semanticTypes: false,
// When true, analysis keeps a sample of the values seen for each type.
storeValues: true,
// Length limit handed to getCappedValue for stored String/Binary/Code values.
storedValuesLengthLimit: 10000
};
/**
 * Converts a value into a string representation.
 *
 * Values tagged with `_bsontype` 'Decimal128' or 'Long' are rendered through
 * their own `toString()`; 'Double' and 'Int32' are rendered from their
 * `.value` property. Everything else goes through `String(...)`.
 *
 * @param {*} value - Any value, possibly null or undefined.
 * @returns {string} String form of the value.
 */
function extractStringValueFromBSON(value) {
    const tag = value === null || value === undefined ? undefined : value._bsontype;
    if (tag) {
        switch (tag) {
            case 'Decimal128':
            case 'Long':
                return value.toString();
            case 'Double':
            case 'Int32':
                return String(value.value);
            default:
                break;
        }
    }
    return typeof value === 'string' ? value : String(value);
}
/**
 * Sort comparator for schema fields.
 *
 * Ordering rules:
 *  1. identical names compare equal (0), so the comparator is consistent and
 *     `Array.prototype.sort` stays stable for them;
 *  2. `_id` sorts before every other field;
 *  3. remaining names are ordered case-insensitively;
 *  4. names that differ only by case are tie-broken with a case-sensitive
 *     comparison so that the result does not depend on the sort algorithm.
 *
 * @param {{name: string}} a - Left field.
 * @param {{name: string}} b - Right field.
 * @returns {number} -1 when `a` goes first, 1 when `b` goes first, 0 on a tie.
 */
function fieldComparator(a, b) {
    const aName = a.name;
    const bName = b.name;
    if (aName === bName) {
        return 0;
    }
    if (aName === '_id') {
        return -1;
    }
    if (bName === '_id') {
        return 1;
    }
    const aLower = aName.toLowerCase();
    const bLower = bName.toLowerCase();
    if (aLower !== bLower) {
        return aLower < bLower ? -1 : 1;
    }
    // Same letters, different case: fall back to a deterministic raw comparison.
    return aName < bName ? -1 : 1;
}
/**
 * Determines the type name used for a value during schema analysis.
 *
 * A truthy `_bsontype` property wins; otherwise the name is taken from
 * `Object.prototype.toString` (e.g. 'String', 'Number', 'Array', 'Null').
 * Two names are remapped: 'Object' becomes 'Document', and 'Code' becomes
 * 'CodeWScope' when the value has a truthy `scope`.
 *
 * @param {*} value - Value to classify.
 * @returns {string} The resolved type name.
 */
function getBSONType(value) {
    const hasBsonTag = value !== null && value !== undefined && Boolean(value._bsontype);
    let typeName;
    if (hasBsonTag) {
        typeName = value._bsontype;
    }
    else {
        const described = Object.prototype.toString.call(value);
        typeName = described.replace(/^\[object (\w+)\]$/, '$1');
    }
    switch (typeName) {
        case 'Object':
            return 'Document';
        case 'Code':
            return value.scope ? 'CodeWScope' : typeName;
        default:
            return typeName;
    }
}
/**
 * Tells whether a schema type entry is the 'Null' type (checked by name).
 *
 * @param {{name: string}} type - Schema type entry.
 * @returns {boolean} True when `type.name` is exactly 'Null'.
 */
function isNullType(type) {
    const { name } = type;
    return name === 'Null';
}
/**
 * Tells whether a schema type entry is the 'Array' type (checked by name).
 *
 * @param {{name: string}} type - Schema type entry.
 * @returns {boolean} True when `type.name` is exactly 'Array'.
 */
function isArrayType(type) {
    const { name } = type;
    return name === 'Array';
}
/**
 * Tells whether a schema type entry is the 'Document' type (checked by name).
 *
 * @param {{name: string}} type - Schema type entry.
 * @returns {boolean} True when `type.name` is exactly 'Document'.
 */
function isDocumentType(type) {
    const { name } = type;
    return name === 'Document';
}
/**
 * Flattens an analysed field map into a list of field paths.
 *
 * Fields are visited in `fieldComparator` order. Each field contributes its
 * own path, followed by the paths of its nested 'Document' type (if any),
 * followed by the paths of a 'Document' found directly inside its 'Array'
 * type (if any). Types are matched on their `bsonType`.
 *
 * @param {Object} fields - Map of field name to analysed field.
 * @param {string[]} [parent=[]] - Path segments leading to `fields`.
 * @returns {string[][]} Paths, each expressed as an array of segments.
 */
function schemaToPaths(fields, parent = []) {
    const pickType = (types, wanted) => Object.values(types).find((candidate) => candidate.bsonType === wanted);
    const collected = [];
    const appendNested = (nestedFields, base) => {
        for (const nestedPath of schemaToPaths(nestedFields, base)) {
            collected.push(nestedPath);
        }
    };
    const ordered = Object.values(fields).sort(fieldComparator);
    ordered.forEach((field) => {
        const fieldPath = [...parent, field.name];
        collected.push(fieldPath);
        const nestedDocument = pickType(field.types, 'Document');
        if (nestedDocument) {
            appendNested(nestedDocument.fields, fieldPath);
        }
        const arrayType = pickType(field.types, 'Array');
        if (!arrayType) {
            return;
        }
        const documentInArray = pickType(arrayType.types, 'Document');
        if (documentInArray) {
            appendNested(documentInArray.fields, fieldPath);
        }
    });
    return collected;
}
/**
 * Builds a reduced view of an analysed field map that keeps only the
 * structure: for every field, its types ordered by descending `count`, each
 * reduced to `bsonType` plus nested `types` (for arrays) or `fields`
 * (for documents).
 *
 * @param {Object} fields - Map of field name to analysed field.
 * @returns {Object} Prototype-less map of field name to `{ types: [...] }`.
 */
function simplifiedSchema(fields) {
    const byCountDescending = (left, right) => right.count - left.count;
    function simplifyTypes(types) {
        const ranked = Object.values(types).sort(byCountDescending);
        return ranked.map((type) => {
            const simplified = { bsonType: type.bsonType };
            if (isArrayType(type)) {
                simplified.types = simplifyTypes(type.types);
            }
            if (isDocumentType(type)) {
                simplified.fields = simplifyFields(type.fields);
            }
            return simplified;
        });
    }
    function simplifyFields(fieldMap) {
        // Prototype-less so that field names can never collide with Object.prototype keys.
        const result = Object.create(null);
        for (const field of Object.values(fieldMap)) {
            result[field.name] = { types: simplifyTypes(field.types) };
        }
        return result;
    }
    return simplifyFields(fields);
}
/**
 * Truncates a string to at most `limit` UTF-16 code units.
 *
 * The cut is moved one unit earlier whenever `charCodeAt` and `codePointAt`
 * disagree at the last kept index. That is the case when the index holds
 * the first half of a surrogate pair (so the pair is not split), and also
 * when the index lies past the end of the string.
 *
 * @param {string} value - String to truncate.
 * @param {number} limit - Maximum number of code units to keep.
 * @returns {string} The truncated string; '' when `limit` is below 1.
 */
function cropString(value, limit) {
    if (limit < 1) {
        return '';
    }
    const lastIndex = limit - 1;
    const unitsDisagree = value.charCodeAt(lastIndex) !== value.codePointAt(lastIndex);
    const end = unitsDisagree ? lastIndex : limit;
    return value.slice(0, end);
}
/**
 * Shortens a value before it is stored as a sample.
 *
 * - 'String': cropped with `cropString`.
 * - 'Binary': when the buffer is longer than `limit`, a new Binary holding
 *   the first `limit` bytes (same `sub_type`) is returned.
 * - 'Code': when the code text is at least `limit` long, a new Code with the
 *   cropped text (same `scope`) is returned.
 * - anything else, or values within the limit: returned unchanged.
 *
 * @param {string} bsonType - Type name of `value`.
 * @param {*} value - Value to cap.
 * @param {number} limit - Maximum length to keep.
 * @returns {*} The capped (or original) value.
 */
function getCappedValue(bsonType, value, limit) {
    switch (bsonType) {
        case 'String':
            return cropString(value, limit);
        case 'Binary': {
            if (value.buffer.length > limit) {
                const head = value.buffer.slice(0, limit);
                return new bson_1.Binary(head, value.sub_type);
            }
            return value;
        }
        case 'Code': {
            if (value.code.length >= limit) {
                const shortened = cropString(value.code, limit);
                return new bson_1.Code(shortened, value.scope);
            }
            return value;
        }
        default:
            return value;
    }
}
/**
 * Reports whether a type's sampled values contain duplicates.
 *
 * @param {Object} type - Analysed type entry (`name`, `count`, optional `values`).
 * @param {number|undefined} unique - Distinct-value count previously computed for `type`.
 * @returns {boolean|undefined} For the 'Null' type: whether it occurred at
 *   all. Otherwise whether `unique` differs from the number of stored
 *   values, or `undefined` when no values were stored.
 */
function computeHasDuplicatesForType(type, unique) {
    if (isNullType(type)) {
        return type.count > 0;
    }
    return type.values ? unique !== type.values.length : undefined;
}
/**
 * Counts the distinct sampled values of a type.
 *
 * Values are compared by the string produced by
 * `extractStringValueFromBSON`.
 *
 * @param {Object} type - Analysed type entry (`name`, `count`, optional `values`).
 * @returns {number|undefined} For the 'Null' type: 0 when it never occurred,
 *   otherwise 1. For other types: the number of distinct stored values, or
 *   `undefined` when no values were stored.
 */
function computeUniqueForType(type) {
    if (isNullType(type)) {
        return type.count === 0 ? 0 : 1;
    }
    const samples = type.values;
    if (!samples) {
        return undefined;
    }
    const distinct = new Set();
    for (const sample of samples) {
        distinct.add(extractStringValueFromBSON(sample));
    }
    return distinct.size;
}
/**
 * Converts the internal analysis tree into the final schema field list.
 *
 * Every field receives its probability relative to the parent count, its
 * types sorted by descending probability, and, when the field was missing
 * from some parent documents, a trailing synthetic 'Undefined' type covering
 * the gap. Fields are ordered with `fieldComparator`.
 *
 * @param {{fields: Object, count: number}} schemaAnalysis - Analysis root.
 * @returns {Object[]} Finalized fields.
 */
function finalizeSchema(schemaAnalysis) {
    // Extra properties for 'Array' types: element types plus length statistics.
    function describeArray(arrayType) {
        let totalCount = 0;
        for (const elementType of Object.values(arrayType.types)) {
            totalCount += elementType.count;
        }
        return {
            types: describeTypes(arrayType.types, totalCount),
            totalCount,
            lengths: arrayType.lengths,
            averageLength: totalCount / arrayType.lengths.length
        };
    }
    // Finalizes one type entry relative to the number of parent occurrences.
    function describeType(type, parentCount) {
        const unique = computeUniqueForType(type);
        const described = {
            name: type.name,
            path: type.path,
            count: type.count,
            probability: type.count / parentCount,
            unique,
            hasDuplicates: computeHasDuplicatesForType(type, unique),
            values: isNullType(type) ? undefined : type.values,
            bsonType: type.bsonType
        };
        if (isArrayType(type)) {
            Object.assign(described, describeArray(type));
        }
        if (isDocumentType(type)) {
            described.fields = describeFields(type.fields, type.count);
        }
        return described;
    }
    function describeTypes(types, parentCount) {
        const described = Object.values(types).map((type) => describeType(type, parentCount));
        return described.sort((left, right) => right.probability - left.probability);
    }
    function describeField(field, parentCount) {
        const fieldTypes = describeTypes(field.types, parentCount);
        const undefinedCount = parentCount - field.count;
        if (undefinedCount > 0) {
            // Appended after sorting, so 'Undefined' always comes last.
            const repeated = undefinedCount > 1;
            fieldTypes.push({
                name: 'Undefined',
                bsonType: 'Undefined',
                unique: repeated ? 0 : 1,
                hasDuplicates: repeated,
                path: field.path,
                count: undefinedCount,
                probability: undefinedCount / parentCount
            });
        }
        const typeNames = fieldTypes.map((entry) => entry.name);
        return {
            name: field.name,
            path: field.path,
            count: field.count,
            type: fieldTypes.length === 1 ? fieldTypes[0].name : typeNames,
            probability: field.count / parentCount,
            hasDuplicates: fieldTypes.some((entry) => entry.hasDuplicates),
            types: fieldTypes
        };
    }
    function describeFields(fieldMap, parentCount) {
        const described = Object.values(fieldMap).map((field) => describeField(field, parentCount));
        return described.sort(fieldComparator);
    }
    return describeFields(schemaAnalysis.fields, schemaAnalysis.count);
}
/**
 * Incrementally analyses documents and accumulates, per field and per type,
 * occurrence counts and (optionally) sampled values, from which a schema
 * description can be produced at any time.
 *
 * Usage: call `analyzeDoc(doc)` for every document, then `getResult()`,
 * `getSchemaPaths()` or `getSimplifiedSchema()`.
 */
class SchemaAnalyzer {
    /**
     * @param {Object} [options] - Overrides for `defaultSchemaParseOptions`.
     *   `semanticTypes` may be a boolean, or an object whose `true` entries
     *   enable built-in detectors (matched case-insensitively by name) and
     *   whose function entries register custom detectors.
     */
    constructor(options) {
        this.documentsAnalyzed = 0;
        this.fieldsCount = 0;
        this.schemaAnalysisRoot = {
            fields: Object.create(null),
            count: 0
        };
        // Nothing analysed yet, so the (empty) cached result is valid.
        this.finalized = true;
        this.schemaResult = {
            count: 0,
            fields: []
        };
        this.fieldAndTypeAnalysisCounter = 0;
        this.options = { ...defaultSchemaParseOptions, ...options };
        this.semanticTypes = {
            ...semantic_types_1.default
        };
        const requestedTypes = this.options.semanticTypes;
        // `typeof null` is also 'object'; null is treated like "not configured".
        if (typeof requestedTypes === 'object' && requestedTypes !== null) {
            const requestedEntries = Object.entries(requestedTypes);
            const enabledNames = new Set(requestedEntries
                .filter(([, setting]) => setting === true)
                .map(([name]) => name.toLowerCase()));
            const selected = {};
            for (const [name, detector] of Object.entries(this.semanticTypes)) {
                if (enabledNames.has(name.toLowerCase())) {
                    selected[name] = detector;
                }
            }
            // Custom detectors are added after (and may replace) built-in ones.
            for (const [name, detector] of requestedEntries) {
                if (typeof detector === 'function') {
                    selected[name] = detector;
                }
            }
            this.semanticTypes = selected;
        }
    }
    /**
     * Periodically gives `allowAbort` a chance to run: once every
     * `ALLOW_ABORT_INTERVAL_COUNT` field/type visits.
     *
     * @returns {*} Whatever `allowAbort()` returned on the visits where it is
     *   invoked (so callers that `await` this method actually wait for it and
     *   see its rejection), otherwise `undefined`.
     */
    allowAbortDuringAnalysis() {
        if (this.fieldAndTypeAnalysisCounter++ % util_1.ALLOW_ABORT_INTERVAL_COUNT === 0) {
            return (0, util_1.allowAbort)();
        }
        return undefined;
    }
    /**
     * Counts a newly discovered field and enforces the optional
     * `distinctFieldsAbortThreshold` option.
     *
     * @throws {Error} When the number of distinct fields exceeds the threshold.
     */
    increaseFieldCount() {
        if (!this.options.distinctFieldsAbortThreshold) {
            return;
        }
        this.fieldsCount++;
        if (this.fieldsCount > this.options.distinctFieldsAbortThreshold) {
            throw new Error(`Schema analysis aborted: Fields count above ${this.options.distinctFieldsAbortThreshold}`);
        }
    }
    /**
     * Finds the first registered semantic type whose detector accepts the value.
     *
     * @param {*} value - Value being analysed.
     * @param {string[]} path - Path of the field holding the value.
     * @returns {string|undefined} Name of the first matching semantic type.
     */
    getSemanticType(value, path) {
        const match = Object.entries(this.semanticTypes)
            .find(([, detector]) => detector(value, path));
        return match ? match[0] : undefined;
    }
    /**
     * Folds one document into the running analysis.
     *
     * @param {Object} doc - Document to analyse.
     * @returns {Promise<void>} Settles once every field has been visited.
     */
    async analyzeDoc(doc) {
        // Any cached result becomes stale as soon as analysis starts.
        this.finalized = false;
        const addToType = async (path, value, schema) => {
            await this.allowAbortDuringAnalysis();
            const bsonType = getBSONType(value);
            const typeName = this.options.semanticTypes
                ? this.getSemanticType(value, path) || bsonType
                : bsonType;
            if (!schema[typeName]) {
                schema[typeName] = {
                    name: typeName,
                    bsonType,
                    path,
                    count: 0
                };
            }
            const type = schema[typeName];
            type.count++;
            if (isArrayType(type)) {
                if (type.types == null) {
                    type.types = Object.create(null);
                }
                if (type.lengths == null) {
                    type.lengths = [];
                }
                type.lengths.push(value.length);
                // Array elements share the array's path.
                for (const element of value) {
                    await addToType(path, element, type.types);
                }
            }
            else if (isDocumentType(type)) {
                if (type.fields == null) {
                    type.fields = Object.create(null);
                }
                for (const [fieldName, fieldValue] of Object.entries(value)) {
                    await addToField(fieldName, [...path, fieldName], fieldValue, type.fields);
                }
            }
            else if (this.options.storeValues && !isNullType(type)) {
                if (!type.values) {
                    // Potentially large values get a smaller sample reservoir.
                    type.values = ['String', 'Binary', 'Code'].includes(bsonType)
                        ? (0, reservoir_1.default)(100)
                        : (0, reservoir_1.default)(10000);
                }
                type.values.pushSome(getCappedValue(type.bsonType, value, this.options.storedValuesLengthLimit));
            }
        };
        const addToField = async (fieldName, path, value, schema) => {
            await this.allowAbortDuringAnalysis();
            if (!schema[fieldName]) {
                schema[fieldName] = {
                    name: fieldName,
                    path,
                    count: 0,
                    types: Object.create(null)
                };
                this.increaseFieldCount();
            }
            const field = schema[fieldName];
            field.count++;
            await addToType(path, value, field.types);
        };
        for (const key of Object.keys(doc)) {
            await addToField(key, [key], doc[key], this.schemaAnalysisRoot.fields);
        }
        this.schemaAnalysisRoot.count += 1;
    }
    /**
     * Returns the finalized schema, recomputing it only when documents were
     * analysed since the last call.
     *
     * @returns {{count: number, fields: Object[]}} The schema result.
     */
    getResult() {
        if (this.finalized) {
            return this.schemaResult;
        }
        this.schemaResult = {
            count: this.schemaAnalysisRoot.count,
            fields: finalizeSchema(this.schemaAnalysisRoot)
        };
        this.finalized = true;
        return this.schemaResult;
    }
    /**
     * @returns {string[][]} Paths of all analysed fields.
     */
    getSchemaPaths() {
        return schemaToPaths(this.schemaAnalysisRoot.fields);
    }
    /**
     * @returns {Object} Structure-only view of the analysed fields.
     */
    getSimplifiedSchema() {
        return simplifiedSchema(this.schemaAnalysisRoot.fields);
    }
}
exports.SchemaAnalyzer = SchemaAnalyzer;
/**
 * Validates that `source` can be consumed with `for await ... of`.
 *
 * @param {*} source - Candidate document source.
 * @returns {*} The same `source`, unchanged, when it is sync- or async-iterable.
 * @throws {Error} When `source` is null/undefined, a primitive, or an object
 *   exposing neither `Symbol.iterator` nor `Symbol.asyncIterator`.
 */
function verifyStreamSource(source) {
    // The `in` operator throws a TypeError on non-objects, so rule those out
    // first and report them with the same descriptive error.
    const canHoldProperties = source !== null &&
        (typeof source === 'object' || typeof source === 'function');
    const isIterable = canHoldProperties &&
        (Symbol.iterator in source || Symbol.asyncIterator in source);
    if (!isIterable) {
        throw new Error('Unknown input type for `docs`. Must be an array, ' +
            'stream or MongoDB Cursor.');
    }
    return source;
}
exports.verifyStreamSource = verifyStreamSource;
/**
 * Feeds every document of `source` into a fresh SchemaAnalyzer.
 *
 * Before each document, `options.signal` (when present) is checked; if it is
 * aborted, the signal's `reason` is thrown.
 *
 * @param {*} source - Sync or async iterable of documents (validated with
 *   `verifyStreamSource`).
 * @param {Object} [options] - Analyzer options, optionally with a `signal`.
 * @returns {Promise<SchemaAnalyzer>} The analyzer after all documents were analysed.
 */
async function getCompletedSchemaAnalyzer(source, options) {
    const analyzer = new SchemaAnalyzer(options);
    const isAborted = () => {
        if (options === null || options === undefined) {
            return false;
        }
        const signal = options.signal;
        if (signal === null || signal === undefined) {
            return false;
        }
        return Boolean(signal.aborted);
    };
    const documents = verifyStreamSource(source);
    for await (const doc of documents) {
        if (isAborted()) {
            throw options.signal.reason;
        }
        await analyzer.analyzeDoc(doc);
    }
    return analyzer;
}
exports.getCompletedSchemaAnalyzer = getCompletedSchemaAnalyzer;
//# sourceMappingURL=schema-analyzer.js.map
;