/*
 * mongodb-schema
 * Version: (not shown in this listing)
 * Infer the probabilistic schema for a MongoDB collection.
 * File listing: 161 lines (160 loc), 5.28 kB, TypeScript.
 */
/// <reference types="node" />
import Reservoir from 'reservoir';
import { Document, ObjectId, MinKey, MaxKey, Long, Double, Int32, Decimal128, Binary, BSONRegExp, Code, BSONSymbol, Timestamp } from 'bson';
import { AnyIterable } from './types';
/**
 * Lookup table from a type label to the TypeScript type used for values
 * carrying that label. Entries are either native JavaScript types or classes
 * imported from `bson`.
 */
type TypeCastMap = {
  Array: unknown[];
  Binary: Binary;
  Boolean: boolean;
  Code: Code;
  /** Shares the `Code` class with the plain `Code` entry. */
  CodeWScope: Code;
  Date: Date;
  Decimal128: Decimal128;
  Double: Double;
  Int32: Int32;
  /** 64-bit integers use the `Long` class from `bson`. */
  Int64: Long;
  MaxKey: MaxKey;
  MinKey: MinKey;
  Null: null;
  /** Plain objects with string keys and unconstrained values. */
  Object: Record<string, unknown>;
  ObjectId: ObjectId;
  BSONRegExp: BSONRegExp;
  String: string;
  BSONSymbol: BSONSymbol;
  Timestamp: Timestamp;
  Undefined: undefined;
};
/** Union of every type label that is a key of `TypeCastMap`. */
type TypeCastTypes = keyof TypeCastMap;
/** Union of every value type that `TypeCastMap` maps a label to. */
type BSONValue = TypeCastMap[keyof TypeCastMap];
/**
 * Properties common to every entry of a field's `types` list in the schema
 * result. The more specific schema types in this file intersect with this
 * shape and narrow `name`.
 *
 * NOTE(review): how `probability`, `hasDuplicates` and `unique` are computed
 * is not visible in this declaration file — confirm against the analyzer.
 */
export type BaseSchemaType = {
  /** Path segments identifying where the value sits. */
  path: string[];
  /** Type label; specific variants restrict this to string literals. */
  name: string;
  /** Number of occurrences recorded for this type. */
  count: number;
  /** Presumably the share of occurrences having this type — TODO confirm. */
  probability: number;
  /** BSON type label, typed here as a plain string. */
  bsonType: string;
  /** Optional; presumably set only when values were collected — TODO confirm. */
  hasDuplicates?: boolean;
  /** Optional; presumably the number of distinct values seen — TODO confirm. */
  unique?: number;
};
/**
 * Schema type entry whose `name` is restricted to `'Null'` or `'Undefined'`.
 * It adds no members beyond those of `BaseSchemaType`.
 */
export type ConstantSchemaType = BaseSchemaType & {
  name: 'Null' | 'Undefined';
};
/**
 * Schema type entry for non-container values. In addition to the base
 * properties it carries a list of values.
 *
 * NOTE(review): several labels here differ from the keys of `TypeCastMap`
 * (for example `'Number'`, `'Long'`, `'RegExp'` and `'Symbol'` here versus
 * `Double`, `Int64`, `BSONRegExp` and `BSONSymbol` there) — confirm which
 * labels the analyzer actually emits.
 */
export type PrimitiveSchemaType = BaseSchemaType & {
  name:
    | 'String'
    | 'Number'
    | 'Int32'
    | 'Boolean'
    | 'Decimal128'
    | 'Long'
    | 'ObjectId'
    | 'Date'
    | 'RegExp'
    | 'Symbol'
    | 'MaxKey'
    | 'MinKey'
    | 'Binary'
    | 'Code'
    | 'Timestamp'
    | 'DBRef';
  /** Values associated with this type; presumably sampled — TODO confirm. */
  values: BSONValue[];
};
/**
 * Schema type entry for array values. Extends the base properties with
 * length statistics and the schema types of the elements.
 *
 * NOTE(review): the exact meaning of `totalCount` versus the inherited
 * `count` is not visible here — confirm against the analyzer.
 */
export type ArraySchemaType = BaseSchemaType & {
  name: 'Array';
  /** Array lengths; presumably one entry per observed array — TODO confirm. */
  lengths: number[];
  /** Presumably the mean of `lengths` — TODO confirm. */
  averageLength: number;
  totalCount: number;
  /** Schema types found among the array's elements. */
  types: SchemaType[];
};
/**
 * Schema type entry for nested documents. Extends the base properties with
 * the list of fields found inside the document.
 */
export type DocumentSchemaType = BaseSchemaType & {
  name: 'Document';
  /** Fields of the nested document. */
  fields: SchemaField[];
};
/**
 * Any schema type entry that can appear in a `types` list.
 *
 * NOTE(review): `BaseSchemaType` is itself a member and its `name` is a plain
 * `string`, so checking `name` alone does not narrow this union to a single
 * variant.
 */
export type SchemaType =
  | BaseSchemaType
  | ConstantSchemaType
  | PrimitiveSchemaType
  | ArraySchemaType
  | DocumentSchemaType;
/**
 * Description of one field in the schema result.
 *
 * NOTE(review): how `probability` and `hasDuplicates` are derived is not
 * visible in this declaration file — confirm against the analyzer.
 */
export type SchemaField = {
  /** Field name. */
  name: string;
  /** Number of occurrences recorded for this field. */
  count: number;
  /** Path segments identifying the field. */
  path: string[];
  /** A single type label or a list of labels. */
  type: string | string[];
  probability: number;
  hasDuplicates: boolean;
  /** Detailed entry for each type seen on this field. */
  types: SchemaType[];
};
/** Top-level schema result: a count together with the top-level fields. */
export type Schema = {
  /** Presumably the number of documents analyzed — TODO confirm. */
  count: number;
  /** Top-level fields. */
  fields: SchemaField[];
};
/**
 * BSON type labels used in analysis and simplified-schema types: every key of
 * `TypeCastMap` except `'Object'`, plus `'Document'`.
 */
type SchemaBSONType =
  | Exclude<keyof TypeCastMap, 'Object'>
  | 'Document';
/**
 * Properties shared by every type entry in the analysis tree, which this file
 * declares separately from the exported schema result types.
 */
type SchemaAnalysisBaseType = {
  /** Type label; specific variants restrict this to string literals. */
  name: string;
  /** Path segments identifying where the value sits. */
  path: string[];
  bsonType: SchemaBSONType;
  /** Number of occurrences recorded for this type. */
  count: number;
  /** Optional container of values, typed as whatever `Reservoir(...)` returns. */
  values?: ReturnType<typeof Reservoir>;
};
/** Analysis entry whose `name` is fixed to `'Null'`. */
type SchemaAnalysisNullType = SchemaAnalysisBaseType & {
  name: 'Null';
};
/**
 * Analysis entry for non-container values. The `name` literals are the same
 * set used by `PrimitiveSchemaType`.
 */
type SchemaAnalysisPrimitiveType = SchemaAnalysisBaseType & {
  name:
    | 'String'
    | 'Number'
    | 'Int32'
    | 'Boolean'
    | 'Decimal128'
    | 'Long'
    | 'ObjectId'
    | 'Date'
    | 'RegExp'
    | 'Symbol'
    | 'MaxKey'
    | 'MinKey'
    | 'Binary'
    | 'Code'
    | 'Timestamp'
    | 'DBRef';
};
/**
 * Analysis entry for array values: adds length data and a map of the type
 * entries found among the elements.
 */
type SchemaAnalysisArrayType = SchemaAnalysisBaseType & {
  name: 'Array';
  /** Array lengths; presumably one entry per observed array — TODO confirm. */
  lengths: number[];
  /** Type entries for the array's elements. */
  types: SchemaAnalysisFieldTypes;
};
/** Analysis entry for nested documents: adds a map of the nested fields. */
type SchemaAnalysisDocumentType = SchemaAnalysisBaseType & {
  name: 'Document';
  /** Nested fields, keyed by string. */
  fields: SchemaAnalysisFieldsMap;
};
/** Any type entry that can appear in the analysis tree. */
type SchemaAnalysisType =
  | SchemaAnalysisBaseType
  | SchemaAnalysisNullType
  | SchemaAnalysisPrimitiveType
  | SchemaAnalysisArrayType
  | SchemaAnalysisDocumentType;
/**
 * String-keyed map of analysis type entries.
 *
 * NOTE(review): the index parameter is called `fieldName`, but the map holds
 * type entries — presumably it is keyed by type label; confirm.
 */
type SchemaAnalysisFieldTypes = {
  [fieldName: string]: SchemaAnalysisType;
};
/** One field in the analysis tree. */
type SchemaAnalysisField = {
  /** Field name. */
  name: string;
  /** Path segments identifying the field. */
  path: string[];
  /** Number of occurrences recorded for this field. */
  count: number;
  /** Type entries recorded for this field. */
  types: SchemaAnalysisFieldTypes;
};
/** String-keyed map of analysis fields; the key is a field name. */
type SchemaAnalysisFieldsMap = {
  [fieldName: string]: SchemaAnalysisField;
};
/** Root of the analysis tree: the top-level fields plus a count. */
type SchemaAnalysisRoot = {
  /** Top-level fields, keyed by field name. */
  fields: SchemaAnalysisFieldsMap;
  /** Presumably the number of documents analyzed so far — TODO confirm. */
  count: number;
};
/**
 * Predicate that receives a value and, optionally, a path, and returns a
 * boolean. Presumably `true` means the value matches the semantic type —
 * TODO confirm.
 */
type SemanticTypeFunction = (value: BSONValue, path?: string[]) => boolean;
/**
 * Map from a semantic type name to either a predicate or a boolean.
 * Presumably a boolean switches a predefined detector on or off — TODO
 * confirm.
 */
type SemanticTypeMap = {
  [typeName: string]: SemanticTypeFunction | boolean;
};
/**
 * Complete set of analyzer options. The public `SchemaParseOptions` type in
 * this file is the all-optional version of this shape.
 *
 * NOTE(review): defaults and precise behavior are defined in the
 * implementation and are not visible in this declaration file.
 */
type AllSchemaParseOptions = {
  /** Either a boolean or a map of semantic type detectors. */
  semanticTypes: boolean | SemanticTypeMap;
  /** Presumably controls whether values are collected — TODO confirm. */
  storeValues: boolean;
  /** Optional abort signal. */
  signal?: AbortSignal;
  /** Presumably a cap on the length of each stored value — TODO confirm. */
  storedValuesLengthLimit: number;
  /** Optional; presumably a limit on distinct fields before analysis aborts — TODO confirm. */
  distinctFieldsAbortThreshold?: number;
};
/**
 * Options accepted by the public API: the same keys as
 * `AllSchemaParseOptions`, each of them optional.
 */
export type SchemaParseOptions =
  Partial<AllSchemaParseOptions>;
/** Simplified-schema type entry carrying only a BSON type label. */
export type SimplifiedSchemaBaseType = {
  bsonType: SchemaBSONType;
};
/**
 * Simplified-schema entry for arrays: `bsonType` is fixed to `'Array'` and the
 * element types are listed in `types`.
 */
export type SimplifiedSchemaArrayType = SimplifiedSchemaBaseType & {
  bsonType: 'Array';
  /** Simplified types found among the array's elements. */
  types: SimplifiedSchemaType[];
};
/**
 * Simplified-schema entry for nested documents: `bsonType` is fixed to
 * `'Document'` and the nested fields form a simplified schema of their own.
 */
export type SimplifiedSchemaDocumentType = SimplifiedSchemaBaseType & {
  bsonType: 'Document';
  /** Simplified schema of the nested document's fields. */
  fields: SimplifiedSchema;
};
/** Any type entry that can appear in a simplified schema. */
export type SimplifiedSchemaType =
  | SimplifiedSchemaBaseType
  | SimplifiedSchemaArrayType
  | SimplifiedSchemaDocumentType;
/** One field of a simplified schema: just the list of its types. */
export type SimplifiedSchemaField = {
  types: SimplifiedSchemaType[];
};
/** Simplified schema: a map from field name to that field's types. */
export type SimplifiedSchema = {
  [fieldName: string]: SimplifiedSchemaField;
};
/**
 * Analyzer that is fed documents one at a time and exposes the resulting
 * schema in several forms.
 *
 * NOTE(review): only the signatures are visible in this declaration file; the
 * behavioral notes below are inferred from names and types and are marked
 * accordingly.
 */
export declare class SchemaAnalyzer {
  // ---- State ----

  /** Semantic type detectors in use. */
  semanticTypes: SemanticTypeMap;
  /** Fully populated options object. */
  options: AllSchemaParseOptions;
  /** Presumably the number of documents passed to `analyzeDoc` — TODO confirm. */
  documentsAnalyzed: number;
  /** Presumably the number of distinct fields seen — TODO confirm. */
  fieldsCount: number;
  /** Root of the analysis tree. */
  schemaAnalysisRoot: SchemaAnalysisRoot;
  /** Presumably whether `schemaResult` is up to date with the analysis — TODO confirm. */
  finalized: boolean;
  /** Schema result object. */
  schemaResult: Schema;
  /** Counter whose purpose is not visible here; presumably tied to abort checks — TODO confirm. */
  fieldAndTypeAnalysisCounter: number;

  // ---- Construction ----

  /**
   * @param options - Optional partial options.
   */
  constructor(options?: SchemaParseOptions);

  // ---- Analysis ----

  /** Presumably gives the abort signal a chance to interrupt analysis — TODO confirm. */
  allowAbortDuringAnalysis(): void;
  /** Presumably increments `fieldsCount` and enforces the distinct-fields threshold — TODO confirm. */
  increaseFieldCount(): void;
  /**
   * @param value - Value to classify.
   * @param path - Path of the value.
   * @returns A type name as a string.
   */
  getSemanticType(value: BSONValue, path: string[]): string;
  /**
   * @param doc - Document to analyze.
   * @returns A promise that resolves with no value.
   */
  analyzeDoc(doc: Document): Promise<void>;

  // ---- Results ----

  /** @returns The schema result. */
  getResult(): Schema;
  /** @returns A list of paths, each path being a list of string segments. */
  getSchemaPaths(): string[][];
  /** @returns The simplified form of the schema. */
  getSimplifiedSchema(): SimplifiedSchema;
}
/**
 * Accepts an `AnyIterable` and returns an `AnyIterable`.
 *
 * NOTE(review): by its name this presumably validates the source and rejects
 * unsupported inputs — the checks and failure mode are not visible in this
 * declaration file; confirm against the implementation.
 *
 * @param source - Source to verify.
 * @returns An `AnyIterable`; presumably the same source once verified — TODO confirm.
 */
export declare function verifyStreamSource(source: AnyIterable): AnyIterable;
/**
 * Asynchronously produces a `SchemaAnalyzer` from a source and optional options.
 *
 * NOTE(review): by its name this presumably runs every document from `source`
 * through the analyzer before resolving — confirm against the implementation.
 *
 * @param source - Source of documents.
 * @param options - Optional partial analyzer options.
 * @returns A promise resolving to a `SchemaAnalyzer`.
 */
export declare function getCompletedSchemaAnalyzer(source: AnyIterable, options?: SchemaParseOptions): Promise<SchemaAnalyzer>;
export {};