UNPKG

mongodb-schema

Version:

Infer the probabilistic schema for a MongoDB collection.

161 lines (160 loc) 5.28 kB
/// <reference types="node" />
import Reservoir from 'reservoir';
import {
  Document,
  ObjectId,
  MinKey,
  MaxKey,
  Long,
  Double,
  Int32,
  Decimal128,
  Binary,
  BSONRegExp,
  Code,
  BSONSymbol,
  Timestamp
} from 'bson';
import { AnyIterable } from './types';

/**
 * Lookup table from a type name to the TypeScript type used for a value of
 * that kind. Note that `Code` and `CodeWScope` both resolve to bson's `Code`,
 * and `Int64` resolves to bson's `Long`.
 */
type TypeCastMap = {
  Array: unknown[];
  Binary: Binary;
  Boolean: boolean;
  Code: Code;
  CodeWScope: Code;
  Date: Date;
  Decimal128: Decimal128;
  Double: Double;
  Int32: Int32;
  Int64: Long;
  MaxKey: MaxKey;
  MinKey: MinKey;
  Null: null;
  Object: Record<string, unknown>;
  ObjectId: ObjectId;
  BSONRegExp: BSONRegExp;
  String: string;
  BSONSymbol: BSONSymbol;
  Timestamp: Timestamp;
  Undefined: undefined;
};

/** Every type name that appears as a key of {@link TypeCastMap}. */
type TypeCastTypes = keyof TypeCastMap;

/** Union of every value type listed in {@link TypeCastMap}. */
type BSONValue = TypeCastMap[TypeCastTypes];

/**
 * Type names accepted by the `name` property of the primitive type entries
 * ({@link PrimitiveSchemaType} and the internal primitive analysis type).
 */
type PrimitiveTypeName =
  | 'String'
  | 'Number'
  | 'Int32'
  | 'Boolean'
  | 'Decimal128'
  | 'Long'
  | 'ObjectId'
  | 'Date'
  | 'RegExp'
  | 'Symbol'
  | 'MaxKey'
  | 'MinKey'
  | 'Binary'
  | 'Code'
  | 'Timestamp'
  | 'DBRef';

/**
 * Properties shared by every member of the {@link SchemaType} union.
 *
 * NOTE(review): how `count`, `probability` and `unique` are computed is not
 * visible in this declaration file — confirm against the implementation.
 */
export type BaseSchemaType = {
  path: string[];
  name: string;
  count: number;
  probability: number;
  bsonType: string;
  hasDuplicates?: boolean;
  unique?: number;
};

/** Type entry whose `name` is `'Null'` or `'Undefined'`; it carries no extra properties. */
export type ConstantSchemaType = BaseSchemaType & {
  name: 'Null' | 'Undefined';
};

/** Type entry for a primitive type name; additionally carries a `values` array. */
export type PrimitiveSchemaType = BaseSchemaType & {
  name: PrimitiveTypeName;
  values: BSONValue[];
};

/**
 * Type entry named `'Array'`. Besides length statistics it holds a nested
 * list of {@link SchemaType} entries (presumably the element types — confirm).
 */
export type ArraySchemaType = BaseSchemaType & {
  name: 'Array';
  lengths: number[];
  averageLength: number;
  totalCount: number;
  types: SchemaType[];
};

/** Type entry named `'Document'`; holds a nested list of {@link SchemaField}s. */
export type DocumentSchemaType = BaseSchemaType & {
  name: 'Document';
  fields: SchemaField[];
};

/** Any type entry that can appear in a {@link SchemaField}'s `types` list. */
export type SchemaType =
  | BaseSchemaType
  | ConstantSchemaType
  | PrimitiveSchemaType
  | ArraySchemaType
  | DocumentSchemaType;

/**
 * One field of a {@link Schema}. `type` is either a single type name or a
 * list of names, and `types` holds the detailed entry for each.
 */
export type SchemaField = {
  name: string;
  count: number;
  path: string[];
  type: string | string[];
  probability: number;
  hasDuplicates: boolean;
  types: SchemaType[];
};

/** Result shape returned by {@link SchemaAnalyzer.getResult}. */
export type Schema = {
  count: number;
  fields: SchemaField[];
};

/** The keys of {@link TypeCastMap}, with `'Object'` swapped out for `'Document'`. */
type SchemaBSONType = Exclude<keyof TypeCastMap, 'Object'> | 'Document';

/**
 * Properties shared by every internal analysis type entry.
 * `values`, when present, is whatever the `reservoir` factory returns
 * (presumably a sample of observed values — confirm).
 */
type SchemaAnalysisBaseType = {
  name: string;
  path: string[];
  bsonType: SchemaBSONType;
  count: number;
  values?: ReturnType<typeof Reservoir>;
};

/** Internal analysis entry named `'Null'`. */
type SchemaAnalysisNullType = SchemaAnalysisBaseType & {
  name: 'Null';
};

/** Internal analysis entry for a primitive type name. */
type SchemaAnalysisPrimitiveType = SchemaAnalysisBaseType & {
  name: PrimitiveTypeName;
};

/** Internal analysis entry named `'Array'`, with recorded lengths and nested types. */
type SchemaAnalysisArrayType = SchemaAnalysisBaseType & {
  name: 'Array';
  lengths: number[];
  types: SchemaAnalysisFieldTypes;
};

/** Internal analysis entry named `'Document'`, with a nested field map. */
type SchemaAnalysisDocumentType = SchemaAnalysisBaseType & {
  name: 'Document';
  fields: SchemaAnalysisFieldsMap;
};

/** Any internal analysis type entry. */
type SchemaAnalysisType =
  | SchemaAnalysisBaseType
  | SchemaAnalysisNullType
  | SchemaAnalysisPrimitiveType
  | SchemaAnalysisArrayType
  | SchemaAnalysisDocumentType;

/** String-keyed map of internal analysis type entries. */
type SchemaAnalysisFieldTypes = {
  [fieldName: string]: SchemaAnalysisType;
};

/** Internal analysis record for a single field. */
type SchemaAnalysisField = {
  name: string;
  path: string[];
  count: number;
  types: SchemaAnalysisFieldTypes;
};

/** String-keyed map of internal field analysis records. */
type SchemaAnalysisFieldsMap = {
  [fieldName: string]: SchemaAnalysisField;
};

/** Top-level internal analysis state: a field map plus a count. */
type SchemaAnalysisRoot = {
  fields: SchemaAnalysisFieldsMap;
  count: number;
};

/** Predicate over a value and, optionally, a path. */
type SemanticTypeFunction = (value: BSONValue, path?: string[]) => boolean;

/** Map from a type name to either a predicate or a boolean flag. */
type SemanticTypeMap = {
  [typeName: string]: SemanticTypeFunction | boolean;
};

/**
 * Complete option set held by {@link SchemaAnalyzer}.
 *
 * NOTE(review): defaults and the exact effect of each option are not visible
 * in this declaration file — confirm against the implementation.
 */
type AllSchemaParseOptions = {
  semanticTypes: boolean | SemanticTypeMap;
  storeValues: boolean;
  signal?: AbortSignal;
  storedValuesLengthLimit: number;
  distinctFieldsAbortThreshold?: number;
};

/** Caller-facing options: every analyzer option, each made optional. */
export type SchemaParseOptions = Partial<AllSchemaParseOptions>;

/** Properties shared by every simplified type entry. */
export type SimplifiedSchemaBaseType = {
  bsonType: SchemaBSONType;
};

/** Simplified entry whose `bsonType` is `'Array'`, with nested simplified types. */
export type SimplifiedSchemaArrayType = SimplifiedSchemaBaseType & {
  bsonType: 'Array';
  types: SimplifiedSchemaType[];
};

/** Simplified entry whose `bsonType` is `'Document'`, with a nested simplified schema. */
export type SimplifiedSchemaDocumentType = SimplifiedSchemaBaseType & {
  bsonType: 'Document';
  fields: SimplifiedSchema;
};

/** Any simplified type entry. */
export type SimplifiedSchemaType =
  | SimplifiedSchemaBaseType
  | SimplifiedSchemaArrayType
  | SimplifiedSchemaDocumentType;

/** A field of a {@link SimplifiedSchema}: just its list of type entries. */
export type SimplifiedSchemaField = {
  types: SimplifiedSchemaType[];
};

/** Result shape returned by {@link SchemaAnalyzer.getSimplifiedSchema}. */
export type SimplifiedSchema = {
  [fieldName: string]: SimplifiedSchemaField;
};

/**
 * Schema analyzer class. It is constructed with optional
 * {@link SchemaParseOptions}, accepts documents through `analyzeDoc`, and
 * exposes its findings through `getResult`, `getSchemaPaths` and
 * `getSimplifiedSchema`.
 *
 * NOTE(review): only the signatures are declared here; the behaviour of each
 * member (including when `finalized` flips and what the counters track)
 * must be confirmed against the implementation.
 */
export declare class SchemaAnalyzer {
  semanticTypes: SemanticTypeMap;
  options: AllSchemaParseOptions;
  documentsAnalyzed: number;
  fieldsCount: number;
  schemaAnalysisRoot: SchemaAnalysisRoot;
  finalized: boolean;
  schemaResult: Schema;
  fieldAndTypeAnalysisCounter: number;

  /** @param options Optional partial set of parse options. */
  constructor(options?: SchemaParseOptions);

  allowAbortDuringAnalysis(): void;

  increaseFieldCount(): void;

  /** @returns A type name for `value` found at `path`. */
  getSemanticType(value: BSONValue, path: string[]): string;

  /** Asynchronously processes a single BSON `Document`. */
  analyzeDoc(doc: Document): Promise<void>;

  /** @returns The analysis as a {@link Schema}. */
  getResult(): Schema;

  /** @returns A list of paths, each path being a list of strings. */
  getSchemaPaths(): string[][];

  /** @returns The analysis as a {@link SimplifiedSchema}. */
  getSimplifiedSchema(): SimplifiedSchema;
}

/**
 * Accepts an `AnyIterable` and returns an `AnyIterable`.
 * NOTE(review): presumably validates the source before use — confirm.
 */
export declare function verifyStreamSource(source: AnyIterable): AnyIterable;

/**
 * Resolves to a {@link SchemaAnalyzer} for the given `source`.
 * NOTE(review): presumably the analyzer has already consumed every document
 * from `source` when the promise resolves — confirm.
 *
 * @param source Iterable source handed to the analyzer.
 * @param options Optional partial set of parse options.
 */
export declare function getCompletedSchemaAnalyzer(
  source: AnyIterable,
  options?: SchemaParseOptions
): Promise<SchemaAnalyzer>;

export {};