cvm-lib
Version:
Estimate the number of distinct values in a set using the simple and space-efficient CVM algorithm
1 lines • 11.3 kB
Source Map (JSON)
{"version":3,"sources":["../../src/index.ts","../../src/is.ts","../../src/capacity.ts","../../src/estimator.ts"],"sourcesContent":["export { calculateCapacity } from \"./capacity\";\n\nexport { Estimator } from \"./estimator\";\n\nexport type { EstimatorConfig } from \"./types/estimatorConfig\";\n\nexport type { SampleSet } from \"./types/sampleSet\";\n","/**\n * Returns `true` if the value passed is between 0 and 1, `false` otherwise.\n *\n * @param number - A numeric value.\n */\nexport function isFraction(number: unknown): number is number {\n return typeof number === \"number\" && number > 0 && number < 1;\n}\n\n/**\n * Returns `true` if the value passed is a positive number, `false` otherwise.\n *\n * @param number - A numeric value.\n */\nexport function isPositive(number: unknown): number is number {\n return typeof number === \"number\" && number > 0;\n}\n\n/**\n * Returns `true` if the value passed is a positive integer, `false` otherwise.\n *\n * @param number - A numeric value.\n */\nexport function isPositiveInt(number: unknown): number is number {\n return Number.isInteger(number) && (number as number) > 0;\n}\n","import { isFraction, isPositive } from \"./is\";\n\n/**\n * Calculates the space required to estimate the number of\n * distinct values in a set with a given accuracy and confidence.\n *\n * @param n - The total number of values in the set, or an estimate if unknown.\n *\n * - Must be a positive number.\n * - If unknown, an overestimate is better, but requires more space.\n *\n * @param epsilon - An estimate's relative error. Controls accuracy.\n *\n * - Must be between 0 and 1.\n * - Smaller values equal more accuracy but more required space.\n * - Defaults to `0.05` (i.e. 95% accuracy; estimates can range within ±5% of the true value).\n *\n * @param delta - The probability an estimate is not accurate. Controls confidence.\n *\n * - Must be between 0 and 1.\n * - Smaller values equal higher confidence but more required space.\n * - Defaults to `0.01` (i.e. 99% confidence; there is a 1% chance an estimate is less accurate than expected).\n *\n * @returns The calculated capacity.\n *\n * @throws A {@link RangeError} for any of the following:\n * - `n` is not a positive number.\n * - `epsilon` is not between 0 and 1.\n * - `delta` is not between 0 and 1.\n *\n * @example\n * ```javascript\n * // Get the capacity for estimating the number\n * // of distinct values in a set of 1 billion.\n * // Estimates will have a 99% probability of\n * // being within ±5% of the actual number.\n * const capacity = calculateCapacity(1e9); // 14,617\n * ```\n *\n * @example\n * ```javascript\n * // Get the capacity for estimating the number\n * // of distinct values in a set of 1 billion.\n * // Estimates will have a 99% probability of\n * // being within ±10% of the actual number.\n * const capacity = calculateCapacity(1e9, 0.1); // 3,655\n * ```\n *\n * @example\n * ```javascript\n * // Get the capacity for estimating the number\n * // of distinct values in a set of 1 billion.\n * // Estimates will have an 80% probability of\n * // being within ±5% of the actual number.\n * const capacity = calculateCapacity(1e9, 0.05, 0.2); // 12,888\n * ```\n *\n * @example\n * ```javascript\n * // Get the capacity for estimating the number\n * // of distinct values in a set of 1 billion.\n * // Estimates will have a 99.999% probability of\n * // being within ±1% of the actual number.\n * const capacity = calculateCapacity(1e9, 0.01, 0.00001); // 465,070\n * ```\n */\nexport function calculateCapacity(\n n: number,\n epsilon = 0.05,\n delta = 0.01,\n): number {\n if (!isPositive(n)) {\n throw new RangeError(\"Invalid n\");\n }\n if (!isFraction(epsilon)) {\n throw new RangeError(\"Invalid epsilon\");\n }\n if (!isFraction(delta)) {\n throw new RangeError(\"Invalid delta\");\n }\n return Math.min(n, Math.ceil(Math.log2(n / delta) / epsilon ** 2));\n}\n","// eslint-disable-next-line @typescript-eslint/no-unused-vars\nimport { calculateCapacity } from \"./capacity\";\nimport type { EstimatorConfig } from \"./types/estimatorConfig\";\nimport { isFraction, isPositiveInt } from \"./is\";\nimport type { SampleSet } from \"./types/sampleSet\";\n\n/**\n * Estimates the number of distinct values in a set using the CVM algorithm.\n */\nexport class Estimator<T> {\n /**\n * The maximum number of samples in memory.\n */\n protected _capacity: number;\n\n /**\n * The random number generator function.\n *\n * @defaultValue `Math.random`\n */\n protected _randomFn: () => number;\n\n /**\n * The current sample rate.\n *\n * @defaultValue Initializes to `1`.\n */\n protected _rate: number;\n\n /**\n * The given sample rate.\n *\n * @defaultValue `0.5`\n */\n protected _sampleRate: number;\n\n /**\n * The set of samples in memory.\n *\n * @defaultValue `new Set<T>()`\n */\n protected _samples: SampleSet<T>;\n\n /**\n * @param capacity - The maximum number of samples in memory. Must be a positive integer.\n *\n * @throws A {@link RangeError} if `capacity` is not a positive integer.\n */\n constructor(capacity: number);\n /**\n * @param config - An {@link EstimatorConfig} configuration object.\n *\n * @throws A {@link RangeError} if a given configuration is not within their expected range.\n */\n constructor(config: EstimatorConfig<T>);\n constructor(config: number | EstimatorConfig<T>) {\n // Initialize with defaults\n this._capacity = 1;\n this._rate = 1;\n this._randomFn = Math.random;\n this._sampleRate = 0.5;\n\n // Apply capacity\n if (typeof config === \"number\") {\n this.capacity = config;\n this._samples = new Set();\n return;\n }\n\n // Apply config object\n this.capacity = config.capacity;\n this.randomFn = config.randomFn ?? this._randomFn;\n this.sampleRate = config.sampleRate ?? this._sampleRate;\n this._samples = config.storage ?? new Set();\n }\n\n /**\n * Gets capacity.\n */\n get capacity(): number {\n return this._capacity;\n }\n\n /**\n * Sets capacity. Must be a positive integer.\n *\n * This should be calculated via {@link calculateCapacity} but\n * can also be set arbitrarily. In general, larger\n * values give more accurate estimates.\n *\n * @throws A {@link RangeError} if not given a positive integer.\n */\n protected set capacity(capacity: number) {\n if (!isPositiveInt(capacity)) {\n throw new RangeError(`Invalid capacity`);\n }\n this._capacity = capacity;\n }\n\n /**\n * Gets the random number generator function.\n */\n get randomFn(): () => number {\n return this._randomFn;\n }\n\n /**\n * Sets the random number generator function.\n *\n * The function should return random or pseudorandom values between 0 and 1.\n * Otherwise, this may cause unintended behavior such as invalid estimates.\n */\n set randomFn(randomFn: () => number) {\n this._randomFn = randomFn;\n }\n\n /**\n * Gets the sample rate.\n */\n get sampleRate(): number {\n return this._sampleRate;\n }\n\n /**\n * Sets the sample rate. Must be between 0 and 1.\n *\n * @remarks Custom values may negatively affect accuracy. In general, the\n * further from `0.5`, the more it's affected. If {@link capacity} was\n * calculated via {@link calculateCapacity}, expected accuracy / confidence\n * may be invalidated.\n *\n * @throws A {@link RangeError} if not given a number between 0 and 1.\n */\n protected set sampleRate(sampleRate: number) {\n if (!isFraction(sampleRate)) {\n throw new RangeError(`Invalid sample rate`);\n }\n this._sampleRate = sampleRate;\n }\n\n /**\n * Gets the number of samples in memory.\n */\n get size(): number {\n return this._samples.size;\n }\n\n /**\n * Add a value.\n *\n * Given values may be randomly selected for sampling. If selected,\n * the value is stored internally. Otherwise, they are ignored, or\n * discarded if previously selected.\n *\n * If capacity is reached, samples are resampled,\n * and only values that are again selected are kept.\n * This process repeats until free space is made.\n *\n * @param value - The value to add.\n *\n * @returns The instance.\n */\n add(value: T): this {\n // Ignore / remove value if not sampled\n if (this._randomFn() >= this._rate) {\n this._samples.delete(value);\n return this;\n }\n\n // Add sample\n this._samples.add(value);\n\n // While at capacity\n while (this._samples.size >= this._capacity) {\n // Reduce samples to within capacity\n for (const value of this._samples) {\n if (this._randomFn() >= this._sampleRate) {\n this._samples.delete(value);\n }\n }\n\n // Update current sampling rate\n this._rate *= this._sampleRate;\n }\n\n return this;\n }\n\n /**\n * Clears / resets the instance.\n */\n clear(): void {\n this._rate = 1;\n this._samples.clear();\n }\n\n /**\n * Gets the estimated number of distinct values.\n */\n estimate(): number {\n return this._samples.size / this._rate;\n }\n}\n"],"mappings":";ijBAAA,IAAAA,EAAA,GAAAC,EAAAD,EAAA,eAAAE,EAAA,sBAAAC,IAAA,eAAAC,EAAAJ,GCKO,SAASK,EAAWC,EAAmC,CAC5D,OAAO,OAAOA,GAAW,UAAYA,EAAS,GAAKA,EAAS,CAC9D,CAOO,SAASC,EAAWD,EAAmC,CAC5D,OAAO,OAAOA,GAAW,UAAYA,EAAS,CAChD,CAOO,SAASE,EAAcF,EAAmC,CAC/D,OAAO,OAAO,UAAUA,CAAM,GAAMA,EAAoB,CAC1D,CCyCO,SAASG,EACdC,EACAC,EAAU,IACVC,EAAQ,IACA,CACR,GAAI,CAACC,EAAWH,CAAC,EACf,MAAM,IAAI,WAAW,WAAW,EAElC,GAAI,CAACI,EAAWH,CAAO,EACrB,MAAM,IAAI,WAAW,iBAAiB,EAExC,GAAI,CAACG,EAAWF,CAAK,EACnB,MAAM,IAAI,WAAW,eAAe,EAEtC,OAAO,KAAK,IAAIF,EAAG,KAAK,KAAK,KAAK,KAAKA,EAAIE,CAAK,EAAID,GAAW,CAAC,CAAC,CACnE,CCxEO,IAAMI,EAAN,KAAmB,CA8CxB,YAAYC,EAAqC,CA1CjDC,EAAA,KAAU,aAOVA,EAAA,KAAU,aAOVA,EAAA,KAAU,SAOVA,EAAA,KAAU,eAOVA,EAAA,KAAU,YAsBR,GANA,KAAK,UAAY,EACjB,KAAK,MAAQ,EACb,KAAK,UAAY,KAAK,OACtB,KAAK,YAAc,GAGf,OAAOD,GAAW,SAAU,CAC9B,KAAK,SAAWA,EAChB,KAAK,SAAW,IAAI,IACpB,MACF,CAGA,KAAK,SAAWA,EAAO,SACvB,KAAK,SAAWA,EAAO,UAAY,KAAK,UACxC,KAAK,WAAaA,EAAO,YAAc,KAAK,YAC5C,KAAK,SAAWA,EAAO,SAAW,IAAI,GACxC,CAKA,IAAI,UAAmB,CACrB,OAAO,KAAK,SACd,CAWA,IAAc,SAASE,EAAkB,CACvC,GAAI,CAACC,EAAcD,CAAQ,EACzB,MAAM,IAAI,WAAW,kBAAkB,EAEzC,KAAK,UAAYA,CACnB,CAKA,IAAI,UAAyB,CAC3B,OAAO,KAAK,SACd,CAQA,IAAI,SAASE,EAAwB,CACnC,KAAK,UAAYA,CACnB,CAKA,IAAI,YAAqB,CACvB,OAAO,KAAK,WACd,CAYA,IAAc,WAAWC,EAAoB,CAC3C,GAAI,CAACC,EAAWD,CAAU,EACxB,MAAM,IAAI,WAAW,qBAAqB,EAE5C,KAAK,YAAcA,CACrB,CAKA,IAAI,MAAe,CACjB,OAAO,KAAK,SAAS,IACvB,CAiBA,IAAIE,EAAgB,CAElB,GAAI,KAAK,UAAU,GAAK,KAAK,MAC3B,YAAK,SAAS,OAAOA,CAAK,EACnB,KAOT,IAHA,KAAK,SAAS,IAAIA,CAAK,EAGhB,KAAK,SAAS,MAAQ,KAAK,WAAW,CAE3C,QAAWA,KAAS,KAAK,SACnB,KAAK,UAAU,GAAK,KAAK,aAC3B,KAAK,SAAS,OAAOA,CAAK,EAK9B,KAAK,OAAS,KAAK,WACrB,CAEA,OAAO,IACT,CAKA,OAAc,CACZ,KAAK,MAAQ,EACb,KAAK,SAAS,MAAM,CACtB,CAKA,UAAmB,CACjB,OAAO,KAAK,SAAS,KAAO,KAAK,KACnC,CACF","names":["index_exports","__export","Estimator","calculateCapacity","__toCommonJS","isFraction","number","isPositive","isPositiveInt","calculateCapacity","n","epsilon","delta","isPositive","isFraction","Estimator","config","__publicField","capacity","isPositiveInt","randomFn","sampleRate","isFraction","value"]}