tdigest-kt

A port of the latest (May 2019) TDigest algorithm to Kotlin Multiplatform.

import * as ByteBuffer from "bytebuffer";

declare namespace Kotlin {
    /**
     * these are partial definitions only:
     * unfortunately no d.ts files are available for the Kotlin/JS standard library
     */
    class Long {
        high_: number;
        low_: number;
        toNumber(): number
    }

    interface Collection<T> {
    }
}

export namespace com.basicio {

    export interface BinaryOutput {
        /**
         * convert the current content of this BinaryOutput
         * (a wrapped ByteBuffer) into a base64-encoded string
         */
        toB64(): string
    }

    export interface BinaryInput {
    }

    /**
     * create a [[BinaryInput]] to be used to deserialize a TDigest
     * @param bb a ByteBuffer containing the data to be deserialized
     */
    export function toBinaryInput(bb: ByteBuffer): BinaryInput;

    /**
     * create a [[BinaryOutput]] to be used to serialize a TDigest, from a
     * preallocated [[ByteBuffer]]
     * @param bb a preallocated ByteBuffer
     */
    export function toBinaryOutput(bb: ByteBuffer): BinaryOutput;
}

export namespace com.tdunning.math.stats {

    export class ScaleFunction {
        /**
         * Generates uniform cluster sizes: for reference only
         */
        static readonly K_0: ScaleFunction;

        /**
         * Generates cluster sizes proportional to sqrt(q*(1-q)). This gives constant relative accuracy if accuracy is
         * proportional to squared cluster size. It is expected that K_2 and K_3 will give better practical results.
         */
        static readonly K_1: ScaleFunction;

        /**
         * Generates cluster sizes proportional to sqrt(q*(1-q)) but avoids computation of asin in the critical path by
         * using an approximate version.
         */
        static readonly K_1_FAST: ScaleFunction;

        /**
         * Generates cluster sizes proportional to q*(1-q). This makes tail error bounds tighter than for K_1. The use of a
         * normalizing function results in a strictly bounded number of clusters no matter how many samples.
         */
        static readonly K_2: ScaleFunction;

        /**
         * Generates cluster sizes proportional to min(q, 1-q). This makes tail error bounds tighter than for K_1 or K_2.
         * The use of a normalizing function results in a strictly bounded number of clusters no matter how many samples.
         */
        static readonly K_3: ScaleFunction;

        /**
         * Generates cluster sizes proportional to q*(1-q). This makes the tail error bounds tighter. This version does not
         * use a normalizer function and thus the number of clusters increases roughly proportionally to log(n). That is
         * good for accuracy, but bad for size and bad for the statically allocated MergingDigest, but can be useful for
         * tree-based implementations.
         */
        static readonly K_2_NO_NORM: ScaleFunction;

        /**
         * Generates cluster sizes proportional to min(q, 1-q). This makes the tail error bounds tighter. This version does
         * not use a normalizer function and thus the number of clusters increases roughly proportionally to log(n). That
         * is good for accuracy, but bad for size and bad for the statically allocated MergingDigest, but can be useful for
         * tree-based implementations.
         */
        static readonly K_3_NO_NORM: ScaleFunction;
    }

    import BinaryOutput = com.basicio.BinaryOutput;
    import BinaryInput = com.basicio.BinaryInput;

    export class Centroid {
        constructor(record: boolean)

        /**
         * @param x a double
         * @param w must be an integer
         */
        add(x: number, w: number): void

        mean(): number

        /**
         * @return an integer
         */
        count(): number

        /**
         * @return an integer
         */
        id(): number
    }

    /**
     * this class does not exist in the original Kotlin code;
     * it has been artificially introduced in order to allow independent
     * companion objects between the base class TDigest and the child classes
     * AVLTreeDigest and MergeDigest, because in TypeScript static members of
     * child classes are assumed to be subtypes of the static members of the
     * parent class. This is obviously not the case in Kotlin.
     */
    export abstract class ITDigest {
        /**
         * Adds a sample to a histogram
         *
         * @param x the sample to add
         * @param w the weight for the sample: MUST BE AN INTEGER
         */
        addWeightedSample(x: number, w: number): void

        updateSample(oldValue: number, newValue: number): void

        /**
         * Re-examines a t-digest to determine whether some centroids are redundant. If your data are
         * perversely ordered, this may be a good idea. Even if not, this may save 20% or so in space.
         *
         * The cost is roughly the same as adding as many data points as there are centroids. This
         * is typically < 10 * compression, but could be as high as 100 * compression.
         *
         * This is a destructive operation that is not thread-safe.
         */
        compress(): void

        /**
         * Returns the number of points that have been added to this TDigest.
         *
         * @return The sum of the weights on all centroids.
         */
        size(): Kotlin.Long

        /**
         * Returns the fraction of all points added which are <= x.
         *
         * @param x The cutoff for the cdf.
         * @return The fraction of all data which is less than or equal to x.
         */
        cdf(x: number): number

        /**
         * Returns an estimate of the cutoff such that a specified fraction of the data
         * added to this TDigest would be less than or equal to the cutoff.
         *
         * @param q The desired fraction
         * @return The value x such that cdf(x) == q
         */
        quantile(q: number): number

        /**
         * A [Collection] that lets you go through the centroids in ascending order by mean. Centroids
         * returned will not be re-used, but may or may not share storage with this TDigest.
         *
         * @return The centroids in the form of a Collection.
         */
        centroids(): Kotlin.Collection<Centroid>

        /**
         * Returns the current compression factor.
         *
         * @return The compression factor originally used to set up the TDigest.
         */
        compression(): number

        /**
         * Returns the number of bytes required to encode this TDigest using #asBytes().
         *
         * @return The number of bytes required (an integer)
         */
        byteSize(): number

        /**
         * Returns the number of bytes required to encode this TDigest using #asSmallBytes().
         *
         * Note that this is just as expensive as actually compressing the digest. If you don't
         * care about time, but want to never over-allocate, this is fine. If you care about compression
         * and speed, you pretty much just have to over-allocate by allocating #byteSize() bytes.
         *
         * @return The number of bytes required (an integer)
         */
        smallByteSize(): number

        /**
         * set the scale function (see method [ScaleFunction$valueOf]);
         * the default scale function is 'K_2'
         * @param scaleFunction
         */
        setScaleFunction(scaleFunction: ScaleFunction): void

        /**
         * Serialize this TDigest into a byte buffer. Note that the serialization used is
         * very straightforward and is considerably larger than strictly necessary.
         *
         * @param buf The byte buffer into which the TDigest should be serialized.
         */
        asBytes(buf: BinaryOutput): void

        /**
         * Serialize this TDigest into a byte buffer. Some simple compression is used,
         * such as using variable-byte representation to store the centroid weights and
         * using delta-encoding on the centroid means so that floats can reasonably be
         * used to store the centroid means.
         *
         * @param buf The byte buffer into which the TDigest should be serialized.
         */
        asSmallBytes(buf: BinaryOutput): void

        /**
         * Add a sample to this TDigest.
         *
         * @param x The data value to add
         */
        addSample(x: number): void

        /**
         * Add all of the centroids of another TDigest to this one.
         *
         * @param other The other TDigest
         */
        addOtherDigest(other: TDigest): void

        centroidCount(): number
    }

    export abstract class TDigest extends ITDigest {
        static readonly Companion: TDigest$Companion
    }

    /**
     * AVLTreeDigest is actually a subclass of TDigest
     */
    export class AVLTreeDigest extends ITDigest {
        static readonly Companion: AVLTreeDigest$Companion
    }

    export class MergeDigest extends ITDigest {
        static readonly Companion: MergeDigest$Companion
    }

    export class TDigest$Companion {
        /**
         * create a TDigest with the specified compression;
         * use [createAvlTreeDigest] or [createMergingDigest] if you need a specific algorithm
         * @param compression
         */
        createDigest(compression: number): TDigest

        createMergingDigest(compression: number): MergeDigest

        createAvlTreeDigest(compression: number): AVLTreeDigest
    }

    export class AVLTreeDigest$Companion {
        /**
         * Reads an [[AVLTreeDigest]] from a [[BinaryInput]]
         */
        fromBytes(buf: BinaryInput): AVLTreeDigest
    }

    export class MergeDigest$Companion {
        /**
         * Reads a [[MergeDigest]] from a [[BinaryInput]]
         */
        fromBytes(buf: BinaryInput): MergeDigest
    }
}
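
A minimal usage sketch built only from the declarations above: create a digest via its companion object, add samples, query quantile/cdf, and round-trip it through the base64 serialization helpers. The import path "tdigest-kt" and all variable names are illustrative assumptions, not part of the published API; only the namespace members shown in the .d.ts come from the source.

import * as ByteBuffer from "bytebuffer";
// assumed entry point; the actual module path of the package may differ
import { com } from "tdigest-kt";

const stats = com.tdunning.math.stats;
const basicio = com.basicio;

// create a MergeDigest with compression factor 100 via its companion object
const digest = stats.MergeDigest.Companion.createMergingDigest(100);

// add 100000 uniform samples; addSample adds each point with weight 1
for (let i = 0; i < 100000; i++) {
    digest.addSample(Math.random());
}

console.log(digest.size().toNumber()); // points added (Kotlin Long -> number)
console.log(digest.quantile(0.5));     // median estimate, ~0.5 for uniform data
console.log(digest.cdf(0.99));         // fraction of samples <= 0.99, ~0.99

// serialize: byteSize() reports how many bytes asBytes() needs, so the
// preallocated ByteBuffer is guaranteed to be large enough
const buf = ByteBuffer.allocate(digest.byteSize());
const out = basicio.toBinaryOutput(buf);
digest.asBytes(out);
const b64 = out.toB64();               // base64 string for storage or transport

// deserialize from the base64 payload
const restored = stats.MergeDigest.Companion.fromBytes(
    basicio.toBinaryInput(ByteBuffer.fromBase64(b64))
);
console.log(restored.quantile(0.5));   // same estimate as the original digest

Note the asymmetry documented above: byteSize() is cheap and may over-report, while smallByteSize() is exact but costs as much as serializing.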
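The AVL-tree variant and the compact encoding follow the same pattern. A sketch under the same assumptions as above, additionally assuming (as in the upstream Java t-digest) that fromBytes recognizes both the verbose and the small encoding:

// AVLTreeDigest with explicit integer weights and a non-default scale function
const avl = stats.AVLTreeDigest.Companion.createAvlTreeDigest(200);
avl.setScaleFunction(stats.ScaleFunction.K_3); // default is K_2
avl.addWeightedSample(12.5, 3);                // weight MUST be an integer
avl.compress();                                // drop redundant centroids

// smallByteSize() is exact but as expensive as serializing; allocate exactly
// that much and write the delta-encoded compact form
const small = ByteBuffer.allocate(avl.smallByteSize());
const smallOut = basicio.toBinaryOutput(small);
avl.asSmallBytes(smallOut);

const avlRestored = stats.AVLTreeDigest.Companion.fromBytes(
    basicio.toBinaryInput(ByteBuffer.fromBase64(smallOut.toB64()))
);
console.log(avlRestored.centroidCount());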