// @allemandi/embed-utils
// Version: (not captured)
// Fast, type-safe utilities for vector embedding comparison and search.
// 219 lines (215 loc) • 8.37 kB
// TypeScript
/**
* Computes the cosine similarity between two vectors.
*
* The score reflects how closely the two vectors point in the same direction
* and ignores their magnitudes: `1` for identical directions, `0` for
* orthogonal vectors, `-1` for diametrically opposed vectors.
* Suited to comparing semantic or normalized vectors (e.g., text embeddings).
*
* If either vector has zero magnitude the result is `0` (see the last example).
*
* NOTE(review): behavior for vectors of different lengths is not stated in
* this declaration — confirm against the implementation.
*
* @public
* @param {number[]} vecA - First vector.
* @param {number[]} vecB - Second vector.
* @returns {number} - Cosine similarity score between `vecA` and `vecB`.
* @example
* computeCosineSimilarity([1, 2, 3], [1, 2, 3]);
* // => 1 (identical vectors)
* computeCosineSimilarity([1, 0], [0, 1]);
* // => 0 (orthogonal vectors)
* computeCosineSimilarity([1, 2], [2, 3]);
* // => 0.992...
* computeCosineSimilarity([1, 0], [-1, 0]);
* // => -1 (vectors diametrically opposed)
* computeCosineSimilarity([0, 0], [1, 2]);
* // => 0 (one vector has zero magnitude)
*/
declare function computeCosineSimilarity(vecA: number[], vecB: number[]): number;
/**
* Computes the Euclidean (straight-line) distance between two vectors.
*
* Unlike cosine similarity, this metric is affected by both the magnitude and
* the direction of the vectors. Suited to raw numeric data or spatial
* coordinates. Identical vectors yield `0`.
*
* NOTE(review): behavior for vectors of different lengths is not stated in
* this declaration — confirm against the implementation.
*
* @public
* @param {number[]} vecA - First vector.
* @param {number[]} vecB - Second vector.
* @returns {number} - Euclidean distance between `vecA` and `vecB`.
* @example
* computeEuclideanDistance([1, 2], [4, 6]);
* // => 5 (distance between (1,2) and (4,6))
* computeEuclideanDistance([0, 0], [0, 0]);
* // => 0 (identical vectors)
* computeEuclideanDistance([1, 0], [0, 1]);
* // => 1.414...
* computeEuclideanDistance([1, 2, 3], [4, 5, 6]);
* // => 5.196...
*/
declare function computeEuclideanDistance(vecA: number[], vecB: number[]): number;
/**
* Computes the Manhattan distance between two vectors.
*
* The result is the sum of the absolute differences between corresponding
* components. Suited to grid-like data, or when a metric that is less
* sensitive to large differences is wanted. Identical vectors yield `0`.
*
* NOTE(review): behavior for vectors of different lengths is not stated in
* this declaration — confirm against the implementation.
*
* @public
* @param {number[]} vecA - First vector.
* @param {number[]} vecB - Second vector.
* @returns {number} - Manhattan distance between `vecA` and `vecB`.
* @example
* computeManhattanDistance([1, 2, 3], [4, 5, 6]);
* // => 9
* computeManhattanDistance([1, 0], [0, 1]);
* // => 2
* computeManhattanDistance([1, 2], [1, 2]);
* // => 0 (identical vectors)
* computeManhattanDistance([1, -1], [-1, 1]);
* // => 4
*/
declare function computeManhattanDistance(vecA: number[], vecB: number[]): number;
/**
* Normalizes a vector to unit length.
*
* A vector with zero magnitude cannot be scaled, so it is returned unchanged
* (see the `[0, 0]` example).
*
* NOTE(review): the original wording says the zero-magnitude case "returns the
* original vector" while `@returns` promises "a new vector"; whether the same
* array reference or a copy comes back in that case is not visible from this
* declaration — confirm against the implementation before relying on it.
*
* @public
* @param {number[]} vec - Input vector.
* @returns {number[]} - A new vector scaled to unit length.
* @example
* normalizeVector([3, 4]);
* // => [0.6, 0.8] (vector normalized to length 1)
* normalizeVector([0, 0]);
* // => [0, 0] (zero vector remains unchanged)
* normalizeVector([1, 1, 1]);
* // => [0.5773502691896258, 0.5773502691896258, 0.5773502691896258]
*/
declare function normalizeVector(vec: number[]): number[];
/**
* Checks whether a vector is L2-normalized, i.e. has unit length.
*
* The comparison is tolerant of floating-point error: the vector counts as
* normalized when its L2 norm is within `epsilon` of 1. A zero vector is
* never normalized.
*
* NOTE(review): whether the `epsilon` bound is inclusive is not stated in this
* declaration — confirm against the implementation.
*
* @public
* @param {number[]} vec - Input vector.
* @param {number} [epsilon=1e-6] - Tolerance for floating-point comparison; documented default is `1e-6` when omitted.
* @returns {boolean} - True if the L2 norm is within epsilon of 1.
* @example
* isNormalized([1, 0]);
* // => true (vector length is exactly 1)
* isNormalized([0.6, 0.8]);
* // => true (approximately unit length)
* isNormalized([3, 4]);
* // => false (length is 5)
* isNormalized([0, 0]);
* // => false (length is 0)
*/
declare function isNormalized(vec: number[], epsilon?: number): boolean;
/**
* Computes the mean (centroid) vector of an array of vectors.
*
* All input vectors are assumed to have the same length. An empty input array
* yields an empty vector (see the second example).
*
* NOTE(review): the result for input vectors of unequal length is not stated
* in this declaration — confirm against the implementation.
*
* @public
* @param {number[][]} vectors - Array of input vectors.
* @returns {number[]} - The mean vector.
* @example
* meanVector([[1, 2], [3, 4], [5, 6]]);
* // => [3, 4]
* meanVector([]);
* // => []
*/
declare function meanVector(vectors: number[][]): number[];
/**
* Finds the nearest neighbors to a given query embedding from a list of samples
* based on the specified distance/similarity method.
*
* Supported methods:
*
* `'cosine'`: Cosine similarity (higher = more similar, range: [-1, 1]).
*
* `'euclidean'`: Euclidean distance (lower = closer, ≥ 0).
*
* `'manhattan'`: Manhattan distance (lower = closer, ≥ 0).
*
* As the examples show, results carry `similarityScore` when the method is
* `'cosine'` and `distance` when it is `'euclidean'` or `'manhattan'`.
*
* NOTE(review): whether the `threshold` comparison is inclusive is not stated
* in this declaration — confirm against the implementation.
*
* @public
* @param {number[]} queryEmbedding - The embedding vector to compare against.
* @param {{ embedding: number[], label: string }[]} samples - An array of samples, each with an `embedding` and a `label`.
* @param {object} [options={}] - Optional settings.
* @param {number} [options.topK=1] - Number of top results to return. Default is 1.
* @param {number} [options.threshold] - Minimum similarity score threshold for results (cosine) or maximum distance threshold (euclidean/manhattan).
* @param {'cosine' | 'euclidean' | 'manhattan'} [options.method='cosine'] - The metric to compute (one of the methods listed above). Default is 'cosine'.
* @returns {{ embedding: number[], label: string, similarityScore?: number, distance?: number }[]} - An array of nearest neighbors with scores/distances.
* @example
* const samples = [
* { embedding: [1, 0], label: 'A' },
* { embedding: [0, 1], label: 'B' },
* { embedding: [1, 1], label: 'C' },
* ];
*
* // Default cosine similarity
* findNearestNeighbors([1, 0], samples);
* // => [{ embedding: [1, 0], label: 'A', similarityScore: 1 }]
*
* // Euclidean distance
* findNearestNeighbors([1, 0], samples, { method: 'euclidean', topK: 2 });
* // => [
* // { embedding: [1, 0], label: 'A', distance: 0 },
* // { embedding: [1, 1], label: 'C', distance: 1 }
* // ]
*
* // Manhattan distance with threshold
* // NOTE(review): two results are shown here although `topK` is omitted and
* // documented to default to 1 — confirm which of the two is accurate.
* findNearestNeighbors([1, 0], samples, { method: 'manhattan', threshold: 1.5 });
* // => [{ embedding: [1, 0], label: 'A', distance: 0 }, { embedding: [1, 1], label: 'C', distance: 1 }]
*
* // Cosine with threshold
* findNearestNeighbors([1, 0], samples, { threshold: 0.9 });
* // => [{ embedding: [1, 0], label: 'A', similarityScore: 1 }]
*/
declare function findNearestNeighbors(queryEmbedding: number[], samples: {
embedding: number[];
label: string;
}[], options?: {
topK?: number | undefined;
threshold?: number | undefined;
method?: "cosine" | "euclidean" | "manhattan" | undefined;
}): {
embedding: number[];
label: string;
similarityScore?: number;
distance?: number;
}[];
/**
* Ranks all samples by similarity/distance to the query embedding.
*
* Every sample is returned, best match first: no threshold or topK filtering
* is applied. As the examples show, "best" means the highest `similarityScore`
* for `'cosine'` and the lowest `distance` for `'euclidean'` / `'manhattan'`,
* and each result carries only the field that matches the chosen method.
*
* @public
* @param {number[]} queryEmbedding - The embedding vector to compare against.
* @param {{ embedding: number[], label: string }[]} samples - Samples with embeddings and labels.
* @param {object} [options={}] - Optional settings.
* @param {'cosine' | 'euclidean' | 'manhattan'} [options.method='cosine'] - Distance/similarity method to use. Default is 'cosine'.
* @returns {{ embedding: number[], label: string, similarityScore?: number, distance?: number }[]} Sorted by best match first.
* @example
* const samples = [
* { embedding: [1, 0], label: 'A' },
* { embedding: [0, 1], label: 'B' },
* { embedding: [1, 1], label: 'C' },
* ];
*
* // Default cosine similarity
* rankBySimilarity([1, 0], samples);
* // => [
* // { embedding: [1, 0], label: 'A', similarityScore: 1 },
* // { embedding: [1, 1], label: 'C', similarityScore: 0.707... },
* // { embedding: [0, 1], label: 'B', similarityScore: 0 }
* // ]
*
* // Euclidean distance
* rankBySimilarity([1, 0], samples, { method: 'euclidean' });
* // => [
* // { embedding: [1, 0], label: 'A', distance: 0 },
* // { embedding: [1, 1], label: 'C', distance: 1 },
* // { embedding: [0, 1], label: 'B', distance: 1.414... }
* // ]
*
* // Manhattan distance
* rankBySimilarity([0, 1], samples, { method: 'manhattan' });
* // => [
* // { embedding: [0, 1], label: 'B', distance: 0 },
* // { embedding: [1, 1], label: 'C', distance: 1 },
* // { embedding: [1, 0], label: 'A', distance: 2 }
* // ]
*/
declare function rankBySimilarity(queryEmbedding: number[], samples: {
embedding: number[];
label: string;
}[], options?: {
method?: "cosine" | "euclidean" | "manhattan" | undefined;
}): {
embedding: number[];
label: string;
similarityScore?: number;
distance?: number;
}[];
// Named exports of this module: the eight vector utility functions declared in this file.
export { computeCosineSimilarity, computeEuclideanDistance, computeManhattanDistance, findNearestNeighbors, isNormalized, meanVector, normalizeVector, rankBySimilarity };