string-probability
Version:
A TypeScript library to calculate Jaro-Winkler distance and string similarity probabilities between two strings.
483 lines (409 loc) • 17.5 kB
text/typescript
import { describe, expect, test } from "bun:test";
import { distance, probability } from "../src"; // ๐ฏ Replace with your actual module path!
// Welcome to the most adorable string similarity tests ever!
/**
 * Specification for `distance(a, b, prefixScale?)`.
 *
 * The suite pins the contract these tests rely on: identical inputs score 0,
 * inputs with nothing in common score (approximately) 1, every score lies in
 * [0, 1], and a shared prefix lowers the score. The optional third argument
 * is exercised only by the prefix-scaling test.
 */
describe("๐ญ Jaro-Winkler Distance Function", () => {
  // Baselines: identity, total mismatch, and empty input.
  describe("โจ Basic functionality", () => {
    test("should return 0 for identical strings ๐", () => {
      for (const sample of ["hello", "", "a", "TypeScript"]) {
        expect(distance(sample, sample)).toBe(0);
      }
    });

    test("should return 1 for completely different strings ๐", () => {
      const disjoint: Array<[string, string]> = [
        ["abc", "xyz"],
        ["hello", "12345"],
        ["cat", "dog"],
      ];
      for (const [left, right] of disjoint) {
        expect(distance(left, right)).toBeCloseTo(1, 5);
      }
    });

    test("should return 1 when one string is empty ๐ซฅ", () => {
      // [left, right, expected]; the last row checks that two empty strings
      // count as identical rather than as a mismatch.
      const cases: Array<[string, string, number]> = [
        ["", "hello", 1],
        ["world", "", 1],
        ["", "", 0],
      ];
      for (const [left, right, expected] of cases) {
        expect(distance(left, right)).toBe(expected);
      }
    });
  });

  // Transpositions and the prefix bonus.
  describe("๐ค Character matching & transpositions", () => {
    test("should handle simple character matches ๐ฏ", () => {
      // One adjacent swap: the strings differ, but only slightly.
      const swapped = distance("martha", "marhta");
      expect(swapped).toBeGreaterThan(0);
      expect(swapped).toBeLessThan(0.1);
    });

    test("should handle prefix bonus correctly ๐", () => {
      const sharedPrefix = distance("hello", "help");
      const noSharedPrefix = distance("ello", "elp");
      // The pair with a common leading character must score as closer.
      expect(sharedPrefix).toBeLessThan(noSharedPrefix);
    });

    test("should respect prefix scaling parameter ๐", () => {
      const gentle = distance("hello", "help", 0.05);
      const strong = distance("hello", "help", 0.2);
      // A larger scale rewards the shared prefix more, so the distance drops.
      expect(strong).toBeLessThan(gentle);
    });
  });

  // Degenerate and awkward inputs.
  describe("๐ช Edge cases & special scenarios", () => {
    test("should handle single character strings ๐ฏ", () => {
      expect(distance("a", "b")).toBe(1);
      expect(distance("a", "a")).toBe(0);
      expect(distance("x", "xy")).toBeGreaterThan(0);
      expect(distance("x", "xy")).toBeLessThan(1);
    });

    test("should handle strings with different lengths gracefully ๐", () => {
      const shortVsLong = distance("short", "muchlongerstring");
      const longVsShort = distance("verylongstringhere", "tiny");
      for (const score of [shortVsLong, longVsShort]) {
        expect(score).toBeGreaterThan(0);
        expect(score).toBeLessThanOrEqual(1);
      }
    });

    test("should handle strings with repeated characters ๐", () => {
      const oneSubstitution = distance("aaaa", "aaab");
      const alternating = distance("abab", "baba");
      for (const score of [oneSubstitution, alternating]) {
        expect(score).toBeGreaterThan(0);
        expect(score).toBeLessThan(0.5);
      }
    });

    test("should handle unicode characters properly ๐", () => {
      expect(distance("cafรฉ", "cafe")).toBeGreaterThan(0);
      expect(distance("๐ฆ", "๐ฆ")).toBe(0);
      expect(distance("๐โจ", "โจ๐")).toBeGreaterThan(0);
    });

    test("should handle case sensitivity ๐ค", () => {
      // Differing only in the case of the first letter: neither equal nor disjoint.
      const caseOnly = distance("Hello", "hello");
      expect(caseOnly).toBeGreaterThan(0);
      expect(caseOnly).toBeLessThan(1);
    });
  });

  // Range and symmetry invariants.
  describe("๐งฎ Mathematical properties", () => {
    test("should always return values between 0 and 1 ๐", () => {
      const samples: Array<[string, string]> = [
        ["", "test"],
        ["a", "z"],
        ["hello", "world"],
        ["typescript", "javascript"],
        ["similar", "similiar"],
        ["completely", "different"],
        ["๐", "party"],
      ];
      for (const [left, right] of samples) {
        const score = distance(left, right);
        expect(score).toBeGreaterThanOrEqual(0);
        expect(score).toBeLessThanOrEqual(1);
      }
    });

    test("should be symmetric (mostly) ๐ช", () => {
      // A small tolerance is allowed instead of demanding exact equality.
      const samples: Array<[string, string]> = [
        ["hello", "world"],
        ["abc", "def"],
        ["similar", "similiar"],
      ];
      for (const [left, right] of samples) {
        const forward = distance(left, right);
        const backward = distance(right, left);
        expect(Math.abs(forward - backward)).toBeLessThan(0.001);
      }
    });
  });

  // Reference values and realistic misspellings.
  describe("๐ฏ Known examples with expected results", () => {
    test("should match known Jaro-Winkler calculations ๐", () => {
      // [left, right, expected distance, decimal digits of precision]
      const references: Array<[string, string, number, number]> = [
        ["martha", "marhta", 0.0389, 3],
        ["dixon", "dicksonx", 0.18666667, 2],
        ["dwayne", "duane", 0.1555, 2],
      ];
      for (const [left, right, expected, digits] of references) {
        expect(distance(left, right)).toBeCloseTo(expected, digits);
      }
    });

    test("should handle common typo scenarios ๐๏ธ", () => {
      // [correct spelling, misspelling, exclusive upper bound on distance]
      const misspellings: Array<[string, string, number]> = [
        ["receive", "recieve", 0.1], // swapped letters
        ["definitely", "definately", 0.15], // substituted letter
        ["separate", "seperate", 0.1], // substituted letter
      ];
      for (const [correct, typo, ceiling] of misspellings) {
        const score = distance(correct, typo);
        expect(score).toBeLessThan(ceiling);
        expect(score).toBeGreaterThan(0);
      }
    });
  });
});
/**
 * Specification for `probability(a, b, options?)`.
 *
 * Covers the three modes exercised here ("standard", "alpha", "beta"), the
 * optional numeric `value` tuning parameter, and the currently pinned
 * behaviour for empty inputs: standard mode is expected to yield NaN for two
 * empty strings, while alpha and beta modes are expected to yield 1.
 */
describe("๐ฒ Probability Function Suite", () => {
  describe("๐ Standard mode behavior", () => {
    test("should return 1 for identical strings ๐", () => {
      expect(probability("hello", "hello")).toBeCloseTo(1, 5);
      expect(probability("test", "test")).toBeCloseTo(1, 5);
      // Empty-vs-empty is covered in its own tests below because standard
      // mode is expected to produce NaN there.
    });

    test("should return values between 0 and 1 ๐", () => {
      const testPairs: Array<[string, string]> = [
        ["abc", "xyz"],
        ["hello", "world"],
        ["similar", "similiar"],
        ["cat", "bat"],
        ["", "test"],
      ];
      testPairs.forEach(([str1, str2]) => {
        const result = probability(str1, str2);
        expect(result).toBeGreaterThanOrEqual(0);
        expect(result).toBeLessThanOrEqual(1);
      });
    });

    test("should give higher probability to more similar strings ๐", () => {
      const similar = probability("hello", "hallo");
      const different = probability("hello", "xyz");
      expect(similar).toBeGreaterThan(different);
      // Thresholds reflect observed behaviour of the implementation.
      expect(similar).toBeGreaterThan(0.7); // Similar strings should score high
      expect(different).toBeGreaterThan(0); // Dissimilar strings still score above zero
    });
  });

  describe("๐ Alpha mode (exponential decay)", () => {
    test("should work with default alpha value ๐ฏ", () => {
      const result = probability("test", "best", { mode: "alpha" });
      expect(result).toBeGreaterThan(0);
      expect(result).toBeLessThan(1);
    });

    test("should respect custom alpha values ๐๏ธ", () => {
      const lowAlpha = probability("test", "best", {
        mode: "alpha",
        value: 0.5,
      });
      const highAlpha = probability("test", "best", {
        mode: "alpha",
        value: 5.0,
      });
      expect(lowAlpha).toBeGreaterThan(highAlpha); // Lower alpha = more forgiving
    });

    test("should produce smooth exponential decay ๐", () => {
      const alphaValues = [0.5, 1.0, 2.0, 5.0];
      const results = alphaValues.map((alpha) =>
        probability("hello", "world", { mode: "alpha", value: alpha })
      );
      // Results should not increase as alpha increases. A seedless `reduce`
      // visits each consecutive (previous, current) pair with both values
      // typed as `number`, so no indexed access or `@ts-expect-error`
      // suppression is needed.
      results.reduce((previous, current) => {
        expect(current).toBeLessThanOrEqual(previous);
        return current;
      });
    });
  });

  describe("๐ช Beta mode (power formula)", () => {
    test("should work with default beta value ๐ฏ", () => {
      const result = probability("test", "best", { mode: "beta" });
      expect(result).toBeGreaterThan(0);
      expect(result).toBeLessThan(1);
    });

    test("should respect custom beta values ๐๏ธ", () => {
      const lowBeta = probability("test", "best", { mode: "beta", value: 0.5 });
      const highBeta = probability("test", "best", {
        mode: "beta",
        value: 3.0,
      });
      // Beta < 1 is stricter, beta > 1 is more forgiving
      expect(lowBeta).toBeLessThan(highBeta);
    });

    test("should handle edge beta values gracefully ๐", () => {
      const betaOne = probability("hello", "world", {
        mode: "beta",
        value: 1.0,
      });
      const betaTwo = probability("hello", "world", {
        mode: "beta",
        value: 2.0,
      });
      expect(betaOne).toBeGreaterThan(0);
      expect(betaTwo).toBeGreaterThan(0);
      expect(betaTwo).toBeGreaterThan(betaOne); // Higher beta = more forgiving
    });
  });

  describe("๐ญ Mode comparison & consistency", () => {
    test("should handle mode parameter correctly ๐ช", () => {
      const standard = probability("test", "best");
      const alpha = probability("test", "best", { mode: "alpha" });
      const beta = probability("test", "best", { mode: "beta" });
      // All should be valid probabilities
      [standard, alpha, beta].forEach((result) => {
        expect(result).toBeGreaterThanOrEqual(0);
        expect(result).toBeLessThanOrEqual(1);
      });
    });

    test("should default to standard mode when mode is not specified ๐ฏ", () => {
      const explicit = probability("hello", "world", { mode: "standard" });
      const implicit = probability("hello", "world");
      expect(explicit).toBeCloseTo(implicit, 10);
    });

    test("all modes should agree on identical strings ๐", () => {
      const modes: Array<{
        mode?: "standard" | "alpha" | "beta";
        value?: number;
      }> = [
        { mode: "standard" },
        { mode: "alpha", value: 1.0 },
        { mode: "beta", value: 2.0 },
      ];
      modes.forEach((options) => {
        const result = probability("identical", "identical", options);
        expect(result).toBeCloseTo(1, 5);
      });

      // Empty-vs-empty is checked per mode because the modes disagree here.
      const emptyStandardResult = probability("", "", { mode: "standard" });
      const emptyAlphaResult = probability("", "", {
        mode: "alpha",
        value: 1.0,
      });
      const emptyBetaResult = probability("", "", { mode: "beta", value: 2.0 });
      // Standard mode is expected to give NaN for two empty strings.
      expect(emptyStandardResult).toBeNaN();
      // Alpha and beta modes should give 1 for identical strings (even empty ones)
      expect(emptyAlphaResult).toBeCloseTo(1, 5);
      expect(emptyBetaResult).toBeCloseTo(1, 5);
    });
  });

  describe("๐จ Real-world scenarios", () => {
    test("should handle name matching scenarios ๐ฅ", () => {
      // [first name, second name, exclusive lower bound on probability]
      const names: Array<[string, string, number]> = [
        ["John", "Jon", 0.7], // Common nickname
        ["Katherine", "Catherine", 0.8], // Common spelling variation
        ["Smith", "Smyth", 0.7], // Phonetic similarity
      ];
      names.forEach(([name1, name2, minProbability]) => {
        const result = probability(name1, name2);
        expect(result).toBeGreaterThan(minProbability);
      });
    });

    test("should handle product name matching ๐๏ธ", () => {
      // [first product, second product, exclusive lower bound on probability]
      const products: Array<[string, string, number]> = [
        ["iPhone 12", "iphone12", 0.8],
        ["MacBook Pro", "Macbook pro", 0.8],
        ["AirPods", "Air Pods", 0.7],
      ];
      products.forEach(([product1, product2, minProbability]) => {
        const result = probability(product1, product2);
        expect(result).toBeGreaterThan(minProbability);
      });
    });

    test("should detect fuzzy duplicates effectively ๐", () => {
      const duplicates: Array<[string, string]> = [
        ["JavaScript Developer", "Javascript developer"],
        ["San Francisco, CA", "San Francisco CA"],
        ["john.doe@email.com", "john.doe@email.co"],
      ];
      duplicates.forEach(([str1, str2]) => {
        const result = probability(str1, str2);
        expect(result).toBeGreaterThan(0.6); // Should detect as likely duplicates
      });
    });
  });

  describe("๐งช Edge cases & error handling", () => {
    test("should handle empty strings gracefully ๐ซฅ", () => {
      const emptyEmpty = probability("", "");
      const emptyString = probability("", "test");
      const stringEmpty = probability("test", "");
      // Empty vs empty in the default (standard) mode is expected to be NaN.
      expect(emptyEmpty).toBeNaN();
      // Empty vs non-empty must still produce a non-negative number.
      expect(emptyString).toBeGreaterThanOrEqual(0);
      expect(stringEmpty).toBeGreaterThanOrEqual(0);
    });

    test("should handle very long strings ๐", () => {
      const longString1 = "a".repeat(1000);
      const longString2 = "b".repeat(1000);
      const longString3 = "a".repeat(999) + "b";
      const result1 = probability(longString1, longString2);
      const result2 = probability(longString1, longString3);
      // Only the valid range is asserted for the fully different pair; its
      // absolute value is not pinned.
      expect(result1).toBeGreaterThanOrEqual(0);
      expect(result1).toBeLessThanOrEqual(1);
      expect(result2).toBeGreaterThan(result1); // More similar should have higher probability
    });

    test("should handle special characters and spaces ๐จ", () => {
      const specialPairs: Array<[string, string]> = [
        ["hello world!", "hello world?"],
        ["test@example.com", "test@example.org"],
        [" spaced ", "spaced"],
        ["tab\there", "tab here"],
      ];
      specialPairs.forEach(([str1, str2]) => {
        const result = probability(str1, str2);
        expect(result).toBeGreaterThanOrEqual(0);
        expect(result).toBeLessThanOrEqual(1);
        expect(result).toBeGreaterThan(0.3); // Should have some similarity
      });
    });
  });
});
/**
 * Cross-cutting checks: how `probability` relates to `distance`, coarse
 * wall-clock budgets for small workloads, and a sweep over assorted string
 * contents.
 */
describe("๐ช Integration & Performance Tests", () => {
  describe("๐ค Function interaction", () => {
    test("probability should be inverse-related to distance ๐", () => {
      const testPairs: Array<[string, string]> = [
        ["similar", "similiar"],
        ["hello", "world"],
        ["cat", "bat"],
        ["test", "best"],
      ];
      testPairs.forEach(([str1, str2]) => {
        const dist = distance(str1, str2);
        const prob = probability(str1, str2);
        // The probability must always be a valid value in [0, 1].
        expect(prob).toBeGreaterThanOrEqual(0);
        expect(prob).toBeLessThanOrEqual(1);
        // Only the near-identical case is pinned: a very small distance must
        // map to a high probability.
        if (dist < 0.1) {
          expect(prob).toBeGreaterThan(0.8);
        }
      });
    });
  });

  // NOTE(review): the budgets below are wall-clock thresholds, so they may be
  // sensitive to machine load; treat a failure here as a hint, not a proof.
  describe("โก Performance characteristics", () => {
    test("should handle reasonable input sizes efficiently ๐", () => {
      const mediumString1 = "a".repeat(100);
      const mediumString2 = "b".repeat(100);
      const startTime = performance.now();
      const result = distance(mediumString1, mediumString2);
      const endTime = performance.now();
      expect(result).toBeDefined();
      expect(endTime - startTime).toBeLessThan(100); // Should complete in < 100ms
    });

    test("should handle batch processing ๐ฆ", () => {
      const strings = [
        "hello",
        "world",
        "test",
        "best",
        "javascript",
        "typescript",
      ];
      const startTime = performance.now();
      // Full pairwise matrix in row-major order. Nested `map` keeps every
      // element typed as `string`/`number`, so no indexed access or
      // `@ts-expect-error` suppression is needed.
      const results: number[][] = strings.map((rowString) =>
        strings.map((columnString) => distance(rowString, columnString))
      );
      const endTime = performance.now();
      expect(results).toHaveLength(strings.length);
      expect(results[0]).toHaveLength(strings.length);
      expect(endTime - startTime).toBeLessThan(50); // Batch should be fast
    });
  });

  describe("๐ฏ Type safety validation", () => {
    test("should work with various string types ๐ค", () => {
      const stringTypes: string[] = [
        "regular string",
        "",
        "123",
        "special!@#$%^&*()",
        "unicode: ๐ฆ๐โจ",
        "mixed 123 @#$ ๐",
      ];
      stringTypes.forEach((str) => {
        const result1 = distance(str, str);
        expect(result1).toBe(0);
        // Only test probability for non-empty strings to avoid NaN
        if (str.length > 0) {
          const result2 = probability(str, str);
          expect(result2).toBeCloseTo(1, 5);
        }
      });
    });
  });
});