UNPKG

node-string-similarity

Version:

A TypeScript library for string similarity comparison

166 lines (165 loc) 6.25 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.compareStrings = compareStrings; exports.batchCompareStrings = batchCompareStrings; exports.findMatchesAboveThreshold = findMatchesAboveThreshold; exports.jaroWinklerDistance = jaroWinklerDistance; exports.cosineSimilarity = cosineSimilarity; exports.diceCoefficient = diceCoefficient; function compareStrings(str1, str2) { if (typeof str1 !== "string" || typeof str2 !== "string") { throw new TypeError("Both inputs must be strings"); } if (!str1 || !str2) { throw new Error("Both strings must be non-empty"); } const length1 = str1.length; const length2 = str2.length; const maxLength = Math.max(length1, length2); if (maxLength === 0) { return 1.0; // Both strings are empty } const distance = levenshteinDistance(str1, str2); return (maxLength - distance) / maxLength; } function batchCompareStrings(target, candidates) { if (typeof target !== "string" || !Array.isArray(candidates)) { throw new TypeError("Target must be a string and candidates must be an array of strings"); } if (!target || candidates.length === 0) { throw new Error("Target string must be non-empty and candidates array must not be empty"); } return candidates.map(candidate => ({ candidate, similarity: compareStrings(target, candidate), })); } function findMatchesAboveThreshold(target, candidates, threshold) { if (typeof target !== "string" || !Array.isArray(candidates)) { throw new TypeError("Target must be a string and candidates must be an array of strings"); } if (!target || candidates.length === 0) { throw new Error("Target string must be non-empty and candidates array must not be empty"); } return candidates.filter(candidate => compareStrings(target, candidate) >= threshold); } function jaroWinklerDistance(str1, str2) { if (!str1 || !str2) { throw new Error("Both strings must be provided"); } const m = getMatchingCharacters(str1, str2); if (m === 0) return 0; const t = getTranspositions(str1, str2, m); const jaro = (m / str1.length + m / str2.length + (m - t) / m) / 3; const prefixLength = getCommonPrefixLength(str1, str2); const scalingFactor = 0.1; // Default scaling factor return jaro + Math.min(prefixLength, 4) * scalingFactor * (1 - jaro); } function cosineSimilarity(str1, str2) { if (!str1 || !str2) { throw new Error("Both strings must be provided"); } const vector1 = getCharacterFrequencyVector(str1); const vector2 = getCharacterFrequencyVector(str2); const dotProduct = vector1.reduce((sum, val, i) => sum + (vector2[i] || 0) * val, 0); const magnitude1 = Math.sqrt(vector1.reduce((sum, val) => sum + val * val, 0)); const magnitude2 = Math.sqrt(vector2.reduce((sum, val) => sum + val * val, 0)); return magnitude1 === 0 || magnitude2 === 0 ? 0 : dotProduct / (magnitude1 * magnitude2); } function diceCoefficient(str1, str2) { if (!str1 || !str2) { throw new Error("Both strings must be provided"); } const bigrams1 = getBigrams(str1); const bigrams2 = getBigrams(str2); const intersection = bigrams1.filter(bigram => bigrams2.includes(bigram)).length; return (2 * intersection) / (bigrams1.length + bigrams2.length); } function levenshteinDistance(a, b) { const aArray = Array.from(a); const bArray = Array.from(b); if (aArray.length === 0) return bArray.length; if (bArray.length === 0) return aArray.length; let previousRow = Array(bArray.length + 1).fill(0).map((_, i) => i); let currentRow = Array(bArray.length + 1).fill(0); for (let i = 1; i <= aArray.length; i++) { currentRow[0] = i; for (let j = 1; j <= bArray.length; j++) { const cost = aArray[i - 1] === bArray[j - 1] ? 0 : 1; currentRow[j] = Math.min(previousRow[j] + 1, // Deletion currentRow[j - 1] + 1, // Insertion previousRow[j - 1] + cost // Substitution ); } [previousRow, currentRow] = [currentRow, previousRow]; } return previousRow[bArray.length]; } function getMatchingCharacters(str1, str2) { const str1Array = Array.from(str1); const str2Array = Array.from(str2); const matchWindow = Math.floor(Math.max(str1Array.length, str2Array.length) / 2) - 1; const matches1 = Array(str1Array.length).fill(false); const matches2 = Array(str2Array.length).fill(false); let matches = 0; for (let i = 0; i < str1Array.length; i++) { const start = Math.max(0, i - matchWindow); const end = Math.min(i + matchWindow + 1, str2Array.length); for (let j = start; j < end; j++) { if (!matches2[j] && str1Array[i] === str2Array[j]) { matches1[i] = true; matches2[j] = true; matches++; break; } } } return matches; } function getTranspositions(str1, str2, matches) { const matched1 = []; const matched2 = []; for (let i = 0; i < str1.length; i++) { if (matches > 0 && str1[i] === str2[i]) { matched1.push(str1[i]); matched2.push(str2[i]); } } let transpositions = 0; for (let i = 0; i < matched1.length; i++) { if (matched1[i] !== matched2[i]) { transpositions++; } } return transpositions / 2; } function getCommonPrefixLength(str1, str2) { let prefixLength = 0; for (let i = 0; i < Math.min(str1.length, str2.length); i++) { if (str1[i] === str2[i]) { prefixLength++; } else { break; } } return prefixLength; } function getCharacterFrequencyVector(str) { const frequency = {}; for (const char of str) { frequency[char] = (frequency[char] || 0) + 1; } return Object.values(frequency); } function getBigrams(str) { const strArray = Array.from(str); const bigrams = []; for (let i = 0; i < strArray.length - 1; i++) { bigrams.push(strArray[i] + strArray[i + 1]); } return bigrams; }