/*
 * string-similarity-plus
 * Version:
 * String similarity calculation with enhanced special character normalization
 * 107 lines (95 loc) • 3.97 kB
 * JavaScript
 */
/**
 * Calculate string similarity percentage using Levenshtein distance
 * with enhanced normalization for special characters.
 *
 * Before comparison both inputs are lowercased, typographic variants of
 * quotes, dashes, spaces, ellipses, slashes, dots, colons, semicolons and
 * brackets are folded to their ASCII counterparts, the characters `,`, `!`
 * and `?` are removed, runs of whitespace are collapsed to a single space and
 * the ends are trimmed. The edit distance is then measured in Unicode code
 * points (not UTF-16 code units), so a character outside the Basic
 * Multilingual Plane counts as one character.
 *
 * @param {string} str1 - First string to compare
 * @param {string} str2 - Second string to compare
 * @returns {number} - Similarity percentage (0-100); 100 when both strings
 *   are empty after normalization
 */
function calculateStringSimilarity(str1, str2) {
  // Fold typographic variants to ASCII so visually equivalent text compares equal.
  const normalizeString = (str) =>
    str
      .toLowerCase()
      // Normalize various types of quotes and apostrophes
      .replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0060]/g, "'")
      .replace(/[\u201C\u201D\u201E\u201F\u2033\u2036]/g, '"')
      // Normalize various types of dashes and hyphens
      .replace(/[\u2010-\u2015\u2043\u02D7\u2212\u2796\u2E3A\u2E3B\u30FC]/g, '-')
      // Normalize various types of spaces and zero-width spaces
      .replace(/[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF]/g, ' ')
      // Normalize ellipsis (must run before the single-dot rule below)
      .replace(/\u2026/g, '...')
      // Normalize various types of slashes
      .replace(/[\u2044\u2215\u29F8\u29F9]/g, '/')
      // Normalize various types of dots (U+2026 was already expanded above)
      .replace(/[\u2024\u2025\u22EF\u00B7]/g, '.')
      // Normalize various types of colons and semicolons
      .replace(/[\uFF1A\u2236\u2A74]/g, ':')
      .replace(/[\uFF1B]/g, ';')
      // Normalize various types of brackets
      .replace(/[\uFF08\u2768\u276A]/g, '(')
      .replace(/[\uFF09\u2769\u276B]/g, ')')
      .replace(/[\uFF3B\u3010]/g, '[')
      .replace(/[\uFF3D\u3011]/g, ']')
      .replace(/[\uFF5B\u2774]/g, '{')
      .replace(/[\uFF5D\u2775]/g, '}')
      // Remove common punctuation BEFORE collapsing whitespace; otherwise
      // "a , b" would turn into "a  b" (two spaces) and no longer match "a b".
      .replace(/[,!?]/g, '')
      // Normalize multiple spaces
      .replace(/\s+/g, ' ')
      .trim();

  // Levenshtein distance between two arrays of characters, keeping only the
  // previous and the current row of the DP table instead of the whole matrix.
  const levenshteinDistance = (a, b) => {
    if (a.length === 0) return b.length;
    if (b.length === 0) return a.length;

    // Row 0: turning the empty prefix of b into a[0..j) costs j insertions.
    let previousRow = Array.from({ length: a.length + 1 }, (_, j) => j);

    for (let i = 1; i <= b.length; i++) {
      // Column 0: turning b[0..i) into the empty prefix of a costs i deletions.
      const currentRow = [i];
      for (let j = 1; j <= a.length; j++) {
        if (b[i - 1] === a[j - 1]) {
          currentRow[j] = previousRow[j - 1];
        } else {
          currentRow[j] = Math.min(
            previousRow[j - 1] + 1, // substitution
            currentRow[j - 1] + 1, // insertion
            previousRow[j] + 1 // deletion
          );
        }
      }
      previousRow = currentRow;
    }

    return previousRow[a.length];
  };

  // Array.from iterates a string by code point, keeping surrogate pairs intact.
  const chars1 = Array.from(normalizeString(str1));
  const chars2 = Array.from(normalizeString(str2));

  // Calculate similarity percentage
  const maxLength = Math.max(chars1.length, chars2.length);
  if (maxLength === 0) return 100; // Both strings are empty

  const distance = levenshteinDistance(chars1, chars2);
  return ((maxLength - distance) / maxLength) * 100;
}
/**
 * Select the entries of an array whose similarity to a search string
 * reaches a given threshold.
 *
 * Each entry is scored with calculateStringSimilarity against searchString
 * and kept when its score is greater than or equal to threshold. The
 * returned array preserves the order of contentArray.
 *
 * @param {string} searchString - String to search for
 * @param {string[]} contentArray - Array of strings to search in
 * @param {number} threshold - Similarity threshold percentage (0-100), defaults to 80
 * @returns {string[]} - Array of matching strings
 */
function findSimilarStrings(searchString, contentArray, threshold = 80) {
  const meetsThreshold = (candidate) => {
    const score = calculateStringSimilarity(searchString, candidate);
    return score >= threshold;
  };

  return contentArray.filter(meetsThreshold);
}
module.exports = {
calculateStringSimilarity,
findSimilarStrings
};