UNPKG

string-similarity-plus

Version:

String similarity calculation with enhanced special character normalization

107 lines (95 loc) 3.97 kB
/** * Calculate string similarity percentage using Levenshtein distance * with enhanced normalization for special characters * * @param {string} str1 - First string to compare * @param {string} str2 - Second string to compare * @returns {number} - Similarity percentage (0-100) */ function calculateStringSimilarity(str1, str2) { // Enhanced normalize function to handle more special characters const normalizeString = (str) => { return str .toLowerCase() // Normalize various types of quotes and apostrophes .replace(/[\u2018\u2019\u201A\u201B\u2032\u2035\u0060]/g, "'") .replace(/[\u201C\u201D\u201E\u201F\u2033\u2036]/g, '"') // Normalize various types of dashes and hyphens .replace(/[\u2010-\u2015\u2043\u02D7\u2212\u2796\u2E3A\u2E3B\u30FC]/g, '-') // Normalize various types of spaces and zero-width spaces .replace(/[\u00A0\u1680\u180E\u2000-\u200B\u202F\u205F\u3000\uFEFF]/g, ' ') // Normalize ellipsis .replace(/\u2026/g, '...') // Normalize various types of slashes .replace(/[\u2044\u2215\u29F8\u29F9]/g, '/') // Normalize various types of dots .replace(/[\u2024\u2025\u2026\u22EF\u00B7]/g, '.') // Normalize various types of colons and semicolons .replace(/[\uFF1A\u2236\u2A74]/g, ':') .replace(/[\uFF1B]/g, ';') // Normalize various types of brackets .replace(/[\uFF08\u2768\u276A]/g, '(') .replace(/[\uFF09\u2769\u276B]/g, ')') .replace(/[\uFF3B\u3010]/g, '[') .replace(/[\uFF3D\u3011]/g, ']') .replace(/[\uFF5B\u2774]/g, '{') .replace(/[\uFF5D\u2775]/g, '}') // Normalize multiple spaces .replace(/\s+/g, ' ') // Remove common punctuation .replace(/[,!?]/g, '') .trim(); }; // Calculate Levenshtein distance function levenshteinDistance(a, b) { if (a.length === 0) return b.length; if (b.length === 0) return a.length; const matrix = []; for (let i = 0; i <= b.length; i++) { matrix[i] = [i]; } for (let j = 0; j <= a.length; j++) { matrix[0][j] = j; } for (let i = 1; i <= b.length; i++) { for (let j = 1; j <= a.length; j++) { if (b.charAt(i - 1) === a.charAt(j - 1)) { matrix[i][j] = matrix[i - 1][j - 1]; } else { matrix[i][j] = Math.min( matrix[i - 1][j - 1] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j] + 1 ); } } } return matrix[b.length][a.length]; } // Normalize both strings const normalizedStr1 = normalizeString(str1); const normalizedStr2 = normalizeString(str2); // Calculate similarity percentage const maxLength = Math.max(normalizedStr1.length, normalizedStr2.length); if (maxLength === 0) return 100; // Both strings are empty const distance = levenshteinDistance(normalizedStr1, normalizedStr2); const similarityPercentage = ((maxLength - distance) / maxLength) * 100; return similarityPercentage; } /** * Find matches in an array of strings based on similarity threshold * * @param {string} searchString - String to search for * @param {string[]} contentArray - Array of strings to search in * @param {number} threshold - Similarity threshold percentage (0-100) * @returns {string[]} - Array of matching strings */ function findSimilarStrings(searchString, contentArray, threshold = 80) { return contentArray.filter(item => calculateStringSimilarity(searchString, item) >= threshold ); } module.exports = { calculateStringSimilarity, findSimilarStrings };