cmpstr
Version:
CmpStr is a lightweight, fast and well performing package for calculating string similarity
143 lines (140 loc) • 6.28 kB
JavaScript
// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License
import { HashTable } from './HashTable.js';
/**
* Normalizer Utility
* src/utils/Normalizer.ts
*
* @see https://en.wikipedia.org/wiki/Text_normalization
* @see https://en.wikipedia.org/wiki/Unicode_equivalence
*
* This module provides a Normalizer class that allows for string normalization based
* on various flags. It uses a pipeline of normalization functions that can be reused
* and cached for efficiency. The Normalizer can handle both single strings and arrays
* of strings, and supports synchronous and asynchronous normalization.
*
* Supported flags:
* 'd' :: Normalize to NFD (Normalization Form Decomposed)
* 'u' :: Normalize to NFC (Normalization Form Composed)
* 'x' :: Normalize to NFKC (Normalization Form Compatibility Composed)
* 'w' :: Collapse whitespace
* 't' :: Remove leading and trailing whitespace
* 'r' :: Remove double characters
* 's' :: Remove punctuation / special characters
* 'k' :: Remove non-letter characters
* 'n' :: Remove non-number characters
* 'i' :: Case insensitive (convert to lowercase)
*
* @module Utils/Normalizer
* @author Paul Köhler (komed3)
* @license MIT
*/
/**
* The Normalizer class providing methods to normalize strings based on various flags.
*/
class Normalizer {
/**
* A map that holds normalization functions based on the flags.
* This allows for reusing normalization logic without recomputing it.
*/
static pipeline = new Map();
/**
* A cache to store normalized strings based on the flags and input.
* This helps avoid recomputing normalization for the same input and flags.
*/
static cache = new HashTable();
/**
* Returns a normalization function based on the provided flags.
* The flags are a string of characters that define the normalization steps.
*
* @param {NormalizeFlags} flags - A string of characters representing the normalization steps
* @returns {NormalizerFn} - A function that normalizes a string based on the provided flags
*/
static getPipeline(flags) {
// Return the cached pipeline if it exists
if (Normalizer.pipeline.has(flags)) return Normalizer.pipeline.get(flags);
// Define the normalization steps based on the flags
const steps = [];
// Normalize to NFD (Normalization Form Decomposed)
if (flags.includes('d')) steps.push((str) => str.normalize('NFD'));
// Normalize to NFC (Normalization Form Composed)
if (flags.includes('u')) steps.push((str) => str.normalize('NFC'));
// Normalize to NFKC (Normalization Form Compatibility Composed)
if (flags.includes('x')) steps.push((str) => str.normalize('NFKC'));
// Collapse whitespace
if (flags.includes('w')) steps.push((str) => str.replace(/\s+/g, ' '));
// Remove leading and trailing whitespace
if (flags.includes('t')) steps.push((str) => str.trim());
// Remove double characters
if (flags.includes('r')) steps.push((str) => str.replace(/(.)\1+/g, '$1'));
// Remove punctuation / special characters
if (flags.includes('s'))
steps.push((str) => str.replace(/[^\p{L}\p{N}\s]/gu, ''));
// Remove non-letter characters
if (flags.includes('k')) steps.push((str) => str.replace(/[^\p{L}]/gu, ''));
// Remove non-number characters
if (flags.includes('n')) steps.push((str) => str.replace(/\p{N}/gu, ''));
// Case insensitive
if (flags.includes('i')) steps.push((str) => str.toLowerCase());
// Build the normalization function from the steps
const compiled = (input) => {
let res = input;
for (const step of steps) res = step(res);
return res;
};
// Cache the compiled function for the given flags
Normalizer.pipeline.set(flags, compiled);
// Return the compiled normalization function
return compiled;
}
/**
* Normalizes the input string or array of strings based on the provided flags.
* The flags are a string of characters that define the normalization steps.
*
* @param {string|string[]} input - The string or array of strings to normalize
* @param {NormalizeFlags} flags - A string of characters representing the normalization steps
* @returns {string|string[]} - The normalized string(s)
*/
static normalize(input, flags) {
// If input is an array, normalize each string in the array
if (Array.isArray(input))
return input.map((s) => Normalizer.normalize(s, flags));
// If input or flags are not provided, return the input as is
if (!flags || typeof flags !== 'string' || !input) return input;
// Generate a cache key based on the flags and input
const key = Normalizer.cache.key(flags, [input]);
// If the key exists in the cache, return the cached result
if (key && Normalizer.cache.has(key)) return Normalizer.cache.get(key);
// Normalize the input using the pipeline for the given flags
const res = Normalizer.getPipeline(flags)(input);
// If a key was generated, store the result in the cache
if (key) Normalizer.cache.set(key, res);
// Return the normalized result
return res;
}
/**
* Asynchronously normalizes the input string or array of strings based on the
* provided flags. This method is useful for handling large inputs or when
* normalization needs to be done in a non-blocking way.
*
* @param {string|string[]} input - The string or array of strings to normalize
* @param {NormalizeFlags} flags - A string of characters representing the normalization steps
* @returns {Promise<string|string[]>} - A promise that resolves to the normalized string(s)
*/
static async normalizeAsync(input, flags) {
return await (Array.isArray(input)
? // If input is an array, normalize each string in the array asynchronously
Promise.all(input.map((s) => Normalizer.normalize(s, flags)))
: // If input is a single string, normalize it asynchronously
Promise.resolve(Normalizer.normalize(input, flags)));
}
/**
* Clears the normalization pipeline and cache.
* This is useful for resetting the state of the Normalizer.
*/
static clear() {
Normalizer.pipeline.clear();
Normalizer.cache.clear();
}
}
export { Normalizer };
//# sourceMappingURL=Normalizer.js.map