UNPKG

cmpstr

Version:

lightweight npm package to calculate string similarity

917 lines (678 loc) 22.3 kB
/** * class CmpStr * * The CmpStr class is the core of the cmpstr package. It provides methods to calculate * string similarity, find the closest matches in arrays, and generate similarity * matrices. The class supports built-in algorithms (e.g., Levenshtein, Dice-Sørensen) * and allows users to add custom algorithms. It also includes features like string * normalization, caching, and extensibility. * * @author komed3 (Paul Köhler) * @license MIT */ 'use strict'; /** * module exports * @public */ module.exports = class CmpStr { /** * -------------------------------------------------- * Global Variables * -------------------------------------------------- */ /** * all pre-defined similarity algorithms * * @private * @type {Object} */ #algorithms = { cosine: './algorithms/cosine', damerau: './algorithms/damerau', dice: './algorithms/dice', hamming: './algorithms/hamming', jaccard: './algorithms/jaccard', jaro: './algorithms/jaroWinkler', lcs: './algorithms/lcs', levenshtein: './algorithms/levenshtein', needlemanWunsch: './algorithms/needlemanWunsch', qGram: './algorithms/qGram', smithWaterman: './algorithms/smithWaterman', soundex: './algorithms/soundex' }; /** * stores the names of loaded algorithms * * @since 2.0.2 * @private * @type {Set<String>} */ #loadedAlgo = new Set (); /** * normalized strings cache * * @private * @type {Map<String, String>} */ #cache = new Map (); /** * added filters for string normalization * * @private * @type {Map<String, Object[]>} */ #filter = new Map (); /** * default normalization flags * set by setFlags() * * @private * @type {String} */ #flags = ''; /** * current algorithm to use for similarity calculations * set by setAlgo(), addAlgo() or constructor() * * @private * @type {String} */ #algo; /** * base string for comparison * set by setStr or constructor() * * @private * @type {String} */ #str; /** * stores the current ready state * * @since 2.0.2 * @private * @type {Boolean} */ #readyState = false; /** * -------------------------------------------------- * Constructor * -------------------------------------------------- */ /** * initializes a CmpStr instance * algorithm and base string can be set by initialization * * @param {String} algo name of the algorithm to use for calculation * @param {String} str string to set as the base */ constructor ( algo = undefined, str = undefined ) { if ( algo !== undefined ) { this.setAlgo( algo ); } if ( str !== undefined ) { this.setStr( str ); } }; /** * -------------------------------------------------- * Ready State * -------------------------------------------------- */ /** * checks whether string and algorithm are set correctly * * @returns {Boolean} true if ready, false otherwise */ isReady () { return this.#readyState; }; /** * updates the readiness state * * @since 2.0.2 * @private */ #updateReadyState () { this.#readyState = ( typeof this.#algo === 'string' && this.isAlgo( this.#algo ) && typeof this.#str === 'string' && this.#str.length !== 0 ); }; /** * checks ready state and throws an error if not * * @private * @returns {Boolean} true if ready * @throws {Error} if CmpStr is not ready */ #checkReady () { if ( !this.#readyState ) { throw new Error( `CmpStr instance is not ready. Ensure the algorithm and base string are set.` ); } return true; }; /** * -------------------------------------------------- * Base String * -------------------------------------------------- */ /** * sets the base string for comparison * * @param {String} str string to set as the base * @returns {Boolean} always returns true */ setStr ( str ) { this.#str = String ( str ); this.#updateReadyState(); return true; }; /** * gets the base string for comparison * * @since 2.0.2 * @returns {String} base string */ getStr () { return this.#str; }; /** * -------------------------------------------------- * Algorithms * -------------------------------------------------- */ /** * list all registered or loaded similarity algorithms * * @param {Boolean} [loadedOnly=false] it true, only loaded algorithm names are returned * @returns {String[]} array of algorithm names */ listAlgo ( loadedOnly = false ) { return loadedOnly ? [ ...this.#loadedAlgo ] : [ ...Object.keys( this.#algorithms ) ]; }; /** * checks if an algorithm is registered * * @param {String} algo name of the algorithm * @returns {Boolean} true if the algorithm is registered, false otherwise */ isAlgo ( algo ) { return algo in this.#algorithms; }; /** * sets the current algorithm to use for similarity calculations * * @param {String} algo name of the algorithm * @returns {Boolean} true if the algorithm has been set */ setAlgo ( algo ) { if ( this.#loadAlgo( algo ) ) { this.#algo = algo; this.#updateReadyState(); return true; } }; /** * gets the current algorithm to use for similarity calculations * * @since 2.0.2 * @returns {String} name of the algorithm */ getAlgo () { return this.#algo; }; /** * adds a new similarity algorithm * * @param {String} algo name of the algorithm * @param {Function} callback function implementing the algorithm (must accept two strings and return a number) * @param {Boolean} [useIt=true] whether to set this algorithm as the current one * @returns {Boolean} returns true if the algorithms was added successfully * @throws {Error} if the algorithm cannot be added */ addAlgo ( algo, callback, useIt = true ) { if ( !this.isAlgo( algo ) && typeof callback === 'function' && callback.length >= 2 && typeof callback.apply( null, [ 'abc', 'abc' ] ) === 'number' ) { this.#algorithms[ algo ] = callback; if ( useIt ) { this.setAlgo( algo ); } return true; } else { throw new Error ( `Algorithm "${algo}" cannot be added.` ); } }; /** * removes a registered similarity algorithm * * @param {String} algo name of the algorithm * @returns {Boolean} true if the algorithm was removed successfully * @throws {Error} if the algorithm is not defined */ rmvAlgo ( algo ) { if ( this.isAlgo( algo ) ) { delete this.#algorithms[ algo ]; this.#loadedAlgo.delete( algo ); if ( this.#algo === algo ) { /* reset current algorithm if it was removed */ this.#algo = undefined; this.#updateReadyState(); } return true; } else { throw new Error ( `Algorithm "${algo}" is not defined.` ); } }; /** * lazy-loads the specified algorithm module * * @private * @param {String} algo name of the similarity algorithm * @returns {Boolean} true if the algorithm is loaded * @throws {Error} if the algorithm cannot be loaded or is not defined */ #loadAlgo ( algo ) { if ( this.#loadedAlgo.has( algo ) ) { return true; } else if ( this.isAlgo( algo ) ) { let typeOf = typeof this.#algorithms[ algo ]; if ( typeOf === 'function' ) { this.#loadedAlgo.add( algo ); return true; } else if ( typeOf === 'string' ) { try { /* lazy-load algorithm module */ this.#algorithms[ algo ] = require( this.#algorithms[ algo ] ); this.#loadedAlgo.add( algo ); return true; } catch ( err ) { throw new Error ( `Failed to load algorithm "${algo}".`, { cause: err } ); } } else { throw new Error ( `Algorithm "${algo}" cannot be loaded.` ); } } else { throw new Error ( `Algorithm "${algo}" is not defined.` ); } }; /** * -------------------------------------------------- * Custom Filters * -------------------------------------------------- */ /** * list all added or artice filter names * * @param {Boolean} [activeOnly=false] if true, only names of active filters are returned * @returns {String[]} array of filter names */ listFilter ( activeOnly = false ) { return activeOnly ? Array.from( this.#filter.entries() ) .filter( ( [ _, filter ] ) => filter.active ) .map( ( [ name ] ) => name ) : [ ...this.#filter.keys() ]; }; /** * adds a custom normalization filter * * @param {String} name filter name * @param {Function} callback function implementing the filter (must accept a string and returns a normalized one) * @param {Int} [priority=10] priority of the filter (lower numbers are processed first) * @returns {Boolean} returns true if the filter was added successfully * @throws {Error} if the filter cannot be added */ addFilter ( name, callback, priority = 10 ) { if ( !this.#filter.has( name ) && typeof callback === 'function' && callback.length == 1 && typeof callback.apply( null, [ 'abc' ] ) === 'string' ) { this.#filter.set( name, { callback, priority, active: true } ); this.clearCache(); return true; } else { throw new Error ( `Filter "${filter}" cannot be added.` ); } }; /** * removes a custom normalization filter * * @param {String} name filter name * @returns {Boolean} true if the filter was removed successfully * @throws {Error} if the filter does not exists */ rmvFilter ( name ) { if ( this.#filter.delete( name ) ) { this.clearCache(); return true; } else { throw new Error ( `Filter "${filter}" does not exists.` ); } }; /** * pauses a custom normalization filter * * @param {String} name filter name * @returns {Boolean} true if the filter was paused successfully * @throws {Error} if the filter does not exists */ pauseFilter ( name ) { if ( this.#filter.has( name ) ) { this.#filter.get( name ).active = false; this.clearCache(); return true; } else { throw new Error ( `Filter "${filter}" does not exists.` ); } }; /** * resumes a custom normalization filter * * @param {String} name filter name * @returns {Boolean} true if the filter was resumed successfully * @throws {Error} if the filter does not exists */ resumeFilter ( name ) { if ( this.#filter.has( name ) ) { this.#filter.get( name ).active = true; this.clearCache(); return true; } else { throw new Error ( `Filter "${filter}" does not exists.` ); } }; /** * clears normalization filters (remove all of them) * * @returns {Boolean} always returns true */ clearFilter () { this.#filter.clear(); this.clearCache(); return true; }; /** * applies all active filters to a string * * @private * @param {String} str string to process * @returns {String} filtered string * @throws {Error} if applying filters cause an error */ #applyFilters ( str ) { try { return Array.from( this.#filter.values() ).flat().filter( ( filter ) => filter.active ).sort( ( a, b ) => a.priority - b.priority ).reduce( ( res, filter ) => filter.callback.apply( null, [ res ] ), String ( str ) ); } catch ( err ) { throw new Error ( `Error while applying filters.`, { cause: err } ); } }; /** * -------------------------------------------------- * Normalization * -------------------------------------------------- */ /** * set default normalization flags * * @param {String} [flags=''] normalization flags * @returns {Boolean} always returns true */ setFlags ( flags = '' ) { this.#flags = String ( flags ); }; /** * get default normalization flags * * @since 2.0.2 * @returns {String} normalization flags */ getFlags () { return this.#flags; }; /** * normalizes a string by chainable options; uses cache to increase * performance and custom filters for advanced behavior * * list of all supported flags: * * s :: remove special chars * w :: collapse whitespaces * r :: remove repeated chars * k :: keep only letters * n :: ignore numbers * t :: trim whitespaces * i :: case insensitivity * d :: decompose unicode * u :: normalize unicode * * @param {String|String[]} string string(s) to normalize * @param {String} [flags=''] normalization flags * @returns {String|String[]} normalized string(s) * @throws {Error} if normalization cause an error */ normalize ( input, flags = '' ) { const processStr = ( str ) => { let res = String ( str ); /* use normalized string from cache to increase performance */ let key = `${res}::${flags}`; if ( this.#cache.has( key ) ) { return this.#cache.get( key ); } /* apply custom filters */ res = this.#applyFilters( res ); /* normalize using flags */ try { if ( flags.includes( 's' ) ) res = res.replace( /[^a-z0-9]/gi, '' ); if ( flags.includes( 'w' ) ) res = res.replace( /\s+/g, ' ' ); if ( flags.includes( 'r' ) ) res = res.replace( /(.)\1+/g, '$1' ); if ( flags.includes( 'k' ) ) res = res.replace( /[^a-z]/gi, '' ); if ( flags.includes( 'n' ) ) res = res.replace( /[0-9]/g, '' ); if ( flags.includes( 't' ) ) res = res.trim(); if ( flags.includes( 'i' ) ) res = res.toLowerCase(); if ( flags.includes( 'd' ) ) res = res.normalize( 'NFD' ).replace( /[\u0300-\u036f]/g, '' ); if ( flags.includes( 'u' ) ) res = res.normalize( 'NFC' ); } catch ( err ) { throw new Error ( `Error while normalization.`, { cause: err } ); } /* store the normalized string in the cache */ this.#cache.set( key, res ); return res; } /* processing multiple string */ if ( Array.isArray( input ) ) { return input.map( ( str ) => processStr( str ) ); } return processStr( input ); }; /** * clears the normalization cache * * @returns {Boolean} always returns true */ clearCache () { this.#cache.clear(); return true; }; /** * -------------------------------------------------- * Similarity Comparison * -------------------------------------------------- */ /** * compares two string a and b using the passed algorithm * * @param {String} algo name of the algorithm * @param {String} a string a * @param {String} b string b * @param {Object} [config={}] config (flags, args) * @returns {Mixed} similarity score (0..1) or raw output * @throws {Error} if algorithm cause an error */ compare ( algo, a, b, config = {} ) { if ( this.#loadAlgo( algo ) ) { /* handle trivial cases */ if ( a === b ) return 1; // strings are identical if ( a.length < 2 || b.length < 2 ) return 0; // too short to compare /* apply similarity algorithm */ const { flags = this.#flags, options = {} } = config; try { return this.#algorithms[ algo ].apply( null, [ this.normalize( a, flags ), this.normalize( b, flags ), options ] ); } catch ( err ) { throw new Error ( `Error in algorithm "${algo}".`, { cause: err } ); } } }; /** * tests the similarity between the base string and a target string * using the current algorithm * * @param {String} str target string * @param {Object} [config={}] config (flags, args) * @returns {Mixed} similarity score (0..1) or raw output */ test ( str, config = {} ) { if ( this.#checkReady() ) { return this.compare( this.#algo, this.#str, str, config ); } }; /** * tests the similarity of multiple strings against the base string * * @param {String[]} arr array of strings * @param {Object} [config={}] config (flags, args) * @returns {Object[]} array of objects, each containing the target string and its similarity score / raw output */ batchTest ( arr, config = {} ) { if ( this.#checkReady() ) { return [ ...arr ].map( ( str ) => ( { target: str, match: this.compare( this.#algo, this.#str, str, config ) } ) ); } }; /** * finds strings in an array that exceed a similarity threshold * returns the array sorted by highest similarity * * @param {String[]} arr array of strings * @param {Object} [config={}] config (flags, threshold, args) * @returns {Object[]} array of objects, sorted by highest similarity */ match ( arr, config = {} ) { const { threshold = 0 } = config; delete config?.options?.raw; return this.batchTest( arr, config ).filter( ( r ) => r.match >= threshold ).sort( ( a, b ) => b.match - a.match ); }; /** * finds the closest matching string from an array * * @param {String[]} arr array of strings * @param {Object} [config={}] config (flags, args) * @returns {String} closest matching string */ closest ( arr, config = {} ) { let res = this.match( arr, config ); return res.length && res[ 0 ].match > 0 ? res[ 0 ].target : undefined; }; /** * generate a similarity matrix for an array of strings * * @param {String} algo name of the algorithm * @param {String[]} arr array of strings to cross-compare * @param {Object} [config={}] config (flags, args) * @returns {Number[][]} 2D array representing the similarity matrix */ similarityMatrix ( algo, arr, config = {} ) { if ( this.#loadAlgo( algo ) ) { delete config?.options?.raw; return [ ...arr ].map( ( a, i ) => { return [ ...arr ].map( ( b, j ) => i === j ? 1 : this.compare( algo, a, b, config ) ); } ); } }; };