UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast and well-performing package for calculating string similarity.

215 lines (211 loc) 7.03 kB
// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License
'use strict';

var DeepMerge = require('../utils/DeepMerge.cjs');
var Errors = require('../utils/Errors.cjs');
var HashTable = require('../utils/HashTable.cjs');
var Profiler = require('../utils/Profiler.cjs');
var Registry = require('../utils/Registry.cjs');

// Shared profiler instance used to wrap the public index computations.
const profiler = Profiler.Profiler.getInstance();

/**
 * Return the phonetic code for a single word, going through the shared cache.
 *
 * The cache key is the key produced by `Phonetic.cache.key` followed by the
 * instance's option hash, so instances with different options do not share
 * entries.
 *
 * NOTE(review): the key is only composed when the cache yields a truthy base
 * key. Concatenating first would coerce a falsy base key into a truthy string
 * (e.g. "false" + optKey), which defeats the `key || ''` / `if (key)` guards
 * and makes every such word share one cache slot. This assumes `cache.key`
 * can return a falsy value for words it refuses to key — confirm against
 * HashTable.
 *
 * @param {Phonetic} phonetic - Instance providing `algo`, `optKey` and `encode`
 * @param {string} word - Word to encode
 * @returns {string} Cached or freshly computed code (not yet padded)
 */
function encodeWithCache(phonetic, word) {
  const baseKey = Phonetic.cache.key(phonetic.algo, [word]);
  const key = baseKey ? baseKey + phonetic.optKey : '';

  const cached = Phonetic.cache.get(key);
  if (cached !== undefined && cached !== null) return cached;

  const code = phonetic.encode(word);
  if (key) Phonetic.cache.set(key, code);
  return code;
}

/**
 * Base implementation of a mapping-driven phonetic encoder.
 *
 * An instance is bound to an algorithm name and to one mapping taken from
 * `PhoneticMappingRegistry`. The mapping may provide `patterns`, `ruleset`,
 * `map`, `ignore` and `options`, all of which are read by the methods below.
 */
class Phonetic {
  /** Cache of computed codes, shared by all instances. */
  static cache = new HashTable.HashTable();

  /** Default options; read through `this.constructor.default` in the constructor. */
  static default;

  /** Algorithm name handed to the constructor. */
  algo;

  /** Options merged from the defaults, the mapping's options and the caller's options. */
  options;

  /** Stringified hash of `options`; appended to cache keys. */
  optKey;

  /** Mapping definition resolved from `PhoneticMappingRegistry`. */
  map;

  /** Empty the shared code cache. */
  static clear = () => this.cache.clear();

  /**
   * @param {string} algo - Algorithm name used for the mapping lookup
   * @param {object} [opt={}] - Options; `opt.map` selects the mapping id
   * @throws {Errors.CmpStrNotFoundError} If no mapping id is given by either
   *   `opt` or the defaults, or if the id is not registered for `algo`
   */
  constructor(algo, opt = {}) {
    const defaults = this.constructor.default ?? {};
    const mapId = opt.map ?? defaults.map;

    if (!mapId) {
      throw new Errors.CmpStrNotFoundError(
        `No mapping specified for phonetic algorithm`,
        { algo }
      );
    }

    const map = PhoneticMappingRegistry.get(algo, mapId);

    if (map === undefined) {
      throw new Errors.CmpStrNotFoundError(
        `Requested mapping <${mapId}> is not declared`,
        { algo, mapId }
      );
    }

    // Precedence, lowest to highest: defaults, mapping options, caller options.
    this.options = DeepMerge.merge(
      DeepMerge.merge(defaults, map.options ?? {}),
      opt
    );

    // Hash the options with sorted top-level keys so the key order does not matter.
    this.optKey = HashTable.Hasher.fastFNV1a(
      JSON.stringify(this.options, Object.keys(this.options).sort())
    ).toString();

    this.algo = algo;
    this.map = map;
  }

  /**
   * Apply the mapping's `patterns` to a word, in order.
   *
   * @param {string} word - Input word
   * @returns {string} Word after all replacements; unchanged if the mapping has no patterns
   */
  applyPattern(word) {
    const { patterns = [] } = this.map;
    if (!patterns || !patterns.length) return word;

    let result = word;
    for (const { pattern, replace, all = false } of patterns) {
      result = result[all ? 'replaceAll' : 'replace'](pattern, replace);
    }
    return result;
  }

  /**
   * Find the first rule of the mapping's `ruleset` that matches the character
   * at index `i` and return its code.
   *
   * @param {string} char - Current character
   * @param {number} i - Index of `char` within `chars`
   * @param {string[]} chars - All characters of the word
   * @param {number} charLen - Length of `chars`
   * @returns {*} `rule.code` of the first matching rule, or `undefined` if none matches
   */
  applyRules(char, i, chars, charLen) {
    const { ruleset = [] } = this.map;
    if (!ruleset || !ruleset.length) return undefined;

    // Neighbouring characters; positions outside the word count as ''.
    const prev = chars[i - 1] || '';
    const prev2 = chars[i - 2] || '';
    const next = chars[i + 1] || '';
    const next2 = chars[i + 2] || '';
    const lastIndex = charLen - 1;

    for (const rule of ruleset) {
      if (rule.char && rule.char !== char) continue;

      // Positional constraints.
      if (rule.position === 'start' && i !== 0) continue;
      if (rule.position === 'middle' && (i === 0 || i === lastIndex)) continue;
      // The last valid index is charLen - 1; comparing against charLen could
      // never succeed, so 'end' rules were silently skipped.
      if (rule.position === 'end' && i !== lastIndex) continue;

      // Neighbour constraints.
      if (rule.prev && !rule.prev.includes(prev)) continue;
      if (rule.prevNot && rule.prevNot.includes(prev)) continue;
      if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
      if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
      if (rule.next && !rule.next.includes(next)) continue;
      if (rule.nextNot && rule.nextNot.includes(next)) continue;
      if (rule.next2 && !rule.next2.includes(next2)) continue;
      if (rule.next2Not && rule.next2Not.includes(next2)) continue;

      // Whole-word prefix / suffix constraints.
      if (
        rule.leading &&
        !rule.leading.includes(chars.slice(0, rule.leading.length).join(''))
      ) {
        continue;
      }
      if (
        rule.trailing &&
        !rule.trailing.includes(chars.slice(-rule.trailing.length).join(''))
      ) {
        continue;
      }

      // Exact sequence starting at the current position.
      if (rule.match && !rule.match.every((c, j) => chars[i + j] === c)) continue;

      return rule.code;
    }

    return undefined;
  }

  /**
   * Encode one word: apply patterns, split it into characters, map each
   * character and pass the result through `adjustCode`.
   *
   * @param {string} word - Word to encode
   * @returns {string} Code as returned by `adjustCode`
   */
  encode(word) {
    const { map = {}, ignore = [] } = this.map;

    const chars = this.word2Chars(this.applyPattern(word));
    const charLen = chars.length;

    let code = '';
    let lastCode = null;

    for (let i = 0; i < charLen; i++) {
      const char = chars[i];
      if (ignore.includes(char)) continue;

      const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
      if (mapped === undefined) continue;

      code += mapped;
      lastCode = mapped;

      if (this.exitEarly(code, i)) break;
    }

    return this.adjustCode(code, chars);
  }

  /**
   * Resolve the code for one character: a matching rule first, then the plain
   * character map, then the `fallback` option.
   *
   * @param {string} char - Current character
   * @param {number} i - Index of `char` within `chars`
   * @param {string[]} chars - All characters of the word
   * @param {number} charLen - Length of `chars`
   * @param {*} lastCode - Code emitted for the previous character (`null` initially)
   * @param {object} map - Character-to-code map of the mapping
   * @returns {*} The code, or `undefined` when `dedupe` is on and it equals `lastCode`
   */
  mapChar(char, i, chars, charLen, lastCode, map) {
    const { dedupe = true, fallback = undefined } = this.options;
    const c = this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
    return dedupe && c === lastCode ? undefined : c;
  }

  /**
   * Pad or truncate a code to the `length` option.
   *
   * @param {string} input - Code to normalise
   * @returns {string} `input` unchanged when `length` is -1, otherwise exactly `length` characters
   */
  equalLen(input) {
    const { length = -1, pad = '0' } = this.options;
    return length === -1 ? input : (input + pad.repeat(length)).slice(0, length);
  }

  /** Lower-case a word and split it into single UTF-16 code units. */
  word2Chars = (word) => word.toLowerCase().split('');

  /**
   * Tell `encode` whether it can stop early.
   *
   * @param {string} code - Code built so far
   * @param {number} i - Current character index (unused here)
   * @returns {boolean} True once a positive `length` option has been reached
   */
  exitEarly(code, i) {
    const { length = -1 } = this.options;
    return length > 0 && code.length >= length;
  }

  /**
   * Final hook applied to the code by `encode`; this implementation returns it unchanged.
   *
   * @param {string} code - Code built by `encode`
   * @param {string[]} chars - Characters of the encoded word (unused here)
   * @returns {string} The code
   */
  adjustCode(code, chars) {
    return code;
  }

  /**
   * Encode a list of words, skipping words whose code is empty.
   *
   * @param {string[]} words - Words to encode
   * @returns {string[]} Codes normalised through `equalLen`
   */
  loop(words) {
    return Errors.ErrorUtil.wrap(
      () => {
        const index = [];
        for (const word of words) {
          const code = encodeWithCache(this, word);
          if (code && code.length) index.push(this.equalLen(code));
        }
        return index;
      },
      `Failed to generate phonetic index`,
      { algo: this.algo, words }
    );
  }

  /**
   * Asynchronous counterpart of `loop`; awaits once per word.
   *
   * @param {string[]} words - Words to encode
   * @returns {Promise<string[]>} Codes normalised through `equalLen`
   */
  async loopAsync(words) {
    return Errors.ErrorUtil.wrapAsync(
      async () => {
        const index = [];
        for (const word of words) {
          const code = await Promise.resolve(encodeWithCache(this, word));
          if (code && code.length) index.push(this.equalLen(code));
        }
        return index;
      },
      `Failed to generate phonetic index asynchronously`,
      { algo: this.algo, words }
    );
  }

  /** Return the algorithm name this instance was created with. */
  getAlgoName = () => this.algo;

  /**
   * Split the input by the `delimiter` option and encode every non-empty part.
   *
   * @param {string} input - Text to index
   * @returns {*} Result of `profiler.run` wrapping the filtered code list
   */
  getIndex(input) {
    const { delimiter = ' ' } = this.options;
    return profiler.run(() =>
      this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
    );
  }

  /**
   * Asynchronous counterpart of `getIndex`.
   *
   * @param {string} input - Text to index
   * @returns {Promise<*>} Awaited `profiler.runAsync` result with falsy entries removed
   */
  async getIndexAsync(input) {
    const { delimiter = ' ' } = this.options;
    return (
      await profiler.runAsync(
        async () => await this.loopAsync(input.split(delimiter).filter(Boolean))
      )
    ).filter(Boolean);
  }
}

// Registry created by the shared Registry factory for the 'phonetic' kind.
const PhoneticRegistry = Registry.Registry('phonetic', Phonetic);

/**
 * Frozen registry of phonetic mappings, grouped by algorithm name.
 * The per-algorithm bucket is created lazily on first access by any method.
 */
const PhoneticMappingRegistry = (() => {
  // Prototype-less objects so mapping ids never collide with inherited keys.
  const mappings = Object.create(null);
  const maps = (algo) => (mappings[algo] ||= Object.create(null));

  return Object.freeze({
    /**
     * Register a mapping. The assertion fails when `id` is empty or already
     * registered, unless `update` is true.
     */
    add(algo, id, map, update = false) {
      const bucket = maps(algo);
      Errors.ErrorUtil.assert(
        !(!id || id in bucket) || update,
        `Entry <${id}> already exists / use <update=true> to overwrite`,
        { algo, id }
      );
      bucket[id] = map;
    },

    /** Remove a mapping; a no-op if it is not registered. */
    remove(algo, id) {
      delete maps(algo)[id];
    },

    /** Check whether a mapping id is registered for an algorithm. */
    has(algo, id) {
      return id in maps(algo);
    },

    /** Return the mapping for an id, or `undefined` if it is not registered. */
    get(algo, id) {
      return maps(algo)[id];
    },

    /** List the mapping ids registered for an algorithm. */
    list(algo) {
      return Object.keys(maps(algo));
    }
  });
})();

exports.Phonetic = Phonetic;
exports.PhoneticMappingRegistry = PhoneticMappingRegistry;
exports.PhoneticRegistry = PhoneticRegistry;