cmpstr
Version:
CmpStr is a lightweight, fast, and well-performing package for calculating string similarity
215 lines (211 loc) • 7.03 kB
JavaScript
// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License
'use strict';
var DeepMerge = require('../utils/DeepMerge.cjs');
var Errors = require('../utils/Errors.cjs');
var HashTable = require('../utils/HashTable.cjs');
var Profiler = require('../utils/Profiler.cjs');
var Registry = require('../utils/Registry.cjs');
// Profiler instance obtained once at module load; getIndex() and getIndexAsync()
// hand their index-building callbacks to its run() / runAsync() methods.
const profiler = Profiler.Profiler.getInstance();
/**
 * Base class for phonetic encoders.
 *
 * An instance is bound to an algorithm name and to a mapping definition that
 * is looked up in PhoneticMappingRegistry. The mapping may provide `patterns`
 * (string replacements applied first), a `ruleset` (context-sensitive rules),
 * a plain `map` (char -> code), an `ignore` list and `options`.
 *
 * `word2Chars`, `mapChar`, `exitEarly` and `adjustCode` are small overridable
 * steps of `encode` (presumably extension points for concrete algorithms).
 */
class Phonetic {
  /** Cache of encoded words, shared by all instances through the class. */
  static cache = new HashTable.HashTable();

  /** Default options; read via `this.constructor.default`, so a subclass may define its own. */
  static default;

  /** Algorithm name passed to the constructor. */
  algo;

  /** Options merged from class defaults, the mapping's options and the caller's options. */
  options;

  /** Stringified hash of `options`; appended to cache keys. */
  optKey;

  /** Mapping definition resolved from PhoneticMappingRegistry. */
  map;

  /** Clears the shared cache. */
  static clear = () => this.cache.clear();

  /**
   * @param {string} algo - Algorithm name; used for the mapping lookup and for cache keys.
   * @param {object} [opt={}] - Options; `opt.map` selects the mapping id and
   *   falls back to `default.map` of the constructing class.
   * @throws {Errors.CmpStrNotFoundError} If no mapping id is available or the
   *   id is not registered for `algo`.
   */
  constructor(algo, opt = {}) {
    const defaults = this.constructor.default ?? {};
    const mapId = opt.map ?? defaults.map;

    if (!mapId) {
      throw new Errors.CmpStrNotFoundError(
        `No mapping specified for phonetic algorithm`,
        { algo }
      );
    }

    const map = PhoneticMappingRegistry.get(algo, mapId);

    if (map === undefined) {
      throw new Errors.CmpStrNotFoundError(
        `Requested mapping <${mapId}> is not declared`,
        { algo, mapId }
      );
    }

    // Merge order: class defaults, then mapping options, then caller options.
    // NOTE(review): `defaults` is passed as the first merge argument; if
    // DeepMerge.merge mutates its target, the shared static default object
    // would be modified here — confirm in utils/DeepMerge.
    this.options = DeepMerge.merge(
      DeepMerge.merge(defaults, map.options ?? {}),
      opt
    );

    // The sorted key list is given to JSON.stringify as replacer so the
    // serialisation does not depend on property insertion order.
    // NOTE(review): an array replacer also filters the keys of nested objects,
    // so nested option values only contribute keys that also exist at top level.
    this.optKey = HashTable.Hasher.fastFNV1a(
      JSON.stringify(this.options, Object.keys(this.options).sort())
    ).toString();

    this.algo = algo;
    this.map = map;
  }

  /**
   * Applies the mapping's `patterns` to the word, in order.
   *
   * Each entry is `{ pattern, replace, all }`; `all` selects `replaceAll`
   * instead of `replace`.
   *
   * @param {string} word - Input word.
   * @returns {string} The word after all replacements (unchanged if the
   *   mapping has no patterns).
   */
  applyPattern(word) {
    const { patterns = [] } = this.map;
    if (!patterns || !patterns.length) return word;

    let result = word;
    for (const { pattern, replace, all = false } of patterns) {
      result = result[all ? 'replaceAll' : 'replace'](pattern, replace);
    }
    return result;
  }

  /**
   * Returns the code of the first rule in the mapping's `ruleset` whose
   * conditions all hold for the char at index `i`.
   *
   * Conditions checked per rule (each only if present on the rule): `char`,
   * `position` ('start' | 'middle' | 'end'), `prev` / `prevNot`,
   * `prev2` / `prev2Not`, `next` / `nextNot`, `next2` / `next2Not`,
   * `leading`, `trailing` and `match`.
   *
   * @param {string} char - Current char.
   * @param {number} i - Zero-based index of `char` in `chars`.
   * @param {string[]} chars - All chars of the word.
   * @param {number} charLen - Number of chars in the word.
   * @returns {*} `rule.code` of the first matching rule, or `undefined`.
   */
  applyRules(char, i, chars, charLen) {
    const { ruleset = [] } = this.map;
    if (!ruleset || !ruleset.length) return undefined;

    // Neighbours outside the word are represented by an empty string.
    const prev = chars[i - 1] || '';
    const prev2 = chars[i - 2] || '';
    const next = chars[i + 1] || '';
    const next2 = chars[i + 2] || '';

    for (const rule of ruleset) {
      if (rule.char && rule.char !== char) continue;

      if (rule.position === 'start' && i !== 0) continue;
      if (rule.position === 'middle' && (i === 0 || i === charLen - 1)) continue;
      // `i` is zero-based, so the last char sits at charLen - 1 (the same
      // convention the 'middle' check uses). Comparing against charLen could
      // never be true, which made every 'end' rule unreachable.
      if (rule.position === 'end' && i !== charLen - 1) continue;

      if (rule.prev && !rule.prev.includes(prev)) continue;
      if (rule.prevNot && rule.prevNot.includes(prev)) continue;
      if (rule.prev2 && !rule.prev2.includes(prev2)) continue;
      if (rule.prev2Not && rule.prev2Not.includes(prev2)) continue;
      if (rule.next && !rule.next.includes(next)) continue;
      if (rule.nextNot && rule.nextNot.includes(next)) continue;
      if (rule.next2 && !rule.next2.includes(next2)) continue;
      if (rule.next2Not && rule.next2Not.includes(next2)) continue;

      // The first `rule.leading.length` chars, joined, must be contained in `rule.leading`.
      if (
        rule.leading &&
        !rule.leading.includes(chars.slice(0, rule.leading.length).join(''))
      ) {
        continue;
      }

      // Same check against the last `rule.trailing.length` chars.
      if (
        rule.trailing &&
        !rule.trailing.includes(chars.slice(-rule.trailing.length).join(''))
      ) {
        continue;
      }

      // `match` is a sequence that must equal the chars starting at index `i`.
      if (rule.match && !rule.match.every((c, j) => chars[i + j] === c)) continue;

      return rule.code;
    }

    return undefined;
  }

  /**
   * Encodes a single word.
   *
   * Steps: apply patterns, split into chars, map every non-ignored char to a
   * code via `mapChar`, stop as soon as `exitEarly` reports true, and finally
   * pass the collected code through `adjustCode`.
   *
   * @param {string} word - Word to encode.
   * @returns {string} Result of `adjustCode` for the collected code.
   */
  encode(word) {
    const { map = {}, ignore = [] } = this.map;
    const chars = this.word2Chars(this.applyPattern(word));
    const charLen = chars.length;

    let code = '';
    let lastCode = null;

    for (let i = 0; i < charLen; i++) {
      const char = chars[i];
      if (ignore.includes(char)) continue;

      const mapped = this.mapChar(char, i, chars, charLen, lastCode, map);
      if (mapped === undefined) continue;

      code += mapped;
      lastCode = mapped;

      if (this.exitEarly(code, i)) break;
    }

    return this.adjustCode(code, chars);
  }

  /**
   * Resolves the code for one char: rule result first, then the plain map,
   * then the `fallback` option.
   *
   * @param {string} char - Current char.
   * @param {number} i - Zero-based index of `char`.
   * @param {string[]} chars - All chars of the word.
   * @param {number} charLen - Number of chars.
   * @param {*} lastCode - Code appended most recently (`null` at the start).
   * @param {object} map - Plain char -> code map.
   * @returns {*} The code, or `undefined` when the `dedupe` option (default
   *   true) is on and the code equals `lastCode`, or when nothing resolved.
   */
  mapChar(char, i, chars, charLen, lastCode, map) {
    const { dedupe = true, fallback = undefined } = this.options;
    const c = this.applyRules(char, i, chars, charLen) ?? map[char] ?? fallback;
    return dedupe && c === lastCode ? undefined : c;
  }

  /**
   * Pads or truncates a code to the `length` option.
   *
   * @param {string} input - Code to normalise.
   * @returns {string} `input` unchanged when `length` is -1 (the default);
   *   otherwise `input` right-padded with the `pad` option (default '0') and
   *   cut to `length` chars.
   */
  equalLen(input) {
    const { length = -1, pad = '0' } = this.options;
    return length === -1
      ? input
      : (input + pad.repeat(length)).slice(0, length);
  }

  /** Splits a word into lower-cased UTF-16 code units. */
  word2Chars = (word) => word.toLowerCase().split('');

  /**
   * Tells `encode` whether to stop collecting codes.
   *
   * @param {string} code - Code collected so far.
   * @param {number} i - Current char index (not used by this implementation).
   * @returns {boolean} True when the `length` option is positive and `code`
   *   has reached it.
   */
  exitEarly(code, i) {
    const { length = -1 } = this.options;
    return length > 0 && code.length >= length;
  }

  /**
   * Final step of `encode`; this implementation returns `code` unchanged.
   *
   * @param {string} code - Collected code.
   * @param {string[]} chars - Chars of the encoded word (unused here).
   * @returns {string} `code`.
   */
  adjustCode(code, chars) {
    return code;
  }

  /**
   * Returns the code for `word`, consulting the shared cache first.
   *
   * The cache key is the base key from `Phonetic.cache.key` plus `optKey`.
   * The base key is checked before concatenation: appending the non-empty
   * `optKey` to a falsy base key would yield a truthy string and make all
   * such words share one cache slot. With a falsy base key the word is
   * encoded without caching.
   *
   * @param {string} word - Word to encode.
   * @returns {*} Cached value, or the fresh result of `encode(word)`.
   */
  #encodeCached(word) {
    const baseKey = Phonetic.cache.key(this.algo, [word]);
    const key = baseKey ? baseKey + this.optKey : '';

    if (key) {
      const cached = Phonetic.cache.get(key);
      // Only null/undefined count as a miss (same as the `??` semantics).
      if (cached !== undefined && cached !== null) return cached;
    }

    const code = this.encode(word);
    if (key) Phonetic.cache.set(key, code);
    return code;
  }

  /**
   * Encodes every word and collects the non-empty codes, normalised by
   * `equalLen`. The work runs inside `Errors.ErrorUtil.wrap` together with a
   * failure message and context data.
   *
   * @param {Iterable<string>} words - Words to encode.
   * @returns {*} Whatever `ErrorUtil.wrap` returns for the callback
   *   (presumably the array of codes — confirm in utils/Errors).
   */
  loop(words) {
    return Errors.ErrorUtil.wrap(
      () => {
        const index = [];
        for (const word of words) {
          const code = this.#encodeCached(word);
          if (code && code.length) index.push(this.equalLen(code));
        }
        return index;
      },
      `Failed to generate phonetic index`,
      { algo: this.algo, words }
    );
  }

  /**
   * Asynchronous counterpart of `loop`; each code is awaited, so a
   * promise-like code value is resolved before use.
   *
   * @param {Iterable<string>} words - Words to encode.
   * @returns {Promise<*>} Result of `ErrorUtil.wrapAsync` for the callback.
   */
  async loopAsync(words) {
    return Errors.ErrorUtil.wrapAsync(
      async () => {
        const index = [];
        for (const word of words) {
          const code = await Promise.resolve(this.#encodeCached(word));
          if (code && code.length) index.push(this.equalLen(code));
        }
        return index;
      },
      `Failed to generate phonetic index asynchronously`,
      { algo: this.algo, words }
    );
  }

  /** Returns the algorithm name this instance was created with. */
  getAlgoName = () => this.algo;

  /**
   * Builds the phonetic index of an input string.
   *
   * The input is split on the `delimiter` option (default: a single space),
   * empty parts are dropped, the words are encoded via `loop`, and falsy
   * codes are filtered out. The whole computation is handed to `profiler.run`.
   *
   * @param {string} input - Text to index.
   * @returns {*} Return value of `profiler.run` for that computation.
   */
  getIndex(input) {
    const { delimiter = ' ' } = this.options;
    return profiler.run(() =>
      this.loop(input.split(delimiter).filter(Boolean)).filter(Boolean)
    );
  }

  /**
   * Asynchronous counterpart of `getIndex`, using `loopAsync` and
   * `profiler.runAsync`; falsy entries are filtered from the awaited result.
   *
   * @param {string} input - Text to index.
   * @returns {Promise<Array>} The filtered result.
   */
  async getIndexAsync(input) {
    const { delimiter = ' ' } = this.options;
    return (
      await profiler.runAsync(
        async () => await this.loopAsync(input.split(delimiter).filter(Boolean))
      )
    ).filter(Boolean);
  }
}
// Registry built by the Registry factory from the label 'phonetic' and the Phonetic class.
// NOTE(review): presumably Phonetic acts as the base class registered algorithms must
// extend — confirm in utils/Registry.
const PhoneticRegistry = Registry.Registry('phonetic', Phonetic);
/**
 * Frozen registry of phonetic mapping definitions, grouped per algorithm.
 *
 * Storage lives in the closure: one prototype-less object per algorithm name,
 * each holding mapping id -> mapping definition. Because the objects have no
 * prototype, the `in` checks only ever see registered ids.
 */
const PhoneticMappingRegistry = (() => {
  const store = Object.create(null);

  // Returns the bucket of an algorithm, creating an empty one on first access.
  const bucketOf = (algo) => {
    if (!store[algo]) store[algo] = Object.create(null);
    return store[algo];
  };

  const api = {
    /**
     * Registers `map` under `id` for `algo`.
     * Asserts that `id` is truthy and not yet registered, unless `update` is truthy.
     */
    add(algo, id, map, update = false) {
      const bucket = bucketOf(algo);
      const isFresh = Boolean(id) && !(id in bucket);
      Errors.ErrorUtil.assert(
        isFresh || update,
        `Entry <${id}> already exists / use <update=true> to overwrite`,
        { algo, id }
      );
      bucket[id] = map;
    },

    /** Removes the mapping `id` of `algo`; a missing entry is not an error. */
    remove(algo, id) {
      const bucket = bucketOf(algo);
      delete bucket[id];
    },

    /** Reports whether `id` is registered for `algo`. */
    has(algo, id) {
      const bucket = bucketOf(algo);
      return id in bucket;
    },

    /** Returns the mapping registered as `id` for `algo`, or `undefined`. */
    get(algo, id) {
      const bucket = bucketOf(algo);
      return bucket[id];
    },

    /** Lists the mapping ids registered for `algo`. */
    list(algo) {
      const bucket = bucketOf(algo);
      return Object.keys(bucket);
    },
  };

  return Object.freeze(api);
})();
// CommonJS exports: the Phonetic base class plus the mapping and algorithm registries.
exports.Phonetic = Phonetic;
exports.PhoneticMappingRegistry = PhoneticMappingRegistry;
exports.PhoneticRegistry = PhoneticRegistry;