UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.

311 lines (308 loc) 9.74 kB
// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License
import { merge, set, rmv, get } from './utils/DeepMerge.mjs';
import { DiffChecker } from './utils/DiffChecker.mjs';
import { CmpStrInternalError, CmpStrNotFoundError, ErrorUtil } from './utils/Errors.mjs';
import { Filter } from './utils/Filter.mjs';
import { Normalizer } from './utils/Normalizer.mjs';
import { Profiler } from './utils/Profiler.mjs';
import { factory } from './utils/Registry.mjs';
import { StructuredData } from './utils/StructuredData.mjs';
import { TextAnalyzer } from './utils/TextAnalyzer.mjs';
import './metric/Cosine.mjs';
import './metric/DamerauLevenshtein.mjs';
import './metric/DiceSorensen.mjs';
import './metric/Hamming.mjs';
import './metric/Jaccard.mjs';
import './metric/JaroWinkler.mjs';
import './metric/LCS.mjs';
import './metric/Levenshtein.mjs';
import './metric/NeedlemanWunsch.mjs';
import './metric/QGram.mjs';
import './metric/SmithWaterman.mjs';
import { MetricRegistry, Metric } from './metric/Metric.mjs';
import './phonetic/Caverphone.mjs';
import './phonetic/Cologne.mjs';
import './phonetic/Metaphone.mjs';
import './phonetic/Soundex.mjs';
import { PhoneticMappingRegistry, PhoneticRegistry, Phonetic } from './phonetic/Phonetic.mjs';

// Shared profiler instance; its `services` object is re-exported as `CmpStr.profiler`.
const profiler = Profiler.getInstance();

/**
 * Facade that ties together normalization, filtering, phonetic indexing and
 * similarity metrics behind one configurable instance.
 *
 * Instance options are stored in `this.options`; the keys read by this class
 * are `metric`, `flags`, `processors`, `raw`, `opt`, `output`, `safeEmpty`
 * and `removeZero`.
 */
class CmpStr {
  /** Static access to the filter helpers exposed by `Filter`. */
  static filter = {
    has: Filter.has,
    add: Filter.add,
    remove: Filter.remove,
    pause: Filter.pause,
    resume: Filter.resume,
    list: Filter.list,
    clear: Filter.clear
  };

  /** Static access to the metric registry. */
  static metric = {
    add: MetricRegistry.add,
    remove: MetricRegistry.remove,
    has: MetricRegistry.has,
    list: MetricRegistry.list
  };

  /** Static access to the phonetic registry and its mapping registry (`map`). */
  static phonetic = {
    add: PhoneticRegistry.add,
    remove: PhoneticRegistry.remove,
    has: PhoneticRegistry.has,
    list: PhoneticRegistry.list,
    map: {
      add: PhoneticMappingRegistry.add,
      remove: PhoneticMappingRegistry.remove,
      has: PhoneticMappingRegistry.has,
      list: PhoneticMappingRegistry.list
    }
  };

  /** Services of the shared profiler instance. */
  static profiler = profiler.services;

  /** Cache clearing hooks of the individual subsystems. */
  static clearCache = {
    normalizer: Normalizer.clear,
    filter: Filter.clearPipeline,
    metric: Metric.clear,
    phonetic: Phonetic.clear
  };

  /** Creates a `TextAnalyzer` for the given input. */
  static analyze = (input) => new TextAnalyzer(input);

  /** Creates a `DiffChecker` for the two given texts. */
  static diff = (a, b, opt) => new DiffChecker(a, b, opt);

  /**
   * Factory shortcut for `new CmpStr(opt)`.
   *
   * @param {object|string} [opt] - Options object or serialized (JSON) options
   * @returns {CmpStr} New instance
   */
  static create(opt) {
    return new CmpStr(opt);
  }

  /** Instance options; a prototype-less object so keys cannot clash with `Object.prototype`. */
  options = Object.create(null);

  /**
   * @param {object|string} [opt] - Options object, or a JSON string that is
   *   parsed via `setSerializedOptions`
   */
  constructor(opt) {
    if (opt) typeof opt === 'string' ? this.setSerializedOptions(opt) : this.setOptions(opt);
  }

  /**
   * Verifies that a metric or phonetic algorithm is registered.
   *
   * @param {string} cond - Either `'metric'` or `'phonetic'`
   * @param {string} test - Name to look up in the matching registry
   * @throws {CmpStrNotFoundError} If the name is not registered
   * @throws {CmpStrInternalError} If `cond` is not a known condition
   */
  assert(cond, test) {
    switch (cond) {
      case 'metric':
        if (!CmpStr.metric.has(test))
          throw new CmpStrNotFoundError(
            `CmpStr <metric> must be set, call .setMetric(), ` +
              `use CmpStr.metric.list() for available metrics`,
            { metric: test }
          );
        break;
      case 'phonetic':
        if (!CmpStr.phonetic.has(test))
          throw new CmpStrNotFoundError(
            `CmpStr <phonetic> must be set, call .setPhonetic(), ` +
              `use CmpStr.phonetic.list() for available phonetic algorithms`,
            { phonetic: test }
          );
        break;
      default:
        throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
    }
  }

  /**
   * Runs `assert` for every `[cond, test]` pair.
   *
   * @param {...[string, string]} cond - Pairs of condition and name
   */
  assertMany(...cond) {
    for (const [c, test] of cond) this.assert(c, test);
  }

  /**
   * Combines the instance options with per-call options.
   *
   * NOTE(review): only the top level of `this.options` is copied before the
   * merge. Whether nested objects of the instance options can be mutated
   * depends on how `merge` treats nested targets — confirm in DeepMerge.
   *
   * @param {object} [opt] - Per-call options
   * @returns {object} Result of merging `opt` into a shallow copy of the instance options
   */
  resolveOptions(opt) {
    return merge({ ...(this.options ?? Object.create(null)) }, opt);
  }

  /**
   * Normalizes the input through `Normalizer.normalize`.
   *
   * @param {string|string[]} input - Input to normalize
   * @param {string} [flags] - Normalization flags; falls back to `options.flags`, then `''`
   * @returns {string|string[]} Normalized input
   */
  normalize(input, flags) {
    return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
  }

  /**
   * Applies the filters registered for a hook.
   *
   * @param {string|string[]} input - Input to filter
   * @param {string} hook - Hook name passed to `Filter.apply`
   * @returns {string|string[]} Filtered input
   */
  filter(input, hook) {
    return Filter.apply(hook, input);
  }

  /**
   * Prepares input for comparison: normalization (only if flags are set),
   * the `'input'` filter hook, then optional phonetic indexing.
   *
   * @param {string|string[]} input - Raw input
   * @param {object} [opt] - Options providing `flags` and `processors`;
   *   defaults to the instance options
   * @returns {string|string[]} Prepared input
   */
  prepare(input, opt) {
    const { flags, processors } = opt ?? this.options;
    if (flags?.length) input = this.normalize(input, flags);
    input = this.filter(input, 'input');
    if (processors?.phonetic) input = this.index(input, processors.phonetic);
    return input;
  }

  /**
   * Post-processes metric results.
   *
   * @param {object|object[]} result - Metric result(s)
   * @param {object} [opt] - Options; with `removeZero`, array entries whose
   *   `res` is not greater than 0 are dropped
   * @returns {object|object[]} Processed result(s)
   */
  postProcess(result, opt) {
    if (opt?.removeZero && Array.isArray(result)) result = result.filter((r) => r.res > 0);
    return result;
  }

  /**
   * Computes the phonetic index of the input and joins the index parts.
   *
   * @param {string|string[]} input - Input string(s)
   * @param {{ algo: string, opt?: object }} param1 - Phonetic algorithm name and its options;
   *   `opt.delimiter` (default `' '`) joins the index parts
   * @returns {string|string[]} Joined phonetic index, per input entry
   * @throws {CmpStrNotFoundError} If the algorithm is not registered
   */
  index(input, { algo, opt }) {
    this.assert('phonetic', algo);
    const phonetic = factory['phonetic'](algo, opt);
    const delimiter = opt?.delimiter ?? ' ';
    return Array.isArray(input)
      ? input.map((s) => phonetic.getIndex(s).join(delimiter))
      : phonetic.getIndex(input).join(delimiter);
  }

  /** Creates a `StructuredData` wrapper for `data`, keyed by `key`. */
  structured(data, key) {
    return StructuredData.create(data, key);
  }

  /**
   * Core routine: resolves options, prepares both inputs, runs the metric
   * and formats the result. The whole computation runs inside `ErrorUtil.wrap`.
   *
   * @param {string|string[]} a - First input
   * @param {string|string[]} b - Second input
   * @param {object} [opt] - Per-call options merged over the instance options
   * @param {string} [mode] - Mode handed to `metric.run` (`'single'`, `'batch'`
   *   and `'pairwise'` are used by this class)
   * @param {boolean} [raw] - Return raw metric results; falls back to the resolved `raw` option
   * @param {boolean} [skip] - Skip preparation because the inputs are already prepared
   * @returns {object|object[]} Formatted or raw result(s); an empty array when
   *   `safeEmpty` is set and either prepared input is empty
   */
  compute(a, b, opt, mode, raw, skip) {
    return ErrorUtil.wrap(
      () => {
        const resolved = this.resolveOptions(opt);
        this.assert('metric', resolved.metric);
        const A = skip ? a : this.prepare(a, resolved);
        const B = skip ? b : this.prepare(b, resolved);
        if (
          resolved.safeEmpty &&
          ((Array.isArray(A) && A.length === 0) ||
            (Array.isArray(B) && B.length === 0) ||
            A === '' ||
            B === '')
        ) {
          return [];
        }
        const metric = factory['metric'](resolved.metric, A, B, resolved.opt);
        // Unless prepared output is requested, hand the untouched inputs to the metric.
        if (resolved.output !== 'prep') metric.setOriginal(a, b);
        metric.run(mode);
        const result = this.postProcess(metric.getResults(), resolved);
        return this.output(result, raw ?? resolved.raw);
      },
      `Failed to compute metric <${opt?.metric ?? this.options.metric}> for the given inputs`,
      { a, b, options: opt }
    );
  }

  /**
   * Formats metric results. Raw mode returns them untouched; otherwise each
   * result is mapped to `{ source, target, match }`.
   *
   * @param {object|object[]} result - Metric result(s) carrying `a`, `b` and `res`
   * @param {boolean} [raw] - Raw switch; falls back to `options.raw`
   * @returns {object|object[]} Formatted or raw result(s)
   */
  output(result, raw) {
    return ErrorUtil.wrap(
      () =>
        (raw ?? this.options.raw)
          ? result
          : Array.isArray(result)
            ? result.map((r) => ({ source: r.a, target: r.b, match: r.res }))
            : { source: result.a, target: result.b, match: result.res },
      `Failed to resolve output format for the metric result`,
      { result, raw }
    );
  }

  /**
   * Creates a shallow clone of this instance (the options object is shared).
   *
   * The arrow-function fields of this class (`setMetric`, `getOptions`, ...)
   * capture the instance they were created on. Copying them with
   * `Object.assign` alone would leave the clone calling into the original,
   * so the clone is constructed normally and its own fields are restored
   * after the state has been copied.
   *
   * @returns {CmpStr} Clone with the same prototype and own state
   */
  clone = () => {
    const Ctor = Object.getPrototypeOf(this).constructor;
    const copy = new Ctor();
    // Remember the function fields that are correctly bound to the new instance.
    const ownBound = Object.entries(copy).filter(([, value]) => typeof value === 'function');
    Object.assign(copy, this);
    for (const [key, value] of ownBound) copy[key] = value;
    return copy;
  };

  /**
   * Removes every key from the options object (the object itself is kept).
   *
   * @returns {this}
   */
  reset() {
    for (const k in this.options) delete this.options[k];
    return this;
  }

  /**
   * Replaces the instance options.
   *
   * @param {object} opt - New options; a nullish value resets to an empty
   *   object so later option reads do not throw
   * @returns {this}
   */
  setOptions(opt) {
    this.options = opt ?? Object.create(null);
    return this;
  }

  /**
   * Merges `opt` into the instance options in place.
   *
   * @param {object} opt - Options to merge
   * @returns {this}
   */
  mergeOptions(opt) {
    merge(this.options, opt);
    return this;
  }

  /**
   * Replaces the instance options with the parsed JSON string.
   *
   * @param {string} opt - JSON string; a JSON `null` resets to an empty object
   * @returns {this}
   */
  setSerializedOptions(opt) {
    return ErrorUtil.wrap(
      () => {
        this.options = JSON.parse(opt) ?? Object.create(null);
        return this;
      },
      `Failed to parse serialized options, invalid JSON string`,
      { opt }
    );
  }

  /**
   * Sets a single option via the `set` helper from DeepMerge.
   *
   * @param {string} path - Option path
   * @param {*} value - Value to store
   * @returns {this}
   */
  setOption(path, value) {
    set(this.options, path, value);
    return this;
  }

  /**
   * Removes a single option via the `rmv` helper from DeepMerge.
   *
   * @param {string} path - Option path
   * @returns {this}
   */
  rmvOption(path) {
    rmv(this.options, path);
    return this;
  }

  /** Sets the `raw` option. */
  setRaw = (enable) => this.setOption('raw', enable);

  /** Sets the `metric` option. */
  setMetric = (name) => this.setOption('metric', name);

  /** Sets the `flags` option. */
  setFlags = (flags) => this.setOption('flags', flags);

  /** Removes the `flags` option. */
  rmvFlags = () => this.rmvOption('flags');

  /** Sets the `processors` option. */
  setProcessors = (opt) => this.setOption('processors', opt);

  /** Removes the `processors` option. */
  rmvProcessors = () => this.rmvOption('processors');

  /** Returns the live options object (not a copy). */
  getOptions = () => this.options;

  /** Returns the options as a JSON string. */
  getSerializedOptions = () => JSON.stringify(this.options);

  /** Reads a single option via the `get` helper from DeepMerge. */
  getOption = (path) => get(this.options, path);

  /** Compares `a` and `b` in `'single'` mode and returns the formatted result. */
  test(a, b, opt) {
    return this.compute(a, b, opt, 'single');
  }

  /** Compares `a` and `b` in `'single'` mode and returns only the raw `res` value. */
  compare(a, b, opt) {
    return this.compute(a, b, opt, 'single', true).res;
  }

  /** Runs the metric in `'batch'` mode and returns the formatted results. */
  batchTest(a, b, opt) {
    return this.compute(a, b, opt, 'batch');
  }

  /**
   * Runs the metric in `'batch'` mode and sorts the results by `res`.
   *
   * @param {string|string[]} a - First input
   * @param {string|string[]} b - Second input
   * @param {'asc'|'desc'} [dir='desc'] - Sort direction; anything but `'asc'` sorts descending
   * @param {object} [opt] - Per-call options
   * @returns {object[]} Sorted results, formatted according to the `raw` option
   */
  batchSorted(a, b, dir = 'desc', opt) {
    return this.output(
      this.compute(a, b, opt, 'batch', true).sort((x, y) =>
        dir === 'asc' ? x.res - y.res : y.res - x.res
      ),
      opt?.raw ?? this.options.raw
    );
  }

  /** Runs the metric in `'pairwise'` mode and returns the formatted results. */
  pairs(a, b, opt) {
    return this.compute(a, b, opt, 'pairwise');
  }

  /**
   * Runs the metric in `'batch'` mode, keeps results with `res >= threshold`
   * and sorts them in descending order.
   *
   * @param {string|string[]} a - First input
   * @param {string|string[]} b - Second input
   * @param {number} threshold - Minimum `res` value to keep
   * @param {object} [opt] - Per-call options
   * @returns {object[]} Matching results, formatted according to the `raw` option
   */
  match(a, b, threshold, opt) {
    return this.output(
      this.compute(a, b, opt, 'batch', true)
        .filter((r) => r.res >= threshold)
        .sort((x, y) => y.res - x.res),
      opt?.raw ?? this.options.raw
    );
  }

  /** Returns the first `n` entries of the descending batch result. */
  closest(a, b, n = 1, opt) {
    return this.batchSorted(a, b, 'desc', opt).slice(0, n);
  }

  /** Returns the first `n` entries of the ascending batch result. */
  furthest(a, b, n = 1, opt) {
    return this.batchSorted(a, b, 'asc', opt).slice(0, n);
  }

  /**
   * Substring search: returns the original haystack entries whose prepared
   * form includes the prepared needle.
   *
   * @param {string} needle - Search term
   * @param {string[]} haystack - Entries to search
   * @param {string} [flags] - Normalization flags for this call
   * @param {object} [processors] - Processors for this call
   * @returns {string[]} Matching (unprepared) haystack entries
   */
  search(needle, haystack, flags, processors) {
    const resolved = this.resolveOptions({ flags, processors });
    const test = this.prepare(needle, resolved);
    const hstk = this.prepare(haystack, resolved);
    return haystack.filter((_, i) => hstk[i].includes(test));
  }

  /**
   * Builds a similarity matrix by comparing every input entry against all entries.
   *
   * The inputs are prepared once up front, so `compute` is called with
   * `skip = true`. The per-call options are forwarded so that a `metric` (or
   * metric `opt`) given here is honoured, not only the instance-level one.
   *
   * @param {string[]} input - Entries to compare
   * @param {object} [opt] - Per-call options
   * @returns {number[][]} One row of `res` values per entry; a missing `res` becomes 0
   */
  matrix(input, opt) {
    const prepared = this.prepare(input, this.resolveOptions(opt));
    return prepared.map((a) =>
      this.compute(a, prepared, opt, 'batch', true, true).map((b) => b.res ?? 0)
    );
  }

  /**
   * Computes the phonetic index of the input.
   *
   * @param {string|string[]} input - Input string(s)
   * @param {string} [algo] - Algorithm name; falls back to `options.processors.phonetic.algo`
   * @param {object} [opt] - Algorithm options; falls back to `options.processors.phonetic.opt`
   * @returns {string|string[]} Joined phonetic index
   */
  phoneticIndex(input, algo, opt) {
    const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
    return this.index(input, { algo: algo ?? a, opt: opt ?? o });
  }

  /** Structured-data lookup that uses `batchTest` as the comparison callback. */
  structuredLookup(query, data, key, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.batchTest(q, items, options),
      query,
      opt
    );
  }

  /** Structured-data lookup that uses `match` with `threshold`; passes `sort: 'desc'`. */
  structuredMatch(query, data, key, threshold, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.match(q, items, threshold, options),
      query,
      { ...opt, sort: 'desc' }
    );
  }

  /** Structured-data lookup that uses `closest` with `n`; passes `sort: 'desc'`. */
  structuredClosest(query, data, key, n = 1, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.closest(q, items, n, options),
      query,
      { ...opt, sort: 'desc' }
    );
  }

  /** Structured-data lookup that uses `furthest` with `n`; passes `sort: 'asc'`. */
  structuredFurthest(query, data, key, n = 1, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.furthest(q, items, n, options),
      query,
      { ...opt, sort: 'asc' }
    );
  }

  /** Pairwise comparison of two structured data sets via `lookupPairs` and `pairs`. */
  structuredPairs(data, key, other, otherKey, opt) {
    return this.structured(data, key).lookupPairs(
      (items, otherItems, options) => this.pairs(items, otherItems, options),
      other,
      otherKey,
      opt
    );
  }
}

export { CmpStr };