UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.

315 lines (311 loc) 10.1 kB
// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License
'use strict';

const DeepMerge = require('./utils/DeepMerge.cjs');
const DiffChecker = require('./utils/DiffChecker.cjs');
const Errors = require('./utils/Errors.cjs');
const Filter = require('./utils/Filter.cjs');
const Normalizer = require('./utils/Normalizer.cjs');
const Profiler = require('./utils/Profiler.cjs');
const Registry = require('./utils/Registry.cjs');
const StructuredData = require('./utils/StructuredData.cjs');
const TextAnalyzer = require('./utils/TextAnalyzer.cjs');

// The metric modules below are loaded for their side effects only (their
// exports are not used in this file).
// NOTE(review): presumably each module registers itself with the metric
// registry on load — confirm in the individual modules.
require('./metric/Cosine.cjs');
require('./metric/DamerauLevenshtein.cjs');
require('./metric/DiceSorensen.cjs');
require('./metric/Hamming.cjs');
require('./metric/Jaccard.cjs');
require('./metric/JaroWinkler.cjs');
require('./metric/LCS.cjs');
require('./metric/Levenshtein.cjs');
require('./metric/NeedlemanWunsch.cjs');
require('./metric/QGram.cjs');
require('./metric/SmithWaterman.cjs');
const Metric = require('./metric/Metric.cjs');

// Same pattern for the phonetic algorithms: side-effect imports first, then
// the module whose exports are actually referenced.
require('./phonetic/Caverphone.cjs');
require('./phonetic/Cologne.cjs');
require('./phonetic/Metaphone.cjs');
require('./phonetic/Soundex.cjs');
const Phonetic = require('./phonetic/Phonetic.cjs');

// Shared profiler instance; only its `services` are exposed on the class.
const profiler = Profiler.Profiler.getInstance();

/**
 * Main entry point of the package. Bundles the normalizer, filter, metric and
 * phonetic helpers behind one configurable object and exposes the registries
 * as static members.
 */
class CmpStr {
  /** Static pass-through to the filter management functions. */
  static filter = {
    has: Filter.Filter.has,
    add: Filter.Filter.add,
    remove: Filter.Filter.remove,
    pause: Filter.Filter.pause,
    resume: Filter.Filter.resume,
    list: Filter.Filter.list,
    clear: Filter.Filter.clear
  };

  /** Static pass-through to the metric registry. */
  static metric = {
    add: Metric.MetricRegistry.add,
    remove: Metric.MetricRegistry.remove,
    has: Metric.MetricRegistry.has,
    list: Metric.MetricRegistry.list
  };

  /** Static pass-through to the phonetic registry and its mapping registry. */
  static phonetic = {
    add: Phonetic.PhoneticRegistry.add,
    remove: Phonetic.PhoneticRegistry.remove,
    has: Phonetic.PhoneticRegistry.has,
    list: Phonetic.PhoneticRegistry.list,
    map: {
      add: Phonetic.PhoneticMappingRegistry.add,
      remove: Phonetic.PhoneticMappingRegistry.remove,
      has: Phonetic.PhoneticMappingRegistry.has,
      list: Phonetic.PhoneticMappingRegistry.list
    }
  };

  /** Services of the shared profiler instance. */
  static profiler = profiler.services;

  /** Cache-clearing functions of the individual subsystems. */
  static clearCache = {
    normalizer: Normalizer.Normalizer.clear,
    filter: Filter.Filter.clearPipeline,
    metric: Metric.Metric.clear,
    phonetic: Phonetic.Phonetic.clear
  };

  /**
   * Creates a TextAnalyzer for the given input.
   *
   * @param {*} input - Value handed to the TextAnalyzer constructor
   * @returns {TextAnalyzer} A new analyzer instance
   */
  static analyze = (input) => new TextAnalyzer.TextAnalyzer(input);

  /**
   * Creates a DiffChecker for two inputs.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {*} [opt] - Options handed to the DiffChecker constructor
   * @returns {DiffChecker} A new diff checker instance
   */
  static diff = (a, b, opt) => new DiffChecker.DiffChecker(a, b, opt);

  /**
   * Factory equivalent of `new CmpStr(opt)`.
   *
   * @param {string|object} [opt] - Serialized (JSON) or plain options
   * @returns {CmpStr} A new instance
   */
  static create(opt) {
    return new CmpStr(opt);
  }

  /** Options of this instance; starts as a prototype-less empty object. */
  options = Object.create(null);

  /**
   * @param {string|object} [opt] - A string is treated as serialized JSON
   *   options, any other truthy value is stored as the options object
   */
  constructor(opt) {
    if (opt) typeof opt === 'string' ? this.setSerializedOptions(opt) : this.setOptions(opt);
  }

  /**
   * Verifies that a metric or phonetic algorithm name is registered.
   *
   * @param {string} cond - Either 'metric' or 'phonetic'
   * @param {*} test - Name that is looked up in the matching registry
   * @throws {CmpStrNotFoundError} If the name is not registered
   * @throws {CmpStrInternalError} If `cond` is neither 'metric' nor 'phonetic'
   */
  assert(cond, test) {
    switch (cond) {
      case 'metric':
        if (!CmpStr.metric.has(test))
          throw new Errors.CmpStrNotFoundError(
            `CmpStr <metric> must be set, call .setMetric(), ` +
              `use CmpStr.metric.list() for available metrics`,
            { metric: test }
          );
        break;
      case 'phonetic':
        if (!CmpStr.phonetic.has(test))
          throw new Errors.CmpStrNotFoundError(
            `CmpStr <phonetic> must be set, call .setPhonetic(), ` +
              `use CmpStr.phonetic.list() for available phonetic algorithms`,
            { phonetic: test }
          );
        break;
      default:
        throw new Errors.CmpStrInternalError(
          `Cmpstr condition <${cond}> unknown`
        );
    }
  }

  /**
   * Runs `assert` for several `[cond, test]` pairs.
   *
   * @param {...Array} cond - Tuples of condition name and value to test
   */
  assertMany(...cond) {
    for (const [c, test] of cond) this.assert(c, test);
  }

  /**
   * Combines the instance options with per-call options.
   *
   * Only the top level of the instance options is copied before merging.
   * NOTE(review): if DeepMerge.merge recurses into nested objects of its
   * target, nested instance options may be mutated by per-call options —
   * confirm against the DeepMerge implementation.
   *
   * @param {object} [opt] - Per-call options
   * @returns {object} The merged options
   */
  resolveOptions(opt) {
    return DeepMerge.merge({ ...(this.options ?? Object.create(null)) }, opt);
  }

  /**
   * Normalizes the input through the Normalizer.
   *
   * @param {*} input - Value to normalize
   * @param {string} [flags] - Normalization flags; falls back to the instance
   *   option `flags`, then to an empty string
   * @returns {*} Whatever Normalizer.normalize returns
   */
  normalize(input, flags) {
    return Normalizer.Normalizer.normalize(
      input,
      flags ?? this.options.flags ?? ''
    );
  }

  /**
   * Applies the filters registered for a hook to the input.
   *
   * @param {*} input - Value to filter
   * @param {string} hook - Name of the filter hook
   * @returns {*} Whatever Filter.apply returns
   */
  filter(input, hook) {
    return Filter.Filter.apply(hook, input);
  }

  /**
   * Prepares an input for comparison: optional normalization, the 'input'
   * filter hook, and optional phonetic indexing, in that order.
   *
   * @param {*} input - Raw input
   * @param {object} [opt] - Options to read `flags` and `processors` from;
   *   defaults to the instance options
   * @returns {*} The prepared input
   */
  prepare(input, opt) {
    const { flags, processors } = opt ?? this.options;
    if (flags?.length) input = this.normalize(input, flags);
    input = this.filter(input, 'input');
    if (processors?.phonetic) input = this.index(input, processors.phonetic);
    return input;
  }

  /**
   * Post-processes a metric result. With `opt.removeZero` set, array results
   * are reduced to entries whose `res` is greater than zero.
   *
   * @param {*} result - Metric result (single object or array)
   * @param {object} [opt] - Resolved options
   * @returns {*} The (possibly filtered) result
   */
  postProcess(result, opt) {
    if (opt?.removeZero && Array.isArray(result))
      result = result.filter((r) => r.res > 0);
    return result;
  }

  /**
   * Computes the phonetic index of the input and joins the index parts.
   *
   * @param {string|string[]} input - Single value or array of values
   * @param {{algo: string, opt?: object}} processor - Phonetic algorithm name
   *   and its options; `opt.delimiter` is the join string (default: ' ')
   * @returns {string|string[]} Joined index, per element for array input
   * @throws {CmpStrNotFoundError} If `algo` is not a registered algorithm
   */
  index(input, { algo, opt }) {
    this.assert('phonetic', algo);
    const phonetic = Registry.factory['phonetic'](algo, opt);
    const delimiter = opt?.delimiter ?? ' ';
    return Array.isArray(input)
      ? input.map((s) => phonetic.getIndex(s).join(delimiter))
      : phonetic.getIndex(input).join(delimiter);
  }

  /**
   * Wraps structured data for key-based lookups.
   *
   * @param {*} data - Data handed to StructuredData.create
   * @param {*} key - Key handed to StructuredData.create
   * @returns {StructuredData} The wrapper instance
   */
  structured(data, key) {
    return StructuredData.StructuredData.create(data, key);
  }

  /**
   * Core routine behind all comparison methods: resolves options, prepares
   * both inputs, runs the metric and formats the result. The whole routine is
   * executed through ErrorUtil.wrap together with a failure message and
   * context data.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {object} [opt] - Per-call options
   * @param {string} mode - Mode passed to the metric's `run`
   * @param {boolean} [raw] - Return the raw metric result; falls back to the
   *   resolved option `raw`
   * @param {boolean} [skip] - Skip preparation (inputs are already prepared)
   * @returns {*} Formatted result, or an empty array when `safeEmpty` is set
   *   and one of the prepared inputs is an empty string or empty array
   */
  compute(a, b, opt, mode, raw, skip) {
    return Errors.ErrorUtil.wrap(
      () => {
        const resolved = this.resolveOptions(opt);
        this.assert('metric', resolved.metric);
        const A = skip ? a : this.prepare(a, resolved);
        const B = skip ? b : this.prepare(b, resolved);
        if (
          resolved.safeEmpty &&
          ((Array.isArray(A) && A.length === 0) ||
            (Array.isArray(B) && B.length === 0) ||
            A === '' ||
            B === '')
        ) {
          return [];
        }
        const metric = Registry.factory['metric'](
          resolved.metric,
          A,
          B,
          resolved.opt
        );
        // Unless the output mode is 'prep', the unprepared inputs are handed
        // to the metric as its originals.
        if (resolved.output !== 'prep') metric.setOriginal(a, b);
        metric.run(mode);
        const result = this.postProcess(metric.getResults(), resolved);
        return this.output(result, raw ?? resolved.raw);
      },
      `Failed to compute metric <${opt?.metric ?? this.options.metric}> for the given inputs`,
      { a, b, options: opt }
    );
  }

  /**
   * Converts a metric result into the public output format.
   *
   * @param {*} result - Raw metric result (single object or array)
   * @param {boolean} [raw] - If truthy (or, when nullish, if the instance
   *   option `raw` is truthy) the result is returned unchanged
   * @returns {*} The raw result, or `{ source, target, match }` object(s)
   *   built from the `a`, `b` and `res` properties
   */
  output(result, raw) {
    return Errors.ErrorUtil.wrap(
      () =>
        (raw ?? this.options.raw)
          ? result
          : Array.isArray(result)
            ? result.map((r) => ({ source: r.a, target: r.b, match: r.res }))
            : { source: result.a, target: result.b, match: result.res },
      `Failed to resolve output format for the metric result`,
      { result, raw }
    );
  }

  /**
   * Creates a shallow clone of this instance.
   *
   * The copy is built through the instance's own constructor (called without
   * arguments) so that every arrow-function field (`setMetric`, `getOptions`,
   * ...) is re-created with `this` bound to the copy. Copying those fields
   * with Object.assign would carry over closures bound to the original, so
   * calls on the clone would silently act on (and return) the original.
   *
   * The clone stays shallow: non-function own properties — including the
   * `options` object — are copied by reference.
   *
   * @returns {CmpStr} The cloned instance
   */
  clone = () => {
    const copy = new this.constructor();
    for (const [key, value] of Object.entries(this)) {
      // Function-valued fields were freshly bound by the constructor above.
      if (typeof value !== 'function') copy[key] = value;
    }
    return copy;
  };

  /**
   * Removes every option from the current options object (in place).
   *
   * @returns {this} The instance, for chaining
   */
  reset() {
    for (const k in this.options) delete this.options[k];
    return this;
  }

  /**
   * Replaces the options object; the passed object is stored by reference.
   *
   * @param {object} opt - New options
   * @returns {this} The instance, for chaining
   */
  setOptions(opt) {
    this.options = opt;
    return this;
  }

  /**
   * Deep-merges the given options into the current options.
   *
   * @param {object} opt - Options to merge
   * @returns {this} The instance, for chaining
   */
  mergeOptions(opt) {
    DeepMerge.merge(this.options, opt);
    return this;
  }

  /**
   * Replaces the options with the result of parsing a JSON string.
   *
   * @param {string} opt - Serialized options
   * @returns {this} The instance, for chaining
   */
  setSerializedOptions(opt) {
    return Errors.ErrorUtil.wrap(
      () => {
        this.options = JSON.parse(opt);
        return this;
      },
      `Failed to parse serialized options, invalid JSON string`,
      { opt }
    );
  }

  /**
   * Sets a single option addressed by path.
   *
   * @param {string} path - Option path understood by DeepMerge.set
   * @param {*} value - Value to store
   * @returns {this} The instance, for chaining
   */
  setOption(path, value) {
    DeepMerge.set(this.options, path, value);
    return this;
  }

  /**
   * Removes a single option addressed by path.
   *
   * @param {string} path - Option path understood by DeepMerge.rmv
   * @returns {this} The instance, for chaining
   */
  rmvOption(path) {
    DeepMerge.rmv(this.options, path);
    return this;
  }

  /** Sets the `raw` option. */
  setRaw = (enable) => this.setOption('raw', enable);

  /** Sets the `metric` option. */
  setMetric = (name) => this.setOption('metric', name);

  /** Sets the `flags` option. */
  setFlags = (flags) => this.setOption('flags', flags);

  /** Removes the `flags` option. */
  rmvFlags = () => this.rmvOption('flags');

  /** Sets the `processors` option. */
  setProcessors = (opt) => this.setOption('processors', opt);

  /** Removes the `processors` option. */
  rmvProcessors = () => this.rmvOption('processors');

  /** Returns the current options object (by reference). */
  getOptions = () => this.options;

  /** Returns the current options as a JSON string. */
  getSerializedOptions = () => JSON.stringify(this.options);

  /** Returns a single option addressed by path. */
  getOption = (path) => DeepMerge.get(this.options, path);

  /**
   * Compares two inputs in 'single' mode.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {object} [opt] - Per-call options
   * @returns {*} The formatted result
   */
  test(a, b, opt) {
    return this.compute(a, b, opt, 'single');
  }

  /**
   * Compares two inputs in 'single' mode and returns only the `res` property
   * of the raw result.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {object} [opt] - Per-call options
   * @returns {*} The `res` value of the raw result
   */
  compare(a, b, opt) {
    return this.compute(a, b, opt, 'single', true).res;
  }

  /**
   * Compares inputs in 'batch' mode.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {object} [opt] - Per-call options
   * @returns {*} The formatted results
   */
  batchTest(a, b, opt) {
    return this.compute(a, b, opt, 'batch');
  }

  /**
   * Runs a batch comparison and sorts the raw results by `res`.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {string} [dir='desc'] - 'asc' sorts ascending, anything else
   *   descending
   * @param {object} [opt] - Per-call options
   * @returns {*} The sorted, formatted results
   */
  batchSorted(a, b, dir = 'desc', opt) {
    return this.output(
      this.compute(a, b, opt, 'batch', true).sort((a, b) =>
        dir === 'asc' ? a.res - b.res : b.res - a.res
      ),
      opt?.raw ?? this.options.raw
    );
  }

  /**
   * Compares inputs in 'pairwise' mode.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {object} [opt] - Per-call options
   * @returns {*} The formatted results
   */
  pairs(a, b, opt) {
    return this.compute(a, b, opt, 'pairwise');
  }

  /**
   * Runs a batch comparison and keeps the results whose `res` is at least
   * `threshold`, sorted by `res` in descending order.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {number} threshold - Minimum `res` value to keep
   * @param {object} [opt] - Per-call options
   * @returns {*} The filtered, sorted and formatted results
   */
  match(a, b, threshold, opt) {
    return this.output(
      this.compute(a, b, opt, 'batch', true)
        .filter((r) => r.res >= threshold)
        .sort((a, b) => b.res - a.res),
      opt?.raw ?? this.options.raw
    );
  }

  /**
   * Returns the first `n` entries of the descending batch result.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {number} [n=1] - Number of entries to return
   * @param {object} [opt] - Per-call options
   * @returns {Array} At most `n` results
   */
  closest(a, b, n = 1, opt) {
    return this.batchSorted(a, b, 'desc', opt).slice(0, n);
  }

  /**
   * Returns the first `n` entries of the ascending batch result.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {number} [n=1] - Number of entries to return
   * @param {object} [opt] - Per-call options
   * @returns {Array} At most `n` results
   */
  furthest(a, b, n = 1, opt) {
    return this.batchSorted(a, b, 'asc', opt).slice(0, n);
  }

  /**
   * Substring search: returns the original haystack entries whose prepared
   * form includes the prepared needle.
   *
   * @param {string} needle - Value to search for
   * @param {string[]} haystack - Entries to search in
   * @param {string} [flags] - Normalization flags for this call
   * @param {object} [processors] - Processors for this call
   * @returns {string[]} Matching (unprepared) haystack entries
   */
  search(needle, haystack, flags, processors) {
    const resolved = this.resolveOptions({ flags, processors });
    const test = this.prepare(needle, resolved);
    const hstk = this.prepare(haystack, resolved);
    return haystack.filter((_, i) => hstk[i].includes(test));
  }

  /**
   * Builds a matrix by batch-comparing every input entry against all entries.
   *
   * The input is prepared once up front, so the per-row computations skip
   * preparation.
   * NOTE(review): `opt` is used for preparation only; the per-row compute
   * calls receive no per-call options and therefore use the instance options
   * (including its metric) — confirm this is intended.
   *
   * @param {string[]} input - Entries to compare with each other
   * @param {object} [opt] - Per-call options used for preparation
   * @returns {Array[]} One row per entry holding the `res` values (nullish
   *   values become 0)
   */
  matrix(input, opt) {
    input = this.prepare(input, this.resolveOptions(opt));
    return input.map((a) =>
      this.compute(a, input, undefined, 'batch', true, true).map(
        (b) => b.res ?? 0
      )
    );
  }

  /**
   * Computes the phonetic index of the input.
   *
   * @param {string|string[]} input - Value(s) to index
   * @param {string} [algo] - Algorithm name; falls back to the instance
   *   option `processors.phonetic.algo`
   * @param {object} [opt] - Algorithm options; falls back to the instance
   *   option `processors.phonetic.opt`
   * @returns {string|string[]} The joined phonetic index
   */
  phoneticIndex(input, algo, opt) {
    const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
    return this.index(input, { algo: algo ?? a, opt: opt ?? o });
  }

  /**
   * Structured-data lookup backed by `batchTest`.
   *
   * @param {*} query - Query value
   * @param {*} data - Structured data
   * @param {*} key - Key of the compared property
   * @param {object} [opt] - Lookup options
   * @returns {*} Whatever the StructuredData lookup returns
   */
  structuredLookup(query, data, key, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.batchTest(q, items, options),
      query,
      opt
    );
  }

  /**
   * Structured-data lookup backed by `match`, with `sort: 'desc'` enforced.
   *
   * @param {*} query - Query value
   * @param {*} data - Structured data
   * @param {*} key - Key of the compared property
   * @param {number} threshold - Minimum `res` value to keep
   * @param {object} [opt] - Lookup options
   * @returns {*} Whatever the StructuredData lookup returns
   */
  structuredMatch(query, data, key, threshold, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.match(q, items, threshold, options),
      query,
      { ...opt, sort: 'desc' }
    );
  }

  /**
   * Structured-data lookup backed by `closest`, with `sort: 'desc'` enforced.
   *
   * @param {*} query - Query value
   * @param {*} data - Structured data
   * @param {*} key - Key of the compared property
   * @param {number} [n=1] - Number of entries to return
   * @param {object} [opt] - Lookup options
   * @returns {*} Whatever the StructuredData lookup returns
   */
  structuredClosest(query, data, key, n = 1, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.closest(q, items, n, options),
      query,
      { ...opt, sort: 'desc' }
    );
  }

  /**
   * Structured-data lookup backed by `furthest`, with `sort: 'asc'` enforced.
   *
   * @param {*} query - Query value
   * @param {*} data - Structured data
   * @param {*} key - Key of the compared property
   * @param {number} [n=1] - Number of entries to return
   * @param {object} [opt] - Lookup options
   * @returns {*} Whatever the StructuredData lookup returns
   */
  structuredFurthest(query, data, key, n = 1, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.furthest(q, items, n, options),
      query,
      { ...opt, sort: 'asc' }
    );
  }

  /**
   * Pairwise comparison between two structured data sets, backed by `pairs`.
   *
   * @param {*} data - First structured data set
   * @param {*} key - Key of the compared property in `data`
   * @param {*} other - Second structured data set
   * @param {*} otherKey - Key of the compared property in `other`
   * @param {object} [opt] - Lookup options
   * @returns {*} Whatever StructuredData.lookupPairs returns
   */
  structuredPairs(data, key, other, otherKey, opt) {
    return this.structured(data, key).lookupPairs(
      (items, otherItems, options) => this.pairs(items, otherItems, options),
      other,
      otherKey,
      opt
    );
  }
}

exports.CmpStr = CmpStr;