// cmpstr
// Version: 3.2.2
// CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.
// 311 lines (308 loc) • 9.74 kB
// JavaScript
// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License
import { merge, set, rmv, get } from './utils/DeepMerge.mjs';
import { DiffChecker } from './utils/DiffChecker.mjs';
import {
CmpStrInternalError,
CmpStrNotFoundError,
ErrorUtil
} from './utils/Errors.mjs';
import { Filter } from './utils/Filter.mjs';
import { Normalizer } from './utils/Normalizer.mjs';
import { Profiler } from './utils/Profiler.mjs';
import { factory } from './utils/Registry.mjs';
import { StructuredData } from './utils/StructuredData.mjs';
import { TextAnalyzer } from './utils/TextAnalyzer.mjs';
import './metric/Cosine.mjs';
import './metric/DamerauLevenshtein.mjs';
import './metric/DiceSorensen.mjs';
import './metric/Hamming.mjs';
import './metric/Jaccard.mjs';
import './metric/JaroWinkler.mjs';
import './metric/LCS.mjs';
import './metric/Levenshtein.mjs';
import './metric/NeedlemanWunsch.mjs';
import './metric/QGram.mjs';
import './metric/SmithWaterman.mjs';
import { MetricRegistry, Metric } from './metric/Metric.mjs';
import './phonetic/Caverphone.mjs';
import './phonetic/Cologne.mjs';
import './phonetic/Metaphone.mjs';
import './phonetic/Soundex.mjs';
import {
PhoneticMappingRegistry,
PhoneticRegistry,
Phonetic
} from './phonetic/Phonetic.mjs';
// Profiler instance obtained once when this module is evaluated; its
// `services` member is re-exported below as the static `CmpStr.profiler`.
const profiler = Profiler.getInstance();
/**
 * Facade that ties normalization, filtering, phonetic indexing and metric
 * computation together behind one configurable instance.
 *
 * Instance-level options live in `this.options`. Most public methods also
 * accept per-call options, which are merged over the instance options for the
 * duration of that call only.
 */
class CmpStr {
  /**
   * Recursively copies arrays and plain objects (prototype `null` or
   * `Object.prototype`). Every other value — primitives, functions, class
   * instances — is returned as-is. Used to keep option trees of different
   * instances / calls from sharing nested objects.
   *
   * Cyclic structures are not supported (options are expected to be
   * JSON-serializable, see `getSerializedOptions`).
   *
   * @param {*} value - Value to copy
   * @returns {*} Structurally independent copy of `value`
   */
  static #copyPlain(value) {
    if (Array.isArray(value)) return value.map((item) => CmpStr.#copyPlain(item));
    if (value === null || typeof value !== 'object') return value;
    const proto = Object.getPrototypeOf(value);
    // Anything that is not a plain record is kept by reference.
    if (proto !== null && proto !== Object.prototype) return value;
    const copy = Object.create(proto);
    for (const key of Object.keys(value)) {
      // Assigning `__proto__` would rewrite the prototype instead of adding a key.
      if (key === '__proto__') continue;
      copy[key] = CmpStr.#copyPlain(value[key]);
    }
    return copy;
  }

  /** Direct references to the management functions of `Filter`. */
  static filter = {
    has: Filter.has,
    add: Filter.add,
    remove: Filter.remove,
    pause: Filter.pause,
    resume: Filter.resume,
    list: Filter.list,
    clear: Filter.clear
  };

  /** Direct references to the management functions of `MetricRegistry`. */
  static metric = {
    add: MetricRegistry.add,
    remove: MetricRegistry.remove,
    has: MetricRegistry.has,
    list: MetricRegistry.list
  };

  /**
   * Direct references to the management functions of `PhoneticRegistry`;
   * `map` exposes the same four operations of `PhoneticMappingRegistry`.
   */
  static phonetic = {
    add: PhoneticRegistry.add,
    remove: PhoneticRegistry.remove,
    has: PhoneticRegistry.has,
    list: PhoneticRegistry.list,
    map: {
      add: PhoneticMappingRegistry.add,
      remove: PhoneticMappingRegistry.remove,
      has: PhoneticMappingRegistry.has,
      list: PhoneticMappingRegistry.list
    }
  };

  /** The `services` member of the module-level profiler instance. */
  static profiler = profiler.services;

  /** Cache-clearing entry points of the normalizer, filter pipeline, metrics and phonetics. */
  static clearCache = {
    normalizer: Normalizer.clear,
    filter: Filter.clearPipeline,
    metric: Metric.clear,
    phonetic: Phonetic.clear
  };

  /**
   * Creates a `TextAnalyzer` for the given input.
   * @param {*} input - Passed through to the `TextAnalyzer` constructor
   */
  static analyze = (input) => new TextAnalyzer(input);

  /**
   * Creates a `DiffChecker` for two inputs.
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {*} [opt] - Passed through to the `DiffChecker` constructor
   */
  static diff = (a, b, opt) => new DiffChecker(a, b, opt);

  /**
   * Factory shortcut for `new CmpStr(opt)`.
   * @param {string|object} [opt] - Options object or serialized (JSON) options
   * @returns {CmpStr}
   */
  static create(opt) {
    return new CmpStr(opt);
  }

  /** Instance options; starts out as an empty prototype-less object. */
  options = Object.create(null);

  /**
   * @param {string|object} [opt] - A string is treated as serialized (JSON)
   *   options, any other truthy value as an options object.
   */
  constructor(opt) {
    if (!opt) return;
    if (typeof opt === 'string') this.setSerializedOptions(opt);
    else this.setOptions(opt);
  }

  /**
   * Verifies that a named metric or phonetic algorithm is registered.
   *
   * @param {string} cond - Either `'metric'` or `'phonetic'`
   * @param {string} test - Name looked up via `CmpStr.metric.has` / `CmpStr.phonetic.has`
   * @throws {CmpStrNotFoundError} If `test` is not registered
   * @throws {CmpStrInternalError} If `cond` is neither `'metric'` nor `'phonetic'`
   */
  assert(cond, test) {
    switch (cond) {
      case 'metric':
        if (!CmpStr.metric.has(test))
          throw new CmpStrNotFoundError(
            `CmpStr <metric> must be set, call .setMetric(), ` +
              `use CmpStr.metric.list() for available metrics`,
            { metric: test }
          );
        break;
      case 'phonetic':
        if (!CmpStr.phonetic.has(test))
          throw new CmpStrNotFoundError(
            `CmpStr <phonetic> must be set, call .setPhonetic(), ` +
              `use CmpStr.phonetic.list() for available phonetic algorithms`,
            { phonetic: test }
          );
        break;
      default:
        throw new CmpStrInternalError(`Cmpstr condition <${cond}> unknown`);
    }
  }

  /**
   * Runs `assert` for several conditions.
   * @param {...[string, string]} cond - `[cond, test]` pairs
   */
  assertMany(...cond) {
    for (const [c, test] of cond) this.assert(c, test);
  }

  /**
   * Builds the effective options for a single call: the instance options with
   * `opt` merged on top.
   *
   * Nested plain objects/arrays of the instance options are copied first, so
   * that a merge which recurses into nested targets cannot write per-call
   * overrides back into `this.options`.
   * NOTE(review): `merge` lives in ./utils/DeepMerge.mjs and is not visible
   * here; the copy is defensive and harmless if `merge` never mutates nested
   * targets.
   *
   * @param {object} [opt] - Per-call options
   * @returns {object} Whatever `merge` returns for the copied base and `opt`
   */
  resolveOptions(opt) {
    const base = CmpStr.#copyPlain(this.options ?? Object.create(null));
    return merge({ ...base }, opt);
  }

  /**
   * Normalizes input through `Normalizer.normalize`.
   * @param {*} input - Value to normalize
   * @param {string} [flags] - Falls back to `this.options.flags`, then `''`
   */
  normalize(input, flags) {
    return Normalizer.normalize(input, flags ?? this.options.flags ?? '');
  }

  /**
   * Applies the filters registered for `hook` via `Filter.apply`.
   * @param {*} input - Value to filter
   * @param {string} hook - Hook name
   */
  filter(input, hook) {
    return Filter.apply(hook, input);
  }

  /**
   * Prepares input for comparison: normalization (only when `flags` is
   * non-empty), the `'input'` filter hook, then phonetic indexing (only when
   * `processors.phonetic` is set).
   *
   * @param {*} input - Raw input
   * @param {object} [opt] - Options to read `flags` / `processors` from;
   *   defaults to the instance options
   * @returns {*} Prepared input
   */
  prepare(input, opt) {
    const { flags, processors } = opt ?? this.options;
    if (flags?.length) input = this.normalize(input, flags);
    input = this.filter(input, 'input');
    if (processors?.phonetic) input = this.index(input, processors.phonetic);
    return input;
  }

  /**
   * Post-processes a metric result. With `opt.removeZero` and an array
   * result, only entries with `res > 0` are kept.
   *
   * @param {*} result - Metric result (single object or array)
   * @param {object} [opt] - Resolved options
   */
  postProcess(result, opt) {
    if (opt?.removeZero && Array.isArray(result))
      result = result.filter((r) => r.res > 0);
    return result;
  }

  /**
   * Computes the phonetic index of a string or of each string in an array.
   * The parts returned by `getIndex` are joined with `opt.delimiter`
   * (default: a single space).
   *
   * @param {string|string[]} input - Input to index
   * @param {{ algo: string, opt?: object }} param1 - Algorithm name and its options
   * @throws {CmpStrNotFoundError} If `algo` is not a registered phonetic algorithm
   */
  index(input, { algo, opt }) {
    this.assert('phonetic', algo);
    const phonetic = factory['phonetic'](algo, opt);
    const delimiter = opt?.delimiter ?? ' ';
    return Array.isArray(input)
      ? input.map((s) => phonetic.getIndex(s).join(delimiter))
      : phonetic.getIndex(input).join(delimiter);
  }

  /**
   * Wraps data via `StructuredData.create`.
   * @param {*} data - Structured data
   * @param {*} key - Key passed through to `StructuredData.create`
   */
  structured(data, key) {
    return StructuredData.create(data, key);
  }

  /**
   * Core routine: resolves options, prepares both inputs, runs the metric and
   * formats the result. Everything runs inside `ErrorUtil.wrap`.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {object} [opt] - Per-call options
   * @param {string} [mode] - Passed to the metric's `run`
   * @param {boolean} [raw] - Overrides the resolved `raw` option
   * @param {boolean} [skip] - When truthy, inputs are used as given (no `prepare`)
   * @returns {*} Formatted result; `[]` when `safeEmpty` is set and either
   *   prepared input is an empty array or an empty string
   */
  compute(a, b, opt, mode, raw, skip) {
    return ErrorUtil.wrap(
      () => {
        const resolved = this.resolveOptions(opt);
        this.assert('metric', resolved.metric);
        const A = skip ? a : this.prepare(a, resolved);
        const B = skip ? b : this.prepare(b, resolved);
        if (
          resolved.safeEmpty &&
          ((Array.isArray(A) && A.length === 0) ||
            (Array.isArray(B) && B.length === 0) ||
            A === '' ||
            B === '')
        ) {
          return [];
        }
        const metric = factory['metric'](resolved.metric, A, B, resolved.opt);
        // Unless output is 'prep', hand the unprepared inputs to the metric.
        if (resolved.output !== 'prep') metric.setOriginal(a, b);
        metric.run(mode);
        const result = this.postProcess(metric.getResults(), resolved);
        return this.output(result, raw ?? resolved.raw);
      },
      `Failed to compute metric <${opt?.metric ?? this.options.metric}> for the given inputs`,
      { a, b, options: opt }
    );
  }

  /**
   * Formats a metric result. In raw mode the result is returned untouched;
   * otherwise each `{ a, b, res }` becomes `{ source, target, match }`.
   *
   * @param {*} result - Single result or array of results
   * @param {boolean} [raw] - Falls back to `this.options.raw`
   */
  output(result, raw) {
    return ErrorUtil.wrap(
      () =>
        (raw ?? this.options.raw)
          ? result
          : Array.isArray(result)
            ? result.map((r) => ({ source: r.a, target: r.b, match: r.res }))
            : { source: result.a, target: result.b, match: result.res },
      `Failed to resolve output format for the metric result`,
      { result, raw }
    );
  }

  /**
   * Creates an independent copy of this instance.
   *
   * The arrow-function fields of this class (`setMetric`, `getOptions`, …)
   * capture `this` lexically, so copying them onto another object would leave
   * them operating on the original. The copy is therefore built by running
   * the `CmpStr` constructor (which re-creates those fields bound to the new
   * object) with the original's constructor as `new.target`, so the prototype
   * chain is preserved without invoking a subclass constructor.
   *
   * Remaining own enumerable properties are copied by reference; plain option
   * data is copied recursively so both instances can be changed independently.
   *
   * @returns {CmpStr} The copy
   */
  clone = () => {
    const copy = Reflect.construct(CmpStr, [], this.constructor);
    const own = Object.assign(Object.create(null), this);
    for (const key of Reflect.ownKeys(own)) {
      // Fields the constructor just created are already correctly bound.
      if (!Object.hasOwn(copy, key)) copy[key] = own[key];
    }
    copy.options = CmpStr.#copyPlain(this.options);
    return copy;
  };

  /**
   * Removes every enumerable key from the current options object.
   * @returns {this}
   */
  reset() {
    for (const k in this.options) delete this.options[k];
    return this;
  }

  /**
   * Replaces the instance options with `opt` (stored by reference). A nullish
   * `opt` yields an empty options object so later reads do not fail.
   *
   * @param {object} [opt] - New options
   * @returns {this}
   */
  setOptions(opt) {
    this.options = opt ?? Object.create(null);
    return this;
  }

  /**
   * Merges `opt` into the instance options via `merge`.
   * @param {object} opt - Options to merge
   * @returns {this}
   */
  mergeOptions(opt) {
    merge(this.options, opt);
    return this;
  }

  /**
   * Replaces the instance options with the result of `JSON.parse(opt)`.
   * Runs inside `ErrorUtil.wrap`.
   *
   * @param {string} opt - JSON string
   * @returns {this}
   */
  setSerializedOptions(opt) {
    return ErrorUtil.wrap(
      () => {
        this.options = JSON.parse(opt);
        return this;
      },
      `Failed to parse serialized options, invalid JSON string`,
      { opt }
    );
  }

  /**
   * Sets one option via `set`.
   * @param {string} path - Option path
   * @param {*} value - New value
   * @returns {this}
   */
  setOption(path, value) {
    set(this.options, path, value);
    return this;
  }

  /**
   * Removes one option via `rmv`.
   * @param {string} path - Option path
   * @returns {this}
   */
  rmvOption(path) {
    rmv(this.options, path);
    return this;
  }

  /** Sets the `raw` option. */
  setRaw = (enable) => this.setOption('raw', enable);
  /** Sets the `metric` option. */
  setMetric = (name) => this.setOption('metric', name);
  /** Sets the `flags` option. */
  setFlags = (flags) => this.setOption('flags', flags);
  /** Removes the `flags` option. */
  rmvFlags = () => this.rmvOption('flags');
  /** Sets the `processors` option. */
  setProcessors = (opt) => this.setOption('processors', opt);
  /** Removes the `processors` option. */
  rmvProcessors = () => this.rmvOption('processors');
  /** Returns the live options object (not a copy). */
  getOptions = () => this.options;
  /** Returns the options as a JSON string. */
  getSerializedOptions = () => JSON.stringify(this.options);
  /** Reads one option via `get`. */
  getOption = (path) => get(this.options, path);

  /**
   * Runs the metric in `'single'` mode and returns the formatted result.
   */
  test(a, b, opt) {
    return this.compute(a, b, opt, 'single');
  }

  /**
   * Runs the metric in `'single'` mode and returns only the raw result's `res`.
   */
  compare(a, b, opt) {
    return this.compute(a, b, opt, 'single', true).res;
  }

  /**
   * Runs the metric in `'batch'` mode and returns the formatted result.
   */
  batchTest(a, b, opt) {
    return this.compute(a, b, opt, 'batch');
  }

  /**
   * Runs the metric in `'batch'` mode and sorts the results by `res`.
   *
   * @param {*} a - First input
   * @param {*} b - Second input
   * @param {string} [dir='desc'] - `'asc'` sorts ascending, anything else descending
   * @param {object} [opt] - Per-call options
   */
  batchSorted(a, b, dir = 'desc', opt) {
    const results = this.compute(a, b, opt, 'batch', true);
    results.sort((x, y) => (dir === 'asc' ? x.res - y.res : y.res - x.res));
    return this.output(results, opt?.raw ?? this.options.raw);
  }

  /**
   * Runs the metric in `'pairwise'` mode and returns the formatted result.
   */
  pairs(a, b, opt) {
    return this.compute(a, b, opt, 'pairwise');
  }

  /**
   * Runs the metric in `'batch'` mode and keeps results with
   * `res >= threshold`, sorted by `res` descending.
   */
  match(a, b, threshold, opt) {
    const hits = this.compute(a, b, opt, 'batch', true)
      .filter((r) => r.res >= threshold)
      .sort((x, y) => y.res - x.res);
    return this.output(hits, opt?.raw ?? this.options.raw);
  }

  /**
   * First `n` entries of the descending `batchSorted` result.
   */
  closest(a, b, n = 1, opt) {
    return this.batchSorted(a, b, 'desc', opt).slice(0, n);
  }

  /**
   * First `n` entries of the ascending `batchSorted` result.
   */
  furthest(a, b, n = 1, opt) {
    return this.batchSorted(a, b, 'asc', opt).slice(0, n);
  }

  /**
   * Returns the original haystack entries whose prepared form contains the
   * prepared needle (`includes`).
   *
   * @param {string} needle - Search term
   * @param {string[]} haystack - Candidates
   * @param {string} [flags] - Normalization flags for this call
   * @param {object} [processors] - Processors for this call
   */
  search(needle, haystack, flags, processors) {
    const resolved = this.resolveOptions({ flags, processors });
    const test = this.prepare(needle, resolved);
    const hstk = this.prepare(haystack, resolved);
    return haystack.filter((_, i) => hstk[i].includes(test));
  }

  /**
   * Computes the matrix of `res` values of every input against every input.
   *
   * The inputs are prepared once up front, so each row is computed with
   * `skip` set. The per-call options are forwarded to `compute` (so a metric
   * given only in `opt` is honored), with `removeZero` switched off: dropping
   * zero results would remove cells and produce ragged rows.
   *
   * @param {string[]} input - Strings to compare with each other
   * @param {object} [opt] - Per-call options
   * @returns {number[][]} One row per input; a missing `res` is reported as `0`
   */
  matrix(input, opt) {
    const prepared = this.prepare(input, this.resolveOptions(opt));
    const rowOptions = { ...opt, removeZero: false };
    return prepared.map((a) =>
      this.compute(a, prepared, rowOptions, 'batch', true, true).map(
        (b) => b.res ?? 0
      )
    );
  }

  /**
   * Phonetic index of `input`. `algo` / `opt` fall back to the instance's
   * `processors.phonetic` settings.
   */
  phoneticIndex(input, algo, opt) {
    const { algo: a, opt: o } = this.options.processors?.phonetic ?? {};
    return this.index(input, { algo: algo ?? a, opt: opt ?? o });
  }

  /**
   * Structured-data lookup that uses `batchTest` as comparison callback.
   */
  structuredLookup(query, data, key, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.batchTest(q, items, options),
      query,
      opt
    );
  }

  /**
   * Structured-data lookup that uses `match` with `threshold`; passes
   * `sort: 'desc'` to the lookup.
   */
  structuredMatch(query, data, key, threshold, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.match(q, items, threshold, options),
      query,
      { ...opt, sort: 'desc' }
    );
  }

  /**
   * Structured-data lookup that uses `closest` with `n`; passes
   * `sort: 'desc'` to the lookup.
   */
  structuredClosest(query, data, key, n = 1, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.closest(q, items, n, options),
      query,
      { ...opt, sort: 'desc' }
    );
  }

  /**
   * Structured-data lookup that uses `furthest` with `n`; passes
   * `sort: 'asc'` to the lookup.
   */
  structuredFurthest(query, data, key, n = 1, opt) {
    return this.structured(data, key).lookup(
      (q, items, options) => this.furthest(q, items, n, options),
      query,
      { ...opt, sort: 'asc' }
    );
  }

  /**
   * Pairwise structured-data lookup (`lookupPairs`) that uses `pairs` as
   * comparison callback.
   */
  structuredPairs(data, key, other, otherKey, opt) {
    return this.structured(data, key).lookupPairs(
      (items, otherItems, options) => this.pairs(items, otherItems, options),
      other,
      otherKey,
      opt
    );
  }
}
export { CmpStr };