cmpstr
Version:
CmpStr is a lightweight, fast and well-performing package for calculating string similarity
162 lines (159 loc) • 5.03 kB
JavaScript
// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License
import { CmpStrValidationError, ErrorUtil } from './Errors.mjs';
import { Pool } from './Pool.mjs';
/**
 * Runs string-comparison lookups against one property of an array of objects
 * and maps the comparison results back onto the originating objects.
 *
 * The strings to compare are copied out of the objects into buffers obtained
 * from `Pool`; every buffer is handed back via `Pool.release` once the lookup
 * has finished, whether it succeeded or threw.
 */
class StructuredData {
  /** Array of source objects the lookups operate on. */
  data;
  /** Name of the property read from every object in `data`. */
  key;

  /**
   * Factory equivalent to `new StructuredData(data, key)`.
   *
   * @param {object[]} data - Source objects.
   * @param {string} key - Property to extract from each object.
   * @returns {StructuredData}
   */
  static create(data, key) {
    return new StructuredData(data, key);
  }

  /**
   * @param {object[]} data - Source objects.
   * @param {string} key - Property to extract from each object.
   */
  constructor(data, key) {
    this.data = data;
    this.key = key;
  }

  /**
   * Copies `arr[i][key]` of every element into a buffer acquired from `Pool`.
   * Non-string values are converted with `String()`; `null` / `undefined`
   * values become the empty string.
   *
   * The caller owns the returned buffer and must release it. If filling the
   * buffer throws (e.g. an element of `arr` is `null`), the buffer is released
   * here before the error is rethrown, so nothing leaks.
   *
   * @param {object[]} arr - Objects to read from.
   * @param {string} key - Property to read from each object.
   * @returns {string[]} Buffer holding one string per element of `arr`.
   * @throws {TypeError} If `arr` or one of its elements is `null`/`undefined`.
   */
  extractFrom(arr, key) {
    const result = Pool.acquire('string[]', arr.length);
    try {
      for (let i = 0; i < arr.length; i++) {
        const val = arr[i][key];
        result[i] = typeof val === 'string' ? val : String(val ?? '');
      }
    } catch (err) {
      // Hand the buffer back; the caller never receives it on this path.
      Pool.release('string[]', result, result.length);
      throw err;
    }
    return result;
  }

  /**
   * Extracts the strings of `this.data[*][this.key]`.
   * Kept as an arrow-function field so it stays bound to the instance.
   *
   * @returns {string[]} Buffer as returned by {@link StructuredData#extractFrom}.
   */
  extract = () => this.extractFrom(this.data, this.key);

  /**
   * Tests whether `v` is a non-null object carrying `a`, `b` and `res`.
   *
   * @param {*} v - Value to inspect.
   * @returns {boolean}
   */
  isMetricResult(v) {
    return (
      typeof v === 'object' && v !== null && 'a' in v && 'b' in v && 'res' in v
    );
  }

  /**
   * Tests whether `v` is a non-null object carrying `source`, `target` and
   * `match`.
   *
   * @param {*} v - Value to inspect.
   * @returns {boolean}
   */
  isCmpStrResult(v) {
    return (
      typeof v === 'object' &&
      v !== null &&
      'source' in v &&
      'target' in v &&
      'match' in v
    );
  }

  /**
   * Converts a result list into the `{ a, b, res, ... }` shape and tags every
   * entry with `__idx`, its position in the incoming list.
   *
   * Only the first entry is inspected to decide the format. Entries in the
   * `{ source, target, match }` shape are converted and get
   * `metric: 'unknown'`. The returned entries are shallow copies.
   *
   * @param {*} results - Result list produced by a comparison function.
   * @returns {object[]} Normalized copies; `[]` when `results` is not an
   *   array or is empty.
   * @throws {CmpStrValidationError} If the first entry has neither shape.
   */
  normalizeResults(results) {
    if (!Array.isArray(results) || results.length === 0) return [];
    const first = results[0];
    let normalized = [];
    if (this.isMetricResult(first)) normalized = results;
    else if (this.isCmpStrResult(first))
      normalized = results.map((r) => ({
        metric: 'unknown',
        a: r.source,
        b: r.target,
        res: r.match,
        raw: r.raw
      }));
    else
      throw new CmpStrValidationError(
        'Unsupported result format for StructuredData normalization.'
      );
    // Record the pre-sort position; rebuild() falls back to it.
    return normalized.map((r, idx) => ({ ...r, __idx: idx }));
  }

  /**
   * Maps normalized results back to entries of `sourceData`.
   *
   * Each result is matched through its `b` string: the n-th result with a
   * given `b` is assigned the n-th index at which that string occurs in
   * `extractedStrings` (wrapping around when there are more results than
   * occurrences). When the string does not occur at all, the result's
   * `__idx` (or, failing that, its position in `results`) is used instead.
   * Results whose index falls outside `sourceData` are dropped.
   *
   * NOTE(review): matching keys on `result.b`, while the pair lookups pass
   * the strings of `this.data` (the `a` side) as `extractedStrings` — confirm
   * that this is the intended pairing.
   *
   * @param {object[]} results - Normalized results (`a`, `b`, `res`, `raw?`,
   *   `__idx?`).
   * @param {object[]} sourceData - Objects to map the results onto.
   * @param {string[]} extractedStrings - Strings extracted from `sourceData`,
   *   index-aligned with it.
   * @param {boolean} [removeZero] - Drop results whose `res` is exactly `0`.
   * @param {boolean} [objectsOnly] - Return the bare source objects instead
   *   of structured entries.
   * @returns {object[]} Either the source objects, or entries of the form
   *   `{ obj, key, result: { source, target, match }, raw? }` (`raw` only
   *   when the result carries a truthy `raw`).
   */
  rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
    // string -> every index at which it occurs in extractedStrings
    const stringToIndices = new Map();
    for (let i = 0; i < extractedStrings.length; i++) {
      const str = extractedStrings[i];
      if (!stringToIndices.has(str)) stringToIndices.set(str, []);
      stringToIndices.get(str).push(i);
    }
    // Pre-sized to the maximum; trimmed to the number of kept entries below.
    const output = new Array(results.length);
    // string -> how many results with that target have been mapped so far
    const occurrenceCount = new Map();
    let out = 0;
    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      if (removeZero && result.res === 0) continue;
      const targetStr = result.b || '';
      const indices = stringToIndices.get(targetStr);
      let dataIndex;
      if (indices && indices.length > 0) {
        // Duplicate strings: hand out their indices one after another.
        const occurrence = occurrenceCount.get(targetStr) ?? 0;
        occurrenceCount.set(targetStr, occurrence + 1);
        dataIndex = indices[occurrence % indices.length];
      } else {
        dataIndex = result.__idx ?? i;
      }
      if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
      const sourceObj = sourceData[dataIndex];
      const mappedTarget = extractedStrings[dataIndex] || targetStr;
      if (objectsOnly) output[out++] = sourceObj;
      else
        output[out++] = {
          obj: sourceObj,
          key: this.key,
          result: { source: result.a, target: mappedTarget, match: result.res },
          ...(result.raw ? { raw: result.raw } : null)
        };
    }
    output.length = out;
    return output;
  }

  /**
   * Sorts results by `res`, in place.
   *
   * @param {object[]} results - Normalized results.
   * @param {*} [sort] - `'asc'` sorts ascending, any other truthy value
   *   sorts descending, a falsy value leaves the order untouched.
   * @returns {object[]} The same array instance that was passed in.
   */
  sort(results, sort) {
    if (!sort || results.length <= 1) return results;
    const asc = sort === 'asc';
    return results.sort((a, b) => (asc ? a.res - b.res : b.res - a.res));
  }

  /**
   * Normalizes, sorts and rebuilds raw comparison results against
   * `this.data`.
   *
   * @param {*} results - Raw results from the comparison function.
   * @param {string[]} extractedStrings - Strings extracted from `this.data`.
   * @param {object} [opt] - Reads `sort`, `removeZero` and `objectsOnly`.
   * @returns {object[]} See {@link StructuredData#rebuild}.
   * @throws {CmpStrValidationError} If the result format is unsupported.
   */
  finalizeLookup(results, extractedStrings, opt) {
    return this.rebuild(
      this.sort(this.normalizeResults(results), opt?.sort),
      this.data,
      extractedStrings,
      opt?.removeZero,
      opt?.objectsOnly
    );
  }

  /**
   * Calls `fn` and finalizes its results inside `ErrorUtil.wrap`, passing the
   * message 'StructuredData lookup failed' and `{ key }` as context.
   * (How `ErrorUtil.wrap` treats failures is defined in `./Errors.mjs`.)
   *
   * @param {Function} fn - Zero-argument function returning raw results.
   * @param {string[]} extractedStrings - Strings extracted from `this.data`.
   * @param {object} [opt] - Lookup options.
   * @returns {object[]} See {@link StructuredData#rebuild}.
   */
  performLookup(fn, extractedStrings, opt) {
    return ErrorUtil.wrap(
      () => this.finalizeLookup(fn(), extractedStrings, opt),
      'StructuredData lookup failed',
      { key: this.key }
    );
  }

  /**
   * Async counterpart of {@link StructuredData#performLookup}: awaits `fn()`
   * inside `ErrorUtil.wrapAsync` before finalizing.
   *
   * @param {Function} fn - Zero-argument function returning raw results or a
   *   promise of them.
   * @param {string[]} extractedStrings - Strings extracted from `this.data`.
   * @param {object} [opt] - Lookup options.
   * @returns {Promise<object[]>} See {@link StructuredData#rebuild}.
   */
  async performLookupAsync(fn, extractedStrings, opt) {
    return await ErrorUtil.wrapAsync(
      async () => this.finalizeLookup(await fn(), extractedStrings, opt),
      'StructuredData async lookup failed',
      { key: this.key }
    );
  }

  /**
   * Compares `query` against the extracted strings of `this.data`.
   *
   * @param {Function} fn - Called as `fn(query, strings, opt)`; returns raw
   *   results.
   * @param {*} query - Forwarded to `fn` unchanged.
   * @param {object} [opt] - Forwarded to `fn`; `sort`, `removeZero` and
   *   `objectsOnly` are also applied to the output.
   * @returns {object[]} See {@link StructuredData#rebuild}.
   */
  lookup(fn, query, opt) {
    const b = this.extract();
    try {
      return this.performLookup(() => fn(query, b, opt), b, opt);
    } finally {
      Pool.release('string[]', b, b.length);
    }
  }

  /**
   * Async counterpart of {@link StructuredData#lookup}; `fn` may return a
   * promise. The buffer is released only after the lookup has settled.
   *
   * @param {Function} fn - Called as `fn(query, strings, opt)`.
   * @param {*} query - Forwarded to `fn` unchanged.
   * @param {object} [opt] - Lookup options.
   * @returns {Promise<object[]>} See {@link StructuredData#rebuild}.
   */
  async lookupAsync(fn, query, opt) {
    const b = this.extract();
    try {
      return await this.performLookupAsync(() => fn(query, b, opt), b, opt);
    } finally {
      Pool.release('string[]', b, b.length);
    }
  }

  /**
   * Compares the extracted strings of `this.data` with those of `other`.
   *
   * @param {Function} fn - Called as `fn(a, b, opt)` with the strings of
   *   `this.data` and of `other`; returns raw results.
   * @param {object[]} other - Second list of objects.
   * @param {string} otherKey - Property to extract from `other`.
   * @param {object} [opt] - Lookup options.
   * @returns {object[]} See {@link StructuredData#rebuild}; results are
   *   mapped onto `this.data`.
   * @throws {TypeError} If `other` or one of its elements is
   *   `null`/`undefined`.
   */
  lookupPairs(fn, other, otherKey, opt) {
    const a = this.extract();
    let b;
    try {
      // Extracted inside the try so `a` is released even if this throws.
      b = this.extractFrom(other, otherKey);
      return this.performLookup(() => fn(a, b, opt), a, opt);
    } finally {
      Pool.release('string[]', a, a.length);
      if (b !== undefined) Pool.release('string[]', b, b.length);
    }
  }

  /**
   * Async counterpart of {@link StructuredData#lookupPairs}; `fn` may return
   * a promise. Buffers are released only after the lookup has settled.
   *
   * @param {Function} fn - Called as `fn(a, b, opt)`.
   * @param {object[]} other - Second list of objects.
   * @param {string} otherKey - Property to extract from `other`.
   * @param {object} [opt] - Lookup options.
   * @returns {Promise<object[]>} See {@link StructuredData#rebuild}.
   */
  async lookupPairsAsync(fn, other, otherKey, opt) {
    const a = this.extract();
    let b;
    try {
      // Extracted inside the try so `a` is released even if this throws.
      b = this.extractFrom(other, otherKey);
      return await this.performLookupAsync(() => fn(a, b, opt), a, opt);
    } finally {
      Pool.release('string[]', a, a.length);
      if (b !== undefined) Pool.release('string[]', b, b.length);
    }
  }
}
// Expose StructuredData as a named export of this module.
export { StructuredData };