UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.

162 lines (159 loc) 5.03 kB
// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License
import { CmpStrValidationError, ErrorUtil } from './Errors.mjs';
import { Pool } from './Pool.mjs';

/**
 * Runs string comparisons against a single property of an array of objects
 * and maps the comparison results back onto the objects they came from.
 *
 * String buffers are obtained through `Pool.acquire('string[]', …)` and
 * handed back with `Pool.release('string[]', …)`. `Pool` lives in
 * ./Pool.mjs and is not visible here, so its exact reuse semantics are
 * assumed rather than documented.
 */
class StructuredData {
  /** Source records the lookups operate on. */
  data;

  /** Property name read from every record in `data`. */
  key;

  /**
   * Factory equivalent of `new StructuredData(data, key)`.
   *
   * @param {object[]} data - Source records.
   * @param {string} key - Property to compare on.
   * @returns {StructuredData}
   */
  static create(data, key) {
    return new StructuredData(data, key);
  }

  /**
   * @param {object[]} data - Source records.
   * @param {string} key - Property to compare on.
   */
  constructor(data, key) {
    this.data = data;
    this.key = key;
  }

  /**
   * Reads `key` from every entry of `arr` into a pooled string array.
   * Strings are taken as-is, `null`/`undefined` become `''`, and every other
   * value is converted with `String()`.
   *
   * @param {object[]} arr - Records to read from.
   * @param {string} key - Property to read.
   * @returns {string[]} Buffer acquired from `Pool`; the caller releases it.
   * @throws {TypeError} If an entry of `arr` is `null` or `undefined`.
   */
  extractFrom(arr, key) {
    const result = Pool.acquire('string[]', arr.length);

    try {
      for (let i = 0; i < arr.length; i++) {
        const val = arr[i][key];
        result[i] = typeof val === 'string' ? val : String(val ?? '');
      }
    } catch (err) {
      // The caller never receives the buffer on failure, so hand it back
      // here; otherwise it would stay checked out of the pool.
      Pool.release('string[]', result, result.length);
      throw err;
    }

    return result;
  }

  /**
   * Extracts the configured `key` from the instance's own `data`.
   *
   * @returns {string[]} Pooled buffer, see `extractFrom`.
   */
  extract = () => this.extractFrom(this.data, this.key);

  /**
   * @param {*} v - Value to test.
   * @returns {boolean} True if `v` is a non-null object with `a`, `b` and `res`.
   */
  isMetricResult(v) {
    return (
      typeof v === 'object' &&
      v !== null &&
      'a' in v &&
      'b' in v &&
      'res' in v
    );
  }

  /**
   * @param {*} v - Value to test.
   * @returns {boolean} True if `v` is a non-null object with `source`,
   *   `target` and `match`.
   */
  isCmpStrResult(v) {
    return (
      typeof v === 'object' &&
      v !== null &&
      'source' in v &&
      'target' in v &&
      'match' in v
    );
  }

  /**
   * Brings a result list into the `{ a, b, res }` shape and tags every entry
   * with `__idx`, its position in the incoming list. Only the first element
   * is inspected to decide which input format is in use.
   *
   * @param {object[]} results - Metric-style or CmpStr-style results.
   * @returns {object[]} New array of shallow copies; `[]` when `results` is
   *   not an array or is empty.
   * @throws {CmpStrValidationError} If the first element matches neither format.
   */
  normalizeResults(results) {
    if (!Array.isArray(results) || results.length === 0) return [];

    const first = results[0];
    let normalized = [];

    if (this.isMetricResult(first)) {
      normalized = results;
    } else if (this.isCmpStrResult(first)) {
      normalized = results.map((r) => ({
        metric: 'unknown',
        a: r.source,
        b: r.target,
        res: r.match,
        raw: r.raw
      }));
    } else {
      throw new CmpStrValidationError(
        'Unsupported result format for StructuredData normalization.'
      );
    }

    return normalized.map((r, idx) => ({ ...r, __idx: idx }));
  }

  /**
   * Maps normalized results back to their source objects.
   *
   * A result is matched to a record through its `b` string: all positions of
   * that string in `extractedStrings` are collected, and repeated results
   * for the same string cycle through those positions in order. If the
   * string is not found, the result's `__idx` (or its loop position) is used
   * instead. Results whose index falls outside `sourceData` are dropped.
   *
   * @param {object[]} results - Normalized results (`a`, `b`, `res`, …).
   * @param {object[]} sourceData - Records to map back to.
   * @param {string[]} extractedStrings - Strings extracted from `sourceData`.
   * @param {boolean} [removeZero] - Skip results whose `res` is exactly `0`.
   * @param {boolean} [objectsOnly] - Return the bare source objects.
   * @returns {object[]} Source objects, or `{ obj, key, result, raw? }` wrappers.
   */
  rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
    // Index every extracted string by all positions at which it occurs.
    const stringToIndices = new Map();
    for (let i = 0; i < extractedStrings.length; i++) {
      const str = extractedStrings[i];
      if (!stringToIndices.has(str)) stringToIndices.set(str, []);
      stringToIndices.get(str).push(i);
    }

    // Pre-sized to the upper bound; truncated to the real count at the end.
    const output = new Array(results.length);
    const occurrenceCount = new Map();
    let out = 0;

    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      if (removeZero && result.res === 0) continue;

      const targetStr = result.b || '';
      const indices = stringToIndices.get(targetStr);
      let dataIndex;

      if (indices && indices.length > 0) {
        // Duplicate strings: the n-th result for a string takes its n-th
        // occurrence, wrapping around once every occurrence has been used.
        const occurrence = occurrenceCount.get(targetStr) ?? 0;
        occurrenceCount.set(targetStr, occurrence + 1);
        dataIndex = indices[occurrence % indices.length];
      } else {
        dataIndex = result.__idx ?? i;
      }

      if (dataIndex < 0 || dataIndex >= sourceData.length) continue;

      const sourceObj = sourceData[dataIndex];
      const mappedTarget = extractedStrings[dataIndex] || targetStr;

      if (objectsOnly) {
        output[out++] = sourceObj;
      } else {
        output[out++] = {
          obj: sourceObj,
          key: this.key,
          result: { source: result.a, target: mappedTarget, match: result.res },
          ...(result.raw ? { raw: result.raw } : null)
        };
      }
    }

    output.length = out;
    return output;
  }

  /**
   * Sorts `results` in place by `res`.
   *
   * @param {object[]} results - Normalized results.
   * @param {*} [sort] - `'asc'` sorts ascending, any other truthy value
   *   descending; a falsy value leaves the order untouched.
   * @returns {object[]} The same array instance that was passed in.
   */
  sort(results, sort) {
    if (!sort || results.length <= 1) return results;

    const asc = sort === 'asc';
    return results.sort((a, b) => (asc ? a.res - b.res : b.res - a.res));
  }

  /**
   * Normalizes, optionally sorts, and rebuilds raw comparison results
   * against this instance's `data`.
   *
   * @param {object[]} results - Raw results of the comparison function.
   * @param {string[]} extractedStrings - Strings the comparison ran on.
   * @param {object} [opt] - Reads `sort`, `removeZero` and `objectsOnly`.
   * @returns {object[]} See `rebuild`.
   */
  finalizeLookup(results, extractedStrings, opt) {
    return this.rebuild(
      this.sort(this.normalizeResults(results), opt?.sort),
      this.data,
      extractedStrings,
      opt?.removeZero,
      opt?.objectsOnly
    );
  }

  /**
   * Runs `fn` and finalizes its results inside `ErrorUtil.wrap`.
   * NOTE(review): `ErrorUtil.wrap` is defined in ./Errors.mjs and not visible
   * here; presumably it rethrows failures with the given message and
   * metadata — confirm.
   *
   * @param {Function} fn - Zero-argument function returning raw results.
   * @param {string[]} extractedStrings - Strings the comparison ran on.
   * @param {object} [opt] - Lookup options, see `finalizeLookup`.
   * @returns {object[]} See `rebuild`.
   */
  performLookup(fn, extractedStrings, opt) {
    return ErrorUtil.wrap(
      () => this.finalizeLookup(fn(), extractedStrings, opt),
      'StructuredData lookup failed',
      { key: this.key }
    );
  }

  /**
   * Asynchronous counterpart of `performLookup`; awaits `fn()` before
   * finalizing and goes through `ErrorUtil.wrapAsync`.
   *
   * @param {Function} fn - Zero-argument function resolving to raw results.
   * @param {string[]} extractedStrings - Strings the comparison ran on.
   * @param {object} [opt] - Lookup options, see `finalizeLookup`.
   * @returns {Promise<object[]>} See `rebuild`.
   */
  async performLookupAsync(fn, extractedStrings, opt) {
    return await ErrorUtil.wrapAsync(
      async () => this.finalizeLookup(await fn(), extractedStrings, opt),
      'StructuredData async lookup failed',
      { key: this.key }
    );
  }

  /**
   * Compares `query` against the extracted strings of `data`.
   *
   * @param {Function} fn - Called as `fn(query, strings, opt)`.
   * @param {*} query - Passed through to `fn` unchanged.
   * @param {object} [opt] - Passed to `fn` and to `finalizeLookup`.
   * @returns {object[]} See `rebuild`.
   */
  lookup(fn, query, opt) {
    const b = this.extract();

    try {
      return this.performLookup(() => fn(query, b, opt), b, opt);
    } finally {
      Pool.release('string[]', b, b.length);
    }
  }

  /**
   * Asynchronous counterpart of `lookup`.
   *
   * @param {Function} fn - Called as `fn(query, strings, opt)`; may return a promise.
   * @param {*} query - Passed through to `fn` unchanged.
   * @param {object} [opt] - Passed to `fn` and to `finalizeLookup`.
   * @returns {Promise<object[]>} See `rebuild`.
   */
  async lookupAsync(fn, query, opt) {
    const b = this.extract();

    try {
      return await this.performLookupAsync(() => fn(query, b, opt), b, opt);
    } finally {
      Pool.release('string[]', b, b.length);
    }
  }

  /**
   * Compares the extracted strings of `data` with those of `other`.
   * The strings of `data` (not of `other`) are what `rebuild` receives as
   * its extracted strings.
   *
   * @param {Function} fn - Called as `fn(ownStrings, otherStrings, opt)`.
   * @param {object[]} other - Second record list.
   * @param {string} otherKey - Property read from each entry of `other`.
   * @param {object} [opt] - Passed to `fn` and to `finalizeLookup`.
   * @returns {object[]} See `rebuild`.
   */
  lookupPairs(fn, other, otherKey, opt) {
    const a = this.extract();
    let b;

    try {
      // Extracted inside the try so that `a` is still released when reading
      // `other` throws.
      b = this.extractFrom(other, otherKey);
      return this.performLookup(() => fn(a, b, opt), a, opt);
    } finally {
      Pool.release('string[]', a, a.length);
      if (b !== undefined) Pool.release('string[]', b, b.length);
    }
  }

  /**
   * Asynchronous counterpart of `lookupPairs`.
   *
   * @param {Function} fn - Called as `fn(ownStrings, otherStrings, opt)`;
   *   may return a promise.
   * @param {object[]} other - Second record list.
   * @param {string} otherKey - Property read from each entry of `other`.
   * @param {object} [opt] - Passed to `fn` and to `finalizeLookup`.
   * @returns {Promise<object[]>} See `rebuild`.
   */
  async lookupPairsAsync(fn, other, otherKey, opt) {
    const a = this.extract();
    let b;

    try {
      // Extracted inside the try so that `a` is still released when reading
      // `other` throws.
      b = this.extractFrom(other, otherKey);
      return await this.performLookupAsync(() => fn(a, b, opt), a, opt);
    } finally {
      Pool.release('string[]', a, a.length);
      if (b !== undefined) Pool.release('string[]', b, b.length);
    }
  }
}

export { StructuredData };