cmpstr
Version:
CmpStr is a lightweight, fast and well performing package for calculating string similarity
1,603 lines (1,588 loc) • 93.1 kB
JavaScript
/**
* CmpStr v3.2.2 build-bb61120-260311
* This is a lightweight, fast and well performing library for calculating string similarity.
* (c) 2023-2026 Paul Köhler @komed3 / MIT License
* Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr
*/
(function (global, factory) {
typeof exports === 'object' && typeof module !== 'undefined'
? factory(exports)
: typeof define === 'function' && define.amd
? define(['exports'], factory)
: ((global =
typeof globalThis !== 'undefined' ? globalThis : global || self),
factory((global.CmpStr = {})));
})(this, function (exports) {
'use strict';
/**
 * Base error class of the library.
 *
 * Carries a machine-readable `code`, optional `meta` data, an optional
 * `cause` and the ISO timestamp (`when`) at which the instance was created.
 */
class CmpStrError extends Error {
  code;
  meta;
  cause;
  when = new Date().toISOString();

  /**
   * @param {string} code - Error code stored on the instance
   * @param {string} message - Human-readable message passed to `Error`
   * @param {*} [meta] - Additional context stored as-is
   * @param {*} [cause] - Underlying error or value stored as-is
   */
  constructor(code, message, meta, cause) {
    super(message);
    // Use the concrete subclass name so derived errors identify themselves.
    Object.assign(this, { name: this.constructor.name, code, meta, cause });
    // captureStackTrace is not available in every engine, hence the guard.
    if (typeof Error.captureStackTrace === 'function') {
      Error.captureStackTrace(this, this.constructor);
    }
  }

  /**
   * Builds a plain-object representation of the error.
   * An `Error` cause is reduced to its name, message and stack; any other
   * cause is passed through unchanged.
   *
   * @returns {object} Object with name, code, message, meta, when and cause
   */
  toJSON() {
    const { name, code, message, meta, when, cause } = this;
    let serializedCause = cause;
    if (cause instanceof Error) {
      serializedCause = {
        name: cause.name,
        message: cause.message,
        stack: cause.stack
      };
    }
    return { name, code, message, meta, when, cause: serializedCause };
  }

  /**
   * Formats the error as `Name [code] - message[ - meta JSON]`.
   *
   * @param {boolean} [stack=false] - Append the stack trace when available
   * @returns {string} The formatted error text
   */
  toString(stack = false) {
    const segments = [`${this.name} [${this.code}]`, this.message];
    const hasMeta = this.meta && Object.keys(this.meta).length;
    if (hasMeta) {
      // Meta that cannot be serialized is left out of the text on purpose.
      try {
        segments.push(JSON.stringify(this.meta));
      } catch {}
    }
    const trace = stack && this.stack ? `\nStack Trace:\n${this.stack}` : '';
    return segments.join(' - ') + trace;
  }
}
/**
 * CmpStrError subclass whose error code is fixed to `E_VALIDATION`.
 * The constructor forwards message, meta and cause unchanged.
 */
class CmpStrValidationError extends CmpStrError {
constructor(message, meta, cause) {
super('E_VALIDATION', message, meta, cause);
}
}
/**
 * CmpStrError subclass whose error code is fixed to `E_NOT_FOUND`.
 * The constructor forwards message, meta and cause unchanged.
 */
class CmpStrNotFoundError extends CmpStrError {
constructor(message, meta, cause) {
super('E_NOT_FOUND', message, meta, cause);
}
}
/**
 * CmpStrError subclass whose error code is fixed to `E_USAGE`.
 * The constructor forwards message, meta and cause unchanged.
 */
class CmpStrUsageError extends CmpStrError {
constructor(message, meta, cause) {
super('E_USAGE', message, meta, cause);
}
}
/**
 * CmpStrError subclass whose error code is fixed to `E_INTERNAL`.
 * The constructor forwards message, meta and cause unchanged.
 */
class CmpStrInternalError extends CmpStrError {
constructor(message, meta, cause) {
super('E_INTERNAL', message, meta, cause);
}
}
/**
 * Static helpers for raising, normalizing and formatting library errors.
 */
class ErrorUtil {
  /**
   * Throws a CmpStrUsageError when the condition is falsy.
   *
   * @param {*} condition - Value that must be truthy
   * @param {string} message - Error message
   * @param {*} [meta] - Additional context for the error
   * @throws {CmpStrUsageError} If `condition` is falsy
   */
  static assert(condition, message, meta) {
    if (!condition) throw new CmpStrUsageError(message, meta);
  }

  /**
   * Always throws: rethrows `err` unchanged if it already is a CmpStrError,
   * otherwise throws a CmpStrInternalError with `err` as its cause.
   *
   * @param {*} err - The caught value
   * @param {string} message - Message for the wrapping error
   * @param {*} [meta] - Additional context for the wrapping error
   * @throws {CmpStrError}
   */
  static create(err, message, meta) {
    if (err instanceof CmpStrError) throw err;
    throw new CmpStrInternalError(message, meta, err);
  }

  /**
   * Formats any thrown value as a single line of text.
   *
   * @param {*} err - The value to format
   * @returns {string} `toString()` for CmpStrError, `name: message` for other
   *   errors, `String(err)` for everything else
   */
  static format(err) {
    if (err instanceof CmpStrError) return err.toString();
    if (err instanceof Error) return `${err.name}: ${err.message}`;
    return String(err);
  }

  /**
   * Runs `fn` and returns its result; failures are normalized like `create`.
   *
   * Errors that already are CmpStrError instances are rethrown untouched, so
   * nested wraps neither stack several internal errors on top of each other
   * nor hide a more specific error type behind `E_INTERNAL`.
   *
   * @param {Function} fn - Synchronous function to execute
   * @param {string} message - Message used when a foreign error is wrapped
   * @param {*} [meta] - Additional context for the wrapping error
   * @returns {*} The return value of `fn`
   * @throws {CmpStrError}
   */
  static wrap(fn, message, meta) {
    try {
      return fn();
    } catch (err) {
      if (err instanceof CmpStrError) throw err;
      throw new CmpStrInternalError(message, meta, err);
    }
  }

  /**
   * Asynchronous counterpart of `wrap`: awaits `fn` and normalizes rejections.
   *
   * @param {Function} fn - Function returning a value or a promise
   * @param {string} message - Message used when a foreign error is wrapped
   * @param {*} [meta] - Additional context for the wrapping error
   * @returns {Promise<*>} Resolves with the result of `fn`
   * @throws {CmpStrError}
   */
  static async wrapAsync(fn, message, meta) {
    try {
      return await fn();
    } catch (err) {
      if (err instanceof CmpStrError) throw err;
      throw new CmpStrInternalError(message, meta, err);
    }
  }
}
// Frozen, prototype-less namespace object bundling the error classes and ErrorUtil.
var Errors = /*#__PURE__*/ Object.freeze({
  __proto__: null,
  CmpStrError,
  CmpStrInternalError,
  CmpStrNotFoundError,
  CmpStrUsageError,
  CmpStrValidationError,
  ErrorUtil
});
// Matches numeric bracket accessors such as `[12]`.
const BRACKET_PATTERN = /\[(\d+)]/g;
// Memoizes parsed paths by their string form; the cached arrays are shared.
const PATH_CACHE = new Map();

/**
 * Splits a path such as `a.b[2].c` into its keys (`['a', 'b', 2, 'c']`).
 *
 * Bracket indices are rewritten to dot notation before splitting. Segments
 * that are canonical integers become numbers, everything else stays a string.
 * A path that starts with a bracket index (`[0].a`) no longer produces an
 * empty first key: the dot introduced by the rewrite is stripped again.
 *
 * @param {string} p - The path to parse
 * @returns {(string|number)[]} The parsed keys (cached, do not mutate)
 */
function parse(p) {
  const cached = PATH_CACHE.get(p);
  if (cached) return cached;
  const dotted = p.replace(BRACKET_PATTERN, '.$1');
  // Only strip a dot that was created by rewriting a leading `[n]`.
  const normalized =
    p.startsWith('[') && dotted.startsWith('.') ? dotted.slice(1) : dotted;
  const parsed = normalized.split('.').map((segment) => {
    const n = Number(segment);
    return Number.isInteger(n) && String(n) === segment ? n : segment;
  });
  PATH_CACHE.set(p, parsed);
  return parsed;
}
/**
 * Reads the value at `path` inside `t`.
 *
 * @param {*} t - The value to read from
 * @param {string} path - Path understood by `parse` (dots / bracket indices)
 * @param {*} [fb] - Fallback returned when the path cannot be resolved
 * @returns {*} The value found at the path, or `fb`
 */
function get(t, path, fb) {
  let o = t;
  for (const k of parse(path)) {
    // The `in` operator throws for primitives, so a path running through a
    // string or number is treated as "not found" instead of crashing.
    const traversable =
      o !== null && (typeof o === 'object' || typeof o === 'function');
    if (!traversable || !(k in o)) return fb;
    o = o[k];
  }
  return o;
}
/**
 * Checks whether every key of `path` can be resolved inside `t`.
 *
 * @param {*} t - The value to inspect
 * @param {string} path - Path understood by `parse` (dots / bracket indices)
 * @returns {boolean} `true` if the full path exists, `false` otherwise
 */
function has(t, path) {
  let o = t;
  for (const k of parse(path)) {
    // The `in` operator throws for primitives, so a path running through a
    // string or number simply does not exist.
    const traversable =
      o !== null && (typeof o === 'object' || typeof o === 'function');
    if (!traversable || !(k in o)) return false;
    o = o[k];
  }
  return true;
}
/**
 * Writes `value` at `path` inside `t`, creating missing containers on the way.
 *
 * Missing containers become arrays when the next key is numeric and
 * prototype-less objects otherwise. An empty path returns `value` itself.
 * Paths containing a `__proto__` segment are rejected: on ordinary objects
 * that segment resolves to the shared prototype, so writing through it would
 * modify every object (prototype pollution).
 *
 * @param {object|undefined} t - Target object/array; `undefined` creates one
 * @param {string} path - Path understood by `parse` (dots / bracket indices)
 * @param {*} value - The value to store
 * @returns {*} The (possibly newly created) root, or `value` for an empty path
 * @throws {CmpStrUsageError} If the target or an intermediate value is a
 *   non-object, or if the path contains `__proto__`
 */
function set(t, path, value) {
  if (path === '') return value;
  const keys = parse(path);
  if (t !== undefined && (typeof t !== 'object' || t === null))
    throw new CmpStrUsageError(
      `Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`,
      { path: keys[0], target: t }
    );
  if (keys.includes('__proto__'))
    throw new CmpStrUsageError(
      `Cannot set property <__proto__> / unsafe path segment`,
      { path: keys.slice() }
    );
  const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null));
  const last = keys.length - 1;
  let cur = root;
  for (let i = 0; i < last; i++) {
    const k = keys[i];
    let next = cur[k];
    if (next != null && typeof next !== 'object')
      throw new CmpStrUsageError(
        `Cannot set property <${keys[i + 1]}> of <${JSON.stringify(next)}>`,
        { path: keys.slice(0, i + 2), value: next }
      );
    // Create the missing container; its kind depends on the following key.
    if (next == null)
      next = cur[k] =
        typeof keys[i + 1] === 'number' ? [] : Object.create(null);
    cur = next;
  }
  cur[keys[last]] = value;
  return root;
}
/**
 * Recursively merges the own enumerable properties of `o` into `t`.
 *
 * Non-array objects are merged key by key; arrays and all other values
 * replace the existing value. The keys `__proto__` and `constructor` are
 * never copied. A `null` target or source is treated as an empty object
 * (previously only the target was tolerated, a `null` source crashed).
 *
 * @param {object|null} [t] - Target object, modified in place
 * @param {object|null} [o] - Source object
 * @param {boolean} [mergeUndefined=false] - Also copy `undefined` values
 * @returns {object} The target object
 */
function merge(
  t = Object.create(null),
  o = Object.create(null),
  mergeUndefined = false
) {
  const target = t ?? Object.create(null);
  const source = o ?? Object.create(null);
  const isBranch = (v) =>
    v !== null && typeof v === 'object' && !Array.isArray(v);
  for (const k of Object.keys(source)) {
    const val = source[k];
    if (!mergeUndefined && val === undefined) continue;
    // Never copy keys that could reach or replace the prototype chain.
    if (k === '__proto__' || k === 'constructor') continue;
    if (isBranch(val)) {
      const existing = target[k];
      target[k] = merge(
        isBranch(existing) ? existing : Object.create(null),
        val,
        mergeUndefined
      );
    } else target[k] = val;
  }
  return target;
}
/**
 * Deletes the property at `path` inside `t`.
 *
 * Unless `preserveEmpty` is set, containers left empty by the deletion
 * (objects without keys, arrays holding only nullish entries) are removed
 * from their parent as well. Paths containing a `__proto__` segment are
 * ignored: on ordinary objects that segment resolves to the shared
 * prototype, so deleting through it would strip properties from every object.
 *
 * @param {*} t - The object to modify in place
 * @param {string} path - Path understood by `parse` (dots / bracket indices)
 * @param {boolean} [preserveEmpty=false] - Keep containers that became empty
 * @returns {*} The input `t`
 */
function rmv(t, path, preserveEmpty = false) {
  const keys = parse(path);
  if (keys.includes('__proto__')) return t;
  const last = keys.length - 1;
  const remove = (obj, i = 0) => {
    if (!obj || typeof obj !== 'object') return false;
    const key = keys[i];
    if (i === last) return delete obj[key];
    if (!remove(obj[key], i + 1)) return false;
    if (!preserveEmpty) {
      const val = obj[key];
      const isEmpty = Array.isArray(val)
        ? val.every((v) => v == null)
        : Object.keys(val).length === 0;
      if (typeof val === 'object' && isEmpty) delete obj[key];
    }
    return true;
  };
  remove(t);
  return t;
}
// Frozen, prototype-less namespace object bundling the path helpers above.
var DeepMerge = /*#__PURE__*/ Object.freeze({
  __proto__: null,
  get,
  has,
  merge,
  rmv,
  set
});
/**
 * Computes a line-by-line diff of two texts and renders it as text.
 *
 * Lines are compared by index (line N of `a` against line N of `b`). In
 * `word` mode each line pair is additionally compared token by token, in
 * `line` mode a differing pair is reported as one whole-line replacement.
 * The diff is computed once, in the constructor.
 */
class DiffChecker {
  a;
  b;
  options;
  entries = [];
  grouped = [];
  diffRun = false;

  /**
   * @param {string} a - Original text
   * @param {string} b - Modified text
   * @param {object} [opt] - Overrides for the default options below
   */
  constructor(a, b, opt = {}) {
    this.a = a;
    this.b = b;
    this.options = {
      mode: 'word',
      caseInsensitive: false,
      contextLines: 1,
      groupedLines: true,
      expandLines: false,
      showChangeMagnitude: true,
      maxMagnitudeSymbols: 5,
      lineBreak: '\n',
      ...opt
    };
    this.computeDiff();
  }

  /**
   * Splits both trimmed texts into lines.
   *
   * @returns {{linesA: string[], linesB: string[], maxLen: number}}
   */
  text2lines() {
    const linesA = this.a.trim().split(/\r?\n/);
    const linesB = this.b.trim().split(/\r?\n/);
    return { linesA, linesB, maxLen: Math.max(linesA.length, linesB.length) };
  }

  /**
   * Splits a line into comparison tokens according to the diff mode.
   *
   * @param {string} input - A single line
   * @returns {string[]|undefined} Tokens; `undefined` for an unknown mode
   */
  tokenize(input) {
    switch (this.options.mode) {
      case 'line':
        return [input];
      case 'word':
        return input.split(/\s+/);
    }
  }

  /**
   * Joins tokens back together (with a space in word mode).
   *
   * @param {string[]} input - Tokens to join
   * @returns {string}
   */
  concat(input) {
    return input.join(this.options.mode === 'word' ? ' ' : '');
  }

  /** Diffs all line pairs and builds the groups; runs at most once. */
  computeDiff() {
    if (this.diffRun) return;
    const { linesA, linesB, maxLen } = this.text2lines();
    for (let i = 0; i < maxLen; i++)
      this.lineDiff(linesA[i] || '', linesB[i] || '', i);
    this.findGroups();
    this.diffRun = true;
  }

  /**
   * Compares one pair of lines and records an entry if they differ.
   *
   * @param {string} a - Line of the original text ('' if missing)
   * @param {string} b - Line of the modified text ('' if missing)
   * @param {number} line - Zero-based line index
   */
  lineDiff(a, b, line) {
    const { mode, caseInsensitive } = this.options;
    const baseLen = Math.max(a.length, b.length);
    // Comparison happens on A/B, the reported text always comes from a/b.
    const A = caseInsensitive ? a.toLowerCase() : a;
    const B = caseInsensitive ? b.toLowerCase() : b;
    let diffs = [];
    let delSize = 0;
    let insSize = 0;
    switch (mode) {
      case 'line':
        if (A !== B) {
          diffs.push({
            posA: 0,
            posB: 0,
            del: a,
            ins: b,
            size: b.length - a.length
          });
          delSize = a.length;
          insSize = b.length;
        }
        break;
      case 'word':
        diffs = this.preciseDiff(a, A, b, B);
        for (const d of diffs) {
          delSize += d.del.length;
          insSize += d.ins.length;
        }
        break;
    }
    if (diffs.length)
      this.entries.push({
        line,
        diffs,
        delSize,
        insSize,
        baseLen,
        totalSize: insSize - delSize,
        magnitude: this.magnitude(delSize, insSize, baseLen)
      });
  }

  /**
   * Token-level diff of two lines.
   *
   * Walks both token lists in parallel, records runs of equal tokens and,
   * on a mismatch, looks up to three tokens ahead on either side for a
   * re-synchronization point. Everything between recorded matches becomes
   * a diff.
   *
   * @param {string} a - Original line as written
   * @param {string} A - Original line as compared (maybe lower-cased)
   * @param {string} b - Modified line as written
   * @param {string} B - Modified line as compared (maybe lower-cased)
   * @returns {object[]} Diffs with posA, posB, del, ins and size
   */
  preciseDiff(a, A, b, B) {
    // Character offset of each token, assuming one separator character
    // between consecutive tokens.
    const posIndex = (tokens) => {
      const positions = [];
      for (let i = 0; i < tokens.length; i++)
        positions.push(i ? positions[i - 1] + tokens[i - 1].length + 1 : 0);
      return positions;
    };
    const origA = this.tokenize(a);
    const origB = this.tokenize(b);
    const tokenA = this.tokenize(A);
    const tokenB = this.tokenize(B);
    const lenA = tokenA.length;
    const lenB = tokenB.length;
    const posArrA = posIndex(origA);
    const posArrB = posIndex(origB);
    const matches = [];
    let ai = 0;
    let bi = 0;
    while (ai < lenA && bi < lenB) {
      if (tokenA[ai] === tokenB[bi]) {
        let len = 1;
        while (
          ai + len < lenA &&
          bi + len < lenB &&
          tokenA[ai + len] === tokenB[bi + len]
        )
          len++;
        matches.push({ ai, bi, len });
        ai += len;
        bi += len;
      } else {
        let found = false;
        for (let offset = 1; offset <= 3 && !found; offset++) {
          if (ai + offset < lenA && tokenA[ai + offset] === tokenB[bi]) {
            matches.push({ ai: ai + offset, bi, len: 1 });
            ai += offset + 1;
            bi += 1;
            found = true;
          } else if (
            bi + offset < lenB &&
            tokenA[ai] === tokenB[bi + offset]
          ) {
            matches.push({ ai, bi: bi + offset, len: 1 });
            ai += 1;
            bi += offset + 1;
            found = true;
          }
        }
        if (!found) {
          ai++;
          bi++;
        }
      }
    }
    const diffs = [];
    const pushDiff = (i, j, delArr, insArr) =>
      diffs.push({
        posA: posArrA[i] ?? 0,
        posB: posArrB[j] ?? 0,
        del: this.concat(delArr),
        ins: this.concat(insArr),
        size: insArr.join('').length - delArr.join('').length
      });
    let i = 0;
    let j = 0;
    for (const m of matches) {
      if (i < m.ai || j < m.bi)
        pushDiff(i, j, origA.slice(i, m.ai), origB.slice(j, m.bi));
      i = m.ai + m.len;
      j = m.bi + m.len;
    }
    // Whatever follows the last match is one trailing diff.
    if (i < lenA || j < lenB) pushDiff(i, j, origA.slice(i), origB.slice(j));
    return diffs.filter((d) => d.del.length > 0 || d.ins.length > 0);
  }

  /**
   * Bundles entries whose context windows touch or overlap into groups.
   */
  findGroups() {
    const { contextLines } = this.options;
    const addGroup = (group, start, end) => {
      const [delSize, insSize, totalSize, baseLen] = [
        'delSize',
        'insSize',
        'totalSize',
        'baseLen'
      ].map((k) => group.reduce((sum, e) => sum + e[k], 0));
      this.grouped.push({
        start,
        end,
        delSize,
        insSize,
        totalSize,
        line: group[0].line,
        entries: group,
        magnitude: this.magnitude(delSize, insSize, baseLen)
      });
    };
    let group = [];
    let start = 0;
    let end = 0;
    for (const entry of this.entries) {
      const s = Math.max(0, entry.line - contextLines);
      const e = entry.line + contextLines;
      if (!group.length || s <= end + 1) {
        if (!group.length) start = s;
        end = Math.max(end, e);
        group.push(entry);
      } else {
        addGroup(group, start, end);
        group = [entry];
        start = s;
        end = e;
      }
    }
    if (group.length) addGroup(group, start, end);
  }

  /**
   * Builds a `+`/`-` bar describing how large and in which direction a
   * change is, relative to the length of the affected text.
   *
   * @param {number} del - Number of deleted characters
   * @param {number} ins - Number of inserted characters
   * @param {number} baseLen - Reference length
   * @returns {string} Between 1 and `maxMagnitudeSymbols` symbols, or ''
   */
  magnitude(del, ins, baseLen) {
    const { maxMagnitudeSymbols } = this.options;
    const total = del + ins;
    if (total === 0 || baseLen === 0) return '';
    const magLen = Math.min(
      maxMagnitudeSymbols,
      Math.max(Math.round((total / baseLen) * maxMagnitudeSymbols), 1)
    );
    const plus = Math.round((ins / total) * magLen);
    const minus = magLen - plus;
    return '+'.repeat(plus) + '-'.repeat(minus);
  }

  /**
   * Renders the diff as text.
   *
   * A line that exists in only one of the two texts is rendered with an
   * empty counterpart instead of failing on the missing line.
   *
   * @param {boolean} cli - Use ANSI colors instead of `-[..]` / `+[..]`
   * @returns {string} The rendered diff, joined with `options.lineBreak`
   */
  output(cli) {
    const {
      mode,
      contextLines,
      groupedLines,
      expandLines,
      showChangeMagnitude,
      lineBreak
    } = this.options;
    const { linesA, linesB, maxLen } = this.text2lines();
    const linePad = Math.max(4, maxLen.toString().length);
    const out = [''];
    const highlight = (s, ansi) => (cli ? `\x1b[${ansi}m${s}\x1b[0m` : s);
    const cy = (s) => highlight(s, '36');
    const gy = (s) => highlight(s, '90');
    const gn = (s) => highlight(s, '32');
    const rd = (s) => highlight(s, '31');
    const ye = (s) => highlight(s, '33');
    const del = (s) => (cli ? `\x1b[37;41m${s}\x1b[31;49m` : `-[${s}]`);
    const ins = (s) => (cli ? `\x1b[37;42m${s}\x1b[32;49m` : `+[${s}]`);
    // Wraps the changed parts of one line in del/ins markers.
    const mark = (text, diffs, type) => {
      if (!diffs.length || mode === 'line') return text;
      let res = '';
      let idx = 0;
      for (const d of diffs) {
        const pos = type === 'del' ? d.posA : d.posB;
        const val = type === 'del' ? d.del : d.ins;
        if (!val) continue;
        if (pos > idx) res += text.slice(idx, pos);
        res += type === 'del' ? del(val) : ins(val);
        idx = pos + val.length;
      }
      return res + text.slice(idx);
    };
    const header = (e) => {
      out.push(
        `${' '.repeat(linePad)} ${cy(`@@ -${e.line + 1},${e.delSize} +${e.line + 1},${e.insSize} @@`)} ${showChangeMagnitude ? ye(e.magnitude) : ''}`
      );
    };
    const line = (i, forced) => {
      // Lines that are empty or missing on both sides are not printed.
      if (!linesA[i] && !linesB[i]) return;
      // One text may be shorter than the other; treat its missing line as ''.
      const lineA = linesA[i] ?? '';
      const lineB = linesB[i] ?? '';
      const entry = this.entries.find((e) => e.line === i);
      const lineNo = (i + 1).toString().padStart(linePad, ' ');
      if (entry && forced === i) {
        out.push(`${lineNo} ${rd(`- ${mark(lineA, entry.diffs, 'del')}`)}`);
        out.push(
          `${' '.repeat(linePad)} ${gn(`+ ${mark(lineB, entry.diffs, 'ins')}`)}`
        );
      } else {
        out.push(`${lineNo} ${gy(lineA)}`);
      }
    };
    const block = (start, end, forced, headerEntry) => {
      if (headerEntry) header(headerEntry);
      for (let i = start; i <= end; i++) line(i, forced ?? i);
      out.push('');
    };
    if (expandLines === true) {
      block(0, maxLen);
    } else if (groupedLines === true) {
      for (const group of this.grouped)
        block(group.start, group.end, undefined, group);
    } else {
      for (const entry of this.entries)
        block(
          entry.line - contextLines,
          entry.line + contextLines,
          entry.line,
          entry
        );
    }
    return out.join(lineBreak);
  }

  /** @returns {object[]} The per-line diff entries */
  getStructuredDiff = () => this.entries;
  /** @returns {object[]} The grouped diff entries */
  getGroupedDiff = () => this.grouped;
  /** @returns {string} The diff rendered without ANSI colors */
  getASCIIDiff = () => this.output(false);
  /** @returns {string} The diff rendered with ANSI colors */
  getCLIDiff = () => this.output(true);
}
/**
 * Static registry of string filters organized by hook name.
 *
 * Each hook owns a map of filters (id, fn, priority, active, overrideable).
 * The active filters of a hook are compiled into one function on demand and
 * cached until the hook is modified.
 */
class Filter {
  static filters = new Map();
  static pipeline = new Map();

  /**
   * Returns the compiled function for a hook, building and caching it if
   * needed. Active filters run in ascending priority order. A hook without
   * any registered filter yields an identity function (not cached).
   *
   * @param {string} hook - Hook name
   * @returns {Function} Function applying all active filters to one input
   */
  static getPipeline(hook) {
    return ErrorUtil.wrap(
      () => {
        const compiled = Filter.pipeline.get(hook);
        if (compiled) return compiled;
        const registered = Filter.filters.get(hook);
        if (!registered) return (s) => s;
        const chain = [...registered.values()]
          .filter((entry) => entry.active)
          .sort((x, y) => x.priority - y.priority)
          .map((entry) => entry.fn);
        const run = (input) => {
          let value = input;
          for (const step of chain) value = step(value);
          return value;
        };
        Filter.pipeline.set(hook, run);
        return run;
      },
      `Error compiling filter pipeline for hook <${hook}>`,
      { hook }
    );
  }

  /**
   * @param {string} hook - Hook name
   * @param {string} id - Filter id
   * @returns {boolean} Whether the filter is registered for the hook
   */
  static has(hook, id) {
    return !!Filter.filters.get(hook)?.has(id);
  }

  /**
   * Registers (or replaces) a filter on a hook and invalidates the hook's
   * compiled pipeline.
   *
   * @param {string} hook - Hook name
   * @param {string} id - Filter id
   * @param {Function} fn - Filter function
   * @param {object} [opt] - priority (default 10), active (default true),
   *   overrideable (default true)
   * @returns {boolean} `false` if an existing, non-overrideable filter with
   *   this id blocks the registration, otherwise `true`
   */
  static add(hook, id, fn, opt = {}) {
    return ErrorUtil.wrap(
      () => {
        const { priority = 10, active = true, overrideable = true } = opt;
        const registered = Filter.filters.get(hook) ?? new Map();
        const current = registered.get(id);
        if (current && !current.overrideable) return false;
        registered.set(id, { id, fn, priority, active, overrideable });
        Filter.filters.set(hook, registered);
        Filter.pipeline.delete(hook);
        return true;
      },
      `Error adding filter <${id}> to hook <${hook}>`,
      { hook, id, opt }
    );
  }

  /**
   * Removes a filter and invalidates the hook's compiled pipeline.
   *
   * @returns {boolean} Whether a filter was removed
   */
  static remove(hook, id) {
    Filter.pipeline.delete(hook);
    const registered = Filter.filters.get(hook);
    if (!registered) return false;
    return registered.delete(id);
  }

  /**
   * Deactivates a filter without removing it.
   *
   * @returns {boolean} Whether the filter exists
   */
  static pause(hook, id) {
    Filter.pipeline.delete(hook);
    const entry = Filter.filters.get(hook)?.get(id);
    if (!entry) return false;
    entry.active = false;
    return true;
  }

  /**
   * Re-activates a paused filter.
   *
   * @returns {boolean} Whether the filter exists
   */
  static resume(hook, id) {
    Filter.pipeline.delete(hook);
    const entry = Filter.filters.get(hook)?.get(id);
    if (!entry) return false;
    entry.active = true;
    return true;
  }

  /**
   * Lists the filter ids of a hook in registration order.
   *
   * @param {string} hook - Hook name
   * @param {boolean} [active=false] - Only list active filters
   * @returns {string[]}
   */
  static list(hook, active = false) {
    const registered = Filter.filters.get(hook);
    if (!registered) return [];
    return [...registered.values()]
      .filter((entry) => !active || entry.active)
      .map((entry) => entry.id);
  }

  /**
   * Applies the hook's pipeline to a value or to every element of an array.
   *
   * @param {string} hook - Hook name
   * @param {*} input - Single value or array of values
   * @returns {*} The filtered value(s)
   */
  static apply(hook, input) {
    return ErrorUtil.wrap(
      () => {
        const run = Filter.getPipeline(hook);
        if (Array.isArray(input)) return input.map(run);
        return run(input);
      },
      `Error applying filters for hook <${hook}>`,
      { hook, input }
    );
  }

  /**
   * Promise-based variant of `apply`.
   *
   * @param {string} hook - Hook name
   * @param {*} input - Single value or array of values
   * @returns {Promise<*>} The filtered value(s)
   */
  static async applyAsync(hook, input) {
    return ErrorUtil.wrapAsync(
      async () => {
        const run = Filter.getPipeline(hook);
        if (Array.isArray(input)) return Promise.all(input.map(run));
        return Promise.resolve(run(input));
      },
      `Error applying filters for hook <${hook}>`,
      { hook, input }
    );
  }

  /**
   * Drops all compiled pipelines and the filters of one hook, or of every
   * hook when no hook is given.
   *
   * @param {string} [hook] - Hook name
   */
  static clear(hook) {
    Filter.pipeline.clear();
    if (hook) {
      Filter.filters.delete(hook);
      return;
    }
    Filter.filters.clear();
  }

  /** Drops all compiled pipelines but keeps the registered filters. */
  static clearPipeline() {
    Filter.pipeline.clear();
  }
}
/**
 * Fast, non-cryptographic string hashing.
 */
class Hasher {
  static FNV_PRIME = 0x01000193;
  static HASH_OFFSET = 0x811c9dc5;

  /**
   * Hashes a string to an unsigned 32-bit integer.
   *
   * Full groups of four UTF-16 code units are packed into one integer and
   * mixed with a single xor / multiply round; the remaining units are mixed
   * one at a time. A final xor-shift / multiply sequence scrambles the
   * result.
   *
   * NOTE(review): the two finalizer multiplications use plain `*=` rather
   * than `Math.imul`, so they are floating-point multiplications that are
   * truncated to 32 bits by the following xor. Kept as is because changing
   * it would change every hash value - confirm whether this is intended.
   *
   * @param {string} str - The string to hash
   * @returns {number} Unsigned 32-bit hash
   */
  static fastFNV1a(str) {
    const len = str.length;
    const tail = len % 4;
    const bulkEnd = len - tail;
    let hash = this.HASH_OFFSET;
    // Bulk phase: four code units per round.
    for (let pos = 0; pos < bulkEnd; pos += 4) {
      const chunk =
        str.charCodeAt(pos) |
        (str.charCodeAt(pos + 1) << 8) |
        (str.charCodeAt(pos + 2) << 16) |
        (str.charCodeAt(pos + 3) << 24);
      hash ^= chunk;
      hash = Math.imul(hash, this.FNV_PRIME);
    }
    // Tail phase: the up to three leftover code units.
    for (let pos = bulkEnd; pos < len; pos++) {
      hash ^= str.charCodeAt(pos);
      hash = Math.imul(hash, this.FNV_PRIME);
    }
    // Finalizer.
    hash ^= hash >>> 16;
    hash *= 0x85ebca6b;
    hash ^= hash >>> 13;
    hash *= 0xc2b2ae35;
    hash ^= hash >>> 16;
    return hash >>> 0;
  }
}
/**
 * Size-limited key/value table backed by a Map.
 *
 * Once `TABLE_SIZE` entries are stored, inserting a new key either evicts
 * the oldest inserted entries (when `LRU` is true) or is refused.
 */
class HashTable {
  LRU;
  static MAX_LEN = 2048;
  static TABLE_SIZE = 10_000;
  table = new Map();

  /**
   * @param {boolean} [LRU=true] - Evict the oldest entry when the table is full
   */
  constructor(LRU = true) {
    this.LRU = LRU;
  }

  /**
   * Builds a table key from a label and the hashes of the given strings.
   *
   * @param {string} label - Prefix of the key
   * @param {string[]} strs - Strings to hash
   * @param {boolean} [sorted=false] - Sort the hashes so that the order of
   *   `strs` does not matter
   * @returns {string|false} The key, or `false` if any string is longer
   *   than `MAX_LEN`
   */
  key(label, strs, sorted = false) {
    const tooLong = strs.some((str) => str.length > HashTable.MAX_LEN);
    if (tooLong) return false;
    const hashes = strs.map((s) => Hasher.fastFNV1a(s));
    if (sorted) hashes.sort();
    return [label, ...hashes].join('-');
  }

  /** @returns {boolean} Whether the key is stored */
  has = (key) => this.table.has(key);
  /** @returns {*} The stored entry or `undefined` */
  get = (key) => this.table.get(key);

  /**
   * Stores an entry.
   *
   * @param {string} key - Table key
   * @param {*} entry - Value to store
   * @param {boolean} [update=true] - Allow overwriting an existing key
   * @returns {boolean} `false` if the key exists and `update` is false, or
   *   if the table is full and eviction is disabled; otherwise `true`
   */
  set(key, entry, update = true) {
    const exists = this.table.has(key);
    if (exists && !update) return false;
    if (!exists) {
      // Make room for the new key; Map iteration yields the oldest key first.
      while (this.table.size >= HashTable.TABLE_SIZE) {
        if (!this.LRU) return false;
        this.table.delete(this.table.keys().next().value);
      }
    }
    this.table.set(key, entry);
    return true;
  }

  /** @returns {boolean} Whether an entry was removed */
  delete = (key) => this.table.delete(key);
  /** Removes all entries. */
  clear = () => this.table.clear();
  /** @returns {number} Number of stored entries */
  size = () => this.table.size;
}
/**
 * Flag-driven string normalization with cached pipelines and cached results.
 *
 * Each flag character selects one step; the steps always run in the fixed
 * order of the table in `getPipeline`, regardless of the order of the flags.
 */
class Normalizer {
  static pipeline = new Map();
  static cache = new HashTable();
  // NOTE(review): `nonNumbers` matches characters that ARE numbers (\p{N}),
  // so the `n` step removes digits - the name suggests the opposite. Confirm
  // which behavior is intended before renaming or changing the pattern.
  static REGEX = {
    whitespace: /\s+/g,
    doubleChars: /(.)\1+/g,
    specialChars: /[^\p{L}\p{N}\s]/gu,
    nonLetters: /[^\p{L}]/gu,
    nonNumbers: /\p{N}/gu
  };

  /**
   * Brings a flag string into canonical form (unique characters, sorted).
   *
   * @param {string} flags - Flag characters in any order
   * @returns {string} The canonical flag string
   */
  static canonicalFlags(flags) {
    return [...new Set(flags)].sort().join('');
  }

  /**
   * Returns the normalization function for a flag string, building and
   * caching it on first use.
   *
   * @param {string} flags - Flag characters
   * @returns {Function} Function mapping a string to its normalized form
   */
  static getPipeline(flags) {
    return ErrorUtil.wrap(
      () => {
        if (Normalizer.pipeline.has(flags))
          return Normalizer.pipeline.get(flags);
        const { REGEX } = Normalizer;
        const available = [
          { flag: 'd', apply: (s) => s.normalize('NFD') },
          { flag: 'i', apply: (s) => s.toLowerCase() },
          { flag: 'k', apply: (s) => s.replace(REGEX.nonLetters, '') },
          { flag: 'n', apply: (s) => s.replace(REGEX.nonNumbers, '') },
          { flag: 'r', apply: (s) => s.replace(REGEX.doubleChars, '$1') },
          { flag: 's', apply: (s) => s.replace(REGEX.specialChars, '') },
          { flag: 't', apply: (s) => s.trim() },
          { flag: 'u', apply: (s) => s.normalize('NFC') },
          { flag: 'w', apply: (s) => s.replace(REGEX.whitespace, ' ') },
          { flag: 'x', apply: (s) => s.normalize('NFKC') }
        ];
        const selected = available
          .filter(({ flag }) => flags.includes(flag))
          .map(({ apply }) => apply);
        const run = (s) => {
          let value = s;
          for (const step of selected) value = step(value);
          return value;
        };
        Normalizer.pipeline.set(flags, run);
        return run;
      },
      `Failed to create normalization pipeline for flags: ${flags}`,
      { flags }
    );
  }

  /**
   * Normalizes a string or every string of an array.
   *
   * Falsy input and missing / non-string flags return the input unchanged.
   * Results are cached per canonical flag string and input hash whenever the
   * cache can build a key for the input.
   *
   * @param {string|string[]} input - Value(s) to normalize
   * @param {string} flags - Flag characters
   * @returns {string|string[]} The normalized value(s)
   */
  static normalize(input, flags) {
    return ErrorUtil.wrap(
      () => {
        if (!flags || typeof flags !== 'string' || !input) return input;
        const canonical = this.canonicalFlags(flags);
        if (Array.isArray(input))
          return input.map((s) => Normalizer.normalize(s, canonical));
        const key = Normalizer.cache.key(canonical, [input]);
        if (key && Normalizer.cache.has(key)) return Normalizer.cache.get(key);
        const res = Normalizer.getPipeline(canonical)(input);
        if (key) Normalizer.cache.set(key, res);
        return res;
      },
      `Failed to normalize input with flags: ${flags}`,
      { input, flags }
    );
  }

  /**
   * Promise-based variant of `normalize`.
   *
   * @param {string|string[]} input - Value(s) to normalize
   * @param {string} flags - Flag characters
   * @returns {Promise<string|string[]>} The normalized value(s)
   */
  static async normalizeAsync(input, flags) {
    return await ErrorUtil.wrapAsync(
      async () => {
        if (!flags || typeof flags !== 'string' || !input) return input;
        if (Array.isArray(input))
          return await Promise.all(
            input.map((s) => Normalizer.normalize(s, flags))
          );
        return await Promise.resolve(Normalizer.normalize(input, flags));
      },
      `Failed to asynchronously normalize input with flags: ${flags}`,
      { input, flags }
    );
  }

  /** Drops all cached pipelines and all cached results. */
  static clear() {
    Normalizer.pipeline.clear();
    Normalizer.cache.clear();
  }
}
/**
 * Lightweight time / memory profiler.
 *
 * When active, `run` and `runAsync` record the elapsed time and the change
 * in used heap for each executed function; when inactive they just execute
 * the function.
 */
class Profiler {
  active;
  static ENV;
  static instance;
  nowFn;
  memFn;
  store = new Set();
  totalTime = 0;
  totalMem = 0;

  /**
   * Detects the runtime and stores it in `Profiler.ENV`.
   *
   * A `process` global alone is not trusted (bundlers may provide a partial
   * shim); the functions that are actually used must exist.
   */
  static detectEnv() {
    if (
      typeof process !== 'undefined' &&
      typeof process.hrtime?.bigint === 'function' &&
      typeof process.memoryUsage === 'function'
    )
      Profiler.ENV = 'nodejs';
    else if (
      typeof performance !== 'undefined' &&
      typeof performance.now === 'function'
    )
      Profiler.ENV = 'browser';
    else Profiler.ENV = 'unknown';
  }

  /**
   * Returns the shared profiler, creating it on first use.
   *
   * @param {boolean} [enable] - Initial state; only used on creation
   * @returns {Profiler}
   */
  static getInstance(enable) {
    if (!Profiler.ENV) Profiler.detectEnv();
    return (Profiler.instance ||= new Profiler(enable));
  }

  /**
   * @param {boolean} [active=false] - Whether profiling starts enabled
   */
  constructor(active = false) {
    this.active = active;
    // Direct construction must not silently fall back to the coarse clock.
    if (!Profiler.ENV) Profiler.detectEnv();
    switch (Profiler.ENV) {
      case 'nodejs':
        this.nowFn = () => Number(process.hrtime.bigint()) / 1e6;
        this.memFn = () => process.memoryUsage().heapUsed;
        break;
      case 'browser':
        this.nowFn = () => performance.now();
        this.memFn = () => performance.memory?.usedJSHeapSize ?? 0;
        break;
      default:
        this.nowFn = () => Date.now();
        this.memFn = () => 0;
        break;
    }
  }

  /** @returns {number} Current time in milliseconds */
  now = () => this.nowFn();
  /** @returns {number} Currently used heap as reported by the runtime */
  mem = () => this.memFn();

  /**
   * Stores one measurement and adds it to the totals.
   *
   * @param {number} startTime - `now()` taken before the call
   * @param {number} startMem - `mem()` taken before the call
   * @param {*} res - Result of the profiled function
   * @param {*} meta - Caller-supplied metadata
   */
  record(startTime, startMem, res, meta) {
    const time = this.now() - startTime;
    const mem = this.mem() - startMem;
    this.store.add({ time, mem, res, meta });
    this.totalTime += time;
    this.totalMem += mem;
  }

  /**
   * Executes a synchronous function and records its cost.
   *
   * @param {Function} fn - Function to execute
   * @param {*} meta - Metadata stored with the measurement
   * @returns {*} The return value of `fn`
   */
  profile(fn, meta) {
    const startTime = this.now();
    const startMem = this.mem();
    const res = fn();
    this.record(startTime, startMem, res, meta);
    return res;
  }

  /**
   * Executes an asynchronous function and records its cost once the
   * returned promise has settled, so the measurement covers the awaited
   * work and stores the resolved value.
   *
   * @param {Function} fn - Function returning a value or a promise
   * @param {*} meta - Metadata stored with the measurement
   * @returns {Promise<*>} Resolves with the result of `fn`
   */
  async profileAsync(fn, meta) {
    const startTime = this.now();
    const startMem = this.mem();
    const res = await fn();
    this.record(startTime, startMem, res, meta);
    return res;
  }

  enable = () => {
    this.active = true;
  };
  disable = () => {
    this.active = false;
  };

  /** Removes all measurements and resets the totals. */
  clear() {
    this.store.clear();
    this.totalTime = 0;
    this.totalMem = 0;
  }

  /**
   * Runs `fn`, profiling it only while the profiler is active.
   *
   * @param {Function} fn - Function to execute
   * @param {*} [meta={}] - Metadata stored with the measurement
   * @returns {*} The return value of `fn`
   */
  run(fn, meta = {}) {
    return this.active ? this.profile(fn, meta) : fn();
  }

  /**
   * Asynchronous variant of `run`.
   *
   * @param {Function} fn - Function returning a value or a promise
   * @param {*} [meta={}] - Metadata stored with the measurement
   * @returns {Promise<*>} Resolves with the result of `fn`
   */
  async runAsync(fn, meta = {}) {
    return this.active ? await this.profileAsync(fn, meta) : await fn();
  }

  /** @returns {object[]} All measurements in recording order */
  getAll = () => [...this.store];
  /** @returns {object|undefined} The most recent measurement */
  getLast = () => this.getAll().pop();
  /** @returns {{time: number, mem: number}} Accumulated totals */
  getTotal = () => ({ time: this.totalTime, mem: this.totalMem });

  // Frozen facade exposing the control and reporting functions.
  services = Object.freeze({
    enable: this.enable.bind(this),
    disable: this.disable.bind(this),
    clear: this.clear.bind(this),
    report: this.getAll.bind(this),
    last: this.getLast.bind(this),
    total: this.getTotal.bind(this)
  });
}
// All registry services, keyed by registry name.
const registry = Object.create(null);
// One instance factory per registry, keyed by registry name.
const factory = Object.create(null);

/**
 * Creates a named class registry and its matching instance factory.
 *
 * The returned service stores classes by name and only accepts classes
 * whose prototype chain contains `ctor.prototype`. A registry name can be
 * used only once.
 *
 * @param {string} reg - Name of the registry
 * @param {Function} ctor - Base class every registered class must extend
 * @returns {object} Frozen service with add, remove, has, list and get
 * @throws {CmpStrUsageError} If a registry with this name already exists
 */
function Registry(reg, ctor) {
  const taken = reg in registry || reg in factory;
  ErrorUtil.assert(
    !taken,
    `Registry <${reg}> already exists / overwriting is forbidden`,
    { registry: reg }
  );
  const classes = Object.create(null);
  // Shared name check for add() and get().
  const requireName = (name) =>
    ErrorUtil.assert(
      typeof name === 'string' && name.length > 0,
      `Class name must be a non-empty string`,
      { registry: reg, name }
    );
  const service = Object.freeze({
    /**
     * Registers a class; an existing name is only replaced with `update`.
     */
    add(name, cls, update = false) {
      requireName(name);
      ErrorUtil.assert(
        typeof cls === 'function',
        `Class must be a constructor function`,
        { registry: reg, class: cls }
      );
      ErrorUtil.assert(
        cls.prototype instanceof ctor,
        `Class must extend <${reg}>`,
        { registry: reg, class: cls }
      );
      const mayStore = update || !(name in classes);
      ErrorUtil.assert(
        mayStore,
        `Class <${name}> already exists / use <update=true> to overwrite`,
        { registry: reg, name }
      );
      classes[name] = cls;
    },
    /** Removes a class; unknown names are ignored. */
    remove(name) {
      delete classes[name];
    },
    /** Reports whether a class is registered under the name. */
    has(name) {
      return name in classes;
    },
    /** Lists the registered class names. */
    list() {
      return Object.keys(classes);
    },
    /** Returns the class registered under the name or throws. */
    get(name) {
      requireName(name);
      ErrorUtil.assert(
        name in classes,
        `Class <${name}> not registered for <${reg}>`,
        { registry: reg, name }
      );
      return classes[name];
    }
  });
  registry[reg] = service;
  factory[reg] = (cls, ...args) => createFromRegistry(reg, cls, ...args);
  return service;
}
/**
 * Resolves a class reference for a registry.
 *
 * @param {string} reg - Name of the registry
 * @param {string|Function} cls - Registered class name, or the class itself
 * @returns {*} The class looked up by name, or `cls` unchanged if it is not
 *   a string
 * @throws {CmpStrNotFoundError} If the registry does not exist
 */
function resolveCls(reg, cls) {
  if (reg in registry) {
    const byName = typeof cls === 'string';
    return byName ? registry[reg]?.get(cls) : cls;
  }
  throw new CmpStrNotFoundError(`Registry <${reg}> does not exist`, {
    registry: reg
  });
}
/**
 * Creates an instance of a class that belongs to a registry.
 *
 * The error message is built without dereferencing the resolved class, so a
 * missing (nullish) class is reported through the wrapped instantiation
 * error instead of an unrelated TypeError raised while formatting the text.
 *
 * @param {string} reg - Name of the registry
 * @param {string|Function} cls - Registered class name, or the class itself
 * @param {...*} args - Arguments passed to the constructor
 * @returns {object} The new instance
 * @throws {CmpStrError} If the registry or class cannot be resolved, or the
 *   constructor call fails
 */
function createFromRegistry(reg, cls, ...args) {
  const resolved = resolveCls(reg, cls);
  const label = resolved?.name ?? String(resolved);
  return ErrorUtil.wrap(
    () => new resolved(...args),
    `Failed to create instance of class <${label}> from registry <${reg}>`,
    { registry: reg, class: resolved, args }
  );
}
/**
 * Bounded pool of reusable items (`{ buffer, size }` records).
 *
 * Items are searched round-robin starting at `pointer`. An acquired item is
 * removed from the pool until it is released again, so one buffer is never
 * handed to two users at the same time.
 */
class RingPool {
  maxSize;
  buffers = [];
  pointer = 0;

  /**
   * @param {number} maxSize - Maximum number of items kept in the pool
   */
  constructor(maxSize) {
    this.maxSize = maxSize;
  }

  /**
   * Takes a fitting item out of the pool.
   *
   * @param {number} minSize - Required size
   * @param {boolean} allowOversize - Accept items larger than `minSize`
   * @returns {object|null} The pooled item, or `null` if none fits
   */
  acquire(minSize, allowOversize) {
    return ErrorUtil.wrap(
      () => {
        const len = this.buffers.length;
        for (let i = 0; i < len; i++) {
          // A real modulo is required: the pool length is rarely a power of
          // two, and a bit mask would skip slots for all other lengths.
          const idx = (this.pointer + i) % len;
          const item = this.buffers[idx];
          if (
            item.size >= minSize &&
            (allowOversize || item.size === minSize)
          ) {
            // Hand the item out exclusively until it is released again.
            this.buffers.splice(idx, 1);
            const remaining = this.buffers.length;
            this.pointer = remaining ? idx % remaining : 0;
            return item;
          }
        }
        return null;
      },
      `Failed to acquire buffer of size >= ${minSize} from pool`,
      { minSize, allowOversize }
    );
  }

  /**
   * Puts an item into the pool. While the pool is not full the item is
   * appended; afterwards it replaces the item at the current pointer.
   *
   * @param {object} item - Record of the form `{ buffer, size }`
   */
  release(item) {
    ErrorUtil.wrap(
      () => {
        // A pool without capacity keeps nothing.
        if (!(this.maxSize > 0)) return;
        if (this.buffers.length < this.maxSize) {
          this.buffers.push(item);
          return;
        }
        this.buffers[this.pointer] = item;
        this.pointer = (this.pointer + 1) % this.maxSize;
      },
      `Failed to release buffer back to pool`,
      { item }
    );
  }

  /** Drops all pooled items. */
  clear() {
    this.buffers = [];
    this.pointer = 0;
  }
}
/**
 * Static front end for the typed buffer pools.
 *
 * `CONFIG` describes each supported type (ring size, largest item that is
 * pooled, whether larger pooled items may serve smaller requests); `POOLS`
 * holds one RingPool per type.
 */
class Pool {
  static CONFIG = {
    int32: {
      type: 'int32',
      maxSize: 64,
      maxItemSize: 2048,
      allowOversize: true
    },
    'number[]': {
      type: 'number[]',
      maxSize: 16,
      maxItemSize: 1024,
      allowOversize: false
    },
    'string[]': {
      type: 'string[]',
      maxSize: 2,
      maxItemSize: 1024,
      allowOversize: false
    },
    set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false },
    map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false }
  };
  static POOLS = {
    int32: new RingPool(64),
    'number[]': new RingPool(16),
    'string[]': new RingPool(2),
    set: new RingPool(8),
    map: new RingPool(8)
  };

  /**
   * Creates a fresh buffer of the given type.
   *
   * @param {string} type - One of the keys of `CONFIG`
   * @param {number} size - Requested length (ignored for set and map)
   * @returns {*} The new buffer; `undefined` for an unknown type
   */
  static allocate(type, size) {
    if (type === 'int32') return new Int32Array(size);
    if (type === 'number[]') return new Float64Array(size);
    if (type === 'string[]') return new Array(size);
    if (type === 'set') return new Set();
    if (type === 'map') return new Map();
    return undefined;
  }

  /**
   * Returns a buffer of the given type and size, reusing a pooled one when
   * possible. Requests larger than the type's `maxItemSize` are always
   * freshly allocated. Pooled `int32` buffers are returned as a view limited
   * to `size` elements.
   *
   * @param {string} type - One of the keys of `CONFIG`
   * @param {number} size - Requested size
   * @returns {*} The buffer
   * @throws {CmpStrUsageError} If the type is not supported
   */
  static acquire(type, size) {
    const settings = this.CONFIG[type];
    if (!settings)
      throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
    if (size > settings.maxItemSize) return this.allocate(type, size);
    const pooled = this.POOLS[type].acquire(size, settings.allowOversize);
    if (!pooled) return this.allocate(type, size);
    if (type === 'int32') return pooled.buffer.subarray(0, size);
    return pooled.buffer;
  }

  /**
   * Acquires one buffer per requested size.
   *
   * @param {string} type - One of the keys of `CONFIG`
   * @param {number[]} sizes - Requested sizes
   * @returns {Array} The buffers, in the order of `sizes`
   */
  static acquireMany(type, sizes) {
    return sizes.map((size) => this.acquire(type, size));
  }

  /**
   * Returns a buffer to its pool; buffers larger than the type's
   * `maxItemSize` are not pooled.
   *
   * @param {string} type - One of the keys of `CONFIG`
   * @param {*} buffer - The buffer to release
   * @param {number} size - Size recorded for the buffer
   * @throws {CmpStrUsageError} If the type is not supported
   */
  static release(type, buffer, size) {
    const settings = this.CONFIG[type];
    if (!settings)
      throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type });
    if (size > settings.maxItemSize) return;
    this.POOLS[type].release({ buffer, size });
  }
}
/**
 * Runs string comparisons against one property of an array of objects and
 * maps the comparison results back to the original objects.
 */
class StructuredData {
  data;
  key;

  /**
   * @param {object[]} data - The objects to search
   * @param {string} key - Property whose value is compared
   * @returns {StructuredData}
   */
  static create(data, key) {
    return new StructuredData(data, key);
  }

  /**
   * @param {object[]} data - The objects to search
   * @param {string} key - Property whose value is compared
   */
  constructor(data, key) {
    this.data = data;
    this.key = key;
  }

  /**
   * Collects `obj[key]` of every object as a string; nullish values become
   * ''. The result array is acquired from the 'string[]' pool.
   *
   * @param {object[]} arr - Source objects
   * @param {string} key - Property to read
   * @returns {string[]} One string per object
   */
  extractFrom(arr, key) {
    const total = arr.length;
    const strings = Pool.acquire('string[]', total);
    for (let i = 0; i < total; i++) {
      const raw = arr[i][key];
      strings[i] = typeof raw === 'string' ? raw : String(raw ?? '');
    }
    return strings;
  }

  /** Extracts the strings of this instance's own data and key. */
  extract = () => this.extractFrom(this.data, this.key);

  /** @returns {boolean} Whether `v` has the shape `{ a, b, res }` */
  isMetricResult(v) {
    if (typeof v !== 'object' || v === null) return false;
    return 'a' in v && 'b' in v && 'res' in v;
  }

  /** @returns {boolean} Whether `v` has the shape `{ source, target, match }` */
  isCmpStrResult(v) {
    if (typeof v !== 'object' || v === null) return false;
    return 'source' in v && 'target' in v && 'match' in v;
  }

  /**
   * Converts a result list into the `{ a, b, res }` shape and tags every
   * result with its original position (`__idx`). The format is detected
   * from the first element only.
   *
   * @param {object[]} results - Results in either supported shape
   * @returns {object[]} Normalized copies ([] for empty / non-array input)
   * @throws {CmpStrValidationError} If the first element has neither shape
   */
  normalizeResults(results) {
    if (!Array.isArray(results) || results.length === 0) return [];
    const sample = results[0];
    let unified;
    if (this.isMetricResult(sample)) {
      unified = results;
    } else if (this.isCmpStrResult(sample)) {
      unified = results.map((r) => ({
        metric: 'unknown',
        a: r.source,
        b: r.target,
        res: r.match,
        raw: r.raw
      }));
    } else {
      throw new CmpStrValidationError(
        'Unsupported result format for StructuredData normalization.'
      );
    }
    return unified.map((r, idx) => ({ ...r, __idx: idx }));
  }

  /**
   * Maps normalized results back to their source objects.
   *
   * A result is matched to a source object through its target string; when
   * several objects share that string they are assigned in order of
   * occurrence. If the string is unknown, the result's original position
   * is used instead.
   *
   * @param {object[]} results - Normalized results
   * @param {object[]} sourceData - The original objects
   * @param {string[]} extractedStrings - Strings extracted from `sourceData`
   * @param {boolean} [removeZero] - Skip results whose `res` is exactly 0
   * @param {boolean} [objectsOnly] - Return the bare source objects
   * @returns {object[]} Source objects, or `{ obj, key, result[, raw] }`
   */
  rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) {
    // Positions of every extracted string, in order of appearance.
    const positions = new Map();
    extractedStrings.forEach((str, i) => {
      const list = positions.get(str);
      if (list) list.push(i);
      else positions.set(str, [i]);
    });
    const seen = new Map();
    const output = [];
    for (let i = 0; i < results.length; i++) {
      const result = results[i];
      if (removeZero && result.res === 0) continue;
      const targetStr = result.b || '';
      const candidates = positions.get(targetStr);
      let dataIndex;
      if (candidates && candidates.length > 0) {
        const occurrence = seen.get(targetStr) ?? 0;
        seen.set(targetStr, occurrence + 1);
        dataIndex = candidates[occurrence % candidates.length];
      } else {
        dataIndex = result.__idx ?? i;
      }
      if (dataIndex < 0 || dataIndex >= sourceData.length) continue;
      const sourceObj = sourceData[dataIndex];
      if (objectsOnly) {
        output.push(sourceObj);
        continue;
      }
      const entry = {
        obj: sourceObj,
        key: this.key,
        result: {
          source: result.a,
          target: extractedStrings[dataIndex] || targetStr,
          match: result.res
        }
      };
      if (result.raw) entry.raw = result.raw;
      output.push(entry);
    }
    return output;
  }

  /**
   * Sorts results by `res` in place.
   *
   * @param {object[]} results - Normalized results
   * @param {string|boolean} [sort] - 'asc' sorts ascending, any other truthy
   *   value descending, a falsy value keeps the order
   * @returns {object[]} The same array
   */
  sort(results, sort) {
    if (!sort || results.length <= 1) return results;
    const direction = sort === 'asc' ? 1 : -1;
    return results.sort((x, y) => direction * (x.res - y.res));
  }

  /** Normalizes, sorts and rebuilds raw lookup results. */
  finalizeLookup(results, extractedStrings, opt) {
    const normalized = this.normalizeResults(results);
    const ordered = this.sort(normalized, opt?.sort);
    return this.rebuild(
      ordered,
      this.data,
      extractedStrings,
      opt?.removeZero,
      opt?.objectsOnly
    );
  }

  /** Runs `fn` and finalizes its results inside the error wrapper. */
  performLookup(fn, extractedStrings, opt) {
    return ErrorUtil.wrap(
      () => this.finalizeLookup(fn(), extractedStrings, opt),
      'StructuredData lookup failed',
      { key: this.key }
    );
  }

  /** Asynchronous variant of `performLookup`. */
  async performLookupAsync(fn, extractedStrings, opt) {
    return await ErrorUtil.wrapAsync(
      async () => this.finalizeLookup(await fn(), extractedStrings, opt),
      'StructuredData async lookup failed',
      { key: this.key }
    );
  }

  /**
   * Compares `query` against the extracted strings.
   *
   * @param {Function} fn - Called as `fn(query, strings, opt)`
   * @param {*} query - Value passed through to `fn`
   * @param {object} [opt] - sort, removeZero, objectsOnly; also given to `fn`
   * @returns {object[]} The rebuilt results
   */
  lookup(fn, query, opt) {
    const targets = this.extract();
    try {
      return this.performLookup(() => fn(query, targets, opt), targets, opt);
    } finally {
      // Hand the pooled array back even if the lookup failed.
      Pool.release('string[]', targets, targets.length);
    }
  }

  /** Asynchronous variant of `lookup`. */
  async lookupAsync(fn, query, opt) {
    const targets = this.extract();
    try {
      return await this.performLookupAsync(
        () => fn(query, targets, opt),
        targets,
        opt
      );
    } finally {
      Pool.release('string[]', targets, targets.length);
    }
  }

  /**
   * Compares this instance's strings against those of another object list.
   *
   * @param {Function} fn - Called as `fn(ownStrings, otherStrings, opt)`
   * @param {object[]} other - The other objects
   * @param {string} otherKey - Property to read from `other`
   * @param {object} [opt] - sort, removeZero, objectsOnly; also given to `fn`
   * @returns {object[]} The rebuilt results
   */
  lookupPairs(fn, other, otherKey, opt) {
    const own = this.extract();
    const theirs = this.extractFrom(other, otherKey);
    try {
      return this.performLookup(() => fn(own, theirs, opt), own, opt);
    } finally {
      Pool.release('string[]', own, own.length);
      Pool.release('string[]', theirs, theirs.length);
    }
  }

  /** Asynchronous variant of `lookupPairs`. */
  async lookupPairsAsync(fn, other, otherKey, opt) {
    const own = this.extract();
    const theirs = this.extractFrom(other, otherKey);
    try {
      return await this.performLookupAsync(
        () => fn(own, theirs, opt),
        own,
        opt
      );
    } finally {
      Pool.release('string[]', own, own.length);
      Pool.release('string[]', theirs, theirs.length);
    }
  }
}
class TextAnalyzer {
static REGEX = {
number: /\d/,
sentence: /(?<=[.!?])\s+/,
word: /\p{L}+/gu,
nonWord: /[^\p{L}]/gu,
vowelGroup: /[aeiouy]+/g,
letter: /\p{L}/gu,
ucLetter: /\p{Lu}/gu
};
text;
words = [];
sentences = [];
charFrequency = new Map();
wordHistogram = new Map();
syllableCache = new Map();
syllableStats;
constructor(input) {
this.text = input.trim();
this.tokenize();
this.computeFrequencies();
}
tokenize() {
let match;
const lcText = this.text.toLowerCase();
while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null)
this.words.push(match[0]);
this.sentences = this.text
.split(TextAnalyzer.REGEX.sentence)
.filter(Boolean);
}
computeFrequencies() {
for (const char of this.text)
this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
for (const word of this.words)
this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
}
estimateSyllables(word) {
const clean = word
.normalize('NFC')
.toLowerCase()
.replace(TextAnalyzer.REGEX.nonWord, '');
if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
const count = matches ? matches.length : 1;
this.syllableCache.set(clean, count);
return count;
}
computeSyllableStats() {
return (this.syllableStats ||= (() => {
const perWord = this.words
.map((w) => this.estimateSyllables(w))
.sort((a, b) => a - b);
const total = perWord.reduce((sum, s) => sum + s, 0);
const mono = perWord.filter((s) => s === 1).length;
const median = !perWord.length
? 0
: perWord.length % 2 === 0
? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) /
2
: perWord[Math.floor(perWord.length / 2)];
return {
total,
mono,
perWord,
avg: perWord.length ? total / perWord.length : 0,
median
};
})());
}
getLength = () => this.text.length;
getWordCount = () => this.words.length;
getSentenceCount = () => this.sentences.length;
getAvgWordLength() {
return this.words.length
? this.words.join('').length / this.words.length
: 0;
}
getAvgSentenceLength() {
return this.sentences.length
? this.words.length / this.sentences.length
: 0;
}
getWordHistogram() {
return Object.fromEntries(this.wordHistogram);
}
getMostCommonWords(limit = 5) {
return [...this.wordHistogram.entries()]
.sort((a, b) => b[1] - a[1])
.slice(0, limit)
.map((e) => e[0]);
}
getHapaxLegomena() {
return [...this.wordHistogram.entries()]
.filter(([, c]) => c === 1)
.map((e) => e[0]);
}
hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);
getUpperCaseRatio() {
const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
return matches.length ? upper / matches.length : 0;
}
getCharFrequency() {
return Object.fromEntries(this.charFrequency);
}
getUnicodeCodepoints() {
const result = {};
for (const [char, count] of this.charFrequency) {
const block = char
.charCodeAt(0)
.toString(16)
.padStart(4, '0')
.toUpperCase();
result[block] = (result[block] || 0) + count;
}
return result;
}
getLongWordRatio(len = 7) {
let long = 0;
for (const w of this.words) if (w.length >= len) long++;
return this.words.length ? long / this.words.length : 0;
}
getShortWordRatio(len = 3) {
let short = 0;
for (const w of this.words) if (w.length <= len) short++;
return this.words.length ? short / this.words.length : 0;
}
getSyllablesCount() {
return this.computeSyllableStats().total;
}
getMonosyllabicWordCount() {
return this.computeSyllableStats().mono;
}
getMinSyllablesWordCount(min) {
return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
}
getMaxSyllablesWordCount(max) {
return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
}
getAvgSyllablesPerWord() {
return this.computeSyllableStats().avg;
}
getMedianSyllablesPerWord() {
return this.computeSyllableStats().median;
}
getHonoresR() {
try {
return (
(100 * Math.log(this.words.length)) /
(1 - this.getHapaxLegomena().length / (this.wordHistogram.size ?? 1))
);
} catch {
return 0;
}
}
getReadingTime(wpm = 200) {
return this.words.length / (wpm ?? 1);
}
getReadabilityScore(metric = 'flesch') {
const w = this.words.length || 1;
const s = this.sentences.length || 1;
const y = this.getSyllablesCount() || 1;
const asl = w / s;
const asw = y / w;
switch (metric) {
case 'flesch':
return 206.835 - 1.015 * asl - 84.6 * asw;
case 'fleschde':
return 180 - asl - 58.5 * asw;
case 'kincaid':
return 0.39 * asl + 11.8 * asw - 15.59;
}
}
getLIXScore() {
const w = this.words.length || 1;
const s = this.sentences.length || 1;
const l = this.getLongWordRatio() * w;
return w / s + (l / w) * 100;
}
getWSTFScore() {
const w = this.words.length || 1;
const h = (this.getMinSyllablesWordCount(3) / w) * 100;
const s = this.getAvgSentenceLength();
const l = this.getLongWordRatio() * 100;
const m = (this.getMonosyllabicWordCount() / w) * 100;
return [
0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
0.2963 * h + 0.1905 * s - 1.1144,
0.2744 * h + 0.2656 * s - 1.693
];
}
}
// Profiler handle obtained via Profiler.getInstance(); Metric.runSingle()
// routes every computation that preCompute() cannot answer through its run().
const profiler$2 = Profiler.getInstance();
/**
 * Base class for string similarity metrics.
 *
 * Holds one or more inputs on each side, dispatches the comparison in
 * single, batch or pairwise mode and caches computed results in a table
 * shared by all instances. Subclasses must override `compute()`.
 */
class Metric {
  /** Result cache shared by every metric instance. */
  static cache = new HashTable();
  metric;
  a;
  b;
  origA = [];
  origB = [];
  options;
  optKey;
  symmetric;
  results;

  /** Empties the shared result cache. */
  static clear = () => this.cache.clear();

  /** Orders a pair (and its lengths) so that the shorter string comes first. */
  static swap = (a, b, m, n) => (m > n ? [b, a, n, m] : [a, b, m, n]);

  /** Restricts a result to the range [0, 1]. */
  static clamp = (res) => Math.max(0, Math.min(1, res));

  /**
   * @param {string} metric - Name of the metric
   * @param {*} a - First input or array of inputs
   * @param {*} b - Second input or array of inputs
   * @param {Object} [opt={}] - Metric options; `opt.mode` selects the default run mode
   * @param {boolean} [symmetric=false] - Whether the metric yields the same result for (a, b) and (b, a)
   */
  constructor(metric, a, b, opt = {}, symmetric = false) {
    this.metric = metric;
    this.a = Array.isArray(a) ? a : [a];
    this.b = Array.isArray(b) ? b : [b];
    ErrorUtil.assert(
      this.a.length > 0 && this.b.length > 0,
      `Inputs <a> and <b> must not be empty`,
      { a: this.a, b: this.b }
    );
    this.options = opt;
    // Serialize the options with sorted keys so that key order does not
    // influence the hash. A replacer *function* is used because a replacer
    // array acts as an allow-list on every nesting level and would silently
    // drop nested option values from the cache key.
    const canonical = JSON.stringify(opt, (_key, value) =>
      value !== null && typeof value === 'object' && !Array.isArray(value)
        ? Object.fromEntries(
            Object.entries(value).sort(([x], [y]) =>
              x < y ? -1 : x > y ? 1 : 0
            )
          )
        : value
    );
    this.optKey = Hasher.fastFNV1a(canonical).toString();
    this.symmetric = symmetric;
  }

  /**
   * Answers trivial cases without running the metric.
   *
   * @param {string} a - First string
   * @param {string} b - Second string
   * @param {number} m - Length of `a`
   * @param {number} n - Length of `b`
   * @returns {{ res: number }|undefined} 1 for identical strings, 0 when a
   *   string is empty or both are shorter than two characters, otherwise
   *   undefined (the metric has to be computed)
   */
  preCompute(a, b, m, n) {
    if (a === b) return { res: 1 };
    if (m === 0 || n === 0 || (m < 2 && n < 2)) return { res: 0 };
    return undefined;
  }

  /**
   * Computes the metric for one pair of strings; must be overridden.
   *
   * @param {string} a - First string
   * @param {string} b - Second string
   * @param {number} m - Length of `a`
   * @param {number} n - Length of `b`
   * @param {number} maxLen - The larger of `m` and `n`
   * @throws {CmpStrInternalError} Always, in this base implementation
   */
  compute(a, b, m, n, maxLen) {
    throw new CmpStrInternalError(
      `Method compute() must be overridden in a subclass`
    );
  }

  /**
   * Compares `this.a[i]` with `this.b[j]`.
   *
   * The call is executed through `ErrorUtil.wrap()` together with a failure
   * message and the indices as meta data.
   *
   * @param {number} i - Index into `this.a`
   * @param {number} j - Index into `this.b`
   * @returns {Object} `{ metric, a, b, ...result }`; `a`/`b` are the original
   *   inputs set via `setOriginal()` when available
   */
  runSingle(i, j) {
    return ErrorUtil.wrap(
      () => {
        const a = String(this.a[i]);
        const b = String(this.b[j]);
        let A = a;
        let B = b;
        let m = A.length;
        let n = B.length;
        let result = this.preCompute(A, B, m, n);
        if (!result) {
          result = profiler$2.run(() => {
            // Symmetric metrics are normalized so both orders share a cache entry.
            if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n);
            // Build the key in two steps: appending the option hash directly
            // would turn a falsy "no key" value into a truthy string and make
            // unrelated pairs share one cache slot.
            // NOTE(review): cache.key() is treated as possibly returning a
            // falsy value — confirm against HashTable.
            const baseKey = Metric.cache.key(
              this.metric,
              [A, B],
              this.symmetric
            );
            const key = baseKey ? baseKey + this.optKey : '';
            return (
              Metric.cache.get(key) ??
              (() => {
                const res = this.compute(A, B, m, n, Math.max(m, n));
                if (key) Metric.cache.set(key, res);
                return res;
              })()
            );
          });
        }
        return {
          metric: this.metric,
          a: this.origA[i] ?? a,
          b: this.origB[j] ?? b,
          ...result
        };
      },
      `Failed to compute metric for inputs at indices a[${i}] and b[${j}]`,
      { i, j }
    );
  }

  /** Promise-returning variant of `runSingle()`. */
  async runSingleAsync(i, j) {
    return this.runSingle(i, j);
  }

  /** Compares every element of `a` with every element of `b`. */
  runBatch() {
    const results = [];
    for (let i = 0; i < this.a.length; i++)
      for (let j = 0; j < this.b.length; j++)
        results.push(this.runSingle(i, j));
    this.results = results;
  }

  /** Asynchronous variant of `runBatch()`; pairs are awaited one by one. */
  async runBatchAsync() {
    const results = [];
    for (let i = 0; i < this.a.length; i++)
      for (let j = 0; j < this.b.length; j++)
        results.push(await this.runSingleAsync(i, j));
    this.results = results;
  }

  /** Compares `a[i]` with `b[i]` for every index of `a`. */
  runPairwise() {
    const results = [];
    for (let i = 0; i < this.a.length; i++)
      results.push(this.runSingle(i, i));
    this.results = results;
  }

  /** Asynchronous variant of `runPairwise()`; pairs are awaited one by one. */
  async runPairwiseAsync() {
    const results = [];
    for (let i = 0; i < this.a.length; i++)
      results.push(await this.runSingleAsync(i, i));
    this.results = results;
  }

  /**
   * Stores the original inputs that are reported in the results instead of
   * the strings actually compared. Falsy arguments leave the current value
   * untouched.
   *
   * @param {*} [a] - Original first input(s)
   * @param {*} [b] - Original second input(s)
   * @returns {this} The instance, for chaining
   */
  setOriginal(a, b) {
    if (a) this.origA = Array.isArray(a) ? a : [a];
    if (b) this.origB = Array.isArray(b) ? b : [b];
    return this;
  }

  /** True when at least one side holds more than one input. */
  isBatch = () => this.a.length > 1 || this.b.length > 1;

  /** True when both sides hold exactly one input. */
  isSingle = () => !this.isBatch();

  /**
   * Checks whether the inputs can be compared pairwise, i.e. batch input
   * with the same number of elements on both sides.
   *
   * @param {boolean} [safe=false] - Return false instead of throwing
   * @returns {boolean} True when pairwise mode is possible
   * @throws {CmpStrUsageError} When it is not possible and `safe` is false
   */
  isPairwise(safe = false) {
    if (this.isBatch() && this.a.length === this.b.length) return true;
    if (safe) return false;
    throw new CmpStrUsageError(
      `Mode <pairwise> requires arrays of equal length`,
      { a: this.a, b: this.b }
    );
  }

  /** Returns the symmetry flag passed to the constructor. */
  isSymmetrical = () => this.symmetric;

  /** Resolves the mode: explicit argument, then `options.mode`, then 'default'. */
  whichMode = (mode) => mode ?? this.options?.mode ?? 'default';

  /** Discards previously computed results. */
  clear = () => (this.results = undefined);

  /**
   * Runs the metric synchronously and stores the outcome in `this.results`.
   *
   * @param {string} [mode] - 'default', 'batch', 'single' or 'pairwise'
   * @param {boolean} [clear=true] - Discard previous results first
   * @throws {CmpStrInternalError} For an unsupported mode
   * @throws {CmpStrUsageError} In pairwise mode with unsuitable inputs
   */
  run(mode, clear = true) {
    if (clear) this.clear();
    const resolved = this.whichMode(mode);
    switch (resolved) {
      case 'default':
        if (this.isSingle()) {
          this.results = this.runSingle(0, 0);
          break;
        }
      // falls through: non-single input in default mode is run as a batch
      case 'batch':
        this.runBatch();
        break;
      case 'single':
        this.results = this.runSingle(0, 0);
        break;
      case 'pairwise':
        if (this.isPairwise()) this.runPairwise();
        break;
      default:
        // Report the resolved mode; the argument is undefined when the mode
        // was taken from the options.
        throw new CmpStrInternalError(`Unsupported mode <${resolved}>`);
    }
  }

  /**
   * Asynchronous variant of `run()`.
   *
   * @param {string} [mode] - 'default', 'batch', 'single' or 'pairwise'
   * @param {boolean} [clear=true] - Discard previous results first
   * @returns {Promise<void>}
   * @throws {CmpStrInternalError} For an unsupported mode
   * @throws {CmpStrUsageError} In pairwise mode with unsuitable inputs
   */
  async runAsync(mode, clear = true) {
    if (clear) this.clear();
    const resolved = this.whichMode(mode);
    switch (resolved) {
      case 'default':
        if (this.isSingle()) {
          this.results = await this.runSingleAsync(0, 0);
          break;
        }
      // falls through: non-single input in default mode is run as a batch
      case 'batch':
        await this.runBatchAsync();
        break;
      case 'single':
        this.results = await this.runSingleAsync(0, 0);
        break;
      case 'pairwise':
        if (this.isPairwise()) await this.runPairwiseAsync();
        break;
      default:
        throw new CmpStrInternalError(`Unsupported async mode <${resolved}>`);
    }
  }

  /** Returns the metric name passed to the constructor. */
  getMetricName = () => this.metric;

  /**
   * Returns the stored results: a single result object or an array of them,
   * depending on the mode that was run. Asserts via `ErrorUtil.assert()`
   * that results exist.
   *
   * @returns {Object|Object[]} The results of the last run
   */
  getResults() {
    ErrorUtil.assert(
      this.results !== undefined,
      `run() must be called before getResults()`
    );
    return this.results;
  }
}
// Registry created for the 'metric' namespace with Metric as its second argument.
// NOTE(review): presumably used to register and look up Metric subclasses by
// name, with Metric as the required base class — confirm against Registry.
const MetricRegistry = Registry('metric', Metric);
class CosineSimilarity extends Metric {
constructor(a, b, opt = {}) {
super('cosine', a, b, opt, true);
}
_termFreq(str, delimiter) {
const terms = str.split(delimiter);
const freq = Pool.acquire('map', terms.length);
for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1);
return freq;
}
compute(a, b) {
const { delimiter = ' ' } = this.options;
const termsA = this._termFreq(a, delimiter);
const termsB = this._termFreq(b, delimiter);
try {
let dotP = 0,
magA = 0,
magB = 0;
for (const [term, freqA] of termsA) {
const freqB = termsB.get(term) || 0;
dotP += freqA * freqB;
magA += freqA * freqA;
}
for (const freqB of termsB.values()) magB += freqB * freqB;
magA = Math.sqrt(magA);
magB = Math.sqrt(magB);
return {
res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0,
raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB }
};
} finally {
Pool.release('map', termsA, termsA.size)