UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast and well performing package for calculating string similarity

1,603 lines (1,588 loc) 93.1 kB
/** * CmpStr v3.2.2 build-bb61120-260311 * This is a lightweight, fast and well performing library for calculating string similarity. * (c) 2023-2026 Paul Köhler @komed3 / MIT License * Visit https://github.com/komed3/cmpstr and https://npmjs.org/package/cmpstr */ (function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : typeof define === 'function' && define.amd ? define(['exports'], factory) : ((global = typeof globalThis !== 'undefined' ? globalThis : global || self), factory((global.CmpStr = {}))); })(this, function (exports) { 'use strict'; class CmpStrError extends Error { code; meta; cause; when = new Date().toISOString(); constructor(code, message, meta, cause) { super(message); this.name = this.constructor.name; this.code = code; this.meta = meta; this.cause = cause; if (typeof Error.captureStackTrace === 'function') { Error.captureStackTrace(this, this.constructor); } } toJSON() { return { name: this.name, code: this.code, message: this.message, meta: this.meta, when: this.when, cause: this.cause instanceof Error ? { name: this.cause.name, message: this.cause.message, stack: this.cause.stack } : this.cause }; } toString(stack = false) { const parts = [`${this.name} [${this.code}]`, this.message]; if (this.meta && Object.keys(this.meta).length) { try { parts.push(JSON.stringify(this.meta)); } catch {} } return ( parts.join(' - ') + (stack && this.stack ? `\nStack Trace:\n${this.stack}` : '') ); } } class CmpStrValidationError extends CmpStrError { constructor(message, meta, cause) { super('E_VALIDATION', message, meta, cause); } } class CmpStrNotFoundError extends CmpStrError { constructor(message, meta, cause) { super('E_NOT_FOUND', message, meta, cause); } } class CmpStrUsageError extends CmpStrError { constructor(message, meta, cause) { super('E_USAGE', message, meta, cause); } } class CmpStrInternalError extends CmpStrError { constructor(message, meta, cause) { super('E_INTERNAL', message, meta, cause); } } class ErrorUtil { static assert(condition, message, meta) { if (!condition) throw new CmpStrUsageError(message, meta); } static create(err, message, meta) { if (err instanceof CmpStrError) throw err; throw new CmpStrInternalError(message, meta, err); } static format(err) { if (err instanceof CmpStrError) return err.toString(); if (err instanceof Error) return `${err.name}: ${err.message}`; return String(err); } static wrap(fn, message, meta) { try { return fn(); } catch (err) { throw new CmpStrInternalError(message, meta, err); } } static async wrapAsync(fn, message, meta) { try { return await fn(); } catch (err) { throw new CmpStrInternalError(message, meta, err); } } } var Errors = /*#__PURE__*/ Object.freeze({ __proto__: null, CmpStrError: CmpStrError, CmpStrInternalError: CmpStrInternalError, CmpStrNotFoundError: CmpStrNotFoundError, CmpStrUsageError: CmpStrUsageError, CmpStrValidationError: CmpStrValidationError, ErrorUtil: ErrorUtil }); const BRACKET_PATTERN = /\[(\d+)]/g; const PATH_CACHE = new Map(); function parse(p) { let cached = PATH_CACHE.get(p); if (cached) return cached; const parsed = p .replace(BRACKET_PATTERN, '.$1') .split('.') .map((s) => { const n = Number(s); return Number.isInteger(n) && String(n) === s ? n : s; }); PATH_CACHE.set(p, parsed); return parsed; } function get(t, path, fb) { let o = t; for (const k of parse(path)) { if (o == null || !(k in o)) return fb; o = o[k]; } return o; } function has(t, path) { let o = t; for (const k of parse(path)) { if (o == null || !(k in o)) return false; o = o[k]; } return true; } function set(t, path, value) { if (path === '') return value; const keys = parse(path); if (t !== undefined && (typeof t !== 'object' || t === null)) throw new CmpStrUsageError( `Cannot set property <${keys[0]}> of <${JSON.stringify(t)}>`, { path: keys[0], target: t } ); const root = t ?? (typeof keys[0] === 'number' ? [] : Object.create(null)); let cur = root; for (let i = 0; i < keys.length - 1; i++) { const k = keys[i]; let n = cur[k]; if (n != null && typeof n !== 'object') throw new CmpStrUsageError( `Cannot set property <${keys[i + 1]}> of <${JSON.stringify(n)}>`, { path: keys.slice(0, i + 2), value: n } ); if (n == null) n = cur[k] = typeof keys[i + 1] === 'number' ? [] : Object.create(null); cur = n; } cur[keys[keys.length - 1]] = value; return root; } function merge( t = Object.create(null), o = Object.create(null), mergeUndefined = false ) { const target = t ?? Object.create(null); Object.keys(o).forEach((k) => { const val = o[k]; if (!mergeUndefined && val === undefined) return; if (k === '__proto__' || k === 'constructor') return; if (val !== null && typeof val === 'object' && !Array.isArray(val)) { const existing = target[k]; target[k] = merge( existing !== null && typeof existing === 'object' && !Array.isArray(existing) ? existing : Object.create(null), val, mergeUndefined ); } else target[k] = val; }); return target; } function rmv(t, path, preserveEmpty = false) { const keys = parse(path); const remove = (obj, i = 0) => { const key = keys[i]; if (!obj || typeof obj !== 'object') return false; if (i === keys.length - 1) return delete obj[key]; if (!remove(obj[key], i + 1)) return false; if (!preserveEmpty) { const val = obj[key]; if ( typeof val === 'object' && ((Array.isArray(val) && val.every((v) => v == null)) || (!Array.isArray(val) && Object.keys(val).length === 0)) ) delete obj[key]; } return true; }; remove(t); return t; } var DeepMerge = /*#__PURE__*/ Object.freeze({ __proto__: null, get: get, has: has, merge: merge, rmv: rmv, set: set }); class DiffChecker { a; b; options; entries = []; grouped = []; diffRun = false; constructor(a, b, opt = {}) { ((this.a = a), (this.b = b)); this.options = { ...{ mode: 'word', caseInsensitive: false, contextLines: 1, groupedLines: true, expandLines: false, showChangeMagnitude: true, maxMagnitudeSymbols: 5, lineBreak: '\n' }, ...opt }; this.computeDiff(); } text2lines() { const linesA = this.a.trim().split(/\r?\n/); const linesB = this.b.trim().split(/\r?\n/); return { linesA, linesB, maxLen: Math.max(linesA.length, linesB.length) }; } tokenize(input) { switch (this.options.mode) { case 'line': return [input]; case 'word': return input.split(/\s+/); } } concat(input) { return input.join(this.options.mode === 'word' ? ' ' : ''); } computeDiff() { if (this.diffRun) return; const { linesA, linesB, maxLen } = this.text2lines(); for (let i = 0; i < maxLen; i++) this.lineDiff(linesA[i] || '', linesB[i] || '', i); this.findGroups(); this.diffRun = true; } lineDiff(a, b, line) { const { mode, caseInsensitive } = this.options; const baseLen = Math.max(a.length, b.length); let A = a, B = b; if (caseInsensitive) ((A = a.toLowerCase()), (B = b.toLowerCase())); let diffs = []; let delSize = 0, insSize = 0; switch (mode) { case 'line': if (A !== B) { diffs.push({ posA: 0, posB: 0, del: a, ins: b, size: b.length - a.length }); delSize = a.length; insSize = b.length; } break; case 'word': diffs = this.preciseDiff(a, A, b, B); for (const d of diffs) ((delSize += d.del.length), (insSize += d.ins.length)); break; } if (diffs.length) this.entries.push({ line, diffs, delSize, insSize, baseLen, totalSize: insSize - delSize, magnitude: this.magnitude(delSize, insSize, baseLen) }); } preciseDiff(a, A, b, B) { const posIndex = (t) => t.reduce( (p, _, i) => (p.push(i ? p[i - 1] + t[i - 1].length + 1 : 0), p), [] ); const origA = this.tokenize(a); const origB = this.tokenize(b); const tokenA = this.tokenize(A); const tokenB = this.tokenize(B); const lenA = tokenA.length; const lenB = tokenB.length; const posArrA = posIndex(origA); const posArrB = posIndex(origB); const matches = []; let ai = 0, bi = 0; while (ai < lenA && bi < lenB) { if (tokenA[ai] === tokenB[bi]) { let len = 1; while ( ai + len < lenA && bi + len < lenB && tokenA[ai + len] === tokenB[bi + len] ) len++; matches.push({ ai, bi, len }); ((ai += len), (bi += len)); } else { let found = false; for (let offset = 1; offset <= 3 && !found; offset++) { if (ai + offset < lenA && tokenA[ai + offset] === tokenB[bi]) { matches.push({ ai: ai + offset, bi, len: 1 }); ((ai += offset + 1), (bi += 1), (found = true)); } else if ( bi + offset < lenB && tokenA[ai] === tokenB[bi + offset] ) { matches.push({ ai, bi: bi + offset, len: 1 }); ((ai += 1), (bi += offset + 1), (found = true)); } } if (!found) (ai++, bi++); } } const diffs = []; let i = 0, j = 0; for (const m of matches) { if (i < m.ai || j < m.bi) { const delArr = origA.slice(i, m.ai); const insArr = origB.slice(j, m.bi); diffs.push({ posA: posArrA[i] ?? 0, posB: posArrB[j] ?? 0, del: this.concat(delArr), ins: this.concat(insArr), size: insArr.join('').length - delArr.join('').length }); } ((i = m.ai + m.len), (j = m.bi + m.len)); } if (i < lenA || j < lenB) { const delArr = origA.slice(i); const insArr = origB.slice(j); diffs.push({ posA: posArrA[i] ?? 0, posB: posArrB[j] ?? 0, del: this.concat(delArr), ins: this.concat(insArr), size: insArr.join('').length - delArr.join('').length }); } return diffs.filter((d) => d.del.length > 0 || d.ins.length > 0); } findGroups() { const { contextLines } = this.options; const addGroup = (group, start, end) => { const [delSize, insSize, totalSize, baseLen] = [ 'delSize', 'insSize', 'totalSize', 'baseLen' ].map((k) => group.reduce((sum, e) => sum + e[k], 0)); this.grouped.push({ start, end, delSize, insSize, totalSize, line: group[0].line, entries: group, magnitude: this.magnitude(delSize, insSize, baseLen) }); }; let group = []; let start = 0, end = 0; for (const entry of this.entries) { const s = Math.max(0, entry.line - contextLines); const e = entry.line + contextLines; if (!group.length || s <= end + 1) { if (!group.length) start = s; end = Math.max(end, e); group.push(entry); } else { addGroup(group, start, end); ((group = [entry]), (start = s), (end = e)); } } if (group.length) addGroup(group, start, end); } magnitude(del, ins, baseLen) { const { maxMagnitudeSymbols } = this.options; const total = del + ins; if (total === 0 || baseLen === 0) return ''; const magLen = Math.min( maxMagnitudeSymbols, Math.max(Math.round((total / baseLen) * maxMagnitudeSymbols), 1) ); const plus = Math.round((ins / total) * magLen); const minus = magLen - plus; return '+'.repeat(plus) + '-'.repeat(minus); } output(cli) { const { mode, contextLines, groupedLines, expandLines, showChangeMagnitude, lineBreak } = this.options; const { linesA, linesB, maxLen } = this.text2lines(); const linePad = Math.max(4, maxLen.toString().length); const highlight = (s, ansi) => (cli ? `\x1b[${ansi}m${s}\x1b[0m` : s); const cy = (s) => highlight(s, '36'); const gy = (s) => highlight(s, '90'); const gn = (s) => highlight(s, '32'); const rd = (s) => highlight(s, '31'); const ye = (s) => highlight(s, '33'); const del = (s) => (cli ? `\x1b[37;41m${s}\x1b[31;49m` : `-[${s}]`); const ins = (s) => (cli ? `\x1b[37;42m${s}\x1b[32;49m` : `+[${s}]`); const block = (start, end, forced, headerEntry) => { if (headerEntry) header(headerEntry); for (let i = start; i <= end; i++) line(i, forced ?? i); out.push(''); }; const header = (e) => { out.push( `${' '.repeat(linePad)} ${cy(`@@ -${e.line + 1},${e.delSize} +${e.line + 1},${e.insSize} @@`)} ${showChangeMagnitude ? ye(e.magnitude) : ''}` ); }; const line = (i, forced) => { if (linesA[i] || linesB[i]) { const entry = this.entries.find((e) => e.line === i); const lineNo = (i + 1).toString().padStart(linePad, ' '); if (entry && forced === i) { out.push( `${lineNo} ${rd(`- ${mark(linesA[i], entry.diffs, 'del')}`)}` ); out.push( `${' '.repeat(linePad)} ${gn(`+ ${mark(linesB[i], entry.diffs, 'ins')}`)}` ); } else { out.push(`${lineNo} ${gy(linesA[i])}`); } } }; const mark = (line, diffs, type) => { if (!diffs.length || mode === 'line') return line; let res = '', idx = 0; for (const d of diffs) { const pos = type === 'del' ? d.posA : d.posB; const val = type === 'del' ? d.del : d.ins; if (!val) continue; if (pos > idx) res += line.slice(idx, pos); res += type === 'del' ? del(val) : ins(val); idx = pos + val.length; } return res + line.slice(idx); }; let out = ['']; switch (true) { case expandLines: block(0, maxLen); break; case groupedLines: for (const group of this.grouped) block(group.start, group.end, undefined, group); break; default: for (const entry of this.entries) block( entry.line - contextLines, entry.line + contextLines, entry.line, entry ); break; } return out.join(lineBreak); } getStructuredDiff = () => this.entries; getGroupedDiff = () => this.grouped; getASCIIDiff = () => this.output(false); getCLIDiff = () => this.output(true); } class Filter { static filters = new Map(); static pipeline = new Map(); static getPipeline(hook) { return ErrorUtil.wrap( () => { const cached = Filter.pipeline.get(hook); if (cached) return cached; const filter = Filter.filters.get(hook); if (!filter) return (s) => s; const pipeline = Array.from(filter.values()) .filter((f) => f.active) .sort((a, b) => a.priority - b.priority) .map((f) => f.fn); const fn = (input) => pipeline.reduce((v, f) => f(v), input); Filter.pipeline.set(hook, fn); return fn; }, `Error compiling filter pipeline for hook <${hook}>`, { hook } ); } static has(hook, id) { return !!Filter.filters.get(hook)?.has(id); } static add(hook, id, fn, opt = {}) { return ErrorUtil.wrap( () => { const { priority = 10, active = true, overrideable = true } = opt; const filter = Filter.filters.get(hook) ?? new Map(); const index = filter.get(id); if (index && !index.overrideable) return false; filter.set(id, { id, fn, priority, active, overrideable }); Filter.filters.set(hook, filter); Filter.pipeline.delete(hook); return true; }, `Error adding filter <${id}> to hook <${hook}>`, { hook, id, opt } ); } static remove(hook, id) { Filter.pipeline.delete(hook); const filter = Filter.filters.get(hook); return filter ? filter.delete(id) : false; } static pause(hook, id) { Filter.pipeline.delete(hook); const f = Filter.filters.get(hook)?.get(id); return !!(f && ((f.active = false), true)); } static resume(hook, id) { Filter.pipeline.delete(hook); const f = Filter.filters.get(hook)?.get(id); return !!(f && ((f.active = true), true)); } static list(hook, active = false) { const filter = Filter.filters.get(hook); if (!filter) return []; const out = []; for (const f of filter.values()) if (!active || f.active) out.push(f.id); return out; } static apply(hook, input) { return ErrorUtil.wrap( () => { const fn = Filter.getPipeline(hook); return Array.isArray(input) ? input.map(fn) : fn(input); }, `Error applying filters for hook <${hook}>`, { hook, input } ); } static async applyAsync(hook, input) { return ErrorUtil.wrapAsync( async () => { const fn = Filter.getPipeline(hook); return Array.isArray(input) ? Promise.all(input.map(fn)) : Promise.resolve(fn(input)); }, `Error applying filters for hook <${hook}>`, { hook, input } ); } static clear(hook) { Filter.pipeline.clear(); if (hook) Filter.filters.delete(hook); else Filter.filters.clear(); } static clearPipeline() { Filter.pipeline.clear(); } } class Hasher { static FNV_PRIME = 0x01000193; static HASH_OFFSET = 0x811c9dc5; static fastFNV1a(str) { const len = str.length; let hash = this.HASH_OFFSET; const chunks = Math.floor(len / 4); for (let i = 0; i < chunks; i++) { const pos = i * 4; const chunk = str.charCodeAt(pos) | (str.charCodeAt(pos + 1) << 8) | (str.charCodeAt(pos + 2) << 16) | (str.charCodeAt(pos + 3) << 24); hash ^= chunk; hash = Math.imul(hash, this.FNV_PRIME); } const remaining = len % 4; if (remaining > 0) { const pos = chunks * 4; for (let i = 0; i < remaining; i++) { hash ^= str.charCodeAt(pos + i); hash = Math.imul(hash, this.FNV_PRIME); } } hash ^= hash >>> 16; hash *= 0x85ebca6b; hash ^= hash >>> 13; hash *= 0xc2b2ae35; hash ^= hash >>> 16; return hash >>> 0; } } class HashTable { LRU; static MAX_LEN = 2048; static TABLE_SIZE = 10_000; table = new Map(); constructor(LRU = true) { this.LRU = LRU; } key(label, strs, sorted = false) { for (const str of strs) if (str.length > HashTable.MAX_LEN) return false; const hashes = strs.map((s) => Hasher.fastFNV1a(s)); return [label, ...(sorted ? hashes.sort() : hashes)].join('-'); } has = (key) => this.table.has(key); get = (key) => this.table.get(key); set(key, entry, update = true) { if (!update && this.table.has(key)) return false; while (!this.table.has(key) && this.table.size >= HashTable.TABLE_SIZE) { if (!this.LRU) return false; this.table.delete(this.table.keys().next().value); } this.table.set(key, entry); return true; } delete = (key) => this.table.delete(key); clear = () => this.table.clear(); size = () => this.table.size; } class Normalizer { static pipeline = new Map(); static cache = new HashTable(); static REGEX = { whitespace: /\s+/g, doubleChars: /(.)\1+/g, specialChars: /[^\p{L}\p{N}\s]/gu, nonLetters: /[^\p{L}]/gu, nonNumbers: /\p{N}/gu }; static canonicalFlags(flags) { return Array.from(new Set(flags)).sort().join(''); } static getPipeline(flags) { return ErrorUtil.wrap( () => { if (Normalizer.pipeline.has(flags)) return Normalizer.pipeline.get(flags); const { REGEX } = Normalizer; const steps = [ ['d', (s) => s.normalize('NFD')], ['i', (s) => s.toLowerCase()], ['k', (s) => s.replace(REGEX.nonLetters, '')], ['n', (s) => s.replace(REGEX.nonNumbers, '')], ['r', (s) => s.replace(REGEX.doubleChars, '$1')], ['s', (s) => s.replace(REGEX.specialChars, '')], ['t', (s) => s.trim()], ['u', (s) => s.normalize('NFC')], ['w', (s) => s.replace(REGEX.whitespace, ' ')], ['x', (s) => s.normalize('NFKC')] ]; const pipeline = steps .filter(([f]) => flags.includes(f)) .map(([, fn]) => fn); const fn = (s) => pipeline.reduce((v, f) => f(v), s); Normalizer.pipeline.set(flags, fn); return fn; }, `Failed to create normalization pipeline for flags: ${flags}`, { flags } ); } static normalize(input, flags) { return ErrorUtil.wrap( () => { if (!flags || typeof flags !== 'string' || !input) return input; flags = this.canonicalFlags(flags); if (Array.isArray(input)) return input.map((s) => Normalizer.normalize(s, flags)); const key = Normalizer.cache.key(flags, [input]); if (key && Normalizer.cache.has(key)) return Normalizer.cache.get(key); const res = Normalizer.getPipeline(flags)(input); if (key) Normalizer.cache.set(key, res); return res; }, `Failed to normalize input with flags: ${flags}`, { input, flags } ); } static async normalizeAsync(input, flags) { return await ErrorUtil.wrapAsync( async () => { if (!flags || typeof flags !== 'string' || !input) return input; return await (Array.isArray(input) ? Promise.all(input.map((s) => Normalizer.normalize(s, flags))) : Promise.resolve(Normalizer.normalize(input, flags))); }, `Failed to asynchronously normalize input with flags: ${flags}`, { input, flags } ); } static clear() { Normalizer.pipeline.clear(); Normalizer.cache.clear(); } } class Profiler { active; static ENV; static instance; nowFn; memFn; store = new Set(); totalTime = 0; totalMem = 0; static detectEnv() { if (typeof process !== 'undefined') Profiler.ENV = 'nodejs'; else if (typeof performance !== 'undefined') Profiler.ENV = 'browser'; else Profiler.ENV = 'unknown'; } static getInstance(enable) { if (!Profiler.ENV) Profiler.detectEnv(); return (Profiler.instance ||= new Profiler(enable)); } constructor(active = false) { this.active = active; switch (Profiler.ENV) { case 'nodejs': this.nowFn = () => Number(process.hrtime.bigint()) / 1e6; this.memFn = () => process.memoryUsage().heapUsed; break; case 'browser': this.nowFn = () => performance.now(); this.memFn = () => performance.memory?.usedJSHeapSize ?? 0; break; default: this.nowFn = () => Date.now(); this.memFn = () => 0; break; } } now = () => this.nowFn(); mem = () => this.memFn(); profile(fn, meta) { const startTime = this.now(), startMem = this.mem(); const res = fn(); const deltaTime = this.now() - startTime, deltaMem = this.mem() - startMem; this.store.add({ time: deltaTime, mem: deltaMem, res, meta }); ((this.totalTime += deltaTime), (this.totalMem += deltaMem)); return res; } enable = () => { this.active = true; }; disable = () => { this.active = false; }; clear() { this.store.clear(); this.totalTime = 0; this.totalMem = 0; } run(fn, meta = {}) { return this.active ? this.profile(fn, meta) : fn(); } async runAsync(fn, meta = {}) { return this.active ? this.profile(async () => await fn(), meta) : await fn(); } getAll = () => [...this.store]; getLast = () => this.getAll().pop(); getTotal = () => ({ time: this.totalTime, mem: this.totalMem }); services = Object.freeze({ enable: this.enable.bind(this), disable: this.disable.bind(this), clear: this.clear.bind(this), report: this.getAll.bind(this), last: this.getLast.bind(this), total: this.getTotal.bind(this) }); } const registry = Object.create(null); const factory = Object.create(null); function Registry(reg, ctor) { ErrorUtil.assert( !(reg in registry || reg in factory), `Registry <${reg}> already exists / overwriting is forbidden`, { registry: reg } ); const classes = Object.create(null); const service = Object.freeze({ add(name, cls, update = false) { ErrorUtil.assert( typeof name === 'string' && name.length > 0, `Class name must be a non-empty string`, { registry: reg, name } ); ErrorUtil.assert( typeof cls === 'function', `Class must be a constructor function`, { registry: reg, class: cls } ); ErrorUtil.assert( cls.prototype instanceof ctor, `Class must extend <${reg}>`, { registry: reg, class: cls } ); ErrorUtil.assert( update || !(name in classes), `Class <${name}> already exists / use <update=true> to overwrite`, { registry: reg, name } ); classes[name] = cls; }, remove(name) { delete classes[name]; }, has(name) { return name in classes; }, list() { return Object.keys(classes); }, get(name) { ErrorUtil.assert( typeof name === 'string' && name.length > 0, `Class name must be a non-empty string`, { registry: reg, name } ); ErrorUtil.assert( name in classes, `Class <${name}> not registered for <${reg}>`, { registry: reg, name } ); return classes[name]; } }); registry[reg] = service; factory[reg] = (cls, ...args) => createFromRegistry(reg, cls, ...args); return service; } function resolveCls(reg, cls) { if (!(reg in registry)) throw new CmpStrNotFoundError(`Registry <${reg}> does not exist`, { registry: reg }); return typeof cls === 'string' ? registry[reg]?.get(cls) : cls; } function createFromRegistry(reg, cls, ...args) { cls = resolveCls(reg, cls); return ErrorUtil.wrap( () => new cls(...args), `Failed to create instance of class <${cls.name ?? cls}> from registry <${reg}>`, { registry: reg, class: cls, args } ); } class RingPool { maxSize; buffers = []; pointer = 0; constructor(maxSize) { this.maxSize = maxSize; } acquire(minSize, allowOversize) { return ErrorUtil.wrap( () => { const len = this.buffers.length; for (let i = 0; i < len; i++) { const idx = (this.pointer + i) & (len - 1); const item = this.buffers[idx]; if ( item.size >= minSize && (allowOversize || item.size === minSize) ) { this.pointer = (idx + 1) & (len - 1); return item; } } return null; }, `Failed to acquire buffer of size >= ${minSize} from pool`, { minSize, allowOversize } ); } release(item) { ErrorUtil.wrap( () => { if (this.buffers.length < this.maxSize) return void [this.buffers.push(item)]; this.buffers[this.pointer] = item; this.pointer = (this.pointer + 1) % this.maxSize; }, `Failed to release buffer back to pool`, { item } ); } clear() { this.buffers = []; this.pointer = 0; } } class Pool { static CONFIG = { int32: { type: 'int32', maxSize: 64, maxItemSize: 2048, allowOversize: true }, 'number[]': { type: 'number[]', maxSize: 16, maxItemSize: 1024, allowOversize: false }, 'string[]': { type: 'string[]', maxSize: 2, maxItemSize: 1024, allowOversize: false }, set: { type: 'set', maxSize: 8, maxItemSize: 0, allowOversize: false }, map: { type: 'map', maxSize: 8, maxItemSize: 0, allowOversize: false } }; static POOLS = { int32: new RingPool(64), 'number[]': new RingPool(16), 'string[]': new RingPool(2), set: new RingPool(8), map: new RingPool(8) }; static allocate(type, size) { switch (type) { case 'int32': return new Int32Array(size); case 'number[]': return new Float64Array(size); case 'string[]': return new Array(size); case 'set': return new Set(); case 'map': return new Map(); } } static acquire(type, size) { const CONFIG = this.CONFIG[type]; if (!CONFIG) throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type }); if (size > CONFIG.maxItemSize) return this.allocate(type, size); const item = this.POOLS[type].acquire(size, CONFIG.allowOversize); if (item) return type === 'int32' ? item.buffer.subarray(0, size) : item.buffer; return this.allocate(type, size); } static acquireMany(type, sizes) { return sizes.map((size) => this.acquire(type, size)); } static release(type, buffer, size) { const CONFIG = this.CONFIG[type]; if (!CONFIG) throw new CmpStrUsageError(`Unsupported pool type <${type}>`, { type }); if (size <= CONFIG.maxItemSize) this.POOLS[type].release({ buffer, size }); } } class StructuredData { data; key; static create(data, key) { return new StructuredData(data, key); } constructor(data, key) { this.data = data; this.key = key; } extractFrom(arr, key) { const result = Pool.acquire('string[]', arr.length); for (let i = 0; i < arr.length; i++) { const val = arr[i][key]; result[i] = typeof val === 'string' ? val : String(val ?? ''); } return result; } extract = () => this.extractFrom(this.data, this.key); isMetricResult(v) { return ( typeof v === 'object' && v !== null && 'a' in v && 'b' in v && 'res' in v ); } isCmpStrResult(v) { return ( typeof v === 'object' && v !== null && 'source' in v && 'target' in v && 'match' in v ); } normalizeResults(results) { if (!Array.isArray(results) || results.length === 0) return []; const first = results[0]; let normalized = []; if (this.isMetricResult(first)) normalized = results; else if (this.isCmpStrResult(first)) normalized = results.map((r) => ({ metric: 'unknown', a: r.source, b: r.target, res: r.match, raw: r.raw })); else throw new CmpStrValidationError( 'Unsupported result format for StructuredData normalization.' ); return normalized.map((r, idx) => ({ ...r, __idx: idx })); } rebuild(results, sourceData, extractedStrings, removeZero, objectsOnly) { const stringToIndices = new Map(); for (let i = 0; i < extractedStrings.length; i++) { const str = extractedStrings[i]; if (!stringToIndices.has(str)) stringToIndices.set(str, []); stringToIndices.get(str).push(i); } const output = new Array(results.length); const occurrenceCount = new Map(); let out = 0; for (let i = 0; i < results.length; i++) { const result = results[i]; if (removeZero && result.res === 0) continue; const targetStr = result.b || ''; const indices = stringToIndices.get(targetStr); let dataIndex; if (indices && indices.length > 0) { const occurrence = occurrenceCount.get(targetStr) ?? 0; occurrenceCount.set(targetStr, occurrence + 1); dataIndex = indices[occurrence % indices.length]; } else { dataIndex = result.__idx ?? i; } if (dataIndex < 0 || dataIndex >= sourceData.length) continue; const sourceObj = sourceData[dataIndex]; const mappedTarget = extractedStrings[dataIndex] || targetStr; if (objectsOnly) output[out++] = sourceObj; else output[out++] = { obj: sourceObj, key: this.key, result: { source: result.a, target: mappedTarget, match: result.res }, ...(result.raw ? { raw: result.raw } : null) }; } output.length = out; return output; } sort(results, sort) { if (!sort || results.length <= 1) return results; const asc = sort === 'asc'; return results.sort((a, b) => (asc ? a.res - b.res : b.res - a.res)); } finalizeLookup(results, extractedStrings, opt) { return this.rebuild( this.sort(this.normalizeResults(results), opt?.sort), this.data, extractedStrings, opt?.removeZero, opt?.objectsOnly ); } performLookup(fn, extractedStrings, opt) { return ErrorUtil.wrap( () => this.finalizeLookup(fn(), extractedStrings, opt), 'StructuredData lookup failed', { key: this.key } ); } async performLookupAsync(fn, extractedStrings, opt) { return await ErrorUtil.wrapAsync( async () => this.finalizeLookup(await fn(), extractedStrings, opt), 'StructuredData async lookup failed', { key: this.key } ); } lookup(fn, query, opt) { const b = this.extract(); try { return this.performLookup(() => fn(query, b, opt), b, opt); } finally { Pool.release('string[]', b, b.length); } } async lookupAsync(fn, query, opt) { const b = this.extract(); try { return await this.performLookupAsync(() => fn(query, b, opt), b, opt); } finally { Pool.release('string[]', b, b.length); } } lookupPairs(fn, other, otherKey, opt) { const a = this.extract(); const b = this.extractFrom(other, otherKey); try { return this.performLookup(() => fn(a, b, opt), a, opt); } finally { Pool.release('string[]', a, a.length); Pool.release('string[]', b, b.length); } } async lookupPairsAsync(fn, other, otherKey, opt) { const a = this.extract(); const b = this.extractFrom(other, otherKey); try { return await this.performLookupAsync(() => fn(a, b, opt), a, opt); } finally { Pool.release('string[]', a, a.length); Pool.release('string[]', b, b.length); } } } class TextAnalyzer { static REGEX = { number: /\d/, sentence: /(?<=[.!?])\s+/, word: /\p{L}+/gu, nonWord: /[^\p{L}]/gu, vowelGroup: /[aeiouy]+/g, letter: /\p{L}/gu, ucLetter: /\p{Lu}/gu }; text; words = []; sentences = []; charFrequency = new Map(); wordHistogram = new Map(); syllableCache = new Map(); syllableStats; constructor(input) { this.text = input.trim(); this.tokenize(); this.computeFrequencies(); } tokenize() { let match; const lcText = this.text.toLowerCase(); while ((match = TextAnalyzer.REGEX.word.exec(lcText)) !== null) this.words.push(match[0]); this.sentences = this.text .split(TextAnalyzer.REGEX.sentence) .filter(Boolean); } computeFrequencies() { for (const char of this.text) this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1); for (const word of this.words) this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1); } estimateSyllables(word) { const clean = word .normalize('NFC') .toLowerCase() .replace(TextAnalyzer.REGEX.nonWord, ''); if (this.syllableCache.has(clean)) return this.syllableCache.get(clean); const matches = clean.match(TextAnalyzer.REGEX.vowelGroup); const count = matches ? matches.length : 1; this.syllableCache.set(clean, count); return count; } computeSyllableStats() { return (this.syllableStats ||= (() => { const perWord = this.words .map((w) => this.estimateSyllables(w)) .sort((a, b) => a - b); const total = perWord.reduce((sum, s) => sum + s, 0); const mono = perWord.filter((s) => s === 1).length; const median = !perWord.length ? 0 : perWord.length % 2 === 0 ? (perWord[perWord.length / 2 - 1] + perWord[perWord.length / 2]) / 2 : perWord[Math.floor(perWord.length / 2)]; return { total, mono, perWord, avg: perWord.length ? total / perWord.length : 0, median }; })()); } getLength = () => this.text.length; getWordCount = () => this.words.length; getSentenceCount = () => this.sentences.length; getAvgWordLength() { return this.words.length ? this.words.join('').length / this.words.length : 0; } getAvgSentenceLength() { return this.sentences.length ? this.words.length / this.sentences.length : 0; } getWordHistogram() { return Object.fromEntries(this.wordHistogram); } getMostCommonWords(limit = 5) { return [...this.wordHistogram.entries()] .sort((a, b) => b[1] - a[1]) .slice(0, limit) .map((e) => e[0]); } getHapaxLegomena() { return [...this.wordHistogram.entries()] .filter(([, c]) => c === 1) .map((e) => e[0]); } hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text); getUpperCaseRatio() { const matches = this.text.match(TextAnalyzer.REGEX.letter) || []; const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0; return matches.length ? upper / matches.length : 0; } getCharFrequency() { return Object.fromEntries(this.charFrequency); } getUnicodeCodepoints() { const result = {}; for (const [char, count] of this.charFrequency) { const block = char .charCodeAt(0) .toString(16) .padStart(4, '0') .toUpperCase(); result[block] = (result[block] || 0) + count; } return result; } getLongWordRatio(len = 7) { let long = 0; for (const w of this.words) if (w.length >= len) long++; return this.words.length ? long / this.words.length : 0; } getShortWordRatio(len = 3) { let short = 0; for (const w of this.words) if (w.length <= len) short++; return this.words.length ? short / this.words.length : 0; } getSyllablesCount() { return this.computeSyllableStats().total; } getMonosyllabicWordCount() { return this.computeSyllableStats().mono; } getMinSyllablesWordCount(min) { return this.computeSyllableStats().perWord.filter((w) => w >= min).length; } getMaxSyllablesWordCount(max) { return this.computeSyllableStats().perWord.filter((w) => w <= max).length; } getAvgSyllablesPerWord() { return this.computeSyllableStats().avg; } getMedianSyllablesPerWord() { return this.computeSyllableStats().median; } getHonoresR() { try { return ( (100 * Math.log(this.words.length)) / (1 - this.getHapaxLegomena().length / (this.wordHistogram.size ?? 1)) ); } catch { return 0; } } getReadingTime(wpm = 200) { return this.words.length / (wpm ?? 1); } getReadabilityScore(metric = 'flesch') { const w = this.words.length || 1; const s = this.sentences.length || 1; const y = this.getSyllablesCount() || 1; const asl = w / s; const asw = y / w; switch (metric) { case 'flesch': return 206.835 - 1.015 * asl - 84.6 * asw; case 'fleschde': return 180 - asl - 58.5 * asw; case 'kincaid': return 0.39 * asl + 11.8 * asw - 15.59; } } getLIXScore() { const w = this.words.length || 1; const s = this.sentences.length || 1; const l = this.getLongWordRatio() * w; return w / s + (l / w) * 100; } getWSTFScore() { const w = this.words.length || 1; const h = (this.getMinSyllablesWordCount(3) / w) * 100; const s = this.getAvgSentenceLength(); const l = this.getLongWordRatio() * 100; const m = (this.getMonosyllabicWordCount() / w) * 100; return [ 0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875, 0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779, 0.2963 * h + 0.1905 * s - 1.1144, 0.2744 * h + 0.2656 * s - 1.693 ]; } } const profiler$2 = Profiler.getInstance(); class Metric { static cache = new HashTable(); metric; a; b; origA = []; origB = []; options; optKey; symmetric; results; static clear = () => this.cache.clear(); static swap = (a, b, m, n) => (m > n ? [b, a, n, m] : [a, b, m, n]); static clamp = (res) => Math.max(0, Math.min(1, res)); constructor(metric, a, b, opt = {}, symmetric = false) { this.metric = metric; this.a = Array.isArray(a) ? a : [a]; this.b = Array.isArray(b) ? b : [b]; ErrorUtil.assert( this.a.length > 0 && this.b.length > 0, `Inputs <a> and <b> must not be empty`, { a: this.a, b: this.b } ); this.options = opt; this.optKey = Hasher.fastFNV1a( JSON.stringify(opt, Object.keys(opt).sort()) ).toString(); this.symmetric = symmetric; } preCompute(a, b, m, n) { if (a === b) return { res: 1 }; if (m == 0 || n == 0 || (m < 2 && n < 2)) return { res: 0 }; return undefined; } compute(a, b, m, n, maxLen) { throw new CmpStrInternalError( `Method compute() must be overridden in a subclass` ); } runSingle(i, j) { return ErrorUtil.wrap( () => { let a = String(this.a[i]), A = a; let b = String(this.b[j]), B = b; let m = A.length, n = B.length; let result = this.preCompute(A, B, m, n); if (!result) { result = profiler$2.run(() => { if (this.symmetric) [A, B, m, n] = Metric.swap(A, B, m, n); const key = Metric.cache.key(this.metric, [A, B], this.symmetric) + this.optKey; return ( Metric.cache.get(key || '') ?? (() => { const res = this.compute(A, B, m, n, Math.max(m, n)); if (key) Metric.cache.set(key, res); return res; })() ); }); } return { metric: this.metric, a: this.origA[i] ?? a, b: this.origB[j] ?? b, ...result }; }, `Failed to compute metric for inputs at indices a[${i}] and b[${j}]`, { i, j } ); } async runSingleAsync(i, j) { return Promise.resolve(this.runSingle(i, j)); } runBatch() { const results = []; for (let i = 0; i < this.a.length; i++) for (let j = 0; j < this.b.length; j++) results.push(this.runSingle(i, j)); this.results = results; } async runBatchAsync() { const results = []; for (let i = 0; i < this.a.length; i++) for (let j = 0; j < this.b.length; j++) results.push(await this.runSingleAsync(i, j)); this.results = results; } runPairwise() { const results = []; for (let i = 0; i < this.a.length; i++) results.push(this.runSingle(i, i)); this.results = results; } async runPairwiseAsync() { const results = []; for (let i = 0; i < this.a.length; i++) results.push(await this.runSingleAsync(i, i)); this.results = results; } setOriginal(a, b) { if (a) this.origA = Array.isArray(a) ? a : [a]; if (b) this.origB = Array.isArray(b) ? b : [b]; return this; } isBatch = () => this.a.length > 1 || this.b.length > 1; isSingle = () => !this.isBatch(); isPairwise(safe = false) { return this.isBatch() && this.a.length === this.b.length ? true : !safe && (() => { throw new CmpStrUsageError( `Mode <pairwise> requires arrays of equal length`, { a: this.a, b: this.b } ); })(); } isSymmetrical = () => this.symmetric; whichMode = (mode) => mode ?? this.options?.mode ?? 'default'; clear = () => (this.results = undefined); run(mode, clear = true) { if (clear) this.clear(); switch (this.whichMode(mode)) { case 'default': if (this.isSingle()) { this.results = this.runSingle(0, 0); break; } case 'batch': this.runBatch(); break; case 'single': this.results = this.runSingle(0, 0); break; case 'pairwise': if (this.isPairwise()) this.runPairwise(); break; default: throw new CmpStrInternalError(`Unsupported mode <${mode}>`); } } async runAsync(mode, clear = true) { if (clear) this.clear(); switch (this.whichMode(mode)) { case 'default': if (this.isSingle()) { this.results = await this.runSingleAsync(0, 0); break; } case 'batch': await this.runBatchAsync(); break; case 'single': this.results = await this.runSingleAsync(0, 0); break; case 'pairwise': if (this.isPairwise()) await this.runPairwiseAsync(); break; default: throw new CmpStrInternalError(`Unsupported async mode <${mode}>`); } } getMetricName = () => this.metric; getResults() { ErrorUtil.assert( this.results !== undefined, `run() must be called before getResults()` ); return this.results; } } const MetricRegistry = Registry('metric', Metric); class CosineSimilarity extends Metric { constructor(a, b, opt = {}) { super('cosine', a, b, opt, true); } _termFreq(str, delimiter) { const terms = str.split(delimiter); const freq = Pool.acquire('map', terms.length); for (const term of terms) freq.set(term, (freq.get(term) || 0) + 1); return freq; } compute(a, b) { const { delimiter = ' ' } = this.options; const termsA = this._termFreq(a, delimiter); const termsB = this._termFreq(b, delimiter); try { let dotP = 0, magA = 0, magB = 0; for (const [term, freqA] of termsA) { const freqB = termsB.get(term) || 0; dotP += freqA * freqB; magA += freqA * freqA; } for (const freqB of termsB.values()) magB += freqB * freqB; magA = Math.sqrt(magA); magB = Math.sqrt(magB); return { res: magA && magB ? Metric.clamp(dotP / (magA * magB)) : 0, raw: { dotProduct: dotP, magnitudeA: magA, magnitudeB: magB } }; } finally { Pool.release('map', termsA, termsA.size)