UNPKG

@astermind/astermind-pro

Version:

Astermind Pro - Premium ML Toolkit with Advanced RAG, Reranking, Summarization, and Information Flow Analysis

80 lines 2.35 kB
// Tokenization and stemming utilities // Extracted from workers for reuse // Memo for speed const STEM_CACHE = new Map(); export function normalizeWord(raw) { const k = raw; const cached = STEM_CACHE.get(k); if (cached) return cached; let w = raw.toLowerCase(); w = w.replace(/^[^a-z0-9]+|[^a-z0-9]+$/g, ''); if (w.length <= 2) { STEM_CACHE.set(k, w); return w; } // plural → singular if (w.endsWith('ies') && w.length > 4) { w = w.slice(0, -3) + 'y'; } else if (/(xes|ches|shes|zes|sses)$/.test(w) && w.length > 4) { w = w.replace(/(xes|ches|shes|zes|sses)$/, (m) => (m === 'sses' ? 'ss' : m.replace(/es$/, ''))); } else if (w.endsWith('s') && !/(ss|us)$/.test(w) && w.length > 3) { w = w.slice(0, -1); } // conservative suffix trimming const rules = [ [/ization$|isation$/, 'ize'], [/ational$/, 'ate'], [/fulness$/, 'ful'], [/ousness$/, 'ous'], [/iveness$/, 'ive'], [/ability$/, 'able'], [/ness$/, ''], [/ment$/, ''], [/ations?$/, 'ate'], [/izer$|iser$/, 'ize'], [/ally$/, 'al'], [/ically$/, 'ic'], [/ingly$|edly$/, ''], [/ing$|ed$/, ''], ]; for (const [re, rep] of rules) { if (re.test(w) && w.length - rep.length >= 4) { w = w.replace(re, rep); break; } } STEM_CACHE.set(k, w); return w; } export function tokenize(text, doStem) { const base = text.toLowerCase() .replace(/[`*_>~]/g, ' ') .replace(/[^a-z0-9]+/g, ' ') .split(/\s+/) .filter(Boolean); if (!doStem) return base; const out = []; for (const t of base) { const n = normalizeWord(t); if (n && n.length > 1) out.push(n); } return out; } export function expandQuery(q) { const adds = []; if (/\bmap\b/.test(q)) adds.push('dict key value make'); if (/\bchan|channel\b/.test(q)) adds.push('goroutine concurrency select buffer'); if (/\berror\b/.test(q)) adds.push('fmt wrap unwrap sentinel try catch'); if (/\bstruct\b/.test(q)) adds.push('field method receiver init zero value'); return q + ' ' + adds.join(' '); } //# sourceMappingURL=tokenization.js.map