UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.

github.com/komed3/cmpstr

197 lines (195 loc) • 5.91 kB

JavaScript

// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License

/**
 * Computes descriptive statistics for a piece of text: word and sentence
 * counts, character and word frequencies, a vowel-group based syllable
 * estimate and several readability scores derived from those numbers.
 *
 * The text is tokenized once in the constructor; syllable statistics are
 * computed lazily on first use and cached on the instance.
 */
class TextAnalyzer {
  /**
   * Shared patterns. Several carry the `g` flag and are therefore stateful
   * when used with `exec`/`test`; inside this class they are only passed to
   * `String.prototype.match` / `replace` / `split`, which do not depend on a
   * previous `lastIndex`.
   */
  static REGEX = {
    number: /\d/,
    sentence: /(?<=[.!?])\s+/,
    word: /\p{L}+/gu,
    nonWord: /[^\p{L}]/gu,
    vowelGroup: /[aeiouy]+/g,
    letter: /\p{L}/gu,
    ucLetter: /\p{Lu}/gu
  };

  // Trimmed input text.
  text;
  // Lower-cased runs of Unicode letters, in order of appearance.
  words = [];
  // Non-empty pieces of the text split after `.`, `!` or `?` plus whitespace.
  sentences = [];
  // Character (code point) -> number of occurrences in `text`.
  charFrequency = new Map();
  // Lower-cased word -> number of occurrences.
  wordHistogram = new Map();
  // Cleaned word -> estimated syllable count.
  syllableCache = new Map();
  // Lazily computed result of `computeSyllableStats()`.
  syllableStats;

  /**
   * @param {string} input - Text to analyze; surrounding whitespace is trimmed.
   */
  constructor(input) {
    this.text = input.trim();
    this.tokenize();
    this.computeFrequencies();
  }

  /**
   * Fills `words` and `sentences` from `text`.
   *
   * `match` with a global pattern always scans from index 0, so a stale
   * `lastIndex` on the shared static regex cannot cause words to be skipped,
   * and calling this method again replaces the word list instead of
   * appending to it.
   */
  tokenize() {
    const lcText = this.text.toLowerCase();
    this.words = lcText.match(TextAnalyzer.REGEX.word) ?? [];
    this.sentences = this.text
      .split(TextAnalyzer.REGEX.sentence)
      .filter(Boolean);
  }

  /** Counts every character of `text` and every entry of `words`. */
  computeFrequencies() {
    // String iteration yields whole code points, so astral characters are
    // counted once rather than once per surrogate half.
    for (const char of this.text) {
      this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
    }
    for (const word of this.words) {
      this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
    }
  }

  /**
   * Estimates the syllables of a word as the number of `[aeiouy]+` groups in
   * its lower-cased, letters-only form. Results are cached per cleaned word.
   *
   * @param {string} word
   * @returns {number} Number of vowel groups, or 1 when there is none.
   */
  estimateSyllables(word) {
    const clean = word
      .normalize('NFC')
      .toLowerCase()
      .replace(TextAnalyzer.REGEX.nonWord, '');
    if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);

    const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
    const count = matches ? matches.length : 1;
    this.syllableCache.set(clean, count);
    return count;
  }

  /**
   * Returns the syllable statistics for all words, computing them on the
   * first call and reusing the stored object afterwards.
   *
   * @returns {{total: number, mono: number, perWord: number[], avg: number, median: number}}
   *   `perWord` holds the per-word estimates sorted ascending; `mono` is the
   *   number of words estimated at exactly one syllable.
   */
  computeSyllableStats() {
    return (this.syllableStats ||= (() => {
      const perWord = this.words
        .map((w) => this.estimateSyllables(w))
        .sort((a, b) => a - b);
      const size = perWord.length;
      const total = perWord.reduce((sum, s) => sum + s, 0);
      const mono = perWord.filter((s) => s === 1).length;

      let median = 0;
      if (size) {
        median =
          size % 2 === 0
            ? (perWord[size / 2 - 1] + perWord[size / 2]) / 2
            : perWord[Math.floor(size / 2)];
      }

      return { total, mono, perWord, avg: size ? total / size : 0, median };
    })());
  }

  /** @returns {number} Length of the trimmed text in UTF-16 code units. */
  getLength = () => this.text.length;

  /** @returns {number} Number of words found. */
  getWordCount = () => this.words.length;

  /** @returns {number} Number of sentences found. */
  getSentenceCount = () => this.sentences.length;

  /** @returns {number} Mean word length, or 0 when there are no words. */
  getAvgWordLength() {
    return this.words.length
      ? this.words.join('').length / this.words.length
      : 0;
  }

  /** @returns {number} Mean words per sentence, or 0 without sentences. */
  getAvgSentenceLength() {
    return this.sentences.length
      ? this.words.length / this.sentences.length
      : 0;
  }

  /** @returns {Object<string, number>} Word -> count as a plain object. */
  getWordHistogram() {
    return Object.fromEntries(this.wordHistogram);
  }

  /**
   * @param {number} [limit=5] - Maximum number of words to return.
   * @returns {string[]} Words ordered by descending frequency.
   */
  getMostCommonWords(limit = 5) {
    return [...this.wordHistogram.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit)
      .map((e) => e[0]);
  }

  /** @returns {string[]} Words that occur exactly once. */
  getHapaxLegomena() {
    return [...this.wordHistogram.entries()]
      .filter(([, c]) => c === 1)
      .map((e) => e[0]);
  }

  /** @returns {boolean} Whether the text contains at least one digit. */
  hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);

  /** @returns {number} Upper-case letters / all letters, or 0 without letters. */
  getUpperCaseRatio() {
    const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
    const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
    return matches.length ? upper / matches.length : 0;
  }

  /** @returns {Object<string, number>} Character -> count as a plain object. */
  getCharFrequency() {
    return Object.fromEntries(this.charFrequency);
  }

  /**
   * Maps each distinct character to its code point, written as upper-case
   * hexadecimal padded to at least four digits.
   *
   * `charFrequency` is keyed by whole code points, so `codePointAt` is used
   * here: `charCodeAt` would report only the high surrogate of characters
   * outside the BMP and merge unrelated characters under one key.
   *
   * @returns {Object<string, number>} Hex code point -> occurrence count.
   */
  getUnicodeCodepoints() {
    const result = {};
    for (const [char, count] of this.charFrequency) {
      const block = char
        .codePointAt(0)
        .toString(16)
        .padStart(4, '0')
        .toUpperCase();
      result[block] = (result[block] || 0) + count;
    }
    return result;
  }

  /**
   * @param {number} [len=7] - Minimum length for a word to count as long.
   * @returns {number} Share of words with `length >= len`, 0 without words.
   */
  getLongWordRatio(len = 7) {
    let long = 0;
    for (const w of this.words) if (w.length >= len) long++;
    return this.words.length ? long / this.words.length : 0;
  }

  /**
   * @param {number} [len=3] - Maximum length for a word to count as short.
   * @returns {number} Share of words with `length <= len`, 0 without words.
   */
  getShortWordRatio(len = 3) {
    let short = 0;
    for (const w of this.words) if (w.length <= len) short++;
    return this.words.length ? short / this.words.length : 0;
  }

  /** @returns {number} Sum of the estimated syllables of all words. */
  getSyllablesCount() {
    return this.computeSyllableStats().total;
  }

  /** @returns {number} Number of words estimated at one syllable. */
  getMonosyllabicWordCount() {
    return this.computeSyllableStats().mono;
  }

  /**
   * @param {number} min
   * @returns {number} Number of words with at least `min` syllables.
   */
  getMinSyllablesWordCount(min) {
    return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
  }

  /**
   * @param {number} max
   * @returns {number} Number of words with at most `max` syllables.
   */
  getMaxSyllablesWordCount(max) {
    return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
  }

  /** @returns {number} Mean syllables per word, 0 without words. */
  getAvgSyllablesPerWord() {
    return this.computeSyllableStats().avg;
  }

  /** @returns {number} Median syllables per word, 0 without words. */
  getMedianSyllablesPerWord() {
    return this.computeSyllableStats().median;
  }

  /**
   * Computes `100 * ln(N) / (1 - V1 / V)` where N is the word count, V the
   * number of distinct words and V1 the number of words occurring once.
   *
   * JavaScript arithmetic never throws, so degenerate inputs have to be
   * detected explicitly: with no words, or when every distinct word occurs
   * exactly once, the formula evaluates to NaN or Infinity.
   *
   * @returns {number} The score, or 0 when it is not a finite number.
   */
  getHonoresR() {
    const vocabulary = this.wordHistogram.size;
    if (vocabulary === 0) return 0;

    const hapaxShare = this.getHapaxLegomena().length / vocabulary;
    const score = (100 * Math.log(this.words.length)) / (1 - hapaxShare);
    return Number.isFinite(score) ? score : 0;
  }

  /**
   * @param {number} [wpm=200] - Words per minute; `null` is treated as 1.
   * @returns {number} Word count divided by `wpm`.
   */
  getReadingTime(wpm = 200) {
    return this.words.length / (wpm ?? 1);
  }

  /**
   * Computes a readability score from the average sentence length (words per
   * sentence) and the average syllables per word. Zero counts are replaced
   * by 1 before dividing.
   *
   * @param {string} [metric='flesch'] - `'flesch'`, `'fleschde'` or `'kincaid'`.
   * @returns {number|undefined} The score, or `undefined` for any other metric.
   */
  getReadabilityScore(metric = 'flesch') {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const y = this.getSyllablesCount() || 1;
    const asl = w / s;
    const asw = y / w;

    switch (metric) {
      case 'flesch':
        return 206.835 - 1.015 * asl - 84.6 * asw;
      case 'fleschde':
        return 180 - asl - 58.5 * asw;
      case 'kincaid':
        return 0.39 * asl + 11.8 * asw - 15.59;
    }
  }

  /**
   * @returns {number} Words per sentence plus the percentage of words with
   *   at least seven characters (the `getLongWordRatio` default).
   */
  getLIXScore() {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const l = this.getLongWordRatio() * w;
    return w / s + (l / w) * 100;
  }

  /**
   * Evaluates four linear formulas over the percentage of words with three
   * or more syllables, the average sentence length, the percentage of long
   * words and the percentage of one-syllable words.
   * NOTE(review): the coefficients look like the four variants of the
   * "Wiener Sachtextformel" - confirm against the project documentation.
   *
   * @returns {number[]} The four scores, in the order of the formulas below.
   */
  getWSTFScore() {
    const w = this.words.length || 1;
    const h = (this.getMinSyllablesWordCount(3) / w) * 100;
    const s = this.getAvgSentenceLength();
    const l = this.getLongWordRatio() * 100;
    const m = (this.getMonosyllabicWordCount() / w) * 100;

    return [
      0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
      0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
      0.2963 * h + 0.1905 * s - 1.1144,
      0.2744 * h + 0.2656 * s - 1.693
    ];
  }
}

export { TextAnalyzer };