cmpstr
Version:
CmpStr is a lightweight, fast, and well-performing package for calculating string similarity.
197 lines (195 loc) • 5.91 kB
JavaScript
// CmpStr v3.2.2 build-bb61120-260311 by Paul Köhler @komed3 / MIT License
/**
 * Computes simple statistics for a piece of text: word and sentence counts,
 * character and word frequencies, heuristic syllable counts and several
 * readability scores derived from them.
 *
 * The text is tokenized once in the constructor; all getters work on the
 * cached token lists and frequency maps.
 */
class TextAnalyzer {
  /** Regular expressions shared by every instance. */
  static REGEX = {
    number: /\d/,
    sentence: /(?<=[.!?])\s+/,
    word: /\p{L}+/gu,
    nonWord: /[^\p{L}]/gu,
    vowelGroup: /[aeiouy]+/g,
    letter: /\p{L}/gu,
    ucLetter: /\p{Lu}/gu
  };

  /** Trimmed input text. */
  text;
  /** Lower-cased words (runs of Unicode letters) in order of appearance. */
  words = [];
  /** Text split at whitespace that follows '.', '!' or '?'. */
  sentences = [];
  /** Character (code point) -> number of occurrences in `text`. */
  charFrequency = new Map();
  /** Lower-cased word -> number of occurrences. */
  wordHistogram = new Map();
  /** Cleaned word -> estimated syllable count. */
  syllableCache = new Map();
  /** Lazily computed result of `computeSyllableStats()`. */
  syllableStats;

  /**
   * @param {string} input - Text to analyze; leading/trailing whitespace is removed.
   */
  constructor(input) {
    this.text = input.trim();
    this.tokenize();
    this.computeFrequencies();
  }

  /**
   * Fills `words` and `sentences` from `text`.
   *
   * `String.prototype.match` is used instead of a manual `exec` loop because
   * it resets `lastIndex` of the shared global regex before matching, so a
   * stale `lastIndex` can never make the tokenizer skip the start of the text.
   */
  tokenize() {
    const lcText = this.text.toLowerCase();
    this.words = lcText.match(TextAnalyzer.REGEX.word) ?? [];
    this.sentences = this.text
      .split(TextAnalyzer.REGEX.sentence)
      .filter(Boolean);
  }

  /** Populates `charFrequency` and `wordHistogram`. */
  computeFrequencies() {
    // String iteration yields whole code points, not UTF-16 code units.
    for (const char of this.text)
      this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
    for (const word of this.words)
      this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
  }

  /**
   * Estimates the syllable count of a word as the number of vowel groups
   * (`[aeiouy]+`); a word without any such group counts as one syllable.
   * Results are cached per cleaned word.
   *
   * @param {string} word
   * @returns {number}
   */
  estimateSyllables(word) {
    const clean = word
      .normalize('NFC')
      .toLowerCase()
      .replace(TextAnalyzer.REGEX.nonWord, '');
    if (this.syllableCache.has(clean)) return this.syllableCache.get(clean);
    const matches = clean.match(TextAnalyzer.REGEX.vowelGroup);
    const count = matches ? matches.length : 1;
    this.syllableCache.set(clean, count);
    return count;
  }

  /**
   * Computes (once) and returns the syllable statistics for all words.
   *
   * @returns {{total: number, mono: number, perWord: number[], avg: number, median: number}}
   *   `perWord` is sorted ascending; `mono` is the number of one-syllable words.
   */
  computeSyllableStats() {
    if (this.syllableStats) return this.syllableStats;

    const perWord = this.words
      .map((w) => this.estimateSyllables(w))
      .sort((a, b) => a - b);
    const size = perWord.length;
    const total = perWord.reduce((sum, s) => sum + s, 0);
    const mono = perWord.filter((s) => s === 1).length;

    let median = 0;
    if (size) {
      const mid = Math.floor(size / 2);
      median =
        size % 2 === 0 ? (perWord[mid - 1] + perWord[mid]) / 2 : perWord[mid];
    }

    this.syllableStats = {
      total,
      mono,
      perWord,
      avg: size ? total / size : 0,
      median
    };
    return this.syllableStats;
  }

  /** @returns {number} Length of the trimmed text in UTF-16 code units. */
  getLength = () => this.text.length;

  /** @returns {number} Number of words. */
  getWordCount = () => this.words.length;

  /** @returns {number} Number of sentences. */
  getSentenceCount = () => this.sentences.length;

  /** @returns {number} Mean word length, or 0 when there are no words. */
  getAvgWordLength() {
    return this.words.length
      ? this.words.join('').length / this.words.length
      : 0;
  }

  /** @returns {number} Mean number of words per sentence, or 0 without sentences. */
  getAvgSentenceLength() {
    return this.sentences.length
      ? this.words.length / this.sentences.length
      : 0;
  }

  /** @returns {Object<string, number>} Word -> occurrence count. */
  getWordHistogram() {
    return Object.fromEntries(this.wordHistogram);
  }

  /**
   * @param {number} [limit=5] - Maximum number of words to return.
   * @returns {string[]} Words ordered by descending frequency.
   */
  getMostCommonWords(limit = 5) {
    return [...this.wordHistogram.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit)
      .map((e) => e[0]);
  }

  /** @returns {string[]} Words that occur exactly once. */
  getHapaxLegomena() {
    return [...this.wordHistogram.entries()]
      .filter(([, c]) => c === 1)
      .map((e) => e[0]);
  }

  /** @returns {boolean} Whether the text contains at least one digit. */
  hasNumbers = () => TextAnalyzer.REGEX.number.test(this.text);

  /** @returns {number} Upper-case letters divided by all letters (0 without letters). */
  getUpperCaseRatio() {
    const matches = this.text.match(TextAnalyzer.REGEX.letter) || [];
    const upper = this.text.match(TextAnalyzer.REGEX.ucLetter)?.length || 0;
    return matches.length ? upper / matches.length : 0;
  }

  /** @returns {Object<string, number>} Character -> occurrence count. */
  getCharFrequency() {
    return Object.fromEntries(this.charFrequency);
  }

  /**
   * Returns the character frequencies keyed by upper-case hexadecimal code
   * point, zero-padded to at least four digits (e.g. `"0041"`, `"1F600"`).
   *
   * @returns {Object<string, number>}
   */
  getUnicodeCodepoints() {
    const result = {};
    for (const [char, count] of this.charFrequency) {
      // Keys of charFrequency are whole code points, so codePointAt() is
      // required; charCodeAt() would report only the high surrogate of
      // characters outside the Basic Multilingual Plane.
      const block = char
        .codePointAt(0)
        .toString(16)
        .padStart(4, '0')
        .toUpperCase();
      result[block] = (result[block] || 0) + count;
    }
    return result;
  }

  /**
   * @param {number} [len=7] - Minimum length for a word to count as long.
   * @returns {number} Share of words with at least `len` characters.
   */
  getLongWordRatio(len = 7) {
    let long = 0;
    for (const w of this.words) if (w.length >= len) long++;
    return this.words.length ? long / this.words.length : 0;
  }

  /**
   * @param {number} [len=3] - Maximum length for a word to count as short.
   * @returns {number} Share of words with at most `len` characters.
   */
  getShortWordRatio(len = 3) {
    let short = 0;
    for (const w of this.words) if (w.length <= len) short++;
    return this.words.length ? short / this.words.length : 0;
  }

  /** @returns {number} Estimated total number of syllables. */
  getSyllablesCount() {
    return this.computeSyllableStats().total;
  }

  /** @returns {number} Number of words estimated to have one syllable. */
  getMonosyllabicWordCount() {
    return this.computeSyllableStats().mono;
  }

  /**
   * @param {number} min
   * @returns {number} Number of words with at least `min` syllables.
   */
  getMinSyllablesWordCount(min) {
    return this.computeSyllableStats().perWord.filter((w) => w >= min).length;
  }

  /**
   * @param {number} max
   * @returns {number} Number of words with at most `max` syllables.
   */
  getMaxSyllablesWordCount(max) {
    return this.computeSyllableStats().perWord.filter((w) => w <= max).length;
  }

  /** @returns {number} Mean syllables per word (0 without words). */
  getAvgSyllablesPerWord() {
    return this.computeSyllableStats().avg;
  }

  /** @returns {number} Median syllables per word (0 without words). */
  getMedianSyllablesPerWord() {
    return this.computeSyllableStats().median;
  }

  /**
   * Computes `100 * ln(N) / (1 - V1 / V)` with N = word count, V = number of
   * distinct words and V1 = number of words occurring exactly once.
   *
   * The formula is undefined for empty text and when every distinct word
   * occurs exactly once (division by zero); 0 is returned in those cases
   * instead of NaN / Infinity.
   *
   * @returns {number}
   */
  getHonoresR() {
    const distinct = this.wordHistogram.size;
    if (distinct === 0) return 0;
    const r =
      (100 * Math.log(this.words.length)) /
      (1 - this.getHapaxLegomena().length / distinct);
    return Number.isFinite(r) ? r : 0;
  }

  /**
   * @param {number} [wpm=200] - Reading speed in words per minute; `null` is treated as 1.
   * @returns {number} Word count divided by `wpm`.
   */
  getReadingTime(wpm = 200) {
    return this.words.length / (wpm ?? 1);
  }

  /**
   * Computes a readability score from the average sentence length (words per
   * sentence) and the average syllables per word. Zero counts are replaced
   * by 1 to avoid division by zero.
   *
   * @param {'flesch'|'fleschde'|'kincaid'} [metric='flesch'] - Formula to apply.
   * @returns {number|undefined} The score, or `undefined` for an unknown metric.
   */
  getReadabilityScore(metric = 'flesch') {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const y = this.getSyllablesCount() || 1;
    const asl = w / s;
    const asw = y / w;
    switch (metric) {
      case 'flesch':
        return 206.835 - 1.015 * asl - 84.6 * asw;
      case 'fleschde':
        return 180 - asl - 58.5 * asw;
      case 'kincaid':
        return 0.39 * asl + 11.8 * asw - 15.59;
      default:
        return undefined;
    }
  }

  /**
   * Computes the LIX score: words per sentence plus the percentage of long
   * words (default threshold of `getLongWordRatio`).
   *
   * @returns {number}
   */
  getLIXScore() {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const l = this.getLongWordRatio() * w;
    return w / s + (l / w) * 100;
  }

  /**
   * Computes four scores from the percentage of words with three or more
   * syllables (h), the average sentence length (s), the percentage of long
   * words (l) and the percentage of monosyllabic words (m).
   * NOTE(review): coefficients look like the four "Wiener Sachtextformel"
   * variants - confirm against the intended reference.
   *
   * @returns {number[]} The four scores, in formula order.
   */
  getWSTFScore() {
    const w = this.words.length || 1;
    const h = (this.getMinSyllablesWordCount(3) / w) * 100;
    const s = this.getAvgSentenceLength();
    const l = this.getLongWordRatio() * 100;
    const m = (this.getMonosyllabicWordCount() / w) * 100;
    return [
      0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
      0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
      0.2963 * h + 0.1905 * s - 1.1144,
      0.2744 * h + 0.2656 * s - 1.693
    ];
  }
}
export { TextAnalyzer };