UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity

github.com/komed3/cmpstr

357 lines (355 loc) • 11.2 kB

JavaScript

// CmpStr v3.0.1 dev-052fa0c-250614 by Paul Köhler @komed3 / MIT License
/**
 * TextAnalyzer Utility
 * src/utils/TextAnalyzer.ts
 *
 * The TextAnalyzer class provides a set of methods for analyzing and extracting
 * statistics from a given text. It supports word and sentence tokenization,
 * character and word frequency analysis, syllable estimation, readability metrics
 * (Flesch, Kincaid, LIX, WSTF), and various ratios and histograms.
 *
 * All statistics are derived from the trimmed input text, which is tokenized
 * once in the constructor.
 *
 * @module Utils/TextAnalyzer
 * @author Paul Köhler (komed3)
 * @license MIT
 */
class TextAnalyzer {
  // The (trimmed) original text to analyze
  text;

  // Tokenized, lower-cased words and raw sentences
  words = [];
  sentences = [];

  // Frequency maps for characters (by code point) and words
  charFrequency = new Map();
  wordHistogram = new Map();

  // Memoized results of estimateSyllables(), keyed by the word as passed in
  syllableCache = new Map();

  /**
   * Constructs a new TextAnalyzer instance with the provided input text.
   *
   * The input is trimmed, tokenized and its frequency tables are computed
   * immediately.
   *
   * @param {string} input - The text to analyze
   */
  constructor(input) {
    this.text = input.trim();
    this.tokenize();
    this.computeFrequencies();
  }

  /**
   * Tokenizes the text into words and sentences.
   *
   * Words are maximal runs of Unicode letters (`\p{L}`), stored lower-cased.
   * Sentences are obtained by splitting on whitespace that follows `.`, `!`
   * or `?`; empty fragments are dropped.
   */
  tokenize() {
    const text = this.text;

    // Unicode property escape: digits, punctuation and symbols are never part of a word
    this.words = Array.from(text.matchAll(/\p{L}+/gu), (match) =>
      match[0].toLowerCase(),
    );

    // The lookbehind keeps the terminating punctuation attached to its sentence
    this.sentences = text.split(/(?<=[.!?])\s+/).filter(Boolean);
  }

  /**
   * Computes character and word frequencies from the tokenized text.
   *
   * Both maps are cleared first so that calling this method again (for
   * example after re-tokenizing) does not double-count.
   */
  computeFrequencies() {
    this.charFrequency.clear();
    this.wordHistogram.clear();

    // Iterating a string with for...of yields whole code points, not UTF-16 units
    for (const char of this.text) {
      this.charFrequency.set(char, (this.charFrequency.get(char) ?? 0) + 1);
    }

    for (const word of this.words) {
      this.wordHistogram.set(word, (this.wordHistogram.get(word) ?? 0) + 1);
    }
  }

  /**
   * Estimates the number of syllables in a word using a simple heuristic:
   * the number of vowel groups (`a e i o u y ä ö ü`) after lower-casing and
   * removing everything except `a-z`, `ä`, `ö`, `ü` and `ß`.
   *
   * Words without any vowel group count as one syllable. Results are cached
   * per input word.
   *
   * @param {string} word - The word to estimate syllables for
   * @returns {number} - Estimated syllable count (at least 1)
   */
  estimateSyllables(word) {
    // Check the cache first to avoid redundant calculations
    if (this.syllableCache.has(word)) {
      return this.syllableCache.get(word);
    }

    const clean = word.toLowerCase().replace(/[^a-zäöüß]/g, '');
    const vowelGroups = clean.match(/[aeiouyäöü]+/g);
    const count = vowelGroups ? vowelGroups.length : 1;

    this.syllableCache.set(word, count);
    return count;
  }

  /**
   * Gets the length of the trimmed text in UTF-16 code units.
   *
   * @return {number} - Length of the text
   */
  getLength() {
    return this.text.length;
  }

  /**
   * Gets the number of words in the text.
   *
   * @return {number} - Count of words
   */
  getWordCount() {
    return this.words.length;
  }

  /**
   * Gets the number of sentences in the text.
   *
   * @return {number} - Count of sentences
   */
  getSentenceCount() {
    return this.sentences.length;
  }

  /**
   * Gets the average word length in the text.
   *
   * @return {number} - Average length of words, or 0 if there are no words
   */
  getAvgWordLength() {
    let totalLen = 0;
    for (const w of this.words) {
      totalLen += w.length;
    }
    return this.words.length ? totalLen / this.words.length : 0;
  }

  /**
   * Gets the average sentence length in words.
   *
   * @return {number} - Words per sentence, or 0 if there are no sentences
   */
  getAvgSentenceLength() {
    return this.sentences.length
      ? this.words.length / this.sentences.length
      : 0;
  }

  /**
   * Gets a histogram of word frequencies in the text.
   *
   * @returns {Record<string, number>} - A histogram of word frequencies
   */
  getWordHistogram() {
    return Object.fromEntries(this.wordHistogram);
  }

  /**
   * Gets the most common words in the text, limited to a specified number.
   *
   * @param {number} [limit=5] - Maximum number of common words to return
   * @returns {string[]} - The most common words, most frequent first
   */
  getMostCommonWords(limit = 5) {
    return [...this.wordHistogram.entries()]
      .sort((a, b) => b[1] - a[1])
      .slice(0, limit)
      .map(([word]) => word);
  }

  /**
   * Gets the hapax legomena of the text, i.e. the words that occur exactly
   * once.
   *
   * @returns {string[]} - Array of hapax legomena
   */
  getHapaxLegomena() {
    return [...this.wordHistogram.entries()]
      .filter(([, count]) => count === 1)
      .map(([word]) => word);
  }

  /**
   * Checks if the text contains any digit.
   *
   * @returns {boolean} - True if a digit is present, false otherwise
   */
  hasNumbers() {
    return /\d/.test(this.text);
  }

  /**
   * Calculates the ratio of uppercase letters to total letters in the text.
   *
   * Only `A-Z`, `a-z` and the German letters `Ä Ö Ü ä ö ü ß` are considered
   * letters here.
   *
   * @return {number} - Ratio of uppercase letters, or 0 if there are no letters
   */
  getUpperCaseRatio() {
    // Non-global patterns, so .test() carries no lastIndex state between calls
    const letterPattern = /[A-Za-zÄÖÜäöüß]/;
    const upperPattern = /[A-ZÄÖÜ]/;

    let upper = 0;
    let letters = 0;

    for (const c of this.text) {
      if (!letterPattern.test(c)) continue;
      letters++;
      if (upperPattern.test(c)) upper++;
    }

    return letters ? upper / letters : 0;
  }

  /**
   * Gets the frequency of each character in the text.
   *
   * @returns {Record<string, number>} - A record of character frequencies
   */
  getCharFrequency() {
    return Object.fromEntries(this.charFrequency);
  }

  /**
   * Gets character frequencies keyed by Unicode code point.
   *
   * Each key is the character's code point as an upper-case hexadecimal
   * string, left-padded with zeros to at least four digits (e.g. `0061` for
   * `a`, `1F600` for an emoji).
   *
   * @returns {Record<string, number>} - A record of code point frequencies
   */
  getUnicodeStats() {
    const result = {};

    for (const [char, count] of this.charFrequency) {
      // codePointAt (not charCodeAt): keys of charFrequency are whole code
      // points, so astral characters must not be reduced to a surrogate half
      const codePoint = char
        .codePointAt(0)
        .toString(16)
        .padStart(4, '0')
        .toUpperCase();

      result[codePoint] = (result[codePoint] ?? 0) + count;
    }

    return result;
  }

  /**
   * Gets the ratio of long words (words with length >= len) to total words.
   *
   * @param {number} [len=7] - Minimum length for a word to be considered long
   * @returns {number} - Ratio of long words, or 0 if there are no words
   */
  getLongWordRatio(len = 7) {
    let long = 0;
    for (const w of this.words) {
      if (w.length >= len) long++;
    }
    return this.words.length ? long / this.words.length : 0;
  }

  /**
   * Gets the ratio of short words (words with length <= len) to total words.
   *
   * @param {number} [len=3] - Maximum length for a word to be considered short
   * @returns {number} - Ratio of short words, or 0 if there are no words
   */
  getShortWordRatio(len = 3) {
    let short = 0;
    for (const w of this.words) {
      if (w.length <= len) short++;
    }
    return this.words.length ? short / this.words.length : 0;
  }

  /**
   * Estimates the number of syllables in the text.
   *
   * @returns {number} - Total estimated syllable count
   */
  getSyllablesCount() {
    let count = 0;
    for (const w of this.words) {
      count += this.estimateSyllables(w);
    }
    return count;
  }

  /**
   * Gets the number of monosyllabic words (words with exactly one estimated
   * syllable).
   *
   * @returns {number} - Count of monosyllabic words
   */
  getMonosyllabicWordCount() {
    let count = 0;
    for (const w of this.words) {
      if (this.estimateSyllables(w) === 1) count++;
    }
    return count;
  }

  /**
   * Gets the number of words with at least a specified minimum syllable count.
   *
   * @param {number} min - Minimum syllable count for a word to be included
   * @returns {number} - Count of words meeting the syllable criteria
   */
  getMinSyllablesWordCount(min) {
    let count = 0;
    for (const w of this.words) {
      if (this.estimateSyllables(w) >= min) count++;
    }
    return count;
  }

  /**
   * Gets the number of words with at most a specified maximum syllable count.
   *
   * @param {number} max - Maximum syllable count for a word to be included
   * @returns {number} - Count of words meeting the syllable criteria
   */
  getMaxSyllablesWordCount(max) {
    let count = 0;
    for (const w of this.words) {
      if (this.estimateSyllables(w) <= max) count++;
    }
    return count;
  }

  /**
   * Calculates Honoré's R statistic for the text as a measure of lexical
   * richness: `100 * ln(N) / (1 - V1 / V)`, where `N` is the word count,
   * `V` the number of distinct words and `V1` the number of hapax legomena.
   *
   * Returns 0 for a text without words. The formula is undefined when every
   * distinct word occurs exactly once (`V1 === V`); in that case the result
   * of the division by zero (a non-finite number) is returned unchanged.
   *
   * @returns {number} - The Honoré's R statistic
   */
  getHonoresR() {
    const wordCount = this.words.length;

    // Without words both ln(N) and V1 / V are undefined
    if (wordCount === 0) return 0;

    const hapaxCount = this.getHapaxLegomena().length;

    // size cannot be 0 here; `||` (not `??`) because 0 is not nullish
    const vocabulary = this.wordHistogram.size || 1;

    return (100 * Math.log(wordCount)) / (1 - hapaxCount / vocabulary);
  }

  /**
   * Estimates the reading time for the text based on words per minute (WPM).
   *
   * The result is never less than one minute. A falsy `wpm` (0, NaN, null)
   * falls back to 1 to avoid a division by zero.
   *
   * @param {number} [wpm=200] - Words per minute for the calculation
   * @returns {number} - Estimated reading time in minutes (>= 1)
   */
  getReadingTime(wpm = 200) {
    return Math.max(1, this.words.length / (wpm || 1));
  }

  /**
   * Calculates a readability score for the text.
   *
   * Supported metrics:
   * - `flesch`   - Flesch Reading Ease
   * - `fleschde` - Flesch Reading Ease adapted for German texts
   * - `kincaid`  - Flesch-Kincaid Grade Level
   *
   * Word, sentence and syllable counts of 0 are treated as 1 so the ratios
   * stay finite. Any other metric name yields `undefined`.
   *
   * @param {'flesch'|'fleschde'|'kincaid'} [metric='flesch'] - The readability metric to calculate
   * @returns {number} - The calculated readability score
   */
  getReadabilityScore(metric = 'flesch') {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const y = this.getSyllablesCount() || 1;

    const asl = w / s; // average sentence length in words
    const asw = y / w; // average syllables per word

    switch (metric) {
      // Flesch Reading Ease formula
      case 'flesch':
        return 206.835 - 1.015 * asl - 84.6 * asw;

      // Flesch Reading Ease formula for German texts
      case 'fleschde':
        return 180 - asl - 58.5 * asw;

      // Flesch-Kincaid Grade Level formula
      case 'kincaid':
        return 0.39 * asl + 11.8 * asw - 15.59;

      // Unknown metric: kept as undefined for backward compatibility
      default:
        return undefined;
    }
  }

  /**
   * Calculates the LIX (Läsbarhetsindex) score for the text.
   *
   * Computed as words per sentence plus the percentage of long words, where
   * a long word is one accepted by `getLongWordRatio()` with its default
   * threshold. Word and sentence counts of 0 are treated as 1.
   *
   * @returns {number} - The LIX score
   */
  getLIXScore() {
    const w = this.words.length || 1;
    const s = this.sentences.length || 1;
    const l = this.getLongWordRatio() * w;

    return w / s + (l / w) * 100;
  }

  /**
   * Calculates the four Wiener Sachtextformel (WSTF) scores for the text.
   *
   * Inputs are the percentage of words with three or more syllables, the
   * average sentence length in words, the percentage of long words
   * (`getLongWordRatio()` default threshold) and the percentage of
   * monosyllabic words.
   *
   * @returns {[number, number, number, number]} - The four WSTF scores, in order
   */
  getWSTFScore() {
    const w = this.words.length || 1;

    const h = (this.getMinSyllablesWordCount(3) / w) * 100;
    const s = this.getAvgSentenceLength();
    const l = this.getLongWordRatio() * 100;
    const m = (this.getMonosyllabicWordCount() / w) * 100;

    return [
      0.1935 * h + 0.1672 * s + 0.1297 * l - 0.0327 * m - 0.875,
      0.2007 * h + 0.1682 * s + 0.1373 * l - 2.779,
      0.2963 * h + 0.1905 * s - 1.1144,
      0.2744 * h + 0.2656 * s - 1.693,
    ];
  }
}

export { TextAnalyzer };
//# sourceMappingURL=TextAnalyzer.js.map