cmpstr
Version:
CmpStr is a lightweight, fast, and well-performing package for calculating string similarity
200 lines (199 loc) • 6.8 kB
TypeScript
/**
* TextAnalyzer Utility
* src/utils/TextAnalyzer.ts
*
* The TextAnalyzer class provides a comprehensive set of methods for analyzing and
* extracting statistics from a given text. It supports word and sentence tokenization,
* character and word frequency analysis, syllable estimation, readability metrics
* (Flesch, Kincaid, LIX, WSTF), and various ratios and histograms. Designed for
* efficiency and flexibility, it is suitable for linguistic research, readability
* scoring, and text preprocessing tasks.
*
* @module Utils/TextAnalyzer
* @author Paul Köhler (komed3)
* @license MIT
*/
/**
* Text statistics and readability analyzer for a single input string.
*
* An instance is created from one text and exposes query methods for counts,
* averages, frequency tables, syllable-based figures and readability indices
* (Flesch, Flesch-Kincaid, LIX, WSTF).
*
* NOTE(review): this is a declaration only. How empty input (zero words or
* sentences) is handled, and whether the analysis runs eagerly in the constructor
* or lazily on first access, cannot be determined from here — confirm against the
* implementation.
*/
export declare class TextAnalyzer {
// Internal state. Declaration output omits the types of private members, so their
// shapes are not visible here. Judging by the names, these presumably hold the raw
// input, the tokenized words and sentences, the character/word frequency tables
// and a per-word syllable cache — TODO confirm against the implementation.
private readonly text;
private words;
private sentences;
private charFrequency;
private wordHistogram;
private syllableCache;
/**
* Creates a new TextAnalyzer for the given input text.
*
* @param {string} input - The text to analyze
*/
constructor(input: string);
/**
* Tokenizes the input text into words and sentences.
*
* Private helper; declared without a signature because private members are
* type-erased in declaration output.
*/
private tokenize;
/**
* Computes character and word frequencies from the tokenized text.
*
* Private helper; declared without a signature because private members are
* type-erased in declaration output.
*/
private computeFrequencies;
/**
* Estimates the number of syllables in a word using a simple heuristic.
*
* The result is an estimate, not an exact linguistic syllable count.
*
* @param {string} word - The word to estimate syllables for
* @returns {number} - Estimated syllable count
*/
private estimateSyllables;
/**
* Gets the length of the original text in characters.
*
* @returns {number} - Length of the text
*/
getLength(): number;
/**
* Gets the number of words in the text.
*
* @returns {number} - Count of words
*/
getWordCount(): number;
/**
* Gets the number of sentences in the text.
*
* @returns {number} - Count of sentences
*/
getSentenceCount(): number;
/**
* Gets the average word length in the text.
*
* @returns {number} - Average length of words (presumably in characters — confirm)
*/
getAvgWordLength(): number;
/**
* Gets the average sentence length, measured in words.
*
* @returns {number} - Average number of words per sentence
*/
getAvgSentenceLength(): number;
/**
* Gets a histogram of word frequencies in the text.
*
* @returns {Record<string, number>} - Mapping from word to its number of occurrences
*/
getWordHistogram(): Record<string, number>;
/**
* Gets the most common words in the text, limited to a specified number.
*
* @param {number} [limit=5] - Maximum number of common words to return
* @returns {string[]} - Array of the most common words
*/
getMostCommonWords(limit?: number): string[];
/**
* Gets the hapax legomena of the text.
*
* Hapax legomena are words that occur exactly once in the text.
*
* @returns {string[]} - Array of hapax legomena
*/
getHapaxLegomena(): string[];
/**
* Checks if the text contains any numbers.
*
* @returns {boolean} - True if numbers are present, false otherwise
*/
hasNumbers(): boolean;
/**
* Calculates the ratio of uppercase letters to total letters in the text.
*
* @returns {number} - Ratio of uppercase letters to total letters
*/
getUpperCaseRatio(): number;
/**
* Gets the frequency of each character in the text.
*
* @returns {Record<string, number>} - Mapping from character to its number of occurrences
*/
getCharFrequency(): Record<string, number>;
/**
* Gets the frequency of each Unicode block in the text.
*
* NOTE(review): the key format (block name vs. code point range) is not visible
* from this declaration — confirm against the implementation.
*
* @returns {Record<string, number>} - A record of Unicode block frequencies
*/
getUnicodeStats(): Record<string, number>;
/**
* Gets the ratio of long words (words with length >= len) to total words.
*
* @param {number} [len=7] - Minimum length for a word to be considered long
* @returns {number} - Ratio of long words to total words
*/
getLongWordRatio(len?: number): number;
/**
* Gets the ratio of short words (words with length <= len) to total words.
*
* @param {number} [len=3] - Maximum length for a word to be considered short
* @returns {number} - Ratio of short words to total words
*/
getShortWordRatio(len?: number): number;
/**
* Estimates the total number of syllables in the text.
*
* @returns {number} - Total estimated syllable count
*/
getSyllablesCount(): number;
/**
* Gets the number of monosyllabic words (words with exactly one syllable).
*
* @returns {number} - Count of monosyllabic words
*/
getMonosyllabicWordCount(): number;
/**
* Gets the number of words with at least a specified minimum syllable count.
*
* @param {number} min - Minimum syllable count for a word to be included
* @returns {number} - Count of words meeting the syllable criteria
*/
getMinSyllablesWordCount(min: number): number;
/**
* Gets the number of words with at most a specified maximum syllable count.
*
* @param {number} max - Maximum syllable count for a word to be included
* @returns {number} - Count of words meeting the syllable criteria
*/
getMaxSyllablesWordCount(max: number): number;
/**
* Calculates the Honore's R statistic for the text as a measure of lexical richness.
*
* @returns {number} - The Honore's R statistic
*/
getHonoresR(): number;
/**
* Estimates the reading time for the text based on words per minute (WPM).
*
* @param {number} [wpm=200] - Words per minute for the calculation
* @returns {number} - Estimated reading time in minutes
*/
getReadingTime(wpm?: number): number;
/**
* Calculates a readability score for the text using the selected metric.
*
* Accepted metric identifiers:
* - 'flesch': Flesch Reading Ease
* - 'kincaid': Flesch-Kincaid Grade Level
* - 'fleschde': presumably the German-language variant of Flesch Reading Ease —
*   TODO confirm against the implementation
*
* @param {'flesch'|'fleschde'|'kincaid'} [metric='flesch'] - The readability metric to calculate
* @returns {number} - The calculated readability score
*/
getReadabilityScore(metric?: 'flesch' | 'fleschde' | 'kincaid'): number;
/**
* Calculates the LIX (Läsbarhetsindex) score for the text.
*
* The LIX score is a readability index that combines average word length and sentence length.
*
* @returns {number} - The LIX score
*/
getLIXScore(): number;
/**
* Calculates the Wiener Sachtextformel (WSTF) scores for the text.
*
* The WSTF scores are a set of readability metrics based on word and sentence characteristics.
*
* NOTE(review): the tuple presumably holds the four WSTF variants in order (1 to 4) —
* confirm against the implementation.
*
* @returns {[number, number, number, number]} - An array of four WSTF scores
*/
getWSTFScore(): [number, number, number, number];
}