UNPKG

cmpstr

Version:

CmpStr is a lightweight, fast, and well-performing package for calculating string similarity

200 lines (199 loc) 6.8 kB
/**
 * TextAnalyzer Utility
 * src/utils/TextAnalyzer.ts
 *
 * The TextAnalyzer class provides a comprehensive set of methods for analyzing and
 * extracting statistics from a given text. It supports word and sentence tokenization,
 * character and word frequency analysis, syllable estimation, readability metrics
 * (Flesch, Kincaid, LIX, WSTF), and various ratios and histograms. Designed for
 * efficiency and flexibility, it is suitable for linguistic research, readability
 * scoring, and text preprocessing tasks.
 *
 * This file is the type declaration only; it contains no implementation. Statements
 * about runtime behaviour below restate the documented contract and should be checked
 * against the implementation where precision matters.
 *
 * @module Utils/TextAnalyzer
 * @author Paul Köhler (komed3)
 * @license MIT
 */
export declare class TextAnalyzer {

    // Internal state. This declaration carries no types for private members, so
    // their shapes have to be looked up in the implementation.
    // NOTE(review): judging by the names only, these hold the source text, the
    // tokenized words/sentences, the frequency tables and a per-word syllable
    // cache — confirm before relying on it.
    private readonly text;
    private words;
    private sentences;
    private charFrequency;
    private wordHistogram;
    private syllableCache;

    /**
     * Constructs a new TextAnalyzer instance with the provided input text.
     *
     * @param {string} input - The text to analyze
     */
    constructor(input: string);

    /**
     * Tokenizes the input text into words and sentences.
     *
     * Private; declared here without a signature.
     */
    private tokenize;

    /**
     * Computes character and word frequencies from the tokenized text.
     *
     * Private; declared here without a signature.
     */
    private computeFrequencies;

    /**
     * Estimates the number of syllables in a word using a simple heuristic.
     *
     * Private; declared here without a signature, so the parameter and return
     * types below are taken from the original documentation only.
     *
     * @param {string} word - The word to estimate syllables for
     * @returns {number} - Estimated syllable count
     */
    private estimateSyllables;

    /**
     * Gets the original text length in characters.
     *
     * @returns {number} - Length of the text
     */
    getLength(): number;

    /**
     * Gets the number of words in the text.
     *
     * @returns {number} - Count of words
     */
    getWordCount(): number;

    /**
     * Gets the number of sentences in the text.
     *
     * @returns {number} - Count of sentences
     */
    getSentenceCount(): number;

    /**
     * Gets the average word length in the text.
     *
     * @returns {number} - Average length of words
     */
    getAvgWordLength(): number;

    /**
     * Gets the average sentence length in words.
     *
     * @returns {number} - Average length of sentences
     */
    getAvgSentenceLength(): number;

    /**
     * Gets a histogram of word frequencies in the text.
     *
     * @returns {Record<string, number>} - A histogram of word frequencies
     */
    getWordHistogram(): Record<string, number>;

    /**
     * Gets the most common words in the text, limited to a specified number.
     *
     * @param {number} [limit=5] - Maximum number of common words to return
     * @returns {string[]} - Array of the most common words
     */
    getMostCommonWords(limit?: number): string[];

    /**
     * Gets the least common words (hapax legomena) in the text.
     *
     * Hapax legomena are words that occur only once in the text.
     *
     * @returns {string[]} - Array of hapax legomena
     */
    getHapaxLegomena(): string[];

    /**
     * Checks if the text contains any numbers.
     *
     * @returns {boolean} - True if numbers are present, false otherwise
     */
    hasNumbers(): boolean;

    /**
     * Calculates the ratio of uppercase letters to total letters in the text.
     *
     * @returns {number} - Ratio of uppercase letters to total letters
     */
    getUpperCaseRatio(): number;

    /**
     * Gets the frequency of each character in the text.
     *
     * @returns {Record<string, number>} - A record of character frequencies
     */
    getCharFrequency(): Record<string, number>;

    /**
     * Gets the frequency of each Unicode block in the text.
     *
     * NOTE(review): the key format (block names vs. ranges) is not visible in this
     * declaration — check the implementation before depending on specific keys.
     *
     * @returns {Record<string, number>} - A record of Unicode block frequencies
     */
    getUnicodeStats(): Record<string, number>;

    /**
     * Gets the ratio of long words (words with length >= len) to total words.
     *
     * @param {number} [len=7] - Minimum length for a word to be considered long
     * @returns {number} - Ratio of long words to total words
     */
    getLongWordRatio(len?: number): number;

    /**
     * Gets the ratio of short words (words with length <= len) to total words.
     *
     * @param {number} [len=3] - Maximum length for a word to be considered short
     * @returns {number} - Ratio of short words to total words
     */
    getShortWordRatio(len?: number): number;

    /**
     * Estimates the number of syllables in the text.
     *
     * @returns {number} - Total estimated syllable count
     */
    getSyllablesCount(): number;

    /**
     * Gets the number of monosyllabic words (words with exactly one syllable).
     *
     * @returns {number} - Count of monosyllabic words
     */
    getMonosyllabicWordCount(): number;

    /**
     * Gets the number of words with at least a specified minimum syllable count.
     *
     * @param {number} min - Minimum syllable count for a word to be included
     * @returns {number} - Count of words meeting the syllable criteria
     */
    getMinSyllablesWordCount(min: number): number;

    /**
     * Gets the number of words with at most a specified maximum syllable count.
     *
     * @param {number} max - Maximum syllable count for a word to be included
     * @returns {number} - Count of words meeting the syllable criteria
     */
    getMaxSyllablesWordCount(max: number): number;

    /**
     * Calculates the Honore's R statistic for the text as a measure of lexical richness.
     *
     * @returns {number} - The Honore's R statistic
     */
    getHonoresR(): number;

    /**
     * Estimates the reading time for the text based on words per minute (WPM).
     *
     * @param {number} [wpm=200] - Words per minute for the calculation
     * @returns {number} - Estimated reading time in minutes
     */
    getReadingTime(wpm?: number): number;

    /**
     * Calculates various readability scores based on the text.
     *
     * This method supports multiple readability metrics:
     *   - Flesch Reading Ease
     *   - Flesch-Kincaid Grade Level
     *
     * NOTE(review): the signature also accepts 'fleschde', which the list above does
     * not describe — presumably the Flesch Reading Ease variant adapted for German;
     * confirm against the implementation.
     *
     * @param {'flesch'|'fleschde'|'kincaid'} [metric='flesch'] - The readability metric to calculate
     * @returns {number} - The calculated readability score
     */
    getReadabilityScore(metric?: 'flesch' | 'fleschde' | 'kincaid'): number;

    /**
     * Calculates the LIX (Läsbarhetsindex) score for the text.
     *
     * The LIX score is a readability index that combines average word length and sentence length.
     *
     * NOTE(review): the commonly published LIX formula uses the percentage of long
     * words rather than the average word length — verify which one the
     * implementation computes and adjust this description accordingly.
     *
     * @returns {number} - The LIX score
     */
    getLIXScore(): number;

    /**
     * Calculates the Wiener Sachtextformel (WSTF) scores for the text.
     *
     * The WSTF scores are a set of readability metrics based on word and sentence characteristics.
     *
     * NOTE(review): the tuple has exactly four entries; presumably these are the
     * four variants of the formula in order (first to fourth) — confirm against
     * the implementation.
     *
     * @returns {[number, number, number, number]} - An array of WSTF scores
     */
    getWSTFScore(): [number, number, number, number];

}