UNPKG

@igk19/rus-text-gen

Version:

Generator of coherent and random texts in the Russian language, built from real texts containing (as of now) about 30,000 words.

171 lines (170 loc) 6.48 kB
/**
 * Parsed representation of a single line of the source text.
 *
 * Field meanings below are taken from the `parseFile` documentation on
 * `TextGenerator`; the implementation is not visible in this declaration file.
 */
interface ILine {
    /** Indices of this line's words in the generator's `words` array. */
    wordIndices: number[];
    /** Indices of the words (in the `words` array) that are capitalized. */
    isCapitalized: Set<number>;
    /** Maps a word index (in the `words` array) to the symbol right after that word. */
    symbolAfter: Map<number, string>;
}
/**
 * Result of parsing a text file; this is what the `TextGenerator`
 * constructor accepts.
 */
interface IParseOutput {
    /** One entry per line of the parsed text. */
    lines: ILine[];
    /**
     * The words of the parsed text.
     * NOTE(review): presumably unique and alphabetically sorted, as documented
     * for `TextGenerator.words` — confirm against the implementation.
     */
    words: string[];
}
/**
 * Generates random or coherent Russian text from an existing text file.
 * The text file contains corrected excerpts from various philosophers of
 * the end of the 19th / beginning of the 20th century.
 *
 * NOTE: this is a declaration file, so the types and signatures of private
 * members are erased; the JSDoc on each private member is the only
 * description of its parameters and return value available here.
 */
declare class TextGenerator {
    /**
     * All unique words of the text, sorted in alphabetical order.
     *
     * NOTE(review): type is erased in this declaration; presumably `string[]`
     * (matching `IParseOutput.words`) — confirm.
     */
    private readonly words;
    /**
     * Each line of the text.
     *
     * NOTE(review): type is erased in this declaration; presumably `ILine[]`
     * (matching `IParseOutput.lines`) — confirm.
     */
    private readonly lines;
    /**
     * Creates a generator from already-parsed text data.
     *
     * @param input - The parsed lines and word list (see `IParseOutput`).
     */
    constructor(input: IParseOutput);
    /**
     * Parses the default text file `./texts/text1.txt` and returns an
     * instance of `TextGenerator` with the parsed data.
     *
     * @returns A promise that resolves to the generator once the file has
     * been parsed.
     *
     * NOTE(review): this method was originally tagged `@private`, but it is
     * declared without an access modifier and is therefore callable from
     * outside the class. Since `IParseOutput` is not exported, this is
     * presumably the intended way to obtain an instance — confirm and align
     * the tag with the intent.
     */
    static build(): Promise<TextGenerator>;
    /**
     * Parses a text file.
     *
     * This function reads the file line by line, splits each line into words,
     * and adds the words to a set to get all unique words.
     * In addition, it creates an array of line objects, where each object has
     * three properties: `wordIndices`, `isCapitalized` and `symbolAfter`.
     * - `wordIndices` is an array of indices of words in the `this.words` array.
     * - `isCapitalized` is a set of indices of words in the `this.words` array
     *   that are capitalized.
     * - `symbolAfter` is a map of indices of words in the `this.words` array
     *   to the symbol right after the word.
     *
     * The returned data structure optimizes generation of the text.
     *
     * @param path - The path to the text file to parse.
     * @returns A promise that resolves when the file has been parsed.
     *
     * @private
     */
    private static parseFile;
    /**
     * Creates random text.
     *
     * All parameters are optional; their default values are not visible in
     * this declaration.
     *
     * @param textLength - Text length, measured in words.
     * @param isSentences - Create random-length sentences (by capitalizing the
     * first word of each sentence and adding punctuation symbols).
     * @param isParagraphs - Create random-length paragraphs (by adding newline
     * symbols).
     * @returns The generated text.
     *
     * @public
     */
    createRandomText(textLength?: number, isSentences?: boolean, isParagraphs?: boolean): string;
    /**
     * Generates coherent text of the given length.
     *
     * Note: the maximum length is 30000 words.
     *
     * @param length - Length of the output text. Optional; the default value
     * is not visible in this declaration.
     * @returns A string of coherent text.
     *
     * @public
     */
    createText(length?: number): string;
    /**
     * Creates a string of random text of the given length, consisting of
     * sentences.
     *
     * @param textLength - Length of the output text.
     * @param isParagraphs - Whether to split sentences into paragraphs (by
     * adding "\n" symbols).
     * @returns A string of random text formed into a semblance of sentences.
     *
     * @private
     */
    private createRandomSentences;
    /**
     * Generates an array of paragraph split points. A split point is a point
     * in an array of sentences where a paragraph break is inserted.
     *
     * The split points are chosen randomly to simulate the distribution of
     * paragraph breaks in a natural text. The number of split points is
     * limited by the number of sentences, and the distance between split
     * points is chosen randomly from the range [2, 6].
     *
     * @param numberOfSentences - The number of sentences in the text.
     * @returns An array of paragraph split points.
     *
     * @static
     * @private
     */
    private static calculateParagraphSplitPoints;
    /**
     * Joins elements of a string array into a single string with spaces
     * between them. Appends a space after each element unless it ends with a
     * newline character.
     *
     * @param acc - The accumulator string that collects the joined elements.
     * @param s - The current string element to be added to the accumulator.
     * @returns The updated accumulator string with the current element
     * appended.
     *
     * @static
     * @private
     */
    private static joinTextOutputArray;
    /**
     * Performs a binary search on a sorted array of Cyrillic strings to find
     * the target string.
     *
     * Note: uses `localeCompare()` for comparison to handle Cyrillic
     * characters correctly.
     *
     * @param arr - A sorted array of strings to search within.
     * @param target - The target string to search for in the array.
     * @returns The index of the target string if found, otherwise -1.
     *
     * @static
     * @private
     */
    private static binarySearchCyrillic;
    /**
     * Checks whether the given word is written in the Cyrillic alphabet.
     *
     * @param {string} word - The word to check.
     * @returns {boolean} `true` if the word is Cyrillic, `false` otherwise.
     *
     * @static
     * @private
     */
    private static isCyrillicWord;
    /**
     * Returns the given string with the first letter capitalized.
     *
     * @param {string} text - The string to capitalize.
     * @returns {string} The capitalized string.
     *
     * @static
     * @private
     */
    private static capitalize;
    /**
     * Checks whether the given text is capitalized.
     *
     * @param {string} text - The string to check.
     * @returns {boolean} `true` if the string is capitalized, `false`
     * otherwise.
     *
     * @static
     * @private
     */
    private static isCapitalized;
    /**
     * Generates a random number in the range [from, to).
     *
     * Note the parameter order: the upper limit comes first.
     *
     * @param to - Upper limit of the range (exclusive).
     * @param from - Lower limit of the range (inclusive). Defaults to 0.
     * @returns A random number in the range [from, to).
     *
     * @static
     * @private
     */
    private static randomNumberGen;
    /**
     * Splits a trailing punctuation symbol off the given string.
     *
     * Returns a two-element array: the string with the last punctuation
     * symbol removed, and the removed symbol itself. If the string does not
     * end with a punctuation symbol, the second element is `null`.
     *
     * @param {string} text - The string to strip of punctuation.
     * @returns {[string, string | null]} The string without its last
     * punctuation symbol, followed by that symbol (or `null`).
     *
     * @static
     * @private
     */
    private static stripPunctuation;
    /**
     * Removes all punctuation symbols from the start and end of the given
     * text.
     *
     * @param {string} text - The text to trim.
     * @returns {string} The trimmed string.
     *
     * @static
     * @private
     */
    private static trimAllSymbols;
}
export default TextGenerator;