@igk19/rus-text-gen
Version:
Generator of coherent and random texts in the Russian language, built from real texts containing (as of now) about 30,000 words.
171 lines (170 loc) • 6.48 kB
TypeScript
/**
* Parsed representation of a single line of the source text.
* The field descriptions below follow the explanation given in the
* documentation of `TextGenerator.parseFile`.
*/
interface ILine {
/**
* Indices of words in the generator's `words` array
* (presumably in the order the words occur in the line — confirm).
*/
wordIndices: number[];
/**
* Indices of the words that are capitalized.
* NOTE(review): the `parseFile` documentation describes these as indices in
* the `words` array; confirm they are not positions within `wordIndices`.
*/
isCapitalized: Set<number>;
/**
* Maps a word index to the symbol that comes right after that word.
* NOTE(review): same question about the index space as for `isCapitalized`.
*/
symbolAfter: Map<number, string>;
}
/**
* Parsed text data accepted by the `TextGenerator` constructor.
* Presumably this is what `TextGenerator.parseFile` produces — confirm
* against the implementation.
*/
interface IParseOutput {
/** Per-line data, one `ILine` object per element. */
lines: ILine[];
/**
* Word list that the indices stored in `ILine` refer to. According to the
* `TextGenerator` documentation these are the unique words of the text,
* sorted alphabetically.
*/
words: string[];
}
/**
* Generates random or coherent Russian text from an existing text file.
* The text file contains corrected excerpts from various philosophers of
* the late 19th and early 20th centuries.
*/
declare class TextGenerator {
/**
* All unique words of the text, sorted
* in alphabetical order.
*/
private readonly words;
/**
* Each line of the text, in parsed form.
*/
private readonly lines;
/**
* Creates a generator from already parsed text data.
*
* @param input The parsed data: the word list and the per-line data.
*/
constructor(input: IParseOutput);
/**
* Parses the default text file `./texts/text1.txt` and returns an
* instance of `TextGenerator` with the parsed data.
*
* @returns A promise that resolves to a `TextGenerator` instance once the
* file has been parsed.
*
* @static @public
*/
static build(): Promise<TextGenerator>;
/**
* Parses a text file.
*
* This function reads the file line by line, splits each line into words,
* and adds the words to a set to get all unique words.
* In addition, it creates an array of line objects, where each object has
* three properties: `wordIndices`, `isCapitalized` and `symbolAfter`.
* `wordIndices` is an array of indices of words in the `this.words` array.
* `isCapitalized` is a set of indices of words in the `this.words` array
* that are capitalized.
* `symbolAfter` is a map of indices of words in the `this.words` array
* to the symbol right after the word.
*
* The returned data structure optimizes generation of the text.
*
* @param path The path to the text file to parse.
* @returns A promise that resolves when the file has been parsed
* (presumably with the parsed words and lines — confirm against the implementation).
*
* @static @private
*/
private static parseFile;
/**
* Creates random text.
*
* All parameters are optional; their default values are not visible in
* this declaration.
*
* @param textLength Text length measured in words.
* @param isSentences Create random-length sentences (by capitalizing the first word of each sentence and adding punctuation symbols).
* @param isParagraphs Create random-length paragraphs (by adding new line symbols).
* @returns The generated text.
*
* @public
*/
createRandomText(textLength?: number, isSentences?: boolean, isParagraphs?: boolean): string;
/**
* Generates coherent text of the given length.
*
* Note: the maximum length is 30000 words.
*
* @param length - Length of the output text (presumably measured in words,
* as the note above suggests). Optional; the default is not visible in this declaration.
* @returns A string of coherent text.
*
* @public
*/
createText(length?: number): string;
/**
* Creates a string of random text of the given length, consisting of sentences.
*
* @param textLength - Length of the output text.
* @param isParagraphs - Whether to split sentences into paragraphs (by adding "\n" symbols).
* @returns A string of random text formed into a semblance of sentences.
*
* @private
*/
private createRandomSentences;
/**
* Generates an array of paragraph split points. A split point is a position
* in an array of sentences where a paragraph break is inserted.
* The split points are chosen randomly to simulate the distribution of
* paragraph breaks in a natural text.
* The number of split points is limited by the number of sentences; the
* distance between consecutive split points is chosen randomly from the
* range [2, 6].
*
* @param numberOfSentences The number of sentences in the text.
* @returns An array of paragraph split points.
*
* @static @private
*/
private static calculateParagraphSplitPoints;
/**
* Joins elements of a string array into a single string with spaces between them.
* Appends a space after each element unless it ends with a newline character.
* The accumulator/element parameter pair suggests use as a reducer callback — confirm.
*
* @param acc - The accumulator string that collects the joined elements.
* @param s - The current string element to be added to the accumulator.
* @returns The updated accumulator string with the current element appended.
*
* @static @private
*/
private static joinTextOutputArray;
/**
* Performs a binary search on a sorted array of Cyrillic strings to find the target string.
*
* Note: Uses `localeCompare()` for comparison to handle Cyrillic characters correctly.
*
* @param arr - A sorted array of strings to search within.
* @param target - The target string to search for in the array.
* @returns The index of the target string if found, otherwise -1.
*
* @static @private
*/
private static binarySearchCyrillic;
/**
* Checks whether the given word is written in the Cyrillic alphabet.
* @param {string} word - The word to check.
* @returns {boolean} true if the word is Cyrillic, false otherwise.
*
* @static @private
*/
private static isCyrillicWord;
/**
* Returns the given string with the first letter capitalized.
* @param {string} text - The string to capitalize.
* @returns {string} The capitalized string.
*
* @static @private
*/
private static capitalize;
/**
* Checks whether the given text is capitalized.
* @param {string} text - The string to check.
* @returns {boolean} true if the string is capitalized, false otherwise.
*
* @static @private
*/
private static isCapitalized;
/**
* Generates a random number in the range [from, to).
* Note the parameter order: the upper limit comes first.
* @param to Upper limit of the range (exclusive).
* @param from Lower limit of the range (inclusive). Defaults to 0.
* @returns A random number in the range [from, to).
*
* @static @private
*/
private static randomNumberGen;
/**
* Returns the given string with the last punctuation symbol removed and
* the symbol itself as the second element of the returned array.
* If the string does not end with a punctuation symbol,
* the second element of the returned array is `null`.
* @param {string} text - The string to strip of punctuation.
* @returns {[string, string | null]} A two-element array containing the
* string without the last punctuation symbol and the symbol itself.
*
* @static @private
*/
private static stripPunctuation;
/**
* Removes all punctuation symbols from the start and end of the given text.
* @param {string} text - The text to trim.
* @returns {string} The trimmed string.
*
* @static @private
*/
private static trimAllSymbols;
}
// Default export of the module: the `TextGenerator` class.
export default TextGenerator;