UNPKG

@igk19/rus-text-gen

Version:

Generator of coherent and random texts in the Russian language, built from real texts that currently contain about 30,000 words.

492 lines (491 loc) 18.8 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const promises_1 = require("fs/promises"); const path_1 = __importDefault(require("path")); const getFilePath_js_1 = __importDefault(require("./getFilePath.js")); /** * A generates random or coherent Russian text from an existing text file. * The text file contains corrected excerpts from various philosophers of * the end of 19th/beginning of 20th century. */ class TextGenerator { constructor(input) { /** * All unique words of the text sorted * in alphabetical order. */ this.words = []; /** * Each line of the text */ this.lines = []; this.lines = input.lines; this.words = input.words; } /** * Parses the default text file `./texts/text1.txt` and returns an * instance of `TextGenerator` with the parsed data. * * @returns A promise that resolves when the file has been parsed. * * @private */ static async build() { // @ts-ignore const currentFilePath = (0, getFilePath_js_1.default)(); const p = path_1.default.join(currentFilePath, "../../texts/text1.txt"); const res = await this.parseFile(p); return new TextGenerator(res); } /** * Parses a text file. * * @param path The path to the text file to parse. * @returns A promise that resolves when the file has been parsed. * * This function reads the file line by line, splits each line into words, * and adds the words to a set to get all unique words. * Besides, it creates an array of line objects, where each object has * three properties: `wordIndices`, `isCapitalized` and `symbolAfter`. * `wordIndices` is an array of indices of words in the `this.words` array. * `isCapitalized` is a set of indices of words in the `this.words` array * that are capitalized. * `symbolAfter` is a map of indices of words in the `this.words` array * to the symbol right after the word. 
* * The returned data structure optimizes generation of the text. */ static async parseFile(path) { try { let lineCount = 0; const lines = []; const res = { lines: [], words: [], }; const wordSet = new Set(); const file = await (0, promises_1.open)(path, "r"); // Read each line in the file and add it // to the word set to get all unique words. for await (const line of file.readLines()) { lines.push(line); const lineWords = line.split(" "); lineWords.forEach((word) => { const [w] = TextGenerator.stripPunctuation(word); if (TextGenerator.isCyrillicWord(TextGenerator.trimAllSymbols(w))) { wordSet.add(TextGenerator.trimAllSymbols(w).toLocaleLowerCase()); } }); } // Turn the word set into an array and sort it res.words = Array.from(wordSet).toSorted((a, b) => a.localeCompare(b)); // Go through each word in each line and add their index // to the lines array. for (const line of lines) { res.lines.push({ wordIndices: [], isCapitalized: new Set(), symbolAfter: new Map(), }); const lineWords = line.split(" "); lineWords.forEach((word) => { const [w, s] = TextGenerator.stripPunctuation(word); const wordWithoutPunctuation = TextGenerator.trimAllSymbols(w); if (TextGenerator.isCyrillicWord(wordWithoutPunctuation)) { // Find the word index in the `res.words` array. // Use binary search as the array is sorted. const i = TextGenerator.binarySearchCyrillic(res.words, wordWithoutPunctuation.toLocaleLowerCase()); if (i === -1) { return; } // Push the index of the word in the `res.words` array // to the lines array res.lines[lineCount].wordIndices.push(i); // check if the w (word) is capitalized if (TextGenerator.isCapitalized(w)) { res.lines[lineCount].isCapitalized.add(res.lines[lineCount].wordIndices.length - 1); } if (s) { res.lines[lineCount].symbolAfter.set(res.lines[lineCount].wordIndices.length - 1, s); } } }); lineCount++; } return res; } catch (err) { console.error(err); throw err; } } // Note: May be used for development and debugging purposes. DO NOT REMOVE! 
// private restoreText() { // let text = ""; // this.lines.forEach((line, lineIndex) => { // line.wordIndices.forEach((wordIndex, wiIndex) => { // const rawWord = this.words[wordIndex]; // if (line.isCapitalized.has(wiIndex)) { // text += TextGenerator.capitalize(rawWord); // } else { // text += rawWord; // } // const isSymbolAfter = line.symbolAfter.has(wiIndex); // if (isSymbolAfter) { // text += line.symbolAfter.get(wiIndex); // } // text += " "; // }); // text += "\n"; // }); // return text; // } /** * Create random text. * @param textLength Text length measured in words. * @param isSentences Create random-length sentences (by capitalizing first sentence words and adding punctuation symbols). * @param isParagraphs Create random-length paragraphs (by adding new line symbols). * * @public */ createRandomText(textLength = 10, isSentences = false, isParagraphs = false) { const outputArr = []; let randomParagraphWordLength = TextGenerator.randomNumberGen(200, 40); if (isSentences) { try { return this.createRandomSentences(textLength, isParagraphs); } catch (err) { console.error(err); throw err; } } while (outputArr.length < textLength) { const randomWordIndex = TextGenerator.randomNumberGen(this.words.length - 1); const randomWord = this.words[randomWordIndex]; outputArr.push(randomWord); if (isParagraphs) { if (!randomParagraphWordLength) { outputArr[outputArr.length - 1] = outputArr[outputArr.length - 1] + "\n"; randomParagraphWordLength = TextGenerator.randomNumberGen(200, 40); } else { randomParagraphWordLength--; } } } return outputArr.reduce(TextGenerator.joinTextOutputArray, ""); } /** * Generate coherent text of given length. 
* @param length - length of the output text * @returns a string of coherent text * Note: maximum length is 30000 words * * @public */ createText(length = 10) { if (length > 30000) { throw new Error("Maximum length is 30000 words"); } const outputArr = []; while (outputArr.length <= length - 1) { const randomLineIndex = TextGenerator.randomNumberGen(this.lines.length - 1); const randomLine = this.lines[randomLineIndex]; let wiIndex = 0; while (outputArr.length <= length - 1 && wiIndex <= randomLine.wordIndices.length - 1) { const wordIndex = randomLine.wordIndices[wiIndex]; const rawWord = this.words[wordIndex]; if (randomLine.isCapitalized.has(wiIndex)) { outputArr.push(TextGenerator.capitalize(rawWord)); } else { outputArr.push(rawWord); } const isSymbolAfter = randomLine.symbolAfter.has(wiIndex); if (isSymbolAfter) { outputArr[outputArr.length - 1] = outputArr[outputArr.length - 1] + randomLine.symbolAfter.get(wiIndex); } wiIndex++; } if (!/[.,!?;:]$/.test(outputArr[outputArr.length - 1])) { outputArr[outputArr.length - 1] = outputArr[outputArr.length - 1] + "."; } outputArr[outputArr.length - 1] = outputArr[outputArr.length - 1] + "\n"; } let res = outputArr.reduce(TextGenerator.joinTextOutputArray, ""); if (res.endsWith("\n")) { res = res.slice(0, -1); if (res.endsWith(" ") || res.endsWith(",") || res.endsWith(";") || res.endsWith(":")) { res = res.slice(0, -1); } res = res.slice(0, -1) + "."; } return res; } /** * Creates a string of random text of given length, consisting of sentences. * * @param textLength - length of the output text * @param isParagraphs - whether to split sentences into paragraphs (by adding "\n" symbols) * @returns a string of random text formed into a semblance of sentences. 
* * @private */ createRandomSentences(textLength, isParagraphs) { if (!textLength) { throw new Error("Text length must be greater than 0"); } if (typeof isParagraphs === "undefined") { throw new Error("Please define isParagraphs argument"); } const wordsLength = this.words.length; const endOfSentenceSymbols = [".", "?", "!"]; const punctuationSymbols = [ ".", ",", "?", ".", ",", ",", ",", ".", ".", ".", ",", ".", ".", ".", "!", ".", "?", ".", ";", ":", ",", ".", ",", ".", ",", "!", ",", ".", ]; let usedWordsCounter = 0; const sentences = []; let sentence = []; let prevPunctuationSymbol = "."; /** * Number of words in a sentence. */ while (usedWordsCounter < textLength) { const randomPunctuationSymbol = punctuationSymbols[TextGenerator.randomNumberGen(punctuationSymbols.length)]; sentence = endOfSentenceSymbols.includes(prevPunctuationSymbol) ? [] : sentence; let randomSentenceLength = TextGenerator.randomNumberGen(20, 3); while (randomSentenceLength && usedWordsCounter < textLength) { const randomWordIndex = TextGenerator.randomNumberGen(wordsLength - 1); const randomWord = this.words[randomWordIndex]; sentence.push(randomWord); // Loop guards usedWordsCounter++; randomSentenceLength--; } if (usedWordsCounter === textLength) { sentences.push(sentence.join(" ") + "."); break; } prevPunctuationSymbol = randomPunctuationSymbol; if (endOfSentenceSymbols.includes(randomPunctuationSymbol)) { sentences.push(sentence.join(" ") + randomPunctuationSymbol); } else { sentence[sentence.length - 1] = sentence[sentence.length - 1] + randomPunctuationSymbol; } } let splitPoints = isParagraphs ? TextGenerator.calculateParagraphSplitPoints(sentences.length) : []; return sentences .map((s, i) => { if (splitPoints.length && splitPoints.includes(i)) { s = s + "\n"; } return TextGenerator.capitalize(s); }) .reduce(TextGenerator.joinTextOutputArray, ""); } /** * Generates an array of paragraph split points. 
A split point is a point in * an array of sentences where a paragraph break is inserted. * The split points are chosen randomly to simulate the distribution of * paragraph breaks in a natural text. * The number of split points is limited by the number of sentences, and the * distance between split points is limited to 6. * The distance between split points is chosen randomly from the range [2, 6]. * @param numberOfSentences The number of sentences in the text. * @returns An array of paragraph split points. * * @static @private */ static calculateParagraphSplitPoints(numberOfSentences) { const paragraphSplitPoints = []; let prevSplitPoint = 0; while (numberOfSentences) { const randomPoint = TextGenerator.randomNumberGen(6, 2); if (randomPoint > numberOfSentences) { break; } const splitPoint = randomPoint + prevSplitPoint; paragraphSplitPoints.push(splitPoint); prevSplitPoint = splitPoint; numberOfSentences -= randomPoint; } return paragraphSplitPoints; } /** * Joins elements of a string array into a single string with spaces between them. * Appends a space after each element unless it ends with a newline character. * * @param acc - The accumulator string that collects the joined elements. * @param s - The current string element to be added to the accumulator. * @returns The updated accumulator string with the current element appended. * * @static @private */ static joinTextOutputArray(acc, s) { if (s.endsWith("\n")) { acc += s; } else { acc += s + " "; } return acc; } /** * Performs a binary search on a sorted array of Cyrillic strings to find the target string. * * @param arr - A sorted array of strings to search within. * @param target - The target string to search for in the array. * @returns The index of the target string if found, otherwise -1. * * Note: Uses `localeCompare()` for comparison to handle Cyrillic characters correctly. 
*/ static binarySearchCyrillic(arr, target) { let left = 0; let right = arr.length - 1; while (left <= right) { const mid = Math.floor((left + right) / 2); if (arr[mid].localeCompare(target) === 0) { return mid; } else if (arr[mid].localeCompare(target) > 0) { right = mid - 1; } else { left = mid + 1; } } return -1; } /** * Check if the given word is written in cyrillic alphabet. * @param {string} word - The word to check. * @returns {boolean} true if the word is cyrillic, false otherwise. * * @static @private */ static isCyrillicWord(word) { if (word === null || word.length === 0) { return false; } for (let char of word) { // Make sure the char is cyrillic if (char.charCodeAt(0) < 0x0400 || char.charCodeAt(0) > 0x04ff) { return false; } } return true; } /** * Returns the given string with the first letter capitalized. * @param {string} text - The string to capitalize. * @returns {string} The capitalized string. * * @static @private */ static capitalize(text) { return text.charAt(0).toUpperCase() + text.slice(1); } /** * Check if the given text is capitalized. * @param {string} text - The string to check. * @returns {boolean} true if the string is capitalized, false otherwise. * * @static @private */ static isCapitalized(text) { const firstLetter = text.charAt(0); return firstLetter === firstLetter.toUpperCase(); } /** * Generates a random number in the range [from, to). * @param to Upper limit of the range. * @param from Lower limit of the range. Defaults to 0. * @returns A random number in the range [from, to). * * @private */ static randomNumberGen(to, from = 0) { let randomNumber = Math.floor(Math.random() * to); if (randomNumber < from) { randomNumber = from; } return randomNumber; } /** * Returns the given string with the last punctuation symbol removed and * the symbol itself as the second element of the returned array. * If the string does not end with a punctuation symbol, * the second element of the returned array is `null`. 
* @param {string} text - The string to strip of punctuation. * @returns {Array.<string, string | null>} An array containing the string * without the last punctuation symbol and the symbol itself. * @static @private */ static stripPunctuation(text) { const replacedSymbolArr = text.match(/[.,!?;:]$/g); const replacedSymbol = replacedSymbolArr?.length ? replacedSymbolArr[0] : null; return replacedSymbol ? [text.replace(/[.,!?;:]$/g, ""), replacedSymbol] : [text, null]; } /** * Removes all punctuation symbols from the start and end of the given text. * @param {string} text - The text to trim. * @returns {string} The trimmed string. * * @static @private */ static trimAllSymbols(text) { const t = text.replace(/[.,!?;:*%\-_[\]{}()0-9"']$/g, ""); return t.replace(/^[.,!?;:*%\-_[\]{}()0-9"']/g, ""); } } exports.default = TextGenerator;