UNPKG

markov-strings

Version:
197 lines (196 loc) 8.18 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const lodash_es_1 = require("lodash-es");

/**
 * Picks one element of `array` using the supplied random source.
 *
 * @param {Array} array Candidates to choose from; `null`/`undefined` is treated as empty.
 * @param {Function} [prng=Math.random] Function whose return value is multiplied by the
 *   array length and floored to obtain the index.
 * @returns {*} The chosen element, or `undefined` when there is nothing to choose from.
 */
function sampleWithPRNG(array, prng = Math.random) {
    const length = array == null ? 0 : array.length;
    return length ? array[Math.floor(prng() * length)] : undefined;
}

/**
 * Own-property test that does not depend on the target's own `hasOwnProperty`.
 *
 * The corpus is a plain object keyed by user-supplied text, so a key named
 * "hasOwnProperty" would shadow the inherited method and break direct calls.
 *
 * @param {*} object Any non-nullish value.
 * @param {string} key Property name to look up.
 * @returns {boolean} `true` when `key` is an own property of `object`.
 */
function hasOwn(object, key) {
    return Object.prototype.hasOwnProperty.call(object, key);
}

class Markov {
    /**
     * Creates an instance of Markov generator.
     *
     * @param {MarkovConstructorOptions} [options={}] Overrides for the defaults (`stateSize: 2`).
     * @memberof Markov
     */
    constructor(options = {}) {
        this.startWords = [];
        this.endWords = [];
        this.corpus = {};
        this.defaultOptions = {
            stateSize: 2,
        };
        this.data = [];
        // Save options. Merge into a fresh object so `defaultOptions` keeps the
        // pristine defaults instead of being overwritten by the caller's values.
        this.options = (0, lodash_es_1.assignIn)({}, this.defaultOptions, options);
    }

    /**
     * Imports a corpus. This overwrites existing options, corpus, start words
     * and end words with deep copies of the ones found in `data`.
     *
     * @param data Object with `options`, `corpus`, `startWords` and `endWords` properties.
     */
    import(data) {
        this.options = (0, lodash_es_1.cloneDeep)(data.options);
        this.corpus = (0, lodash_es_1.cloneDeep)(data.corpus);
        this.startWords = (0, lodash_es_1.cloneDeep)(data.startWords);
        this.endWords = (0, lodash_es_1.cloneDeep)(data.endWords);
    }

    /**
     * Exports the structured data used to generate sentences.
     *
     * @returns A deep copy of `{ options, corpus, startWords, endWords }`.
     */
    export() {
        return (0, lodash_es_1.cloneDeep)({
            options: this.options,
            corpus: this.corpus,
            startWords: this.startWords,
            endWords: this.endWords,
        });
    }

    /**
     * Adds sentences to the generator and updates the corpus accordingly.
     *
     * The shape of the whole batch is decided from its first element: either
     * plain strings, or objects carrying a `string` property.
     *
     * @param rawData Array of strings, or array of objects with a `string` property.
     * @throws {Error} When the first element is neither a string nor an object
     *   owning a `string` property.
     */
    addData(rawData) {
        // An empty batch carries nothing to learn from.
        if (rawData.length === 0) {
            return;
        }
        // Format data if necessary
        const first = rawData[0];
        let input = [];
        if ((0, lodash_es_1.isString)(first)) {
            input = rawData.map((s) => ({ string: s }));
        }
        else if (first != null && hasOwn(first, 'string')) {
            input = rawData;
        }
        else {
            throw new Error('Objects in your corpus must have a "string" property');
        }
        this.buildCorpus(input);
        this.data = this.data.concat(input);
    }

    /**
     * Builds the corpus. You must call this before generating sentences
     * (`addData` does so for you).
     *
     * For every item this records its first `stateSize` words as a possible
     * start, its last `stateSize` words as a possible end, and every
     * "block -> following block" transition found in between.
     *
     * @param data Array of objects with a `string` property.
     * @memberof Markov
     */
    buildCorpus(data) {
        const options = this.options;
        // Loop through all sentences
        data.forEach((item) => {
            const line = item.string;
            const words = line.split(' ');
            const stateSize = options.stateSize; // Default value of 2 is set in the constructor

            //#region Start words
            // "Start words" is the list of words that can start a generated chain.
            const start = (0, lodash_es_1.slice)(words, 0, stateSize).join(' ');
            const oldStartObj = this.startWords.find((o) => o.words === start);
            // If we already have identical startWords
            if (oldStartObj) {
                // If the current item is not present in the references, add it
                if (!(0, lodash_es_1.includes)(oldStartObj.refs, item)) {
                    oldStartObj.refs.push(item);
                }
            }
            else {
                // Add the startWords (and reference) to the list
                this.startWords.push({ words: start, refs: [item] });
            }
            //#endregion Start words

            //#region End words
            // "End words" is the list of words that can end a generated chain.
            const end = (0, lodash_es_1.slice)(words, words.length - stateSize, words.length).join(' ');
            const oldEndObj = this.endWords.find((o) => o.words === end);
            if (oldEndObj) {
                if (!(0, lodash_es_1.includes)(oldEndObj.refs, item)) {
                    oldEndObj.refs.push(item);
                }
            }
            else {
                this.endWords.push({ words: end, refs: [item] });
            }
            //#endregion End words

            //#region Corpus generation
            // We loop through all words in the sentence to build "blocks" of `stateSize`
            // e.g. for a stateSize of 2, "lorem ipsum dolor sit amet" will have the following blocks:
            //    "lorem ipsum", "ipsum dolor", "dolor sit", and "sit amet"
            for (let i = 0; i < words.length - 1; i++) {
                const curr = (0, lodash_es_1.slice)(words, i, i + stateSize).join(' ');
                const next = (0, lodash_es_1.slice)(words, i + stateSize, i + stateSize * 2).join(' ');
                // Skip transitions whose "next" block is missing or shorter than stateSize
                if (!next || next.split(' ').length !== options.stateSize) {
                    continue;
                }
                // Check if the corpus already has a corresponding "curr" block.
                // The lookup goes through Object.prototype because `curr` is
                // arbitrary text and may itself be the key "hasOwnProperty".
                if (hasOwn(this.corpus, curr)) {
                    const oldObj = this.corpus[curr].find((o) => o.words === next);
                    if (oldObj) {
                        // If the corpus already has the chain "curr -> next",
                        // just add the current reference for this block
                        oldObj.refs.push(item);
                    }
                    else {
                        // Add the new "next" block in the list of possible paths for "curr"
                        this.corpus[curr].push({ words: next, refs: [item] });
                    }
                }
                else {
                    // Add the "curr" block and link it with the "next" one
                    this.corpus[curr] = [{ words: next, refs: [item] }];
                }
            }
            //#endregion Corpus generation
        });
    }

    /**
     * Generates a result, that contains a string and its references
     *
     * @param {MarkovGenerateOptions} [options={}] Supports `maxTries` (default 10),
     *   `prng` (default `Math.random`) and an optional `filter(result)` predicate.
     * @returns {MarkovResult} `{ string, score, refs, tries }`
     * @throws {Error} When the corpus is empty, or when no acceptable sentence
     *   was produced within `maxTries` attempts.
     * @memberof Markov
     */
    generate(options = {}) {
        if ((0, lodash_es_1.isEmpty)(this.corpus)) {
            throw new Error('Corpus is empty. There is either no data, or the data is not sufficient to create markov chains.');
        }
        const corpus = (0, lodash_es_1.cloneDeep)(this.corpus);
        const maxTries = options.maxTries ? options.maxTries : 10;
        const prng = options.prng ? options.prng : Math.random;
        let tries;
        // We loop through fragments to create a complete sentence
        for (tries = 1; tries <= maxTries; tries++) {
            let ended = false;
            // Create an array of MarkovCorpusItems
            // The first item is a random startWords element
            const arr = [sampleWithPRNG(this.startWords, prng)];
            let score = 0;
            // loop to build a complete sentence
            // (the number of appended blocks is capped by maxTries as well)
            for (let innerTries = 0; innerTries < maxTries; innerTries++) {
                const block = arr[arr.length - 1]; // last value in array
                const state = sampleWithPRNG(corpus[block.words], prng); // Find a following item in the corpus
                // If a state cannot be found, the sentence can't be completed
                if (!state) {
                    break;
                }
                // add new state to list
                arr.push(state);
                // increment score: each step adds the number of alternative paths it had
                score += corpus[block.words].length - 1;
                // is sentence finished?
                if ((0, lodash_es_1.some)(this.endWords, { words: state.words })) {
                    ended = true;
                    break;
                }
            }
            const sentence = arr
                .map((o) => o.words)
                .join(' ')
                .trim();
            const result = {
                string: sentence,
                score,
                refs: (0, lodash_es_1.uniqBy)((0, lodash_es_1.flatten)(arr.map((o) => o.refs)), 'string'),
                tries,
            };
            // sentence is not ended or incorrect
            if (!ended || (typeof options.filter === 'function' && !options.filter(result))) {
                continue;
            }
            return result;
        }
        throw new Error(`Failed to build a sentence after ${tries - 1} tries. Possible solutions: try a less restrictive filter(), give more raw data to the corpus builder, or increase the number of maximum tries.`);
    }
}
exports.default = Markov;