markov-strings
Version:
A Markov string generator
197 lines (196 loc) • 8.18 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const lodash_es_1 = require("lodash-es");
/**
 * Picks one element of `array` using the supplied random source.
 *
 * @param {Array} array Values to pick from; `null`/`undefined` are treated as empty.
 * @param {function(): number} [prng=Math.random] Random source, expected to return a number in [0, 1).
 * @returns {*} The selected element, or `undefined` when there is nothing to pick from.
 */
function sampleWithPRNG(array, prng = Math.random) {
    const length = array == null ? 0 : array.length;
    if (!length) {
        // Empty input: the PRNG is intentionally not consumed
        return undefined;
    }
    // A custom PRNG returning exactly 1 would index one past the end of the array;
    // clamp so that a non-empty array always yields one of its elements.
    const index = Math.min(Math.floor(prng() * length), length - 1);
    return array[index];
}
class Markov {
    /**
     * Creates an instance of Markov generator.
     *
     * @param {MarkovConstructorOptions} [options={}] Values merged on top of
     * `defaultOptions` (`{ stateSize: 2 }`).
     * @memberof Markov
     */
    constructor(options = {}) {
        this.startWords = [];
        this.endWords = [];
        this.corpus = {};
        this.defaultOptions = {
            stateSize: 2,
        };
        this.data = [];
        // Merge into a fresh object so that `defaultOptions` itself is never mutated
        this.options = (0, lodash_es_1.assignIn)({}, this.defaultOptions, options);
    }
    /**
     * Imports a corpus. This replaces the current options, corpus, start words
     * and end words with deep copies of the ones found in `data`.
     *
     * @param data Object with `options`, `corpus`, `startWords` and `endWords`,
     * as produced by `export()`
     */
    import(data) {
        this.options = (0, lodash_es_1.cloneDeep)(data.options);
        this.corpus = (0, lodash_es_1.cloneDeep)(data.corpus);
        this.startWords = (0, lodash_es_1.cloneDeep)(data.startWords);
        this.endWords = (0, lodash_es_1.cloneDeep)(data.endWords);
    }
    /**
     * Exports the structured data used to generate sentences.
     *
     * @returns A deep copy of `{ options, corpus, startWords, endWords }`
     */
    export() {
        return (0, lodash_es_1.cloneDeep)({
            options: this.options,
            corpus: this.corpus,
            startWords: this.startWords,
            endWords: this.endWords,
        });
    }
    /**
     * Adds sentences to the generator and indexes them into the corpus.
     *
     * The input shape is detected from the first element only: either an array
     * of strings, or an array of objects that carry a `string` property.
     * An empty array is a no-op.
     *
     * @param rawData Array of strings or of `{ string }` objects
     * @throws {Error} If the first element is neither a string nor an object
     * with a "string" property
     */
    addData(rawData) {
        // Nothing to add; inspecting rawData[0] on an empty array used to crash
        if (rawData.length === 0) {
            return;
        }
        const first = rawData[0];
        // Format data if necessary
        let input = [];
        if ((0, lodash_es_1.isString)(first)) {
            input = rawData.map((s) => ({ string: s }));
        }
        else if (first != null && Object.prototype.hasOwnProperty.call(first, 'string')) {
            input = rawData;
        }
        else {
            throw new Error('Objects in your corpus must have a "string" property');
        }
        this.buildCorpus(input);
        this.data = this.data.concat(input);
    }
    /**
     * Indexes the given items into `startWords`, `endWords` and `corpus`.
     * `addData()` calls this for every batch it receives.
     *
     * @param data Array of objects with a `string` property
     * @memberof Markov
     */
    buildCorpus(data) {
        const options = this.options;
        const stateSize = options.stateSize; // Default value of 2 is set in the constructor
        // Loop through all sentences
        data.forEach((item) => {
            const line = item.string;
            const words = line.split(' ');
            //#region Start words
            // "Start words" is the list of words that can start a generated chain.
            const start = (0, lodash_es_1.slice)(words, 0, stateSize).join(' ');
            const oldStartObj = this.startWords.find((o) => o.words === start);
            // If we already have identical startWords
            if (oldStartObj) {
                // If the current item is not present in the references, add it
                if (!(0, lodash_es_1.includes)(oldStartObj.refs, item)) {
                    oldStartObj.refs.push(item);
                }
            }
            else {
                // Add the startWords (and reference) to the list
                this.startWords.push({ words: start, refs: [item] });
            }
            //#endregion Start words
            //#region End words
            // "End words" is the list of words that can end a generated chain.
            const end = (0, lodash_es_1.slice)(words, words.length - stateSize, words.length).join(' ');
            const oldEndObj = this.endWords.find((o) => o.words === end);
            if (oldEndObj) {
                if (!(0, lodash_es_1.includes)(oldEndObj.refs, item)) {
                    oldEndObj.refs.push(item);
                }
            }
            else {
                this.endWords.push({ words: end, refs: [item] });
            }
            //#endregion End words
            //#region Corpus generation
            // We loop through all words in the sentence to build "blocks" of `stateSize`
            // e.g. for a stateSize of 2, "lorem ipsum dolor sit amet" will have the following blocks:
            //    "lorem ipsum", "ipsum dolor", "dolor sit", and "sit amet"
            for (let i = 0; i < words.length - 1; i++) {
                const curr = (0, lodash_es_1.slice)(words, i, i + stateSize).join(' ');
                const next = (0, lodash_es_1.slice)(words, i + stateSize, i + stateSize * 2).join(' ');
                // Skip incomplete trailing blocks: "next" must hold exactly `stateSize` words
                if (!next || next.split(' ').length !== options.stateSize) {
                    continue;
                }
                // Check if the corpus already has a corresponding "curr" block.
                // The check goes through Object.prototype because corpus keys are
                // arbitrary text: a block named "hasOwnProperty" would otherwise
                // shadow the method and make the next lookup throw.
                if (Object.prototype.hasOwnProperty.call(this.corpus, curr)) {
                    const oldObj = this.corpus[curr].find((o) => o.words === next);
                    if (oldObj) {
                        // If the corpus already has the chain "curr -> next",
                        // just add the current reference for this block
                        // (generate() de-duplicates references by their `string`)
                        oldObj.refs.push(item);
                    }
                    else {
                        // Add the new "next" block in the list of possible paths for "curr"
                        this.corpus[curr].push({ words: next, refs: [item] });
                    }
                }
                else {
                    // Add the "curr" block and link it with the "next" one
                    this.corpus[curr] = [{ words: next, refs: [item] }];
                }
            }
            //#endregion Corpus generation
        });
    }
    /**
     * Generates a result, that contains a string and its references
     *
     * Supported options, as read by this method:
     * - `maxTries`: number of attempts, and also the maximum number of blocks
     *   appended per attempt (falls back to 10 when falsy)
     * - `prng`: random source used instead of `Math.random`
     * - `filter`: function receiving the candidate result; a falsy return
     *   value rejects it and triggers another attempt
     *
     * @param {MarkovGenerateOptions} [options={}]
     * @returns {MarkovResult} `{ string, score, refs, tries }`
     * @throws {Error} If the corpus is empty, or if no acceptable sentence
     * could be built within `maxTries` attempts
     * @memberof Markov
     */
    generate(options = {}) {
        if ((0, lodash_es_1.isEmpty)(this.corpus)) {
            throw new Error('Corpus is empty. There is either no data, or the data is not sufficient to create markov chains.');
        }
        const corpus = (0, lodash_es_1.cloneDeep)(this.corpus);
        const maxTries = options.maxTries ? options.maxTries : 10;
        const prng = options.prng ? options.prng : Math.random;
        let tries;
        // We loop through fragments to create a complete sentence
        for (tries = 1; tries <= maxTries; tries++) {
            let ended = false;
            // Create an array of MarkovCorpusItems
            // The first item is a random startWords element
            const arr = [sampleWithPRNG(this.startWords, prng)];
            let score = 0;
            // loop to build a complete sentence
            for (let innerTries = 0; innerTries < maxTries; innerTries++) {
                const block = arr[arr.length - 1]; // last value in array
                // Only own keys are valid chains; inherited Object.prototype
                // members (e.g. "constructor") must not be sampled from
                const candidates = Object.prototype.hasOwnProperty.call(corpus, block.words)
                    ? corpus[block.words]
                    : undefined;
                const state = sampleWithPRNG(candidates, prng); // Find a following item in the corpus
                // If a state cannot be found, the sentence can't be completed
                if (!state) {
                    break;
                }
                // add new state to list
                arr.push(state);
                // increment score by the number of alternative paths that were available
                score += candidates.length - 1;
                // is sentence finished?
                if ((0, lodash_es_1.some)(this.endWords, { words: state.words })) {
                    ended = true;
                    break;
                }
            }
            const sentence = arr
                .map((o) => o.words)
                .join(' ')
                .trim();
            const result = {
                string: sentence,
                score,
                refs: (0, lodash_es_1.uniqBy)((0, lodash_es_1.flatten)(arr.map((o) => o.refs)), 'string'),
                tries,
            };
            // sentence is not ended or incorrect
            if (!ended ||
                (typeof options.filter === 'function' && !options.filter(result))) {
                continue;
            }
            return result;
        }
        throw new Error(`Failed to build a sentence after ${tries - 1} tries. Possible solutions: try a less restrictive filter(), give more raw data to the corpus builder, or increase the number of maximum tries.`);
    }
}
// Expose the Markov class as the module's default export
exports.default = Markov;