UNPKG

languagemodel

Version:

A natural language model and cross-language model, for natural language understanding and generation

erelsgl/languagemodel

179 lines (150 loc) • 6.32 kB

JavaScript

var util = require("util");
var logSumExp = require('./logSumExp');

/** Reserved key under which a word-count hash stores its total number of words. */
var TOTAL_KEY = "_total";

/**
 * Own-property test. Unlike the `in` operator it does not match names inherited
 * from Object.prototype, so words such as "constructor" or "toString" are safe.
 */
var hasOwn = function(object, key) {
	return Object.prototype.hasOwnProperty.call(object, key);
};

/**
 * This class represents a simple, unigram-based language model.
 * Based on:
 *
 * Leuski Anton, Traum David. A Statistical Approach for Text Processing in Virtual Humans tech. rep.
 * University of Southern California, Institute for Creative Technologies 2008.
 * http://www.citeulike.org/user/erelsegal-halevi/article/12540655
 *
 * @author Erel Segal-Halevi
 * @since 2013-08
 *
 * @param opts - optional; may contain the following options:
 *  * smoothingCoefficient - the lambda-factor for smoothing the unigram probabilities (default 0.9).
 */
var LanguageModel = function(opts) {
	var coefficient = opts ? opts.smoothingCoefficient : undefined;
	// An explicit 0 is kept as given; every other falsy value
	// (missing, null, NaN, ...) falls back to the default, as before.
	this.smoothingCoefficient = (coefficient === 0 || coefficient) ? coefficient : 0.9;
};

LanguageModel.prototype = {

	/**
	 * Train the language model with all the given documents.
	 *
	 * Side effect: each datum gets a "_total" property holding its number of words;
	 * logProbWordGivenSentence reads that property when it is present.
	 *
	 * @param dataset
	 *      an array with hashes of the format:
	 *      {word1:count1, word2:count2,...}
	 *      each object represents a sentence (it should be tokenized in advance).
	 */
	trainBatch: function(dataset) {
		// calculate counts for equation (3):
		var mapWordToTotalCount = {};
		var totalNumberOfWordsInDataset = 0;
		dataset.forEach(function(datum) {
			// for each input sentence, count the total number of words in it:
			var totalPerDatum = 0;
			Object.keys(datum).forEach(function(word) {
				// "_total" is bookkeeping, not a word; skipping it keeps retraining
				// on an already-annotated dataset from double-counting.
				if (word === TOTAL_KEY) return;
				// Plain initialisation instead of `|= 0`, which would truncate
				// the running count to a 32-bit integer on every visit.
				if (!hasOwn(mapWordToTotalCount, word)) mapWordToTotalCount[word] = 0;
				mapWordToTotalCount[word] += datum[word];
				totalPerDatum += datum[word];
			});
			datum[TOTAL_KEY] = totalPerDatum;
			totalNumberOfWordsInDataset += totalPerDatum;
		});
		mapWordToTotalCount[TOTAL_KEY] = totalNumberOfWordsInDataset;

		this.dataset = dataset;
		this.mapWordToTotalCount = mapWordToTotalCount;

		// calculate smoothing factor for equation (3):
		var smoothingCoefficient = this.smoothingCoefficient;
		var mapWordToSmoothingFactor = {};
		Object.keys(mapWordToTotalCount).forEach(function(word) {
			mapWordToSmoothingFactor[word] =
				(1 - smoothingCoefficient) * mapWordToTotalCount[word] / mapWordToTotalCount[TOTAL_KEY];
		});
		this.mapWordToSmoothingFactor = mapWordToSmoothingFactor;

		// a global smoother, for totally unseen words.
		this.globalSmoothingFactor = (1 / totalNumberOfWordsInDataset);
	},

	/**
	 * @return the map of all words in the training Dataset, each word with its total count in the Dataset.
	 *         The map also holds the grand total under the "_total" key.
	 */
	getAllWordCounts: function() {
		return this.mapWordToTotalCount;
	},

	/**
	 * @param sentenceCounts a hash {word1: count1, word2: count2, ... "_total": totalCount}, representing a sentence.
	 * @return the log-probability of that sentence, given the model built from the Dataset.
	 * @throws Error if the model has no dataset yet (neither trainBatch nor fromJSON was called).
	 */
	logProbSentenceGivenDataset: function(sentenceCounts) { // (2) log P(w1...wn) = ...
		if (!this.dataset)
			throw new Error("the model has no dataset - call trainBatch or fromJSON first");
		var logProducts = [];
		this.dataset.forEach(function(datum) {
			logProducts.push(this.logProbSentenceGivenSentence(sentenceCounts, datum));
		}, this);
		var logSentenceLikelihood = logSumExp(logProducts);
		// The last element is not needed in practice (see eq. (5))
		return logSentenceLikelihood - Math.log(this.dataset.length);
	},

	/**
	 * @param sentenceCounts a hash {word1: count1, word2: count2, ... "_total": totalCount}, representing a sentence.
	 * @param givenSentenceCounts a hash {word1: count1, word2: count2, ... "_total": totalCount}, representing a sentence.
	 * @return the (smoothed) log product probabilities that the words in sentenceCounts appear in the givenSentenceCounts.
	 */
	logProbSentenceGivenSentence: function(sentenceCounts, givenSentenceCounts) {
		var logProduct = 0;
		Object.keys(sentenceCounts).forEach(function(word) {
			// The optional "_total" entry is metadata, not a word to be scored.
			if (word === TOTAL_KEY) return;
			logProduct += sentenceCounts[word] * this.logProbWordGivenSentence(word, givenSentenceCounts);
		}, this);
		return logProduct;
	},

	/**
	 * @param word a word from the INPUT domain.
	 * @param givenSentenceCounts a hash {word1: count1, word2: count2, ... "_total": totalCount}, representing a sentence.
	 * @return the LOG of the (smoothed) probability that the word appears in the sentence.
	 * @throws Error if givenSentenceCounts is not an object, or if the probability is not a number.
	 */
	logProbWordGivenSentence: function(word, givenSentenceCounts) { // (3) p_s(w) =~ pi_s(w) = ...
		if (givenSentenceCounts !== Object(givenSentenceCounts))
			throw new Error("expected givenSentenceCounts to be an object, but found " + JSON.stringify(givenSentenceCounts));

		// Use the precomputed total when available; otherwise sum the counts.
		var totalGivenSentenceCounts = (hasOwn(givenSentenceCounts, TOTAL_KEY) ?
			givenSentenceCounts[TOTAL_KEY] :
			Object.keys(givenSentenceCounts).reduce(function(memo, key) {
				return memo + givenSentenceCounts[key];
			}, 0));

		// Sentence-specific part: non-zero only when the word occurs in the given sentence.
		var sentencePart = (hasOwn(givenSentenceCounts, word) ?
			this.smoothingCoefficient * givenSentenceCounts[word] / totalGivenSentenceCounts :
			0);
		// Dataset-wide part: non-zero only for words seen in training.
		var datasetPart = (hasOwn(this.mapWordToSmoothingFactor, word) ?
			this.mapWordToSmoothingFactor[word] :
			0);
		var prob = sentencePart + datasetPart + this.globalSmoothingFactor;

		if (isNaN(prob)) {
			console.log(util.inspect(this, {depth: 3}));
			throw new Error("logProbWordGivenSentence(" + word + ", " + JSON.stringify(givenSentenceCounts) + ") is NaN!");
		}
		return Math.log(prob);
	},

	/**
	 * @return a plain object with the trained state of the model, suitable for JSON.stringify
	 *         and for feeding back into fromJSON.
	 */
	toJSON: function() {
		return {
			// Kept with the trained state: the smoothing factors below were computed with
			// this coefficient, so a restored model must score with the same one.
			smoothingCoefficient: this.smoothingCoefficient,
			mapWordToTotalCount: this.mapWordToTotalCount,
			mapWordToSmoothingFactor: this.mapWordToSmoothingFactor,
			globalSmoothingFactor: this.globalSmoothingFactor,
			dataset: this.dataset,
		};
	},

	/**
	 * Restore the trained state of the model.
	 *
	 * @param json an object in the format returned by toJSON. "smoothingCoefficient" is optional,
	 *        so objects that lack it are accepted; the current coefficient is kept in that case.
	 * @return this model.
	 * @throws Error if json is not an object.
	 */
	fromJSON: function(json) {
		if (json !== Object(json))
			throw new Error("expected json to be an object, but found " + JSON.stringify(json));
		if (hasOwn(json, "smoothingCoefficient") && json.smoothingCoefficient !== undefined)
			this.smoothingCoefficient = json.smoothingCoefficient;
		this.mapWordToTotalCount = json.mapWordToTotalCount;
		this.mapWordToSmoothingFactor = json.mapWordToSmoothingFactor;
		this.globalSmoothingFactor = json.globalSmoothingFactor;
		this.dataset = json.dataset;
		return this;
	},
};

module.exports = LanguageModel;

// Demo: runs only when this file is executed directly, not when it is required.
if (require.main === module) {
	console.log("LanguageModel.js demo start");

	var model = new LanguageModel({
		smoothingCoefficient : 0.9,
	});

	var wordcounts = require('./wordcounts');

	model.trainBatch([
		wordcounts("I want aa"),
		wordcounts("I want bb"),
		wordcounts("I want cc")
	]);

	// Warn when the model's probability differs from the expected value by more than 1%.
	var assertProbSentence = function(sentence, expected) {
		var p = Math.exp(model.logProbSentenceGivenDataset(wordcounts(sentence)));
		if (Math.abs(p - expected) / expected > 0.01) {
			console.warn("p(" + sentence + ") = " + p, " should be " + expected);
		}
	};

	assertProbSentence("I", 1/3);
	assertProbSentence("I want", 1/9);
	assertProbSentence("I want aa", 0.0123456);
	assertProbSentence("I want aa bb", 0.00026);
	assertProbSentence("I want aa bb cc", 0.00000427);

	console.log("LanguageModel.js demo end");
}