UNPKG

nlpsum

Version:

Powerful text summarization algorithms from research papers and dedicated research.

701 lines (569 loc) 19.8 kB
// Underscore
var _ = require('underscore');

// Natural
var natural = require('natural');
var wordnet = new natural.WordNet();

// Spencer Kelly's nlp-node libs
var sentenceParser = require('./nlp-node-master/sentence_parser/sentence');
var dateExtractor = require('./nlp-node-master/date_parser/date_extractor');

// Fortnight Lab's libs
var pos = require('pos');
var glossary = require("glossary")({
    collapse: true
});

// Constructor
function nlpsum() {}

// Debug helper: looks up the hard-coded word 'node' in WordNet and logs each result.
// The `text` argument is accepted but never read.
nlpsum.prototype.test = function(text) {
    wordnet.lookup('node', function(results) {
        results.forEach(function(result) {
            console.log('------------------------------------');
            console.log(result.synsetOffset);
            console.log(result.pos);
            console.log(result.lemma);
            console.log(result.synonyms);
            console.log(result.pos);
            console.log(result.gloss);
        });
    });
};

// Extract the keywords of a text with the glossary module.
nlpsum.prototype.keywords = function(text) {
    return glossary.extract(text);
};

// Generate a summary based on the fractal document theory by Christopher C. Yang & Fu Lee Wang (Chinese University of Hong Kong - see "fractal summarization.pdf")
//
// text:  the document to summarize. Paragraphs are separated by '\r\n\r\n'.
//        NOTE(review): a text using bare '\n' line endings ends up as a single paragraph - confirm this is intended.
// quota: number of sentences wanted; it is distributed over the paragraphs
//        proportionally to their weight and rounded per paragraph, so the
//        returned quota_sum may differ from it.
// Returns { data, text, s_weight, ns_weight, np_weight, quota_sum } where `text`
// is the kept sentences joined with '.\r\n'.
nlpsum.prototype.fractalSummary = function(text, quota) {
    var scope = this;
    var summary = [];

    // Sum of the values of a { token: weight } map
    var sumWeights = function(weights) {
        return _.reduce(_.values(weights), function(total, n) {
            return total + n;
        }, 0);
    };

    var fractal = {
        paragraphs: [],
        weights: scope.tf(text).frequency
    };

    // Split in paragraphs
    var paragraphs = text.split('\r\n\r\n');

    // Total weights of all the sentences in reference to the document. Will be > 1. Used to normalize to the range ]0;1].
    var s_weight = 0;

    _.each(paragraphs, function(paragraph) {
        // Temp object we'll push
        var buffer_p = {
            paragraph: paragraph,
            tokens: scope.tokenize(paragraph),
            weights: {
                words: scope.tf(paragraph).frequency,
                total: {
                    words: 0,
                    sentences: 0
                }
            },
            sentences: []
        };

        // Split in sentences
        var sentences = scope.joinQuotes(sentenceParser(paragraph));
        _.each(sentences, function(sentence) {
            var buffer_s = {
                sentence: sentence,
                tokens: scope.tokenize(sentence)
            };

            // Sentences without any meaningful token are ignored
            if (buffer_s.tokens.length === 0) {
                return;
            }

            buffer_s.weights = {
                sentence: scope.tf(sentence).frequency,
                // The paragraph's word weights live under `.words`; the parent
                // object also holds the `total` bookkeeping, which is not a weight map.
                paragraph: scope.reltf(buffer_s.tokens, buffer_p.weights.words),
                document: scope.reltf(buffer_s.tokens, fractal.weights)
            };
            buffer_s.weights.total = {
                sentence: 1, // always, logical
                paragraph: sumWeights(buffer_s.weights.paragraph),
                document: sumWeights(buffer_s.weights.document)
            };

            s_weight += buffer_s.weights.total.document;
            buffer_p.weights.total.sentences += buffer_s.weights.total.document;
            buffer_p.sentences.push(buffer_s);
        });

        // Calculate the total weight of the words in that paragraph
        buffer_p.weights.total.words = sumWeights(buffer_p.weights.words);

        fractal.paragraphs.push(buffer_p);
    });

    // Now we normalize the sentence weights, using the sum 's_weight'
    var ns_weight = 0;
    var np_weight = 0;
    var quota_sum = 0;

    scope.each(fractal.paragraphs, function(paragraph) {
        // Normalize the paragraph weight relative to the document ]0;1].
        // s_weight is 0 only when no sentence produced a token; use 0 rather than 0/0 = NaN.
        paragraph.weights.total.normalized = s_weight > 0 ? paragraph.weights.total.sentences/s_weight : 0;
        np_weight += paragraph.weights.total.normalized;

        // Calculate the quota for that paragraph
        paragraph.quota = Math.round(quota*paragraph.weights.total.normalized);
        quota_sum += paragraph.quota;

        // Now that we have the quota per paragraph, we rank its sentences by normalized weight
        var sentences_sorted = [];
        var index = 0;
        scope.each(paragraph.sentences, function(sentence) {
            // Normalize the sentence weight relative to the document ]0;1]
            sentence.weights.total.sentence_normalized = sentence.weights.total.document/s_weight;
            ns_weight += sentence.weights.total.sentence_normalized;

            sentences_sorted.push({
                weight: sentence.weights.total.sentence_normalized,
                text: sentence.sentence,
                index: index
            });
            index++;
        });

        // Sort the sentences by weight, heaviest first
        paragraph.sentences_sorted = sentences_sorted.sort(function(a, b) {
            return b.weight-a.weight;
        });

        // Keep only our quota of sentences, sorted by index asc (to be in the right order)
        var kept = paragraph.sentences_sorted.slice(0, paragraph.quota).sort(function(a, b) {
            return a.index-b.index;
        });
        paragraph.sentences_keep = _.map(kept, function(item) {
            return item.text;
        });

        _.each(paragraph.sentences_keep, function(sentence) {
            summary.push(sentence.trim());
        });
    });

    return {
        data: fractal,
        text: summary.join('.\r\n'),
        s_weight: s_weight,
        ns_weight: ns_weight,
        np_weight: np_weight,
        quota_sum: quota_sum
    };
};

// Summarization using the word frequency. Pretty straightforward algorithm.
// text:  the document to summarize.
// quota: number of sentences to keep (the heaviest ones).
// Returns { data, text }: `text` is the kept sentences, in document order,
// joined with '\n'; `data` exposes every intermediate stage
// (sentences -> ordered -> keep -> indexed -> summary).
nlpsum.prototype.wordFrequencySummary = function(text, quota) {
    var scope = this;

    // Sum of the values of a { token: weight } map
    var sumWeights = function(weights) {
        return Object.keys(weights).reduce(function(total, token) {
            return total + weights[token];
        }, 0);
    };

    var structure = {
        // One entry per sentence, remembering its position in the document
        sentences: this.joinQuotes(sentenceParser(text)).map(function(sentence, index) {
            return {
                text: sentence,
                index: index,
                data: {}
            };
        }),
        ordered: [],
        weights: {
            words: scope.tf(text).frequency,
            total: 0
        }
    };

    // Checking if the sum of all weights are equal to 1 (or really really near)
    structure.weights.total = sumWeights(structure.weights.words);

    // Calculate the relative weight of each sentence
    this.each(structure.sentences, function(sentence) {
        sentence.data.tokens = scope.tokenize(sentence.text);
        sentence.data.weights = {
            words: scope.reltf(sentence.data.tokens, structure.weights.words),
            total: 0
        };
        sentence.data.weights.total = sumWeights(sentence.data.weights.words);
    });

    // Now we sort the sentences by weight.
    // sort() works in place, so sort a copy: `sentences` keeps the document order.
    structure.ordered = structure.sentences.slice().sort(function(a, b) {
        return b.data.weights.total-a.data.weights.total;
    });

    // We only keep the top N (=quota)
    structure.keep = structure.ordered.slice(0, quota);

    // And now we reorder by index (again on a copy, `keep` stays ordered by weight)
    structure.indexed = structure.keep.slice().sort(function(a, b) {
        return a.index-b.index;
    });

    // Finally, we join the sentences
    structure.summary = structure.indexed.map(function(sentence) {
        return sentence.text;
    });

    return {
        data: structure,
        text: structure.summary.join('\n')
    };
};

// (1-Math.sin(pindex*(Math.PI/ptotal)))/2+1
// Summarization using the word frequency, with more importance being given to the beginning and end of an article.
// Weight Sin Transform: W = (1-Math.sin(X*(Math.PI/N)))/2+1 where X is the sentence's index (incremental) and N is the number of sentences in the set
//
// text:  the document to summarize.
// quota: number of sentences to keep (the heaviest ones after the transform).
// Returns { data, text }: `text` is the kept sentences, in document order,
// joined with '\n'; `data` exposes every intermediate stage.
nlpsum.prototype.sinFrequencySummary = function(text, quota) {
    var scope = this;

    // Sum of the values of a { token: weight } map
    var sumWeights = function(weights) {
        return Object.keys(weights).reduce(function(total, token) {
            return total + weights[token];
        }, 0);
    };

    var structure = {
        // One entry per sentence, remembering its position in the document
        sentences: this.joinQuotes(sentenceParser(text)).map(function(sentence, index) {
            return {
                text: sentence,
                index: index,
                data: {}
            };
        }),
        ordered: [],
        weights: {
            words: scope.tf(text).frequency,
            total: 0
        }
    };
    var sentenceCount = structure.sentences.length;

    // Checking if the sum of all weights are equal to 1 (or really really near)
    structure.weights.total = sumWeights(structure.weights.words);

    // Calculate the relative weight of each sentence, applying the sin transform
    this.each(structure.sentences, function(sentence) {
        sentence.data.tokens = scope.tokenize(sentence.text);
        sentence.data.weights = {
            words: scope.reltf(sentence.data.tokens, structure.weights.words),
            total: 0
        };
        sentence.data.weights.total = sumWeights(sentence.data.weights.words);

        // Factor is 1.5 for the first sentence and drops to 1 in the middle of the text
        sentence.data.weights.sinTransform = (1-Math.sin(sentence.index*(Math.PI/sentenceCount)))/2+1;
        sentence.data.weights.total *= sentence.data.weights.sinTransform;
    });

    // Now we sort the sentences by weight.
    // sort() works in place, so sort a copy: `sentences` keeps the document order.
    structure.ordered = structure.sentences.slice().sort(function(a, b) {
        return b.data.weights.total-a.data.weights.total;
    });

    // We only keep the top N (=quota)
    structure.keep = structure.ordered.slice(0, quota);

    // And now we reorder by index (again on a copy, `keep` stays ordered by weight)
    structure.indexed = structure.keep.slice().sort(function(a, b) {
        return a.index-b.index;
    });

    // Finally, we join the sentences
    structure.summary = structure.indexed.map(function(sentence) {
        return sentence.text;
    });

    return {
        data: structure,
        text: structure.summary.join('\n')
    };
};

// Weight Sin Transform: W = (1-Math.sin(X*(Math.PI/N)))+1 where X is the
// sentence's index (incremental) and N is the number of sentences in the set
// Summarization using the word frequency, with more importance being given to the beginning and end of an article.
// In this variant, we actually give more weight to the individual words at the beginning and end of the text, and less in the middle.
// This weight is compounded every time a word is found, with its weight depending on where in the text it is located.
// As a result, the more a word is used in the text, the higher its weight, leading to a better weighting method to accurately represent the content of the text.
//
// text:  the document to summarize.
// quota: number of sentences to keep (the heaviest ones).
// Returns { data, text }: `text` is the kept sentences, in document order, joined with '\n'.
nlpsum.prototype.sinWordFrequencySummary = function(text, quota) {
    var scope = this;

    // Sum of the values of a { token: weight } map
    var sumWeights = function(weights) {
        return Object.keys(weights).reduce(function(total, token) {
            return total + weights[token];
        }, 0);
    };

    var structure = {
        // One entry per sentence, remembering its position in the document
        sentences: this.joinQuotes(sentenceParser(text)).map(function(sentence, index) {
            return {
                text: sentence,
                index: index,
                data: {}
            };
        }),
        ordered: [],
        weights: {
            words: scope.tf(text).frequency,
            total: 0
        }
    };
    var sentenceCount = structure.sentences.length;

    // Checking if the sum of all weights are equal to 1 (or really really near)
    structure.weights.total = sumWeights(structure.weights.words);

    // Calculate the sin transform for the sentence
    this.each(structure.sentences, function(sentence) {
        sentence.data.tokens = scope.tokenize(sentence.text);
        sentence.data.sinTransform = (1-Math.sin(sentence.index*(Math.PI/sentenceCount)))+1;

        // Apply the sin transform to the global word weights:
        // every word present in this sentence gets its weight multiplied.
        Object.keys(structure.weights.words).forEach(function(word) {
            if (sentence.data.tokens.indexOf(word) !== -1) {
                structure.weights.words[word] *= sentence.data.sinTransform;
            }
        });
    });

    // Recalculate the total weight
    structure.weights.transformed = sumWeights(structure.weights.words);

    // Now we calculate the weight of each sentence
    this.each(structure.sentences, function(sentence) {
        sentence.data.weights = {
            words: scope.reltf(sentence.data.tokens, structure.weights.words),
            total: 0
        };
        sentence.data.weights.total = sumWeights(sentence.data.weights.words);
    });

    // Now we sort the sentences by weight.
    // sort() works in place, so sort a copy: `sentences` keeps the document order.
    structure.ordered = structure.sentences.slice().sort(function(a, b) {
        return b.data.weights.total-a.data.weights.total;
    });

    // We only keep the top N (=quota)
    structure.keep = structure.ordered.slice(0, quota);

    // And now we reorder by index (again on a copy, `keep` stays ordered by weight)
    structure.indexed = structure.keep.slice().sort(function(a, b) {
        return a.index-b.index;
    });

    // Finally, we join the sentences
    structure.summary = structure.indexed.map(function(sentence) {
        return sentence.text;
    });

    return {
        data: structure,
        text: structure.summary.join('\n')
    };
};

// Take a sentence array and re-join the quotes properly.
// A sentence holding an odd number of double quotes opens a quotation; the
// following sentences are merged into it (joined with a space) until the
// running quote count becomes even again.
// Returns a new array; the input array is not modified.
nlpsum.prototype.joinQuotes = function(sentences) {
    var output = [];
    var buffer = [];
    var open = false;
    var count = 0;
    var total = sentences.length;
    var i;
    var quoteMatch;

    for (i = 0; i < total; i++) {
        quoteMatch = sentences[i].match(/"/g);

        if (!open && quoteMatch !== null && quoteMatch.length%2 !== 0) {
            // Odd number of quotes in that sentence, and the joining is not activated, that means broken quote!
            // Start joining up the quotes
            buffer = [];
            open = true;
            count = 0;
        }

        if (!open) {
            output.push(sentences[i]);
            continue;
        }

        buffer.push(sentences[i]);
        if (quoteMatch !== null) {
            count += quoteMatch.length;
            if (count%2 === 0) {
                // We now have an even number of quotes. Stop buffering.
                output.push(buffer.join(' '));
                buffer = [];
                open = false;
                count = 0;
            }
        }
    }

    // A quote that is never closed must not swallow the sentences buffered so far
    if (open && buffer.length > 0) {
        output.push(buffer.join(' '));
    }

    return output;
};

// Iterate over an array, calling fn(item) for each item in order.
// The length is read once, before the first call.
nlpsum.prototype.each = function(array, fn) {
    var total = array.length;
    var i;
    for (i = 0; i < total; i++) {
        fn(array[i]);
    }
};

// Relative text frequency: the subset of `textWeights` ({ token: weight })
// whose tokens appear in `sentenceTokens`, with the weights left untouched.
nlpsum.prototype.reltf = function(sentenceTokens, textWeights) {
    var frequency = {};

    // Walk the keys explicitly: a collection helper that sniffs for a numeric
    // `length` property would mistake the map for an array as soon as the
    // word "length" is one of the tokens.
    Object.keys(textWeights).forEach(function(token) {
        if (sentenceTokens.indexOf(token) !== -1) {
            frequency[token] = textWeights[token];
        }
    });

    return frequency;
};

// Split in paragraphs (on '\r\n\r\n') and sentences (on '.').
// Returns { paragraphs: [{ paragraph, tokens, sentences: [{ sentence, tokens }] }] };
// sentences without any token are left out.
nlpsum.prototype.split = function(text) {
    var scope = this;
    var fractal = {
        paragraphs: []
    };

    // Split in paragraphs
    text.split('\r\n\r\n').forEach(function(paragraph) {
        // Temp object we'll push
        var buffer_p = {
            paragraph: paragraph,
            tokens: scope.tokenize(paragraph),
            sentences: []
        };

        // Split in sentences
        paragraph.split('.').forEach(function(sentence) {
            var buffer_s = {
                sentence: sentence,
                tokens: scope.tokenize(sentence)
            };
            if (buffer_s.tokens.length > 0) {
                buffer_p.sentences.push(buffer_s);
            }
        });

        fractal.paragraphs.push(buffer_p);
    });

    return fractal;
};

// Tokenize a text, lowercasing the tokens and removing the useless words (a, the, ...)
nlpsum.prototype.tokenize = function(text) { var scope = this; var tokenizer = new natural.WordTokenizer(); var tokens = tokenizer.tokenize(text); // Lowercase tokens = _.map(tokens, function(token) { return token.toString().toLowerCase(); }); // Remove the useless words var removeList = ["a","the","of","it","he","she","we","our","they","from","to","that","this","is","in","these","be","at","s","re","and","or","with","which","what","was"]; tokens = _.filter(tokens, function(token) { return !_.contains(removeList, token); }); return tokens; } // Get the frequency of the words nlpsum.prototype.tf = function(text) { var scope = this; // Tokenize the text var tokens = this.tokenize(text); // Calculate the frequency var count = {}; var sum = 0; var frequency = {}; _.each(tokens, function(token) { if (!_.has(count, token)) { count[token] = 0; } sum++; count[token]++; }); var token; for (token in count) { frequency[token] = count[token]/sum; } return { tokens: tokens, frequency: frequency, count: count, sum: sum }; } // Extract the dates from the text (Spencer Kelly) nlpsum.prototype.extractDates = function(text) { return dateExtractor(text); } // tag the words (Fortnight Lab) /* CC Coord Conjuncn and,but,or CD Cardinal number one,two DT Determiner the,some EX Existential there there FW Foreign Word mon dieu IN Preposition of,in,by JJ Adjective big JJR Adj., comparative bigger JJS Adj., superlative biggest LS List item marker 1,One MD Modal can,should NN Noun, sing. or mass dog NNP Proper noun, sing. 
Edinburgh NNPS Proper noun, plural Smiths NNS Noun, plural dogs POS Possessive ending �s PDT Predeterminer all, both PP$ Possessive pronoun my,one�s PRP Personal pronoun I,you,she RB Adverb quickly RBR Adverb, comparative faster RBS Adverb, superlative fastest RP Particle up,off SYM Symbol +,%,& TO �to� to UH Interjection oh, oops URL url http://www.google.com/ VB verb, base form eat VBD verb, past tense ate VBG verb, gerund eating VBN verb, past part eaten VBP Verb, present eat VBZ Verb, present eats WDT Wh-determiner which,that WP Wh pronoun who,what WP$ Possessive-Wh whose WRB Wh-adverb how,where , Comma , . Sent-final punct . ! ? : Mid-sent punct. : ; � $ Dollar sign $ # Pound sign # " quote " ( Left paren ( ) Right paren ) */ nlpsum.prototype.tag = function(text) { var words = new pos.Lexer().lex(text); var taggedWords = new pos.Tagger().tag(words); return taggedWords; } module.exports = nlpsum;