// nlpsum
// Powerful text summarization algorithms from research papers and dedicated research.
// Underscore
var _ = require('underscore');
// Natural
var natural = require('natural');
var wordnet = new natural.WordNet();
// Spencer Kelly's nlp-node libs
var sentenceParser = require('./nlp-node-master/sentence_parser/sentence');
var dateExtractor = require('./nlp-node-master/date_parser/date_extractor');
// Fortnight Lab's libs
var pos = require('pos');
var glossary = require("glossary")({ collapse: true });
// Constructor
// Takes no arguments and sets up no per-instance state; the functionality visible in
// this file is attached to nlpsum.prototype.
function nlpsum() {}
/**
 * Debug helper: looks a word up in WordNet and prints every returned entry to the console.
 *
 * @param {string} [text] Word to look up. When omitted (or not a non-empty string) the
 *   lookup falls back to 'node', the word this helper previously hard-coded.
 * @returns {undefined} Nothing is returned; the output is printed from the lookup callback.
 */
nlpsum.prototype.test = function(text) {
  var word = (typeof text === 'string' && text.length > 0) ? text : 'node';
  wordnet.lookup(word, function(results) {
    results.forEach(function(result) {
      console.log('------------------------------------');
      console.log(result.synsetOffset);
      console.log(result.pos);
      console.log(result.lemma);
      console.log(result.synonyms);
      console.log(result.gloss);
    });
  });
};
/**
 * Extracts keywords from a text by delegating to the module-level glossary instance.
 *
 * @param {string} text Text handed to glossary.extract.
 * @returns {*} Whatever glossary.extract returns for that text.
 */
nlpsum.prototype.keywords = function(text) {
  return glossary.extract(text);
};
/**
 * Generates a summary based on the fractal document theory by Christopher C. Yang & Fu Lee Wang
 * (Chinese University of Hong Kong - see "fractal summarization.pdf").
 *
 * The text is split into paragraphs (on a blank line, CRLF or LF) and each paragraph into
 * sentences. Every sentence is weighted by the document-level frequency of the words it
 * contains; each paragraph then receives a share of `quota` proportional to the weight of its
 * sentences and contributes its heaviest sentences, emitted in their original order.
 *
 * @param {string} text Text to summarize.
 * @param {number} quota Approximate number of sentences wanted over the whole document
 *   (per-paragraph quotas are rounded, so their sum can differ from it).
 * @returns {{data: Object, text: string, s_weight: number, ns_weight: number,
 *   np_weight: number, quota_sum: number}} The intermediate structure (`data`), the summary
 *   (`text`), the un-normalized total sentence weight (`s_weight`), the sums of the normalized
 *   sentence and paragraph weights (`ns_weight`, `np_weight`) and the sum of the paragraph
 *   quotas (`quota_sum`).
 */
nlpsum.prototype.fractalSummary = function(text, quota) {
  var scope = this;
  var summary = [];
  var fractal = {
    paragraphs: [],
    weights: scope.tf(text).frequency
  };
  // Adds up the values of a {token: weight} map.
  var sumWeights = function(weights) {
    return Object.keys(weights).reduce(function(total, token) {
      return total + weights[token];
    }, 0);
  };
  // Total weight of all the sentences relative to the document; used to normalize below.
  var s_weight = 0;

  // Split in paragraphs
  text.split(/\r?\n\r?\n/).forEach(function(paragraph) {
    // Temp object we'll push
    var buffer_p = {
      paragraph: paragraph,
      tokens: scope.tokenize(paragraph),
      weights: {
        words: scope.tf(paragraph).frequency,
        total: {
          words: 0,
          sentences: 0
        }
      },
      sentences: []
    };
    // Split in sentences
    scope.joinQuotes(sentenceParser(paragraph)).forEach(function(sentence) {
      var buffer_s = {
        sentence: sentence,
        tokens: scope.tokenize(sentence)
      };
      // Sentences without any usable token carry no weight and are skipped entirely.
      if (buffer_s.tokens.length === 0) {
        return;
      }
      buffer_s.weights = {
        sentence: scope.tf(sentence).frequency,
        // The paragraph's word-frequency map is what reltf expects, not the wrapper
        // object that also holds the totals.
        paragraph: scope.reltf(buffer_s.tokens, buffer_p.weights.words),
        document: scope.reltf(buffer_s.tokens, fractal.weights)
      };
      buffer_s.weights.total = {
        sentence: 1, // always, logical
        paragraph: sumWeights(buffer_s.weights.paragraph),
        document: sumWeights(buffer_s.weights.document)
      };
      s_weight += buffer_s.weights.total.document;
      buffer_p.weights.total.sentences += buffer_s.weights.total.document;
      buffer_p.sentences.push(buffer_s);
    });
    // Calculate the total weight of the words in that paragraph
    buffer_p.weights.total.words = sumWeights(buffer_p.weights.words);
    fractal.paragraphs.push(buffer_p);
  });

  // Normalizes a weight against 's_weight'; yields 0 instead of NaN when nothing was weighted.
  var normalize = function(weight) {
    return s_weight > 0 ? weight / s_weight : 0;
  };
  var ns_weight = 0;
  var np_weight = 0;
  var quota_sum = 0;

  fractal.paragraphs.forEach(function(paragraph) {
    // Normalize the paragraph weight relative to the document
    paragraph.weights.total.normalized = normalize(paragraph.weights.total.sentences);
    np_weight += paragraph.weights.total.normalized;
    // Calculate the quota for that paragraph
    paragraph.quota = Math.round(quota * paragraph.weights.total.normalized);
    quota_sum += paragraph.quota;

    // Normalize each sentence weight and remember its position inside the paragraph
    var sentences_sorted = paragraph.sentences.map(function(sentence, index) {
      sentence.weights.total.sentence_normalized = normalize(sentence.weights.total.document);
      ns_weight += sentence.weights.total.sentence_normalized;
      return {
        weight: sentence.weights.total.sentence_normalized,
        text: sentence.sentence,
        index: index
      };
    });
    // Sort the sentences by weight, heaviest first
    paragraph.sentences_sorted = sentences_sorted.sort(function(a, b) {
      return b.weight - a.weight;
    });
    // Keep only our quota of sentences, put back in reading order
    paragraph.sentences_keep = paragraph.sentences_sorted
      .slice(0, paragraph.quota)
      .sort(function(a, b) { return a.index - b.index; })
      .map(function(item) { return item.text; });
    paragraph.sentences_keep.forEach(function(sentence) {
      summary.push(sentence.trim());
    });
  });

  return {
    data: fractal,
    // NOTE(review): the separator adds a '.' between sentences; if sentenceParser keeps the
    // terminal punctuation this doubles it - verify against the parser before changing.
    text: summary.join('.\r\n'),
    s_weight: s_weight,
    ns_weight: ns_weight,
    np_weight: np_weight,
    quota_sum: quota_sum
  };
};
/**
 * Summarization using the word frequency. Pretty straightforward algorithm: every sentence
 * is weighted by the summed document frequency of the distinct words it contains, the
 * `quota` heaviest sentences are kept and then emitted in their original order.
 *
 * @param {string} text Text to summarize.
 * @param {number} quota Maximum number of sentences to keep.
 * @returns {{data: Object, text: string}} `text` is the summary (sentences joined with '\n');
 *   `data` exposes the intermediate structure: `sentences` (document order), `ordered`
 *   (by descending weight), `keep` (top `quota` of `ordered`), `indexed` (`keep` in document
 *   order), `summary` (their texts) and `weights`.
 */
nlpsum.prototype.wordFrequencySummary = function(text, quota) {
  var scope = this;
  // Adds up the values of a {token: weight} map.
  var sumWeights = function(weights) {
    return Object.keys(weights).reduce(function(total, token) {
      return total + weights[token];
    }, 0);
  };
  var structure = {
    sentences: scope.joinQuotes(sentenceParser(text)).map(function(sentence, index) {
      return {
        text: sentence,
        index: index,
        data: {}
      };
    }),
    ordered: [],
    weights: {
      words: scope.tf(text).frequency,
      total: 0
    }
  };
  // Sum of all word frequencies; should be equal to 1 (or really really near)
  structure.weights.total = sumWeights(structure.weights.words);
  // Calculate the relative weight of each sentence
  structure.sentences.forEach(function(sentence) {
    sentence.data.tokens = scope.tokenize(sentence.text);
    sentence.data.weights = {
      words: scope.reltf(sentence.data.tokens, structure.weights.words),
      total: 0
    };
    sentence.data.weights.total = sumWeights(sentence.data.weights.words);
  });
  // Sort a copy by weight so that `sentences` keeps the document order
  structure.ordered = structure.sentences.slice().sort(function(a, b) {
    return b.data.weights.total - a.data.weights.total;
  });
  // We only keep the top N (=quota)
  structure.keep = structure.ordered.slice(0, quota);
  // And now we reorder a copy by index, leaving `keep` in weight order
  structure.indexed = structure.keep.slice().sort(function(a, b) {
    return a.index - b.index;
  });
  // Finally, we join the sentences
  structure.summary = structure.indexed.map(function(sentence) {
    return sentence.text;
  });
  return {
    data: structure,
    text: structure.summary.join('\n')
  };
};
/**
 * Summarization using the word frequency, with more importance being given to the beginning
 * and end of an article.
 *
 * Each sentence weight (summed document frequency of its distinct words) is multiplied by
 * W = (1 - Math.sin(X * (Math.PI / N))) / 2 + 1, where X is the zero-based sentence position
 * and N the number of sentences: W is 1.5 for the first sentence, falls to 1 at the midpoint
 * and rises again towards the end.
 *
 * NOTE: nlpsum.prototype.sinFrequencySummary is assigned a second time further down in this
 * file; that later assignment is the one callers end up with.
 *
 * @param {string} text Text to summarize.
 * @param {number} quota Maximum number of sentences to keep.
 * @returns {{data: Object, text: string}} `text` is the summary (sentences joined with '\n');
 *   `data` holds `sentences` (document order), `ordered` (descending weight), `keep`,
 *   `indexed`, `summary` and `weights`.
 */
nlpsum.prototype.sinFrequencySummary = function(text, quota) {
  var scope = this;
  // Adds up the values of a {token: weight} map.
  var sumWeights = function(weights) {
    return Object.keys(weights).reduce(function(total, token) {
      return total + weights[token];
    }, 0);
  };
  var structure = {
    sentences: scope.joinQuotes(sentenceParser(text)).map(function(sentence, index) {
      return {
        text: sentence,
        index: index,
        data: {}
      };
    }),
    ordered: [],
    weights: {
      words: scope.tf(text).frequency,
      total: 0
    }
  };
  var sentenceCount = structure.sentences.length;
  // Sum of all word frequencies; should be equal to 1 (or really really near)
  structure.weights.total = sumWeights(structure.weights.words);
  // Calculate the relative weight of each sentence, applying the sin transform
  structure.sentences.forEach(function(sentence, position) {
    sentence.data.tokens = scope.tokenize(sentence.text);
    sentence.data.weights = {
      words: scope.reltf(sentence.data.tokens, structure.weights.words),
      total: 0
    };
    sentence.data.weights.total = sumWeights(sentence.data.weights.words);
    sentence.data.weights.sinTransform = (1 - Math.sin(position * (Math.PI / sentenceCount))) / 2 + 1;
    sentence.data.weights.total *= sentence.data.weights.sinTransform;
  });
  // Sort a copy by weight so that `sentences` keeps the document order
  structure.ordered = structure.sentences.slice().sort(function(a, b) {
    return b.data.weights.total - a.data.weights.total;
  });
  // We only keep the top N (=quota)
  structure.keep = structure.ordered.slice(0, quota);
  // And now we reorder a copy by index, leaving `keep` in weight order
  structure.indexed = structure.keep.slice().sort(function(a, b) {
    return a.index - b.index;
  });
  // Finally, we join the sentences
  structure.summary = structure.indexed.map(function(sentence) {
    return sentence.text;
  });
  return {
    data: structure,
    text: structure.summary.join('\n')
  };
};
/**
 * Summarization using the word frequency, with more importance being given to the beginning
 * and end of an article.
 *
 * Weight Sin Transform: W = (1 - Math.sin(X * (Math.PI / N))) / 2 + 1 where X is the
 * sentence's index (incremental, starting at 0) and N is the number of sentences in the set.
 * The summed word-frequency weight of every sentence is multiplied by W before ranking.
 *
 * @param {string} text Text to summarize.
 * @param {number} quota Maximum number of sentences to keep.
 * @returns {{data: Object, text: string}} `text` is the summary (sentences joined with '\n');
 *   `data` holds `sentences` (document order), `ordered` (descending weight), `keep`,
 *   `indexed`, `summary` and `weights`.
 */
nlpsum.prototype.sinFrequencySummary = function(text, quota) {
  var scope = this;
  // Total of the values stored in a {token: weight} map.
  var totalOf = function(map) {
    var total = 0;
    Object.keys(map).forEach(function(token) {
      total += map[token];
    });
    return total;
  };
  var byWeightDesc = function(a, b) {
    return b.data.weights.total - a.data.weights.total;
  };
  var byIndexAsc = function(a, b) {
    return a.index - b.index;
  };

  var structure = {
    sentences: scope.joinQuotes(sentenceParser(text)).map(function(sentence, index) {
      return { text: sentence, index: index, data: {} };
    }),
    ordered: [],
    weights: {
      words: scope.tf(text).frequency,
      total: 0
    }
  };
  var sentenceCount = structure.sentences.length;
  // Sum of all word frequencies; should be equal to 1 (or really really near)
  structure.weights.total = totalOf(structure.weights.words);

  // Calculate the relative weight of each sentence, applying the sin transform
  var position;
  var sentence;
  for (position = 0; position < sentenceCount; position++) {
    sentence = structure.sentences[position];
    sentence.data.tokens = scope.tokenize(sentence.text);
    sentence.data.weights = {
      words: scope.reltf(sentence.data.tokens, structure.weights.words),
      total: 0
    };
    sentence.data.weights.total = totalOf(sentence.data.weights.words);
    sentence.data.weights.sinTransform = (1 - Math.sin(position * (Math.PI / sentenceCount))) / 2 + 1;
    sentence.data.weights.total *= sentence.data.weights.sinTransform;
  }

  // Rank a copy, so `sentences` is left in document order
  structure.ordered = structure.sentences.slice().sort(byWeightDesc);
  // We only keep the top N (=quota)
  structure.keep = structure.ordered.slice(0, quota);
  // Back to reading order, again on a copy so `keep` stays ranked
  structure.indexed = structure.keep.slice().sort(byIndexAsc);
  // Finally, we join the sentences
  structure.summary = structure.indexed.map(function(kept) {
    return kept.text;
  });
  return {
    data: structure,
    text: structure.summary.join('\n')
  };
};
/**
 * Summarization using the word frequency, with more importance being given to the beginning
 * and end of an article.
 *
 * Weight Sin Transform: W = (1 - Math.sin(X * (Math.PI / N))) + 1 where X is the sentence's
 * index (incremental, starting at 0) and N is the number of sentences in the set.
 *
 * In this variant the transform is applied to the individual words rather than to the
 * sentence totals: for every sentence, the global weight of each word found in that sentence
 * is multiplied by the sentence's W. The factor therefore compounds once per sentence a word
 * appears in, with its size depending on where in the text the sentence is located. Sentences
 * are then ranked with these transformed word weights.
 *
 * @param {string} text Text to summarize.
 * @param {number} quota Maximum number of sentences to keep.
 * @returns {{data: Object, text: string}} `text` is the summary (sentences joined with '\n');
 *   `data` holds `sentences` (document order), `ordered` (descending weight), `keep`,
 *   `indexed`, `summary` and `weights` (`words` after the transform, `total` before it,
 *   `transformed` after it).
 */
nlpsum.prototype.sinWordFrequencySummary = function(text, quota) {
  var scope = this;
  // Adds up the values of a {token: weight} map.
  var sumWeights = function(weights) {
    return Object.keys(weights).reduce(function(total, token) {
      return total + weights[token];
    }, 0);
  };
  var structure = {
    sentences: scope.joinQuotes(sentenceParser(text)).map(function(sentence, index) {
      return {
        text: sentence,
        index: index,
        data: {}
      };
    }),
    ordered: [],
    weights: {
      words: scope.tf(text).frequency,
      total: 0
    }
  };
  var sentenceCount = structure.sentences.length;
  // Sum of all word frequencies; should be equal to 1 (or really really near)
  structure.weights.total = sumWeights(structure.weights.words);
  // Calculate the sin transform for each sentence
  structure.sentences.forEach(function(sentence, position) {
    sentence.data.tokens = scope.tokenize(sentence.text);
    sentence.data.sinTransform = (1 - Math.sin(position * (Math.PI / sentenceCount))) + 1;
    // Apply the sin transform to the global weight of every word used in this sentence
    Object.keys(structure.weights.words).forEach(function(word) {
      if (sentence.data.tokens.indexOf(word) !== -1) {
        structure.weights.words[word] *= sentence.data.sinTransform;
      }
    });
  });
  // Recalculate the total weight
  structure.weights.transformed = sumWeights(structure.weights.words);
  // Now we calculate the weight of each sentence
  structure.sentences.forEach(function(sentence) {
    sentence.data.weights = {
      words: scope.reltf(sentence.data.tokens, structure.weights.words),
      total: 0
    };
    sentence.data.weights.total = sumWeights(sentence.data.weights.words);
  });
  // Sort a copy by weight so that `sentences` keeps the document order
  structure.ordered = structure.sentences.slice().sort(function(a, b) {
    return b.data.weights.total - a.data.weights.total;
  });
  // We only keep the top N (=quota)
  structure.keep = structure.ordered.slice(0, quota);
  // And now we reorder a copy by index, leaving `keep` in weight order
  structure.indexed = structure.keep.slice().sort(function(a, b) {
    return a.index - b.index;
  });
  // Finally, we join the sentences
  structure.summary = structure.indexed.map(function(sentence) {
    return sentence.text;
  });
  return {
    data: structure,
    text: structure.summary.join('\n')
  };
};
/**
 * Takes a sentence array and re-joins quotations that were split across several sentences.
 *
 * A sentence holding an odd number of double quotes (") opens a quotation; it and the
 * following sentences are buffered until the running quote count becomes even, at which
 * point the buffered sentences are emitted as one entry joined with a single space.
 * If the quotation is never closed, the buffered sentences are emitted unchanged, one entry
 * each, rather than being dropped.
 *
 * @param {string[]} sentences Sentences in reading order.
 * @returns {string[]} New array with split quotations merged; the input is not modified.
 */
nlpsum.prototype.joinQuotes = function(sentences) {
  var output = [];
  var buffer = [];
  var open = false;
  var count = 0; // quotes seen since the current quotation was opened
  var total = sentences.length;
  var i;
  var quoteMatch;
  var quotes;

  for (i = 0; i < total; i++) {
    quoteMatch = sentences[i].match(/"/g);
    quotes = quoteMatch === null ? 0 : quoteMatch.length;

    if (!open) {
      if (quotes % 2 === 0) {
        // Balanced (or no) quotes: the sentence stands on its own.
        output.push(sentences[i]);
        continue;
      }
      // Odd number of quotes while not joining: broken quote, start buffering.
      open = true;
      buffer = [];
      count = 0;
    }

    buffer.push(sentences[i]);
    count += quotes;
    if (count % 2 === 0) {
      // We now have an even number of quotes: the quotation is complete.
      output.push(buffer.join(' '));
      buffer = [];
      open = false;
      count = 0;
    }
  }

  if (open) {
    // Unterminated quotation: give the sentences back as they came in.
    Array.prototype.push.apply(output, buffer);
  }
  return output;
};
/**
 * Calls `fn` once per element of `array`, in index order, passing the element as the only
 * argument. The length is read once before iterating, so elements appended by `fn` during
 * the walk are not visited.
 *
 * @param {Array} array Array (or array-like with a numeric `length`) to walk.
 * @param {function(*)} fn Callback invoked with each element.
 * @returns {undefined}
 */
nlpsum.prototype.each = function(array, fn) {
  var count = array.length;
  var position = 0;
  while (position < count) {
    fn(array[position]);
    position += 1;
  }
};
/**
 * Relative text frequency: picks, out of a text-level weight map, the entries whose token
 * occurs in a sentence.
 *
 * The map is walked through its own keys, so a token that happens to be called "length" is
 * handled like any other key, and sentence membership is checked through a lookup table
 * built once instead of scanning the token list for every key.
 *
 * @param {string[]} sentenceTokens Tokens of the sentence (duplicates allowed).
 * @param {Object} textWeights Map of token to weight for the enclosing text.
 * @returns {Object} New map holding `textWeights[token]` for every key of `textWeights`
 *   present in `sentenceTokens`, in the key order of `textWeights`.
 */
nlpsum.prototype.reltf = function(sentenceTokens, textWeights) {
  // Prototype-less object used as a set, so no token can collide with an inherited property.
  var present = Object.create(null);
  var i;
  for (i = 0; i < sentenceTokens.length; i++) {
    present[sentenceTokens[i]] = true;
  }
  var frequency = {};
  Object.keys(textWeights).forEach(function(token) {
    if (present[token] === true) {
      frequency[token] = textWeights[token];
    }
  });
  return frequency;
};
/**
 * Splits a text in paragraphs and sentences, tokenizing both levels.
 *
 * Paragraphs are separated by a blank line (two consecutive line breaks, CRLF or LF);
 * sentences are obtained by splitting each paragraph on '.', so the periods themselves are
 * not part of the sentence strings. Sentences that yield no token are left out.
 *
 * @param {string} text Text to split.
 * @returns {{paragraphs: Array<{paragraph: string, tokens: string[],
 *   sentences: Array<{sentence: string, tokens: string[]}>}>}} The paragraph/sentence tree.
 */
nlpsum.prototype.split = function(text) {
  var scope = this;
  var fractal = {
    paragraphs: []
  };
  // Split in paragraphs
  text.split(/\r?\n\r?\n/).forEach(function(paragraph) {
    // Temp object we'll push
    var buffer_p = {
      paragraph: paragraph,
      tokens: scope.tokenize(paragraph),
      sentences: []
    };
    // Split in sentences
    paragraph.split('.').forEach(function(sentence) {
      var buffer_s = {
        sentence: sentence,
        tokens: scope.tokenize(sentence)
      };
      if (buffer_s.tokens.length > 0) {
        buffer_p.sentences.push(buffer_s);
      }
    });
    fractal.paragraphs.push(buffer_p);
  });
  return fractal;
};
/**
 * Tokenizes a text with natural's WordTokenizer, lowercases every token and drops the
 * useless words (a, the, ...) listed below. Token order and duplicates are preserved.
 *
 * @param {string} text Text to tokenize.
 * @returns {string[]} Lowercased tokens that are not in the removal list.
 */
nlpsum.prototype.tokenize = function(text) {
  // Words (and tokenizer leftovers such as "s" / "re") that carry no meaning for the summaries
  var stopWords = ["a","the","of","it","he","she","we","our","they","from","to","that","this","is","in","these","be","at","s","re","and","or","with","which","what","was"];
  var rawTokens = new natural.WordTokenizer().tokenize(text);
  var kept = [];
  var position;
  var lowered;
  for (position = 0; position < rawTokens.length; position++) {
    lowered = rawTokens[position].toString().toLowerCase();
    if (stopWords.indexOf(lowered) === -1) {
      kept.push(lowered);
    }
  }
  return kept;
};
/**
 * Gets the frequency of the words of a text.
 *
 * @param {string} text Text to analyse; it is tokenized with this.tokenize.
 * @returns {{tokens: string[], frequency: Object, count: Object, sum: number}}
 *   `tokens` is the token list, `count` maps each distinct token to its number of
 *   occurrences, `sum` is the number of tokens and `frequency` maps each distinct token to
 *   count / sum.
 */
nlpsum.prototype.tf = function(text) {
  var tokens = this.tokenize(text);
  var hasOwn = Object.prototype.hasOwnProperty;
  var count = {};
  var frequency = {};
  var sum = 0;
  var position;
  var word;

  // Tally the occurrences of every token
  for (position = 0; position < tokens.length; position++) {
    word = tokens[position];
    if (!hasOwn.call(count, word)) {
      count[word] = 0;
    }
    count[word] += 1;
    sum += 1;
  }
  // Turn the tallies into relative frequencies
  for (word in count) {
    frequency[word] = count[word] / sum;
  }

  return {
    tokens: tokens,
    frequency: frequency,
    count: count,
    sum: sum
  };
};
/**
 * Extracts the dates from the text by delegating to the date extractor from Spencer Kelly's
 * nlp-node library.
 *
 * @param {string} text Text to scan.
 * @returns {*} The extractor's result, passed through untouched.
 */
nlpsum.prototype.extractDates = function(text) {
  var dates = dateExtractor(text);
  return dates;
};
// tag the words (Fortnight Lab)
/*
CC Coordinating conjunction and,but,or
CD Cardinal number one,two
DT Determiner the,some
EX Existential there there
FW Foreign Word mon dieu
IN Preposition of,in,by
JJ Adjective big
JJR Adj., comparative bigger
JJS Adj., superlative biggest
LS List item marker 1,One
MD Modal can,should
NN Noun, sing. or mass dog
NNP Proper noun, sing. Edinburgh
NNPS Proper noun, plural Smiths
NNS Noun, plural dogs
POS Possessive ending 's
PDT Predeterminer all, both
PP$ Possessive pronoun my,one's
PRP Personal pronoun I,you,she
RB Adverb quickly
RBR Adverb, comparative faster
RBS Adverb, superlative fastest
RP Particle up,off
SYM Symbol +,%,&
TO "to" to
UH Interjection oh, oops
URL url http://www.google.com/
VB verb, base form eat
VBD verb, past tense ate
VBG verb, gerund eating
VBN verb, past part eaten
VBP Verb, present eat
VBZ Verb, present eats
WDT Wh-determiner which,that
WP Wh pronoun who,what
WP$ Possessive-Wh whose
WRB Wh-adverb how,where
, Comma ,
. Sent-final punct . ! ?
: Mid-sent punct. : ; --
$ Dollar sign $
# Pound sign #
" quote "
( Left paren (
) Right paren )
*/
/**
 * Tags the words of a text: the text is run through a fresh pos.Lexer and the resulting
 * words are handed to a fresh pos.Tagger.
 *
 * @param {string} text Text to tag.
 * @returns {*} The tagger's output for the lexed words.
 */
nlpsum.prototype.tag = function(text) {
  var lexer = new pos.Lexer();
  var lexed = lexer.lex(text);
  var tagger = new pos.Tagger();
  return tagger.tag(lexed);
};
// Expose the nlpsum constructor as this module's export.
module.exports = nlpsum;