// textrank
// Version:
// TextRank JavaScript implementation for automatic text summarization
// 366 lines (263 loc) • 9.85 kB
// JavaScript
/*
==========================================
TextRank: Bringing Order into Texts
Performs sentence extraction only.
Used for automatic article summarization.
==========================================
*/
// Article is a string of text to summarize
exports.TextRank = function (article, settings) {
this.printError = function (msg) {
console.log("TextRank ERROR:", msg);
}
if(typeof article != "string") {
this.printError("Article Must Be Type String");
return;
}
if(article.length < 1){
this.printError("Article Can't Be Empty");
return;
}
if(!settings){
settings = {};
}
this.extractAmount = (settings["extractAmount"])? settings["extractAmount"] : 5;
// Random surfer model, used in the similarity scoring function
this.d = (settings["d"])? settings["d"] : 0.85;
// Set the similarity function for edge weighting
this.userDefinedSimilarity = (settings["sim"])? settings["sim"] : null;
// Tokens are a sentence [ sentence1, sentence2, sentence3, ... , sentenceN ]
this.userDefinedTokens = (settings["tokens"])? settings["tokens"]: null;
// Split are the sentences tokenized into words [[word1, word2, ... , wordN],[word1, word2, ... , wordN], ..., [word1, word2, ... , wordN]]
this.userDefinedTokensSplit = (settings["split"])? settings["split"]: null;
this.typeOfSummary = (settings["summaryType"])? 1 : 0;
this.graph = {
V: {}, // Sentences are the vertices of the graph
E: {},
numVerts: 0
}
this.summarizedArticle = "";
// convergence threshold
this.delta = 0.0001
// Constructs the graph
this.setupGraph = function (article) {
// The TextPreprocesser cleans up and tokenizes the article
this.graph.V = TextPreprocesser(article, this.userDefinedTokens, this.userDefinedTokensSplit);
this.graph.numVerts = Object.keys(this.graph.V).length;
// Check for user defined similarity function
this.sim = (this.userDefinedSimilarity != null)? this.userDefinedSimilarity : this.similarityScoring;
// Init vertex scores
for(iIndex in this.graph.V) {
var vertex = this.graph.V[iIndex];
// The initial score of a vertex is random and does not matter for the TextRank algorithm
vertex["score"] = Math.random() * 10 + 1;
// Id is the sentence position starting from 0
vertex["id"] = Number(iIndex);
var Si = vertex;
// Add an edge between every sentence in the graph
// Fully connected graph
for (var j = 0; j < this.graph.numVerts; j++) {
var jIndex = j.toString();
// No self edges
if(jIndex != iIndex) {
// If no edge list, create it
if(!this.graph.E[iIndex]) {
this.graph.E[iIndex] = {};
}
var Sj = this.graph.V[jIndex];
// Compute the edge weight between two sentences in the graph
this.graph.E[iIndex][jIndex] = this.sim(Si, Sj);
}
}
}
}
// Given two sentences compute a score which is the weight on the edge between the two sentence
// Implementation of Similarity(Si, Sj) function defined in the paper
this.similarityScoring = function (Si, Sj) {
var overlap = {}
var Si_tokens = Si.tokens;
var Sj_tokens = Sj.tokens;
// Count words for sentence i
for(var i = 0; i < Si_tokens.length; i++) {
var word = Si_tokens[i];
if(!overlap[word]) {
overlap[word] = {}
}
overlap[word]['i'] = 1;
}
// Count words for sentence j
for(var i = 0; i < Sj_tokens.length; i++) {
var word = Sj_tokens[i];
if(!overlap[word]) {
overlap[word] = {}
}
overlap[word]['j'] = 1;
}
var logLengths = Math.log(Si_tokens.length) + Math.log(Sj_tokens.length);
var wordOverlapCount = 0;
// Compute word overlap from the sentences
for( index in overlap) {
var word = overlap[index]
if ( Object.keys(word).length === 2) {
wordOverlapCount++;
}
}
// Compute score
return wordOverlapCount/logLengths;
}
this.iterations = 0;
this.iterateAgain = true;
// The Weighted Graph WS(Vi) function to score a vertex
this.iterate = function () {
for(index in this.graph.V){
var vertex = this.graph.V[index]; // Vi vertex
var score_0 = vertex.score;
var vertexNeighbors = this.graph.E[index]; // In(Vi) set
var summedNeighbors = 0;
// Sum over In(Vi)
for (neighborIndex in vertexNeighbors) {
var neighbor = vertexNeighbors[neighborIndex]; // Vj
var wji = this.graph.E[index][neighborIndex]; // wji
// Sum over Out(Vj)
var outNeighbors = this.graph.E[neighborIndex];
var summedOutWeight = 1; // Stores the summation of weights over the Out(Vj)
for( outIndex in outNeighbors) {
summedOutWeight += outNeighbors[outIndex];
}
var WSVertex = this.graph.V[neighborIndex].score; // WS(Vj)
summedNeighbors += (wji/summedOutWeight) * WSVertex;
}
var score_1 = (1 - this.d) + this.d * summedNeighbors; // WS(Vi)
// Update the score on the vertex
this.graph.V[index].score = score_1;
// Check to see if you should continue
if(Math.abs(score_1 - score_0) <= this.delta) {
this.iterateAgain = false;
}
}
// Check for another iteration
if(this.iterateAgain == true) {
this.iterations += 1;
this.iterate();
}else {
// Prints only once
// console.log(this.iterations);
}
return;
}
// Extracts the top N sentences
this.extractSummary = function (N) {
var sentences = [];
// Graph all the sentences
for ( index in this.graph.V) {
sentences.push(this.graph.V[index]);
}
// Sort the sentences based off the score of the vertex
sentences = sentences.sort( function (a,b) {
if (a.score > b.score) {
return -1;
}else {
return 1;
}
});
// Grab the top N sentences
// var sentences = sentences.slice(0,0+(N));
sentences.length = N;
// Sort based of the id which is the position of the sentence in the original article
sentences = sentences.sort(function (a,b) {
if (a.id < b.id) {
return -1;
} else {
return 1;
}
})
var summary = null;
if(this.typeOfSummary) {
summary = [];
for (var i = 0; i < sentences.length; i++) {
summary.push(sentences[i].sentence);
}
} else {
// Compose the summary by joining the ranked sentences
var summary = sentences[0].sentence;
for (var i = 1; i < sentences.length; i++) {
summary += " " + sentences[i].sentence;
}
}
return summary;
}
this.run = function (article) {
// Create graph structure
this.setupGraph(article);
// Rank sentences
this.iterate();
this.summarizedArticle = this.extractSummary(this.extractAmount);
}
this.run(article);
}
// Handles the preprocessing of text for creating the graph structure of TextRank
/**
 * Prepares an article for the TextRank graph.
 *
 * @param {string} article - Raw text to split into sentences.
 * @param {string[]} [userTokens] - Caller-supplied sentences. Used only when
 *   `userTokensSplit` is supplied as well.
 * @param {string[][]} [userTokensSplit] - Words for each entry of `userTokens`
 *   (same index).
 * @returns {Object} Map from sentence position (0, 1, 2, ...) to
 *   `{ sentence: string, tokens: string[] }`.
 */
function TextPreprocesser(article, userTokens, userTokensSplit) {
  // The helpers are plain local functions. This function is called without
  // `new`, so hanging them on `this` would write to the global object in
  // sloppy mode and throw in strict mode / ES modules.

  // Collapses every run of spaces into a single space.
  function cleanArticle(text) {
    return text.replace(/[ ]+(?= )/g, "");
  }

  // Splits the text into sentences. A "|" is inserted after a word that ends
  // in a terminator and is followed by a space plus a quote or capital letter;
  // the text is then split on "|".
  // NOTE(review): a literal "|" in the text therefore also acts as a boundary.
  function tokenizer(text) {
    return text.replace(/([ ][".A-Za-z-|0-9]+[!|.|?|"](?=[ ]["“A-Z]))/g, "$1|").split("|");
  }

  // Drops empty sentences and strips leading spaces/dots from the others.
  function cleanTokens(tokens) {
    // Iterate backwards to allow for splicing.
    for (var i = tokens.length - 1; i >= 0; i--) {
      if (tokens[i] == "") {
        tokens.splice(i, 1);
      } else {
        tokens[i] = tokens[i].replace(/[ .]*/, "");
      }
    }
    return tokens;
  }

  // Lower-cases a sentence and splits it into words.
  function tokenizeASentence(sentence) {
    var lowered = sentence.toLowerCase();
    // A delimiter glued to the next character ("a,b", "it's") becomes a space
    // so the two sides end up as separate words ...
    lowered = lowered.replace(/[-|'|"|(|)|/|<|>|,|:|;](?! )/g, " ");
    // ... and the remaining delimiters (those already followed by a space)
    // are removed, so "cool," compares equal to "cool".
    lowered = lowered.replace(/[-|'|"|(|)|/|<|>|,|:|;]/g, "");
    return lowered.split(" ");
  }

  var cleanedArticle = cleanArticle(article);
  // Caller-supplied sentences are only honoured together with their word lists.
  var usingUserDefinedTokens = (userTokens && userTokensSplit);
  var tokens = usingUserDefinedTokens ? userTokens : cleanTokens(tokenizer(cleanedArticle));

  var output = {};
  for (var i = 0; i < tokens.length; i++) {
    output[i] = {
      sentence: tokens[i],
      tokens: usingUserDefinedTokens ? userTokensSplit[i] : tokenizeASentence(tokens[i])
    };
  }
  return output;
}