UNPKG

textrank

Version:

TextRank javascript implementation for automatic text summarization

366 lines (263 loc) 9.85 kB
/* ==========================================
   TextRank: Bringing Order into Texts

   Performs sentence extraction only.
   Used for automatic article summarization.
   ========================================== */

/**
 * Builds a fully connected sentence graph from `article`, ranks the sentences
 * and stores the extracted summary on `this.summarizedArticle`.
 * Intended to be called with `new`.
 *
 * If `article` is not a non-empty string an error is printed and construction
 * stops early, so only `printError` exists on the returned object.
 *
 * @param {string} article  Text to summarize.
 * @param {Object} [settings]
 * @param {number}   [settings.extractAmount=5]     Number of sentences to extract.
 * @param {number}   [settings.d=0.85]              Damping factor of the random surfer model.
 * @param {Function} [settings.sim]                 Custom similarity function (Si, Sj) -> edge weight.
 * @param {Array}    [settings.tokens]              User supplied sentences [sentence1, ..., sentenceN].
 * @param {Array}    [settings.split]               User supplied words per sentence [[word1, ...], ...].
 *                                                  Only used when `tokens` is supplied as well.
 * @param {*}        [settings.summaryType]         Truthy: summary is an array of sentences,
 *                                                  otherwise a single space-joined string.
 * @param {number}   [settings.maxIterations=10000] Upper bound on extra ranking sweeps.
 */
function TextRank(article, settings) {
  this.printError = function (msg) {
    console.log("TextRank ERROR:", msg);
  };

  if (typeof article !== "string") {
    this.printError("Article Must Be Type String");
    return;
  }

  if (article.length < 1) {
    this.printError("Article Can't Be Empty");
    return;
  }

  if (!settings) {
    settings = {};
  }

  this.extractAmount = (settings["extractAmount"]) ? settings["extractAmount"] : 5;

  // Random surfer model, used in the vertex scoring function
  this.d = (settings["d"]) ? settings["d"] : 0.85;

  // Set the similarity function for edge weighting
  this.userDefinedSimilarity = (settings["sim"]) ? settings["sim"] : null;

  // Tokens are sentences [ sentence1, sentence2, sentence3, ... , sentenceN ]
  this.userDefinedTokens = (settings["tokens"]) ? settings["tokens"] : null;

  // Split are the sentences tokenized into words
  // [[word1, word2, ... , wordN], [word1, word2, ... , wordN], ...]
  this.userDefinedTokensSplit = (settings["split"]) ? settings["split"] : null;

  this.typeOfSummary = (settings["summaryType"]) ? 1 : 0;

  // Safety net: a graph that never converges (e.g. a custom `sim` returning NaN)
  // must not keep the ranking loop running forever.
  this.maxIterations = (settings["maxIterations"]) ? settings["maxIterations"] : 10000;

  this.graph = {
    V: {}, // Sentences are the vertices of the graph
    E: {}, // E[i][j] is the weight of the edge between sentence i and sentence j
    numVerts: 0
  };

  this.summarizedArticle = "";

  // Convergence threshold
  this.delta = 0.0001;

  // Constructs the graph: one vertex per sentence, one weighted edge per sentence pair
  this.setupGraph = function (article) {
    // The TextPreprocesser cleans up and tokenizes the article
    this.graph.V = TextPreprocesser(article, this.userDefinedTokens, this.userDefinedTokensSplit);
    this.graph.numVerts = Object.keys(this.graph.V).length;

    // Check for user defined similarity function
    this.sim = (this.userDefinedSimilarity != null) ? this.userDefinedSimilarity : this.similarityScoring;

    for (var iIndex in this.graph.V) {
      var Si = this.graph.V[iIndex];

      // The initial score of a vertex is random and does not matter for the TextRank algorithm
      Si["score"] = Math.random() * 10 + 1;

      // Id is the sentence position starting from 0
      Si["id"] = Number(iIndex);

      // Fully connected graph: add an edge to every other sentence
      for (var j = 0; j < this.graph.numVerts; j++) {
        var jIndex = j.toString();

        // No self edges
        if (jIndex === iIndex) {
          continue;
        }

        // If no edge list, create it
        if (!this.graph.E[iIndex]) {
          this.graph.E[iIndex] = {};
        }

        // Compute the edge weight between two sentences in the graph
        this.graph.E[iIndex][jIndex] = this.sim(Si, this.graph.V[jIndex]);
      }
    }
  };

  // Given two sentences compute the weight of the edge between them:
  // (number of distinct words present in both) / (log|Si| + log|Sj|).
  // Returns 0 when the denominator is not positive (e.g. two one-word sentences)
  // so that NaN/Infinity never enters the graph.
  this.similarityScoring = function (Si, Sj) {
    var Si_tokens = Si.tokens;
    var Sj_tokens = Sj.tokens;
    var i;

    // Prototype-less maps: words such as "constructor" or "toString" must be
    // treated as ordinary keys and must never touch inherited members.
    var wordsInI = Object.create(null);
    var alreadyCounted = Object.create(null);

    for (i = 0; i < Si_tokens.length; i++) {
      wordsInI[Si_tokens[i]] = true;
    }

    // Count every distinct word of sentence j that also occurs in sentence i
    var wordOverlapCount = 0;
    for (i = 0; i < Sj_tokens.length; i++) {
      var word = Sj_tokens[i];
      if (wordsInI[word] && !alreadyCounted[word]) {
        alreadyCounted[word] = true;
        wordOverlapCount++;
      }
    }

    var logLengths = Math.log(Si_tokens.length) + Math.log(Sj_tokens.length);
    if (!(logLengths > 0)) {
      return 0;
    }

    return wordOverlapCount / logLengths;
  };

  this.iterations = 0;
  this.iterateAgain = true;

  // The Weighted Graph WS(Vi) function to score every vertex.
  // Sweeps over all vertices (scores are updated in place) and repeats until
  // the score of any vertex changes by at most `delta` during a sweep, or
  // until `maxIterations` extra sweeps were performed.
  this.iterate = function () {
    var V = this.graph.V;
    var E = this.graph.E;
    var index;
    var neighborIndex;
    var outIndex;

    // Sum of the weights over Out(Vj), seeded with 1, for every vertex.
    // Edge weights do not change while iterating, so this is computed once.
    var outWeightSums = {};
    for (index in V) {
      var summedOutWeight = 1;
      var outNeighbors = E[index];
      for (outIndex in outNeighbors) {
        summedOutWeight += outNeighbors[outIndex];
      }
      outWeightSums[index] = summedOutWeight;
    }

    var keepGoing = true;
    while (keepGoing) {
      for (index in V) {
        var score_0 = V[index].score;
        var vertexNeighbors = E[index]; // In(Vi) set
        var summedNeighbors = 0;

        // Sum over In(Vi)
        for (neighborIndex in vertexNeighbors) {
          var wji = vertexNeighbors[neighborIndex];
          summedNeighbors += (wji / outWeightSums[neighborIndex]) * V[neighborIndex].score;
        }

        var score_1 = (1 - this.d) + this.d * summedNeighbors; // WS(Vi)
        V[index].score = score_1;

        // One vertex within the threshold is enough to stop after this sweep
        if (Math.abs(score_1 - score_0) <= this.delta) {
          this.iterateAgain = false;
        }
      }

      keepGoing = this.iterateAgain === true && this.iterations < this.maxIterations;
      if (keepGoing) {
        this.iterations += 1;
      }
    }
  };

  // Extracts the top N sentences (or all of them when there are fewer than N)
  // and returns them in their original article order, either as an array of
  // sentences or as one space-joined string depending on `typeOfSummary`.
  this.extractSummary = function (N) {
    var ranked = [];
    for (var index in this.graph.V) {
      ranked.push(this.graph.V[index]);
    }

    // Highest score first
    ranked.sort(function (a, b) {
      return b.score - a.score;
    });

    // slice() clamps to the number of available sentences
    var chosen = ranked.slice(0, Math.max(0, N));

    // Restore the position of the sentences in the original article
    chosen.sort(function (a, b) {
      return a.id - b.id;
    });

    var picked = chosen.map(function (vertex) {
      return vertex.sentence;
    });

    return this.typeOfSummary ? picked : picked.join(" ");
  };

  this.run = function (article) {
    // Create graph structure
    this.setupGraph(article);

    // Rank sentences
    this.iterate();

    this.summarizedArticle = this.extractSummary(this.extractAmount);
  };

  this.run(article);
}

// CommonJS export. Guarded so the file can also be loaded in an environment
// that does not define `exports`.
if (typeof exports !== "undefined") {
  exports.TextRank = TextRank;
}

/**
 * Handles the preprocessing of text for creating the graph structure of TextRank.
 *
 * @param {string} article          Text to split into sentences.
 * @param {Array}  userTokens       Optional user supplied sentences.
 * @param {Array}  userTokensSplit  Optional user supplied words per sentence;
 *                                  both user arrays must be given to be used.
 * @returns {Object} Map from sentence position (0..N-1) to
 *                   { sentence: <sentence>, tokens: <array of words> }.
 */
function TextPreprocesser(article, userTokens, userTokensSplit) {
  // Delimiters like (,) (:) (;) must not stay glued to a word, otherwise
  // {cool,} would not match {cool} when sentences are scored against each other.
  var delimiterNotBeforeSpace = /[-|'|"|(|)|/|<|>|,|:|;](?! )/g;
  var anyDelimiter = /[-|'|"|(|)|/|<|>|,|:|;]/g;

  // Collapses two or more spaces in a row into a single space
  function cleanArticle(text) {
    return text.replace(/[ ]+(?= )/g, "");
  }

  // Turns the article into an array of sentences.
  // A "|" marker is inserted after every detected sentence end and the text is split on it.
  function tokenizer(text) {
    return text.replace(/([ ][".A-Za-z-|0-9]+[!|.|?|"](?=[ ]["“A-Z]))/g, "$1|").split("|");
  }

  // Drops empty sentences and strips the leading spaces/dots of the remaining ones
  function cleanTokens(tokens) {
    var cleaned = [];
    for (var i = 0; i < tokens.length; i++) {
      if (tokens[i] === "") {
        continue;
      }
      cleaned.push(tokens[i].replace(/[ .]*/, ""));
    }
    return cleaned;
  }

  // Given a sentence, split it up into its lowercased words
  function tokenizeASentence(sentence) {
    var lc_sentence = sentence.toLowerCase();

    // A delimiter that is not followed by a space becomes a space ...
    lc_sentence = lc_sentence.replace(delimiterNotBeforeSpace, " ");

    // ... and every remaining delimiter is dropped
    lc_sentence = lc_sentence.replace(anyDelimiter, "");

    return lc_sentence.split(" ");
  }

  var cleanedArticle = cleanArticle(article);

  // User tokens are only used when both the sentences and their word splits are given
  var usingUserDefinedTokens = (userTokens && userTokensSplit);
  var tokens = (usingUserDefinedTokens) ? userTokens : cleanTokens(tokenizer(cleanedArticle));

  var output = {};
  for (var i = 0; i < tokens.length; i++) {
    output[i] = {
      sentence: tokens[i],
      tokens: (usingUserDefinedTokens) ? userTokensSplit[i] : tokenizeASentence(tokens[i])
    };
  }

  return output;
}