ab-node-summarizer
Text summarizer using Node.js
const WordPos = require("wordpos");
const WeightedGraph = require('./WeightedGraph').WeightedGraph;
const sbd = require('sbd');
class Preprocesser {
constructor() {
this.tokenizer = sbd;
}
//This method takes in a paragraph and returns a list of the sentences in the paragraph.
paragraphToSentences(string_to_process) {
try {
const result = this.tokenizer.sentences(string_to_process, {});
return result;
} catch (err) {
return Error("Cannot toeknize the given string.");
}
}
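//Usage sketch (illustrative input, not from the source):
//  const pre = new Preprocesser();
//  pre.paragraphToSentences("Dogs bark. Cats meow.");
//  // => ["Dogs bark.", "Cats meow."]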
//Cleans the sentences by removing punctuation and lowercasing them.
//Returns the cleaned list and a map from each cleaned sentence back to its original.
cleanSentences(list_to_clean) {
let sentence_map = new Map();
const regex = /[&\/\\#,+()$~%.'":*?<>{}]/g;
for (let i = 0; i < list_to_clean.length; i++) {
let original_sentence = list_to_clean[i];
list_to_clean[i] = list_to_clean[i].toLowerCase();
list_to_clean[i] = list_to_clean[i].replace(regex, "");
sentence_map.set(list_to_clean[i], original_sentence);
}
return [list_to_clean, sentence_map];
}
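//Usage sketch (illustrative values; note the input list is mutated in place):
//  pre.cleanSentences(["Dogs bark."]);
//  // => [["dogs bark"], Map { "dogs bark" => "Dogs bark." }]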
//Takes in a list of sentences and returns a list of all of the words in the sentences.
tokenizeSentences(list_of_sentences) {
let result_list = [];
for (let i = 0; i < list_of_sentences.length; i++) {
result_list = result_list.concat(list_of_sentences[i].split(" "));
}
return result_list;
}
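//Usage sketch (illustrative values):
//  pre.tokenizeSentences(["dogs bark", "cats meow"]);
//  // => ["dogs", "bark", "cats", "meow"]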
//Takes in a list of words and calculates the frequencies of the words.
//Returns a list. The first item is a map of word->frequency. The second is the max frequency.
getFrequencyAndMax(list_of_words) {
let frequency_map = new Map();
let max = 0;
for (let i = 0; i < list_of_words.length; i++) {
const word = list_of_words[i];
if (frequency_map.has(word)) {
const new_val = frequency_map.get(word) + 1;
frequency_map.set(word, new_val);
if (new_val > max) {
max = new_val;
}
} else {
frequency_map.set(word, 1);
//a word seen only once still sets the max frequency to at least 1,
//otherwise getWeights would divide by zero when every word is unique
if (max < 1) {
max = 1;
}
}
}
return [frequency_map, max];
}
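//Usage sketch (illustrative values):
//  pre.getFrequencyAndMax(["dogs", "bark", "dogs"]);
//  // => [Map { "dogs" => 2, "bark" => 1 }, 2]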
//Normalizes each word's frequency by the maximum frequency, giving every word a weight in (0, 1].
getWeights(list_of_words) {
const frequencies_and_max = this.getFrequencyAndMax(list_of_words);
const frequencies_map = frequencies_and_max[0];
const max = frequencies_and_max[1];
frequencies_map.forEach((value, key, map) => {
map.set(key, value / max);
});
return frequencies_map;
}
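//Usage sketch (illustrative values):
//  pre.getWeights(["dogs", "bark", "dogs"]);
//  // => Map { "dogs" => 1, "bark" => 0.5 }
//Scores each clean sentence by the average weight of its words.
//Returns a list of [average weight, sentence] pairs.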
sentenceWeights(clean_sentences, weighted_map) {
let weight_of_sentence = 0;
let sentence_weight_list = [];
let sentence = "";
for (let i = 0; i < clean_sentences.length; i++) {
sentence = clean_sentences[i];
let word_list = sentence.split(" ");
weight_of_sentence = 0;
for (let j = 0; j < word_list.length; j++) {
weight_of_sentence += weighted_map.get(word_list[j]);
}
sentence_weight_list.push([weight_of_sentence / word_list.length, sentence]);
}
return sentence_weight_list;
}
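//Usage sketch (illustrative values):
//  pre.sentenceWeights(["dogs bark"], new Map([["dogs", 1], ["bark", 0.5]]));
//  // => [[0.75, "dogs bark"]]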
//Takes a list of sentences and returns a map of each sentence to its nouns and adjectives.
async nounsAndAdjectives(clean_sentences) {
let nouns_and_adjectives_map = new Map();
let wordpos = new WordPos({});
try {
for (let i = 0; i < clean_sentences.length; i++) {
try {
let adjectives = await wordpos.getAdjectives(clean_sentences[i]);
let nouns = await wordpos.getNouns(clean_sentences[i]);
nouns_and_adjectives_map.set(clean_sentences[i], nouns.concat(adjectives));
} catch (e) {
// console.warn(e.message, `processing sentence at index [${i}]`);
}
}
return nouns_and_adjectives_map;
} catch (err) {
console.log(err);
//return whatever was processed so callers still receive a Map
return nouns_and_adjectives_map;
}
}
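//Usage sketch (async; exact output depends on the WordNet data wordpos ships with):
//  await pre.nounsAndAdjectives(["dogs bark loudly"]);
//  // => e.g. Map { "dogs bark loudly" => ["dogs", "bark"] }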
//Used for the text rank summary. Takes two lists of words and gets the weight of the edge connecting the vertices.
getEdgeWeights(list1, list2) {
let weight = 0;
let initial = list1;
let other = list2;
if (list2.length >= list1.length) {
initial = list2;
other = list1;
}
//iterate the longer list and count the words it shares with the shorter one
for (let i = 0; i < initial.length; i++) {
if (other.includes(initial[i])) {
weight += 1;
}
}
return weight;
}
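//Usage sketch (illustrative values; one shared word => weight 1):
//  pre.getEdgeWeights(["dogs", "bark"], ["dogs", "cats", "fleas"]);
//  // => 1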
//Creates the graph for the textrank algorithm.
createTextRankGraph(nouns_and_adjective_map) {
let graph = new WeightedGraph();
let key_list = [];
let weight = 0;
nouns_and_adjective_map.forEach((value, key, map) => {
key_list.push(key);
})
for (let i = 0; i < key_list.length; i++) {
for (let j = i + 1; j < key_list.length; j++) {
weight = this.getEdgeWeights(nouns_and_adjective_map.get(key_list[i]),
nouns_and_adjective_map.get(key_list[j]));
if (weight > 0) {
graph.addEdge(key_list[i], key_list[j], weight);
}
}
}
return graph;
}
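//Each sentence becomes a vertex; two sentences are connected if they share at
//least one noun or adjective, with the overlap count as the edge weight.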
//TextRank algorithm.
textRank(graph) {
let key_list = graph.getAllVertices();
let text_rank_map = new Map();
if (key_list.length == 0) {
return text_rank_map;
}
//random key to start with
let key = key_list[Math.floor(Math.random() * key_list.length)];
let vertex = graph.getVertex(key);
let probability_list = [];
//random walk: at each step, move to a neighbor with probability proportional to edge weight
for (let i = 0; i < 10000; i++) {
//repeat each neighbor `weight` times so a uniform draw becomes weight-proportional
vertex.adjacent.forEach((weight, neighbor) => {
for (let x = 0; x < weight; x++) {
probability_list.push(neighbor);
}
})
let sentence = probability_list[Math.floor(Math.random() * probability_list.length)];
if (text_rank_map.has(sentence)) {
text_rank_map.set(sentence, text_rank_map.get(sentence) + 1);
} else {
text_rank_map.set(sentence, 1);
}
vertex = graph.getVertex(sentence);
probability_list = [];
}
return text_rank_map;
}
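//The visit counts approximate the walk's stationary distribution: sentences
//visited more often are better connected and rank higher in the summary.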
}
module.exports.Preprocesser = Preprocesser;
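//End-to-end usage sketch (illustrative; the require path is hypothetical and the
//calls must run inside an async function because nounsAndAdjectives is async):
//  const { Preprocesser } = require("./Preprocesser");
//  const pre = new Preprocesser();
//  const sentences = pre.paragraphToSentences(text);
//  const [clean, originals] = pre.cleanSentences(sentences);
//  const pos_map = await pre.nounsAndAdjectives(clean);
//  const ranks = pre.textRank(pre.createTextRankGraph(pos_map));
//  // sort `ranks` by visit count and map sentences back through `originals` to build the summary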