UNPKG

node-summary

Version:

Summarizes text using a naive summarization algorithm

259 lines (222 loc) 10.9 kB
import _ from 'lodash' import Tokenizer from 'sbd' import request from 'request' let htmlToText = require('html-to-text'); let cheerio = require('cheerio') function splitContentToSentences(content, callback) { if(content.indexOf('.') === -1) { return callback(false) } callback(Tokenizer.sentences(content, {newline_boundaries: true}) || []) } function splitContentToParagraphs(content, callback) { callback(content.split("\n\n")) } /** * Original code from http://stackoverflow.com/a/1885660/394013 */ function intersect_safe(a, b) { var ai = 0, bi=0 var result = [] while(ai < a.length && bi < b.length){ if (a[ai] < b[bi] ){ ai++ } else if (a[ai] > b[bi] ){ bi++ } else /* they're equal */ { result.push(a[ai]) ai++ bi++ } } return result } function sentencesIntersection(sent1, sent2, callback) { var s1 = sent1.split(' ') var s2 = sent2.split(' ') if((s1.length + s2.length) === 0) { callback(true) } var intersect = intersect_safe(s1, s2) var spliceHere = ((s1.length + s2.length) / 2) callback(false, intersect.splice(0, spliceHere).length) } function formatSentence(sentence, callback) { if(sentence && sentence.replace) { // To support unicode characters. // http://www.unicode.org/reports/tr29/WordBreakTest.html var re = /[^A-Za-z\u00AA\u00B5\u00BA\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u0236\u0250-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EE\u0345\u037A\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03F5\u03F7-\u03FB\u0400-\u0481\u048A-\u04CE\u04D0-\u04F5\u04F8-\u04F9\u0500-\u050F\u0531-\u0556\u0559\u0561-\u0587\u05B0-\u05B9\u05BB-\u05BD\u05BF\u05C1-\u05C2\u05C4\u05D0-\u05EA\u05F0-\u05F3\u0610-\u0615\u0621-\u063A\u0640-\u0657\u066E-\u06D3\u06D5-\u06DC\u06E1-\u06E8\u06ED-\u06EF\u06FA-\u06FC\u06FF\u0710-\u073F\u074D-\u074F\u0780-\u07B1\u0901-\u0939\u093D-\u094C\u0950\u0958-\u0963\u0981-\u0983\u0985-\u098C\u098F-\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD-\u09C4\u09C7-\u09C8\u09CB-\u09CC\u09D7\u09DC-\u09DD\u09DF-\u09E3\u09F0-\u09F1\u0A01-\u0A03\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32-\u0A33\u0A35-\u0A36\u0A38-\u0A39\u0A3E-\u0A42\u0A47-\u0A48\u0A4B-\u0A4C\u0A59-\u0A5C\u0A5E\u0A70-\u0A74\u0A81-\u0A83\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3\u0AB5-\u0AB9\u0ABD-\u0AC5\u0AC7-\u0AC9\u0ACB-\u0ACC\u0AD0\u0AE0-\u0AE3\u0B01-\u0B03\u0B05-\u0B0C\u0B0F-\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32-\u0B33\u0B35-\u0B39\u0B3D-\u0B43\u0B47-\u0B48\u0B4B-\u0B4C\u0B56-\u0B57\u0B5C-\u0B5D\u0B5F-\u0B61\u0B71\u0B82-\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F\u0BA3-\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCC\u0BD7\u0C01-\u0C03\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4C\u0C55-\u0C56\u0C60-\u0C61\u0C82-\u0C83\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCC\u0CD5-\u0CD6\u0CDE\u0CE0-\u0CE1\u0D02-\u0D03\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D3E-\u0D43\u0D46-\u0D48\u0D4A-\u0D4C\u0D57\u0D60-\u0D61\u0D82-\u0D83\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0DCF-\u0DD4\u0DD6\u0DD8-\u0DDF\u0DF2-\u0DF3\u0F00\u0F40-\u0F47\u0F49-\u0F6A\u0F71-\u0F81\u0F88-\u0F8B\u0F90-\u0F97\u0F99-\u0FBC\u1000-\u1021\u1023-\u1027\u1029-\u102A\u102C-\u1032\u1036\u1038\u1050-\u1059\u10A0-\u10C5\u10D0-\u10F8\u1100-\u1159\u115F-\u11A2\u11A8-\u11F9\u1200-\u1206\u1208-\u1246\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1286\u1288\u128A-\u128D\u1290-\u12AE\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12CE\u12D0-\u12D6\u12D8-\u12EE\u12F0-\u130E\u1310\u1312-\u1315\u1318-\u131E\u1320-\u1346\u1348-\u135A\u13A0-\u13F4\u1401-\u166C\u166F-\u1676\u1681-\u169A\u16A0-\u16EA\u16EE-\u16F0\u1700-\u170C\u170E-\u1713\u1720-\u1733\u1740-\u1753\u1760-\u176C\u176E-\u1770\u1772-\u1773\u1780-\u17B3\u17B6-\u17C8\u17D7\u17DC\u1820-\u1877\u1880-\u18A9\u1900-\u191C\u1920-\u192B\u1930-\u1938\u1950-\u196D\u1970-\u1974\u1D00-\u1D6B\u1E00-\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2131\u2133-\u2139\u213D-\u213F\u2145-\u2149\u2160-\u2183\u3005\u3031-\u3035\u303B-\u303C\u3105-\u312C\u3131-\u318E\u31A0-\u31B7\uA000-\uA48C\uAC00-\uD7A3\uFA30-\uFA6A\uFB00-\uFB06\uFB13-\uFB17\uFB1D-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40-\uFB41\uFB43-\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A\uFFA0-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\U00010000-\U0001000B\U0001000D-\U00010026\U00010028-\U0001003A\U0001003C-\U0001003D\U0001003F-\U0001004D\U00010050-\U0001005D\U00010080-\U000100FA\U00010300-\U0001031E\U00010330-\U0001034A\U00010380-\U0001039D\U00010400-\U0001049D\U00010800-\U00010805\U00010808\U0001080A-\U00010835\U00010837-\U00010838\U0001083C\U0001083F\U0001D400-\U0001D454\U0001D456-\U0001D49C\U0001D49E-\U0001D49F\U0001D4A2\U0001D4A5-\U0001D4A6\U0001D4A9-\U0001D4AC\U0001D4AE-\U0001D4B9\U0001D4BB\U0001D4BD-\U0001D4C3\U0001D4C5-\U0001D505\U0001D507-\U0001D50A\U0001D50D-\U0001D514\U0001D516-\U0001D51C\U0001D51E-\U0001D539\U0001D53B-\U0001D53E\U0001D540-\U0001D544\U0001D546\U0001D54A-\U0001D550\U0001D552-\U0001D6A3\U0001D6A8-\U0001D6C0\U0001D6C2-\U0001D6DA\U0001D6DC-\U0001D6FA\U0001D6FC-\U0001D714\U0001D716-\U0001D734\U0001D736-\U0001D74E\U0001D750-\U0001D76E\U0001D770-\U0001D788\U0001D78A-\U0001D7A8\U0001D7AA-\U0001D7C2\U0001D7C4-\U0001D7C9]/g return callback(sentence.replace(re, '')) } return callback(sentence) } function getBestSentence(paragraph, sentences_dict, callback) { splitContentToSentences(paragraph, function(sentences) { if (!sentences) return '' if (sentences.length < 2) return '' var best_sentence = '', max_value = 0, strip_s, sentence, s for(s in sentences) { sentence = sentences[s] formatSentence(sentence, function(strip_s) { if(strip_s && sentences_dict[strip_s] > max_value) { max_value = sentences_dict[strip_s] best_sentence = sentence } }) } callback(best_sentence) }) } function getSortedSentences(paragraph, sentences_dict, n, callback) { splitContentToSentences(paragraph, function(sentences) { if (!sentences) return callback('') if (sentences.length < 2) return callback('') var sentence_scores = [], strip_s _.each(sentences, function(s, i) { formatSentence(s, function(strip_s) { if(strip_s) { sentence_scores.push({ sentence: s, score: sentences_dict[strip_s], order: i, }) } }) }) sentence_scores = _.sortBy(sentence_scores, function(sentence_score) { return -(sentence_score.score) }) if(sentence_scores.length < n || n === 0) { n = sentence_scores.length } sentence_scores = sentence_scores.slice(0, n) sentence_scores = _.sortBy(sentence_scores, function(sentence) { return sentence.order }) callback(_.map(sentence_scores, 'sentence')) // callback with sorted_sentences. _.map is former .pluck }) } function getSentencesRanks(content, callback, sentences_dict) { if (sentences_dict !== undefined) { // return cached sentences_dict if available callback(sentences_dict) return } else sentences_dict = {} splitContentToSentences(content, function(sentences) { var n = sentences.length, zeroNRange = _.range(0, n), nRange = _.range(n) // This is ugly, I know. var values = [], _val = [] _.each(nRange, function(x) { _val = [] _.each(nRange, function(y) { _val.push(0) }) values.push(_val) }) // Assign each score to each sentence _.each(zeroNRange, function(i) { _.each(zeroNRange, function(j) { sentencesIntersection(sentences[i], sentences[j], function(err, intersection) { if(err) throw err values[i][j] = intersection }) }) }) // Build sentence score dictionary var score = 0 _.some(zeroNRange, function(i) { score = 0 _.some(zeroNRange, function(j) { if(i !== j) score += values[i][j] }) formatSentence(sentences[i], function(strip_s) { sentences_dict[strip_s] = score }) }) callback(sentences_dict) }) } exports.summarizeFromUrl = function(url, callback) { var summaryToolContext = this if (isValidUrl(url)) { request.get(url, function(error, response, body) { let title = getTitle(body) let text = convertHTMLToText(body) let content = onlyGetSentences(text) return summaryToolContext.summarize(title, content, function(err, result, dict) { if(err) { callback(err, result, dict) } else { callback(err, result, dict) } }) }) } else { callback(true, "Not a valid url. Please try passing a valid url like https://example.com/.") } } function onlyGetSentences(text) { return text.split('* ').reduce(function(prevVal, currVal) { return prevVal.length > currVal.length ? prevVal : currVal }).split('\n').filter(function(sentence) { return sentence.length > 10 }).join('') } function convertHTMLToText(body) { return htmlToText.fromString(body.toString(), { ignoreHref: true, ignoreImage: true }) } function isValidUrl (url) { let pattern = "^(https?://)?(www\\.)?([-a-z0-9]{1,63}\\.)" + "*?[a-z0-9][-a-z0-9]{0,61}[a-z0-9]\\" + ".[a-z]{2,6}(/[-\\w@\\+\\.~#\\?&/=%]*)?$"; let regexQuery = new RegExp(pattern, 'i') return regexQuery.test(url) ? true : false } function getTitle(htmlBody) { let $ = cheerio.load(htmlBody) return $('title').text() || $('h1').text() || "" } exports.summarize = function(title, content, callback, sentences_dict) { var summary = [], paragraphs = [], sentence = '', err = false if(arguments.length < 3) { if(content.constructor === Function) { callback = content content = title title = "" } } getSentencesRanks(content, function(dict) { splitContentToParagraphs(content, function(paragraphs) { summary.push(title) // Store the title. _.each(paragraphs, function(p) { getBestSentence(p, dict, function(sentence) { if(sentence) summary.push(sentence) }) }) // If we only have a title, then there is an issue. if(sentence.length === 2) err = true callback(err, summary.join("\n"), dict) }) }, sentences_dict) } exports.getSortedSentences = function(content, n, callback, sentences_dict) { if (typeof(n) === 'function') { callback = n n = 0 } getSentencesRanks(content, function(dict) { getSortedSentences(content, dict, n, function(sorted_sentences) { if(sorted_sentences === '') { callback(new Error('Too short to summarize.')) } else { callback(null, sorted_sentences, dict) } }) }, sentences_dict) }