UNPKG

nlpsum

Version:

Powerful text summarization algorithms from research papers and dedicated research.

91 lines (76 loc) 2.34 kB
/*!
 * jsPOS
 *
 * Copyright 2010, Percy Wegmann
 * Licensed under the GNU LGPLv3 license
 * http://www.opensource.org/licenses/lgpl-3.0.html
 */

// Export is guarded so the file also loads where CommonJS `module` is absent
// (e.g. a plain browser script); under CommonJS the behaviour is unchanged.
// `Lexer` is a hoisted function declaration, so referencing it here is safe.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = Lexer;
}

/**
 * Regular expressions used by the lexer.
 *
 * The splitting expressions carry the `g` flag so that `String#match` returns
 * every occurrence. `unblank` deliberately has no `g` flag, which keeps
 * `unblank.test()` stateless (no `lastIndex` carried between calls).
 */
var re = {
  // http://daringfireball.net/2010/07/improved_regex_for_matching_urls
  url: /\b(?:(?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/ig,
  number: /[0-9]*\.[0-9]+|[0-9]+/ig,
  space: /\s+/ig,
  unblank: /\S/,
  punctuation: /[\/\.\,\?\!\"\']/ig
};

/**
 * A node in the lexing tree.
 *
 * The node splits `string` on `regex`, remembering both the separators that
 * matched (`this.matches`) and the pieces between them (`this.children`).
 * When further expressions remain in `regexs`, every piece is wrapped in a
 * child LexerNode that splits it on the next expression; otherwise the pieces
 * are kept as plain strings.
 *
 * @param {string} string - text to split
 * @param {RegExp} regex - expression to split on at this level
 * @param {RegExp[]} regexs - expressions for the deeper levels, in order
 */
function LexerNode(string, regex, regexs) {
  this.string = string;
  this.children = [];
  this.matches = [];

  // Default: nothing matched, so the whole input is the single piece.
  var pieces = [string];
  if (string) {
    var found = string.match(regex);
    if (found) {
      this.matches = found;
      pieces = string.split(regex);
    }
  }

  if (!regexs.length) {
    // no more regular expressions, we're done
    this.children = pieces;
    return;
  }

  // descend recursively
  var nextRegex = regexs[0];
  var nextRegexes = regexs.slice(1);
  for (var i = 0; i < pieces.length; i++) {
    this.children.push(new LexerNode(pieces[i], nextRegex, nextRegexes));
  }
}

/**
 * Appends this node's tokens to `array` in source order: the tokens of each
 * child, followed by the separator that was matched right after that child.
 * Pieces and separators that contain only whitespace (or nothing) are skipped.
 *
 * @param {string[]} array - output array, mutated in place
 */
LexerNode.prototype.fillArray = function (array) {
  for (var i = 0; i < this.children.length; i++) {
    var child = this.children[i];
    if (child != null && typeof child.fillArray === 'function') {
      child.fillArray(array);
    } else if (child != null && re.unblank.test(child)) {
      // Leaf piece; the null check keeps a node built from a missing string
      // from throwing here.
      array.push(child);
    }

    // matches[i] is the separator found between children[i] and children[i + 1].
    if (i < this.matches.length) {
      var match = this.matches[i];
      if (re.unblank.test(match)) {
        array.push(match);
      }
    }
  }
};

/**
 * @returns {string} the node's tokens joined with commas (`Array#toString`)
 */
LexerNode.prototype.toString = function () {
  var array = [];
  this.fillArray(array);
  return array.toString();
};

/**
 * Tokenizer that splits text by urls, then numbers, then whitespace, then
 * punctuation.
 */
function Lexer() {
  // Split by urls, then numbers, then whitespace, then punctuation
  this.regexs = [re.url, re.number, re.space, re.punctuation];
}

/**
 * Splits `string` into an array of tokens.
 *
 * @param {string} string - text to tokenize; `null`/`undefined` yield an empty
 *   array, and any other non-string value is converted with `String()` first
 * @returns {string[]} the tokens in source order, without whitespace
 *
 * @example
 * var lexer = new Lexer();
 * lexer.lex("I made $5.60 today in 1 hour of work. The E.M.T.'s were on time, but only barely.").toString();
 */
Lexer.prototype.lex = function (string) {
  if (string == null) {
    return [];
  }

  var array = [];
  var node = new LexerNode(String(string), this.regexs[0], this.regexs.slice(1));
  node.fillArray(array);
  return array;
};