nlpsum
Version:
Powerful text summarization algorithms from research papers and dedicated research.
91 lines (76 loc) • 2.34 kB
JavaScript
/*!
* jsPOS
*
* Copyright 2010, Percy Wegmann
* Licensed under the GNU LGPLv3 license
* http://www.opensource.org/licenses/lgpl-3.0.html
*/
module.exports = Lexer;
var re = {
// http://daringfireball.net/2010/07/improved_regex_for_matching_urls
url: /\b(?:(?:[a-z][\w-]+:(?:\/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}\/)(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\))+(?:\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))/ig,
number: /[0-9]*\.[0-9]+|[0-9]+/ig,
space: /\s+/ig,
unblank: /\S/,
punctuation: /[\/\.\,\?\!\"\']/ig
}
function LexerNode(string, regex, regexs){
this.string = string;
this.children = [];
if (string) {
this.matches = string.match(regex);
var childElements = string.split(regex);
}
if (!this.matches) {
this.matches = [];
var childElements = [string];
}
if (!regexs.length) {
// no more regular expressions, we're done
this.children = childElements;
} else {
// descend recursively
var nextRegex = regexs[0]
, nextRegexes = regexs.slice(1);
for (var i in childElements) {
if (childElements.hasOwnProperty(i)) {
this.children.push(
new LexerNode(childElements[i], nextRegex, nextRegexes));
}
}
}
}
LexerNode.prototype.fillArray = function(array){
for (var i in this.children) {
if (this.children.hasOwnProperty(i)) {
var child = this.children[i];
if (child.fillArray) {
child.fillArray(array);
} else if (re.unblank.test(child)) {
array.push(child);
}
if (i < this.matches.length) {
var match = this.matches[i];
if (re.unblank.test(match))
array.push(match);
}
}
}
}
LexerNode.prototype.toString = function(){
var array = [];
this.fillArray(array);
return array.toString();
}
function Lexer(){
// Split by urls, then numbers, then whitespace, then punctuation
this.regexs = [re.url, re.number, re.space, re.punctuation];
}
Lexer.prototype.lex = function(string){
var array = []
, node = new LexerNode(string, this.regexs[0], this.regexs.slice(1));
node.fillArray(array);
return array;
}
//var lexer = new Lexer();
//print(lexer.lex("I made $5.60 today in 1 hour of work. The E.M.T.'s were on time, but only barely.").toString());