morrr-node-pos
Version:
Node.js module to analyze parts of speech
192 lines (178 loc) • 4.92 kB
JavaScript
var fs = require('fs');
exports.MOBYnounPhrase = function(string, fn) {
getLibrary(function(library) {
var results = [];
sentences = makeArray(string, '.');
forEach(sentences, function(sentence) {
var current = [];
string = makeArray(sentence, ' ');
everyPossible(string, function(subset) {
var testString = "";
forEach(subset, function(word, i) {
if (i !== subset.length - 1) {
testString = testString + word + ' ';
} else {
testString = testString + word;
}
});
if (library['h'].indexOf(testString) !== -1) {
current.push(testString);
}
});
if (current.length > 0) {
results.push(current);
}
});
return fn(results);
});
};
exports.findPhrases = function(string, fn) {
var results = [];
getLibrary(function(library) {
string = makeArray(string, '.');
forEach(string, function(sentence) {
sentence = makeArray(sentence, ' ');
forEvery(sentence, function(subSentence) {
var head = getParts(subSentence[0], library, true);
var npVp = returnPhrase(head);
if (npVp && subSentence.length > 1) {
results.push({
phrase: subSentence,
type: npVp
});
}
});
});
return fn(results);
});
};
exports.partsOfSpeech = function(string, fn) {
var results = [];
getLibrary(function(library) {
var sentences = makeArray(string, '.');
forEach(sentences, function(sentence) {
var current = [];
var words = makeArray(sentence, ' ');
forEach(words, function(word) {
word = noPunc(word);
data = {
word: word,
pos: []
};
data.pos = getParts(word, library);
if (data.pos.length === 0 && (word.match(/[A-Z]/))) {
wordB = word.toLowerCase();
data.pos = getParts(wordB, library);
}
current.push(data);
});
results.push(current);
});
return fn(results);
});
};
/**
 * Loads the part-of-speech dictionary stored next to this module.
 *
 * Reads `posDic.js` from this module's directory, parses its contents as
 * JSON and hands the resulting object to `fn`. The file is read on every
 * call.
 *
 * @param {function(Object)} fn - Receives the parsed dictionary.
 * @returns {void}
 * @throws {Error} The read error (or a JSON SyntaxError) is thrown from
 *   inside the asynchronous callback, so callers cannot catch it with
 *   try/catch around this call.
 */
function getLibrary(fn) {
  fs.readFile(__dirname + '/posDic.js', 'Utf8', function(err, data) {
    if (err) {
      // Surface the real I/O failure instead of a TypeError on undefined data.
      throw err;
    }
    // An encoding was given, so `data` is already a string.
    return fn(JSON.parse(data));
  });
}
/**
 * Lists the dictionary entries that contain a word.
 *
 * @param {string} word - Word to look up.
 * @param {Object} library - Dictionary keyed by part-of-speech code; each
 *   value must support `indexOf`.
 * @param {boolean} [noconvert] - When truthy, the matching keys are returned
 *   as-is; otherwise each key is passed through `convertNotation`.
 * @returns {Array} One entry per matching key, in enumeration order.
 */
function getParts(word, library, noconvert) {
  var keepRawCodes = Boolean(noconvert);
  var matches = [];
  var code;
  for (code in library) {
    if (library[code].indexOf(word) === -1) {
      continue;
    }
    matches.push(keepRawCodes ? code : convertNotation(code));
  }
  return matches;
}
/**
 * Names the phrase type implied by a set of part-of-speech codes.
 *
 * 'P' takes precedence over the verb codes 'V', 'i' and 't'.
 *
 * @param {Array<string>} array - Raw part-of-speech codes.
 * @returns {string|boolean} 'Prepositional Phrase', 'Verb Phrase', or
 *   `false` when none of the recognised codes is present.
 */
function returnPhrase(array) {
  var contains = function(code) {
    return array.indexOf(code) !== -1;
  };
  if (contains('P')) {
    return 'Prepositional Phrase';
  }
  var isVerbal = contains('V') || contains('i') || contains('t');
  return isVerbal ? 'Verb Phrase' : false;
}
/**
 * Translates a single-letter part-of-speech code into its readable name.
 *
 * @param {string} part - Part-of-speech code, e.g. 'N' or 'h'.
 * @returns {string|undefined} The name for the code, or `undefined` when the
 *   code is not in the table.
 */
function convertNotation(part) {
  // Keyed by code so the lookup is direct instead of a scan over an
  // inverted table.
  var names = {
    'N': "Noun",
    'p': "Plural",
    'h': "Noun Phrase",
    'V': "Verb",
    't': "Verb Transitive",
    'i': "Verb Intransitive",
    'A': "Adjective",
    'v': "Adverb",
    'C': "Conjunction",
    'P': "Preposition",
    '!': "Interjection",
    'r': "Pronoun",
    'D': "Definite Article",
    // The two codes below complete the Moby Part-of-Speech key; they were
    // missing, so such codes used to come back as undefined.
    'I': "Indefinite Article",
    'o': "Nominative"
  };
  // Own-property check keeps inherited names such as 'toString' from matching.
  if (typeof part === 'string' && Object.prototype.hasOwnProperty.call(names, part)) {
    return names[part];
  }
  return undefined;
}
/**
 * Drops blank entries and strips punctuation from the rest.
 *
 * Entries equal to '' or ' ' are skipped; every other entry is passed
 * through `noPunc` before being kept.
 *
 * @param {Array<string>} array - Raw pieces, e.g. the output of `split`.
 * @returns {Array<string>} The cleaned pieces, in their original order.
 */
function rejectEmpty(array) {
  var kept = [];
  for (var idx = 0; idx < array.length; idx++) {
    var item = array[idx];
    if (item === '' || item === ' ') {
      continue;
    }
    kept.push(noPunc(item));
  }
  return kept;
}
/**
 * Strips leading and trailing punctuation from a word.
 *
 * The characters removed are ! . ? " ' and , — any run of them at either
 * end of the string. Punctuation inside the word is left alone.
 *
 * @param {string} word - Token to clean.
 * @returns {string} The token without surrounding punctuation; '' when the
 *   input is empty or consists only of punctuation (the previous recursive
 *   version threw a TypeError on such input).
 */
function noPunc(word) {
  return word.replace(/^[!.?"',]+|[!.?"',]+$/g, '');
}
/**
 * Calls `fn(element, index)` for each position of `array`, in order.
 *
 * The length is re-read on every step, so elements appended during the
 * iteration are visited too.
 *
 * @param {Array} array - Items to visit (anything with `length` and indices).
 * @param {function(*, number)} fn - Invoked with the element and its index.
 * @returns {void}
 */
function forEach(array, fn) {
  var position = 0;
  while (position < array.length) {
    fn(array[position], position);
    position += 1;
  }
}
/**
 * Calls `fn` once for every suffix of `array`, longest first.
 *
 * For [a, b, c] the callback receives [a, b, c], then [b, c], then [c].
 * Each suffix is a fresh array, so the callback may keep or modify it.
 *
 * @param {Array} array - Source items.
 * @param {function(Array)} fn - Invoked with each suffix.
 * @returns {void}
 */
function forEvery(array, fn) {
  for (var i = 0; i < array.length; i++) {
    // `slice` builds the suffix locally; the original assigned it to an
    // undeclared `list`, creating an implicit global.
    fn(array.slice(i));
  }
}
/**
 * Calls `fn` with every contiguous run of elements in `array`.
 *
 * For each suffix produced by `forEvery`, the prefixes of that suffix are
 * passed to `fn` from longest to shortest.
 *
 * @param {Array} array - Source items.
 * @param {function(Array)} fn - Invoked with each contiguous sub-array.
 * @returns {void}
 */
function everyPossible(array, fn) {
  forEvery(array, function(suffix) {
    var end = suffix.length;
    while (end > 0) {
      fn(suffix.slice(0, end));
      end -= 1;
    }
  });
}
/**
 * Splits a string and cleans the pieces.
 *
 * @param {string} input - Text to split.
 * @param {string} join - Separator handed to `split` (despite the name, it
 *   is the delimiter to split on).
 * @returns {Array<string>} The pieces after `rejectEmpty` has filtered them.
 */
function makeArray(input, join) {
  return rejectEmpty(input.split(join));
}