wink-pos-tagger
Version:
English Part-of-speech (POS) tagger
177 lines (167 loc) • 6.61 kB
JavaScript
// wink-pos-tagger
// English Part-of-speech (POS) tagger
//
// Copyright (C) 2017-19 GRAYPE Systems Private Limited
//
// This file is part of “wink-pos-tagger”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
// Defines the morphological rules for guessing the POS for unknown words.
var unknownWordsPOS = require( 'wink-lexicon/src/unknown-words.js' );
// POS for `tag`!
var tagPOS = require( 'wink-lexicon/src/tags.js' );
var words = require( 'wink-lexicon/src/wn-words.js' );
var adjExceptions = require( 'wink-lexicon/src/wn-adjective-exceptions.js' );
var nounExceptions = require( 'wink-lexicon/src/wn-noun-exceptions.js' );
var verbExceptions = require( 'wink-lexicon/src/wn-verb-exceptions.js' );
var senseMap = require( 'wink-lexicon/src/wn-word-senses.js' );
// POS for punctuations.
var punctuationPOS = require( 'wink-lexicon/src/punctuations.js' );
// Suffix rewrite rules used by `isPotentialWord()` to reduce an inflected word
// to a candidate base form. Each rule strips the suffix matched by `replace`
// and appends `by`; the candidate is then looked up in `words`. The position
// of each entry is significant only for the index ranges noted below, so
// entries must not be removed or reordered casually.
const rgxs4POS = [
  // 0-2: NNS/VBZ
  { replace: /s$/, by: '' },
  { replace: /es$/, by: '' },
  { replace: /ies$/, by: 'y' },
  // 3-4: VBG
  { replace: /ing$/, by: '' },
  { replace: /ing$/, by: 'e' },
  // 5-6: VBD/VBN
  { replace: /ed$/, by: '' },
  { replace: /d$/, by: '' },
  // 7-8: JJR
  { replace: /er$/, by: '' },
  { replace: /r$/, by: '' },
  // 9-10: JJS
  { replace: /est$/, by: '' },
  { replace: /st$/, by: '' },
  // Balance: NNS
  { replace: /xes$/, by: 'x' },
  // A `-zes` plural drops `-es` and keeps the `z` (waltzes -> waltz); mapping
  // it to `s` produced unrelated stems such as `walts`.
  { replace: /zes$/, by: 'z' },
  { replace: /ves$/, by: 'f' },
  { replace: /ches$/, by: 'ch' },
  { replace: /shes$/, by: 'sh' },
  { replace: /men$/, by: 'man' }
];
// ### isPotentialWord
/**
 *
 * Decides whether the supplied `word` could be an inflected form of an entry
 * in the level-2 `words` list. Every suffix rule in `rgxs4POS` whose pattern
 * matches the word is applied in sequence and the resulting candidate stem is
 * looked up; the search stops at the first stem that is found.
 *
 * @param {string} word that needs to be checked.
 * @return {boolean} true if at least one rule yields a stem present in
 * `words`, otherwise false.
 * @private
 */
var isPotentialWord = function ( word ) {
  return rgxs4POS.some( function ( rule ) {
    // Skip rules whose suffix is absent from the word.
    if ( !rule.replace.test( word ) ) return false;
    var stem = word.replace( rule.replace, rule.by );
    // Compare with `undefined` so that a stored value of `0` still counts.
    return words[ stem ] !== undefined;
  } );
}; // isPotentialWord()
// ### unigramL2POSTagger
/**
 *
 * Tags a single input `token` whose lookup has failed in the `lexicon`. If the
 * word passes the level-2 lookup — it is listed in `words`, is present in one
 * of the adjective/noun/verb exception lists, or can be reduced to a listed
 * word by `isPotentialWord()` — its POS is guessed from its suffix using the
 * morphological rules. Finally falls back to NNP!
 *
 * @param {object} token that needs to be tagged; must follow wink-tokenizer
 * standards.
 * @param {object} lexicon containing word/pos key/value pairs.
 * @return {array} the array of all possible part-of-speeches.
 * @private
 */
var unigramL2POSTagger = function ( token, lexicon ) {
  var word = token.normal;
  var index = words[ word ];
  // `index` is used as a position in `senseMap`, which makes `0` a valid hit.
  // Compare with `undefined` — exactly as `isPotentialWord()` does — instead
  // of relying on truthiness, which would treat a word at index 0 as unknown.
  var isListed = ( index !== undefined );
  var pos;

  if ( isListed || adjExceptions[ word ] || nounExceptions[ word ] || verbExceptions[ word ] || isPotentialWord( word ) ) {
    // A listed word whose first sense is 15 and whose surface form starts with
    // a capital followed by lowercase letter(s) is tagged as a proper noun.
    // NOTE(review): 15 presumably is the WordNet lexicographer file number for
    // `noun.location` — confirm against wink-lexicon.
    if ( isListed && senseMap[ index ][ 0 ] === 15 && ( /^[A-Z][a-z]+/ ).test( token.value ) ) {
      pos = 'NNP';
    } else {
      // Word exists, can apply morphological rules safely.
      // Their sequence of application is important: match the longest
      // one first!
      pos = unknownWordsPOS[ word.slice( -4 ) ] ||
            unknownWordsPOS[ word.slice( -3 ) ] ||
            unknownWordsPOS[ word.slice( -2 ) ] ||
            unknownWordsPOS[ word.slice( -1 ) ];
      // No suffix rule matched: an `un` prefix on a word found in the lexicon
      // is taken to be an adjective.
      if ( !pos && word.slice( 0, 2 ) === 'un' && lexicon[ word.slice( 2 ) ] ) {
        pos = 'JJ';
      }
    }
  } else {
    // Seems like an unknown word, make it proper noun!
    pos = 'NNP';
  }

  // Return poses intelligently to allow the context rules to work: ambiguous
  // guesses are widened to every plausible alternative.
  if ( pos === undefined ) return [ 'NN', 'VBP', 'VB' ];
  if ( pos === 'NNS' ) return [ 'NNS', 'VBZ' ];
  if ( pos === 'VBN' ) return [ 'VBD', 'VBN' ];
  return [ pos ];
}; // unigramL2POSTagger()
// ### unigramPOSTagger
/**
 *
 * Determines the candidate part-of-speech tags for one `token` and records the
 * preferred one on `token.pos`. The checks run in this order:
 *
 * 1. an entity token that already carries a `pos` is returned untouched;
 * 2. punctuation is resolved via `punctuationPOS`, defaulting to its own value;
 * 3. every symbol other than `&` is tagged `NN`;
 * 4. the token's `tag` is looked up in `tagPOS`;
 * 5. `token.normal` is looked up in the `lexicon`; and
 * 6. on a lexicon miss, `unigramL2POSTagger()` supplies the candidates.
 *
 * @param {object} token that needs to be tagged; must follow wink-tokenizer
 * standards.
 * @param {object} lexicon containing word/pos key/value pairs.
 * @return {array} the array of all possible part-of-speeches.
 * @private
 */
var unigramPOSTagger = function ( token, lexicon ) {
  // An **entity with pos defined** is already tagged; nothing to do.
  if ( token.entityType && token.pos ) return [ token.pos ];

  var tag = token.tag;

  // Punctuation: the `|| token.value` part is a catch all, i.e. an unknown
  // punctuation mark gets **its own value** as the pos.
  if ( tag === 'punctuation' ) {
    token.pos = punctuationPOS[ token.value ] || token.value;
    return [ token.pos ];
  }

  // Symbols are treated as nouns, except for the ampersand.
  if ( tag === 'symbol' && token.value !== '&' ) {
    token.pos = 'NN';
    return [ token.pos ];
  }

  // A pos derived directly from the token's tag settles the matter.
  token.pos = tagPOS[ tag ];
  if ( token.pos !== undefined ) return [ token.pos ];

  // Dictionary lookup on the normalized word (not a mere lower-cased value),
  // falling back to the level-2 tagger, which always returns an array.
  var candidates = lexicon[ token.normal ] || unigramL2POSTagger( token, lexicon );
  token.pos = candidates[ 0 ];
  return candidates;
}; // unigramPOSTagger();
// Only the level-1 tagger is exported; it delegates to `unigramL2POSTagger()`
// whenever the lexicon lookup fails.
module.exports = unigramPOSTagger;