UNPKG

wink-pos-tagger

Version: (not shown in this UNPKG extract)

English Part-of-speech (POS) tagger

345 lines (329 loc) 13.6 kB
// wink-pos-tagger // English Part-of-speech (POS) tagger // // Copyright (C) 2017-19 GRAYPE Systems Private Limited // // This file is part of “wink-pos-tagger”. // // Permission is hereby granted, free of charge, to any person obtaining a // copy of this software and associated documentation files (the "Software"), // to deal in the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included // in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS IN THE SOFTWARE. // var helpers = require( 'wink-helpers' ); var winkLexicon = require( 'wink-lexicon/src/lexicon.js' ); var unigramPOSTagger = require( './unigram-tagger.js' ); var applyContextRules = require( './rules-engine.js' ); var wl = require( 'wink-lemmatizer' ); var lemmatizeVBX = wl.lemmatizeVerb; var lemmatizeNNX = wl.lemmatizeNoun; var lemmatizeJJX = wl.lemmatizeAdjective; // Load tokenizer, instanciate and get tokenize method; use default config. var tokenize = require( 'wink-tokenizer' )().tokenize; // Extract string normalization function from `wink-helpers`. 
var normalize = helpers.string.normalize;

// Lemma lookup for contractions and irregular fragments that the generic
// lemmatizers cannot handle, e.g. "ai" (from "ain't") → "be".
var lemmaExceptions = Object.create( null );
lemmaExceptions.ai = 'be';
lemmaExceptions.ca = 'can';
lemmaExceptions.sha = 'shall';
lemmaExceptions[ '\'ll' ] = lemmaExceptions.wo = 'will';
lemmaExceptions[ '\'ve' ] = 'have';
lemmaExceptions[ '\'m' ] = 'am';
lemmaExceptions[ '\'re' ] = 'be';
lemmaExceptions[ 'n\'t' ] = 'not';
lemmaExceptions[ '\'d' ] = 'would';

// Needed for simple NNP transformation rule.
const capA = 'A';
const capZ = 'Z';

// Required in raw tokens tagging.
// The alternation is wrapped in a non-capturing group so that `^` and `$`
// anchor BOTH alternatives; previously only the first alternative was
// anchored at the start and only the second at the end, so partial matches
// such as "abc3" or "1/2abc" were wrongly classified as numbers.
const rgxNumber = /^(?:\d+\/\d+|\d(?:[\.\,\-\/]?\d)*(?:\.\d+)?)$/;
const rgxPunctuation = /^[\’\'\‘\’\`\“\”\"\[\]\(\)\{\}\…\,\.\!\;\?\-\:]+$/;

// Used in tagging decades such as "1950s" or "mid-1950s" as cardinal numbers
// (CD). Generated for every decade from the 1910s to the 1990s — the earlier
// hand-written table accidentally omitted plain '1920s' while including
// 'mid-1920s'.
const year = Object.create( null );
for ( var decade = 1910; decade <= 1990; decade += 10 ) {
  year[ decade + 's' ] = 'CD';
  year[ 'mid-' + decade + 's' ] = 'CD';
}

// ### posTagger
/**
 *
 * Creates an instance of {@link Tagger}.
 *
 * @return {Tagger} object containing set of API methods for pos-tagging.
 * @example
 * // Load wink POS tagger.
 * var posTagger = require( 'wink-pos-tagger' );
 * // Create your instance of the tagger.
 * var myTagger = posTagger();
 */
var posTagger = function ( ) {
  /**
   * @classdesc Tagger class
   * @class Tagger
   * @hideconstructor
   */
  var methods = Object.create( null );

  // ### updateLexicon
  /**
   *
   * Updates the internal lexicon using the input `lexicon`. If a word/pos pair
   * is found in the internal lexicon then its value is updated with the new
   * pos; otherwise it is added. The `pos` should be an array containing pos
   * tags, with the first one as the most frequently used POS. The `word` is
   * normalized before updating the internal lexicon.
   *
   * @method Tagger#updateLexicon
   * @param {object} lexicon containing **`word/pos`** pairs to be added to or
   * replaced in the existing lexicon.
   * @return {undefined} Nothing!
   * @throws {Error} if `lexicon` is not a valid JS object.
   * @example
   * myTagger.updateLexicon( { Obama: [ 'NNP' ] } );
   */
  var updateLexicon = function ( lexicon ) {
    if ( !helpers.validate.isObject( lexicon ) ) {
      throw Error( 'wink-pos-tagger/updateLexicon: lexicon must be an object, instead found: ' + JSON.stringify( lexicon ) );
    }
    // Update winkLexicon but with **normalized** key.
    for ( var key in lexicon ) winkLexicon[ normalize( key ) ] = lexicon[ key ]; // eslint-disable-line guard-for-in
  }; // updateLexicon()

  // ### defineConfig
  /**
   *
   * This API has no effect. It has been maintained for compatibility purpose.
   * The `wink-tokenizer` will now always add **lemma** and **normal** forms.
   * Note, lemmas are added only for **nouns** (excluding proper noun),
   * **verbs** and **adjectives**.
   *
   * @method Tagger#defineConfig
   * @return {object} always as `{ lemma: true, normal: true }`.
   * @example
   * // There will not be any effect:
   * myTagger.defineConfig( { lemma: false } );
   * // -> { lemma: true, normal: true }
   */
  var defineConfig = function ( ) {
    // Return a copy of configuration object.
    return ( JSON.parse( JSON.stringify( { lemma: true, normal: true } ) ) );
  }; // defineConfig()

  // ### lemmatize
  /**
   *
   * Performs lemmatization; also applies NNP transformation rules for
   * capitalized nouns and adjectives and CD rule for decades (e.g. "1950s").
   *
   * @method Tagger#lemmatize
   * @param {object[]} tokens to be lemmatized.
   * @return {object[]} lemmatized tokens (mutated in place).
   * @private
   */
  var lemmatize = function ( tokens ) {
    var t, v0, w;
    var lemma;
    for ( let i = 0, imax = tokens.length; i < imax; i += 1 ) {
      t = tokens[ i ];
      w = t.normal;
      v0 = t.value[ 0 ];
      // Decades like "1990s"/"mid-1990s" are cardinal numbers.
      if ( year[ w ] ) t.pos = 'CD';
      // First handle exceptions arising out of contractions.
      lemma = lemmaExceptions[ w ];
      if ( lemma ) {
        t.lemma = lemma;
      } else {
        // Otherwise use lemmatizer, keyed on the first letter of the pos tag.
        switch ( t.pos[ 0 ] ) {
          case 'J':
            if ( ( v0 >= capA ) && ( v0 <= capZ ) ) {
              // Capitalized adjective — treat as proper noun, no lemmatization.
              t.lemma = w;
              t.pos = 'NNP';
            } else {
              // Lemmatize only inflected forms (JJR/JJS), not plain JJ.
              t.lemma = ( t.pos.length > 2 ) ? lemmatizeJJX( w ) : w;
            }
            break;
          case 'V':
            // Lemmatize only inflected verbs (VBD/VBG/VBN/VBP/VBZ), not VB;
            // contraction "'s" (as in "he's") lemmatizes to "be".
            t.lemma = ( t.pos.length > 2 ) ? ( ( t.normal === '\'s') ? 'be' : lemmatizeVBX( w ) ) : w;
            break;
          case 'N':
            if ( ( v0 >= capA ) && ( v0 <= capZ ) ) {
              t.lemma = w;
              t.pos = 'NNP';
            } else {
              // No lemmatization of NNPs please!
              t.lemma = ( t.pos !== 'NNP' && t.pos.length > 2 ) ? lemmatizeNNX( w ) : w;
            }
            break;
          case 'M':
            // Modals lemmatize via the verb lemmatizer.
            t.lemma = lemmatizeVBX( w );
            break;
          default:
            // Do nothing!
        } // switch
      } // if
    }
    return tokens;
  }; // lemmatize()

  // ### tag
  /**
   *
   * Tags the input **`tokens`** with their **pos**. It has another alias —
   * **`tagTokens()`**.
   *
   * *In order to pos tag a sentence directly, use
   * [`tagSentence`](http://winkjs.org/wink-pos-tagger/Tagger.html#tagSentence)
   * API instead.*
   *
   * @method Tagger#tag
   * @param {object[]} tokens to be pos tagged. They are array of objects and
   * must follow the [**`wink-tokenizer`**](http://winkjs.org/wink-tokenizer/)
   * standard.
   * @return {object[]} pos tagged `tokens`.
   * @example
   * // Get `tokenize` method from the instance of `wink-tokenizer`.
   * var tokenize = require( 'wink-tokenizer' )().tokenize;
   * // Tag the tokenized sentence.
   * myTagger.tag( tokenize( 'I ate the entire pizza as I was feeling hungry.' ) );
   * // -> [ { value: 'I', tag: 'word', normal: 'i', pos: 'PRP' },
   * //      { value: 'ate', tag: 'word', normal: 'ate', pos: 'VBD', lemma: 'eat' },
   * //      { value: 'the', tag: 'word', normal: 'the', pos: 'DT' },
   * //      { value: 'entire', tag: 'word', normal: 'entire', pos: 'JJ', lemma: 'entire' },
   * //      { value: 'pizza', tag: 'word', normal: 'pizza', pos: 'NN', lemma: 'pizza' },
   * //      { value: 'as', tag: 'word', normal: 'as', pos: 'IN' },
   * //      { value: 'I', tag: 'word', normal: 'i', pos: 'PRP' },
   * //      { value: 'was', tag: 'word', normal: 'was', pos: 'VBD', lemma: 'be' },
   * //      { value: 'feeling', tag: 'word', normal: 'feeling', pos: 'VBG', lemma: 'feel' },
   * //      { value: 'hungry', tag: 'word', normal: 'hungry', pos: 'JJ', lemma: 'hungry' },
   * //      { value: '.', tag: 'punctuation', normal: '.', pos: '.' } ]
   */
  var tag = function ( tokens ) {
    // Array of "array of each possible pos" for each token.
    var poses = [];
    // Temp token.
    var t;
    for ( let i = 0, imax = tokens.length; i < imax; i += 1 ) {
      t = tokens[ i ];
      // Normal form is always added.
      t.normal = normalize( t.value );
      poses.push( unigramPOSTagger( t, winkLexicon ) );
    }
    // Refine unigram tags using contextual rules, then lemmatize.
    applyContextRules( tokens, poses );
    lemmatize( tokens );
    return tokens;
  }; // tag()

  // ### tagRawTokens
  /**
   *
   * Tags the **`raw tokens`** with their **pos**. Note, it only categorizes
   * each token into one of the following 3-categories (a) word, or
   * (b) punctuation, or (c) number.
   *
   * *In order to pos tag a sentence directly, use
   * [`tagSentence`](http://winkjs.org/wink-pos-tagger/Tagger.html#tagSentence)
   * API instead.*
   *
   * @method Tagger#tagRawTokens
   * @param {string[]} rawTokens to be pos tagged. They are simple array of string.
   * @return {object[]} pos tagged `tokens`.
   * @example
   * var rawTokens = [ 'I', 'ate', 'the', 'entire', 'pizza', 'as', 'I', 'was', 'feeling', 'hungry', '.' ];
   * // Tag the raw tokens.
   * myTagger.tagRawTokens( rawTokens );
   * // -> (same output as the `tag` example above)
   */
  var tagRawTokens = function ( rawTokens ) {
    // Will contain tokens transformed into wink format tokens.
    var wt = [];
    var t;
    for ( var i = 0, imax = rawTokens.length; i < imax; i += 1 ) {
      t = rawTokens[ i ];
      if ( rgxNumber.test( t ) ) {
        wt.push( { value: t, tag: 'number' } );
      } else if ( rgxPunctuation.test( t ) ) {
        wt.push( { value: t, tag: 'punctuation' } );
      } else wt.push( { value: t, tag: 'word' } );
    }
    return tag( wt );
  }; // tagRawTokens()

  // ### tagSentence
  /**
   *
   * Tags the input `sentence` with their **pos**.
   *
   * @method Tagger#tagSentence
   * @param {string} sentence to be pos tagged.
   * @return {object[]} pos tagged `tokens.`
   * @throws {Error} if `sentence` is not a valid string.
   * @example
   * myTagger.tagSentence( 'A bear just crossed the road.' );
   * // -> [ { value: 'A', tag: 'word', normal: 'a', pos: 'DT' },
   * //      { value: 'bear', tag: 'word', normal: 'bear', pos: 'NN', lemma: 'bear' },
   * //      { value: 'just', tag: 'word', normal: 'just', pos: 'RB' },
   * //      { value: 'crossed', tag: 'word', normal: 'crossed', pos: 'VBD', lemma: 'cross' },
   * //      { value: 'the', tag: 'word', normal: 'the', pos: 'DT' },
   * //      { value: 'road', tag: 'word', normal: 'road', pos: 'NN', lemma: 'road' },
   * //      { value: '.', tag: 'punctuation', normal: '.', pos: '.' } ]
   * //
   * myTagger.tagSentence( 'I will bear all the expenses.' );
   * // -> [ { value: 'I', tag: 'word', normal: 'i', pos: 'PRP' },
   * //      { value: 'will', tag: 'word', normal: 'will', pos: 'MD', lemma: 'will' },
   * //      { value: 'bear', tag: 'word', normal: 'bear', pos: 'VB', lemma: 'bear' },
   * //      { value: 'all', tag: 'word', normal: 'all', pos: 'PDT' },
   * //      { value: 'the', tag: 'word', normal: 'the', pos: 'DT' },
   * //      { value: 'expenses', tag: 'word', normal: 'expenses', pos: 'NNS', lemma: 'expense' },
   * //      { value: '.', tag: 'punctuation', normal: '.', pos: '.' } ]
   */
  var tagSentence = function ( sentence ) {
    if ( typeof sentence !== 'string' ) {
      throw Error( 'wink-pos-tagger: input sentence must be a string, instead found: ' + typeof sentence );
    }
    return tag( tokenize( sentence ) );
  }; // tagSentence()

  methods.updateLexicon = updateLexicon;
  methods.tag = tag;
  methods.tagTokens = tag;
  methods.tagRawTokens = tagRawTokens;
  methods.tagSentence = tagSentence;
  methods.defineConfig = defineConfig;

  return methods;
}; // posTagger()

module.exports = posTagger;