UNPKG

wink-pos-tagger

Version:

English Part-of-speech (POS) tagger

163 lines (153 loc) 6.69 kB
//     wink-pos-tagger
//     English Part-of-speech (POS) tagger
//
//     Copyright (C) 2017-19  GRAYPE Systems Private Limited
//
//     This file is part of “wink-pos-tagger”.
//
//     Permission is hereby granted, free of charge, to any person obtaining a
//     copy of this software and associated documentation files (the "Software"),
//     to deal in the Software without restriction, including without limitation
//     the rights to use, copy, modify, merge, publish, distribute, sublicense,
//     and/or sell copies of the Software, and to permit persons to whom the
//     Software is furnished to do so, subject to the following conditions:
//
//     The above copyright notice and this permission notice shall be included
//     in all copies or substantial portions of the Software.
//
//     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
//     OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
//     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
//     DEALINGS IN THE SOFTWARE.

/* eslint-disable no-console */

//
const K = require( './rules/consts.js' );
var posCRsGE0 = require( './rules/pos-rules-ge0.js' );
var valueCRsGE0 = require( './rules/value-rules-ge0.js' );
var posCRsLE0 = require( './rules/pos-rules-le0.js' );
var valueCRsLE0 = require( './rules/value-rules-le0.js' );

// ### testValueAtDelta
/**
 *
 * Tests the **value** of token's property defined by `rule.operand.property`
 * using regex `rule.matches` at `rule.operand.delta` away from `cti`. If there
 * is no token at that position, the test fails.
 *
 * @param {object[]} tokens in wink-tokenizer standards.
 * @param {number} cti current token's index.
 * @param {object} rule containing keys `op`, `operand` and `matches` and their
 * corresponding values.
 * @return {boolean} `true` if match occurs otherwise `false`.
 * @private
*/
var testValueAtDelta = function ( tokens, cti, rule ) {
  var tAti = tokens[ rule.operand.delta + cti ];
  // An out-of-range index yields `undefined`, which simply fails the test.
  if ( tAti && rule.matches.test( tAti[ rule.operand.property ] ) ) return true;
  return false;
}; // testValueAtDelta()

// ### testValueInRange
/**
 *
 * Tests the **value** of token's property defined by `rule.operand.property`
 * using regex `rule.matches` anywhere within the range specified by array
 * `rule.operand.range`. The array is a 2-element array specifying the range,
 * which is added to `cti` to compute the actual range; both ends of the range
 * are inclusive. Positions that fall outside `tokens` are skipped.
 *
 * @param {object[]} tokens in wink-tokenizer standards.
 * @param {number} cti current token's index.
 * @param {object} rule containing keys `op`, `operand` and `matches` and their
 * corresponding values.
 * @return {boolean} `true` if match occurs otherwise `false`.
 * @private
*/
var testValueInRange = function ( tokens, cti, rule ) {
  var tAti;
  for ( var i = rule.operand.range[ 0 ]; i <= rule.operand.range[ 1 ]; i += 1 ) {
    tAti = tokens[ i + cti ];
    // The first matching token within the range is sufficient.
    if ( tAti && rule.matches.test( tAti[ rule.operand.property ] ) ) return true;
  }
  return false;
}; // testValueInRange()

// Dispatch table mapping a rule's `op` code to the function that evaluates it.
// It is created without a prototype so that only the keys set below resolve.
var operation = Object.create( null );
operation[ K.TEST_VALUE_AT_DELTA ] = testValueAtDelta;
operation[ K.TEST_VALUE_IN_RANGE ] = testValueInRange;

// ### applyContextRule
/**
 *
 * Applies the given `contextRule` on the current token. A rule application may
 * trigger change in the POS at token specified by `thenPosAt` relative distance.
 * Every test in `contextRule.rules` must pass for the change to be considered.
 * The change is applied only if the token at `thenPosAt + cti` exists and the
 * new POS is amongst the valid POSes listed for **that** token in `poses`.
 *
 * @param {object[]} tokens in wink-tokenizer standards.
 * @param {number} cti current token's index.
 * @param {object} contextRule contains the specific rule; the keys read here
 * are `rules`, `thenPosAt` and `willBe`.
 * @param {array[]} poses each element is an array & contains valid POSes for
 * the token at that index in `tokens`.
 * @return {boolean} `true` if pos change occurs otherwise `false`.
 * @private
*/
var applyContextRule = function ( tokens, cti, contextRule, poses ) {
  var rules = contextRule.rules;
  var targetIndex = contextRule.thenPosAt + cti;
  var targetToken, validPoses;
  var i, imax;

  // All the tests must pass; stop at the first failure.
  for ( i = 0, imax = rules.length; i < imax; i += 1 ) {
    if ( !operation[ rules[ i ].op ]( tokens, cti, rules[ i ] ) ) return false;
  }

  // The POS is written to the token at `targetIndex`, therefore it is that
  // token's existence and list of valid POSes that must be checked. When
  // `thenPosAt` is 0 this is the current token itself.
  targetToken = tokens[ targetIndex ];
  validPoses = poses[ targetIndex ];
  if ( !targetToken || !validPoses ) return false;

  // Trigger change only if the new `pos` is a valid one — present in `poses`.
  if ( validPoses.indexOf( contextRule.willBe ) === -1 ) return false;

  targetToken.pos = contextRule.willBe;
  return true;
}; // applyContextRule()

// ### applyContextRules
/**
 *
 * Applies given `contextRules` on each token one-by-one. For each token, rules
 * are tried until either a POS change has occurred or all rules have been exhausted
 * without any change.
 *
 * @param {object[]} tokens in wink-tokenizer standards.
 * @param {object} contextRules contains rules for different POSes. The rules to
 * be applied is selected on the basis of POS of the current token.
 * @param {array[]} poses each element is an array & contains valid POSes for
 * the token at that index in `tokens`.
 * @return {void} Nothing!
 * @private
*/
var applyContextRules = function ( tokens, contextRules, poses ) {
  var rules;
  var i, imax, j, jmax;
  for ( i = 0, imax = tokens.length; i < imax; i += 1 ) {
    // The rule list is picked using the token's POS as it stands right now,
    // i.e. including any change made while processing earlier tokens.
    rules = contextRules[ tokens[ i ].pos ];
    if ( rules ) {
      for ( j = 0, jmax = rules.length; j < jmax; j += 1 ) {
        // The first rule that causes a change ends the search for this token.
        if ( applyContextRule( tokens, i, rules[ j ], poses ) ) break;
      }
    }
  }
}; // applyContextRules()

// ### applyAllContextRules
/**
 *
 * There are currently 4 sets of context rules. They are first categorized
 * on the basis of `property` of token they use i.e. **value** or **pos**. Each
 * one of them is further categorized on the basis of if the **delta/range** values
 * are **positive** or **negative**. It applies these rules in the required sequence:
 * `value` LE0, `pos` LE0, `value` GE0 and finally `pos` GE0. The `pos` property
 * of the tokens is updated in place.
 *
 * @param {object[]} tokens in wink-tokenizer standards.
 * @param {array[]} poses each element is an array & contains valid POSes for
 * the token at that index in `tokens`.
 * @return {void} Nothing!
 * @private
*/
var applyAllContextRules = function ( tokens, poses ) {
  // First apply the LE0 rules (presumably delta/range <= 0, going by the rule
  // set names) to update POS before looking ahead.
  // Try `value` specific rules first followed by `pos` specific. In other words
  // specific rules followed by generic rules.
  applyContextRules( tokens, valueCRsLE0, poses );
  applyContextRules( tokens, posCRsLE0, poses );
  // Already applied the LE0 rules, time to look ahead with the GE0 ones.
  applyContextRules( tokens, valueCRsGE0, poses );
  applyContextRules( tokens, posCRsGE0, poses );
}; // applyAllContextRules()

module.exports = applyAllContextRules;