// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
/* eslint-disable no-underscore-dangle */
var constants = require( './constants.js' );
// Bits reserved for `precedingSpaces`.
var bits4PrecedingSpace = constants.bits4PrecedingSpace;
// Size of a single expansion.
var xpSize = constants.xpSize;
// Bits reserved for `lemma`.
var bits4lemma = constants.bits4lemma;
// The UNK!
var UNK = constants.UNK;
// Size of a single token.
var tkSize = constants.tkSize;
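// Layout note: each token occupies `tkSize` consecutive slots in the flat
// `tokens` array. The pushes below store 4 values per token, i.e.
// [ lexeme, precedingSpaces, packed lemma/pos, annotations ], which is why a
// push's return value is divided by `tkSize` to derive the normalized token
// index.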
var docDataWrapper = function ( data ) {
// Extract frequently referenced data elements:
// Extract `cache`.
var cache = data.cache;
// Extract `tokens`.
var tokens = data.tokens;
// The methods object to be returned.
var methods = Object.create( null );
// ## addToken
/**
*
* It first creates a new lexeme entry in the `cache`; this entry is then
* pushed into the `tokens` array along with the `precedingSpaces`, and the
* rest of the token properties are initialized to `0`.
*
* @param {string} text to be added as token.
* @param {string} category of the token.
* @param {number} precedingSpaces to the `text` as parsed by tokenizer.
* @param {array} nbsp containing details of the non-breaking spaces, if any.
* @returns {boolean} always `true`.
* @private
*/
var addToken = function ( text, category, precedingSpaces, nbsp ) {
// Non-normalized index of the token being pushed.
var idx;
idx = tokens.push( cache.add( text, category ), precedingSpaces, 0, 0 );
// See comments in `addTokenIfInCache()`
if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
return true;
}; // addToken()
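// A minimal usage sketch; the values are hypothetical and `cache.add()` is
// assumed to return the index of the (possibly new) lexeme entry:
//   addToken( 'hello', 'word', 1, null );
//   // `tokens` gains [ <lexeme index>, 1, 0, 0 ]; as `nbsp` is null, nothing
//   // is recorded in `data.nonBreakingSpaces`.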
// ## addTokenIfInCache
/**
*
* Adds a token corresponding to the input `text` if it is found in the cache,
* i.e. it is not an OOV. The addition process ensures the following:
* 1. Preceding spaces are added.
* 2. If the text is a contraction, its expansions are added. Since an expansion
* consists of a lexeme, normal, lemma and pos, all of these are added to the
* token structure.
*
* @param {string} text to be added as token.
* @param {number} precedingSpaces to the `text` as parsed by tokenizer.
* @param {string} nbsp non-breaking spaces preceding the token.
* @returns {boolean} `truthy` if `text` is found in cache otherwise `falsy`.
* @private
*/
var addTokenIfInCache = function ( text, precedingSpaces, nbsp ) {
// The array `tokenIndex` will contain a single element if `text` is not a
// predefined contraction; otherwise it will contain `n x xpSize` elements,
// where `n` is the number of expansions.
var tokenIndex = cache.lookup( text );
// Temp for preceding spaces in case of a contraction.
var ps;
// Temp for lemma & pos.
var lemma, pos;
// Non-normalized index of the token being pushed.
var idx;
// `UNK` means 0 or `falsy`; it flags that the token has not been added.
if ( tokenIndex === null ) return UNK;
if ( tokenIndex.length === 1 ) {
idx = tokens.push( tokenIndex[ 0 ], precedingSpaces, 0, 0 );
// Store non-breaking spaces preceding this token. Do it only if `precedingSpaces > 0` (Note:
// it is zero in case of expansion of a contraction) AND `nbsp` is defined (Note: in this case
// `precedingSpaces` would be set to the max i.e. 0xFFFF, the only exception being when the token
// is being expanded: the first expansion will carry the nbsp but the subsequent ones will have
// 0 preceding spaces).
// The storage index should be the normalized token index.
if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
} else {
// Contraction; iterate through each expansion.
for ( let k = 0; k < tokenIndex.length; k += xpSize ) {
// The `precedingSpaces` will be 0 except for the first expansion.
ps = ( k === 0 ) ? precedingSpaces : 0;
// Concatenate pointer to normal contained in `xpansions` with preceding
// spaces.
ps |= ( tokenIndex[ k + 1 ] << bits4PrecedingSpace ); // eslint-disable-line no-bitwise
// Lemma & POS are mostly fixed across all contractions.
lemma = tokenIndex[ k + 2 ];
pos = tokenIndex[ k + 3 ];
// Add token; annotations may be filled later in the pipeline.
idx = tokens.push( tokenIndex[ k ], ps, ( lemma | ( pos << bits4lemma ) ), 0 ); // eslint-disable-line no-bitwise
// See comment above in the then block of this if-statement.
if ( nbsp !== null && precedingSpaces > 0 ) data.nonBreakingSpaces[ ( idx / tkSize ) - 1 ] = nbsp;
}
}
// Return `truthy`, indicating that the token(s) have been added successfully.
return 99;
}; // addTokenIfInCache()
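// A worked sketch of the bit packing used above, with illustrative numbers;
// the actual `bits4lemma` width comes from `constants.js`:
//   lemma = 5, pos = 3, bits4lemma = 20 (assumed)
//   packed = lemma | ( pos << bits4lemma )   // 5 | ( 3 << 20 ) === 3145733
//   // To decode: lemma = packed & ( ( 1 << bits4lemma ) - 1 );
//   //            pos = packed >>> bits4lemma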
// ## isLexeme
/**
*
* Tests whether the `text` is a valid lexeme.
*
* @param {string} text to be tested.
* @returns {boolean} `truthy` if `text` is a valid lexeme otherwise `falsy`.
* @private
*/
var isLexeme = function ( text ) {
// Return `truthy` if the text is valid i.e. found in the cache. Note that for
// `$%^OOV^%$`, it returns `0` i.e. `falsy`!
return cache.lookup( text );
}; // isLexeme()
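// Note that `isLexeme()` returns the raw `cache.lookup()` result rather than
// a strict boolean, so callers should test it for truthiness, e.g.
//   if ( isLexeme( someText ) ) { /* someText maps to a known lexeme */ }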
var clean = function () {
tokens = null;
cache = null;
}; // clean()
methods._addToken = addToken;
methods._addTokenIfInCache = addTokenIfInCache;
methods.isLexeme = isLexeme;
methods.clean = clean;
return methods;
}; // docDataWrapper()
module.exports = docDataWrapper;
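// Illustrative wiring sketch; it assumes a `data` object shaped as used
// above (with `cache`, `tokens` and `nonBreakingSpaces`), which wink-nlp
// constructs elsewhere:
//   var wrapped = docDataWrapper( data );
//   // Try the cache first; fall back to creating a fresh lexeme on OOV.
//   if ( !wrapped._addTokenIfInCache( text, precedingSpaces, null ) ) {
//     wrapped._addToken( text, category, precedingSpaces, null );
//   }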