// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
var constants = require( './constants.js' );
var xnMask = constants.xnMask;
var bits4PrecedingSpace = constants.bits4PrecedingSpace;
var xcMask = constants.xcMask;
var bits4xpPointer = constants.bits4xpPointer;
// ## cache
/**
*
* Creates an instance of `cache`. It is typically instantiated in each `winkNLP`
* instance, where it is responsible for caching token properties across the
* documents i.e. the `doc()`.
*
* @param {Object} model containing the language model.
* @param {Function} featureFn that extracts language specific features of a lexeme.
* @return {object} of methods.
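* @example
* // Illustrative sketch only; `model` & `featureFn` are assumed to be supplied
* // by the language model loaded into `winkNLP`:
* var tokenCache = cache( model, featureFn );
* // -> object exposing add(), lookup(), value(), property(), etc.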
* @private
*/
var cache = function ( model, featureFn ) {
const fTokenType = 'tokenType';
// Returned!
var methods = Object.create( null );
// Extract frequently used properties.
var lexemesHash = model.features.lexeme.hash;
var lxm = model.features.lexeme;
var lexemeIntrinsicSize = model.features.lexeme.intrinsicSize;
var layout = model.packing.layout;
var pkSize = model.packing.size;
var efSize = model.packing.efSize;
var efList = model.packing.efList;
var efListSize = efList.length;
var lexicon = model.lexicon;
var xpansions = model.xpansions;
var posClusters = model.features.posClusters.list;
// Contains quanta of UInt32 words, one quantum per OOV lexeme. The quantum at
// an `index` contains the features of the corresponding OOV lexeme located
// at `model.features.lexeme.list[ index ]`. This simplifies information access,
// as it remains identical to the **intrinsic lexicon** with the only difference
// that this is not a contiguous array of UInt32s. It follows the
// `[ normal, lemma, <extractable features> ]` structure. The extractable
// features are dynamically determined using the language model.
var extrinsicLexicon = [];
// Base Packing Size is `2` because one word each for normal & lemma is needed.
var elBasePackingSize = 2;
// Packing size for each lexeme in `extrinsicLexicon` — base plus additional
// words needed for extractable features.
var elPackingSize = 2 + efSize;
// Extractable Features temp storage; eventually its contents will be pushed
// inside `extrinsicLexicon`. Space is allocated right in the beginning to save
// time. Its contents are filled i.e. initialized with 0 whenever needed.
var efArray = new Uint32Array( efSize );
var feature = featureFn( model.packing.config );
// Extractable Features Hash: used during property extraction for OOV tokens.
// If a token is not found in this then a **0** is returned.
var efHash = Object.create( null );
// Since `tokenType` is determined during tokenization, it is always extractable.
efHash.tokenType = true;
// Copy the rest from the list into the hash.
efList.forEach( ( ef ) => ( efHash[ ef ] = true ) );
// ## getFeaturesIndex
/**
*
* Returns the `index` of `value` from the feature `name`. If the value is
* missing then it is added and its `index` is returned accordingly, along with
* a flag indicating that it is a new value.
*
* @param {string} name of the feature.
* @param {string} value of the feature, whose index will be returned.
* @return {number[]} `[ isNewValue, index ]`.
* @example
* // Returns the index (hash) of **lexeme** – `you`:
* getFeaturesIndex( 'lexeme', 'you' );
* // -> [ 0, 47 ]
* // If `you` was absent then it would have been added and the return value
* // would have been [ 1, index of added value ]
* @private
*/
var getFeaturesIndex = function ( name, value ) {
// Extract the named feature.
var f = model.features[ name ];
// And its hash & list.
var h = f.hash;
var l = f.list;
// New `value` flag.
var isNewValue = 0;
// Check if `value` is present.
var index = h[ value ];
if ( index === undefined ) {
// Feature's storage limit check. — not required right now!
// if ( f.index > f.maxIndex ) {
// throw Error( `wink-nlp: memory limit for "${name}" exceeded.` );
// }
// Missing — add `value`.
index = h[ value ] = f.index;
// No need to increment index because push returns the required value!
f.index = l.push( value );
// Set new value flag.
isNewValue = 1;
}
return [ isNewValue, index ];
}; // getFeaturesIndex()
// ## add
/**
*
* Adds a token to the cache corresponding to the **text**. If the same is
* already present in the cache then a pointer to its cached value is returned;
* otherwise a new entry is made in the cache and the same is returned.
*
* Whenever a new entry is made, all its extractable features are also
* extracted & packed; and if an extractable feature's value is also new, its
* entry is made via the `getFeaturesIndex()` API.
*
* @param {string} text i.e. the value of the token to be added.
* @param {number} category of the token i.e. `word(0)` or `number(1)`, etc.
* @return {number} index (or hash) of the `text` added.
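* @example
* // Illustrative only; the actual index depends on the loaded language model.
* // Adds the token "Hello" with category `word(0)`:
* add( 'Hello', 0 );
* // -> index (hash) of "Hello", usable with value(), property(), etc.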
* @private
*/
var add = function ( text, category ) {
// Lowercased `text`.
var normText = text.toLowerCase();
// First start with `text` as its properties are being processed first.
var textIndex = getFeaturesIndex( 'lexeme', text );
// Then obtain index of its normal.
var normIndex = ( normText === text ) ? textIndex : getFeaturesIndex( 'lexeme', normText );
// Helpers: cfg of feature, feature, feature's value, feature's value for
// packing & loop index.
var cfg, f, fv, fv4p, k;
// Process properties of `text` first.
// The `textIndex[ 0 ]` flag indicates if the value is newly added, and if
// so then add its extractable features. See `getFeaturesIndex()` above.
if ( textIndex[ 0 ] ) {
// NOTE: This block of code is repeated below, with the exception that
// the next block uses `normText` in place of `text` in `fv = feature[ f ]( ... )`.
// Initialize extractable features' array with all 0s.
efArray.fill( 0 );
// For every extractable feature, extract & pack.
for ( k = 0; k < efListSize; k += 1 ) {
f = efList[ k ];
cfg = layout[ f ];
// Use `text`.
fv = feature[ f ]( text, category, methods );
fv4p = ( cfg[ 3 ] ) ? fv : getFeaturesIndex( f, fv )[ 1 ];
efArray[ cfg[ 0 ] ] |= ( fv4p << cfg[ 2 ] ); // eslint-disable-line no-bitwise
} // for
// Pack token type now.
f = fTokenType;
cfg = layout[ f ];
efArray[ cfg[ 0 ] ] |= ( category << cfg[ 2 ] ); // eslint-disable-line no-bitwise
// Push all the details i.e. `[ normal, lemma, <extractable features> ]`
// into `extrinsicLexicon`.
extrinsicLexicon.push( normIndex[ 1 ], normIndex[ 1 ], ...efArray );
} // if ( textIndex[ 0 ] )
// If the normalized text is not the same as the original text then the
// normalized text's extractable features could be candidates for addition.
if ( textIndex[ 1 ] !== normIndex[ 1 ] ) {
// Has it been newly added? If yes, add its extractable features.
if ( normIndex[ 0 ] ) {
// NOTE: This block of code is the same as above.
// Initialize extractable features' array with all 0s.
efArray.fill( 0 );
// For every extractable feature, extract & pack.
for ( k = 0; k < efListSize; k += 1 ) {
f = efList[ k ];
cfg = layout[ f ];
// Use `normText`.
fv = feature[ f ]( normText, category, methods );
fv4p = ( cfg[ 3 ] ) ? fv : getFeaturesIndex( f, fv )[ 1 ];
efArray[ cfg[ 0 ] ] |= ( fv4p << cfg[ 2 ] ); // eslint-disable-line no-bitwise
} // for
// Pack token type now.
f = fTokenType;
cfg = layout[ f ];
efArray[ cfg[ 0 ] ] |= ( category << cfg[ 2 ] ); // eslint-disable-line no-bitwise
// Push all the details i.e. `[ normal, lemma, <extractable features> ]`
// into `extrinsicLexicon`.
extrinsicLexicon.push( normIndex[ 1 ], normIndex[ 1 ], ...efArray );
} // if ( normIndex[ 0 ] )
} // if ( textIndex[ 1 ] !== normIndex[ 1 ] )
// Return the `textIndex` only – this can be used to extract properties.
return ( textIndex[ 1 ] );
}; // add()
// ## lookup
/**
*
* Looks up the `text` in the cache and returns its index. If the input
* text is a contraction then its expansions are returned.
*
* @param {string} text to be searched in the cache.
* @return {number[]} contains either a single element (i.e. `index`) indicating
* that it is NOT a contraction, or multiple elements indicating that the text
* is a contraction. Each contraction expands into 4 elements viz. `lexeme`,
* `normal`, `lemma`, and `pos`. Returns `null` if the text is not found.
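* @example
* // Illustrative only; actual index values depend on the language model.
* // A regular in-vocabulary word returns a single-element array:
* lookup( 'you' );
* // -> [ index of "you" ]
* // A contraction returns 4 elements per expansion i.e. lexeme, normal,
* // lemma & pos; an unknown text returns null.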
* @private
*/
var lookup = function ( text ) {
// `layout.isContraction` for multiple use later.
var layout4isContraction = layout.isContraction;
var layout4lemma = layout.lemma;
// `index` to `text`.
var index = lexemesHash[ text ];
// Holds lemma extracted in case of contraction.
var lemma;
// Contraction Count, Contraction Index, Loop Index.
var cc, cx, cxi;
// If the text is not found, return `null`.
if ( index === undefined ) return null;
// `text` is found – need to check for contraction if `text` is not an OOV.
var tokens = [];
var isContraction;
if ( index < lexemeIntrinsicSize ) {
// Not an OOV, check if it is a contraction.
isContraction = ( lexicon[ layout4isContraction[ 0 ] + ( index * pkSize ) ] & layout4isContraction[ 1 ] ) >>> layout4isContraction[ 2 ]; // eslint-disable-line no-bitwise
if ( isContraction ) {
// It is a contraction, process its expansions.
// Start by extracting lemma, as it contains pointer to `expansions` and their count.
lemma = ( lexicon[ layout4lemma[ 0 ] + ( index * pkSize ) ] & layout4lemma[ 1 ] ) >>> layout4lemma[ 2 ]; // eslint-disable-line no-bitwise
// Extract pointer (i.e. index) to expansions and their count.
cx = lemma & 0x3FFF; // eslint-disable-line no-bitwise
cc = ( lemma & ( xcMask << bits4xpPointer ) ) >> bits4xpPointer; // eslint-disable-line no-bitwise
// Iterate through `cc` times to push details into the `tokens`.
for ( cxi = 0; cxi < cc; cxi += 4 ) {
tokens.push(
xpansions[ cx + cxi ], // lexeme
cx + cxi + 1, // normal (pointer to xpansion & not to lexicon)
xpansions[ cx + cxi + 2 ], // lemma
xpansions[ cx + cxi + 3 ] // pos
);
}
} else {
// Not a contraction, simply add `text`'s `index` to `tokens`.
tokens.push( index );
}
} else {
// An OOV, only add `text`'s `index` to `tokens`.
tokens.push( index );
}
return tokens;
}; // lookup()
// ## value
/**
*
* Returns the value corresponding to the `index`.
*
* @param {number} index for the value.
* @return {string} value corresponding to the `index`.
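* @example
* // Illustrative only; the actual mapping depends on the language model.
* // For example, if the lexeme "you" hashed to 47 then:
* value( 47 );
* // -> 'you'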
* @private
*/
var value = function ( index ) {
return lxm.list[ index ];
}; // value()
// ## normal
/**
*
* Returns the index of the normal of the input `index` (of the required
* lexeme), after taking into account spelling mapping, if any.
*
* @param {number} index of the required lexeme.
* @return {number} index of the normal.
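* @example
* // Illustrative only; assumes "Hello" was added via add() and that its
* // normal "hello" is not a mapped spelling in the model:
* value( normal( add( 'Hello', 0 ) ) );
* // -> 'hello' i.e. the lowercased normal of "Hello"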
* @private
*/
var normal = function ( index ) {
// Temps for `layout.normal`, `layout.isSpellingMapped`, etc.
var layout4normal = layout.normal;
var layout4mapped = layout.isSpellingMapped;
var layout4lemma = layout.lemma;
// Used to remap if its value is `1`. In this case lemma becomes the `normIndex`.
var isSpellingMapped;
// Index for OOVs i.e. when `index >= lexemeIntrinsicSize`.
var oovIdx;
// Returned: normal's index.
var normIndex;
// Processing is different for native and OOV words or lexemes. For OOVs
// properties have to be extracted from `extrinsicLexicon`, whereas for
// native words they are extracted from `lexicon`.
if ( index < lexemeIntrinsicSize ) {
normIndex = ( lexicon[ layout4normal[ 0 ] + ( index * pkSize ) ] & layout4normal[ 1 ] ) >>> layout4normal[ 2 ]; // eslint-disable-line no-bitwise
isSpellingMapped = ( lexicon[ layout4mapped[ 0 ] + ( index * pkSize ) ] & layout4mapped[ 1 ] ) >>> layout4mapped[ 2 ]; // eslint-disable-line no-bitwise
if ( isSpellingMapped ) {
// Mapped, pick up the lemma portion as this points to normal in case of
// mapped spellings.
normIndex = ( lexicon[ layout4lemma[ 0 ] + ( index * pkSize ) ] & layout4lemma[ 1 ] ) >>> layout4lemma[ 2 ]; // eslint-disable-line no-bitwise
} else {
// Compute actual index from the relative index.
normIndex += index;
}
} else {
oovIdx = index - lexemeIntrinsicSize;
// Refer to `extrinsicLexicon` structure at the top of `cache()`.
normIndex = extrinsicLexicon[ oovIdx * elPackingSize ];
// This `normIndex` may point to an intrinsic lexeme, in which case
// mapping needs to be checked.
if ( normIndex < lexemeIntrinsicSize ) {
isSpellingMapped = ( lexicon[ layout4mapped[ 0 ] + ( normIndex * pkSize ) ] & layout4mapped[ 1 ] ) >>> layout4mapped[ 2 ]; // eslint-disable-line no-bitwise
if ( isSpellingMapped ) {
normIndex = ( lexicon[ layout4lemma[ 0 ] + ( normIndex * pkSize ) ] & layout4lemma[ 1 ] ) >>> layout4lemma[ 2 ]; // eslint-disable-line no-bitwise
}
}
}
return normIndex;
}; // normal()
// ## mappedSpelling
/**
*
* Returns the index of the mapped spelling for the input `index` of the
* required lexeme.
*
* @param {number} index of the required lexeme.
* @return {number} index of the mapped spelling.
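* @example
* // Illustrative only; assumes the language model maps the spelling
* // "colour" to "color":
* value( mappedSpelling( lookup( 'colour' )[ 0 ] ) );
* // -> 'color'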
* @private
*/
var mappedSpelling = function ( index ) {
// Temps for `layout.isSpellingMapped`, etc.
var layout4mapped = layout.isSpellingMapped;
var layout4lemma = layout.lemma;
// Used to remap if its value is `1`. In this case the lemma becomes the `mappedIndex`.
var isSpellingMapped;
// Returned: mapped spelling's index.
var mappedIndex = index;
// Only applicable to lexemes that are inside the vocabulary, as there cannot
// be a mapped spelling for OOV words!
if ( index < lexemeIntrinsicSize ) {
isSpellingMapped = ( lexicon[ layout4mapped[ 0 ] + ( index * pkSize ) ] & layout4mapped[ 1 ] ) >>> layout4mapped[ 2 ]; // eslint-disable-line no-bitwise
if ( isSpellingMapped ) {
// Mapped, pick up the lemma portion as this points to normal in case of
// mapped spellings.
mappedIndex = ( lexicon[ layout4lemma[ 0 ] + ( index * pkSize ) ] & layout4lemma[ 1 ] ) >>> layout4lemma[ 2 ]; // eslint-disable-line no-bitwise
}
}
return mappedIndex;
}; // mappedSpelling()
// ## nox
/**
*
* Returns the index of the normal of the expansion.
*
* @param {number} binaryWord containing pointer to `xpansions` and `precedingSpaces`;
* it is the 2nd (relative) element of a single token's packet of 4 words.
* @return {number} index of the normal, whose value can be found via `value()`.
* @private
*/
var nox = function ( binaryWord ) {
return xpansions[ ( binaryWord & xnMask) >>> bits4PrecedingSpace ]; // eslint-disable-line no-bitwise
}; // nox()
// ## property
/**
*
* Extracts the property – `prop` of a lexeme (or word) specified by `index`.
*
* @param {number} index of the lexeme whose properties are required to be extracted.
* @param {string} prop (name) that needs to be extracted — it should be a valid property.
* @return {string} extracted property if `prop` is known/valid, otherwise `null` (or `0` for an unknown property of an OOV lexeme).
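* @example
* // Illustrative only; extracts the token type of an in-vocabulary word,
* // assuming "you" is present in the model's lexicon:
* property( lookup( 'you' )[ 0 ], 'tokenType' );
* // -> the token's type, e.g. 'word' (exact form depends on the model's layout)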
* @private
*/
var property = function ( index, prop ) {
// A property and its value
var propValue;
// Index for OOVs i.e. when `index >= lexemeIntrinsicSize`.
var oovIdx;
// Temp for `layout[ prop ]`.
var layout4Prop;
// Processing is different for native and OOV words or lexemes. For OOVs
// properties have to be extracted from `extrinsicLexicon`, whereas for
// native words they are extracted from `lexicon`.
if ( index < lexemeIntrinsicSize ) {
layout4Prop = layout[ prop ];
if ( layout4Prop === undefined ) return null;
propValue = ( lexicon[ layout4Prop[ 0 ] + ( index * pkSize ) ] & layout4Prop[ 1 ] ) >>> layout4Prop[ 2 ]; // eslint-disable-line no-bitwise
// Use hash/list to update value if required.
if ( layout4Prop[ 3 ] === 0 || layout4Prop[ 5 ] === 1 ) propValue = model.features[ prop ].list[ propValue ];
} else {
// Attempt extraction only if extractable!
if ( !efHash[ prop ] ) return 0;
// Compute index into `extrinsicLexicon`.
oovIdx = index - lexemeIntrinsicSize;
layout4Prop = layout[ prop ];
// No need for this check as `if ( !efHash[ prop ] )...` ensures return
// in case of any unknown property:
/* if ( layout4Prop === undefined ) return null; */
// Use `extrinsicLexicon`.
// Reach the desired quantum via `oovIdx * elPackingSize`, move forward by the base size and then go to the offset!
propValue = ( extrinsicLexicon[ ( oovIdx * elPackingSize ) + elBasePackingSize + layout4Prop[ 0 ] ] & layout4Prop[ 1 ] ) >>> layout4Prop[ 2 ]; // eslint-disable-line no-bitwise
// Use hash/list to update value if required.
if ( layout4Prop[ 3 ] === 0 || layout4Prop[ 5 ] === 1 ) propValue = model.features[ prop ].list[ propValue ];
}
return propValue;
}; // property()
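// ## isMemberPOS
/**
*
* Tests whether the part-of-speech at `posIdx` is a member of the pos cluster
* of the lexeme at `lexemeIdx`.
*
* @param {number} lexemeIdx of the lexeme to be tested.
* @param {number} posIdx of the part-of-speech to be checked for membership.
* @return {boolean} true if it is a member, otherwise false.
* @private
*/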
var isMemberPOS = function ( lexemeIdx, posIdx ) {
// Don't miss converting posIdx to a number.
return posClusters[ property( lexemeIdx, 'lexemeCID' ) ].has( +posIdx );
}; // isMemberPOS()
// ## posOf
/**
*
* Extracts the pos index of a lexeme (or word) specified by `index`.
*
* @param {number} index of the lexeme whose pos is required to be extracted.
* @return {number} index of the extracted pos.
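* @example
* // Illustrative only; the returned value is a packed pos index and not the
* // tag's text:
* posOf( lookup( 'you' )[ 0 ] );
* // -> a number, e.g. the index corresponding to a pronoun tag in the model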
* @private
*/
var posOf = function ( index ) {
// Value of extracted pos will go here.
var posValue;
// Index for OOVs i.e. when `index >= lexemeIntrinsicSize`.
var oovIdx;
// Temp for `layout.pos`.
var layout4Prop;
// Processing is different for native and OOV words or lexemes. For OOVs
// properties have to be extracted from `extrinsicLexicon`, whereas for
// native words they are extracted from `lexicon`.
if ( index < lexemeIntrinsicSize ) {
layout4Prop = layout.pos;
posValue = ( lexicon[ layout4Prop[ 0 ] + ( index * pkSize ) ] & layout4Prop[ 1 ] ) >>> layout4Prop[ 2 ]; // eslint-disable-line no-bitwise
} else {
// Compute index into `extrinsicLexicon`.
oovIdx = index - lexemeIntrinsicSize;
layout4Prop = layout.pos;
// Use `extrinsicLexicon`.
// Reach the desired quantum via `oovIdx * elPackingSize`, move forward by the base size and then go to the offset!
posValue = ( extrinsicLexicon[ ( oovIdx * elPackingSize ) + elBasePackingSize + layout4Prop[ 0 ] ] & layout4Prop[ 1 ] ) >>> layout4Prop[ 2 ]; // eslint-disable-line no-bitwise
}
return posValue;
}; // posOf()
// ## valueOf
/**
*
* Extracts the value of the `prop`erty for its input `index`.
*
* @param {string} prop to be extracted for the `index`.
* @param {number} index of the property.
* @return {string} value of the `prop` at the given `index`.
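* @example
* // Illustrative only, using the `lexeme` feature; if "you" hashed to 47 then:
* valueOf( 'lexeme', 47 );
* // -> 'you'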
* @private
*/
var valueOf = function ( prop, index ) {
return model.features[ prop ].list[ index ];
}; // valueOf()
// ## currentSize
/**
*
* Returns the current size of lexicon including OOVs.
*
* @return {number} size of the current lexicon.
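* @example
* // Illustrative only; the size grows as new OOV lexemes are added via add():
* currentSize();
* // -> intrinsicSize() plus the number of OOV lexemes added so far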
* @private
*/
var currentSize = function () {
// Minus `1` because at `0` we have the OOV symbolic word.
return ( lxm.list.length - 1 );
}; // currentSize()
// ## intrinsicSize
/**
*
* Returns the intrinsic i.e. native size of lexicon.
*
* @return {number} size of the native or intrinsic lexicon.
* @private
*/
var intrinsicSize = function () {
return lexemeIntrinsicSize;
}; // intrinsicSize()
// ## hasSamePOS
/**
* Finds if the text can have `pos` as a valid part of speech, provided it is a
* base form. Used in **lemmatization** to see if the lemma shares the same pos
* with the original word.
*
* @param {string} text the incoming word.
* @param {string} pos the pos that needs to be checked as one of the valid pos for text.
* @return {boolean} True if it does, otherwise false.
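* @example
* // Illustrative only; assumes the model knows "run" as a base form that can
* // act as a verb, with pos tags following the Universal POS tagset:
* hasSamePOS( 'run', 'VERB' );
* // -> true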
*/
var hasSamePOS = function ( text, pos ) {
// Get the word's index
var textIndex = lookup( text );
// If not found, i.e. OOV, it means that it does not have a pre-defined POS.
if ( !textIndex ) return false;
// More than one means it is a contraction.
if ( textIndex.length > 1 ) return false;
// Outside intrinsic vocab means OOV again.
if ( textIndex[ 0 ] >= lexemeIntrinsicSize ) return false;
// If it is not in base form then there is no point in checking for the same
// POS (basics of lemmatization). For example, `hiding` becomes `hid` on removal
// of `-ing`, which is not in base form (i.e. `hid` is the past tense of `hide`);
// so it should not be taken as the lemma and instead adding `-e` should be tried.
if ( property( textIndex[ 0 ], 'isBaseForm' ) === 0 ) return false;
// Finally if it is in base form then check for pos membership.
return isMemberPOS( textIndex[ 0 ], model.pos.hash[ pos ] );
}; // hasSamePOS()
// ## isOOV
/**
*
* Tests the input `text` for being an OOV.
*
* @param {string} text that needs to be tested for OOV.
* @return {boolean} true if OOV, otherwise false (in vocab).
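* @example
* // Illustrative only; a string not present in the model's intrinsic lexicon
* // is an OOV:
* isOOV( 'qwertyzzz' );
* // -> true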
* @private
*/
var isOOV = function ( text ) {
var textIndex = lookup( text );
if ( !textIndex ) return true;
if ( textIndex.length > 1 ) return false;
if ( textIndex[ 0 ] >= lexemeIntrinsicSize ) return true;
return false;
}; // isOOV()
methods.add = add;
methods.lookup = lookup;
methods.value = value;
methods.property = property;
methods.normal = normal;
methods.nox = nox;
methods.posOf = posOf;
methods.valueOf = valueOf;
methods.currentSize = currentSize;
methods.intrinsicSize = intrinsicSize;
methods.isOOV = isOOV;
methods.isMemberPOS = isMemberPOS;
methods.hasSamePOS = hasSamePOS;
methods.mappedSpelling = mappedSpelling;
return methods;
}; // cache()
module.exports = cache;