// wink-nlp
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-nlp”.
//
// Permission is hereby granted, free of charge, to any
// person obtaining a copy of this software and
// associated documentation files (the "Software"), to
// deal in the Software without restriction, including
// without limitation the rights to use, copy, modify,
// merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to
// whom the Software is furnished to do so, subject to
// the following conditions:
//
// The above copyright notice and this permission notice
// shall be included in all copies or substantial
// portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
// CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
var sort4FT = require( './sort4FT.js' );
var constants = require( './constants.js' );
var caseMap = [ 'other', 'lowerCase', 'upperCase', 'titleCase' ];
var swi = require( './sentence-wise-importance.js' );
var reconstructSpaces = require( './reconstruct-spaces.js' );
// Size of a single token.
var tkSize = constants.tkSize;
// Bits reserved for `lemma`.
var bits4lemma = constants.bits4lemma;
// Mask for extracting `pos`.
var posMask = constants.posMask;
// Mask for lemma in case of contraction.
var lemmaMask = constants.lemmaMask;
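
// A hedged sketch of the per-token layout, inferred from the accessors
// below (not an authoritative spec): each token occupies `tkSize` slots of
// `rdd.tokens`. Slot 0 holds the token's unique id, which also indexes its
// value in the cache; slot 1 holds normalization/expansion data, where a
// value above 65535 marks a contraction; slot 2 packs the pos (`posMask`)
// and the lemma (`lemmaMask`); slot 3 carries flags such as negation.
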
var its = Object.create( null );

its.case = function ( index, rdd ) {
  return caseMap[ rdd.cache.property( rdd.tokens[ index * tkSize ], 'lutCase' ) ];
}; // case()

its.uniqueId = function ( index, rdd ) {
  return rdd.tokens[ index * tkSize ];
}; // uniqueId()

its.negationFlag = function ( index, rdd ) {
  return rdd.tokens[ ( index * tkSize ) + 3 ] >= constants.negationFlag;
}; // negationFlag()

its.normal = function ( index, rdd ) {
  var tokens = rdd.tokens;
  var cache = rdd.cache;
  return (
    ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) ?
      cache.value( cache.nox( tokens[ ( index * tkSize ) + 1 ] ) ) :
      cache.value( cache.normal( tokens[ index * tkSize ] ) )
  );
}; // normal()

its.contractionFlag = function ( index, rdd ) {
  return ( rdd.tokens[ ( index * tkSize ) + 1 ] > 65535 );
}; // contractionFlag()

its.pos = function ( index, rdd ) {
  return rdd.cache.valueOf( 'pos', ( rdd.tokens[ ( index * tkSize ) + 2 ] & posMask ) >>> bits4lemma ); // eslint-disable-line no-bitwise
}; // pos()
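
// Usage sketch via wink-nlp's public API (illustrative only; the exact
// output depends on the loaded language model):
//   const winkNLP = require( 'wink-nlp' );
//   const model = require( 'wink-eng-lite-web-model' );
//   const nlp = winkNLP( model );
//   nlp.readDoc( 'Cats purr.' ).tokens().out( nlp.its.pos );
//   // -> e.g. [ 'NOUN', 'VERB', 'PUNCT' ]
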
its.precedingSpaces = function ( index, rdd ) {
  return reconstructSpaces( index, rdd );
}; // precedingSpaces()

its.prefix = function ( index, rdd ) {
  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'prefix' );
}; // prefix()

its.shape = function ( index, rdd ) {
  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'shape' );
}; // shape()

its.stopWordFlag = function ( index, rdd ) {
  var tokens = rdd.tokens;
  var cache = rdd.cache;
  // Apply the check on the normalized token and not on the original value,
  // because stop words are always defined in lowercase.
  var normal = ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) ?
                cache.nox( tokens[ ( index * tkSize ) + 1 ] ) :
                cache.normal( tokens[ index * tkSize ] );
  return ( cache.property( normal, 'isStopWord' ) === 1 );
}; // stopWordFlag()

its.abbrevFlag = function ( index, rdd ) {
  return ( rdd.cache.property( rdd.tokens[ index * tkSize ], 'isAbbrev' ) === 1 );
}; // abbrevFlag()

its.suffix = function ( index, rdd ) {
  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'suffix' );
}; // suffix()

its.type = function ( index, rdd ) {
  return rdd.cache.property( rdd.tokens[ index * tkSize ], 'tokenType' );
}; // type()

its.value = function ( index, rdd ) {
  return rdd.cache.value( rdd.tokens[ index * tkSize ] );
}; // value()
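
// Usage sketch (illustrative; same setup as shown above):
//   nlp.readDoc( 'Hello world!' ).tokens().out( nlp.its.value );
//   // -> [ 'Hello', 'world', '!' ]
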
its.stem = function ( index, rdd, addons ) {
  return addons.stem( rdd.cache.value( rdd.tokens[ index * tkSize ] ) );
}; // stem()

its.lemma = function ( index, rdd, addons ) {
  var tokens = rdd.tokens;
  var cache = rdd.cache;
  // If it is a contraction, the lemma is already available in the token's data structure.
  if ( tokens[ ( index * tkSize ) + 1 ] > 65535 ) {
    return cache.value( tokens[ ( index * tkSize ) + 2 ] & lemmaMask ); // eslint-disable-line no-bitwise
  }
  // Handle mapped spelling, if any.
  const mappedIdx = cache.mappedSpelling( tokens[ index * tkSize ] );
  // If the token has a single lemma, then no further processing is needed.
  if ( cache.property( mappedIdx, 'isSLemma' ) === 1 ) {
    return cache.value( cache.property( mappedIdx, 'lemma' ) );
  }
  // Exhausted all possibilities to avoid processing! Now lemmatize!
  const pos = its.pos( index, rdd );
  const value = cache.value( cache.normal( tokens[ index * tkSize ] ) );
  return addons.lemmatize( value, pos, cache );
}; // lemma()
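
// Usage sketch (illustrative): contractions short-circuit via the packed
// lemma, single-lemma words are answered from the cache, and everything
// else falls through to the addon's lemmatizer:
//   nlp.readDoc( 'She was running' ).tokens().out( nlp.its.lemma );
//   // -> e.g. [ 'she', 'be', 'run' ]
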
its.vector = function ( ) {
  return ( new Array( 100 ).fill( 0 ) );
}; // vector()
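
// Note: `its.vector` above returns a fixed 100-dimensional zero vector;
// presumably a placeholder for models that ship without word embeddings.
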
its.detail = function ( ) {
  return true;
}; // detail()

its.markedUpText = function ( index, rdd ) {
  // This is a special case because `tokens.out()` allows `as.markedUpText`.
  // Therefore simply return the value; the rest is handled by `colTokensOut` with
  // `as.markedUpText()` or `as.text()` as one of the arguments.
  return its.value( index, rdd );
}; // markedUpText()

its.span = function ( spanItem ) {
  return spanItem.slice( 0, 2 );
}; // span()

its.sentiment = function ( spanItem ) {
  return spanItem[ 3 ];
}; // sentiment()

its.readabilityStats = function ( rdd, addons ) {
  return addons.readabilityStats( rdd, its );
}; // readabilityStats()

its.sentenceWiseImportance = function ( rdd ) {
  return swi( rdd );
}; // sentenceWiseImportance()
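
// Usage sketch (illustrative; the exact output shape is whatever
// `sentence-wise-importance.js` produces):
//   doc.out( nlp.its.sentenceWiseImportance );
//   // -> e.g. [ { index: 0, importance: 0.6 }, ... ]
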
/* ------ utilities ------ */

its.terms = function ( tf, idf, terms ) {
  return terms;
}; // terms()

its.docTermMatrix = function ( tf, idf, terms ) {
  const dtm = new Array( tf.length );
  for ( let id = 0; id < tf.length; id += 1 ) {
    dtm[ id ] = [];
    for ( let i = 0; i < terms.length; i += 1 ) {
      dtm[ id ].push( tf[ id ][ terms[ i ] ] || 0 );
    }
  }
  return dtm;
}; // docTermMatrix()
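
// Shape sketch with hypothetical data: given
//   tf    = [ { cat: 2, sat: 1 }, { dog: 1 } ]
//   terms = [ 'cat', 'dog', 'sat' ]
// the matrix is one row per document and one column per term:
//   [ [ 2, 0, 1 ],
//     [ 0, 1, 0 ] ]
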
its.docBOWArray = function ( tf ) {
  return tf;
}; // docBOWArray()

its.bow = function ( tf ) {
  return tf;
}; // bow()

its.idf = function ( tf, idf ) {
  var arr = [];
  for ( const t in idf ) { // eslint-disable-line guard-for-in
    arr.push( [ t, idf[ t ] ] );
  }
  // Sort on frequency followed by the term.
  return arr.sort( sort4FT );
}; // idf()

its.tf = function ( tf ) {
  const arr = [];
  for ( const t in tf ) { // eslint-disable-line guard-for-in
    arr.push( [ t, tf[ t ] ] );
  }
  // Sort on frequency followed by the term.
  return arr.sort( sort4FT );
}; // tf()
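
// Both helpers above flatten an object such as { cat: 2, sat: 1 } into
// sorted pairs, e.g. [ [ 'cat', 2 ], [ 'sat', 1 ] ]: frequency first,
// with the term as the tie-breaker (per `sort4FT`).
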
its.modelJSON = function ( tf, idf, terms, docId, sumOfAllDLs ) {
  return JSON.stringify( {
    uid: 'WinkNLP-BM25Vectorizer-Model/1.0.0',
    tf: tf,
    idf: idf,
    terms: terms,
    docId: docId,
    sumOfAllDLs: sumOfAllDLs
  } );
}; // modelJSON()
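
// Usage sketch (hedged; `loadModel()` is assumed from wink-nlp's
// BM25Vectorizer documentation):
//   const bm25 = require( 'wink-nlp/utilities/bm25-vectorizer' )();
//   texts.forEach( ( t ) => bm25.learn( nlp.readDoc( t ).tokens().out( nlp.its.normal ) ) );
//   const json = bm25.out( nlp.its.modelJSON ); // serialize the learnt model
//   anotherBM25.loadModel( json );              // restore it elsewhere
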
module.exports = its;