wink-nlp

Developer friendly Natural Language Processing ✨

//     wink-nlp
//
//     Copyright (C) GRAYPE Systems Private Limited
//
//     This file is part of “wink-nlp”.
//
//     Permission is hereby granted, free of charge, to any
//     person obtaining a copy of this software and
//     associated documentation files (the "Software"), to
//     deal in the Software without restriction, including
//     without limitation the rights to use, copy, modify,
//     merge, publish, distribute, sublicense, and/or sell
//     copies of the Software, and to permit persons to
//     whom the Software is furnished to do so, subject to
//     the following conditions:
//
//     The above copyright notice and this permission notice
//     shall be included in all copies or substantial
//     portions of the Software.
//
//     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
//     ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
//     TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
//     PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
//     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
//     DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
//     CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
//     CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
//     DEALINGS IN THE SOFTWARE.

var allowed = require( './allowed.js' );
var its = require( '../src/its.js' );
var helper = require( '../src/helper.js' );

// Norm computation helper constants and functions.
const L2 = 'l2';
const NONE = 'none';
const normFn = Object.create( null );

/**
 * Computes the absolute value of `v` for the l1 norm.
 *
 * @param  {number} v value whose absolute value is computed.
 * @return {number} absolute value of v.
 */
normFn.l1 = ( ( v ) => ( Math.abs( v ) ) );

/**
 * Computes the square of `v` for the l2 norm.
 *
 * @param  {number} v value whose square is computed.
 * @return {number} square of v.
 */
normFn.l2 = ( ( v ) => ( v * v ) );

/**
 * Returns 0; used when norm is none.
 *
 * @return {number} 0 value.
 */
normFn.none = ( () => ( 0 ) );

/**
 * Validates the input config's numeric parameter; if invalid, returns
 * the default value.
 * @param  {number} num input number to be validated.
 * @param  {number} numDefault default value, used if the input num is invalid.
 * @param  {number} min used for testing num < min.
 * @param  {number} max used for testing num > max.
 * @return {number} input number or the default value.
 */
const getValidCfgNum = function ( num, numDefault, min, max ) {
  return (
           ( num === null ) ||
           ( num === undefined ) ||
           ( typeof num !== 'number' ) ||
           ( num < min || num > max )
         ) ? numDefault : num;
}; // getValidCfgNum()

// # bm25Vectorizer
/**
 * Creates an instance of a BM25-based vectorizer using the `config`.
 *
 * @param  {object} config defines values of various BM25 configuration params —
 * k, k1, b and the normalization scheme — none, l1, or l2.
 * @return {object} object containing the vectorizer's methods.
 */
var bm25Vectorizer = function ( config ) {
  const cfg = ( helper.isObject( config ) ) ? config : Object.create( null );
  // Setup BM25 parameters.
  const k = getValidCfgNum( cfg.k, 1, 0, 100 );
  const k1 = getValidCfgNum( cfg.k1, 1.2, 0, 100 );
  const b = getValidCfgNum( cfg.b, 0.75, 0, 1 );
  // Setup precision.
  const precision = getValidCfgNum( cfg.precision, 6, 1, 12 );
  // Setup norm.
  const norm = ( ( cfg.norm === null ) ||
                 ( cfg.norm === undefined ) ||
                 ( normFn[ cfg.norm ] === undefined ) ) ? NONE : cfg.norm;
  // Document id; it is incremented at the end of each `learn()` call.
  var docId = 0;
  // Term frequencies. The raw `tf` builds up here during learning. Subsequently
  // the same is updated using the BM25 formula during weight computation.
  var tf = [];
  // Length of each document in terms of number of tokens.
  var docLength = [];
  // Sum of all documents' lengths, used for average DL computation.
  var sumOfAllDLs = 0;
  // Inverse document frequency. The raw `idf` builds up here during learning.
  // Subsequently the same is updated using the BM25 formula during weight computation.
  var idf = Object.create( null );
  // Norm value for each document.
  var norms = [];
  // Terms or features — typically to be used along with the vector or doc-term
  // matrix by the developer.
  var terms = [];
  // Useful in avoiding re-computation of final weights.
  var weightsComputed = false;
  // Returned!
  var methods = Object.create( null );

  // ## computeWeights
  /**
   * Computes & updates the TF and the IDF as per the BM25 formulae.
   *
   * @return {undefined} Nothing!
   */
  var computeWeights = function () {
    // If weights have been computed, then re-computation is not allowed.
    if ( weightsComputed ) return;
    if ( docId === 0 ) throw Error( 'wink-nlp: this operation doesn\'t make sense without any learning; use learn() API first.' );
    // Set the average document length used for normalization.
    const avgDL = sumOfAllDLs / docId;
    // Compute IDF.
    for ( const t in idf ) { // eslint-disable-line guard-for-in
      // Equation 3.3 in "The Probabilistic Relevance Framework: BM25 and Beyond"
      // by Stephen Robertson and Hugo Zaragoza.
      idf[ t ] = +Math.log( ( ( docId - idf[ t ] + 0.5 ) / ( idf[ t ] + 0.5 ) ) + k ).toFixed( precision );
    }
    // Compute the TF for every document.
    for ( let id = 0; id < docId; id += 1 ) {
      for ( const t in tf[ id ] ) { // eslint-disable-line guard-for-in
        // Equation 3.15 with the modification described in the section on
        // "Some Variations on BM25" (3.5.1) of "The Probabilistic Relevance
        // Framework: BM25 and Beyond" by Stephen Robertson and Hugo Zaragoza:
        // IDF * ((k1 + 1) * tf) / (k1 * (1.0 - b + b * (|d|/avgDL)) + tf)
        tf[ id ][ t ] = idf[ t ] * ( ( k1 + 1 ) * tf[ id ][ t ] ) /
                        ( ( k1 * ( 1 - b + ( b * ( docLength[ id ] / avgDL ) ) ) ) + tf[ id ][ t ] );
        // Compute the norm incrementally; it will eventually be used as the
        // divisor at the time of final computation in the next loop.
        norms[ id ] += normFn[ norm ]( tf[ id ][ t ] );
      }
      // Finalize the norm; if none, then we set norms[ id ] to 1 and dividing
      // by 1 has no effect!
      if ( norm === L2 ) {
        norms[ id ] = Math.sqrt( norms[ id ] );
      } else if ( norm === NONE ) norms[ id ] = 1;
      for ( const t in tf[ id ] ) { // eslint-disable-line guard-for-in
        tf[ id ][ t ] = +( tf[ id ][ t ] / norms[ id ] ).toFixed( precision );
      }
    }
    // Extract terms sorted alphabetically — vectors & matrix follow this order.
    terms = Object.keys( idf ).sort();
    // Set weightsComputed.
    weightsComputed = true;
  }; // computeWeights()

  // ## learn
  /**
   * Builds raw TF and IDF from the tokenized input document. Throws an error if
   * a call to out() has been made — because out() updates the raw tf-idf values
   * as per the BM25 formulae.
   *
   * @param  {string[]} tokens tokenized document, usually obtained via winkNLP.
   * @return {undefined} nothing.
   */
  methods.learn = function ( tokens ) {
    if ( weightsComputed ) throw Error( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' );
    // It will contain the TF i.e. bag-of-words for this document — `docId`.
    const bow = Object.create( null );
    // Set the length of this document.
    docLength[ docId ] = tokens.length;
    // Build TF and keep updating the IDF.
    for ( let i = 0; i < tokens.length; i += 1 ) { // eslint-disable-line guard-for-in
      const t = tokens[ i ];
      if ( bow[ t ] === undefined ) {
        bow[ t ] = 1;
        // Increment doc count for this token; note it will happen only once,
        // whenever we encounter a token unseen so far.
        idf[ t ] = 1 + ( idf[ t ] || 0 );
      } else {
        // Token has been seen, simply increment.
        bow[ t ] += 1;
      }
    }
    // Save this TF at the `docId` location.
    tf.push( bow );
    norms.push( 0 );
    // Update sum; it will be used to compute the average DL later.
    sumOfAllDLs += docLength[ docId ];
    // Get ready to process the next document.
    docId += 1;
  }; // learn()

  // ## out
  /**
   * Produces output according to the input function. Operates at the vectorizer
   * level.
   *
   * @param  {function} f a function that determines the output format/content.
   * @return {array} array containing either strings, objects or arrays.
   */
  methods.out = function ( f ) {
    computeWeights();
    // Pass `docId` & `sumOfAllDLs` in addition to `tf`, `idf` & `terms`; this
    // is needed while saving the model JSON.
    if ( allowed.its4BM25.has( f ) ) return f( tf, idf, terms, docId, sumOfAllDLs );
    // In case of an invalid `f`, fall back to the default method — `docBOWArray`.
    return its.docBOWArray( tf, idf, terms, docId, sumOfAllDLs );
  }; // out()

  // ## doc
  /**
   * Returns the APIs applicable to the document specified by its index.
   *
   * @param  {number} n index of the document.
   * @return {object} object containing `out()` and `length()` methods.
   */
  methods.doc = function ( n ) {
    const api = Object.create( null );

    // ## doc.out
    /**
     * Produces output according to the input function. Operates at the document
     * level.
     *
     * @param  {function} f a function that determines the output format/content.
     * @return {array} array containing either strings, objects or arrays.
     */
    api.out = function ( f ) {
      computeWeights();
      if ( f === its.bow ) return f( tf[ n ] );
      if ( f === its.vector ) {
        const arr = new Array( terms.length );
        for ( let i = 0; i < terms.length; i += 1 ) {
          arr[ i ] = tf[ n ][ terms[ i ] ] || 0;
        }
        return arr;
      }
      if ( f === its.tf ) return f( tf[ n ] );
      return tf[ n ]; // its.bow — default fall back.
    }; // doc.out()

    // ## doc.length
    /**
     * Returns the number of unique tokens in the n<sup>th</sup> document.
     *
     * @return {number} the number of unique tokens in the document.
     */
    api.length = function () {
      return ( tf.length === 0 ) ? 0 : Object.keys( tf[ n ] ).length;
    }; // doc.length()

    return api;
  }; // doc()

  // ## vectorOf
  /**
   * Computes the vector of the input document, given in the form of tokens,
   * using the tf-idf learned so far.
   * @param  {string[]} tokens tokenized document, usually obtained via winkNLP.
   * @return {number[]} its vector.
   */
  methods.vectorOf = function ( tokens ) {
    computeWeights();
    const arr = new Array( terms.length );
    const bow = Object.create( null );
    const avgDL = sumOfAllDLs / docId;
    let thisNorm = 0;
    for ( let i = 0; i < tokens.length; i += 1 ) {
      const t = tokens[ i ];
      bow[ t ] = 1 + ( bow[ t ] || 0 );
    }
    for ( let i = 0; i < terms.length; i += 1 ) {
      const t = terms[ i ];
      arr[ i ] = bow[ t ] || 0;
      arr[ i ] = idf[ t ] * ( ( k1 + 1 ) * arr[ i ] ) /
                 ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + arr[ i ] );
      thisNorm += normFn[ norm ]( arr[ i ] );
    }
    if ( norm === L2 ) {
      thisNorm = Math.sqrt( thisNorm );
    } else if ( norm === NONE ) thisNorm = 1;
    // `thisNorm || 1` ensures that there is no attempt to divide by zero!
    // This may happen if all tokens are unseen.
    return arr.map( ( v ) => +( v / ( thisNorm || 1 ) ).toFixed( precision ) );
  }; // vectorOf()

  // ## bowOf
  /**
   * Computes the bag-of-words (bow) of the input document, using the tf-idf
   * learned so far. If `processOOV` is true then each OOV token's frequency is
   * also computed and its `idf` is assumed to be **1**; otherwise all OOVs are
   * ignored.
   * @param  {string[]} tokens tokenized text, usually obtained via winkNLP.
   * @param  {boolean} processOOV true — process OOV, false — ignore OOV (default).
   * @return {object} its bow.
   */
  methods.bowOf = function ( tokens, processOOV = false ) {
    computeWeights();
    const bow = Object.create( null );
    const avgDL = sumOfAllDLs / docId;
    let thisNorm = 0;
    if ( typeof processOOV !== 'boolean' ) {
      throw Error( 'wink-nlp: processOOV must be a boolean.' );
    }
    for ( let i = 0; i < tokens.length; i += 1 ) {
      const t = tokens[ i ];
      // `processOOV` true means count every term; otherwise count a term only
      // if it is in the vocabulary i.e. `idf`.
      if ( processOOV ) {
        bow[ t ] = 1 + ( bow[ t ] || 0 );
      } else if ( idf[ t ] ) bow[ t ] = 1 + ( bow[ t ] || 0 );
    }
    for ( const t in bow ) { // eslint-disable-line guard-for-in
      // `bow` tokens are determined by `processOOV` i.e. if true it will also
      // contain OOVs, otherwise it will not have any OOV. On the other hand,
      // `idf` always contains all the seen tokens. Therefore when `processOOV`
      // is true, the `idf[ t ]` for every OOV is taken as **1** (highest
      // possible value).
      bow[ t ] = ( idf[ t ] || 1 ) * ( ( k1 + 1 ) * bow[ t ] ) /
                 ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + bow[ t ] );
      thisNorm += normFn[ norm ]( bow[ t ] );
    }
    if ( norm === L2 ) {
      thisNorm = Math.sqrt( thisNorm );
    } else if ( norm === NONE ) thisNorm = 1;
    for ( const t in bow ) { // eslint-disable-line guard-for-in
      // Unlike in `vectorOf`, `thisNorm || 1` is not needed here as bow will be
      // empty if `thisNorm` is zero!
      bow[ t ] = +( bow[ t ] / thisNorm ).toFixed( precision );
    }
    return bow;
  }; // bowOf()

  // ## config
  /**
   * Returns the vectorizer's configuration — k, k1, b & norm.
   *
   * @return {object} the configuration.
   */
  methods.config = ( () => ( { k: k, k1: k1, b: b, norm: norm } ) );

  // ## loadModel
  /**
   * Loads the input model JSON into BM25's respective data structures. Throws
   * an error if an invalid JSON or model is passed. Sets `weightsComputed` to
   * true to prevent further learning.
   * @param  {string} json input model's JSON string.
   * @return {void} Nothing!
   */
  methods.loadModel = function ( json ) {
    // Used to check presence of required fields; `uid` is checked separately.
    const modelFields = [ 'docId', 'tf', 'idf', 'terms', 'sumOfAllDLs' ];
    let model;
    if ( docId > 0 ) throw Error( 'wink-nlp: can not load model after learning.' );
    try {
      model = JSON.parse( json );
    } catch ( e ) {
      throw Error( `wink-nlp: invalid input JSON:\n\t${e}\n\n` );
    }

    if ( helper.isObject( model ) && ( Object.keys( model ).length === 6 ) && ( model.uid === 'WinkNLP-BM25Vectorizer-Model/1.0.0' ) ) {
      // Check presence of all required fields.
      modelFields.forEach( ( f ) => {
        if ( model[ f ] === undefined ) throw Error( 'wink-nlp: invalid model format/version' );
      } );
      // All good, set fields.
      docId = model.docId;
      tf = model.tf;
      idf = model.idf;
      terms = model.terms;
      sumOfAllDLs = model.sumOfAllDLs;
      // To prevent further learning.
      weightsComputed = true;
    } else {
      throw Error( 'wink-nlp: invalid model format/version' );
    }
  }; // loadModel()

  return methods;
}; // bm25Vectorizer()

module.exports = bm25Vectorizer;
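
A short usage sketch of the API defined above. It is illustrative and not part of this module: it assumes wink-nlp's documented setup, i.e. the `wink-eng-lite-web-model` language model, the `its` helpers exposed as `nlp.its`, and this file being published as `wink-nlp/utilities/bm25-vectorizer`.

// Illustrative usage sketch, assuming wink-nlp's documented entry points.
const winkNLP = require( 'wink-nlp' );
const model = require( 'wink-eng-lite-web-model' );
const bm25Vectorizer = require( 'wink-nlp/utilities/bm25-vectorizer' );

const nlp = winkNLP( model );
const its = nlp.its;

const v = bm25Vectorizer( { k1: 1.2, b: 0.75, norm: 'l2' } );

// One learn() call per tokenized document.
const corpus = [ 'Bach composed fugues', 'Bach wrote cantatas', 'Mozart wrote operas' ];
corpus.forEach( ( text ) => v.learn( nlp.readDoc( text ).tokens().out( its.value ) ) );

// The first out() call triggers computeWeights() and freezes further learning.
// its.docBOWArray is the default fall-back used by out(), per the code above.
console.log( v.out( its.docBOWArray ) );
// Document-level APIs: l2-normalized vector & unique-token count of document 0.
console.log( v.doc( 0 ).out( its.vector ) );
console.log( v.doc( 0 ).length() );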
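
At prediction time, `vectorOf()` and `bowOf()` apply the learned weights to new, possibly out-of-vocabulary text. Continuing the sketch above:

// Vector of a new document over the learned, alphabetically sorted `terms`;
// unseen tokens simply contribute zeros (and `thisNorm || 1` guards the divide).
const tokens = nlp.readDoc( 'Bach wrote fugues' ).tokens().out( its.value );
console.log( v.vectorOf( tokens ) );

// Bag-of-words of the same document; with processOOV = true, out-of-vocabulary
// tokens are retained and their idf is taken as 1.
console.log( v.bowOf( tokens, true ) );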
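
Persistence pairs `out()` with `loadModel()`. The `its.modelJSON` helper named below follows wink-nlp's documented API for serializing the vectorizer's model, but it is defined elsewhere in the package, so treat this as a sketch under that assumption:

const fs = require( 'fs' );

// Save: per the comments in out(), `tf`, `idf`, `terms`, `docId` and
// `sumOfAllDLs` are all needed while saving the model JSON.
fs.writeFileSync( 'bm25-model.json', v.out( its.modelJSON ) );

// Load into a fresh instance; loading is permitted only before any learn()
// call, and loadModel() then blocks further learning via weightsComputed.
const v2 = bm25Vectorizer();
v2.loadModel( fs.readFileSync( 'bm25-model.json', 'utf8' ) );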