UNPKG

@happyaccident/wink-sentiment

Version:

Accurate and fast sentiment scoring of phrases with emoticons :) & emojis 🎉

205 lines (197 loc) • 8.22 kB
//     wink-sentiment
//     Accurate and fast sentiment scoring of phrases with hashtags, emoticons & emojis.
//
//     Copyright (C) 2017-18  GRAYPE Systems Private Limited
//
//     This file is part of “wink-sentiment”.
//
//     Permission is hereby granted, free of charge, to any person obtaining a
//     copy of this software and associated documentation files (the "Software"),
//     to deal in the Software without restriction, including without limitation
//     the rights to use, copy, modify, merge, publish, distribute, sublicense,
//     and/or sell copies of the Software, and to permit persons to whom the
//     Software is furnished to do so, subject to the following conditions:
//
//     The above copyright notice and this permission notice shall be included
//     in all copies or substantial portions of the Software.
//
//     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
//     OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
//     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
//     DEALINGS IN THE SOFTWARE.

//
const emojis = require( './emojis.js' );
const afinn = require( './afinn-en-165.js' );
const emoticons = require( './emoticons.js' );
const negations = require( './negations.js' );
const affin2Grams = require( './afinn-en-165-2grams.js' );
const tokenize = require( 'wink-tokenizer' )().tokenize;

/* eslint max-depth: 0 */

// ### lookup
/**
 *
 * Reads `key` from `lexicon`, considering **own** properties only.
 *
 * Tokens come straight from user text, so a word such as `constructor` must
 * not resolve to a member inherited from `Object.prototype`. The lexicon
 * modules are not visible from this file; the guard is harmless if they are
 * prototype-less objects and necessary if they are plain object literals.
 *
 * @param {object} lexicon key/value table to read from.
 * @param {string} key property name to look up.
 * @return {*} the stored value, or `undefined` when `key` is not an own property.
 * @private
*/
const lookup = function ( lexicon, key ) {
  return Object.prototype.hasOwnProperty.call( lexicon, key ) ? lexicon[ key ] : undefined;
}; // lookup()

// ### isNegation
/**
 *
 * Tells whether the lower-cased value of `token` is listed in `negations`.
 *
 * @param {object} token a token produced by the tokenizer; only `value` is read.
 * @return {boolean} `true` if the token's value has a truthy entry in `negations`.
 * @private
*/
const isNegation = function ( token ) {
  return Boolean( lookup( negations, token.value.toLowerCase() ) );
}; // isNegation()

// ### normalize
/**
 *
 * Computes the normalized sentiment score from the absolute scores.
 *
 * @param {number} hss absolute sentiment score of hashtags.
 * @param {number} wss absolute sentiment score of words/emojis/emoticons.
 * @param {number} sentiHashtags number of hashtags that have an associated sentiment score.
 * @param {number} sentiWords number of words that have an associated sentiment score.
 * @param {number} totalWords total number of words in the text.
 * @return {number} normalized score.
 * @private
*/
const normalize = function ( hss, wss, sentiHashtags, sentiWords, totalWords ) {
  // **N**ormalized **h**ashtags & **w**ords **s**entiment **s**cores.
  let nhss = 0;
  let nwss = 0;
  // 1. Normalize hashtags sentiment score by computing the average.
  if ( sentiHashtags ) nhss = hss / sentiHashtags;

  if ( sentiWords ) {
    // 2. Normalize words sentiment score by computing the average.
    nwss = wss / sentiWords;
    // 3. Normalized words sentiment score is further adjusted on the basis of the
    // total number of words in the text.
    // Average sentence length in words (assumed).
    const avgLength = 15;
    // Texts longer than the average are damped by the square root of their
    // relative length; shorter texts are left untouched (divisor of 1).
    nwss /= Math.sqrt( ( totalWords > avgLength ) ? ( totalWords / avgLength ) : 1 );
  }
  // When both parts carry sentiment, average them; otherwise return whichever
  // one is non-zero (or 0 if neither is).
  return ( nhss && nwss ) ? ( ( nhss + nwss ) / 2 ) : ( nwss || nhss );
}; // normalize()

// ### sentiment
/**
 *
 * Computes the absolute and normalized sentiment scores of the input `phrase`,
 * after tokenizing it.
 *
 * The normalized score is computed by taking into account the absolute scores of
 * words, emojis, emoticons, and hashtags and adjusting it on the basis of total
 * words in the text; it stays between -5 and +5 as long as the lexicon scores do.
 * A score of less than 0 indicates negative sentiments and a score of more than 0
 * indicates positive sentiments; whereas a near zero score suggests a neutral
 * sentiment. While counting tokens only the ones tagged as **`word`**,
 * **`emoji`**, or **`emoticon`** are counted.
 * NOTE(review): earlier documentation stated that one letter words are ignored,
 * but no such filter is visible in this function; confirm against the tokenizer.
 *
 * It performs tokenization using [wink-tokenizer](http://winkjs.org/wink-tokenizer/).
 * During sentiment analysis, each token may be assigned up to 3 new properties.
 * These properties are:
 *
 * 1. **`score`** — contains the sentiment score of the word, emoji, emoticon or
 * hashtag. This is added only when the token in question has a non-zero
 * sentiment associated with it.
 * 2. **`negation`** — is added & set to **true** on every `word` token that has a
 * negation word one or two tokens before it; the token's score, if any, has its
 * sign flipped.
 * 3. **`grouped`** — is added (with the value `1`) whenever the token is the first
 * word of a two-word phrase/idiom found in the bigram lexicon; the token that
 * follows it is consumed as part of the phrase and is not scored separately.
 *
 * @param {string} phrase whose sentiment score needs to be computed.
 * @return {object} absolute `score`, `normalizedScore` and `tokenizedPhrase` of `phrase`.
 * @throws {Error} if `phrase` is not a string.
 *
 * @example
 * sentiment( 'not a good product #fail' );
 * // -> { score: -5,
 * //      normalizedScore: -2.5,
 * //      tokenizedPhrase: [
 * //        { value: 'not', tag: 'word' },
 * //        { value: 'a', tag: 'word', negation: true },
 * //        { value: 'good', tag: 'word', negation: true, score: -3 },
 * //        { value: 'product', tag: 'word' },
 * //        { value: '#fail', tag: 'hashtag', score: -2 }
 * //      ]
 * //    }
*/
const sentiment = function ( phrase ) {
  if ( typeof phrase !== 'string' ) {
    throw new Error( 'wink-sentiment: input phrase must be a string, instead found: ' + typeof phrase );
  }
  const tokenizedPhrase = tokenize( phrase );
  // Early exit; keep the return shape identical to the regular path.
  if ( tokenizedPhrase.length === 0 ) {
    return { score: 0, normalizedScore: 0, tokenizedPhrase: tokenizedPhrase };
  }

  // Sentiment Score of words, emojis & emoticons.
  let ss = 0;
  // Hash Tags SS.
  let hss = 0;
  // Number of sentiment containing hashtags and words encountered.
  let sentiHashtags = 0;
  let sentiWords = 0;
  // Number of words encountered.
  let words = 0;
  // Helpers: token, its value, temp ss, word count and bigram lookups.
  let tkn, t, tss, wc, bigrams, bigramScore;

  // An index loop is required: a matched bigram advances `k` by an extra step.
  for ( let k = 0, kmax = tokenizedPhrase.length; k < kmax; k += 1 ) {
    tkn = tokenizedPhrase[ k ];
    t = tkn.value;
    switch ( tkn.tag ) {
      case 'emoji':
        tss = lookup( emojis, t );
        if ( tss ) {
          ss += tss;
          tkn.score = tss;
          sentiWords += 1;
        }
        words += 1;
        break;
      case 'emoticon':
        tss = lookup( emoticons, t );
        if ( tss ) {
          ss += tss;
          tkn.score = tss;
          sentiWords += 1;
        }
        words += 1;
        break;
      case 'hashtag':
        // Drop the leading `#` and score the rest as a regular AFINN word.
        // Hashtags are scored separately and are not counted as words.
        tss = lookup( afinn, t.slice( 1 ).toLowerCase() );
        if ( tss ) {
          tkn.score = tss;
          hss += tss;
          sentiHashtags += 1;
        }
        break;
      case 'word':
        // Lower case for case insensitive comparison.
        t = t.toLowerCase();
        wc = 1;
        // Check for bigram configurations i.e. token at `k` and `k+1`. Accordingly
        // compute the sentiment score in `tss`.
        bigrams = ( k < ( kmax - 1 ) ) ? lookup( affin2Grams, t ) : undefined;
        bigramScore = bigrams ?
                        lookup( bigrams, tokenizedPhrase[ k + 1 ].value.toLowerCase() ) :
                        undefined;
        if ( bigramScore !== undefined ) {
          tss = bigramScore;
          tkn.grouped = 1;
          // Will have to count `2` words!
          wc = 2;
        } else {
          tss = lookup( afinn, t ) || 0;
        }
        // Check for negation — up to two tokens back; even a bigram AFINN
        // config may be negated!
        if ( ( k > 0 && isNegation( tokenizedPhrase[ k - 1 ] ) ) ||
             ( k > 1 && isNegation( tokenizedPhrase[ k - 2 ] ) ) ) {
          tss = -tss;
          tkn.negation = true;
        }
        ss += tss;
        // Increment `k` by 1 if a bigram config was found earlier i.e. `wc` was set to **2**.
        k += ( wc - 1 );
        if ( tss ) {
          tkn.score = tss;
          sentiWords += 1;
        }
        // Update number of words accordingly.
        words += wc;
        break;
      default:
        // Do Nothing!
    } // switch ( tkn.tag )
  }

  // Return score and its normalized value (rounded to 4 decimal places).
  return {
    score: ( ss + hss ),
    normalizedScore: +( normalize( hss, ss, sentiHashtags, sentiWords, words ) ).toFixed( 4 ),
    tokenizedPhrase: tokenizedPhrase
  };
}; // sentiment()

module.exports = sentiment;