vader-sentiment
Version:
JavaScript port of the VADER sentiment tool
694 lines (625 loc) • 19.1 kB
JavaScript
/**
* Copyright 2018 Comcast Cable Communications Management, LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
JavaScript port of the VADER sentiment analysis tool. Source:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
*/
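/**
Example usage (an illustrative sketch, assuming this file is consumed as the
'vader-sentiment' npm package and that vader_lexicon.js is available next to it):

  const vader = require('vader-sentiment');
  const scores = vader.SentimentIntensityAnalyzer.polarity_scores('VADER is really GREAT!!');
  // scores has the shape { neg, neu, pos, compound }, with compound in [-1, 1]

The exact import path and export shape depend on how the package bundles this file.
*/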
// Constants
const LEXICON = require('./vader_lexicon.js').lexicon;
// (empirically derived mean sentiment intensity rating increase for booster words)
export const B_INCR = 0.293;
export const B_DECR = -0.293;
// (empirically derived mean sentiment intensity rating increase for using
// ALLCAPs to emphasize a word)
export const C_INCR = 0.733;
export const N_SCALAR = -0.74;
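// Illustrative effect of these constants (a sketch, assuming a lexicon rating of
// roughly +1.9 for "good"): an emphasized "GOOD" amid lower-case text scores about
// 1.9 + 0.733, a boosted "very good" about 1.9 + 0.293, and a negated "not good"
// about 1.9 * -0.74.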
export const REGEX_REMOVE_PUNCTUATION = new RegExp(/[!"#$%&'()*+,-./:;<=>?@[\\\]^_`{|}~]/g);
// ` // <- to fix ide thinking grave accent in regex is starting a template quote
export const PUNC_LIST = [
'.',
'!',
'?',
',',
';',
':',
'-',
"'",
'"',
'!!',
'!!!',
'??',
'???',
'?!?',
'!?!',
'?!?!',
'!?!?'
];
export const NEGATE = [
'aint',
'arent',
'cannot',
'cant',
'couldnt',
'darent',
'didnt',
'doesnt',
"ain't",
"aren't",
"can't",
"couldn't",
"daren't",
"didn't",
"doesn't",
'dont',
'hadnt',
'hasnt',
'havent',
'isnt',
'mightnt',
'mustnt',
'neither',
"don't",
"hadn't",
"hasn't",
"haven't",
"isn't",
"mightn't",
"mustn't",
'neednt',
"needn't",
'never',
'none',
'nope',
'nor',
'not',
'nothing',
'nowhere',
'oughtnt',
'shant',
'shouldnt',
'uhuh',
'wasnt',
'werent',
"oughtn't",
"shan't",
"shouldn't",
'uh-uh',
"wasn't",
"weren't",
'without',
'wont',
'wouldnt',
"won't",
"wouldn't",
'rarely',
'seldom',
'despite'
];
// booster/dampener 'intensifiers' or 'degree adverbs'
// http://en.wiktionary.org/wiki/Category:English_degree_adverbs
export const BOOSTER_DICT = {
absolutely: B_INCR,
amazingly: B_INCR,
awfully: B_INCR,
completely: B_INCR,
considerably: B_INCR,
decidedly: B_INCR,
deeply: B_INCR,
effing: B_INCR,
enormously: B_INCR,
entirely: B_INCR,
especially: B_INCR,
exceptionally: B_INCR,
extremely: B_INCR,
fabulously: B_INCR,
flipping: B_INCR,
flippin: B_INCR,
fricking: B_INCR,
frickin: B_INCR,
frigging: B_INCR,
friggin: B_INCR,
fully: B_INCR,
fucking: B_INCR,
greatly: B_INCR,
hella: B_INCR,
highly: B_INCR,
hugely: B_INCR,
incredibly: B_INCR,
intensely: B_INCR,
majorly: B_INCR,
more: B_INCR,
most: B_INCR,
particularly: B_INCR,
purely: B_INCR,
quite: B_INCR,
really: B_INCR,
remarkably: B_INCR,
so: B_INCR,
substantially: B_INCR,
thoroughly: B_INCR,
totally: B_INCR,
tremendously: B_INCR,
uber: B_INCR,
unbelievably: B_INCR,
unusually: B_INCR,
utterly: B_INCR,
very: B_INCR,
almost: B_DECR,
barely: B_DECR,
hardly: B_DECR,
'just enough': B_DECR,
'kind of': B_DECR,
kinda: B_DECR,
kindof: B_DECR,
'kind-of': B_DECR,
less: B_DECR,
little: B_DECR,
marginally: B_DECR,
occasionally: B_DECR,
partly: B_DECR,
scarcely: B_DECR,
slightly: B_DECR,
somewhat: B_DECR,
'sort of': B_DECR,
sorta: B_DECR,
sortof: B_DECR,
'sort-of': B_DECR
};
// check for special case idioms using a sentiment-laden keyword known to VADER
export const SPECIAL_CASE_IDIOMS = {
'the shit': 3,
'the bomb': 3,
'bad ass': 1.5,
'yeah right': -2,
'cut the mustard': 2,
'kiss of death': -1.5,
'hand to mouth': -2
};
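// Used by SentimentIntensityAnalyzer.idioms_check below: when one of these n-grams
// surrounds the current token, the computed valence is replaced outright by the
// idiom's value, e.g. 'the bomb' -> +3, 'yeah right' -> -2.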
// static methods
export const negated = (input_words, include_nt = true) => {
  /**
  Determine if input contains negation words
  */
  // lower-case inputs so capitalized negators (e.g. "NOT") are still detected
  const words = input_words.map(word => String(word).toLowerCase());
  for (let i = 0; i < NEGATE.length; i++) {
    if (words.indexOf(NEGATE[i]) >= 0) {
      return true;
    }
  }
  if (include_nt === true) {
    for (let i = 0; i < words.length; i++) {
      if (words[i].indexOf("n't") >= 0) {
        return true;
      }
    }
  }
  // "least" preceded by anything other than "at" also signals negation
  const i = words.indexOf('least');
  return i > 0 && words[i - 1] !== 'at';
};
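/**
Illustrative behaviour of negated() (a sketch derived from the lists above):

  negated(['not', 'good'])      // true  ("not" is in NEGATE)
  negated(["doesn't", 'work'])  // true  (contains "n't")
  negated(['very', 'good'])     // false
*/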
export const normalize = (score, alpha = 15) => {
/**
Normalize the score to be between -1 and 1 using an alpha that
approximates the max expected value
*/
const norm_score = score / Math.sqrt(score * score + alpha);
if (norm_score < -1.0) {
return -1.0;
} else if (norm_score > 1.0) {
return 1.0;
} else {
return norm_score;
}
};
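/**
Worked example for normalize() with the default alpha = 15:

  normalize(4) = 4 / Math.sqrt(4 * 4 + 15) = 4 / Math.sqrt(31) ≈ 0.7184

so unbounded valence sums are squashed into the compound range [-1, 1].
*/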
export const allcap_differential = words => {
/**
Check whether just some words in the input are ALL CAPS
*/
let allcap_words = 0;
for (let i = 0; i < words.length; i++) {
if (is_upper_python(words[i])) {
allcap_words += 1;
}
}
const cap_differential = words.length - allcap_words;
return cap_differential > 0 && cap_differential < words.length;
};
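// Illustrative examples derived from the definition above:
//   allcap_differential(['VADER', 'is', 'GREAT'])  // true  (only some words are ALL CAPS)
//   allcap_differential(['VADER', 'IS', 'GREAT'])  // false (every word is ALL CAPS)
//   allcap_differential(['vader', 'is', 'great'])  // false (no word is ALL CAPS)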
export const scalar_inc_dec = (word, valence, is_cap_diff) => {
/**
Check if the preceding words increase, decrease, or negate/nullify the
valence
*/
let scalar = 0.0;
const word_lower = word.toLowerCase();
if (BOOSTER_DICT.hasOwnProperty(word_lower)) {
scalar = BOOSTER_DICT[word_lower];
if (valence < 0) {
scalar *= -1;
}
// check if booster/dampener word is in ALLCAPS (while others aren't)
if (is_cap_diff && is_upper_python(word)) {
if (valence > 0) {
scalar += C_INCR;
} else {
scalar -= C_INCR;
}
}
}
return scalar;
};
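/**
Illustrative return values of scalar_inc_dec(), given the constants above:

  scalar_inc_dec('very', 2.0, false)   // 0.293          (booster word)
  scalar_inc_dec('very', -2.0, false)  // -0.293         (booster flipped for negative valence)
  scalar_inc_dec('VERY', 2.0, true)    // 0.293 + 0.733  (ALLCAPS emphasis added)
  scalar_inc_dec('table', 2.0, false)  // 0              (not a booster/dampener)
*/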
export const is_upper_python = word => {
/**
  Python-style "isupper" check. The string must be at least one character long. An
  emoticon such as ':)' does not count as an uppercase word, but a string made up of
  special characters plus only capital letters does, e.g. ':)WORD' returns true.
*/
if (typeof word === 'string' || word instanceof String) {
if (word.length > 0) {
return /^[^a-z]*[A-Z]+[^a-z]*$/g.test(word);
}
}
return false;
};
export class SentiText {
/**
Identify sentiment-relevant string-level properties of input text
*/
constructor(text) {
this.text = text;
this.words_and_emoticons = this.get_words_and_emoticons();
// doesn't separate words from adjacent punctuation (keeps emoticons & contractions)
this.is_cap_diff = allcap_differential(this.words_and_emoticons);
}
get_words_plus_punc() {
/**
Returns mapping of form:
{
'cat,': 'cat',
',cat': 'cat'
}
*/
// removes punctuation (but loses emoticons & contractions)
const no_punc_text = this.text.slice(0).replace(REGEX_REMOVE_PUNCTUATION, '');
const words = no_punc_text.split(/\s/);
// removes singletons
const words_only = words.filter(word => {
return word.length > 1;
});
let words_punc_dict = {};
for (let j = 0; j < PUNC_LIST.length; j++) {
for (let k = 0; k < words_only.length; k++) {
let pb = `${PUNC_LIST[j]}${words_only[k]}`;
let pa = `${words_only[k]}${PUNC_LIST[j]}`;
words_punc_dict[pb] = words_only[k];
words_punc_dict[pa] = words_only[k];
}
}
return words_punc_dict;
}
get_words_and_emoticons() {
/**
    Removes leading and trailing punctuation
Leaves contractions and most emoticons
Does not preserve punc-plus-letter emoticons (e.g. :D)
*/
const tokens = this.text.split(/\s/);
const words_punc_dict = this.get_words_plus_punc();
let words_only = tokens.filter(token => {
return token.length > 1;
});
for (let i = 0; i < words_only.length; i++) {
if (words_punc_dict.hasOwnProperty(words_only[i])) {
words_only[i] = words_punc_dict[words_only[i]];
}
}
return words_only;
}
}
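/**
Illustrative tokenization produced by the class above:

  new SentiText('The movie was GREAT!!').words_and_emoticons
  // -> ['The', 'movie', 'was', 'GREAT']   (trailing '!!' stripped via words_punc_dict)

  new SentiText('I am :) today').words_and_emoticons
  // -> ['am', ':)', 'today']              (single-character tokens such as 'I' are dropped)
*/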
export class SentimentIntensityAnalyzer {
/**
Give a sentiment intensity score to sentences
*/
static polarity_scores(text) {
/**
    Return a dict of sentiment strength scores for the input text.
    Positive values are positive valence, negative values are negative
    valence.
*/
const sentiText = new SentiText(text);
let sentiments = [];
const words_and_emoticons = sentiText.words_and_emoticons;
for (let i = 0; i < words_and_emoticons.length; i++) {
let valence = 0;
const item = words_and_emoticons[i];
if (
(i < words_and_emoticons.length - 1 &&
item.toLowerCase() === 'kind' &&
words_and_emoticons[i + 1].toLowerCase() === 'of') ||
BOOSTER_DICT.hasOwnProperty(item.toLowerCase())
) {
sentiments.push(valence);
continue;
}
sentiments = SentimentIntensityAnalyzer.sentiment_valence(
valence,
sentiText,
item,
i,
sentiments
);
}
sentiments = SentimentIntensityAnalyzer.but_check(words_and_emoticons, sentiments);
const valence_dict = SentimentIntensityAnalyzer.score_valence(sentiments, text);
return valence_dict;
}
static sentiment_valence(valence, sentiText, item, index, sentiments) {
const is_cap_diff = sentiText.is_cap_diff;
const words_and_emoticons = sentiText.words_and_emoticons;
const item_lowercase = item.toLowerCase();
if (LEXICON.hasOwnProperty(item_lowercase)) {
// get the sentiment valence
valence = LEXICON[item_lowercase];
// check if sentiment laden word is in ALL CAPS (while others aren't)
if (is_upper_python(item) && is_cap_diff) {
if (valence > 0) {
valence += C_INCR;
} else {
valence -= C_INCR;
}
}
for (let start_i = 0; start_i < 3; start_i++) {
if (
index > start_i &&
LEXICON.hasOwnProperty(words_and_emoticons[index - (start_i + 1)].toLowerCase()) === false
) {
// dampen the scalar modifier of preceding words and emoticons
          // (excluding the ones that immediately precede the item) based
// on their distance from the current item.
let s = scalar_inc_dec(words_and_emoticons[index - (start_i + 1)], valence, is_cap_diff);
if (start_i === 1 && s !== 0) {
s = s * 0.95;
} else if (start_i === 2 && s !== 0) {
s = s * 0.9;
}
valence = valence + s;
valence = this.never_check(valence, words_and_emoticons, start_i, index);
if (start_i === 2) {
valence = this.idioms_check(valence, words_and_emoticons, index);
}
}
}
valence = this.least_check(valence, words_and_emoticons, index);
}
sentiments.push(valence);
return sentiments;
}
static least_check(valence, words_and_emoticons, index) {
/**
    Check for negation case using "least"
*/
if (
index > 1 &&
words_and_emoticons[index - 1].toLowerCase() === 'least' &&
LEXICON.hasOwnProperty(words_and_emoticons[index - 1].toLowerCase()) === false
) {
if (
words_and_emoticons[index - 2].toLowerCase() !== 'at' &&
words_and_emoticons[index - 2].toLowerCase() !== 'very'
) {
valence = valence * N_SCALAR;
}
} else if (
index > 0 &&
words_and_emoticons[index - 1].toLowerCase() === 'least' &&
LEXICON.hasOwnProperty(words_and_emoticons[index - 1].toLowerCase()) === false
) {
valence = valence * N_SCALAR;
}
return valence;
}
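  /**
  Illustrative behaviour of least_check() above: when the word directly before the
  current token is "least" (and "least" itself is not a lexicon entry), the valence
  is multiplied by N_SCALAR (-0.74), so "least liked" flips the score of "liked";
  "at least liked" and "very least liked" are exempt and left unchanged.
  */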
static but_check(words_and_emoticons, sentiments) {
/**
Check for modification in sentiment due to contrastive conjunction 'but'
*/
let but_index = words_and_emoticons.indexOf('but');
if (but_index === -1) {
but_index = words_and_emoticons.indexOf('BUT');
}
    if (but_index !== -1) {
      for (let i = 0; i < sentiments.length; i++) {
        if (i < but_index) {
          // dampen sentiment scores before 'but'
          sentiments[i] = sentiments[i] * 0.5;
        } else if (i > but_index) {
          // amplify sentiment scores after 'but'
          sentiments[i] = sentiments[i] * 1.5;
        }
      }
    }
return sentiments;
}
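  /**
  Illustrative effect of but_check() above: in "The food is great but the service is
  awful", sentiment scores for tokens before "but" are halved and scores after it are
  multiplied by 1.5, so the clause following "but" dominates the overall score.
  */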
static idioms_check(valence, words_and_emoticons, index) {
const onezero = `${words_and_emoticons[index - 1]} ${words_and_emoticons[index]}`;
const twoonezero = `${words_and_emoticons[index - 2]} ${words_and_emoticons[index - 1]} ${
words_and_emoticons[index]
}`;
const twoone = `${words_and_emoticons[index - 2]} ${words_and_emoticons[index - 1]}`;
const threetwoone = `${words_and_emoticons[index - 3]} ${words_and_emoticons[index - 2]} ${
words_and_emoticons[index - 1]
}`;
const threetwo = `${words_and_emoticons[index - 3]} ${words_and_emoticons[index - 2]}`;
const sequences = [onezero, twoonezero, twoone, threetwoone, threetwo];
for (let i = 0; i < sequences.length; i++) {
if (SPECIAL_CASE_IDIOMS.hasOwnProperty(sequences[i])) {
valence = SPECIAL_CASE_IDIOMS[sequences[i]];
break;
}
}
if (words_and_emoticons.length - 1 > index) {
const zeroone = `${words_and_emoticons[index]} ${words_and_emoticons[index + 1]}`;
if (SPECIAL_CASE_IDIOMS.hasOwnProperty(zeroone)) {
valence = SPECIAL_CASE_IDIOMS[zeroone];
}
}
if (words_and_emoticons.length - 1 > index + 1) {
const zeroonetwo = `${words_and_emoticons[index]} ${words_and_emoticons[index + 1]} ${
words_and_emoticons[index + 2]
}`;
if (SPECIAL_CASE_IDIOMS.hasOwnProperty(zeroonetwo)) {
valence = SPECIAL_CASE_IDIOMS[zeroonetwo];
}
}
// check for booster/dampener bi-grams such as 'sort of' or 'kind of'
if (BOOSTER_DICT.hasOwnProperty(threetwo) || BOOSTER_DICT.hasOwnProperty(twoone)) {
valence = valence + B_DECR;
}
return valence;
}
static never_check(valence, words_and_emoticons, start_i, index) {
if (start_i === 0) {
if (negated([words_and_emoticons[index - 1]])) {
valence = valence * N_SCALAR;
}
}
if (start_i === 1) {
if (
words_and_emoticons[index - 2] === 'never' &&
(words_and_emoticons[index - 1] === 'so' || words_and_emoticons[index - 1] === 'this')
) {
valence = valence * 1.5;
} else if (negated([words_and_emoticons[index - (start_i + 1)]])) {
valence = valence * N_SCALAR;
}
}
if (start_i === 2) {
if (
(words_and_emoticons[index - 3] === 'never' &&
(words_and_emoticons[index - 2] === 'so' || words_and_emoticons[index - 2] === 'this')) ||
(words_and_emoticons[index - 1] === 'so' || words_and_emoticons[index - 1] === 'this')
) {
valence = valence * 1.25;
} else if (negated([words_and_emoticons[index - (start_i + 1)]])) {
valence = valence * N_SCALAR;
}
}
return valence;
}
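  /**
  Illustrative effect of never_check() above: a plain negation such as "not good"
  flips and dampens the valence of "good" (multiplied by N_SCALAR = -0.74), while
  "never so good" / "never this good" intensifies it instead (multiplied by 1.5).
  */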
static punctuation_emphasis(sum_s, text) {
/**
Add emphasis from exclamation points and question marks
*/
const ep_amplifier = SentimentIntensityAnalyzer.amplify_ep(text);
const qm_amplifier = SentimentIntensityAnalyzer.amplify_qm(text);
const punct_emph_amplifier = ep_amplifier + qm_amplifier;
return punct_emph_amplifier;
}
static amplify_ep(text) {
/**
Check for added emphasis resulting from exclamation points (up to 4 of them)
*/
let ep_count = text.replace(/[^!]/g, '').length;
if (ep_count > 4) {
ep_count = 4;
}
// empirically derived mean sentiment intensity rating increase for exclamation points
const ep_amplifier = ep_count * 0.292;
return ep_amplifier;
}
static amplify_qm(text) {
/**
Check for added emphasis resulting from question marks (2 or 3+)
*/
let qm_count = text.replace(/[^?]/g, '').length;
let qm_amplifier = 0;
if (qm_count > 1) {
if (qm_count <= 3) {
// empirically derived mean sentiment intensity rating increase for question marks
qm_amplifier = qm_count * 0.18;
} else {
qm_amplifier = 0.96;
}
}
return qm_amplifier;
}
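  /**
  Worked examples for the punctuation amplifiers above:

    amplify_ep('Great!!!')    // 3 exclamation points -> 3 * 0.292 = 0.876
    amplify_qm('Really???')   // 3 question marks     -> 3 * 0.18  = 0.54
    amplify_qm('Really????')  // 4 or more question marks cap out at 0.96
  */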
static sift_sentiment_scores(sentiments) {
/**
    Separate the positive and negative sentiment sums and count the neutral tokens
*/
let pos_sum = 0.0;
let neg_sum = 0.0;
let neu_count = 0;
for (let i = 0; i < sentiments.length; i++) {
const sentiment_score = sentiments[i];
if (sentiment_score > 0) {
pos_sum += sentiment_score + 1; // compensates for neutral words that are counted as 1
} else if (sentiment_score < 0) {
        neg_sum += sentiment_score - 1; // when used with Math.abs(), compensates for neutrals
} else {
neu_count += 1;
}
}
const results = [pos_sum, neg_sum, neu_count];
return results;
}
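  /**
  Illustrative result of sift_sentiment_scores() above: for sentiments [2.1, -1.5, 0]
  it returns [3.1, -2.5, 1], since each positive score contributes score + 1, each
  negative score contributes score - 1, and zeros are counted as neutral.
  */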
static score_valence(sentiments, text) {
if (sentiments && sentiments.length > 0) {
let sum_s = 0;
for (let i = 0; i < sentiments.length; i++) {
sum_s += sentiments[i];
}
// compute and add emphasis from punctuation in text
const punct_emph_amplifier = SentimentIntensityAnalyzer.punctuation_emphasis(sum_s, text);
if (sum_s > 0) {
sum_s += punct_emph_amplifier;
} else if (sum_s < 0) {
sum_s -= punct_emph_amplifier;
}
let compound = normalize(sum_s);
// discriminate between positive, negative and neutral sentiment scores
let scores = SentimentIntensityAnalyzer.sift_sentiment_scores(sentiments);
let pos_sum = scores[0];
let neg_sum = scores[1];
let neu_count = scores[2];
if (pos_sum > Math.abs(neg_sum)) {
pos_sum += punct_emph_amplifier;
} else if (pos_sum < Math.abs(neg_sum)) {
neg_sum -= punct_emph_amplifier;
}
const total = pos_sum + Math.abs(neg_sum) + neu_count;
const pos = Math.abs(pos_sum / total);
const neg = Math.abs(neg_sum / total);
const neu = Math.abs(neu_count / total);
const sentiment_dict = {
neg: parseFloat(neg.toFixed(3)),
neu: parseFloat(neu.toFixed(3)),
pos: parseFloat(pos.toFixed(3)),
compound: parseFloat(compound.toFixed(4))
};
return sentiment_dict;
} else {
const sentiment_dict = {
neg: 0.0,
neu: 0.0,
pos: 0.0,
compound: 0.0
};
return sentiment_dict;
}
}
}