whichx
Version:
A text description classifier for classifying arbitrary strings into provided labels
1 lines • 5.24 kB
JavaScript
/**
 * WhichX - a small Bayesian text classifier that assigns a free-text
 * description to one of a set of caller-defined labels.
 *
 * Usage: `var c = new WhichX(); c.addLabels([...]); c.addData(label, text);
 * c.classify(text);`
 *
 * The model is a plain object of the shape
 * `{ total: {tcount, wordTotal, <word>: n...}, <label>: {tcount, wordTotal, <word>: n...} }`
 * where `tcount` is the number of descriptions seen and `wordTotal` the
 * number of tokens seen. `total` aggregates every label.
 *
 * @constructor
 * @param {Object} [config] Optional configuration.
 * @param {Array} [config.stopwords] Replacement stop-word list. Each entry is
 *     inserted verbatim between `\b` anchors of a regular expression.
 * @throws {Error} If `config.stopwords` is truthy but not an array.
 * @throws {SyntaxError} If a custom stop word is not a valid regular
 *     expression fragment (raised by `RegExp` while compiling the list).
 */
function WhichX(config) {
    var DEFAULT_STOPWORDS = [
        "a", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "being",
        "but", "by", "count", "could", "did", "do", "does", "doing", "during", "each", "few", "for",
        "had", "has", "have", "having", "he", "hed", "hes", "her", "here", "heres", "hers", "herself",
        "him", "himself", "his", "how", "hows", "i", "id", "im", "ive", "if", "in", "into", "is", "it",
        "its", "itself", "lets", "me", "more", "most", "my", "myself", "of", "off", "on", "once",
        "only", "or", "other", "ought", "our", "ours", "ourselves", "over", "own", "same", "she",
        "shes", "should", "so", "some", "such", "than", "that", "thats", "the", "their", "theirs",
        "them", "themselves", "then", "there", "theres", "these", "they", "theyd", "theyll", "theyre",
        "theyve", "this", "those", "through", "to", "too", "until", "was", "we", "wed", "well", "were",
        "weve", "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos",
        "whom", "why", "whys", "with", "wordtotal", "would", "you", "youd", "youll", "youre", "your",
        "youve", "yours", "yourself", "yourselves"
    ];

    // Tokens that must never be counted as words. "tcount" is the name of the
    // per-label document counter, so counting it as a word would overwrite
    // that counter. Tokens are lower-cased, so "wordTotal" itself can never be
    // produced, but "wordtotal" is kept for parity with the original list.
    var RESERVED_TOKENS = ["tcount", "wordtotal"];

    var hasOwn = Object.prototype.hasOwnProperty;
    var STOPWORDS;
    var stopwordPatterns;

    // `total.wordTotal` starts at 1 so that mEstimate() never divides by zero.
    var typesMap = { total: { tcount: 0, wordTotal: 1 } };

    if (!config || !config.stopwords) {
        STOPWORDS = DEFAULT_STOPWORDS.concat(RESERVED_TOKENS);
    } else if (Array.isArray(config.stopwords)) {
        // concat() copies, so the caller's array is never mutated.
        STOPWORDS = config.stopwords.concat(RESERVED_TOKENS);
    } else {
        throw new Error("The `stopwords` variable of your configuration must be an array.");
    }

    // Compile the stop-word expressions once instead of on every call.
    // String.prototype.replace resets lastIndex for global regexes, so the
    // compiled objects can be reused safely.
    stopwordPatterns = STOPWORDS.map(function (stopword) {
        return new RegExp("\\b" + stopword + "\\b", "g");
    });

    /**
     * Registers one or more labels that descriptions can be classified into.
     *
     * @param {string|Array} labels A single label or an array of labels.
     * @throws {Error} If `labels` is neither a string nor an array, or if any
     *     individual label is rejected by addLabel().
     */
    this.addLabels = function (labels) {
        var i;
        if (typeof labels === "string") {
            addLabel(labels);
        } else if (Array.isArray(labels)) {
            for (i = 0; i < labels.length; i++) {
                addLabel(labels[i]);
            }
        } else {
            throw new Error("Invalid label '" + labels + "' of type '" + typeof labels + "'. Expected an Array or a string.");
        }
    };

    /**
     * Trains the model with one description for an existing label.
     *
     * @param {string} label A label previously registered via addLabels()
     *     (matched case-insensitively).
     * @param {string} description Non-empty training text.
     * @throws {Error} If the label is not a string, is the reserved name
     *     "total", is unknown, or if the description is not a non-empty string.
     */
    this.addData = function (label, description) {
        var key, type, total, words, i, word;

        if (typeof label !== "string") {
            throw new Error("Invalid label of type '" + typeof label + "'. Expected string.");
        }
        key = label.toLowerCase();
        if (key === "total") {
            // Training the aggregate bucket directly would double-count.
            throw new Error("Invalid label. 'total' is a reserved keyword.");
        }
        // Own-property check: `in` would also accept inherited names such as
        // "constructor" and then write counters onto Object itself.
        if (!owns(typesMap, key)) {
            throw new Error("Invalid label '" + label + "'. '" + label + "' is not an existing label in: " + Object.keys(typesMap) + ".");
        }
        if (typeof description !== "string" || description.length === 0) {
            throw new Error("Invalid description '" + description + "' of type '" + typeof description + "'. Expected a non-empty string.");
        }

        type = typesMap[key];
        total = typesMap.total;
        type.tcount = type.tcount + 1;
        total.tcount = total.tcount + 1;

        words = processToArray(description);
        for (i = 0; i < words.length; i++) {
            word = words[i];
            increment(type, word);
            increment(total, word);
            type.wordTotal = type.wordTotal + 1;
            total.wordTotal = total.wordTotal + 1;
        }
    };

    /**
     * Returns the label whose score for the description is highest.
     *
     * @param {string} description Non-empty text to classify.
     * @returns {string|undefined} The best label, or `undefined` when no label
     *     produces a comparable (non-NaN) score, e.g. before any training.
     * @throws {Error} If the description is not a non-empty string.
     */
    this.classify = function (description) {
        var words, bestChance, bestLabel, typeName, typeChance;

        if (typeof description !== "string" || description.length === 0) {
            throw new Error("Invalid description " + description + " of type " + typeof description + ". We expected a non empty string.");
        }

        words = processToArray(description);
        bestChance = -1;
        bestLabel = undefined;
        for (typeName in typesMap) {
            // "total" is the aggregate bucket, not a label. Its score always
            // evaluates to NaN (division by zero in getTypeChance), so it
            // could never win; skipping it makes that explicit.
            if (owns(typesMap, typeName) && typeName !== "total") {
                typeChance = getTypeChance(typesMap[typeName], words);
                if (typeChance > bestChance) {
                    bestChance = typeChance;
                    bestLabel = typeName;
                }
            }
        }
        return bestLabel;
    };

    /**
     * Returns the live internal model object (not a copy).
     *
     * @returns {Object} The model, suitable for JSON serialisation.
     */
    this.export = function () {
        return typesMap;
    };

    /**
     * Replaces the internal model with a previously exported one.
     *
     * @param {Object} importedTypesMap A model as returned by export().
     * @throws {Error} If the value does not carry a `total` entry with
     *     `tcount` and `wordTotal`.
     */
    this.import = function (importedTypesMap) {
        var newTotal = importedTypesMap ? importedTypesMap.total : undefined;
        if (newTotal === undefined || newTotal === null ||
                newTotal.tcount === undefined || newTotal.wordTotal === undefined) {
            throw new Error("Import invalid. This doesn't look like it was exported from a prior model.");
        }
        typesMap = importedTypesMap;
    };

    /** True when `key` is an own (non-inherited) property of `obj`. */
    function owns(obj, key) {
        return hasOwn.call(obj, key);
    }

    /** Adds one occurrence of `word` to the given counter object. */
    function increment(counts, word) {
        counts[word] = owns(counts, word) ? counts[word] + 1 : 1;
    }

    /**
     * Validates and registers a single label (stored lower-cased).
     *
     * @param {string} label
     * @throws {Error} If the label is not a string, is blank, is "total",
     *     shadows a property of Object, or already exists.
     */
    function addLabel(label) {
        var key;
        if (typeof label !== "string") {
            throw new Error("Invalid label of type '" + typeof label + "'. Expected string.");
        }
        if (label.length === 0 || label.trim().length === 0) {
            throw new Error("Label strings must be non-empty.");
        }
        key = label.toLowerCase();
        if (key === "total") {
            throw new Error("Invalid label. 'total' is a reserved keyword.");
        }
        if ({}[key] !== undefined) {
            throw new Error("Label '" + key + "' must not replace a property of Object.");
        }
        if (owns(typesMap, key)) {
            throw new Error("Duplicate label '" + label + "'.");
        }
        typesMap[key] = { tcount: 0, wordTotal: 0 };
    }

    /**
     * Scores how likely the given tokens belong to one label bucket.
     *
     * @param {Object} type The label's counter object from the model.
     * @param {Array} words Tokens produced by processToArray().
     * @returns {number} The score; NaN when a division by zero occurs
     *     (for example when nothing has been trained yet).
     */
    function getTypeChance(type, words) {
        var i, word, typeWordCount, totalWordCount, p1, p2, wordChance;
        var typeChance = 0;
        var total = typesMap.total;

        for (i = 0; i < words.length; i++) {
            word = words[i];
            // Unseen words get a small non-zero estimate instead of 0.
            typeWordCount = owns(type, word) ? type[word] : mEstimate();
            totalWordCount = owns(total, word) ? total[word] : mEstimate();

            p1 = typeWordCount / type.wordTotal * (type.tcount / total.tcount);
            // NOTE(review): by operator precedence only `typeWordCount` is
            // divided here. A conventional Bayes term would be
            // (totalWordCount - typeWordCount) / (total.wordTotal - type.wordTotal).
            // Left unchanged because altering it changes every score produced
            // by existing models; confirm intent before touching.
            p2 = (totalWordCount - typeWordCount / (total.wordTotal - type.wordTotal)) * ((total.tcount - type.tcount) / total.tcount);
            wordChance = p1 / (p1 + p2);

            // The first word seeds the product; later words multiply into it.
            if (typeChance <= 0) {
                typeChance = wordChance;
            } else {
                typeChance = typeChance * wordChance;
            }
        }
        return typeChance * (type.tcount / total.tcount);
    }

    /** Small stand-in count for a word that has never been seen. */
    function mEstimate() {
        var total = typesMap.total;
        return 1 / (total.wordTotal * 100);
    }

    /**
     * Normalises a description into an array of lower-case word tokens:
     * diacritics and non-letters are stripped and stop words removed.
     *
     * NOTE(review): a description that reduces to nothing yields `[""]`
     * (one empty token), which callers then count as a word. Kept as-is to
     * stay compatible with existing models.
     *
     * @param {string} description
     * @returns {Array} The tokens (always at least one element).
     * @throws {Error} If `description` is not a string.
     */
    function processToArray(description) {
        var i;
        if (typeof description !== "string") {
            throw new Error("Invalid description " + description + " of type " + typeof description + ". Expected string.");
        }

        // normalize() is feature-detected because older runtimes lack it.
        if (description.normalize) {
            description = description.normalize("NFD");
        }
        description = description
            .toLowerCase()
            // Turn tabs/newlines into spaces first; otherwise the non-letter
            // strip below deletes them and glues neighbouring words together.
            .replace(/\s+/g, " ")
            .replace(/[\u0300-\u036f]/g, "")
            .replace(/[^a-zA-Z ]/g, "");

        for (i = 0; i < stopwordPatterns.length; i++) {
            description = description.replace(stopwordPatterns[i], " ");
        }
        description = description.replace(/\s+/g, " ");
        return description.trim().split(" ");
    }
}

// `typeof` guard: referencing an undeclared `module` directly throws a
// ReferenceError in browsers and ES modules.
if (typeof module !== "undefined" && module.exports) {
    module.exports = WhichX;
}