UNPKG

whichx

Version:

A text description classifier for classifying arbitrary strings into provided labels

github.com/rudikershaw/whichx

rudikershaw/whichx

264 lines (241 loc) • 11.5 kB

JavaScript

// @ts-check

/**
 * @typedef {Object} Config The WhichX configuration options.
 * @property {string[]} stopwords The list of stop words in the text. Those words will be ignored during the classification process.
 */

/**
 * @typedef {Object} LabelEntry
 * @property {number} tcount The total number of those labels.
 * @property {number} wordTotal The total number of words added against that label.
 */

/** @typedef {Record<string, LabelEntry>} TypeMap The map of labels and descriptions. */

/**
 * A naive Bayes text classifier: labels are registered with `addLabels`, trained with
 * `addData`, and queried with `classify`.
 * @param {Config=} config The optional configuration for WhichX.
 * @throws {Error} If `config.stopwords` is provided but is not an array.
 */
function WhichX(config) {
    // Internet explorer 9 or later required, or any other popular browser.
    var hasOwn = Object.prototype.hasOwnProperty;
    // Name of the aggregate entry in the types map; it is not a user label.
    var TOTAL_KEY = "total";
    var STOPWORDS;
    var STOPWORD_PATTERNS;
    // Stop words including tcount & wordtotal (because they are key words in the maps used to store the data).
    // "tcount" in particular MUST be filtered: words are stored as lower case keys next to the
    // `tcount` counter, so letting it through would corrupt the counter.
    var DEFAULT_STOPWORDS = ["a", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "being", "but", "by", "count", "could", "did", "do", "does", "doing", "during", "each", "few", "for", "had", "has", "have", "having", "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how", "hows", "i", "id", "im", "ive", "if", "in", "into", "is", "it", "its", "itself", "lets", "me", "more", "most", "my", "myself", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "over", "own", "same", "she", "shes", "should", "so", "some", "such", "tcount", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "there", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through", "to", "too", "until", "was", "we", "wed", "well", "were", "weve", "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why", "whys", "with", "wordtotal", "would", "you", "youd", "youll", "youre", "your", "youve", "yours", "yourself", "yourselves"];

    // Configure WhichX object.
    if (!config || !config.stopwords) {
        STOPWORDS = DEFAULT_STOPWORDS;
    } else if (Array.isArray(config.stopwords)) {
        // Copy so the caller's array is never mutated by the reserved words pushed below.
        STOPWORDS = config.stopwords.slice();
        STOPWORDS.push("tcount", "wordtotal");
    } else {
        throw new Error("The `stopwords` variable of your configuration must be an array.");
    }
    // Compile the stop word patterns once instead of on every processed description.
    STOPWORD_PATTERNS = buildStopwordPatterns(STOPWORDS);

    // Map (using object notation) of description types.
    // Each type containing a map of words and counts.
    // The tcount represents the total number of those labels.
    // The word total represents the total number of words added against that label.
    /** @type {TypeMap} */
    var typesMap = {
        // Total must exist and be incremented for probability calculations.
        total: { tcount: 0, wordTotal: 1 }
    };

    /**
     * Add a label or list of labels to the classifier.
     * @param {string | string[]} labels A label or a list of labels to add.
     * @throws {Error} If `labels` is neither a string nor an array, or if any label is invalid.
     */
    this.addLabels = function(labels) {
        var i;
        if (typeof labels === "string") {
            addLabel(labels);
        } else if (Array.isArray(labels)) {
            for (i = 0; i < labels.length; i++) {
                addLabel(labels[i]);
            }
        } else {
            throw new Error("Invalid label '" + labels + "' of type '" + typeof labels + "'. Expected an Array or a string.");
        }
    };

    /**
     * Add word data from a description to a specified label.
     * @param {string} label The label the description must be attached to.
     * @param {string} description The description matching the label.
     * @throws {Error} If the label was not added beforehand, or the description is not a non-empty string.
     */
    this.addData = function(label, description) {
        var key, type, wordArray, i, word;
        var total = typesMap.total;

        key = typeof label === "string" ? label.toLowerCase() : undefined;
        // Own-property check: `in` would also accept names inherited from Object.prototype
        // (e.g. "constructor"). The aggregate entry is rejected because training against it
        // would double count every word.
        if (key === undefined || key === TOTAL_KEY || !hasOwn.call(typesMap, key)) {
            throw new Error("Invalid label '" + label + "'. '" + label + "' is not an existing label in: " + Object.keys(typesMap) + ".");
        }
        if (typeof description !== "string" || description.length === 0) {
            throw new Error("Invalid description '" + description + "' of type '" + typeof description + "'. Expected a non-empty string.");
        }

        type = typesMap[key];
        type.tcount = type.tcount + 1;
        total.tcount = total.tcount + 1;
        wordArray = processToArray(description);
        // Check whether each word exists against that label and the total.
        // If it does increment the count, otherwise add the word.
        for (i = 0; i < wordArray.length; i++) {
            word = wordArray[i];
            // Add/Increment word to specific label.
            type[word] = (hasOwn.call(type, word) ? type[word] : 0) + 1;
            // Add/Increment word to total.
            total[word] = (hasOwn.call(total, word) ? total[word] : 0) + 1;
            type.wordTotal = type.wordTotal + 1;
            total.wordTotal = total.wordTotal + 1;
        }
    };

    /**
     * Take a description and find the most likely label for it.
     * @param {string} description The description to classify.
     * @returns {string} The label that best matches the description (undefined when no label can be scored).
     * @throws {Error} If the description is not a non-empty string.
     */
    this.classify = function(description) {
        var wordArray, bestChance, bestLabel, typeName, typeChance;
        if (typeof description !== "string" || description.length === 0) {
            throw new Error("Invalid description " + description + " of type " + typeof description + ". We expected a non empty string.");
        }
        wordArray = processToArray(description);
        bestChance = -1;
        bestLabel = undefined;
        // Loop through types working out the chance of the description being
        // for this type. If better than bestChance then bestChance <- chance.
        for (typeName in typesMap) {
            // The aggregate entry is bookkeeping, never a candidate answer.
            if (hasOwn.call(typesMap, typeName) && typeName !== TOTAL_KEY) {
                typeChance = getTypeChance(typesMap[typeName], wordArray);
                // NaN (labels without data) never compares greater, so it is never selected.
                if (typeChance > bestChance) {
                    bestChance = typeChance;
                    bestLabel = typeName;
                }
            }
        }
        return bestLabel;
    };

    /**
     * Exports the WhichX internal data representation learned from provided
     * labeled text. Please see the typesMap comments for more details.
     * The returned object is the live internal map, not a copy.
     * @returns {TypeMap} A TypeMap that can be saved for later import in WhichX.
     */
    this.export = function() {
        return typesMap;
    };

    /**
     * Imports a previously exported model. This will write over any data this instance has already learned.
     * @param {TypeMap} importedTypesMap The types map previously exported from WhichX
     * @throws {Error} If the argument does not look like an exported model.
     */
    this.import = function(importedTypesMap) {
        var newTotal = (importedTypesMap && typeof importedTypesMap === "object") ? importedTypesMap.total : undefined;
        if (newTotal === undefined || newTotal === null || newTotal.tcount === undefined || newTotal.wordTotal === undefined) {
            throw new Error("Import invalid. This doesn't look like it was exported from a prior model.");
        }
        typesMap = importedTypesMap;
    };

    /**
     * Add a label to the classifier.
     * @param {string} label A label to add.
     * @throws {Error} If the label is not a string, is blank, is reserved, or already exists.
     */
    function addLabel(label) {
        var key;
        if (typeof label !== "string") {
            throw new Error("Invalid label of type '" + typeof label + "'. Expected string.");
        }
        key = label.toLowerCase();
        if (label.length === 0 || label.trim().length === 0) {
            throw new Error("Label strings must be non-empty.");
        } else if (key === TOTAL_KEY) {
            throw new Error("Invalid label. 'total' is a reserved keyword.");
        } else if (({})[key] !== undefined) {
            throw new Error("Label '" + key + "' must not replace a property of Object.");
        } else if (hasOwn.call(typesMap, key)) {
            throw new Error("Duplicate label '" + label + "'.");
        }
        typesMap[key] = { tcount: 0, wordTotal: 0 };
    }

    /**
     * Loop through words and work out probability of a type given each word.
     * Multiply each word's probability by total probability to determine type probability.
     * @param {LabelEntry} type The label entry to test.
     * @param {string[]} words The words list in the description.
     * @returns {number} The probability that the description belongs to that given label.
     */
    function getTypeChance(type, words) {
        var i, word, typeWordCount, totalWordCount, p1, p2, wordChance;
        var typeChance = 0;
        var total = typesMap.total;
        var prior = type.tcount / total.tcount;

        // No usable words (e.g. only stop words): only the label popularity is known.
        if (words.length === 0) {
            return prior;
        }
        for (i = 0; i < words.length; i++) {
            word = words[i];
            // Own-property lookups so that inherited names such as "constructor" are not mistaken for counts.
            typeWordCount = hasOwn.call(type, word) ? type[word] : mEstimate();
            totalWordCount = hasOwn.call(total, word) ? total[word] : mEstimate();
            // Bayes' theorem calculation.
            // p1: P(word | type) * P(type); p2: P(word | other types) * P(other types).
            p1 = (typeWordCount / type.wordTotal) * prior;
            p2 = ((totalWordCount - typeWordCount) / (total.wordTotal - type.wordTotal)) * ((total.tcount - type.tcount) / total.tcount);
            wordChance = p1 / (p1 + p2);
            if (typeChance <= 0) {
                typeChance = wordChance;
            } else {
                typeChance = typeChance * wordChance;
            }
        }
        // Multiply final probability by overall probability that it is of this type to weight by most popular types.
        return typeChance * prior;
    }

    /**
     * A non-zero prior estimate to prevent 0 based probability calculations.
     * @returns {number} The non-zero probability.
     */
    function mEstimate() {
        var total = typesMap.total;
        return 1 / (total.wordTotal * 100);
    }

    /**
     * Compile one global, word-bounded pattern per stop word.
     * Stop words are matched literally, so regular expression metacharacters are escaped.
     * @param {string[]} stopwords The stop words to compile.
     * @returns {RegExp[]} The compiled patterns, in the same order as the non-empty stop words.
     */
    function buildStopwordPatterns(stopwords) {
        var patterns = [];
        var i, word;
        for (i = 0; i < stopwords.length; i++) {
            word = String(stopwords[i]);
            // An empty stop word can never remove anything; skip it.
            if (word.length > 0) {
                patterns.push(new RegExp("\\b" + word.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") + "\\b", "g"));
            }
        }
        return patterns;
    }

    /**
     * Process the description into an array of standardized lower case words.
     * @param {string} description The description to process.
     * @returns {string[]} The list of processed words contained in the description (empty when nothing is left).
     * @throws {Error} If the description is not a string.
     */
    function processToArray(description) {
        var i, words, result;
        if (typeof description !== "string") {
            throw new Error("Invalid description " + description + " of type " + typeof description + ". Expected string.");
        }
        // Decompose accented characters so the diacritics can be dropped below.
        if (description.normalize) {
            description = description.normalize("NFD");
        }
        description = description.toLowerCase()
            // Tabs and new lines separate words too; turn them into spaces before stripping.
            .replace(/\s+/g, " ")
            .replace(/[\u0300-\u036f]/g, "")
            // Remove special characters.
            .replace(/[^a-zA-Z ]/g, "");
        // Remove all stop words.
        for (i = 0; i < STOPWORD_PATTERNS.length; i++) {
            description = description.replace(STOPWORD_PATTERNS[i], " ");
        }
        // Remove extra spaces.
        words = description.replace(/\s+/g, " ").trim().split(" ");
        // Splitting an empty string yields [""]; drop empty tokens so they are never counted as words.
        result = [];
        for (i = 0; i < words.length; i++) {
            if (words[i].length > 0) {
                result.push(words[i]);
            }
        }
        return result;
    }
}

// Export whichx function if using node.
// `typeof` guard: referencing an undeclared `module` throws in browsers and ES modules.
if (typeof module !== "undefined" && module.exports) {
    module.exports = WhichX;
}