
programming-language-classifier

A programming language classifier that uses a Bayesian algorithm to determine the likelihood that a piece of code is written in a particular programming language

120 lines (104 loc) 3.69 kB
// Generated by CoffeeScript 1.7.1
var Classifier, Samples, Tokenizer, path, _;

_ = require('lodash');

path = require('path');

Samples = require('linguist-samples');

Tokenizer = require('code-tokenizer');

Classifier = (function() {
  var _classify, _language_probability, _token_probability, _tokens_probability;

  // Restore counts from a previously built token database (empty by default).
  function Classifier(db) {
    if (db == null) {
      db = {};
    }
    this.tokens_total = db['tokens_total'];
    this.languages_total = db['languages_total'];
    this.tokens = db['tokens'];
    this.language_tokens = db['language_tokens'];
    this.languages = db['languages'];
  }

  // Tokenize a training sample, bump the per-language and global token counts
  // in db, then mirror the updated counts onto this instance.
  Classifier.prototype.train = function(db, language, data) {
    var tokens;
    tokens = Tokenizer.tokenize(data);
    db['tokens_total'] = db['tokens_total'] || 0;
    db['languages_total'] = db['languages_total'] || 0;
    db['tokens'] = db['tokens'] || {};
    db['language_tokens'] = db['language_tokens'] || {};
    db['languages'] = db['languages'] || {};
    _.each(tokens, function(token) {
      db['tokens'][language] = db['tokens'][language] || {};
      db['tokens'][language][token] = db['tokens'][language][token] || 0;
      db['tokens'][language][token] += 1;
      db['language_tokens'][language] = db['language_tokens'][language] || 0;
      db['language_tokens'][language] += 1;
      return db['tokens_total'] += 1;
    });
    db['languages'][language] = db['languages'][language] || 0;
    db['languages'][language] += 1;
    db['languages_total'] += 1;
    this.tokens_total = db['tokens_total'];
    this.languages_total = db['languages_total'];
    this.tokens = db['tokens'];
    this.language_tokens = db['language_tokens'];
    this.languages = db['languages'];
    return null;
  };

  // Public entry point: when db is false (or omitted), fall back to the bundled
  // sample database; languages defaults to every language known to the db.
  Classifier.prototype.classify = function(db, tokens, languages) {
    if (db == null) {
      db = false;
    }
    if (languages == null) {
      languages = null;
    }
    if (db === false) {
      db = Samples.loadSampleFile(path.join(__dirname, '../data', '2014-05-16.json'));
      languages = languages || _.keys(db['languages']);
      return _classify(db, tokens, languages);
    } else {
      languages = languages || _.keys(db['languages']);
      return _classify(db, tokens, languages);
    }
  };

  // Naive Bayes in log space: score each candidate language as log prior plus
  // summed log token likelihoods, then return [language, score] pairs sorted
  // from most to least likely.
  _classify = function(db, tokens, languages) {
    var scores, sortableScores, sortedScores;
    if (tokens === null) {
      return [];
    }
    if (_.isString(tokens)) {
      tokens = Tokenizer.tokenize(tokens);
    }
    scores = {};
    _.each(languages, function(language) {
      return scores[language] = _tokens_probability(db, tokens, language) + _language_probability(db, language);
    });
    sortableScores = [];
    _.forOwn(scores, function(score, key) {
      return sortableScores.push([key, score]);
    });
    sortedScores = sortableScores.sort(function(a, b) {
      return b[1] - a[1];
    });
    return sortedScores;
  };

  // Sum of log P(token | language) over all tokens.
  _tokens_probability = function(db, tokens, language) {
    var sum;
    sum = 0.0;
    _.each(tokens, function(token) {
      return sum += Math.log(_token_probability(db, token, language));
    });
    return sum;
  };

  // P(token | language): the token's count divided by the language's total
  // token count, smoothed to 1 / tokens_total for tokens never seen with
  // this language.
  _token_probability = function(db, token, language) {
    var tokenCount;
    tokenCount = db['tokens'][language][token] != null ? db['tokens'][language][token] : 0;
    if (tokenCount === 0) {
      return 1 / db['tokens_total'];
    } else {
      return parseFloat(db['tokens'][language][token]) / parseFloat(db['language_tokens'][language]);
    }
  };

  // Log prior: the language's share of all training samples.
  _language_probability = function(db, language) {
    return Math.log(parseFloat(db['languages'][language]) / parseFloat(db['languages_total']));
  };

  return Classifier;

})();

module.exports = new Classifier();
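
For context, classify scores each candidate language as its log prior (its share of training samples) plus the sum of log token probabilities, and returns [language, score] pairs ordered from most to least likely; scores are log-probabilities, so they are negative and the value closest to zero wins. The following is a minimal usage sketch, assuming the module is required by its package name and that the bundled sample database ships with the install; the code strings passed in are illustrative only, not taken from the package docs.

// Illustrative usage sketch; the require name and sample snippets are assumptions.
var classifier = require('programming-language-classifier');

// With db set to false, classify() falls back to the bundled sample database
// (data/2014-05-16.json) and considers every language it contains.
var results = classifier.classify(false, 'def add(a, b)\n  a + b\nend\n');
console.log(results[0]); // e.g. [ 'Ruby', <negative log score> ]

// Alternatively, build and query a custom database with train():
var db = {};
classifier.train(db, 'Ruby', 'puts "hello"');
classifier.train(db, 'JavaScript', 'console.log("hello");');
console.log(classifier.classify(db, 'alert(1);'));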