programming-language-classifier
Version:
A programming language classifier that uses a Bayesian algorithm to estimate the likelihood that a piece of code is written in a particular programming language
120 lines (104 loc) • 3.69 kB
JavaScript
// Generated by CoffeeScript 1.7.1
var Classifier, Samples, Tokenizer, path, _;
_ = require('lodash');
path = require('path');
Samples = require('linguist-samples');
Tokenizer = require('code-tokenizer');
Classifier = (function() {
  var _classify, _has_own, _language_probability, _own_count, _sync_state, _token_probability, _tokens_probability;

  /**
   * Naive Bayes classifier over source-code tokens.
   *
   * The statistics live in a plain "db" object with these fields:
   *   tokens_total    - number of tokens seen across all languages
   *   languages_total - number of training samples seen
   *   tokens          - { language: { token: count } }
   *   language_tokens - { language: number of tokens seen for it }
   *   languages       - { language: number of samples seen for it }
   *
   * @param {Object} [db] - existing statistics to mirror on the instance.
   */
  function Classifier(db) {
    if (db == null) {
      db = {};
    }
    _sync_state(this, db);
  }

  /**
   * Copy the five statistics fields of `db` onto a classifier instance.
   */
  _sync_state = function(classifier, db) {
    classifier.tokens_total = db['tokens_total'];
    classifier.languages_total = db['languages_total'];
    classifier.tokens = db['tokens'];
    classifier.language_tokens = db['language_tokens'];
    classifier.languages = db['languages'];
  };

  /**
   * True when `obj` is non-null and has `key` as an own property.
   *
   * Tokens are arbitrary source text, so words such as "constructor" or
   * "toString" must not be resolved through Object.prototype.
   */
  _has_own = function(obj, key) {
    return obj != null && Object.prototype.hasOwnProperty.call(obj, key);
  };

  /**
   * Read a counter from `counts`, treating missing, inherited and null
   * entries as 0.
   */
  _own_count = function(counts, key) {
    var value;
    if (!_has_own(counts, key)) {
      return 0;
    }
    value = counts[key];
    return value != null ? value : 0;
  };

  /**
   * Tokenize `data` and add its counts to `db` under `language`.
   *
   * `db` is mutated in place (missing fields are initialised) and the
   * instance fields are refreshed from it afterwards.
   *
   * @param {Object} db - statistics object to update.
   * @param {string} language - language label of the sample.
   * @param {string} data - sample source text passed to the tokenizer.
   * @returns {null}
   */
  Classifier.prototype.train = function(db, language, data) {
    var tokens;
    tokens = Tokenizer.tokenize(data);
    db['tokens_total'] = db['tokens_total'] || 0;
    db['languages_total'] = db['languages_total'] || 0;
    db['tokens'] = db['tokens'] || {};
    db['language_tokens'] = db['language_tokens'] || {};
    db['languages'] = db['languages'] || {};
    _.each(tokens, function(token) {
      var languageCounts;
      // Created lazily so a sample without tokens adds no per-language entry.
      if (!_has_own(db['tokens'], language) || !db['tokens'][language]) {
        db['tokens'][language] = {};
      }
      languageCounts = db['tokens'][language];
      languageCounts[token] = (_own_count(languageCounts, token) || 0) + 1;
      db['language_tokens'][language] = (_own_count(db['language_tokens'], language) || 0) + 1;
      db['tokens_total'] += 1;
    });
    db['languages'][language] = (_own_count(db['languages'], language) || 0) + 1;
    db['languages_total'] += 1;
    _sync_state(this, db);
    return null;
  };

  /**
   * Rank languages for the given tokens.
   *
   * @param {Object|boolean} [db] - statistics object; when omitted, null or
   *   false, the sample file ../data/2014-05-16.json is loaded instead.
   * @param {string|Array} tokens - source text (tokenized here) or tokens.
   * @param {Array} [languages] - candidates; defaults to every language in db.
   * @returns {Array} [language, score] pairs, highest score first.
   */
  Classifier.prototype.classify = function(db, tokens, languages) {
    if (db == null || db === false) {
      db = Samples.loadSampleFile(path.join(__dirname, '../data', '2014-05-16.json'));
    }
    languages = languages || _.keys(db['languages']);
    return _classify(db, tokens, languages);
  };

  /**
   * Score every candidate language and return the pairs sorted by score.
   * Returns an empty array when no tokens are supplied (null or undefined).
   */
  _classify = function(db, tokens, languages) {
    var ranked, scores;
    if (tokens == null) {
      return [];
    }
    if (_.isString(tokens)) {
      tokens = Tokenizer.tokenize(tokens);
    }
    // Keyed by language so a candidate listed twice is scored only once.
    scores = {};
    _.each(languages, function(language) {
      scores[language] = _tokens_probability(db, tokens, language) + _language_probability(db, language);
    });
    ranked = [];
    _.forOwn(scores, function(score, language) {
      ranked.push([language, score]);
    });
    ranked.sort(function(a, b) {
      return b[1] - a[1];
    });
    return ranked;
  };

  /**
   * Sum of the log-probabilities of each token given `language`.
   */
  _tokens_probability = function(db, tokens, language) {
    var sum;
    sum = 0.0;
    _.each(tokens, function(token) {
      sum += Math.log(_token_probability(db, token, language));
    });
    return sum;
  };

  /**
   * Probability of `token` given `language`.
   *
   * A token never seen for the language falls back to 1 / tokens_total
   * so that its logarithm stays finite.
   */
  _token_probability = function(db, token, language) {
    var languageCounts, tokenCount;
    languageCounts = _has_own(db['tokens'], language) ? db['tokens'][language] : null;
    tokenCount = _own_count(languageCounts, token);
    if (tokenCount === 0) {
      return 1 / db['tokens_total'];
    }
    return parseFloat(tokenCount) / parseFloat(_own_count(db['language_tokens'], language));
  };

  /**
   * Log of the share of training samples that belong to `language`.
   * A language with no samples yields -Infinity and therefore sorts last.
   */
  _language_probability = function(db, language) {
    return Math.log(parseFloat(_own_count(db['languages'], language)) / parseFloat(db['languages_total']));
  };

  return Classifier;
})();
// Export a single Classifier instance constructed without a database, so its
// count fields are undefined until train() is called on it.
module.exports = new Classifier();