programming-language-classifier
Version:
A programming language classifier that uses a Bayesian algorithm to estimate the likelihood that a piece of source code is written in a particular programming language
93 lines (70 loc) • 2.91 kB
text/coffeescript
_ = require 'lodash'
path = require 'path'
Samples = require 'linguist-samples'
Tokenizer = require 'code-tokenizer'
# Bayesian classifier that scores tokenized source text against per-language
# token counts held in a plain `db` object.
#
# Keys of `db` (all created and maintained by `train`):
#   tokens_total    - number of tokens counted across every language
#   languages_total - number of `train` calls counted across every language
#   tokens          - language -> token -> occurrence count
#   language_tokens - language -> number of tokens counted for that language
#   languages       - language -> number of `train` calls for that language
class Classifier
  # Cache for the bundled sample database so that `classify` calls made
  # without a db load the sample file once instead of on every call.
  _default_db = null

  # Returns the count stored under `key` as an OWN property of `table`, or 0
  # when `table` is missing or has no such own property.
  # Own-property lookup matters because tokens are arbitrary source text:
  # a token such as "constructor" or "toString" would otherwise resolve to an
  # inherited Object.prototype member instead of a missing count.
  _own_count = (table, key) ->
    if table? and Object::hasOwnProperty.call(table, key) then table[key] else 0

  # Copies the five db fields onto the instance.
  # With the default empty db every field starts out undefined.
  constructor: (db = {}) ->
    @tokens_total    = db['tokens_total']
    @languages_total = db['languages_total']
    @tokens          = db['tokens']
    @language_tokens = db['language_tokens']
    @languages       = db['languages']

  # Adds one training sample to `db`, mutating it in place.
  #
  # db       - database object to update; missing keys are initialised here.
  # language - name of the language the sample is written in.
  # data     - raw sample, handed to `Tokenizer.tokenize`.
  #
  # Also mirrors the updated db fields onto this instance. Returns null.
  train: (db, language, data) ->
    tokens = Tokenizer.tokenize data

    db['tokens_total']    or= 0
    db['languages_total'] or= 0
    db['tokens']          or= {}
    db['language_tokens'] or= {}
    db['languages']       or= {}

    # Per-language tables are initialised once, outside the token loop, so a
    # sample that yields no tokens still leaves the language with a token
    # table that `classify` can read.
    db['tokens'][language] or= {}
    db['language_tokens'][language] or= 0
    counts = db['tokens'][language]

    _.each tokens, (token) ->
      counts[token] = _own_count(counts, token) + 1
      db['language_tokens'][language] += 1
      db['tokens_total'] += 1

    # Sample counters advance once per call, not once per token.
    db['languages'][language] = _own_count(db['languages'], language) + 1
    db['languages_total'] += 1

    @tokens_total    = db['tokens_total']
    @languages_total = db['languages_total']
    @tokens          = db['tokens']
    @language_tokens = db['language_tokens']
    @languages       = db['languages']
    null

  # Ranks candidate languages for `tokens`.
  #
  # db        - trained database; when omitted, null or false, the bundled
  #             sample file '../data/2014-05-16.json' is loaded through
  #             `Samples.loadSampleFile` (presumably returning a db in the
  #             shape described above - confirm against linguist-samples).
  # tokens    - source string (tokenized here) or an already-tokenized list.
  # languages - candidate language names; defaults to every language in db.
  #
  # Returns an array of [language, score] pairs sorted by descending score,
  # or [] when `tokens` is null or undefined.
  classify: (db = false, tokens, languages = null) ->
    if db is false
      _default_db ?= Samples.loadSampleFile(path.join(__dirname, '../data', '2014-05-16.json'))
      db = _default_db
    languages = languages or _.keys(db['languages'])
    _classify db, tokens, languages

  # Scores every language in `languages` and returns the [language, score]
  # pairs ordered from most to least likely.
  _classify = (db, tokens, languages) ->
    # Treat undefined like null: there is nothing to classify.
    return [] unless tokens?
    tokens = Tokenizer.tokenize(tokens) if _.isString tokens

    scores = {}
    # TODO: ADD DEBUG
    _.each languages, (language) ->
      scores[language] = _tokens_probability(db, tokens, language) + _language_probability(db, language)
    # TODO: ADD DEBUG

    ranked = ([language, score] for own language, score of scores)
    # Array::sort orders in place and returns the same array.
    ranked.sort (a, b) -> b[1] - a[1]

  # Sum of the log-probabilities of each token under `language`.
  # Logs are summed rather than multiplying raw probabilities.
  _tokens_probability = (db, tokens, language) ->
    accumulate = (sum, token) ->
      sum + Math.log _token_probability(db, token, language)
    _.reduce tokens, accumulate, 0.0

  # Probability of `token` under `language`: its count divided by the number
  # of tokens counted for that language. A token with no recorded count for
  # the language (including a language with no token table at all) falls
  # back to 1 / tokens_total.
  _token_probability = (db, token, language) ->
    count = _own_count(db['tokens']?[language], token)
    if count is 0
      1 / db['tokens_total']
    else
      parseFloat(count) / parseFloat(db['language_tokens'][language])

  # Log of the share of training samples belonging to `language`.
  # A language with no recorded samples counts as 0, giving -Infinity so it
  # sorts after every trained language.
  _language_probability = (db, language) ->
    Math.log(parseFloat(_own_count(db['languages'], language)) / parseFloat(db['languages_total']))
# Export a single Classifier instance constructed with the default (empty) db,
# so requiring this module yields an object whose `train` and `classify`
# methods can be called directly.
module.exports = new Classifier()