@smodin/fast-text-language-detection
Version:
Language detection with facebook fast-text model
272 lines (270 loc) • 4.06 kB
JavaScript
/* internal requirements */
var path = require('path');
/* external requirements */
var Classifier = require('fast-text').Classifier;
// Source: benchmark-testing/results/reliability_list_<version>.json
// Language codes for which a prediction is flagged `isReliableLanguage: true`.
// The order of the entries is kept exactly as it appears in that results file.
var reliabilityList = [
  'th', 'ml', 'my', 'ta', 'te', 'pa', 'am', 'kn', 'gu',
  'si', 'bo', 'dv', 'ja', 'el', 'he', 'ko', 'hy', 'bn',
  'mr', 'en', 'zh', 'tr', 'ru', 'de', 'ug', 'vi', 'eo',
  'ka', 'hi', 'it', 'ar', 'fr', 'hu', 'lo', 'pl', 'km',
  'es', 'fi', 'pt', 'mk', 'uk', 'ur', 'nl', 'lt', 'cs',
];
// Language codes exposed on every LanguageDetection instance as
// `languageIsoCodes`. Entries are in alphabetical order, one row per
// initial letter.
var languageIsoCodes = [
  'af', 'als', 'am', 'an', 'ar', 'arz', 'as', 'ast', 'av', 'az', 'azb',
  'ba', 'bar', 'bcl', 'be', 'bg', 'bh', 'bn', 'bo', 'bpy', 'br', 'bs', 'bxr',
  'ca', 'cbk', 'ce', 'ceb', 'ckb', 'co', 'cs', 'cv', 'cy',
  'da', 'de', 'diq', 'dsb', 'dty', 'dv',
  'el', 'eml', 'en', 'eo', 'es', 'et', 'eu',
  'fa', 'fi', 'fr', 'frr', 'fy',
  'ga', 'gd', 'gl', 'gn', 'gom', 'gu', 'gv',
  'he', 'hi', 'hif', 'hr', 'hsb', 'ht', 'hu', 'hy',
  'ia', 'id', 'ie', 'ilo', 'io', 'is', 'it',
  'ja', 'jbo', 'jv',
  'ka', 'kk', 'km', 'kn', 'ko', 'krc', 'ku', 'kv', 'kw', 'ky',
  'la', 'lb', 'lez', 'li', 'lmo', 'lo', 'lrc', 'lt', 'lv',
  'mai', 'mg', 'mhr', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl', 'my', 'myv', 'mzn',
  'nah', 'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no',
  'oc', 'or', 'os',
  'pa', 'pam', 'pfl', 'pl', 'pms', 'pnb', 'ps', 'pt',
  'qu',
  'rm', 'ro', 'ru', 'rue',
  'sa', 'sah', 'sc', 'scn', 'sco', 'sd', 'sh', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'su', 'sv', 'sw',
  'ta', 'te', 'tg', 'th', 'tk', 'tl', 'tr', 'tt', 'tyv',
  'ug', 'uk', 'ur', 'uz',
  'vec', 'vep', 'vi', 'vls', 'vo',
  'wa', 'war', 'wuu',
  'xal', 'xmf',
  'yi', 'yo', 'yue',
  'zh',
];
// Some characters can skew the classifier's results.
// Line breaks and colons are taken out of the input because of this issue:
// https://github.com/indix/whatthelang/issues/12
/**
 * Prepares raw text for the classifier.
 *
 * Every run of line-break characters (`\r`, `\n`) and/or colons is replaced
 * by ONE space. They are replaced rather than deleted so that the words on
 * either side are not glued together ("hello\nworld" -> "hello world",
 * never "helloworld").
 *
 * @param {string} text - the text to clean up.
 * @returns {string} the text with line breaks and colons turned into spaces.
 * @throws {TypeError} if `text` has no `replace` method (e.g. null/undefined).
 */
function formatInput(text) {
  return text.replace(/[\r\n:]+/g, ' ');
}
/* the API class */
/**
 * Language detector: wraps a `Classifier` loaded from a model file and
 * exposes a promise-based `predict`.
 */
var LanguageDetection = (function () {
  /**
   * @param {Object} [options] - overrides merged on top of the defaults.
   * @param {string} [options.model] - path handed to `Classifier`; defaults
   *   to `../model/fast-text-lid-model.bin` relative to this file's directory.
   */
  function LanguageDetection(options) {
    var overrides = options === undefined ? {} : options;
    var defaults = {
      model: path.join(__dirname, '..', 'model', 'fast-text-lid-model.bin'),
    };
    this.options = Object.assign({}, defaults, overrides);
    this.classifier = new Classifier(this.options.model);
    // Expose the module-level code list on the instance.
    this.languageIsoCodes = languageIsoCodes;
  }

  // Converts one raw classifier entry ({ label, value }) into the public
  // result shape, dropping the leading "__label__" from the label.
  function toPrediction(item) {
    var code = item.label.replace(/^__label__/, '');
    return {
      lang: code,
      prob: item.value,
      isReliableLanguage: reliabilityList.includes(code),
    };
  }

  /**
   * Runs the classifier on `text` (after `formatInput` has cleaned it).
   *
   * @param {string} text - text to classify.
   * @param {number} [k=1] - forwarded unchanged to `classifier.predict`
   *   (presumably the number of candidates to return; confirm against the
   *   classifier's documentation).
   * @returns {Promise<Array<{lang: string, prob: *, isReliableLanguage: boolean}>>}
   *   resolves with one entry per classifier result; rejects with the error
   *   the classifier passes to its callback.
   */
  LanguageDetection.prototype.predict = function (text, k) {
    var count = k === undefined ? 1 : k;
    return new Promise((resolve, reject) => {
      this.classifier.predict(formatInput(text), count, (err, res) => {
        if (err) {
          reject(err);
          return;
        }
        resolve(res.map(toPrediction));
      });
    });
  };

  return LanguageDetection;
})();
// CommonJS export: the LanguageDetection constructor is the module's sole export.
module.exports = LanguageDetection;
;