// @isdk/detect-text-language
// Detect the text language.
// Bundled output: 580 lines (578 loc), 14 kB, JavaScript.
// src/country-codes.ts
/**
 * Country code -> comma-separated list of locale/language tags.
 *
 * The first tag of each list is the one used when deriving a country from a
 * detected language. Countries with no listed language map to an empty string.
 * Insertion order matters: lookups return the first matching country.
 */
const CountryCodes = {
  US: "en-US,es-US,haw,fr",
  GB: "en-GB,cy-GB,gd",
  FR: "fr-FR,frp,br,co,ca,eu,oc",
  DE: "de",
  AD: "ca",
  AE: "ar-AE,fa,en,hi,ur",
  AF: "fa-AF,ps,uz-AF,tk",
  AG: "en-AG",
  AI: "en-AI",
  AL: "sq,el",
  AM: "hy",
  AN: "nl-AN,en,es",
  AO: "pt-AO",
  AQ: "",
  AR: "es-AR,en,it,de,fr,gn",
  AS: "en-AS,sm,to",
  AT: "de-AT,hr,hu,sl",
  AU: "en-AU",
  AW: "nl-AW,pap,es,en",
  AX: "sv-AX",
  AZ: "az,ru,hy",
  BA: "bs,hr-BA,sr-BA",
  BB: "en-BB",
  BD: "bn-BD,en",
  BE: "nl-BE,fr-BE,de-BE",
  BF: "fr-BF,mos",
  BG: "bg,tr-BG,rom",
  BH: "ar-BH,en,fa,ur",
  BI: "fr-BI,rn",
  BJ: "fr-BJ",
  BL: "fr",
  BM: "en-BM,pt",
  BN: "ms-BN,en-BN",
  BO: "es-BO,qu,ay",
  BQ: "nl,pap,en",
  BR: "pt-BR,es,en,fr",
  BS: "en-BS",
  BT: "dz",
  BV: "",
  BW: "en-BW,tn-BW",
  BY: "be,ru",
  BZ: "en-BZ,es",
  CA: "en-CA,fr-CA,iu",
  CC: "ms-CC,en",
  CD: "fr-CD,ln,ktu,kg,sw,lua",
  CF: "fr-CF,sg,ln,kg",
  CG: "fr-CG,kg,ln-CG",
  CH: "de-CH,fr-CH,it-CH,rm",
  CI: "fr-CI",
  CK: "en-CK,mi",
  CL: "es-CL",
  CM: "en-CM,fr-CM",
  CN: "zh-CN,zh,yue,wuu,dta,ug,za",
  CO: "es-CO",
  CR: "es-CR,en",
  CU: "es-CU,pap",
  CV: "pt-CV",
  CW: "nl,pap",
  CX: "en,zh,ms-CX",
  CY: "el-CY,tr-CY,en",
  CZ: "cs,sk",
  DJ: "fr-DJ,ar,so-DJ,aa",
  DK: "da-DK,en,fo,de-DK",
  DM: "en-DM",
  DO: "es-DO",
  DZ: "ar-DZ",
  EC: "es-EC",
  EE: "et,ru",
  EG: "ar-EG,en,fr",
  EH: "ar,mey",
  ER: "aa-ER,ar,tig,kun,ti-ER",
  ES: "es-ES,ca,gl,eu,oc",
  ET: "am,en-ET,om-ET,ti-ET,so-ET,sid",
  FI: "fi-FI,sv-FI,smn",
  FJ: "en-FJ,fj",
  FK: "en-FK",
  FM: "en-FM,chk,pon,yap,kos,uli,woe,nkr,kpg",
  FO: "fo,da-FO",
  GA: "fr-GA",
  GD: "en-GD",
  GE: "ka,ru,hy,az",
  GF: "fr-GF",
  GG: "en,nrf",
  GH: "en-GH,ak,ee,tw",
  GI: "en-GI,es,it,pt",
  GL: "kl,da-GL,en",
  GM: "en-GM,mnk,wof,wo,ff",
  GN: "fr-GN",
  GP: "fr-GP",
  GQ: "es-GQ,fr,pt",
  GR: "el-GR,en,fr",
  GS: "en",
  GT: "es-GT",
  GU: "en-GU,ch-GU",
  GW: "pt-GW,pov",
  GY: "en-GY",
  HK: "zh-HK,yue,zh,en",
  HM: "",
  HN: "es-HN,cab,miq",
  HR: "hr-HR,sr",
  HT: "ht,fr-HT",
  HU: "hu-HU",
  ID: "id,en,nl,jv",
  IE: "en-IE,ga-IE",
  IL: "he,ar-IL,en-IL",
  IM: "en,gv",
  IN: "en-IN,hi,bn,te,mr,ta,ur,gu,kn,ml,or,pa,as,bh,sat,ks,ne,sd,kok,doi,mni,sit,sa,fr,lus,inc",
  IO: "en-IO",
  IQ: "ar-IQ,ku,hy",
  IR: "fa-IR,ku",
  IS: "is,en,de,da,sv,no",
  IT: "it-IT,de-IT,fr-IT,sc,ca,co,sl",
  JE: "en,fr,nrf",
  JM: "en-JM",
  JO: "ar-JO,en",
  JP: "ja",
  KE: "en-KE,sw-KE",
  KG: "ky,uz,ru",
  KH: "km,fr,en",
  KI: "en-KI,gil",
  KM: "ar,fr-KM",
  KN: "en-KN",
  KP: "ko-KP",
  KR: "ko-KR,en",
  KW: "ar-KW,en",
  KY: "en-KY",
  KZ: "kk,ru",
  LA: "lo,fr,en",
  LB: "ar-LB,fr-LB,en,hy",
  LC: "en-LC",
  LI: "de-LI",
  LK: "si,ta,en",
  LR: "en-LR",
  LS: "en-LS,st,zu,xh",
  LT: "lt,ru,pl",
  LU: "lb,de-LU,fr-LU",
  LV: "lv,ru,lt",
  LY: "ar-LY,it,en",
  MA: "ar-MA,ber,fr",
  MC: "fr-MC,en,it",
  MD: "ro,ru,gag,tr",
  ME: "sr,hu,bs,sq,hr,rom",
  MF: "fr",
  MG: "fr-MG,mg",
  MH: "mh,en-MH",
  MK: "mk,sq,tr,rmm,sr",
  ML: "fr-ML,bm",
  MM: "my",
  MN: "mn,ru",
  MO: "zh,zh-MO,pt",
  MP: "fil,tl,zh,ch-MP,en-MP",
  MQ: "fr-MQ",
  MR: "ar-MR,fuc,snk,fr,mey,wo",
  MS: "en-MS",
  MT: "mt,en-MT",
  MU: "en-MU,bho,fr",
  MV: "dv,en",
  MW: "ny,yao,tum,swk",
  MX: "es-MX",
  MY: "ms-MY,en,zh,ta,te,ml,pa,th",
  MZ: "pt-MZ,vmw",
  NA: "en-NA,af,de,hz,naq",
  NC: "fr-NC",
  NE: "fr-NE,ha,kr,dje",
  NF: "en-NF",
  NG: "en-NG,ha,yo,ig,ff",
  NI: "es-NI,en",
  NL: "nl-NL,fy-NL",
  NO: "no,nb,nn,se,fi",
  NP: "ne,en",
  NR: "na,en-NR",
  NU: "niu,en-NU",
  NZ: "en-NZ,mi",
  OM: "ar-OM,en,bal,ur",
  PA: "es-PA,en",
  PE: "es-PE,qu,ay",
  PF: "fr-PF,ty",
  PG: "en-PG,ho,meu,tpi",
  PH: "tl,en-PH,fil,ceb,ilo,hil,war,pam,bik,bcl,pag,mrw,tsg,mdh,cbk,krj,sgd,msb,akl,ibg,yka,mta,abx",
  PK: "ur-PK,en-PK,pa,sd,ps,brh",
  PL: "pl",
  PM: "fr-PM",
  PN: "en-PN",
  PR: "en-PR,es-PR",
  PS: "ar-PS",
  PT: "pt-PT,mwl",
  PW: "pau,sov,en-PW,tox,ja,fil,zh",
  PY: "es-PY,gn",
  QA: "ar-QA,es",
  RE: "fr-RE",
  RO: "ro,hu,rom",
  RS: "sr,hu,bs,rom",
  RU: "ru,tt,xal,cau,ady,kv,ce,tyv,cv,udm,tut,mns,bua,myv,mdf,chm,ba,inh,kbd,krc,av,sah,nog",
  RW: "rw,en-RW,fr-RW,sw",
  SA: "ar-SA",
  SB: "en-SB,tpi",
  SC: "en-SC,fr-SC",
  SD: "ar-SD,en,fia",
  SE: "sv-SE,se,sma,fi-SE",
  SG: "cmn,en-SG,ms-SG,ta-SG,zh-SG",
  SH: "en-SH",
  SI: "sl,sh",
  SJ: "no,ru",
  SK: "sk,hu",
  SL: "en-SL,men,tem",
  SM: "it-SM",
  SN: "fr-SN,wo,fuc,mnk",
  SO: "so-SO,ar-SO,it,en-SO",
  SR: "nl-SR,en,srn,hns,jv",
  SS: "en,ar-SS",
  ST: "pt-ST",
  SV: "es-SV",
  SX: "nl,en",
  SY: "ar-SY,ku,hy,arc,fr,en",
  SZ: "en-SZ,ss-SZ",
  TC: "en-TC",
  TD: "fr-TD,ar-TD,sre",
  TF: "fr",
  TG: "fr-TG,ee,hna,kbp,dag,ha",
  TH: "th,en",
  TJ: "tg,ru",
  TK: "tkl,en-TK",
  TL: "tet,pt-TL,id,en",
  TM: "tk,ru,uz",
  TN: "ar-TN,fr",
  TO: "to,en-TO",
  TR: "tr-TR,ku,diq,az,av",
  TT: "en-TT,hns,fr,es,zh",
  TV: "tvl,en,sm,gil",
  TW: "zh-TW,zh,nan,hak",
  TZ: "sw-TZ,en,ar",
  UA: "uk,ru-UA,rom,pl,hu",
  UG: "en-UG,lg,sw,ar",
  UM: "en-UM",
  UY: "es-UY",
  UZ: "uz,ru,tg",
  VA: "la,it,fr",
  VC: "en-VC,fr",
  VE: "es-VE",
  VG: "en-VG",
  VI: "en-VI",
  VN: "vi,en,fr,zh,km",
  VU: "bi,en-VU,fr-VU",
  WF: "wls,fud,fr-WF",
  WS: "sm,en-WS",
  XK: "sq,sr",
  YE: "ar-YE",
  YT: "fr-YT",
  ZA: "zu,xh,af,nso,en-ZA,tn,st,ts,ss,ve,nr",
  ZM: "en-ZM,bem,loz,lun,lue,ny,toi",
  ZW: "en-ZW,sn,nr,nd",
};
/**
 * Country code -> English display name.
 *
 * Every code present in `CountryCodes` has an entry here, so a lookup with a
 * code taken from that table never yields `undefined`.
 */
var CountryNames = {
  "AD": "Andorra",
  "AE": "United Arab Emirates",
  "AF": "Afghanistan",
  "AG": "Antigua and Barbuda",
  "AI": "Anguilla",
  "AL": "Albania",
  "AM": "Armenia",
  "AN": "Netherlands Antilles",
  "AO": "Angola",
  "AQ": "Antarctica",
  "AR": "Argentina",
  "AS": "American Samoa",
  "AT": "Austria",
  "AU": "Australia",
  "AW": "Aruba",
  "AX": "\xC5land Islands",
  "AZ": "Azerbaijan",
  "BA": "Bosnia and Herzegovina",
  "BB": "Barbados",
  "BD": "Bangladesh",
  "BE": "Belgium",
  "BF": "Burkina Faso",
  "BG": "Bulgaria",
  "BH": "Bahrain",
  "BI": "Burundi",
  "BJ": "Benin",
  "BL": "Saint Barth\xE9lemy",
  "BM": "Bermuda",
  "BN": "Brunei Darussalam",
  "BO": "Bolivia, Plurinational State of",
  "BQ": "Caribbean Netherlands",
  "BR": "Brazil",
  "BS": "Bahamas",
  "BT": "Bhutan",
  "BV": "Bouvet Island",
  "BW": "Botswana",
  "BY": "Belarus",
  "BZ": "Belize",
  "CA": "Canada",
  "CC": "Cocos (Keeling) Islands",
  "CD": "Democratic Republic of the Congo",
  "CF": "Central African Republic",
  // Previously missing even though CountryCodes lists "CG".
  "CG": "Republic of the Congo",
  "CH": "Switzerland",
  "CI": "C\xF4te d'Ivoire",
  "CK": "Cook Islands",
  "CL": "Chile",
  "CM": "Cameroon",
  "CN": "China",
  "CO": "Colombia",
  "CR": "Costa Rica",
  "CU": "Cuba",
  "CV": "Cape Verde",
  "CW": "Curacao",
  "CX": "Christmas Island",
  "CY": "Cyprus",
  "CZ": "Czech Republic",
  "DE": "Germany",
  "DJ": "Djibouti",
  "DK": "Denmark",
  "DM": "Dominica",
  "DO": "Dominican Republic",
  "DZ": "Algeria",
  "EC": "Ecuador",
  "EE": "Estonia",
  "EG": "Egypt",
  "EH": "Western Sahara",
  "ER": "Eritrea",
  "ES": "Spain",
  "ET": "Ethiopia",
  "FI": "Finland",
  "FJ": "Fiji",
  "FK": "Falkland Islands (Malvinas)",
  "FM": "Micronesia, Federated States of",
  "FO": "Faroe Islands",
  "FR": "France",
  "GA": "Gabon",
  "GB": "United Kingdom",
  "GD": "Grenada",
  "GE": "Georgia",
  "GF": "French Guiana",
  "GG": "Guernsey",
  "GH": "Ghana",
  "GI": "Gibraltar",
  "GL": "Greenland",
  "GM": "The Gambia",
  "GN": "Guinea",
  "GP": "Guadeloupe",
  "GQ": "Equatorial Guinea",
  "GR": "Greece",
  "GS": "South Georgia and the South Sandwich Islands",
  "GT": "Guatemala",
  "GU": "Guam",
  "GW": "Guinea-Bissau",
  "GY": "Guyana",
  "HK": "Hong Kong, China (SAR)",
  "HM": "Heard Island and McDonald Islands",
  "HN": "Honduras",
  "HR": "Croatia",
  "HT": "Haiti",
  "HU": "Hungary",
  "ID": "Indonesia",
  "IE": "Ireland",
  "IL": "Israel",
  "IM": "Isle of Man",
  "IN": "India",
  // The territory is British; the former "(administered by India)" suffix was wrong.
  "IO": "British Indian Ocean Territory",
  "IQ": "Iraq",
  "IR": "Iran, Islamic Republic of",
  "IS": "Iceland",
  "IT": "Italy",
  "JE": "Jersey",
  "JM": "Jamaica",
  "JO": "Jordan",
  "JP": "Japan",
  "KE": "Kenya",
  "KG": "Kyrgyzstan",
  "KH": "Cambodia",
  "KI": "Kiribati",
  "KM": "Comoros",
  "KN": "Saint Kitts and Nevis",
  "KP": "Korea, Democratic People's Republic of (North Korea)",
  "KR": "Republic of Korea (South Korea)",
  "KW": "Kuwait",
  "KY": "Cayman Islands",
  "KZ": "Kazakhstan",
  "LA": "Lao People's Democratic Republic",
  "LB": "Lebanon",
  "LC": "Saint Lucia",
  "LI": "Liechtenstein",
  "LK": "Sri Lanka",
  "LR": "Liberia",
  "LS": "Lesotho",
  "LT": "Lithuania",
  "LU": "Luxembourg",
  "LV": "Latvia",
  "LY": "Libya",
  "MA": "Morocco",
  "MC": "Monaco",
  "MD": "Moldova, Republic of",
  "ME": "Montenegro",
  "MF": "Saint Martin (French part)",
  "MG": "Madagascar",
  "MH": "Marshall Islands",
  "MK": "North Macedonia",
  "ML": "Mali",
  "MM": "Myanmar",
  "MN": "Mongolia",
  "MO": "Macau, China (SAR)",
  "MP": "Northern Mariana Islands",
  "MQ": "Martinique",
  "MR": "Mauritania",
  "MS": "Montserrat",
  "MT": "Malta",
  "MU": "Mauritius",
  "MV": "Maldives",
  "MW": "Malawi",
  "MX": "Mexico",
  "MY": "Malaysia",
  "MZ": "Mozambique",
  "NA": "Namibia",
  "NC": "New Caledonia",
  "NE": "Niger",
  "NF": "Norfolk Island",
  "NG": "Nigeria",
  "NI": "Nicaragua",
  "NL": "Netherlands",
  "NO": "Norway",
  "NP": "Nepal",
  "NR": "Nauru",
  "NU": "Niue",
  "NZ": "New Zealand",
  "OM": "Oman",
  "PA": "Panama",
  "PE": "Peru",
  "PF": "French Polynesia",
  "PG": "Papua New Guinea",
  "PH": "Philippines",
  "PK": "Pakistan",
  "PL": "Poland",
  "PM": "Saint Pierre and Miquelon",
  "PN": "Pitcairn Islands",
  "PR": "Puerto Rico",
  "PS": "Palestine, State of (Gaza Strip)",
  "PT": "Portugal",
  "PW": "Palau",
  "PY": "Paraguay",
  "QA": "Qatar",
  "RE": "R\xE9union",
  "RO": "Romania",
  "RS": "Serbia",
  "RU": "Russian Federation (Russia)",
  "RW": "Rwanda",
  "SA": "Saudi Arabia",
  "SB": "Solomon Islands",
  "SC": "Seychelles",
  "SD": "Sudan",
  "SE": "Sweden",
  "SG": "Singapore",
  "SH": "Saint Helena, Ascension and Tristan da Cunha",
  "SI": "Slovenia",
  "SJ": "Svalbard and Jan Mayen Islands",
  "SK": "Slovakia",
  "SL": "Sierra Leone",
  "SM": "San Marino",
  "SN": "Senegal",
  "SO": "Somalia",
  "SR": "Suriname",
  "SS": "South Sudan",
  "ST": "S\xE3o Tom\xE9 and Pr\xEDncipe",
  "SV": "El Salvador",
  "SX": "Sint Maarten (Dutch part)",
  "SY": "Syrian Arab Republic",
  "SZ": "Eswatini",
  "TC": "Turks and Caicos Islands",
  "TD": "Chad",
  "TF": "French Southern Territories",
  "TG": "Togo",
  "TH": "Thailand",
  "TJ": "Tajikistan",
  "TK": "Tokelau",
  "TL": "Timor-Leste (East Timor)",
  "TM": "Turkmenistan",
  "TN": "Tunisia",
  "TO": "Tonga",
  "TR": "Turkey",
  "TT": "Trinidad and Tobago",
  "TV": "Tuvalu",
  "TW": "Taiwan, Province of China",
  "TZ": "Tanzania, United Republic of",
  "UA": "Ukraine",
  "UG": "Uganda",
  "UM": "United States Minor Outlying Islands",
  "US": "United States",
  "UY": "Uruguay",
  "UZ": "Uzbekistan",
  "VA": "Holy See (Vatican City State)",
  "VC": "Saint Vincent and the Grenadines",
  "VE": "Venezuela, Bolivarian Republic of",
  "VG": "British Virgin Islands",
  "VI": "United States Virgin Islands",
  "VN": "Viet Nam",
  "VU": "Vanuatu",
  "WF": "Wallis and Futuna",
  "WS": "Samoa",
  "XK": "Kosovo",
  "YE": "Yemen",
  "YT": "Mayotte",
  "ZA": "South Africa",
  "ZM": "Zambia",
  "ZW": "Zimbabwe"
};
// src/detect-text-lang-eld.ts
import { eld } from "eld";
import { iso6393 } from "iso-639-3";
// Country code -> first (primary) tag of that country's CountryCodes list.
// Countries whose list is the empty string map to "".
const _CountryCodes = {};
for (const [countryCode, localeList] of Object.entries(CountryCodes)) {
  const [primaryLocale] = localeList.split(",");
  _CountryCodes[countryCode] = primaryLocale;
}
// Tracks whether a language subset is currently applied to the detector,
// so it is only reset when there is actually something to reset.
let isSubset = false;

/**
 * Apply or clear the detector's dynamic language subset.
 *
 * @param {*} [langSubset] - Subset forwarded to `eld.dynamicLangSubset` when it
 *   has a truthy `length`; otherwise any previously applied subset is cleared.
 */
function dynamicLangSubset(langSubset) {
  const hasSubset = Boolean(langSubset?.length);
  if (hasSubset) {
    isSubset = true;
    eld.dynamicLangSubset(langSubset);
    return;
  }
  if (!isSubset) {
    // Nothing was applied earlier, so the detector is left untouched.
    return;
  }
  isSubset = false;
  eld.dynamicLangSubset(false);
}
/**
 * Detect the language of `text` using the `eld` detector.
 *
 * @param {string} text - Text handed to `eld.detect`.
 * @param {object|null} [options] - Optional settings; `null` and `undefined`
 *   both mean "use the defaults" (same tolerance as `detectTextLangEx`).
 * @param {*} [options.langSubset] - Forwarded to `dynamicLangSubset`.
 * @param {number} [options.threshold=0.1] - Minimum score the detected
 *   language must reach.
 * @param {boolean} [options.isoCode] - When truthy, return the code reported
 *   by the detector; otherwise return `getLanguageFromIso6391(code)`.
 * @returns {string|undefined} The language (name or code), or `undefined`
 *   when detection is unreliable or the score is below the threshold.
 */
function detectTextLanguage(text, options = {}) {
  // `options?.` keeps an explicit `null` from throwing; the default parameter
  // only covers `undefined`.
  dynamicLangSubset(options?.langSubset);
  const result = eld.detect(text);
  if (!result.isReliable()) {
    return undefined;
  }
  const threshold = options?.threshold ?? 0.1;
  const lang = result.language;
  const scores = result.getScores();
  // Written as a negated `>=` so a missing score (undefined) is rejected too.
  if (!(scores[lang] >= threshold)) {
    return undefined;
  }
  return options?.isoCode ? lang : getLanguageFromIso6391(lang);
}
/**
 * Detect the language of `text` and return an extended description.
 *
 * @param {string} text - Text handed to `eld.detect`.
 * @param {object|null} [options] - Optional settings.
 * @param {*} [options.langSubset] - Forwarded to `dynamicLangSubset`.
 * @param {number} [options.threshold=0.1] - Minimum score the detected
 *   language must reach.
 * @returns {{iso6391: string, scores: object, iso3166?: string, country?: string, name?: string}|undefined}
 *   `undefined` when detection is unreliable or below the threshold. Otherwise
 *   an object with the detected code and scores, plus the country code and
 *   name when `getCountryCodeFromLang` finds one, and the language name when
 *   `getLanguageFromIso6391` finds one.
 */
function detectTextLangEx(text, options) {
  dynamicLangSubset(options?.langSubset);
  const langInfo = eld.detect(text);
  if (!langInfo.isReliable()) {
    return undefined;
  }
  const threshold = options?.threshold ?? 0.1;
  // Fetched once so the returned scores are the same ones that were checked.
  const scores = langInfo.getScores();
  const iso6391 = langInfo.language;
  if (!(scores[iso6391] >= threshold)) {
    return undefined;
  }
  const result = { iso6391, scores };
  const countryCode = getCountryCodeFromLang(iso6391);
  if (countryCode) {
    result.iso3166 = countryCode;
    result.country = CountryNames[countryCode];
  }
  // Use the shared lookup helper instead of a second inline table scan.
  const languageName = getLanguageFromIso6391(iso6391);
  if (languageName) {
    result.name = languageName;
  }
  return result;
}
/**
 * Find the first country whose primary locale is `iso6391` itself or a
 * regional variant of it (`<iso6391>-XX`).
 *
 * @param {string} iso6391 - Language code to look up.
 * @returns {string|undefined} The matching country code, or `undefined` when
 *   nothing matches or the input is empty / not a string.
 */
function getCountryCodeFromLang(iso6391) {
  // Countries with no listed language have "" as their primary locale, so an
  // empty code must be rejected up front or it would "match" one of them.
  if (typeof iso6391 !== "string" || iso6391 === "") {
    return undefined;
  }
  const regionalPrefix = `${iso6391}-`;
  for (const [countryCode, primaryLocale] of Object.entries(_CountryCodes)) {
    if (primaryLocale === iso6391 || primaryLocale.startsWith(regionalPrefix)) {
      return countryCode;
    }
  }
  return undefined;
}
/**
 * Look up a language name by its two-letter code in the `iso6393` table.
 *
 * @param {string} iso6391 - Two-letter language code.
 * @returns {string|undefined} The `name` of the first record whose `iso6391`
 *   field equals the code, or `undefined` when there is no match or no code
 *   was given.
 */
function getLanguageFromIso6391(iso6391) {
  // Without this guard an `undefined` argument would strictly equal the
  // absent `iso6391` field of any record lacking one and return an unrelated
  // language name.
  if (!iso6391) {
    return undefined;
  }
  return iso6393.find((entry) => entry.iso6391 === iso6391)?.name;
}
export {
CountryCodes,
CountryNames,
detectTextLangEx,
detectTextLanguage,
getCountryCodeFromLang,
getLanguageFromIso6391
};