
efficient-language-detector-no-dynamic-import


Fast and accurate natural language detection. Detector written in JavaScript. Efficient Language Detector, Nito-ELD, ELD.
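A minimal usage sketch (illustrative: the import specifier is assumed to be this package's npm name, and the values shown are examples taken from the detect() documentation below):

  import { eld } from 'efficient-language-detector-no-dynamic-import'

  const result = eld.detect('Hola, cómo te llamas?')
  result.language      // 'es'
  result.getScores()   // e.g. { es: 0.5, et: 0.2 }
  result.isReliable()  // true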

/*
 Copyright 2023 Nito T.M.
 License https://www.apache.org/licenses/LICENSE-2.0 Apache-2.0
 Author Nito T.M. (https://github.com/nitotm)
 Package npmjs.com/package/eld
*/
import { languageData, loadNgrams } from './languageData.js'
import { separators, matchDomains } from './regexPatterns.js'
import { dictionary } from './dictionary.js'
import { isoLanguages } from './isoLanguages.js'
import { LanguageResult } from './LanguageResult.js'
import { saveLanguageSubset } from './saveLanguageSubset.dev.js'

await loadNgrams('ngramsM60.js') // Project is ES2015

const eld = (function () {
  return {
    detect: detect,
    cleanText: cleanText,
    dynamicLangSubset: dynamicLangSubset,
    saveSubset: saveSubset,
    loadNgrams: loadNgrams,
    info: info
  }
})()

/** @type {boolean|Array} */
let subset = false
/** @type {boolean} When true, detect() cleans input text with getCleanTxt() */
let doCleanText = false

/**
 * detect() identifies the natural language of a UTF-8 string
 * Returns an object with a property named 'language', holding an ISO 639-1 code or an empty string
 * { language: 'es', getScores(): {'es': 0.5, 'et': 0.2}, isReliable(): true }
 *
 * @param {string} text UTF-8
 * @returns {{language: string, getScores(): Object, isReliable(): boolean}} class LanguageResult
 */
function detect (text) {
  if (typeof text !== 'string') return new LanguageResult('', 0, 0, {})
  if (doCleanText) {
    // Removes URLs, emails, alphanumerical codes & numbers
    text = getCleanTxt(text)
  }
  const byteWords = textProcessor(text)
  const byteNgrams = getByteNgrams(byteWords)
  const numNgrams = Object.keys(byteNgrams).length
  let results = calculateScores(byteNgrams, numNgrams)
  let language = ''

  if (subset) {
    results = filterLangSubset(results)
  }
  if (results.length > 0) {
    results.sort((a, b) => b[1] - a[1])
    language = languageData.langCodes[results[0][0]]
  }
  return new LanguageResult(language, results, numNgrams, languageData.langCodes)
}

/**
 * Public function to change the doCleanText value
 *
 * @param {boolean} bool
 */
function cleanText (bool) {
  doCleanText = Boolean(bool)
}

/**
 * Removes parts of a string that may be considered "noise" for language detection
 *
 * @param {string} str
 * @returns {string}
 */
function getCleanTxt (str) {
  // Remove URLs
  str = str.replace(/[hw]((ttps?:\/\/(www\.)?)|ww\.)([^\s/?.#-]+\.?)+(\/\S*)?/gi, ' ')
  // Remove emails
  str = str.replace(/[a-zA-Z0-9.!$%&’+_`-]+@[A-Za-z0-9.-]+\.[A-Za-z0-9-]{2,64}/g, ' ')
  // Remove .com domains
  str = str.replace(matchDomains, ' ')
  // Remove alphanumerical/number codes
  str = str.replace(/[a-zA-Z]*[0-9]+[a-zA-Z0-9]*/g, ' ')
  return str
}

/**
 * @param {string} text
 * @returns {Array}
 */
function textProcessor (text) {
  text = text.substring(0, 1000)
  // Normalize special characters/word separators
  text = text.replace(separators, ' ')
  text = text.trim().toLowerCase()
  return strToUtf8Bytes(text) // returns array of words
}

/**
 * Gets Ngrams from a given array of words
 *
 * @param {Array} words
 * @returns {Object}
 */
function getByteNgrams (words) {
  let byteNgrams = {}
  let countNgrams = 0
  let thisBytes
  let j

  for (let key in words) {
    let word = words[key]
    let len = word.length
    if (len > 70) {
      len = 70
    }

    for (j = 0; j + 4 < len; j += 3, ++countNgrams) {
      thisBytes = (j === 0 ? ' ' : '') + word.substring(j, j + 4)
      byteNgrams[thisBytes] = typeof byteNgrams[thisBytes] !== 'undefined' ? byteNgrams[thisBytes] + 1 : 1
    }
    thisBytes = (j === 0 ? ' ' : '') + word.substring(len !== 3 ? len - 4 : 0) + ' '
    byteNgrams[thisBytes] = typeof byteNgrams[thisBytes] !== 'undefined' ? byteNgrams[thisBytes] + 1 : 1
    countNgrams++
  }

  // Frequency is multiplied by 15000 at the ngrams database. A reduced number (13200) seems to work better.
  // Linear formulas were tried, decreasing the multiplier for fewer ngram strings, with no meaningful improvement.
  for (let bytes in byteNgrams) {
    byteNgrams[bytes] = (byteNgrams[bytes] / countNgrams) * 13200
  }
  return byteNgrams
}

/**
 * Calculates a score for each language from the given Ngrams
 *
 * @param {Object} byteNgrams
 * @param {number} numNgrams
 * @returns {Array}
 */
function calculateScores (byteNgrams, numNgrams) {
  let bytes, globalFrequency, relevancy, langCount, frequency, lang, thisByte
  let langScore = [...languageData.langScore]

  for (bytes in byteNgrams) {
    frequency = byteNgrams[bytes]
    thisByte = languageData.ngrams[bytes]

    if (thisByte) {
      langCount = Object.keys(thisByte).length
      // Ngram score multiplier: the fewer languages found, the more relevancy. Formula can be fine-tuned.
      if (langCount === 1) {
        relevancy = 27 // Handpicked relevance multiplier, trial & error
      } else if (langCount < 16) {
        relevancy = (16 - langCount) / 2 + 1
      } else {
        relevancy = 1
      }
      // Most time-consuming loop; do only the strictly necessary inside
      for (lang in thisByte) {
        globalFrequency = thisByte[lang]
        langScore[lang] +=
          (frequency > globalFrequency ? globalFrequency / frequency : frequency / globalFrequency) * relevancy + 2
      }
    }
  }

  // This divisor will produce a final score between 0 and ~1; the score could be >1. Can be improved.
  let resultDivisor = numNgrams * 3.2
  let results = []
  for (lang in langScore) {
    if (langScore[lang]) {
      // Javascript does not guarantee object order, so a multi-array is used
      results.push([parseInt(lang), langScore[lang] / resultDivisor]) // * languageData.scoreNormalizer[lang];
    }
  }
  return results
}
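/*
 Illustrative trace (added for clarity, not part of the library): for the single
 word 'hello', getByteNgrams() returns { ' hell': 6600, 'ello ': 6600 }. The inner
 loop emits ' hell' (a leading space marks the word start), the tail ngram 'ello '
 (trailing space marks the word end) is added after the loop, and each raw count
 of 1 is scaled by (1 / 2) * 13200. calculateScores() then looks each ngram up in
 languageData.ngrams and, per matching language, accumulates
 (min(frequency, globalFrequency) / max(frequency, globalFrequency)) * relevancy + 2,
 finally dividing by numNgrams * 3.2 so most scores land between 0 and ~1.
*/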
/**
 * Converts each byte to a single character, using our own dictionary, since JavaScript does not allow raw byte
 * strings or invalid UTF-8 characters. We could use TextEncoder() to create a Uint8Array and then translate to our
 * dictionary, but this function is overall faster as it does both jobs at once.
 *
 * Alternatives such as just using Uint8Array/hex for detection add complexity and/or a bigger database.
 *
 * @param {string} str
 * @returns {Array}
 */
function strToUtf8Bytes (str) {
  let encoded = ''
  let words = []
  let countBytes = 0
  const cutAfter = 350 // Cut at the first whitespace after a 350 byte length offset
  const enforceCutAfter = 380 // Cut after any UTF-8 character when surpassing a 380 byte length

  for (let ii = 0; ii < str.length; ii++) {
    let charCode = str.charCodeAt(ii)
    if (charCode < 0x80) {
      if (charCode === 32) {
        if (encoded !== '') {
          words.push(encoded)
          encoded = ''
        }
        if (countBytes > cutAfter) {
          break
        }
      } else {
        encoded += str[ii]
      }
      countBytes++
    } else if (charCode < 0x800) {
      encoded += dictionary[0xc0 | (charCode >> 6)] + dictionary[0x80 | (charCode & 0x3f)]
      countBytes += 2
    } else if (charCode < 0xd800 || charCode >= 0xe000) {
      encoded += dictionary[0xe0 | (charCode >> 12)] + dictionary[0x80 | ((charCode >> 6) & 0x3f)] +
        dictionary[0x80 | (charCode & 0x3f)]
      countBytes += 3
    } else {
      // UTF-16 surrogate pair
      ii++
      charCode = 0x10000 + (((charCode & 0x3ff) << 10) | (str.charCodeAt(ii) & 0x3ff))
      encoded += dictionary[0xf0 | (charCode >> 18)] + dictionary[0x80 | ((charCode >> 12) & 0x3f)] +
        dictionary[0x80 | ((charCode >> 6) & 0x3f)] + dictionary[0x80 | (charCode & 0x3f)]
      countBytes += 4
    }
    if (countBytes > enforceCutAfter) {
      break
    }
  }
  if (encoded !== '') {
    words.push(encoded) // It is faster to build the array than to words.split(/ +/).filter((x) => x !== ' ') later
  }
  return words
}

/**
 * Filters languages not included in the subset out of the result scores
 *
 * @param {Array} results
 * @returns {Array}
 */
function filterLangSubset (results) {
  let subResults = []
  for (let key in results) {
    if (subset.indexOf(results[key][0]) > -1) {
      subResults.push(results[key])
    }
  }
  return subResults
}

/**
 * Validates an expected array of ISO 639-1 language code strings, given by the user, and creates a subset of the
 * valid languages, compared against the languages available in the current database
 *
 * @param {Array|boolean} languages
 * @returns {Array|boolean}
 */
function makeSubset (languages) {
  if (languages) {
    subset = []
    for (let key in languages) {
      // Validate languages by checking if they are available at languageData
      let lang = Object.keys(languageData.langCodes).find(
        (lkey) => languageData.langCodes[lkey] === languages[key]
      )
      if (lang) {
        subset.push(parseInt(lang))
      }
    }
    if (subset.length) {
      subset.sort()
    } else {
      subset = false
    }
  } else {
    subset = false
  }
  return subset
}

/**
 * Creates a subset of languages; detect() will filter excluded languages from the results
 * Call dynamicLangSubset(false) to delete the subset
 *
 * @param {Array|boolean} languages
 * @returns {Object} Returns a list of the validated languages for the new subset
 */
function dynamicLangSubset (languages) {
  let result = makeSubset(languages)
  if (result) {
    return isoLanguages(result, languageData.langCodes)
  }
  return {}
}

/**
 * Creates a download, only available in the web browser, with a file containing the ngrams database of the validated
 * languages from the array argument
 *
 * @param {Array} languages
 */
function saveSubset (languages) {
  const langArray = makeSubset(languages)
  makeSubset(false) // remove the global subset, we only need the filtered langArray
  saveLanguageSubset.saveSubset(langArray, languageData.ngrams, languageData.langCodes, languageData.type)
}
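/*
 Usage sketch for the subset API (illustrative, not part of the library; 'en' and
 'es' are assumed to be present in the loaded database):

   eld.dynamicLangSubset(['en', 'es']) // detect() results are now limited to English/Spanish
   eld.detect('To be or not to be')    // scores outside the subset are filtered out
   eld.dynamicLangSubset(false)        // deletes the subset again
*/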
function info () {
  return {
    'Data type': languageData.type,
    'Languages': languageData.langCodes,
    'Dynamic subset': subset ? isoLanguages(subset, languageData.langCodes) : false
  }
}

export { eld };
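/*
 Consumer sketch (illustrative; output values are examples, not guaranteed):

   eld.cleanText(true)  // from now on, detect() strips URLs, emails, domains and
                        // alphanumerical codes before scoring
   eld.detect('Hola, cómo te llamas?').language // 'es'
   eld.info() // { 'Data type': ..., 'Languages': ..., 'Dynamic subset': false }
*/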