UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

468 lines 16.6 kB
import chalk from 'chalk';
import { parseText } from '../nlp/Segmentation.js';
import { Logger } from '../utilities/Logger.js';
import { loadPackage } from '../utilities/PackageManager.js';

/**
 * Translates text sentence by sentence using the NLLB-200 (distilled, 600M, q8) model.
 *
 * The text is first split into sentences with `parseText`; each sentence is then
 * tokenized, translated and converted back to plain text independently.
 *
 * @param {string} sourceText - The text to translate.
 * @param {string} sourceLanguage - Language of `sourceText`. Passed unchanged to `parseText`
 *     and resolved to an NLLB code for translation. May be an ISO 639-1 code (`'es'`),
 *     a locale-like tag (`'es-ES'`, `'zh-Hant'`), an ISO 639-3 code (`'spa'`) or an
 *     NLLB code (`'spa_Latn'`).
 * @param {string} [targetLanguage='en'] - Language to translate to, in any of the forms
 *     accepted for `sourceLanguage`. Defaults to English, which was the target that
 *     earlier versions of this function always used.
 * @returns {Promise<Array<{ sourceText: string, translatedText: string }>>}
 *     One pair per sentence, in source order.
 * @throws {Error} If either language cannot be resolved to an NLLB language code.
 */
export async function translateText(sourceText, sourceLanguage, targetLanguage = 'en') {
    const logger = new Logger();

    // Resolve the languages first, so unsupported arguments fail before any model files are loaded.
    // NOTE(review): the tokenizer is handed this same object below and may add generation
    // settings to it (e.g. a forced first token for the target language) — confirm against
    // the transformers.js implementation. It is deliberately shared across sentences,
    // as in the original code.
    const config = {
        src_lang: resolveNLLBLanguageCode(sourceLanguage, 'source'),
        tgt_lang: resolveNLLBLanguageCode(targetLanguage, 'target'),
    };

    logger.start(`Load transformers.js module`);
    const { AutoTokenizer, M2M100ForConditionalGeneration } = await import('@echogarden/transformers-nodejs-lite');

    logger.start(`Load NLLB package`);
    const modelPath = await loadPackage(`xenova-nllb-200-distilled-600M-q8`);

    logger.start(`Load NLLB tokenizer`);
    const tokenizer = await AutoTokenizer.from_pretrained(modelPath);

    logger.start(`Load NLLB model`);
    const model = await M2M100ForConditionalGeneration.from_pretrained(modelPath);

    logger.start(`Split to sentences`);
    const segmentedText = await parseText(sourceText, sourceLanguage);
    const sentences = segmentedText.sentences;

    logger.end();

    const translationPairs = [];

    for (let i = 0; i < sentences.length; i++) {
        const sentenceText = sentences[i].text;

        logger.logTitledMessage(`Translate sentence ${i + 1}/${sentences.length}`, `"${sentenceText.trim()}"`, chalk.magentaBright);

        logger.start(`Tokenize sentence`);
        const inputs = tokenizer._build_translation_inputs(sentenceText, {
            padding: true,
            truncation: true,
        }, config);

        logger.start(`Translate sentence with NLLB model`);
        const translationTokenIds = await model.generate(inputs.input_ids, config);

        logger.start(`Extract tokens`);
        const translationTokens = tokenizer.model.convert_ids_to_tokens(translationTokenIds[0]);

        // Drop the first two tokens and the last one — presumably the decoder start token,
        // the target language tag and the end-of-sequence token (TODO confirm).
        // Tokens that start with '▁' begin a new word, so the marker is turned into a space.
        const translatedText = translationTokens
            .slice(2, translationTokens.length - 1)
            .map(token => {
                if (token.startsWith('▁')) {
                    return token.replaceAll('▁', ' ');
                }

                return token;
            })
            .join('')
            .trim();

        translationPairs.push({
            sourceText: sentenceText,
            translatedText
        });

        logger.end();
    }

    return translationPairs;
}

/**
 * Resolves a language identifier to one of the NLLB codes listed in `languageNameToNLLBCode`.
 *
 * Matching is case-insensitive and accepts `-` or `_` as the subtag separator. The primary
 * subtag is matched against the codes in `languageNameToISO931` and against the three-letter
 * prefix of the NLLB codes. When several NLLB codes match (the same language in more than
 * one script), a four-letter script subtag in the identifier selects between them; otherwise
 * the entry in `preferredNLLBCodeForAmbiguousLanguage` wins, and failing that the first match
 * in table order.
 *
 * @param {string} language - Language identifier, e.g. `'en'`, `'en-US'`, `'zh-Hant'`, `'eng'`, `'eng_Latn'`.
 * @param {string} role - `'source'` or `'target'`; only used in error messages.
 * @returns {string} The NLLB language code, e.g. `'eng_Latn'`.
 * @throws {Error} If `language` is not a non-empty string or matches no NLLB language.
 */
function resolveNLLBLanguageCode(language, role) {
    if (typeof language !== 'string' || language.trim().length === 0) {
        throw new Error(`No ${role} language was specified for NLLB translation.`);
    }

    const subtags = language.trim().toLowerCase().split(/[-_]/);
    const primarySubtag = subtags[0];
    const scriptSubtag = subtags.slice(1).find(subtag => /^[a-z]{4}$/.test(subtag));

    const candidateCodes = [];

    const preferredCode = preferredNLLBCodeForAmbiguousLanguage.get(primarySubtag);

    if (preferredCode !== undefined) {
        candidateCodes.push(preferredCode);
    }

    for (const [languageName, nllbCode] of Object.entries(languageNameToNLLBCode)) {
        const isoCode = languageNameToISO931[languageName];

        // 'unknown' is a placeholder for languages that have no short code, never a match.
        const matchesISOCode = isoCode !== 'unknown' && isoCode === primarySubtag;
        const matchesNLLBPrefix = nllbCode.startsWith(`${primarySubtag}_`);

        if (matchesISOCode || matchesNLLBPrefix) {
            candidateCodes.push(nllbCode);
        }
    }

    if (candidateCodes.length === 0) {
        throw new Error(`The ${role} language '${language}' is not supported by the NLLB translation model.`);
    }

    if (scriptSubtag !== undefined) {
        const scriptMatch = candidateCodes.find(code => code.toLowerCase().endsWith(`_${scriptSubtag}`));

        if (scriptMatch !== undefined) {
            return scriptMatch;
        }
    }

    return candidateCodes[0];
}

// Default NLLB code for language identifiers whose first match in table order
// would otherwise be a less common script variant.
const preferredNLLBCodeForAmbiguousLanguage = new Map([
    ['ar', 'arb_Arab'], // Arabic script rather than the romanized variant
    ['arb', 'arb_Arab'],
    ['zh', 'zho_Hans'],
]);

// English language name -> NLLB-200 language code (`<ISO 639-3 code>_<ISO 15924 script>`).
const languageNameToNLLBCode = {
    'Acehnese (Arabic script)': 'ace_Arab',
    'Acehnese (Latin script)': 'ace_Latn',
    'Afrikaans': 'afr_Latn',
    'Akan': 'aka_Latn',
    'Amharic': 'amh_Ethi',
    'Armenian': 'hye_Armn',
    'Assamese': 'asm_Beng',
    'Asturian': 'ast_Latn',
    'Awadhi': 'awa_Deva',
    'Ayacucho Quechua': 'quy_Latn',
    'Balinese': 'ban_Latn',
    'Bambara': 'bam_Latn',
    'Banjar (Arabic script)': 'bjn_Arab',
    'Banjar (Latin script)': 'bjn_Latn',
    'Bashkir': 'bak_Cyrl',
    'Basque': 'eus_Latn',
    'Belarusian': 'bel_Cyrl',
    'Bemba': 'bem_Latn',
    'Bengali': 'ben_Beng',
    'Bhojpuri': 'bho_Deva',
    'Bosnian': 'bos_Latn',
    'Buginese': 'bug_Latn',
    'Bulgarian': 'bul_Cyrl',
    'Burmese': 'mya_Mymr',
    'Catalan': 'cat_Latn',
    'Cebuano': 'ceb_Latn',
    'Central Atlas Tamazight': 'tzm_Tfng',
    'Central Aymara': 'ayr_Latn',
    'Central Kanuri (Arabic script)': 'knc_Arab',
    'Central Kanuri (Latin script)': 'knc_Latn',
    'Central Kurdish': 'ckb_Arab',
    'Chhattisgarhi': 'hne_Deva',
    'Chinese (Simplified)': 'zho_Hans',
    'Chinese (Traditional)': 'zho_Hant',
    'Chokwe': 'cjk_Latn',
    'Crimean Tatar': 'crh_Latn',
    'Croatian': 'hrv_Latn',
    'Czech': 'ces_Latn',
    'Danish': 'dan_Latn',
    'Dari': 'prs_Arab',
    'Dutch': 'nld_Latn',
    'Dyula': 'dyu_Latn',
    'Dzongkha': 'dzo_Tibt',
    'Eastern Panjabi': 'pan_Guru',
    'Eastern Yiddish': 'ydd_Hebr',
    'Egyptian Arabic': 'arz_Arab',
    'English': 'eng_Latn',
    'Esperanto': 'epo_Latn',
    'Estonian': 'est_Latn',
    'Ewe': 'ewe_Latn',
    'Faroese': 'fao_Latn',
    'Fijian': 'fij_Latn',
    'Finnish': 'fin_Latn',
    'Fon': 'fon_Latn',
    'French': 'fra_Latn',
    'Friulian': 'fur_Latn',
    'Galician': 'glg_Latn',
    'Ganda': 'lug_Latn',
    'Georgian': 'kat_Geor',
    'German': 'deu_Latn',
    'Greek': 'ell_Grek',
    'Guarani': 'grn_Latn',
    'Gujarati': 'guj_Gujr',
    'Haitian Creole': 'hat_Latn',
    'Halh Mongolian': 'khk_Cyrl',
    'Hausa': 'hau_Latn',
    'Hebrew': 'heb_Hebr',
    'Hindi': 'hin_Deva',
    'Hungarian': 'hun_Latn',
    'Icelandic': 'isl_Latn',
    'Igbo': 'ibo_Latn',
    'Ilocano': 'ilo_Latn',
    'Indonesian': 'ind_Latn',
    'Irish': 'gle_Latn',
    'Italian': 'ita_Latn',
    'Japanese': 'jpn_Jpan',
    'Javanese': 'jav_Latn',
    'Jingpho': 'kac_Latn',
    'Kabiyè': 'kbp_Latn',
    'Kabuverdianu': 'kea_Latn',
    'Kabyle': 'kab_Latn',
    'Kamba': 'kam_Latn',
    'Kannada': 'kan_Knda',
    'Kashmiri (Arabic script)': 'kas_Arab',
    'Kashmiri (Devanagari script)': 'kas_Deva',
    'Kazakh': 'kaz_Cyrl',
    'Khmer': 'khm_Khmr',
    'Kikongo': 'kon_Latn',
    'Kikuyu': 'kik_Latn',
    'Kimbundu': 'kmb_Latn',
    'Kinyarwanda': 'kin_Latn',
    'Korean': 'kor_Hang',
    'Kyrgyz': 'kir_Cyrl',
    'Lao': 'lao_Laoo',
    'Latgalian': 'ltg_Latn',
    'Ligurian': 'lij_Latn',
    'Limburgish': 'lim_Latn',
    'Lingala': 'lin_Latn',
    'Lithuanian': 'lit_Latn',
    'Lombard': 'lmo_Latn',
    'Luba-Kasai': 'lua_Latn',
    'Luo': 'luo_Latn',
    'Luxembourgish': 'ltz_Latn',
    'Macedonian': 'mkd_Cyrl',
    'Magahi': 'mag_Deva',
    'Maithili': 'mai_Deva',
    'Malayalam': 'mal_Mlym',
    'Maltese': 'mlt_Latn',
    'Maori': 'mri_Latn',
    'Marathi': 'mar_Deva',
    'Meitei (Bengali script)': 'mni_Beng',
    'Mesopotamian Arabic': 'acm_Arab',
    'Minangkabau (Arabic script)': 'min_Arab',
    'Minangkabau (Latin script)': 'min_Latn',
    'Mizo': 'lus_Latn',
    'Modern Standard Arabic (Romanized)': 'arb_Latn',
    'Modern Standard Arabic': 'arb_Arab',
    'Moroccan Arabic': 'ary_Arab',
    'Mossi': 'mos_Latn',
    'Najdi Arabic': 'ars_Arab',
    'Nepali': 'npi_Deva',
    'Nigerian Fulfulde': 'fuv_Latn',
    'North Azerbaijani': 'azj_Latn',
    'North Levantine Arabic': 'apc_Arab',
    'Northern Kurdish': 'kmr_Latn',
    'Northern Sotho': 'nso_Latn',
    'Northern Uzbek': 'uzn_Latn',
    'Norwegian Bokmål': 'nob_Latn',
    'Norwegian Nynorsk': 'nno_Latn',
    'Nuer': 'nus_Latn',
    'Nyanja': 'nya_Latn',
    'Occitan': 'oci_Latn',
    'Odia': 'ory_Orya',
    'Pangasinan': 'pag_Latn',
    'Papiamento': 'pap_Latn',
    'Plateau Malagasy': 'plt_Latn',
    'Polish': 'pol_Latn',
    'Portuguese': 'por_Latn',
    'Romanian': 'ron_Latn',
    'Rundi': 'run_Latn',
    'Russian': 'rus_Cyrl',
    'Samoan': 'smo_Latn',
    'Sango': 'sag_Latn',
    'Sanskrit': 'san_Deva',
    'Santali': 'sat_Olck',
    'Sardinian': 'srd_Latn',
    'Scottish Gaelic': 'gla_Latn',
    'Serbian': 'srp_Cyrl',
    'Shan': 'shn_Mymr',
    'Shona': 'sna_Latn',
    'Sicilian': 'scn_Latn',
    'Silesian': 'szl_Latn',
    'Sindhi': 'snd_Arab',
    'Sinhala': 'sin_Sinh',
    'Slovak': 'slk_Latn',
    'Slovenian': 'slv_Latn',
    'Somali': 'som_Latn',
    'South Azerbaijani': 'azb_Arab',
    'South Levantine Arabic': 'ajp_Arab',
    'Southern Pashto': 'pbt_Arab',
    'Southern Sotho': 'sot_Latn',
    'Southwestern Dinka': 'dik_Latn',
    'Spanish': 'spa_Latn',
    'Standard Latvian': 'lvs_Latn',
    'Standard Malay': 'zsm_Latn',
    'Standard Tibetan': 'bod_Tibt',
    'Sundanese': 'sun_Latn',
    'Swahili': 'swh_Latn',
    'Swati': 'ssw_Latn',
    'Swedish': 'swe_Latn',
    'Tagalog': 'tgl_Latn',
    'Tajik': 'tgk_Cyrl',
    'Tamasheq (Latin script)': 'taq_Latn',
    'Tamasheq (Tifinagh script)': 'taq_Tfng',
    'Tamil': 'tam_Taml',
    'Tatar': 'tat_Cyrl',
    'Ta’izzi-Adeni Arabic': 'acq_Arab',
    'Telugu': 'tel_Telu',
    'Thai': 'tha_Thai',
    'Tigrinya': 'tir_Ethi',
    'Tok Pisin': 'tpi_Latn',
    'Tosk Albanian': 'als_Latn',
    'Tsonga': 'tso_Latn',
    'Tswana': 'tsn_Latn',
    'Tumbuka': 'tum_Latn',
    'Tunisian Arabic': 'aeb_Arab',
    'Turkish': 'tur_Latn',
    'Turkmen': 'tuk_Latn',
    'Twi': 'twi_Latn',
    'Ukrainian': 'ukr_Cyrl',
    'Umbundu': 'umb_Latn',
    'Urdu': 'urd_Arab',
    'Uyghur': 'uig_Arab',
    'Venetian': 'vec_Latn',
    'Vietnamese': 'vie_Latn',
    'Waray': 'war_Latn',
    'Welsh': 'cym_Latn',
    'West Central Oromo': 'gaz_Latn',
    'Western Persian': 'pes_Arab',
    'Wolof': 'wol_Latn',
    'Xhosa': 'xho_Latn',
    'Yoruba': 'yor_Latn',
    'Yue Chinese': 'yue_Hant',
    'Zulu': 'zul_Latn',
};

// English language name -> short language code: ISO 639-1 where one exists, otherwise a
// three-letter code, or 'unknown' when none was assigned. The keys are the same names used
// in `languageNameToNLLBCode`. (The identifier keeps its original spelling; it refers to ISO 639-1.)
const languageNameToISO931 = {
    'Acehnese (Arabic script)': 'unknown', // No ISO 639-1 code for Acehnese
    'Acehnese (Latin script)': 'unknown', // No ISO 639-1 code for Acehnese
    'Afrikaans': 'af',
    'Akan': 'ak',
    'Amharic': 'am',
    'Armenian': 'hy',
    'Assamese': 'as',
    'Asturian': 'ast', // Approximate: ISO 639-3 code
    'Awadhi': 'awa', // Approximate: ISO 639-3 code
    'Ayacucho Quechua': 'qu', // Approximate: Quechua has many variants
    'Balinese': 'ban', // Approximate: ISO 639-3 code
    'Bambara': 'bm',
    'Banjar (Arabic script)': 'unknown', // No ISO 639-1 code for Banjar
    'Banjar (Latin script)': 'unknown', // No ISO 639-1 code for Banjar
    'Bashkir': 'ba',
    'Basque': 'eu',
    'Belarusian': 'be',
    'Bemba': 'bem', // Approximate: ISO 639-3 code
    'Bengali': 'bn',
    'Bhojpuri': 'bho',
    'Bosnian': 'bs',
    'Buginese': 'bug',
    'Bulgarian': 'bg',
    'Burmese': 'my',
    'Catalan': 'ca',
    'Cebuano': 'ceb', // Approximate: ISO 639-2 code
    'Central Atlas Tamazight': 'tzm',
    'Central Aymara': 'ay', // Approximate: Aymara has variants
    'Central Kanuri (Arabic script)': 'kr', // Approximate: Kanuri uses multiple scripts
    'Central Kanuri (Latin script)': 'kr', // Approximate: Kanuri uses multiple scripts
    'Central Kurdish': 'ckb', // Approximate: Kurdish has several variants
    'Chhattisgarhi': 'hne', // Approximate: ISO 639-3 code
    'Chinese (Simplified)': 'zh', // Approximate: zh covers both Simplified and Traditional
    'Chinese (Traditional)': 'zh', // Approximate: zh covers both Simplified and Traditional
    'Chokwe': 'cjk', // Approximate: ISO 639-3 code
    'Crimean Tatar': 'crh', // Approximate: ISO 639-3 code
    'Croatian': 'hr',
    'Czech': 'cs',
    'Danish': 'da',
    'Dari': 'prs', // Approximate: Dari is considered a dialect of Persian
    'Dutch': 'nl',
    'Dyula': 'dyu',
    'Dzongkha': 'dz',
    'Eastern Panjabi': 'pa', // Approximate: Panjabi has multiple writing systems
    'Eastern Yiddish': 'yi', // Approximate: Yiddish has multiple variants
    'Egyptian Arabic': 'arz',
    'English': 'en',
    'Esperanto': 'eo',
    'Estonian': 'et',
    'Ewe': 'ee',
    'Faroese': 'fo',
    'Fijian': 'fj',
    'Finnish': 'fi',
    'Fon': 'fon',
    'French': 'fr',
    'Friulian': 'fur', // Approximate: ISO 639-3 code
    'Galician': 'gl',
    'Ganda': 'lg',
    'Georgian': 'ka',
    'German': 'de',
    'Greek': 'el',
    'Guarani': 'gn',
    'Gujarati': 'gu',
    'Haitian Creole': 'ht',
    'Halh Mongolian': 'mn', // Approximate: Mongolian encompasses several dialects
    'Hausa': 'ha',
    'Hebrew': 'he',
    'Hindi': 'hi',
    'Hungarian': 'hu',
    'Icelandic': 'is',
    'Igbo': 'ig',
    'Ilocano': 'ilo',
    'Indonesian': 'id',
    'Irish': 'ga',
    'Italian': 'it',
    'Japanese': 'ja',
    'Javanese': 'jv',
    'Jingpho': 'kac', // Approximate: ISO 639-3 code
    'Kabiyè': 'kbp', // Approximate: ISO 639-3 code
    'Kabuverdianu': 'kea', // Approximate: ISO 639-3 code
    'Kabyle': 'kab',
    'Kamba': 'kam', // Approximate: ISO 639-3 code
    'Kannada': 'kn',
    'Kashmiri (Arabic script)': 'ks', // Approximate: Kashmiri uses multiple scripts
    'Kashmiri (Devanagari script)': 'ks', // Approximate: Kashmiri uses multiple scripts
    'Kazakh': 'kk',
    'Khmer': 'km',
    'Kikongo': 'kg',
    'Kikuyu': 'ki',
    'Kimbundu': 'kmb',
    'Kinyarwanda': 'rw',
    'Korean': 'ko',
    'Kyrgyz': 'ky',
    'Lao': 'lo',
    'Latgalian': 'ltg', // Approximate: ISO 639-3 code
    'Ligurian': 'lij', // Approximate: ISO 639-3 code
    'Limburgish': 'li', // Approximate: ISO 639-1 code for Limburgish-Ripuarian
    'Lingala': 'ln',
    'Lithuanian': 'lt',
    'Lombard': 'lmo', // Approximate: ISO 639-3 code
    'Luba-Kasai': 'lua',
    'Luo': 'luo',
    'Luxembourgish': 'lb',
    'Macedonian': 'mk',
    'Magahi': 'mag', // Approximate: ISO 639-3 code
    'Maithili': 'mai', // Approximate: ISO 639-3 code
    'Malayalam': 'ml',
    'Maltese': 'mt',
    'Maori': 'mi',
    'Marathi': 'mr',
    'Meitei (Bengali script)': 'mni', // Approximate: Meitei uses multiple scripts
    'Mesopotamian Arabic': 'acm', // Approximate: ISO 639-3 code
    'Minangkabau (Arabic script)': 'min', // Approximate: Minangkabau uses multiple scripts
    'Minangkabau (Latin script)': 'min', // Approximate: Minangkabau uses multiple scripts
    'Mizo': 'lus',
    'Modern Standard Arabic (Romanized)': 'ar', // Approximate: Modern Standard Arabic is a standardized form
    'Modern Standard Arabic': 'ar', // Approximate: Modern Standard Arabic is a standardized form
    'Moroccan Arabic': 'ary',
    'Mossi': 'mos',
    'Najdi Arabic': 'ars', // Approximate: ISO 639-3 code for Najdi Arabic
    'Nepali': 'ne',
    'Nigerian Fulfulde': 'fuv',
    'North Azerbaijani': 'az', // Approximate: Azerbaijani is a pluricentric language
    'North Levantine Arabic': 'apc', // Approximate: ISO 639-3 code for North Levantine Arabic
    'Northern Kurdish': 'kmr', // Approximate: Kurdish has several variants
    'Northern Sotho': 'nso',
    'Northern Uzbek': 'uz', // Approximate: Uzbek is a pluricentric language
    'Norwegian Bokmål': 'nb',
    'Norwegian Nynorsk': 'nn',
    'Nuer': 'nus', // Approximate: ISO 639-3 code
    'Nyanja': 'ny',
    'Occitan': 'oc',
    'Odia': 'or',
    'Pangasinan': 'pag',
    'Papiamento': 'pap',
    'Plateau Malagasy': 'plt', // Approximate: ISO 639-3 code for Plateau Malagasy
    'Polish': 'pl',
    'Portuguese': 'pt',
    'Romanian': 'ro',
    'Rundi': 'rn',
    'Russian': 'ru',
    'Samoan': 'sm',
    'Sango': 'sg',
    'Sanskrit': 'sa',
    'Santali': 'sat',
    'Sardinian': 'sc', // Approximate: Sardinian has several variants
    'Scottish Gaelic': 'gd',
    'Serbian': 'sr',
    'Shan': 'shn', // Approximate: ISO 639-3 code
    'Shona': 'sn',
    'Sicilian': 'scn', // Approximate: ISO 639-3 code
    'Silesian': 'szl', // Approximate: ISO 639-3 code
    'Sindhi': 'sd',
    'Sinhala': 'si',
    'Slovak': 'sk',
    'Slovenian': 'sl',
    'Somali': 'so',
    'South Azerbaijani': 'az', // Approximate: Azerbaijani is a pluricentric language
    'South Levantine Arabic': 'ajp', // Approximate: ISO 639-3 code for South Levantine Arabic
    'Southern Pashto': 'ps', // Approximate: Pashto has several variants
    'Southern Sotho': 'st',
    'Southwestern Dinka': 'dik', // Approximate: ISO 639-3 code for Southwestern Dinka
    'Spanish': 'es',
    'Standard Latvian': 'lv', // Approximate: Standard Latvian is the official form of Latvian
    'Standard Malay': 'ms', // Approximate: Standard Malay is the basis for both Malaysian and Indonesian
    'Standard Tibetan': 'bo', // Approximate: Tibetan has several variants
    'Sundanese': 'su',
    'Swahili': 'sw',
    'Swati': 'ss',
    'Swedish': 'sv',
    'Tagalog': 'tl',
    'Tajik': 'tg',
    'Tamasheq (Latin script)': 'tmh', // Approximate: Tamasheq uses multiple scripts
    'Tamasheq (Tifinagh script)': 'tmh', // Approximate: Tamasheq uses multiple scripts
    'Tamil': 'ta',
    'Tatar': 'tt',
    'Ta’izzi-Adeni Arabic': 'acq', // Approximate: ISO 639-3 code for Ta'izzi-Adeni Arabic
    'Telugu': 'te',
    'Thai': 'th',
    'Tigrinya': 'ti',
    'Tok Pisin': 'tpi',
    'Tosk Albanian': 'sq', // Approximate: Albanian encompasses both Tosk and Gheg dialects
    'Tsonga': 'ts',
    'Tswana': 'tn',
    'Tumbuka': 'tum',
    'Tunisian Arabic': 'aeb', // Approximate: ISO 639-3 code for Tunisian Arabic
    'Turkish': 'tr',
    'Turkmen': 'tk',
    'Twi': 'tw',
    'Ukrainian': 'uk',
    'Umbundu': 'umb',
    'Urdu': 'ur',
    'Uyghur': 'ug',
    'Venetian': 'vec', // Approximate: ISO 639-3 code
    'Vietnamese': 'vi',
    'Waray': 'war',
    'Welsh': 'cy',
    'West Central Oromo': 'om', // Approximate: Oromo has several variants
    'Western Persian': 'fa', // Approximate: Western Persian is the most common variety of Persian
    'Wolof': 'wo',
    'Xhosa': 'xh',
    'Yoruba': 'yo',
    'Yue Chinese': 'yue', // Approximate: ISO 639-3 code
    'Zulu': 'zu'
};
//# sourceMappingURL=NLLBTextTranslation.js.map