echogarden: SpeechTranslation.js

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

import chalk from 'chalk';
import * as API from './API.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { logToStderr } from '../utilities/Utilities.js';
import { ensureRawAudio, normalizeAudioLevelInPlace, trimAudioEnd } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { addWordTextOffsetsToTimelineInPlace, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js';
import { formatLanguageCodeWithName, getShortLanguageCode, normalizeIdentifierToLanguageCode, parseLangIdentifier } from '../utilities/Locale.js';
import { detectSpeechLanguage } from './API.js';

const log = logToStderr;

/////////////////////////////////////////////////////////////////////////////////////////////
// Speech translation
/////////////////////////////////////////////////////////////////////////////////////////////
export async function translateSpeech(input, options, onPart) {
    const logger = new Logger();
    const startTimestamp = logger.getTimestamp();

    options = extendDeep(defaultSpeechTranslationOptions, options);

    const inputRawAudio = await ensureRawAudio(input);

    let sourceRawAudio;
    let isolatedRawAudio;
    let backgroundRawAudio;

    // Optionally isolate the speech from background audio using source separation
    if (options.isolate) {
        logger.log(``);
        logger.end();

        ({ isolatedRawAudio, backgroundRawAudio } = await API.isolate(inputRawAudio, options.sourceSeparation));

        logger.end();
        logger.log(``);

        logger.start(`Resample audio to 16kHz mono`);
        sourceRawAudio = await ensureRawAudio(isolatedRawAudio, 16000, 1);
    } else {
        logger.start(`Resample audio to 16kHz mono`);
        sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1);
    }

    // Optionally crop the audio to voiced regions using voice activity detection
    let sourceUncropTimeline;

    if (options.crop) {
        logger.start('Crop using voice activity detection');
        ({ timeline: sourceUncropTimeline, croppedRawAudio: sourceRawAudio } = await API.detectVoiceActivity(sourceRawAudio, options.vad));
        logger.end();
    }

    logger.start('Normalize and trim audio');
    normalizeAudioLevelInPlace(sourceRawAudio);
    sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0]);

    // Resolve the source language: use the given one, or detect it from the audio
    if (options.sourceLanguage) {
        const languageData = await parseLangIdentifier(options.sourceLanguage);
        options.sourceLanguage = languageData.Name;

        logger.end();
        logger.logTitledMessage('Source language specified', formatLanguageCodeWithName(options.sourceLanguage));
    } else {
        logger.start('No source language specified. Detect speech language');
        const { detectedLanguage } = await detectSpeechLanguage(sourceRawAudio, options.languageDetection || {});
        options.sourceLanguage = detectedLanguage;

        logger.end();
        logger.logTitledMessage('Source language detected', formatLanguageCodeWithName(detectedLanguage));
    }

    options.targetLanguage = await normalizeIdentifierToLanguageCode(options.targetLanguage);
    logger.logTitledMessage('Target language', formatLanguageCodeWithName(options.targetLanguage));

    logger.start('Preprocess audio for translation');

    const engine = options.engine;
    const sourceLanguage = options.sourceLanguage;
    const targetLanguage = options.targetLanguage;

    let transcript;
    let wordTimeline;
    let segmentTimeline;

    logger.start(`Load ${engine} module`);

    // Run the selected speech translation engine
    switch (engine) {
        case 'whisper': {
            const WhisperSTT = await import('../recognition/WhisperSTT.js');

            const whisperOptions = options.whisper;

            const shortSourceLanguageCode = getShortLanguageCode(sourceLanguage);
            const shortTargetLanguageCode = getShortLanguageCode(targetLanguage);

            const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperOptions.model, shortSourceLanguageCode);

            if (shortTargetLanguageCode != 'en') {
                throw new Error('Whisper translation only supports English as target language');
            }

            if (modelName.endsWith('.en')) {
                throw new Error('Whisper translation tasks are only possible with a multilingual model');
            }

            if (shortSourceLanguageCode == 'en' && shortTargetLanguageCode == 'en') {
                throw new Error('Both translation source and target languages are English');
            }

            logger.end();

            ({ transcript, timeline: wordTimeline } = await WhisperSTT.recognize(sourceRawAudio, modelName, modelDir, 'translate', sourceLanguage, whisperOptions, onPart));

            break;
        }

        case 'whisper.cpp': {
            const WhisperCppSTT = await import('../recognition/WhisperCppSTT.js');

            const whisperCppOptions = options.whisperCpp;

            const shortSourceLanguageCode = getShortLanguageCode(sourceLanguage);
            const shortTargetLanguageCode = getShortLanguageCode(targetLanguage);

            logger.end();

            const { modelName, modelPath } = await WhisperCppSTT.loadModelPackage(whisperCppOptions.model, shortSourceLanguageCode);

            if (shortTargetLanguageCode != 'en') {
                throw new Error('Whisper.cpp translation only supports English as target language');
            }

            if (modelName.endsWith('.en')) {
                throw new Error('Whisper.cpp translation tasks are only possible with a multilingual model');
            }

            logger.end();

            ({ transcript, timeline: wordTimeline } = await WhisperCppSTT.recognize(sourceRawAudio, 'translate', shortSourceLanguageCode, modelName, modelPath, whisperCppOptions));

            break;
        }

        case 'openai-cloud': {
            const OpenAICloudSTT = await import('../recognition/OpenAICloudSTT.js');

            const openAICloudSTTOptions = options.openAICloud;

            if (!openAICloudSTTOptions.apiKey) {
                throw new Error(`No OpenAI Cloud API key provided`);
            }

            const shortSourceLanguageCode = getShortLanguageCode(sourceLanguage);
            const shortTargetLanguageCode = getShortLanguageCode(targetLanguage);

            if (shortTargetLanguageCode != 'en') {
                throw new Error('OpenAI cloud speech translation only supports English as target language');
            }

            logger.end();

            ({ transcript, timeline: segmentTimeline } = await OpenAICloudSTT.recognize(sourceRawAudio, shortSourceLanguageCode, openAICloudSTTOptions, 'translate'));

            break;
        }

        default: {
            throw new Error(`Engine '${options.engine}' is not supported`);
        }
    }

    logger.end();

    // If the audio was cropped before recognition, map the timestamps back to the original audio
    if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
        if (wordTimeline) {
            API.convertCroppedToUncroppedTimeline(wordTimeline, sourceUncropTimeline);
        } else if (segmentTimeline) {
            API.convertCroppedToUncroppedTimeline(segmentTimeline, sourceUncropTimeline);
        }
    }

    if (wordTimeline) {
        addWordTextOffsetsToTimelineInPlace(wordTimeline, transcript);
    }

    if (!segmentTimeline) {
        ({ segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(wordTimeline, transcript, targetLanguage, 'single', 'preserve'));
    }

    logger.log('');
    logger.logDuration(`Total speech translation time`, startTimestamp, chalk.magentaBright);

    return {
        transcript,
        timeline: segmentTimeline,
        wordTimeline,
        sourceLanguage,
        targetLanguage,
        inputRawAudio,
        isolatedRawAudio,
        backgroundRawAudio,
    };
}

export const defaultSpeechTranslationOptions = {
    engine: 'whisper',
    sourceLanguage: undefined,
    targetLanguage: 'en',
    crop: true,
    isolate: false,
    languageDetection: undefined,
    subtitles: {},
    vad: {
        engine: 'adaptive-gate'
    },
    whisper: {},
    whisperCpp: {},
    openAICloud: {},
};

export const speechTranslationEngines = [
    {
        id: 'whisper',
        name: 'OpenAI Whisper',
        description: `Uses Whisper's speech translation capability to produce an English transcript from speech in a different language.`,
        type: 'local'
    },
    {
        id: 'whisper.cpp',
        name: 'OpenAI Whisper (C++ port)',
        description: `Uses Whisper's speech translation capability to produce an English transcript from speech in a different language.`,
        type: 'local'
    },
    {
        id: 'openai-cloud',
        name: 'OpenAI Cloud',
        description: 'Speech translation cloud service provided by OpenAI. Only supports English as the target language.',
        type: 'cloud'
    }
];
//# sourceMappingURL=SpeechTranslation.js.map
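Below is a minimal usage sketch, not part of the module above, showing how translateSpeech might be called. It assumes the echogarden package re-exports translateSpeech from its top-level entry point, and the input path 'speech.mp3' is hypothetical; the function loads input through ensureRawAudio, so a file path or pre-decoded raw audio are both plausible inputs, depending on the package version.

import { translateSpeech } from 'echogarden';

// Translate speech in another language to an English transcript using the
// default local 'whisper' engine. sourceLanguage is omitted, so the function
// detects it automatically before translating.
const result = await translateSpeech('speech.mp3', {
    engine: 'whisper',
    targetLanguage: 'en', // the Whisper engines only accept English as the target
    crop: true,           // crop to voiced regions with VAD before recognition
    isolate: false,       // set to true to separate speech from background audio first
});

console.log(`Translated from ${result.sourceLanguage} to ${result.targetLanguage}:`);
console.log(result.transcript);

// result.timeline holds segment-level timing; result.wordTimeline (when the
// engine produces one) holds word-level timing mapped back to the input audio.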