echogarden

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

import { extendDeep } from '../utilities/ObjectUtilities.js';
import { logToStderr } from '../utilities/Utilities.js';
import { ensureRawAudio, normalizeAudioLevelInPlace, trimAudioEnd } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import * as API from './API.js';
import { addWordTextOffsetsToTimelineInPlace, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js';
import { formatLanguageCodeWithName, getShortLanguageCode, normalizeIdentifierToLanguageCode, parseLangIdentifier } from '../utilities/Locale.js';
import chalk from 'chalk';

const log = logToStderr;

export async function alignTranslation(input, translatedTranscript, options) {
    const logger = new Logger();
    const startTimestamp = logger.getTimestamp();

    options = extendDeep(defaultTranslationAlignmentOptions, options);

    const inputRawAudio = await ensureRawAudio(input);

    let sourceRawAudio;
    let isolatedRawAudio;
    let backgroundRawAudio;

    // Optionally isolate the speech from background audio before alignment
    if (options.isolate) {
        logger.log(``);
        logger.end();

        ({ isolatedRawAudio, backgroundRawAudio } = await API.isolate(inputRawAudio, options.sourceSeparation));

        logger.end();
        logger.log(``);

        logger.start(`Resample audio to 16kHz mono`);
        sourceRawAudio = await ensureRawAudio(isolatedRawAudio, 16000, 1);
    }
    else {
        logger.start(`Resample audio to 16kHz mono`);
        sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1);
    }

    let sourceUncropTimeline;

    // Optionally crop silence using voice activity detection
    if (options.crop) {
        logger.start('Crop using voice activity detection');
        ({ timeline: sourceUncropTimeline, croppedRawAudio: sourceRawAudio } = await API.detectVoiceActivity(sourceRawAudio, options.vad));
        logger.end();
    }

    logger.start('Normalize and trim audio');
    normalizeAudioLevelInPlace(sourceRawAudio);
    sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0]);
    logger.end();

    // Use the given source language, or detect it from the audio
    let sourceLanguage;

    if (options.sourceLanguage) {
        const languageData = await parseLangIdentifier(options.sourceLanguage);
        sourceLanguage = languageData.Name;

        logger.end();
        logger.logTitledMessage('Source language specified', formatLanguageCodeWithName(sourceLanguage));
    }
    else {
        logger.start('No source language specified. Detect speech language');
        const { detectedLanguage } = await API.detectSpeechLanguage(sourceRawAudio, options.languageDetection || {});
        sourceLanguage = detectedLanguage;

        logger.end();
        logger.logTitledMessage('Source language detected', formatLanguageCodeWithName(detectedLanguage));
    }

    const targetLanguage = await normalizeIdentifierToLanguageCode(options.targetLanguage);
    logger.logTitledMessage('Target language', formatLanguageCodeWithName(targetLanguage));

    let mappedTimeline;

    switch (options.engine) {
        case 'whisper': {
            const WhisperSTT = await import('../recognition/WhisperSTT.js');

            const shortSourceLanguageCode = getShortLanguageCode(sourceLanguage);
            const shortTargetLanguageCode = getShortLanguageCode(targetLanguage);

            if (shortTargetLanguageCode != 'en') {
                throw new Error('Whisper translation only supports English as target language');
            }

            if (shortSourceLanguageCode == 'en' && shortTargetLanguageCode == 'en') {
                throw new Error('Both translation source and target languages are English');
            }

            const whisperAlignmentOptions = options.whisper;

            const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperAlignmentOptions.model, shortSourceLanguageCode);

            logger.end();

            if (modelName.endsWith('.en')) {
                throw new Error('Whisper translation tasks are only possible with a multilingual model');
            }

            mappedTimeline = await WhisperSTT.alignEnglishTranslation(sourceRawAudio, translatedTranscript, modelName, modelDir, shortSourceLanguageCode, whisperAlignmentOptions);

            break;
        }

        default: {
            throw new Error(`Engine '${options.engine}' is not supported`);
        }
    }

    // If the audio was cropped before recognition, map the timestamps back to the original audio
    if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
        API.convertCroppedToUncroppedTimeline(mappedTimeline, sourceUncropTimeline);
    }

    // Add text offsets
    addWordTextOffsetsToTimelineInPlace(mappedTimeline, translatedTranscript);

    // Make segment timeline
    const { segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(mappedTimeline, translatedTranscript, sourceLanguage, options.plainText?.paragraphBreaks, options.plainText?.whitespace);

    logger.end();
    logger.logDuration(`Total translation alignment time`, startTimestamp, chalk.magentaBright);

    return {
        timeline: segmentTimeline,
        wordTimeline: mappedTimeline,
        translatedTranscript,
        sourceLanguage,
        targetLanguage,
        inputRawAudio,
        isolatedRawAudio,
        backgroundRawAudio,
    };
}

export const defaultTranslationAlignmentOptions = {
    engine: 'whisper',
    sourceLanguage: undefined,
    targetLanguage: 'en',
    isolate: false,
    crop: true,
    languageDetection: {},
    plainText: {
        paragraphBreaks: 'double',
        whitespace: 'collapse'
    },
    subtitles: {},
    vad: {
        engine: 'adaptive-gate'
    },
    sourceSeparation: {},
    whisper: {}
};

export const translationAlignmentEngines = [
    {
        id: 'whisper',
        name: 'OpenAI Whisper',
        description: 'Extracts timestamps by guiding the Whisper recognition model to recognize the translated transcript tokens.',
        type: 'local'
    }
];

//# sourceMappingURL=TranslationAlignment.js.map
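For context, below is a minimal usage sketch of the alignTranslation function exported by this module (TranslationAlignment.js, per the source map reference). The top-level 'echogarden' import path and the use of a file path as the input are assumptions for illustration; the option names and defaults are taken from defaultTranslationAlignmentOptions above.

import * as Echogarden from 'echogarden';

// Align a pre-existing English translation against non-English speech audio.
// The input is passed through ensureRawAudio, so a file path is assumed to be accepted here.
const result = await Echogarden.alignTranslation(
    'speech-in-french.mp3',
    'This is the English translation of the spoken audio.',
    {
        engine: 'whisper',        // currently the only supported engine (see translationAlignmentEngines)
        sourceLanguage: 'fr',     // omit to auto-detect the spoken language
        targetLanguage: 'en',     // Whisper translation only supports an English target
        crop: true,               // crop with voice activity detection before aligning
        isolate: false,           // set to true to run source separation first
        whisper: {},              // model options; a multilingual (non-'.en') model is required
    });

// result.wordTimeline holds word-level timestamps for the translated transcript;
// result.timeline groups them into sentence and segment entries.
console.log(result.wordTimeline);

As the engine description above notes, alignment works by guiding Whisper's decoder with the translated transcript's tokens, which is why the code rejects English-only ('.en') models and any non-English target language.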