echogarden
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
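As a quick orientation before the source, here is a minimal usage sketch of the translateSpeech function defined below. It assumes the package's main entry point re-exports translateSpeech and accepts an audio file path as input; the option names and result fields are taken from the code that follows.

import * as Echogarden from 'echogarden'

// Illustrative sketch only; 'speech.mp3' is a hypothetical input file.
const result = await Echogarden.translateSpeech('speech.mp3', {
    engine: 'whisper',       // or 'whisper.cpp', 'openai-cloud'
    targetLanguage: 'en',    // all current engines only support an English target
    // sourceLanguage: 'fr', // optional; if omitted, the spoken language is auto-detected
})

console.log(result.transcript)   // translated transcript text
console.log(result.timeline)     // segment-level timeline
console.log(result.wordTimeline) // word-level timeline (engine-dependent)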
import chalk from 'chalk';
import * as API from './API.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { logToStderr } from '../utilities/Utilities.js';
import { ensureRawAudio, normalizeAudioLevelInPlace, trimAudioEnd } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { addWordTextOffsetsToTimelineInPlace, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js';
import { formatLanguageCodeWithName, getShortLanguageCode, normalizeIdentifierToLanguageCode, parseLangIdentifier } from '../utilities/Locale.js';
import { detectSpeechLanguage } from './API.js';
const log = logToStderr;
/////////////////////////////////////////////////////////////////////////////////////////////
// Speech translation
/////////////////////////////////////////////////////////////////////////////////////////////
export async function translateSpeech(input, options, onPart) {
const logger = new Logger();
const startTimestamp = logger.getTimestamp();
options = extendDeep(defaultSpeechTranslationOptions, options);
const inputRawAudio = await ensureRawAudio(input);
let sourceRawAudio;
let isolatedRawAudio;
let backgroundRawAudio;
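// When isolation is enabled, separate the speech from the background audio and use only the isolated track for recognition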
if (options.isolate) {
logger.log(``);
logger.end();
({ isolatedRawAudio, backgroundRawAudio } = await API.isolate(inputRawAudio, options.sourceSeparation));
logger.end();
logger.log(``);
logger.start(`Resample audio to 16kHz mono`);
sourceRawAudio = await ensureRawAudio(isolatedRawAudio, 16000, 1);
}
else {
logger.start(`Resample audio to 16kHz mono`);
sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1);
}
let sourceUncropTimeline;
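// When cropping is enabled, remove non-speech regions using voice activity detection; the uncrop timeline is kept so timestamps can later be mapped back to the original audio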
if (options.crop) {
logger.start('Crop using voice activity detection');
({ timeline: sourceUncropTimeline, croppedRawAudio: sourceRawAudio } = await API.detectVoiceActivity(sourceRawAudio, options.vad));
logger.end();
}
logger.start('Normalize and trim audio');
normalizeAudioLevelInPlace(sourceRawAudio);
sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0]);
if (options.sourceLanguage) {
const languageData = await parseLangIdentifier(options.sourceLanguage);
options.sourceLanguage = languageData.Name;
logger.end();
logger.logTitledMessage('Source language specified', formatLanguageCodeWithName(options.sourceLanguage));
}
else {
logger.start('No source language specified. Detect speech language');
const { detectedLanguage } = await detectSpeechLanguage(sourceRawAudio, options.languageDetection || {});
options.sourceLanguage = detectedLanguage;
logger.end();
logger.logTitledMessage('Source language detected', formatLanguageCodeWithName(detectedLanguage));
}
options.targetLanguage = await normalizeIdentifierToLanguageCode(options.targetLanguage);
logger.logTitledMessage('Target language', formatLanguageCodeWithName(options.targetLanguage));
logger.start('Preprocess audio for translation');
const engine = options.engine;
const sourceLanguage = options.sourceLanguage;
const targetLanguage = options.targetLanguage;
let transcript;
let wordTimeline;
let segmentTimeline;
logger.start(`Load ${engine} module`);
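// Run the selected engine. Engine modules are imported lazily, only when actually used.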
switch (engine) {
case 'whisper': {
const WhisperSTT = await import('../recognition/WhisperSTT.js');
const whisperOptions = options.whisper;
const shortSourceLanguageCode = getShortLanguageCode(sourceLanguage);
const shortTargetLanguageCode = getShortLanguageCode(targetLanguage);
const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperOptions.model, shortSourceLanguageCode);
if (shortTargetLanguageCode != 'en') {
throw new Error('Whisper translation only supports English as target language');
}
if (modelName.endsWith('.en')) {
throw new Error('Whisper translation tasks are only possible with a multilingual model');
}
if (shortSourceLanguageCode == 'en' && shortTargetLanguageCode == 'en') {
throw new Error('Both translation source and target languages are English');
}
logger.end();
({ transcript, timeline: wordTimeline } = await WhisperSTT.recognize(sourceRawAudio, modelName, modelDir, 'translate', sourceLanguage, whisperOptions, onPart));
break;
}
case 'whisper.cpp': {
const WhisperCppSTT = await import('../recognition/WhisperCppSTT.js');
const whisperCppOptions = options.whisperCpp;
const shortSourceLanguageCode = getShortLanguageCode(sourceLanguage);
const shortTargetLanguageCode = getShortLanguageCode(targetLanguage);
logger.end();
const { modelName, modelPath } = await WhisperCppSTT.loadModelPackage(whisperCppOptions.model, shortSourceLanguageCode);
if (shortTargetLanguageCode != 'en') {
throw new Error('Whisper.cpp translation only supports English as target language');
}
if (modelName.endsWith('.en')) {
throw new Error('Whisper.cpp translation tasks are only possible with a multilingual model');
}
logger.end();
({ transcript, timeline: wordTimeline } = await WhisperCppSTT.recognize(sourceRawAudio, 'translate', shortSourceLanguageCode, modelName, modelPath, whisperCppOptions));
break;
}
case 'openai-cloud': {
const OpenAICloudSTT = await import('../recognition/OpenAICloudSTT.js');
const openAICloudSTTOptions = options.openAICloud;
if (!openAICloudSTTOptions.apiKey) {
throw new Error(`No OpenAI Cloud API key provided`);
}
const shortSourceLanguageCode = getShortLanguageCode(sourceLanguage);
const shortTargetLanguageCode = getShortLanguageCode(targetLanguage);
if (shortTargetLanguageCode != 'en') {
throw new Error('OpenAI cloud speech translation only supports English as target language');
}
logger.end();
({ transcript, timeline: segmentTimeline } = await OpenAICloudSTT.recognize(sourceRawAudio, shortSourceLanguageCode, openAICloudSTTOptions, 'translate'));
break;
}
default: {
throw new Error(`Engine '${options.engine}' is not supported`);
}
}
logger.end();
// If the audio was cropped before recognition, map the timestamps back to the original audio
if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
if (wordTimeline) {
API.convertCroppedToUncroppedTimeline(wordTimeline, sourceUncropTimeline);
}
else if (segmentTimeline) {
API.convertCroppedToUncroppedTimeline(segmentTimeline, sourceUncropTimeline);
}
}
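// Compute the character offset of each word within the transcript text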
if (wordTimeline) {
addWordTextOffsetsToTimelineInPlace(wordTimeline, transcript);
}
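// Engines that return only a word timeline: derive segment and sentence entries from it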
if (!segmentTimeline) {
({ segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(wordTimeline, transcript, targetLanguage, 'single', 'preserve'));
}
logger.log('');
logger.logDuration(`Total speech translation time`, startTimestamp, chalk.magentaBright);
return {
transcript,
timeline: segmentTimeline,
wordTimeline,
sourceLanguage,
targetLanguage,
inputRawAudio,
isolatedRawAudio,
backgroundRawAudio,
};
}
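// Default speech translation options; user-provided options are deep-merged on top of these via extendDeep (see translateSpeech above)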
export const defaultSpeechTranslationOptions = {
engine: 'whisper',
sourceLanguage: undefined,
targetLanguage: 'en',
crop: true,
isolate: false,
languageDetection: undefined,
subtitles: {},
vad: {
engine: 'adaptive-gate'
},
whisper: {},
whisperCpp: {},
openAICloud: {},
};
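// Descriptions of the available speech translation engines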
export const speechTranslationEngines = [
{
id: 'whisper',
name: 'OpenAI Whisper',
description: `Uses Whisper's speech translation capability to produce an English transcript from speech in a different language.`,
type: 'local'
},
{
id: 'whisper.cpp',
name: 'OpenAI Whisper (C++ port)',
description: `Uses Whisper's speech translation capability to produce an English transcript from speech in a different language.`,
type: 'local'
},
{
id: 'openai-cloud',
name: 'OpenAI Cloud',
description: 'Speech translation cloud service provided by OpenAI. Only supports English as the target language.',
type: 'cloud'
}
];
//# sourceMappingURL=SpeechTranslation.js.map