echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
162 lines • 5.16 kB
JavaScript
import { request } from 'gaxios';
import * as FFMpegTranscoder from '../codecs/FFMpegTranscoder.js';
import { Logger } from '../utilities/Logger.js';
import { logToStderr } from '../utilities/Utilities.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { decodeBase64 } from '../encodings/Base64.js';
import { splitToWords } from '../nlp/Segmentation.js';
// Short alias for the `logToStderr` helper imported from '../utilities/Utilities.js'.
// It is not referenced by any of the definitions visible in this module.
const log = logToStderr;
/**
 * Synthesizes speech for `text` through the ElevenLabs text-to-speech
 * "with-timestamps" endpoint and, when the response carries character
 * timings, derives a word-level timeline from them.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} voiceId - ElevenLabs voice identifier; interpolated into the request URL.
 * @param {string} language - Language passed to `splitToWords` when segmenting the returned characters into words.
 * @param {object} [options] - Overrides merged over `defaultElevenLabsTTSOptions` via `extendDeep`.
 * @returns {Promise<{ rawAudio: any, timeline: (Array<{ type: 'word', text: string, startTime: number, endTime: number }> | undefined) }>}
 *   The decoded audio as returned by `FFMpegTranscoder.decodeToChannels`, and the word
 *   timeline (`undefined` when the response has no usable alignment data).
 * @throws Rethrows the original request error, after logging the response status and body when present.
 */
export async function synthesize(text, voiceId, language, options) {
	const logger = new Logger();

	logger.start('Request synthesis from ElevenLabs');

	const resolvedOptions = extendDeep(defaultElevenLabsTTSOptions, options);

	let response;

	try {
		response = await request({
			url: `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`,
			method: 'POST',

			headers: {
				// NOTE(review): the response body is parsed as JSON below, yet the request
				// advertises 'audio/mpeg'. Kept as-is to avoid changing what is sent; confirm
				// whether 'application/json' is the intended value.
				'accept': 'audio/mpeg',
				'xi-api-key': resolvedOptions.apiKey,
			},

			params: {
				output_format: 'mp3_44100_64',
			},

			data: {
				text,
				model_id: resolvedOptions.modelId,

				// The request field is `voice_settings` (plural). Under the singular
				// name the settings object is not recognized by the API.
				voice_settings: {
					stability: resolvedOptions.stability,
					similarity_boost: resolvedOptions.similarityBoost,
					style: resolvedOptions.style,
					use_speaker_boost: resolvedOptions.useSpeakerBoost,
				},

				seed: resolvedOptions.seed,
			},

			responseType: 'json',
		});
	} catch (e) {
		const failedResponse = e.response;

		if (failedResponse) {
			logger.log(`Request failed with status code ${failedResponse.status}`);

			if (failedResponse.data) {
				logger.log(`Server responded with:`);
				logger.log(failedResponse.data);
			}
		}

		throw e;
	}

	logger.start('Decode synthesized audio');

	const audioData = decodeBase64(response.data.audio_base64);
	const rawAudio = await FFMpegTranscoder.decodeToChannels(audioData);

	let timeline;

	const alignment = response.data.alignment;
	const characters = alignment?.characters;
	const characterStartTimes = alignment?.character_start_times_seconds;
	const characterEndTimes = alignment?.character_end_times_seconds;

	if (characters && characterStartTimes && characterEndTimes) {
		logger.start('Create timeline from returned character timings');

		const referenceText = characters.join('');
		const wordSequence = await splitToWords(referenceText, language);

		timeline = createWordTimelineFromCharacterTimings(
			wordSequence.entries,
			characters,
			characterStartTimes,
			characterEndTimes,
		);
	}

	logger.end();

	return { rawAudio, timeline };
}

/**
 * Builds word timeline entries from per-character timings.
 *
 * Word offsets refer to positions in `characters.join('')`, with `endOffset`
 * treated as exclusive. They are translated to indices into the timing arrays,
 * so an alignment entry longer than one UTF-16 unit does not shift later words.
 *
 * @param {Iterable<{ text: string, startOffset: number, endOffset: number }>} wordEntries
 * @param {Array<string>} characters - Alignment character entries.
 * @param {Array<number>} characterStartTimes - Start time for each alignment entry.
 * @param {Array<number>} characterEndTimes - End time for each alignment entry.
 * @returns {Array<{ type: 'word', text: string, startTime: number, endTime: number }>}
 */
function createWordTimelineFromCharacterTimings(wordEntries, characters, characterStartTimes, characterEndTimes) {
	// entryIndexAtOffset[i] is the index of the alignment entry covering offset i of the joined text.
	const entryIndexAtOffset = [];

	characters.forEach((character, entryIndex) => {
		// Mirror `Array.prototype.join`, which renders null/undefined entries as ''.
		const unitCount = character == null ? 0 : String(character).length;

		for (let i = 0; i < unitCount; i++) {
			entryIndexAtOffset.push(entryIndex);
		}
	});

	const wordTimeline = [];

	for (const wordEntry of wordEntries) {
		const firstOffset = wordEntry.startOffset;

		// The word's last character sits just before the exclusive end offset.
		// Never step back past the start for a zero-length entry.
		const lastOffset = Math.max(firstOffset, wordEntry.endOffset - 1);

		wordTimeline.push({
			type: 'word',
			text: wordEntry.text,
			startTime: characterStartTimes[entryIndexAtOffset[firstOffset]],
			endTime: characterEndTimes[entryIndexAtOffset[lastOffset]],
		});
	}

	return wordTimeline;
}
/**
 * Fetches the voices available to the given ElevenLabs API key and converts
 * them to this module's voice description objects.
 *
 * Each voice gets one English language code chosen from its `labels.accent`
 * ('en-US', 'en-GB', 'en-IE', 'en-AU', or plain 'en' when the accent is missing
 * or unrecognized), followed by every entry of `supportedLanguagesInMultilingualModels`.
 *
 * @param {string} apiKey - Value sent in the `xi-api-key` request header.
 * @returns {Promise<Array<{ name: string, languages: string[], gender: string, elevenLabsVoiceId: string }>>}
 * @throws {Error} When the response does not contain a `voices` array. Request errors propagate unchanged.
 */
export async function getVoiceList(apiKey) {
	const response = await request({
		method: 'GET',
		url: 'https://api.elevenlabs.io/v1/voices',

		headers: {
			// The header value is just the media type; the header name must not be repeated inside it.
			'accept': 'application/json',
			'xi-api-key': apiKey,
		},

		responseType: 'json',
	});

	const elevenLabsVoices = response.data?.voices;

	if (!Array.isArray(elevenLabsVoices)) {
		throw new Error('ElevenLabs voice list response did not contain a `voices` array');
	}

	return elevenLabsVoices.map((elevenLabsVoice) => {
		const gender = elevenLabsVoice?.labels?.gender ?? 'unknown';
		const accent = elevenLabsVoice?.labels?.accent?.toLowerCase() ?? '';

		let englishDialect;

		if (accent.startsWith('american')) {
			englishDialect = 'en-US';
		} else if (accent.startsWith('british')) {
			englishDialect = 'en-GB';
		} else if (accent === 'irish') {
			englishDialect = 'en-IE';
		} else if (accent === 'australian') {
			englishDialect = 'en-AU';
		} else {
			englishDialect = 'en';
		}

		return {
			name: elevenLabsVoice.name,
			languages: [englishDialect, ...supportedLanguagesInMultilingualModels],
			gender,
			elevenLabsVoiceId: elevenLabsVoice.voice_id,
		};
	});
}
// Default option values for `synthesize`. Caller-supplied options are merged
// over these with `extendDeep` before the request is built.
export const defaultElevenLabsTTSOptions = {
// Sent as the `xi-api-key` request header. No default; presumably must be supplied by the caller.
apiKey: undefined,
// Sent as `model_id` in the request body.
modelId: 'eleven_multilingual_v2',
// The next four values are sent as `stability`, `similarity_boost`, `style` and
// `use_speaker_boost` inside the voice settings object of the request body.
stability: 0.5,
similarityBoost: 0.5,
style: 0,
useSpeakerBoost: true,
// Sent as `seed` in the request body; `undefined` leaves it unspecified.
seed: undefined,
};
/**
 * Language codes that `getVoiceList` appends to every voice's `languages`
 * array, after the accent-derived English code. Going by the constant's name,
 * these are the non-English languages covered by the multilingual models.
 * Order is preserved as it determines the order of each voice's language list.
 */
export const supportedLanguagesInMultilingualModels = [
	'ja', 'zh', 'de', 'hi', 'fr', 'ko', 'pt', 'it',
	'es', 'id', 'nl', 'tr', 'fil', 'pl', 'sv', 'bg',
	'ro', 'ar', 'cs', 'el', 'fi', 'hr', 'ms', 'sk',
	'da', 'ta', 'uk', 'ru', 'hu', 'no', 'vi',
];
//# sourceMappingURL=ElevenLabsTTS.js.map