/*
 * @just-every/ensemble
 * LLM provider abstraction layer with a unified streaming interface.
 * (Compiled JavaScript output.)
 */
import { getModelFromAgent, getModelProvider } from '../model_providers/model_provider.js';
import { findModel } from '../data/model_data.js';
// Sample rates (Hz) implied by ElevenLabs-style `pcm_*` format strings.
const PCM_FORMAT_SAMPLE_RATES = {
    pcm_16000: 16000,
    pcm_22050: 22050,
    pcm_44100: 44100,
};

/**
 * Resolve the PCM sample rate for a TTS stream.
 * Gemini and OpenAI always emit 24 kHz PCM; otherwise the rate is derived
 * from the requested `pcm_*` format, defaulting to 24 kHz.
 * @param {boolean} isGemini - Model is a Gemini voice model.
 * @param {boolean} isOpenAI - Model provider is OpenAI.
 * @param {string} format - Requested response format (e.g. 'pcm_22050').
 * @returns {number} Sample rate in Hz.
 */
function resolveSampleRate(isGemini, isOpenAI, format) {
    if (isGemini || isOpenAI) {
        return 24000;
    }
    return PCM_FORMAT_SAMPLE_RATES[format] ?? 24000;
}

/**
 * Map an audio format identifier to its MIME type.
 * @param {string} fmt - Format identifier (e.g. 'mp3', 'wav', 'pcm_24000').
 * @returns {string} MIME type; unknown formats fall back to 'audio/mpeg'.
 */
function getMimeType(fmt) {
    const mimeTypes = {
        mp3: 'audio/mpeg',
        opus: 'audio/opus',
        aac: 'audio/aac',
        flac: 'audio/flac',
        wav: 'audio/wav',
        pcm: 'audio/pcm',
        pcm_16000: 'audio/pcm',
        pcm_22050: 'audio/pcm',
        pcm_24000: 'audio/pcm',
        pcm_44100: 'audio/pcm',
    };
    return mimeTypes[fmt] || 'audio/mpeg';
}

/**
 * Build a 44-byte RIFF/WAVE header for 16-bit mono PCM.
 * The data-size field is a near-max sentinel (0x7ffffffe) because the true
 * stream length is unknown while streaming; players treat it as "read until
 * end of stream".
 * @param {number} sampleRate - PCM sample rate in Hz.
 * @returns {Uint8Array} The 44-byte WAV header.
 */
function buildWavHeader(sampleRate) {
    const dataSize = 0x7ffffffe;
    const wavHeader = new ArrayBuffer(44);
    const view = new DataView(wavHeader);
    const setString = (offset, str) => {
        for (let i = 0; i < str.length; i++) {
            view.setUint8(offset + i, str.charCodeAt(i));
        }
    };
    setString(0, 'RIFF');
    view.setUint32(4, dataSize + 36, true); // RIFF chunk size = data + 36
    setString(8, 'WAVE');
    setString(12, 'fmt ');
    view.setUint32(16, 16, true); // fmt chunk size
    view.setUint16(20, 1, true); // audio format: 1 = PCM
    view.setUint16(22, 1, true); // channels: mono
    view.setUint32(24, sampleRate, true);
    view.setUint32(28, sampleRate * 2, true); // byte rate (16-bit mono)
    view.setUint16(32, 2, true); // block align (channels * bytes/sample)
    view.setUint16(34, 16, true); // bits per sample
    setString(36, 'data');
    view.setUint32(40, dataSize, true);
    return new Uint8Array(wavHeader);
}

/**
 * Concatenate two byte arrays into a new Uint8Array.
 * @param {Uint8Array} a
 * @param {Uint8Array} b
 * @returns {Uint8Array} A fresh array containing `a` followed by `b`.
 */
function concatBytes(a, b) {
    const out = new Uint8Array(a.length + b.length);
    out.set(a);
    out.set(b, a.length);
    return out;
}

/**
 * Generate speech for `text` with the agent's voice model and stream the
 * audio back as base64-encoded chunk events.
 *
 * Emitted events (in order):
 *  1. `format_info`  — effective format, MIME type, streaming capability and,
 *     for PCM/WAV payloads, the PCM parameters.
 *  2. `audio_stream` (no `data`) — stream-start marker with the same format info.
 *  3. `audio_stream` chunks — `chunkIndex`, base64 `data`, `isFinalChunk`.
 *  4. `cost_update` — best-effort usage report, if the cost tracker is available.
 *
 * Gemini and ElevenLabs raw-PCM streams are wrapped in a synthesized WAV
 * header so downstream players receive a valid container.
 *
 * @param {string} text - Text to synthesize.
 * @param {object} agent - Agent definition used to select the voice model.
 * @param {object} [options] - Provider options; `response_format` selects the
 *     audio format (default 'mp3').
 * @yields {object} Event objects as described above.
 * @throws {Error} If the provider cannot be initialized, does not support
 *     voice generation, or returns a non-streaming response.
 */
export async function* ensembleVoice(text, agent, options = {}) {
    const streamOptions = { ...options, stream: true };
    const format = options.response_format || 'mp3';
    const isPCM = format.includes('pcm');
    const model = await getModelFromAgent(agent, 'voice');
    const isGemini = model.startsWith('gemini');
    const isElevenLabs = model.startsWith('eleven_');
    const startTime = Date.now();
    let firstByteTime = null; // time-to-first-byte, recorded for diagnostics
    console.log(`[ensembleVoice] Starting TTS generation with model: ${model}`);
    const modelInfo = findModel(model);
    const isOpenAI = modelInfo?.provider === 'openai';
    const isWav = format === 'wav';
    // Gemini and ElevenLabs-PCM payloads are delivered as WAV (header added below).
    const effectiveFormat = isGemini || (isElevenLabs && isPCM) ? 'wav' : format;
    const supportsStreaming = effectiveFormat === 'wav' || effectiveFormat.includes('pcm');
    // PCM parameters are advertised whenever the payload is raw or WAV PCM.
    const pcmInfo = isPCM || isGemini || (isOpenAI && isWav)
        ? {
            pcmParameters: {
                sampleRate: resolveSampleRate(isGemini, isOpenAI, format),
                channels: 1,
                bitDepth: 16,
            },
        }
        : {};
    yield {
        type: 'format_info',
        timestamp: new Date().toISOString(),
        format: effectiveFormat,
        mimeType: getMimeType(effectiveFormat),
        supportsStreaming,
        ...pcmInfo,
    };
    // Stream-start marker: same shape as chunk events but carries no data.
    yield {
        type: 'audio_stream',
        timestamp: new Date().toISOString(),
        format: effectiveFormat,
        ...pcmInfo,
    };
    let provider;
    try {
        provider = getModelProvider(model);
    }
    catch (error) {
        throw new Error(`Failed to initialize provider for model ${model}: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    if (!provider.createVoice) {
        throw new Error(`Provider for model ${model} does not support voice generation`);
    }
    let result;
    try {
        const providerStartTime = Date.now();
        result = await provider.createVoice(text, model, streamOptions);
        const providerTime = Date.now() - providerStartTime;
        console.log(`[ensembleVoice] Got result from provider in ${providerTime}ms:`, typeof result, result instanceof ReadableStream ? 'ReadableStream' : 'ArrayBuffer');
    }
    catch (error) {
        console.error('[ensembleVoice] Error calling provider.createVoice:', error);
        throw error;
    }
    if (!(result instanceof ReadableStream)) {
        throw new Error('Expected streaming response but got buffer');
    }
    const reader = result.getReader();
    const CHUNK_SIZE = 8192;
    let buffer = new Uint8Array(0);
    let chunkIndex = 0;
    // Sources that emit raw PCM (or possibly header-less WAV) need WAV handling.
    const needsWavHandling = isGemini || (isElevenLabs && isPCM) || (isOpenAI && isWav);
    if (needsWavHandling) {
        // NOTE: named `providerName` (was `provider`) to stop shadowing the
        // provider instance declared above.
        const providerName = isGemini ? 'Gemini' : isOpenAI ? 'OpenAI' : 'ElevenLabs';
        const sampleRate = resolveSampleRate(isGemini, isOpenAI, format);
        console.log(`[ensembleVoice] ${providerName}: Will stream with sample rate ${sampleRate}Hz`);
        // Gemini benefits from larger chunks; others use the default size.
        const providerChunkSize = isGemini ? 32768 : CHUNK_SIZE;
        let isFirstChunk = true;
        while (true) {
            const { done, value } = await reader.read();
            if (value) {
                if (!firstByteTime) {
                    firstByteTime = Date.now();
                }
                buffer = concatBytes(buffer, value);
            }
            // Inspect the first 4 bytes once: if the provider did not send a
            // RIFF header, synthesize one so players receive valid WAV.
            if (isFirstChunk && buffer.length >= 4) {
                const magic = new TextDecoder().decode(buffer.slice(0, 4));
                isFirstChunk = false;
                if (magic !== 'RIFF') {
                    buffer = concatBytes(buildWavHeader(sampleRate), buffer);
                }
            }
            // Emit full-size chunks; once the stream is done, flush the remainder.
            while (buffer.length >= providerChunkSize || (done && buffer.length > 0)) {
                const chunkSize = Math.min(providerChunkSize, buffer.length);
                const chunk = buffer.slice(0, chunkSize);
                buffer = buffer.slice(chunkSize);
                const base64Chunk = Buffer.from(chunk).toString('base64');
                const isFinalChunk = done && buffer.length === 0;
                yield {
                    type: 'audio_stream',
                    chunkIndex: chunkIndex++,
                    isFinalChunk: isFinalChunk,
                    data: base64Chunk,
                    timestamp: new Date().toISOString(),
                };
                if (isFinalChunk)
                    break;
            }
            if (done)
                break;
        }
    }
    else {
        // Pass-through path: re-chunk the provider stream into fixed-size
        // base64 chunks without modifying the bytes.
        while (true) {
            const { done, value } = await reader.read();
            if (value) {
                if (!firstByteTime) {
                    firstByteTime = Date.now();
                }
                buffer = concatBytes(buffer, value);
            }
            while (buffer.length >= CHUNK_SIZE || (done && buffer.length > 0)) {
                const chunkSize = Math.min(CHUNK_SIZE, buffer.length);
                const chunk = buffer.slice(0, chunkSize);
                buffer = buffer.slice(chunkSize);
                const base64Chunk = Buffer.from(chunk).toString('base64');
                const isFinalChunk = done && buffer.length === 0;
                yield {
                    type: 'audio_stream',
                    chunkIndex: chunkIndex++,
                    isFinalChunk: isFinalChunk,
                    data: base64Chunk,
                    timestamp: new Date().toISOString(),
                };
                if (isFinalChunk)
                    break;
            }
            if (done)
                break;
        }
    }
    const totalTime = Date.now() - startTime;
    console.log(`[ensembleVoice] ${model}: Total generation time: ${totalTime}ms`);
    // Best-effort cost reporting; the tracker module may be absent in some builds.
    // NOTE(review): this reports the tracker's most recent entry, which is
    // assumed to belong to this call — verify against the tracker's semantics.
    try {
        const costTrackerModule = await import('../utils/cost_tracker.js');
        const costTracker = costTrackerModule.costTracker;
        const usage = costTracker.entries || [];
        if (usage.length > 0) {
            const latestUsage = usage[usage.length - 1];
            yield {
                type: 'cost_update',
                usage: latestUsage,
            };
        }
    }
    catch (error) {
        console.debug('Cost tracking not available:', error);
    }
}
//# sourceMappingURL=ensemble_voice.js.map