UNPKG

@just-every/ensemble

Version:

LLM provider abstraction layer with unified streaming interface

248 lines 10.1 kB
import { getModelFromAgent, getModelProvider } from '../model_providers/model_provider.js';
import { findModel } from '../data/model_data.js';

// MIME type for each supported output format; unknown formats fall back to audio/mpeg.
const MIME_TYPES = {
    mp3: 'audio/mpeg',
    opus: 'audio/opus',
    aac: 'audio/aac',
    flac: 'audio/flac',
    wav: 'audio/wav',
    pcm: 'audio/pcm',
    pcm_16000: 'audio/pcm',
    pcm_22050: 'audio/pcm',
    pcm_24000: 'audio/pcm',
    pcm_44100: 'audio/pcm',
};

/**
 * Resolve the PCM sample rate for the emitted audio.
 * Gemini and OpenAI voice output is treated as 24 kHz; for other providers
 * the rate is encoded in the requested format string (e.g. 'pcm_22050').
 *
 * @param {string} format - Requested response format.
 * @param {boolean} isGemini - Model name starts with 'gemini'.
 * @param {boolean} isOpenAI - Model's provider is 'openai'.
 * @returns {number} Sample rate in Hz (default 24000).
 */
function resolveSampleRate(format, isGemini, isOpenAI) {
    if (isGemini || isOpenAI) return 24000;
    if (format === 'pcm_44100') return 44100;
    if (format === 'pcm_22050') return 22050;
    if (format === 'pcm_16000') return 16000;
    return 24000;
}

/**
 * Build a 44-byte WAV (RIFF) header for 16-bit mono PCM.
 * The data size is a near-maximum placeholder (0x7ffffffe) because the true
 * stream length is unknown while streaming.
 *
 * @param {number} sampleRate - Sample rate in Hz.
 * @returns {Uint8Array} The synthesized header bytes.
 */
function buildWavHeader(sampleRate) {
    const dataSize = 0x7ffffffe;
    const headerBytes = new ArrayBuffer(44);
    const view = new DataView(headerBytes);
    const writeAscii = (offset, str) => {
        for (let i = 0; i < str.length; i++) {
            view.setUint8(offset + i, str.charCodeAt(i));
        }
    };
    writeAscii(0, 'RIFF');
    view.setUint32(4, dataSize + 36, true); // RIFF chunk size = data + 36
    writeAscii(8, 'WAVE');
    writeAscii(12, 'fmt ');
    view.setUint32(16, 16, true); // fmt sub-chunk size
    view.setUint16(20, 1, true); // audio format: PCM
    view.setUint16(22, 1, true); // channels: mono
    view.setUint32(24, sampleRate, true); // sample rate
    view.setUint32(28, sampleRate * 2, true); // byte rate (mono, 16-bit)
    view.setUint16(32, 2, true); // block align
    view.setUint16(34, 16, true); // bits per sample
    writeAscii(36, 'data');
    view.setUint32(40, dataSize, true); // data sub-chunk size (placeholder)
    return new Uint8Array(headerBytes);
}

/**
 * Concatenate two byte arrays into a new Uint8Array.
 *
 * @param {Uint8Array} base
 * @param {Uint8Array} extra
 * @returns {Uint8Array}
 */
function concatBytes(base, extra) {
    const joined = new Uint8Array(base.length + extra.length);
    joined.set(base);
    joined.set(extra, base.length);
    return joined;
}

/**
 * Drain `state.buffer` into base64 'audio_stream' events of `chunkSize`
 * bytes. While the stream is still open, only full-size chunks are emitted;
 * when `done` is true the remaining tail (possibly shorter) is flushed and
 * the last event is marked `isFinalChunk`. Mutates `state.buffer` and
 * `state.chunkIndex`.
 *
 * @param {{buffer: Uint8Array, chunkIndex: number}} state - Shared stream state.
 * @param {number} chunkSize - Target chunk size in bytes.
 * @param {boolean} done - Whether the upstream reader is exhausted.
 * @yields {object} 'audio_stream' chunk events.
 */
function* drainChunks(state, chunkSize, done) {
    while (state.buffer.length >= chunkSize || (done && state.buffer.length > 0)) {
        const size = Math.min(chunkSize, state.buffer.length);
        const chunk = state.buffer.slice(0, size);
        state.buffer = state.buffer.slice(size);
        const isFinalChunk = done && state.buffer.length === 0;
        yield {
            type: 'audio_stream',
            chunkIndex: state.chunkIndex++,
            isFinalChunk: isFinalChunk,
            data: Buffer.from(chunk).toString('base64'),
            timestamp: new Date().toISOString(),
        };
        if (isFinalChunk) break;
    }
}

/**
 * Generate text-to-speech audio for `text` and stream it as ensemble events.
 *
 * Emits, in order: one 'format_info' event describing the effective format
 * and MIME type; one data-less 'audio_stream' announcement event; a sequence
 * of base64 'audio_stream' chunk events (the last marked `isFinalChunk`);
 * and finally a best-effort 'cost_update' event.
 *
 * Gemini and ElevenLabs-PCM streams are raw headerless PCM, so a synthetic
 * WAV header is prepended and the announced format becomes 'wav'.
 *
 * @param {string} text - Text to synthesize.
 * @param {object} agent - Agent used to select the voice model.
 * @param {object} [options] - Provider options; `options.response_format`
 *   selects the output format (default 'mp3').
 * @yields {object} Streaming TTS events as described above.
 * @throws {Error} If the provider cannot be initialized, does not support
 *   voice generation, or returns a non-streaming response.
 */
export async function* ensembleVoice(text, agent, options = {}) {
    const streamOptions = { ...options, stream: true };
    const format = options.response_format || 'mp3';
    const isPCM = format.includes('pcm');
    const model = await getModelFromAgent(agent, 'voice');
    const isGemini = model.startsWith('gemini');
    const isElevenLabs = model.startsWith('eleven_');
    const startTime = Date.now();
    console.log(`[ensembleVoice] Starting TTS generation with model: ${model}`);
    const modelInfo = findModel(model);
    const isOpenAI = modelInfo?.provider === 'openai';
    const isWav = format === 'wav';
    // Raw-PCM providers get wrapped in a WAV container, so announce 'wav'.
    const effectiveFormat = isGemini || (isElevenLabs && isPCM) ? 'wav' : format;
    const supportsStreaming = effectiveFormat === 'wav' || effectiveFormat.includes('pcm');
    // Attach PCM decode parameters whenever the payload is raw 16-bit PCM.
    const pcmInfo =
        isPCM || isGemini || (isOpenAI && isWav)
            ? {
                  pcmParameters: {
                      sampleRate: resolveSampleRate(format, isGemini, isOpenAI),
                      channels: 1,
                      bitDepth: 16,
                  },
              }
            : {};
    yield {
        type: 'format_info',
        timestamp: new Date().toISOString(),
        format: effectiveFormat,
        mimeType: MIME_TYPES[effectiveFormat] || 'audio/mpeg',
        supportsStreaming,
        ...pcmInfo,
    };
    // Data-less announcement event — presumably consumed by clients as a
    // "stream starting" signal before the first chunk; verify against callers.
    yield {
        type: 'audio_stream',
        timestamp: new Date().toISOString(),
        format: effectiveFormat,
        ...pcmInfo,
    };
    let provider;
    try {
        provider = getModelProvider(model);
    } catch (error) {
        throw new Error(`Failed to initialize provider for model ${model}: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
    if (!provider.createVoice) {
        throw new Error(`Provider for model ${model} does not support voice generation`);
    }
    let result;
    try {
        const providerStartTime = Date.now();
        result = await provider.createVoice(text, model, streamOptions);
        const providerTime = Date.now() - providerStartTime;
        console.log(`[ensembleVoice] Got result from provider in ${providerTime}ms:`, typeof result, result instanceof ReadableStream ? 'ReadableStream' : 'ArrayBuffer');
    } catch (error) {
        console.error('[ensembleVoice] Error calling provider.createVoice:', error);
        throw error;
    }
    if (!(result instanceof ReadableStream)) {
        throw new Error('Expected streaming response but got buffer');
    }
    const reader = result.getReader();
    const CHUNK_SIZE = 8192;
    // Shared between the read loop and drainChunks (which mutates it).
    const state = { buffer: new Uint8Array(0), chunkIndex: 0 };
    const needsWavHandling = isGemini || (isElevenLabs && isPCM) || (isOpenAI && isWav);
    if (needsWavHandling) {
        // Display label only — deliberately NOT named `provider`, which would
        // shadow the model-provider object above (a bug in a prior revision).
        const providerLabel = isGemini ? 'Gemini' : isOpenAI ? 'OpenAI' : 'ElevenLabs';
        const sampleRate = resolveSampleRate(format, isGemini, isOpenAI);
        console.log(`[ensembleVoice] ${providerLabel}: Will stream with sample rate ${sampleRate}Hz`);
        // Gemini uses a larger chunk size than the default — presumably to
        // match its delivery granularity; confirm against provider docs.
        const providerChunkSize = isGemini ? 32768 : CHUNK_SIZE;
        let isFirstChunk = true;
        while (true) {
            const { done, value } = await reader.read();
            if (value) {
                state.buffer = concatBytes(state.buffer, value);
            }
            // Once 4 bytes are available, sniff for an existing RIFF header
            // and prepend a synthetic one when the stream is headerless PCM.
            if (isFirstChunk && state.buffer.length >= 4) {
                const magic = new TextDecoder().decode(state.buffer.slice(0, 4));
                isFirstChunk = false;
                if (magic !== 'RIFF') {
                    state.buffer = concatBytes(buildWavHeader(sampleRate), state.buffer);
                }
            }
            yield* drainChunks(state, providerChunkSize, done);
            if (done) break;
        }
    } else {
        // Pass-through path: chunk the encoded audio as-is.
        while (true) {
            const { done, value } = await reader.read();
            if (value) {
                state.buffer = concatBytes(state.buffer, value);
            }
            yield* drainChunks(state, CHUNK_SIZE, done);
            if (done) break;
        }
    }
    const totalTime = Date.now() - startTime;
    console.log(`[ensembleVoice] ${model}: Total generation time: ${totalTime}ms`);
    // Best-effort cost report: the tracker module is optional, so failures
    // are logged at debug level instead of aborting the stream.
    try {
        const costTrackerModule = await import('../utils/cost_tracker.js');
        const costTracker = costTrackerModule.costTracker;
        const usage = costTracker.entries || [];
        if (usage.length > 0) {
            yield {
                type: 'cost_update',
                usage: usage[usage.length - 1],
            };
        }
    } catch (error) {
        console.debug('Cost tracking not available:', error);
    }
}
//# sourceMappingURL=ensemble_voice.js.map