@just-every/ensemble

LLM provider abstraction layer with unified streaming interface

Gemini provider module (1,160 lines, 67.3 kB)
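The listing below is the package's Google Gemini provider (the exported GeminiProvider class). As a rough illustration of the unified streaming interface it implements, a consumer might drive it like the following minimal sketch. The import path, the model id, and the agent and message shapes are simplifying assumptions for the sketch, not the package's documented API; only the event types mirror what the provider actually yields.

// Sketch only: the entry point, model id, and object shapes are assumptions.
import { GeminiProvider } from '@just-every/ensemble'; // hypothetical import path

const provider = new GeminiProvider(process.env.GOOGLE_API_KEY);

// Minimal agent/message shapes for illustration; the real Agent type is richer.
const agent = { agent_id: 'demo', modelSettings: { temperature: 0.7 } };
const messages = [{ role: 'user', content: 'Summarise the plan in two sentences.' }];

for await (const event of provider.createResponseStream(messages, 'gemini-2.0-flash', agent)) {
    switch (event.type) {
        case 'message_delta':    // incremental text; event.thinking_content carries thought deltas
            process.stdout.write(event.content);
            break;
        case 'tool_start':       // a function call requested by the model
            console.log('\ntool requested:', event.tool_call.function.name);
            break;
        case 'message_complete': // full buffered content plus any thinking text
            console.log('\ndone:', event.content.length, 'chars');
            break;
        case 'error':
            console.error(event.error);
            break;
    }
}

The provider source follows.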
import { GoogleGenAI, Type, FunctionCallingConfigMode, Modality, MediaResolution, } from '@google/genai'; import { v4 as uuidv4 } from 'uuid'; import { BaseModelProvider } from './base_provider.js'; import { costTracker } from '../index.js'; import { log_llm_error, log_llm_request, log_llm_response } from '../utils/llm_logger.js'; import { isPaused } from '../utils/pause_controller.js'; import { appendMessageWithImage, resizeAndTruncateForGemini } from '../utils/image_utils.js'; import { hasEventHandler } from '../utils/event_controller.js'; function convertParameterToGeminiFormat(param) { let type = Type.STRING; switch (param.type) { case 'string': type = Type.STRING; break; case 'number': type = Type.NUMBER; break; case 'boolean': type = Type.BOOLEAN; break; case 'object': type = Type.OBJECT; break; case 'array': type = Type.ARRAY; break; case 'null': type = Type.STRING; console.warn("Mapping 'null' type to STRING"); break; default: console.warn(`Unsupported parameter type '${param.type}'. Defaulting to STRING.`); type = Type.STRING; } const result = { type, description: param.description }; if (type === Type.ARRAY) { if (param.items) { let itemType; let itemEnum; let itemProperties; if (typeof param.items === 'object') { itemType = param.items.type; itemEnum = param.items.enum; if ('properties' in param.items) { itemProperties = param.items.properties; } } if (itemType === 'object' || itemProperties) { result.items = { type: Type.STRING }; result.description = `${result.description || 'Array parameter'} (Each item should be a JSON-encoded object)`; if (itemProperties) { const propNames = Object.keys(itemProperties); result.description += `. Expected properties: ${propNames.join(', ')}`; } } else if (itemType) { result.items = { type: itemType === 'string' ? Type.STRING : itemType === 'number' ? Type.NUMBER : itemType === 'boolean' ? Type.BOOLEAN : itemType === 'null' ? Type.STRING : Type.STRING, }; if (itemEnum) { if (typeof itemEnum === 'function') { console.warn('Gemini provider does not support async enum functions in array items'); } else { result.items.enum = itemEnum; } } } else { result.items = { type: Type.STRING }; } } else { result.items = { type: Type.STRING }; } } else if (type === Type.OBJECT) { if (param.properties && typeof param.properties === 'object') { result.properties = {}; for (const [propName, propSchema] of Object.entries(param.properties)) { result.properties[propName] = convertParameterToGeminiFormat(propSchema); } } else { result.properties = {}; } } else if (param.enum) { if (typeof param.enum === 'function') { console.warn('Gemini provider does not support async enum functions. 
Enum will be omitted.'); } else { result.format = 'enum'; result.enum = param.enum; } } return result; } async function resolveAsyncEnums(params) { if (!params || typeof params !== 'object') { return params; } const resolved = { ...params }; if (resolved.properties) { const resolvedProps = {}; for (const [key, value] of Object.entries(resolved.properties)) { if (value && typeof value === 'object') { const propCopy = { ...value }; if (typeof propCopy.enum === 'function') { try { const enumValue = await propCopy.enum(); if (Array.isArray(enumValue) && enumValue.length > 0) { propCopy.enum = enumValue; } else { delete propCopy.enum; } } catch { delete propCopy.enum; } } resolvedProps[key] = await resolveAsyncEnums(propCopy); } else { resolvedProps[key] = value; } } resolved.properties = resolvedProps; } return resolved; } async function convertToGeminiFunctionDeclarations(tools) { const declarations = await Promise.all(tools.map(async (tool) => { if (tool.definition.function.name === 'google_web_search') { return null; } const resolvedParams = await resolveAsyncEnums(tool.definition?.function?.parameters); const toolParams = resolvedParams?.properties; const properties = {}; if (toolParams) { for (const [name, param] of Object.entries(toolParams)) { properties[name] = convertParameterToGeminiFormat(param); } } else { console.warn(`Tool ${tool.definition?.function?.name || 'Unnamed Tool'} has missing or invalid parameters definition.`); } return { name: tool.definition.function.name, description: tool.definition.function.description, parameters: { type: Type.OBJECT, properties, required: Array.isArray(resolvedParams?.required) ? resolvedParams.required : [], }, }; })); return declarations.filter(Boolean); } export function getImageMimeType(imageData) { if (imageData.includes('data:image/jpeg')) return 'image/jpeg'; if (imageData.includes('data:image/png')) return 'image/png'; if (imageData.includes('data:image/gif')) return 'image/gif'; if (imageData.includes('data:image/webp')) return 'image/webp'; return 'image/jpeg'; } export function cleanBase64Data(imageData) { return imageData.replace(/^data:image\/[a-z]+;base64,/, ''); } function formatGroundingChunks(chunks) { return chunks .filter(c => c?.web?.uri) .map((c, i) => `${i + 1}. ${c.web.title || 'Untitled'} – ${c.web.uri}`) .join('\n'); } async function addImagesToInput(input, images, source) { for (const [image_id, imageData] of Object.entries(images)) { const processedImageData = await resizeAndTruncateForGemini(imageData); const mimeType = getImageMimeType(processedImageData); const cleanedImageData = cleanBase64Data(processedImageData); input.push({ role: 'user', parts: [ { text: `This is [image #${image_id}] from the ${source}`, }, { inlineData: { mimeType: mimeType, data: cleanedImageData, }, }, ], }); } return input; } async function convertToGeminiContents(model, messages) { let contents = []; for (const msg of messages) { if (msg.type === 'function_call') { let args = {}; try { const parsedArgs = JSON.parse(msg.arguments || '{}'); args = typeof parsedArgs === 'object' && parsedArgs !== null ? 
parsedArgs : { value: parsedArgs }; } catch (e) { console.error(`Failed to parse function call arguments for ${msg.name}:`, msg.arguments, e); args = { error: 'Invalid JSON arguments provided', raw_args: msg.arguments, }; } contents.push({ role: 'model', parts: [ { functionCall: { name: msg.name, args, }, }, ], }); } else if (msg.type === 'function_call_output') { let textOutput = ''; if (typeof msg.output === 'string') { textOutput = msg.output; } else { textOutput = JSON.stringify(msg.output); } const message = { role: 'user', parts: [ { functionResponse: { name: msg.name, response: { content: textOutput || '' }, }, }, ], }; contents = await appendMessageWithImage(model, contents, message, { read: () => textOutput, write: value => { message.parts[0].functionResponse.response.content = value; return message; }, }, addImagesToInput); } else { let textContent = ''; if (typeof msg.content === 'string') { textContent = msg.content; } else if (msg.content && typeof msg.content === 'object' && 'text' in msg.content) { textContent = msg.content.text; } else { textContent = JSON.stringify(msg.content); } const role = msg.role === 'assistant' ? 'model' : 'user'; const message = { role, parts: [ { thought: msg.type === 'thinking', text: textContent.trim(), }, ], }; contents = await appendMessageWithImage(model, contents, message, { read: () => textContent, write: value => { message.parts[0].text = value; return message; }, }, addImagesToInput); } } return contents; } const THINKING_BUDGET_CONFIGS = { '-low': 0, '-medium': 2048, '-high': 12288, '-max': 24576, }; export class GeminiProvider extends BaseModelProvider { _client; apiKey; constructor(apiKey) { super('google'); this.apiKey = apiKey; } get client() { if (!this._client) { const apiKey = this.apiKey || process.env.GOOGLE_API_KEY; if (!apiKey) { throw new Error('Failed to initialize Gemini client. GOOGLE_API_KEY is missing or not provided.'); } this._client = new GoogleGenAI({ apiKey: apiKey, vertexai: false, httpOptions: { apiVersion: 'v1alpha' }, }); } return this._client; } async createEmbedding(input, model, opts) { try { let actualModelId = model.startsWith('gemini/') ? model.substring(7) : model; let thinkingConfig = null; for (const [suffix, budget] of Object.entries(THINKING_BUDGET_CONFIGS)) { if (actualModelId.endsWith(suffix)) { thinkingConfig = { thinkingBudget: budget }; actualModelId = actualModelId.slice(0, -suffix.length); break; } } console.log(`[Gemini] Generating embedding with model ${actualModelId}${opts?.dimensions ? ` (dimensions: ${opts.dimensions})` : ''}`); const payload = { model: actualModelId, contents: input, config: { taskType: opts?.taskType ?? 'SEMANTIC_SIMILARITY', ...(opts?.dimensions && { outputDimensionality: opts.dimensions }), }, }; if (thinkingConfig) { payload.config.thinkingConfig = thinkingConfig; } const response = await this.client.models.embedContent(payload); console.log('[Gemini] Embedding response structure:', JSON.stringify(response, (key, value) => key === 'values' && Array.isArray(value) && value.length > 10 ? `[${value.length} items]` : value, 2)); if (!response.embeddings || !Array.isArray(response.embeddings)) { console.error('[Gemini] Unexpected embedding response structure:', response); throw new Error('Invalid embedding response structure from Gemini API'); } const estimatedTokens = typeof input === 'string' ? 
Math.ceil(input.length / 4) : input.reduce((sum, text) => sum + Math.ceil(text.length / 4), 0); let extractedValues = []; let dimensions = 0; if (response.embeddings.length > 0) { if (response.embeddings[0].values) { extractedValues = response.embeddings.map(e => e.values); dimensions = extractedValues[0].length; } else { console.warn('[Gemini] Could not find expected "values" property in embeddings response'); extractedValues = response.embeddings; dimensions = Array.isArray(extractedValues[0]) ? extractedValues[0].length : 0; } } costTracker.addUsage({ model: actualModelId, input_tokens: estimatedTokens, output_tokens: 0, metadata: { dimensions, }, }); if (Array.isArray(input) && input.length > 1) { return extractedValues; } else { let result; if (Array.isArray(extractedValues) && extractedValues.length >= 1) { const firstValue = extractedValues[0]; if (Array.isArray(firstValue)) { result = firstValue; } else { console.error('[Gemini] Unexpected format in embedding result:', firstValue); result = []; } } else { result = []; } return result; } } catch (error) { console.error('[Gemini] Error generating embedding:', error); throw error; } } async *retryStreamOnIncompleteJson(requestFn, maxRetries = 2) { let attempts = 0; while (attempts <= maxRetries) { try { const stream = await requestFn(); for await (const chunk of stream) { yield chunk; } return; } catch (error) { attempts++; const errorMsg = error instanceof Error ? error.message : String(error); if (errorMsg.includes('Incomplete JSON segment') && attempts <= maxRetries) { console.warn(`[Gemini] Incomplete JSON segment error, retrying (${attempts}/${maxRetries})...`); await new Promise(resolve => setTimeout(resolve, 1000 * attempts)); continue; } throw error; } } } async *createResponseStream(messages, model, agent) { const { getToolsFromAgent } = await import('../utils/agent.js'); const tools = agent ? await getToolsFromAgent(agent) : []; const settings = agent?.modelSettings; let messageId = uuidv4(); let contentBuffer = ''; let thoughtBuffer = ''; let eventOrder = 0; const shownGrounding = new Set(); let requestId = undefined; const chunks = []; try { const contents = await convertToGeminiContents(model, messages); if (contents.length === 0) { console.warn('Gemini API Warning: No valid content found in messages after conversion. Adding default message.'); contents.push({ role: 'user', parts: [ { text: "Let's think this through step by step.", }, ], }); } const lastContent = contents[contents.length - 1]; if (lastContent.role !== 'user') { console.warn("Last message in history is not from 'user'. 
Gemini might not respond as expected."); } let thinkingBudget = null; for (const [suffix, budget] of Object.entries(THINKING_BUDGET_CONFIGS)) { if (model.endsWith(suffix)) { thinkingBudget = budget; model = model.slice(0, -suffix.length); break; } } const config = { thinkingConfig: { includeThoughts: true, }, }; if (thinkingBudget) { config.thinkingConfig.thinkingBudget = thinkingBudget; } if (settings?.stop_sequence) { config.stopSequences = [settings.stop_sequence]; } if (settings?.temperature) { config.temperature = settings.temperature; } if (settings?.max_tokens) { config.maxOutputTokens = settings.max_tokens; } if (settings?.top_p) { config.topP = settings.top_p; } if (settings?.top_k) { config.topK = settings.top_k; } if (settings?.json_schema) { config.responseMimeType = 'application/json'; config.responseSchema = settings.json_schema.schema; if (config.responseSchema) { const removeAdditionalProperties = (obj) => { if (!obj || typeof obj !== 'object') { return; } if ('additionalProperties' in obj) { delete obj.additionalProperties; } if (obj.properties && typeof obj.properties === 'object') { Object.values(obj.properties).forEach(prop => { removeAdditionalProperties(prop); }); } if (obj.items) { removeAdditionalProperties(obj.items); } ['oneOf', 'anyOf', 'allOf'].forEach(key => { if (obj[key] && Array.isArray(obj[key])) { obj[key].forEach((subSchema) => { removeAdditionalProperties(subSchema); }); } }); }; removeAdditionalProperties(config.responseSchema); } } let hasGoogleWebSearch = false; if (tools && tools.length > 0) { hasGoogleWebSearch = tools.some(tool => tool.definition.function.name === 'google_web_search'); const functionDeclarations = await convertToGeminiFunctionDeclarations(tools); let allowedFunctionNames = []; if (functionDeclarations.length > 0) { config.tools = [{ functionDeclarations }]; if (settings?.tool_choice) { let toolChoice; if (typeof settings.tool_choice === 'object' && settings.tool_choice?.type === 'function' && settings.tool_choice?.function?.name) { toolChoice = FunctionCallingConfigMode.ANY; allowedFunctionNames = [settings.tool_choice.function.name]; } else if (settings.tool_choice === 'required') { toolChoice = FunctionCallingConfigMode.ANY; } else if (settings.tool_choice === 'auto') { toolChoice = FunctionCallingConfigMode.AUTO; } else if (settings.tool_choice === 'none') { toolChoice = FunctionCallingConfigMode.NONE; } if (toolChoice) { config.toolConfig = { functionCallingConfig: { mode: toolChoice, }, }; if (allowedFunctionNames.length > 0) { config.toolConfig.functionCallingConfig.allowedFunctionNames = allowedFunctionNames; } } } } else if (!hasGoogleWebSearch) { console.warn('Tools were provided but resulted in empty declarations after conversion.'); } } if (hasGoogleWebSearch) { console.log('[Gemini] Enabling Google Search grounding'); config.tools = [{ googleSearch: {} }]; config.toolConfig = { functionCallingConfig: { mode: FunctionCallingConfigMode.ANY, allowedFunctionNames: ['googleSearch'], }, }; } const requestParams = { model, contents, config, }; requestId = log_llm_request(agent.agent_id, 'google', model, requestParams); const { waitWhilePaused } = await import('../utils/pause_controller.js'); await waitWhilePaused(100, agent.abortSignal); const getStreamFn = () => this.client.models.generateContentStream(requestParams); const response = this.retryStreamOnIncompleteJson(getStreamFn); let usageMetadata; for await (const chunk of response) { chunks.push(chunk); if (chunk.responseId) { messageId = chunk.responseId; } if 
(isPaused()) { console.log(`[Gemini] System paused during stream for model ${model}. Waiting...`); await waitWhilePaused(100, agent.abortSignal); console.log(`[Gemini] System resumed, continuing stream for model ${model}`); } if (chunk.functionCalls && chunk.functionCalls.length > 0) { for (const fc of chunk.functionCalls) { if (fc && fc.name) { yield { type: 'tool_start', tool_call: { id: fc.id || `call_${uuidv4()}`, type: 'function', function: { name: fc.name, arguments: JSON.stringify(fc.args || {}), }, }, }; } } } for (const candidate of chunk.candidates) { if (candidate.content?.parts) { for (const part of candidate.content.parts) { let text = ''; if (part.text) { text += part.text; } if (part.executableCode) { if (text) { text += '\n\n'; } text += part.executableCode; } if (part.videoMetadata) { if (text) { text += '\n\n'; } text += JSON.stringify(part.videoMetadata); } if (text.length > 0) { const ev = { type: 'message_delta', content: '', message_id: messageId, order: eventOrder++, }; if (part.thought) { thoughtBuffer += text; ev.thinking_content = text; } else { contentBuffer += text; ev.content = text; } yield ev; } if (part.inlineData?.data) { yield { type: 'file_complete', data_format: 'base64', data: part.inlineData.data, mime_type: part.inlineData.mimeType || 'image/png', message_id: uuidv4(), order: eventOrder++, }; } } } const gChunks = candidate.groundingMetadata?.groundingChunks; if (Array.isArray(gChunks)) { const newChunks = gChunks.filter(c => c?.web?.uri && !shownGrounding.has(c.web.uri)); if (newChunks.length) { newChunks.forEach(c => shownGrounding.add(c.web.uri)); const formatted = formatGroundingChunks(newChunks); yield { type: 'message_delta', content: '\n\nSearch Results:\n' + formatted + '\n', message_id: messageId, order: eventOrder++, }; contentBuffer += '\n\nSearch Results:\n' + formatted + '\n'; } } } if (chunk.usageMetadata) { usageMetadata = chunk.usageMetadata; } } if (usageMetadata) { const calculatedUsage = costTracker.addUsage({ model, input_tokens: usageMetadata.promptTokenCount || 0, output_tokens: usageMetadata.candidatesTokenCount || 0, cached_tokens: usageMetadata.cachedContentTokenCount || 0, metadata: { total_tokens: usageMetadata.totalTokenCount || 0, reasoning_tokens: usageMetadata.thoughtsTokenCount || 0, tool_tokens: usageMetadata.toolUsePromptTokenCount || 0, }, }); if (!hasEventHandler()) { yield { type: 'cost_update', usage: { ...calculatedUsage, total_tokens: usageMetadata.totalTokenCount || 0, }, }; } } else { console.warn('[Gemini] No usage metadata found in the response. Using token estimation.'); let inputText = ''; for (const content of contents) { if (content.parts) { for (const part of content.parts) { if (part.text) { inputText += part.text + '\n'; } } } } const calculatedUsage = costTracker.addEstimatedUsage(model, inputText, contentBuffer + thoughtBuffer, { provider: 'gemini', }); if (!hasEventHandler()) { yield { type: 'cost_update', usage: { ...calculatedUsage, total_tokens: calculatedUsage.input_tokens + calculatedUsage.output_tokens, }, }; } } if (contentBuffer || thoughtBuffer) { yield { type: 'message_complete', content: contentBuffer, thinking_content: thoughtBuffer, message_id: messageId, }; } } catch (error) { log_llm_error(requestId, error); const errorMessage = error instanceof Error ? error.stack || error.message : String(error); if (errorMessage.includes('Incomplete JSON segment')) { console.error('[Gemini] Stream terminated with incomplete JSON. 
This may indicate network issues or timeouts.'); } console.error('\n=== Gemini error ==='); console.dir(error, { depth: null }); console.error('\n=== JSON dump of error ==='); console.error(JSON.stringify(error, Object.getOwnPropertyNames(error), 2)); console.error('\n=== Manual property walk ==='); for (const key of Reflect.ownKeys(error)) { console.error(`${String(key)}:`, error[key]); } yield { type: 'error', error: `Gemini error ${model}: ${errorMessage}`, }; if (contentBuffer || thoughtBuffer) { yield { type: 'message_complete', content: contentBuffer, thinking_content: thoughtBuffer, message_id: messageId, }; } } finally { log_llm_response(requestId, chunks); } } async createImage(prompt, model, opts) { try { model = model || 'imagen-3.0-generate-002'; const numberOfImages = opts?.n || 1; let aspectRatio = '1:1'; if (opts?.size === 'landscape') { aspectRatio = '16:9'; } else if (opts?.size === 'portrait') { aspectRatio = '9:16'; } console.log(`[Gemini] Generating ${numberOfImages} image(s) with model ${model}, prompt: "${prompt.substring(0, 100)}${prompt.length > 100 ? '...' : ''}"`); const response = await this.client.models.generateImages({ model, prompt, config: { numberOfImages, aspectRatio, includeSafetyAttributes: false, }, }); const images = []; if (response.generatedImages && response.generatedImages.length > 0) { for (const generatedImage of response.generatedImages) { if (generatedImage.image?.imageBytes) { const base64Image = `data:image/png;base64,${generatedImage.image.imageBytes}`; images.push(base64Image); } } const perImageCost = this.getImageCost(model); costTracker.addUsage({ model, image_count: images.length, metadata: { aspect_ratio: aspectRatio, cost_per_image: perImageCost, }, }); } if (images.length === 0) { throw new Error('No images returned from Gemini/Imagen'); } return images; } catch (error) { console.error('[Gemini] Error generating image:', error); throw error; } } getImageCost(model) { if (model.includes('imagen-3')) { return 0.04; } else if (model.includes('imagen-2')) { return 0.02; } return 0.04; } async createVoice(text, model = 'gemini-2.5-flash-preview-tts', opts) { try { console.log(`[Gemini] Generating speech with model ${model}, text: "${text.substring(0, 100)}${text.length > 100 ? '...' : ''}"`); const voiceName = this.mapVoiceToGemini(opts?.voice); const speechConfig = { voiceConfig: { prebuiltVoiceConfig: { voiceName: voiceName, }, }, }; const config = { responseModalities: [Modality.AUDIO], speechConfig: speechConfig, }; let say_prefix = ''; let say_postfix = ''; if (opts?.speed && opts.speed !== 1.0) { const speedDescription = opts.speed < 1.0 ? 
`slowly at ${Math.round(opts.speed * 100)}% speed` : `quickly at ${Math.round(opts.speed * 100)}% speed`; say_postfix = speedDescription; } if (opts?.affect) { say_prefix = `Sound ${opts.affect}`; } if (say_postfix || say_prefix) { if (say_postfix && say_prefix) { text = `${say_prefix} and say ${say_postfix}:\n${text}`; } else if (say_postfix) { text = `Say ${say_postfix}:\n${text}`; } else if (say_prefix) { text = `${say_prefix} and say:\n${text}`; } } console.log(`[Gemini] Starting generateContentStream call...`); const streamPromise = this.client.models.generateContentStream({ model, contents: [{ role: 'user', parts: [{ text }] }], config, }); const textLength = text.length; costTracker.addUsage({ model, input_tokens: Math.ceil(textLength / 4), output_tokens: 0, metadata: { voice: voiceName, text_length: textLength, type: 'voice_generation', }, }); if (opts?.stream) { const stream = await streamPromise; const chunks = []; for await (const chunk of stream) { if (chunk.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data) { const part = chunk.candidates[0].content.parts[0]; const binaryString = atob(part.inlineData.data); const bytes = new Uint8Array(binaryString.length); for (let i = 0; i < binaryString.length; i++) { bytes[i] = binaryString.charCodeAt(i); } chunks.push(bytes); if (part.inlineData.mimeType) { console.log(`[Gemini] Audio format: ${part.inlineData.mimeType}`); } } } const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0); const combined = new Uint8Array(totalLength); let offset = 0; for (const chunk of chunks) { combined.set(chunk, offset); offset += chunk.length; } return new ReadableStream({ start(controller) { controller.enqueue(combined); controller.close(); }, }); } let allData = new Uint8Array(0); const stream = await streamPromise; for await (const chunk of stream) { if (!chunk.candidates || !chunk.candidates[0]?.content?.parts) { continue; } const part = chunk.candidates[0].content.parts[0]; if (part?.inlineData?.data) { const binaryString = atob(part.inlineData.data); const bytes = new Uint8Array(binaryString.length); for (let i = 0; i < binaryString.length; i++) { bytes[i] = binaryString.charCodeAt(i); } const newData = new Uint8Array(allData.length + bytes.length); newData.set(allData); newData.set(bytes, allData.length); allData = newData; } } if (allData.length === 0) { throw new Error('No audio data generated from Gemini TTS'); } return allData.buffer; } catch (error) { console.error('[Gemini] Error generating voice:', error); throw error; } } mapVoiceToGemini(voice) { const geminiVoices = [ 'Kore', 'Puck', 'Charon', 'Fenrir', 'Aoede', 'Glados', ]; if (!voice) { return 'Kore'; } if (geminiVoices.includes(voice)) { return voice; } const voiceMap = { alloy: 'Kore', echo: 'Puck', fable: 'Charon', onyx: 'Fenrir', nova: 'Aoede', shimmer: 'Glados', male: 'Puck', female: 'Kore', neutral: 'Charon', young: 'Aoede', mature: 'Fenrir', robotic: 'Glados', kore: 'Kore', puck: 'Puck', charon: 'Charon', fenrir: 'Fenrir', aoede: 'Aoede', glados: 'Glados', }; const mappedVoice = voiceMap[voice.toLowerCase()]; if (mappedVoice) { return mappedVoice; } console.warn(`[Gemini] Unknown voice '${voice}', using default voice 'Kore'`); return 'Kore'; } async *createTranscription(audio, agent, model, opts) { let session = null; let isConnected = false; try { const ai = new GoogleGenAI({ apiKey: this.apiKey, httpOptions: { apiVersion: 'v1alpha' }, }); const realtimeInputConfig = opts?.realtimeInputConfig || { automaticActivityDetection: { disabled: false, 
startOfSpeechSensitivity: 'START_SENSITIVITY_HIGH', endOfSpeechSensitivity: 'END_SENSITIVITY_LOW', }, }; const speechConfig = opts?.speechConfig || { languageCode: 'en-US', }; const systemInstruction = agent.instructions || `You should reply only "OK" to every single message from the user. Nothing else.`; console.log('[Gemini] Connecting to Live API for transcription...'); const connectionPromise = new Promise((resolve, reject) => { const timeout = setTimeout(() => { reject(new Error('Connection timeout')); }, 10000); const config = { responseModalities: [Modality.TEXT], mediaResolution: MediaResolution.MEDIA_RESOLUTION_MEDIUM, speechConfig, realtimeInputConfig, systemInstruction: { parts: [{ text: systemInstruction }], }, inputAudioTranscription: {}, }; ai.live .connect({ model: model, config, callbacks: { onopen: () => { clearTimeout(timeout); console.log('[Gemini] Live session connected'); isConnected = true; resolve(); }, onmessage: async (msg) => { if (msg.serverContent?.inputTranscription?.text) { const previewEvent = { type: 'transcription_turn_delta', timestamp: new Date().toISOString(), delta: msg.serverContent.inputTranscription.text, }; transcriptEvents.push(previewEvent); } if (msg.serverContent?.turnComplete) { const turnEvent = { type: 'transcription_turn_complete', timestamp: new Date().toISOString(), }; transcriptEvents.push(turnEvent); } if (msg.usageMetadata) { if (msg.usageMetadata.promptTokensDetails && Array.isArray(msg.usageMetadata.promptTokensDetails)) { for (const detail of msg.usageMetadata.promptTokensDetails) { if (detail.modality && detail.tokenCount > 0) { costTracker.addUsage({ model: model, input_tokens: detail.tokenCount, output_tokens: 0, input_modality: detail.modality.toLowerCase(), metadata: { totalTokens: msg.usageMetadata.totalTokenCount || 0, source: 'gemini-live-transcription', modalityType: 'input', originalModality: detail.modality, }, }); } } } if (msg.usageMetadata.responseTokensDetails && Array.isArray(msg.usageMetadata.responseTokensDetails)) { for (const detail of msg.usageMetadata.responseTokensDetails) { if (detail.modality && detail.tokenCount > 0) { costTracker.addUsage({ model: model, input_tokens: 0, output_tokens: detail.tokenCount, output_modality: detail.modality.toLowerCase(), metadata: { totalTokens: msg.usageMetadata.totalTokenCount || 0, source: 'gemini-live-transcription', modalityType: 'output', originalModality: detail.modality, }, }); } } } if ((!msg.usageMetadata.promptTokensDetails || msg.usageMetadata.promptTokensDetails.length === 0) && (!msg.usageMetadata.responseTokensDetails || msg.usageMetadata.responseTokensDetails.length === 0)) { costTracker.addUsage({ model: model, input_tokens: msg.usageMetadata.promptTokenCount || 0, output_tokens: msg.usageMetadata.responseTokenCount || 0, input_modality: 'audio', output_modality: 'text', metadata: { totalTokens: msg.usageMetadata.totalTokenCount || 0, source: 'gemini-live-transcription', }, }); } } }, onerror: (err) => { console.error('[Gemini] Live API error:', { code: err.code, reason: err.reason, wasClean: err.wasClean, }); connectionError = err; }, onclose: (event) => { console.log('[Gemini] Live session closed'); if (event) { console.log('[Gemini] Close event details:', { code: event.code, reason: event.reason, wasClean: event.wasClean, }); } isConnected = false; }, }, }) .then(async (s) => { session = s; }); }); const transcriptEvents = []; let connectionError = null; await connectionPromise; const audioStream = normalizeAudioSource(audio); const reader = 
audioStream.getReader(); const sendAudioChunk = async (chunk) => { try { const base64Data = chunk.toString('base64'); await session.sendRealtimeInput({ media: { mimeType: 'audio/pcm;rate=16000', data: base64Data, }, }); } catch (err) { console.error('[Gemini] Error sending audio chunk:', err); connectionError = err; throw err; } }; try { while (true) { const { done, value } = await reader.read(); if (done) break; if (value && session && isConnected) { const chunk = value instanceof Buffer ? value : Buffer.from(value); await sendAudioChunk(chunk); } if (transcriptEvents.length > 0) { const events = transcriptEvents.splice(0, transcriptEvents.length); for (const event of events) { yield event; } } if (connectionError) { throw connectionError; } } await new Promise(resolve => setTimeout(resolve, 1000)); if (transcriptEvents.length > 0) { const events = transcriptEvents.splice(0, transcriptEvents.length); for (const ev
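For completeness, the non-streaming helpers defined in the class above (embeddings and image generation) can be exercised with a similar minimal sketch. The import path and the embedding model id are assumptions; 'imagen-3.0-generate-002' and the option names (taskType, dimensions, n, size) are taken from the source itself.

// Sketch only: import path and embedding model id are assumptions.
import { GeminiProvider } from '@just-every/ensemble'; // hypothetical import path

const provider = new GeminiProvider(process.env.GOOGLE_API_KEY);

// createEmbedding returns a single vector for a string input and an array of
// vectors when given an array of strings.
const vector = await provider.createEmbedding('hello world', 'gemini-embedding-001', {
    taskType: 'SEMANTIC_SIMILARITY',
    dimensions: 768,
});
console.log('embedding length:', vector.length);

// createImage resolves to data-URL base64 strings; opts.size maps to an aspect
// ratio ('landscape' -> 16:9, 'portrait' -> 9:16, default 1:1).
const [image] = await provider.createImage('a lighthouse at dusk', 'imagen-3.0-generate-002', {
    n: 1,
    size: 'landscape',
});
console.log(image.slice(0, 40)); // "data:image/png;base64,..."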