@just-every/ensemble

LLM provider abstraction layer with unified streaming interface
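A minimal usage sketch of the streaming interface implemented by the compiled source below. It is a sketch, not package documentation: the import specifier, the agent fields beyond agent_id and modelSettings, and the type: 'message' shape are assumptions; the event names (message_delta, tool_start, message_complete, error) and the '-high' thinking-budget suffix handling are taken from the code itself.

// Usage sketch (assumed import path; GOOGLE_API_KEY is read from the
// environment when no key is passed to the constructor).
const { GeminiProvider } = require('@just-every/ensemble');

async function run() {
  const provider = new GeminiProvider();

  // Any message that is not a function_call / function_call_output is treated
  // as plain user or assistant text by convertToGeminiContents.
  const messages = [
    { type: 'message', role: 'user', content: 'Summarize the release notes in two sentences.' },
  ];

  // The '-high' suffix is stripped and mapped to thinkingBudget 12288 via
  // THINKING_BUDGET_CONFIGS before the request is issued.
  const agent = { agent_id: 'demo-agent', modelSettings: { temperature: 0.4 } };

  for await (const event of provider.createResponseStream(messages, 'gemini-2.5-flash-high', agent)) {
    switch (event.type) {
      case 'message_delta':
        process.stdout.write(event.content || ''); // thinking deltas carry thinking_content instead
        break;
      case 'tool_start':
        console.log('\ntool requested:', event.tool_call.function.name);
        break;
      case 'message_complete':
        console.log('\ncomplete:', event.message_id);
        break;
      case 'error':
        console.error('\nstream error:', event.error);
        break;
    }
  }
}

run().catch(console.error);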

"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.geminiProvider = exports.GeminiProvider = void 0; exports.getImageMimeType = getImageMimeType; exports.cleanBase64Data = cleanBase64Data; const genai_1 = require("@google/genai"); const uuid_1 = require("uuid"); const base_provider_js_1 = require("./base_provider.cjs"); const index_js_1 = require("../index.cjs"); const llm_logger_js_1 = require("../utils/llm_logger.cjs"); const pause_controller_js_1 = require("../utils/pause_controller.cjs"); const image_utils_js_1 = require("../utils/image_utils.cjs"); const event_controller_js_1 = require("../utils/event_controller.cjs"); function convertParameterToGeminiFormat(param) { let type = genai_1.Type.STRING; switch (param.type) { case 'string': type = genai_1.Type.STRING; break; case 'number': type = genai_1.Type.NUMBER; break; case 'boolean': type = genai_1.Type.BOOLEAN; break; case 'object': type = genai_1.Type.OBJECT; break; case 'array': type = genai_1.Type.ARRAY; break; case 'null': type = genai_1.Type.STRING; console.warn("Mapping 'null' type to STRING"); break; default: console.warn(`Unsupported parameter type '${param.type}'. Defaulting to STRING.`); type = genai_1.Type.STRING; } const result = { type, description: param.description }; if (type === genai_1.Type.ARRAY) { if (param.items) { let itemType; let itemEnum; let itemProperties; if (typeof param.items === 'object') { itemType = param.items.type; itemEnum = param.items.enum; if ('properties' in param.items) { itemProperties = param.items.properties; } } if (itemType === 'object' || itemProperties) { result.items = { type: genai_1.Type.STRING }; result.description = `${result.description || 'Array parameter'} (Each item should be a JSON-encoded object)`; if (itemProperties) { const propNames = Object.keys(itemProperties); result.description += `. Expected properties: ${propNames.join(', ')}`; } } else if (itemType) { result.items = { type: itemType === 'string' ? genai_1.Type.STRING : itemType === 'number' ? genai_1.Type.NUMBER : itemType === 'boolean' ? genai_1.Type.BOOLEAN : itemType === 'null' ? 
genai_1.Type.STRING : genai_1.Type.STRING, }; if (itemEnum) { if (typeof itemEnum === 'function') { console.warn('Gemini provider does not support async enum functions in array items'); } else { result.items.enum = itemEnum; } } } else { result.items = { type: genai_1.Type.STRING }; } } else { result.items = { type: genai_1.Type.STRING }; } } else if (type === genai_1.Type.OBJECT) { if (param.properties && typeof param.properties === 'object') { result.properties = {}; for (const [propName, propSchema] of Object.entries(param.properties)) { result.properties[propName] = convertParameterToGeminiFormat(propSchema); } } else { result.properties = {}; } } else if (param.enum) { if (typeof param.enum === 'function') { console.warn('Gemini provider does not support async enum functions. Enum will be omitted.'); } else { result.format = 'enum'; result.enum = param.enum; } } return result; } async function resolveAsyncEnums(params) { if (!params || typeof params !== 'object') { return params; } const resolved = { ...params }; if (resolved.properties) { const resolvedProps = {}; for (const [key, value] of Object.entries(resolved.properties)) { if (value && typeof value === 'object') { const propCopy = { ...value }; if (typeof propCopy.enum === 'function') { try { const enumValue = await propCopy.enum(); if (Array.isArray(enumValue) && enumValue.length > 0) { propCopy.enum = enumValue; } else { delete propCopy.enum; } } catch { delete propCopy.enum; } } resolvedProps[key] = await resolveAsyncEnums(propCopy); } else { resolvedProps[key] = value; } } resolved.properties = resolvedProps; } return resolved; } async function convertToGeminiFunctionDeclarations(tools) { const declarations = await Promise.all(tools.map(async (tool) => { if (tool.definition.function.name === 'google_web_search') { return null; } const resolvedParams = await resolveAsyncEnums(tool.definition?.function?.parameters); const toolParams = resolvedParams?.properties; const properties = {}; if (toolParams) { for (const [name, param] of Object.entries(toolParams)) { properties[name] = convertParameterToGeminiFormat(param); } } else { console.warn(`Tool ${tool.definition?.function?.name || 'Unnamed Tool'} has missing or invalid parameters definition.`); } return { name: tool.definition.function.name, description: tool.definition.function.description, parameters: { type: genai_1.Type.OBJECT, properties, required: Array.isArray(resolvedParams?.required) ? resolvedParams.required : [], }, }; })); return declarations.filter(Boolean); } function getImageMimeType(imageData) { if (imageData.includes('data:image/jpeg')) return 'image/jpeg'; if (imageData.includes('data:image/png')) return 'image/png'; if (imageData.includes('data:image/gif')) return 'image/gif'; if (imageData.includes('data:image/webp')) return 'image/webp'; return 'image/jpeg'; } function cleanBase64Data(imageData) { return imageData.replace(/^data:image\/[a-z]+;base64,/, ''); } function formatGroundingChunks(chunks) { return chunks .filter(c => c?.web?.uri) .map((c, i) => `${i + 1}. 
${c.web.title || 'Untitled'} – ${c.web.uri}`) .join('\n'); } async function addImagesToInput(input, images, source) { for (const [image_id, imageData] of Object.entries(images)) { const processedImageData = await (0, image_utils_js_1.resizeAndTruncateForGemini)(imageData); const mimeType = getImageMimeType(processedImageData); const cleanedImageData = cleanBase64Data(processedImageData); input.push({ role: 'user', parts: [ { text: `This is [image #${image_id}] from the ${source}`, }, { inlineData: { mimeType: mimeType, data: cleanedImageData, }, }, ], }); } return input; } async function convertToGeminiContents(model, messages) { let contents = []; for (const msg of messages) { if (msg.type === 'function_call') { let args = {}; try { const parsedArgs = JSON.parse(msg.arguments || '{}'); args = typeof parsedArgs === 'object' && parsedArgs !== null ? parsedArgs : { value: parsedArgs }; } catch (e) { console.error(`Failed to parse function call arguments for ${msg.name}:`, msg.arguments, e); args = { error: 'Invalid JSON arguments provided', raw_args: msg.arguments, }; } contents.push({ role: 'model', parts: [ { functionCall: { name: msg.name, args, }, }, ], }); } else if (msg.type === 'function_call_output') { let textOutput = ''; if (typeof msg.output === 'string') { textOutput = msg.output; } else { textOutput = JSON.stringify(msg.output); } const message = { role: 'user', parts: [ { functionResponse: { name: msg.name, response: { content: textOutput || '' }, }, }, ], }; contents = await (0, image_utils_js_1.appendMessageWithImage)(model, contents, message, { read: () => textOutput, write: value => { message.parts[0].functionResponse.response.content = value; return message; }, }, addImagesToInput); } else { let textContent = ''; if (typeof msg.content === 'string') { textContent = msg.content; } else if (msg.content && typeof msg.content === 'object' && 'text' in msg.content) { textContent = msg.content.text; } else { textContent = JSON.stringify(msg.content); } const role = msg.role === 'assistant' ? 'model' : 'user'; const message = { role, parts: [ { thought: msg.type === 'thinking', text: textContent.trim(), }, ], }; contents = await (0, image_utils_js_1.appendMessageWithImage)(model, contents, message, { read: () => textContent, write: value => { message.parts[0].text = value; return message; }, }, addImagesToInput); } } return contents; } const THINKING_BUDGET_CONFIGS = { '-low': 0, '-medium': 2048, '-high': 12288, '-max': 24576, }; class GeminiProvider extends base_provider_js_1.BaseModelProvider { _client; apiKey; constructor(apiKey) { super('google'); this.apiKey = apiKey; } get client() { if (!this._client) { const apiKey = this.apiKey || process.env.GOOGLE_API_KEY; if (!apiKey) { throw new Error('Failed to initialize Gemini client. GOOGLE_API_KEY is missing or not provided.'); } this._client = new genai_1.GoogleGenAI({ apiKey: apiKey, vertexai: false, httpOptions: { apiVersion: 'v1alpha' }, }); } return this._client; } async createEmbedding(input, model, opts) { try { let actualModelId = model.startsWith('gemini/') ? model.substring(7) : model; let thinkingConfig = null; for (const [suffix, budget] of Object.entries(THINKING_BUDGET_CONFIGS)) { if (actualModelId.endsWith(suffix)) { thinkingConfig = { thinkingBudget: budget }; actualModelId = actualModelId.slice(0, -suffix.length); break; } } console.log(`[Gemini] Generating embedding with model ${actualModelId}${opts?.dimensions ? 
` (dimensions: ${opts.dimensions})` : ''}`); const payload = { model: actualModelId, contents: input, config: { taskType: opts?.taskType ?? 'SEMANTIC_SIMILARITY', ...(opts?.dimensions && { outputDimensionality: opts.dimensions }), }, }; if (thinkingConfig) { payload.config.thinkingConfig = thinkingConfig; } const response = await this.client.models.embedContent(payload); console.log('[Gemini] Embedding response structure:', JSON.stringify(response, (key, value) => key === 'values' && Array.isArray(value) && value.length > 10 ? `[${value.length} items]` : value, 2)); if (!response.embeddings || !Array.isArray(response.embeddings)) { console.error('[Gemini] Unexpected embedding response structure:', response); throw new Error('Invalid embedding response structure from Gemini API'); } const estimatedTokens = typeof input === 'string' ? Math.ceil(input.length / 4) : input.reduce((sum, text) => sum + Math.ceil(text.length / 4), 0); let extractedValues = []; let dimensions = 0; if (response.embeddings.length > 0) { if (response.embeddings[0].values) { extractedValues = response.embeddings.map(e => e.values); dimensions = extractedValues[0].length; } else { console.warn('[Gemini] Could not find expected "values" property in embeddings response'); extractedValues = response.embeddings; dimensions = Array.isArray(extractedValues[0]) ? extractedValues[0].length : 0; } } index_js_1.costTracker.addUsage({ model: actualModelId, input_tokens: estimatedTokens, output_tokens: 0, metadata: { dimensions, }, }); if (Array.isArray(input) && input.length > 1) { return extractedValues; } else { let result; if (Array.isArray(extractedValues) && extractedValues.length >= 1) { const firstValue = extractedValues[0]; if (Array.isArray(firstValue)) { result = firstValue; } else { console.error('[Gemini] Unexpected format in embedding result:', firstValue); result = []; } } else { result = []; } return result; } } catch (error) { console.error('[Gemini] Error generating embedding:', error); throw error; } } async *retryStreamOnIncompleteJson(requestFn, maxRetries = 2) { let attempts = 0; while (attempts <= maxRetries) { try { const stream = await requestFn(); for await (const chunk of stream) { yield chunk; } return; } catch (error) { attempts++; const errorMsg = error instanceof Error ? error.message : String(error); if (errorMsg.includes('Incomplete JSON segment') && attempts <= maxRetries) { console.warn(`[Gemini] Incomplete JSON segment error, retrying (${attempts}/${maxRetries})...`); await new Promise(resolve => setTimeout(resolve, 1000 * attempts)); continue; } throw error; } } } async *createResponseStream(messages, model, agent) { const { getToolsFromAgent } = await Promise.resolve().then(() => __importStar(require("../utils/agent.cjs"))); const tools = agent ? await getToolsFromAgent(agent) : []; const settings = agent?.modelSettings; let messageId = (0, uuid_1.v4)(); let contentBuffer = ''; let thoughtBuffer = ''; let eventOrder = 0; const shownGrounding = new Set(); let requestId = undefined; const chunks = []; try { const contents = await convertToGeminiContents(model, messages); if (contents.length === 0) { console.warn('Gemini API Warning: No valid content found in messages after conversion. Adding default message.'); contents.push({ role: 'user', parts: [ { text: "Let's think this through step by step.", }, ], }); } const lastContent = contents[contents.length - 1]; if (lastContent.role !== 'user') { console.warn("Last message in history is not from 'user'. 
Gemini might not respond as expected."); } let thinkingBudget = null; for (const [suffix, budget] of Object.entries(THINKING_BUDGET_CONFIGS)) { if (model.endsWith(suffix)) { thinkingBudget = budget; model = model.slice(0, -suffix.length); break; } } const config = { thinkingConfig: { includeThoughts: true, }, }; if (thinkingBudget) { config.thinkingConfig.thinkingBudget = thinkingBudget; } if (settings?.stop_sequence) { config.stopSequences = [settings.stop_sequence]; } if (settings?.temperature) { config.temperature = settings.temperature; } if (settings?.max_tokens) { config.maxOutputTokens = settings.max_tokens; } if (settings?.top_p) { config.topP = settings.top_p; } if (settings?.top_k) { config.topK = settings.top_k; } if (settings?.json_schema) { config.responseMimeType = 'application/json'; config.responseSchema = settings.json_schema.schema; if (config.responseSchema) { const removeAdditionalProperties = (obj) => { if (!obj || typeof obj !== 'object') { return; } if ('additionalProperties' in obj) { delete obj.additionalProperties; } if (obj.properties && typeof obj.properties === 'object') { Object.values(obj.properties).forEach(prop => { removeAdditionalProperties(prop); }); } if (obj.items) { removeAdditionalProperties(obj.items); } ['oneOf', 'anyOf', 'allOf'].forEach(key => { if (obj[key] && Array.isArray(obj[key])) { obj[key].forEach((subSchema) => { removeAdditionalProperties(subSchema); }); } }); }; removeAdditionalProperties(config.responseSchema); } } let hasGoogleWebSearch = false; if (tools && tools.length > 0) { hasGoogleWebSearch = tools.some(tool => tool.definition.function.name === 'google_web_search'); const functionDeclarations = await convertToGeminiFunctionDeclarations(tools); let allowedFunctionNames = []; if (functionDeclarations.length > 0) { config.tools = [{ functionDeclarations }]; if (settings?.tool_choice) { let toolChoice; if (typeof settings.tool_choice === 'object' && settings.tool_choice?.type === 'function' && settings.tool_choice?.function?.name) { toolChoice = genai_1.FunctionCallingConfigMode.ANY; allowedFunctionNames = [settings.tool_choice.function.name]; } else if (settings.tool_choice === 'required') { toolChoice = genai_1.FunctionCallingConfigMode.ANY; } else if (settings.tool_choice === 'auto') { toolChoice = genai_1.FunctionCallingConfigMode.AUTO; } else if (settings.tool_choice === 'none') { toolChoice = genai_1.FunctionCallingConfigMode.NONE; } if (toolChoice) { config.toolConfig = { functionCallingConfig: { mode: toolChoice, }, }; if (allowedFunctionNames.length > 0) { config.toolConfig.functionCallingConfig.allowedFunctionNames = allowedFunctionNames; } } } } else if (!hasGoogleWebSearch) { console.warn('Tools were provided but resulted in empty declarations after conversion.'); } } if (hasGoogleWebSearch) { console.log('[Gemini] Enabling Google Search grounding'); config.tools = [{ googleSearch: {} }]; config.toolConfig = { functionCallingConfig: { mode: genai_1.FunctionCallingConfigMode.ANY, allowedFunctionNames: ['googleSearch'], }, }; } const requestParams = { model, contents, config, }; requestId = (0, llm_logger_js_1.log_llm_request)(agent.agent_id, 'google', model, requestParams); const { waitWhilePaused } = await Promise.resolve().then(() => __importStar(require("../utils/pause_controller.cjs"))); await waitWhilePaused(100, agent.abortSignal); const getStreamFn = () => this.client.models.generateContentStream(requestParams); const response = this.retryStreamOnIncompleteJson(getStreamFn); let usageMetadata; for await (const 
chunk of response) { chunks.push(chunk); if (chunk.responseId) { messageId = chunk.responseId; } if ((0, pause_controller_js_1.isPaused)()) { console.log(`[Gemini] System paused during stream for model ${model}. Waiting...`); await waitWhilePaused(100, agent.abortSignal); console.log(`[Gemini] System resumed, continuing stream for model ${model}`); } if (chunk.functionCalls && chunk.functionCalls.length > 0) { for (const fc of chunk.functionCalls) { if (fc && fc.name) { yield { type: 'tool_start', tool_call: { id: fc.id || `call_${(0, uuid_1.v4)()}`, type: 'function', function: { name: fc.name, arguments: JSON.stringify(fc.args || {}), }, }, }; } } } for (const candidate of chunk.candidates) { if (candidate.content?.parts) { for (const part of candidate.content.parts) { let text = ''; if (part.text) { text += part.text; } if (part.executableCode) { if (text) { text += '\n\n'; } text += part.executableCode; } if (part.videoMetadata) { if (text) { text += '\n\n'; } text += JSON.stringify(part.videoMetadata); } if (text.length > 0) { const ev = { type: 'message_delta', content: '', message_id: messageId, order: eventOrder++, }; if (part.thought) { thoughtBuffer += text; ev.thinking_content = text; } else { contentBuffer += text; ev.content = text; } yield ev; } if (part.inlineData?.data) { yield { type: 'file_complete', data_format: 'base64', data: part.inlineData.data, mime_type: part.inlineData.mimeType || 'image/png', message_id: (0, uuid_1.v4)(), order: eventOrder++, }; } } } const gChunks = candidate.groundingMetadata?.groundingChunks; if (Array.isArray(gChunks)) { const newChunks = gChunks.filter(c => c?.web?.uri && !shownGrounding.has(c.web.uri)); if (newChunks.length) { newChunks.forEach(c => shownGrounding.add(c.web.uri)); const formatted = formatGroundingChunks(newChunks); yield { type: 'message_delta', content: '\n\nSearch Results:\n' + formatted + '\n', message_id: messageId, order: eventOrder++, }; contentBuffer += '\n\nSearch Results:\n' + formatted + '\n'; } } } if (chunk.usageMetadata) { usageMetadata = chunk.usageMetadata; } } if (usageMetadata) { const calculatedUsage = index_js_1.costTracker.addUsage({ model, input_tokens: usageMetadata.promptTokenCount || 0, output_tokens: usageMetadata.candidatesTokenCount || 0, cached_tokens: usageMetadata.cachedContentTokenCount || 0, metadata: { total_tokens: usageMetadata.totalTokenCount || 0, reasoning_tokens: usageMetadata.thoughtsTokenCount || 0, tool_tokens: usageMetadata.toolUsePromptTokenCount || 0, }, }); if (!(0, event_controller_js_1.hasEventHandler)()) { yield { type: 'cost_update', usage: { ...calculatedUsage, total_tokens: usageMetadata.totalTokenCount || 0, }, }; } } else { console.warn('[Gemini] No usage metadata found in the response. 
Using token estimation.'); let inputText = ''; for (const content of contents) { if (content.parts) { for (const part of content.parts) { if (part.text) { inputText += part.text + '\n'; } } } } const calculatedUsage = index_js_1.costTracker.addEstimatedUsage(model, inputText, contentBuffer + thoughtBuffer, { provider: 'gemini', }); if (!(0, event_controller_js_1.hasEventHandler)()) { yield { type: 'cost_update', usage: { ...calculatedUsage, total_tokens: calculatedUsage.input_tokens + calculatedUsage.output_tokens, }, }; } } if (contentBuffer || thoughtBuffer) { yield { type: 'message_complete', content: contentBuffer, thinking_content: thoughtBuffer, message_id: messageId, }; } } catch (error) { (0, llm_logger_js_1.log_llm_error)(requestId, error); const errorMessage = error instanceof Error ? error.stack || error.message : String(error); if (errorMessage.includes('Incomplete JSON segment')) { console.error('[Gemini] Stream terminated with incomplete JSON. This may indicate network issues or timeouts.'); } console.error('\n=== Gemini error ==='); console.dir(error, { depth: null }); console.error('\n=== JSON dump of error ==='); console.error(JSON.stringify(error, Object.getOwnPropertyNames(error), 2)); console.error('\n=== Manual property walk ==='); for (const key of Reflect.ownKeys(error)) { console.error(`${String(key)}:`, error[key]); } yield { type: 'error', error: `Gemini error ${model}: ${errorMessage}`, }; if (contentBuffer || thoughtBuffer) { yield { type: 'message_complete', content: contentBuffer, thinking_content: thoughtBuffer, message_id: messageId, }; } } finally { (0, llm_logger_js_1.log_llm_response)(requestId, chunks); } } async createImage(prompt, model, opts) { try { model = model || 'imagen-3.0-generate-002'; const numberOfImages = opts?.n || 1; let aspectRatio = '1:1'; if (opts?.size === 'landscape') { aspectRatio = '16:9'; } else if (opts?.size === 'portrait') { aspectRatio = '9:16'; } console.log(`[Gemini] Generating ${numberOfImages} image(s) with model ${model}, prompt: "${prompt.substring(0, 100)}${prompt.length > 100 ? '...' : ''}"`); const response = await this.client.models.generateImages({ model, prompt, config: { numberOfImages, aspectRatio, includeSafetyAttributes: false, }, }); const images = []; if (response.generatedImages && response.generatedImages.length > 0) { for (const generatedImage of response.generatedImages) { if (generatedImage.image?.imageBytes) { const base64Image = `data:image/png;base64,${generatedImage.image.imageBytes}`; images.push(base64Image); } } const perImageCost = this.getImageCost(model); index_js_1.costTracker.addUsage({ model, image_count: images.length, metadata: { aspect_ratio: aspectRatio, cost_per_image: perImageCost, }, }); } if (images.length === 0) { throw new Error('No images returned from Gemini/Imagen'); } return images; } catch (error) { console.error('[Gemini] Error generating image:', error); throw error; } } getImageCost(model) { if (model.includes('imagen-3')) { return 0.04; } else if (model.includes('imagen-2')) { return 0.02; } return 0.04; } async createVoice(text, model = 'gemini-2.5-flash-preview-tts', opts) { try { console.log(`[Gemini] Generating speech with model ${model}, text: "${text.substring(0, 100)}${text.length > 100 ? '...' 
: ''}"`); const voiceName = this.mapVoiceToGemini(opts?.voice); const speechConfig = { voiceConfig: { prebuiltVoiceConfig: { voiceName: voiceName, }, }, }; const config = { responseModalities: [genai_1.Modality.AUDIO], speechConfig: speechConfig, }; let say_prefix = ''; let say_postfix = ''; if (opts?.speed && opts.speed !== 1.0) { const speedDescription = opts.speed < 1.0 ? `slowly at ${Math.round(opts.speed * 100)}% speed` : `quickly at ${Math.round(opts.speed * 100)}% speed`; say_postfix = speedDescription; } if (opts?.affect) { say_prefix = `Sound ${opts.affect}`; } if (say_postfix || say_prefix) { if (say_postfix && say_prefix) { text = `${say_prefix} and say ${say_postfix}:\n${text}`; } else if (say_postfix) { text = `Say ${say_postfix}:\n${text}`; } else if (say_prefix) { text = `${say_prefix} and say:\n${text}`; } } console.log(`[Gemini] Starting generateContentStream call...`); const streamPromise = this.client.models.generateContentStream({ model, contents: [{ role: 'user', parts: [{ text }] }], config, }); const textLength = text.length; index_js_1.costTracker.addUsage({ model, input_tokens: Math.ceil(textLength / 4), output_tokens: 0, metadata: { voice: voiceName, text_length: textLength, type: 'voice_generation', }, }); if (opts?.stream) { const stream = await streamPromise; const chunks = []; for await (const chunk of stream) { if (chunk.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data) { const part = chunk.candidates[0].content.parts[0]; const binaryString = atob(part.inlineData.data); const bytes = new Uint8Array(binaryString.length); for (let i = 0; i < binaryString.length; i++) { bytes[i] = binaryString.charCodeAt(i); } chunks.push(bytes); if (part.inlineData.mimeType) { console.log(`[Gemini] Audio format: ${part.inlineData.mimeType}`); } } } const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0); const combined = new Uint8Array(totalLength); let offset = 0; for (const chunk of chunks) { combined.set(chunk, offset); offset += chunk.length; } return new ReadableStream({ start(controller) { controller.enqueue(combined); controller.close(); }, }); } let allData = new Uint8Array(0); const stream = await streamPromise; for await (const chunk of stream) { if (!chunk.candidates || !chunk.candidates[0]?.content?.parts) { continue; } const part = chunk.candidates[0].content.parts[0]; if (part?.inlineData?.data) { const binaryString = atob(part.inlineData.data); const bytes = new Uint8Array(binaryString.length); for (let i = 0; i < binaryString.length; i++) { bytes[i] = binaryString.charCodeAt(i); } const newData = new Uint8Array(allData.length + bytes.length); newData.set(allData); newData.set(bytes, allData.length); allData = newData; } } if (allData.length === 0) { throw new Error('No audio data generated from Gemini TTS'); } return allData.buffer; } catch (error) { console.error('[Gemini] Error generating voice:', error); throw error; } } mapVoiceToGemini(voice) { const geminiVoices = [ 'Kore', 'Puck', 'Charon', 'Fenrir', 'Aoede', 'Glados', ]; if (!voice) { return 'Kore'; } if (geminiVoices.includes(voice)) { return voice; } const voiceMap = { alloy: 'Kore', echo: 'Puck', fable: 'Charon', onyx: 'Fenrir', nova: 'Aoede', shimmer: 'Glados', male: 'Puck', female: 'Kore', neutral: 'Charon', young: 'Aoede', mature: 'Fenrir', robotic: 'Glados', kore: 'Kore', puck: 'Puck', charon: 'Charon', fenrir: 'Fenrir', aoede: 'Aoede', glados: 'Glados', }; const mappedVoice = voiceMap[voice.toLowerCase()]; if (mappedVoice) { return mappedVoice; } console.warn(`[Gemini] 
Unknown voice '${voice}', using default voice 'Kore'`); return 'Kore'; } async *createTranscription(audio, agent, model, opts) { let session = null; let isConnected = false; try { const ai = new genai_1.GoogleGenAI({ apiKey: this.apiKey, httpOptions: { apiVersion: 'v1alpha' }, }); const realtimeInputConfig = opts?.realtimeInputConfig || { automaticActivityDetection: { disabled: false, startOfSpeechSensitivity: 'START_SENSITIVITY_HIGH', endOfSpeechSensitivity: 'END_SENSITIVITY_LOW', }, }; const speechConfig = opts?.speechConfig || { languageCode: 'en-US', }; const systemInstruction = agent.instructions || `You should reply only "OK" to every single message from the user. Nothing else.`; console.log('[Gemini] Connecting to Live API for transcription...'); const connectionPromise = new Promise((resolve, reject) => { const timeout = setTimeout(() => { reject(new Error('Connection timeout')); }, 10000); const config = { responseModalities: [genai_1.Modality.TEXT], mediaResolution: genai_1.MediaResolution.MEDIA_RESOLUTION_MEDIUM, speechConfig, realtimeInputConfig, systemInstruction: { parts: [{ text: systemInstruction }], }, inputAudioTranscription: {}, }; ai.live .connect({ model: model, config, callbacks: { onopen: () => { clearTimeout(timeout); console.log('[Gemini] Live session connected'); isConnected = true; resolve(); }, onmessage: async (msg) => { if (msg.serverContent?.inputTranscription?.text) { const previewEvent = { type: 'transcription_turn_delta', timestamp: new Date().toISOString(), delta: msg.serverContent.inputTranscription.text, }; transcriptEvents.push(previewEvent); } if (msg.serverContent?.turnComplete) { const turnEvent = { type: 'transcription_turn_complete', timestamp: new Date().toISOString(), }; transcriptEvents.push(turnEvent); } if (msg.usageMetadata) { if (msg.usageMetadata.promptTokensDetails && Array.isArray(msg.usageMetadata.promptTokensDetails)) { for (const detail of msg.usageMetadata.promptTokensDetails) { if (detail.modality && detail.tokenCount > 0) { index_js_1.costTracker.addUsage({ model: model, input_tokens: detail.tokenCount, output_tokens: 0, input_modality: detail.modality.toLowerCase(), metadata: { totalTokens: msg.usageMetadata.totalTokenCount || 0, source: 'gemini-live-transcription', modalityType: 'input', originalModality: detail.modality, }, }); } } } if (msg.usageMetadata.responseTokensDetails && Array.isArray(msg.usageMetadata.responseTokensDetails)) { for (const detail of msg.usageMetadata.responseTokensDetails) { if (detail.modality && detail.tokenCount > 0) { index_js_1.costTracker.addUsage({ model: model, input_tokens: 0, output_tokens: detail.tokenCount, output_modality: detail.modality.toLowerCase(), metadata: { totalTokens: msg.usageMetadata.totalTokenCount || 0, source: 'gemini-live-transcription', modalityType: 'output', originalModality: detail.modality, }, }); } } } if ((!msg.usageMetadata.promptTokensDetails || msg.usageMetadata.promptTokensDetails.length === 0) && (!msg.usageMetadata.responseTokensDetails || msg.usageMetadata.responseTokensDetails.length === 0)) { index_js_1.costTracker.addUsage({ model: model, input_tokens: msg.usageMetadata.promptTokenCount || 0, output_tokens: msg.usageMetadata.responseTokenCount || 0, input_modality: 'audio', output_modality: 'text', metadata: { totalTokens: msg.usageMetadata.totalTokenCount || 0, source: 'gemini-live-transcription', }, }); } } }, onerror: (err) => { console.error('[Gemini] Live API error:', { code: err.code, reason: err.reason, wasClean: err.wasClean, }); connectionError = 
err; }, onclose: (event) => { console.log('[Gemini] Live session closed'); if (event) { console.log('[Gemini] Close event details:', { code: event.code, reason: event.reason, wasClean: event.wasClean, }); }
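The provider also exposes non-streaming helpers. A short embedding sketch follows, assuming the same import path; the model id is only an illustrative example, while the 'gemini/' prefix stripping, taskType default, dimensions option, and single-vs-array return shape match createEmbedding as defined above.

// Embedding sketch: a single string input returns one vector; an array of
// strings returns one vector per entry.
const { GeminiProvider } = require('@just-every/ensemble');

async function embed() {
  const provider = new GeminiProvider(process.env.GOOGLE_API_KEY);

  const vector = await provider.createEmbedding(
    'unified streaming interface',
    'gemini/gemini-embedding-001',
    { taskType: 'SEMANTIC_SIMILARITY', dimensions: 768 }
  );
  console.log('single input ->', vector.length, 'dimensions');

  const vectors = await provider.createEmbedding(
    ['first document', 'second document'],
    'gemini-embedding-001'
  );
  console.log('array input ->', vectors.length, 'vectors');
}

embed().catch(console.error);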