@just-every/ensemble
Version:
LLM provider abstraction layer with unified streaming interface
1,160 lines • 67.3 kB
JavaScript
import { GoogleGenAI, Type, FunctionCallingConfigMode, Modality, MediaResolution, } from '@google/genai';
import { v4 as uuidv4 } from 'uuid';
import { BaseModelProvider } from './base_provider.js';
import { costTracker } from '../index.js';
import { log_llm_error, log_llm_request, log_llm_response } from '../utils/llm_logger.js';
import { isPaused } from '../utils/pause_controller.js';
import { appendMessageWithImage, resizeAndTruncateForGemini } from '../utils/image_utils.js';
import { hasEventHandler } from '../utils/event_controller.js';
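/**
 * Maps a JSON-Schema-style parameter definition onto the @google/genai Type enum.
 * Array items that are themselves objects are represented as JSON-encoded strings,
 * with the expected property names appended to the parameter description.
 */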
function convertParameterToGeminiFormat(param) {
let type = Type.STRING;
switch (param.type) {
case 'string':
type = Type.STRING;
break;
case 'number':
type = Type.NUMBER;
break;
case 'boolean':
type = Type.BOOLEAN;
break;
case 'object':
type = Type.OBJECT;
break;
case 'array':
type = Type.ARRAY;
break;
case 'null':
type = Type.STRING;
console.warn("Mapping 'null' type to STRING");
break;
default:
console.warn(`Unsupported parameter type '${param.type}'. Defaulting to STRING.`);
type = Type.STRING;
}
const result = { type, description: param.description };
if (type === Type.ARRAY) {
if (param.items) {
let itemType;
let itemEnum;
let itemProperties;
if (typeof param.items === 'object') {
itemType = param.items.type;
itemEnum = param.items.enum;
if ('properties' in param.items) {
itemProperties = param.items.properties;
}
}
if (itemType === 'object' || itemProperties) {
result.items = { type: Type.STRING };
result.description = `${result.description || 'Array parameter'} (Each item should be a JSON-encoded object)`;
if (itemProperties) {
const propNames = Object.keys(itemProperties);
result.description += `. Expected properties: ${propNames.join(', ')}`;
}
}
else if (itemType) {
result.items = {
type: itemType === 'number'
? Type.NUMBER
: itemType === 'boolean'
? Type.BOOLEAN
: Type.STRING,
};
if (itemEnum) {
if (typeof itemEnum === 'function') {
console.warn('Gemini provider does not support async enum functions in array items');
}
else {
result.items.enum = itemEnum;
}
}
}
else {
result.items = { type: Type.STRING };
}
}
else {
result.items = { type: Type.STRING };
}
}
else if (type === Type.OBJECT) {
if (param.properties && typeof param.properties === 'object') {
result.properties = {};
for (const [propName, propSchema] of Object.entries(param.properties)) {
result.properties[propName] = convertParameterToGeminiFormat(propSchema);
}
}
else {
result.properties = {};
}
}
else if (param.enum) {
if (typeof param.enum === 'function') {
console.warn('Gemini provider does not support async enum functions. Enum will be omitted.');
}
else {
result.format = 'enum';
result.enum = param.enum;
}
}
return result;
}
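/**
 * Recursively walks a parameter schema and replaces any `enum` supplied as an async
 * function with its resolved array of values; unresolvable or empty enums are dropped.
 */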
async function resolveAsyncEnums(params) {
if (!params || typeof params !== 'object') {
return params;
}
const resolved = { ...params };
if (resolved.properties) {
const resolvedProps = {};
for (const [key, value] of Object.entries(resolved.properties)) {
if (value && typeof value === 'object') {
const propCopy = { ...value };
if (typeof propCopy.enum === 'function') {
try {
const enumValue = await propCopy.enum();
if (Array.isArray(enumValue) && enumValue.length > 0) {
propCopy.enum = enumValue;
}
else {
delete propCopy.enum;
}
}
catch {
delete propCopy.enum;
}
}
resolvedProps[key] = await resolveAsyncEnums(propCopy);
}
else {
resolvedProps[key] = value;
}
}
resolved.properties = resolvedProps;
}
return resolved;
}
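/**
 * Converts ensemble tool definitions into Gemini `functionDeclarations`.
 * The pseudo-tool `google_web_search` is filtered out here and handled later
 * by enabling the `googleSearch` grounding tool on the request.
 */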
async function convertToGeminiFunctionDeclarations(tools) {
const declarations = await Promise.all(tools.map(async (tool) => {
if (tool.definition?.function?.name === 'google_web_search') {
return null;
}
const resolvedParams = await resolveAsyncEnums(tool.definition?.function?.parameters);
const toolParams = resolvedParams?.properties;
const properties = {};
if (toolParams) {
for (const [name, param] of Object.entries(toolParams)) {
properties[name] = convertParameterToGeminiFormat(param);
}
}
else {
console.warn(`Tool ${tool.definition?.function?.name || 'Unnamed Tool'} has missing or invalid parameters definition.`);
}
return {
name: tool.definition.function.name,
description: tool.definition.function.description,
parameters: {
type: Type.OBJECT,
properties,
required: Array.isArray(resolvedParams?.required) ? resolvedParams.required : [],
},
};
}));
return declarations.filter(Boolean);
}
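/** Infers the MIME type from a data URL prefix, defaulting to image/jpeg. */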
export function getImageMimeType(imageData) {
if (imageData.includes('data:image/jpeg'))
return 'image/jpeg';
if (imageData.includes('data:image/png'))
return 'image/png';
if (imageData.includes('data:image/gif'))
return 'image/gif';
if (imageData.includes('data:image/webp'))
return 'image/webp';
return 'image/jpeg';
}
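/** Strips the `data:image/...;base64,` prefix, leaving the raw base64 payload. */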
export function cleanBase64Data(imageData) {
return imageData.replace(/^data:image\/[a-z]+;base64,/, '');
}
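/** Formats Google Search grounding chunks as a numbered list of "title – uri" lines. */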
function formatGroundingChunks(chunks) {
return chunks
.filter(c => c?.web?.uri)
.map((c, i) => `${i + 1}. ${c.web.title || 'Untitled'} – ${c.web.uri}`)
.join('\n');
}
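/**
 * Appends each extracted image to the Gemini `contents` array as a user turn
 * containing a short text label plus the inline base64 data, after resizing and
 * truncating it via resizeAndTruncateForGemini.
 */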
async function addImagesToInput(input, images, source) {
for (const [image_id, imageData] of Object.entries(images)) {
const processedImageData = await resizeAndTruncateForGemini(imageData);
const mimeType = getImageMimeType(processedImageData);
const cleanedImageData = cleanBase64Data(processedImageData);
input.push({
role: 'user',
parts: [
{
text: `This is [image #${image_id}] from the ${source}`,
},
{
inlineData: {
mimeType: mimeType,
data: cleanedImageData,
},
},
],
});
}
return input;
}
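/**
 * Converts ensemble message history into Gemini `contents`: function_call messages
 * become model `functionCall` parts, function_call_output messages become user
 * `functionResponse` parts, and everything else becomes a text part (flagged as a
 * thought for `thinking` messages). Images referenced in message content are
 * appended via addImagesToInput.
 */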
async function convertToGeminiContents(model, messages) {
let contents = [];
for (const msg of messages) {
if (msg.type === 'function_call') {
let args = {};
try {
const parsedArgs = JSON.parse(msg.arguments || '{}');
args = typeof parsedArgs === 'object' && parsedArgs !== null ? parsedArgs : { value: parsedArgs };
}
catch (e) {
console.error(`Failed to parse function call arguments for ${msg.name}:`, msg.arguments, e);
args = {
error: 'Invalid JSON arguments provided',
raw_args: msg.arguments,
};
}
contents.push({
role: 'model',
parts: [
{
functionCall: {
name: msg.name,
args,
},
},
],
});
}
else if (msg.type === 'function_call_output') {
let textOutput = '';
if (typeof msg.output === 'string') {
textOutput = msg.output;
}
else {
textOutput = JSON.stringify(msg.output);
}
const message = {
role: 'user',
parts: [
{
functionResponse: {
name: msg.name,
response: { content: textOutput || '' },
},
},
],
};
contents = await appendMessageWithImage(model, contents, message, {
read: () => textOutput,
write: value => {
message.parts[0].functionResponse.response.content = value;
return message;
},
}, addImagesToInput);
}
else {
let textContent = '';
if (typeof msg.content === 'string') {
textContent = msg.content;
}
else if (msg.content && typeof msg.content === 'object' && 'text' in msg.content) {
textContent = msg.content.text;
}
else {
textContent = JSON.stringify(msg.content);
}
const role = msg.role === 'assistant' ? 'model' : 'user';
const message = {
role,
parts: [
{
thought: msg.type === 'thinking',
text: textContent.trim(),
},
],
};
contents = await appendMessageWithImage(model, contents, message, {
read: () => textContent,
write: value => {
message.parts[0].text = value;
return message;
},
}, addImagesToInput);
}
}
return contents;
}
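/**
 * Model-name suffixes mapped to Gemini thinking budgets (in tokens); a budget of 0
 * effectively disables extended thinking. The suffix is stripped from the model id
 * before the request is sent.
 */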
const THINKING_BUDGET_CONFIGS = {
'-low': 0,
'-medium': 2048,
'-high': 12288,
'-max': 24576,
};
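/**
 * Gemini implementation of BaseModelProvider. The client is created lazily from the
 * constructor key or GOOGLE_API_KEY and targets the v1alpha API.
 *
 * Hypothetical usage sketch (the message and agent shapes below are assumptions for
 * illustration, not taken from this file):
 *
 *   const provider = new GeminiProvider(process.env.GOOGLE_API_KEY);
 *   for await (const event of provider.createResponseStream(
 *       [{ role: 'user', content: 'Hello' }],
 *       'gemini-2.5-flash',
 *       { agent_id: 'demo-agent' }, // hypothetical minimal agent
 *   )) {
 *       if (event.type === 'message_delta') process.stdout.write(event.content);
 *   }
 */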
export class GeminiProvider extends BaseModelProvider {
_client;
apiKey;
constructor(apiKey) {
super('google');
this.apiKey = apiKey;
}
get client() {
if (!this._client) {
const apiKey = this.apiKey || process.env.GOOGLE_API_KEY;
if (!apiKey) {
throw new Error('Failed to initialize Gemini client. GOOGLE_API_KEY is missing or not provided.');
}
this._client = new GoogleGenAI({
apiKey: apiKey,
vertexai: false,
httpOptions: { apiVersion: 'v1alpha' },
});
}
return this._client;
}
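/**
 * Generates embeddings via models.embedContent. Token usage is estimated at roughly
 * four characters per token for cost tracking. Returns a single vector for a single
 * input, or an array of vectors for a batch of inputs.
 */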
async createEmbedding(input, model, opts) {
try {
let actualModelId = model.startsWith('gemini/') ? model.substring(7) : model;
let thinkingConfig = null;
for (const [suffix, budget] of Object.entries(THINKING_BUDGET_CONFIGS)) {
if (actualModelId.endsWith(suffix)) {
thinkingConfig = { thinkingBudget: budget };
actualModelId = actualModelId.slice(0, -suffix.length);
break;
}
}
console.log(`[Gemini] Generating embedding with model ${actualModelId}${opts?.dimensions ? ` (dimensions: ${opts.dimensions})` : ''}`);
const payload = {
model: actualModelId,
contents: input,
config: {
taskType: opts?.taskType ?? 'SEMANTIC_SIMILARITY',
...(opts?.dimensions && { outputDimensionality: opts.dimensions }),
},
};
if (thinkingConfig) {
payload.config.thinkingConfig = thinkingConfig;
}
const response = await this.client.models.embedContent(payload);
console.log('[Gemini] Embedding response structure:', JSON.stringify(response, (key, value) => key === 'values' && Array.isArray(value) && value.length > 10
? `[${value.length} items]`
: value, 2));
if (!response.embeddings || !Array.isArray(response.embeddings)) {
console.error('[Gemini] Unexpected embedding response structure:', response);
throw new Error('Invalid embedding response structure from Gemini API');
}
const estimatedTokens = typeof input === 'string'
? Math.ceil(input.length / 4)
: input.reduce((sum, text) => sum + Math.ceil(text.length / 4), 0);
let extractedValues = [];
let dimensions = 0;
if (response.embeddings.length > 0) {
if (response.embeddings[0].values) {
extractedValues = response.embeddings.map(e => e.values);
dimensions = extractedValues[0].length;
}
else {
console.warn('[Gemini] Could not find expected "values" property in embeddings response');
extractedValues = response.embeddings;
dimensions = Array.isArray(extractedValues[0]) ? extractedValues[0].length : 0;
}
}
costTracker.addUsage({
model: actualModelId,
input_tokens: estimatedTokens,
output_tokens: 0,
metadata: {
dimensions,
},
});
if (Array.isArray(input) && input.length > 1) {
return extractedValues;
}
else {
let result;
if (Array.isArray(extractedValues) && extractedValues.length >= 1) {
const firstValue = extractedValues[0];
if (Array.isArray(firstValue)) {
result = firstValue;
}
else {
console.error('[Gemini] Unexpected format in embedding result:', firstValue);
result = [];
}
}
else {
result = [];
}
return result;
}
}
catch (error) {
console.error('[Gemini] Error generating embedding:', error);
throw error;
}
}
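/**
 * Wraps a streaming request and retries with linear backoff when the SDK throws an
 * "Incomplete JSON segment" error, up to maxRetries additional attempts. Chunks
 * already yielded before a failure are not rolled back, so a retry restarts the
 * stream from the beginning.
 */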
async *retryStreamOnIncompleteJson(requestFn, maxRetries = 2) {
let attempts = 0;
while (attempts <= maxRetries) {
try {
const stream = await requestFn();
for await (const chunk of stream) {
yield chunk;
}
return;
}
catch (error) {
attempts++;
const errorMsg = error instanceof Error ? error.message : String(error);
if (errorMsg.includes('Incomplete JSON segment') && attempts <= maxRetries) {
console.warn(`[Gemini] Incomplete JSON segment error, retrying (${attempts}/${maxRetries})...`);
await new Promise(resolve => setTimeout(resolve, 1000 * attempts));
continue;
}
throw error;
}
}
}
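/**
 * Streams a chat completion as ensemble events: message_delta (with thinking_content
 * for thought parts), tool_start for function calls, file_complete for inline data,
 * cost_update when no external event handler is registered, and a final
 * message_complete. Model-name suffixes such as "-low" or "-high" select a thinking
 * budget, and a google_web_search tool switches the request to Google Search grounding.
 */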
async *createResponseStream(messages, model, agent) {
const { getToolsFromAgent } = await import('../utils/agent.js');
const tools = agent ? await getToolsFromAgent(agent) : [];
const settings = agent?.modelSettings;
let messageId = uuidv4();
let contentBuffer = '';
let thoughtBuffer = '';
let eventOrder = 0;
const shownGrounding = new Set();
let requestId = undefined;
const chunks = [];
try {
const contents = await convertToGeminiContents(model, messages);
if (contents.length === 0) {
console.warn('Gemini API Warning: No valid content found in messages after conversion. Adding default message.');
contents.push({
role: 'user',
parts: [
{
text: "Let's think this through step by step.",
},
],
});
}
const lastContent = contents[contents.length - 1];
if (lastContent.role !== 'user') {
console.warn("Last message in history is not from 'user'. Gemini might not respond as expected.");
}
let thinkingBudget = null;
for (const [suffix, budget] of Object.entries(THINKING_BUDGET_CONFIGS)) {
if (model.endsWith(suffix)) {
thinkingBudget = budget;
model = model.slice(0, -suffix.length);
break;
}
}
const config = {
thinkingConfig: {
includeThoughts: true,
},
};
if (thinkingBudget !== null) {
config.thinkingConfig.thinkingBudget = thinkingBudget;
}
if (settings?.stop_sequence) {
config.stopSequences = [settings.stop_sequence];
}
if (settings?.temperature != null) {
config.temperature = settings.temperature;
}
if (settings?.max_tokens) {
config.maxOutputTokens = settings.max_tokens;
}
if (settings?.top_p) {
config.topP = settings.top_p;
}
if (settings?.top_k) {
config.topK = settings.top_k;
}
if (settings?.json_schema) {
config.responseMimeType = 'application/json';
config.responseSchema = settings.json_schema.schema;
if (config.responseSchema) {
const removeAdditionalProperties = (obj) => {
if (!obj || typeof obj !== 'object') {
return;
}
if ('additionalProperties' in obj) {
delete obj.additionalProperties;
}
if (obj.properties && typeof obj.properties === 'object') {
Object.values(obj.properties).forEach(prop => {
removeAdditionalProperties(prop);
});
}
if (obj.items) {
removeAdditionalProperties(obj.items);
}
['oneOf', 'anyOf', 'allOf'].forEach(key => {
if (obj[key] && Array.isArray(obj[key])) {
obj[key].forEach((subSchema) => {
removeAdditionalProperties(subSchema);
});
}
});
};
removeAdditionalProperties(config.responseSchema);
}
}
let hasGoogleWebSearch = false;
if (tools && tools.length > 0) {
hasGoogleWebSearch = tools.some(tool => tool.definition.function.name === 'google_web_search');
const functionDeclarations = await convertToGeminiFunctionDeclarations(tools);
let allowedFunctionNames = [];
if (functionDeclarations.length > 0) {
config.tools = [{ functionDeclarations }];
if (settings?.tool_choice) {
let toolChoice;
if (typeof settings.tool_choice === 'object' &&
settings.tool_choice?.type === 'function' &&
settings.tool_choice?.function?.name) {
toolChoice = FunctionCallingConfigMode.ANY;
allowedFunctionNames = [settings.tool_choice.function.name];
}
else if (settings.tool_choice === 'required') {
toolChoice = FunctionCallingConfigMode.ANY;
}
else if (settings.tool_choice === 'auto') {
toolChoice = FunctionCallingConfigMode.AUTO;
}
else if (settings.tool_choice === 'none') {
toolChoice = FunctionCallingConfigMode.NONE;
}
if (toolChoice) {
config.toolConfig = {
functionCallingConfig: {
mode: toolChoice,
},
};
if (allowedFunctionNames.length > 0) {
config.toolConfig.functionCallingConfig.allowedFunctionNames = allowedFunctionNames;
}
}
}
}
else if (!hasGoogleWebSearch) {
console.warn('Tools were provided but resulted in empty declarations after conversion.');
}
}
if (hasGoogleWebSearch) {
console.log('[Gemini] Enabling Google Search grounding');
config.tools = [{ googleSearch: {} }];
config.toolConfig = {
functionCallingConfig: {
mode: FunctionCallingConfigMode.ANY,
allowedFunctionNames: ['googleSearch'],
},
};
}
const requestParams = {
model,
contents,
config,
};
requestId = log_llm_request(agent?.agent_id, 'google', model, requestParams);
const { waitWhilePaused } = await import('../utils/pause_controller.js');
await waitWhilePaused(100, agent?.abortSignal);
const getStreamFn = () => this.client.models.generateContentStream(requestParams);
const response = this.retryStreamOnIncompleteJson(getStreamFn);
let usageMetadata;
for await (const chunk of response) {
chunks.push(chunk);
if (chunk.responseId) {
messageId = chunk.responseId;
}
if (isPaused()) {
console.log(`[Gemini] System paused during stream for model ${model}. Waiting...`);
await waitWhilePaused(100, agent?.abortSignal);
console.log(`[Gemini] System resumed, continuing stream for model ${model}`);
}
if (chunk.functionCalls && chunk.functionCalls.length > 0) {
for (const fc of chunk.functionCalls) {
if (fc && fc.name) {
yield {
type: 'tool_start',
tool_call: {
id: fc.id || `call_${uuidv4()}`,
type: 'function',
function: {
name: fc.name,
arguments: JSON.stringify(fc.args || {}),
},
},
};
}
}
}
for (const candidate of chunk.candidates ?? []) {
if (candidate.content?.parts) {
for (const part of candidate.content.parts) {
let text = '';
if (part.text) {
text += part.text;
}
if (part.executableCode) {
if (text) {
text += '\n\n';
}
// executableCode may be an object ({ language, code }) rather than a plain string
text += typeof part.executableCode === 'string'
? part.executableCode
: (part.executableCode.code ?? JSON.stringify(part.executableCode));
}
if (part.videoMetadata) {
if (text) {
text += '\n\n';
}
text += JSON.stringify(part.videoMetadata);
}
if (text.length > 0) {
const ev = {
type: 'message_delta',
content: '',
message_id: messageId,
order: eventOrder++,
};
if (part.thought) {
thoughtBuffer += text;
ev.thinking_content = text;
}
else {
contentBuffer += text;
ev.content = text;
}
yield ev;
}
if (part.inlineData?.data) {
yield {
type: 'file_complete',
data_format: 'base64',
data: part.inlineData.data,
mime_type: part.inlineData.mimeType || 'image/png',
message_id: uuidv4(),
order: eventOrder++,
};
}
}
}
const gChunks = candidate.groundingMetadata?.groundingChunks;
if (Array.isArray(gChunks)) {
const newChunks = gChunks.filter(c => c?.web?.uri && !shownGrounding.has(c.web.uri));
if (newChunks.length) {
newChunks.forEach(c => shownGrounding.add(c.web.uri));
const formatted = formatGroundingChunks(newChunks);
yield {
type: 'message_delta',
content: '\n\nSearch Results:\n' + formatted + '\n',
message_id: messageId,
order: eventOrder++,
};
contentBuffer += '\n\nSearch Results:\n' + formatted + '\n';
}
}
}
if (chunk.usageMetadata) {
usageMetadata = chunk.usageMetadata;
}
}
if (usageMetadata) {
const calculatedUsage = costTracker.addUsage({
model,
input_tokens: usageMetadata.promptTokenCount || 0,
output_tokens: usageMetadata.candidatesTokenCount || 0,
cached_tokens: usageMetadata.cachedContentTokenCount || 0,
metadata: {
total_tokens: usageMetadata.totalTokenCount || 0,
reasoning_tokens: usageMetadata.thoughtsTokenCount || 0,
tool_tokens: usageMetadata.toolUsePromptTokenCount || 0,
},
});
if (!hasEventHandler()) {
yield {
type: 'cost_update',
usage: {
...calculatedUsage,
total_tokens: usageMetadata.totalTokenCount || 0,
},
};
}
}
else {
console.warn('[Gemini] No usage metadata found in the response. Using token estimation.');
let inputText = '';
for (const content of contents) {
if (content.parts) {
for (const part of content.parts) {
if (part.text) {
inputText += part.text + '\n';
}
}
}
}
const calculatedUsage = costTracker.addEstimatedUsage(model, inputText, contentBuffer + thoughtBuffer, {
provider: 'gemini',
});
if (!hasEventHandler()) {
yield {
type: 'cost_update',
usage: {
...calculatedUsage,
total_tokens: calculatedUsage.input_tokens + calculatedUsage.output_tokens,
},
};
}
}
if (contentBuffer || thoughtBuffer) {
yield {
type: 'message_complete',
content: contentBuffer,
thinking_content: thoughtBuffer,
message_id: messageId,
};
}
}
catch (error) {
log_llm_error(requestId, error);
const errorMessage = error instanceof Error ? error.stack || error.message : String(error);
if (errorMessage.includes('Incomplete JSON segment')) {
console.error('[Gemini] Stream terminated with incomplete JSON. This may indicate network issues or timeouts.');
}
console.error('\n=== Gemini error ===');
console.dir(error, { depth: null });
console.error('\n=== JSON dump of error ===');
console.error(JSON.stringify(error, Object.getOwnPropertyNames(error), 2));
console.error('\n=== Manual property walk ===');
for (const key of Reflect.ownKeys(error)) {
console.error(`${String(key)}:`, error[key]);
}
yield {
type: 'error',
error: `Gemini error ${model}: ${errorMessage}`,
};
if (contentBuffer || thoughtBuffer) {
yield {
type: 'message_complete',
content: contentBuffer,
thinking_content: thoughtBuffer,
message_id: messageId,
};
}
}
finally {
log_llm_response(requestId, chunks);
}
}
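/**
 * Generates images with Imagen via models.generateImages and returns them as
 * data-URL strings. opts.size maps 'landscape' to 16:9 and 'portrait' to 9:16;
 * anything else uses a 1:1 aspect ratio.
 */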
async createImage(prompt, model, opts) {
try {
model = model || 'imagen-3.0-generate-002';
const numberOfImages = opts?.n || 1;
let aspectRatio = '1:1';
if (opts?.size === 'landscape') {
aspectRatio = '16:9';
}
else if (opts?.size === 'portrait') {
aspectRatio = '9:16';
}
console.log(`[Gemini] Generating ${numberOfImages} image(s) with model ${model}, prompt: "${prompt.substring(0, 100)}${prompt.length > 100 ? '...' : ''}"`);
const response = await this.client.models.generateImages({
model,
prompt,
config: {
numberOfImages,
aspectRatio,
includeSafetyAttributes: false,
},
});
const images = [];
if (response.generatedImages && response.generatedImages.length > 0) {
for (const generatedImage of response.generatedImages) {
if (generatedImage.image?.imageBytes) {
const base64Image = `data:image/png;base64,${generatedImage.image.imageBytes}`;
images.push(base64Image);
}
}
const perImageCost = this.getImageCost(model);
costTracker.addUsage({
model,
image_count: images.length,
metadata: {
aspect_ratio: aspectRatio,
cost_per_image: perImageCost,
},
});
}
if (images.length === 0) {
throw new Error('No images returned from Gemini/Imagen');
}
return images;
}
catch (error) {
console.error('[Gemini] Error generating image:', error);
throw error;
}
}
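/** Approximate per-image cost in USD by model family, used for cost-tracking metadata. */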
getImageCost(model) {
if (model.includes('imagen-3')) {
return 0.04;
}
else if (model.includes('imagen-2')) {
return 0.02;
}
return 0.04;
}
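/**
 * Text-to-speech via the Gemini TTS models. Speed and affect options are expressed
 * as natural-language instructions prepended to the text. Even when opts.stream is
 * set, the audio is fully buffered and returned as a single-chunk ReadableStream;
 * otherwise an ArrayBuffer of the concatenated audio data is returned.
 */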
async createVoice(text, model = 'gemini-2.5-flash-preview-tts', opts) {
try {
console.log(`[Gemini] Generating speech with model ${model}, text: "${text.substring(0, 100)}${text.length > 100 ? '...' : ''}"`);
const voiceName = this.mapVoiceToGemini(opts?.voice);
const speechConfig = {
voiceConfig: {
prebuiltVoiceConfig: {
voiceName: voiceName,
},
},
};
const config = {
responseModalities: [Modality.AUDIO],
speechConfig: speechConfig,
};
let say_prefix = '';
let say_postfix = '';
if (opts?.speed && opts.speed !== 1.0) {
const speedDescription = opts.speed < 1.0
? `slowly at ${Math.round(opts.speed * 100)}% speed`
: `quickly at ${Math.round(opts.speed * 100)}% speed`;
say_postfix = speedDescription;
}
if (opts?.affect) {
say_prefix = `Sound ${opts.affect}`;
}
if (say_postfix || say_prefix) {
if (say_postfix && say_prefix) {
text = `${say_prefix} and say ${say_postfix}:\n${text}`;
}
else if (say_postfix) {
text = `Say ${say_postfix}:\n${text}`;
}
else if (say_prefix) {
text = `${say_prefix} and say:\n${text}`;
}
}
console.log(`[Gemini] Starting generateContentStream call...`);
const streamPromise = this.client.models.generateContentStream({
model,
contents: [{ role: 'user', parts: [{ text }] }],
config,
});
const textLength = text.length;
costTracker.addUsage({
model,
input_tokens: Math.ceil(textLength / 4),
output_tokens: 0,
metadata: {
voice: voiceName,
text_length: textLength,
type: 'voice_generation',
},
});
if (opts?.stream) {
const stream = await streamPromise;
const chunks = [];
for await (const chunk of stream) {
if (chunk.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data) {
const part = chunk.candidates[0].content.parts[0];
const binaryString = atob(part.inlineData.data);
const bytes = new Uint8Array(binaryString.length);
for (let i = 0; i < binaryString.length; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
chunks.push(bytes);
if (part.inlineData.mimeType) {
console.log(`[Gemini] Audio format: ${part.inlineData.mimeType}`);
}
}
}
const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
const combined = new Uint8Array(totalLength);
let offset = 0;
for (const chunk of chunks) {
combined.set(chunk, offset);
offset += chunk.length;
}
return new ReadableStream({
start(controller) {
controller.enqueue(combined);
controller.close();
},
});
}
let allData = new Uint8Array(0);
const stream = await streamPromise;
for await (const chunk of stream) {
if (!chunk.candidates || !chunk.candidates[0]?.content?.parts) {
continue;
}
const part = chunk.candidates[0].content.parts[0];
if (part?.inlineData?.data) {
const binaryString = atob(part.inlineData.data);
const bytes = new Uint8Array(binaryString.length);
for (let i = 0; i < binaryString.length; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
const newData = new Uint8Array(allData.length + bytes.length);
newData.set(allData);
newData.set(bytes, allData.length);
allData = newData;
}
}
if (allData.length === 0) {
throw new Error('No audio data generated from Gemini TTS');
}
return allData.buffer;
}
catch (error) {
console.error('[Gemini] Error generating voice:', error);
throw error;
}
}
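/**
 * Maps common voice aliases (including OpenAI-style names like 'alloy' or 'nova')
 * onto Gemini prebuilt voice names, defaulting to 'Kore' for unknown values.
 */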
mapVoiceToGemini(voice) {
const geminiVoices = [
'Kore',
'Puck',
'Charon',
'Fenrir',
'Aoede',
'Glados',
];
if (!voice) {
return 'Kore';
}
if (geminiVoices.includes(voice)) {
return voice;
}
const voiceMap = {
alloy: 'Kore',
echo: 'Puck',
fable: 'Charon',
onyx: 'Fenrir',
nova: 'Aoede',
shimmer: 'Glados',
male: 'Puck',
female: 'Kore',
neutral: 'Charon',
young: 'Aoede',
mature: 'Fenrir',
robotic: 'Glados',
kore: 'Kore',
puck: 'Puck',
charon: 'Charon',
fenrir: 'Fenrir',
aoede: 'Aoede',
glados: 'Glados',
};
const mappedVoice = voiceMap[voice.toLowerCase()];
if (mappedVoice) {
return mappedVoice;
}
console.warn(`[Gemini] Unknown voice '${voice}', using default voice 'Kore'`);
return 'Kore';
}
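/**
 * Live speech-to-text over the Gemini Live API: connects a realtime session, streams
 * 16 kHz PCM audio chunks from the source, and yields transcription_turn_delta /
 * transcription_turn_complete events as the server reports input transcriptions.
 * Usage metadata from the session is fed to the cost tracker per modality.
 */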
async *createTranscription(audio, agent, model, opts) {
let session = null;
let isConnected = false;
try {
const ai = new GoogleGenAI({
apiKey: this.apiKey,
httpOptions: { apiVersion: 'v1alpha' },
});
const realtimeInputConfig = opts?.realtimeInputConfig || {
automaticActivityDetection: {
disabled: false,
startOfSpeechSensitivity: 'START_SENSITIVITY_HIGH',
endOfSpeechSensitivity: 'END_SENSITIVITY_LOW',
},
};
const speechConfig = opts?.speechConfig || {
languageCode: 'en-US',
};
const systemInstruction = agent.instructions || `You should reply only "OK" to every single message from the user. Nothing else.`;
console.log('[Gemini] Connecting to Live API for transcription...');
const transcriptEvents = [];
let connectionError = null;
const connectionPromise = new Promise((resolve, reject) => {
const timeout = setTimeout(() => {
reject(new Error('Connection timeout'));
}, 10000);
const config = {
responseModalities: [Modality.TEXT],
mediaResolution: MediaResolution.MEDIA_RESOLUTION_MEDIUM,
speechConfig,
realtimeInputConfig,
systemInstruction: {
parts: [{ text: systemInstruction }],
},
inputAudioTranscription: {},
};
ai.live
.connect({
model: model,
config,
callbacks: {
onopen: () => {
clearTimeout(timeout);
console.log('[Gemini] Live session connected');
isConnected = true;
resolve();
},
onmessage: async (msg) => {
if (msg.serverContent?.inputTranscription?.text) {
const previewEvent = {
type: 'transcription_turn_delta',
timestamp: new Date().toISOString(),
delta: msg.serverContent.inputTranscription.text,
};
transcriptEvents.push(previewEvent);
}
if (msg.serverContent?.turnComplete) {
const turnEvent = {
type: 'transcription_turn_complete',
timestamp: new Date().toISOString(),
};
transcriptEvents.push(turnEvent);
}
if (msg.usageMetadata) {
if (msg.usageMetadata.promptTokensDetails &&
Array.isArray(msg.usageMetadata.promptTokensDetails)) {
for (const detail of msg.usageMetadata.promptTokensDetails) {
if (detail.modality && detail.tokenCount > 0) {
costTracker.addUsage({
model: model,
input_tokens: detail.tokenCount,
output_tokens: 0,
input_modality: detail.modality.toLowerCase(),
metadata: {
totalTokens: msg.usageMetadata.totalTokenCount || 0,
source: 'gemini-live-transcription',
modalityType: 'input',
originalModality: detail.modality,
},
});
}
}
}
if (msg.usageMetadata.responseTokensDetails &&
Array.isArray(msg.usageMetadata.responseTokensDetails)) {
for (const detail of msg.usageMetadata.responseTokensDetails) {
if (detail.modality && detail.tokenCount > 0) {
costTracker.addUsage({
model: model,
input_tokens: 0,
output_tokens: detail.tokenCount,
output_modality: detail.modality.toLowerCase(),
metadata: {
totalTokens: msg.usageMetadata.totalTokenCount || 0,
source: 'gemini-live-transcription',
modalityType: 'output',
originalModality: detail.modality,
},
});
}
}
}
if ((!msg.usageMetadata.promptTokensDetails ||
msg.usageMetadata.promptTokensDetails.length === 0) &&
(!msg.usageMetadata.responseTokensDetails ||
msg.usageMetadata.responseTokensDetails.length === 0)) {
costTracker.addUsage({
model: model,
input_tokens: msg.usageMetadata.promptTokenCount || 0,
output_tokens: msg.usageMetadata.responseTokenCount || 0,
input_modality: 'audio',
output_modality: 'text',
metadata: {
totalTokens: msg.usageMetadata.totalTokenCount || 0,
source: 'gemini-live-transcription',
},
});
}
}
},
onerror: (err) => {
console.error('[Gemini] Live API error:', {
code: err.code,
reason: err.reason,
wasClean: err.wasClean,
});
connectionError = err;
},
onclose: (event) => {
console.log('[Gemini] Live session closed');
if (event) {
console.log('[Gemini] Close event details:', {
code: event.code,
reason: event.reason,
wasClean: event.wasClean,
});
}
isConnected = false;
},
},
})
.then(async (s) => {
session = s;
});
});
await connectionPromise;
const audioStream = normalizeAudioSource(audio);
const reader = audioStream.getReader();
const sendAudioChunk = async (chunk) => {
try {
const base64Data = chunk.toString('base64');
await session.sendRealtimeInput({
media: {
mimeType: 'audio/pcm;rate=16000',
data: base64Data,
},
});
}
catch (err) {
console.error('[Gemini] Error sending audio chunk:', err);
connectionError = err;
throw err;
}
};
try {
while (true) {
const { done, value } = await reader.read();
if (done)
break;
if (value && session && isConnected) {
const chunk = value instanceof Buffer ? value : Buffer.from(value);
await sendAudioChunk(chunk);
}
if (transcriptEvents.length > 0) {
const events = transcriptEvents.splice(0, transcriptEvents.length);
for (const event of events) {
yield event;
}
}
if (connectionError) {
throw connectionError;
}
}
await new Promise(resolve => setTimeout(resolve, 1000));
if (transcriptEvents.length > 0) {
const events = transcriptEvents.splice(0, transcriptEvents.length);
for (const ev