UNPKG

@ai-sdk/google

Version:

The **[Google Generative AI provider](https://ai-sdk.dev/providers/ai-sdk-providers/google-generative-ai)** for the [AI SDK](https://ai-sdk.dev/docs) contains language model support for the [Google Generative AI](https://ai.google/discover/generativeai/) APIs.

843 lines (807 loc) 30.4 kB
import type {
  JSONValue,
  LanguageModelV3FinishReason,
  LanguageModelV3Source,
  LanguageModelV3StreamPart,
  SharedV3ProviderMetadata,
  SharedV3Warning,
} from '@ai-sdk/provider';
import type { ParseResult } from '@ai-sdk/provider-utils';
import type {
  GoogleInteractionsEvent,
  GoogleInteractionsUsage,
} from './google-interactions-api';
import { convertGoogleInteractionsUsage } from './convert-google-interactions-usage';
import {
  annotationsToSources,
  builtinToolResultToSources,
} from './extract-google-interactions-sources';
import { mapGoogleInteractionsFinishReason } from './map-google-interactions-finish-reason';
import type {
  GoogleInteractionsAnnotation,
  GoogleInteractionsBuiltinToolResultContent,
  GoogleInteractionsStatus,
} from './google-interactions-prompt';

/** Step `type` discriminators that denote a provider-executed tool call. */
const BUILTIN_TOOL_CALL_TYPES = new Set([
  'google_search_call',
  'code_execution_call',
  'url_context_call',
  'file_search_call',
  'google_maps_call',
  'mcp_server_tool_call',
]);

/** Step `type` discriminators that denote a provider-executed tool result. */
const BUILTIN_TOOL_RESULT_TYPES = new Set([
  'google_search_result',
  'code_execution_result',
  'url_context_result',
  'file_search_result',
  'google_maps_result',
  'mcp_server_tool_result',
]);

/** Strips a trailing `_call` from a step type to obtain the tool name. */
function builtinToolNameFromCallType(type: string): string {
  return type.replace(/_call$/, '');
}

/** Strips a trailing `_result` from a step type to obtain the tool name. */
function builtinToolNameFromResultType(type: string): string {
  return type.replace(/_result$/, '');
}

/**
 * State tracked for one open step slot (keyed by the event `index`) between
 * its `step.start` and `step.stop` events.
 */
type OpenBlockState =
  | { kind: 'text'; id: string; emittedSourceKeys: Set<string> }
  | {
      kind: 'reasoning';
      id: string;
      signature?: string;
    }
  | {
      kind: 'image';
      id: string;
      data?: string;
      mimeType?: string;
      uri?: string;
    }
  | {
      kind: 'function_call';
      id: string;
      toolCallId: string;
      toolName: string;
      /**
       * Accumulator for partial JSON arguments. Arguments stream as a
       * sequence of `arguments_delta` substrings on `step.delta`; each one is
       * appended verbatim and surfaced as a `tool-input-delta`. On
       * `step.stop` the accumulated string is emitted as the `input` of the
       * final `tool-call` event.
       */
      argumentsAccum: string;
      signature?: string;
    }
  | {
      kind: 'builtin_tool_call';
      id: string;
      blockType: string;
      toolCallId: string;
      toolName: string;
      arguments: Record<string, unknown>;
      callEmitted: boolean;
    }
  | {
      kind: 'builtin_tool_result';
      id: string;
      blockType: string;
      callId: string;
      toolName: string;
      result: unknown;
      isError?: boolean;
      resultEmitted: boolean;
    }
  /**
   * A `model_output` step whose inner content-block kind has not yet been
   * disambiguated. `step.start` may arrive bare (`{type: 'model_output'}`,
   * no content payload); the first text-shaped `step.delta` promotes the
   * block to `text`. Image deltas are emitted inline and do not change the
   * block state.
   */
  | { kind: 'pending_model_output'; id: string }
  | { kind: 'unknown'; id: string };

/**
 * Builds a `TransformStream<ParseResult<GoogleInteractionsEvent>, LanguageModelV3StreamPart>`
 * over the Interactions API SSE event stream.
 *
 * Surfaces text + thought (reasoning), function_call, image, built-in tool
 * call/result steps, and `text_annotation` -> `source` parts. A `stream-start`
 * part is emitted first and a single `finish` part is emitted on flush.
 *
 * @param options.warnings - Warnings forwarded on the `stream-start` part.
 * @param options.generateId - ID generator handed to the source extractors.
 * @param options.includeRawChunks - When true, every incoming chunk is also
 *   emitted as a `raw` part before it is interpreted.
 * @param options.serviceTier - Header-derived service tier fallback; replaced
 *   by `interaction.service_tier` from `interaction.completed` when present.
 * @returns The transform stream mapping parsed events to V3 stream parts.
 */
export function buildGoogleInteractionsStreamTransform({
  warnings,
  generateId,
  includeRawChunks,
  serviceTier: headerServiceTier,
}: {
  warnings: Array<SharedV3Warning>;
  generateId: () => string;
  includeRawChunks?: boolean;
  /**
   * Defensive fallback for service tier read from the `x-gemini-service-tier`
   * HTTP response header. The Interactions API surfaces the applied tier in
   * the `interaction.completed` event body (see `service_tier` below); this
   * parameter exists so we still surface a tier if the API later starts
   * sending the header.
   */
  serviceTier?: string;
}): TransformStream<
  ParseResult<GoogleInteractionsEvent>,
  LanguageModelV3StreamPart
> {
  let interactionId: string | undefined;
  let usage: GoogleInteractionsUsage | undefined;
  let serviceTier: string | undefined = headerServiceTier;
  let finishStatus: GoogleInteractionsStatus | string | undefined;
  let hasFunctionCall = false;

  /*
   * Per-index open step slots. The Interactions API frames concurrent steps
   * (e.g. text alongside thought) by `index`; we track each open slot
   * independently so a text delta at index N never collides with a thought
   * delta at index M.
   */
  const openBlocks = new Map<number, OpenBlockState>();

  /*
   * De-duplicate sources across the whole stream. Citations often re-appear
   * across multiple `text_annotation` deltas as the model's text grows;
   * surface each unique URL once.
   */
  const emittedSourceKeys = new Set<string>();

  /** De-duplication key: the URL for url sources, else filename or title. */
  function sourceKey(source: LanguageModelV3Source): string {
    return source.sourceType === 'url'
      ? `url:${source.url}`
      : `doc:${source.filename ?? source.title}`;
  }

  /**
   * Builds `{ google: {...} }` from the defined entries of `fields`, keeping
   * their insertion order; returns `undefined` when no entry is defined.
   */
  function googleMetadata(
    fields: Record<string, string | undefined>,
  ): SharedV3ProviderMetadata | undefined {
    const google: Record<string, string> = {};
    for (const [name, fieldValue] of Object.entries(fields)) {
      if (fieldValue != null) google[name] = fieldValue;
    }
    return Object.keys(google).length > 0 ? { google } : undefined;
  }

  /**
   * Enqueues every source not yet seen in this stream. When `blockKeys` is
   * given, the keys of newly emitted sources are recorded there as well.
   */
  function emitNewSources(
    controller: TransformStreamDefaultController<LanguageModelV3StreamPart>,
    sources: Iterable<LanguageModelV3Source>,
    blockKeys?: Set<string>,
  ): void {
    for (const source of sources) {
      const key = sourceKey(source);
      if (emittedSourceKeys.has(key)) continue;
      emittedSourceKeys.add(key);
      blockKeys?.add(key);
      controller.enqueue(source);
    }
  }

  /**
   * Emits an image as a `file` part. Inline `data` wins over `uri`.
   *
   * @returns `true` when a part was enqueued, `false` when the image carried
   *   neither non-empty data nor a non-empty URI.
   */
  function emitImageFile(
    controller: TransformStreamDefaultController<LanguageModelV3StreamPart>,
    image: {
      data: string | undefined;
      uri: string | undefined;
      mimeType: string | undefined;
    },
  ): boolean {
    const mediaType = image.mimeType ?? 'image/png';

    if (image.data != null && image.data.length > 0) {
      const providerMetadata = googleMetadata({ interactionId });
      controller.enqueue({
        type: 'file',
        mediaType,
        data: image.data,
        ...(providerMetadata ? { providerMetadata } : {}),
      });
      return true;
    }

    if (image.uri != null && image.uri.length > 0) {
      /*
       * V3 `LanguageModelV3File` only supports inline data (`string` /
       * `Uint8Array`). URL-only image outputs cannot be represented as
       * a file stream part on the v3 spec; surface the URI through
       * provider metadata so callers can still recover it.
       */
      controller.enqueue({
        type: 'file',
        mediaType,
        data: '',
        providerMetadata: {
          google: {
            ...(interactionId != null ? { interactionId } : {}),
            imageUri: image.uri,
          },
        },
      });
      return true;
    }

    return false;
  }

  return new TransformStream<
    ParseResult<GoogleInteractionsEvent>,
    LanguageModelV3StreamPart
  >({
    start(controller) {
      controller.enqueue({ type: 'stream-start', warnings });
    },

    transform(chunk, controller) {
      if (includeRawChunks) {
        controller.enqueue({ type: 'raw', rawValue: chunk.rawValue });
      }

      // A chunk that failed to parse is reported as an error part and marks
      // the interaction as failed for the final finish reason.
      if (!chunk.success) {
        finishStatus = 'failed';
        controller.enqueue({ type: 'error', error: chunk.error });
        return;
      }

      const value = chunk.value;
      const eventType = (value as { event_type?: string }).event_type;

      switch (eventType) {
        case 'interaction.created': {
          const event = value as Extract<
            GoogleInteractionsEvent,
            { event_type: 'interaction.created' }
          >;
          const interaction = event.interaction;

          /*
           * The Interactions API returns `id: ""` (empty string) on streaming
           * events when running with `store: false` — there is no server-side
           * record. Treat empty string the same as missing so providerMetadata
           * stays clean.
           */
          interactionId =
            interaction?.id != null && interaction.id.length > 0
              ? interaction.id
              : undefined;

          const created = (interaction as { created?: string } | undefined)
            ?.created;
          let timestamp: Date | undefined;
          if (typeof created === 'string') {
            const parsed = new Date(created);
            // Drop unparseable timestamps rather than emitting Invalid Date.
            if (!Number.isNaN(parsed.getTime())) {
              timestamp = parsed;
            }
          }

          controller.enqueue({
            type: 'response-metadata',
            ...(interactionId != null ? { id: interactionId } : {}),
            modelId: (interaction as { model?: string } | undefined)?.model,
            ...(timestamp ? { timestamp } : {}),
          });
          break;
        }

        case 'step.start': {
          const event = value as Extract<
            GoogleInteractionsEvent,
            { event_type: 'step.start' }
          >;
          const step = event.step as
            | {
                type?: string;
                id?: string;
                call_id?: string;
                name?: string;
                arguments?: Record<string, unknown>;
                signature?: string;
                summary?: Array<{ type?: string; text?: string }>;
                result?: unknown;
                is_error?: boolean;
                content?: Array<{
                  type?: string;
                  text?: string;
                  data?: string;
                  mime_type?: string;
                  uri?: string;
                  annotations?: Array<GoogleInteractionsAnnotation>;
                }>;
              }
            | undefined;
          const index = event.index;
          const blockId = `${interactionId ?? 'interaction'}:${index}`;
          const stepType = step?.type;

          if (stepType === 'model_output') {
            /*
             * `step.start` for a `model_output` step often carries only the
             * type discriminator — content/image payloads then arrive on
             * subsequent `step.delta` events. Open in a transitional
             * `pending_model_output` state; the first text-shaped delta
             * promotes it to `text` (and emits `text-start`).
             *
             * `step.content[0]` may also arrive populated as a hint; when
             * present, promote eagerly.
             */
            const initial = step?.content?.[0] as
              | {
                  type?: string;
                  text?: string;
                  data?: string;
                  mime_type?: string;
                  uri?: string;
                  annotations?: Array<GoogleInteractionsAnnotation>;
                }
              | undefined;

            if (initial?.type === 'text') {
              openBlocks.set(index, {
                kind: 'text',
                id: blockId,
                emittedSourceKeys: new Set<string>(),
              });
              controller.enqueue({ type: 'text-start', id: blockId });
              emitNewSources(
                controller,
                annotationsToSources({
                  annotations: initial.annotations,
                  generateId,
                }),
              );
            } else if (initial?.type === 'image') {
              // Payload is held until `step.stop` unless an image delta
              // emits a file part first.
              openBlocks.set(index, {
                kind: 'image',
                id: blockId,
                ...(initial.data != null ? { data: initial.data } : {}),
                ...(initial.mime_type != null
                  ? { mimeType: initial.mime_type }
                  : {}),
                ...(initial.uri != null ? { uri: initial.uri } : {}),
              });
            } else {
              openBlocks.set(index, {
                kind: 'pending_model_output',
                id: blockId,
              });
            }
          } else if (stepType === 'thought') {
            const signature = step?.signature;
            openBlocks.set(index, {
              kind: 'reasoning',
              id: blockId,
              ...(signature != null ? { signature } : {}),
            });
            controller.enqueue({ type: 'reasoning-start', id: blockId });

            /*
             * A `thought` step's initial `summary[]` may already contain text
             * items on `step.start` — emit those as reasoning deltas so the
             * consumer's reasoning buffer is up to date before any delta
             * arrives.
             */
            if (Array.isArray(step?.summary)) {
              for (const item of step.summary) {
                if (item?.type === 'text' && typeof item.text === 'string') {
                  controller.enqueue({
                    type: 'reasoning-delta',
                    id: blockId,
                    delta: item.text,
                  });
                }
              }
            }
          } else if (stepType === 'function_call') {
            const toolCallId = step?.id ?? blockId;
            const toolName = step?.name ?? 'unknown';
            hasFunctionCall = true;

            const state: Extract<OpenBlockState, { kind: 'function_call' }> = {
              kind: 'function_call',
              id: blockId,
              toolCallId,
              toolName,
              argumentsAccum: '',
              ...(step?.signature != null
                ? { signature: step.signature }
                : {}),
            };
            openBlocks.set(index, state);
            controller.enqueue({
              type: 'tool-input-start',
              id: toolCallId,
              toolName,
            });
          } else if (
            stepType != null &&
            BUILTIN_TOOL_CALL_TYPES.has(stepType)
          ) {
            // MCP calls carry their own tool name; other built-ins derive it
            // from the step type.
            const toolName =
              stepType === 'mcp_server_tool_call'
                ? (step?.name ?? 'mcp_server_tool')
                : builtinToolNameFromCallType(stepType);
            const toolCallId = step?.id ?? blockId;

            const state: Extract<
              OpenBlockState,
              { kind: 'builtin_tool_call' }
            > = {
              kind: 'builtin_tool_call',
              id: blockId,
              blockType: stepType,
              toolCallId,
              toolName,
              arguments: step?.arguments ?? {},
              callEmitted: false,
            };
            openBlocks.set(index, state);
          } else if (
            stepType != null &&
            BUILTIN_TOOL_RESULT_TYPES.has(stepType)
          ) {
            const toolName =
              stepType === 'mcp_server_tool_result'
                ? (step?.name ?? 'mcp_server_tool')
                : builtinToolNameFromResultType(stepType);
            const callId = step?.call_id ?? blockId;

            const state: Extract<
              OpenBlockState,
              { kind: 'builtin_tool_result' }
            > = {
              kind: 'builtin_tool_result',
              id: blockId,
              blockType: stepType,
              callId,
              toolName,
              result: step?.result ?? null,
              ...(step?.is_error != null ? { isError: step.is_error } : {}),
              resultEmitted: false,
            };
            openBlocks.set(index, state);
          } else {
            // Unrecognized step type: track the slot so later events for the
            // index are matched, but emit nothing for it.
            openBlocks.set(index, { kind: 'unknown', id: blockId });
          }
          break;
        }

        case 'step.delta': {
          const event = value as Extract<
            GoogleInteractionsEvent,
            { event_type: 'step.delta' }
          >;
          let open = openBlocks.get(event.index);
          // Deltas for an index with no open step are ignored.
          if (open == null) break;

          const dtype = (event.delta as { type?: string } | undefined)?.type;

          /*
           * Promote a pending model_output block to `text` on the first
           * text-shaped delta. Image deltas are emitted inline below — a
           * model_output step can interleave text and image deltas, so the
           * text "open block" stays in place across image emissions instead
           * of being swapped for an image state.
           */
          if (open.kind === 'pending_model_output') {
            if (
              dtype === 'text' ||
              dtype === 'text_annotation' ||
              dtype === 'text_annotation_delta'
            ) {
              const promoted: Extract<OpenBlockState, { kind: 'text' }> = {
                kind: 'text',
                id: open.id,
                emittedSourceKeys: new Set<string>(),
              };
              openBlocks.set(event.index, promoted);
              open = promoted;
              controller.enqueue({ type: 'text-start', id: promoted.id });
            }
          }

          /*
           * Image deltas inside `model_output` carry the full payload in a
           * single chunk (no per-byte streaming). Emit the `file` part as
           * soon as the delta arrives so it surfaces regardless of whether
           * a text block is currently open at the same index.
           */
          if (
            dtype === 'image' &&
            (open.kind === 'pending_model_output' ||
              open.kind === 'text' ||
              open.kind === 'image')
          ) {
            const img = event.delta as
              | { data?: string; mime_type?: string; uri?: string }
              | undefined;

            const emitted = emitImageFile(controller, {
              data: img?.data,
              uri: img?.uri,
              // Fall back to the MIME type hinted on `step.start` before
              // defaulting to `image/png`.
              mimeType:
                img?.mime_type ??
                (open.kind === 'image' ? open.mimeType : undefined),
            });

            /*
             * When the delta produced a file part, clear the payload held on
             * an eagerly-promoted image state so `step.stop` does not emit a
             * duplicate. An empty delta leaves the `step.start` payload in
             * place so it is still emitted on `step.stop`.
             */
            if (open.kind === 'image' && emitted) {
              open.data = undefined;
              open.uri = undefined;
            }
            break;
          }

          const delta = event.delta as
            | {
                type?: string;
                text?: string;
                signature?: string;
                content?: { type?: string; text?: string };
                id?: string;
                /*
                 * `arguments` carries different shapes per delta kind:
                 *  - `type: 'arguments_delta'` → `string` (partial JSON)
                 *  - `type: '<builtin>_tool_call'` → `Record<string, unknown>`
                 * The branch handler reads it with the matching type.
                 */
                arguments?: Record<string, unknown> | string;
                annotations?: Array<GoogleInteractionsAnnotation>;
                call_id?: string;
                result?: unknown;
                is_error?: boolean;
                name?: string;
              }
            | undefined;

          if (open.kind === 'text' && delta?.type === 'text') {
            const text = delta.text ?? '';
            // Empty text deltas are suppressed.
            if (text.length > 0) {
              controller.enqueue({
                type: 'text-delta',
                id: open.id,
                delta: text,
              });
            }
          } else if (
            open.kind === 'text' &&
            (delta?.type === 'text_annotation' ||
              delta?.type === 'text_annotation_delta')
          ) {
            emitNewSources(
              controller,
              annotationsToSources({
                annotations: delta.annotations,
                generateId,
              }),
              open.emittedSourceKeys,
            );
          } else if (open.kind === 'reasoning') {
            if (delta?.type === 'thought_summary') {
              const item = delta.content;
              if (item?.type === 'text' && typeof item.text === 'string') {
                controller.enqueue({
                  type: 'reasoning-delta',
                  id: open.id,
                  delta: item.text,
                });
              }
            } else if (delta?.type === 'thought_signature') {
              const signature = delta.signature;
              if (signature != null) {
                open.signature = signature;
              }
            }
          } else if (
            open.kind === 'function_call' &&
            delta?.type === 'arguments_delta'
          ) {
            /*
             * Partial JSON arguments arrive as `arguments_delta` events.
             * The partial JSON string lives in `delta.arguments` (a string,
             * not the parsed object — the `arguments_delta` name applies to
             * the discriminator only). Append to the accumulator and surface
             * each chunk as a `tool-input-delta`; the accumulated string is
             * emitted as the tool-call input at `step.stop`.
             */
            const slice =
              typeof delta.arguments === 'string' ? delta.arguments : '';
            if (slice.length > 0) {
              open.argumentsAccum += slice;
              controller.enqueue({
                type: 'tool-input-delta',
                id: open.toolCallId,
                delta: slice,
              });
            }
            if (delta.id != null) {
              open.toolCallId = delta.id;
            }
            if (delta.signature != null) {
              open.signature = delta.signature;
            }
            hasFunctionCall = true;
          } else if (
            open.kind === 'builtin_tool_call' &&
            delta?.type === open.blockType
          ) {
            // Built-in call deltas overwrite (not append to) the held state.
            if (delta.id != null) open.toolCallId = delta.id;
            if (
              delta.arguments != null &&
              typeof delta.arguments === 'object'
            ) {
              open.arguments = delta.arguments;
            }
            if (
              delta.name != null &&
              open.blockType === 'mcp_server_tool_call'
            ) {
              open.toolName = delta.name;
            }
          } else if (
            open.kind === 'builtin_tool_result' &&
            delta?.type === open.blockType
          ) {
            if (delta.call_id != null) open.callId = delta.call_id;
            // `null` is an accepted result value; only `undefined` is skipped.
            if (delta.result !== undefined) open.result = delta.result;
            if (delta.is_error != null) open.isError = delta.is_error;
            if (
              delta.name != null &&
              open.blockType === 'mcp_server_tool_result'
            ) {
              open.toolName = delta.name;
            }
          }
          break;
        }

        case 'step.stop': {
          const event = value as Extract<
            GoogleInteractionsEvent,
            { event_type: 'step.stop' }
          >;
          const open = openBlocks.get(event.index);
          if (open == null) break;

          if (open.kind === 'text') {
            const providerMetadata = googleMetadata({ interactionId });
            controller.enqueue({
              type: 'text-end',
              id: open.id,
              ...(providerMetadata ? { providerMetadata } : {}),
            });
          } else if (open.kind === 'reasoning') {
            const providerMetadata = googleMetadata({
              signature: open.signature,
              interactionId,
            });
            controller.enqueue({
              type: 'reasoning-end',
              id: open.id,
              ...(providerMetadata ? { providerMetadata } : {}),
            });
          } else if (open.kind === 'image') {
            // Emits only if the `step.start` payload was not already
            // superseded by an image delta.
            emitImageFile(controller, {
              data: open.data,
              uri: open.uri,
              mimeType: open.mimeType,
            });
          } else if (open.kind === 'function_call') {
            // A call that streamed no arguments is reported with `{}` input.
            const accumulated =
              open.argumentsAccum.length > 0 ? open.argumentsAccum : '{}';
            controller.enqueue({
              type: 'tool-input-end',
              id: open.toolCallId,
            });
            const providerMetadata = googleMetadata({
              signature: open.signature,
              interactionId,
            });
            controller.enqueue({
              type: 'tool-call',
              toolCallId: open.toolCallId,
              toolName: open.toolName,
              input: accumulated,
              ...(providerMetadata ? { providerMetadata } : {}),
            });
          } else if (open.kind === 'builtin_tool_call' && !open.callEmitted) {
            controller.enqueue({
              type: 'tool-call',
              toolCallId: open.toolCallId,
              toolName: open.toolName,
              input: JSON.stringify(open.arguments ?? {}),
              providerExecuted: true,
            });
            open.callEmitted = true;
          } else if (
            open.kind === 'builtin_tool_result' &&
            !open.resultEmitted
          ) {
            controller.enqueue({
              type: 'tool-result',
              toolCallId: open.callId,
              toolName: open.toolName,
              result: (open.result ?? null) as NonNullable<JSONValue>,
              // Forward the error flag collected from `step.start` /
              // `step.delta` so failed tool runs are not reported as
              // successes.
              ...(open.isError != null ? { isError: open.isError } : {}),
            });
            open.resultEmitted = true;

            emitNewSources(
              controller,
              builtinToolResultToSources({
                block: {
                  type: open.blockType,
                  call_id: open.callId,
                  result: open.result,
                } as unknown as GoogleInteractionsBuiltinToolResultContent,
                generateId,
              }),
            );
          }

          openBlocks.delete(event.index);
          break;
        }

        case 'interaction.status_update':
        case 'interaction.in_progress':
        case 'interaction.requires_action': {
          const event = value as Extract<
            GoogleInteractionsEvent,
            {
              event_type:
                | 'interaction.status_update'
                | 'interaction.in_progress'
                | 'interaction.requires_action';
            }
          >;
          // Prefer the explicit status; otherwise derive it from the event
          // type.
          if (event.status != null) {
            finishStatus = event.status;
          } else if (eventType === 'interaction.requires_action') {
            finishStatus = 'requires_action';
          } else {
            finishStatus = 'in_progress';
          }
          break;
        }

        case 'interaction.completed': {
          const event = value as Extract<
            GoogleInteractionsEvent,
            { event_type: 'interaction.completed' }
          >;
          const interaction = event.interaction as {
            id?: string;
            status?: GoogleInteractionsStatus;
            usage?: GoogleInteractionsUsage;
            service_tier?: string;
          };

          if (interaction?.id != null && interaction.id.length > 0) {
            interactionId = interaction.id;
          }
          if (interaction?.status != null) {
            finishStatus = interaction.status;
          }
          if (interaction?.usage != null) {
            usage = interaction.usage;
          }
          /*
           * The Interactions API surfaces the applied service tier on
           * `interaction.completed.interaction.service_tier` (NOT on the
           * `x-gemini-service-tier` HTTP header that `:generateContent`
           * uses). Body wins over header fallback.
           */
          if (interaction?.service_tier != null) {
            serviceTier = interaction.service_tier;
          }
          break;
        }

        case 'error': {
          const event = value as Extract<
            GoogleInteractionsEvent,
            { event_type: 'error' }
          >;
          finishStatus = 'failed';
          const errorPayload = event.error ?? {
            message: 'Unknown interaction error',
          };
          controller.enqueue({ type: 'error', error: errorPayload });
          break;
        }

        default:
          // Event types not listed above are ignored.
          break;
      }
    },

    flush(controller) {
      const finishReason: LanguageModelV3FinishReason = {
        unified: mapGoogleInteractionsFinishReason({
          status: finishStatus,
          hasFunctionCall,
        }),
        raw: finishStatus,
      };

      const providerMetadata: SharedV3ProviderMetadata = {
        google: {
          ...(interactionId != null ? { interactionId } : {}),
          ...(serviceTier != null ? { serviceTier } : {}),
        },
      };

      controller.enqueue({
        type: 'finish',
        finishReason,
        usage: convertGoogleInteractionsUsage(usage),
        providerMetadata,
      });
    },
  });
}