@volley/recognition-client-sdk

Recognition Service TypeScript/Node.js Client SDK

1,402 lines (1,385 loc) 84.4 kB
import { z } from 'zod';
/**
 * Provider types and enums for recognition services
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
 */
/**
 * Supported speech recognition providers
 */
declare enum RecognitionProvider {
    ASSEMBLYAI = "assemblyai",
    DEEPGRAM = "deepgram",
    ELEVENLABS = "elevenlabs",
    FIREWORKS = "fireworks",
    GOOGLE = "google",
    GEMINI_BATCH = "gemini-batch",
    OPENAI_BATCH = "openai-batch",
    OPENAI_REALTIME = "openai-realtime"
}
/**
 * ASR API type - distinguishes between streaming and file-based transcription APIs
 * - STREAMING: Real-time streaming APIs (Deepgram, AssemblyAI, Google)
 * - FILE_BASED: File upload/batch APIs (OpenAI Batch, Gemini Batch)
 */
declare enum ASRApiType {
    STREAMING = "streaming",
    FILE_BASED = "file-based"
}
/**
 * Deepgram model names
 */
declare enum DeepgramModel {
    NOVA_2 = "nova-2",
    NOVA_3 = "nova-3",
    FLUX_GENERAL_EN = "flux-general-en"
}
/**
 * Google Cloud Speech models
 * @see https://cloud.google.com/speech-to-text/docs/transcription-model
 * @see https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model
 */
declare enum GoogleModel {
    CHIRP_3 = "chirp_3",
    CHIRP_2 = "chirp_2",
    CHIRP = "chirp",
    LATEST_LONG = "latest_long",
    LATEST_SHORT = "latest_short",
    TELEPHONY = "telephony",
    TELEPHONY_SHORT = "telephony_short",
    DEFAULT = "default",
    COMMAND_AND_SEARCH = "command_and_search",
    PHONE_CALL = "phone_call",
    VIDEO = "video"
}
/**
 * Fireworks AI models for ASR
 * @see https://docs.fireworks.ai/guides/querying-asr-models
 * @see https://fireworks.ai/models/fireworks/fireworks-asr-large
 */
declare enum FireworksModel {
    ASR_V1 = "fireworks-asr-large",
    ASR_V2 = "fireworks-asr-v2",
    WHISPER_V3 = "whisper-v3",
    WHISPER_V3_TURBO = "whisper-v3-turbo"
}
/**
 * ElevenLabs Scribe models for speech-to-text
 * @see https://elevenlabs.io/blog/introducing-scribe-v2-realtime
 * @see https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming
 * @see https://elevenlabs.io/docs/api-reference/speech-to-text/convert
 */
declare enum ElevenLabsModel {
    SCRIBE_V2_REALTIME = "scribe_v2_realtime",
    SCRIBE_V1 = "scribe_v1"
}
/**
 * OpenAI Realtime API transcription models
 * These are the verified `input_audio_transcription.model` values.
 * @see https://platform.openai.com/docs/guides/realtime
 */
declare enum OpenAIRealtimeModel {
    GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
}
/**
 * Type alias for any model from any provider
 */
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
/**
 * Audio encoding types
 */
declare enum AudioEncoding {
    ENCODING_UNSPECIFIED = 0,
    LINEAR16 = 1,
    OGG_OPUS = 2,
    FLAC = 3,
    MULAW = 4,
    ALAW = 5
}
declare namespace AudioEncoding {
    /**
     * Convert numeric ID to AudioEncoding enum
     * @param id - Numeric encoding identifier (0-5)
     * @returns AudioEncoding enum value or undefined if invalid
     */
    function fromId(id: number): AudioEncoding | undefined;
    /**
     * Convert string name to AudioEncoding enum
     * @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
     * @returns AudioEncoding enum value or undefined if invalid
     */
    function fromName(nameStr: string): AudioEncoding | undefined;
    /**
     * Convert AudioEncoding enum to numeric ID
     * @param encoding - AudioEncoding enum value
     * @returns Numeric ID (0-5)
     */
    function toId(encoding: AudioEncoding): number;
    /**
     * Convert AudioEncoding enum to string name
     * @param encoding - AudioEncoding enum value
     * @returns String name like "LINEAR16", "MULAW", etc.
     */
    function toName(encoding: AudioEncoding): string;
    /**
     * Check if a numeric ID is a valid encoding
     * @param id - Numeric identifier to validate
     * @returns true if valid encoding ID
     */
    function isIdValid(id: number): boolean;
    /**
     * Check if a string name is a valid encoding
     * @param nameStr - String name to validate
     * @returns true if valid encoding name
     */
    function isNameValid(nameStr: string): boolean;
}
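/**
 * Aside: a minimal usage sketch for the AudioEncoding helpers above, assuming the
 * package's runtime exports match these declarations. Return values follow the
 * JSDoc contracts (case-insensitive names, undefined/false on invalid input).
 *
 * ```typescript
 * import { AudioEncoding } from '@volley/recognition-client-sdk';
 *
 * const enc = AudioEncoding.fromName('ogg_opus'); // AudioEncoding.OGG_OPUS
 * AudioEncoding.toId(AudioEncoding.MULAW);        // 4
 * AudioEncoding.toName(AudioEncoding.LINEAR16);   // "LINEAR16"
 * AudioEncoding.fromId(99);                       // undefined (invalid ID)
 * AudioEncoding.isNameValid('mp3');               // false (not a supported encoding)
 * ```
 */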
/**
 * Common sample rates (in Hz)
 */
declare enum SampleRate {
    RATE_8000 = 8000,
    RATE_16000 = 16000,
    RATE_22050 = 22050,
    RATE_24000 = 24000,
    RATE_32000 = 32000,
    RATE_44100 = 44100,
    RATE_48000 = 48000
}
declare namespace SampleRate {
    /**
     * Convert Hz value to SampleRate enum
     * @param hz - Sample rate in Hz (8000, 16000, etc.)
     * @returns SampleRate enum value or undefined if invalid
     */
    function fromHz(hz: number): SampleRate | undefined;
    /**
     * Convert string name to SampleRate enum
     * @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
     * @returns SampleRate enum value or undefined if invalid
     */
    function fromName(nameStr: string): SampleRate | undefined;
    /**
     * Convert SampleRate enum to Hz value
     * @param rate - SampleRate enum value
     * @returns Hz value (8000, 16000, etc.)
     */
    function toHz(rate: SampleRate): number;
    /**
     * Convert SampleRate enum to string name
     * @param rate - SampleRate enum value
     * @returns String name like "RATE_8000", "RATE_16000", etc.
     */
    function toName(rate: SampleRate): string;
    /**
     * Check if a numeric Hz value is a valid sample rate
     * @param hz - Hz value to validate
     * @returns true if valid sample rate
     */
    function isHzValid(hz: number): boolean;
    /**
     * Check if a string name is a valid sample rate
     * @param nameStr - String name to validate
     * @returns true if valid sample rate name
     */
    function isNameValid(nameStr: string): boolean;
}
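/**
 * Aside: the SampleRate helpers mirror the AudioEncoding ones; a brief sketch under
 * the same export assumption.
 *
 * ```typescript
 * import { SampleRate } from '@volley/recognition-client-sdk';
 *
 * const rate = SampleRate.fromHz(16000);    // SampleRate.RATE_16000
 * SampleRate.toName(SampleRate.RATE_44100); // "RATE_44100"
 * SampleRate.isHzValid(11025);              // false - 11025 is not in the enum
 * ```
 */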
/**
 * Supported languages for recognition
 * Using BCP-47 language tags
 */
declare enum Language {
    ENGLISH_US = "en-US",
    ENGLISH_GB = "en-GB",
    SPANISH_ES = "es-ES",
    SPANISH_MX = "es-MX",
    FRENCH_FR = "fr-FR",
    GERMAN_DE = "de-DE",
    ITALIAN_IT = "it-IT",
    PORTUGUESE_BR = "pt-BR",
    JAPANESE_JP = "ja-JP",
    KOREAN_KR = "ko-KR",
    CHINESE_CN = "zh-CN",
    CHINESE_TW = "zh-TW"
}
/**
 * Recognition Result Types V1
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
 * Types and schemas for recognition results sent to SDK clients
 */
/**
 * Message type discriminator for recognition results V1
 */
declare enum RecognitionResultTypeV1 {
    TRANSCRIPTION = "Transcription",
    FUNCTION_CALL = "FunctionCall",
    METADATA = "Metadata",
    ERROR = "Error",
    CLIENT_CONTROL_MESSAGE = "ClientControlMessage"
}
/**
 * Transcription result V1 - contains the transcript message
 * In the long run the game side should not need to know about it; in the short run it is sent back to the client.
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
 */
declare const TranscriptionResultSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
    audioUtteranceId: z.ZodString;
    finalTranscript: z.ZodString;
    finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
    pendingTranscript: z.ZodOptional<z.ZodString>;
    pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
    is_finished: z.ZodBoolean;
    voiceStart: z.ZodOptional<z.ZodNumber>;
    voiceDuration: z.ZodOptional<z.ZodNumber>;
    voiceEnd: z.ZodOptional<z.ZodNumber>;
    startTimestamp: z.ZodOptional<z.ZodNumber>;
    endTimestamp: z.ZodOptional<z.ZodNumber>;
    receivedAtMs: z.ZodOptional<z.ZodNumber>;
    accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
}, "strip", z.ZodTypeAny, {
    type: RecognitionResultTypeV1.TRANSCRIPTION;
    audioUtteranceId: string;
    finalTranscript: string;
    is_finished: boolean;
    finalTranscriptConfidence?: number | undefined;
    pendingTranscript?: string | undefined;
    pendingTranscriptConfidence?: number | undefined;
    voiceStart?: number | undefined;
    voiceDuration?: number | undefined;
    voiceEnd?: number | undefined;
    startTimestamp?: number | undefined;
    endTimestamp?: number | undefined;
    receivedAtMs?: number | undefined;
    accumulatedAudioTimeMs?: number | undefined;
}, {
    type: RecognitionResultTypeV1.TRANSCRIPTION;
    audioUtteranceId: string;
    finalTranscript: string;
    is_finished: boolean;
    finalTranscriptConfidence?: number | undefined;
    pendingTranscript?: string | undefined;
    pendingTranscriptConfidence?: number | undefined;
    voiceStart?: number | undefined;
    voiceDuration?: number | undefined;
    voiceEnd?: number | undefined;
    startTimestamp?: number | undefined;
    endTimestamp?: number | undefined;
    receivedAtMs?: number | undefined;
    accumulatedAudioTimeMs?: number | undefined;
}>;
type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
/**
 * Function call result V1 - similar to an LLM function call
 * In the long run the game server should know about it, rather than the TV or client.
 */
declare const FunctionCallResultSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionResultTypeV1.FUNCTION_CALL>;
    audioUtteranceId: z.ZodString;
    functionName: z.ZodString;
    functionArgJson: z.ZodString;
}, "strip", z.ZodTypeAny, {
    type: RecognitionResultTypeV1.FUNCTION_CALL;
    audioUtteranceId: string;
    functionName: string;
    functionArgJson: string;
}, {
    type: RecognitionResultTypeV1.FUNCTION_CALL;
    audioUtteranceId: string;
    functionName: string;
    functionArgJson: string;
}>;
type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
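/**
 * Aside: results arrive as untyped JSON over the wire, so the Zod schemas above can
 * validate a payload before fields are read. A sketch, assuming
 * TranscriptionResultSchemaV1 is exported as declared (the handler shown is
 * illustrative consumer code, not part of this file):
 *
 * ```typescript
 * import { TranscriptionResultSchemaV1 } from '@volley/recognition-client-sdk';
 *
 * function handleRawResult(raw: unknown): void {
 *   const parsed = TranscriptionResultSchemaV1.safeParse(raw);
 *   if (!parsed.success) return; // not a transcription message (or malformed)
 *
 *   const result = parsed.data;
 *   if (result.is_finished) {
 *     console.log('final:', result.finalTranscript, result.finalTranscriptConfidence);
 *   } else if (result.pendingTranscript !== undefined) {
 *     console.log('interim:', result.pendingTranscript); // only when interimResults is on
 *   }
 * }
 * ```
 */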
/**
 * Metadata result V1 - contains metadata, timing information, and ASR config
 * Sent when the provider connection closes to provide final timing metrics and config
 * In the long run the game server should know about it, rather than the TV or client.
 */
declare const MetadataResultSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
    audioUtteranceId: z.ZodString;
    recordingStartMs: z.ZodOptional<z.ZodNumber>;
    recordingEndMs: z.ZodOptional<z.ZodNumber>;
    transcriptEndMs: z.ZodOptional<z.ZodNumber>;
    socketCloseAtMs: z.ZodOptional<z.ZodNumber>;
    duration: z.ZodOptional<z.ZodNumber>;
    volume: z.ZodOptional<z.ZodNumber>;
    accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
    costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
    apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
    asrConfig: z.ZodOptional<z.ZodString>;
    rawAsrMetadata: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    type: RecognitionResultTypeV1.METADATA;
    audioUtteranceId: string;
    recordingStartMs?: number | undefined;
    recordingEndMs?: number | undefined;
    transcriptEndMs?: number | undefined;
    socketCloseAtMs?: number | undefined;
    duration?: number | undefined;
    volume?: number | undefined;
    accumulatedAudioTimeMs?: number | undefined;
    costInUSD?: number | undefined;
    apiType?: ASRApiType | undefined;
    asrConfig?: string | undefined;
    rawAsrMetadata?: string | undefined;
}, {
    type: RecognitionResultTypeV1.METADATA;
    audioUtteranceId: string;
    recordingStartMs?: number | undefined;
    recordingEndMs?: number | undefined;
    transcriptEndMs?: number | undefined;
    socketCloseAtMs?: number | undefined;
    duration?: number | undefined;
    volume?: number | undefined;
    accumulatedAudioTimeMs?: number | undefined;
    costInUSD?: number | undefined;
    apiType?: ASRApiType | undefined;
    asrConfig?: string | undefined;
    rawAsrMetadata?: string | undefined;
}>;
type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
/**
 * Error type enum V1 - categorizes different types of errors
 */
declare enum ErrorTypeV1 {
    AUTHENTICATION_ERROR = "authentication_error",
    VALIDATION_ERROR = "validation_error",
    PROVIDER_ERROR = "provider_error",
    TIMEOUT_ERROR = "timeout_error",
    QUOTA_EXCEEDED = "quota_exceeded",
    CONNECTION_ERROR = "connection_error",
    UNKNOWN_ERROR = "unknown_error"
}
/**
 * Error result V1 - contains the error message
 * In the long run the game server should know about it, rather than the TV or client.
 */
declare const ErrorResultSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionResultTypeV1.ERROR>;
    audioUtteranceId: z.ZodString;
    errorType: z.ZodOptional<z.ZodNativeEnum<typeof ErrorTypeV1>>;
    message: z.ZodOptional<z.ZodString>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    description: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    type: RecognitionResultTypeV1.ERROR;
    audioUtteranceId: string;
    errorType?: ErrorTypeV1 | undefined;
    message?: string | undefined;
    code?: string | number | undefined;
    description?: string | undefined;
}, {
    type: RecognitionResultTypeV1.ERROR;
    audioUtteranceId: string;
    errorType?: ErrorTypeV1 | undefined;
    message?: string | undefined;
    code?: string | number | undefined;
    description?: string | undefined;
}>;
type ErrorResultV1 = z.infer<typeof ErrorResultSchemaV1>;
/**
 * Client control actions enum V1
 * Actions that can be sent from server to client to control the recognition stream
 * In the long run the audio client (mic) should know about it, rather than the servers.
 */
declare enum ClientControlActionV1 {
    READY_FOR_UPLOADING_RECORDING = "ready_for_uploading_recording",
    STOP_RECORDING = "stop_recording"
}
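/**
 * Aside: all V1 results share a `type` discriminator, so a receive loop can branch
 * on RecognitionResultTypeV1. A sketch under the same export assumptions; the
 * union type is assembled locally here since this file does not declare one:
 *
 * ```typescript
 * import {
 *   RecognitionResultTypeV1,
 *   type TranscriptionResultV1,
 *   type MetadataResultV1,
 *   type ErrorResultV1,
 * } from '@volley/recognition-client-sdk';
 *
 * type KnownResultV1 = TranscriptionResultV1 | MetadataResultV1 | ErrorResultV1;
 *
 * function dispatch(result: KnownResultV1): void {
 *   switch (result.type) {
 *     case RecognitionResultTypeV1.TRANSCRIPTION:
 *       console.log(result.finalTranscript);
 *       break;
 *     case RecognitionResultTypeV1.METADATA:
 *       // e.g. a rough transcript latency, when both timestamps are present
 *       if (result.recordingEndMs !== undefined && result.transcriptEndMs !== undefined) {
 *         console.log('latency ms:', result.transcriptEndMs - result.recordingEndMs);
 *       }
 *       break;
 *     case RecognitionResultTypeV1.ERROR:
 *       console.error(result.errorType, result.message);
 *       break;
 *   }
 * }
 * ```
 */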
/**
 * Error Exception Types
 *
 * Defines structured exception types for each ErrorTypeV1 category.
 * Each exception type has metadata about whether it's immediately available
 * (can be shown to user right away vs needs investigation/retry).
 */
/**
 * Authentication/Authorization Error
 * isImmediatelyAvailable: false
 * These are system configuration issues, not user-facing
 */
declare const AuthenticationExceptionSchema: z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.AUTHENTICATION_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<false>;
    service: z.ZodOptional<z.ZodString>;
    authMethod: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.AUTHENTICATION_ERROR;
    isImmediatelyAvailable: false;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    service?: string | undefined;
    authMethod?: string | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.AUTHENTICATION_ERROR;
    isImmediatelyAvailable: false;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    service?: string | undefined;
    authMethod?: string | undefined;
}>;
type AuthenticationException = z.infer<typeof AuthenticationExceptionSchema>;
/**
 * Validation Error
 * isImmediatelyAvailable: true
 * User provided invalid input - can show them what's wrong
 */
declare const ValidationExceptionSchema: z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.VALIDATION_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<true>;
    field: z.ZodOptional<z.ZodString>;
    expected: z.ZodOptional<z.ZodString>;
    received: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.VALIDATION_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    field?: string | undefined;
    expected?: string | undefined;
    received?: string | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.VALIDATION_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    field?: string | undefined;
    expected?: string | undefined;
    received?: string | undefined;
}>;
type ValidationException = z.infer<typeof ValidationExceptionSchema>;
/**
 * Provider Error
 * isImmediatelyAvailable: false
 * Error from ASR provider - usually transient or needs investigation
 */
declare const ProviderExceptionSchema: z.ZodObject<{
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.PROVIDER_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<false>;
    provider: z.ZodOptional<z.ZodString>;
    providerErrorCode: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    isTransient: z.ZodOptional<z.ZodBoolean>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.PROVIDER_ERROR;
    isImmediatelyAvailable: false;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    provider?: string | undefined;
    providerErrorCode?: string | number | undefined;
    isTransient?: boolean | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.PROVIDER_ERROR;
    isImmediatelyAvailable: false;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    provider?: string | undefined;
    providerErrorCode?: string | number | undefined;
    isTransient?: boolean | undefined;
}>;
type ProviderException = z.infer<typeof ProviderExceptionSchema>;
/**
 * Timeout Error
 * isImmediatelyAvailable: true
 * Request took too long - user should try again
 */
declare const TimeoutExceptionSchema: z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.TIMEOUT_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<true>;
    timeoutMs: z.ZodOptional<z.ZodNumber>;
    operation: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.TIMEOUT_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    timeoutMs?: number | undefined;
    operation?: string | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.TIMEOUT_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    timeoutMs?: number | undefined;
    operation?: string | undefined;
}>;
type TimeoutException = z.infer<typeof TimeoutExceptionSchema>;
/**
 * Quota Exceeded Error
 * isImmediatelyAvailable: true
 * Rate limit or quota exceeded - user should wait
 */
declare const QuotaExceededExceptionSchema: z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.QUOTA_EXCEEDED>;
    isImmediatelyAvailable: z.ZodLiteral<true>;
    quotaType: z.ZodOptional<z.ZodString>;
    resetAt: z.ZodOptional<z.ZodNumber>;
    retryAfterSeconds: z.ZodOptional<z.ZodNumber>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.QUOTA_EXCEEDED;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    quotaType?: string | undefined;
    resetAt?: number | undefined;
    retryAfterSeconds?: number | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.QUOTA_EXCEEDED;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    quotaType?: string | undefined;
    resetAt?: number | undefined;
    retryAfterSeconds?: number | undefined;
}>;
type QuotaExceededException = z.infer<typeof QuotaExceededExceptionSchema>;
/**
 * Connection Error
 * isImmediatelyAvailable: true
 * Connection establishment or network failure - user should check network or retry
 */
declare const ConnectionExceptionSchema: z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.CONNECTION_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<true>;
    attempts: z.ZodOptional<z.ZodNumber>;
    url: z.ZodOptional<z.ZodString>;
    underlyingError: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.CONNECTION_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    attempts?: number | undefined;
    url?: string | undefined;
    underlyingError?: string | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.CONNECTION_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    attempts?: number | undefined;
    url?: string | undefined;
    underlyingError?: string | undefined;
}>;
type ConnectionException = z.infer<typeof ConnectionExceptionSchema>;
/**
 * Unknown Error
 * isImmediatelyAvailable: false
 * Unexpected error - needs investigation
 */
declare const UnknownExceptionSchema: z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.UNKNOWN_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<false>;
    stack: z.ZodOptional<z.ZodString>;
    context: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.UNKNOWN_ERROR;
    isImmediatelyAvailable: false;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    stack?: string | undefined;
    context?: Record<string, unknown> | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.UNKNOWN_ERROR;
    isImmediatelyAvailable: false;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    stack?: string | undefined;
    context?: Record<string, unknown> | undefined;
}>;
type UnknownException = z.infer<typeof UnknownExceptionSchema>;
/**
 * Discriminated union of all exception types
 * Use this for type-safe error handling
 */
declare const RecognitionExceptionSchema: z.ZodDiscriminatedUnion<"errorType", [z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.AUTHENTICATION_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<false>;
    service: z.ZodOptional<z.ZodString>;
    authMethod: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.AUTHENTICATION_ERROR;
    isImmediatelyAvailable: false;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    service?: string | undefined;
    authMethod?: string | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.AUTHENTICATION_ERROR;
    isImmediatelyAvailable: false;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    service?: string | undefined;
    authMethod?: string | undefined;
}>, z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.VALIDATION_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<true>;
    field: z.ZodOptional<z.ZodString>;
    expected: z.ZodOptional<z.ZodString>;
    received: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.VALIDATION_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    field?: string | undefined;
    expected?: string | undefined;
    received?: string | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.VALIDATION_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    field?: string | undefined;
    expected?: string | undefined;
    received?: string | undefined;
}>,
z.ZodObject<{
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.PROVIDER_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<false>;
    provider: z.ZodOptional<z.ZodString>;
    providerErrorCode: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    isTransient: z.ZodOptional<z.ZodBoolean>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.PROVIDER_ERROR;
    isImmediatelyAvailable: false;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    provider?: string | undefined;
    providerErrorCode?: string | number | undefined;
    isTransient?: boolean | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.PROVIDER_ERROR;
    isImmediatelyAvailable: false;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    provider?: string | undefined;
    providerErrorCode?: string | number | undefined;
    isTransient?: boolean | undefined;
}>, z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.TIMEOUT_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<true>;
    timeoutMs: z.ZodOptional<z.ZodNumber>;
    operation: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.TIMEOUT_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    timeoutMs?: number | undefined;
    operation?: string | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.TIMEOUT_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    timeoutMs?: number | undefined;
    operation?: string | undefined;
}>, z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.QUOTA_EXCEEDED>;
    isImmediatelyAvailable: z.ZodLiteral<true>;
    quotaType: z.ZodOptional<z.ZodString>;
    resetAt: z.ZodOptional<z.ZodNumber>;
    retryAfterSeconds: z.ZodOptional<z.ZodNumber>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.QUOTA_EXCEEDED;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    quotaType?: string | undefined;
    resetAt?: number | undefined;
    retryAfterSeconds?: number | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.QUOTA_EXCEEDED;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    quotaType?: string | undefined;
    resetAt?: number | undefined;
    retryAfterSeconds?: number | undefined;
}>,
z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.CONNECTION_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<true>;
    attempts: z.ZodOptional<z.ZodNumber>;
    url: z.ZodOptional<z.ZodString>;
    underlyingError: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.CONNECTION_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    attempts?: number | undefined;
    url?: string | undefined;
    underlyingError?: string | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.CONNECTION_ERROR;
    isImmediatelyAvailable: true;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    attempts?: number | undefined;
    url?: string | undefined;
    underlyingError?: string | undefined;
}>, z.ZodObject<{
    provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    message: z.ZodString;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    description: z.ZodOptional<z.ZodString>;
    timestamp: z.ZodOptional<z.ZodNumber>;
    errorType: z.ZodLiteral<ErrorTypeV1.UNKNOWN_ERROR>;
    isImmediatelyAvailable: z.ZodLiteral<false>;
    stack: z.ZodOptional<z.ZodString>;
    context: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodUnknown>>;
}, "strip", z.ZodTypeAny, {
    message: string;
    errorType: ErrorTypeV1.UNKNOWN_ERROR;
    isImmediatelyAvailable: false;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    stack?: string | undefined;
    context?: Record<string, unknown> | undefined;
}, {
    message: string;
    errorType: ErrorTypeV1.UNKNOWN_ERROR;
    isImmediatelyAvailable: false;
    provider?: RecognitionProvider | undefined;
    code?: string | number | undefined;
    audioUtteranceId?: string | undefined;
    description?: string | undefined;
    timestamp?: number | undefined;
    stack?: string | undefined;
    context?: Record<string, unknown> | undefined;
}>]>;
type RecognitionException = z.infer<typeof RecognitionExceptionSchema>;
/**
 * Check if an exception should be shown to the user immediately
 */
declare function isExceptionImmediatelyAvailable(exception: RecognitionException): boolean;
/**
 * Get user-friendly error message for exceptions
 */
declare function getUserFriendlyMessage(exception: RecognitionException): string;
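/**
 * Aside: the discriminated union plus the two helpers support type-safe error
 * handling. A sketch under the same export assumptions; the narrowing on
 * `errorType` relies only on fields declared in the schemas above, and the two
 * trailing `declare function`s are hypothetical application hooks:
 *
 * ```typescript
 * import {
 *   ErrorTypeV1,
 *   getUserFriendlyMessage,
 *   isExceptionImmediatelyAvailable,
 *   type RecognitionException,
 * } from '@volley/recognition-client-sdk';
 *
 * function reportException(ex: RecognitionException): void {
 *   if (isExceptionImmediatelyAvailable(ex)) {
 *     // validation / timeout / quota / connection: safe to surface right away
 *     showToUser(getUserFriendlyMessage(ex));
 *     if (ex.errorType === ErrorTypeV1.QUOTA_EXCEEDED && ex.retryAfterSeconds) {
 *       scheduleRetry(ex.retryAfterSeconds * 1000);
 *     }
 *   } else {
 *     // authentication / provider / unknown: log for investigation instead
 *     console.error(ex.errorType, ex.message, ex.audioUtteranceId);
 *   }
 * }
 *
 * declare function showToUser(msg: string): void;        // hypothetical UI hook
 * declare function scheduleRetry(delayMs: number): void; // hypothetical retry hook
 * ```
 */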
/**
 * Recognition Context Types V1
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
 * Types and schemas for recognition context data
 */
/**
 * Message type discriminator for recognition context V1
 */
declare enum RecognitionContextTypeV1 {
    GAME_CONTEXT = "GameContext",
    CONTROL_SIGNAL = "ControlSignal",
    ASR_REQUEST = "ASRRequest"
}
/**
 * Control signal types for recognition V1
 */
declare enum ControlSignalTypeV1 {
    START_RECORDING = "start_recording",
    STOP_RECORDING = "stop_recording"
}
/**
 * SlotMap - A strongly typed map from slot names to lists of values
 * Used for entity extraction and slot filling in voice interactions
 */
declare const SlotMapSchema: z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>;
type SlotMap = z.infer<typeof SlotMapSchema>;
/**
 * Game context V1 - contains game state information
 */
declare const GameContextSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionContextTypeV1.GAME_CONTEXT>;
    gameId: z.ZodString;
    gamePhase: z.ZodString;
    promptSTT: z.ZodOptional<z.ZodString>;
    promptSTF: z.ZodOptional<z.ZodString>;
    promptTTF: z.ZodOptional<z.ZodString>;
    slotMap: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>>;
}, "strip", z.ZodTypeAny, {
    type: RecognitionContextTypeV1.GAME_CONTEXT;
    gameId: string;
    gamePhase: string;
    promptSTT?: string | undefined;
    promptSTF?: string | undefined;
    promptTTF?: string | undefined;
    slotMap?: Record<string, string[]> | undefined;
}, {
    type: RecognitionContextTypeV1.GAME_CONTEXT;
    gameId: string;
    gamePhase: string;
    promptSTT?: string | undefined;
    promptSTF?: string | undefined;
    promptTTF?: string | undefined;
    slotMap?: Record<string, string[]> | undefined;
}>;
type GameContextV1 = z.infer<typeof GameContextSchemaV1>;
/**
 * ASR Request V1 - contains complete ASR setup information
 * Sent once at connection start to configure the session
 */
declare const ASRRequestSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionContextTypeV1.ASR_REQUEST>;
    audioUtteranceId: z.ZodOptional<z.ZodString>;
    provider: z.ZodString;
    model: z.ZodOptional<z.ZodString>;
    language: z.ZodString;
    sampleRate: z.ZodNumber;
    encoding: z.ZodNumber;
    interimResults: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
    useContext: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
    finalTranscriptStability: z.ZodOptional<z.ZodString>;
    debugCommand: z.ZodOptional<z.ZodObject<{
        enableDebugLog: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
        enableAudioStorage: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
        enableSongQuizSessionIdCheck: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
        enablePilotModels: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
    }, "strip", z.ZodTypeAny, {
        enableDebugLog: boolean;
        enableAudioStorage: boolean;
        enableSongQuizSessionIdCheck: boolean;
        enablePilotModels: boolean;
    }, {
        enableDebugLog?: boolean | undefined;
        enableAudioStorage?: boolean | undefined;
        enableSongQuizSessionIdCheck?: boolean | undefined;
        enablePilotModels?: boolean | undefined;
    }>>;
}, "strip", z.ZodTypeAny, {
    provider: string;
    language: string;
    sampleRate: number;
    encoding: number;
    interimResults: boolean;
    useContext: boolean;
    type: RecognitionContextTypeV1.ASR_REQUEST;
    audioUtteranceId?: string | undefined;
    model?: string | undefined;
    finalTranscriptStability?: string | undefined;
    debugCommand?: {
        enableDebugLog: boolean;
        enableAudioStorage: boolean;
        enableSongQuizSessionIdCheck: boolean;
        enablePilotModels: boolean;
    } | undefined;
}, {
    provider: string;
    language: string;
    sampleRate: number;
    encoding: number;
    type: RecognitionContextTypeV1.ASR_REQUEST;
    audioUtteranceId?: string | undefined;
    model?: string | undefined;
    interimResults?: boolean | undefined;
    useContext?: boolean | undefined;
    finalTranscriptStability?: string | undefined;
    debugCommand?: {
        enableDebugLog?: boolean | undefined;
        enableAudioStorage?: boolean | undefined;
        enableSongQuizSessionIdCheck?: boolean | undefined;
        enablePilotModels?: boolean | undefined;
    } | undefined;
}>;
type ASRRequestV1 = z.infer<typeof ASRRequestSchemaV1>;
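/**
 * Aside: ASRRequestV1 and GameContextV1 are the context messages a client sends at
 * session start. A sketch of building both; the numeric sampleRate/encoding values
 * follow the SampleRate and AudioEncoding enums earlier in this file, and the game
 * values shown are hypothetical:
 *
 * ```typescript
 * import {
 *   RecognitionContextTypeV1,
 *   type ASRRequestV1,
 *   type GameContextV1,
 * } from '@volley/recognition-client-sdk';
 *
 * const asrRequest: ASRRequestV1 = {
 *   type: RecognitionContextTypeV1.ASR_REQUEST,
 *   provider: 'deepgram',
 *   language: 'en-US',
 *   sampleRate: 16000,
 *   encoding: 1, // AudioEncoding.LINEAR16
 *   interimResults: true,
 *   useContext: true,
 * };
 *
 * const gameContext: GameContextV1 = {
 *   type: RecognitionContextTypeV1.GAME_CONTEXT,
 *   gameId: 'song-quiz',   // hypothetical values
 *   gamePhase: 'guessing',
 *   slotMap: { songTitle: ['Bohemian Rhapsody', 'Yesterday'] },
 * };
 * ```
 */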
/**
 * Unified ASR Request Configuration
 *
 * Provider-agnostic configuration for ASR (Automatic Speech Recognition) requests.
 * This interface provides a consistent API for clients regardless of the underlying provider.
 *
 * All fields use library-defined enums for type safety and consistency.
 * Provider-specific mappers will convert these to provider-native formats.
 */
/**
 * Final transcript stability modes
 *
 * Controls timeout duration for fallback final transcript after stopRecording().
 * Similar to AssemblyAI's turn detection confidence modes but applied to our
 * internal timeout mechanism when vendors don't respond with is_final=true.
 *
 * @see https://www.assemblyai.com/docs/speech-to-text/universal-streaming/turn-detection
 */
declare enum FinalTranscriptStability {
    /**
     * Aggressive mode: 100ms timeout
     * Fast response, optimized for short utterances and quick back-and-forth
     * Use cases: IVR, quick commands, retail confirmations
     */
    AGGRESSIVE = "aggressive",
    /**
     * Balanced mode: 200ms timeout (default)
     * Natural middle ground for most conversational scenarios
     * Use cases: General customer support, tech support, typical voice interactions
     */
    BALANCED = "balanced",
    /**
     * Conservative mode: 400ms timeout
     * Wait longer for providers, optimized for complex/reflective speech
     * Use cases: Healthcare, complex queries, careful thought processes
     */
    CONSERVATIVE = "conservative",
    /**
     * Experimental mode: 10000ms (10 seconds) timeout
     * Very long wait for batch/async providers that need significant processing time
     * Use cases: Batch processing (Gemini, OpenAI Whisper), complex audio analysis
     * Note: Should be cancelled immediately when transcript is received
     */
    EXPERIMENTAL = "experimental"
}
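/**
 * Aside: the documented timeouts map each stability mode to a wait in milliseconds.
 * A small lookup sketch of that mapping; the values come straight from the enum
 * JSDoc above, but the table itself is illustrative, not an exported constant:
 *
 * ```typescript
 * import { FinalTranscriptStability } from '@volley/recognition-client-sdk';
 *
 * const FALLBACK_TIMEOUT_MS: Record<FinalTranscriptStability, number> = {
 *   [FinalTranscriptStability.AGGRESSIVE]: 100,
 *   [FinalTranscriptStability.BALANCED]: 200,        // default
 *   [FinalTranscriptStability.CONSERVATIVE]: 400,
 *   [FinalTranscriptStability.EXPERIMENTAL]: 10_000, // batch/async providers
 * };
 * ```
 */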
/**
 * Unified ASR request configuration
 *
 * This configuration is used by:
 * - Client SDKs to specify recognition parameters
 * - Demo applications for user input
 * - Service layer to configure provider sessions
 *
 * Core fields only - all provider-specific options go in providerOptions
 *
 * @example
 * ```typescript
 * const config: ASRRequestConfig = {
 *   provider: RecognitionProvider.GOOGLE,
 *   model: GoogleModel.LATEST_LONG,
 *   language: Language.ENGLISH_US,
 *   sampleRate: SampleRate.RATE_16000, // or just 16000
 *   encoding: AudioEncoding.LINEAR16,
 *   providerOptions: {
 *     google: {
 *       enableAutomaticPunctuation: true,
 *       interimResults: true,
 *       singleUtterance: false
 *     }
 *   }
 * };
 * ```
 */
interface ASRRequestConfig {
    /**
     * The ASR provider to use
     * Must be one of the supported providers in RecognitionProvider enum
     */
    provider: RecognitionProvider | string;
    /**
     * Optional model specification for the provider
     * Can be provider-specific model enum or string
     * If not specified, provider's default model will be used
     */
    model?: RecognitionModel;
    /**
     * Language/locale for recognition
     * Use Language enum for common languages
     * Can also accept BCP-47 language tags as strings
     */
    language: Language | string;
    /**
     * Audio sample rate in Hz
     * Prefer using SampleRate enum values for standard rates
     * Can also accept numeric Hz values (e.g., 16000)
     */
    sampleRate: SampleRate | number;
    /**
     * Audio encoding format
     * Must match the actual audio data being sent
     * Use AudioEncoding enum for standard formats
     */
    encoding: AudioEncoding | string;
    /**
     * Enable interim (partial) results during recognition
     * When true, receive real-time updates before finalization
     * When false, only receive final results
     * Default: false
     */
    interimResults?: boolean;
    /**
     * Require GameContext (such as song titles) before starting recognition
     * When true, server waits for GameContext message before processing audio
     * When false, recognition starts immediately
     * Default: false
     */
    useContext?: boolean;
    /**
     * Final transcript stability mode
     *
     * Controls timeout duration for fallback final transcript when provider
     * doesn't respond with is_final=true after stopRecording().
     *
     * - aggressive: 100ms - fast response, may cut off slow providers
     * - balanced: 200ms - current default, good for most cases
     * - conservative: 400ms - wait longer for complex utterances
     *
     * @default 'balanced'
     * @see FinalTranscriptStability enum for detailed descriptions
     */
    finalTranscriptStability?: FinalTranscriptStability | string;
    /**
     * Additional provider-specific options
     *
     * Common options per provider:
     * - Deepgram: punctuate, smart_format, diarize, utterances
     * - Google: enableAutomaticPunctuation, singleUtterance, enableWordTimeOffsets
     * - AssemblyAI: formatTurns, filter_profanity, word_boost
     *
     * Note: interimResults is now a top-level field, but can still be overridden per provider
     *
     * @example
     * ```typescript
     * providerOptions: {
     *   google: {
     *     enableAutomaticPunctuation: true,
     *     singleUtterance: false,
     *     enableWordTimeOffsets: false
     *   }
     * }
     * ```
     */
    providerOptions?: Record<string, any>;
    /**
     * Optional fallback ASR configurations
     *
     * List of alternative ASR configurations to use if the primary fails.
     * Each fallback config is a complete ASRRequestConfig that will be tried
     * in order until one succeeds.
     *
     * @example
     * ```typescript
     * fallbackModels: [
     *   {
     *     provider: RecognitionProvider.DEEPGRAM,
     *     model: DeepgramModel.NOVA_2,
     *     language: Language.ENGLISH_US,
     *     sampleRate: 16000,
     *     encoding: AudioEncoding.LINEAR16
     *   },
     *   {
     *     provider: RecognitionProvider.GOOGLE,
     *     model: GoogleModel.LATEST_SHORT,
     *     language: Language.ENGLISH_US,
     *     sampleRate: 16000,
     *     encoding: AudioEncoding.LINEAR16
     *   }
     * ]
     * ```
     */
    fallbackModels?: ASRRequestConfig[];
}
/**
 * Partial ASR config for updates
 * All fields are optional for partial updates
 */
type PartialASRRequestConfig = Partial<ASRRequestConfig>;
/**
 * Helper function to create a default ASR config
 */
declare function createDefaultASRConfig(overrides?: PartialASRRequestConfig): ASRRequestConfig;
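/**
 * Aside: createDefaultASRConfig builds a complete ASRRequestConfig from partial
 * overrides. Its default values are not visible in this declaration file, so the
 * sketch below relies only on the declared signature and the enums above:
 *
 * ```typescript
 * import {
 *   AudioEncoding,
 *   createDefaultASRConfig,
 *   DeepgramModel,
 *   Language,
 *   RecognitionProvider,
 *   SampleRate,
 * } from '@volley/recognition-client-sdk';
 *
 * const config = createDefaultASRConfig({
 *   provider: RecognitionProvider.DEEPGRAM,
 *   model: DeepgramModel.NOVA_3,
 *   language: Language.ENGLISH_US,
 *   sampleRate: SampleRate.RATE_16000,
 *   encoding: AudioEncoding.LINEAR16,
 *   interimResults: true,
 *   // tried in order if the primary provider fails
 *   fallbackModels: [{
 *     provider: RecognitionProvider.GOOGLE,
 *     language: Language.ENGLISH_US,
 *     sampleRate: 16000,
 *     encoding: AudioEncoding.LINEAR16,
 *   }],
 * });
 * ```
 */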
/**
 * Gemini Model Types
 * Based on available models as of January 2025
 *
 * API Version Notes:
 * - Gemini 2.5+ models: Use v1beta API (early access features)
 * - Gemini 2.0 models: Use v1beta API (early access features)
 * - Gemini 1.5 models: Use v1 API (stable, production-ready)
 *
 * @see https://ai.google.dev/gemini-api/docs/models
 * @see https://ai.google.dev/gemini-api/docs/api-versions
 */
declare enum GeminiModel {
    GEMINI_2_5_PRO = "gemini-2.5-pro",
    GEMINI_2_5_FLASH = "gemini-2.5-flash",
    GEMINI_2_5_FLASH_LITE = "gemini-2.5-flash-lite",
    GEMINI_2_0_FLASH_LATEST = "gemini-2.0-flash-latest",
    GEMINI_2_0_FLASH_EXP = "gemini-2.0-flash-exp"
}
/**
 * OpenAI Model Types
 */
declare enum OpenAIModel {
    WHISPER_1 = "whisper-1"
}
/**
 * Standard stage/environment constants used across all services
 */
declare const STAGES: {
    readonly LOCAL: "local";
    readonly DEV: "dev";
    readonly STAGING: "staging";
    readonly PRODUCTION: "production";
};
type Stage = typeof STAGES[keyof typeof STAGES];
/**
 * Generic WebSocket protocol types and utilities
 * Supports flexible versioning and message types
 * Used by both client and server implementations
 */
/**
 * Base message structure - completely flexible
 * @template V - Version type (number, string, etc.)
 */
interface Message<V = number> {
    v: V;
    type: string;
    data?: unknown;
}
/**
 * Version serializer interface
 * Converts between version type V and byte representation
 */
interface VersionSerializer<V> {
    serialize: (v: V) => number;
    deserialize: (byte: number) => V;
}
/**
 * WebSocketAudioClient - Abstract base class for WebSocket clients
 * Sends audio and control messages, receives responses from server
 *
 * Features:
 * - Generic version type support (number, string, etc.)
 * - Type-safe upward/downward message data
 * - Client-side backpressure monitoring
 * - Abstract hooks for application-specific logic
 * - Format-agnostic audio protocol (supports any encoding)
 */
type ClientConfig = {
    url: string;
    highWM?: number;
    lowWM?: number;
};
/**
 * WebSocketAudioClient - Abstract base class for WebSocket clients
 * that send audio frames and JSON messages
 *
 * @template V - Version type (number, string, object, etc.)
 * @template TUpward - Type of upward message data (Client -> Server)
 * @template TDownward - Type of downward message data (Server -> Client)
 *
 * @example
 * ```typescript
 * class MyClient extends WebSocketAudioClient<number, MyUpMsg, MyDownMsg> {
 *   protected onConnected() {
 *     console.log('Connected!');
 *   }
 *
 *   protected onMessage(msg) {
 *     console.log('Received:', msg.type, msg.data);
 *   }
 *
 *   protected onDisconnected(code, reason) {
 *     console.log('Disconnected:', code, reason);
 *   }
 *
 *   protected onError(error) {
 *     console.error('Error:', error);
 *   }
 * }
 *
 * const client = new MyClient({ url: 'ws://localhost:8080' });
 * client.connect();
 * client.sendMessage(1, 'configure', { language: 'en' });
 * client.sendAudio(audioData);
 * ```
 */
declare abstract class WebSocketAudioClient<
    V = number, // Version type (default: number)
    TUpward = unknown, // Upward message data type
    TDownward = unknown
> {
    private cfg;
    protected versionSerializer: VersionSerializer<V>;
    private ws;
    private seq;
    private HWM;
    private LWM;
    constructor(cfg: ClientConfig, versionSerializer?: VersionSerializer<V>);
    /**
     * Hook: Called when WebSocket connection is established
     */
    protected abstract onConnected(): void;
    /**
     * Hook: Called when WebSocket connection closes
     * @param code - Close code (see WebSocketCloseCode enum)
     * @param reason - Human-readable close reason
     */
    protected abstract onDisconnected(code: number, reason: string): void;
    /**
     * Hook: Called when WebSocket error occurs
     */
    protected abstract onError(error: Event): void;
    /**
     * Hook: Called when downward message arrives from server
     * Override this to handle