@volley/recognition-client-sdk

Recognition Service TypeScript/Node.js Client SDK

import { z } from 'zod'; /** * Provider types and enums for recognition services * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes. */ /** * Supported speech recognition providers */ declare enum RecognitionProvider { ASSEMBLYAI = "assemblyai", DEEPGRAM = "deepgram", ELEVENLABS = "elevenlabs", FIREWORKS = "fireworks", GOOGLE = "google", GEMINI_BATCH = "gemini-batch", OPENAI_BATCH = "openai-batch", OPENAI_REALTIME = "openai-realtime" } /** * ASR API type - distinguishes between streaming and file-based transcription APIs * - STREAMING: Real-time streaming APIs (Deepgram, AssemblyAI, Google) * - FILE_BASED: File upload/batch APIs (OpenAI Batch, Gemini Batch) */ declare enum ASRApiType { STREAMING = "streaming", FILE_BASED = "file-based" } /** * Deepgram model names */ declare enum DeepgramModel { NOVA_2 = "nova-2", NOVA_3 = "nova-3", FLUX_GENERAL_EN = "flux-general-en" } /** * Google Cloud Speech models * @see https://cloud.google.com/speech-to-text/docs/transcription-model * @see https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model */ declare enum GoogleModel { CHIRP_3 = "chirp_3", CHIRP_2 = "chirp_2", CHIRP = "chirp", LATEST_LONG = "latest_long", LATEST_SHORT = "latest_short", TELEPHONY = "telephony", TELEPHONY_SHORT = "telephony_short", DEFAULT = "default", COMMAND_AND_SEARCH = "command_and_search", PHONE_CALL = "phone_call", VIDEO = "video" } /** * Fireworks AI models for ASR * @see https://docs.fireworks.ai/guides/querying-asr-models * @see https://fireworks.ai/models/fireworks/fireworks-asr-large */ declare enum FireworksModel { ASR_V1 = "fireworks-asr-large", ASR_V2 = "fireworks-asr-v2", WHISPER_V3 = "whisper-v3", WHISPER_V3_TURBO = "whisper-v3-turbo" } /** * ElevenLabs Scribe models for speech-to-text * @see https://elevenlabs.io/blog/introducing-scribe-v2-realtime * @see https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming * @see https://elevenlabs.io/docs/api-reference/speech-to-text/convert */ declare enum ElevenLabsModel { SCRIBE_V2_REALTIME = "scribe_v2_realtime", SCRIBE_V1 = "scribe_v1" } /** * OpenAI Realtime API transcription models * These are the verified `input_audio_transcription.model` values. * @see https://platform.openai.com/docs/guides/realtime */ declare enum OpenAIRealtimeModel { GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe" } /** * Type alias for any model from any provider */ type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string; /** * Audio encoding types */ declare enum AudioEncoding { ENCODING_UNSPECIFIED = 0, LINEAR16 = 1, OGG_OPUS = 2, FLAC = 3, MULAW = 4, ALAW = 5 } declare namespace AudioEncoding { /** * Convert numeric ID to AudioEncoding enum * @param id - Numeric encoding identifier (0-5) * @returns AudioEncoding enum value or undefined if invalid */ function fromId(id: number): AudioEncoding | undefined; /** * Convert string name to AudioEncoding enum * @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive) * @returns AudioEncoding enum value or undefined if invalid */ function fromName(nameStr: string): AudioEncoding | undefined; /** * Convert AudioEncoding enum to numeric ID * @param encoding - AudioEncoding enum value * @returns Numeric ID (0-5) */ function toId(encoding: AudioEncoding): number; /** * Convert AudioEncoding enum to string name * @param encoding - AudioEncoding enum value * @returns String name like "LINEAR16", "MULAW", etc. 
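 *
 * A small usage sketch (illustrative only; it uses nothing beyond the helpers
 * declared in this AudioEncoding namespace):
 * @example
 * ```typescript
 * // Normalize a user-supplied encoding name to the enum, then back to canonical forms.
 * const enc = AudioEncoding.fromName('linear16'); // AudioEncoding.LINEAR16
 * if (enc !== undefined) {
 *   console.log(AudioEncoding.toName(enc)); // "LINEAR16"
 *   console.log(AudioEncoding.toId(enc));   // 1
 * }
 * ```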
 */
    function toName(encoding: AudioEncoding): string;
    /**
     * Check if a numeric ID is a valid encoding
     * @param id - Numeric identifier to validate
     * @returns true if valid encoding ID
     */
    function isIdValid(id: number): boolean;
    /**
     * Check if a string name is a valid encoding
     * @param nameStr - String name to validate
     * @returns true if valid encoding name
     */
    function isNameValid(nameStr: string): boolean;
}
/**
 * Common sample rates (in Hz)
 */
declare enum SampleRate {
    RATE_8000 = 8000,
    RATE_16000 = 16000,
    RATE_22050 = 22050,
    RATE_24000 = 24000,
    RATE_32000 = 32000,
    RATE_44100 = 44100,
    RATE_48000 = 48000
}
declare namespace SampleRate {
    /**
     * Convert Hz value to SampleRate enum
     * @param hz - Sample rate in Hz (8000, 16000, etc.)
     * @returns SampleRate enum value or undefined if invalid
     */
    function fromHz(hz: number): SampleRate | undefined;
    /**
     * Convert string name to SampleRate enum
     * @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
     * @returns SampleRate enum value or undefined if invalid
     */
    function fromName(nameStr: string): SampleRate | undefined;
    /**
     * Convert SampleRate enum to Hz value
     * @param rate - SampleRate enum value
     * @returns Hz value (8000, 16000, etc.)
     */
    function toHz(rate: SampleRate): number;
    /**
     * Convert SampleRate enum to string name
     * @param rate - SampleRate enum value
     * @returns String name like "RATE_8000", "RATE_16000", etc.
     */
    function toName(rate: SampleRate): string;
    /**
     * Check if a numeric Hz value is a valid sample rate
     * @param hz - Hz value to validate
     * @returns true if valid sample rate
     */
    function isHzValid(hz: number): boolean;
    /**
     * Check if a string name is a valid sample rate
     * @param nameStr - String name to validate
     * @returns true if valid sample rate name
     */
    function isNameValid(nameStr: string): boolean;
}
/**
 * Supported languages for recognition
 * Using BCP-47 language tags
 */
declare enum Language {
    ENGLISH_US = "en-US",
    ENGLISH_GB = "en-GB",
    SPANISH_ES = "es-ES",
    SPANISH_MX = "es-MX",
    FRENCH_FR = "fr-FR",
    GERMAN_DE = "de-DE",
    ITALIAN_IT = "it-IT",
    PORTUGUESE_BR = "pt-BR",
    JAPANESE_JP = "ja-JP",
    KOREAN_KR = "ko-KR",
    CHINESE_CN = "zh-CN",
    CHINESE_TW = "zh-TW"
}
/**
 * Recognition Result Types V1
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
 * Types and schemas for recognition results sent to SDK clients
 */
/**
 * Message type discriminator for recognition results V1
 */
declare enum RecognitionResultTypeV1 {
    TRANSCRIPTION = "Transcription",
    FUNCTION_CALL = "FunctionCall",
    METADATA = "Metadata",
    ERROR = "Error",
    CLIENT_CONTROL_MESSAGE = "ClientControlMessage"
}
/**
 * Transcription result V1 - contains the transcript message
 * In the long run the game side should not need to know about it. In the short run it is sent back to the client.
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
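 *
 * A minimal handling sketch (illustrative only; it assumes the result arrives
 * through an `onTranscript` callback as declared further down in this file):
 * @example
 * ```typescript
 * const onTranscript = (result: TranscriptionResultV1) => {
 *   if (result.is_finished) {
 *     console.log('final:', result.finalTranscript, result.finalTranscriptConfidence);
 *   } else if (result.pendingTranscript !== undefined) {
 *     console.log('partial:', result.pendingTranscript);
 *   }
 * };
 * ```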
*/ declare const TranscriptionResultSchemaV1: z.ZodObject<{ type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>; audioUtteranceId: z.ZodString; finalTranscript: z.ZodString; finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>; pendingTranscript: z.ZodOptional<z.ZodString>; pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>; is_finished: z.ZodBoolean; voiceStart: z.ZodOptional<z.ZodNumber>; voiceDuration: z.ZodOptional<z.ZodNumber>; voiceEnd: z.ZodOptional<z.ZodNumber>; startTimestamp: z.ZodOptional<z.ZodNumber>; endTimestamp: z.ZodOptional<z.ZodNumber>; receivedAtMs: z.ZodOptional<z.ZodNumber>; accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>; }, "strip", z.ZodTypeAny, { type: RecognitionResultTypeV1.TRANSCRIPTION; audioUtteranceId: string; finalTranscript: string; is_finished: boolean; finalTranscriptConfidence?: number | undefined; pendingTranscript?: string | undefined; pendingTranscriptConfidence?: number | undefined; voiceStart?: number | undefined; voiceDuration?: number | undefined; voiceEnd?: number | undefined; startTimestamp?: number | undefined; endTimestamp?: number | undefined; receivedAtMs?: number | undefined; accumulatedAudioTimeMs?: number | undefined; }, { type: RecognitionResultTypeV1.TRANSCRIPTION; audioUtteranceId: string; finalTranscript: string; is_finished: boolean; finalTranscriptConfidence?: number | undefined; pendingTranscript?: string | undefined; pendingTranscriptConfidence?: number | undefined; voiceStart?: number | undefined; voiceDuration?: number | undefined; voiceEnd?: number | undefined; startTimestamp?: number | undefined; endTimestamp?: number | undefined; receivedAtMs?: number | undefined; accumulatedAudioTimeMs?: number | undefined; }>; type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>; /** * Function call result V1 - similar to LLM function call * In the long run game server should know it, rather than TV or client. */ declare const FunctionCallResultSchemaV1: z.ZodObject<{ type: z.ZodLiteral<RecognitionResultTypeV1.FUNCTION_CALL>; audioUtteranceId: z.ZodString; functionName: z.ZodString; functionArgJson: z.ZodString; }, "strip", z.ZodTypeAny, { type: RecognitionResultTypeV1.FUNCTION_CALL; audioUtteranceId: string; functionName: string; functionArgJson: string; }, { type: RecognitionResultTypeV1.FUNCTION_CALL; audioUtteranceId: string; functionName: string; functionArgJson: string; }>; type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>; /** * Metadata result V1 - contains metadata, timing information, and ASR config * Sent when the provider connection closes to provide final timing metrics and config * In the long run game server should know it, rather than TV or client. 
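 *
 * A minimal handling sketch (illustrative only; every field other than `type`
 * and `audioUtteranceId` is optional and may be absent):
 * @example
 * ```typescript
 * const onMetadata = (meta: MetadataResultV1) => {
 *   console.log('utterance', meta.audioUtteranceId, {
 *     duration: meta.duration,
 *     accumulatedAudioTimeMs: meta.accumulatedAudioTimeMs,
 *     costInUSD: meta.costInUSD,
 *   });
 * };
 * ```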
*/ declare const MetadataResultSchemaV1: z.ZodObject<{ type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>; audioUtteranceId: z.ZodString; recordingStartMs: z.ZodOptional<z.ZodNumber>; recordingEndMs: z.ZodOptional<z.ZodNumber>; transcriptEndMs: z.ZodOptional<z.ZodNumber>; socketCloseAtMs: z.ZodOptional<z.ZodNumber>; duration: z.ZodOptional<z.ZodNumber>; volume: z.ZodOptional<z.ZodNumber>; accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>; costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>; apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>; asrConfig: z.ZodOptional<z.ZodString>; rawAsrMetadata: z.ZodOptional<z.ZodString>; }, "strip", z.ZodTypeAny, { type: RecognitionResultTypeV1.METADATA; audioUtteranceId: string; recordingStartMs?: number | undefined; recordingEndMs?: number | undefined; transcriptEndMs?: number | undefined; socketCloseAtMs?: number | undefined; duration?: number | undefined; volume?: number | undefined; accumulatedAudioTimeMs?: number | undefined; costInUSD?: number | undefined; apiType?: ASRApiType | undefined; asrConfig?: string | undefined; rawAsrMetadata?: string | undefined; }, { type: RecognitionResultTypeV1.METADATA; audioUtteranceId: string; recordingStartMs?: number | undefined; recordingEndMs?: number | undefined; transcriptEndMs?: number | undefined; socketCloseAtMs?: number | undefined; duration?: number | undefined; volume?: number | undefined; accumulatedAudioTimeMs?: number | undefined; costInUSD?: number | undefined; apiType?: ASRApiType | undefined; asrConfig?: string | undefined; rawAsrMetadata?: string | undefined; }>; type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>; /** * Error type enum V1 - categorizes different types of errors */ declare enum ErrorTypeV1 { AUTHENTICATION_ERROR = "authentication_error", VALIDATION_ERROR = "validation_error", PROVIDER_ERROR = "provider_error", TIMEOUT_ERROR = "timeout_error", QUOTA_EXCEEDED = "quota_exceeded", CONNECTION_ERROR = "connection_error", UNKNOWN_ERROR = "unknown_error" } /** * Error result V1 - contains error message * In the long run game server should know it, rather than TV or client. */ declare const ErrorResultSchemaV1: z.ZodObject<{ type: z.ZodLiteral<RecognitionResultTypeV1.ERROR>; audioUtteranceId: z.ZodString; errorType: z.ZodOptional<z.ZodNativeEnum<typeof ErrorTypeV1>>; message: z.ZodOptional<z.ZodString>; code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>; description: z.ZodOptional<z.ZodString>; }, "strip", z.ZodTypeAny, { type: RecognitionResultTypeV1.ERROR; audioUtteranceId: string; errorType?: ErrorTypeV1 | undefined; message?: string | undefined; code?: string | number | undefined; description?: string | undefined; }, { type: RecognitionResultTypeV1.ERROR; audioUtteranceId: string; errorType?: ErrorTypeV1 | undefined; message?: string | undefined; code?: string | number | undefined; description?: string | undefined; }>; type ErrorResultV1 = z.infer<typeof ErrorResultSchemaV1>; /** * Recognition Context Types V1 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes. 
* Types and schemas for recognition context data */ /** * Message type discriminator for recognition context V1 */ declare enum RecognitionContextTypeV1 { GAME_CONTEXT = "GameContext", CONTROL_SIGNAL = "ControlSignal", ASR_REQUEST = "ASRRequest" } /** * Control signal types for recognition V1 */ declare enum ControlSignalTypeV1 { START_RECORDING = "start_recording", STOP_RECORDING = "stop_recording" } /** * Game context V1 - contains game state information */ declare const GameContextSchemaV1: z.ZodObject<{ type: z.ZodLiteral<RecognitionContextTypeV1.GAME_CONTEXT>; gameId: z.ZodString; gamePhase: z.ZodString; promptSTT: z.ZodOptional<z.ZodString>; promptSTF: z.ZodOptional<z.ZodString>; promptTTF: z.ZodOptional<z.ZodString>; slotMap: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>>; }, "strip", z.ZodTypeAny, { type: RecognitionContextTypeV1.GAME_CONTEXT; gameId: string; gamePhase: string; promptSTT?: string | undefined; promptSTF?: string | undefined; promptTTF?: string | undefined; slotMap?: Record<string, string[]> | undefined; }, { type: RecognitionContextTypeV1.GAME_CONTEXT; gameId: string; gamePhase: string; promptSTT?: string | undefined; promptSTF?: string | undefined; promptTTF?: string | undefined; slotMap?: Record<string, string[]> | undefined; }>; type GameContextV1 = z.infer<typeof GameContextSchemaV1>; /** * Unified ASR Request Configuration * * Provider-agnostic configuration for ASR (Automatic Speech Recognition) requests. * This interface provides a consistent API for clients regardless of the underlying provider. * * All fields use library-defined enums for type safety and consistency. * Provider-specific mappers will convert these to provider-native formats. */ /** * Final transcript stability modes * * Controls timeout duration for fallback final transcript after stopRecording(). * Similar to AssemblyAI's turn detection confidence modes but applied to our * internal timeout mechanism when vendors don't respond with is_final=true. 
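 *
 * A hedged configuration sketch (illustrative only; `ASRRequestConfig` and the
 * enums referenced here are declared elsewhere in this file):
 * @example
 * ```typescript
 * const config: ASRRequestConfig = {
 *   provider: RecognitionProvider.DEEPGRAM,
 *   model: DeepgramModel.NOVA_3,
 *   language: Language.ENGLISH_US,
 *   sampleRate: SampleRate.RATE_16000,
 *   encoding: AudioEncoding.LINEAR16,
 *   // Wait longer for the provider's own final before the fallback timeout fires.
 *   finalTranscriptStability: FinalTranscriptStability.CONSERVATIVE,
 * };
 * ```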
* * @see https://www.assemblyai.com/docs/speech-to-text/universal-streaming/turn-detection */ declare enum FinalTranscriptStability { /** * Aggressive mode: 100ms timeout * Fast response, optimized for short utterances and quick back-and-forth * Use cases: IVR, quick commands, retail confirmations */ AGGRESSIVE = "aggressive", /** * Balanced mode: 200ms timeout (default) * Natural middle ground for most conversational scenarios * Use cases: General customer support, tech support, typical voice interactions */ BALANCED = "balanced", /** * Conservative mode: 400ms timeout * Wait longer for providers, optimized for complex/reflective speech * Use cases: Healthcare, complex queries, careful thought processes */ CONSERVATIVE = "conservative", /** * Experimental mode: 10000ms (10 seconds) timeout * Very long wait for batch/async providers that need significant processing time * Use cases: Batch processing (Gemini, OpenAI Whisper), complex audio analysis * Note: Should be cancelled immediately when transcript is received */ EXPERIMENTAL = "experimental" } /** * Unified ASR request configuration * * This configuration is used by: * - Client SDKs to specify recognition parameters * - Demo applications for user input * - Service layer to configure provider sessions * * Core fields only - all provider-specific options go in providerOptions * * @example * ```typescript * const config: ASRRequestConfig = { * provider: RecognitionProvider.GOOGLE, * model: GoogleModel.LATEST_LONG, * language: Language.ENGLISH_US, * sampleRate: SampleRate.RATE_16000, // or just 16000 * encoding: AudioEncoding.LINEAR16, * providerOptions: { * google: { * enableAutomaticPunctuation: true, * interimResults: true, * singleUtterance: false * } * } * }; * ``` */ interface ASRRequestConfig { /** * The ASR provider to use * Must be one of the supported providers in RecognitionProvider enum */ provider: RecognitionProvider | string; /** * Optional model specification for the provider * Can be provider-specific model enum or string * If not specified, provider's default model will be used */ model?: RecognitionModel; /** * Language/locale for recognition * Use Language enum for common languages * Can also accept BCP-47 language tags as strings */ language: Language | string; /** * Audio sample rate in Hz * Prefer using SampleRate enum values for standard rates * Can also accept numeric Hz values (e.g., 16000) */ sampleRate: SampleRate | number; /** * Audio encoding format * Must match the actual audio data being sent * Use AudioEncoding enum for standard formats */ encoding: AudioEncoding | string; /** * Enable interim (partial) results during recognition * When true, receive real-time updates before finalization * When false, only receive final results * Default: false */ interimResults?: boolean; /** * Require GameContext before starting recognition such as song titles * When true, server waits for GameContext message before processing audio * When false, recognition starts immediately * Default: false */ useContext?: boolean; /** * Final transcript stability mode * * Controls timeout duration for fallback final transcript when provider * doesn't respond with is_final=true after stopRecording(). 
* * - aggressive: 100ms - fast response, may cut off slow providers * - balanced: 200ms - current default, good for most cases * - conservative: 400ms - wait longer for complex utterances * * @default 'balanced' * @see FinalTranscriptStability enum for detailed descriptions */ finalTranscriptStability?: FinalTranscriptStability | string; /** * Additional provider-specific options * * Common options per provider: * - Deepgram: punctuate, smart_format, diarize, utterances * - Google: enableAutomaticPunctuation, singleUtterance, enableWordTimeOffsets * - AssemblyAI: formatTurns, filter_profanity, word_boost * * Note: interimResults is now a top-level field, but can still be overridden per provider * * @example * ```typescript * providerOptions: { * google: { * enableAutomaticPunctuation: true, * singleUtterance: false, * enableWordTimeOffsets: false * } * } * ``` */ providerOptions?: Record<string, any>; /** * Optional fallback ASR configurations * * List of alternative ASR configurations to use if the primary fails. * Each fallback config is a complete ASRRequestConfig that will be tried * in order until one succeeds. * * @example * ```typescript * fallbackModels: [ * { * provider: RecognitionProvider.DEEPGRAM, * model: DeepgramModel.NOVA_2, * language: Language.ENGLISH_US, * sampleRate: 16000, * encoding: AudioEncoding.LINEAR16 * }, * { * provider: RecognitionProvider.GOOGLE, * model: GoogleModel.LATEST_SHORT, * language: Language.ENGLISH_US, * sampleRate: 16000, * encoding: AudioEncoding.LINEAR16 * } * ] * ``` */ fallbackModels?: ASRRequestConfig[]; } /** * Standard stage/environment constants used across all services */ declare const STAGES: { readonly LOCAL: "local"; readonly DEV: "dev"; readonly STAGING: "staging"; readonly PRODUCTION: "production"; }; type Stage = typeof STAGES[keyof typeof STAGES]; /** * Generic WebSocket protocol types and utilities * Supports flexible versioning and message types * Used by both client and server implementations */ /** * Base message structure - completely flexible * @template V - Version type (number, string, etc.) */ interface Message<V = number> { v: V; type: string; data?: unknown; } /** * Version serializer interface * Converts between version type V and byte representation */ interface VersionSerializer<V> { serialize: (v: V) => number; deserialize: (byte: number) => V; } /** * WebSocketAudioClient - Abstract base class for WebSocket clients * Sends audio and control messages, receives responses from server * * Features: * - Generic version type support (number, string, etc.) * - Type-safe upward/downward message data * - Client-side backpressure monitoring * - Abstract hooks for application-specific logic * - Format-agnostic audio protocol (supports any encoding) */ type ClientConfig = { url: string; highWM?: number; lowWM?: number; }; /** * WebSocketAudioClient - Abstract base class for WebSocket clients * that send audio frames and JSON messages * * @template V - Version type (number, string, object, etc.) 
* @template TUpward - Type of upward message data (Client -> Server) * @template TDownward - Type of downward message data (Server -> Client) * * @example * ```typescript * class MyClient extends WebSocketAudioClient<number, MyUpMsg, MyDownMsg> { * protected onConnected() { * console.log('Connected!'); * } * * protected onMessage(msg) { * console.log('Received:', msg.type, msg.data); * } * * protected onDisconnected(code, reason) { * console.log('Disconnected:', code, reason); * } * * protected onError(error) { * console.error('Error:', error); * } * } * * const client = new MyClient({ url: 'ws://localhost:8080' }); * client.connect(); * client.sendMessage(1, 'configure', { language: 'en' }); * client.sendAudio(audioData); * ``` */ declare abstract class WebSocketAudioClient<V = number, // Version type (default: number) TUpward = unknown, // Upward message data type TDownward = unknown> { private cfg; protected versionSerializer: VersionSerializer<V>; private ws; private seq; private HWM; private LWM; constructor(cfg: ClientConfig, versionSerializer?: VersionSerializer<V>); /** * Hook: Called when WebSocket connection is established */ protected abstract onConnected(): void; /** * Hook: Called when WebSocket connection closes * @param code - Close code (see WebSocketCloseCode enum) * @param reason - Human-readable close reason */ protected abstract onDisconnected(code: number, reason: string): void; /** * Hook: Called when WebSocket error occurs */ protected abstract onError(error: Event): void; /** * Hook: Called when downward message arrives from server * Override this to handle messages (optional - default does nothing) */ protected onMessage(_msg: Message<V> & { data: TDownward; }): void; connect(): void; /** * Send JSON message to server * @param version - Message version * @param type - Message type (developer defined) * @param data - Message payload (typed) */ sendMessage(version: V, type: string, data: TUpward): void; /** * Send audio frame with specified encoding and sample rate * @param audioData - Audio data (any format: Int16Array, Uint8Array, ArrayBuffer, etc.) * @param version - Audio frame version * @param encodingId - Audio encoding ID (0-5, e.g., AudioEncoding.LINEAR16) * @param sampleRate - Sample rate in Hz (e.g., 16000) */ sendAudio(audioData: ArrayBuffer | ArrayBufferView, version: V, encodingId: number, sampleRate: number): void; /** * Get current WebSocket buffer size */ getBufferedAmount(): number; /** * Check if local buffer is backpressured */ isLocalBackpressured(): boolean; /** * Check if ready to send audio * Verifies: connection open, no local buffer pressure */ canSend(): boolean; /** * Check if connection is open */ isOpen(): boolean; /** * Get current connection state */ getReadyState(): number; /** * Close the WebSocket connection * Protected method for subclasses to implement disconnect logic * @param code - WebSocket close code (default: 1000 = normal closure) * @param reason - Human-readable close reason */ protected closeConnection(code?: number, reason?: string): void; } /** * Recognition Client Types * * Type definitions and interfaces for the recognition client SDK. * These interfaces enable dependency injection, testing, and alternative implementations. 
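 *
 * A hedged dependency-injection sketch (illustrative only; `IRecognitionClient`
 * is declared below, and `createTestClient` is a hypothetical test helper):
 * @example
 * ```typescript
 * async function captureUtterance(client: IRecognitionClient, chunks: ArrayBuffer[]) {
 *   await client.connect();
 *   for (const chunk of chunks) client.sendAudio(chunk);
 *   await client.stopRecording();
 *   return client.getStats();
 * }
 * // Production: pass a RealTimeTwoWayWebSocketRecognitionClient.
 * // Tests: pass any stub implementing IRecognitionClient, e.g. createTestClient().
 * ```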
 */
/**
 * Client connection state enum
 * Represents the various states a recognition client can be in during its lifecycle
 */
declare enum ClientState {
    /** Initial state, no connection established */
    INITIAL = "initial",
    /** Actively establishing WebSocket connection */
    CONNECTING = "connecting",
    /** WebSocket connected but waiting for server ready signal */
    CONNECTED = "connected",
    /** Server ready, can send audio */
    READY = "ready",
    /** Sent stop signal, waiting for final transcript */
    STOPPING = "stopping",
    /** Connection closed normally after stop */
    STOPPED = "stopped",
    /** Connection failed or lost unexpectedly */
    FAILED = "failed"
}
/**
 * Callback URL configuration with message type filtering
 */
interface RecognitionCallbackUrl {
    /** The callback URL endpoint */
    url: string;
    /** Array of message types to send to this URL. If empty/undefined, all types are sent */
    messageTypes?: Array<string | number>;
}
interface IRecognitionClientConfig {
    /**
     * WebSocket endpoint URL (optional)
     * Either `url` or `stage` must be provided.
     * If both are provided, `url` takes precedence.
     *
     * Example with explicit URL:
     * ```typescript
     * { url: 'wss://custom-endpoint.example.com/ws/v1/recognize' }
     * ```
     */
    url?: string;
    /**
     * Stage for recognition service (recommended)
     * Either `url` or `stage` must be provided.
     * If both are provided, `url` takes precedence.
     * Defaults to production if neither is provided.
     *
     * Example with STAGES enum (recommended):
     * ```typescript
     * import { STAGES } from '@recog/shared-types';
     * { stage: STAGES.STAGING }
     * ```
     *
     * String values also accepted:
     * ```typescript
     * { stage: 'staging' } // STAGES.LOCAL | STAGES.DEV | STAGES.STAGING | STAGES.PRODUCTION
     * ```
     */
    stage?: Stage | string;
    /** ASR configuration (provider, model, language, etc.) - optional */
    asrRequestConfig?: ASRRequestConfig;
    /** Game context for improved recognition accuracy */
    gameContext?: GameContextV1;
    /** Audio utterance ID (optional) - if not provided, a UUID v4 will be generated */
    audioUtteranceId?: string;
    /**
     * Callback URLs for server-side notifications with optional message type filtering (optional)
     * The game side only needs to use this if another service needs to be notified about the transcription results.
     */
    callbackUrls?: RecognitionCallbackUrl[];
    /** User identification (optional) */
    userId?: string;
    /** Game session identification (optional). Called 'sessionId' in Platform and most games. */
    gameSessionId?: string;
    /** Device identification (optional) */
    deviceId?: string;
    /** Account identification (optional) */
    accountId?: string;
    /** Question answer identifier for tracking Q&A sessions (optional, for tracking purposes only) */
    questionAnswerId?: string;
    /** Platform of the audio recording device (optional, e.g., 'ios', 'android', 'web', 'unity') */
    platform?: string;
    /** Callback when a transcript is received */
    onTranscript?: (result: TranscriptionResultV1) => void;
    /**
     * Callback when a function call is received
     * Note: Not supported in 2025. P2 feature for future speech-to-function-call capability.
     */
    onFunctionCall?: (result: FunctionCallResultV1) => void;
    /** Callback when metadata is received. Fires only once, after transcription is complete. */
    onMetadata?: (metadata: MetadataResultV1) => void;
    /** Callback when an error occurs */
    onError?: (error: ErrorResultV1) => void;
    /** Callback when connected to the WebSocket */
    onConnected?: () => void;
    /**
     * Callback when the WebSocket disconnects
     * @param code - WebSocket close code (1000 = normal, 1006 = abnormal, etc.)
* @param reason - Close reason string */ onDisconnected?: (code: number, reason: string) => void; /** High water mark for backpressure control (bytes) */ highWaterMark?: number; /** Low water mark for backpressure control (bytes) */ lowWaterMark?: number; /** Maximum buffer duration in seconds (default: 60s) */ maxBufferDurationSec?: number; /** Expected chunks per second for ring buffer sizing (default: 100) */ chunksPerSecond?: number; /** * Connection retry configuration (optional) * Only applies to initial connection establishment, not mid-stream interruptions. * * Default: { maxAttempts: 4, delayMs: 200 } (try once, retry 3 times = 4 total attempts) * * Timing: Attempt 1 → FAIL → wait 200ms → Attempt 2 → FAIL → wait 200ms → Attempt 3 → FAIL → wait 200ms → Attempt 4 * * Example: * ```typescript * { * connectionRetry: { * maxAttempts: 2, // Try connecting up to 2 times (1 retry) * delayMs: 500 // Wait 500ms between attempts * } * } * ``` */ connectionRetry?: { /** Maximum number of connection attempts (default: 4, min: 1, max: 5) */ maxAttempts?: number; /** Delay in milliseconds between retry attempts (default: 200ms) */ delayMs?: number; }; /** * Optional logger function for debugging * If not provided, no logging will occur * @param level - Log level: 'debug', 'info', 'warn', 'error' * @param message - Log message * @param data - Optional additional data */ logger?: (level: 'debug' | 'info' | 'warn' | 'error', message: string, data?: any) => void; } /** * Recognition Client Interface * * Main interface for real-time speech recognition clients. * Provides methods for connection management, audio streaming, and session control. */ interface IRecognitionClient { /** * Connect to the WebSocket endpoint * @returns Promise that resolves when connected * @throws Error if connection fails or times out */ connect(): Promise<void>; /** * Send audio data to the recognition service * Audio is buffered locally and sent when connection is ready. * @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob */ sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void; /** * Stop recording and wait for final transcript * The server will close the connection after sending the final transcript. * @returns Promise that resolves when final transcript is received */ stopRecording(): Promise<void>; /** * Force stop and immediately close connection without waiting for server * * WARNING: This is an abnormal shutdown that bypasses the graceful stop flow: * - Does NOT wait for server to process remaining audio * - Does NOT receive final transcript from server * - Immediately closes WebSocket connection * - Cleans up resources (buffers, listeners) * * Use Cases: * - User explicitly cancels/abandons session * - Timeout scenarios where waiting is not acceptable * - Need immediate cleanup and can't wait for server * * RECOMMENDED: Use stopRecording() for normal shutdown. * Only use this when immediate disconnection is required. */ stopAbnormally(): void; /** * Get the audio utterance ID for this session * Available immediately after client construction. 
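 *
 * For example (a sketch; every result carries the same value in its
 * `audioUtteranceId` field, so the ID is handy for correlating logs):
 * @example
 * ```typescript
 * const utteranceId = client.getAudioUtteranceId();
 * // Later, match incoming results back to this session:
 * // result.audioUtteranceId === utteranceId
 * ```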
* @returns UUID v4 string identifying this recognition session */ getAudioUtteranceId(): string; /** * Get the current state of the client * @returns Current ClientState value */ getState(): ClientState; /** * Check if WebSocket connection is open * @returns true if connected and ready to communicate */ isConnected(): boolean; /** * Check if client is currently connecting * @returns true if connection is in progress */ isConnecting(): boolean; /** * Check if client is currently stopping * @returns true if stopRecording() is in progress */ isStopping(): boolean; /** * Check if transcription has finished * @returns true if the transcription is complete */ isTranscriptionFinished(): boolean; /** * Check if the audio buffer has overflowed * @returns true if the ring buffer has wrapped around */ isBufferOverflowing(): boolean; /** * Get client statistics * @returns Statistics about audio transmission and buffering */ getStats(): IRecognitionClientStats; /** * Get the WebSocket URL being used by this client * Available immediately after client construction. * @returns WebSocket URL string */ getUrl(): string; } /** * Client statistics interface */ interface IRecognitionClientStats { /** Total audio bytes sent to server */ audioBytesSent: number; /** Total number of audio chunks sent */ audioChunksSent: number; /** Total number of audio chunks buffered */ audioChunksBuffered: number; /** Number of times the ring buffer overflowed */ bufferOverflowCount: number; /** Current number of chunks in buffer */ currentBufferedChunks: number; /** Whether the ring buffer has wrapped (overwritten old data) */ hasWrapped: boolean; } /** * Configuration for RealTimeTwoWayWebSocketRecognitionClient * This extends IRecognitionClientConfig and is the main configuration interface * for creating a new RealTimeTwoWayWebSocketRecognitionClient instance. */ interface RealTimeTwoWayWebSocketRecognitionClientConfig extends IRecognitionClientConfig { } /** * RealTimeTwoWayWebSocketRecognitionClient - Clean, compact SDK for real-time speech recognition * * Features: * - Ring buffer-based audio storage with fixed memory footprint * - Automatic buffering when disconnected, immediate send when connected * - Buffer persists after flush (for future retry/reconnection scenarios) * - Built on WebSocketAudioClient for robust protocol handling * - Simple API: connect() → sendAudio() → stopRecording() * - Type-safe message handling with callbacks * - Automatic backpressure management * - Overflow detection with buffer state tracking * * Example: * ```typescript * const client = new RealTimeTwoWayWebSocketRecognitionClient({ * url: 'ws://localhost:3101/ws/v1/recognize', * onTranscript: (result) => console.log(result.finalTranscript), * onError: (error) => console.error(error), * maxBufferDurationSec: 60 // Ring buffer for 60 seconds * }); * * await client.connect(); * * // Send audio chunks - always stored in ring buffer, sent if connected * micStream.on('data', (chunk) => client.sendAudio(chunk)); * * // Signal end of audio and wait for final results * await client.stopRecording(); * * // Server will close connection after sending finals * // No manual cleanup needed - browser handles it * ``` */ /** * Re-export TranscriptionResultV1 as TranscriptionResult for backward compatibility */ type TranscriptionResult = TranscriptionResultV1; /** * RealTimeTwoWayWebSocketRecognitionClient - SDK-level client for real-time speech recognition * * Implements IRecognitionClient interface for dependency injection and testing. 
* Extends WebSocketAudioClient with local audio buffering and simple callback-based API. */ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient { private static readonly PROTOCOL_VERSION; private config; private audioBuffer; private messageHandler; private state; private connectionPromise; private isDebugLogEnabled; private audioBytesSent; private audioChunksSent; private audioStatsLogInterval; private lastAudioStatsLog; constructor(config: RealTimeTwoWayWebSocketRecognitionClientConfig); /** * Internal logging helper - only logs if a logger was provided in config * Debug logs are additionally gated by isDebugLogEnabled flag * @param level - Log level: debug, info, warn, or error * @param message - Message to log * @param data - Optional additional data to log */ private log; /** * Clean up internal resources to free memory * Called when connection closes (normally or abnormally) */ private cleanup; connect(): Promise<void>; /** * Attempt to connect with retry logic * Only retries on initial connection establishment, not mid-stream interruptions */ private connectWithRetry; sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void; private sendAudioInternal; stopRecording(): Promise<void>; stopAbnormally(): void; getAudioUtteranceId(): string; getUrl(): string; getState(): ClientState; isConnected(): boolean; isConnecting(): boolean; isStopping(): boolean; isTranscriptionFinished(): boolean; isBufferOverflowing(): boolean; getStats(): IRecognitionClientStats; protected onConnected(): void; protected onDisconnected(code: number, reason: string): void; /** * Get human-readable description for WebSocket close code */ private getCloseCodeDescription; protected onError(error: Event): void; protected onMessage(msg: { v: number; type: string; data: any; }): void; /** * Handle control messages from server * @param msg - Control message containing server actions */ private handleControlMessage; /** * Send audio immediately to the server (without buffering) * @param audioData - Audio data to send */ private sendAudioNow; } export { AudioEncoding, ControlSignalTypeV1 as ControlSignal, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1 }; export type { GameContextV1, RealTimeTwoWayWebSocketRecognitionClientConfig, TranscriptionResult };
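
/**
 * A possible end-to-end usage sketch (illustrative only, not part of the
 * published typings; it combines the exported client with the config types
 * declared above, and `micStream` is a stand-in for whatever audio source the
 * host application uses):
 * @example
 * ```typescript
 * import {
 *   AudioEncoding,
 *   RealTimeTwoWayWebSocketRecognitionClient,
 * } from '@volley/recognition-client-sdk';
 *
 * const client = new RealTimeTwoWayWebSocketRecognitionClient({
 *   stage: 'staging',
 *   asrRequestConfig: {
 *     provider: 'deepgram',
 *     language: 'en-US',
 *     sampleRate: 16000,
 *     encoding: AudioEncoding.LINEAR16,
 *     interimResults: true,
 *   },
 *   onTranscript: (r) => console.log(r.is_finished ? r.finalTranscript : r.pendingTranscript),
 *   onMetadata: (m) => console.log('audio ms:', m.accumulatedAudioTimeMs),
 *   onError: (e) => console.error(e.errorType, e.message),
 * });
 *
 * await client.connect();
 * micStream.on('data', (chunk) => client.sendAudio(chunk));
 * await client.stopRecording();
 * ```
 */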