@volley/recognition-client-sdk
Recognition Service TypeScript/Node.js Client SDK
import { z } from 'zod';
/**
* Provider types and enums for recognition services
* NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
*/
/**
* Supported speech recognition providers
*/
declare enum RecognitionProvider {
ASSEMBLYAI = "assemblyai",
DEEPGRAM = "deepgram",
ELEVENLABS = "elevenlabs",
FIREWORKS = "fireworks",
GOOGLE = "google",
GEMINI_BATCH = "gemini-batch",
OPENAI_BATCH = "openai-batch",
OPENAI_REALTIME = "openai-realtime"
}
/**
* ASR API type - distinguishes between streaming and file-based transcription APIs
* - STREAMING: Real-time streaming APIs (Deepgram, AssemblyAI, Google)
* - FILE_BASED: File upload/batch APIs (OpenAI Batch, Gemini Batch)
*/
declare enum ASRApiType {
STREAMING = "streaming",
FILE_BASED = "file-based"
}
/**
* Deepgram model names
*/
declare enum DeepgramModel {
NOVA_2 = "nova-2",
NOVA_3 = "nova-3",
FLUX_GENERAL_EN = "flux-general-en"
}
/**
* Google Cloud Speech models
* @see https://cloud.google.com/speech-to-text/docs/transcription-model
* @see https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model
*/
declare enum GoogleModel {
CHIRP_3 = "chirp_3",
CHIRP_2 = "chirp_2",
CHIRP = "chirp",
LATEST_LONG = "latest_long",
LATEST_SHORT = "latest_short",
TELEPHONY = "telephony",
TELEPHONY_SHORT = "telephony_short",
DEFAULT = "default",
COMMAND_AND_SEARCH = "command_and_search",
PHONE_CALL = "phone_call",
VIDEO = "video"
}
/**
* Fireworks AI models for ASR
* @see https://docs.fireworks.ai/guides/querying-asr-models
* @see https://fireworks.ai/models/fireworks/fireworks-asr-large
*/
declare enum FireworksModel {
ASR_V1 = "fireworks-asr-large",
ASR_V2 = "fireworks-asr-v2",
WHISPER_V3 = "whisper-v3",
WHISPER_V3_TURBO = "whisper-v3-turbo"
}
/**
* ElevenLabs Scribe models for speech-to-text
* @see https://elevenlabs.io/blog/introducing-scribe-v2-realtime
* @see https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming
* @see https://elevenlabs.io/docs/api-reference/speech-to-text/convert
*/
declare enum ElevenLabsModel {
SCRIBE_V2_REALTIME = "scribe_v2_realtime",
SCRIBE_V1 = "scribe_v1"
}
/**
* OpenAI Realtime API transcription models
* These are the verified `input_audio_transcription.model` values.
* @see https://platform.openai.com/docs/guides/realtime
*/
declare enum OpenAIRealtimeModel {
GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
}
/**
* Type alias for any model from any provider
*/
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
/**
* Audio encoding types
*/
declare enum AudioEncoding {
ENCODING_UNSPECIFIED = 0,
LINEAR16 = 1,
OGG_OPUS = 2,
FLAC = 3,
MULAW = 4,
ALAW = 5
}
declare namespace AudioEncoding {
/**
* Convert numeric ID to AudioEncoding enum
* @param id - Numeric encoding identifier (0-5)
* @returns AudioEncoding enum value or undefined if invalid
*/
function fromId(id: number): AudioEncoding | undefined;
/**
* Convert string name to AudioEncoding enum
* @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
* @returns AudioEncoding enum value or undefined if invalid
*/
function fromName(nameStr: string): AudioEncoding | undefined;
/**
* Convert AudioEncoding enum to numeric ID
* @param encoding - AudioEncoding enum value
* @returns Numeric ID (0-5)
*/
function toId(encoding: AudioEncoding): number;
/**
* Convert AudioEncoding enum to string name
* @param encoding - AudioEncoding enum value
* @returns String name like "LINEAR16", "MULAW", etc.
*/
function toName(encoding: AudioEncoding): string;
/**
* Check if a numeric ID is a valid encoding
* @param id - Numeric identifier to validate
* @returns true if valid encoding ID
*/
function isIdValid(id: number): boolean;
/**
* Check if a string name is a valid encoding
* @param nameStr - String name to validate
* @returns true if valid encoding name
*/
function isNameValid(nameStr: string): boolean;
}
/**
* Common sample rates (in Hz)
*/
declare enum SampleRate {
RATE_8000 = 8000,
RATE_16000 = 16000,
RATE_22050 = 22050,
RATE_24000 = 24000,
RATE_32000 = 32000,
RATE_44100 = 44100,
RATE_48000 = 48000
}
declare namespace SampleRate {
/**
* Convert Hz value to SampleRate enum
* @param hz - Sample rate in Hz (8000, 16000, etc.)
* @returns SampleRate enum value or undefined if invalid
*/
function fromHz(hz: number): SampleRate | undefined;
/**
* Convert string name to SampleRate enum
* @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
* @returns SampleRate enum value or undefined if invalid
*/
function fromName(nameStr: string): SampleRate | undefined;
/**
* Convert SampleRate enum to Hz value
* @param rate - SampleRate enum value
* @returns Hz value (8000, 16000, etc.)
*/
function toHz(rate: SampleRate): number;
/**
* Convert SampleRate enum to string name
* @param rate - SampleRate enum value
* @returns String name like "RATE_8000", "RATE_16000", etc.
*/
function toName(rate: SampleRate): string;
/**
* Check if a numeric Hz value is a valid sample rate
* @param hz - Hz value to validate
* @returns true if valid sample rate
*/
function isHzValid(hz: number): boolean;
/**
* Check if a string name is a valid sample rate
* @param nameStr - String name to validate
* @returns true if valid sample rate name
*/
function isNameValid(nameStr: string): boolean;
}
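/**
* Example: normalizing raw encoding and sample-rate inputs with the helpers
* declared above (a minimal usage sketch; the literal results in the comments
* follow the enum definitions).
* ```typescript
* const encoding = AudioEncoding.fromName('ogg_opus'); // AudioEncoding.OGG_OPUS
* const rate = SampleRate.fromHz(16000);               // SampleRate.RATE_16000
* if (encoding !== undefined && rate !== undefined) {
*   AudioEncoding.toId(encoding); // 2
*   SampleRate.toHz(rate);        // 16000
* }
* AudioEncoding.isNameValid('alaw'); // true (case insensitive)
* SampleRate.isHzValid(11025);       // false (not a supported rate)
* ```
*/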
/**
* Supported languages for recognition
* Using BCP-47 language tags
*/
declare enum Language {
ENGLISH_US = "en-US",
ENGLISH_GB = "en-GB",
SPANISH_ES = "es-ES",
SPANISH_MX = "es-MX",
FRENCH_FR = "fr-FR",
GERMAN_DE = "de-DE",
ITALIAN_IT = "it-IT",
PORTUGUESE_BR = "pt-BR",
JAPANESE_JP = "ja-JP",
KOREAN_KR = "ko-KR",
CHINESE_CN = "zh-CN",
CHINESE_TW = "zh-TW"
}
/**
* Recognition Result Types V1
* NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
* Types and schemas for recognition results sent to SDK clients
*/
/**
* Message type discriminator for recognition results V1
*/
declare enum RecognitionResultTypeV1 {
TRANSCRIPTION = "Transcription",
FUNCTION_CALL = "FunctionCall",
METADATA = "Metadata",
ERROR = "Error",
CLIENT_CONTROL_MESSAGE = "ClientControlMessage"
}
/**
* Transcription result V1 - contains transcript message
* In the long run the game side should not need to know about it; in the short run it is sent back to the client.
* NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
*/
declare const TranscriptionResultSchemaV1: z.ZodObject<{
type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
audioUtteranceId: z.ZodString;
finalTranscript: z.ZodString;
finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
pendingTranscript: z.ZodOptional<z.ZodString>;
pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
is_finished: z.ZodBoolean;
voiceStart: z.ZodOptional<z.ZodNumber>;
voiceDuration: z.ZodOptional<z.ZodNumber>;
voiceEnd: z.ZodOptional<z.ZodNumber>;
startTimestamp: z.ZodOptional<z.ZodNumber>;
endTimestamp: z.ZodOptional<z.ZodNumber>;
receivedAtMs: z.ZodOptional<z.ZodNumber>;
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
}, "strip", z.ZodTypeAny, {
type: RecognitionResultTypeV1.TRANSCRIPTION;
audioUtteranceId: string;
finalTranscript: string;
is_finished: boolean;
finalTranscriptConfidence?: number | undefined;
pendingTranscript?: string | undefined;
pendingTranscriptConfidence?: number | undefined;
voiceStart?: number | undefined;
voiceDuration?: number | undefined;
voiceEnd?: number | undefined;
startTimestamp?: number | undefined;
endTimestamp?: number | undefined;
receivedAtMs?: number | undefined;
accumulatedAudioTimeMs?: number | undefined;
}, {
type: RecognitionResultTypeV1.TRANSCRIPTION;
audioUtteranceId: string;
finalTranscript: string;
is_finished: boolean;
finalTranscriptConfidence?: number | undefined;
pendingTranscript?: string | undefined;
pendingTranscriptConfidence?: number | undefined;
voiceStart?: number | undefined;
voiceDuration?: number | undefined;
voiceEnd?: number | undefined;
startTimestamp?: number | undefined;
endTimestamp?: number | undefined;
receivedAtMs?: number | undefined;
accumulatedAudioTimeMs?: number | undefined;
}>;
type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
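/**
* Example: validating an incoming payload against the schema above with Zod's
* safeParse (a minimal sketch; `payload` stands for a hypothetical parsed
* WebSocket message and is not part of this SDK).
* ```typescript
* const parsed = TranscriptionResultSchemaV1.safeParse(payload);
* if (parsed.success) {
*   const result: TranscriptionResultV1 = parsed.data;
*   if (result.is_finished) {
*     console.log('Final transcript:', result.finalTranscript);
*   } else if (result.pendingTranscript !== undefined) {
*     console.log('Interim transcript:', result.pendingTranscript);
*   }
* }
* ```
*/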
/**
* Function call result V1 - similar to LLM function call
* In the long run the game server should consume this, rather than the TV or client.
*/
declare const FunctionCallResultSchemaV1: z.ZodObject<{
type: z.ZodLiteral<RecognitionResultTypeV1.FUNCTION_CALL>;
audioUtteranceId: z.ZodString;
functionName: z.ZodString;
functionArgJson: z.ZodString;
}, "strip", z.ZodTypeAny, {
type: RecognitionResultTypeV1.FUNCTION_CALL;
audioUtteranceId: string;
functionName: string;
functionArgJson: string;
}, {
type: RecognitionResultTypeV1.FUNCTION_CALL;
audioUtteranceId: string;
functionName: string;
functionArgJson: string;
}>;
type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
/**
* Metadata result V1 - contains metadata, timing information, and ASR config
* Sent when the provider connection closes to provide final timing metrics and config
* In the long run the game server should consume this, rather than the TV or client.
*/
declare const MetadataResultSchemaV1: z.ZodObject<{
type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
audioUtteranceId: z.ZodString;
recordingStartMs: z.ZodOptional<z.ZodNumber>;
recordingEndMs: z.ZodOptional<z.ZodNumber>;
transcriptEndMs: z.ZodOptional<z.ZodNumber>;
socketCloseAtMs: z.ZodOptional<z.ZodNumber>;
duration: z.ZodOptional<z.ZodNumber>;
volume: z.ZodOptional<z.ZodNumber>;
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
asrConfig: z.ZodOptional<z.ZodString>;
rawAsrMetadata: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
type: RecognitionResultTypeV1.METADATA;
audioUtteranceId: string;
recordingStartMs?: number | undefined;
recordingEndMs?: number | undefined;
transcriptEndMs?: number | undefined;
socketCloseAtMs?: number | undefined;
duration?: number | undefined;
volume?: number | undefined;
accumulatedAudioTimeMs?: number | undefined;
costInUSD?: number | undefined;
apiType?: ASRApiType | undefined;
asrConfig?: string | undefined;
rawAsrMetadata?: string | undefined;
}, {
type: RecognitionResultTypeV1.METADATA;
audioUtteranceId: string;
recordingStartMs?: number | undefined;
recordingEndMs?: number | undefined;
transcriptEndMs?: number | undefined;
socketCloseAtMs?: number | undefined;
duration?: number | undefined;
volume?: number | undefined;
accumulatedAudioTimeMs?: number | undefined;
costInUSD?: number | undefined;
apiType?: ASRApiType | undefined;
asrConfig?: string | undefined;
rawAsrMetadata?: string | undefined;
}>;
type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
/**
* Error type enum V1 - categorizes different types of errors
*/
declare enum ErrorTypeV1 {
AUTHENTICATION_ERROR = "authentication_error",
VALIDATION_ERROR = "validation_error",
PROVIDER_ERROR = "provider_error",
TIMEOUT_ERROR = "timeout_error",
QUOTA_EXCEEDED = "quota_exceeded",
CONNECTION_ERROR = "connection_error",
UNKNOWN_ERROR = "unknown_error"
}
/**
* Error result V1 - contains error message
* In the long run the game server should consume this, rather than the TV or client.
*/
declare const ErrorResultSchemaV1: z.ZodObject<{
type: z.ZodLiteral<RecognitionResultTypeV1.ERROR>;
audioUtteranceId: z.ZodString;
errorType: z.ZodOptional<z.ZodNativeEnum<typeof ErrorTypeV1>>;
message: z.ZodOptional<z.ZodString>;
code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
description: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
type: RecognitionResultTypeV1.ERROR;
audioUtteranceId: string;
errorType?: ErrorTypeV1 | undefined;
message?: string | undefined;
code?: string | number | undefined;
description?: string | undefined;
}, {
type: RecognitionResultTypeV1.ERROR;
audioUtteranceId: string;
errorType?: ErrorTypeV1 | undefined;
message?: string | undefined;
code?: string | number | undefined;
description?: string | undefined;
}>;
type ErrorResultV1 = z.infer<typeof ErrorResultSchemaV1>;
/**
* Recognition Context Types V1
* NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
* Types and schemas for recognition context data
*/
/**
* Message type discriminator for recognition context V1
*/
declare enum RecognitionContextTypeV1 {
GAME_CONTEXT = "GameContext",
CONTROL_SIGNAL = "ControlSignal",
ASR_REQUEST = "ASRRequest"
}
/**
* Control signal types for recognition V1
*/
declare enum ControlSignalTypeV1 {
START_RECORDING = "start_recording",
STOP_RECORDING = "stop_recording"
}
/**
* Game context V1 - contains game state information
*/
declare const GameContextSchemaV1: z.ZodObject<{
type: z.ZodLiteral<RecognitionContextTypeV1.GAME_CONTEXT>;
gameId: z.ZodString;
gamePhase: z.ZodString;
promptSTT: z.ZodOptional<z.ZodString>;
promptSTF: z.ZodOptional<z.ZodString>;
promptTTF: z.ZodOptional<z.ZodString>;
slotMap: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>>;
}, "strip", z.ZodTypeAny, {
type: RecognitionContextTypeV1.GAME_CONTEXT;
gameId: string;
gamePhase: string;
promptSTT?: string | undefined;
promptSTF?: string | undefined;
promptTTF?: string | undefined;
slotMap?: Record<string, string[]> | undefined;
}, {
type: RecognitionContextTypeV1.GAME_CONTEXT;
gameId: string;
gamePhase: string;
promptSTT?: string | undefined;
promptSTF?: string | undefined;
promptTTF?: string | undefined;
slotMap?: Record<string, string[]> | undefined;
}>;
type GameContextV1 = z.infer<typeof GameContextSchemaV1>;
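/**
* Example: a game context with slot hints (a minimal sketch; the game ID,
* phase, and slot values are illustrative placeholders).
* ```typescript
* const context: GameContextV1 = {
*   type: RecognitionContextTypeV1.GAME_CONTEXT,
*   gameId: 'song-quiz',
*   gamePhase: 'guessing',
*   slotMap: {
*     songTitle: ['Bohemian Rhapsody', 'Hotel California'],
*     artist: ['Queen', 'Eagles']
*   }
* };
* ```
*/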
/**
* Unified ASR Request Configuration
*
* Provider-agnostic configuration for ASR (Automatic Speech Recognition) requests.
* This interface provides a consistent API for clients regardless of the underlying provider.
*
* All fields use library-defined enums for type safety and consistency.
* Provider-specific mappers will convert these to provider-native formats.
*/
/**
* Final transcript stability modes
*
* Controls timeout duration for fallback final transcript after stopRecording().
* Similar to AssemblyAI's turn detection confidence modes but applied to our
* internal timeout mechanism when vendors don't respond with is_final=true.
*
* @see https://www.assemblyai.com/docs/speech-to-text/universal-streaming/turn-detection
*/
declare enum FinalTranscriptStability {
/**
* Aggressive mode: 100ms timeout
* Fast response, optimized for short utterances and quick back-and-forth
* Use cases: IVR, quick commands, retail confirmations
*/
AGGRESSIVE = "aggressive",
/**
* Balanced mode: 200ms timeout (default)
* Natural middle ground for most conversational scenarios
* Use cases: General customer support, tech support, typical voice interactions
*/
BALANCED = "balanced",
/**
* Conservative mode: 400ms timeout
* Wait longer for providers, optimized for complex/reflective speech
* Use cases: Healthcare, complex queries, careful thought processes
*/
CONSERVATIVE = "conservative",
/**
* Experimental mode: 10000ms (10 seconds) timeout
* Very long wait for batch/async providers that need significant processing time
* Use cases: Batch processing (Gemini, OpenAI Whisper), complex audio analysis
* Note: Should be cancelled immediately when transcript is received
*/
EXPERIMENTAL = "experimental"
}
/**
* Unified ASR request configuration
*
* This configuration is used by:
* - Client SDKs to specify recognition parameters
* - Demo applications for user input
* - Service layer to configure provider sessions
*
* Core fields only - all provider-specific options go in providerOptions
*
* @example
* ```typescript
* const config: ASRRequestConfig = {
* provider: RecognitionProvider.GOOGLE,
* model: GoogleModel.LATEST_LONG,
* language: Language.ENGLISH_US,
* sampleRate: SampleRate.RATE_16000, // or just 16000
* encoding: AudioEncoding.LINEAR16,
* providerOptions: {
* google: {
* enableAutomaticPunctuation: true,
* interimResults: true,
* singleUtterance: false
* }
* }
* };
* ```
*/
interface ASRRequestConfig {
/**
* The ASR provider to use
* Must be one of the supported providers in RecognitionProvider enum
*/
provider: RecognitionProvider | string;
/**
* Optional model specification for the provider
* Can be provider-specific model enum or string
* If not specified, provider's default model will be used
*/
model?: RecognitionModel;
/**
* Language/locale for recognition
* Use Language enum for common languages
* Can also accept BCP-47 language tags as strings
*/
language: Language | string;
/**
* Audio sample rate in Hz
* Prefer using SampleRate enum values for standard rates
* Can also accept numeric Hz values (e.g., 16000)
*/
sampleRate: SampleRate | number;
/**
* Audio encoding format
* Must match the actual audio data being sent
* Use AudioEncoding enum for standard formats
*/
encoding: AudioEncoding | string;
/**
* Enable interim (partial) results during recognition
* When true, receive real-time updates before finalization
* When false, only receive final results
* Default: false
*/
interimResults?: boolean;
/**
* Require GameContext (e.g., song titles or other game hints) before starting recognition
* When true, server waits for GameContext message before processing audio
* When false, recognition starts immediately
* Default: false
*/
useContext?: boolean;
/**
* Final transcript stability mode
*
* Controls timeout duration for fallback final transcript when provider
* doesn't respond with is_final=true after stopRecording().
*
* - aggressive: 100ms - fast response, may cut off slow providers
* - balanced: 200ms - current default, good for most cases
* - conservative: 400ms - wait longer for complex utterances
*
* @default 'balanced'
* @see FinalTranscriptStability enum for detailed descriptions
*/
finalTranscriptStability?: FinalTranscriptStability | string;
/**
* Additional provider-specific options
*
* Common options per provider:
* - Deepgram: punctuate, smart_format, diarize, utterances
* - Google: enableAutomaticPunctuation, singleUtterance, enableWordTimeOffsets
* - AssemblyAI: formatTurns, filter_profanity, word_boost
*
* Note: interimResults is now a top-level field, but can still be overridden per provider
*
* @example
* ```typescript
* providerOptions: {
* google: {
* enableAutomaticPunctuation: true,
* singleUtterance: false,
* enableWordTimeOffsets: false
* }
* }
* ```
*/
providerOptions?: Record<string, any>;
/**
* Optional fallback ASR configurations
*
* List of alternative ASR configurations to use if the primary fails.
* Each fallback config is a complete ASRRequestConfig that will be tried
* in order until one succeeds.
*
* @example
* ```typescript
* fallbackModels: [
* {
* provider: RecognitionProvider.DEEPGRAM,
* model: DeepgramModel.NOVA_2,
* language: Language.ENGLISH_US,
* sampleRate: 16000,
* encoding: AudioEncoding.LINEAR16
* },
* {
* provider: RecognitionProvider.GOOGLE,
* model: GoogleModel.LATEST_SHORT,
* language: Language.ENGLISH_US,
* sampleRate: 16000,
* encoding: AudioEncoding.LINEAR16
* }
* ]
* ```
*/
fallbackModels?: ASRRequestConfig[];
}
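/**
* Example: a Deepgram configuration with a conservative stability mode and a
* Google fallback (a minimal sketch; provider options are keyed by provider
* name following the Google example above, and the deepgram option names here
* are illustrative rather than a verified list).
* ```typescript
* const config: ASRRequestConfig = {
*   provider: RecognitionProvider.DEEPGRAM,
*   model: DeepgramModel.NOVA_3,
*   language: Language.ENGLISH_US,
*   sampleRate: SampleRate.RATE_16000,
*   encoding: AudioEncoding.LINEAR16,
*   interimResults: true,
*   finalTranscriptStability: FinalTranscriptStability.CONSERVATIVE, // 400ms fallback timeout
*   providerOptions: {
*     deepgram: { punctuate: true, smart_format: true }
*   },
*   fallbackModels: [
*     {
*       provider: RecognitionProvider.GOOGLE,
*       model: GoogleModel.LATEST_SHORT,
*       language: Language.ENGLISH_US,
*       sampleRate: 16000,
*       encoding: AudioEncoding.LINEAR16
*     }
*   ]
* };
* ```
*/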
/**
* Standard stage/environment constants used across all services
*/
declare const STAGES: {
readonly LOCAL: "local";
readonly DEV: "dev";
readonly STAGING: "staging";
readonly PRODUCTION: "production";
};
type Stage = typeof STAGES[keyof typeof STAGES];
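/**
* Example: resolving the stage from an environment variable (a minimal sketch;
* the STAGE variable name is an assumption, not something defined by this SDK).
* ```typescript
* const stage: Stage = (process.env.STAGE as Stage | undefined) ?? STAGES.DEV;
* ```
*/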
/**
* Generic WebSocket protocol types and utilities
* Supports flexible versioning and message types
* Used by both client and server implementations
*/
/**
* Base message structure - completely flexible
* @template V - Version type (number, string, etc.)
*/
interface Message<V = number> {
v: V;
type: string;
data?: unknown;
}
/**
* Version serializer interface
* Converts between version type V and byte representation
*/
interface VersionSerializer<V> {
serialize: (v: V) => number;
deserialize: (byte: number) => V;
}
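/**
* Example: a VersionSerializer for single-byte numeric versions (an
* illustrative sketch; the default serializer used by WebSocketAudioClient is
* not shown in this declaration file and may differ).
* ```typescript
* const numericVersionSerializer: VersionSerializer<number> = {
*   serialize: (v) => v & 0xff,   // clamp the version into one byte
*   deserialize: (byte) => byte
* };
* ```
*/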
/**
* WebSocketAudioClient - Abstract base class for WebSocket clients
* Sends audio and control messages, receives responses from server
*
* Features:
* - Generic version type support (number, string, etc.)
* - Type-safe upward/downward message data
* - Client-side backpressure monitoring
* - Abstract hooks for application-specific logic
* - Format-agnostic audio protocol (supports any encoding)
*/
type ClientConfig = {
url: string;
highWM?: number;
lowWM?: number;
};
/**
* WebSocketAudioClient - Abstract base class for WebSocket clients
* that send audio frames and JSON messages
*
* @template V - Version type (number, string, object, etc.)
* @template TUpward - Type of upward message data (Client -> Server)
* @template TDownward - Type of downward message data (Server -> Client)
*
* @example
* ```typescript
* class MyClient extends WebSocketAudioClient<number, MyUpMsg, MyDownMsg> {
* protected onConnected() {
* console.log('Connected!');
* }
*
* protected onMessage(msg) {
* console.log('Received:', msg.type, msg.data);
* }
*
* protected onDisconnected(code, reason) {
* console.log('Disconnected:', code, reason);
* }
*
* protected onError(error) {
* console.error('Error:', error);
* }
* }
*
* const client = new MyClient({ url: 'ws://localhost:8080' });
* client.connect();
* client.sendMessage(1, 'configure', { language: 'en' });
* client.sendAudio(audioData);
* ```
*/
declare abstract class WebSocketAudioClient<V = number, // Version type (default: number)
TUpward = unknown, // Upward message data type
TDownward = unknown> {
private cfg;
protected versionSerializer: VersionSerializer<V>;
private ws;
private seq;
private HWM;
private LWM;
constructor(cfg: ClientConfig, versionSerializer?: VersionSerializer<V>);
/**
* Hook: Called when WebSocket connection is established
*/
protected abstract onConnected(): void;
/**
* Hook: Called when WebSocket connection closes
* @param code - Close code (see WebSocketCloseCode enum)
* @param reason - Human-readable close reason
*/
protected abstract onDisconnected(code: number, reason: string): void;
/**
* Hook: Called when WebSocket error occurs
*/
protected abstract onError(error: Event): void;
/**
* Hook: Called when downward message arrives from server
* Override this to handle messages (optional - default does nothing)
*/
protected onMessage(_msg: Message<V> & {
data: TDownward;
}): void;
connect(): void;
/**
* Send JSON message to server
* @param version - Message version
* @param type - Message type (developer defined)
* @param data - Message payload (typed)
*/
sendMessage(version: V, type: string, data: TUpward): void;
/**
* Send audio frame with specified encoding and sample rate
* @param audioData - Audio data (any format: Int16Array, Uint8Array, ArrayBuffer, etc.)
* @param version - Audio frame version
* @param encodingId - Audio encoding ID (0-5, e.g., AudioEncoding.LINEAR16)
* @param sampleRate - Sample rate in Hz (e.g., 16000)
*/
sendAudio(audioData: ArrayBuffer | ArrayBufferView, version: V, encodingId: number, sampleRate: number): void;
/**
* Get current WebSocket buffer size
*/
getBufferedAmount(): number;
/**
* Check if local buffer is backpressured
*/
isLocalBackpressured(): boolean;
/**
* Check if ready to send audio
* Verifies: connection open, no local buffer pressure
*/
canSend(): boolean;
/**
* Check if connection is open
*/
isOpen(): boolean;
/**
* Get current connection state
*/
getReadyState(): number;
/**
* Close the WebSocket connection
* Protected method for subclasses to implement disconnect logic
* @param code - WebSocket close code (default: 1000 = normal closure)
* @param reason - Human-readable close reason
*/
protected closeConnection(code?: number, reason?: string): void;
}
/**
* Recognition Client Types
*
* Type definitions and interfaces for the recognition client SDK.
* These interfaces enable dependency injection, testing, and alternative implementations.
*/
/**
* Client connection state enum
* Represents the various states a recognition client can be in during its lifecycle
*/
declare enum ClientState {
/** Initial state, no connection established */
INITIAL = "initial",
/** Actively establishing WebSocket connection */
CONNECTING = "connecting",
/** WebSocket connected but waiting for server ready signal */
CONNECTED = "connected",
/** Server ready, can send audio */
READY = "ready",
/** Sent stop signal, waiting for final transcript */
STOPPING = "stopping",
/** Connection closed normally after stop */
STOPPED = "stopped",
/** Connection failed or lost unexpectedly */
FAILED = "failed"
}
/**
* Callback URL configuration with message type filtering
*/
interface RecognitionCallbackUrl {
/** The callback URL endpoint */
url: string;
/** Array of message types to send to this URL. If empty/undefined, all types are sent */
messageTypes?: Array<string | number>;
}
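/**
* Example: callback URLs with and without message type filtering (a minimal
* sketch; the endpoint URLs are placeholders).
* ```typescript
* const callbackUrls: RecognitionCallbackUrl[] = [
*   // Only transcription results are delivered to this endpoint
*   { url: 'https://example.com/hooks/transcripts', messageTypes: [RecognitionResultTypeV1.TRANSCRIPTION] },
*   // No filter: receives all message types
*   { url: 'https://example.com/hooks/recognition' }
* ];
* ```
*/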
interface IRecognitionClientConfig {
/**
* WebSocket endpoint URL (optional)
* Either `url` or `stage` must be provided.
* If both are provided, `url` takes precedence.
*
* Example with explicit URL:
* ```typescript
* { url: 'wss://custom-endpoint.example.com/ws/v1/recognize' }
* ```
*/
url?: string;
/**
* Stage for recognition service (recommended)
* Either `url` or `stage` must be provided.
* If both are provided, `url` takes precedence.
* Defaults to production if neither is provided.
*
* Example with STAGES enum (recommended):
* ```typescript
* import { STAGES } from '@recog/shared-types';
* { stage: STAGES.STAGING }
* ```
*
* String values also accepted:
* ```typescript
* { stage: 'staging' } // STAGES.LOCAL | STAGES.DEV | STAGES.STAGING | STAGES.PRODUCTION
* ```
*/
stage?: Stage | string;
/** ASR configuration (provider, model, language, etc.) - optional */
asrRequestConfig?: ASRRequestConfig;
/** Game context for improved recognition accuracy */
gameContext?: GameContextV1;
/** Audio utterance ID (optional) - if not provided, a UUID v4 will be generated */
audioUtteranceId?: string;
/** Callback URLs for server-side notifications with optional message type filtering (optional)
* The game side only needs to use this if another service needs to be notified about the transcription results.
*/
callbackUrls?: RecognitionCallbackUrl[];
/** User identification (optional) */
userId?: string;
/** Game session identification (optional). Called 'sessionId' in Platform and most games. */
gameSessionId?: string;
/** Device identification (optional) */
deviceId?: string;
/** Account identification (optional) */
accountId?: string;
/** Question answer identifier for tracking Q&A sessions (optional and tracking purpose only) */
questionAnswerId?: string;
/** Platform for audio recording device (optional, e.g., 'ios', 'android', 'web', 'unity') */
platform?: string;
/** Callback when transcript is received */
onTranscript?: (result: TranscriptionResultV1) => void;
/**
* Callback when function call is received
* Note: Not supported in 2025. P2 feature for future speech-to-function-call capability.
*/
onFunctionCall?: (result: FunctionCallResultV1) => void;
/** Callback when metadata is received. Sent only once, after transcription is complete. */
onMetadata?: (metadata: MetadataResultV1) => void;
/** Callback when error occurs */
onError?: (error: ErrorResultV1) => void;
/** Callback when connected to WebSocket */
onConnected?: () => void;
/**
* Callback when WebSocket disconnects
* @param code - WebSocket close code (1000 = normal, 1006 = abnormal, etc.)
* @param reason - Close reason string
*/
onDisconnected?: (code: number, reason: string) => void;
/** High water mark for backpressure control (bytes) */
highWaterMark?: number;
/** Low water mark for backpressure control (bytes) */
lowWaterMark?: number;
/** Maximum buffer duration in seconds (default: 60s) */
maxBufferDurationSec?: number;
/** Expected chunks per second for ring buffer sizing (default: 100) */
chunksPerSecond?: number;
/**
* Connection retry configuration (optional)
* Only applies to initial connection establishment, not mid-stream interruptions.
*
* Default: { maxAttempts: 4, delayMs: 200 } (try once, retry 3 times = 4 total attempts)
*
* Timing: Attempt 1 → FAIL → wait 200ms → Attempt 2 → FAIL → wait 200ms → Attempt 3 → FAIL → wait 200ms → Attempt 4
*
* Example:
* ```typescript
* {
* connectionRetry: {
* maxAttempts: 2, // Try connecting up to 2 times (1 retry)
* delayMs: 500 // Wait 500ms between attempts
* }
* }
* ```
*/
connectionRetry?: {
/** Maximum number of connection attempts (default: 4, min: 1, max: 5) */
maxAttempts?: number;
/** Delay in milliseconds between retry attempts (default: 200ms) */
delayMs?: number;
};
/**
* Optional logger function for debugging
* If not provided, no logging will occur
* @param level - Log level: 'debug', 'info', 'warn', 'error'
* @param message - Log message
* @param data - Optional additional data
*/
logger?: (level: 'debug' | 'info' | 'warn' | 'error', message: string, data?: any) => void;
}
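/**
* Example: a stage-based client configuration with callbacks and retry
* settings (a minimal sketch; the identifiers and logger body are illustrative
* placeholders).
* ```typescript
* const clientConfig: IRecognitionClientConfig = {
*   stage: STAGES.STAGING,
*   asrRequestConfig: {
*     provider: RecognitionProvider.DEEPGRAM,
*     language: Language.ENGLISH_US,
*     sampleRate: SampleRate.RATE_16000,
*     encoding: AudioEncoding.LINEAR16
*   },
*   gameSessionId: 'session-123',
*   platform: 'web',
*   connectionRetry: { maxAttempts: 2, delayMs: 500 },
*   onTranscript: (result) => console.log(result.finalTranscript),
*   onError: (error) => console.error(error.errorType, error.message),
*   onDisconnected: (code, reason) => console.log('closed', code, reason),
*   logger: (level, message, data) => console.log(`[${level}] ${message}`, data ?? '')
* };
* ```
*/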
/**
* Recognition Client Interface
*
* Main interface for real-time speech recognition clients.
* Provides methods for connection management, audio streaming, and session control.
*/
interface IRecognitionClient {
/**
* Connect to the WebSocket endpoint
* @returns Promise that resolves when connected
* @throws Error if connection fails or times out
*/
connect(): Promise<void>;
/**
* Send audio data to the recognition service
* Audio is buffered locally and sent when connection is ready.
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
*/
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
/**
* Stop recording and wait for final transcript
* The server will close the connection after sending the final transcript.
* @returns Promise that resolves when final transcript is received
*/
stopRecording(): Promise<void>;
/**
* Force stop and immediately close connection without waiting for server
*
* WARNING: This is an abnormal shutdown that bypasses the graceful stop flow:
* - Does NOT wait for server to process remaining audio
* - Does NOT receive final transcript from server
* - Immediately closes WebSocket connection
* - Cleans up resources (buffers, listeners)
*
* Use Cases:
* - User explicitly cancels/abandons session
* - Timeout scenarios where waiting is not acceptable
* - Need immediate cleanup and can't wait for server
*
* RECOMMENDED: Use stopRecording() for normal shutdown.
* Only use this when immediate disconnection is required.
*/
stopAbnormally(): void;
/**
* Get the audio utterance ID for this session
* Available immediately after client construction.
* @returns UUID v4 string identifying this recognition session
*/
getAudioUtteranceId(): string;
/**
* Get the current state of the client
* @returns Current ClientState value
*/
getState(): ClientState;
/**
* Check if WebSocket connection is open
* @returns true if connected and ready to communicate
*/
isConnected(): boolean;
/**
* Check if client is currently connecting
* @returns true if connection is in progress
*/
isConnecting(): boolean;
/**
* Check if client is currently stopping
* @returns true if stopRecording() is in progress
*/
isStopping(): boolean;
/**
* Check if transcription has finished
* @returns true if the transcription is complete
*/
isTranscriptionFinished(): boolean;
/**
* Check if the audio buffer has overflowed
* @returns true if the ring buffer has wrapped around
*/
isBufferOverflowing(): boolean;
/**
* Get client statistics
* @returns Statistics about audio transmission and buffering
*/
getStats(): IRecognitionClientStats;
/**
* Get the WebSocket URL being used by this client
* Available immediately after client construction.
* @returns WebSocket URL string
*/
getUrl(): string;
}
/**
* Client statistics interface
*/
interface IRecognitionClientStats {
/** Total audio bytes sent to server */
audioBytesSent: number;
/** Total number of audio chunks sent */
audioChunksSent: number;
/** Total number of audio chunks buffered */
audioChunksBuffered: number;
/** Number of times the ring buffer overflowed */
bufferOverflowCount: number;
/** Current number of chunks in buffer */
currentBufferedChunks: number;
/** Whether the ring buffer has wrapped (overwritten old data) */
hasWrapped: boolean;
}
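/**
* Example: monitoring buffer health during a session (a minimal sketch;
* `client` is assumed to be an existing IRecognitionClient instance).
* ```typescript
* const stats = client.getStats();
* if (client.isBufferOverflowing() || stats.hasWrapped) {
*   console.warn('Ring buffer wrapped; oldest audio was overwritten', {
*     overflowCount: stats.bufferOverflowCount,
*     buffered: stats.currentBufferedChunks
*   });
* }
* ```
*/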
/**
* Configuration for RealTimeTwoWayWebSocketRecognitionClient
* This extends IRecognitionClientConfig and is the main configuration interface
* for creating a new RealTimeTwoWayWebSocketRecognitionClient instance.
*/
interface RealTimeTwoWayWebSocketRecognitionClientConfig extends IRecognitionClientConfig {
}
/**
* RealTimeTwoWayWebSocketRecognitionClient - Clean, compact SDK for real-time speech recognition
*
* Features:
* - Ring buffer-based audio storage with fixed memory footprint
* - Automatic buffering when disconnected, immediate send when connected
* - Buffer persists after flush (for future retry/reconnection scenarios)
* - Built on WebSocketAudioClient for robust protocol handling
* - Simple API: connect() → sendAudio() → stopRecording()
* - Type-safe message handling with callbacks
* - Automatic backpressure management
* - Overflow detection with buffer state tracking
*
* Example:
* ```typescript
* const client = new RealTimeTwoWayWebSocketRecognitionClient({
* url: 'ws://localhost:3101/ws/v1/recognize',
* onTranscript: (result) => console.log(result.finalTranscript),
* onError: (error) => console.error(error),
* maxBufferDurationSec: 60 // Ring buffer for 60 seconds
* });
*
* await client.connect();
*
* // Send audio chunks - always stored in ring buffer, sent if connected
* micStream.on('data', (chunk) => client.sendAudio(chunk));
*
* // Signal end of audio and wait for final results
* await client.stopRecording();
*
* // Server will close connection after sending finals
* // No manual cleanup needed - browser handles it
* ```
*/
/**
* Re-export TranscriptionResultV1 as TranscriptionResult for backward compatibility
*/
type TranscriptionResult = TranscriptionResultV1;
/**
* RealTimeTwoWayWebSocketRecognitionClient - SDK-level client for real-time speech recognition
*
* Implements IRecognitionClient interface for dependency injection and testing.
* Extends WebSocketAudioClient with local audio buffering and simple callback-based API.
*/
declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
private static readonly PROTOCOL_VERSION;
private config;
private audioBuffer;
private messageHandler;
private state;
private connectionPromise;
private isDebugLogEnabled;
private audioBytesSent;
private audioChunksSent;
private audioStatsLogInterval;
private lastAudioStatsLog;
constructor(config: RealTimeTwoWayWebSocketRecognitionClientConfig);
/**
* Internal logging helper - only logs if a logger was provided in config
* Debug logs are additionally gated by isDebugLogEnabled flag
* @param level - Log level: debug, info, warn, or error
* @param message - Message to log
* @param data - Optional additional data to log
*/
private log;
/**
* Clean up internal resources to free memory
* Called when connection closes (normally or abnormally)
*/
private cleanup;
connect(): Promise<void>;
/**
* Attempt to connect with retry logic
* Only retries on initial connection establishment, not mid-stream interruptions
*/
private connectWithRetry;
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
private sendAudioInternal;
stopRecording(): Promise<void>;
stopAbnormally(): void;
getAudioUtteranceId(): string;
getUrl(): string;
getState(): ClientState;
isConnected(): boolean;
isConnecting(): boolean;
isStopping(): boolean;
isTranscriptionFinished(): boolean;
isBufferOverflowing(): boolean;
getStats(): IRecognitionClientStats;
protected onConnected(): void;
protected onDisconnected(code: number, reason: string): void;
/**
* Get human-readable description for WebSocket close code
*/
private getCloseCodeDescription;
protected onError(error: Event): void;
protected onMessage(msg: {
v: number;
type: string;
data: any;
}): void;
/**
* Handle control messages from server
* @param msg - Control message containing server actions
*/
private handleControlMessage;
/**
* Send audio immediately to the server (without buffering)
* @param audioData - Audio data to send
*/
private sendAudioNow;
}
export { AudioEncoding, ControlSignalTypeV1 as ControlSignal, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1 };
export type { GameContextV1, RealTimeTwoWayWebSocketRecognitionClientConfig, TranscriptionResult };
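/**
* Example: importing the public surface of this package (a minimal sketch
* based on the export statements above and the package name in the header).
* ```typescript
* import {
*   AudioEncoding,
*   ControlSignal,
*   RealTimeTwoWayWebSocketRecognitionClient,
*   RecognitionContextTypeV1
* } from '@volley/recognition-client-sdk';
* import type {
*   GameContextV1,
*   RealTimeTwoWayWebSocketRecognitionClientConfig,
*   TranscriptionResult
* } from '@volley/recognition-client-sdk';
* ```
*/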