react-native-deepgram
Version:
React Native SDK for Deepgram's AI-powered speech-to-text, real-time transcription, and text intelligence APIs. Supports live audio streaming, file transcription, sentiment analysis, and topic detection for iOS and Android.
1,059 lines (940 loc) • 31.7 kB
text/typescript
import { useRef, useCallback, useEffect, useState } from 'react';
import { NativeEventEmitter, NativeModules, Platform } from 'react-native';
import { Buffer } from 'buffer';
import { Deepgram } from './NativeDeepgram';
import { askMicPermission } from './helpers/askMicPermission';
import type {
DeepgramVoiceAgentSettings,
DeepgramVoiceAgentSettingsMessage,
DeepgramVoiceAgentFunctionCallResponseMessage,
DeepgramVoiceAgentClientMessage,
DeepgramVoiceAgentServerMessage,
DeepgramVoiceAgentWelcomeMessage,
DeepgramVoiceAgentSettingsAppliedMessage,
DeepgramVoiceAgentConversationTextMessage,
DeepgramVoiceAgentAgentThinkingMessage,
DeepgramVoiceAgentAgentStartedSpeakingMessage,
DeepgramVoiceAgentAgentAudioDoneMessage,
DeepgramVoiceAgentUserStartedSpeakingMessage,
DeepgramVoiceAgentFunctionCallRequestMessage,
DeepgramVoiceAgentReceiveFunctionCallResponseMessage,
DeepgramVoiceAgentPromptUpdatedMessage,
DeepgramVoiceAgentSpeakUpdatedMessage,
DeepgramVoiceAgentInjectionRefusedMessage,
DeepgramVoiceAgentWarningMessage,
DeepgramVoiceAgentErrorMessage,
DeepgramVoiceAgentAudioConfigMessage,
} from './types';
// Default WebSocket endpoint for Deepgram's conversational Voice Agent API.
const DEFAULT_AGENT_ENDPOINT = 'wss://agent.deepgram.com/v1/agent/converse';
// Sample rate (Hz) assumed for agent input audio when settings do not specify one.
const DEFAULT_INPUT_SAMPLE_RATE = 16_000;
// Sample rate (Hz) the native microphone capture is assumed to emit until the
// first chunk reports its actual rate.
const BASE_NATIVE_SAMPLE_RATE = 16_000;
// The native module emits PCM chunks under a platform-specific event name:
// iOS uses 'DeepgramAudioPCM', Android uses 'AudioChunk'.
const eventName = Platform.select({
  ios: 'DeepgramAudioPCM',
  android: 'AudioChunk',
  default: 'DeepgramAudioPCM',
});
/**
 * Coerce an incoming WebSocket payload into a standalone `ArrayBuffer`.
 *
 * Accepts an `ArrayBuffer` (returned as-is), any `ArrayBufferView`
 * (the viewed region is copied into a fresh buffer), or anything else
 * (returns `null`). Falsy inputs also yield `null`.
 */
const ensureArrayBuffer = (data: any): ArrayBuffer | null => {
  if (!data) {
    return null;
  }
  if (data instanceof ArrayBuffer) {
    return data;
  }
  if (!ArrayBuffer.isView(data)) {
    return null;
  }
  const view = data as ArrayBufferView;
  const { buffer, byteOffset, byteLength } = view;
  if (buffer instanceof ArrayBuffer) {
    // Slice out exactly the viewed region so callers own an independent buffer.
    return buffer.slice(byteOffset, byteOffset + byteLength);
  }
  // Non-ArrayBuffer backing store (e.g. SharedArrayBuffer): copy byte-by-byte.
  const copied = new Uint8Array(byteLength);
  copied.set(new Uint8Array(buffer, byteOffset, byteLength));
  return copied.buffer;
};
/** True for non-null, non-array objects (record-like values). */
const isPlainObject = (value: unknown): value is Record<string, unknown> => {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === 'object';
};

/** Recursively copy arrays and plain objects; every other value passes through. */
const cloneValue = <T>(value: T): T => {
  if (Array.isArray(value)) {
    const items = value.map((entry) => cloneValue(entry));
    return items as unknown as T;
  }
  if (!isPlainObject(value)) {
    return value;
  }
  const copy: Record<string, unknown> = {};
  for (const [key, entry] of Object.entries(value)) {
    copy[key] = cloneValue(entry);
  }
  return copy as T;
};

/**
 * Deep-merge two plain objects without mutating either input.
 *
 * Nested plain objects merge recursively; arrays from `override` replace the
 * base value wholesale (as deep copies); an explicit `undefined` entry in
 * `override` overwrites the base entry with `undefined`. Returns `undefined`
 * only when both inputs are missing.
 */
const mergePlainObjects = <T extends Record<string, unknown>>(
  base?: T,
  override?: T
): T | undefined => {
  if (!base) {
    // No base: the merge is just a deep copy of the override, if any.
    return override ? (cloneValue(override) as T) : undefined;
  }
  const merged = cloneValue(base) as Record<string, unknown>;
  if (!override) {
    return merged as T;
  }
  for (const [key, incoming] of Object.entries(override)) {
    if (incoming === undefined) {
      merged[key] = undefined;
    } else if (isPlainObject(incoming)) {
      const current = merged[key];
      merged[key] = mergePlainObjects(
        isPlainObject(current)
          ? (current as Record<string, unknown>)
          : undefined,
        incoming
      );
    } else if (Array.isArray(incoming)) {
      merged[key] = incoming.map((entry) => cloneValue(entry));
    } else {
      merged[key] = incoming;
    }
  }
  return merged as T;
};

/** True when `value` is a plain object containing every listed key. */
const hasKeys = (value: unknown, keys: string[]) => {
  if (!isPlainObject(value)) {
    return false;
  }
  return keys.every((key) => key in value);
};
/**
 * Integer decimation factor to bring `base` Hz audio down to roughly
 * `target` Hz. Returns 1 whenever no downsampling applies: missing/zero
 * target, target at or above base, or a non-positive base rate.
 */
const computeDownsampleFactor = (
  target: number | undefined,
  base: number = BASE_NATIVE_SAMPLE_RATE
) => {
  if (!target) return 1; // no target rate requested
  if (base <= 0) return 1; // invalid capture rate
  if (target >= base) return 1; // already at or below the target
  // Nearest integer decimation ratio, floored at 1.
  return Math.max(1, Math.round(base / target));
};
/**
 * Pick the effective microphone decimation factor.
 *
 * Without an explicit override the factor is derived from the two sample
 * rates. An override is normalized to a positive integer, but is forced to 1
 * when both rates are known and the native rate is already at or below the
 * target (no downsampling is ever needed in that case).
 */
const resolveDownsampleFactor = (
  overrideFactor: number | undefined,
  targetSampleRate: number | undefined,
  nativeSampleRate: number | undefined
) => {
  if (overrideFactor == null) {
    return computeDownsampleFactor(targetSampleRate, nativeSampleRate);
  }
  const clamped = Math.max(1, Math.round(overrideFactor));
  const bothRatesKnown = Boolean(nativeSampleRate) && Boolean(targetSampleRate);
  if (
    bothRatesKnown &&
    (nativeSampleRate as number) <= (targetSampleRate as number)
  ) {
    // Capture is already at/below the target rate: ignore the override.
    return 1;
  }
  return clamped;
};
/**
 * Minimal WebSocket surface this hook relies on. Narrowing to `Pick` both
 * documents exactly which members of the platform WebSocket are used and
 * keeps the hook testable against mock socket implementations.
 */
type WebSocketLike = Pick<
  WebSocket,
  | 'readyState'
  | 'send'
  | 'close'
  | 'onopen'
  | 'onmessage'
  | 'onerror'
  | 'onclose'
>;
/**
 * Configuration for the `useDeepgramVoiceAgent` hook. Every field is
 * optional; the `on*` callbacks fire when the matching typed server message
 * arrives on the agent WebSocket.
 */
export interface UseDeepgramVoiceAgentProps {
  /** Agent WebSocket URL; defaults to Deepgram's hosted endpoint. */
  endpoint?: string;
  /** Base `Settings` payload, deep-merged with any per-`connect` override. */
  defaultSettings?: DeepgramVoiceAgentSettings;
  /** Start native microphone capture when connecting (default: true). */
  autoStartMicrophone?: boolean;
  /** Explicit mic decimation factor; when omitted it is derived from rates. */
  downsampleFactor?: number;
  /** Feed received agent audio straight to the native player (default: true). */
  autoPlayAudio?: boolean;
  /** Mirror connection state / error / warning into the returned `state`. */
  trackState?: boolean;
  /** Accumulate `ConversationText` messages into the returned `conversation`. */
  trackConversation?: boolean;
  /** Mirror thinking text and speaking latency into returned `agentStatus`. */
  trackAgentStatus?: boolean;
  /** Called synchronously before any connection work starts. */
  onBeforeConnect?: () => void;
  /** Socket opened and initial `Settings` message sent. */
  onConnect?: () => void;
  /** Socket closed; event shape comes from the platform WebSocket. */
  onClose?: (event?: any) => void;
  /** Local failures: send/parse/socket errors and server `Error` messages. */
  onError?: (error: unknown) => void;
  /** Every parsed server message, before type-specific dispatch. */
  onMessage?: (message: DeepgramVoiceAgentServerMessage) => void;
  /** Server `Welcome` (carries `request_id`). */
  onWelcome?: (message: DeepgramVoiceAgentWelcomeMessage) => void;
  /** Server acknowledged the `Settings` message. */
  onSettingsApplied?: (
    message: DeepgramVoiceAgentSettingsAppliedMessage
  ) => void;
  /** Transcript line for either role in the conversation. */
  onConversationText?: (
    message: DeepgramVoiceAgentConversationTextMessage
  ) => void;
  /** Agent is reasoning; message carries its interim thinking text. */
  onAgentThinking?: (message: DeepgramVoiceAgentAgentThinkingMessage) => void;
  /** Agent TTS started; message carries latency figures. */
  onAgentStartedSpeaking?: (
    message: DeepgramVoiceAgentAgentStartedSpeakingMessage
  ) => void;
  /** Agent finished streaming audio for the current utterance. */
  onAgentAudioDone?: (message: DeepgramVoiceAgentAgentAudioDoneMessage) => void;
  /** Server detected the user starting to speak (barge-in signal). */
  onUserStartedSpeaking?: (
    message: DeepgramVoiceAgentUserStartedSpeakingMessage
  ) => void;
  /** Agent requests client-side function execution. */
  onFunctionCallRequest?: (
    message: DeepgramVoiceAgentFunctionCallRequestMessage
  ) => void;
  /** Server echo of a function call response. */
  onFunctionCallResponse?: (
    message: DeepgramVoiceAgentReceiveFunctionCallResponseMessage
  ) => void;
  /** Server acknowledged an `UpdatePrompt`. */
  onPromptUpdated?: (message: DeepgramVoiceAgentPromptUpdatedMessage) => void;
  /** Server acknowledged a speak-configuration update. */
  onSpeakUpdated?: (message: DeepgramVoiceAgentSpeakUpdatedMessage) => void;
  /** Server refused an injected message. */
  onInjectionRefused?: (
    message: DeepgramVoiceAgentInjectionRefusedMessage
  ) => void;
  /** Non-fatal server warning (carries `description`). */
  onWarning?: (message: DeepgramVoiceAgentWarningMessage) => void;
  /** Server-side `Error` message (also forwarded to `onError`). */
  onServerError?: (message: DeepgramVoiceAgentErrorMessage) => void;
  /** Output audio format announcement (`sample_rate`, channels). */
  onAudioConfig?: (message: DeepgramVoiceAgentAudioConfigMessage) => void;
  /** Raw binary agent audio chunk received from the socket. */
  onAudio?: (audioData: ArrayBuffer) => void;
}
/**
 * API returned by `useDeepgramVoiceAgent`. The `send*` helpers return `true`
 * when the payload was handed to an OPEN socket and `false` otherwise.
 * `state`, `conversation`/`clearConversation`, and `agentStatus` are only
 * present when the corresponding `track*` option is enabled.
 */
export interface UseDeepgramVoiceAgentReturn {
  /** Open the socket (optionally overriding the default settings). */
  connect: (settings?: DeepgramVoiceAgentSettings) => Promise<void>;
  /** Tear down microphone, playback, and the socket. */
  disconnect: () => void;
  /** Send an arbitrary client message as JSON. */
  sendMessage: (message: DeepgramVoiceAgentClientMessage) => boolean;
  /** Send a `Settings` message built from the given settings. */
  sendSettings: (settings: DeepgramVoiceAgentSettings) => boolean;
  /** Inject a user utterance into the conversation. */
  injectUserMessage: (content: string) => boolean;
  /** Inject an agent utterance into the conversation. */
  injectAgentMessage: (message: string) => boolean;
  /** Answer a pending `FunctionCallRequest`. */
  sendFunctionCallResponse: (
    response: Omit<DeepgramVoiceAgentFunctionCallResponseMessage, 'type'>
  ) => boolean;
  /** Keep the connection alive during silence. */
  sendKeepAlive: () => boolean;
  /** Replace the agent's system prompt mid-session. */
  updatePrompt: (prompt: string) => boolean;
  /** Send a raw binary audio chunk to the agent. */
  sendMedia: (chunk: ArrayBuffer | Uint8Array | number[]) => boolean;
  /** True while the socket is OPEN. */
  isConnected: () => boolean;
  /** Present when `trackState` is enabled. */
  state?: {
    connectionState: 'idle' | 'connecting' | 'connected' | 'disconnected';
    error: string | null;
    warning: string | null;
  };
  /** Present when `trackConversation` is enabled. */
  conversation?: Array<{ role: string; content: string }>;
  /** Present when `trackConversation` is enabled; empties the transcript. */
  clearConversation?: () => void;
  /** Present when `trackAgentStatus` is enabled. */
  agentStatus?: {
    thinking: string | null;
    latency: { total?: number; tts?: number; ttt?: number } | null;
  };
}
/**
 * React hook for driving a live conversation with Deepgram's Voice Agent.
 *
 * Responsibilities:
 *  - open/close the agent WebSocket and send the merged `Settings` message;
 *  - optionally capture microphone PCM via the native module, downsample it,
 *    and stream it over the socket (`autoStartMicrophone`);
 *  - optionally feed agent audio back to the native player (`autoPlayAudio`);
 *  - dispatch every typed server message to the matching `on*` callback;
 *  - optionally mirror connection state, transcript, and agent status into
 *    React state (`trackState` / `trackConversation` / `trackAgentStatus`).
 *
 * All props are mirrored into refs on every render so consumers may pass
 * fresh closures without re-creating the internal stable callbacks.
 */
export function useDeepgramVoiceAgent({
  endpoint = DEFAULT_AGENT_ENDPOINT,
  defaultSettings,
  autoStartMicrophone = true,
  autoPlayAudio = true,
  trackState = false,
  trackConversation = false,
  trackAgentStatus = false,
  downsampleFactor,
  onBeforeConnect,
  onConnect,
  onClose,
  onError,
  onMessage,
  onWelcome,
  onSettingsApplied,
  onConversationText,
  onAgentThinking,
  onAgentStartedSpeaking,
  onAgentAudioDone,
  onUserStartedSpeaking,
  onFunctionCallRequest,
  onFunctionCallResponse,
  onPromptUpdated,
  onSpeakUpdated,
  onInjectionRefused,
  onWarning,
  onServerError,
  onAudioConfig,
  onAudio,
}: UseDeepgramVoiceAgentProps = {}): UseDeepgramVoiceAgentReturn {
  // Live socket and the native microphone event subscription.
  const ws = useRef<WebSocketLike | null>(null);
  const audioSub = useRef<ReturnType<NativeEventEmitter['addListener']> | null>(
    null
  );
  // Sample-rate bookkeeping used to derive the microphone decimation factor.
  const nativeInputSampleRate = useRef(BASE_NATIVE_SAMPLE_RATE);
  const targetInputSampleRate = useRef(DEFAULT_INPUT_SAMPLE_RATE);
  const currentDownsample = useRef(
    resolveDownsampleFactor(
      downsampleFactor,
      targetInputSampleRate.current,
      nativeInputSampleRate.current
    )
  );
  // True while the native recorder is running (guards stopRecording calls).
  const microphoneActive = useRef(false);
  // Refs mirroring props/callbacks so the internal callbacks stay stable.
  const defaultSettingsRef = useRef(defaultSettings);
  const endpointRef = useRef(endpoint);
  const onBeforeConnectRef = useRef(onBeforeConnect);
  const onConnectRef = useRef(onConnect);
  const onCloseRef = useRef(onClose);
  const onErrorRef = useRef(onError);
  const onMessageRef = useRef(onMessage);
  const onWelcomeRef = useRef(onWelcome);
  const onSettingsAppliedRef = useRef(onSettingsApplied);
  const onConversationTextRef = useRef(onConversationText);
  const onAgentThinkingRef = useRef(onAgentThinking);
  const onAgentStartedSpeakingRef = useRef(onAgentStartedSpeaking);
  const onAgentAudioDoneRef = useRef(onAgentAudioDone);
  const onUserStartedSpeakingRef = useRef(onUserStartedSpeaking);
  const onFunctionCallRequestRef = useRef(onFunctionCallRequest);
  const onFunctionCallResponseRef = useRef(onFunctionCallResponse);
  const onPromptUpdatedRef = useRef(onPromptUpdated);
  const onSpeakUpdatedRef = useRef(onSpeakUpdated);
  const onInjectionRefusedRef = useRef(onInjectionRefused);
  const onWarningRef = useRef(onWarning);
  const onServerErrorRef = useRef(onServerError);
  const onAudioConfigRef = useRef(onAudioConfig);
  const onAudioRef = useRef(onAudio);
  const autoStartMicRef = useRef(autoStartMicrophone);
  // Optional tracked state, only surfaced when the matching track* flag is on.
  const [internalState, setInternalState] = useState<{
    connectionState: 'idle' | 'connecting' | 'connected' | 'disconnected';
    error: string | null;
    warning: string | null;
  }>(() => ({
    connectionState: 'idle',
    error: null,
    warning: null,
  }));
  const [internalConversation, setInternalConversation] = useState<
    Array<{ role: string; content: string }>
  >([]);
  const [internalAgentStatus, setInternalAgentStatus] = useState<{
    thinking: string | null;
    latency: { total?: number; tts?: number; ttt?: number } | null;
  }>(() => ({
    thinking: null,
    latency: null,
  }));
  // Shallow-plus-one-level copy of the audio settings; input/output objects
  // are copied so later merging cannot mutate the caller's objects.
  const sanitizeAudioSettings = useCallback(
    (audio?: DeepgramVoiceAgentSettings['audio']) => {
      if (!audio) {
        return undefined;
      }
      const sanitized: DeepgramVoiceAgentSettings['audio'] = {};
      if (audio.input) {
        sanitized.input = { ...audio.input };
      }
      if (audio.output) {
        sanitized.output = { ...audio.output };
      }
      Object.entries(audio).forEach(([key, value]) => {
        if (key === 'input' || key === 'output') {
          return;
        }
        let clonedValue: unknown = value;
        if (Array.isArray(value)) {
          clonedValue = value.map((item) => cloneValue(item));
        } else if (isPlainObject(value)) {
          clonedValue = cloneValue(value);
        }
        (sanitized as any)[key] = clonedValue;
      });
      return Object.keys(sanitized).length > 0 ? sanitized : undefined;
    },
    []
  );
  // Deep-copies the agent config; `speak` (and its provider) get an explicit
  // shallow copy so they are never shared with the caller's object.
  const sanitizeAgentConfig = useCallback(
    (agent?: DeepgramVoiceAgentSettings['agent']) => {
      if (!agent) {
        return undefined;
      }
      const sanitized: DeepgramVoiceAgentSettings['agent'] = {};
      Object.entries(agent).forEach(([key, value]) => {
        if (key === 'speak') {
          return;
        }
        let clonedValue: unknown = value;
        if (Array.isArray(value)) {
          clonedValue = value.map((item) => cloneValue(item));
        } else if (isPlainObject(value)) {
          clonedValue = cloneValue(value);
        }
        (sanitized as any)[key] = clonedValue;
      });
      if (agent.speak) {
        sanitized.speak = { ...agent.speak };
        if (agent.speak.provider) {
          sanitized.speak.provider = { ...agent.speak.provider };
        }
      }
      return Object.keys(sanitized).length > 0 ? sanitized : undefined;
    },
    []
  );
  // Copies a whole Settings object, routing `audio`/`agent` through their
  // dedicated sanitizers. Empty results collapse to undefined.
  const sanitizeSettings = useCallback(
    (
      settings?: DeepgramVoiceAgentSettings
    ): DeepgramVoiceAgentSettings | undefined => {
      if (!settings) {
        return undefined;
      }
      const sanitized: DeepgramVoiceAgentSettings = {};
      Object.entries(settings).forEach(([key, value]) => {
        if (key === 'audio') {
          const audio = sanitizeAudioSettings(value as any);
          if (audio) {
            sanitized.audio = audio;
          }
          return;
        }
        if (key === 'agent') {
          const agent = sanitizeAgentConfig(value as any);
          if (agent) {
            sanitized.agent = agent;
          }
          return;
        }
        (sanitized as any)[key] = value;
      });
      return Object.keys(sanitized).length > 0 ? sanitized : undefined;
    },
    [sanitizeAgentConfig, sanitizeAudioSettings]
  );
  // Thin typed wrapper over the module-level deep merge.
  const mergeSettings = useCallback(
    (
      base?: DeepgramVoiceAgentSettings,
      override?: DeepgramVoiceAgentSettings
    ): DeepgramVoiceAgentSettings | undefined =>
      mergePlainObjects(base as any, override as any) as
        | DeepgramVoiceAgentSettings
        | undefined,
    []
  );
  // Re-sync every prop into its ref on each render so long-lived callbacks
  // (socket handlers, mic listener) always see the latest values.
  defaultSettingsRef.current = defaultSettings;
  endpointRef.current = endpoint;
  onBeforeConnectRef.current = onBeforeConnect;
  onConnectRef.current = onConnect;
  onCloseRef.current = onClose;
  onErrorRef.current = onError;
  onMessageRef.current = onMessage;
  onWelcomeRef.current = onWelcome;
  onSettingsAppliedRef.current = onSettingsApplied;
  onConversationTextRef.current = onConversationText;
  onAgentThinkingRef.current = onAgentThinking;
  onAgentStartedSpeakingRef.current = onAgentStartedSpeaking;
  onAgentAudioDoneRef.current = onAgentAudioDone;
  onUserStartedSpeakingRef.current = onUserStartedSpeaking;
  onFunctionCallRequestRef.current = onFunctionCallRequest;
  onFunctionCallResponseRef.current = onFunctionCallResponse;
  onPromptUpdatedRef.current = onPromptUpdated;
  onSpeakUpdatedRef.current = onSpeakUpdated;
  onInjectionRefusedRef.current = onInjectionRefused;
  onWarningRef.current = onWarning;
  onServerErrorRef.current = onServerError;
  onAudioConfigRef.current = onAudioConfig;
  onAudioRef.current = onAudio;
  autoStartMicRef.current = autoStartMicrophone;
  // NOTE(review): unlike resolveDownsampleFactor, this render-time assignment
  // does not clamp/round the override — confirm non-integer or non-positive
  // overrides are intended to pass through here.
  if (downsampleFactor != null) {
    currentDownsample.current = downsampleFactor;
  }
  // Stop recording/playback and close the socket. Safe to call repeatedly;
  // native failures are deliberately swallowed (best-effort teardown).
  const cleanup = useCallback(() => {
    audioSub.current?.remove();
    audioSub.current = null;
    if (microphoneActive.current) {
      Deepgram.stopRecording().catch(() => {});
      microphoneActive.current = false;
    }
    // Cleanup audio session for playback
    Deepgram.stopAudio().catch(() => {});
    const socket = ws.current;
    if (socket) {
      // Clear the ref first so handlers firing during close() see no socket.
      ws.current = null;
      try {
        if (
          socket.readyState === WebSocket.OPEN ||
          socket.readyState === WebSocket.CONNECTING
        ) {
          socket.close(1000, 'cleanup');
        } else {
          socket.close();
        }
      } catch {
        // ignore socket close errors
      }
    }
  }, []);
  // Tear everything down when the component unmounts.
  useEffect(() => () => cleanup(), [cleanup]);
  // Native microphone chunk handler: normalizes the platform payload to
  // little-endian 16-bit PCM, decimates it, and sends it on the open socket.
  const handleMicChunk = useCallback(
    (ev: any) => {
      const socket = ws.current;
      if (!socket || socket.readyState !== WebSocket.OPEN) {
        return;
      }
      // The native side may report its actual capture rate per chunk;
      // recompute the decimation factor when it changes.
      if (typeof ev?.sampleRate === 'number' && ev.sampleRate > 0) {
        if (ev.sampleRate !== nativeInputSampleRate.current) {
          nativeInputSampleRate.current = ev.sampleRate;
          currentDownsample.current = resolveDownsampleFactor(
            downsampleFactor,
            targetInputSampleRate.current,
            nativeInputSampleRate.current
          );
        }
      }
      const factor = currentDownsample.current ?? 1;
      let chunk: ArrayBuffer | null = null;
      if (typeof ev?.b64 === 'string') {
        // Base64 payload path (iOS event shape).
        // NOTE(review): assumes the decoded byte length is even (16-bit PCM);
        // an odd-length payload would make the Int16Array constructor throw.
        const binary = Uint8Array.from(atob(ev.b64), (c) => c.charCodeAt(0));
        let int16 = new Int16Array(binary.buffer);
        if (factor > 1 && int16.length >= factor) {
          // Naive decimation: keep every `factor`-th sample (no low-pass).
          const downsampled = new Int16Array(Math.floor(int16.length / factor));
          for (let i = 0; i < downsampled.length; i++) {
            downsampled[i] = int16[i * factor];
          }
          int16 = downsampled;
        }
        chunk = int16.buffer;
      } else if (Array.isArray(ev?.data)) {
        // Signed-byte array path (Android event shape): rebuild unsigned
        // bytes, then read little-endian 16-bit samples.
        const bytes = new Uint8Array(ev.data.length);
        for (let i = 0; i < ev.data.length; i++) {
          const value = ev.data[i];
          bytes[i] = value < 0 ? value + 256 : value;
        }
        const view = new DataView(bytes.buffer);
        const int16 = new Int16Array(bytes.length / 2);
        for (let i = 0; i < int16.length; i++) {
          int16[i] = view.getInt16(i * 2, true);
        }
        // NOTE(review): this path does not apply the downsample factor —
        // confirm Android capture is already at the target rate.
        chunk = int16.buffer;
      }
      if (!chunk) {
        return;
      }
      try {
        socket.send(chunk);
      } catch (err) {
        onErrorRef.current?.(err);
      }
    },
    [downsampleFactor, onErrorRef]
  );
  // Socket message handler: JSON frames are dispatched by `type`; binary
  // frames are treated as agent audio.
  const handleSocketMessage = useCallback(
    (ev: any) => {
      if (typeof ev.data === 'string') {
        try {
          const message = JSON.parse(
            ev.data
          ) as DeepgramVoiceAgentServerMessage;
          // Raw hook first, then type-specific dispatch.
          onMessageRef.current?.(message);
          switch (message.type) {
            case 'Welcome':
              if (hasKeys(message, ['request_id'])) {
                onWelcomeRef.current?.(
                  message as DeepgramVoiceAgentWelcomeMessage
                );
              }
              break;
            case 'SettingsApplied':
              onSettingsAppliedRef.current?.(
                message as DeepgramVoiceAgentSettingsAppliedMessage
              );
              break;
            case 'ConversationText':
              if (hasKeys(message, ['role', 'content'])) {
                const convMsg =
                  message as DeepgramVoiceAgentConversationTextMessage;
                if (trackConversation) {
                  setInternalConversation((prev) => [
                    ...prev,
                    { role: convMsg.role, content: convMsg.content },
                  ]);
                }
                onConversationTextRef.current?.(convMsg);
              }
              break;
            case 'AgentThinking':
              if (hasKeys(message, ['content'])) {
                const thinkMsg =
                  message as DeepgramVoiceAgentAgentThinkingMessage;
                if (trackAgentStatus) {
                  setInternalAgentStatus((prev) => ({
                    ...prev,
                    thinking: thinkMsg.content,
                  }));
                }
                onAgentThinkingRef.current?.(thinkMsg);
              }
              break;
            case 'AgentStartedSpeaking':
              {
                const speakMsg =
                  message as DeepgramVoiceAgentAgentStartedSpeakingMessage;
                if (trackAgentStatus) {
                  setInternalAgentStatus((prev) => ({
                    ...prev,
                    latency: {
                      total: speakMsg.total_latency,
                      tts: speakMsg.tts_latency,
                      ttt: speakMsg.ttt_latency,
                    },
                  }));
                }
                onAgentStartedSpeakingRef.current?.(speakMsg);
              }
              break;
            case 'AgentAudioDone':
              {
                const doneMsg =
                  message as DeepgramVoiceAgentAgentAudioDoneMessage;
                if (trackAgentStatus) {
                  // Utterance finished: reset thinking text and latency.
                  setInternalAgentStatus({
                    thinking: null,
                    latency: null,
                  });
                }
                onAgentAudioDoneRef.current?.(doneMsg);
              }
              break;
            case 'UserStartedSpeaking':
              onUserStartedSpeakingRef.current?.(
                message as DeepgramVoiceAgentUserStartedSpeakingMessage
              );
              break;
            case 'FunctionCallRequest':
              if (hasKeys(message, ['functions'])) {
                onFunctionCallRequestRef.current?.(
                  message as DeepgramVoiceAgentFunctionCallRequestMessage
                );
              }
              break;
            case 'FunctionCallResponse':
              if (hasKeys(message, ['id', 'name'])) {
                onFunctionCallResponseRef.current?.(
                  message as DeepgramVoiceAgentReceiveFunctionCallResponseMessage
                );
              }
              break;
            case 'PromptUpdated':
              onPromptUpdatedRef.current?.(
                message as DeepgramVoiceAgentPromptUpdatedMessage
              );
              break;
            case 'SpeakUpdated':
              onSpeakUpdatedRef.current?.(
                message as DeepgramVoiceAgentSpeakUpdatedMessage
              );
              break;
            case 'Audio':
              // Audio binary data will be handled by onmessage binary path
              break;
            case 'AudioConfig':
              if (hasKeys(message, ['sample_rate'])) {
                const configMsg =
                  message as DeepgramVoiceAgentAudioConfigMessage;
                if (autoPlayAudio) {
                  // (Re)start the native player at the announced format.
                  const sampleRate =
                    configMsg.sample_rate || DEFAULT_INPUT_SAMPLE_RATE;
                  const channels = configMsg.channels || 1;
                  Deepgram.startPlayer?.(sampleRate, channels);
                }
                onAudioConfigRef.current?.(configMsg);
              }
              break;
            case 'InjectionRefused':
              if (hasKeys(message, ['message'])) {
                onInjectionRefusedRef.current?.(
                  message as DeepgramVoiceAgentInjectionRefusedMessage
                );
              }
              break;
            case 'Warning':
              if (hasKeys(message, ['description'])) {
                const warnMsg = message as DeepgramVoiceAgentWarningMessage;
                if (trackState) {
                  setInternalState((prev) => ({
                    ...prev,
                    warning: warnMsg.description,
                  }));
                }
                onWarningRef.current?.(warnMsg);
              }
              break;
            case 'Error':
              {
                // Server errors may carry `description` and/or `code`; fall
                // back to a generic message when neither is usable.
                const description =
                  typeof (message as any).description === 'string'
                    ? (message as any).description
                    : undefined;
                const code =
                  typeof (message as any).code === 'string'
                    ? (message as any).code
                    : undefined;
                const errorMsg = description ?? code ?? 'Voice agent error';
                if (trackState) {
                  setInternalState((prev) => ({
                    ...prev,
                    connectionState: 'disconnected',
                    error: errorMsg,
                  }));
                }
                if (description || code) {
                  onServerErrorRef.current?.(
                    message as DeepgramVoiceAgentErrorMessage
                  );
                }
                onErrorRef.current?.(new Error(errorMsg));
              }
              break;
            default:
              break;
          }
        } catch (err) {
          // JSON parse failure (or a callback throwing) surfaces via onError.
          onErrorRef.current?.(err);
        }
        return;
      }
      // Binary frame: agent audio. Optionally feed the native player, always
      // forward the raw buffer to the consumer.
      const buffer = ensureArrayBuffer(ev.data);
      if (buffer) {
        if (autoPlayAudio) {
          try {
            const bytes = new Uint8Array(buffer);
            const b64 = Buffer.from(bytes).toString('base64');
            Deepgram.feedAudio?.(b64);
          } catch (err) {
            console.warn('[VoiceAgent] Auto-feed audio error:', err);
          }
        }
        onAudioRef.current?.(buffer);
      }
    },
    [
      onAgentAudioDoneRef,
      onAgentStartedSpeakingRef,
      onAgentThinkingRef,
      onConversationTextRef,
      onErrorRef,
      onFunctionCallRequestRef,
      onFunctionCallResponseRef,
      onInjectionRefusedRef,
      onMessageRef,
      onPromptUpdatedRef,
      onServerErrorRef,
      onSettingsAppliedRef,
      onSpeakUpdatedRef,
      onUserStartedSpeakingRef,
      onWarningRef,
      autoPlayAudio,
      trackAgentStatus,
      trackConversation,
      trackState,
    ]
  );
  // Serialize and send a client message; false when the socket is not OPEN.
  const sendJsonMessage = useCallback(
    (message: DeepgramVoiceAgentClientMessage) => {
      const socket = ws.current;
      if (!socket || socket.readyState !== WebSocket.OPEN) {
        return false;
      }
      try {
        socket.send(JSON.stringify(message));
        return true;
      } catch (err) {
        onErrorRef.current?.(err);
        return false;
      }
    },
    []
  );
  // Send a binary chunk, normalizing Uint8Array views and plain byte arrays
  // into a standalone ArrayBuffer first.
  const sendBinary = useCallback(
    (chunk: ArrayBuffer | Uint8Array | number[]) => {
      const socket = ws.current;
      if (!socket || socket.readyState !== WebSocket.OPEN) {
        return false;
      }
      let payload: ArrayBuffer | null = null;
      if (chunk instanceof ArrayBuffer) {
        payload = chunk;
      } else if (chunk instanceof Uint8Array) {
        if (chunk.buffer instanceof ArrayBuffer) {
          // Copy just the viewed region out of the backing buffer.
          payload = chunk.buffer.slice(
            chunk.byteOffset,
            chunk.byteOffset + chunk.byteLength
          );
        } else {
          // Non-ArrayBuffer backing store: explicit byte copy.
          const copy = new Uint8Array(chunk.byteLength);
          copy.set(chunk);
          payload = copy.buffer;
        }
      } else if (Array.isArray(chunk)) {
        const uint = new Uint8Array(chunk.length);
        for (let i = 0; i < chunk.length; i++) {
          uint[i] = chunk[i];
        }
        payload = uint.buffer;
      }
      if (!payload) return false;
      try {
        socket.send(payload);
        return true;
      } catch (err) {
        onErrorRef.current?.(err);
        return false;
      }
    },
    []
  );
  // Open a fresh connection: tear down any previous session, start the mic
  // (or just the audio session), merge settings, and wire socket handlers.
  const connect = useCallback(
    async (overrideSettings?: DeepgramVoiceAgentSettings) => {
      cleanup();
      if (trackState) {
        setInternalState({
          connectionState: 'connecting',
          error: null,
          warning: null,
        });
      }
      if (trackConversation) {
        setInternalConversation([]);
      }
      if (trackAgentStatus) {
        setInternalAgentStatus({ thinking: null, latency: null });
      }
      onBeforeConnectRef.current?.();
      // NOTE(review): the API key is read from a global — presumably stashed
      // by the library's configure step; confirm against the setup docs.
      // Also note: throws below leave tracked state at 'connecting'.
      const apiKey = (globalThis as any).__DEEPGRAM_API_KEY__;
      if (!apiKey) throw new Error('Deepgram API key missing');
      const shouldCaptureMic = autoStartMicRef.current;
      if (shouldCaptureMic) {
        const granted = await askMicPermission();
        if (!granted) {
          throw new Error('Microphone permission denied');
        }
        await Deepgram.startRecording();
        microphoneActive.current = true;
        const emitter = new NativeEventEmitter(NativeModules.Deepgram);
        if (eventName) {
          audioSub.current = emitter.addListener(eventName, handleMicChunk);
        }
      } else {
        // Only initialize audio session for playback if not recording
        // (startRecording already activates the audio session)
        await Deepgram.startAudio();
      }
      // Merge defaults with per-call overrides into the Settings message.
      const sanitizedDefault = sanitizeSettings(defaultSettingsRef.current);
      const sanitizedOverride = sanitizeSettings(overrideSettings);
      const merged = mergeSettings(sanitizedDefault, sanitizedOverride);
      const mergedSettings: DeepgramVoiceAgentSettingsMessage = {
        type: 'Settings',
        ...(merged ?? {}),
      };
      // Derive the requested input rate (override wins) and refresh the
      // decimation factor accordingly.
      const targetSampleRate =
        overrideSettings?.audio?.input?.sample_rate ??
        defaultSettingsRef.current?.audio?.input?.sample_rate ??
        DEFAULT_INPUT_SAMPLE_RATE;
      targetInputSampleRate.current = targetSampleRate;
      currentDownsample.current = resolveDownsampleFactor(
        downsampleFactor,
        targetInputSampleRate.current,
        nativeInputSampleRate.current
      );
      // React Native's WebSocket accepts a non-standard third argument for
      // headers; cast to any to use it.
      const socket = new (WebSocket as any)(endpointRef.current, undefined, {
        headers: { Authorization: `Token ${apiKey}` },
      });
      socket.binaryType = 'arraybuffer';
      ws.current = socket;
      socket.onopen = () => {
        // Settings must be the first message after the socket opens.
        sendJsonMessage(mergedSettings);
        if (trackState) {
          setInternalState((prev) => ({
            ...prev,
            connectionState: 'connected',
          }));
        }
        if (autoPlayAudio) {
          const sampleRate =
            merged?.audio?.output?.sample_rate ?? DEFAULT_INPUT_SAMPLE_RATE;
          const channels = 1;
          Deepgram.startPlayer?.(sampleRate, channels);
        }
        onConnectRef.current?.();
      };
      socket.onmessage = handleSocketMessage;
      socket.onerror = (err: any) => {
        onErrorRef.current?.(err);
      };
      socket.onclose = (event: any) => {
        if (trackState) {
          setInternalState((prev) => ({
            ...prev,
            connectionState: 'disconnected',
          }));
        }
        cleanup();
        onCloseRef.current?.(event);
      };
    },
    [
      cleanup,
      downsampleFactor,
      handleMicChunk,
      handleSocketMessage,
      mergeSettings,
      sanitizeSettings,
      sendJsonMessage,
      autoPlayAudio,
      trackAgentStatus,
      trackConversation,
      trackState,
    ]
  );
  // Public teardown; alias over cleanup so the returned API reads naturally.
  const disconnect = useCallback(() => {
    cleanup();
  }, [cleanup]);
  // Send a sanitized Settings message on demand (e.g. to reconfigure live).
  const sendSettings = useCallback(
    (settings: DeepgramVoiceAgentSettings) => {
      const sanitized = sanitizeSettings(settings);
      return sendJsonMessage({ type: 'Settings', ...(sanitized ?? {}) });
    },
    [sanitizeSettings, sendJsonMessage]
  );
  // Thin wrappers over sendJsonMessage for each client message type.
  const injectUserMessage = useCallback(
    (content: string) =>
      sendJsonMessage({ type: 'InjectUserMessage', content }),
    [sendJsonMessage]
  );
  const injectAgentMessage = useCallback(
    (message: string) =>
      sendJsonMessage({ type: 'InjectAgentMessage', message }),
    [sendJsonMessage]
  );
  const sendFunctionCallResponse = useCallback(
    (response: Omit<DeepgramVoiceAgentFunctionCallResponseMessage, 'type'>) =>
      sendJsonMessage({
        type: 'FunctionCallResponse',
        ...response,
      } as DeepgramVoiceAgentFunctionCallResponseMessage),
    [sendJsonMessage]
  );
  const sendKeepAlive = useCallback(
    () => sendJsonMessage({ type: 'KeepAlive' }),
    [sendJsonMessage]
  );
  const updatePrompt = useCallback(
    (prompt: string) => sendJsonMessage({ type: 'UpdatePrompt', prompt }),
    [sendJsonMessage]
  );
  const sendMessage = useCallback(
    (message: DeepgramVoiceAgentClientMessage) => sendJsonMessage(message),
    [sendJsonMessage]
  );
  const isConnected = useCallback(
    () => ws.current?.readyState === WebSocket.OPEN,
    []
  );
  // No-op unless conversation tracking is enabled.
  const clearConversation = useCallback(() => {
    if (trackConversation) {
      setInternalConversation([]);
    }
  }, [trackConversation]);
  // Tracked-state members are only spread in when their track* flag is set.
  return {
    connect,
    disconnect,
    sendMessage,
    sendSettings,
    injectUserMessage,
    injectAgentMessage,
    sendFunctionCallResponse,
    sendKeepAlive,
    updatePrompt,
    sendMedia: sendBinary,
    isConnected,
    ...(trackState ? { state: internalState } : {}),
    ...(trackConversation
      ? { conversation: internalConversation, clearConversation }
      : {}),
    ...(trackAgentStatus ? { agentStatus: internalAgentStatus } : {}),
  };
}