react-native-deepgram
React Native SDK for Deepgram's AI-powered speech-to-text, real-time transcription, and text intelligence APIs. Supports live audio streaming, file transcription, sentiment analysis, and topic detection for iOS and Android.
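/**
 * useDeepgramVoiceAgent — compiled source of the Voice Agent hook. It
 * bridges native microphone capture and audio playback to Deepgram's
 * agent WebSocket (wss://agent.deepgram.com/v1/agent/converse).
 *
 * Minimal usage sketch (an illustration, not the package docs): it
 * assumes the host app has already set globalThis.__DEEPGRAM_API_KEY__,
 * which connect() below reads, and that mic permission can be granted.
 *
 *   const agent = useDeepgramVoiceAgent({
 *     trackConversation: true,
 *     onConversationText: msg => console.log(msg.role, msg.content),
 *     onError: err => console.warn(err),
 *   });
 *   // e.g. from an onPress handler:
 *   await agent.connect();
 *   agent.injectUserMessage('Hello!');
 *   agent.disconnect();
 */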
import { useRef, useCallback, useEffect, useState } from 'react';
import { NativeEventEmitter, NativeModules, Platform } from 'react-native';
import { Buffer } from 'buffer';
import { Deepgram } from './NativeDeepgram.js';
import { askMicPermission } from './helpers/askMicPermission.js';
const DEFAULT_AGENT_ENDPOINT = 'wss://agent.deepgram.com/v1/agent/converse';
const DEFAULT_INPUT_SAMPLE_RATE = 16_000;
const BASE_NATIVE_SAMPLE_RATE = 16_000;
const eventName = Platform.select({
  ios: 'DeepgramAudioPCM',
  android: 'AudioChunk',
  default: 'DeepgramAudioPCM'
});
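// Normalize a WebSocket payload to a standalone ArrayBuffer. Typed-array
// views are sliced (or copied) so the result never aliases a larger
// underlying buffer; anything else yields null.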
const ensureArrayBuffer = data => {
  if (!data) return null;
  if (data instanceof ArrayBuffer) return data;
  if (ArrayBuffer.isView(data)) {
    const view = data;
    if (view.buffer instanceof ArrayBuffer) {
      return view.buffer.slice(view.byteOffset, view.byteOffset + view.byteLength);
    }
    const copy = new Uint8Array(view.byteLength);
    copy.set(new Uint8Array(view.buffer, view.byteOffset, view.byteLength));
    return copy.buffer;
  }
  return null;
};
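// Deep-clone plain objects and arrays; primitives (and class instances)
// pass through by reference.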
const isPlainObject = value => typeof value === 'object' && value !== null && !Array.isArray(value);
const cloneValue = value => {
  if (Array.isArray(value)) {
    return value.map(item => cloneValue(item));
  }
  if (isPlainObject(value)) {
    const cloned = {};
    Object.entries(value).forEach(([key, entryValue]) => {
      cloned[key] = cloneValue(entryValue);
    });
    return cloned;
  }
  return value;
};
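// Recursively merge `override` into a clone of `base`. Nested plain
// objects merge key-by-key; arrays and scalars from `override` replace
// the base value wholesale, e.g.
//   mergePlainObjects({ a: { x: 1, y: 2 } }, { a: { y: 3 } })
//   // -> { a: { x: 1, y: 3 } }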
const mergePlainObjects = (base, override) => {
  if (!base && !override) {
    return undefined;
  }
  if (!base) {
    return override ? cloneValue(override) : undefined;
  }
  const result = cloneValue(base);
  if (!override) {
    return result;
  }
  Object.entries(override).forEach(([key, overrideValue]) => {
    if (overrideValue === undefined) {
      result[key] = undefined;
      return;
    }
    if (isPlainObject(overrideValue)) {
      const existing = result[key];
      result[key] = mergePlainObjects(isPlainObject(existing) ? existing : undefined, overrideValue);
      return;
    }
    if (Array.isArray(overrideValue)) {
      result[key] = overrideValue.map(item => cloneValue(item));
      return;
    }
    result[key] = overrideValue;
  });
  return result;
};
const hasKeys = (value, keys) => isPlainObject(value) && keys.every(key => key in value);
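// Integer decimation factor between the native capture rate and the rate
// requested in Settings. Example: a 16 000 Hz native stream with an
// 8 000 Hz target gives round(16000 / 8000) = 2, i.e. keep every second
// sample. Upsampling is never attempted; the factor clamps to 1.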
const computeDownsampleFactor = (target, base = BASE_NATIVE_SAMPLE_RATE) => {
  if (!target || target >= base || base <= 0) {
    return 1;
  }
  const ratio = Math.round(base / target);
  return ratio > 0 ? ratio : 1;
};
const resolveDownsampleFactor = (overrideFactor, targetSampleRate, nativeSampleRate) => {
  if (overrideFactor == null) {
    return computeDownsampleFactor(targetSampleRate, nativeSampleRate);
  }
  const normalized = Math.max(1, Math.round(overrideFactor));
  if (!nativeSampleRate || !targetSampleRate) {
    return normalized;
  }
  if (nativeSampleRate <= targetSampleRate) {
    return 1;
  }
  return normalized;
};
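/**
 * React hook that manages a Deepgram Voice Agent session: microphone
 * capture via the native module, a WebSocket to the agent endpoint, and
 * optional playback of the agent's audio replies. All callbacks are held
 * in refs so changing them never tears down an active session; the
 * track* flags opt in to React state for connection, transcript, and
 * agent status.
 */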
export function useDeepgramVoiceAgent({
  endpoint = DEFAULT_AGENT_ENDPOINT,
  defaultSettings,
  autoStartMicrophone = true,
  autoPlayAudio = true,
  trackState = false,
  trackConversation = false,
  trackAgentStatus = false,
  downsampleFactor,
  onBeforeConnect,
  onConnect,
  onClose,
  onError,
  onMessage,
  onWelcome,
  onSettingsApplied,
  onConversationText,
  onAgentThinking,
  onAgentStartedSpeaking,
  onAgentAudioDone,
  onUserStartedSpeaking,
  onFunctionCallRequest,
  onFunctionCallResponse,
  onPromptUpdated,
  onSpeakUpdated,
  onInjectionRefused,
  onWarning,
  onServerError,
  onAudioConfig,
  onAudio
} = {}) {
  const ws = useRef(null);
  const audioSub = useRef(null);
  const nativeInputSampleRate = useRef(BASE_NATIVE_SAMPLE_RATE);
  const targetInputSampleRate = useRef(DEFAULT_INPUT_SAMPLE_RATE);
  const currentDownsample = useRef(resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current));
  const microphoneActive = useRef(false);
  const defaultSettingsRef = useRef(defaultSettings);
  const endpointRef = useRef(endpoint);
  const onBeforeConnectRef = useRef(onBeforeConnect);
  const onConnectRef = useRef(onConnect);
  const onCloseRef = useRef(onClose);
  const onErrorRef = useRef(onError);
  const onMessageRef = useRef(onMessage);
  const onWelcomeRef = useRef(onWelcome);
  const onSettingsAppliedRef = useRef(onSettingsApplied);
  const onConversationTextRef = useRef(onConversationText);
  const onAgentThinkingRef = useRef(onAgentThinking);
  const onAgentStartedSpeakingRef = useRef(onAgentStartedSpeaking);
  const onAgentAudioDoneRef = useRef(onAgentAudioDone);
  const onUserStartedSpeakingRef = useRef(onUserStartedSpeaking);
  const onFunctionCallRequestRef = useRef(onFunctionCallRequest);
  const onFunctionCallResponseRef = useRef(onFunctionCallResponse);
  const onPromptUpdatedRef = useRef(onPromptUpdated);
  const onSpeakUpdatedRef = useRef(onSpeakUpdated);
  const onInjectionRefusedRef = useRef(onInjectionRefused);
  const onWarningRef = useRef(onWarning);
  const onServerErrorRef = useRef(onServerError);
  const onAudioConfigRef = useRef(onAudioConfig);
  const onAudioRef = useRef(onAudio);
  const autoStartMicRef = useRef(autoStartMicrophone);
  const [internalState, setInternalState] = useState(() => ({
    connectionState: 'idle',
    error: null,
    warning: null
  }));
  const [internalConversation, setInternalConversation] = useState([]);
  const [internalAgentStatus, setInternalAgentStatus] = useState(() => ({
    thinking: null,
    latency: null
  }));
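  // The sanitize* helpers shallow-copy the caller's Settings payload
  // (deep-cloning nested arrays and objects) so later merges cannot
  // mutate objects the caller still owns.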
  const sanitizeAudioSettings = useCallback(audio => {
    if (!audio) {
      return undefined;
    }
    const sanitized = {};
    if (audio.input) {
      sanitized.input = {
        ...audio.input
      };
    }
    if (audio.output) {
      sanitized.output = {
        ...audio.output
      };
    }
    Object.entries(audio).forEach(([key, value]) => {
      if (key === 'input' || key === 'output') {
        return;
      }
      let clonedValue = value;
      if (Array.isArray(value)) {
        clonedValue = value.map(item => cloneValue(item));
      } else if (isPlainObject(value)) {
        clonedValue = cloneValue(value);
      }
      sanitized[key] = clonedValue;
    });
    return Object.keys(sanitized).length > 0 ? sanitized : undefined;
  }, []);
  const sanitizeAgentConfig = useCallback(agent => {
    if (!agent) {
      return undefined;
    }
    const sanitized = {};
    Object.entries(agent).forEach(([key, value]) => {
      if (key === 'speak') {
        return;
      }
      let clonedValue = value;
      if (Array.isArray(value)) {
        clonedValue = value.map(item => cloneValue(item));
      } else if (isPlainObject(value)) {
        clonedValue = cloneValue(value);
      }
      sanitized[key] = clonedValue;
    });
    if (agent.speak) {
      sanitized.speak = {
        ...agent.speak
      };
      if (agent.speak.provider) {
        sanitized.speak.provider = {
          ...agent.speak.provider
        };
      }
    }
    return Object.keys(sanitized).length > 0 ? sanitized : undefined;
  }, []);
  const sanitizeSettings = useCallback(settings => {
    if (!settings) {
      return undefined;
    }
    const sanitized = {};
    Object.entries(settings).forEach(([key, value]) => {
      if (key === 'audio') {
        const audio = sanitizeAudioSettings(value);
        if (audio) {
          sanitized.audio = audio;
        }
        return;
      }
      if (key === 'agent') {
        const agent = sanitizeAgentConfig(value);
        if (agent) {
          sanitized.agent = agent;
        }
        return;
      }
      sanitized[key] = value;
    });
    return Object.keys(sanitized).length > 0 ? sanitized : undefined;
  }, [sanitizeAgentConfig, sanitizeAudioSettings]);
  const mergeSettings = useCallback((base, override) => mergePlainObjects(base, override), []);
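  // Refs are refreshed on every render so the latest callbacks and
  // settings are visible to long-lived closures (socket handlers, the
  // mic listener) without re-creating them.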
  defaultSettingsRef.current = defaultSettings;
  endpointRef.current = endpoint;
  onBeforeConnectRef.current = onBeforeConnect;
  onConnectRef.current = onConnect;
  onCloseRef.current = onClose;
  onErrorRef.current = onError;
  onMessageRef.current = onMessage;
  onWelcomeRef.current = onWelcome;
  onSettingsAppliedRef.current = onSettingsApplied;
  onConversationTextRef.current = onConversationText;
  onAgentThinkingRef.current = onAgentThinking;
  onAgentStartedSpeakingRef.current = onAgentStartedSpeaking;
  onAgentAudioDoneRef.current = onAgentAudioDone;
  onUserStartedSpeakingRef.current = onUserStartedSpeaking;
  onFunctionCallRequestRef.current = onFunctionCallRequest;
  onFunctionCallResponseRef.current = onFunctionCallResponse;
  onPromptUpdatedRef.current = onPromptUpdated;
  onSpeakUpdatedRef.current = onSpeakUpdated;
  onInjectionRefusedRef.current = onInjectionRefused;
  onWarningRef.current = onWarning;
  onServerErrorRef.current = onServerError;
  onAudioConfigRef.current = onAudioConfig;
  onAudioRef.current = onAudio;
  autoStartMicRef.current = autoStartMicrophone;
  if (downsampleFactor != null) {
    // Round and clamp the explicit override (as resolveDownsampleFactor
    // does) so a fractional or sub-1 factor can't corrupt the decimation
    // loop in handleMicChunk.
    currentDownsample.current = Math.max(1, Math.round(downsampleFactor));
  }
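  // Tear down everything the session owns: the mic listener, native
  // recording/playback, and the socket (closed with code 1000 when it is
  // still open or connecting).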
  const cleanup = useCallback(() => {
    audioSub.current?.remove();
    audioSub.current = null;
    if (microphoneActive.current) {
      Deepgram.stopRecording().catch(() => {});
      microphoneActive.current = false;
    }
    // Cleanup audio session for playback
    Deepgram.stopAudio().catch(() => {});
    const socket = ws.current;
    if (socket) {
      ws.current = null;
      try {
        if (socket.readyState === WebSocket.OPEN || socket.readyState === WebSocket.CONNECTING) {
          socket.close(1000, 'cleanup');
        } else {
          socket.close();
        }
      } catch {
        // ignore socket close errors
      }
    }
  }, []);
  useEffect(() => () => cleanup(), [cleanup]);
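  // Native mic chunks arrive either as base64-encoded PCM16 (ev.b64) or
  // as an array of signed bytes (ev.data). Both paths produce
  // little-endian Int16 samples, naively decimated by dropping samples
  // when a downsample factor > 1 applies, then sent as a binary frame.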
  const handleMicChunk = useCallback(ev => {
    const socket = ws.current;
    if (!socket || socket.readyState !== WebSocket.OPEN) {
      return;
    }
    if (typeof ev?.sampleRate === 'number' && ev.sampleRate > 0) {
      if (ev.sampleRate !== nativeInputSampleRate.current) {
        nativeInputSampleRate.current = ev.sampleRate;
        currentDownsample.current = resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current);
      }
    }
    const factor = currentDownsample.current ?? 1;
    let chunk = null;
    if (typeof ev?.b64 === 'string') {
      const binary = Uint8Array.from(atob(ev.b64), c => c.charCodeAt(0));
      let int16 = new Int16Array(binary.buffer);
      if (factor > 1 && int16.length >= factor) {
        const downsampled = new Int16Array(Math.floor(int16.length / factor));
        for (let i = 0; i < downsampled.length; i++) {
          downsampled[i] = int16[i * factor];
        }
        int16 = downsampled;
      }
      chunk = int16.buffer;
    } else if (Array.isArray(ev?.data)) {
      const bytes = new Uint8Array(ev.data.length);
      for (let i = 0; i < ev.data.length; i++) {
        const value = ev.data[i];
        bytes[i] = value < 0 ? value + 256 : value;
      }
      const view = new DataView(bytes.buffer);
      const int16 = new Int16Array(bytes.length / 2);
      for (let i = 0; i < int16.length; i++) {
        int16[i] = view.getInt16(i * 2, true);
      }
      chunk = int16.buffer;
    }
    if (!chunk) {
      return;
    }
    try {
      socket.send(chunk);
    } catch (err) {
      onErrorRef.current?.(err);
    }
  }, [downsampleFactor, onErrorRef]);
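  // Dispatch server messages. JSON frames are routed by `type` (guarded
  // by hasKeys so malformed payloads are ignored); binary frames are the
  // agent's TTS audio, optionally fed straight to the native player.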
  const handleSocketMessage = useCallback(ev => {
    if (typeof ev.data === 'string') {
      try {
        const message = JSON.parse(ev.data);
        onMessageRef.current?.(message);
        switch (message.type) {
          case 'Welcome':
            if (hasKeys(message, ['request_id'])) {
              onWelcomeRef.current?.(message);
            }
            break;
          case 'SettingsApplied':
            onSettingsAppliedRef.current?.(message);
            break;
          case 'ConversationText':
            if (hasKeys(message, ['role', 'content'])) {
              const convMsg = message;
              if (trackConversation) {
                setInternalConversation(prev => [...prev, {
                  role: convMsg.role,
                  content: convMsg.content
                }]);
              }
              onConversationTextRef.current?.(convMsg);
            }
            break;
          case 'AgentThinking':
            if (hasKeys(message, ['content'])) {
              const thinkMsg = message;
              if (trackAgentStatus) {
                setInternalAgentStatus(prev => ({
                  ...prev,
                  thinking: thinkMsg.content
                }));
              }
              onAgentThinkingRef.current?.(thinkMsg);
            }
            break;
          case 'AgentStartedSpeaking':
            {
              const speakMsg = message;
              if (trackAgentStatus) {
                setInternalAgentStatus(prev => ({
                  ...prev,
                  latency: {
                    total: speakMsg.total_latency,
                    tts: speakMsg.tts_latency,
                    ttt: speakMsg.ttt_latency
                  }
                }));
              }
              onAgentStartedSpeakingRef.current?.(speakMsg);
            }
            break;
          case 'AgentAudioDone':
            {
              const doneMsg = message;
              if (trackAgentStatus) {
                setInternalAgentStatus({
                  thinking: null,
                  latency: null
                });
              }
              onAgentAudioDoneRef.current?.(doneMsg);
            }
            break;
          case 'UserStartedSpeaking':
            onUserStartedSpeakingRef.current?.(message);
            break;
          case 'FunctionCallRequest':
            if (hasKeys(message, ['functions'])) {
              onFunctionCallRequestRef.current?.(message);
            }
            break;
          case 'FunctionCallResponse':
            if (hasKeys(message, ['id', 'name'])) {
              onFunctionCallResponseRef.current?.(message);
            }
            break;
          case 'PromptUpdated':
            onPromptUpdatedRef.current?.(message);
            break;
          case 'SpeakUpdated':
            onSpeakUpdatedRef.current?.(message);
            break;
          case 'Audio':
            // Audio binary data will be handled by the onmessage binary path
            break;
          case 'AudioConfig':
            if (hasKeys(message, ['sample_rate'])) {
              const configMsg = message;
              if (autoPlayAudio) {
                const sampleRate = configMsg.sample_rate || DEFAULT_INPUT_SAMPLE_RATE;
                const channels = configMsg.channels || 1;
                Deepgram.startPlayer?.(sampleRate, channels);
              }
              onAudioConfigRef.current?.(configMsg);
            }
            break;
          case 'InjectionRefused':
            if (hasKeys(message, ['message'])) {
              onInjectionRefusedRef.current?.(message);
            }
            break;
          case 'Warning':
            if (hasKeys(message, ['description'])) {
              const warnMsg = message;
              if (trackState) {
                setInternalState(prev => ({
                  ...prev,
                  warning: warnMsg.description
                }));
              }
              onWarningRef.current?.(warnMsg);
            }
            break;
          case 'Error':
            {
              const description = typeof message.description === 'string' ? message.description : undefined;
              const code = typeof message.code === 'string' ? message.code : undefined;
              const errorMsg = description ?? code ?? 'Voice agent error';
              if (trackState) {
                setInternalState(prev => ({
                  ...prev,
                  connectionState: 'disconnected',
                  error: errorMsg
                }));
              }
              if (description || code) {
                onServerErrorRef.current?.(message);
              }
              onErrorRef.current?.(new Error(errorMsg));
            }
            break;
          default:
            break;
        }
      } catch (err) {
        onErrorRef.current?.(err);
      }
      return;
    }
    const buffer = ensureArrayBuffer(ev.data);
    if (buffer) {
      if (autoPlayAudio) {
        try {
          const bytes = new Uint8Array(buffer);
          const b64 = Buffer.from(bytes).toString('base64');
          Deepgram.feedAudio?.(b64);
        } catch (err) {
          console.warn('[VoiceAgent] Auto-feed audio error:', err);
        }
      }
      onAudioRef.current?.(buffer);
    }
  }, [onAgentAudioDoneRef, onAgentStartedSpeakingRef, onAgentThinkingRef, onConversationTextRef, onErrorRef, onFunctionCallRequestRef, onFunctionCallResponseRef, onInjectionRefusedRef, onMessageRef, onPromptUpdatedRef, onServerErrorRef, onSettingsAppliedRef, onSpeakUpdatedRef, onUserStartedSpeakingRef, onWarningRef, autoPlayAudio, trackAgentStatus, trackConversation, trackState]);
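  // Low-level senders. Both return false (rather than throwing) when the
  // socket is not open, so callers can retry or surface the failure.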
  const sendJsonMessage = useCallback(message => {
    const socket = ws.current;
    if (!socket || socket.readyState !== WebSocket.OPEN) {
      return false;
    }
    try {
      socket.send(JSON.stringify(message));
      return true;
    } catch (err) {
      onErrorRef.current?.(err);
      return false;
    }
  }, []);
  const sendBinary = useCallback(chunk => {
    const socket = ws.current;
    if (!socket || socket.readyState !== WebSocket.OPEN) {
      return false;
    }
    let payload = null;
    if (chunk instanceof ArrayBuffer) {
      payload = chunk;
    } else if (chunk instanceof Uint8Array) {
      if (chunk.buffer instanceof ArrayBuffer) {
        payload = chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength);
      } else {
        const copy = new Uint8Array(chunk.byteLength);
        copy.set(chunk);
        payload = copy.buffer;
      }
    } else if (Array.isArray(chunk)) {
      const uint = new Uint8Array(chunk.length);
      for (let i = 0; i < chunk.length; i++) {
        uint[i] = chunk[i];
      }
      payload = uint.buffer;
    }
    if (!payload) return false;
    try {
      socket.send(payload);
      return true;
    } catch (err) {
      onErrorRef.current?.(err);
      return false;
    }
  }, []);
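  // Open a session: tear down any previous one, request mic permission
  // and start native capture (or just the playback session), merge the
  // default and per-call Settings, then dial the agent endpoint with the
  // API key as an `Authorization: Token` header. The merged Settings
  // frame is sent as soon as the socket opens.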
  const connect = useCallback(async overrideSettings => {
    cleanup();
    if (trackState) {
      setInternalState({
        connectionState: 'connecting',
        error: null,
        warning: null
      });
    }
    if (trackConversation) {
      setInternalConversation([]);
    }
    if (trackAgentStatus) {
      setInternalAgentStatus({
        thinking: null,
        latency: null
      });
    }
    onBeforeConnectRef.current?.();
    const apiKey = globalThis.__DEEPGRAM_API_KEY__;
    if (!apiKey) throw new Error('Deepgram API key missing');
    const shouldCaptureMic = autoStartMicRef.current;
    if (shouldCaptureMic) {
      const granted = await askMicPermission();
      if (!granted) {
        throw new Error('Microphone permission denied');
      }
      await Deepgram.startRecording();
      microphoneActive.current = true;
      const emitter = new NativeEventEmitter(NativeModules.Deepgram);
      if (eventName) {
        audioSub.current = emitter.addListener(eventName, handleMicChunk);
      }
    } else {
      // Only initialize the audio session for playback if not recording
      // (startRecording already activates the audio session)
      await Deepgram.startAudio();
    }
    const sanitizedDefault = sanitizeSettings(defaultSettingsRef.current);
    const sanitizedOverride = sanitizeSettings(overrideSettings);
    const merged = mergeSettings(sanitizedDefault, sanitizedOverride);
    const mergedSettings = {
      type: 'Settings',
      ...(merged ?? {})
    };
    const targetSampleRate = overrideSettings?.audio?.input?.sample_rate ?? defaultSettingsRef.current?.audio?.input?.sample_rate ?? DEFAULT_INPUT_SAMPLE_RATE;
    targetInputSampleRate.current = targetSampleRate;
    currentDownsample.current = resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current);
    const socket = new WebSocket(endpointRef.current, undefined, {
      headers: {
        Authorization: `Token ${apiKey}`
      }
    });
    socket.binaryType = 'arraybuffer';
    ws.current = socket;
    socket.onopen = () => {
      sendJsonMessage(mergedSettings);
      if (trackState) {
        setInternalState(prev => ({
          ...prev,
          connectionState: 'connected'
        }));
      }
      if (autoPlayAudio) {
        const sampleRate = merged?.audio?.output?.sample_rate ?? DEFAULT_INPUT_SAMPLE_RATE;
        const channels = 1;
        Deepgram.startPlayer?.(sampleRate, channels);
      }
      onConnectRef.current?.();
    };
    socket.onmessage = handleSocketMessage;
    socket.onerror = err => {
      onErrorRef.current?.(err);
    };
    socket.onclose = event => {
      if (trackState) {
        setInternalState(prev => ({
          ...prev,
          connectionState: 'disconnected'
        }));
      }
      cleanup();
      onCloseRef.current?.(event);
    };
  }, [cleanup, downsampleFactor, handleMicChunk, handleSocketMessage, mergeSettings, sanitizeSettings, sendJsonMessage, autoPlayAudio, trackAgentStatus, trackConversation, trackState]);
  const disconnect = useCallback(() => {
    cleanup();
  }, [cleanup]);
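  // Thin wrappers over sendJsonMessage for the client-to-server message
  // types (Settings, InjectUserMessage, InjectAgentMessage,
  // FunctionCallResponse, KeepAlive, UpdatePrompt).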
  const sendSettings = useCallback(settings => {
    const sanitized = sanitizeSettings(settings);
    return sendJsonMessage({
      type: 'Settings',
      ...(sanitized ?? {})
    });
  }, [sanitizeSettings, sendJsonMessage]);
  const injectUserMessage = useCallback(content => sendJsonMessage({
    type: 'InjectUserMessage',
    content
  }), [sendJsonMessage]);
  const injectAgentMessage = useCallback(message => sendJsonMessage({
    type: 'InjectAgentMessage',
    message
  }), [sendJsonMessage]);
  const sendFunctionCallResponse = useCallback(response => sendJsonMessage({
    type: 'FunctionCallResponse',
    ...response
  }), [sendJsonMessage]);
  const sendKeepAlive = useCallback(() => sendJsonMessage({
    type: 'KeepAlive'
  }), [sendJsonMessage]);
  const updatePrompt = useCallback(prompt => sendJsonMessage({
    type: 'UpdatePrompt',
    prompt
  }), [sendJsonMessage]);
  const sendMessage = useCallback(message => sendJsonMessage(message), [sendJsonMessage]);
  const isConnected = useCallback(() => ws.current?.readyState === WebSocket.OPEN, []);
  const clearConversation = useCallback(() => {
    if (trackConversation) {
      setInternalConversation([]);
    }
  }, [trackConversation]);
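  // Public API. state, conversation/clearConversation, and agentStatus
  // are only present when the corresponding track* flag was set.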
  return {
    connect,
    disconnect,
    sendMessage,
    sendSettings,
    injectUserMessage,
    injectAgentMessage,
    sendFunctionCallResponse,
    sendKeepAlive,
    updatePrompt,
    sendMedia: sendBinary,
    isConnected,
    ...(trackState ? {
      state: internalState
    } : {}),
    ...(trackConversation ? {
      conversation: internalConversation,
      clearConversation
    } : {}),
    ...(trackAgentStatus ? {
      agentStatus: internalAgentStatus
    } : {})
  };
}
//# sourceMappingURL=useDeepgramVoiceAgent.js.map