react-native-deepgram (useDeepgramVoiceAgent.js)

React Native SDK for Deepgram's AI-powered speech-to-text, real-time transcription, and text intelligence APIs. Supports live audio streaming, file transcription, sentiment analysis, and topic detection for iOS and Android.
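
What follows is the package's compiled useDeepgramVoiceAgent module as published to npm. For orientation, here is a minimal usage sketch first. It assumes the hook is re-exported from the package root and that globalThis.__DEEPGRAM_API_KEY__ holds a valid key before connect() runs, since the compiled code reads the key from that global; the screen component and handlers are illustrative only, and the module source follows the sketch.

import React from 'react';
import { Button, View } from 'react-native';
import { useDeepgramVoiceAgent } from 'react-native-deepgram';

// Assumption: the compiled hook reads the API key from this global.
globalThis.__DEEPGRAM_API_KEY__ = '<YOUR_DEEPGRAM_API_KEY>';

export function VoiceAgentScreen() {
  const agent = useDeepgramVoiceAgent({
    trackState: true,
    trackConversation: true,
    defaultSettings: {
      audio: { input: { sample_rate: 16_000 } }
    },
    onConversationText: msg => console.log(`${msg.role}: ${msg.content}`),
    onError: err => console.warn('Voice agent error:', err)
  });

  return (
    <View>
      <Button title="Connect" onPress={() => agent.connect().catch(err => console.warn(err))} />
      <Button title="Disconnect" onPress={() => agent.disconnect()} />
    </View>
  );
}

Passing trackConversation: true makes the hook maintain and return a conversation array built from incoming ConversationText frames; leave it false to keep the hook render-silent and rely on callbacks alone.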

"use strict"; import { useRef, useCallback, useEffect, useState } from 'react'; import { NativeEventEmitter, NativeModules, Platform } from 'react-native'; import { Buffer } from 'buffer'; import { Deepgram } from "./NativeDeepgram.js"; import { askMicPermission } from "./helpers/askMicPermission.js"; const DEFAULT_AGENT_ENDPOINT = 'wss://agent.deepgram.com/v1/agent/converse'; const DEFAULT_INPUT_SAMPLE_RATE = 16_000; const BASE_NATIVE_SAMPLE_RATE = 16_000; const eventName = Platform.select({ ios: 'DeepgramAudioPCM', android: 'AudioChunk', default: 'DeepgramAudioPCM' }); const ensureArrayBuffer = data => { if (!data) return null; if (data instanceof ArrayBuffer) return data; if (ArrayBuffer.isView(data)) { const view = data; if (view.buffer instanceof ArrayBuffer) { return view.buffer.slice(view.byteOffset, view.byteOffset + view.byteLength); } const copy = new Uint8Array(view.byteLength); copy.set(new Uint8Array(view.buffer, view.byteOffset, view.byteLength)); return copy.buffer; } return null; }; const isPlainObject = value => typeof value === 'object' && value !== null && !Array.isArray(value); const cloneValue = value => { if (Array.isArray(value)) { return value.map(item => cloneValue(item)); } if (isPlainObject(value)) { const cloned = {}; Object.entries(value).forEach(([key, entryValue]) => { cloned[key] = cloneValue(entryValue); }); return cloned; } return value; }; const mergePlainObjects = (base, override) => { if (!base && !override) { return undefined; } if (!base) { return override ? cloneValue(override) : undefined; } const result = cloneValue(base); if (!override) { return result; } Object.entries(override).forEach(([key, overrideValue]) => { if (overrideValue === undefined) { result[key] = undefined; return; } if (isPlainObject(overrideValue)) { const existing = result[key]; result[key] = mergePlainObjects(isPlainObject(existing) ? existing : undefined, overrideValue); return; } if (Array.isArray(overrideValue)) { result[key] = overrideValue.map(item => cloneValue(item)); return; } result[key] = overrideValue; }); return result; }; const hasKeys = (value, keys) => isPlainObject(value) && keys.every(key => key in value); const computeDownsampleFactor = (target, base = BASE_NATIVE_SAMPLE_RATE) => { if (!target || target >= base || base <= 0) { return 1; } const ratio = Math.round(base / target); return ratio > 0 ? 
const resolveDownsampleFactor = (overrideFactor, targetSampleRate, nativeSampleRate) => {
  if (overrideFactor == null) {
    return computeDownsampleFactor(targetSampleRate, nativeSampleRate);
  }
  const normalized = Math.max(1, Math.round(overrideFactor));
  if (!nativeSampleRate || !targetSampleRate) {
    return normalized;
  }
  if (nativeSampleRate <= targetSampleRate) {
    return 1;
  }
  return normalized;
};

export function useDeepgramVoiceAgent({
  endpoint = DEFAULT_AGENT_ENDPOINT,
  defaultSettings,
  autoStartMicrophone = true,
  autoPlayAudio = true,
  trackState = false,
  trackConversation = false,
  trackAgentStatus = false,
  downsampleFactor,
  onBeforeConnect,
  onConnect,
  onClose,
  onError,
  onMessage,
  onWelcome,
  onSettingsApplied,
  onConversationText,
  onAgentThinking,
  onAgentStartedSpeaking,
  onAgentAudioDone,
  onUserStartedSpeaking,
  onFunctionCallRequest,
  onFunctionCallResponse,
  onPromptUpdated,
  onSpeakUpdated,
  onInjectionRefused,
  onWarning,
  onServerError,
  onAudioConfig,
  onAudio
} = {}) {
  const ws = useRef(null);
  const audioSub = useRef(null);
  const nativeInputSampleRate = useRef(BASE_NATIVE_SAMPLE_RATE);
  const targetInputSampleRate = useRef(DEFAULT_INPUT_SAMPLE_RATE);
  const currentDownsample = useRef(resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current));
  const microphoneActive = useRef(false);
  const defaultSettingsRef = useRef(defaultSettings);
  const endpointRef = useRef(endpoint);

  // Callback props are mirrored into refs so the long-lived socket handlers
  // always see the latest values without re-subscribing.
  const onBeforeConnectRef = useRef(onBeforeConnect);
  const onConnectRef = useRef(onConnect);
  const onCloseRef = useRef(onClose);
  const onErrorRef = useRef(onError);
  const onMessageRef = useRef(onMessage);
  const onWelcomeRef = useRef(onWelcome);
  const onSettingsAppliedRef = useRef(onSettingsApplied);
  const onConversationTextRef = useRef(onConversationText);
  const onAgentThinkingRef = useRef(onAgentThinking);
  const onAgentStartedSpeakingRef = useRef(onAgentStartedSpeaking);
  const onAgentAudioDoneRef = useRef(onAgentAudioDone);
  const onUserStartedSpeakingRef = useRef(onUserStartedSpeaking);
  const onFunctionCallRequestRef = useRef(onFunctionCallRequest);
  const onFunctionCallResponseRef = useRef(onFunctionCallResponse);
  const onPromptUpdatedRef = useRef(onPromptUpdated);
  const onSpeakUpdatedRef = useRef(onSpeakUpdated);
  const onInjectionRefusedRef = useRef(onInjectionRefused);
  const onWarningRef = useRef(onWarning);
  const onServerErrorRef = useRef(onServerError);
  const onAudioConfigRef = useRef(onAudioConfig);
  const onAudioRef = useRef(onAudio);
  const autoStartMicRef = useRef(autoStartMicrophone);

  const [internalState, setInternalState] = useState(() => ({
    connectionState: 'idle',
    error: null,
    warning: null
  }));
  const [internalConversation, setInternalConversation] = useState([]);
  const [internalAgentStatus, setInternalAgentStatus] = useState(() => ({
    thinking: null,
    latency: null
  }));

  // Copy the audio block of a Settings payload so later merges cannot mutate caller input.
  const sanitizeAudioSettings = useCallback(audio => {
    if (!audio) {
      return undefined;
    }
    const sanitized = {};
    if (audio.input) {
      sanitized.input = { ...audio.input };
    }
    if (audio.output) {
      sanitized.output = { ...audio.output };
    }
    Object.entries(audio).forEach(([key, value]) => {
      if (key === 'input' || key === 'output') {
        return;
      }
      let clonedValue = value;
      if (Array.isArray(value)) {
        clonedValue = value.map(item => cloneValue(item));
      } else if (isPlainObject(value)) {
        clonedValue = cloneValue(value);
      }
      sanitized[key] = clonedValue;
    });
    return Object.keys(sanitized).length > 0 ? sanitized : undefined;
  }, []);
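
  // Illustrative note (not part of the original source):
  //   sanitizeAudioSettings({ input: { sample_rate: 16_000, encoding: 'linear16' } })
  //   -> { input: { sample_rate: 16_000, encoding: 'linear16' } } (a detached copy),
  // so merging defaults with per-connect overrides never mutates the caller's object.
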
  // Copy the agent block, giving speak/provider shallow copies of their own.
  const sanitizeAgentConfig = useCallback(agent => {
    if (!agent) {
      return undefined;
    }
    const sanitized = {};
    Object.entries(agent).forEach(([key, value]) => {
      if (key === 'speak') {
        return;
      }
      let clonedValue = value;
      if (Array.isArray(value)) {
        clonedValue = value.map(item => cloneValue(item));
      } else if (isPlainObject(value)) {
        clonedValue = cloneValue(value);
      }
      sanitized[key] = clonedValue;
    });
    if (agent.speak) {
      sanitized.speak = { ...agent.speak };
      if (agent.speak.provider) {
        sanitized.speak.provider = { ...agent.speak.provider };
      }
    }
    return Object.keys(sanitized).length > 0 ? sanitized : undefined;
  }, []);

  const sanitizeSettings = useCallback(settings => {
    if (!settings) {
      return undefined;
    }
    const sanitized = {};
    Object.entries(settings).forEach(([key, value]) => {
      if (key === 'audio') {
        const audio = sanitizeAudioSettings(value);
        if (audio) {
          sanitized.audio = audio;
        }
        return;
      }
      if (key === 'agent') {
        const agent = sanitizeAgentConfig(value);
        if (agent) {
          sanitized.agent = agent;
        }
        return;
      }
      sanitized[key] = value;
    });
    return Object.keys(sanitized).length > 0 ? sanitized : undefined;
  }, [sanitizeAgentConfig, sanitizeAudioSettings]);

  const mergeSettings = useCallback((base, override) => mergePlainObjects(base, override), []);

  // Refresh the mirrored refs on every render.
  defaultSettingsRef.current = defaultSettings;
  endpointRef.current = endpoint;
  onBeforeConnectRef.current = onBeforeConnect;
  onConnectRef.current = onConnect;
  onCloseRef.current = onClose;
  onErrorRef.current = onError;
  onMessageRef.current = onMessage;
  onWelcomeRef.current = onWelcome;
  onSettingsAppliedRef.current = onSettingsApplied;
  onConversationTextRef.current = onConversationText;
  onAgentThinkingRef.current = onAgentThinking;
  onAgentStartedSpeakingRef.current = onAgentStartedSpeaking;
  onAgentAudioDoneRef.current = onAgentAudioDone;
  onUserStartedSpeakingRef.current = onUserStartedSpeaking;
  onFunctionCallRequestRef.current = onFunctionCallRequest;
  onFunctionCallResponseRef.current = onFunctionCallResponse;
  onPromptUpdatedRef.current = onPromptUpdated;
  onSpeakUpdatedRef.current = onSpeakUpdated;
  onInjectionRefusedRef.current = onInjectionRefused;
  onWarningRef.current = onWarning;
  onServerErrorRef.current = onServerError;
  onAudioConfigRef.current = onAudioConfig;
  onAudioRef.current = onAudio;
  autoStartMicRef.current = autoStartMicrophone;

  if (downsampleFactor != null) {
    currentDownsample.current = downsampleFactor;
  }

  // Tear down the mic subscription, native audio sessions, and the websocket.
  const cleanup = useCallback(() => {
    audioSub.current?.remove();
    audioSub.current = null;
    if (microphoneActive.current) {
      Deepgram.stopRecording().catch(() => {});
      microphoneActive.current = false;
    }
    // Cleanup audio session for playback
    Deepgram.stopAudio().catch(() => {});
    const socket = ws.current;
    if (socket) {
      ws.current = null;
      try {
        if (socket.readyState === WebSocket.OPEN || socket.readyState === WebSocket.CONNECTING) {
          socket.close(1000, 'cleanup');
        } else {
          socket.close();
        }
      } catch {
        // ignore socket close errors
      }
    }
  }, []);

  useEffect(() => () => cleanup(), [cleanup]);

  // Convert native PCM mic events (base64 or signed-byte arrays) to
  // little-endian Int16 buffers, downsampling when required, and stream them.
  const handleMicChunk = useCallback(ev => {
    const socket = ws.current;
    if (!socket || socket.readyState !== WebSocket.OPEN) {
      return;
    }
    if (typeof ev?.sampleRate === 'number' && ev.sampleRate > 0) {
      if (ev.sampleRate !== nativeInputSampleRate.current) {
        nativeInputSampleRate.current = ev.sampleRate;
        currentDownsample.current = resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current);
      }
    }
    const factor = currentDownsample.current ?? 1;
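    // Illustrative note (not part of the original source): a 640-byte PCM event
    // at the 16 kHz base rate decodes to 320 Int16 samples (20 ms of audio);
    // with factor === 2 the loop below keeps 160 of them, i.e. 20 ms at 8 kHz.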
    let chunk = null;
    if (typeof ev?.b64 === 'string') {
      const binary = Uint8Array.from(atob(ev.b64), c => c.charCodeAt(0));
      let int16 = new Int16Array(binary.buffer);
      if (factor > 1 && int16.length >= factor) {
        const downsampled = new Int16Array(Math.floor(int16.length / factor));
        for (let i = 0; i < downsampled.length; i++) {
          downsampled[i] = int16[i * factor];
        }
        int16 = downsampled;
      }
      chunk = int16.buffer;
    } else if (Array.isArray(ev?.data)) {
      const bytes = new Uint8Array(ev.data.length);
      for (let i = 0; i < ev.data.length; i++) {
        const value = ev.data[i];
        bytes[i] = value < 0 ? value + 256 : value;
      }
      const view = new DataView(bytes.buffer);
      const int16 = new Int16Array(bytes.length / 2);
      for (let i = 0; i < int16.length; i++) {
        int16[i] = view.getInt16(i * 2, true);
      }
      chunk = int16.buffer;
    }
    if (!chunk) {
      return;
    }
    try {
      socket.send(chunk);
    } catch (err) {
      onErrorRef.current?.(err);
    }
  }, [downsampleFactor, onErrorRef]);

  // Dispatch JSON control frames by message type; binary frames carry agent audio.
  const handleSocketMessage = useCallback(ev => {
    if (typeof ev.data === 'string') {
      try {
        const message = JSON.parse(ev.data);
        onMessageRef.current?.(message);
        switch (message.type) {
          case 'Welcome':
            if (hasKeys(message, ['request_id'])) {
              onWelcomeRef.current?.(message);
            }
            break;
          case 'SettingsApplied':
            onSettingsAppliedRef.current?.(message);
            break;
          case 'ConversationText':
            if (hasKeys(message, ['role', 'content'])) {
              const convMsg = message;
              if (trackConversation) {
                setInternalConversation(prev => [...prev, { role: convMsg.role, content: convMsg.content }]);
              }
              onConversationTextRef.current?.(convMsg);
            }
            break;
          case 'AgentThinking':
            if (hasKeys(message, ['content'])) {
              const thinkMsg = message;
              if (trackAgentStatus) {
                setInternalAgentStatus(prev => ({ ...prev, thinking: thinkMsg.content }));
              }
              onAgentThinkingRef.current?.(thinkMsg);
            }
            break;
          case 'AgentStartedSpeaking': {
            const speakMsg = message;
            if (trackAgentStatus) {
              setInternalAgentStatus(prev => ({
                ...prev,
                latency: {
                  total: speakMsg.total_latency,
                  tts: speakMsg.tts_latency,
                  ttt: speakMsg.ttt_latency
                }
              }));
            }
            onAgentStartedSpeakingRef.current?.(speakMsg);
            break;
          }
          case 'AgentAudioDone': {
            const doneMsg = message;
            if (trackAgentStatus) {
              setInternalAgentStatus({ thinking: null, latency: null });
            }
            onAgentAudioDoneRef.current?.(doneMsg);
            break;
          }
          case 'UserStartedSpeaking':
            onUserStartedSpeakingRef.current?.(message);
            break;
          case 'FunctionCallRequest':
            if (hasKeys(message, ['functions'])) {
              onFunctionCallRequestRef.current?.(message);
            }
            break;
          case 'FunctionCallResponse':
            if (hasKeys(message, ['id', 'name'])) {
              onFunctionCallResponseRef.current?.(message);
            }
            break;
          case 'PromptUpdated':
            onPromptUpdatedRef.current?.(message);
            break;
          case 'SpeakUpdated':
            onSpeakUpdatedRef.current?.(message);
            break;
          case 'Audio':
            // Audio binary data will be handled by onmessage binary path
            break;
          case 'AudioConfig':
            if (hasKeys(message, ['sample_rate'])) {
              const configMsg = message;
              if (autoPlayAudio) {
                const sampleRate = configMsg.sample_rate || DEFAULT_INPUT_SAMPLE_RATE;
                const channels = configMsg.channels || 1;
                Deepgram.startPlayer?.(sampleRate, channels);
              }
              onAudioConfigRef.current?.(configMsg);
            }
            break;
          case 'InjectionRefused':
            if (hasKeys(message, ['message'])) {
              onInjectionRefusedRef.current?.(message);
            }
            break;
          case 'Warning':
            if (hasKeys(message, ['description'])) {
              const warnMsg = message;
              if (trackState) {
                setInternalState(prev => ({ ...prev, warning: warnMsg.description }));
              }
              onWarningRef.current?.(warnMsg);
            }
            break;
          case 'Error': {
            const description = typeof message.description === 'string' ? message.description : undefined;
            const code = typeof message.code === 'string' ? message.code : undefined;
            const errorMsg = description ?? code ?? 'Voice agent error';
            if (trackState) {
              setInternalState(prev => ({ ...prev, connectionState: 'disconnected', error: errorMsg }));
            }
            if (description || code) {
              onServerErrorRef.current?.(message);
            }
            onErrorRef.current?.(new Error(errorMsg));
            break;
          }
          default:
            break;
        }
      } catch (err) {
        onErrorRef.current?.(err);
      }
      return;
    }
    // Binary frame: agent audio. Feed the native player when autoPlayAudio is on.
    const buffer = ensureArrayBuffer(ev.data);
    if (buffer) {
      if (autoPlayAudio) {
        try {
          const bytes = new Uint8Array(buffer);
          const b64 = Buffer.from(bytes).toString('base64');
          Deepgram.feedAudio?.(b64);
        } catch (err) {
          console.warn('[VoiceAgent] Auto-feed audio error:', err);
        }
      }
      onAudioRef.current?.(buffer);
    }
  }, [
    onAgentAudioDoneRef, onAgentStartedSpeakingRef, onAgentThinkingRef,
    onConversationTextRef, onErrorRef, onFunctionCallRequestRef,
    onFunctionCallResponseRef, onInjectionRefusedRef, onMessageRef,
    onPromptUpdatedRef, onServerErrorRef, onSettingsAppliedRef,
    onSpeakUpdatedRef, onUserStartedSpeakingRef, onWarningRef,
    autoPlayAudio, trackAgentStatus, trackConversation, trackState
  ]);

  const sendJsonMessage = useCallback(message => {
    const socket = ws.current;
    if (!socket || socket.readyState !== WebSocket.OPEN) {
      return false;
    }
    try {
      socket.send(JSON.stringify(message));
      return true;
    } catch (err) {
      onErrorRef.current?.(err);
      return false;
    }
  }, []);

  // Accept ArrayBuffer, Uint8Array, or a plain byte array and send a standalone ArrayBuffer.
  const sendBinary = useCallback(chunk => {
    const socket = ws.current;
    if (!socket || socket.readyState !== WebSocket.OPEN) {
      return false;
    }
    let payload = null;
    if (chunk instanceof ArrayBuffer) {
      payload = chunk;
    } else if (chunk instanceof Uint8Array) {
      if (chunk.buffer instanceof ArrayBuffer) {
        payload = chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength);
      } else {
        const copy = new Uint8Array(chunk.byteLength);
        copy.set(chunk);
        payload = copy.buffer;
      }
    } else if (Array.isArray(chunk)) {
      const uint = new Uint8Array(chunk.length);
      for (let i = 0; i < chunk.length; i++) {
        uint[i] = chunk[i];
      }
      payload = uint.buffer;
    }
    if (!payload) return false;
    try {
      socket.send(payload);
      return true;
    } catch (err) {
      onErrorRef.current?.(err);
      return false;
    }
  }, []);

  // Open a fresh agent session: reset tracked state, start the mic (or just the
  // playback session), then send the merged Settings frame once the socket opens.
  const connect = useCallback(async overrideSettings => {
    cleanup();
    if (trackState) {
      setInternalState({ connectionState: 'connecting', error: null, warning: null });
    }
    if (trackConversation) {
      setInternalConversation([]);
    }
    if (trackAgentStatus) {
      setInternalAgentStatus({ thinking: null, latency: null });
    }
    onBeforeConnectRef.current?.();
    const apiKey = globalThis.__DEEPGRAM_API_KEY__;
    if (!apiKey) throw new Error('Deepgram API key missing');
    const shouldCaptureMic = autoStartMicRef.current;
    if (shouldCaptureMic) {
      const granted = await askMicPermission();
      if (!granted) {
        throw new Error('Microphone permission denied');
      }
      await Deepgram.startRecording();
      microphoneActive.current = true;
      const emitter = new NativeEventEmitter(NativeModules.Deepgram);
      if (eventName) {
        audioSub.current = emitter.addListener(eventName, handleMicChunk);
      }
    } else {
      // Only initialize audio session for playback if not recording
      // (startRecording already activates the audio session)
      await Deepgram.startAudio();
    }
    const sanitizedDefault = sanitizeSettings(defaultSettingsRef.current);
    const sanitizedOverride = sanitizeSettings(overrideSettings);
    const merged = mergeSettings(sanitizedDefault, sanitizedOverride);
    const mergedSettings = { type: 'Settings', ...(merged ?? {}) };
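    // Illustrative note (not part of the original source): with defaultSettings
    //   { audio: { input: { sample_rate: 16_000 } } }
    // and a per-connect override
    //   { agent: { speak: { provider: { type: 'deepgram' } } } }
    // the first frame sent on open would be
    //   { type: 'Settings', audio: { input: { sample_rate: 16000 } },
    //     agent: { speak: { provider: { type: 'deepgram' } } } }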
    const targetSampleRate = overrideSettings?.audio?.input?.sample_rate ?? defaultSettingsRef.current?.audio?.input?.sample_rate ?? DEFAULT_INPUT_SAMPLE_RATE;
    targetInputSampleRate.current = targetSampleRate;
    currentDownsample.current = resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current);
    const socket = new WebSocket(endpointRef.current, undefined, {
      headers: { Authorization: `Token ${apiKey}` }
    });
    socket.binaryType = 'arraybuffer';
    ws.current = socket;
    socket.onopen = () => {
      sendJsonMessage(mergedSettings);
      if (trackState) {
        setInternalState(prev => ({ ...prev, connectionState: 'connected' }));
      }
      if (autoPlayAudio) {
        const sampleRate = merged?.audio?.output?.sample_rate ?? DEFAULT_INPUT_SAMPLE_RATE;
        const channels = 1;
        Deepgram.startPlayer?.(sampleRate, channels);
      }
      onConnectRef.current?.();
    };
    socket.onmessage = handleSocketMessage;
    socket.onerror = err => {
      onErrorRef.current?.(err);
    };
    socket.onclose = event => {
      if (trackState) {
        setInternalState(prev => ({ ...prev, connectionState: 'disconnected' }));
      }
      cleanup();
      onCloseRef.current?.(event);
    };
  }, [cleanup, downsampleFactor, handleMicChunk, handleSocketMessage, mergeSettings, sanitizeSettings, sendJsonMessage, autoPlayAudio, trackAgentStatus, trackConversation, trackState]);

  const disconnect = useCallback(() => {
    cleanup();
  }, [cleanup]);

  const sendSettings = useCallback(settings => {
    const sanitized = sanitizeSettings(settings);
    return sendJsonMessage({ type: 'Settings', ...(sanitized ?? {}) });
  }, [sanitizeSettings, sendJsonMessage]);

  // Thin wrappers over sendJsonMessage for the agent's client-to-server frames.
  const injectUserMessage = useCallback(content => sendJsonMessage({ type: 'InjectUserMessage', content }), [sendJsonMessage]);
  const injectAgentMessage = useCallback(message => sendJsonMessage({ type: 'InjectAgentMessage', message }), [sendJsonMessage]);
  const sendFunctionCallResponse = useCallback(response => sendJsonMessage({ type: 'FunctionCallResponse', ...response }), [sendJsonMessage]);
  const sendKeepAlive = useCallback(() => sendJsonMessage({ type: 'KeepAlive' }), [sendJsonMessage]);
  const updatePrompt = useCallback(prompt => sendJsonMessage({ type: 'UpdatePrompt', prompt }), [sendJsonMessage]);
  const sendMessage = useCallback(message => sendJsonMessage(message), [sendJsonMessage]);
  const isConnected = useCallback(() => ws.current?.readyState === WebSocket.OPEN, []);

  const clearConversation = useCallback(() => {
    if (trackConversation) {
      setInternalConversation([]);
    }
  }, [trackConversation]);

  return {
    connect,
    disconnect,
    sendMessage,
    sendSettings,
    injectUserMessage,
    injectAgentMessage,
    sendFunctionCallResponse,
    sendKeepAlive,
    updatePrompt,
    sendMedia: sendBinary,
    isConnected,
    ...(trackState ? { state: internalState } : {}),
    ...(trackConversation ? { conversation: internalConversation, clearConversation } : {}),
    ...(trackAgentStatus ? { agentStatus: internalAgentStatus } : {})
  };
}
//# sourceMappingURL=useDeepgramVoiceAgent.js.map