react-native-deepgram

React Native SDK for Deepgram's AI-powered speech-to-text, real-time transcription, and text intelligence APIs. Supports live audio streaming, file transcription, sentiment analysis, and topic detection for iOS and Android.
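
Before the source, a minimal usage sketch of the useDeepgramVoiceAgent hook this file exports. The option and return names (defaultSettings, trackConversation, connect, disconnect, conversation) are taken from the compiled code below, which also reads the API key from globalThis.__DEEPGRAM_API_KEY__; the component wiring and the contents of the Settings payload are illustrative assumptions, not package documentation.

import React from 'react';
import { Button, Text, View } from 'react-native';
import { useDeepgramVoiceAgent } from 'react-native-deepgram';

// The hook reads the key from this global before opening the socket (placeholder value).
globalThis.__DEEPGRAM_API_KEY__ = 'YOUR_DEEPGRAM_API_KEY';

export function VoiceAgentScreen() {
  const { connect, disconnect, conversation = [] } = useDeepgramVoiceAgent({
    trackConversation: true, // opt in to the hook's internal transcript state
    defaultSettings: {
      // Assumed example payload; the hook merges this into the Settings message it sends on open.
      audio: { input: { encoding: 'linear16', sample_rate: 16000 } },
    },
    onError: err => console.warn('[agent]', err),
  });

  return (
    <View>
      <Button title="Start" onPress={() => connect()} />
      <Button title="Stop" onPress={disconnect} />
      {conversation.map((m, i) => (
        <Text key={i}>{`${m.role}: ${m.content}`}</Text>
      ))}
    </View>
  );
}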

useDeepgramVoiceAgent.js
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.useDeepgramVoiceAgent = useDeepgramVoiceAgent; var _react = require("react"); var _reactNative = require("react-native"); var _buffer = require("buffer"); var _NativeDeepgram = require("./NativeDeepgram.js"); var _askMicPermission = require("./helpers/askMicPermission.js"); const DEFAULT_AGENT_ENDPOINT = 'wss://agent.deepgram.com/v1/agent/converse'; const DEFAULT_INPUT_SAMPLE_RATE = 16_000; const BASE_NATIVE_SAMPLE_RATE = 16_000; const eventName = _reactNative.Platform.select({ ios: 'DeepgramAudioPCM', android: 'AudioChunk', default: 'DeepgramAudioPCM' }); const ensureArrayBuffer = data => { if (!data) return null; if (data instanceof ArrayBuffer) return data; if (ArrayBuffer.isView(data)) { const view = data; if (view.buffer instanceof ArrayBuffer) { return view.buffer.slice(view.byteOffset, view.byteOffset + view.byteLength); } const copy = new Uint8Array(view.byteLength); copy.set(new Uint8Array(view.buffer, view.byteOffset, view.byteLength)); return copy.buffer; } return null; }; const isPlainObject = value => typeof value === 'object' && value !== null && !Array.isArray(value); const cloneValue = value => { if (Array.isArray(value)) { return value.map(item => cloneValue(item)); } if (isPlainObject(value)) { const cloned = {}; Object.entries(value).forEach(([key, entryValue]) => { cloned[key] = cloneValue(entryValue); }); return cloned; } return value; }; const mergePlainObjects = (base, override) => { if (!base && !override) { return undefined; } if (!base) { return override ? cloneValue(override) : undefined; } const result = cloneValue(base); if (!override) { return result; } Object.entries(override).forEach(([key, overrideValue]) => { if (overrideValue === undefined) { result[key] = undefined; return; } if (isPlainObject(overrideValue)) { const existing = result[key]; result[key] = mergePlainObjects(isPlainObject(existing) ? existing : undefined, overrideValue); return; } if (Array.isArray(overrideValue)) { result[key] = overrideValue.map(item => cloneValue(item)); return; } result[key] = overrideValue; }); return result; }; const hasKeys = (value, keys) => isPlainObject(value) && keys.every(key => key in value); const computeDownsampleFactor = (target, base = BASE_NATIVE_SAMPLE_RATE) => { if (!target || target >= base || base <= 0) { return 1; } const ratio = Math.round(base / target); return ratio > 0 ? 
ratio : 1; }; const resolveDownsampleFactor = (overrideFactor, targetSampleRate, nativeSampleRate) => { if (overrideFactor == null) { return computeDownsampleFactor(targetSampleRate, nativeSampleRate); } const normalized = Math.max(1, Math.round(overrideFactor)); if (!nativeSampleRate || !targetSampleRate) { return normalized; } if (nativeSampleRate <= targetSampleRate) { return 1; } return normalized; }; function useDeepgramVoiceAgent({ endpoint = DEFAULT_AGENT_ENDPOINT, defaultSettings, autoStartMicrophone = true, autoPlayAudio = true, trackState = false, trackConversation = false, trackAgentStatus = false, downsampleFactor, onBeforeConnect, onConnect, onClose, onError, onMessage, onWelcome, onSettingsApplied, onConversationText, onAgentThinking, onAgentStartedSpeaking, onAgentAudioDone, onUserStartedSpeaking, onFunctionCallRequest, onFunctionCallResponse, onPromptUpdated, onSpeakUpdated, onInjectionRefused, onWarning, onServerError, onAudioConfig, onAudio } = {}) { const ws = (0, _react.useRef)(null); const audioSub = (0, _react.useRef)(null); const nativeInputSampleRate = (0, _react.useRef)(BASE_NATIVE_SAMPLE_RATE); const targetInputSampleRate = (0, _react.useRef)(DEFAULT_INPUT_SAMPLE_RATE); const currentDownsample = (0, _react.useRef)(resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current)); const microphoneActive = (0, _react.useRef)(false); const defaultSettingsRef = (0, _react.useRef)(defaultSettings); const endpointRef = (0, _react.useRef)(endpoint); const onBeforeConnectRef = (0, _react.useRef)(onBeforeConnect); const onConnectRef = (0, _react.useRef)(onConnect); const onCloseRef = (0, _react.useRef)(onClose); const onErrorRef = (0, _react.useRef)(onError); const onMessageRef = (0, _react.useRef)(onMessage); const onWelcomeRef = (0, _react.useRef)(onWelcome); const onSettingsAppliedRef = (0, _react.useRef)(onSettingsApplied); const onConversationTextRef = (0, _react.useRef)(onConversationText); const onAgentThinkingRef = (0, _react.useRef)(onAgentThinking); const onAgentStartedSpeakingRef = (0, _react.useRef)(onAgentStartedSpeaking); const onAgentAudioDoneRef = (0, _react.useRef)(onAgentAudioDone); const onUserStartedSpeakingRef = (0, _react.useRef)(onUserStartedSpeaking); const onFunctionCallRequestRef = (0, _react.useRef)(onFunctionCallRequest); const onFunctionCallResponseRef = (0, _react.useRef)(onFunctionCallResponse); const onPromptUpdatedRef = (0, _react.useRef)(onPromptUpdated); const onSpeakUpdatedRef = (0, _react.useRef)(onSpeakUpdated); const onInjectionRefusedRef = (0, _react.useRef)(onInjectionRefused); const onWarningRef = (0, _react.useRef)(onWarning); const onServerErrorRef = (0, _react.useRef)(onServerError); const onAudioConfigRef = (0, _react.useRef)(onAudioConfig); const onAudioRef = (0, _react.useRef)(onAudio); const autoStartMicRef = (0, _react.useRef)(autoStartMicrophone); const [internalState, setInternalState] = (0, _react.useState)(() => ({ connectionState: 'idle', error: null, warning: null })); const [internalConversation, setInternalConversation] = (0, _react.useState)([]); const [internalAgentStatus, setInternalAgentStatus] = (0, _react.useState)(() => ({ thinking: null, latency: null })); const sanitizeAudioSettings = (0, _react.useCallback)(audio => { if (!audio) { return undefined; } const sanitized = {}; if (audio.input) { sanitized.input = { ...audio.input }; } if (audio.output) { sanitized.output = { ...audio.output }; } Object.entries(audio).forEach(([key, value]) => { if (key === 'input' || key 
=== 'output') { return; } let clonedValue = value; if (Array.isArray(value)) { clonedValue = value.map(item => cloneValue(item)); } else if (isPlainObject(value)) { clonedValue = cloneValue(value); } sanitized[key] = clonedValue; }); return Object.keys(sanitized).length > 0 ? sanitized : undefined; }, []); const sanitizeAgentConfig = (0, _react.useCallback)(agent => { if (!agent) { return undefined; } const sanitized = {}; Object.entries(agent).forEach(([key, value]) => { if (key === 'speak') { return; } let clonedValue = value; if (Array.isArray(value)) { clonedValue = value.map(item => cloneValue(item)); } else if (isPlainObject(value)) { clonedValue = cloneValue(value); } sanitized[key] = clonedValue; }); if (agent.speak) { sanitized.speak = { ...agent.speak }; if (agent.speak.provider) { sanitized.speak.provider = { ...agent.speak.provider }; } } return Object.keys(sanitized).length > 0 ? sanitized : undefined; }, []); const sanitizeSettings = (0, _react.useCallback)(settings => { if (!settings) { return undefined; } const sanitized = {}; Object.entries(settings).forEach(([key, value]) => { if (key === 'audio') { const audio = sanitizeAudioSettings(value); if (audio) { sanitized.audio = audio; } return; } if (key === 'agent') { const agent = sanitizeAgentConfig(value); if (agent) { sanitized.agent = agent; } return; } sanitized[key] = value; }); return Object.keys(sanitized).length > 0 ? sanitized : undefined; }, [sanitizeAgentConfig, sanitizeAudioSettings]); const mergeSettings = (0, _react.useCallback)((base, override) => mergePlainObjects(base, override), []); defaultSettingsRef.current = defaultSettings; endpointRef.current = endpoint; onBeforeConnectRef.current = onBeforeConnect; onConnectRef.current = onConnect; onCloseRef.current = onClose; onErrorRef.current = onError; onMessageRef.current = onMessage; onWelcomeRef.current = onWelcome; onSettingsAppliedRef.current = onSettingsApplied; onConversationTextRef.current = onConversationText; onAgentThinkingRef.current = onAgentThinking; onAgentStartedSpeakingRef.current = onAgentStartedSpeaking; onAgentAudioDoneRef.current = onAgentAudioDone; onUserStartedSpeakingRef.current = onUserStartedSpeaking; onFunctionCallRequestRef.current = onFunctionCallRequest; onFunctionCallResponseRef.current = onFunctionCallResponse; onPromptUpdatedRef.current = onPromptUpdated; onSpeakUpdatedRef.current = onSpeakUpdated; onInjectionRefusedRef.current = onInjectionRefused; onWarningRef.current = onWarning; onServerErrorRef.current = onServerError; onAudioConfigRef.current = onAudioConfig; onAudioRef.current = onAudio; autoStartMicRef.current = autoStartMicrophone; if (downsampleFactor != null) { currentDownsample.current = downsampleFactor; } const cleanup = (0, _react.useCallback)(() => { audioSub.current?.remove(); audioSub.current = null; if (microphoneActive.current) { _NativeDeepgram.Deepgram.stopRecording().catch(() => {}); microphoneActive.current = false; } // Cleanup audio session for playback _NativeDeepgram.Deepgram.stopAudio().catch(() => {}); const socket = ws.current; if (socket) { ws.current = null; try { if (socket.readyState === WebSocket.OPEN || socket.readyState === WebSocket.CONNECTING) { socket.close(1000, 'cleanup'); } else { socket.close(); } } catch { // ignore socket close errors } } }, []); (0, _react.useEffect)(() => () => cleanup(), [cleanup]); const handleMicChunk = (0, _react.useCallback)(ev => { const socket = ws.current; if (!socket || socket.readyState !== WebSocket.OPEN) { return; } if (typeof ev?.sampleRate === 
'number' && ev.sampleRate > 0) { if (ev.sampleRate !== nativeInputSampleRate.current) { nativeInputSampleRate.current = ev.sampleRate; currentDownsample.current = resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current); } } const factor = currentDownsample.current ?? 1; let chunk = null; if (typeof ev?.b64 === 'string') { const binary = Uint8Array.from(atob(ev.b64), c => c.charCodeAt(0)); let int16 = new Int16Array(binary.buffer); if (factor > 1 && int16.length >= factor) { const downsampled = new Int16Array(Math.floor(int16.length / factor)); for (let i = 0; i < downsampled.length; i++) { downsampled[i] = int16[i * factor]; } int16 = downsampled; } chunk = int16.buffer; } else if (Array.isArray(ev?.data)) { const bytes = new Uint8Array(ev.data.length); for (let i = 0; i < ev.data.length; i++) { const value = ev.data[i]; bytes[i] = value < 0 ? value + 256 : value; } const view = new DataView(bytes.buffer); const int16 = new Int16Array(bytes.length / 2); for (let i = 0; i < int16.length; i++) { int16[i] = view.getInt16(i * 2, true); } chunk = int16.buffer; } if (!chunk) { return; } try { socket.send(chunk); } catch (err) { onErrorRef.current?.(err); } }, [downsampleFactor, onErrorRef]); const handleSocketMessage = (0, _react.useCallback)(ev => { if (typeof ev.data === 'string') { try { const message = JSON.parse(ev.data); onMessageRef.current?.(message); switch (message.type) { case 'Welcome': if (hasKeys(message, ['request_id'])) { onWelcomeRef.current?.(message); } break; case 'SettingsApplied': onSettingsAppliedRef.current?.(message); break; case 'ConversationText': if (hasKeys(message, ['role', 'content'])) { const convMsg = message; if (trackConversation) { setInternalConversation(prev => [...prev, { role: convMsg.role, content: convMsg.content }]); } onConversationTextRef.current?.(convMsg); } break; case 'AgentThinking': if (hasKeys(message, ['content'])) { const thinkMsg = message; if (trackAgentStatus) { setInternalAgentStatus(prev => ({ ...prev, thinking: thinkMsg.content })); } onAgentThinkingRef.current?.(thinkMsg); } break; case 'AgentStartedSpeaking': { const speakMsg = message; if (trackAgentStatus) { setInternalAgentStatus(prev => ({ ...prev, latency: { total: speakMsg.total_latency, tts: speakMsg.tts_latency, ttt: speakMsg.ttt_latency } })); } onAgentStartedSpeakingRef.current?.(speakMsg); } break; case 'AgentAudioDone': { const doneMsg = message; if (trackAgentStatus) { setInternalAgentStatus({ thinking: null, latency: null }); } onAgentAudioDoneRef.current?.(doneMsg); } break; case 'UserStartedSpeaking': onUserStartedSpeakingRef.current?.(message); break; case 'FunctionCallRequest': if (hasKeys(message, ['functions'])) { onFunctionCallRequestRef.current?.(message); } break; case 'FunctionCallResponse': if (hasKeys(message, ['id', 'name'])) { onFunctionCallResponseRef.current?.(message); } break; case 'PromptUpdated': onPromptUpdatedRef.current?.(message); break; case 'SpeakUpdated': onSpeakUpdatedRef.current?.(message); break; case 'Audio': // Audio binary data will be handled by onmessage binary path break; case 'AudioConfig': if (hasKeys(message, ['sample_rate'])) { const configMsg = message; if (autoPlayAudio) { const sampleRate = configMsg.sample_rate || DEFAULT_INPUT_SAMPLE_RATE; const channels = configMsg.channels || 1; _NativeDeepgram.Deepgram.startPlayer?.(sampleRate, channels); } onAudioConfigRef.current?.(configMsg); } break; case 'InjectionRefused': if (hasKeys(message, ['message'])) { 
onInjectionRefusedRef.current?.(message); } break; case 'Warning': if (hasKeys(message, ['description'])) { const warnMsg = message; if (trackState) { setInternalState(prev => ({ ...prev, warning: warnMsg.description })); } onWarningRef.current?.(warnMsg); } break; case 'Error': { const description = typeof message.description === 'string' ? message.description : undefined; const code = typeof message.code === 'string' ? message.code : undefined; const errorMsg = description ?? code ?? 'Voice agent error'; if (trackState) { setInternalState(prev => ({ ...prev, connectionState: 'disconnected', error: errorMsg })); } if (description || code) { onServerErrorRef.current?.(message); } onErrorRef.current?.(new Error(errorMsg)); } break; default: break; } } catch (err) { onErrorRef.current?.(err); } return; } const buffer = ensureArrayBuffer(ev.data); if (buffer) { if (autoPlayAudio) { try { const bytes = new Uint8Array(buffer); const b64 = _buffer.Buffer.from(bytes).toString('base64'); _NativeDeepgram.Deepgram.feedAudio?.(b64); } catch (err) { console.warn('[VoiceAgent] Auto-feed audio error:', err); } } onAudioRef.current?.(buffer); } }, [onAgentAudioDoneRef, onAgentStartedSpeakingRef, onAgentThinkingRef, onConversationTextRef, onErrorRef, onFunctionCallRequestRef, onFunctionCallResponseRef, onInjectionRefusedRef, onMessageRef, onPromptUpdatedRef, onServerErrorRef, onSettingsAppliedRef, onSpeakUpdatedRef, onUserStartedSpeakingRef, onWarningRef, autoPlayAudio, trackAgentStatus, trackConversation, trackState]); const sendJsonMessage = (0, _react.useCallback)(message => { const socket = ws.current; if (!socket || socket.readyState !== WebSocket.OPEN) { return false; } try { socket.send(JSON.stringify(message)); return true; } catch (err) { onErrorRef.current?.(err); return false; } }, []); const sendBinary = (0, _react.useCallback)(chunk => { const socket = ws.current; if (!socket || socket.readyState !== WebSocket.OPEN) { return false; } let payload = null; if (chunk instanceof ArrayBuffer) { payload = chunk; } else if (chunk instanceof Uint8Array) { if (chunk.buffer instanceof ArrayBuffer) { payload = chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength); } else { const copy = new Uint8Array(chunk.byteLength); copy.set(chunk); payload = copy.buffer; } } else if (Array.isArray(chunk)) { const uint = new Uint8Array(chunk.length); for (let i = 0; i < chunk.length; i++) { uint[i] = chunk[i]; } payload = uint.buffer; } if (!payload) return false; try { socket.send(payload); return true; } catch (err) { onErrorRef.current?.(err); return false; } }, []); const connect = (0, _react.useCallback)(async overrideSettings => { cleanup(); if (trackState) { setInternalState({ connectionState: 'connecting', error: null, warning: null }); } if (trackConversation) { setInternalConversation([]); } if (trackAgentStatus) { setInternalAgentStatus({ thinking: null, latency: null }); } onBeforeConnectRef.current?.(); const apiKey = globalThis.__DEEPGRAM_API_KEY__; if (!apiKey) throw new Error('Deepgram API key missing'); const shouldCaptureMic = autoStartMicRef.current; if (shouldCaptureMic) { const granted = await (0, _askMicPermission.askMicPermission)(); if (!granted) { throw new Error('Microphone permission denied'); } await _NativeDeepgram.Deepgram.startRecording(); microphoneActive.current = true; const emitter = new _reactNative.NativeEventEmitter(_reactNative.NativeModules.Deepgram); if (eventName) { audioSub.current = emitter.addListener(eventName, handleMicChunk); } } else { // Only 
initialize audio session for playback if not recording // (startRecording already activates the audio session) await _NativeDeepgram.Deepgram.startAudio(); } const sanitizedDefault = sanitizeSettings(defaultSettingsRef.current); const sanitizedOverride = sanitizeSettings(overrideSettings); const merged = mergeSettings(sanitizedDefault, sanitizedOverride); const mergedSettings = { type: 'Settings', ...(merged ?? {}) }; const targetSampleRate = overrideSettings?.audio?.input?.sample_rate ?? defaultSettingsRef.current?.audio?.input?.sample_rate ?? DEFAULT_INPUT_SAMPLE_RATE; targetInputSampleRate.current = targetSampleRate; currentDownsample.current = resolveDownsampleFactor(downsampleFactor, targetInputSampleRate.current, nativeInputSampleRate.current); const socket = new WebSocket(endpointRef.current, undefined, { headers: { Authorization: `Token ${apiKey}` } }); socket.binaryType = 'arraybuffer'; ws.current = socket; socket.onopen = () => { sendJsonMessage(mergedSettings); if (trackState) { setInternalState(prev => ({ ...prev, connectionState: 'connected' })); } if (autoPlayAudio) { const sampleRate = merged?.audio?.output?.sample_rate ?? DEFAULT_INPUT_SAMPLE_RATE; const channels = 1; _NativeDeepgram.Deepgram.startPlayer?.(sampleRate, channels); } onConnectRef.current?.(); }; socket.onmessage = handleSocketMessage; socket.onerror = err => { onErrorRef.current?.(err); }; socket.onclose = event => { if (trackState) { setInternalState(prev => ({ ...prev, connectionState: 'disconnected' })); } cleanup(); onCloseRef.current?.(event); }; }, [cleanup, downsampleFactor, handleMicChunk, handleSocketMessage, mergeSettings, sanitizeSettings, sendJsonMessage, autoPlayAudio, trackAgentStatus, trackConversation, trackState]); const disconnect = (0, _react.useCallback)(() => { cleanup(); }, [cleanup]); const sendSettings = (0, _react.useCallback)(settings => { const sanitized = sanitizeSettings(settings); return sendJsonMessage({ type: 'Settings', ...(sanitized ?? {}) }); }, [sanitizeSettings, sendJsonMessage]); const injectUserMessage = (0, _react.useCallback)(content => sendJsonMessage({ type: 'InjectUserMessage', content }), [sendJsonMessage]); const injectAgentMessage = (0, _react.useCallback)(message => sendJsonMessage({ type: 'InjectAgentMessage', message }), [sendJsonMessage]); const sendFunctionCallResponse = (0, _react.useCallback)(response => sendJsonMessage({ type: 'FunctionCallResponse', ...response }), [sendJsonMessage]); const sendKeepAlive = (0, _react.useCallback)(() => sendJsonMessage({ type: 'KeepAlive' }), [sendJsonMessage]); const updatePrompt = (0, _react.useCallback)(prompt => sendJsonMessage({ type: 'UpdatePrompt', prompt }), [sendJsonMessage]); const sendMessage = (0, _react.useCallback)(message => sendJsonMessage(message), [sendJsonMessage]); const isConnected = (0, _react.useCallback)(() => ws.current?.readyState === WebSocket.OPEN, []); const clearConversation = (0, _react.useCallback)(() => { if (trackConversation) { setInternalConversation([]); } }, [trackConversation]); return { connect, disconnect, sendMessage, sendSettings, injectUserMessage, injectAgentMessage, sendFunctionCallResponse, sendKeepAlive, updatePrompt, sendMedia: sendBinary, isConnected, ...(trackState ? { state: internalState } : {}), ...(trackConversation ? { conversation: internalConversation, clearConversation } : {}), ...(trackAgentStatus ? { agentStatus: internalAgentStatus } : {}) }; } //# sourceMappingURL=useDeepgramVoiceAgent.js.map
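
The message switch above only forwards FunctionCallRequest events to the onFunctionCallRequest callback; answering them is left to the app via sendFunctionCallResponse, which the hook wraps as { type: 'FunctionCallResponse', ...response }. A hedged sketch of that round trip, assuming each entry in the message's functions array carries id, name, and a JSON-encoded arguments string as in Deepgram's Voice Agent protocol; lookUpWeather is a hypothetical app-side helper, stubbed here:

import { useDeepgramVoiceAgent } from 'react-native-deepgram';

// Hypothetical stand-in for a real data source.
const lookUpWeather = city => ({ city, tempC: 21, summary: 'clear' });

export function useWeatherAgent() {
  const agent = useDeepgramVoiceAgent({
    onFunctionCallRequest: msg => {
      for (const fn of msg.functions ?? []) {
        // The id/name/arguments field names are assumed from Deepgram's protocol docs.
        const args = JSON.parse(fn.arguments ?? '{}');
        const result =
          fn.name === 'get_weather'
            ? lookUpWeather(args.city)
            : { error: `unknown function: ${fn.name}` };
        agent.sendFunctionCallResponse({
          id: fn.id,
          name: fn.name,
          content: JSON.stringify(result),
        });
      }
    },
  });
  return agent;
}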
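
A note on the capture path: handleMicChunk decimates rather than resamples, keeping every Nth Int16 sample with no filtering, and computeDownsampleFactor rounds the native-to-target ratio, so the result is only exact when the native rate is an integer multiple of the target. A worked illustration of the same arithmetic:

// 48 kHz native capture, default 16 kHz target:
//   computeDownsampleFactor(16000, 48000) === Math.round(48000 / 16000) === 3
// handleMicChunk then keeps samples 0, 3, 6, ... yielding a true 16 kHz stream.
// A non-integer ratio such as 44100 -> 16000 also rounds to 3, which actually
// produces ~14.7 kHz audio, so passing an explicit downsampleFactor (or picking
// a matching input sample_rate) may be preferable on such devices.
const factor = Math.round(48000 / 16000); // 3, as computeDownsampleFactor returns
const input = new Int16Array([0, 1, 2, 3, 4, 5, 6, 7, 8]);
const out = new Int16Array(Math.floor(input.length / factor));
for (let i = 0; i < out.length; i++) {
  out[i] = input[i * factor]; // same decimation loop as handleMicChunk
}
// out -> Int16Array [0, 3, 6]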