UNPKG

@mazka/react-speech-to-text

Version:

A powerful, TypeScript-first React hook for speech recognition using the Web Speech API. This library provides a simple yet comprehensive interface for converting speech to text in React applications.

462 lines (461 loc) 14.8 kB
// src/use-speech-to-text.ts
import { useCallback, useEffect, useRef, useState } from "react";

/**
 * React hook wrapping the browser Web Speech API (SpeechRecognition) for
 * speech-to-text.
 *
 * @param {object} [initialOptions] - Recognition options merged over the
 *   defaults: `continuous`, `interimResults`, `maxAlternatives`, `language`,
 *   and `autoStopOnSilence` ({ enabled, silenceDuration, onAutoStop }).
 * @returns {object} The recognition state (isListening, isSupported,
 *   transcript, interimTranscript, finalTranscript, results, error,
 *   isInitializing, isAutoStopping, isPaused, lastSpeechTimestamp) plus the
 *   actions startListening, stopListening, abortListening, pauseListening,
 *   resumeListening, resetTranscript and clearError.
 */
var useSpeechToText = (initialOptions) => {
  const [state, setState] = useState({
    isListening: false,
    isSupported: false,
    transcript: "",
    interimTranscript: "",
    finalTranscript: "",
    results: [],
    error: null,
    isInitializing: true,
    isAutoStopping: false,
    isPaused: false,
    lastSpeechTimestamp: null
  });

  const DEFAULT_OPTIONS = {
    continuous: true,
    interimResults: true,
    maxAlternatives: 1,
    language: "en-US",
    autoStopOnSilence: {
      enabled: false,
      silenceDuration: 3e3 // Default to 3 seconds
    }
  };

  // The live SpeechRecognition instance (or null before init / after teardown).
  const recognitionRef = useRef(null);
  // Timestamp of the most recent speech activity (informational).
  const lastSpeechTimeRef = useRef(null);
  // Pending auto-stop-on-silence timer id, if armed.
  const silenceTimeoutRef = useRef(null);
  // Mirror of the final/interim transcript so timer callbacks read fresh text
  // without being re-created on every state change.
  const transcriptRef = useRef({ final: "", interim: "" });
  // Effective options; mutated in place so event handlers see updates.
  const optionsRef = useRef({ ...DEFAULT_OPTIONS, ...initialOptions });

  // Resolve the vendor-prefixed SpeechRecognition constructor, or null when
  // unavailable (including during SSR where `window` is undefined).
  const getBrowserSpeechRecognition = useCallback(() => {
    if (typeof window === "undefined") return null;
    return (
      window.SpeechRecognition ||
      window.webkitSpeechRecognition ||
      window.mozSpeechRecognition ||
      window.msSpeechRecognition ||
      null
    );
  }, []);

  // Sniff the user agent to produce a human-readable support verdict.
  // Order matters: Brave and Opera embed "chrome" in their UA, so they are
  // checked before the Chrome branch.
  const checkBrowserCompatibility = useCallback(() => {
    if (typeof window === "undefined") {
      return {
        isSupported: false,
        browserName: "Unknown",
        reason: "Not running in browser environment"
      };
    }
    const userAgent = navigator.userAgent.toLowerCase();
    const isBrave = "brave" in navigator && navigator.brave !== void 0;
    const isChrome =
      userAgent.includes("chrome") && !userAgent.includes("edg") && !isBrave;
    const isEdge = userAgent.includes("edg");
    const isSafari =
      userAgent.includes("safari") && !userAgent.includes("chrome");
    const isFirefox = userAgent.includes("firefox");
    const isOpera = userAgent.includes("opr") || userAgent.includes("opera");
    const speechRecognitionExists = getBrowserSpeechRecognition() !== null;
    if (isBrave) {
      return {
        isSupported: false,
        browserName: "Brave",
        reason:
          "Brave Browser does not support Web Speech API for privacy reasons"
      };
    }
    if (isFirefox) {
      return {
        isSupported: false,
        browserName: "Firefox",
        reason: "Firefox does not support Web Speech API"
      };
    }
    if (isOpera) {
      return {
        isSupported: speechRecognitionExists,
        browserName: "Opera",
        reason: speechRecognitionExists
          ? ""
          : "Opera has limited Web Speech API support"
      };
    }
    if (isChrome) {
      return {
        isSupported: speechRecognitionExists,
        browserName: "Chrome",
        reason: speechRecognitionExists
          ? ""
          : "Chrome should support Web Speech API but it's not available"
      };
    }
    if (isEdge) {
      return {
        isSupported: speechRecognitionExists,
        browserName: "Edge",
        reason: speechRecognitionExists
          ? ""
          : "Edge should support Web Speech API but it's not available"
      };
    }
    if (isSafari) {
      const isMobile = /iPhone|iPad|iPod/i.test(navigator.userAgent);
      const isMac = /(Macintosh|Mac OS)/.test(navigator.userAgent);
      if (isMobile) {
        // iOS gained SpeechRecognition in 14.5; parse "OS 14_5" from the UA.
        const iosVersion = navigator.userAgent.match(/OS (\d+)_(\d+)/);
        if (iosVersion) {
          const majorVersion = Number.parseInt(iosVersion[1], 10);
          const minorVersion = Number.parseInt(iosVersion[2], 10);
          const isSupported =
            majorVersion > 14 || (majorVersion === 14 && minorVersion >= 5);
          return {
            isSupported: isSupported && speechRecognitionExists,
            browserName: "Safari (iOS)",
            reason: !isSupported
              ? "Web Speech API requires iOS 14.5 or later"
              : !speechRecognitionExists
                ? "Web Speech API is not available"
                : ""
          };
        }
      }
      if (isMac) {
        return {
          isSupported: speechRecognitionExists,
          browserName: "Safari (macOS)",
          reason: speechRecognitionExists
            ? ""
            : "Web Speech API requires Safari 14.1 or later"
        };
      }
      return {
        isSupported: speechRecognitionExists,
        browserName: "Safari",
        reason: speechRecognitionExists
          ? ""
          : "Web Speech API may not be supported on this Safari version"
      };
    }
    return {
      isSupported: speechRecognitionExists,
      browserName: "Unknown Browser",
      reason: speechRecognitionExists
        ? ""
        : "This browser does not support Web Speech API"
    };
  }, [getBrowserSpeechRecognition]);

  // (Re)arm the auto-stop-on-silence timer. Called on start and on every
  // result so the timer only fires after `silenceDuration` ms of no speech.
  const resetSilenceTimeout = useCallback(() => {
    if (silenceTimeoutRef.current) {
      clearTimeout(silenceTimeoutRef.current);
      silenceTimeoutRef.current = null;
    }
    if (optionsRef.current.autoStopOnSilence?.enabled) {
      silenceTimeoutRef.current = window.setTimeout(() => {
        if (recognitionRef.current) {
          recognitionRef.current.stop();
        }
        // Only notify when there is accumulated final text to hand back.
        if (
          optionsRef.current.autoStopOnSilence?.onAutoStop &&
          transcriptRef.current.final
        ) {
          optionsRef.current.autoStopOnSilence.onAutoStop(
            transcriptRef.current.final
          );
        }
      }, optionsRef.current.autoStopOnSilence.silenceDuration);
    }
  }, []);

  // Build and configure a SpeechRecognition instance with all event handlers
  // attached. Returns the instance, or null (with error state set) when the
  // browser is unsupported or construction throws.
  const initializeSpeechRecognition = useCallback(() => {
    const browserCheck = checkBrowserCompatibility();
    if (!browserCheck.isSupported) {
      setState((prev) => ({
        ...prev,
        isSupported: false,
        isInitializing: false,
        error: {
          code: "NOT_SUPPORTED",
          message: browserCheck.reason,
          name: "NotSupportedError",
          browserInfo: {
            browserName: browserCheck.browserName,
            reason: browserCheck.reason
          }
        }
      }));
      return null;
    }
    const SpeechRecognitionConstructor = getBrowserSpeechRecognition();
    if (!SpeechRecognitionConstructor) {
      setState((prev) => ({
        ...prev,
        isSupported: false,
        isInitializing: false,
        error: {
          code: "NOT_SUPPORTED",
          message: "Speech recognition is not supported in this browser",
          name: "NotSupportedError",
          browserInfo: {
            browserName: browserCheck.browserName,
            reason: "Web Speech API constructor not found"
          }
        }
      }));
      return null;
    }
    try {
      const recognition = new SpeechRecognitionConstructor();
      recognition.continuous = optionsRef.current.continuous ?? true;
      recognition.interimResults = optionsRef.current.interimResults ?? true;
      recognition.maxAlternatives = optionsRef.current.maxAlternatives ?? 1;
      recognition.lang = optionsRef.current.language ?? "en-US";

      recognition.onstart = () => {
        lastSpeechTimeRef.current = /* @__PURE__ */ new Date();
        setState((prev) => ({
          ...prev,
          isListening: true,
          error: null,
          lastSpeechTimestamp: /* @__PURE__ */ new Date()
        }));
        resetSilenceTimeout();
      };

      recognition.onresult = (event) => {
        let interimTranscript = "";
        let finalTranscript = "";
        const results = [];
        // Speech activity detected: push the silence deadline out again.
        resetSilenceTimeout();
        // Only walk results added since the last event (resultIndex onward).
        for (let i = event.resultIndex; i < event.results.length; i++) {
          const result = event.results[i];
          const transcript = result[0].transcript;
          const confidence = result[0].confidence;
          results.push({
            transcript,
            confidence,
            isFinal: result.isFinal,
            timestamp: /* @__PURE__ */ new Date()
          });
          if (result.isFinal) {
            finalTranscript += transcript;
          } else {
            interimTranscript += transcript;
          }
        }
        setState((prev) => {
          // Keep the ref mirror in sync for the silence-timeout callback.
          transcriptRef.current = {
            final: prev.finalTranscript + finalTranscript,
            interim: interimTranscript
          };
          return {
            ...prev,
            transcript: finalTranscript + interimTranscript,
            interimTranscript,
            finalTranscript: prev.finalTranscript + finalTranscript,
            results: [...prev.results, ...results]
          };
        });
      };

      recognition.onerror = (event) => {
        let errorMessage =
          "An unknown error occurred during speech recognition";
        switch (event.error) {
          case "no-speech":
            errorMessage = "No speech was detected";
            break;
          case "audio-capture":
            errorMessage = "Audio capture failed";
            break;
          case "not-allowed":
            errorMessage = "Permission to use microphone was denied";
            break;
          case "network":
            errorMessage = "Network error occurred";
            break;
          case "service-not-allowed":
            errorMessage = "Speech recognition service is not allowed";
            break;
          case "bad-grammar":
            errorMessage = "Grammar compilation failed";
            break;
          case "language-not-supported":
            errorMessage = "Language is not supported";
            break;
          case "aborted":
            errorMessage = "Speech recognition was aborted";
            break;
        }
        setState((prev) => ({
          ...prev,
          isListening: false,
          error: {
            // BUGFIX: use a global regex — String.replace("-", "_") only
            // replaces the FIRST hyphen, so multi-hyphen codes such as
            // "service-not-allowed" became "SERVICE_NOT-ALLOWED".
            code: event.error.toUpperCase().replace(/-/g, "_"),
            message: errorMessage,
            name: "SpeechRecognitionError"
          }
        }));
      };

      recognition.onend = () => {
        if (silenceTimeoutRef.current) {
          clearTimeout(silenceTimeoutRef.current);
          silenceTimeoutRef.current = null;
        }
        setState((prev) => ({ ...prev, isListening: false }));
      };

      setState((prev) => ({
        ...prev,
        isSupported: true,
        isInitializing: false
      }));
      return recognition;
    } catch (error) {
      setState((prev) => ({
        ...prev,
        isSupported: false,
        isInitializing: false,
        error: {
          code: "INITIALIZATION_ERROR",
          message:
            error instanceof Error
              ? error.message
              : "Failed to initialize speech recognition",
          name: "InitializationError"
        }
      }));
      return null;
    }
  }, [
    getBrowserSpeechRecognition,
    checkBrowserCompatibility,
    resetSilenceTimeout
  ]);

  // Begin recognition. Optional per-call `options` are merged over the
  // current options before the instance is (re)configured and started.
  const startListening = useCallback(
    (options) => {
      if (!state.isSupported) {
        const browserCheck = checkBrowserCompatibility();
        setState((prev) => ({
          ...prev,
          error: {
            code: "NOT_SUPPORTED",
            message: "Speech recognition is not supported",
            name: "NotSupportedError",
            browserInfo: {
              browserName: browserCheck.browserName,
              reason: browserCheck.reason
            }
          }
        }));
        return;
      }
      if (state.isListening) {
        return;
      }
      if (options) {
        optionsRef.current = { ...optionsRef.current, ...options };
      }
      const autoStopEnabled =
        optionsRef.current.autoStopOnSilence?.enabled === true;
      setState((prev) => ({ ...prev, isAutoStopping: autoStopEnabled }));
      if (!recognitionRef.current) {
        recognitionRef.current = initializeSpeechRecognition();
      }
      if (!recognitionRef.current) {
        return;
      }
      try {
        // Re-apply options in case they changed since initialization.
        recognitionRef.current.continuous =
          optionsRef.current.continuous ?? true;
        recognitionRef.current.interimResults =
          optionsRef.current.interimResults ?? true;
        recognitionRef.current.maxAlternatives =
          optionsRef.current.maxAlternatives ?? 1;
        recognitionRef.current.lang = optionsRef.current.language ?? "en-US";
        recognitionRef.current.start();
      } catch (error) {
        setState((prev) => ({
          ...prev,
          error: {
            code: "START_ERROR",
            message:
              error instanceof Error
                ? error.message
                : "Failed to start speech recognition",
            name: "StartError"
          }
        }));
      }
    },
    [
      state.isSupported,
      state.isListening,
      initializeSpeechRecognition,
      checkBrowserCompatibility
    ]
  );

  // Graceful stop: lets the engine deliver any final results via onresult
  // before onend fires.
  const stopListening = useCallback(() => {
    if (recognitionRef.current && state.isListening) {
      if (silenceTimeoutRef.current) {
        clearTimeout(silenceTimeoutRef.current);
        silenceTimeoutRef.current = null;
      }
      recognitionRef.current.stop();
    }
  }, [state.isListening]);

  // Hard stop: discards pending results immediately.
  const abortListening = useCallback(() => {
    if (recognitionRef.current) {
      if (silenceTimeoutRef.current) {
        clearTimeout(silenceTimeoutRef.current);
        silenceTimeoutRef.current = null;
      }
      recognitionRef.current.abort();
    }
  }, []);

  // Pause by aborting the engine while flagging isPaused so resumeListening
  // can restart the same instance.
  const pauseListening = useCallback(() => {
    if (recognitionRef.current && state.isListening) {
      recognitionRef.current.abort();
      setState((prev) => ({ ...prev, isPaused: true, isListening: false }));
    }
  }, [state.isListening]);

  // Resume after pauseListening: restart the instance and re-arm silence
  // detection.
  const resumeListening = useCallback(() => {
    if (recognitionRef.current && state.isPaused) {
      recognitionRef.current.start();
      setState((prev) => ({ ...prev, isPaused: false, isListening: true }));
      resetSilenceTimeout();
      lastSpeechTimeRef.current = /* @__PURE__ */ new Date();
    }
  }, [state.isPaused, resetSilenceTimeout]);

  // Clear all accumulated transcript text and results.
  const resetTranscript = useCallback(() => {
    transcriptRef.current = { final: "", interim: "" };
    setState((prev) => ({
      ...prev,
      transcript: "",
      interimTranscript: "",
      finalTranscript: "",
      results: []
    }));
  }, []);

  // Dismiss the current error without touching any other state.
  const clearError = useCallback(() => {
    setState((prev) => ({ ...prev, error: null }));
  }, []);

  // Eagerly initialize on mount (and when the initializer identity changes);
  // tear down the instance and any pending timer on cleanup.
  useEffect(() => {
    recognitionRef.current = initializeSpeechRecognition();
    return () => {
      if (recognitionRef.current) {
        recognitionRef.current.abort();
        recognitionRef.current = null;
      }
      if (silenceTimeoutRef.current) {
        clearTimeout(silenceTimeoutRef.current);
        silenceTimeoutRef.current = null;
      }
    };
  }, [initializeSpeechRecognition]);

  // Unmount-only safety net. NOTE(review): largely redundant with the effect
  // above; kept to preserve the original cleanup behavior exactly.
  useEffect(() => {
    return () => {
      if (recognitionRef.current) {
        recognitionRef.current.abort();
      }
      if (silenceTimeoutRef.current) {
        clearTimeout(silenceTimeoutRef.current);
        silenceTimeoutRef.current = null;
      }
    };
  }, []);

  return {
    // State
    ...state,
    // Actions
    startListening,
    stopListening,
    abortListening,
    pauseListening,
    resumeListening,
    resetTranscript,
    clearError
  };
};
export { useSpeechToText };