@swankylegg/voice-io
Version:
A browser-based speech recognition and synthesis assistant
457 lines (452 loc) • 17.2 kB
JavaScript
'use strict';
Object.defineProperty(exports, '__esModule', { value: true });
/**
 * The four states a VoiceIO instance can be in.
 * Keys and values are identical strings so a state can be validated, compared
 * and logged without a lookup table.
 *
 * Frozen because the same object is handed out on every instance (as
 * `instance.states`) and is used to validate state transitions; an accidental
 * write by a consumer would otherwise corrupt every instance at once.
 */
const STATES = Object.freeze({
    IDLE: 'IDLE',
    LISTENING: 'LISTENING',
    THINKING: 'THINKING',
    RESPONDING: 'RESPONDING',
});
// Resolve the SpeechRecognition constructor, falling back to the
// webkit-prefixed name. The `typeof window` guard keeps merely importing this
// module from throwing a ReferenceError where there is no `window` global
// (server-side rendering, Node-based test runners); the value is then
// `undefined`, which initRecognizer() already reports as "not supported".
const SpeechRecognitionAPI = typeof window !== 'undefined'
    ? (window.SpeechRecognition || window.webkitSpeechRecognition)
    : undefined;
/**
 * Default configuration for a VoiceIO instance.
 *
 * The object and its nested `synthesis` / `recognition` sections are frozen:
 * they are shared by every instance and are only ever meant to be copied into
 * a per-instance config, never written to.
 * @type {VoiceIOConfig}
 */
const DEFAULT_CONFIG = Object.freeze({
    // Event handlers (all optional; left undefined means "no callback")
    onListenStart: undefined,
    onListenEnd: undefined,
    onRecognitionResult: undefined,
    onVoiceStart: undefined,
    onVoiceEnd: undefined,
    onError: undefined,
    // Synthesis settings, copied onto each SpeechSynthesisUtterance
    synthesis: Object.freeze({
        pitch: 1,
        rate: 1,
    }),
    // Recognition settings, copied onto the SpeechRecognition instance
    recognition: Object.freeze({
        continuous: true,
        interimResults: true,
        lang: 'en-US',
        maxAlternatives: 3,
    }),
    // Callbacks fired when the language / voice lists become available
    onLanguagesLoaded: undefined,
    onVoicesLoaded: undefined,
});
/**
 * VoiceIO - A class to handle browser-based speech recognition and synthesis.
 *
 * Wraps the Web Speech API behind a small state machine (IDLE, LISTENING,
 * THINKING, RESPONDING). Callers drive it through setState(); recognition
 * results, voice events and errors are reported through the callbacks passed
 * in the config.
 * @class
 * @throws {Error} If speech recognition or synthesis is not supported by the browser
 */
class VoiceIO {
    /**
     * Creates a new VoiceIO instance
     * @param {Partial<VoiceIOConfig>} config - Configuration options
     * @param {(() => void)} [config.onListenStart] - Callback when speech recognition starts
     * @param {(() => void)} [config.onListenEnd] - Callback when speech recognition ends
     * @param {((results: RecognitionResult[][], bestTranscript: string, accumulatedTranscript: string) => void)} [config.onRecognitionResult] - Callback for speech recognition results
     * @param {((utterance: SpeechSynthesisUtterance) => void)} [config.onVoiceStart] - Callback when speech synthesis starts
     * @param {((utterance: SpeechSynthesisUtterance) => void)} [config.onVoiceEnd] - Callback when speech synthesis ends
     * @param {((error: SpeechRecognitionErrorEvent | SpeechSynthesisErrorEvent) => void)} [config.onError] - Callback for error handling
     * @param {((languages: LanguageInfo[]) => void)} [config.onLanguagesLoaded] - Callback when available languages are loaded
     * @param {((voices: SpeechSynthesisVoice[]) => void)} [config.onVoicesLoaded] - Callback when available voices are loaded
     * @throws {Error} If speech recognition or synthesis is not supported
     */
    constructor(config = {}) {
        // Check browser support first. The typeof guard turns "no window at
        // all" into the same descriptive error instead of a ReferenceError.
        if (typeof window === 'undefined' ||
            (!('SpeechRecognition' in window) && !('webkitSpeechRecognition' in window))) {
            throw new Error('Speech recognition is not supported in this browser');
        }
        if (!('speechSynthesis' in window)) {
            throw new Error('Speech synthesis is not supported in this browser');
        }
        // Exposed on the instance so callers can use the states without a separate import
        this.states = STATES;
        this.state = STATES.IDLE;
        this.recognitionResults = [];
        this.accumulatedTranscript = '';
        this.selectedVoice = null;
        // The utterance whose end/error events are still allowed to change state
        this.currentUtterance = null;
        this.voices = [];
        this.voicesLoaded = false;
        this.recognitionLanguages = [
            { code: 'en-US', name: 'English' },
            { code: 'es-ES', name: 'Español' },
            { code: 'fr-FR', name: 'Français' },
            { code: 'de-DE', name: 'Deutsch' },
            { code: 'it-IT', name: 'Italiano' },
            { code: 'ja-JP', name: '日本語' },
            { code: 'ko-KR', name: '한국어' },
            { code: 'zh-CN', name: '中文' }
        ];
        // Merge configs. The nested sections must come AFTER the top-level
        // spread: otherwise a partial `config.recognition` / `config.synthesis`
        // replaces the merged section wholesale (dropping the defaults) and
        // later writes to this.config.* would land on the caller's own object.
        const userConfig = config ?? {};
        this.config = {
            ...DEFAULT_CONFIG,
            ...userConfig,
            synthesis: { ...DEFAULT_CONFIG.synthesis, ...userConfig.synthesis },
            recognition: { ...DEFAULT_CONFIG.recognition, ...userConfig.recognition }
        };
        this.selectedLanguage = this.config.recognition.lang;
        // Initialize recognizer first
        this.initRecognizer();
        // Initialize synthesizer and set up voice loading
        this.synthesizer = window.speechSynthesis;
        // Register the listener before the first read so an asynchronous
        // voice load cannot slip through between the two
        this.synthesizer.onvoiceschanged = () => {
            this.voices = this.synthesizer.getVoices();
            this.handleVoicesLoaded();
        };
        // Try loading voices immediately as well (for browsers that load synchronously)
        this.voices = this.synthesizer.getVoices();
        this.handleVoicesLoaded();
        // Clean up on page unload
        window.addEventListener('beforeunload', () => this.cleanup());
    }
    /**
     * Extracts the lower-cased primary language subtag from a language tag.
     * Accepts both '-' and '_' as separators so that tags reported in the
     * 'en_US' form still match 'en-US'.
     * @param {string} tag - Language tag such as 'en-US'
     * @returns {string} Primary subtag, e.g. 'en'
     */
    static languagePrefix(tag) {
        return String(tag).split(/[-_]/)[0].toLowerCase();
    }
    /**
     * Initializes the speech recognition system
     * @private
     * @throws {Error} If speech recognition is not supported
     */
    initRecognizer() {
        if (!SpeechRecognitionAPI) {
            throw new Error('Speech recognition not supported');
        }
        const recognizer = new SpeechRecognitionAPI();
        // Copies continuous / interimResults / lang / maxAlternatives onto the recognizer
        Object.assign(recognizer, this.config.recognition);
        recognizer.onstart = () => {
            this.config.onListenStart?.();
        };
        recognizer.onresult = (evt) => this.handleRecognitionResult(evt);
        recognizer.onspeechend = () => {
            // Stop the recognizer that raised the event, which is not
            // necessarily this.recognizer if setLanguage() has replaced it since
            recognizer.stop();
        };
        recognizer.onend = () => {
            // Only call onListenEnd here, when recognition fully ends
            this.config.onListenEnd?.();
        };
        recognizer.onerror = (error) => {
            this.handleError(error, 'recognizer');
        };
        this.recognizer = recognizer;
    }
    /**
     * Handles the loading of speech synthesis voices
     * @private
     * Note: Some browsers (like Chrome) load voices asynchronously, which is why we need this handler
     */
    handleVoicesLoaded() {
        this.voicesLoaded = true;
        // Force refresh voices list
        this.voices = this.synthesizer.getVoices();
        // Get available languages based on available voices
        const availableLanguages = this.getAvailableLanguages();
        // Always notify about languages, even if empty
        this.config.onLanguagesLoaded?.(availableLanguages);
        if (availableLanguages.length === 0) {
            // Notify with empty voices list if no languages available
            this.config.onVoicesLoaded?.([]);
            return;
        }
        const currentLanguageIsValid = availableLanguages.some(l => l.code === this.selectedLanguage);
        if (!this.selectedLanguage || !currentLanguageIsValid) {
            // No usable language selected: fall back to the first available one.
            // setLanguage() also picks a voice and fires onVoicesLoaded.
            this.setLanguage(availableLanguages[0].code);
            return;
        }
        // Current language is valid, just update voices
        const availableVoices = this.getVoicesForCurrentLanguage();
        // Select first voice if none selected
        if (availableVoices.length > 0 && !this.selectedVoice) {
            this.setVoice(availableVoices[0].name);
        }
        // Always notify about voices for current language, even if empty
        this.config.onVoicesLoaded?.(availableVoices);
    }
    /**
     * Processes speech recognition results
     * @private
     * @param {SpeechRecognitionEvent} evt - The recognition event
     * Note: Accumulates final transcripts and manages recognition state
     */
    handleRecognitionResult(evt) {
        // Convert the array-like results into plain arrays of plain objects
        const results = Array.from(evt.results).map(resultArray => {
            return Array.from(resultArray).map(result => ({
                transcript: result.transcript,
                confidence: result.confidence,
                isFinal: resultArray.isFinal
            }));
        });
        // Best transcript: the highest-confidence alternative of each group.
        // Groups without alternatives are skipped rather than crashing reduce().
        const bestTranscript = results
            .filter(alternatives => alternatives.length > 0)
            .map(alternatives => alternatives.reduce((best, current) => current.confidence > best.confidence ? current : best))
            .map(result => result.transcript)
            .join(' ');
        const lastResult = results[results.length - 1];
        // Stop recognizing once the newest result is final
        if (lastResult?.[0]?.isFinal) {
            this.accumulatedTranscript = `${this.accumulatedTranscript} ${bestTranscript}`.trim();
            this.stopRecognizing();
            // Set the state directly: going through setState() would run
            // cleanup(), and recognition has just been stopped above
            this.state = this.states.IDLE;
        }
        this.recognitionResults = results;
        this.config.onRecognitionResult?.(results, bestTranscript, this.accumulatedTranscript);
    }
    /**
     * Handler for entering IDLE: stops any speech and recognition
     * @private
     */
    setIdle() {
        this.cleanup();
    }
    /**
     * Starts listening for speech input
     * @private
     * @param {VoiceIOState} [previousState] - State the instance was in before
     *   the transition. setState() has already overwritten this.state by the
     *   time this runs, so it must be passed in; defaults to this.state for
     *   direct calls.
     * Note: Resets accumulated transcript when starting new session
     */
    setListening(previousState = this.state) {
        // If we were speaking, just stop that.
        // No need for full cleanup which would also stop recognition
        if (previousState === STATES.RESPONDING) {
            this.stopSpeaking();
        }
        else {
            this.cleanup();
        }
        this.accumulatedTranscript = ''; // Reset accumulated transcript when starting new session
        this.recognizer.start();
    }
    /**
     * Sets the state to THINKING (transitional state)
     * @private
     */
    setThinking() {
        this.cleanup();
    }
    /**
     * Initiates speech synthesis
     * @private
     * @param {string} text - The text to synthesize
     */
    setSpeaking(text) {
        this.cleanup();
        const utterance = new SpeechSynthesisUtterance(text);
        // Apply config settings
        Object.assign(utterance, this.config.synthesis);
        // Set the voice and language
        if (this.selectedVoice) {
            utterance.voice = this.selectedVoice;
            utterance.lang = this.selectedVoice.lang; // Use full language code from voice
        }
        else {
            utterance.lang = this.selectedLanguage;
        }
        // An utterance cancelled by stopSpeaking() can still deliver an `end`
        // or `error` event afterwards. Only the utterance that is still
        // current may change state; otherwise a late event from an
        // interrupted utterance would reset to IDLE and stop a listening
        // session that has just been started.
        utterance.onend = () => {
            if (this.currentUtterance === utterance) {
                this.currentUtterance = null;
                this.setState(STATES.IDLE);
            }
            this.config.onVoiceEnd?.(utterance);
        };
        utterance.onstart = () => {
            this.config.onVoiceStart?.(utterance);
        };
        utterance.onerror = (error) => {
            if (this.currentUtterance !== utterance) {
                // Cancelled by us: not an error worth reporting
                return;
            }
            this.currentUtterance = null;
            this.handleError(error, 'utterance');
        };
        this.currentUtterance = utterance;
        this.synthesizer.speak(utterance);
    }
    /**
     * Changes the current state of the VoiceIO instance
     * @param {VoiceIOState} newState - The state to transition to
     * @param {string} [textToSynthesize] - Text to speak when transitioning to RESPONDING state
     * @throws {Error} If the state is invalid
     */
    setState(newState, textToSynthesize) {
        // Validate against the enum's own values; an `in` check would also
        // accept inherited keys such as 'toString'
        if (!Object.values(STATES).includes(newState)) {
            throw new Error(`Invalid state: ${newState}`);
        }
        if (this.state === newState) {
            console.log('No Voice I/O state change');
            return;
        }
        // Update the state first, remembering where we came from
        const previousState = this.state;
        this.state = newState;
        // Then handle the state change
        switch (newState) {
            case STATES.IDLE:
                return this.setIdle();
            case STATES.RESPONDING:
                return this.setSpeaking(textToSynthesize);
            case STATES.LISTENING:
                return this.setListening(previousState);
            case STATES.THINKING:
                return this.setThinking();
        }
    }
    /**
     * Handles errors from speech recognition or synthesis: logs, notifies
     * onError, stops recognition and returns to IDLE
     * @private
     * @param {SpeechRecognitionErrorEvent | SpeechSynthesisErrorEvent} error - The error that occurred
     * @param {string} label - Label identifying the error source
     */
    handleError(error, label) {
        console.info(`Voice I/O ${label} error`, error);
        this.config.onError?.(error);
        this.stopRecognizing();
        this.setState(STATES.IDLE);
    }
    /**
     * Stops the speech recognition process
     * @private
     */
    stopRecognizing() {
        if (this.recognizer) {
            try {
                this.recognizer.stop();
            }
            catch (err) {
                // Best effort: a failure to stop must not break the caller
                console.warn('Error stopping recognition:', err);
            }
        }
    }
    /**
     * Stops any ongoing or queued speech synthesis
     * @private
     */
    stopSpeaking() {
        // Disown the utterance first so its late end/error events are ignored
        this.currentUtterance = null;
        // `pending` covers an utterance that was queued but has not started
        if (this.synthesizer.speaking || this.synthesizer.pending) {
            this.synthesizer.cancel();
        }
    }
    /**
     * Performs cleanup of speech recognition and synthesis
     * @public
     */
    cleanup() {
        try {
            this.stopSpeaking();
            this.stopRecognizing();
        }
        catch (err) {
            this.handleError(err, "cleanup");
        }
    }
    /**
     * Gets all available speech synthesis voices
     * @returns {SpeechSynthesisVoice[]} Array of available voices
     */
    getVoices() {
        return this.voices;
    }
    /**
     * Gets the current state of the VoiceIO instance
     * @returns {VoiceIOState} Current state from STATES enum
     */
    getState() {
        return this.state;
    }
    /**
     * Sets the language for both speech recognition and synthesis
     * @param {string} languageCode - Language code (e.g., 'en-US')
     * Note: Automatically selects the first available voice for the new language
     */
    setLanguage(languageCode) {
        this.selectedLanguage = languageCode;
        this.config.recognition.lang = languageCode;
        // Get voices for new language
        const availableVoices = this.getVoicesForCurrentLanguage();
        // Reset voice selection
        this.selectedVoice = null;
        this.config.synthesis.voice = undefined;
        // Always select first available voice for this language
        if (availableVoices.length > 0) {
            this.setVoice(availableVoices[0].name);
        }
        // Notify about available voices for this language
        this.config.onVoicesLoaded?.(availableVoices);
        // Reinitialize recognizer so it picks up the new `lang`
        this.initRecognizer();
    }
    /**
     * Gets available voices for the currently selected language
     * @private
     * @returns {SpeechSynthesisVoice[]} Array of voices matching current language
     * Note: Matches based on language prefix (e.g., 'en' for 'en-US')
     */
    getVoicesForCurrentLanguage() {
        // Nothing can match when no language is selected
        if (!this.selectedLanguage) {
            return [];
        }
        const currentLangPrefix = VoiceIO.languagePrefix(this.selectedLanguage);
        return this.voices.filter(voice => VoiceIO.languagePrefix(voice.lang) === currentLangPrefix);
    }
    /**
     * Sets the voice for speech synthesis
     * @param {string} voiceName - Name of the voice to use
     * Note: Only allows voices that match the current language
     */
    setVoice(voiceName) {
        const validVoices = this.getVoicesForCurrentLanguage();
        const voice = validVoices.find(v => v.name === voiceName);
        if (voice) {
            this.selectedVoice = voice;
            this.config.synthesis.voice = voice;
        }
        else {
            console.warn('Selected voice is not available for current language');
        }
    }
    /**
     * Gets the currently selected language code
     * @returns {string} Selected language code
     */
    getSelectedLanguage() {
        return this.selectedLanguage;
    }
    /**
     * Gets the currently selected voice
     * @returns {SpeechSynthesisVoice|null} Selected voice or null if none selected
     */
    getSelectedVoice() {
        return this.selectedVoice;
    }
    /**
     * Gets available languages that have both recognition and synthesis support
     * @returns {Array<{code: string, name: string, prefix: string}>} Array of available languages
     * Note: A language qualifies when it is in the built-in recognition list
     * and at least one synthesis voice shares its language prefix
     */
    getAvailableLanguages() {
        // Ensure we have the latest voices list, but never replace a
        // populated list with an empty one
        const currentVoices = this.synthesizer.getVoices();
        if (currentVoices.length > 0) {
            this.voices = currentVoices;
        }
        if (!this.voices || this.voices.length === 0) {
            return [];
        }
        // Prefixes of every language that has at least one synthesis voice
        const synthLanguages = new Set(this.voices.map(voice => VoiceIO.languagePrefix(voice.lang)));
        // Keep only the recognition languages that also have synthesis support
        return this.recognitionLanguages
            .map(lang => ({
                code: lang.code,
                name: lang.name,
                prefix: VoiceIO.languagePrefix(lang.code)
            }))
            .filter(lang => synthLanguages.has(lang.prefix));
    }
}
exports.VoiceIO = VoiceIO;
//# sourceMappingURL=voice-io.js.map