UNPKG

react-native-executorch

Version:

An easy way to run AI models in React Native with ExecuTorch

281 lines (254 loc) 8.31 kB
import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
import { parseUnknownError, RnExecutorchError } from '../../errors/errorUtils';
import { ResourceFetcher } from '../../utils/ResourceFetcher';
import {
  KokoroConfig,
  TextToSpeechConfig,
  TextToSpeechStreamingInput,
  TextToSpeechStreamingPhonemeInput,
  VoiceConfig,
} from '../../types/tts';
import { Logger } from '../../common/Logger';

/**
 * Module for Text to Speech (TTS) functionalities.
 * @category Typescript API
 */
export class TextToSpeechModule {
  // Untyped handle returned by `global.loadTextToSpeechKokoro`; set to `null`
  // by `delete()` so `ensureLoaded` can reject calls on an unloaded module.
  private nativeModule: any;
  // True while a streaming generator is being consumed; cleared by an instant
  // `streamStop` and whenever a streaming generator finishes.
  private isStreaming: boolean = false;

  private constructor(nativeModule: unknown) {
    this.nativeModule = nativeModule;
  }

  /**
   * Creates a Text to Speech instance.
   * @param config - Configuration object containing `model` and `voice`.
   * Pass one of the built-in constants (e.g. `{ model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART }`), or use require() to pass them.
   * @param onDownloadProgress - Optional callback to monitor download progress, receiving a value between 0 and 1.
   * @returns A Promise resolving to a `TextToSpeechModule` instance.
   * @throws The load failure, logged and normalised through `parseUnknownError`.
   * @example
   * ```ts
   * import { TextToSpeechModule, KOKORO_MEDIUM, KOKORO_VOICE_AF_HEART } from 'react-native-executorch';
   * const tts = await TextToSpeechModule.fromModelName(
   *   { model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART },
   * );
   * ```
   */
  static async fromModelName(
    config: TextToSpeechConfig,
    onDownloadProgress: (progress: number) => void = () => {}
  ): Promise<TextToSpeechModule> {
    try {
      const nativeModule = await TextToSpeechModule.loadKokoro(
        config.model,
        config.voice,
        onDownloadProgress
      );
      return new TextToSpeechModule(nativeModule);
    } catch (error) {
      Logger.error('Load failed:', error);
      throw parseUnknownError(error);
    }
  }

  /**
   * Fetches the five Kokoro resources and hands their local paths to the
   * native loader.
   * @param model - Provides the duration predictor and synthesizer sources.
   * @param voice - Provides the voice data, language and the phonemizer
   * (tagger + lexicon) sources under `extra`.
   * @param onDownloadProgressCallback - Forwarded to `ResourceFetcher.fetch`.
   * @returns Whatever `global.loadTextToSpeechKokoro` resolves to.
   * @throws `InvalidConfig` when the voice lacks tagger/lexicon sources,
   * `DownloadInterrupted` when the fetch does not yield exactly five paths.
   */
  private static async loadKokoro(
    model: KokoroConfig,
    voice: VoiceConfig,
    onDownloadProgressCallback: (progress: number) => void
  ): Promise<unknown> {
    const taggerSource = voice.extra?.taggerSource;
    const lexiconSource = voice.extra?.lexiconSource;
    if (!taggerSource || !lexiconSource) {
      throw new RnExecutorchError(
        RnExecutorchErrorCode.InvalidConfig,
        'Kokoro: voice config is missing required extra fields: taggerSource and/or lexiconSource.'
      );
    }

    const paths = await ResourceFetcher.fetch(
      onDownloadProgressCallback,
      model.durationPredictorSource,
      model.synthesizerSource,
      voice.voiceSource,
      taggerSource,
      lexiconSource
    );

    if (paths == null || paths.length !== 5) {
      throw new RnExecutorchError(
        RnExecutorchErrorCode.DownloadInterrupted,
        'Download interrupted or missing resource.'
      );
    }

    // Paths come back in the same order the sources were passed to fetch().
    const [
      durationPredictorPath,
      synthesizerPath,
      voiceDataPath,
      taggerPath,
      lexiconPath,
    ] = paths as [string, string, string, string, string];

    return await global.loadTextToSpeechKokoro(
      voice.lang,
      taggerPath,
      lexiconPath,
      durationPredictorPath,
      synthesizerPath,
      voiceDataPath
    );
  }

  /**
   * Guards public entry points against use after `delete()`.
   * @param methodName - Name of the calling method, used in the error message.
   * @throws `ModuleNotLoaded` when there is no native module.
   */
  private ensureLoaded(methodName: string): void {
    if (this.nativeModule == null)
      throw new RnExecutorchError(
        RnExecutorchErrorCode.ModuleNotLoaded,
        `The model is currently not loaded. Please load the model before calling ${methodName}().`
      );
  }

  /**
   * Synthesizes the provided text into speech.
   * Returns a promise that resolves to the full audio waveform as a `Float32Array`.
   * @param text The input text to be synthesized.
   * @param speed Optional speed multiplier for the speech synthesis (default is 1.0).
   * @returns A promise resolving to the synthesized audio waveform.
   */
  public async forward(
    text: string,
    speed: number = 1.0
  ): Promise<Float32Array> {
    this.ensureLoaded('forward');
    return await this.nativeModule.generate(text, speed);
  }

  /**
   * Synthesizes pre-computed phonemes into speech, bypassing the built-in phonemizer.
   * This allows using an external G2P system (e.g. the Python `phonemizer` library,
   * espeak-ng, or any custom phonemizer).
   * @param phonemes The pre-computed IPA phoneme string.
   * @param speed Optional speed multiplier for the speech synthesis (default is 1.0).
   * @returns A promise resolving to the synthesized audio waveform.
   */
  public async forwardFromPhonemes(
    phonemes: string,
    speed: number = 1.0
  ): Promise<Float32Array> {
    this.ensureLoaded('forwardFromPhonemes');
    return await this.nativeModule.generateFromPhonemes(phonemes, speed);
  }

  /**
   * Starts a streaming synthesis session. Yields audio chunks as they are generated.
   * Text to synthesize is supplied separately through `streamInsert`.
   * @param input - Input object; its `speed` and `stopAutomatically` fields are
   * forwarded to the native stream call.
   * @yields An audio chunk generated during synthesis.
   * @returns An async generator yielding Float32Array audio chunks.
   */
  public async *stream({
    speed,
    stopAutomatically,
  }: TextToSpeechStreamingInput): AsyncGenerator<Float32Array> {
    this.ensureLoaded('stream');
    yield* this.consumeNativeStream((onChunk) =>
      this.nativeModule.stream(speed, stopAutomatically, onChunk)
    );
  }

  /**
   * Starts a streaming synthesis session from pre-computed phonemes.
   * Bypasses the built-in phonemizer, allowing use of external G2P systems.
   * @param input - Input object containing phonemes and optional speed.
   * @yields An audio chunk generated during synthesis.
   * @returns An async generator yielding Float32Array audio chunks.
   */
  public async *streamFromPhonemes({
    phonemes,
    speed,
  }: TextToSpeechStreamingPhonemeInput): AsyncGenerator<Float32Array> {
    this.ensureLoaded('streamFromPhonemes');
    yield* this.consumeNativeStream((onChunk) =>
      this.nativeModule.streamFromPhonemes(phonemes, speed, onChunk)
    );
  }

  /**
   * Bridges a callback-based native stream to an async generator.
   *
   * Chunks delivered to the callback are queued and yielded in order. The
   * generator ends when the native call has settled and the queue is drained,
   * or as soon as `isStreaming` is cleared (instant `streamStop`). A rejection
   * of the native call is rethrown after the already-queued chunks have been
   * yielded.
   * @param start - Launches the native stream with the given chunk callback;
   * its result is awaited to detect completion.
   * @yields Each queued audio chunk as a `Float32Array`.
   */
  private async *consumeNativeStream(
    start: (onChunk: (audio: number[]) => void) => unknown
  ): AsyncGenerator<Float32Array> {
    // Audio chunks received from native but not yet yielded.
    const queue: Float32Array[] = [];
    // Resolver of the promise the consumer loop is currently parked on.
    let waiter: (() => void) | null = null;
    let error: unknown;
    // Separate flag so that falsy rejection values are still reported.
    let failed = false;
    let nativeStreamFinished = false;

    const wake = () => {
      waiter?.();
      waiter = null;
    };

    this.isStreaming = true;

    // Producer: runs concurrently with the consumer loop below and never
    // rejects, because every failure is captured into `error`.
    void (async () => {
      try {
        await start((audio: number[]) => {
          queue.push(new Float32Array(audio));
          wake();
        });
      } catch (e) {
        error = e;
        failed = true;
      } finally {
        nativeStreamFinished = true;
        wake();
      }
    })();

    try {
      while (this.isStreaming) {
        const chunk = queue.shift();
        if (chunk !== undefined) {
          yield chunk;
          continue;
        }
        if (failed) throw error;
        // Queue drained and nothing more will arrive: finish instead of
        // parking on a promise nobody would ever resolve.
        if (nativeStreamFinished) return;
        await new Promise<void>((resolve) => {
          waiter = resolve;
        });
      }
    } finally {
      // Also runs when the consumer abandons the generator early.
      this.isStreaming = false;
    }
  }

  /**
   * Inserts new text chunk into the buffer to be processed in streaming mode.
   * @param textChunk - The text fragment to append to the streaming buffer.
   */
  public streamInsert(textChunk: string): void {
    this.ensureLoaded('streamInsert');
    this.nativeModule.streamInsert(textChunk);
  }

  /**
   * Stops the streaming process if there is any ongoing.
   * Safe to call after `delete()`; in that case only the local flag is updated.
   * @param instant - If true, stops the streaming as soon as possible. Otherwise
   * allows the module to complete processing for the remains of the buffer.
   */
  public streamStop(instant: boolean = true): void {
    this.nativeModule?.streamStop(instant);
    if (instant) {
      this.isStreaming = false;
    }
  }

  /**
   * Unloads the model from memory.
   * Subsequent calls that require the model throw `ModuleNotLoaded`; calling
   * `delete()` again is a no-op.
   */
  delete() {
    if (this.nativeModule != null) {
      this.nativeModule.unload();
      this.nativeModule = null;
    }
  }
}