react-native-executorch
Version:
An easy way to run AI models in React Native with ExecuTorch
220 lines (197 loc) • 8.18 kB
text/typescript
import { ResourceSource } from './common';
import { RnExecutorchError } from '../errors/errorUtils';
/**
 * Name identifiers of every built-in Text to Speech model.
 * @category Types
 */
export type TextToSpeechModelName =
  | 'kokoro-small'
  | 'kokoro-medium';
/**
 * Languages available in the TTS models, expressed as language shorthands:
 * - `en-us` - American English
 * - `en-gb` - British English
 * @category Types
 */
export type TextToSpeechLanguage = 'en-us' | 'en-gb';
/**
 * Configuration of a single speaker's voice.
 *
 * In Kokoro, so far, every voice is directly tied to one language.
 * @category Types
 */
export interface VoiceConfig {
  /** The speaker's language. */
  lang: TextToSpeechLanguage;
  /** Source of the binary file containing the voice embedding. */
  voiceSource: ResourceSource;
  /**
   * Optional extra sources or properties related to this specific voice.
   * Only the Kokoro extras are accepted for now; more types may be added here.
   */
  extra?: KokoroVoiceExtras;
}
/**
 * Extra voice properties that apply only to Kokoro.
 * @category Types
 */
export interface KokoroVoiceExtras {
  /** Source of Kokoro's tagger model binary. */
  taggerSource: ResourceSource;
  /** Source of Kokoro's lexicon binary. */
  lexiconSource: ResourceSource;
}
/**
 * Configuration of the Kokoro model.
 *
 * Covers the core Kokoro model sources only; the phonemizer sources are
 * part of the voice configuration instead.
 * @category Types
 */
export interface KokoroConfig {
  /** Identifier of the model. */
  modelName: TextToSpeechModelName;
  /** Source of Kokoro's duration predictor model binary. */
  durationPredictorSource: ResourceSource;
  /** Source of Kokoro's synthesizer model binary. */
  synthesizerSource: ResourceSource;
}
/**
 * General configuration of the Text to Speech module.
 * @category Types
 */
export interface TextToSpeechConfig {
  /**
   * The selected T2S model.
   * Only Kokoro is accepted for now; other model types may be added in the future.
   */
  model: KokoroConfig;
  /** The selected speaker's voice. */
  voice: VoiceConfig;
}
/**
 * Props accepted by the `useTextToSpeech` hook.
 * @category Types
 * @augments TextToSpeechConfig
 */
export interface TextToSpeechProps extends TextToSpeechConfig {
  /**
   * When set, can prevent the automatic model loading (and the download of
   * its data on first load) that otherwise follows running the hook.
   */
  preventLoad?: boolean;
}
/**
 * Input accepted by the Text to Speech module.
 * @category Types
 */
export interface TextToSpeechInput {
  /** The text to be spoken. Declared optional, so it may be left out. */
  text?: string;
  /** Optional speed argument: the higher the value, the faster the speech. */
  speed?: number;
}
/**
 * Input for the Text to Speech module made of pre-computed phonemes.
 *
 * Intended for callers that bring their own phonemizer (for example the
 * Python `phonemizer` library, espeak-ng, or any custom G2P system) and
 * want to bypass the built-in phonemizer pipeline.
 * @category Types
 */
export interface TextToSpeechPhonemeInput {
  /** Pre-computed IPA phoneme string. */
  phonemes: string;
  /** Optional speed argument: the higher the value, the faster the speech. */
  speed?: number;
}
/**
* Return type for the `useTextToSpeech` hook.
* Exposes the state flags (`error`, `isReady`, `isGenerating`, `downloadProgress`)
* and the operations for Text-to-Speech generation: single-pass and streaming,
* each available for plain text and for pre-computed phonemes.
* @category Types
*/
export interface TextToSpeechType {
/**
* Contains the error object if the model failed to load or encountered an error during inference.
*/
error: RnExecutorchError | null;
/**
* Indicates whether the Text-to-Speech model is loaded and ready to accept inputs.
*/
isReady: boolean;
/**
* Indicates whether the model is currently generating audio.
*/
isGenerating: boolean;
/**
* Represents the download progress of the model and voice assets as a value between 0 and 1.
*/
downloadProgress: number;
/**
* Runs the model to convert the provided text into speech audio in a single pass.
* NOTE(review): `text` is declared optional on `TextToSpeechInput`; what happens when it is omitted is not visible from this file - confirm.
* @param input - The `TextToSpeechInput` object containing the `text` to synthesize and optional `speed`.
* @returns A Promise that resolves with the generated audio data as a `Float32Array`.
* @throws {RnExecutorchError} If the model is not loaded or is currently generating.
*/
forward: (input: TextToSpeechInput) => Promise<Float32Array>;
/**
* Synthesizes pre-computed phonemes into speech audio in a single pass.
* Bypasses the built-in phonemizer, allowing use of external G2P systems.
* @param input - The `TextToSpeechPhonemeInput` object containing pre-computed `phonemes` and optional `speed`.
* @returns A Promise that resolves with the generated audio data as a `Float32Array`.
* @throws {RnExecutorchError} If the model is not loaded or is currently generating.
*/
forwardFromPhonemes: (
input: TextToSpeechPhonemeInput
) => Promise<Float32Array>;
/**
* Streams the generated audio data incrementally.
* This is optimal for real-time playback, allowing audio to start playing before the full text is synthesized.
* @param input - The `TextToSpeechStreamingInput` object containing `text`, optional `speed`, optional `stopAutomatically`, and lifecycle callbacks (`onBegin`, `onNext`, `onEnd`).
* @returns A Promise that resolves when the streaming process is complete.
* @throws {RnExecutorchError} If the model is not loaded or is currently generating.
*/
stream: (input: TextToSpeechStreamingInput) => Promise<void>;
/**
* Streams pre-computed phonemes incrementally, bypassing the built-in phonemizer.
* @param input - The `TextToSpeechStreamingPhonemeInput` object with pre-computed `phonemes` instead of `text`, optional `speed`, and lifecycle callbacks (`onBegin`, `onNext`, `onEnd`).
* @returns A Promise that resolves when the streaming process is complete.
* @throws {RnExecutorchError} If the model is not loaded or is currently generating.
*/
streamFromPhonemes: (
input: TextToSpeechStreamingPhonemeInput
) => Promise<void>;
/**
* Inserts new text chunk into the buffer to be processed in streaming mode.
* @param textChunk - The text chunk to add to the streaming buffer.
*/
streamInsert: (textChunk: string) => void;
/**
* Interrupts and stops the currently active audio generation stream.
* @param instant - If true, stops the streaming as soon as possible. Otherwise
* allows the module to complete processing for the remains of the buffer.
*/
streamStop: (instant?: boolean) => void;
}
/**
 * Lifecycle callbacks shared by the TTS streaming modes.
 * Every callback is optional and may be synchronous or return a Promise.
 * @category Types
 */
export interface TextToSpeechStreamingCallbacks {
  /** Called when streaming begins. */
  onBegin?: () => void | Promise<void>;
  /** Called after each audio chunk gets calculated, receiving that chunk. */
  onNext?: (audio: Float32Array) => void | Promise<void>;
  /** Called when streaming ends. */
  onEnd?: () => void | Promise<void>;
}
/**
 * Input definition for Text to Speech streaming.
 *
 * Streaming in T2S is synchronized through callbacks that are executed at
 * given moments of the stream. Actions such as playing the audio should
 * happen inside the `onNext` callback. Callbacks may be either synchronous
 * or asynchronous.
 *
 * The input can be expanded incrementally: new text chunks may be added
 * with `streamInsert()` while the streaming is running.
 * @category Types
 */
export interface TextToSpeechStreamingInput
  extends TextToSpeechInput,
    TextToSpeechStreamingCallbacks {
  /** If true, streaming will stop automatically when the buffer is empty. */
  stopAutomatically?: boolean;
}
/**
 * Streaming input built from pre-computed phonemes.
 * Mirrors `TextToSpeechStreamingInput`, except that it takes `phonemes`
 * in place of `text`.
 * @category Types
 */
export interface TextToSpeechStreamingPhonemeInput
  extends TextToSpeechPhonemeInput,
    TextToSpeechStreamingCallbacks {}