// react-native-executorch — An easy way to run AI models in React Native with ExecuTorch.
// (Scraped page metadata: Version unspecified; 179 lines • 7.75 kB; TypeScript.)
import { ResourceSource } from './common';
import { RnExecutorchError } from '../errors/errorUtils';
/**
 * Named Speech to Text model variants.
 * The `-en` suffix marks English-only models; `-quantized` marks the quantized
 * build of the corresponding model.
 * @category Types
 */
export type SpeechToTextModelName = 'whisper-tiny-en' | 'whisper-tiny-en-quantized' | 'whisper-base-en' | 'whisper-base-en-quantized' | 'whisper-small-en' | 'whisper-small-en-quantized' | 'whisper-tiny' | 'whisper-base' | 'whisper-small';
/**
 * Configuration accepted by the Speech to Text hook.
 * @category Types
 */
export interface SpeechToTextProps {
  /**
   * Configuration object containing the model sources (see `SpeechToTextModelConfig`).
   */
  model: SpeechToTextModelConfig;
  /**
   * When `true`, prevents the hook from automatically loading the model
   * (and from downloading its data on first use). Defaults to loading eagerly.
   */
  preventLoad?: boolean;
}
/**
 * Object returned by the React hook managing a Speech to Text (STT) instance.
 * @category Types
 */
export interface SpeechToTextType {
  /**
   * Contains the error if the model failed to load, otherwise `null`.
   */
  error: null | RnExecutorchError;
  /**
   * Indicates whether the model has successfully loaded and is ready for inference.
   */
  isReady: boolean;
  /**
   * Indicates whether the model is currently processing an inference.
   */
  isGenerating: boolean;
  /**
   * Tracks the progress of the model download process.
   */
  downloadProgress: number;
  /**
   * Runs the encoding part of the model on the provided waveform.
   * @param waveform - The input audio waveform array.
   * @returns A promise resolving to the encoded audio data.
   */
  encode(waveform: Float32Array): Promise<Float32Array>;
  /**
   * Runs the decoder of the model for one step.
   * @param tokens - Token IDs to feed the decoder (the sequence generated so far).
   * @param encoderOutput - The output from the encoder (see `encode`).
   * @returns A promise resolving to the raw decoder output tensor
   * (e.g. scores over the vocabulary), not decoded text.
   */
  decode(tokens: Int32Array, encoderOutput: Float32Array): Promise<Float32Array>;
  /**
   * Starts a transcription process for a given input array, which should be a waveform at 16kHz.
   * @param waveform - The input audio waveform.
   * @param options - Decoding options, check API reference for more details.
   * @returns Resolves a promise with the output transcription. Result of transcription is
   * object of type `TranscriptionResult`.
   */
  transcribe(waveform: Float32Array, options?: DecodingOptions): Promise<TranscriptionResult>;
  /**
   * Starts a streaming transcription process.
   * Use in combination with `streamInsert` to feed audio chunks and `streamStop` to end the stream.
   * Updates `committedTranscription` and `nonCommittedTranscription` as transcription progresses.
   * @param options - Decoding options including language.
   * @returns Asynchronous generator that yields `committed` and `nonCommitted` transcriptions.
   * Both `committed` and `nonCommitted` are of type `TranscriptionResult`.
   */
  stream(options?: DecodingOptions): AsyncGenerator<{
    committed: TranscriptionResult;
    nonCommitted: TranscriptionResult;
  }, void, unknown>;
  /**
   * Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription.
   * @param waveform - The audio chunk to insert.
   */
  streamInsert(waveform: Float32Array): void;
  /**
   * Stops the ongoing streaming transcription process.
   */
  streamStop(): void;
}
/**
 * ISO 639-1 codes for the languages supported by the multilingual whisper
 * models (not the English-only `whisper-*-en` variants).
 * @category Types
 */
export type SpeechToTextLanguage = 'af' | 'sq' | 'ar' | 'hy' | 'az' | 'eu' | 'be' | 'bn' | 'bs' | 'bg' | 'my' | 'ca' | 'zh' | 'hr' | 'cs' | 'da' | 'nl' | 'et' | 'en' | 'fi' | 'fr' | 'gl' | 'ka' | 'de' | 'el' | 'gu' | 'ht' | 'he' | 'hi' | 'hu' | 'is' | 'id' | 'it' | 'ja' | 'kn' | 'kk' | 'km' | 'ko' | 'lo' | 'lv' | 'lt' | 'mk' | 'mg' | 'ms' | 'ml' | 'mt' | 'mr' | 'ne' | 'no' | 'fa' | 'pl' | 'pt' | 'pa' | 'ro' | 'ru' | 'sr' | 'si' | 'sk' | 'sl' | 'es' | 'su' | 'sw' | 'sv' | 'tl' | 'tg' | 'ta' | 'te' | 'th' | 'tr' | 'uk' | 'ur' | 'uz' | 'vi' | 'cy' | 'yi';
/**
 * Options for decoding speech to text.
 * @category Types
 * @property {SpeechToTextLanguage} [language] - Optional language code to guide the transcription.
 * @property {boolean} [verbose] - Optional flag. If set, the transcription result includes timestamps
 * and additional parameters. For more details please refer to `TranscriptionResult`.
 */
export interface DecodingOptions {
  language?: SpeechToTextLanguage;
  verbose?: boolean;
}
/**
 * Structure that represents a single token with timestamp information.
 * All fields are required.
 * @category Types
 * @property {string} word - The token as a string value.
 * @property {number} start - Timestamp of the beginning of the token in the audio (in seconds).
 * @property {number} end - Timestamp of the end of the token in the audio (in seconds).
 */
export interface Word {
  word: string;
  start: number;
  end: number;
}
/**
 * Structure that represents a single segment of a transcription.
 * @category Types
 * @property {number} start - Timestamp of the beginning of the segment in the audio (in seconds).
 * @property {number} end - Timestamp of the end of the segment in the audio (in seconds).
 * @property {string} text - Full text of the given segment as a string.
 * @property {Word[]} [words] - If `verbose` is set to `true` in `DecodingOptions`, word-level
 * timestamps as an array of `Word`; otherwise absent.
 * @property {number[]} tokens - Raw tokens represented as an array of integers.
 * @property {number} temperature - Temperature for which the given segment was computed.
 * @property {number} avgLogprob - Average log probability calculated across all tokens in the segment.
 * @property {number} compressionRatio - Compression ratio achieved on the given segment.
 */
export interface TranscriptionSegment {
  start: number;
  end: number;
  text: string;
  words?: Word[];
  tokens: number[];
  temperature: number;
  avgLogprob: number;
  compressionRatio: number;
}
/**
 * Structure that represents the result of one transcription call
 * (either `transcribe` or `stream`).
 * @category Types
 * @property {'transcribe' | 'stream'} [task] - String indicating the task, either 'transcribe' or 'stream'.
 * @property {string} language - Language chosen for the transcription.
 * @property {number} duration - Duration in seconds of the given transcription.
 * @property {string} text - The whole text of the transcription as a `string`.
 * @property {TranscriptionSegment[]} [segments] - If `verbose` is set to `true` in `DecodingOptions`,
 * an array of `TranscriptionSegment` with details split into separate transcription segments.
 */
export interface TranscriptionResult {
  task?: 'transcribe' | 'stream';
  language: string;
  duration: number;
  text: string;
  segments?: TranscriptionSegment[];
}
/**
 * Configuration describing a Speech to Text model's sources and capabilities.
 * @category Types
 */
export interface SpeechToTextModelConfig {
  /**
   * The built-in model name (e.g. `'whisper-tiny-en'`). Used for telemetry and hook reload triggers.
   * Pass one of the pre-built STT constants (e.g. `WHISPER_TINY_EN`) to populate all required fields.
   */
  modelName: SpeechToTextModelName;
  /**
   * A boolean flag indicating whether the model supports multiple languages.
   */
  isMultilingual: boolean;
  /**
   * A string that specifies the location of a `.pte` file for the model.
   *
   * We expect the model to have 2 bundled methods: 'decode' and 'encode'.
   */
  modelSource: ResourceSource;
  /**
   * A string that specifies the location of the tokenizer for the model.
   */
  tokenizerSource: ResourceSource;
}
//# sourceMappingURL=stt.d.ts.map