/**
 * @transcribe/transcriber — TypeScript type declarations
 * Transcribe speech to text in the browser.
 */
declare module '@transcribe/transcriber' {
  /**
   * Whisper model flavor. Required when computing word-level timestamps with
   * the DTW algorithm, since timing data differs per model.
   */
  export type DtwType =
    | "tiny"
    | "base"
    | "small"
    | "tiny.en"
    | "base.en"
    | "small.en";

  /** Lifecycle states reported by {@link StreamTranscriber}. */
  export type StreamStatus = "loading" | "waiting" | "processing" | "stopped";

  /** Exit information passed to the onExit callback by the wasm runtime. */
  export type ExitStatus = {
    name: string;
    message: string;
    status: number;
  };

  /** A single transcribed token within a segment. */
  export type TranscribeToken = {
    /** Token id */
    id: number;
    /** Confidence 0..1 */
    p: number;
    /** Word-level timestamp (DTW); present only when DTW is enabled. */
    dtw?: {
      offset: number;
      /** hh:mm:ss,sss */
      timestamp: string;
    };
    /** Transcribed text. */
    text: string;
    /** Text time offset. */
    offsets?: {
      from: number;
      to: number;
    };
    /** Offset as timestamp hh:mm:ss,sss */
    timestamps?: {
      /** hh:mm:ss,sss */
      from: string;
      /** hh:mm:ss,sss */
      to: string;
    };
  };

  /** A contiguous chunk of transcribed speech with timing and its tokens. */
  type TranscribeSegment = {
    /** Transcribed text. */
    text: string;
    /** Text time offset. */
    offsets: {
      from: number;
      to: number;
    };
    /** Offset as timestamp hh:mm:ss,sss */
    timestamps: {
      /** hh:mm:ss,sss */
      from: string;
      /** hh:mm:ss,sss */
      to: string;
    };
    tokens: TranscribeToken[];
  };

  /**
   * Payload delivered to onSegment callbacks.
   *
   * NOTE(review): "Segement" is a typo in the published API, kept for
   * backward compatibility. Prefer the {@link TranscribeResultSegment}
   * alias in new code.
   */
  export type TranscribeResultSegement = {
    result: {
      /** Language code used by transcriber. */
      language: string;
    };
    segment: TranscribeSegment;
  };

  /** Correctly-spelled alias for {@link TranscribeResultSegement}. */
  export type TranscribeResultSegment = TranscribeResultSegement;

  /** Result of a completed transcription run. */
  export type TranscribeResult = {
    result: {
      /** Language code used by transcriber. */
      language: string;
    };
    /** Transcription results split in segments. */
    transcription: TranscribeSegment[];
  };

  /** Options shared by all transcriber classes. */
  export type TranscriberOptions = {
    /**
     * Emscripten exported createModule function.
     * @see {@link https://emscripten.org/docs/api_reference/module.html}
     *
     * @param {any} moduleArg Used to override module defaults.
     * @returns {Promise<any>} Returns the module object.
     */
    createModule: (moduleArg?: {}) => Promise<any>;
    /**
     * Whisper.cpp model file in ggml format.
     * Will fetch if string, otherwise will use the provided file.
     * @see {@link https://huggingface.co/ggerganov/whisper.cpp}
     */
    model: string | File;
    /**
     * Called on wasm print to stdout.
     *
     * @param {string} message
     */
    print?: (message: string) => void;
    /**
     * Called on wasm print to stderr.
     *
     * @param {string} message
     */
    printErr?: (message: string) => void;
    /** Emscripten lifecycle hook; runs before module initialization. */
    preInit?: () => void;
    /** Emscripten lifecycle hook; runs before the module's main run. */
    preRun?: () => void;
    /** Called when the wasm runtime aborts. */
    onAbort?: () => void;
    /** Called when the wasm runtime exits. */
    onExit?: (exitStatus: ExitStatus) => void;
  };

  /** Options for {@link FileTranscriber}. */
  export type FileTranscriberOptions = TranscriberOptions & {
    /**
     * If transcriber should compute word level timestamps using DTW algorithm,
     * specify the type of the model used.
     */
    dtwType?: DtwType;
    /**
     * Called when init is ready.
     */
    onReady?: () => void;
    /**
     * Called on transcriber progress.
     *
     * @param {number} progress 0..100
     */
    onProgress?: (progress: number) => void;
    /**
     * Called when transcription is canceled.
     */
    onCanceled?: () => void;
    /**
     * Called when a new transcribed segment is ready.
     *
     * @param {TranscribeResultSegement} segment
     */
    onSegment?: (segment: TranscribeResultSegement) => void;
    /**
     * Called when transcription is complete.
     *
     * @param {TranscribeResult} result
     */
    onComplete?: (result: TranscribeResult) => void;
  };

  /** Options for {@link StreamTranscriber}. */
  export type StreamTranscriberOptions = TranscriberOptions & {
    /**
     * Path to the audio worklet scripts (vad.js and buffer.js).
     * Defaults to the ./audio-worklets sub directory from where
     * StreamTranscriber.js gets loaded.
     */
    audioWorkletPath?: string;
    /**
     * Called when init is ready.
     */
    onReady?: () => void;
    /**
     * Called when a new transcription from stream is ready.
     *
     * @param {TranscribeResultSegement} segment
     */
    onSegment?: (segment: TranscribeResultSegement) => void;
    /**
     * Called when stream status changes.
     *
     * @param {StreamStatus} status
     */
    onStreamStatus?: (status: StreamStatus) => void;
  };

  /** Per-call options for {@link FileTranscriber.transcribe}. */
  export type FileTranscribeOptions = {
    /**
     * Language code of the audio, eg. "en"
     * @default "auto"
     */
    lang?: string;
    /**
     * Number of threads to use, defaults to max threads available.
     */
    threads?: number;
    /**
     * Translate the text to english
     * @default false
     */
    translate?: boolean;
    /**
     * Maximum number of characters in a single segment, 0 for not set.
     * @default 0
     */
    max_len?: number;
    /**
     * If true, transcriber will try to split the text on word boundaries.
     * @default false
     */
    split_on_word?: boolean;
    /**
     * If true, transcriber will try to suppress non-speech segments.
     * @default false
     */
    suppress_non_speech?: boolean;
    /**
     * If true, calculates word level timestamps.
     * @default true
     */
    token_timestamps?: boolean;
  };

  /** Options for {@link StreamTranscriber.start}. */
  export type StreamStartOptions = {
    /**
     * Language code of the audio, eg. "en"
     */
    lang?: string;
    /**
     * Number of threads to use, defaults to max threads available
     */
    threads?: number;
    /**
     * Translate the text to english
     * @default false
     */
    translate?: boolean;
    /**
     * If true, transcriber will try to suppress non-speech segments
     * @default true
     */
    suppress_non_speech?: boolean;
    /**
     * Maximum number of tokens in a single segment, see whisper.cpp
     * @default 16
     */
    max_tokens?: number;
    /**
     * Audio context buffer size in samples, see whisper.cpp
     * NOTE(review): the JSDoc on start() documents 512 for this option —
     * confirm which default the implementation actually applies.
     * @default 256
     */
    audio_ctx?: number;
  };

  /** Per-call options for {@link StreamTranscriber.transcribe}. */
  export type StreamTranscribeOptions = {
    /**
     * Audio in ms to include before the begin of speech detected.
     * NOTE(review): the JSDoc on StreamTranscriber.transcribe() documents
     * 500 for this option — confirm which default the implementation uses.
     * @default 200
     */
    preRecordMs?: number;
    /**
     * Maximum record time in ms before calling transcribe (aka. flush buffer).
     * @default 5000
     */
    maxRecordMs?: number;
    /**
     * Minimum time in ms of silence before transcribe is called.
     * @default 500
     */
    minSilenceMs?: number;
    /**
     * Called when there's a change in voice activity.
     */
    onVoiceActivity?: (active: boolean) => void;
  };

  /**
   * Transcribe audio to text using the whisper.cpp speech-to-text
   * implementation.
   *
   * @class FileTranscriber
   * @extends Transcriber
   */
  export class FileTranscriber extends Transcriber {
    /**
     * Create a new FileTranscriber instance.
     *
     * @constructor
     * @param {FileTranscriberOptions} options
     */
    constructor(options: FileTranscriberOptions);
    /**
     * @private
     * @type {DtwType}
     */
    private _dtwType;
    /**
     * Callback when init is ready.
     *
     * @private
     * @type {FileTranscriberOptions.onReady}
     */
    private _onReady;
    /**
     * Callback when transcription is done.
     *
     * @private
     * @type {FileTranscriberOptions.onComplete}
     */
    private _onComplete;
    /**
     * Callback when transcription got canceled.
     *
     * @private
     * @type {FileTranscriberOptions.onCanceled}
     */
    private _onCanceled;
    /**
     * Resolve callback for cancel.
     *
     * @private
     * @type {function(): void | null}
     */
    private _resolveCancel;
    /**
     * Resolve callback for transcribe complete.
     *
     * @private
     * @type {function(): void | null}
     */
    private _resolveComplete;
    /**
     * Called when transcription is complete.
     *
     * @type {FileTranscriberOptions.onComplete}
     */
    set onComplete(callback: any);
    /**
     * Called when transcription is canceled.
     *
     * @type {FileTranscriberOptions.onCanceled}
     */
    set onCanceled(callback: any);
    /**
     * Called on transcriber progress.
     *
     * @type {FileTranscriberOptions.onProgress}
     */
    set onProgress(callback: any);
    /**
     * Called when a new transcribed segment is ready.
     *
     * @type {FileTranscriberOptions.onSegment}
     */
    set onSegment(callback: any);
    /**
     * DTW type.
     *
     * @type {DtwType}
     */
    get dtwType(): DtwType;
    /**
     * Transcribe audio to text.
     *
     * @param {File|string} audio Audio file or URL.
     * @param {FileTranscribeOptions} [options]
     * @param {string} [options.lang="auto"] Language code.
     * @param {number} [options.threads=this.maxThreads] Number of threads to use.
     * @param {boolean} [options.translate=false] Translate the text.
     * @param {number} [options.max_len=0] Maximum segment length in characters.
     * @param {boolean} [options.split_on_word=false] Split the text on word.
     * @param {boolean} [options.suppress_non_speech=false] Suppress non-speech.
     * @param {boolean} [options.token_timestamps=true] Calculate token timestamps.
     * @returns {Promise<TranscribeResult>}
     */
    transcribe(audio: File | string, { lang, threads, translate, max_len, split_on_word, suppress_non_speech, token_timestamps, }?: FileTranscribeOptions): Promise<TranscribeResult>;
    /**
     * Cancel the transcription. May take some time.
     *
     * @returns {Promise<void>}
     */
    cancel(): Promise<void>;
    /**
     * Just for resolving the cancel promise.
     *
     * @private
     */
    private _onCancel;
    /**
     * Just for resolving the transcribe complete promise.
     *
     * @private
     */
    private _onTranscribed;
    /**
     * Load audio, convert to 16kHz mono.
     *
     * @private
     * @param {File|string} file Audio file or URL
     * @returns {Promise<Float32Array>}
     */
    private _loadAudio;
  }

  /**
   * Transcribe an audio stream (e.g. microphone input) to text using the
   * whisper.cpp speech-to-text implementation. This is experimental and not
   * working in Firefox because sample rate conversion with AudioContext is
   * not supported. Also, transcribe.wasm is way too slow for real-time
   * transcription.
   *
   * @class StreamTranscriber
   * @extends Transcriber
   * @experimental
   */
  export class StreamTranscriber extends Transcriber {
    /**
     * Create a new StreamTranscriber instance.
     *
     * @constructor
     * @param {StreamTranscriberOptions} options
     */
    constructor(options: StreamTranscriberOptions);
    /**
     * Path to the audio worklet scripts (vad.js and buffer.js).
     */
    _audioWorkletsPath: any;
    /**
     * Callback when init is ready.
     * @private
     * @type {StreamTranscriberOptions.onReady}
     */
    private _onReady;
    /**
     * Is stream transcriber running.
     *
     * @type boolean
     * @private
     */
    private _isStreamRunning;
    /**
     * Audio context for stream transcription.
     *
     * @private
     * @type {AudioContext}
     */
    private _streamAudioContext;
    /**
     * Media source for stream transcription.
     *
     * @private
     * @type {MediaStreamAudioSourceNode}
     */
    private _streamMediaSource;
    /**
     * Called when a new transcription from stream is ready.
     *
     * @type {StreamTranscriberOptions.onSegment}
     */
    set onSegment(callback: any);
    /**
     * Called when stream status changes.
     *
     * @type {StreamTranscriberOptions.onStreamStatus}
     */
    set onStreamStatus(callback: any);
    /**
     * Stream running state.
     *
     * @param {boolean} state
     */
    get isStreamRunning(): boolean;
    /**
     * Get path to the audio worklet scripts (vad.js and buffer.js).
     *
     * @param {string} filename
     * @param {URL|string} [baseUrl=import.meta.url]
     * @returns {string}
     */
    getAudioWorkletPath(filename: string, baseUrl?: URL | string): string;
    /**
     * Start stream transcription.
     *
     * NOTE(review): the defaults documented here for audio_ctx (512) differ
     * from StreamStartOptions (@default 256) — confirm against the
     * implementation.
     *
     * @param {StreamStartOptions} options
     * @param {string} [options.lang="auto"] Language code
     * @param {number} [options.threads=this.maxThreads] Number of threads to use
     * @param {boolean} [options.translate=false] Translate the text
     * @param {boolean} [options.suppress_non_speech=true] Suppress non-speech
     * @param {number} [options.max_tokens=16] Maximum tokens
     * @param {number} [options.audio_ctx=512] Audio context size
     * @returns {Promise<void>}
     */
    start({ lang, threads, translate, suppress_non_speech, max_tokens, audio_ctx, }?: StreamStartOptions): Promise<void>;
    /**
     * Stop stream transcription.
     *
     * @returns {Promise<void>}
     */
    stop(): Promise<void>;
    /**
     * Transcribe stream audio.
     *
     * NOTE(review): the default documented here for preRecordMs (500)
     * differs from StreamTranscribeOptions (@default 200) — confirm against
     * the implementation.
     *
     * @param {MediaStream} stream
     * @param {StreamTranscribeOptions} options
     * @param {number} [options.preRecordMs=500] Audio in ms to include before the begin of speech detected.
     * @param {number} [options.maxRecordMs=5000] Maximum record time in ms before calling transcribe (aka. flush buffer).
     * @param {number} [options.minSilenceMs=500] Minimum time in ms of silence before transcribe is called.
     * @param {function (isSpeaking: boolean): void} [options.onVoiceActivity=null] Callback when voice activity detected.
     * @returns {Promise<void>}
     */
    transcribe(stream: MediaStream, { preRecordMs, maxRecordMs, minSilenceMs, onVoiceActivity, }?: StreamTranscribeOptions): Promise<void>;
  }

  /**
   * Base class for transcribers.
   */
  export class Transcriber {
    /**
     * @constructor
     * @param {TranscriberOptions} options
     */
    constructor(options: TranscriberOptions);
    /**
     * Emscripten createModule function.
     *
     * @protected
     * @type {function}
     */
    protected _createModule: Function;
    /**
     * Model file.
     *
     * @protected
     * @type {string|File}
     */
    protected _model: string | File;
    /**
     * Wasm Module.
     *
     * @type {object}
     * */
    Module: object;
    /**
     * Is shout runtime initialized.
     *
     * @protected
     * @type {boolean}
     */
    protected _isRuntimeInitialized: boolean;
    /**
     * Is model file loaded.
     *
     * @protected
     * @type {boolean}
     */
    protected _isModelFileLoaded: boolean;
    /**
     * Is everything initialized and ready to transcribe.
     *
     * @protected
     * @type {boolean}
     */
    protected _isReady: boolean;
    /**
     * Model filename in wasm filesystem.
     *
     * @protected
     * @type {string}
     */
    protected _modelFilename: string;
    /**
     * Filename of the model in wasm filesystem.
     *
     * @type {string}
     */
    get modelInternalFilename(): string;
    /**
     * Model file.
     *
     * @type {string|File}
     */
    get model(): string | File;
    /**
     * Maximum number of threads.
     *
     * @type {number}
     * @default 2
     */
    get maxThreads(): number;
    /**
     * Is runtime initialized.
     *
     * @type {boolean}
     *
     */
    get isRuntimeInitialized(): boolean;
    /**
     * Is model file loaded.
     *
     * @type {boolean}
     */
    get isModelFileLoaded(): boolean;
    /**
     * True when ready to transcribe.
     *
     * @type {boolean}
     */
    get isReady(): boolean;
    /**
     * Load model and create a new shout instance.
     */
    init(): Promise<void>;
    /**
     * Write audio data directly to wasm memory and returns pointer.
     * Call this after loading audio.
     * Remember to free the memory after transcribing with `this.Module._free(dataPtr)`.
     *
     * @param {Float32Array} pcmf32 Raw audio data. Must be 16kHz, mono.
     * @param {number} [ptr=null] Pointer to the audio data in wasm memory.
     * @returns {number} Pointer to the audio data in wasm memory.
     */
    writeAudioToMemory(pcmf32: Float32Array, ptr?: number): number;
    /**
     * Free wasm memory and destroy module.
     *
     * @return {void}
     */
    destroy(): void;
    /**
     * Unload model file and free wasm memory.
     *
     * @protected
     */
    protected _freeWasmModule(): void;
    /**
     * Load model file into wasm filesystem.
     *
     * @protected
     * @returns {Promise<void>}
     */
    protected _loadModel(): Promise<void>;
  }

  /**
   * Load audio file as raw data, convert to [sampleRate] mono.
   *
   * @param {File} file Audio file
   * @param {number} sampleRate Output sample rate
   * @param {boolean} [toMono = false] Convert to mono
   * @returns {Promise<Float32Array|null>}
   */
  export function audioFileToPcm32(file: File, sampleRate: number, toMono?: boolean): Promise<Float32Array | null>;

  /**
   * Convert buffer to mono.
   *
   * @param {AudioBuffer} buffer
   * @returns {Promise<AudioBuffer>}
   */
  export function downmixAudioBufferToMono(buffer: AudioBuffer): Promise<AudioBuffer>;

  /**
   * Check if browser is Firefox.
   *
   * @returns {boolean}
   */
  export function isFirefox(): boolean;

  /**
   * Create VAD (Voice Activity Detection) AudioWorkletNode.
   * Call audioContext.addModule("vad.js") before creating the node.
   *
   * @param {AudioContext} audioContext Audio context
   * @param {CallableFunction} onSpeech Callback when speech detected
   * @param {CallableFunction} onSilence Callback when silence detected
   * @param {number} [minSilence=100] Minimum silence duration before call onSilence, in ms
   * @returns {AudioWorkletNode|null}
   */
  export function createVad(audioContext: AudioContext, onSpeech: CallableFunction, onSilence: CallableFunction, minSilence?: number): AudioWorkletNode | null;

  /**
   * Create buffer AudioWorkletNode.
   * Call audioContext.addModule("buffer.js") before creating the node.
   *
   * @param {AudioContext} audioContext
   * @param {CallableFunction} onBuffer Called when buffer is full. (samples: Float32Array) => void
   * @param {number} [bufferSizeMs=100]
   * @returns {AudioWorkletNode|null}
   */
  export function createBuffer(audioContext: AudioContext, onBuffer: CallableFunction, bufferSizeMs?: number): AudioWorkletNode | null;
}