UNPKG

@transcribe/transcriber

Version:

Transcribe speech to text in the browser.

706 lines (649 loc) 17.5 kB
declare module '@transcribe/transcriber' {
  /**
   * Model type to specify when word level timestamps should be computed
   * with the DTW algorithm (see `FileTranscriberOptions.dtwType`).
   */
  export type DtwType =
    | "tiny"
    | "base"
    | "small"
    | "tiny.en"
    | "base.en"
    | "small.en";

  /** Status values reported through `StreamTranscriberOptions.onStreamStatus`. */
  export type StreamStatus = "loading" | "waiting" | "processing" | "stopped";

  /** Payload passed to `TranscriberOptions.onExit`. */
  export type ExitStatus = {
    name: string;
    message: string;
    status: number;
  };

  /** A single token of a transcribed segment. */
  export type TranscribeToken = {
    /** Token id */
    id: number;
    /** Confidence 0..1 */
    p: number;
    /** Word level timestamp (DTW), if enabled. */
    dtw?: {
      offset: number;
      /** hh:mm:ss,sss */
      timestamp: string;
    };
    /** Transcribed text. */
    text: string;
    /** Text time offset. */
    offsets?: {
      from: number;
      to: number;
    };
    /** Offset as timestamp hh:mm:ss,sss */
    timestamps?: {
      /** hh:mm:ss,sss */
      from: string;
      /** hh:mm:ss,sss */
      to: string;
    };
  };

  /** One transcribed segment together with its tokens. */
  type TranscribeSegment = {
    /** Transcribed text. */
    text: string;
    /** Text time offset. */
    offsets: {
      from: number;
      to: number;
    };
    /** Offset as timestamp hh:mm:ss,sss */
    timestamps: {
      /** hh:mm:ss,sss */
      from: string;
      /** hh:mm:ss,sss */
      to: string;
    };
    tokens: TranscribeToken[];
  };

  /**
   * Payload of the `onSegment` callbacks: a single segment plus the language used.
   *
   * The type name is misspelled ("Segement"); it is kept unchanged because it
   * is part of the public API.
   */
  export type TranscribeResultSegement = {
    result: {
      /** Language code used by transcriber. */
      language: string;
    };
    segment: TranscribeSegment;
  };

  /** Complete transcription result, as passed to `onComplete` and returned by `FileTranscriber.transcribe`. */
  export type TranscribeResult = {
    result: {
      /** Language code used by transcriber. */
      language: string;
    };
    /** Transcription results split in segments. */
    transcription: TranscribeSegment[];
  };

  /** Options shared by all transcribers. */
  export type TranscriberOptions = {
    /**
     * Emscripten exported createModule function.
     * @see {@link https://emscripten.org/docs/api_reference/module.html}
     *
     * @param {any} moduleArg Used to override module defaults.
     * @returns {Promise<any>} Returns the module object.
     */
    createModule: (moduleArg?: {}) => Promise<any>;

    /**
     * Whisper.cpp model file in ggml format.
     * Will fetch if string, otherwise will use the provided file.
     * @see {@link https://huggingface.co/ggerganov/whisper.cpp}
     */
    model: string | File;

    /**
     * Called on wasm print to stdout.
     *
     * @param {string} message
     */
    print?: (message: string) => void;

    /**
     * Called on wasm print to stderr.
     *
     * @param {string} message
     */
    printErr?: (message: string) => void;

    // NOTE(review): the four hooks below are undocumented upstream; their names
    // match Emscripten Module hooks, so they are presumably forwarded to the
    // module object - confirm against the implementation.
    preInit?: () => void;
    preRun?: () => void;
    onAbort?: () => void;
    onExit?: (exitStatus: ExitStatus) => void;
  };

  /** Constructor options of `FileTranscriber`. */
  export type FileTranscriberOptions = TranscriberOptions & {
    /**
     * If transcriber should compute word level timestamps using DTW algorithm, specify the type of the model used.
     */
    dtwType?: DtwType;

    /**
     * Called when init is ready.
     */
    onReady?: () => void;

    /**
     * Called on transcriber progress.
     *
     * @param {number} progress 0..100
     */
    onProgress?: (progress: number) => void;

    /**
     * Called when transcription is canceled.
     */
    onCanceled?: () => void;

    /**
     * Called when a new transcribed segment is ready.
     *
     * @param {TranscribeResultSegement} segment
     */
    onSegment?: (segment: TranscribeResultSegement) => void;

    /**
     * Called when transcription is complete.
     *
     * @param {TranscribeResult} result
     */
    onComplete?: (result: TranscribeResult) => void;
  };

  /** Constructor options of `StreamTranscriber`. */
  export type StreamTranscriberOptions = TranscriberOptions & {
    /**
     * Path to the audio worklet scripts (vad.js and buffer.js).
     * Defaults to the ./audio-worklets sub directory from where StreamTranscriber.js gets loaded.
     */
    audioWorkletPath?: string;

    /**
     * Called when init is ready.
     */
    onReady?: () => void;

    /**
     * Called when a new transcription from stream is ready.
     *
     * @param {TranscribeResultSegement} segment
     */
    onSegment?: (segment: TranscribeResultSegement) => void;

    /**
     * Called when stream status changes.
     *
     * @param {StreamStatus} status
     */
    onStreamStatus?: (status: StreamStatus) => void;
  };

  /** Per-call options of `FileTranscriber.transcribe`. */
  export type FileTranscribeOptions = {
    /**
     * Language code of the audio, eg. "en"
     * @default "auto"
     */
    lang?: string;

    /**
     * Number of threads to use, defaults to max threads available.
     */
    threads?: number;

    /**
     * Translate the text to english
     * @default false
     */
    translate?: boolean;

    /**
     * Maximum number of characters in a single segment, 0 for not set.
     * @default 0
     */
    max_len?: number;

    /**
     * If true, transcriber will try to split the text on word boundaries.
     * @default false
     */
    split_on_word?: boolean;

    /**
     * If true, transcriber will try to suppress non-speech segments.
     * @default false
     */
    suppress_non_speech?: boolean;

    /**
     * If true, calculates word level timestamps.
     * @default true
     */
    token_timestamps?: boolean;
  };

  /** Options of `StreamTranscriber.start`. */
  export type StreamStartOptions = {
    /**
     * Language code of the audio, eg. "en"
     */
    lang?: string;

    /**
     * Number of threads to use, defaults to max threads available
     */
    threads?: number;

    /**
     * Translate the text to english
     * @default false
     */
    translate?: boolean;

    /**
     * If true, transcriber will try to suppress non-speech segments
     * @default true
     */
    suppress_non_speech?: boolean;

    /**
     * Maximum number of tokens in a single segment, see whisper.cpp
     * @default 16
     */
    max_tokens?: number;

    /**
     * Audio context buffer size in samples, see whisper.cpp
     *
     * NOTE(review): the documentation of `StreamTranscriber.start` states a
     * default of 512 for this option - confirm which value is correct.
     * @default 256
     */
    audio_ctx?: number;
  };

  /** Options of `StreamTranscriber.transcribe`. */
  export type StreamTranscribeOptions = {
    /**
     * Audio in ms to include before the begin of speech detected.
     *
     * NOTE(review): the documentation of `StreamTranscriber.transcribe` states
     * a default of 500 for this option - confirm which value is correct.
     * @default 200
     */
    preRecordMs?: number;

    /**
     * Maximum record time in ms before calling transcribe (aka. flush buffer).
     * @default 5000
     */
    maxRecordMs?: number;

    /**
     * Minimum time in ms of silence before transcribe is called.
     * @default 500
     */
    minSilenceMs?: number;

    /**
     * Called when there's a change in voice activity.
     */
    onVoiceActivity?: (active: boolean) => void;
  };

  /**
   * Transcribe audio to text using the whisper.cpp speech-to-text implementation.
   *
   * @class FileTranscriber
   * @extends Transcriber
   */
  export class FileTranscriber extends Transcriber {
    /**
     * Create a new FileTranscriber instance.
     *
     * @constructor
     * @param {FileTranscriberOptions} options
     */
    constructor(options: FileTranscriberOptions);

    /**
     * @private
     * @type {DtwType}
     */
    private _dtwType;

    /**
     * Callback when init is ready.
     *
     * @private
     * @type {FileTranscriberOptions["onReady"]}
     */
    private _onReady;

    /**
     * Callback when transcription is done.
     *
     * @private
     * @type {FileTranscriberOptions["onComplete"]}
     */
    private _onComplete;

    /**
     * Callback when transcription got canceled.
     *
     * @private
     * @type {FileTranscriberOptions["onCanceled"]}
     */
    private _onCanceled;

    /**
     * Resolve callback for cancel.
     *
     * @private
     * @type {function(): void | null}
     */
    private _resolveCancel;

    /**
     * Resolve callback for transcribe complete.
     *
     * @private
     * @type {function(): void | null}
     */
    private _resolveComplete;

    /**
     * Called when transcription is complete.
     *
     * @type {FileTranscriberOptions["onComplete"]}
     */
    set onComplete(callback: FileTranscriberOptions["onComplete"]);

    /**
     * Called when transcription is canceled.
     *
     * @type {FileTranscriberOptions["onCanceled"]}
     */
    set onCanceled(callback: FileTranscriberOptions["onCanceled"]);

    /**
     * Called on transcriber progress.
     *
     * @type {FileTranscriberOptions["onProgress"]}
     */
    set onProgress(callback: FileTranscriberOptions["onProgress"]);

    /**
     * Called when a new transcribed segment is ready.
     *
     * @type {FileTranscriberOptions["onSegment"]}
     */
    set onSegment(callback: FileTranscriberOptions["onSegment"]);

    /**
     * DTW type.
     *
     * @type {DtwType}
     */
    get dtwType(): DtwType;

    /**
     * Transcribe audio to text.
     *
     * @param {File|string} audio Audio file or URL.
     * @param {FileTranscribeOptions} [options]
     * @param {string} [options.lang="auto"] Language code.
     * @param {number} [options.threads=this.maxThreads] Number of threads to use.
     * @param {boolean} [options.translate=false] Translate the text.
     * @param {number} [options.max_len=0] Maximum segment length in characters.
     * @param {boolean} [options.split_on_word=false] Split the text on word.
     * @param {boolean} [options.suppress_non_speech=false] Suppress non-speech.
     * @param {boolean} [options.token_timestamps=true] Calculate token timestamps.
     * @returns {Promise<TranscribeResult>}
     */
    transcribe(
      audio: File | string,
      {
        lang,
        threads,
        translate,
        max_len,
        split_on_word,
        suppress_non_speech,
        token_timestamps,
      }?: FileTranscribeOptions
    ): Promise<TranscribeResult>;

    /**
     * Cancel the transcription. May take some time.
     *
     * @returns {Promise<void>}
     */
    cancel(): Promise<void>;

    /**
     * Just for resolving the cancel promise.
     *
     * @private
     */
    private _onCancel;

    /**
     * Just for resolving the transcribe complete promise.
     *
     * @private
     */
    private _onTranscribed;

    /**
     * Load audio, convert to 16kHz mono.
     *
     * @private
     * @param {File|string} file Audio file or URL
     * @returns {Promise<Float32Array>}
     */
    private _loadAudio;
  }

  /**
   * Transcribe an audio stream (e.g. microphone input) to text using the
   * whisper.cpp speech-to-text implementation. This is experimental and not
   * working in Firefox because sample rate conversion with AudioContext is not
   * supported. Also, transcribe.wasm is way too slow for real-time transcription.
   *
   * @class StreamTranscriber
   * @extends Transcriber
   * @experimental
   */
  export class StreamTranscriber extends Transcriber {
    /**
     * Create a new StreamTranscriber instance.
     *
     * @constructor
     * @param {StreamTranscriberOptions} options
     */
    constructor(options: StreamTranscriberOptions);

    /**
     * Path to the audio worklet scripts (vad.js and buffer.js).
     *
     * NOTE(review): typed as `any` by the declaration generator; presumably a
     * string like `StreamTranscriberOptions.audioWorkletPath` - confirm before
     * tightening, since this member is public.
     */
    _audioWorkletsPath: any;

    /**
     * Callback when init is ready.
     * @private
     * @type {StreamTranscriberOptions["onReady"]}
     */
    private _onReady;

    /**
     * Is stream transcriber running.
     *
     * @type boolean
     * @private
     */
    private _isStreamRunning;

    /**
     * Audio context for stream transcription.
     *
     * @private
     * @type {AudioContext}
     */
    private _streamAudioContext;

    /**
     * Media source for stream transcription.
     *
     * @private
     * @type {MediaStreamAudioSourceNode}
     */
    private _streamMediaSource;

    /**
     * Called when a new transcription from stream is ready.
     *
     * @type {StreamTranscriberOptions["onSegment"]}
     */
    set onSegment(callback: StreamTranscriberOptions["onSegment"]);

    /**
     * Called when stream status changes.
     *
     * @type {StreamTranscriberOptions["onStreamStatus"]}
     */
    set onStreamStatus(callback: StreamTranscriberOptions["onStreamStatus"]);

    /**
     * Stream running state.
     *
     * @type {boolean}
     */
    get isStreamRunning(): boolean;

    /**
     * Get path to the audio worklet scripts (vad.js and buffer.js).
     *
     * @param {string} filename
     * @param {URL|string} [baseUrl=import.meta.url]
     * @returns {string}
     */
    getAudioWorkletPath(filename: string, baseUrl?: URL | string): string;

    /**
     * Start stream transcription.
     *
     * NOTE(review): `audio_ctx` is documented with default 512 here but with
     * `@default 256` on `StreamStartOptions` - confirm which one is correct.
     *
     * @param {StreamStartOptions} options
     * @param {string} [options.lang="auto"] Language code
     * @param {number} [options.threads=this.maxThreads] Number of threads to use
     * @param {boolean} [options.translate=false] Translate the text
     * @param {boolean} [options.suppress_non_speech=true] Suppress non-speech
     * @param {number} [options.max_tokens=16] Maximum tokens
     * @param {number} [options.audio_ctx=512] Audio context size
     * @returns {Promise<void>}
     */
    start({
      lang,
      threads,
      translate,
      suppress_non_speech,
      max_tokens,
      audio_ctx,
    }?: StreamStartOptions): Promise<void>;

    /**
     * Stop stream transcription.
     *
     * @returns {Promise<void>}
     */
    stop(): Promise<void>;

    /**
     * Transcribe stream audio.
     *
     * NOTE(review): `preRecordMs` is documented with default 500 here but with
     * `@default 200` on `StreamTranscribeOptions` - confirm which one is correct.
     *
     * @param {MediaStream} stream
     * @param {StreamTranscribeOptions} options
     * @param {number} [options.preRecordMs=500] Audio in ms to include before the begin of speech detected.
     * @param {number} [options.maxRecordMs=5000] Maximum record time in ms before calling transcribe (aka. flush buffer).
     * @param {number} [options.minSilenceMs=500] Minimum time in ms of silence before transcribe is called.
     * @param {function (isSpeaking: boolean): void} [options.onVoiceActivity=null] Callback when voice activity detected.
     * @returns {Promise<void>}
     */
    transcribe(
      stream: MediaStream,
      {
        preRecordMs,
        maxRecordMs,
        minSilenceMs,
        onVoiceActivity,
      }?: StreamTranscribeOptions
    ): Promise<void>;
  }

  /**
   * Base class for transcribers.
   */
  export class Transcriber {
    /**
     * @constructor
     * @param {TranscriberOptions} options
     */
    constructor(options: TranscriberOptions);

    /**
     * Emscripten createModule function.
     *
     * @protected
     * @type {function}
     */
    protected _createModule: Function;

    /**
     * Model file.
     *
     * @protected
     * @type {string|File}
     */
    protected _model: string | File;

    /**
     * Wasm Module.
     *
     * @type {object}
     */
    Module: object;

    /**
     * Is shout runtime initialized.
     *
     * @protected
     * @type {boolean}
     */
    protected _isRuntimeInitialized: boolean;

    /**
     * Is model file loaded.
     *
     * @protected
     * @type {boolean}
     */
    protected _isModelFileLoaded: boolean;

    /**
     * Is everything initialized and ready to transcribe.
     *
     * @protected
     * @type {boolean}
     */
    protected _isReady: boolean;

    /**
     * Model filename in wasm filesystem.
     *
     * @protected
     * @type {string}
     */
    protected _modelFilename: string;

    /**
     * Filename of the model in wasm filesystem.
     *
     * @type {string}
     */
    get modelInternalFilename(): string;

    /**
     * Model file.
     *
     * @type {string|File}
     */
    get model(): string | File;

    /**
     * Maximum number of threads.
     *
     * @type {number}
     * @default 2
     */
    get maxThreads(): number;

    /**
     * Is runtime initialized.
     *
     * @type {boolean}
     */
    get isRuntimeInitialized(): boolean;

    /**
     * Is model file loaded.
     *
     * @type {boolean}
     */
    get isModelFileLoaded(): boolean;

    /**
     * True when ready to transcribe.
     *
     * @type {boolean}
     */
    get isReady(): boolean;

    /**
     * Load model and create a new shout instance.
     */
    init(): Promise<void>;

    /**
     * Write audio data directly to wasm memory and returns pointer.
     * Call this after loading audio.
     * Remember to free the memory after transcribing with `this.Module._free(dataPtr)`.
     *
     * @param {Float32Array} pcmf32 Raw audio data. Must be 16kHz, mono.
     * @param {number} [ptr=null] Pointer to the audio data in wasm memory.
     * @returns {number} Pointer to the audio data in wasm memory.
     */
    writeAudioToMemory(pcmf32: Float32Array, ptr?: number): number;

    /**
     * Free wasm memory and destroy module.
     *
     * @return {void}
     */
    destroy(): void;

    /**
     * Unload model file and free wasm memory.
     *
     * @protected
     */
    protected _freeWasmModule(): void;

    /**
     * Load model file into wasm filesystem.
     *
     * @protected
     * @returns {Promise<void>}
     */
    protected _loadModel(): Promise<void>;
  }

  /**
   * Load audio file as raw data, convert to [sampleRate] mono.
   *
   * @param {File} file Audio file
   * @param {number} sampleRate Output sample rate
   * @param {boolean} [toMono = false] Convert to mono
   * @returns {Promise<Float32Array|null>}
   */
  export function audioFileToPcm32(
    file: File,
    sampleRate: number,
    toMono?: boolean
  ): Promise<Float32Array | null>;

  /**
   * Convert buffer to mono.
   *
   * @param {AudioBuffer} buffer
   * @returns {Promise<AudioBuffer>}
   */
  export function downmixAudioBufferToMono(buffer: AudioBuffer): Promise<AudioBuffer>;

  /**
   * Check if browser is Firefox.
   *
   * @returns {boolean}
   */
  export function isFirefox(): boolean;

  /**
   * Create VAD (Voice Activity Detection) AudioWorkletNode.
   * Call audioContext.audioWorklet.addModule("vad.js") before creating the node.
   *
   * @param {AudioContext} audioContext Audio context
   * @param {CallableFunction} onSpeech Callback when speech detected
   * @param {CallableFunction} onSilence Callback when silence detected
   * @param {number} [minSilence=100] Minimum silence duration before call onSilence, in ms
   * @returns {AudioWorkletNode|null}
   */
  export function createVad(
    audioContext: AudioContext,
    onSpeech: CallableFunction,
    onSilence: CallableFunction,
    minSilence?: number
  ): AudioWorkletNode | null;

  /**
   * Create buffer AudioWorkletNode.
   * Call audioContext.audioWorklet.addModule("buffer.js") before creating the node.
   *
   * @param {AudioContext} audioContext
   * @param {CallableFunction} onBuffer Called when buffer is full. (samples: Float32Array) => void
   * @param {number} [bufferSizeMs=100]
   * @returns {AudioWorkletNode|null}
   */
  export function createBuffer(
    audioContext: AudioContext,
    onBuffer: CallableFunction,
    bufferSizeMs?: number
  ): AudioWorkletNode | null;
}