smart-whisper-electron
Version:
Whisper.cpp Node.js binding with auto model offloading strategy.
314 lines (305 loc) • 13.1 kB
TypeScript
import EventEmitter from 'node:events';
/**
 * Sampling strategies selectable through {@link TranscribeParams.strategy}.
 *
 * This is an ambient declaration, so every member lists its numeric value explicitly.
 */
declare enum WhisperSamplingStrategy {
  WHISPER_SAMPLING_GREEDY = 0,
  WHISPER_SAMPLING_BEAM_SEARCH = 1,
}
/**
 * Output formats a transcription can be requested in.
 * `"simple"` selects {@link TranscribeSimpleResult}; any other member selects
 * {@link TranscribeDetailedResult} (see {@link TranscribeResult}).
 */
type TranscribeFormat =
  | "simple"
  | "detail";
/**
 * Parameters accepted by a transcription call.
 *
 * See {@link https://github.com/ggerganov/whisper.cpp/blob/00b7a4be02ca82d53ac69dd2dd438c16e2af7658/whisper.h#L433C19-L433C19} for details.
 *
 * @typeParam Format - Type of the `format` field; defaults to every {@link TranscribeFormat}.
 * @typeParam TokenTimestamp - Type of the `token_timestamps` field; defaults to `false`.
 */
interface TranscribeParams<
  Format extends TranscribeFormat = TranscribeFormat,
  TokenTimestamp extends boolean = false
> {
  strategy: WhisperSamplingStrategy;
  n_threads: number;
  n_max_text_ctx: number;
  offset_ms: number;
  duration_ms: number;
  translate: boolean;
  no_context: boolean;
  no_timestamps: boolean;
  single_segment: boolean;
  print_special: boolean;
  print_progress: boolean;
  print_realtime: boolean;
  print_timestamps: boolean;
  /** Typed by `TokenTimestamp`, which also decides whether token `from`/`to` in detailed results are numbers. */
  token_timestamps: TokenTimestamp;
  thold_pt: number;
  thold_ptsum: number;
  max_len: number;
  split_on_word: boolean;
  max_tokens: number;
  speed_up: boolean;
  debug_mode: boolean;
  audio_ctx: number;
  tdrz_enable: boolean;
  initial_prompt: string;
  /**
   * Language code, e.g. "en", "de", "fr", "es", "it", "nl", "pt", "ru", "tr", "uk", "pl", "sv", "cs", "zh", "ja", "ko"
   */
  language: string;
  suppress_blank: boolean;
  suppress_non_speech_tokens: boolean;
  temperature: number;
  max_initial_ts: number;
  length_penalty: number;
  temperature_inc: number;
  entropy_thold: number;
  logprob_thold: number;
  no_speech_thold: number;
  best_of: number;
  beam_size: number;
  /** Typed by `Format`, which selects the result shape via {@link TranscribeResult}. */
  format: Format;
}
/**
 * Minimal transcription result: a piece of text with its `from`/`to` bounds.
 */
interface TranscribeSimpleResult {
  from: number;
  to: number;
  text: string;
}
/**
 * Detailed transcription result: everything in {@link TranscribeSimpleResult}
 * plus language, confidence and per-token information.
 *
 * @typeParam TokenTimestamp - When `true`, token `from`/`to` are numbers; otherwise they are `undefined`.
 */
interface TranscribeDetailedResult<TokenTimestamp extends boolean>
  extends TranscribeSimpleResult {
  /** The detected spoken language. */
  lang: string;

  /** Confidence of the transcription, calculated as the average probability of the tokens. */
  confidence: number;

  /** Tokens generated during the transcription process. */
  tokens: Array<{
    /** Token text. For CJK languages the text may not be readable because of the BPE encoding. */
    text: string;
    /** Token ID. */
    id: number;
    /** Token probability. */
    p: number;
    /** Start timestamp of the token in milliseconds. Only available when `token_timestamps` of {@link TranscribeParams} is `true`. */
    from: TokenTimestamp extends true ? number : undefined;
    /** End timestamp of the token in milliseconds. Only available when `token_timestamps` of {@link TranscribeParams} is `true`. */
    to: TokenTimestamp extends true ? number : undefined;
  }>;
}
/**
 * Result type selected by the requested format: `"simple"` yields
 * {@link TranscribeSimpleResult}, anything else yields
 * {@link TranscribeDetailedResult} parameterised by `TokenTimestamp`.
 */
type TranscribeResult<
  Format extends TranscribeFormat = TranscribeFormat,
  TokenTimestamp extends boolean = boolean
> = Format extends "simple"
  ? TranscribeSimpleResult
  : TranscribeDetailedResult<TokenTimestamp>;
/**
 * An external handle to a model.
 *
 * The only member is a readonly `unique symbol` property keyed by the empty string.
 */
type Handle = {
  readonly "": unique symbol;
};
/**
 * Presets accepted by {@link WhisperContextParams.dtw_aheads_preset}.
 * Only `NONE` is declared here.
 *
 * NOTE(review): the identifier is spelled "Aligment"; it is exported under this
 * name, so the spelling is kept for compatibility.
 */
declare enum WhisperAligmentHeadsPreset {
  NONE = 0,
}
/**
 * Advanced context options; every field is optional.
 * Used as the `params` member of the configuration interfaces in this file.
 */
interface WhisperContextParams {
  use_gpu?: boolean;
  flash_attn?: boolean;
  gpu_device?: number;
  dtw_token_timestamps?: boolean;
  dtw_aheads_preset?: WhisperAligmentHeadsPreset;
  dtw_n_top?: number;
  dtw_mem_size?: number;
  offload?: number;
}
/**
 * Model-loading configuration accepted by `Binding.WhisperModel.load`.
 * All members are optional. This interface is what the module exports under
 * the name `WhisperConfig`.
 */
interface WhisperConfig$1 {
  /**
   * Whether to use GPU acceleration (if available).
   * @default true
   */
  gpu?: boolean;

  /**
   * Time in seconds after which the model is freed from memory.
   * @default 0 (disabled)
   */
  offload?: number;

  /**
   * Advanced configuration parameters.
   */
  params?: WhisperContextParams;
}
/**
 * Shape of the native binding: callback-style `load`/`free`/`transcribe`
 * functions plus a promise-based `WhisperModel` wrapper class.
 */
declare namespace Binding {
  /**
   * Loads a model from a whisper weights file.
   * @param file Path to the whisper weights file.
   * @param gpu Whether the GPU should be used.
   * @param callback Invoked with the handle to the model.
   */
  function load(
    file: string,
    gpu: boolean,
    callback: (handle: Handle) => void
  ): void;

  /**
   * Releases the memory of the model; the model is unusable afterwards.
   * @param handle Handle to the model.
   * @param callback Invoked once the model has been freed.
   */
  function free(
    handle: Handle,
    callback: () => void
  ): void;

  /**
   * Transcribes a PCM buffer.
   * @param handle Handle to the model.
   * @param pcm The PCM buffer.
   * @param params Parameters to use for the transcription.
   * @param finish Invoked when the transcription is finished.
   * @param progress Invoked whenever a new result is available.
   */
  function transcribe<Format extends TranscribeFormat, TokenTimestamp extends boolean>(
    handle: Handle,
    pcm: Float32Array,
    params: Partial<TranscribeParams<Format, TokenTimestamp>>,
    finish: (results: Array<TranscribeResult<Format, TokenTimestamp>>) => void,
    progress: (result: TranscribeResult<Format, TokenTimestamp>) => void
  ): void;

  /**
   * Wrapper around a model {@link Handle}.
   */
  class WhisperModel {
    private _ctx;

    constructor(handle: Handle);

    /** The wrapped handle, or `null`. */
    get handle(): Handle | null;

    /** Whether the model has been freed. */
    get freed(): boolean;

    /**
     * Releases the memory of the model; the model is unusable afterwards.
     * Calling this repeatedly is safe, but the model is only freed once.
     */
    free(): Promise<void>;

    /**
     * Loads a model from a whisper weights file.
     * @param file Path to the whisper weights file.
     * @param config Configuration for the model, or a boolean for GPU usage.
     */
    static load(file: string, config?: WhisperConfig$1 | boolean): Promise<WhisperModel>;
  }
}
/**
 * The native binding for the underlying C++ addon.
 * Its type is the {@link Binding} namespace, so it exposes `load`, `free`,
 * `transcribe` and the `WhisperModel` class declared there.
 */
declare const binding: typeof Binding;
/**
 * Model class exported by this module.
 * It adds no members of its own; everything is inherited from the
 * `WhisperModel` class exposed on {@link binding}.
 */
declare class WhisperModel extends binding.WhisperModel {
}
/**
 * A transcription task bound to a {@link WhisperModel}.
 *
 * The task is an `EventEmitter` with typed listeners for two events:
 * - `"transcribed"` — receives a single result;
 * - `"finish"` — receives the array of all results.
 */
declare class TranscribeTask<
  Format extends TranscribeFormat,
  TokenTimestamp extends boolean
> extends EventEmitter {
  private _model;
  private _result;

  /**
   * Do not construct this class directly; use {@link TranscribeTask.run} instead.
   */
  constructor(model: WhisperModel);

  /** The model this task was created with. */
  get model(): WhisperModel;

  /**
   * A promise that resolves to the result of the transcription task.
   */
  get result(): Promise<Array<TranscribeResult<Format, TokenTimestamp>>>;

  private _run;

  /**
   * Creates a task for `pcm` on `model` with the given `params`.
   * @returns A promise that resolves to the task.
   */
  static run<Format extends TranscribeFormat, TokenTimestamp extends boolean>(
    model: WhisperModel,
    pcm: Float32Array,
    params: Partial<TranscribeParams<Format, TokenTimestamp>>
  ): Promise<TranscribeTask<Format, TokenTimestamp>>;

  // Typed listener registration.
  on(event: "finish", listener: (results: Array<TranscribeResult<Format, TokenTimestamp>>) => void): this;
  on(event: "transcribed", listener: (result: TranscribeResult<Format, TokenTimestamp>) => void): this;

  // Typed one-shot listener registration.
  once(event: "finish", listener: (results: Array<TranscribeResult<Format, TokenTimestamp>>) => void): this;
  once(event: "transcribed", listener: (result: TranscribeResult<Format, TokenTimestamp>) => void): this;

  // Typed listener removal.
  off(event: "finish", listener: (results: Array<TranscribeResult<Format, TokenTimestamp>>) => void): this;
  off(event: "transcribed", listener: (result: TranscribeResult<Format, TokenTimestamp>) => void): this;
}
/**
 * Configuration held by a {@link Whisper} instance.
 * The `Whisper` constructor accepts a `Partial` of this interface.
 */
interface WhisperConfig {
  /**
   * Time in seconds to wait before offloading the model if it's not being used.
   */
  offload: number;

  /**
   * Whether to use the GPU or not.
   */
  gpu: boolean;

  /**
   * Advanced configuration parameters.
   */
  params?: WhisperContextParams;
}
/**
 * Manages the lifecycle and operations of a whisper model: loading and
 * offloading the model, running transcription tasks, and holding the model
 * configuration.
 */
declare class Whisper {
  private _file;
  private _available;
  private _loading;
  private _tasks;
  private _config;
  private _offload_timer;

  /**
   * Creates a Whisper instance for a model file.
   * @param file - The path to the Whisper model file.
   * @param config - Optional configuration; any subset of {@link WhisperConfig}.
   */
  constructor(file: string, config?: Partial<WhisperConfig>);

  /** Path of the model file; readable and writable. */
  get file(): string;
  set file(file: string);

  /** The configuration of this instance. */
  get config(): WhisperConfig;

  /** Result promises, one per task. NOTE(review): presumably only the tasks still tracked by this instance — confirm. */
  get tasks(): Array<Promise<Array<TranscribeResult>>>;

  // NOTE(review): presumably restarts the offload countdown from `config.offload` — confirm against the implementation.
  reset_offload_timer(): void;
  private clear_offload_timer;

  // NOTE(review): presumably resolves to the loaded model, loading it when needed — confirm against the implementation.
  model(): Promise<WhisperModel>;

  /**
   * Loads the whisper model asynchronously.
   * If the model is already being loaded, the existing load is returned.
   *
   * Calling this directly is not required; it is invoked automatically when
   * needed by {@link Whisper.transcribe}.
   *
   * @returns A Promise that resolves to the loaded model.
   */
  load(): Promise<WhisperModel>;

  /**
   * Transcribes the given PCM audio data using the Whisper model.
   * @param pcm - The mono 16k PCM audio data to transcribe.
   * @param params - Optional parameters for transcription.
   * @returns A promise that resolves to the transcription task.
   */
  transcribe<Format extends TranscribeFormat, TokenTimestamp extends boolean>(
    pcm: Float32Array,
    params?: Partial<TranscribeParams<Format, TokenTimestamp>>
  ): Promise<TranscribeTask<Format, TokenTimestamp>>;

  // NOTE(review): presumably releases the currently loaded model — confirm against the implementation.
  free(): Promise<void>;
}
/**
 * URLs of the available ggml whisper models.
 * Each entry maps a model name (the key) to the URL of that model (the value).
 */
declare const MODELS: {
  readonly "tiny": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin";
  readonly "tiny.en": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin";
  readonly "small": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin";
  readonly "small.en": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin";
  readonly "base": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin";
  readonly "base.en": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin";
  readonly "medium": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin";
  readonly "medium.en": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin";
  readonly "large-v1": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin";
  readonly "large-v2": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2.bin";
  readonly "large-v3": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin";
  readonly "large-v3-turbo": "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin";
};
/**
 * Name of a model: either one of the keys of {@link MODELS} or any other string.
 * The `string & {}` member is what admits arbitrary strings alongside the known keys.
 */
type ModelName =
  | keyof typeof MODELS
  | (string & {});
/**
 * Downloads a ggml whisper model from a specified URL or shorthand.
 *
 * @param model - The model to download, given either as a key of the {@link MODELS} object (shorthand) or as a URL.
 * @returns A promise that resolves to the name of the downloaded model.
 * @throws An error if the model URL or shorthand is invalid, or if the model fails to download.
 */
declare function download(model: ModelName): Promise<string>;
/**
 * Removes a locally downloaded model.
 * The call is synchronous and returns nothing.
 *
 * @param model - The name of the model to remove.
 */
declare function remove(model: ModelName): void;
/**
 * Retrieves the names of the models that are available locally.
 * The call is synchronous.
 *
 * @returns An array of model names.
 */
declare function list(): ModelName[];
/**
 * Checks whether a model exists.
 *
 * @param model - The name of the model.
 * @returns `true` if the model exists, `false` otherwise.
 */
declare function check(model: ModelName): boolean;
/**
 * Resolves the absolute path of a model.
 *
 * @param model - The name of the model.
 * @returns The resolved path of the model.
 * @throws Error if the model is not found.
 */
declare function resolve(model: ModelName): string;
/**
 * Directory paths used by the model manager.
 * NOTE(review): `models` is presumably where downloaded models are stored and
 * `root` its parent — confirm against the implementation.
 */
declare const dir: {
  root: string;
  models: string;
};
// Prefixed aliases of the model-management declarations above. They exist so
// the `index` namespace below can re-export each one under its original name.
declare const index_MODELS: typeof MODELS;
type index_ModelName = ModelName;
declare const index_check: typeof check;
declare const index_dir: typeof dir;
declare const index_download: typeof download;
declare const index_list: typeof list;
declare const index_remove: typeof remove;
declare const index_resolve: typeof resolve;

/**
 * Namespace grouping the model-management helpers
 * (`MODELS`, `ModelName`, `check`, `dir`, `download`, `list`, `remove`, `resolve`).
 */
declare namespace index {
  export {
    index_MODELS as MODELS,
    type index_ModelName as ModelName,
    index_check as check,
    index_dir as dir,
    index_download as download,
    index_list as list,
    index_remove as remove,
    index_resolve as resolve,
  };
}
// Public surface of the module. Two names are remapped on export:
// `WhisperConfig$1` is published as `WhisperConfig`, and the `index`
// namespace is published as `manager`.
export {
  Binding,
  type Handle,
  type TranscribeDetailedResult,
  type TranscribeFormat,
  type TranscribeParams,
  type TranscribeResult,
  type TranscribeSimpleResult,
  TranscribeTask,
  Whisper,
  WhisperAligmentHeadsPreset,
  type WhisperConfig$1 as WhisperConfig,
  type WhisperContextParams,
  WhisperModel,
  WhisperSamplingStrategy,
  binding,
  index as manager,
};