UNPKG

web-voice-detection

Version:

A WebAssembly-powered Voice Activity Detection library for the browser.

124 lines (123 loc) 4.29 kB
import { Message } from "./messages";
import { SpeechProbabilities } from "./models";

/**
 * Configuration options for the frame-based voice activity detection
 * algorithm implemented by {@link FrameProcessor}.
 */
export type FrameProcessorOptions = {
  /**
   * Threshold over which values returned by the ONNXModel model
   * will be considered as positively indicating speech.
   * The ONNXModel model is run on each frame.
   * This number should be between 0 and 1.
   */
  positiveSpeechThreshold: number;
  /**
   * Threshold under which values returned by the ONNXModel model
   * will be considered as indicating an absence of speech.
   * Note that the creators of the ONNXModel have historically
   * set this number at 0.15 less than `positiveSpeechThreshold`.
   */
  negativeSpeechThreshold: number;
  /**
   * After a detected value under the `negativeSpeechThreshold` is observed,
   * the algorithm will wait `redemptionFrames` frames before running `onSpeechEnd`.
   * If the model returns a value over `positiveSpeechThreshold` during this
   * grace period, then the algorithm will consider the previously-detected
   * "speech end" as having been a false negative.
   */
  redemptionFrames: number;
  /**
   * Number of audio samples (under a sample rate of 16000) to comprise
   * one "frame" to feed to the ONNXModel model. The `frame` serves
   * as a unit of measurement of lengths of audio segments and many other
   * parameters are defined in terms of frames. The authors of the
   * ONNX model offer the following warning:
   *
   * > WARNING! ONNXModel models were trained using
   * > 512, 1024, 1536 samples for 16000 sample rate
   * > and 256, 512, 768 samples for 8000 sample rate.
   * > Values other than these may affect model performance!
   *
   * In this context, audio fed to the model always has sample rate 16000.
   * It is probably a good idea to leave this at 1536.
   */
  frameSamples: number;
  /**
   * Number of frames to prepend to the audio
   * segment that will be passed to `onSpeechEnd`.
   */
  preSpeechPadFrames: number;
  /**
   * If an audio segment is detected as a speech segment according to initial algorithm
   * but it has fewer than `minSpeechFrames`,
   * it will be discarded and `onMisfire` will be run instead of `onSpeechEnd`.
   */
  minSpeechFrames: number;
  /**
   * If true, when the user pauses the detection, it may trigger `onSpeechEnd`.
   */
  submitUserSpeechOnPause: boolean;
};

/**
 * Default option set for {@link FrameProcessor}.
 * Concrete values live in the implementation module, not this declaration.
 */
export declare const defaultFrameProcessorOptions: FrameProcessorOptions;

/**
 * Checks a {@link FrameProcessorOptions} object for validity.
 * NOTE(review): declared to return `void` — presumably it logs or throws on
 * invalid combinations (e.g. thresholds outside [0, 1]); confirm against the
 * implementation before relying on either behavior.
 */
export declare function validateOptions(options: FrameProcessorOptions): void;

/**
 * Minimal contract that a frame processor must satisfy.
 */
export type FrameProcessorInterface = {
  /** Resume processing after a pause. */
  resume: () => void;
  /**
   * Feed one frame of audio to the detector. The resolved object may carry
   * speech probabilities for the frame, an event message, and/or a completed
   * audio segment — all fields are optional and depend on detection state.
   */
  process: (arr: Float32Array) => Promise<{
    probs?: SpeechProbabilities;
    msg?: Message;
    audio?: Float32Array;
  }>;
  /**
   * Force-close the current segment, returning any event message and
   * buffered audio that resulted.
   */
  endSegment: () => {
    msg?: Message;
    audio?: Float32Array;
  };
};

/**
 * Stateful frame-by-frame voice activity detector. Wraps a model-inference
 * function and a model-reset function, buffers recent frames for pre-speech
 * padding, and applies the threshold/redemption algorithm described on
 * {@link FrameProcessorOptions}.
 */
export declare class FrameProcessor implements FrameProcessorInterface {
  /** Runs the VAD model on a single frame, resolving to its speech probabilities. */
  modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>;
  /** Resets the model's internal state (return value is ignored by callers of this type). */
  modelResetFunc: () => any;
  /** Options governing thresholds, padding, and segment-length rules. */
  options: FrameProcessorOptions;
  /** True while the processor currently considers speech to be in progress. */
  speaking: boolean;
  /**
   * Recently processed frames tagged with their per-frame speech verdict.
   * Presumably retained to supply `preSpeechPadFrames` of leading audio —
   * confirm against the implementation.
   */
  audioBuffer: {
    frame: Float32Array;
    isSpeech: boolean;
  }[];
  /**
   * Grace-period counter used with `options.redemptionFrames` — see the
   * `redemptionFrames` documentation for the algorithm.
   */
  redemptionCounter: number;
  /** Whether the processor is currently active (i.e. not paused). */
  active: boolean;
  constructor(modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>, modelResetFunc: () => any, options: FrameProcessorOptions);
  /** Clears buffered audio and detection state. */
  reset: () => void;
  /**
   * Pause detection. Depending on state (and `submitUserSpeechOnPause`) the
   * result may carry a message with or without a final audio segment, or be empty.
   */
  pause: () => {
    msg: Message;
    audio: Float32Array;
  } | {
    msg: Message;
    audio?: undefined;
  } | {
    msg?: undefined;
    audio?: undefined;
  };
  /** Resume detection after `pause`. */
  resume: () => void;
  /**
   * Force-close the current segment. May yield a message with or without the
   * segment's audio, or nothing if there was no segment in progress.
   */
  endSegment: () => {
    msg: Message;
    audio: Float32Array;
  } | {
    msg: Message;
    audio?: undefined;
  } | {
    msg?: undefined;
    audio?: undefined;
  };
  /**
   * Feed one frame of audio. The union of result shapes mirrors the
   * detection states: nothing notable, probabilities plus an event message,
   * a completed segment's audio, or probabilities alone.
   */
  process: (frame: Float32Array) => Promise<{
    probs?: undefined;
    msg?: undefined;
    audio?: undefined;
  } | {
    probs: SpeechProbabilities;
    msg: Message;
    audio?: undefined;
  } | {
    probs: SpeechProbabilities;
    msg: Message;
    audio: Float32Array;
  } | {
    probs: SpeechProbabilities;
    msg?: undefined;
    audio?: undefined;
  }>;
}