web-voice-detection
A WebAssembly-powered Voice Activity Detection library for the browser.
234 lines (204 loc) • 6.93 kB
text/typescript
import { log } from "./logging";
import { Message } from "./messages";
import { SpeechProbabilities } from "./models";
// Frame sizes the ONNX model was trained on at a 16000 Hz sample rate;
// validateOptions warns when options.frameSamples is not one of these.
const RECOMMENDED_FRAME_SAMPLES = [512, 1024, 1536];
/**
 * Tunable parameters for the frame-based voice activity detection
 * algorithm. Frame counts are measured in frames of `frameSamples`
 * audio samples at a 16000 Hz sample rate.
 */
export type FrameProcessorOptions = {
/**
 * Threshold over which values returned by the ONNX model
 * will be considered as positively indicating speech.
 * The model is run on each frame.
 * This number should be between 0 and 1.
 */
positiveSpeechThreshold: number;
/**
 * Threshold under which values returned by the ONNX model
 * will be considered as indicating an absence of speech.
 * Note that the creators of the model have historically
 * set this number at 0.15 less than `positiveSpeechThreshold`.
 */
negativeSpeechThreshold: number;
/**
 * After a detected value under the `negativeSpeechThreshold` is observed,
 * the algorithm will wait `redemptionFrames` frames before running `onSpeechEnd`.
 * If the model returns a value over `positiveSpeechThreshold` during this
 * grace period, then the algorithm will consider the previously-detected
 * "speech end" as having been a false negative.
 */
redemptionFrames: number;
/**
 * Number of audio samples (under a sample rate of 16000) to comprise
 * one "frame" to feed to the ONNX model. The frame serves
 * as a unit of measurement of lengths of audio segments and many other
 * parameters are defined in terms of frames. The authors of the
 * ONNX model offer the following warning:
 *
 * > WARNING! The models were trained using
 * > 512, 1024, 1536 samples for 16000 sample rate
 * > and 256, 512, 768 samples for 8000 sample rate.
 * > Values other than these may affect model performance!
 *
 * In this context, audio fed to the model always has sample rate 16000.
 * It is probably a good idea to leave this at 1536.
 */
frameSamples: number;
/**
 * Number of frames to prepend to the audio
 * segment that will be passed to `onSpeechEnd`.
 */
preSpeechPadFrames: number;
/**
 * If an audio segment is detected as a speech segment according to initial algorithm
 * but it has fewer than `minSpeechFrames`,
 * it will be discarded and `onMisfire` will be run instead of `onSpeechEnd`.
 */
minSpeechFrames: number;
/**
 * If true, when the user pauses the detection, it may trigger `onSpeechEnd`.
 */
submitUserSpeechOnPause: boolean;
};
// Default processor configuration. negativeSpeechThreshold follows the
// model authors' convention of sitting 0.15 below positiveSpeechThreshold,
// and frameSamples uses the largest recommended size for 16 kHz audio.
export const defaultFrameProcessorOptions: FrameProcessorOptions = {
positiveSpeechThreshold: 0.5,
negativeSpeechThreshold: 0.5 - 0.15,
preSpeechPadFrames: 1,
redemptionFrames: 8,
frameSamples: 1536,
minSpeechFrames: 3,
submitUserSpeechOnPause: false,
};
/**
 * Sanity-checks frame processor options. Logs a warning for unusual
 * frame sizes and errors for out-of-range thresholds or negative frame
 * counts. Logging only — it neither throws nor corrects the options.
 */
export function validateOptions(options: FrameProcessorOptions) {
  if (!RECOMMENDED_FRAME_SAMPLES.includes(options.frameSamples)) {
    log.warn("You are using an unusual frame size");
  }
  // Bug fix: the upper-bound check previously inspected
  // negativeSpeechThreshold instead of positiveSpeechThreshold.
  if (options.positiveSpeechThreshold < 0 || options.positiveSpeechThreshold > 1) {
    log.error("positiveSpeechThreshold should be a number between 0 and 1");
  }
  if (
    options.negativeSpeechThreshold < 0 ||
    options.negativeSpeechThreshold > options.positiveSpeechThreshold
  ) {
    log.error("negativeSpeechThreshold should be between 0 and positiveSpeechThreshold");
  }
  if (options.preSpeechPadFrames < 0) {
    log.error("preSpeechPadFrames should be positive");
  }
  // Bug fix: this message previously named preSpeechPadFrames (copy-paste).
  if (options.redemptionFrames < 0) {
    log.error("redemptionFrames should be positive");
  }
}
/**
 * Contract implemented by FrameProcessor: lifecycle control plus
 * per-frame processing and explicit segment flushing.
 */
export type FrameProcessorInterface = {
/** Enables processing; before this is called, `process` returns an empty result. */
resume: () => void;
/**
 * Runs the model on one frame of audio. Resolves with the model's
 * speech probabilities, plus an optional event message and — when a
 * segment ends — the concatenated segment audio.
 */
process: (arr: Float32Array) => Promise<{
probs?: SpeechProbabilities;
msg?: Message;
audio?: Float32Array;
}>;
/** Flushes any in-progress segment, returning its event message and audio if one existed. */
endSegment: () => { msg?: Message; audio?: Float32Array };
};
/**
 * Concatenates a list of Float32Arrays into one contiguous Float32Array,
 * preserving order.
 */
const concatArrays = (arrays: Float32Array[]): Float32Array => {
  let totalLength = 0;
  for (const chunk of arrays) {
    totalLength += chunk.length;
  }
  const joined = new Float32Array(totalLength);
  let offset = 0;
  for (const chunk of arrays) {
    joined.set(chunk, offset);
    offset += chunk.length;
  }
  return joined;
};
export class FrameProcessor implements FrameProcessorInterface {
speaking: boolean = false;
audioBuffer: { frame: Float32Array; isSpeech: boolean }[];
redemptionCounter = 0;
active = false;
constructor(
public modelProcessFunc: (frame: Float32Array) => Promise<SpeechProbabilities>,
public modelResetFunc: () => any,
public options: FrameProcessorOptions,
) {
this.audioBuffer = [];
this.reset();
}
reset = () => {
this.speaking = false;
this.audioBuffer = [];
this.modelResetFunc();
this.redemptionCounter = 0;
};
pause = () => {
this.active = false;
if (this.options.submitUserSpeechOnPause) {
return this.endSegment();
} else {
this.reset();
return {};
}
};
resume = () => {
this.active = true;
};
endSegment = () => {
const audioBuffer = this.audioBuffer;
this.audioBuffer = [];
const speaking = this.speaking;
this.reset();
const speechFrameCount = audioBuffer.reduce((acc, item) => {
return acc + +item.isSpeech;
}, 0);
if (speaking) {
if (speechFrameCount >= this.options.minSpeechFrames) {
const audio = concatArrays(audioBuffer.map((item) => item.frame));
return { msg: Message.SpeechEnd, audio };
} else {
return { msg: Message.Misfire };
}
}
return {};
};
process = async (frame: Float32Array) => {
if (!this.active) {
return {};
}
const probs = await this.modelProcessFunc(frame);
this.audioBuffer.push({
frame,
isSpeech: probs.isSpeech >= this.options.positiveSpeechThreshold,
});
if (probs.isSpeech >= this.options.positiveSpeechThreshold && this.redemptionCounter) {
this.redemptionCounter = 0;
}
if (probs.isSpeech >= this.options.positiveSpeechThreshold && !this.speaking) {
this.speaking = true;
return { probs, msg: Message.SpeechStart };
}
if (
probs.isSpeech < this.options.negativeSpeechThreshold &&
this.speaking &&
++this.redemptionCounter >= this.options.redemptionFrames
) {
this.redemptionCounter = 0;
this.speaking = false;
const audioBuffer = this.audioBuffer;
this.audioBuffer = [];
const speechFrameCount = audioBuffer.reduce((acc, item) => {
return acc + +item.isSpeech;
}, 0);
if (speechFrameCount >= this.options.minSpeechFrames) {
const audio = concatArrays(audioBuffer.map((item) => item.frame));
return { probs, msg: Message.SpeechEnd, audio };
} else {
return { probs, msg: Message.Misfire };
}
}
if (!this.speaking) {
while (this.audioBuffer.length > this.options.preSpeechPadFrames) {
this.audioBuffer.shift();
}
}
return { probs };
};
}