web-vad
Web Voice Activity Detection (VAD)
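/*
Usage sketch (illustrative, not part of the bundle): the class, option, and callback
names below are taken from the VAD class and defaultVADOptions defined later in this
file; treat this as a hedged example rather than official documentation.

    import { VAD } from "web-vad";

    const vad = new VAD({
        modelURL: "/silero_vad.onnx",   // serve the model from your public/static folder
        workletURL: "/worklet.js",      // the package's audio worklet file
        onSpeechStart: () => console.log("speech started"),
        onSpeechEnd: (audio) => console.log("speech ended", audio.length) // audio is a Float32Array
    });

    await vad.init();   // requests the microphone (unless a stream was passed) and loads the model
    vad.start();        // begin listening
    // ...later
    vad.pause();
    vad.destroy();
*/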
import {Tensor as $hgUW1$Tensor, InferenceSession as $hgUW1$InferenceSession} from "onnxruntime-web";
function $parcel$defineInteropFlag(a) {
Object.defineProperty(a, '__esModule', {value: true, configurable: true});
}
function $parcel$export(e, n, v, s) {
Object.defineProperty(e, n, {get: v, set: s, enumerable: true, configurable: true});
}
var $ce973a3aaf304746$exports = {};
$parcel$defineInteropFlag($ce973a3aaf304746$exports);
$parcel$export($ce973a3aaf304746$exports, "VADState", () => $ce973a3aaf304746$export$700c1521c45794a6);
$parcel$export($ce973a3aaf304746$exports, "VAD", () => $ce973a3aaf304746$export$d0b8dcd6dc101d82);
$parcel$export($ce973a3aaf304746$exports, "default", () => $ce973a3aaf304746$export$2e2bcd8739ae039);
/*
Some of this code, together with the default options found in index.ts,
was taken from (or inspired by) https://github.com/snakers4/silero-vad
*/ var $dc2966de588d0343$export$f69c19e57285b83a;
(function(Message) {
Message["AudioFrame"] = "AUDIO_FRAME";
Message["SpeechStart"] = "SPEECH_START";
Message["VADMisfire"] = "VAD_MISFIRE";
Message["SpeechEnd"] = "SPEECH_END";
Message["SpeechStop"] = "SPEECH_STOP";
})($dc2966de588d0343$export$f69c19e57285b83a || ($dc2966de588d0343$export$f69c19e57285b83a = {}));
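// The enum above lists the message identifiers exchanged with the audio worklet.
// Frame sizes (in samples) the Silero model expects at 16 kHz; other values trigger a warning in validateOptions().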
const $a6a66754aa7b2357$export$ae9c9253c0f8e534 = [
512,
1024,
1536
];
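// Default frame-processor options: a frame counts as speech when its probability is at or above
// positiveSpeechThreshold; a speech segment ends once redemptionFrames frames below
// negativeSpeechThreshold accumulate without an intervening speech frame, and segments with
// fewer than minSpeechFrames speech frames are reported as VAD misfires.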
const $a6a66754aa7b2357$export$e50124db40db28c2 = {
positiveSpeechThreshold: 0.5,
negativeSpeechThreshold: 0.35,
preSpeechPadFrames: 1,
redemptionFrames: 8,
frameSamples: 1536,
minSpeechFrames: 3,
submitUserSpeechOnPause: false
};
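// Concatenate a list of Float32Arrays into one contiguous Float32Array.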
const $a6a66754aa7b2357$var$concatArrays = (arrays)=>{
const sizes = arrays.reduce((out, next)=>{
out.push(out.at(-1) + next.length);
return out;
}, [
0
]);
const outArray = new Float32Array(sizes.at(-1));
arrays.forEach((arr, index)=>{
const place = sizes[index];
outArray.set(arr, place);
});
return outArray;
};
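// Frame processor: buffers incoming frames, runs the model on each one, and turns the raw
// speech probabilities into SpeechStart / SpeechEnd / VADMisfire events using the options above.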
class $a6a66754aa7b2357$export$1d7e16ee15e61187 {
modelProcessFunc;
modelResetFunc;
options;
speaking;
audioBuffer;
redemptionCounter;
active;
constructor(modelProcessFunc, modelResetFunc, options){
this.modelProcessFunc = modelProcessFunc;
this.modelResetFunc = modelResetFunc;
this.options = options;
this.speaking = false;
this.redemptionCounter = 0;
this.active = false;
this.reset = ()=>{
this.speaking = false;
this.audioBuffer = [];
this.modelResetFunc();
this.redemptionCounter = 0;
};
this.pause = ()=>{
this.active = false;
if (this.options.submitUserSpeechOnPause) return this.endSegment();
else {
this.reset();
return {};
}
};
this.resume = ()=>{
this.active = true;
};
this.endSegment = ()=>{
const audioBuffer = this.audioBuffer;
this.audioBuffer = [];
const speaking = this.speaking;
this.reset();
const speechFrameCount = audioBuffer.reduce((acc, item)=>{
return acc + +item.isSpeech;
}, 0);
if (speaking) {
if (speechFrameCount >= this.options.minSpeechFrames) {
const audio = $a6a66754aa7b2357$var$concatArrays(audioBuffer.map((item)=>item.frame));
return {
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechEnd,
audio: audio
};
} else return {
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).VADMisfire
};
}
return {};
};
this.process = async (frame)=>{
if (!this.active) return {};
const probs = await this.modelProcessFunc(frame);
this.audioBuffer.push({
frame: frame,
isSpeech: probs.isSpeech >= this.options.positiveSpeechThreshold
});
if (probs.isSpeech >= this.options.positiveSpeechThreshold && this.redemptionCounter) this.redemptionCounter = 0;
if (probs.isSpeech >= this.options.positiveSpeechThreshold && !this.speaking) {
this.speaking = true;
return {
probs: probs,
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechStart
};
}
if (probs.isSpeech < this.options.negativeSpeechThreshold && this.speaking && ++this.redemptionCounter >= this.options.redemptionFrames) {
this.redemptionCounter = 0;
this.speaking = false;
const audioBuffer = this.audioBuffer;
this.audioBuffer = [];
const speechFrameCount = audioBuffer.reduce((acc, item)=>{
return acc + +item.isSpeech;
}, 0);
if (speechFrameCount >= this.options.minSpeechFrames) {
const audio = $a6a66754aa7b2357$var$concatArrays(audioBuffer.map((item)=>item.frame));
return {
probs: probs,
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechEnd,
audio: audio
};
} else return {
probs: probs,
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).VADMisfire
};
}
if (!this.speaking) while(this.audioBuffer.length > this.options.preSpeechPadFrames)this.audioBuffer.shift();
return {
probs: probs
};
};
this.audioBuffer = [];
this.reset();
}
reset;
pause;
resume;
endSegment;
process;
}
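// Silero VAD model wrapper: loads the ONNX model with onnxruntime-web, keeps the recurrent
// h/c state tensors between frames, and returns per-frame speech / not-speech probabilities.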
class $77f7b1278a02417f$export$30968c4af2c3e6f4 {
_session;
_sr;
_h;
_c;
zeroes = Array(128).fill(0);
constructor(){
this._session = null;
this._sr = new $hgUW1$Tensor("int64", [
16000n
]);
this._h = new $hgUW1$Tensor("float32", this.zeroes, [
2,
1,
64
]);
this._c = new $hgUW1$Tensor("float32", this.zeroes, [
2,
1,
64
]);
}
async init(modelURL) {
try {
const modelArrayBuffer = await fetch(modelURL, {
cache: "force-cache"
}).then((m)=>m.arrayBuffer());
this._session = await $hgUW1$InferenceSession.create(modelArrayBuffer);
this.reset_state();
} catch (e) {
throw new Error(`Unable to load model: ${modelURL} - Have you moved it to the public/static folder?`);
}
}
reset_state = ()=>{
this._h = new $hgUW1$Tensor("float32", this.zeroes, [
2,
1,
64
]);
this._c = new $hgUW1$Tensor("float32", this.zeroes, [
2,
1,
64
]);
};
process = async (audioFrame)=>{
if (!this._session) throw new Error("Model not loaded");
const t = new $hgUW1$Tensor("float32", audioFrame, [
1,
audioFrame.length
]);
const inputs = {
input: t,
h: this._h,
c: this._c,
sr: this._sr
};
const out = await this._session.run(inputs);
this._h = out.hn;
this._c = out.cn;
const isSpeech = out.output.data[0];
const notSpeech = 1 - isSpeech;
return {
notSpeech: notSpeech,
isSpeech: isSpeech
};
};
}
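// Audio-graph VAD node: loads the audio worklet and the Silero model, feeds AUDIO_FRAME
// messages from the worklet through the frame processor, and dispatches the resulting
// events to the user-supplied callbacks.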
class $0c77cb1bb2feb95e$export$206ec340d8ea02fe {
options;
audioContext;
vadNode;
frameProcessor;
constructor(audioContext, options){
this.audioContext = audioContext;
this.options = options;
this.vadNode = null;
this.frameProcessor = null;
}
async load() {
// Load worklet
try {
await this.audioContext.audioWorklet.addModule(this.options.workletURL);
} catch (e) {
console.error(`Encountered an error while loading worklet. Please import the worklet file at:
${this.options.workletURL}
If need be, you can customize the worklet file location using the \`workletURL\` option.`);
throw e;
}
const vadNode = new AudioWorkletNode(this.audioContext, "vad-helper-worklet", {
processorOptions: {
frameSamples: this.options.frameSamples
}
});
// Load Silero
const model = new (0, $77f7b1278a02417f$export$30968c4af2c3e6f4)();
await model.init(this.options.modelURL);
// Load frame processor
const frameProcessor = new (0, $a6a66754aa7b2357$export$1d7e16ee15e61187)(model.process, model.reset_state, {
frameSamples: this.options.frameSamples,
positiveSpeechThreshold: this.options.positiveSpeechThreshold,
negativeSpeechThreshold: this.options.negativeSpeechThreshold,
redemptionFrames: this.options.redemptionFrames,
preSpeechPadFrames: this.options.preSpeechPadFrames,
minSpeechFrames: this.options.minSpeechFrames,
submitUserSpeechOnPause: this.options.submitUserSpeechOnPause
});
vadNode.port.onmessage = async (ev)=>{
switch(ev.data?.message){
case (0, $dc2966de588d0343$export$f69c19e57285b83a).AudioFrame:
{
const buffer = ev.data.data;
const frame = new Float32Array(buffer);
await this.processFrame(frame);
break;
}
default:
break;
}
};
this.vadNode = vadNode;
this.frameProcessor = frameProcessor;
return this;
}
start = ()=>{
this.frameProcessor?.resume();
};
receive = (node)=>{
node.connect(this.vadNode);
};
pause = ()=>{
if (!this.frameProcessor) return;
const ev = this.frameProcessor.pause();
this.handleFrameProcessorEvent(ev);
};
processFrame = async (frame)=>{
if (!this.frameProcessor) return;
const ev = await this.frameProcessor.process(frame);
this.handleFrameProcessorEvent(ev);
};
destroy = ()=>{
this.vadNode?.port.postMessage({
message: (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechStop
});
this.vadNode?.disconnect();
};
handleFrameProcessorEvent = (ev)=>{
if (ev.probs !== undefined) this.options.onFrameProcessed(ev.probs);
switch(ev.msg){
case (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechStart:
this.options.onSpeechStart();
break;
case (0, $dc2966de588d0343$export$f69c19e57285b83a).VADMisfire:
this.options.onVADMisfire();
break;
case (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechEnd:
this.options.onSpeechEnd(ev.audio);
break;
default:
break;
}
};
}
var $0c77cb1bb2feb95e$export$2e2bcd8739ae039 = $0c77cb1bb2feb95e$export$206ec340d8ea02fe;
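// Default options for the high-level VAD class; workletURL and modelURL are resolved relative
// to the page, so both files must be served alongside the application.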
const $ce973a3aaf304746$var$defaultVADOptions = {
...(0, $a6a66754aa7b2357$export$e50124db40db28c2),
workletURL: "./worklet.js",
modelURL: "./silero_vad.onnx",
stream: null,
onVADMisfire: ()=>{
console.log("[VAD] VAD misfire");
},
onSpeechStart: ()=>{
console.log("[VAD] Speech start detected");
},
onSpeechEnd: ()=>{
console.log("[VAD] Speech end detected");
},
onFrameProcessed: ()=>{},
additionalAudioConstraints: {
channelCount: 1,
echoCancellation: true,
autoGainControl: true,
noiseSuppression: true
}
};
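// Lifecycle states reported through the VAD instance's `state` field.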
var $ce973a3aaf304746$export$700c1521c45794a6;
(function(VADState) {
VADState["initializing"] = "initializing";
VADState["loading"] = "loading";
VADState["ready"] = "ready";
VADState["listening"] = "listening";
VADState["paused"] = "paused";
VADState["destroyed"] = "destroyed";
VADState["errored"] = "errored";
})($ce973a3aaf304746$export$700c1521c45794a6 || ($ce973a3aaf304746$export$700c1521c45794a6 = {}));
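// High-level entry point: acquires a microphone stream (unless one is supplied via options),
// creates an AudioContext, and wires the stream into the audio-graph VAD node above.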
class $ce973a3aaf304746$export$d0b8dcd6dc101d82 {
options;
state;
audioContext = null;
stream = null;
sourceNode = null;
audioVADNode = null;
constructor(options){
this.options = {
...$ce973a3aaf304746$var$defaultVADOptions,
...options,
additionalAudioConstraints: {
...$ce973a3aaf304746$var$defaultVADOptions.additionalAudioConstraints,
...options.additionalAudioConstraints
}
};
this.state = "initializing";
if (!this.validateOptions()) return;
}
async init() {
let stream = this.options.stream;
if (!stream) stream = await navigator.mediaDevices.getUserMedia({
audio: {
...this.options.additionalAudioConstraints,
channelCount: 1,
echoCancellation: true,
autoGainControl: true,
noiseSuppression: true
}
});
this.stream = stream; // keep a reference so destroy() can stop tracks this VAD created
// Create a new audio context
const audioContext = new AudioContext();
const sourceNode = new MediaStreamAudioSourceNode(audioContext, {
mediaStream: stream
});
this.state = "loading";
// Create a new Audio VAD node to load models and process frames
const audioNodeVAD = new (0, $0c77cb1bb2feb95e$export$206ec340d8ea02fe)(audioContext, this.options);
await audioNodeVAD.load();
audioNodeVAD.receive(sourceNode);
// Update references
this.audioContext = audioContext;
this.sourceNode = sourceNode;
this.audioVADNode = audioNodeVAD;
this.state = "ready";
return audioNodeVAD;
}
validateOptions() {
const options = this.options;
if (!(0, $a6a66754aa7b2357$export$ae9c9253c0f8e534).includes(options.frameSamples)) console.warn("You are using an unusual frame size");
if (options.positiveSpeechThreshold < 0 || options.positiveSpeechThreshold > 1) console.error("positiveSpeechThreshold should be a number between 0 and 1");
if (options.negativeSpeechThreshold < 0 || options.negativeSpeechThreshold > options.positiveSpeechThreshold) console.error("negativeSpeechThreshold should be between 0 and positiveSpeechThreshold");
if (options.preSpeechPadFrames < 0) console.error("preSpeechPadFrames should be positive");
if (options.redemptionFrames < 0) console.error("redemptionFrames should be positive");
return true;
}
onFrameProcessed() {}
onVADMisfire() {}
onSpeechStart() {}
onSpeechEnd() {}
pause = ()=>{
this.audioVADNode?.pause();
this.state = "paused";
};
start = ()=>{
if (this.state !== "ready") {
this.state = "errored";
throw Error("Attempt to start VAD without initializing. Please await init() first.");
}
this.audioVADNode?.start();
this.state = "listening";
};
destroy = ()=>{
if (this.state === "listening") this.pause();
if (!this.options.stream) this.stream?.getTracks().forEach((track)=>track.stop()); // only stop tracks for streams this VAD created itself
this.sourceNode?.disconnect();
this.audioVADNode?.destroy();
this.audioContext?.close();
this.state = "destroyed";
};
static async precacheModels(sileroURL) {
try {
await fetch(sileroURL, {
cache: "force-cache"
});
} catch (e) {
throw new Error(`Unable to load Silero model: ${sileroURL} - Have you moved it to the public/static folder?`);
}
try {
await fetch("/ort-wasm-simd-threaded.wasm", {
cache: "force-cache"
});
} catch (e) {
throw new Error(`Unable to load ONNX runtime: /ort-wasm-simd-threaded.wasm - Have you moved it to the public/static folder?`);
}
}
}
var $ce973a3aaf304746$export$2e2bcd8739ae039 = $ce973a3aaf304746$export$d0b8dcd6dc101d82;
export {$ce973a3aaf304746$export$700c1521c45794a6 as VADState, $ce973a3aaf304746$export$d0b8dcd6dc101d82 as VAD};
//# sourceMappingURL=index.js.map