web-vad
Web Voice Activity Detection (VAD)
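/*
Usage sketch (illustrative, not part of the bundle): the class, option, and callback
names below are taken from the VAD class and defaultVADOptions defined later in this
file; treat this as a hedged example rather than official documentation.

    import { VAD } from "web-vad";

    const vad = new VAD({
        modelURL: "/silero_vad.onnx",   // serve the model from your public/static folder
        workletURL: "/worklet.js",      // the package's audio worklet file
        onSpeechStart: () => console.log("speech started"),
        onSpeechEnd: (audio) => console.log("speech ended", audio.length) // audio is a Float32Array
    });

    await vad.init();   // requests the microphone (unless a stream was passed) and loads the model
    vad.start();        // begin listening
    // ...later
    vad.pause();
    vad.destroy();
*/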
import {Tensor as $hgUW1$Tensor, InferenceSession as $hgUW1$InferenceSession} from "onnxruntime-web";
function $parcel$defineInteropFlag(a) {
Object.defineProperty(a, '__esModule', {value: true, configurable: true});
}
function $parcel$export(e, n, v, s) {
Object.defineProperty(e, n, {get: v, set: s, enumerable: true, configurable: true});
}
var $ce973a3aaf304746$exports = {};
$parcel$defineInteropFlag($ce973a3aaf304746$exports);
$parcel$export($ce973a3aaf304746$exports, "VADState", () => $ce973a3aaf304746$export$700c1521c45794a6);
$parcel$export($ce973a3aaf304746$exports, "VAD", () => $ce973a3aaf304746$export$d0b8dcd6dc101d82);
$parcel$export($ce973a3aaf304746$exports, "default", () => $ce973a3aaf304746$export$2e2bcd8739ae039);
/*
Some of this code, together with the default options found in index.ts,
was taken from (or inspired by) https://github.com/snakers4/silero-vad
*/ var $dc2966de588d0343$export$f69c19e57285b83a;
(function(Message) {
Message["AudioFrame"] = "AUDIO_FRAME";
Message["SpeechStart"] = "SPEECH_START";
Message["VADMisfire"] = "VAD_MISFIRE";
Message["SpeechEnd"] = "SPEECH_END";
Message["SpeechStop"] = "SPEECH_STOP";
})($dc2966de588d0343$export$f69c19e57285b83a || ($dc2966de588d0343$export$f69c19e57285b83a = {}));
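// The enum above lists the message identifiers exchanged with the audio worklet.
// Frame sizes (in samples) the Silero model expects at 16 kHz; other values trigger a warning in validateOptions().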
const $a6a66754aa7b2357$export$ae9c9253c0f8e534 = [
512,
1024,
1536
];
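// Default frame-processor options: a frame counts as speech when its probability is at or above
// positiveSpeechThreshold; a speech segment ends once redemptionFrames frames below
// negativeSpeechThreshold accumulate without an intervening speech frame, and segments with
// fewer than minSpeechFrames speech frames are reported as VAD misfires.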
const $a6a66754aa7b2357$export$e50124db40db28c2 = {
positiveSpeechThreshold: 0.5,
negativeSpeechThreshold: 0.35,
preSpeechPadFrames: 1,
redemptionFrames: 8,
frameSamples: 1536,
minSpeechFrames: 3,
submitUserSpeechOnPause: false
};
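// Concatenate a list of Float32Arrays into one contiguous Float32Array.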
const $a6a66754aa7b2357$var$concatArrays = (arrays)=>{
const sizes = arrays.reduce((out, next)=>{
out.push(out.at(-1) + next.length);
return out;
}, [
0
]);
const outArray = new Float32Array(sizes.at(-1));
arrays.forEach((arr, index)=>{
const place = sizes[index];
outArray.set(arr, place);
});
return outArray;
};
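// Frame processor: buffers incoming frames, runs the model on each one, and turns the raw
// speech probabilities into SpeechStart / SpeechEnd / VADMisfire events using the options above.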
class $a6a66754aa7b2357$export$1d7e16ee15e61187 {
modelProcessFunc;
modelResetFunc;
options;
speaking;
audioBuffer;
redemptionCounter;
active;
constructor(modelProcessFunc, modelResetFunc, options){
this.modelProcessFunc = modelProcessFunc;
this.modelResetFunc = modelResetFunc;
this.options = options;
this.speaking = false;
this.redemptionCounter = 0;
this.active = false;
this.reset = ()=>{
this.speaking = false;
this.audioBuffer = [];
this.modelResetFunc();
this.redemptionCounter = 0;
};
this.pause = ()=>{
this.active = false;
if (this.options.submitUserSpeechOnPause) return this.endSegment();
else {
this.reset();
return {};
}
};
this.resume = ()=>{
this.active = true;
};
this.endSegment = ()=>{
const audioBuffer = this.audioBuffer;
this.audioBuffer = [];
const speaking = this.speaking;
this.reset();
const speechFrameCount = audioBuffer.reduce((acc, item)=>{
return acc + +item.isSpeech;
}, 0);
if (speaking) {
if (speechFrameCount >= this.options.minSpeechFrames) {
const audio = $a6a66754aa7b2357$var$concatArrays(audioBuffer.map((item)=>item.frame));
return {
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechEnd,
audio: audio
};
} else return {
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).VADMisfire
};
}
return {};
};
this.process = async (frame)=>{
if (!this.active) return {};
const probs = await this.modelProcessFunc(frame);
this.audioBuffer.push({
frame: frame,
isSpeech: probs.isSpeech >= this.options.positiveSpeechThreshold
});
if (probs.isSpeech >= this.options.positiveSpeechThreshold && this.redemptionCounter) this.redemptionCounter = 0;
if (probs.isSpeech >= this.options.positiveSpeechThreshold && !this.speaking) {
this.speaking = true;
return {
probs: probs,
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechStart
};
}
if (probs.isSpeech < this.options.negativeSpeechThreshold && this.speaking && ++this.redemptionCounter >= this.options.redemptionFrames) {
this.redemptionCounter = 0;
this.speaking = false;
const audioBuffer = this.audioBuffer;
this.audioBuffer = [];
const speechFrameCount = audioBuffer.reduce((acc, item)=>{
return acc + +item.isSpeech;
}, 0);
if (speechFrameCount >= this.options.minSpeechFrames) {
const audio = $a6a66754aa7b2357$var$concatArrays(audioBuffer.map((item)=>item.frame));
return {
probs: probs,
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechEnd,
audio: audio
};
} else return {
probs: probs,
msg: (0, $dc2966de588d0343$export$f69c19e57285b83a).VADMisfire
};
}
if (!this.speaking) while(this.audioBuffer.length > this.options.preSpeechPadFrames)this.audioBuffer.shift();
return {
probs: probs
};
};
this.audioBuffer = [];
this.reset();
}
reset;
pause;
resume;
endSegment;
process;
}
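// Silero VAD model wrapper: loads the ONNX model with onnxruntime-web, keeps the recurrent
// h/c state tensors between frames, and returns per-frame speech / not-speech probabilities.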
class $77f7b1278a02417f$export$30968c4af2c3e6f4 {
_session;
_sr;
_h;
_c;
zeroes = Array(128).fill(0);
constructor(){
this._session = null;
this._sr = new $hgUW1$Tensor("int64", [
16000n
]);
this._h = new $hgUW1$Tensor("float32", this.zeroes, [
2,
1,
64
]);
this._c = new $hgUW1$Tensor("float32", this.zeroes, [
2,
1,
64
]);
}
async init(modelURL) {
try {
const modelArrayBuffer = await fetch(modelURL, {
cache: "force-cache"
}).then((m)=>m.arrayBuffer());
this._session = await $hgUW1$InferenceSession.create(modelArrayBuffer);
this.reset_state();
} catch (e) {
throw new Error(`Unable to load model: ${modelURL} - Have you moved it to the public/static folder?`);
}
}
reset_state = ()=>{
this._h = new $hgUW1$Tensor("float32", this.zeroes, [
2,
1,
64
]);
this._c = new $hgUW1$Tensor("float32", this.zeroes, [
2,
1,
64
]);
};
process = async (audioFrame)=>{
if (!this._session) throw new Error("Model not loaded");
const t = new $hgUW1$Tensor("float32", audioFrame, [
1,
audioFrame.length
]);
const inputs = {
input: t,
h: this._h,
c: this._c,
sr: this._sr
};
const out = await this._session.run(inputs);
this._h = out.hn;
this._c = out.cn;
const isSpeech = out.output.data[0];
const notSpeech = 1 - isSpeech;
return {
notSpeech: notSpeech,
isSpeech: isSpeech
};
};
}
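// Audio-graph VAD node: loads the audio worklet and the Silero model, feeds AUDIO_FRAME
// messages from the worklet through the frame processor, and dispatches the resulting
// events to the user-supplied callbacks.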
class $0c77cb1bb2feb95e$export$206ec340d8ea02fe {
options;
audioContext;
vadNode;
frameProcessor;
constructor(audioContext, options){
this.audioContext = audioContext;
this.options = options;
this.vadNode = null;
this.frameProcessor = null;
}
async load() {
// Load worklet
try {
await this.audioContext.audioWorklet.addModule(this.options.workletURL);
} catch (e) {
console.error(`Encountered an error while loading worklet. Please import the worklet file at:
${this.options.workletURL}
If need be, you can customize the worklet file location using the \`workletURL\` option.`);
throw e;
}
const vadNode = new AudioWorkletNode(this.audioContext, "vad-helper-worklet", {
processorOptions: {
frameSamples: this.options.frameSamples
}
});
// Load Silero
const model = new (0, $77f7b1278a02417f$export$30968c4af2c3e6f4)();
await model.init(this.options.modelURL);
// Load frame processor
const frameProcessor = new (0, $a6a66754aa7b2357$export$1d7e16ee15e61187)(model.process, model.reset_state, {
frameSamples: this.options.frameSamples,
positiveSpeechThreshold: this.options.positiveSpeechThreshold,
negativeSpeechThreshold: this.options.negativeSpeechThreshold,
redemptionFrames: this.options.redemptionFrames,
preSpeechPadFrames: this.options.preSpeechPadFrames,
minSpeechFrames: this.options.minSpeechFrames,
submitUserSpeechOnPause: this.options.submitUserSpeechOnPause
});
vadNode.port.onmessage = async (ev)=>{
switch(ev.data?.message){
case (0, $dc2966de588d0343$export$f69c19e57285b83a).AudioFrame:
{
const buffer = ev.data.data;
const frame = new Float32Array(buffer);
await this.processFrame(frame);
break;
}
default:
break;
}
};
this.vadNode = vadNode;
this.frameProcessor = frameProcessor;
return this;
}
start = ()=>{
this.frameProcessor?.resume();
};
receive = (node)=>{
node.connect(this.vadNode);
};
pause = ()=>{
if (!this.frameProcessor) return;
const ev = this.frameProcessor.pause();
this.handleFrameProcessorEvent(ev);
};
processFrame = async (frame)=>{
if (!this.frameProcessor) return;
const ev = await this.frameProcessor.process(frame);
this.handleFrameProcessorEvent(ev);
};
destroy = ()=>{
this.vadNode?.port.postMessage({
message: (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechStop
});
this.vadNode?.disconnect();
};
handleFrameProcessorEvent = (ev)=>{
if (ev.probs !== undefined) this.options.onFrameProcessed(ev.probs);
switch(ev.msg){
case (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechStart:
this.options.onSpeechStart();
break;
case (0, $dc2966de588d0343$export$f69c19e57285b83a).VADMisfire:
this.options.onVADMisfire();
break;
case (0, $dc2966de588d0343$export$f69c19e57285b83a).SpeechEnd:
this.options.onSpeechEnd(ev.audio);
break;
default:
break;
}
};
}
var $0c77cb1bb2feb95e$export$2e2bcd8739ae039 = $0c77cb1bb2feb95e$export$206ec340d8ea02fe;
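// Default options for the high-level VAD class; workletURL and modelURL are resolved relative
// to the page, so both files must be served alongside the application.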
const $ce973a3aaf304746$var$defaultVADOptions = {
...(0, $a6a66754aa7b2357$export$e50124db40db28c2),
workletURL: "./worklet.js",
modelURL: "./silero_vad.onnx",
stream: null,
onVADMisfire: ()=>{
console.log("[VAD] VAD misfire");
},
onSpeechStart: ()=>{
console.log("[VAD] Speech start detected");
},
onSpeechEnd: ()=>{
console.log("[VAD] Speech end detected");
},
onFrameProcessed: ()=>{},
additionalAudioConstraints: {
channelCount: 1,
echoCancellation: true,
autoGainControl: true,
noiseSuppression: true
}
};
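// Lifecycle states reported through the VAD instance's `state` field.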
var $ce973a3aaf304746$export$700c1521c45794a6;
(function(VADState) {
VADState["initializing"] = "initializing";
VADState["loading"] = "loading";
VADState["ready"] = "ready";
VADState["listening"] = "listening";
VADState["paused"] = "paused";
VADState["destroyed"] = "destroyed";
VADState["errored"] = "errored";
})($ce973a3aaf304746$export$700c1521c45794a6 || ($ce973a3aaf304746$export$700c1521c45794a6 = {}));
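// High-level entry point: acquires a microphone stream (unless one is supplied via options),
// creates an AudioContext, and wires the stream into the audio-graph VAD node above.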
class $ce973a3aaf304746$export$d0b8dcd6dc101d82 {
options;
state;
audioContext = null;
stream = null;
sourceNode = null;
audioVADNode = null;
constructor(options){
this.options = {
...$ce973a3aaf304746$var$defaultVADOptions,
...options,
additionalAudioConstraints: {
...$ce973a3aaf304746$var$defaultVADOptions.additionalAudioConstraints,
...options.additionalAudioConstraints
}
};
this.state = "initializing";
if (!this.validateOptions()) return;
}
async init() {
let stream = this.options.stream;
if (!stream) stream = await navigator.mediaDevices.getUserMedia({
audio: {
...this.options.additionalAudioConstraints,
channelCount: 1,
echoCancellation: true,
autoGainControl: true,
noiseSuppression: true
}
});
this.stream = stream; // keep a reference so destroy() can stop tracks this VAD created
// Create a new audio context
const audioContext = new AudioContext();
const sourceNode = new MediaStreamAudioSourceNode(audioContext, {
mediaStream: stream
});
this.state = "loading";
// Create a new Audio VAD node to load models and process frames
const audioNodeVAD = new (0, $0c77cb1bb2feb95e$export$206ec340d8ea02fe)(audioContext, this.options);
await audioNodeVAD.load();
audioNodeVAD.receive(sourceNode);
// Update references
this.audioContext = audioContext;
this.sourceNode = sourceNode;
this.audioVADNode = audioNodeVAD;
this.state = "ready";
return audioNodeVAD;
}
validateOptions() {
const options = this.options;
if (!(0, $a6a66754aa7b2357$export$ae9c9253c0f8e534).includes(options.frameSamples)) console.warn("You are using an unusual frame size");
if (options.positiveSpeechThreshold < 0 || options.positiveSpeechThreshold > 1) console.error("positiveSpeechThreshold should be a number between 0 and 1");
if (options.negativeSpeechThreshold < 0 || options.negativeSpeechThreshold > options.positiveSpeechThreshold) console.error("negativeSpeechThreshold should be between 0 and positiveSpeechThreshold");
if (options.preSpeechPadFrames < 0) console.error("preSpeechPadFrames should be positive");
if (options.redemptionFrames < 0) console.error("redemptionFrames should be positive");
return true;
}
onFrameProcessed() {}
onVADMisfire() {}
onSpeechStart() {}
onSpeechEnd() {}
pause = ()=>{
this.audioVADNode?.pause();
this.state = "paused";
};
start = ()=>{
if (this.state !== "ready") {
this.state = "errored";
throw Error("Attempt to start VAD without initializing. Please await init() first.");
}
this.audioVADNode?.start();
this.state = "listening";
};
destroy = ()=>{
if (this.state === "listening") this.pause();
if (!this.options.stream) this.stream?.getTracks().forEach((track)=>track.stop()); // only stop tracks for streams this VAD created itself
this.sourceNode?.disconnect();
this.audioVADNode?.destroy();
this.audioContext?.close();
this.state = "destroyed";
};
static async precacheModels(sileroURL) {
try {
await fetch(sileroURL, {
cache: "force-cache"
});
} catch (e) {
throw new Error(`Unable to load Silero model: ${sileroURL} - Have you moved it to the public/static folder?`);
}
try {
await fetch("/ort-wasm-simd-threaded.wasm", {
cache: "force-cache"
});
} catch (e) {
throw new Error(`Unable to load ONNX runtime: /ort-wasm-simd-threaded.wasm - Have you moved it to the public/static folder?`);
}
}
}
var $ce973a3aaf304746$export$2e2bcd8739ae039 = $ce973a3aaf304746$export$d0b8dcd6dc101d82;
export {$ce973a3aaf304746$export$700c1521c45794a6 as VADState, $ce973a3aaf304746$export$d0b8dcd6dc101d82 as VAD};
//# sourceMappingURL=index.js.map