// Package: @ericedouard/vad-node-realtime
// Powerful, user-friendly realtime voice activity detector (VAD) for node
// (compiled JavaScript, 165 lines, 6.11 kB)
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RealTimeVAD = exports.getDefaultRealTimeVADOptions = exports.DEFAULT_MODEL = void 0;
const frame_processor_1 = require("./common/frame-processor");
const messages_1 = require("./common/messages");
const models_1 = require("./common/models");
const resampler_1 = require("./common/resampler");
// Silero model version used when the caller does not specify one.
exports.DEFAULT_MODEL = "v5";
/**
 * Build the complete default option set for a given model version.
 *
 * Starts from the frame-processor defaults matching the model ("v5" or
 * legacy), then layers on the VAD-level defaults: a 16 kHz sample rate,
 * no-op event callbacks, and no ONNX-runtime configuration hook.
 *
 * @param {string} [model] - model version; falls back to DEFAULT_MODEL
 * @returns {object} fully-populated options object for RealTimeVAD
 */
function getDefaultRealTimeVADOptions(model = exports.DEFAULT_MODEL) {
    let frameOpts;
    if (model === "v5") {
        frameOpts = frame_processor_1.defaultV5FrameProcessorOptions;
    }
    else {
        frameOpts = frame_processor_1.defaultLegacyFrameProcessorOptions;
    }
    return {
        ...frameOpts,
        sampleRate: 16000,
        onFrameProcessed: () => { },
        onVADMisfire: () => { },
        onSpeechStart: () => { },
        onSpeechRealStart: () => { },
        onSpeechEnd: () => { },
        ortConfig: undefined,
        model,
    };
}
exports.getDefaultRealTimeVADOptions = getDefaultRealTimeVADOptions;
/**
 * RealTimeVAD accepts raw audio of any length and (via an optional
 * resampler) any sample rate, slices it into fixed-size 16 kHz frames,
 * and feeds each frame to a FrameProcessor whose speech events are
 * dispatched to the user-supplied callbacks in `options`.
 */
class RealTimeVAD {
    /**
     * Construct a new instance with provided options and loaded model.
     *
     * @param options - fully-resolved options (see getDefaultRealTimeVADOptions)
     * @param modelInstance - loaded Silero model exposing `process` and `reset_state`
     */
    constructor(options, modelInstance) {
        this.options = options;
        // Leftover samples that do not yet form a complete frame.
        this.buffer = new Float32Array(0);
        this.active = false;
        this.resampler = null;
        /** Dispatch a frame-processor event to the matching user callback. */
        this.handleEvent = (ev) => {
            switch (ev.msg) {
                case messages_1.Message.FrameProcessed:
                    this.options.onFrameProcessed(ev.probs, ev.frame);
                    break;
                case messages_1.Message.SpeechStart:
                    this.options.onSpeechStart();
                    break;
                case messages_1.Message.SpeechRealStart:
                    this.options.onSpeechRealStart();
                    break;
                case messages_1.Message.VADMisfire:
                    this.options.onVADMisfire();
                    break;
                case messages_1.Message.SpeechEnd:
                    this.options.onSpeechEnd(ev.audio);
                    break;
            }
        };
        this.modelInstance = modelInstance;
        this.frameSize = options.frameSamples;
        this.frameProcessor = new frame_processor_1.FrameProcessor(modelInstance.process, modelInstance.reset_state, {
            frameSamples: options.frameSamples,
            positiveSpeechThreshold: options.positiveSpeechThreshold,
            negativeSpeechThreshold: options.negativeSpeechThreshold,
            redemptionFrames: options.redemptionFrames,
            preSpeechPadFrames: options.preSpeechPadFrames,
            minSpeechFrames: options.minSpeechFrames,
            submitUserSpeechOnPause: options.submitUserSpeechOnPause,
        });
        // Only resample when the input is not already at the model's 16 kHz.
        if (options.sampleRate !== 16000) {
            this.resampler = new resampler_1.Resampler({
                nativeSampleRate: options.sampleRate,
                targetSampleRate: 16000,
                targetFrameSize: this.frameSize,
            });
        }
    }
    /**
     * Create and initialize a RealTimeVAD instance.
     *
     * Merges `opts` over the model-specific defaults, validates them,
     * applies the optional `ortConfig` hook to the ONNX runtime, loads
     * the matching Silero model, and returns the ready-to-use VAD.
     *
     * @param ort - ONNX runtime namespace
     * @param modelFetcher - async provider of the model binary
     * @param opts - partial option overrides
     * @returns initialized RealTimeVAD (call `start()` before feeding audio)
     */
    static async new(ort, modelFetcher, opts = {}) {
        const fullOpts = {
            ...getDefaultRealTimeVADOptions(opts.model),
            ...opts,
        };
        (0, frame_processor_1.validateOptions)(fullOpts);
        if (fullOpts.ortConfig)
            fullOpts.ortConfig(ort);
        const modelVersion = fullOpts.model || exports.DEFAULT_MODEL;
        const factory = modelVersion === "v5" ? models_1.SileroV5.new : models_1.SileroLegacy.new;
        const modelInstance = await factory(ort, modelFetcher);
        return new RealTimeVAD(fullOpts, modelInstance);
    }
    /** Start processing incoming frames. */
    start() {
        this.active = true;
        this.frameProcessor.resume();
    }
    /** Pause processing; may emit an end-of-segment event on pause. */
    pause() {
        this.active = false;
        this.frameProcessor.pause(this.handleEvent);
    }
    /**
     * Feed raw audio (at the configured native sample rate) into the VAD.
     * No-op while paused. Audio is resampled if needed, appended to the
     * internal buffer, and every complete frame is processed in order.
     *
     * @param audioData - Float32Array of PCM samples
     */
    async processAudio(audioData) {
        if (!this.active)
            return;
        let data = audioData;
        if (this.resampler) {
            // Collect resampled frames, then pack them into one contiguous array.
            const chunks = [];
            for await (const frame of this.resampler.stream(audioData)) {
                chunks.push(frame);
            }
            data = new Float32Array(chunks.reduce((sum, c) => sum + c.length, 0));
            let off = 0;
            for (const c of chunks) {
                data.set(c, off);
                off += c.length;
            }
        }
        // Append the new samples to any leftovers from the previous call.
        const tmp = new Float32Array(this.buffer.length + data.length);
        tmp.set(this.buffer);
        tmp.set(data, this.buffer.length);
        this.buffer = tmp;
        // Process complete frames. Copy each frame with slice() rather than
        // handing out a subarray() view: a view retained by the frame
        // processor (e.g. for pre-speech padding or segment assembly) would
        // pin the entire concatenated buffer in memory.
        while (this.buffer.length >= this.frameSize) {
            const frame = this.buffer.slice(0, this.frameSize);
            this.buffer = this.buffer.subarray(this.frameSize);
            await this.frameProcessor.process(frame, this.handleEvent);
        }
    }
    /**
     * Flush any remaining buffered audio (zero-padded to a full frame)
     * through the processor, end the current segment, and clear the buffer.
     */
    async flush() {
        if (this.buffer.length > 0 && this.buffer.length < this.frameSize) {
            // Trailing partial frame: pad with silence so it is not dropped.
            const pad = new Float32Array(this.frameSize);
            pad.set(this.buffer);
            await this.frameProcessor.process(pad, this.handleEvent);
        }
        this.frameProcessor.endSegment(this.handleEvent);
        this.buffer = new Float32Array(0);
    }
    /** Reset internal state: drop buffered audio and reset the model. */
    reset() {
        this.buffer = new Float32Array(0);
        this.modelInstance.reset_state();
    }
    /** Clean up: pause (emitting any final segment) and reset state. */
    destroy() {
        this.pause();
        this.reset();
    }
}
exports.RealTimeVAD = RealTimeVAD;
//# sourceMappingURL=real-time-vad.js.map