@steelbrain/media-speech-detection-web
Version:
Production-ready speech detection using Silero VAD ONNX model for web browsers
362 lines • 14.4 kB
JavaScript
import * as ort from 'onnxruntime-web';
/**
 * Preloads the Silero VAD ONNX model by fetching it ahead of time.
 *
 * The model file is resolved relative to this module (`../silero_vad.onnx`),
 * fetched, and its body fully read so that the browser's HTTP cache can serve
 * later requests for the same URL (e.g. when a speech filter is first created).
 *
 * @returns {Promise<void>} Resolves once the model file has been fetched and read.
 * @throws {Error} If the request fails or returns a non-2xx status. The message
 *   is prefixed with "Failed to preload Silero VAD model: " and the underlying
 *   failure is available on `error.cause`.
 *
 * @example
 * ```typescript
 * // Preload during app initialization
 * await preloadModel();
 *
 * // Later, speech filters will load faster from browser cache
 * const speechTransform = speechFilter({
 *   onSpeechStart: () => console.log('🎤 Speech started')
 * });
 * ```
 */
export async function preloadModel() {
  try {
    const modelUrl = new URL('../silero_vad.onnx', import.meta.url).href;
    const response = await fetch(modelUrl);
    if (!response.ok) {
      throw new Error(`Failed to fetch model: ${response.status} ${response.statusText}`);
    }
    // Read the whole body; a response that is never consumed may not be stored.
    await response.arrayBuffer();
  } catch (error) {
    const reason = error instanceof Error ? error.message : 'Unknown error';
    // Keep the original failure reachable for callers that need the stack or type.
    throw new Error(`Failed to preload Silero VAD model: ${reason}`, { cause: error });
  }
}
/**
 * Speech filter transform stream - filters audio so that only speech chunks
 * are passed downstream.
 *
 * Incoming chunks are handed to an internal `VADProcessor`; whatever frames it
 * returns are enqueued on the readable side unless `options.noEmit` is set.
 * When the writable side is closed the processor is finalized (which may fire
 * `onSpeechEnd` / `onMisfire`) and then destroyed. Destruction is guaranteed
 * even if finalization or a user callback throws.
 *
 * Usage:
 * ```typescript
 * const speechTransform = speechFilter({
 *   onSpeechStart: () => console.log('🎤 Speech started'),
 *   onSpeechEnd: () => console.log('🔇 Speech ended')
 * });
 *
 * audioStream.pipeThrough(speechTransform).pipeTo(speechProcessor);
 *
 * // .tee() pattern for events-only processing
 * const [liveStream, eventsStream] = audioStream.tee();
 * liveStream.pipeTo(speechProcessor);
 * eventsStream.pipeThrough(speechFilter({
 *   noEmit: true, // Don't emit chunks
 *   onSpeechStart: () => showRecordingIndicator(),
 *   onSpeechEnd: () => hideRecordingIndicator()
 * }));
 * ```
 *
 * @param {object} [options] Forwarded unchanged to `VADProcessor`.
 * @param {boolean} [options.noEmit] When truthy, nothing is enqueued downstream;
 *   only the callbacks fire.
 * @param {(message: string) => void} [options.onDebugLog] Receives a message
 *   whenever a transform call produced at least one speech chunk.
 * @returns {TransformStream} A transform stream whose `start` step creates and
 *   initializes the processor (stream start fails if initialization fails).
 */
export function speechFilter(options = {}) {
  let vadProcessor = null;

  // Forward frames downstream unless the caller asked for events only.
  const emit = (controller, frames) => {
    if (options.noEmit) {
      return;
    }
    for (const frame of frames) {
      controller.enqueue(frame);
    }
  };

  return new TransformStream({
    start: async () => {
      vadProcessor = new VADProcessor(options);
      await vadProcessor.initialize();
    },
    transform: async (chunk, controller) => {
      if (!vadProcessor) {
        throw new Error('VAD processor not initialized');
      }
      const speechChunks = await vadProcessor.processChunk(chunk);
      if (speechChunks.length > 0) {
        options.onDebugLog?.(`VAD Transform: Processing ${speechChunks.length} speech chunks`);
      }
      emit(controller, speechChunks);
    },
    flush: async (controller) => {
      if (!vadProcessor) {
        return;
      }
      try {
        emit(controller, vadProcessor.finalize());
      } finally {
        // finalize() invokes user callbacks and enqueue() can throw on an
        // errored stream; the processor must be torn down regardless.
        await vadProcessor.destroy();
      }
    },
  });
}
/**
 * Internal VAD processor - simplified version of the main implementation.
 *
 * Buffers incoming samples, cuts them into fixed 512-sample frames, asks the
 * Silero VAD ONNX model for a speech probability per frame (sample rate 16000
 * is passed to the model), and runs a four-state machine over the result:
 *
 *   silent ──speech──▶ detecting ──minSpeechFrames speech frames──▶ speaking
 *   detecting ──non-speech──▶ silent
 *   speaking ──non-speech / intermediate──▶ intermediate
 *   intermediate ──speech──▶ speaking
 *   intermediate ──redemptionFrames reached──▶ silent (segment ends)
 *
 * Once a segment is confirmed, every subsequent frame up to and including the
 * frame that ends the segment is emitted, so the output of one segment is a
 * contiguous run of audio preceded by up to `lookBackFrames` silent frames.
 *
 * Supported options (all optional):
 *  - `threshold` (default 0.5, clamped to [0.01, 0.99]): probability at or
 *    above which a frame counts as speech.
 *  - `minSpeechDurationMs` (default 160), `redemptionDurationMs` (default 400),
 *    `lookBackDurationMs` (default 384): converted to frame counts by dividing
 *    by the frame duration (32 ms) and rounding.
 *  - `onSpeechStart`, `onSpeechEnd`, `onMisfire`, `onError`, `onDebugLog`.
 */
class VADProcessor {
  // Configuration
  threshold;
  negativeThreshold;
  minSpeechFrames;
  redemptionFrames;
  lookBackFrames;
  // Event handlers
  eventHandlers;
  // ONNX Runtime
  session = null;
  state = null;
  context = new Float32Array(64);
  // Processing state
  audioBuffer = new Float32Array(0);
  speechBuffer = [];
  lookBackBuffer = [];
  // VAD state machine
  vadState = 'silent';
  speechFrameCount = 0;
  redemptionCounter = 0;
  // Frames handled since the current segment was confirmed (audio clock).
  segmentFrameCount = 0;
  frameIndex = 0;
  // Constants
  SAMPLE_RATE = 16000;
  FRAME_SIZE = 512;
  CONTEXT_SIZE = 64;

  /**
   * @param {object} options See the class description for recognised keys.
   */
  constructor(options) {
    // Keep only the handlers that were actually supplied.
    this.eventHandlers = {};
    for (const name of ['onSpeechStart', 'onSpeechEnd', 'onMisfire', 'onError', 'onDebugLog']) {
      if (options[name]) {
        this.eventHandlers[name] = options[name];
      }
    }
    // Convert configuration to internal units (frames).
    const frameDurationMs = (this.FRAME_SIZE * 1000) / this.SAMPLE_RATE; // 32 ms
    this.threshold = Math.max(0.01, Math.min(0.99, options.threshold ?? 0.5));
    // Hysteresis: the lower bound sits 0.15 below the threshold, floored at 0.01.
    this.negativeThreshold = Math.max(0.01, this.threshold - 0.15);
    this.minSpeechFrames = Math.max(1, Math.round((options.minSpeechDurationMs ?? 160) / frameDurationMs));
    this.redemptionFrames = Math.max(1, Math.round((options.redemptionDurationMs ?? 400) / frameDurationMs));
    this.lookBackFrames = Math.max(0, Math.round((options.lookBackDurationMs ?? 384) / frameDurationMs)); // Default: 12 frames
  }

  /**
   * Loads the ONNX model (once) and resets all processing state.
   *
   * @returns {Promise<void>}
   * @throws {Error} If the inference session cannot be created; the original
   *   failure is available on `error.cause`.
   */
  async initialize() {
    if (this.session) {
      return;
    }
    try {
      const modelUrl = new URL('../silero_vad.onnx', import.meta.url).href;
      this.session = await ort.InferenceSession.create(modelUrl);
      this.resetState();
    } catch (error) {
      const reason = error instanceof Error ? error.message : 'Unknown error';
      throw new Error(`Failed to initialize Silero VAD model: ${reason}`, { cause: error });
    }
  }

  /**
   * Appends `chunk` to the internal buffer and processes every complete frame.
   * Samples that do not fill a frame stay buffered for the next call.
   *
   * @param {Float32Array} chunk Audio samples (any length, including empty).
   * @returns {Promise<Float32Array[]>} Frames to emit, in order.
   */
  async processChunk(chunk) {
    const outputChunks = [];
    this.audioBuffer = this.appendBuffer(this.audioBuffer, chunk);
    while (this.audioBuffer.length >= this.FRAME_SIZE) {
      const frame = this.audioBuffer.slice(0, this.FRAME_SIZE);
      this.audioBuffer = this.audioBuffer.slice(this.FRAME_SIZE);
      const wasSilent = this.vadState === 'silent';
      const speechProbability = await this.detectSpeech(frame);
      if (speechProbability > 0) {
        // Only log when there's some chance of speech
        this.eventHandlers.onDebugLog?.(`VAD: Frame ${this.frameIndex}, probability: ${speechProbability.toFixed(3)}, state: ${this.vadState}`);
      }
      const speechChunks = await this.handleSpeechDetection(speechProbability, frame);
      outputChunks.push(...speechChunks);
      // Record look-back audio only AFTER classification: a frame that just
      // opened a candidate segment already sits in speechBuffer, and storing
      // it here as well would emit it twice once speech is confirmed.
      // updateLookBackBuffer() itself rejects the frame if we left 'silent'.
      if (wasSilent) {
        this.updateLookBackBuffer(frame);
      }
      this.frameIndex++;
    }
    return outputChunks;
  }

  /**
   * Closes a segment that is still open at end of stream, firing
   * `onSpeechEnd` or `onMisfire` accordingly.
   *
   * @returns {Float32Array[]} Always an empty array; no audio is held back.
   */
  finalize() {
    if (this.vadState === 'speaking' || this.vadState === 'intermediate') {
      this.endSpeechSegment();
    }
    return [];
  }

  /**
   * Drops all buffers and releases the inference session.
   * Errors raised while releasing are reported through `onError`, not thrown.
   *
   * @returns {Promise<void>}
   */
  async destroy() {
    const session = this.session;
    this.session = null;
    this.state = null;
    this.audioBuffer = new Float32Array(0);
    this.speechBuffer = [];
    this.lookBackBuffer = [];
    this.context.fill(0);
    if (session) {
      try {
        // Dropping the reference alone does not free the runtime's resources.
        // `release` is optional-called for runtimes that predate it.
        await session.release?.();
      } catch (error) {
        this.eventHandlers.onError?.(error instanceof Error ? error : new Error(String(error)));
      }
    }
  }

  /** Resets model state, context, buffers and the state machine. */
  resetState() {
    // Recurrent state tensor expected by the model: zeros of shape [2, 1, 128].
    this.state = new ort.Tensor('float32', new Float32Array(2 * 1 * 128), [2, 1, 128]);
    this.context.fill(0);
    this.vadState = 'silent';
    this.speechFrameCount = 0;
    this.redemptionCounter = 0;
    this.segmentFrameCount = 0;
    this.frameIndex = 0;
    this.audioBuffer = new Float32Array(0);
    this.speechBuffer = [];
    this.lookBackBuffer = [];
  }

  /**
   * Remembers `frame` as pre-speech context, keeping at most `lookBackFrames`.
   * Frames are only recorded while the state machine is 'silent'; the buffer
   * is emptied when its contents are emitted at speech confirmation.
   *
   * @param {Float32Array} frame
   */
  updateLookBackBuffer(frame) {
    if (this.lookBackFrames === 0 || this.vadState !== 'silent') {
      return;
    }
    this.lookBackBuffer.push(new Float32Array(frame));
    if (this.lookBackBuffer.length > this.lookBackFrames) {
      this.lookBackBuffer.shift();
    }
  }

  /**
   * Runs the model on one frame.
   *
   * @param {Float32Array} audioFrame
   * @returns {Promise<number>} Speech probability reported by the model, or 0
   *   when no session is loaded or inference failed (failure goes to `onError`).
   */
  async detectSpeech(audioFrame) {
    if (!this.session || !this.state) {
      return 0;
    }
    // The model input is the previous frame's last CONTEXT_SIZE samples
    // followed by the current frame.
    const contextualFrame = new Float32Array(this.CONTEXT_SIZE + audioFrame.length);
    contextualFrame.set(this.context);
    contextualFrame.set(audioFrame, this.CONTEXT_SIZE);
    try {
      const audioTensor = new ort.Tensor('float32', contextualFrame, [1, contextualFrame.length]);
      const srTensor = new ort.Tensor('int64', new BigInt64Array([BigInt(this.SAMPLE_RATE)]), [1]);
      const results = await this.session.run({
        input: audioTensor,
        state: this.state,
        sr: srTensor,
      });
      // Carry the recurrent state forward to the next frame.
      this.state = results.stateN;
      return results.output.data[0];
    } catch (error) {
      this.eventHandlers.onError?.(error instanceof Error ? error : new Error(String(error)));
      return 0;
    } finally {
      // Update context even on error to maintain continuity.
      this.context.set(contextualFrame.slice(-this.CONTEXT_SIZE));
    }
  }

  /**
   * Advances the state machine by one frame.
   *
   * @param {number} probability Speech probability for `frame`.
   * @param {Float32Array} frame The frame's samples (copied before being stored or emitted).
   * @returns {Promise<Float32Array[]>} Frames to emit as a result of this step.
   */
  async handleSpeechDetection(probability, frame) {
    const outputChunks = [];
    const speechState = this.classifySpeechState(probability);
    switch (this.vadState) {
      case 'silent':
        if (speechState === 'speech') {
          this.vadState = 'detecting';
          this.speechFrameCount = 1;
          this.speechBuffer = [new Float32Array(frame)];
        }
        break;
      case 'detecting':
        if (speechState === 'speech') {
          this.speechFrameCount++;
          this.speechBuffer.push(new Float32Array(frame));
          if (this.speechFrameCount >= this.minSpeechFrames) {
            this.vadState = 'speaking';
            this.segmentFrameCount = 0;
            this.redemptionCounter = 0;
            // Output lookback + speech buffer (natural audio context)
            const speechFrames = [...this.lookBackBuffer, ...this.speechBuffer];
            const lookbackCount = this.lookBackBuffer.length;
            const speechCount = this.speechBuffer.length;
            this.lookBackBuffer = [];
            this.eventHandlers.onDebugLog?.(`VAD: Speech confirmed! Outputting ${speechFrames.length} frames (${lookbackCount} lookback + ${speechCount} speech)`);
            outputChunks.push(...speechFrames);
            this.speechBuffer = [];
            this.eventHandlers.onSpeechStart?.();
          }
        } else if (speechState === 'non-speech') {
          this.vadState = 'silent';
          this.speechFrameCount = 0;
          this.speechBuffer = [];
        }
        // NOTE(review): an 'intermediate' frame here is neither buffered nor
        // counted; it is absent from the audio emitted on confirmation.
        break;
      case 'speaking':
        this.segmentFrameCount++;
        if (speechState === 'speech') {
          this.redemptionCounter = 0;
          this.eventHandlers.onDebugLog?.(`VAD: Continuing speech - outputting frame`);
        } else {
          // First doubtful frame: start the redemption countdown.
          // (redemptionCounter is always 0 while 'speaking'.)
          this.vadState = 'intermediate';
          this.redemptionCounter = 1;
        }
        // Emit the frame in every case; skipping the transition frame would
        // leave a one-frame hole in the middle of the emitted audio.
        outputChunks.push(new Float32Array(frame));
        break;
      case 'intermediate':
        this.segmentFrameCount++;
        // Frames keep flowing during the redemption period.
        outputChunks.push(new Float32Array(frame));
        if (speechState === 'speech') {
          this.vadState = 'speaking';
          this.redemptionCounter = 0;
        } else {
          this.redemptionCounter++;
          if (this.redemptionCounter >= this.redemptionFrames) {
            this.endSpeechSegment();
          }
        }
        break;
    }
    return outputChunks;
  }

  /**
   * Maps a probability onto the three-way classification used by the state machine.
   *
   * @param {number} probability
   * @returns {'speech' | 'intermediate' | 'non-speech'}
   */
  classifySpeechState(probability) {
    if (probability >= this.threshold) {
      return 'speech';
    }
    if (probability >= this.negativeThreshold) {
      return 'intermediate';
    }
    return 'non-speech';
  }

  /**
   * Ends the current segment and notifies listeners.
   *
   * The segment length is measured in processed frames since confirmation
   * rather than wall-clock time, so the outcome does not depend on how fast
   * the audio is fed in (e.g. pre-recorded audio or multi-frame chunks).
   * `onSpeechEnd` fires when at least `minSpeechFrames` frames elapsed,
   * otherwise `onMisfire`. State is reset before the callback runs.
   */
  endSpeechSegment() {
    const elapsedFrames = this.segmentFrameCount;
    this.resetSpeechState();
    if (elapsedFrames >= this.minSpeechFrames) {
      this.eventHandlers.onSpeechEnd?.(new Float32Array(0));
    } else {
      this.eventHandlers.onMisfire?.();
    }
  }

  /** Returns the state machine to 'silent' and clears per-segment counters. */
  resetSpeechState() {
    this.vadState = 'silent';
    this.speechFrameCount = 0;
    this.redemptionCounter = 0;
    this.segmentFrameCount = 0;
    this.speechBuffer = [];
  }

  /**
   * Concatenates two sample buffers.
   *
   * @param {Float32Array} buffer Existing samples (returned as-is when `newData` is empty).
   * @param {Float32Array} newData Samples to append.
   * @returns {Float32Array}
   */
  appendBuffer(buffer, newData) {
    if (newData.length === 0) {
      return buffer;
    }
    if (buffer.length === 0) {
      return new Float32Array(newData);
    }
    const combined = new Float32Array(buffer.length + newData.length);
    combined.set(buffer);
    combined.set(newData, buffer.length);
    return combined;
  }
}
//# sourceMappingURL=index.js.map