murmuraba

Real-time audio noise reduction with advanced chunked processing for web applications

/**
 * Main VAD Implementation
 * Combines multiple VAD algorithms for robust voice activity detection
 */
import { DEFAULT_VAD_CONFIG } from './types';
import { EnergyVAD } from './algorithms/energy-vad';
import { ZCRVAD } from './algorithms/zcr-vad';
import { SegmentDetector } from './algorithms/segment-detector';
import { RNNoiseEngine } from '../engines/rnnoise-engine';

export async function murmubaraVAD(buffer, config = {}) {
  const vadConfig = { ...DEFAULT_VAD_CONFIG, ...config };

  // Convert buffer to Float32Array
  const audioData = await convertToFloat32(buffer);

  // Resample to 24kHz if needed (RNNoise requirement)
  const targetSampleRate = 24000;
  const resampled = await resampleAudio(audioData, targetSampleRate);

  // Initialize detectors
  const energyVAD = new EnergyVAD();
  const zcrVAD = new ZCRVAD();
  const segmentDetector = new SegmentDetector(vadConfig.minSegmentDuration, vadConfig.hangoverTime);

  // Try to use RNNoise if available
  let rnnoiseEngine = null;
  if (vadConfig.useRNNoise) {
    try {
      rnnoiseEngine = new RNNoiseEngine();
      await rnnoiseEngine.initialize();
    } catch (error) {
      console.warn('RNNoise not available, falling back to energy/ZCR VAD');
      rnnoiseEngine = null;
    }
  }

  // Process audio in frames
  const frameSize = vadConfig.frameSize || 480; // 20ms at 24kHz
  const frameTime = frameSize / targetSampleRate;
  const numFrames = Math.floor(resampled.length / frameSize);
  const scores = [];
  const metrics = [];

  for (let i = 0; i < numFrames; i++) {
    const start = i * frameSize;
    const end = start + frameSize;
    let frame = resampled.slice(start, end);

    if (frame.length < frameSize) {
      // Pad last frame if needed
      const paddedFrame = new Float32Array(frameSize);
      paddedFrame.set(frame);
      frame = paddedFrame;
    }

    // Get VAD scores from different algorithms
    const energyScore = energyVAD.detect(frame);
    const zcrScore = zcrVAD.detect(frame);

    let vadScore;
    if (rnnoiseEngine && rnnoiseEngine.isInitialized) {
      // Use RNNoise VAD if available
      rnnoiseEngine.process(frame);
      vadScore = rnnoiseEngine.lastVad || 0;
      // Combine with energy for robustness
      vadScore = vadScore * 0.7 + energyScore * 0.3;
    } else {
      // Combine energy and ZCR scores
      vadScore = energyScore * 0.7 + zcrScore * 0.3;
    }

    scores.push(vadScore);

    // Calculate metrics
    const energyMetrics = energyVAD.getMetrics(frame);
    const zcr = zcrVAD.calculateZCR(frame);
    metrics.push({
      timestamp: i * frameTime,
      vadScore,
      energy: energyMetrics.energy,
      zeroCrossingRate: zcr
    });
  }

  // Smooth scores
  const smoothedScores = segmentDetector.smoothScores(scores, 5);

  // Detect voice segments
  const voiceSegments = segmentDetector.detectSegments(smoothedScores, frameTime, 0.5);

  // Calculate average VAD score
  const average = smoothedScores.reduce((a, b) => a + b, 0) / smoothedScores.length;

  // Cleanup
  if (rnnoiseEngine) {
    rnnoiseEngine.cleanup();
  }

  return {
    average,
    scores: smoothedScores,
    metrics,
    voiceSegments
  };
}

/**
 * Convert audio buffer to Float32Array
 */
async function convertToFloat32(buffer) {
  // Check if it's already Float32
  if (buffer.byteLength % 4 === 0) {
    const testView = new DataView(buffer);
    const firstSample = testView.getFloat32(0, true);
    if (Math.abs(firstSample) <= 1.0) {
      return new Float32Array(buffer);
    }
  }
  // Fallback to simple conversion
  // AudioConverter is for WebM->WAV conversion, not for getting Float32
  return simpleConvertToFloat32(buffer);
}

/**
 * Simple conversion to Float32 (assumes 16-bit PCM)
 */
function simpleConvertToFloat32(buffer) {
  const int16 = new Int16Array(buffer);
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }
  return float32;
}

/**
 * Resample audio to target sample rate
 */
async function resampleAudio(audioData, targetSampleRate, originalSampleRate = 44100) {
  if (originalSampleRate === targetSampleRate) {
    return audioData;
  }
  // Simple linear interpolation resampling
  const ratio = originalSampleRate / targetSampleRate;
  const outputLength = Math.floor(audioData.length / ratio);
  const output = new Float32Array(outputLength);
  for (let i = 0; i < outputLength; i++) {
    const sourceIndex = i * ratio;
    const sourceIndexInt = Math.floor(sourceIndex);
    const fraction = sourceIndex - sourceIndexInt;
    if (sourceIndexInt + 1 < audioData.length) {
      // Linear interpolation
      output[i] = audioData[sourceIndexInt] * (1 - fraction) + audioData[sourceIndexInt + 1] * fraction;
    } else {
      output[i] = audioData[sourceIndexInt];
    }
  }
  return output;
}
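
A minimal usage sketch of the function above, not part of the package source: it assumes murmubaraVAD is re-exported from the package root as 'murmuraba' (the actual import path may differ), and that the input is an ArrayBuffer of Float32 or 16-bit PCM samples, as handled by convertToFloat32 above. The config keys shown are the ones read by the implementation (useRNNoise, frameSize, minSegmentDuration, hangoverTime); their defaults live in DEFAULT_VAD_CONFIG.

// Illustrative sketch only; import path and config defaults are assumptions.
import { murmubaraVAD } from 'murmuraba';

async function analyzeRecording(arrayBuffer) {
  const result = await murmubaraVAD(arrayBuffer, {
    useRNNoise: false, // force the energy/ZCR fallback path
    frameSize: 480     // 20ms frames at the internal 24kHz rate
  });

  // Fields returned by murmubaraVAD, as seen in its return statement
  console.log('Average VAD score:', result.average);
  console.log('Per-frame scores:', result.scores.length);
  console.log('First frame metrics:', result.metrics[0]);
  console.log('Detected voice segments:', result.voiceSegments);
  return result;
}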