@jaehyun-ko/speaker-verification

Real-time speaker verification in the browser using NeXt-TDNN models

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.AudioPreprocessor = void 0; const types_1 = require("../core/types"); const utils_1 = require("./utils"); class AudioPreprocessor { constructor(config = {}) { this.melFilterBank = null; this.config = { ...types_1.DEFAULT_MEL_CONFIG, ...config }; this.fftProcessor = new utils_1.FFT(this.config.nFft); this.initializeMelFilterBank(); } initializeMelFilterBank() { const { nFft, nMels, sampleRate } = this.config; const fftBins = Math.floor(nFft / 2) + 1; // Convert frequencies to mel scale - matching Python's f_min=20, f_max=7600 const fMin = 20; const fMax = 7600; const melMin = this.hzToMel(fMin); const melMax = this.hzToMel(fMax); // Create mel points const melPoints = new Float32Array(nMels + 2); for (let i = 0; i < nMels + 2; i++) { melPoints[i] = melMin + (melMax - melMin) * i / (nMels + 1); } // Convert back to Hz const hzPoints = melPoints.map(mel => this.melToHz(mel)); // Convert to FFT bin numbers const binPoints = hzPoints.map(hz => Math.floor((nFft + 1) * hz / sampleRate)); // Create triangular filters this.melFilterBank = []; for (let i = 0; i < nMels; i++) { const filter = new Float32Array(fftBins); const startBin = binPoints[i]; const centerBin = binPoints[i + 1]; const endBin = binPoints[i + 2]; // Rising edge for (let j = startBin; j < centerBin; j++) { filter[j] = (j - startBin) / (centerBin - startBin); } // Falling edge for (let j = centerBin; j < endBin; j++) { filter[j] = (endBin - j) / (endBin - centerBin); } this.melFilterBank.push(filter); } } hzToMel(hz) { return 2595 * Math.log10(1 + hz / 700); } melToHz(mel) { return 700 * (Math.pow(10, mel / 2595) - 1); } preEmphasis(signal) { if (!signal || signal.length === 0) { throw new Error('Cannot apply pre-emphasis to empty or undefined signal'); } const { preEmphasisCoef = 0.97 } = this.config; const emphasized = new Float32Array(signal.length); emphasized[0] = signal[0]; for (let i = 1; i < signal.length; i++) { emphasized[i] = signal[i] - preEmphasisCoef * signal[i - 1]; } return emphasized; } applyWindow(frame) { const { winLength } = this.config; const windowed = new Float32Array(frame.length); // Hamming window - matching Python's torch.hamming_window for (let i = 0; i < winLength; i++) { const windowValue = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (winLength - 1)); windowed[i] = frame[i] * windowValue; } return windowed; } fft(signal) { const { nFft } = this.config; const real = new Float32Array(nFft); const imag = new Float32Array(nFft); // Zero-pad signal to nFft size for (let i = 0; i < Math.min(signal.length, nFft); i++) { real[i] = signal[i]; } // Perform FFT this.fftProcessor.forward(real, imag); // Get magnitude spectrum (only positive frequencies) const halfNfft = Math.floor(nFft / 2) + 1; const magnitude = new Float32Array(halfNfft); for (let i = 0; i < halfNfft; i++) { magnitude[i] = Math.sqrt(real[i] * real[i] + imag[i] * imag[i]); } return magnitude; } computeMelSpectrogram(audioSegment) { const { data } = audioSegment; const { winLength, hopLength, nMels } = this.config; // Validate input data if (!data || data.length === 0) { throw new Error('Invalid audio data: audio segment is empty or undefined'); } // Pre-emphasis const emphasized = this.preEmphasis(data); // Calculate number of frames const numFrames = Math.floor((emphasized.length - winLength) / hopLength) + 1; // Initialize mel spectrogram const melSpectrogram = new Float32Array(nMels * numFrames); // Process each frame for (let frameIdx = 0; frameIdx < numFrames; 
frameIdx++) { const start = frameIdx * hopLength; const frame = emphasized.slice(start, start + winLength); // Apply window const windowed = this.applyWindow(frame); // Compute FFT const spectrum = this.fft(windowed); // Apply mel filterbank for (let melIdx = 0; melIdx < nMels; melIdx++) { let melEnergy = 0; const filter = this.melFilterBank[melIdx]; // spectrum.length should match filter.length const minLen = Math.min(spectrum.length, filter.length); for (let i = 0; i < minLen; i++) { melEnergy += spectrum[i] * spectrum[i] * filter[i]; } // Ensure melEnergy is not negative or NaN if (isNaN(melEnergy) || melEnergy < 0) { melEnergy = 0; } // Convert to log scale - matching Python's torch.log(x + 1e-6) // Python uses natural log, not log10 melSpectrogram[melIdx * numFrames + frameIdx] = Math.log(melEnergy + 1e-6); } } // Apply mean normalization - matching Python's x - torch.mean(x, dim=-1, keepdim=True) // For each mel bin, subtract its mean across time for (let melIdx = 0; melIdx < nMels; melIdx++) { let sum = 0; for (let frameIdx = 0; frameIdx < numFrames; frameIdx++) { sum += melSpectrogram[melIdx * numFrames + frameIdx]; } const mean = sum / numFrames; for (let frameIdx = 0; frameIdx < numFrames; frameIdx++) { melSpectrogram[melIdx * numFrames + frameIdx] -= mean; } } return melSpectrogram; } // Reshape mel spectrogram for model input reshapeForModel(melSpectrogram, numFrames) { const { nMels } = this.config; // Original shape: [nMels * numFrames] (row-major) // Target shape: [1, nMels, numFrames] for ONNX model // Already in correct order, just return return melSpectrogram; } } exports.AudioPreprocessor = AudioPreprocessor;
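
For orientation, a minimal usage sketch follows. It assumes AudioPreprocessor is re-exported from the package entry point (it lives in this compiled file otherwise) and that DEFAULT_MEL_CONFIG supplies the sampleRate, nFft, winLength, hopLength, and nMels fields the class reads; neither is confirmed by this file alone.

// Usage sketch (illustrative, not shipped in this file).
// Assumption: the class is reachable from the package root; if not,
// require dist/audio/preprocessor.js directly.
const { AudioPreprocessor } = require('@jaehyun-ko/speaker-verification');

const preprocessor = new AudioPreprocessor(); // or pass overrides, e.g. { nMels: 80 }

// One second of 16 kHz mono audio as a stand-in; in the browser this
// would typically come from an AudioBuffer or a microphone capture chain.
const segment = { data: new Float32Array(16000) };

const mel = preprocessor.computeMelSpectrogram(segment);

// The flat output is laid out row-major as [nMels, numFrames], so it can be
// wrapped as a [1, nMels, numFrames] tensor for onnxruntime-web, e.g.:
const { nMels, winLength, hopLength } = preprocessor.config;
const numFrames = Math.floor((segment.data.length - winLength) / hopLength) + 1;
// const input = new ort.Tensor('float32', mel, [1, nMels, numFrames]);

This row-major [nMels, numFrames] layout is also why reshapeForModel above can return the buffer unchanged.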