@jaehyun-ko/speaker-verification
Real-time speaker verification in the browser using NeXt-TDNN models
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.AudioPreprocessor = void 0;
const types_1 = require("../core/types");
const utils_1 = require("./utils");
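// Converts raw audio samples into the log-mel spectrogram features expected by
// the speaker-verification model: pre-emphasis, framing, Hamming windowing,
// FFT, mel filterbank, log compression, and per-bin mean normalization.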
class AudioPreprocessor {
    constructor(config = {}) {
        this.melFilterBank = null;
        this.config = { ...types_1.DEFAULT_MEL_CONFIG, ...config };
        this.fftProcessor = new utils_1.FFT(this.config.nFft);
        this.initializeMelFilterBank();
    }
    initializeMelFilterBank() {
        const { nFft, nMels, sampleRate } = this.config;
        const fftBins = Math.floor(nFft / 2) + 1;
        // Convert frequencies to mel scale - matching Python's f_min=20, f_max=7600
        const fMin = 20;
        const fMax = 7600;
        const melMin = this.hzToMel(fMin);
        const melMax = this.hzToMel(fMax);
        // Create nMels + 2 points spaced evenly on the mel scale
        const melPoints = new Float32Array(nMels + 2);
        for (let i = 0; i < nMels + 2; i++) {
            melPoints[i] = melMin + (melMax - melMin) * i / (nMels + 1);
        }
        // Convert back to Hz
        const hzPoints = melPoints.map(mel => this.melToHz(mel));
        // Convert to FFT bin numbers
        const binPoints = hzPoints.map(hz => Math.floor((nFft + 1) * hz / sampleRate));
        // Create triangular filters: each one rises from binPoints[i] to
        // binPoints[i + 1], then falls back to zero at binPoints[i + 2]
        this.melFilterBank = [];
        for (let i = 0; i < nMels; i++) {
            const filter = new Float32Array(fftBins);
            const startBin = binPoints[i];
            const centerBin = binPoints[i + 1];
            const endBin = binPoints[i + 2];
            // Rising edge
            for (let j = startBin; j < centerBin; j++) {
                filter[j] = (j - startBin) / (centerBin - startBin);
            }
            // Falling edge
            for (let j = centerBin; j < endBin; j++) {
                filter[j] = (endBin - j) / (endBin - centerBin);
            }
            this.melFilterBank.push(filter);
        }
    }
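    // HTK-style mel scale: mel = 2595 * log10(1 + hz / 700); melToHz below is
    // its exact inverse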
    hzToMel(hz) {
        return 2595 * Math.log10(1 + hz / 700);
    }
    melToHz(mel) {
        return 700 * (Math.pow(10, mel / 2595) - 1);
    }
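    // First-order pre-emphasis high-pass filter, y[n] = x[n] - coef * x[n-1],
    // which flattens the spectral tilt of speech before analysis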
    preEmphasis(signal) {
        if (!signal || signal.length === 0) {
            throw new Error('Cannot apply pre-emphasis to empty or undefined signal');
        }
        const { preEmphasisCoef = 0.97 } = this.config;
        const emphasized = new Float32Array(signal.length);
        emphasized[0] = signal[0];
        for (let i = 1; i < signal.length; i++) {
            emphasized[i] = signal[i] - preEmphasisCoef * signal[i - 1];
        }
        return emphasized;
    }
    applyWindow(frame) {
        const { winLength } = this.config;
        const windowed = new Float32Array(frame.length);
        // Symmetric Hamming window, 0.54 - 0.46 * cos(2*pi*i / (winLength - 1)),
        // tapering frame edges to reduce spectral leakage; this is the formula
        // torch.hamming_window produces with periodic=False
        for (let i = 0; i < winLength; i++) {
            const windowValue = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (winLength - 1));
            windowed[i] = frame[i] * windowValue;
        }
        return windowed;
    }
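    // Zero-pads a windowed frame to nFft samples and returns the one-sided
    // magnitude spectrum (nFft / 2 + 1 bins)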
    fft(signal) {
        const { nFft } = this.config;
        const real = new Float32Array(nFft);
        const imag = new Float32Array(nFft);
        // Zero-pad signal to nFft size
        for (let i = 0; i < Math.min(signal.length, nFft); i++) {
            real[i] = signal[i];
        }
        // Perform FFT
        this.fftProcessor.forward(real, imag);
        // Get magnitude spectrum (only positive frequencies)
        const halfNfft = Math.floor(nFft / 2) + 1;
        const magnitude = new Float32Array(halfNfft);
        for (let i = 0; i < halfNfft; i++) {
            magnitude[i] = Math.sqrt(real[i] * real[i] + imag[i] * imag[i]);
        }
        return magnitude;
    }
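    // Full feature pipeline: pre-emphasis -> framing -> Hamming window -> FFT
    // power spectrum -> mel filterbank -> natural log -> per-mel-bin mean
    // normalization. Returns a Float32Array laid out row-major as [nMels, numFrames].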
    computeMelSpectrogram(audioSegment) {
        const { data } = audioSegment;
        const { winLength, hopLength, nMels } = this.config;
        // Validate input data
        if (!data || data.length === 0) {
            throw new Error('Invalid audio data: audio segment is empty or undefined');
        }
        if (data.length < winLength) {
            // Guard against segments shorter than one analysis window, which would
            // otherwise yield a zero- or negative-length spectrogram below
            throw new Error(`Invalid audio data: segment has ${data.length} samples but winLength is ${winLength}`);
        }
        // Pre-emphasis
        const emphasized = this.preEmphasis(data);
        // Calculate number of frames
        const numFrames = Math.floor((emphasized.length - winLength) / hopLength) + 1;
        // Initialize mel spectrogram
        const melSpectrogram = new Float32Array(nMels * numFrames);
        // Process each frame
        for (let frameIdx = 0; frameIdx < numFrames; frameIdx++) {
            const start = frameIdx * hopLength;
            const frame = emphasized.slice(start, start + winLength);
            // Apply window
            const windowed = this.applyWindow(frame);
            // Compute FFT
            const spectrum = this.fft(windowed);
            // Apply mel filterbank to the power spectrum (magnitude squared)
            for (let melIdx = 0; melIdx < nMels; melIdx++) {
                let melEnergy = 0;
                const filter = this.melFilterBank[melIdx];
                // spectrum.length should match filter.length
                const minLen = Math.min(spectrum.length, filter.length);
                for (let i = 0; i < minLen; i++) {
                    melEnergy += spectrum[i] * spectrum[i] * filter[i];
                }
                // Ensure melEnergy is not negative or NaN
                if (isNaN(melEnergy) || melEnergy < 0) {
                    melEnergy = 0;
                }
                // Convert to log scale - matching Python's torch.log(x + 1e-6),
                // which is the natural log, not log10
                melSpectrogram[melIdx * numFrames + frameIdx] =
                    Math.log(melEnergy + 1e-6);
            }
        }
        // Apply mean normalization - matching Python's x - torch.mean(x, dim=-1, keepdim=True)
        // For each mel bin, subtract its mean across time
        for (let melIdx = 0; melIdx < nMels; melIdx++) {
            let sum = 0;
            for (let frameIdx = 0; frameIdx < numFrames; frameIdx++) {
                sum += melSpectrogram[melIdx * numFrames + frameIdx];
            }
            const mean = sum / numFrames;
            for (let frameIdx = 0; frameIdx < numFrames; frameIdx++) {
                melSpectrogram[melIdx * numFrames + frameIdx] -= mean;
            }
        }
        return melSpectrogram;
    }
    // Reshape mel spectrogram for model input
    reshapeForModel(melSpectrogram, numFrames) {
        // Original shape: [nMels * numFrames] (row-major)
        // Target shape: [1, nMels, numFrames] for the ONNX model
        // The data is already in the correct memory order, so return it as-is
        return melSpectrogram;
    }
}
exports.AudioPreprocessor = AudioPreprocessor;
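
Example usage, as a minimal sketch rather than a definitive recipe: it assumes the package entry point re-exports AudioPreprocessor, that DEFAULT_MEL_CONFIG (defined in ../core/types, not shown here) supplies the sampleRate, nFft, winLength, hopLength, and nMels defaults the class reads, and that the model operates at 16 kHz.

const { AudioPreprocessor } = require('@jaehyun-ko/speaker-verification'); // assumed re-export
const preprocessor = new AudioPreprocessor();
// One second of silence at the assumed 16 kHz sample rate; computeMelSpectrogram
// expects an object with a Float32Array `data` field, per the code above
const segment = { data: new Float32Array(16000) };
const mel = preprocessor.computeMelSpectrogram(segment);
console.log(mel.length); // nMels * numFrames, laid out row-major as [nMels, numFrames]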