@jaehyun-ko/speaker-verification
Real-time speaker verification in the browser using NeXt-TDNN models
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SpeakerVerificationEngine = void 0;
const model_1 = require("./model");
const preprocessor_1 = require("../audio/preprocessor");
const score_normalization_1 = require("./score-normalization");
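// End-to-end speaker verification pipeline: preprocesses raw audio into a
// mel-spectrogram, extracts an embedding with a NeXt-TDNN model, and scores
// embedding pairs via cosine similarity or Euclidean distance, with optional
// cohort-based score normalization.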
class SpeakerVerificationEngine {
constructor(modelConfig, preprocessorConfig, sNormConfig) {
this.scoreNormalizer = null;
this.isInitialized = false;
this.model = new model_1.NeXtTDNNModel(modelConfig);
this.preprocessor = new preprocessor_1.AudioPreprocessor(preprocessorConfig);
this.metric = modelConfig.metric || 'cosine'; // Default to cosine
// Initialize score normalizer if config provided
if (sNormConfig) {
this.scoreNormalizer = new score_normalization_1.ScoreNormalizer(sNormConfig);
}
}
async initialize() {
await this.model.initialize();
this.isInitialized = true;
}
async processAudioSegment(audioSegment) {
if (!this.isInitialized) {
throw new Error('Engine not initialized. Call initialize() first.');
}
// Model expects exactly 300 frames
// With winLength=400 and hopLength=160:
// numFrames = floor((numSamples - winLength) / hopLength) + 1
// 300 = floor((numSamples - 400) / 160) + 1
// numSamples = (300 - 1) * 160 + 400 = 48240
const targetSamples = 48240; // Exact samples for 300 frames
let processedData = audioSegment.data;
// Pad or truncate to get exactly 300 frames
if (processedData.length < targetSamples) {
// Pad with zeros if audio is shorter
const padded = new Float32Array(targetSamples);
padded.set(processedData);
processedData = padded;
}
else if (processedData.length > targetSamples) {
// Truncate if audio is longer
processedData = processedData.slice(0, targetSamples);
}
// Create adjusted audio segment
const adjustedSegment = {
data: processedData,
sampleRate: audioSegment.sampleRate,
duration: processedData.length / audioSegment.sampleRate
};
// Compute mel-spectrogram
const melSpectrogram = this.preprocessor.computeMelSpectrogram(adjustedSegment);
// Calculate the number of frames (exactly 300 for 48240 samples, ~3 s at 16 kHz)
const { winLength, hopLength } = this.preprocessor['config'];
const numFrames = Math.floor((adjustedSegment.data.length - winLength) / hopLength) + 1;
// Run inference
return await this.model.infer(melSpectrogram, numFrames);
}
async cleanup() {
await this.model.cleanup();
this.isInitialized = false;
}
// Check if embedding is normalized (for debugging)
static isNormalized(embedding) {
let norm = 0;
for (let i = 0; i < embedding.length; i++) {
norm += embedding[i] * embedding[i];
}
norm = Math.sqrt(norm);
// Check if the norm is approximately 1 (within a 1e-3 tolerance)
return Math.abs(norm - 1.0) < 0.001;
}
// Compute cosine similarity between two embeddings
static computeSimilarity(embedding1, embedding2) {
if (embedding1.length !== embedding2.length) {
throw new Error('Embeddings must have the same length');
}
// Embeddings are already normalized by the model
// Just compute dot product for cosine similarity
let dotProduct = 0;
for (let i = 0; i < embedding1.length; i++) {
dotProduct += embedding1[i] * embedding2[i];
}
// Clamp to [-1, 1] to handle numerical precision issues
return Math.max(-1, Math.min(1, dotProduct));
}
// Compute Euclidean distance between two embeddings
static computeEuclideanDistance(embedding1, embedding2) {
if (embedding1.length !== embedding2.length) {
throw new Error('Embeddings must have the same length');
}
let sumSquaredDiff = 0;
for (let i = 0; i < embedding1.length; i++) {
const diff = embedding1[i] - embedding2[i];
sumSquaredDiff += diff * diff;
}
const distance = Math.sqrt(sumSquaredDiff);
return distance;
}
// Verify whether two embedding results belong to the same speaker
static verifySpeaker(embedding1, embedding2, threshold = 0.5) {
const similarity = this.computeSimilarity(embedding1.embedding, embedding2.embedding);
return similarity >= threshold;
}
// Load cohort embeddings for score normalization
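// (The cohort presumably serves as an impostor set for s-norm-style score
// standardization; the details live in ./score-normalization.)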
async loadCohortEmbeddings(url) {
if (!this.scoreNormalizer) {
this.scoreNormalizer = new score_normalization_1.ScoreNormalizer();
}
await this.scoreNormalizer.loadCohortEmbeddings(url);
}
// Compute similarity with optional score normalization
computeSimilarityWithNorm(embedding1, embedding2) {
let rawScore;
if (this.metric === 'euclidean') {
// For Euclidean distance, compute the distance and convert it to a similarity
const distance = SpeakerVerificationEngine.computeEuclideanDistance(embedding1.embedding, embedding2.embedding);
// Convert distance to a similarity score in [0, 1]. For unit-norm
// embeddings, d = sqrt(2 - 2*cos), so the distance is at most 2.0 and
// dividing by 2.0 maps it into [0, 1].
rawScore = Math.max(0, 1 - distance / 2.0);
}
else {
// Default to cosine similarity
rawScore = SpeakerVerificationEngine.computeSimilarity(embedding1.embedding, embedding2.embedding);
}
let normalizedScore;
if (this.scoreNormalizer && this.scoreNormalizer.getCohortStats().loaded) {
normalizedScore = this.scoreNormalizer.normalize(embedding1.embedding, embedding2.embedding, rawScore);
}
return { raw: rawScore, normalized: normalizedScore, metric: this.metric };
}
// Verify speakers with score normalization
verifySpeakerWithNorm(embedding1, embedding2, threshold = 0.5, useNormalized = true) {
const scores = this.computeSimilarityWithNorm(embedding1, embedding2);
const scoreToUse = useNormalized && scores.normalized !== undefined
? scores.normalized
: scores.raw;
return {
isMatch: scoreToUse >= threshold,
scores
};
}
}
exports.SpeakerVerificationEngine = SpeakerVerificationEngine;
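// Example usage (a minimal sketch: the config fields shown for the model and
// preprocessor are assumptions, and processAudioSegment is assumed to resolve
// to a result exposing an `embedding` Float32Array, as verifySpeaker expects):
//
//   const { SpeakerVerificationEngine } = require('@jaehyun-ko/speaker-verification');
//
//   const engine = new SpeakerVerificationEngine(
//       { modelUrl: 'next-tdnn.onnx', metric: 'cosine' },      // hypothetical model config
//       { sampleRate: 16000, winLength: 400, hopLength: 160 }  // hypothetical preprocessor config
//   );
//   await engine.initialize();
//
//   // 16 kHz mono PCM as Float32Array; segments are padded/truncated to 48240 samples internally
//   const enroll = await engine.processAudioSegment({ data: enrollPcm, sampleRate: 16000, duration: enrollPcm.length / 16000 });
//   const test = await engine.processAudioSegment({ data: testPcm, sampleRate: 16000, duration: testPcm.length / 16000 });
//
//   const isMatch = SpeakerVerificationEngine.verifySpeaker(enroll, test, 0.5);
//   await engine.cleanup();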