@jaehyun-ko/speaker-verification
Real-time speaker verification in the browser using NeXt-TDNN models
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.SpeakerVerificationEngine = void 0;
const model_1 = require("./model");
const preprocessor_1 = require("../audio/preprocessor");
const score_normalization_1 = require("./score-normalization");
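// End-to-end speaker verification pipeline: preprocesses raw audio into a
// mel-spectrogram, extracts an embedding with a NeXt-TDNN model, and scores
// embedding pairs via cosine similarity or Euclidean distance, with optional
// cohort-based score normalization.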
class SpeakerVerificationEngine {
constructor(modelConfig, preprocessorConfig, sNormConfig) {
this.scoreNormalizer = null;
this.isInitialized = false;
this.model = new model_1.NeXtTDNNModel(modelConfig);
this.preprocessor = new preprocessor_1.AudioPreprocessor(preprocessorConfig);
this.metric = modelConfig.metric || 'cosine'; // Default to cosine
// Initialize score normalizer if config provided
if (sNormConfig) {
this.scoreNormalizer = new score_normalization_1.ScoreNormalizer(sNormConfig);
}
}
async initialize() {
await this.model.initialize();
this.isInitialized = true;
}
async processAudioSegment(audioSegment) {
if (!this.isInitialized) {
throw new Error('Engine not initialized. Call initialize() first.');
}
// Model expects exactly 300 frames
// With winLength=400 and hopLength=160:
// numFrames = floor((numSamples - winLength) / hopLength) + 1
// 300 = floor((numSamples - 400) / 160) + 1
// numSamples = (300 - 1) * 160 + 400 = 48240
const targetSamples = 48240; // Exact samples for 300 frames
let processedData = audioSegment.data;
// Pad or truncate to get exactly 300 frames
if (processedData.length < targetSamples) {
// Pad with zeros if audio is shorter
const padded = new Float32Array(targetSamples);
padded.set(processedData);
processedData = padded;
}
else if (processedData.length > targetSamples) {
// Truncate if audio is longer
processedData = processedData.slice(0, targetSamples);
}
// Create adjusted audio segment
const adjustedSegment = {
data: processedData,
sampleRate: audioSegment.sampleRate,
duration: processedData.length / audioSegment.sampleRate
};
// Compute mel-spectrogram
const melSpectrogram = this.preprocessor.computeMelSpectrogram(adjustedSegment);
// Calculate the number of frames (exactly 300 for 48240 samples, ~3 s at 16 kHz)
const { winLength, hopLength } = this.preprocessor['config'];
const numFrames = Math.floor((adjustedSegment.data.length - winLength) / hopLength) + 1;
// Run inference
return await this.model.infer(melSpectrogram, numFrames);
}
async cleanup() {
await this.model.cleanup();
this.isInitialized = false;
}
// Check if embedding is normalized (for debugging)
static isNormalized(embedding) {
let norm = 0;
for (let i = 0; i < embedding.length; i++) {
norm += embedding[i] * embedding[i];
}
norm = Math.sqrt(norm);
// Check if the norm is approximately 1 (within a 1e-3 tolerance)
return Math.abs(norm - 1.0) < 0.001;
}
// Compute cosine similarity between two embeddings
static computeSimilarity(embedding1, embedding2) {
if (embedding1.length !== embedding2.length) {
throw new Error('Embeddings must have the same length');
}
// Embeddings are already normalized by the model
// Just compute dot product for cosine similarity
let dotProduct = 0;
for (let i = 0; i < embedding1.length; i++) {
dotProduct += embedding1[i] * embedding2[i];
}
// Clamp to [-1, 1] to handle numerical precision issues
return Math.max(-1, Math.min(1, dotProduct));
}
// Compute Euclidean distance between two embeddings
static computeEuclideanDistance(embedding1, embedding2) {
if (embedding1.length !== embedding2.length) {
throw new Error('Embeddings must have the same length');
}
let sumSquaredDiff = 0;
for (let i = 0; i < embedding1.length; i++) {
const diff = embedding1[i] - embedding2[i];
sumSquaredDiff += diff * diff;
}
const distance = Math.sqrt(sumSquaredDiff);
return distance;
}
// Verify whether two embedding results belong to the same speaker
static verifySpeaker(embedding1, embedding2, threshold = 0.5) {
const similarity = this.computeSimilarity(embedding1.embedding, embedding2.embedding);
return similarity >= threshold;
}
// Load cohort embeddings for score normalization
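// (The cohort presumably serves as an impostor set for s-norm-style score
// standardization; the details live in ./score-normalization.)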
async loadCohortEmbeddings(url) {
if (!this.scoreNormalizer) {
this.scoreNormalizer = new score_normalization_1.ScoreNormalizer();
}
await this.scoreNormalizer.loadCohortEmbeddings(url);
}
// Compute similarity with optional score normalization
computeSimilarityWithNorm(embedding1, embedding2) {
let rawScore;
if (this.metric === 'euclidean') {
// For Euclidean distance, compute the distance and convert it to a similarity
const distance = SpeakerVerificationEngine.computeEuclideanDistance(embedding1.embedding, embedding2.embedding);
// Convert distance to a similarity score in [0, 1]. For unit-norm
// embeddings, d = sqrt(2 - 2*cos), so the distance is at most 2.0 and
// dividing by 2.0 maps it into [0, 1].
rawScore = Math.max(0, 1 - distance / 2.0);
}
else {
// Default to cosine similarity
rawScore = SpeakerVerificationEngine.computeSimilarity(embedding1.embedding, embedding2.embedding);
}
let normalizedScore;
if (this.scoreNormalizer && this.scoreNormalizer.getCohortStats().loaded) {
normalizedScore = this.scoreNormalizer.normalize(embedding1.embedding, embedding2.embedding, rawScore);
}
return { raw: rawScore, normalized: normalizedScore, metric: this.metric };
}
// Verify speakers with score normalization
verifySpeakerWithNorm(embedding1, embedding2, threshold = 0.5, useNormalized = true) {
const scores = this.computeSimilarityWithNorm(embedding1, embedding2);
const scoreToUse = useNormalized && scores.normalized !== undefined
? scores.normalized
: scores.raw;
return {
isMatch: scoreToUse >= threshold,
scores
};
}
}
exports.SpeakerVerificationEngine = SpeakerVerificationEngine;
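// Example usage (a minimal sketch: the config fields shown for the model and
// preprocessor are assumptions, and processAudioSegment is assumed to resolve
// to a result exposing an `embedding` Float32Array, as verifySpeaker expects):
//
//   const { SpeakerVerificationEngine } = require('@jaehyun-ko/speaker-verification');
//
//   const engine = new SpeakerVerificationEngine(
//       { modelUrl: 'next-tdnn.onnx', metric: 'cosine' },      // hypothetical model config
//       { sampleRate: 16000, winLength: 400, hopLength: 160 }  // hypothetical preprocessor config
//   );
//   await engine.initialize();
//
//   // 16 kHz mono PCM as Float32Array; segments are padded/truncated to 48240 samples internally
//   const enroll = await engine.processAudioSegment({ data: enrollPcm, sampleRate: 16000, duration: enrollPcm.length / 16000 });
//   const test = await engine.processAudioSegment({ data: testPcm, sampleRate: 16000, duration: testPcm.length / 16000 });
//
//   const isMatch = SpeakerVerificationEngine.verifySpeaker(enroll, test, 0.5);
//   await engine.cleanup();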