UNPKG

@jaehyun-ko/speaker-verification

Version:

Real-time speaker verification in the browser using NeXt-TDNN models

271 lines (270 loc) 9.65 kB
"use strict"; /** * Easy-to-use API for NeXt-TDNN Speaker Verification * * Simple interface for speaker verification that handles all preprocessing, * model loading, and comparison automatically. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.SpeakerVerification = void 0; const inference_1 = require("./core/inference"); class SpeakerVerification { constructor() { this.engine = null; this.audioContext = null; this.currentModelId = ''; this.modelCache = new Map(); if (typeof window !== 'undefined') { this.audioContext = new (window.AudioContext || window.webkitAudioContext)(); } } /** * Initialize with a specific model * @param modelKey Model key from MODELS (e.g., 'standard-256', 'mobile-128') * @param options Additional options */ async initialize(modelKey = 'standard-256', options) { const modelInfo = SpeakerVerification.MODELS[modelKey]; if (!modelInfo) { throw new Error(`Unknown model: ${modelKey}. Available models: ${Object.keys(SpeakerVerification.MODELS).join(', ')}`); } // Check cache first if (options?.cacheModel !== false && this.modelCache.has(modelInfo.id)) { const cachedData = this.modelCache.get(modelInfo.id); await this.loadEngine(cachedData, modelInfo.id); return; } // Load model data let modelData; if (options?.modelData) { modelData = options.modelData; } else { // Load from CDN const modelUrl = `https://huggingface.co/jaehyun-ko/next-tdnn-onnx/resolve/main/${modelInfo.id}.onnx`; const response = await fetch(modelUrl); if (!response.ok) { throw new Error(`Failed to load model from ${modelUrl}: ${response.statusText}`); } modelData = await response.arrayBuffer(); } // Cache model if requested if (options?.cacheModel !== false) { this.modelCache.set(modelInfo.id, modelData); } await this.loadEngine(modelData, modelInfo.id); } async loadEngine(modelData, modelId) { // Clean up previous engine if (this.engine) { await this.engine.cleanup(); } // Create new engine this.engine = new inference_1.SpeakerVerificationEngine({ modelData: modelData, metric: 'cosine' // All NeXt-TDNN models use cosine similarity }); await this.engine.initialize(); this.currentModelId = modelId; } /** * Compare two audio inputs and return similarity * @param audio1 First audio (File, Blob, ArrayBuffer, or Float32Array) * @param audio2 Second audio (File, Blob, ArrayBuffer, or Float32Array) * @returns Similarity score (0.0-1.0, higher = more similar) */ async compareAudio(audio1, audio2) { if (!this.engine) { throw new Error('Engine not initialized. Call initialize() first.'); } if (!this.audioContext) { throw new Error('AudioContext not available. This API requires a browser environment.'); } const startTime = performance.now(); // Convert audio inputs to Float32Array const audioData1 = await this.processAudioInput(audio1); const audioData2 = await this.processAudioInput(audio2); // Process audio segments const embedding1 = await this.engine.processAudioSegment({ data: audioData1, sampleRate: 16000, duration: audioData1.length / 16000 }); const embedding2 = await this.engine.processAudioSegment({ data: audioData2, sampleRate: 16000, duration: audioData2.length / 16000 }); // Compute similarity const result = this.engine.computeSimilarityWithNorm(embedding1, embedding2); const similarity = result.raw; const processingTime = performance.now() - startTime; return { similarity, processingTime }; } /** * Extract speaker embedding from audio input * @param audio Audio input (File, Blob, ArrayBuffer, or Float32Array) * @returns Speaker embedding as Float32Array */ async getEmbedding(audio) { if (!this.engine) { throw new Error('Engine not initialized. Call initialize() first.'); } if (!this.audioContext) { throw new Error('AudioContext not available. This API requires a browser environment.'); } const startTime = performance.now(); // Convert audio input to Float32Array const audioData = await this.processAudioInput(audio); // Process audio segment const speakerEmbedding = await this.engine.processAudioSegment({ data: audioData, sampleRate: 16000, duration: audioData.length / 16000 }); const processingTime = performance.now() - startTime; return { embedding: speakerEmbedding.embedding, processingTime }; } /** * Compare two pre-computed embeddings * @param embedding1 First speaker embedding * @param embedding2 Second speaker embedding * @returns Similarity score (0.0-1.0, higher = more similar) */ compareEmbeddings(embedding1, embedding2) { if (!this.engine) { throw new Error('Engine not initialized. Call initialize() first.'); } // Use the static method from SpeakerVerificationEngine return inference_1.SpeakerVerificationEngine.computeSimilarity(embedding1, embedding2); } /** * Process various audio input formats into Float32Array */ async processAudioInput(audio) { if (!this.audioContext) { throw new Error('AudioContext not available'); } // If already Float32Array, check if resampling is needed if (audio instanceof Float32Array) { // Assume it's already at 16kHz return audio; } // Convert to ArrayBuffer if needed let arrayBuffer; if (audio instanceof ArrayBuffer) { // Clone the ArrayBuffer to prevent detached buffer issues arrayBuffer = audio.slice(0); } else if (audio instanceof Blob || (typeof File !== 'undefined' && audio instanceof File)) { arrayBuffer = await audio.arrayBuffer(); } else { throw new Error('Unsupported audio input type'); } // Decode audio (this will detach the arrayBuffer) const audioBuffer = await this.audioContext.decodeAudioData(arrayBuffer); let audioData = audioBuffer.getChannelData(0); // Get mono channel // Resample to 16kHz if needed if (audioBuffer.sampleRate !== 16000) { audioData = this.resampleAudio(audioData, audioBuffer.sampleRate, 16000); } return audioData; } /** * Simple resampling function */ resampleAudio(audioData, fromSampleRate, toSampleRate) { const ratio = fromSampleRate / toSampleRate; const newLength = Math.floor(audioData.length / ratio); const resampled = new Float32Array(newLength); for (let i = 0; i < newLength; i++) { const srcIndex = Math.floor(i * ratio); resampled[i] = audioData[srcIndex]; } return resampled; } /** * Get information about the current model */ getCurrentModel() { if (!this.currentModelId) return null; for (const [key, info] of Object.entries(SpeakerVerification.MODELS)) { if (info.id === this.currentModelId) { return info; } } return null; } /** * Clean up resources */ async cleanup() { if (this.engine) { await this.engine.cleanup(); this.engine = null; } if (this.audioContext) { await this.audioContext.close(); this.audioContext = null; } this.modelCache.clear(); } } exports.SpeakerVerification = SpeakerVerification; // Available models SpeakerVerification.MODELS = { // Standard models (recommended) 'standard-256': { id: 'NeXt_TDNN_C256_B3_K65_7', name: 'Standard (256 channels)', size: 28 * 1024 * 1024, channels: 256, type: 'standard' }, 'standard-128': { id: 'NeXt_TDNN_C128_B3_K65_7', name: 'Compact (128 channels)', size: 7.5 * 1024 * 1024, channels: 128, type: 'standard' }, 'standard-192': { id: 'NeXt_TDNN_C192_B1_K65_7', name: 'Medium (192 channels)', size: 16 * 1024 * 1024, channels: 192, type: 'standard' }, 'standard-384': { id: 'NeXt_TDNN_C384_B1_K65_7', name: 'Large (384 channels)', size: 32 * 1024 * 1024, channels: 384, type: 'standard' }, // Mobile models (lightweight) 'mobile-128': { id: 'NeXt_TDNN_light_C128_B3_K65', name: 'Mobile Compact (128 channels)', size: 5 * 1024 * 1024, channels: 128, type: 'mobile' }, 'mobile-256': { id: 'NeXt_TDNN_light_C256_B3_K65', name: 'Mobile Standard (256 channels)', size: 20 * 1024 * 1024, channels: 256, type: 'mobile' } }; // Export for easy use exports.default = SpeakerVerification;