@jaehyun-ko/speaker-verification
Version:
Real-time speaker verification in the browser using NeXt-TDNN models
271 lines (270 loc) • 9.65 kB
JavaScript
"use strict";
/**
 * Easy-to-use API for NeXt-TDNN Speaker Verification
 *
 * Simple interface for speaker verification that handles all preprocessing,
 * model loading, and comparison automatically.
 */
// NOTE(review): this is CommonJS output (presumably compiled from TypeScript).
// The boilerplate below marks the module for ES-module interop and
// pre-declares the named export before the class definition assigns it.
Object.defineProperty(exports, "__esModule", { value: true });
exports.SpeakerVerification = void 0;
// Low-level engine that runs the ONNX model and computes similarities.
const inference_1 = require("./core/inference");
class SpeakerVerification {
    /**
     * Creates an uninitialized verifier. Call initialize() before use.
     *
     * An AudioContext is created eagerly when running in a browser so audio
     * decoding is available; outside a browser it stays null and the
     * audio-decoding entry points throw.
     */
    constructor() {
        this.engine = null;
        this.audioContext = null;
        this.currentModelId = '';
        // In-memory cache of downloaded model weights, keyed by model id.
        this.modelCache = new Map();
        if (typeof window !== 'undefined') {
            // window.webkitAudioContext covers older Safari builds.
            this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        }
    }
    /**
     * Initialize with a specific model
     * @param modelKey Model key from MODELS (e.g., 'standard-256', 'mobile-128')
     * @param options Additional options:
     *   - modelData: pre-fetched ONNX weights; takes precedence over both the
     *     cache and the CDN download
     *   - cacheModel: pass false to bypass and skip the in-memory weight cache
     * @throws Error when the model key is unknown or the CDN download fails
     */
    async initialize(modelKey = 'standard-256', options) {
        const modelInfo = SpeakerVerification.MODELS[modelKey];
        if (!modelInfo) {
            throw new Error(`Unknown model: ${modelKey}. Available models: ${Object.keys(SpeakerVerification.MODELS).join(', ')}`);
        }
        const useCache = options?.cacheModel !== false;
        // Resolve the weights: explicit data beats cache beats CDN.
        let modelData;
        if (options?.modelData) {
            // Explicitly supplied weights must not be shadowed by a stale
            // cache entry for the same model id.
            modelData = options.modelData;
        }
        else if (useCache && this.modelCache.has(modelInfo.id)) {
            modelData = this.modelCache.get(modelInfo.id);
        }
        else {
            // Download the ONNX weights from the HuggingFace CDN.
            const modelUrl = `https://huggingface.co/jaehyun-ko/next-tdnn-onnx/resolve/main/${modelInfo.id}.onnx`;
            const response = await fetch(modelUrl);
            if (!response.ok) {
                // statusText is often empty (HTTP/2); include the numeric code.
                throw new Error(`Failed to load model from ${modelUrl}: ${response.status} ${response.statusText}`);
            }
            modelData = await response.arrayBuffer();
        }
        // Cache model if requested (also refreshes the entry when the caller
        // supplied weights directly).
        if (useCache) {
            this.modelCache.set(modelInfo.id, modelData);
        }
        await this.loadEngine(modelData, modelInfo.id);
    }
    /**
     * Tears down any existing engine and replaces it with a fresh one built
     * from the given ONNX weights. Records the active model id on success.
     */
    async loadEngine(modelData, modelId) {
        // Clean up previous engine before swapping models.
        if (this.engine) {
            await this.engine.cleanup();
        }
        this.engine = new inference_1.SpeakerVerificationEngine({
            modelData: modelData,
            metric: 'cosine' // All NeXt-TDNN models use cosine similarity
        });
        await this.engine.initialize();
        this.currentModelId = modelId;
    }
    /**
     * Compare two audio inputs and return similarity
     * @param audio1 First audio (File, Blob, ArrayBuffer, or Float32Array)
     * @param audio2 Second audio (File, Blob, ArrayBuffer, or Float32Array)
     * @returns { similarity, processingTime } — similarity score
     *   (0.0-1.0, higher = more similar) and elapsed time in ms
     * @throws Error if initialize() has not run or no AudioContext exists
     */
    async compareAudio(audio1, audio2) {
        if (!this.engine) {
            throw new Error('Engine not initialized. Call initialize() first.');
        }
        if (!this.audioContext) {
            throw new Error('AudioContext not available. This API requires a browser environment.');
        }
        const startTime = performance.now();
        // Normalize both inputs to 16 kHz mono Float32Array.
        const audioData1 = await this.processAudioInput(audio1);
        const audioData2 = await this.processAudioInput(audio2);
        // Embeddings are computed sequentially on purpose: the engine wraps a
        // single inference session which may not be concurrency-safe.
        const embedding1 = await this.engine.processAudioSegment({
            data: audioData1,
            sampleRate: 16000,
            duration: audioData1.length / 16000
        });
        const embedding2 = await this.engine.processAudioSegment({
            data: audioData2,
            sampleRate: 16000,
            duration: audioData2.length / 16000
        });
        // Cosine similarity with the engine's normalization applied.
        const result = this.engine.computeSimilarityWithNorm(embedding1, embedding2);
        const similarity = result.raw;
        const processingTime = performance.now() - startTime;
        return {
            similarity,
            processingTime
        };
    }
    /**
     * Extract speaker embedding from audio input
     * @param audio Audio input (File, Blob, ArrayBuffer, or Float32Array)
     * @returns { embedding, processingTime } — speaker embedding as a
     *   Float32Array and elapsed time in ms
     * @throws Error if initialize() has not run or no AudioContext exists
     */
    async getEmbedding(audio) {
        if (!this.engine) {
            throw new Error('Engine not initialized. Call initialize() first.');
        }
        if (!this.audioContext) {
            throw new Error('AudioContext not available. This API requires a browser environment.');
        }
        const startTime = performance.now();
        // Normalize the input to 16 kHz mono Float32Array.
        const audioData = await this.processAudioInput(audio);
        const speakerEmbedding = await this.engine.processAudioSegment({
            data: audioData,
            sampleRate: 16000,
            duration: audioData.length / 16000
        });
        const processingTime = performance.now() - startTime;
        return {
            embedding: speakerEmbedding.embedding,
            processingTime
        };
    }
    /**
     * Compare two pre-computed embeddings
     * @param embedding1 First speaker embedding
     * @param embedding2 Second speaker embedding
     * @returns Similarity score (0.0-1.0, higher = more similar)
     * @throws Error if initialize() has not been called
     */
    compareEmbeddings(embedding1, embedding2) {
        if (!this.engine) {
            throw new Error('Engine not initialized. Call initialize() first.');
        }
        // Static cosine-similarity helper; no per-call engine state involved.
        return inference_1.SpeakerVerificationEngine.computeSimilarity(embedding1, embedding2);
    }
    /**
     * Process various audio input formats into a 16 kHz mono Float32Array.
     * Accepts Float32Array (passed through), ArrayBuffer, Blob, or File.
     * @throws Error on unsupported input types or missing AudioContext
     */
    async processAudioInput(audio) {
        if (!this.audioContext) {
            throw new Error('AudioContext not available');
        }
        if (audio instanceof Float32Array) {
            // Assumed to already be 16 kHz mono — TODO confirm with callers;
            // no resampling is applied in this path.
            return audio;
        }
        // Convert the remaining input types to an ArrayBuffer.
        let arrayBuffer;
        if (audio instanceof ArrayBuffer) {
            // Clone so decodeAudioData detaches the copy, not the caller's buffer.
            arrayBuffer = audio.slice(0);
        }
        else if (audio instanceof Blob || (typeof File !== 'undefined' && audio instanceof File)) {
            arrayBuffer = await audio.arrayBuffer();
        }
        else {
            throw new Error('Unsupported audio input type');
        }
        // Decode audio (this detaches arrayBuffer).
        const audioBuffer = await this.audioContext.decodeAudioData(arrayBuffer);
        let audioData = audioBuffer.getChannelData(0); // Get mono channel
        // Resample to the model's expected 16 kHz rate if needed.
        if (audioBuffer.sampleRate !== 16000) {
            audioData = this.resampleAudio(audioData, audioBuffer.sampleRate, 16000);
        }
        return audioData;
    }
    /**
     * Simple nearest-neighbor resampling.
     * NOTE(review): no anti-aliasing filter or interpolation is applied, so
     * downsampling can alias; adequate for speech but worth confirming
     * against the model's training preprocessing.
     */
    resampleAudio(audioData, fromSampleRate, toSampleRate) {
        const ratio = fromSampleRate / toSampleRate;
        const newLength = Math.floor(audioData.length / ratio);
        const resampled = new Float32Array(newLength);
        for (let i = 0; i < newLength; i++) {
            const srcIndex = Math.floor(i * ratio);
            resampled[i] = audioData[srcIndex];
        }
        return resampled;
    }
    /**
     * Get information about the current model
     * @returns The MODELS entry for the loaded model, or null if none loaded
     */
    getCurrentModel() {
        if (!this.currentModelId)
            return null;
        const match = Object.values(SpeakerVerification.MODELS)
            .find((info) => info.id === this.currentModelId);
        return match ?? null;
    }
    /**
     * Clean up resources: engine, AudioContext, and the weight cache.
     * Safe to call repeatedly; leaves the instance fully reset.
     */
    async cleanup() {
        if (this.engine) {
            await this.engine.cleanup();
            this.engine = null;
        }
        if (this.audioContext) {
            await this.audioContext.close();
            this.audioContext = null;
        }
        this.modelCache.clear();
        // Reset the id so getCurrentModel() no longer reports a loaded model.
        this.currentModelId = '';
    }
}
exports.SpeakerVerification = SpeakerVerification;
// Registry of available models. `id` is the ONNX filename fetched from the
// HuggingFace repo jaehyun-ko/next-tdnn-onnx; `size` appears to be the
// approximate download size in bytes — TODO confirm against the hosted files.
SpeakerVerification.MODELS = {
    // Standard models (recommended)
    'standard-256': {
        id: 'NeXt_TDNN_C256_B3_K65_7',
        name: 'Standard (256 channels)',
        size: 28 * 1024 * 1024,
        channels: 256,
        type: 'standard'
    },
    'standard-128': {
        id: 'NeXt_TDNN_C128_B3_K65_7',
        name: 'Compact (128 channels)',
        size: 7.5 * 1024 * 1024,
        channels: 128,
        type: 'standard'
    },
    'standard-192': {
        id: 'NeXt_TDNN_C192_B1_K65_7',
        name: 'Medium (192 channels)',
        size: 16 * 1024 * 1024,
        channels: 192,
        type: 'standard'
    },
    'standard-384': {
        id: 'NeXt_TDNN_C384_B1_K65_7',
        name: 'Large (384 channels)',
        size: 32 * 1024 * 1024,
        channels: 384,
        type: 'standard'
    },
    // Mobile models (lightweight)
    'mobile-128': {
        id: 'NeXt_TDNN_light_C128_B3_K65',
        name: 'Mobile Compact (128 channels)',
        size: 5 * 1024 * 1024,
        channels: 128,
        type: 'mobile'
    },
    'mobile-256': {
        id: 'NeXt_TDNN_light_C256_B3_K65',
        name: 'Mobile Standard (256 channels)',
        size: 20 * 1024 * 1024,
        channels: 256,
        type: 'mobile'
    }
};
// Default export mirrors the named export for easy `import X from ...` use.
exports.default = SpeakerVerification;