UNPKG

@jaehyun-ko/speaker-verification

Version:

Real-time speaker verification in the browser using NeXt-TDNN models

2 lines 15.1 kB
/**
 * @jaehyun-ko/speaker-verification - browser bundle (UMD), de-minified.
 *
 * Exposes a single class, `SpeakerVerification`, which turns audio into
 * speaker embeddings with an ONNX model (via the external `ort` dependency)
 * and compares embeddings.
 *
 * Export shape (unchanged from the minified build):
 *   - CommonJS:  module.exports = SpeakerVerification
 *   - AMD:       define(["ort"], factory)
 *   - global:    <root>.SpeakerVerification = SpeakerVerification
 *
 * Differences from the minified text this replaces:
 *   - String literals that had been hard-wrapped across lines are rejoined.
 *   - The global fallback resolves `globalThis`/`self` before `this`, so the
 *     bundle also loads where top-level `this` is undefined (ES modules).
 *   - `navigator` is only read when it exists.
 *   - Wrapped errors keep the original error as `cause`.
 *   - Cohort loading checks the HTTP status.
 *   - `normalizeAudio` no longer spreads the whole buffer into `Math.max`.
 *   - The trailing sourceMappingURL directive is gone: it described the
 *     minified text, not this one.
 */
(function (root, factory) {
  if (typeof exports === "object" && typeof module === "object") {
    module.exports = factory(require("ort"));
  } else if (typeof define === "function" && define.amd) {
    define(["ort"], factory);
  } else if (typeof exports === "object") {
    exports.SpeakerVerification = factory(require("ort"));
  } else {
    root.SpeakerVerification = factory(root.ort);
  }
})(
  typeof globalThis !== "undefined"
    ? globalThis
    : typeof self !== "undefined"
      ? self
      : this,
  function (ort) {
    "use strict";

    // ------------------------------------------------------------------
    // Configuration defaults
    // ------------------------------------------------------------------

    /** Default capture settings. Not referenced by the classes below. */
    const DEFAULT_AUDIO_CONFIG = {
      sampleRate: 16000,
      channelCount: 1,
      bufferSize: 4096,
    };

    /** Default mel-spectrogram settings used by AudioPreprocessor. */
    const DEFAULT_MEL_CONFIG = {
      sampleRate: 16000,
      nFft: 512,
      winLength: 400,
      hopLength: 160,
      nMels: 80,
      preEmphasisCoef: 0.97,
    };

    /** Number of mel rows declared in the model input tensor shape. */
    const MODEL_INPUT_MELS = 80;

    /** Every segment is zero-padded or truncated to this many samples. */
    const SEGMENT_SAMPLES = 48240;

    /** Sample rate the public class converts decoded audio to. */
    const TARGET_SAMPLE_RATE = 16000;

    // ------------------------------------------------------------------
    // Audio utilities
    // (the three free functions are kept from the original bundle; the
    // classes below do not call them)
    // ------------------------------------------------------------------

    /** Smallest power of two that is >= value. */
    function nextPowerOf2(value) {
      return Math.pow(2, Math.ceil(Math.log2(value)));
    }

    /**
     * Scale samples so the largest absolute value becomes 1.
     * Returns the input unchanged when it is all zeros.
     */
    function normalizeAudio(samples) {
      // Plain loop instead of Math.max(...samples): spreading a long buffer
      // into call arguments can exceed the engine's argument limit.
      let peak = 0;
      for (let i = 0; i < samples.length; i++) {
        const magnitude = Math.abs(samples[i]);
        if (magnitude > peak) {
          peak = magnitude;
        }
      }
      if (peak === 0) {
        return samples;
      }
      const scaled = new Float32Array(samples.length);
      const gain = 1 / peak;
      for (let i = 0; i < samples.length; i++) {
        scaled[i] = samples[i] * gain;
      }
      return scaled;
    }

    /** Resample with linear interpolation between neighbouring samples. */
    function resampleAudio(samples, fromRate, toRate) {
      if (fromRate === toRate) {
        return samples;
      }
      const ratio = toRate / fromRate;
      const outLength = Math.round(samples.length * ratio);
      const out = new Float32Array(outLength);
      for (let i = 0; i < outLength; i++) {
        const position = i / ratio;
        const index = Math.floor(position);
        const fraction = position - index;
        if (index + 1 < samples.length) {
          out[i] = samples[index] * (1 - fraction) + samples[index + 1] * fraction;
        } else {
          out[i] = samples[index];
        }
      }
      return out;
    }

    /** In-place iterative radix-2 FFT with precomputed twiddle tables. */
    class FFT {
      /**
       * @param {number} size transform length
       * @throws {Error} when size is not a power of two
       */
      constructor(size) {
        this.size = size;
        const bits = Math.log2(size);
        if (bits !== Math.floor(bits)) {
          throw new Error("FFT size must be a power of 2");
        }
        this.cosTable = new Float32Array(size / 2);
        this.sinTable = new Float32Array(size / 2);
        for (let k = 0; k < size / 2; k++) {
          const angle = (2 * Math.PI * k) / size;
          this.cosTable[k] = Math.cos(angle);
          this.sinTable[k] = Math.sin(angle);
        }
        this.reverseTable = new Uint32Array(size);
        // reverseBits flips all 32 bits; shift down to keep the top `bits`.
        const shift = 32 - bits;
        for (let i = 0; i < size; i++) {
          this.reverseTable[i] = this.reverseBits(i) >>> shift;
        }
      }

      /** Reverse the bit order of a 32-bit integer. */
      reverseBits(value) {
        let v = value;
        v = ((v & 0x55555555) << 1) | ((v & 0xaaaaaaaa) >>> 1);
        v = ((v & 0x33333333) << 2) | ((v & 0xcccccccc) >>> 2);
        v = ((v & 0x0f0f0f0f) << 4) | ((v & 0xf0f0f0f0) >>> 4);
        v = ((v & 0x00ff00ff) << 8) | ((v & 0xff00ff00) >>> 8);
        return ((v & 0xffff) << 16) | ((v & 0xffff0000) >>> 16);
      }

      /**
       * Transform `real`/`imag` in place. Both must have `size` elements.
       */
      forward(real, imag) {
        const size = this.size;

        // Bit-reversal permutation.
        for (let i = 0; i < size; i++) {
          const j = this.reverseTable[i];
          if (j > i) {
            [real[i], real[j]] = [real[j], real[i]];
            [imag[i], imag[j]] = [imag[j], imag[i]];
          }
        }

        // Butterfly passes over blocks of doubling length.
        for (let span = 2; span <= size; span *= 2) {
          const half = span / 2;
          const step = size / span;
          for (let start = 0; start < size; start += span) {
            for (let i = start, k = 0; i < start + half; i++, k += step) {
              const j = i + half;
              const cos = this.cosTable[k];
              const sin = this.sinTable[k];
              const tr = real[j] * cos - imag[j] * sin;
              const ti = real[j] * sin + imag[j] * cos;
              real[j] = real[i] - tr;
              imag[j] = imag[i] - ti;
              real[i] += tr;
              imag[i] += ti;
            }
          }
        }
      }

      /** Magnitudes of bins 0 .. size/2 (inclusive). */
      getMagnitudeSpectrum(real, imag) {
        const bins = Math.floor(this.size / 2) + 1;
        const magnitudes = new Float32Array(bins);
        for (let i = 0; i < bins; i++) {
          magnitudes[i] = Math.sqrt(real[i] * real[i] + imag[i] * imag[i]);
        }
        return magnitudes;
      }
    }

    // ------------------------------------------------------------------
    // Score normalisation against a cohort of embeddings
    // ------------------------------------------------------------------

    class ScoreNormalizer {
      /**
       * @param {{cohortSize?: number, topK?: number}} [options]
       */
      constructor(options = {}) {
        this.cohortEmbeddings = [];
        this.config = {
          cohortSize: options.cohortSize || 6000,
          topK: options.topK || 300,
        };
      }

      /** Replace the cohort, keeping at most `cohortSize` embeddings. */
      addCohortEmbeddings(embeddings) {
        this.cohortEmbeddings = embeddings.slice(0, this.config.cohortSize);
      }

      /**
       * Fetch a JSON document with an `embeddings` array of number arrays
       * and install it as the cohort.
       * @throws {Error} on a non-OK HTTP response
       */
      async loadCohortEmbeddings(url) {
        const response = await fetch(url);
        if (!response.ok) {
          throw new Error(
            `Failed to load cohort embeddings from ${url}: ${response.statusText}`
          );
        }
        const payload = await response.json();
        this.addCohortEmbeddings(
          payload.embeddings.map((values) => new Float32Array(values))
        );
      }

      /** Dot product over the length of `a`. */
      computeSimilarity(a, b) {
        let dot = 0;
        for (let i = 0; i < a.length; i++) {
          dot += a[i] * b[i];
        }
        return dot;
      }

      /**
       * Average of the two z-scores of `rawScore`, each taken against the
       * top-K cohort similarities of one of the two embeddings.
       * Returns `rawScore` unchanged when no cohort is loaded.
       */
      normalize(embeddingA, embeddingB, rawScore) {
        if (this.cohortEmbeddings.length === 0) {
          return rawScore;
        }
        const topScores = (embedding) =>
          this.cohortEmbeddings
            .map((cohort) => this.computeSimilarity(embedding, cohort))
            .sort((x, y) => y - x)
            .slice(0, this.config.topK);

        const topA = topScores(embeddingA);
        const topB = topScores(embeddingB);
        const meanA = this.computeMean(topA);
        const stdA = this.computeStd(topA, meanA);
        const meanB = this.computeMean(topB);
        const stdB = this.computeStd(topB, meanB);

        // 1e-6 keeps the division finite when a cohort has zero spread.
        return (
          0.5 *
          ((rawScore - meanA) / (stdA + 1e-6) + (rawScore - meanB) / (stdB + 1e-6))
        );
      }

      /** Arithmetic mean; 0 for an empty list. */
      computeMean(values) {
        if (values.length === 0) {
          return 0;
        }
        return values.reduce((sum, v) => sum + v, 0) / values.length;
      }

      /** Population standard deviation around `mean`; 1 for an empty list. */
      computeStd(values, mean) {
        if (values.length === 0) {
          return 1;
        }
        const variance =
          values.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / values.length;
        return Math.sqrt(variance);
      }

      getCohortStats() {
        return {
          size: this.cohortEmbeddings.length,
          topK: this.config.topK,
          loaded: this.cohortEmbeddings.length > 0,
        };
      }
    }

    // ------------------------------------------------------------------
    // ONNX model wrapper
    // ------------------------------------------------------------------

    class NeXtTDNNModel {
      /**
       * @param {{modelPath?: *, modelData?: *, executionProviders?: string[]}} config
       */
      constructor(config) {
        this.session = null;
        this.config = config;
      }

      /**
       * Create the inference session from `modelPath` (when it is a
       * non-empty string) or otherwise from `modelData` / `modelPath`.
       * @throws {Error} "Failed to load model: ..." on any failure
       */
      async initialize() {
        try {
          const cores =
            typeof navigator !== "undefined" ? navigator.hardwareConcurrency : 0;
          ort.env.wasm.numThreads = cores || 4;
          ort.env.wasm.simd = true;

          const sessionOptions = {
            executionProviders: this.config.executionProviders || ["wasm"],
            graphOptimizationLevel: "all",
          };
          const { modelPath, modelData } = this.config;
          if (!modelPath && !modelData) {
            throw new Error("Either modelPath or modelData must be provided");
          }
          const source =
            modelPath && typeof modelPath === "string"
              ? modelPath
              : modelData || modelPath;
          this.session = await ort.InferenceSession.create(source, sessionOptions);
        } catch (err) {
          throw new Error(`Failed to load model: ${err}`, { cause: err });
        }
      }

      /**
       * Run the model on a feature buffer declared as [1, 80, numFrames]
       * and return an L2-normalised embedding.
       * A 2-D output is used as is; a 3-D output is averaged over its last
       * dimension.
       * @returns {Promise<{embedding: Float32Array, timestamp: number}>}
       * @throws {Error} "Inference failed: ..." on any failure
       */
      async infer(features, numFrames) {
        if (!this.session) {
          throw new Error("Model not initialized. Call initialize() first.");
        }
        try {
          const inputName = this.session.inputNames[0];
          const outputName = this.session.outputNames[0];
          const input = new ort.Tensor("float32", features, [
            1,
            MODEL_INPUT_MELS,
            numFrames,
          ]);
          const results = await this.session.run({ [inputName]: input });
          const output = results[outputName];
          const values = output.data;

          if (values.length === 0) {
            throw new Error("Model output is empty");
          }
          for (let i = 0; i < values.length; i++) {
            if (Number.isNaN(values[i])) {
              throw new Error("Model output contains NaN values");
            }
          }

          let embedding;
          if (output.dims.length === 2) {
            embedding = new Float32Array(values);
          } else if (output.dims.length === 3) {
            const [, rows, cols] = output.dims;
            embedding = new Float32Array(rows);
            for (let r = 0; r < rows; r++) {
              let sum = 0;
              for (let c = 0; c < cols; c++) {
                sum += values[r * cols + c];
              }
              embedding[r] = sum / cols;
            }
          } else {
            throw new Error(`Unexpected output dimensions: ${output.dims}`);
          }

          let norm = 0;
          for (let i = 0; i < embedding.length; i++) {
            norm += embedding[i] * embedding[i];
          }
          norm = Math.sqrt(norm);
          if (norm === 0 || Number.isNaN(norm)) {
            norm = 1; // leave an all-zero vector untouched
          }
          for (let i = 0; i < embedding.length; i++) {
            embedding[i] = embedding[i] / norm;
          }

          return { embedding, timestamp: Date.now() };
        } catch (err) {
          throw new Error(`Inference failed: ${err}`, { cause: err });
        }
      }

      /** Release the session, if any. */
      async cleanup() {
        if (this.session) {
          await this.session.release();
          this.session = null;
        }
      }

      /** Fixed shape description, or null before initialize(). */
      getModelInfo() {
        if (!this.session) {
          return null;
        }
        return { inputShape: [1, 80, -1], outputShape: [1, 192] };
      }
    }

    // ------------------------------------------------------------------
    // Log-mel feature extraction
    // ------------------------------------------------------------------

    class AudioPreprocessor {
      /** @param {object} [config] overrides for DEFAULT_MEL_CONFIG */
      constructor(config = {}) {
        this.melFilterBank = null;
        this.config = { ...DEFAULT_MEL_CONFIG, ...config };
        this.fftProcessor = new FFT(this.config.nFft);
        this.initializeMelFilterBank();
      }

      /** Build `nMels` triangular filters spanning 20 Hz to 7600 Hz. */
      initializeMelFilterBank() {
        const { nFft, nMels, sampleRate } = this.config;
        const numBins = Math.floor(nFft / 2) + 1;
        const melLow = this.hzToMel(20);
        const melHigh = this.hzToMel(7600);

        const melPoints = new Float32Array(nMels + 2);
        for (let i = 0; i < nMels + 2; i++) {
          melPoints[i] = melLow + ((melHigh - melLow) * i) / (nMels + 1);
        }
        // Both maps stay on Float32Array on purpose: the intermediate Hz
        // values are rounded to float32 before flooring, as in the original.
        const binEdges = melPoints
          .map((mel) => this.melToHz(mel))
          .map((hz) => Math.floor(((nFft + 1) * hz) / sampleRate));

        this.melFilterBank = [];
        for (let m = 0; m < nMels; m++) {
          const filter = new Float32Array(numBins);
          const left = binEdges[m];
          const center = binEdges[m + 1];
          const right = binEdges[m + 2];
          for (let b = left; b < center; b++) {
            filter[b] = (b - left) / (center - left);
          }
          for (let b = center; b < right; b++) {
            filter[b] = (right - b) / (right - center);
          }
          this.melFilterBank.push(filter);
        }
      }

      hzToMel(hz) {
        return 2595 * Math.log10(1 + hz / 700);
      }

      melToHz(mel) {
        return 700 * (Math.pow(10, mel / 2595) - 1);
      }

      /**
       * y[0] = x[0]; y[n] = x[n] - coef * x[n-1].
       * @throws {Error} on empty or missing input
       */
      preEmphasis(signal) {
        if (!signal || signal.length === 0) {
          throw new Error("Cannot apply pre-emphasis to empty or undefined signal");
        }
        const { preEmphasisCoef = 0.97 } = this.config;
        const out = new Float32Array(signal.length);
        out[0] = signal[0];
        for (let i = 1; i < signal.length; i++) {
          out[i] = signal[i] - preEmphasisCoef * signal[i - 1];
        }
        return out;
      }

      /** Multiply the first `winLength` samples by a Hamming window. */
      applyWindow(frame) {
        const { winLength } = this.config;
        const out = new Float32Array(frame.length);
        const count = Math.min(frame.length, winLength);
        for (let i = 0; i < count; i++) {
          const weight = 0.54 - 0.46 * Math.cos((2 * Math.PI * i) / (winLength - 1));
          out[i] = frame[i] * weight;
        }
        return out;
      }

      /** Magnitude spectrum of a frame, zero-padded to `nFft`. */
      fft(frame) {
        const { nFft } = this.config;
        const real = new Float32Array(nFft);
        const imag = new Float32Array(nFft);
        const count = Math.min(frame.length, nFft);
        for (let i = 0; i < count; i++) {
          real[i] = frame[i];
        }
        this.fftProcessor.forward(real, imag);
        return this.fftProcessor.getMagnitudeSpectrum(real, imag);
      }

      /**
       * Log-mel spectrogram of `segment.data`, laid out mel-major
       * (index = mel * numFrames + frame), with the per-mel mean over time
       * subtracted.
       * @param {{data: Float32Array}} segment
       * @returns {Float32Array}
       * @throws {Error} on empty or missing data
       */
      computeMelSpectrogram(segment) {
        const { data } = segment;
        const { winLength, hopLength, nMels } = this.config;
        if (!data || data.length === 0) {
          throw new Error("Invalid audio data: audio segment is empty or undefined");
        }

        const emphasized = this.preEmphasis(data);
        const numFrames = Math.floor((emphasized.length - winLength) / hopLength) + 1;
        const mel = new Float32Array(nMels * numFrames);

        for (let frame = 0; frame < numFrames; frame++) {
          const start = frame * hopLength;
          const windowed = this.applyWindow(
            emphasized.subarray(start, start + winLength)
          );
          const spectrum = this.fft(windowed);

          for (let m = 0; m < nMels; m++) {
            const filter = this.melFilterBank[m];
            const bins = Math.min(spectrum.length, filter.length);
            let energy = 0;
            for (let b = 0; b < bins; b++) {
              energy += spectrum[b] * spectrum[b] * filter[b];
            }
            if (Number.isNaN(energy) || energy < 0) {
              energy = 0;
            }
            mel[m * numFrames + frame] = Math.log(energy + 1e-6);
          }
        }

        for (let m = 0; m < nMels; m++) {
          let sum = 0;
          for (let f = 0; f < numFrames; f++) {
            sum += mel[m * numFrames + f];
          }
          const mean = sum / numFrames;
          for (let f = 0; f < numFrames; f++) {
            mel[m * numFrames + f] -= mean;
          }
        }
        return mel;
      }

      /** Returns `features` unchanged; `numFrames` is not used. */
      reshapeForModel(features, numFrames) {
        return features;
      }
    }

    // ------------------------------------------------------------------
    // Engine: preprocessing + model + scoring
    // ------------------------------------------------------------------

    class SpeakerVerificationEngine {
      /**
       * @param {object} modelConfig passed to NeXtTDNNModel; `metric` selects
       *        "cosine" (default) or "euclidean" scoring
       * @param {object} [melConfig] passed to AudioPreprocessor
       * @param {object} [normConfig] when given, a ScoreNormalizer is created
       */
      constructor(modelConfig, melConfig, normConfig) {
        this.scoreNormalizer = null;
        this.isInitialized = false;
        this.model = new NeXtTDNNModel(modelConfig);
        this.preprocessor = new AudioPreprocessor(melConfig);
        this.metric = modelConfig.metric || "cosine";
        if (normConfig) {
          this.scoreNormalizer = new ScoreNormalizer(normConfig);
        }
      }

      async initialize() {
        await this.model.initialize();
        this.isInitialized = true;
      }

      /**
       * Pad with zeros or truncate to 48240 samples, extract features and
       * run the model.
       * @param {{data: Float32Array, sampleRate: number}} segment
       * @throws {Error} when initialize() has not completed
       */
      async processAudioSegment(segment) {
        if (!this.isInitialized) {
          throw new Error("Engine not initialized. Call initialize() first.");
        }
        let samples = segment.data;
        if (samples.length < SEGMENT_SAMPLES) {
          const padded = new Float32Array(SEGMENT_SAMPLES);
          padded.set(samples);
          samples = padded;
        } else if (samples.length > SEGMENT_SAMPLES) {
          samples = samples.slice(0, SEGMENT_SAMPLES);
        }

        const fixed = {
          data: samples,
          sampleRate: segment.sampleRate,
          duration: samples.length / segment.sampleRate,
        };
        const features = this.preprocessor.computeMelSpectrogram(fixed);
        const { winLength, hopLength } = this.preprocessor.config;
        const numFrames = Math.floor((fixed.data.length - winLength) / hopLength) + 1;
        return await this.model.infer(features, numFrames);
      }

      async cleanup() {
        await this.model.cleanup();
        this.isInitialized = false;
      }

      /** True when the L2 norm is within 0.001 of 1. */
      static isNormalized(embedding) {
        let sumSquares = 0;
        for (let i = 0; i < embedding.length; i++) {
          sumSquares += embedding[i] * embedding[i];
        }
        return Math.abs(Math.sqrt(sumSquares) - 1) < 0.001;
      }

      /**
       * Dot product clamped to [-1, 1].
       * @throws {Error} on length mismatch
       */
      static computeSimilarity(a, b) {
        if (a.length !== b.length) {
          throw new Error("Embeddings must have the same length");
        }
        let dot = 0;
        for (let i = 0; i < a.length; i++) {
          dot += a[i] * b[i];
        }
        return Math.max(-1, Math.min(1, dot));
      }

      /** @throws {Error} on length mismatch */
      static computeEuclideanDistance(a, b) {
        if (a.length !== b.length) {
          throw new Error("Embeddings must have the same length");
        }
        let sumSquares = 0;
        for (let i = 0; i < a.length; i++) {
          const diff = a[i] - b[i];
          sumSquares += diff * diff;
        }
        return Math.sqrt(sumSquares);
      }

      static verifySpeaker(resultA, resultB, threshold = 0.5) {
        return this.computeSimilarity(resultA.embedding, resultB.embedding) >= threshold;
      }

      /** Load a cohort, creating a default ScoreNormalizer if needed. */
      async loadCohortEmbeddings(url) {
        if (!this.scoreNormalizer) {
          this.scoreNormalizer = new ScoreNormalizer();
        }
        await this.scoreNormalizer.loadCohortEmbeddings(url);
      }

      /**
       * @returns {{raw: number, normalized: (number|undefined), metric: string}}
       *          `normalized` is set only when a cohort is loaded
       */
      computeSimilarityWithNorm(resultA, resultB) {
        let raw;
        if (this.metric === "euclidean") {
          const distance = SpeakerVerificationEngine.computeEuclideanDistance(
            resultA.embedding,
            resultB.embedding
          );
          raw = Math.max(0, 1 - distance / 2);
        } else {
          raw = SpeakerVerificationEngine.computeSimilarity(
            resultA.embedding,
            resultB.embedding
          );
        }

        let normalized;
        if (this.scoreNormalizer && this.scoreNormalizer.getCohortStats().loaded) {
          normalized = this.scoreNormalizer.normalize(
            resultA.embedding,
            resultB.embedding,
            raw
          );
        }
        return { raw, normalized, metric: this.metric };
      }

      verifySpeakerWithNorm(resultA, resultB, threshold = 0.5, useNormalized = true) {
        const scores = this.computeSimilarityWithNorm(resultA, resultB);
        const score =
          useNormalized && scores.normalized !== undefined
            ? scores.normalized
            : scores.raw;
        return { isMatch: score >= threshold, scores };
      }
    }

    // ------------------------------------------------------------------
    // Public class
    // ------------------------------------------------------------------

    class SpeakerVerification {
      constructor() {
        this.engine = null;
        this.audioContext = null;
        this.currentModelId = "";
        this.modelCache = new Map();
        if (typeof window !== "undefined") {
          this.audioContext = new (window.AudioContext || window.webkitAudioContext)();
        }
      }

      /**
       * Load a model by key (see SpeakerVerification.MODELS).
       * Model bytes come from the in-memory cache, from `options.modelData`,
       * or are downloaded; caching is on unless `options.cacheModel === false`.
       * @param {string} [modelKey="standard-256"]
       * @param {{modelData?: *, cacheModel?: boolean}} [options]
       * @throws {Error} for an unknown key or a failed download
       */
      async initialize(modelKey = "standard-256", options) {
        const known = Object.prototype.hasOwnProperty.call(
          SpeakerVerification.MODELS,
          modelKey
        );
        const model = known ? SpeakerVerification.MODELS[modelKey] : undefined;
        if (!model) {
          throw new Error(
            `Unknown model: ${modelKey}. Available models: ${Object.keys(SpeakerVerification.MODELS).join(", ")}`
          );
        }

        const useCache = options?.cacheModel !== false;
        if (useCache && this.modelCache.has(model.id)) {
          await this.loadEngine(this.modelCache.get(model.id), model.id);
          return;
        }

        let modelData;
        if (options?.modelData) {
          modelData = options.modelData;
        } else {
          const url = `https://huggingface.co/jaehyun-ko/next-tdnn-onnx/resolve/main/${model.id}.onnx`;
          const response = await fetch(url);
          if (!response.ok) {
            throw new Error(`Failed to load model from ${url}: ${response.statusText}`);
          }
          modelData = await response.arrayBuffer();
        }

        if (useCache) {
          this.modelCache.set(model.id, modelData);
        }
        await this.loadEngine(modelData, model.id);
      }

      /** Replace the current engine with one built from `modelData`. */
      async loadEngine(modelData, modelId) {
        if (this.engine) {
          await this.engine.cleanup();
        }
        this.engine = new SpeakerVerificationEngine({ modelData, metric: "cosine" });
        await this.engine.initialize();
        this.currentModelId = modelId;
      }

      /**
       * Embed two audio inputs and return their raw similarity.
       * @returns {Promise<{similarity: number, processingTime: number}>}
       */
      async compareAudio(audioA, audioB) {
        if (!this.engine) {
          throw new Error("Engine not initialized. Call initialize() first.");
        }
        if (!this.audioContext) {
          throw new Error(
            "AudioContext not available. This API requires a browser environment."
          );
        }
        const startedAt = performance.now();
        const samplesA = await this.processAudioInput(audioA);
        const samplesB = await this.processAudioInput(audioB);
        const resultA = await this.engine.processAudioSegment({
          data: samplesA,
          sampleRate: TARGET_SAMPLE_RATE,
          duration: samplesA.length / TARGET_SAMPLE_RATE,
        });
        const resultB = await this.engine.processAudioSegment({
          data: samplesB,
          sampleRate: TARGET_SAMPLE_RATE,
          duration: samplesB.length / TARGET_SAMPLE_RATE,
        });
        return {
          similarity: this.engine.computeSimilarityWithNorm(resultA, resultB).raw,
          processingTime: performance.now() - startedAt,
        };
      }

      /**
       * @returns {Promise<{embedding: Float32Array, processingTime: number}>}
       */
      async getEmbedding(audio) {
        if (!this.engine) {
          throw new Error("Engine not initialized. Call initialize() first.");
        }
        if (!this.audioContext) {
          throw new Error(
            "AudioContext not available. This API requires a browser environment."
          );
        }
        const startedAt = performance.now();
        const samples = await this.processAudioInput(audio);
        const result = await this.engine.processAudioSegment({
          data: samples,
          sampleRate: TARGET_SAMPLE_RATE,
          duration: samples.length / TARGET_SAMPLE_RATE,
        });
        const processingTime = performance.now() - startedAt;
        return { embedding: result.embedding, processingTime };
      }

      /** Clamped dot product of two embeddings. Requires initialize(). */
      compareEmbeddings(embeddingA, embeddingB) {
        if (!this.engine) {
          throw new Error("Engine not initialized. Call initialize() first.");
        }
        return SpeakerVerificationEngine.computeSimilarity(embeddingA, embeddingB);
      }

      /**
       * Turn supported input into samples.
       * A Float32Array is returned as is (no resampling); ArrayBuffer, Blob
       * and File inputs are decoded, channel 0 is taken, and the result is
       * resampled to 16 kHz when the decoded rate differs.
       * @throws {Error} for other input types
       */
      async processAudioInput(input) {
        if (!this.audioContext) {
          throw new Error("AudioContext not available");
        }
        if (input instanceof Float32Array) {
          return input;
        }

        let encoded;
        if (input instanceof ArrayBuffer) {
          // Decode a copy so the caller's buffer is left alone.
          encoded = input.slice(0);
        } else if (
          input instanceof Blob ||
          (typeof File !== "undefined" && input instanceof File)
        ) {
          encoded = await input.arrayBuffer();
        } else {
          throw new Error("Unsupported audio input type");
        }

        const decoded = await this.audioContext.decodeAudioData(encoded);
        let samples = decoded.getChannelData(0);
        if (decoded.sampleRate !== TARGET_SAMPLE_RATE) {
          samples = this.resampleAudio(samples, decoded.sampleRate, TARGET_SAMPLE_RATE);
        }
        return samples;
      }

      /** Resample by picking the sample at floor(i * fromRate / toRate). */
      resampleAudio(samples, fromRate, toRate) {
        const ratio = fromRate / toRate;
        const outLength = Math.floor(samples.length / ratio);
        const out = new Float32Array(outLength);
        for (let i = 0; i < outLength; i++) {
          out[i] = samples[Math.floor(i * ratio)];
        }
        return out;
      }

      /** MODELS entry for the loaded model, or null. */
      getCurrentModel() {
        if (!this.currentModelId) {
          return null;
        }
        const match = Object.values(SpeakerVerification.MODELS).find(
          (model) => model.id === this.currentModelId
        );
        return match || null;
      }

      /** Release the engine, close the audio context and drop cached models. */
      async cleanup() {
        if (this.engine) {
          await this.engine.cleanup();
          this.engine = null;
        }
        if (this.audioContext) {
          await this.audioContext.close();
          this.audioContext = null;
        }
        this.modelCache.clear();
      }
    }

    /** Known models, keyed by the name accepted by initialize(). */
    SpeakerVerification.MODELS = {
      "standard-256": {
        id: "NeXt_TDNN_C256_B3_K65_7",
        name: "Standard (256 channels)",
        size: 29360128,
        channels: 256,
        type: "standard",
      },
      "standard-128": {
        id: "NeXt_TDNN_C128_B3_K65_7",
        name: "Compact (128 channels)",
        size: 7864320,
        channels: 128,
        type: "standard",
      },
      "standard-192": {
        id: "NeXt_TDNN_C192_B1_K65_7",
        name: "Medium (192 channels)",
        size: 16777216,
        channels: 192,
        type: "standard",
      },
      "standard-384": {
        id: "NeXt_TDNN_C384_B1_K65_7",
        name: "Large (384 channels)",
        size: 33554432,
        channels: 384,
        type: "standard",
      },
      "mobile-128": {
        id: "NeXt_TDNN_light_C128_B3_K65",
        name: "Mobile Compact (128 channels)",
        size: 5242880,
        channels: 128,
        type: "mobile",
      },
      "mobile-256": {
        id: "NeXt_TDNN_light_C256_B3_K65",
        name: "Mobile Standard (256 channels)",
        size: 20971520,
        channels: 256,
        type: "mobile",
      },
    };

    return SpeakerVerification;
  }
);