UNPKG

sherpa-onnx

Version:

Speech-to-text, text-to-speech, speaker diarization, and speech enhancement using Next-gen Kaldi without internet connection

365 lines (285 loc) 9.91 kB
function freeConfig(config, Module) { if ('buffer' in config) { Module._free(config.buffer); } if ('transducer' in config) { freeConfig(config.transducer, Module); } if ('featConfig' in config) { freeConfig(config.featConfig, Module); } if ('modelConfig' in config) { freeConfig(config.modelConfig, Module); } if ('keywordsBuffer' in config) { Module._free(config.keywordsBuffer); } Module._free(config.ptr); } function initSherpaOnnxOnlineTransducerModelConfig(config, Module) { const encoderLen = Module.lengthBytesUTF8(config.encoder) + 1; const decoderLen = Module.lengthBytesUTF8(config.decoder) + 1; const joinerLen = Module.lengthBytesUTF8(config.joiner) + 1; const n = encoderLen + decoderLen + joinerLen; const buffer = Module._malloc(n); const len = 3 * 4; // 3 pointers const ptr = Module._malloc(len); let offset = 0; Module.stringToUTF8(config.encoder, buffer + offset, encoderLen); offset += encoderLen; Module.stringToUTF8(config.decoder, buffer + offset, decoderLen); offset += decoderLen; Module.stringToUTF8(config.joiner, buffer + offset, joinerLen); offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); offset += encoderLen; Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += decoderLen; Module.setValue(ptr + 8, buffer + offset, 'i8*'); return { buffer: buffer, ptr: ptr, len: len, } } // The user should free the returned pointers function initModelConfig(config, Module) { if (!('tokensBuf' in config)) { config.tokensBuf = ''; } if (!('tokensBufSize' in config)) { config.tokensBufSize = 0; } const transducer = initSherpaOnnxOnlineTransducerModelConfig(config.transducer, Module); const paraformer_len = 2 * 4 const zipfomer2_ctc_len = 1 * 4 const nemo_ctc_len = 1 * 4 const t_one_ctc_len = 1 * 4 const len = transducer.len + paraformer_len + zipfomer2_ctc_len + 9 * 4 + nemo_ctc_len + t_one_ctc_len; const ptr = Module._malloc(len); Module.HEAPU8.fill(0, ptr, ptr + len); let offset = 0; Module._CopyHeap(transducer.ptr, transducer.len, ptr + offset); const tokensLen = Module.lengthBytesUTF8(config.tokens) + 1; const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; const modelTypeLen = Module.lengthBytesUTF8(config.modelType || '') + 1; const modelingUnitLen = Module.lengthBytesUTF8(config.modelingUnit || '') + 1; const bpeVocabLen = Module.lengthBytesUTF8(config.bpeVocab || '') + 1; const tokensBufLen = Module.lengthBytesUTF8(config.tokensBuf || '') + 1; const bufferLen = tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen + tokensBufLen; const buffer = Module._malloc(bufferLen); offset = 0; Module.stringToUTF8(config.tokens, buffer, tokensLen); offset += tokensLen; Module.stringToUTF8(config.provider || 'cpu', buffer + offset, providerLen); offset += providerLen; Module.stringToUTF8(config.modelType || '', buffer + offset, modelTypeLen); offset += modelTypeLen; Module.stringToUTF8( config.modelingUnit || '', buffer + offset, modelingUnitLen); offset += modelingUnitLen; Module.stringToUTF8(config.bpeVocab || '', buffer + offset, bpeVocabLen); offset += bpeVocabLen; Module.stringToUTF8(config.tokensBuf || '', buffer + offset, tokensBufLen); offset += tokensBufLen; offset = transducer.len + paraformer_len + zipfomer2_ctc_len; Module.setValue(ptr + offset, buffer, 'i8*'); // tokens offset += 4; Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + tokensLen, 'i8*'); // provider offset += 4; Module.setValue(ptr + offset, config.debug, 'i32'); offset += 4; Module.setValue( ptr + offset, buffer + tokensLen + providerLen, 'i8*'); // modelType offset += 4; Module.setValue( ptr + offset, buffer + tokensLen + providerLen + modelTypeLen, 'i8*'); // modelingUnit offset += 4; Module.setValue( ptr + offset, buffer + tokensLen + providerLen + modelTypeLen + modelingUnitLen, 'i8*'); // bpeVocab offset += 4; Module.setValue( ptr + offset, buffer + tokensLen + providerLen + modelTypeLen + modelingUnitLen + bpeVocabLen, 'i8*'); // tokens_buf offset += 4; Module.setValue(ptr + offset, config.tokensBufSize || 0, 'i32'); offset += 4; // skip nemo_ctc and t_one_ctc return { buffer: buffer, ptr: ptr, len: len, transducer: transducer } } function initFeatureExtractorConfig(config, Module) { let ptr = Module._malloc(4 * 2); Module.setValue(ptr, config.samplingRate || 16000, 'i32'); Module.setValue(ptr + 4, config.featureDim || 80, 'i32'); return { ptr: ptr, len: 8, } } function initKwsConfig(config, Module) { if (!('featConfig' in config)) { config.featConfig = { sampleRate: 16000, featureDim: 80, }; } if (!('keywordsBuf' in config)) { config.keywordsBuf = ''; } if (!('keywordsBufSize' in config)) { config.keywordsBufSize = 0; } let featConfig = initFeatureExtractorConfig(config.featConfig, Module); let modelConfig = initModelConfig(config.modelConfig, Module); let numBytes = featConfig.len + modelConfig.len + 4 * 7; let ptr = Module._malloc(numBytes); let offset = 0; Module._CopyHeap(featConfig.ptr, featConfig.len, ptr + offset); offset += featConfig.len; Module._CopyHeap(modelConfig.ptr, modelConfig.len, ptr + offset) offset += modelConfig.len; Module.setValue(ptr + offset, config.maxActivePaths || 4, 'i32'); offset += 4; Module.setValue(ptr + offset, config.numTrailingBlanks || 1, 'i32'); offset += 4; Module.setValue(ptr + offset, config.keywordsScore || 1.0, 'float'); offset += 4; Module.setValue(ptr + offset, config.keywordsThreshold || 0.25, 'float'); offset += 4; let keywordsLen = Module.lengthBytesUTF8(config.keywords) + 1; let keywordsBufLen = Module.lengthBytesUTF8(config.keywordsBuf) + 1; let keywordsBuffer = Module._malloc(keywordsLen + keywordsBufLen); Module.stringToUTF8(config.keywords, keywordsBuffer, keywordsLen); Module.stringToUTF8( config.keywordsBuf, keywordsBuffer + keywordsLen, keywordsBufLen); Module.setValue(ptr + offset, keywordsBuffer, 'i8*'); offset += 4; Module.setValue(ptr + offset, keywordsBuffer + keywordsLen, 'i8*'); offset += 4; Module.setValue(ptr + offset, config.keywordsBufLen, 'i32'); offset += 4; return { ptr: ptr, len: numBytes, featConfig: featConfig, modelConfig: modelConfig, keywordsBuffer: keywordsBuffer } } class Stream { constructor(handle, Module) { this.handle = handle; this.pointer = null; this.n = 0; this.Module = Module; } free() { if (this.handle) { this.Module._SherpaOnnxDestroyOnlineStream(this.handle); this.handle = null; this.Module._free(this.pointer); this.pointer = null; this.n = 0; } } /** * @param sampleRate {Number} * @param samples {Float32Array} Containing samples in the range [-1, 1] */ acceptWaveform(sampleRate, samples) { if (this.n < samples.length) { this.Module._free(this.pointer); this.pointer = this.Module._malloc(samples.length * samples.BYTES_PER_ELEMENT); this.n = samples.length } this.Module.HEAPF32.set(samples, this.pointer / samples.BYTES_PER_ELEMENT); this.Module._SherpaOnnxOnlineStreamAcceptWaveform( this.handle, sampleRate, this.pointer, samples.length); } inputFinished() { this.Module._SherpaOnnxOnlineStreamInputFinished(this.handle); } }; class Kws { constructor(configObj, Module) { this.config = configObj; let config = initKwsConfig(configObj, Module) let handle = Module._SherpaOnnxCreateKeywordSpotter(config.ptr); freeConfig(config, Module); this.handle = handle; this.Module = Module; } free() { this.Module._SherpaOnnxDestroyKeywordSpotter(this.handle); this.handle = 0 } createStream() { let handle = this.Module._SherpaOnnxCreateKeywordStream(this.handle); return new Stream(handle, this.Module); } isReady(stream) { return this.Module._SherpaOnnxIsKeywordStreamReady( this.handle, stream.handle) == 1; } decode(stream) { this.Module._SherpaOnnxDecodeKeywordStream(this.handle, stream.handle); } reset(stream) { this.Module._SherpaOnnxResetKeywordStream(this.handle, stream.handle); } getResult(stream) { let r = this.Module._SherpaOnnxGetKeywordResult(this.handle, stream.handle); let jsonPtr = this.Module.getValue(r + 24, 'i8*'); let json = this.Module.UTF8ToString(jsonPtr); this.Module._SherpaOnnxDestroyKeywordResult(r); return JSON.parse(json); } } function createKws(Module, myConfig) { let transducerConfig = { encoder: './encoder-epoch-12-avg-2-chunk-16-left-64.onnx', decoder: './decoder-epoch-12-avg-2-chunk-16-left-64.onnx', joiner: './joiner-epoch-12-avg-2-chunk-16-left-64.onnx', }; let modelConfig = { transducer: transducerConfig, tokens: './tokens.txt', provider: 'cpu', modelType: '', numThreads: 1, debug: 1, modelingUnit: 'cjkchar', bpeVocab: '', }; let featConfig = { samplingRate: 16000, featureDim: 80, }; let configObj = { featConfig: featConfig, modelConfig: modelConfig, maxActivePaths: 4, numTrailingBlanks: 1, keywordsScore: 1.0, keywordsThreshold: 0.25, keywords: 'x iǎo ài t óng x ué @小爱同学\n' + 'j ūn g ē n iú b ī @军哥牛逼' }; if (myConfig) { configObj = myConfig; } return new Kws(configObj, Module); } if (typeof process == 'object' && typeof process.versions == 'object' && typeof process.versions.node == 'string') { module.exports = { createKws, }; }