UNPKG

vosk

Version:

Node binding for continuous offline voice recoginition with Vosk library.

434 lines (396 loc) 14.4 kB
// @ts-check 'use strict' /** * @module vosk */ const os = require('os'); const path = require('path'); /** @type {import('ffi-napi')} */ const ffi = require('ffi-napi'); /** @type {import('ref-napi')} */ const ref = require('ref-napi'); const vosk_model = ref.types.void; const vosk_model_ptr = ref.refType(vosk_model); const vosk_spk_model = ref.types.void; const vosk_spk_model_ptr = ref.refType(vosk_spk_model); const vosk_recognizer = ref.types.void; const vosk_recognizer_ptr = ref.refType(vosk_recognizer); /** * @typedef {Object} WordResult * @property {number} conf The confidence rate in the detection. 0 For unlikely, and 1 for totally accurate. * @property {number} start The start of the timeframe when the word is pronounced in seconds * @property {number} end The end of the timeframe when the word is pronounced in seconds * @property {string} word The word detected */ /** * @typedef {Object} RecognitionResults * @property {WordResult[]} result Details about the words that have been detected * @property {string} text The complete sentence that have been detected */ /** * @typedef {Object} SpeakerResults * @property {number[]} spk A floating vector representing speaker identity. It is usually about 128 numbers which uniquely represent speaker voice. * @property {number} spk_frames The number of frames used to extract speaker vector. The more frames you have the more reliable is speaker vector. */ /** * @typedef {Object} BaseRecognizerParam * @property {Model} model The language model to be used * @property {number} sampleRate The sample rate. Most models are trained at 16kHz */ /** * @typedef {Object} GrammarRecognizerParam * @property {string[]} grammar The list of sentences to be recognized. */ /** * @typedef {Object} SpeakerRecognizerParam * @property {SpeakerModel} speakerModel The SpeakerModel that will enable speaker identification */ /** * @template {SpeakerRecognizerParam | GrammarRecognizerParam} T * @typedef {T extends SpeakerRecognizerParam ? SpeakerResults & RecognitionResults : RecognitionResults} Result */ /** * @typedef {Object} PartialResults * @property {string} partial The partial sentence that have been detected until now */ /** @typedef {string[]} Grammar The list of strings to be recognized */ let soname; if (os.platform() == 'win32') { // Update path to load dependent dlls let currentPath = process.env.Path; let dllDirectory = path.resolve(path.join(__dirname, "lib", "win-x86_64")); process.env.Path = currentPath + path.delimiter + dllDirectory; soname = path.join(__dirname, "lib", "win-x86_64", "libvosk.dll") } else if (os.platform() == 'darwin') { soname = path.join(__dirname, "lib", "osx-universal", "libvosk.dylib") } else { soname = path.join(__dirname, "lib", "linux-x86_64", "libvosk.so") } const libvosk = ffi.Library(soname, { 'vosk_set_log_level': ['void', ['int']], 'vosk_model_new': [vosk_model_ptr, ['string']], 'vosk_model_free': ['void', [vosk_model_ptr]], 'vosk_spk_model_new': [vosk_spk_model_ptr, ['string']], 'vosk_spk_model_free': ['void', [vosk_spk_model_ptr]], 'vosk_recognizer_new': [vosk_recognizer_ptr, [vosk_model_ptr, 'float']], 'vosk_recognizer_new_spk': [vosk_recognizer_ptr, [vosk_model_ptr, 'float', vosk_spk_model_ptr]], 'vosk_recognizer_new_grm': [vosk_recognizer_ptr, [vosk_model_ptr, 'float', 'string']], 'vosk_recognizer_free': ['void', [vosk_recognizer_ptr]], 'vosk_recognizer_set_max_alternatives': ['void', [vosk_recognizer_ptr, 'int']], 'vosk_recognizer_set_words': ['void', [vosk_recognizer_ptr, 'bool']], 'vosk_recognizer_set_partial_words': ['void', [vosk_recognizer_ptr, 'bool']], 'vosk_recognizer_set_spk_model': ['void', [vosk_recognizer_ptr, vosk_spk_model_ptr]], 'vosk_recognizer_accept_waveform': ['bool', [vosk_recognizer_ptr, 'pointer', 'int']], 'vosk_recognizer_result': ['string', [vosk_recognizer_ptr]], 'vosk_recognizer_final_result': ['string', [vosk_recognizer_ptr]], 'vosk_recognizer_partial_result': ['string', [vosk_recognizer_ptr]], 'vosk_recognizer_reset': ['void', [vosk_recognizer_ptr]], }); /** * Set log level for Kaldi messages * @param {number} level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence. */ function setLogLevel(level) { libvosk.vosk_set_log_level(level); } /** * Build a Model from a model file. * @see models [models](https://alphacephei.com/vosk/models) */ class Model { /** * Build a Model to be used with the voice recognition. Each language should have it's own Model * for the speech recognition to work. * @param {string} modelPath The abstract pathname to the model * @see models [models](https://alphacephei.com/vosk/models) */ constructor(modelPath) { /** * Store the handle. * For internal use only * @type {unknown} */ this.handle = libvosk.vosk_model_new(modelPath); } /** * Releases the model memory * * The model object is reference-counted so if some recognizer * depends on this model, model might still stay alive. When * last recognizer is released, model will be released too. */ free() { libvosk.vosk_model_free(this.handle); } } /** * Build a Speaker Model from a speaker model file. * The Speaker Model enables speaker identification. * @see models [models](https://alphacephei.com/vosk/models) */ class SpeakerModel { /** * Loads speaker model data from the file and returns the model object * * @param {string} modelPath the path of the model on the filesystem * @see models [models](https://alphacephei.com/vosk/models) */ constructor(modelPath) { /** * Store the handle. * For internal use only * @type {unknown} */ this.handle = libvosk.vosk_spk_model_new(modelPath); } /** * Releases the model memory * * The model object is reference-counted so if some recognizer * depends on this model, model might still stay alive. When * last recognizer is released, model will be released too. */ free() { libvosk.vosk_spk_model_free(this.handle); } } /** * Helper to narrow down type while using `hasOwnProperty`. * @see hasOwnProperty [typescript issue](https://fettblog.eu/typescript-hasownproperty/) * @template {Object} Obj * @template {PropertyKey} Key * @param {Obj} obj * @param {Key} prop * @returns {obj is Obj & Record<Key, unknown>} */ function hasOwnProperty(obj, prop) { return obj.hasOwnProperty(prop) } /** * @template T * @template U * @typedef {{ [P in Exclude<keyof T, keyof U>]?: never }} Without */ /** * @template T * @template U * @typedef {(T | U) extends object ? (Without<T, U> & U) | (Without<U, T> & T) : T | U} XOR */ /** * Create a Recognizer that will be able to transform audio streams into text using a Model. * @template {XOR<SpeakerRecognizerParam, Partial<GrammarRecognizerParam>>} T extra parameter * @see Model */ class Recognizer { /** * Create a Recognizer that will handle speech to text recognition. * @constructor * @param {T & BaseRecognizerParam} param The Recognizer parameters * * Sometimes when you want to improve recognition accuracy and when you don't need * to recognize large vocabulary you can specify a list of phrases to recognize. This * will improve recognizer speed and accuracy but might return [unk] if user said * something different. * * Only recognizers with lookahead models support this type of quick configuration. * Precompiled HCLG graph models are not supported. */ constructor(param) { const { model, sampleRate } = param // Prevent the user to receive unpredictable results if (hasOwnProperty(param, 'speakerModel') && hasOwnProperty(param, 'grammar')) { throw new Error('grammar and speakerModel cannot be used together for now.') } /** * Store the handle. * For internal use only * @type {unknown} */ this.handle = hasOwnProperty(param, 'speakerModel') ? libvosk.vosk_recognizer_new_spk(model.handle, sampleRate, param.speakerModel.handle) : hasOwnProperty(param, 'grammar') ? libvosk.vosk_recognizer_new_grm(model.handle, sampleRate, JSON.stringify(param.grammar)) : libvosk.vosk_recognizer_new(model.handle, sampleRate); } /** * Releases the model memory * * The model object is reference-counted so if some recognizer * depends on this model, model might still stay alive. When * last recognizer is released, model will be released too. */ free() { libvosk.vosk_recognizer_free(this.handle); } /** Configures recognizer to output n-best results * * <pre> * { * "alternatives": [ * { "text": "one two three four five", "confidence": 0.97 }, * { "text": "one two three for five", "confidence": 0.03 }, * ] * } * </pre> * * @param max_alternatives - maximum alternatives to return from recognition results */ setMaxAlternatives(max_alternatives) { libvosk.vosk_recognizer_set_max_alternatives(this.handle, max_alternatives); } /** Configures recognizer to output words with times * * <pre> * "result" : [{ * "conf" : 1.000000, * "end" : 1.110000, * "start" : 0.870000, * "word" : "what" * }, { * "conf" : 1.000000, * "end" : 1.530000, * "start" : 1.110000, * "word" : "zero" * }, { * "conf" : 1.000000, * "end" : 1.950000, * "start" : 1.530000, * "word" : "zero" * }, { * "conf" : 1.000000, * "end" : 2.340000, * "start" : 1.950000, * "word" : "zero" * }, { * "conf" : 1.000000, * "end" : 2.610000, * "start" : 2.340000, * "word" : "one" * }], * </pre> * * @param words - boolean value */ setWords(words) { libvosk.vosk_recognizer_set_words(this.handle, words); } /** Same as above, but for partial results*/ setPartialWords(partial_words) { libvosk.vosk_recognizer_set_partial_words(this.handle, partial_words); } /** Adds speaker recognition model to already created recognizer. Helps to initialize * speaker recognition for grammar-based recognizer. * * @param spk_model Speaker recognition model */ setSpkModel(spk_model) { libvosk.vosk_recognizer_set_spk_model(this.handle, spk_model.handle); } /** * Accept voice data * * accept and process new chunk of voice data * * @param {Buffer} data audio data in PCM 16-bit mono format * @returns true if silence is occured and you can retrieve a new utterance with result method */ acceptWaveform(data) { return libvosk.vosk_recognizer_accept_waveform(this.handle, data, data.length); }; /** * Accept voice data * * accept and process new chunk of voice data * * @param {Buffer} data audio data in PCM 16-bit mono format * @returns true if silence is occured and you can retrieve a new utterance with result method */ acceptWaveformAsync(data) { return new Promise((resolve, reject) => { libvosk.vosk_recognizer_accept_waveform.async(this.handle, data, data.length, function(err, result) { if (err) { reject(err); } else { resolve(result); } }); }); }; /** Returns speech recognition result in a string * * @returns the result in JSON format which contains decoded line, decoded * words, times in seconds and confidences. You can parse this result * with any json parser * <pre> * { * "result" : [{ * "conf" : 1.000000, * "end" : 1.110000, * "start" : 0.870000, * "word" : "what" * }, { * "conf" : 1.000000, * "end" : 1.530000, * "start" : 1.110000, * "word" : "zero" * }, { * "conf" : 1.000000, * "end" : 1.950000, * "start" : 1.530000, * "word" : "zero" * }, { * "conf" : 1.000000, * "end" : 2.340000, * "start" : 1.950000, * "word" : "zero" * }, { * "conf" : 1.000000, * "end" : 2.610000, * "start" : 2.340000, * "word" : "one" * }], * "text" : "what zero zero zero one" * } * </pre> */ resultString() { return libvosk.vosk_recognizer_result(this.handle); }; /** * Returns speech recognition results * @returns {Result<T>} The results */ result() { return JSON.parse(libvosk.vosk_recognizer_result(this.handle)); }; /** * speech recognition text which is not yet finalized. * result may change as recognizer process more data. * * @returns {PartialResults} The partial results */ partialResult() { return JSON.parse(libvosk.vosk_recognizer_partial_result(this.handle)); }; /** * Returns speech recognition result. Same as result, but doesn't wait for silence * You usually call it in the end of the stream to get final bits of audio. It * flushes the feature pipeline, so all remaining audio chunks got processed. * * @returns {Result<T>} speech result. */ finalResult() { return JSON.parse(libvosk.vosk_recognizer_final_result(this.handle)); }; /** * * Resets current results so the recognition can continue from scratch */ reset() { libvosk.vosk_recognizer_reset(this.handle); } } exports.setLogLevel = setLogLevel exports.Model = Model exports.SpeakerModel = SpeakerModel exports.Recognizer = Recognizer