vosk
Version:
Node binding for continuous offline voice recognition with Vosk library.
434 lines (396 loc) • 14.4 kB
JavaScript
// @ts-check
/**
* @module vosk
*/
const os = require('os');
const path = require('path');
/** @type {import('ffi-napi')} */
const ffi = require('ffi-napi');
/** @type {import('ref-napi')} */
const ref = require('ref-napi');
// Native Vosk objects are opaque on the JavaScript side: each one is declared
// as `void` and only ever referenced through the pointer type derived from it
// with `ref.refType`. The `*_ptr` types are what the FFI signatures use.
const vosk_model = ref.types.void;
const vosk_model_ptr = ref.refType(vosk_model);
const vosk_spk_model = ref.types.void;
const vosk_spk_model_ptr = ref.refType(vosk_spk_model);
const vosk_recognizer = ref.types.void;
const vosk_recognizer_ptr = ref.refType(vosk_recognizer);
/**
* @typedef {Object} WordResult
* @property {number} conf The confidence rate in the detection. 0 for unlikely, and 1 for totally accurate.
* @property {number} start The start of the timeframe when the word is pronounced in seconds
* @property {number} end The end of the timeframe when the word is pronounced in seconds
* @property {string} word The word detected
*/
/**
* @typedef {Object} RecognitionResults
* @property {WordResult[]} result Details about the words that have been detected
* @property {string} text The complete sentence that has been detected
*/
/**
* @typedef {Object} SpeakerResults
* @property {number[]} spk A floating vector representing speaker identity. It is usually about 128 numbers which uniquely represent speaker voice.
* @property {number} spk_frames The number of frames used to extract the speaker vector. The more frames you have, the more reliable the speaker vector is.
*/
/**
* @typedef {Object} BaseRecognizerParam
* @property {Model} model The language model to be used
* @property {number} sampleRate The sample rate. Most models are trained at 16kHz
*/
/**
* @typedef {Object} GrammarRecognizerParam
* @property {string[]} grammar The list of sentences to be recognized.
*/
/**
* @typedef {Object} SpeakerRecognizerParam
* @property {SpeakerModel} speakerModel The SpeakerModel that will enable speaker identification
*/
/**
* @template {SpeakerRecognizerParam | GrammarRecognizerParam} T
* @typedef {T extends SpeakerRecognizerParam ? SpeakerResults & RecognitionResults : RecognitionResults} Result
*/
/**
* @typedef {Object} PartialResults
* @property {string} partial The partial sentence that has been detected so far
*/
/** @typedef {string[]} Grammar The list of strings to be recognized */
// Absolute path of the native Vosk library bundled for the current platform.
// Anything that is neither Windows nor macOS falls back to the Linux build.
let soname;
if (os.platform() === 'win32') {
    // Update path to load dependent dlls
    const currentPath = process.env.Path;
    const dllDirectory = path.resolve(path.join(__dirname, "lib", "win-x86_64"));
    // When Path is unset or empty, use the DLL directory alone instead of
    // concatenating the literal string "undefined" into the search path.
    process.env.Path = currentPath
        ? currentPath + path.delimiter + dllDirectory
        : dllDirectory;
    soname = path.join(__dirname, "lib", "win-x86_64", "libvosk.dll");
} else if (os.platform() === 'darwin') {
    soname = path.join(__dirname, "lib", "osx-universal", "libvosk.dylib");
} else {
    soname = path.join(__dirname, "lib", "linux-x86_64", "libvosk.so");
}
// Bindings to the native Vosk library loaded from `soname`.
// Each entry maps a native function name to `[returnType, [argumentTypes...]]`.
// NOTE(review): the Vosk C header appears to document an int result
// (1 / 0 / -1 on error) for vosk_recognizer_accept_waveform; it is declared
// as 'bool' here, so an error would presumably read as `true` — confirm
// against vosk_api.h before relying on the return value for error handling.
const libvosk = ffi.Library(soname, {
// Logging
'vosk_set_log_level': ['void', ['int']],
// Model and speaker model lifecycle
'vosk_model_new': [vosk_model_ptr, ['string']],
'vosk_model_free': ['void', [vosk_model_ptr]],
'vosk_spk_model_new': [vosk_spk_model_ptr, ['string']],
'vosk_spk_model_free': ['void', [vosk_spk_model_ptr]],
// Recognizer lifecycle: plain, with speaker model, with grammar (passed as a string)
'vosk_recognizer_new': [vosk_recognizer_ptr, [vosk_model_ptr, 'float']],
'vosk_recognizer_new_spk': [vosk_recognizer_ptr, [vosk_model_ptr, 'float', vosk_spk_model_ptr]],
'vosk_recognizer_new_grm': [vosk_recognizer_ptr, [vosk_model_ptr, 'float', 'string']],
'vosk_recognizer_free': ['void', [vosk_recognizer_ptr]],
// Recognizer configuration
'vosk_recognizer_set_max_alternatives': ['void', [vosk_recognizer_ptr, 'int']],
'vosk_recognizer_set_words': ['void', [vosk_recognizer_ptr, 'bool']],
'vosk_recognizer_set_partial_words': ['void', [vosk_recognizer_ptr, 'bool']],
'vosk_recognizer_set_spk_model': ['void', [vosk_recognizer_ptr, vosk_spk_model_ptr]],
// Audio input and results (results come back as strings)
'vosk_recognizer_accept_waveform': ['bool', [vosk_recognizer_ptr, 'pointer', 'int']],
'vosk_recognizer_result': ['string', [vosk_recognizer_ptr]],
'vosk_recognizer_final_result': ['string', [vosk_recognizer_ptr]],
'vosk_recognizer_partial_result': ['string', [vosk_recognizer_ptr]],
'vosk_recognizer_reset': ['void', [vosk_recognizer_ptr]],
});
/**
* Set log level for Kaldi messages.
* The value is forwarded unchanged to the native `vosk_set_log_level` function.
* @param {number} level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence.
* @returns {void}
*/
function setLogLevel(level) {
libvosk.vosk_set_log_level(level);
}
/**
 * Build a Model from a model file.
 * @see models [models](https://alphacephei.com/vosk/models)
 */
class Model {
    /**
     * Build a Model to be used with the voice recognition. Each language should have its own Model
     * for the speech recognition to work.
     * @param {string} modelPath The abstract pathname to the model
     * @throws {Error} If the native library could not create the model (NULL handle returned)
     * @see models [models](https://alphacephei.com/vosk/models)
     */
    constructor(modelPath) {
        const handle = libvosk.vosk_model_new(modelPath);
        // The native call reports failure by returning NULL; raise it here
        // rather than storing a handle that cannot be used.
        if (ref.isNull(handle)) {
            throw new Error(`Failed to create a model from ${modelPath}`);
        }
        /**
         * Store the handle.
         * For internal use only
         * @type {unknown}
         */
        this.handle = handle;
    }

    /**
     * Releases the model memory
     *
     * The model object is reference-counted so if some recognizer
     * depends on this model, model might still stay alive. When
     * last recognizer is released, model will be released too.
     * @returns {void}
     */
    free() {
        libvosk.vosk_model_free(this.handle);
    }
}
/**
 * Build a Speaker Model from a speaker model file.
 * The Speaker Model enables speaker identification.
 * @see models [models](https://alphacephei.com/vosk/models)
 */
class SpeakerModel {
    /**
     * Loads speaker model data from the file and returns the model object
     *
     * @param {string} modelPath the path of the model on the filesystem
     * @throws {Error} If the native library could not create the speaker model (NULL handle returned)
     * @see models [models](https://alphacephei.com/vosk/models)
     */
    constructor(modelPath) {
        const handle = libvosk.vosk_spk_model_new(modelPath);
        // The native call reports failure by returning NULL; raise it here
        // rather than storing a handle that cannot be used.
        if (ref.isNull(handle)) {
            throw new Error(`Failed to create a speaker model from ${modelPath}`);
        }
        /**
         * Store the handle.
         * For internal use only
         * @type {unknown}
         */
        this.handle = handle;
    }

    /**
     * Releases the model memory
     *
     * The model object is reference-counted so if some recognizer
     * depends on this model, model might still stay alive. When
     * last recognizer is released, model will be released too.
     * @returns {void}
     */
    free() {
        libvosk.vosk_spk_model_free(this.handle);
    }
}
/**
 * Helper to narrow down type while using `hasOwnProperty`.
 *
 * The check goes through `Object.prototype.hasOwnProperty` so that it also
 * works for objects without a prototype (`Object.create(null)`) and cannot be
 * fooled by an own property that happens to be named `hasOwnProperty`.
 * @see hasOwnProperty [typescript issue](https://fettblog.eu/typescript-hasownproperty/)
 * @template {Object} Obj
 * @template {PropertyKey} Key
 * @param {Obj} obj The object to inspect
 * @param {Key} prop The property key to look for
 * @returns {obj is Obj & Record<Key, unknown>} true when `prop` is an own property of `obj`
 */
function hasOwnProperty(obj, prop) {
    return Object.prototype.hasOwnProperty.call(obj, prop);
}
/**
* @template T
* @template U
* @typedef {{ [P in Exclude<keyof T, keyof U>]?: never }} Without
*/
/**
* @template T
* @template U
* @typedef {(T | U) extends object ? (Without<T, U> & U) | (Without<U, T> & T) : T | U} XOR
*/
/**
 * Create a Recognizer that will be able to transform audio streams into text using a Model.
 * @template {XOR<SpeakerRecognizerParam, Partial<GrammarRecognizerParam>>} T extra parameter
 * @see Model
 */
class Recognizer {
    /**
     * Create a Recognizer that will handle speech to text recognition.
     * @constructor
     * @param {T & BaseRecognizerParam} param The Recognizer parameters
     *
     * Sometimes when you want to improve recognition accuracy and when you don't need
     * to recognize large vocabulary you can specify a list of phrases to recognize. This
     * will improve recognizer speed and accuracy but might return [unk] if user said
     * something different.
     *
     * Only recognizers with lookahead models support this type of quick configuration.
     * Precompiled HCLG graph models are not supported.
     *
     * @throws {Error} If both `grammar` and `speakerModel` are provided, or if the
     * native library could not create the recognizer (NULL handle returned)
     */
    constructor(param) {
        const { model, sampleRate } = param;
        // Prevent the user to receive unpredictable results
        if (hasOwnProperty(param, 'speakerModel') && hasOwnProperty(param, 'grammar')) {
            throw new Error('grammar and speakerModel cannot be used together for now.');
        }
        // Pick the native constructor matching the extra parameter, if any.
        // The grammar is handed to the native side as a JSON string.
        const handle = hasOwnProperty(param, 'speakerModel')
            ? libvosk.vosk_recognizer_new_spk(model.handle, sampleRate, param.speakerModel.handle)
            : hasOwnProperty(param, 'grammar')
                ? libvosk.vosk_recognizer_new_grm(model.handle, sampleRate, JSON.stringify(param.grammar))
                : libvosk.vosk_recognizer_new(model.handle, sampleRate);
        // The native call reports failure by returning NULL; raise it here
        // rather than storing a handle that cannot be used.
        if (ref.isNull(handle)) {
            throw new Error('Failed to create a recognizer');
        }
        /**
         * Store the handle.
         * For internal use only
         * @type {unknown}
         */
        this.handle = handle;
    }

    /**
     * Releases the recognizer memory
     *
     * The model object is reference-counted so if some recognizer
     * depends on this model, model might still stay alive. When
     * last recognizer is released, model will be released too.
     * @returns {void}
     */
    free() {
        libvosk.vosk_recognizer_free(this.handle);
    }

    /** Configures recognizer to output n-best results
     *
     * <pre>
     *   {
     *      "alternatives": [
     *          { "text": "one two three four five", "confidence": 0.97 },
     *          { "text": "one two three for five", "confidence": 0.03 },
     *      ]
     *   }
     * </pre>
     *
     * @param {number} max_alternatives - maximum alternatives to return from recognition results
     * @returns {void}
     */
    setMaxAlternatives(max_alternatives) {
        libvosk.vosk_recognizer_set_max_alternatives(this.handle, max_alternatives);
    }

    /** Configures recognizer to output words with times
     *
     * <pre>
     *   "result" : [{
     *       "conf" : 1.000000,
     *       "end" : 1.110000,
     *       "start" : 0.870000,
     *       "word" : "what"
     *     }, {
     *       "conf" : 1.000000,
     *       "end" : 1.530000,
     *       "start" : 1.110000,
     *       "word" : "zero"
     *     }, {
     *       "conf" : 1.000000,
     *       "end" : 1.950000,
     *       "start" : 1.530000,
     *       "word" : "zero"
     *     }, {
     *       "conf" : 1.000000,
     *       "end" : 2.340000,
     *       "start" : 1.950000,
     *       "word" : "zero"
     *     }, {
     *       "conf" : 1.000000,
     *       "end" : 2.610000,
     *       "start" : 2.340000,
     *       "word" : "one"
     *     }],
     * </pre>
     *
     * @param {boolean} words - boolean value
     * @returns {void}
     */
    setWords(words) {
        libvosk.vosk_recognizer_set_words(this.handle, words);
    }

    /** Configures recognizer to output words with times, but for partial results
     *
     * @param {boolean} partial_words - boolean value
     * @returns {void}
     */
    setPartialWords(partial_words) {
        libvosk.vosk_recognizer_set_partial_words(this.handle, partial_words);
    }

    /** Adds speaker recognition model to already created recognizer. Helps to initialize
     * speaker recognition for grammar-based recognizer.
     *
     * @param {SpeakerModel} spk_model Speaker recognition model
     * @returns {void}
     */
    setSpkModel(spk_model) {
        libvosk.vosk_recognizer_set_spk_model(this.handle, spk_model.handle);
    }

    /**
     * Accept voice data
     *
     * accept and process new chunk of voice data
     *
     * @param {Buffer} data audio data in PCM 16-bit mono format
     * @returns {boolean} true if silence has occurred and you can retrieve a new utterance with result method
     */
    acceptWaveform(data) {
        // The length passed to the native side is the buffer size in bytes.
        return libvosk.vosk_recognizer_accept_waveform(this.handle, data, data.length);
    }

    /**
     * Accept voice data asynchronously
     *
     * accept and process new chunk of voice data through the `.async` variant
     * of the native binding, so the call does not run synchronously.
     *
     * @param {Buffer} data audio data in PCM 16-bit mono format
     * @returns {Promise<boolean>} resolves to true if silence has occurred and you can retrieve
     * a new utterance with result method; rejects with the error reported by the binding
     */
    acceptWaveformAsync(data) {
        return new Promise((resolve, reject) => {
            libvosk.vosk_recognizer_accept_waveform.async(this.handle, data, data.length, (err, result) => {
                if (err) {
                    reject(err);
                } else {
                    resolve(result);
                }
            });
        });
    }

    /** Returns speech recognition result in a string
     *
     * @returns {string} the result in JSON format which contains decoded line, decoded
     *          words, times in seconds and confidences. You can parse this result
     *          with any json parser
     * <pre>
     *  {
     *    "result" : [{
     *        "conf" : 1.000000,
     *        "end" : 1.110000,
     *        "start" : 0.870000,
     *        "word" : "what"
     *      }, {
     *        "conf" : 1.000000,
     *        "end" : 1.530000,
     *        "start" : 1.110000,
     *        "word" : "zero"
     *      }, {
     *        "conf" : 1.000000,
     *        "end" : 1.950000,
     *        "start" : 1.530000,
     *        "word" : "zero"
     *      }, {
     *        "conf" : 1.000000,
     *        "end" : 2.340000,
     *        "start" : 1.950000,
     *        "word" : "zero"
     *      }, {
     *        "conf" : 1.000000,
     *        "end" : 2.610000,
     *        "start" : 2.340000,
     *        "word" : "one"
     *      }],
     *    "text" : "what zero zero zero one"
     *   }
     * </pre>
     */
    resultString() {
        return libvosk.vosk_recognizer_result(this.handle);
    }

    /**
     * Returns speech recognition results
     * @returns {Result<T>} The results, parsed from the JSON string returned by the native library
     */
    result() {
        return JSON.parse(libvosk.vosk_recognizer_result(this.handle));
    }

    /**
     * speech recognition text which is not yet finalized.
     * result may change as recognizer process more data.
     *
     * @returns {PartialResults} The partial results
     */
    partialResult() {
        return JSON.parse(libvosk.vosk_recognizer_partial_result(this.handle));
    }

    /**
     * Returns speech recognition result. Same as result, but doesn't wait for silence
     * You usually call it in the end of the stream to get final bits of audio. It
     * flushes the feature pipeline, so all remaining audio chunks got processed.
     *
     * @returns {Result<T>} speech result.
     */
    finalResult() {
        return JSON.parse(libvosk.vosk_recognizer_final_result(this.handle));
    }

    /**
     * Resets current results so the recognition can continue from scratch
     * @returns {void}
     */
    reset() {
        libvosk.vosk_recognizer_reset(this.handle);
    }
}
// Public API of the module.
exports.setLogLevel = setLogLevel
exports.Model = Model
exports.SpeakerModel = SpeakerModel
exports.Recognizer = Recognizer