UNPKG

@solyarisoftware/voskjs

Version:

NodeJs developers API for Vosk-api speech-to-text engine.

775 lines (606 loc) 22.7 kB
#!/usr/bin/env node /** * @module voskjs * * @public * @function logLevel * @function loadModel * @function transcript * @function freeModel * -* @see VoskAPI https://github.com/alphacep/vosk-api/blob/master/nodejs/index.js * @see https://github.com/alphacep/vosk-api/blob/master/nodejs/index.js#L207 * @see https://www.tutorialsteacher.com/nodejs/nodejs-eventemitter * util.inherits(vosk.Recognizer, emitter) */ const fs = require('fs') const util = require('util') const emitter = require('events').EventEmitter const { Readable } = require('stream') const wav = require('wav') const vosk = require('vosk') const { info } = require('./lib/info') const { getArgs } = require('./lib/getArgs') const { setTimer, getTimer, unixTimeMsecs } = require('./lib/chronos') /** * @constant */ const SAMPLE_RATE = 16000 const PARTIAL_RESULT_EVENT = 'partial' const END_OF_SPEECH_EVENT = 'endOfSpeech' const FINAL_RESULT_EVENT = 'final' /** * @function logLevel * @public * Set log level for Vosk/Kaldi log messages * * @param {number} level The higher, the more verbose. 0 for infos and errors. Less than 0 for silence. */ function logLevel(level=0) { // set vosk log level vosk.setLogLevel(level) } /** * @function loadModel * Create a run time model from the specified directory * * @public * * @param {String} modelDirectory directory name of the Vosk model * * @typedef ModelObject * @property {VoskModel} model run time model object returned by Vosk engine. * @property {Number} latency elpased time in msecs * * @return {promise<ModelObject>} * */ function loadModel(modelDirectory) { // check if model directory exists, async fs.access(modelDirectory, (err) => { if (err) throw `${err}: file ${modelDirectory} not found.` }) // create new run time model from the specified directory // TODO try/catch? const model = new vosk.Model(modelDirectory) return model } /** * @function createRecognizer * * Create a run time Vosk recognizer * * @typedef VoskRecognizerArgsObject * @property {Boolean} multiThreads if true, an external (Vosk engine) thread is spawned on the fly * that need in server (concurrent requests) architecture. * @property {Number} sampleRate Default value: 16000 * @property {String[]} grammar array of words, or sentences * @property {Number} alternatives maximum alternatives to return from recognition results * @property {Boolean} words if true, recognizer result will include word by word details * @example * { * result: [ * { conf: 1, end: 1.02, start: 0.36, word: 'experience' }, * { conf: 1, end: 1.35, start: 1.02, word: 'proves' }, * { conf: 1, end: 1.74, start: 1.35, word: 'this' } * ], * text: 'experience proves this' * } * * @param {ModelObject} model the Vosk model returned by InitModel() * @param {VoskRecognizerArgsObject} [options] Vosk Recognizer arguments setting. Optional * @return {VoskRecognizerObject} * */ function createRecognizer(model, { sampleRate=SAMPLE_RATE, grammar=null, alternatives=0, words=true } = {}) { // if a grammar is specified, pass it to the Vosk Recognizer const voskRecognizerArgs = grammar ? {model, sampleRate, grammar} : {model, sampleRate} // create Vosk recognizer // TODO try/catch? const recognizer = new vosk.Recognizer(voskRecognizerArgs) if ( alternatives ) recognizer.setMaxAlternatives(alternatives) recognizer.setWords(words) return recognizer } /** * @function transcriptFromFile * speech recognition into a text, from an audio file, given a specified Vosk model * * @alias transcript * @public * @async * * @param {String} fileName the name of speech file, in WAV format * @param {ModelObject} model the Vosk model returned by InitModel() * @param {VoskRecognizerArgsObject} [options] Vosk Recognizer arguments setting. Optional. * * @return {Promise<VoskResultObject>} transcript object returned by Vosk engine * */ async function transcriptFromFile(fileName, model, { multiThreads=true, sampleRate=SAMPLE_RATE, grammar=null, alternatives=0, words=true } = {}) { const DEBUG = false return new Promise( (resolve, reject) => { // validate audiofile existence, async fs.access(fileName, (err) => { if (err) return reject(`${err}: file ${fileName} not found.`) }) if (DEBUG) setTimer('createRecognizer') const recognizer = createRecognizer( model, {sampleRate, grammar, alternatives, words} ) if (DEBUG) console.log(`recognizer latency : ${getTimer('createRecognizer')}ms`) const wfStream = fs.createReadStream(fileName, {'highWaterMark': 4096}) const wfReader = new wav.Reader() wfStream.pipe(wfReader) const pcmChunks = new Readable().wrap(wfReader) wfReader.on('format', async ( { audioFormat, sampleRate, channels } ) => { if (audioFormat != 1 || channels != 1) return reject(`${fileName}: audio file (sample rate: ${sampleRate}) must be WAV format mono PCM.`) for await (const data of pcmChunks) { // // WARNING // From vosk version 0.3.25 // the acceptWaveformAsync function runs in a dedicated thread. // That wold improve performances in case of concurrent requests // from the caller (server) program // // Previous vosk version 0.3.25 // const end_of_speech = recognizer.acceptWaveform(data) // const end_of_speech = multiThreads ? await recognizer.acceptWaveformAsync(data) : recognizer.acceptWaveform(data) // // WARNING // 1. AcceptWaveform returns true when silence is detected and you can retrieve the result with Result(). // 2. If silence is not detected you can retrieve PartialResult() only. // 3. FinalResult means the stream is ended, you flush the buffers and retrieve remaining result. // By Nicolay Shmirev. See: https://github.com/alphacep/vosk-api/issues/590#issuecomment-863065813 // if (end_of_speech) { // End of speech means silence detected. // We want to transcript all the audio so the processing continue until the end. // debug //console.log('DEBUG', END_OF_SPEECH_EVENT, recognizer.result()) continue } //else // console.log('partialResult', recognizer.partialResult()) } // copy final Vosk engine result object const result = {...recognizer.finalResult(recognizer)} recognizer.free() return resolve(result) }) }) } /** * @function transcriptEventsFromFile * * speech recognition into a text, from an audio file, given a specified Vosk model, * return an events emitter * * @public * * @param {String} fileName the name of speech file, in WAV format * @param {ModelObject} model the Vosk model returned by InitModel() * @param {VoskRecognizerArgsObject} [options] Vosk Recognizer arguments setting. Optional. * * @return {Emitter} emit events * */ function transcriptEventsFromFile(fileName, model, { multiThreads=true, sampleRate=SAMPLE_RATE, grammar=null, alternatives=0, words=true } = {}) { const DEBUG = false // validate audiofile existence, async fs.access(fileName, (err) => { if (err) throw (`${err}: file ${fileName} not found.`) }) if (DEBUG) setTimer('createRecognizer') // the function is enabled to emit events const event = new emitter() const recognizer = createRecognizer( model, {sampleRate, grammar, alternatives, words} ) if (DEBUG) console.log(`recognizer latency : ${getTimer('createRecognizer')}ms`) const wfStream = fs.createReadStream(fileName, {'highWaterMark': 4096}) const wfReader = new wav.Reader() wfStream.pipe(wfReader) const pcmChunks = new Readable().wrap(wfReader) wfReader.on('format', async ( { audioFormat, sampleRate, channels } ) => { if (audioFormat != 1 || channels != 1) throw (`${fileName}: audio file (sample rate: ${sampleRate}) must be WAV format mono PCM.`) const lastPartialResult = {} for await (const data of pcmChunks) { // // WARNING // From vosk version 0.3.25 // the acceptWaveformAsync function runs in a dedicated thread. // That wold improve performances in case of concurrent requests // from the caller (server) program // // Previous vosk version 0.3.25 // const end_of_speech = recognizer.acceptWaveform(data) // const end_of_speech = multiThreads ? await recognizer.acceptWaveformAsync(data) : recognizer.acceptWaveform(data) // // WARNING // Emit partial result events // 1. AcceptWaveform returns true when silence is detected and you can retrieve the result with Result(). // 2. If silence is not detected you can retrieve PartialResult() only. // 3. FinalResult means the stream is ended, you flush the buffers and retrieve remaining result. // By Nicolay Shmirev. See: https://github.com/alphacep/vosk-api/issues/590#issuecomment-863065813 // if (end_of_speech) event.emit(END_OF_SPEECH_EVENT, recognizer.result()) else { const partialResult = recognizer.partialResult() // Doesn't emit duplicated events: // Is the current partialResult different form the last one? // If true a new event is emiitted. // This avoid "duplicated" events to be emitted if (partialResult.partial !== lastPartialResult.partial) { event.emit(PARTIAL_RESULT_EVENT, partialResult) lastPartialResult.partial = partialResult.partial } } } // // WARNING // Emit partial result events // 1. AcceptWaveform returns true when silence is detected and you can retrieve the result with Result(). // 2. If silence is not detected you can retrieve PartialResult() only. // 3. FinalResult means the stream is ended, you flush the buffers and retrieve remaining result. // By Nicolay Shmirev. See: https://github.com/alphacep/vosk-api/issues/590#issuecomment-863065813 // event.emit(FINAL_RESULT_EVENT, recognizer.finalResult(recognizer)) recognizer.free() }) return event } /** * @function transcriptFromBuffer * speech recognition into a text, from an audio file, given a specified Vosk model * * @alias transcript * @public * @async * * @param {Buffer} buffer input buffer, in PCM format * @param {ModelObject} model the Vosk model returned by InitModel() * @param {VoskRecognizerArgsObject} [options] Vosk Recognizer arguments setting. Optional. * * @return {Promise<VoskResultObject>} transcript object returned by Vosk engine * */ async function transcriptFromBuffer(buffer, model, { multiThreads=true, sampleRate=SAMPLE_RATE, grammar=null, alternatives=0, words=true } = {}) { const recognizer = createRecognizer( model, {sampleRate, grammar, alternatives, words} ) // https://gist.github.com/wpscholar/270005d42b860b1c33cf5ab25b37928a // https://stackoverflow.com/questions/47089230/how-to-convert-buffer-to-stream-in-nodejs // // WARNING // From vosk version 0.3.25 // the acceptWaveformAsync function runs in a dedicated thread. // That wold improve performances in case of cocurrent requests // from the caller (server) program // // Previous vosk version 0.3.25 // const end_of_speech = recognizer.acceptWaveform(data) // if ( multiThreads ) await recognizer.acceptWaveformAsync(buffer) else recognizer.acceptWaveform(buffer) // copy final Vosk engine result object const result = {...recognizer.finalResult(recognizer)} recognizer.free() return Promise.resolve(result) } /** * @function transcriptEventsFromBuffer * speech recognition into a text, from an audio file, given a specified Vosk model * * @alias transcript * @public * @async * * @param {Buffer} buffer input buffer, in PCM format * @param {ModelObject} model the Vosk model returned by InitModel() * @param {VoskRecognizerArgsObject} [options] Vosk Recognizer arguments setting. Optional. * * @return {Emitter} * */ async function transcriptEventsFromBuffer(buffer, model, { multiThreads=true, sampleRate=SAMPLE_RATE, grammar=null, alternatives=0, words=true } = {}) { // the function is enabled to emit events const event = new emitter() const recognizer = createRecognizer( model, {sampleRate, grammar, alternatives, words} ) // https://gist.github.com/wpscholar/270005d42b860b1c33cf5ab25b37928a // https://stackoverflow.com/questions/47089230/how-to-convert-buffer-to-stream-in-nodejs const lastPartialResult = {} for(;;) { // // WARNING // From vosk version 0.3.25 // the acceptWaveformAsync function runs in a dedicated thread. // That wold improve performances in case of cocurrent requests // from the caller (server) program // // Previous vosk version 0.3.25 // const end_of_speech = recognizer.acceptWaveform(data) // const end_of_speech = multiThreads ? await recognizer.acceptWaveformAsync(buffer) : recognizer.acceptWaveform(buffer) // // WARNING // Emit partial result events // 1. AcceptWaveform returns true when silence is detected and you can retrieve the result with Result(). // 2. If silence is not detected you can retrieve PartialResult() only. // 3. FinalResult means the stream is ended, you flush the buffers and retrieve remaining result. // By Nicolay Shmirev. See: https://github.com/alphacep/vosk-api/issues/590#issuecomment-863065813 // if (end_of_speech) { event.emit(END_OF_SPEECH_EVENT, recognizer.result()) continue } else { const partialResult = recognizer.partialResult() // Doesn't emit duplicated events: // Is the current partialResult different form the last one? // If true a new event is emiitted. // This avoid "duplicated" events to be emitted if (partialResult.partial !== lastPartialResult.partial) { event.emit(PARTIAL_RESULT_EVENT, partialResult) lastPartialResult.partial = partialResult.partial } break } } // // WARNING // if end_Of_Speech is detected (the buffere contains a sentence followed by a silence) // and the result() function is called, so // the finalResult() contains just the remaining (last) part of the sentence before the end of the audio. // It's up to user to collect events data assempling the final textual (multisentence) result. // event.emit(FINAL_RESULT_EVENT, recognizer.finalResult()) recognizer.free() return event } /** * @function freeModel * @public * * @param {ModelObject} model * */ function freeModel(model) { model.free() } /** * test section */ function helpAndExit() { console.log('voskjs is a CLI utility to test Vosk-api features') console.log (info()) console.log() console.log('Usage') console.log() console.log(' voskjs \\ ') console.log(' --model=<model directory> \\ ') console.log(' --audio=<audio file name> \\ ') console.log(' [--grammar=<list of comma-separated words or sentences>] \\ ') console.log(' [--samplerate=<Number, usually 16000 or 8000>] \\ ') console.log(' [--alternatives=<number of max alternatives in text result>] \\ ') console.log(' [--textonly] \\ ') console.log(' [--tableevents] \\ ') console.log(' [--objectevents] \\ ') console.log(' [--debug=<Vosk debug level>] ') console.log() console.log('Examples') console.log() console.log(' 1. Recognize a speech file using a specific model directory:') console.log() console.log(' voskjs --audio=audio/2830-3980-0043.wav --model=models/vosk-model-en-us-aspire-0.2') console.log() console.log(' 2. Recognize a speech file setting a grammar (with a dynamic graph model) and a number of alternative:') console.log() console.log(' voskjs \\ ') console.log(' --audio=audio/2830-3980-0043.wav \\ ') console.log(' --model=models/vosk-model-small-en-us-0.15 \\ ') console.log(' --grammar="experience proves this, bla bla bla"') console.log(' --alternatives=3') console.log() process.exit(1) } /** * @function checkArgs * command line parsing * * @param {String} args * * @typedef {Object} SentenceAndAttributes * @property {String} language * @returns {SentenceAndAttributes} * */ function checkArgs(args) { // mandatory arguments const modelDirectory = args.model const audioFile = args.audio // optional arguments const grammar = args.grammar const sampleRate = args.samplerate const alternatives = args.alternatives const textOnly = args.textonly const tableevents = args.tableevents const objectevents = args.objectevents // if not specified, set default Vosk debug level to -1 (silent mode) const debug = args.debug ? args.debug : -1 if ( !modelDirectory ) helpAndExit() if ( !audioFile ) helpAndExit() return { modelDirectory, audioFile, // if grammar args is present, as comma separated sentences, // convert it in an array of strings grammar: grammar ? grammar.split(',').map(sentence => sentence.trim()) : undefined, // convert to Number sampleRate: sampleRate ? +sampleRate : undefined, alternatives, textOnly, tableevents, objectevents, debug } } function printObject(object, args) { const defaultArgs = {showHidden:false, breakLength:Infinity, depth:null, colors:true} return util.inspect(object, {...defaultArgs, ...args } ) } function printResultsAsTable(results) { console.log ('Events table:') console.log () console.log('| %s | %s | %s |', 'time'.padEnd(6), 'event'.padEnd(12), 'text'.padEnd(40) ) console.log('| %s | %s | %s |', '-'.repeat(6), '-'.repeat(12), '-'.repeat(40) ) for (const result of results) { if ( result.event === PARTIAL_RESULT_EVENT ) console.log('| %s | %s | %s', result.time.toString().padStart(6), result.event.padEnd(12), result.data.partial //.padStart(11+6+2+1) ) if ( result.event === END_OF_SPEECH_EVENT ) console.log('| %s | %s | %s', result.time.toString().padStart(6), result.event.padEnd(12), result.data.text //.padStart(11+6+2+1) ) if ( result.event === FINAL_RESULT_EVENT ) console.log('| %s | %s | %s', result.time.toString().padStart(6), result.event.padEnd(12), result.data.text //.padStart(11+6+2+1) ) } console.log() } function statistics(modelDirectory, audioFile, grammar, sampleRate, alternatives, textOnly, tableevents, objectevents, debug, sentences) { console.log('voskjs is a CLI utility to test Vosk-api features') console.log (info()) console.log() console.log('Statistics:') console.log() console.log(`model directory : ${modelDirectory}`) console.log(`speech file name : ${audioFile}`) console.log(`grammar : ${grammar ? grammar : 'not specified. Default: NO'}`) console.log(`sample rate : ${sampleRate ? sampleRate : 'not specified. Default: 16000'}`) console.log(`max alternatives : ${alternatives}`) console.log(`text only / JSON : ${textOnly ? 'text' : 'JSON'}`) console.log(`Vosk debug level : ${debug}`) console.log() console.log(`load model latency : ${getTimer('loadModel')}ms`) console.log(`transcript latency : ${getTimer('transcript')}ms`) console.log(`transcript text : ${sentences.join(' ')}`) console.log() } /** * @function main * unit test */ async function main() { // get command line arguments const { args } = getArgs() const { modelDirectory, audioFile, grammar, sampleRate, alternatives, textOnly, tableevents, objectevents, debug } = checkArgs(args) const words = ! textOnly // set the vosk log level to silence logLevel(debug) setTimer('loadModel') // load in memory a Vosk directory model const model = loadModel(modelDirectory) setTimer('transcript') const startSentenceTimer = unixTimeMsecs() let transcriptEvents const sentences = [] const results = [] // speech recognition from an audio file try { transcriptEvents = transcriptEventsFromFile(audioFile, model, {grammar, sampleRate, alternatives, words}) } catch(error) { console.error(error) } transcriptEvents.on(PARTIAL_RESULT_EVENT, data => { if ( !textOnly ) { const dataItem = { time: unixTimeMsecs() - startSentenceTimer, event: PARTIAL_RESULT_EVENT, data } results.push(dataItem) } }) transcriptEvents.on(END_OF_SPEECH_EVENT, data => { sentences.push(data.text) if ( ! textOnly ) { const dataItem = { time: unixTimeMsecs() - startSentenceTimer, event: END_OF_SPEECH_EVENT, data } results.push(dataItem) } }) transcriptEvents.on(FINAL_RESULT_EVENT, data => { sentences.push(data.text) if ( textOnly ) { console.log(sentences.join(' ')) } else { const dataItem = { time: unixTimeMsecs() - startSentenceTimer, event: FINAL_RESULT_EVENT, data } results.push(dataItem) if ( !textOnly ) statistics(modelDirectory, audioFile, grammar, sampleRate, alternatives, textOnly, tableevents, objectevents, debug, sentences) if ( !textOnly && tableevents) printResultsAsTable(results) if ( !textOnly && objectevents) console.log(printObject(results, {breakLength:80})) } }) // free the runtime model freeModel(model) } if (require.main === module) main() module.exports = { logLevel, loadModel, transcriptFromBuffer, transcriptEventsFromBuffer, transcriptFromFile, transcriptEventsFromFile, //transcript: transcriptFromFile, // alias freeModel }