@solyarisoftware/voskjs
Version:
NodeJs developers API for Vosk-api speech-to-text engine.
267 lines (215 loc) • 7.09 kB
JavaScript
const fs = require('fs')
const { spawn } = require('child_process')
const SAMPLE_RATE = 16000
const BUFFER_SIZE = 4000
/**
* ffmpegArgsToPCM
* set ffmpeg args to convert any source audio file/buffer to a PCM file/buffer
*
* @function
*
* @param {String} sourceFile
* @param {String} destinationFile
*
* @see https://stackoverflow.com/questions/45899585/pipe-input-in-to-ffmpeg-stdin
* @see https://ffmpeg.org/ffmpeg-protocols.html#pipe
*
* @param {Number} sampleRate
* @param {Number} bufferSize
*
* @return {String[]} array containing the list of arguments for ffmpeg cli program
*/
const ffmpegArgsToPCM = (sourceFile, sampleRate, bufferSize, destinationFile) => [
'-loglevel',
'quiet',
'-i', sourceFile,
'-ar', String(sampleRate),
'-ac', '1',
'-f', 's16le',
'-bufsize', String(bufferSize),
destinationFile
]
/**
* toPCM
*
* @function
* @async
* @requires ffmpeg installed in your host
*
* transcode (file or buffer) audio
* from an input file or an input Buffer to a PCM audio buffer
* spawning an external ffmpeg process.
*
* Input buffer is passed to ffmpeg child process, piping stream trough stdin
* Output buffer is piped by ffmpeg child process, piping stream to stdout
* This way, using stdin/stdout I/O you skip disk I/O, avoiding filesystem!
*
* @typedef {ToPCMObject}
* @property {String} sourceFile. pipe:0 means stdin
* @property {String} destinationFile. pipe:1 means stdout
* @property {Number} sampleRate
* @property {Number} bufferSize
*
* @param {ToPCMObject} ToPCMBufferObject
* @return {Promise<Buffer>}
*
* @example
* // 1. from an input Buffer to an output Buffer
* const sourceBuffer = ...
* toPCM( {sourceBuffer} )
*
* // 2. from an input Buffer to an output file
* const destinationFile = 'file.webm.wav'
* toPCM( {sourceBuffer, destinationFile} )
*
* // 3. from an input file to an output Buffer
* const sourceFile = "file.webm"
* toPCM( {sourceFile} )
*
* // 4. from an input file to an output file
* const sourceFile = "file.webm"
* toPCM( {sourceFile, destinationFile} )
*
*/
function toPCM({sourceBuffer=undefined, sourceFile='pipe:0', destinationFile='pipe:1', sampleRate=SAMPLE_RATE, bufferSize=BUFFER_SIZE} = {}) {
return new Promise( (resolve, reject) => {
//const ffmpegStart = new Date()
let ffmpeg
try {
ffmpeg = spawn('ffmpeg', ffmpegArgsToPCM(sourceFile, sampleRate, bufferSize, destinationFile))
}
catch (error) {
return reject(`ffmpeg spawn error: ${error}`)
}
//console.log(`ffmpeg command : ffmpeg ${ffmpegArgsToPCM.join(' ')}`)
/**
* if the source Buffer is specified, write it into ffmpeg process stdin avoiding disk I/O.
*
* @see
* https://stackoverflow.com/questions/30937751/pass-a-buffer-to-a-node-js-child-process
* https://stackoverflow.com/questions/30943250/pass-buffer-to-childprocess-node-js?noredirect=1&lq=1
*/
if (sourceBuffer)
ffmpeg.stdin.end(sourceBuffer)
//let bufferDataItem = 1
// allocate a buffer to contain PCM data collected from stdout
let buffer = Buffer.alloc(0)
ffmpeg.stdout.on('data', (stdout) => {
buffer = Buffer.concat([buffer, stdout])
//console.log()
//console.log(`data item : ${bufferDataItem++}`)
//console.log(`data size : ${stdout.length}`)
//console.log(`ffmpeg item latency : ${new Date() - ffmpegStart}ms`)
})
ffmpeg.on('close', () => resolve(buffer))
})
}
/**
* audio duration in seconds (of a pcm buffer)
*
* @function
* @public
*
* @see https://stackoverflow.com/questions/62553574/calculate-duration-of-arraybuffer-without-audio-api
*
* @param {Buffer} arrayBuffer
* @param {Number} numChannels
* @param {Number} sampleRate
* @param {Number} bytesPerSample
*
* @return {Number} duration in seconds
*
*/
function duration(arrayBuffer, numChannels=1, sampleRate=16000, bytesPerSample=2) {
// total samples/frames
const totalSamples = arrayBuffer.byteLength / bytesPerSample / numChannels
// total seconds
return totalSamples / sampleRate
}
/**
* Real-Time Factor
* Real-Time Factor (RTF) is defined as processing-time / length-of-audio.
* The exact real-time factor of an STT model will depend on the hardware setup, so you may experience a different RTF.
*
* @param {Number} processingTime (secs)
* @param {Number} audioLength (secs)
* @param {Number} decimals number of decimals precision
* @return {Number} calculated value as floating number
*/
const rtf = (processingTime, audioLenght, decimals=2) =>
+((processingTime / audioLenght).toFixed(decimals))
/**
* aproxKiloBytes
* pretty print an audio length in KB
*
* @function
* @public
*
* @param {Number} duration
* @return {String}
*
* @example
* aproxKiloBytes(63345) // => '63345~63Kb'
*
*/
const aproxKiloBytes = (bytes) =>
`${bytes}~${Math.round(bytes / 1024)}K`
/**
* aproxSecs
* pretty print an audio duration in seconds
*
* @function
* @public
*
* @param {Number} duration
* @return {String}
*
* @example
* msecsSecs(1.23445678) // => '1234ms~1s'
*
*/
const aproxSecs = (duration) =>
`${Math.round(duration*1000)}ms~${Math.round(duration)}s`
const byteLength = (buffer) =>
buffer.byteLength
// test
async function main() {
const sourceFile = '../audio/2830-3980-0043.wav.webm'
/*
console.log()
console.log('TEST 1: ffmpeg arguments for PCM output ')
const args = toPCMffmpegArgs({sourceFile})
console.log(`ffmpeg command arguments: ffmpeg ${args.join(' ')}`)
*/
console.log()
console.log('TEST: input Buffer (any audio format), output Buffer (PCM format)')
console.log('-----------------------------------------------------------------')
const readFileSyncStart = new Date()
// https://nodejs.org/api/fs.html#fs_file_system_flags
const sourceBuffer = fs.readFileSync(sourceFile, { flag: 'rs+' } )
const readFileSyncStop = new Date()
console.log(`source file : ${sourceFile}`)
console.log(`readFileSync latency : ${readFileSyncStop - readFileSyncStart}ms`)
const toPCMStart = new Date()
const destinationBuffer = await toPCM( { sourceFile /*, sourceBuffer*/ } )
const toPCMStop = new Date()
const latencyMsecs = toPCMStop - toPCMStart
const bufferLenght = byteLength(destinationBuffer)
const durationSecs = duration(destinationBuffer)
const durationMsecs = durationSecs*1000
const realtimeFactor = rtf(latencyMsecs, durationMsecs)
console.log(`toPCM latency : ${latencyMsecs}ms`)
console.log(`audio PCM buffer length : ${aproxKiloBytes(bufferLenght)}`)
console.log(`audio duration : ${aproxSecs(durationSecs)}`)
console.log(`audio real time factor (RTF) : ${realtimeFactor}`)
}
if (require.main === module)
main()
module.exports = {
duration,
aproxSecs,
aproxKiloBytes,
byteLength,
rtf,
toPCM
}