UNPKG

watson-speech

Version:

IBM Watson Speech to Text and Text to Speech SDK for web browsers.

257 lines (222 loc) 10.3 kB
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>JSDoc: Source: speech-to-text/webaudio-l16-stream.js</title>

    <script src="scripts/prettify/prettify.js"> </script>
    <script src="scripts/prettify/lang-css.js"> </script>
    <!--[if lt IE 9]>
      <script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
    <![endif]-->
    <link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
    <link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>

<body>

<div id="main">

    <h1 class="page-title">Source: speech-to-text/webaudio-l16-stream.js</h1>

    <section>
        <article>
            <pre class="prettyprint source linenums"><code>'use strict';
var Transform = require('stream').Transform;
var util = require('util');
var defaults = require('defaults');

var TARGET_SAMPLE_RATE = 16000;

// Coefficients of the FIR (finite impulse response) low-pass filter that downsample() applies to the input.
// Kept at module level so the array is built once instead of on every chunk.
var LOW_PASS_FILTER = [
  -0.037935,
  -0.00089024,
  0.040173,
  0.019989,
  0.0047792,
  -0.058675,
  -0.056487,
  -0.0040653,
  0.14527,
  0.26927,
  0.33913,
  0.26927,
  0.14527,
  -0.0040653,
  -0.056487,
  -0.058675,
  0.0047792,
  0.019989,
  0.040173,
  -0.00089024,
  -0.037935
];

/**
 * Transforms Buffers or AudioBuffers into a binary stream of l16 (raw wav) audio, downsampling in the process.
 *
 * The watson speech-to-text service works on 16kHz and internally downsamples audio received at higher samplerates.
 * WebAudio is usually 44.1kHz or 48kHz, so downsampling here reduces bandwidth usage by ~2/3.
 *
 * Format event + stream can be combined with https://www.npmjs.com/package/wav to generate a wav file with a proper header
 *
 * Todo: support multi-channel audio (for use with &lt;audio>/&lt;video> elements) - will require interleaving audio channels
 *
 * @param {Object} options
 * @param {Number} [options.sourceSampleRate=48000] - sample rate of the incoming audio; overwritten with the
 *   AudioBuffer's own sampleRate when the stream runs in object mode
 * @param {Boolean} [options.downsample=true] - set to false to skip downsampling and only convert to 16-bit PCM
 * @constructor
 */
function WebAudioL16Stream(options) {
  options = this.options = defaults(options, {
    sourceSampleRate: 48000,
    downsample: true
  });

  Transform.call(this, options);

  // input samples that the previous downsample() call could not consume yet
  this.bufferUnusedSamples = [];

  if (options.objectMode || options.writableObjectMode) {
    // the first AudioBuffer supplies the real sample rate, so 'format' is emitted when it arrives
    this._transform = this.handleFirstAudioBuffer;
  } else {
    this._transform = this.transformBuffer;
    // emitted on the next tick, presumably so that callers can attach a 'format' listener first
    process.nextTick(this.emitFormat.bind(this));
  }
}
util.inherits(WebAudioL16Stream, Transform);

/**
 * Emits a 'format' event describing the audio this stream outputs:
 * mono, signed 16-bit integer samples, at 16 kHz when downsampling is enabled or at the source rate otherwise.
 */
WebAudioL16Stream.prototype.emitFormat = function emitFormat() {
  this.emit('format', {
    channels: 1,
    bitDepth: 16,
    sampleRate: this.options.downsample ? TARGET_SAMPLE_RATE : this.options.sourceSampleRate,
    signed: true,
    float: false
  });
};

/**
 * Downsamples WebAudio to 16 kHz.
 *
 * Browsers can downsample WebAudio natively with OfflineAudioContexts but they were designed for non-streaming use
 * and require a new context for each AudioBuffer. Firefox can handle this, but chrome (v47) crashes after a few minutes.
 * So, we'll do it in JS for now.
 *
 * This really belongs in its own stream, but there's no way to create new AudioBuffer instances from JS, so it's
 * fairly coupled to the wav conversion code.
 *
 * Input samples that cannot be filtered yet (the tail of the chunk) are stored in this.bufferUnusedSamples and
 * prepended to the next chunk, so consecutive chunks are processed as one continuous signal.
 *
 * @param {Float32Array} bufferNewSamples Microphone/MediaElement audio samples (one channel)
 * @return {Float32Array} the filtered samples at 16 kHz; empty when there is not yet enough input to filter
 */
WebAudioL16Stream.prototype.downsample = function downsample(bufferNewSamples) {
  var filter = LOW_PASS_FILTER;
  var unusedSamples = this.bufferUnusedSamples.length;
  var buffer = bufferNewSamples;
  var i;
  var j;
  var offset;
  var sample;

  // prepend whatever the previous call left over
  if (unusedSamples > 0) {
    buffer = new Float32Array(unusedSamples + bufferNewSamples.length);
    buffer.set(this.bufferUnusedSamples, 0);
    buffer.set(bufferNewSamples, unusedSamples);
  }

  // Not enough input for a single filter window: keep a copy of everything for the next call.
  // (Without this guard the output length below can become negative, which makes Float32Array throw.)
  if (buffer.length &lt; filter.length) {
    this.bufferUnusedSamples = new Float32Array(buffer);
    return new Float32Array(0);
  }

  // Downsampling and low-pass filter:
  // Input audio is typically 44.1kHz or 48kHz, this downsamples it to 16kHz.
  // It uses a FIR (finite impulse response) Filter to remove (or, at least attenuate)
  // audio frequencies > ~8kHz because sampled audio cannot accurately represent
  // frequencies greater than half of the sample rate.
  // (Human voice tops out at &lt; 4kHz, so nothing important is lost for transcription.)
  // See http://dsp.stackexchange.com/a/37475/26392 for a good explanation of this code.
  var samplingRateRatio = this.options.sourceSampleRate / TARGET_SAMPLE_RATE;
  var nOutputSamples = Math.floor((buffer.length - filter.length) / samplingRateRatio) + 1;
  var outputBuffer = new Float32Array(nOutputSamples);

  // One iteration per OUTPUT sample. The input position advances by samplingRateRatio each time, so bounding
  // the loop by the input length would run past the end of both buffers and lose track of the unused tail.
  for (i = 0; i &lt; nOutputSamples; i++) {
    offset = Math.round(samplingRateRatio * i);
    sample = 0;
    for (j = 0; j &lt; filter.length; ++j) {
      sample += buffer[offset + j] * filter[j];
    }
    outputBuffer[i] = sample;
  }

  // The next output sample would start here; everything from this index on is carried over to the next call.
  // Clamped because a very large ratio can place the index beyond the end of the buffer.
  var indexSampleAfterLastUsed = Math.min(Math.round(samplingRateRatio * nOutputSamples), buffer.length);
  var remaining = buffer.length - indexSampleAfterLastUsed;
  this.bufferUnusedSamples = new Float32Array(remaining);
  for (i = 0; i &lt; remaining; ++i) {
    this.bufferUnusedSamples[i] = buffer[indexSampleAfterLastUsed + i];
  }

  return outputBuffer;
};

/**
 * Accepts a Float32Array of audio data and converts it to a Buffer of l16 audio data (raw wav)
 *
 * Explanation for the math: The raw values captured from the Web Audio API are
 * in 32-bit Floating Point, between -1 and 1 (per the specification).
 * The values for 16-bit PCM range between -32768 and +32767 (16-bit signed integer).
 * Clamp each sample to [-1, 1], then multiply by 0x8000 (negative) or 0x7FFF (positive) to convert.
 * Store in little endian.
 *
 * @param {Float32Array} input
 * @return {Buffer}
 */
WebAudioL16Stream.prototype.floatTo16BitPCM = function(input) {
  var output = new DataView(new ArrayBuffer(input.length * 2)); // length is in bytes (8-bit), so *2 to get 16-bit length
  for (var i = 0; i &lt; input.length; i++) {
    // Clamp first: the low-pass filter can overshoot slightly, and setInt16 wraps out-of-range values around
    // (a loud peak would flip sign) instead of clipping them.
    var sample = Math.max(-1, Math.min(1, input[i]));
    var multiplier = sample &lt; 0 ? 0x8000 : 0x7fff; // 16-bit signed range is -32768 to 32767
    output.setInt16(i * 2, (sample * multiplier) | 0, true); // index, value, little endian
  }
  return Buffer.from(output.buffer);
};

/**
 * Does some one-time setup to grab sampleRate and emit format, then sets _transform to the actual audio buffer handler and calls it.
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.handleFirstAudioBuffer = function handleFirstAudioBuffer(audioBuffer, encoding, next) {
  this.options.sourceSampleRate = audioBuffer.sampleRate;
  this.emitFormat();
  this._transform = this.transformAudioBuffer;
  this._transform(audioBuffer, encoding, next);
};

/**
 * Accepts an AudioBuffer (for objectMode), then downsamples to 16000 and converts to a 16-bit pcm
 *
 * Only the first channel of the AudioBuffer is used.
 *
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformAudioBuffer = function(audioBuffer, encoding, next) {
  var source = audioBuffer.getChannelData(0);
  if (this.options.downsample) {
    source = this.downsample(source);
  }
  this.push(this.floatTo16BitPCM(source));
  next();
};

/**
 * Reinterprets the bytes of a Buffer as 32-bit float samples (platform byte order).
 *
 * A Buffer can be a view into a larger ArrayBuffer (Node.js pools small Buffers, and slices share memory with
 * their parent), so only the Buffer's own byteOffset/length range is used. Trailing bytes that do not make up
 * a whole sample are ignored.
 *
 * @param {Buffer} nodebuffer
 * @return {Float32Array}
 */
function bufferToFloat32Array(nodebuffer) {
  var bytesPerSample = Float32Array.BYTES_PER_ELEMENT;
  var nSamples = Math.floor(nodebuffer.length / bytesPerSample);
  if (nodebuffer.byteOffset % bytesPerSample === 0) {
    return new Float32Array(nodebuffer.buffer, nodebuffer.byteOffset, nSamples);
  }
  // a Float32Array view must start on a 4-byte boundary, so unaligned data is copied to a fresh ArrayBuffer
  var aligned = new Uint8Array(nSamples * bytesPerSample);
  aligned.set(nodebuffer.subarray(0, aligned.length));
  return new Float32Array(aligned.buffer);
}

/**
 * Accepts a Buffer of raw 32-bit float samples (for binary mode), then downsamples to 16000 and converts to a 16-bit pcm
 *
 * @param {Buffer} nodebuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformBuffer = function(nodebuffer, encoding, next) {
  var source = bufferToFloat32Array(nodebuffer);
  if (this.options.downsample) {
    source = this.downsample(source);
  }
  this.push(this.floatTo16BitPCM(source));
  next();
};

module.exports = WebAudioL16Stream;
</code></pre>
        </article>
    </section>

</div>

<nav>
    <h2><a href="index.html">Home</a></h2>
    <h3>Modules</h3><ul><li><a href="module-watson-speech.html">watson-speech</a></li><li><a href="module-watson-speech_speech-to-text.html">watson-speech/speech-to-text</a></li><li><a href="module-watson-speech_speech-to-text_get-models.html">watson-speech/speech-to-text/get-models</a></li><li><a href="module-watson-speech_speech-to-text_recognize-file.html">watson-speech/speech-to-text/recognize-file</a></li><li><a href="module-watson-speech_speech-to-text_recognize-microphone.html">watson-speech/speech-to-text/recognize-microphone</a></li><li><a href="module-watson-speech_text-to-speech.html">watson-speech/text-to-speech</a></li><li><a href="module-watson-speech_text-to-speech_get-voices.html">watson-speech/text-to-speech/get-voices</a></li><li><a href="module-watson-speech_text-to-speech_synthesize.html">watson-speech/text-to-speech/synthesize</a></li></ul>
    <h3>Classes</h3><ul><li><a href="FilePlayer.html">FilePlayer</a></li><li><a href="FormatStream.html">FormatStream</a></li><li><a href="RecognizeStream.html">RecognizeStream</a></li><li><a href="ResultStream.html">ResultStream</a></li><li><a href="SpeakerStream.html">SpeakerStream</a></li><li><a href="TimingStream.html">TimingStream</a></li><li><a href="UrlPlayer.html">UrlPlayer</a></li><li><a href="WebAudioL16Stream.html">WebAudioL16Stream</a></li><li><a href="WritableElementStream.html">WritableElementStream</a></li></ul>
    <h3>Events</h3><ul><li><a href="RecognizeStream.html#event:close">close</a></li><li><a href="RecognizeStream.html#event:data">data</a></li><li><a href="RecognizeStream.html#event:error">error</a></li><li><a href="RecognizeStream.html#event:listening">listening</a></li><li><a href="RecognizeStream.html#event:message">message</a></li><li><a href="RecognizeStream.html#event:open">open</a></li><li><a href="RecognizeStream.html#event:send-data">send-data</a></li><li><a href="RecognizeStream.html#event:send-json">send-json</a></li><li><a href="RecognizeStream.html#event:stop">stop</a></li><li><a href="SpeakerStream.html#event:data">data</a></li></ul>
    <h3>Global</h3><ul><li><a href="global.html#getContentTypeFromFile">getContentTypeFromFile</a></li><li><a href="global.html#playFile">playFile</a></li></ul>
</nav>

<br class="clear">

<footer>
    Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.4.3</a> on Tue Feb 21 2017 17:41:51 GMT+0000 (UTC)
</footer>

<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>