watson-speech
Version:
IBM Watson Speech to Text and Text to Speech SDK for web browsers.
257 lines (222 loc) • 10.3 kB
HTML
<html lang="en">
<head>
<meta charset="utf-8">
<title>JSDoc: Source: speech-to-text/webaudio-l16-stream.js</title>
<script src="scripts/prettify/prettify.js"> </script>
<script src="scripts/prettify/lang-css.js"> </script>
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
<link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
<link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>
<body>
<div id="main">
<h1 class="page-title">Source: speech-to-text/webaudio-l16-stream.js</h1>
<section>
<article>
<pre class="prettyprint source linenums"><code>'use strict';
var Transform = require('stream').Transform;
var util = require('util');
var defaults = require('defaults');
var TARGET_SAMPLE_RATE = 16000;
/**
 * Transform stream that turns Buffers or AudioBuffers into a binary stream of l16 (raw wav) audio,
 * optionally downsampling along the way.
 *
 * The watson speech-to-text service works on 16kHz and internally downsamples audio received at higher samplerates.
 * WebAudio is usually 44.1kHz or 48kHz, so downsampling here reduces bandwidth usage by ~2/3.
 *
 * The 'format' event + stream output can be combined with https://www.npmjs.com/package/wav to generate a wav file
 * with a proper header.
 *
 * Todo: support multi-channel audio (for use with <audio>/<video> elements) - will require interleaving audio channels
 *
 * @param {Object} options Also passed through to the Transform constructor
 * @param {Number} [options.sourceSampleRate=48000] Sample rate of the incoming audio; in object mode it is
 *   overwritten with the sampleRate of the first AudioBuffer received
 * @param {Boolean} [options.downsample=true] Whether to downsample to 16 kHz before converting to l16
 * @param {Boolean} [options.objectMode] When set (or options.writableObjectMode), the stream expects AudioBuffers
 *   instead of binary Buffers
 * @constructor
 */
function WebAudioL16Stream(options) {
  var settings = defaults(options, {
    sourceSampleRate: 48000,
    downsample: true
  });
  this.options = settings;
  Transform.call(this, settings);

  // Input samples left over from the previous chunk; consumed by downsample().
  this.bufferUnusedSamples = [];

  var expectsAudioBuffers = settings.objectMode || settings.writableObjectMode;
  if (!expectsAudioBuffers) {
    // Binary mode: the source sample rate is already known, so the format can be announced
    // right away (deferred one tick so that listeners can be attached first).
    this._transform = this.transformBuffer;
    process.nextTick(this.emitFormat.bind(this));
    return;
  }
  // Object mode: the first AudioBuffer supplies the real sample rate before the format is emitted.
  this._transform = this.handleFirstAudioBuffer;
}
util.inherits(WebAudioL16Stream, Transform);
/**
 * Emits a 'format' event describing the audio produced by this stream:
 * mono, 16-bit, signed, non-float, at either the 16 kHz target rate (when downsampling is enabled)
 * or the unchanged source sample rate.
 */
WebAudioL16Stream.prototype.emitFormat = function emitFormat() {
  var settings = this.options;
  var outputSampleRate = settings.sourceSampleRate;
  if (settings.downsample) {
    outputSampleRate = TARGET_SAMPLE_RATE;
  }
  var format = {
    channels: 1,
    bitDepth: 16,
    sampleRate: outputSampleRate,
    signed: true,
    float: false
  };
  this.emit('format', format);
};
/**
 * Low-pass filters and downsamples a chunk of WebAudio samples to 16 kHz.
 *
 * Browsers can downsample WebAudio natively with OfflineAudioContexts, but that API was designed for non-streaming
 * use and requires a new context for each AudioBuffer. Firefox can handle this, but chrome (v47) crashes after a
 * few minutes. So, we'll do it in JS for now.
 *
 * This really belongs in its own stream, but there's no way to create new AudioBuffer instances from JS, so it's
 * fairly coupled to the wav conversion code.
 *
 * Input samples that could not be consumed yet (because the filter window would run past the end of the chunk)
 * are kept in this.bufferUnusedSamples and prepended to the next chunk, so consecutive chunks join seamlessly.
 *
 * @param {Float32Array} bufferNewSamples Samples of the newest Microphone/MediaElement audio chunk
 * @return {Float32Array} Downsampled samples (still floating point; floatTo16BitPCM converts them to l16).
 *   May be empty when fewer samples than the filter length are available so far.
 */
WebAudioL16Stream.prototype.downsample = function downsample(bufferNewSamples) {
  var buffer = bufferNewSamples;
  var newSamples = bufferNewSamples.length;
  var unusedSamples = this.bufferUnusedSamples.length;
  var i;
  var j;
  var offset;
  var sample;

  // Prepend whatever was left over from the previous chunk.
  if (unusedSamples > 0) {
    buffer = new Float32Array(unusedSamples + newSamples);
    buffer.set(this.bufferUnusedSamples, 0);
    buffer.set(bufferNewSamples, unusedSamples);
  }

  // Downsampling and low-pass filter:
  // Input audio is typically 44.1kHz or 48kHz, this downsamples it to 16kHz.
  // It uses a FIR (finite impulse response) filter to remove (or, at least attenuate)
  // audio frequencies > ~8kHz because sampled audio cannot accurately represent
  // frequencies greater than half of the sample rate.
  // (Human voice tops out at < 4kHz, so nothing important is lost for transcription.)
  // See http://dsp.stackexchange.com/a/37475/26392 for a good explanation of this code.
  var filter = [
    -0.037935,
    -0.00089024,
    0.040173,
    0.019989,
    0.0047792,
    -0.058675,
    -0.056487,
    -0.0040653,
    0.14527,
    0.26927,
    0.33913,
    0.26927,
    0.14527,
    -0.0040653,
    -0.056487,
    -0.058675,
    0.0047792,
    0.019989,
    0.040173,
    -0.00089024,
    -0.037935
  ];
  var samplingRateRatio = this.options.sourceSampleRate / TARGET_SAMPLE_RATE;

  // Number of output samples whose full filter window fits inside the buffer.
  // With fewer samples than filter taps nothing can be produced yet (and the formula below
  // would go negative), so everything is simply carried over to the next call.
  var nOutputSamples = 0;
  if (buffer.length >= filter.length) {
    nOutputSamples = Math.floor((buffer.length - filter.length) / samplingRateRatio) + 1;
  }
  var outputBuffer = new Float32Array(nOutputSamples);

  // Bound the loop by the OUTPUT count: the read position advances by samplingRateRatio per
  // output sample, so bounding by the input length would read past the end of the buffer and
  // break the leftover computation below.
  for (i = 0; i < nOutputSamples; i++) {
    offset = Math.round(samplingRateRatio * i);
    sample = 0;
    for (j = 0; j < filter.length; ++j) {
      sample += buffer[offset + j] * filter[j];
    }
    outputBuffer[i] = sample;
  }

  // The next output sample would start here; keep everything from this index on for the next chunk.
  var indexSampleAfterLastUsed = Math.round(samplingRateRatio * nOutputSamples);
  var remaining = buffer.length - indexSampleAfterLastUsed;
  if (remaining > 0) {
    this.bufferUnusedSamples = new Float32Array(remaining);
    for (i = 0; i < remaining; ++i) {
      this.bufferUnusedSamples[i] = buffer[indexSampleAfterLastUsed + i];
    }
  } else {
    this.bufferUnusedSamples = new Float32Array(0);
  }
  return outputBuffer;
};
/**
 * Accepts a Float32Array of audio data and converts it to a Buffer of l16 audio data (raw wav)
 *
 * Explanation for the math: The raw values captured from the Web Audio API are
 * in 32-bit Floating Point, between -1 and 1 (per the specification).
 * The values for 16-bit PCM range between -32768 and +32767 (16-bit signed integer).
 * Each sample is clamped to [-1, 1], then multiplied by 0x8000 (32768) when negative or
 * 0x7FFF (32767) otherwise, and stored as a little-endian 16-bit signed integer.
 *
 * Clamping matters because the low-pass filter used while downsampling can overshoot slightly
 * beyond [-1, 1]; without it the scaled value falls outside the int16 range and wraps around
 * to the opposite sign, which is audible as a loud click.
 *
 * @param {Float32Array} input
 * @return {Buffer}
 */
WebAudioL16Stream.prototype.floatTo16BitPCM = function(input) {
  var output = new DataView(new ArrayBuffer(input.length * 2)); // length is in bytes (8-bit), so *2 to get 16-bit length
  for (var i = 0; i < input.length; i++) {
    // Limit to full scale so out-of-range samples saturate instead of wrapping.
    // (NaN passes through unchanged and becomes 0 via the `| 0` below.)
    var clamped = Math.max(-1, Math.min(1, input[i]));
    var multiplier = clamped < 0 ? 0x8000 : 0x7fff; // 16-bit signed range is -32768 to 32767
    output.setInt16(i * 2, (clamped * multiplier) | 0, true); // index, value, little endian
  }
  return Buffer.from(output.buffer);
};
/**
 * One-time handler for the first chunk in object mode: records the chunk's sampleRate as the
 * source sample rate, emits the format, then installs transformAudioBuffer as _transform and
 * hands this same chunk over to it.
 *
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.handleFirstAudioBuffer = function handleFirstAudioBuffer(audioBuffer, encoding, next) {
  var settings = this.options;
  settings.sourceSampleRate = audioBuffer.sampleRate;
  this.emitFormat();

  // Every chunk from now on (including this one) goes straight to the regular handler.
  this._transform = this.transformAudioBuffer;
  this._transform(audioBuffer, encoding, next);
};
/**
 * Object-mode handler: takes channel 0 of an AudioBuffer, downsamples it to 16000 Hz when the
 * downsample option is enabled, converts the result to 16-bit pcm and pushes it downstream.
 *
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformAudioBuffer = function(audioBuffer, encoding, next) {
  var channelSamples = audioBuffer.getChannelData(0);
  var samples = this.options.downsample ? this.downsample(channelSamples) : channelSamples;
  var pcmChunk = this.floatTo16BitPCM(samples);
  this.push(pcmChunk);
  next();
};
/**
 * Accepts a Buffer of 32-bit float samples (for binary mode), then downsamples to 16000 and converts to a 16-bit pcm
 *
 * Throws a RangeError if the chunk's byte length is not a multiple of 4 (the size of one float sample).
 *
 * @param {Buffer} nodebuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformBuffer = function(nodebuffer, encoding, next) {
  // A Buffer is frequently just a window onto a larger (pooled or shared) ArrayBuffer, so
  // nodebuffer.buffer on its own may contain unrelated bytes before and after this chunk.
  // Copy out exactly the chunk's bytes; the copy also guarantees the 4-byte alignment that
  // Float32Array requires.
  var firstByte = nodebuffer.byteOffset;
  var chunkBytes = nodebuffer.buffer.slice(firstByte, firstByte + nodebuffer.length);
  var source = new Float32Array(chunkBytes);
  if (this.options.downsample) {
    source = this.downsample(source);
  }
  this.push(this.floatTo16BitPCM(source));
  next();
};
// The stream constructor is this module's only export.
module.exports = WebAudioL16Stream;
</code></pre>
</article>
</section>
</div>
<nav>
<h2><a href="index.html">Home</a></h2><h3>Modules</h3><ul><li><a href="module-watson-speech.html">watson-speech</a></li><li><a href="module-watson-speech_speech-to-text.html">watson-speech/speech-to-text</a></li><li><a href="module-watson-speech_speech-to-text_get-models.html">watson-speech/speech-to-text/get-models</a></li><li><a href="module-watson-speech_speech-to-text_recognize-file.html">watson-speech/speech-to-text/recognize-file</a></li><li><a href="module-watson-speech_speech-to-text_recognize-microphone.html">watson-speech/speech-to-text/recognize-microphone</a></li><li><a href="module-watson-speech_text-to-speech.html">watson-speech/text-to-speech</a></li><li><a href="module-watson-speech_text-to-speech_get-voices.html">watson-speech/text-to-speech/get-voices</a></li><li><a href="module-watson-speech_text-to-speech_synthesize.html">watson-speech/text-to-speech/synthesize</a></li></ul><h3>Classes</h3><ul><li><a href="FilePlayer.html">FilePlayer</a></li><li><a href="FormatStream.html">FormatStream</a></li><li><a href="RecognizeStream.html">RecognizeStream</a></li><li><a href="ResultStream.html">ResultStream</a></li><li><a href="SpeakerStream.html">SpeakerStream</a></li><li><a href="TimingStream.html">TimingStream</a></li><li><a href="UrlPlayer.html">UrlPlayer</a></li><li><a href="WebAudioL16Stream.html">WebAudioL16Stream</a></li><li><a href="WritableElementStream.html">WritableElementStream</a></li></ul><h3>Events</h3><ul><li><a href="RecognizeStream.html#event:close">close</a></li><li><a href="RecognizeStream.html#event:data">data</a></li><li><a href="RecognizeStream.html#event:error">error</a></li><li><a href="RecognizeStream.html#event:listening">listening</a></li><li><a href="RecognizeStream.html#event:message">message</a></li><li><a href="RecognizeStream.html#event:open">open</a></li><li><a href="RecognizeStream.html#event:send-data">send-data</a></li><li><a href="RecognizeStream.html#event:send-json">send-json</a></li><li><a 
href="RecognizeStream.html#event:stop">stop</a></li><li><a href="SpeakerStream.html#event:data">data</a></li></ul><h3>Global</h3><ul><li><a href="global.html#getContentTypeFromFile">getContentTypeFromFile</a></li><li><a href="global.html#playFile">playFile</a></li></ul>
</nav>
<br class="clear">
<footer>
Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.4.3</a> on Tue Feb 21 2017 17:41:51 GMT+0000 (UTC)
</footer>
<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>