// watson-speech
// Version:
// IBM Watson Speech to Text and Text to Speech SDK for web browsers.
// 209 lines (191 loc) • 7.17 kB
// JavaScript
'use strict';
var { Transform } = require('readable-stream');
var util = require('util');
var defaults = require('defaults');
// some versions of the buffer browser lib don't support Buffer.from (such as the one included by the current version of express-browserify)
var bufferFrom = require('buffer-from');
var TARGET_SAMPLE_RATE = 16000;
/**
 * Transform stream that turns Buffers or AudioBuffers into a binary stream of l16 (raw wav) audio,
 * downsampling along the way unless `options.downsample` is switched off.
 *
 * The watson speech-to-text service works on 16kHz and internally downsamples audio received at higher
 * sample rates. WebAudio is usually 44.1kHz or 48kHz, so downsampling here reduces bandwidth usage by ~2/3.
 *
 * The 'format' event plus the stream output can be combined with https://www.npmjs.com/package/wav to
 * generate a wav file with a proper header.
 *
 * Todo: support multi-channel audio (for use with <audio>/<video> elements) - will require interleaving
 * audio channels.
 *
 * @param {Object} [options] Stream settings; the same object is stored on `this.options` and handed to
 *   the Transform constructor.
 * @param {Number} [options.sourceSampleRate=48000] Sample rate of the incoming audio. In object mode it is
 *   overwritten from the first AudioBuffer received.
 * @param {Boolean} [options.downsample=true] Whether to downsample to 16 kHz.
 * @param {Boolean} [options.objectMode] When truthy (or when `writableObjectMode` is truthy) the stream
 *   expects AudioBuffer objects; otherwise it expects binary chunks.
 * @constructor
 */
function WebAudioL16Stream(options) {
  // NOTE(review): `defaults` presumably fills missing keys on (and returns) the caller's object - confirm.
  var settings = defaults(options, {
    sourceSampleRate: 48000,
    downsample: true
  });
  this.options = settings;

  Transform.call(this, settings);

  // Input samples held back by downsample() until the next chunk arrives.
  this.bufferUnusedSamples = [];

  var acceptsAudioBuffers = settings.objectMode || settings.writableObjectMode;
  if (!acceptsAudioBuffers) {
    // Binary mode: the sample rate is already known from the options, so the format can be announced
    // right away - deferred one tick, which lets listeners attached after construction still see it.
    this._transform = this.transformBuffer;
    process.nextTick(this.emitFormat.bind(this));
    return;
  }

  // Object mode: the real sample rate is only known once the first AudioBuffer shows up.
  this._transform = this.handleFirstAudioBuffer;
}
util.inherits(WebAudioL16Stream, Transform);
/**
 * Emits a 'format' event describing the audio this stream outputs: mono, signed 16-bit integer samples,
 * at 16 kHz when downsampling is enabled or at the source sample rate otherwise.
 */
WebAudioL16Stream.prototype.emitFormat = function emitFormat() {
  var outputRate;
  if (this.options.downsample) {
    outputRate = TARGET_SAMPLE_RATE;
  } else {
    outputRate = this.options.sourceSampleRate;
  }

  var format = {
    channels: 1,
    bitDepth: 16,
    sampleRate: outputRate,
    signed: true,
    float: false
  };
  this.emit('format', format);
};
/**
 * Downsamples WebAudio samples to 16 kHz.
 *
 * Browsers can downsample WebAudio natively with OfflineAudioContext, but it was designed for non-streaming
 * use and requires a new context for each AudioBuffer. Firefox can handle this, but chrome (v47) crashes
 * after a few minutes. So, we'll do it in JS for now.
 *
 * This really belongs in its own stream, but there's no way to create new AudioBuffer instances from JS,
 * so it's fairly coupled to the wav conversion code.
 *
 * Samples that cannot be consumed yet are kept in `this.bufferUnusedSamples` and prepended to the next
 * chunk. A chunk too short to yield any output (including an empty one) returns an empty array and is
 * carried over whole, instead of throwing a RangeError.
 *
 * @param {Float32Array} bufferNewSamples Float samples of one Microphone/MediaElement audio chunk
 * @return {Float32Array} The filtered, downsampled float samples (not yet converted to 16-bit PCM)
 */
WebAudioL16Stream.prototype.downsample = function downsample(bufferNewSamples) {
  // Prepend whatever the previous call could not consume.
  var leftoverCount = this.bufferUnusedSamples.length;
  var buffer = bufferNewSamples;
  if (leftoverCount > 0) {
    buffer = new Float32Array(leftoverCount + bufferNewSamples.length);
    buffer.set(this.bufferUnusedSamples, 0);
    buffer.set(bufferNewSamples, leftoverCount);
  }

  // Downsampling and low-pass filter:
  // Input audio is typically 44.1kHz or 48kHz, this downsamples it to 16kHz.
  // It uses a FIR (finite impulse response) filter to remove (or, at least, attenuate)
  // audio frequencies > ~8kHz because sampled audio cannot accurately represent
  // frequencies greater than half of the sample rate.
  // (Human voice tops out at < 4kHz, so nothing important is lost for transcription.)
  // See http://dsp.stackexchange.com/a/37475/26392 for a good explanation of this code.
  var filter = [
    -0.037935,
    -0.00089024,
    0.040173,
    0.019989,
    0.0047792,
    -0.058675,
    -0.056487,
    -0.0040653,
    0.14527,
    0.26927,
    0.33913,
    0.26927,
    0.14527,
    -0.0040653,
    -0.056487,
    -0.058675,
    0.0047792,
    0.019989,
    0.040173,
    -0.00089024,
    -0.037935
  ];

  var samplingRateRatio = this.options.sourceSampleRate / TARGET_SAMPLE_RATE;

  // Every output sample needs a full filter-length window of input. When the buffer is shorter than the
  // filter the raw count is negative; clamp it so the typed-array constructor below cannot throw.
  var nOutputSamples = Math.max(0, Math.floor((buffer.length - filter.length) / samplingRateRatio) + 1);
  var outputBuffer = new Float32Array(nOutputSamples);

  for (var i = 0; i < nOutputSamples; i++) {
    var offset = Math.round(samplingRateRatio * i);
    var sample = 0;
    for (var j = 0; j < filter.length; ++j) {
      sample += buffer[offset + j] * filter[j];
    }
    outputBuffer[i] = sample;
  }

  // Keep everything from where the next output window would start; slice() copies, so the stored
  // samples do not alias the caller's chunk.
  var indexSampleAfterLastUsed = Math.round(samplingRateRatio * nOutputSamples);
  if (indexSampleAfterLastUsed < buffer.length) {
    this.bufferUnusedSamples = buffer.slice(indexSampleAfterLastUsed);
  } else {
    this.bufferUnusedSamples = new Float32Array(0);
  }

  return outputBuffer;
};
/**
 * Accepts a Float32Array of audio data and converts it to a Buffer of l16 audio data (raw wav).
 *
 * Explanation for the math: the raw values captured from the Web Audio API are 32-bit floating point,
 * nominally between -1 and 1. The values for 16-bit PCM range between -32768 and +32767 (16-bit signed
 * integer). Negative samples are therefore scaled by 0x8000 (32768) and non-negative ones by 0x7FFF
 * (32767), truncated towards zero, and stored little endian.
 *
 * Samples outside [-1, 1] (for example filter overshoot from downsampling) are clamped to that range
 * first; without this they would wrap around when written as a 16-bit integer.
 *
 * @param {Float32Array} input
 * @return {Buffer}
 */
WebAudioL16Stream.prototype.floatTo16BitPCM = function(input) {
  var output = new DataView(new ArrayBuffer(input.length * 2)); // length is in bytes (8-bit), so *2 to get 16-bit length
  for (var i = 0; i < input.length; i++) {
    // Saturate instead of letting setInt16 wrap out-of-range values. NaN passes through and becomes 0 below.
    var sample = Math.max(-1, Math.min(1, input[i]));
    var multiplier = sample < 0 ? 0x8000 : 0x7fff; // 16-bit signed range is -32768 to 32767
    output.setInt16(i * 2, (sample * multiplier) | 0, true); // index, value ("| 0" = convert to 32-bit int, round towards 0), littleEndian.
  }
  return bufferFrom(output.buffer);
};
/**
 * One-time handler for the first AudioBuffer in object mode: records the buffer's sample rate as the
 * source sample rate, emits the format, then installs transformAudioBuffer as `_transform` and runs it
 * on this same buffer.
 *
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.handleFirstAudioBuffer = function handleFirstAudioBuffer(audioBuffer, encoding, next) {
  var settings = this.options;
  settings.sourceSampleRate = audioBuffer.sampleRate;

  // The format depends on the sample rate, so it can only be announced now.
  this.emitFormat();

  // Every later chunk goes straight to the regular handler.
  var regularHandler = this.transformAudioBuffer;
  this._transform = regularHandler;
  regularHandler.call(this, audioBuffer, encoding, next);
};
/**
 * Object-mode chunk handler: takes the first channel of an AudioBuffer, downsamples it when
 * `options.downsample` is set, converts the result to 16-bit PCM and pushes it downstream.
 * Only channel 0 is read; any further channels are ignored.
 *
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformAudioBuffer = function(audioBuffer, encoding, next) {
  var channelSamples = audioBuffer.getChannelData(0);
  var samples = this.options.downsample ? this.downsample(channelSamples) : channelSamples;
  var pcmChunk = this.floatTo16BitPCM(samples);
  this.push(pcmChunk);
  next();
};
/**
 * Binary-mode chunk handler: interprets the chunk's bytes as 32-bit float samples (platform byte
 * order), downsamples them when `options.downsample` is set, converts the result to 16-bit PCM and
 * pushes it downstream.
 *
 * Only the bytes that belong to the chunk are read. A Buffer can be a view into a larger (pooled or
 * sliced) ArrayBuffer, so its `buffer` property alone may include unrelated data and may start at an
 * offset that is not 4-byte aligned. The bytes are therefore copied into a fresh, aligned array first.
 * Trailing bytes that do not make up a whole sample are kept in `this.partialSampleBytes` and prepended
 * to the next chunk, so a float split across two chunks is still decoded correctly.
 *
 * @param {Buffer} nodebuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformBuffer = function(nodebuffer, encoding, next) {
  var bytesPerSample = Float32Array.BYTES_PER_ELEMENT;

  // Stitch together: bytes left over from the previous chunk + this chunk's own bytes.
  var carried = this.partialSampleBytes;
  var carriedLength = carried ? carried.length : 0;
  var bytes = new Uint8Array(carriedLength + nodebuffer.length);
  if (carriedLength > 0) {
    bytes.set(carried, 0);
  }
  bytes.set(nodebuffer, carriedLength);

  // Hold back an incomplete trailing sample (at most 3 bytes) for the next call.
  var usableBytes = bytes.length - (bytes.length % bytesPerSample);
  this.partialSampleBytes = bytes.slice(usableBytes);

  if (usableBytes === 0) {
    // Not even one whole sample yet - nothing to emit for this chunk.
    next();
    return;
  }

  // `bytes` owns a fresh ArrayBuffer, so offset 0 is always correctly aligned for a Float32Array view.
  var source = new Float32Array(bytes.buffer, 0, usableBytes / bytesPerSample);
  if (this.options.downsample) {
    source = this.downsample(source);
  }
  this.push(this.floatTo16BitPCM(source));
  next();
};
// new Float32Array(nodebuffer.buffer)
module.exports = WebAudioL16Stream;