UNPKG

watson-speech

Version:

IBM Watson Speech to Text and Text to Speech SDK for web browsers.

watson-speech.mybluemix.net

watson-developer-cloud/speech-javascript-sdk

209 lines (191 loc) • 7.17 kB

JavaScript

'use strict';

var { Transform } = require('readable-stream');
var util = require('util');
var defaults = require('defaults');
// some versions of the buffer browser lib don't support Buffer.from
// (such as the one included by the current version of express-browserify)
var bufferFrom = require('buffer-from');

// Sample rate (Hz) of the audio produced when downsampling is enabled.
var TARGET_SAMPLE_RATE = 16000;

// Size in bytes of one 32-bit float input sample (binary mode).
var BYTES_PER_FLOAT = Float32Array.BYTES_PER_ELEMENT;

// Low-pass FIR (finite impulse response) filter coefficients used while downsampling.
// Input audio is typically 44.1kHz or 48kHz and is downsampled to 16kHz. The filter
// removes (or at least attenuates) audio frequencies > ~8kHz, because sampled audio
// cannot accurately represent frequencies greater than half of the sample rate.
// (Human voice tops out at < 4kHz, so nothing important is lost for transcription.)
// See http://dsp.stackexchange.com/a/37475/26392 for a good explanation of this approach.
var LOW_PASS_FILTER = [
  -0.037935,
  -0.00089024,
  0.040173,
  0.019989,
  0.0047792,
  -0.058675,
  -0.056487,
  -0.0040653,
  0.14527,
  0.26927,
  0.33913,
  0.26927,
  0.14527,
  -0.0040653,
  -0.056487,
  -0.058675,
  0.0047792,
  0.019989,
  0.040173,
  -0.00089024,
  -0.037935
];

/**
 * Transforms Buffers or AudioBuffers into a binary stream of l16 (raw wav) audio, downsampling in the process.
 *
 * The watson speech-to-text service works on 16kHz and internally downsamples audio received at higher samplerates.
 * WebAudio is usually 44.1kHz or 48kHz, so downsampling here reduces bandwidth usage by ~2/3.
 *
 * Format event + stream can be combined with https://www.npmjs.com/package/wav to generate a wav file with a proper header
 *
 * Todo: support multi-channel audio (for use with <audio>/<video> elements) - will require interleaving audio channels
 *
 * @param {Object} options - also passed through to the Transform constructor
 * @param {Number} [options.sourceSampleRate=48000] - sample rate of the input; in object mode it is replaced by the
 *   sampleRate of the first AudioBuffer received
 * @param {Boolean} [options.downsample=true] - when false, samples are only converted to 16-bit, not resampled
 * @constructor
 */
function WebAudioL16Stream(options) {
  options = this.options = defaults(options, {
    sourceSampleRate: 48000,
    downsample: true
  });

  Transform.call(this, options);

  // Input samples left over from the previous chunk that have not yet been consumed by the filter.
  this.bufferUnusedSamples = [];
  // Binary mode only: trailing bytes of the previous chunk that did not form a complete 32-bit sample.
  this.bufferUnusedBytes = new Uint8Array(0);

  if (options.objectMode || options.writableObjectMode) {
    // The real sample rate is only known once the first AudioBuffer arrives.
    this._transform = this.handleFirstAudioBuffer;
  } else {
    this._transform = this.transformBuffer;
    // Deferred, presumably so that listeners attached right after construction still receive the event.
    process.nextTick(this.emitFormat.bind(this));
  }
}
util.inherits(WebAudioL16Stream, Transform);

/**
 * Emits the 'format' event describing the audio this stream outputs
 * (mono, signed 16-bit integer, at either the target or the source sample rate).
 */
WebAudioL16Stream.prototype.emitFormat = function emitFormat() {
  this.emit('format', {
    channels: 1,
    bitDepth: 16,
    sampleRate: this.options.downsample ? TARGET_SAMPLE_RATE : this.options.sourceSampleRate,
    signed: true,
    float: false
  });
};

/**
 * Downsamples WebAudio to 16 kHz.
 *
 * Browsers can downsample WebAudio natively with OfflineAudioContext's but it was designed for non-streaming use and
 * requires a new context for each AudioBuffer. Firefox can handle this, but chrome (v47) crashes after a few minutes.
 * So, we'll do it in JS for now.
 *
 * This really belongs in its own stream, but there's no way to create new AudioBuffer instances from JS, so it's
 * fairly coupled to the wav conversion code.
 *
 * Samples that cannot be consumed yet (fewer than a full filter window remain) are kept in
 * `this.bufferUnusedSamples` and prepended to the next chunk.
 *
 * @param {Float32Array} bufferNewSamples Microphone/MediaElement audio samples at `options.sourceSampleRate`
 * @return {Float32Array} filtered samples at TARGET_SAMPLE_RATE (possibly empty if the input was too short)
 */
WebAudioL16Stream.prototype.downsample = function downsample(bufferNewSamples) {
  var carried = this.bufferUnusedSamples;
  var buffer;
  if (carried.length > 0) {
    buffer = new Float32Array(carried.length + bufferNewSamples.length);
    buffer.set(carried, 0);
    buffer.set(bufferNewSamples, carried.length);
  } else {
    buffer = bufferNewSamples;
  }

  var filter = LOW_PASS_FILTER;
  var samplingRateRatio = this.options.sourceSampleRate / TARGET_SAMPLE_RATE;
  // Clamp at 0: when fewer samples than the filter window are available the raw value is negative,
  // which would make the Float32Array constructor throw. Those samples are carried over instead.
  var nOutputSamples = Math.max(0, Math.floor((buffer.length - filter.length) / samplingRateRatio) + 1);
  var outputBuffer = new Float32Array(nOutputSamples);

  for (var i = 0; i < nOutputSamples; i++) {
    var offset = Math.round(samplingRateRatio * i);
    var sample = 0;
    for (var j = 0; j < filter.length; ++j) {
      sample += buffer[offset + j] * filter[j];
    }
    outputBuffer[i] = sample;
  }

  // Everything from where the next output sample's window would start is kept for the next call.
  var indexSampleAfterLastUsed = Math.round(samplingRateRatio * nOutputSamples);
  var remaining = buffer.length - indexSampleAfterLastUsed;
  if (remaining > 0) {
    // slice() copies, so the leftover does not alias the caller's array.
    this.bufferUnusedSamples = new Float32Array(buffer.slice(indexSampleAfterLastUsed));
  } else {
    this.bufferUnusedSamples = new Float32Array(0);
  }

  return outputBuffer;
};

/**
 * Accepts a Float32Array of audio data and converts it to a Buffer of l16 audio data (raw wav)
 *
 * Explanation for the math: The raw values captured from the Web Audio API are
 * in 32-bit Floating Point, between -1 and 1 (per the specification).
 * The values for 16-bit PCM range between -32768 and +32767 (16-bit signed integer).
 * Negative samples are scaled by 0x8000 (32768) and non-negative samples by 0x7FFF (32767).
 * Stored in little endian.
 *
 * Samples outside [-1, 1] (e.g. filter overshoot) are saturated to the limits; without this,
 * setInt16 would wrap them around and turn mild clipping into loud clicks.
 *
 * @param {Float32Array} input
 * @return {Buffer}
 */
WebAudioL16Stream.prototype.floatTo16BitPCM = function(input) {
  var output = new DataView(new ArrayBuffer(input.length * 2)); // length is in bytes (8-bit), so *2 to get 16-bit length
  for (var i = 0; i < input.length; i++) {
    var sample = Math.max(-1, Math.min(1, input[i])); // saturate; NaN falls through and becomes 0 below
    var multiplier = sample < 0 ? 0x8000 : 0x7fff; // 16-bit signed range is -32768 to 32767
    output.setInt16(i * 2, (sample * multiplier) | 0, true); // index, value ("| 0" = convert to 32-bit int, round towards 0), littleEndian.
  }
  return bufferFrom(output.buffer);
};

/**
 * Does some one-time setup to grab sampleRate and emit format, then sets _transform to the actual audio buffer handler and calls it.
 *
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.handleFirstAudioBuffer = function handleFirstAudioBuffer(audioBuffer, encoding, next) {
  this.options.sourceSampleRate = audioBuffer.sampleRate;
  this.emitFormat();
  this._transform = this.transformAudioBuffer;
  this._transform(audioBuffer, encoding, next);
};

/**
 * Accepts an AudioBuffer (for objectMode), then downsamples to 16000 and converts to a 16-bit pcm.
 * Only the first channel is used.
 *
 * @param {AudioBuffer} audioBuffer
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformAudioBuffer = function(audioBuffer, encoding, next) {
  var source = audioBuffer.getChannelData(0);
  if (this.options.downsample) {
    source = this.downsample(source);
  }
  this.push(this.floatTo16BitPCM(source));
  next();
};

/**
 * Reinterprets the bytes of a binary chunk as 32-bit float samples.
 *
 * Only the chunk's own bytes (byteOffset/byteLength) are read, not the whole backing ArrayBuffer.
 * Bytes that do not complete a 4-byte sample are stored on `stream.bufferUnusedBytes` and prepended
 * to the next chunk, so chunk boundaries may fall anywhere.
 * NOTE(review): samples are read in platform byte order, same as before - confirm producers match.
 *
 * @param {WebAudioL16Stream} stream - owner of the carry-over bytes
 * @param {Buffer|Uint8Array} chunk
 * @return {Float32Array}
 */
function bytesToFloat32Array(stream, chunk) {
  var carried = stream.bufferUnusedBytes;
  var totalBytes = carried.length + chunk.byteLength;
  var usableBytes = totalBytes - (totalBytes % BYTES_PER_FLOAT);

  // Fast path: nothing carried over, whole samples only, and the view is 4-byte aligned -> no copy needed.
  if (carried.length === 0 && usableBytes === totalBytes && chunk.byteOffset % BYTES_PER_FLOAT === 0) {
    return new Float32Array(chunk.buffer, chunk.byteOffset, usableBytes / BYTES_PER_FLOAT);
  }

  // Otherwise copy into a fresh buffer (always aligned at offset 0).
  var joined = new Uint8Array(totalBytes);
  joined.set(carried, 0);
  joined.set(new Uint8Array(chunk.buffer, chunk.byteOffset, chunk.byteLength), carried.length);
  stream.bufferUnusedBytes = joined.slice(usableBytes);
  return new Float32Array(joined.buffer, 0, usableBytes / BYTES_PER_FLOAT);
}

/**
 * Accepts a Buffer (for binary mode), then downsamples to 16000 and converts to a 16-bit pcm
 *
 * @param {Buffer} nodebuffer - raw 32-bit float samples
 * @param {String} encoding
 * @param {Function} next
 */
WebAudioL16Stream.prototype.transformBuffer = function(nodebuffer, encoding, next) {
  var source = bytesToFloat32Array(this, nodebuffer);
  if (this.options.downsample) {
    source = this.downsample(source);
  }
  this.push(this.floatTo16BitPCM(source));
  next();
};

module.exports = WebAudioL16Stream;