watson-speech
Version:
IBM Watson Speech to Text and Text to Speech SDK for web browsers.
194 lines (170 loc) • 7.57 kB
JavaScript
/**
* Copyright 2015 IBM Corp. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
;
var getUserMedia = require('get-user-media-promise');
var MicrophoneStream = require('microphone-stream');
var RecognizeStream = require('./recognize-stream.js');
var L16 = require('./webaudio-l16-stream.js');
var FormatStream = require('./format-stream.js');
var assign = require('object.assign/polyfill')();
var WritableElementStream = require('./writable-element-stream');
var { Writable } = require('readable-stream');
var ResultStream = require('./result-stream');
var SpeakerStream = require('./speaker-stream');
// Module-level slot for a microphone stream that is kept alive between calls
// (only populated when the keepMicrophone option is used).
var preservedMicStream;

/**
 * No-op write handler: acknowledges the chunk immediately and discards it.
 *
 * @private
 * @param {*} chunk - ignored
 * @param {String} encoding - ignored
 * @param {Function} done - invoked right away to signal that the chunk was "written"
 */
function discardChunk(chunk, encoding, done) {
  done();
}

// Sink for audio that nobody is listening to: while the keepMicrophone option is enabled,
// unused audio data is routed here so that it isn't buffered by other streams.
var bitBucket = new Writable({
  decodeStrings: false,
  objectMode: true, // strings/buffers are still accepted in object mode
  write: discardChunk
});
/**
* @module watson-speech/speech-to-text/recognize-microphone
*/
/**
* Create and return a RecognizeStream sourcing audio from the user's microphone
*
* @param {Object} options - Also passed to {RecognizeStream}, and {FormatStream} when applicable
* @param {String} options.token - Auth Token for CF services - see https://github.com/watson-developer-cloud/node-sdk#authorization
* @param {String} options.access_token - IAM Access Token for RC services - see https://github.com/watson-developer-cloud/node-sdk#authorization
* @param {String} [options.url='wss://stream.watsonplatform.net/speech-to-text/api'] - Base URL for a service instance
* @param {Boolean} [options.format=true] - pipe the text through a FormatStream which performs light formatting. Also controls smart_formatting option unless explicitly set.
* @param {Boolean} [options.keepMicrophone=false] - keeps an internal reference to the microphone stream to reuse in subsequent calls (prevents multiple permissions dialogs in firefox)
* @param {String|DOMElement} [options.outputElement] pipe the text to a [WriteableElementStream](WritableElementStream.html) targeting the specified element. Also defaults objectMode to true to enable interim results.
* @param {Boolean} [options.extractResults=false] pipe results through a ResultStream stream to simplify the objects. (Default behavior before v0.22) Requires objectMode.
* @param {Boolean} [options.resultsBySpeaker=false] Pipe results through a SpeakerStream. Forces speaker_labels and objectMode to be true.
* @param {MediaStream} [options.mediaStream] Optionally pass in an existing MediaStream
*
* @return {RecognizeStream|SpeakerStream|FormatStream|ResultStream}
*/
module.exports = function recognizeMicrophone(options) {
if (!options || (!options.token && !options.access_token)) {
throw new Error('WatsonSpeechToText: missing required parameter: opts.token (CF) or opts.access_token (RC)');
}
// the WritableElementStream works best in objectMode
if (options.outputElement && options.objectMode !== false) {
options.objectMode = true;
}
// the ResultExtractor only works in objectMode
if (options.extractResults) {
options.objectMode = true;
}
// SpeakerStream requires objectMode and speaker_labels
if (options.resultsBySpeaker) {
options.objectMode = true;
options.speaker_labels = true;
}
// default format to true (capitals and periods)
// default smart_formatting to options.format value (dates, currency, etc.)
options.format = options.format !== false;
if (typeof options.smart_formatting === 'undefined') {
options.smart_formatting = options.format;
}
var rsOpts = assign(
{
'content-type': 'audio/l16;rate=16000',
interim_results: true
},
options
);
var recognizeStream = new RecognizeStream(rsOpts);
var streams = [recognizeStream]; // collect all of the streams so that we can bundle up errors and send them to the last one
// set up the output first so that we have a place to emit errors
// if there's trouble with the input stream
var stream = recognizeStream;
var keepMic = options.keepMicrophone;
var micStream;
if (keepMic && preservedMicStream) {
preservedMicStream.unpipe(bitBucket);
micStream = preservedMicStream;
} else {
// create the MicrophoneStream synchronously to allow it to resume the context in Safari on iOS 11
micStream = new MicrophoneStream({
objectMode: true,
bufferSize: options.bufferSize
});
var pm = options.mediaStream ? Promise.resolve(options.mediaStream) : getUserMedia({ video: false, audio: true });
pm.then(function(mediaStream) {
micStream.setStream(mediaStream);
if (keepMic) {
preservedMicStream = micStream;
}
}).catch(function(err) {
stream.emit('error', err);
if (err.name === 'NotSupportedError') {
stream.end(); // end the stream
}
});
}
var l16Stream = new L16({ writableObjectMode: true });
micStream.pipe(l16Stream).pipe(recognizeStream);
streams.push(micStream, l16Stream);
/**
* unpipes the mic stream to prevent any more audio from being sent over the wire
* temporarily re-pipes it to the bitBucket (basically /dev/null) becuse
* otherwise it will buffer the audio from in between calls and prepend it to the next one
*
* @private
*/
function end() {
micStream.unpipe(l16Stream);
micStream.pipe(bitBucket);
l16Stream.end();
}
// trigger on both stop and end events:
// stop will not fire when a stream ends due to a timeout
// but when stop does fire, we want to honor it immediately
// end will always fire, but it may take a few moments after stop
if (keepMic) {
recognizeStream.on('end', end);
recognizeStream.on('stop', end);
} else {
recognizeStream.on('end', micStream.stop.bind(micStream));
recognizeStream.on('stop', micStream.stop.bind(micStream));
}
if (options.resultsBySpeaker) {
stream = stream.pipe(new SpeakerStream(options));
streams.push(stream);
}
if (options.format) {
stream = stream.pipe(new FormatStream(options));
streams.push(stream);
}
if (options.outputElement) {
// we don't want to return the WES, just send data to it
streams.push(stream.pipe(new WritableElementStream(options)));
}
if (options.extractResults) {
stream = stream.pipe(new ResultStream());
streams.push(stream);
}
// Capture errors from any stream except the last one and emit them on the last one
streams.forEach(function(prevStream) {
if (prevStream !== stream) {
prevStream.on('error', stream.emit.bind(stream, 'error'));
}
});
if (stream !== recognizeStream) {
// add a stop button to whatever the final stream ends up being
stream.stop = recognizeStream.stop.bind(recognizeStream);
}
// expose the original stream to for debugging (and to support the JSON tab on the STT demo)
stream.recognizeStream = recognizeStream;
return stream;
};
// Re-export the `isSupported` member of get-user-media-promise unchanged, so callers can read it from this module.
// NOTE(review): presumably a feature-detection flag for getUserMedia support - confirm against get-user-media-promise.
module.exports.isSupported = getUserMedia.isSupported;