// @euirim/microsoft-cognitiveservices-speech-sdk
// Version:
// Microsoft Cognitive Services Speech SDK for JavaScript
// 542 lines (540 loc) • 31.6 kB
// JavaScript
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
import { ReplayableAudioNode } from "../common.browser/Exports";
import { ConnectionState, createGuid, createNoDashGuid, Deferred, MessageType, PromiseHelper, } from "../common/Exports";
import { ActivityReceivedEventArgs, CancellationErrorCode, CancellationReason, PropertyCollection, PropertyId, RecognitionEventArgs, ResultReason, SessionEventArgs, SpeechRecognitionCanceledEventArgs, SpeechRecognitionEventArgs, SpeechRecognitionResult, } from "../sdk/Exports";
import { DialogServiceTurnStateManager } from "./DialogServiceTurnStateManager";
import { CancellationErrorCodePropertyName, EnumTranslation, RecognitionStatus, RequestSession, ServiceRecognizerBase, SimpleSpeechPhrase, SpeechDetected, SpeechHypothesis, } from "./Exports";
import { ActivityPayloadResponse } from "./ServiceMessages/ActivityResponsePayload";
import { SpeechConnectionMessage } from "./SpeechConnectionMessage.Internal";
export class DialogServiceAdapter extends ServiceRecognizerBase {
constructor(authentication, connectionFactory, audioSource, recognizerConfig, dialogServiceConnector) {
super(authentication, connectionFactory, audioSource, recognizerConfig, dialogServiceConnector);
/**
 * Wraps `message` in an agent envelope and sends it as a text message on
 * the "agent" path once the dialog connection is available.
 *
 * A fresh interaction id and a fresh request id are generated per call.
 * Nothing is returned, and a failure to obtain the connection is not
 * reported by this function.
 *
 * @param message Payload placed verbatim in the envelope's `messagePayload`.
 */
this.sendMessage = (message) => {
    const interactionId = createGuid();
    const messageRequestId = createNoDashGuid();
    const serializedEnvelope = JSON.stringify({
        context: {
            interactionId
        },
        messagePayload: message,
        version: 0.5
    });
    this.fetchDialogConnection().onSuccessContinueWith((connection) => {
        const outgoing = new SpeechConnectionMessage(MessageType.Text, "agent", messageRequestId, "application/json", serializedEnvelope);
        connection.send(outgoing);
    });
};
/**
 * Runs a single listen turn: starts a new recognition on the request
 * session, kicks off the connection, attaches the audio source, configures
 * the connection and begins streaming audio.
 *
 * @param recoMode Recognition mode stored on the recognizer config.
 * @param successCallback Stored as `privSuccessCallback` and also handed to
 *        `cancelRecognition` on every failure path in this function.
 * @param errorCallback Accepted but not referenced by this function.
 * @returns A promise that yields `true` once the connection has been
 *          configured, or carries the error from attaching the audio
 *          source or configuring the connection.
 */
this.listenOnce = (recoMode, successCallback, errorCallback) => {
    // Single cancellation path; the session ids are read at the moment of failure.
    const cancelWith = (errorCode, error) => {
        this.cancelRecognition(this.privDialogRequestSession.sessionId, this.privDialogRequestSession.requestId, CancellationReason.Error, errorCode, error, successCallback);
    };
    this.privRecognizerConfig.recognitionMode = recoMode;
    this.privDialogRequestSession.startNewRecognition();
    this.privDialogRequestSession.listenForServiceTelemetry(this.privDialogAudioSource.events);
    // Start the connection to the service. The promise this will create is stored and will be used by configureConnection().
    this.dialogConnectImpl();
    this.sendPreAudioMessages();
    this.privSuccessCallback = successCallback;
    const attachPromise = this.privDialogAudioSource.attach(this.privDialogRequestSession.audioNodeId);
    return attachPromise.continueWithPromise((attachResult) => {
        if (attachResult.isError) {
            cancelWith(CancellationErrorCode.ConnectionFailure, attachResult.error);
            return PromiseHelper.fromError(attachResult.error);
        }
        const audioNode = new ReplayableAudioNode(attachResult.result, this.privDialogAudioSource.format);
        this.privDialogRequestSession.onAudioSourceAttachCompleted(audioNode, false);
        return this.privDialogAudioSource.deviceInfo.onSuccessContinueWithPromise((deviceInfo) => {
            this.privRecognizerConfig.SpeechServiceConfig.Context.audio = { source: deviceInfo };
            // Connection is configured: announce the session, then start streaming.
            const onConfigured = (_connection) => {
                const sessionStartEventArgs = new SessionEventArgs(this.privDialogRequestSession.sessionId);
                if (!!this.privRecognizer.sessionStarted) {
                    this.privRecognizer.sessionStarted(this.privRecognizer, sessionStartEventArgs);
                }
                // The success handler is intentionally empty; only upload failures matter here.
                this.sendAudio(audioNode).on((_sent) => { }, (error) => {
                    cancelWith(CancellationErrorCode.RuntimeError, error);
                });
            };
            const onConfigFailed = (error) => {
                cancelWith(CancellationErrorCode.ConnectionFailure, error);
            };
            return this.configConnection()
                .on(onConfigured, onConfigFailed)
                .continueWithPromise((configResult) => {
                    return configResult.isError
                        ? PromiseHelper.fromError(configResult.error)
                        : PromiseHelper.fromResult(true);
                });
        });
    });
};
// Streams audio chunks read from `audioStreamNode` to the service as binary
// "audio" messages, one read/upload cycle at a time.
//
// Returns the promise of a local Deferred which is:
//   - resolved with `true` after the end-of-stream chunk has been sent, or
//     when the session reports that speech has already ended;
//   - rejected when fetching the connection fails, or when reading audio
//     fails while speech has not ended.
//
// Throttling: chunks are sent without delay while the session's bytesSent
// count is at most `maxSendUnthrottledBytes`; after that each send waits
// until `nextSendTime`.
this.sendAudio = (audioStreamNode) => {
// NOTE: Home-baked promises crash ios safari during the invocation
// of the error callback chain (looks like the recursion is way too deep, and
// it blows up the stack). The following construct is a stop-gap that does not
// bubble the error up the callback chain and hence circumvents this problem.
// TODO: rewrite with ES6 promises.
const deferred = new Deferred();
// Earliest time the next throttled chunk may be sent; updated after each send.
let nextSendTime = Date.now();
const audioFormat = this.privDialogAudioSource.format;
// Max amount to send before we start to throttle
const fastLaneSizeMs = this.privRecognizerConfig.parameters.getProperty("SPEECH-TransmitLengthBeforThrottleMs", "5000");
// Converts the millisecond budget above into bytes using the format's average byte rate.
const maxSendUnthrottledBytes = audioFormat.avgBytesPerSec / 1000 * parseInt(fastLaneSizeMs, 10);
// Remember which recognition this upload belongs to, so the loop stops
// once the session's recogNumber no longer matches.
const startRecogNumber = this.privDialogRequestSession.recogNumber;
const readAndUploadCycle = () => {
// If speech is done, stop sending audio.
// NOTE(review): when this guard fails, nothing in this function settles `deferred`.
if (!this.privDialogIsDisposed &&
!this.privDialogRequestSession.isSpeechEnded &&
this.privDialogRequestSession.isRecognizing &&
this.privDialogRequestSession.recogNumber === startRecogNumber) {
this.fetchDialogConnection().on((connection) => {
audioStreamNode.read().on((audioStreamChunk) => {
// we have a new audio chunk to upload.
if (this.privDialogRequestSession.isSpeechEnded) {
// If service already recognized audio end then don't send any more audio
deferred.resolve(true);
return;
}
let payload;
let sendDelay;
if (audioStreamChunk.isEnd) {
// End of stream is signalled to the service with a null payload, sent immediately.
payload = null;
sendDelay = 0;
}
else {
payload = audioStreamChunk.buffer;
this.privDialogRequestSession.onAudioSent(payload.byteLength);
if (maxSendUnthrottledBytes >= this.privDialogRequestSession.bytesSent) {
// Still within the unthrottled budget.
sendDelay = 0;
}
else {
sendDelay = Math.max(0, nextSendTime - Date.now());
}
}
// Are we ready to send, or need we delay more?
setTimeout(() => {
if (payload !== null) {
// Schedule the next send after half of this chunk's duration at the
// average byte rate.
nextSendTime = Date.now() + (payload.byteLength * 1000 / (audioFormat.avgBytesPerSec * 2));
}
const uploaded = connection.send(new SpeechConnectionMessage(MessageType.Binary, "audio", this.privDialogRequestSession.requestId, null, payload));
if (!audioStreamChunk.isEnd) {
uploaded.continueWith((_) => {
// Regardless of success or failure, schedule the next upload.
// If the underlying connection was broken, the next cycle will
// get a new connection and re-transmit missing audio automatically.
readAndUploadCycle();
});
}
else {
// the audio stream has been closed, no need to schedule next
// read-upload cycle.
this.privDialogRequestSession.onSpeechEnded();
deferred.resolve(true);
}
}, sendDelay);
}, (error) => {
if (this.privDialogRequestSession.isSpeechEnded) {
// For whatever reason, Reject is used to remove queue subscribers inside
// the Queue.DrainAndDispose invoked from DetachAudioNode down below, which
// means that sometimes things can be rejected in normal circumstances, without
// any errors.
deferred.resolve(true); // TODO: remove the argument, it is completely meaningless.
}
else {
// Only reject, if there was a proper error.
deferred.reject(error);
}
});
}, (error) => {
// The connection could not be obtained: fail the upload.
deferred.reject(error);
});
}
};
// Kick off the first cycle; later cycles are chained from the send completion.
readAndUploadCycle();
return deferred.promise();
};
/**
 * Reads one message from the dialog connection, dispatches it by path and
 * then calls itself again to wait for the next message.
 *
 * Paths handled here: "turn.start", "speech.startdetected",
 * "speech.enddetected" and "turn.end". Every other path is forwarded to
 * `processTypeSpecificMessages`.
 *
 * The loop stops when the adapter is disposed or `terminateMessageLoop` is
 * set; at that point this invocation's promise is resolved with `undefined`.
 *
 * NOTE(review): the self-invocations pass no callbacks, so only the first
 * message processed by a given outer call sees `successCallback` /
 * `errorCallBack`. Also, when fetching the connection fails the loop flag is
 * set but the returned promise is not settled here.
 *
 * @param successCallback Forwarded to `processTypeSpecificMessages`.
 * @param errorCallBack Forwarded to `processTypeSpecificMessages`.
 * @returns Promise of the Deferred created for this invocation.
 */
this.receiveDialogMessageOverride = (successCallback, errorCallBack) => {
    // we won't rely on the cascading promises of the connection since we want to continually be available to receive messages
    const communicationCustodian = new Deferred();
    this.fetchDialogConnection().on((connection) => {
        return connection.read()
            .onSuccessContinueWithPromise((message) => {
                const isDisposed = this.isDisposed();
                const terminateMessageLoop = (!this.isDisposed() && this.terminateMessageLoop);
                if (isDisposed || terminateMessageLoop) {
                    // We're done.
                    communicationCustodian.resolve(undefined);
                    return PromiseHelper.fromResult(undefined);
                }
                if (!message) {
                    // Nothing was read; keep waiting.
                    return this.receiveDialogMessageOverride();
                }
                const connectionMessage = SpeechConnectionMessage.fromConnectionMessage(message);
                switch (connectionMessage.path.toLowerCase()) {
                    case "turn.start": {
                        const turnRequestId = connectionMessage.requestId.toUpperCase();
                        const audioSessionReqId = this.privDialogRequestSession.requestId.toUpperCase();
                        // turn started by the service
                        if (turnRequestId !== audioSessionReqId) {
                            this.privTurnStateManager.StartTurn(turnRequestId);
                        }
                        break;
                    }
                    case "speech.startdetected": {
                        const speechStartDetected = SpeechDetected.fromJSON(connectionMessage.textBody);
                        const speechStartEventArgs = new RecognitionEventArgs(speechStartDetected.Offset, this.privDialogRequestSession.sessionId);
                        if (!!this.privRecognizer.speechStartDetected) {
                            this.privRecognizer.speechStartDetected(this.privRecognizer, speechStartEventArgs);
                        }
                        break;
                    }
                    case "speech.enddetected": {
                        // If the request was empty, the JSON returned is empty; substitute a
                        // zero offset. The fallback must be valid JSON (quoted key), otherwise
                        // a strict JSON parser rejects it.
                        const json = connectionMessage.textBody.length > 0
                            ? connectionMessage.textBody
                            : "{ \"Offset\": 0 }";
                        const speechStopDetected = SpeechDetected.fromJSON(json);
                        this.privDialogRequestSession.onServiceRecognized(speechStopDetected.Offset + this.privDialogRequestSession.currentTurnAudioOffset);
                        const speechStopEventArgs = new RecognitionEventArgs(speechStopDetected.Offset + this.privDialogRequestSession.currentTurnAudioOffset, this.privDialogRequestSession.sessionId);
                        if (!!this.privRecognizer.speechEndDetected) {
                            this.privRecognizer.speechEndDetected(this.privRecognizer, speechStopEventArgs);
                        }
                        break;
                    }
                    case "turn.end": {
                        const turnEndRequestId = connectionMessage.requestId.toUpperCase();
                        const audioSessionReqId = this.privDialogRequestSession.requestId.toUpperCase();
                        // turn started by the service
                        if (turnEndRequestId !== audioSessionReqId) {
                            this.privTurnStateManager.CompleteTurn(turnEndRequestId);
                        }
                        else {
                            // Audio session turn
                            const sessionStopEventArgs = new SessionEventArgs(this.privDialogRequestSession.sessionId);
                            this.privDialogRequestSession.onServiceTurnEndResponse(false);
                            if (this.privDialogRequestSession.isSpeechEnded) {
                                if (!!this.privRecognizer.sessionStopped) {
                                    this.privRecognizer.sessionStopped(this.privRecognizer, sessionStopEventArgs);
                                }
                            }
                        }
                        break;
                    }
                    default:
                        this.processTypeSpecificMessages(connectionMessage, successCallback, errorCallBack);
                }
                // Re-arm for the next message.
                return this.receiveDialogMessageOverride();
            });
    }, (error) => {
        this.terminateMessageLoop = true;
    });
    return communicationCustodian.promise();
};
/**
 * Returns the promise of a configured dialog connection by delegating to
 * `configConnection()`.
 */
this.fetchDialogConnection = () => this.configConnection();
/**
 * Sends the agent configuration on the "agent.config" path, at most once
 * until `agentConfigSent` is reset.
 *
 * @param connection Connection used to send the message.
 * @returns The promise returned by `connection.send`, or an already
 *          completed promise yielding `true` when there is no agent config
 *          or it was already sent.
 */
this.sendAgentConfig = (connection) => {
    if (!this.agentConfig || this.agentConfigSent) {
        return PromiseHelper.fromResult(true);
    }
    const serializedConfig = this.agentConfig.toJsonString();
    // Mark as sent before issuing the send, exactly once per connection.
    this.agentConfigSent = true;
    const configMessage = new SpeechConnectionMessage(MessageType.Text, "agent.config", this.privDialogRequestSession.requestId, "application/json", serializedConfig);
    return connection.send(configMessage);
};
/**
 * Sends an agent context message on the "speech.agent.context" path,
 * carrying a freshly generated interaction id and empty channel data.
 *
 * @param connection Connection used to send the message.
 * @returns The promise returned by `connection.send`.
 */
this.sendAgentContext = (connection) => {
    const interactionId = createGuid();
    const serializedContext = JSON.stringify({
        channelData: "",
        context: {
            interactionId
        },
        version: 0.5
    });
    const contextMessage = new SpeechConnectionMessage(MessageType.Text, "speech.agent.context", this.privDialogRequestSession.requestId, "application/json", serializedContext);
    return connection.send(contextMessage);
};
// Collaborators handed to the constructor.
this.privDialogServiceConnector = dialogServiceConnector;
this.privDialogAuthentication = authentication;
this.privDialogConnectionFactory = connectionFactory;
this.privDialogAudioSource = audioSource;
// Per-adapter state.
this.privDialogRequestSession = new RequestSession(audioSource.id());
this.privTurnStateManager = new DialogServiceTurnStateManager();
this.privDialogIsDisposed = false;
this.agentConfigSent = false;
// Override hooks pointing at this adapter's implementations
// (presumably consumed by ServiceRecognizerBase; confirm against the base class).
this.receiveMessageOverride = this.receiveDialogMessageOverride;
this.recognizeOverride = this.listenOnce;
this.connectImplOverride = this.dialogConnectImpl;
this.configConnectionOverride = this.configConnection;
this.fetchConnectionOverride = this.fetchDialogConnection;
this.disconnectOverride = this.privDisconnect;
}
isDisposed() {
return this.privDialogIsDisposed;
}
dispose(reason) {
this.privDialogIsDisposed = true;
if (this.privConnectionConfigPromise) {
this.privConnectionConfigPromise.onSuccessContinueWith((connection) => {
connection.dispose(reason);
});
}
}
privDisconnect() {
this.cancelRecognition(this.privDialogRequestSession.sessionId, this.privDialogRequestSession.requestId, CancellationReason.Error, CancellationErrorCode.NoError, "Disconnecting", undefined);
this.terminateMessageLoop = true;
this.agentConfigSent = false;
if (this.privDialogConnectionPromise.result().isCompleted) {
if (!this.privDialogConnectionPromise.result().isError) {
this.privDialogConnectionPromise.result().result.dispose();
this.privDialogConnectionPromise = null;
}
}
else {
this.privDialogConnectionPromise.onSuccessContinueWith((connection) => {
connection.dispose();
});
}
}
processTypeSpecificMessages(connectionMessage, successCallback, errorCallBack) {
const resultProps = new PropertyCollection();
if (connectionMessage.messageType === MessageType.Text) {
resultProps.setProperty(PropertyId.SpeechServiceResponse_JsonResult, connectionMessage.textBody);
}
let result;
switch (connectionMessage.path.toLowerCase()) {
case "speech.phrase":
const speechPhrase = SimpleSpeechPhrase.fromJSON(connectionMessage.textBody);
this.privDialogRequestSession.onPhraseRecognized(this.privDialogRequestSession.currentTurnAudioOffset + speechPhrase.Offset + speechPhrase.Duration);
if (speechPhrase.RecognitionStatus === RecognitionStatus.Success) {
const args = this.fireEventForResult(speechPhrase, resultProps);
if (!!this.privDialogServiceConnector.recognized) {
try {
this.privDialogServiceConnector.recognized(this.privDialogServiceConnector, args);
/* tslint:disable:no-empty */
}
catch (error) {
// Not going to let errors in the event handler
// trip things up.
}
}
// report result to promise.
if (!!this.privSuccessCallback) {
try {
this.privSuccessCallback(args.result);
}
catch (e) {
if (!!errorCallBack) {
errorCallBack(e);
}
}
// Only invoke the call back once.
// and if it's successful don't invoke the
// error after that.
this.privSuccessCallback = undefined;
errorCallBack = undefined;
}
}
break;
case "speech.hypothesis":
const hypothesis = SpeechHypothesis.fromJSON(connectionMessage.textBody);
const offset = hypothesis.Offset + this.privDialogRequestSession.currentTurnAudioOffset;
result = new SpeechRecognitionResult(this.privDialogRequestSession.requestId, ResultReason.RecognizingSpeech, hypothesis.Text, hypothesis.Duration, offset, undefined, connectionMessage.textBody, resultProps);
this.privDialogRequestSession.onHypothesis(offset);
const ev = new SpeechRecognitionEventArgs(result, hypothesis.Duration, this.privDialogRequestSession.sessionId);
if (!!this.privDialogServiceConnector.recognizing) {
try {
this.privDialogServiceConnector.recognizing(this.privDialogServiceConnector, ev);
/* tslint:disable:no-empty */
}
catch (error) {
// Not going to let errors in the event handler
// trip things up.
}
}
break;
case "audio":
{
const audioRequestId = connectionMessage.requestId.toUpperCase();
const turn = this.privTurnStateManager.GetTurn(audioRequestId);
try {
// Empty binary message signals end of stream.
if (!connectionMessage.binaryBody) {
turn.endAudioStream();
}
else {
turn.audioStream.write(connectionMessage.binaryBody);
}
}
catch (error) {
// Not going to let errors in the event handler
// trip things up.
}
}
break;
case "response":
{
const responseRequestId = connectionMessage.requestId.toUpperCase();
const activityPayload = ActivityPayloadResponse.fromJSON(connectionMessage.textBody);
const turn = this.privTurnStateManager.GetTurn(responseRequestId);
// update the conversation Id
if (activityPayload.conversationId) {
const updateAgentConfig = this.agentConfig.get();
updateAgentConfig.botInfo.conversationId = activityPayload.conversationId;
this.agentConfig.set(updateAgentConfig);
}
const pullAudioOutputStream = turn.processActivityPayload(activityPayload);
const activity = new ActivityReceivedEventArgs(activityPayload.messagePayload, pullAudioOutputStream);
if (!!this.privDialogServiceConnector.activityReceived) {
try {
this.privDialogServiceConnector.activityReceived(this.privDialogServiceConnector, activity);
/* tslint:disable:no-empty */
}
catch (error) {
// Not going to let errors in the event handler
// trip things up.
}
}
}
break;
default:
break;
}
}
/**
 * Cancels recognition: stops the message loop, stops the request session if
 * it is recognizing, raises the connector's `canceled` event (when a handler
 * is attached) and reports a Canceled result to `cancelRecoCallback` (when
 * one is supplied).
 *
 * The callback is notified independently of whether a `canceled` handler is
 * attached, so a caller waiting on the result is not left without an answer.
 * Exceptions thrown by either the handler or the callback are swallowed.
 *
 * @param sessionId Session id placed on the canceled event.
 * @param requestId Not referenced by this method.
 * @param cancellationReason Reason placed on the canceled event.
 * @param errorCode Error code; its enum name is stored in the result properties.
 * @param error Error details for the event and the result.
 * @param cancelRecoCallback Optional callback receiving the Canceled result.
 */
cancelRecognition(sessionId, requestId, cancellationReason, errorCode, error, cancelRecoCallback) {
    this.terminateMessageLoop = true;
    if (!!this.privDialogRequestSession.isRecognizing) {
        this.privDialogRequestSession.onStopRecognizing();
    }
    const properties = new PropertyCollection();
    properties.setProperty(CancellationErrorCodePropertyName, CancellationErrorCode[errorCode]);
    if (!!this.privDialogServiceConnector.canceled) {
        const cancelEvent = new SpeechRecognitionCanceledEventArgs(cancellationReason, error, errorCode, undefined, sessionId);
        try {
            this.privDialogServiceConnector.canceled(this.privDialogServiceConnector, cancelEvent);
            /* tslint:disable:no-empty */
        }
        catch (_a) { }
    }
    if (!!cancelRecoCallback) {
        const result = new SpeechRecognitionResult(undefined, // ResultId
        ResultReason.Canceled, undefined, // Text
        undefined, // Duration
        undefined, // Offset
        error, undefined, // Json
        properties);
        try {
            cancelRecoCallback(result);
            /* tslint:disable:no-empty */
        }
        catch (_b) { }
    }
}
// Establishes a websocket connection to the end point.
dialogConnectImpl(isUnAuthorized = false) {
if (this.privDialogConnectionPromise) {
if (this.privDialogConnectionPromise.result().isCompleted &&
(this.privDialogConnectionPromise.result().isError
|| this.privDialogConnectionPromise.result().result.state() === ConnectionState.Disconnected)) {
this.agentConfigSent = false;
this.privDialogConnectionPromise = null;
}
else {
return this.privDialogConnectionPromise;
}
}
this.privDialogAuthFetchEventId = createNoDashGuid();
// keep the connectionId for reconnect events
if (this.privConnectionId === undefined) {
this.privConnectionId = createNoDashGuid();
}
this.privDialogRequestSession.onPreConnectionStart(this.privDialogAuthFetchEventId, this.privConnectionId);
const authPromise = isUnAuthorized ? this.privDialogAuthentication.fetchOnExpiry(this.privDialogAuthFetchEventId) : this.privDialogAuthentication.fetch(this.privDialogAuthFetchEventId);
this.privDialogConnectionPromise = authPromise
.continueWithPromise((result) => {
if (result.isError) {
this.privDialogRequestSession.onAuthCompleted(true, result.error);
throw new Error(result.error);
}
else {
this.privDialogRequestSession.onAuthCompleted(false);
}
const connection = this.privDialogConnectionFactory.create(this.privRecognizerConfig, result.result, this.privConnectionId);
this.privDialogRequestSession.listenForServiceTelemetry(connection.events);
// Attach to the underlying event. No need to hold onto the detach pointers as in the event the connection goes away,
// it'll stop sending events.
connection.events.attach((event) => {
this.connectionEvents.onEvent(event);
});
return connection.open().onSuccessContinueWithPromise((response) => {
if (response.statusCode === 200) {
this.privDialogRequestSession.onPreConnectionStart(this.privDialogAuthFetchEventId, this.privConnectionId);
this.privDialogRequestSession.onConnectionEstablishCompleted(response.statusCode);
return PromiseHelper.fromResult(connection);
}
else if (response.statusCode === 403 && !isUnAuthorized) {
return this.dialogConnectImpl(true);
}
else {
this.privDialogRequestSession.onConnectionEstablishCompleted(response.statusCode, response.reason);
return PromiseHelper.fromError(`Unable to contact server. StatusCode: ${response.statusCode}, ${this.privRecognizerConfig.parameters.getProperty(PropertyId.SpeechServiceConnection_Endpoint)} Reason: ${response.reason}`);
}
});
});
this.privConnectionLoop = this.startMessageLoop();
return this.privDialogConnectionPromise;
}
startMessageLoop() {
this.terminateMessageLoop = false;
const messageRetrievalPromise = this.receiveDialogMessageOverride();
return messageRetrievalPromise.on((r) => {
return true;
}, (error) => {
this.cancelRecognition(this.privDialogRequestSession.sessionId, this.privDialogRequestSession.requestId, CancellationReason.Error, CancellationErrorCode.RuntimeError, error, this.privSuccessCallback);
});
}
// Takes an established websocket connection to the endpoint and sends speech configuration information.
configConnection() {
if (this.privConnectionConfigPromise) {
if (this.privConnectionConfigPromise.result().isCompleted &&
(this.privConnectionConfigPromise.result().isError
|| this.privConnectionConfigPromise.result().result.state() === ConnectionState.Disconnected)) {
this.privConnectionConfigPromise = null;
return this.configConnection();
}
else {
return this.privConnectionConfigPromise;
}
}
this.privConnectionConfigPromise = this.dialogConnectImpl().onSuccessContinueWithPromise((connection) => {
return this.sendSpeechServiceConfig(connection, this.privDialogRequestSession, this.privRecognizerConfig.SpeechServiceConfig.serialize())
.onSuccessContinueWithPromise((_) => {
return this.sendAgentConfig(connection).onSuccessContinueWith((_) => {
return connection;
});
});
});
return this.privConnectionConfigPromise;
}
sendPreAudioMessages() {
this.fetchDialogConnection().onSuccessContinueWith((connection) => {
this.sendAgentContext(connection);
});
}
fireEventForResult(serviceResult, properties) {
const resultReason = EnumTranslation.implTranslateRecognitionResult(serviceResult.RecognitionStatus);
const offset = serviceResult.Offset + this.privDialogRequestSession.currentTurnAudioOffset;
const result = new SpeechRecognitionResult(this.privDialogRequestSession.requestId, resultReason, serviceResult.DisplayText, serviceResult.Duration, offset, undefined, JSON.stringify(serviceResult), properties);
const ev = new SpeechRecognitionEventArgs(result, offset, this.privDialogRequestSession.sessionId);
return ev;
}
}
//# sourceMappingURL=DialogServiceAdapter.js.map