microsoft-cognitiveservices-speech-sdk
"use strict"; // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT license. Object.defineProperty(exports, "__esModule", { value: true }); exports.DialogServiceAdapter = void 0; const Exports_js_1 = require("../common.browser/Exports.js"); const DialogEvents_js_1 = require("../common/DialogEvents.js"); const Exports_js_2 = require("../common/Exports.js"); const AudioOutputFormat_js_1 = require("../sdk/Audio/AudioOutputFormat.js"); const Exports_js_3 = require("../sdk/Exports.js"); const DialogServiceTurnStateManager_js_1 = require("./DialogServiceTurnStateManager.js"); const Exports_js_4 = require("./Exports.js"); const ActivityResponsePayload_js_1 = require("./ServiceMessages/ActivityResponsePayload.js"); const SpeechConnectionMessage_Internal_js_1 = require("./SpeechConnectionMessage.Internal.js"); class DialogServiceAdapter extends Exports_js_4.ServiceRecognizerBase { constructor(authentication, connectionFactory, audioSource, recognizerConfig, dialogServiceConnector) { super(authentication, connectionFactory, audioSource, recognizerConfig, dialogServiceConnector); this.privEvents = new Exports_js_2.EventSource(); this.privDialogServiceConnector = dialogServiceConnector; this.receiveMessageOverride = () => this.receiveDialogMessageOverride(); this.privTurnStateManager = new DialogServiceTurnStateManager_js_1.DialogServiceTurnStateManager(); this.recognizeOverride = (recoMode, successCallback, errorCallback) => this.listenOnce(recoMode, successCallback, errorCallback); this.postConnectImplOverride = (connection) => this.dialogConnectImpl(connection); this.configConnectionOverride = (connection) => this.configConnection(connection); this.disconnectOverride = () => this.privDisconnect(); this.privDialogAudioSource = audioSource; this.agentConfigSent = false; this.privLastResult = null; this.connectionEvents.attach((connectionEvent) => { if (connectionEvent.name === "ConnectionClosedEvent") { this.terminateMessageLoop = true; } }); } async sendMessage(message) { const interactionGuid = Exports_js_2.createGuid(); const requestId = Exports_js_2.createNoDashGuid(); const agentMessage = { context: { interactionId: interactionGuid }, // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment messagePayload: JSON.parse(message), version: 0.5 }; const agentMessageJson = JSON.stringify(agentMessage); const connection = await this.fetchConnection(); await connection.send(new SpeechConnectionMessage_Internal_js_1.SpeechConnectionMessage(Exports_js_2.MessageType.Text, "agent", requestId, "application/json", agentMessageJson)); } async privDisconnect() { await this.cancelRecognition(this.privRequestSession.sessionId, this.privRequestSession.requestId, Exports_js_3.CancellationReason.Error, Exports_js_3.CancellationErrorCode.NoError, "Disconnecting"); this.terminateMessageLoop = true; this.agentConfigSent = false; return; } processTypeSpecificMessages(connectionMessage) { const resultProps = new Exports_js_3.PropertyCollection(); if (connectionMessage.messageType === Exports_js_2.MessageType.Text) { resultProps.setProperty(Exports_js_3.PropertyId.SpeechServiceResponse_JsonResult, connectionMessage.textBody); } let result; let processed; switch (connectionMessage.path.toLowerCase()) { case "speech.phrase": const speechPhrase = Exports_js_4.SimpleSpeechPhrase.fromJSON(connectionMessage.textBody, this.privRequestSession.currentTurnAudioOffset); this.privRequestSession.onPhraseRecognized(speechPhrase.Offset + speechPhrase.Duration); if 
(speechPhrase.RecognitionStatus !== Exports_js_4.RecognitionStatus.TooManyRequests && speechPhrase.RecognitionStatus !== Exports_js_4.RecognitionStatus.Error) { const args = this.fireEventForResult(speechPhrase, resultProps); this.privLastResult = args.result; if (!!this.privDialogServiceConnector.recognized) { try { this.privDialogServiceConnector.recognized(this.privDialogServiceConnector, args); /* eslint-disable no-empty */ } catch (error) { // Not going to let errors in the event handler // trip things up. } } } processed = true; break; case "speech.hypothesis": const hypothesis = Exports_js_4.SpeechHypothesis.fromJSON(connectionMessage.textBody, this.privRequestSession.currentTurnAudioOffset); result = new Exports_js_3.SpeechRecognitionResult(this.privRequestSession.requestId, Exports_js_3.ResultReason.RecognizingSpeech, hypothesis.Text, hypothesis.Duration, hypothesis.Offset, hypothesis.Language, hypothesis.LanguageDetectionConfidence, undefined, undefined, hypothesis.asJson(), resultProps); this.privRequestSession.onHypothesis(hypothesis.Offset); const ev = new Exports_js_3.SpeechRecognitionEventArgs(result, hypothesis.Offset, this.privRequestSession.sessionId); if (!!this.privDialogServiceConnector.recognizing) { try { this.privDialogServiceConnector.recognizing(this.privDialogServiceConnector, ev); /* eslint-disable no-empty */ } catch (error) { // Not going to let errors in the event handler // trip things up. } } processed = true; break; case "speech.keyword": const keyword = Exports_js_4.SpeechKeyword.fromJSON(connectionMessage.textBody, this.privRequestSession.currentTurnAudioOffset); result = new Exports_js_3.SpeechRecognitionResult(this.privRequestSession.requestId, keyword.Status === "Accepted" ? Exports_js_3.ResultReason.RecognizedKeyword : Exports_js_3.ResultReason.NoMatch, keyword.Text, keyword.Duration, keyword.Offset, undefined, undefined, undefined, undefined, keyword.asJson(), resultProps); if (keyword.Status !== "Accepted") { this.privLastResult = result; } const event = new Exports_js_3.SpeechRecognitionEventArgs(result, result.duration, result.resultId); if (!!this.privDialogServiceConnector.recognized) { try { this.privDialogServiceConnector.recognized(this.privDialogServiceConnector, event); /* eslint-disable no-empty */ } catch (error) { // Not going to let errors in the event handler // trip things up. } } processed = true; break; case "audio": { const audioRequestId = connectionMessage.requestId.toUpperCase(); const turn = this.privTurnStateManager.GetTurn(audioRequestId); try { // Empty binary message signals end of stream. if (!connectionMessage.binaryBody) { turn.endAudioStream(); } else { turn.audioStream.write(connectionMessage.binaryBody); } } catch (error) { // Not going to let errors in the event handler // trip things up. } } processed = true; break; case "response": { this.handleResponseMessage(connectionMessage); } processed = true; break; default: break; } const defferal = new Exports_js_2.Deferred(); defferal.resolve(processed); return defferal.promise; } // Cancels recognition. 
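    // Usage sketch (illustrative; assumes a DialogServiceConnector wired to this
    // adapter): the `recognizing` and `recognized` callbacks invoked above are
    // the ones an application assigns on the public connector, e.g.:
    //
    //     connector.recognizing = (_sender, e) => console.log(`RECOGNIZING: ${e.result.text}`);
    //     connector.recognized = (_sender, e) => console.log(`RECOGNIZED: ${e.result.text}`);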
    // Cancels recognition.
    async cancelRecognition(sessionId, requestId, cancellationReason, errorCode, error) {
        this.terminateMessageLoop = true;
        if (!!this.privRequestSession.isRecognizing) {
            await this.privRequestSession.onStopRecognizing();
        }
        if (!!this.privDialogServiceConnector.canceled) {
            const properties = new Exports_js_3.PropertyCollection();
            properties.setProperty(Exports_js_4.CancellationErrorCodePropertyName, Exports_js_3.CancellationErrorCode[errorCode]);
            const cancelEvent = new Exports_js_3.SpeechRecognitionCanceledEventArgs(cancellationReason, error, errorCode, undefined, sessionId);
            try {
                this.privDialogServiceConnector.canceled(this.privDialogServiceConnector, cancelEvent);
                /* eslint-disable no-empty */
            }
            catch { }
            if (!!this.privSuccessCallback) {
                const result = new Exports_js_3.SpeechRecognitionResult(undefined, // ResultId
                Exports_js_3.ResultReason.Canceled, undefined, // Text
                undefined, // Duration
                undefined, // Offset
                undefined, // Language
                undefined, // Language Detection Confidence
                undefined, // Speaker Id
                error, undefined, // Json
                properties);
                try {
                    this.privSuccessCallback(result);
                    this.privSuccessCallback = undefined;
                    /* eslint-disable no-empty */
                }
                catch { }
            }
        }
    }
    async listenOnce(recoMode, successCallback, errorCallback) {
        this.privRecognizerConfig.recognitionMode = recoMode;
        this.privSuccessCallback = successCallback;
        this.privErrorCallback = errorCallback;
        this.privRequestSession.startNewRecognition();
        this.privRequestSession.listenForServiceTelemetry(this.privDialogAudioSource.events);
        this.privRecognizerConfig.parameters.setProperty(Exports_js_3.PropertyId.Speech_SessionId, this.privRequestSession.sessionId);
        // Start the connection to the service. The promise this will create is stored and will be used by configConnection().
        const conPromise = this.connectImpl();
        const preAudioPromise = this.sendPreAudioMessages();
        const node = await this.privDialogAudioSource.attach(this.privRequestSession.audioNodeId);
        const format = await this.privDialogAudioSource.format;
        const deviceInfo = await this.privDialogAudioSource.deviceInfo;
        const audioNode = new Exports_js_1.ReplayableAudioNode(node, format.avgBytesPerSec);
        await this.privRequestSession.onAudioSourceAttachCompleted(audioNode, false);
        this.privRecognizerConfig.SpeechServiceConfig.Context.audio = { source: deviceInfo };
        try {
            await conPromise;
            await preAudioPromise;
        }
        catch (error) {
            await this.cancelRecognition(this.privRequestSession.sessionId, this.privRequestSession.requestId, Exports_js_3.CancellationReason.Error, Exports_js_3.CancellationErrorCode.ConnectionFailure, error);
            return Promise.resolve();
        }
        const sessionStartEventArgs = new Exports_js_3.SessionEventArgs(this.privRequestSession.sessionId);
        if (!!this.privRecognizer.sessionStarted) {
            this.privRecognizer.sessionStarted(this.privRecognizer, sessionStartEventArgs);
        }
        const audioSendPromise = this.sendAudio(audioNode);
        // /* eslint-disable no-empty */
        audioSendPromise.then(() => { }, async (error) => {
            await this.cancelRecognition(this.privRequestSession.sessionId, this.privRequestSession.requestId, Exports_js_3.CancellationReason.Error, Exports_js_3.CancellationErrorCode.RuntimeError, error);
        });
    }
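    // Usage sketch (illustrative): listenOnce() above is what ultimately runs
    // when an application calls the public listenOnceAsync(), and
    // cancelRecognition() surfaces through the connector's `canceled` callback, e.g.:
    //
    //     connector.canceled = (_sender, e) => console.error(`CANCELED: ${e.errorDetails}`);
    //     connector.listenOnceAsync(
    //         (result) => console.log(`Heard: ${result.text}`),
    //         (err) => console.error(err));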
    // Establishes a websocket connection to the endpoint.
    dialogConnectImpl(connection) {
        this.privConnectionLoop = this.startMessageLoop();
        return connection;
    }
    receiveDialogMessageOverride() {
        // We won't rely on the cascading promises of the connection since we want to continually be available to receive messages.
        const communicationCustodian = new Exports_js_2.Deferred();
        const loop = async () => {
            try {
                const isDisposed = this.isDisposed();
                const terminateMessageLoop = (!this.isDisposed() && this.terminateMessageLoop);
                if (isDisposed || terminateMessageLoop) {
                    // We're done.
                    communicationCustodian.resolve(undefined);
                    return;
                }
                const connection = await this.fetchConnection();
                const message = await connection.read();
                if (!message) {
                    return loop();
                }
                const connectionMessage = SpeechConnectionMessage_Internal_js_1.SpeechConnectionMessage.fromConnectionMessage(message);
                switch (connectionMessage.path.toLowerCase()) {
                    case "turn.start":
                        {
                            const turnRequestId = connectionMessage.requestId.toUpperCase();
                            const audioSessionReqId = this.privRequestSession.requestId.toUpperCase();
                            // turn started by the service
                            if (turnRequestId !== audioSessionReqId) {
                                this.privTurnStateManager.StartTurn(turnRequestId);
                            }
                            else {
                                this.privRequestSession.onServiceTurnStartResponse();
                            }
                        }
                        break;
                    case "speech.startdetected":
                        const speechStartDetected = Exports_js_4.SpeechDetected.fromJSON(connectionMessage.textBody, this.privRequestSession.currentTurnAudioOffset);
                        const speechStartEventArgs = new Exports_js_3.RecognitionEventArgs(speechStartDetected.Offset, this.privRequestSession.sessionId);
                        if (!!this.privRecognizer.speechStartDetected) {
                            this.privRecognizer.speechStartDetected(this.privRecognizer, speechStartEventArgs);
                        }
                        break;
                    case "speech.enddetected":
                        let json;
                        if (connectionMessage.textBody.length > 0) {
                            json = connectionMessage.textBody;
                        }
                        else {
                            // If the request was empty, the JSON returned is empty.
                            json = "{\"Offset\": 0}";
                        }
                        const speechStopDetected = Exports_js_4.SpeechDetected.fromJSON(json, this.privRequestSession.currentTurnAudioOffset);
                        this.privRequestSession.onServiceRecognized(speechStopDetected.Offset);
                        const speechStopEventArgs = new Exports_js_3.RecognitionEventArgs(speechStopDetected.Offset, this.privRequestSession.sessionId);
                        if (!!this.privRecognizer.speechEndDetected) {
                            this.privRecognizer.speechEndDetected(this.privRecognizer, speechStopEventArgs);
                        }
                        break;
                    case "turn.end":
                        {
                            const turnEndRequestId = connectionMessage.requestId.toUpperCase();
                            const audioSessionReqId = this.privRequestSession.requestId.toUpperCase();
                            // turn ended by the service
                            if (turnEndRequestId !== audioSessionReqId) {
                                this.privTurnStateManager.CompleteTurn(turnEndRequestId);
                            }
                            else {
                                // Audio session turn
                                const sessionStopEventArgs = new Exports_js_3.SessionEventArgs(this.privRequestSession.sessionId);
                                await this.privRequestSession.onServiceTurnEndResponse(false);
                                if (!this.privRecognizerConfig.isContinuousRecognition || this.privRequestSession.isSpeechEnded || !this.privRequestSession.isRecognizing) {
                                    if (!!this.privRecognizer.sessionStopped) {
                                        this.privRecognizer.sessionStopped(this.privRecognizer, sessionStopEventArgs);
                                    }
                                }
                                // report result to promise.
                                if (!!this.privSuccessCallback && this.privLastResult) {
                                    try {
                                        this.privSuccessCallback(this.privLastResult);
                                        this.privLastResult = null;
                                    }
                                    catch (e) {
                                        if (!!this.privErrorCallback) {
                                            this.privErrorCallback(e);
                                        }
                                    }
                                    // Only invoke the callback once,
                                    // and if it's successful don't invoke the
                                    // error callback after that.
                                    this.privSuccessCallback = undefined;
                                    this.privErrorCallback = undefined;
                                }
                            }
                        }
                        break;
                    default:
                        try {
                            const processed = await this.processTypeSpecificMessages(connectionMessage);
                            if (!processed) {
                                if (!!this.serviceEvents) {
                                    this.serviceEvents.onEvent(new Exports_js_2.ServiceEvent(connectionMessage.path.toLowerCase(), connectionMessage.textBody));
                                }
                            }
                        }
                        catch (e) {
                            //
                        }
                }
                const ret = loop();
                return ret;
            }
            catch (error) {
                this.terminateMessageLoop = true;
                communicationCustodian.resolve();
            }
        };
        loop().catch((reason) => {
            Exports_js_2.Events.instance.onEvent(new Exports_js_2.BackgroundEvent(reason));
        });
        return communicationCustodian.promise;
    }
    async startMessageLoop() {
        this.terminateMessageLoop = false;
        try {
            await this.receiveDialogMessageOverride();
        }
        catch (error) {
            await this.cancelRecognition(this.privRequestSession.sessionId, this.privRequestSession.requestId, Exports_js_3.CancellationReason.Error, Exports_js_3.CancellationErrorCode.RuntimeError, error);
        }
        return Promise.resolve();
    }
    // Takes an established websocket connection to the endpoint and sends speech configuration information.
    async configConnection(connection) {
        if (this.terminateMessageLoop) {
            this.terminateMessageLoop = false;
            return Promise.reject("Connection to service terminated.");
        }
        await this.sendSpeechServiceConfig(connection, this.privRequestSession, this.privRecognizerConfig.SpeechServiceConfig.serialize());
        await this.sendAgentConfig(connection);
        return connection;
    }
    async sendPreAudioMessages() {
        const connection = await this.fetchConnection();
        this.addKeywordContextData();
        await this.sendSpeechContext(connection, true);
        await this.sendAgentContext(connection);
        await this.sendWaveHeader(connection);
    }
    sendAgentConfig(connection) {
        if (this.agentConfig && !this.agentConfigSent) {
            if (this.privRecognizerConfig.parameters.getProperty(Exports_js_3.PropertyId.Conversation_DialogType) === Exports_js_3.DialogServiceConfig.DialogTypes.CustomCommands) {
                const config = this.agentConfig.get();
                config.botInfo.commandsCulture = this.privRecognizerConfig.parameters.getProperty(Exports_js_3.PropertyId.SpeechServiceConnection_RecoLanguage, "en-us");
                this.agentConfig.set(config);
            }
            this.onEvent(new DialogEvents_js_1.SendingAgentContextMessageEvent(this.agentConfig));
            const agentConfigJson = this.agentConfig.toJsonString();
            // guard against sending this multiple times on one connection
            this.agentConfigSent = true;
            return connection.send(new SpeechConnectionMessage_Internal_js_1.SpeechConnectionMessage(Exports_js_2.MessageType.Text, "agent.config", this.privRequestSession.requestId, "application/json", agentConfigJson));
        }
        return;
    }
    sendAgentContext(connection) {
        const guid = Exports_js_2.createGuid();
        const speechActivityTemplate = this.privDialogServiceConnector.properties.getProperty(Exports_js_3.PropertyId.Conversation_Speech_Activity_Template);
        const agentContext = {
            channelData: "",
            context: {
                interactionId: guid
            },
            messagePayload: typeof speechActivityTemplate === "undefined" ? undefined : speechActivityTemplate,
            version: 0.5
        };
        const agentContextJson = JSON.stringify(agentContext);
        return connection.send(new SpeechConnectionMessage_Internal_js_1.SpeechConnectionMessage(Exports_js_2.MessageType.Text, "speech.agent.context", this.privRequestSession.requestId, "application/json", agentContextJson));
    }
    fireEventForResult(serviceResult, properties) {
        const resultReason = Exports_js_4.EnumTranslation.implTranslateRecognitionResult(serviceResult.RecognitionStatus);
        const result = new Exports_js_3.SpeechRecognitionResult(this.privRequestSession.requestId, resultReason, serviceResult.DisplayText, serviceResult.Duration, serviceResult.Offset, serviceResult.Language, serviceResult.LanguageDetectionConfidence, undefined, undefined, serviceResult.asJson(), properties);
        const ev = new Exports_js_3.SpeechRecognitionEventArgs(result, serviceResult.Offset, this.privRequestSession.sessionId);
        return ev;
    }
    handleResponseMessage(responseMessage) {
        // "response" messages can contain either "message" (activity) or "MessageStatus" data. Fire the appropriate
        // event according to the message type that's specified.
        const responsePayload = JSON.parse(responseMessage.textBody);
        switch (responsePayload.messageType.toLowerCase()) {
            case "message":
                const responseRequestId = responseMessage.requestId.toUpperCase();
                const activityPayload = ActivityResponsePayload_js_1.ActivityPayloadResponse.fromJSON(responseMessage.textBody);
                const turn = this.privTurnStateManager.GetTurn(responseRequestId);
                // update the conversation Id
                if (activityPayload.conversationId) {
                    const updateAgentConfig = this.agentConfig.get();
                    updateAgentConfig.botInfo.conversationId = activityPayload.conversationId;
                    this.agentConfig.set(updateAgentConfig);
                }
                const pullAudioOutputStream = turn.processActivityPayload(activityPayload, AudioOutputFormat_js_1.AudioOutputFormatImpl.fromSpeechSynthesisOutputFormatString(this.privDialogServiceConnector.properties.getProperty(Exports_js_3.PropertyId.SpeechServiceConnection_SynthOutputFormat, undefined)));
                const activity = new Exports_js_3.ActivityReceivedEventArgs(activityPayload.messagePayload, pullAudioOutputStream);
                if (!!this.privDialogServiceConnector.activityReceived) {
                    try {
                        this.privDialogServiceConnector.activityReceived(this.privDialogServiceConnector, activity);
                        /* eslint-disable-next-line no-empty */
                    }
                    catch (error) {
                        // Not going to let errors in the event handler
                        // trip things up.
                    }
                }
                break;
            case "messagestatus":
                if (!!this.privDialogServiceConnector.turnStatusReceived) {
                    try {
                        this.privDialogServiceConnector.turnStatusReceived(this.privDialogServiceConnector, new Exports_js_3.TurnStatusReceivedEventArgs(responseMessage.textBody));
                        /* eslint-disable-next-line no-empty */
                    }
                    catch (error) {
                        // Not going to let errors in the event handler
                        // trip things up.
                    }
                }
                break;
            default:
                Exports_js_2.Events.instance.onEvent(new Exports_js_2.BackgroundEvent(`Unexpected response of type ${responsePayload.messageType}. Ignoring.`));
                break;
        }
    }
    onEvent(event) {
        this.privEvents.onEvent(event);
        Exports_js_2.Events.instance.onEvent(event);
    }
    addKeywordContextData() {
        const keywordPropertyValue = this.privRecognizerConfig.parameters.getProperty("SPEECH-KeywordsToDetect");
        if (keywordPropertyValue === undefined) {
            return;
        }
        const keywordOffsetPropertyValue = this.privRecognizerConfig.parameters.getProperty("SPEECH-KeywordsToDetect-Offsets");
        const keywordDurationPropertyValue = this.privRecognizerConfig.parameters.getProperty("SPEECH-KeywordsToDetect-Durations");
        const keywords = keywordPropertyValue.split(";");
        const keywordOffsets = keywordOffsetPropertyValue === undefined ? [] : keywordOffsetPropertyValue.split(";");
        const keywordDurations = keywordDurationPropertyValue === undefined ? [] : keywordDurationPropertyValue.split(";");
        const keywordDefinitionArray = [];
        for (let i = 0; i < keywords.length; i++) {
            const definition = {};
            definition.text = keywords[i];
            if (i < keywordOffsets.length) {
                definition.offset = Number(keywordOffsets[i]);
            }
            if (i < keywordDurations.length) {
                definition.duration = Number(keywordDurations[i]);
            }
            keywordDefinitionArray.push(definition);
        }
        this.speechContext.setSection("invocationSource", "VoiceActivationWithKeyword");
        this.speechContext.setSection("keywordDetection", [{
                clientDetectedKeywords: keywordDefinitionArray,
                onReject: { action: "EndOfTurn" },
                type: "startTrigger"
            }]);
    }
}
exports.DialogServiceAdapter = DialogServiceAdapter;
//# sourceMappingURL=DialogServiceAdapter.js.map
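
// ---------------------------------------------------------------------------
// Usage sketch: DialogServiceAdapter is internal; applications drive it through
// the public DialogServiceConnector API. A minimal sketch follows, assuming a
// Direct Line Speech-enabled bot and default-microphone input; the subscription
// key and region are placeholders, not values taken from this file.
// ---------------------------------------------------------------------------
const sdk = require("microsoft-cognitiveservices-speech-sdk");

const config = sdk.BotFrameworkConfig.fromSubscription("<subscription-key>", "<region>");
const audioConfig = sdk.AudioConfig.fromDefaultMicrophoneInput();
const connector = new sdk.DialogServiceConnector(config, audioConfig);

// Bot activities (routed by handleResponseMessage above) arrive here, along
// with a pull stream of synthesized audio when the bot replies with speech.
connector.activityReceived = (_sender, e) => {
    console.log("Activity:", JSON.stringify(e.activity));
};

// listenOnceAsync() ends up in listenOnce() above: it opens the connection,
// sends the pre-audio messages, and streams one turn of microphone audio.
connector.listenOnceAsync(
    (result) => console.log(`Recognized: ${result.text}`),
    (err) => console.error(err));

// sendActivityAsync() routes through sendMessage() above as an "agent" message;
// the payload is a JSON-serialized Bot Framework activity.
connector.sendActivityAsync(JSON.stringify({ type: "message", text: "hello" }));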