microsoft-cognitiveservices-speech-sdk
Version:
Microsoft Cognitive Services Speech SDK for JavaScript
1 lines • 16.1 kB
Source Map (JSON)
{"version":3,"sources":["src/common.speech/SynthesisTurn.ts"],"names":[],"mappings":"AAGA,OAAO,EAIH,iBAAiB,EACpB,MAAM,sBAAsB,CAAC;AAC9B,OAAO,EAAE,qBAAqB,EAAE,MAAM,mCAAmC,CAAC;AAE1E,OAAO,EACH,kBAAkB,EAGlB,qBAAqB,EACxB,MAAM,mBAAmB,CAAC;AAC3B,OAAO,EAAE,kBAAkB,EAAgB,MAAM,6CAA6C,CAAC;AAC/F,OAAO,EAEH,oBAAoB,EAGvB,MAAM,sBAAsB,CAAC;AAE9B,MAAM,WAAW,yBAAyB;IACtC,UAAU,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,uBAAuB;IACpC,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,kBAAkB;IAC/B,OAAO,EAAE,yBAAyB,CAAC;IACnC,KAAK,EAAE,uBAAuB,CAAC;IAC/B,MAAM,EAAE;QACJ,gBAAgB,EAAE,MAAM,CAAC;KAC5B,CAAC;CACL;AAED,qBAAa,aAAa;IAEtB,IAAW,SAAS,IAAI,MAAM,CAE7B;IAED,IAAW,QAAQ,IAAI,MAAM,CAE5B;IAED,IAAW,QAAQ,CAAC,KAAK,EAAE,MAAM,EAEhC;IAED,IAAW,iBAAiB,IAAI,qBAAqB,CAEpD;IAED,IAAW,iBAAiB,CAAC,MAAM,EAAE,qBAAqB,EAEzD;IAED,IAAW,qBAAqB,IAAI,OAAO,CAAC,IAAI,CAAC,CAEhD;IAED,IAAW,gBAAgB,IAAI,OAAO,CAErC;IAED,IAAW,cAAc,IAAI,OAAO,CAEnC;IAED,IAAW,iBAAiB,IAAI,MAAM,CAErC;IAED,IAAW,qBAAqB,IAAI,MAAM,CAEzC;IAGD,IAAW,aAAa,IAAI,MAAM,CAEjC;IAED,IAAW,aAAa,IAAI,MAAM,CAEjC;IAED,IAAW,eAAe,IAAI,kBAAkB,CAQ/C;IAED,OAAO,CAAC,cAAc,CAAkB;IACxC,OAAO,CAAC,oBAAoB,CAAS;IACrC,OAAO,CAAC,kBAAkB,CAAkB;IAC5C,OAAO,CAAC,oBAAoB,CAAkB;IAC9C,OAAO,CAAC,iBAAiB,CAAa;IACtC,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,gBAAgB,CAAiB;IACzC,OAAO,CAAC,UAAU,CAAkB;IACpC,OAAO,CAAC,qBAAqB,CAAwB;IACrD,OAAO,CAAC,qBAAqB,CAA4B;IACzD,OAAO,CAAC,iBAAiB,CAAc;IACvC,OAAO,CAAC,2BAA2B,CAAc;IACjD,OAAO,CAAC,cAAc,CAAa;IACnC,OAAO,CAAC,uBAAuB,CAAa;IAC5C,OAAO,CAAC,kBAAkB,CAAa;IACvC,OAAO,CAAC,2BAA2B,CAAa;IAChD,OAAO,CAAC,0BAA0B,CAAS;IAC3C,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,UAAU,CAAU;IAC5B,OAAO,CAAC,wBAAwB,CAAoB;IACpD,OAAO,CAAC,iBAAiB,CAAS;IAClC,OAAO,CAAC,aAAa,CAAS;;IAUjB,mBAAmB,IAAI,OAAO,CAAC,WAAW,CAAC;IAW3C,6BAA6B,IAAI,OAAO,CAAC,WAAW,CAAC;IAgB3D,iBAAiB,CAAC,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,CAAC,EAAE,iBAAiB,GAAG,IAAI;IAwBlH,oBAAoB,CAAC,gBAAgB,EAAE,MAAM,GAAG,IAAI;IAKpD,eAAe,CAAC,OAAO,EAAE,OAAO,GAAG,IAAI;IAMvC,8BAA8B,CAAC,UAAU,EAAE,MAAM,GAAG,IAAI;IAUxD,wBAAwB,CAAC,YAAY,EAAE,MAAM,GAAG,IAAI;IAKpD,wBAAwB,IAAI,IAAI;IAMhC,0BAA0B,CAAC,YAAY,EAAE,MAAM,GAAG,IAAI;IAgBtD,oBAAoB,CAAC,IAAI,EAAE,WAAW,GAAG,IAAI;IAU7C,mBAAmB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,IAAI;IAIvD,wBAAwB,CAAC,QAAQ,EAAE,kBAAkB,GAAG,IAAI;IAM5D,YAAY,CAAC,QAAQ,EAAE,kBAAkB,GAAG,IAAI;IAI1C,wBAAwB,IAAI,OAAO,CAAC,qBAAqB,CAAC;IAYhE,OAAO,IAAI,IAAI;IAOf,kBAAkB,IAAI,IAAI;IAIjC;;;OAGG;IACI,0BAA0B,IAAI,MAAM;IAM3C,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,oBAAoB,GAAG,IAAI;IAIpD;;;;OAIG;IACH,OAAO,CAAC,MAAM,CAAC,QAAQ;IAIvB,OAAO,CAAC,gBAAgB;IAwBxB,OAAO,CAAC,UAAU;YAaJ,sBAAsB;IAWpC;;;;OAIG;IACH,OAAO,CAAC,YAAY;CAGvB","file":"SynthesisTurn.d.ts","sourcesContent":["// Copyright (c) Microsoft Corporation. All rights reserved.\r\n// Licensed under the MIT license.\r\n\r\nimport {\r\n createNoDashGuid,\r\n Deferred,\r\n Events,\r\n IAudioDestination\r\n} from \"../common/Exports.js\";\r\nimport { AudioOutputFormatImpl } from \"../sdk/Audio/AudioOutputFormat.js\";\r\nimport { PullAudioOutputStreamImpl } from \"../sdk/Audio/AudioOutputStream.js\";\r\nimport {\r\n PropertyCollection,\r\n PropertyId,\r\n ResultReason,\r\n SpeechSynthesisResult,\r\n} from \"../sdk/Exports.js\";\r\nimport { ISynthesisMetadata, MetadataType } from \"./ServiceMessages/SynthesisAudioMetadata.js\";\r\nimport {\r\n ConnectingToSynthesisServiceEvent,\r\n SpeechSynthesisEvent,\r\n SynthesisStartedEvent,\r\n SynthesisTriggeredEvent,\r\n} from \"./SynthesisEvents.js\";\r\n\r\nexport interface ISynthesisResponseContext {\r\n serviceTag: string;\r\n}\r\n\r\nexport interface ISynthesisResponseAudio {\r\n type: string;\r\n streamId: string;\r\n}\r\n\r\nexport interface ISynthesisResponse {\r\n context: ISynthesisResponseContext;\r\n audio: ISynthesisResponseAudio;\r\n webrtc: {\r\n connectionString: string;\r\n };\r\n}\r\n\r\nexport class SynthesisTurn {\r\n\r\n public get requestId(): string {\r\n return this.privRequestId;\r\n }\r\n\r\n public get streamId(): string {\r\n return this.privStreamId;\r\n }\r\n\r\n public set streamId(value: string) {\r\n this.privStreamId = value;\r\n }\r\n\r\n public get audioOutputFormat(): AudioOutputFormatImpl {\r\n return this.privAudioOutputFormat;\r\n }\r\n\r\n public set audioOutputFormat(format: AudioOutputFormatImpl) {\r\n this.privAudioOutputFormat = format;\r\n }\r\n\r\n public get turnCompletionPromise(): Promise<void> {\r\n return this.privTurnDeferral.promise;\r\n }\r\n\r\n public get isSynthesisEnded(): boolean {\r\n return this.privIsSynthesisEnded;\r\n }\r\n\r\n public get isSynthesizing(): boolean {\r\n return this.privIsSynthesizing;\r\n }\r\n\r\n public get currentTextOffset(): number {\r\n return this.privTextOffset;\r\n }\r\n\r\n public get currentSentenceOffset(): number {\r\n return this.privSentenceOffset;\r\n }\r\n\r\n // The number of bytes received for current turn\r\n public get bytesReceived(): number {\r\n return this.privBytesReceived;\r\n }\r\n\r\n public get audioDuration(): number {\r\n return this.privAudioDuration;\r\n }\r\n\r\n public get extraProperties(): PropertyCollection {\r\n if (!!this.privWebRTCSDP) {\r\n const properties = new PropertyCollection();\r\n properties.setProperty(PropertyId.TalkingAvatarService_WebRTC_SDP, this.privWebRTCSDP);\r\n return properties;\r\n }\r\n\r\n return undefined;\r\n }\r\n\r\n private privIsDisposed: boolean = false;\r\n private privAuthFetchEventId: string;\r\n private privIsSynthesizing: boolean = false;\r\n private privIsSynthesisEnded: boolean = false;\r\n private privBytesReceived: number = 0;\r\n private privRequestId: string;\r\n private privStreamId: string;\r\n private privTurnDeferral: Deferred<void>;\r\n private privInTurn: boolean = false;\r\n private privAudioOutputFormat: AudioOutputFormatImpl;\r\n private privAudioOutputStream: PullAudioOutputStreamImpl;\r\n private privReceivedAudio: ArrayBuffer;\r\n private privReceivedAudioWithHeader: ArrayBuffer;\r\n private privTextOffset: number = 0;\r\n private privNextSearchTextIndex: number = 0;\r\n private privSentenceOffset: number = 0;\r\n private privNextSearchSentenceIndex: number = 0;\r\n private privPartialVisemeAnimation: string;\r\n private privRawText: string;\r\n private privIsSSML: boolean;\r\n private privTurnAudioDestination: IAudioDestination;\r\n private privAudioDuration: number;\r\n private privWebRTCSDP: string;\r\n\r\n public constructor() {\r\n this.privRequestId = createNoDashGuid();\r\n this.privTurnDeferral = new Deferred<void>();\r\n\r\n // We're not in a turn, so resolve.\r\n this.privTurnDeferral.resolve();\r\n }\r\n\r\n public async getAllReceivedAudio(): Promise<ArrayBuffer> {\r\n if (!!this.privReceivedAudio) {\r\n return Promise.resolve(this.privReceivedAudio);\r\n }\r\n if (!this.privIsSynthesisEnded) {\r\n return null;\r\n }\r\n await this.readAllAudioFromStream();\r\n return Promise.resolve(this.privReceivedAudio);\r\n }\r\n\r\n public async getAllReceivedAudioWithHeader(): Promise<ArrayBuffer> {\r\n if (!!this.privReceivedAudioWithHeader) {\r\n return this.privReceivedAudioWithHeader;\r\n }\r\n if (!this.privIsSynthesisEnded) {\r\n return null;\r\n }\r\n if (this.audioOutputFormat.hasHeader) {\r\n const audio: ArrayBuffer = await this.getAllReceivedAudio();\r\n this.privReceivedAudioWithHeader = this.audioOutputFormat.addHeader(audio);\r\n return this.privReceivedAudioWithHeader;\r\n } else {\r\n return this.getAllReceivedAudio();\r\n }\r\n }\r\n\r\n public startNewSynthesis(requestId: string, rawText: string, isSSML: boolean, audioDestination?: IAudioDestination): void {\r\n this.privIsSynthesisEnded = false;\r\n this.privIsSynthesizing = true;\r\n this.privRequestId = requestId;\r\n this.privRawText = rawText;\r\n this.privIsSSML = isSSML;\r\n this.privAudioOutputStream = new PullAudioOutputStreamImpl();\r\n this.privAudioOutputStream.format = this.privAudioOutputFormat;\r\n this.privReceivedAudio = null;\r\n this.privReceivedAudioWithHeader = null;\r\n this.privBytesReceived = 0;\r\n this.privTextOffset = 0;\r\n this.privNextSearchTextIndex = 0;\r\n this.privSentenceOffset = 0;\r\n this.privNextSearchSentenceIndex = 0;\r\n this.privPartialVisemeAnimation = \"\";\r\n this.privWebRTCSDP = \"\";\r\n if (audioDestination !== undefined) {\r\n this.privTurnAudioDestination = audioDestination;\r\n this.privTurnAudioDestination.format = this.privAudioOutputFormat;\r\n }\r\n this.onEvent(new SynthesisTriggeredEvent(this.requestId, undefined, audioDestination === undefined ? undefined : audioDestination.id()));\r\n }\r\n\r\n public onPreConnectionStart(authFetchEventId: string): void {\r\n this.privAuthFetchEventId = authFetchEventId;\r\n this.onEvent(new ConnectingToSynthesisServiceEvent(this.privRequestId, this.privAuthFetchEventId));\r\n }\r\n\r\n public onAuthCompleted(isError: boolean): void {\r\n if (isError) {\r\n this.onComplete();\r\n }\r\n }\r\n\r\n public onConnectionEstablishCompleted(statusCode: number): void {\r\n if (statusCode === 200) {\r\n this.onEvent(new SynthesisStartedEvent(this.requestId, this.privAuthFetchEventId));\r\n this.privBytesReceived = 0;\r\n return;\r\n } else if (statusCode === 403) {\r\n this.onComplete();\r\n }\r\n }\r\n\r\n public onServiceResponseMessage(responseJson: string): void {\r\n const response: ISynthesisResponse = JSON.parse(responseJson) as ISynthesisResponse;\r\n this.streamId = response.audio.streamId;\r\n }\r\n\r\n public onServiceTurnEndResponse(): void {\r\n this.privInTurn = false;\r\n this.privTurnDeferral.resolve();\r\n this.onComplete();\r\n }\r\n\r\n public onServiceTurnStartResponse(responseJson: string): void {\r\n if (!!this.privTurnDeferral && !!this.privInTurn) {\r\n // What? How are we starting a turn with another not done?\r\n this.privTurnDeferral.reject(\"Another turn started before current completed.\");\r\n // Avoid UnhandledPromiseRejection if privTurnDeferral is not being awaited\r\n // eslint-disable-next-line @typescript-eslint/no-empty-function\r\n this.privTurnDeferral.promise.then().catch((): void => { });\r\n }\r\n this.privInTurn = true;\r\n this.privTurnDeferral = new Deferred<void>();\r\n const response: ISynthesisResponse = JSON.parse(responseJson) as ISynthesisResponse;\r\n if (!!response.webrtc) {\r\n this.privWebRTCSDP = response.webrtc.connectionString;\r\n }\r\n }\r\n\r\n public onAudioChunkReceived(data: ArrayBuffer): void {\r\n if (this.isSynthesizing) {\r\n this.privAudioOutputStream.write(data);\r\n this.privBytesReceived += data.byteLength;\r\n if (this.privTurnAudioDestination !== undefined) {\r\n this.privTurnAudioDestination.write(data);\r\n }\r\n }\r\n }\r\n\r\n public onTextBoundaryEvent(metadata: ISynthesisMetadata): void {\r\n this.updateTextOffset(metadata.Data.text.Text, metadata.Type);\r\n }\r\n\r\n public onVisemeMetadataReceived(metadata: ISynthesisMetadata): void {\r\n if (metadata.Data.AnimationChunk !== undefined) {\r\n this.privPartialVisemeAnimation += metadata.Data.AnimationChunk;\r\n }\r\n }\r\n\r\n public onSessionEnd(metadata: ISynthesisMetadata): void {\r\n this.privAudioDuration = metadata.Data.Offset;\r\n }\r\n\r\n public async constructSynthesisResult(): Promise<SpeechSynthesisResult> {\r\n const audioBuffer: ArrayBuffer = await this.getAllReceivedAudioWithHeader();\r\n return new SpeechSynthesisResult(\r\n this.requestId,\r\n ResultReason.SynthesizingAudioCompleted,\r\n audioBuffer,\r\n undefined,\r\n this.extraProperties,\r\n this.audioDuration\r\n );\r\n }\r\n\r\n public dispose(): void {\r\n if (!this.privIsDisposed) {\r\n // we should have completed by now. If we did not its an unknown error.\r\n this.privIsDisposed = true;\r\n }\r\n }\r\n\r\n public onStopSynthesizing(): void {\r\n this.onComplete();\r\n }\r\n\r\n /**\r\n * Gets the viseme animation string (merged from animation chunk), and clears the internal\r\n * partial animation.\r\n */\r\n public getAndClearVisemeAnimation(): string {\r\n const animation: string = this.privPartialVisemeAnimation;\r\n this.privPartialVisemeAnimation = \"\";\r\n return animation;\r\n }\r\n\r\n protected onEvent(event: SpeechSynthesisEvent): void {\r\n Events.instance.onEvent(event);\r\n }\r\n\r\n /**\r\n * Check if the text is an XML(SSML) tag\r\n * @param text\r\n * @private\r\n */\r\n private static isXmlTag(text: string): boolean {\r\n return text.length >= 2 && text[0] === \"<\" && text[text.length - 1] === \">\";\r\n }\r\n\r\n private updateTextOffset(text: string, type: MetadataType): void {\r\n if (type === MetadataType.WordBoundary) {\r\n this.privTextOffset = this.privRawText.indexOf(text, this.privNextSearchTextIndex);\r\n if (this.privTextOffset >= 0) {\r\n this.privNextSearchTextIndex = this.privTextOffset + text.length;\r\n if (this.privIsSSML) {\r\n if (this.withinXmlTag(this.privTextOffset) && !SynthesisTurn.isXmlTag(text)) {\r\n this.updateTextOffset(text, type);\r\n }\r\n }\r\n }\r\n } else {\r\n this.privSentenceOffset = this.privRawText.indexOf(text, this.privNextSearchSentenceIndex);\r\n if (this.privSentenceOffset >= 0) {\r\n this.privNextSearchSentenceIndex = this.privSentenceOffset + text.length;\r\n if (this.privIsSSML) {\r\n if (this.withinXmlTag(this.privSentenceOffset) && !SynthesisTurn.isXmlTag(text)) {\r\n this.updateTextOffset(text, type);\r\n }\r\n }\r\n }\r\n }\r\n }\r\n\r\n private onComplete(): void {\r\n if (this.privIsSynthesizing) {\r\n this.privIsSynthesizing = false;\r\n this.privIsSynthesisEnded = true;\r\n this.privAudioOutputStream.close();\r\n this.privInTurn = false;\r\n if (this.privTurnAudioDestination !== undefined) {\r\n this.privTurnAudioDestination.close();\r\n this.privTurnAudioDestination = undefined;\r\n }\r\n }\r\n }\r\n\r\n private async readAllAudioFromStream(): Promise<void> {\r\n if (this.privIsSynthesisEnded) {\r\n this.privReceivedAudio = new ArrayBuffer(this.bytesReceived);\r\n try {\r\n await this.privAudioOutputStream.read(this.privReceivedAudio);\r\n } catch (e) {\r\n this.privReceivedAudio = new ArrayBuffer(0);\r\n }\r\n }\r\n }\r\n\r\n /**\r\n * Check if current idx is in XML(SSML) tag\r\n * @param idx\r\n * @private\r\n */\r\n private withinXmlTag(idx: number): boolean {\r\n return this.privRawText.indexOf(\"<\", idx + 1) > this.privRawText.indexOf(\">\", idx + 1);\r\n }\r\n}\r\n"]}