UNPKG

web-speech-cognitive-services

Version:

Polyfill Web Speech API with Cognitive Services Speech-to-Text service

github.com/compulim/web-speech-cognitive-services

compulim/web-speech-cognitive-services

1,381 lines (1,333 loc) • 53.1 kB

JavaScript

"use strict"; var __create = Object.create; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __getProtoOf = Object.getPrototypeOf; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // If the importer is in node compatibility mode or this is not an ESM // file that has been converted to a CommonJS file using a Babel- // compatible transform (i.e. "__esModule" has not been set), then set // "default" to the CommonJS "module.exports" for node compatibility. isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target, mod )); var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.js var src_exports = {}; __export(src_exports, { createSpeechRecognitionPonyfill: () => createSpeechRecognitionPonyfill, createSpeechRecognitionPonyfillFromRecognizer: () => createSpeechRecognitionPonyfillFromRecognizer, createSpeechServicesPonyfill: () => SpeechServices_default, createSpeechSynthesisPonyfill: () => TextToSpeech_default, fetchAuthorizationToken: () => fetchAuthorizationToken }); module.exports = __toCommonJS(src_exports); // src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfill.ts var import_microsoft_cognitiveservices_speech_sdk2 = require("microsoft-cognitiveservices-speech-sdk"); // src/SpeechServices/patchOptions.ts var import_valibot8 = require("valibot"); // src/SpeechServices/resolveFunctionOrReturnValue.ts function isFunction(value) { return typeof value === "function"; } function resolveFunctionOrReturnValue(fnOrValue) { return isFunction(fnOrValue) ? fnOrValue() : fnOrValue; } // src/SpeechServices/SpeechSDK.ts var import_microsoft_cognitiveservices_speech_sdk = require("microsoft-cognitiveservices-speech-sdk/distrib/lib/microsoft.cognitiveservices.speech.sdk.js"); var SpeechSDK = { AudioConfig: import_microsoft_cognitiveservices_speech_sdk.AudioConfig, OutputFormat: import_microsoft_cognitiveservices_speech_sdk.OutputFormat, ResultReason: import_microsoft_cognitiveservices_speech_sdk.ResultReason, SpeechConfig: import_microsoft_cognitiveservices_speech_sdk.SpeechConfig, SpeechRecognizer: import_microsoft_cognitiveservices_speech_sdk.SpeechRecognizer }; var SpeechSDK_default = SpeechSDK; // src/SpeechServices/SpeechToText/validation/credentialsSchema.ts var import_valibot = require("valibot"); var credentialsSchema = (0, import_valibot.pipe)( (0, import_valibot.intersect)([ (0, import_valibot.union)( [ (0, import_valibot.object)({ authorizationToken: (0, import_valibot.string)(), subscriptionKey: (0, import_valibot.optional)((0, import_valibot.undefined_)('"subscriptionKey" must be unset when "authorizationToken" is set.')) }), (0, import_valibot.object)({ authorizationToken: (0, import_valibot.optional)((0, import_valibot.undefined_)('"authorizationToken" must be unset when "subscriptionKey" is set.')), subscriptionKey: (0, import_valibot.string)() }) ], 'The object must either have either "authorizationToken" or "subscriptionKey" set, but not both.' ), (0, import_valibot.union)( [ (0, import_valibot.object)({ customVoiceHostname: (0, import_valibot.optional)((0, import_valibot.undefined_)('"customVoiceHostname" must be unest when "region" is set.')), region: (0, import_valibot.string)(), speechRecognitionHostname: (0, import_valibot.optional)( (0, import_valibot.undefined_)('"speechRecognitionHostname" must be unest when "region" is set.') ), speechSynthesisHostname: (0, import_valibot.optional)((0, import_valibot.undefined_)('"speechSynthesisHostname" must be unest when "region" is set.')) }), (0, import_valibot.object)({ customVoiceHostname: (0, import_valibot.optional)((0, import_valibot.union)([(0, import_valibot.string)(), (0, import_valibot.undefined_)()])), region: (0, import_valibot.optional)((0, import_valibot.undefined_)('"region" must be unset when "*Hostname" is set.')), speechRecognitionHostname: (0, import_valibot.string)(), speechSynthesisHostname: (0, import_valibot.string)() }) ], 'The object must either have either "region" or "*Hostname" set, but not both.' ) ]), (0, import_valibot.readonly)() ); var credentialsSchema_default = credentialsSchema; // src/SpeechServices/SpeechToText/validation/enableTelemetrySchema.ts var import_valibot2 = require("valibot"); var enableTelemetrySchema = (0, import_valibot2.optional)((0, import_valibot2.boolean)()); var enableTelemetrySchema_default = enableTelemetrySchema; // src/SpeechServices/SpeechToText/validation/initialSilenceTimeoutSchema.ts var import_valibot3 = require("valibot"); var initialSilenceTimeoutSchema = (0, import_valibot3.optional)((0, import_valibot3.pipe)((0, import_valibot3.number)(), (0, import_valibot3.minValue)(1), (0, import_valibot3.maxValue)(6e4))); var initialSilenceTimeoutSchema_default = initialSilenceTimeoutSchema; // src/SpeechServices/SpeechToText/validation/looseEventsSchema.ts var import_valibot4 = require("valibot"); var looseEventsSchema = (0, import_valibot4.optional)((0, import_valibot4.boolean)(), false); var looseEventsSchema_default = looseEventsSchema; // src/SpeechServices/SpeechToText/validation/referenceGrammarsSchema.ts var import_valibot5 = require("valibot"); var referenceGrammarsSchema = (0, import_valibot5.pipe)( (0, import_valibot5.optional)((0, import_valibot5.array)((0, import_valibot5.string)()), []), // any(), // array(string()), // transform<string[], readonly string[]>(value => (Object.isFrozen(value) ? value : Object.freeze([...value]))) (0, import_valibot5.transform)((value) => Object.isFrozen(value) ? value : Object.freeze([...value])) ); var referenceGrammarsSchema_default = referenceGrammarsSchema; // src/SpeechServices/SpeechToText/validation/speechRecognitionEndpointIdSchema.ts var import_valibot6 = require("valibot"); var speechRecognitionEndpointIdSchema = (0, import_valibot6.optional)((0, import_valibot6.string)()); var speechRecognitionEndpointIdSchema_default = speechRecognitionEndpointIdSchema; // src/SpeechServices/SpeechToText/validation/textNormalizationSchema.ts var import_valibot7 = require("valibot"); var textNormalizationSchema = (0, import_valibot7.optional)( (0, import_valibot7.enum_)({ display: "display", itn: "itn", lexical: "lexical", maskeditn: "maskeditn" }), "display" ); var textNormalizationSchema_default = textNormalizationSchema; // src/SpeechServices/patchOptions.ts var { AudioConfig: AudioConfig2 } = SpeechSDK_default; var shouldWarnOnSubscriptionKey = true; function patchOptions(init) { const { audioConfig, credentials, enableTelemetry, initialSilenceTimeout, looseEvent, referenceGrammars, speechRecognitionEndpointId, textNormalization } = init; let { looseEvents } = init; if (typeof looseEvent !== "undefined") { console.warn('web-speech-cognitive-services: The option "looseEvent" should be named as "looseEvents".'); looseEvents = looseEvent; } return Object.freeze({ audioConfig: audioConfig || AudioConfig2.fromDefaultMicrophoneInput(), // We set telemetry to true to honor the default telemetry settings of Speech SDK // https://github.com/Microsoft/cognitive-services-speech-sdk-js#data--telemetry enableTelemetry: (0, import_valibot8.parse)(enableTelemetrySchema_default, enableTelemetry), fetchCredentials: async () => { const parsedCredentials = (0, import_valibot8.parse)(credentialsSchema_default, await resolveFunctionOrReturnValue(credentials)); if (shouldWarnOnSubscriptionKey && parsedCredentials.subscriptionKey) { console.warn( "web-speech-cognitive-services: In production environment, subscription key should not be used, authorization token should be used instead." ); shouldWarnOnSubscriptionKey = false; } return parsedCredentials; }, initialSilenceTimeout: (0, import_valibot8.parse)(initialSilenceTimeoutSchema_default, initialSilenceTimeout), looseEvents: (0, import_valibot8.parse)(looseEventsSchema_default, looseEvents), referenceGrammars: (0, import_valibot8.parse)(referenceGrammarsSchema_default, referenceGrammars), speechRecognitionEndpointId: (0, import_valibot8.parse)(speechRecognitionEndpointIdSchema_default, speechRecognitionEndpointId), textNormalization: (0, import_valibot8.parse)(textNormalizationSchema_default, textNormalization) }); } // src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfillFromRecognizer.ts var import_valibot9 = require("valibot"); // ../../node_modules/p-defer/index.js function pDefer() { const deferred = {}; deferred.promise = new Promise((resolve, reject) => { deferred.resolve = resolve; deferred.reject = reject; }); return deferred; } // src/Util/createPromiseQueue.js function createPromiseQueue() { let shiftDeferred; const queue = []; const push = (value) => { if (shiftDeferred) { const { resolve } = shiftDeferred; shiftDeferred = null; resolve(value); } else { queue.push(value); } }; const shift = () => { if (queue.length) { return Promise.resolve(queue.shift()); } return (shiftDeferred || (shiftDeferred = pDefer())).promise; }; return { push, shift }; } // src/SpeechServices/SpeechToText/SpeechRecognitionAlternative.ts var SpeechRecognitionAlternative = class { constructor({ confidence, transcript }) { this.#confidence = confidence; this.#transcript = transcript; } #confidence; #transcript; get confidence() { return this.#confidence; } get transcript() { return this.#transcript; } }; // src/SpeechServices/SpeechToText/private/FakeArray.ts var FakeArray = class { constructor(array2) { if (!array2) { throw new Error("array must be set."); } this.#array = array2; for (const key in array2) { Object.defineProperty(this, key, { enumerable: true, get() { return array2[key]; } }); } } #array; [Symbol.iterator]() { return this.#array[Symbol.iterator](); } get length() { return this.#array.length; } }; // src/SpeechServices/SpeechToText/SpeechRecognitionResult.ts var SpeechRecognitionResult = class extends FakeArray { constructor(init) { super(init.results); this.#isFinal = init.isFinal; } #isFinal; get isFinal() { return this.#isFinal; } }; // src/SpeechServices/SpeechToText/cognitiveServiceEventResultToWebSpeechRecognitionResult.ts var { ResultReason: { RecognizingSpeech, RecognizedSpeech } } = SpeechSDK_default; function cognitiveServiceEventResultToWebSpeechRecognitionResult(result, init) { const { maxAlternatives = Infinity, textNormalization = "display" } = init || {}; const json = typeof result.json === "string" ? JSON.parse(result.json) : result.json; if (result.reason === RecognizingSpeech || result.reason === RecognizedSpeech && !json.NBest) { return new SpeechRecognitionResult({ isFinal: result.reason === RecognizedSpeech, results: [ new SpeechRecognitionAlternative({ confidence: 0.5, transcript: result.text }) ] }); } else if (result.reason === RecognizedSpeech) { return new SpeechRecognitionResult({ isFinal: true, results: (json.NBest || []).slice(0, maxAlternatives).map( ({ Confidence: confidence, Display: display, ITN: itn, Lexical: lexical, MaskedITN: maskedITN }) => new SpeechRecognitionAlternative({ confidence, transcript: textNormalization === "itn" ? itn : textNormalization === "lexical" ? lexical : textNormalization === "maskeditn" ? maskedITN : display }) ) }); } return new SpeechRecognitionResult({ isFinal: false, results: [] }); } // src/SpeechServices/SpeechToText/cognitiveServicesAsyncToPromise.ts function cognitiveServicesAsyncToPromise(fn, context = void 0) { return (...args) => ( // eslint-disable-next-line prefer-spread new Promise((resolve, reject) => fn.apply(context, [...args, resolve, reject])) ); } // src/SpeechServices/SpeechToText/private/EventListenerMap.ts var EventListenerMap = class { constructor(eventTarget) { this.#eventTarget = eventTarget; this.#propertyMap = {}; } #eventTarget; #propertyMap; getProperty(name) { return this.#propertyMap[name]; } setProperty(name, value) { const existing = this.#propertyMap[name]; existing && this.#eventTarget.removeEventListener(name, existing); if (value) { this.#eventTarget.addEventListener(name, value); } this.#propertyMap[name] = value; } }; // src/SpeechServices/SpeechToText/private/prepareAudioConfig.ts var import_AudioSourceEvents = require("microsoft-cognitiveservices-speech-sdk/distrib/lib/src/common/AudioSourceEvents.js"); // src/SpeechServices/SpeechToText/private/averageAmplitude.ts function averageAmplitude(arrayBuffer) { const array2 = Array.from(new Int16Array(arrayBuffer)); return array2.reduce((averageAmplitude2, amplitude) => averageAmplitude2 + Math.abs(amplitude), 0) / array2.length; } // src/SpeechServices/SpeechToText/private/prepareAudioConfig.ts function prepareAudioConfig(audioConfig) { const audioConfigImpl = audioConfig; const originalAttach = audioConfigImpl.attach; const boundOriginalAttach = audioConfigImpl.attach.bind(audioConfigImpl); let firstChunk = false; let muted = false; audioConfigImpl.attach = async () => { const reader = await boundOriginalAttach(""); return { ...reader, read: async () => { const chunk = await reader.read(); if (!firstChunk && averageAmplitude(chunk.buffer) > 150) { audioConfigImpl.events.onEvent(new import_AudioSourceEvents.AudioSourceEvent("FirstAudibleChunk", "")); firstChunk = true; } if (muted) { return { buffer: new ArrayBuffer(0), isEnd: true, timeReceived: Date.now() }; } return chunk; } }; }; return { audioConfig, pause: () => { muted = true; }, unprepare: () => { audioConfigImpl.attach = originalAttach; } }; } // src/SpeechServices/SpeechToText/private/serializeRecognitionResult.ts function serializeRecognitionResult({ duration, errorDetails, json, offset, properties, reason, resultId, text }) { return Object.freeze({ duration, errorDetails, json: json && JSON.parse(json), offset, properties, reason, resultId, text }); } // src/SpeechServices/SpeechToText/SpeechGrammarList.ts var SpeechGrammarList = class { constructor() { this.#phrases = []; } addFromString() { throw new Error("JSGF is not supported"); } addFromURI() { throw new Error("JSGF is not supported"); } item() { throw new Error("JSGF is not supported"); } get length() { throw new Error("JSGF is not supported"); } #phrases; get phrases() { return this.#phrases; } set phrases(value) { if (Array.isArray(value)) { this.#phrases = Object.freeze([...value]); } else if (typeof value === "string") { this.#phrases = Object.freeze([value]); } else { throw new Error(`The provided value is not an array or of type 'string'`); } } }; // src/SpeechServices/SpeechToText/SpeechRecognitionErrorEvent.ts var SpeechRecognitionErrorEvent = class extends Event { constructor(type, { error, message }) { super(type); this.#error = error; this.#message = message; } #error; #message; get error() { return this.#error; } get message() { return this.#message; } get type() { return "error"; } }; // src/SpeechServices/SpeechToText/SpeechRecognitionResultList.ts var SpeechRecognitionResultList = class extends FakeArray { constructor(result) { super(result); } }; // src/SpeechServices/SpeechToText/SpeechRecognitionEvent.ts var SpeechRecognitionEvent = class extends Event { constructor(type, { data, resultIndex, results } = {}) { super(type); this.#data = data; this.#resultIndex = resultIndex; this.#results = results || new SpeechRecognitionResultList([]); } #data; // TODO: "resultIndex" should be set. #resultIndex; #results; get data() { return this.#data; } get resultIndex() { return this.#resultIndex; } get results() { return this.#results; } get type() { return super.type; } }; // src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfillFromRecognizer.ts var { ResultReason: ResultReason2, SpeechRecognizer: SpeechRecognizer2 } = SpeechSDK_default; var enableTelemetrySchema2 = (0, import_valibot9.union)([(0, import_valibot9.boolean)(), (0, import_valibot9.undefined_)()]); function createSpeechRecognitionPonyfillFromRecognizer({ createRecognizer, enableTelemetry, looseEvents, referenceGrammars, textNormalization }) { createRecognizer = (0, import_valibot9.parse)((0, import_valibot9.function_)(), createRecognizer); enableTelemetry = (0, import_valibot9.parse)(enableTelemetrySchema2, enableTelemetry); looseEvents = (0, import_valibot9.parse)((0, import_valibot9.boolean)(), looseEvents); referenceGrammars = (0, import_valibot9.parse)(referenceGrammarsSchema_default, referenceGrammars); textNormalization = (0, import_valibot9.parse)(textNormalizationSchema_default, textNormalization); typeof enableTelemetry !== "undefined" && SpeechRecognizer2.enableTelemetry(enableTelemetry); class SpeechRecognition extends EventTarget { #continuous = false; #eventListenerMap = new EventListenerMap(this); #grammars = new SpeechGrammarList(); #interimResults = false; #lang = typeof window !== "undefined" ? window.document.documentElement.getAttribute("lang") || window.navigator.language : "en-US"; // eslint-disable-next-line no-magic-numbers #maxAlternatives = 1; emitCognitiveServices(type, event) { this.dispatchEvent( new SpeechRecognitionEvent("cognitiveservices", { data: { ...event, type } }) ); } get continuous() { return this.#continuous; } set continuous(value) { this.#continuous = value; } get grammars() { return this.#grammars; } set grammars(value) { if (value instanceof SpeechGrammarList) { this.#grammars = value; } else { throw new Error(`The provided value is not of type 'SpeechGrammarList'`); } } get interimResults() { return this.#interimResults; } set interimResults(value) { this.#interimResults = value; } get maxAlternatives() { return this.#maxAlternatives; } set maxAlternatives(value) { this.#maxAlternatives = value; } get lang() { return this.#lang; } set lang(value) { this.#lang = value; } get onaudioend() { return this.#eventListenerMap.getProperty("audioend"); } set onaudioend(value) { this.#eventListenerMap.setProperty("audioend", value); } /** @type { ((event: SpeechRecognitionEvent<'audiostart'>) => void) | undefined } */ get onaudiostart() { return this.#eventListenerMap.getProperty("audiostart"); } set onaudiostart(value) { this.#eventListenerMap.setProperty("audiostart", value); } /** @type { ((event: SpeechRecognitionEvent<'cognitiveservices'>) => void) | undefined } */ get oncognitiveservices() { return this.#eventListenerMap.getProperty("cognitiveservices"); } set oncognitiveservices(value) { this.#eventListenerMap.setProperty("cognitiveservices", value); } /** @type { ((event: SpeechRecognitionEvent<'end'>) => void) | undefined } */ get onend() { return this.#eventListenerMap.getProperty("end"); } set onend(value) { this.#eventListenerMap.setProperty("end", value); } /** @type { ((event: SpeechRecognitionErrorEvent) => void) | undefined } */ get onerror() { return this.#eventListenerMap.getProperty("error"); } set onerror(value) { this.#eventListenerMap.setProperty("error", value); } /** @type { ((event: SpeechRecognitionEvent<'result'>) => void) | undefined } */ get onresult() { return this.#eventListenerMap.getProperty("result"); } set onresult(value) { this.#eventListenerMap.setProperty("result", value); } /** @type { ((event: SpeechRecognitionEvent<'soundend'>) => void) | undefined } */ get onsoundend() { return this.#eventListenerMap.getProperty("soundend"); } set onsoundend(value) { this.#eventListenerMap.setProperty("soundend", value); } /** @type { ((event: SpeechRecognitionEvent<'soundstart'>) => void) | undefined } */ get onsoundstart() { return this.#eventListenerMap.getProperty("soundstart"); } set onsoundstart(value) { this.#eventListenerMap.setProperty("soundstart", value); } /** @type { ((event: SpeechRecognitionEvent<'speechend'>) => void) | undefined } */ get onspeechend() { return this.#eventListenerMap.getProperty("speechend"); } set onspeechend(value) { this.#eventListenerMap.setProperty("speechend", value); } /** @type { ((event: SpeechRecognitionEvent<'speechstart'>) => void) | undefined } */ get onspeechstart() { return this.#eventListenerMap.getProperty("speechstart"); } set onspeechstart(value) { this.#eventListenerMap.setProperty("speechstart", value); } /** @type { ((event: SpeechRecognitionEvent<'start'>) => void) | undefined } */ get onstart() { return this.#eventListenerMap.getProperty("start"); } set onstart(value) { this.#eventListenerMap.setProperty("start", value); } abort; stop; start() { this._startOnce().catch((err) => { this.dispatchEvent( new SpeechRecognitionErrorEvent("error", { error: err, message: err && (err.stack || err.message) }) ); }); } async _startOnce() { const recognizer = await createRecognizer(this.lang); const { pause, unprepare } = prepareAudioConfig(recognizer["audioConfig"]); try { const queue = createPromiseQueue(); let soundStarted; let speechStarted; let stopping; const { detach: detachAudioConfigEvent } = recognizer["audioConfig"].events.attach( (event) => { const { name } = event; if (name === "AudioSourceReadyEvent") { queue.push({ audioSourceReady: {} }); } else if (name === "AudioSourceOffEvent") { queue.push({ audioSourceOff: {} }); } else if (name === "FirstAudibleChunk") { queue.push({ firstAudibleChunk: {} }); } } ); recognizer.canceled = (_, { errorDetails, offset, reason, sessionId }) => { queue.push({ canceled: { errorDetails, offset, reason, sessionId } }); }; recognizer.recognized = (_, { offset, result, sessionId }) => { queue.push({ recognized: { offset, result: serializeRecognitionResult(result), sessionId } }); }; recognizer.recognizing = (_, { offset, result, sessionId }) => { queue.push({ recognizing: { offset, result: serializeRecognitionResult(result), sessionId } }); }; recognizer.sessionStarted = (_, { sessionId }) => { queue.push({ sessionStarted: { sessionId } }); }; recognizer.sessionStopped = (_, { sessionId }) => { queue.push({ sessionStopped: { sessionId } }); }; recognizer.speechStartDetected = (_, { offset, sessionId }) => { queue.push({ speechStartDetected: { offset, sessionId } }); }; recognizer.speechEndDetected = (_, { sessionId }) => { queue.push({ speechEndDetected: { sessionId } }); }; const { phrases } = this.grammars; const { dynamicGrammar } = recognizer["privReco"]; referenceGrammars && referenceGrammars.length && dynamicGrammar.addReferenceGrammar([...referenceGrammars]); phrases && phrases.length && dynamicGrammar.addPhrase([...phrases]); await cognitiveServicesAsyncToPromise(recognizer.startContinuousRecognitionAsync, recognizer)(); if (typeof recognizer.stopContinuousRecognitionAsync === "function") { this.abort = () => queue.push({ abort: {} }); this.stop = () => queue.push({ stop: {} }); } else { this.abort = this.stop = void 0; } let audioStarted; let finalEvent = void 0; let finalizedResults = []; for (let loop = 0; !stopping || audioStarted; loop++) { const event = await queue.shift(); const { abort, audioSourceOff, audioSourceReady, canceled, firstAudibleChunk, recognized, recognizing, stop } = event; Object.keys(event).forEach((name) => this.emitCognitiveServices(name, event[name])); const errorMessage = canceled && canceled.errorDetails; if (/Permission\sdenied/u.test(errorMessage || "")) { finalEvent = new SpeechRecognitionErrorEvent("error", { error: "not-allowed" }); break; } if (!loop) { this.dispatchEvent(new SpeechRecognitionEvent("start")); } if (errorMessage) { if (/1006/u.test(errorMessage)) { if (!audioStarted) { this.dispatchEvent(new SpeechRecognitionEvent("audiostart")); this.dispatchEvent(new SpeechRecognitionEvent("audioend")); } finalEvent = new SpeechRecognitionErrorEvent("error", { error: "network" }); } else { finalEvent = new SpeechRecognitionErrorEvent("error", { error: "unknown" }); } break; } else if (abort || stop) { if (abort) { finalEvent = new SpeechRecognitionErrorEvent("error", { error: "aborted" }); stopping = "abort"; } else { pause(); stopping = "stop"; } if (abort && recognizer.stopContinuousRecognitionAsync) { await cognitiveServicesAsyncToPromise(recognizer.stopContinuousRecognitionAsync, recognizer)(); } } else if (audioSourceReady) { this.dispatchEvent(new SpeechRecognitionEvent("audiostart")); audioStarted = true; } else if (firstAudibleChunk) { this.dispatchEvent(new SpeechRecognitionEvent("soundstart")); soundStarted = true; } else if (audioSourceOff) { speechStarted && this.dispatchEvent(new SpeechRecognitionEvent("speechend")); soundStarted && this.dispatchEvent(new SpeechRecognitionEvent("soundend")); audioStarted && this.dispatchEvent(new SpeechRecognitionEvent("audioend")); audioStarted = soundStarted = speechStarted = false; break; } else if (stopping !== "abort") { if (recognized && recognized.result && recognized.result.reason === ResultReason2.NoMatch) { if (!this.continuous || stopping === "stop") { finalEvent = new SpeechRecognitionEvent("result", { results: new SpeechRecognitionResultList(finalizedResults) }); recognizer.stopContinuousRecognitionAsync && await cognitiveServicesAsyncToPromise( recognizer.stopContinuousRecognitionAsync, recognizer )(); break; } } else if (recognized || recognizing) { if (!audioStarted) { this.dispatchEvent(new SpeechRecognitionEvent("audiostart")); audioStarted = true; } if (!soundStarted) { this.dispatchEvent(new SpeechRecognitionEvent("soundstart")); soundStarted = true; } if (!speechStarted) { this.dispatchEvent(new SpeechRecognitionEvent("speechstart")); speechStarted = true; } if (recognized) { const result = cognitiveServiceEventResultToWebSpeechRecognitionResult(recognized.result, { maxAlternatives: this.maxAlternatives, textNormalization }); const recognizable = !!result[0]?.transcript; if (recognizable) { finalizedResults = [...finalizedResults, result]; this.continuous && this.dispatchEvent( new SpeechRecognitionEvent("result", { results: new SpeechRecognitionResultList(finalizedResults) }) ); } if (this.continuous && recognizable) { finalEvent = void 0; } else { finalEvent = new SpeechRecognitionEvent("result", { results: new SpeechRecognitionResultList(finalizedResults) }); } if ((!this.continuous || stopping === "stop") && recognizer.stopContinuousRecognitionAsync) { await cognitiveServicesAsyncToPromise(recognizer.stopContinuousRecognitionAsync, recognizer)(); } if (looseEvents && finalEvent && recognizable) { this.dispatchEvent(finalEvent); finalEvent = void 0; } } else if (recognizing) { this.interimResults && this.dispatchEvent( new SpeechRecognitionEvent("result", { results: new SpeechRecognitionResultList([ ...finalizedResults, cognitiveServiceEventResultToWebSpeechRecognitionResult(recognizing.result, { maxAlternatives: this.maxAlternatives, textNormalization }) ]) }) ); } } } } if (speechStarted) { this.dispatchEvent(new SpeechRecognitionEvent("speechend")); } if (soundStarted) { this.dispatchEvent(new SpeechRecognitionEvent("soundend")); } if (audioStarted) { this.dispatchEvent(new SpeechRecognitionEvent("audioend")); } if (finalEvent) { if (finalEvent.type === "result" && !finalEvent.results.length) { finalEvent = new SpeechRecognitionErrorEvent("error", { error: "no-speech" }); } this.dispatchEvent(finalEvent); } this.dispatchEvent(new SpeechRecognitionEvent("end")); detachAudioConfigEvent(); } catch (err) { console.error(err); throw err; } finally { unprepare(); recognizer["dispose"](false); } } } return { SpeechGrammarList, SpeechRecognition, SpeechRecognitionEvent }; } // src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfill.ts var { OutputFormat: OutputFormat2, SpeechConfig: SpeechConfig2, SpeechRecognizer: SpeechRecognizer3 } = SpeechSDK_default; function createSpeechRecognitionPonyfill(options) { const { audioConfig, enableTelemetry, fetchCredentials, initialSilenceTimeout, looseEvents, referenceGrammars, speechRecognitionEndpointId, textNormalization } = patchOptions(options); if (!audioConfig && (!window.navigator.mediaDevices || !window.navigator.mediaDevices.getUserMedia)) { throw new Error( "web-speech-cognitive-services: This browser does not support Media Capture and Streams API and it will not work with Cognitive Services Speech Services." ); } const createRecognizer = async (lang) => { const credentials = await fetchCredentials(); let speechConfig; if (typeof credentials.speechRecognitionHostname !== "undefined") { const host = new URL("wss://localhost:443"); host.hostname = credentials.speechRecognitionHostname; if (credentials.authorizationToken) { speechConfig = SpeechConfig2.fromHost(host); speechConfig.authorizationToken = credentials.authorizationToken; } else { speechConfig = SpeechConfig2.fromHost(host, credentials.subscriptionKey); } } else { speechConfig = typeof credentials.authorizationToken !== "undefined" ? SpeechConfig2.fromAuthorizationToken(credentials.authorizationToken, credentials.region) : SpeechConfig2.fromSubscription(credentials.subscriptionKey, credentials.region); } if (speechRecognitionEndpointId) { speechConfig.endpointId = speechRecognitionEndpointId; } speechConfig.outputFormat = OutputFormat2.Detailed; speechConfig.speechRecognitionLanguage = lang || "en-US"; typeof initialSilenceTimeout === "number" && speechConfig.setProperty(import_microsoft_cognitiveservices_speech_sdk2.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "" + initialSilenceTimeout); return new SpeechRecognizer3(speechConfig, audioConfig); }; return createSpeechRecognitionPonyfillFromRecognizer({ createRecognizer, enableTelemetry, looseEvents, referenceGrammars, textNormalization }); } // src/SpeechServices/TextToSpeech/createSpeechSynthesisPonyfill.js var import_event_target_shim3 = require("event-target-shim"); var import_async = require("on-error-resume-next/async"); // src/SpeechServices/TextToSpeech/AudioContextQueue.js var import_memoize_one = __toESM(require("memoize-one")); // src/SpeechServices/TextToSpeech/AudioContextConsumer.js var AudioContextConsumer = class { constructor(audioContext) { this.audioContext = audioContext; } pause() { this.audioContext && this.audioContext.suspend(); this.playingUtterance && this.playingUtterance.dispatchEvent(new CustomEvent("pause")); } resume() { this.audioContext && this.audioContext.resume(); this.playingUtterance && this.playingUtterance.dispatchEvent(new CustomEvent("resume")); } async start(queue) { let utterance; while (utterance = queue.shift()) { this.playingUtterance = utterance; await utterance.play(this.audioContext); this.playingUtterance = null; } } stop() { this.playingUtterance && this.playingUtterance.stop(); if (this.audioContext.state === "suspended") { this.audioContext.resume(); } } }; // src/SpeechServices/TextToSpeech/AudioContextQueue.js var AudioContextQueue = class { constructor({ audioContext, ponyfill }) { this.consumer = null; this.paused = false; this.queue = []; this.getAudioContext = (0, import_memoize_one.default)(() => audioContext || new ponyfill.AudioContext()); } pause() { this.paused = true; this.consumer && this.consumer.pause(); } push(utterance) { this.queue.push(utterance); this.startConsumer(); } resume() { this.paused = false; if (this.consumer) { this.consumer.resume(); } else { this.startConsumer(); } } get speaking() { return !!this.consumer; } async startConsumer() { while (!this.paused && this.queue.length && !this.consumer) { this.consumer = new AudioContextConsumer(this.getAudioContext()); await this.consumer.start(this.queue); this.consumer = null; } } stop() { this.queue.splice(0); this.consumer && this.consumer.stop(); } }; // src/SpeechServices/TextToSpeech/SpeechSynthesisEvent.js var import_event_target_shim = require("event-target-shim"); var SpeechSynthesisEvent = class extends import_event_target_shim.Event { constructor(type) { super(type); } }; // src/SpeechServices/TextToSpeech/SpeechSynthesisUtterance.js var import_event_as_promise = require("event-as-promise"); var import_event_target_shim2 = require("event-target-shim"); // src/SpeechServices/TextToSpeech/fetchSpeechData.js var import_base64_arraybuffer = require("base64-arraybuffer"); // src/SpeechServices/TextToSpeech/buildSSML.js function relativePercentage(value) { let relative = Math.round((value - 1) * 100); if (relative >= 0) { relative = "+" + relative; } return relative + "%"; } function buildSSML({ lang, pitch = 1, rate = 1, text, voice, volume }) { return `<speak version="1.0" xml:lang="${lang}"> <voice xml:lang="${lang}" name="${voice}"> <prosody pitch="${relativePercentage(pitch)}" rate="${relativePercentage(rate)}" volume="${relativePercentage( volume )}"> ${text} </prosody> </voice> </speak>`; } // src/SpeechServices/TextToSpeech/isSSML.js var SPEAK_TAG_PATTERN = /^\s*<speak(\s|\/?>)/u; var XML_PROLOG_PATTERN = /^\s*<\?xml\s/u; function isSSML(text) { return SPEAK_TAG_PATTERN.test(text) || XML_PROLOG_PATTERN.test(text); } // src/SpeechServices/TextToSpeech/fetchSpeechData.js var DEFAULT_LANGUAGE = "en-US"; var DEFAULT_OUTPUT_FORMAT = "riff-16khz-16bit-mono-pcm"; var DEFAULT_VOICE = "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)"; var EMPTY_MP3_BASE64 = "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU3LjU2LjEwMQAAAAAAAAAAAAAA//tAwAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAACAAABhgC7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7//////////////////////////////////////////////////////////////////8AAAAATGF2YzU3LjY0AAAAAAAAAAAAAAAAJAUHAAAAAAAAAYYoRBqpAAAAAAD/+xDEAAPAAAGkAAAAIAAANIAAAARMQU1FMy45OS41VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVf/7EMQpg8AAAaQAAAAgAAA0gAAABFVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV"; async function fetchSpeechData({ deploymentId, fetchCredentials, lang = DEFAULT_LANGUAGE, outputFormat = DEFAULT_OUTPUT_FORMAT, pitch, rate, text, voice = DEFAULT_VOICE, volume }) { if (!text) { return (0, import_base64_arraybuffer.decode)(EMPTY_MP3_BASE64); } const { authorizationToken, region, speechSynthesisHostname, subscriptionKey } = await fetchCredentials(); if (authorizationToken && subscriptionKey || !authorizationToken && !subscriptionKey) { throw new Error('Only "authorizationToken" or "subscriptionKey" should be set.'); } else if (region && speechSynthesisHostname || !region && !speechSynthesisHostname) { throw new Error('Only "region" or "speechSynthesisHostnamename" should be set.'); } const ssml = isSSML(text) ? text : buildSSML({ lang, pitch, rate, text, voice, volume }); const hostname = speechSynthesisHostname || (deploymentId ? `${encodeURI(region)}.voice.speech.microsoft.com` : `${encodeURI(region)}.tts.speech.microsoft.com`); const search = deploymentId ? `?deploymentId=${encodeURI(deploymentId)}` : ""; const url = `https://${hostname}/cognitiveservices/v1${search}`; const res = await fetch(url, { headers: { "Content-Type": "application/ssml+xml", "X-Microsoft-OutputFormat": outputFormat, ...authorizationToken ? { Authorization: `Bearer ${authorizationToken}` } : { "Ocp-Apim-Subscription-Key": subscriptionKey } }, method: "POST", body: ssml }); if (!res.ok) { throw new Error(`web-speech-cognitive-services: Failed to syntheis speech, server returned ${res.status}`); } return res.arrayBuffer(); } // src/SpeechServices/TextToSpeech/subscribeEvent.js function subscribeEvent(target, name, handler) { target.addEventListener(name, handler); return () => target.removeEventListener(name, handler); } // src/SpeechServices/TextToSpeech/SpeechSynthesisUtterance.js function asyncDecodeAudioData(audioContext, arrayBuffer) { return new Promise((resolve, reject) => { const promise = audioContext.decodeAudioData(arrayBuffer, resolve, reject); promise && typeof promise.then === "function" && resolve(promise); }); } function playDecoded(audioContext, audioBuffer, source) { return new Promise((resolve, reject) => { const audioContextClosed = new import_event_as_promise.EventAsPromise(); const sourceEnded = new import_event_as_promise.EventAsPromise(); const unsubscribe = subscribeEvent( audioContext, "statechange", ({ target: { state } }) => state === "closed" && audioContextClosed.eventListener() ); try { source.buffer = audioBuffer; source.onended = sourceEnded.eventListener; source.connect(audioContext.destination); source.start(0); Promise.race([audioContextClosed.upcoming(), sourceEnded.upcoming()]).then(resolve); } catch (err) { reject(err); } finally { unsubscribe(); } }); } var SpeechSynthesisUtterance = class extends import_event_target_shim2.EventTarget { constructor(text) { super(); this._lang = null; this._pitch = 1; this._rate = 1; this._voice = null; this._volume = 1; this.text = text; this.onboundary = null; this.onend = null; this.onerror = null; this.onmark = null; this.onpause = null; this.onresume = null; this.onstart = null; } get lang() { return this._lang; } set lang(value) { this._lang = value; } get onboundary() { return (0, import_event_target_shim2.getEventAttributeValue)(this, "boundary"); } set onboundary(value) { (0, import_event_target_shim2.setEventAttributeValue)(this, "boundary", value); } get onend() { return (0, import_event_target_shim2.getEventAttributeValue)(this, "end"); } set onend(value) { (0, import_event_target_shim2.setEventAttributeValue)(this, "end", value); } get onerror() { return (0, import_event_target_shim2.getEventAttributeValue)(this, "error"); } set onerror(value) { (0, import_event_target_shim2.setEventAttributeValue)(this, "error", value); } get onmark() { return (0, import_event_target_shim2.getEventAttributeValue)(this, "mark"); } set onmark(value) { (0, import_event_target_shim2.setEventAttributeValue)(this, "mark", value); } get onpause() { return (0, import_event_target_shim2.getEventAttributeValue)(this, "pause"); } set onpause(value) { (0, import_event_target_shim2.setEventAttributeValue)(this, "pause", value); } get onresume() { return (0, import_event_target_shim2.getEventAttributeValue)(this, "resume"); } set onresume(value) { (0, import_event_target_shim2.setEventAttributeValue)(this, "resume", value); } get onstart() { return (0, import_event_target_shim2.getEventAttributeValue)(this, "start"); } set onstart(value) { (0, import_event_target_shim2.setEventAttributeValue)(this, "start", value); } get pitch() { return this._pitch; } set pitch(value) { this._pitch = value; } get rate() { return this._rate; } set rate(value) { this._rate = value; } get voice() { return this._voice; } set voice(value) { this._voice = value; } get volume() { return this._volume; } set volume(value) { this._volume = value; } preload({ deploymentId, fetchCredentials, outputFormat }) { this.arrayBufferPromise = fetchSpeechData({ fetchCredentials, deploymentId, lang: this.lang || window.navigator.language, outputFormat, pitch: this.pitch, rate: this.rate, text: this.text, voice: this.voice && this.voice.voiceURI, volume: this.volume }); this.arrayBufferPromise.catch(); } async play(audioContext) { try { this.dispatchEvent(new SpeechSynthesisEvent("start")); const source = audioContext.createBufferSource(); const audioBuffer = await asyncDecodeAudioData(audioContext, await this.arrayBufferPromise); this._playingSource = source; await playDecoded(audioContext, audioBuffer, source); this._playingSource = null; this.dispatchEvent(new SpeechSynthesisEvent("end")); } catch (error) { this.dispatchEvent(new ErrorEvent("error", { error: "synthesis-failed", message: error.stack })); } } stop() { this._playingSource && this._playingSource.stop(); } }; var SpeechSynthesisUtterance_default = SpeechSynthesisUtterance; // src/SpeechServices/TextToSpeech/SpeechSynthesisVoice.js var SpeechSynthesisVoice = class { constructor({ gender, lang, voiceURI }) { this._default = false; this._gender = gender; this._lang = lang; this._localService = false; this._name = voiceURI; this._voiceURI = voiceURI; } get default() { return this._default; } get gender() { return this._gender; } get lang() { return this._lang; } get localService() { return this._localService; } get name() { return this._name; } get voiceURI() { return this._voiceURI; } }; // src/SpeechServices/TextToSpeech/fetchCustomVoices.js async function fetchCustomVoices_({ customVoiceHostname, deploymentId, region, subscriptionKey }) { const hostname = customVoiceHostname || `${region}.customvoice.api.speech.microsoft.com`; const res = await fetch( `https://${encodeURI(hostname)}/api/texttospeech/v2.0/endpoints/${encodeURIComponent(deploymentId)}`, { headers: { accept: "application/json", "ocp-apim-subscription-key": subscriptionKey } } ); if (!res.ok) { throw new Error("Failed to fetch custom voices"); } return res.json(); } async function fetchCustomVoices({ customVoiceHostname, deploymentId, region, subscriptionKey }) { const { models } = await fetchCustomVoices_({ customVoiceHostname, deploymentId, region, subscriptionKey }); return models.map( ({ properties: { Gender: gender }, locale: lang, name: voiceURI }) => new SpeechSynthesisVoice({ gender, lang, voiceURI }) ).sort(({ name: x }, { name: y }) => x > y ? 1 : x < y ? -1 : 0); } // src/SpeechServices/TextToSpeech/fetchVoices.js async function fetchVoices({ authorizationToken, region, speechSynthesisHostname, subscriptionKey }) { const hostname = speechSynthesisHostname || `${encodeURI(region)}.tts.speech.microsoft.com`; const res = await fetch(`https://${hostname}/cognitiveservices/voices/list`, { headers: { "content-type": "application/json", ...authorizationToken ? { authorization: `Bearer ${authorizationToken}` } : { "Ocp-Apim-Subscription-Key": subscriptionKey } } }); if (!res.ok) { throw new Error("Failed to fetch voices"); } const voices = await res.json(); return voices.map(({ Gender: gender, Locale: lang, Name: voiceURI }) => new SpeechSynthesisVoice({ gender, lang, voiceURI })).sort(({ name: x }, { name: y }) => x > y ? 1 : x < y ? -1 : 0); } // src/SpeechServices/TextToSpeech/createSpeechSynthesisPonyfill.js var DEFAULT_OUTPUT_FORMAT2 = "audio-24khz-160kbitrate-mono-mp3"; var EMPTY_ARRAY = []; function createSpeechRecognitionPonyfill2(options) { const { audioContext, fetchCredentials, ponyfill = { AudioContext: window.AudioContext || window.webkitAudioContext }, speechSynthesisDeploymentId, speechSynthesisOutputFormat = DEFAULT_OUTPUT_FORMAT2 } = patchOptions(options); if (!audioContext && !ponyfill.AudioContext) { console.warn( "web-speech-cognitive-services: This browser does not support Web Audio and it will not work with Cognitive Services Speech Services." ); return {}; } class SpeechSynthesis extends import_event_target_shim3.EventTarget { constructor() { super(); this.queue = new AudioContextQueue({ audioContext, ponyfill }); this.updateVoices(); } cancel() { this.queue.stop(); } getVoices() { return EMPTY_ARRAY; } get onvoiceschanged() { return (0, import_event_target_shim3.getEventAttributeValue)(this, "voiceschanged"); } set onvoiceschanged(value) { (0, import_event_target_shim3.setEventAttributeValue)(this, "voiceschanged", value); } pause() { this.queue.pause(); } resume() { this.queue.resume(); } speak(utterance) { if (!(utterance instanceof SpeechSynthesisUtterance_default)) { throw new Error("invalid utterance"); } const { reject, resolve, promise } = pDefer(); const handleError = ({ error: errorCode, message }) => { const error = new Error(errorCode); error.stack = message; reject(error); }; utterance.addEventListener("end", resolve); utterance.addEventList