web-speech-cognitive-services
Version:
Polyfill Web Speech API with Cognitive Services Speech-to-Text service
1,381 lines (1,333 loc) • 53.1 kB
JavaScript
"use strict";
// ---------------------------------------------------------------------------
// Bundler-generated CommonJS/ESM interop helpers.
// ---------------------------------------------------------------------------
// Short aliases for the Object reflection primitives used by the helpers below.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Defines every key of `all` on `target` as an enumerable getter-only property,
// using the function stored in `all[name]` as the getter.
var __export = (target, all) => {
for (var name in all)
__defProp(target, name, { get: all[name], enumerable: true });
};
// Copies the own property names of `from` onto `to` as live getters (each read
// goes back to `from[key]`). Keys that `to` already owns, and the key named by
// `except`, are skipped. Enumerability mirrors the source descriptor.
// Returns `to`.
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") {
for (let key of __getOwnPropNames(from))
if (!__hasOwnProp.call(to, key) && key !== except)
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
}
return to;
};
// Wraps a required module so it can be consumed like an ES module namespace:
// the wrapper inherits from the module's prototype (or is a plain object when
// `mod` is null/undefined) and receives the module's own properties as getters.
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
// If the importer is in node compatibility mode or this is not an ESM
// file that has been converted to a CommonJS file using a Babel-
// compatible transform (i.e. "__esModule" has not been set), then set
// "default" to the CommonJS "module.exports" for node compatibility.
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
mod
));
// Produces the object assigned to `module.exports`: a fresh object flagged with
// `__esModule: true` that exposes the properties of `mod` through getters.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// src/index.js
// Public API of the package. Each entry is registered as a lazy getter, so the
// bindings (all defined further down in this bundle) are only read when a
// consumer accesses the corresponding export.
var src_exports = {};
__export(src_exports, {
createSpeechRecognitionPonyfill: () => createSpeechRecognitionPonyfill,
createSpeechRecognitionPonyfillFromRecognizer: () => createSpeechRecognitionPonyfillFromRecognizer,
createSpeechServicesPonyfill: () => SpeechServices_default,
createSpeechSynthesisPonyfill: () => TextToSpeech_default,
fetchAuthorizationToken: () => fetchAuthorizationToken
});
module.exports = __toCommonJS(src_exports);
// src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfill.ts
var import_microsoft_cognitiveservices_speech_sdk2 = require("microsoft-cognitiveservices-speech-sdk");
// src/SpeechServices/patchOptions.ts
var import_valibot8 = require("valibot");
// src/SpeechServices/resolveFunctionOrReturnValue.ts
/**
 * Reports whether a value is callable.
 *
 * @param {unknown} value - Anything.
 * @returns {boolean} `true` when `typeof value` is `"function"`.
 */
function isFunction(value) {
  const kind = typeof value;

  return kind === "function";
}
/**
 * Accepts either a value or a zero-argument factory for that value.
 *
 * @param {unknown} fnOrValue - A plain value, or a function producing it.
 * @returns {unknown} The result of calling `fnOrValue` when it is a function
 *   (which may itself be a promise), otherwise `fnOrValue` unchanged.
 */
function resolveFunctionOrReturnValue(fnOrValue) {
  if (!isFunction(fnOrValue)) {
    return fnOrValue;
  }

  return fnOrValue();
}
// src/SpeechServices/SpeechSDK.ts
var import_microsoft_cognitiveservices_speech_sdk = require("microsoft-cognitiveservices-speech-sdk/distrib/lib/microsoft.cognitiveservices.speech.sdk.js");
// Facade over the Speech SDK distribution bundle: re-exposes only the members
// that the rest of this file reads (AudioConfig, OutputFormat, ResultReason,
// SpeechConfig and SpeechRecognizer).
var SpeechSDK = {
AudioConfig: import_microsoft_cognitiveservices_speech_sdk.AudioConfig,
OutputFormat: import_microsoft_cognitiveservices_speech_sdk.OutputFormat,
ResultReason: import_microsoft_cognitiveservices_speech_sdk.ResultReason,
SpeechConfig: import_microsoft_cognitiveservices_speech_sdk.SpeechConfig,
SpeechRecognizer: import_microsoft_cognitiveservices_speech_sdk.SpeechRecognizer
};
var SpeechSDK_default = SpeechSDK;
// src/SpeechServices/SpeechToText/validation/credentialsSchema.ts
var import_valibot = require("valibot");
// Validation schema for the resolved "credentials" option.
//
// The value must satisfy BOTH of the following groups (they are intersected):
//   1. Authentication: exactly one of "authorizationToken" or "subscriptionKey".
//   2. Endpoint: either "region" alone, or both "speechRecognitionHostname" and
//      "speechSynthesisHostname" (optionally with "customVoiceHostname"),
//      but never "region" together with any "*Hostname".
// The parsed output is typed as read-only.
var credentialsSchema = (0, import_valibot.pipe)(
  (0, import_valibot.intersect)([
    (0, import_valibot.union)(
      [
        // Variant: authenticated by authorization token.
        (0, import_valibot.object)({
          authorizationToken: (0, import_valibot.string)(),
          subscriptionKey: (0, import_valibot.optional)(
            (0, import_valibot.undefined_)('"subscriptionKey" must be unset when "authorizationToken" is set.')
          )
        }),
        // Variant: authenticated by subscription key.
        (0, import_valibot.object)({
          authorizationToken: (0, import_valibot.optional)(
            (0, import_valibot.undefined_)('"authorizationToken" must be unset when "subscriptionKey" is set.')
          ),
          subscriptionKey: (0, import_valibot.string)()
        })
      ],
      'The object must have either "authorizationToken" or "subscriptionKey" set, but not both.'
    ),
    (0, import_valibot.union)(
      [
        // Variant: endpoint derived from a region; no hostname may be given.
        (0, import_valibot.object)({
          customVoiceHostname: (0, import_valibot.optional)(
            (0, import_valibot.undefined_)('"customVoiceHostname" must be unset when "region" is set.')
          ),
          region: (0, import_valibot.string)(),
          speechRecognitionHostname: (0, import_valibot.optional)(
            (0, import_valibot.undefined_)('"speechRecognitionHostname" must be unset when "region" is set.')
          ),
          speechSynthesisHostname: (0, import_valibot.optional)(
            (0, import_valibot.undefined_)('"speechSynthesisHostname" must be unset when "region" is set.')
          )
        }),
        // Variant: explicit hostnames; "region" must be absent.
        (0, import_valibot.object)({
          customVoiceHostname: (0, import_valibot.optional)(
            (0, import_valibot.union)([(0, import_valibot.string)(), (0, import_valibot.undefined_)()])
          ),
          region: (0, import_valibot.optional)(
            (0, import_valibot.undefined_)('"region" must be unset when "*Hostname" is set.')
          ),
          speechRecognitionHostname: (0, import_valibot.string)(),
          speechSynthesisHostname: (0, import_valibot.string)()
        })
      ],
      'The object must have either "region" or "*Hostname" set, but not both.'
    )
  ]),
  (0, import_valibot.readonly)()
);
var credentialsSchema_default = credentialsSchema;
// src/SpeechServices/SpeechToText/validation/enableTelemetrySchema.ts
var import_valibot2 = require("valibot");
// Schema for the "enableTelemetry" option: an optional boolean with no default.
var enableTelemetrySchema = (0, import_valibot2.optional)((0, import_valibot2.boolean)());
var enableTelemetrySchema_default = enableTelemetrySchema;
// src/SpeechServices/SpeechToText/validation/initialSilenceTimeoutSchema.ts
var import_valibot3 = require("valibot");
// Schema for the "initialSilenceTimeout" option: an optional number constrained
// by minValue(1) and maxValue(6e4). The value is later forwarded to the Speech
// SDK property SpeechServiceConnection_InitialSilenceTimeoutMs, i.e. it is
// expressed in milliseconds.
var initialSilenceTimeoutSchema = (0, import_valibot3.optional)((0, import_valibot3.pipe)((0, import_valibot3.number)(), (0, import_valibot3.minValue)(1), (0, import_valibot3.maxValue)(6e4)));
var initialSilenceTimeoutSchema_default = initialSilenceTimeoutSchema;
// src/SpeechServices/SpeechToText/validation/looseEventsSchema.ts
var import_valibot4 = require("valibot");
// Schema for the "looseEvents" option: an optional boolean that falls back to
// `false` when the option is omitted.
var looseEventsSchema = (0, import_valibot4.optional)((0, import_valibot4.boolean)(), false);
var looseEventsSchema_default = looseEventsSchema;
// src/SpeechServices/SpeechToText/validation/referenceGrammarsSchema.ts
var import_valibot5 = require("valibot");
// Schema for the "referenceGrammars" option: an optional array of strings that
// falls back to an empty array when omitted.
var referenceGrammarsSchema = (0, import_valibot5.pipe)(
(0, import_valibot5.optional)((0, import_valibot5.array)((0, import_valibot5.string)()), []),
// The parsed array is always handed out frozen: an already-frozen input is
// reused as-is, anything else is shallow-copied and the copy is frozen so the
// caller's array is never mutated or frozen behind its back.
(0, import_valibot5.transform)((value) => Object.isFrozen(value) ? value : Object.freeze([...value]))
);
var referenceGrammarsSchema_default = referenceGrammarsSchema;
// src/SpeechServices/SpeechToText/validation/speechRecognitionEndpointIdSchema.ts
var import_valibot6 = require("valibot");
// Schema for the "speechRecognitionEndpointId" option: an optional string with
// no default. When set, it is assigned to `speechConfig.endpointId` by
// createSpeechRecognitionPonyfill.
var speechRecognitionEndpointIdSchema = (0, import_valibot6.optional)((0, import_valibot6.string)());
var speechRecognitionEndpointIdSchema_default = speechRecognitionEndpointIdSchema;
// src/SpeechServices/SpeechToText/validation/textNormalizationSchema.ts
var import_valibot7 = require("valibot");
// Schema for the "textNormalization" option: one of "display", "itn",
// "lexical" or "maskeditn", defaulting to "display" when omitted. The value
// selects which field of an NBest entry (Display / ITN / Lexical / MaskedITN)
// becomes the transcript of a final recognition result.
var textNormalizationSchema = (0, import_valibot7.optional)(
(0, import_valibot7.enum_)({
display: "display",
itn: "itn",
lexical: "lexical",
maskeditn: "maskeditn"
}),
"display"
);
var textNormalizationSchema_default = textNormalizationSchema;
// src/SpeechServices/patchOptions.ts
var { AudioConfig: AudioConfig2 } = SpeechSDK_default;
// Module-level latch: the subscription-key warning is logged at most once.
var shouldWarnOnSubscriptionKey = true;

/**
 * Validates and normalises the user-supplied ponyfill options.
 *
 * @param {object} init - Raw options. Reads `audioConfig`, `credentials`,
 *   `enableTelemetry`, `initialSilenceTimeout`, `looseEvents` (or the
 *   deprecated alias `looseEvent`), `referenceGrammars`,
 *   `speechRecognitionEndpointId` and `textNormalization`.
 * @returns {Readonly<object>} A frozen object with the validated options and a
 *   `fetchCredentials()` function that resolves and validates `credentials`
 *   every time it is called.
 * @throws When any option fails its schema (raised by valibot's `parse`).
 */
function patchOptions(init) {
  const { audioConfig, credentials, looseEvent } = init;
  let looseEvents;

  if (typeof looseEvent === "undefined") {
    ({ looseEvents } = init);
  } else {
    // The misspelled option is still honoured, and wins over "looseEvents".
    console.warn('web-speech-cognitive-services: The option "looseEvent" should be named as "looseEvents".');
    looseEvents = looseEvent;
  }

  const fetchCredentials = async () => {
    // `credentials` may be a value or a (possibly async) factory.
    const resolvedCredentials = await resolveFunctionOrReturnValue(credentials);
    const parsedCredentials = (0, import_valibot8.parse)(credentialsSchema_default, resolvedCredentials);

    if (shouldWarnOnSubscriptionKey && parsedCredentials.subscriptionKey) {
      console.warn(
        "web-speech-cognitive-services: In production environment, subscription key should not be used, authorization token should be used instead."
      );
      shouldWarnOnSubscriptionKey = false;
    }

    return parsedCredentials;
  };

  return Object.freeze({
    // Fall back to the default microphone when no audio source was supplied.
    audioConfig: audioConfig || AudioConfig2.fromDefaultMicrophoneInput(),
    // No default is applied here: the schema is a plain optional boolean, so an
    // omitted value is passed through as-is and the Speech SDK keeps its own
    // telemetry setting.
    // https://github.com/Microsoft/cognitive-services-speech-sdk-js#data--telemetry
    enableTelemetry: (0, import_valibot8.parse)(enableTelemetrySchema_default, init.enableTelemetry),
    fetchCredentials,
    initialSilenceTimeout: (0, import_valibot8.parse)(initialSilenceTimeoutSchema_default, init.initialSilenceTimeout),
    looseEvents: (0, import_valibot8.parse)(looseEventsSchema_default, looseEvents),
    referenceGrammars: (0, import_valibot8.parse)(referenceGrammarsSchema_default, init.referenceGrammars),
    speechRecognitionEndpointId: (0, import_valibot8.parse)(
      speechRecognitionEndpointIdSchema_default,
      init.speechRecognitionEndpointId
    ),
    textNormalization: (0, import_valibot8.parse)(textNormalizationSchema_default, init.textNormalization)
  });
}
// src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfillFromRecognizer.ts
var import_valibot9 = require("valibot");
// ../../node_modules/p-defer/index.js
/**
 * Creates a promise together with the functions that settle it.
 *
 * @returns {{ resolve: Function, reject: Function, promise: Promise<any> }}
 *   `promise` stays pending until `resolve` or `reject` is invoked.
 */
function pDefer() {
  let resolve;
  let reject;
  const promise = new Promise((fulfil, fail) => {
    resolve = fulfil;
    reject = fail;
  });

  return { resolve, reject, promise };
}
// src/Util/createPromiseQueue.js
/**
 * Creates a FIFO queue whose `shift()` is asynchronous.
 *
 * - `push(value)` hands the value straight to a pending `shift()` if there is
 *   one, otherwise buffers it.
 * - `shift()` resolves with the oldest buffered value, or returns a promise
 *   that resolves on the next `push`. While that promise is pending, further
 *   `shift()` calls return the very same promise.
 *
 * @returns {{ push: (value: any) => void, shift: () => Promise<any> }}
 */
function createPromiseQueue() {
  const backlog = [];
  let pendingShift;

  function push(value) {
    if (!pendingShift) {
      backlog.push(value);
      return;
    }

    // Clear the waiter before resolving so a re-entrant shift() gets a new one.
    const { resolve: release } = pendingShift;

    pendingShift = null;
    release(value);
  }

  function shift() {
    if (backlog.length) {
      return Promise.resolve(backlog.shift());
    }

    if (!pendingShift) {
      pendingShift = pDefer();
    }

    return pendingShift.promise;
  }

  return { push, shift };
}
// src/SpeechServices/SpeechToText/SpeechRecognitionAlternative.ts
var SpeechRecognitionAlternative = class {
constructor({ confidence, transcript }) {
this.#confidence = confidence;
this.#transcript = transcript;
}
#confidence;
#transcript;
get confidence() {
return this.#confidence;
}
get transcript() {
return this.#transcript;
}
};
// src/SpeechServices/SpeechToText/private/FakeArray.ts
/**
 * Read-only, array-like view over an existing array.
 *
 * Exposes `length`, iteration, and one enumerable getter per own enumerable
 * key of the backing array at construction time. The getters read through to
 * the backing array, so element replacement is visible, but indices added
 * after construction are not exposed.
 */
var FakeArray = class {
  #array;

  /**
   * @param {Array<any>} array2 - Backing array (not copied).
   * @throws {Error} When `array2` is falsy.
   */
  constructor(array2) {
    if (!array2) {
      throw new Error("array must be set.");
    }

    this.#array = array2;

    // Only the array's OWN enumerable keys are mirrored. `for...in` would also
    // walk inherited enumerable properties (e.g. anything a third-party script
    // added to Array.prototype) and leak them as indices of this view.
    for (const key of Object.keys(array2)) {
      Object.defineProperty(this, key, {
        enumerable: true,
        get() {
          return array2[key];
        }
      });
    }
  }

  /** Iterates the backing array's current elements. */
  [Symbol.iterator]() {
    return this.#array[Symbol.iterator]();
  }

  /** Current length of the backing array. */
  get length() {
    return this.#array.length;
  }
};
// src/SpeechServices/SpeechToText/SpeechRecognitionResult.ts
/**
 * Array-like list of recognition alternatives plus an `isFinal` flag,
 * mirroring the Web Speech API `SpeechRecognitionResult` shape.
 */
var SpeechRecognitionResult = class extends FakeArray {
  #isFinal;

  /**
   * @param {{ isFinal: boolean, results: Array<any> }} init - `results`
   *   becomes the backing array of the list; `isFinal` is stored as-is.
   */
  constructor(init) {
    const { results } = init;

    super(results);

    const { isFinal } = init;

    this.#isFinal = isFinal;
  }

  /** Flag supplied at construction. */
  get isFinal() {
    return this.#isFinal;
  }
};
// src/SpeechServices/SpeechToText/cognitiveServiceEventResultToWebSpeechRecognitionResult.ts
// The two ResultReason members that the result converter below branches on.
var {
ResultReason: { RecognizingSpeech, RecognizedSpeech }
} = SpeechSDK_default;
/**
 * Converts a Cognitive Services recognition result into a Web Speech API style
 * `SpeechRecognitionResult`.
 *
 * - Interim results (`RecognizingSpeech`), and final results that carry no
 *   `NBest` list, yield a single alternative built from `result.text` with a
 *   fixed confidence of 0.5.
 * - Final results (`RecognizedSpeech`) with an `NBest` list yield up to
 *   `maxAlternatives` alternatives, using the field selected by
 *   `textNormalization`.
 * - Any other reason yields an empty, non-final result.
 *
 * @param {{ json?: string | object, reason: any, text?: string }} result
 *   `json` may be a JSON string, an already-parsed object, or absent/empty.
 * @param {{ maxAlternatives?: number, textNormalization?: string }} [init]
 *   Defaults: no limit on alternatives, "display" normalization.
 * @returns {SpeechRecognitionResult}
 * @throws {SyntaxError} When `result.json` is a non-empty string that is not
 *   valid JSON.
 */
function cognitiveServiceEventResultToWebSpeechRecognitionResult(result, init) {
  const { maxAlternatives = Infinity, textNormalization = "display" } = init || {};
  const { json: rawJSON, reason } = result;
  // An empty string is not valid JSON; treat it like a missing payload instead
  // of letting JSON.parse("") throw.
  const json = typeof rawJSON === "string" && rawJSON ? JSON.parse(rawJSON) : rawJSON;
  // `json` can be undefined/empty for a final result; optional chaining keeps
  // that case on the plain-text fallback rather than raising a TypeError.
  const nBest = json?.NBest;

  if (reason === RecognizingSpeech || (reason === RecognizedSpeech && !nBest)) {
    return new SpeechRecognitionResult({
      isFinal: reason === RecognizedSpeech,
      results: [
        new SpeechRecognitionAlternative({
          confidence: 0.5,
          transcript: result.text
        })
      ]
    });
  }

  if (reason === RecognizedSpeech) {
    return new SpeechRecognitionResult({
      isFinal: true,
      results: nBest.slice(0, maxAlternatives).map(
        ({ Confidence: confidence, Display: display, ITN: itn, Lexical: lexical, MaskedITN: maskedITN }) =>
          new SpeechRecognitionAlternative({
            confidence,
            transcript:
              textNormalization === "itn"
                ? itn
                : textNormalization === "lexical"
                  ? lexical
                  : textNormalization === "maskeditn"
                    ? maskedITN
                    : display
          })
      )
    });
  }

  return new SpeechRecognitionResult({ isFinal: false, results: [] });
}
// src/SpeechServices/SpeechToText/cognitiveServicesAsyncToPromise.ts
/**
 * Adapts a callback-style function of the form
 * `fn(...args, onSuccess, onError)` into one that returns a promise.
 *
 * @param {Function} fn - Function taking success and error callbacks as its
 *   last two arguments.
 * @param {any} [context] - `this` value used when invoking `fn`.
 * @returns {(...args: any[]) => Promise<any>} Wrapper that resolves with the
 *   value passed to the success callback and rejects with the value passed to
 *   the error callback (or with whatever `fn` throws synchronously).
 */
function cognitiveServicesAsyncToPromise(fn, context = void 0) {
  const invoke = (args, onSuccess, onError) => fn.call(context, ...args, onSuccess, onError);

  return (...args) =>
    new Promise((onSuccess, onError) => {
      invoke(args, onSuccess, onError);
    });
}
// src/SpeechServices/SpeechToText/private/EventListenerMap.ts
/**
 * Backs `on<event>` style properties for an EventTarget: each event name holds
 * at most one handler, and assigning a new one replaces the previous listener
 * on the target.
 */
var EventListenerMap = class {
  #eventTarget;
  #propertyMap;

  /**
   * @param {EventTarget} eventTarget - Target the handlers are attached to.
   */
  constructor(eventTarget) {
    this.#eventTarget = eventTarget;
    this.#propertyMap = {};
  }

  /**
   * @param {string} name - Event name.
   * @returns {any} The value last stored for `name`, if any.
   */
  getProperty(name) {
    return this.#propertyMap[name];
  }

  /**
   * Swaps the handler registered for `name`.
   *
   * @param {string} name - Event name.
   * @param {Function | null | undefined} value - New handler; a falsy value
   *   only removes the previous one.
   */
  setProperty(name, value) {
    const target = this.#eventTarget;
    const previous = this.#propertyMap[name];

    if (previous) {
      target.removeEventListener(name, previous);
    }

    if (value) {
      target.addEventListener(name, value);
    }

    this.#propertyMap[name] = value;
  }
};
// src/SpeechServices/SpeechToText/private/prepareAudioConfig.ts
var import_AudioSourceEvents = require("microsoft-cognitiveservices-speech-sdk/distrib/lib/src/common/AudioSourceEvents.js");
// src/SpeechServices/SpeechToText/private/averageAmplitude.ts
/**
 * Computes the mean absolute sample value of a buffer interpreted as signed
 * 16-bit integers (via `Int16Array`).
 *
 * @param {ArrayBuffer} arrayBuffer - Raw sample data.
 * @returns {number} Mean of `Math.abs(sample)` over all samples, or `0` when
 *   the buffer holds no samples (previously this was `NaN` from `0 / 0`).
 */
function averageAmplitude(arrayBuffer) {
  // Iterate the typed-array view directly; no intermediate Array copy needed.
  const samples = new Int16Array(arrayBuffer);

  if (samples.length === 0) {
    return 0;
  }

  let total = 0;

  for (const sample of samples) {
    total += Math.abs(sample);
  }

  return total / samples.length;
}
// src/SpeechServices/SpeechToText/private/prepareAudioConfig.ts
/**
 * Temporarily patches `audioConfig.attach` so the stream reader it returns:
 *  - raises a one-time "FirstAudibleChunk" event on `audioConfig.events` when
 *    a chunk's average amplitude first exceeds 150, and
 *  - reports end-of-stream (empty buffer, `isEnd: true`) once `pause()` has
 *    been called.
 *
 * @param {object} audioConfig - Object exposing `attach()` and
 *   `events.onEvent()`; it is mutated in place.
 * @returns {{ audioConfig: object, pause: () => void, unprepare: () => void }}
 *   `pause` mutes subsequent reads; `unprepare` restores the original
 *   `attach`.
 */
function prepareAudioConfig(audioConfig) {
  const attachBeforePatch = audioConfig.attach;
  const attachOriginal = audioConfig.attach.bind(audioConfig);
  // Shared across every reader created through the patched attach().
  const state = { audibleChunkSeen: false, muted: false };

  const createRead = (reader) => async () => {
    const chunk = await reader.read();

    if (!state.audibleChunkSeen && averageAmplitude(chunk.buffer) > 150) {
      audioConfig.events.onEvent(new import_AudioSourceEvents.AudioSourceEvent("FirstAudibleChunk", ""));
      state.audibleChunkSeen = true;
    }

    // The underlying chunk is still consumed while muted; only the value
    // handed to the caller is replaced with an end-of-stream marker.
    return state.muted ? { buffer: new ArrayBuffer(0), isEnd: true, timeReceived: Date.now() } : chunk;
  };

  audioConfig.attach = async () => {
    const reader = await attachOriginal("");

    return { ...reader, read: createRead(reader) };
  };

  const pause = () => {
    state.muted = true;
  };

  const unprepare = () => {
    audioConfig.attach = attachBeforePatch;
  };

  return { audioConfig, pause, unprepare };
}
// src/SpeechServices/SpeechToText/private/serializeRecognitionResult.ts
/**
 * Takes a frozen, plain-object snapshot of a recognition result.
 *
 * @param {object} recognitionResult - Reads `duration`, `errorDetails`,
 *   `json`, `offset`, `properties`, `reason`, `resultId` and `text`; any other
 *   property is dropped.
 * @returns {Readonly<object>} The same fields, with a truthy `json` string
 *   parsed into a value (a falsy `json` is passed through untouched).
 * @throws {SyntaxError} When `json` is truthy but not valid JSON.
 */
function serializeRecognitionResult(recognitionResult) {
  const {
    duration,
    errorDetails,
    json: rawJSON,
    offset,
    properties,
    reason,
    resultId,
    text
  } = recognitionResult;

  const snapshot = {
    duration,
    errorDetails,
    json: rawJSON && JSON.parse(rawJSON),
    offset,
    properties,
    reason,
    resultId,
    text
  };

  return Object.freeze(snapshot);
}
// src/SpeechServices/SpeechToText/SpeechGrammarList.ts
/**
 * Stand-in for the Web Speech API `SpeechGrammarList`.
 *
 * The standard JSGF members (`addFromString`, `addFromURI`, `item`, `length`)
 * are unsupported and always throw. Phrases are supplied through the
 * non-standard `phrases` property instead.
 */
var SpeechGrammarList = class {
  #phrases = [];

  addFromString() {
    throw new Error("JSGF is not supported");
  }

  addFromURI() {
    throw new Error("JSGF is not supported");
  }

  item() {
    throw new Error("JSGF is not supported");
  }

  get length() {
    throw new Error("JSGF is not supported");
  }

  /** Current phrase list. */
  get phrases() {
    return this.#phrases;
  }

  /**
   * Accepts an array (copied, then frozen) or a single string (wrapped in a
   * frozen one-element array).
   *
   * @throws {Error} For any other kind of value.
   */
  set phrases(value) {
    const isList = Array.isArray(value);

    if (!isList && typeof value !== "string") {
      throw new Error(`The provided value is not an array or of type 'string'`);
    }

    this.#phrases = Object.freeze(isList ? [...value] : [value]);
  }
};
// src/SpeechServices/SpeechToText/SpeechRecognitionErrorEvent.ts
/**
 * Event carrying a recognition error code and an optional message, mirroring
 * the Web Speech API `SpeechRecognitionErrorEvent` shape.
 */
var SpeechRecognitionErrorEvent = class extends Event {
  #error;
  #message;

  /**
   * @param {string} type - Passed to the `Event` constructor. Note that the
   *   `type` getter below reports "error" regardless of this value.
   * @param {{ error: any, message?: string }} init
   */
  constructor(type, init) {
    const { error, message } = init;

    super(type);
    this.#error = error;
    this.#message = message;
  }

  /** Error value supplied at construction. */
  get error() {
    return this.#error;
  }

  /** Message supplied at construction, if any. */
  get message() {
    return this.#message;
  }

  /** Always "error". */
  get type() {
    return "error";
  }
};
// src/SpeechServices/SpeechToText/SpeechRecognitionResultList.ts
// Array-like, read-only list of recognition results, mirroring the Web Speech
// API `SpeechRecognitionResultList` shape. All behaviour comes from FakeArray;
// the constructor only forwards the backing array.
var SpeechRecognitionResultList = class extends FakeArray {
constructor(result) {
super(result);
}
};
// src/SpeechServices/SpeechToText/SpeechRecognitionEvent.ts
/**
 * Event dispatched by the recognition ponyfill for every lifecycle step
 * ("start", "result", "end", ...) and for raw "cognitiveservices" events.
 */
var SpeechRecognitionEvent = class extends Event {
  #data;
  // TODO: "resultIndex" should be set.
  #resultIndex;
  #results;

  /**
   * @param {string} type - Event type.
   * @param {{ data?: any, resultIndex?: number, results?: any }} [init]
   *   `results` falls back to an empty `SpeechRecognitionResultList` when
   *   falsy; `data` and `resultIndex` are stored as given.
   */
  constructor(type, init = {}) {
    const { data, resultIndex, results } = init;

    super(type);
    this.#data = data;
    this.#resultIndex = resultIndex;
    this.#results = results || new SpeechRecognitionResultList([]);
  }

  /** Payload supplied at construction, if any. */
  get data() {
    return this.#data;
  }

  /** Index supplied at construction, if any. */
  get resultIndex() {
    return this.#resultIndex;
  }

  /** Result list supplied at construction, or an empty list. */
  get results() {
    return this.#results;
  }

  /** Event type, as reported by the base `Event`. */
  get type() {
    return super.type;
  }
};
// src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfillFromRecognizer.ts
var { ResultReason: ResultReason2, SpeechRecognizer: SpeechRecognizer2 } = SpeechSDK_default;
// Local telemetry schema: a boolean or undefined.
var enableTelemetrySchema2 = (0, import_valibot9.union)([(0, import_valibot9.boolean)(), (0, import_valibot9.undefined_)()]);
/**
 * Builds Web Speech API style recognition classes on top of a caller-supplied
 * Speech SDK recognizer factory.
 *
 * @param {object} options
 * @param {Function} options.createRecognizer - Called (and awaited) with the
 *   current `lang` of the SpeechRecognition instance each time recognition
 *   starts; must yield a recognizer.
 * @param {boolean} [options.enableTelemetry] - When defined, forwarded to
 *   `SpeechRecognizer.enableTelemetry()` once, at factory time.
 * @param {boolean} options.looseEvents - When true, a recognizable final
 *   "result" event is dispatched as soon as it is built rather than after the
 *   closing speechend/soundend/audioend events.
 * @param {string[]} [options.referenceGrammars] - Added to the recognizer's
 *   dynamic grammar before recognition starts.
 * @param {string} [options.textNormalization] - Selects which NBest field is
 *   used as the transcript of final results.
 * @returns {{ SpeechGrammarList, SpeechRecognition, SpeechRecognitionEvent }}
 * @throws When an option fails validation (raised by valibot's `parse`).
 */
function createSpeechRecognitionPonyfillFromRecognizer({
createRecognizer,
enableTelemetry,
looseEvents,
referenceGrammars,
textNormalization
}) {
createRecognizer = (0, import_valibot9.parse)((0, import_valibot9.function_)(), createRecognizer);
enableTelemetry = (0, import_valibot9.parse)(enableTelemetrySchema2, enableTelemetry);
// NOTE(review): this validates against a bare boolean(), so an omitted
// "looseEvents" is rejected here, whereas looseEventsSchema (used by
// patchOptions) defaults it to false. Confirm whether direct callers of this
// exported function are expected to always pass it.
looseEvents = (0, import_valibot9.parse)((0, import_valibot9.boolean)(), looseEvents);
referenceGrammars = (0, import_valibot9.parse)(referenceGrammarsSchema_default, referenceGrammars);
textNormalization = (0, import_valibot9.parse)(textNormalizationSchema_default, textNormalization);
// Only touch the SDK-wide telemetry switch when the caller expressed a choice.
typeof enableTelemetry !== "undefined" && SpeechRecognizer2.enableTelemetry(enableTelemetry);
class SpeechRecognition extends EventTarget {
#continuous = false;
// Backs the on<event> accessor properties below.
#eventListenerMap = new EventListenerMap(this);
#grammars = new SpeechGrammarList();
#interimResults = false;
// Default language: <html lang> or the navigator language in a browser,
// "en-US" when `window` is not defined.
#lang = typeof window !== "undefined" ? window.document.documentElement.getAttribute("lang") || window.navigator.language : "en-US";
// eslint-disable-next-line no-magic-numbers
#maxAlternatives = 1;
// Dispatches a "cognitiveservices" event whose `data` is the given payload
// plus a `type` field naming the underlying recognizer/queue event.
emitCognitiveServices(type, event) {
this.dispatchEvent(
new SpeechRecognitionEvent("cognitiveservices", {
data: {
...event,
type
}
})
);
}
get continuous() {
return this.#continuous;
}
set continuous(value) {
this.#continuous = value;
}
get grammars() {
return this.#grammars;
}
// Only instances of this ponyfill's SpeechGrammarList are accepted.
set grammars(value) {
if (value instanceof SpeechGrammarList) {
this.#grammars = value;
} else {
throw new Error(`The provided value is not of type 'SpeechGrammarList'`);
}
}
get interimResults() {
return this.#interimResults;
}
set interimResults(value) {
this.#interimResults = value;
}
get maxAlternatives() {
return this.#maxAlternatives;
}
set maxAlternatives(value) {
this.#maxAlternatives = value;
}
get lang() {
return this.#lang;
}
set lang(value) {
this.#lang = value;
}
// on<event> accessors: each stores a single handler per event name and keeps
// the matching event listener on this instance in sync.
get onaudioend() {
return this.#eventListenerMap.getProperty("audioend");
}
set onaudioend(value) {
this.#eventListenerMap.setProperty("audioend", value);
}
/** @type { ((event: SpeechRecognitionEvent<'audiostart'>) => void) | undefined } */
get onaudiostart() {
return this.#eventListenerMap.getProperty("audiostart");
}
set onaudiostart(value) {
this.#eventListenerMap.setProperty("audiostart", value);
}
/** @type { ((event: SpeechRecognitionEvent<'cognitiveservices'>) => void) | undefined } */
get oncognitiveservices() {
return this.#eventListenerMap.getProperty("cognitiveservices");
}
set oncognitiveservices(value) {
this.#eventListenerMap.setProperty("cognitiveservices", value);
}
/** @type { ((event: SpeechRecognitionEvent<'end'>) => void) | undefined } */
get onend() {
return this.#eventListenerMap.getProperty("end");
}
set onend(value) {
this.#eventListenerMap.setProperty("end", value);
}
/** @type { ((event: SpeechRecognitionErrorEvent) => void) | undefined } */
get onerror() {
return this.#eventListenerMap.getProperty("error");
}
set onerror(value) {
this.#eventListenerMap.setProperty("error", value);
}
/** @type { ((event: SpeechRecognitionEvent<'result'>) => void) | undefined } */
get onresult() {
return this.#eventListenerMap.getProperty("result");
}
set onresult(value) {
this.#eventListenerMap.setProperty("result", value);
}
/** @type { ((event: SpeechRecognitionEvent<'soundend'>) => void) | undefined } */
get onsoundend() {
return this.#eventListenerMap.getProperty("soundend");
}
set onsoundend(value) {
this.#eventListenerMap.setProperty("soundend", value);
}
/** @type { ((event: SpeechRecognitionEvent<'soundstart'>) => void) | undefined } */
get onsoundstart() {
return this.#eventListenerMap.getProperty("soundstart");
}
set onsoundstart(value) {
this.#eventListenerMap.setProperty("soundstart", value);
}
/** @type { ((event: SpeechRecognitionEvent<'speechend'>) => void) | undefined } */
get onspeechend() {
return this.#eventListenerMap.getProperty("speechend");
}
set onspeechend(value) {
this.#eventListenerMap.setProperty("speechend", value);
}
/** @type { ((event: SpeechRecognitionEvent<'speechstart'>) => void) | undefined } */
get onspeechstart() {
return this.#eventListenerMap.getProperty("speechstart");
}
set onspeechstart(value) {
this.#eventListenerMap.setProperty("speechstart", value);
}
/** @type { ((event: SpeechRecognitionEvent<'start'>) => void) | undefined } */
get onstart() {
return this.#eventListenerMap.getProperty("start");
}
set onstart(value) {
this.#eventListenerMap.setProperty("start", value);
}
// `abort` and `stop` are instance fields, undefined until a recognition
// session has started (they are assigned inside _startOnce).
abort;
stop;
// Starts one recognition session. Any rejection from _startOnce is surfaced
// as an "error" event instead of an unhandled promise rejection.
start() {
this._startOnce().catch((err) => {
this.dispatchEvent(
new SpeechRecognitionErrorEvent("error", { error: err, message: err && (err.stack || err.message) })
);
});
}
// Runs a single recognition session: creates the recognizer, funnels its
// callbacks into a queue, then translates queued events into Web Speech API
// events until the session ends.
async _startOnce() {
const recognizer = await createRecognizer(this.lang);
const { pause, unprepare } = prepareAudioConfig(recognizer["audioConfig"]);
try {
// All recognizer/audio callbacks and abort()/stop() requests are serialised
// through this queue and consumed one at a time by the loop below.
const queue = createPromiseQueue();
let soundStarted;
let speechStarted;
// Set to "abort" or "stop" once the caller asked to end the session.
let stopping;
const { detach: detachAudioConfigEvent } = recognizer["audioConfig"].events.attach(
(event) => {
const { name } = event;
if (name === "AudioSourceReadyEvent") {
queue.push({ audioSourceReady: {} });
} else if (name === "AudioSourceOffEvent") {
queue.push({ audioSourceOff: {} });
} else if (name === "FirstAudibleChunk") {
queue.push({ firstAudibleChunk: {} });
}
}
);
recognizer.canceled = (_, { errorDetails, offset, reason, sessionId }) => {
queue.push({
canceled: {
errorDetails,
offset,
reason,
sessionId
}
});
};
recognizer.recognized = (_, { offset, result, sessionId }) => {
queue.push({
recognized: {
offset,
result: serializeRecognitionResult(result),
sessionId
}
});
};
recognizer.recognizing = (_, { offset, result, sessionId }) => {
queue.push({
recognizing: {
offset,
result: serializeRecognitionResult(result),
sessionId
}
});
};
recognizer.sessionStarted = (_, { sessionId }) => {
queue.push({ sessionStarted: { sessionId } });
};
recognizer.sessionStopped = (_, { sessionId }) => {
queue.push({ sessionStopped: { sessionId } });
};
recognizer.speechStartDetected = (_, { offset, sessionId }) => {
queue.push({ speechStartDetected: { offset, sessionId } });
};
recognizer.speechEndDetected = (_, { sessionId }) => {
queue.push({ speechEndDetected: { sessionId } });
};
// Feed reference grammars and phrase hints into the recognizer's dynamic
// grammar (reached through the recognizer's "privReco" member).
const { phrases } = this.grammars;
const { dynamicGrammar } = recognizer["privReco"];
referenceGrammars && referenceGrammars.length && dynamicGrammar.addReferenceGrammar([...referenceGrammars]);
phrases && phrases.length && dynamicGrammar.addPhrase([...phrases]);
await cognitiveServicesAsyncToPromise(recognizer.startContinuousRecognitionAsync, recognizer)();
// abort()/stop() only enqueue a request; the loop below acts on it. They are
// left undefined when the recognizer cannot stop continuous recognition.
if (typeof recognizer.stopContinuousRecognitionAsync === "function") {
this.abort = () => queue.push({ abort: {} });
this.stop = () => queue.push({ stop: {} });
} else {
this.abort = this.stop = void 0;
}
let audioStarted;
// Event dispatched after the closing speechend/soundend/audioend events.
let finalEvent = void 0;
// Final results accumulated so far in this session.
let finalizedResults = [];
// Keep consuming while no stop was requested, or while audio is still open.
for (let loop = 0; !stopping || audioStarted; loop++) {
const event = await queue.shift();
const {
abort,
audioSourceOff,
audioSourceReady,
canceled,
firstAudibleChunk,
recognized,
recognizing,
stop
} = event;
// Mirror every queued event as a raw "cognitiveservices" event.
Object.keys(event).forEach((name) => this.emitCognitiveServices(name, event[name]));
const errorMessage = canceled && canceled.errorDetails;
// Microphone permission denied: report "not-allowed" without a "start" event.
if (/Permission\sdenied/u.test(errorMessage || "")) {
finalEvent = new SpeechRecognitionErrorEvent("error", { error: "not-allowed" });
break;
}
// "start" is dispatched once, on the first queued event.
if (!loop) {
this.dispatchEvent(new SpeechRecognitionEvent("start"));
}
if (errorMessage) {
// Error details containing "1006" are reported as a "network" error;
// everything else as "unknown".
if (/1006/u.test(errorMessage)) {
if (!audioStarted) {
this.dispatchEvent(new SpeechRecognitionEvent("audiostart"));
this.dispatchEvent(new SpeechRecognitionEvent("audioend"));
}
finalEvent = new SpeechRecognitionErrorEvent("error", { error: "network" });
} else {
finalEvent = new SpeechRecognitionErrorEvent("error", { error: "unknown" });
}
break;
} else if (abort || stop) {
if (abort) {
finalEvent = new SpeechRecognitionErrorEvent("error", { error: "aborted" });
stopping = "abort";
} else {
// stop(): mute the audio source so the stream reports end-of-input, and keep
// looping so that pending results can still arrive.
pause();
stopping = "stop";
}
if (abort && recognizer.stopContinuousRecognitionAsync) {
await cognitiveServicesAsyncToPromise(recognizer.stopContinuousRecognitionAsync, recognizer)();
}
} else if (audioSourceReady) {
this.dispatchEvent(new SpeechRecognitionEvent("audiostart"));
audioStarted = true;
} else if (firstAudibleChunk) {
this.dispatchEvent(new SpeechRecognitionEvent("soundstart"));
soundStarted = true;
} else if (audioSourceOff) {
// Audio source closed: emit the matching end events in reverse order of
// their start events, then leave the loop.
speechStarted && this.dispatchEvent(new SpeechRecognitionEvent("speechend"));
soundStarted && this.dispatchEvent(new SpeechRecognitionEvent("soundend"));
audioStarted && this.dispatchEvent(new SpeechRecognitionEvent("audioend"));
audioStarted = soundStarted = speechStarted = false;
break;
} else if (stopping !== "abort") {
// Recognition results are ignored once an abort has been requested.
if (recognized && recognized.result && recognized.result.reason === ResultReason2.NoMatch) {
// NoMatch ends the session unless continuous mode is on and no stop was
// requested; the final event carries whatever was finalized so far.
if (!this.continuous || stopping === "stop") {
finalEvent = new SpeechRecognitionEvent("result", {
results: new SpeechRecognitionResultList(finalizedResults)
});
recognizer.stopContinuousRecognitionAsync && await cognitiveServicesAsyncToPromise(
recognizer.stopContinuousRecognitionAsync,
recognizer
)();
break;
}
} else if (recognized || recognizing) {
// A result implies audio, sound and speech have started; backfill any start
// event that has not been dispatched yet.
if (!audioStarted) {
this.dispatchEvent(new SpeechRecognitionEvent("audiostart"));
audioStarted = true;
}
if (!soundStarted) {
this.dispatchEvent(new SpeechRecognitionEvent("soundstart"));
soundStarted = true;
}
if (!speechStarted) {
this.dispatchEvent(new SpeechRecognitionEvent("speechstart"));
speechStarted = true;
}
if (recognized) {
const result = cognitiveServiceEventResultToWebSpeechRecognitionResult(recognized.result, {
maxAlternatives: this.maxAlternatives,
textNormalization
});
// A final result only counts when its first alternative has a transcript.
const recognizable = !!result[0]?.transcript;
if (recognizable) {
finalizedResults = [...finalizedResults, result];
// In continuous mode each recognizable final result is reported immediately.
this.continuous && this.dispatchEvent(
new SpeechRecognitionEvent("result", {
results: new SpeechRecognitionResultList(finalizedResults)
})
);
}
if (this.continuous && recognizable) {
finalEvent = void 0;
} else {
finalEvent = new SpeechRecognitionEvent("result", {
results: new SpeechRecognitionResultList(finalizedResults)
});
}
if ((!this.continuous || stopping === "stop") && recognizer.stopContinuousRecognitionAsync) {
await cognitiveServicesAsyncToPromise(recognizer.stopContinuousRecognitionAsync, recognizer)();
}
// With looseEvents, the final result is dispatched right away instead of
// after the closing speechend/soundend/audioend events.
if (looseEvents && finalEvent && recognizable) {
this.dispatchEvent(finalEvent);
finalEvent = void 0;
}
} else if (recognizing) {
// Interim results are only reported when interimResults is enabled; they are
// appended after the already-finalized results.
this.interimResults && this.dispatchEvent(
new SpeechRecognitionEvent("result", {
results: new SpeechRecognitionResultList([
...finalizedResults,
cognitiveServiceEventResultToWebSpeechRecognitionResult(recognizing.result, {
maxAlternatives: this.maxAlternatives,
textNormalization
})
])
})
);
}
}
}
}
// Close any phase that is still open, innermost first.
if (speechStarted) {
this.dispatchEvent(new SpeechRecognitionEvent("speechend"));
}
if (soundStarted) {
this.dispatchEvent(new SpeechRecognitionEvent("soundend"));
}
if (audioStarted) {
this.dispatchEvent(new SpeechRecognitionEvent("audioend"));
}
if (finalEvent) {
// A final "result" event with no results is reported as a "no-speech" error.
if (finalEvent.type === "result" && !finalEvent.results.length) {
finalEvent = new SpeechRecognitionErrorEvent("error", { error: "no-speech" });
}
this.dispatchEvent(finalEvent);
}
this.dispatchEvent(new SpeechRecognitionEvent("end"));
// NOTE(review): the audio-config event listener is only detached on this
// success path; if anything above throws, detachAudioConfigEvent() is never
// called. Consider moving it into the `finally` block.
detachAudioConfigEvent();
} catch (err) {
// Log, then rethrow so start() can turn the failure into an "error" event.
console.error(err);
throw err;
} finally {
// Always restore the patched attach() and release the recognizer.
unprepare();
recognizer["dispose"](false);
}
}
}
return {
SpeechGrammarList,
SpeechRecognition,
SpeechRecognitionEvent
};
}
// src/SpeechServices/SpeechToText/createSpeechRecognitionPonyfill.ts
var { OutputFormat: OutputFormat2, SpeechConfig: SpeechConfig2, SpeechRecognizer: SpeechRecognizer3 } = SpeechSDK_default;

/**
 * Creates the speech recognition ponyfill from user options: validates the
 * options, then wires a recognizer factory (credentials, endpoint, language,
 * output format, silence timeout) into
 * `createSpeechRecognitionPonyfillFromRecognizer`.
 *
 * @param {object} options - See `patchOptions` for the accepted fields.
 * @returns {{ SpeechGrammarList, SpeechRecognition, SpeechRecognitionEvent }}
 * @throws When option validation fails.
 */
function createSpeechRecognitionPonyfill(options) {
  const patchedOptions = patchOptions(options);
  const { audioConfig, fetchCredentials, initialSilenceTimeout, speechRecognitionEndpointId } = patchedOptions;

  // NOTE(review): patchOptions always supplies an audioConfig (it falls back
  // to the default microphone input), so this guard looks unreachable as
  // written. Kept as-is to preserve behaviour; confirm the intent.
  if (!audioConfig) {
    const { mediaDevices } = window.navigator;

    if (!mediaDevices || !mediaDevices.getUserMedia) {
      throw new Error(
        "web-speech-cognitive-services: This browser does not support Media Capture and Streams API and it will not work with Cognitive Services Speech Services."
      );
    }
  }

  // Builds the SpeechConfig for either an explicit hostname or a region,
  // authenticated by authorization token or by subscription key.
  const createSpeechConfig = (credentials) => {
    const { authorizationToken, region, speechRecognitionHostname, subscriptionKey } = credentials;

    if (typeof speechRecognitionHostname === "undefined") {
      return typeof authorizationToken !== "undefined"
        ? SpeechConfig2.fromAuthorizationToken(authorizationToken, region)
        : SpeechConfig2.fromSubscription(subscriptionKey, region);
    }

    // Start from a fixed secure-WebSocket URL on port 443 and swap in the
    // configured hostname.
    const host = new URL("wss://localhost:443");

    host.hostname = speechRecognitionHostname;

    if (!authorizationToken) {
      return SpeechConfig2.fromHost(host, subscriptionKey);
    }

    const tokenConfig = SpeechConfig2.fromHost(host);

    tokenConfig.authorizationToken = authorizationToken;

    return tokenConfig;
  };

  // Credentials are fetched anew for every recognizer that gets created.
  const createRecognizer = async (lang) => {
    const speechConfig = createSpeechConfig(await fetchCredentials());

    if (speechRecognitionEndpointId) {
      speechConfig.endpointId = speechRecognitionEndpointId;
    }

    // Detailed output is what provides the NBest alternatives.
    speechConfig.outputFormat = OutputFormat2.Detailed;
    speechConfig.speechRecognitionLanguage = lang || "en-US";

    if (typeof initialSilenceTimeout === "number") {
      speechConfig.setProperty(
        import_microsoft_cognitiveservices_speech_sdk2.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs,
        String(initialSilenceTimeout)
      );
    }

    return new SpeechRecognizer3(speechConfig, audioConfig);
  };

  return createSpeechRecognitionPonyfillFromRecognizer({
    createRecognizer,
    enableTelemetry: patchedOptions.enableTelemetry,
    looseEvents: patchedOptions.looseEvents,
    referenceGrammars: patchedOptions.referenceGrammars,
    textNormalization: patchedOptions.textNormalization
  });
}
// src/SpeechServices/TextToSpeech/createSpeechSynthesisPonyfill.js
var import_event_target_shim3 = require("event-target-shim");
var import_async = require("on-error-resume-next/async");
// src/SpeechServices/TextToSpeech/AudioContextQueue.js
var import_memoize_one = __toESM(require("memoize-one"));
// src/SpeechServices/TextToSpeech/AudioContextConsumer.js
/**
 * Plays queued utterances one after another on a single AudioContext and
 * relays pause/resume/stop requests to the context and the utterance that is
 * currently playing.
 */
var AudioContextConsumer = class {
  /**
   * @param {AudioContext} audioContext - Context handed to every
   *   `utterance.play()` call.
   */
  constructor(audioContext) {
    this.audioContext = audioContext;
  }

  /** Suspends the context and notifies the playing utterance with "pause". */
  pause() {
    this.audioContext && this.audioContext.suspend();
    this.playingUtterance && this.playingUtterance.dispatchEvent(new CustomEvent("pause"));
  }

  /** Resumes the context and notifies the playing utterance with "resume". */
  resume() {
    this.audioContext && this.audioContext.resume();
    this.playingUtterance && this.playingUtterance.dispatchEvent(new CustomEvent("resume"));
  }

  /**
   * Drains `queue` from the front, awaiting each utterance's `play()`.
   *
   * @param {Array<{ play: Function }>} queue - Mutated in place; items pushed
   *   while playback is running are picked up as well.
   * @returns {Promise<void>} Settles when the queue is empty, or rejects with
   *   whatever `play()` rejected with.
   */
  async start(queue) {
    let utterance;

    while ((utterance = queue.shift())) {
      this.playingUtterance = utterance;

      try {
        await utterance.play(this.audioContext);
      } finally {
        // Never leave a stale reference behind, even if play() rejects;
        // otherwise pause/resume/stop would keep targeting a dead utterance.
        this.playingUtterance = null;
      }
    }
  }

  /**
   * Stops the playing utterance. A suspended context is resumed as part of
   * stopping.
   */
  stop() {
    this.playingUtterance && this.playingUtterance.stop();

    // Guarded like pause()/resume(), which also tolerate a missing context.
    if (this.audioContext && this.audioContext.state === "suspended") {
      this.audioContext.resume();
    }
  }
};
// src/SpeechServices/TextToSpeech/AudioContextQueue.js
/**
 * Queue of utterances played through a lazily created, shared AudioContext.
 * At most one AudioContextConsumer drains the queue at any time.
 */
var AudioContextQueue = class {
  /**
   * @param {object} init
   * @param {AudioContext} [init.audioContext] - Context to use, if provided.
   * @param {{ AudioContext: Function }} init.ponyfill - Supplies the
   *   AudioContext constructor used when `audioContext` is not provided.
   */
  constructor({ audioContext, ponyfill }) {
    this.consumer = null;
    this.paused = false;
    this.queue = [];
    // Memoized so the context is created on first use and then reused.
    this.getAudioContext = (0, import_memoize_one.default)(() => audioContext || new ponyfill.AudioContext());
  }

  /** Marks the queue paused and pauses the active consumer, if any. */
  pause() {
    this.paused = true;

    if (this.consumer) {
      this.consumer.pause();
    }
  }

  /** Enqueues an utterance and makes sure a consumer is running. */
  push(utterance) {
    this.queue.push(utterance);
    this.startConsumer();
  }

  /** Clears the paused flag, then resumes or (re)starts consumption. */
  resume() {
    this.paused = false;

    if (!this.consumer) {
      this.startConsumer();
      return;
    }

    this.consumer.resume();
  }

  /** True while a consumer is draining the queue. */
  get speaking() {
    return !!this.consumer;
  }

  /**
   * Runs consumers back to back while the queue is not paused, has items, and
   * no other consumer is active.
   */
  async startConsumer() {
    const canStart = () => !this.paused && this.queue.length && !this.consumer;

    while (canStart()) {
      const consumer = new AudioContextConsumer(this.getAudioContext());

      this.consumer = consumer;
      await consumer.start(this.queue);
      this.consumer = null;
    }
  }

  /** Drops everything still queued and stops the active consumer, if any. */
  stop() {
    this.queue.splice(0);

    if (this.consumer) {
      this.consumer.stop();
    }
  }
};
// src/SpeechServices/TextToSpeech/SpeechSynthesisEvent.js
var import_event_target_shim = require("event-target-shim");
/**
 * Event class built on event-target-shim's `Event`; only the event type is
 * forwarded to the base constructor.
 */
var SpeechSynthesisEvent = class extends import_event_target_shim.Event {
  /**
   * @param {string} eventType - Name of the event, e.g. "start" or "end".
   */
  constructor(eventType) {
    super(eventType);
  }
};
// src/SpeechServices/TextToSpeech/SpeechSynthesisUtterance.js
var import_event_as_promise = require("event-as-promise");
var import_event_target_shim2 = require("event-target-shim");
// src/SpeechServices/TextToSpeech/fetchSpeechData.js
var import_base64_arraybuffer = require("base64-arraybuffer");
// src/SpeechServices/TextToSpeech/buildSSML.js
/**
 * Converts a multiplier (1 = unchanged) into a signed relative percentage
 * string for SSML prosody attributes, e.g. 1.5 -> "+50%", 0.5 -> "-50%".
 *
 * @param {number} value - Multiplier relative to 1.
 * @returns {string} Signed percentage; "+0%" when `value` is missing or not a
 *   finite number, so the generated markup never contains "NaN%".
 */
function relativePercentage(value) {
  const ratio = value == null ? NaN : Number(value);
  if (!Number.isFinite(ratio)) {
    return "+0%";
  }
  const relative = Math.round((ratio - 1) * 100);
  const sign = relative >= 0 ? "+" : "";
  return `${sign}${relative}%`;
}
/**
 * Wraps plain text in an SSML document with a single voice and prosody element.
 *
 * @param {object} options
 * @param {string} options.lang - Value for both `xml:lang` attributes.
 * @param {number} [options.pitch=1] - Pitch multiplier (1 = unchanged).
 * @param {number} [options.rate=1] - Rate multiplier (1 = unchanged).
 * @param {string} options.text - Text placed inside the prosody element.
 * @param {string} options.voice - Value for the voice `name` attribute.
 * @param {number} [options.volume=1] - Volume multiplier (1 = unchanged).
 * @returns {string} The SSML markup.
 */
function buildSSML({ lang, pitch = 1, rate = 1, text, voice, volume = 1 }) {
  // `volume` defaults to 1 like pitch and rate; without it an omitted volume
  // was rendered as volume="NaN%".
  const pitchAttr = relativePercentage(pitch);
  const rateAttr = relativePercentage(rate);
  const volumeAttr = relativePercentage(volume);
  // NOTE(review): `lang`, `voice` and `text` are interpolated without XML
  // escaping; confirm whether callers rely on embedding markup in `text`
  // before adding escaping here.
  return `<speak version="1.0" xml:lang="${lang}">
<voice xml:lang="${lang}" name="${voice}">
<prosody pitch="${pitchAttr}" rate="${rateAttr}" volume="${volumeAttr}">
${text}
</prosody>
</voice>
</speak>`;
}
// src/SpeechServices/TextToSpeech/isSSML.js
// Matches text whose first non-whitespace content is a <speak> tag.
var SPEAK_TAG_PATTERN = /^\s*<speak(\s|\/?>)/u;
// Matches text whose first non-whitespace content is an XML prolog.
var XML_PROLOG_PATTERN = /^\s*<\?xml\s/u;

/**
 * Tells whether `text` already looks like an SSML/XML document.
 *
 * @param {string} text - Candidate text.
 * @returns {boolean} True when the text starts with a <speak> tag or an XML prolog.
 */
function isSSML(text) {
  if (SPEAK_TAG_PATTERN.test(text)) {
    return true;
  }
  return XML_PROLOG_PATTERN.test(text);
}
// src/SpeechServices/TextToSpeech/fetchSpeechData.js
// Language passed to buildSSML when fetchSpeechData is called without `lang`.
var DEFAULT_LANGUAGE = "en-US";
// Value sent in the "X-Microsoft-OutputFormat" header when fetchSpeechData is called without `outputFormat`.
var DEFAULT_OUTPUT_FORMAT = "riff-16khz-16bit-mono-pcm";
// Voice name passed to buildSSML when fetchSpeechData is called without `voice`.
var DEFAULT_VOICE = "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)";
// Base64 payload decoded and returned by fetchSpeechData when `text` is empty (an MP3, per the constant's name).
var EMPTY_MP3_BASE64 = "SUQzBAAAAAAAI1RTU0UAAAAPAAADTGF2ZjU3LjU2LjEwMQAAAAAAAAAAAAAA//tAwAAAAAAAAAAAAAAAAAAAAAAASW5mbwAAAA8AAAACAAABhgC7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7u7//////////////////////////////////////////////////////////////////8AAAAATGF2YzU3LjY0AAAAAAAAAAAAAAAAJAUHAAAAAAAAAYYoRBqpAAAAAAD/+xDEAAPAAAGkAAAAIAAANIAAAARMQU1FMy45OS41VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVf/7EMQpg8AAAaQAAAAgAAA0gAAABFVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVV";
/**
 * Synthesizes `text` through the speech service REST endpoint and returns the
 * audio bytes.
 *
 * @param {object} options
 * @param {string} [options.deploymentId] - When set, the ".voice." host is used
 *   and the id is sent as the `deploymentId` query parameter.
 * @param {Function} options.fetchCredentials - Async function resolving to
 *   `{ authorizationToken | subscriptionKey, region | speechSynthesisHostname }`.
 * @param {string} [options.lang] - Language for generated SSML (defaults to DEFAULT_LANGUAGE).
 * @param {string} [options.outputFormat] - "X-Microsoft-OutputFormat" header value.
 * @param {number} [options.pitch] - Forwarded to buildSSML.
 * @param {number} [options.rate] - Forwarded to buildSSML.
 * @param {string} options.text - Plain text, or a complete SSML document (sent as-is).
 * @param {string} [options.voice] - Voice name for generated SSML (defaults to DEFAULT_VOICE).
 * @param {number} [options.volume] - Forwarded to buildSSML.
 * @returns {Promise<ArrayBuffer>} Audio data; the decoded EMPTY_MP3_BASE64 when `text` is empty.
 * @throws {Error} When credentials are ambiguous/missing or the server response is not OK.
 */
async function fetchSpeechData({
  deploymentId,
  fetchCredentials,
  lang = DEFAULT_LANGUAGE,
  outputFormat = DEFAULT_OUTPUT_FORMAT,
  pitch,
  rate,
  text,
  voice = DEFAULT_VOICE,
  volume
}) {
  if (!text) {
    return (0, import_base64_arraybuffer.decode)(EMPTY_MP3_BASE64);
  }
  const { authorizationToken, region, speechSynthesisHostname, subscriptionKey } = await fetchCredentials();
  // Exactly one credential and exactly one endpoint locator must be supplied.
  if (authorizationToken && subscriptionKey || !authorizationToken && !subscriptionKey) {
    throw new Error('Only "authorizationToken" or "subscriptionKey" should be set.');
  } else if (region && speechSynthesisHostname || !region && !speechSynthesisHostname) {
    throw new Error('Only "region" or "speechSynthesisHostname" should be set.');
  }
  // Parameter defaults only cover `undefined`; callers such as
  // SpeechSynthesisUtterance.preload can pass `null`, which would otherwise be
  // rendered literally (e.g. name="null") in the generated SSML.
  const effectiveLang = lang == null ? DEFAULT_LANGUAGE : lang;
  const effectiveOutputFormat = outputFormat == null ? DEFAULT_OUTPUT_FORMAT : outputFormat;
  const effectiveVoice = voice == null ? DEFAULT_VOICE : voice;
  const ssml = isSSML(text) ? text : buildSSML({ lang: effectiveLang, pitch, rate, text, voice: effectiveVoice, volume });
  const hostname = speechSynthesisHostname || (deploymentId ? `${encodeURI(region)}.voice.speech.microsoft.com` : `${encodeURI(region)}.tts.speech.microsoft.com`);
  // encodeURIComponent (not encodeURI) so characters such as "&", "=" and "#"
  // in the id cannot alter the query string.
  const search = deploymentId ? `?deploymentId=${encodeURIComponent(deploymentId)}` : "";
  const url = `https://${hostname}/cognitiveservices/v1${search}`;
  const res = await fetch(url, {
    headers: {
      "Content-Type": "application/ssml+xml",
      "X-Microsoft-OutputFormat": effectiveOutputFormat,
      ...authorizationToken ? {
        Authorization: `Bearer ${authorizationToken}`
      } : {
        "Ocp-Apim-Subscription-Key": subscriptionKey
      }
    },
    method: "POST",
    body: ssml
  });
  if (!res.ok) {
    throw new Error(`web-speech-cognitive-services: Failed to synthesize speech, server returned ${res.status}`);
  }
  return res.arrayBuffer();
}
// src/SpeechServices/TextToSpeech/subscribeEvent.js
/**
 * Registers `handler` for the `name` event on `target`.
 *
 * @param {EventTarget} target - Object exposing add/removeEventListener.
 * @param {string} name - Event name.
 * @param {Function} handler - Listener to attach.
 * @returns {Function} Call it to detach the listener again.
 */
function subscribeEvent(target, name, handler) {
  const unsubscribe = () => target.removeEventListener(name, handler);
  target.addEventListener(name, handler);
  return unsubscribe;
}
// src/SpeechServices/TextToSpeech/SpeechSynthesisUtterance.js
/**
 * Promise wrapper around `audioContext.decodeAudioData` that works with both
 * the callback form and the promise-returning form of that method.
 *
 * @param {AudioContext} audioContext - Context used for decoding.
 * @param {ArrayBuffer} arrayBuffer - Encoded audio data.
 * @returns {Promise<AudioBuffer>} Settles with the decode result.
 */
function asyncDecodeAudioData(audioContext, arrayBuffer) {
  return new Promise((onDecoded, onFailed) => {
    const maybeThenable = audioContext.decodeAudioData(arrayBuffer, onDecoded, onFailed);
    // When a thenable comes back, adopt it as well; whichever path settles
    // first wins because a promise settles only once.
    if (maybeThenable && typeof maybeThenable.then === "function") {
      onDecoded(maybeThenable);
    }
  });
}
/**
 * Plays a decoded buffer through `source` and resolves when playback ends or
 * when the audio context reports the "closed" state, whichever comes first.
 *
 * @param {AudioContext} audioContext - Context whose destination receives the audio.
 * @param {AudioBuffer} audioBuffer - Decoded audio to play.
 * @param {AudioBufferSourceNode} source - Source node to configure and start.
 * @returns {Promise<*>} Resolves with the value of whichever event fired first;
 *   rejects if configuring or starting the source throws.
 */
function playDecoded(audioContext, audioBuffer, source) {
  return new Promise((resolve, reject) => {
    const audioContextClosed = new import_event_as_promise.EventAsPromise();
    const sourceEnded = new import_event_as_promise.EventAsPromise();
    const unsubscribe = subscribeEvent(
      audioContext,
      "statechange",
      ({ target: { state } }) => state === "closed" && audioContextClosed.eventListener()
    );
    try {
      source.buffer = audioBuffer;
      source.onended = sourceEnded.eventListener;
      source.connect(audioContext.destination);
      source.start(0);
      // Keep the "statechange" listener attached until playback has actually
      // finished. Detaching it right after start() (as a synchronous `finally`
      // would) makes the "context closed" path unreachable.
      Promise.race([audioContextClosed.upcoming(), sourceEnded.upcoming()]).then(
        (result) => {
          unsubscribe();
          resolve(result);
        },
        (error) => {
          unsubscribe();
          reject(error);
        }
      );
    } catch (err) {
      unsubscribe();
      reject(err);
    }
  });
}
/**
 * Utterance object for the speech synthesis ponyfill. It holds the text and
 * voice settings, fetches the synthesized audio ahead of time via `preload()`,
 * and plays it on an AudioContext via `play()`.
 *
 * The `on*` properties are event attributes backed by event-target-shim's
 * getEventAttributeValue/setEventAttributeValue helpers.
 */
var SpeechSynthesisUtterance = class extends import_event_target_shim2.EventTarget {
  /**
   * @param {string} text - Text (or SSML) to synthesize.
   */
  constructor(text) {
    super();
    this._lang = null;
    this._pitch = 1;
    this._rate = 1;
    this._voice = null;
    this._volume = 1;
    this.text = text;
    this.onboundary = null;
    this.onend = null;
    this.onerror = null;
    this.onmark = null;
    this.onpause = null;
    this.onresume = null;
    this.onstart = null;
  }
  get lang() {
    return this._lang;
  }
  set lang(value) {
    this._lang = value;
  }
  get onboundary() {
    return (0, import_event_target_shim2.getEventAttributeValue)(this, "boundary");
  }
  set onboundary(value) {
    (0, import_event_target_shim2.setEventAttributeValue)(this, "boundary", value);
  }
  get onend() {
    return (0, import_event_target_shim2.getEventAttributeValue)(this, "end");
  }
  set onend(value) {
    (0, import_event_target_shim2.setEventAttributeValue)(this, "end", value);
  }
  get onerror() {
    return (0, import_event_target_shim2.getEventAttributeValue)(this, "error");
  }
  set onerror(value) {
    (0, import_event_target_shim2.setEventAttributeValue)(this, "error", value);
  }
  get onmark() {
    return (0, import_event_target_shim2.getEventAttributeValue)(this, "mark");
  }
  set onmark(value) {
    (0, import_event_target_shim2.setEventAttributeValue)(this, "mark", value);
  }
  get onpause() {
    return (0, import_event_target_shim2.getEventAttributeValue)(this, "pause");
  }
  set onpause(value) {
    (0, import_event_target_shim2.setEventAttributeValue)(this, "pause", value);
  }
  get onresume() {
    return (0, import_event_target_shim2.getEventAttributeValue)(this, "resume");
  }
  set onresume(value) {
    (0, import_event_target_shim2.setEventAttributeValue)(this, "resume", value);
  }
  get onstart() {
    return (0, import_event_target_shim2.getEventAttributeValue)(this, "start");
  }
  set onstart(value) {
    (0, import_event_target_shim2.setEventAttributeValue)(this, "start", value);
  }
  get pitch() {
    return this._pitch;
  }
  set pitch(value) {
    this._pitch = value;
  }
  get rate() {
    return this._rate;
  }
  set rate(value) {
    this._rate = value;
  }
  get voice() {
    return this._voice;
  }
  set voice(value) {
    this._voice = value;
  }
  get volume() {
    return this._volume;
  }
  set volume(value) {
    this._volume = value;
  }

  /**
   * Starts fetching the synthesized audio and stores the pending result in
   * `arrayBufferPromise`; `play()` awaits it later.
   *
   * @param {object} options
   * @param {string} [options.deploymentId] - Forwarded to fetchSpeechData.
   * @param {Function} options.fetchCredentials - Forwarded to fetchSpeechData.
   * @param {string} [options.outputFormat] - Forwarded to fetchSpeechData.
   */
  preload({ deploymentId, fetchCredentials, outputFormat }) {
    this.arrayBufferPromise = fetchSpeechData({
      fetchCredentials,
      deploymentId,
      lang: this.lang || window.navigator.language,
      outputFormat,
      pitch: this.pitch,
      rate: this.rate,
      text: this.text,
      // Pass `undefined` (not `null`) when no voice is selected so that
      // fetchSpeechData's default voice applies.
      voice: this.voice ? this.voice.voiceURI : undefined,
      volume: this.volume
    });
    // Attach a no-op handler so a failed fetch is not reported as an unhandled
    // rejection before play() awaits it; `.catch()` with no argument does not
    // handle anything. play() still observes the rejection on the original promise.
    this.arrayBufferPromise.catch(() => {});
  }

  /**
   * Decodes the preloaded audio and plays it, dispatching "start" first and
   * "end" after playback; any failure is reported as an "error" event instead
   * of rejecting.
   *
   * @param {AudioContext} audioContext - Context used to decode and play.
   * @returns {Promise<void>}
   */
  async play(audioContext) {
    try {
      this.dispatchEvent(new SpeechSynthesisEvent("start"));
      const source = audioContext.createBufferSource();
      const audioBuffer = await asyncDecodeAudioData(audioContext, await this.arrayBufferPromise);
      this._playingSource = source;
      await playDecoded(audioContext, audioBuffer, source);
      this._playingSource = null;
      this.dispatchEvent(new SpeechSynthesisEvent("end"));
    } catch (error) {
      // Do not leave a stale source behind for a later stop() call.
      this._playingSource = null;
      this.dispatchEvent(new ErrorEvent("error", { error: "synthesis-failed", message: error.stack }));
    }
  }

  /** Stops the source node that is currently playing, if any. */
  stop() {
    this._playingSource && this._playingSource.stop();
  }
};
var SpeechSynthesisUtterance_default = SpeechSynthesisUtterance;
// src/SpeechServices/TextToSpeech/SpeechSynthesisVoice.js
/**
 * Read-only description of a synthesis voice.
 *
 * `name` and `voiceURI` both expose the `voiceURI` given to the constructor;
 * `default` and `localService` are always false.
 */
var SpeechSynthesisVoice = class {
  /**
   * @param {object} init
   * @param {string} init.gender - Voice gender.
   * @param {string} init.lang - Voice locale.
   * @param {string} init.voiceURI - Voice identifier, also used as the name.
   */
  constructor({ gender, lang, voiceURI }) {
    Object.assign(this, {
      _default: false,
      _gender: gender,
      _lang: lang,
      _localService: false,
      _name: voiceURI,
      _voiceURI: voiceURI
    });
  }

  /** @returns {boolean} */
  get default() {
    return this._default;
  }

  /** @returns {string} */
  get gender() {
    return this._gender;
  }

  /** @returns {string} */
  get lang() {
    return this._lang;
  }

  /** @returns {boolean} */
  get localService() {
    return this._localService;
  }

  /** @returns {string} */
  get name() {
    return this._name;
  }

  /** @returns {string} */
  get voiceURI() {
    return this._voiceURI;
  }
};
// src/SpeechServices/TextToSpeech/fetchCustomVoices.js
/**
 * Requests the custom voice endpoint description for a deployment.
 *
 * @param {object} options
 * @param {string} [options.customVoiceHostname] - Host to call; when omitted the
 *   host is derived from `region`.
 * @param {string} options.deploymentId - Endpoint id appended to the URL path.
 * @param {string} [options.region] - Region used to build the default host.
 * @param {string} options.subscriptionKey - Sent as "ocp-apim-subscription-key".
 * @returns {Promise<object>} Parsed JSON body of the response.
 * @throws {Error} When the response status is not OK.
 */
async function fetchCustomVoices_({ customVoiceHostname, deploymentId, region, subscriptionKey }) {
  const host = customVoiceHostname || `${region}.customvoice.api.speech.microsoft.com`;
  const endpoint = `https://${encodeURI(host)}/api/texttospeech/v2.0/endpoints/${encodeURIComponent(deploymentId)}`;
  const headers = {
    accept: "application/json",
    "ocp-apim-subscription-key": subscriptionKey
  };
  const response = await fetch(endpoint, { headers });
  if (response.ok) {
    return response.json();
  }
  throw new Error("Failed to fetch custom voices");
}
/**
 * Lists the voices of a custom voice deployment, sorted by name.
 *
 * @param {object} options - Same options as fetchCustomVoices_; forwarded unchanged.
 * @returns {Promise<SpeechSynthesisVoice[]>} One voice per entry in the
 *   response's `models` array, in ascending name order.
 */
async function fetchCustomVoices({ customVoiceHostname, deploymentId, region, subscriptionKey }) {
  const { models } = await fetchCustomVoices_({ customVoiceHostname, deploymentId, region, subscriptionKey });
  const toVoice = (model) => {
    const gender = model.properties.Gender;
    const lang = model.locale;
    const voiceURI = model.name;
    return new SpeechSynthesisVoice({ gender, lang, voiceURI });
  };
  const byName = (left, right) => {
    const x = left.name;
    const y = right.name;
    if (x > y) {
      return 1;
    }
    if (x < y) {
      return -1;
    }
    return 0;
  };
  const voices = models.map(toVoice);
  voices.sort(byName);
  return voices;
}
// src/SpeechServices/TextToSpeech/fetchVoices.js
/**
 * Fetches the voice list from the speech synthesis endpoint, sorted by name.
 *
 * @param {object} credentials
 * @param {string} [credentials.authorizationToken] - Sent as a Bearer token when set.
 * @param {string} [credentials.region] - Used to build the default host.
 * @param {string} [credentials.speechSynthesisHostname] - Overrides the region-based host.
 * @param {string} [credentials.subscriptionKey] - Sent as "Ocp-Apim-Subscription-Key"
 *   when no authorization token is given.
 * @returns {Promise<SpeechSynthesisVoice[]>} Voices in ascending name order.
 * @throws {Error} When the response status is not OK.
 */
async function fetchVoices({ authorizationToken, region, speechSynthesisHostname, subscriptionKey }) {
  const host = speechSynthesisHostname || `${encodeURI(region)}.tts.speech.microsoft.com`;
  const authHeader = authorizationToken
    ? { authorization: `Bearer ${authorizationToken}` }
    : { "Ocp-Apim-Subscription-Key": subscriptionKey };
  const response = await fetch(`https://${host}/cognitiveservices/voices/list`, {
    headers: {
      "content-type": "application/json",
      ...authHeader
    }
  });
  if (!response.ok) {
    throw new Error("Failed to fetch voices");
  }
  const entries = await response.json();
  const voices = entries.map((entry) => {
    const gender = entry.Gender;
    const lang = entry.Locale;
    const voiceURI = entry.Name;
    return new SpeechSynthesisVoice({ gender, lang, voiceURI });
  });
  voices.sort((left, right) => {
    const x = left.name;
    const y = right.name;
    if (x > y) {
      return 1;
    }
    return x < y ? -1 : 0;
  });
  return voices;
}
// src/SpeechServices/TextToSpeech/createSpeechSynthesisPonyfill.js
// Output format used by the synthesis ponyfill factory below when `speechSynthesisOutputFormat` is not supplied.
var DEFAULT_OUTPUT_FORMAT2 = "audio-24khz-160kbitrate-mono-mp3";
// Shared empty list; returned by getVoices() in the SpeechSynthesis class below.
var EMPTY_ARRAY = [];
function createSpeechRecognitionPonyfill2(options) {
const {
audioContext,
fetchCredentials,
ponyfill = {
AudioContext: window.AudioContext || window.webkitAudioContext
},
speechSynthesisDeploymentId,
speechSynthesisOutputFormat = DEFAULT_OUTPUT_FORMAT2
} = patchOptions(options);
if (!audioContext && !ponyfill.AudioContext) {
console.warn(
"web-speech-cognitive-services: This browser does not support Web Audio and it will not work with Cognitive Services Speech Services."
);
return {};
}
class SpeechSynthesis extends import_event_target_shim3.EventTarget {
constructor() {
super();
this.queue = new AudioContextQueue({ audioContext, ponyfill });
this.updateVoices();
}
cancel() {
this.queue.stop();
}
getVoices() {
return EMPTY_ARRAY;
}
get onvoiceschanged() {
return (0, import_event_target_shim3.getEventAttributeValue)(this, "voiceschanged");
}
set onvoiceschanged(value) {
(0, import_event_target_shim3.setEventAttributeValue)(this, "voiceschanged", value);
}
pause() {
this.queue.pause();
}
resume() {
this.queue.resume();
}
speak(utterance) {
if (!(utterance instanceof SpeechSynthesisUtterance_default)) {
throw new Error("invalid utterance");
}
const { reject, resolve, promise } = pDefer();
const handleError = ({ error: errorCode, message }) => {
const error = new Error(errorCode);
error.stack = message;
reject(error);
};
utterance.addEventListener("end", resolve);
utterance.addEventList