UNPKG

edge-tts-client

Version:

Client-side (web browser) implementation of the Edge TTS package — calls the Microsoft Edge Read Aloud API to generate free text-to-speech

217 lines (216 loc) 9.41 kB
"use strict";
// TypeScript-emitted helper: runs a generator function as an async function,
// resuming it with each yielded promise's settled value and returning a
// promise for the generator's final result.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.EdgeTTSClient = exports.ProsodyOptions = void 0;
const buffer_1 = require("buffer");
const constants_1 = require("./constants");

// Ensure Buffer is globally available for browser-like environments
if (typeof globalThis.Buffer === 'undefined') {
    globalThis.Buffer = buffer_1.Buffer;
}

/**
 * Generates a random lowercase hex string from `length` random bytes.
 *
 * Each byte is rendered as two hex digits, so the returned string is
 * `2 * length` characters long (16 bytes -> 32 characters).
 *
 * @param {number} length - Number of random bytes to draw.
 * @returns {string} Zero-padded hex encoding of the random bytes.
 */
function generateRandomHex(length) {
    const randomValues = new Uint8Array(length);
    // globalThis.crypto is the same object as window.crypto in a browser
    // window, and is also reachable where `window` is not defined.
    globalThis.crypto.getRandomValues(randomValues);
    return Array.from(randomValues, (byte) => `0${byte.toString(16)}`.slice(-2)).join("");
}

/**
 * Minimal event emitter used to deliver the results of one synthesis request.
 * The client emits "data", "end" and "close" on it.
 */
class EventEmitter {
    constructor() {
        this.eventListeners = { data: [], close: [], end: [] };
    }

    /**
     * Registers a listener for an event.
     *
     * @param {string} event - Event name.
     * @param {Function} callback - Invoked with the emitted payload.
     */
    on(event, callback) {
        // Create the listener list on demand so an event name outside the
        // predefined three does not throw a TypeError.
        if (!Object.prototype.hasOwnProperty.call(this.eventListeners, event)) {
            this.eventListeners[event] = [];
        }
        this.eventListeners[event].push(callback);
    }

    /**
     * Invokes every listener registered for `event`, in registration order.
     * Emitting an event nobody has registered for is a no-op.
     *
     * @param {string} event - Event name.
     * @param {*} data - Payload handed to each listener.
     */
    emit(event, data) {
        if (!Object.prototype.hasOwnProperty.call(this.eventListeners, event)) {
            return;
        }
        this.eventListeners[event].forEach((callback) => callback(data));
    }
}

/**
 * Prosody settings interpolated into the `<prosody>` element by `buildSSML`.
 */
class ProsodyOptions {
    constructor() {
        this.pitch = "+0Hz";
        this.rate = 1.0;
        this.volume = 100.0;
    }
}
exports.ProsodyOptions = ProsodyOptions;

/**
 * WebSocket client for the speech synthesis endpoint in `SYNTH_URL`.
 *
 * Usage visible in this module: call `setMetadata` to pick a voice and output
 * format (this also opens the socket), then `toStream` to obtain an emitter
 * that receives "data" (audio chunks), "end" (collected metadata) and "close".
 */
class EdgeTTSClient {
    /**
     * @param {boolean} [enableLogging=false] - When true, `log` forwards to `console.log`.
     */
    constructor(enableLogging = false) {
        this.ws = null;
        this.voice = null;
        this.voiceLocale = null;
        this.outputFormat = null;
        // Maps request id -> EventEmitter for that request.
        this.requestQueue = {};
        this.connectionStartTime = 0;
        this.enableLogging = enableLogging;
        this.isBrowser = typeof window !== "undefined" && typeof window.document !== "undefined";
    }

    /** Logs to the console only when logging was enabled in the constructor. */
    log(...args) {
        if (this.enableLogging) {
            console.log(...args);
        }
    }

    /**
     * Sends a raw protocol message, first (re)connecting if the socket is not
     * open. Up to three connection attempts are made.
     *
     * @param {string} message - Fully formatted message (headers + body).
     * @returns {Promise<void>} Resolves after the message was handed to the socket;
     *   rejects if `initWebSocket` rejects.
     */
    sendMessage(message) {
        return __awaiter(this, void 0, void 0, function* () {
            for (let attempt = 1; attempt <= 3 && !(this.ws && this.ws.readyState === WebSocket.OPEN); attempt++) {
                if (attempt === 1) {
                    this.connectionStartTime = Date.now();
                }
                this.log(`Connecting... attempt ${attempt}`);
                yield this.initWebSocket();
            }
            if (this.ws) {
                this.ws.send(message);
            }
        });
    }

    /**
     * Opens a new WebSocket to `SYNTH_URL`, wires its handlers and sends the
     * speech config message once the socket is open.
     *
     * @returns {Promise<void>} Resolves after the config message has been sent.
     *   Rejects with a `Connection Error: ...` string on a socket error, or with
     *   the underlying error if sending the config message fails.
     */
    initWebSocket() {
        const socket = new WebSocket(EdgeTTSClient.SYNTH_URL);
        this.ws = socket;
        socket.binaryType = "arraybuffer";
        // Metadata entries collected for the current turn on this connection.
        const metadataBuffer = [];
        return new Promise((resolve, reject) => {
            socket.onopen = () => {
                this.log("Connected in", (Date.now() - this.connectionStartTime) / 1000, "seconds");
                // Forward a failed config send as well; otherwise the returned
                // promise would stay pending forever.
                this.sendMessage(this.getConfigMessage()).then(resolve, reject);
            };
            socket.onmessage = (event) => this.handleMessage(event, metadataBuffer);
            socket.onclose = () => this.handleClose();
            // The rejection value is kept as a string for backward compatibility.
            socket.onerror = (error) => reject(`Connection Error: ${error}`);
        });
    }

    /**
     * Routes one incoming WebSocket message according to its `Path:` header.
     *
     * @param {MessageEvent} event - Message event; `event.data` is converted to a Buffer.
     * @param {Array} metadataBuffer - Per-connection list of metadata entries,
     *   cleared on `turn.start` and delivered (as a copy) on `turn.end`.
     */
    handleMessage(event, metadataBuffer) {
        const buffer = buffer_1.Buffer.from(event.data);
        const message = buffer.toString();
        const requestIdMatch = /X-RequestId:(.*?)\r\n/.exec(message);
        const requestId = requestIdMatch ? requestIdMatch[1] : "";
        const emitter = this.requestQueue[requestId];
        if (message.includes("Path:turn.start")) {
            metadataBuffer.length = 0;
        }
        else if (message.includes("Path:turn.end")) {
            // Emit a copy: the shared buffer is cleared in place on the next
            // turn.start, which would otherwise empty the array the listener holds.
            if (emitter) {
                emitter.emit("end", metadataBuffer.slice());
            }
        }
        else if (message.includes("Path:audio.metadata")) {
            // Must be tested before "Path:audio": every metadata message also
            // contains that shorter substring and would be misrouted as audio.
            try {
                const startIndex = message.indexOf("{");
                metadataBuffer.push(JSON.parse(message.slice(startIndex)).Metadata[0]);
            }
            catch (error) {
                // A malformed metadata frame must not throw out of the socket handler.
                this.log("Invalid metadata message", error);
            }
        }
        else if (message.includes("Path:audio")) {
            this.cacheAudioData(buffer, requestId);
        }
        else {
            this.log("Unknown Message", message);
        }
    }

    /** Logs the connection duration and emits "close" (payload `null`) to every queued request. */
    handleClose() {
        this.log("Disconnected after:", (Date.now() - this.connectionStartTime) / 1000, "seconds");
        for (const requestId in this.requestQueue) {
            this.requestQueue[requestId].emit("close", null);
        }
    }

    /**
     * Extracts the audio payload that follows `BINARY_DELIM` in an audio frame
     * and emits it as "data" to the emitter registered for `requestId`.
     *
     * @param {Buffer} buffer - Complete frame (headers followed by audio bytes).
     * @param {string} requestId - Id parsed from the frame's `X-RequestId` header.
     */
    cacheAudioData(buffer, requestId) {
        // Convert the BINARY_DELIM string to a Uint8Array using TextEncoder
        const binaryDelimBytes = new TextEncoder().encode(EdgeTTSClient.BINARY_DELIM);
        const delimiterIndex = this.findDelimiterIndex(buffer, binaryDelimBytes);
        if (delimiterIndex === -1) {
            this.log('Delimiter not found in the buffer.');
            return;
        }
        const audioDataStart = delimiterIndex + binaryDelimBytes.length;
        const audioData = buffer.slice(audioDataStart);
        const emitter = this.requestQueue[requestId];
        if (emitter) {
            emitter.emit("data", audioData);
        }
        this.log("Received audio chunk of size:", audioData === null || audioData === void 0 ? void 0 : audioData.length);
    }

    /**
     * Finds the first occurrence of a byte sequence inside another.
     *
     * @param {Uint8Array} buffer - Bytes to search in.
     * @param {Uint8Array} delimiter - Bytes to search for.
     * @returns {number} Index of the first match, or -1 if there is none.
     */
    findDelimiterIndex(buffer, delimiter) {
        for (let i = 0; i <= buffer.length - delimiter.length; i++) {
            let match = true;
            for (let j = 0; j < delimiter.length; j++) {
                if (buffer[i + j] !== delimiter[j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Builds the `speech.config` message sent right after connecting. It
     * enables sentence and word boundary metadata and sets the output format.
     *
     * @returns {string} Headers plus JSON body.
     */
    getConfigMessage() {
        return `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n{ "context": { "synthesis": { "audio": { "metadataoptions": { "sentenceBoundaryEnabled": "true", "wordBoundaryEnabled": "true" }, "outputFormat": "${this.outputFormat}" } } } }`;
    }

    /**
     * Fetches the voice list from `VOICES_URL`.
     *
     * @returns {Promise<*>} Parsed JSON response; rejects if the fetch or the
     *   JSON parsing fails.
     */
    getVoices() {
        return fetch(EdgeTTSClient.VOICES_URL).then((response) => response.json());
    }

    /**
     * Stores the voice, output format and locale, then opens the socket if it
     * is not already open.
     *
     * @param {string} voiceName - Voice to synthesize with.
     * @param {string} outputFormat - Value placed in the config's `outputFormat`.
     * @param {string} [voiceLocale] - Locale for `xml:lang`; inferred from
     *   `voiceName` when omitted.
     * @returns {Promise<void>}
     * @throws {Error} (as a rejection) when no locale is given and none can be inferred.
     */
    setMetadata(voiceName, outputFormat, voiceLocale) {
        return __awaiter(this, void 0, void 0, function* () {
            this.voice = voiceName;
            this.outputFormat = outputFormat;
            this.voiceLocale = voiceLocale || this.inferLocaleFromVoiceName(voiceName);
            if (!this.voiceLocale) {
                throw new Error("Could not infer voiceLocale from voiceName!");
            }
            if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
                this.connectionStartTime = Date.now();
                yield this.initWebSocket();
            }
        });
    }

    /**
     * Returns the first `VOICE_LANG_REGEX` match in the voice name.
     *
     * @param {string} voiceName - Voice name to inspect.
     * @returns {string|null} Matched text, or null when nothing matches.
     */
    inferLocaleFromVoiceName(voiceName) {
        const match = EdgeTTSClient.VOICE_LANG_REGEX.exec(voiceName);
        return match ? match[0] : null;
    }

    /** Closes the WebSocket if one exists. */
    close() {
        if (this.ws) {
            this.ws.close();
        }
    }

    /**
     * Requests synthesis of `text`.
     *
     * @param {string} text - Content placed inside the `<prosody>` element.
     * @param {ProsodyOptions} [options] - Pitch, rate and volume.
     * @returns {EventEmitter} Emitter for this request's "data", "end" and "close" events.
     * @throws {Error} If no WebSocket has been created yet.
     */
    toStream(text, options = new ProsodyOptions()) {
        return this.sendSSMLRequest(this.buildSSML(text, options));
    }

    /**
     * Wraps `text` in a `<speak>/<voice>/<prosody>` SSML document.
     * The text is interpolated verbatim; it is not XML-escaped here.
     *
     * @param {string} text - Content to speak.
     * @param {ProsodyOptions} options - Pitch, rate and volume attribute values.
     * @returns {string} SSML document.
     */
    buildSSML(text, options) {
        return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this.voiceLocale}"> <voice name="${this.voice}"> <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}"> ${text} </prosody> </voice> </speak>`;
    }

    /**
     * Registers an emitter under a fresh request id and sends the SSML.
     *
     * @param {string} ssml - SSML document; trimmed before sending.
     * @returns {EventEmitter} Emitter registered for the new request.
     * @throws {Error} If no WebSocket has been created yet.
     */
    sendSSMLRequest(ssml) {
        if (!this.ws) {
            throw new Error("WebSocket not initialized. Call setMetadata first.");
        }
        // 16 random bytes -> 32 hex characters.
        const requestId = generateRandomHex(16);
        const requestMessage = `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n${ssml.trim()}`;
        const eventEmitter = new EventEmitter();
        this.requestQueue[requestId] = eventEmitter;
        // NOTE(review): the send is fire-and-forget; a rejection here is not
        // reported through the returned emitter - confirm that is intended.
        this.sendMessage(requestMessage).then();
        return eventEmitter;
    }
}
exports.EdgeTTSClient = EdgeTTSClient;
EdgeTTSClient.OUTPUT_FORMAT = constants_1.OUTPUT_FORMAT;
// Token interpolated into both service URLs below.
EdgeTTSClient.CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
EdgeTTSClient.VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${EdgeTTSClient.CLIENT_TOKEN}`;
EdgeTTSClient.SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${EdgeTTSClient.CLIENT_TOKEN}`;
// Header line after which cacheAudioData treats the rest of a frame as audio bytes.
EdgeTTSClient.BINARY_DELIM = "Path:audio\r\n";
// NOTE(review): unanchored, so a three-letter language prefix such as "fil-PH"
// yields "il-PH" - confirm whether such voice names need to be supported.
EdgeTTSClient.VOICE_LANG_REGEX = /\w{2}-\w{2}/;