UNPKG

edge-tts-universal

Version:

Universal text-to-speech library using Microsoft Edge's online TTS service. Works in Node.js and browsers WITHOUT needing Microsoft Edge, Windows, or an API key

github.com/travisvn/edge-tts-universal

travisvn/edge-tts-universal

997 lines (985 loc) • 34.4 kB

JavaScript

// src/browser.ts

/**
 * Single-shot text-to-speech client built on the browser WebSocket API.
 *
 * Opens one WebSocket, sends a speech configuration message followed by one
 * SSML request, and collects the returned audio frames and word-boundary
 * metadata until the service signals `turn.end`.
 */
class EdgeTTSBrowser {
  /**
   * @param text The text to be synthesized.
   * @param voice The voice to use for synthesis.
   * @param options Prosody options (rate, volume, pitch).
   */
  constructor(text, voice = "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)", options = {}) {
    this.ws = null;
    this.WSS_URL = "wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1";
    this.TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
    this.text = text;
    this.voice = voice;
    // `||` (not `??`) on purpose: an empty string also falls back to the default.
    this.rate = options.rate || "+0%";
    this.volume = options.volume || "+0%";
    this.pitch = options.pitch || "+0Hz";
  }

  /**
   * Initiates the synthesis process.
   *
   * @returns A promise that resolves with `{ audio, subtitle }`, where `audio`
   *   is an `audio/mpeg` Blob and `subtitle` is an array of
   *   `{ offset, duration, text }` word boundaries. The promise rejects with
   *   the WebSocket error event if the socket reports an error.
   * @throws {Error} If the WebSocket is not open after connecting.
   */
  async synthesize() {
    await this.connect();
    const ws = this.ws;
    if (!ws || ws.readyState !== WebSocket.OPEN) {
      throw new Error("WebSocket is not connected.");
    }
    ws.send(this.createSpeechConfig());
    ws.send(this.createSSML());

    return new Promise((resolve, reject) => {
      const audioChunks = [];
      let wordBoundaries = [];
      // Every binary frame is processed through this chain so that frames are
      // appended in arrival order and `onclose` can wait for frames that are
      // still being decoded (Blob -> ArrayBuffer is asynchronous).
      let pendingBinary = Promise.resolve();

      // Binary frame layout: 2-byte big-endian header length, headers, audio.
      const collectAudio = (arrayBuffer) => {
        if (arrayBuffer.byteLength < 2) {
          return;
        }
        const headerLength = new DataView(arrayBuffer).getUint16(0);
        if (arrayBuffer.byteLength > headerLength + 2) {
          audioChunks.push(new Uint8Array(arrayBuffer, headerLength + 2));
        }
      };
      const enqueueBinary = (loadBuffer) => {
        pendingBinary = pendingBinary.then(loadBuffer).then(collectAudio);
      };

      ws.onmessage = (event) => {
        const payload = event.data;
        if (typeof payload === "string") {
          const { headers, body } = this.parseMessage(payload);
          if (headers.Path === "audio.metadata") {
            try {
              const metadata = JSON.parse(body);
              if (metadata.Metadata && Array.isArray(metadata.Metadata)) {
                const boundaries = metadata.Metadata
                  .filter((item) => item.Type === "WordBoundary" && item.Data)
                  .map((item) => ({
                    offset: item.Data.Offset,
                    duration: item.Data.Duration,
                    text: item.Data.text.Text
                  }));
                wordBoundaries = wordBoundaries.concat(boundaries);
              }
            } catch (e) {
              // Best effort: a malformed metadata message only costs subtitle
              // entries, it must not abort the audio download.
            }
          } else if (headers.Path === "turn.end") {
            ws.close();
          }
        } else if (payload instanceof ArrayBuffer) {
          enqueueBinary(() => payload);
        } else if (payload instanceof Blob) {
          enqueueBinary(() => payload.arrayBuffer());
        }
      };

      ws.onclose = () => {
        // Resolve only after all queued binary frames have been collected.
        pendingBinary.then(() => {
          resolve({
            audio: new Blob(audioChunks, { type: "audio/mpeg" }),
            subtitle: wordBoundaries
          });
        }, reject);
      };

      ws.onerror = (error) => {
        reject(error);
      };
    });
  }

  /**
   * Establishes a connection to the WebSocket server.
   *
   * @returns A promise that resolves once the socket is open and rejects with
   *   the error event if the socket reports an error first.
   */
  async connect() {
    const connectionId = this.generateConnectionId();
    // generateSecMsGec() is async; without `await` the URL would contain
    // the text "[object Promise]" instead of the token.
    const secMsGec = await this.generateSecMsGec();
    const url = `${this.WSS_URL}?TrustedClientToken=${this.TRUSTED_CLIENT_TOKEN}&ConnectionId=${connectionId}&Sec-MS-GEC=${secMsGec}&Sec-MS-GEC-Version=1-130.0.2849.68`;
    const ws = new WebSocket(url);
    // Deliver binary frames as ArrayBuffer so they can be handled synchronously.
    ws.binaryType = "arraybuffer";
    this.ws = ws;
    return new Promise((resolve, reject) => {
      ws.onopen = () => {
        resolve();
      };
      ws.onerror = (error) => {
        reject(error);
      };
    });
  }

  /**
   * Parses a string message from the WebSocket into headers and a body.
   *
   * Headers and body are separated by the first blank line (`\r\n\r\n`); each
   * header line is split at its first colon, so values may contain colons.
   *
   * @param message Raw text frame.
   * @returns `{ headers, body }`; `body` is `""` when there is no blank line.
   */
  parseMessage(message) {
    const separator = "\r\n\r\n";
    const boundary = message.indexOf(separator);
    const headerBlock = boundary === -1 ? message : message.slice(0, boundary);
    const body = boundary === -1 ? "" : message.slice(boundary + separator.length);
    const headers = {};
    for (const line of headerBlock.split("\r\n")) {
      const colon = line.indexOf(":");
      if (colon === -1) {
        continue;
      }
      const key = line.slice(0, colon);
      const value = line.slice(colon + 1);
      if (key && value) {
        headers[key.trim()] = value.trim();
      }
    }
    return { headers, body };
  }

  /**
   * Creates the speech configuration message.
   *
   * @returns The `speech.config` frame: CRLF-separated headers, a blank line,
   *   then the JSON configuration (word boundaries on, 24 kHz mono MP3 output).
   */
  createSpeechConfig() {
    const config = {
      context: {
        synthesis: {
          audio: {
            metadataoptions: {
              sentenceBoundaryEnabled: false,
              wordBoundaryEnabled: true
            },
            outputFormat: "audio-24khz-48kbitrate-mono-mp3"
          }
        }
      }
    };
    return `X-Timestamp:${this.getTimestamp()}\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n${JSON.stringify(config)}`;
  }

  /**
   * Creates the SSML (Speech Synthesis Markup Language) message.
   *
   * @returns The `ssml` frame: CRLF-separated headers, a blank line, then the
   *   SSML document with the XML-escaped input text.
   */
  createSSML() {
    const ssml = [
      "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>",
      `<voice name='${this.voice}'>`,
      `<prosody pitch='${this.pitch}' rate='${this.rate}' volume='${this.volume}'>`,
      this.escapeXml(this.text),
      "</prosody>",
      "</voice>",
      "</speak>"
    ].join(" ");
    return `X-RequestId:${this.generateConnectionId()}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${this.getTimestamp()}Z\r\nPath:ssml\r\n\r\n${ssml}`;
  }

  /**
   * Generates a random identifier in UUID version-4 text layout.
   *
   * @returns A string such as `xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx` where `x`
   *   is a random hex digit and `y` is one of `8`, `9`, `a`, `b`.
   */
  generateConnectionId() {
    return "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, (c) => {
      const r = Math.random() * 16 | 0;
      const v = c === "x" ? r : r & 3 | 8;
      return v.toString(16);
    });
  }

  /**
   * @returns The current time as an ISO-8601 string with the `-`, `:` and
   *   millisecond parts removed (for example `20240101T102030Z`).
   */
  getTimestamp() {
    return new Date().toISOString().replace(/[:-]|\.\d{3}/g, "");
  }

  /**
   * Escapes the five XML special characters so text can be embedded in SSML.
   *
   * @param text Raw text.
   * @returns Text with `<`, `>`, `&`, `'` and `"` replaced by entities.
   */
  escapeXml(text) {
    const entities = {
      "<": "&lt;",
      ">": "&gt;",
      "&": "&amp;",
      "'": "&apos;",
      '"': "&quot;"
    };
    return text.replace(/[<>&'"]/g, (char) => entities[char]);
  }

  /**
   * Browser-compatible version of DRM security token generation.
   * Uses Web Crypto API instead of Node.js crypto.
   *
   * @returns A promise for the upper-case hex SHA-256 digest of the current
   *   time (rounded down to 5 minutes, in 100 ns ticks since 1601-01-01)
   *   concatenated with the trusted client token.
   */
  async generateSecMsGec() {
    const WIN_EPOCH2 = 11644473600; // seconds between 1601-01-01 and 1970-01-01
    const S_TO_NS2 = 1e9;
    let ticks = Date.now() / 1e3;
    ticks += WIN_EPOCH2;
    ticks -= ticks % 300; // round down to a 5-minute boundary
    ticks *= S_TO_NS2 / 100; // seconds -> 100-nanosecond intervals
    const strToHash = `${ticks.toFixed(0)}${this.TRUSTED_CLIENT_TOKEN}`;
    const data = new TextEncoder().encode(strToHash);
    const hashBuffer = await crypto.subtle.digest("SHA-256", data);
    return Array.from(new Uint8Array(hashBuffer), (b) => b.toString(16).padStart(2, "0"))
      .join("")
      .toUpperCase();
  }
}

// src/browser-utils.ts

/**
 * Generates a random connection id: 16 random bytes with the UUID v4
 * version/variant bits set, rendered as 32 lower-case hex digits (no dashes).
 *
 * @returns 32-character hex string.
 */
function browserConnectId() {
  const bytes = new Uint8Array(16);
  crypto.getRandomValues(bytes);
  bytes[6] = bytes[6] & 15 | 64; // version 4
  bytes[8] = bytes[8] & 63 | 128; // RFC 4122 variant
  return Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
}

/**
 * Escapes XML special characters. `&` is replaced first so the ampersands
 * introduced by the other replacements are not escaped twice.
 *
 * @param text Raw text.
 * @returns Escaped text.
 */
function browserEscape(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}

/**
 * Reverses {@link browserEscape}. `&amp;` is replaced last so that an escaped
 * entity such as `&amp;lt;` decodes to `&lt;` rather than `<`.
 *
 * @param text Escaped text.
 * @returns Unescaped text.
 */
function browserUnescape(text) {
  return text
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&amp;/g, "&");
}

/**
 * Replaces control characters U+0000-U+0008, U+000B, U+000C and
 * U+000E-U+001F with a space (tab, LF and CR are kept).
 *
 * @param text Input text.
 * @returns Text of the same length with those characters blanked.
 */
function browserRemoveIncompatibleCharacters(text) {
  return text.replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F]/g, " ");
}

/**
 * @returns The current UTC time formatted like
 *   `Mon, 01 Jan 2024 10:20:30 GMT+0000 (Coordinated Universal Time)`.
 */
function browserDateToString() {
  return new Date().toUTCString().replace("GMT", "GMT+0000 (Coordinated Universal Time)");
}

/**
 * Builds the SSML document for one request.
 *
 * @param voice Full voice name.
 * @param rate Rate adjustment, e.g. `+0%`.
 * @param volume Volume adjustment, e.g. `+0%`.
 * @param pitch Pitch adjustment, e.g. `+0Hz`.
 * @param escapedText Text that is already XML-escaped; it is inserted verbatim.
 * @returns SSML string.
 */
function browserMkssml(voice, rate, volume, pitch, escapedText) {
  return `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'><voice name='${voice}'><prosody pitch='${pitch}' rate='${rate}' volume='${volume}'>${escapedText}</prosody></voice></speak>`;
}

/**
 * Prefixes an SSML document with the headers of an `ssml` request frame.
 *
 * @param requestId Value for `X-RequestId`.
 * @param timestamp Value for `X-Timestamp`; a trailing `Z` is appended.
 * @param ssml SSML document.
 * @returns CRLF-separated headers, a blank line, then the SSML.
 */
function browserSsmlHeadersPlusData(requestId, timestamp, ssml) {
  return `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${timestamp}Z\r\nPath:ssml\r\n\r\n${ssml}`;
}

// src/exceptions.ts

/** Base class for every error raised by this library. */
class EdgeTTSException extends Error {
  constructor(message) {
    super(message);
    this.name = "EdgeTTSException";
  }
}

/** Raised when the clock skew cannot be derived from a server response. */
class SkewAdjustmentError extends EdgeTTSException {
  constructor(message) {
    super(message);
    this.name = "SkewAdjustmentError";
  }
}

/** Raised when the service sends a message of an unknown kind. */
class UnknownResponse extends EdgeTTSException {
  constructor(message) {
    super(message);
    this.name = "UnknownResponse";
  }
}

/** Raised when the service sends a known message with unexpected content. */
class UnexpectedResponse extends EdgeTTSException {
  constructor(message) {
    super(message);
    this.name = "UnexpectedResponse";
  }
}

/** Raised when a synthesis turn ends without any audio data. */
class NoAudioReceived extends EdgeTTSException {
  constructor(message) {
    super(message);
    this.name = "NoAudioReceived";
  }
}

/** Raised for WebSocket connection failures. */
class WebSocketError extends EdgeTTSException {
  constructor(message) {
    super(message);
    this.name = "WebSocketError";
  }
}

/** Raised when an argument has an invalid value. */
class ValueError extends EdgeTTSException {
  constructor(message) {
    super(message);
    this.name = "ValueError";
  }
}

// src/tts_config.ts
var TTSConfig = class _TTSConfig { /** * Creates a new TTSConfig instance with the specified parameters. * * @param options - Configuration options * @param options.voice - Voice name (supports both short and full formats) * @param options.rate - Speech rate adjustment (default: "+0%") * @param options.volume - Volume adjustment (default: "+0%") * @param options.pitch - Pitch adjustment (default: "+0Hz") * @throws {ValueError} If any parameter has an invalid format */ constructor({ voice, rate = "+0%", volume = "+0%", pitch = "+0Hz" }) { this.voice = voice; this.rate = rate; this.volume = volume; this.pitch = pitch; this.validate(); } validate() { const match = /^([a-z]{2,})-([A-Z]{2,})-(.+Neural)$/.exec(this.voice); if (match) { const [, lang] = match; let [, , region, name] = match; if (name.includes("-")) { const parts = name.split("-"); region += `-${parts[0]}`; name = parts[1]; } this.voice = `Microsoft Server Speech Text to Speech Voice (${lang}-${region}, ${name})`; } _TTSConfig.validateStringParam( "voice", this.voice, /^Microsoft Server Speech Text to Speech Voice \(.+,.+\)$/ ); _TTSConfig.validateStringParam("rate", this.rate, /^[+-]\d+%$/); _TTSConfig.validateStringParam("volume", this.volume, /^[+-]\d+%$/); _TTSConfig.validateStringParam("pitch", this.pitch, /^[+-]\d+Hz$/); } static validateStringParam(paramName, paramValue, pattern) { if (typeof paramValue !== "string") { throw new TypeError(`${paramName} must be a string`); } if (!pattern.test(paramValue)) { throw new ValueError(`Invalid ${paramName} '${paramValue}'.`); } } }; // src/constants.ts var BASE_URL = "speech.platform.bing.com/consumer/speech/synthesize/readaloud"; var TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"; var WSS_URL = `wss://${BASE_URL}/edge/v1?TrustedClientToken=${TRUSTED_CLIENT_TOKEN}`; var VOICE_LIST_URL = `https://${BASE_URL}/voices/list?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`; var DEFAULT_VOICE = "en-US-EmmaMultilingualNeural"; var CHROMIUM_FULL_VERSION = 
"130.0.2849.68"; var CHROMIUM_MAJOR_VERSION = CHROMIUM_FULL_VERSION.split(".")[0]; var SEC_MS_GEC_VERSION = `1-${CHROMIUM_FULL_VERSION}`; var BASE_HEADERS = { "User-Agent": `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${CHROMIUM_MAJOR_VERSION}.0.0.0 Safari/537.36 Edg/${CHROMIUM_MAJOR_VERSION}.0.0.0`, "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9" }; var VOICE_HEADERS = { ...BASE_HEADERS, "Authority": "speech.platform.bing.com", "Sec-CH-UA": `" Not;A Brand";v="99", "Microsoft Edge";v="${CHROMIUM_MAJOR_VERSION}", "Chromium";v="${CHROMIUM_MAJOR_VERSION}"`, "Sec-CH-UA-Mobile": "?0", "Accept": "*/*", "Sec-Fetch-Site": "none", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty" }; // src/browser-drm.ts var WIN_EPOCH = 11644473600; var S_TO_NS = 1e9; var _BrowserDRM = class _BrowserDRM { static adjClockSkewSeconds(skewSeconds) { _BrowserDRM.clockSkewSeconds += skewSeconds; } static getUnixTimestamp() { return Date.now() / 1e3 + _BrowserDRM.clockSkewSeconds; } static parseRfc2616Date(date) { try { return new Date(date).getTime() / 1e3; } catch (e) { return null; } } static handleClientResponseError(response) { if (!response.headers) { throw new SkewAdjustmentError("No headers in response."); } const serverDate = response.headers["date"] || response.headers["Date"]; if (!serverDate) { throw new SkewAdjustmentError("No server date in headers."); } const serverDateParsed = _BrowserDRM.parseRfc2616Date(serverDate); if (serverDateParsed === null) { throw new SkewAdjustmentError(`Failed to parse server date: ${serverDate}`); } const clientDate = _BrowserDRM.getUnixTimestamp(); _BrowserDRM.adjClockSkewSeconds(serverDateParsed - clientDate); } static async generateSecMsGec() { let ticks = _BrowserDRM.getUnixTimestamp(); ticks += WIN_EPOCH; ticks -= ticks % 300; ticks *= S_TO_NS / 100; const strToHash = `${ticks.toFixed(0)}${TRUSTED_CLIENT_TOKEN}`; const encoder = new TextEncoder(); const data = 
encoder.encode(strToHash); const hashBuffer = await crypto.subtle.digest("SHA-256", data); const hashArray = Array.from(new Uint8Array(hashBuffer)); return hashArray.map((b) => b.toString(16).padStart(2, "0")).join("").toUpperCase(); } }; _BrowserDRM.clockSkewSeconds = 0; var BrowserDRM = _BrowserDRM; // src/browser-communicate.ts var BrowserBuffer = class { static from(input, encoding) { if (typeof input === "string") { return new TextEncoder().encode(input); } else if (input instanceof ArrayBuffer) { return new Uint8Array(input); } else if (input instanceof Uint8Array) { return input; } throw new Error("Unsupported input type for BrowserBuffer.from"); } static concat(arrays) { const totalLength = arrays.reduce((sum, arr) => sum + arr.length, 0); const result = new Uint8Array(totalLength); let offset = 0; for (const arr of arrays) { result.set(arr, offset); offset += arr.length; } return result; } }; function browserGetHeadersAndDataFromText(message) { const messageString = new TextDecoder().decode(message); const headerEndIndex = messageString.indexOf("\r\n\r\n"); const headers = {}; if (headerEndIndex !== -1) { const headerString = messageString.substring(0, headerEndIndex); const headerLines = headerString.split("\r\n"); for (const line of headerLines) { const [key, value] = line.split(":", 2); if (key && value) { headers[key] = value.trim(); } } } const headerByteLength = new TextEncoder().encode(messageString.substring(0, headerEndIndex + 4)).length; return [headers, message.slice(headerByteLength)]; } function browserGetHeadersAndDataFromBinary(message) { if (message.length < 2) { throw new Error("Message too short to contain header length"); } const headerLength = message[0] << 8 | message[1]; const headers = {}; if (headerLength > 0 && headerLength + 2 <= message.length) { const headerBytes = message.slice(2, headerLength + 2); const headerString = new TextDecoder().decode(headerBytes); const headerLines = headerString.split("\r\n"); for (const line of 
headerLines) { const [key, value] = line.split(":", 2); if (key && value) { headers[key] = value.trim(); } } } return [headers, message.slice(headerLength + 2)]; } function browserSplitTextByByteLength(text, byteLength) { return (function* () { let buffer = new TextEncoder().encode(text); while (buffer.length > byteLength) { let splitAt = byteLength; const slice = buffer.slice(0, byteLength); const sliceText = new TextDecoder().decode(slice); const lastNewline = sliceText.lastIndexOf("\n"); const lastSpace = sliceText.lastIndexOf(" "); if (lastNewline > 0) { splitAt = new TextEncoder().encode(sliceText.substring(0, lastNewline)).length; } else if (lastSpace > 0) { splitAt = new TextEncoder().encode(sliceText.substring(0, lastSpace)).length; } const chunk = buffer.slice(0, splitAt); const chunkText = new TextDecoder().decode(chunk).trim(); if (chunkText) { yield new TextEncoder().encode(chunkText); } buffer = buffer.slice(splitAt); } const remainingText = new TextDecoder().decode(buffer).trim(); if (remainingText) { yield new TextEncoder().encode(remainingText); } })(); } var BrowserCommunicate = class { /** * Creates a new browser Communicate instance for text-to-speech synthesis. 
* * @param text - The text to synthesize * @param options - Configuration options for synthesis */ constructor(text, options = {}) { this.state = { partialText: BrowserBuffer.from(""), offsetCompensation: 0, lastDurationOffset: 0, streamWasCalled: false }; this.ttsConfig = new TTSConfig({ voice: options.voice || DEFAULT_VOICE, rate: options.rate, volume: options.volume, pitch: options.pitch }); if (typeof text !== "string") { throw new TypeError("text must be a string"); } this.texts = browserSplitTextByByteLength( browserEscape(browserRemoveIncompatibleCharacters(text)), // browserCalcMaxMesgSize(this.ttsConfig.voice, this.ttsConfig.rate, this.ttsConfig.volume, this.ttsConfig.pitch), 4096 ); this.connectionTimeout = options.connectionTimeout; } parseMetadata(data) { const metadata = JSON.parse(new TextDecoder().decode(data)); for (const metaObj of metadata["Metadata"]) { const metaType = metaObj["Type"]; if (metaType === "WordBoundary") { const currentOffset = metaObj["Data"]["Offset"] + this.state.offsetCompensation; const currentDuration = metaObj["Data"]["Duration"]; return { type: metaType, offset: currentOffset, duration: currentDuration, text: browserUnescape(metaObj["Data"]["text"]["Text"]) }; } if (metaType === "SessionEnd") { continue; } throw new UnknownResponse(`Unknown metadata type: ${metaType}`); } throw new UnexpectedResponse("No WordBoundary metadata found"); } async *_stream() { const url = `${WSS_URL}&Sec-MS-GEC=${await BrowserDRM.generateSecMsGec()}&Sec-MS-GEC-Version=${SEC_MS_GEC_VERSION}&ConnectionId=${browserConnectId()}`; const websocket = new WebSocket(url); const messageQueue = []; let resolveMessage = null; let timeoutId; if (this.connectionTimeout) { timeoutId = window.setTimeout(() => { websocket.close(); messageQueue.push(new WebSocketError("Connection timeout")); if (resolveMessage) resolveMessage(); }, this.connectionTimeout); } websocket.onmessage = (event) => { if (timeoutId) { window.clearTimeout(timeoutId); timeoutId = void 0; } 
const data = event.data; if (typeof data === "string") { const [headers, parsedData] = browserGetHeadersAndDataFromText(BrowserBuffer.from(data)); const path = headers["Path"]; if (path === "audio.metadata") { try { const parsedMetadata = this.parseMetadata(parsedData); this.state.lastDurationOffset = parsedMetadata.offset + parsedMetadata.duration; messageQueue.push(parsedMetadata); } catch (e) { messageQueue.push(e); } } else if (path === "turn.end") { this.state.offsetCompensation = this.state.lastDurationOffset; websocket.close(); } else if (path !== "response" && path !== "turn.start") { messageQueue.push(new UnknownResponse(`Unknown path received: ${path}`)); } } else if (data instanceof ArrayBuffer) { const bufferData = BrowserBuffer.from(data); if (bufferData.length < 2) { messageQueue.push(new UnexpectedResponse("We received a binary message, but it is missing the header length.")); } else { const [headers, audioData] = browserGetHeadersAndDataFromBinary(bufferData); if (headers["Path"] !== "audio") { messageQueue.push(new UnexpectedResponse("Received binary message, but the path is not audio.")); } else { const contentType = headers["Content-Type"]; if (contentType !== "audio/mpeg") { if (audioData.length > 0) { messageQueue.push(new UnexpectedResponse("Received binary message, but with an unexpected Content-Type.")); } } else if (audioData.length === 0) { messageQueue.push(new UnexpectedResponse("Received binary message, but it is missing the audio data.")); } else { messageQueue.push({ type: "audio", data: audioData }); } } } } else if (data instanceof Blob) { data.arrayBuffer().then((arrayBuffer) => { const bufferData = BrowserBuffer.from(arrayBuffer); if (bufferData.length < 2) { messageQueue.push(new UnexpectedResponse("We received a binary message, but it is missing the header length.")); } else { const [headers, audioData] = browserGetHeadersAndDataFromBinary(bufferData); if (headers["Path"] !== "audio") { messageQueue.push(new 
UnexpectedResponse("Received binary message, but the path is not audio.")); } else { const contentType = headers["Content-Type"]; if (contentType !== "audio/mpeg") { if (audioData.length > 0) { messageQueue.push(new UnexpectedResponse("Received binary message, but with an unexpected Content-Type.")); } } else if (audioData.length === 0) { messageQueue.push(new UnexpectedResponse("Received binary message, but it is missing the audio data.")); } else { messageQueue.push({ type: "audio", data: audioData }); } } } if (resolveMessage) resolveMessage(); }); } if (resolveMessage) resolveMessage(); }; websocket.onerror = (error) => { if (timeoutId) { window.clearTimeout(timeoutId); timeoutId = void 0; } messageQueue.push(new WebSocketError("WebSocket error occurred")); if (resolveMessage) resolveMessage(); }; websocket.onclose = () => { if (timeoutId) { window.clearTimeout(timeoutId); timeoutId = void 0; } messageQueue.push("close"); if (resolveMessage) resolveMessage(); }; await new Promise((resolve, reject) => { websocket.onopen = () => { if (timeoutId) { window.clearTimeout(timeoutId); timeoutId = void 0; } resolve(); }; if (this.connectionTimeout) { setTimeout(() => { if (websocket.readyState === WebSocket.CONNECTING) { websocket.close(); reject(new WebSocketError("Connection timeout")); } }, this.connectionTimeout); } }); websocket.send( `X-Timestamp:${browserDateToString()}\r Content-Type:application/json; charset=utf-8\r Path:speech.config\r \r {"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},"outputFormat":"audio-24khz-48kbitrate-mono-mp3"}}}}\r ` ); websocket.send( browserSsmlHeadersPlusData( browserConnectId(), browserDateToString(), browserMkssml(this.ttsConfig.voice, this.ttsConfig.rate, this.ttsConfig.volume, this.ttsConfig.pitch, new TextDecoder().decode(this.state.partialText)) ) ); let audioWasReceived = false; while (true) { if (messageQueue.length > 0) { const message = 
messageQueue.shift(); if (message === "close") { if (!audioWasReceived) { throw new NoAudioReceived("No audio was received."); } break; } else if (message instanceof Error) { throw message; } else { if (message.type === "audio") audioWasReceived = true; yield message; } } else { await new Promise((resolve) => { resolveMessage = resolve; setTimeout(resolve, 50); }); } } } /** * Streams text-to-speech synthesis results using native browser WebSocket. * Uses only browser-native APIs, avoiding Node.js dependencies. * * @yields BrowserTTSChunk - Audio data or word boundary information * @throws {Error} If called more than once * @throws {NoAudioReceived} If no audio data is received * @throws {WebSocketError} If WebSocket connection fails */ async *stream() { if (this.state.streamWasCalled) { throw new Error("stream can only be called once."); } this.state.streamWasCalled = true; for (const partialText of this.texts) { this.state.partialText = partialText; for await (const message of this._stream()) { yield message; } } } }; // src/browser-simple.ts function concatUint8Arrays(arrays) { if (arrays.length === 0) return new Uint8Array(0); if (arrays.length === 1) return arrays[0]; const totalLength = arrays.reduce((sum, arr) => sum + arr.length, 0); const result = new Uint8Array(totalLength); let offset = 0; for (const arr of arrays) { if (arr.length > 0) { result.set(arr, offset); offset += arr.length; } } return result; } var BrowserEdgeTTS = class { /** * @param text The text to be synthesized. * @param voice The voice to use for synthesis. * @param options Prosody options (rate, volume, pitch). */ constructor(text, voice = "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)", options = {}) { this.text = text; this.voice = voice; this.rate = options.rate || "+0%"; this.volume = options.volume || "+0%"; this.pitch = options.pitch || "+0Hz"; } /** * Initiates the synthesis process using browser-native APIs. 
* @returns A promise that resolves with the synthesized audio and subtitle data. */ async synthesize() { const communicate = new BrowserCommunicate(this.text, { voice: this.voice, rate: this.rate, volume: this.volume, pitch: this.pitch }); const audioChunks = []; const wordBoundaries = []; for await (const chunk of communicate.stream()) { if (chunk.type === "audio" && chunk.data) { audioChunks.push(chunk.data); } else if (chunk.type === "WordBoundary" && chunk.offset !== void 0 && chunk.duration !== void 0 && chunk.text !== void 0) { wordBoundaries.push({ offset: chunk.offset, duration: chunk.duration, text: chunk.text }); } } const audioBuffer = concatUint8Arrays(audioChunks); const audioBlob = new Blob([ audioBuffer ], { type: "audio/mpeg" }); return { audio: audioBlob, subtitle: wordBoundaries }; } }; function formatTimestamp(timeIn100ns, format) { const totalSeconds = Math.floor(timeIn100ns / 1e7); const hours = Math.floor(totalSeconds / 3600); const minutes = Math.floor(totalSeconds % 3600 / 60); const seconds = totalSeconds % 60; const milliseconds = Math.floor(timeIn100ns % 1e7 / 1e4); const separator = format === "vtt" ? "." 
: ","; return `${padNumber(hours)}:${padNumber(minutes)}:${padNumber(seconds)}${separator}${padNumber(milliseconds, 3)}`; } function padNumber(num, length = 2) { return num.toString().padStart(length, "0"); } function createVTT(wordBoundaries) { let vttContent = "WEBVTT\n\n"; wordBoundaries.forEach((word, index) => { const startTime = formatTimestamp(word.offset, "vtt"); const endTime = formatTimestamp(word.offset + word.duration, "vtt"); vttContent += `${index + 1} `; vttContent += `${startTime} --> ${endTime} `; vttContent += `${word.text} `; }); return vttContent; } function createSRT(wordBoundaries) { let srtContent = ""; wordBoundaries.forEach((word, index) => { const startTime = formatTimestamp(word.offset, "srt"); const endTime = formatTimestamp(word.offset + word.duration, "srt"); srtContent += `${index + 1} `; srtContent += `${startTime} --> ${endTime} `; srtContent += `${word.text} `; }); return srtContent; } // src/browser-voices.ts var BrowserFetchError = class extends Error { constructor(message, response) { super(message); this.name = "BrowserFetchError"; this.response = response; } }; async function _listVoices() { const url = `${VOICE_LIST_URL}&Sec-MS-GEC=${await BrowserDRM.generateSecMsGec()}&Sec-MS-GEC-Version=${SEC_MS_GEC_VERSION}`; try { const response = await fetch(url, { headers: VOICE_HEADERS }); if (!response.ok) { const headers = {}; response.headers.forEach((value, key) => { headers[key] = value; }); throw new BrowserFetchError(`HTTP ${response.status}`, { status: response.status, headers }); } const data = await response.json(); for (const voice of data) { voice.VoiceTag.ContentCategories = voice.VoiceTag.ContentCategories.map((c) => c.trim()); voice.VoiceTag.VoicePersonalities = voice.VoiceTag.VoicePersonalities.map((p) => p.trim()); } return data; } catch (error) { if (error instanceof BrowserFetchError) { throw error; } throw new BrowserFetchError(error instanceof Error ? 
error.message : "Unknown fetch error"); } } async function listVoices() { try { return await _listVoices(); } catch (e) { if (e instanceof BrowserFetchError && e.response?.status === 403) { BrowserDRM.handleClientResponseError(e.response); return await _listVoices(); } throw e; } } var BrowserVoicesManager = class _BrowserVoicesManager { constructor() { this.voices = []; this.calledCreate = false; } /** * Creates a new BrowserVoicesManager instance. * * @param customVoices - Optional custom voice list instead of fetching from API * @returns Promise resolving to BrowserVoicesManager instance */ static async create(customVoices) { const manager = new _BrowserVoicesManager(); const voices = customVoices ?? await listVoices(); manager.voices = voices.map((voice) => ({ ...voice, Language: voice.Locale.split("-")[0] })); manager.calledCreate = true; return manager; } /** * Finds voices matching the specified criteria. * * @param filter - Filter criteria for voice selection * @returns Array of voices matching the filter * @throws {Error} If called before create() */ find(filter) { if (!this.calledCreate) { throw new Error("BrowserVoicesManager.find() called before BrowserVoicesManager.create()"); } return this.voices.filter((voice) => { return Object.entries(filter).every(([key, value]) => { return voice[key] === value; }); }); } }; // src/submaker.ts function formatTime(seconds) { const h = Math.floor(seconds / 3600); const m = Math.floor(seconds % 3600 / 60); const s = Math.floor(seconds % 60); const ms = Math.round((seconds - Math.floor(seconds)) * 1e3); const pad = (num, size = 2) => num.toString().padStart(size, "0"); return `${pad(h)}:${pad(m)}:${pad(s)},${pad(ms, 3)}`; } var SubMaker = class { constructor() { this.cues = []; } /** * Adds a WordBoundary chunk to the subtitle maker. 
* * @param msg - Must be a WordBoundary type chunk with offset, duration, and text * @throws {ValueError} If chunk is not a WordBoundary with required fields */ feed(msg) { if (msg.type !== "WordBoundary" || msg.offset === void 0 || msg.duration === void 0 || msg.text === void 0) { throw new ValueError("Invalid message type, expected 'WordBoundary' with offset, duration and text"); } const start = msg.offset / 1e7; const end = (msg.offset + msg.duration) / 1e7; this.cues.push({ index: this.cues.length + 1, start, end, content: msg.text }); } /** * Merges consecutive cues to create subtitle entries with multiple words. * This is useful for creating more readable subtitles instead of word-by-word display. * * @param words - Maximum number of words per merged cue * @throws {ValueError} If words parameter is invalid */ mergeCues(words) { if (words <= 0) { throw new ValueError("Invalid number of words to merge, expected > 0"); } if (this.cues.length === 0) { return; } const newCues = []; let currentCue = this.cues[0]; for (const cue of this.cues.slice(1)) { if (currentCue.content.split(" ").length < words) { currentCue = { ...currentCue, end: cue.end, content: `${currentCue.content} ${cue.content}` }; } else { newCues.push(currentCue); currentCue = cue; } } newCues.push(currentCue); this.cues = newCues.map((cue, i) => ({ ...cue, index: i + 1 })); } /** * Returns the subtitles in SRT format. 
* * @returns SRT formatted subtitles */ getSrt() { return this.cues.map((cue) => { return `${cue.index}\r ${formatTime(cue.start)} --> ${formatTime(cue.end)}\r ${cue.content}\r `; }).join("\r\n"); } toString() { return this.getSrt(); } }; export { BrowserCommunicate as Communicate, BrowserDRM as DRM, BrowserEdgeTTS as EdgeTTS, EdgeTTSBrowser, EdgeTTSException, BrowserFetchError as FetchError, NoAudioReceived, SkewAdjustmentError, SubMaker, UnexpectedResponse, UnknownResponse, ValueError, BrowserVoicesManager as VoicesManager, WebSocketError, createSRT, createVTT, listVoices }; //# sourceMappingURL=browser.js.map //# sourceMappingURL=browser.js.map