edge-tts-universal
Version:
Universal text-to-speech library using Microsoft Edge's online TTS service. Works in Node.js and browsers WITHOUT needing Microsoft Edge, Windows, or an API key
997 lines (985 loc) • 34.4 kB
JavaScript
// src/browser.ts
var EdgeTTSBrowser = class {
/**
* @param text The text to be synthesized.
* @param voice The voice to use for synthesis.
* @param options Prosody options (rate, volume, pitch).
*/
constructor(text, voice = "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)", options = {}) {
this.ws = null;
this.WSS_URL = "wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1";
this.TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
this.text = text;
this.voice = voice;
this.rate = options.rate || "+0%";
this.volume = options.volume || "+0%";
this.pitch = options.pitch || "+0Hz";
}
/**
* Initiates the synthesis process.
* @returns A promise that resolves with the synthesized audio and subtitle data.
*/
async synthesize() {
await this.connect();
if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
throw new Error("WebSocket is not connected.");
}
this.ws.send(this.createSpeechConfig());
this.ws.send(this.createSSML());
return new Promise((resolve, reject) => {
const audioChunks = [];
let wordBoundaries = [];
if (this.ws) {
this.ws.onmessage = (event) => {
if (typeof event.data === "string") {
const { headers, body } = this.parseMessage(event.data);
if (headers.Path === "audio.metadata") {
try {
const metadata = JSON.parse(body);
if (metadata.Metadata && Array.isArray(metadata.Metadata)) {
const boundaries = metadata.Metadata.filter((item) => item.Type === "WordBoundary" && item.Data).map((item) => ({
offset: item.Data.Offset,
duration: item.Data.Duration,
text: item.Data.text.Text
}));
wordBoundaries = wordBoundaries.concat(boundaries);
}
} catch (e) {
}
} else if (headers.Path === "turn.end") {
if (this.ws) this.ws.close();
}
} else if (event.data instanceof Blob) {
event.data.arrayBuffer().then((arrayBuffer) => {
const dataView = new DataView(arrayBuffer);
const headerLength = dataView.getUint16(0);
if (arrayBuffer.byteLength > headerLength + 2) {
const audioData = new Uint8Array(arrayBuffer, headerLength + 2);
audioChunks.push(audioData);
}
});
}
};
this.ws.onclose = () => {
const audioBlob = new Blob(
audioChunks,
{ type: "audio/mpeg" }
);
resolve({ audio: audioBlob, subtitle: wordBoundaries });
};
this.ws.onerror = (error) => {
reject(error);
};
}
});
}
/**
* Establishes a connection to the WebSocket server.
*/
connect() {
const connectionId = this.generateConnectionId();
const secMsGec = this.generateSecMsGec();
const url = `${this.WSS_URL}?TrustedClientToken=${this.TRUSTED_CLIENT_TOKEN}&ConnectionId=${connectionId}&Sec-MS-GEC=${secMsGec}&Sec-MS-GEC-Version=1-130.0.2849.68`;
this.ws = new WebSocket(url);
return new Promise((resolve, reject) => {
if (!this.ws) {
return reject(new Error("WebSocket not initialized"));
}
this.ws.onopen = () => {
resolve();
};
this.ws.onerror = (error) => {
reject(error);
};
});
}
/**
* Parses a string message from the WebSocket into headers and a body.
*/
parseMessage(message) {
const parts = message.split("\r\n\r\n");
const headerLines = parts[0].split("\r\n");
const headers = {};
headerLines.forEach((line) => {
const [key, value] = line.split(":", 2);
if (key && value) {
headers[key.trim()] = value.trim();
}
});
return { headers, body: parts[1] || "" };
}
/**
* Creates the speech configuration message.
*/
createSpeechConfig() {
const config = {
context: {
synthesis: {
audio: {
metadataoptions: {
sentenceBoundaryEnabled: false,
wordBoundaryEnabled: true
},
outputFormat: "audio-24khz-48kbitrate-mono-mp3"
}
}
}
};
return `X-Timestamp:${this.getTimestamp()}\r
Content-Type:application/json; charset=utf-8\r
Path:speech.config\r
\r
${JSON.stringify(config)}`;
}
/**
* Creates the SSML (Speech Synthesis Markup Language) message.
*/
createSSML() {
const ssml = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>
<voice name='${this.voice}'>
<prosody pitch='${this.pitch}' rate='${this.rate}' volume='${this.volume}'>
${this.escapeXml(this.text)}
</prosody>
</voice>
</speak>`;
return `X-RequestId:${this.generateConnectionId()}\r
Content-Type:application/ssml+xml\r
X-Timestamp:${this.getTimestamp()}Z\r
Path:ssml\r
\r
${ssml}`;
}
generateConnectionId() {
return "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx".replace(/[xy]/g, (c) => {
const r = Math.random() * 16 | 0;
const v = c === "x" ? r : r & 3 | 8;
return v.toString(16);
});
}
getTimestamp() {
return (/* @__PURE__ */ new Date()).toISOString().replace(/[:-]|\.\d{3}/g, "");
}
escapeXml(text) {
return text.replace(/[<>&'"]/g, (char) => {
switch (char) {
case "<":
return "<";
case ">":
return ">";
case "&":
return "&";
case "'":
return "'";
case '"':
return """;
default:
return char;
}
});
}
/**
* Browser-compatible version of DRM security token generation
* Uses Web Crypto API instead of Node.js crypto
*/
async generateSecMsGec() {
const WIN_EPOCH2 = 11644473600;
const S_TO_NS2 = 1e9;
let ticks = Date.now() / 1e3;
ticks += WIN_EPOCH2;
ticks -= ticks % 300;
ticks *= S_TO_NS2 / 100;
const strToHash = `${ticks.toFixed(0)}${this.TRUSTED_CLIENT_TOKEN}`;
const encoder = new TextEncoder();
const data = encoder.encode(strToHash);
const hashBuffer = await crypto.subtle.digest("SHA-256", data);
const hashArray = Array.from(new Uint8Array(hashBuffer));
return hashArray.map((b) => b.toString(16).padStart(2, "0")).join("").toUpperCase();
}
};
// src/browser-utils.ts
function browserConnectId() {
const array = new Uint8Array(16);
crypto.getRandomValues(array);
array[6] = array[6] & 15 | 64;
array[8] = array[8] & 63 | 128;
const hex = Array.from(array, (byte) => byte.toString(16).padStart(2, "0")).join("");
const uuid = `${hex.slice(0, 8)}-${hex.slice(8, 12)}-${hex.slice(12, 16)}-${hex.slice(16, 20)}-${hex.slice(20, 32)}`;
return uuid.replace(/-/g, "");
}
/**
 * Escapes the five XML special characters (&, <, >, ", ') as entities.
 *
 * "&" is replaced first so the ampersands introduced by the other
 * replacements are not escaped a second time.
 *
 * @param text Raw text.
 * @returns XML-safe text.
 */
function browserEscape(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
/**
 * Reverses browserEscape: turns the five XML entities back into characters.
 *
 * "&amp;" is decoded last so that text such as "&amp;lt;" becomes "&lt;"
 * rather than "<".
 *
 * @param text Text containing XML entities.
 * @returns Decoded text.
 */
function browserUnescape(text) {
  return text
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&amp;/g, "&");
}
/**
 * Replaces control characters U+0000-U+0008, U+000B, U+000C and
 * U+000E-U+001F with a space. Tab, line feed and carriage return are kept.
 *
 * @param text Input text.
 * @returns Text with each matched control character replaced by " ".
 */
function browserRemoveIncompatibleCharacters(text) {
  const incompatible = /[\u0000-\u0008\u000B\u000C\u000E-\u001F]/g;
  const sanitized = text.replace(incompatible, " ");
  return sanitized;
}
/**
 * Returns the current UTC time from Date.prototype.toUTCString with "GMT"
 * expanded to "GMT+0000 (Coordinated Universal Time)".
 */
function browserDateToString() {
  const now = /* @__PURE__ */ new Date();
  const utcText = now.toUTCString();
  return utcText.replace("GMT", "GMT+0000 (Coordinated Universal Time)");
}
/**
 * Builds the single-line SSML document for one synthesis request.
 *
 * @param voice Value for the voice element's name attribute.
 * @param rate Prosody rate.
 * @param volume Prosody volume.
 * @param pitch Prosody pitch.
 * @param escapedText Text to speak; inserted as-is, so it must already be XML-escaped.
 * @returns The SSML string.
 */
function browserMkssml(voice, rate, volume, pitch, escapedText) {
  const speakOpen = "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>";
  const voiceOpen = "<voice name='" + voice + "'>";
  const prosodyOpen = "<prosody pitch='" + pitch + "' rate='" + rate + "' volume='" + volume + "'>";
  return speakOpen + voiceOpen + prosodyOpen + escapedText + "</prosody></voice></speak>";
}
/**
 * Prefixes an SSML document with the request headers of an "ssml" frame.
 *
 * Header lines are separated by CRLF and a blank line precedes the SSML.
 * A literal "Z" is appended to the timestamp.
 *
 * @param requestId Value for the X-RequestId header.
 * @param timestamp Value for the X-Timestamp header (before the "Z" suffix).
 * @param ssml The SSML document.
 * @returns The complete text frame.
 */
function browserSsmlHeadersPlusData(requestId, timestamp, ssml) {
  const frameLines = [
    "X-RequestId:" + requestId,
    "Content-Type:application/ssml+xml",
    "X-Timestamp:" + timestamp + "Z",
    "Path:ssml",
    "",
    String(ssml)
  ];
  return frameLines.join("\r\n");
}
// src/exceptions.ts
var EdgeTTSException = class extends Error {
constructor(message) {
super(message);
this.name = "EdgeTTSException";
}
};
/**
 * Error type for failures while adjusting the local clock skew from a server
 * response.
 */
var SkewAdjustmentError = class extends EdgeTTSException {
  /**
   * @param message Human-readable description of the failure.
   */
  constructor(message) {
    super(message);
    this.name = "SkewAdjustmentError";
  }
};
/**
 * Error type for server messages whose path or metadata type is not recognised.
 */
var UnknownResponse = class extends EdgeTTSException {
  /**
   * @param message Human-readable description of the failure.
   */
  constructor(message) {
    super(message);
    this.name = "UnknownResponse";
  }
};
/**
 * Error type for server messages that are recognised but malformed or
 * incomplete.
 */
var UnexpectedResponse = class extends EdgeTTSException {
  /**
   * @param message Human-readable description of the failure.
   */
  constructor(message) {
    super(message);
    this.name = "UnexpectedResponse";
  }
};
/**
 * Error type for a synthesis turn that ended without any audio data.
 */
var NoAudioReceived = class extends EdgeTTSException {
  /**
   * @param message Human-readable description of the failure.
   */
  constructor(message) {
    super(message);
    this.name = "NoAudioReceived";
  }
};
/**
 * Error type for WebSocket connection failures and timeouts.
 */
var WebSocketError = class extends EdgeTTSException {
  /**
   * @param message Human-readable description of the failure.
   */
  constructor(message) {
    super(message);
    this.name = "WebSocketError";
  }
};
/**
 * Error type for arguments that have the right type but an invalid value.
 */
var ValueError = class extends EdgeTTSException {
  /**
   * @param message Human-readable description of the failure.
   */
  constructor(message) {
    super(message);
    this.name = "ValueError";
  }
};
// src/tts_config.ts
var TTSConfig = class _TTSConfig {
/**
* Creates a new TTSConfig instance with the specified parameters.
*
* @param options - Configuration options
* @param options.voice - Voice name (supports both short and full formats)
* @param options.rate - Speech rate adjustment (default: "+0%")
* @param options.volume - Volume adjustment (default: "+0%")
* @param options.pitch - Pitch adjustment (default: "+0Hz")
* @throws {ValueError} If any parameter has an invalid format
*/
constructor({
voice,
rate = "+0%",
volume = "+0%",
pitch = "+0Hz"
}) {
this.voice = voice;
this.rate = rate;
this.volume = volume;
this.pitch = pitch;
this.validate();
}
validate() {
const match = /^([a-z]{2,})-([A-Z]{2,})-(.+Neural)$/.exec(this.voice);
if (match) {
const [, lang] = match;
let [, , region, name] = match;
if (name.includes("-")) {
const parts = name.split("-");
region += `-${parts[0]}`;
name = parts[1];
}
this.voice = `Microsoft Server Speech Text to Speech Voice (${lang}-${region}, ${name})`;
}
_TTSConfig.validateStringParam(
"voice",
this.voice,
/^Microsoft Server Speech Text to Speech Voice \(.+,.+\)$/
);
_TTSConfig.validateStringParam("rate", this.rate, /^[+-]\d+%$/);
_TTSConfig.validateStringParam("volume", this.volume, /^[+-]\d+%$/);
_TTSConfig.validateStringParam("pitch", this.pitch, /^[+-]\d+Hz$/);
}
static validateStringParam(paramName, paramValue, pattern) {
if (typeof paramValue !== "string") {
throw new TypeError(`${paramName} must be a string`);
}
if (!pattern.test(paramValue)) {
throw new ValueError(`Invalid ${paramName} '${paramValue}'.`);
}
}
};
// src/constants.ts
var BASE_URL = "speech.platform.bing.com/consumer/speech/synthesize/readaloud";
var TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
var WSS_URL = `wss://${BASE_URL}/edge/v1?TrustedClientToken=${TRUSTED_CLIENT_TOKEN}`;
var VOICE_LIST_URL = `https://${BASE_URL}/voices/list?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`;
var DEFAULT_VOICE = "en-US-EmmaMultilingualNeural";
var CHROMIUM_FULL_VERSION = "130.0.2849.68";
var CHROMIUM_MAJOR_VERSION = CHROMIUM_FULL_VERSION.split(".")[0];
var SEC_MS_GEC_VERSION = `1-${CHROMIUM_FULL_VERSION}`;
var BASE_HEADERS = {
"User-Agent": `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${CHROMIUM_MAJOR_VERSION}.0.0.0 Safari/537.36 Edg/${CHROMIUM_MAJOR_VERSION}.0.0.0`,
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9"
};
var VOICE_HEADERS = {
...BASE_HEADERS,
"Authority": "speech.platform.bing.com",
"Sec-CH-UA": `" Not;A Brand";v="99", "Microsoft Edge";v="${CHROMIUM_MAJOR_VERSION}", "Chromium";v="${CHROMIUM_MAJOR_VERSION}"`,
"Sec-CH-UA-Mobile": "?0",
"Accept": "*/*",
"Sec-Fetch-Site": "none",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Dest": "empty"
};
// src/browser-drm.ts
var WIN_EPOCH = 11644473600;
var S_TO_NS = 1e9;
var _BrowserDRM = class _BrowserDRM {
static adjClockSkewSeconds(skewSeconds) {
_BrowserDRM.clockSkewSeconds += skewSeconds;
}
static getUnixTimestamp() {
return Date.now() / 1e3 + _BrowserDRM.clockSkewSeconds;
}
static parseRfc2616Date(date) {
try {
return new Date(date).getTime() / 1e3;
} catch (e) {
return null;
}
}
static handleClientResponseError(response) {
if (!response.headers) {
throw new SkewAdjustmentError("No headers in response.");
}
const serverDate = response.headers["date"] || response.headers["Date"];
if (!serverDate) {
throw new SkewAdjustmentError("No server date in headers.");
}
const serverDateParsed = _BrowserDRM.parseRfc2616Date(serverDate);
if (serverDateParsed === null) {
throw new SkewAdjustmentError(`Failed to parse server date: ${serverDate}`);
}
const clientDate = _BrowserDRM.getUnixTimestamp();
_BrowserDRM.adjClockSkewSeconds(serverDateParsed - clientDate);
}
static async generateSecMsGec() {
let ticks = _BrowserDRM.getUnixTimestamp();
ticks += WIN_EPOCH;
ticks -= ticks % 300;
ticks *= S_TO_NS / 100;
const strToHash = `${ticks.toFixed(0)}${TRUSTED_CLIENT_TOKEN}`;
const encoder = new TextEncoder();
const data = encoder.encode(strToHash);
const hashBuffer = await crypto.subtle.digest("SHA-256", data);
const hashArray = Array.from(new Uint8Array(hashBuffer));
return hashArray.map((b) => b.toString(16).padStart(2, "0")).join("").toUpperCase();
}
};
_BrowserDRM.clockSkewSeconds = 0;
var BrowserDRM = _BrowserDRM;
// src/browser-communicate.ts
var BrowserBuffer = class {
static from(input, encoding) {
if (typeof input === "string") {
return new TextEncoder().encode(input);
} else if (input instanceof ArrayBuffer) {
return new Uint8Array(input);
} else if (input instanceof Uint8Array) {
return input;
}
throw new Error("Unsupported input type for BrowserBuffer.from");
}
static concat(arrays) {
const totalLength = arrays.reduce((sum, arr) => sum + arr.length, 0);
const result = new Uint8Array(totalLength);
let offset = 0;
for (const arr of arrays) {
result.set(arr, offset);
offset += arr.length;
}
return result;
}
};
function browserGetHeadersAndDataFromText(message) {
const messageString = new TextDecoder().decode(message);
const headerEndIndex = messageString.indexOf("\r\n\r\n");
const headers = {};
if (headerEndIndex !== -1) {
const headerString = messageString.substring(0, headerEndIndex);
const headerLines = headerString.split("\r\n");
for (const line of headerLines) {
const [key, value] = line.split(":", 2);
if (key && value) {
headers[key] = value.trim();
}
}
}
const headerByteLength = new TextEncoder().encode(messageString.substring(0, headerEndIndex + 4)).length;
return [headers, message.slice(headerByteLength)];
}
function browserGetHeadersAndDataFromBinary(message) {
if (message.length < 2) {
throw new Error("Message too short to contain header length");
}
const headerLength = message[0] << 8 | message[1];
const headers = {};
if (headerLength > 0 && headerLength + 2 <= message.length) {
const headerBytes = message.slice(2, headerLength + 2);
const headerString = new TextDecoder().decode(headerBytes);
const headerLines = headerString.split("\r\n");
for (const line of headerLines) {
const [key, value] = line.split(":", 2);
if (key && value) {
headers[key] = value.trim();
}
}
}
return [headers, message.slice(headerLength + 2)];
}
function browserSplitTextByByteLength(text, byteLength) {
return (function* () {
let buffer = new TextEncoder().encode(text);
while (buffer.length > byteLength) {
let splitAt = byteLength;
const slice = buffer.slice(0, byteLength);
const sliceText = new TextDecoder().decode(slice);
const lastNewline = sliceText.lastIndexOf("\n");
const lastSpace = sliceText.lastIndexOf(" ");
if (lastNewline > 0) {
splitAt = new TextEncoder().encode(sliceText.substring(0, lastNewline)).length;
} else if (lastSpace > 0) {
splitAt = new TextEncoder().encode(sliceText.substring(0, lastSpace)).length;
}
const chunk = buffer.slice(0, splitAt);
const chunkText = new TextDecoder().decode(chunk).trim();
if (chunkText) {
yield new TextEncoder().encode(chunkText);
}
buffer = buffer.slice(splitAt);
}
const remainingText = new TextDecoder().decode(buffer).trim();
if (remainingText) {
yield new TextEncoder().encode(remainingText);
}
})();
}
var BrowserCommunicate = class {
/**
* Creates a new browser Communicate instance for text-to-speech synthesis.
*
* @param text - The text to synthesize
* @param options - Configuration options for synthesis
*/
constructor(text, options = {}) {
this.state = {
partialText: BrowserBuffer.from(""),
offsetCompensation: 0,
lastDurationOffset: 0,
streamWasCalled: false
};
this.ttsConfig = new TTSConfig({
voice: options.voice || DEFAULT_VOICE,
rate: options.rate,
volume: options.volume,
pitch: options.pitch
});
if (typeof text !== "string") {
throw new TypeError("text must be a string");
}
this.texts = browserSplitTextByByteLength(
browserEscape(browserRemoveIncompatibleCharacters(text)),
// browserCalcMaxMesgSize(this.ttsConfig.voice, this.ttsConfig.rate, this.ttsConfig.volume, this.ttsConfig.pitch),
4096
);
this.connectionTimeout = options.connectionTimeout;
}
parseMetadata(data) {
const metadata = JSON.parse(new TextDecoder().decode(data));
for (const metaObj of metadata["Metadata"]) {
const metaType = metaObj["Type"];
if (metaType === "WordBoundary") {
const currentOffset = metaObj["Data"]["Offset"] + this.state.offsetCompensation;
const currentDuration = metaObj["Data"]["Duration"];
return {
type: metaType,
offset: currentOffset,
duration: currentDuration,
text: browserUnescape(metaObj["Data"]["text"]["Text"])
};
}
if (metaType === "SessionEnd") {
continue;
}
throw new UnknownResponse(`Unknown metadata type: ${metaType}`);
}
throw new UnexpectedResponse("No WordBoundary metadata found");
}
async *_stream() {
const url = `${WSS_URL}&Sec-MS-GEC=${await BrowserDRM.generateSecMsGec()}&Sec-MS-GEC-Version=${SEC_MS_GEC_VERSION}&ConnectionId=${browserConnectId()}`;
const websocket = new WebSocket(url);
const messageQueue = [];
let resolveMessage = null;
let timeoutId;
if (this.connectionTimeout) {
timeoutId = window.setTimeout(() => {
websocket.close();
messageQueue.push(new WebSocketError("Connection timeout"));
if (resolveMessage) resolveMessage();
}, this.connectionTimeout);
}
websocket.onmessage = (event) => {
if (timeoutId) {
window.clearTimeout(timeoutId);
timeoutId = void 0;
}
const data = event.data;
if (typeof data === "string") {
const [headers, parsedData] = browserGetHeadersAndDataFromText(BrowserBuffer.from(data));
const path = headers["Path"];
if (path === "audio.metadata") {
try {
const parsedMetadata = this.parseMetadata(parsedData);
this.state.lastDurationOffset = parsedMetadata.offset + parsedMetadata.duration;
messageQueue.push(parsedMetadata);
} catch (e) {
messageQueue.push(e);
}
} else if (path === "turn.end") {
this.state.offsetCompensation = this.state.lastDurationOffset;
websocket.close();
} else if (path !== "response" && path !== "turn.start") {
messageQueue.push(new UnknownResponse(`Unknown path received: ${path}`));
}
} else if (data instanceof ArrayBuffer) {
const bufferData = BrowserBuffer.from(data);
if (bufferData.length < 2) {
messageQueue.push(new UnexpectedResponse("We received a binary message, but it is missing the header length."));
} else {
const [headers, audioData] = browserGetHeadersAndDataFromBinary(bufferData);
if (headers["Path"] !== "audio") {
messageQueue.push(new UnexpectedResponse("Received binary message, but the path is not audio."));
} else {
const contentType = headers["Content-Type"];
if (contentType !== "audio/mpeg") {
if (audioData.length > 0) {
messageQueue.push(new UnexpectedResponse("Received binary message, but with an unexpected Content-Type."));
}
} else if (audioData.length === 0) {
messageQueue.push(new UnexpectedResponse("Received binary message, but it is missing the audio data."));
} else {
messageQueue.push({ type: "audio", data: audioData });
}
}
}
} else if (data instanceof Blob) {
data.arrayBuffer().then((arrayBuffer) => {
const bufferData = BrowserBuffer.from(arrayBuffer);
if (bufferData.length < 2) {
messageQueue.push(new UnexpectedResponse("We received a binary message, but it is missing the header length."));
} else {
const [headers, audioData] = browserGetHeadersAndDataFromBinary(bufferData);
if (headers["Path"] !== "audio") {
messageQueue.push(new UnexpectedResponse("Received binary message, but the path is not audio."));
} else {
const contentType = headers["Content-Type"];
if (contentType !== "audio/mpeg") {
if (audioData.length > 0) {
messageQueue.push(new UnexpectedResponse("Received binary message, but with an unexpected Content-Type."));
}
} else if (audioData.length === 0) {
messageQueue.push(new UnexpectedResponse("Received binary message, but it is missing the audio data."));
} else {
messageQueue.push({ type: "audio", data: audioData });
}
}
}
if (resolveMessage) resolveMessage();
});
}
if (resolveMessage) resolveMessage();
};
websocket.onerror = (error) => {
if (timeoutId) {
window.clearTimeout(timeoutId);
timeoutId = void 0;
}
messageQueue.push(new WebSocketError("WebSocket error occurred"));
if (resolveMessage) resolveMessage();
};
websocket.onclose = () => {
if (timeoutId) {
window.clearTimeout(timeoutId);
timeoutId = void 0;
}
messageQueue.push("close");
if (resolveMessage) resolveMessage();
};
await new Promise((resolve, reject) => {
websocket.onopen = () => {
if (timeoutId) {
window.clearTimeout(timeoutId);
timeoutId = void 0;
}
resolve();
};
if (this.connectionTimeout) {
setTimeout(() => {
if (websocket.readyState === WebSocket.CONNECTING) {
websocket.close();
reject(new WebSocketError("Connection timeout"));
}
}, this.connectionTimeout);
}
});
websocket.send(
`X-Timestamp:${browserDateToString()}\r
Content-Type:application/json; charset=utf-8\r
Path:speech.config\r
\r
{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},"outputFormat":"audio-24khz-48kbitrate-mono-mp3"}}}}\r
`
);
websocket.send(
browserSsmlHeadersPlusData(
browserConnectId(),
browserDateToString(),
browserMkssml(this.ttsConfig.voice, this.ttsConfig.rate, this.ttsConfig.volume, this.ttsConfig.pitch, new TextDecoder().decode(this.state.partialText))
)
);
let audioWasReceived = false;
while (true) {
if (messageQueue.length > 0) {
const message = messageQueue.shift();
if (message === "close") {
if (!audioWasReceived) {
throw new NoAudioReceived("No audio was received.");
}
break;
} else if (message instanceof Error) {
throw message;
} else {
if (message.type === "audio") audioWasReceived = true;
yield message;
}
} else {
await new Promise((resolve) => {
resolveMessage = resolve;
setTimeout(resolve, 50);
});
}
}
}
/**
* Streams text-to-speech synthesis results using native browser WebSocket.
* Uses only browser-native APIs, avoiding Node.js dependencies.
*
* @yields BrowserTTSChunk - Audio data or word boundary information
* @throws {Error} If called more than once
* @throws {NoAudioReceived} If no audio data is received
* @throws {WebSocketError} If WebSocket connection fails
*/
async *stream() {
if (this.state.streamWasCalled) {
throw new Error("stream can only be called once.");
}
this.state.streamWasCalled = true;
for (const partialText of this.texts) {
this.state.partialText = partialText;
for await (const message of this._stream()) {
yield message;
}
}
}
};
// src/browser-simple.ts
/**
 * Joins byte arrays into one Uint8Array.
 *
 * An empty list yields a new empty array; a single-element list returns that
 * element itself (no copy); otherwise a new array holding all bytes in order
 * is returned.
 *
 * @param arrays Arrays to join.
 */
function concatUint8Arrays(arrays) {
  switch (arrays.length) {
    case 0:
      return new Uint8Array(0);
    case 1:
      return arrays[0];
  }
  let totalBytes = 0;
  for (const part of arrays) {
    totalBytes += part.length;
  }
  const merged = new Uint8Array(totalBytes);
  let cursor = 0;
  for (const part of arrays) {
    if (part.length === 0) {
      continue;
    }
    merged.set(part, cursor);
    cursor += part.length;
  }
  return merged;
}
var BrowserEdgeTTS = class {
/**
* @param text The text to be synthesized.
* @param voice The voice to use for synthesis.
* @param options Prosody options (rate, volume, pitch).
*/
constructor(text, voice = "Microsoft Server Speech Text to Speech Voice (en-US, EmmaMultilingualNeural)", options = {}) {
this.text = text;
this.voice = voice;
this.rate = options.rate || "+0%";
this.volume = options.volume || "+0%";
this.pitch = options.pitch || "+0Hz";
}
/**
* Initiates the synthesis process using browser-native APIs.
* @returns A promise that resolves with the synthesized audio and subtitle data.
*/
async synthesize() {
const communicate = new BrowserCommunicate(this.text, {
voice: this.voice,
rate: this.rate,
volume: this.volume,
pitch: this.pitch
});
const audioChunks = [];
const wordBoundaries = [];
for await (const chunk of communicate.stream()) {
if (chunk.type === "audio" && chunk.data) {
audioChunks.push(chunk.data);
} else if (chunk.type === "WordBoundary" && chunk.offset !== void 0 && chunk.duration !== void 0 && chunk.text !== void 0) {
wordBoundaries.push({
offset: chunk.offset,
duration: chunk.duration,
text: chunk.text
});
}
}
const audioBuffer = concatUint8Arrays(audioChunks);
const audioBlob = new Blob([
audioBuffer
], { type: "audio/mpeg" });
return {
audio: audioBlob,
subtitle: wordBoundaries
};
}
};
function formatTimestamp(timeIn100ns, format) {
const totalSeconds = Math.floor(timeIn100ns / 1e7);
const hours = Math.floor(totalSeconds / 3600);
const minutes = Math.floor(totalSeconds % 3600 / 60);
const seconds = totalSeconds % 60;
const milliseconds = Math.floor(timeIn100ns % 1e7 / 1e4);
const separator = format === "vtt" ? "." : ",";
return `${padNumber(hours)}:${padNumber(minutes)}:${padNumber(seconds)}${separator}${padNumber(milliseconds, 3)}`;
}
/**
 * Left-pads the decimal form of a number with zeros.
 *
 * @param num Number to format.
 * @param length Minimum width of the result (default 2).
 * @returns The padded string; longer numbers are returned unshortened.
 */
function padNumber(num, length = 2) {
  const digits = num.toString();
  return digits.padStart(length, "0");
}
function createVTT(wordBoundaries) {
let vttContent = "WEBVTT\n\n";
wordBoundaries.forEach((word, index) => {
const startTime = formatTimestamp(word.offset, "vtt");
const endTime = formatTimestamp(word.offset + word.duration, "vtt");
vttContent += `${index + 1}
`;
vttContent += `${startTime} --> ${endTime}
`;
vttContent += `${word.text}
`;
});
return vttContent;
}
function createSRT(wordBoundaries) {
let srtContent = "";
wordBoundaries.forEach((word, index) => {
const startTime = formatTimestamp(word.offset, "srt");
const endTime = formatTimestamp(word.offset + word.duration, "srt");
srtContent += `${index + 1}
`;
srtContent += `${startTime} --> ${endTime}
`;
srtContent += `${word.text}
`;
});
return srtContent;
}
// src/browser-voices.ts
var BrowserFetchError = class extends Error {
constructor(message, response) {
super(message);
this.name = "BrowserFetchError";
this.response = response;
}
};
/**
 * Fetches the voice list once, without any retry.
 *
 * The category and personality strings of every voice are trimmed. A voice
 * that lacks `VoiceTag`, `ContentCategories` or `VoicePersonalities` gets
 * empty defaults, so one incomplete entry does not fail the whole listing.
 *
 * @returns The parsed voice list.
 * @throws {BrowserFetchError} On a non-OK HTTP status (with `response.status`
 *         and `response.headers` set) or on any other failure (message only).
 */
async function _listVoices() {
  const url = `${VOICE_LIST_URL}&Sec-MS-GEC=${await BrowserDRM.generateSecMsGec()}&Sec-MS-GEC-Version=${SEC_MS_GEC_VERSION}`;
  try {
    const response = await fetch(url, {
      headers: VOICE_HEADERS
    });
    if (!response.ok) {
      // Copy the headers into a plain object so callers can read the Date header.
      const headers = {};
      response.headers.forEach((value, key) => {
        headers[key] = value;
      });
      throw new BrowserFetchError(`HTTP ${response.status}`, {
        status: response.status,
        headers
      });
    }
    const data = await response.json();
    for (const voice of data) {
      const tag = voice.VoiceTag ?? (voice.VoiceTag = {});
      tag.ContentCategories = (tag.ContentCategories ?? []).map((c) => c.trim());
      tag.VoicePersonalities = (tag.VoicePersonalities ?? []).map((p) => p.trim());
    }
    return data;
  } catch (error) {
    if (error instanceof BrowserFetchError) {
      throw error;
    }
    throw new BrowserFetchError(error instanceof Error ? error.message : "Unknown fetch error");
  }
}
/**
 * Fetches the voice list, retrying once after a 403 response.
 *
 * On a 403 the clock skew is adjusted from the failed response before the
 * single retry; every other error is rethrown unchanged.
 *
 * @returns The voice list.
 */
async function listVoices() {
  try {
    return await _listVoices();
  } catch (e) {
    const isForbidden = e instanceof BrowserFetchError && e.response?.status === 403;
    if (!isForbidden) {
      throw e;
    }
    BrowserDRM.handleClientResponseError(e.response);
    return await _listVoices();
  }
}
var BrowserVoicesManager = class _BrowserVoicesManager {
constructor() {
this.voices = [];
this.calledCreate = false;
}
/**
* Creates a new BrowserVoicesManager instance.
*
* @param customVoices - Optional custom voice list instead of fetching from API
* @returns Promise resolving to BrowserVoicesManager instance
*/
static async create(customVoices) {
const manager = new _BrowserVoicesManager();
const voices = customVoices ?? await listVoices();
manager.voices = voices.map((voice) => ({
...voice,
Language: voice.Locale.split("-")[0]
}));
manager.calledCreate = true;
return manager;
}
/**
* Finds voices matching the specified criteria.
*
* @param filter - Filter criteria for voice selection
* @returns Array of voices matching the filter
* @throws {Error} If called before create()
*/
find(filter) {
if (!this.calledCreate) {
throw new Error("BrowserVoicesManager.find() called before BrowserVoicesManager.create()");
}
return this.voices.filter((voice) => {
return Object.entries(filter).every(([key, value]) => {
return voice[key] === value;
});
});
}
};
// src/submaker.ts
/**
 * Formats a duration in seconds as an SRT timestamp "HH:MM:SS,mmm".
 *
 * The value is rounded to whole milliseconds first and then split into
 * fields, so a fraction that rounds up (0.9996 s, for example) carries into
 * the seconds instead of producing a four-digit millisecond field.
 *
 * @param seconds Time in seconds.
 * @returns The formatted timestamp.
 */
function formatTime(seconds) {
  const totalMs = Math.round(seconds * 1e3);
  const totalSeconds = Math.floor(totalMs / 1e3);
  const h = Math.floor(totalSeconds / 3600);
  const m = Math.floor(totalSeconds % 3600 / 60);
  const s = totalSeconds % 60;
  const ms = totalMs % 1e3;
  const pad = (num, size = 2) => num.toString().padStart(size, "0");
  return `${pad(h)}:${pad(m)}:${pad(s)},${pad(ms, 3)}`;
}
var SubMaker = class {
constructor() {
this.cues = [];
}
/**
* Adds a WordBoundary chunk to the subtitle maker.
*
* @param msg - Must be a WordBoundary type chunk with offset, duration, and text
* @throws {ValueError} If chunk is not a WordBoundary with required fields
*/
feed(msg) {
if (msg.type !== "WordBoundary" || msg.offset === void 0 || msg.duration === void 0 || msg.text === void 0) {
throw new ValueError("Invalid message type, expected 'WordBoundary' with offset, duration and text");
}
const start = msg.offset / 1e7;
const end = (msg.offset + msg.duration) / 1e7;
this.cues.push({
index: this.cues.length + 1,
start,
end,
content: msg.text
});
}
/**
* Merges consecutive cues to create subtitle entries with multiple words.
* This is useful for creating more readable subtitles instead of word-by-word display.
*
* @param words - Maximum number of words per merged cue
* @throws {ValueError} If words parameter is invalid
*/
mergeCues(words) {
if (words <= 0) {
throw new ValueError("Invalid number of words to merge, expected > 0");
}
if (this.cues.length === 0) {
return;
}
const newCues = [];
let currentCue = this.cues[0];
for (const cue of this.cues.slice(1)) {
if (currentCue.content.split(" ").length < words) {
currentCue = {
...currentCue,
end: cue.end,
content: `${currentCue.content} ${cue.content}`
};
} else {
newCues.push(currentCue);
currentCue = cue;
}
}
newCues.push(currentCue);
this.cues = newCues.map((cue, i) => ({ ...cue, index: i + 1 }));
}
/**
* Returns the subtitles in SRT format.
*
* @returns SRT formatted subtitles
*/
getSrt() {
return this.cues.map((cue) => {
return `${cue.index}\r
${formatTime(cue.start)} --> ${formatTime(cue.end)}\r
${cue.content}\r
`;
}).join("\r\n");
}
toString() {
return this.getSrt();
}
};
export { BrowserCommunicate as Communicate, BrowserDRM as DRM, BrowserEdgeTTS as EdgeTTS, EdgeTTSBrowser, EdgeTTSException, BrowserFetchError as FetchError, NoAudioReceived, SkewAdjustmentError, SubMaker, UnexpectedResponse, UnknownResponse, ValueError, BrowserVoicesManager as VoicesManager, WebSocketError, createSRT, createVTT, listVoices };
//# sourceMappingURL=browser.js.map
//# sourceMappingURL=browser.js.map