edge-tts-generator
Version:
Generate text-to-speech narration for free, leveraging the Read Aloud feature in Microsoft Edge
342 lines (338 loc) • 11.9 kB
JavaScript
// src/constants.ts
/**
 * Audio container/codec identifiers that can be requested from the
 * synthesis service. Keys are the symbolic names, values are the wire strings.
 */
var OUTPUT_FORMAT = /* @__PURE__ */ Object.assign(OUTPUT_FORMAT || {}, {
  AUDIO_24KHZ_48KBITRATE_MONO_MP3: "audio-24khz-48kbitrate-mono-mp3",
  AUDIO_24KHZ_96KBITRATE_MONO_MP3: "audio-24khz-96kbitrate-mono-mp3",
  WEBM_24KHZ_16BIT_MONO_OPUS: "webm-24khz-16bit-mono-opus"
});
/**
 * Named pitch levels usable as the `pitch` attribute of an SSML prosody element.
 */
var PITCH = /* @__PURE__ */ Object.assign(PITCH || {}, {
  X_LOW: "x-low",
  LOW: "low",
  MEDIUM: "medium",
  HIGH: "high",
  X_HIGH: "x-high",
  DEFAULT: "default"
});
/**
 * Named speaking-rate levels usable as the `rate` attribute of an SSML prosody element.
 */
var RATE = /* @__PURE__ */ Object.assign(RATE || {}, {
  X_SLOW: "x-slow",
  SLOW: "slow",
  MEDIUM: "medium",
  FAST: "fast",
  X_FAST: "x-fast",
  DEFAULT: "default"
});
/**
 * Named volume levels usable as the `volume` attribute of an SSML prosody element.
 * All values are the lower-case labels defined by SSML.
 */
var VOLUME = /* @__PURE__ */ ((VOLUME2) => {
  VOLUME2["SILENT"] = "silent";
  VOLUME2["X_SOFT"] = "x-soft";
  VOLUME2["SOFT"] = "soft";
  VOLUME2["MEDIUM"] = "medium";
  VOLUME2["LOUD"] = "loud";
  // Was "x-LOUD"; the SSML label is lower-case like every other entry here.
  VOLUME2["X_LOUD"] = "x-loud";
  VOLUME2["DEFAULT"] = "default";
  return VOLUME2;
})(VOLUME || {});
// src/edge-tts.ts
import { Buffer as Buffer2 } from "buffer";
import { WebSocket } from "ws";
import fetch from "node-fetch";
import { randomBytes } from "crypto";
import { TextEncoder } from "util";
/**
 * Produce a random hexadecimal string.
 *
 * @param {number} length - Number of random bytes to draw; the returned
 *   string has two hex characters per byte.
 * @returns {string} Lower-case hex encoding of the random bytes.
 */
function generateRandomHex(length) {
  return randomBytes(length).toString("hex");
}
/**
 * Minimal event emitter used for synthesis streams.
 *
 * The events "data", "close", "end" and "error" are pre-registered. Any other
 * event name is registered lazily on first `on()`, and emitting an event that
 * has no listeners is a no-op (previously both cases threw a TypeError).
 */
var EventEmitter = class {
  eventListeners;
  constructor() {
    this.eventListeners = { data: [], close: [], end: [], error: [] };
  }
  /**
   * Register a listener for an event.
   * @param {string} event - Event name.
   * @param {Function} callback - Invoked with the emitted payload.
   */
  on(event, callback) {
    // Own-property check so inherited names such as "toString" are not mistaken for listener lists.
    if (!Object.prototype.hasOwnProperty.call(this.eventListeners, event)) {
      this.eventListeners[event] = [];
    }
    this.eventListeners[event].push(callback);
  }
  /**
   * Invoke every listener registered for an event, in registration order.
   * @param {string} event - Event name.
   * @param {*} data - Payload handed to each listener.
   */
  emit(event, data) {
    if (!Object.prototype.hasOwnProperty.call(this.eventListeners, event)) {
      return;
    }
    this.eventListeners[event].forEach((callback) => callback(data));
  }
};
/**
 * Prosody settings interpolated into the SSML prosody element.
 * A fresh instance carries the defaults: pitch "+0Hz", rate 1, volume 100.
 */
var ProsodyOptions = class {
  constructor() {
    this.pitch = "+0Hz";
    this.rate = 1;
    this.volume = 100;
  }
};
/**
 * WebSocket client for the Microsoft Edge Read Aloud speech synthesis endpoint.
 *
 * Typical use: call `setMetadata()` to choose voice and output format (this
 * opens the socket), then `toStream()` to get an emitter that fires "data"
 * for each audio chunk, "end" with the collected metadata, "close" when the
 * socket closes and "error" if the request could not be sent.
 */
var EdgeTTSClient = class _EdgeTTSClient {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  static CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${_EdgeTTSClient.CLIENT_TOKEN}`;
  static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${_EdgeTTSClient.CLIENT_TOKEN}`;
  // Byte sequence after which the raw audio payload starts inside an audio message.
  static BINARY_DELIM = "Path:audio\r\n";
  static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  enableLogging;
  ws = null;
  voice = null;
  voiceLocale = null;
  outputFormat = null;
  // Maps request id -> emitter returned from sendSSMLRequest().
  requestQueue = {};
  connectionStartTime = 0;

  /**
   * @param {boolean} [enableLogging=false] - When true, diagnostics go to console.log.
   */
  constructor(enableLogging = false) {
    this.enableLogging = enableLogging;
  }

  /** Forward arguments to console.log when logging is enabled. */
  log(...args) {
    if (this.enableLogging) console.log(...args);
  }

  /**
   * Send a message, (re)connecting first when the socket is missing or not open.
   * Makes up to three connection attempts while the socket is still not open.
   * @param {string} message - Raw protocol message.
   * @returns {Promise<void>}
   */
  async sendMessage(message) {
    for (let attempt = 1; attempt <= 3 && (this.ws === null || this.ws.readyState !== WebSocket.OPEN); attempt++) {
      if (attempt === 1) this.connectionStartTime = Date.now();
      this.log(`Connecting... attempt ${attempt}`);
      await this.initWebSocket();
    }
    this.ws?.send(message);
  }

  /**
   * Open a new socket, wire its handlers and send the speech config message.
   * @returns {Promise<void>} Resolves once the config message has been sent;
   *   rejects with a "Connection Error: ..." string on socket error, or with
   *   the send failure if the config message cannot be sent.
   */
  initWebSocket() {
    this.ws = new WebSocket(_EdgeTTSClient.SYNTH_URL);
    this.ws.binaryType = "arraybuffer";
    // Metadata entries collected for the current turn; shared by all messages on this socket.
    const metadataBuffer = [];
    return new Promise((resolve, reject) => {
      this.ws.onopen = () => {
        this.log("Connected in", (Date.now() - this.connectionStartTime) / 1e3, "seconds");
        // Pass reject too so a failed config send cannot leave this promise pending.
        this.sendMessage(this.getConfigMessage()).then(resolve, reject);
      };
      this.ws.onmessage = (event) => this.handleMessage(event, metadataBuffer);
      this.ws.onclose = () => this.handleClose();
      this.ws.onerror = (error) => reject(`Connection Error: ${error.message}`);
    });
  }

  /**
   * Dispatch one incoming socket message based on its `Path:` header.
   * @param {{data: *}} event - Socket message event.
   * @param {Array} metadataBuffer - Accumulator for metadata of the current turn.
   */
  handleMessage(event, metadataBuffer) {
    const buffer = Buffer2.from(event.data);
    const message = buffer.toString();
    const requestIdMatch = /X-RequestId:(.*?)\r\n/.exec(message);
    const requestId = requestIdMatch ? requestIdMatch[1] : "";
    if (message.includes("Path:turn.start")) {
      metadataBuffer.length = 0;
    } else if (message.includes("Path:turn.end")) {
      this.requestQueue[requestId]?.emit("end", metadataBuffer);
    } else if (message.includes("Path:audio.metadata")) {
      // Must be tested before "Path:audio": that shorter string is a prefix of
      // this one, so the old order sent metadata to the audio branch and dropped it.
      try {
        const startIndex = message.indexOf("{");
        metadataBuffer.push(JSON.parse(message.slice(startIndex)).Metadata[0]);
      } catch (error) {
        // A malformed metadata frame should not take down the message handler.
        this.log("Could not parse metadata message", error);
      }
    } else if (message.includes("Path:audio")) {
      this.cacheAudioData(buffer, requestId);
    } else {
      this.log("Unknown Message", message);
    }
  }

  /** Notify every known request emitter that the socket has closed. */
  handleClose() {
    this.log("Disconnected after:", (Date.now() - this.connectionStartTime) / 1e3, "seconds");
    for (const requestId in this.requestQueue) {
      this.requestQueue[requestId].emit("close", null);
    }
  }

  /**
   * Extract the audio payload that follows BINARY_DELIM and emit it as "data".
   * @param {Buffer} buffer - Full message bytes.
   * @param {string} requestId - Id of the request the chunk belongs to.
   */
  cacheAudioData(buffer, requestId) {
    const binaryDelimBytes = new TextEncoder().encode(_EdgeTTSClient.BINARY_DELIM);
    const delimiterIndex = this.findDelimiterIndex(buffer, binaryDelimBytes);
    if (delimiterIndex === -1) {
      this.log("Delimiter not found in the buffer.");
      return;
    }
    const audioDataStart = delimiterIndex + binaryDelimBytes.length;
    const audioData = buffer.subarray(audioDataStart);
    this.requestQueue[requestId]?.emit("data", audioData);
    this.log("Received audio chunk of size:", audioData?.length);
  }

  /**
   * Find the first occurrence of a byte sequence inside another.
   * @param {Uint8Array} buffer - Bytes to search.
   * @param {Uint8Array} delimiter - Bytes to look for.
   * @returns {number} Start index of the match, or -1 when absent.
   */
  findDelimiterIndex(buffer, delimiter) {
    for (let i = 0; i <= buffer.length - delimiter.length; i++) {
      let match = true;
      for (let j = 0; j < delimiter.length; j++) {
        if (buffer[i + j] !== delimiter[j]) {
          match = false;
          break;
        }
      }
      if (match) return i;
    }
    return -1;
  }

  /**
   * Build the speech.config message sent right after connecting.
   * Line breaks are written explicitly so the result does not depend on how
   * this source file is indented.
   * @returns {string}
   */
  getConfigMessage() {
    const headers = "Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n";
    const body = [
      "{",
      '"context": {',
      '"synthesis": {',
      '"audio": {',
      '"metadataoptions": {',
      '"sentenceBoundaryEnabled": "true",',
      '"wordBoundaryEnabled": "true"',
      "},",
      `"outputFormat": "${this.outputFormat}"`,
      "}",
      "}",
      "}",
      "}"
    ].join("\n");
    return headers + body;
  }

  /**
   * Fetch the list of available voices.
   * @returns {Promise<*>} Parsed JSON response.
   * @throws {Error} When the HTTP response is not ok, or the request fails.
   */
  async getVoices() {
    const response = await fetch(_EdgeTTSClient.VOICES_URL);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    return await response.json();
  }

  /**
   * Select the voice, output format and locale, and connect if not connected.
   * @param {string} voiceName - Voice identifier, e.g. "en-GB-RyanNeural".
   * @param {string} outputFormat - One of the OUTPUT_FORMAT values.
   * @param {string} [voiceLocale] - Locale; inferred from voiceName when omitted.
   * @throws {Error} When no locale is given and none can be inferred.
   */
  async setMetadata(voiceName, outputFormat, voiceLocale) {
    this.voice = voiceName;
    this.outputFormat = outputFormat;
    this.voiceLocale = voiceLocale || this.inferLocaleFromVoiceName(voiceName);
    if (!this.voiceLocale) {
      throw new Error("Could not infer voiceLocale from voiceName!");
    }
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
      this.connectionStartTime = Date.now();
      await this.initWebSocket();
    }
  }

  /**
   * Extract the first "xx-XX"-shaped token from a voice name.
   * @param {string} voiceName
   * @returns {string|null} The matched locale, or null when there is none.
   */
  inferLocaleFromVoiceName(voiceName) {
    const match = _EdgeTTSClient.VOICE_LANG_REGEX.exec(voiceName);
    return match ? match[0] : null;
  }

  /** Close the socket if one exists. */
  close() {
    this.ws?.close();
  }

  /**
   * Request synthesis of `text` and return the emitter for its events.
   * The text is inserted into the SSML as-is, so it must already be XML-safe.
   * @param {string} text
   * @param {ProsodyOptions} [options]
   * @returns {EventEmitter}
   */
  toStream(text, options = new ProsodyOptions()) {
    return this.sendSSMLRequest(this.buildSSML(text, options));
  }

  /**
   * Wrap text in speak/voice/prosody SSML using the current voice settings.
   * @param {string} text
   * @param {{pitch: *, rate: *, volume: *}} options
   * @returns {string}
   */
  buildSSML(text, options) {
    return [
      `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this.voiceLocale}">`,
      `<voice name="${this.voice}">`,
      `<prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">`,
      `${text}`,
      "</prosody>",
      "</voice>",
      "</speak>"
    ].join("\n");
  }

  /**
   * Send an SSML document and register an emitter under a fresh request id.
   * @param {string} ssml
   * @returns {EventEmitter} Emitter that receives the events for this request.
   * @throws {Error} When no socket has been created yet.
   */
  sendSSMLRequest(ssml) {
    if (!this.ws) {
      throw new Error("WebSocket not initialized. Call setMetadata first.");
    }
    const requestId = generateRandomHex(16);
    const requestMessage = `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n${ssml.trim()}`;
    const eventEmitter = new EventEmitter();
    this.requestQueue[requestId] = eventEmitter;
    // Surface send/connect failures on the emitter instead of leaving an
    // unhandled rejection; listeners are attached by the time this settles.
    this.sendMessage(requestMessage).catch((error) => {
      this.log("Failed to send SSML request", error);
      eventEmitter.emit("error", error);
    });
    return eventEmitter;
  }
};
// src/generate-mp3.ts
import * as fs from "fs/promises";
import * as path from "path";
// src/utils.ts
/**
 * Swap the two-character comparison operators for their single-character
 * Unicode forms: ">=" becomes "\u2265" and "<=" becomes "\u2264".
 *
 * @param {string} text - Input text.
 * @returns {string} Text with the operators replaced.
 */
function replaceComparisonSymbols(text) {
  // The two operators cannot overlap, so one pass equals two sequential passes.
  return text.replace(/[<>]=/g, (operator) => {
    if (operator === ">=") {
      return "\u2265";
    }
    return "\u2264";
  });
}
/**
 * Escape every ampersand as the XML entity `&amp;`.
 *
 * Kept separate from escapeXml so callers can skip it for text whose
 * ampersands are already escaped. Run it before escapeXml, otherwise the
 * entities that escapeXml produces would be escaped a second time.
 *
 * @param {string} text - Input text.
 * @returns {string} Text with each "&" replaced by "&amp;".
 */
function escapeAmpersand(text) {
  // The replacement used to be a bare "&", which made this function a no-op.
  return text.replace(/&/g, "&amp;");
}
/**
 * Escape the XML-significant characters `<`, `>`, `"` and `'` as entities.
 *
 * Ampersands are intentionally left alone here; escapeAmpersand handles them
 * and must run first so the entities produced below are not double-escaped.
 *
 * @param {string} text - Input text.
 * @returns {string} Text that is safe to embed in XML element content or attributes.
 */
function escapeXml(text) {
  // Each replacement used to be the raw character itself, and the quote case
  // was three bare quotes, which does not even parse.
  return text
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
/**
 * Reduce Markdown to plain, XML-escaped text suitable for narration.
 *
 * Steps, in order: strip leading front matter, URLs and code blocks; unwrap
 * emphasis, inline code and strikethrough; drop heading, list and quote
 * markers and horizontal rules; convert ">=" / "<=" to single symbols; remove
 * HTML-like tags; trim; escape ampersands (unless overridden); escape the
 * remaining XML-significant characters.
 *
 * @param {string} text - Markdown source.
 * @param {boolean} [overrideAmpersandEscape=false] - When true, ampersands are left untouched.
 * @returns {string} Filtered and escaped text.
 */
function filterMarkdown(text, overrideAmpersandEscape = false) {
  // Order matters: each rule runs on the output of the previous one.
  const rules = [
    [/^-{3}[\s\S]*?-{3}\n?/, ""], // front matter at the very start
    [/https?:\/\/[^\s]+/g, ""], // bare URLs
    [/```[\s\S]*?```/g, ""], // fenced code blocks
    [/^( {4}|\t).+/gm, ""], // indented code lines
    [/(\*\*|__)(.*?)\1/g, "$2"], // bold
    [/(\*|_)(.*?)\1/g, "$2"], // italics
    [/`([^`]*)`/g, "$1"], // inline code
    [/~~(.*?)~~/g, "$1"], // strikethrough
    [/^[#*-]+\s*/gm, ""], // heading and bullet markers
    [/^[\-\+\*]\s+/gm, ""], // remaining bullet markers
    [/^\d+\.\s+/gm, ""], // numbered list markers
    [/^>\s+/gm, ""], // block quote markers
    [/^[-*]{3,}\s*$/gm, ""] // horizontal rules
  ];
  let result = text;
  for (const [pattern, replacement] of rules) {
    result = result.replace(pattern, replacement);
  }
  result = replaceComparisonSymbols(result);
  // Strip HTML-like tags only after the comparison operators are gone.
  result = result.replace(/<([^>\s]+)[^>]*>/g, "");
  result = result.trim();
  if (!overrideAmpersandEscape) {
    result = escapeAmpersand(result);
  }
  return escapeXml(result);
}
// src/generate-mp3.ts
/**
 * Settings used by textToSpeechMp3 / batchTextToSpeechMp3 when the caller
 * supplies no options.
 */
var DEFAULT_OPTIONS = {
  // Voice identifier handed to the client's setMetadata().
  voice: "en-GB-RyanNeural",
  // Assigned to the prosody `rate`.
  speed: 1.1,
  // When true the client logs diagnostics to the console.
  enableLogging: false,
  // When true the input text skips filterMarkdown().
  disableFilter: false
};
/**
 * Synthesize `text` and write the audio to `<outputPath>/<fileName>.mp3`.
 *
 * @param {object} params
 * @param {string} params.text - Text to narrate; run through filterMarkdown
 *   unless `options.disableFilter` is set.
 * @param {string} params.outputPath - Directory for the file (created if missing).
 * @param {string} params.fileName - File name; ".mp3" is appended when absent.
 * @param {object} [params.options] - Overrides for DEFAULT_OPTIONS. Missing
 *   keys fall back to the defaults.
 * @returns {Promise<void>} Resolves once the file is written. Rejects when
 *   setup fails, the stream reports an error, the connection closes before
 *   the stream ended, or the file cannot be written.
 */
async function textToSpeechMp3({
  text,
  outputPath,
  fileName,
  options = DEFAULT_OPTIONS
}) {
  // Merge instead of replace: a caller passing only { speed } used to lose
  // the default voice, which made setMetadata() fail.
  const settings = { ...DEFAULT_OPTIONS, ...options };
  const client = new EdgeTTSClient(settings.enableLogging);
  try {
    const finalFileName = fileName.toLowerCase().endsWith(".mp3") ? fileName : `${fileName}.mp3`;
    const fullOutputPath = path.join(outputPath, finalFileName);
    await fs.mkdir(outputPath, { recursive: true });
    await client.setMetadata(settings.voice, "audio-24khz-48kbitrate-mono-mp3" /* AUDIO_24KHZ_48KBITRATE_MONO_MP3 */);
    const prosodyOptions = new ProsodyOptions();
    prosodyOptions.rate = settings.speed ?? DEFAULT_OPTIONS.speed;
    const narration = settings.disableFilter ? text : filterMarkdown(text);
    const stream = client.toStream(narration, prosodyOptions);
    const chunks = [];
    // Set once "end" or "error" has been handled, so the "close" that follows
    // our own client.close() is not reported as a failure.
    let finished = false;
    stream.on("data", (chunk) => {
      chunks.push(chunk);
    });
    return new Promise((resolve, reject) => {
      stream.on("end", async () => {
        finished = true;
        const audioBuffer = Buffer.concat(chunks);
        try {
          await fs.writeFile(fullOutputPath, audioBuffer);
          console.log(`Audio saved to ${fullOutputPath}`);
          resolve();
        } catch (err) {
          console.error("Error writing audio to file:", err);
          reject(err);
        } finally {
          client.close();
        }
      });
      stream.on("error", (error) => {
        finished = true;
        console.error("Stream error:", error);
        client.close();
        reject(error);
      });
      // Without this, a connection that drops mid-stream left the promise pending forever.
      stream.on("close", () => {
        if (finished) return;
        finished = true;
        const error = new Error("Connection closed before the audio stream finished");
        console.error("Stream error:", error);
        reject(error);
      });
    });
  } catch (error) {
    console.error("Error during text-to-speech:", error);
    client.close();
    throw error;
  }
}
/**
 * Convert several texts to MP3 files, one after another.
 *
 * A failure on one input is logged and does not stop the remaining inputs.
 *
 * @param {Array<{text: string, title: string, options?: object}>} inputs -
 *   Items to narrate; `title` is used as the file name.
 * @param {string} outputPath - Directory for the files (created if missing).
 * @param {object} [globalOptions] - Options applied to every input. Per-input
 *   `options` win over these, and these win over DEFAULT_OPTIONS.
 * @returns {Promise<void>}
 */
async function batchTextToSpeechMp3(inputs, outputPath, globalOptions = DEFAULT_OPTIONS) {
  await fs.mkdir(outputPath, { recursive: true });
  for (const input of inputs) {
    try {
      await textToSpeechMp3({
        text: input.text,
        outputPath,
        fileName: input.title,
        options: {
          // Start from the defaults so partial globalOptions (e.g. only
          // { speed }) do not leave voice undefined.
          ...DEFAULT_OPTIONS,
          ...globalOptions,
          ...input.options
        }
      });
    } catch (error) {
      // Deliberate best-effort: report and continue with the next input.
      console.error(`Failed to process "${input.title}":`, error);
    }
  }
}
export {
EdgeTTSClient,
OUTPUT_FORMAT,
PITCH,
ProsodyOptions,
RATE,
VOLUME,
batchTextToSpeechMp3,
textToSpeechMp3
};
//# sourceMappingURL=index.mjs.map