edge-tts-generator
Version:
Generate text-to-speech narration for free, leveraging the Read Aloud feature in Microsoft Edge
310 lines (304 loc) • 12.8 kB
JavaScript
// Short aliases for the Object reflection primitives used by the
// module-interop helpers in this bundle.
var {
  create: __create,
  defineProperty: __defProp,
  getOwnPropertyDescriptor: __getOwnPropDesc,
  getOwnPropertyNames: __getOwnPropNames,
  getPrototypeOf: __getProtoOf
} = Object;
var __hasOwnProp = Object.prototype.hasOwnProperty;
/**
 * Re-expose the own properties of `from` on `to` as getters that read
 * through to `from` at access time.
 *
 * Keys that `to` already owns, and the single key named by `except`, are
 * skipped. A copied property is enumerable when the source property is
 * (or when no descriptor can be obtained for it).
 *
 * @param {object} to - Object receiving the getter properties.
 * @param {*} from - Source; ignored unless it is a non-null object or a function.
 * @param {string} [except] - One key that must not be copied.
 * @param {PropertyDescriptor} [desc] - Scratch slot; overwritten per key.
 * @returns {object} `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = Boolean(from) && typeof from === "object" || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  __getOwnPropNames(from).forEach((key) => {
    if (__hasOwnProp.call(to, key) || key === except) {
      return;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  });
  return to;
};
/**
 * Wrap a CommonJS export value so it can be consumed like an ES module
 * namespace object.
 *
 * @param {*} mod - Value returned by `require(...)`.
 * @param {boolean} [isNodeMode] - Force a `default` property even when `mod.__esModule` is set.
 * @param {object} [target] - Scratch slot; always overwritten.
 * @returns {object} A new object whose prototype mirrors `mod`'s, carrying getters for `mod`'s own properties.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // When the importer is in node compatibility mode, or the required file
  // was not produced by a Babel-compatible ESM->CJS transform (no
  // "__esModule" marker), "default" is set to the whole CommonJS
  // "module.exports" value.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};
// src/tts-cli.ts
var import_commander = require("commander");
// src/edge-tts.ts
var import_buffer = require("buffer");
// src/constants.ts
// Audio output format identifiers; the selected value is placed in the
// "outputFormat" field of the speech.config message.
var OUTPUT_FORMAT = Object.assign(OUTPUT_FORMAT || {}, {
  AUDIO_24KHZ_48KBITRATE_MONO_MP3: "audio-24khz-48kbitrate-mono-mp3",
  AUDIO_24KHZ_96KBITRATE_MONO_MP3: "audio-24khz-96kbitrate-mono-mp3",
  WEBM_24KHZ_16BIT_MONO_OPUS: "webm-24khz-16bit-mono-opus"
});
// src/edge-tts.ts
var import_ws = require("ws");
var import_node_fetch = __toESM(require("node-fetch"));
var import_crypto = require("crypto");
var import_util = require("util");
/**
 * Produce a random hexadecimal string.
 *
 * @param {number} length - Number of random BYTES to draw; the returned
 *   string is twice this long because each byte becomes two hex digits.
 * @returns {string} Lower-case hex encoding of the random bytes.
 */
function generateRandomHex(length) {
  const { randomBytes } = import_crypto;
  return randomBytes(length).toString("hex");
}
/**
 * Minimal event emitter used to deliver the results of one synthesis
 * request ("data", "close", "end" and "error" events).
 *
 * Listener lists for those four events exist from construction. Any other
 * event name gets its list created on first `on()`; emitting an event that
 * has no list is a no-op rather than a TypeError.
 */
var EventEmitter = class {
  // Map of event name -> array of listener callbacks, in registration order.
  eventListeners;
  constructor() {
    this.eventListeners = { data: [], close: [], end: [], error: [] };
  }
  /**
   * Register a listener for an event.
   * @param {string} event - Event name.
   * @param {(data: any) => void} callback - Invoked with the emitted payload.
   */
  on(event, callback) {
    if (!Array.isArray(this.eventListeners[event])) {
      this.eventListeners[event] = [];
    }
    this.eventListeners[event].push(callback);
  }
  /**
   * Synchronously invoke every listener registered for an event.
   * @param {string} event - Event name.
   * @param {any} data - Payload handed to each listener.
   */
  emit(event, data) {
    const listeners = this.eventListeners[event];
    if (!Array.isArray(listeners)) {
      return;
    }
    listeners.forEach((callback) => callback(data));
  }
};
/**
 * Values substituted into the pitch, rate and volume attributes of the
 * SSML <prosody> element.
 */
var ProsodyOptions = class {
  pitch;
  rate;
  volume;
  constructor() {
    // Defaults: no pitch offset, rate 1, volume 100.
    this.pitch = "+0Hz";
    this.rate = 1;
    this.volume = 100;
  }
};
/**
 * WebSocket client for the "readaloud" speech synthesis endpoint on
 * speech.platform.bing.com.
 *
 * Typical use: `await client.setMetadata(voice, format)`, then
 * `client.toStream(text)` which returns an emitter producing "data"
 * (audio chunks), "end" (word/sentence metadata collected for the turn),
 * "close" (socket closed) and "error" (request could not be sent).
 */
var EdgeTTSClient = class _EdgeTTSClient {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  static CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${_EdgeTTSClient.CLIENT_TOKEN}`;
  static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${_EdgeTTSClient.CLIENT_TOKEN}`;
  // Header line that precedes the raw audio bytes inside a binary frame.
  static BINARY_DELIM = "Path:audio\r\n";
  // Matches a locale such as "en-US" inside a voice name.
  static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  enableLogging;
  ws = null;
  voice = null;
  voiceLocale = null;
  outputFormat = null;
  // requestId -> emitter returned to the caller of sendSSMLRequest().
  requestQueue = {};
  connectionStartTime = 0;
  /**
   * @param {boolean} [enableLogging=false] - Print diagnostics via console.log.
   */
  constructor(enableLogging = false) {
    this.enableLogging = enableLogging;
  }
  /** Forward arguments to console.log when logging is enabled. */
  log(...args) {
    if (this.enableLogging) console.log(...args);
  }
  /**
   * Send a raw protocol message, (re)connecting first when the socket is
   * missing or not open. Makes at most three connection attempts.
   * @param {string} message - Full message including header lines.
   * @returns {Promise<void>} Rejects when connecting or sending fails.
   */
  async sendMessage(message) {
    for (let attempt = 1; attempt <= 3 && (this.ws === null || this.ws.readyState !== import_ws.WebSocket.OPEN); attempt++) {
      if (attempt === 1) this.connectionStartTime = Date.now();
      this.log(`Connecting... attempt ${attempt}`);
      await this.initWebSocket();
    }
    if (this.ws != null) {
      this.ws.send(message);
    }
  }
  /**
   * Open a new WebSocket, wire its handlers and send the speech.config
   * message once connected.
   * @returns {Promise<void>} Resolves after the config message is sent;
   *   rejects with an Error when the socket reports an error or the config
   *   message cannot be sent.
   */
  initWebSocket() {
    const ws = new import_ws.WebSocket(_EdgeTTSClient.SYNTH_URL);
    ws.binaryType = "arraybuffer";
    this.ws = ws;
    // Metadata entries collected for the current turn on this connection.
    const metadataBuffer = [];
    return new Promise((resolve, reject) => {
      ws.onopen = () => {
        this.log("Connected in", (Date.now() - this.connectionStartTime) / 1e3, "seconds");
        // Propagate a failed config send instead of leaving the promise pending.
        this.sendMessage(this.getConfigMessage()).then(resolve, reject);
      };
      ws.onmessage = (event) => this.handleMessage(event, metadataBuffer);
      ws.onclose = () => this.handleClose();
      // Reject with an Error (not a bare string) so callers can read `.message`.
      ws.onerror = (error) => reject(new Error(`Connection Error: ${error.message}`));
    });
  }
  /**
   * Dispatch one incoming frame according to its "Path:" header.
   * @param {{data: any}} event - WebSocket message event.
   * @param {Array} metadataBuffer - Per-connection metadata accumulator.
   */
  handleMessage(event, metadataBuffer) {
    const buffer = import_buffer.Buffer.from(event.data);
    const message = buffer.toString();
    const requestIdMatch = /X-RequestId:(.*?)\r\n/.exec(message);
    const requestId = requestIdMatch ? requestIdMatch[1] : "";
    const request = this.requestQueue[requestId];
    if (message.includes("Path:turn.start")) {
      metadataBuffer.length = 0;
    } else if (message.includes("Path:turn.end")) {
      // Hand out a copy: the shared buffer is truncated on the next turn.start.
      if (request != null) request.emit("end", metadataBuffer.slice());
    } else if (message.includes("Path:audio.metadata")) {
      // Must be tested before "Path:audio", which is a prefix of this path.
      const startIndex = message.indexOf("{");
      try {
        metadataBuffer.push(JSON.parse(message.slice(startIndex)).Metadata[0]);
      } catch (error) {
        this.log("Failed to parse audio metadata", error);
      }
    } else if (message.includes("Path:audio")) {
      this.cacheAudioData(buffer, requestId);
    } else {
      this.log("Unknown Message", message);
    }
  }
  /** Notify every known request that the socket has closed. */
  handleClose() {
    this.log("Disconnected after:", (Date.now() - this.connectionStartTime) / 1e3, "seconds");
    for (const requestId of Object.keys(this.requestQueue)) {
      this.requestQueue[requestId].emit("close", null);
    }
  }
  /**
   * Extract the audio payload that follows the BINARY_DELIM header in a
   * binary frame and emit it as a "data" event for the owning request.
   * @param {Buffer} buffer - Whole frame.
   * @param {string} requestId - Id parsed from the frame's headers.
   */
  cacheAudioData(buffer, requestId) {
    const binaryDelimBytes = new import_util.TextEncoder().encode(_EdgeTTSClient.BINARY_DELIM);
    const delimiterIndex = this.findDelimiterIndex(buffer, binaryDelimBytes);
    if (delimiterIndex === -1) {
      this.log("Delimiter not found in the buffer.");
      return;
    }
    const audioDataStart = delimiterIndex + binaryDelimBytes.length;
    const audioData = buffer.subarray(audioDataStart);
    const request = this.requestQueue[requestId];
    if (request != null) request.emit("data", audioData);
    this.log("Received audio chunk of size:", audioData == null ? void 0 : audioData.length);
  }
  /**
   * Find the first occurrence of a byte sequence inside another.
   * @param {ArrayLike<number>} buffer - Bytes to search.
   * @param {ArrayLike<number>} delimiter - Bytes to look for.
   * @returns {number} Start index of the match, or -1 when absent.
   */
  findDelimiterIndex(buffer, delimiter) {
    const lastStart = buffer.length - delimiter.length;
    for (let i = 0; i <= lastStart; i++) {
      let match = true;
      for (let j = 0; j < delimiter.length; j++) {
        if (buffer[i + j] !== delimiter[j]) {
          match = false;
          break;
        }
      }
      if (match) return i;
    }
    return -1;
  }
  /**
   * Build the speech.config message that selects the output format and
   * enables sentence and word boundary metadata.
   * @returns {string} Header lines, a blank line, then the JSON body.
   */
  getConfigMessage() {
    return `Content-Type:application/json; charset=utf-8\r
Path:speech.config\r
\r
{
"context": {
"synthesis": {
"audio": {
"metadataoptions": {
"sentenceBoundaryEnabled": "true",
"wordBoundaryEnabled": "true"
},
"outputFormat": "${this.outputFormat}"
}
}
}
}`;
  }
  /**
   * Fetch the list of available voices.
   * @returns {Promise<any>} Parsed JSON response.
   * @throws {Error} When the HTTP status is not OK or the request fails.
   */
  async getVoices() {
    const response = await (0, import_node_fetch.default)(_EdgeTTSClient.VOICES_URL);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    return await response.json();
  }
  /**
   * Select voice and output format, then make sure a socket is open.
   * @param {string} voiceName - Voice identifier, e.g. "en-US-JennyNeural".
   * @param {string} outputFormat - One of the OUTPUT_FORMAT values.
   * @param {string} [voiceLocale] - Locale; inferred from voiceName when omitted.
   * @throws {Error} When no locale is given and none can be inferred.
   */
  async setMetadata(voiceName, outputFormat, voiceLocale) {
    this.voice = voiceName;
    this.outputFormat = outputFormat;
    this.voiceLocale = voiceLocale || this.inferLocaleFromVoiceName(voiceName);
    if (!this.voiceLocale) {
      throw new Error("Could not infer voiceLocale from voiceName!");
    }
    if (!this.ws || this.ws.readyState !== import_ws.WebSocket.OPEN) {
      this.connectionStartTime = Date.now();
      await this.initWebSocket();
    }
  }
  /**
   * @param {string} voiceName - Voice identifier.
   * @returns {string|null} First "xx-XX"-shaped substring, or null.
   */
  inferLocaleFromVoiceName(voiceName) {
    const match = _EdgeTTSClient.VOICE_LANG_REGEX.exec(voiceName);
    return match ? match[0] : null;
  }
  /** Close the socket if one exists. */
  close() {
    if (this.ws != null) {
      this.ws.close();
    }
  }
  /**
   * Synthesize text with the configured voice.
   * @param {string} text - Text placed inside the SSML as-is (not escaped here).
   * @param {ProsodyOptions} [options] - Pitch, rate and volume.
   * @returns {EventEmitter} Emitter for this request's events.
   */
  toStream(text, options = new ProsodyOptions()) {
    return this.sendSSMLRequest(this.buildSSML(text, options));
  }
  /**
   * Wrap text in <speak>/<voice>/<prosody> markup.
   * @param {string} text - Body text; the caller is responsible for XML-escaping it.
   * @param {ProsodyOptions} options - Prosody attribute values.
   * @returns {string} SSML document.
   */
  buildSSML(text, options) {
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this.voiceLocale}">
<voice name="${this.voice}">
<prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
${text}
</prosody>
</voice>
</speak>`;
  }
  /**
   * Queue an SSML request under a fresh request id and send it.
   * @param {string} ssml - SSML document.
   * @returns {EventEmitter} Emitter registered for the new request id.
   * @throws {Error} When no socket has been created yet.
   */
  sendSSMLRequest(ssml) {
    if (!this.ws) {
      throw new Error("WebSocket not initialized. Call setMetadata first.");
    }
    const requestId = generateRandomHex(16);
    const requestMessage = `X-RequestId:${requestId}\r
Content-Type:application/ssml+xml\r
Path:ssml\r
\r
${ssml.trim()}`;
    const eventEmitter = new EventEmitter();
    this.requestQueue[requestId] = eventEmitter;
    // Surface send/connect failures to the caller instead of leaving an
    // unhandled promise rejection.
    this.sendMessage(requestMessage).catch((error) => eventEmitter.emit("error", error));
    return eventEmitter;
  }
};
// src/utils.ts
/**
 * Swap the two-character comparison operators ">=" and "<=" for their
 * single-character forms (U+2265 and U+2264).
 *
 * @param {string} text - Input text.
 * @returns {string} Text with every ">=" and "<=" replaced.
 */
function replaceComparisonSymbols(text) {
  const symbolFor = { ">=": "\u2265", "<=": "\u2264" };
  return text.replace(/[<>]=/g, (operator) => symbolFor[operator]);
}
/**
 * Escape every ampersand as the XML entity "&amp;" so the text can be
 * embedded in SSML.
 *
 * Existing entities are not recognised: "&lt;" becomes "&amp;lt;".
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with each "&" replaced by "&amp;".
 */
function escapeAmpersand(text) {
  return text.replace(/&/g, "&amp;");
}
/**
 * Escape the XML special characters <, >, " and ' as their predefined
 * entities. Ampersands are deliberately left alone here; they are handled
 * separately by escapeAmpersand.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text safe to place inside XML element content or attributes
 *   (provided ampersands were escaped beforehand).
 */
function escapeXml(text) {
  const entityFor = {
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;"
  };
  return text.replace(/[<>"']/g, (character) => entityFor[character]);
}
/**
 * Reduce Markdown source to plain text suitable for embedding in SSML.
 *
 * Steps, in order: drop a leading "---" front-matter block, drop http(s)
 * URLs, drop fenced and indented code blocks, unwrap emphasis / inline
 * code / strikethrough, drop heading, list, block-quote and rule markers,
 * convert ">=" / "<=" to single symbols, drop tag-like "<...>" spans,
 * trim, then escape "&" (unless overridden) and the remaining XML special
 * characters.
 *
 * @param {string} text - Markdown source.
 * @param {boolean} [overrideAmpersandEscape=false] - When true, ampersands are left unescaped.
 * @returns {string} Filtered, escaped text.
 */
function filterMarkdown(text, overrideAmpersandEscape = false) {
  // Ordered [pattern, replacement] rules; each one runs on the output of
  // the previous one.
  const rules = [
    [/^-{3}[\s\S]*?-{3}\n?/, ""],
    [/https?:\/\/[^\s]+/g, ""],
    [/```[\s\S]*?```/g, ""],
    [/^( {4}|\t).+/gm, ""],
    [/(\*\*|__)(.*?)\1/g, "$2"],
    [/(\*|_)(.*?)\1/g, "$2"],
    [/`([^`]*)`/g, "$1"],
    [/~~(.*?)~~/g, "$1"],
    [/^[#*-]+\s*/gm, ""],
    [/^[\-\+\*]\s+/gm, ""],
    [/^\d+\.\s+/gm, ""],
    [/^>\s+/gm, ""],
    [/^[-*]{3,}\s*$/gm, ""]
  ];
  let plain = text;
  for (const [pattern, replacement] of rules) {
    plain = plain.replace(pattern, replacement);
  }
  // Comparison operators are converted before tag stripping.
  plain = replaceComparisonSymbols(plain);
  plain = plain.replace(/<([^>\s]+)[^>]*>/g, "");
  const trimmed = plain.trim();
  const ampersandSafe = overrideAmpersandEscape ? trimmed : escapeAmpersand(trimmed);
  return escapeXml(ampersandSafe);
}
// src/tts-cli.ts
var import_promises = require("fs/promises");
var import_fs = require("fs");
var import_path = require("path");
// Command-line entry point: read a text file, optionally strip Markdown,
// synthesize it with the chosen voice and write the audio to an .mp3 file.
var program = new import_commander.Command();
program
  .name("tts-generator")
  .description("Generate text-to-speech audio from a UTF-8 encoded text file.")
  .argument("<file>", "Path to the UTF-8 encoded text file")
  .option("-v, --voice <voice>", "Specify the voice to use (e.g., en-US-JennyNeural)", "en-US-JennyNeural")
  .option("-d, --outputFolder <folder>", "Specify the output folder for the audio file", "./output")
  .option("-o, --fileName <fileName>", "Specify the name of the output file", "noname")
  .option("-s, --speed <speed>", "Specify the speech rate (ex. 0.5 = 0.5x playback speed (50% speed)). Default = 1.2", parseFloat, 1.2)
  .option("--disableFilter", "Disable basic text filtering (removes newlines and extra spaces)", false)
  .action(async (file, options) => {
    try {
      const fileContent = await (0, import_promises.readFile)(file, "utf-8");
      let textToSpeak = fileContent;
      if (!options.disableFilter) {
        textToSpeak = filterMarkdown(textToSpeak);
      }
      // "noname" is the option's default and means "derive the name from
      // the input file and the voice".
      const outputFileName = options.fileName === "noname"
        ? `${(0, import_path.basename)(file, (0, import_path.extname)(file))}-${options.voice}.mp3`
        : `${options.fileName}.mp3`;
      const outputPath = (0, import_path.join)(options.outputFolder, outputFileName);
      const client = new EdgeTTSClient();
      await client.setMetadata(options.voice, "audio-24khz-48kbitrate-mono-mp3" /* AUDIO_24KHZ_48KBITRATE_MONO_MP3 */);
      try {
        await (0, import_promises.mkdir)(options.outputFolder, { recursive: true });
      } catch (error) {
        if (error.code !== "EEXIST") {
          console.error(`Error creating output folder: ${error.message}`);
          // Do not leave the socket open when bailing out.
          client.close();
          return;
        }
      }
      const outputFileStream = (0, import_fs.createWriteStream)(outputPath);
      outputFileStream.on("error", (error) => {
        console.error("Error writing to output file:", error);
        client.close();
      });
      const prosodyOptions = new ProsodyOptions();
      prosodyOptions.rate = options.speed;
      // Start the request only now, with no `await` between toStream() and
      // the listener registrations below, so no audio chunk can arrive
      // before the "data" listener is attached.
      const stream = client.toStream(textToSpeak, prosodyOptions);
      stream.on("data", (chunk) => {
        outputFileStream.write(chunk);
      });
      stream.on("end", () => {
        outputFileStream.end();
        console.log(`Successfully generated audio: ${outputPath}`);
        client.close();
      });
      stream.on("error", (error) => {
        console.error("Error during audio generation:", error);
        outputFileStream.end();
        client.close();
      });
    } catch (error) {
      // Rejections are not guaranteed to be Error instances.
      console.error("Error:", error instanceof Error ? error.message : error);
    }
  });
program.parse(process.argv);
//# sourceMappingURL=tts-cli.js.map