UNPKG

edge-tts-generator

Version:

Generate text-to-speech narration for free, leveraging the Read Aloud feature in Microsoft Edge

310 lines (304 loc) • 12.8 kB

JavaScript

#!/usr/bin/env node
// Command-line entry point of edge-tts-generator: reads a UTF-8 text file,
// optionally strips Markdown, and writes the narration produced by the
// Microsoft Edge Read Aloud speech service to an MP3 file.

// ---------------------------------------------------------------------------
// Bundler (esbuild) CommonJS/ESM interop helpers. Kept as generated.
// ---------------------------------------------------------------------------
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));

// ---------------------------------------------------------------------------
// Dependencies
// ---------------------------------------------------------------------------
var import_commander = require("commander");
var import_buffer = require("buffer");
var import_ws = require("ws");
var import_node_fetch = __toESM(require("node-fetch"));
var import_crypto = require("crypto");
var import_util = require("util");
var import_promises = require("fs/promises");
var import_fs = require("fs");
var import_path = require("path");

// src/constants.ts
/** Audio output formats that can be requested from the speech service. */
var OUTPUT_FORMAT = /* @__PURE__ */ ((OUTPUT_FORMAT2) => {
  OUTPUT_FORMAT2["AUDIO_24KHZ_48KBITRATE_MONO_MP3"] = "audio-24khz-48kbitrate-mono-mp3";
  OUTPUT_FORMAT2["AUDIO_24KHZ_96KBITRATE_MONO_MP3"] = "audio-24khz-96kbitrate-mono-mp3";
  OUTPUT_FORMAT2["WEBM_24KHZ_16BIT_MONO_OPUS"] = "webm-24khz-16bit-mono-opus";
  return OUTPUT_FORMAT2;
})(OUTPUT_FORMAT || {});

// src/edge-tts.ts
/**
 * Returns a random hexadecimal string.
 *
 * @param {number} length Number of random BYTES to draw; the returned string
 *   is therefore `2 * length` characters long.
 * @returns {string} Lower-case hex encoding of the random bytes.
 */
function generateRandomHex(length) {
  return import_crypto.randomBytes(length).toString("hex");
}

/**
 * Minimal event emitter used to deliver the results of one synthesis request.
 * Only the events "data", "close", "end" and "error" have listener lists;
 * calling `on`/`emit` with any other event name throws a TypeError.
 */
var EventEmitter = class {
  eventListeners;
  constructor() {
    this.eventListeners = { data: [], close: [], end: [], error: [] };
  }
  /**
   * Registers a listener.
   * @param {"data"|"close"|"end"|"error"} event
   * @param {(payload: any) => void} callback
   */
  on(event, callback) {
    this.eventListeners[event].push(callback);
  }
  /**
   * Synchronously invokes every listener registered for `event`.
   * @param {"data"|"close"|"end"|"error"} event
   * @param {any} data Payload handed to each listener.
   */
  emit(event, data) {
    this.eventListeners[event].forEach((callback) => callback(data));
  }
};

/** Prosody settings interpolated into the SSML `<prosody>` element. */
var ProsodyOptions = class {
  pitch = "+0Hz";
  rate = 1;
  volume = 100;
};

/**
 * WebSocket client for the speech synthesis endpoint used by the Read Aloud
 * feature. Typical use: `await setMetadata(...)`, then `toStream(text)` and
 * listen for "data" / "end" / "error" / "close" on the returned emitter.
 */
var EdgeTTSClient = class _EdgeTTSClient {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  static CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${_EdgeTTSClient.CLIENT_TOKEN}`;
  static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${_EdgeTTSClient.CLIENT_TOKEN}`;
  // Header line that immediately precedes the raw audio bytes in a binary frame.
  static BINARY_DELIM = "Path:audio\r\n";
  static VOICE_LANG_REGEX = /\w{2}-\w{2}/;

  enableLogging;
  ws = null;
  voice = null;
  voiceLocale = null;
  outputFormat = null;
  // Maps request id -> EventEmitter of the request that owns it.
  requestQueue = {};
  connectionStartTime = 0;

  /**
   * @param {boolean} [enableLogging=false] When true, diagnostics are written
   *   with `console.log`.
   */
  constructor(enableLogging = false) {
    this.enableLogging = enableLogging;
  }

  /** Forwards its arguments to `console.log` when logging is enabled. */
  log(...args) {
    if (this.enableLogging) {
      console.log(...args);
    }
  }

  /**
   * Sends a message over the socket, opening the connection first if needed.
   * Up to three connection attempts are made while the socket is not open; a
   * rejected attempt propagates to the caller.
   *
   * @param {string} message Complete protocol message (headers + body).
   * @returns {Promise<void>}
   */
  async sendMessage(message) {
    const isOpen = () => this.ws !== null && this.ws.readyState === import_ws.WebSocket.OPEN;
    for (let attempt = 1; attempt <= 3 && !isOpen(); attempt++) {
      if (attempt === 1) {
        this.connectionStartTime = Date.now();
      }
      this.log(`Connecting... attempt ${attempt}`);
      await this.initWebSocket();
    }
    if (this.ws != null) {
      this.ws.send(message);
    }
  }

  /**
   * Opens a new WebSocket, wires its handlers and sends the configuration
   * message once the socket is open.
   *
   * @returns {Promise<void>} Resolves after the configuration message has been
   *   sent; rejects with an `Error` when the socket reports an error or the
   *   configuration message cannot be sent.
   */
  initWebSocket() {
    const socket = new import_ws.WebSocket(_EdgeTTSClient.SYNTH_URL);
    socket.binaryType = "arraybuffer";
    this.ws = socket;
    // Metadata collected for the turn currently in progress on this socket.
    const metadataBuffer = [];
    return new Promise((resolve, reject) => {
      socket.onopen = () => {
        this.log("Connected in", (Date.now() - this.connectionStartTime) / 1e3, "seconds");
        // Forward a failed config send instead of leaving the promise pending.
        this.sendMessage(this.getConfigMessage()).then(resolve, reject);
      };
      socket.onmessage = (event) => this.handleMessage(event, metadataBuffer);
      socket.onclose = () => this.handleClose();
      // Reject with a real Error so that callers can rely on `error.message`.
      socket.onerror = (error) => reject(new Error(`Connection Error: ${error.message}`));
    });
  }

  /**
   * Dispatches one incoming frame according to its `Path:` header.
   *
   * @param {{ data: any }} event WebSocket message event.
   * @param {object[]} metadataBuffer Per-connection metadata accumulator; it is
   *   cleared on "turn.start" and passed to "end" listeners on "turn.end".
   */
  handleMessage(event, metadataBuffer) {
    const buffer = import_buffer.Buffer.from(event.data);
    const message = buffer.toString();
    const requestIdMatch = /X-RequestId:(.*?)\r\n/.exec(message);
    const requestId = requestIdMatch ? requestIdMatch[1] : "";
    const emitter = this.requestQueue[requestId];

    if (message.includes("Path:turn.start")) {
      metadataBuffer.length = 0;
    } else if (message.includes("Path:turn.end")) {
      if (emitter != null) {
        emitter.emit("end", metadataBuffer);
      }
    } else if (message.includes("Path:audio.metadata")) {
      // Must be tested BEFORE "Path:audio": that string is a prefix of
      // "Path:audio.metadata", so the reverse order never reaches this branch.
      try {
        const startIndex = message.indexOf("{");
        metadataBuffer.push(JSON.parse(message.slice(startIndex)).Metadata[0]);
      } catch (error) {
        // A malformed metadata frame must not take the message handler down.
        this.log("Could not parse audio metadata", error);
      }
    } else if (message.includes("Path:audio")) {
      this.cacheAudioData(buffer, requestId);
    } else {
      this.log("Unknown Message", message);
    }
  }

  /** Notifies every known request that the socket has closed. */
  handleClose() {
    this.log("Disconnected after:", (Date.now() - this.connectionStartTime) / 1e3, "seconds");
    for (const requestId in this.requestQueue) {
      this.requestQueue[requestId].emit("close", null);
    }
  }

  /**
   * Extracts the audio payload that follows the `Path:audio` header line and
   * emits it as a "data" event on the owning request.
   *
   * @param {Buffer} buffer Raw frame (headers followed by audio bytes).
   * @param {string} requestId Request the frame belongs to.
   */
  cacheAudioData(buffer, requestId) {
    const binaryDelimBytes = new import_util.TextEncoder().encode(_EdgeTTSClient.BINARY_DELIM);
    const delimiterIndex = this.findDelimiterIndex(buffer, binaryDelimBytes);
    if (delimiterIndex === -1) {
      this.log("Delimiter not found in the buffer.");
      return;
    }
    const audioData = buffer.subarray(delimiterIndex + binaryDelimBytes.length);
    const emitter = this.requestQueue[requestId];
    if (emitter != null) {
      emitter.emit("data", audioData);
    }
    this.log("Received audio chunk of size:", audioData == null ? void 0 : audioData.length);
  }

  /**
   * Finds the first occurrence of one byte sequence inside another.
   *
   * @param {ArrayLike<number>} buffer Bytes to search in.
   * @param {ArrayLike<number>} delimiter Bytes to search for.
   * @returns {number} Index of the first match, or -1 when there is none.
   */
  findDelimiterIndex(buffer, delimiter) {
    const lastStart = buffer.length - delimiter.length;
    for (let start = 0; start <= lastStart; start++) {
      let offset = 0;
      while (offset < delimiter.length && buffer[start + offset] === delimiter[offset]) {
        offset++;
      }
      if (offset === delimiter.length) {
        return start;
      }
    }
    return -1;
  }

  /**
   * Builds the `speech.config` message that enables sentence/word boundary
   * metadata and selects the output format.
   *
   * @returns {string} Protocol message: CRLF-separated headers, a blank line,
   *   then the JSON configuration.
   */
  getConfigMessage() {
    const config = {
      context: {
        synthesis: {
          audio: {
            metadataoptions: {
              sentenceBoundaryEnabled: "true",
              wordBoundaryEnabled: "true"
            },
            // String() keeps an unset format serialised as the text "null".
            outputFormat: String(this.outputFormat)
          }
        }
      }
    };
    return `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n${JSON.stringify(config)}`;
  }

  /**
   * Fetches the list of available voices.
   *
   * @returns {Promise<any>} Parsed JSON body of the voices endpoint.
   * @throws {Error} When the request fails or the HTTP status is not OK.
   */
  async getVoices() {
    const response = await import_node_fetch.default(_EdgeTTSClient.VOICES_URL);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    return await response.json();
  }

  /**
   * Selects voice, output format and locale, and connects if not connected.
   *
   * @param {string} voiceName Voice identifier, e.g. "en-US-JennyNeural".
   * @param {string} outputFormat One of the `OUTPUT_FORMAT` values.
   * @param {string} [voiceLocale] Locale; when omitted it is taken from the
   *   first `xx-XX`-shaped substring of `voiceName`.
   * @returns {Promise<void>}
   * @throws {Error} When no locale is given and none can be inferred.
   */
  async setMetadata(voiceName, outputFormat, voiceLocale) {
    this.voice = voiceName;
    this.outputFormat = outputFormat;
    this.voiceLocale = voiceLocale || this.inferLocaleFromVoiceName(voiceName);
    if (!this.voiceLocale) {
      throw new Error("Could not infer voiceLocale from voiceName!");
    }
    if (!this.ws || this.ws.readyState !== import_ws.WebSocket.OPEN) {
      this.connectionStartTime = Date.now();
      await this.initWebSocket();
    }
  }

  /**
   * @param {string} voiceName
   * @returns {string|null} First match of `VOICE_LANG_REGEX`, or null.
   */
  inferLocaleFromVoiceName(voiceName) {
    const match = _EdgeTTSClient.VOICE_LANG_REGEX.exec(voiceName);
    return match ? match[0] : null;
  }

  /** Closes the socket if one exists. */
  close() {
    if (this.ws != null) {
      this.ws.close();
    }
  }

  /**
   * Requests synthesis of `text`.
   *
   * @param {string} text Text placed verbatim inside the SSML document, so it
   *   must already be XML-safe.
   * @param {ProsodyOptions} [options] Pitch, rate and volume.
   * @returns {EventEmitter} Emitter for "data", "end", "error" and "close".
   */
  toStream(text, options = new ProsodyOptions()) {
    return this.sendSSMLRequest(this.buildSSML(text, options));
  }

  /**
   * Wraps `text` in a `<speak>/<voice>/<prosody>` SSML document using the
   * configured voice and locale. No escaping is applied to `text`.
   *
   * @param {string} text
   * @param {ProsodyOptions} options
   * @returns {string} SSML document.
   */
  buildSSML(text, options) {
    return [
      `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this.voiceLocale}">`,
      `  <voice name="${this.voice}">`,
      `    <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">`,
      `      ${text}`,
      `    </prosody>`,
      `  </voice>`,
      `</speak>`
    ].join("\n");
  }

  /**
   * Sends an SSML document and returns the emitter that receives its results.
   *
   * @param {string} ssml
   * @returns {EventEmitter} A send failure is delivered as an "error" event.
   * @throws {Error} When no socket has been created yet.
   */
  sendSSMLRequest(ssml) {
    if (!this.ws) {
      throw new Error("WebSocket not initialized. Call setMetadata first.");
    }
    const requestId = generateRandomHex(16);
    const requestMessage = `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n${ssml.trim()}`;
    const eventEmitter = new EventEmitter();
    this.requestQueue[requestId] = eventEmitter;
    // Listeners are attached by the caller after this method returns; the
    // rejection handler runs asynchronously, so they are in place by then.
    this.sendMessage(requestMessage).catch((error) => eventEmitter.emit("error", error));
    return eventEmitter;
  }
};

// src/utils.ts
// Names of the five predefined XML entities, keyed by the character they
// stand for. The entity text is assembled at runtime from these names so the
// source contains no literal entity that an HTML-decoding step could collapse
// back into the raw character.
var XML_ENTITY_NAMES = { "&": "amp", "<": "lt", ">": "gt", '"': "quot", "'": "apos" };

/**
 * @param {string} char One of the keys of `XML_ENTITY_NAMES`.
 * @returns {string} The entity reference for `char` (ampersand + name + semicolon).
 */
function toXmlEntity(char) {
  return `&${XML_ENTITY_NAMES[char]};`;
}

/**
 * Replaces ">=" with U+2265 and "<=" with U+2264.
 * @param {string} text
 * @returns {string}
 */
function replaceComparisonSymbols(text) {
  return text.replace(/>=/g, "\u2265").replace(/<=/g, "\u2264");
}

/**
 * Replaces every ampersand with its XML entity reference.
 * @param {string} text
 * @returns {string}
 */
function escapeAmpersand(text) {
  return text.replace(/&/g, toXmlEntity);
}

/**
 * Replaces `<`, `>`, double quotes and single quotes with their XML entity
 * references. Ampersands are NOT touched; see `escapeAmpersand`.
 * @param {string} text
 * @returns {string}
 */
function escapeXml(text) {
  return text.replace(/[<>"']/g, toXmlEntity);
}

/**
 * Strips common Markdown syntax and makes the result safe to embed in SSML.
 *
 * Removes front matter, URLs, fenced and indented code blocks, emphasis and
 * strike-through markers, heading/list/quote prefixes, horizontal rules and
 * tag-like `<...>` spans, then XML-escapes what is left.
 *
 * @param {string} text Markdown source.
 * @param {boolean} [overrideAmpersandEscape=false] When true, ampersands are
 *   left as they are.
 * @returns {string} Filtered, trimmed and escaped text.
 */
function filterMarkdown(text, overrideAmpersandEscape = false) {
  const noFrontmatter = text.replace(/^-{3}[\s\S]*?-{3}\n?/, "");
  const noUrls = noFrontmatter.replace(/https?:\/\/[^\s]+/g, "");
  const noCodeBlocks = noUrls
    .replace(/```[\s\S]*?```/g, "")
    .replace(/^( {4}|\t).+/gm, "");
  let cleanedMarkdown = noCodeBlocks
    .replace(/(\*\*|__)(.*?)\1/g, "$2")
    .replace(/(\*|_)(.*?)\1/g, "$2")
    .replace(/`([^`]*)`/g, "$1")
    .replace(/~~(.*?)~~/g, "$1")
    .replace(/^[#*-]+\s*/gm, "")
    .replace(/^[\-\+\*]\s+/gm, "")
    .replace(/^\d+\.\s+/gm, "")
    .replace(/^>\s+/gm, "")
    .replace(/^[-*]{3,}\s*$/gm, "");
  // Comparison operators are converted first so "<=" / ">=" are not mistaken
  // for the start or end of a tag by the tag-stripping step below.
  cleanedMarkdown = replaceComparisonSymbols(cleanedMarkdown);
  cleanedMarkdown = cleanedMarkdown.replace(/<([^>\s]+)[^>]*>/g, "");
  cleanedMarkdown = cleanedMarkdown.trim();
  if (!overrideAmpersandEscape) {
    // Ampersands go first, otherwise the entities produced by escapeXml
    // would themselves be escaped.
    cleanedMarkdown = escapeAmpersand(cleanedMarkdown);
  }
  return escapeXml(cleanedMarkdown);
}

// src/tts-cli.ts
/**
 * CLI action: synthesises the contents of `file` into an MP3.
 *
 * @param {string} file Path of the UTF-8 text file to narrate.
 * @param {{ voice: string, outputFolder: string, fileName: string,
 *           speed: number, disableFilter: boolean }} options Parsed options.
 * @returns {Promise<void>} Failures are reported on stderr and through a
 *   non-zero `process.exitCode`; the promise itself does not reject.
 */
async function runTtsCli(file, options) {
  let client = null;
  try {
    const fileContent = await import_promises.readFile(file, "utf-8");
    const textToSpeak = options.disableFilter ? fileContent : filterMarkdown(fileContent);

    // "noname" is the option's default value, i.e. no name was supplied: fall
    // back to "<input basename>-<voice>.mp3".
    const outputFileName = options.fileName === "noname"
      ? `${import_path.basename(file, import_path.extname(file))}-${options.voice}.mp3`
      : `${options.fileName}.mp3`;
    const outputPath = import_path.join(options.outputFolder, outputFileName);

    // Prepare the output folder BEFORE connecting, so a failure here leaves no
    // open socket behind and nothing is awaited once synthesis has started.
    try {
      await import_promises.mkdir(options.outputFolder, { recursive: true });
    } catch (error) {
      if (error.code !== "EEXIST") {
        console.error(`Error creating output folder: ${error.message}`);
        process.exitCode = 1;
        return;
      }
    }

    client = new EdgeTTSClient();
    await client.setMetadata(options.voice, "audio-24khz-48kbitrate-mono-mp3" /* AUDIO_24KHZ_48KBITRATE_MONO_MP3 */);
    const prosodyOptions = new ProsodyOptions();
    prosodyOptions.rate = options.speed;

    const outputFileStream = import_fs.createWriteStream(outputPath);
    outputFileStream.on("error", (error) => {
      console.error("Error writing to output file:", error);
      process.exitCode = 1;
      client.close();
    });

    // The listeners below are attached synchronously after the request is
    // issued, so no audio chunk can arrive before they exist.
    const stream = client.toStream(textToSpeak, prosodyOptions);
    stream.on("data", (chunk) => {
      outputFileStream.write(chunk);
    });
    stream.on("end", () => {
      outputFileStream.end();
      console.log(`Successfully generated audio: ${outputPath}`);
      client.close();
    });
    stream.on("error", (error) => {
      console.error("Error during audio generation:", error);
      process.exitCode = 1;
      outputFileStream.end();
      client.close();
    });
  } catch (error) {
    console.error("Error:", error instanceof Error ? error.message : error);
    process.exitCode = 1;
    if (client !== null) {
      client.close();
    }
  }
}

var program = new import_commander.Command();
program
  .name("tts-generator")
  .description("Generate text-to-speech audio from a UTF-8 encoded text file.")
  .argument("<file>", "Path to the UTF-8 encoded text file")
  .option("-v, --voice <voice>", "Specify the voice to use (e.g., en-US-JennyNeural)", "en-US-JennyNeural")
  .option("-d, --outputFolder <folder>", "Specify the output folder for the audio file", "./output")
  .option("-o, --fileName <fileName>", "Specify the name of the output file", "noname")
  .option("-s, --speed <speed>", "Specify the speech rate (ex. 0.5 = 0.5x playback speed (50% speed)). Default = 1.2", parseFloat, 1.2)
  .option("--disableFilter", "Disable basic text filtering (removes newlines and extra spaces)", false)
  .action(runTtsCli);
program.parse(process.argv);
//# sourceMappingURL=tts-cli.js.map