UNPKG

edge-tts-generator

Version:

Generate text-to-speech narration for free, leveraging the Read Aloud feature in Microsoft Edge

342 lines (338 loc) • 11.9 kB

JavaScript

// src/constants.ts

/**
 * Output format identifiers that can be passed to `EdgeTTSClient#setMetadata`.
 * The value is interpolated verbatim into the `speech.config` message.
 */
var OUTPUT_FORMAT = /* @__PURE__ */ ((OUTPUT_FORMAT2) => {
  OUTPUT_FORMAT2["AUDIO_24KHZ_48KBITRATE_MONO_MP3"] = "audio-24khz-48kbitrate-mono-mp3";
  OUTPUT_FORMAT2["AUDIO_24KHZ_96KBITRATE_MONO_MP3"] = "audio-24khz-96kbitrate-mono-mp3";
  OUTPUT_FORMAT2["WEBM_24KHZ_16BIT_MONO_OPUS"] = "webm-24khz-16bit-mono-opus";
  return OUTPUT_FORMAT2;
})(OUTPUT_FORMAT || {});

/** Named labels usable as the `pitch` attribute of an SSML `<prosody>` element. */
var PITCH = /* @__PURE__ */ ((PITCH2) => {
  PITCH2["X_LOW"] = "x-low";
  PITCH2["LOW"] = "low";
  PITCH2["MEDIUM"] = "medium";
  PITCH2["HIGH"] = "high";
  PITCH2["X_HIGH"] = "x-high";
  PITCH2["DEFAULT"] = "default";
  return PITCH2;
})(PITCH || {});

/** Named labels usable as the `rate` attribute of an SSML `<prosody>` element. */
var RATE = /* @__PURE__ */ ((RATE2) => {
  RATE2["X_SLOW"] = "x-slow";
  RATE2["SLOW"] = "slow";
  RATE2["MEDIUM"] = "medium";
  RATE2["FAST"] = "fast";
  RATE2["X_FAST"] = "x-fast";
  RATE2["DEFAULT"] = "default";
  return RATE2;
})(RATE || {});

/** Named labels usable as the `volume` attribute of an SSML `<prosody>` element. */
var VOLUME = /* @__PURE__ */ ((VOLUME2) => {
  VOLUME2["SILENT"] = "silent";
  VOLUME2["X_SOFT"] = "x-soft";
  VOLUME2["SOFT"] = "soft";
  VOLUME2["MEDIUM"] = "medium";
  VOLUME2["LOUD"] = "loud";
  // SSML labels are lowercase; the previous value "x-LOUD" is not a valid label.
  VOLUME2["X_LOUD"] = "x-loud";
  VOLUME2["DEFAULT"] = "default";
  return VOLUME2;
})(VOLUME || {});

// src/edge-tts.ts
import { Buffer as Buffer2 } from "buffer";
import { WebSocket } from "ws";
import fetch from "node-fetch";
import { randomBytes } from "crypto";
import { TextEncoder } from "util";

/**
 * Returns `length` random bytes rendered as a lowercase hex string
 * (so the result is `2 * length` characters long).
 */
function generateRandomHex(length) {
  const randomBuffer = randomBytes(length);
  return randomBuffer.toString("hex");
}

/**
 * Minimal event emitter handed back to callers for each synthesis request.
 * The events emitted by this module are "data", "end", "close" and "error".
 */
var EventEmitter = class {
  eventListeners;
  constructor() {
    this.eventListeners = { data: [], close: [], end: [], error: [] };
  }
  /** Registers `callback` to be invoked each time `event` is emitted. */
  on(event, callback) {
    // Create the listener list on demand so an unexpected event name does not throw.
    (this.eventListeners[event] ??= []).push(callback);
  }
  /** Invokes every listener registered for `event` with `data`, in registration order. */
  emit(event, data) {
    this.eventListeners[event]?.forEach((callback) => callback(data));
  }
};

/**
 * Values interpolated into the `pitch`, `rate` and `volume` attributes of the
 * `<prosody>` element built by `EdgeTTSClient#buildSSML`.
 */
var ProsodyOptions = class {
  pitch = "+0Hz";
  rate = 1;
  volume = 100;
};

/**
 * WebSocket client for the Microsoft Edge "Read Aloud" speech synthesis endpoint.
 *
 * Typical usage: call `setMetadata()` to choose a voice/output format and open the
 * connection, then `toStream()` to obtain an emitter that yields audio chunks.
 */
var EdgeTTSClient = class _EdgeTTSClient {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  static CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${_EdgeTTSClient.CLIENT_TOKEN}`;
  static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${_EdgeTTSClient.CLIENT_TOKEN}`;
  static BINARY_DELIM = "Path:audio\r\n";
  // Matches `ll-RR`, `lll-RR` and `ll-Ssss-RR` locale tags on a word boundary. The
  // previous pattern (/\w{2}-\w{2}/) produced "il-PH" for "fil-PH-…" and "iu-La"
  // for "iu-Latn-CA-…".
  static VOICE_LANG_REGEX = /\b[a-z]{2,3}(?:-[a-z]{4})?-[a-z]{2}\b/i;
  enableLogging;
  ws = null;
  voice = null;
  voiceLocale = null;
  outputFormat = null;
  requestQueue = {};
  connectionStartTime = 0;

  /**
   * @param enableLogging When true, diagnostic messages are written with `console.log`.
   */
  constructor(enableLogging = false) {
    this.enableLogging = enableLogging;
  }

  /** Forwards its arguments to `console.log` when logging is enabled. */
  log(...args) {
    if (this.enableLogging) console.log(...args);
  }

  /**
   * Sends `message` over the socket, (re)connecting first if the socket is not open.
   * Up to three connection attempts are made; a failed attempt no longer aborts the
   * remaining ones.
   *
   * @throws The last connection error (or a generic Error) if the socket is still
   *         not open after all attempts, instead of silently dropping the message.
   */
  async sendMessage(message) {
    const isOpen = () => this.ws !== null && this.ws.readyState === WebSocket.OPEN;
    let lastError = null;
    for (let attempt = 1; attempt <= 3 && !isOpen(); attempt++) {
      if (attempt === 1) this.connectionStartTime = Date.now();
      this.log(`Connecting... attempt ${attempt}`);
      try {
        await this.initWebSocket();
      } catch (error) {
        lastError = error;
        this.log(`Connection attempt ${attempt} failed:`, error);
      }
    }
    if (!isOpen()) {
      throw lastError ?? new Error("WebSocket is not open; message was not sent.");
    }
    this.ws.send(message);
  }

  /**
   * Opens a new WebSocket to `SYNTH_URL`, stores it in `this.ws`, and sends the
   * `speech.config` message as soon as the socket opens.
   *
   * @returns A promise that resolves once the config message has been handed to the
   *          socket, and rejects with an `Error` if the connection errors, closes
   *          before opening, or the config message cannot be sent.
   */
  initWebSocket() {
    const socket = new WebSocket(_EdgeTTSClient.SYNTH_URL);
    socket.binaryType = "arraybuffer";
    this.ws = socket;
    const metadataBuffer = [];
    return new Promise((resolve, reject) => {
      let opened = false;
      socket.onopen = () => {
        opened = true;
        this.log("Connected in", (Date.now() - this.connectionStartTime) / 1e3, "seconds");
        try {
          // Send on this socket directly: it is known to be open at this point.
          socket.send(this.getConfigMessage());
          resolve();
        } catch (error) {
          reject(error);
        }
      };
      socket.onmessage = (event) => this.handleMessage(event, metadataBuffer);
      socket.onclose = () => {
        if (opened) {
          this.handleClose();
        } else {
          // Never opened: settle the promise so callers do not wait forever. Pending
          // requests are not notified here because a retry may still succeed.
          reject(new Error("Connection closed before it was established"));
        }
      };
      socket.onerror = (error) => reject(new Error(`Connection Error: ${error.message}`));
    });
  }

  /**
   * Dispatches one incoming frame based on the `Path:` marker found in it.
   *
   * - `turn.start` clears the per-connection metadata buffer.
   * - `turn.end` emits "end" (with a copy of the collected metadata) on the request.
   * - `audio.metadata` appends the first `Metadata` entry of the JSON payload.
   * - `audio` forwards the binary payload to `cacheAudioData`.
   *
   * @param event          Message event; `event.data` is converted with `Buffer.from`.
   * @param metadataBuffer Array shared by all frames of one connection.
   */
  handleMessage(event, metadataBuffer) {
    const buffer = Buffer2.from(event.data);
    const message = buffer.toString();
    const requestIdMatch = /X-RequestId:(.*?)\r\n/.exec(message);
    const requestId = requestIdMatch ? requestIdMatch[1] : "";
    if (message.includes("Path:turn.start")) {
      metadataBuffer.length = 0;
    } else if (message.includes("Path:turn.end")) {
      // Emit a copy: the shared buffer is cleared by the next turn.start.
      this.requestQueue[requestId]?.emit("end", [...metadataBuffer]);
    } else if (message.includes("Path:audio.metadata")) {
      // Must be tested before "Path:audio", which is a prefix of this marker;
      // in the old order this branch could never run.
      try {
        const startIndex = message.indexOf("{");
        metadataBuffer.push(JSON.parse(message.slice(startIndex)).Metadata[0]);
      } catch (error) {
        this.log("Could not parse audio metadata:", error);
      }
    } else if (message.includes("Path:audio")) {
      this.cacheAudioData(buffer, requestId);
    } else {
      this.log("Unknown Message", message);
    }
  }

  /** Logs the connection lifetime and emits "close" on every request in the queue. */
  handleClose() {
    this.log("Disconnected after:", (Date.now() - this.connectionStartTime) / 1e3, "seconds");
    for (const requestId in this.requestQueue) {
      this.requestQueue[requestId].emit("close", null);
    }
  }

  /**
   * Strips everything up to and including the `BINARY_DELIM` marker from `buffer`
   * and emits the remainder as a "data" event on the matching request.
   * Frames without the marker are logged and ignored.
   */
  cacheAudioData(buffer, requestId) {
    const binaryDelimBytes = new TextEncoder().encode(_EdgeTTSClient.BINARY_DELIM);
    const delimiterIndex = this.findDelimiterIndex(buffer, binaryDelimBytes);
    if (delimiterIndex === -1) {
      this.log("Delimiter not found in the buffer.");
      return;
    }
    const audioDataStart = delimiterIndex + binaryDelimBytes.length;
    const audioData = buffer.subarray(audioDataStart);
    this.requestQueue[requestId]?.emit("data", audioData);
    this.log("Received audio chunk of size:", audioData?.length);
  }

  /**
   * Helper function to find the index of a byte sequence within another byte sequence.
   *
   * @returns Index of the first occurrence of `delimiter` in `buffer`, or -1.
   */
  findDelimiterIndex(buffer, delimiter) {
    for (let i = 0; i <= buffer.length - delimiter.length; i++) {
      let match = true;
      for (let j = 0; j < delimiter.length; j++) {
        if (buffer[i + j] !== delimiter[j]) {
          match = false;
          break;
        }
      }
      if (match) return i;
    }
    return -1;
  }

  /**
   * Builds the `speech.config` message: two header lines, a blank line, then a JSON
   * body enabling sentence/word boundary metadata and naming the output format.
   * Header lines are joined with explicit CRLF so the framing does not depend on
   * how this source file's own line breaks are stored.
   */
  getConfigMessage() {
    const body = {
      context: {
        synthesis: {
          audio: {
            metadataoptions: {
              sentenceBoundaryEnabled: "true",
              wordBoundaryEnabled: "true"
            },
            outputFormat: `${this.outputFormat}`
          }
        }
      }
    };
    return [
      "Content-Type:application/json; charset=utf-8",
      "Path:speech.config",
      "",
      JSON.stringify(body)
    ].join("\r\n");
  }

  /**
   * Fetches the voice list from `VOICES_URL`.
   *
   * @returns The parsed JSON response body.
   * @throws Error when the HTTP response status is not OK, or on network failure.
   */
  async getVoices() {
    const response = await fetch(_EdgeTTSClient.VOICES_URL);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    return await response.json();
  }

  /**
   * Stores the voice, output format and locale for subsequent requests and opens
   * the WebSocket if it is not already open.
   *
   * @param voiceName    Voice identifier, e.g. "en-GB-RyanNeural".
   * @param outputFormat One of the `OUTPUT_FORMAT` values.
   * @param voiceLocale  Optional locale; inferred from `voiceName` when omitted.
   * @throws Error when no locale is given and none can be inferred.
   */
  async setMetadata(voiceName, outputFormat, voiceLocale) {
    this.voice = voiceName;
    this.outputFormat = outputFormat;
    this.voiceLocale = voiceLocale || this.inferLocaleFromVoiceName(voiceName);
    if (!this.voiceLocale) {
      throw new Error("Could not infer voiceLocale from voiceName!");
    }
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN) {
      this.connectionStartTime = Date.now();
      await this.initWebSocket();
    }
  }

  /** Returns the first locale tag found in `voiceName`, or null if there is none. */
  inferLocaleFromVoiceName(voiceName) {
    const match = _EdgeTTSClient.VOICE_LANG_REGEX.exec(voiceName);
    return match ? match[0] : null;
  }

  /** Closes the WebSocket if one exists. */
  close() {
    this.ws?.close();
  }

  /**
   * Requests synthesis of `text` and returns the emitter for that request.
   *
   * @param text    Text placed inside the `<prosody>` element. It is inserted as-is,
   *                so callers must XML-escape it (see `filterMarkdown`).
   * @param options Prosody settings; defaults to a fresh `ProsodyOptions`.
   */
  toStream(text, options = new ProsodyOptions()) {
    return this.sendSSMLRequest(this.buildSSML(text, options));
  }

  /** Wraps `text` in `<speak>/<voice>/<prosody>` using the stored voice and locale. */
  buildSSML(text, options) {
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this.voiceLocale}">
      <voice name="${this.voice}">
        <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
          ${text}
        </prosody>
      </voice>
    </speak>`;
  }

  /**
   * Registers a new request in `requestQueue` and sends the SSML asynchronously.
   *
   * @returns The emitter for the request. A failure to send is reported through its
   *          "error" event rather than as an unhandled promise rejection.
   * @throws Error when `setMetadata` has not created a socket yet.
   */
  sendSSMLRequest(ssml) {
    if (!this.ws) {
      throw new Error("WebSocket not initialized. Call setMetadata first.");
    }
    const requestId = generateRandomHex(16);
    // Explicit CRLF framing: headers, blank line, then the SSML body.
    const requestMessage = [
      `X-RequestId:${requestId}`,
      "Content-Type:application/ssml+xml",
      "Path:ssml",
      "",
      ssml.trim()
    ].join("\r\n");
    const eventEmitter = new EventEmitter();
    this.requestQueue[requestId] = eventEmitter;
    // The rejection arrives asynchronously, after the caller has attached listeners.
    this.sendMessage(requestMessage).catch((error) => eventEmitter.emit("error", error));
    return eventEmitter;
  }
};

// src/generate-mp3.ts
import * as fs from "fs/promises";
import * as path from "path";

// src/utils.ts

/** Replaces ">=" with "≥" and "<=" with "≤". */
function replaceComparisonSymbols(text) {
  return text.replace(/>=/g, "\u2265").replace(/<=/g, "\u2264");
}

/** Escapes every "&" as the XML entity "&amp;". */
function escapeAmpersand(text) {
  return text.replace(/&/g, "&amp;");
}

/**
 * Escapes `<`, `>`, `"` and `'` as XML entities. "&" is intentionally left alone;
 * see `escapeAmpersand`, which must run first when both are applied.
 */
function escapeXml(text) {
  return text
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}

/**
 * Reduces Markdown to plain text that can be embedded in SSML.
 *
 * Removes front matter, URLs, fenced and indented code blocks, emphasis/strike/inline
 * code markers, heading/list/quote prefixes, horizontal rules and HTML-like tags, then
 * XML-escapes the result.
 *
 * @param text                    Markdown source.
 * @param overrideAmpersandEscape When true, "&" is left unescaped.
 * @returns The cleaned, escaped text.
 */
function filterMarkdown(text, overrideAmpersandEscape = false) {
  const noFrontmatter = text.replace(/^-{3}[\s\S]*?-{3}\n?/, "");
  const noUrls = noFrontmatter.replace(/https?:\/\/[^\s]+/g, "");
  const noCodeBlocks = noUrls.replace(/```[\s\S]*?```/g, "").replace(/^( {4}|\t).+/gm, "");
  let cleanedMarkdown = noCodeBlocks
    .replace(/(\*\*|__)(.*?)\1/g, "$2")
    .replace(/(\*|_)(.*?)\1/g, "$2")
    .replace(/`([^`]*)`/g, "$1")
    .replace(/~~(.*?)~~/g, "$1")
    .replace(/^[#*-]+\s*/gm, "")
    .replace(/^[\-\+\*]\s+/gm, "")
    .replace(/^\d+\.\s+/gm, "")
    .replace(/^>\s+/gm, "")
    .replace(/^[-*]{3,}\s*$/gm, "");
  // Convert ">=" / "<=" before tag stripping so they are not mistaken for markup.
  cleanedMarkdown = replaceComparisonSymbols(cleanedMarkdown);
  cleanedMarkdown = cleanedMarkdown.replace(/<([^>\s]+)[^>]*>/g, "");
  cleanedMarkdown = overrideAmpersandEscape ? cleanedMarkdown.trim() : escapeAmpersand(cleanedMarkdown.trim());
  const finalText = escapeXml(cleanedMarkdown);
  return finalText;
}

// src/generate-mp3.ts

/** Defaults applied field-by-field to the `options` of `textToSpeechMp3`. */
var DEFAULT_OPTIONS = {
  voice: "en-GB-RyanNeural",
  speed: 1.1,
  enableLogging: false,
  disableFilter: false
};

/**
 * Synthesizes `text` and writes the audio to `<outputPath>/<fileName>.mp3`.
 *
 * @param text       Text (Markdown is filtered unless `options.disableFilter` is set).
 * @param outputPath Directory for the file; created recursively if missing.
 * @param fileName   Target file name; ".mp3" is appended when absent.
 * @param options    Any subset of `DEFAULT_OPTIONS`; missing fields use the defaults.
 * @returns A promise that resolves once the file is written. It rejects if the
 *          stream reports an error, the file cannot be written, or the connection
 *          closes before synthesis finishes.
 */
async function textToSpeechMp3({ text, outputPath, fileName, options = DEFAULT_OPTIONS }) {
  // Fill in defaults per field so a partial object such as { speed: 1.2 } still
  // gets a voice; previously only a wholly omitted `options` was defaulted.
  const given = options ?? {};
  const settings = {
    voice: given.voice ?? DEFAULT_OPTIONS.voice,
    speed: given.speed ?? DEFAULT_OPTIONS.speed,
    enableLogging: given.enableLogging ?? DEFAULT_OPTIONS.enableLogging,
    disableFilter: given.disableFilter ?? DEFAULT_OPTIONS.disableFilter
  };
  const client = new EdgeTTSClient(settings.enableLogging);
  try {
    const finalFileName = fileName.toLowerCase().endsWith(".mp3") ? fileName : `${fileName}.mp3`;
    const fullOutputPath = path.join(outputPath, finalFileName);
    await fs.mkdir(outputPath, { recursive: true });
    await client.setMetadata(settings.voice, "audio-24khz-48kbitrate-mono-mp3" /* AUDIO_24KHZ_48KBITRATE_MONO_MP3 */);
    const prosodyOptions = new ProsodyOptions();
    prosodyOptions.rate = settings.speed;
    if (!settings.disableFilter) {
      text = filterMarkdown(text);
    }
    const stream = client.toStream(text, prosodyOptions);
    const chunks = [];
    stream.on("data", (chunk) => {
      chunks.push(chunk);
    });
    return new Promise((resolve, reject) => {
      // Set by whichever terminal event arrives first; later ones are ignored.
      let finished = false;
      stream.on("end", async () => {
        if (finished) return;
        finished = true;
        const audioBuffer = Buffer.concat(chunks);
        try {
          await fs.writeFile(fullOutputPath, audioBuffer);
          console.log(`Audio saved to ${fullOutputPath}`);
          resolve();
        } catch (err) {
          console.error("Error writing audio to file:", err);
          reject(err);
        } finally {
          client.close();
        }
      });
      stream.on("error", (error) => {
        if (finished) return;
        finished = true;
        console.error("Stream error:", error);
        client.close();
        reject(error);
      });
      stream.on("close", () => {
        // Without this, a connection dropped before "end" left the promise pending forever.
        if (finished) return;
        finished = true;
        const error = new Error("Connection closed before audio synthesis completed");
        console.error("Stream error:", error);
        reject(error);
      });
    });
  } catch (error) {
    console.error("Error during text-to-speech:", error);
    client.close();
    throw error;
  }
}

/**
 * Runs `textToSpeechMp3` sequentially for each input. A failure for one input is
 * logged and does not stop the remaining inputs.
 *
 * @param inputs        Items with `text`, `title` (used as file name) and optional `options`.
 * @param outputPath    Directory for all generated files; created recursively if missing.
 * @param globalOptions Options applied to every input; per-input options take precedence.
 */
async function batchTextToSpeechMp3(inputs, outputPath, globalOptions = DEFAULT_OPTIONS) {
  await fs.mkdir(outputPath, { recursive: true });
  for (const input of inputs) {
    try {
      await textToSpeechMp3({
        text: input.text,
        outputPath,
        fileName: input.title,
        options: { ...globalOptions, ...input.options || {} }
      });
    } catch (error) {
      console.error(`Failed to process "${input.title}":`, error);
    }
  }
}

export {
  EdgeTTSClient,
  OUTPUT_FORMAT,
  PITCH,
  ProsodyOptions,
  RATE,
  VOLUME,
  batchTextToSpeechMp3,
  textToSpeechMp3
};
//# sourceMappingURL=index.mjs.map