UNPKG

@andresaya/edge-tts

Version:

Edge TTS is a package that allows access to the online text-to-speech service used by Microsoft Edge without the need for Microsoft Edge, Windows, or an API key.

503 lines (502 loc) 20.7 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.EdgeTTS = void 0;
const ws_1 = __importDefault(require("ws"));
const constants_1 = require("../config/constants");
const promises_1 = require("fs/promises");
const buffer_1 = require("buffer");
const https_1 = __importDefault(require("https"));

/** Output format requested from the service when the caller does not pick one. */
const DEFAULT_OUTPUT_FORMAT = 'audio-24khz-48kbitrate-mono-mp3';
/** Milliseconds without any server message before a synthesis is aborted. */
const INACTIVITY_TIMEOUT_MS = 30000;
/** Header line that precedes the audio payload inside an audio frame. */
const AUDIO_PATH_HEADER = buffer_1.Buffer.from('Path:audio\r\n');
/** Divisor used by the previous duration heuristic; kept only as a last-resort fallback. */
const LEGACY_BYTES_PER_SECOND = 24000 * 3;

/**
 * Coerces a WebSocket payload into a Node.js Buffer.
 *
 * @param {Buffer|ArrayBuffer|ArrayBufferView|Buffer[]|string} data - Raw message payload.
 * @returns {Buffer} The payload as a Buffer.
 * @throws {Error} When the payload is of an unsupported type.
 */
function ensureBuffer(data) {
    if (buffer_1.Buffer.isBuffer(data)) {
        return data;
    }
    if (data instanceof ArrayBuffer) {
        return buffer_1.Buffer.from(data);
    }
    if (ArrayBuffer.isView(data)) {
        // Typed arrays / DataViews: wrap exactly the viewed bytes.
        return buffer_1.Buffer.from(data.buffer, data.byteOffset, data.byteLength);
    }
    if (Array.isArray(data)) {
        return buffer_1.Buffer.concat(data);
    }
    if (typeof data === 'string') {
        return buffer_1.Buffer.from(data, 'utf-8');
    }
    throw new Error(`Unsupported RawData type: ${typeof data}`);
}

/**
 * Estimates how many bytes one second of audio occupies for an output format name.
 *
 * @param {string} outputFormat - Format name such as "audio-24khz-48kbitrate-mono-mp3".
 * @returns {number|null} Bytes per second, or null when the name carries no usable rate.
 */
function estimateBytesPerSecond(outputFormat) {
    const format = String(outputFormat ?? '');
    // Constant-bitrate formats advertise their bitrate directly in the name.
    const bitrateMatch = /(\d+)kbitrate/i.exec(format);
    if (bitrateMatch) {
        const kbps = Number(bitrateMatch[1]);
        return kbps > 0 ? (kbps * 1000) / 8 : null;
    }
    // Uncompressed / companded formats: sample rate * sample width * channel count.
    const pcmMatch = /(\d+)khz-(\d+)bit-(mono|stereo)-(?:pcm|alaw|mulaw)/i.exec(format);
    if (pcmMatch) {
        const sampleRate = Number(pcmMatch[1]) * 1000;
        const bytesPerSample = Number(pcmMatch[2]) / 8;
        const channels = pcmMatch[3].toLowerCase() === 'stereo' ? 2 : 1;
        const bytesPerSecond = sampleRate * bytesPerSample * channels;
        return bytesPerSecond > 0 ? bytesPerSecond : null;
    }
    return null;
}

/**
 * Client for the online text-to-speech service used by Microsoft Edge.
 *
 * An instance holds the audio chunks and word-boundary metadata of the most
 * recent synthesis; both are reset whenever a new synthesis starts.
 */
class EdgeTTS {
    audio_stream = [];
    audio_format = 'mp3';
    output_format = DEFAULT_OUTPUT_FORMAT;
    word_boundaries = [];
    ws;

    /**
     * Converts raw voice entries into the normalized shape returned by getVoices().
     *
     * @param {Array<object>} data - Raw voice entries (null/undefined is treated as empty).
     * @returns {Promise<Array<object>>} Normalized voice descriptors.
     */
    async normalizeVoices(data) {
        const out = [];
        for (const v of data || []) {
            const short = v?.ShortName || "";
            const locale = v?.Locale || "";
            // base: remove locale prefix and Neural/NeuralHD suffix
            let base = short.replace(/^[a-z]{2}-[A-Z]{2}-/, "");
            base = base.replace(/NeuralHD$/, "").replace(/Neural$/, "").trim();
            // VoiceType: explicit value, otherwise inferred from Name/ShortName
            const mix = `${v?.Name || ""} ${short}`;
            const voiceType = v?.VoiceType || (/NeuralHD/i.test(mix) ? "NeuralHD" : "Neural");
            // LocaleName: prefer LocaleName -> locale -> null
            const localeName = v?.LocaleName || (locale || null);
            // DisplayName: prefer DisplayName -> FriendlyName -> base -> short
            let display = v?.DisplayName || v?.FriendlyName || base || short;
            display = display.replace(/^Microsoft\s+/i, "");
            display = display.split(" - ")[0].trim();
            display = display.replace(/\s*Online\s*\(Natural\)\s*/i, " ");
            display = display.replace(/\s*Online\s*/i, " ");
            display = display.replace(/\s+/g, " ").trim();
            // VoiceTag parsing
            const tag = (v?.VoiceTag && typeof v.VoiceTag === "object") ? v.VoiceTag : {};
            const tailored = Array.isArray(tag.TailoredScenarios)
                ? tag.TailoredScenarios
                : (Array.isArray(tag.ContentCategories) ? tag.ContentCategories : []);
            const personalities = Array.isArray(tag.VoicePersonalities) ? tag.VoicePersonalities : [];
            out.push({
                Name: short || (v?.Name || ""),
                DisplayName: display,
                LocalName: display,
                ShortName: short || (v?.Name || ""),
                Gender: v?.Gender ?? null,
                Locale: locale || null,
                LocaleName: localeName,
                SecondaryLocaleList: Array.isArray(v?.SecondaryLocaleList) ? v.SecondaryLocaleList : [],
                VoiceType: voiceType,
                VoiceTag: {
                    TailoredScenarios: tailored,
                    VoicePersonalities: personalities,
                },
                FriendlyName: `${display} (${voiceType}) - ${localeName}`,
            });
        }
        return out;
    }

    /**
     * Downloads the voice list and returns it in normalized form.
     *
     * @returns {Promise<Array<object>>} Normalized voice descriptors.
     * @throws {Error} On network failure, a non-2xx HTTP status, or an unparsable body.
     */
    async getVoices() {
        const secMsGEC = await this.generateSecMsGec(constants_1.Constants.TRUSTED_CLIENT_TOKEN);
        // NOTE(review): rejectUnauthorized:false disables TLS certificate verification.
        // Kept for backward compatibility; confirm whether it is still required.
        const httpsAgent = new https_1.default.Agent({ rejectUnauthorized: false });
        const url = `${constants_1.Constants.VOICES_URL}` +
            `?TrustedClientToken=${constants_1.Constants.TRUSTED_CLIENT_TOKEN}` +
            `&Sec-MS-GEC=${secMsGEC}` +
            `&Sec-MS-GEC-Version=${constants_1.Constants.VERSION_MS_GEC}`;
        const headers = {
            ...constants_1.Constants.getBaseHeaders(),
            "Accept-Encoding": "identity", // avoid gzip/br/zstd so the body can be parsed as-is
        };
        const voicesRaw = await new Promise((resolve, reject) => {
            const req = https_1.default.request(url, { method: "GET", headers, agent: httpsAgent }, (res) => {
                const chunks = [];
                res.on("data", (chunk) => chunks.push(buffer_1.Buffer.isBuffer(chunk) ? chunk : buffer_1.Buffer.from(chunk)));
                res.on("error", reject);
                res.on("end", () => {
                    const status = res.statusCode ?? 0;
                    if (status < 200 || status >= 300) {
                        // Surface HTTP failures instead of trying to parse an error page.
                        reject(new Error(`Voice list request failed with HTTP status ${status}`));
                        return;
                    }
                    const body = buffer_1.Buffer.concat(chunks).toString("utf8");
                    try {
                        const parsed = JSON.parse(body);
                        // supports a bare array or { voices: [...] } / { Voices: [...] }
                        const voices = Array.isArray(parsed) ? parsed : (parsed.voices || parsed.Voices || []);
                        resolve(Array.isArray(voices) ? voices : []);
                    }
                    catch (e) {
                        reject(new Error("JSON inválido: " + (e?.message || String(e))));
                    }
                });
            });
            req.on("error", reject);
            req.end();
        });
        return this.normalizeVoices(voicesRaw);
    }

    /**
     * Returns the voices whose locale starts with the given prefix.
     *
     * @param {string} locale - Locale prefix, e.g. "en" or "en-US".
     * @returns {Promise<Array<object>>} Matching voices.
     */
    async getVoicesByLanguage(locale) {
        const voices = await this.getVoices();
        // Locale is null for entries that came without one; those never match.
        return voices.filter(voice => typeof voice.Locale === 'string' && voice.Locale.startsWith(locale));
    }

    /**
     * Returns the voices whose Gender equals the given value.
     *
     * @param {string} gender - Gender value to match exactly.
     * @returns {Promise<Array<object>>} Matching voices.
     */
    async getVoicesByGender(gender) {
        const voices = await this.getVoices();
        return voices.filter(voice => voice.Gender === gender);
    }

    /**
     * Generates a random version-4-style UUID string used as request/connection id.
     *
     * @returns {string} UUID in 8-4-4-4-12 hexadecimal form.
     */
    generateUUID() {
        return 'xxxxxxxx-xxxx-xxxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function (c) {
            const r = Math.random() * 16 | 0;
            const v = c === 'x' ? r : (r & 0x3 | 0x8);
            return v.toString(16);
        });
    }

    /**
     * Validates a pitch value and returns it in SSML form.
     *
     * @param {number|string} pitch - Number of Hz, or a string such as "+10Hz".
     * @returns {string} Signed pitch string, e.g. "+0Hz".
     * @throws {Error} When the value is not a finite number or a well-formed Hz string.
     */
    validatePitch(pitch) {
        const invalidMessage = "Invalid pitch format. Expected format: '-100Hz to +100Hz' or a number.";
        if (typeof pitch === 'number') {
            if (!Number.isFinite(pitch)) {
                throw new Error(invalidMessage);
            }
            return (pitch >= 0 ? `+${pitch}Hz` : `${pitch}Hz`);
        }
        if (!/^[+-]?\d{1,3}(?:\.\d+)?Hz$/.test(pitch)) {
            throw new Error(invalidMessage);
        }
        return pitch;
    }

    /**
     * Validates a speaking-rate value and returns it as a signed percentage.
     *
     * @param {number|string} rate - Number, or a string such as "-10%".
     * @returns {string} Signed percentage, e.g. "+0%".
     * @throws {Error} When the value cannot be interpreted as a number.
     */
    validateRate(rate) {
        const rateValue = typeof rate === 'string' ? parseFloat(rate.replace('%', '')) : rate;
        if (typeof rateValue !== 'number' || Number.isNaN(rateValue)) {
            throw new Error("Invalid rate format.");
        }
        return rateValue >= 0 ? `+${rateValue}%` : `${rateValue}%`;
    }

    /**
     * Validates a volume value and returns it as a percentage string.
     *
     * @param {number|string} volume - Number, or a string such as "50%".
     * @returns {string} Percentage string, e.g. "0%".
     * @throws {Error} When the value is not numeric or lies outside -100..100.
     */
    validateVolume(volume) {
        const volumeValue = typeof volume === 'string' ? parseInt(volume.replace('%', ''), 10) : volume;
        if (typeof volumeValue !== 'number' || Number.isNaN(volumeValue)) {
            throw new Error("Invalid volume format.");
        }
        if (volumeValue < -100 || volumeValue > 100) {
            throw new Error("Invalid volume. Expected a value from -100% to 100%.");
        }
        return `${volumeValue}%`;
    }

    /**
     * Synthesizes text and buffers the resulting audio and word boundaries on the instance.
     *
     * @param {string} text - Text to speak.
     * @param {string} [voice='en-US-AnaNeural'] - Voice short name.
     * @param {object} [options] - Optional pitch, rate, volume and outputFormat.
     * @returns {Promise<void>} Resolves when the socket closes; rejects on error or inactivity.
     */
    async synthesize(text, voice = 'en-US-AnaNeural', options = {}) {
        // Build the SSML first so invalid options fail before any socket is opened.
        const SSML_text = this.getSSML(text, voice, options);
        const outputFormat = options.outputFormat || DEFAULT_OUTPUT_FORMAT;
        const secMsGEC = await this.generateSecMsGec(constants_1.Constants.TRUSTED_CLIENT_TOKEN);
        return new Promise((resolve, reject) => {
            // Results of a previous synthesis must not leak into this one.
            this.audio_stream = [];
            this.word_boundaries = [];
            this.output_format = outputFormat;
            const reqId = this.generateUUID();
            const url = `${constants_1.Constants.WSS_URL}?TrustedClientToken=${constants_1.Constants.TRUSTED_CLIENT_TOKEN}&Sec-MS-GEC=${secMsGEC}&Sec-MS-GEC-Version=${constants_1.Constants.VERSION_MS_GEC}&ConnectionId=${reqId}`;
            // NOTE(review): rejectUnauthorized:false disables TLS certificate verification.
            const socket = new ws_1.default(url, {
                headers: constants_1.Constants.getBaseHeaders(),
                rejectUnauthorized: false
            });
            this.ws = socket;
            let timedOut = false;
            let inactivityTimeout;
            const resetInactivityTimeout = () => {
                clearTimeout(inactivityTimeout);
                inactivityTimeout = setTimeout(() => {
                    timedOut = true;
                    if (socket.readyState === ws_1.default.OPEN) {
                        socket.close();
                    }
                    reject(new Error("WebSocket inactivity timeout - no response from server"));
                }, INACTIVITY_TIMEOUT_MS);
            };
            socket.on('open', () => {
                resetInactivityTimeout(); // start the inactivity timeout
                socket.send(this.buildTTSConfigMessage(outputFormat));
                const timestamp = this.nowRFC1123();
                const speechMessage = `X-RequestId:${reqId}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${timestamp}\r\nPath:ssml\r\n\r\n${SSML_text}`;
                socket.send(speechMessage);
            });
            socket.on('message', (data) => {
                resetInactivityTimeout(); // restart inactivity timeout
                this.processAudioData(data);
            });
            socket.on('error', (err) => {
                clearTimeout(inactivityTimeout);
                if (socket.readyState === ws_1.default.OPEN) {
                    socket.close();
                }
                reject(err);
            });
            socket.on('close', () => {
                clearTimeout(inactivityTimeout);
                if (!timedOut) {
                    resolve();
                }
            });
        });
    }

    /**
     * Escapes the five XML special characters.
     *
     * @param {string} text - Text to escape.
     * @returns {string} Escaped text.
     */
    escapeXML(text) {
        return text
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&apos;');
    }

    /**
     * Builds the SSML document sent to the service.
     *
     * @param {string} content - Text to speak (XML-escaped here).
     * @param {string} voice - Voice short name (XML-escaped here).
     * @param {object} [options] - Optional pitch, rate and volume; each defaults to 0.
     * @returns {string} SSML document.
     * @throws {Error} When pitch, rate or volume is invalid.
     */
    getSSML(content, voice, options = {}) {
        const pitch = this.validatePitch(options.pitch ?? 0);
        const rate = this.validateRate(options.rate ?? 0);
        const volume = this.validateVolume(options.volume ?? 0);
        const escapedText = this.escapeXML(content);
        // The voice name lands inside an attribute value, so it must be escaped too.
        const voiceName = this.escapeXML(String(voice));
        return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US"> <voice name="${voiceName}"> <prosody pitch="${pitch}" rate="${rate}" volume="${volume}"> ${escapedText} </prosody> </voice> </speak> `;
    }

    /**
     * Formats the current time for the X-Timestamp header.
     *
     * NOTE(review): despite the name, the value is an en-US locale string produced by
     * toLocaleString, not a strict RFC 1123 date; confirm the service accepts either.
     *
     * @param {string} [timeZone='UTC'] - IANA time zone used for formatting.
     * @returns {string} Formatted timestamp.
     */
    nowRFC1123(timeZone = 'UTC') {
        const now = new Date();
        const options = {
            weekday: 'short',
            year: 'numeric',
            month: 'short',
            day: '2-digit',
            hour: '2-digit',
            minute: '2-digit',
            second: '2-digit',
            timeZone,
            timeZoneName: 'short'
        };
        return now.toLocaleString('en-US', options);
    }

    /**
     * Parses a date string with the Date constructor.
     *
     * @param {string} rfcStr - Date string.
     * @returns {Date} Parsed date (may be invalid if the string is not understood).
     */
    parseRFC1123(rfcStr) {
        return new Date(rfcStr);
    }

    /**
     * Builds the speech.config message that selects the output format.
     *
     * @param {string} [outputFormat] - Output format name.
     * @returns {string} Text frame to send right after the socket opens.
     */
    buildTTSConfigMessage(outputFormat = DEFAULT_OUTPUT_FORMAT) {
        const timestamp = this.nowRFC1123();
        return `X-Timestamp:${timestamp}\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n` +
            `{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":true},"outputFormat":"${outputFormat}"}}}}`;
    }

    /**
     * Synthesizes text and yields audio chunks as they arrive.
     *
     * Chunks are also buffered on the instance, so toBuffer()/toFile() work afterwards.
     * The socket and inactivity timer are released even if the consumer stops early.
     *
     * @param {string} text - Text to speak.
     * @param {string} [voice='en-US-AnaNeural'] - Voice short name.
     * @param {object} [options] - Optional pitch, rate, volume and outputFormat.
     * @yields {Uint8Array} Audio chunks in arrival order.
     * @throws {Error} On socket error or inactivity timeout.
     */
    async *synthesizeStream(text, voice = 'en-US-AnaNeural', options = {}) {
        // Results of a previous synthesis must not leak into this one.
        this.audio_stream = [];
        this.word_boundaries = [];
        // Build the SSML first so invalid options fail before any socket is opened.
        const SSML_text = this.getSSML(text, voice, options);
        const outputFormat = options.outputFormat || DEFAULT_OUTPUT_FORMAT;
        this.output_format = outputFormat;
        const reqId = this.generateUUID();
        const secMsGEC = await this.generateSecMsGec(constants_1.Constants.TRUSTED_CLIENT_TOKEN);
        const url = `${constants_1.Constants.WSS_URL}?TrustedClientToken=${constants_1.Constants.TRUSTED_CLIENT_TOKEN}&Sec-MS-GEC=${secMsGEC}&Sec-MS-GEC-Version=${constants_1.Constants.VERSION_MS_GEC}&ConnectionId=${reqId}`;
        // NOTE(review): rejectUnauthorized:false disables TLS certificate verification.
        const socket = new ws_1.default(url, {
            headers: constants_1.Constants.getBaseHeaders(),
            rejectUnauthorized: false
        });
        this.ws = socket;
        const queue = [];
        let done = false;
        let error = null;
        let notify = null;
        // Wakes the consumer loop if it is currently waiting for data.
        const wake = () => {
            if (notify) {
                const resume = notify;
                notify = null;
                resume();
            }
        };
        let inactivityTimeout;
        const resetInactivityTimeout = () => {
            clearTimeout(inactivityTimeout);
            inactivityTimeout = setTimeout(() => {
                error = new Error("WebSocket inactivity timeout - no response from server");
                done = true;
                if (socket.readyState === ws_1.default.OPEN) {
                    socket.close();
                }
                wake();
            }, INACTIVITY_TIMEOUT_MS);
        };
        socket.on('open', () => {
            resetInactivityTimeout(); // start the inactivity timeout
            socket.send(this.buildTTSConfigMessage(outputFormat));
            const timestamp = this.nowRFC1123();
            const speechMessage = `X-RequestId:${reqId}\r\nContent-Type:application/ssml+xml\r\nX-Timestamp:${timestamp}\r\nPath:ssml\r\n\r\n${SSML_text}`;
            socket.send(speechMessage);
        });
        socket.on('message', (data) => {
            resetInactivityTimeout(); // restart inactivity timeout
            // Shared handler buffers audio/metadata and closes the socket on turn.end.
            const chunk = this.processAudioData(data);
            if (chunk) {
                queue.push(chunk);
                wake();
            }
        });
        socket.on('error', (err) => {
            clearTimeout(inactivityTimeout);
            error = err;
            done = true;
            wake();
        });
        socket.on('close', () => {
            clearTimeout(inactivityTimeout);
            done = true;
            wake();
        });
        try {
            while (!done || queue.length > 0) {
                if (queue.length === 0) {
                    await new Promise(resolve => (notify = resolve));
                    continue;
                }
                const chunk = queue.shift();
                if (chunk) {
                    yield chunk;
                }
            }
            if (error) {
                throw error;
            }
        }
        finally {
            // Runs on normal completion, on error, and when the consumer breaks out early.
            clearTimeout(inactivityTimeout);
            if (socket.readyState === ws_1.default.OPEN || socket.readyState === ws_1.default.CONNECTING) {
                socket.close();
            }
        }
    }

    /**
     * Handles one incoming WebSocket message.
     *
     * Audio payloads are appended to audio_stream, word-boundary metadata to
     * word_boundaries, and a "Path:turn.end" message closes the current socket.
     *
     * @param {Buffer|ArrayBuffer|Buffer[]|string} data - Raw message payload.
     * @returns {Uint8Array|null} The audio chunk extracted from this message, if any.
     */
    processAudioData(data) {
        const buffer = ensureBuffer(data);
        let audioChunk = null;
        const audioStartIndex = buffer.indexOf(AUDIO_PATH_HEADER);
        if (audioStartIndex !== -1) {
            // Copy the payload so it does not keep referencing the socket's receive buffer.
            audioChunk = new Uint8Array(buffer.subarray(audioStartIndex + AUDIO_PATH_HEADER.length));
            this.audio_stream.push(audioChunk);
        }
        // Decode once; the offsets below are taken from the same decoded text.
        const text = buffer.toString();
        if (text.includes("Path:audio.metadata")) {
            const metadataStart = text.indexOf("\r\n\r\n") + 4;
            const meta = this.parseMetadata(text.substring(metadataStart));
            if (meta !== null) {
                this.word_boundaries.push(meta);
            }
            return audioChunk;
        }
        if (text.includes("Path:turn.end")) {
            this.ws?.close();
        }
        return audioChunk;
    }

    /**
     * Extracts the first WordBoundary entry from a metadata JSON payload.
     *
     * @param {string} data - JSON text of the metadata message body.
     * @param {number} [offsetCompensation=0] - Value added to the reported offset.
     * @returns {{type: string, offset: number, duration: number, text: (string|undefined)}|null}
     *   The word boundary, or null when the payload is invalid or contains none.
     */
    parseMetadata(data, offsetCompensation = 0) {
        let metadata;
        try {
            metadata = JSON.parse(data);
        }
        catch {
            return null;
        }
        // Valid JSON may still be null, a scalar, or lack a Metadata array.
        if (!Array.isArray(metadata?.Metadata)) {
            return null;
        }
        for (const metaObj of metadata.Metadata) {
            if (metaObj?.Type === "WordBoundary" && metaObj.Data) {
                return {
                    type: "WordBoundary",
                    offset: metaObj.Data.Offset + offsetCompensation,
                    duration: metaObj.Data.Duration,
                    text: metaObj.Data.text?.Text,
                };
            }
        }
        return null;
    }

    /**
     * Computes the Sec-MS-GEC token: the upper-case hex SHA-256 of the current time
     * (1601-epoch ticks, rounded down to 5 minutes) concatenated with the client token.
     *
     * @param {string} trustedClientToken - Trusted client token.
     * @returns {Promise<string>} Upper-case hexadecimal digest.
     */
    generateSecMsGec = async (trustedClientToken) => {
        // Take whole seconds straight from the clock rather than re-parsing a
        // locale-formatted string, whose parsing is implementation-defined.
        const unixSeconds = Math.floor(Date.now() / 1000);
        const ticks = unixSeconds + 11644473600; // seconds between 1601-01-01 and 1970-01-01
        const rounded = ticks - (ticks % 300);
        const windowsTicks = rounded * 10_000_000; // 100-nanosecond intervals
        const encoder = new TextEncoder();
        const data = encoder.encode(`${windowsTicks}${trustedClientToken}`);
        const hashBuffer = await crypto.subtle.digest('SHA-256', data);
        return Array.from(new Uint8Array(hashBuffer))
            .map(b => b.toString(16).padStart(2, '0'))
            .join('')
            .toUpperCase();
    };

    /**
     * Estimates the duration of the buffered audio from its size and output format.
     *
     * @returns {number} Estimated duration in seconds.
     * @throws {Error} When no audio has been synthesized.
     */
    getDuration() {
        if (this.audio_stream.length === 0) {
            throw new Error("No audio data available");
        }
        const bufferSize = this.toBuffer().length;
        // Formats whose name carries no usable rate fall back to the legacy divisor.
        const bytesPerSecond = estimateBytesPerSecond(this.output_format) ?? LEGACY_BYTES_PER_SECOND;
        return bufferSize / bytesPerSecond;
    }

    /**
     * Maps an output format name to a file extension.
     *
     * @param {string} format - Output format name.
     * @returns {string} File extension without the dot; "audio" when unrecognized.
     */
    getFileExtension(format) {
        if (format.includes('mp3'))
            return 'mp3';
        if (format.includes('opus') && format.includes('webm'))
            return 'webm';
        if (format.includes('opus') && format.includes('ogg'))
            return 'ogg';
        if (format.includes('wav') || format.includes('riff'))
            return 'wav';
        if (format.includes('pcm') && format.includes('raw'))
            return 'pcm';
        if (format.includes('alaw'))
            return 'alaw';
        if (format.includes('mulaw'))
            return 'mulaw';
        if (format.includes('truesilk'))
            return 'silk';
        if (format.includes('g722'))
            return 'g722';
        if (format.includes('amr'))
            return 'amr';
        return 'audio';
    }

    /**
     * Summarizes the buffered audio.
     *
     * @returns {{size: number, format: string, estimatedDuration: number}} Audio summary.
     * @throws {Error} When no audio has been synthesized.
     */
    getAudioInfo() {
        const buffer = this.toBuffer();
        return {
            size: buffer.length,
            format: this.getFileExtension(this.output_format),
            estimatedDuration: this.getDuration()
        };
    }

    /**
     * Writes the buffered audio to `${outputPath}.${format}`.
     *
     * @param {string} outputPath - Destination path without extension.
     * @param {string} [format] - Extension; derived from the output format when omitted.
     * @returns {Promise<string>} The path that was written.
     * @throws {Error} When no audio has been synthesized.
     */
    async toFile(outputPath, format) {
        if (!format) {
            format = this.getFileExtension(this.output_format);
        }
        const audioBuffer = this.toBuffer();
        const finalPath = `${outputPath}.${format}`;
        await (0, promises_1.writeFile)(finalPath, new Uint8Array(audioBuffer));
        return finalPath;
    }

    /**
     * Alias of toBase64().
     *
     * @returns {string} Base64-encoded audio.
     */
    toRaw() {
        return this.toBase64();
    }

    /**
     * Returns the buffered audio as a Base64 string.
     *
     * @returns {string} Base64-encoded audio.
     * @throws {Error} When no audio has been synthesized.
     */
    toBase64() {
        return this.toBuffer().toString('base64');
    }

    /**
     * Concatenates the buffered audio chunks.
     *
     * @returns {Buffer} All audio received so far.
     * @throws {Error} When no audio has been synthesized.
     */
    toBuffer() {
        if (this.audio_stream.length === 0) {
            throw new Error("No audio data available. Did you run synthesize() first?");
        }
        return buffer_1.Buffer.concat(this.audio_stream);
    }

    /**
     * Writes the collected word boundaries to a JSON file.
     *
     * @param {string} outputPath - Destination file path.
     * @returns {Promise<void>}
     * @throws {Error} When no word boundaries were collected.
     */
    async saveMetadata(outputPath) {
        if (this.word_boundaries.length === 0) {
            throw new Error("No metadata available to save.");
        }
        const json = JSON.stringify(this.word_boundaries, null, 4);
        await (0, promises_1.writeFile)(outputPath, json);
    }

    /**
     * Returns the word boundaries collected during the most recent synthesis.
     *
     * @returns {Array<object>} Word boundary entries.
     */
    getWordBoundaries() {
        return this.word_boundaries;
    }
}
exports.EdgeTTS = EdgeTTS;