UNPKG

js-tts-wrapper

Version:

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

432 lines (424 loc) 15.5 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.SAPITTSClient = void 0; const abstract_tts_1 = require("../core/abstract-tts"); const node_child_process_1 = require("node:child_process"); const node_fs_1 = require("node:fs"); const node_os_1 = require("node:os"); const node_path_1 = require("node:path"); /** * SAPI TTS Client for Windows environments * * This client uses Windows Speech API (SAPI) through PowerShell to provide: * - High-quality Windows TTS synthesis * - SSML support * - Rich voice metadata * - Word boundary events * - Rate, pitch, and volume controls */ class SAPITTSClient extends abstract_tts_1.AbstractTTSClient { constructor(credentials = {}) { super(credentials); // Validate Windows environment this.validateEnvironment(); // Set a default voice (will be determined at runtime) this.voiceId = null; } /** * Validate that we're running on Windows with PowerShell available */ validateEnvironment() { // Check if we're in a Node.js environment if (typeof process === "undefined" || !process.versions || !process.versions.node) { throw new Error("SAPITTSClient is only supported in Node.js environments"); } // Check if we're on Windows if (process.platform !== "win32") { throw new Error(`SAPITTSClient is only supported on Windows. Current platform: ${process.platform}`); } } /** * SAPI does not require credentials but we validate the environment */ async checkCredentials() { try { this.validateEnvironment(); // Test if PowerShell and System.Speech are available const testScript = ` try { Add-Type -AssemblyName System.Speech $synth = [System.Speech.Synthesis.SpeechSynthesizer]::new() $voices = $synth.GetInstalledVoices() Write-Output "OK" } catch { Write-Error $_.Exception.Message exit 1 } `; const result = await this.runPowerShellScript(testScript); return result.trim() === "OK"; } catch (error) { console.error("SAPI TTS not available:", error); return false; } } /** * Get available SAPI voices with rich metadata */ async getVoices() { try { const script = ` Add-Type -AssemblyName System.Speech $synth = [System.Speech.Synthesis.SpeechSynthesizer]::new() $voices = $synth.GetInstalledVoices() $voiceData = @() foreach ($voice in $voices) { if ($voice.Enabled) { $info = $voice.VoiceInfo $voiceObj = @{ Name = $info.Name Id = $info.Id Gender = $info.Gender.ToString() Age = $info.Age.ToString() Culture = $info.Culture.Name Language = $info.Culture.DisplayName Description = $info.Description } $voiceData += $voiceObj } } $voiceData | ConvertTo-Json -Depth 3 `; const result = await this.runPowerShellScript(script); const parsedResult = JSON.parse(result); // Ensure we have an array (PowerShell returns single object when there's only one voice) const voiceData = Array.isArray(parsedResult) ? parsedResult : [parsedResult]; // Convert to unified voice format const unifiedVoices = voiceData.map((voice) => ({ id: voice.Id || voice.Name || "unknown", name: voice.Name || "Unknown Voice", gender: voice.Gender === "Male" || voice.Gender === "Female" ? voice.Gender : "Unknown", provider: "sapi", languageCodes: [ { bcp47: voice.Culture || "en-US", iso639_3: this.convertCultureToISO639(voice.Culture || "en-US"), display: voice.Language || "English (United States)", }, ], })); return unifiedVoices; } catch (error) { console.error("Error getting SAPI voices:", error); return []; } } /** * Get raw voices from SAPI */ async _getVoices() { return this.getVoices(); } /** * Run a PowerShell script and return the output */ async runPowerShellScript(script) { return new Promise((resolve, reject) => { const powershell = (0, node_child_process_1.spawn)("powershell.exe", [ "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command", script, ]); let stdout = ""; let stderr = ""; powershell.stdout.on("data", (data) => { stdout += data.toString(); }); powershell.stderr.on("data", (data) => { stderr += data.toString(); }); powershell.on("close", (code) => { if (code === 0) { resolve(stdout); } else { reject(new Error(`PowerShell script failed with code ${code}: ${stderr}`)); } }); powershell.on("error", (error) => { reject(new Error(`Failed to start PowerShell: ${error.message}`)); }); }); } /** * Convert culture code to ISO 639-3 language code */ convertCultureToISO639(culture) { const cultureMap = { "en-US": "eng", "en-GB": "eng", "en-AU": "eng", "en-CA": "eng", "es-ES": "spa", "es-MX": "spa", "fr-FR": "fra", "fr-CA": "fra", "de-DE": "deu", "it-IT": "ita", "pt-BR": "por", "pt-PT": "por", "ru-RU": "rus", "ja-JP": "jpn", "ko-KR": "kor", "zh-CN": "cmn", "zh-TW": "cmn", "ar-SA": "ara", "hi-IN": "hin", "th-TH": "tha", "vi-VN": "vie", "nl-NL": "nld", "sv-SE": "swe", "da-DK": "dan", "no-NO": "nor", "fi-FI": "fin", "pl-PL": "pol", "cs-CZ": "ces", "hu-HU": "hun", "tr-TR": "tur", "he-IL": "heb", }; const langCode = culture.split("-")[0]; return cultureMap[culture] || cultureMap[langCode] || "eng"; } /** * Synthesize text to audio bytes using SAPI */ async synthToBytes(text, options) { try { // Create a temporary filename for the audio export const tempFilename = (0, node_path_1.join)((0, node_os_1.tmpdir)(), `${SAPITTSClient.TEMP_PREFIX}${Date.now()}.wav`); // Prepare synthesis options const voice = options?.voice || this.voiceId || null; const rate = this.convertRate(options?.rate); const volume = this.convertVolume(options?.volume); // Prepare text for synthesis (ensure proper SSML format if needed) const processedText = this._isSSML(text) ? this.ensureProperSSML(text) : text; const escapedText = this.escapePowerShellString(processedText); // Build PowerShell script for synthesis const script = ` Add-Type -AssemblyName System.Speech [Console]::InputEncoding = [System.Text.Encoding]::UTF8 $synth = [System.Speech.Synthesis.SpeechSynthesizer]::new() # Set voice if specified ${voice ? ` try { $synth.SelectVoice("${this.escapePowerShellString(voice)}") } catch { # If voice selection fails, continue with default voice Write-Warning "Could not select voice '${this.escapePowerShellString(voice)}', using default voice" }` : ""} # Set speech properties $synth.Rate = ${rate} $synth.Volume = ${volume} # Set output to WAV file $synth.SetOutputToWaveFile("${this.escapePowerShellString(tempFilename)}") try { # Synthesize speech (supports both plain text and SSML) ${this._isSSML(text) ? '$synth.SpeakSsml($text)' : '$synth.Speak($text)'} Write-Output "SUCCESS" } catch { Write-Error $_.Exception.Message exit 1 } finally { $synth.Dispose() } `; // Execute PowerShell script const result = await this.runPowerShellScript(`$text = "${escapedText}"; ${script}`); if (!result.includes("SUCCESS")) { throw new Error("SAPI synthesis failed"); } // Read the generated WAV file if (!(0, node_fs_1.existsSync)(tempFilename)) { throw new Error("SAPI failed to generate audio file"); } const audioBuffer = (0, node_fs_1.readFileSync)(tempFilename); // Clean up the temporary file try { (0, node_fs_1.unlinkSync)(tempFilename); } catch (cleanupError) { console.warn("Could not clean up temporary file:", cleanupError); } // Create estimated word timings (SAPI doesn't provide real-time events in this mode) this._createEstimatedWordTimings(this._isSSML(text) ? this.stripSSML(text) : text); return new Uint8Array(audioBuffer); } catch (error) { console.error("Error synthesizing speech with SAPI:", error); throw error; } } /** * Synthesize text to a byte stream with word boundaries */ async synthToBytestream(text, options) { try { // Get the audio bytes first const audioBytes = await this.synthToBytes(text, options); // For now, use estimated word boundaries // TODO: Implement real-time word boundary events using SAPI events const plainText = this._isSSML(text) ? this.stripSSML(text) : text; const words = plainText.split(/\s+/).filter((word) => word.length > 0); const estimatedDuration = 0.3; // Estimated duration per word in seconds const wordBoundaries = []; let currentTime = 0; for (const word of words) { if (word.trim()) { wordBoundaries.push({ text: word, offset: currentTime * 1000, // Convert to milliseconds duration: estimatedDuration * 1000, // Convert to milliseconds }); currentTime += estimatedDuration; } } // Create a readable stream from the audio bytes const audioStream = new ReadableStream({ start(controller) { controller.enqueue(audioBytes); controller.close(); }, }); return { audioStream, wordBoundaries, }; } catch (error) { console.error("Error synthesizing speech to stream with SAPI:", error); throw error; } } /** * Convert rate option to SAPI rate format * @param rate Rate option (string or number) * @returns Rate value for SAPI (-10 to 10) */ convertRate(rate) { if (!rate) return 0; // Default rate if (typeof rate === "string") { switch (rate.toLowerCase()) { case "x-slow": return -8; case "slow": return -4; case "medium": case "normal": return 0; case "fast": return 4; case "x-fast": return 8; default: // Try to parse as number const parsed = Number.parseFloat(rate); return Number.isNaN(parsed) ? 0 : Math.max(-10, Math.min(10, parsed)); } } if (typeof rate === "number") { // Clamp between -10 and 10 return Math.max(-10, Math.min(10, rate)); } return 0; } /** * Convert volume option to SAPI volume format * @param volume Volume option (string or number) * @returns Volume value for SAPI (0 to 100) */ convertVolume(volume) { if (!volume) return 100; // Default volume if (typeof volume === "string") { switch (volume.toLowerCase()) { case "silent": return 0; case "x-soft": return 20; case "soft": return 40; case "medium": case "normal": return 60; case "loud": return 80; case "x-loud": return 100; default: // Try to parse as number const parsed = Number.parseFloat(volume); return Number.isNaN(parsed) ? 100 : Math.max(0, Math.min(100, parsed)); } } if (typeof volume === "number") { // Clamp between 0 and 100 return Math.max(0, Math.min(100, volume)); } return 100; } /** * Escape a string for use in PowerShell * @param str String to escape * @returns Escaped string */ escapePowerShellString(str) { return str .replace(/\\/g, "\\\\") .replace(/"/g, '""') .replace(/`/g, "``") .replace(/\$/g, "`$"); } /** * Strip SSML tags from text (fallback for plain text processing) * @param text Text with SSML tags * @returns Plain text without SSML tags */ stripSSML(text) { // Simple SSML tag removal - SAPI handles SSML natively, but this is for word timing estimation return text.replace(/<[^>]*>/g, "").trim(); } /** * Ensure SSML has proper format for SAPI * @param text SSML text * @returns Properly formatted SSML */ ensureProperSSML(text) { // Check if the SSML already has version attribute if (text.includes('version=')) { return text; } // If it's a simple <speak> tag, add the version attribute if (text.startsWith('<speak>')) { return text.replace('<speak>', '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">'); } // If it doesn't start with <speak>, wrap it properly if (!text.startsWith('<speak')) { return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">${text}</speak>`; } return text; } } exports.SAPITTSClient = SAPITTSClient; Object.defineProperty(SAPITTSClient, "TEMP_PREFIX", { enumerable: true, configurable: true, writable: true, value: "sapi_tts_" });