js-tts-wrapper

A JavaScript/TypeScript library that provides a unified API for working with multiple cloud-based Text-to-Speech (TTS) services

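Before the compiled source, a minimal usage sketch of the SAPITTSClient defined in this file. It assumes Node.js on Windows and that SAPITTSClient is re-exported from the package root (the import path is an assumption, not taken from this file; on other setups, import the engine module directly).

// Hedged sketch: construct the client, list voices, and synthesize to a WAV file.
// Assumption: SAPITTSClient is exported from the js-tts-wrapper package root.
const { SAPITTSClient } = require("js-tts-wrapper");
const { writeFileSync } = require("node:fs");

async function main() {
  const tts = new SAPITTSClient({}); // SAPI needs no credentials, only Windows + PowerShell
  if (!(await tts.checkCredentials())) {
    throw new Error("SAPI TTS is not available on this machine");
  }
  const voices = await tts.getVoices();
  console.log(`Found ${voices.length} SAPI voices`);
  const audio = await tts.synthToBytes("Hello from Windows SAPI", {
    voice: voices[0]?.id, // optional; falls back to the default SAPI voice
    rate: "fast",         // mapped to SAPI's -10..10 range by convertRate
    volume: 80,           // clamped to SAPI's 0..100 range by convertVolume
  });
  writeFileSync("output.wav", Buffer.from(audio));
}

main().catch(console.error);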
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.SAPITTSClient = void 0; const abstract_tts_1 = require("../core/abstract-tts"); const SpeechMarkdown = __importStar(require("../markdown/converter")); const node_child_process_1 = require("node:child_process"); const node_fs_1 = require("node:fs"); const node_os_1 = require("node:os"); const node_path_1 = require("node:path"); /** * SAPI TTS Client for Windows environments * * This client uses Windows Speech API (SAPI) through PowerShell to provide: * - High-quality Windows TTS synthesis * - SSML support * - Rich voice metadata * - Word boundary events * - Rate, pitch, and volume controls */ class SAPITTSClient extends abstract_tts_1.AbstractTTSClient { constructor(credentials = {}) { super(credentials); // Validate Windows environment this.validateEnvironment(); // Set a default voice (will be determined at runtime) this.voiceId = null; } /** * Validate that we're running on Windows with PowerShell available */ validateEnvironment() { // Check if we're in a Node.js environment if (typeof process === "undefined" || !process.versions || !process.versions.node) { throw new Error("SAPITTSClient is only supported in Node.js environments"); } // Check if we're on Windows if (process.platform !== "win32") { throw new Error(`SAPITTSClient is only supported on Windows. 
Current platform: ${process.platform}`); } } /** * Get the list of required credential types for this engine * @returns Array of required credential field names */ getRequiredCredentials() { return []; // SAPI doesn't require credentials, only environment validation } /** * SAPI does not require credentials but we validate the environment */ async checkCredentials() { try { this.validateEnvironment(); // Test if PowerShell and System.Speech are available const testScript = ` try { Add-Type -AssemblyName System.Speech $synth = [System.Speech.Synthesis.SpeechSynthesizer]::new() $voices = $synth.GetInstalledVoices() Write-Output "OK" } catch { Write-Error $_.Exception.Message exit 1 } `; const result = await this.runPowerShellScript(testScript); return result.trim() === "OK"; } catch (error) { console.error("SAPI TTS not available:", error); return false; } } /** * Get available SAPI voices with rich metadata */ async getVoices() { try { const script = ` Add-Type -AssemblyName System.Speech $synth = [System.Speech.Synthesis.SpeechSynthesizer]::new() $voices = $synth.GetInstalledVoices() $voiceData = @() foreach ($voice in $voices) { if ($voice.Enabled) { $info = $voice.VoiceInfo $voiceObj = @{ Name = $info.Name Id = $info.Id Gender = $info.Gender.ToString() Age = $info.Age.ToString() Culture = $info.Culture.Name Language = $info.Culture.DisplayName Description = $info.Description } $voiceData += $voiceObj } } $voiceData | ConvertTo-Json -Depth 3 `; const result = await this.runPowerShellScript(script); const parsedResult = JSON.parse(result); // Ensure we have an array (PowerShell returns single object when there's only one voice) const voiceData = Array.isArray(parsedResult) ? parsedResult : [parsedResult]; // Convert to unified voice format const unifiedVoices = voiceData.map((voice) => ({ id: voice.Id || voice.Name || "unknown", name: voice.Name || "Unknown Voice", gender: voice.Gender === "Male" || voice.Gender === "Female" ? 
voice.Gender : "Unknown", provider: "sapi", languageCodes: [ { bcp47: voice.Culture || "en-US", iso639_3: this.convertCultureToISO639(voice.Culture || "en-US"), display: voice.Language || "English (United States)", }, ], })); return unifiedVoices; } catch (error) { console.error("Error getting SAPI voices:", error); return []; } } /** * Get raw voices from SAPI */ async _getVoices() { return this.getVoices(); } /** * Run a PowerShell script and return the output */ async runPowerShellScript(script) { return new Promise((resolve, reject) => { const powershell = (0, node_child_process_1.spawn)("powershell.exe", [ "-NoProfile", "-NonInteractive", "-ExecutionPolicy", "Bypass", "-Command", script, ]); let stdout = ""; let stderr = ""; powershell.stdout.on("data", (data) => { stdout += data.toString(); }); powershell.stderr.on("data", (data) => { stderr += data.toString(); }); powershell.on("close", (code) => { if (code === 0) { resolve(stdout); } else { reject(new Error(`PowerShell script failed with code ${code}: ${stderr}`)); } }); powershell.on("error", (error) => { reject(new Error(`Failed to start PowerShell: ${error.message}`)); }); }); } /** * Convert culture code to ISO 639-3 language code */ convertCultureToISO639(culture) { const cultureMap = { "en-US": "eng", "en-GB": "eng", "en-AU": "eng", "en-CA": "eng", "es-ES": "spa", "es-MX": "spa", "fr-FR": "fra", "fr-CA": "fra", "de-DE": "deu", "it-IT": "ita", "pt-BR": "por", "pt-PT": "por", "ru-RU": "rus", "ja-JP": "jpn", "ko-KR": "kor", "zh-CN": "cmn", "zh-TW": "cmn", "ar-SA": "ara", "hi-IN": "hin", "th-TH": "tha", "vi-VN": "vie", "nl-NL": "nld", "sv-SE": "swe", "da-DK": "dan", "no-NO": "nor", "fi-FI": "fin", "pl-PL": "pol", "cs-CZ": "ces", "hu-HU": "hun", "tr-TR": "tur", "he-IL": "heb", }; const langCode = culture.split("-")[0]; return cultureMap[culture] || cultureMap[langCode] || "eng"; } /** * Synthesize text to audio bytes using SAPI */ async synthToBytes(text, options) { try { // Create a temporary filename for the audio export const tempFilename = (0, node_path_1.join)((0, node_os_1.tmpdir)(), `${SAPITTSClient.TEMP_PREFIX}${Date.now()}.wav`); // Prepare synthesis options const voice = options?.voice || this.voiceId || null; const rate = this.convertRate(options?.rate); const volume = this.convertVolume(options?.volume); // Prepare text for synthesis (handle Speech Markdown and SSML) let processedText = text; // Convert from Speech Markdown if requested if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { const ssmlText = await SpeechMarkdown.toSSML(processedText, "microsoft-azure"); processedText = ssmlText; } // Always ensure SSML format - wrap plain text in speak tags like other engines if (this._isSSML(processedText)) { processedText = this.ensureProperSSML(processedText); } else { // Wrap plain text in SSML tags to enable consistent SSML processing processedText = this.ensureProperSSML(`<speak>${processedText}</speak>`); } const escapedText = this.escapePowerShellString(processedText); // Always use SSML processing since we now always have SSML format const isSSMLProcessed = true; // Build PowerShell script for synthesis const script = ` Add-Type -AssemblyName System.Speech [Console]::InputEncoding = [System.Text.Encoding]::UTF8 $synth = [System.Speech.Synthesis.SpeechSynthesizer]::new() # Set voice if specified ${voice ? 
` try { $synth.SelectVoice("${this.escapePowerShellString(voice)}") } catch { # If voice selection fails, continue with default voice Write-Warning "Could not select voice '${this.escapePowerShellString(voice)}', using default voice" }` : ""} # Set speech properties $synth.Rate = ${rate} $synth.Volume = ${volume} # Set output to WAV file $synth.SetOutputToWaveFile("${this.escapePowerShellString(tempFilename)}") try { # Synthesize speech using SSML (all text is now wrapped in SSML format) $synth.SpeakSsml($text) Write-Output "SUCCESS" } catch { Write-Error $_.Exception.Message exit 1 } finally { $synth.Dispose() } `; // Execute PowerShell script const result = await this.runPowerShellScript(`$text = "${escapedText}"; ${script}`); if (!result.includes("SUCCESS")) { throw new Error("SAPI synthesis failed"); } // Read the generated WAV file if (!(0, node_fs_1.existsSync)(tempFilename)) { throw new Error("SAPI failed to generate audio file"); } const audioBuffer = (0, node_fs_1.readFileSync)(tempFilename); // Clean up the temporary file try { (0, node_fs_1.unlinkSync)(tempFilename); } catch (cleanupError) { console.warn("Could not clean up temporary file:", cleanupError); } // Create estimated word timings (SAPI doesn't provide real-time events in this mode) this._createEstimatedWordTimings(isSSMLProcessed ? this.stripSSML(processedText) : processedText); return new Uint8Array(audioBuffer); } catch (error) { console.error("Error synthesizing speech with SAPI:", error); throw error; } } /** * Synthesize text to a byte stream with word boundaries */ async synthToBytestream(text, options) { try { // Get the audio bytes first const audioBytes = await this.synthToBytes(text, options); // For now, use estimated word boundaries // TODO: Implement real-time word boundary events using SAPI events let processedText = text; // Convert from Speech Markdown if requested if (options?.useSpeechMarkdown && SpeechMarkdown.isSpeechMarkdown(processedText)) { const ssmlText = await SpeechMarkdown.toSSML(processedText, "microsoft-azure"); processedText = ssmlText; } // Always ensure SSML format - wrap plain text in speak tags like other engines if (this._isSSML(processedText)) { processedText = this.ensureProperSSML(processedText); } else { // Wrap plain text in SSML tags to enable consistent SSML processing processedText = this.ensureProperSSML(`<speak>${processedText}</speak>`); } const plainText = this._isSSML(processedText) ? 
this.stripSSML(processedText) : processedText; const words = plainText.split(/\s+/).filter((word) => word.length > 0); const estimatedDuration = 0.3; // Estimated duration per word in seconds const wordBoundaries = []; let currentTime = 0; for (const word of words) { if (word.trim()) { wordBoundaries.push({ text: word, offset: currentTime * 1000, // Convert to milliseconds duration: estimatedDuration * 1000, // Convert to milliseconds }); currentTime += estimatedDuration; } } // Create a readable stream from the audio bytes const audioStream = new ReadableStream({ start(controller) { controller.enqueue(audioBytes); controller.close(); }, }); return { audioStream, wordBoundaries, }; } catch (error) { console.error("Error synthesizing speech to stream with SAPI:", error); throw error; } } /** * Convert rate option to SAPI rate format * @param rate Rate option (string or number) * @returns Rate value for SAPI (-10 to 10) */ convertRate(rate) { if (!rate) return 0; // Default rate if (typeof rate === "string") { switch (rate.toLowerCase()) { case "x-slow": return -8; case "slow": return -4; case "medium": case "normal": return 0; case "fast": return 4; case "x-fast": return 8; default: // Try to parse as number const parsed = Number.parseFloat(rate); return Number.isNaN(parsed) ? 0 : Math.max(-10, Math.min(10, parsed)); } } if (typeof rate === "number") { // Clamp between -10 and 10 return Math.max(-10, Math.min(10, rate)); } return 0; } /** * Convert volume option to SAPI volume format * @param volume Volume option (string or number) * @returns Volume value for SAPI (0 to 100) */ convertVolume(volume) { if (!volume) return 100; // Default volume if (typeof volume === "string") { switch (volume.toLowerCase()) { case "silent": return 0; case "x-soft": return 20; case "soft": return 40; case "medium": case "normal": return 60; case "loud": return 80; case "x-loud": return 100; default: // Try to parse as number const parsed = Number.parseFloat(volume); return Number.isNaN(parsed) ? 
100 : Math.max(0, Math.min(100, parsed)); } } if (typeof volume === "number") { // Clamp between 0 and 100 return Math.max(0, Math.min(100, volume)); } return 100; } /** * Escape a string for use in PowerShell * @param str String to escape * @returns Escaped string */ escapePowerShellString(str) { return str .replace(/\\/g, "\\\\") .replace(/"/g, '""') .replace(/`/g, "``") .replace(/\$/g, "`$"); } /** * Strip SSML tags from text (fallback for plain text processing) * @param text Text with SSML tags * @returns Plain text without SSML tags */ stripSSML(text) { // Simple SSML tag removal - SAPI handles SSML natively, but this is for word timing estimation return text.replace(/<[^>]*>/g, "").trim(); } /** * Ensure SSML has proper format for SAPI * @param text SSML text * @returns Properly formatted SSML */ ensureProperSSML(text) { // Trim whitespace for proper parsing const trimmedText = text.trim(); // Check if the SSML already has version attribute if (trimmedText.includes('version=')) { return text; // Return original text to preserve formatting } // If it's a simple <speak> tag, add the version attribute // Note: SAPI requires xml:lang="en" (not "en-US") for SSML to work properly if (trimmedText.startsWith('<speak>')) { return text.replace('<speak>', '<speak version="1.0" xml:lang="en">'); } // If it doesn't start with <speak>, wrap it properly // Note: SAPI requires xml:lang="en" (not "en-US") for SSML to work properly if (!trimmedText.startsWith('<speak')) { return `<speak version="1.0" xml:lang="en">${text}</speak>`; } return text; } } exports.SAPITTSClient = SAPITTSClient; Object.defineProperty(SAPITTSClient, "TEMP_PREFIX", { enumerable: true, configurable: true, writable: true, value: "sapi_tts_" });
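For completeness, a hedged sketch of consuming synthToBytestream, which returns a web ReadableStream of the WAV bytes plus estimated word boundaries (offset and duration in milliseconds, roughly 300 ms per word). The import path is the same assumption as in the sketch above.

// Sketch only: drain the ReadableStream and log the estimated word boundaries.
const { SAPITTSClient } = require("js-tts-wrapper"); // assumed export path
const { writeFileSync } = require("node:fs");

async function streamExample() {
  const tts = new SAPITTSClient({});
  const { audioStream, wordBoundaries } = await tts.synthToBytestream(
    "<speak>Streaming with word boundaries</speak>"
  );
  const reader = audioStream.getReader();
  const chunks = [];
  for (;;) {
    const { done, value } = await reader.read();
    if (done) break;
    chunks.push(Buffer.from(value));
  }
  writeFileSync("stream-output.wav", Buffer.concat(chunks));
  for (const wb of wordBoundaries) {
    console.log(`${wb.text} @ ${wb.offset} ms (${wb.duration} ms)`);
  }
}

streamExample().catch(console.error);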