UNPKG

expo-edge-speech

Version:

Text-to-speech library for Expo using Microsoft Edge TTS service

364 lines (363 loc) 14.1 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.SSML_CONSTANTS = void 0; exports.escapeXML = escapeXML; exports.validateVoiceName = validateVoiceName; exports.formatPitch = formatPitch; exports.formatRate = formatRate; exports.formatVolume = formatVolume; exports.extractLanguageFromVoice = extractLanguageFromVoice; exports.formatVoiceNameForEdgeTTS = formatVoiceNameForEdgeTTS; exports.validateSSML = validateSSML; exports.generateSSMLWithValidation = generateSSMLWithValidation; exports.generateSSML = generateSSML; exports.generateEnhancedSSML = generateEnhancedSSML; exports.isValidSSML = isValidSSML; exports.extractTextFromSSML = extractTextFromSSML; exports.normalizeSSML = normalizeSSML; const commonUtils_1 = require("./commonUtils"); const constants_1 = require("../constants"); /** * Required SSML namespace and attributes */ exports.SSML_CONSTANTS = { VERSION: "1.0", XMLNS: constants_1.SSML_NAMESPACE, XMLNS_MSTTS: "https://www.w3.org/2001/mstts", DEFAULT_LANG: "en-US", }; /** * Escape XML special characters for SSML content */ function escapeXML(text) { return text .replace(/&/g, "&amp;") .replace(/</g, "&lt;") .replace(/>/g, "&gt;") .replace(/"/g, "&quot;") .replace(/'/g, "&apos;"); } /** * Validate voice name against available voices */ function validateVoiceName(voiceName, availableVoices) { if (!voiceName || typeof voiceName !== "string") { return { isValid: false }; } if (!availableVoices || availableVoices.length === 0) { return { isValid: true }; } const exactMatch = availableVoices.find((voice) => voice.identifier === voiceName); if (exactMatch) { return { isValid: true }; } const suggestion = availableVoices.find((voice) => voice.identifier.toLowerCase().includes(voiceName.toLowerCase())); return { isValid: false, suggestion: suggestion?.identifier, }; } /** * Format pitch parameter for Edge TTS */ function formatPitch(pitchInput) { const pitch = pitchInput ?? constants_1.PARAMETER_RANGES.pitch.default; const { min, max } = constants_1.PARAMETER_RANGES.pitch; const clampedPitch = (0, commonUtils_1.clampValue)(pitch, min, max); const percentage = Math.round((clampedPitch - 1.0) * 100); return `${percentage >= 0 ? "+" : ""}${percentage}%`; } /** * Format rate parameter for Edge TTS */ function formatRate(rateInput) { const rate = rateInput ?? constants_1.PARAMETER_RANGES.rate.default; const { min, max } = constants_1.PARAMETER_RANGES.rate; const clampedRate = (0, commonUtils_1.clampValue)(rate, min, max); const percentage = Math.round((clampedRate - 1.0) * 100); return `${percentage >= 0 ? "+" : ""}${percentage}%`; } /** * Format volume parameter for Edge TTS */ function formatVolume(volumeInput) { const volume = volumeInput ?? constants_1.PARAMETER_RANGES.volume.default; const { min, max } = constants_1.PARAMETER_RANGES.volume; const clampedVolume = (0, commonUtils_1.clampValue)(volume, min, max); const percentage = Math.round((clampedVolume - 1.0) * 100); return `${percentage >= 0 ? "+" : ""}${percentage}%`; } /** * Extract language from voice identifier */ function extractLanguageFromVoice(voiceIdentifier) { if (!voiceIdentifier || typeof voiceIdentifier !== "string") { return exports.SSML_CONSTANTS.DEFAULT_LANG; } const match = voiceIdentifier.match(/^([a-z]{2}-[A-Z]{2})/); return match ? match[1] : exports.SSML_CONSTANTS.DEFAULT_LANG; } /** * Convert voice identifier to Microsoft Edge TTS format */ function formatVoiceNameForEdgeTTS(voiceIdentifier) { if (!voiceIdentifier || voiceIdentifier.trim() === "") { throw new Error("Failed to format voice name for Edge TTS: Voice identifier cannot be empty"); } if (voiceIdentifier.startsWith(constants_1.VOICE_NAME_FORMAT.PREFIX)) { return voiceIdentifier; } const match = voiceIdentifier.match(/^([a-z]{2}-[A-Z]{2})-(.+)$/); if (!match) { console.warn(`Invalid voice identifier format for Edge TTS: ${voiceIdentifier}`); return ""; } const [, langRegion, voiceNamePart] = match; return `${constants_1.VOICE_NAME_FORMAT.PREFIX} (${langRegion}, ${voiceNamePart})`; } /** * Validate SSML markup structure */ function validateSSML(ssml) { const errors = []; const warnings = []; if (!ssml || typeof ssml !== "string") { errors.push("SSML content is empty or invalid"); return { isValid: false, errors, warnings }; } if (!ssml.includes("<speak")) { errors.push("Missing required <speak> root element"); } if (!ssml.includes('version="1.0"')) { errors.push('Missing required version="1.0" attribute'); } if (!ssml.includes(`xmlns="${exports.SSML_CONSTANTS.XMLNS}"`)) { errors.push("Missing required xmlns attribute"); } if (!ssml.includes("xml:lang=")) { warnings.push("Missing xml:lang attribute (recommended)"); } if (!ssml.includes("<voice")) { errors.push("Missing required <voice> element"); } if (ssml.includes("<voice") && !ssml.includes('name="')) { errors.push("Missing required name attribute in <voice> element"); } if (ssml.includes('name=""')) { errors.push("Voice name attribute cannot be empty in <voice> element"); } try { const speakCount = (ssml.match(/<speak/g) || []).length; const speakCloseCount = (ssml.match(/<\/speak>/g) || []).length; if (speakCount !== speakCloseCount) { errors.push("Mismatched <speak> tags"); } const voiceCount = (ssml.match(/<voice/g) || []).length; const voiceCloseCount = (ssml.match(/<\/voice>/g) || []).length; if (voiceCount !== voiceCloseCount) { errors.push("Mismatched <voice> tags"); } const prosodyCount = (ssml.match(/<prosody/g) || []).length; const prosodyCloseCount = (ssml.match(/<\/prosody>/g) || []).length; if (prosodyCount !== prosodyCloseCount) { errors.push("Mismatched <prosody> tags"); } const tagStack = []; const tagRegex = /<\/?(\w+)[^>]*>/g; let match; while ((match = tagRegex.exec(ssml)) !== null) { const fullTag = match[0]; const tagName = match[1]; if (fullTag.startsWith("</")) { if (tagStack.length === 0) { errors.push(`Unexpected closing tag: </${tagName}>`); break; } const lastOpenTag = tagStack.pop(); if (lastOpenTag !== tagName) { errors.push(`Mismatched tags: expected </${lastOpenTag}> but found </${tagName}>`); break; } } else if (!fullTag.endsWith("/>")) { tagStack.push(tagName); } } if (tagStack.length > 0) { errors.push(`Unclosed tags: ${tagStack.map((tag) => `<${tag}>`).join(", ")}`); } } catch { errors.push("Invalid XML structure during parsing attempt"); } return { isValid: errors.length === 0, errors, warnings, }; } /** * Generate SSML with enhanced validation and voice handling */ function generateSSMLWithValidation(text, options = {}, availableVoices) { const validation = { isValid: true, errors: [], warnings: [], }; if (!text || typeof text !== "string") { validation.errors.push("Text must be a non-empty string."); validation.isValid = false; } else if (text.length > constants_1.MAX_TEXT_LENGTH) { validation.errors.push(`Text length (${text.length}) exceeds maximum of ${constants_1.MAX_TEXT_LENGTH} characters.`); validation.isValid = false; } const voiceToUse = options.voice || constants_1.DEFAULT_VOICE; if (typeof voiceToUse !== "string" || !voiceToUse) { validation.errors.push("Voice option must be a string if provided."); validation.isValid = false; } else { const voiceValidation = validateVoiceName(voiceToUse, availableVoices); if (!voiceValidation.isValid) { if (voiceValidation.suggestion) { validation.warnings.push(`Voice "${voiceToUse}" not found. Did you mean "${voiceValidation.suggestion}"? Using "${voiceToUse}" as specified.`); } else { validation.warnings.push(`Voice "${voiceToUse}" not found in available voices list (if provided). Using "${voiceToUse}" as specified.`); } } if (!options.voice) { validation.warnings.push(`No voice specified, using default voice: ${constants_1.DEFAULT_VOICE}`); } } if (!validation.isValid) { return { ssml: "", validation }; } let ssml = ""; try { ssml = generateSSML(text, options); const ssmlValidation = validateSSML(ssml); validation.errors.push(...ssmlValidation.errors); validation.warnings.push(...ssmlValidation.warnings); if (ssmlValidation.errors.length > 0) { validation.isValid = false; } } catch (e) { validation.errors.push(`Error during SSML generation: ${e.message}`); validation.isValid = false; } return { ssml, validation }; } /** * Generates the required SSML for Microsoft Edge TTS * @throws Error if text is invalid, exceeds max length. */ function generateSSML(text, options) { if (!text || typeof text !== "string") { throw new Error("Text must be a non-empty string."); } if (text.length > constants_1.MAX_TEXT_LENGTH) { throw new Error(`Text length (${text.length}) exceeds maximum of ${constants_1.MAX_TEXT_LENGTH} characters.`); } const voiceToUse = options?.voice !== undefined ? options.voice : constants_1.DEFAULT_VOICE; if (typeof voiceToUse !== "string") { throw new Error("Voice option must be a valid string."); } const escapedText = escapeXML(text); const { rate, pitch, volume } = options || {}; const formattedVoiceName = formatVoiceNameForEdgeTTS(voiceToUse); if (!formattedVoiceName) { throw new Error(`Failed to format voice name for Edge TTS: "${voiceToUse}". Ensure it's a valid identifier (e.g., en-US-AriaNeural) or already in Microsoft format.`); } const formattedRate = formatRate(rate); const formattedPitch = formatPitch(pitch); const formattedVolume = formatVolume(volume); // Use language from options if provided, otherwise extract from voice const language = options?.language || extractLanguageFromVoice(voiceToUse); return `<speak version="${exports.SSML_CONSTANTS.VERSION}" xmlns="${exports.SSML_CONSTANTS.XMLNS}" xmlns:mstts="${exports.SSML_CONSTANTS.XMLNS_MSTTS}" xml:lang="${language}"><voice name="${formattedVoiceName}"><prosody rate="${formattedRate}" pitch="${formattedPitch}" volume="${formattedVolume}">${escapedText}</prosody></voice></speak>`; } /** * Enhanced SSML generation with additional elements support * @throws Error if text is invalid, exceeds max length, or if voice is not provided. */ function generateEnhancedSSML(text, options) { if (!text || typeof text !== "string") { throw new Error("Text must be a non-empty string."); } if (text.length > constants_1.MAX_TEXT_LENGTH) { throw new Error(`Text length (${text.length}) exceeds maximum of ${constants_1.MAX_TEXT_LENGTH} characters.`); } if (!options || typeof options.voice !== "string" || !options.voice) { throw new Error("Voice option is required and cannot be empty."); } const escapedText = escapeXML(text); const { voice, rate, pitch, volume, prefixElements, suffixElements, useMicrosoftVoiceFormat = true, } = options; const formattedVoiceName = useMicrosoftVoiceFormat ? formatVoiceNameForEdgeTTS(voice) : voice; if (!formattedVoiceName) { throw new Error(`Failed to format voice name: "${voice}". Ensure it's a valid identifier or correctly formatted if useMicrosoftVoiceFormat is false.`); } const formattedRate = formatRate(rate); const formattedPitch = formatPitch(pitch); const formattedVolume = formatVolume(volume); const language = extractLanguageFromVoice(voice); const prefixContent = prefixElements?.join("\n ") || ""; const suffixContent = suffixElements?.join("\n ") || ""; const content = [prefixContent, escapedText, suffixContent] .filter(Boolean) .join("\n "); return `<speak version="${exports.SSML_CONSTANTS.VERSION}" xmlns="${exports.SSML_CONSTANTS.XMLNS}" xmlns:mstts="${exports.SSML_CONSTANTS.XMLNS_MSTTS}" xml:lang="${language}"> <voice name="${formattedVoiceName}"> <prosody rate="${formattedRate}" pitch="${formattedPitch}" volume="${formattedVolume}"> ${content} </prosody> </voice> </speak>`; } /** * Check if a string appears to be valid SSML */ function isValidSSML(content) { if (!content) return false; const validation = validateSSML(content); return validation.isValid; } /** * Extract text content from SSML markup */ function extractTextFromSSML(ssml) { if (!ssml) return ""; try { return ssml .replace(/<[^>]*>/g, " ") .replace(/\s+/g, " ") .trim(); } catch { return ssml; } } /** * Normalize SSML formatting for consistent output */ function normalizeSSML(ssml) { if (!ssml) return ""; try { return ssml .replace(/>\s+</g, "><") // Remove whitespace between tags .replace(/>\s+/g, ">") // Remove whitespace after opening tags .replace(/\s+</g, "<") // Remove whitespace before closing tags .replace(/\s+/g, " ") // Normalize internal whitespace to single spaces .trim(); } catch { return ssml; } }