// UNPKG
//
// expo-edge-speech
//
// Version:
//
// Text-to-speech library for Expo using Microsoft Edge TTS service
//
// 479 lines (478 loc) 19 kB
"use strict"; // ============================================================================= // expo-speech Parameter Validation Ranges // ============================================================================= Object.defineProperty(exports, "__esModule", { value: true }); exports.BINARY_PARSING = exports.PROTOCOL_COMPLIANCE = exports.ERROR_HANDLING = exports.EDGE_TTS_EXCEPTIONS = exports.TIMING_CONVERSION = exports.WORD_BOUNDARY_OFFSET_COMPENSATION = exports.SEC_MS_GEC_GENERATION = exports.SSML_VALIDATION = exports.CONNECTION_LIFECYCLE = exports.WSS_HEADERS = exports.VOICE_NAME_FORMAT = exports.SSML_NAMESPACE = exports.CONTENT_TYPE_VALIDATION = exports.MESSAGE_PATH_VALIDATION = exports.BINARY_MESSAGE_PARSING = exports.BINARY_MESSAGE = exports.MESSAGE_FORMAT = exports.AUDIO_STREAMING = exports.AUDIO_CONFIG = exports.WEBSOCKET_CONFIG = exports.AUTHENTICATION = exports.TIMESTAMP_FORMAT = exports.VOICE_CACHING = exports.EDGE_TTS_CONFIG = exports.AUDIO_FORMATS = exports.DEFAULT_AUDIO_FORMAT = exports.CONTENT_TYPES = exports.MESSAGE_PATHS = exports.CONNECTION_ID_FORMAT = exports.WEBSOCKET_HEADERS = exports.EDGE_TTS_VOICE_LIST_URL = exports.SEC_MS_GEC_VERSION = exports.CHROMIUM_VERSION = exports.EDGE_TTS_BASE_URL = exports.EDGE_TTS_WEBSOCKET_URL_TEMPLATE = exports.EDGE_TTS_TRUSTED_CLIENT_TOKEN = exports.MAX_TEXT_LENGTH = exports.DEFAULT_TIMEOUT = exports.DEFAULT_VOICE = exports.PARAMETER_RANGES = void 0; /** * Parameter validation ranges for user-facing API options. * These define the numeric input ranges for rate, pitch, and volume. * Conversion to SSML percentage strings is handled elsewhere (e.g., ssmlUtils.ts). 
*/ exports.PARAMETER_RANGES = Object.freeze({ rate: Object.freeze({ min: 0.0, // Corresponds to -100% in SSML max: 2.0, // Corresponds to +100% in SSML default: 1.0, // Corresponds to +0% in SSML }), pitch: Object.freeze({ min: 0.0, // Corresponds to -100% in SSML max: 2.0, // Corresponds to +100% in SSML default: 1.0, // Corresponds to +0% in SSML }), volume: Object.freeze({ min: 0.0, // Corresponds to -100% (mute) in SSML max: 2.0, // Corresponds to +100% in SSML default: 1.0, // Corresponds to +0% in SSML }), }); /** * Default voice to use if not specified by the user. * Using multilingual Emma as the default voice. */ exports.DEFAULT_VOICE = "en-US-EmmaMultilingualNeural"; // ============================================================================= // Basic Default Values // ============================================================================= /** * Default timeout for speech operations (5000ms) */ exports.DEFAULT_TIMEOUT = 5000; /** * Maximum text length for speech input (1000 characters). * This is a client-side limit for the input string to the speak method. * The Edge TTS service itself handles further chunking based on byte length * and SSML overhead. 
*/ exports.MAX_TEXT_LENGTH = 1000; // ============================================================================= // Edge TTS Protocol Constants // ============================================================================= /** * Edge TTS Trusted Client Token */ exports.EDGE_TTS_TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4"; /** * Edge TTS WebSocket URL template with required query parameters * Includes all required authentication and connection parameters */ exports.EDGE_TTS_WEBSOCKET_URL_TEMPLATE = "wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=6A5AA1D4EAFF4E9FB37E23D68491D6F4&Sec-MS-GEC={secMsGec}&Sec-MS-GEC-Version={secMsGecVersion}&ConnectionId={connectionId}"; /** * Base URL for Edge TTS services */ exports.EDGE_TTS_BASE_URL = "speech.platform.bing.com/consumer/speech/synthesize/readaloud"; /** * Chromium version for Sec-MS-GEC-Version header */ exports.CHROMIUM_VERSION = "130.0.2849.68"; /** * SEC-MS-GEC version format template */ exports.SEC_MS_GEC_VERSION = `1-${exports.CHROMIUM_VERSION}`; /** * Edge TTS Voice List API endpoint */ exports.EDGE_TTS_VOICE_LIST_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${exports.EDGE_TTS_TRUSTED_CLIENT_TOKEN}`; /** * Required WebSocket message headers */ exports.WEBSOCKET_HEADERS = Object.freeze({ REQUEST_ID: "X-RequestId", TIMESTAMP: "X-Timestamp", CONTENT_TYPE: "Content-Type", PATH: "Path", }); /** * Connection ID generation format * Note: this can be any random 32-character string. We choose to use UUIDv4 without dashes here. 
* Format: UUID v4 without dashes (32-character hexadecimal string) * Example: a1b2c3d4e5f67890abcdef1234567890 */ exports.CONNECTION_ID_FORMAT = Object.freeze({ TYPE: "UUID_V4_NO_DASHES", LENGTH: 32, FORMAT: "hex", DESCRIPTION: "Random UUID v4 without dashes used for connection tracking", }); /** * WebSocket message paths */ exports.MESSAGE_PATHS = Object.freeze({ SPEECH_CONFIG: "speech.config", SSML: "ssml", RESPONSE: "response", // General response, not explicitly used for specific actions in rany2/edge-tts TURN_START: "turn.start", TURN_END: "turn.end", AUDIO_METADATA: "audio.metadata", }); /** * Content types for WebSocket messages */ exports.CONTENT_TYPES = Object.freeze({ JSON: "application/json; charset=utf-8", SSML: "application/ssml+xml", }); /** * Default audio format for Edge TTS * This is the only format supported by Edge TTS for this endpoint. */ exports.DEFAULT_AUDIO_FORMAT = "audio-24khz-48kbitrate-mono-mp3"; /** * Supported audio formats for Edge TTS * Edge TTS only supports MP3 at 24kHz 48kbps mono */ exports.AUDIO_FORMATS = Object.freeze({ MP3_24KHZ_48KBPS: "audio-24khz-48kbitrate-mono-mp3", }); /** * Edge TTS connection configuration */ exports.EDGE_TTS_CONFIG = Object.freeze({ connectionTimeout: 10000, audioTimeout: 5000, maxRetries: 3, retryDelay: 1000, connectionPoolSize: 1, // Edge TTS uses single connection per synthesis maxConcurrentConnections: 1, keepAliveInterval: 30000, // 30 seconds }); /** * Voice caching configuration */ exports.VOICE_CACHING = Object.freeze({ VOICE_LIST_TTL: 24 * 60 * 60 * 1000, // 24 hours in milliseconds VOICE_LIST_REFRESH_INTERVAL: 6 * 60 * 60 * 1000, // 6 hours refresh interval MAX_CACHE_SIZE: 1000, // Maximum number of voices to cache ENABLE_FALLBACK_CACHE: true, // Allow using expired cache as fallback }); /** * Timestamp format constants for Edge TTS WebSocket messages */ exports.TIMESTAMP_FORMAT = Object.freeze({ REQUIRED_SUFFIX: "Z", // All timestamps must end with 'Z' MICROSECOND_PRECISION: true, // 
Timestamps use microsecond precision MULTIPLIER: 1000000, // Convert seconds to microseconds EXAMPLE: "1234567890123456Z", // Example format: microseconds + Z }); /** * Enhanced authentication constants from corrected protocol */ exports.AUTHENTICATION = Object.freeze({ TRUSTED_CLIENT_TOKEN: "6A5AA1D4EAFF4E9FB37E23D68491D6F4", SEC_MS_GEC_VERSION_TEMPLATE: "1-{version}", // Template for version header CHROME_VERSION_CURRENT: exports.CHROMIUM_VERSION, CHROME_VERSION_EDGE_TESTED: "91.0.864.41", // Version tested with Edge TTS protocol ORIGIN_EXTENSION: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold", // Required origin }); /** * WebSocket message configuration */ exports.WEBSOCKET_CONFIG = Object.freeze({ binaryType: "arraybuffer", protocols: [], closeTimeout: 5000, }); /** * Audio processing configuration */ exports.AUDIO_CONFIG = Object.freeze({ defaultFormat: exports.DEFAULT_AUDIO_FORMAT, enableWordBoundary: true, enableSentenceBoundary: false, bufferSize: 4096, // Default buffer size for audio chunks maxBufferSize: 65536, // 64KB maximum buffer size minBufferSize: 1024, // 1KB minimum buffer size streamingChunkSize: 8192, // Preferred chunk size for streaming sampleRate: 24000, // Edge TTS fixed sample rate (24kHz) bitRate: 48000, // Edge TTS fixed bit rate (48kbps) channels: 1, // Edge TTS mono audio }); /** * Enhanced audio streaming and buffer management constants */ exports.AUDIO_STREAMING = Object.freeze({ CHUNK_PROCESSING: { MIN_CHUNK_SIZE: 256, // Minimum audio chunk size for processing MAX_CHUNK_SIZE: 32768, // Maximum audio chunk size (32KB) PREFERRED_CHUNK_SIZE: 8192, // Preferred chunk size for optimal performance BUFFER_THRESHOLD: 16384, // Buffer threshold before playback }, PLAYBACK_BUFFER: { PRELOAD_SIZE: 4096, // Preload buffer size for smooth playback UNDERRUN_THRESHOLD: 1024, // Buffer underrun detection threshold OVERRUN_PROTECTION: 131072, // Maximum buffer size to prevent memory issues (128KB) }, STREAMING_TIMEOUT: { CHUNK_TIMEOUT: 2000, 
// Timeout for individual chunk (2 seconds) TOTAL_TIMEOUT: 30000, // Total streaming timeout (30 seconds) SILENCE_TIMEOUT: 5000, // Timeout for silence detection (5 seconds) }, }); /** * Message formatting requirements */ exports.MESSAGE_FORMAT = Object.freeze({ LINE_ENDING: "\r\n", HEADER_SEPARATOR: "\r\n\r\n", HEADER_VALUE_SEPARATOR: ":", // check text-message.txt for example }); /** * Binary message structure constants */ exports.BINARY_MESSAGE = Object.freeze({ HEADER_LENGTH_BYTES: 2, HEADER_LENGTH_TYPE: "Int16", AUDIO_FORMAT: "MP3", }); /** * Enhanced binary message parsing constants with specific endianness details */ exports.BINARY_MESSAGE_PARSING = Object.freeze({ HEADER: { LENGTH_BYTES: 2, // Header length stored in first 2 bytes LENGTH_TYPE: "Int16", // 16-bit integer ENDIANNESS: "big", // Big-endian byte order (corrected from protocol) ENCODING: "utf-8", // Header JSON encoding }, AUDIO: { FORMAT: "audio/mpeg", // MIME type for audio content ENCODING: "mp3", // Audio encoding format EXPECTED_MAGIC: [0xff, 0xfb], // MP3 frame header magic bytes (common) }, VALIDATION: { MIN_HEADER_LENGTH: 10, // Minimum valid header length MAX_HEADER_LENGTH: 1024, // Maximum expected header length MIN_AUDIO_LENGTH: 32, // Minimum valid audio chunk length }, }); /** * Message path validation constants */ exports.MESSAGE_PATH_VALIDATION = Object.freeze({ VALID_PATHS: [ "speech.config", "ssml", "response", "turn.start", "turn.end", "audio.metadata", "audio", ], REQUIRED_PATHS: ["speech.config", "ssml"], // Paths that must be sent RESPONSE_PATHS: [ "response", "turn.start", "turn.end", "audio.metadata", "audio", ], // Expected response paths PATH_SEPARATOR: ".", // Separator used in path names }); /** * Content type validation constants */ exports.CONTENT_TYPE_VALIDATION = Object.freeze({ VALID_TYPES: [ "application/json; charset=utf-8", "application/ssml+xml", "audio/mpeg", ], CHARSET_REQUIRED: ["application/json"], // Content types that require charset DEFAULT_CHARSET: 
"utf-8", // Default charset when required }); // ============================================================================= // Additional Edge TTS Protocol Constants // ============================================================================= /** * SSML namespace requirement */ exports.SSML_NAMESPACE = "http://www.w3.org/2001/10/synthesis"; /** * Voice name format pattern for Microsoft Edge TTS * Format: "Microsoft Server Speech Text to Speech Voice (lang-region, NameNeural)" */ exports.VOICE_NAME_FORMAT = Object.freeze({ PREFIX: "Microsoft Server Speech Text to Speech Voice", PATTERN: "Microsoft Server Speech Text to Speech Voice ({lang}-{region}, {name})", EXAMPLE: "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)", }); /** * Required WebSocket connection headers */ exports.WSS_HEADERS = Object.freeze({ USER_AGENT: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0", ORIGIN: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold", PRAGMA: "no-cache", CACHE_CONTROL: "no-cache", ACCEPT_ENCODING: "gzip, deflate, br", ACCEPT_LANGUAGE: "en-US,en;q=0.9", }); /** * Enhanced connection lifecycle constants */ exports.CONNECTION_LIFECYCLE = Object.freeze({ TIMEOUTS: { CONNECTION_ESTABLISHMENT: 10000, // WebSocket connection timeout (10s) MESSAGE_RESPONSE: 5000, // Individual message response timeout (5s) TOTAL_SYNTHESIS: 30000, // Total synthesis operation timeout (30s) TURN_END_WAIT: 3000, // Wait for turn.end message timeout (3s) GRACEFUL_CLOSE: 2000, // Graceful connection close timeout (2s) }, RETRY_LIMITS: { CONNECTION_ATTEMPTS: 3, // Maximum connection retry attempts AUTH_FAILURES: 2, // Maximum authentication retry attempts MESSAGE_RESENDS: 1, // Maximum message resend attempts CLOCK_SKEW_ADJUSTMENTS: 3, // Maximum clock skew adjustment attempts }, POOL_MANAGEMENT: { MAX_POOL_SIZE: 1, // Edge TTS uses single connection per synthesis CONNECTION_REUSE: false, // Do not 
reuse connections for new synthesis IDLE_TIMEOUT: 60000, // Connection idle timeout (60s) CLEANUP_INTERVAL: 30000, // Pool cleanup interval (30s) }, }); /** * Enhanced SSML validation and processing constants */ exports.SSML_VALIDATION = Object.freeze({ REQUIRED_ATTRIBUTES: { SPEAK: ["version", "xmlns", "xml:lang"], // Required attributes for <speak> VOICE: ["name"], // Required attributes for <voice> }, NAMESPACE_VALIDATION: { REQUIRED_NAMESPACE: "http://www.w3.org/2001/10/synthesis", NAMESPACE_PREFIX: "xmlns", VALIDATION_REQUIRED: true, }, PROSODY_RANGES: { RATE: { MIN_RELATIVE: -50, // Minimum relative rate (-50%) MAX_RELATIVE: 100, // Maximum relative rate (+100%) UNIT: "percent", RELATIVE_PATTERN: /^[+-]\d+%$/, }, PITCH: { MIN_RELATIVE: -50, // Minimum relative pitch (-50%) MAX_RELATIVE: 100, // Maximum relative pitch (+100%) UNIT: "percent", RELATIVE_PATTERN: /^[+-]\d+%$/, HZ_PATTERN: /^[+-]\d+Hz$/, }, VOLUME: { MIN_RELATIVE: -50, // Minimum relative volume (-50%) MAX_RELATIVE: 100, // Maximum relative volume (+100%) UNIT: "percent", RELATIVE_PATTERN: /^[+-]\d+%$/, }, }, TEXT_LIMITS: { MAX_SSML_LENGTH: 8000, // Maximum SSML document length // SSML generation might have its own effective limits due to overhead. 
MAX_VOICE_ELEMENTS: 5, // Maximum number of voice elements MAX_PROSODY_NESTING: 3, // Maximum prosody element nesting depth }, }); /** * Sec-MS-GEC token generation constants */ exports.SEC_MS_GEC_GENERATION = Object.freeze({ WIN_EPOCH: 11644473600, // Windows file time epoch offset S_TO_NS: 1e9, // Seconds to nanoseconds conversion CLOCK_SKEW_MINUTES: 5, // 5-minute clock skew for rounding CLOCK_SKEW_SECONDS: 300, // 5 minutes in seconds CLOCK_SKEW_TICKS: 3000000000, // 5 minutes in ticks (3,000,000,000 ticks) HASH_INPUT_FORMAT: "{ticks}MSEdgeSpeechTTS", // Format: windowsFileTimeTicks + "MSEdgeSpeechTTS" HASH_ALGORITHM: "SHA-256", // Hash algorithm for token generation RESULT_FORMAT: "uppercase", // Result must be uppercase hexadecimal }); /** * Word boundary offset padding compensation * Used to compensate for Edge TTS service padding in word boundary events */ exports.WORD_BOUNDARY_OFFSET_COMPENSATION = 8_750_000; // ticks /** * Timing conversion constants for Edge TTS */ exports.TIMING_CONVERSION = Object.freeze({ TICKS_PER_MILLISECOND: 10000, // 10,000 ticks = 1 millisecond TICKS_PER_SECOND: 10_000_000, // 10 million ticks = 1 second MS_TO_TICKS_MULTIPLIER: 10000, TICKS_TO_MS_DIVISOR: 10000, }); /** * Edge TTS Exception types */ exports.EDGE_TTS_EXCEPTIONS = Object.freeze({ NO_AUDIO_RECEIVED: "NoAudioReceived", UNEXPECTED_RESPONSE: "UnexpectedResponse", UNKNOWN_RESPONSE: "UnknownResponse", WEBSOCKET_ERROR: "WebSocketError", SKEW_ADJUSTMENT_ERROR: "SkewAdjustmentError", }); /** * Enhanced error handling and diagnostic constants */ exports.ERROR_HANDLING = Object.freeze({ EXCEPTION_CATEGORIES: { AUTHENTICATION: ["SKEW_ADJUSTMENT_ERROR", "WEBSOCKET_ERROR"], NETWORK: ["WEBSOCKET_ERROR", "UNEXPECTED_RESPONSE"], PROTOCOL: ["UNKNOWN_RESPONSE", "UNEXPECTED_RESPONSE"], AUDIO: ["NO_AUDIO_RECEIVED"], }, RETRY_STRATEGIES: { AUTHENTICATION_ERRORS: { MAX_RETRIES: 2, BACKOFF_MS: [1000, 2000], // Exponential backoff RESET_TOKEN: true, // Regenerate Sec-MS-GEC token }, 
NETWORK_ERRORS: { MAX_RETRIES: 3, BACKOFF_MS: [500, 1000, 2000], RESET_CONNECTION: true, }, PROTOCOL_ERRORS: { MAX_RETRIES: 1, // Protocol errors rarely benefit from retry BACKOFF_MS: [1000], RESET_CONNECTION: true, }, }, DIAGNOSTIC_INFO: { INCLUDE_HEADERS: true, // Include WebSocket headers in error info INCLUDE_TIMESTAMP: true, // Include timing information INCLUDE_CONNECTION_ID: true, // Include connection tracking info TRUNCATE_LARGE_PAYLOADS: 1000, // Truncate payloads larger than 1KB }, }); /** * Protocol compliance validation constants */ exports.PROTOCOL_COMPLIANCE = Object.freeze({ WEBSOCKET_SUBPROTOCOLS: [], // Edge TTS doesn't use subprotocols REQUIRED_MESSAGE_HEADERS: [ "X-RequestId", "X-Timestamp", "Content-Type", "Path", ], OPTIONAL_MESSAGE_HEADERS: ["Content-Length"], // May be present in some messages HEADER_VALIDATION: { REQUEST_ID_FORMAT: /^[a-f0-9]{32}$/, // 32-character hex (connection ID) TIMESTAMP_FORMAT: /^\d{16}Z$/, // 16-digit microsecond timestamp + Z CONTENT_TYPE_STRICT: true, // Validate content-type matches expected values PATH_CASE_SENSITIVE: true, // Path validation is case-sensitive }, MESSAGE_ORDER: { REQUIRED_SEQUENCE: ["speech.config", "ssml"], // Required message order EXPECTED_RESPONSES: ["turn.start", "audio.metadata", "audio", "turn.end"], VALIDATE_ORDER: true, // Enforce message ordering }, }); /** * Binary message parsing format constants */ exports.BINARY_PARSING = Object.freeze({ HEADER_LENGTH_BYTES: 2, HEADER_ENCODING: "big", // big-endian for Int16 (corrected from protocol analysis) CONTENT_TYPE_AUDIO: "audio/mpeg", PATH_AUDIO: "audio", });