UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers are discoverable, with multimodal file processing and RAG pipelines. Build, test, and deploy AI applications.

273 lines (272 loc) 8.56 kB
/**
 * Voice and Speech Type Definitions for NeuroLink
 *
 * This module holds the shared voice types: capability and provider
 * identifiers, provider configuration, audio helper shapes, events,
 * and the provider abstraction.
 *
 * Types for individual domains live in sibling modules and are
 * re-exported from here:
 *   - TTS types:      ./tts.ts
 *   - STT types:      ./stt.ts
 *   - Realtime types: ./realtime.ts
 *
 * @module types/voice
 */
export * from "./tts.js";
export * from "./stt.js";
export * from "./realtime.js";

import type { ErrorCategory, ErrorSeverity } from "../constants/enums.js";
import type { TTSHandler } from "./common.js";
import type { RealtimeHandler } from "./realtime.js";
import type { STTHandler, STTResult } from "./stt.js";
import type {
  TTSAudioFormat,
  TTSOptions,
  TTSResult,
  TTSVoice,
} from "./tts.js";

/**
 * Capabilities a voice provider may advertise.
 */
export type VoiceCapability =
  | "tts"
  | "stt"
  | "realtime"
  | "streaming";

/**
 * Kinds of voice provider.
 */
export type VoiceProviderType =
  | "tts"
  | "stt"
  | "realtime";

/**
 * Union of the known voice provider identifiers.
 */
export type VoiceProviderName =
  | "google-tts"
  | "elevenlabs"
  | "openai-tts"
  | "azure-tts"
  | "sarvam"
  | "murf"
  | "playai"
  | "speechify"
  | "cartesia"
  | "deepgram"
  | "gladia"
  | "whisper"
  | "assemblyai"
  | "google-stt"
  | "azure-stt"
  | "openai-realtime"
  | "gemini-live";

/**
 * Configuration fields common to every voice provider.
 *
 * Note that `name` is a free-form string here rather than
 * {@link VoiceProviderName}.
 */
export type VoiceProviderConfig = {
  /** Identifier of the provider */
  name: string;
  /** API key or other credentials */
  apiKey?: string;
  /** Override for the endpoint URL */
  baseUrl?: string;
  /** Request timeout, in milliseconds */
  timeout?: number;
  /** Upper bound on retries for failed requests */
  maxRetries?: number;
  /** Free-form, provider-specific settings */
  options?: Record<string, unknown>;
};

/**
 * Descriptive details about one audio format.
 */
export type AudioFormatDetails = {
  /** Name of the format */
  format: TTSAudioFormat;
  /** MIME type */
  mimeType: string;
  /** File extension */
  extension: string;
  /** True when the format can be streamed */
  supportsStreaming: boolean;
  /** Sample rates typically used with the format */
  sampleRates: number[];
  /** Bit depths used with the format */
  bitDepths: number[];
};

/**
 * Options controlling an audio conversion.
 */
export type AudioConversionOptions = {
  /** Format to convert to */
  targetFormat: TTSAudioFormat;
  /** Sample rate to convert to */
  sampleRate?: number;
  /** Bit depth to convert to */
  bitDepth?: number;
  /** Channel count */
  channels?: number;
  /** Whether to normalize the audio level */
  normalize?: boolean;
};

/**
 * One chunk of audio in a streaming operation.
 */
export type AudioStreamChunk = {
  /** Raw audio bytes */
  data: Buffer;
  /** Index of this chunk */
  index: number;
  /** True for the last chunk of the stream */
  isFinal: boolean;
  /** Format of the audio */
  format: TTSAudioFormat;
  /** Sample rate of the audio */
  sampleRate: number;
  /** Offset of this chunk, in milliseconds */
  timestampMs: number;
  /** Length of this chunk, in milliseconds */
  durationMs: number;
};

/**
 * Event names used by event-driven voice flows, grouped by
 * synthesis, transcription, and realtime prefixes.
 */
export type VoiceEventType =
  | "synthesis.started"
  | "synthesis.progress"
  | "synthesis.completed"
  | "synthesis.error"
  | "transcription.started"
  | "transcription.partial"
  | "transcription.completed"
  | "transcription.error"
  | "realtime.connected"
  | "realtime.audio.received"
  | "realtime.text.received"
  | "realtime.disconnected"
  | "realtime.error";

/**
 * Envelope for a voice event.
 *
 * @typeParam T - Type of the `data` payload; `unknown` unless specified.
 */
export type VoiceEvent<T = unknown> = {
  type: VoiceEventType;
  timestamp: Date;
  provider: VoiceProviderName;
  data: T;
  metadata?: Record<string, unknown>;
};

/**
 * Result of a voice operation: either a TTS result or an STT result.
 */
export type VoiceResult = TTSResult | STTResult;

/**
 * A single turn in a voice conversation.
 */
export type VoiceTurn = {
  role: "user" | "assistant";
  text: string;
  audio?: Buffer;
  timestamp: Date;
  /** Optional details; keys beyond the named ones are permitted. */
  metadata?: {
    duration?: number;
    confidence?: number;
    language?: string;
    provider?: string;
    voice?: string;
    [key: string]: unknown;
  };
};

/**
 * One chunk produced by streaming synthesis.
 *
 * Unlike {@link AudioStreamChunk}, `format` is a plain string and
 * `sampleRate` / `timestampMs` are optional.
 */
export type TTSStreamChunk = {
  /** Chunk of audio bytes */
  data: Buffer;
  /** Sequence number of the chunk */
  index: number;
  /** True for the last chunk of the stream */
  isFinal: boolean;
  /** Format of the audio */
  format: string;
  /** Sample rate of the audio */
  sampleRate?: number;
  /** Offset within the audio, in milliseconds */
  timestampMs?: number;
};

/**
 * Shape of a provider that can perform text-to-speech.
 */
export type TTSProvider = {
  /**
   * Convert text to speech.
   */
  synthesize(text: string, options: TTSOptions): Promise<TTSResult>;
  /**
   * Produce synthesized audio as a sequence of chunks.
   * Optional: a provider may omit this member.
   */
  synthesizeStream?(
    text: string,
    options: TTSOptions,
  ): AsyncIterable<TTSStreamChunk>;
  /**
   * List the voices that are available.
   */
  getVoices(languageCode?: string): Promise<TTSVoice[]>;
  /**
   * Longest text the provider supports.
   */
  readonly maxTextLength: number;
};

/**
 * General voice error codes. Every value carries a `VOICE_` prefix;
 * the keys fall into general, TTS, STT, realtime, and network/timeout
 * groups.
 */
export declare const VOICE_ERROR_CODES: {
  readonly PROVIDER_NOT_FOUND: "VOICE_PROVIDER_NOT_FOUND";
  readonly INVALID_CONFIGURATION: "VOICE_INVALID_CONFIGURATION";
  readonly INITIALIZATION_FAILED: "VOICE_INITIALIZATION_FAILED";
  readonly OPERATION_CANCELLED: "VOICE_OPERATION_CANCELLED";
  readonly PROVIDER_NOT_CONFIGURED: "VOICE_PROVIDER_NOT_CONFIGURED";
  readonly PROVIDER_NOT_SUPPORTED: "VOICE_PROVIDER_NOT_SUPPORTED";
  readonly FEATURE_NOT_SUPPORTED: "VOICE_FEATURE_NOT_SUPPORTED";
  readonly TTS_EMPTY_TEXT: "VOICE_TTS_EMPTY_TEXT";
  readonly TTS_TEXT_TOO_LONG: "VOICE_TTS_TEXT_TOO_LONG";
  readonly TTS_SYNTHESIS_FAILED: "VOICE_TTS_SYNTHESIS_FAILED";
  readonly STT_EMPTY_AUDIO: "VOICE_STT_EMPTY_AUDIO";
  readonly STT_INVALID_FORMAT: "VOICE_STT_INVALID_FORMAT";
  readonly STT_TRANSCRIPTION_FAILED: "VOICE_STT_TRANSCRIPTION_FAILED";
  readonly REALTIME_CONNECTION_FAILED: "VOICE_REALTIME_CONNECTION_FAILED";
  readonly REALTIME_SESSION_ERROR: "VOICE_REALTIME_SESSION_ERROR";
  readonly NETWORK_ERROR: "VOICE_NETWORK_ERROR";
  readonly TIMEOUT: "VOICE_TIMEOUT";
};

/**
 * Details for the supported audio formats, keyed by format.
 * The record is partial, so a lookup for a given format may be undefined.
 */
export declare const AUDIO_FORMAT_DETAILS: Partial<
  Record<TTSAudioFormat, AudioFormatDetails>
>;

/**
 * Fields describing a voice error. Only `code` and `message` are required.
 */
export type VoiceErrorOptions = {
  code: string;
  message: string;
  category?: ErrorCategory;
  severity?: ErrorSeverity;
  retriable?: boolean;
  context?: Record<string, unknown>;
  originalError?: Error;
  provider?: string;
};

/**
 * Descriptive properties of a piece of audio.
 * NOTE(review): the units of `duration` and `size` are not stated in
 * this module; confirm against the code that produces this type.
 */
export type AudioMetadata = {
  format: TTSAudioFormat;
  duration: number;
  sampleRate: number;
  channels: number;
  bitDepth: number;
  samples: number;
  size: number;
};

/**
 * Optional settings for a stream handler.
 */
export type StreamHandlerConfig = {
  chunkDurationMs?: number;
  sampleRate?: number;
  bytesPerSample?: number;
  format?: TTSAudioFormat;
  highWaterMark?: number;
  bufferTimeoutMs?: number;
};

/**
 * Listener signatures for stream events, keyed by event name.
 */
export type StreamEvents = {
  chunk: (chunk: AudioStreamChunk) => void;
  end: () => void;
  error: (error: Error) => void;
  drain: () => void;
  pause: () => void;
  resume: () => void;
};

/**
 * Any voice handler: TTS, STT, or realtime.
 */
export type VoiceHandler = TTSHandler | STTHandler | RealtimeHandler;

/**
 * TTS options extended with Azure-specific settings.
 */
export type AzureTTSOptions = TTSOptions & {
  useSSML?: boolean;
  ssmlTemplate?: string;
  outputFormat?: string;
  wordBoundary?: boolean;
  /**
   * When enabled, `text` that begins with `<speak` is passed through
   * as raw SSML.
   *
   * **Security:** raw SSML can change the voice, embed external
   * content, or inject markup. Enable this only when `text` comes from
   * a TRUSTED source (a template built by your own server, not
   * end-user input). While the flag is false (the default), all input,
   * including text starting with `<speak`, is XML-escaped, which
   * prevents SSML injection.
   *
   * @default false
   */
  allowRawSSML?: boolean;
};

/**
 * ElevenLabs model identifiers.
 */
export type ElevenLabsModel =
  | "eleven_multilingual_v2"
  | "eleven_turbo_v2_5"
  | "eleven_turbo_v2"
  | "eleven_monolingual_v1";

/**
 * TTS options extended with ElevenLabs-specific settings.
 */
export type ElevenLabsTTSOptions = TTSOptions & {
  model?: ElevenLabsModel;
  stability?: number;
  similarityBoost?: number;
  style?: number;
  useSpeakerBoost?: boolean;
};

/**
 * Google voice type identifiers.
 */
export type GoogleVoiceType =
  | "Standard"
  | "WaveNet"
  | "Neural2"
  | "Studio"
  | "Polyglot";

/**
 * TTS options extended with Google-specific settings.
 */
export type GoogleTTSOptions = TTSOptions & {
  voiceType?: GoogleVoiceType;
  sampleRateHertz?: number;
  effectsProfileId?: string[];
};

/**
 * OpenAI voice identifiers.
 */
export type OpenAIVoice =
  | "alloy"
  | "echo"
  | "fable"
  | "onyx"
  | "nova"
  | "shimmer";

/**
 * OpenAI TTS model identifiers.
 */
export type OpenAITTSModel = "tts-1" | "tts-1-hd";

/**
 * TTS options extended with an OpenAI model selection.
 */
export type OpenAITTSOptions = TTSOptions & {
  model?: OpenAITTSModel;
};