@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
273 lines (272 loc) • 8.56 kB
TypeScript
/**
* Voice and Speech Type Definitions for NeuroLink
*
* Core voice types: capabilities, provider config, audio utilities,
* events, and provider abstractions.
*
* STT types are in ./stt.ts
* Realtime types are in ./realtime.ts
* TTS types are in ./tts.ts
*
* @module types/voice
*/
export * from "./tts.js";
export * from "./stt.js";
export * from "./realtime.js";
import type { TTSAudioFormat, TTSOptions, TTSResult, TTSVoice } from "./tts.js";
import type { TTSHandler } from "./common.js";
import type { STTResult, STTHandler } from "./stt.js";
import type { RealtimeHandler } from "./realtime.js";
/**
 * Capability tag a voice provider can advertise.
 *
 * Closed set of four string literals.
 */
export type VoiceCapability =
  | "tts"
  | "stt"
  | "realtime"
  | "streaming";
/**
 * Kind of voice provider.
 *
 * Closed set of three string literals (the same tags as `VoiceCapability`
 * without `"streaming"`).
 */
export type VoiceProviderType =
  | "tts"
  | "stt"
  | "realtime";
/**
 * Identifier of a known voice provider.
 *
 * Closed union of string literals; one member per line so additions show up
 * as single-line diffs.
 */
export type VoiceProviderName =
  | "google-tts"
  | "elevenlabs"
  | "openai-tts"
  | "azure-tts"
  | "sarvam"
  | "murf"
  | "playai"
  | "speechify"
  | "cartesia"
  | "deepgram"
  | "gladia"
  | "whisper"
  | "assemblyai"
  | "google-stt"
  | "azure-stt"
  | "openai-realtime"
  | "gemini-live";
/**
 * Configuration shared by all voice providers.
 *
 * Only `name` is required; every other field is optional.
 */
export type VoiceProviderConfig = {
  /** Identifier of the provider (free-form string, not restricted to `VoiceProviderName`). */
  name: string;

  /** API key or other credential string. */
  apiKey?: string;

  /** Custom endpoint URL to use for the provider. */
  baseUrl?: string;

  /** Per-request timeout, in milliseconds. */
  timeout?: number;

  /** Upper bound on retries for failed requests. */
  maxRetries?: number;

  /** Bag of provider-specific settings; values are untyped (`unknown`). */
  options?: Record<string, unknown>;
};
/**
 * Descriptive record for a single audio format.
 *
 * All fields are required.
 */
export type AudioFormatDetails = {
  /** The format this record describes. */
  format: TTSAudioFormat;

  /** MIME type string for the format. */
  mimeType: string;

  /** File extension for the format. */
  extension: string;

  /** `true` when the format can be streamed. */
  supportsStreaming: boolean;

  /** Sample rates typically used with this format. */
  sampleRates: number[];

  /** Bit depths associated with this format. */
  bitDepths: number[];
};
/**
 * Options describing a requested audio conversion.
 *
 * Only `targetFormat` is required; the remaining fields are optional.
 */
export type AudioConversionOptions = {
  /** Format to convert to. */
  targetFormat: TTSAudioFormat;

  /** Desired sample rate of the output. */
  sampleRate?: number;

  /** Desired bit depth of the output. */
  bitDepth?: number;

  /** Desired channel count of the output. */
  channels?: number;

  /** Whether the audio level should be normalized. */
  normalize?: boolean;
};
/**
 * One chunk of audio in a streaming operation.
 *
 * Every field is required.
 */
export type AudioStreamChunk = {
  /** Raw audio bytes carried by this chunk. */
  data: Buffer;

  /** Index of this chunk. */
  index: number;

  /** `true` on the last chunk of the stream. */
  isFinal: boolean;

  /** Audio format of `data`. */
  format: TTSAudioFormat;

  /** Sample rate of `data`. */
  sampleRate: number;

  /** Timestamp offset of this chunk, in milliseconds. */
  timestampMs: number;

  /** Length of this chunk, in milliseconds. */
  durationMs: number;
};
/**
 * Event names used by event-driven voice flows.
 *
 * Names are dot-separated and fall into three prefixes:
 * `synthesis.*`, `transcription.*` and `realtime.*`.
 */
export type VoiceEventType =
  // synthesis.*
  | "synthesis.started"
  | "synthesis.progress"
  | "synthesis.completed"
  | "synthesis.error"
  // transcription.*
  | "transcription.started"
  | "transcription.partial"
  | "transcription.completed"
  | "transcription.error"
  // realtime.*
  | "realtime.connected"
  | "realtime.audio.received"
  | "realtime.text.received"
  | "realtime.disconnected"
  | "realtime.error";
/**
 * Envelope for a single voice event.
 *
 * @typeParam T - Type of the `data` payload; defaults to `unknown`, so
 *   consumers must narrow before use unless they supply a concrete type.
 */
export type VoiceEvent<T = unknown> = {
  /** Which event this is. */
  type: VoiceEventType;

  /** Timestamp attached to the event. */
  timestamp: Date;

  /** Provider the event is associated with. */
  provider: VoiceProviderName;

  /** Event payload. */
  data: T;

  /** Optional free-form extra information; values are untyped. */
  metadata?: Record<string, unknown>;
};
/**
 * Result of a voice operation: either a TTS result or an STT result.
 */
export type VoiceResult =
  | TTSResult
  | STTResult;
/**
 * A single turn in a voice conversation.
 */
export type VoiceTurn = {
  /** Speaker of this turn. */
  role: "user" | "assistant";

  /** Text of the turn. */
  text: string;

  /** Audio bytes for the turn, when available. */
  audio?: Buffer;

  /** Timestamp attached to the turn. */
  timestamp: Date;

  /**
   * Optional extra information about the turn.
   *
   * The named fields below are typed; the index signature additionally
   * admits arbitrary other keys with `unknown` values.
   */
  metadata?: {
    duration?: number;
    confidence?: number;
    language?: string;
    provider?: string;
    voice?: string;
    [key: string]: unknown;
  };
};
/**
 * Contract for a provider that can turn text into speech.
 *
 * `synthesize`, `getVoices` and `maxTextLength` are mandatory;
 * `synthesizeStream` is optional, so callers must check for its presence.
 */
export type TTSProvider = {
  /**
   * Convert `text` to speech.
   *
   * @param text - Text to synthesize.
   * @param options - Synthesis options.
   * @returns Promise resolving to the synthesis result.
   */
  synthesize(text: string, options: TTSOptions): Promise<TTSResult>;

  /**
   * Convert `text` to speech, yielding audio chunk by chunk.
   *
   * @param text - Text to synthesize.
   * @param options - Synthesis options.
   * @returns Async iterable of stream chunks.
   */
  synthesizeStream?(text: string, options: TTSOptions): AsyncIterable<TTSStreamChunk>;

  /**
   * List the voices the provider offers.
   *
   * @param languageCode - Optional language code.
   *   NOTE(review): presumably narrows the result to that language — confirm
   *   against the implementations.
   * @returns Promise resolving to the available voices.
   */
  getVoices(languageCode?: string): Promise<TTSVoice[]>;

  /** Maximum text length the provider supports. */
  readonly maxTextLength: number;
};
/**
 * One chunk produced by streaming synthesis.
 *
 * `data`, `index`, `isFinal` and `format` are required; `sampleRate` and
 * `timestampMs` are optional. Note that `format` is a plain `string` here.
 */
export type TTSStreamChunk = {
  /** Audio bytes of this chunk. */
  data: Buffer;

  /** Sequence number of this chunk. */
  index: number;

  /** `true` on the last chunk of the stream. */
  isFinal: boolean;

  /** Audio format of `data`. */
  format: string;

  /** Sample rate of `data`, when known. */
  sampleRate?: number;

  /** Offset of this chunk within the audio, in milliseconds. */
  timestampMs?: number;
};
/**
 * General voice error codes.
 *
 * Ambient declaration only: this file describes the constant's shape and
 * does not define its value. Each property is `readonly` and typed as a
 * string literal equal to its key prefixed with `VOICE_`.
 */
export declare const VOICE_ERROR_CODES: {
  // Provider / configuration / lifecycle
  readonly PROVIDER_NOT_FOUND: "VOICE_PROVIDER_NOT_FOUND";
  readonly INVALID_CONFIGURATION: "VOICE_INVALID_CONFIGURATION";
  readonly INITIALIZATION_FAILED: "VOICE_INITIALIZATION_FAILED";
  readonly OPERATION_CANCELLED: "VOICE_OPERATION_CANCELLED";
  readonly PROVIDER_NOT_CONFIGURED: "VOICE_PROVIDER_NOT_CONFIGURED";
  readonly PROVIDER_NOT_SUPPORTED: "VOICE_PROVIDER_NOT_SUPPORTED";
  readonly FEATURE_NOT_SUPPORTED: "VOICE_FEATURE_NOT_SUPPORTED";

  // TTS_*
  readonly TTS_EMPTY_TEXT: "VOICE_TTS_EMPTY_TEXT";
  readonly TTS_TEXT_TOO_LONG: "VOICE_TTS_TEXT_TOO_LONG";
  readonly TTS_SYNTHESIS_FAILED: "VOICE_TTS_SYNTHESIS_FAILED";

  // STT_*
  readonly STT_EMPTY_AUDIO: "VOICE_STT_EMPTY_AUDIO";
  readonly STT_INVALID_FORMAT: "VOICE_STT_INVALID_FORMAT";
  readonly STT_TRANSCRIPTION_FAILED: "VOICE_STT_TRANSCRIPTION_FAILED";

  // REALTIME_*
  readonly REALTIME_CONNECTION_FAILED: "VOICE_REALTIME_CONNECTION_FAILED";
  readonly REALTIME_SESSION_ERROR: "VOICE_REALTIME_SESSION_ERROR";

  // Transport
  readonly NETWORK_ERROR: "VOICE_NETWORK_ERROR";
  readonly TIMEOUT: "VOICE_TIMEOUT";
};
/**
 * Lookup table from audio format to its descriptive details.
 *
 * Ambient declaration only. The table is `Partial`, so a given
 * `TTSAudioFormat` key may have no entry; handle `undefined` on lookup.
 */
export declare const AUDIO_FORMAT_DETAILS: Partial<
  Record<TTSAudioFormat, AudioFormatDetails>
>;
import type { ErrorCategory, ErrorSeverity } from "../constants/enums.js";
/**
 * Options bag describing a voice error.
 *
 * `code` and `message` are required; everything else is optional.
 */
export type VoiceErrorOptions = {
  /** Error code string. */
  code: string;

  /** Error message. */
  message: string;

  /** Error category. */
  category?: ErrorCategory;

  /** Error severity. */
  severity?: ErrorSeverity;

  /** Retriable flag. NOTE(review): presumably signals that a retry may succeed — confirm. */
  retriable?: boolean;

  /** Free-form contextual data; values are untyped. */
  context?: Record<string, unknown>;

  /** Underlying error, if this one wraps another. */
  originalError?: Error;

  /** Provider associated with the error (free-form string). */
  provider?: string;
};
/**
 * Descriptive properties of a piece of audio.
 *
 * All fields are required. Units are not stated in this declaration.
 * NOTE(review): confirm units of `duration` and `size` against the producer.
 */
export type AudioMetadata = {
  /** Audio format. */
  format: TTSAudioFormat;

  /** Duration of the audio. */
  duration: number;

  /** Sample rate. */
  sampleRate: number;

  /** Channel count. */
  channels: number;

  /** Bit depth. */
  bitDepth: number;

  /** Sample count. */
  samples: number;

  /** Size of the audio data. */
  size: number;
};
/**
 * Configuration for a stream handler.
 *
 * Every field is optional. Defaults are not defined in this declaration.
 */
export type StreamHandlerConfig = {
  /** Chunk duration (milliseconds, per the `Ms` suffix). */
  chunkDurationMs?: number;

  /** Sample rate. */
  sampleRate?: number;

  /** Bytes per audio sample. */
  bytesPerSample?: number;

  /** Audio format. */
  format?: TTSAudioFormat;

  /** High-water mark. NOTE(review): units (bytes vs. chunks) not visible here — confirm. */
  highWaterMark?: number;

  /** Buffer timeout (milliseconds, per the `Ms` suffix). */
  bufferTimeoutMs?: number;
};
/**
 * Map from stream event name to the listener signature for that event.
 *
 * Only `chunk` and `error` listeners receive an argument; the others are
 * called with none. All listeners return `void`.
 */
export type StreamEvents = {
  /** Listener receiving an audio chunk. */
  chunk: (chunk: AudioStreamChunk) => void;

  /** Listener for the `end` event. */
  end: () => void;

  /** Listener receiving an error. */
  error: (error: Error) => void;

  /** Listener for the `drain` event. */
  drain: () => void;

  /** Listener for the `pause` event. */
  pause: () => void;

  /** Listener for the `resume` event. */
  resume: () => void;
};
/**
 * Any voice handler: a TTS, STT, or realtime handler.
 */
export type VoiceHandler =
  | TTSHandler
  | STTHandler
  | RealtimeHandler;
/**
 * TTS options for Azure: everything in `TTSOptions` plus the optional
 * Azure-specific fields below.
 */
export type AzureTTSOptions = TTSOptions & {
  /** SSML toggle. NOTE(review): exact effect is not visible in this declaration — confirm. */
  useSSML?: boolean;

  /** SSML template string. */
  ssmlTemplate?: string;

  /** Output format identifier (free-form string). */
  outputFormat?: string;

  /** Word-boundary toggle. NOTE(review): exact effect is not visible in this declaration — confirm. */
  wordBoundary?: boolean;

  /**
   * Pass `text` through as raw SSML when it begins with `<speak`.
   *
   * **Security:** raw SSML can change voice, embed external content, or
   * inject markup. Enable this only when `text` comes from a TRUSTED source
   * (a template built by your own server, never end-user input). While the
   * flag is false (the default), all input — including text that starts
   * with `<speak` — is XML-escaped, which prevents SSML injection.
   *
   * @default false
   */
  allowRawSSML?: boolean;
};
/**
 * ElevenLabs model identifiers accepted by `ElevenLabsTTSOptions.model`.
 */
export type ElevenLabsModel =
  | "eleven_multilingual_v2"
  | "eleven_turbo_v2_5"
  | "eleven_turbo_v2"
  | "eleven_monolingual_v1";
/**
 * TTS options for ElevenLabs: everything in `TTSOptions` plus the optional
 * ElevenLabs-specific fields below. Valid numeric ranges are not defined in
 * this declaration.
 */
export type ElevenLabsTTSOptions = TTSOptions & {
  /** Model to use. */
  model?: ElevenLabsModel;

  /** Stability setting. */
  stability?: number;

  /** Similarity-boost setting. */
  similarityBoost?: number;

  /** Style setting. */
  style?: number;

  /** Speaker-boost toggle. */
  useSpeakerBoost?: boolean;
};
/**
 * Google voice type names accepted by `GoogleTTSOptions.voiceType`.
 */
export type GoogleVoiceType =
  | "Standard"
  | "WaveNet"
  | "Neural2"
  | "Studio"
  | "Polyglot";
/**
 * TTS options for Google: everything in `TTSOptions` plus the optional
 * Google-specific fields below.
 */
export type GoogleTTSOptions = TTSOptions & {
  /** Voice type to use. */
  voiceType?: GoogleVoiceType;

  /** Sample rate in hertz. */
  sampleRateHertz?: number;

  /** List of effects profile identifiers. */
  effectsProfileId?: string[];
};
/**
 * OpenAI voice names.
 */
export type OpenAIVoice =
  | "alloy"
  | "echo"
  | "fable"
  | "onyx"
  | "nova"
  | "shimmer";
/**
 * OpenAI TTS model identifiers accepted by `OpenAITTSOptions.model`.
 */
export type OpenAITTSModel =
  | "tts-1"
  | "tts-1-hd";
/**
 * TTS options for OpenAI: everything in `TTSOptions` plus an optional
 * model selector.
 */
export type OpenAITTSOptions = TTSOptions & {
  /** Model to use. */
  model?: OpenAITTSModel;
};