UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

586 lines (585 loc) 16.6 kB
/** * Speech-to-Text (STT) Type Definitions for NeuroLink * * All STT-specific types: options, results, handlers, * provider-specific options, error codes, defaults, and type guards. * * @module types/stt */ import type { TTSAudioFormat } from "./tts.js"; /** * STT configuration options */ export type STTOptions = { /** Enable STT processing */ enabled?: boolean; /** Override STT provider */ provider?: string; /** Language code for transcription (e.g., "en-US") */ language?: string; /** Audio format of input */ format?: TTSAudioFormat; /** Sample rate in Hz */ sampleRate?: number; /** Enable punctuation in transcription */ punctuation?: boolean; /** Enable punctuation (alias) */ punctuate?: boolean; /** Enable profanity filter */ profanityFilter?: boolean; /** Enable speaker diarization */ speakerDiarization?: boolean; /** Enable speaker diarization (alias) */ diarization?: boolean; /** Number of speakers (for diarization) */ speakerCount?: number; /** Enable word-level timestamps */ wordTimestamps?: boolean; /** Model variant to use */ model?: string; /** Custom vocabulary/phrases */ vocabulary?: string[]; /** Minimum confidence threshold */ confidenceThreshold?: number; /** * Maximum audio buffer size in bytes. STTProcessor rejects buffers over * this limit before any provider call, preventing OOM on multi-GB inputs. * Default: 25_000_000 (matches Whisper's documented 25MB ceiling). */ maxAudioBytes?: number; }; /** * STT result from transcription */ export type STTResult = { /** Full transcribed text */ text: string; /** Confidence score (0-1) */ confidence: number; /** Detected language code */ language?: string; /** Audio duration in seconds */ duration?: number; /** Word-level timings */ words?: WordTiming[]; /** Transcription segments */ segments?: TranscriptionSegment[]; /** Speaker labels (for diarization) */ speakers?: string[]; /** Performance metadata */ metadata?: { /** Processing latency in milliseconds */ latency: number; /** Provider name */ provider?: string; /** Model used */ model?: string; /** Additional provider-specific metadata */ [key: string]: unknown; }; }; /** * STT language information */ export type STTLanguage = { /** Language code (e.g., "en-US") */ code: string; /** Language name */ name: string; /** Whether the language supports speaker diarization */ supportsDiarization?: boolean; /** Whether the language supports punctuation */ supportsPunctuation?: boolean; }; /** * Word-level timing information */ export type WordTiming = { /** The word */ word: string; /** Start time in seconds */ startTime?: number; /** Start time alias */ start?: number; /** End time in seconds */ endTime?: number; /** End time alias */ end?: number; /** Confidence score (0-1) */ confidence?: number; /** Speaker label (for diarization) */ speaker?: string; }; /** * Transcription segment for streaming STT */ export type TranscriptionSegment = { /** Segment index */ index?: number; /** Transcribed text */ text: string; /** Whether this is a final result */ isFinal: boolean; /** Confidence score (0-1) */ confidence?: number; /** Start time in audio (seconds) */ startTime?: number; /** Start time (alias for startTime) */ start?: number; /** End time in audio (seconds) */ endTime?: number; /** End time (alias for endTime) */ end?: number; /** Word-level timings */ words?: WordTiming[]; /** Speaker label */ speaker?: string; /** Detected language */ language?: string; }; export type STTHandler = { transcribe(audio: Buffer | ArrayBuffer, options: STTOptions): Promise<STTResult>; transcribeStream?(audioStream: AsyncIterable<Buffer>, options: STTOptions): AsyncIterable<TranscriptionSegment>; getSupportedLanguages?(): Promise<STTLanguage[]>; getSupportedFormats(): TTSAudioFormat[]; isConfigured(): boolean; maxAudioDuration?: number; supportsStreaming?: boolean; }; /** * STT error codes */ export declare const STT_ERROR_CODES: { readonly AUDIO_EMPTY: "STT_AUDIO_EMPTY"; readonly AUDIO_TOO_LONG: "STT_AUDIO_TOO_LONG"; readonly INVALID_AUDIO_FORMAT: "STT_INVALID_AUDIO_FORMAT"; readonly LANGUAGE_NOT_SUPPORTED: "STT_LANGUAGE_NOT_SUPPORTED"; readonly TRANSCRIPTION_FAILED: "STT_TRANSCRIPTION_FAILED"; readonly PROVIDER_NOT_CONFIGURED: "STT_PROVIDER_NOT_CONFIGURED"; readonly PROVIDER_NOT_SUPPORTED: "STT_PROVIDER_NOT_SUPPORTED"; readonly STREAM_ERROR: "STT_STREAM_ERROR"; readonly STREAMING_NOT_SUPPORTED: "STT_STREAMING_NOT_SUPPORTED"; }; /** * Default STT options */ export declare const DEFAULT_STT_OPTIONS: Required<Pick<STTOptions, "language" | "punctuation" | "profanityFilter" | "sampleRate">>; /** * Type guard for STTResult */ export declare function isSTTResult(value: unknown): value is STTResult; /** * Type guard for valid STTOptions */ export declare function isValidSTTOptions(options: unknown): options is STTOptions; /** * Type guard for TranscriptionSegment */ export declare function isTranscriptionSegment(value: unknown): value is TranscriptionSegment; export type AzureRecognitionMode = "interactive" | "conversation" | "dictation"; export type AzureOutputFormat = "simple" | "detailed"; export type AzureSTTOptions = STTOptions & { recognitionMode?: AzureRecognitionMode; outputFormat?: AzureOutputFormat; interimResults?: boolean; endpointId?: string; /** Custom endpoint ID (alias for endpointId) */ customEndpointId?: string; connectionTimeout?: number; silenceTimeout?: number; profanityOption?: "masked" | "removed" | "raw"; /** Profanity mode (alias for profanityOption) */ profanityMode?: "masked" | "removed" | "raw"; initialSilenceTimeout?: number; enableLogging?: boolean; phraseList?: string[]; /** Whether to request detailed output format */ detailed?: boolean; wordLevelConfidence?: boolean; initialSilenceTimeoutMs?: number; endSilenceTimeoutMs?: number; }; export type DeepgramModel = "nova-2" | "nova-2-general" | "nova-2-meeting" | "nova-2-phonecall" | "nova-2-voicemail" | "nova-2-finance" | "nova-2-medical" | "nova" | "enhanced" | "base"; export type DeepgramSTTOptions = STTOptions & { model?: DeepgramModel | "nova-3"; smartFormat?: boolean; search?: string[]; replace?: Array<{ find: string; replace: string; }>; utterances?: boolean; utterSplit?: number; /** Alias for utterSplit (legacy field name) */ uttSplit?: number; paragraphs?: boolean; keywords?: string[]; keywordBoost?: "legacy" | "medium" | "high"; fillerWords?: boolean; detectTopics?: boolean; detectEntities?: boolean; summarize?: boolean; redact?: ("pci" | "numbers" | "ssn")[]; }; export type GoogleSTTModel = "latest_short" | "latest_long" | "telephony" | "medical_conversation" | "medical_dictation" | "command_and_search" | "phone_call" | "video" | "default"; export type GoogleSTTAudioEncoding = "ENCODING_UNSPECIFIED" | "LINEAR16" | "FLAC" | "MULAW" | "AMR" | "AMR_WB" | "OGG_OPUS" | "SPEEX_WITH_HEADER_BYTE" | "MP3" | "WEBM_OPUS"; export type GoogleSTTOptions = STTOptions & { model?: GoogleSTTModel; encoding?: GoogleSTTAudioEncoding; sampleRateHertz?: number; audioChannelCount?: number; enableSeparateRecognitionPerChannel?: boolean; alternativeLanguageCodes?: string[]; maxAlternatives?: number; enableAutomaticPunctuation?: boolean; enableSpokenPunctuation?: boolean; enableSpokenEmojis?: boolean; speechContexts?: Array<{ phrases: string[]; boost?: number; }>; adaptation?: { phraseSets?: string[]; customClasses?: string[]; }; useEnhanced?: boolean; keywords?: string[]; }; export type WhisperModel = "whisper-1"; export type WhisperSTTOptions = STTOptions & { model?: WhisperModel; responseFormat?: "json" | "text" | "srt" | "verbose_json" | "vtt"; temperature?: number; prompt?: string; /** Translate audio to English instead of transcribing in original language */ translate?: boolean; }; export type AzureWord = { Word: string; Offset: number; Duration: number; Confidence?: number; }; export type AzureNBest = { Confidence: number; Lexical: string; ITN: string; MaskedITN: string; Display: string; Words?: AzureWord[]; }; export type AzureRecognitionResult = { RecognitionStatus: "Success" | "NoMatch" | "InitialSilenceTimeout" | "BabbleTimeout" | "Error" | string; Offset?: number; Duration?: number; DisplayText?: string; NBest?: AzureNBest[]; }; export type AzureSpeakerRecognitionResult = AzureRecognitionResult & { SpeakerId?: string; }; export type DeepgramWord = { word: string; start: number; end: number; confidence: number; speaker?: number; punctuated_word?: string; }; export type DeepgramAlternative = { transcript: string; confidence: number; words: DeepgramWord[]; paragraphs?: { transcript: string; paragraphs: Array<{ sentences: Array<{ text: string; start: number; end: number; }>; }>; }; }; export type DeepgramChannel = { alternatives: DeepgramAlternative[]; }; export type DeepgramUtterance = { start: number; end: number; confidence: number; channel: number; transcript: string; words: DeepgramWord[]; speaker?: number; id?: string; }; export type DeepgramResult = { channels: DeepgramChannel[]; utterances?: DeepgramUtterance[]; }; export type DeepgramResponse = { metadata: { request_id: string; transaction_key?: string; sha256?: string; created: string; duration: number; channels: number; models: string[]; model_info?: Record<string, { name: string; version: string; }>; }; results: DeepgramResult; }; export type GoogleWordInfo = { startTime: string; endTime: string; word: string; confidence?: number; speakerTag?: number; }; export type GoogleSpeechRecognitionAlternative = { transcript: string; confidence: number; words?: GoogleWordInfo[]; }; export type GoogleSpeechRecognitionResult = { alternatives: GoogleSpeechRecognitionAlternative[]; channelTag?: number; languageCode?: string; resultEndTime?: string; }; export type GoogleLongRunningRecognizeResponse = { results: GoogleSpeechRecognitionResult[]; totalBilledTime?: string; }; export type GoogleRecognizeResponse = { results?: GoogleSpeechRecognitionResult[]; totalBilledTime?: string; }; export type GoogleOperationResponse = { name: string; done: boolean; metadata?: { progressPercent?: number; startTime?: string; lastUpdateTime?: string; }; response?: GoogleLongRunningRecognizeResponse; error?: { code: number; message: string; }; }; export type GoogleRecognitionConfig = { encoding: string; sampleRateHertz?: number; languageCode: string; enableAutomaticPunctuation?: boolean; enableWordTimeOffsets?: boolean; enableWordConfidence?: boolean; model?: string; useEnhanced?: boolean; maxAlternatives?: number; profanityFilter?: boolean; enableSpeakerDiarization?: boolean; diarizationSpeakerCount?: number; }; export type GoogleRecognitionAudio = { content: string; }; export type WhisperTranscriptionWord = { word: string; start: number; end: number; }; export type WhisperTranscriptionSegment = { id: number; seek: number; start: number; end: number; text: string; tokens: number[]; temperature: number; avg_logprob: number; compression_ratio: number; no_speech_prob: number; }; export type WhisperVerboseResponse = { task: string; language: string; duration: number; text: string; segments?: WhisperTranscriptionSegment[]; words?: WhisperTranscriptionWord[]; }; export type WhisperSimpleResponse = { text: string; }; export type ElevenLabsVoice = { voice_id: string; name: string; category: string; labels?: { accent?: string; description?: string; age?: string; gender?: string; use_case?: string; }; preview_url?: string; }; export type ElevenLabsVoicesResponse = { voices: ElevenLabsVoice[]; }; export type AzureVoiceInfo = { Name: string; DisplayName: string; LocalName: string; ShortName: string; Gender: string; Locale: string; LocaleName: string; VoiceType: string; Status: string; WordsPerMinute?: string; }; export type GoogleAudioConfig = { audioEncoding: string; speakingRate?: number; pitch?: number; volumeGainDb?: number; sampleRateHertz?: number; effectsProfileId?: string[]; }; export type GoogleVoiceSelectionParams = { languageCode: string; name?: string; ssmlGender?: string; }; export type GoogleSynthesisInput = { text?: string; ssml?: string; }; export type GoogleSynthesizeRequest = { input: GoogleSynthesisInput; voice: GoogleVoiceSelectionParams; audioConfig: GoogleAudioConfig; }; export type GoogleVoiceInfo = { languageCodes: string[]; name: string; ssmlGender: string; naturalSampleRateHertz: number; }; export type GoogleListVoicesResponse = { voices: GoogleVoiceInfo[]; }; export type GoogleSynthesizeResponse = { audioContent: string; }; export type OpenAIRealtimeEvent = { type: string; event_id?: string; [key: string]: unknown; }; export type OpenAISessionCreated = OpenAIRealtimeEvent & { type: "session.created"; session: { id: string; object: string; model: string; modalities: string[]; voice: string; input_audio_format: string; output_audio_format: string; turn_detection: { type: string; threshold?: number; prefix_padding_ms?: number; silence_duration_ms?: number; }; tools: unknown[]; tool_choice: string; temperature: number; max_response_output_tokens: string | number; }; }; export type OpenAIAudioDelta = OpenAIRealtimeEvent & { type: "response.audio.delta"; response_id: string; item_id: string; output_index: number; content_index: number; delta: string; }; export type OpenAITranscriptDelta = OpenAIRealtimeEvent & { type: "response.audio_transcript.delta" | "conversation.item.input_audio_transcription.completed"; delta?: string; transcript?: string; }; export type GeminiMessage = { setup?: { model: string; generationConfig?: { responseModalities?: string[]; speechConfig?: { voiceConfig?: { prebuiltVoiceConfig?: { voiceName?: string; }; }; }; }; systemInstruction?: { parts: Array<{ text: string; }>; }; tools?: unknown[]; }; realtimeInput?: { mediaChunks: Array<{ mimeType: string; data: string; }>; }; clientContent?: { turns: Array<{ role: string; parts: Array<{ text: string; }>; }>; turnComplete: boolean; }; }; export type GeminiResponse = { setupComplete?: Record<string, unknown>; serverContent?: { modelTurn?: { parts: Array<{ text?: string; inlineData?: { mimeType: string; data: string; }; }>; }; turnComplete?: boolean; interrupted?: boolean; }; toolCall?: { functionCalls: Array<{ id: string; name: string; args: Record<string, unknown>; }>; }; toolCallCancellation?: { ids: string[]; }; };