UNPKG

@speechmatics/real-time-client

Version:
863 lines (795 loc) 39.9 kB
import { TypedEventTarget } from 'typescript-event-target'; type RawAudioEncodingEnum = 'pcm_f32le' | 'pcm_s16le' | 'mulaw'; /** * Raw audio samples, described by the following additional mandatory fields: */ interface Raw { type: 'raw'; encoding: RawAudioEncodingEnum; /** * The sample rate of the audio in Hz. */ sample_rate: number; } /** * Choose this option to send audio encoded in a recognized format. The AddAudio messages have to provide all the file contents, including any headers. The file is usually not accepted all at once, but segmented into reasonably sized messages. * * Note: Only the following formats are supported: `wav`, `mp3`, `aac`, `ogg`, `mpeg`, `amr`, `m4a`, `mp4`, `flac` */ interface FileType { type: 'file'; } /** * Pass an object to add a single word to the dictionary, with an array of words which it sounds like. */ interface AdditionalVocabObject { content: string; sounds_like?: string[]; } /** * Set to `speaker` to apply [Speaker Diarization](https://docs.speechmatics.com/speech-to-text/features/diarization) to the audio. */ type DiarizationConfig = 'none' | 'speaker' | 'channel' | 'channel_and_speaker'; /** * This allows some additional time for [Smart Formatting](https://docs.speechmatics.com/speech-to-text/formatting#smart-formatting). */ type MaxDelayModeConfig = 'flexible' | 'fixed'; interface SpeakersInputItem { /** * Speaker label, which must not match the format used internally (e.g. S1, S2, etc) */ label: string; speaker_identifiers: string[]; } interface SpeakerDiarizationConfig { /** * Configure the maximum number of speakers to detect. See [Max Speakers](http://docs.speechmatics.com/speech-to-text/features/diarization#max-speakers). */ max_speakers?: number; /** * When set to `true`, reduces the likelihood of incorrectly switching between similar sounding speakers. * See [Prefer Current Speaker](https://docs.speechmatics.com/speech-to-text/features/diarization#prefer-current-speaker). 
*/ prefer_current_speaker?: boolean; speaker_sensitivity?: number; /** * If true, speaker identifiers will be returned at the end of transcript. */ get_speakers?: boolean; /** * Use this option to provide speaker labels linked to their speaker identifiers. When passed, the transcription system will tag spoken words in the transcript with the provided speaker labels whenever any of the specified speakers is detected in the audio. A maximum of 50 speakers identifiers across all speakers can be provided. */ speakers?: SpeakersInputItem[]; } /** * Puts a lower limit on the volume of processed audio by using the `volume_threshold` setting. See [Audio Filtering](https://docs.speechmatics.com/speech-to-text/features/audio-filtering). */ interface AudioFilteringConfig { volume_threshold?: number; } interface WordReplacementItem { from: string; to: string; } interface TranscriptFilteringConfig { /** * When set to `true`, removes disfluencies from the transcript. See [Removing disfluencies](https://docs.speechmatics.com/speech-to-text/formatting#removing-disfluencies) */ remove_disfluencies?: boolean; /** * A list of replacement rules to apply to the transcript. Each rule consists of a pattern to match and a replacement string. See [Word replacement](https://docs.speechmatics.com/speech-to-text/formatting#word-replacement) */ replacements?: WordReplacementItem[]; } /** * Which model you wish to use. See [Operating points](http://docs.speechmatics.com/speech-to-text/#operating-points) for more details. */ type OperatingPoint = 'standard' | 'enhanced'; /** * Options for controlling punctuation in the output transcripts. See [Punctuation Settings](https://docs.speechmatics.com/speech-to-text/formatting#punctuation) */ interface PunctuationOverrides { /** * The punctuation marks which the client is prepared to accept in transcription output, or the special value 'all' (the default). Unsupported marks are ignored. This value is used to guide the transcription process. 
*/ permitted_marks?: string[]; /** * Ranges between zero and one. Higher values will produce more punctuation. The default is 0.5. */ sensitivity?: number; } /** * This mode will detect when a speaker has stopped talking. The `end_of_utterance_silence_trigger` is the time in seconds after which the server will assume that the speaker has finished speaking, and will emit an `EndOfUtterance` message. A value of 0 disables the feature. */ interface ConversationConfig { end_of_utterance_silence_trigger?: number; } /** * Contains configuration for this recognition session. */ interface TranscriptionConfig { /** * Language model to process the audio input, normally specified as an ISO language code. The value must be consistent with the language code used in the API endpoint URL. */ language: string; /** * Request a specialized model based on 'language' but optimized for a particular field, e.g. `finance` or `medical`. */ domain?: string; /** * Configure locale for outputted transcription. See [output formatting](https://docs.speechmatics.com/speech-to-text/formatting#output-locale). */ output_locale?: string; /** * Configure [custom dictionary](https://docs.speechmatics.com/speech-to-text/features/custom-dictionary). Default is an empty list. You should be aware that there is a performance penalty (latency degradation and memory increase) from using `additional_vocab`, especially if you use a large word list. When initializing a session that uses `additional_vocab` in the config, you should expect a delay of up to 15 seconds (depending on the size of the list). */ additional_vocab?: (string | AdditionalVocabObject)[]; /** * Set to `speaker` to apply [Speaker Diarization](https://docs.speechmatics.com/speech-to-text/features/diarization) to the audio. */ diarization?: DiarizationConfig; /** * This is the delay in seconds between the end of a spoken word and returning the Final transcript results. 
See [Latency](https://docs.speechmatics.com/speech-to-text/realtime/output#latency) for more details */ max_delay?: number; /** * This allows some additional time for [Smart Formatting](https://docs.speechmatics.com/speech-to-text/formatting#smart-formatting). */ max_delay_mode?: MaxDelayModeConfig; speaker_diarization_config?: SpeakerDiarizationConfig; /** * Puts a lower limit on the volume of processed audio by using the `volume_threshold` setting. See [Audio Filtering](https://docs.speechmatics.com/speech-to-text/features/audio-filtering). */ audio_filtering_config?: AudioFilteringConfig; transcript_filtering_config?: TranscriptFilteringConfig; /** * Whether or not to send Partials (i.e. `AddPartialTranscript` messages) as well as Finals (i.e. `AddTranscript` messages). * See [Partial transcripts](https://docs.speechmatics.com/speech-to-text/realtime/output#partial-transcripts). */ enable_partials?: boolean; enable_entities?: boolean; /** * Which model you wish to use. See [Operating points](http://docs.speechmatics.com/speech-to-text/#operating-points) for more details. */ operating_point?: OperatingPoint; /** * Options for controlling punctuation in the output transcripts. See [Punctuation Settings](https://docs.speechmatics.com/speech-to-text/formatting#punctuation) */ punctuation_overrides?: PunctuationOverrides; /** * This mode will detect when a speaker has stopped talking. The `end_of_utterance_silence_trigger` is the time in seconds after which the server will assume that the speaker has finished speaking, and will emit an `EndOfUtterance` message. A value of 0 disables the feature. */ conversation_config?: ConversationConfig; channel_diarization_labels?: string[]; } /** * Specifies various configuration values for translation. All fields except `target_languages` are optional, using default values when omitted. */ interface TranslationConfig { /** * List of languages to translate to from the source transcription `language`. 
Specified as an [ISO Language Code](https://docs.speechmatics.com/speech-to-text/languages). */ target_languages: string[]; /** * Whether or not to send Partials (i.e. `AddPartialTranslation` messages) as well as Finals (i.e. `AddTranslation` messages). */ enable_partials?: boolean; } /** * Contains configuration for [Audio Events](https://docs.speechmatics.com/speech-to-text/features/audio-events) */ interface AudioEventsConfig { /** * List of [Audio Event types](https://docs.speechmatics.com/speech-to-text/features/audio-events#supported-audio-events) to enable. */ types?: string[]; } interface StartRecognition { message: 'StartRecognition'; audio_format: Raw | FileType; /** * Contains configuration for this recognition session. */ transcription_config: TranscriptionConfig; /** * Specifies various configuration values for translation. All fields except `target_languages` are optional, using default values when omitted. */ translation_config?: TranslationConfig; /** * Contains configuration for [Audio Events](https://docs.speechmatics.com/speech-to-text/features/audio-events) */ audio_events_config?: AudioEventsConfig; } interface AddChannelAudio { message: 'AddChannelAudio'; /** * The channel identifier to which the audio belongs. */ channel: string; /** * The audio data in base64 format. */ data: string; } interface EndOfStream { message: 'EndOfStream'; last_seq_no: number; } interface EndOfChannel { message: 'EndOfChannel'; /** * The channel identifier to which the audio belongs. */ channel: string; last_seq_no: number; } interface ForceEndOfUtterance { message: 'ForceEndOfUtterance'; /** * The channel to request finalized transcript from. This field is only seen in multichannel. */ channel?: string; /** * Timestamp of the audio data that corresponds to the force end of utterance request. It's the number of seconds since the beginning of the audio. */ timestamp?: number; } /** * Contains configuration for this recognition session. 
*/ interface MidSessionTranscriptionConfig { /** * Language model to process the audio input, normally specified as an ISO language code. The value must be consistent with the language code used in the API endpoint URL. */ language?: string; /** * This is the delay in seconds between the end of a spoken word and returning the Final transcript results. See [Latency](https://docs.speechmatics.com/speech-to-text/realtime/output#latency) for more details */ max_delay?: number; /** * This allows some additional time for [Smart Formatting](https://docs.speechmatics.com/speech-to-text/formatting#smart-formatting). */ max_delay_mode?: MaxDelayModeConfig; /** * Puts a lower limit on the volume of processed audio by using the `volume_threshold` setting. See [Audio Filtering](https://docs.speechmatics.com/speech-to-text/features/audio-filtering). */ audio_filtering_config?: AudioFilteringConfig; /** * Whether or not to send Partials (i.e. `AddPartialTranscript` messages) as well as Finals (i.e. `AddTranscript` messages). * See [Partial transcripts](https://docs.speechmatics.com/speech-to-text/realtime/output#partial-transcripts). */ enable_partials?: boolean; /** * This mode will detect when a speaker has stopped talking. The `end_of_utterance_silence_trigger` is the time in seconds after which the server will assume that the speaker has finished speaking, and will emit an `EndOfUtterance` message. A value of 0 disables the feature. */ conversation_config?: ConversationConfig; } interface SetRecognitionConfig { message: 'SetRecognitionConfig'; /** * Contains configuration for this recognition session. */ transcription_config: MidSessionTranscriptionConfig; } interface GetSpeakers { message: 'GetSpeakers'; /** * Optional. This flag controls when speaker identifiers are returned. Defaults to false if omitted. * When false, multiple GetSpeakers requests can be sent during transcription, each returning the speaker identifiers generated so far. 
To reduce the chance of empty results, send requests after at least one TranscriptAdded message is received to make sure that the server has processed some audio. * When true, speaker identifiers are returned only once at the end of the transcription, regardless of how many final: true requests are sent. Even with final: true requests, you can still send final: false requests to receive intermediate speaker identifier updates. */ final?: boolean; } type RealtimeClientMessage = StartRecognition | AddChannelAudio | EndOfStream | EndOfChannel | ForceEndOfUtterance | SetRecognitionConfig | GetSpeakers; /** * The direction that words in the language should be written and read in. */ type WritingDirectionEnum = 'left-to-right' | 'right-to-left'; /** * Properties of the language pack. */ interface LanguagePackInfo { /** * Full descriptive name of the language, e.g. 'Japanese'. */ language_description?: string; /** * The character to use to separate words. */ word_delimiter: string; /** * The direction that words in the language should be written and read in. */ writing_direction?: WritingDirectionEnum; /** * Whether or not ITN (inverse text normalization) is available for the language pack. */ itn?: boolean; /** * Whether or not language model adaptation has been applied to the language pack. */ adapted?: boolean; } interface RecognitionStarted { message: 'RecognitionStarted'; orchestrator_version?: string; id?: string; /** * Properties of the language pack. */ language_pack_info?: LanguagePackInfo; channel_ids?: string[]; } interface AudioAdded { message: 'AudioAdded'; seq_no: number; } interface ChannelAudioAdded { message: 'ChannelAudioAdded'; seq_no: number; channel: string; } interface RecognitionMetadata { start_time: number; end_time: number; /** * The entire transcript contained in the segment in text format. 
Providing the entire transcript here is designed for ease of consumption; we have taken care of all the necessary formatting required to concatenate the transcription results into a block of text. * This transcript lacks the detailed information however which is contained in the `results` field of the message - such as the timings and confidences for each word. */ transcript: string; } type RecognitionResultTypeEnum = 'word' | 'punctuation' | 'entity'; type AttachesToEnum = 'next' | 'previous' | 'none' | 'both'; /** * Either `ltr` for words that should be displayed left-to-right, or `rtl` vice versa. */ type DirectionEnum = 'ltr' | 'rtl'; /** * Information about how the word/symbol should be displayed. */ interface RecognitionDisplay { /** * Either `ltr` for words that should be displayed left-to-right, or `rtl` vice versa. */ direction: DirectionEnum; } type RecognitionAlternativeTagsEnum = 'disfluency' | 'profanity'; interface RecognitionAlternative { /** * A word or punctuation mark. */ content: string; /** * A confidence score assigned to the alternative. Ranges from 0.0 (least confident) to 1.0 (most confident). */ confidence: number; /** * The language that the alternative word is assumed to be spoken in. Currently, this will always be equal to the language that was requested in the initial `StartRecognition` message. */ language?: string; /** * Information about how the word/symbol should be displayed. */ display?: RecognitionDisplay; /** * Label indicating who said that word. Only set if [diarization](https://docs.speechmatics.com/speech-to-text/features/diarization) is enabled. */ speaker?: string; /** * This is a set list of profanities and disfluencies respectively that cannot be altered by the end user. 
`[disfluency]` is only present in English, and `[profanity]` is present in English, Spanish, and Italian */ tags?: RecognitionAlternativeTagsEnum[]; } type SpokenFormRecognitionResultTypeEnum = 'word' | 'punctuation'; /** * A SpokenFormRecognitionResult describes a simple object which consists solely of 'word' or 'punctuation' type entries with a start and end time. It can occur only inside the spoken_form property of a full RecognitionResult */ interface SpokenFormRecognitionResult { alternatives: RecognitionAlternative[]; end_time: number; start_time: number; type: SpokenFormRecognitionResultTypeEnum; } type WrittenFormRecognitionResultTypeEnum = 'word'; /** * A WrittenFormRecognitionResult describes a simple object which consists solely of 'word' type entries with a start and end time. It can occur only inside the written_form property of a full RecognitionResult */ interface WrittenFormRecognitionResult { alternatives: RecognitionAlternative[]; end_time: number; start_time: number; type: WrittenFormRecognitionResultTypeEnum; } interface RecognitionResult { type: RecognitionResultTypeEnum; start_time: number; end_time: number; attaches_to?: AttachesToEnum; is_eos?: boolean; alternatives?: RecognitionAlternative[]; volume?: number; /** * For 'entity' results only, the class the entity has been formatted as. Examples: 'date', 'money', 'number' */ entity_class?: string; /** * For 'entity' results only, the spoken_form is the transcript of the individual words directly spoken. */ spoken_form?: SpokenFormRecognitionResult[]; /** * For 'entity' results only, the written_form is a standardized form of the spoken words. It contains the formatted entity split into individual words. */ written_form?: WrittenFormRecognitionResult[]; } interface AddPartialTranscript { message: 'AddPartialTranscript'; /** * Speechmatics JSON output format version number. 
*/ format?: string; metadata: RecognitionMetadata; results: RecognitionResult[]; /** * The channel identifier to which the audio belongs. This field is only seen in multichannel. */ channel?: string; } interface AddTranscript { message: 'AddTranscript'; /** * Speechmatics JSON output format version number. */ format?: string; metadata: RecognitionMetadata; results: RecognitionResult[]; /** * The channel identifier to which the audio belongs. This field is only seen in multichannel. */ channel?: string; } interface TranslatedSentence { content: string; /** * The start time (in seconds) of the original transcribed audio segment */ start_time: number; /** * The end time (in seconds) of the original transcribed audio segment */ end_time: number; /** * The speaker that uttered the speech if speaker diarization is enabled */ speaker?: string; } interface AddPartialTranslation { message: 'AddPartialTranslation'; /** * Speechmatics JSON output format version number. */ format?: string; /** * Language translation relates to given as an ISO language code. */ language: string; results: TranslatedSentence[]; } interface AddTranslation { message: 'AddTranslation'; /** * Speechmatics JSON output format version number. */ format?: string; /** * Language translation relates to given as an ISO language code. */ language: string; results: TranslatedSentence[]; } interface EndOfTranscript { message: 'EndOfTranscript'; } interface AudioEventStartData { /** * The type of audio event that has started or ended. See our list of [supported Audio Event types](https://docs.speechmatics.com/speech-to-text/features/audio-events#supported-audio-events). */ type: string; /** * The time (in seconds) of the audio corresponding to the beginning of the audio event. */ start_time: number; /** * A confidence score assigned to the audio event. Ranges from 0.0 (least confident) to 1.0 (most confident). 
*/ confidence: number; } interface AudioEventStarted { message: 'AudioEventStarted'; event: AudioEventStartData; /** * The channel identifier to which the audio belongs. This field is only seen in multichannel. */ channel?: string; } interface AudioEventEndData { /** * The type of audio event that has started or ended. See our list of [supported Audio Event types](https://docs.speechmatics.com/speech-to-text/features/audio-events#supported-audio-events). */ type: string; end_time: number; } interface AudioEventEnded { message: 'AudioEventEnded'; event: AudioEventEndData; /** * The channel identifier to which the audio belongs. This field is only seen in multichannel. */ channel?: string; } interface EndOfUtteranceMetadata { /** * The time (in seconds) that the end of utterance was detected. */ start_time?: number; /** * The time (in seconds) that the end of utterance was detected. */ end_time?: number; } interface EndOfUtterance { message: 'EndOfUtterance'; metadata: EndOfUtteranceMetadata; /** * The channel identifier to which the EndOfUtterance message belongs. This field is only seen in multichannel. */ channel?: string; } /** * The following are the possible info types: * * | Info Type | Description | * | --- | --- | * | `recognition_quality` | Informs the client what particular quality-based model is used to handle the recognition. Sent to the client immediately after the WebSocket handshake is completed.| * | `concurrent_session_usage` | Informs the client of their quota for concurrent sessions and how much of it they are using. Sent to the client immediately after the WebSocket handshake is completed.| */ type InfoTypeEnum = 'recognition_quality' | 'concurrent_session_usage'; interface Info { message: 'Info'; /** * The following are the possible info types: * * | Info Type | Description | * | --- | --- | * | `recognition_quality` | Informs the client what particular quality-based model is used to handle the recognition. 
Sent to the client immediately after the WebSocket handshake is completed.| * | `concurrent_session_usage` | Informs the client of their quota for concurrent sessions and how much of it they are using. Sent to the client immediately after the WebSocket handshake is completed.| */ type: InfoTypeEnum; reason: string; code?: number; seq_no?: number; /** * Only set when `type` is `recognition_quality`. Quality-based model name. It is one of "telephony", "broadcast". The model is selected automatically, for high-quality audio (12kHz+) the broadcast model is used, for lower quality audio the telephony model is used. */ quality?: string; /** * Only set when `type` is `concurrent_session_usage`. Indicates the current usage (number of active concurrent sessions). */ usage?: number; /** * Only set when `type` is `concurrent_session_usage`. Indicates the current quota (maximum number of concurrent sessions allowed). */ quota?: number; /** * Only set when `type` is `concurrent_session_usage`. Indicates the timestamp of the most recent usage update, in the format `YYYY-MM-DDTHH:MM:SSZ` (UTC). This value is updated even when usage exceeds the quota, as it represents the most recent known data. In some cases, it may be empty or outdated due to internal errors preventing successful update. */ last_updated?: string; } /** * The following are the possible warning types: * * | Warning Type | Description | * | --- | --- | * | `duration_limit_exceeded` | The maximum allowed duration of a single utterance to process has been exceeded. Any `AddAudio` messages received that exceed this limit are confirmed with `AudioAdded`, but are ignored by the transcription engine. Exceeding the limit triggers the same mechanism as receiving an `EndOfStream` message, so the Server will eventually send an `EndOfTranscript` message and suspend. * | `unsupported_translation_pair` | One of the requested translation target languages is unsupported (given the source audio language). 
The error message specifies the unsupported language pair. * | `idle_timeout` | Informs that the session is approaching the idle duration limit (no audio data sent within the last hour), with a `reason` of the form: <p>`Session will timeout in {time_remaining}m due to inactivity, no audio sent within the last {time_elapsed}m`</p> Currently the server will send messages at 15, 10 and 5m prior to timeout, and will send a final error message on timeout, before closing the connection with the code 1008. (see [Realtime limits](https://docs.speechmatics.com/speech-to-text/realtime/limits) for more information). * | `session_timeout` | Informs that the session is approaching the max session duration limit (maximum session duration of 48 hours), with a `reason` of the form: <p>`Session will timeout in {time_remaining}m due to max duration, session has been active for {time_elapsed}m`</p> Currently the server will send messages at 45, 30 and 15m prior to timeout, and will send a final error message on timeout, before closing the connection with the code 1008. (see [Realtime limits](https://docs.speechmatics.com/speech-to-text/realtime/limits) for more information).| * | `empty_translation_target_list` | No supported translation target languages specified. Translation will not run. * | `add_audio_after_eos` | Protocol specification doesn't allow adding audio after `EndOfStream` has been received. Any `AddAudio` messages after this will be ignored. * | `speaker_id` | Informs the client about any speaker ID related issues. | */ type WarningTypeEnum = 'duration_limit_exceeded' | 'unsupported_translation_pair' | 'idle_timeout' | 'session_timeout' | 'empty_translation_target_list' | 'add_audio_after_eos' | 'speaker_id'; interface Warning { message: 'Warning'; /** * The following are the possible warning types: * * | Warning Type | Description | * | --- | --- | * | `duration_limit_exceeded` | The maximum allowed duration of a single utterance to process has been exceeded. 
Any `AddAudio` messages received that exceed this limit are confirmed with `AudioAdded`, but are ignored by the transcription engine. Exceeding the limit triggers the same mechanism as receiving an `EndOfStream` message, so the Server will eventually send an `EndOfTranscript` message and suspend. * | `unsupported_translation_pair` | One of the requested translation target languages is unsupported (given the source audio language). The error message specifies the unsupported language pair. * | `idle_timeout` | Informs that the session is approaching the idle duration limit (no audio data sent within the last hour), with a `reason` of the form: <p>`Session will timeout in {time_remaining}m due to inactivity, no audio sent within the last {time_elapsed}m`</p> Currently the server will send messages at 15, 10 and 5m prior to timeout, and will send a final error message on timeout, before closing the connection with the code 1008. (see [Realtime limits](https://docs.speechmatics.com/speech-to-text/realtime/limits) for more information). * | `session_timeout` | Informs that the session is approaching the max session duration limit (maximum session duration of 48 hours), with a `reason` of the form: <p>`Session will timeout in {time_remaining}m due to max duration, session has been active for {time_elapsed}m`</p> Currently the server will send messages at 45, 30 and 15m prior to timeout, and will send a final error message on timeout, before closing the connection with the code 1008. (see [Realtime limits](https://docs.speechmatics.com/speech-to-text/realtime/limits) for more information).| * | `empty_translation_target_list` | No supported translation target languages specified. Translation will not run. * | `add_audio_after_eos` | Protocol specification doesn't allow adding audio after `EndOfStream` has been received. Any `AddAudio` messages after this will be ignored. * | `speaker_id` | Informs the client about any speaker ID related issues. 
| */ type: WarningTypeEnum; reason: string; code?: number; seq_no?: number; /** * Only set when `type` is `duration_limit_exceeded`. Indicates the limit that was exceeded (in seconds). */ duration_limit?: number; } /** * The following are the possible error types: * * | Error Type | Description | * | --- | --- | * | `invalid_message` | The message received was not understood. | * | `invalid_model` | Unable to use the model for the recognition. This can happen if the language is not supported at all, or is not available for the user. | * | `invalid_language` | The requested language is not valid or is not supported. | * | `invalid_config` | The config received contains some wrong or unsupported fields, or too many translation target languages were requested. | * | `invalid_audio_type` | Audio type is not supported, is deprecated, or the `audio_type` is malformed. | * | `invalid_output_format` | Output format is not supported, is deprecated, or the `output_format` is malformed. | * | `not_authorised` | User was not recognised, or the API key provided is not valid. | * | `not_allowed` | User is not allowed to use this message (is not allowed to perform the action the message would invoke). | * | `job_error` | Unable to do any work on this job, the server might have timed out etc. | * | `protocol_error` | Message received was syntactically correct, but could not be accepted due to protocol limitations. This is usually caused by messages sent in the wrong order. | * | `quota_exceeded` | Maximum number of concurrent connections allowed for the contract has been reached | * | `timelimit_exceeded` | Usage quota for the contract has been reached | * | `idle_timeout` | Idle duration limit was reached (no audio data sent within the last hour), a closing handshake with code 1008 follows this in-band error. | * | `session_timeout` | Max session duration was reached (maximum session duration of 48 hours), a closing handshake with code 1008 follows this in-band error. 
| * | `unknown_error` | An error that did not fit any of the types above. | * * :::info * * `invalid_message`, `protocol_error` and `unknown_error` can be triggered as a response to any type of messages. * * ::: */
type ErrorTypeEnum = 'invalid_message' | 'invalid_model' | 'invalid_language' | 'invalid_config' | 'invalid_audio_type' | 'invalid_output_format' | 'not_authorised' | 'not_allowed' | 'job_error' | 'protocol_error' | 'quota_exceeded' | 'timelimit_exceeded' | 'idle_timeout' | 'session_timeout' | 'unknown_error';
/**
 * Server message tagged `message: 'Error'`; one member of the
 * `RealtimeServerMessage` union.
 */
interface ErrorType {
  message: 'Error';
  /**
   * The following are the possible error types:
   *
   * | Error Type | Description |
   * | --- | --- |
   * | `invalid_message` | The message received was not understood. |
   * | `invalid_model` | Unable to use the model for the recognition. This can happen if the language is not supported at all, or is not available for the user. |
   * | `invalid_language` | The requested language is not valid or is not supported. |
   * | `invalid_config` | The config received contains some wrong or unsupported fields, or too many translation target languages were requested. |
   * | `invalid_audio_type` | Audio type is not supported, is deprecated, or the `audio_type` is malformed. |
   * | `invalid_output_format` | Output format is not supported, is deprecated, or the `output_format` is malformed. |
   * | `not_authorised` | User was not recognised, or the API key provided is not valid. |
   * | `not_allowed` | User is not allowed to use this message (is not allowed to perform the action the message would invoke). |
   * | `job_error` | Unable to do any work on this job, the server might have timed out etc. |
   * | `protocol_error` | Message received was syntactically correct, but could not be accepted due to protocol limitations. This is usually caused by messages sent in the wrong order. |
   * | `quota_exceeded` | Maximum number of concurrent connections allowed for the contract has been reached. |
   * | `timelimit_exceeded` | Usage quota for the contract has been reached. |
   * | `idle_timeout` | Idle duration limit was reached (no audio data sent within the last hour), a closing handshake with code 1008 follows this in-band error. |
   * | `session_timeout` | Max session duration was reached (maximum session duration of 48 hours), a closing handshake with code 1008 follows this in-band error. |
   * | `unknown_error` | An error that did not fit any of the types above. |
   *
   * :::info
   *
   * `invalid_message`, `protocol_error` and `unknown_error` can be triggered as a response to any type of messages.
   *
   * :::
   */
  type: ErrorTypeEnum;
  reason: string;
  code?: number;
  seq_no?: number;
}
/**
 * One entry of `SpeakersResult.speakers`.
 */
interface SpeakersResultItem {
  /**
   * Speaker label.
   */
  label: string;
  speaker_identifiers: string[];
}
/**
 * Server message tagged `message: 'SpeakersResult'`. This is the type that
 * `RealtimeClient.getSpeakers()` resolves to.
 */
interface SpeakersResult {
  message: 'SpeakersResult';
  speakers: SpeakersResultItem[];
}
/**
 * Union of all server-to-client message types declared in this file.
 * It is the payload type of `ReceiveMessageEvent.data`.
 */
type RealtimeServerMessage = RecognitionStarted | AudioAdded | ChannelAudioAdded | AddPartialTranscript | AddTranscript | AddPartialTranslation | AddTranslation | EndOfTranscript | AudioEventStarted | AudioEventEnded | EndOfUtterance | Info | Warning | ErrorType | SpeakersResult;
/**
 * Event registered under `socketStateChange` in `RealtimeClientEventMap`.
 * `socketState` has the same type as `RealtimeClient['socketState']`.
 */
declare class SocketStateChangeEvent extends Event {
  readonly socketState: RealtimeClient['socketState'];
  constructor(socketState: RealtimeClient['socketState']);
}
/**
 * Event registered under `receiveMessage` in `RealtimeClientEventMap`.
 * `data` holds a `RealtimeServerMessage`.
 */
declare class ReceiveMessageEvent extends Event {
  readonly data: RealtimeServerMessage;
  constructor(data: RealtimeServerMessage);
}
/**
 * Event registered under `sendMessage` in `RealtimeClientEventMap`.
 * `data` holds a `RealtimeClientMessage`.
 */
declare class SendMessageEvent extends Event {
  readonly data: RealtimeClientMessage;
  constructor(data: RealtimeClientMessage);
}
/**
 * Maps each event name to its event class. `RealtimeClient` passes this map
 * as the type argument of `TypedEventTarget`.
 */
interface RealtimeClientEventMap {
  sendMessage: SendMessageEvent;
  receiveMessage: ReceiveMessageEvent;
  socketStateChange: SocketStateChangeEvent;
}
/**
 * Whatever `WebSocket.send` accepts as its first argument; the parameter type
 * of `RealtimeClient.sendAudio`.
 */
type AddAudio = Parameters<WebSocket['send']>[0];
/**
 * Options accepted by the `RealtimeClient` constructor. Every field is optional.
 */
interface RealtimeClientOptions {
  /**
   * URL of the Speechmatics Realtime API. Defaults to `wss://eu2.rt.speechmatics.com/v2`.
   * See the available endpoints here:
   * https://docs.speechmatics.com/introduction/authentication#supported-endpoints
   */
  url?: string;
  /**
   * String identifying your app to the Speechmatics API. Can be any unique ID.
   */
  appId?: string;
  /**
   * Optionally enable legacy mode for the Realtime API. This opts out of incremental rescoring.
   * Only set this if you're sure you need it.
   */
  enableLegacy?: boolean;
  /**
   * Optionally specify the timeout (in milliseconds) to wait before throwing an error on starting and stopping.
   * For example, a value of 10_000 will throw an error if it takes more than 10 seconds to receive acknowledgement from the server after calling `start()` or `stopRecognition()`.
   * Default value is 10_000 (10 seconds).
   */
  connectionTimeout?: number;
}
/**
 * `StartRecognition` without its `message` field and with `audio_format`
 * made optional. This is the `config` argument of `RealtimeClient.start()`.
 */
type RealtimeTranscriptionConfig = Omit<StartRecognition, 'message' | 'audio_format'> & Partial<Pick<StartRecognition, 'audio_format'>>;
/**
 * Client for the Speechmatics Realtime API.
 *
 * Extends `TypedEventTarget` typed with `RealtimeClientEventMap`, so the
 * event names it is typed for are `sendMessage`, `receiveMessage` and
 * `socketStateChange`.
 */
declare class RealtimeClient extends TypedEventTarget<RealtimeClientEventMap> {
  readonly url: string;
  private readonly appId?;
  private readonly enableLegacy;
  // NOTE(review): presumably initialised from `RealtimeClientOptions.connectionTimeout` — confirm in the implementation.
  timeout: number;
  constructor(config?: RealtimeClientOptions);
  private socket?;
  // NOTE(review): `undefined` presumably means no socket has been created yet (`socket` is optional) — confirm.
  get socketState(): "connecting" | "open" | "closing" | "closed" | undefined;
  private lastAudioAddedSeqNo;
  private connect;
  private sendMessage;
  /** Takes audio data in any form that `WebSocket.send` accepts (see `AddAudio`). */
  sendAudio(data: AddAudio): void;
  /**
   * Resolves with a `SpeakersResult` message.
   * NOTE(review): the meaning and defaults of `final` and `timeout` are not visible in this declaration — check the implementation.
   */
  getSpeakers(options?: {
    final?: boolean;
    timeout?: number;
  }): Promise<SpeakersResult>;
  /** Resolves with a `RecognitionStarted` message. */
  start(jwt: string, config: RealtimeTranscriptionConfig): Promise<RecognitionStarted>;
  /** Sends an `"EndOfStream"` message, resolving if acknowledged by an `"EndOfTranscript"` from server, rejecting if not received */
  stopRecognition({ noTimeout }?: {
    noTimeout?: true;
  }): Promise<unknown>;
  setRecognitionConfig(config: MidSessionTranscriptionConfig): void;
  forceEndOfUtterance(channel?: string): void;
}
/**
 * `Error` subclass exported by this package. Its constructor takes the same
 * `(message, options)` pair as `Error`.
 */
declare class SpeechmaticsRealtimeError extends Error {
  constructor(message: string, options?: ErrorOptions);
}
/**
 * Resolves with a `FeatureResponse`.
 * NOTE(review): the region used when `region` is omitted is not visible in this declaration.
 */
declare function getFeatures(region?: Region): Promise<FeatureResponse>;
/** Region identifiers accepted by `getFeatures`. */
type Region = 'eu2' | 'neu' | 'wus';
/**
 * Shape of the value that `getFeatures` resolves to.
 */
interface FeatureResponse {
  metadata: {
    language_pack_info: Record<string, {
      language_description: string;
      locales?: Record<string, {
        name: string;
      }>;
    }>;
  };
  realtime: {
    // Typed as a single-element tuple whose only entry has `version: 'latest'`.
    transcription: [
      {
        version: 'latest';
        languages: string[];
        locales: Record<string, string[]>;
        domains: Record<string, string[]>;
      }
    ];
    // Typed as a single-element tuple whose only entry has `version: 'latest'`.
    translation: [
      {
        version: 'latest';
        languages: Record<string, string[]>;
      }
    ];
  };
}
// Public surface of the module. Entries marked `type` are type-only exports;
// the unmarked ones (RealtimeClient, ReceiveMessageEvent, SendMessageEvent,
// SocketStateChangeEvent, SpeechmaticsRealtimeError, getFeatures) are values.
export {
  type AddAudio,
  type AddChannelAudio,
  type AddPartialTranscript,
  type AddPartialTranslation,
  type AddTranscript,
  type AddTranslation,
  type AdditionalVocabObject,
  type AttachesToEnum,
  type AudioAdded,
  type AudioEventEndData,
  type AudioEventEnded,
  type AudioEventStartData,
  type AudioEventStarted,
  type AudioEventsConfig,
  type AudioFilteringConfig,
  type ChannelAudioAdded,
  type ConversationConfig,
  type DiarizationConfig,
  type DirectionEnum,
  type EndOfChannel,
  type EndOfStream,
  type EndOfTranscript,
  type EndOfUtterance,
  type EndOfUtteranceMetadata,
  type ErrorType,
  type ErrorTypeEnum,
  type FeatureResponse,
  type FileType,
  type ForceEndOfUtterance,
  type GetSpeakers,
  type Info,
  type InfoTypeEnum,
  type LanguagePackInfo,
  type MaxDelayModeConfig,
  type MidSessionTranscriptionConfig,
  type OperatingPoint,
  type PunctuationOverrides,
  type Raw,
  type RawAudioEncodingEnum,
  RealtimeClient,
  type RealtimeClientEventMap,
  type RealtimeClientMessage,
  type RealtimeClientOptions,
  type RealtimeServerMessage,
  type RealtimeTranscriptionConfig,
  ReceiveMessageEvent,
  type RecognitionAlternative,
  type RecognitionAlternativeTagsEnum,
  type RecognitionDisplay,
  type RecognitionMetadata,
  type RecognitionResult,
  type RecognitionResultTypeEnum,
  type RecognitionStarted,
  SendMessageEvent,
  type SetRecognitionConfig,
  SocketStateChangeEvent,
  type SpeakerDiarizationConfig,
  type SpeakersInputItem,
  type SpeakersResult,
  type SpeakersResultItem,
  SpeechmaticsRealtimeError,
  type SpokenFormRecognitionResult,
  type SpokenFormRecognitionResultTypeEnum,
  type StartRecognition,
  type TranscriptFilteringConfig,
  type TranscriptionConfig,
  type TranslatedSentence,
  type TranslationConfig,
  type Warning,
  type WarningTypeEnum,
  type WordReplacementItem,
  type WritingDirectionEnum,
  type WrittenFormRecognitionResult,
  type WrittenFormRecognitionResultTypeEnum,
  getFeatures
};