ibm-watson
Client library to use the IBM Watson Services
"use strict";
/**
* (C) Copyright IBM Corp. 2024.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var __extends = (this && this.__extends) || (function () {
var extendStatics = function (d, b) {
extendStatics = Object.setPrototypeOf ||
({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||
function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };
return extendStatics(d, b);
};
return function (d, b) {
if (typeof b !== "function" && b !== null)
throw new TypeError("Class extends value " + String(b) + " is not a constructor or null");
extendStatics(d, b);
function __() { this.constructor = d; }
d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
};
})();
var __assign = (this && this.__assign) || function () {
__assign = Object.assign || function(t) {
for (var s, i = 1, n = arguments.length; i < n; i++) {
s = arguments[i];
for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
t[p] = s[p];
}
return t;
};
return __assign.apply(this, arguments);
};
/**
* IBM OpenAPI SDK Code Generator Version: 3.96.1-5136e54a-20241108-203028
*/
var extend = require("extend");
var ibm_cloud_sdk_core_1 = require("ibm-cloud-sdk-core");
var common_1 = require("../lib/common");
/**
* The IBM Watson™ Speech to Text service provides APIs that use IBM's speech-recognition capabilities to produce
* transcripts of spoken audio. The service can transcribe speech from various languages and audio formats. In addition
* to basic transcription, the service can produce detailed information about many different aspects of the audio. It
* returns all JSON response content in the UTF-8 character set.
*
* The service supports two types of models: previous-generation models that include the terms `Broadband` and
* `Narrowband` in their names, and next-generation models that include the terms `Multimedia` and `Telephony` in their
* names. Broadband and multimedia models have minimum sampling rates of 16 kHz. Narrowband and telephony models have
* minimum sampling rates of 8 kHz. The next-generation models offer high throughput and greater transcription accuracy.
*
*
* Effective **31 July 2023**, all previous-generation models will be removed from the service and the documentation.
* Most previous-generation models were deprecated on 15 March 2022. You must migrate to the equivalent large speech
* model or next-generation model by 31 July 2023. For more information, see [Migrating to large speech
 * models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-migrate).
*
* For speech recognition, the service supports synchronous and asynchronous HTTP Representational State Transfer (REST)
* interfaces. It also supports a WebSocket interface that provides a full-duplex, low-latency communication channel:
* Clients send requests and audio to the service and receive results over a single connection asynchronously.
*
* The service also offers two customization interfaces. Use language model customization to expand the vocabulary of a
* base model with domain-specific terminology. Use acoustic model customization to adapt a base model for the acoustic
* characteristics of your audio. For language model customization, the service also supports grammars. A grammar is a
* formal language specification that lets you restrict the phrases that the service can recognize.
*
* Language model customization and grammars are available for most previous- and next-generation models. Acoustic model
* customization is available for all previous-generation models.
*
* API Version: 1.0.0
* See: https://cloud.ibm.com/docs/speech-to-text
*/
var SpeechToTextV1 = /** @class */ (function (_super) {
__extends(SpeechToTextV1, _super);
/**
* Construct a SpeechToTextV1 object.
*
* @param {Object} options - Options for the service.
* @param {string} [options.serviceUrl] - The base URL for the service
* @param {OutgoingHttpHeaders} [options.headers] - Default headers that shall be included with every request to the service.
* @param {string} [options.serviceName] - The name of the service to configure
* @param {Authenticator} [options.authenticator] - The Authenticator object used to authenticate requests to the service. Defaults to environment if not set
* @constructor
* @returns {SpeechToTextV1}
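 *
 * @example
 * // A minimal construction sketch (not part of the generated source). It assumes IAM
 * // authentication with the `IamAuthenticator` exported from 'ibm-watson/auth'; replace the
 * // apikey and serviceUrl placeholders with the values for your own service instance.
 * const SpeechToTextV1 = require('ibm-watson/speech-to-text/v1');
 * const { IamAuthenticator } = require('ibm-watson/auth');
 *
 * const speechToText = new SpeechToTextV1({
 *   authenticator: new IamAuthenticator({ apikey: '<your-apikey>' }),
 *   serviceUrl: '<your-service-url>',
 * });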
*/
function SpeechToTextV1(options) {
var _this = this;
options = options || {};
if (!options.serviceName) {
options.serviceName = SpeechToTextV1.DEFAULT_SERVICE_NAME;
}
// If the caller didn't supply an authenticator, construct one from external configuration.
if (!options.authenticator) {
options.authenticator = (0, ibm_cloud_sdk_core_1.getAuthenticatorFromEnvironment)(options.serviceName);
}
_this = _super.call(this, options) || this;
_this.configureService(options.serviceName);
if (options.serviceUrl) {
_this.setServiceUrl(options.serviceUrl);
}
return _this;
}
/*************************
* models
************************/
/**
* List models.
*
* Lists all language models that are available for use with the service. The information includes the name of the
* model and its minimum sampling rate in Hertz, among other things. The ordering of the list of models can change
* from call to call; do not rely on an alphabetized or static list of models.
*
* **See also:** [Listing all
* models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-list#models-list-all).
*
* @param {Object} [params] - The parameters to send to the service.
* @param {OutgoingHttpHeaders} [params.headers] - Custom request headers
* @returns {Promise<SpeechToTextV1.Response<SpeechToTextV1.SpeechModels>>}
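 *
 * @example
 * // Usage sketch (not part of the generated source); assumes a `speechToText` client
 * // constructed as shown in the constructor example above.
 * speechToText.listModels()
 *   .then(response => console.log(JSON.stringify(response.result, null, 2)))
 *   .catch(err => console.log('error:', err));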
*/
SpeechToTextV1.prototype.listModels = function (params) {
var _params = __assign({}, params);
var _requiredParams = [];
var _validParams = ['headers'];
var _validationErrors = (0, ibm_cloud_sdk_core_1.validateParams)(_params, _requiredParams, _validParams);
if (_validationErrors) {
return Promise.reject(_validationErrors);
}
var sdkHeaders = (0, common_1.getSdkHeaders)(SpeechToTextV1.DEFAULT_SERVICE_NAME, 'v1', 'listModels');
var parameters = {
options: {
url: '/v1/models',
method: 'GET',
},
defaultOptions: extend(true, {}, this.baseOptions, {
headers: extend(true, sdkHeaders, {
'Accept': 'application/json',
}, _params.headers),
}),
};
return this.createRequest(parameters);
};
/**
* Get a model.
*
* Gets information for a single specified language model that is available for use with the service. The information
* includes the name of the model and its minimum sampling rate in Hertz, among other things.
*
* **See also:** [Listing a specific
* model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-list#models-list-specific).
*
* @param {Object} params - The parameters to send to the service.
* @param {string} params.modelId - The identifier of the model in the form of its name from the output of the [List
* models](#listmodels) method.
* @param {OutgoingHttpHeaders} [params.headers] - Custom request headers
* @returns {Promise<SpeechToTextV1.Response<SpeechToTextV1.SpeechModel>>}
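 *
 * @example
 * // Usage sketch (not part of the generated source); assumes a configured `speechToText`
 * // client. 'en-US_Multimedia' is only an illustrative model name.
 * speechToText.getModel({ modelId: 'en-US_Multimedia' })
 *   .then(response => console.log(JSON.stringify(response.result, null, 2)))
 *   .catch(err => console.log('error:', err));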
*/
SpeechToTextV1.prototype.getModel = function (params) {
var _params = __assign({}, params);
var _requiredParams = ['modelId'];
var _validParams = ['modelId', 'headers'];
var _validationErrors = (0, ibm_cloud_sdk_core_1.validateParams)(_params, _requiredParams, _validParams);
if (_validationErrors) {
return Promise.reject(_validationErrors);
}
var path = {
'model_id': _params.modelId,
};
var sdkHeaders = (0, common_1.getSdkHeaders)(SpeechToTextV1.DEFAULT_SERVICE_NAME, 'v1', 'getModel');
var parameters = {
options: {
url: '/v1/models/{model_id}',
method: 'GET',
path: path,
},
defaultOptions: extend(true, {}, this.baseOptions, {
headers: extend(true, sdkHeaders, {
'Accept': 'application/json',
}, _params.headers),
}),
};
return this.createRequest(parameters);
};
/*************************
* synchronous
************************/
/**
* Recognize audio.
*
* Sends audio and returns transcription results for a recognition request. You can pass a maximum of 100 MB and a
* minimum of 100 bytes of audio with a request. The service automatically detects the endianness of the incoming
* audio and, for audio that includes multiple channels, downmixes the audio to one-channel mono during transcoding.
* The method returns only final results; to enable interim results, use the WebSocket API. (With the `curl` command,
* use the `--data-binary` option to upload the file for the request.)
*
* **See also:** [Making a basic HTTP
* request](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-http#HTTP-basic).
*
* ### Streaming mode
*
* For requests to transcribe live audio as it becomes available, you must set the `Transfer-Encoding` header to
* `chunked` to use streaming mode. In streaming mode, the service closes the connection (status code 408) if it does
* not receive at least 15 seconds of audio (including silence) in any 30-second period. The service also closes the
* connection (status code 400) if it detects no speech for `inactivity_timeout` seconds of streaming audio; use the
* `inactivity_timeout` parameter to change the default of 30 seconds.
*
* **See also:**
* * [Audio transmission](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#transmission)
* * [Timeouts](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#timeouts)
*
* ### Audio formats (content types)
*
* The service accepts audio in the following formats (MIME types).
* * For formats that are labeled **Required**, you must use the `Content-Type` header with the request to specify the
* format of the audio.
* * For all other formats, you can omit the `Content-Type` header or specify `application/octet-stream` with the
* header to have the service automatically detect the format of the audio. (With the `curl` command, you can specify
* either `"Content-Type:"` or `"Content-Type: application/octet-stream"`.)
*
* Where indicated, the format that you specify must include the sampling rate and can optionally include the number
* of channels and the endianness of the audio.
* * `audio/alaw` (**Required.** Specify the sampling rate (`rate`) of the audio.)
* * `audio/basic` (**Required.** Use only with narrowband models.)
* * `audio/flac`
* * `audio/g729` (Use only with narrowband models.)
* * `audio/l16` (**Required.** Specify the sampling rate (`rate`) and optionally the number of channels (`channels`)
* and endianness (`endianness`) of the audio.)
* * `audio/mp3`
* * `audio/mpeg`
* * `audio/mulaw` (**Required.** Specify the sampling rate (`rate`) of the audio.)
* * `audio/ogg` (The service automatically detects the codec of the input audio.)
* * `audio/ogg;codecs=opus`
* * `audio/ogg;codecs=vorbis`
* * `audio/wav` (Provide audio with a maximum of nine channels.)
* * `audio/webm` (The service automatically detects the codec of the input audio.)
* * `audio/webm;codecs=opus`
* * `audio/webm;codecs=vorbis`
*
* The sampling rate of the audio must match the sampling rate of the model for the recognition request: for broadband
* models, at least 16 kHz; for narrowband models, at least 8 kHz. If the sampling rate of the audio is higher than
* the minimum required rate, the service down-samples the audio to the appropriate rate. If the sampling rate of the
* audio is lower than the minimum required rate, the request fails.
*
* **See also:** [Supported audio
* formats](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-audio-formats).
*
 * ### Large speech models and next-generation models
*
* The service supports large speech models and next-generation `Multimedia` (16 kHz) and `Telephony` (8 kHz) models
* for many languages. Large speech models and next-generation models have higher throughput than the service's
* previous generation of `Broadband` and `Narrowband` models. When you use large speech models and next-generation
* models, the service can return transcriptions more quickly and also provide noticeably better transcription
* accuracy.
*
 * You specify a large speech model or next-generation model by using the `model` query parameter, as you do for a
 * previous-generation model. Only the next-generation models support the `low_latency` parameter, and all large
* speech models and next-generation models support the `character_insertion_bias` parameter. These parameters are not
* available with previous-generation models.
*
* Large speech models and next-generation models do not support all of the speech recognition parameters that are
* available for use with previous-generation models. Next-generation models do not support the following parameters:
* * `acoustic_customization_id`
* * `keywords` and `keywords_threshold`
* * `processing_metrics` and `processing_metrics_interval`
* * `word_alternatives_threshold`
*
* **Important:** Effective **31 July 2023**, all previous-generation models will be removed from the service and the
* documentation. Most previous-generation models were deprecated on 15 March 2022. You must migrate to the equivalent
* large speech model or next-generation model by 31 July 2023. For more information, see [Migrating to large speech
* models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-migrate).
*
* **See also:**
* * [Large speech languages and
* models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-large-speech-languages)
* * [Supported features for large speech
* models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-large-speech-languages#models-lsm-supported-features)
* * [Next-generation languages and models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng)
* * [Supported features for next-generation
* models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-features)
*
* ### Multipart speech recognition
*
* **Note:** The asynchronous HTTP interface, WebSocket interface, and Watson SDKs do not support multipart speech
* recognition.
*
* The HTTP `POST` method of the service also supports multipart speech recognition. With multipart requests, you pass
* all audio data as multipart form data. You specify some parameters as request headers and query parameters, but you
* pass JSON metadata as form data to control most aspects of the transcription. You can use multipart recognition to
* pass multiple audio files with a single request.
*
* Use the multipart approach with browsers for which JavaScript is disabled or when the parameters used with the
* request are greater than the 8 KB limit imposed by most HTTP servers and proxies. You can encounter this limit, for
* example, if you want to spot a very large number of keywords.
*
* **See also:** [Making a multipart HTTP
* request](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-http#HTTP-multi).
*
* @param {Object} params - The parameters to send to the service.
* @param {NodeJS.ReadableStream | Buffer} params.audio - The audio to transcribe.
* @param {string} [params.contentType] - The format (MIME type) of the audio. For more information about specifying
* an audio format, see **Audio formats (content types)** in the method description.
* @param {string} [params.model] - The model to use for speech recognition. If you omit the `model` parameter, the
* service uses the US English `en-US_BroadbandModel` by default.
*
* _For IBM Cloud Pak for Data,_ if you do not install the `en-US_BroadbandModel`, you must either specify a model
* with the request or specify a new default model for your installation of the service.
*
* **See also:**
* * [Using a model for speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-use)
* * [Using the default
* model](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-use#models-use-default).
* @param {boolean} [params.speechBeginEvent] - If `true`, the service returns a response object `SpeechActivity`
* which contains the time when a speech activity is detected in the stream. This can be used both in standard and low
* latency mode. This feature enables client applications to know that some words/speech has been detected and the
* service is in the process of decoding. This can be used in lieu of interim results in standard mode. See [Using
* speech recognition
* parameters](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-service-features#features-parameters).
* @param {string} [params.languageCustomizationId] - The customization ID (GUID) of a custom language model that is
* to be used with the recognition request. The base model of the specified custom language model must match the model
* specified with the `model` parameter. You must make the request with credentials for the instance of the service
* that owns the custom model. By default, no custom language model is used. See [Using a custom language model for
* speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse).
*
* **Note:** Use this parameter instead of the deprecated `customization_id` parameter.
* @param {string} [params.acousticCustomizationId] - The customization ID (GUID) of a custom acoustic model that is
* to be used with the recognition request. The base model of the specified custom acoustic model must match the model
* specified with the `model` parameter. You must make the request with credentials for the instance of the service
* that owns the custom model. By default, no custom acoustic model is used. See [Using a custom acoustic model for
* speech recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-acousticUse).
* @param {string} [params.baseModelVersion] - The version of the specified base model that is to be used with the
* recognition request. Multiple versions of a base model can exist when a model is updated for internal improvements.
* The parameter is intended primarily for use with custom models that have been upgraded for a new base model. The
* default value depends on whether the parameter is used with or without a custom model. See [Making speech
* recognition requests with upgraded custom
* models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-custom-upgrade-use#custom-upgrade-use-recognition).
* @param {number} [params.customizationWeight] - If you specify the customization ID (GUID) of a custom language
* model with the recognition request, the customization weight tells the service how much weight to give to words
* from the custom language model compared to those from the base model for the current request.
*
* Specify a value between 0.0 and 1.0. Unless a different customization weight was specified for the custom model
* when the model was trained, the default value is:
* * 0.5 for large speech models
* * 0.3 for previous-generation models
* * 0.2 for most next-generation models
* * 0.1 for next-generation English and Japanese models
*
* A customization weight that you specify overrides a weight that was specified when the custom model was trained.
* The default value yields the best performance in general. Assign a higher value if your audio makes frequent use of
* OOV words from the custom model. Use caution when setting the weight: a higher value can improve the accuracy of
* phrases from the custom model's domain, but it can negatively affect performance on non-domain phrases.
*
* See [Using customization
* weight](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-languageUse#weight).
* @param {number} [params.inactivityTimeout] - The time in seconds after which, if only silence (no speech) is
* detected in streaming audio, the connection is closed with a 400 error. The parameter is useful for stopping audio
* submission from a live microphone when a user simply walks away. Use `-1` for infinity. See [Inactivity
* timeout](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#timeouts-inactivity).
* @param {string[]} [params.keywords] - An array of keyword strings to spot in the audio. Each keyword string can
* include one or more string tokens. Keywords are spotted only in the final results, not in interim hypotheses. If
* you specify any keywords, you must also specify a keywords threshold. Omit the parameter or specify an empty array
* if you do not need to spot keywords.
*
* You can spot a maximum of 1000 keywords with a single request. A single keyword can have a maximum length of 1024
* characters, though the maximum effective length for double-byte languages might be shorter. Keywords are
* case-insensitive.
*
* See [Keyword spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting).
* @param {number} [params.keywordsThreshold] - A confidence value that is the lower bound for spotting a keyword. A
* word is considered to match a keyword if its confidence is greater than or equal to the threshold. Specify a
* probability between 0.0 and 1.0. If you specify a threshold, you must also specify one or more keywords. The
* service performs no keyword spotting if you omit either parameter. See [Keyword
* spotting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#keyword-spotting).
* @param {number} [params.maxAlternatives] - The maximum number of alternative transcripts that the service is to
* return. By default, the service returns a single transcript. If you specify a value of `0`, the service uses the
* default value, `1`. See [Maximum
* alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#max-alternatives).
* @param {number} [params.wordAlternativesThreshold] - A confidence value that is the lower bound for identifying a
* hypothesis as a possible word alternative (also known as "Confusion Networks"). An alternative word is considered
* if its confidence is greater than or equal to the threshold. Specify a probability between 0.0 and 1.0. By default,
* the service computes no alternative words. See [Word
* alternatives](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-spotting#word-alternatives).
* @param {boolean} [params.wordConfidence] - If `true`, the service returns a confidence measure in the range of 0.0
* to 1.0 for each word. By default, the service returns no word confidence scores. See [Word
* confidence](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-confidence).
* @param {boolean} [params.timestamps] - If `true`, the service returns time alignment for each word. By default, no
* timestamps are returned. See [Word
* timestamps](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metadata#word-timestamps).
* @param {boolean} [params.profanityFilter] - If `true`, the service filters profanity from all output except for
* keyword results by replacing inappropriate words with a series of asterisks. Set the parameter to `false` to return
* results with no censoring.
*
* **Note:** The parameter can be used with US English and Japanese transcription only. See [Profanity
* filtering](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#profanity-filtering).
* @param {boolean} [params.smartFormatting] - If `true`, the service converts dates, times, series of digits and
* numbers, phone numbers, currency values, and internet addresses into more readable, conventional representations in
* the final transcript of a recognition request. For US English, the service also converts certain keyword strings to
* punctuation symbols. By default, the service performs no smart formatting.
*
* **Note:** The parameter can be used with US English, Japanese, and Spanish (all dialects) transcription only.
*
* See [Smart formatting](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#smart-formatting).
 * @param {number} [params.smartFormattingVersion] - The smart formatting version to apply. For large speech models
 * and next-generation models, smart formatting is supported for US English, Brazilian Portuguese, French, German,
 * Spanish, and Canadian French.
* @param {boolean} [params.speakerLabels] - If `true`, the response includes labels that identify which words were
* spoken by which participants in a multi-person exchange. By default, the service returns no speaker labels. Setting
* `speaker_labels` to `true` forces the `timestamps` parameter to be `true`, regardless of whether you specify
* `false` for the parameter.
* * _For previous-generation models,_ the parameter can be used with Australian English, US English, German,
* Japanese, Korean, and Spanish (both broadband and narrowband models) and UK English (narrowband model)
* transcription only.
* * _For large speech models and next-generation models,_ the parameter can be used with all available languages.
*
* See [Speaker labels](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-speaker-labels).
* @param {string} [params.grammarName] - The name of a grammar that is to be used with the recognition request. If
* you specify a grammar, you must also use the `language_customization_id` parameter to specify the name of the
* custom language model for which the grammar is defined. The service recognizes only strings that are recognized by
* the specified grammar; it does not recognize other custom words from the model's words resource.
*
* See [Using a grammar for speech
* recognition](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-grammarUse).
* @param {boolean} [params.redaction] - If `true`, the service redacts, or masks, numeric data from final
* transcripts. The feature redacts any number that has three or more consecutive digits by replacing each digit with
* an `X` character. It is intended to redact sensitive numeric data, such as credit card numbers. By default, the
* service performs no redaction.
*
* When you enable redaction, the service automatically enables smart formatting, regardless of whether you explicitly
* disable that feature. To ensure maximum security, the service also disables keyword spotting (ignores the
* `keywords` and `keywords_threshold` parameters) and returns only a single final transcript (forces the
* `max_alternatives` parameter to be `1`).
*
* **Note:** The parameter can be used with US English, Japanese, and Korean transcription only.
*
* See [Numeric
* redaction](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-formatting#numeric-redaction).
* @param {boolean} [params.audioMetrics] - If `true`, requests detailed information about the signal characteristics
* of the input audio. The service returns audio metrics with the final transcription results. By default, the service
* returns no audio metrics.
*
* See [Audio metrics](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-metrics#audio-metrics).
* @param {number} [params.endOfPhraseSilenceTime] - Specifies the duration of the pause interval at which the service
* splits a transcript into multiple final results. If the service detects pauses or extended silence before it
* reaches the end of the audio stream, its response can include multiple final results. Silence indicates a point at
* which the speaker pauses between spoken words or phrases.
*
* Specify a value for the pause interval in the range of 0.0 to 120.0.
* * A value greater than 0 specifies the interval that the service is to use for speech recognition.
* * A value of 0 indicates that the service is to use the default interval. It is equivalent to omitting the
* parameter.
*
* The default pause interval for most languages is 0.8 seconds; the default for Chinese is 0.6 seconds.
*
* See [End of phrase silence
* time](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#silence-time).
* @param {boolean} [params.splitTranscriptAtPhraseEnd] - If `true`, directs the service to split the transcript into
* multiple final results based on semantic features of the input, for example, at the conclusion of meaningful
* phrases such as sentences. The service bases its understanding of semantic features on the base language model that
* you use with a request. Custom language models and grammars can also influence how and where the service splits a
* transcript.
*
* By default, the service splits transcripts based solely on the pause interval. If the parameters are used together
* on the same request, `end_of_phrase_silence_time` has precedence over `split_transcript_at_phrase_end`.
*
* See [Split transcript at phrase
* end](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#split-transcript).
* @param {number} [params.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service
* is to perform. Use the parameter to suppress word insertions from music, coughing, and other non-speech events. The
* service biases the audio it passes for speech recognition by evaluating the input audio against prior models of
* speech and non-speech activity.
*
* Specify a value between 0.0 and 1.0:
* * 0.0 suppresses all audio (no speech is transcribed).
* * 0.5 (the default) provides a reasonable compromise for the level of sensitivity.
* * 1.0 suppresses no audio (speech detection sensitivity is disabled).
*
* The values increase on a monotonic curve. Specifying one or two decimal places of precision (for example, `0.55`)
* is typically more than sufficient.
*
* The parameter is supported with all large speech models, next-generation models and with most previous-generation
* models. See [Speech detector
* sensitivity](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-sensitivity)
* and [Language model
* support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
* @param {number} [params.backgroundAudioSuppression] - The level to which the service is to suppress background
* audio based on its volume to prevent it from being transcribed as speech. Use the parameter to suppress side
* conversations or background noise.
*
* Specify a value in the range of 0.0 to 1.0:
* * 0.0 (the default) provides no suppression (background audio suppression is disabled).
* * 0.5 provides a reasonable level of audio suppression for general usage.
* * 1.0 suppresses all audio (no audio is transcribed).
*
* The values increase on a monotonic curve. Specifying one or two decimal places of precision (for example, `0.55`)
* is typically more than sufficient.
*
* The parameter is supported with all large speech models, next-generation models and with most previous-generation
* models. See [Background audio
* suppression](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-parameters-suppression)
* and [Language model
* support](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-detection#detection-support).
* @param {boolean} [params.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that
* support low latency, directs the service to produce results even more quickly than it usually does. Next-generation
* models produce transcription results faster than previous-generation models. The `low_latency` parameter causes the
* models to produce results even more quickly, though the results might be less accurate when the parameter is used.
*
* The parameter is not available for large speech models and previous-generation `Broadband` and `Narrowband` models.
* It is available for most next-generation models.
* * For a list of next-generation models that support low latency, see [Supported next-generation language
* models](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-models-ng#models-ng-supported).
* * For more information about the `low_latency` parameter, see [Low
* latency](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-interim#low-latency).
* @param {number} [params.characterInsertionBias] - For large speech models and next-generation models, an indication
* of whether the service is biased to recognize shorter or longer strings of characters when developing transcription
* hypotheses. By default, the service is optimized to produce the best balance of strings of different lengths.
*
* The default bias is 0.0. The allowable range of values is -1.0 to 1.0.
* * Negative values bias the service to favor hypotheses with shorter strings of characters.
* * Positive values bias the service to favor hypotheses with longer strings of characters.
*
* As the value approaches -1.0 or 1.0, the impact of the parameter becomes more pronounced. To determine the most
* effective value for your scenario, start by setting the value of the parameter to a small increment, such as -0.1,
* -0.05, 0.05, or 0.1, and assess how the value impacts the transcription results. Then experiment with different
* values as necessary, adjusting the value by small increments.
*
* The parameter is not available for previous-generation models.
*
* See [Character insertion
* bias](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-parsing#insertion-bias).
* @param {OutgoingHttpHeaders} [params.headers] - Custom request headers
* @returns {Promise<SpeechToTextV1.Response<SpeechToTextV1.SpeechRecognitionResults>>}
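 *
 * @example
 * // Usage sketch (not part of the generated source); assumes a configured `speechToText`
 * // client and a local FLAC file. The model name and optional parameters are illustrative.
 * const fs = require('fs');
 *
 * speechToText.recognize({
 *   audio: fs.createReadStream('audio-file.flac'),
 *   contentType: 'audio/flac',
 *   model: 'en-US_Multimedia',
 *   timestamps: true,
 *   wordConfidence: true,
 * })
 *   .then(response => console.log(JSON.stringify(response.result, null, 2)))
 *   .catch(err => console.log('error:', err));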
*/
SpeechToTextV1.prototype.recognize = function (params) {
var _params = __assign({}, params);
var _requiredParams = ['audio'];
var _validParams = ['audio', 'contentType', 'model', 'speechBeginEvent', 'languageCustomizationId', 'acousticCustomizationId', 'baseModelVersion', 'customizationWeight', 'inactivityTimeout', 'keywords', 'keywordsThreshold', 'maxAlternatives', 'wordAlternativesThreshold', 'wordConfidence', 'timestamps', 'profanityFilter', 'smartFormatting', 'smartFormattingVersion', 'speakerLabels', 'grammarName', 'redaction', 'audioMetrics', 'endOfPhraseSilenceTime', 'splitTranscriptAtPhraseEnd', 'speechDetectorSensitivity', 'backgroundAudioSuppression', 'lowLatency', 'characterInsertionBias', 'headers'];
var _validationErrors = (0, ibm_cloud_sdk_core_1.validateParams)(_params, _requiredParams, _validParams);
if (_validationErrors) {
return Promise.reject(_validationErrors);
}
var body = _params.audio;
var query = {
'model': _params.model,
'speech_begin_event': _params.speechBeginEvent,
'language_customization_id': _params.languageCustomizationId,
'acoustic_customization_id': _params.acousticCustomizationId,
'base_model_version': _params.baseModelVersion,
'customization_weight': _params.customizationWeight,
'inactivity_timeout': _params.inactivityTimeout,
'keywords': _params.keywords,
'keywords_threshold': _params.keywordsThreshold,
'max_alternatives': _params.maxAlternatives,
'word_alternatives_threshold': _params.wordAlternativesThreshold,
'word_confidence': _params.wordConfidence,
'timestamps': _params.timestamps,
'profanity_filter': _params.profanityFilter,
'smart_formatting': _params.smartFormatting,
'smart_formatting_version': _params.smartFormattingVersion,
'speaker_labels': _params.speakerLabels,
'grammar_name': _params.grammarName,
'redaction': _params.redaction,
'audio_metrics': _params.audioMetrics,
'end_of_phrase_silence_time': _params.endOfPhraseSilenceTime,
'split_transcript_at_phrase_end': _params.splitTranscriptAtPhraseEnd,
'speech_detector_sensitivity': _params.speechDetectorSensitivity,
'background_audio_suppression': _params.backgroundAudioSuppression,
'low_latency': _params.lowLatency,
'character_insertion_bias': _params.characterInsertionBias,
};
var sdkHeaders = (0, common_1.getSdkHeaders)(SpeechToTextV1.DEFAULT_SERVICE_NAME, 'v1', 'recognize');
var parameters = {
options: {
url: '/v1/recognize',
method: 'POST',
body: body,
qs: query,
},
defaultOptions: extend(true, {}, this.baseOptions, {
headers: extend(true, sdkHeaders, {
'Accept': 'application/json',
'Content-Type': _params.contentType,
}, _params.headers),
}),
};
return this.createRequest(parameters);
};
/*************************
* asynchronous
************************/
/**
* Register a callback.
*
* Registers a callback URL with the service for use with subsequent asynchronous recognition requests. The service
* attempts to register, or allowlist, the callback URL if it is not already registered by sending a `GET` request to
* the callback URL. The service passes a random alphanumeric challenge string via the `challenge_string` parameter of
* the request. The request includes an `Accept` header that specifies `text/plain` as the required response type.
*
* To be registered successfully, the callback URL must respond to the `GET` request from the service. The response
* must send status code 200 and must include the challenge string in its body. Set the `Content-Type` response header
* to `text/plain`. Upon receiving this response, the service responds to the original registration request with
* response code 201.
*
* The service sends only a single `GET` request to the callback URL. If the service does not receive a reply with a
* response code of 200 and a body that echoes the challenge string sent by the service within five seconds, it does
* not allowlist the URL; it instead sends status code 400 in response to the request to register a callback. If the
* requested callback URL is already allowlisted, the service responds to the initial registration request with
* response code 200.
*
* If you specify a user secret with the request, the service uses it as a key to calculate an HMAC-SHA1 signature of
* the challenge string in its response to the `POST` request. It sends this signature in the `X-Callback-Signature`
* header of its `GET` request to the URL during registration. It also uses the secret to calculate a signature over
* the payload of every callback notification that uses the URL. The signature provides authentication and data
* integrity for HTTP communications.
*
* After you successfully register a callback URL, you can use it with an indefinite number of recognition requests.
 * You can register a maximum of 20 callback URLs in a one-hour span of time.
*
* **See also:** [Registering a callback
* URL](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-async#register).
*
* @param {Object} params - The parameters to send to the service.
* @param {string} params.callbackUrl - An HTTP or HTTPS URL to which callback notifications are to be sent. To be
* allowlisted, the URL must successfully echo the challenge string during URL verification. During verification, the
* client can also check the signature that the service sends in the `X-Callback-Signature` header to verify the
* origin of the request.
* @param {string} [params.userSecret] - A user-specified string that the service uses to generate the HMAC-SHA1
* signature that it sends via the `X-Callback-Signature` header. The service includes the header during URL
* verification and with every notification sent to the callback URL. It calculates the signature over the payload of
* the notification. If you omit the parameter, the service does not send the header.
* @param {OutgoingHttpHeaders} [params.headers] - Custom request headers
* @returns {Promise<SpeechToTextV1.Response<SpeechToTextV1.RegisterStatus>>}
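 *
 * @example
 * // Usage sketch (not part of the generated source); assumes a configured `speechToText`
 * // client. The callback URL and secret are placeholders; the server behind the URL must
 * // echo the challenge string as described above for registration to succeed.
 * speechToText.registerCallback({
 *   callbackUrl: 'https://example.com/stt-results',
 *   userSecret: 'my-callback-secret',
 * })
 *   .then(response => console.log(JSON.stringify(response.result, null, 2)))
 *   .catch(err => console.log('error:', err));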
*/
SpeechToTextV1.prototype.registerCallback = function (params) {
var _params = __assign({}, params);
var _requiredParams = ['callbackUrl'];
var _validParams = ['callbackUrl', 'userSecret', 'headers'];
var _validationErrors = (0, ibm_cloud_sdk_core_1.validateParams)(_params, _requiredParams, _validParams);
if (_validationErrors) {
return Promise.reject(_validationErrors);
}
var query = {
'callback_url': _params.callbackUrl,
'user_secret': _params.userSecret,
};
var sdkHeaders = (0, common_1.getSdkHeaders)(SpeechToTextV1.DEFAULT_SERVICE_NAME, 'v1', 'registerCallback');
var parameters = {
options: {
url: '/v1/register_callback',
method: 'POST',
qs: query,
},
defaultOptions: extend(true, {}, this.baseOptions, {
headers: extend(true, sdkHeaders, {
'Accept': 'application/json',
}, _params.headers),
}),
};
return this.createRequest(parameters);
};
/**
* Unregister a callback.
*
* Unregisters a callback URL that was previously allowlisted with a [Register a callback](#registercallback) request
* for use with the asynchronous interface. Once unregistered, the URL can no longer be used with asynchronous
* recognition requests.
*
* **See also:** [Unregistering a callback
* URL](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-async#unregister).
*
* @param {Object} params - The parameters to send to the service.
* @param {string} params.callbackUrl - The callback URL that is to be unregistered.
* @param {OutgoingHttpHeaders} [params.headers] - Custom request headers
* @returns {Promise<SpeechToTextV1.Response<SpeechToTextV1.EmptyObject>>}
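 *
 * @example
 * // Usage sketch (not part of the generated source); assumes a configured `speechToText`
 * // client and a callback URL that was previously allowlisted.
 * speechToText.unregisterCallback({ callbackUrl: 'https://example.com/stt-results' })
 *   .then(() => console.log('callback unregistered'))
 *   .catch(err => console.log('error:', err));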
*/
SpeechToTextV1.prototype.unregisterCallback = function (params) {
var _params = __assign({}, params);
var _requiredParams = ['callbackUrl'];
var _validParams = ['callbackUrl', 'headers'];
var _validationErrors = (0, ibm_cloud_sdk_core_1.validateParams)(_params, _requiredParams, _validParams);
if (_validationErrors) {
return Promise.reject(_validationErrors);
}
var query = {
'callback_url': _params.callbackUrl,
};
var sdkHeaders = (0, common_1.getSdkHeaders)(SpeechToTextV1.DEFAULT_SERVICE_NAME, 'v1', 'unregisterCallback');
var parameters = {
options: {
url: '/v1/unregister_callback',
method: 'POST',
qs: query,
},
defaultOptions: extend(true, {}, this.baseOptions, {
headers: extend(true, sdkHeaders, {}, _params.headers),
}),
};
return this.createRequest(parameters);
};
/**
* Create a job.
*
* Creates a job for a new asynchronous recognition request. The job is owned by the instance of the service whose
* credentials are used to create it. How you learn the status and results of a job depends on the parameters you
* include with the job creation request:
* * By callback notification: Include the `callback_url` parameter to specify a URL to which the service is to send
* callback notifications when the status of the job changes. Optionally, you can also include the `events` and
* `user_token` parameters to subscribe to specific events and to specify a string that is to be included with each
* notification for the job.
* * By polling the service: Omit the `callback_url`, `events`, and `user_token` parameters. You must then use the
* [Check jobs](#checkjobs) or [Check a job](#checkjob) methods to check the status of the job, using the latter to
* retrieve the results when the job is complete.
*
* The two approaches are not mutually exclusive. You can poll the service for job status or obtain results from the
* service manually even if you include a callback URL. In both cases, you can include the `results_ttl` parameter to
* specify how long the results are to remain available after the job is complete. Using the HTTPS [Check a
* job](#checkjob) method to retrieve results is more secure than receiving them via callback notification over HTTP
* because it provides confidentiality in addition to authentication and data integrity.
*
* The method supports the same basic parameters as other HTTP and WebSocket recognition requests. It also supports
* the following parameters specific to the asynchronous interface:
* * `callback_url`
* * `events`
* * `user_token`
* * `results_ttl`
*
* You can pass a maximum of 1 GB and a minimum of 100 bytes of audio with a request. The service automatically
* detects the endianness of the incoming audio and, for audio that includes multiple channels, downmixes the audio to
* one-channel mono during transcoding. The method returns only final results; to enable interim results, use the
* WebSocket API. (With the `curl` command, use the `--data-binary` option to upload the file for the request.)
*
* **See also:** [Creating a job](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-async#create).
*
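 * For example, a job that relies on polling might be created like this (a usage sketch, not part
 * of the generated source; it assumes a configured `speechToText` client, and the camelCase
 * parameter names follow the SDK's usual convention for the query parameters listed above):
 *
 * ```javascript
 * const fs = require('fs');
 *
 * speechToText.createJob({
 *   audio: fs.createReadStream('audio-file.flac'),
 *   contentType: 'audio/flac',
 *   resultsTtl: 60,
 * })
 *   .then(response => console.log('job id:', response.result.id))
 *   .catch(err => console.log('error:', err));
 * ```
 *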
* ### Streaming mode
*
* For requests to transcribe live audio as it becomes available, you must set the `Transfer-Encoding` header to
* `chunked` to use streaming mode. In streaming mode, the service closes the connection (status code 408) if it does
* not receive at least 15 seconds of audio (including silence) in any 30-second period. The service also closes the
* connection (status code 400) if it detects no speech for `inactivity_timeout` seconds of streaming audio; use the
* `inactivity_timeout` parameter to change the default of 30 seconds.
*
* **See also:**
* * [Audio transmission](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#transmission)
* * [Timeouts](https://cloud.ibm.com/docs/speech-to-text?topic=speech-to-text-input#timeouts)
*
* ### Audio formats (content types)
*
* The service accepts audio in the following formats (MIME types).
* * For formats that are labeled **Required**, you must use the `Content-Type` header with the request to specify the
* format of the audio.
* * For all other formats, you can omit the `Content-Type` header or specify `application/octet-stream` with the
* header to have the service automatically detect the format of the audi