/**
* (C) Copyright IBM Corp. 2014, 2024.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <reference types="node" />
import { Authenticator } from 'ibm-cloud-sdk-core';
import { Duplex, DuplexOptions } from 'stream';
import { RecognizeWebSocketParams } from '../speech-to-text/v1';
interface WritableState {
highWaterMark: number;
}
interface RecognizeStream extends Duplex {
_writableState: WritableState;
readableObjectMode: boolean;
}
/**
* pipe()-able Node.js Readable/Writable stream - accepts binary audio and emits text in its `data` events.
* Also emits `results` events with interim results and other data.
*
* Cannot be instantiated directly; instead, it is created by calling SpeechToTextV1#recognizeUsingWebSocket()
*
* Uses WebSockets under the hood. For audio with no recognizable speech, no `data` events are emitted.
* @param {Object} options
* @constructor
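*
* @example
* // Minimal usage sketch (assumes a valid IAM API key and a local
* // `audio.wav` file; `<apikey>` is a placeholder):
* import SpeechToTextV1 from 'ibm-watson/speech-to-text/v1';
* import { IamAuthenticator } from 'ibm-watson/auth';
* import * as fs from 'fs';
*
* const speechToText = new SpeechToTextV1({
*   authenticator: new IamAuthenticator({ apikey: '<apikey>' }),
* });
* const recognizeStream = speechToText.recognizeUsingWebSocket({
*   contentType: 'audio/wav',
* });
* recognizeStream.setEncoding('utf8'); // emit strings instead of Buffers
* fs.createReadStream('audio.wav').pipe(recognizeStream);
* recognizeStream.on('data', (transcript) => console.log(transcript));
* recognizeStream.on('error', (err) => console.error(err));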
*/
declare class RecognizeStream extends Duplex {
static WEBSOCKET_CONNECTION_ERROR: string;
static ERROR_UNRECOGNIZED_FORMAT: string;
static getContentType(buffer: Buffer): string;
private options;
private authenticator;
private listening;
private initialized;
private finished;
private socket;
/**
* pipe()-able Node.js Duplex stream - accepts binary audio and emits text/objects in its `data` events.
*
* Uses WebSockets under the hood. For audio with no recognizable speech, no `data` events are emitted.
*
* By default, only finalized text is emitted in the data events, however when `objectMode`/`readableObjectMode` is enabled, both interim and final results objects are emitted.
* WritableElementStream uses this, for example, to live-update the DOM with word-by-word transcriptions.
*
* Note that the WebSocket connection is not established until the first chunk of data is received. This allows for auto-detection of content type (for wav/flac/opus audio).
*
* @param {Options} options
* @param {Authenticator} options.authenticator - Authenticator to add Authorization header
* @param {string} [options.serviceUrl] - Base url for service (default='wss://api.us-south.speech-to-text.watson.cloud.ibm.com')
* @param {OutgoingHttpHeaders} [options.headers] - Only works in Node.js, not in browsers. Allows for custom headers to be set, including an Authorization header (preventing the need for auth tokens)
* @param {boolean} [options.readableObjectMode] - Emit `result` objects instead of string Buffers for the `data` events. Does not affect input (which must be binary)
* @param {boolean} [options.objectMode] - Alias for readableObjectMode
* @param {boolean} [options.disableSslVerification] - If true, disable SSL verification for the WebSocket connection (default=false)
* @param {Agent} [options.agent] - custom http(s) agent, useful for using the sdk behind a proxy (Node only)
* @param {string} [options.accessToken] - Bearer token to put in query string
* @param {string} [options.watsonToken] - Valid Watson authentication token (for Cloud Foundry)
* @param {string} [options.model] - The identifier of the model that is to be used for all recognition requests sent over the connection
* @param {string} [options.languageCustomizationId] - The customization ID (GUID) of a custom language model that is to be used for all requests sent over the connection
* @param {string} [options.acousticCustomizationId] - The customization ID (GUID) of a custom acoustic model that is to be used for the request
* @param {string} [options.baseModelVersion] - The version of the specified base model that is to be used for all requests sent over the connection
* @param {boolean} [options.xWatsonLearningOptOut] - Indicates whether IBM can use data that is sent over the connection to improve the service for future users (default=false)
* @param {string} [options.xWatsonMetadata] - Associates a customer ID with all data that is passed over the connection. The parameter accepts the argument customer_id={id}, where {id} is a random or generic string that is to be associated with the data
* @param {string} [options.contentType] - The format (MIME type) of the audio
* @param {number} [options.customizationWeight] - Tell the service how much weight to give to words from the custom language model compared to those from the base model for the current request
* @param {number} [options.inactivityTimeout] - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed (default=30)
* @param {boolean} [options.interimResults] - If true, the service returns interim results as a stream of JSON SpeechRecognitionResults objects (default=false)
* @param {string[]} [options.keywords] - An array of keyword strings to spot in the audio
* @param {number} [options.keywordsThreshold] - A confidence value that is the lower bound for spotting a keyword
* @param {number} [options.maxAlternatives] - The maximum number of alternative transcripts that the service is to return (default=1)
* @param {number} [options.wordAlternativesThreshold] - A confidence value that is the lower bound for identifying a hypothesis as a possible word alternative
* @param {boolean} [options.wordConfidence] - If true, the service returns a confidence measure in the range of 0.0 to 1.0 for each word (default=false)
* @param {boolean} [options.timestamps] - If true, the service returns time alignment for each word (default=false)
* @param {boolean} [options.profanityFilter] - If true, the service filters profanity from all output except for keyword results by replacing inappropriate words with a series of asterisks (default=true)
* @param {boolean} [options.smartFormatting] - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable, conventional representations (default=false)
* @param {boolean} [options.speakerLabels] - If true, the response includes labels that identify which words were spoken by which participants in a multi-person exchange (default=false)
* @param {string} [options.grammarName] - The name of a grammar that is to be used with the recognition request
* @param {boolean} [options.redaction] - If true, the service redacts, or masks, numeric data from final transcripts (default=false)
* @param {boolean} [options.processingMetrics] - If true, requests processing metrics about the service's transcription of the input audio (default=false)
* @param {number} [options.processingMetricsInterval] - Specifies the interval in seconds at which the service is to return processing metrics
* @param {boolean} [options.audioMetrics] - If true, requests detailed information about the signal characteristics of the input audio (default=false)
* @param {number} [options.endOfPhraseSilenceTime] - Specifies the duration, in seconds, of the pause interval at which the service splits a transcript into multiple final results
* @param {boolean} [options.splitTranscriptAtPhraseEnd] - If `true`, directs the service to split the transcript into multiple final results based on semantic features of the input
* @param {number} [options.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service is to perform
* @param {number} [options.backgroundAudioSuppression] - The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech
* @param {boolean} [options.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that support low latency, directs the service to produce results even more quickly than it usually does
* @constructor
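*
* @example
* // Sketch of objectMode usage (assumes `speechToText` is a configured
* // SpeechToTextV1 instance; the event shape follows the service's
* // SpeechRecognitionResults):
* const stream = speechToText.recognizeUsingWebSocket({
*   contentType: 'audio/wav',
*   objectMode: true,      // emit result objects instead of text
*   interimResults: true,  // include non-final hypotheses
* });
* stream.on('data', (event) => {
*   event.results.forEach((result) => {
*     const label = result.final ? 'final:' : 'interim:';
*     console.log(label, result.alternatives[0].transcript);
*   });
* });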
*/
constructor(options: RecognizeStream.Options);
initialize(): void;
sendJSON(msg: any): void;
sendData(data: any): void;
/**
* Flow control - don't ask for more data until we've finished what we have
*
* Notes:
*
* This limits upload speed to 100 * options.highWaterMark bytes / second.
*
* The default highWaterMark is 16kB, so the default max upload speed is ~1.6MB/s.
*
* Microphone input provides audio at a (downsampled) rate of:
* 16000 samples/s * 16 bits/sample * 1 channel = 32 kB/s
* (note the bits-to-bytes conversion there)
*
* @private
* @param {Function} next
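*
* @example
* // Sketch: raising the writable highWaterMark to lift the upload cap.
* // Assumes DuplexOptions.highWaterMark is honored (Options extends
* // DuplexOptions); 64 kB * 100 sends/s gives a ~6.4 MB/s ceiling:
* const stream = speechToText.recognizeUsingWebSocket({
*   contentType: 'audio/flac',
*   highWaterMark: 64 * 1024,
* });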
*/
afterSend(next: any): void;
/**
* Prevents any more audio from being sent over the WebSocket and gracefully closes the connection.
* Additional data may still be emitted up until the `end` event is triggered.
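*
* @example
* // Sketch: stop sending audio, then wait for the remaining results:
* stream.stop();
* stream.on('end', () => console.log('transcription complete'));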
*/
stop(): void;
_read(): void;
_write(chunk: any, encoding: any, callback: any): void;
finish(): void;
/**
* Returns a Promise that resolves with the Watson Transaction ID from the X-Transaction-ID header
*
* Works in Node.js but not in browsers (the W3C WebSocket API does not expose headers)
*
* @return {Promise<String>}
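*
* @example
* // Sketch (Node.js only): capture the transaction ID, e.g. for support requests:
* stream.getTransactionId()
*   .then((id) => console.log('X-Transaction-ID:', id))
*   .catch((err) => console.error('transaction ID unavailable:', err));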
*/
getTransactionId(): Promise<string>;
}
declare namespace RecognizeStream {
interface Options extends DuplexOptions, RecognizeWebSocketParams {
authenticator: Authenticator;
disableSslVerification?: boolean;
serviceUrl?: string;
}
}
export = RecognizeStream;