UNPKG

ibm-watson

Version:
145 lines (144 loc) 9.98 kB
/**
 * (C) Copyright IBM Corp. 2014, 2024.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License
 */
/// <reference types="node" />
/// <reference types="node" />
import { Authenticator } from 'ibm-cloud-sdk-core';
import { Duplex, DuplexOptions } from 'stream';
import { RecognizeWebSocketParams } from '../speech-to-text/v1';

/**
 * Shape of the writable-side state object that this declaration reads from the stream.
 * Only `highWaterMark` is declared here.
 */
interface WritableState {
    highWaterMark: number;
}

/**
 * Merges with the `RecognizeStream` class declared below (same name), adding
 * `_writableState` and `readableObjectMode` to the instance type.
 */
interface RecognizeStream extends Duplex {
    _writableState: WritableState;
    readableObjectMode: boolean;
}

/**
 * pipe()-able Node.js Readable/Writable stream - accepts binary audio and emits text in its `data` events.
 * Also emits `results` events with interim results and other data.
 *
 * Cannot be instantiated directly, instead created by calling #recognizeUsingWebSocket()
 *
 * Uses WebSockets under the hood. For audio with no recognizable speech, no `data` events are emitted.
 * @param {Object} options
 * @constructor
 */
declare class RecognizeStream extends Duplex {
    // NOTE(review): the values of these two string constants are not visible in this
    // declaration file; judging by their names they identify error conditions - confirm against the implementation.
    static WEBSOCKET_CONNECTION_ERROR: string;
    static ERROR_UNRECOGNIZED_FORMAT: string;
    /**
     * Maps a buffer to a content-type string.
     * NOTE(review): presumably inspects the leading bytes of the audio to detect its format
     * (see the auto-detection note on the constructor) - confirm against the implementation.
     *
     * @param {Buffer} buffer
     * @return {string}
     */
    static getContentType(buffer: Buffer): string;
    // Private members: declared without types in this declaration file.
    private options;
    private authenticator;
    private listening;
    private initialized;
    private finished;
    private socket;
    /**
     * pipe()-able Node.js Duplex stream - accepts binary audio and emits text/objects in its `data` events.
     *
     * Uses WebSockets under the hood. For audio with no recognizable speech, no `data` events are emitted.
     *
     * By default, only finalized text is emitted in the data events, however when `objectMode`/`readableObjectMode` is enabled, both interim and final results objects are emitted.
     * WriteableElementStream uses this, for example, to live-update the DOM with word-by-word transcriptions.
     *
     * Note that the WebSocket connection is not established until the first chunk of data is received. This allows for auto-detection of content type (for wav/flac/opus audio).
     *
     * @param {Options} options
     * @param {Authenticator} options.authenticator - Authenticator to add Authorization header
     * @param {string} [options.serviceUrl] - Base url for service (default='wss://api.us-south.speech-to-text.watson.cloud.ibm.com')
     * @param {OutgoingHttpHeaders} [options.headers] - Only works in Node.js, not in browsers. Allows for custom headers to be set, including an Authorization header (preventing the need for auth tokens)
     * @param {boolean} [options.readableObjectMode] - Emit `result` objects instead of string Buffers for the `data` events. Does not affect input (which must be binary)
     * @param {boolean} [options.objectMode] - Alias for readableObjectMode
     * @param {boolean} [options.disableSslVerification] - If true, disable SSL verification for the WebSocket connection (default=false)
     * @param {Agent} [options.agent] - custom http(s) agent, useful for using the sdk behind a proxy (Node only)
     * @param {string} [options.accessToken] - Bearer token to put in query string
     * @param {string} [options.watsonToken] - Valid Watson authentication token (for Cloud Foundry)
     * @param {string} [options.model] - The identifier of the model that is to be used for all recognition requests sent over the connection
     * @param {string} [options.languageCustomizationId] - The customization ID (GUID) of a custom language model that is to be used for all requests sent over the connection
     * @param {string} [options.acousticCustomizationId] - The customization ID (GUID) of a custom acoustic model that is to be used for the request
     * @param {string} [options.baseModelVersion] - The version of the specified base model that is to be used for all requests sent over the connection
     * @param {boolean} [options.xWatsonLearningOptOut] - Indicates whether IBM can use data that is sent over the connection to improve the service for future users (default=false)
     * @param {string} [options.xWatsonMetadata] - Associates a customer ID with all data that is passed over the connection. The parameter accepts the argument customer_id={id}, where {id} is a random or generic string that is to be associated with the data
     * @param {string} [options.contentType] - The format (MIME type) of the audio
     * @param {number} [options.customizationWeight] - Tell the service how much weight to give to words from the custom language model compared to those from the base model for the current request
     * @param {number} [options.inactivityTimeout] - The time in seconds after which, if only silence (no speech) is detected in the audio, the connection is closed (default=30)
     * @param {boolean} [options.interimResults] - If true, the service returns interim results as a stream of JSON SpeechRecognitionResults objects (default=false)
     * @param {string[]} [options.keywords] - An array of keyword strings to spot in the audio
     * @param {number} [options.keywordsThreshold] - A confidence value that is the lower bound for spotting a keyword
     * @param {number} [options.maxAlternatives] - The maximum number of alternative transcripts that the service is to return (default=1)
     * @param {number} [options.wordAlternativesThreshold] - A confidence value that is the lower bound for identifying a hypothesis as a possible word alternative
     * @param {boolean} [options.wordConfidence] - If true, the service returns a confidence measure in the range of 0.0 to 1.0 for each word (default=false)
     * @param {boolean} [options.timestamps] - If true, the service returns time alignment for each word (default=false)
     * @param {boolean} [options.profanityFilter] - If true, the service filters profanity from all output except for keyword results by replacing inappropriate words with a series of asterisks (default=true)
     * @param {boolean} [options.smartFormatting] - If true, the service converts dates, times, series of digits and numbers, phone numbers, currency values, and internet addresses into more readable, conventional representations (default=false)
     * @param {boolean} [options.speakerLabels] - If true, the response includes labels that identify which words were spoken by which participants in a multi-person exchange (default=false)
     * @param {string} [options.grammarName] - The name of a grammar that is to be used with the recognition request
     * @param {boolean} [options.redaction] - If true, the service redacts, or masks, numeric data from final transcripts (default=false)
     * @param {boolean} [options.processingMetrics] - If true, requests processing metrics about the service's transcription of the input audio (default=false)
     * @param {number} [options.processingMetricsInterval] - Specifies the interval in seconds at which the service is to return processing metrics
     * @param {boolean} [options.audioMetrics] - If true, requests detailed information about the signal characteristics of the input audio (default=false)
     * @param {number} [options.endOfPhraseSilenceTime] - Specifies the duration of the pause interval at which the service splits a transcript into multiple final results
     * @param {boolean} [options.splitTranscriptAtPhraseEnd] - If `true`, directs the service to split the transcript into multiple final results based on semantic features of the input
     * @param {number} [options.speechDetectorSensitivity] - The sensitivity of speech activity detection that the service is to perform
     * @param {number} [options.backgroundAudioSuppression] - The level to which the service is to suppress background audio based on its volume to prevent it from being transcribed as speech
     * @param {boolean} [options.lowLatency] - If `true` for next-generation `Multimedia` and `Telephony` models that support low latency, directs the service to produce results even more quickly than it usually does
     * @constructor
     */
    constructor(options: RecognizeStream.Options);
    // NOTE(review): the four members below carry no documentation upstream and their bodies
    // are not visible here. Going by the names and the constructor notes, `initialize` presumably
    // opens the WebSocket, and `sendJSON` / `sendData` presumably write a JSON message / raw
    // data to it - confirm against the implementation.
    initialize(): void;
    sendJSON(msg: any): void;
    sendData(data: any): void;
    /**
     * Flow control - don't ask for more data until we've finished what we have
     *
     * Notes:
     *
     * * This limits upload speed to 100 * options.highWaterMark / second.
     * * The default highWaterMark is 16kB, so the default max upload speed is ~1.6MB/s.
     * * Microphone input provides audio at a (downsampled) rate of:
     *   16000 samples/s * 16-bits * 1 channel = 32kB/s
     *   (note the bits to Bytes conversion there)
     *
     * @private
     * @param {Function} next
     */
    afterSend(next: any): void;
    /**
     * Prevents any more audio from being sent over the WebSocket and gracefully closes the connection.
     * Additional data may still be emitted up until the `end` event is triggered.
     */
    stop(): void;
    // `_read` and `_write` share their names with the implementation hooks of Node.js `Duplex`,
    // which this class extends.
    _read(): void;
    _write(chunk: any, encoding: any, callback: any): void;
    // NOTE(review): undocumented upstream; presumably runs once the writable side has
    // finished - confirm against the implementation.
    finish(): void;
    /**
     * Returns a Promise that resolves with Watson Transaction ID from the X-Transaction-ID header
     *
     * Works in Node.js but not in browsers (the W3C WebSocket API does not expose headers)
     *
     * @returns {Promise<string>}
     */
    getTransactionId(): Promise<string>;
}

declare namespace RecognizeStream {
    /**
     * Constructor options: Node.js `DuplexOptions` and the `RecognizeWebSocketParams`
     * request parameters, plus the connection settings declared below.
     */
    interface Options extends DuplexOptions, RecognizeWebSocketParams {
        /** Authenticator to add Authorization header. */
        authenticator: Authenticator;
        /** If true, disable SSL verification for the WebSocket connection (default=false). */
        disableSslVerification?: boolean;
        /** Base url for service (default='wss://api.us-south.speech-to-text.watson.cloud.ibm.com'). */
        serviceUrl?: string;
    }
}

export = RecognizeStream;