speech-recognition-aws-polyfill

/* eslint-disable immutable/no-this */
import {EventStreamMarshaller} from '@aws-sdk/eventstream-marshaller'
import {fromUtf8, toUtf8} from '@aws-sdk/util-utf8-node'
import {Credentials} from 'aws-sdk'
import {ifElse, not, pathEq, pathOr, pathSatisfies, propSatisfies, tap, when} from 'ramda'
import {allPass, createPipe, prop} from 'remeda'
import crypto from 'webcrypto'
import {createPresignedURL, getCredentials} from '../lib/awsV4'
import Connection from '../lib/Connection'
import {CustomEventTarget} from '../lib/CustomEventTarget'
import MicStream from '../lib/MicStream'
import {convertAudioToBinaryMessage} from '../lib/audioUtils'
import {AWSSpeechRecognitionEvent, AWSTranscribeResponse, Config, ListenerCallback} from '../types/shared'

/** config keys the caller must always supply */
type requiredConfigs = Pick<Config, "region" | "IdentityPoolId">
/** config keys that fall back to the defaults set in the constructor */
type optionalConfigs = Omit<Config, "region" | "IdentityPoolId">
/** shape of the config object accepted by the constructor and by `create` */
export type configArgs = requiredConfigs & Partial<optionalConfigs>

/**
 * Percent-decode `value`, returning it unchanged when it is not valid
 * percent-encoding. `decodeURIComponent` throws a URIError on input such as
 * "100%", which would otherwise escape from the websocket message handler.
 */
const safeDecodeURIComponent = (value: string): string => {
  try {
    return decodeURIComponent(value)
  } catch {
    return value
  }
}

/** Wrap any thrown value in an Error so it can always be reported via the `error` event */
const toError = (value: unknown): Error =>
  value instanceof Error ? value : new Error(String(value))

/**
 * SpeechRecognition implementation that captures microphone audio and streams it
 * over a websocket to AWS Transcribe, re-emitting the transcription as
 * SpeechRecognition-style events.
 */
class AWSRecognizer extends CustomEventTarget implements SpeechRecognition {
  /** in case other recognizers are built in the future (e.g. Azure) */
  static type = 'AWS'

  /**
   * true if the library is supported by the current browser.
   * The `typeof` guard is needed because optional chaining does not protect
   * against an undeclared global (e.g. when the module is loaded outside a browser).
   */
  static isSupported = typeof navigator !== 'undefined' && !!navigator?.mediaDevices?.getUserMedia

  /** polyfill-specific config */
  public config: Config

  /** if the library is currently capturing/transcribing audio */
  public listening = false

  /** the language (default en-US) */
  public lang: Config['lang']

  /** whether to continuously transcribe audio until .stop() is called */
  public continuous: boolean

  /**
   * a proxy for new AWSRecognizer(config)
   *
   * @param config - configuration captured by the returned class
   * @returns a class whose zero-argument constructor builds a recognizer with `config`
   */
  static create(config: configArgs): typeof SpeechRecognition {
    return class AWSRecognizerWithConfig extends AWSRecognizer {
      constructor() {
        super(config)
      }
    }
  }

  /**
   * @param config - must contain `IdentityPoolId` and `region`; other keys override the defaults
   * @throws Error when `IdentityPoolId` or `region` is missing
   */
  constructor(config: configArgs) {
    super()
    if (!config.IdentityPoolId || !config.region) {
      throw new Error('Could not create AWS recognizer: missing configuration, see: https://github.com/ceuk/speech-recognition-aws-polyfill#configuration')
    }
    const defaults: optionalConfigs = {
      sampleRate: 12000,
      lang: 'en-US',
      continuous: false
    }
    this.config = Object.assign(defaults, config)
    this.lang = this.config.lang
    this.continuous = this.config.continuous
  }

  /** start capturing/transcribing audio (no-op while already listening) */
  start() {
    if (this.listening) return
    this.dispatchEvent(new Event('start'))
    navigator.mediaDevices.getUserMedia({audio: true, video: false})
      .then(this.establishConnection.bind(this))
      .catch((err: unknown) => {
        this.emitError(toError(err))
      })
  }

  /** stop capturing and return any final transcriptions */
  public stop() {
    MicStream.getInstance()?.end()
    Connection.getInstance()?.close()
    this.listening = false
    this.dispatchEvent(new Event('audioend'))
  }

  /** stop capturing and don't emit any transcribed audio (no-op when not listening) */
  public abort() {
    if (this.listening) {
      MicStream.getInstance()?.end()
      Connection.getInstance()?.close()
      this.listening = false
      this.dispatchEvent(new Event('audioend'))
    }
  }

  /**
   * dispatch transcription result
   *
   * Stops capturing first when not in continuous mode, then dispatches `result`
   * (or `nomatch` when the transcript is empty or a single character), and
   * finally `end` if capturing has stopped.
   */
  private emitResult(transcript: string) {
    if (!this.continuous && this.listening) {
      this.stop()
    }
    if (transcript && transcript.length > 1) {
      this.dispatchEvent(new AWSSpeechRecognitionEvent('result', [{
        0: {
          transcript,
          confidence: 1
        },
        isFinal: !this.listening
      }]))
    } else {
      this.dispatchEvent(new Event('nomatch'))
    }
    if (!this.listening) {
      this.dispatchEvent(new Event('end'))
    }
  }

  /** stop capturing and dispatch an `error` event carrying the error and its message */
  private emitError(error: Error) {
    this.stop()
    // ErrorEvent takes an init dictionary; passing the Error itself leaves `event.error` unset
    this.dispatchEvent(new ErrorEvent('error', {error, message: error.message}))
  }

  /** dispatch events related to sound start */
  private emitSoundStart() {
    // these are ordinary lifecycle events, not errors
    this.dispatchEvent(new Event('speechstart'))
    this.dispatchEvent(new Event('soundstart'))
  }

  /** dispatch events related to sound end */
  private emitSoundEnd() {
    this.dispatchEvent(new Event('speechend'))
    this.dispatchEvent(new Event('soundend'))
  }

  /**
   * authenticate and connect to AWS Transcribe
   *
   * @param mediaStream - the microphone stream obtained from getUserMedia
   */
  private async establishConnection(mediaStream: MediaStream) {
    this.listening = true
    this.dispatchEvent(new Event('audiostart'))
    try {
      const {IdentityPoolId, region} = this.config
      const credentials = await getCredentials({IdentityPoolId, region}) as Credentials
      Connection.setUrl(this.getSignedURL(credentials))
      MicStream.setStream(mediaStream)
      this.streamAudioToWebSocket()
    } catch (err) {
      // report every failure, including non-Error throwables, so `listening` is reset
      this.emitError(toError(err))
    }
  }

  /** get a signed url using specified credentials */
  private getSignedURL(credentials: Credentials) {
    const endpoint = `transcribestreaming.${this.config.region}.amazonaws.com:8443`
    return createPresignedURL(
      'GET',
      endpoint,
      '/stream-transcription-websocket',
      'transcribe',
      // hex SHA-256 digest of the empty string
      crypto.createHash('sha256').update('', 'utf8').digest('hex'),
      {
        key: credentials.accessKeyId,
        secret: credentials.secretAccessKey,
        timestamp: Date.now(),
        sessionToken: credentials.sessionToken,
        protocol: 'wss',
        expires: 15,
        region: this.config.region,
        query: `language-code=${this.lang}&media-encoding=pcm&sample-rate=${this.config.sampleRate}`
      }
    )
  }

  /** handle streaming received audio buffer to AWS transcribe */
  private streamAudioToWebSocket() {
    try {
      // when we get audio data from the mic, send it to the WebSocket if possible
      const connection = Connection.getInstance()
      if (!connection) {
        console.error('no usable connection')
        return
      }
      connection.onopen = () => {
        const micStream = MicStream.getInstance()
        if (!micStream) {
          console.error('no usable stream')
          return
        }

        // emit sound start events
        this.emitSoundStart()

        // when audio is received from the mic stream, send it to AWS
        micStream.on('data', createPipe(
          // emit the sound end if we are about to stop capturing
          // NOTE(review): in non-continuous mode this runs for every audio chunk — confirm intended
          when(
            () => !this.continuous,
            tap(() => this.emitSoundEnd()),
          ),
          // the audio stream is raw audio bytes. Transcribe expects PCM with additional metadata, encoded as binary
          (audioChunk: Buffer) => convertAudioToBinaryMessage(audioChunk, this.config.sampleRate),
          when(() => Connection.isActive(), connection.send.bind(connection))
        ))

        // handle messages from the socket
        this.handleSocketMessages()
      }
    } catch (error) {
      this.emitError(toError(error))
    }
  }

  /** handle websocket responses */
  private handleSocketMessages() {
    const eventStreamMarshaller = new EventStreamMarshaller(toUtf8, fromUtf8)
    const stringEncode = (data: ArrayBufferLike) => new TextDecoder('utf-8').decode(data)

    // convert the binary event stream message to JSON
    type ParseMessageBody = (response: {body: ArrayBufferLike}) => AWSTranscribeResponse
    const parseMessageBody: ParseMessageBody = createPipe(
      prop('body'),
      stringEncode,
      JSON.parse.bind(JSON)
    )

    const connection = Connection.getInstance()
    if (connection) {
      connection.onmessage = createPipe(
        prop('data'),
        Buffer.from,
        (buffer: Buffer) => eventStreamMarshaller.unmarshall(buffer) as MessageEvent,
        ifElse(
          pathEq(['headers', ':message-type', 'value'], 'event'),
          // valid response
          createPipe(
            parseMessageBody,
            pathOr([], ['Transcript', 'Results']),
            when(
              // validate the results: at least one non-partial result with an alternative
              allPass([
                propSatisfies((x: number) => x > 0, 'length'),
                pathSatisfies((x: number) => x > 0, [0, 'Alternatives', 'length']),
                pathSatisfies(not, [0, 'IsPartial'])
              ]),
              // emit the transcription result
              createPipe(
                pathOr('', [0, 'Alternatives', 0, 'Transcript']),
                // tolerate transcripts containing a literal "%" that is not valid percent-encoding
                safeDecodeURIComponent,
                this.emitResult.bind(this)
              )
            )
          ),
          // error response
          createPipe(
            parseMessageBody,
            prop('Message'),
            console.error
          )
        )
      )
    }
  }

  // stub some unimplemented props/methods
  set interimResults(_) {
    console.warn('`interimResults` is not yet implemented in the AWS polyfill')
  }

  get interimResults() {
    return false
  }

  set maxAlternatives(_) {
    console.warn('`maxAlternatives` is not yet implemented in the AWS polyfill')
  }

  get maxAlternatives() {
    return 1
  }

  set grammars(_) {
    console.warn('`grammars` is not yet implemented in the AWS polyfill')
  }

  get grammars() {
    console.warn('`grammars` is not yet implemented in the AWS polyfill')
    // `typeof` avoids a ReferenceError in browsers that do not define the constructor
    return typeof SpeechGrammarList !== 'undefined'
      ? new SpeechGrammarList()
      : ([] as unknown as SpeechGrammarList)
  }

  // proxy event listeners
  // NOTE(review): each assignment adds a listener; earlier handlers are not replaced
  set onaudiostart(fn: ListenerCallback) {
    this.addEventListener('audiostart', fn)
  }

  set onaudioend(fn: ListenerCallback) {
    this.addEventListener('audioend', fn)
  }

  set onend(fn: ListenerCallback) {
    this.addEventListener('end', fn)
  }

  set onerror(fn: ListenerCallback) {
    this.addEventListener('error', fn)
  }

  set onnomatch(fn: ListenerCallback) {
    this.addEventListener('nomatch', fn)
  }

  set onresult(fn: ListenerCallback) {
    this.addEventListener('result', fn)
  }

  set onsoundstart(fn: ListenerCallback) {
    this.addEventListener('soundstart', fn)
  }

  set onsoundend(fn: ListenerCallback) {
    this.addEventListener('soundend', fn)
  }

  set onspeechstart(fn: ListenerCallback) {
    this.addEventListener('speechstart', fn)
  }

  set onspeechend(fn: ListenerCallback) {
    this.addEventListener('speechend', fn)
  }

  set onstart(fn: ListenerCallback) {
    this.addEventListener('start', fn)
  }
}

export default AWSRecognizer