@project-sunbird/open-speech-streaming-client
Version:
Streaming client library for EkStep open speech API
326 lines (275 loc) • 11.2 kB
JavaScript
const hark= require('hark');
const {io} = require('socket.io-client');
const SocketStatus =require('./socket-status');
module.exports = function () {
// Alias for the instance so nested functions and callbacks can reach it.
const _this = this;
// Live resources; the null ones are populated by connect() / startStreaming().
Object.assign(this, {
    socket: null,              // socket.io client created by connect()
    defaultSampleRate: 48000,  // replaced by the AudioContext's sampleRate on start
    speechEvents: null,        // hark detector built by setSilenceDetector()
    input: null,               // media stream source node
    processor: null,           // script processor node feeding streamAudioProcess()
});
/**
 * Note / TODO:
 *  - Add automatic microphone close.
 *  - Check for a null socket everywhere the socket is used.
 *  - Add default callbacks to avoid errors.
 *  - Add error handling.
 */
// state parameters
Object.assign(this, {
    audioData: [],               // Float32Array chunks captured since the mic started
    recordingLength: 0,          // sample count handed to flattenArray() on stop
    isStreaming: false,          // true between mic start and mic stop
    isStreamingOver: false,      // set once the end-of-stream packet has been emitted
    isSilenceTransmitted: true,  // false while a speech run still needs its trailing silent chunk
    localBuffer: null,           // most recent non-speech chunk, prepended to the next speech chunk
    language: 'en-IN',           // replaced by the language passed to connect()
    bufferSize: 16384,           // script processor buffer size
    isSpeaking: false,           // toggled by the hark 'speaking' / 'stopped_speaking' events
    isSocketConnected: false,    // NOTE: the same property name is assigned a function later in this constructor
});
// Bare property read with no effect; userId is only assigned in the socket 'connect' handler.
this.userId;
/**
 * Resets all per-session state right before the microphone starts.
 *
 * Besides flagging the session as streaming and clearing the recording, this
 * also clears the transient streaming flags. Without that, a silent chunk left
 * in `localBuffer` by the previous session would be prepended to the first
 * speech packet of the new one.
 */
function setStateOnMicStart() {
    _this.isStreaming = true;
    _this.isStreamingOver = false;
    _this.audioData = [];
    _this.recordingLength = 0;
    // Nothing has been buffered or sent yet in this session.
    _this.localBuffer = null;
    _this.isSilenceTransmitted = true;
    // A fresh detector only reports transitions, so start from "not speaking".
    _this.isSpeaking = false;
}
/**
 * Flags the session as no longer streaming. The recorded audio is left in
 * place so it can still be turned into a WAV blob afterwards.
 */
function setStateOnMicStop() {
    Object.assign(_this, { isStreaming: false });
}
/**
 * Requests an audio-only capture stream from the browser.
 *
 * @returns {Promise<MediaStream>} resolves with the stream returned by
 *   `navigator.mediaDevices.getUserMedia`; rejects if that call rejects.
 */
async function getAudioMediaStream() {
    const audioOnly = {audio: true, video: false};
    return navigator.mediaDevices.getUserMedia(audioOnly);
}
/**
 * Concatenates recorded chunks into one contiguous Float32Array.
 *
 * @param {Float32Array[]} channelBuffer chunks in recording order
 * @param {number} recordingLength length of the returned array; any tail not
 *   covered by the chunks stays zero-filled
 * @returns {Float32Array} the merged samples
 * @throws {RangeError} when the chunks do not fit into `recordingLength`
 */
function flattenArray(channelBuffer, recordingLength) {
    const merged = new Float32Array(recordingLength);
    let writeAt = 0;
    for (const chunk of channelBuffer) {
        merged.set(chunk, writeAt);
        writeAt += chunk.length;
    }
    return merged;
}
/**
 * Writes a string into a DataView, one byte per UTF-16 code unit.
 * Each code unit is stored with `setUint8`, so only its low 8 bits are kept.
 *
 * @param {DataView} view destination view
 * @param {number} offset byte offset of the first character
 * @param {string} string text to write
 */
function writeUTFBytes(view, offset, string) {
    string.split('').forEach((unit, position) => {
        view.setUint8(offset + position, unit.charCodeAt(0));
    });
}
/**
 * Encodes float samples as a mono, 16-bit PCM WAV file.
 *
 * The sample rate written into the header is `_this.defaultSampleRate`.
 *
 * @param {Float32Array|number[]} finalBuffer samples, nominally in [-1, 1];
 *   values outside that range are clamped instead of wrapping around
 * @returns {Blob} a blob of type 'audio/wav' (44-byte header + PCM data)
 */
function generateWavBlob(finalBuffer) {
    const HEADER_BYTES = 44;
    const CHANNELS = 1;
    const BITS_PER_SAMPLE = 16;
    const blockAlign = CHANNELS * (BITS_PER_SAMPLE / 8); // bytes per sample frame
    const dataBytes = finalBuffer.length * blockAlign;
    const sampleRate = _this.defaultSampleRate;

    const buffer = new ArrayBuffer(HEADER_BYTES + dataBytes);
    const view = new DataView(buffer);

    // RIFF chunk descriptor
    writeUTFBytes(view, 0, 'RIFF');
    // The chunk size excludes the 8 bytes of the 'RIFF' tag and the size field itself.
    view.setUint32(4, HEADER_BYTES - 8 + dataBytes, true);
    writeUTFBytes(view, 8, 'WAVE');

    // FMT sub-chunk
    writeUTFBytes(view, 12, 'fmt ');
    view.setUint32(16, 16, true); // chunkSize
    view.setUint16(20, 1, true); // wFormatTag: PCM
    view.setUint16(22, CHANNELS, true); // wChannels
    view.setUint32(24, sampleRate, true); // dwSamplesPerSec
    view.setUint32(28, sampleRate * blockAlign, true); // dwAvgBytesPerSec
    view.setUint16(32, blockAlign, true); // wBlockAlign
    view.setUint16(34, BITS_PER_SAMPLE, true); // wBitsPerSample

    // data sub-chunk
    writeUTFBytes(view, 36, 'data');
    view.setUint32(40, dataBytes, true);

    // PCM samples, little-endian
    let writeAt = HEADER_BYTES;
    for (let i = 0; i < finalBuffer.length; i++) {
        // Clamp first: setInt16 would otherwise wrap out-of-range values.
        const sample = Math.max(-1, Math.min(1, finalBuffer[i]));
        view.setInt16(writeAt, sample * 0x7FFF, true);
        writeAt += blockAlign;
    }

    return new Blob([view], {type: 'audio/wav'});
}
/**
 * Creates a hark detector for the stream and mirrors its speaking state
 * into `_this.isSpeaking`.
 *
 * @param {MediaStream} audioStream stream handed to hark
 * @param {AudioContext} context passed to hark as its `audioContext` option
 */
function setSilenceDetector(audioStream, context) {
    const detector = hark(audioStream, {audioContext: context});
    _this.speechEvents = detector;

    const markSpeaking = (speaking) => () => {
        _this.isSpeaking = speaking;
    };
    detector.on('speaking', markSpeaking(true));
    detector.on('stopped_speaking', markSpeaking(false));
}
/**
 * Down-samples float audio by averaging and converts it to 16-bit PCM.
 *
 * Every output sample is the mean of the input samples that fall into its
 * window, clamped to [-1, 1] and scaled to the Int16 range. When both rates
 * are equal each window is a single sample, so the data is only converted;
 * the result is therefore always Int16 PCM, never the untouched float input.
 *
 * @param {Float32Array|number[]} buffer input samples, nominally in [-1, 1]
 * @param {number} sampleRate rate of `buffer` in Hz
 * @param {number} outSampleRate target rate in Hz; must not exceed `sampleRate`
 * @returns {ArrayBuffer} Int16 PCM samples at `outSampleRate`
 * @throws {Error} when `outSampleRate` is greater than `sampleRate`
 */
function downSampleBuffer(buffer, sampleRate, outSampleRate) {
    if (outSampleRate > sampleRate) {
        throw new Error("down-sampling rate should be smaller than original sample rate");
    }
    const sampleRateRatio = sampleRate / outSampleRate;
    const result = new Int16Array(Math.round(buffer.length / sampleRateRatio));
    let windowStart = 0;
    for (let outIndex = 0; outIndex < result.length; outIndex++) {
        const windowEnd = Math.min(buffer.length, Math.round((outIndex + 1) * sampleRateRatio));
        let sum = 0;
        for (let i = windowStart; i < windowEnd; i++) {
            sum += buffer[i];
        }
        const count = windowEnd - windowStart;
        // An empty window yields silence instead of NaN.
        const mean = count > 0 ? sum / count : 0;
        // Clamp on both sides; an unclamped overshoot would wrap in the Int16Array.
        result[outIndex] = Math.max(-1, Math.min(1, mean)) * 0x7FFF;
        windowStart = windowEnd;
    }
    return result.buffer;
}
/**
 * Returns a new ArrayBuffer holding the bytes of `buffer1` followed by the
 * bytes of `buffer2`. Neither input is modified.
 *
 * @param {ArrayBuffer} buffer1 leading bytes
 * @param {ArrayBuffer} buffer2 trailing bytes
 * @returns {ArrayBuffer} the concatenation
 */
function appendBuffer(buffer1, buffer2) {
    const headLength = buffer1.byteLength;
    const joined = new Uint8Array(headLength + buffer2.byteLength);
    joined.set(new Uint8Array(buffer1), 0);
    joined.set(new Uint8Array(buffer2), headLength);
    return joined.buffer;
}
/**
 * `onaudioprocess` handler: records every chunk and streams it over the socket.
 *
 * Recording always happens. Streaming is skipped when no socket exists, so a
 * missing connection no longer throws on every audio callback.
 *
 * While streaming, each chunk is down-sampled to 16 kHz and then:
 *  - speech: emitted, prefixed with the buffered pre-speech chunk if any;
 *  - first non-speech chunk after speech: emitted once;
 *  - further non-speech chunks: only kept as the latest pre-speech buffer.
 * Once streaming is off, a single final chunk is emitted with the last flag set.
 *
 * The trailing `mic_data` arguments look like (isSpeaking, isEndOfStream);
 * NOTE(review): confirm the meaning against the server.
 *
 * @param {AudioProcessingEvent} e event whose `inputBuffer` holds the chunk
 */
function streamAudioProcess(e) {
    const samples = e.inputBuffer.getChannelData(0);

    // Store a copy so the recording does not alias the event's own buffer.
    _this.audioData.push(new Float32Array(samples));
    // Count what was really stored, so flattenArray() gets a matching length.
    _this.recordingLength += samples.length;

    const socket = _this.socket;
    if (!socket) {
        return;
    }

    if (_this.isStreaming !== true) {
        if (!_this.isStreamingOver) {
            const lastChunk = downSampleBuffer(samples, _this.defaultSampleRate, 16000);
            _this.isStreamingOver = true;
            socket.emit('mic_data', lastChunk, _this.language, false, true);
        }
        return;
    }

    _this.isStreamingOver = false;
    let chunk16k = downSampleBuffer(samples, _this.defaultSampleRate, 16000);

    if (_this.isSpeaking) {
        _this.isSilenceTransmitted = false;
        if (_this.localBuffer !== undefined && _this.localBuffer !== null) {
            chunk16k = appendBuffer(_this.localBuffer, chunk16k);
        }
        socket.emit('mic_data', chunk16k, _this.language, true, false);
        _this.localBuffer = null;
        return;
    }

    if (!_this.isSilenceTransmitted) {
        _this.isSilenceTransmitted = true;
        socket.emit('mic_data', chunk16k, _this.language, false, false);
        return;
    }

    _this.localBuffer = chunk16k;
}
// Resources owned by the current capture session. They are tracked here so
// that stopping (or a failed start) can release the microphone and so a new
// session can replace the previous 'response' listener instead of stacking one.
const captureSession = {
    streams: [],           // MediaStreams whose tracks must be stopped
    context: null,         // AudioContext driving the processing graph
    responseSocket: null,  // socket the 'response' handler is registered on
    responseHandler: null, // the registered 'response' handler
};

/**
 * Tears down the detector and audio graph and releases the microphone.
 * Safe to call when nothing is active.
 */
function releaseCaptureSession() {
    if (_this.speechEvents) {
        _this.speechEvents.stop();
    }
    if (_this.input) {
        _this.input.disconnect();
    }
    if (_this.processor) {
        _this.processor.onaudioprocess = null;
        _this.processor.disconnect();
    }
    _this.speechEvents = null;
    _this.input = null;
    _this.processor = null;

    // Stopping every track is what actually gives the microphone back.
    captureSession.streams.forEach((stream) => {
        stream.getTracks().forEach((track) => track.stop());
    });
    captureSession.streams = [];

    const context = captureSession.context;
    captureSession.context = null;
    if (context && context.state !== 'closed') {
        const closing = context.close();
        if (closing && typeof closing.catch === 'function') {
            // Best effort: the graph is already disconnected, so a failed
            // close leaves nothing for the caller to act on.
            closing.catch(() => {});
        }
    }
}

/**
 * Starts capturing the microphone and streaming it over the connected socket.
 *
 * Requires connect() to have created the socket; otherwise `errorCallback`
 * is invoked before the microphone is requested. Any failure rolls the
 * streaming state back and releases whatever was already acquired.
 *
 * @param {function(string): void} [responseCallback] receives each 'response'
 *   payload from the server (lower-cased when its language is "en-IN")
 * @param {function(*): void} [errorCallback] receives the error when start-up fails
 * @returns {Promise<void>} settles once start-up has finished or failed
 */
this.startStreaming = async function (responseCallback = () => {}, errorCallback = () => {}) {
    try {
        if (!_this.socket) {
            throw new Error('Socket is not initialised; call connect() before startStreaming().');
        }
        // A previous session that was never stopped must not keep recording in parallel.
        releaseCaptureSession();
        setStateOnMicStart();

        const stream = await getAudioMediaStream();
        captureSession.streams.push(stream);

        const audioContextClass = window.AudioContext || window.webkitAudioContext;
        const context = new audioContextClass({
            latencyHint: 'interactive',
        });
        captureSession.context = context;
        _this.defaultSampleRate = context.sampleRate;

        // The detector works on its own clone; its tracks are stopped separately.
        const detectorStream = stream.clone();
        captureSession.streams.push(detectorStream);
        setSilenceDetector(detectorStream, context);

        _this.input = context.createMediaStreamSource(stream);
        _this.processor = context.createScriptProcessor(_this.bufferSize, 1, 1);
        _this.input.connect(_this.processor);
        _this.processor.connect(context.destination);
        _this.processor.onaudioprocess = streamAudioProcess;

        // Replace, rather than add to, the listener of the previous session.
        if (captureSession.responseSocket && captureSession.responseHandler) {
            captureSession.responseSocket.off('response', captureSession.responseHandler);
        }
        const onResponse = function (data, language) {
            if (language === "en-IN") data = data.toLowerCase();
            responseCallback(data);
        };
        captureSession.responseSocket = _this.socket;
        captureSession.responseHandler = onResponse;
        _this.socket.on('response', onResponse);
    } catch (e) {
        setStateOnMicStop();
        releaseCaptureSession();
        errorCallback(e);
    }
};

/**
 * Stops streaming, releases the microphone and hands back the recording.
 *
 * Two `mic_data` packets with a null payload are emitted when a socket exists
 * (the second one carries the end-of-stream flag). The 'response' listener is
 * left registered so that responses arriving after the stop are still delivered.
 *
 * @param {function(Blob): void} [callback] receives the session's audio as a WAV blob
 */
this.stopStreaming = (callback = () => {}) => {
    setStateOnMicStop();
    if (_this.socket) {
        _this.socket.emit('mic_data', null, _this.language, false, false);
        _this.socket.emit('mic_data', null, _this.language, false, true);
    }
    releaseCaptureSession();

    const finalBuffer = flattenArray(_this.audioData, _this.recordingLength);
    callback(generateWavBlob(finalBuffer));
};
// Connection flag reported by isSocketConnected(). It lives in a closure
// variable because the instance property `isSocketConnected` holds the
// accessor function itself; storing the flag under the same name would
// replace the accessor as soon as the socket connected.
let socketConnected = false;

/**
 * Opens the socket.io connection used for streaming.
 *
 * @param {string} socketURL server URL
 * @param {string} transcription_language language code sent as the `language`
 *   query parameter and with every audio packet
 * @param {function(*, string): void} [onSuccess] called with
 *   (SocketStatus.CONNECTED, userId) on 'connect-success' and with
 *   (SocketStatus.TERMINATED, userId) on 'terminate'
 * @param {function(string): void} [onError] called with a message on 'abort'
 */
this.connect = (socketURL, transcription_language, onSuccess = () => {}, onError = () => {}) => {
    // A socket left over from an earlier connect() would otherwise stay open.
    if (_this.socket) {
        _this.socket.disconnect();
    }
    socketConnected = false;

    _this.language = transcription_language;
    const socket = io(socketURL, {
        // path: '/',
        autoConnect: false,
        withCredentials: false,
        reconnectionAttempts: 5,
        query: `language=${_this.language}`,
        transports: ["websocket", "polling"]
    });
    _this.socket = socket;

    socket.on('connect', function () {
        _this.userId = socket.id;
        socket.emit('connect_mic_stream');
    });
    socket.on('connect-success', function () {
        // Update the flag first so the callback already observes the connected state.
        socketConnected = true;
        onSuccess(SocketStatus.CONNECTED, _this.userId);
    });
    socket.on('disconnect', function () {
        socketConnected = false;
    });
    socket.on('terminate', function () {
        onSuccess(SocketStatus.TERMINATED, _this.userId);
        _this.disconnect();
    });
    socket.on('abort', function () {
        onError("The server is busy at the moment, please try after sometime.");
    });

    // Listeners are attached before the connection attempt starts.
    socket.connect();
};

/**
 * Closes the socket, if one was created. Does nothing otherwise.
 */
this.disconnect = () => {
    if (_this.socket) {
        _this.socket.disconnect();
    }
};

/**
 * @returns {boolean} true between the server's 'connect-success' event and the
 *   next 'disconnect'
 */
this.isSocketConnected = () => socketConnected;
/**
 * Sends text to the punctuation service and reports the result via callbacks.
 *
 * Exactly one callback is invoked per request: `onSuccess` for a 2xx response
 * whose JSON body could be read, `onError` for everything else. An exception
 * thrown by `onSuccess` itself is not routed to `onError`.
 *
 * @param {string} textToPunctuate text to punctuate; must be non-empty
 * @param {string} punctuationUrl endpoint that receives the POST request
 * @param {function(number, string): void} [onSuccess] called with the HTTP
 *   status and the `text` field of the response body
 * @param {function(number|undefined, *): void} [onError] called with the HTTP
 *   status (undefined if no response arrived; 400 for empty text) and the error
 */
this.punctuateText = (textToPunctuate, punctuationUrl, onSuccess= ()=>{}, onError= ()=>{}) => {
    if (!textToPunctuate){
        onError(400, 'Text cannot be empty.');
        return;
    }
    let status;
    fetch(punctuationUrl, {
        method: 'POST',
        headers: {
            'Content-Type': 'application/json'
        },
        body: JSON.stringify({text: textToPunctuate, language: _this.language, enabledItn: true}),
    }).then((response) => {
        status = response.status;
        if (!response.ok) {
            // fetch() resolves for HTTP errors too; treat them as failures.
            throw new Error(`Punctuation request failed with HTTP status ${status}.`);
        }
        return response.json();
    }).then((body) => body["text"]).then(
        (text) => {
            onSuccess(status, text);
        },
        (error) => {
            onError(status, error);
        }
    );
}
}