/*
 * sber-salute-speech-recognition
 * Version: (not specified)
 * A library that produces audio transcriptions using the SBER Salute Speech service.
 * 184 lines (183 loc) • 7.91 kB
 * JavaScript
 */
;
/**
 * Runs a generator function as an asynchronous routine, reusing an existing
 * `__awaiter` on `this` when one is present.
 *
 * Each value the generator yields is awaited: its fulfilment value is sent back
 * into the generator with `next`, its rejection reason is raised inside the
 * generator with `throw`. The returned promise settles with the generator's
 * return value, or rejects with any error that escapes the generator.
 *
 * @param {*} thisArg - `this` binding used when invoking the generator function.
 * @param {ArrayLike<*>} [_arguments] - Arguments for the generator function.
 * @param {PromiseConstructor} [P] - Promise implementation; defaults to `Promise`.
 * @param {GeneratorFunction} generator - Generator function to drive.
 * @returns {Promise<*>} Promise for the generator's final result.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Yielded values that are not already instances of P get wrapped so `.then` is available.
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((resolve) => {
            resolve(value);
        });
    };
    return new (P || (P = Promise))((resolve, reject) => {
        // Either finish with the generator's return value or wait for the yielded one.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
            }
            else {
                toPromise(result.value).then(onFulfilled, onRejected);
            }
        };
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (error) {
                reject(error);
            }
        };
        const onRejected = (reason) => {
            try {
                advance(generator["throw"](reason));
            }
            catch (error) {
                reject(error);
            }
        };
        // From here on `generator` refers to the iterator, not the generator function.
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
// Flag this CommonJS module with `__esModule`, the marker set on modules transpiled from ES module syntax.
Object.defineProperty(exports, "__esModule", { value: true });
// Declare the export up front; the class itself is assigned to it after its definition further down.
exports.SberSaluteSpeechRecognitionService = void 0;
const https_1 = require("https");
const music_metadata_1 = require("music-metadata");
const axios_1 = require("axios");
const qs = require("qs");
const fs = require("fs");
const constants_1 = require("./constants");
const uuid = require("uuid");
const enums_1 = require("./enums");
/**
 * Client for the asynchronous speech-recognition flow of the SBER Salute Speech
 * REST API: obtain an access token, upload an audio file, start a recognition
 * task, poll it until it finishes and download the transcription.
 */
class SberSaluteSpeechRecognitionService {
    /**
     * @param {string} authKey - Credentials sent as `Authorization: Basic <authKey>` when requesting a token.
     * @param {string} [sessionId] - Value for the `RqUID` header; a random UUID v4 is generated when omitted.
     * @param {string} [scope] - API scope sent with the token request; defaults to `Scope.Personal`.
     */
    constructor(authKey, sessionId, scope = enums_1.Scope.Personal) {
        this.scope = scope;
        this.token = null;
        this.authKey = authKey;
        this.sessionId = sessionId || uuid.v4();
    }
    /**
     * Builds the HTTPS agent used for every request to the service.
     *
     * NOTE(review): `rejectUnauthorized: false` turns off TLS certificate
     * verification, so the auth key and bearer token can be intercepted by a
     * man-in-the-middle. It is kept to preserve existing behaviour; presumably
     * the service certificate chain is not in Node's default trust store —
     * confirm, and prefer passing the proper CA bundle via the `ca` option.
     *
     * @returns {import('https').Agent} A new agent instance.
     */
    createHttpsAgent() {
        return new https_1.Agent({
            rejectUnauthorized: false,
        });
    }
    /**
     * Requests a new access token and stores the response body in `this.token`.
     *
     * @returns {Promise<void>}
     */
    updateAccessToken() {
        return __awaiter(this, void 0, void 0, function* () {
            const response = yield (0, axios_1.default)({
                url: constants_1.SPEECH_TOKEN_URL,
                method: 'POST',
                headers: {
                    'Content-Type': 'application/x-www-form-urlencoded',
                    RqUID: this.sessionId,
                    Authorization: `Basic ${this.authKey}`,
                },
                httpsAgent: this.createHttpsAgent(),
                data: qs.stringify({ scope: this.scope }),
            });
            this.token = response.data;
        });
    }
    /**
     * Returns the cached token, refreshing it first when it is missing, already
     * expired, or about to expire.
     *
     * `expires_at` is compared against `Date.now()`, i.e. it is treated as a
     * Unix timestamp in milliseconds.
     *
     * @returns {Promise<object>} Token object as returned by the token endpoint.
     * @throws {Error} When no token is available after the refresh attempt.
     */
    getAccessToken() {
        return __awaiter(this, void 0, void 0, function* () {
            // Refresh slightly ahead of the deadline so a token does not lapse
            // between this check and the request that uses it.
            const refreshMarginMs = 60 * 1000;
            const needsRefresh = !this.token ||
                this.token.expires_at - refreshMarginMs <= Date.now();
            if (needsRefresh) {
                yield this.updateAccessToken();
            }
            if (!this.token) {
                throw new Error('Failed to get access token');
            }
            return this.token;
        });
    }
    /**
     * Uploads an audio file so that it can be referenced by a recognition task.
     *
     * @param {string} audioFilePath - Path of the audio file to stream to the service.
     * @returns {Promise<object>} Response body of the upload endpoint.
     */
    uploadFileForRecognition(audioFilePath) {
        return __awaiter(this, void 0, void 0, function* () {
            const { access_token } = yield this.getAccessToken();
            // The stream is opened only once the token is known and is handed to
            // the HTTP client immediately, exactly as before.
            const audioFile = fs.createReadStream(audioFilePath);
            try {
                const response = yield axios_1.default.request({
                    method: 'post',
                    maxBodyLength: Infinity,
                    url: `${constants_1.SPEECH_BASE_URL}/data:upload`,
                    headers: {
                        Authorization: `Bearer ${access_token}`,
                        'Content-Type': 'audio/mpeg',
                    },
                    data: audioFile,
                    httpsAgent: this.createHttpsAgent(),
                });
                return response.data;
            }
            finally {
                // Release the file descriptor even when the request fails part-way.
                audioFile.destroy();
            }
        });
    }
    /**
     * Starts an asynchronous recognition task for a previously uploaded file.
     *
     * @param {object} uploadedFile - Upload response; `result.request_file_id` is used.
     * @param {object} fileMetadata - Parsed audio metadata; `format.sampleRate` and
     *   `format.numberOfChannels` are used.
     * @param {string} encoding - Value sent as `audio_encoding`.
     * @param {number} [channels_count] - Overrides the channel count from the metadata when truthy.
     * @param {object} [hints] - Passed through unchanged as the `hints` option.
     * @returns {Promise<object>} Response body describing the created task.
     */
    startRecognition(uploadedFile, fileMetadata, encoding, channels_count, hints) {
        return __awaiter(this, void 0, void 0, function* () {
            const payload = JSON.stringify({
                options: {
                    model: 'general',
                    audio_encoding: encoding,
                    sample_rate: fileMetadata.format.sampleRate,
                    hints: hints,
                    channels_count: channels_count || fileMetadata.format.numberOfChannels,
                },
                request_file_id: uploadedFile.result.request_file_id,
            });
            const { access_token } = yield this.getAccessToken();
            const response = yield axios_1.default.request({
                method: 'post',
                maxBodyLength: Infinity,
                url: `${constants_1.SPEECH_BASE_URL}/speech:async_recognize`,
                headers: {
                    'Content-Type': 'application/json',
                    Authorization: `Bearer ${access_token}`,
                },
                data: payload,
                httpsAgent: this.createHttpsAgent(),
            });
            return response.data;
        });
    }
    /**
     * Fetches the current state of a recognition task.
     *
     * @param {object} recognition - Task response; `result.id` identifies the task.
     * @returns {Promise<object>} Response body of the task status endpoint.
     */
    getRecognitionStatus(recognition) {
        return __awaiter(this, void 0, void 0, function* () {
            const { access_token } = yield this.getAccessToken();
            const response = yield axios_1.default.request({
                method: 'get',
                maxBodyLength: Infinity,
                url: `${constants_1.SPEECH_BASE_URL}/task:get?id=${recognition.result.id}`,
                headers: {
                    Authorization: `Bearer ${access_token}`,
                },
                httpsAgent: this.createHttpsAgent(),
            });
            return response.data;
        });
    }
    /**
     * Downloads the result file of a finished recognition task.
     *
     * @param {object} recognition - Task status response; `result.response_file_id` is used.
     * @returns {Promise<*>} Response body of the download endpoint.
     */
    getRecognitionResult(recognition) {
        return __awaiter(this, void 0, void 0, function* () {
            const { access_token } = yield this.getAccessToken();
            const response = yield axios_1.default.request({
                method: 'get',
                maxBodyLength: Infinity,
                url: `${constants_1.SPEECH_BASE_URL}/data:download?response_file_id=${recognition.result.response_file_id}`,
                headers: {
                    Authorization: `Bearer ${access_token}`,
                },
                httpsAgent: this.createHttpsAgent(),
            });
            return response.data;
        });
    }
    /**
     * Resolves after the given number of milliseconds.
     *
     * @param {number} ms - Time to wait.
     * @returns {Promise<void>}
     */
    delay(ms) {
        return new Promise((resolve) => setTimeout(resolve, ms));
    }
    /**
     * Concatenates one field of every entry in every `results` list of the
     * downloaded recognition result, separated by single spaces.
     *
     * Entries where the field is `null` or `undefined` are skipped so that the
     * literal words "undefined"/"null" never end up in a transcript.
     *
     * @param {Array<{results: Array<object>}>} recognitionResult - Downloaded result items.
     * @param {string} field - Name of the field to collect (e.g. `text`).
     * @returns {string} Trimmed, space-separated text.
     */
    joinResultField(recognitionResult, field) {
        const pieces = [];
        for (const item of recognitionResult) {
            for (const entry of item.results) {
                const value = entry[field];
                if (value !== undefined && value !== null) {
                    pieces.push(value);
                }
            }
        }
        return pieces.join(' ').trim();
    }
    /**
     * Transcribes an audio file: reads its metadata, uploads it, starts a
     * recognition task, polls until the task is done and assembles the text.
     *
     * @param {string} audioPath - Path of the audio file.
     * @param {string} encoding - Audio encoding name forwarded to the service.
     * @param {number} [channels_count] - Channel count override.
     * @param {object} [hints] - Recognition hints forwarded to the service.
     * @returns {Promise<{text: string, normalizedText: string}>} Raw and normalized transcript.
     * @throws {Error} `Recognition timeout` when the task is not done within `MAX_WAIT_TIME`,
     *   or a failure error as soon as the task reports status `ERROR` or `CANCELED`.
     */
    speechToText(audioPath, encoding, channels_count, hints) {
        return __awaiter(this, void 0, void 0, function* () {
            const metadata = yield (0, music_metadata_1.parseFile)(audioPath);
            const uploadedFile = yield this.uploadFileForRecognition(audioPath);
            const task = yield this.startRecognition(uploadedFile, metadata, encoding, channels_count, hints);
            const startedAt = Date.now();
            let taskState = yield this.getRecognitionStatus(task);
            while (taskState.result.status !== 'DONE') {
                const status = taskState.result.status;
                // A failed or cancelled task never becomes DONE; stop polling right away
                // instead of waiting for the timeout.
                if (status === 'ERROR' || status === 'CANCELED') {
                    throw new Error(`Recognition failed with status ${status}`);
                }
                if (Date.now() - startedAt > constants_1.MAX_WAIT_TIME) {
                    throw new Error('Recognition timeout');
                }
                yield this.delay(constants_1.RECOGNITION_POLLING_DELAY);
                taskState = yield this.getRecognitionStatus(task);
            }
            const recognitionResult = yield this.getRecognitionResult(taskState);
            return {
                text: this.joinResultField(recognitionResult, 'text'),
                normalizedText: this.joinResultField(recognitionResult, 'normalized_text'),
            };
        });
    }
}
// Publish the service class as this module's named export.
exports.SberSaluteSpeechRecognitionService = SberSaluteSpeechRecognitionService;