UNPKG

sber-salute-speech-recognition

Version:

A library that produces audio transcriptions using the SBER Salute Speech service.

github.com/RaftDigiAI/sber-salute-speech-recognition

RaftDigiAI/sber-salute-speech-recognition

184 lines (183 loc) • 7.91 kB

JavaScript

"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.SberSaluteSpeechRecognitionService = void 0; const https_1 = require("https"); const music_metadata_1 = require("music-metadata"); const axios_1 = require("axios"); const qs = require("qs"); const fs = require("fs"); const constants_1 = require("./constants"); const uuid = require("uuid"); const enums_1 = require("./enums"); class SberSaluteSpeechRecognitionService { constructor(authKey, sessionId, scope = enums_1.Scope.Personal) { this.scope = scope; this.token = null; this.authKey = authKey; this.sessionId = sessionId || uuid.v4(); } updateAccessToken() { return __awaiter(this, void 0, void 0, function* () { const data = qs.stringify({ scope: this.scope, }); const response = yield (0, axios_1.default)({ url: constants_1.SPEECH_TOKEN_URL, method: 'POST', headers: { 'Content-Type': 'application/x-www-form-urlencoded', RqUID: this.sessionId, Authorization: `Basic ${this.authKey}`, }, httpsAgent: new https_1.Agent({ rejectUnauthorized: false, }), data, }); this.token = response.data; }); } getAccessToken() { return __awaiter(this, void 0, void 0, function* () { if (!this.token || this.token.expires_at < Date.now() - constants_1.MAX_WAIT_TIME) { yield this.updateAccessToken(); } if (!this.token) { throw new Error('Failed to get access token'); } return this.token; }); } uploadFileForRecognition(audioFilePath) { return __awaiter(this, void 0, void 0, function* () { const { access_token } = yield this.getAccessToken(); const audioFile = fs.createReadStream(audioFilePath); const response = yield axios_1.default.request({ method: 'post', maxBodyLength: Infinity, url: `${constants_1.SPEECH_BASE_URL}/data:upload`, headers: { Authorization: `Bearer ${access_token}`, 'Content-Type': 'audio/mpeg', }, data: audioFile, httpsAgent: new https_1.Agent({ rejectUnauthorized: false, }), }); return response.data; }); } startRecognition(uploadedFile, fileMetadata, encoding, channels_count, hints) { return __awaiter(this, void 0, void 0, function* () { const data = JSON.stringify({ options: { model: 'general', audio_encoding: encoding, sample_rate: fileMetadata.format.sampleRate, hints: hints, channels_count: channels_count ? channels_count : fileMetadata.format.numberOfChannels, }, request_file_id: uploadedFile.result.request_file_id, }); const { access_token } = yield this.getAccessToken(); const response = yield axios_1.default.request({ method: 'post', maxBodyLength: Infinity, url: `${constants_1.SPEECH_BASE_URL}/speech:async_recognize`, headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${access_token}`, }, data: data, httpsAgent: new https_1.Agent({ rejectUnauthorized: false, }), }); return response.data; }); } getRecognitionStatus(recognition) { return __awaiter(this, void 0, void 0, function* () { const { access_token } = yield this.getAccessToken(); const response = yield axios_1.default.request({ method: 'get', maxBodyLength: Infinity, url: `${constants_1.SPEECH_BASE_URL}/task:get?id=${recognition.result.id}`, headers: { Authorization: `Bearer ${access_token}`, }, httpsAgent: new https_1.Agent({ rejectUnauthorized: false, }), }); return response.data; }); } getRecognitionResult(recognition) { return __awaiter(this, void 0, void 0, function* () { const { access_token } = yield this.getAccessToken(); const response = yield axios_1.default.request({ method: 'get', maxBodyLength: Infinity, url: `${constants_1.SPEECH_BASE_URL}/data:download?response_file_id=${recognition.result.response_file_id}`, headers: { Authorization: `Bearer ${access_token}`, }, httpsAgent: new https_1.Agent({ rejectUnauthorized: false, }), }); return response.data; }); } delay(ms) { return __awaiter(this, void 0, void 0, function* () { return new Promise(resolve => setTimeout(resolve, ms)); }); } speechToText(audioPath, encoding, channels_count, hints) { return __awaiter(this, void 0, void 0, function* () { const metadata = yield (0, music_metadata_1.parseFile)(audioPath); const fileUploadResponse = yield this.uploadFileForRecognition(audioPath); const recognitionResponse = yield this.startRecognition(fileUploadResponse, metadata, encoding, channels_count, hints); const startTime = Date.now(); let recognitionStatus = yield this.getRecognitionStatus(recognitionResponse); while (recognitionStatus.result.status !== 'DONE') { if (Date.now() - startTime > constants_1.MAX_WAIT_TIME) { throw new Error('Recognition timeout'); } yield this.delay(constants_1.RECOGNITION_POLLING_DELAY); recognitionStatus = yield this.getRecognitionStatus(recognitionResponse); } const recognitionResult = yield this.getRecognitionResult(recognitionStatus); const text = recognitionResult .reduce((acc, item) => { return (acc + item.results.reduce((acc, item) => acc + ' ' + item.text, '')); }, '') .trim(); const normalizedText = recognitionResult .reduce((acc, item) => { return (acc + item.results.reduce((acc, item) => acc + ' ' + item.normalized_text, '')); }, '') .trim(); return { text, normalizedText, }; }); } } exports.SberSaluteSpeechRecognitionService = SberSaluteSpeechRecognitionService;