tts-narrator
Version:
Generate narration with Text-To-Speech technology
104 lines (103 loc) • 4.75 kB
JavaScript
;
// Mark the exports object with the `__esModule` flag.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the exported names as undefined; they receive their real values
// where TtsServiceType and BaseTtsService are defined further down the file.
exports.BaseTtsService = exports.TtsServiceType = void 0;
const fast_xml_parser_1 = require("fast-xml-parser");
/**
 * Identifiers of the supported TTS back ends.
 * The same object is published as `exports.TtsServiceType`.
 */
var TtsServiceType = { Azure: 'azure' };
exports.TtsServiceType = TtsServiceType;
/**
 * Escapes the five XML special characters so that arbitrary text can be
 * embedded safely as element content or inside an attribute value.
 *
 * Each of `<`, `>`, `&`, `'` and `"` is replaced by its predefined XML entity
 * reference; every other character is left untouched.
 *
 * @param {string} text - raw text to escape
 * @returns {string} the text with special characters replaced by entities
 */
function escapeXml(text) {
    // Predefined XML entities, keyed by the character they stand for.
    const entities = {
        '<': '&lt;',
        '>': '&gt;',
        '&': '&amp;',
        '\'': '&apos;',
        '"': '&quot;',
    };
    // The character class only matches keys of `entities`, so the lookup always hits.
    return text.replaceAll(/["&'<>]/g, (c) => entities[c]);
}
/**
 * Base class for text-to-speech services.
 *
 * It turns a paragraph (plain text or an SSML fragment plus voice settings)
 * into a complete `<speak>` document and validates the result as XML.
 * `generateAudio` is a stub that always throws.
 */
class BaseTtsService {
    /**
     * Builds the SSML document for a paragraph and validates it.
     *
     * @param paragraph - object with `text` and/or `ssml`, and `settings`
     * @returns {Promise<string>} the generated SSML
     * @throws {Error} if the content is empty, a closing tag is missing,
     *   or the markup does not validate
     */
    async generateSSML(paragraph) {
        const { ssml, lineOffset } = this.generateSsmlWithoutValidation(paragraph);
        this.validateXML(ssml, lineOffset);
        return ssml;
    }

    /**
     * Validates `xml` with fast-xml-parser's XMLValidator.
     *
     * @param {string} xml - markup to validate
     * @param {number} lineOffset - number of generated lines preceding the
     *   caller-supplied content; subtracted from the reported line number
     * @throws {Error} describing the first validation problem
     */
    validateXML(xml, lineOffset) {
        const outcome = fast_xml_parser_1.XMLValidator.validate(xml);
        if (outcome === true) {
            return;
        }
        const { line, code, msg } = outcome.err;
        throw new Error(`Invalid markup at line ${line - lineOffset}. ${code}: ${msg}`);
    }

    /**
     * Builds the opening `<speak>` tag.
     *
     * @param voiceSettings - settings; `language` falls back to 'en-US' when null/undefined
     * @returns {string} the opening tag
     */
    buildSpeakStartTag(voiceSettings) {
        const language = voiceSettings.language ?? 'en-US';
        return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${language}">`;
    }

    /**
     * Builds the opening `<voice>` tag from `voiceSettings.name`.
     *
     * @param voiceSettings - settings carrying the voice name
     * @returns {string} the opening tag
     */
    buildVoiceStartTag(voiceSettings) {
        const { name } = voiceSettings;
        return `<voice name="${name}">`;
    }

    /**
     * Builds the opening `<prosody>` tag. Falsy settings are omitted, but the
     * separating spaces are always emitted.
     *
     * @param prosodySettings - optional `pitch`, `rate` and `volume`
     * @returns {string} the opening tag
     */
    buildProsodyStartTag(prosodySettings) {
        const attribute = (key, value) => (value ? `${key}="${value}"` : '');
        const parts = [
            '<prosody',
            attribute('pitch', prosodySettings.pitch),
            attribute('rate', prosodySettings.rate),
            attribute('volume', prosodySettings.volume),
        ];
        return `${parts.join(' ')}>`;
    }

    /**
     * Builds the opening `<mstts:express-as>` tag. Falsy settings are omitted,
     * but the separating spaces are always emitted.
     *
     * @param msttsExpressAsSettings - optional `style`, `role` and `styleDegree`
     * @returns {string} the opening tag
     */
    buildMsttsExpressAsStartTag(msttsExpressAsSettings) {
        const attribute = (key, value) => (value ? `${key}="${value}"` : '');
        const parts = [
            '<mstts:express-as',
            attribute('style', msttsExpressAsSettings.style),
            attribute('role', msttsExpressAsSettings.role),
            attribute('styledegree', msttsExpressAsSettings.styleDegree),
        ];
        return `${parts.join(' ')}>`;
    }

    /**
     * Produces the SSML document for a paragraph without validating it.
     *
     * Three shapes of input are recognised:
     *  - `ssml` starting with `<speak`: returned as-is (line offset 0);
     *  - `ssml` starting with `<voice`: wrapped in a generated `<speak>` element;
     *  - anything else: wrapped in `<speak><voice>` plus optional `<prosody>` and
     *    `<mstts:express-as>` elements; `ssml` is used verbatim if present,
     *    otherwise `text` is XML-escaped.
     *
     * @param paragraph - object with `text` and/or `ssml`, and `settings`
     * @returns {{lineOffset: number, ssml: string}} the document and the number
     *   of generated lines preceding the caller's content
     * @throws {Error} if both `text` and `ssml` are empty, or a `<speak>`/`<voice>`
     *   fragment lacks its closing tag
     */
    generateSsmlWithoutValidation(paragraph) {
        const plainText = paragraph?.text?.trim();
        const markup = paragraph?.ssml?.trim();
        const hasMarkup = Boolean(markup);

        if (!plainText && !hasMarkup) {
            throw new Error('Empty content?');
        }

        // Case 1: the caller supplied a complete SSML document.
        if (hasMarkup && markup.startsWith('<speak')) {
            if (markup.endsWith('</speak>')) {
                return { lineOffset: 0, ssml: markup };
            }
            throw new Error('Forgot to end the text with "</speak>"?');
        }

        const settings = paragraph.settings;
        const speakOpen = this.buildSpeakStartTag(settings);
        const speakClose = '</speak>';

        // Case 2: the caller supplied a <voice>...</voice> fragment.
        if (hasMarkup && markup.startsWith('<voice')) {
            if (markup.endsWith('</voice>')) {
                return { lineOffset: 1, ssml: `${speakOpen}\n${markup}\n${speakClose}` };
            }
            throw new Error('Forgot to end the text with "</voice>"?');
        }

        // Case 3: plain text, or a fragment made of other tags.
        const voiceOpen = this.buildVoiceStartTag(settings);
        const voiceClose = '</voice>';

        const prosodyOpen = settings.prosody ? this.buildProsodyStartTag(settings.prosody) : '';
        const prosodyClose = settings.prosody ? '</prosody>' : '';

        const expressAsOpen = settings.msttsExpressAs
            ? this.buildMsttsExpressAsStartTag(settings.msttsExpressAs)
            : '';
        const expressAsClose = settings.msttsExpressAs ? '</mstts:express-as>' : '';

        const header = `${speakOpen}${voiceOpen}${prosodyOpen}${expressAsOpen}`;
        const footer = `${expressAsClose}${prosodyClose}${voiceClose}${speakClose}`;
        // Markup is trusted as written; only plain text gets escaped.
        const content = markup || escapeXml(plainText);
        return { lineOffset: 1, ssml: `${header}\n${content}\n${footer}` };
    }

    /**
     * Placeholder for audio generation; this base implementation always throws.
     *
     * @param _ssml - unused here
     * @param _options - unused here
     * @throws {Error} always
     */
    generateAudio(_ssml, _options) {
        throw new Error('generateAudio(...) is not implemented.');
    }
}
// Publish the class on the module's exports object.
exports.BaseTtsService = BaseTtsService;