// extra-amazontts
// Version:
// Generate speech audio from super long text, via Amazon Polly and ffmpeg.
// 362 lines (315 loc) • 13.1 kB
// JavaScript
const cp = require('child_process');
const fs = require('fs');
const path = require('path');
const _ = require('lodash');
const boolean = require('boolean').boolean;
const AWS = require('aws-sdk');
const musicMetadata = require('music-metadata');
const awsconfig = require('extra-awsconfig');
var getStdin = null;
var tempy = null;
// Global variables
const E = process.env;
// Descriptors 0, 1, 2: passed as the `stdio` option when spawning child processes.
const STDIO = [0, 1, 2];
// Default options; every value can be overridden through a TTS_* environment variable.
const OPTIONS = (() => {
  // An unset (or empty) variable falls back to the supplied default.
  const str = (key, fallback) => E[key] || fallback;
  const int = (key, fallback) => parseInt(str(key, fallback), 10);
  const num = (key, fallback) => parseFloat(str(key, fallback));
  return {
    log: boolean(str('TTS_LOG', '0')),
    output: str('TTS_OUTPUT', 'out.mp3'),
    text: str('TTS_TEXT', null),
    retries: int('TTS_RETRIES', '8'),
    acodec: str('TTS_ACODEC', 'copy'),
    audio: {
      encoding: str('TTS_AUDIO_ENCODING', null),
      frequency: int('TTS_AUDIO_FREQUENCY', '0')
    },
    language: {
      code: str('TTS_LANGUAGE_CODE', null),
      // Comma-separated list; empty entries are discarded.
      lexicons: str('TTS_LANGUAGE_LEXICONS', '').split(',').filter(Boolean)
    },
    voice: {
      name: str('TTS_VOICE_NAME', null),
      gender: str('TTS_VOICE_GENDER', 'neutral')
    },
    quote: {
      break: num('TTS_QUOTE_BREAK', '250'),
      emphasis: str('TTS_QUOTE_EMPHASIS', 'moderate')
    },
    heading: {
      break: num('TTS_HEADING_BREAK', '4000'),
      difference: num('TTS_HEADING_DIFFERENCE', '250'),
      emphasis: str('TTS_HEADING_EMPHASIS', 'strong')
    },
    ellipsis: {
      break: num('TTS_ELLIPSIS_BREAK', '1500')
    },
    dash: {
      break: num('TTS_DASH_BREAK', '500')
    },
    newline: {
      break: num('TTS_NEWLINE_BREAK', '1000')
    },
    block: {
      separator: str('TTS_BLOCK_SEPARATOR', '.'),
      length: num('TTS_BLOCK_LENGTH', '2000')
    },
    config: null
  };
})();
// Default Amazon Polly voice per language code: `f` is the female voice, `m` the male voice,
// and null marks a gender for which no voice is listed here.
const VOICE = new Map([
  ['cmn-CN', {f: 'Zhiyu', m: null}],
  ['da-DK', {f: 'Naja', m: 'Mads'}],
  ['nl-NL', {f: 'Lotte', m: 'Ruben'}],
  ['en-AU', {f: 'Nicole', m: 'Russell'}],
  ['en-GB', {f: 'Amy', m: 'Brian'}],
  ['en-IN', {f: 'Aditi', m: null}],
  ['en-US', {f: 'Ivy', m: 'Joey'}],
  ['en-GB-WLS', {f: null, m: 'Geraint'}],
  ['fr-FR', {f: 'Celine', m: 'Mathieu'}],
  ['fr-CA', {f: 'Chantal', m: null}],
  ['de-DE', {f: 'Marlene', m: 'Hans'}],
  ['hi-IN', {f: 'Aditi', m: null}],
  ['is-IS', {f: 'Dora', m: 'Karl'}],
  ['it-IT', {f: 'Carla', m: 'Giorgio'}],
  ['ja-JP', {f: 'Mizuki', m: 'Takumi'}],
  ['ko-KR', {f: 'Seoyeon', m: null}],
  ['nb-NO', {f: 'Liv', m: null}],
  ['pl-PL', {f: 'Ewa', m: 'Jacek'}],
  ['pt-BR', {f: 'Vitoria', m: 'Ricardo'}],
  ['pt-PT', {f: 'Ines', m: 'Cristiano'}],
  ['ro-RO', {f: 'Carmen', m: null}],
  // The Russian male voice is Maxim; Tatyana is the female voice.
  ['ru-RU', {f: 'Tatyana', m: 'Maxim'}],
  ['es-ES', {f: 'Conchita', m: 'Enrique'}],
  ['es-MX', {f: 'Mia', m: null}],
  ['es-US', {f: 'Penelope', m: 'Miguel'}],
  ['sv-SE', {f: 'Astrid', m: null}],
  ['tr-TR', {f: 'Filiz', m: null}],
  ['cy-GB', {f: 'Gwyneth', m: null}]
]);
// Do-nothing callback that always returns 0; passed where a callback is required but its result is ignored.
const FN_NOP = function () {
  return 0;
};
// Loads the ES-module-only dependencies through dynamic import().
// Stores the default exports of 'get-stdin' and 'tempy' in the module-level
// `getStdin` and `tempy` variables; does nothing once `getStdin` has been set.
async function importDependencies() {
  const alreadyLoaded = getStdin != null;
  if (alreadyLoaded) return;
  const [stdinModule, tempyModule] = await Promise.all([
    import('get-stdin'),
    import('tempy')
  ]);
  getStdin = stdinModule.default;
  tempy = tempyModule.default;
}
/**
 * Formats a duration as HH:MM:SS, each field zero-padded to at least two digits.
 * @param {number} t duration in seconds (fractions are truncated per field).
 * @returns {string} formatted time.
 */
function timeFormat(t) {
  const field = (value) => Math.floor(value).toString().padStart(2, '0');
  const hours = field(t / 3600);
  const minutes = field((t % 3600) / 60);
  const seconds = field(t % 60);
  return [hours, minutes, seconds].join(':');
}
/**
 * Strips the extension (as reported by path.extname) from a path; directories are kept.
 * @param {string} pth file path.
 * @returns {string} the path without its trailing extension.
 */
function pathFilename(pth) {
  const extension = path.extname(pth);
  if (extension.length === 0) return pth;
  return pth.slice(0, -extension.length);
}
/**
 * Promise wrapper around fs.writeFile.
 * @param {string} pth destination path.
 * @param {string|Buffer} dat data to write.
 * @param {object|string} [o] options forwarded to fs.writeFile.
 * @returns {Promise<void>} resolves once written, rejects with the fs error.
 */
function fsWriteFile(pth, dat, o) {
  return new Promise((resolve, reject) => {
    fs.writeFile(pth, dat, o, (err) => {
      if (err) reject(err);
      else resolve();
    });
  });
}
/**
 * Executes a shell command and always reports the outcome through a promise.
 * When `o.stdio` is not given the command runs synchronously (cp.execSync);
 * otherwise it runs through cp.exec.
 * @param {string} cmd command line to run.
 * @param {object} [o] options: `log` (print the command and, with no `stdio`, use STDIO), `stdio`.
 * @returns {Promise<{stdout: *, stderr?: *}>} command output; rejects if the command fails.
 */
function cpExec(cmd, o) {
  var opt = o||{};
  var stdio = opt.stdio||(opt.log? STDIO:[]);
  if(opt.log) console.log('-cpExec:', cmd);
  if(opt.stdio==null) {
    // execSync throws on failure; convert that into a rejection so callers
    // get a promise in every case, as the function's contract states.
    try { return Promise.resolve({stdout: cp.execSync(cmd, {stdio})}); }
    catch(err) { return Promise.reject(err); }
  }
  return new Promise((fres, frej) => cp.exec(cmd, {stdio}, (err, stdout, stderr) => {
    return err? frej(err):fres({stdout, stderr});
  }));
}
/**
 * Builds the parameter object for Polly's synthesizeSpeech call.
 * @param {string} out output file; its extension is the fallback audio encoding.
 * @param {string} txt text placed in the `Text` parameter.
 * @param {object} o options; reads `audio`, `language` and `voice`.
 * @returns {object} parameters (LexiconNames, OutputFormat, SampleRate, Text, TextType, VoiceId, LanguageCode).
 */
function pollyParams(out, txt, o) {
  var ae = (o.audio.encoding||path.extname(out).substring(1)).toLowerCase();
  // Only the bare "ogg" alias is expanded; an explicit "ogg_vorbis" must stay untouched.
  if(ae==='ogg') ae = 'ogg_vorbis';
  var af = o.audio.frequency? o.audio.frequency.toString():null;
  // Gender is matched on its first letter; anything else (e.g. "neutral") means no preference.
  var vg = /^f/i.test(o.voice.gender)? 'f':(/^m/i.test(o.voice.gender)? 'm':null);
  var v = VOICE.get(o.language.code)||{};
  // Explicit voice name wins; otherwise prefer the requested gender, then the other, then a fixed default.
  var vn = o.voice.name||(vg==='m'? v.m||v.f||'Joey':v.f||v.m||'Joanna');
  return {LexiconNames: o.language.lexicons, OutputFormat: ae, SampleRate: af, Text: txt,
    TextType: 'ssml', VoiceId: vn, LanguageCode: o.language.code};
}
/**
 * Converts plain text to an SSML document wrapped in <speak>.
 * Ampersands become " and ", quoted text and "== Heading ==" lines get
 * emphasis with surrounding breaks, and ellipses, em dashes and newlines get breaks.
 * NOTE: '<' and '>' in the input are passed through unescaped.
 * @param {string} txt input text.
 * @param {object} o options; reads `quote`, `heading`, `ellipsis`, `dash`, `newline`.
 * @returns {string} SSML text.
 */
function textSsml(txt, o) {
  var q = o.quote, h = o.heading, e = o.ellipsis, d = o.dash, n = o.newline;
  txt = txt.replace(/\s*&\s*/g, ' and ');
  txt = txt.replace(/\"(.*?)\"/gm, (m, p1) => {
    var brk = `<break time="${q.break}ms"/>`;
    var emp = `<emphasis level="${q.emphasis}">"${p1}"</emphasis>`;
    return brk+emp+brk;
  });
  txt = txt.replace(/(=+)\s(.*?)\s\1/g, (m, p1, p2) => {
    // Each extra '=' shortens the pause; clamp at 0 so long runs never yield a negative duration.
    var ms = Math.max(0, h.break-p1.length*h.difference);
    var brk = `<break time="${ms}ms"/>`;
    var emp = `<emphasis level="${h.emphasis}">${p2}</emphasis>`;
    return brk+'Topic '+emp+brk;
  });
  // txt = txt.replace(/\((.*?)\)/gm, '<emphasis level="reduced">($1)</emphasis>');
  // txt = txt.replace(/\[(.*?)\]/gm, '<emphasis level="reduced">[$1]</emphasis>');
  txt = txt.replace(/\.\.\./g, `<break time="${e.break}ms"/>...`);
  txt = txt.replace(/\—/g, `<break time="${d.break}ms"/>—`);
  txt = txt.replace(/(\r?\n)+/gm, `<break time="${n.break}ms"/>\n`);
  return `<speak>${txt}</speak>`;
}
/**
 * Cuts one SSML block off the front of a long text.
 * Starting from 75% of `block.length`, the cut point shrinks by 25% each round
 * until the SSML of the prefix is shorter than `block.length`. The cut is made
 * just after the last separator at or before the limit, or at the limit itself
 * when there is none.
 * @param {string} txt remaining input text.
 * @param {object} o options; reads `block.length` and `block.separator`.
 * @returns {[string, string]} the SSML block and the text left over.
 */
function textSsmlBlock(txt, o) {
  var b = o.block;
  var end = b.length, cut = 0, ssml = '';
  for(;;) {
    end = Math.floor(0.75*end);
    if(!(end>=1)) {
      // The limit cannot shrink further (tiny or NaN block length): take a single
      // character so the search terminates and the caller always makes progress.
      cut = Math.min(txt.length, 1);
      ssml = textSsml(txt.substring(0, cut), o);
      break;
    }
    var at = txt.lastIndexOf(b.separator, end)+1;
    cut = at>0? at:Math.min(txt.length, end);
    ssml = textSsml(txt.substring(0, cut), o);
    if(ssml.length<b.length) break;
  }
  return [ssml, txt.substring(cut)];
}
/**
 * Splits text into sections at "== Title ==" style headings.
 * The text before the first heading forms a section with a null title; every
 * other section starts at its heading, so the heading is part of its content.
 * @param {string} txt input text.
 * @returns {Array<{title: (string|null), content: string}>} sections in order.
 */
function textSections(txt) {
  const heading = /(=+)\s+(.*?)\s+\1/g;
  const sections = [];
  let start = 0;
  let title = null;
  let match = heading.exec(txt);
  while (match != null) {
    sections.push({title, content: txt.substring(start, match.index)});
    start = match.index;
    title = match[2];
    match = heading.exec(txt);
  }
  sections.push({title, content: txt.substring(start)});
  return sections;
}
/**
 * Synthesizes one SSML block and writes the returned audio stream to a file.
 * @param {string} out output audio file.
 * @param {string} ssml SSML text to synthesize.
 * @param {object} tts client exposing synthesizeSpeech(params, callback).
 * @param {object} o options; reads `log` and `params` (the base request parameters).
 * @returns {Promise<string>} resolves to `out`; rejects on a synthesis or write error.
 */
function audiosWrite(out, ssml, tts, o) {
  var l = o.log;
  // Build a private request object: `o.params` is shared by all the requests
  // started concurrently, so writing `Text` into it would let them overwrite each other.
  var req = Object.assign({}, o.params, {Text: ssml});
  return new Promise((fres, frej) => {
    tts.synthesizeSpeech(req, (err, res) => {
      if(err) return frej(err);
      fs.writeFile(out, res.AudioStream, 'binary', (err) => {
        if(l) console.log('-audiosWrite:', out);
        if(err) return frej(err);
        fres(out);
      });
    });
  });
}
/**
 * Calls audiosWrite, trying again after each failure.
 * Makes up to `o.retries` attempts (8 when that is unset or 0). If every attempt
 * fails, the SSML is printed and the last error is thrown.
 * @param {string} out output audio file.
 * @param {string} ssml SSML text to synthesize.
 * @param {object} tts speech client handed to audiosWrite.
 * @param {object} o options; reads `retries`.
 * @returns {Promise<string>} the result of the first successful audiosWrite.
 */
async function audiosRetryWrite(out, ssml, tts, o) {
  const attempts = o.retries||8;
  let lastError = null;
  let attempt = 0;
  while (attempt < attempts) {
    try {
      return await audiosWrite(out, ssml, tts, o);
    } catch (failure) {
      lastError = failure;
    }
    attempt += 1;
  }
  console.log(ssml);
  throw lastError;
}
/**
 * Splits text into consecutive SSML blocks by repeatedly calling textSsmlBlock
 * until no text is left.
 * @param {string} txt input text.
 * @param {object} o options forwarded to textSsmlBlock.
 * @returns {string[]} SSML blocks in order (empty when `txt` is empty).
 */
function outputSsmls(txt, o) {
  const blocks = [];
  let remaining = txt;
  while (remaining) {
    const [ssml, tail] = textSsmlBlock(remaining, o);
    blocks.push(ssml);
    remaining = tail;
  }
  return blocks;
}
/**
 * Starts synthesis of every SSML block at once, one part file per block.
 * Part files are named after `out` with the block index inserted before the extension.
 * @param {string} out base path for the part files.
 * @param {string[]} ssmls SSML blocks.
 * @param {object} tts speech client handed to audiosRetryWrite.
 * @param {object} o options; reads `log`.
 * @returns {Promise<string[]>} part file paths, in block order.
 */
function outputAudios(out, ssmls, tts, o) {
  if (o.log) console.log('-outputAudios:', out, ssmls.length);
  const stem = pathFilename(out);
  const extension = path.extname(out);
  const writes = ssmls.map((ssml, index) => {
    return audiosRetryWrite(`${stem}.${index}${extension}`, ssml, tts, o);
  });
  return Promise.all(writes);
}
/**
 * Reads the duration of each audio part file through musicMetadata.parseFile.
 * @param {string[]} auds audio file paths.
 * @returns {Promise<number[]>} `format.duration` of each file, in the same order.
 */
function outputDurations(auds) {
  const lookups = [];
  for (const file of auds) {
    const duration = musicMetadata.parseFile(file).then((meta) => meta.format.duration);
    lookups.push(duration);
  }
  return Promise.all(lookups);
}
/**
 * Concatenates the audio part files into one output file with ffmpeg's concat demuxer.
 * A temporary list file naming the parts is written, passed to ffmpeg, and removed
 * afterwards whether or not ffmpeg succeeded.
 * @param {string} out output audio file.
 * @param {string[]} auds audio part files, in playback order.
 * @param {object} o options; reads `log` and `acodec`, and is forwarded to cpExec.
 * @returns {Promise<object>} the result of cpExec for the ffmpeg command.
 */
async function outputAudio(out, auds, o) {
  if(o.log) console.log('-outputAudio:', out, auds.length);
  var lst = tempy.file({extension: 'txt'});
  // Paths are single-quoted in the list file; a quote inside a path has to be
  // written as '\'' (close quote, escaped quote, reopen quote).
  var dat = auds.map(aud => `file '${aud.replace(/'/g, "'\\''")}'\n`).join('');
  try {
    await fsWriteFile(lst, dat);
    return await cpExec(`ffmpeg -y -safe 0 -f concat -i "${lst}" -acodec ${o.acodec} "${out}"`, o);
  }
  finally {
    // Best-effort cleanup: also runs when writing the list or ffmpeg fails.
    fs.unlink(lst, FN_NOP);
  }
}
/**
 * Generate speech audio from super long text through machine, via Amazon Polly, ffmpeg.
 * The text is split into sections at "== Title ==" headings, each section into SSML
 * blocks, every block is synthesized to a temporary part file, and the parts are
 * concatenated into `out`.
 * @param {string} out output audio file.
 * @param {string} txt input text.
 * @param {object} o options (merged over the defaults in OPTIONS).
 * @returns {Promise<Array<{title: (string|null), time: string}>>} table of contents:
 * one entry per section with its start time as HH:MM:SS.
 */
async function amazontts(out, txt, o) {
  await importDependencies();
  o = _.merge({}, OPTIONS, o);
  if(o.log) console.log('@amazontts:', out, txt);
  o.params = o.params||pollyParams(out, null, o);
  var tts = new AWS.Polly(awsconfig(o.config));
  var ext = path.extname(out);
  var aud = tempy.file({extension: ext.substring(1)});
  // prts[i] records how many SSML blocks section i produced, to map durations back to sections.
  var secs = textSections('\n'+txt), prts = [], ssmls = [];
  for(var sec of secs) {
    var secSsmls = outputSsmls(sec.content, o);
    prts.push(secSsmls.length);
    Array.prototype.push.apply(ssmls, secSsmls);
  }
  var auds = await outputAudios(aud, ssmls, tts, o);
  var durs = null;
  try {
    await outputAudio(out, auds, o);
    durs = await outputDurations(auds);
  }
  finally {
    // Remove the part files even when concatenation or duration probing fails.
    for(var f of auds) fs.unlink(f, FN_NOP);
  }
  // A section starts at the summed duration of all parts that precede it.
  for(var i=0, j=0, t=0, toc=[], I=secs.length; i<I; i++) {
    toc[i] = {title: secs[i].title, time: timeFormat(t)};
    for(var p=0; p<prts[i]; p++)
      t += durs[j++];
  }
  if(o.log) console.log(' .toc:', toc);
  return toc;
}
/**
 * Parses one command-line argument into the options object.
 * Accepts both "--key value" and "--key=value"; an unrecognized argument is stored in `o.argv`.
 * @param {object} o options object to update (gets a `config` object if it has none).
 * @param {string} k argument being parsed.
 * @param {string[]} a full argument list.
 * @param {number} i index of `k` in `a`.
 * @returns {number} index of the next argument to parse.
 */
function options(o, k, a, i) {
  // By default a flag is true and a value is taken from the next argument;
  // with "key=value" both come from the text after '='.
  var e = k.indexOf('='), v = null, bool = () => true, str = () => a[++i];
  if(e>=0) { v = k.substring(e+1); bool = () => boolean(v); str = () => v; k = k.substring(0, e); }
  o.config = o.config||{};
  if(k==='--help') o.help = bool();
  else if(k==='-l' || k==='--log') o.log = bool();
  else if(k==='-o' || k==='--output') o.output = str();
  else if(k==='-t' || k==='--text') o.text = str();
  else if(k==='-r' || k==='--retries') o.retries = parseInt(str(), 10);
  else if(k==='-a' || k==='--acodec') _.set(o, 'acodec', str());
  else if(k==='-ae' || k==='--audio_encoding') _.set(o, 'audio.encoding', str());
  else if(k==='-af' || k==='--audio_frequency') _.set(o, 'audio.frequency', parseInt(str(), 10));
  else if(k==='-lc' || k==='--language_code') _.set(o, 'language.code', str());
  // Lexicons are a comma-separated list and are stored as an array, like the TTS_LANGUAGE_LEXICONS default.
  else if(k==='-ll' || k==='--language_lexicons') _.set(o, 'language.lexicons', (str()||'').split(',').filter(s => !!s));
  else if(k==='-vn' || k==='--voice_name') _.set(o, 'voice.name', str());
  else if(k==='-vg' || k==='--voice_gender') _.set(o, 'voice.gender', str());
  else if(k==='-qb' || k==='--quote_break') _.set(o, 'quote.break', parseFloat(str()));
  else if(k==='-qe' || k==='--quote_emphasis') _.set(o, 'quote.emphasis', str());
  else if(k==='-hb' || k==='--heading_break') _.set(o, 'heading.break', parseFloat(str()));
  else if(k==='-hd' || k==='--heading_difference') _.set(o, 'heading.difference', parseFloat(str()));
  else if(k==='-he' || k==='--heading_emphasis') _.set(o, 'heading.emphasis', str());
  else if(k==='-eb' || k==='--ellipsis_break') _.set(o, 'ellipsis.break', parseFloat(str()));
  else if(k==='-db' || k==='--dash_break') _.set(o, 'dash.break', parseFloat(str()));
  else if(k==='-nb' || k==='--newline_break') _.set(o, 'newline.break', parseFloat(str()));
  else if(k==='-bl' || k==='--block_length') _.set(o, 'block.length', parseInt(str(), 10));
  else if(k==='-bs' || k==='--block_separator') _.set(o, 'block.separator', str());
  // NOTE(review): the value returned by awsconfig.options is ignored here; if it is the
  // next index (as with this function), a separate value argument is not skipped. Confirm.
  else if(k.startsWith('-c')) awsconfig.options(o.config, '-'+k.substring(2), a, i);
  else if(k.startsWith('--config_')) awsconfig.options(o.config, '--'+k.substring(9), a, i);
  else o.argv = a[i];
  return i+1;
}
// Expose the command-line argument parser as a property of the exported function.
amazontts.options = options;
// The module's export is the amazontts() function itself.
module.exports = amazontts;
/**
 * Command-line entry point.
 * Reads standard input as the default text, parses the arguments from index 2
 * onwards, then either shows README.md through `less` (--help) or runs amazontts
 * and, unless logging is enabled, prints "<time> <title>" for every titled section.
 * @param {string[]} a process argument list.
 */
async function shell(a) {
  await importDependencies();
  const o = {argv: await getStdin()};
  const count = a.length;
  let index = 2;
  while (index < count) {
    index = options(o, a[index], a, index);
  }
  if (o.help) {
    return cp.execSync('less README.md', {cwd: __dirname, stdio: STDIO});
  }
  // A text file given with --text takes precedence over piped input.
  let txt;
  if (o.text) txt = fs.readFileSync(o.text, 'utf8');
  else txt = o.argv||'';
  const out = o.output||'out.mp3';
  const toc = await amazontts(out, txt, o);
  if (o.log || OPTIONS.log) return;
  for (const entry of toc) {
    if (!entry.title) continue;
    console.log(entry.time+' '+entry.title);
  }
}
// Run the command-line interface only when this file is executed directly, not when it is required as a module.
if(require.main===module) shell(process.argv);