echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
390 lines (335 loc) • 8.29 kB
text/typescript
import { SynthesisVoice } from '../api/API.js'
import { decodeWaveToRawAudio } from '../audio/AudioUtilities.js'
import { Logger } from '../utilities/Logger.js'
import { getRandomHexString, logToStderr, resolveModuleMainPath } from '../utilities/Utilities.js'
import { open, close, remove, ensureDir, readFileAsBinary, readFileAsUtf8 } from '../utilities/FileSystem.js'
import { getAppTempDir, joinPath } from '../utilities/PathUtilities.js'
const log = logToStderr
export type FliteVoiceName = 'kal' | 'kal16' | 'awb' | 'rms' | 'slt' | string
let fliteModuleObject: WebAssembly.Module
export async function synthesize(text: string, voice: FliteVoiceName, voiceDir: string | undefined, rate: number, pitchMeanHz?: number, pitchStdDev?: number) {
const logger = new Logger()
logger.start('Get Flite WASI instance')
const randomId = getRandomHexString(16)
const outFileName = `${randomId}.out.wav`
const stdOutFileName = `${randomId}.stdout`
const tempDir = getAppTempDir('flite')
await ensureDir(tempDir)
const tempAudioFilePath = joinPath(tempDir, outFileName)
const tempStdOutFilePath = joinPath(tempDir, stdOutFileName)
const stdOutFileFd = await open(tempStdOutFilePath, 'w+')
if (voiceDir) {
voice = `voices/${voice}.flitevox`
}
const optionArgs = [
'-voice', voice,
'--setf', `duration_stretch=${1.0 / rate}`,
]
if (pitchMeanHz != null) {
optionArgs.push('--setf', `int_f0_target_mean=${pitchMeanHz}`)
}
if (pitchStdDev != null) {
optionArgs.push('--setf', `int_f0_target_stddev=${pitchStdDev}`)
}
const preopens: { [k: string]: string } = {
'.': tempDir,
}
if (voiceDir != undefined) {
preopens['./voices'] = voiceDir
}
const { WASI } = await import('wasi')
const wasi = new WASI({
version: 'preview1',
env: {
},
args: ['--',
//'-ssml',
'-psdur',
...optionArgs,
//` ${escapeHtml(text)} `,
` ${text} `,
outFileName
],
preopens,
stdout: stdOutFileFd,
returnOnExit: true
})
// Some WASI binaries require `const importObject = { wasi_unstable: wasi.wasiImport }`
const importObject = { wasi_snapshot_preview1: wasi.wasiImport }
const moduleObject = await getModuleObject()
const instance = await WebAssembly.instantiate(moduleObject, importObject)
logger.start('Synthesize with flite')
const exitCode: number = wasi.start(instance) as any
await close(stdOutFileFd)
async function cleanup() {
await remove(tempAudioFilePath)
await remove(tempStdOutFilePath)
}
if (exitCode != 0) {
await cleanup()
throw new Error(`Flite failed with exit code ${exitCode}`)
}
const waveData = await readFileAsBinary(tempAudioFilePath)
const stdOutString = await readFileAsUtf8(tempStdOutFilePath)
await cleanup()
const events = parseEventsFromTrace(stdOutString)
const { rawAudio } = decodeWaveToRawAudio(waveData)
logger.end()
return { rawAudio, events }
}
async function getModuleObject() {
if (!fliteModuleObject) {
const fliteWasiPath = await resolveModuleMainPath('@echogarden/flite-wasi')
const wasiFileContent = await readFileAsBinary(fliteWasiPath) as Uint8Array<ArrayBuffer>
fliteModuleObject = await WebAssembly.compile(wasiFileContent)
}
return fliteModuleObject
}
function parseEventsFromTrace(eventTrace: string) {
//log(eventTrace)
const eventStrings = eventTrace.trim().split(' ')
const eventCount = eventStrings.length
const events: FliteEvent[] = []
let phraseStartOffset = 0
for (let i = 0; i < eventCount; i++) {
const eventString = eventStrings[i]
const splitPoint = eventString.lastIndexOf(':')
const id = eventString.substring(0, splitPoint)
const startTime = events.length > 0 ? events[events.length - 1].endTime : 0
let eventType: FliteEventType
if (id == 'pau') {
eventType = 'pause'
} else if (id == '\npau') {
eventType = 'phrasePause'
phraseStartOffset = startTime
} else {
eventType = 'phone'
}
const endTime = phraseStartOffset + parseFloat(eventString.substring(splitPoint + 1))
events.push({
type: eventType,
id,
startTime,
endTime
})
}
return events
}
export type FliteEventType = 'phone' | 'pause' | 'phrasePause'
export type FliteEvent = {
type: FliteEventType,
id: string,
startTime: number,
endTime: number
}
export const voiceList: SynthesisVoice[] = [
// Built-in voices
{
name: 'slt',
languages: ['en-US', 'en'],
gender: 'female'
},
{
name: 'kal16',
languages: ['en-US', 'en'],
gender: 'male'
},
{
name: 'rms',
languages: ['en-US', 'en'],
gender: 'male'
},
{
name: 'awb',
languages: ['en-GB-SCOTLAND', 'en-GB', 'en'],
gender: 'male'
},
{
name: 'kal',
languages: ['en-US', 'en'],
gender: 'male'
},
// Voices loaded from packages
// US English voices
{
name: 'cmu_us_aew',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_aew'
},
{
name: 'cmu_us_ahw',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_ahw'
},
{
name: 'cmu_us_aup',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_aup'
},
{
name: 'cmu_us_awb',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_awb'
},
{
name: 'cmu_us_axb',
languages: ['en-US', 'en'],
gender: 'female',
packageName: 'flite-cmu_us_axb'
},
{
name: 'cmu_us_bdl',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_bdl'
},
{
name: 'cmu_us_clb',
languages: ['en-US', 'en'],
gender: 'female',
packageName: 'flite-cmu_us_clb'
},
{
name: 'cmu_us_eey',
languages: ['en-US', 'en'],
gender: 'female',
packageName: 'flite-cmu_us_eey'
},
{
name: 'cmu_us_fem',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_fem'
},
{
name: 'cmu_us_gka',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_gka'
},
{
name: 'cmu_us_jmk',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_jmk'
},
{
name: 'cmu_us_ksp',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_ksp'
},
{
name: 'cmu_us_ljm',
languages: ['en-US', 'en'],
gender: 'female',
packageName: 'flite-cmu_us_ljm'
},
{
name: 'cmu_us_lnh',
languages: ['en-US', 'en'],
gender: 'female',
packageName: 'flite-cmu_us_lnh'
},
{
name: 'cmu_us_rms',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_rms'
},
{
name: 'cmu_us_rxr',
languages: ['en-US', 'en'],
gender: 'male',
packageName: 'flite-cmu_us_rxr'
},
{
name: 'cmu_us_slp',
languages: ['en-US', 'en'],
gender: 'female',
packageName: 'flite-cmu_us_slp'
},
{
name: 'cmu_us_slt',
languages: ['en-US', 'en'],
gender: 'female',
packageName: 'flite-cmu_us_slt'
},
// Indic voices
{
name: 'cmu_indic_ben_rm',
languages: ['be', 'bn'],// Bengali
gender: 'female',
packageName: 'flite-cmu_indic_ben_rm'
},
{
name: 'cmu_indic_guj_ad',
languages: ['gu'],// Gujarati
gender: 'male',
packageName: 'flite-cmu_indic_guj_ad'
},
{
name: 'cmu_indic_guj_dp',
languages: ['gu'],// Gujarati
gender: 'female',
packageName: 'flite-cmu_indic_guj_dp'
},
{
name: 'cmu_indic_guj_kt',
languages: ['gu'],// Gujarati
gender: 'female',
packageName: 'flite-cmu_indic_guj_kt'
},
{
name: 'cmu_indic_hin_ab',
languages: ['hi'],// Hindi
gender: 'female',
packageName: 'flite-cmu_indic_hin_ab'
},
{
name: 'cmu_indic_kan_plv',
languages: ['ka'],// Kannada
gender: 'female',
packageName: 'flite-cmu_indic_kan_plv'
},
{
name: 'cmu_indic_mar_aup',
languages: ['mr'],// Marathi
gender: 'male',
packageName: 'flite-cmu_indic_mar_aup'
},
{
name: 'cmu_indic_mar_slp',
languages: ['mr'],// Marathi
gender: 'female',
packageName: 'flite-cmu_indic_mar_slp'
},
{
name: 'cmu_indic_pan_amp',
languages: ['pa'],// Punjabi
gender: 'female',
packageName: 'flite-cmu_indic_pan_amp'
},
{
name: 'cmu_indic_tel_kpn',
languages: ['te'],// Telugu
gender: 'female',
packageName: 'flite-cmu_indic_tel_kpn'
},
{
name: 'cmu_indic_tel_sk',
languages: ['te'],// Telugu
gender: 'male',
packageName: 'flite-cmu_indic_tel_sk'
},
{
name: 'cmu_indic_tel_ss',
languages: ['te'],// Telugu
gender: 'female',
packageName: 'flite-cmu_indic_tel_ss'
},
]