UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

237 lines 10.7 kB
import { decodeToChannels, SampleFormat } from '../audio/AudioBufferConversion.js';
import { bandwidthToQFactor } from '../dsp/BiquadFilter.js';
import { Logger } from '../utilities/Logger.js';
import { readFileAsBinary } from '../utilities/FileSystem.js';
import { concatUint8Arrays } from '../utilities/Utilities.js';
import { wrapEmscriptenModuleHeap } from 'wasm-heap-manager';

// Lazily created, module-wide SVOX Pico WASM module instance (see `getInstance`).
let svoxPicoInstance;

/**
 * Synthesizes speech for `text` with the SVOX Pico engine compiled to WASM.
 *
 * A fresh Pico system, voice definition and engine are created for every call
 * and are released again before the function returns or throws.
 *
 * @param {string} text - Text to synthesize.
 * @param {string} textAnalysisFilePath - Local path of the Pico text-analysis (`*_ta.bin`) resource.
 * @param {string} signalGenerationFilePath - Local path of the Pico signal-generation (`*_sg.bin`) resource.
 * @param {boolean} [postprocessOutput=true] - When true, applies a fixed chain of shelf/peaking EQ filters to the output.
 * @returns {Promise<{ rawAudio: { audioChannels: any, sampleRate: number } }>} The decoded audio, tagged with a 16000 Hz sample rate.
 * @throws {Error} When any Pico API call reports a failure code, or the engine accepts no text.
 */
export async function synthesize(text, textAnalysisFilePath, signalGenerationFilePath, postprocessOutput = true) {
    const logger = new Logger();

    logger.start('Get pico WASM instance');
    const m = await getInstance();

    logger.start('Initialize pico engine');
    const wasmHeap = wrapEmscriptenModuleHeap(m);

    const pico_initialize = m._pico_initialize;
    const picoext_setTraceLevel = m._picoext_setTraceLevel;
    const pico_loadResource = m._pico_loadResource;
    const pico_getResourceName = m._pico_getResourceName;
    const pico_createVoiceDefinition = m._pico_createVoiceDefinition;
    const pico_addResourceToVoiceDefinition = m._pico_addResourceToVoiceDefinition;
    const pico_newEngine = m._pico_newEngine;
    const pico_putTextUtf8 = m._pico_putTextUtf8;
    const pico_getData = m._pico_getData;
    const pico_getSystemStatusMessage = m._pico_getSystemStatusMessage;
    const pico_disposeEngine = m._pico_disposeEngine;
    const pico_terminate = m._pico_terminate;
    const pico_releaseVoiceDefinition = m._pico_releaseVoiceDefinition;
    const pico_unloadResource = m._pico_unloadResource;

    // Step codes accepted from `pico_getData` (PICO_STEP_IDLE / PICO_STEP_BUSY in the Pico headers).
    const picoStepIdle = 200;
    const picoStepBusy = 201;

    // `textSize` of `pico_putTextUtf8` is a 16-bit signed integer in the Pico C API,
    // so never offer more than this many bytes in a single call.
    const maxTextBytesPerPut = 32767;

    // Cleanup state. These are declared (as `let`) before anything can fail so that
    // `dispose` can always inspect them. Each one is only set once the corresponding
    // Pico object has actually been created.
    let isDisposed = false;
    let systemPtrRef;
    let systemPtr = 0;
    let textAnalysisResourcePtrRef;
    let signalGenerationResourcePtrRef;
    let voiceNameRef;
    let isVoiceDefined = false;
    let enginePtrRef;

    // Releases every Pico object created so far, then frees all WASM heap allocations
    // made through `wasmHeap`. Safe to call at any point and more than once.
    function dispose() {
        if (isDisposed) {
            return;
        }

        isDisposed = true;

        if (systemPtr) {
            if (enginePtrRef) {
                pico_disposeEngine(systemPtr, enginePtrRef.address);
            }

            if (isVoiceDefined) {
                pico_releaseVoiceDefinition(systemPtr, voiceNameRef.address);
            }

            if (textAnalysisResourcePtrRef) {
                pico_unloadResource(systemPtr, textAnalysisResourcePtrRef.address);
            }

            if (signalGenerationResourcePtrRef) {
                pico_unloadResource(systemPtr, signalGenerationResourcePtrRef.address);
            }

            pico_terminate(systemPtrRef.address);
        }

        // Always release the heap allocations, even when the Pico system never came up.
        wasmHeap.freeAll();
    }

    // Throws an Error combining `title` with Pico's status message unless `resultCode`
    // is one of `successCodes`. Cleanup is left to the caller's `finally` block.
    function throwErrorIfFailed(resultCode, title, successCodes = [0]) {
        if (successCodes.includes(resultCode)) {
            return;
        }

        // NOTE(review): 200 is presumably the capacity Pico expects for status messages
        // (PICO_RETSTRINGSIZE) - confirm how `allocNullTerminatedUtf8String` treats a numeric argument.
        const picoErrorMessageRef = wasmHeap.allocNullTerminatedUtf8String(200);

        // The native function needs the buffer's address, not the wrapper object.
        pico_getSystemStatusMessage(systemPtr, resultCode, picoErrorMessageRef.address);

        const picoErrorMessage = picoErrorMessageRef.value;

        throw new Error(`${title} ${picoErrorMessage}`);
    }

    // Copies a local resource file into the module's virtual file system and loads it into Pico.
    // Returns the heap reference holding the resulting resource handle.
    async function loadResource(localFilePath) {
        const virtualFilePath = '.' + localFilePath.substring(localFilePath.lastIndexOf('/'));

        const fileData = await readFileAsBinary(localFilePath);
        m.FS.writeFile(virtualFilePath, fileData);

        const virtualFilePathRef = wasmHeap.allocNullTerminatedUtf8String(virtualFilePath);
        const resourcePtrRef = wasmHeap.allocPointer32();

        const loadResult = pico_loadResource(systemPtr, virtualFilePathRef.address, resourcePtrRef.address);
        throwErrorIfFailed(loadResult, `Failed loading Pico resource ${localFilePath}.`);

        return resourcePtrRef;
    }

    // Asks Pico for the internal name of a loaded resource and returns the heap reference holding it.
    function getResourceNameRef(resourcePtr) {
        // NOTE(review): 32 is presumably Pico's maximum resource name size - confirm.
        const resourceNameRef = wasmHeap.allocNullTerminatedUtf8String(32);

        const nameResult = pico_getResourceName(systemPtr, resourcePtr, resourceNameRef.address);
        throwErrorIfFailed(nameResult, `Failed getting Pico resource name.`);

        return resourceNameRef;
    }

    // Attaches a loaded resource (identified by the address of its name) to the voice definition.
    function addResourceToVoiceDefinition(resourceNamePtr) {
        const addResult = pico_addResourceToVoiceDefinition(systemPtr, voiceNameRef.address, resourceNamePtr);
        throwErrorIfFailed(addResult, `Failed adding resource to voice definition.`);
    }

    let audioData;

    try {
        // Initialize the Pico system inside a fixed-size work area on the WASM heap.
        const picoMemSize = 2500000;
        const picoMemAreaRef = wasmHeap.allocUint8Array(picoMemSize);
        systemPtrRef = wasmHeap.allocPointer32();

        const initResult = pico_initialize(picoMemAreaRef.address, picoMemAreaRef.allocatedByteCount, systemPtrRef.address);
        systemPtr = systemPtrRef.value;
        throwErrorIfFailed(initResult, 'Failed Pico initialization.');

        picoext_setTraceLevel(systemPtr, 5);

        // Load both resources. The tracked references are only set after a successful load.
        textAnalysisResourcePtrRef = await loadResource(textAnalysisFilePath);
        signalGenerationResourcePtrRef = await loadResource(signalGenerationFilePath);

        const textAnalysisResourceNameRef = getResourceNameRef(textAnalysisResourcePtrRef.value);
        const signalGenerationResourceNameRef = getResourceNameRef(signalGenerationResourcePtrRef.value);

        // Create the voice definition and attach both resources to it.
        voiceNameRef = wasmHeap.allocNullTerminatedUtf8String('PicoVoice');

        const createVoiceResult = pico_createVoiceDefinition(systemPtr, voiceNameRef.address);
        throwErrorIfFailed(createVoiceResult, `Failed creating voice definition.`);
        isVoiceDefined = true;

        addResourceToVoiceDefinition(textAnalysisResourceNameRef.address);
        addResourceToVoiceDefinition(signalGenerationResourceNameRef.address);

        // Create the engine.
        const newEnginePtrRef = wasmHeap.allocPointer32();

        const newEngineResult = pico_newEngine(systemPtr, voiceNameRef.address, newEnginePtrRef.address);
        throwErrorIfFailed(newEngineResult, `Failed creating new engine.`);

        enginePtrRef = newEnginePtrRef;
        const enginePtr = enginePtrRef.value;

        logger.start('Synthesize with pico');

        const textRef = wasmHeap.allocNullTerminatedUtf8String(text);
        const bytesWrittenRef = wasmHeap.allocInt32();

        // Output buffers are allocated once and reused for every `pico_getData` call.
        const outBufferLength = 16384;
        const outBufferRef = wasmHeap.allocUint8Array(outBufferLength);
        const outByteCountRef = wasmHeap.allocInt16();
        const outDataTypeRef = wasmHeap.allocInt16();

        // Pulls audio from the engine until it reports the idle step, returning the collected bytes.
        const readAudioDataFromEngine = () => {
            const outBuffers = [];

            while (true) {
                const stepResult = pico_getData(enginePtr, outBufferRef.address, outBufferRef.allocatedByteCount, outByteCountRef.address, outDataTypeRef.address);
                throwErrorIfFailed(stepResult, `Failed getting audio data from engine.`, [picoStepIdle, picoStepBusy]);

                const outByteCount = outByteCountRef.value;

                if (stepResult === picoStepIdle) {
                    break;
                }

                if (outByteCount > 0) {
                    // `slice` copies, so the data stays valid after the heap buffer is reused or freed.
                    outBuffers.push(outBufferRef.view.slice(0, outByteCount));
                }
            }

            return concatUint8Arrays(outBuffers);
        };

        const audioParts = [];

        // The terminating null byte is fed to the engine as well.
        const textByteCountIncludingTerminator = textRef.encodedByteCount + 1;

        let textByteOffset = 0;

        while (textByteOffset < textByteCountIncludingTerminator) {
            const remainingByteCount = textByteCountIncludingTerminator - textByteOffset;

            bytesWrittenRef.value = 0;

            const putTextResult = pico_putTextUtf8(enginePtr, textRef.address + textByteOffset, Math.min(remainingByteCount, maxTextBytesPerPut), bytesWrittenRef.address);
            const bytesWritten = bytesWrittenRef.value;
            throwErrorIfFailed(putTextResult, `Failed writing text to engine.`);

            audioParts.push(readAudioDataFromEngine());

            // Without progress the loop would never terminate.
            if (bytesWritten <= 0) {
                throw new Error(`Failed writing text to engine. The engine accepted no text bytes.`);
            }

            textByteOffset += bytesWritten;
        }

        audioData = concatUint8Arrays(audioParts);
    } finally {
        // Runs on success and on every failure path, including non-Pico errors such as file reads.
        dispose();
    }

    // NOTE(review): arguments presumably mean 1 channel, 16-bit PCM - confirm against `decodeToChannels`.
    const audioChannels = decodeToChannels(audioData, 1, 16, SampleFormat.PCM);

    const rawAudio = { audioChannels, sampleRate: 16000 };

    if (postprocessOutput) {
        logger.start('Apply EQ to synthesized audio');

        const Biquad = await import('../dsp/BiquadFilter.js');

        Biquad.createLowshelfFilter(rawAudio.sampleRate, 177, -2.6).filterSamplesInPlace(rawAudio.audioChannels[0]);
        Biquad.createPeakingFilter(rawAudio.sampleRate, 440, bandwidthToQFactor(2), -9.7).filterSamplesInPlace(rawAudio.audioChannels[0]);
        // Disabled in the original source:
        // Biquad.createPeakingFilter(rawAudio.sampleRate, 1639, bandwidthToQFactor(2), 5.2).filterSamplesInPlace(rawAudio.audioChannels[0])
        Biquad.createHighshelfFilter(rawAudio.sampleRate, 5180, 10.6).filterSamplesInPlace(rawAudio.audioChannels[0]);
    }

    logger.end();

    return { rawAudio };
}

/**
 * Returns the shared SVOX Pico WASM module instance, importing and initializing
 * `@echogarden/svoxpico-wasm` on first use.
 *
 * @returns {Promise<any>} The initialized module instance.
 */
export async function getInstance() {
    if (!svoxPicoInstance) {
        const { default: initializer } = await import('@echogarden/svoxpico-wasm');

        svoxPicoInstance = await initializer();
    }

    return svoxPicoInstance;
}

/**
 * Maps a language code to the filenames of its Pico resource files.
 *
 * @param {string} language - One of the supported codes: `en-US`/`en`, `en-GB`, `de-DE`/`de`,
 *   `es-ES`/`es`, `fr-FR`/`fr`, `it-IT`/`it`.
 * @returns {{ textAnalysisFilename: string, signalGenerationFilename: string }} The resource filenames.
 * @throws {Error} When the language is not supported.
 */
export function getResourceFilenamesForLanguage(language) {
    let textAnalysisFilename;
    let signalGenerationFilename;

    switch (language) {
        case 'en-US':
        case 'en': {
            textAnalysisFilename = 'en-US_ta.bin';
            signalGenerationFilename = 'en-US_lh0_sg.bin';
            break;
        }

        case 'en-GB': {
            textAnalysisFilename = 'en-GB_ta.bin';
            signalGenerationFilename = 'en-GB_kh0_sg.bin';
            break;
        }

        case 'de-DE':
        case 'de': {
            textAnalysisFilename = 'de-DE_ta.bin';
            signalGenerationFilename = 'de-DE_gl0_sg.bin';
            break;
        }

        case 'es-ES':
        case 'es': {
            textAnalysisFilename = 'es-ES_ta.bin';
            signalGenerationFilename = 'es-ES_zl0_sg.bin';
            break;
        }

        case 'fr-FR':
        case 'fr': {
            textAnalysisFilename = 'fr-FR_ta.bin';
            signalGenerationFilename = 'fr-FR_nk0_sg.bin';
            break;
        }

        case 'it-IT':
        case 'it': {
            textAnalysisFilename = 'it-IT_ta.bin';
            signalGenerationFilename = 'it-IT_cm0_sg.bin';
            break;
        }

        default: {
            throw new Error(`Unsupported language: ${language}`);
        }
    }

    return { textAnalysisFilename, signalGenerationFilename };
}

// Voices offered by this engine: one per supported locale, each listing the language codes
// it serves and the package that provides it.
export const voiceList = [
    {
        name: 'en-US',
        languages: ['en-US', 'en'],
        gender: 'female',
        packageName: 'pico-en-US'
    },
    {
        name: 'en-GB',
        languages: ['en-GB', 'en'],
        gender: 'female',
        packageName: 'pico-en-GB'
    },
    {
        name: 'de-DE',
        languages: ['de-DE', 'de'],
        gender: 'female',
        packageName: 'pico-de-DE'
    },
    {
        name: 'es-ES',
        languages: ['es-ES', 'es'],
        gender: 'female',
        packageName: 'pico-es-ES'
    },
    {
        name: 'fr-FR',
        languages: ['fr-FR', 'fr'],
        gender: 'female',
        packageName: 'pico-fr-FR'
    },
    {
        name: 'it-IT',
        languages: ['it-IT', 'it'],
        gender: 'female',
        packageName: 'pico-it-IT'
    },
];
//# sourceMappingURL=SvoxPicoTTS.js.map