UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

367 lines (346 loc) 14.6 kB
import { parentPort } from 'node:worker_threads';
import { spawn } from 'child_process';
import { encodeRawAudioToWave, fadeAudioInOut, getRawAudioDuration, sliceAudioChannels } from './AudioUtilities.js';
import * as FFMpegTranscoder from '../codecs/FFMpegTranscoder.js';
import { Timer } from '../utilities/Timer.js';
import { clip, getRandomHexString, waitTimeout, writeToStderr } from '../utilities/Utilities.js';
import { encodeToAudioBuffer, float32ToInt16Pcm, interleaveChannels } from './AudioBufferConversion.js';
import { OpenPromise } from '../utilities/OpenPromise.js';
import { addWordTextOffsetsToTimelineInPlace } from '../utilities/Timeline.js';
import { readAndParseJsonFile, readFileAsUtf8, remove, writeFile } from '../utilities/FileSystem.js';
import { tryResolvingSoxPath } from './SoxPath.js';
import { SignalChannel } from '../utilities/SignalChannel.js';
import { deepClone } from '../utilities/ObjectUtilities.js';
import { appName } from '../api/Common.js';
import { getAppTempDir, joinPath } from '../utilities/PathUtilities.js';

/**
 * Builds a handler for messages received on `parentPort` that translates
 * keypress messages into signals on the given channel:
 *
 *   return -> 'abort'
 *   left   -> 'skip' with -1
 *   right  -> 'skip' with 1
 *   space  -> 'togglePause'
 *
 * Messages whose `name` is not 'keypress' are ignored, as are keypresses whose
 * `timestamp` is earlier than the moment this handler was created.
 *
 * @param signalChannel Channel that receives the playback control signals.
 * @returns The message handler function.
 */
function createKeypressHandler(signalChannel) {
	const listenerStartTimestamp = Date.now();

	return function keypressHandler(message) {
		if (message.name !== 'keypress') {
			return;
		}

		// Drop key presses that were recorded before listening started
		if (message.timestamp < listenerStartTimestamp) {
			return;
		}

		switch (message.key.name) {
			case 'return': {
				signalChannel.send('abort');
				break;
			}

			case 'left': {
				signalChannel.send('skip', -1);
				break;
			}

			case 'right': {
				signalChannel.send('skip', 1);
				break;
			}

			case 'space': {
				signalChannel.send('togglePause');
				break;
			}
		}
	};
}

/**
 * Plays audio while forwarding keypress messages from `parentPort` as playback
 * control signals. The message listener is always detached afterwards, even
 * when playback fails.
 *
 * @param rawAudio Audio to play.
 * @param onTimePosition Optional callback receiving the playback time position.
 * @param player Optional player ID, passed through to `playAudioSamples`.
 */
async function playAudioSamplesWithKeypressSignals(rawAudio, onTimePosition, player) {
	const signalChannel = new SignalChannel();
	const keypressHandler = createKeypressHandler(signalChannel);

	parentPort?.on('message', keypressHandler);

	try {
		await playAudioSamples(rawAudio, onTimePosition, signalChannel, player);
	} finally {
		parentPort?.off('message', keypressHandler);
	}
}

/**
 * Loads an audio file, a JSON timeline file and, optionally, a transcript
 * file, then plays the audio while printing the transcript in sync.
 *
 * @param audioFilename Path of the audio file to decode.
 * @param timelineFileName Path of the JSON file containing the word timeline.
 * @param transcriptFileName Optional path of a UTF-8 transcript file.
 * @param player Optional player ID (see `playAudioSamples`).
 */
export async function playAudioFileWithTimelineFile(audioFilename, timelineFileName, transcriptFileName, player) {
	// NOTE(review): the arguments 48000 and 1 are presumably the output sample rate
	// and channel count - confirm against FFMpegTranscoder.decodeToChannels.
	const rawAudio = await FFMpegTranscoder.decodeToChannels(audioFilename, 48000, 1);
	const timeline = await readAndParseJsonFile(timelineFileName);

	let transcript;

	if (transcriptFileName) {
		transcript = await readFileAsUtf8(transcriptFileName);
	}

	await playAudioWithWordTimeline(rawAudio, timeline, transcript, player);
}

/**
 * Plays audio and writes the transcript to stderr progressively, as the
 * playback position reaches the start time of each timeline entry.
 *
 * When no transcript is given, one is built by joining the `text` of all
 * timeline entries with single spaces. The timeline is deep-cloned before
 * text offsets are added to it, so the caller's timeline is not modified.
 *
 * Keyboard controls received through `parentPort` are active during playback.
 *
 * @param rawAudio Audio to play.
 * @param wordTimeline Timeline entries; `text` and `startTime` are read from each.
 * @param transcript Optional transcript text.
 * @param player Optional player ID (see `playAudioSamples`).
 */
export async function playAudioWithWordTimeline(rawAudio, wordTimeline, transcript, player) {
	if (!transcript) {
		transcript = wordTimeline.map(entry => entry.text).join(' ');
	}

	wordTimeline = deepClone(wordTimeline);

	addWordTextOffsetsToTimelineInPlace(wordTimeline, transcript);

	let timelineEntryIndex = 0;
	let transcriptOffset = 0;

	function onTimePosition(timePosition) {
		const text = transcript;

		for (; timelineEntryIndex < wordTimeline.length; timelineEntryIndex++) {
			const entry = wordTimeline[timelineEntryIndex];

			// Entries are consumed in order; stop at the first one that hasn't started yet
			if (entry.startTime > timePosition) {
				return;
			}

			const wordStartOffset = entry.startOffsetUtf16;
			let wordEndOffset = entry.endOffsetUtf16;

			if (wordStartOffset == null || wordEndOffset == null) {
				//writeToStderr(` [No offset available for '${entry.text}'] `)
				continue;
			}

			// Extend the written range over trailing punctuation, unless that
			// punctuation is the text of the next timeline entry, in which case
			// it is written when that entry is reached.
			const nextEntryText = wordTimeline[timelineEntryIndex + 1]?.text;

			while (wordEndOffset < text.length &&
				charactersToWriteAhead.includes(text[wordEndOffset]) &&
				text[wordEndOffset] != nextEntryText) {

				wordEndOffset += 1;
			}

			writeToStderr(text.substring(transcriptOffset, wordEndOffset));

			transcriptOffset = wordEndOffset;
		}
	}

	writeToStderr('\n');

	await playAudioSamplesWithKeypressSignals(rawAudio, onTimePosition, player);

	writeToStderr('\n');
}

/**
 * Plays audio with keyboard controls received through `parentPort`
 * (return: abort, left/right: skip -1/+1, space: toggle pause).
 *
 * @param rawAudio Audio to play.
 * @param player Optional player ID (see `playAudioSamples`).
 */
export async function playAudioSamplesWithKeyboardControls(rawAudio, player) {
	await playAudioSamplesWithKeypressSignals(rawAudio, undefined, player);
}

/**
 * Plays audio using the selected player.
 *
 * @param rawAudio Audio to play.
 * @param onTimePosition Optional callback receiving the playback time position.
 * @param signalChannel Optional channel carrying playback control signals.
 * @param player Player ID: 'audio-io' (used when falsy) or 'sox'.
 * @returns The promise returned by the selected player.
 * @throws Error when the player ID is not supported.
 */
export function playAudioSamples(rawAudio, onTimePosition, signalChannel, player) {
	if (!player) {
		player = 'audio-io';
	}

	switch (player) {
		case 'audio-io': {
			return playAudioSamples_AudioIO(rawAudio, onTimePosition, signalChannel);
		}

		case 'sox': {
			return playAudioSamples_Sox(rawAudio, onTimePosition, signalChannel);
		}

		default: {
			throw new Error(`Unsupported audio player ID: ${player}`);
		}
	}
}

/**
 * Plays audio through the '@echogarden/audio-io' package.
 *
 * Handles the 'abort', 'skip', 'seek' and 'togglePause' signals. 'skip' moves
 * the playback position by the given number of seconds, 'seek' moves it to the
 * given time in seconds; both are clipped to the audio's frame range.
 *
 * @param rawAudio Audio to play; `sampleRate` and `audioChannels` are read.
 * @param onTimePosition Optional callback, invoked with the current audio time
 *   (in seconds) each time the output requests a new buffer.
 * @param signalChannel Optional channel carrying playback control signals.
 * @returns A promise resolved once playback has ended or was aborted.
 */
export async function playAudioSamples_AudioIO(rawAudio, onTimePosition, signalChannel) {
	const openPromise = new OpenPromise();

	const sampleRate = rawAudio.sampleRate;
	const channelCount = rawAudio.audioChannels.length;
	const bufferDuration = 100.0;

	const audioFrameCount = rawAudio.audioChannels[0].length;
	const audioDuration = getRawAudioDuration(rawAudio);

	const { createAudioOutput } = await import('@echogarden/audio-io');

	let frameOffset = 0;
	let audioOutput;

	let abortRequested = false;
	let ended = false;
	let isPaused = false;

	// Marks playback as ended, releases the audio output and settles the returned promise
	async function endPlayback() {
		ended = true;

		await audioOutput.dispose();

		openPromise.resolve();
	}

	// Define an audio output handler function
	async function audioOutputHandler(outputBuffer) {
		if (ended) {
			return;
		}

		if (onTimePosition) {
			const audioTime = Math.min(frameOffset / sampleRate, audioDuration);

			onTimePosition(audioTime);
		}

		if (isPaused) {
			// An abort requested while paused must still end playback.
			// Otherwise it would be ignored until playback is resumed.
			if (abortRequested) {
				await endPlayback();
			}

			// NOTE(review): the output buffer is left untouched while paused,
			// presumably it is pre-filled with silence - confirm.
			return;
		}

		const chunkFrameCount = outputBuffer.length / channelCount;

		const floatAudioChunk = sliceAudioChannels(rawAudio.audioChannels, frameOffset, frameOffset + chunkFrameCount);
		const interleavedFloatAudioChunk = interleaveChannels(floatAudioChunk);
		const int16AudioChunk = float32ToInt16Pcm(interleavedFloatAudioChunk);

		outputBuffer.set(int16AudioChunk);

		frameOffset += chunkFrameCount;

		// A chunk shorter than the output buffer means the end of the audio was reached
		if (abortRequested || int16AudioChunk.length < outputBuffer.length) {
			await endPlayback();
		}
	}

	if (signalChannel) {
		signalChannel.on('abort', () => {
			abortRequested = true;
		});

		signalChannel.on('skip', (durationToSkip) => {
			frameOffset += durationToSkip * sampleRate;
			frameOffset = Math.floor(frameOffset);
			frameOffset = clip(frameOffset, 0, audioFrameCount);
		});

		signalChannel.on('seek', (timeToSeekTo) => {
			frameOffset = Math.floor(timeToSeekTo * sampleRate);
			frameOffset = clip(frameOffset, 0, audioFrameCount);
		});

		signalChannel.on('togglePause', () => {
			isPaused = !isPaused;
		});
	}

	audioOutput = await createAudioOutput({
		sampleRate, // Sample rate in Hz, should be an integer like 44100, 22050, 8000
		channelCount, // Channel count, likely 1 (mono), or 2 (stereo)
		bufferDuration, // Target buffer duration, in milliseconds. Defaults to 100.0
	}, audioOutputHandler);

	return openPromise.promise;
}

/**
 * Plays audio by spawning the SoX command-line utility.
 *
 * On macOS ('darwin') the audio is first written to a temporary WAVE file that
 * SoX plays and that is removed once the process closes or errors. On other
 * platforms, raw 16-bit signed audio is streamed to the standard input of SoX.
 *
 * Only the 'abort' signal is handled; it kills the player process.
 *
 * @param rawAudio Audio to play.
 * @param onTimePosition Optional callback, invoked about every 20ms with the time
 *   elapsed since the player process was spawned, and finally with the audio duration.
 * @param signalChannel Optional channel carrying playback control signals.
 * @param microFadeInOut When true (default), `fadeAudioInOut(rawAudio, 0.0025)`
 *   is applied before playback.
 * @returns A promise resolved when the player process closes. It is rejected if
 *   SoX can't be found, if preparing the playback fails, or if the player
 *   process emits an error.
 */
export function playAudioSamples_Sox(rawAudio, onTimePosition, signalChannel, microFadeInOut = true) {
	return new Promise((resolve, reject) => {
		async function run() {
			if (microFadeInOut) {
				rawAudio = fadeAudioInOut(rawAudio, 0.0025);
			}

			let playerProcessClosed = false;

			const channelCount = rawAudio.audioChannels.length;
			const audioDuration = getRawAudioDuration(rawAudio);

			const playerSpawnedOpenPromise = new OpenPromise();

			const soxPath = await tryResolvingSoxPath();

			if (!soxPath) {
				throw new Error(`Couldn't find or install the SoX utility. Please install the SoX utility on your system path to enable audio playback.`);
			}

			let aborted = false;

			let streamToStdin = true;

			if (process.platform == 'darwin') {
				streamToStdin = false;
			}

			let tempFilePath;
			let audioBuffer;

			async function cleanup() {
				if (tempFilePath) {
					await remove(tempFilePath);
				}
			}

			let playerProcess;

			if (streamToStdin) {
				audioBuffer = encodeToAudioBuffer(rawAudio.audioChannels);

				playerProcess = spawn(
					soxPath,
					['-t', 'raw', '-r', `${rawAudio.sampleRate}`, '-e', 'signed', '-b', '16', '-c', channelCount.toString(), '-', '-d'],
					{}
				);
			} else {
				tempFilePath = joinPath(getAppTempDir(appName), `${getRandomHexString(16)}.wav`);

				const waveFileBuffer = encodeRawAudioToWave(rawAudio);

				await writeFile(tempFilePath, waveFileBuffer);

				playerProcess = spawn(soxPath, [tempFilePath, '-d'], {});
			}

			if (signalChannel) {
				signalChannel.on('abort', () => {
					aborted = true;
					playerProcess.kill('SIGKILL');
				});
			}

			// Required to work around SoX bug:
			playerProcess.stderr.on('data', (data) => {
				//writeToStderr(data.toString('utf-8'))
			});

			playerProcess.stdout.on('data', (data) => {
				//writeToStderr(data.toString('utf-8'))
			});

			playerProcess.once('spawn', () => {
				if (audioBuffer != undefined) {
					// Attach the error listener before writing, so that a stream error
					// (for example, when the process is killed) is never left unhandled
					playerProcess.stdin.on('error', () => { });

					playerProcess.stdin.write(audioBuffer);
					playerProcess.stdin.end();
				}

				playerSpawnedOpenPromise.resolve(null);
			});

			playerProcess.once('error', async (e) => {
				await cleanup();
				playerProcessClosed = true;

				reject(e);
			});

			playerProcess.once('close', async () => {
				await cleanup();
				playerProcessClosed = true;

				resolve();
			});

			await playerSpawnedOpenPromise.promise;

			const timer = new Timer();

			// Report the time position periodically. The returned promise itself
			// is settled by the 'close' and 'error' handlers above.
			while (!playerProcessClosed && !aborted) {
				const elapsedTime = timer.elapsedTimeSeconds;

				if (onTimePosition) {
					onTimePosition(elapsedTime);
				}

				if (playerProcessClosed || elapsedTime >= audioDuration) {
					if (onTimePosition) {
						onTimePosition(audioDuration);
					}

					return;
				}

				await waitTimeout(20);
			}
		}

		// Route any failure during setup or time reporting to the returned promise.
		// With an async executor, these errors would become unhandled rejections
		// and the returned promise would never settle.
		run().catch(reject);
	});
}

// Disabled (commented-out) player based on the 'speaker' package:
/*
export function playAudioSamples_Speaker(rawAudio: RawAudio, onTimePosition?: (timePosition: number) => void, microFadeInOut = true) {
	return new Promise<void>(async (resolve, reject) => {
		if (microFadeInOut) {
			rawAudio = fadeAudioInOut(rawAudio, 0.0025)
		}

		const channelCount = rawAudio.audioChannels.length

		let audioData = encodeToAudioBuffer(rawAudio.audioChannels)

		const { default: Speaker } = await import('speaker')

		const speaker = new Speaker({
			channels: rawAudio.audioChannels.length,
			bitDepth: 16,
			sampleRate: rawAudio.sampleRate,
		})

		speaker.on('error', (e: any) => {
			reject(e)
		})

		const bytesPerSecond = rawAudio.sampleRate * 2 * channelCount

		const byteCountToDuration = (byteCount: number) => {
			return byteCount / bytesPerSecond
		}

		const audioDuration = byteCountToDuration(audioData.length)

		let mpg123AudioBufferSize: number
		let mpg123AudioBufferDuration: number

		if (process.platform == 'win32') {
			mpg123AudioBufferSize = 65536
			mpg123AudioBufferDuration = byteCountToDuration(mpg123AudioBufferSize)
		} else {
			mpg123AudioBufferDuration = 0.5
			mpg123AudioBufferSize = bytesPerSecond * mpg123AudioBufferDuration
		}

		audioData = concatUint8Arrays([audioData, new Uint8Array(mpg123AudioBufferSize)])

		const maxChunkSize = mpg123AudioBufferSize
		const writeAheadDuration = 0.5

		const timer = new Timer()

		let readOffset = 0
		let targetTimePosition = 0

		while (true) {
			const elapsedTime = timer.elapsedTimeSeconds

			if (onTimePosition) {
				onTimePosition(elapsedTime)
			}

			if (readOffset < audioData.length) {
				const targetWriteTime = targetTimePosition - writeAheadDuration

				if (elapsedTime >= targetWriteTime) {
					const chunk = audioData.subarray(readOffset, readOffset + maxChunkSize)

					speaker.write(chunk)

					readOffset += chunk.length
					targetTimePosition += byteCountToDuration(chunk.length)
				}
			}

			if (elapsedTime >= audioDuration) {
				//speaker.close(false)
				resolve()
				return
			}

			await waitTimeout(20)
		}
	})
}
*/

/**
 * Punctuation characters that are written together with the word preceding
 * them, instead of waiting for the following timeline entry to start.
 *
 * Fullwidth forms are written as escape sequences so they can't be confused
 * with, or silently normalized to, their ASCII counterparts.
 */
export const charactersToWriteAhead = [
	',',
	'.',
	'\uFF0C', // Fullwidth comma
	'、',
	':',
	';',
	'。',
	'\uFF1A', // Fullwidth colon
	'\uFF1B', // Fullwidth semicolon
	'?',
	'\uFF1F', // Fullwidth question mark
	'!',
	'\uFF01', // Fullwidth exclamation mark
	')',
	']',
	'}',
	`"`,
	`'`,
	'”',
	'’',
	'-',
	'—',
	'»',
	'،',
	'؟'
];
//# sourceMappingURL=AudioPlayer.js.map