echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

github.com/echogarden-project/echogarden

echogarden-project/echogarden

282 lines (216 loc) • 10.4 kB

text/typescript

import * as AudioBufferConversion from '../audio/AudioBufferConversion.js'
import { RawAudio } from '../audio/AudioUtilities.js'
import { readUint16LE, readUint32LE, writeAscii, writeUint16LE, writeUint32LE } from '../utilities/BinaryUtilities.js'
import { encodeHex, decodeHex } from '../encodings/Hex.js'
import { concatUint8Arrays, logToStderr } from '../utilities/Utilities.js'
import { decodeAscii } from '../encodings/Ascii.js'

const log = logToStderr

// Largest value a 32-bit chunk size field can hold.
// Written by `encodeWave` when a size doesn't fit in 32 bits, and interpreted by `decodeWave`
// as "read up to the end of the buffer" when `ignoreOverflowingDataChunks` is enabled.
const maxUint32Value = 4294967295

/**
 * Serializes raw audio to the bytes of a RIFF/WAVE file.
 *
 * The output consists of a 12 byte RIFF header, a 'fmt ' subchunk and a single 'data' subchunk.
 * The extensible format header is used when `bitDepth` is greater than 16, or when there are
 * more than 2 channels.
 *
 * @param rawAudio Audio channels and sample rate to encode
 * @param bitDepth Bit depth of the encoded samples
 * @param sampleFormat Sample encoding written to the format subchunk
 * @param speakerPositionMask Speaker position mask. Only written when the extensible format is used
 * @returns The encoded wave file bytes
 * @throws Error if the extensible format is required, but `sampleFormat` is not PCM or Float
 */
export function encodeWave(rawAudio: RawAudio, bitDepth: BitDepth = 16, sampleFormat: SampleFormat = SampleFormat.PCM, speakerPositionMask = 0) {
	const audioChannels = rawAudio.audioChannels
	const sampleRate = rawAudio.sampleRate

	const audioBuffer = AudioBufferConversion.encodeToAudioBuffer(audioChannels, bitDepth, sampleFormat)
	const audioDataLength = audioBuffer.length

	const shouldUseExtensibleFormat = bitDepth > 16 || audioChannels.length > 2

	const formatSubChunk = new WaveFormat(audioChannels.length, sampleRate, bitDepth, sampleFormat, speakerPositionMask)
	const formatSubChunkBuffer = formatSubChunk.serialize(shouldUseExtensibleFormat)

	// 'data' subchunk: 4 byte id + 4 byte size + audio data
	const dataSubChunkBuffer = new Uint8Array(4 + 4 + audioDataLength)
	writeAscii(dataSubChunkBuffer, 'data', 0)

	// Ensure large data chunk length is clipped to max
	const dataChunkLength = Math.min(audioDataLength, maxUint32Value)
	writeUint32LE(dataSubChunkBuffer, dataChunkLength, 4)
	dataSubChunkBuffer.set(audioBuffer, 8)

	// RIFF header: 'RIFF' + size + 'WAVE'
	const riffChunkHeaderBuffer = new Uint8Array(12)
	writeAscii(riffChunkHeaderBuffer, 'RIFF', 0)

	// The RIFF size covers everything after the size field itself: 'WAVE' (4 bytes) + all subchunks.
	// Ensure large RIFF chunk length is clipped to max
	const riffChunkLength = Math.min(4 + formatSubChunkBuffer.length + dataSubChunkBuffer.length, maxUint32Value)
	writeUint32LE(riffChunkHeaderBuffer, riffChunkLength, 4)
	writeAscii(riffChunkHeaderBuffer, 'WAVE', 8)

	return concatUint8Arrays([riffChunkHeaderBuffer, formatSubChunkBuffer, dataSubChunkBuffer])
}

/**
 * Parses the bytes of a RIFF/WAVE file.
 *
 * Subchunks other than 'fmt ' and 'data' are skipped. If several 'data' subchunks are found,
 * their contents are concatenated in the order they appear.
 *
 * @param waveData Bytes of the wave file
 * @param ignoreTruncatedChunks When `false`, throws if the RIFF chunk, a subchunk, or a subchunk
 * header extends beyond the end of the buffer. When `true`, whatever is available is read
 * @param ignoreOverflowingDataChunks When `true`, a RIFF or 'data' chunk size of 4294967295
 * is taken to mean "up to the end of the buffer"
 * @returns The decoded audio, along with the sample format, bit depth and speaker position mask
 * described by the format subchunk
 * @throws Error if the RIFF / WAVE ids are missing, the RIFF chunk size is smaller than the buffer,
 * a 'data' subchunk precedes the 'fmt ' subchunk, either subchunk is missing, or the format is unsupported
 */
export function decodeWave(waveData: Uint8Array, ignoreTruncatedChunks = true, ignoreOverflowingDataChunks = true) {
	let readOffset = 0

	const riffId = decodeAscii(waveData.subarray(readOffset, readOffset + 4))

	if (riffId != 'RIFF') {
		throw new Error('Not a valid wave file. No RIFF id found at offset 0.')
	}

	readOffset += 4

	let riffChunkSize = readUint32LE(waveData, readOffset)

	readOffset += 4

	const waveId = decodeAscii(waveData.subarray(readOffset, readOffset + 4))

	if (waveId != 'WAVE') {
		throw new Error('Not a valid wave file. No WAVE id found at offset 8.')
	}

	if (ignoreOverflowingDataChunks && riffChunkSize === maxUint32Value) {
		riffChunkSize = waveData.length - 8
	}

	if (riffChunkSize < waveData.length - 8) {
		throw new Error(`RIFF chunk length ${riffChunkSize} is smaller than the remaining size of the buffer (${waveData.length - 8})`)
	}

	if (!ignoreTruncatedChunks && riffChunkSize > waveData.length - 8) {
		throw new Error(`RIFF chunk length (${riffChunkSize}) is greater than the remaining size of the buffer (${waveData.length - 8})`)
	}

	readOffset += 4

	let formatSubChunkBodyBuffer: Uint8Array | undefined
	const dataBuffers: Uint8Array[] = []

	// A subchunk header is 8 bytes (4 byte id + 4 byte size).
	// Only attempt to read a header when it is fully contained in the buffer.
	while (waveData.length - readOffset >= 8) {
		const subChunkIdentifier = decodeAscii(waveData.subarray(readOffset, readOffset + 4))
		readOffset += 4

		let subChunkSize = readUint32LE(waveData, readOffset)
		readOffset += 4

		if (!ignoreTruncatedChunks && subChunkSize > waveData.length - readOffset) {
			throw new Error(`Encountered a '${subChunkIdentifier}' subchunk with a size of ${subChunkSize} which is greater than the remaining size of the buffer (${waveData.length - readOffset})`)
		}

		if (subChunkIdentifier == 'fmt ') {
			formatSubChunkBodyBuffer = waveData.subarray(readOffset, readOffset + subChunkSize)
		} else if (subChunkIdentifier == 'data') {
			if (!formatSubChunkBodyBuffer) {
				throw new Error('A data subchunk was encountered before a format subchunk')
			}

			// If the data chunk is truncated or extended beyond 4 GiB,
			// the data would be read up to the end of the buffer
			if (ignoreOverflowingDataChunks && subChunkSize === maxUint32Value) {
				subChunkSize = waveData.length - readOffset
			}

			const subChunkData = waveData.subarray(readOffset, readOffset + subChunkSize)

			dataBuffers.push(subChunkData)
		}

		// All sub chunks other than 'fmt ' and 'data' (e.g. 'LIST', 'fact', 'plst', 'junk' etc.) are ignored

		// This addition can't overflow, since both operands are far below
		// the maximum safe integer (2^53 - 1)
		readOffset += subChunkSize

		// RIFF chunks are aligned to 2 byte boundaries: a chunk with an odd size is followed
		// by a single pad byte that isn't counted in its size field.
		if (subChunkSize % 2 === 1) {
			readOffset += 1
		}
	}

	// In strict mode, leftover bytes that can't form a complete subchunk header are an error
	if (!ignoreTruncatedChunks && readOffset < waveData.length) {
		throw new Error(`Encountered a truncated subchunk header at offset ${readOffset}. Only ${waveData.length - readOffset} bytes remain in the buffer`)
	}

	if (!formatSubChunkBodyBuffer) {
		throw new Error('No format subchunk was found in the wave file')
	}

	if (dataBuffers.length === 0) {
		throw new Error('No data subchunks were found in the wave file')
	}

	const waveFormat = WaveFormat.deserializeFrom(formatSubChunkBodyBuffer)

	const sampleFormat = waveFormat.sampleFormat
	const channelCount = waveFormat.channelCount
	const sampleRate = waveFormat.sampleRate
	const bitDepth = waveFormat.bitDepth
	const speakerPositionMask = waveFormat.speakerPositionMask

	const concatenatedDataBuffers = concatUint8Arrays(dataBuffers)
	dataBuffers.length = 0 // Allow the garbage collector to free up memory held by the data buffers

	const audioChannels = AudioBufferConversion.decodeToChannels(concatenatedDataBuffers, channelCount, bitDepth, sampleFormat)

	return {
		rawAudio: { audioChannels, sampleRate },

		sourceSampleFormat: sampleFormat,
		sourceBitDepth: bitDepth,
		sourceSpeakerPositionMask: speakerPositionMask
	}
}

/**
 * Decodes a wave file using the lenient defaults of `decodeWave`, and re-encodes it
 * with its original bit depth, sample format and speaker position mask.
 *
 * @param waveData Bytes of the wave file to repair
 * @returns Bytes of the re-encoded wave file
 */
export function repairWave(waveData: Uint8Array) {
	const { rawAudio, sourceSampleFormat, sourceBitDepth, sourceSpeakerPositionMask } = decodeWave(waveData)

	return encodeWave(rawAudio, sourceBitDepth, sourceSampleFormat, sourceSpeakerPositionMask)
}

/**
 * In-memory representation of a wave 'fmt ' subchunk.
 *
 * The trailing comment on each member gives its size and byte order in the serialized form.
 */
class WaveFormat { // 24 bytes total for PCM, 26 for float
	sampleFormat: SampleFormat // 2 bytes LE
	channelCount: number // 2 bytes LE
	sampleRate: number // 4 bytes LE
	get byteRate() { return this.sampleRate * this.bytesPerSample * this.channelCount } // 4 bytes LE
	get blockAlign() { return this.bytesPerSample * this.channelCount } // 2 bytes LE
	bitDepth: BitDepth // 2 bytes LE

	speakerPositionMask: number // 4 bytes LE
	get guid() { return sampleFormatToGuid[this.sampleFormat] } // 16 bytes BE

	// helpers:
	get bytesPerSample() { return this.bitDepth / 8 }

	constructor(channelCount: number, sampleRate: number, bitDepth: BitDepth, sampleFormat: SampleFormat, speakerPositionMask = 0) {
		this.sampleFormat = sampleFormat
		this.channelCount = channelCount
		this.sampleRate = sampleRate
		this.bitDepth = bitDepth
		this.speakerPositionMask = speakerPositionMask
	}

	/**
	 * Serializes the format to a complete 'fmt ' subchunk, including its 8 byte header.
	 *
	 * @param useExtensibleFormat When `true`, writes format id 65534 followed by the extension
	 * fields (valid bits per sample, speaker position mask and sample format GUID)
	 * @returns The serialized subchunk bytes
	 * @throws Error if `useExtensibleFormat` is set and the sample format is not PCM or Float
	 */
	serialize(useExtensibleFormat: boolean) {
		let sampleFormatId = this.sampleFormat

		if (useExtensibleFormat) {
			sampleFormatId = 65534 as number
		}

		const serializedSize = sampleFormatToSerializedSize[sampleFormatId]

		const result = new Uint8Array(serializedSize)

		writeAscii(result, 'fmt ', 0) // + 4
		writeUint32LE(result, serializedSize - 8, 4) // + 4

		writeUint16LE(result, sampleFormatId, 8) // + 2
		writeUint16LE(result, this.channelCount, 10) // + 2
		writeUint32LE(result, this.sampleRate, 12) // + 4
		writeUint32LE(result, this.byteRate, 16) // + 4
		writeUint16LE(result, this.blockAlign, 20) // + 2
		writeUint16LE(result, this.bitDepth, 22) // + 2

		// For the 26 byte non-extensible layouts, bytes 24..25 (extension size)
		// are left at their zero-initialized value.

		if (useExtensibleFormat) {
			// Only PCM and Float have a GUID defined in `sampleFormatToGuid`
			if (this.sampleFormat != SampleFormat.PCM && this.sampleFormat != SampleFormat.Float) {
				throw new Error(`Extensible format is not supported for sample format ${this.sampleFormat}`)
			}

			writeUint16LE(result, serializedSize - 26, 24) // + 2 (extension size)
			writeUint16LE(result, this.bitDepth, 26) // + 2 (valid bits per sample)
			writeUint32LE(result, this.speakerPositionMask, 28) // + 4 (speaker position mask)

			result.set(decodeHex(this.guid), 32) // + 16 (sample format GUID)
		}

		return result
	}

	/**
	 * Parses the body of a 'fmt ' subchunk.
	 *
	 * @param formatChunkBody Subchunk body. Should not include the 8 byte subchunk header
	 * @returns The parsed format
	 * @throws Error if the body is too short, or describes an unsupported
	 * format id, GUID or bit depth
	 */
	static deserializeFrom(formatChunkBody: Uint8Array) { // chunkBody should not include the first 8 bytes
		// The fields common to all formats occupy the first 16 bytes of the body
		if (formatChunkBody.length < 16) {
			throw new Error(`Format subchunk body size is ${formatChunkBody.length} bytes, which is smaller than the minimum expected of 16 bytes`)
		}

		let sampleFormat = readUint16LE(formatChunkBody, 0) // + 2
		const channelCount = readUint16LE(formatChunkBody, 2) // + 2
		const sampleRate = readUint32LE(formatChunkBody, 4) // + 4
		// Byte rate (offset 8, 4 bytes) and block align (offset 12, 2 bytes) are derived values, and are not read
		const bitDepth = readUint16LE(formatChunkBody, 14) // + 2
		let speakerPositionMask = 0

		if (sampleFormat == 65534) {
			if (formatChunkBody.length < 40) {
				throw new Error(`Format subchunk specifies a format id of 65534 (extensible) but its body size is ${formatChunkBody.length} bytes, which is smaller than the minimum expected of 40 bytes`)
			}

			// The mask is a 4 byte field (`serialize` writes it using `writeUint32LE`),
			// so it must be read in full, or bits 16 and above would be lost.
			speakerPositionMask = readUint32LE(formatChunkBody, 20)

			const guid = encodeHex(formatChunkBody.subarray(24, 40))

			// In the extensible format, the actual sample format is identified by the GUID
			if (guid == sampleFormatToGuid[SampleFormat.PCM]) {
				sampleFormat = SampleFormat.PCM
			} else if (guid == sampleFormatToGuid[SampleFormat.Float]) {
				sampleFormat = SampleFormat.Float
			} else {
				throw new Error(`Unsupported format GUID in extended format subchunk: ${guid}`)
			}
		}

		if (sampleFormat == SampleFormat.PCM) {
			if (bitDepth != 8 && bitDepth != 16 && bitDepth != 24 && bitDepth != 32) {
				throw new Error(`PCM audio has a bit depth of ${bitDepth}, which is not supported`)
			}
		} else if (sampleFormat == SampleFormat.Float) {
			if (bitDepth != 32 && bitDepth != 64) {
				throw new Error(`IEEE float audio has a bit depth of ${bitDepth}, which is not supported`)
			}
		} else if (sampleFormat == SampleFormat.Alaw) {
			if (bitDepth != 8) {
				throw new Error(`Alaw audio has a bit depth of ${bitDepth}, which is not supported`)
			}
		} else if (sampleFormat == SampleFormat.Mulaw) {
			if (bitDepth != 8) {
				throw new Error(`Mulaw audio has a bit depth of ${bitDepth}, which is not supported`)
			}
		} else {
			throw new Error(`Wave audio format id ${sampleFormat} is not supported`)
		}

		// The checks above guarantee that `bitDepth` is one of the values of `BitDepth`
		return new WaveFormat(channelCount, sampleRate, bitDepth as BitDepth, sampleFormat, speakerPositionMask)
	}
}

/** Sample encodings supported by the codec. Each value is the format id written to the 'fmt ' subchunk. */
export enum SampleFormat {
	PCM = 1,
	Float = 3,
	Alaw = 6,
	Mulaw = 7,
}

/** Bit depths accepted by the codec. */
export type BitDepth = 8 | 16 | 24 | 32 | 64

// Total size, in bytes, of the serialized 'fmt ' subchunk (including its 8 byte header)
// for each format id. 65534 is the extensible format id.
const sampleFormatToSerializedSize = {
	[SampleFormat.PCM]: 24,
	[SampleFormat.Float]: 26,
	[SampleFormat.Alaw]: 26,
	[SampleFormat.Mulaw]: 26,
	65534: 48
}

// Hex encoded GUIDs identifying the sample format within an extensible format subchunk.
// Alaw and Mulaw have no GUID defined here, and can't be used with the extensible format.
const sampleFormatToGuid = {
	[SampleFormat.PCM]: '0100000000001000800000aa00389b71',
	[SampleFormat.Float]: '0300000000001000800000aa00389b71',
	[SampleFormat.Alaw]: '',
	[SampleFormat.Mulaw]: '',
}