mediabunny
Version:
Pure TypeScript media toolkit for reading, writing, and converting media files, directly in the browser.
293 lines (251 loc) • 12.4 kB
text/typescript
/*!
* Copyright (c) 2026-present, Vanilagy and contributors
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
import { assert } from './misc';
import { AudioSample } from './sample';
/**
 * Utility class to handle audio resampling, handling both sample rate resampling as well as channel up/downmixing.
 * The advantage of doing this manually rather than letting OfflineAudioContext do it for us is the artifact-free
 * handling of putting multiple resampled audio samples back to back, which produces flaky results using
 * OfflineAudioContext.
 *
 * Samples are fed in via `add()`. Resampled output is accumulated in a fixed-size interleaved f32 buffer and handed to
 * `onSample` whenever a write lands past the end of that buffer, or when `finalize()` is called.
 */
export class AudioResampler {
    /** Sample rate of the most recently added source sample, or null if no sample has been added yet. */
    sourceSampleRate: number | null = null;
    /** Sample rate of the emitted audio samples. */
    targetSampleRate: number;
    /** Channel count of the most recently added source sample, or null if no sample has been added yet. */
    sourceNumberOfChannels: number | null = null;
    /** Channel count of the emitted audio samples. */
    targetNumberOfChannels: number;
    /** Time, in seconds, past which source audio is no longer resampled. */
    endTime: number;
    /** Callback that receives every finalized output sample. */
    onSample: (sample: AudioSample) => Promise<void>;
    /** Capacity of the output buffer in frames. */
    bufferSizeInFrames: number;
    /** Capacity of the output buffer in individual samples (frames times target channels). */
    bufferSizeInSamples: number;
    /** Interleaved accumulation buffer for the output currently being built. */
    outputBuffer: Float32Array;
    /** Start frame of current buffer */
    bufferStartFrame: number;
    /** The highest index written to in the current buffer */
    maxWrittenFrame: number | null = null;
    /**
     * Computes the value of one target channel for one source frame out of interleaved source data. Set up by
     * `doChannelMixerSetup()`.
     */
    channelMixer!: (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => number;
    /** Scratch buffer that the source sample's data is copied into; allocated on the first call to `add()`. */
    tempSourceBuffer!: Float32Array;
    /** Added to every emitted timestamp; compensates for `bufferStartFrame` being rounded down to a whole frame. */
    timestampOffset: number;

    constructor(options: {
        targetSampleRate: number;
        targetNumberOfChannels: number;
        startTime: number;
        endTime: number;
        onSample: (sample: AudioSample) => Promise<void>;
    }) {
        this.targetSampleRate = options.targetSampleRate;
        this.targetNumberOfChannels = options.targetNumberOfChannels;
        this.endTime = options.endTime;
        this.onSample = options.onSample;

        this.bufferSizeInFrames = Math.floor(this.targetSampleRate * 5.0); // 5 seconds
        this.bufferSizeInSamples = this.bufferSizeInFrames * this.targetNumberOfChannels;
        this.outputBuffer = new Float32Array(this.bufferSizeInSamples);

        this.bufferStartFrame = Math.floor(options.startTime * this.targetSampleRate);
        // Set to ensure that if the buffer start frame lands on a fractional sample, that the first timestamp still
        // comes out as exactly startTime
        this.timestampOffset = options.startTime - this.bufferStartFrame / this.targetSampleRate;
    }

    /**
     * Sets up the channel mixer to handle up/downmixing in the case where input and output channel counts don't match.
     * Requires `sourceNumberOfChannels` to be set.
     */
    doChannelMixerSetup(): void {
        assert(this.sourceNumberOfChannels !== null);

        const sourceNum = this.sourceNumberOfChannels;
        const targetNum = this.targetNumberOfChannels;

        // Logic taken from
        // https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API/Basic_concepts_behind_Web_Audio_API
        //
        // Channels that are defined to be silent return a literal 0 instead of multiplying a source read by 0: the
        // read may be out of bounds (undefined * 0 is NaN), and a non-finite source value times 0 is NaN as well.

        if (sourceNum === 1 && targetNum === 2) {
            // Mono to Stereo: M -> L, M -> R
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number) => {
                return sourceData[sourceFrameIndex]!;
            };
        } else if (sourceNum === 1 && targetNum === 4) {
            // Mono to Quad: M -> L, M -> R, 0 -> SL, 0 -> SR
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
                return targetChannelIndex < 2 ? sourceData[sourceFrameIndex]! : 0;
            };
        } else if (sourceNum === 1 && targetNum === 6) {
            // Mono to 5.1: 0 -> L, 0 -> R, M -> C, 0 -> LFE, 0 -> SL, 0 -> SR
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
                return targetChannelIndex === 2 ? sourceData[sourceFrameIndex]! : 0;
            };
        } else if (sourceNum === 2 && targetNum === 1) {
            // Stereo to Mono: 0.5 * (L + R)
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number) => {
                const baseIdx = sourceFrameIndex * sourceNum;
                return 0.5 * (sourceData[baseIdx]! + sourceData[baseIdx + 1]!);
            };
        } else if (sourceNum === 2 && (targetNum === 4 || targetNum === 6)) {
            // Stereo to Quad: L -> L, R -> R, 0 -> SL, 0 -> SR
            // Stereo to 5.1: L -> L, R -> R, 0 -> C, 0 -> LFE, 0 -> SL, 0 -> SR
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
                // Only L and R exist in the source; indexing with a higher channel would read the next frame, or
                // past the end of the data for the last frame
                return targetChannelIndex < 2
                    ? sourceData[sourceFrameIndex * sourceNum + targetChannelIndex]!
                    : 0;
            };
        } else if (sourceNum === 4 && targetNum === 1) {
            // Quad to Mono: 0.25 * (L + R + SL + SR)
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number) => {
                const baseIdx = sourceFrameIndex * sourceNum;
                return 0.25 * (
                    sourceData[baseIdx]! + sourceData[baseIdx + 1]!
                    + sourceData[baseIdx + 2]! + sourceData[baseIdx + 3]!
                );
            };
        } else if (sourceNum === 4 && targetNum === 2) {
            // Quad to Stereo: 0.5 * (L + SL), 0.5 * (R + SR)
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
                const baseIdx = sourceFrameIndex * sourceNum;
                return 0.5 * (
                    sourceData[baseIdx + targetChannelIndex]!
                    + sourceData[baseIdx + targetChannelIndex + 2]!
                );
            };
        } else if (sourceNum === 4 && targetNum === 6) {
            // Quad to 5.1: L -> L, R -> R, 0 -> C, 0 -> LFE, SL -> SL, SR -> SR
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
                const baseIdx = sourceFrameIndex * sourceNum;

                if (targetChannelIndex < 2) return sourceData[baseIdx + targetChannelIndex]!; // L, R
                if (targetChannelIndex === 2 || targetChannelIndex === 3) return 0; // C, LFE
                return sourceData[baseIdx + targetChannelIndex - 2]!; // SL, SR
            };
        } else if (sourceNum === 6 && targetNum === 1) {
            // 5.1 to Mono: sqrt(1/2) * (L + R) + C + 0.5 * (SL + SR)
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number) => {
                const baseIdx = sourceFrameIndex * sourceNum;
                return Math.SQRT1_2 * (sourceData[baseIdx]! + sourceData[baseIdx + 1]!)
                    + sourceData[baseIdx + 2]!
                    + 0.5 * (sourceData[baseIdx + 4]! + sourceData[baseIdx + 5]!);
            };
        } else if (sourceNum === 6 && targetNum === 2) {
            // 5.1 to Stereo: L + sqrt(1/2) * (C + SL), R + sqrt(1/2) * (C + SR)
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
                const baseIdx = sourceFrameIndex * sourceNum;
                return sourceData[baseIdx + targetChannelIndex]!
                    + Math.SQRT1_2 * (sourceData[baseIdx + 2]! + sourceData[baseIdx + targetChannelIndex + 4]!);
            };
        } else if (sourceNum === 6 && targetNum === 4) {
            // 5.1 to Quad: L + sqrt(1/2) * C, R + sqrt(1/2) * C, SL, SR
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
                const baseIdx = sourceFrameIndex * sourceNum;

                if (targetChannelIndex < 2) {
                    return sourceData[baseIdx + targetChannelIndex]! + Math.SQRT1_2 * sourceData[baseIdx + 2]!;
                }
                return sourceData[baseIdx + targetChannelIndex + 2]!; // SL, SR
            };
        } else {
            // Discrete fallback: direct mapping with zero-fill or drop
            this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
                return targetChannelIndex < sourceNum
                    ? sourceData[sourceFrameIndex * sourceNum + targetChannelIndex]!
                    : 0;
            };
        }
    }

    /**
     * Grows `tempSourceBuffer` (by repeated doubling) until it can hold at least `requiredSamples` samples. The old
     * contents are not carried over, since the buffer is refilled from the source sample before it is read.
     */
    ensureTempBufferSize(requiredSamples: number): void {
        const currentLength = this.tempSourceBuffer.length;
        if (currentLength >= requiredSamples) {
            return;
        }

        // Start from at least 1, otherwise doubling an empty buffer's length would never terminate
        let length = Math.max(currentLength, 1);
        while (length < requiredSamples) {
            length *= 2;
        }

        this.tempSourceBuffer = new Float32Array(length);
    }

    /**
     * Resamples and mixes `audioSample` into the output buffer, based on the sample's timestamp. Output frames that
     * lie before the current buffer's start are skipped; output frames that lie past its end cause the current buffer
     * to be finalized (emitted via `onSample`) first. Overlapping contributions are summed.
     */
    async add(audioSample: AudioSample): Promise<void> {
        if (this.sourceSampleRate === null) {
            // This is the first sample, so pre-allocate the temporary buffer for source data
            this.tempSourceBuffer = new Float32Array(audioSample.sampleRate * audioSample.numberOfChannels);
        }

        // Taking the format from the decoded sample is more reliable than using the file's metadata, because decoders
        // are free to emit any sample rate they see fit. It is tracked per sample so that a change mid-stream doesn't
        // lead to a stale rate or a mixer with the wrong channel stride being used.
        const sourceSampleRate = audioSample.sampleRate;
        this.sourceSampleRate = sourceSampleRate;
        if (this.sourceNumberOfChannels !== audioSample.numberOfChannels) {
            this.sourceNumberOfChannels = audioSample.numberOfChannels;
            this.doChannelMixerSetup();
        }

        // Copy the audio data to the temp buffer, which is sized from the same quantity as the view created over it
        const sourceDataSize = audioSample.allocationSize({ planeIndex: 0, format: 'f32' });
        const numberOfFrames = audioSample.numberOfFrames;
        this.ensureTempBufferSize(Math.max(numberOfFrames * audioSample.numberOfChannels, sourceDataSize / 4));
        const sourceView = new Float32Array(this.tempSourceBuffer.buffer, 0, sourceDataSize / 4);
        audioSample.copyTo(sourceView, { planeIndex: 0, format: 'f32' });

        const inputStartTime = audioSample.timestamp;
        const inputEndTime = Math.min(audioSample.timestamp + audioSample.duration, this.endTime);

        // Compute which output frames are affected by this sample. Frames before the current buffer start are in the
        // past and are never written, so start right after them.
        const outputStartFrame = Math.max(
            Math.floor(inputStartTime * this.targetSampleRate),
            this.bufferStartFrame,
        );
        const outputEndFrame = Math.ceil(inputEndTime * this.targetSampleRate);

        for (let outputFrame = outputStartFrame; outputFrame < outputEndFrame; outputFrame++) {
            while (outputFrame >= this.bufferStartFrame + this.bufferSizeInFrames) {
                // The write is after the current buffer, so finalize it
                await this.finalizeCurrentBuffer();
                this.bufferStartFrame += this.bufferSizeInFrames;
            }

            const bufferFrameIndex = outputFrame - this.bufferStartFrame;
            assert(bufferFrameIndex < this.bufferSizeInFrames);

            const outputTime = outputFrame / this.targetSampleRate;
            const sourcePosition = (outputTime - inputStartTime) * sourceSampleRate;
            const sourceLowerFrame = Math.floor(sourcePosition);
            const sourceUpperFrame = Math.ceil(sourcePosition);
            const fraction = sourcePosition - sourceLowerFrame;

            // Source frames outside of this sample contribute silence
            const hasLower = sourceLowerFrame >= 0 && sourceLowerFrame < numberOfFrames;
            const hasUpper = sourceUpperFrame >= 0 && sourceUpperFrame < numberOfFrames;
            const outputBaseIndex = bufferFrameIndex * this.targetNumberOfChannels;

            // Process each output channel
            for (let targetChannel = 0; targetChannel < this.targetNumberOfChannels; targetChannel++) {
                const lowerSample = hasLower ? this.channelMixer(sourceView, sourceLowerFrame, targetChannel) : 0;
                const upperSample = hasUpper ? this.channelMixer(sourceView, sourceUpperFrame, targetChannel) : 0;

                // For resampling, we do naive linear interpolation to find the in-between sample. This produces
                // suboptimal results especially for downsampling (for which a low-pass filter would first need to be
                // applied), but AudioContext doesn't do this either, so, whatever, for now.
                const outputSample = lowerSample + fraction * (upperSample - lowerSample);

                // Write to output buffer (interleaved); add in case of overlapping samples
                this.outputBuffer[outputBaseIndex + targetChannel]! += outputSample;
            }

            this.maxWrittenFrame = this.maxWrittenFrame === null
                ? bufferFrameIndex
                : Math.max(this.maxWrittenFrame, bufferFrameIndex);
        }
    }

    /**
     * Emits everything written to the current buffer (up to and including `maxWrittenFrame`) as one f32 audio sample
     * via `onSample`, then clears the buffer. Does nothing if nothing was written since the last finalization.
     */
    async finalizeCurrentBuffer(): Promise<void> {
        if (this.maxWrittenFrame === null) {
            return; // Nothing to finalize
        }

        const samplesWritten = (this.maxWrittenFrame + 1) * this.targetNumberOfChannels;
        // Copy, since the output buffer is reused for the next stretch of audio
        const outputData = this.outputBuffer.slice(0, samplesWritten);

        const timestampSeconds = this.bufferStartFrame / this.targetSampleRate;
        const audioSample = new AudioSample({
            format: 'f32',
            sampleRate: this.targetSampleRate,
            numberOfChannels: this.targetNumberOfChannels,
            timestamp: timestampSeconds + this.timestampOffset,
            data: outputData,
        });

        await this.onSample(audioSample);

        this.outputBuffer.fill(0);
        this.maxWrittenFrame = null;
    }

    /** Flushes any remaining buffered output to `onSample`. */
    finalize(): Promise<void> {
        return this.finalizeCurrentBuffer();
    }
}