mediabunny
Version:
Pure TypeScript media toolkit for reading, writing, and converting media files, directly in the browser.
1,396 lines (1,189 loc) • 47 kB
text/typescript
/*!
* Copyright (c) 2025-present, Vanilagy and contributors
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
import {
AUDIO_CODECS,
AudioCodec,
getFirstEncodableVideoCodec,
getEncodableAudioCodecs,
NON_PCM_AUDIO_CODECS,
Quality,
QUALITY_HIGH,
VIDEO_CODECS,
VideoCodec,
} from './codec';
import { Input } from './input';
import { InputAudioTrack, InputTrack, InputVideoTrack } from './input-track';
import {
AudioSampleSink,
CanvasSink,
EncodedPacketSink,
VideoSampleSink,
} from './media-sink';
import {
AudioEncodingConfig,
AudioSource,
EncodedVideoPacketSource,
EncodedAudioPacketSource,
VideoEncodingConfig,
VideoSource,
VideoSampleSource,
AudioSampleSource,
} from './media-source';
import { assert, clamp, MaybePromise, normalizeRotation, promiseWithResolvers, Rotation } from './misc';
import { Output, TrackType } from './output';
import { AudioSample, VideoSample } from './sample';
/**
* Video-specific options. Every field is optional. Invalid values are rejected with a `TypeError` when the
* conversion is initialized.
* @public
*/
export type ConversionVideoOptions = {
/**
* If true, the video track(s) these options apply to will be discarded and will not be present in the output.
* Such tracks are listed in `Conversion.discardedTracks` with the reason `'discarded_by_user'`.
*/
discard?: boolean;
/**
* The desired width of the output video in pixels, defaulting to the video's natural display width. If height
* is not set, it will be deduced automatically based on aspect ratio. Must be a positive integer; the value is
* rounded up to the next multiple of two before encoding.
*/
width?: number;
/**
* The desired height of the output video in pixels, defaulting to the video's natural display height. If width
* is not set, it will be deduced automatically based on aspect ratio. Must be a positive integer; the value is
* rounded up to the next multiple of two before encoding.
*/
height?: number;
/**
* The fitting algorithm in case both width and height are set. Required whenever both `width` and `height` are
* provided.
*
* - 'fill' will stretch the image to fill the entire box, potentially altering aspect ratio.
* - 'contain' will contain the entire image within the box while preserving aspect ratio. This may lead to
* letterboxing.
* - 'cover' will scale the image until the entire box is filled, while preserving aspect ratio.
*/
fit?: 'fill' | 'contain' | 'cover';
/**
* The angle in degrees to rotate the input video by, clockwise. Must be 0, 90, 180 or 270. Rotation is applied
* before resizing. This rotation is _in addition to_ the natural rotation of the input video as specified in
* input file's metadata.
*/
rotate?: Rotation;
/**
* The desired frame rate of the output video, in hertz. Must be a finite positive number. If not specified, the
* original input frame rate will be used (which may be variable). Setting a frame rate always causes the video
* to be re-encoded.
*/
frameRate?: number;
/** The desired output video codec. Must be one of the codecs listed in `VIDEO_CODECS`. */
codec?: VideoCodec;
/**
* The desired bitrate of the output video, either a positive integer or a `Quality`. Setting a bitrate always
* causes the video to be re-encoded. When the video is re-encoded without a bitrate, `QUALITY_HIGH` is used.
*/
bitrate?: VideoEncodingConfig['bitrate'];
/** When true, video will always be re-encoded instead of directly copying over the encoded samples. */
forceTranscode?: boolean;
};
/**
* Audio-specific options. Every field is optional. Invalid values are rejected with a `TypeError` when the
* conversion is initialized.
* @public
*/
export type ConversionAudioOptions = {
/**
* If true, the audio track(s) these options apply to will be discarded and will not be present in the output.
* Such tracks are listed in `Conversion.discardedTracks` with the reason `'discarded_by_user'`.
*/
discard?: boolean;
/**
* The desired channel count of the output audio. Must be a positive integer. Defaults to the channel count of
* the input track.
*/
numberOfChannels?: number;
/**
* The desired sample rate of the output audio, in hertz. Must be a positive integer. Defaults to the sample rate
* of the input track.
*/
sampleRate?: number;
/** The desired output audio codec. Must be one of the codecs listed in `AUDIO_CODECS`. */
codec?: AudioCodec;
/**
* The desired bitrate of the output audio, either a positive integer or a `Quality`. Setting a bitrate always
* causes the audio to be re-encoded. When the audio is re-encoded without a bitrate, `QUALITY_HIGH` is used.
*/
bitrate?: AudioEncodingConfig['bitrate'];
/** When true, audio will always be re-encoded instead of directly copying over the encoded samples. */
forceTranscode?: boolean;
};
/**
 * Checks user-supplied video options and throws a `TypeError` naming the first offending field. Passing
 * `undefined` means "no options" and is always accepted.
 */
const validateVideoOptions = (videoOptions: ConversionVideoOptions | undefined) => {
	if (videoOptions === undefined) {
		return;
	}
	if (videoOptions === null || typeof videoOptions !== 'object') {
		throw new TypeError('options.video, when provided, must be an object.');
	}

	const isPositiveInteger = (value: unknown) => Number.isInteger(value) && (value as number) > 0;
	const isPositiveFinite = (value: unknown) => Number.isFinite(value) && (value as number) > 0;

	const { discard, forceTranscode, codec, bitrate, width, height, fit, rotate, frameRate } = videoOptions;

	// Flags
	if (discard !== undefined && typeof discard !== 'boolean') {
		throw new TypeError('options.video.discard, when provided, must be a boolean.');
	}
	if (forceTranscode !== undefined && typeof forceTranscode !== 'boolean') {
		throw new TypeError('options.video.forceTranscode, when provided, must be a boolean.');
	}

	// Encoding parameters
	if (codec !== undefined && !VIDEO_CODECS.includes(codec)) {
		throw new TypeError(
			`options.video.codec, when provided, must be one of: ${VIDEO_CODECS.join(', ')}.`,
		);
	}
	if (bitrate !== undefined && !(bitrate instanceof Quality) && !isPositiveInteger(bitrate)) {
		throw new TypeError('options.video.bitrate, when provided, must be a positive integer or a quality.');
	}

	// Geometry
	if (width !== undefined && !isPositiveInteger(width)) {
		throw new TypeError('options.video.width, when provided, must be a positive integer.');
	}
	if (height !== undefined && !isPositiveInteger(height)) {
		throw new TypeError('options.video.height, when provided, must be a positive integer.');
	}
	if (fit !== undefined && !['fill', 'contain', 'cover'].includes(fit)) {
		throw new TypeError('options.video.fit, when provided, must be one of "fill", "contain", or "cover".');
	}
	if (width !== undefined && height !== undefined && fit === undefined) {
		// With a fully specified box there is no sensible default for how to fit the picture into it
		throw new TypeError(
			'When both options.video.width and options.video.height are provided, options.video.fit must also be'
			+ ' provided.',
		);
	}
	if (rotate !== undefined && ![0, 90, 180, 270].includes(rotate)) {
		throw new TypeError('options.video.rotate, when provided, must be 0, 90, 180 or 270.');
	}

	// Timing
	if (frameRate !== undefined && !isPositiveFinite(frameRate)) {
		throw new TypeError('options.video.frameRate, when provided, must be a finite positive number.');
	}
};
/**
* The options for media file conversion, as passed to `Conversion.init`.
* @public
*/
export type ConversionOptions = {
/** The input file. */
input: Input;
/** The output file. It must be fresh: no tracks may have been added to it and it must not have been started. */
output: Output;
/**
* Video-specific options. When passing an object, the same options are applied to all video tracks. When passing a
* function, it will be invoked for each video track and is expected to return or resolve to the options
* for that specific track. The function is passed an instance of `InputVideoTrack` as well as a number `n`, which
* is the 1-based index of the track in the list of all video tracks. The value produced by the function is
* validated the same way an options object would be.
*/
video?: ConversionVideoOptions
| ((track: InputVideoTrack, n: number) => MaybePromise<ConversionVideoOptions | undefined>);
/**
* Audio-specific options. When passing an object, the same options are applied to all audio tracks. When passing a
* function, it will be invoked for each audio track and is expected to return or resolve to the options
* for that specific track. The function is passed an instance of `InputAudioTrack` as well as a number `n`, which
* is the 1-based index of the track in the list of all audio tracks. The value produced by the function is
* validated the same way an options object would be.
*/
audio?: ConversionAudioOptions
| ((track: InputAudioTrack, n: number) => MaybePromise<ConversionAudioOptions | undefined>);
/** Options to trim the input file. Both times must be finite and non-negative. */
trim?: {
/** The time in the input file in seconds at which the output file should start. Must be less than `end`. */
start: number;
/** The time in the input file in seconds at which the output file should end. Must be greater than `start`. */
end: number;
};
};
/**
 * Checks user-supplied audio options and throws a `TypeError` naming the first offending field. Passing
 * `undefined` means "no options" and is always accepted.
 */
const validateAudioOptions = (audioOptions: ConversionAudioOptions | undefined) => {
	if (audioOptions === undefined) {
		return;
	}
	if (audioOptions === null || typeof audioOptions !== 'object') {
		throw new TypeError('options.audio, when provided, must be an object.');
	}

	const isPositiveInteger = (value: unknown) => Number.isInteger(value) && (value as number) > 0;

	const { discard, forceTranscode, codec, bitrate, numberOfChannels, sampleRate } = audioOptions;

	// Flags
	if (discard !== undefined && typeof discard !== 'boolean') {
		throw new TypeError('options.audio.discard, when provided, must be a boolean.');
	}
	if (forceTranscode !== undefined && typeof forceTranscode !== 'boolean') {
		throw new TypeError('options.audio.forceTranscode, when provided, must be a boolean.');
	}

	// Encoding parameters
	if (codec !== undefined && !AUDIO_CODECS.includes(codec)) {
		throw new TypeError(
			`options.audio.codec, when provided, must be one of: ${AUDIO_CODECS.join(', ')}.`,
		);
	}
	if (bitrate !== undefined && !(bitrate instanceof Quality) && !isPositiveInteger(bitrate)) {
		throw new TypeError('options.audio.bitrate, when provided, must be a positive integer or a quality.');
	}

	// Signal format
	if (numberOfChannels !== undefined && !isPositiveInteger(numberOfChannels)) {
		throw new TypeError('options.audio.numberOfChannels, when provided, must be a positive integer.');
	}
	if (sampleRate !== undefined && !isPositiveInteger(sampleRate)) {
		throw new TypeError('options.audio.sampleRate, when provided, must be a positive integer.');
	}
};
// Audio parameters that are retried when no non-PCM codec can be encoded with the track's requested channel count
// and sample rate. Choosing them forces the track to be resampled.
const FALLBACK_NUMBER_OF_CHANNELS = 2;
const FALLBACK_SAMPLE_RATE = 48000;
/**
* Represents a media file conversion process, used to convert one media file into another. In addition to conversion,
* this class can be used to resize and rotate video, resample audio, drop tracks, or trim to a specific time range.
* @public
*/
export class Conversion {
/** The input file. */
readonly input: Input;
/** The output file. */
readonly output: Output;
/**
* The options this conversion was created with.
* @internal
*/
_options: ConversionOptions;
/**
* Start of the converted range in input time, in seconds. 0 when no trim start was given.
* @internal
*/
_startTimestamp: number;
/**
* End of the converted range in input time, in seconds. `Infinity` when no trim end was given.
* @internal
*/
_endTimestamp: number;
/**
* Number of tracks added to the output so far, per track type. Compared against the output format's per-type limit.
* @internal
*/
_addedCounts: Record<TrackType, number> = {
video: 0,
audio: 0,
subtitle: 0,
};
/**
* Number of tracks added to the output so far, across all types. Also the divisor used when averaging progress.
* @internal
*/
_totalTrackCount = 0;
/**
* One promise per utilized track; each settles when that track's data has been fully fed into the output.
* `execute` awaits all of them.
* @internal
*/
_trackPromises: Promise<void>[] = [];
/**
* Resolves once `execute` has started the output. Every track task awaits it before reading any media data.
* @internal
*/
_started: Promise<void>;
/**
* Resolver for `_started`.
* @internal
*/
_start: () => void;
/**
* Set by the first call to `execute`; a second call throws.
* @internal
*/
_executed = false;
/**
* Keeps the per-track tasks from drifting too far apart in time.
* @internal
*/
_synchronizer = new TrackSynchronizer();
/**
* Duration in seconds that progress is measured against. Stays null unless `onProgress` is set when `execute` is
* called.
* @internal
*/
_totalDuration: number | null = null;
/**
* Highest end timestamp reported so far for each track; the basis for the progress computation.
* @internal
*/
_maxTimestamps = new Map<number, number>(); // Track ID -> timestamp
/**
* Set by `cancel`. Track tasks check it between samples and stop early when it is true.
* @internal
*/
_canceled = false;
/**
* A callback that is fired whenever the conversion progresses. It receives a number between 0 and 1, indicating the
* completion of the conversion. Note that a progress of 1 doesn't necessarily mean the conversion is complete;
* the conversion is complete once `execute` resolves.
*
* In order for progress to be computed, this property must be set before `execute` is called.
*/
onProgress?: (progress: number) => unknown = undefined;
/**
* Whether progress is being tracked; decided once at the start of `execute`.
* @internal
*/
_computeProgress = false;
/**
* The progress value most recently passed to `onProgress`, used to suppress duplicate notifications.
* @internal
*/
_lastProgress = 0;
/** The list of tracks that are included in the output file. */
readonly utilizedTracks: InputTrack[] = [];
/** The list of tracks from the input file that have been discarded, alongside the discard reason. */
readonly discardedTracks: {
/** The track that was discarded. */
track: InputTrack;
/** The reason for discarding the track. */
reason:
| 'discarded_by_user'
| 'max_track_count_reached'
| 'max_track_count_of_type_reached'
| 'unknown_source_codec'
| 'undecodable_source_codec'
| 'no_encodable_target_codec';
}[] = [];
/**
 * Initializes a new conversion process without starting the conversion. The options are validated and every
 * input track is examined; the returned promise resolves with the ready-to-execute conversion.
 */
static async init(options: ConversionOptions) {
	const instance = new Conversion(options);
	await instance._init();

	return instance;
}
/**
 * Validates the options and stores them. No media data is touched here; track analysis happens in `_init`.
 * Throws a `TypeError` for the first invalid option encountered.
 */
private constructor(options: ConversionOptions) {
	if (!options || typeof options !== 'object') {
		throw new TypeError('options must be an object.');
	}

	const { input, output, trim } = options;

	if (!(input instanceof Input)) {
		throw new TypeError('options.input must be an Input.');
	}
	if (!(output instanceof Output)) {
		throw new TypeError('options.output must be an Output.');
	}

	const outputIsFresh = output._tracks.length === 0 && output.state === 'pending';
	if (!outputIsFresh) {
		throw new TypeError('options.output must be fresh: no tracks added and not started.');
	}

	// Option callbacks can't be checked up front; whatever they return is validated per track later on
	if (typeof options.video !== 'function') {
		validateVideoOptions(options.video);
	}
	if (typeof options.audio !== 'function') {
		validateAudioOptions(options.audio);
	}

	if (trim !== undefined) {
		if (!trim || typeof trim !== 'object') {
			throw new TypeError('options.trim, when provided, must be an object.');
		}

		const { start: trimStart, end: trimEnd } = trim;
		const isNonNegativeFinite = (value: unknown) => Number.isFinite(value) && (value as number) >= 0;

		if (trimStart !== undefined && !isNonNegativeFinite(trimStart)) {
			throw new TypeError('options.trim.start, when provided, must be a non-negative number.');
		}
		if (trimEnd !== undefined && !isNonNegativeFinite(trimEnd)) {
			throw new TypeError('options.trim.end, when provided, must be a non-negative number.');
		}
		if (trimStart !== undefined && trimEnd !== undefined && trimStart >= trimEnd) {
			throw new TypeError('options.trim.start must be less than options.trim.end.');
		}
	}

	this._options = options;
	this.input = input;
	this.output = output;

	this._startTimestamp = trim?.start ?? 0;
	this._endTimestamp = trim?.end ?? Infinity;

	// Track tasks are created during init but must not run before `execute` has started the output
	const startGate = promiseWithResolvers();
	this._started = startGate.promise;
	this._start = startGate.resolve;
}
/**
 * Goes through every track of the input, resolves the options that apply to it and either hands it to the
 * matching track processor or records it in `discardedTracks`. Tracks that were dropped for any reason other
 * than the user asking for it are reported with a console warning.
 * @internal
 */
async _init() {
	const inputTracks = await this.input.getTracks();
	const supportedCounts = this.output.format.getSupportedTrackCounts();

	const discard = (track: InputTrack, reason: Conversion['discardedTracks'][number]['reason']) => {
		this.discardedTracks.push({ track, reason });
	};

	// 1-based running indices handed to the per-track option callbacks
	let videoIndex = 1;
	let audioIndex = 1;

	for (const track of inputTracks) {
		let trackOptions: ConversionVideoOptions | ConversionAudioOptions | undefined = undefined;

		if (track.isVideoTrack()) {
			if (typeof this._options.video === 'function') {
				trackOptions = await this._options.video(track, videoIndex);
				validateVideoOptions(trackOptions);
				videoIndex++;
			} else if (this._options.video) {
				trackOptions = this._options.video;
			}
		} else if (track.isAudioTrack()) {
			if (typeof this._options.audio === 'function') {
				trackOptions = await this._options.audio(track, audioIndex);
				validateAudioOptions(trackOptions);
				audioIndex++;
			} else if (this._options.audio) {
				trackOptions = this._options.audio;
			}
		} else {
			assert(false);
		}

		if (trackOptions?.discard) {
			discard(track, 'discarded_by_user');
			continue;
		}

		// Respect what the output container is able to hold
		if (this._totalTrackCount === supportedCounts.total.max) {
			discard(track, 'max_track_count_reached');
			continue;
		}
		if (this._addedCounts[track.type] === supportedCounts[track.type].max) {
			discard(track, 'max_track_count_of_type_reached');
			continue;
		}

		if (track.isVideoTrack()) {
			await this._processVideoTrack(track, (trackOptions ?? {}) as ConversionVideoOptions);
		} else if (track.isAudioTrack()) {
			await this._processAudioTrack(track, (trackOptions ?? {}) as ConversionAudioOptions);
		}
	}

	const unintentionallyDiscardedTracks = this.discardedTracks.filter(
		entry => entry.reason !== 'discarded_by_user',
	);
	if (unintentionallyDiscardedTracks.length === 0) {
		return;
	}

	// Let's give the user a notice/warning about discarded tracks so they aren't confused
	console.warn('Some tracks had to be discarded from the conversion:', unintentionallyDiscardedTracks);
}
/**
 * Executes the conversion process. Resolves once conversion is complete. Throws if called more than once. If a
 * track fails, the conversion is canceled and the error is rethrown; if the conversion is canceled by the user,
 * the returned promise never settles.
 */
async execute() {
	if (this._executed) {
		throw new Error('Conversion cannot be executed twice.');
	}
	this._executed = true;

	if (this.onProgress) {
		this._computeProgress = true;

		// Progress is measured against whichever is shorter: what is left of the input or the trimmed range
		const inputDuration = await this.input.computeDuration();
		const remainingInput = inputDuration - this._startTimestamp;
		const trimmedRange = this._endTimestamp - this._startTimestamp;
		this._totalDuration = Math.min(remainingInput, trimmedRange);

		this.onProgress?.(0);
	}

	await this.output.start();
	this._start(); // Releases all track tasks

	try {
		await Promise.all(this._trackPromises);
	} catch (error) {
		// Make sure to cancel to stop other encoding processes and clean up resources
		if (!this._canceled) {
			void this.cancel();
		}

		throw error;
	}

	if (this._canceled) {
		// A canceled conversion must not look like a completed one, so this intentionally never resolves
		await new Promise(() => {});
	}

	await this.output.finalize();

	if (this._computeProgress) {
		this.onProgress?.(1);
	}
}
/**
 * Cancels the conversion process. Does nothing if the conversion is already complete or being finalized. Calling
 * it again after a successful cancel only logs a warning.
 */
async cancel() {
	const outputState = this.output.state;
	const alreadyWrappingUp = outputState === 'finalizing' || outputState === 'finalized';
	if (alreadyWrappingUp) {
		return;
	}

	if (this._canceled) {
		console.warn('Conversion already canceled.');
		return;
	}

	// Set the flag first so that running track tasks stop at their next check
	this._canceled = true;
	await this.output.cancel();
}
/**
 * Decides how a single input video track reaches the output, queues the task that moves its data and registers
 * the track on the output.
 *
 * Three strategies exist:
 * - Copy: the encoded packets are passed through untouched. Only possible when nothing about the video changes
 *   (no trim start, no negative first timestamp, no frame rate, bitrate, size or codec change, and rotation can
 *   be expressed by the output).
 * - Re-encode via `CanvasSink`: used when frames must be redrawn (resize, or rotation the output can't store).
 * - Re-encode via `VideoSampleSink`: used for every other re-encode.
 *
 * The track ends up in `discardedTracks` instead if its codec is unknown, it can't be decoded, or no target codec
 * can be encoded.
 *
 * @param track - The input video track to process.
 * @param trackOptions - The already validated options that apply to this track.
 * @internal
 */
async _processVideoTrack(track: InputVideoTrack, trackOptions: ConversionVideoOptions) {
	const sourceCodec = track.codec;
	if (!sourceCodec) {
		this.discardedTracks.push({
			track,
			reason: 'unknown_source_codec',
		});
		return;
	}

	let videoSource: VideoSource;

	const totalRotation = normalizeRotation(track.rotation + (trackOptions.rotate ?? 0));
	const outputSupportsRotation = this.output.format.supportsVideoRotationMetadata;

	// Dimensions as displayed, i.e. with the total rotation applied
	const [originalWidth, originalHeight] = totalRotation % 180 === 0
		? [track.codedWidth, track.codedHeight]
		: [track.codedHeight, track.codedWidth];

	let width = originalWidth;
	let height = originalHeight;
	const aspectRatio = width / height;

	// A lot of video encoders require that the dimensions be multiples of 2
	const ceilToMultipleOfTwo = (value: number) => Math.ceil(value / 2) * 2;

	if (trackOptions.width !== undefined && trackOptions.height === undefined) {
		width = ceilToMultipleOfTwo(trackOptions.width);
		height = ceilToMultipleOfTwo(Math.round(width / aspectRatio));
	} else if (trackOptions.width === undefined && trackOptions.height !== undefined) {
		height = ceilToMultipleOfTwo(trackOptions.height);
		width = ceilToMultipleOfTwo(Math.round(height * aspectRatio));
	} else if (trackOptions.width !== undefined && trackOptions.height !== undefined) {
		width = ceilToMultipleOfTwo(trackOptions.width);
		height = ceilToMultipleOfTwo(trackOptions.height);
	}

	const firstTimestamp = await track.getFirstTimestamp();
	const needsTranscode = !!trackOptions.forceTranscode
		|| this._startTimestamp > 0
		|| firstTimestamp < 0
		|| !!trackOptions.frameRate;
	const needsRerender = width !== originalWidth
		|| height !== originalHeight
		|| (totalRotation !== 0 && !outputSupportsRotation);

	let videoCodecs = this.output.format.getSupportedVideoCodecs();
	if (
		!needsTranscode
		&& !trackOptions.bitrate
		&& !needsRerender
		&& videoCodecs.includes(sourceCodec)
		&& (!trackOptions.codec || trackOptions.codec === sourceCodec)
	) {
		// Fast path, we can simply copy over the encoded packets

		const source = new EncodedVideoPacketSource(sourceCodec);
		videoSource = source;

		this._trackPromises.push((async () => {
			await this._started;

			const sink = new EncodedPacketSink(track);
			const decoderConfig = await track.getDecoderConfig();
			const meta: EncodedVideoChunkMetadata = { decoderConfig: decoderConfig ?? undefined };
			const endPacket = Number.isFinite(this._endTimestamp)
				? await sink.getPacket(this._endTimestamp, { metadataOnly: true }) ?? undefined
				: undefined;

			for await (const packet of sink.packets(undefined, endPacket, { verifyKeyPackets: true })) {
				if (this._synchronizer.shouldWait(track.id, packet.timestamp)) {
					await this._synchronizer.wait(packet.timestamp);
				}

				if (this._canceled) {
					return;
				}

				await source.add(packet, meta);
				this._reportProgress(track.id, packet.timestamp + packet.duration);
			}

			source.close();
			this._synchronizer.closeTrack(track.id);
		})());
	} else {
		// We need to decode & reencode the video

		const canDecode = await track.canDecode();
		if (!canDecode) {
			this.discardedTracks.push({
				track,
				reason: 'undecodable_source_codec',
			});
			return;
		}

		if (trackOptions.codec) {
			videoCodecs = videoCodecs.filter(codec => codec === trackOptions.codec);
		}

		const bitrate = trackOptions.bitrate ?? QUALITY_HIGH;

		const encodableCodec = await getFirstEncodableVideoCodec(videoCodecs, { width, height, bitrate });
		if (!encodableCodec) {
			this.discardedTracks.push({
				track,
				reason: 'no_encodable_target_codec',
			});
			return;
		}

		const encodingConfig: VideoEncodingConfig = {
			codec: encodableCodec,
			bitrate,
			onEncodedPacket: sample => this._reportProgress(track.id, sample.timestamp + sample.duration),
		};

		const source = new VideoSampleSource(encodingConfig);
		videoSource = source;

		if (needsRerender) {
			this._trackPromises.push((async () => {
				await this._started;

				const sink = new CanvasSink(track, {
					width,
					height,
					fit: trackOptions.fit ?? 'fill',
					rotation: totalRotation, // Bake the rotation into the output
					poolSize: 1,
				});
				const iterator = sink.canvases(this._startTimestamp, this._endTimestamp);
				const frameRate = trackOptions.frameRate;

				// All three values below live on the output timeline (trim start already subtracted)
				let lastCanvas: HTMLCanvasElement | OffscreenCanvas | null = null;
				let lastCanvasTimestamp: number | null = null;
				let lastCanvasEndTimestamp: number | null = null;

				/** Repeats the last sample to pad out the time until the specified timestamp. */
				const padFrames = async (until: number) => {
					assert(lastCanvas);
					assert(frameRate !== undefined);

					const frameDifference = Math.round((until - lastCanvasTimestamp!) * frameRate);

					for (let i = 1; i < frameDifference; i++) {
						const sample = new VideoSample(lastCanvas, {
							timestamp: lastCanvasTimestamp! + i / frameRate,
							duration: 1 / frameRate,
						});
						await source.add(sample);
					}
				};

				for await (const { canvas, timestamp, duration } of iterator) {
					if (this._synchronizer.shouldWait(track.id, timestamp)) {
						await this._synchronizer.wait(timestamp);
					}

					if (this._canceled) {
						return;
					}

					let adjustedSampleTimestamp = Math.max(timestamp - this._startTimestamp, 0);
					// Must be on the same (output) timeline as lastCanvasTimestamp, otherwise the final padding
					// would add trim-start seconds worth of duplicate frames
					lastCanvasEndTimestamp = timestamp + duration - this._startTimestamp;

					if (frameRate !== undefined) {
						// Logic for skipping/repeating frames when a frame rate is set
						const alignedTimestamp = Math.floor(adjustedSampleTimestamp * frameRate) / frameRate;

						if (lastCanvas !== null) {
							if (alignedTimestamp <= lastCanvasTimestamp!) {
								lastCanvas = canvas;
								lastCanvasTimestamp = alignedTimestamp;

								// Skip this sample, since we already added one for this frame
								continue;
							} else {
								// Check if we may need to repeat the previous frame
								await padFrames(alignedTimestamp);
							}
						}

						adjustedSampleTimestamp = alignedTimestamp;
					}

					const sample = new VideoSample(canvas, {
						timestamp: adjustedSampleTimestamp,
						duration: frameRate !== undefined ? 1 / frameRate : duration,
					});

					await source.add(sample);

					if (frameRate !== undefined) {
						lastCanvas = canvas;
						lastCanvasTimestamp = adjustedSampleTimestamp;
					} else {
						sample.close();
					}
				}

				if (lastCanvas) {
					assert(lastCanvasEndTimestamp !== null);
					assert(frameRate !== undefined);

					// If necessary, pad until the end timestamp of the last sample
					await padFrames(Math.floor(lastCanvasEndTimestamp * frameRate) / frameRate);
				}

				source.close();
				this._synchronizer.closeTrack(track.id);
			})());
		} else {
			this._trackPromises.push((async () => {
				await this._started;

				const sink = new VideoSampleSink(track);
				const frameRate = trackOptions.frameRate;

				// All three values below live on the output timeline (trim start already subtracted)
				let lastSample: VideoSample | null = null;
				let lastSampleTimestamp: number | null = null;
				let lastSampleEndTimestamp: number | null = null;

				/** Repeats the last sample to pad out the time until the specified timestamp. */
				const padFrames = async (until: number) => {
					assert(lastSample);
					assert(frameRate !== undefined);

					const frameDifference = Math.round((until - lastSampleTimestamp!) * frameRate);

					for (let i = 1; i < frameDifference; i++) {
						lastSample.setTimestamp(lastSampleTimestamp! + i / frameRate);
						lastSample.setDuration(1 / frameRate);
						await source.add(lastSample);
					}

					lastSample.close();
				};

				for await (const sample of sink.samples(this._startTimestamp, this._endTimestamp)) {
					if (this._synchronizer.shouldWait(track.id, sample.timestamp)) {
						await this._synchronizer.wait(sample.timestamp);
					}

					if (this._canceled) {
						// Release both the sample we were just handed and the one kept around for padding
						sample.close();
						lastSample?.close();
						return;
					}

					let adjustedSampleTimestamp = Math.max(sample.timestamp - this._startTimestamp, 0);
					// Must be on the same (output) timeline as lastSampleTimestamp, otherwise the final padding
					// would add trim-start seconds worth of duplicate frames
					lastSampleEndTimestamp = sample.timestamp + sample.duration - this._startTimestamp;

					if (frameRate !== undefined) {
						// Logic for skipping/repeating frames when a frame rate is set
						const alignedTimestamp = Math.floor(adjustedSampleTimestamp * frameRate) / frameRate;

						if (lastSample !== null) {
							if (alignedTimestamp <= lastSampleTimestamp!) {
								lastSample.close();
								lastSample = sample;
								lastSampleTimestamp = alignedTimestamp;

								// Skip this sample, since we already added one for this frame
								continue;
							} else {
								// Check if we may need to repeat the previous frame
								await padFrames(alignedTimestamp);
							}
						}

						adjustedSampleTimestamp = alignedTimestamp;
						sample.setDuration(1 / frameRate);
					}

					sample.setTimestamp(adjustedSampleTimestamp);

					await source.add(sample);

					if (frameRate !== undefined) {
						// Keep the sample alive; it may have to be repeated to fill a gap
						lastSample = sample;
						lastSampleTimestamp = adjustedSampleTimestamp;
					} else {
						sample.close();
					}
				}

				if (lastSample) {
					assert(lastSampleEndTimestamp !== null);
					assert(frameRate !== undefined);

					// If necessary, pad until the end timestamp of the last sample
					await padFrames(Math.floor(lastSampleEndTimestamp * frameRate) / frameRate);
				}

				source.close();
				this._synchronizer.closeTrack(track.id);
			})());
		}
	}

	this.output.addVideoTrack(videoSource, {
		frameRate: trackOptions.frameRate,
		languageCode: track.languageCode,
		rotation: needsRerender ? 0 : totalRotation, // Rerendering will bake the rotation into the output
	});
	this._addedCounts.video++;
	this._totalTrackCount++;

	this.utilizedTracks.push(track);
}
/**
 * Decides how a single input audio track reaches the output, queues the task that moves its data and registers
 * the track on the output.
 *
 * Three strategies exist:
 * - Copy: the encoded packets are passed through untouched. Only possible when nothing about the audio changes
 *   (no trim start, no negative first timestamp, no channel count, sample rate, bitrate or codec change).
 * - Re-encode: samples are decoded and encoded again with unchanged channel count and sample rate.
 * - Resample and re-encode: samples additionally pass through `_resampleAudio`.
 *
 * When the container supports non-PCM codecs but none can be encoded with the requested parameters, encoding is
 * retried with 2 channels at 48000 Hz. If that fails as well, any codec that is encodable with the requested
 * parameters is used. The track ends up in `discardedTracks` if its codec is unknown, it can't be decoded, or no
 * target codec can be encoded at all.
 *
 * @param track - The input audio track to process.
 * @param trackOptions - The already validated options that apply to this track.
 * @internal
 */
async _processAudioTrack(track: InputAudioTrack, trackOptions: ConversionAudioOptions) {
	const sourceCodec = track.codec;
	if (!sourceCodec) {
		this.discardedTracks.push({
			track,
			reason: 'unknown_source_codec',
		});
		return;
	}

	let audioSource: AudioSource;

	const originalNumberOfChannels = track.numberOfChannels;
	const originalSampleRate = track.sampleRate;

	const firstTimestamp = await track.getFirstTimestamp();

	let numberOfChannels = trackOptions.numberOfChannels ?? originalNumberOfChannels;
	let sampleRate = trackOptions.sampleRate ?? originalSampleRate;
	let needsResample = numberOfChannels !== originalNumberOfChannels
		|| sampleRate !== originalSampleRate
		|| this._startTimestamp > 0
		|| firstTimestamp < 0;

	let audioCodecs = this.output.format.getSupportedAudioCodecs();
	if (
		!trackOptions.forceTranscode
		&& !trackOptions.bitrate
		&& !needsResample
		&& audioCodecs.includes(sourceCodec)
		&& (!trackOptions.codec || trackOptions.codec === sourceCodec)
	) {
		// Fast path, we can simply copy over the encoded packets

		const source = new EncodedAudioPacketSource(sourceCodec);
		audioSource = source;

		this._trackPromises.push((async () => {
			await this._started;

			const sink = new EncodedPacketSink(track);
			const decoderConfig = await track.getDecoderConfig();
			const meta: EncodedAudioChunkMetadata = { decoderConfig: decoderConfig ?? undefined };
			const endPacket = Number.isFinite(this._endTimestamp)
				? await sink.getPacket(this._endTimestamp, { metadataOnly: true }) ?? undefined
				: undefined;

			for await (const packet of sink.packets(undefined, endPacket)) {
				if (this._synchronizer.shouldWait(track.id, packet.timestamp)) {
					await this._synchronizer.wait(packet.timestamp);
				}

				if (this._canceled) {
					return;
				}

				await source.add(packet, meta);
				this._reportProgress(track.id, packet.timestamp + packet.duration);
			}

			source.close();
			this._synchronizer.closeTrack(track.id);
		})());
	} else {
		// We need to decode & reencode the audio

		const canDecode = await track.canDecode();
		if (!canDecode) {
			this.discardedTracks.push({
				track,
				reason: 'undecodable_source_codec',
			});
			return;
		}

		let codecOfChoice: AudioCodec | null = null;

		if (trackOptions.codec) {
			audioCodecs = audioCodecs.filter(codec => codec === trackOptions.codec);
		}

		const bitrate = trackOptions.bitrate ?? QUALITY_HIGH;

		const isNonPcm = (codec: AudioCodec) => (NON_PCM_AUDIO_CODECS as readonly string[]).includes(codec);

		const encodableCodecs = await getEncodableAudioCodecs(audioCodecs, {
			numberOfChannels,
			sampleRate,
			bitrate,
		});

		if (
			!encodableCodecs.some(isNonPcm)
			&& audioCodecs.some(isNonPcm)
			&& (numberOfChannels !== FALLBACK_NUMBER_OF_CHANNELS || sampleRate !== FALLBACK_SAMPLE_RATE)
		) {
			// We could not find a compatible non-PCM codec despite the container supporting them. This can be
			// caused by strange channel count or sample rate configurations. Therefore, let's try again but with
			// fallback parameters.

			const encodableCodecsWithDefaultParams = await getEncodableAudioCodecs(audioCodecs, {
				numberOfChannels: FALLBACK_NUMBER_OF_CHANNELS,
				sampleRate: FALLBACK_SAMPLE_RATE,
				bitrate,
			});

			const nonPcmCodec = encodableCodecsWithDefaultParams.find(isNonPcm);

			if (nonPcmCodec) {
				// We are able to encode using a non-PCM codec, but it'll require resampling
				needsResample = true;
				codecOfChoice = nonPcmCodec;
				numberOfChannels = FALLBACK_NUMBER_OF_CHANNELS;
				sampleRate = FALLBACK_SAMPLE_RATE;
			} else {
				// The fallback parameters didn't help either. Don't drop the track if the originally requested
				// parameters are still encodable with some (PCM) codec.
				codecOfChoice = encodableCodecs[0] ?? null;
			}
		} else {
			codecOfChoice = encodableCodecs[0] ?? null;
		}

		if (codecOfChoice === null) {
			this.discardedTracks.push({
				track,
				reason: 'no_encodable_target_codec',
			});
			return;
		}

		if (needsResample) {
			audioSource = this._resampleAudio(track, codecOfChoice, numberOfChannels, sampleRate, bitrate);
		} else {
			const source = new AudioSampleSource({
				codec: codecOfChoice,
				bitrate,
				onEncodedPacket: packet => this._reportProgress(track.id, packet.timestamp + packet.duration),
			});
			audioSource = source;

			this._trackPromises.push((async () => {
				await this._started;

				const sink = new AudioSampleSink(track);
				for await (const sample of sink.samples(undefined, this._endTimestamp)) {
					if (this._synchronizer.shouldWait(track.id, sample.timestamp)) {
						await this._synchronizer.wait(sample.timestamp);
					}

					if (this._canceled) {
						// The sample was handed to us, so it is ours to release
						sample.close();
						return;
					}

					await source.add(sample);
					sample.close();
				}

				source.close();
				this._synchronizer.closeTrack(track.id);
			})());
		}
	}

	this.output.addAudioTrack(audioSource, {
		languageCode: track.languageCode,
	});
	this._addedCounts.audio++;
	this._totalTrackCount++;

	this.utilizedTracks.push(track);
}
/**
 * Creates the audio source for a track whose samples must be resampled (different channel count or sample rate,
 * or a shifted timeline) before encoding, and queues the task that feeds it.
 *
 * @param track - The input audio track to read from.
 * @param codec - The codec to encode the resampled audio with.
 * @param targetNumberOfChannels - Channel count of the output audio.
 * @param targetSampleRate - Sample rate of the output audio, in hertz.
 * @param bitrate - Target bitrate or quality for the encoder.
 * @returns The source to be added to the output as an audio track.
 * @internal
 */
_resampleAudio(
	track: InputAudioTrack,
	codec: AudioCodec,
	targetNumberOfChannels: number,
	targetSampleRate: number,
	bitrate: number | Quality,
) {
	const encodedSource = new AudioSampleSource({
		codec,
		bitrate,
		onEncodedPacket: packet => this._reportProgress(track.id, packet.timestamp + packet.duration),
	});

	const feedResampledAudio = async () => {
		await this._started;

		const resampler = new AudioResampler({
			sourceNumberOfChannels: track.numberOfChannels,
			sourceSampleRate: track.sampleRate,
			targetNumberOfChannels,
			targetSampleRate,
			startTime: this._startTimestamp,
			endTime: this._endTimestamp,
			onSample: resampled => encodedSource.add(resampled),
		});

		const decodedSamples = new AudioSampleSink(track).samples(this._startTimestamp, this._endTimestamp);

		for await (const decoded of decodedSamples) {
			const needsToWait = this._synchronizer.shouldWait(track.id, decoded.timestamp);
			if (needsToWait) {
				await this._synchronizer.wait(decoded.timestamp);
			}

			if (this._canceled) {
				return;
			}

			await resampler.add(decoded);
		}

		// Let the resampler emit whatever it still holds before the source is closed
		await resampler.finalize();

		encodedSource.close();
		this._synchronizer.closeTrack(track.id);
	};

	this._trackPromises.push(feedResampledAudio());

	return encodedSource;
}
/**
 * Records how far a track has come and notifies `onProgress` if the overall progress changed. Overall progress
 * is the sum of each track's furthest end timestamp, divided by the number of output tracks and by the total
 * duration, clamped to [0, 1]. Does nothing unless progress tracking was enabled by `execute`.
 *
 * @param trackId - ID of the track reporting progress.
 * @param endTimestamp - End timestamp, in seconds, of the data most recently produced for that track.
 * @internal
 */
_reportProgress(trackId: number, endTimestamp: number) {
	if (!this._computeProgress) {
		return;
	}
	assert(this._totalDuration !== null);

	// A track's progress never moves backwards, even if packets arrive out of order
	const furthestSoFar = this._maxTimestamps.get(trackId) ?? -Infinity;
	this._maxTimestamps.set(trackId, Math.max(endTimestamp, furthestSoFar));

	let timestampSum = 0;
	this._maxTimestamps.forEach((timestamp) => {
		timestampSum += timestamp;
	});

	// Tracks that haven't reported yet count as zero because the divisor is the full track count
	const averageTimestamp = timestampSum / this._totalTrackCount;
	const progress = clamp(averageTimestamp / this._totalDuration, 0, 1);

	if (progress === this._lastProgress) {
		return;
	}

	this._lastProgress = progress;
	this.onProgress?.(progress);
}
}
/** How far (in timestamp units) a consumer may run ahead of the slowest one before it has to wait. */
const MAX_TIMESTAMP_GAP = 5;

/**
 * Keeps several per-track consumers roughly in step. A consumer that gets too far ahead of the slowest one would
 * force a lot of packets to be buffered before they can be written, so it is asked to wait until the slowest
 * consumer has caught up to within `MAX_TIMESTAMP_GAP`.
 */
class TrackSynchronizer {
	/** Highest timestamp seen so far for each open track. */
	maxTimestamps = new Map<number, number>(); // Track ID -> timestamp
	/** Consumers that are currently waiting, in the order they started waiting. */
	resolvers: {
		timestamp: number;
		resolve: () => void;
	}[] = [];

	/**
	 * Determines the position of the slowest open track and releases every waiting consumer that is now within
	 * the allowed gap of it. Returns that position, or `Infinity` when no track is open.
	 */
	computeMinAndMaybeResolve() {
		const slowest = Math.min(...this.maxTimestamps.values());

		// Single pass: release what can go on, compact what has to keep waiting to the front of the array
		let stillWaiting = 0;
		for (const waiter of this.resolvers) {
			if (waiter.timestamp - slowest < MAX_TIMESTAMP_GAP) {
				// The gap has gotten small enough again, the consumer can continue again
				waiter.resolve();
			} else {
				this.resolvers[stillWaiting] = waiter;
				stillWaiting++;
			}
		}
		this.resolvers.length = stillWaiting;

		return slowest;
	}

	/**
	 * Registers that the given track has reached `timestamp` and tells the caller whether it is too far ahead of
	 * the slowest track and should call `wait` before continuing.
	 */
	shouldWait(trackId: number, timestamp: number) {
		const previousMax = this.maxTimestamps.get(trackId) ?? -Infinity;
		this.maxTimestamps.set(trackId, Math.max(timestamp, previousMax));

		const slowest = this.computeMinAndMaybeResolve();
		const gap = timestamp - slowest;

		return gap >= MAX_TIMESTAMP_GAP; // Should wait if it is too far ahead of the slowest consumer
	}

	/** Returns a promise that resolves once the slowest track has come within the allowed gap of `timestamp`. */
	wait(timestamp: number) {
		const deferred = promiseWithResolvers();
		this.resolvers.push({
			timestamp,
			resolve: deferred.resolve,
		});

		return deferred.promise;
	}

	/** Removes a finished track so that it no longer holds the other tracks back. */
	closeTrack(trackId: number) {
		this.maxTimestamps.delete(trackId);
		this.computeMinAndMaybeResolve();
	}
}
/**
 * Utility class to handle audio resampling, handling both sample rate resampling as well as channel up/downmixing.
 * The advantage of doing this manually rather than using OfflineAudioContext to do it for us is the artifact-free
 * handling of putting multiple resampled audio samples back to back, which produces flaky results using
 * OfflineAudioContext.
 *
 * Input samples are accumulated into a fixed-size interleaved output buffer. Whenever a write falls past the end of
 * that buffer, the written part of the buffer is emitted through `onSample` and the buffer moves forward. Output
 * timestamps are relative to `startTime`.
 */
export class AudioResampler {
	sourceSampleRate: number;
	targetSampleRate: number;
	sourceNumberOfChannels: number;
	targetNumberOfChannels: number;
	/** Input timestamp that maps to output time zero. */
	startTime: number;
	/** Input timestamp past which no output is produced. */
	endTime: number;
	/** Receives each finished chunk of resampled audio. */
	onSample: (sample: AudioSample) => Promise<void>;

	/** Capacity of the output buffer in frames (five seconds at the target sample rate). */
	bufferSizeInFrames: number;
	/** Capacity of the output buffer in individual samples (frames times target channel count). */
	bufferSizeInSamples: number;
	/** Interleaved accumulation buffer for the output chunk currently being built. */
	outputBuffer: Float32Array;
	/** Start frame of current buffer */
	bufferStartFrame: number;
	/** The highest index written to in the current buffer, or -1 if nothing has been written yet */
	maxWrittenFrame: number;

	/**
	 * Computes the value of one target channel for one source frame from interleaved source data. Assigned by
	 * `setupChannelMixer()`.
	 */
	channelMixer!: (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => number;

	/** Scratch buffer that holds the interleaved f32 data of the sample currently being added. */
	tempSourceBuffer: Float32Array;

	constructor(options: {
		sourceSampleRate: number;
		targetSampleRate: number;
		sourceNumberOfChannels: number;
		targetNumberOfChannels: number;
		startTime: number;
		endTime: number;
		onSample: (sample: AudioSample) => Promise<void>;
	}) {
		this.sourceSampleRate = options.sourceSampleRate;
		this.targetSampleRate = options.targetSampleRate;
		this.sourceNumberOfChannels = options.sourceNumberOfChannels;
		this.targetNumberOfChannels = options.targetNumberOfChannels;
		this.startTime = options.startTime;
		this.endTime = options.endTime;
		this.onSample = options.onSample;

		this.bufferSizeInFrames = Math.floor(this.targetSampleRate * 5.0); // 5 seconds
		this.bufferSizeInSamples = this.bufferSizeInFrames * this.targetNumberOfChannels;

		this.outputBuffer = new Float32Array(this.bufferSizeInSamples);
		this.bufferStartFrame = 0;
		this.maxWrittenFrame = -1;

		this.setupChannelMixer();

		// Pre-allocate temporary buffer for source data
		this.tempSourceBuffer = new Float32Array(this.sourceSampleRate * this.sourceNumberOfChannels);
	}

	/**
	 * Sets up the channel mixer to handle up/downmixing in the case where input and output channel counts don't match.
	 */
	setupChannelMixer(): void {
		const sourceNum = this.sourceNumberOfChannels;
		const targetNum = this.targetNumberOfChannels;

		// Logic taken from
		// https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API/Basic_concepts_behind_Web_Audio_API
		//
		// Channels that must be silent return a literal 0 instead of multiplying a source read by zero: the read
		// may lie past the end of the source data (yielding undefined, and undefined * 0 is NaN), and a non-finite
		// source value multiplied by zero is NaN as well.

		if (sourceNum === 1 && targetNum === 2) {
			// Mono to Stereo: M -> L, M -> R
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number) => {
				return sourceData[sourceFrameIndex * sourceNum]!;
			};
		} else if (sourceNum === 1 && targetNum === 4) {
			// Mono to Quad: M -> L, M -> R, 0 -> SL, 0 -> SR
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
				return targetChannelIndex < 2 ? sourceData[sourceFrameIndex * sourceNum]! : 0;
			};
		} else if (sourceNum === 1 && targetNum === 6) {
			// Mono to 5.1: 0 -> L, 0 -> R, M -> C, 0 -> LFE, 0 -> SL, 0 -> SR
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
				return targetChannelIndex === 2 ? sourceData[sourceFrameIndex * sourceNum]! : 0;
			};
		} else if (sourceNum === 2 && targetNum === 1) {
			// Stereo to Mono: 0.5 * (L + R)
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number) => {
				const baseIdx = sourceFrameIndex * sourceNum;
				return 0.5 * (sourceData[baseIdx]! + sourceData[baseIdx + 1]!);
			};
		} else if (sourceNum === 2 && (targetNum === 4 || targetNum === 6)) {
			// Stereo to Quad: L -> L, R -> R, 0 -> SL, 0 -> SR
			// Stereo to 5.1: L -> L, R -> R, 0 -> C, 0 -> LFE, 0 -> SL, 0 -> SR
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
				return targetChannelIndex < 2
					? sourceData[sourceFrameIndex * sourceNum + targetChannelIndex]!
					: 0;
			};
		} else if (sourceNum === 4 && targetNum === 1) {
			// Quad to Mono: 0.25 * (L + R + SL + SR)
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number) => {
				const baseIdx = sourceFrameIndex * sourceNum;
				return 0.25 * (
					sourceData[baseIdx]! + sourceData[baseIdx + 1]!
					+ sourceData[baseIdx + 2]! + sourceData[baseIdx + 3]!
				);
			};
		} else if (sourceNum === 4 && targetNum === 2) {
			// Quad to Stereo: 0.5 * (L + SL), 0.5 * (R + SR)
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
				const baseIdx = sourceFrameIndex * sourceNum;
				return 0.5 * (
					sourceData[baseIdx + targetChannelIndex]!
					+ sourceData[baseIdx + targetChannelIndex + 2]!
				);
			};
		} else if (sourceNum === 4 && targetNum === 6) {
			// Quad to 5.1: L -> L, R -> R, 0 -> C, 0 -> LFE, SL -> SL, SR -> SR
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
				const baseIdx = sourceFrameIndex * sourceNum;

				if (targetChannelIndex < 2) return sourceData[baseIdx + targetChannelIndex]!; // L, R
				if (targetChannelIndex === 2 || targetChannelIndex === 3) return 0; // C, LFE
				return sourceData[baseIdx + targetChannelIndex - 2]!; // SL, SR
			};
		} else if (sourceNum === 6 && targetNum === 1) {
			// 5.1 to Mono: sqrt(1/2) * (L + R) + C + 0.5 * (SL + SR)
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number) => {
				const baseIdx = sourceFrameIndex * sourceNum;
				return Math.SQRT1_2 * (sourceData[baseIdx]! + sourceData[baseIdx + 1]!)
					+ sourceData[baseIdx + 2]!
					+ 0.5 * (sourceData[baseIdx + 4]! + sourceData[baseIdx + 5]!);
			};
		} else if (sourceNum === 6 && targetNum === 2) {
			// 5.1 to Stereo: L + sqrt(1/2) * (C + SL), R + sqrt(1/2) * (C + SR)
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
				const baseIdx = sourceFrameIndex * sourceNum;
				return sourceData[baseIdx + targetChannelIndex]!
					+ Math.SQRT1_2 * (sourceData[baseIdx + 2]! + sourceData[baseIdx + targetChannelIndex + 4]!);
			};
		} else if (sourceNum === 6 && targetNum === 4) {
			// 5.1 to Quad: L + sqrt(1/2) * C, R + sqrt(1/2) * C, SL, SR
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
				const baseIdx = sourceFrameIndex * sourceNum;

				if (targetChannelIndex < 2) {
					return sourceData[baseIdx + targetChannelIndex]! + Math.SQRT1_2 * sourceData[baseIdx + 2]!;
				}
				return sourceData[baseIdx + targetChannelIndex + 2]!; // SL, SR
			};
		} else {
			// Discrete fallback: direct mapping with zero-fill or drop
			this.channelMixer = (sourceData: Float32Array, sourceFrameIndex: number, targetChannelIndex: number) => {
				return targetChannelIndex < sourceNum
					? sourceData[sourceFrameIndex * sourceNum + targetChannelIndex]!
					: 0;
			};
		}
	}

	/**
	 * Makes sure the temporary source buffer can hold at least `requiredSamples` values, growing it by repeated
	 * doubling if it cannot.
	 */
	ensureTempBufferSize(requiredSamples: number): void {
		const currentLength = this.tempSourceBuffer.length;
		if (currentLength >= requiredSamples) {
			return;
		}

		// Start from at least 1, since doubling a length of zero would never terminate
		let length = Math.max(currentLength, 1);
		while (length < requiredSamples) {
			length *= 2;
		}

		// The old contents are not carried over: add() overwrites the region it uses before reading from it
		this.tempSourceBuffer = new Float32Array(length);
	}

	/**
	 * Resamples and channel-mixes the given audio sample into the output buffer, emitting finished buffers through
	 * `onSample` whenever the sample extends past the end of the current buffer. Missing or closed samples are
	 * ignored.
	 */
	async add(audioSample: AudioSample) {
		if (!audioSample || audioSample._closed) {
			return;
		}

		const requiredSamples = audioSample.numberOfFrames * audioSample.numberOfChannels;
		this.ensureTempBufferSize(requiredSamples);

		// Copy the audio data to the temp buffer
		const sourceDataSize = audioSample.allocationSize({ planeIndex: 0, format: 'f32' });
		const sourceView = new Float32Array(this.tempSourceBuffer.buffer, 0, sourceDataSize / 4);
		audioSample.copyTo(sourceView, { planeIndex: 0, format: 'f32' });

		// All times below are relative to startTime
		const inputStartTime = audioSample.timestamp - this.startTime;
		const inputDuration = audioSample.numberOfFrames / this.sourceSampleRate;
		const inputEndTime = Math.min(inputStartTime + inputDuration, this.endTime - this.startTime);

		// Compute which output frames are affected by this sample
		const outputStartFrame = Math.floor(inputStartTime * this.targetSampleRate);
		const outputEndFrame = Math.ceil(inputEndTime * this.targetSampleRate);

		for (let outputFrame = outputStartFrame; outputFrame < outputEndFrame; outputFrame++) {
			if (outputFrame < this.bufferStartFrame) {
				continue; // Skip writes to the past
			}

			while (outputFrame >= this.bufferStartFrame + this.bufferSizeInFrames) {
				// The write is after the current buffer, so finalize it
				await this.finalizeCurrentBuffer();
				this.bufferStartFrame += this.bufferSizeInFrames;
			}

			const bufferFrameIndex = outputFrame - this.bufferStartFrame;
			assert(bufferFrameIndex < this.bufferSizeInFrames);

			// Find the (fractional) position in the source sample that corresponds to this output frame
			const outputTime = outputFrame / this.targetSampleRate;
			const inputTime = outputTime - inputStartTime;
			const sourcePosition = inputTime * this.sourceSampleRate;

			const sourceLowerFrame = Math.floor(sourcePosition);
			const sourceUpperFrame = Math.ceil(sourcePosition);
			const fraction = sourcePosition - sourceLowerFrame;

			// Process each output channel
			for (let targetChannel = 0; targetChannel < this.targetNumberOfChannels; targetChannel++) {
				// Source frames outside of this sample contribute silence; the neighboring sample adds its own
				// share to the same output frame when it is processed
				let lowerSample = 0;
				let upperSample = 0;

				if (sourceLowerFrame >= 0 && sourceLowerFrame < audioSample.numberOfFrames) {
					lowerSample = this.channelMixer(sourceView, sourceLowerFrame, targetChannel);
				}

				if (sourceUpperFrame >= 0 && sourceUpperFrame < audioSample.numberOfFrames) {
					upperSample = this.channelMixer(sourceView, sourceUpperFrame, targetChannel);
				}

				// For resampling, we do naive linear interpolation to find the in-between sample. This produces
				// suboptimal results especially for downsampling (for which a low-pass filter would first need to be
				// applied), but AudioContext doesn't do this either, so, whatever, for now.
				const outputSample = lowerSample + fraction * (upperSample - lowerSample);

				// Write to output buffer (interleaved)
				const outputIndex = bufferFrameIndex * this.targetNumberOfChannels + targetChannel;
				this.outputBuffer[outputIndex]! += outputSample; // Add in case of overlapping samples
			}

			this.maxWrittenFrame = Math.max(this.maxWrittenFrame, bufferFrameIndex);
		}
	}

	/**
	 * Emits the written portion of the current output buffer through `onSample`, then clears the buffer for reuse.
	 * Does nothing if no frame has been written. The buffer's start frame is not advanced here.
	 */
	async finalizeCurrentBuffer() {
		if (this.maxWrittenFrame < 0) {
			return; // Nothing to finalize
		}

		const samplesWritten = (this.maxWrittenFrame + 1) * this.targetNumberOfChannels;

		// Hand out a copy, since the output buffer is zeroed and reused afterwards
		const outputData = this.outputBuffer.slice(0, samplesWritten);

		const timestampSeconds = this.bufferStartFrame / this.targetSampleRate;

		const audioSample = new AudioSample({
			format: 'f32',
			sampleRate: this.targetSampleRate,
			numberOfChannels: this.targetNumberOfChannels,
			timestamp: timestampSeconds,
			data: outputData,
		});

		await this.onSample(audioSample);

		this.outputBuffer.fill(0);
		this.maxWrittenFrame = -1;
	}

	/** Flushes whatever remains in the output buffer. Call once after the last sample has been added. */
	finalize() {
		return this.finalizeCurrentBuffer();
	}
}