// mediabunny

/*!
 * Copyright (c) 2025-present, Vanilagy and contributors
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

import { Box, ftyp, IsobmffBoxWriter, mdat, mfra, moof, moov, vtta, vttc, vtte } from './isobmff-boxes';
import { Muxer } from '../muxer';
import { Output, OutputAudioTrack, OutputSubtitleTrack, OutputTrack, OutputVideoTrack } from '../output';
import { BufferTargetWriter, Writer } from '../writer';
import { assert, computeRationalApproximation, last, promiseWithResolvers } from '../misc';
import { IsobmffOutputFormatOptions, IsobmffOutputFormat, MovOutputFormat } from '../output-format';
import { inlineTimestampRegex, SubtitleConfig, SubtitleCue, SubtitleMetadata } from '../subtitles';
import {
	parsePcmCodec,
	PCM_AUDIO_CODECS,
	PcmAudioCodec,
	SubtitleCodec,
	validateAudioChunkMetadata,
	validateSubtitleMetadata,
	validateVideoChunkMetadata,
} from '../codec';
import { BufferTarget } from '../target';
import { EncodedPacket, PacketType } from '../packet';
import {
	extractAvcDecoderConfigurationRecord,
	extractHevcDecoderConfigurationRecord,
	serializeAvcDecoderConfigurationRecord,
	serializeHevcDecoderConfigurationRecord,
	transformAnnexBToLengthPrefixed,
} from '../codec-data';
import { buildIsobmffMimeType } from './isobmff-misc';
import { MAX_BOX_HEADER_SIZE, MIN_BOX_HEADER_SIZE } from './isobmff-reader';

export const GLOBAL_TIMESCALE = 1000;
const TIMESTAMP_OFFSET = 2_082_844_800; // Seconds between Jan 1 1904 and Jan 1 1970

/** A single media sample as tracked by the muxer. `timestamp`, `decodeTimestamp` and `duration` are in seconds. */
export type Sample = {
	timestamp: number;
	decodeTimestamp: number;
	duration: number;
	/** Set to `null` once the payload has been written out, so that it can be garbage-collected. */
	data: Uint8Array | null;
	size: number;
	type: PacketType;
	/** Distance to the next sample in decode order, expressed in the track's timescale. */
	timescaleUnitsToNextSample: number;
};

type Chunk = {
	/** The lowest presentation timestamp in this chunk */
	startTimestamp: number;
	samples: Sample[];
	offset: number | null;
	// In the case of a fragmented file, this indicates the position of the moof box pointing to the data in this chunk
	moofOffset: number | null;
};

export type IsobmffTrackData = {
	muxer: IsobmffMuxer;
	timescale: number;
	samples: Sample[];
	sampleQueue: Sample[]; // For fragmented files
	timestampProcessingQueue: Sample[];

	timeToSampleTable: { sampleCount: number; sampleDelta: number }[];
	compositionTimeOffsetTable: { sampleCount: number; sampleCompositionTimeOffset: number }[];
	lastTimescaleUnits: number | null;
	lastSample: Sample | null;

	finalizedChunks: Chunk[];
	currentChunk: Chunk | null;
	compactlyCodedChunkTable: {
		firstChunk: number;
		samplesPerChunk: number;
	}[];
} & ({
	track: OutputVideoTrack;
	type: 'video';
	info: {
		width: number;
		height: number;
		decoderConfig: VideoDecoderConfig;
		/**
		 * The "Annex B transformation" involves converting the raw packet data from Annex B to
		 * "MP4" (length-prefixed) format.
		 * https://stackoverflow.com/questions/24884827
		 */
		requiresAnnexBTransformation: boolean;
	};
} | {
	track: OutputAudioTrack;
	type: 'audio';
	info: {
		numberOfChannels: number;
		sampleRate: number;
		decoderConfig: AudioDecoderConfig;
		/**
		 * The "PCM transformation" is making every sample in the sample table be exactly one PCM audio sample long.
		 * Some players expect this for PCM audio.
		 */
		requiresPcmTransformation: boolean;
	};
} | {
	track: OutputSubtitleTrack;
	type: 'subtitle';
	info: {
		config: SubtitleConfig;
	};
	lastCueEndTimestamp: number;
	cueQueue: SubtitleCue[];
	nextSourceId: number;
	cueToSourceId: WeakMap<SubtitleCue, number>;
});

export type IsobmffVideoTrackData = IsobmffTrackData & { type: 'video' };
export type IsobmffAudioTrackData = IsobmffTrackData & { type: 'audio' };
export type IsobmffSubtitleTrackData = IsobmffTrackData & { type: 'subtitle' };

/**
 * Converts a time in seconds into units of the given timescale.
 *
 * @param timeInSeconds - The time value, in seconds.
 * @param timescale - Number of timescale units per second.
 * @param round - When `true` (the default), the result is rounded to the nearest integer.
 * @returns `timeInSeconds * timescale`, optionally rounded.
 */
export const intoTimescale = (timeInSeconds: number, timescale: number, round = true) => {
	const value = timeInSeconds * timescale;
	return round ? Math.round(value) : value;
};

/**
 * Muxer producing ISOBMFF files (MP4 or, when the format is a `MovOutputFormat`, QuickTime). Depending on the
 * `fastStart` option, the file is written progressively (mdat first, moov last), with the moov box in front
 * (`'in-memory'`), or as a sequence of moof/mdat fragments (`'fragmented'`).
 */
export class IsobmffMuxer extends Muxer {
	private format: IsobmffOutputFormat;
	private writer: Writer;
	private boxWriter: IsobmffBoxWriter;
	private fastStart: NonNullable<IsobmffOutputFormatOptions['fastStart']>;
	private isFragmented: boolean;

	isQuickTime: boolean;

	// Scratch target used to serialize boxes (such as WebVTT cue boxes) into sample payloads
	private auxTarget = new BufferTarget();
	private auxWriter = this.auxTarget._createWriter();
	private auxBoxWriter = new IsobmffBoxWriter(this.auxWriter);

	private mdat: Box | null = null;

	private trackDatas: IsobmffTrackData[] = [];
	private allTracksKnown = promiseWithResolvers();

	private creationTime = Math.floor(Date.now() / 1000) + TIMESTAMP_OFFSET;
	private finalizedChunks: Chunk[] = [];

	private nextFragmentNumber = 1;
	// Only relevant for fragmented files, to make sure new fragments start with the highest timestamp seen so far
	private maxWrittenTimestamp = -Infinity;
	private minimumFragmentDuration: number;

	constructor(output: Output, format: IsobmffOutputFormat) {
		super(output);

		this.format = format;
		this.writer = output._writer;
		this.boxWriter = new IsobmffBoxWriter(this.writer);

		this.isQuickTime = format instanceof MovOutputFormat;

		// If the fastStart option isn't defined, enable in-memory fast start if the target is an ArrayBuffer, as the
		// memory usage remains identical
		const fastStartDefault = this.writer instanceof BufferTargetWriter ? 'in-memory' : false;
		this.fastStart = format._options.fastStart ?? fastStartDefault;
		this.isFragmented = this.fastStart === 'fragmented';

		if (this.fastStart === 'in-memory' || this.isFragmented) {
			this.writer.ensureMonotonicity = true;
		}

		this.minimumFragmentDuration = format._options.minimumFragmentDuration ?? 1;
	}

	/**
	 * Writes the file header (ftyp box) and, for non-fast-start files, opens the mdat box that sample data will be
	 * appended to. The mutex is released even if writing fails.
	 */
	async start() {
		const release = await this.mutex.acquire();

		try {
			const holdsAvc = this.output._tracks.some(x => x.type === 'video' && x.source._codec === 'avc');

			// Write the header
			{
				if (this.format._options.onFtyp) {
					this.writer.startTrackingWrites();
				}

				this.boxWriter.writeBox(ftyp({
					isQuickTime: this.isQuickTime,
					holdsAvc: holdsAvc,
					fragmented: this.isFragmented,
				}));

				if (this.format._options.onFtyp) {
					const { data, start } = this.writer.stopTrackingWrites();
					this.format._options.onFtyp(data, start);
				}
			}

			if (this.fastStart === 'in-memory') {
				this.mdat = mdat(false);
			} else if (this.isFragmented) {
				// We write the moov box once we write out the first fragment to make sure we get the decoder configs
			} else {
				if (this.format._options.onMdat) {
					// Tracking is stopped again in finalize(), once the mdat box has been patched
					this.writer.startTrackingWrites();
				}

				this.mdat = mdat(true); // Reserve large size by default, can refine this when finalizing.
				this.boxWriter.writeBox(this.mdat);
			}

			await this.writer.flush();
		} finally {
			release();
		}
	}

	/** Returns whether every track that is still open has already produced track data (i.e. at least one sample). */
	private allTracksAreKnown() {
		for (const track of this.output._tracks) {
			if (!track.source._closed && !this.trackDatas.some(x => x.track === track)) {
				return false; // We haven't seen a sample from this open track yet
			}
		}

		return true;
	}

	/** Resolves with the MIME type of the file once all tracks are known, including the codec strings. */
	async getMimeType() {
		await this.allTracksKnown.promise;

		const codecStrings = this.trackDatas.map((trackData) => {
			if (trackData.type === 'video') {
				return trackData.info.decoderConfig.codec;
			} else if (trackData.type === 'audio') {
				return trackData.info.decoderConfig.codec;
			} else {
				const map: Record<SubtitleCodec, string> = {
					webvtt: 'wvtt',
				};
				return map[trackData.track.source._codec];
			}
		});

		return buildIsobmffMimeType({
			isQuickTime: this.isQuickTime,
			hasVideo: this.trackDatas.some(x => x.type === 'video'),
			hasAudio: this.trackDatas.some(x => x.type === 'audio'),
			codecStrings,
		});
	}

	/**
	 * Returns the track data for the given video track, creating it from the first packet and its metadata if it
	 * doesn't exist yet.
	 *
	 * @throws If the track is AVC/HEVC without a decoder description and no decoder configuration record can be
	 * extracted from the packet.
	 */
	private getVideoTrackData(track: OutputVideoTrack, packet: EncodedPacket, meta?: EncodedVideoChunkMetadata) {
		const existingTrackData = this.trackDatas.find(x => x.track === track);
		if (existingTrackData) {
			return existingTrackData as IsobmffVideoTrackData;
		}

		validateVideoChunkMetadata(meta);

		assert(meta);
		assert(meta.decoderConfig);

		// Copy, since we may attach a description below
		const decoderConfig = { ...meta.decoderConfig };
		assert(decoderConfig.codedWidth !== undefined);
		assert(decoderConfig.codedHeight !== undefined);

		let requiresAnnexBTransformation = false;

		if (track.source._codec === 'avc' && !decoderConfig.description) {
			// ISOBMFF can only hold AVC in the AVCC format, not in Annex B, but the missing description indicates
			// Annex B. This means we'll need to do some converterino.

			const decoderConfigurationRecord = extractAvcDecoderConfigurationRecord(packet.data);
			if (!decoderConfigurationRecord) {
				throw new Error(
					'Couldn\'t extract an AVCDecoderConfigurationRecord from the AVC packet. Make sure the packets are'
					+ ' in Annex B format (as specified in ITU-T-REC-H.264) when not providing a description, or'
					+ ' provide a description (must be an AVCDecoderConfigurationRecord as specified in ISO 14496-15)'
					+ ' and ensure the packets are in AVCC format.',
				);
			}

			decoderConfig.description = serializeAvcDecoderConfigurationRecord(decoderConfigurationRecord);
			requiresAnnexBTransformation = true;
		} else if (track.source._codec === 'hevc' && !decoderConfig.description) {
			// ISOBMFF can only hold HEVC in the length-prefixed format, not in Annex B, but the missing description
			// indicates Annex B. This means we'll need to do some converterino.

			const decoderConfigurationRecord = extractHevcDecoderConfigurationRecord(packet.data);
			if (!decoderConfigurationRecord) {
				throw new Error(
					'Couldn\'t extract an HEVCDecoderConfigurationRecord from the HEVC packet. Make sure the packets'
					+ ' are in Annex B format (as specified in ITU-T-REC-H.265) when not providing a description, or'
					+ ' provide a description (must be an HEVCDecoderConfigurationRecord as specified in ISO 14496-15)'
					+ ' and ensure the packets are in HEVC format.',
				);
			}

			decoderConfig.description = serializeHevcDecoderConfigurationRecord(decoderConfigurationRecord);
			requiresAnnexBTransformation = true;
		}

		// The frame rate set by the user may not be an integer. Since timescale is an integer, we'll approximate the
		// frame time (inverse of frame rate) with a rational number, then use that approximation's denominator
		// as the timescale.
		const timescale = computeRationalApproximation(1 / (track.metadata.frameRate ?? 57600), 1e6).denominator;

		const newTrackData: IsobmffVideoTrackData = {
			muxer: this,
			track,
			type: 'video',
			info: {
				width: decoderConfig.codedWidth,
				height: decoderConfig.codedHeight,
				decoderConfig: decoderConfig,
				requiresAnnexBTransformation,
			},
			timescale,
			samples: [],
			sampleQueue: [],
			timestampProcessingQueue: [],
			timeToSampleTable: [],
			compositionTimeOffsetTable: [],
			lastTimescaleUnits: null,
			lastSample: null,
			finalizedChunks: [],
			currentChunk: null,
			compactlyCodedChunkTable: [],
		};

		this.trackDatas.push(newTrackData);
		this.trackDatas.sort((a, b) => a.track.id - b.track.id);

		if (this.allTracksAreKnown()) {
			this.allTracksKnown.resolve();
		}

		return newTrackData;
	}

	/**
	 * Returns the track data for the given audio track, creating it from the packet metadata if it doesn't exist yet.
	 * The track's timescale is set to the sample rate.
	 */
	private getAudioTrackData(track: OutputAudioTrack, meta?: EncodedAudioChunkMetadata) {
		const existingTrackData = this.trackDatas.find(x => x.track === track);
		if (existingTrackData) {
			return existingTrackData as IsobmffAudioTrackData;
		}

		validateAudioChunkMetadata(meta);

		assert(meta);
		assert(meta.decoderConfig);

		const newTrackData: IsobmffAudioTrackData = {
			muxer: this,
			track,
			type: 'audio',
			info: {
				numberOfChannels: meta.decoderConfig.numberOfChannels,
				sampleRate: meta.decoderConfig.sampleRate,
				decoderConfig: meta.decoderConfig,
				requiresPcmTransformation:
					!this.isFragmented
					&& (PCM_AUDIO_CODECS as readonly string[]).includes(track.source._codec),
			},
			timescale: meta.decoderConfig.sampleRate,
			samples: [],
			sampleQueue: [],
			timestampProcessingQueue: [],
			timeToSampleTable: [],
			compositionTimeOffsetTable: [],
			lastTimescaleUnits: null,
			lastSample: null,
			finalizedChunks: [],
			currentChunk: null,
			compactlyCodedChunkTable: [],
		};

		this.trackDatas.push(newTrackData);
		this.trackDatas.sort((a, b) => a.track.id - b.track.id);

		if (this.allTracksAreKnown()) {
			this.allTracksKnown.resolve();
		}

		return newTrackData;
	}

	/** Returns the track data for the given subtitle track, creating it from the metadata if it doesn't exist yet. */
	private getSubtitleTrackData(track: OutputSubtitleTrack, meta?: SubtitleMetadata) {
		const existingTrackData = this.trackDatas.find(x => x.track === track);
		if (existingTrackData) {
			return existingTrackData as IsobmffSubtitleTrackData;
		}

		validateSubtitleMetadata(meta);

		assert(meta);
		assert(meta.config);

		const newTrackData: IsobmffSubtitleTrackData = {
			muxer: this,
			track,
			type: 'subtitle',
			info: {
				config: meta.config,
			},
			timescale: 1000, // Reasonable
			samples: [],
			sampleQueue: [],
			timestampProcessingQueue: [],
			timeToSampleTable: [],
			compositionTimeOffsetTable: [],
			lastTimescaleUnits: null,
			lastSample: null,
			finalizedChunks: [],
			currentChunk: null,
			compactlyCodedChunkTable: [],

			lastCueEndTimestamp: 0,
			cueQueue: [],
			nextSourceId: 0,
			cueToSourceId: new WeakMap(),
		};

		this.trackDatas.push(newTrackData);
		this.trackDatas.sort((a, b) => a.track.id - b.track.id);

		if (this.allTracksAreKnown()) {
			this.allTracksKnown.resolve();
		}

		return newTrackData;
	}

	/**
	 * Adds an encoded video packet to the given track, converting Annex B data to length-prefixed data if the track
	 * requires it.
	 *
	 * @throws If the Annex B transformation is required but fails.
	 */
	async addEncodedVideoPacket(track: OutputVideoTrack, packet: EncodedPacket, meta?: EncodedVideoChunkMetadata) {
		const release = await this.mutex.acquire();

		try {
			const trackData = this.getVideoTrackData(track, packet, meta);

			let packetData = packet.data;
			if (trackData.info.requiresAnnexBTransformation) {
				const transformedData = transformAnnexBToLengthPrefixed(packetData);
				if (!transformedData) {
					throw new Error(
						'Failed to transform packet data. Make sure all packets are provided in Annex B format, as'
						+ ' specified in ITU-T-REC-H.264 and ITU-T-REC-H.265.',
					);
				}

				packetData = transformedData;
			}

			const timestamp = this.validateAndNormalizeTimestamp(
				trackData.track,
				packet.timestamp,
				packet.type === 'key',
			);
			const internalSample = this.createSampleForTrack(
				trackData,
				packetData,
				timestamp,
				packet.duration,
				packet.type,
			);

			await this.registerSample(trackData, internalSample);
		} finally {
			release();
		}
	}

	/** Adds an encoded audio packet to the given track, padding gaps with silence for PCM tracks that require it. */
	async addEncodedAudioPacket(track: OutputAudioTrack, packet: EncodedPacket, meta?: EncodedAudioChunkMetadata) {
		const release = await this.mutex.acquire();

		try {
			const trackData = this.getAudioTrackData(track, meta);

			const timestamp = this.validateAndNormalizeTimestamp(
				trackData.track,
				packet.timestamp,
				packet.type === 'key',
			);
			const internalSample = this.createSampleForTrack(
				trackData,
				packet.data,
				timestamp,
				packet.duration,
				packet.type,
			);

			if (trackData.info.requiresPcmTransformation) {
				await this.maybePadWithSilence(trackData, timestamp);
			}

			await this.registerSample(trackData, internalSample);
		} finally {
			release();
		}
	}

	/**
	 * Registers a sample of silence covering the span between the end of the track's last sample (or 0 if there is
	 * none) and `untilTimestamp`, if that span is positive when expressed in the track's timescale.
	 */
	private async maybePadWithSilence(trackData: IsobmffAudioTrackData, untilTimestamp: number) {
		// The PCM transformation assumes that all samples are contiguous. This is not something that is enforced, so
		// we need to pad the "holes" in between samples (and before the first sample) with additional
		// "silence samples".

		const lastSample = last(trackData.samples);
		const lastEndTimestamp = lastSample
			? lastSample.timestamp + lastSample.duration
			: 0;

		const delta = untilTimestamp - lastEndTimestamp;
		const deltaInTimescale = intoTimescale(delta, trackData.timescale);

		if (deltaInTimescale > 0) {
			const { sampleSize, silentValue } = parsePcmCodec(
				trackData.info.decoderConfig.codec as PcmAudioCodec,
			);
			const samplesNeeded = deltaInTimescale * trackData.info.numberOfChannels;
			const data = new Uint8Array(sampleSize * samplesNeeded).fill(silentValue);

			const paddingSample = this.createSampleForTrack(
				trackData,
				new Uint8Array(data.buffer),
				lastEndTimestamp,
				delta,
				'key',
			);
			await this.registerSample(trackData, paddingSample);
		}
	}

	/** Queues a subtitle cue for the given track and emits all samples that can be produced up to its timestamp. */
	async addSubtitleCue(track: OutputSubtitleTrack, cue: SubtitleCue, meta?: SubtitleMetadata) {
		const release = await this.mutex.acquire();

		try {
			const trackData = this.getSubtitleTrackData(track, meta);

			this.validateAndNormalizeTimestamp(trackData.track, cue.timestamp, true);

			if (track.source._codec === 'webvtt') {
				trackData.cueQueue.push(cue);
				await this.processWebVTTCues(trackData, cue.timestamp);
			} else {
				// TODO
			}
		} finally {
			release();
		}
	}

	/**
	 * Turns queued WebVTT cues into samples, stopping once the next sample would end after `until`. Pass `Infinity`
	 * to drain the queue completely.
	 */
	private async processWebVTTCues(trackData: IsobmffSubtitleTrackData, until: number) {
		// WebVTT cues need to undergo special processing as empty sections need to be padded out with samples, and
		// overlapping samples require special logic. The algorithm produces the format specified in ISO 14496-30.

		while (trackData.cueQueue.length > 0) {
			const timestamps = new Set<number>([]);
			for (const cue of trackData.cueQueue) {
				assert(cue.timestamp <= until);
				assert(trackData.lastCueEndTimestamp <= cue.timestamp + cue.duration);

				timestamps.add(Math.max(cue.timestamp, trackData.lastCueEndTimestamp)); // Start timestamp
				timestamps.add(cue.timestamp + cue.duration); // End timestamp
			}

			const sortedTimestamps = [...timestamps].sort((a, b) => a - b);

			// These are the timestamps of the next sample we'll create:
			const sampleStart = sortedTimestamps[0]!;
			const sampleEnd = sortedTimestamps[1] ?? sampleStart;

			if (until < sampleEnd) {
				break;
			}

			// We may need to pad out empty space with a vtte box
			if (trackData.lastCueEndTimestamp < sampleStart) {
				this.auxWriter.seek(0);
				const box = vtte();
				this.auxBoxWriter.writeBox(box);

				const body = this.auxWriter.getSlice(0, this.auxWriter.getPos());
				const sample = this.createSampleForTrack(
					trackData,
					body,
					trackData.lastCueEndTimestamp,
					sampleStart - trackData.lastCueEndTimestamp,
					'key',
				);

				await this.registerSample(trackData, sample);
				trackData.lastCueEndTimestamp = sampleStart;
			}

			this.auxWriter.seek(0);

			for (let i = 0; i < trackData.cueQueue.length; i++) {
				const cue = trackData.cueQueue[i]!;

				if (cue.timestamp >= sampleEnd) {
					break;
				}

				// Reset lastIndex before testing, in case the shared regex carries state from a previous use
				inlineTimestampRegex.lastIndex = 0;
				const containsTimestamp = inlineTimestampRegex.test(cue.text);

				const endTimestamp = cue.timestamp + cue.duration;
				let sourceId = trackData.cueToSourceId.get(cue);
				if (sourceId === undefined && sampleEnd < endTimestamp) {
					// We know this cue will appear in more than one sample, therefore we need to mark it with a
					// unique ID
					sourceId = trackData.nextSourceId++;
					trackData.cueToSourceId.set(cue, sourceId);
				}

				if (cue.notes) {
					// Any notes/comments are included in a special vtta box
					const box = vtta(cue.notes);
					this.auxBoxWriter.writeBox(box);
				}

				const box = vttc(
					cue.text,
					containsTimestamp ? sampleStart : null,
					cue.identifier ?? null,
					cue.settings ?? null,
					sourceId ?? null,
				);
				this.auxBoxWriter.writeBox(box);

				if (endTimestamp === sampleEnd) {
					// The cue won't appear in any future sample, so we're done with it
					trackData.cueQueue.splice(i--, 1);
				}
			}

			const body = this.auxWriter.getSlice(0, this.auxWriter.getPos());
			const sample = this.createSampleForTrack(trackData, body, sampleStart, sampleEnd - sampleStart, 'key');

			await this.registerSample(trackData, sample);
			trackData.lastCueEndTimestamp = sampleEnd;
		}
	}

	/**
	 * Builds a `Sample` for the given track. The decode timestamp and `timescaleUnitsToNextSample` are initial
	 * values which `processTimestamps` refines later.
	 */
	private createSampleForTrack(
		trackData: IsobmffTrackData,
		data: Uint8Array,
		timestamp: number,
		duration: number,
		type: PacketType,
	) {
		const sample: Sample = {
			timestamp,
			decodeTimestamp: timestamp, // This may be refined later
			duration,
			data,
			size: data.byteLength,
			type,
			timescaleUnitsToNextSample: intoTimescale(duration, trackData.timescale), // Will be refined
		};

		return sample;
	}

	/**
	 * Processes all samples in the track's timestamp processing queue: assigns decode timestamps, computes the
	 * deltas between consecutive samples and, for non-fragmented files, updates the time-to-sample and composition
	 * time offset tables. The queue is emptied afterwards.
	 *
	 * @param nextSample - The sample that follows the queued ones, if known. Used to estimate the duration of the
	 * last processed sample when that would otherwise be zero.
	 */
	private processTimestamps(trackData: IsobmffTrackData, nextSample?: Sample) {
		if (trackData.timestampProcessingQueue.length === 0) {
			return;
		}

		if (trackData.type === 'audio' && trackData.info.requiresPcmTransformation) {
			let totalDuration = 0;

			// Compute the total duration in the track timescale (which is equal to the amount of PCM audio samples)
			// and simply say that's how many new samples there are.

			for (let i = 0; i < trackData.timestampProcessingQueue.length; i++) {
				const sample = trackData.timestampProcessingQueue[i]!;
				const duration = intoTimescale(sample.duration, trackData.timescale);
				totalDuration += duration;
			}

			if (trackData.timeToSampleTable.length === 0) {
				trackData.timeToSampleTable.push({
					sampleCount: totalDuration,
					sampleDelta: 1,
				});
			} else {
				const lastEntry = last(trackData.timeToSampleTable)!;
				lastEntry.sampleCount += totalDuration;
			}

			trackData.timestampProcessingQueue.length = 0;
			return;
		}

		const sortedTimestamps = trackData.timestampProcessingQueue.map(x => x.timestamp).sort((a, b) => a - b);

		for (let i = 0; i < trackData.timestampProcessingQueue.length; i++) {
			const sample = trackData.timestampProcessingQueue[i]!;

			// Since the user only supplies presentation time, but these may be out of order, we reverse-engineer from
			// that a sensible decode timestamp. The notion of a decode timestamp doesn't really make sense
			// (presentation timestamp & decode order are all you need), but it is a concept in ISOBMFF so we need to
			// model it.
			sample.decodeTimestamp = sortedTimestamps[i]!;

			if (!this.isFragmented && trackData.lastTimescaleUnits === null) {
				// In non-fragmented files, the first decode timestamp is always zero. If the first presentation
				// timestamp isn't zero, we'll simply use the composition time offset to achieve it.
				sample.decodeTimestamp = 0;
			}

			const sampleCompositionTimeOffset
				= intoTimescale(sample.timestamp - sample.decodeTimestamp, trackData.timescale);
			const durationInTimescale = intoTimescale(sample.duration, trackData.timescale);

			if (trackData.lastTimescaleUnits !== null) {
				assert(trackData.lastSample);

				const timescaleUnits = intoTimescale(sample.decodeTimestamp, trackData.timescale, false);
				const delta = Math.round(timescaleUnits - trackData.lastTimescaleUnits);
				assert(delta >= 0);

				trackData.lastTimescaleUnits += delta;
				trackData.lastSample.timescaleUnitsToNextSample = delta;

				if (!this.isFragmented) {
					let lastTableEntry = last(trackData.timeToSampleTable);
					assert(lastTableEntry);

					if (lastTableEntry.sampleCount === 1) {
						lastTableEntry.sampleDelta = delta;

						const entryBefore = trackData.timeToSampleTable[trackData.timeToSampleTable.length - 2];
						if (entryBefore && entryBefore.sampleDelta === delta) {
							// If the delta is the same as the previous one, merge the two entries
							entryBefore.sampleCount++;
							trackData.timeToSampleTable.pop();
							lastTableEntry = entryBefore;
						}
					} else if (lastTableEntry.sampleDelta !== delta) {
						// The delta has changed, so we need a new entry to reach the current sample
						lastTableEntry.sampleCount--;
						trackData.timeToSampleTable.push(lastTableEntry = {
							sampleCount: 1,
							sampleDelta: delta,
						});
					}

					if (lastTableEntry.sampleDelta === durationInTimescale) {
						// The sample's duration matches the delta, so we can increment the count
						lastTableEntry.sampleCount++;
					} else {
						// Add a new entry in order to maintain the last sample's true duration
						trackData.timeToSampleTable.push({
							sampleCount: 1,
							sampleDelta: durationInTimescale,
						});
					}

					const lastCompositionTimeOffsetTableEntry = last(trackData.compositionTimeOffsetTable);
					assert(lastCompositionTimeOffsetTableEntry);

					if (
						lastCompositionTimeOffsetTableEntry.sampleCompositionTimeOffset === sampleCompositionTimeOffset
					) {
						// Simply increment the count
						lastCompositionTimeOffsetTableEntry.sampleCount++;
					} else {
						// The composition time offset has changed, so create a new entry with the new composition time
						// offset
						trackData.compositionTimeOffsetTable.push({
							sampleCount: 1,
							sampleCompositionTimeOffset: sampleCompositionTimeOffset,
						});
					}
				}
			} else {
				// Decode timestamp of the first sample
				trackData.lastTimescaleUnits = intoTimescale(sample.decodeTimestamp, trackData.timescale, false);

				if (!this.isFragmented) {
					trackData.timeToSampleTable.push({
						sampleCount: 1,
						sampleDelta: durationInTimescale,
					});
					trackData.compositionTimeOffsetTable.push({
						sampleCount: 1,
						sampleCompositionTimeOffset: sampleCompositionTimeOffset,
					});
				}
			}

			trackData.lastSample = sample;
		}

		trackData.timestampProcessingQueue.length = 0;

		assert(trackData.lastSample);
		assert(trackData.lastTimescaleUnits !== null);

		if (nextSample !== undefined && trackData.lastSample.timescaleUnitsToNextSample === 0) {
			assert(nextSample.type === 'key');

			// Given the next sample, we can make a guess about the duration of the last sample. This avoids having
			// the last sample's duration in each fragment be "0" for fragmented files. The guess we make here is
			// actually correct most of the time, since typically, no delta frame with a lower timestamp follows the key
			// frame (although it can happen).
			const timescaleUnits = intoTimescale(nextSample.timestamp, trackData.timescale, false);
			const delta = Math.round(timescaleUnits - trackData.lastTimescaleUnits);
			trackData.lastSample.timescaleUnitsToNextSample = delta;
		}
	}

	/**
	 * Hands a new sample to the muxer. A key sample first triggers timestamp processing of everything queued before
	 * it. The sample is then either queued for interleaving (fragmented files) or added to its track directly.
	 */
	private async registerSample(trackData: IsobmffTrackData, sample: Sample) {
		if (sample.type === 'key') {
			this.processTimestamps(trackData, sample);
		}
		trackData.timestampProcessingQueue.push(sample);

		if (this.isFragmented) {
			trackData.sampleQueue.push(sample);
			await this.interleaveSamples();
		} else {
			await this.addSampleToTrack(trackData, sample);
		}
	}

	/**
	 * Appends a sample to the track's current chunk, first starting a new chunk (or, for fragmented files, finalizing
	 * the current fragment) when the conditions for that are met.
	 */
	private async addSampleToTrack(trackData: IsobmffTrackData, sample: Sample) {
		if (!this.isFragmented) {
			trackData.samples.push(sample);
		}

		let beginNewChunk = false;
		if (!trackData.currentChunk) {
			beginNewChunk = true;
		} else {
			// Timestamps don't need to be monotonic (think B-frames), so we may need to update the start timestamp of
			// the chunk
			trackData.currentChunk.startTimestamp = Math.min(
				trackData.currentChunk.startTimestamp,
				sample.timestamp,
			);

			const currentChunkDuration = sample.timestamp - trackData.currentChunk.startTimestamp;

			if (this.isFragmented) {
				// We can only finalize this fragment (and begin a new one) if we know that each track will be able to
				// start the new one with a key frame.
				const keyFrameQueuedEverywhere = this.trackDatas.every((otherTrackData) => {
					if (trackData === otherTrackData) {
						return sample.type === 'key';
					}

					const firstQueuedSample = otherTrackData.sampleQueue[0];
					if (firstQueuedSample) {
						return firstQueuedSample.type === 'key';
					}

					return otherTrackData.track.source._closed;
				});

				if (
					currentChunkDuration >= this.minimumFragmentDuration
					&& keyFrameQueuedEverywhere
					&& sample.timestamp > this.maxWrittenTimestamp
				) {
					beginNewChunk = true;
					await this.finalizeFragment();
				}
			} else {
				beginNewChunk = currentChunkDuration >= 0.5; // Chunk is long enough, we need a new one
			}
		}

		if (beginNewChunk) {
			if (trackData.currentChunk) {
				await this.finalizeCurrentChunk(trackData);
			}

			trackData.currentChunk = {
				startTimestamp: sample.timestamp,
				samples: [],
				offset: null,
				moofOffset: null,
			};
		}

		assert(trackData.currentChunk);
		trackData.currentChunk.samples.push(sample);

		if (this.isFragmented) {
			this.maxWrittenTimestamp = Math.max(this.maxWrittenTimestamp, sample.timestamp);
		}
	}

	/**
	 * Finalizes the track's current chunk (non-fragmented files only): records it in the chunk tables and, unless
	 * in-memory fast start is used, writes its sample data to the output.
	 */
	private async finalizeCurrentChunk(trackData: IsobmffTrackData) {
		assert(!this.isFragmented);

		if (!trackData.currentChunk) return;

		trackData.finalizedChunks.push(trackData.currentChunk);
		this.finalizedChunks.push(trackData.currentChunk);

		let sampleCount = trackData.currentChunk.samples.length;
		if (trackData.type === 'audio' && trackData.info.requiresPcmTransformation) {
			// With the PCM transformation, the sample count is the duration in timescale units
			sampleCount = trackData.currentChunk.samples
				.reduce((acc, sample) => acc + intoTimescale(sample.duration, trackData.timescale), 0);
		}

		if (
			trackData.compactlyCodedChunkTable.length === 0
			|| last(trackData.compactlyCodedChunkTable)!.samplesPerChunk !== sampleCount
		) {
			trackData.compactlyCodedChunkTable.push({
				firstChunk: trackData.finalizedChunks.length, // 1-indexed
				samplesPerChunk: sampleCount,
			});
		}

		if (this.fastStart === 'in-memory') {
			trackData.currentChunk.offset = 0; // We'll compute the proper offset when finalizing
			return;
		}

		// Write out the data
		trackData.currentChunk.offset = this.writer.getPos();
		for (const sample of trackData.currentChunk.samples) {
			assert(sample.data);
			this.writer.write(sample.data);
			sample.data = null; // Can be GC'd
		}

		await this.writer.flush();
	}

	/**
	 * Moves queued samples into their tracks in order of ascending timestamp (fragmented files only).
	 *
	 * @param isFinalCall - When `true`, the queues are drained without waiting for samples from other tracks.
	 */
	private async interleaveSamples(isFinalCall = false) {
		assert(this.isFragmented);

		if (!isFinalCall) {
			if (!this.allTracksAreKnown()) {
				return; // We can't interleave yet as we don't yet know how many tracks we'll truly have
			}
		}

		outer:
		while (true) {
			let trackWithMinTimestamp: IsobmffTrackData | null = null;
			let minTimestamp = Infinity;

			for (const trackData of this.trackDatas) {
				if (!isFinalCall && trackData.sampleQueue.length === 0 && !trackData.track.source._closed) {
					// An open track has nothing queued, so we can't yet know which sample comes next
					break outer;
				}

				if (trackData.sampleQueue.length > 0 && trackData.sampleQueue[0]!.timestamp < minTimestamp) {
					trackWithMinTimestamp = trackData;
					minTimestamp = trackData.sampleQueue[0]!.timestamp;
				}
			}

			if (!trackWithMinTimestamp) {
				break;
			}

			const sample = trackWithMinTimestamp.sampleQueue.shift()!;
			await this.addSampleToTrack(trackWithMinTimestamp, sample);
		}
	}

	/**
	 * Writes the current chunks of all tracks as one fragment (moof + mdat). The moov box is written before the
	 * first fragment.
	 *
	 * @param flushWriter - Whether to flush the writer after the fragment has been written.
	 */
	private async finalizeFragment(flushWriter = true) {
		assert(this.isFragmented);

		const fragmentNumber = this.nextFragmentNumber++;

		if (fragmentNumber === 1) {
			if (this.format._options.onMoov) {
				this.writer.startTrackingWrites();
			}

			// Write the moov box now that we have all decoder configs
			const movieBox = moov(this.trackDatas, this.creationTime, true);
			this.boxWriter.writeBox(movieBox);

			if (this.format._options.onMoov) {
				const { data, start } = this.writer.stopTrackingWrites();
				this.format._options.onMoov(data, start);
			}
		}

		// Not all tracks need to be present in every fragment
		const tracksInFragment = this.trackDatas.filter(x => x.currentChunk);

		// Create an initial moof box and measure it; we need this to know where the following mdat box will begin
		const moofBox = moof(fragmentNumber, tracksInFragment);
		const moofOffset = this.writer.getPos();
		const mdatStartPos = moofOffset + this.boxWriter.measureBox(moofBox);

		let currentPos = mdatStartPos + MIN_BOX_HEADER_SIZE;
		let fragmentStartTimestamp = Infinity;
		for (const trackData of tracksInFragment) {
			trackData.currentChunk!.offset = currentPos;
			trackData.currentChunk!.moofOffset = moofOffset;

			for (const sample of trackData.currentChunk!.samples) {
				currentPos += sample.size;
			}

			fragmentStartTimestamp = Math.min(fragmentStartTimestamp, trackData.currentChunk!.startTimestamp);
		}

		const mdatSize = currentPos - mdatStartPos;
		const needsLargeMdatSize = mdatSize >= 2 ** 32;

		if (needsLargeMdatSize) {
			// Shift all offsets by 8. Previously, all chunks were shifted assuming the large box size, but due to what
			// I suspect is a bug in WebKit, it failed in Safari (when livestreaming with MSE, not for static playback).
			for (const trackData of tracksInFragment) {
				trackData.currentChunk!.offset! += MAX_BOX_HEADER_SIZE - MIN_BOX_HEADER_SIZE;
			}
		}

		if (this.format._options.onMoof) {
			this.writer.startTrackingWrites();
		}

		// Rebuild the moof box now that the chunk offsets are known
		const newMoofBox = moof(fragmentNumber, tracksInFragment);
		this.boxWriter.writeBox(newMoofBox);

		if (this.format._options.onMoof) {
			const { data, start } = this.writer.stopTrackingWrites();
			this.format._options.onMoof(data, start, fragmentStartTimestamp);
		}

		assert(this.writer.getPos() === mdatStartPos);

		if (this.format._options.onMdat) {
			this.writer.startTrackingWrites();
		}

		const mdatBox = mdat(needsLargeMdatSize);
		mdatBox.size = mdatSize;
		this.boxWriter.writeBox(mdatBox);

		this.writer.seek(mdatStartPos + (needsLargeMdatSize ? MAX_BOX_HEADER_SIZE : MIN_BOX_HEADER_SIZE));

		// Write sample data
		for (const trackData of tracksInFragment) {
			for (const sample of trackData.currentChunk!.samples) {
				this.writer.write(sample.data!);
				sample.data = null; // Can be GC'd
			}
		}

		if (this.format._options.onMdat) {
			const { data, start } = this.writer.stopTrackingWrites();
			this.format._options.onMdat(data, start);
		}

		for (const trackData of tracksInFragment) {
			trackData.finalizedChunks.push(trackData.currentChunk!);
			this.finalizedChunks.push(trackData.currentChunk!);
			trackData.currentChunk = null;
		}

		if (flushWriter) {
			await this.writer.flush();
		}
	}

	/**
	 * Called when a track is closed. Drains pending WebVTT cues of that track, re-checks whether all tracks are now
	 * known and, for fragmented files, retries interleaving. The mutex is released even if any of this fails.
	 */
	// eslint-disable-next-line @typescript-eslint/no-misused-promises
	override async onTrackClose(track: OutputTrack) {
		const release = await this.mutex.acquire();

		try {
			if (track.type === 'subtitle' && track.source._codec === 'webvtt') {
				// The track may have been closed before it ever produced a cue, in which case there's no track data
				const trackData = this.trackDatas.find(x => x.track === track);
				if (trackData && trackData.type === 'subtitle') {
					await this.processWebVTTCues(trackData, Infinity);
				}
			}

			if (this.allTracksAreKnown()) {
				this.allTracksKnown.resolve();
			}

			if (this.isFragmented) {
				// Since a track is now closed, we may be able to write out chunks that were previously waiting
				await this.interleaveSamples();
			}
		} finally {
			release();
		}
	}

	/**
	 * Finalizes the file, making it ready for use. Must be called after all video and audio chunks have been added.
	 * The mutex is released even if finalization fails.
	 */
	async finalize() {
		const release = await this.mutex.acquire();

		try {
			this.allTracksKnown.resolve();

			for (const trackData of this.trackDatas) {
				if (trackData.type === 'subtitle' && trackData.track.source._codec === 'webvtt') {
					await this.processWebVTTCues(trackData, Infinity);
				}
			}

			if (this.isFragmented) {
				await this.interleaveSamples(true);

				for (const trackData of this.trackDatas) {
					this.processTimestamps(trackData);
				}

				await this.finalizeFragment(false); // Don't flush the last fragment as we will flush it with the mfra box
			} else {
				for (const trackData of this.trackDatas) {
					this.processTimestamps(trackData);
					await this.finalizeCurrentChunk(trackData);
				}
			}

			if (this.fastStart === 'in-memory') {
				assert(this.mdat);
				let mdatSize: number;

				// We know how many chunks there are, but computing the chunk positions requires an iterative approach:
				// In order to know where the first chunk should go, we first need to know the size of the moov box. But we
				// cannot write a proper moov box without first knowing all chunk positions. So, we generate a tentative
				// moov box with placeholder values (0) for the chunk offsets to be able to compute its size. If it then
				// turns out that appending all chunks exceeds 4 GiB, we need to repeat this process, now with the co64 box
				// being used in the moov box instead, which will make it larger. After that, we definitely know the final
				// size of the moov box and can compute the proper chunk positions.

				for (let i = 0; i < 2; i++) {
					const movieBox = moov(this.trackDatas, this.creationTime);
					const movieBoxSize = this.boxWriter.measureBox(movieBox);
					mdatSize = this.boxWriter.measureBox(this.mdat);
					let currentChunkPos = this.writer.getPos() + movieBoxSize + mdatSize;

					for (const chunk of this.finalizedChunks) {
						chunk.offset = currentChunkPos;
						for (const { data } of chunk.samples) {
							assert(data);
							currentChunkPos += data.byteLength;
							mdatSize += data.byteLength;
						}
					}

					if (currentChunkPos < 2 ** 32) break;
					if (mdatSize >= 2 ** 32) this.mdat.largeSize = true;
				}

				if (this.format._options.onMoov) {
					this.writer.startTrackingWrites();
				}

				const movieBox = moov(this.trackDatas, this.creationTime);
				this.boxWriter.writeBox(movieBox);

				if (this.format._options.onMoov) {
					const { data, start } = this.writer.stopTrackingWrites();
					this.format._options.onMoov(data, start);
				}

				if (this.format._options.onMdat) {
					this.writer.startTrackingWrites();
				}

				this.mdat.size = mdatSize!;
				this.boxWriter.writeBox(this.mdat);

				for (const chunk of this.finalizedChunks) {
					for (const sample of chunk.samples) {
						assert(sample.data);
						this.writer.write(sample.data);
						sample.data = null;
					}
				}

				if (this.format._options.onMdat) {
					const { data, start } = this.writer.stopTrackingWrites();
					this.format._options.onMdat(data, start);
				}
			} else if (this.isFragmented) {
				// Append the mfra box to the end of the file for better random access
				const startPos = this.writer.getPos();
				const mfraBox = mfra(this.trackDatas);
				this.boxWriter.writeBox(mfraBox);

				// Patch the 'size' field of the mfro box at the end of the mfra box now that we know its actual size
				const mfraBoxSize = this.writer.getPos() - startPos;
				this.writer.seek(this.writer.getPos() - 4);
				this.boxWriter.writeU32(mfraBoxSize);
			} else {
				assert(this.mdat);

				const mdatPos = this.boxWriter.offsets.get(this.mdat);
				assert(mdatPos !== undefined);
				const mdatSize = this.writer.getPos() - mdatPos;
				this.mdat.size = mdatSize;
				this.mdat.largeSize = mdatSize >= 2 ** 32; // Only use the large size if we need it
				this.boxWriter.patchBox(this.mdat);

				if (this.format._options.onMdat) {
					// Matches the startTrackingWrites() call made in start()
					const { data, start } = this.writer.stopTrackingWrites();
					this.format._options.onMdat(data, start);
				}

				if (this.format._options.onMoov) {
					this.writer.startTrackingWrites();
				}

				const movieBox = moov(this.trackDatas, this.creationTime);
				this.boxWriter.writeBox(movieBox);

				if (this.format._options.onMoov) {
					const { data, start } = this.writer.stopTrackingWrites();
					this.format._options.onMoov(data, start);
				}
			}
		} finally {
			release();
		}
	}
}