UNPKG

mediabunny

Version:

Pure TypeScript media toolkit for reading, writing, and converting media files, directly in the browser.

Vanilagy/mediabunny

1,006 lines • 55.3 kB

JavaScript

/*!
 * Copyright (c) 2026-present, Vanilagy and contributors
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */
import { free, ftyp, IsobmffBoxWriter, mdat, mfra, moof, moov, sidx, styp, vtta, vttc, vtte, } from './isobmff-boxes.js';
import { Muxer } from '../muxer.js';
import { Writer } from '../writer.js';
import { BufferTarget } from '../target.js';
import { assert, computeRationalApproximation, last, promiseWithResolvers, simplifyRational } from '../misc.js';
import { MovOutputFormat, CmafOutputFormat } from '../output-format.js';
import { inlineTimestampRegex } from '../subtitles.js';
import { aacChannelMap, aacFrequencyTable, buildAacAudioSpecificConfig } from '../../shared/aac-misc.js';
import { parsePcmCodec, PCM_AUDIO_CODECS, validateAudioChunkMetadata, validateSubtitleMetadata, validateVideoChunkMetadata, } from '../codec.js';
import { MAX_ADTS_FRAME_HEADER_SIZE, MIN_ADTS_FRAME_HEADER_SIZE, readAdtsFrameHeader } from '../adts/adts-reader.js';
import { FileSlice } from '../reader.js';
import { concatNalUnitsInLengthPrefixed, extractAvcDecoderConfigurationRecord, extractHevcDecoderConfigurationRecord, iterateNalUnitsInAnnexB, serializeAvcDecoderConfigurationRecord, serializeHevcDecoderConfigurationRecord, } from '../codec-data.js';
import { buildIsobmffMimeType } from './isobmff-misc.js';
import { MAX_BOX_HEADER_SIZE, MIN_BOX_HEADER_SIZE } from './isobmff-reader.js';
/**
 * Shared integer timescale (ticks per second). Within this file it is used as the assumed frame rate when a
 * video track does not declare one; it is exported, so other modules may use it as well.
 */
export const GLOBAL_TIMESCALE = 57600; // LCM of a bunch of common frame rates (24, 25, 30, 60, 144, ...)
const TIMESTAMP_OFFSET = 2_082_844_800; // Seconds between Jan 1 1904 and Jan 1 1970 export const getTrackMetadata = (trackData) => { const metadata = {}; const track = trackData.track; if (track.metadata.name !== undefined) { metadata.name = track.metadata.name; } return metadata; }; export const intoTimescale = (timeInSeconds, timescale, round = true) => { const value = timeInSeconds * timescale; return round ? Math.round(value) : value; }; export class IsobmffMuxer extends Muxer { constructor(output, format) { super(output); this.writer = null; this.boxWriter = null; this.initWriter = null; this.initBoxWriter = null; this.auxTarget = new BufferTarget(); this.auxWriter = new Writer(this.auxTarget, false); this.auxBoxWriter = new IsobmffBoxWriter(this.auxWriter); this.mdat = null; this.ftypSize = null; this.trackDatas = []; this.allTracksKnown = promiseWithResolvers(); this.creationTime = Math.floor(Date.now() / 1000) + TIMESTAMP_OFFSET; this.finalizedChunks = []; this.nextFragmentNumber = 1; // Only relevant for fragmented files, to make sure new fragments start with the highest timestamp seen so far this.maxWrittenTimestamp = -Infinity; this.minWrittenTimestamp = Infinity; this.maxWrittenEndTimestamp = -Infinity; this.segmentHeaderSize = null; this.format = format; this.isQuickTime = format instanceof MovOutputFormat; this.isCmaf = format instanceof CmafOutputFormat; this.minimumFragmentDuration = format._options.minimumFragmentDuration ?? (format instanceof CmafOutputFormat ? Infinity : 1); } async start() { const release = await this.mutex.acquire(); if (!this.isCmaf) { this.writer = await this.output._getRootWriter(target => (this.format._options.fastStart !== undefined ? 
this.format._options.fastStart === 'fragmented' : target instanceof BufferTarget // Since if this is the case we'll use 'in-memory' )); this.boxWriter = new IsobmffBoxWriter(this.writer); // If the fastStart option isn't defined, enable in-memory fast start if the target is an ArrayBuffer, as // the memory usage remains identical this.fastStart = this.format._options.fastStart ?? (this.writer.target instanceof BufferTarget ? 'in-memory' : false); this.isFragmented = this.fastStart === 'fragmented'; } else { this.fastStart = 'fragmented'; this.isFragmented = true; } if (this.isCmaf) { if (!this.output._hasInitTarget()) { throw new Error(`CMAF outputs require the initTarget field in OutputOptions to be set; the init segment` + ` will be written to it.`); } // Set up the init writer to which we'll write the init segment const initTarget = await this.output._getInitTarget(); const initWriter = new Writer(initTarget, true); initWriter.start(); this.initWriter = initWriter; this.initBoxWriter = new IsobmffBoxWriter(initWriter); } const holdsAvc = this.output._tracks.some(x => x.isVideoTrack() && x.source._codec === 'avc'); // Write the header { const boxWriter = this.initBoxWriter ?? 
this.boxWriter; assert(boxWriter); if (this.format._options.onFtyp) { boxWriter.writer.startTrackingWrites(); } boxWriter.writeBox(ftyp({ isQuickTime: this.isQuickTime, holdsAvc: holdsAvc, fragmented: this.isFragmented, cmaf: this.isCmaf, })); if (this.format._options.onFtyp) { const { data, start } = boxWriter.writer.stopTrackingWrites(); this.format._options.onFtyp(data, start); } this.ftypSize = boxWriter.writer.getPos(); if (this.isCmaf) { await this.initWriter.flush(); } } if (this.fastStart === 'in-memory') { // We're write at finalization } else if (this.fastStart === 'reserve') { // Validate that all tracks have set maximumPacketCount for (const track of this.output._tracks) { if (track.metadata.maximumPacketCount === undefined) { throw new Error('All tracks must specify maximumPacketCount in their metadata when using' + ' fastStart: \'reserve\'.'); } } // We'll start writing once we know all tracks } else if (this.isFragmented) { // We write the moov box once we write out the first fragment to make sure we get the decoder configs } else { assert(this.writer); assert(this.boxWriter); if (this.format._options.onMdat) { this.writer.startTrackingWrites(); } this.mdat = mdat(true); // Reserve large size by default, can refine this when finalizing. 
this.boxWriter.writeBox(this.mdat); } await this.writer?.flush(); release(); } allTracksAreKnown() { for (const track of this.output._tracks) { if (!track.source._closed && !this.trackDatas.some(x => x.track === track)) { return false; // We haven't seen a sample from this open track yet } } return true; } async getMimeType() { await this.allTracksKnown.promise; const codecStrings = this.trackDatas.map((trackData) => { if (trackData.type === 'video') { return trackData.info.decoderConfig.codec; } else if (trackData.type === 'audio') { return trackData.info.decoderConfig.codec; } else { const map = { webvtt: 'wvtt', }; return map[trackData.track.source._codec]; } }); return buildIsobmffMimeType({ isQuickTime: this.isQuickTime, hasVideo: this.trackDatas.some(x => x.type === 'video'), hasAudio: this.trackDatas.some(x => x.type === 'audio'), codecStrings, }); } getVideoTrackData(track, packet, meta) { const existingTrackData = this.trackDatas.find(x => x.track === track); if (existingTrackData) { return existingTrackData; } validateVideoChunkMetadata(meta); assert(meta); assert(meta.decoderConfig); const decoderConfig = { ...meta.decoderConfig }; assert(decoderConfig.codedWidth !== undefined); assert(decoderConfig.codedHeight !== undefined); let requiresAnnexBTransformation = false; if (track.source._codec === 'avc' && !decoderConfig.description) { // ISOBMFF can only hold AVC in the AVCC format, not in Annex B, but the missing description indicates // Annex B. This means we'll need to do some converterino. const decoderConfigurationRecord = extractAvcDecoderConfigurationRecord(packet.data); if (!decoderConfigurationRecord) { throw new Error('Couldn\'t extract an AVCDecoderConfigurationRecord from the AVC packet. 
Make sure the packets are' + ' in Annex B format (as specified in ITU-T-REC-H.264) when not providing a description, or' + ' provide a description (must be an AVCDecoderConfigurationRecord as specified in ISO 14496-15)' + ' and ensure the packets are in AVCC format.'); } decoderConfig.description = serializeAvcDecoderConfigurationRecord(decoderConfigurationRecord); requiresAnnexBTransformation = true; } else if (track.source._codec === 'hevc' && !decoderConfig.description) { // ISOBMFF can only hold HEVC in the HEVC format, not in Annex B, but the missing description indicates // Annex B. This means we'll need to do some converterino. const decoderConfigurationRecord = extractHevcDecoderConfigurationRecord(packet.data); if (!decoderConfigurationRecord) { throw new Error('Couldn\'t extract an HEVCDecoderConfigurationRecord from the HEVC packet. Make sure the packets' + ' are in Annex B format (as specified in ITU-T-REC-H.265) when not providing a description, or' + ' provide a description (must be an HEVCDecoderConfigurationRecord as specified in ISO 14496-15)' + ' and ensure the packets are in HEVC format.'); } decoderConfig.description = serializeHevcDecoderConfigurationRecord(decoderConfigurationRecord); requiresAnnexBTransformation = true; } // The frame rate set by the user may not be an integer. Since timescale is an integer, we'll approximate the // frame time (inverse of frame rate) with a rational number, then use that approximation's denominator // as the timescale. const timescale = computeRationalApproximation(1 / (track.metadata.frameRate ?? GLOBAL_TIMESCALE), 1e6).den; const displayAspectWidth = decoderConfig.displayAspectWidth; const displayAspectHeight = decoderConfig.displayAspectHeight; const pixelAspectRatio = displayAspectWidth === undefined || displayAspectHeight === undefined ? 
{ num: 1, den: 1 } : simplifyRational({ num: displayAspectWidth * decoderConfig.codedHeight, den: displayAspectHeight * decoderConfig.codedWidth, }); const newTrackData = { muxer: this, track, type: 'video', info: { width: decoderConfig.codedWidth, height: decoderConfig.codedHeight, pixelAspectRatio, decoderConfig: decoderConfig, requiresAnnexBTransformation, }, timescale, samples: [], sampleQueue: [], timestampProcessingQueue: [], timeToSampleTable: [], compositionTimeOffsetTable: [], lastTimescaleUnits: null, lastSample: null, startTimestampOffset: null, finalizedChunks: [], currentChunk: null, compactlyCodedChunkTable: [], closed: false, }; this.trackDatas.push(newTrackData); this.trackDatas.sort((a, b) => a.track.id - b.track.id); if (this.allTracksAreKnown()) { this.allTracksKnown.resolve(); } return newTrackData; } getAudioTrackData(track, packet, meta) { const existingTrackData = this.trackDatas.find(x => x.track === track); if (existingTrackData) { return existingTrackData; } validateAudioChunkMetadata(meta); assert(meta); assert(meta.decoderConfig); const decoderConfig = { ...meta.decoderConfig }; let requiresAdtsStripping = false; if (track.source._codec === 'aac' && !decoderConfig.description) { // ISOBMFF can only hold AAC in raw format, not ADTS, but the missing description indicates ADTS. // Parse the first packet to extract the AudioSpecificConfig. const adtsFrame = readAdtsFrameHeader(FileSlice.tempFromBytes(packet.data)); if (!adtsFrame) { throw new Error('Couldn\'t parse ADTS header from the AAC packet. 
Make sure the packets are in ADTS format' + ' (as specified in ISO 13818-7) when not providing a description, or provide a description' + ' (must be an AudioSpecificConfig as specified in ISO 14496-3) and ensure the packets' + ' are raw AAC data.'); } const sampleRate = aacFrequencyTable[adtsFrame.samplingFrequencyIndex]; const numberOfChannels = aacChannelMap[adtsFrame.channelConfiguration]; if (sampleRate === undefined || numberOfChannels === undefined) { throw new Error('Invalid ADTS frame header.'); } decoderConfig.description = buildAacAudioSpecificConfig({ objectType: adtsFrame.objectType, sampleRate, numberOfChannels, }); requiresAdtsStripping = true; } const newTrackData = { muxer: this, track, type: 'audio', info: { numberOfChannels: meta.decoderConfig.numberOfChannels, sampleRate: meta.decoderConfig.sampleRate, decoderConfig, requiresPcmTransformation: !this.isFragmented && PCM_AUDIO_CODECS.includes(track.source._codec), expectedNextPcmPacketTimestamp: null, requiresAdtsStripping, firstPacket: packet, }, timescale: decoderConfig.sampleRate, samples: [], sampleQueue: [], timestampProcessingQueue: [], timeToSampleTable: [], compositionTimeOffsetTable: [], lastTimescaleUnits: null, lastSample: null, startTimestampOffset: null, finalizedChunks: [], currentChunk: null, compactlyCodedChunkTable: [], closed: false, }; this.trackDatas.push(newTrackData); this.trackDatas.sort((a, b) => a.track.id - b.track.id); if (this.allTracksAreKnown()) { this.allTracksKnown.resolve(); } return newTrackData; } getSubtitleTrackData(track, meta) { const existingTrackData = this.trackDatas.find(x => x.track === track); if (existingTrackData) { return existingTrackData; } validateSubtitleMetadata(meta); assert(meta); assert(meta.config); const newTrackData = { muxer: this, track, type: 'subtitle', info: { config: meta.config, }, timescale: 1000, // Reasonable samples: [], sampleQueue: [], timestampProcessingQueue: [], timeToSampleTable: [], compositionTimeOffsetTable: [], 
lastTimescaleUnits: null, lastSample: null, startTimestampOffset: null, finalizedChunks: [], currentChunk: null, compactlyCodedChunkTable: [], closed: false, lastCueEndTimestamp: 0, cueQueue: [], nextSourceId: 0, cueToSourceId: new WeakMap(), }; this.trackDatas.push(newTrackData); this.trackDatas.sort((a, b) => a.track.id - b.track.id); if (this.allTracksAreKnown()) { this.allTracksKnown.resolve(); } return newTrackData; } async addEncodedVideoPacket(track, packet, meta) { const release = await this.mutex.acquire(); try { const trackData = this.getVideoTrackData(track, packet, meta); let packetData = packet.data; if (trackData.info.requiresAnnexBTransformation) { const nalUnits = [...iterateNalUnitsInAnnexB(packetData)] .map(loc => packetData.subarray(loc.offset, loc.offset + loc.length)); if (nalUnits.length === 0) { // It's not valid Annex B data throw new Error('Failed to transform packet data. Make sure all packets are provided in Annex B format, as' + ' specified in ITU-T-REC-H.264 and ITU-T-REC-H.265.'); } // We don't strip things like SPS or PPS NALUs here, mainly because they can also appear in the middle // of a stream and potentially modify the parameters of it. So, let's just leave them in to be sure. 
packetData = concatNalUnitsInLengthPrefixed(nalUnits, 4); } this.validateTimestamp(trackData.track, packet.timestamp, packet.type === 'key'); const internalSample = this.createSampleForTrack(trackData, packetData, packet.timestamp, packet.duration, packet.type); await this.registerSample(trackData, internalSample); } finally { release(); } } async addEncodedAudioPacket(track, packet, meta) { const release = await this.mutex.acquire(); try { const trackData = this.getAudioTrackData(track, packet, meta); let packetData = packet.data; if (trackData.info.requiresAdtsStripping) { const adtsFrame = readAdtsFrameHeader(FileSlice.tempFromBytes(packetData)); if (!adtsFrame) { throw new Error('Expected ADTS frame, didn\'t get one.'); } const headerLength = adtsFrame.crcCheck === null ? MIN_ADTS_FRAME_HEADER_SIZE : MAX_ADTS_FRAME_HEADER_SIZE; packetData = packetData.subarray(headerLength); } this.validateTimestamp(trackData.track, packet.timestamp, packet.type === 'key'); let timestamp = packet.timestamp; let duration = packet.duration; if (trackData.info.requiresPcmTransformation) { // Packets may have only approximate timestamp/duration information, but for our PCM logic, we need it // to be precise. So here, we refine the values. 
const pcmInfo = parsePcmCodec(trackData.info.decoderConfig.codec); const frameSize = pcmInfo.sampleSize * trackData.info.numberOfChannels; // Compute the precise duration duration = packetData.byteLength / frameSize / trackData.info.sampleRate; if (trackData.info.expectedNextPcmPacketTimestamp !== null) { const diff = timestamp - trackData.info.expectedNextPcmPacketTimestamp; if (diff < 0.01) { timestamp = trackData.info.expectedNextPcmPacketTimestamp; } else { const paddedDuration = await this.padWithSilence(trackData, trackData.info.expectedNextPcmPacketTimestamp, diff); timestamp = trackData.info.expectedNextPcmPacketTimestamp + paddedDuration; } } trackData.info.expectedNextPcmPacketTimestamp = timestamp + duration; } const internalSample = this.createSampleForTrack(trackData, packetData, timestamp, duration, packet.type); await this.registerSample(trackData, internalSample); } finally { release(); } } async padWithSilence(trackData, timestamp, duration) { const deltaInTimescale = intoTimescale(duration, trackData.timescale); duration = deltaInTimescale / trackData.timescale; if (deltaInTimescale > 0) { const { sampleSize, silentValue } = parsePcmCodec(trackData.info.decoderConfig.codec); const samplesNeeded = deltaInTimescale * trackData.info.numberOfChannels; const data = new Uint8Array(sampleSize * samplesNeeded).fill(silentValue); const paddingSample = this.createSampleForTrack(trackData, new Uint8Array(data.buffer), timestamp, duration, 'key'); await this.registerSample(trackData, paddingSample); } return duration; } async addSubtitleCue(track, cue, meta) { const release = await this.mutex.acquire(); try { const trackData = this.getSubtitleTrackData(track, meta); this.validateTimestamp(trackData.track, cue.timestamp, true); if (track.source._codec === 'webvtt') { trackData.cueQueue.push(cue); await this.processWebVTTCues(trackData, cue.timestamp); } else { // TODO } } finally { release(); } } async processWebVTTCues(trackData, until) { // WebVTT cues need 
to undergo special processing as empty sections need to be padded out with samples, and // overlapping samples require special logic. The algorithm produces the format specified in ISO 14496-30. while (trackData.cueQueue.length > 0) { const timestamps = new Set([]); for (const cue of trackData.cueQueue) { assert(cue.timestamp <= until); assert(trackData.lastCueEndTimestamp <= cue.timestamp + cue.duration); timestamps.add(Math.max(cue.timestamp, trackData.lastCueEndTimestamp)); // Start timestamp timestamps.add(cue.timestamp + cue.duration); // End timestamp } const sortedTimestamps = [...timestamps].sort((a, b) => a - b); // These are the timestamps of the next sample we'll create: const sampleStart = sortedTimestamps[0]; const sampleEnd = sortedTimestamps[1] ?? sampleStart; if (until < sampleEnd) { break; } // We may need to pad out empty space with an vtte box if (trackData.lastCueEndTimestamp < sampleStart) { this.auxWriter.seek(0); const box = vtte(); this.auxBoxWriter.writeBox(box); const body = this.auxTarget._getSlice(0, this.auxWriter.getPos()); const sample = this.createSampleForTrack(trackData, body, trackData.lastCueEndTimestamp, sampleStart - trackData.lastCueEndTimestamp, 'key'); await this.registerSample(trackData, sample); trackData.lastCueEndTimestamp = sampleStart; } this.auxWriter.seek(0); for (let i = 0; i < trackData.cueQueue.length; i++) { const cue = trackData.cueQueue[i]; if (cue.timestamp >= sampleEnd) { break; } inlineTimestampRegex.lastIndex = 0; const containsTimestamp = inlineTimestampRegex.test(cue.text); const endTimestamp = cue.timestamp + cue.duration; let sourceId = trackData.cueToSourceId.get(cue); if (sourceId === undefined && sampleEnd < endTimestamp) { // We know this cue will appear in more than one sample, therefore we need to mark it with a // unique ID sourceId = trackData.nextSourceId++; trackData.cueToSourceId.set(cue, sourceId); } if (cue.notes) { // Any notes/comments are included in a special vtta box const box = 
vtta(cue.notes); this.auxBoxWriter.writeBox(box); } const box = vttc(cue.text, containsTimestamp ? sampleStart : null, cue.identifier ?? null, cue.settings ?? null, sourceId ?? null); this.auxBoxWriter.writeBox(box); if (endTimestamp === sampleEnd) { // The cue won't appear in any future sample, so we're done with it trackData.cueQueue.splice(i--, 1); } } const body = this.auxTarget._getSlice(0, this.auxWriter.getPos()); const sample = this.createSampleForTrack(trackData, body, sampleStart, sampleEnd - sampleStart, 'key'); await this.registerSample(trackData, sample); trackData.lastCueEndTimestamp = sampleEnd; } } createSampleForTrack(trackData, data, timestamp, duration, type) { const sample = { timestamp, decodeTimestamp: timestamp, // This may be refined later duration, data, size: data.byteLength, type, timescaleUnitsToNextSample: intoTimescale(duration, trackData.timescale), // Will be refined }; return sample; } processTimestamps(trackData, nextSample) { if (trackData.timestampProcessingQueue.length === 0) { return; } if (trackData.type === 'audio' && trackData.info.requiresPcmTransformation) { if (!this.isFragmented) { // The first timestamp is the lowest trackData.startTimestampOffset ??= trackData.timestampProcessingQueue[0].timestamp; } let totalDuration = 0; // Compute the total duration in the track timescale (which is equal to the amount of PCM audio samples) // and simply say that's how many new samples there are. 
for (let i = 0; i < trackData.timestampProcessingQueue.length; i++) { const sample = trackData.timestampProcessingQueue[i]; const duration = intoTimescale(sample.duration, trackData.timescale); totalDuration += duration; } if (trackData.timeToSampleTable.length === 0) { trackData.timeToSampleTable.push({ sampleCount: totalDuration, sampleDelta: 1, }); } else { const lastEntry = last(trackData.timeToSampleTable); lastEntry.sampleCount += totalDuration; } trackData.timestampProcessingQueue.length = 0; return; } const sortedTimestamps = trackData.timestampProcessingQueue.map(x => x.timestamp).sort((a, b) => a - b); if (!this.isFragmented) { trackData.startTimestampOffset ??= sortedTimestamps[0]; } for (let i = 0; i < trackData.timestampProcessingQueue.length; i++) { const sample = trackData.timestampProcessingQueue[i]; // Since the user only supplies presentation time, but these may be out of order, we reverse-engineer from // that a sensible decode timestamp. The notion of a decode timestamp doesn't really make sense // (presentation timestamp & decode order are all you need), but it is a concept in ISOBMFF so we need to // model it. 
sample.decodeTimestamp = sortedTimestamps[i]; const sampleCompositionTimeOffset = intoTimescale(sample.timestamp - sample.decodeTimestamp, trackData.timescale); const durationInTimescale = intoTimescale(sample.duration, trackData.timescale); if (trackData.lastTimescaleUnits !== null) { assert(trackData.lastSample); const timescaleUnits = intoTimescale(sample.decodeTimestamp, trackData.timescale, false); const delta = Math.round(timescaleUnits - trackData.lastTimescaleUnits); assert(delta >= 0); trackData.lastTimescaleUnits += delta; trackData.lastSample.timescaleUnitsToNextSample = delta; if (!this.isFragmented) { let lastTableEntry = last(trackData.timeToSampleTable); assert(lastTableEntry); if (lastTableEntry.sampleCount === 1) { lastTableEntry.sampleDelta = delta; const entryBefore = trackData.timeToSampleTable[trackData.timeToSampleTable.length - 2]; if (entryBefore && entryBefore.sampleDelta === delta) { // If the delta is the same as the previous one, merge the two entries entryBefore.sampleCount++; trackData.timeToSampleTable.pop(); lastTableEntry = entryBefore; } } else if (lastTableEntry.sampleDelta !== delta) { // The delta has changed, so we need a new entry to reach the current sample lastTableEntry.sampleCount--; trackData.timeToSampleTable.push(lastTableEntry = { sampleCount: 1, sampleDelta: delta, }); } if (lastTableEntry.sampleDelta === durationInTimescale) { // The sample's duration matches the delta, so we can increment the count lastTableEntry.sampleCount++; } else { // Add a new entry in order to maintain the last sample's true duration trackData.timeToSampleTable.push({ sampleCount: 1, sampleDelta: durationInTimescale, }); } const lastCompositionTimeOffsetTableEntry = last(trackData.compositionTimeOffsetTable); assert(lastCompositionTimeOffsetTableEntry); if (lastCompositionTimeOffsetTableEntry.sampleCompositionTimeOffset === sampleCompositionTimeOffset) { // Simply increment the count lastCompositionTimeOffsetTableEntry.sampleCount++; } else { 
// The composition time offset has changed, so create a new entry with the new composition time // offset trackData.compositionTimeOffsetTable.push({ sampleCount: 1, sampleCompositionTimeOffset: sampleCompositionTimeOffset, }); } } } else { // Decode timestamp of the first sample trackData.lastTimescaleUnits = intoTimescale(sample.decodeTimestamp, trackData.timescale, false); if (!this.isFragmented) { trackData.timeToSampleTable.push({ sampleCount: 1, sampleDelta: durationInTimescale, }); trackData.compositionTimeOffsetTable.push({ sampleCount: 1, sampleCompositionTimeOffset: sampleCompositionTimeOffset, }); } } trackData.lastSample = sample; } trackData.timestampProcessingQueue.length = 0; assert(trackData.lastSample); assert(trackData.lastTimescaleUnits !== null); if (nextSample !== undefined && trackData.lastSample.timescaleUnitsToNextSample === 0) { assert(nextSample.type === 'key'); // Given the next sample, we can make a guess about the duration of the last sample. This avoids having // the last sample's duration in each fragment be "0" for fragmented files. The guess we make here is // actually correct most of the time, since typically, no delta frame with a lower timestamp follows the key // frame (although it can happen). 
const timescaleUnits = intoTimescale(nextSample.timestamp, trackData.timescale, false); const delta = Math.round(timescaleUnits - trackData.lastTimescaleUnits); trackData.lastSample.timescaleUnitsToNextSample = delta; } } async registerSample(trackData, sample) { if (sample.type === 'key') { this.processTimestamps(trackData, sample); } trackData.timestampProcessingQueue.push(sample); if (this.isFragmented) { trackData.sampleQueue.push(sample); await this.interleaveSamples(); } else if (this.fastStart === 'reserve') { await this.registerSampleFastStartReserve(trackData, sample); } else { await this.addSampleToTrack(trackData, sample); } } async addSampleToTrack(trackData, sample) { if (!this.isFragmented) { trackData.samples.push(sample); if (this.fastStart === 'reserve') { const maximumPacketCount = trackData.track.metadata.maximumPacketCount; assert(maximumPacketCount !== undefined); if (trackData.samples.length > maximumPacketCount) { throw new Error(`Track #${trackData.track.id} has already reached the maximum packet count` + ` (${maximumPacketCount}). Either add less packets or increase the maximum packet count.`); } } } let beginNewChunk = false; if (!trackData.currentChunk) { beginNewChunk = true; } else { // Timestamp don't need to be monotonic (think B-frames), so we may need to update the start timestamp of // the chunk trackData.currentChunk.startTimestamp = Math.min(trackData.currentChunk.startTimestamp, sample.timestamp); const currentChunkDuration = sample.timestamp - trackData.currentChunk.startTimestamp; if (this.isFragmented) { // We can only finalize this fragment (and begin a new one) if we know that each track will be able to // start the new one with a key frame. 
const keyFrameQueuedEverywhere = this.trackDatas.every((otherTrackData) => {
                    // For the track receiving this sample, the sample itself must be a key frame
                    if (trackData === otherTrackData) {
                        return sample.type === 'key';
                    }
                    // For every other track, inspect the first sample still waiting in its queue
                    const firstQueuedSample = otherTrackData.sampleQueue[0];
                    if (firstQueuedSample) {
                        return firstQueuedSample.type === 'key';
                    }
                    // Nothing is queued for that track: this only counts if the track is closed
                    return otherTrackData.closed;
                });
                // All three conditions must hold: the chunk spans at least the minimum fragment duration, every
                // track can start on a key frame, and this sample lies past the highest timestamp written so far.
                if (currentChunkDuration >= this.minimumFragmentDuration
                    && keyFrameQueuedEverywhere
                    && sample.timestamp > this.maxWrittenTimestamp) {
                    beginNewChunk = true;
                    await this.finalizeFragment();
                }
            }
            else {
                beginNewChunk = currentChunkDuration >= 0.5; // Chunk is long enough, we need a new one
            }
        }
        if (beginNewChunk) {
            // A chunk that is still present here gets finalized before it is replaced
            if (trackData.currentChunk) {
                await this.finalizeCurrentChunk(trackData);
            }
            trackData.currentChunk = {
                startTimestamp: sample.timestamp,
                samples: [],
                offset: null,
                moofOffset: null,
            };
        }
        assert(trackData.currentChunk);
        trackData.currentChunk.samples.push(sample);
        if (this.isFragmented) {
            // Keep track of the timestamp range covered by the samples added so far
            this.maxWrittenTimestamp = Math.max(this.maxWrittenTimestamp, sample.timestamp);
            this.maxWrittenEndTimestamp = Math.max(this.maxWrittenEndTimestamp, sample.timestamp + sample.duration);
            this.minWrittenTimestamp = Math.min(this.minWrittenTimestamp, sample.timestamp);
        }
    }

    /**
     * Finalizes the current chunk of a track in a non-fragmented file (asserted): records the chunk in the per-track
     * and global finalized-chunk lists, updates the compactly coded chunk table and, unless the fast-start mode is
     * 'in-memory', writes the chunk's sample data to the writer and flushes it. Does nothing if the track has no
     * current chunk.
     *
     * @param trackData - Muxer-internal state of the track whose chunk is finalized.
     */
    async finalizeCurrentChunk(trackData) {
        assert(!this.isFragmented);
        assert(this.writer);
        if (!trackData.currentChunk)
            return;
        trackData.finalizedChunks.push(trackData.currentChunk);
        this.finalizedChunks.push(trackData.currentChunk);
        let sampleCount = trackData.currentChunk.samples.length;
        if (trackData.type === 'audio' && trackData.info.requiresPcmTransformation) {
            // For these audio tracks, the count is the sum of the sample durations converted into the track's
            // timescale, rather than the number of samples in the chunk
            sampleCount = trackData.currentChunk.samples
                .reduce((acc, sample) => acc + intoTimescale(sample.duration, trackData.timescale), 0);
        }
        // Only add a table entry when the samples-per-chunk value differs from the previous entry
        if (trackData.compactlyCodedChunkTable.length === 0
            || last(trackData.compactlyCodedChunkTable).samplesPerChunk !== sampleCount) {
            trackData.compactlyCodedChunkTable.push({
                firstChunk: trackData.finalizedChunks.length, // 1-indexed
                samplesPerChunk: sampleCount,
            });
        }
        if (this.fastStart === 'in-memory') {
trackData.currentChunk.offset = 0; // We'll compute the proper offset when finalizing return; } // Write out the data trackData.currentChunk.offset = this.writer.getPos(); for (const sample of trackData.currentChunk.samples) { assert(sample.data); this.writer.write(sample.data); sample.data = null; // Can be GC'd } await this.writer.flush(); } async interleaveSamples(isFinalCall = false) { assert(this.isFragmented); if (!isFinalCall && !this.allTracksAreKnown()) { return; // We can't interleave yet as we don't yet know how many tracks we'll truly have } outer: while (true) { let trackWithMinTimestamp = null; let minTimestamp = Infinity; for (const trackData of this.trackDatas) { if (!isFinalCall && trackData.sampleQueue.length === 0 && !trackData.closed) { break outer; } if (trackData.sampleQueue.length > 0 && trackData.sampleQueue[0].timestamp < minTimestamp) { trackWithMinTimestamp = trackData; minTimestamp = trackData.sampleQueue[0].timestamp; } } if (!trackWithMinTimestamp) { break; } const sample = trackWithMinTimestamp.sampleQueue.shift(); await this.addSampleToTrack(trackWithMinTimestamp, sample); } } async finalizeFragment(flushWriter = !this.isCmaf) { assert(this.isFragmented); const fragmentNumber = this.nextFragmentNumber++; if (fragmentNumber === 1) { const boxWriter = this.initBoxWriter ?? 
this.boxWriter; assert(boxWriter); if (this.format._options.onMoov) { boxWriter.writer.startTrackingWrites(); } // Write the moov box now that we have all decoder configs const movieBox = moov(this); boxWriter.writeBox(movieBox); if (this.format._options.onMoov) { const { data, start } = boxWriter.writer.stopTrackingWrites(); this.format._options.onMoov(data, start); } if (this.isCmaf) { assert(this.initWriter); await this.initWriter.flush(); await this.initWriter.finalize(); // Init segment is done // Only now, init the main writer; this way the init writer is fully done before the main writer is // even acquired this.writer = await this.output._getRootWriter(true); this.boxWriter = new IsobmffBoxWriter(this.writer); const stypSize = this.boxWriter.measureBox(styp()); const sidxSize = this.boxWriter.measureBox(sidx(this, 0)); this.segmentHeaderSize = stypSize + sidxSize; this.writer.seek(this.segmentHeaderSize); // Make room for the header to be written later } } assert(this.writer); assert(this.boxWriter); // Not all tracks need to be present in every fragment const tracksInFragment = this.trackDatas.filter(x => x.currentChunk); // Create an initial moof box and measure it; we need this to know where the following mdat box will begin const moofBox = moof(fragmentNumber, tracksInFragment); const moofOffset = this.writer.getPos(); const mdatStartPos = moofOffset + this.boxWriter.measureBox(moofBox); let currentPos = mdatStartPos + MIN_BOX_HEADER_SIZE; let fragmentStartTimestamp = Infinity; for (const trackData of tracksInFragment) { trackData.currentChunk.offset = currentPos; trackData.currentChunk.moofOffset = moofOffset; for (const sample of trackData.currentChunk.samples) { currentPos += sample.size; } fragmentStartTimestamp = Math.min(fragmentStartTimestamp, trackData.currentChunk.startTimestamp); } const mdatSize = currentPos - mdatStartPos; const needsLargeMdatSize = mdatSize >= 2 ** 32; if (needsLargeMdatSize) { // Shift all offsets by 8. 
// Previously, all chunks were shifted assuming the large box size, but due to what
            // I suspect is a bug in WebKit, it failed in Safari (when livestreaming with MSE, not for static playback).
            for (const trackData of tracksInFragment) {
                trackData.currentChunk.offset += MAX_BOX_HEADER_SIZE - MIN_BOX_HEADER_SIZE;
            }
        }
        // When an onMoof callback is set, record the bytes written for the moof box so they can be passed to it
        if (this.format._options.onMoof) {
            this.writer.startTrackingWrites();
        }
        // Build the moof box again now that the chunk offsets are set, and write it
        const newMoofBox = moof(fragmentNumber, tracksInFragment);
        this.boxWriter.writeBox(newMoofBox);
        if (this.format._options.onMoof) {
            const { data, start } = this.writer.stopTrackingWrites();
            this.format._options.onMoof(data, start, fragmentStartTimestamp);
        }
        // The written moof box must have exactly the size that was measured earlier
        assert(this.writer.getPos() === mdatStartPos);
        if (this.format._options.onMdat) {
            this.writer.startTrackingWrites();
        }
        const mdatBox = mdat(needsLargeMdatSize);
        // NOTE(review): mdatSize was computed assuming the small box header - confirm that the box writer accounts
        // for the additional header bytes when the large size is used.
        mdatBox.size = mdatSize;
        this.boxWriter.writeBox(mdatBox);
        // Position the writer directly behind the mdat header
        this.writer.seek(mdatStartPos + (needsLargeMdatSize ? MAX_BOX_HEADER_SIZE : MIN_BOX_HEADER_SIZE));
        // Write sample data
        for (const trackData of tracksInFragment) {
            for (const sample of trackData.currentChunk.samples) {
                this.writer.write(sample.data);
                sample.data = null; // Can be GC'd
            }
        }
        if (this.format._options.onMdat) {
            const { data, start } = this.writer.stopTrackingWrites();
            this.format._options.onMdat(data, start);
        }
        // Move each written chunk to the finalized lists and clear it from its track
        for (const trackData of tracksInFragment) {
            trackData.finalizedChunks.push(trackData.currentChunk);
            this.finalizedChunks.push(trackData.currentChunk);
            trackData.currentChunk = null;
        }
        if (flushWriter) {
            await this.writer.flush();
        }
    }

    /**
     * Handles a new sample when the fast-start mode is 'reserve'. While not all tracks are known, the sample is only
     * queued on its track. Once they are, space for the moov box is reserved after the ftyp box (on the first such
     * call), the mdat box is begun, all queued samples are added, and then the given sample is added.
     *
     * @param trackData - Muxer-internal state of the track the sample belongs to.
     * @param sample - The sample to register.
     */
    async registerSampleFastStartReserve(trackData, sample) {
        assert(this.writer);
        assert(this.boxWriter);
        if (this.allTracksAreKnown()) {
            if (!this.mdat) {
                // We finally know all tracks, let's reserve space for the moov box
                const moovBox = moov(this);
                const moovSize = this.boxWriter.measureBox(moovBox);
                // Reserve the currently measured moov size plus the worst-case sample table size
                const reservedSize = moovSize + this.computeSampleTableSizeUpperBound() + 4096; // Just a little extra headroom
                assert(this.ftypSize !== null);
this.writer.seek(this.ftypSize + reservedSize); if (this.format._options.onMdat) { this.writer.startTrackingWrites(); } this.mdat = mdat(true); this.boxWriter.writeBox(this.mdat); // Now write everything that was queued for (const trackData of this.trackDatas) { for (const sample of trackData.sampleQueue) { await this.addSampleToTrack(trackData, sample); } trackData.sampleQueue.length = 0; } } await this.addSampleToTrack(trackData, sample); } else { // Queue it for when we know all tracks trackData.sampleQueue.push(sample); } } computeSampleTableSizeUpperBound() { assert(this.fastStart === 'reserve'); let upperBound = 0; for (const trackData of this.trackDatas) { const n = trackData.track.metadata.maximumPacketCount; assert(n !== undefined); // We validated this earlier // Given the max allowed packet count, compute the space they'll take up in the Sample Table Box, assuming // the worst case for each individual box: // stts box - since it is compactly coded, the maximum length of this table will be 2/3n upperBound += (4 + 4) * Math.ceil(2 / 3 * n); // stss box - 1 entry per sample upperBound += 4 * n; // ctts box - since it is compactly coded, the maximum length of this table will be 2/3n upperBound += (4 + 4) * Math.ceil(2 / 3 * n); // stsc box - since it is compactly coded, the maximum length of this table will be 2/3n upperBound += (4 + 4 + 4) * Math.ceil(2 / 3 * n); // stsz box - 1 entry per sample upperBound += 4 * n; // co64 box - we assume 1 sample per chunk and 64-bit chunk offsets (co64 instead of stco) upperBound += 8 * n; } return upperBound; } // eslint-disable-next-line @typescript-eslint/no-misused-promises async onTrackClose(track) { const release = await this.mutex.acquire(); const trackData = this.trackDatas.find(x => x.track === track); if (trackData) { trackData.closed = true; if (trackData.type === 'subtitle' && track.source._codec === 'webvtt') { await this.processWebVTTCues(trackData, Infinity); } this.processTimestamps(trackData); } if 
(this.allTracksAreKnown()) {
            // Closing this track may have been the last thing needed for the set of tracks to be known
            this.allTracksKnown.resolve();
        }
        if (this.isFragmented) {
            // Since a track is now closed, we may be able to write out chunks that were previously waiting
            await this.interleaveSamples();
        }
        // Release the mutex acquired at the start of this method
        release();
    }

    /** Finalizes the file, making it ready for use. Must be called after all video and audio chunks have been added. */
    async finalize() {
        const release = await this.mutex.acquire();
        this.allTracksKnown.resolve();
        // Treat every track as closed and process whatever is still pending for it
        for (const trackData of this.trackDatas) {
            trackData.closed = true;
            if (trackData.type === 'subtitle' && trackData.track.source._codec === 'webvtt') {
                await this.processWebVTTCues(trackData, Infinity);
            }
            this.processTimestamps(trackData);
        }
        if (this.isFragmented) {
            // Drain all sample queues unconditionally, then write out the last fragment
            await this.interleaveSamples(true);
            await this.finalizeFragment(false); // Don't flush the last fragment as we will flush it with the mfra box
        }
        else {
            for (const trackData of this.trackDatas) {
                await this.finalizeCurrentChunk(trackData);
                // Must hold because we will have processed at least one sample
                assert(trackData.startTimestampOffset !== null);
                // Shift all of the samples by the start offset. We'll then write out an edit list that will shift them
                // back to their proper spot in the composition.
                for (let i = 0; i < trackData.samples.length; i++) {
                    const sample = trackData.samples[i];
                    sample.timestamp -= trackData.startTimestampOffset;
                    sample.decodeTimestamp -= trackData.startTimestampOffset;
                }
            }
        }
        assert(this.writer);
        assert(this.boxWriter);
        if (this.fastStart === 'in-memory') {
            this.mdat = mdat(false);
            let mdatSize;
            // We know how many chunks there are, but computing the chunk positions requires an iterative approach:
            // In order to