mediabunny

/*!
 * Copyright (c) 2025-present, Vanilagy and contributors
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

import {
	AacCodecInfo,
	AudioCodec,
	extractAudioCodecString,
	extractVideoCodecString,
	MediaCodec,
	parseAacAudioSpecificConfig,
	parsePcmCodec,
	PCM_AUDIO_CODECS,
	PcmAudioCodec,
	VideoCodec,
} from '../codec';
import {
	AvcDecoderConfigurationRecord,
	HevcDecoderConfigurationRecord,
	Vp9CodecInfo,
	Av1CodecInfo,
	extractVp9CodecInfoFromPacket,
	extractAv1CodecInfoFromPacket,
} from '../codec-data';
import { Demuxer } from '../demuxer';
import { Input } from '../input';
import {
	InputAudioTrack,
	InputAudioTrackBacking,
	InputTrack,
	InputTrackBacking,
	InputVideoTrack,
	InputVideoTrackBacking,
} from '../input-track';
import { PacketRetrievalOptions } from '../media-sink';
import {
	assert,
	COLOR_PRIMARIES_MAP_INVERSE,
	MATRIX_COEFFICIENTS_MAP_INVERSE,
	TRANSFER_CHARACTERISTICS_MAP_INVERSE,
	binarySearchLessOrEqual,
	binarySearchExact,
	Rotation,
	last,
	AsyncMutex,
	findLastIndex,
	UNDETERMINED_LANGUAGE,
	TransformationMatrix,
	roundToPrecision,
	isIso639Dash2LanguageCode,
	roundToMultiple,
	normalizeRotation,
	Bitstream,
	insertSorted,
} from '../misc';
import { EncodedPacket, PLACEHOLDER_DATA } from '../packet';
import { Reader } from '../reader';
import { buildIsobmffMimeType } from './isobmff-misc';
import { IsobmffReader, MAX_BOX_HEADER_SIZE, MIN_BOX_HEADER_SIZE } from './isobmff-reader';

/**
 * Demuxer-internal bookkeeping for a single `trak` box. The common fields are filled in while the track's child
 * boxes are traversed; `info` stays `null` until a `hdlr` box identifies the track as video or audio.
 */
type InternalTrack = {
	/** Track ID from the `tkhd` box (-1 while unknown). */
	id: number;
	demuxer: IsobmffDemuxer;
	/** The public track object wrapping this internal track; assigned once the `trak` box was fully read. */
	inputTrack: InputTrack | null;
	/** Media timescale from the `mdhd` box (-1 while unknown). */
	timescale: number;
	/** Track duration as stored in the `tkhd` box. */
	durationInMovieTimescale: number;
	/** Track duration as stored in the `mdhd` box. */
	durationInMediaTimescale: number;
	/** Rotation derived from the `tkhd` transformation matrix, rounded to a multiple of 90 degrees. */
	rotation: Rotation;
	/** Language code decoded from the `mdhd` box, or `UNDETERMINED_LANGUAGE`. */
	languageCode: string;
	/** File position of the track's `stbl` box, used to parse the sample table lazily. */
	sampleTableByteOffset: number;
	/** Parsed sample table; `null` until `getSampleTableForTrack` has been called for this track. */
	sampleTable: SampleTable | null;
	fragmentLookupTable: FragmentLookupTableEntry[] | null;
	currentFragmentState: FragmentTrackState | null;
	/** Fragments known for this track; binary-searched by `moofOffset` when reading fragments. */
	fragments: Fragment[];
	fragmentsWithKeyFrame: Fragment[];
	/**
	 * The segment durations of all edit list entries leading up to the main one (from which the offset is taken.)
	 */
	editListPreviousSegmentDurations: number;
	/** The media time offset of the main edit list entry (with media time !== -1) */
	editListOffset: number;
} & ({
	info: null;
} | {
	info: {
		type: 'video';
		width: number;
		height: number;
		codec: VideoCodec | null;
		codecDescription: Uint8Array | null;
		colorSpace: VideoColorSpaceInit | null;
		avcCodecInfo: AvcDecoderConfigurationRecord | null;
		hevcCodecInfo: HevcDecoderConfigurationRecord | null;
		vp9CodecInfo: Vp9CodecInfo | null;
		av1CodecInfo: Av1CodecInfo | null;
	};
} | {
	info: {
		type: 'audio';
		numberOfChannels: number;
		sampleRate: number;
		codec: AudioCodec | null;
		codecDescription: Uint8Array | null;
		aacCodecInfo: AacCodecInfo | null;
	};
});

/** An internal track whose `info` is known to describe video. */
type InternalVideoTrack = InternalTrack & { info: { type: 'video' } };
/** An internal track whose `info` is known to describe audio. */
type InternalAudioTrack = InternalTrack & { info: { type: 'audio' } };

/** The parsed contents of a track's sample table, plus lookup structures derived from it. */
type SampleTable = {
	sampleTimingEntries: SampleTimingEntry[];
	sampleCompositionTimeOffsets: SampleCompositionTimeOffsetEntry[];
	sampleSizes: number[];
	keySampleIndices: number[] | null; // Samples that are keyframes
	chunkOffsets: number[];
	sampleToChunk: SampleToChunkEntry[];
	/** Samples sorted by presentation timestamp; only built when composition time offsets exist. */
	presentationTimestamps: {
		presentationTimestamp: number;
		sampleIndex: number;
	}[] | null;
	/**
	 * Provides a fast map from sample index to index in the sorted presentation timestamps array - so, a fast map from
	 * decode order to presentation order.
	 */
	presentationTimestampIndexMap: number[] | null;
};

/** A run of `count` consecutive samples, starting at `startIndex`, whose decode timestamps advance by `delta`. */
type SampleTimingEntry = {
	startIndex: number;
	startDecodeTimestamp: number;
	count: number;
	delta: number;
};

/** A run of `count` consecutive samples, starting at `startIndex`, sharing one composition time `offset`. */
type SampleCompositionTimeOffsetEntry = {
	startIndex: number;
	count: number;
	offset: number;
};

/** A run of chunks, starting at `startChunkIndex`, that all hold `samplesPerChunk` samples. */
type SampleToChunkEntry = {
	startSampleIndex: number;
	startChunkIndex: number;
	samplesPerChunk: number;
	sampleDescriptionIndex: number;
};

/** Per-track default sample values that apply to fragments. */
type FragmentTrackDefaults = {
	trackId: number;
	defaultSampleDescriptionIndex: number;
	defaultSampleDuration: number;
	defaultSampleSize: number;
	defaultSampleFlags: number;
};

/** Associates a timestamp with the file offset of a `moof` box. */
type FragmentLookupTableEntry = {
	timestamp: number;
	moofOffset: number;
};

/** Per-track state kept while a fragment is being parsed; `null` fields have not been specified. */
type FragmentTrackState = {
	baseDataOffset: number;
	sampleDescriptionIndex: number | null;
	defaultSampleDuration: number | null;
	defaultSampleSize: number | null;
	defaultSampleFlags: number | null;
	startTimestamp: number | null;
};

/** The samples one track contributes to one fragment. */
type FragmentTrackData = {
	startTimestamp: number;
	endTimestamp: number;
	firstKeyFrameTimestamp: number | null;
	samples: FragmentTrackSample[];
	presentationTimestamps: {
		presentationTimestamp: number;
		sampleIndex: number;
	}[];
	/** `false` while the timestamps still need to be offset by the end timestamp of a preceding fragment. */
	startTimestampIsFinal: boolean;
};

/** A single sample inside a fragment. */
type FragmentTrackSample = {
	presentationTimestamp: number;
	duration: number;
	byteOffset: number;
	byteSize: number;
	isKeyFrame: boolean;
};

/** A movie fragment (`moof` box) and the per-track data parsed from it. */
type Fragment = {
	moofOffset: number;
	moofSize: number;
	implicitBaseDataOffset: number;
	trackData: Map<InternalTrack['id'], FragmentTrackData>;
	dataStart: number;
	dataEnd: number;
	/** The fragment following this one in the file, once it is known. */
	nextFragment: Fragment | null;
	isKnownToBeFirstFragment: boolean;
};

/** Demuxer for ISOBMFF-based files (the `ftyp` major brand distinguishes QuickTime files). */
export class IsobmffDemuxer extends Demuxer {
	metadataReader: IsobmffReader;
	/** The track whose boxes are currently being traversed, if any. */
	currentTrack: InternalTrack | null = null;
	tracks: InternalTrack[] = [];
	/** Memoized promise of `readMetadata`, so the metadata is only read once. */
	metadataPromise: Promise<void> | null = null;
	movieTimescale = -1;
	movieDurationInTimescale = -1;
	isQuickTime = false;

	isFragmented = false;
	fragmentTrackDefaults: FragmentTrackDefaults[] = [];
	fragments: Fragment[] = [];
	currentFragment: Fragment | null = null;
	fragmentLookupMutex = new AsyncMutex();

	chunkReader:
IsobmffReader;

	/**
	 * Sets up the two readers used by this demuxer: one on the input's main reader for metadata, and a separate one
	 * for sample data.
	 */
	constructor(input: Input) {
		super(input);

		this.metadataReader = new IsobmffReader(input._mainReader);
		this.chunkReader = new IsobmffReader(new Reader(input.source, 64 * 2 ** 20)); // Max 64 MiB of stored chunks
	}

	/** Returns the largest duration among all tracks, or 0 if there are no tracks. */
	override async computeDuration() {
		const tracks = await this.getTracks();
		const trackDurations = await Promise.all(tracks.map(x => x.computeDuration()));
		return Math.max(0, ...trackDurations);
	}

	/** Reads the metadata (if not done yet) and returns the public track object of every accepted track. */
	override async getTracks() {
		await this.readMetadata();
		return this.tracks.map(track => track.inputTrack!);
	}

	/** Builds the MIME type from the container flavor, the kinds of tracks present and their codec strings. */
	override async getMimeType() {
		await this.readMetadata();

		const codecStrings = await Promise.all(this.tracks.map(x => x.inputTrack!.getCodecParameterString()));

		return buildIsobmffMimeType({
			isQuickTime: this.isQuickTime,
			hasVideo: this.tracks.some(x => x.info?.type === 'video'),
			hasAudio: this.tracks.some(x => x.info?.type === 'audio'),
			// Tracks without a codec string are left out
			codecStrings: codecStrings.filter(Boolean) as string[],
		});
	}

	/**
	 * Walks the top-level boxes until the `moov` box has been parsed and, for fragmented files, additionally tries
	 * to parse the `mfra` box at the end of the file. The work is only done once; every call returns the same
	 * promise.
	 */
	readMetadata() {
		return this.metadataPromise ??= (async () => {
			const sourceSize = await this.metadataReader.reader.source.getSize();

			while (this.metadataReader.pos < sourceSize) {
				await this.metadataReader.reader.loadRange(
					this.metadataReader.pos,
					this.metadataReader.pos + MAX_BOX_HEADER_SIZE,
				);
				const startPos = this.metadataReader.pos;
				const boxInfo = this.metadataReader.readBoxHeader();

				if (boxInfo.name === 'ftyp') {
					const majorBrand = this.metadataReader.readAscii(4);
					// Brands are always four characters; QuickTime's is 'qt' padded with two spaces. Comparing
					// against a shorter literal could never match the four characters read above.
					this.isQuickTime = majorBrand === 'qt  ';
				} else if (boxInfo.name === 'moov') {
					// Found moov, load it
					await this.metadataReader.reader.loadRange(
						this.metadataReader.pos,
						this.metadataReader.pos + boxInfo.contentSize,
					);
					this.readContiguousBoxes(boxInfo.contentSize);

					for (const track of this.tracks) {
						// Modify the edit list offset based on the previous segment durations. They are in different
						// timescales, so we first convert to seconds and then into the track timescale.
						const previousSegmentDurationsInSeconds
							= track.editListPreviousSegmentDurations / this.movieTimescale;
						track.editListOffset -= Math.round(previousSegmentDurationsInSeconds * track.timescale);
					}

					// Everything needed is inside moov, no need to look at the remaining top-level boxes
					break;
				}

				this.metadataReader.pos = startPos + boxInfo.totalSize;
			}

			if (this.isFragmented) {
				// The last 4 bytes may contain the size of the mfra box at the end of the file
				await this.metadataReader.reader.loadRange(sourceSize - 4, sourceSize);

				this.metadataReader.pos = sourceSize - 4;
				const lastWord = this.metadataReader.readU32();
				const potentialMfraPos = sourceSize - lastWord;

				// If the file doesn't end with an mfra box, the last word is arbitrary data. Only follow it when there
				// is room for a complete box header at the resulting position (the same number of bytes that is loaded
				// before every other header read); otherwise, reading the header would run past the end of the file.
				if (potentialMfraPos >= 0 && potentialMfraPos <= sourceSize - MAX_BOX_HEADER_SIZE) {
					await this.metadataReader.reader.loadRange(potentialMfraPos, sourceSize);

					this.metadataReader.pos = potentialMfraPos;
					const boxInfo = this.metadataReader.readBoxHeader();

					if (boxInfo.name === 'mfra') {
						// We found the mfra box, allowing for much better random access. Let's parse it:
						this.readContiguousBoxes(boxInfo.contentSize);
					}
				}
			}
		})();
	}

	/**
	 * Returns the sample table of the given track. On first use, the table is parsed from the track's `stbl` box
	 * and cached on the track; later calls return the cached table.
	 */
	getSampleTableForTrack(internalTrack: InternalTrack) {
		if (internalTrack.sampleTable) {
			return internalTrack.sampleTable;
		}

		const sampleTable: SampleTable = {
			sampleTimingEntries: [],
			sampleCompositionTimeOffsets: [],
			sampleSizes: [],
			keySampleIndices: null,
			chunkOffsets: [],
			sampleToChunk: [],
			presentationTimestamps: null,
			presentationTimestampIndexMap: null,
		};
		internalTrack.sampleTable = sampleTable;

		// Traverse the stbl box with this track set as the current one
		this.metadataReader.pos = internalTrack.sampleTableByteOffset;
		this.currentTrack = internalTrack;
		this.traverseBox();
		this.currentTrack = null;

		const isPcmCodec = internalTrack.info?.type === 'audio'
			&& internalTrack.info.codec
			&& (PCM_AUDIO_CODECS as readonly string[]).includes(internalTrack.info.codec);

		if (isPcmCodec && sampleTable.sampleCompositionTimeOffsets.length === 0) {
			// If the audio has PCM samples, the way the samples are defined in the sample table is somewhat
			// suboptimal: Each individual audio sample is its own sample, meaning we can have 48000 samples per second.
// Because we treat each sample as its own atomic unit that can be decoded, this would lead to a huge
			// amount of very short samples for PCM audio. So instead, we make a transformation: If the audio is in PCM,
			// we say that each chunk (that normally holds many samples) now is one big sample. We can do this because
			// the samples in the chunk are contiguous and the format is PCM, so the entire chunk as one thing still
			// encodes valid audio information.

			assert(internalTrack.info?.type === 'audio');
			const audioInfo = internalTrack.info;
			const pcmInfo = parsePcmCodec(audioInfo.codec as PcmAudioCodec);

			const timingEntries = sampleTable.sampleTimingEntries;
			// Decode timestamp at a sample index, extrapolated from the last timing entry starting at or before it
			const decodeTimestampAt = (sampleIndex: number) => {
				const entryIndex = binarySearchLessOrEqual(timingEntries, sampleIndex, x => x.startIndex);
				const entry = timingEntries[entryIndex]!;

				return entry.startDecodeTimestamp + (sampleIndex - entry.startIndex) * entry.delta;
			};

			const mergedTimingEntries: SampleTimingEntry[] = [];
			const chunkSizes: number[] = [];
			const chunkTable = sampleTable.sampleToChunk;

			for (let entryIndex = 0; entryIndex < chunkTable.length; entryIndex++) {
				const chunkEntry = chunkTable[entryIndex]!;
				const followingEntry = chunkTable[entryIndex + 1];

				// An entry covers all chunks up to the next entry's first chunk, or up to the last chunk
				const endChunkIndex = followingEntry
					? followingEntry.startChunkIndex
					: sampleTable.chunkOffsets.length;
				const chunkCount = endChunkIndex - chunkEntry.startChunkIndex;

				for (let chunk = 0; chunk < chunkCount; chunk++) {
					const firstSample = chunkEntry.startSampleIndex + chunk * chunkEntry.samplesPerChunk;
					const endSample = firstSample + chunkEntry.samplesPerChunk; // Exclusive, outside of chunk

					const chunkStartTimestamp = decodeTimestampAt(firstSample);
					const chunkDuration = decodeTimestampAt(endSample) - chunkStartTimestamp;

					const previousEntry = last(mergedTimingEntries);
					if (!previousEntry || previousEntry.delta !== chunkDuration) {
						// One sample for the entire chunk
						mergedTimingEntries.push({
							startIndex: chunkEntry.startChunkIndex + chunk,
							startDecodeTimestamp: chunkStartTimestamp,
							count: 1,
							delta: chunkDuration,
						});
					} else {
						// Same duration as the previous chunk, so the existing run simply grows by one
						previousEntry.count++;
					}

					// The chunk's size is computed from the number of PCM frames in it and the size of each frame,
					// rather than by summing the sample sizes in the sample table. This also improves compatibility
					// with some files which fail to write proper sample size values into their sample tables in the
					// PCM case.
					chunkSizes.push(chunkEntry.samplesPerChunk * pcmInfo.sampleSize * audioInfo.numberOfChannels);
				}

				// From now on, every chunk of this entry is exactly one sample
				chunkEntry.startSampleIndex = chunkEntry.startChunkIndex;
				chunkEntry.samplesPerChunk = 1;
			}

			sampleTable.sampleTimingEntries = mergedTimingEntries;
			sampleTable.sampleSizes = chunkSizes;
		}

		if (sampleTable.sampleCompositionTimeOffsets.length === 0) {
			// Without composition time offsets, the decode timestamps simply serve as presentation timestamps
			return sampleTable;
		}

		// Composition time offsets are defined, so list every sample's presentation timestamp and sort the list
		const timestamps: NonNullable<SampleTable['presentationTimestamps']> = [];
		sampleTable.presentationTimestamps = timestamps;

		// Start out with the decode timestamps, in decode order...
		for (const timing of sampleTable.sampleTimingEntries) {
			for (let k = 0; k < timing.count; k++) {
				timestamps.push({
					presentationTimestamp: timing.startDecodeTimestamp + k * timing.delta,
					sampleIndex: timing.startIndex + k,
				});
			}
		}

		// ...then shift each one by its composition offset (offsets for samples that don't exist are ignored)
		for (const composition of sampleTable.sampleCompositionTimeOffsets) {
			for (let k = 0; k < composition.count; k++) {
				const target = timestamps[composition.startIndex + k];
				if (target) {
					target.presentationTimestamp += composition.offset;
				}
			}
		}

		timestamps.sort((a, b) => a.presentationTimestamp - b.presentationTimestamp);

		// Inverse lookup: sample index (decode order) -> position in the sorted array (presentation order)
		const indexMap: number[] = Array(timestamps.length).fill(-1);
		sampleTable.presentationTimestampIndexMap = indexMap;
		timestamps.forEach((entry, sortedIndex) => {
			indexMap[entry.sampleIndex] = sortedIndex;
		});

		return sampleTable;
	}

	/**
	 * Parses the `moof` box located at the metadata reader's current position and returns the resulting fragment.
	 * For tracks in the fragment whose start timestamp isn't final yet, the preceding fragments are visited so that
	 * the track's timestamps can be offset by the end timestamp of the last preceding fragment containing the track.
	 */
	async readFragment(): Promise<Fragment> {
		const moofStart = this.metadataReader.pos;

		await this.metadataReader.reader.loadRange(moofStart, moofStart + MAX_BOX_HEADER_SIZE);

		const moofHeader = this.metadataReader.readBoxHeader();
		assert(moofHeader.name === 'moof');

		const contentStart = this.metadataReader.pos;
		const contentEnd = contentStart + moofHeader.contentSize;
		await this.metadataReader.reader.loadRange(contentStart, contentEnd);

		// Rewind to the header and parse the whole box
		this.metadataReader.pos = moofStart;
		this.traverseBox();

		// The fragment is looked up by its offset in the demuxer's fragment list
		const fragmentIndex = binarySearchExact(this.fragments, moofStart, x => x.moofOffset);
		assert(fragmentIndex !== -1);

		const fragment = this.fragments[fragmentIndex]!;
		assert(fragment.moofOffset === moofStart);

		// We have read everything in the moof box, there's no need to keep the data around anymore
		// (keep the header tho)
		this.metadataReader.reader.forgetRange(contentStart, contentEnd);

		// It may be that some tracks don't define the base decode time, i.e. when the fragment begins. This means the
		// only other option is to sum up the duration of all previous fragments.
		for (const [trackId, trackData] of fragment.trackData) {
			if (trackData.startTimestampIsFinal) {
				continue;
			}

			const internalTrack = this.tracks.find(x => x.id === trackId)!;

			this.metadataReader.pos = 0;
			let currentFragment: Fragment | null = null;
			let lastFragment: Fragment | null = null;

			const precedingIndex = binarySearchLessOrEqual(
				internalTrack.fragments,
				moofStart - 1,
				x => x.moofOffset,
			);
			if (precedingIndex !== -1) {
				// Instead of starting at the start of the file, let's start at the previous fragment instead (which
				// already has final timestamps).
				currentFragment = internalTrack.fragments[precedingIndex]!;
				lastFragment = currentFragment;
				this.metadataReader.pos = currentFragment.moofOffset + currentFragment.moofSize;
			}

			let nextFragmentIsFirstFragment = this.metadataReader.pos === 0;

			while (this.metadataReader.pos < moofStart) {
				const successor: Fragment | null | undefined = currentFragment?.nextFragment;

				if (successor) {
					// The next fragment is already linked, jump right past it
					currentFragment = successor;
					this.metadataReader.pos = successor.moofOffset + successor.moofSize;
				} else {
					// Otherwise, walk the file box by box
					await this.metadataReader.reader.loadRange(
						this.metadataReader.pos,
						this.metadataReader.pos + MAX_BOX_HEADER_SIZE,
					);
					const boxStart = this.metadataReader.pos;
					const boxHeader = this.metadataReader.readBoxHeader();

					if (boxHeader.name === 'moof') {
						const knownIndex = binarySearchExact(this.fragments, boxStart, x => x.moofOffset);

						let scannedFragment: Fragment;
						if (knownIndex !== -1) {
							// We already know this fragment
							scannedFragment = this.fragments[knownIndex]!;
						} else {
							this.metadataReader.pos = boxStart;
							scannedFragment = await this.readFragment(); // Recursive call
						}

						// Even if we already know the fragment, we might not yet know its predecessor; always do this
						if (currentFragment) currentFragment.nextFragment = scannedFragment;
						currentFragment = scannedFragment;

						if (nextFragmentIsFirstFragment) {
							scannedFragment.isKnownToBeFirstFragment = true;
							nextFragmentIsFirstFragment = false;
						}
					}

					this.metadataReader.pos = boxStart + boxHeader.totalSize;
				}

				if (currentFragment?.trackData.has(trackId)) {
					lastFragment = currentFragment;
				}
			}

			if (lastFragment) {
				const precedingTrackData = lastFragment.trackData.get(trackId)!;
				assert(precedingTrackData.startTimestampIsFinal);

				offsetFragmentTrackDataByTimestamp(trackData, precedingTrackData.endTimestamp);
			}

			trackData.startTimestampIsFinal = true;
		}

		return fragment;
	}

	/**
	 * Traverses boxes one after another, starting at the reader's current position, for as long as at least a
	 * minimal box header still fits into the `totalSize` bytes following the start position.
	 */
	readContiguousBoxes(totalSize: number) {
		const regionStart = this.metadataReader.pos;
		const maxConsumed = totalSize - MIN_BOX_HEADER_SIZE;

		while (this.metadataReader.pos - regionStart <= maxConsumed) {
			this.traverseBox();
		}
	}

	traverseBox() {
		const startPos = this.metadataReader.pos;
		const boxInfo =
this.metadataReader.readBoxHeader(); const boxEndPos = startPos + boxInfo.totalSize; switch (boxInfo.name) { case 'mdia': case 'minf': case 'dinf': case 'mfra': case 'edts': { this.readContiguousBoxes(boxInfo.contentSize); }; break; case 'mvhd': { const version = this.metadataReader.readU8(); this.metadataReader.pos += 3; // Flags if (version === 1) { this.metadataReader.pos += 8 + 8; this.movieTimescale = this.metadataReader.readU32(); this.movieDurationInTimescale = this.metadataReader.readU64(); } else { this.metadataReader.pos += 4 + 4; this.movieTimescale = this.metadataReader.readU32(); this.movieDurationInTimescale = this.metadataReader.readU32(); } }; break; case 'trak': { const track = { id: -1, demuxer: this, inputTrack: null, info: null, timescale: -1, durationInMovieTimescale: -1, durationInMediaTimescale: -1, rotation: 0, languageCode: UNDETERMINED_LANGUAGE, sampleTableByteOffset: -1, sampleTable: null, fragmentLookupTable: null, currentFragmentState: null, fragments: [], fragmentsWithKeyFrame: [], editListPreviousSegmentDurations: 0, editListOffset: 0, } satisfies InternalTrack as InternalTrack; this.currentTrack = track; this.readContiguousBoxes(boxInfo.contentSize); if (track.id !== -1 && track.timescale !== -1 && track.info !== null) { if (track.info.type === 'video' && track.info.width !== -1) { const videoTrack = track as InternalVideoTrack; track.inputTrack = new InputVideoTrack(new IsobmffVideoTrackBacking(videoTrack)); this.tracks.push(track); } else if (track.info.type === 'audio' && track.info.numberOfChannels !== -1) { const audioTrack = track as InternalAudioTrack; track.inputTrack = new InputAudioTrack(new IsobmffAudioTrackBacking(audioTrack)); this.tracks.push(track); } } this.currentTrack = null; }; break; case 'tkhd': { const track = this.currentTrack; assert(track); const version = this.metadataReader.readU8(); const flags = this.metadataReader.readU24(); const trackEnabled = (flags & 0x1) !== 0; if (!trackEnabled) { break; } // Skip 
over creation & modification time to reach the track ID if (version === 0) { this.metadataReader.pos += 8; track.id = this.metadataReader.readU32(); this.metadataReader.pos += 4; track.durationInMovieTimescale = this.metadataReader.readU32(); } else if (version === 1) { this.metadataReader.pos += 16; track.id = this.metadataReader.readU32(); this.metadataReader.pos += 4; track.durationInMovieTimescale = this.metadataReader.readU64(); } else { throw new Error(`Incorrect track header version ${version}.`); } this.metadataReader.pos += 2 * 4 + 2 + 2 + 2 + 2; const matrix: TransformationMatrix = [ this.metadataReader.readFixed_16_16(), this.metadataReader.readFixed_16_16(), this.metadataReader.readFixed_2_30(), this.metadataReader.readFixed_16_16(), this.metadataReader.readFixed_16_16(), this.metadataReader.readFixed_2_30(), this.metadataReader.readFixed_16_16(), this.metadataReader.readFixed_16_16(), this.metadataReader.readFixed_2_30(), ]; const rotation = normalizeRotation(roundToMultiple(extractRotationFromMatrix(matrix), 90)); assert(rotation === 0 || rotation === 90 || rotation === 180 || rotation === 270); track.rotation = rotation; }; break; case 'elst': { const track = this.currentTrack; assert(track); const version = this.metadataReader.readU8(); this.metadataReader.pos += 3; // Flags let relevantEntryFound = false; let previousSegmentDurations = 0; const entryCount = this.metadataReader.readU32(); for (let i = 0; i < entryCount; i++) { const segmentDuration = version === 1 ? this.metadataReader.readU64() : this.metadataReader.readU32(); const mediaTime = version === 1 ? this.metadataReader.readI64() : this.metadataReader.readI32(); const mediaRate = this.metadataReader.readFixed_16_16(); if (segmentDuration === 0) { // Don't care continue; } if (relevantEntryFound) { console.warn( 'Unsupported edit list: multiple edits are not currently supported. 
Only using first edit.', ); break; } if (mediaTime === -1) { previousSegmentDurations += segmentDuration; continue; } if (mediaRate !== 1) { console.warn('Unsupported edit list entry: media rate must be 1.'); break; } track.editListPreviousSegmentDurations = previousSegmentDurations; track.editListOffset = mediaTime; relevantEntryFound = true; } }; break; case 'mdhd': { const track = this.currentTrack; assert(track); const version = this.metadataReader.readU8(); this.metadataReader.pos += 3; // Flags if (version === 0) { this.metadataReader.pos += 8; track.timescale = this.metadataReader.readU32(); track.durationInMediaTimescale = this.metadataReader.readU32(); } else if (version === 1) { this.metadataReader.pos += 16; track.timescale = this.metadataReader.readU32(); track.durationInMediaTimescale = this.metadataReader.readU64(); } let language = this.metadataReader.readU16(); if (language > 0) { track.languageCode = ''; for (let i = 0; i < 3; i++) { track.languageCode = String.fromCharCode(0x60 + (language & 0b11111)) + track.languageCode; language >>= 5; } if (!isIso639Dash2LanguageCode(track.languageCode)) { // Sometimes the bytes are garbage track.languageCode = UNDETERMINED_LANGUAGE; } } }; break; case 'hdlr': { const track = this.currentTrack; assert(track); this.metadataReader.pos += 8; // Version + flags + pre-defined const handlerType = this.metadataReader.readAscii(4); if (handlerType === 'vide') { track.info = { type: 'video', width: -1, height: -1, codec: null, codecDescription: null, colorSpace: null, avcCodecInfo: null, hevcCodecInfo: null, vp9CodecInfo: null, av1CodecInfo: null, }; } else if (handlerType === 'soun') { track.info = { type: 'audio', numberOfChannels: -1, sampleRate: -1, codec: null, codecDescription: null, aacCodecInfo: null, }; } }; break; case 'stbl': { const track = this.currentTrack; assert(track); track.sampleTableByteOffset = startPos; this.readContiguousBoxes(boxInfo.contentSize); }; break; case 'stsd': { const track = 
this.currentTrack; assert(track); if (track.info === null || track.sampleTable) { break; } const stsdVersion = this.metadataReader.readU8(); this.metadataReader.pos += 3; // Flags const entries = this.metadataReader.readU32(); for (let i = 0; i < entries; i++) { const startPos = this.metadataReader.pos; const sampleBoxInfo = this.metadataReader.readBoxHeader(); const lowercaseBoxName = sampleBoxInfo.name.toLowerCase(); if (track.info.type === 'video') { if (lowercaseBoxName === 'avc1') { track.info.codec = 'avc'; } else if (lowercaseBoxName === 'hvc1' || lowercaseBoxName === 'hev1') { track.info.codec = 'hevc'; } else if (lowercaseBoxName === 'vp08') { track.info.codec = 'vp8'; } else if (lowercaseBoxName === 'vp09') { track.info.codec = 'vp9'; } else if (lowercaseBoxName === 'av01') { track.info.codec = 'av1'; } else { console.warn(`Unsupported video codec (sample entry type '${sampleBoxInfo.name}').`); } this.metadataReader.pos += 6 * 1 + 2 + 2 + 2 + 3 * 4; track.info.width = this.metadataReader.readU16(); track.info.height = this.metadataReader.readU16(); this.metadataReader.pos += 4 + 4 + 4 + 2 + 32 + 2 + 2; this.readContiguousBoxes((startPos + sampleBoxInfo.totalSize) - this.metadataReader.pos); } else { if (lowercaseBoxName === 'mp4a') { // We don't know the codec yet (might be AAC, might be MP3), need to read the esds box } else if (lowercaseBoxName === 'opus') { track.info.codec = 'opus'; } else if (lowercaseBoxName === 'flac') { track.info.codec = 'flac'; } else if ( lowercaseBoxName === 'twos' || lowercaseBoxName === 'sowt' || lowercaseBoxName === 'raw ' || lowercaseBoxName === 'in24' || lowercaseBoxName === 'in32' || lowercaseBoxName === 'fl32' || lowercaseBoxName === 'fl64' || lowercaseBoxName === 'lpcm' || lowercaseBoxName === 'ipcm' // ISO/IEC 23003-5 || lowercaseBoxName === 'fpcm' // " ) { // It's PCM // developer.apple.com/documentation/quicktime-file-format/sound_sample_descriptions/ } else if (lowercaseBoxName === 'ulaw') { track.info.codec = 
'ulaw'; } else if (lowercaseBoxName === 'alaw') { track.info.codec = 'alaw'; } else { console.warn(`Unsupported audio codec (sample entry type '${sampleBoxInfo.name}').`); } this.metadataReader.pos += 6 * 1 + 2; const version = this.metadataReader.readU16(); this.metadataReader.pos += 3 * 2; let channelCount = this.metadataReader.readU16(); let sampleSize = this.metadataReader.readU16(); this.metadataReader.pos += 2 * 2; // Can't use fixed16_16 as that's signed let sampleRate = this.metadataReader.readU32() / 0x10000; if (stsdVersion === 0 && version > 0) { // Additional QuickTime fields if (version === 1) { this.metadataReader.pos += 4; sampleSize = 8 * this.metadataReader.readU32(); this.metadataReader.pos += 2 * 4; } else if (version === 2) { this.metadataReader.pos += 4; sampleRate = this.metadataReader.readF64(); channelCount = this.metadataReader.readU32(); this.metadataReader.pos += 4; // Always 0x7f000000 sampleSize = this.metadataReader.readU32(); const flags = this.metadataReader.readU32(); this.metadataReader.pos += 2 * 4; if (lowercaseBoxName === 'lpcm') { const bytesPerSample = (sampleSize + 7) >> 3; const isFloat = Boolean(flags & 1); const isBigEndian = Boolean(flags & 2); const sFlags = flags & 4 ? -1 : 0; // I guess it means "signed flags" or something? if (sampleSize > 0 && sampleSize <= 64) { if (isFloat) { if (sampleSize === 32) { track.info.codec = isBigEndian ? 'pcm-f32be' : 'pcm-f32'; } } else { if (sFlags & (1 << (bytesPerSample - 1))) { if (bytesPerSample === 1) { track.info.codec = 'pcm-s8'; } else if (bytesPerSample === 2) { track.info.codec = isBigEndian ? 'pcm-s16be' : 'pcm-s16'; } else if (bytesPerSample === 3) { track.info.codec = isBigEndian ? 'pcm-s24be' : 'pcm-s24'; } else if (bytesPerSample === 4) { track.info.codec = isBigEndian ? 
'pcm-s32be' : 'pcm-s32'; } } else { if (bytesPerSample === 1) { track.info.codec = 'pcm-u8'; } } } } if (track.info.codec === null) { console.warn('Unsupported PCM format.'); } } } } track.info.numberOfChannels = channelCount; track.info.sampleRate = sampleRate; // PCM codec assignments if (lowercaseBoxName === 'twos') { if (sampleSize === 8) { track.info.codec = 'pcm-s8'; } else if (sampleSize === 16) { track.info.codec = 'pcm-s16be'; } else { console.warn(`Unsupported sample size ${sampleSize} for codec 'twos'.`); track.info.codec = null; } } else if (lowercaseBoxName === 'sowt') { if (sampleSize === 8) { track.info.codec = 'pcm-s8'; } else if (sampleSize === 16) { track.info.codec = 'pcm-s16'; } else { console.warn(`Unsupported sample size ${sampleSize} for codec 'sowt'.`); track.info.codec = null; } } else if (lowercaseBoxName === 'raw ') { track.info.codec = 'pcm-u8'; } else if (lowercaseBoxName === 'in24') { track.info.codec = 'pcm-s24be'; } else if (lowercaseBoxName === 'in32') { track.info.codec = 'pcm-s32be'; } else if (lowercaseBoxName === 'fl32') { track.info.codec = 'pcm-f32be'; } else if (lowercaseBoxName === 'fl64') { track.info.codec = 'pcm-f64be'; } else if (lowercaseBoxName === 'ipcm') { track.info.codec = 'pcm-s16be'; // Placeholder, will be adjusted by the pcmC box } else if (lowercaseBoxName === 'fpcm') { track.info.codec = 'pcm-f32be'; // Placeholder, will be adjusted by the pcmC box } this.readContiguousBoxes((startPos + sampleBoxInfo.totalSize) - this.metadataReader.pos); } } }; break; case 'avcC': { const track = this.currentTrack; assert(track && track.info); track.info.codecDescription = this.metadataReader.readBytes(boxInfo.contentSize); }; break; case 'hvcC': { const track = this.currentTrack; assert(track && track.info); track.info.codecDescription = this.metadataReader.readBytes(boxInfo.contentSize); }; break; case 'vpcC': { const track = this.currentTrack; assert(track && track.info?.type === 'video'); this.metadataReader.pos += 4; // 
Version + flags const profile = this.metadataReader.readU8(); const level = this.metadataReader.readU8(); const thirdByte = this.metadataReader.readU8(); const bitDepth = thirdByte >> 4; const chromaSubsampling = (thirdByte >> 1) & 0b111; const videoFullRangeFlag = thirdByte & 1; const colourPrimaries = this.metadataReader.readU8(); const transferCharacteristics = this.metadataReader.readU8(); const matrixCoefficients = this.metadataReader.readU8(); track.info.vp9CodecInfo = { profile, level, bitDepth, chromaSubsampling, videoFullRangeFlag, colourPrimaries, transferCharacteristics, matrixCoefficients, }; }; break; case 'av1C': { const track = this.currentTrack; assert(track && track.info?.type === 'video'); this.metadataReader.pos += 1; // Marker + version const secondByte = this.metadataReader.readU8(); const profile = secondByte >> 5; const level = secondByte & 0b11111; const thirdByte = this.metadataReader.readU8(); const tier = thirdByte >> 7; const highBitDepth = (thirdByte >> 6) & 1; const twelveBit = (thirdByte >> 5) & 1; const monochrome = (thirdByte >> 4) & 1; const chromaSubsamplingX = (thirdByte >> 3) & 1; const chromaSubsamplingY = (thirdByte >> 2) & 1; const chromaSamplePosition = thirdByte & 0b11; // Logic from https://aomediacodec.github.io/av1-spec/av1-spec.pdf const bitDepth = profile == 2 && highBitDepth ? (twelveBit ? 12 : 10) : (highBitDepth ? 
10 : 8); track.info.av1CodecInfo = { profile, level, tier, bitDepth, monochrome, chromaSubsamplingX, chromaSubsamplingY, chromaSamplePosition, }; }; break; case 'colr': { const track = this.currentTrack; assert(track && track.info?.type === 'video'); const colourType = this.metadataReader.readAscii(4); if (colourType !== 'nclx') { break; } const colourPrimaries = this.metadataReader.readU16(); const transferCharacteristics = this.metadataReader.readU16(); const matrixCoefficients = this.metadataReader.readU16(); const fullRangeFlag = Boolean(this.metadataReader.readU8() & 0x80); track.info.colorSpace = { primaries: COLOR_PRIMARIES_MAP_INVERSE[colourPrimaries], transfer: TRANSFER_CHARACTERISTICS_MAP_INVERSE[transferCharacteristics], matrix: MATRIX_COEFFICIENTS_MAP_INVERSE[matrixCoefficients], fullRange: fullRangeFlag, } as VideoColorSpaceInit; }; break; case 'wave': { this.readContiguousBoxes(boxInfo.contentSize); }; break; case 'esds': { const track = this.currentTrack; assert(track && track.info?.type === 'audio'); this.metadataReader.pos += 4; // Version + flags const tag = this.metadataReader.readU8(); assert(tag === 0x03); // ES Descriptor this.metadataReader.readIsomVariableInteger(); // Length this.metadataReader.pos += 2; // ES ID const mixed = this.metadataReader.readU8(); const streamDependenceFlag = (mixed & 0x80) !== 0; const urlFlag = (mixed & 0x40) !== 0; const ocrStreamFlag = (mixed & 0x20) !== 0; if (streamDependenceFlag) { this.metadataReader.pos += 2; } if (urlFlag) { const urlLength = this.metadataReader.readU8(); this.metadataReader.pos += urlLength; } if (ocrStreamFlag) { this.metadataReader.pos += 2; } const decoderConfigTag = this.metadataReader.readU8(); assert(decoderConfigTag === 0x04); // DecoderConfigDescriptor const decoderConfigDescriptorLength = this.metadataReader.readIsomVariableInteger(); // Length const payloadStart = this.metadataReader.pos; const objectTypeIndication = this.metadataReader.readU8(); if (objectTypeIndication === 
0x40 || objectTypeIndication === 0x67) { track.info.codec = 'aac'; track.info.aacCodecInfo = { isMpeg2: objectTypeIndication === 0x67 }; } else if (objectTypeIndication === 0x69 || objectTypeIndication === 0x6b) { track.info.codec = 'mp3'; } else if (objectTypeIndication === 0xdd) { track.info.codec = 'vorbis'; // "nonstandard, gpac uses it" - FFmpeg } else { console.warn( `Unsupported audio codec (objectTypeIndication ${objectTypeIndication}) - discarding track.`, ); } this.metadataReader.pos += 1 + 3 + 4 + 4; if (decoderConfigDescriptorLength > this.metadataReader.pos - payloadStart) { // There's a DecoderSpecificInfo at the end, let's read it const decoderSpecificInfoTag = this.metadataReader.readU8(); assert(decoderSpecificInfoTag === 0x05); // DecoderSpecificInfo const decoderSpecificInfoLength = this.metadataReader.readIsomVariableInteger(); track.info.codecDescription = this.metadataReader.readBytes(decoderSpecificInfoLength); if (track.info.codec === 'aac') { // Let's try to deduce more accurate values directly from the AudioSpecificConfig: const audioSpecificConfig = parseAacAudioSpecificConfig(track.info.codecDescription); if (audioSpecificConfig.numberOfChannels !== null) { track.info.numberOfChannels = audioSpecificConfig.numberOfChannels; } if (audioSpecificConfig.sampleRate !== null) { track.info.sampleRate = audioSpecificConfig.sampleRate; } } } }; break; case 'enda': { const track = this.currentTrack; assert(track && track.info?.type === 'audio'); const littleEndian = this.metadataReader.readU16() & 0xff; // 0xff is from FFmpeg if (littleEndian) { if (track.info.codec === 'pcm-s16be') { track.info.codec = 'pcm-s16'; } else if (track.info.codec === 'pcm-s24be') { track.info.codec = 'pcm-s24'; } else if (track.info.codec === 'pcm-s32be') { track.info.codec = 'pcm-s32'; } else if (track.info.codec === 'pcm-f32be') { track.info.codec = 'pcm-f32'; } else if (track.info.codec === 'pcm-f64be') { track.info.codec = 'pcm-f64'; } } }; break; case 'pcmC': { 
const track = this.currentTrack; assert(track && track.info?.type === 'audio'); this.metadataReader.pos += 1 + 3; // Version + flags // ISO/IEC 23003-5 const formatFlags = this.metadataReader.readU8(); const isLittleEndian = Boolean(formatFlags & 0x01); const pcmSampleSize = this.metadataReader.readU8(); if (track.info.codec === 'pcm-s16be') { // ipcm if (isLittleEndian) { if (pcmSampleSize === 16) { track.info.codec = 'pcm-s16'; } else if (pcmSampleSize === 24) { track.info.codec = 'pcm-s24'; } else if (pcmSampleSize === 32) { track.info.codec = 'pcm-s32'; } else { console.warn(`Invalid ipcm sample size ${pcmSampleSize}.`); track.info.codec = null; } } else { if (pcmSampleSize === 16) { track.info.codec = 'pcm-s16be'; } else if (pcmSampleSize === 24) { track.info.codec = 'pcm-s24be'; } else if (pcmSampleSize === 32) { track.info.codec = 'pcm-s32be'; } else { console.warn(`Invalid ipcm sample size ${pcmSampleSize}.`); track.info.codec = null; } } } else if (track.info.codec === 'pcm-f32be') { // fpcm if (isLittleEndian) { if (pcmSampleSize === 32) { track.info.codec = 'pcm-f32'; } else if (pcmSampleSize === 64) { track.info.codec = 'pcm-f64'; } else { console.warn(`Invalid fpcm sample size ${pcmSampleSize}.`); track.info.codec = null; } } else { if (pcmSampleSize === 32) { track.info.codec = 'pcm-f32be'; } else if (pcmSampleSize === 64) { track.info.codec = 'pcm-f64be'; } else { console.warn(`Invalid fpcm sample size ${pcmSampleSize}.`); track.info.codec = null; } } } break; }; case 'dOps': { // Used for Opus audio const track = this.currentTrack; assert(track && track.info?.type === 'audio'); this.metadataReader.pos += 1; // Version // https://www.opus-codec.org/docs/opus_in_isobmff.html const outputChannelCount = this.metadataReader.readU8(); const preSkip = this.metadataReader.readU16(); const inputSampleRate = this.metadataReader.readU32(); const outputGain = this.metadataReader.readI16(); const channelMappingFamily = this.metadataReader.readU8(); let 
channelMappingTable: Uint8Array; if (channelMappingFamily !== 0) { channelMappingTable = this.metadataReader.readBytes(2 + outputChannelCount); } else { channelMappingTable = new Uint8Array(0); } // https://datatracker.ietf.org/doc/html/draft-ietf-codec-oggopus-06 const description = new Uint8Array(8 + 1 + 1 + 2 + 4 + 2 + 1 + channelMappingTable.byteLength); const view = new DataView(description.buffer); view.setUint32(0, 0x4f707573, false); // 'Opus' view.setUint32(4, 0x48656164, false); // 'Head' view.setUint8(8, 1); // Version view.setUint8(9, outputChannelCount); view.setUint16(10, preSkip, true); view.setUint32(12, inputSampleRate, true); view.setInt16(16, outputGain, true); view.setUint8(18, channelMappingFamily); description.set(channelMappingTable, 19); track.info.codecDescription = description; track.info.numberOfChannels = outputChannelCount; track.info.sampleRate = inputSampleRate; }; break; case 'dfLa': { // Used for FLAC audio const track = this.currentTrack; assert(track && track.info?.type === 'audio'); this.metadataReader.pos += 4; // Version + flags // https://datatracker.ietf.org/doc/rfc9639/ const BLOCK_TYPE_MASK = 0x7f; const LAST_METADATA_BLOCK_FLAG_MASK = 0x80; const startPos = this.metadataReader.pos; while (this.metadataReader.pos < boxEndPos) { const flagAndType = this.metadataReader.readU8(); const metadataBlockLength = this.metadataReader.readU24(); const type = flagAndType & BLOCK_TYPE_MASK; // It's a STREAMINFO block; let's extract the actual sample rate and channel count if (type === 0) { this.metadataReader.pos += 10; // Extract sample rate const word = this.metadataReader.readU32(); const sampleRate = word >>> 12; const numberOfChannels = ((word >> 9) & 0b111) + 1; track.info.sampleRate = sampleRate; track.info.numberOfChannels = numberOfChannels; this.metadataReader.pos += 20; } else { // Simply skip ahead to the next block this.metadataReader.pos += metadataBlockLength; } if (flagAndType & LAST_METADATA_BLOCK_FLAG_MASK) { break; } 
} const endPos = this.metadataReader.pos; this.metadataReader.pos = startPos; const bytes = this.metadataReader.readBytes(endPos - startPos); const description = new Uint8Array(4 + bytes.byteLength); const view = new DataView(description.buffer); view.setUint32(0, 0x664c6143, false); // 'fLaC' description.set(bytes, 4); // Set the codec description to be 'fLaC' + all metadata blocks track.info.codecDescription = description; }; break; case 'stts': { const track = this.currentTrack; assert(track); if (!track.sampleTable) { break; } this.metadataReader.pos += 4; // Version + flags const entryCount = this.metadataReader.readU32(); let currentIndex = 0; let currentTimestamp = 0; for (let i = 0; i < entryCount; i++) { const sampleCount = this.metadataReader.readU32(); const sampleDelta = this.metadataReader.readU32(); track.sampleTable.sampleTimingEntries.push({ startIndex: currentIndex, startDecodeTimestamp: currentTimestamp, count: sampleCount, delta: sampleDelta, }); currentIndex += sampleCount; currentTimestamp += sampleCount * sampleDelta; } }; break; case 'ctts': { const track = this.currentTrack; assert(track); if (!track.sampleTable) { break; } this.metadataReader.pos += 1 + 3; // Version + flags const entryCount = this.metadataReader.readU32(); let sampleIndex = 0; for (let i = 0; i < entryCount; i++) { const sampleCount = this.metadataReader.readU32(); const sampleOffset = this.metadataReader.readI32(); track.sampleTable.sampleCompositionTimeOffsets.push({ startIndex: sampleIndex, count: sampleCount, offset: sampleOffset, }); sampleIndex += sampleCount; } }; break; case 'stsz': { const track = this.currentTrack; assert(track); if (!track.sampleTable) { break; } this.metadataReader.pos += 4; // Version + flags const sampleSize = this.metadataReader.readU32(); const sampleCount = this.metadataReader.readU32(); if (sampleSize === 0) { for (let i = 0; i < sampleCount; i++) { const sampleSize = this.metadataReader.readU32(); 
track.sampleTable.sampleSizes.push(sampleSize); } } else { track.sampleTable.sampleSizes.push(sampleSize); } }; break; case 'stz2': { const track = this.currentTrack; assert(track); if (!track.sampleTable) { break; } this.metadataReader.pos += 4; // Version + flags this.metadataReader.pos += 3; // Reserved const fieldSize = this.metadataReader.readU8(); // in bits const sampleCount = this.metadataReader.readU32(); const bytes = this.metadataReader.readBytes(Math.ceil(sampleCount * fieldSize / 8)); const bitstream = new Bitstream(bytes); for (let i = 0; i < sampleCount; i++) { const sampleSize = bitstream.readBits(fieldSize); track.sampleTable.sampleSizes.push(sampleSize); } }; break; case 'stss': { const track = this.currentTrack; assert(track); if (!track.sampleTable) { break; } this.metadataReader.pos += 4; // Version + flags track.sampleTable.keySampleIndices = []; const entryCount = this.metadataReader.readU32(); for (let i = 0; i < entryCount; i++) { const sampleIndex = this.metadataReader.readU32() - 1; // Convert to 0-indexed track.sampleTable.keySampleIndices.push(sampleIndex); } if (track.sampleTable.keySampleIndices[0] !== 0) { // Some files don't mark the first sample a key sample, which is basically almost always incorrect. 
// Here, we correct for that mistake: track.sampleTable.keySampleIndices.unshift(0); } }; break; case 'stsc': { const track = this.currentTrack; assert(track); if (!track.sampleTable) { break; } this.metadataReader.pos += 4; const entryCount = this.metadataReader.readU32(); for (let i = 0; i < entryCount; i++) { const startChunkIndex = this.metadataReader.readU32() - 1; // Convert to 0-indexed const samplesPerChunk = this.metadataReader.readU32(); const sampleDescriptionIndex = this.metadataReader.readU32(); track.sampleTable.sampleToChunk.push({ startSampleIndex: -1, startChunkIndex, samplesPerChunk, sampleDescriptionIndex, }); } let startSampleIndex = 0; for (let i = 0; i < track.sampleTable.sampleToChunk.length; i++) { track.sampleTable.sampleToChunk[i]!.startSampleIndex = startSampleIndex; if (i < track.sampleTable.sampleToChunk.length - 1) { const nextChunk = track.sampleTable.sampleToChunk[i + 1]!; const chunkCount = nextChunk.startChunkIndex - track.sampleTable.sampleToChunk[i]!.startChunkIndex; startSampleIndex += chunkCount * track.sampleTable.sampleToChunk[i]!.samplesPerChunk; } } }; break; case 'stco': { const track = this.currentTrack; assert(track); if (!track.sampleTable) { break; } this.metadataReader.pos += 4; // Version + flags const entryCount = this.metadataReader.readU32(); for (let i = 0; i < entryCount; i++) { const chunkOffset = this.metadataReader.readU32(); track.sampleTable.chunkOffsets.push(chunkOffset); } }; break; case 'co64': { const track = this.currentTrack; assert(track); if (!track.sampleTable) { break; } this.metadataReader.pos += 4; // Version + flags const entryCount = this.metadataReader.readU32(); for (let i = 0; i < entryCount; i++) { const chunkOffset = this.metadataReader.readU64(); track.sampleTable.chunkOffsets.push(chunkOffset); } }; break; case 'mvex': { this.isFragmented = true; this.readContiguousBoxes(boxInfo.contentSize); }; break; case 'mehd': { const version = this.metadataReader.readU8(); this.metadataReader.pos 
+= 3; // Flags const fragmentDuration = version === 1 ? this.metadataReader.readU64() : this.metadataReader.readU32(); this.movieDurationInTimescale = fragmentDuration; }; break; case 'trex': { this.metadataReader.pos += 4; // Version + flags const trackId = this.metadataReader.readU32(); const defaultSampleDescriptionIndex = this.metadataReader.readU32(); const defaultSampleDuration = this.metadataReader.readU32(); const defaultSampleSize = this.metadataReader.readU32(); const defaultSampleFlags = this.metadataReader.readU32(); // We store these separately rather than in the tracks since the tracks may not exist yet this.fragmentTrackDefaults.push({ trackId, defaultSampleDescriptionIndex, defaultSampleDuration, defaultSampleSize, defaultSampleFlags, }); }; break; case 'tfra': { const version = this.metadataReader.readU8(); this.metadataReader.pos += 3; // Flags const tra