UNPKG

mediabunny

Version:

Pure TypeScript media toolkit for reading, writing, and converting media files, directly in the browser.

1,622 lines (1,398 loc) 118 kB
/*! * Copyright (c) 2026-present, Vanilagy and contributors * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ import { TrackType } from '../output'; import { parseAacAudioSpecificConfig } from '../../shared/aac-misc'; import { AacCodecInfo, AudioCodec, extractAudioCodecString, extractVideoCodecString, MediaCodec, OPUS_SAMPLE_RATE, parsePcmCodec, PCM_AUDIO_CODECS, PcmAudioCodec, VideoCodec, } from '../codec'; import { Av1CodecInfo, AvcDecoderConfigurationRecord, extractAv1CodecInfoFromPacket, extractVp9CodecInfoFromPacket, FlacBlockType, HevcDecoderConfigurationRecord, Vp9CodecInfo, parseEac3Config, getEac3SampleRate, getEac3ChannelCount, AC3_ACMOD_CHANNEL_COUNTS, } from '../codec-data'; import { Demuxer } from '../demuxer'; import { Input } from '../input'; import { InputAudioTrackBacking, InputTrackBacking, InputVideoTrackBacking, } from '../input-track'; import { PacketRetrievalOptions } from '../media-sink'; import { assert, binarySearchExact, binarySearchLessOrEqual, bytesToHexString, COLOR_PRIMARIES_MAP_INVERSE, findLastIndex, isIso639Dash2LanguageCode, last, MATRIX_COEFFICIENTS_MAP_INVERSE, normalizeRotation, roundToMultiple, Rotation, textDecoder, TransformationMatrix, TRANSFER_CHARACTERISTICS_MAP_INVERSE, UNDETERMINED_LANGUAGE, toDataView, roundIfAlmostInteger, hexStringToBytes, HEX_STRING_REGEX, } from '../misc'; import { EncodedPacket, PLACEHOLDER_DATA } from '../packet'; import { buildIsobmffMimeType, parsePsshBoxContents, psshBoxesAreEqual, PsshBox } from './isobmff-misc'; import { MAX_BOX_HEADER_SIZE, MIN_BOX_HEADER_SIZE, readBoxHeader, readDataBox, readFixed_16_16, readFixed_2_30, readIsomVariableInteger, readMetadataStringShort, } from './isobmff-reader'; import { FileSlice, readBytes, readF64Be, readI16Be, readI32Be, readI64Be, Reader, readU16Be, readU24Be, readU32Be, readU64Be, readU8, readAscii, 
} from '../reader'; import { DEFAULT_TRACK_DISPOSITION, MetadataTags, RichImageData, TrackDisposition } from '../metadata'; import { AC3_SAMPLE_RATES } from '../../shared/ac3-misc'; import { Bitstream } from '../../shared/bitstream'; import { Aes128CbcContext } from '../aes'; type InternalTrack = { id: number; demuxer: IsobmffDemuxer; trackBacking: InputTrackBacking | null; disposition: TrackDisposition; timescale: number; durationInMovieTimescale: number; durationInMediaTimescale: number; rotation: Rotation; internalCodecId: string | null; name: string | null; languageCode: string; sampleTableByteOffset: number | null; // null when the track's sample table is another file (ominous ik 👀) sampleTable: SampleTable | null; fragmentLookupTable: FragmentLookupTableEntry[]; currentFragmentState: FragmentTrackState | null; /** * List of all encountered fragment offsets alongside their timestamps. This list never gets truncated, but memory * consumption should be negligible. */ fragmentPositionCache: { moofOffset: number; startTimestamp: number; endTimestamp: number; }[]; /** The segment durations of all edit list entries leading up to the main one (from which the offset is taken.) */ editListPreviousSegmentDurations: number; /** The media time offset of the main edit list entry (with media time !== -1) */ editListOffset: number; /** Set when the track's samples are encrypted using a supported scheme (cenc/cens/cbcs), parsed from sinf/tenc. */ encryptionInfo: TrackEncryptionInfo | null; /** For non-fragmented encrypted tracks: parsed saiz+saio from stbl; aux info is fetched lazily on first use. 
*/ encryptionAuxInfo: SampleEncryptionAuxInfo | null; frmaCodecString: string | null; } & ({ info: null; } | { info: { type: 'video'; width: number; height: number; squarePixelWidth: number; squarePixelHeight: number; codec: VideoCodec | null; codecDescription: Uint8Array | null; colorSpace: VideoColorSpaceInit | null; avcType: 1 | 3 | null; avcCodecInfo: AvcDecoderConfigurationRecord | null; hevcCodecInfo: HevcDecoderConfigurationRecord | null; vp9CodecInfo: Vp9CodecInfo | null; av1CodecInfo: Av1CodecInfo | null; }; } | { info: { type: 'audio'; numberOfChannels: number; sampleRate: number; codec: AudioCodec | null; codecDescription: Uint8Array | null; aacCodecInfo: AacCodecInfo | null; pcmLittleEndian: boolean; pcmSampleSize: number | null; }; }); type InternalVideoTrack = InternalTrack & { info: { type: 'video' } }; type InternalAudioTrack = InternalTrack & { info: { type: 'audio' } }; type SampleTable = { sampleTimingEntries: SampleTimingEntry[]; sampleCompositionTimeOffsets: SampleCompositionTimeOffsetEntry[]; sampleSizes: number[]; keySampleIndices: number[] | null; // Samples that are keyframes chunkOffsets: number[]; sampleToChunk: SampleToChunkEntry[]; presentationTimestamps: { presentationTimestamp: number; sampleIndex: number; }[] | null; /** * Provides a fast map from sample index to index in the sorted presentation timestamps array - so, a fast map from * decode order to presentation order. 
*/ presentationTimestampIndexMap: number[] | null; }; type SampleTimingEntry = { startIndex: number; startDecodeTimestamp: number; count: number; delta: number; }; type SampleCompositionTimeOffsetEntry = { startIndex: number; count: number; offset: number; }; type SampleToChunkEntry = { startSampleIndex: number; startChunkIndex: number; samplesPerChunk: number; sampleDescriptionIndex: number; }; type FragmentTrackDefaults = { trackId: number; defaultSampleDescriptionIndex: number; defaultSampleDuration: number; defaultSampleSize: number; defaultSampleFlags: number; }; type FragmentLookupTableEntry = { timestamp: number; moofOffset: number; }; type FragmentTrackState = { baseDataOffset: number; sampleDescriptionIndex: number | null; defaultSampleDuration: number | null; defaultSampleSize: number | null; defaultSampleFlags: number | null; startTimestamp: number | null; encryptionAuxInfo: SampleEncryptionAuxInfo | null; }; type FragmentTrackData = { track: InternalTrack; // Kept as state for the presence of multiple trun boxes currentTimestamp: number; currentOffset: number; startTimestamp: number; endTimestamp: number; firstKeyFrameTimestamp: number | null; samples: FragmentTrackSample[]; presentationTimestamps: { presentationTimestamp: number; sampleIndex: number; }[]; startTimestampIsFinal: boolean; encryptionAuxInfo: SampleEncryptionAuxInfo | null; }; type FragmentTrackSample = { presentationTimestamp: number; duration: number; byteOffset: number; byteSize: number; isKeyFrame: boolean; encryption: SampleEncryptionInfo | null; }; type Fragment = { moofOffset: number; moofSize: number; implicitBaseDataOffset: number; trackData: Map<InternalTrack['id'], FragmentTrackData>; psshBoxes: PsshBox[]; }; type TrackEncryptionInfo = { scheme: 'cenc' | 'cens' | 'cbcs'; defaultKid: string | null; defaultIsProtected: boolean | null; defaultPerSampleIvSize: number | null; defaultConstantIv: Uint8Array | null; defaultCryptByteBlock: number | null; defaultSkipByteBlock: number | 
null; }; type SampleEncryptionInfo = { iv: Uint8Array; subsamples: { clearLen: number; protectedLen: number; }[] | null; }; /** * Holds parsed saiz+saio state. The encryption info itself lives at a file offset and is fetched lazily. * For fragmented files this state is per-traf; for non-fragmented files it's per-track (on stbl). */ type SampleEncryptionAuxInfo = { defaultSampleInfoSize: number; sampleSizes: Uint8Array | null; sampleCount: number; offset: number | null; // Absolute file offset of the first sample's aux info resolved: SampleEncryptionInfo[] | null; }; export class IsobmffDemuxer extends Demuxer { reader: Reader; moovSlice: FileSlice | null = null; currentTrack: InternalTrack | null = null; tracks: InternalTrack[] = []; metadataPromise: Promise<void> | null = null; movieTimescale = -1; movieDurationInTimescale = -1; isQuickTime = false; metadataTags: MetadataTags = {}; currentMetadataKeys: Map<number, string> | null = null; isFragmented = false; fragmentTrackDefaults: FragmentTrackDefaults[] = []; psshBoxes: PsshBox[] = []; currentFragment: Fragment | null = null; /** * Caches the last fragment that was read. Based on the assumption that there will be multiple reads to the * same fragment in quick succession. */ lastReadFragment: Fragment | null = null; decryptionKeyCache = new Map<string, Promise<Uint8Array>>(); constructor(input: Input) { super(input); this.reader = input._reader; } override async getTrackBackings() { await this.readMetadata(); return this.tracks.map(track => track.trackBacking!); } override async getMimeType() { await this.readMetadata(); const backings = await this.getTrackBackings(); const codecStrings = await Promise.all(backings.map( x => x.getDecoderConfig().then(c => c?.codec ?? 
/** Returns the container-level metadata tags, reading the file's metadata first if that hasn't happened yet. */
async getMetadataTags() {
    await this.readMetadata();
    return this.metadataTags;
}

/**
 * Reads the top-level structure of the file. The work is performed only once: the resulting promise is stored in
 * `metadataPromise` and handed out to every subsequent caller.
 *
 * Top-level boxes are walked from the start of the file until one of the following is found:
 * - `moov`: the box is loaded and parsed via `readContiguousBoxes`, and each track's edit list offset is adjusted.
 * - `moof` before any `moov`: the file is treated as a segment, and track information is copied from the
 *   demuxer of `input._initInput`.
 *
 * For fragmented files with data following the `moov` box, the end of the file is additionally probed for an
 * `mfra` box.
 *
 * @throws If a `moof` box is encountered without a `moov` box and no init input was provided, or if the init
 * input is not handled by an `IsobmffDemuxer`.
 */
readMetadata() {
    return this.metadataPromise ??= (async () => {
        let currentPos = 0;
        let lookForMfraBox = false;

        while (true) {
            let slice = this.reader.requestSliceRange(currentPos, MIN_BOX_HEADER_SIZE, MAX_BOX_HEADER_SIZE);
            if (slice instanceof Promise) slice = await slice;
            if (!slice) break;

            const startPos = currentPos;
            const boxInfo = readBoxHeader(slice);
            if (!boxInfo) {
                break;
            }

            if (boxInfo.name === 'ftyp' || boxInfo.name === 'styp') {
                // The major brand is a four-character code; QuickTime's brand is "qt" padded with TWO spaces.
                // Comparing against a three-character literal could never match the four characters read here.
                const majorBrand = readAscii(slice, 4);
                this.isQuickTime = majorBrand === 'qt  ';
            } else if (boxInfo.name === 'moov') {
                // Found moov, load it
                let moovSlice = this.reader.requestSlice(slice.filePos, boxInfo.contentSize);
                if (moovSlice instanceof Promise) moovSlice = await moovSlice;
                if (!moovSlice) break;

                this.moovSlice = moovSlice;
                this.readContiguousBoxes(this.moovSlice);

                for (const track of this.tracks) {
                    // Modify the edit list offset based on the previous segment durations. They are in different
                    // timescales, so we first convert to seconds and then into the track timescale.
                    const previousSegmentDurationsInSeconds
                        = track.editListPreviousSegmentDurations / this.movieTimescale;
                    track.editListOffset -= Math.round(previousSegmentDurationsInSeconds * track.timescale);
                }

                // Only worth probing for mfra if there's more data after the moov box
                lookForMfraBox = this.isFragmented
                    && this.reader.fileSize !== null
                    && this.reader.fileSize > startPos + boxInfo.totalSize;

                break;
            } else if (boxInfo.name === 'moof') {
                if (!this.input._initInput) {
                    throw new Error(
                        '"moof" box encountered with no "moov" box present; this file is likely a Segment as'
                        + ' described in ISO/IEC 14496-12 Section 8.16. A separate init file that contains a "moov"'
                        + ' box is required to read this file, please provide it using InputOptions.initInput.',
                    );
                }

                const initDemuxer = (await this.input._initInput._getDemuxer()) as IsobmffDemuxer;
                if (initDemuxer.constructor !== IsobmffDemuxer) {
                    throw new Error('Init input must match the input\'s format.');
                }

                await initDemuxer.readMetadata();

                // Adopt the movie-level state of the init file
                this.movieTimescale = initDemuxer.movieTimescale;
                this.movieDurationInTimescale = initDemuxer.movieDurationInTimescale;
                this.metadataTags = initDemuxer.metadataTags;
                this.isFragmented = true;
                this.fragmentTrackDefaults = initDemuxer.fragmentTrackDefaults;
                this.psshBoxes = initDemuxer.psshBoxes;

                // Create tracks from the init input's tracks
                for (const foreignTrack of initDemuxer.tracks) {
                    const track: InternalTrack = {
                        id: foreignTrack.id,
                        demuxer: this,
                        trackBacking: null,
                        disposition: foreignTrack.disposition,
                        timescale: foreignTrack.timescale,
                        durationInMediaTimescale: foreignTrack.durationInMediaTimescale,
                        durationInMovieTimescale: foreignTrack.durationInMovieTimescale,
                        rotation: foreignTrack.rotation,
                        internalCodecId: foreignTrack.internalCodecId,
                        name: foreignTrack.name,
                        languageCode: foreignTrack.languageCode,
                        // The sample table (if any) lives in the init file, not in this one
                        sampleTableByteOffset: null,
                        sampleTable: null,
                        fragmentLookupTable: [],
                        currentFragmentState: null,
                        fragmentPositionCache: [],
                        editListPreviousSegmentDurations: foreignTrack.editListPreviousSegmentDurations,
                        editListOffset: foreignTrack.editListOffset,
                        encryptionInfo: foreignTrack.encryptionInfo,
                        encryptionAuxInfo: null,
                        frmaCodecString: null,
                        info: foreignTrack.info,
                    };

                    if (foreignTrack.trackBacking) {
                        assert(track.info);

                        if (track.info.type === 'video' && track.info.width !== -1) {
                            const videoTrack = track as InternalVideoTrack;
                            track.trackBacking = new IsobmffVideoTrackBacking(videoTrack);
                            this.tracks.push(track);
                        } else if (track.info.type === 'audio' && track.info.numberOfChannels !== -1) {
                            const audioTrack = track as InternalAudioTrack;
                            track.trackBacking = new IsobmffAudioTrackBacking(audioTrack);
                            this.tracks.push(track);
                        }
                    } else {
                        // The track didn't have enough info to warrant a backing
                    }
                }

                lookForMfraBox = false; // No point in doing it for segment files

                break;
            }

            currentPos = startPos + boxInfo.totalSize;
        }

        if (lookForMfraBox) {
            assert(this.reader.fileSize !== null);

            // The last 4 bytes may contain the size of the mfra box at the end of the file
            let lastWordSlice = this.reader.requestSlice(this.reader.fileSize - 4, 4);
            if (lastWordSlice instanceof Promise) lastWordSlice = await lastWordSlice;
            assert(lastWordSlice);

            const lastWord = readU32Be(lastWordSlice);
            const potentialMfraPos = this.reader.fileSize - lastWord;

            // Only follow the pointer if it lands inside the file with enough room for a box header
            if (potentialMfraPos >= 0 && potentialMfraPos <= this.reader.fileSize - MAX_BOX_HEADER_SIZE) {
                let mfraHeaderSlice = this.reader.requestSliceRange(
                    potentialMfraPos,
                    MIN_BOX_HEADER_SIZE,
                    MAX_BOX_HEADER_SIZE,
                );
                if (mfraHeaderSlice instanceof Promise) mfraHeaderSlice = await mfraHeaderSlice;

                if (mfraHeaderSlice) {
                    const boxInfo = readBoxHeader(mfraHeaderSlice);

                    if (boxInfo && boxInfo.name === 'mfra') {
                        // We found the mfra box, allowing for much better random access. Let's parse it.
                        let mfraSlice = this.reader.requestSlice(mfraHeaderSlice.filePos, boxInfo.contentSize);
                        if (mfraSlice instanceof Promise) mfraSlice = await mfraSlice;

                        if (mfraSlice) {
                            this.readContiguousBoxes(mfraSlice);
                        }
                    }
                }
            }
        }
    })();
}
/**
 * Returns the sample table of the given track, building and caching it on `internalTrack.sampleTable` on first
 * use.
 *
 * When the track has no sample table in this file (`sampleTableByteOffset === null`), an empty table is returned.
 * Otherwise the `stbl` box is traversed to populate the table, followed by two post-processing steps:
 * 1. For PCM audio without composition offsets, every chunk is collapsed into a single sample.
 * 2. If composition time offsets exist, a sorted list of presentation timestamps is built together with a map
 *    from decode order to presentation order.
 */
getSampleTableForTrack(internalTrack: InternalTrack) {
    if (internalTrack.sampleTable) {
        return internalTrack.sampleTable;
    }

    const sampleTable: SampleTable = {
        sampleTimingEntries: [],
        sampleCompositionTimeOffsets: [],
        sampleSizes: [],
        keySampleIndices: null,
        chunkOffsets: [],
        sampleToChunk: [],
        presentationTimestamps: null,
        presentationTimestampIndexMap: null,
    };
    internalTrack.sampleTable = sampleTable;

    if (internalTrack.sampleTableByteOffset === null) {
        // There's no sample table to read, it's in another file (happens with segments)
        return sampleTable;
    }

    assert(this.moovSlice);
    const stblContainerSlice = this.moovSlice.slice(internalTrack.sampleTableByteOffset);

    // The box handlers write into whichever track is currently set on the demuxer
    this.currentTrack = internalTrack;
    this.traverseBox(stblContainerSlice);
    this.currentTrack = null;

    const isPcmCodec = internalTrack.info?.type === 'audio'
        && internalTrack.info.codec
        && (PCM_AUDIO_CODECS as readonly string[]).includes(internalTrack.info.codec);

    // Without any timing entries, the timing lookups below would find no entry and dereference `undefined`,
    // so the regrouping is skipped entirely in that (malformed) case.
    const hasTimingEntries = sampleTable.sampleTimingEntries.length > 0;

    if (isPcmCodec && sampleTable.sampleCompositionTimeOffsets.length === 0 && hasTimingEntries) {
        // If the audio has PCM samples, the way the samples are defined in the sample table is somewhat
        // suboptimal: Each individual audio sample is its own sample, meaning we can have 48000 samples per second.
        // Because we treat each sample as its own atomic unit that can be decoded, this would lead to a huge
        // amount of very short samples for PCM audio. So instead, we make a transformation: If the audio is in PCM,
        // we say that each chunk (that normally holds many samples) now is one big sample. We can do this because
        // the samples in the chunk are contiguous and the format is PCM, so the entire chunk as one thing still
        // encodes valid audio information.

        assert(internalTrack.info?.type === 'audio');
        const pcmInfo = parsePcmCodec(internalTrack.info.codec as PcmAudioCodec);

        const newSampleTimingEntries: SampleTimingEntry[] = [];
        const newSampleSizes: number[] = [];

        for (let i = 0; i < sampleTable.sampleToChunk.length; i++) {
            const chunkEntry = sampleTable.sampleToChunk[i]!;
            const nextEntry = sampleTable.sampleToChunk[i + 1];
            // An entry applies to all chunks up to the next entry (or up to the last chunk)
            const chunkCount = (nextEntry ? nextEntry.startChunkIndex : sampleTable.chunkOffsets.length)
                - chunkEntry.startChunkIndex;

            for (let j = 0; j < chunkCount; j++) {
                const startSampleIndex = chunkEntry.startSampleIndex + j * chunkEntry.samplesPerChunk;
                const endSampleIndex = startSampleIndex + chunkEntry.samplesPerChunk; // Exclusive, outside of chunk

                const startTimingEntryIndex = binarySearchLessOrEqual(
                    sampleTable.sampleTimingEntries,
                    startSampleIndex,
                    x => x.startIndex,
                );
                const startTimingEntry = sampleTable.sampleTimingEntries[startTimingEntryIndex]!;
                const endTimingEntryIndex = binarySearchLessOrEqual(
                    sampleTable.sampleTimingEntries,
                    endSampleIndex,
                    x => x.startIndex,
                );
                const endTimingEntry = sampleTable.sampleTimingEntries[endTimingEntryIndex]!;

                const firstSampleTimestamp = startTimingEntry.startDecodeTimestamp
                    + (startSampleIndex - startTimingEntry.startIndex) * startTimingEntry.delta;
                const lastSampleTimestamp = endTimingEntry.startDecodeTimestamp
                    + (endSampleIndex - endTimingEntry.startIndex) * endTimingEntry.delta;
                // The chunk's duration becomes the duration of the merged sample
                const delta = lastSampleTimestamp - firstSampleTimestamp;

                const lastSampleTimingEntry = last(newSampleTimingEntries);
                if (lastSampleTimingEntry && lastSampleTimingEntry.delta === delta) {
                    lastSampleTimingEntry.count++;
                } else {
                    // One sample for the entire chunk
                    newSampleTimingEntries.push({
                        startIndex: chunkEntry.startChunkIndex + j,
                        startDecodeTimestamp: firstSampleTimestamp,
                        count: 1,
                        delta,
                    });
                }

                // Instead of determining the chunk's size by looping over the samples sizes in the sample table, we
                // can directly compute it as we know how many PCM frames are in this chunk, and the size of each
                // PCM frame. This also improves compatibility with some files which fail to write proper sample
                // size values into their sample tables in the PCM case.
                const chunkSize = chunkEntry.samplesPerChunk
                    * pcmInfo.sampleSize
                    * internalTrack.info.numberOfChannels;

                newSampleSizes.push(chunkSize);
            }

            // From now on, each chunk holds exactly one (merged) sample
            chunkEntry.startSampleIndex = chunkEntry.startChunkIndex;
            chunkEntry.samplesPerChunk = 1;
        }

        sampleTable.sampleTimingEntries = newSampleTimingEntries;
        sampleTable.sampleSizes = newSampleSizes;
    }

    if (sampleTable.sampleCompositionTimeOffsets.length > 0) {
        // If composition time offsets are defined, we must build a list of all presentation timestamps and then
        // sort them
        sampleTable.presentationTimestamps = [];

        // Start out with the decode timestamps, in decode order (so array index === sample index)...
        for (const entry of sampleTable.sampleTimingEntries) {
            for (let i = 0; i < entry.count; i++) {
                sampleTable.presentationTimestamps.push({
                    presentationTimestamp: entry.startDecodeTimestamp + i * entry.delta,
                    sampleIndex: entry.startIndex + i,
                });
            }
        }

        // ...then shift each one by its composition offset
        for (const entry of sampleTable.sampleCompositionTimeOffsets) {
            for (let i = 0; i < entry.count; i++) {
                const sampleIndex = entry.startIndex + i;
                const sample = sampleTable.presentationTimestamps[sampleIndex];
                if (!sample) {
                    // Offset refers to a sample that has no timing entry
                    continue;
                }

                sample.presentationTimestamp += entry.offset;
            }
        }

        sampleTable.presentationTimestamps.sort((a, b) => a.presentationTimestamp - b.presentationTimestamp);

        // Inverse mapping: sample index (decode order) -> position in the sorted array (presentation order)
        sampleTable.presentationTimestampIndexMap = Array(sampleTable.presentationTimestamps.length).fill(-1);
        for (let i = 0; i < sampleTable.presentationTimestamps.length; i++) {
            sampleTable.presentationTimestampIndexMap[sampleTable.presentationTimestamps[i]!.sampleIndex] = i;
        }
    } else {
        // If they're not defined, we can simply use the decode timestamps as presentation timestamps
    }

    return sampleTable;
}
/**
 * Loads and parses the `moof` box located at `startPos` and returns the resulting fragment. If the most recently
 * read fragment already sits at that offset, it is returned directly without touching the reader.
 *
 * After parsing, each track's data in the fragment is post-processed: missing start timestamps are derived, the
 * fragment's position is recorded in the track's position cache, and lazily-fetched encryption aux info is
 * attached to the samples.
 */
async readFragment(startPos: number): Promise<Fragment> {
    const cached = this.lastReadFragment;
    if (cached?.moofOffset === startPos) {
        return cached;
    }

    let moofHeaderSlice = this.reader.requestSliceRange(startPos, MIN_BOX_HEADER_SIZE, MAX_BOX_HEADER_SIZE);
    if (moofHeaderSlice instanceof Promise) moofHeaderSlice = await moofHeaderSlice;
    assert(moofHeaderSlice);

    const moofHeader = readBoxHeader(moofHeaderSlice);
    assert(moofHeader?.name === 'moof');

    let moofSlice = this.reader.requestSlice(startPos, moofHeader.totalSize);
    if (moofSlice instanceof Promise) moofSlice = await moofSlice;
    assert(moofSlice);

    // Traversal is expected to leave the parsed fragment in `lastReadFragment`; the assertion below verifies it
    this.traverseBox(moofSlice);

    const fragment = this.lastReadFragment;
    assert(fragment && fragment.moofOffset === startPos);

    for (const trackData of fragment.trackData.values()) {
        const { track } = trackData;
        const positionCache = track.fragmentPositionCache;

        if (!trackData.startTimestampIsFinal) {
            // Some tracks don't define a base decode time, i.e. the point at which the fragment begins. Thus,
            // the start timestamp has to come from elsewhere: first the lookup table, then the position cache.
            // This plays well with the lookup algorithm, which starts at the furthest known point if such
            // entries exist, and otherwise walks the fragments sequentially from the start, summing durations.
            const tableEntry = track.fragmentLookupTable.find(entry => entry.moofOffset === fragment.moofOffset);

            if (tableEntry) {
                // The lookup table knows this fragment's timestamp
                offsetFragmentTrackDataByTimestamp(trackData, tableEntry.timestamp);
            } else {
                // Look for the closest cached fragment that sits strictly before this one in the file
                const precedingIndex = binarySearchLessOrEqual(
                    positionCache,
                    fragment.moofOffset - 1,
                    entry => entry.moofOffset,
                );

                if (precedingIndex !== -1) {
                    // Continue where the preceding fragment ended
                    offsetFragmentTrackDataByTimestamp(trackData, positionCache[precedingIndex]!.endTimestamp);
                }
                // Otherwise this is taken to be the first fragment, which amounts to an offset of 0
            }

            trackData.startTimestampIsFinal = true;
        }

        // Remember where this fragment is and which time range it covers; this speeds up future lookups when
        // there is no lookup table
        const neighborIndex = binarySearchLessOrEqual(
            positionCache,
            trackData.startTimestamp,
            entry => entry.startTimestamp,
        );
        const alreadyCached = neighborIndex !== -1
            && positionCache[neighborIndex]!.moofOffset === fragment.moofOffset;

        if (!alreadyCached) {
            positionCache.splice(neighborIndex + 1, 0, {
                moofOffset: fragment.moofOffset,
                startTimestamp: trackData.startTimestamp,
                endTimestamp: trackData.endTimestamp,
            });
        }

        // If senc wasn't parsed but saiz+saio were, fetch the aux info now and stamp each sample with it
        const auxInfo = trackData.encryptionAuxInfo;
        const encryptionInfo = track.encryptionInfo;

        if (auxInfo && encryptionInfo) {
            const resolvedEntries = await resolveEncryptionAuxInfo(this.reader, encryptionInfo, auxInfo);
            const { samples } = trackData;

            // Only as many samples as there are resolved entries can be stamped
            const stampCount = Math.min(samples.length, resolvedEntries.length);
            for (let sampleIndex = 0; sampleIndex < stampCount; sampleIndex++) {
                samples[sampleIndex]!.encryption = resolvedEntries[sampleIndex]!;
            }
        }
    }

    return fragment;
}

/**
 * Traverses boxes one after another, starting at the slice's current position, for as long as there is room left
 * in the slice for at least a minimal box header. Stops early as soon as `traverseBox` reports no box.
 */
readContiguousBoxes(slice: FileSlice) {
    const origin = slice.filePos;
    const hasRoomForHeader = () => slice.filePos - origin <= slice.length - MIN_BOX_HEADER_SIZE;

    while (hasRoomForHeader()) {
        if (!this.traverseBox(slice)) {
            return;
        }
    }
}
startIndex = slice.filePos; while (slice.filePos - startIndex <= slice.length - MIN_BOX_HEADER_SIZE) { const startPos = slice.filePos; const boxInfo = readBoxHeader(slice); if (!boxInfo) { break; } yield { boxInfo, slice }; slice.filePos = startPos + boxInfo.totalSize; } } traverseBox(slice: FileSlice): boolean { const startPos = slice.filePos; const boxInfo = readBoxHeader(slice); if (!boxInfo) { return false; } const contentStartPos = slice.filePos; const boxEndPos = startPos + boxInfo.totalSize; switch (boxInfo.name) { case 'mdia': case 'minf': case 'dinf': case 'mfra': case 'edts': case 'sinf': case 'schi': { this.readContiguousBoxes(slice.slice(contentStartPos, boxInfo.contentSize)); }; break; case 'mvhd': { const version = readU8(slice); slice.skip(3); // Flags if (version === 1) { slice.skip(8 + 8); this.movieTimescale = readU32Be(slice); this.movieDurationInTimescale = readU64Be(slice); } else { slice.skip(4 + 4); this.movieTimescale = readU32Be(slice); this.movieDurationInTimescale = readU32Be(slice); } }; break; case 'trak': { const track = { id: -1, demuxer: this, trackBacking: null, disposition: { ...DEFAULT_TRACK_DISPOSITION, primary: false, }, info: null, timescale: -1, durationInMovieTimescale: -1, durationInMediaTimescale: -1, rotation: 0, internalCodecId: null, name: null, languageCode: UNDETERMINED_LANGUAGE, sampleTableByteOffset: -1, sampleTable: null, fragmentLookupTable: [], currentFragmentState: null, fragmentPositionCache: [], editListPreviousSegmentDurations: 0, editListOffset: 0, encryptionInfo: null, encryptionAuxInfo: null, frmaCodecString: null, } satisfies InternalTrack as InternalTrack; this.currentTrack = track; this.readContiguousBoxes(slice.slice(contentStartPos, boxInfo.contentSize)); if (track.id !== -1 && track.timescale !== -1 && track.info !== null) { if (track.info.type === 'video' && track.info.width !== -1) { const videoTrack = track as InternalVideoTrack; track.trackBacking = new IsobmffVideoTrackBacking(videoTrack); 
this.tracks.push(track); } else if (track.info.type === 'audio' && track.info.numberOfChannels !== -1) { const audioTrack = track as InternalAudioTrack; track.trackBacking = new IsobmffAudioTrackBacking(audioTrack); this.tracks.push(track); } } this.currentTrack = null; }; break; case 'tkhd': { const track = this.currentTrack; if (!track) { break; } const version = readU8(slice); const flags = readU24Be(slice); // Spec says disabled tracks are to be treated like they don't exist, but in practice, they are treated // more like non-default tracks. const trackEnabled = !!(flags & 0x1); track.disposition.default = trackEnabled; // Skip over creation & modification time to reach the track ID if (version === 0) { slice.skip(8); track.id = readU32Be(slice); slice.skip(4); track.durationInMovieTimescale = readU32Be(slice); } else if (version === 1) { slice.skip(16); track.id = readU32Be(slice); slice.skip(4); track.durationInMovieTimescale = readU64Be(slice); } else { throw new Error(`Incorrect track header version ${version}.`); } slice.skip(2 * 4 + 2 + 2 + 2 + 2); const matrix: TransformationMatrix = [ readFixed_16_16(slice), readFixed_16_16(slice), readFixed_2_30(slice), readFixed_16_16(slice), readFixed_16_16(slice), readFixed_2_30(slice), readFixed_16_16(slice), readFixed_16_16(slice), readFixed_2_30(slice), ]; const rotation = normalizeRotation(roundToMultiple(extractRotationFromMatrix(matrix), 90)); assert(rotation === 0 || rotation === 90 || rotation === 180 || rotation === 270); track.rotation = rotation; }; break; case 'elst': { const track = this.currentTrack; if (!track) { break; } const version = readU8(slice); slice.skip(3); // Flags let relevantEntryFound = false; let previousSegmentDurations = 0; const entryCount = readU32Be(slice); for (let i = 0; i < entryCount; i++) { const segmentDuration = version === 1 ? readU64Be(slice) : readU32Be(slice); const mediaTime = version === 1 ? 
readI64Be(slice) : readI32Be(slice); const mediaRate = readFixed_16_16(slice); if (segmentDuration === 0) { // Don't care continue; } if (relevantEntryFound) { console.warn( 'Unsupported edit list: multiple edits are not currently supported. Only using first edit.', ); break; } if (mediaTime === -1) { previousSegmentDurations += segmentDuration; continue; } if (mediaRate !== 1) { console.warn('Unsupported edit list entry: media rate must be 1.'); break; } track.editListPreviousSegmentDurations = previousSegmentDurations; track.editListOffset = mediaTime; relevantEntryFound = true; } }; break; case 'mdhd': { const track = this.currentTrack; if (!track) { break; } const version = readU8(slice); slice.skip(3); // Flags if (version === 0) { slice.skip(8); track.timescale = readU32Be(slice); track.durationInMediaTimescale = readU32Be(slice); } else if (version === 1) { slice.skip(16); track.timescale = readU32Be(slice); track.durationInMediaTimescale = readU64Be(slice); } let language = readU16Be(slice); if (language > 0) { track.languageCode = ''; for (let i = 0; i < 3; i++) { track.languageCode = String.fromCharCode(0x60 + (language & 0b11111)) + track.languageCode; language >>= 5; } if (!isIso639Dash2LanguageCode(track.languageCode)) { // Sometimes the bytes are garbage track.languageCode = UNDETERMINED_LANGUAGE; } } }; break; case 'hdlr': { const track = this.currentTrack; if (!track) { break; } slice.skip(8); // Version + flags + pre-defined const handlerType = readAscii(slice, 4); if (handlerType === 'vide') { track.info = { type: 'video', width: -1, height: -1, squarePixelWidth: -1, squarePixelHeight: -1, codec: null, codecDescription: null, colorSpace: null, avcType: null, avcCodecInfo: null, hevcCodecInfo: null, vp9CodecInfo: null, av1CodecInfo: null, }; } else if (handlerType === 'soun') { track.info = { type: 'audio', numberOfChannels: -1, sampleRate: -1, codec: null, codecDescription: null, aacCodecInfo: null, pcmLittleEndian: false, pcmSampleSize: null, }; 
} }; break; case 'stbl': { const track = this.currentTrack; if (!track) { break; } track.sampleTableByteOffset = startPos; this.readContiguousBoxes(slice.slice(contentStartPos, boxInfo.contentSize)); }; break; case 'stsd': { const track = this.currentTrack; if (!track) { break; } if (track.info === null || track.sampleTable) { break; } const stsdVersion = readU8(slice); slice.skip(3); // Flags const entries = readU32Be(slice); for (let i = 0; i < entries; i++) { const sampleBoxStartPos = slice.filePos; const sampleBoxInfo = readBoxHeader(slice); if (!sampleBoxInfo) { break; } track.internalCodecId = sampleBoxInfo.name; const lowercaseBoxName = sampleBoxInfo.name.toLowerCase(); if (track.info.type === 'video') { slice.skip(6 * 1 + 2 + 2 + 2 + 3 * 4); track.info.width = readU16Be(slice); track.info.height = readU16Be(slice); track.info.squarePixelWidth = track.info.width; track.info.squarePixelHeight = track.info.height; slice.skip(4 + 4 + 4 + 2 + 32 + 2 + 2); track.frmaCodecString = null; this.readContiguousBoxes( slice.slice( slice.filePos, (sampleBoxStartPos + sampleBoxInfo.totalSize) - slice.filePos, ), ); const codecName = lowercaseBoxName === 'encv' ? track.frmaCodecString : lowercaseBoxName; track.frmaCodecString = null; if (codecName === 'avc1' || codecName === 'avc3') { track.info.codec = 'avc'; track.info.avcType = codecName === 'avc1' ? 
1 : 3; } else if (codecName === 'hvc1' || codecName === 'hev1') { track.info.codec = 'hevc'; } else if (codecName === 'vp08') { track.info.codec = 'vp8'; } else if (codecName === 'vp09') { track.info.codec = 'vp9'; } else if (codecName === 'av01') { track.info.codec = 'av1'; } else if (codecName === null) { console.warn(`Unknown encrypted video codec due to missing frma box.`); } else { console.warn(`Unsupported video codec (sample entry type '${sampleBoxInfo.name}').`); } } else { slice.skip(6 * 1 + 2); const version = readU16Be(slice); slice.skip(3 * 2); let channelCount = readU16Be(slice); let sampleSize = readU16Be(slice); slice.skip(2 * 2); // Can't use fixed16_16 as that's signed let sampleRate = readU32Be(slice) / 0x10000; let lpcmFlags: number | null = null; if (stsdVersion === 0 && version > 0) { // Additional QuickTime fields if (version === 1) { slice.skip(4); sampleSize = 8 * readU32Be(slice); slice.skip(2 * 4); } else if (version === 2) { slice.skip(4); sampleRate = readF64Be(slice); channelCount = readU32Be(slice); slice.skip(4); // Always 0x7f000000 sampleSize = readU32Be(slice); lpcmFlags = readU32Be(slice); slice.skip(2 * 4); } } track.info.numberOfChannels = channelCount; track.info.sampleRate = sampleRate; track.frmaCodecString = null; this.readContiguousBoxes( slice.slice( slice.filePos, (sampleBoxStartPos + sampleBoxInfo.totalSize) - slice.filePos, ), ); const codecName = lowercaseBoxName === 'enca' ? 
track.frmaCodecString : lowercaseBoxName; track.frmaCodecString = null; // developer.apple.com/documentation/quicktime-file-format/sound_sample_descriptions/ if (codecName === 'mp4a') { // The codec is set by the esds box } else if (codecName === 'opus') { track.info.codec = 'opus'; track.info.sampleRate = OPUS_SAMPLE_RATE; // Always the same } else if (codecName === 'flac') { track.info.codec = 'flac'; } else if (codecName === 'ulaw') { track.info.codec = 'ulaw'; } else if (codecName === 'alaw') { track.info.codec = 'alaw'; } else if (codecName === 'ac-3') { track.info.codec = 'ac3'; } else if (codecName === 'ec-3') { track.info.codec = 'eac3'; } else if (codecName === 'twos') { if (sampleSize === 8) { track.info.codec = 'pcm-s8'; } else if (sampleSize === 16) { track.info.codec = track.info.pcmLittleEndian ? 'pcm-s16' : 'pcm-s16be'; } else { console.warn(`Unsupported sample size ${sampleSize} for codec 'twos'.`); track.info.codec = null; } } else if (codecName === 'sowt') { if (sampleSize === 8) { track.info.codec = 'pcm-s8'; } else if (sampleSize === 16) { track.info.codec = 'pcm-s16'; } else { console.warn(`Unsupported sample size ${sampleSize} for codec 'sowt'.`); track.info.codec = null; } } else if (codecName === 'raw ') { track.info.codec = 'pcm-u8'; } else if (codecName === 'in24') { track.info.codec = track.info.pcmLittleEndian ? 'pcm-s24' : 'pcm-s24be'; } else if (codecName === 'in32') { track.info.codec = track.info.pcmLittleEndian ? 'pcm-s32' : 'pcm-s32be'; } else if (codecName === 'fl32') { track.info.codec = track.info.pcmLittleEndian ? 'pcm-f32' : 'pcm-f32be'; } else if (codecName === 'fl64') { track.info.codec = track.info.pcmLittleEndian ? 
'pcm-f64' : 'pcm-f64be';
      } else if (codecName === 'ipcm') {
        // Integer PCM: sample size and endianness are taken from track.info (written by the pcmC handler)
        const pcmSampleSize = track.info.pcmSampleSize;

        if (track.info.pcmLittleEndian) {
          if (pcmSampleSize === 16) {
            track.info.codec = 'pcm-s16';
          } else if (pcmSampleSize === 24) {
            track.info.codec = 'pcm-s24';
          } else if (pcmSampleSize === 32) {
            track.info.codec = 'pcm-s32';
          } else {
            console.warn(`Invalid ipcm sample size ${pcmSampleSize}.`);
            track.info.codec = null;
          }
        } else {
          if (pcmSampleSize === 16) {
            track.info.codec = 'pcm-s16be';
          } else if (pcmSampleSize === 24) {
            track.info.codec = 'pcm-s24be';
          } else if (pcmSampleSize === 32) {
            track.info.codec = 'pcm-s32be';
          } else {
            console.warn(`Invalid ipcm sample size ${pcmSampleSize}.`);
            track.info.codec = null;
          }
        }
      } else if (codecName === 'fpcm') {
        // Floating-point PCM: same lookup as 'ipcm', but only 32- and 64-bit samples are valid
        const pcmSampleSize = track.info.pcmSampleSize;

        if (track.info.pcmLittleEndian) {
          if (pcmSampleSize === 32) {
            track.info.codec = 'pcm-f32';
          } else if (pcmSampleSize === 64) {
            track.info.codec = 'pcm-f64';
          } else {
            console.warn(`Invalid fpcm sample size ${pcmSampleSize}.`);
            track.info.codec = null;
          }
        } else {
          if (pcmSampleSize === 32) {
            track.info.codec = 'pcm-f32be';
          } else if (pcmSampleSize === 64) {
            track.info.codec = 'pcm-f64be';
          } else {
            console.warn(`Invalid fpcm sample size ${pcmSampleSize}.`);
            track.info.codec = null;
          }
        }
      } else if (codecName === 'lpcm' && lpcmFlags !== null) {
        // The format is described entirely by the flags of the version-2 sound description:
        // bit 0 = float, bit 1 = big-endian, bit 2 = signed integer.
        const bytesPerSample = (sampleSize + 7) >> 3;
        const isFloat = Boolean(lpcmFlags & 1);
        const isBigEndian = Boolean(lpcmFlags & 2);
        const sFlags = lpcmFlags & 4 ? -1 : 0; // I guess it means "signed flags" or something?

        if (sampleSize > 0 && sampleSize <= 64) {
          if (isFloat) {
            if (sampleSize === 32) {
              track.info.codec = isBigEndian ? 'pcm-f32be' : 'pcm-f32';
            } else if (sampleSize === 64) {
              // 64-bit float LPCM maps onto the same codecs the 'fl64' and 'fpcm' branches produce
              track.info.codec = isBigEndian ? 'pcm-f64be' : 'pcm-f64';
            }
          } else {
            // sFlags is either all ones (signed) or zero (unsigned), so this test selects signed formats
            if (sFlags & (1 << (bytesPerSample - 1))) {
              if (bytesPerSample === 1) {
                track.info.codec = 'pcm-s8';
              } else if (bytesPerSample === 2) {
                track.info.codec = isBigEndian ? 'pcm-s16be' : 'pcm-s16';
              } else if (bytesPerSample === 3) {
                track.info.codec = isBigEndian ?
'pcm-s24be' : 'pcm-s24';
              } else if (bytesPerSample === 4) {
                track.info.codec = isBigEndian ? 'pcm-s32be' : 'pcm-s32';
              }
            } else {
              // Unsigned integer PCM is only mapped for single-byte samples
              if (bytesPerSample === 1) {
                track.info.codec = 'pcm-u8';
              }
            }
          }
        }

        if (track.info.codec === null) {
          console.warn('Unsupported PCM format.');
        }
      } else if (codecName === null) {
        console.warn(`Unknown encrypted audio codec due to missing frma box.`);
      } else {
        console.warn(`Unsupported audio codec (sample entry type '${sampleBoxInfo.name}').`);
      }
    }

    // Jump to the end of this sample entry, regardless of how much of it was consumed above
    slice.filePos = sampleBoxStartPos + sampleBoxInfo.totalSize;
  }
}; break;

case 'frma': {
  // Stores the lowercased four-character format code on the current track
  const track = this.currentTrack;
  if (!track) {
    break;
  }

  const format = readAscii(slice, 4);
  const lowercase = format.toLowerCase();

  // Tells us what codec the encrypted track actually uses
  track.frmaCodecString = lowercase;
}; break;

case 'schm': {
  // Creates the track's encryptionInfo for the recognized schemes; every default starts out null
  const track = this.currentTrack;
  if (!track) {
    break;
  }

  slice.skip(4); // Version + flags

  const schemeType = readAscii(slice, 4);
  if (schemeType === 'cenc' || schemeType === 'cens' || schemeType === 'cbcs') {
    track.encryptionInfo = {
      scheme: schemeType,
      defaultKid: null,
      defaultIsProtected: null,
      defaultPerSampleIvSize: null,
      defaultConstantIv: null,
      defaultCryptByteBlock: null,
      defaultSkipByteBlock: null,
    };
  } else {
    console.warn(`Unsupported encryption scheme '${schemeType}'.`);
  }
}; break;

case 'tenc': {
  // Fills in the defaults of an encryptionInfo that already exists; ignored if the track has none
  const track = this.currentTrack;
  if (!track || !track.encryptionInfo) {
    break;
  }

  const version = readU8(slice);
  slice.skip(3); // Flags

  slice.skip(1); // Reserved

  // High nibble = crypt byte block, low nibble = skip byte block; only interpreted for version > 0
  const patternByte = readU8(slice);
  if (version > 0) {
    track.encryptionInfo.defaultCryptByteBlock = patternByte >> 4;
    track.encryptionInfo.defaultSkipByteBlock = patternByte & 0xf;
  } else {
    track.encryptionInfo.defaultCryptByteBlock = 0;
    track.encryptionInfo.defaultSkipByteBlock = 0;
  }

  track.encryptionInfo.defaultIsProtected = readU8(slice) !== 0;
  track.encryptionInfo.defaultPerSampleIvSize = readU8(slice);
  track.encryptionInfo.defaultKid = bytesToHexString(readBytes(slice, 16));

  if
(track.encryptionInfo.defaultIsProtected && track.encryptionInfo.defaultPerSampleIvSize === 0) {
    // Protected with no per-sample IV: a constant IV follows
    const constantIvSize = readU8(slice);

    // The IV is stored in a fixed 16-byte buffer; shorter IVs fill the leading bytes, the rest stays zero
    const constantIv = new Uint8Array(16);

    if (constantIvSize <= constantIv.length) {
      constantIv.set(readBytes(slice, constantIvSize), 0);
      track.encryptionInfo.defaultConstantIv = constantIv;
    } else {
      // The size comes straight from the file; copying more than 16 bytes into the buffer would throw
      // a RangeError, so warn and leave defaultConstantIv unset instead.
      console.warn(`Invalid constant IV size ${constantIvSize} in tenc box.`);
    }
  }
}; break;

case 'avcC': {
  // The entire box payload is stored as the track's codec description
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info);

  track.info.codecDescription = readBytes(slice, boxInfo.contentSize);
}; break;

case 'hvcC': {
  // The entire box payload is stored as the track's codec description
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info);

  track.info.codecDescription = readBytes(slice, boxInfo.contentSize);
}; break;

case 'vpcC': {
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info?.type === 'video');

  slice.skip(4); // Version + flags

  const profile = readU8(slice);
  const level = readU8(slice);

  // Packed byte: bit depth (upper 4 bits), chroma subsampling (3 bits), full-range flag (lowest bit)
  const thirdByte = readU8(slice);
  const bitDepth = thirdByte >> 4;
  const chromaSubsampling = (thirdByte >> 1) & 0b111;
  const videoFullRangeFlag = thirdByte & 1;

  const colourPrimaries = readU8(slice);
  const transferCharacteristics = readU8(slice);
  const matrixCoefficients = readU8(slice);

  track.info.vp9CodecInfo = {
    profile,
    level,
    bitDepth,
    chromaSubsampling,
    videoFullRangeFlag,
    colourPrimaries,
    transferCharacteristics,
    matrixCoefficients,
  };
}; break;

case 'av1C': {
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info?.type === 'video');

  slice.skip(1); // Marker + version

  // Packed byte: profile (upper 3 bits), level (lower 5 bits)
  const secondByte = readU8(slice);
  const profile = secondByte >> 5;
  const level = secondByte & 0b11111;

  // Packed byte of single-bit flags, with a 2-bit chroma sample position in the lowest bits
  const thirdByte = readU8(slice);
  const tier = thirdByte >> 7;
  const highBitDepth = (thirdByte >> 6) & 1;
  const twelveBit = (thirdByte >> 5) & 1;
  const monochrome = (thirdByte >> 4) & 1;
  const chromaSubsamplingX = (thirdByte >> 3) & 1;
  const chromaSubsamplingY = (thirdByte >> 2) & 1;
  const chromaSamplePosition = thirdByte & 0b11;

  // Logic from https://aomediacodec.github.io/av1-spec/av1-spec.pdf
  const bitDepth = profile === 2 && highBitDepth ? (twelveBit ?
12 : 10) : (highBitDepth ? 10 : 8);

  track.info.av1CodecInfo = {
    profile,
    level,
    tier,
    bitDepth,
    monochrome,
    chromaSubsamplingX,
    chromaSubsamplingY,
    chromaSamplePosition,
  };
}; break;

case 'colr': {
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info?.type === 'video');

  // Only the 'nclx' colour type is interpreted; any other type leaves colorSpace untouched
  const colourType = readAscii(slice, 4);
  if (colourType !== 'nclx') {
    break;
  }

  const colourPrimaries = readU16Be(slice);
  const transferCharacteristics = readU16Be(slice);
  const matrixCoefficients = readU16Be(slice);
  const fullRangeFlag = Boolean(readU8(slice) & 0x80); // Top bit of the following byte

  // Numeric codes are translated through the inverse lookup maps
  track.info.colorSpace = {
    primaries: COLOR_PRIMARIES_MAP_INVERSE[colourPrimaries],
    transfer: TRANSFER_CHARACTERISTICS_MAP_INVERSE[transferCharacteristics],
    matrix: MATRIX_COEFFICIENTS_MAP_INVERSE[matrixCoefficients],
    fullRange: fullRangeFlag,
  } as VideoColorSpaceInit;
}; break;

case 'pasp': {
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info?.type === 'video');

  const num = readU32Be(slice);
  const den = readU32Be(slice);

  // https://github.com/Vanilagy/mediabunny/issues/362
  if (num > 0 && den > 0) {
    // Only one dimension is adjusted: the width when num > den, otherwise the height
    if (num > den) {
      track.info.squarePixelWidth = Math.round(track.info.width * num / den);
    } else {
      track.info.squarePixelHeight = Math.round(track.info.height * den / num);
    }
  }
}; break;

case 'wave': {
  // Pure container: its content is read as a sequence of child boxes
  this.readContiguousBoxes(slice.slice(contentStartPos, boxInfo.contentSize));
}; break;

case 'esds': {
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info?.type === 'audio');

  slice.skip(4); // Version + flags

  const tag = readU8(slice);
  assert(tag === 0x03); // ES Descriptor

  readIsomVariableInteger(slice); // Length

  slice.skip(2); // ES ID

  // Flag byte: each set flag means an optional field follows, which is skipped below
  const mixed = readU8(slice);

  const streamDependenceFlag = (mixed & 0x80) !== 0;
  const urlFlag = (mixed & 0x40) !== 0;
  const ocrStreamFlag = (mixed & 0x20) !== 0;

  if (streamDependenceFlag) {
    slice.skip(2);
  }
  if (urlFlag) {
    // Length-prefixed field: one length byte followed by that many bytes
    const urlLength = readU8(slice);
    slice.skip(urlLength);
  }
  if (ocrStreamFlag) {
    slice.skip(2);
  }

  const decoderConfigTag =
readU8(slice);
  assert(decoderConfigTag === 0x04); // DecoderConfigDescriptor

  const decoderConfigDescriptorLength = readIsomVariableInteger(slice); // Length

  // Remembered so we can later tell whether the descriptor holds more data than the fixed fields
  const payloadStart = slice.filePos;

  const objectTypeIndication = readU8(slice);
  if (objectTypeIndication === 0x40 || objectTypeIndication === 0x67) {
    track.info.codec = 'aac';
    track.info.aacCodecInfo = {
      isMpeg2: objectTypeIndication === 0x67,
      objectType: null,
    };
  } else if (objectTypeIndication === 0x69 || objectTypeIndication === 0x6b) {
    track.info.codec = 'mp3';
  } else if (objectTypeIndication === 0xdd) {
    track.info.codec = 'vorbis'; // "nonstandard, gpac uses it" - FFmpeg
  } else {
    console.warn(
      `Unsupported audio codec (objectTypeIndication ${objectTypeIndication}) - discarding track.`,
    );
  }

  // Skip the remaining 12 bytes of fixed fields
  // NOTE(review): presumably stream type, buffer size, max and average bitrate - confirm against ISO/IEC 14496-1
  slice.skip(1 + 3 + 4 + 4);

  if (decoderConfigDescriptorLength > slice.filePos - payloadStart) {
    // There's a DecoderSpecificInfo at the end, let's read it
    const decoderSpecificInfoTag = readU8(slice);
    assert(decoderSpecificInfoTag === 0x05); // DecoderSpecificInfo

    const decoderSpecificInfoLength = readIsomVariableInteger(slice);
    track.info.codecDescription = readBytes(slice, decoderSpecificInfoLength);

    if (track.info.codec === 'aac') {
      // Let's try to deduce more accurate values directly from the AudioSpecificConfig:
      const audioSpecificConfig = parseAacAudioSpecificConfig(track.info.codecDescription);
      if (audioSpecificConfig.numberOfChannels !== null) {
        track.info.numberOfChannels = audioSpecificConfig.numberOfChannels;
      }
      if (audioSpecificConfig.sampleRate !== null) {
        track.info.sampleRate = audioSpecificConfig.sampleRate;
      }
    }
  }
}; break;

case 'enda': {
  // Sets the endianness flag that the PCM codec mappings read
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info?.type === 'audio');

  track.info.pcmLittleEndian = !!(readU16Be(slice) & 0xff); // 0xff is from FFmpeg
}; break;

case 'pcmC': {
  const track = this.currentTrack;
  if (!track) {
    break;
  }
  assert(track.info?.type === 'audio');

  slice.skip(1 + 3); // Version + flags

  // ISO/IEC 23003-5
  const formatFlags =
readU8(slice); track.info.pcmLittleEndian = Boolean(formatFlags & 0x01); track.info.pcmSampleSize = readU8(slice); }; break; case 'dOps': { // Used for Opus audio const track = this.currentTrack; if (!track) { break; } assert(track.info?.type === 'audio'); slice.skip(1); // Version // https://www.opus-codec.org/docs/opus_in_isobmff.html const outputChannelCount = readU8(slice); const preSkip = readU16Be(slice); const inputSampleRate = readU32Be(slice); const outputGain = readI16Be(slice); const channelMappingFamily = readU8(slice); let channelMappingTable: Uint8Array; if (channelMappingFamily !== 0) { channelMappingTable = readBytes(slice, 2 + outputChannelCount); } else { channelMappingTable = new Uint8Array(0); } // https://datatracker.ietf.org/doc/html/draft-ietf-codec-oggopus-06 const d