UNPKG

mediabunny

Version:

Pure TypeScript media toolkit for reading, writing, and converting media files, directly in the browser.

1,112 lines (1,111 loc) 45.8 kB
/*! * Copyright (c) 2025-present, Vanilagy and contributors * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ import { toUint8Array, assert, isU32, last, textEncoder, COLOR_PRIMARIES_MAP, TRANSFER_CHARACTERISTICS_MAP, MATRIX_COEFFICIENTS_MAP, colorSpaceIsComplete, UNDETERMINED_LANGUAGE, } from '../misc.js'; import { generateAv1CodecConfigurationFromCodecString, parsePcmCodec, PCM_AUDIO_CODECS, } from '../codec.js'; import { formatSubtitleTimestamp } from '../subtitles.js'; import { GLOBAL_TIMESCALE, intoTimescale, } from './isobmff-muxer.js'; import { parseOpusIdentificationHeader } from '../codec-data.js'; export class IsobmffBoxWriter { constructor(writer) { this.writer = writer; this.helper = new Uint8Array(8); this.helperView = new DataView(this.helper.buffer); /** * Stores the position from the start of the file to where boxes elements have been written. This is used to * rewrite/edit elements that were already added before, and to measure sizes of things. */ this.offsets = new WeakMap(); } writeU32(value) { this.helperView.setUint32(0, value, false); this.writer.write(this.helper.subarray(0, 4)); } writeU64(value) { this.helperView.setUint32(0, Math.floor(value / 2 ** 32), false); this.helperView.setUint32(4, value, false); this.writer.write(this.helper.subarray(0, 8)); } writeAscii(text) { for (let i = 0; i < text.length; i++) { this.helperView.setUint8(i % 8, text.charCodeAt(i)); if (i % 8 === 7) this.writer.write(this.helper); } if (text.length % 8 !== 0) { this.writer.write(this.helper.subarray(0, text.length % 8)); } } writeBox(box) { this.offsets.set(box, this.writer.getPos()); if (box.contents && !box.children) { this.writeBoxHeader(box, box.size ?? box.contents.byteLength + 8); this.writer.write(box.contents); } else { const startPos = this.writer.getPos(); this.writeBoxHeader(box, 0); if (box.contents) this.writer.write(box.contents); if (box.children) for (const child of box.children) if (child) this.writeBox(child); const endPos = this.writer.getPos(); const size = box.size ?? endPos - startPos; this.writer.seek(startPos); this.writeBoxHeader(box, size); this.writer.seek(endPos); } } writeBoxHeader(box, size) { this.writeU32(box.largeSize ? 1 : size); this.writeAscii(box.type); if (box.largeSize) this.writeU64(size); } measureBoxHeader(box) { return 8 + (box.largeSize ? 8 : 0); } patchBox(box) { const boxOffset = this.offsets.get(box); assert(boxOffset !== undefined); const endPos = this.writer.getPos(); this.writer.seek(boxOffset); this.writeBox(box); this.writer.seek(endPos); } measureBox(box) { if (box.contents && !box.children) { const headerSize = this.measureBoxHeader(box); return headerSize + box.contents.byteLength; } else { let result = this.measureBoxHeader(box); if (box.contents) result += box.contents.byteLength; if (box.children) for (const child of box.children) if (child) result += this.measureBox(child); return result; } } } const bytes = new Uint8Array(8); const view = new DataView(bytes.buffer); const u8 = (value) => { return [(value % 0x100 + 0x100) % 0x100]; }; const u16 = (value) => { view.setUint16(0, value, false); return [bytes[0], bytes[1]]; }; const i16 = (value) => { view.setInt16(0, value, false); return [bytes[0], bytes[1]]; }; const u24 = (value) => { view.setUint32(0, value, false); return [bytes[1], bytes[2], bytes[3]]; }; const u32 = (value) => { view.setUint32(0, value, false); return [bytes[0], bytes[1], bytes[2], bytes[3]]; }; const i32 = (value) => { view.setInt32(0, value, false); return [bytes[0], bytes[1], bytes[2], bytes[3]]; }; const u64 = (value) => { view.setUint32(0, Math.floor(value / 2 ** 32), false); view.setUint32(4, value, false); return [bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7]]; }; const fixed_8_8 = (value) => { view.setInt16(0, 2 ** 8 * value, false); return [bytes[0], bytes[1]]; }; const fixed_16_16 = (value) => { view.setInt32(0, 2 ** 16 * value, false); return [bytes[0], bytes[1], bytes[2], bytes[3]]; }; const fixed_2_30 = (value) => { view.setInt32(0, 2 ** 30 * value, false); return [bytes[0], bytes[1], bytes[2], bytes[3]]; }; const variableUnsignedInt = (value, byteLength) => { const bytes = []; let remaining = value; do { let byte = remaining & 0x7f; remaining >>= 7; // If this isn't the first byte we're adding (meaning there will be more bytes after it // when we reverse the array), set the continuation bit if (bytes.length > 0) { byte |= 0x80; } bytes.push(byte); if (byteLength !== undefined) { byteLength--; } } while (remaining > 0 || byteLength); // Reverse the array since we built it backwards return bytes.reverse(); }; const ascii = (text, nullTerminated = false) => { const bytes = Array(text.length).fill(null).map((_, i) => text.charCodeAt(i)); if (nullTerminated) bytes.push(0x00); return bytes; }; const lastPresentedSample = (samples) => { let result = null; for (const sample of samples) { if (!result || sample.timestamp > result.timestamp) { result = sample; } } return result; }; const rotationMatrix = (rotationInDegrees) => { const theta = rotationInDegrees * (Math.PI / 180); const cosTheta = Math.round(Math.cos(theta)); const sinTheta = Math.round(Math.sin(theta)); // Matrices are post-multiplied in ISOBMFF, meaning this is the transpose of your typical rotation matrix return [ cosTheta, sinTheta, 0, -sinTheta, cosTheta, 0, 0, 0, 1, ]; }; const IDENTITY_MATRIX = rotationMatrix(0); const matrixToBytes = (matrix) => { return [ fixed_16_16(matrix[0]), fixed_16_16(matrix[1]), fixed_2_30(matrix[2]), fixed_16_16(matrix[3]), fixed_16_16(matrix[4]), fixed_2_30(matrix[5]), fixed_16_16(matrix[6]), fixed_16_16(matrix[7]), fixed_2_30(matrix[8]), ]; }; export const box = (type, contents, children) => ({ type, contents: contents && new Uint8Array(contents.flat(10)), children, }); /** A FullBox always starts with a version byte, followed by three flag bytes. */ export const fullBox = (type, version, flags, contents, children) => box(type, [u8(version), u24(flags), contents ?? []], children); /** * File Type Compatibility Box: Allows the reader to determine whether this is a type of file that the * reader understands. */ export const ftyp = (details) => { // You can find the full logic for this at // https://github.com/FFmpeg/FFmpeg/blob/de2fb43e785773738c660cdafb9309b1ef1bc80d/libavformat/movenc.c#L5518 // Obviously, this lib only needs a small subset of that logic. const minorVersion = 0x200; if (details.isQuickTime) { return box('ftyp', [ ascii('qt '), // Major brand u32(minorVersion), // Minor version // Compatible brands ascii('qt '), ]); } if (details.fragmented) { return box('ftyp', [ ascii('iso5'), // Major brand u32(minorVersion), // Minor version // Compatible brands ascii('iso5'), ascii('iso6'), ascii('mp41'), ]); } return box('ftyp', [ ascii('isom'), // Major brand u32(minorVersion), // Minor version // Compatible brands ascii('isom'), details.holdsAvc ? ascii('avc1') : [], ascii('mp41'), ]); }; /** Movie Sample Data Box. Contains the actual frames/samples of the media. */ export const mdat = (reserveLargeSize) => ({ type: 'mdat', largeSize: reserveLargeSize }); /** * Movie Box: Used to specify the information that defines a movie - that is, the information that allows * an application to interpret the sample data that is stored elsewhere. */ export const moov = (trackDatas, creationTime, fragmented = false) => box('moov', undefined, [ mvhd(creationTime, trackDatas), ...trackDatas.map(x => trak(x, creationTime)), fragmented ? mvex(trackDatas) : null, ]); /** Movie Header Box: Used to specify the characteristics of the entire movie, such as timescale and duration. */ export const mvhd = (creationTime, trackDatas) => { const duration = intoTimescale(Math.max(0, ...trackDatas .filter(x => x.samples.length > 0) .map((x) => { const lastSample = lastPresentedSample(x.samples); return lastSample.timestamp + lastSample.duration; })), GLOBAL_TIMESCALE); const nextTrackId = Math.max(0, ...trackDatas.map(x => x.track.id)) + 1; // Conditionally use u64 if u32 isn't enough const needsU64 = !isU32(creationTime) || !isU32(duration); const u32OrU64 = needsU64 ? u64 : u32; return fullBox('mvhd', +needsU64, 0, [ u32OrU64(creationTime), // Creation time u32OrU64(creationTime), // Modification time u32(GLOBAL_TIMESCALE), // Timescale u32OrU64(duration), // Duration fixed_16_16(1), // Preferred rate fixed_8_8(1), // Preferred volume Array(10).fill(0), // Reserved matrixToBytes(IDENTITY_MATRIX), // Matrix Array(24).fill(0), // Pre-defined u32(nextTrackId), // Next track ID ]); }; /** * Track Box: Defines a single track of a movie. A movie may consist of one or more tracks. Each track is * independent of the other tracks in the movie and carries its own temporal and spatial information. Each Track Box * contains its associated Media Box. */ export const trak = (trackData, creationTime) => box('trak', undefined, [ tkhd(trackData, creationTime), mdia(trackData, creationTime), ]); /** Track Header Box: Specifies the characteristics of a single track within a movie. */ export const tkhd = (trackData, creationTime) => { const lastSample = lastPresentedSample(trackData.samples); const durationInGlobalTimescale = intoTimescale(lastSample ? lastSample.timestamp + lastSample.duration : 0, GLOBAL_TIMESCALE); const needsU64 = !isU32(creationTime) || !isU32(durationInGlobalTimescale); const u32OrU64 = needsU64 ? u64 : u32; let matrix; if (trackData.type === 'video') { const rotation = trackData.track.metadata.rotation; matrix = rotationMatrix(rotation ?? 0); } else { matrix = IDENTITY_MATRIX; } return fullBox('tkhd', +needsU64, 3, [ u32OrU64(creationTime), // Creation time u32OrU64(creationTime), // Modification time u32(trackData.track.id), // Track ID u32(0), // Reserved u32OrU64(durationInGlobalTimescale), // Duration Array(8).fill(0), // Reserved u16(0), // Layer u16(trackData.track.id), // Alternate group fixed_8_8(trackData.type === 'audio' ? 1 : 0), // Volume u16(0), // Reserved matrixToBytes(matrix), // Matrix fixed_16_16(trackData.type === 'video' ? trackData.info.width : 0), // Track width fixed_16_16(trackData.type === 'video' ? trackData.info.height : 0), // Track height ]); }; /** Media Box: Describes and define a track's media type and sample data. */ export const mdia = (trackData, creationTime) => box('mdia', undefined, [ mdhd(trackData, creationTime), hdlr(trackData), minf(trackData), ]); /** Media Header Box: Specifies the characteristics of a media, including timescale and duration. */ export const mdhd = (trackData, creationTime) => { const lastSample = lastPresentedSample(trackData.samples); const localDuration = intoTimescale(lastSample ? lastSample.timestamp + lastSample.duration : 0, trackData.timescale); const needsU64 = !isU32(creationTime) || !isU32(localDuration); const u32OrU64 = needsU64 ? u64 : u32; let language = 0; for (const character of (trackData.track.metadata.languageCode ?? UNDETERMINED_LANGUAGE)) { language <<= 5; language += character.charCodeAt(0) - 0x60; } return fullBox('mdhd', +needsU64, 0, [ u32OrU64(creationTime), // Creation time u32OrU64(creationTime), // Modification time u32(trackData.timescale), // Timescale u32OrU64(localDuration), // Duration u16(language), // Language u16(0), // Quality ]); }; const TRACK_TYPE_TO_COMPONENT_SUBTYPE = { video: 'vide', audio: 'soun', subtitle: 'text', }; const TRACK_TYPE_TO_HANDLER_NAME = { video: 'MediabunnyVideoHandler', audio: 'MediabunnySoundHandler', subtitle: 'MediabunnyTextHandler', }; /** Handler Reference Box: Specifies the media handler component that is to be used to interpret the media's data. */ export const hdlr = (trackData) => fullBox('hdlr', 0, 0, [ ascii('mhlr'), // Component type ascii(TRACK_TYPE_TO_COMPONENT_SUBTYPE[trackData.type]), // Component subtype u32(0), // Component manufacturer u32(0), // Component flags u32(0), // Component flags mask ascii(TRACK_TYPE_TO_HANDLER_NAME[trackData.type], true), // Component name ]); /** * Media Information Box: Stores handler-specific information for a track's media data. The media handler uses this * information to map from media time to media data and to process the media data. */ export const minf = (trackData) => box('minf', undefined, [ TRACK_TYPE_TO_HEADER_BOX[trackData.type](), dinf(), stbl(trackData), ]); /** Video Media Information Header Box: Defines specific color and graphics mode information. */ export const vmhd = () => fullBox('vmhd', 0, 1, [ u16(0), // Graphics mode u16(0), // Opcolor R u16(0), // Opcolor G u16(0), // Opcolor B ]); /** Sound Media Information Header Box: Stores the sound media's control information, such as balance. */ export const smhd = () => fullBox('smhd', 0, 0, [ u16(0), // Balance u16(0), // Reserved ]); /** Null Media Header Box. */ export const nmhd = () => fullBox('nmhd', 0, 0); const TRACK_TYPE_TO_HEADER_BOX = { video: vmhd, audio: smhd, subtitle: nmhd, }; /** * Data Information Box: Contains information specifying the data handler component that provides access to the * media data. The data handler component uses the Data Information Box to interpret the media's data. */ export const dinf = () => box('dinf', undefined, [ dref(), ]); /** * Data Reference Box: Contains tabular data that instructs the data handler component how to access the media's data. */ export const dref = () => fullBox('dref', 0, 0, [ u32(1), // Entry count ], [ url(), ]); export const url = () => fullBox('url ', 0, 1); // Self-reference flag enabled /** * Sample Table Box: Contains information for converting from media time to sample number to sample location. This box * also indicates how to interpret the sample (for example, whether to decompress the video data and, if so, how). */ export const stbl = (trackData) => { const needsCtts = trackData.compositionTimeOffsetTable.length > 1 || trackData.compositionTimeOffsetTable.some(x => x.sampleCompositionTimeOffset !== 0); return box('stbl', undefined, [ stsd(trackData), stts(trackData), needsCtts ? ctts(trackData) : null, needsCtts ? cslg(trackData) : null, stsc(trackData), stsz(trackData), stco(trackData), stss(trackData), ]); }; /** * Sample Description Box: Stores information that allows you to decode samples in the media. The data stored in the * sample description varies, depending on the media type. */ export const stsd = (trackData) => { let sampleDescription; if (trackData.type === 'video') { sampleDescription = videoSampleDescription(VIDEO_CODEC_TO_BOX_NAME[trackData.track.source._codec], trackData); } else if (trackData.type === 'audio') { const boxName = audioCodecToBoxName(trackData.track.source._codec, trackData.muxer.isQuickTime); assert(boxName); sampleDescription = soundSampleDescription(boxName, trackData); } else if (trackData.type === 'subtitle') { sampleDescription = subtitleSampleDescription(SUBTITLE_CODEC_TO_BOX_NAME[trackData.track.source._codec], trackData); } assert(sampleDescription); return fullBox('stsd', 0, 0, [ u32(1), // Entry count ], [ sampleDescription, ]); }; /** Video Sample Description Box: Contains information that defines how to interpret video media data. */ export const videoSampleDescription = (compressionType, trackData) => box(compressionType, [ Array(6).fill(0), // Reserved u16(1), // Data reference index u16(0), // Pre-defined u16(0), // Reserved Array(12).fill(0), // Pre-defined u16(trackData.info.width), // Width u16(trackData.info.height), // Height u32(0x00480000), // Horizontal resolution u32(0x00480000), // Vertical resolution u32(0), // Reserved u16(1), // Frame count Array(32).fill(0), // Compressor name u16(0x0018), // Depth i16(0xffff), // Pre-defined ], [ VIDEO_CODEC_TO_CONFIGURATION_BOX[trackData.track.source._codec](trackData), colorSpaceIsComplete(trackData.info.decoderConfig.colorSpace) ? colr(trackData) : null, ]); /** Colour Information Box: Specifies the color space of the video. */ export const colr = (trackData) => box('colr', [ ascii('nclx'), // Colour type u16(COLOR_PRIMARIES_MAP[trackData.info.decoderConfig.colorSpace.primaries]), // Colour primaries u16(TRANSFER_CHARACTERISTICS_MAP[trackData.info.decoderConfig.colorSpace.transfer]), // Transfer characteristics u16(MATRIX_COEFFICIENTS_MAP[trackData.info.decoderConfig.colorSpace.matrix]), // Matrix coefficients u8((trackData.info.decoderConfig.colorSpace.fullRange ? 1 : 0) << 7), // Full range flag ]); /** AVC Configuration Box: Provides additional information to the decoder. */ export const avcC = (trackData) => trackData.info.decoderConfig && box('avcC', [ // For AVC, description is an AVCDecoderConfigurationRecord, so nothing else to do here ...toUint8Array(trackData.info.decoderConfig.description), ]); /** HEVC Configuration Box: Provides additional information to the decoder. */ export const hvcC = (trackData) => trackData.info.decoderConfig && box('hvcC', [ // For HEVC, description is an HEVCDecoderConfigurationRecord, so nothing else to do here ...toUint8Array(trackData.info.decoderConfig.description), ]); /** VP Configuration Box: Provides additional information to the decoder. */ export const vpcC = (trackData) => { // Reference: https://www.webmproject.org/vp9/mp4/ if (!trackData.info.decoderConfig) { return null; } const decoderConfig = trackData.info.decoderConfig; const parts = decoderConfig.codec.split('.'); // We can derive the required values from the codec string const profile = Number(parts[1]); const level = Number(parts[2]); const bitDepth = Number(parts[3]); const chromaSubsampling = parts[4] ? Number(parts[4]) : 1; // 4:2:0 colocated with luma (0,0) const videoFullRangeFlag = parts[8] ? Number(parts[8]) : Number(decoderConfig.colorSpace?.fullRange ?? 0); const thirdByte = (bitDepth << 4) + (chromaSubsampling << 1) + videoFullRangeFlag; const colourPrimaries = parts[5] ? Number(parts[5]) : decoderConfig.colorSpace?.primaries ? COLOR_PRIMARIES_MAP[decoderConfig.colorSpace.primaries] : 2; // Default to undetermined const transferCharacteristics = parts[6] ? Number(parts[6]) : decoderConfig.colorSpace?.transfer ? TRANSFER_CHARACTERISTICS_MAP[decoderConfig.colorSpace.transfer] : 2; const matrixCoefficients = parts[7] ? Number(parts[7]) : decoderConfig.colorSpace?.matrix ? MATRIX_COEFFICIENTS_MAP[decoderConfig.colorSpace.matrix] : 2; return fullBox('vpcC', 1, 0, [ u8(profile), // Profile u8(level), // Level u8(thirdByte), // Bit depth, chroma subsampling, full range u8(colourPrimaries), // Colour primaries u8(transferCharacteristics), // Transfer characteristics u8(matrixCoefficients), // Matrix coefficients u16(0), // Codec initialization data size ]); }; /** AV1 Configuration Box: Provides additional information to the decoder. */ export const av1C = (trackData) => { return box('av1C', generateAv1CodecConfigurationFromCodecString(trackData.info.decoderConfig.codec)); }; /** Sound Sample Description Box: Contains information that defines how to interpret sound media data. */ export const soundSampleDescription = (compressionType, trackData) => { let version = 0; let contents; let sampleSizeInBits = 16; if (PCM_AUDIO_CODECS.includes(trackData.track.source._codec)) { const codec = trackData.track.source._codec; const { sampleSize } = parsePcmCodec(codec); sampleSizeInBits = 8 * sampleSize; if (sampleSizeInBits > 16) { version = 1; } } if (version === 0) { contents = [ Array(6).fill(0), // Reserved u16(1), // Data reference index u16(version), // Version u16(0), // Revision level u32(0), // Vendor u16(trackData.info.numberOfChannels), // Number of channels u16(sampleSizeInBits), // Sample size (bits) u16(0), // Compression ID u16(0), // Packet size u16(trackData.info.sampleRate < 2 ** 16 ? trackData.info.sampleRate : 0), // Sample rate (upper) u16(0), // Sample rate (lower) ]; } else { contents = [ Array(6).fill(0), // Reserved u16(1), // Data reference index u16(version), // Version u16(0), // Revision level u32(0), // Vendor u16(trackData.info.numberOfChannels), // Number of channels u16(Math.min(sampleSizeInBits, 16)), // Sample size (bits) u16(0), // Compression ID u16(0), // Packet size u16(trackData.info.sampleRate < 2 ** 16 ? trackData.info.sampleRate : 0), // Sample rate (upper) u16(0), // Sample rate (lower) u32(1), // Samples per packet (must be 1 for uncompressed formats) u32(sampleSizeInBits / 8), // Bytes per packet u32(trackData.info.numberOfChannels * sampleSizeInBits / 8), // Bytes per frame u32(2), // Bytes per sample (constant in FFmpeg) ]; } return box(compressionType, contents, [ audioCodecToConfigurationBox(trackData.track.source._codec, trackData.muxer.isQuickTime)?.(trackData) ?? null, ]); }; /** MPEG-4 Elementary Stream Descriptor Box. */ export const esds = (trackData) => { // We build up the bytes in a layered way which reflects the nested structure let objectTypeIndication; switch (trackData.track.source._codec) { case 'aac': { objectTypeIndication = 0x40; } ; break; case 'mp3': { objectTypeIndication = 0x6b; } ; break; case 'vorbis': { objectTypeIndication = 0xdd; } ; break; default: throw new Error(`Unhandled audio codec: ${trackData.track.source._codec}`); } let bytes = [ ...u8(objectTypeIndication), // Object type indication ...u8(0x15), // stream type(6bits)=5 audio, flags(2bits)=1 ...u24(0), // 24bit buffer size ...u32(0), // max bitrate ...u32(0), // avg bitrate ]; if (trackData.info.decoderConfig.description) { const description = toUint8Array(trackData.info.decoderConfig.description); // Add the decoder description to the end bytes = [ ...bytes, ...u8(0x05), // TAG(5) = DecoderSpecificInfo ...variableUnsignedInt(description.byteLength), ...description, ]; } bytes = [ ...u16(1), // ES_ID = 1 ...u8(0x00), // flags etc = 0 ...u8(0x04), // TAG(4) = ES Descriptor ...variableUnsignedInt(bytes.length), ...bytes, ...u8(0x06), // TAG(6) ...u8(0x01), // length ...u8(0x02), // data ]; bytes = [ ...u8(0x03), // TAG(3) = Object Descriptor ...variableUnsignedInt(bytes.length), ...bytes, ]; return fullBox('esds', 0, 0, bytes); }; export const wave = (trackData) => { return box('wave', undefined, [ frma(trackData), enda(trackData), box('\x00\x00\x00\x00'), // NULL tag at the end ]); }; export const frma = (trackData) => { return box('frma', [ ascii(audioCodecToBoxName(trackData.track.source._codec, trackData.muxer.isQuickTime)), ]); }; // This box specifies PCM endianness export const enda = (trackData) => { const { littleEndian } = parsePcmCodec(trackData.track.source._codec); return box('enda', [ u16(+littleEndian), ]); }; /** Opus Specific Box. */ export const dOps = (trackData) => { let outputChannelCount = trackData.info.numberOfChannels; // Default PreSkip, should be at least 80 milliseconds worth of playback, measured in 48000 Hz samples let preSkip = 3840; let inputSampleRate = trackData.info.sampleRate; let outputGain = 0; let channelMappingFamily = 0; let channelMappingTable = new Uint8Array(0); // Read preskip and from codec private data from the encoder // https://www.rfc-editor.org/rfc/rfc7845#section-5 const description = trackData.info.decoderConfig?.description; if (description) { assert(description.byteLength >= 18); const bytes = toUint8Array(description); const header = parseOpusIdentificationHeader(bytes); outputChannelCount = header.outputChannelCount; preSkip = header.preSkip; inputSampleRate = header.inputSampleRate; outputGain = header.outputGain; channelMappingFamily = header.channelMappingFamily; if (header.channelMappingTable) { channelMappingTable = header.channelMappingTable; } } // https://www.opus-codec.org/docs/opus_in_isobmff.html return box('dOps', [ u8(0), // Version u8(outputChannelCount), // OutputChannelCount u16(preSkip), // PreSkip u32(inputSampleRate), // InputSampleRate i16(outputGain), // OutputGain u8(channelMappingFamily), // ChannelMappingFamily ...channelMappingTable, ]); }; /** FLAC specific box. */ export const dfLa = (trackData) => { const description = trackData.info.decoderConfig?.description; assert(description); const bytes = toUint8Array(description); return fullBox('dfLa', 0, 0, [ ...bytes.subarray(4), ]); }; /** PCM Configuration Box, ISO/IEC 23003-5. */ const pcmC = (trackData) => { const { littleEndian, sampleSize } = parsePcmCodec(trackData.track.source._codec); const formatFlags = +littleEndian; return fullBox('pcmC', 0, 0, [ u8(formatFlags), u8(8 * sampleSize), ]); }; export const subtitleSampleDescription = (compressionType, trackData) => box(compressionType, [ Array(6).fill(0), // Reserved u16(1), // Data reference index ], [ SUBTITLE_CODEC_TO_CONFIGURATION_BOX[trackData.track.source._codec](trackData), ]); export const vttC = (trackData) => box('vttC', [ ...textEncoder.encode(trackData.info.config.description), ]); export const txtC = (textConfig) => fullBox('txtC', 0, 0, [ ...textConfig, 0, // Text config (null-terminated) ]); /** * Time-To-Sample Box: Stores duration information for a media's samples, providing a mapping from a time in a media * to the corresponding data sample. The table is compact, meaning that consecutive samples with the same time delta * will be grouped. */ export const stts = (trackData) => { return fullBox('stts', 0, 0, [ u32(trackData.timeToSampleTable.length), // Number of entries trackData.timeToSampleTable.map(x => [ u32(x.sampleCount), // Sample count u32(x.sampleDelta), // Sample duration ]), ]); }; /** Sync Sample Box: Identifies the key frames in the media, marking the random access points within a stream. */ export const stss = (trackData) => { if (trackData.samples.every(x => x.type === 'key')) return null; // No stss box -> every frame is a key frame const keySamples = [...trackData.samples.entries()].filter(([, sample]) => sample.type === 'key'); return fullBox('stss', 0, 0, [ u32(keySamples.length), // Number of entries keySamples.map(([index]) => u32(index + 1)), // Sync sample table ]); }; /** * Sample-To-Chunk Box: As samples are added to a media, they are collected into chunks that allow optimized data * access. A chunk contains one or more samples. Chunks in a media may have different sizes, and the samples within a * chunk may have different sizes. The Sample-To-Chunk Box stores chunk information for the samples in a media, stored * in a compactly-coded fashion. */ export const stsc = (trackData) => { return fullBox('stsc', 0, 0, [ u32(trackData.compactlyCodedChunkTable.length), // Number of entries trackData.compactlyCodedChunkTable.map(x => [ u32(x.firstChunk), // First chunk u32(x.samplesPerChunk), // Samples per chunk u32(1), // Sample description index ]), ]); }; /** Sample Size Box: Specifies the byte size of each sample in the media. */ export const stsz = (trackData) => { if (trackData.type === 'audio' && trackData.info.requiresPcmTransformation) { const { sampleSize } = parsePcmCodec(trackData.track.source._codec); // With PCM, every sample has the same size return fullBox('stsz', 0, 0, [ u32(sampleSize * trackData.info.numberOfChannels), // Sample size u32(trackData.samples.reduce((acc, x) => acc + intoTimescale(x.duration, trackData.timescale), 0)), ]); } return fullBox('stsz', 0, 0, [ u32(0), // Sample size (0 means non-constant size) u32(trackData.samples.length), // Number of entries trackData.samples.map(x => u32(x.size)), // Sample size table ]); }; /** Chunk Offset Box: Identifies the location of each chunk of data in the media's data stream, relative to the file. */ export const stco = (trackData) => { if (trackData.finalizedChunks.length > 0 && last(trackData.finalizedChunks).offset >= 2 ** 32) { // If the file is large, use the co64 box return fullBox('co64', 0, 0, [ u32(trackData.finalizedChunks.length), // Number of entries trackData.finalizedChunks.map(x => u64(x.offset)), // Chunk offset table ]); } return fullBox('stco', 0, 0, [ u32(trackData.finalizedChunks.length), // Number of entries trackData.finalizedChunks.map(x => u32(x.offset)), // Chunk offset table ]); }; /** * Composition Time to Sample Box: Stores composition time offset information (PTS-DTS) for a * media's samples. The table is compact, meaning that consecutive samples with the same time * composition time offset will be grouped. */ export const ctts = (trackData) => { return fullBox('ctts', 1, 0, [ u32(trackData.compositionTimeOffsetTable.length), // Number of entries trackData.compositionTimeOffsetTable.map(x => [ u32(x.sampleCount), // Sample count i32(x.sampleCompositionTimeOffset), // Sample offset ]), ]); }; /** * Composition to Decode Box: Stores information about the composition and display times of the media samples. */ export const cslg = (trackData) => { let leastDecodeToDisplayDelta = Infinity; let greatestDecodeToDisplayDelta = -Infinity; let compositionStartTime = Infinity; let compositionEndTime = -Infinity; assert(trackData.compositionTimeOffsetTable.length > 0); assert(trackData.samples.length > 0); for (let i = 0; i < trackData.compositionTimeOffsetTable.length; i++) { const entry = trackData.compositionTimeOffsetTable[i]; leastDecodeToDisplayDelta = Math.min(leastDecodeToDisplayDelta, entry.sampleCompositionTimeOffset); greatestDecodeToDisplayDelta = Math.max(greatestDecodeToDisplayDelta, entry.sampleCompositionTimeOffset); } for (let i = 0; i < trackData.samples.length; i++) { const sample = trackData.samples[i]; compositionStartTime = Math.min(compositionStartTime, intoTimescale(sample.timestamp, trackData.timescale)); compositionEndTime = Math.max(compositionEndTime, intoTimescale(sample.timestamp + sample.duration, trackData.timescale)); } const compositionToDtsShift = Math.max(-leastDecodeToDisplayDelta, 0); if (compositionEndTime >= 2 ** 31) { // For very large files, the composition end time can't be represented in i32, so let's just scrap the box in // that case. QuickTime fails to read the file if there's a cslg box with version 1, so that's sadly not an // option. return null; } return fullBox('cslg', 0, 0, [ i32(compositionToDtsShift), // Composition to DTS shift i32(leastDecodeToDisplayDelta), // Least decode to display delta i32(greatestDecodeToDisplayDelta), // Greatest decode to display delta i32(compositionStartTime), // Composition start time i32(compositionEndTime), // Composition end time ]); }; /** * Movie Extends Box: This box signals to readers that the file is fragmented. Contains a single Track Extends Box * for each track in the movie. */ export const mvex = (trackDatas) => { return box('mvex', undefined, trackDatas.map(trex)); }; /** Track Extends Box: Contains the default values used by the movie fragments. */ export const trex = (trackData) => { return fullBox('trex', 0, 0, [ u32(trackData.track.id), // Track ID u32(1), // Default sample description index u32(0), // Default sample duration u32(0), // Default sample size u32(0), // Default sample flags ]); }; /** * Movie Fragment Box: The movie fragments extend the presentation in time. They provide the information that would * previously have been in the Movie Box. */ export const moof = (sequenceNumber, trackDatas) => { return box('moof', undefined, [ mfhd(sequenceNumber), ...trackDatas.map(traf), ]); }; /** Movie Fragment Header Box: Contains a sequence number as a safety check. */ export const mfhd = (sequenceNumber) => { return fullBox('mfhd', 0, 0, [ u32(sequenceNumber), // Sequence number ]); }; const fragmentSampleFlags = (sample) => { let byte1 = 0; let byte2 = 0; const byte3 = 0; const byte4 = 0; const sampleIsDifferenceSample = sample.type === 'delta'; byte2 |= +sampleIsDifferenceSample; if (sampleIsDifferenceSample) { byte1 |= 1; // There is redundant coding in this sample } else { byte1 |= 2; // There is no redundant coding in this sample } // Note that there are a lot of other flags to potentially set here, but most are irrelevant / non-necessary return byte1 << 24 | byte2 << 16 | byte3 << 8 | byte4; }; /** Track Fragment Box */ export const traf = (trackData) => { return box('traf', undefined, [ tfhd(trackData), tfdt(trackData), trun(trackData), ]); }; /** Track Fragment Header Box: Provides a reference to the extended track, and flags. */ export const tfhd = (trackData) => { assert(trackData.currentChunk); let tfFlags = 0; tfFlags |= 0x00008; // Default sample duration present tfFlags |= 0x00010; // Default sample size present tfFlags |= 0x00020; // Default sample flags present tfFlags |= 0x20000; // Default base is moof // Prefer the second sample over the first one, as the first one is a sync sample and therefore the "odd one out" const referenceSample = trackData.currentChunk.samples[1] ?? trackData.currentChunk.samples[0]; const referenceSampleInfo = { duration: referenceSample.timescaleUnitsToNextSample, size: referenceSample.size, flags: fragmentSampleFlags(referenceSample), }; return fullBox('tfhd', 0, tfFlags, [ u32(trackData.track.id), // Track ID u32(referenceSampleInfo.duration), // Default sample duration u32(referenceSampleInfo.size), // Default sample size u32(referenceSampleInfo.flags), // Default sample flags ]); }; /** * Track Fragment Decode Time Box: Provides the absolute decode time of the first sample of the fragment. This is * useful for performing random access on the media file. */ export const tfdt = (trackData) => { assert(trackData.currentChunk); return fullBox('tfdt', 1, 0, [ u64(intoTimescale(trackData.currentChunk.startTimestamp, trackData.timescale)), // Base Media Decode Time ]); }; /** Track Run Box: Specifies a run of contiguous samples for a given track. */ export const trun = (trackData) => { assert(trackData.currentChunk); const allSampleDurations = trackData.currentChunk.samples.map(x => x.timescaleUnitsToNextSample); const allSampleSizes = trackData.currentChunk.samples.map(x => x.size); const allSampleFlags = trackData.currentChunk.samples.map(fragmentSampleFlags); const allSampleCompositionTimeOffsets = trackData.currentChunk.samples .map(x => intoTimescale(x.timestamp - x.decodeTimestamp, trackData.timescale)); const uniqueSampleDurations = new Set(allSampleDurations); const uniqueSampleSizes = new Set(allSampleSizes); const uniqueSampleFlags = new Set(allSampleFlags); const uniqueSampleCompositionTimeOffsets = new Set(allSampleCompositionTimeOffsets); const firstSampleFlagsPresent = uniqueSampleFlags.size === 2 && allSampleFlags[0] !== allSampleFlags[1]; const sampleDurationPresent = uniqueSampleDurations.size > 1; const sampleSizePresent = uniqueSampleSizes.size > 1; const sampleFlagsPresent = !firstSampleFlagsPresent && uniqueSampleFlags.size > 1; const sampleCompositionTimeOffsetsPresent = uniqueSampleCompositionTimeOffsets.size > 1 || [...uniqueSampleCompositionTimeOffsets].some(x => x !== 0); let flags = 0; flags |= 0x0001; // Data offset present flags |= 0x0004 * +firstSampleFlagsPresent; // First sample flags present flags |= 0x0100 * +sampleDurationPresent; // Sample duration present flags |= 0x0200 * +sampleSizePresent; // Sample size present flags |= 0x0400 * +sampleFlagsPresent; // Sample flags present flags |= 0x0800 * +sampleCompositionTimeOffsetsPresent; // Sample composition time offsets present return fullBox('trun', 1, flags, [ u32(trackData.currentChunk.samples.length), // Sample count u32(trackData.currentChunk.offset - trackData.currentChunk.moofOffset || 0), // Data offset firstSampleFlagsPresent ? u32(allSampleFlags[0]) : [], trackData.currentChunk.samples.map((_, i) => [ sampleDurationPresent ? u32(allSampleDurations[i]) : [], // Sample duration sampleSizePresent ? u32(allSampleSizes[i]) : [], // Sample size sampleFlagsPresent ? u32(allSampleFlags[i]) : [], // Sample flags // Sample composition time offsets sampleCompositionTimeOffsetsPresent ? i32(allSampleCompositionTimeOffsets[i]) : [], ]), ]); }; /** * Movie Fragment Random Access Box: For each track, provides pointers to sync samples within the file * for random access. */ export const mfra = (trackDatas) => { return box('mfra', undefined, [ ...trackDatas.map(tfra), mfro(), ]); }; /** Track Fragment Random Access Box: Provides pointers to sync samples within the file for random access. */ export const tfra = (trackData, trackIndex) => { const version = 1; // Using this version allows us to use 64-bit time and offset values return fullBox('tfra', version, 0, [ u32(trackData.track.id), // Track ID u32(0b111111), // This specifies that traf number, trun number and sample number are 32-bit ints u32(trackData.finalizedChunks.length), // Number of entries trackData.finalizedChunks.map(chunk => [ u64(intoTimescale(chunk.samples[0].timestamp, trackData.timescale)), // Time (in presentation time) u64(chunk.moofOffset), // moof offset u32(trackIndex + 1), // traf number u32(1), // trun number u32(1), // Sample number ]), ]); }; /** * Movie Fragment Random Access Offset Box: Provides the size of the enclosing mfra box. This box can be used by readers * to quickly locate the mfra box by searching from the end of the file. */ export const mfro = () => { return fullBox('mfro', 0, 0, [ // This value needs to be overwritten manually from the outside, where the actual size of the enclosing mfra box // is known u32(0), // Size ]); }; /** VTT Empty Cue Box */ export const vtte = () => box('vtte'); /** VTT Cue Box */ export const vttc = (payload, timestamp, identifier, settings, sourceId) => box('vttc', undefined, [ sourceId !== null ? box('vsid', [i32(sourceId)]) : null, identifier !== null ? box('iden', [...textEncoder.encode(identifier)]) : null, timestamp !== null ? box('ctim', [...textEncoder.encode(formatSubtitleTimestamp(timestamp))]) : null, settings !== null ? box('sttg', [...textEncoder.encode(settings)]) : null, box('payl', [...textEncoder.encode(payload)]), ]); /** VTT Additional Text Box */ export const vtta = (notes) => box('vtta', [...textEncoder.encode(notes)]); const VIDEO_CODEC_TO_BOX_NAME = { avc: 'avc1', hevc: 'hvc1', vp8: 'vp08', vp9: 'vp09', av1: 'av01', }; const VIDEO_CODEC_TO_CONFIGURATION_BOX = { avc: avcC, hevc: hvcC, vp8: vpcC, vp9: vpcC, av1: av1C, }; const audioCodecToBoxName = (codec, isQuickTime) => { switch (codec) { case 'aac': return 'mp4a'; case 'mp3': return 'mp4a'; case 'opus': return 'Opus'; case 'vorbis': return 'mp4a'; case 'flac': return 'fLaC'; case 'ulaw': return 'ulaw'; case 'alaw': return 'alaw'; case 'pcm-u8': return 'raw '; case 'pcm-s8': return 'sowt'; } // Logic diverges here if (isQuickTime) { switch (codec) { case 'pcm-s16': return 'sowt'; case 'pcm-s16be': return 'twos'; case 'pcm-s24': return 'in24'; case 'pcm-s24be': return 'in24'; case 'pcm-s32': return 'in32'; case 'pcm-s32be': return 'in32'; case 'pcm-f32': return 'fl32'; case 'pcm-f32be': return 'fl32'; case 'pcm-f64': return 'fl64'; case 'pcm-f64be': return 'fl64'; } } else { switch (codec) { case 'pcm-s16': return 'ipcm'; case 'pcm-s16be': return 'ipcm'; case 'pcm-s24': return 'ipcm'; case 'pcm-s24be': return 'ipcm'; case 'pcm-s32': return 'ipcm'; case 'pcm-s32be': return 'ipcm'; case 'pcm-f32': return 'fpcm'; case 'pcm-f32be': return 'fpcm'; case 'pcm-f64': return 'fpcm'; case 'pcm-f64be': return 'fpcm'; } } }; const audioCodecToConfigurationBox = (codec, isQuickTime) => { switch (codec) { case 'aac': return esds; case 'mp3': return esds; case 'opus': return dOps; case 'vorbis': return esds; case 'flac': return dfLa; } // Logic diverges here if (isQuickTime) { switch (codec) { case 'pcm-s24': return wave; case 'pcm-s24be': return wave; case 'pcm-s32': return wave; case 'pcm-s32be': return wave; case 'pcm-f32': return wave; case 'pcm-f32be': return wave; case 'pcm-f64': return wave; case 'pcm-f64be': return wave; } } else { switch (codec) { case 'pcm-s16': return pcmC; case 'pcm-s16be': return pcmC; case 'pcm-s24': return pcmC; case 'pcm-s24be': return pcmC; case 'pcm-s32': return pcmC; case 'pcm-s32be': return pcmC; case 'pcm-f32': return pcmC; case 'pcm-f32be': return pcmC; case 'pcm-f64': return pcmC; case 'pcm-f64be': return pcmC; } } return null; }; const SUBTITLE_CODEC_TO_BOX_NAME = { webvtt: 'wvtt', }; const SUBTITLE_CODEC_TO_CONFIGURATION_BOX = { webvtt: vttC, };