UNPKG

mediabunny

Version:

Pure TypeScript media toolkit for reading, writing, and converting media files, directly in the browser.

1,557 lines (1,310 loc) 46.1 kB
/*!
 * Copyright (c) 2025-present, Vanilagy and contributors
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/.
 */

import { VP9_LEVEL_TABLE } from './codec';
import { InputVideoTrack } from './input-track';
import {
	assert,
	assertNever,
	Bitstream,
	last,
	readExpGolomb,
	readSignedExpGolomb,
	toDataView,
	toUint8Array,
} from './misc';
import { EncodedPacket, PacketType } from './packet';

// References for AVC/HEVC code:
// ISO 14496-15
// Rec. ITU-T H.264
// Rec. ITU-T H.265
// https://stackoverflow.com/questions/24884827

/**
 * Finds all NAL units in an AVC packet in Annex B format.
 *
 * Scans for 3-byte (0x000001) and 4-byte (0x00000001) start codes and returns the byte ranges between them as
 * subarray views into `packetData` (nothing is copied). Bytes located before the first start code are ignored.
 * If the data contains no start code at all, it is not Annex B data and an empty array is returned.
 */
const findNalUnitsInAnnexB = (packetData: Uint8Array) => {
	const nalUnits: Uint8Array[] = [];
	// Without this flag, data lacking any start code would be returned wholesale as a single bogus "NAL unit"
	let foundStartCode = false;
	let i = 0;

	while (i < packetData.length) {
		let startCodePos = -1;
		let startCodeLength = 0;

		for (let j = i; j < packetData.length - 3; j++) {
			// Check for 3-byte start code (0x000001)
			if (packetData[j] === 0 && packetData[j + 1] === 0 && packetData[j + 2] === 1) {
				startCodePos = j;
				startCodeLength = 3;
				break;
			}

			// Check for 4-byte start code (0x00000001)
			if (
				j < packetData.length - 4
				&& packetData[j] === 0
				&& packetData[j + 1] === 0
				&& packetData[j + 2] === 0
				&& packetData[j + 3] === 1
			) {
				startCodePos = j;
				startCodeLength = 4;
				break;
			}
		}

		if (startCodePos === -1) {
			break; // No more start codes found
		}

		// If this isn't the first start code, the bytes since the previous one form a NAL unit
		if (foundStartCode && startCodePos > i) {
			nalUnits.push(packetData.subarray(i, startCodePos));
		}

		foundStartCode = true;
		i = startCodePos + startCodeLength;
	}

	// Everything after the last start code is the final NAL unit
	if (foundStartCode && i < packetData.length) {
		nalUnits.push(packetData.subarray(i));
	}

	return nalUnits;
};

/**
 * Finds all NAL units in an AVC packet in length-prefixed format.
*/
const findNalUnitsInLengthPrefixed = (packetData: Uint8Array, lengthSize: 1 | 2 | 3 | 4) => {
	const view = new DataView(packetData.buffer, packetData.byteOffset, packetData.byteLength);
	const units: Uint8Array[] = [];
	let cursor = 0;

	// Keep going for as long as another complete length field fits into the remaining data
	while (cursor + lengthSize <= packetData.length) {
		let payloadLength: number;

		// All length fields are big-endian
		switch (lengthSize) {
			case 1:
				payloadLength = view.getUint8(cursor);
				break;
			case 2:
				payloadLength = view.getUint16(cursor, false);
				break;
			case 3:
				// There is no 24-bit getter, so combine a 16-bit read with the trailing byte
				payloadLength = (view.getUint16(cursor, false) << 8) + view.getUint8(cursor + 2);
				break;
			case 4:
				payloadLength = view.getUint32(cursor, false);
				break;
			default:
				assertNever(lengthSize);
				assert(false);
		}

		const payloadStart = cursor + lengthSize;
		const payloadEnd = payloadStart + payloadLength;

		// subarray() clamps to the end of the packet, so an overlong length yields a shortened view
		units.push(packetData.subarray(payloadStart, payloadEnd));
		cursor = payloadEnd;
	}

	return units;
};

/**
 * Returns a copy of `data` in which the third byte of every 0x000003 sequence (the emulation prevention byte) has
 * been dropped.
 */
const removeEmulationPreventionBytes = (data: Uint8Array) => {
	const size = data.length;
	// The output can never be longer than the input, so the input size is a safe upper bound
	const scratch = new Uint8Array(size);
	let readPos = 0;
	let writePos = 0;

	while (readPos < size) {
		const atEscapeSequence = readPos + 2 < size
			&& data[readPos] === 0x00
			&& data[readPos + 1] === 0x00
			&& data[readPos + 2] === 0x03;

		if (atEscapeSequence) {
			// Keep the two zero bytes (scratch is zero-initialized) and drop the 0x03 that follows them
			writePos += 2;
			readPos += 3;
		} else {
			scratch[writePos++] = data[readPos++]!;
		}
	}

	// Copy out exactly the bytes that were written
	return scratch.slice(0, writePos);
};

/**
 * Converts an AVC packet in Annex B format to length-prefixed format.
*/
export const transformAnnexBToLengthPrefixed = (packetData: Uint8Array) => {
	const NAL_UNIT_LENGTH_SIZE = 4;

	const nalUnits = findNalUnitsInAnnexB(packetData);
	if (nalUnits.length === 0) {
		// Without any NAL units, the input can't have been valid Annex B data
		return null;
	}

	// Every NAL unit is emitted as a 4-byte length followed by its payload
	const totalSize = nalUnits.reduce(
		(sum, unit) => sum + NAL_UNIT_LENGTH_SIZE + unit.byteLength,
		0,
	);

	const avccData = new Uint8Array(totalSize);
	const view = new DataView(avccData.buffer);

	let writePos = 0;
	for (const unit of nalUnits) {
		view.setUint32(writePos, unit.byteLength, false); // Big-endian length prefix
		avccData.set(unit, writePos + 4);
		writePos += 4 + unit.byteLength;
	}

	return avccData;
};

// Data specified in ISO 14496-15
export type AvcDecoderConfigurationRecord = {
	configurationVersion: number;
	avcProfileIndication: number;
	profileCompatibility: number;
	avcLevelIndication: number;
	lengthSizeMinusOne: number;
	sequenceParameterSets: Uint8Array[];
	pictureParameterSets: Uint8Array[];

	// Fields only for specific profiles:
	chromaFormat: number | null;
	bitDepthLumaMinus8: number | null;
	bitDepthChromaMinus8: number | null;
	sequenceParameterSetExt: Uint8Array[] | null;
};

/** Reads the NAL unit type from the low 5 bits of the first byte of an AVC NAL unit. */
const extractNalUnitTypeForAvc = (data: Uint8Array) => data[0]! & 0x1F;

/**
 * Builds an AvcDecoderConfigurationRecord from an AVC packet in Annex B format.
*/
export const extractAvcDecoderConfigurationRecord = (packetData: Uint8Array) => {
	try {
		const nalUnits = findNalUnitsInAnnexB(packetData);
		const unitsOfType = (type: number) => nalUnits.filter(unit => extractNalUnitTypeForAvc(unit) === type);

		const spsUnits = unitsOfType(7);
		const ppsUnits = unitsOfType(8);
		const spsExtUnits = unitsOfType(13);

		// Both an SPS and a PPS are required to build the record
		if (spsUnits.length === 0 || ppsUnits.length === 0) {
			return null;
		}

		// Profile and level information is taken from the first SPS
		const bitstream = new Bitstream(removeEmulationPreventionBytes(spsUnits[0]!));

		bitstream.skipBits(1); // forbidden_zero_bit
		bitstream.skipBits(2); // nal_ref_idc
		const nalUnitType = bitstream.readBits(5);
		if (nalUnitType !== 7) { // 7 is the SPS NAL unit type
			console.error('Invalid SPS NAL unit type');
			return null;
		}

		const profileIdc = bitstream.readAlignedByte();
		const constraintFlags = bitstream.readAlignedByte();
		const levelIdc = bitstream.readAlignedByte();

		const record: AvcDecoderConfigurationRecord = {
			configurationVersion: 1,
			avcProfileIndication: profileIdc,
			profileCompatibility: constraintFlags,
			avcLevelIndication: levelIdc,
			lengthSizeMinusOne: 3, // Typically 4 bytes for length field
			sequenceParameterSets: spsUnits,
			pictureParameterSets: ppsUnits,
			chromaFormat: null,
			bitDepthLumaMinus8: null,
			bitDepthChromaMinus8: null,
			sequenceParameterSetExt: null,
		};

		// Only these profiles carry the chroma format / bit depth fields in the record
		if ([100, 110, 122, 144].includes(profileIdc)) {
			readExpGolomb(bitstream); // seq_parameter_set_id

			const chromaFormatIdc = readExpGolomb(bitstream);
			if (chromaFormatIdc === 3) {
				bitstream.skipBits(1); // separate_colour_plane_flag
			}

			const bitDepthLumaMinus8 = readExpGolomb(bitstream);
			const bitDepthChromaMinus8 = readExpGolomb(bitstream);

			record.chromaFormat = chromaFormatIdc;
			record.bitDepthLumaMinus8 = bitDepthLumaMinus8;
			record.bitDepthChromaMinus8 = bitDepthChromaMinus8;
			record.sequenceParameterSetExt = spsExtUnits;
		}

		return record;
	} catch (error) {
		console.error('Error building AVC Decoder Configuration Record:', error);
		return null;
	}
};

/** Serializes an AvcDecoderConfigurationRecord into the format specified in Section 5.3.3.1 of ISO 14496-15. */
export const serializeAvcDecoderConfigurationRecord = (record: AvcDecoderConfigurationRecord) => {
	const bytes: number[] = [];

	// Appends each parameter set as a 16-bit big-endian length followed by its payload
	const appendParameterSets = (sets: Uint8Array[]) => {
		for (const set of sets) {
			const size = set.byteLength;
			bytes.push(size >> 8); // High byte
			bytes.push(size & 0xFF); // Low byte

			for (let k = 0; k < size; k++) {
				bytes.push(set[k]!);
			}
		}
	};

	// Header
	bytes.push(
		record.configurationVersion,
		record.avcProfileIndication,
		record.profileCompatibility,
		record.avcLevelIndication,
		0xFC | (record.lengthSizeMinusOne & 0x03), // Reserved bits (6) + lengthSizeMinusOne (2)
		0xE0 | (record.sequenceParameterSets.length & 0x1F), // Reserved bits (3) + numOfSequenceParameterSets (5)
	);
	appendParameterSets(record.sequenceParameterSets);

	bytes.push(record.pictureParameterSets.length);
	appendParameterSets(record.pictureParameterSets);

	// The trailing fields exist only for the profiles listed here
	if ([100, 110, 122, 144].includes(record.avcProfileIndication)) {
		assert(record.chromaFormat !== null);
		assert(record.bitDepthLumaMinus8 !== null);
		assert(record.bitDepthChromaMinus8 !== null);
		assert(record.sequenceParameterSetExt !== null);

		bytes.push(
			0xFC | (record.chromaFormat & 0x03), // Reserved bits + chroma_format
			0xF8 | (record.bitDepthLumaMinus8 & 0x07), // Reserved bits + bit_depth_luma_minus8
			0xF8 | (record.bitDepthChromaMinus8 & 0x07), // Reserved bits + bit_depth_chroma_minus8
			record.sequenceParameterSetExt.length,
		);
		appendParameterSets(record.sequenceParameterSetExt);
	}

	return new Uint8Array(bytes);
};

// HEVC NAL unit type values used in this file
const NALU_TYPE_VPS = 32;
const NALU_TYPE_SPS = 33;
const NALU_TYPE_PPS = 34;
const NALU_TYPE_SEI_PREFIX = 39;
const NALU_TYPE_SEI_SUFFIX = 40;

// Data specified in ISO 14496-15
export type HevcDecoderConfigurationRecord = {
	configurationVersion: number;
	generalProfileSpace: number;
	generalTierFlag: number;
	generalProfileIdc: number;
	generalProfileCompatibilityFlags: number;
	generalConstraintIndicatorFlags: Uint8Array; // 6 bytes long
	generalLevelIdc: number;
	minSpatialSegmentationIdc: number;
	parallelismType: number;
	chromaFormatIdc: number;
	bitDepthLumaMinus8: number;
	bitDepthChromaMinus8: number;
	avgFrameRate: number;
	constantFrameRate: number;
	numTemporalLayers: number;
	temporalIdNested: number;
	lengthSizeMinusOne: number;
	arrays: {
		arrayCompleteness: number;
		nalUnitType: number;
		nalUnits: Uint8Array[];
	}[];
};

/** Reads the NAL unit type from bits 1-6 of the first byte of an HEVC NAL unit. */
const extractNalUnitTypeForHevc = (data: Uint8Array) => (data[0]! >> 1) & 0x3F;

/**
 * Builds a HevcDecoderConfigurationRecord from an HEVC packet in Annex B format.
*/
export const extractHevcDecoderConfigurationRecord = (
	packetData: Uint8Array,
) => {
	// Returns null when the packet lacks an SPS or PPS, or when parsing throws.
	try {
		const nalUnits = findNalUnitsInAnnexB(packetData);
		const unitsOfType = (type: number) => nalUnits.filter(unit => extractNalUnitTypeForHevc(unit) === type);

		const vpsUnits = unitsOfType(NALU_TYPE_VPS);
		const spsUnits = unitsOfType(NALU_TYPE_SPS);
		const ppsUnits = unitsOfType(NALU_TYPE_PPS);
		// Prefix and suffix SEI are kept apart so that each output array contains a single NAL unit type
		const seiPrefixUnits = unitsOfType(NALU_TYPE_SEI_PREFIX);
		const seiSuffixUnits = unitsOfType(NALU_TYPE_SEI_SUFFIX);

		if (spsUnits.length === 0 || ppsUnits.length === 0) return null;

		const sps = spsUnits[0]!;
		const bitstream = new Bitstream(removeEmulationPreventionBytes(sps));

		bitstream.skipBits(16); // NAL header

		bitstream.readBits(4); // sps_video_parameter_set_id
		const sps_max_sub_layers_minus1 = bitstream.readBits(3);
		const sps_temporal_id_nesting_flag = bitstream.readBits(1);

		const {
			general_profile_space,
			general_tier_flag,
			general_profile_idc,
			general_profile_compatibility_flags,
			general_constraint_indicator_flags,
			general_level_idc,
		} = parseProfileTierLevel(bitstream, sps_max_sub_layers_minus1);

		readExpGolomb(bitstream); // sps_seq_parameter_set_id

		const chroma_format_idc = readExpGolomb(bitstream);
		if (chroma_format_idc === 3) bitstream.skipBits(1); // separate_colour_plane_flag

		readExpGolomb(bitstream); // pic_width_in_luma_samples
		readExpGolomb(bitstream); // pic_height_in_luma_samples

		if (bitstream.readBits(1)) { // conformance_window_flag
			readExpGolomb(bitstream); // conf_win_left_offset
			readExpGolomb(bitstream); // conf_win_right_offset
			readExpGolomb(bitstream); // conf_win_top_offset
			readExpGolomb(bitstream); // conf_win_bottom_offset
		}

		const bit_depth_luma_minus8 = readExpGolomb(bitstream);
		const bit_depth_chroma_minus8 = readExpGolomb(bitstream);

		// Needed further down: it determines the bit width of lt_ref_pic_poc_lsb_sps
		const log2_max_pic_order_cnt_lsb_minus4 = readExpGolomb(bitstream);

		const sps_sub_layer_ordering_info_present_flag = bitstream.readBits(1);
		const maxNum = sps_sub_layer_ordering_info_present_flag ? 0 : sps_max_sub_layers_minus1;
		for (let i = maxNum; i <= sps_max_sub_layers_minus1; i++) {
			readExpGolomb(bitstream); // sps_max_dec_pic_buffering_minus1[i]
			readExpGolomb(bitstream); // sps_max_num_reorder_pics[i]
			readExpGolomb(bitstream); // sps_max_latency_increase_plus1[i]
		}

		readExpGolomb(bitstream); // log2_min_luma_coding_block_size_minus3
		readExpGolomb(bitstream); // log2_diff_max_min_luma_coding_block_size
		readExpGolomb(bitstream); // log2_min_luma_transform_block_size_minus2
		readExpGolomb(bitstream); // log2_diff_max_min_luma_transform_block_size
		readExpGolomb(bitstream); // max_transform_hierarchy_depth_inter
		readExpGolomb(bitstream); // max_transform_hierarchy_depth_intra

		if (bitstream.readBits(1)) { // scaling_list_enabled_flag
			if (bitstream.readBits(1)) { // sps_scaling_list_data_present_flag
				skipScalingListData(bitstream);
			}
		}

		bitstream.skipBits(1); // amp_enabled_flag
		bitstream.skipBits(1); // sample_adaptive_offset_enabled_flag

		if (bitstream.readBits(1)) { // pcm_enabled_flag
			bitstream.skipBits(4); // pcm_sample_bit_depth_luma_minus1
			bitstream.skipBits(4); // pcm_sample_bit_depth_chroma_minus1
			readExpGolomb(bitstream); // log2_min_pcm_luma_coding_block_size_minus3
			readExpGolomb(bitstream); // log2_diff_max_min_pcm_luma_coding_block_size
			bitstream.skipBits(1); // pcm_loop_filter_disabled_flag
		}

		const num_short_term_ref_pic_sets = readExpGolomb(bitstream);
		skipAllStRefPicSets(bitstream, num_short_term_ref_pic_sets);

		if (bitstream.readBits(1)) { // long_term_ref_pics_present_flag
			const num_long_term_ref_pics_sps = readExpGolomb(bitstream);
			for (let i = 0; i < num_long_term_ref_pics_sps; i++) {
				// lt_ref_pic_poc_lsb_sps[i] is a fixed-width u(v) field of log2_max_pic_order_cnt_lsb_minus4 + 4
				// bits, not an Exp-Golomb code. Reading it as Exp-Golomb misaligns everything that follows.
				bitstream.skipBits(log2_max_pic_order_cnt_lsb_minus4 + 4); // lt_ref_pic_poc_lsb_sps[i]
				bitstream.skipBits(1); // used_by_curr_pic_lt_sps_flag[i]
			}
		}

		bitstream.skipBits(1); // sps_temporal_mvp_enabled_flag
		bitstream.skipBits(1); // strong_intra_smoothing_enabled_flag

		let min_spatial_segmentation_idc = 0;
		if (bitstream.readBits(1)) { // vui_parameters_present_flag
			min_spatial_segmentation_idc = parseVuiForMinSpatialSegmentationIdc(bitstream, sps_max_sub_layers_minus1);
		}

		// Parse the first PPS for parallelismType (a PPS is guaranteed to exist by the check above)
		let parallelismType = 0;
		{
			const pps = ppsUnits[0]!;
			const ppsBitstream = new Bitstream(removeEmulationPreventionBytes(pps));

			ppsBitstream.skipBits(16); // NAL header

			readExpGolomb(ppsBitstream); // pps_pic_parameter_set_id
			readExpGolomb(ppsBitstream); // pps_seq_parameter_set_id
			ppsBitstream.skipBits(1); // dependent_slice_segments_enabled_flag
			ppsBitstream.skipBits(1); // output_flag_present_flag
			ppsBitstream.skipBits(3); // num_extra_slice_header_bits
			ppsBitstream.skipBits(1); // sign_data_hiding_enabled_flag
			ppsBitstream.skipBits(1); // cabac_init_present_flag
			readExpGolomb(ppsBitstream); // num_ref_idx_l0_default_active_minus1
			readExpGolomb(ppsBitstream); // num_ref_idx_l1_default_active_minus1
			readSignedExpGolomb(ppsBitstream); // init_qp_minus26
			ppsBitstream.skipBits(1); // constrained_intra_pred_flag
			ppsBitstream.skipBits(1); // transform_skip_enabled_flag
			if (ppsBitstream.readBits(1)) { // cu_qp_delta_enabled_flag
				readExpGolomb(ppsBitstream); // diff_cu_qp_delta_depth
			}
			readSignedExpGolomb(ppsBitstream); // pps_cb_qp_offset
			readSignedExpGolomb(ppsBitstream); // pps_cr_qp_offset
			ppsBitstream.skipBits(1); // pps_slice_chroma_qp_offsets_present_flag
			ppsBitstream.skipBits(1); // weighted_pred_flag
			ppsBitstream.skipBits(1); // weighted_bipred_flag
			ppsBitstream.skipBits(1); // transquant_bypass_enabled_flag

			const tiles_enabled_flag = ppsBitstream.readBits(1);
			const entropy_coding_sync_enabled_flag = ppsBitstream.readBits(1);

			// 2 when only tiles are enabled, 3 when only entropy coding sync is enabled, 0 otherwise
			if (tiles_enabled_flag && !entropy_coding_sync_enabled_flag) parallelismType = 2;
			else if (!tiles_enabled_flag && entropy_coding_sync_enabled_flag) parallelismType = 3;
			else parallelismType = 0;
		}

		// One array per NAL unit type; types without any units are left out
		const arrays: HevcDecoderConfigurationRecord['arrays'] = [];
		const addArray = (nalUnitType: number, units: Uint8Array[]) => {
			if (units.length > 0) {
				arrays.push({ arrayCompleteness: 1, nalUnitType, nalUnits: units });
			}
		};
		addArray(NALU_TYPE_VPS, vpsUnits);
		addArray(NALU_TYPE_SPS, spsUnits);
		addArray(NALU_TYPE_PPS, ppsUnits);
		addArray(NALU_TYPE_SEI_PREFIX, seiPrefixUnits);
		addArray(NALU_TYPE_SEI_SUFFIX, seiSuffixUnits);

		const record: HevcDecoderConfigurationRecord = {
			configurationVersion: 1,
			generalProfileSpace: general_profile_space,
			generalTierFlag: general_tier_flag,
			generalProfileIdc: general_profile_idc,
			generalProfileCompatibilityFlags: general_profile_compatibility_flags,
			generalConstraintIndicatorFlags: general_constraint_indicator_flags,
			generalLevelIdc: general_level_idc,
			minSpatialSegmentationIdc: min_spatial_segmentation_idc,
			parallelismType,
			chromaFormatIdc: chroma_format_idc,
			bitDepthLumaMinus8: bit_depth_luma_minus8,
			bitDepthChromaMinus8: bit_depth_chroma_minus8,
			avgFrameRate: 0,
			constantFrameRate: 0,
			numTemporalLayers: sps_max_sub_layers_minus1 + 1,
			temporalIdNested: sps_temporal_id_nesting_flag,
			lengthSizeMinusOne: 3,
			arrays,
		};

		return record;
	} catch (error) {
		console.error('Error building HEVC Decoder Configuration Record:', error);
		return null;
	}
};

/**
 * Parses the profile_tier_level() structure and returns its general_* fields. Sub-layer information is consumed
 * from the bitstream but discarded.
 */
const parseProfileTierLevel = (
	bitstream: Bitstream,
	maxNumSubLayersMinus1: number,
) => {
	const general_profile_space = bitstream.readBits(2);
	const general_tier_flag = bitstream.readBits(1);
	const general_profile_idc = bitstream.readBits(5);

	// Assembled bit by bit, most significant bit first
	let general_profile_compatibility_flags = 0;
	for (let i = 0; i < 32; i++) {
		general_profile_compatibility_flags = (general_profile_compatibility_flags << 1) | bitstream.readBits(1);
	}

	const general_constraint_indicator_flags = new Uint8Array(6);
	for (let i = 0; i < 6; i++) {
		general_constraint_indicator_flags[i] = bitstream.readBits(8);
	}

	const general_level_idc = bitstream.readBits(8);

	const sub_layer_profile_present_flag: number[] = [];
	const sub_layer_level_present_flag: number[] = [];
	for (let i = 0; i < maxNumSubLayersMinus1; i++) {
		sub_layer_profile_present_flag.push(bitstream.readBits(1));
		sub_layer_level_present_flag.push(bitstream.readBits(1));
	}

	if (maxNumSubLayersMinus1 > 0) {
		for (let i = maxNumSubLayersMinus1; i < 8; i++) {
			bitstream.skipBits(2); // reserved_zero_2bits
		}
	}

	for (let i = 0; i < maxNumSubLayersMinus1; i++) {
		if (sub_layer_profile_present_flag[i]) bitstream.skipBits(88); // Sub-layer profile fields
		if (sub_layer_level_present_flag[i]) bitstream.skipBits(8); // sub_layer_level_idc
	}

	return {
		general_profile_space,
		general_tier_flag,
		general_profile_idc,
		general_profile_compatibility_flags,
		general_constraint_indicator_flags,
		general_level_idc,
	};
};

/** Consumes a scaling_list_data() structure from the bitstream without keeping any of its values. */
const skipScalingListData = (bitstream: Bitstream) => {
	for (let sizeId = 0; sizeId < 4; sizeId++) {
		for (let matrixId = 0; matrixId < (sizeId === 3 ? 2 : 6); matrixId++) {
			const scaling_list_pred_mode_flag = bitstream.readBits(1);
			if (!scaling_list_pred_mode_flag) {
				readExpGolomb(bitstream); // scaling_list_pred_matrix_id_delta
			} else {
				const coefNum = Math.min(64, 1 << (4 + (sizeId << 1)));
				if (sizeId > 1) {
					readSignedExpGolomb(bitstream); // scaling_list_dc_coef_minus8
				}
				for (let i = 0; i < coefNum; i++) {
					readSignedExpGolomb(bitstream); // scaling_list_delta_coef
				}
			}
		}
	}
};

/** Consumes all short-term reference picture sets of an SPS, tracking NumDeltaPocs for inter-set prediction. */
const skipAllStRefPicSets = (bitstream: Bitstream, num_short_term_ref_pic_sets: number) => {
	const NumDeltaPocs: number[] = [];
	for (let stRpsIdx = 0; stRpsIdx < num_short_term_ref_pic_sets; stRpsIdx++) {
		NumDeltaPocs[stRpsIdx] = skipStRefPicSet(bitstream, stRpsIdx, num_short_term_ref_pic_sets, NumDeltaPocs);
	}
};

/**
 * Consumes a single st_ref_pic_set() structure and returns the NumDeltaPocs value of that set, which later sets
 * need when they are predicted from it.
 */
const skipStRefPicSet = (
	bitstream: Bitstream,
	stRpsIdx: number,
	num_short_term_ref_pic_sets: number,
	NumDeltaPocs: number[],
) => {
	let NumDeltaPocsThis = 0;
	let inter_ref_pic_set_prediction_flag = 0;
	let RefRpsIdx = 0;

	if (stRpsIdx !== 0) {
		inter_ref_pic_set_prediction_flag = bitstream.readBits(1);
	}

	if (inter_ref_pic_set_prediction_flag) {
		if (stRpsIdx === num_short_term_ref_pic_sets) {
			const delta_idx_minus1 = readExpGolomb(bitstream);
			RefRpsIdx = stRpsIdx - (delta_idx_minus1 + 1);
		} else {
			RefRpsIdx = stRpsIdx - 1;
		}

		bitstream.readBits(1); // delta_rps_sign
		readExpGolomb(bitstream); // abs_delta_rps_minus1

		// The number of iterations is NumDeltaPocs[RefRpsIdx] + 1
		const numDelta = NumDeltaPocs[RefRpsIdx] ?? 0;
		for (let j = 0; j <= numDelta; j++) {
			const used_by_curr_pic_flag = bitstream.readBits(1);
			// use_delta_flag is only coded when used_by_curr_pic_flag is 0 and is inferred to be 1 otherwise
			const use_delta_flag = used_by_curr_pic_flag ? 1 : bitstream.readBits(1);

			// A predicted set keeps only the candidates whose use_delta_flag is set, so its size generally
			// differs from that of the reference set and must be counted here instead of copied.
			// NOTE(review): the exact derivation also drops candidates whose resulting delta POC is 0; that
			// would require tracking the delta POC values and is not done here.
			if (use_delta_flag) {
				NumDeltaPocsThis++;
			}
		}
	} else {
		const num_negative_pics = readExpGolomb(bitstream);
		const num_positive_pics = readExpGolomb(bitstream);

		for (let i = 0; i < num_negative_pics; i++) {
			readExpGolomb(bitstream); // delta_poc_s0_minus1[i]
			bitstream.readBits(1); // used_by_curr_pic_s0_flag[i]
		}

		for (let i = 0; i < num_positive_pics; i++) {
			readExpGolomb(bitstream); // delta_poc_s1_minus1[i]
			bitstream.readBits(1); // used_by_curr_pic_s1_flag[i]
		}

		NumDeltaPocsThis = num_negative_pics + num_positive_pics;
	}

	return NumDeltaPocsThis;
};

/**
 * Walks through vui_parameters() and returns min_spatial_segmentation_idc, or 0 when the bitstream restriction
 * block that carries it is absent.
 */
const parseVuiForMinSpatialSegmentationIdc = (bitstream: Bitstream, sps_max_sub_layers_minus1: number) => {
	if (bitstream.readBits(1)) { // aspect_ratio_info_present_flag
		const aspect_ratio_idc = bitstream.readBits(8);
		if (aspect_ratio_idc === 255) {
			bitstream.readBits(16); // sar_width
			bitstream.readBits(16); // sar_height
		}
	}

	if (bitstream.readBits(1)) { // overscan_info_present_flag
		bitstream.readBits(1); // overscan_appropriate_flag
	}

	if (bitstream.readBits(1)) { // video_signal_type_present_flag
		bitstream.readBits(3); // video_format
		bitstream.readBits(1); // video_full_range_flag
		if (bitstream.readBits(1)) { // colour_description_present_flag
			bitstream.readBits(8); // colour_primaries
			bitstream.readBits(8); // transfer_characteristics
			bitstream.readBits(8); // matrix_coeffs
		}
	}

	if (bitstream.readBits(1)) { // chroma_loc_info_present_flag
		readExpGolomb(bitstream); // chroma_sample_loc_type_top_field
		readExpGolomb(bitstream); // chroma_sample_loc_type_bottom_field
	}

	bitstream.readBits(1); // neutral_chroma_indication_flag
	bitstream.readBits(1); // field_seq_flag
	bitstream.readBits(1); // frame_field_info_present_flag

	if (bitstream.readBits(1)) { // default_display_window_flag
		readExpGolomb(bitstream); // def_disp_win_left_offset
		readExpGolomb(bitstream); // def_disp_win_right_offset
		readExpGolomb(bitstream); // def_disp_win_top_offset
		readExpGolomb(bitstream); // def_disp_win_bottom_offset
	}

	if (bitstream.readBits(1)) { // vui_timing_info_present_flag
		bitstream.readBits(32); // vui_num_units_in_tick
		bitstream.readBits(32); // vui_time_scale
		if (bitstream.readBits(1)) { // vui_poc_proportional_to_timing_flag
			readExpGolomb(bitstream); // vui_num_ticks_poc_diff_one_minus1
		}
		if (bitstream.readBits(1)) { // vui_hrd_parameters_present_flag
			skipHrdParameters(bitstream, true, sps_max_sub_layers_minus1);
		}
	}

	if (bitstream.readBits(1)) { // bitstream_restriction_flag
		bitstream.readBits(1); // tiles_fixed_structure_flag
		bitstream.readBits(1); // motion_vectors_over_pic_boundaries_flag
		bitstream.readBits(1); // restricted_ref_pic_lists_flag
		const min_spatial_segmentation_idc = readExpGolomb(bitstream);

		// Skip the rest
		readExpGolomb(bitstream); // max_bytes_per_pic_denom
		readExpGolomb(bitstream); // max_bits_per_min_cu_denom
		readExpGolomb(bitstream); // log2_max_mv_length_horizontal
		readExpGolomb(bitstream); // log2_max_mv_length_vertical

		return min_spatial_segmentation_idc;
	}

	return 0;
};

/** Consumes an hrd_parameters() structure from the bitstream without keeping any of its values. */
const skipHrdParameters = (
	bitstream: Bitstream,
	commonInfPresentFlag: boolean,
	maxNumSubLayersMinus1: number,
) => {
	let nal_hrd_parameters_present_flag = false;
	let vcl_hrd_parameters_present_flag = false;
	let sub_pic_hrd_params_present_flag = false;

	if (commonInfPresentFlag) {
		nal_hrd_parameters_present_flag = bitstream.readBits(1) === 1;
		vcl_hrd_parameters_present_flag = bitstream.readBits(1) === 1;

		if (nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag) {
			sub_pic_hrd_params_present_flag = bitstream.readBits(1) === 1;
			if (sub_pic_hrd_params_present_flag) {
				bitstream.readBits(8); // tick_divisor_minus2
				bitstream.readBits(5); // du_cpb_removal_delay_increment_length_minus1
				bitstream.readBits(1); // sub_pic_cpb_params_in_pic_timing_sei_flag
				bitstream.readBits(5); // dpb_output_delay_du_length_minus1
			}
			bitstream.readBits(4); // bit_rate_scale
			bitstream.readBits(4); // cpb_size_scale
			if (sub_pic_hrd_params_present_flag) {
				bitstream.readBits(4); // cpb_size_du_scale
			}
			bitstream.readBits(5); // initial_cpb_removal_delay_length_minus1
			bitstream.readBits(5); // au_cpb_removal_delay_length_minus1
			bitstream.readBits(5); // dpb_output_delay_length_minus1
		}
	}

	for (let i = 0; i <= maxNumSubLayersMinus1; i++) {
		const fixed_pic_rate_general_flag = bitstream.readBits(1) === 1;
		let fixed_pic_rate_within_cvs_flag = true; // Default assumption if general is true
		if (!fixed_pic_rate_general_flag) {
			fixed_pic_rate_within_cvs_flag = bitstream.readBits(1) === 1;
		}

		let low_delay_hrd_flag = false; // Default assumption
		if (fixed_pic_rate_within_cvs_flag) {
			readExpGolomb(bitstream); // elemental_duration_in_tc_minus1[i]
		} else {
			low_delay_hrd_flag = bitstream.readBits(1) === 1;
		}

		let CpbCnt = 1; // Default if low_delay is true
		if (!low_delay_hrd_flag) {
			const cpb_cnt_minus1 = readExpGolomb(bitstream); // cpb_cnt_minus1[i]
			CpbCnt = cpb_cnt_minus1 + 1;
		}

		if (nal_hrd_parameters_present_flag) {
			skipSubLayerHrdParameters(bitstream, CpbCnt, sub_pic_hrd_params_present_flag);
		}
		if (vcl_hrd_parameters_present_flag) {
			skipSubLayerHrdParameters(bitstream, CpbCnt, sub_pic_hrd_params_present_flag);
		}
	}
};

/** Consumes a sub_layer_hrd_parameters() structure containing `CpbCnt` entries. */
const skipSubLayerHrdParameters = (
	bitstream: Bitstream,
	CpbCnt: number,
	sub_pic_hrd_params_present_flag: boolean,
) => {
	for (let i = 0; i < CpbCnt; i++) {
		readExpGolomb(bitstream); // bit_rate_value_minus1[i]
		readExpGolomb(bitstream); // cpb_size_value_minus1[i]
		if (sub_pic_hrd_params_present_flag) {
			readExpGolomb(bitstream); // cpb_size_du_value_minus1[i]
			readExpGolomb(bitstream); // bit_rate_du_value_minus1[i]
		}
		bitstream.readBits(1); // cbr_flag[i]
	}
};

/** Serializes an HevcDecoderConfigurationRecord into the format specified in Section 8.3.3.1 of ISO 14496-15. */
export const serializeHevcDecoderConfigurationRecord = (record: HevcDecoderConfigurationRecord) => {
	const bytes: number[] = [];

	bytes.push(record.configurationVersion);

	bytes.push(
		((record.generalProfileSpace & 0x3) << 6)
		| ((record.generalTierFlag & 0x1) << 5)
		| (record.generalProfileIdc & 0x1F),
	);

	// 32-bit big-endian; the unsigned shift keeps a negative (sign bit set) value intact
	bytes.push((record.generalProfileCompatibilityFlags >>> 24) & 0xFF);
	bytes.push((record.generalProfileCompatibilityFlags >>> 16) & 0xFF);
	bytes.push((record.generalProfileCompatibilityFlags >>> 8) & 0xFF);
	bytes.push(record.generalProfileCompatibilityFlags & 0xFF);

	bytes.push(...record.generalConstraintIndicatorFlags);

	bytes.push(record.generalLevelIdc & 0xFF);

	bytes.push(0xF0 | ((record.minSpatialSegmentationIdc >> 8) & 0x0F)); // Reserved + high nibble
	bytes.push(record.minSpatialSegmentationIdc & 0xFF); // Low byte

	bytes.push(0xFC | (record.parallelismType & 0x03));

	bytes.push(0xFC | (record.chromaFormatIdc & 0x03));

	bytes.push(0xF8 | (record.bitDepthLumaMinus8 & 0x07));

	bytes.push(0xF8 | (record.bitDepthChromaMinus8 & 0x07));

	bytes.push((record.avgFrameRate >> 8) & 0xFF); // High byte
	bytes.push(record.avgFrameRate & 0xFF); // Low byte

	bytes.push(
		((record.constantFrameRate & 0x03) << 6)
		| ((record.numTemporalLayers & 0x07) << 3)
		| ((record.temporalIdNested & 0x01) << 2)
		| (record.lengthSizeMinusOne & 0x03),
	);

	bytes.push(record.arrays.length & 0xFF);

	for (const arr of record.arrays) {
		bytes.push(
			((arr.arrayCompleteness & 0x01) << 7)
			| (0 << 6)
			| (arr.nalUnitType & 0x3F),
		);

		bytes.push((arr.nalUnits.length >> 8) & 0xFF); // High byte
		bytes.push(arr.nalUnits.length & 0xFF); // Low byte

		for (const nal of arr.nalUnits) {
			bytes.push((nal.length >> 8) & 0xFF); // High byte
			bytes.push(nal.length & 0xFF); // Low byte

			for (let i = 0; i < nal.length; i++) {
				bytes.push(nal[i]!);
			}
		}
	}

	return new Uint8Array(bytes);
};

export type Vp9CodecInfo = {
	profile: number;
	level: number;
	bitDepth: number;
	chromaSubsampling: number;
	videoFullRangeFlag: number;
	colourPrimaries: number;
	transferCharacteristics: number;
	matrixCoefficients: number;
};

/**
 * Extracts VP9 codec information from the uncompressed header of a packet. Returns null unless the packet starts
 * with a key frame carrying a valid frame marker and sync code.
 */
export const extractVp9CodecInfoFromPacket = (
	packet: Uint8Array,
): Vp9CodecInfo | null => {
	// eslint-disable-next-line @stylistic/max-len
	// https://storage.googleapis.com/downloads.webmproject.org/docs/vp9/vp9-bitstream-specification-v0.7-20170222-draft.pdf
	// http://downloads.webmproject.org/docs/vp9/vp9-bitstream_superframe-and-uncompressed-header_v1.0.pdf

	const bitstream = new Bitstream(packet);

	// Frame marker (0b10)
	const frameMarker = bitstream.readBits(2);
	if (frameMarker !== 2) {
		return null;
	}

	// Profile
	const profileLowBit = bitstream.readBits(1);
	const profileHighBit = bitstream.readBits(1);
	const profile = (profileHighBit << 1) + profileLowBit;

	// Skip reserved bit for profile 3
	if (profile === 3) {
		bitstream.skipBits(1);
	}

	// show_existing_frame
	const showExistingFrame = bitstream.readBits(1);
	if (showExistingFrame === 1) {
		return null;
	}

	// frame_type (0 = key frame)
	const frameType = bitstream.readBits(1);
	if (frameType !== 0) {
		return null;
	}

	// Skip show_frame and error_resilient_mode
	bitstream.skipBits(2);

	// Sync code (0x498342)
	const syncCode = bitstream.readBits(24);
	if (syncCode !== 0x498342) {
		return null;
	}

	// Color config
	let bitDepth = 8;
	if (profile >= 2) {
		const tenOrTwelveBit = bitstream.readBits(1);
		bitDepth = tenOrTwelveBit ? 12 : 10;
	}

	// Color space
	const colorSpace = bitstream.readBits(3);

	let chromaSubsampling = 0;
	let videoFullRangeFlag = 0;

	if (colorSpace !== 7) { // 7 is CS_RGB
		const colorRange = bitstream.readBits(1);
		videoFullRangeFlag = colorRange;

		if (profile === 1 || profile === 3) {
			const subsamplingX = bitstream.readBits(1);
			const subsamplingY = bitstream.readBits(1);

			// 0 = 4:2:0 vertical
			// 1 = 4:2:0 colocated
			// 2 = 4:2:2
			// 3 = 4:4:4
			chromaSubsampling = !subsamplingX && !subsamplingY
				? 3 // 0,0 = 4:4:4
				: subsamplingX && !subsamplingY
					? 2 // 1,0 = 4:2:2
					: 1; // 1,1 = 4:2:0 colocated (default)

			// Skip reserved bit
			bitstream.skipBits(1);
		} else {
			// For profile 0 and 2, always 4:2:0
			chromaSubsampling = 1; // Using colocated as default
		}
	} else {
		// RGB is always 4:4:4
		chromaSubsampling = 3;
		videoFullRangeFlag = 1;

		// color_config() carries a reserved_zero bit in the RGB case as well for profiles 1 and 3. It must be
		// consumed, or the frame size fields below are read one bit too early.
		if (profile === 1 || profile === 3) {
			bitstream.skipBits(1);
		}
	}

	// Parse frame size
	const widthMinusOne = bitstream.readBits(16);
	const heightMinusOne = bitstream.readBits(16);

	const width = widthMinusOne + 1;
	const height = heightMinusOne + 1;

	// Calculate level based on dimensions
	const pictureSize = width * height;
	let level = last(VP9_LEVEL_TABLE)!.level; // Default to highest level
	for (const entry of VP9_LEVEL_TABLE) {
		if (pictureSize <= entry.maxPictureSize) {
			level = entry.level;
			break;
		}
	}

	// Map color_space to standard values
	const matrixCoefficients = colorSpace === 7
		? 0
		: colorSpace === 2
			? 1
			: colorSpace === 1
				? 6
				: 2;

	const colourPrimaries = colorSpace === 2
		? 1
		: colorSpace === 1
			? 6
			: 2;

	const transferCharacteristics = colorSpace === 2
		? 1
		: colorSpace === 1
			? 6
			: 2;

	return {
		profile,
		level,
		bitDepth,
		chromaSubsampling,
		videoFullRangeFlag,
		colourPrimaries,
		transferCharacteristics,
		matrixCoefficients,
	};
};

export type Av1CodecInfo = {
	profile: number;
	level: number;
	tier: number;
	bitDepth: number;
	monochrome: number;
	chromaSubsamplingX: number;
	chromaSubsamplingY: number;
	chromaSamplePosition: number;
};

/**
 * Iterates over all OBUs in an AV1 packet bitstream.
*/
export function* iterateAv1PacketObus(packet: Uint8Array) {
	// https://aomediacodec.github.io/av1-spec/av1-spec.pdf
	//
	// Yields `{ type, data }` for each OBU, where `data` is a subarray view of `packet` covering the OBU payload
	// (header, extension header and size field excluded). Iteration ends silently at the first invalid size field.

	const bitstream = new Bitstream(packet);

	// Reads a leb128-coded value (up to 8 bytes, 7 payload bits each, least significant group first).
	// Returns null if the value violates the spec's conformance requirements.
	const readLeb128 = (): number | null => {
		let value = 0;

		for (let i = 0; i < 8; i++) {
			const byte = bitstream.readAlignedByte();

			// Accumulate arithmetically instead of with `<<` and `|=`: bitwise operators work on signed 32-bit
			// integers, so groups from the fifth byte onwards would wrap around or turn the value negative, and
			// an oversized value could then slip past the range check below.
			value += (byte & 0x7f) * 2 ** (i * 7);

			if (!(byte & 0x80)) {
				break;
			}

			// Spec requirement: the continuation bit must not be set on the eighth byte
			if (i === 7) {
				return null;
			}
		}

		// Spec requirement: the value must be less than or equal to (1 << 32) - 1
		if (value > 2 ** 32 - 1) {
			return null;
		}

		return value;
	};

	while (bitstream.getBitsLeft() >= 8) {
		// Parse OBU header
		bitstream.skipBits(1); // obu_forbidden_bit
		const obuType = bitstream.readBits(4);
		const obuExtension = bitstream.readBits(1);
		const obuHasSizeField = bitstream.readBits(1);
		bitstream.skipBits(1); // obu_reserved_1bit

		// Skip extension header if present
		if (obuExtension) {
			bitstream.skipBits(8);
		}

		// Read OBU size if present
		let obuSize: number;
		if (obuHasSizeField) {
			const obuSizeValue = readLeb128();
			if (obuSizeValue === null) return; // It was invalid
			obuSize = obuSizeValue;
		} else {
			// Without a size field, the OBU extends over all remaining whole bytes
			obuSize = Math.floor(bitstream.getBitsLeft() / 8);
		}

		assert(bitstream.pos % 8 === 0);

		yield {
			type: obuType,
			data: packet.subarray(bitstream.pos / 8, bitstream.pos / 8 + obuSize),
		};

		// Move to next OBU
		bitstream.skipBits(obuSize * 8);
	}
}

/**
 * When AV1 codec information is not provided by the container, we can still try to extract the information by digging
 * into the AV1 bitstream.
/**
 * When AV1 codec information is not provided by the container, we can still try to extract the information by digging
 * into the AV1 bitstream.
 *
 * Parses the first sequence header OBU found in the packet, following `sequence_header_obu()` and `color_config()`
 * from the AV1 specification.
 *
 * @param packet - The raw bytes of an AV1 packet.
 * @returns The extracted codec info, or null if the packet has no sequence header or the header is truncated.
 */
export const extractAv1CodecInfoFromPacket = (
	packet: Uint8Array,
): Av1CodecInfo | null => {
	// https://aomediacodec.github.io/av1-spec/av1-spec.pdf

	for (const { type, data } of iterateAv1PacketObus(packet)) {
		if (type !== 1) {
			continue; // 1 == OBU_SEQUENCE_HEADER
		}

		const bitstream = new Bitstream(data);

		// Read sequence header fields
		const seqProfile = bitstream.readBits(3);
		bitstream.skipBits(1); // still_picture
		const reducedStillPictureHeader = bitstream.readBits(1);

		let seqLevel = 0;
		let seqTier = 0;

		if (reducedStillPictureHeader) {
			seqLevel = bitstream.readBits(5);
		} else {
			let decoderModelInfoPresentFlag = 0;
			let bufferDelayLengthMinus1 = 0;

			const timingInfoPresentFlag = bitstream.readBits(1);
			if (timingInfoPresentFlag) {
				// timing_info()
				bitstream.skipBits(32); // num_units_in_display_tick
				bitstream.skipBits(32); // time_scale

				const equalPictureInterval = bitstream.readBits(1);
				if (equalPictureInterval) {
					// num_ticks_per_picture_minus_1 is uvlc-coded: a run of zero bits, a one bit, then as many
					// value bits as there were zeros (none if there were 32 or more)
					let leadingZeros = 0;
					while (bitstream.getBitsLeft() > 0 && bitstream.readBits(1) === 0) {
						leadingZeros++;
					}
					if (leadingZeros < 32) {
						bitstream.skipBits(leadingZeros);
					}
				}

				// decoder_model_info_present_flag is only coded when timing info is present
				decoderModelInfoPresentFlag = bitstream.readBits(1);
				if (decoderModelInfoPresentFlag) {
					// decoder_model_info()
					bufferDelayLengthMinus1 = bitstream.readBits(5);
					bitstream.skipBits(32); // num_units_in_decoding_tick
					bitstream.skipBits(5); // buffer_removal_time_length_minus_1
					bitstream.skipBits(5); // frame_presentation_time_length_minus_1
				}
			}

			// This flag is coded once for the whole sequence, not once per operating point
			const initialDisplayDelayPresentFlag = bitstream.readBits(1);
			const operatingPointsCntMinus1 = bitstream.readBits(5);

			for (let i = 0; i <= operatingPointsCntMinus1; i++) {
				bitstream.skipBits(12); // operating_point_idc[i]

				const seqLevelIdx = bitstream.readBits(5);
				// seq_tier[i] is only coded for levels above 7 and is inferred to be 0 otherwise
				const seqTierForOp = seqLevelIdx > 7 ? bitstream.readBits(1) : 0;

				// Only the first operating point is reported
				if (i === 0) {
					seqLevel = seqLevelIdx;
					seqTier = seqTierForOp;
				}

				if (decoderModelInfoPresentFlag) {
					const decoderModelPresentForThisOp = bitstream.readBits(1);
					if (decoderModelPresentForThisOp) {
						// operating_parameters_info()
						const n = bufferDelayLengthMinus1 + 1;
						bitstream.skipBits(n); // decoder_buffer_delay[op]
						bitstream.skipBits(n); // encoder_buffer_delay[op]
						bitstream.skipBits(1); // low_delay_mode_flag[op]
					}
				}

				if (initialDisplayDelayPresentFlag) {
					const initialDisplayDelayPresentForThisOp = bitstream.readBits(1);
					if (initialDisplayDelayPresentForThisOp) {
						bitstream.skipBits(4); // initial_display_delay_minus_1[i]
					}
				}
			}
		}

		// The fields below sit between the operating points and color_config(); they must be consumed so that the
		// color configuration is read from the correct bit position.
		const frameWidthBitsMinus1 = bitstream.readBits(4);
		const frameHeightBitsMinus1 = bitstream.readBits(4);
		bitstream.skipBits(frameWidthBitsMinus1 + 1); // max_frame_width_minus_1
		bitstream.skipBits(frameHeightBitsMinus1 + 1); // max_frame_height_minus_1

		if (!reducedStillPictureHeader) {
			const frameIdNumbersPresentFlag = bitstream.readBits(1);
			if (frameIdNumbersPresentFlag) {
				bitstream.skipBits(4); // delta_frame_id_length_minus_2
				bitstream.skipBits(3); // additional_frame_id_length_minus_1
			}
		}

		bitstream.skipBits(3); // use_128x128_superblock, enable_filter_intra, enable_intra_edge_filter

		if (!reducedStillPictureHeader) {
			// enable_interintra_compound, enable_masked_compound, enable_warped_motion, enable_dual_filter
			bitstream.skipBits(4);

			const enableOrderHint = bitstream.readBits(1);
			if (enableOrderHint) {
				bitstream.skipBits(2); // enable_jnt_comp, enable_ref_frame_mvs
			}

			const seqChooseScreenContentTools = bitstream.readBits(1);
			// When "choose" is set, the value is inferred as SELECT_SCREEN_CONTENT_TOOLS (2), otherwise it is coded
			const seqForceScreenContentTools = seqChooseScreenContentTools ? 2 : bitstream.readBits(1);

			if (seqForceScreenContentTools > 0) {
				const seqChooseIntegerMv = bitstream.readBits(1);
				if (!seqChooseIntegerMv) {
					bitstream.skipBits(1); // seq_force_integer_mv
				}
			}

			if (enableOrderHint) {
				bitstream.skipBits(3); // order_hint_bits_minus_1
			}
		}

		bitstream.skipBits(3); // enable_superres, enable_cdef, enable_restoration

		// color_config()
		const highBitdepth = bitstream.readBits(1);

		let bitDepth = 8;
		if (seqProfile === 2 && highBitdepth) {
			const twelveBit = bitstream.readBits(1);
			bitDepth = twelveBit ? 12 : 10;
		} else if (seqProfile <= 2) {
			bitDepth = highBitdepth ? 10 : 8;
		}

		let monochrome = 0;
		if (seqProfile !== 1) {
			monochrome = bitstream.readBits(1);
		}

		// 2 == CP_UNSPECIFIED / TC_UNSPECIFIED / MC_UNSPECIFIED, the values inferred without a color description
		let colorPrimaries = 2;
		let transferCharacteristics = 2;
		let matrixCoefficients = 2;

		const colorDescriptionPresentFlag = bitstream.readBits(1);
		if (colorDescriptionPresentFlag) {
			colorPrimaries = bitstream.readBits(8);
			transferCharacteristics = bitstream.readBits(8);
			matrixCoefficients = bitstream.readBits(8);
		}

		// These defaults are also what the spec infers for monochrome streams
		let chromaSubsamplingX = 1;
		let chromaSubsamplingY = 1;
		let chromaSamplePosition = 0;

		if (!monochrome) {
			if (colorPrimaries === 1 && transferCharacteristics === 13 && matrixCoefficients === 0) {
				// CP_BT_709 + TC_SRGB + MC_IDENTITY: always 4:4:4, and nothing further is coded
				chromaSubsamplingX = 0;
				chromaSubsamplingY = 0;
			} else {
				bitstream.skipBits(1); // color_range

				if (seqProfile === 0) {
					chromaSubsamplingX = 1;
					chromaSubsamplingY = 1;
				} else if (seqProfile === 1) {
					chromaSubsamplingX = 0;
					chromaSubsamplingY = 0;
				} else if (bitDepth === 12) {
					chromaSubsamplingX = bitstream.readBits(1);
					chromaSubsamplingY = chromaSubsamplingX ? bitstream.readBits(1) : 0;
				} else {
					// Profile 2 below 12 bits is always 4:2:2
					chromaSubsamplingX = 1;
					chromaSubsamplingY = 0;
				}

				if (chromaSubsamplingX && chromaSubsamplingY) {
					chromaSamplePosition = bitstream.readBits(2);
				}
			}
		}

		if (bitstream.getBitsLeft() < 0) {
			// We consumed more bits than the OBU holds, so the header is truncated and the values are meaningless
			return null;
		}

		return {
			profile: seqProfile,
			level: seqLevel,
			tier: seqTier,
			bitDepth,
			monochrome,
			chromaSubsamplingX,
			chromaSubsamplingY,
			chromaSamplePosition,
		};
	}

	return null;
};

/**
 * Parses an Opus identification header ("OpusHead", see RFC 7845, Section 5.1).
 *
 * All multi-byte fields are little-endian. No length validation is performed; reading past the end of `bytes`
 * throws a RangeError from the underlying DataView.
 *
 * @param bytes - The identification header, starting at its first byte.
 * @returns The parsed fields. `channelMappingTable` is null when the channel mapping family is 0; otherwise it is a
 * view onto the `2 + outputChannelCount` bytes that follow the family byte.
 */
export const parseOpusIdentificationHeader = (bytes: Uint8Array) => {
	const view = toDataView(bytes);

	const outputChannelCount = view.getUint8(9);
	const preSkip = view.getUint16(10, true);
	const inputSampleRate = view.getUint32(12, true);
	const outputGain = view.getInt16(16, true);
	const channelMappingFamily = view.getUint8(18);

	let channelMappingTable: Uint8Array | null = null;
	if (channelMappingFamily) {
		// Stream count (1 byte) + coupled stream count (1 byte) + one mapping byte per output channel
		channelMappingTable = bytes.subarray(19, 19 + 2 + outputChannelCount);
	}

	return {
		outputChannelCount,
		preSkip,
		inputSampleRate,
		outputGain,
		channelMappingFamily,
		channelMappingTable,
	};
};

// From https://datatracker.ietf.org/doc/html/rfc6716, in 48 kHz samples. Indexed by the 5-bit TOC "config" value.
const OPUS_FRAME_DURATION_TABLE = [
	480, 960, 1920, 2880,
	480, 960, 1920, 2880,
	480, 960, 1920, 2880,
	480, 960,
	480, 960,
	120, 240, 480, 960,
	120, 240, 480, 960,
	120, 240, 480, 960,
	120, 240, 480, 960,
];

/**
 * Parses the TOC (table-of-contents) byte at the start of an Opus packet.
 *
 * @param packet - The Opus packet; only its first byte is inspected.
 * @returns The duration of a single frame of this packet, in 48 kHz samples.
 */
export const parseOpusTocByte = (packet: Uint8Array) => {
	// The upper five bits of the TOC byte are the configuration number
	const config = packet[0]! >> 3;

	return {
		durationInSamples: OPUS_FRAME_DURATION_TABLE[config]!,
	};
};

/**
 * Extracts the block flag of every mode from a Vorbis Setup header by scanning the header backwards.
 * Based on vorbis_parser.c from FFmpeg.
 *
 * @param setupHeader - The complete Vorbis Setup header packet.
 * @returns An object whose `modeBlockflags[i]` is the block flag (0 or 1) of mode `i`.
 * @throws Error if the packet is not a Setup header, or if no framing bit or mode header can be located in it.
 */
export const parseModesFromVorbisSetupPacket = (setupHeader: Uint8Array) => {
	// Verify that this is a Setup header.
	if (setupHeader.length < 7) {
		throw new Error('Setup header is too short.');
	}
	if (setupHeader[0] !== 5) {
		throw new Error('Wrong packet type in Setup header.');
	}
	const signature = String.fromCharCode(...setupHeader.slice(1, 7));
	if (signature !== 'vorbis') {
		throw new Error('Invalid packet signature in Setup header.');
	}

	// The modes are the last thing in the header, so reverse the entire buffer and read it front to back.
	const bufSize = setupHeader.length;
	const revBuffer = new Uint8Array(bufSize);
	for (let i = 0; i < bufSize; i++) {
		revBuffer[i] = setupHeader[bufSize - 1 - i]!;
	}

	const bitstream = new Bitstream(revBuffer);

	// --- Find the framing bit.
	// As in the FFmpeg code, scan until a set bit is read and remember the position right after it.
	let gotFramingBit = 0;
	while (bitstream.getBitsLeft() > 97) {
		if (bitstream.readBits(1) === 1) {
			gotFramingBit = bitstream.pos;
			break;
		}
	}
	if (gotFramingBit === 0) {
		throw new Error('Invalid Setup header: framing bit not found.');
	}

	// --- Search backwards for a valid mode header.
	// The number of modes is guessed by matching a fixed bit pattern for as long as it keeps matching.
	let modeCount = 0;
	let gotModeHeader = false;
	let lastModeCount = 0;
	while (bitstream.getBitsLeft() >= 97) {
		const tempPos = bitstream.pos;
		const a = bitstream.readBits(8);
		const b = bitstream.readBits(16);
		const c = bitstream.readBits(16);

		// If a > 63 or b or c nonzero, assume we've gone too far.
		if (a > 63 || b !== 0 || c !== 0) {
			bitstream.pos = tempPos;
			break;
		}

		bitstream.skipBits(1);
		modeCount++;
		if (modeCount > 64) {
			break;
		}

		// Peek (on a clone, so the position is kept) at the 6 bits that would hold the mode count minus one
		const bsClone = bitstream.clone();
		const candidate = bsClone.readBits(6) + 1;
		if (candidate === modeCount) {
			gotModeHeader = true;
			lastModeCount = modeCount;
		}
	}
	if (!gotModeHeader) {
		throw new Error('Invalid Setup header: mode header not found.');
	}
	if (lastModeCount > 63) {
		throw new Error(`Unsupported mode count: ${lastModeCount}.`);
	}
	const finalModeCount = lastModeCount;

	// --- Rewind to just after the framing bit.
	bitstream.pos = 0;
	bitstream.skipBits(gotFramingBit);

	// --- Now read, for each mode (in reverse order), 40 bits then one bit.
	// That one bit is the mode blockflag.
	const modeBlockflags = Array(finalModeCount).fill(0) as number[];
	for (let i = finalModeCount - 1; i >= 0; i--) {
		bitstream.skipBits(40);
		modeBlockflags[i] = bitstream.readBits(1);
	}

	return { modeBlockflags };
};

/**
 * Determines a packet's type (key or delta) by digging into the packet bitstream.
 *
 * @param videoTrack - The track the packet belongs to; its codec selects the parsing strategy.
 * @param packet - The encoded packet to inspect.
 * @returns 'key' or 'delta', or null when the type cannot be derived from the bitstream (for example a VP9 or AV1
 * packet that only shows an existing frame, or an AV1 packet without any frame header).
 */
export const determineVideoPacketType = async (
	videoTrack: InputVideoTrack,
	packet: EncodedPacket,
): Promise<PacketType | null> => {
	assert(videoTrack.codec);

	switch (videoTrack.codec) {
		case 'avc': {
			const decoderConfig = await videoTrack.getDecoderConfig();
			assert(decoderConfig);

			let nalUnits: Uint8Array[];

			if (decoderConfig.description) {
				// Stream is length-prefixed. Let's extract the size of the length prefix from the decoder config
				const bytes = toUint8Array(decoderConfig.description);
				const lengthSizeMinusOne = bytes[4]! & 0b11;
				const lengthSize = (lengthSizeMinusOne + 1) as 1 | 2 | 3 | 4;

				nalUnits = findNalUnitsInLengthPrefixed(packet.data, lengthSize);
			} else {
				// Stream is in Annex B format
				nalUnits = findNalUnitsInAnnexB(packet.data);
			}

			// NAL unit type 5 is a coded slice of an IDR picture
			const isKeyframe = nalUnits.some(x => extractNalUnitTypeForAvc(x) === 5);
			return isKeyframe ? 'key' : 'delta';
		}

		case 'hevc': {
			const decoderConfig = await videoTrack.getDecoderConfig();
			assert(decoderConfig);

			let nalUnits: Uint8Array[];

			if (decoderConfig.description) {
				// Stream is length-prefixed. Let's extract the size of the length prefix from the decoder config
				const bytes = toUint8Array(decoderConfig.description);
				const lengthSizeMinusOne = bytes[21]! & 0b11;
				const lengthSize = (lengthSizeMinusOne + 1) as 1 | 2 | 3 | 4;

				nalUnits = findNalUnitsInLengthPrefixed(packet.data, lengthSize);
			} else {
				// Stream is in Annex B format
				nalUnits = findNalUnitsInAnnexB(packet.data);
			}

			// NAL unit types 16 through 23 are the IRAP (intra random access point) picture types
			const isKeyframe = nalUnits.some((x) => {
				const type = extractNalUnitTypeForHevc(x);
				return 16 <= type && type <= 23;
			});
			return isKeyframe ? 'key' : 'delta';
		}

		case 'vp8': {
			// VP8, once again, by far the easiest to deal with.
			const frameType = packet.data[0]! & 0b1;
			return frameType === 0 ? 'key' : 'delta';
		}

		case 'vp9': {
			const bitstream = new Bitstream(packet.data);

			// frame_marker must be 2
			if (bitstream.readBits(2) !== 2) {
				return null;
			}

			const profileLowBit = bitstream.readBits(1);
			const profileHighBit = bitstream.readBits(1);
			const profile = (profileHighBit << 1) + profileLowBit;

			// Skip reserved bit for profile 3
			if (profile === 3) {
				bitstream.skipBits(1);
			}

			const showExistingFrame = bitstream.readBits(1);
			if (showExistingFrame) {
				return null;
			}

			const frameType = bitstream.readBits(1);
			return frameType === 0 ? 'key' : 'delta';
		}

		case 'av1': {
			let reducedStillPictureHeader = false;

			for (const { type, data } of iterateAv1PacketObus(packet.data)) {
				if (type === 1) { // OBU_SEQUENCE_HEADER
					const bitstream = new Bitstream(data);

					bitstream.skipBits(4); // seq_profile (3 bits) + still_picture (1 bit)
					reducedStillPictureHeader = !!bitstream.readBits(1);
				} else if (
					type === 3 // OBU_FRAME_HEADER
					|| type === 6 // OBU_FRAME
					|| type === 7 // OBU_REDUNDANT_FRAME_HEADER
				) {
					// With a reduced still picture header, every frame is a key frame and no frame type is coded
					if (reducedStillPictureHeader) {
						return 'key';
					}

					const bitstream = new Bitstream(data);
					const showExistingFrame = bitstream.readBits(1);
					if (showExistingFrame) {
						return null;
					}

					const frameType = bitstream.readBits(2);
					return frameType === 0 ? 'key' : 'delta';
				}
			}

			return null;
		}

		default: {
			assertNever(videoTrack.codec);
			assert(false);
		}
	}
};