// hls.js: JavaScript HLS client using Media Source Extensions
// MP4 remuxer (mp4-remuxer)
import AAC from './aac-helper';
import MP4 from './mp4-generator';
import { ErrorDetails, ErrorTypes } from '../errors';
import { Events } from '../events';
import { PlaylistLevelType } from '../types/loader';
import { type ILogger, Logger } from '../utils/logger';
import {
timestampToString,
toMsFromMpegTsClock,
} from '../utils/timescale-conversion';
import type { HlsConfig } from '../config';
import type { HlsEventEmitter } from '../events';
import type { SourceBufferName } from '../types/buffer';
import type {
AudioSample,
DemuxedAudioTrack,
DemuxedMetadataTrack,
DemuxedUserdataTrack,
DemuxedVideoTrack,
VideoSample,
} from '../types/demuxer';
import type {
InitSegmentData,
Mp4Sample,
RemuxedMetadata,
RemuxedTrack,
RemuxedUserdata,
Remuxer,
RemuxerResult,
} from '../types/remuxer';
import type { TrackSet } from '../types/track';
import type { TypeSupported } from '../utils/codecs';
import type {
RationalTimestamp,
TimestampOffset,
} from '../utils/timescale-conversion';
const MAX_SILENT_FRAME_DURATION = 10 * 1000; // 10 seconds
const AAC_SAMPLES_PER_FRAME = 1024;
const MPEG_AUDIO_SAMPLE_PER_FRAME = 1152;
const AC3_SAMPLES_PER_FRAME = 1536;
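// Illustrative arithmetic (not part of the original source): at a 48 kHz sample rate, one AAC
// frame of AAC_SAMPLES_PER_FRAME (1024) samples lasts 1024 / 48000 ≈ 21.3 ms, which is
// 1024 * (90000 / 48000) = 1920 ticks of the 90 kHz MPEG-TS clock.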
let chromeVersion: number | null = null;
let safariWebkitVersion: number | null = null;
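// Builds a trun sample entry with fMP4 sample flags: keyframes are marked as sync samples that do
// not depend on other samples (dependsOn = 2, isNonSync = 0), all other frames as non-sync samples
// that depend on prior ones (dependsOn = 1, isNonSync = 1).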
function createMp4Sample(
isKeyframe: boolean,
duration: number,
size: number,
cts: number,
): Mp4Sample {
return {
duration,
size,
cts,
flags: {
isLeading: 0,
isDependedOn: 0,
hasRedundancy: 0,
degradPrio: 0,
dependsOn: isKeyframe ? 2 : 1,
isNonSync: isKeyframe ? 0 : 1,
},
};
}
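/**
 * MP4Remuxer converts demuxed elementary-stream samples (AAC/MP3/AC-3 audio, AVC/HEVC video,
 * ID3 metadata and user-data text samples) into fragmented MP4 (moof + mdat) payloads suitable
 * for appending to Media Source Extensions SourceBuffers.
 */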
export default class MP4Remuxer extends Logger implements Remuxer {
private readonly observer: HlsEventEmitter;
private readonly config: HlsConfig;
private readonly typeSupported: TypeSupported;
private ISGenerated: boolean = false;
private _initPTS: TimestampOffset | null = null;
private _initDTS: TimestampOffset | null = null;
private nextVideoTs: number | null = null;
private nextAudioTs: number | null = null;
private videoSampleDuration: number | null = null;
private isAudioContiguous: boolean = false;
private isVideoContiguous: boolean = false;
private videoTrackConfig?: {
width?: number;
height?: number;
pixelRatio?: [number, number];
};
constructor(
observer: HlsEventEmitter,
config: HlsConfig,
typeSupported: TypeSupported,
logger: ILogger,
) {
super('mp4-remuxer', logger);
this.observer = observer;
this.config = config;
this.typeSupported = typeSupported;
this.ISGenerated = false;
if (chromeVersion === null) {
const userAgent = navigator.userAgent || '';
const result = userAgent.match(/Chrome\/(\d+)/i);
chromeVersion = result ? parseInt(result[1]) : 0;
}
if (safariWebkitVersion === null) {
const result = navigator.userAgent.match(/Safari\/(\d+)/i);
safariWebkitVersion = result ? parseInt(result[1]) : 0;
}
}
destroy() {
// @ts-ignore
this.config = this.videoTrackConfig = this._initPTS = this._initDTS = null;
}
resetTimeStamp(defaultTimeStamp: TimestampOffset | null) {
const initPTS = this._initPTS;
if (
!initPTS ||
!defaultTimeStamp ||
defaultTimeStamp.trackId !== initPTS.trackId ||
defaultTimeStamp.baseTime !== initPTS.baseTime ||
defaultTimeStamp.timescale !== initPTS.timescale
) {
this.log(
`Reset initPTS: ${initPTS ? timestampToString(initPTS) : initPTS} > ${defaultTimeStamp ? timestampToString(defaultTimeStamp) : defaultTimeStamp}`,
);
}
this._initPTS = this._initDTS = defaultTimeStamp;
}
resetNextTimestamp() {
this.log('reset next timestamp');
this.isVideoContiguous = false;
this.isAudioContiguous = false;
}
resetInitSegment() {
this.log('ISGenerated flag reset');
this.ISGenerated = false;
this.videoTrackConfig = undefined;
}
getVideoStartPts(videoSamples: VideoSample[]) {
// Get the minimum PTS value relative to the first sample's PTS, normalized for 33-bit wrapping
let rolloverDetected = false;
const firstPts = videoSamples[0].pts;
const startPTS = videoSamples.reduce((minPTS, sample) => {
let pts = sample.pts;
let delta = pts - minPTS;
if (delta < -4294967296) {
        // 2^32, see normalizePts for reasoning; we're hitting a rollover here, and we don't want that to impact the timeOffset calculation
rolloverDetected = true;
pts = normalizePts(pts, firstPts);
delta = pts - minPTS;
}
if (delta > 0) {
return minPTS;
}
return pts;
}, firstPts);
if (rolloverDetected) {
this.debug('PTS rollover detected');
}
return startPTS;
}
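  /**
   * Remuxes one fragment's worth of demuxed tracks. Audio is remuxed before video so that
   * remuxVideo can rely on nextAudioTs (computed in remuxAudio), and ID3/text samples are
   * flushed whenever an init segment and initPTS/initDTS are available.
   */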
remux(
audioTrack: DemuxedAudioTrack,
videoTrack: DemuxedVideoTrack,
id3Track: DemuxedMetadataTrack,
textTrack: DemuxedUserdataTrack,
timeOffset: number,
accurateTimeOffset: boolean,
flush: boolean,
playlistType: PlaylistLevelType,
): RemuxerResult {
let video: RemuxedTrack | undefined;
let audio: RemuxedTrack | undefined;
let initSegment: InitSegmentData | undefined;
let text: RemuxedUserdata | undefined;
let id3: RemuxedMetadata | undefined;
let independent: boolean | undefined;
let audioTimeOffset = timeOffset;
let videoTimeOffset = timeOffset;
    // If we're remuxing audio and video progressively, wait until we've received enough samples for each track before proceeding.
    // This is done to synchronize the audio and video streams. We know a track will have samples in the current segment when its
    // "pid" is greater than -1 (the pid is set when the PMT, which contains the track list, is parsed).
    // However, if the initSegment has already been generated, or we've reached the end of a segment (flush),
    // then we can remux one track without waiting for the other.
const hasAudio = audioTrack.pid > -1;
const hasVideo = videoTrack.pid > -1;
const length = videoTrack.samples.length;
const enoughAudioSamples = audioTrack.samples.length > 0;
const enoughVideoSamples = (flush && length > 0) || length > 1;
const canRemuxAvc =
((!hasAudio || enoughAudioSamples) &&
(!hasVideo || enoughVideoSamples)) ||
this.ISGenerated ||
flush;
if (canRemuxAvc) {
if (this.ISGenerated) {
const config = this.videoTrackConfig;
if (
(config &&
(videoTrack.width !== config.width ||
videoTrack.height !== config.height ||
videoTrack.pixelRatio?.[0] !== config.pixelRatio?.[0] ||
videoTrack.pixelRatio?.[1] !== config.pixelRatio?.[1])) ||
(!config && enoughVideoSamples) ||
(this.nextAudioTs === null && enoughAudioSamples)
) {
this.resetInitSegment();
}
}
if (!this.ISGenerated) {
initSegment = this.generateIS(
audioTrack,
videoTrack,
timeOffset,
accurateTimeOffset,
);
}
const isVideoContiguous = this.isVideoContiguous;
let firstKeyFrameIndex = -1;
let firstKeyFramePTS;
if (enoughVideoSamples) {
firstKeyFrameIndex = findKeyframeIndex(videoTrack.samples);
if (!isVideoContiguous && this.config.forceKeyFrameOnDiscontinuity) {
independent = true;
if (firstKeyFrameIndex > 0) {
this.warn(
`Dropped ${firstKeyFrameIndex} out of ${length} video samples due to a missing keyframe`,
);
const startPTS = this.getVideoStartPts(videoTrack.samples);
videoTrack.samples = videoTrack.samples.slice(firstKeyFrameIndex);
videoTrack.dropped += firstKeyFrameIndex;
videoTimeOffset +=
(videoTrack.samples[0].pts - startPTS) /
videoTrack.inputTimeScale;
firstKeyFramePTS = videoTimeOffset;
} else if (firstKeyFrameIndex === -1) {
this.warn(`No keyframe found out of ${length} video samples`);
independent = false;
}
}
}
if (this.ISGenerated) {
if (enoughAudioSamples && enoughVideoSamples) {
// timeOffset is expected to be the offset of the first timestamp of this fragment (first DTS)
// if first audio DTS is not aligned with first video DTS then we need to take that into account
// when providing timeOffset to remuxAudio / remuxVideo. if we don't do that, there might be a permanent / small
// drift between audio and video streams
const startPTS = this.getVideoStartPts(videoTrack.samples);
const tsDelta =
normalizePts(audioTrack.samples[0].pts, startPTS) - startPTS;
const audiovideoTimestampDelta = tsDelta / videoTrack.inputTimeScale;
audioTimeOffset += Math.max(0, audiovideoTimestampDelta);
videoTimeOffset += Math.max(0, -audiovideoTimestampDelta);
}
        // Purposefully remuxing audio before video, so that remuxVideo can use nextAudioTs, which is calculated in remuxAudio.
if (enoughAudioSamples) {
// if initSegment was generated without audio samples, regenerate it again
if (!audioTrack.samplerate) {
this.warn('regenerate InitSegment as audio detected');
initSegment = this.generateIS(
audioTrack,
videoTrack,
timeOffset,
accurateTimeOffset,
);
}
audio = this.remuxAudio(
audioTrack,
audioTimeOffset,
this.isAudioContiguous,
accurateTimeOffset,
hasVideo ||
enoughVideoSamples ||
playlistType === PlaylistLevelType.AUDIO
? videoTimeOffset
: undefined,
);
if (enoughVideoSamples) {
const audioTrackLength = audio ? audio.endPTS - audio.startPTS : 0;
// if initSegment was generated without video samples, regenerate it again
if (!videoTrack.inputTimeScale) {
this.warn('regenerate InitSegment as video detected');
initSegment = this.generateIS(
audioTrack,
videoTrack,
timeOffset,
accurateTimeOffset,
);
}
video = this.remuxVideo(
videoTrack,
videoTimeOffset,
isVideoContiguous,
audioTrackLength,
);
}
} else if (enoughVideoSamples) {
video = this.remuxVideo(
videoTrack,
videoTimeOffset,
isVideoContiguous,
0,
);
}
if (video) {
video.firstKeyFrame = firstKeyFrameIndex;
video.independent = firstKeyFrameIndex !== -1;
video.firstKeyFramePTS = firstKeyFramePTS;
}
}
}
// Allow ID3 and text to remux, even if more audio/video samples are required
if (this.ISGenerated && this._initPTS && this._initDTS) {
if (id3Track.samples.length) {
id3 = flushTextTrackMetadataCueSamples(
id3Track,
timeOffset,
this._initPTS,
this._initDTS,
);
}
if (textTrack.samples.length) {
text = flushTextTrackUserdataCueSamples(
textTrack,
timeOffset,
this._initPTS,
);
}
}
return {
audio,
video,
initSegment,
independent,
text,
id3,
};
}
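  // Computes the initPTS/initDTS offset in track timescale units: the given stream timestamp,
  // normalized for 33-bit rollover against the expected presentation position
  // (presentationTime * timescale), minus that position. If the normalized timestamp still falls
  // below the expected position (plus one second of slack), it is advanced by 2^33 until it
  // clears it, so a rollover near the start of the timeline does not skew the offset.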
computeInitPts(
basetime: number,
timescale: number,
presentationTime: number,
type: 'audio' | 'video',
): number {
const offset = Math.round(presentationTime * timescale);
let timestamp = normalizePts(basetime, offset);
if (timestamp < offset + timescale) {
this.log(
`Adjusting PTS for rollover in timeline near ${(offset - timestamp) / timescale} ${type}`,
);
while (timestamp < offset + timescale) {
timestamp += 8589934592;
}
}
return timestamp - offset;
}
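  // Builds the fMP4 initialization segment(s) for whichever tracks have config/samples available,
  // and captures initPTS/initDTS (the stream timestamp offset relative to playlist time) when it
  // can be computed from an accurate time offset or no initPTS exists yet.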
generateIS(
audioTrack: DemuxedAudioTrack,
videoTrack: DemuxedVideoTrack,
timeOffset: number,
accurateTimeOffset: boolean,
): InitSegmentData | undefined {
const audioSamples = audioTrack.samples;
const videoSamples = videoTrack.samples;
const typeSupported = this.typeSupported;
const tracks: TrackSet = {};
const _initPTS = this._initPTS;
let computePTSDTS = !_initPTS || accurateTimeOffset;
let container = 'audio/mp4';
let initPTS: number | undefined;
let initDTS: number | undefined;
let timescale: number | undefined;
let trackId: number = -1;
if (computePTSDTS) {
initPTS = initDTS = Infinity;
}
if (audioTrack.config && audioSamples.length) {
      // let's use the audio sampling rate as the MP4 timescale.
      // rationale: there is an integer number of audio samples per frame (1024 for AAC),
      // so using the audio sampling rate gives an integer MP4 frame duration,
      // which avoids potential rounding issues and A/V sync drift
audioTrack.timescale = audioTrack.samplerate;
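      // Illustrative numbers (not from the original source): with a 44.1 kHz track the MP4
      // timescale becomes 44100 and every AAC frame is exactly 1024 ticks long, whereas keeping
      // the 90 kHz MPEG-TS clock would give 1024 * (90000 / 44100) ≈ 2089.8 ticks per frame,
      // a non-integer duration whose rounding error accumulates over time.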
switch (audioTrack.segmentCodec) {
case 'mp3':
if (typeSupported.mpeg) {
// Chrome and Safari
container = 'audio/mpeg';
audioTrack.codec = '';
} else if (typeSupported.mp3) {
// Firefox
audioTrack.codec = 'mp3';
}
break;
case 'ac3':
audioTrack.codec = 'ac-3';
break;
}
tracks.audio = {
id: 'audio',
container: container,
codec: audioTrack.codec,
initSegment:
audioTrack.segmentCodec === 'mp3' && typeSupported.mpeg
? new Uint8Array(0)
: MP4.initSegment([audioTrack]),
metadata: {
channelCount: audioTrack.channelCount,
},
};
if (computePTSDTS) {
trackId = audioTrack.id;
timescale = audioTrack.inputTimeScale;
if (!_initPTS || timescale !== _initPTS.timescale) {
// remember first PTS of this demuxing context. for audio, PTS = DTS
initPTS = initDTS = this.computeInitPts(
audioSamples[0].pts,
timescale,
timeOffset,
'audio',
);
} else {
computePTSDTS = false;
}
}
}
if (videoTrack.sps && videoTrack.pps && videoSamples.length) {
// let's use input time scale as MP4 video timescale
// we use input time scale straight away to avoid rounding issues on frame duration / cts computation
videoTrack.timescale = videoTrack.inputTimeScale;
tracks.video = {
id: 'main',
container: 'video/mp4',
codec: videoTrack.codec,
initSegment: MP4.initSegment([videoTrack]),
metadata: {
width: videoTrack.width,
height: videoTrack.height,
},
};
if (computePTSDTS) {
trackId = videoTrack.id;
timescale = videoTrack.inputTimeScale;
if (!_initPTS || timescale !== _initPTS.timescale) {
const basePTS = this.getVideoStartPts(videoSamples);
const baseDTS = normalizePts(videoSamples[0].dts, basePTS);
const videoInitDTS = this.computeInitPts(
baseDTS,
timescale,
timeOffset,
'video',
);
const videoInitPTS = this.computeInitPts(
basePTS,
timescale,
timeOffset,
'video',
);
initDTS = Math.min(initDTS as number, videoInitDTS);
initPTS = Math.min(initPTS as number, videoInitPTS);
} else {
computePTSDTS = false;
}
}
this.videoTrackConfig = {
width: videoTrack.width,
height: videoTrack.height,
pixelRatio: videoTrack.pixelRatio,
};
}
if (Object.keys(tracks).length) {
this.ISGenerated = true;
if (computePTSDTS) {
if (_initPTS) {
this.warn(
`Timestamps at playlist time: ${accurateTimeOffset ? '' : '~'}${timeOffset} ${initPTS! / timescale!} != initPTS: ${_initPTS.baseTime / _initPTS.timescale} (${_initPTS.baseTime}/${_initPTS.timescale}) trackId: ${_initPTS.trackId}`,
);
}
this.log(
`Found initPTS at playlist time: ${timeOffset} offset: ${initPTS! / timescale!} (${initPTS}/${timescale}) trackId: ${trackId}`,
);
this._initPTS = {
baseTime: initPTS as number,
timescale: timescale as number,
trackId: trackId as number,
};
this._initDTS = {
baseTime: initDTS as number,
timescale: timescale as number,
trackId: trackId as number,
};
} else {
initPTS = timescale = undefined;
}
return {
tracks,
initPTS,
timescale,
trackId,
};
}
}
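  /**
   * Remuxes demuxed video samples into a single moof + mdat pair. Timestamps are normalized for
   * 33-bit rollover, samples are re-sorted by DTS/PTS when needed, inter-fragment holes/overlaps
   * are compensated when contiguous, and the last frame may be stretched
   * (config.stretchShortVideoTrack) to cover a trailing audio-only gap.
   */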
remuxVideo(
track: DemuxedVideoTrack,
timeOffset: number,
contiguous: boolean,
audioTrackLength: number,
): RemuxedTrack | undefined {
const timeScale: number = track.inputTimeScale;
const inputSamples: Array<VideoSample> = track.samples;
const outputSamples: Array<Mp4Sample> = [];
const nbSamples = inputSamples.length;
const initPTS = this._initPTS as RationalTimestamp;
const initTime = (initPTS.baseTime * timeScale) / initPTS.timescale;
let nextVideoTs = this.nextVideoTs;
let offset = 8;
let mp4SampleDuration = this.videoSampleDuration;
let firstDTS;
let lastDTS;
let minPTS: number = Number.POSITIVE_INFINITY;
let maxPTS: number = Number.NEGATIVE_INFINITY;
let sortSamples = false;
// if parsed fragment is contiguous with last one, let's use last DTS value as reference
if (!contiguous || nextVideoTs === null) {
const pts = initTime + timeOffset * timeScale;
const cts =
inputSamples[0].pts -
normalizePts(inputSamples[0].dts, inputSamples[0].pts);
if (
chromeVersion &&
nextVideoTs !== null &&
Math.abs(pts - cts - (nextVideoTs + initTime)) < 15000
) {
        // treat as contiguous to adjust samples that would otherwise produce video buffer gaps in Chrome
contiguous = true;
} else {
// if not contiguous, let's use target timeOffset
nextVideoTs = pts - cts - initTime;
}
}
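    // For reference (assuming the usual 90 kHz MPEG-TS clock for inputTimeScale): the 15000-tick
    // tolerance above corresponds to 15000 / 90000 ≈ 167 ms.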
    // PTS is coded on 33 bits and wraps around at 2^33, so raw values can jump backwards at a rollover.
    // normalizePts makes the PTS/DTS values monotonic; the last known video timestamp is used as the reference value.
const nextVideoPts = nextVideoTs + initTime;
for (let i = 0; i < nbSamples; i++) {
const sample = inputSamples[i];
sample.pts = normalizePts(sample.pts, nextVideoPts);
sample.dts = normalizePts(sample.dts, nextVideoPts);
if (sample.dts < inputSamples[i > 0 ? i - 1 : i].dts) {
sortSamples = true;
}
}
// sort video samples by DTS then PTS then demux id order
if (sortSamples) {
inputSamples.sort(function (a, b) {
const deltadts = a.dts - b.dts;
const deltapts = a.pts - b.pts;
return deltadts || deltapts;
});
}
// Get first/last DTS
firstDTS = inputSamples[0].dts;
lastDTS = inputSamples[inputSamples.length - 1].dts;
    // Sample duration (as expected by trun MP4 boxes) is the delta between consecutive sample DTS values;
    // use the average delta across the fragment as an estimated/fallback constant duration.
const inputDuration = lastDTS - firstDTS;
const averageSampleDuration = inputDuration
? Math.round(inputDuration / (nbSamples - 1))
: mp4SampleDuration || track.inputTimeScale / 30;
    // if fragments are contiguous, detect holes/overlaps between fragments
if (contiguous) {
// check timestamp continuity across consecutive fragments (this is to remove inter-fragment gap/hole)
const delta = firstDTS - nextVideoPts;
const foundHole = delta > averageSampleDuration;
const foundOverlap = delta < -1;
if (foundHole || foundOverlap) {
if (foundHole) {
this.warn(
`${(track.segmentCodec || '').toUpperCase()}: ${toMsFromMpegTsClock(
delta,
true,
)} ms (${delta}dts) hole between fragments detected at ${timeOffset.toFixed(
3,
)}`,
);
} else {
this.warn(
`${(track.segmentCodec || '').toUpperCase()}: ${toMsFromMpegTsClock(
-delta,
true,
)} ms (${delta}dts) overlapping between fragments detected at ${timeOffset.toFixed(
3,
)}`,
);
}
if (
!foundOverlap ||
nextVideoPts >= inputSamples[0].pts ||
chromeVersion
) {
firstDTS = nextVideoPts;
const firstPTS = inputSamples[0].pts - delta;
if (foundHole) {
inputSamples[0].dts = firstDTS;
inputSamples[0].pts = firstPTS;
} else {
let isPTSOrderRetained = true;
for (let i = 0; i < inputSamples.length; i++) {
if (inputSamples[i].dts > firstPTS && isPTSOrderRetained) {
break;
}
const prevPTS = inputSamples[i].pts;
inputSamples[i].dts -= delta;
inputSamples[i].pts -= delta;
// check to see if this sample's PTS order has changed
// relative to the next one
if (i < inputSamples.length - 1) {
const nextSamplePTS = inputSamples[i + 1].pts;
const currentSamplePTS = inputSamples[i].pts;
const currentOrder = nextSamplePTS <= currentSamplePTS;
const prevOrder = nextSamplePTS <= prevPTS;
isPTSOrderRetained = currentOrder == prevOrder;
}
}
}
this.log(
`Video: Initial PTS/DTS adjusted: ${toMsFromMpegTsClock(
firstPTS,
true,
)}/${toMsFromMpegTsClock(
firstDTS,
true,
)}, delta: ${toMsFromMpegTsClock(delta, true)} ms`,
);
}
}
}
firstDTS = Math.max(0, firstDTS);
let nbNalu = 0;
let naluLen = 0;
let dtsStep = firstDTS;
for (let i = 0; i < nbSamples; i++) {
// compute total/avc sample length and nb of NAL units
const sample = inputSamples[i];
const units = sample.units;
const nbUnits = units.length;
let sampleLen = 0;
for (let j = 0; j < nbUnits; j++) {
sampleLen += units[j].data.length;
}
naluLen += sampleLen;
nbNalu += nbUnits;
sample.length = sampleLen;
// ensure sample monotonic DTS
if (sample.dts < dtsStep) {
sample.dts = dtsStep;
dtsStep += (averageSampleDuration / 4) | 0 || 1;
} else {
dtsStep = sample.dts;
}
minPTS = Math.min(sample.pts, minPTS);
maxPTS = Math.max(sample.pts, maxPTS);
}
lastDTS = inputSamples[nbSamples - 1].dts;
    /* concatenate the video data and construct the mdat in place
      (need 8 more bytes to fill length and mdat type) */
const mdatSize = naluLen + 4 * nbNalu + 8;
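    // mdat layout: an 8-byte box header (32-bit size + 'mdat' type), followed by every NAL unit
    // prefixed with a 4-byte length field (AVCC-style framing), hence naluLen + 4 * nbNalu + 8.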
let mdat;
try {
mdat = new Uint8Array(mdatSize);
} catch (err) {
this.observer.emit(Events.ERROR, Events.ERROR, {
type: ErrorTypes.MUX_ERROR,
details: ErrorDetails.REMUX_ALLOC_ERROR,
fatal: false,
error: err,
bytes: mdatSize,
reason: `fail allocating video mdat ${mdatSize}`,
});
return;
}
const view = new DataView(mdat.buffer);
view.setUint32(0, mdatSize);
mdat.set(MP4.types.mdat, 4);
let stretchedLastFrame = false;
let minDtsDelta = Number.POSITIVE_INFINITY;
let minPtsDelta = Number.POSITIVE_INFINITY;
let maxDtsDelta = Number.NEGATIVE_INFINITY;
let maxPtsDelta = Number.NEGATIVE_INFINITY;
for (let i = 0; i < nbSamples; i++) {
const VideoSample = inputSamples[i];
const VideoSampleUnits = VideoSample.units;
let mp4SampleLength = 0;
// convert NALU bitstream to MP4 format (prepend NALU with size field)
for (let j = 0, nbUnits = VideoSampleUnits.length; j < nbUnits; j++) {
const unit = VideoSampleUnits[j];
const unitData = unit.data;
const unitDataLen = unit.data.byteLength;
view.setUint32(offset, unitDataLen);
offset += 4;
mdat.set(unitData, offset);
offset += unitDataLen;
mp4SampleLength += 4 + unitDataLen;
}
// expected sample duration is the Decoding Timestamp diff of consecutive samples
let ptsDelta;
if (i < nbSamples - 1) {
mp4SampleDuration = inputSamples[i + 1].dts - VideoSample.dts;
ptsDelta = inputSamples[i + 1].pts - VideoSample.pts;
} else {
const config = this.config;
const lastFrameDuration =
i > 0
? VideoSample.dts - inputSamples[i - 1].dts
: averageSampleDuration;
ptsDelta =
i > 0
? VideoSample.pts - inputSamples[i - 1].pts
: averageSampleDuration;
if (config.stretchShortVideoTrack && this.nextAudioTs !== null) {
// In some cases, a segment's audio track duration may exceed the video track duration.
// Since we've already remuxed audio, and we know how long the audio track is, we look to
// see if the delta to the next segment is longer than maxBufferHole.
// If so, playback would potentially get stuck, so we artificially inflate
// the duration of the last frame to minimize any potential gap between segments.
const gapTolerance = Math.floor(config.maxBufferHole * timeScale);
const deltaToFrameEnd =
(audioTrackLength
? minPTS + audioTrackLength * timeScale
: this.nextAudioTs + initTime) - VideoSample.pts;
if (deltaToFrameEnd > gapTolerance) {
// We subtract lastFrameDuration from deltaToFrameEnd to try to prevent any video
// frame overlap. maxBufferHole should be >> lastFrameDuration anyway.
mp4SampleDuration = deltaToFrameEnd - lastFrameDuration;
if (mp4SampleDuration < 0) {
mp4SampleDuration = lastFrameDuration;
} else {
stretchedLastFrame = true;
}
this.log(
`It is approximately ${
deltaToFrameEnd / 90
} ms to the next segment; using duration ${
mp4SampleDuration / 90
} ms for the last video frame.`,
);
} else {
mp4SampleDuration = lastFrameDuration;
}
} else {
mp4SampleDuration = lastFrameDuration;
}
}
const compositionTimeOffset = Math.round(
VideoSample.pts - VideoSample.dts,
);
minDtsDelta = Math.min(minDtsDelta, mp4SampleDuration);
maxDtsDelta = Math.max(maxDtsDelta, mp4SampleDuration);
minPtsDelta = Math.min(minPtsDelta, ptsDelta);
maxPtsDelta = Math.max(maxPtsDelta, ptsDelta);
outputSamples.push(
createMp4Sample(
VideoSample.key,
mp4SampleDuration,
mp4SampleLength,
compositionTimeOffset,
),
);
}
if (outputSamples.length) {
if (chromeVersion) {
if (chromeVersion < 70) {
// Chrome workaround, mark first sample as being a Random Access Point (keyframe) to avoid sourcebuffer append issue
// https://code.google.com/p/chromium/issues/detail?id=229412
const flags = outputSamples[0].flags;
flags.dependsOn = 2;
flags.isNonSync = 0;
}
} else if (safariWebkitVersion) {
// Fix for "CNN special report, with CC" in test-streams (Safari browser only)
// Ignore DTS when frame durations are irregular. Safari MSE does not handle this leading to gaps.
if (
maxPtsDelta - minPtsDelta < maxDtsDelta - minDtsDelta &&
averageSampleDuration / maxDtsDelta < 0.025 &&
outputSamples[0].cts === 0
) {
this.warn(
'Found irregular gaps in sample duration. Using PTS instead of DTS to determine MP4 sample duration.',
);
let dts = firstDTS;
for (let i = 0, len = outputSamples.length; i < len; i++) {
const nextDts = dts + outputSamples[i].duration;
const pts = dts + outputSamples[i].cts;
if (i < len - 1) {
const nextPts = nextDts + outputSamples[i + 1].cts;
outputSamples[i].duration = nextPts - pts;
} else {
outputSamples[i].duration = i
? outputSamples[i - 1].duration
: averageSampleDuration;
}
outputSamples[i].cts = 0;
dts = nextDts;
}
}
}
}
// next AVC/HEVC sample DTS should be equal to last sample DTS + last sample duration (in PES timescale)
mp4SampleDuration =
stretchedLastFrame || !mp4SampleDuration
? averageSampleDuration
: mp4SampleDuration;
const endDTS = lastDTS + mp4SampleDuration;
this.nextVideoTs = nextVideoTs = endDTS - initTime;
this.videoSampleDuration = mp4SampleDuration;
this.isVideoContiguous = true;
const moof = MP4.moof(
track.sequenceNumber++,
firstDTS,
Object.assign(track, {
samples: outputSamples,
}),
);
const type: SourceBufferName = 'video';
const data = {
data1: moof,
data2: mdat,
startPTS: (minPTS - initTime) / timeScale,
endPTS: (maxPTS + mp4SampleDuration - initTime) / timeScale,
startDTS: (firstDTS - initTime) / timeScale,
endDTS: nextVideoTs / timeScale,
type,
hasAudio: false,
hasVideo: true,
nb: outputSamples.length,
dropped: track.dropped,
};
track.samples = [];
track.dropped = 0;
return data;
}
getSamplesPerFrame(track: DemuxedAudioTrack) {
switch (track.segmentCodec) {
case 'mp3':
return MPEG_AUDIO_SAMPLE_PER_FRAME;
case 'ac3':
return AC3_SAMPLES_PER_FRAME;
default:
return AAC_SAMPLES_PER_FRAME;
}
}
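  /**
   * Remuxes demuxed audio samples into a single moof + mdat pair, or into a raw MPEG audio stream
   * when the browser can play 'audio/mpeg' directly (rawMPEG). For AAC, frames that overlap the
   * previous fragment may be dropped and gaps may be filled with silent (or duplicated) frames to
   * keep audio and video aligned.
   */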
remuxAudio(
track: DemuxedAudioTrack,
timeOffset: number,
contiguous: boolean,
accurateTimeOffset: boolean,
videoTimeOffset?: number,
): RemuxedTrack | undefined {
const inputTimeScale: number = track.inputTimeScale;
const mp4timeScale: number = track.samplerate
? track.samplerate
: inputTimeScale;
const scaleFactor: number = inputTimeScale / mp4timeScale;
const mp4SampleDuration: number = this.getSamplesPerFrame(track);
const inputSampleDuration: number = mp4SampleDuration * scaleFactor;
const initPTS = this._initPTS as RationalTimestamp;
const rawMPEG: boolean =
track.segmentCodec === 'mp3' && this.typeSupported.mpeg;
const outputSamples: Array<Mp4Sample> = [];
const alignedWithVideo = videoTimeOffset !== undefined;
let inputSamples: Array<AudioSample> = track.samples;
let offset: number = rawMPEG ? 0 : 8;
let nextAudioTs: number = this.nextAudioTs || -1;
// window.audioSamples ? window.audioSamples.push(inputSamples.map(s => s.pts)) : (window.audioSamples = [inputSamples.map(s => s.pts)]);
// for audio samples, also consider consecutive fragments as being contiguous (even if a level switch occurs),
// for sake of clarity:
// consecutive fragments are frags with
// - less than 100ms gaps between new time offset (if accurate) and next expected PTS OR
// - less than 20 audio frames distance
// contiguous fragments are consecutive fragments from same quality level (same level, new SN = old SN + 1)
    // this helps ensure audio continuity
    // and also avoids audio glitches/cuts when switching quality, or reporting a wrong duration on the first audio frame
const initTime = (initPTS.baseTime * inputTimeScale) / initPTS.timescale;
const timeOffsetMpegTS = initTime + timeOffset * inputTimeScale;
this.isAudioContiguous = contiguous =
contiguous ||
((inputSamples.length &&
nextAudioTs > 0 &&
((accurateTimeOffset &&
Math.abs(timeOffsetMpegTS - (nextAudioTs + initTime)) < 9000) ||
Math.abs(
normalizePts(inputSamples[0].pts, timeOffsetMpegTS) -
(nextAudioTs + initTime),
) <
20 * inputSampleDuration)) as boolean);
// compute normalized PTS
inputSamples.forEach(function (sample) {
sample.pts = normalizePts(sample.pts, timeOffsetMpegTS);
});
if (!contiguous || nextAudioTs < 0) {
const sampleCount = inputSamples.length;
      // filter out samples with negative PTS that are not playable anyway
      // if we don't remove these negative samples, they will shift all audio samples forward,
      // leading to audio overlap between the current and next fragment
inputSamples = inputSamples.filter((sample) => sample.pts >= 0);
if (sampleCount !== inputSamples.length) {
this.warn(
          `Removed ${sampleCount - inputSamples.length} of ${sampleCount} samples (initPTS ${initTime} / ${inputTimeScale})`,
);
}
// in case all samples have negative PTS, and have been filtered out, return now
if (!inputSamples.length) {
return;
}
if (videoTimeOffset === 0) {
// Set the start to match video so that start gaps larger than inputSampleDuration are filled with silence
nextAudioTs = 0;
} else if (accurateTimeOffset && !alignedWithVideo) {
// When not seeking, not live, and LevelDetails.PTSKnown, use fragment start as predicted next audio PTS
nextAudioTs = Math.max(0, timeOffsetMpegTS - initTime);
} else {
        // if frags are not contiguous and we can't trust the time offset, let's use the first sample PTS as the next audio PTS
nextAudioTs = inputSamples[0].pts - initTime;
}
}
// If the audio track is missing samples, the frames seem to get "left-shifted" within the
// resulting mp4 segment, causing sync issues and leaving gaps at the end of the audio segment.
// In an effort to prevent this from happening, we inject frames here where there are gaps.
// When possible, we inject a silent frame; when that's not possible, we duplicate the last
// frame.
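    // Illustrative numbers (not from the original source): with a 48 kHz AAC track and the 90 kHz
    // MPEG-TS clock, inputSampleDuration = 1024 * (90000 / 48000) = 1920 ticks, so a gap of
    // delta = 5760 ticks (64 ms, three frame durations) beyond the maxAudioFramesDrift threshold
    // results in Math.round(5760 / 1920) = 3 injected frames.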
if (track.segmentCodec === 'aac') {
const maxAudioFramesDrift = this.config.maxAudioFramesDrift;
for (
let i = 0, nextPts = nextAudioTs + initTime;
i < inputSamples.length;
i++
) {
// First, let's see how far off this frame is from where we expect it to be
const sample = inputSamples[i];
const pts = sample.pts;
const delta = pts - nextPts;
const duration = Math.abs((1000 * delta) / inputTimeScale);
// When remuxing with video, if we're overlapping by more than a duration, drop this sample to stay in sync
if (
delta <= -maxAudioFramesDrift * inputSampleDuration &&
alignedWithVideo
) {
if (i === 0) {
this.warn(
`Audio frame @ ${(pts / inputTimeScale).toFixed(
3,
)}s overlaps marker by ${Math.round(
(1000 * delta) / inputTimeScale,
)} ms.`,
);
this.nextAudioTs = nextAudioTs = pts - initTime;
nextPts = pts;
}
} // eslint-disable-line brace-style
        // Insert missing frames if:
        // 1: We're more than maxAudioFramesDrift frames away
        // 2: Not more than MAX_SILENT_FRAME_DURATION away
        // 3: remuxing with video (videoTimeOffset !== undefined)
else if (
delta >= maxAudioFramesDrift * inputSampleDuration &&
duration < MAX_SILENT_FRAME_DURATION &&
alignedWithVideo
) {
let missing = Math.round(delta / inputSampleDuration);
// Adjust nextPts so that silent samples are aligned with media pts. This will prevent media samples from
// later being shifted if nextPts is based on timeOffset and delta is not a multiple of inputSampleDuration.
nextPts = pts - missing * inputSampleDuration;
while (nextPts < 0 && missing && inputSampleDuration) {
missing--;
nextPts += inputSampleDuration;
}
if (i === 0) {
this.nextAudioTs = nextAudioTs = nextPts - initTime;
}
this.warn(
`Injecting ${missing} audio frames @ ${(
(nextPts - initTime) /
inputTimeScale
).toFixed(3)}s due to ${Math.round(
(1000 * delta) / inputTimeScale,
)} ms gap.`,
);
for (let j = 0; j < missing; j++) {
let fillFrame = AAC.getSilentFrame(
track.parsedCodec || track.manifestCodec || track.codec,
track.channelCount,
);
if (!fillFrame) {
this.log(
'Unable to get silent frame for given audio codec; duplicating last frame instead.',
);
fillFrame = sample.unit.subarray();
}
inputSamples.splice(i, 0, {
unit: fillFrame,
pts: nextPts,
});
nextPts += inputSampleDuration;
i++;
}
}
sample.pts = nextPts;
nextPts += inputSampleDuration;
}
}
let firstPTS: number | null = null;
let lastPTS: number | null = null;
let mdat: any;
let mdatSize: number = 0;
let sampleLength: number = inputSamples.length;
while (sampleLength--) {
mdatSize += inputSamples[sampleLength].unit.byteLength;
}
for (let j = 0, nbSamples = inputSamples.length; j < nbSamples; j++) {
const audioSample = inputSamples[j];
const unit = audioSample.unit;
let pts = audioSample.pts;
if (lastPTS !== null) {
// If we have more than one sample, set the duration of the sample to the "real" duration; the PTS diff with
// the previous sample
const prevSample = outputSamples[j - 1];
prevSample.duration = Math.round((pts - lastPTS) / scaleFactor);
} else {
if (contiguous && track.segmentCodec === 'aac') {
// set PTS/DTS to expected PTS/DTS
pts = nextAudioTs + initTime;
}
// remember first PTS of our audioSamples
firstPTS = pts;
if (mdatSize > 0) {
/* concatenate the audio data and construct the mdat in place
(need 8 more bytes to fill length and mdat type) */
mdatSize += offset;
try {
mdat = new Uint8Array(mdatSize);
} catch (err) {
this.observer.emit(Events.ERROR, Events.ERROR, {
type: ErrorTypes.MUX_ERROR,
details: ErrorDetails.REMUX_ALLOC_ERROR,
fatal: false,
error: err,
bytes: mdatSize,
reason: `fail allocating audio mdat ${mdatSize}`,
});
return;
}
if (!rawMPEG) {
const view = new DataView(mdat.buffer);
view.setUint32(0, mdatSize);
mdat.set(MP4.types.mdat, 4);
}
} else {
// no audio samples
return;
}
}
mdat.set(unit, offset);
const unitLen = unit.byteLength;
offset += unitLen;
      // Default the sample's duration to the computed mp4SampleDuration, which will be 1024 for AAC, 1152 for MP3, or 1536 for AC-3.
      // In the case that we have 1 sample, this will be the duration. If we have more than one sample, the duration
      // becomes the PTS diff with the previous sample.
outputSamples.push(createMp4Sample(true, mp4SampleDuration, unitLen, 0));
lastPTS = pts;
}
// We could end up with no audio samples if all input samples were overlapping with the previously remuxed ones
const nbSamples = outputSamples.length;
if (!nbSamples) {
return;
}
// The next audio sample PTS should be equal to last sample PTS + duration
const lastSample = outputSamples[outputSamples.length - 1];
nextAudioTs = (lastPTS as number) - initTime;
this.nextAudioTs = nextAudioTs + scaleFactor * lastSample.duration;
// Set the track samples from inputSamples to outputSamples before remuxing
const moof = rawMPEG
? new Uint8Array(0)
: MP4.moof(
track.sequenceNumber++,
firstPTS! / scaleFactor,
Object.assign({}, track, { samples: outputSamples }),
);
// Clear the track samples. This also clears the samples array in the demuxer, since the reference is shared
track.samples = [];
const start = (firstPTS! - initTime) / inputTimeScale;
const end = this.nextAudioTs / inputTimeScale;
const type: SourceBufferName = 'audio';
const audioData = {
data1: moof,
data2: mdat,
startPTS: start,
endPTS: end,
startDTS: start,
endDTS: end,
type,
hasAudio: true,
hasVideo: false,
nb: nbSamples,
};
this.isAudioContiguous = true;
return audioData;
}
}
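/**
 * Maps a 33-bit MPEG-TS timestamp into the same rollover period as `reference` by adding or
 * subtracting multiples of 2^33 until the two values are within 2^32 of each other.
 *
 * Worked example (illustrative): normalizePts(1000, 8589929592) returns
 * 1000 + 8589934592 = 8589935592, placing a timestamp that wrapped past 2^33 six thousand ticks
 * after the reference rather than roughly 2^33 before it.
 */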
export function normalizePts(value: number, reference: number | null): number {
let offset;
if (reference === null) {
return value;
}
if (reference < value) {
// - 2^33
offset = -8589934592;
} else {
// + 2^33
offset = 8589934592;
}
  /* PTS is 33 bit (from 0 to 2^33 - 1)
     if the diff between value and reference is bigger than half of the amplitude (2^32), it means that
     PTS looping occurred: fill the gap */
while (Math.abs(value - reference) > 4294967296) {
value += offset;
}
return value;
}
function findKeyframeIndex(samples: Array<VideoSample>): number {
for (let i = 0; i < samples.length; i++) {
if (samples[i].key) {
return i;
}
}
return -1;
}
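/**
 * Converts ID3 metadata sample timestamps from the MPEG-TS clock into seconds on the media
 * timeline (subtracting initPTS/initDTS and normalizing 33-bit rollover around timeOffset),
 * then returns the samples and clears the track.
 */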
export function flushTextTrackMetadataCueSamples(
track: DemuxedMetadataTrack,
timeOffset: number,
initPTS: TimestampOffset,
initDTS: TimestampOffset,
): RemuxedMetadata | undefined {
const length = track.samples.length;
if (!length) {
return;
}
const inputTimeScale = track.inputTimeScale;
for (let index = 0; index < length; index++) {
const sample = track.samples[index];
// setting id3 pts, dts to relative time
// using this._initPTS and this._initDTS to calculate relative time
sample.pts =
normalizePts(
sample.pts - (initPTS.baseTime * inputTimeScale) / initPTS.timescale,
timeOffset * inputTimeScale,
) / inputTimeScale;
sample.dts =
normalizePts(
sample.dts - (initDTS.baseTime * inputTimeScale) / initDTS.timescale,
timeOffset * inputTimeScale,
) / inputTimeScale;
}
const samples = track.samples;
track.samples = [];
return {
samples,
};
}
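/**
 * Same conversion as above for user-data text samples, using initPTS only; samples are sorted by
 * PTS before being returned.
 */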
export function flushTextTrackUserdataCueSamples(
track: DemuxedUserdataTrack,
timeOffset: number,
initPTS: RationalTimestamp,
): RemuxedUserdata | undefined {
const length = track.samples.length;
if (!length) {
return;
}
const inputTimeScale = track.inputTimeScale;
for (let index = 0; index < length; index++) {
const sample = track.samples[index];
// setting text pts, dts to relative time
// using this._initPTS and this._initDTS to calculate relative time
sample.pts =
normalizePts(
sample.pts - (initPTS.baseTime * inputTimeScale) / initPTS.timescale,
timeOffset * inputTimeScale,
) / inputTimeScale;
}
track.samples.sort((a, b) => a.pts - b.pts);
const samples = track.samples;
track.samples = [];
return {
samples,
};
}