/*! @license
 * Shaka Player
 * Copyright 2016 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

goog.provide('shaka.cea.Mp4CeaParser');

goog.require('goog.asserts');
goog.require('shaka.cea.CeaUtils');
goog.require('shaka.cea.SeiProcessor');
goog.require('shaka.log');
goog.require('shaka.media.ClosedCaptionParser');
goog.require('shaka.util.DataViewReader');
goog.require('shaka.util.Error');
goog.require('shaka.util.Mp4Parser');
goog.require('shaka.util.Mp4BoxParsers');


/**
 * MPEG4 stream parser used for extracting 708 closed captions data.
 * @implements {shaka.extern.ICeaParser}
 * @export
 */
shaka.cea.Mp4CeaParser = class {
  /** */
  constructor() {
    /**
     * SEI data processor.
     * @private
     * @const {!shaka.cea.SeiProcessor}
     */
    this.seiProcessor_ = new shaka.cea.SeiProcessor();

    /**
     * Map of track id to corresponding timescale.
     * @private {!Map<number, number>}
     */
    this.trackIdToTimescale_ = new Map();

    /**
     * Default sample duration, as specified by the TREX box.
     * @private {number}
     */
    this.defaultSampleDuration_ = 0;

    /**
     * Default sample size, as specified by the TREX box.
     * @private {number}
     */
    this.defaultSampleSize_ = 0;

    /**
     * @private {shaka.cea.Mp4CeaParser.BitstreamFormat}
     */
    this.bitstreamFormat_ = shaka.cea.Mp4CeaParser.BitstreamFormat.UNKNOWN;
  }

  /**
   * Parses the init segment. Gets Default Sample Duration and Size from the
   * TREX box, and constructs a map of Track IDs to timescales. Each TRAK box
   * contains a track header (TKHD) containing track ID, and a media header box
   * (MDHD) containing the timescale for the track
   * @override
   */
  init(initSegment) {
    const Mp4Parser = shaka.util.Mp4Parser;
    const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;
    const trackIds = [];
    const timescales = [];

    const codecBoxParser = (box) => this.setBitstreamFormat_(box.name);

    new Mp4Parser()
        .box('moov', Mp4Parser.children)
        .box('mvex', Mp4Parser.children)
        .fullBox('trex', (box) => {
          const parsedTREXBox = shaka.util.Mp4BoxParsers.parseTREX(
              box.reader);

          this.defaultSampleDuration_ = parsedTREXBox.defaultSampleDuration;
          this.defaultSampleSize_ = parsedTREXBox.defaultSampleSize;
        })
        .box('trak', Mp4Parser.children)
        .fullBox('tkhd', (box) => {
          goog.asserts.assert(
              box.version != null,
              'TKHD is a full box and should have a valid version.');
          const parsedTKHDBox = shaka.util.Mp4BoxParsers.parseTKHD(
              box.reader, box.version);
          trackIds.push(parsedTKHDBox.trackId);
        })
        .box('mdia', Mp4Parser.children)
        .fullBox('mdhd', (box) => {
          goog.asserts.assert(
              box.version != null,
              'MDHD is a full box and should have a valid version.');
          const parsedMDHDBox = shaka.util.Mp4BoxParsers.parseMDHD(
              box.reader, box.version);
          timescales.push(parsedMDHDBox.timescale);
        })
        .box('minf', Mp4Parser.children)
        .box('stbl', Mp4Parser.children)
        .fullBox('stsd', Mp4Parser.sampleDescription)

        // These are the various boxes that signal a codec.
        .box('avc1', codecBoxParser)
        .box('avc3', codecBoxParser)
        .box('dvav', codecBoxParser)
        .box('dva1', codecBoxParser)
        .box('hev1', codecBoxParser)
        .box('hvc1', codecBoxParser)
        .box('dvh1', codecBoxParser)
        .box('dvhe', codecBoxParser)
        .box('vvc1', codecBoxParser)
        .box('vvi1', codecBoxParser)
        .box('dvc1', codecBoxParser)
        .box('dvi1', codecBoxParser)

        // This signals an encrypted sample, which we can go inside of to find
        // the codec used.
        .box('encv', Mp4Parser.visualSampleEntry)
        .box('sinf', Mp4Parser.children)
        .box('frma', (box) => {
          const {codec} = shaka.util.Mp4BoxParsers.parseFRMA(box.reader);
          this.setBitstreamFormat_(codec);
        })

        .parse(initSegment, /* partialOkay= */ true);

    // At least one track should exist, and each track should have a
    // corresponding Id in TKHD box, and timescale in its MDHD box.
    if (!trackIds.length || !timescales.length ||
        trackIds.length != timescales.length) {
      throw new shaka.util.Error(
          shaka.util.Error.Severity.CRITICAL,
          shaka.util.Error.Category.TEXT,
          shaka.util.Error.Code.INVALID_MP4_CEA);
    }

    if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
      shaka.log.alwaysWarn(
          'Unable to determine bitstream format for CEA parsing!');
    }

    // Populate the map from track Id to timescale.
    trackIds.forEach((trackId, idx) => {
      this.trackIdToTimescale_.set(trackId, timescales[idx]);
    });
  }

  /**
   * Parses each video segment. In fragmented MP4s, MOOF and MDAT come in
   * pairs. The following logic gets the necessary info from MOOFs to parse
   * MDATs (base media decode time, sample sizes/offsets/durations, etc),
   * and then parses the MDAT boxes for CEA-708 packets using this
   * information. The parsed CEA-708 packets are returned.
   * @override
   */
  parse(mediaSegment) {
    const Mp4Parser = shaka.util.Mp4Parser;
    const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;

    if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
      // We don't know how to extract SEI from this.
      return [];
    }

    /** @type {!Array<!shaka.extern.ICeaParser.CaptionPacket>} **/
    const captionPackets = [];

    let moofOffset = 0;

    /** @type {!Array<!shaka.cea.Mp4CeaParser.ParsedTRAF>} */
    let parsedTRAFs = [];

    new Mp4Parser()
        .box('moof', (box) => {
          moofOffset = box.start;
          // traf box parsing is reset on each moof.
          parsedTRAFs = [];
          Mp4Parser.children(box);
        })
        .box('traf', (box) => {
          parsedTRAFs.push({
            baseMediaDecodeTime: null,
            defaultSampleDuration: this.defaultSampleDuration_,
            defaultSampleSize: this.defaultSampleSize_,
            parsedTRUNs: [],
            timescale: shaka.cea.CeaUtils.DEFAULT_TIMESCALE_VALUE,
          });
          Mp4Parser.children(box);
        })
        .fullBox('trun', (box) => {
          goog.asserts.assert(
              box.version != null && box.flags != null,
              'TRUN is a full box and should have a valid version & flags.');

          const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];

          const parsedTRUN = shaka.util.Mp4BoxParsers.parseTRUN(
              box.reader, box.version, box.flags);
          lastTRAF.parsedTRUNs.push(parsedTRUN);
        })
        .fullBox('tfhd', (box) => {
          goog.asserts.assert(
              box.flags != null,
              'TFHD is a full box and should have valid flags.');

          const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];

          const parsedTFHD = shaka.util.Mp4BoxParsers.parseTFHD(
              box.reader, box.flags);
          // If specified, defaultSampleDuration and defaultSampleSize
          // override the ones specified in the TREX box.
          lastTRAF.defaultSampleDuration = parsedTFHD.defaultSampleDuration ||
              this.defaultSampleDuration_;
          lastTRAF.defaultSampleSize = parsedTFHD.defaultSampleSize ||
              this.defaultSampleSize_;

          const trackId = parsedTFHD.trackId;

          // Get the timescale from the track Id.
          if (this.trackIdToTimescale_.has(trackId)) {
            lastTRAF.timescale = this.trackIdToTimescale_.get(trackId);
          }
        })
        .fullBox('tfdt', (box) => {
          goog.asserts.assert(
              box.version != null,
              'TFDT is a full box and should have a valid version.');

          const lastTRAF = parsedTRAFs[parsedTRAFs.length - 1];

          const parsedTFDT = shaka.util.Mp4BoxParsers.parseTFDTInaccurate(
              box.reader, box.version);
          lastTRAF.baseMediaDecodeTime = parsedTFDT.baseMediaDecodeTime;
        })
        .box('mdat', (box) => {
          const offset = moofOffset - box.start - 8;
          const initialPosition = box.reader.getPosition();
          for (const parsedTRAF of parsedTRAFs) {
            if (parsedTRAF.baseMediaDecodeTime === null) {
              // This field should have been populated by the Base Media
              // Decode Time in the tfdt box.
              shaka.log.alwaysWarn(
                  'Unable to find base media decode time for CEA captions!');
              throw new shaka.util.Error(
                  shaka.util.Error.Severity.CRITICAL,
                  shaka.util.Error.Category.TEXT,
                  shaka.util.Error.Code.INVALID_MP4_CEA);
            }

            box.reader.seek(initialPosition);
            this.parseMdat_(box.reader,
                parsedTRAF.baseMediaDecodeTime,
                parsedTRAF.timescale,
                parsedTRAF.defaultSampleDuration,
                parsedTRAF.defaultSampleSize,
                offset,
                parsedTRAF.parsedTRUNs,
                captionPackets);
          }
        })
        .parse(mediaSegment, /* partialOkay= */ false);

    return captionPackets;
  }

  /**
   * Parse MDAT box.
   * @param {!shaka.util.DataViewReader} reader
   * @param {number} time
   * @param {number} timescale
   * @param {number} defaultSampleDuration
   * @param {number} defaultSampleSize
   * @param {number} offset
   * @param {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs
   * @param {!Array<!shaka.extern.ICeaParser.CaptionPacket>} captionPackets
   * @private
   */
  parseMdat_(reader, time, timescale, defaultSampleDuration,
      defaultSampleSize, offset, parsedTRUNs, captionPackets) {
    const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;
    const CeaUtils = shaka.cea.CeaUtils;
    let sampleIndex = 0;

    // The fields in each ParsedTRUNSample contained in the sampleData
    // array are nullable. In the case of sample data and sample duration,
    // we use the defaults provided by the TREX/TFHD boxes. For sample
    // composition time offset, we default to 0.
    let sampleSize = defaultSampleSize;

    // Combine all sample data. This assumes that the samples described
    // across multiple trun boxes are still continuous in the mdat box.
    const sampleDatas = parsedTRUNs.map((t) => t.sampleData);
    const sampleData = [].concat(...sampleDatas);

    if (sampleData.length) {
      sampleSize = sampleData[0].sampleSize || defaultSampleSize;
    }

    reader.skip(offset + parsedTRUNs[0].dataOffset);

    while (reader.hasMoreData()) {
      const naluSize = reader.readUint32();
      const naluHeader = reader.readUint8();
      let naluType = null;
      let isSeiMessage = false;
      let naluHeaderSize = 1;

      goog.asserts.assert(this.bitstreamFormat_ != BitstreamFormat.UNKNOWN,
          'Bitstream format should have been checked before now!');
      switch (this.bitstreamFormat_) {
        case BitstreamFormat.H264:
          naluType = naluHeader & 0x1f;
          isSeiMessage = naluType == CeaUtils.H264_NALU_TYPE_SEI;
          break;

        case BitstreamFormat.H265:
          naluHeaderSize = 2;
          reader.skip(1);
          naluType = (naluHeader >> 1) & 0x3f;
          isSeiMessage =
              naluType == CeaUtils.H265_PREFIX_NALU_TYPE_SEI ||
              naluType == CeaUtils.H265_SUFFIX_NALU_TYPE_SEI;
          break;

        case BitstreamFormat.H266:
          naluHeaderSize = 2;
          reader.skip(1);
          naluType = (naluHeader >> 1) & 0x3f;
          isSeiMessage =
              naluType == CeaUtils.H266_PREFIX_NALU_TYPE_SEI ||
              naluType == CeaUtils.H266_SUFFIX_NALU_TYPE_SEI;
          break;

        default:
          return;
      }

      if (isSeiMessage) {
        let timeOffset = 0;

        if (sampleIndex < sampleData.length) {
          timeOffset =
              sampleData[sampleIndex].sampleCompositionTimeOffset || 0;
        }

        const pts = (time + timeOffset) / timescale;
        for (const packet of this.seiProcessor_
            .process(reader.readBytes(naluSize - naluHeaderSize))) {
          captionPackets.push({
            packet,
            pts,
          });
        }
      } else {
        try {
          reader.skip(naluSize - naluHeaderSize);
        } catch (e) {
          // Ignore this error; otherwise it can break the start of playback
          // even when the user does not want to see the subtitles.
          break;
        }
      }

      sampleSize -= (naluSize + 4);
      if (sampleSize == 0) {
        if (sampleIndex < sampleData.length) {
          time += sampleData[sampleIndex].sampleDuration ||
              defaultSampleDuration;
        } else {
          time += defaultSampleDuration;
        }

        sampleIndex++;

        if (sampleIndex < sampleData.length) {
          sampleSize = sampleData[sampleIndex].sampleSize || defaultSampleSize;
        } else {
          sampleSize = defaultSampleSize;
        }
      }
    }
  }

  /**
   * @param {string} codec A fourcc for a codec.
   * @private
   */
  setBitstreamFormat_(codec) {
    if (shaka.cea.Mp4CeaParser.CodecBitstreamMap_.has(codec)) {
      this.bitstreamFormat_ =
          shaka.cea.Mp4CeaParser.CodecBitstreamMap_.get(codec);
    }
  }
};

/** @enum {number} */
shaka.cea.Mp4CeaParser.BitstreamFormat = {
  UNKNOWN: 0,
  H264: 1,
  H265: 2,
  H266: 3,
};

/** @private {Map<string, shaka.cea.Mp4CeaParser.BitstreamFormat>} */
shaka.cea.Mp4CeaParser.CodecBitstreamMap_ = new Map()
    // AVC
    .set('avc1', shaka.cea.Mp4CeaParser.BitstreamFormat.H264)
    .set('avc3', shaka.cea.Mp4CeaParser.BitstreamFormat.H264)
    // Dolby Vision based in AVC
    .set('dvav', shaka.cea.Mp4CeaParser.BitstreamFormat.H264)
    .set('dva1', shaka.cea.Mp4CeaParser.BitstreamFormat.H264)
    // HEVC
    .set('hev1', shaka.cea.Mp4CeaParser.BitstreamFormat.H265)
    .set('hvc1', shaka.cea.Mp4CeaParser.BitstreamFormat.H265)
    // Dolby Vision based in HEVC
    .set('dvh1', shaka.cea.Mp4CeaParser.BitstreamFormat.H265)
    .set('dvhe', shaka.cea.Mp4CeaParser.BitstreamFormat.H265)
    // VVC
    .set('vvc1', shaka.cea.Mp4CeaParser.BitstreamFormat.H266)
    .set('vvi1', shaka.cea.Mp4CeaParser.BitstreamFormat.H266)
    // Dolby Vision based in VVC
    .set('dvc1', shaka.cea.Mp4CeaParser.BitstreamFormat.H266)
    .set('dvi1', shaka.cea.Mp4CeaParser.BitstreamFormat.H266);

/**
 * @typedef {{
 *   baseMediaDecodeTime: ?number,
 *   defaultSampleDuration: number,
 *   defaultSampleSize: number,
 *   parsedTRUNs: !Array<shaka.util.ParsedTRUNBox>,
 *   timescale: number
 * }}
 *
 * @property {?number} baseMediaDecodeTime
 * @property {number} defaultSampleDuration
 * @property {number} defaultSampleSize
 * @property {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs
 * @property {?number} timescale
 */
shaka.cea.Mp4CeaParser.ParsedTRAF;

shaka.media.ClosedCaptionParser.registerParser('video/mp4',
    () => new shaka.cea.Mp4CeaParser());
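
// Usage sketch (illustrative, not part of the library): the parser is normally
// created through the ClosedCaptionParser registration above, but it can also
// be driven directly. `initSegmentBuffer` and `mediaSegmentBuffer` below are
// assumed names for BufferSource objects holding an fMP4 init segment and a
// matching media segment of an AVC/HEVC/VVC video track.
//
//   const parser = new shaka.cea.Mp4CeaParser();
//   parser.init(initSegmentBuffer);
//   for (const caption of parser.parse(mediaSegmentBuffer)) {
//     // caption.packet holds the CEA-708 caption packet bytes and
//     // caption.pts its presentation time in seconds.
//   }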