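Update layer indices for non-flexible mode according to updates in the RTP payload profile.

In non-flexible mode (F=0) the first layer-indices byte now carries T (temporal index), U (temporal up-switch bit), S (spatial index) and D (inter-layer predicted bit), the same layout as flexible mode, instead of GOF_IDX, S and D. TL0PICIDX still follows as the second byte. The Vp9SsMap member and its calls are removed from VCMJitterBuffer.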

https://tools.ietf.org/id/draft-ietf-payload-vp9-01.txt

BUG=chromium:500602
TBR=stefan@webrtc.org

Review URL: https://codereview.webrtc.org/1426813002

Cr-Commit-Position: refs/heads/master@{#10522}
diff --git a/webrtc/modules/rtp_rtcp/source/rtp_format_vp9.cc b/webrtc/modules/rtp_rtcp/source/rtp_format_vp9.cc
index 0e76a8e..c89009b 100644
--- a/webrtc/modules/rtp_rtcp/source/rtp_format_vp9.cc
+++ b/webrtc/modules/rtp_rtcp/source/rtp_format_vp9.cc
@@ -47,10 +47,6 @@
   return (hdr.tl0_pic_idx == kNoTl0PicIdx) ? def : hdr.tl0_pic_idx;
 }
 
-uint8_t GofIdxField(const RTPVideoHeaderVP9& hdr, uint8_t def) {
-  return (hdr.gof_idx == kNoGofIdx) ? def : hdr.gof_idx;
-}
-
 // Picture ID:
 //
 //      +-+-+-+-+-+-+-+-+
@@ -74,19 +70,17 @@
 // Flexible mode (F=1):     Non-flexible mode (F=0):
 //
 //      +-+-+-+-+-+-+-+-+   +-+-+-+-+-+-+-+-+
-// L:   |  T  |U|  S  |D|   |GOF_IDX|  S  |D|
+// L:   |  T  |U|  S  |D|   |  T  |U|  S  |D|
 //      +-+-+-+-+-+-+-+-+   +-+-+-+-+-+-+-+-+
 //                          |   TL0PICIDX   |
 //                          +-+-+-+-+-+-+-+-+
 //
 size_t LayerInfoLength(const RTPVideoHeaderVP9& hdr) {
-  if (hdr.flexible_mode) {
-    return (hdr.temporal_idx == kNoTemporalIdx &&
-            hdr.spatial_idx == kNoSpatialIdx) ? 0 : 1;
-  } else {
-    return (hdr.gof_idx == kNoGofIdx &&
-            hdr.spatial_idx == kNoSpatialIdx) ? 0 : 2;
+  if (hdr.temporal_idx == kNoTemporalIdx &&
+      hdr.spatial_idx == kNoSpatialIdx) {
+    return 0;
   }
+  return hdr.flexible_mode ? 1 : 2;
 }
 
 bool LayerInfoPresent(const RTPVideoHeaderVP9& hdr) {
@@ -198,8 +192,8 @@
 // L:   |  T  |U|  S  |D|
 //      +-+-+-+-+-+-+-+-+
 //
-bool WriteLayerInfoFlexibleMode(const RTPVideoHeaderVP9& vp9,
-                                rtc::BitBufferWriter* writer) {
+bool WriteLayerInfoCommon(const RTPVideoHeaderVP9& vp9,
+                          rtc::BitBufferWriter* writer) {
   RETURN_FALSE_ON_ERROR(writer->WriteBits(TemporalIdxField(vp9, 0), 3));
   RETURN_FALSE_ON_ERROR(writer->WriteBits(vp9.temporal_up_switch ? 1 : 0, 1));
   RETURN_FALSE_ON_ERROR(writer->WriteBits(SpatialIdxField(vp9, 0), 3));
@@ -210,27 +204,26 @@
 // Non-flexible mode (F=0):
 //
 //      +-+-+-+-+-+-+-+-+
-// L:   |GOF_IDX|  S  |D|
+// L:   |  T  |U|  S  |D|
 //      +-+-+-+-+-+-+-+-+
 //      |   TL0PICIDX   |
 //      +-+-+-+-+-+-+-+-+
 //
 bool WriteLayerInfoNonFlexibleMode(const RTPVideoHeaderVP9& vp9,
                                    rtc::BitBufferWriter* writer) {
-  RETURN_FALSE_ON_ERROR(writer->WriteBits(GofIdxField(vp9, 0), 4));
-  RETURN_FALSE_ON_ERROR(writer->WriteBits(SpatialIdxField(vp9, 0), 3));
-  RETURN_FALSE_ON_ERROR(writer->WriteBits(vp9.inter_layer_predicted ? 1: 0, 1));
   RETURN_FALSE_ON_ERROR(writer->WriteUInt8(Tl0PicIdxField(vp9, 0)));
   return true;
 }
 
 bool WriteLayerInfo(const RTPVideoHeaderVP9& vp9,
                     rtc::BitBufferWriter* writer) {
-  if (vp9.flexible_mode) {
-    return WriteLayerInfoFlexibleMode(vp9, writer);
-  } else {
-    return WriteLayerInfoNonFlexibleMode(vp9, writer);
-  }
+  if (!WriteLayerInfoCommon(vp9, writer))
+    return false;
+
+  if (vp9.flexible_mode)
+    return true;
+
+  return WriteLayerInfoNonFlexibleMode(vp9, writer);
 }
 
 // Reference indices:
@@ -337,8 +330,7 @@
 // L:   |  T  |U|  S  |D|
 //      +-+-+-+-+-+-+-+-+
 //
-bool ParseLayerInfoFlexibleMode(rtc::BitBuffer* parser,
-                                RTPVideoHeaderVP9* vp9) {
+bool ParseLayerInfoCommon(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) {
   uint32_t t, u_bit, s, d_bit;
   RETURN_FALSE_ON_ERROR(parser->ReadBits(&t, 3));
   RETURN_FALSE_ON_ERROR(parser->ReadBits(&u_bit, 1));
@@ -354,32 +346,27 @@
 // Layer indices (non-flexible mode):
 //
 //      +-+-+-+-+-+-+-+-+
-// L:   |GOF_IDX|  S  |D|
+// L:   |  T  |U|  S  |D|
 //      +-+-+-+-+-+-+-+-+
 //      |   TL0PICIDX   |
 //      +-+-+-+-+-+-+-+-+
 //
 bool ParseLayerInfoNonFlexibleMode(rtc::BitBuffer* parser,
                                    RTPVideoHeaderVP9* vp9) {
-  uint32_t gof_idx, s, d_bit;
   uint8_t tl0picidx;
-  RETURN_FALSE_ON_ERROR(parser->ReadBits(&gof_idx, 4));
-  RETURN_FALSE_ON_ERROR(parser->ReadBits(&s, 3));
-  RETURN_FALSE_ON_ERROR(parser->ReadBits(&d_bit, 1));
   RETURN_FALSE_ON_ERROR(parser->ReadUInt8(&tl0picidx));
-  vp9->gof_idx = gof_idx;
-  vp9->spatial_idx = s;
-  vp9->inter_layer_predicted = d_bit ? true : false;
   vp9->tl0_pic_idx = tl0picidx;
   return true;
 }
 
 bool ParseLayerInfo(rtc::BitBuffer* parser, RTPVideoHeaderVP9* vp9) {
-  if (vp9->flexible_mode) {
-    return ParseLayerInfoFlexibleMode(parser, vp9);
-  } else {
-    return ParseLayerInfoNonFlexibleMode(parser, vp9);
-  }
+  if (!ParseLayerInfoCommon(parser, vp9))
+    return false;
+
+  if (vp9->flexible_mode)
+    return true;
+
+  return ParseLayerInfoNonFlexibleMode(parser, vp9);
 }
 
 // Reference indices:
@@ -604,7 +591,7 @@
 //      +-+-+-+-+-+-+-+-+
 // M:   | EXTENDED PID  | (RECOMMENDED)
 //      +-+-+-+-+-+-+-+-+
-// L:   |GOF_IDX|  S  |D| (CONDITIONALLY RECOMMENDED)
+// L:   |  T  |U|  S  |D| (CONDITIONALLY RECOMMENDED)
 //      +-+-+-+-+-+-+-+-+
 //      |   TL0PICIDX   | (CONDITIONALLY REQUIRED)
 //      +-+-+-+-+-+-+-+-+
diff --git a/webrtc/modules/rtp_rtcp/source/rtp_format_vp9_unittest.cc b/webrtc/modules/rtp_rtcp/source/rtp_format_vp9_unittest.cc
index 66ab5cd..242746d 100644
--- a/webrtc/modules/rtp_rtcp/source/rtp_format_vp9_unittest.cc
+++ b/webrtc/modules/rtp_rtcp/source/rtp_format_vp9_unittest.cc
@@ -112,7 +112,7 @@
 //        +-+-+-+-+-+-+-+-+
 //   M:   | EXTENDED PID  | (RECOMMENDED)
 //        +-+-+-+-+-+-+-+-+
-//   L:   |GOF_IDX|  S  |D| (CONDITIONALLY RECOMMENDED)
+//   L:   |  T  |U|  S  |D| (CONDITIONALLY RECOMMENDED)
 //        +-+-+-+-+-+-+-+-+
 //        |   TL0PICIDX   | (CONDITIONALLY REQUIRED)
 //        +-+-+-+-+-+-+-+-+
@@ -255,7 +255,8 @@
   const size_t kFrameSize = 30;
   const size_t kPacketSize = 25;
 
-  expected_.gof_idx = 3;
+  expected_.temporal_idx = 3;
+  expected_.temporal_up_switch = true;  // U
   expected_.num_spatial_layers = 3;
   expected_.spatial_idx = 2;
   expected_.inter_layer_predicted = true;  // D
@@ -264,9 +265,9 @@
 
   // Two packets:
   //    | I:0, P:0, L:1, F:0, B:1, E:0, V:0 | (3hdr + 15 payload)
-  // L: | GOF_IDX:3, S:2, D:1 | TL0PICIDX:117 |
+  // L: | T:3, U:1, S:2, D:1 | TL0PICIDX:117 |
   //    | I:0, P:0, L:1, F:0, B:0, E:1, V:0 | (3hdr + 15 payload)
-  // L: | GOF_IDX:3, S:2, D:1 | TL0PICIDX:117 |
+  // L: | T:3, U:1, S:2, D:1 | TL0PICIDX:117 |
   const size_t kExpectedHdrSizes[] = {3, 3};
   const size_t kExpectedSizes[] = {18, 18};
   const size_t kExpectedNum = GTEST_ARRAY_SIZE_(kExpectedSizes);
@@ -505,16 +506,20 @@
 
 TEST_F(RtpDepacketizerVp9Test, ParseLayerInfoWithNonFlexibleMode) {
   const uint8_t kHeaderLength = 3;
-  const uint8_t kGofIdx = 7;
+  const uint8_t kTemporalIdx = 2;
+  const uint8_t kUbit = 1;
   const uint8_t kSpatialIdx = 1;
   const uint8_t kDbit = 1;
   const uint8_t kTl0PicIdx = 17;
   uint8_t packet[13] = {0};
   packet[0] = 0x20;  // I:0 P:0 L:1 F:0 B:0 E:0 V:0 R:0
-  packet[1] = (kGofIdx << 4) | (kSpatialIdx << 1) | kDbit;  // GOF_IDX:7 S:1 D:1
-  packet[2] = kTl0PicIdx;                                   // TL0PICIDX:17
+  packet[1] = (kTemporalIdx << 5) | (kUbit << 4) | (kSpatialIdx << 1) | kDbit;
+  packet[2] = kTl0PicIdx;
 
-  expected_.gof_idx = kGofIdx;
+  // T:2 U:1 S:1 D:1
+  // TL0PICIDX:17
+  expected_.temporal_idx = kTemporalIdx;
+  expected_.temporal_up_switch = kUbit ? true : false;
   expected_.spatial_idx = kSpatialIdx;
   expected_.inter_layer_predicted = kDbit ? true : false;
   expected_.tl0_pic_idx = kTl0PicIdx;
diff --git a/webrtc/modules/video_coding/main/source/jitter_buffer.cc b/webrtc/modules/video_coding/main/source/jitter_buffer.cc
index bc63411..5385e0f 100644
--- a/webrtc/modules/video_coding/main/source/jitter_buffer.cc
+++ b/webrtc/modules/video_coding/main/source/jitter_buffer.cc
@@ -316,7 +316,6 @@
   first_packet_since_reset_ = true;
   rtt_ms_ = kDefaultRtt;
   last_decoded_state_.Reset();
-  vp9_ss_map_.Reset();
 }
 
 void VCMJitterBuffer::Stop() {
@@ -324,7 +323,6 @@
   UpdateHistograms();
   running_ = false;
   last_decoded_state_.Reset();
-  vp9_ss_map_.Reset();
 
   // Make sure all frames are free and reset.
   for (FrameList::iterator it = decodable_frames_.begin();
@@ -356,7 +354,6 @@
   decodable_frames_.Reset(&free_frames_);
   incomplete_frames_.Reset(&free_frames_);
   last_decoded_state_.Reset();  // TODO(mikhal): sync reset.
-  vp9_ss_map_.Reset();
   num_consecutive_old_packets_ = 0;
   // Also reset the jitter and delay estimates
   jitter_estimate_.Reset();
@@ -688,19 +685,10 @@
 
   num_consecutive_old_packets_ = 0;
 
-  if (packet.codec == kVideoCodecVP9) {
-    if (packet.codecSpecificHeader.codecHeader.VP9.flexible_mode) {
-      // TODO(asapersson): Add support for flexible mode.
-      return kGeneralError;
-    }
-    if (!packet.codecSpecificHeader.codecHeader.VP9.flexible_mode) {
-      if (vp9_ss_map_.Insert(packet))
-        vp9_ss_map_.UpdateFrames(&incomplete_frames_);
-
-      vp9_ss_map_.UpdatePacket(const_cast<VCMPacket*>(&packet));
-    }
-    if (!last_decoded_state_.in_initial_state())
-      vp9_ss_map_.RemoveOld(last_decoded_state_.time_stamp());
+  if (packet.codec == kVideoCodecVP9 &&
+      packet.codecSpecificHeader.codecHeader.VP9.flexible_mode) {
+    // TODO(asapersson): Add support for flexible mode.
+    return kGeneralError;
   }
 
   VCMFrameBuffer* frame;
diff --git a/webrtc/modules/video_coding/main/source/jitter_buffer.h b/webrtc/modules/video_coding/main/source/jitter_buffer.h
index 9bde97c..228e786 100644
--- a/webrtc/modules/video_coding/main/source/jitter_buffer.h
+++ b/webrtc/modules/video_coding/main/source/jitter_buffer.h
@@ -338,8 +338,6 @@
   FrameList incomplete_frames_ GUARDED_BY(crit_sect_);
   VCMDecodingState last_decoded_state_ GUARDED_BY(crit_sect_);
   bool first_packet_since_reset_;
-  // Contains scalability structure data for VP9.
-  Vp9SsMap vp9_ss_map_ GUARDED_BY(crit_sect_);
 
   // Statistics.
   VCMReceiveStatisticsCallback* stats_callback_ GUARDED_BY(crit_sect_);
diff --git a/webrtc/modules/video_coding/main/source/jitter_buffer_unittest.cc b/webrtc/modules/video_coding/main/source/jitter_buffer_unittest.cc
index d6c6d49..4bb85cf 100644
--- a/webrtc/modules/video_coding/main/source/jitter_buffer_unittest.cc
+++ b/webrtc/modules/video_coding/main/source/jitter_buffer_unittest.cc
@@ -885,7 +885,6 @@
   packet_->codecSpecificHeader.codecHeader.VP9.spatial_idx = 0;
   packet_->codecSpecificHeader.codecHeader.VP9.beginning_of_frame = true;
   packet_->codecSpecificHeader.codecHeader.VP9.end_of_frame = true;
-  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = kNoTemporalIdx;
   packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = false;
 
   packet_->seqNum = 65485;
@@ -893,7 +892,7 @@
   packet_->frameType = kVideoFrameKey;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 5;
   packet_->codecSpecificHeader.codecHeader.VP9.tl0_pic_idx = 200;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 0;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 0;
   packet_->codecSpecificHeader.codecHeader.VP9.ss_data_available = true;
   packet_->codecSpecificHeader.codecHeader.VP9.gof.SetGofInfoVP9(
       kTemporalStructureMode3);  // kTemporalStructureMode3: 0-2-1-2..
@@ -905,7 +904,7 @@
   packet_->frameType = kVideoFrameDelta;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 9;
   packet_->codecSpecificHeader.codecHeader.VP9.tl0_pic_idx = 201;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 0;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 0;
   packet_->codecSpecificHeader.codecHeader.VP9.ss_data_available = false;
   EXPECT_EQ(kCompleteSession, jitter_buffer_->InsertPacket(*packet_, &re));
 
@@ -939,22 +938,22 @@
   packet_->codecSpecificHeader.codecHeader.VP9.spatial_idx = 0;
   packet_->codecSpecificHeader.codecHeader.VP9.beginning_of_frame = true;
   packet_->codecSpecificHeader.codecHeader.VP9.end_of_frame = true;
-  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = kNoTemporalIdx;
-  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = false;
   packet_->codecSpecificHeader.codecHeader.VP9.tl0_pic_idx = 200;
 
   packet_->seqNum = 65486;
   packet_->timestamp = 6000;
   packet_->frameType = kVideoFrameDelta;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 6;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 1;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 2;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = true;
   EXPECT_EQ(kCompleteSession, jitter_buffer_->InsertPacket(*packet_, &re));
 
   packet_->seqNum = 65487;
   packet_->timestamp = 9000;
   packet_->frameType = kVideoFrameDelta;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 7;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 2;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 1;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = true;
   EXPECT_EQ(kCompleteSession, jitter_buffer_->InsertPacket(*packet_, &re));
 
   // Insert first frame with SS data.
@@ -964,7 +963,8 @@
   packet_->width = 352;
   packet_->height = 288;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 5;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 0;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 0;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = false;
   packet_->codecSpecificHeader.codecHeader.VP9.ss_data_available = true;
   packet_->codecSpecificHeader.codecHeader.VP9.gof.SetGofInfoVP9(
       kTemporalStructureMode3);  // kTemporalStructureMode3: 0-2-1-2..
@@ -1011,8 +1011,6 @@
   packet_->codecSpecificHeader.codecHeader.VP9.flexible_mode = false;
   packet_->codecSpecificHeader.codecHeader.VP9.beginning_of_frame = true;
   packet_->codecSpecificHeader.codecHeader.VP9.end_of_frame = true;
-  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = kNoTemporalIdx;
-  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = false;
   packet_->codecSpecificHeader.codecHeader.VP9.tl0_pic_idx = 200;
 
   packet_->isFirstPacket = true;
@@ -1022,7 +1020,8 @@
   packet_->frameType = kVideoFrameDelta;
   packet_->codecSpecificHeader.codecHeader.VP9.spatial_idx = 0;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 6;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 1;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 1;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = true;
   EXPECT_EQ(kIncomplete, jitter_buffer_->InsertPacket(*packet_, &re));
 
   packet_->isFirstPacket = false;
@@ -1031,7 +1030,8 @@
   packet_->frameType = kVideoFrameDelta;
   packet_->codecSpecificHeader.codecHeader.VP9.spatial_idx = 1;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 6;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 1;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 1;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = true;
   EXPECT_EQ(kCompleteSession, jitter_buffer_->InsertPacket(*packet_, &re));
 
   packet_->isFirstPacket = false;
@@ -1041,7 +1041,8 @@
   packet_->frameType = kVideoFrameKey;
   packet_->codecSpecificHeader.codecHeader.VP9.spatial_idx = 1;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 5;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 0;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 0;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = false;
   EXPECT_EQ(kIncomplete, jitter_buffer_->InsertPacket(*packet_, &re));
 
   // Insert first frame with SS data.
@@ -1053,7 +1054,8 @@
   packet_->height = 288;
   packet_->codecSpecificHeader.codecHeader.VP9.spatial_idx = 0;
   packet_->codecSpecificHeader.codecHeader.VP9.picture_id = 5;
-  packet_->codecSpecificHeader.codecHeader.VP9.gof_idx = 0;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_idx = 0;
+  packet_->codecSpecificHeader.codecHeader.VP9.temporal_up_switch = false;
   packet_->codecSpecificHeader.codecHeader.VP9.ss_data_available = true;
   packet_->codecSpecificHeader.codecHeader.VP9.gof.SetGofInfoVP9(
       kTemporalStructureMode2);  // kTemporalStructureMode3: 0-1-0-1..