AudioEncoderCng: Handle case where speech encoder is reset

Previously, AudioEncoderCng required that the speech encoder not change
its mind about the number of 10 ms frames in the next packet between
calls to AudioEncoderCng::EncodeInternal(); specifically, it could
handle an upward adjustment but not a downward one. With this patch, it
can handle a downward adjustment too, by simply saving the overshoot
data for the next call to EncodeInternal().

It will still not handle the case where the encoder's reported number
of 10 ms frames in the next packet is inconsistent with the behavior
of its Encode() function when called with no intervening changes to
the encoder.
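
A rough sketch of the buffering scheme, using hypothetical names
(FrameBuffer, FakeSpeechEncoder) rather than the actual WebRTC classes:
each call buffers one 10 ms block together with its RTP timestamp,
encoding happens only once the speech encoder's current target is
reached, and only that prefix of the buffer is erased, so a downward
adjustment simply leaves the overshoot buffered for the next call.

  #include <cassert>
  #include <cstdint>
  #include <vector>

  struct FakeSpeechEncoder {  // Stand-in for the real speech encoder.
    size_t Num10MsFramesInNextPacket() const { return frames_per_packet; }
    size_t frames_per_packet = 2;
  };

  class FrameBuffer {
   public:
    explicit FrameBuffer(size_t samples_per_10ms)
        : samples_per_10ms_(samples_per_10ms) {}

    // Appends one 10 ms block; returns the number of frames consumed.
    size_t Push10Ms(uint32_t rtp_timestamp, const int16_t* audio,
                    const FakeSpeechEncoder& encoder) {
      timestamps_.push_back(rtp_timestamp);
      buffer_.insert(buffer_.end(), audio, audio + samples_per_10ms_);
      const size_t frames_to_encode = encoder.Num10MsFramesInNextPacket();
      if (timestamps_.size() < frames_to_encode)
        return 0;  // Not enough audio buffered yet.
      // ... run VAD and encode the first |frames_to_encode| frames here,
      // using timestamps_.front() as the packet timestamp ...
      buffer_.erase(buffer_.begin(),
                    buffer_.begin() + frames_to_encode * samples_per_10ms_);
      timestamps_.erase(timestamps_.begin(),
                        timestamps_.begin() + frames_to_encode);
      return frames_to_encode;
    }

    size_t buffered_frames() const { return timestamps_.size(); }

   private:
    const size_t samples_per_10ms_;
    std::vector<int16_t> buffer_;
    std::vector<uint32_t> timestamps_;
  };

  int main() {
    const size_t kSamplesPer10Ms = 160;  // 16 kHz mono.
    std::vector<int16_t> block(kSamplesPer10Ms, 0);
    FakeSpeechEncoder encoder;
    FrameBuffer fb(kSamplesPer10Ms);

    encoder.frames_per_packet = 3;      // Encoder wants 30 ms packets.
    size_t consumed = fb.Push10Ms(0, block.data(), encoder);
    consumed += fb.Push10Ms(160, block.data(), encoder);
    assert(consumed == 0);              // Only 20 ms buffered so far.

    encoder.frames_per_packet = 1;      // Downward adjustment to 10 ms.
    consumed = fb.Push10Ms(320, block.data(), encoder);
    assert(consumed == 1);              // One frame consumed...
    assert(fb.buffered_frames() == 2);  // ...and the overshoot is kept.
    return 0;
  }

The main() above only exercises the bookkeeping; VAD, CNG and the
actual speech encoding are elided.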

R=henrik.lundin@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/53469005

Cr-Commit-Position: refs/heads/master@{#9261}
diff --git a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
index 58fd24f..9b23607 100644
--- a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
+++ b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
@@ -50,8 +50,6 @@
     : speech_encoder_(config.speech_encoder),
       cng_payload_type_(config.payload_type),
       num_cng_coefficients_(config.num_cng_coefficients),
-      first_timestamp_in_buffer_(0),
-      frames_in_buffer_(0),
       last_frame_active_(true),
       vad_(new Vad(config.vad_mode)) {
   if (config.vad) {
@@ -115,35 +113,31 @@
     size_t max_encoded_bytes,
     uint8_t* encoded) {
   CHECK_GE(max_encoded_bytes, static_cast<size_t>(num_cng_coefficients_ + 1));
-  const int num_samples = SampleRateHz() / 100 * NumChannels();
-  if (speech_buffer_.empty()) {
-    CHECK_EQ(frames_in_buffer_, 0);
-    first_timestamp_in_buffer_ = rtp_timestamp;
-  }
-  for (int i = 0; i < num_samples; ++i) {
+  const size_t samples_per_10ms_frame = SamplesPer10msFrame();
+  CHECK_EQ(speech_buffer_.size(),
+           rtp_timestamps_.size() * samples_per_10ms_frame);
+  rtp_timestamps_.push_back(rtp_timestamp);
+  for (size_t i = 0; i < samples_per_10ms_frame; ++i) {
     speech_buffer_.push_back(audio[i]);
   }
-  ++frames_in_buffer_;
-  if (frames_in_buffer_ < speech_encoder_->Num10MsFramesInNextPacket()) {
+  const int frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket();
+  if (rtp_timestamps_.size() < static_cast<size_t>(frames_to_encode)) {
     return EncodedInfo();
   }
-  CHECK_LE(frames_in_buffer_ * 10, kMaxFrameSizeMs)
+  CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs)
       << "Frame size cannot be larger than " << kMaxFrameSizeMs
       << " ms when using VAD/CNG.";
-  const size_t samples_per_10ms_frame = 10 * SampleRateHz() / 1000;
-  CHECK_EQ(speech_buffer_.size(),
-           static_cast<size_t>(frames_in_buffer_) * samples_per_10ms_frame);
 
   // Group several 10 ms blocks per VAD call. Call VAD once or twice using the
   // following split sizes:
   // 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms;
   // 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms.
   int blocks_in_first_vad_call =
-      (frames_in_buffer_ > 3 ? 3 : frames_in_buffer_);
-  if (frames_in_buffer_ == 4)
+      (frames_to_encode > 3 ? 3 : frames_to_encode);
+  if (frames_to_encode == 4)
     blocks_in_first_vad_call = 2;
   const int blocks_in_second_vad_call =
-      frames_in_buffer_ - blocks_in_first_vad_call;
+      frames_to_encode - blocks_in_first_vad_call;
   CHECK_GE(blocks_in_second_vad_call, 0);
 
   // Check if all of the buffer is passive speech. Start with checking the first
@@ -161,12 +155,12 @@
   EncodedInfo info;
   switch (activity) {
     case Vad::kPassive: {
-      info = EncodePassive(max_encoded_bytes, encoded);
+      info = EncodePassive(frames_to_encode, max_encoded_bytes, encoded);
       last_frame_active_ = false;
       break;
     }
     case Vad::kActive: {
-      info = EncodeActive(max_encoded_bytes, encoded);
+      info = EncodeActive(frames_to_encode, max_encoded_bytes, encoded);
       last_frame_active_ = true;
       break;
     }
@@ -176,20 +170,24 @@
     }
   }
 
-  speech_buffer_.clear();
-  frames_in_buffer_ = 0;
+  speech_buffer_.erase(
+      speech_buffer_.begin(),
+      speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame);
+  rtp_timestamps_.erase(rtp_timestamps_.begin(),
+                        rtp_timestamps_.begin() + frames_to_encode);
   return info;
 }
 
 AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
+    int frames_to_encode,
     size_t max_encoded_bytes,
     uint8_t* encoded) {
   bool force_sid = last_frame_active_;
   bool output_produced = false;
   const size_t samples_per_10ms_frame = SamplesPer10msFrame();
-  CHECK_GE(max_encoded_bytes, frames_in_buffer_ * samples_per_10ms_frame);
+  CHECK_GE(max_encoded_bytes, frames_to_encode * samples_per_10ms_frame);
   AudioEncoder::EncodedInfo info;
-  for (int i = 0; i < frames_in_buffer_; ++i) {
+  for (int i = 0; i < frames_to_encode; ++i) {
     int16_t encoded_bytes_tmp = 0;
     CHECK_GE(WebRtcCng_Encode(cng_inst_.get(),
                               &speech_buffer_[i * samples_per_10ms_frame],
@@ -202,7 +200,7 @@
       force_sid = false;
     }
   }
-  info.encoded_timestamp = first_timestamp_in_buffer_;
+  info.encoded_timestamp = rtp_timestamps_.front();
   info.payload_type = cng_payload_type_;
   info.send_even_if_empty = true;
   info.speech = false;
@@ -210,15 +208,18 @@
 }
 
 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(
+    int frames_to_encode,
     size_t max_encoded_bytes,
     uint8_t* encoded) {
   const size_t samples_per_10ms_frame = SamplesPer10msFrame();
   AudioEncoder::EncodedInfo info;
-  for (int i = 0; i < frames_in_buffer_; ++i) {
+  for (int i = 0; i < frames_to_encode; ++i) {
     info = speech_encoder_->Encode(
-        first_timestamp_in_buffer_, &speech_buffer_[i * samples_per_10ms_frame],
+        rtp_timestamps_.front(), &speech_buffer_[i * samples_per_10ms_frame],
         samples_per_10ms_frame, max_encoded_bytes, encoded);
-    if (i < frames_in_buffer_ - 1) {
+    if (i == frames_to_encode - 1) {
+      CHECK_GT(info.encoded_bytes, 0u) << "Encoder didn't deliver data.";
+    } else {
       CHECK_EQ(info.encoded_bytes, 0u) << "Encoder delivered data too early.";
     }
   }
diff --git a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc
index a31f0de..8135b98 100644
--- a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc
+++ b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc
@@ -80,6 +80,21 @@
     timestamp_ += num_audio_samples_10ms_;
   }
 
+  // Expect |num_calls| calls to the encoder, all successful. The last call
+  // claims to have encoded |kMockReturnEncodedBytes| bytes, and all the
+  // ones 0 bytes.
+  void ExpectEncodeCalls(int num_calls) {
+    InSequence s;
+    AudioEncoder::EncodedInfo info;
+    for (int j = 0; j < num_calls - 1; ++j) {
+      EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
+          .WillOnce(Return(info));
+    }
+    info.encoded_bytes = kMockReturnEncodedBytes;
+    EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
+        .WillOnce(Return(info));
+  }
+
   // Verifies that the cng_ object waits until it has collected
   // |blocks_per_frame| blocks of audio, and then dispatches all of them to
   // the underlying codec (speech or cng).
@@ -96,20 +111,8 @@
       Encode();
       EXPECT_EQ(0u, encoded_info_.encoded_bytes);
     }
-    if (active_speech) {
-      // Now expect |blocks_per_frame| calls to the encoder in sequence.
-      // Let the speech codec mock return true and set the number of encoded
-      // bytes to |kMockReturnEncodedBytes|.
-      InSequence s;
-      AudioEncoder::EncodedInfo info;
-      for (int j = 0; j < blocks_per_frame - 1; ++j) {
-        EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-            .WillOnce(Return(info));
-      }
-      info.encoded_bytes = kMockReturnEncodedBytes;
-      EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-          .WillOnce(Return(info));
-    }
+    if (active_speech)
+      ExpectEncodeCalls(blocks_per_frame);
     Encode();
     if (active_speech) {
       EXPECT_EQ(kMockReturnEncodedBytes, encoded_info_.encoded_bytes);
@@ -283,23 +286,17 @@
   CreateCng();
 
   // All of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
   EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kActive));
   EXPECT_TRUE(encoded_info_.speech);
 
   // First half of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
   EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kPassive));
   EXPECT_TRUE(encoded_info_.speech);
 
   // Second half of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
   EXPECT_TRUE(CheckMixedActivePassive(Vad::kPassive, Vad::kActive));
   EXPECT_TRUE(encoded_info_.speech);
 
diff --git a/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h b/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h
index 831758b..094b730 100644
--- a/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h
+++ b/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h
@@ -66,16 +66,19 @@
     inline void operator()(CNG_enc_inst* ptr) const { WebRtcCng_FreeEnc(ptr); }
   };
 
-  EncodedInfo EncodePassive(size_t max_encoded_bytes, uint8_t* encoded);
-  EncodedInfo EncodeActive(size_t max_encoded_bytes, uint8_t* encoded);
+  EncodedInfo EncodePassive(int frames_to_encode,
+                            size_t max_encoded_bytes,
+                            uint8_t* encoded);
+  EncodedInfo EncodeActive(int frames_to_encode,
+                           size_t max_encoded_bytes,
+                           uint8_t* encoded);
   size_t SamplesPer10msFrame() const;
 
   AudioEncoder* speech_encoder_;
   const int cng_payload_type_;
   const int num_cng_coefficients_;
   std::vector<int16_t> speech_buffer_;
-  uint32_t first_timestamp_in_buffer_;
-  int frames_in_buffer_;
+  std::vector<uint32_t> rtp_timestamps_;
   bool last_frame_active_;
   rtc::scoped_ptr<Vad> vad_;
   rtc::scoped_ptr<CNG_enc_inst, CngInstDeleter> cng_inst_;