AudioEncoderCng: Handle case where speech encoder is reset

Previously, AudioEncoderCng required that the speech encoder not change
its mind about the number of 10 ms frames in the next packet between
calls to AudioEncoderCng::EncodeInternal(); specifically, it could
handle an upward adjustment but not a downward one. With this patch, it
can handle a downward adjustment too, by simply saving the overshoot
data for the next call to EncodeInternal().

It will still not handle the case where the encoder's reported number
of 10 ms frames in the next packet is inconsistent with the behavior
of its Encode() function when called with no intervening changes to
the encoder.
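
A rough sketch of the buffering scheme, using hypothetical names
(FrameBuffer, FakeSpeechEncoder) rather than the actual WebRTC classes:
each call buffers one 10 ms block together with its RTP timestamp,
encoding happens only once the speech encoder's current target is
reached, and only that prefix of the buffer is erased, so a downward
adjustment simply leaves the overshoot buffered for the next call.

  #include <cassert>
  #include <cstdint>
  #include <vector>

  struct FakeSpeechEncoder {  // Stand-in for the real speech encoder.
    size_t Num10MsFramesInNextPacket() const { return frames_per_packet; }
    size_t frames_per_packet = 2;
  };

  class FrameBuffer {
   public:
    explicit FrameBuffer(size_t samples_per_10ms)
        : samples_per_10ms_(samples_per_10ms) {}

    // Appends one 10 ms block; returns the number of frames consumed.
    size_t Push10Ms(uint32_t rtp_timestamp, const int16_t* audio,
                    const FakeSpeechEncoder& encoder) {
      timestamps_.push_back(rtp_timestamp);
      buffer_.insert(buffer_.end(), audio, audio + samples_per_10ms_);
      const size_t frames_to_encode = encoder.Num10MsFramesInNextPacket();
      if (timestamps_.size() < frames_to_encode)
        return 0;  // Not enough audio buffered yet.
      // ... run VAD and encode the first |frames_to_encode| frames here,
      // using timestamps_.front() as the packet timestamp ...
      buffer_.erase(buffer_.begin(),
                    buffer_.begin() + frames_to_encode * samples_per_10ms_);
      timestamps_.erase(timestamps_.begin(),
                        timestamps_.begin() + frames_to_encode);
      return frames_to_encode;
    }

    size_t buffered_frames() const { return timestamps_.size(); }

   private:
    const size_t samples_per_10ms_;
    std::vector<int16_t> buffer_;
    std::vector<uint32_t> timestamps_;
  };

  int main() {
    const size_t kSamplesPer10Ms = 160;  // 16 kHz mono.
    std::vector<int16_t> block(kSamplesPer10Ms, 0);
    FakeSpeechEncoder encoder;
    FrameBuffer fb(kSamplesPer10Ms);

    encoder.frames_per_packet = 3;      // Encoder wants 30 ms packets.
    size_t consumed = fb.Push10Ms(0, block.data(), encoder);
    consumed += fb.Push10Ms(160, block.data(), encoder);
    assert(consumed == 0);              // Only 20 ms buffered so far.

    encoder.frames_per_packet = 1;      // Downward adjustment to 10 ms.
    consumed = fb.Push10Ms(320, block.data(), encoder);
    assert(consumed == 1);              // One frame consumed...
    assert(fb.buffered_frames() == 2);  // ...and the overshoot is kept.
    return 0;
  }

The main() above only exercises the bookkeeping; VAD, CNG and the
actual speech encoding are elided.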

R=henrik.lundin@webrtc.org

Review URL: https://webrtc-codereview.appspot.com/53469005

Cr-Commit-Position: refs/heads/master@{#9261}
diff --git a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
index 58fd24f..9b23607 100644
--- a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
+++ b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng.cc
@@ -50,8 +50,6 @@
     : speech_encoder_(config.speech_encoder),
       cng_payload_type_(config.payload_type),
       num_cng_coefficients_(config.num_cng_coefficients),
-      first_timestamp_in_buffer_(0),
-      frames_in_buffer_(0),
       last_frame_active_(true),
       vad_(new Vad(config.vad_mode)) {
   if (config.vad) {
@@ -115,35 +113,31 @@
     size_t max_encoded_bytes,
     uint8_t* encoded) {
   CHECK_GE(max_encoded_bytes, static_cast<size_t>(num_cng_coefficients_ + 1));
-  const int num_samples = SampleRateHz() / 100 * NumChannels();
-  if (speech_buffer_.empty()) {
-    CHECK_EQ(frames_in_buffer_, 0);
-    first_timestamp_in_buffer_ = rtp_timestamp;
-  }
-  for (int i = 0; i < num_samples; ++i) {
+  const size_t samples_per_10ms_frame = SamplesPer10msFrame();
+  CHECK_EQ(speech_buffer_.size(),
+           rtp_timestamps_.size() * samples_per_10ms_frame);
+  rtp_timestamps_.push_back(rtp_timestamp);
+  for (size_t i = 0; i < samples_per_10ms_frame; ++i) {
     speech_buffer_.push_back(audio[i]);
   }
-  ++frames_in_buffer_;
-  if (frames_in_buffer_ < speech_encoder_->Num10MsFramesInNextPacket()) {
+  const int frames_to_encode = speech_encoder_->Num10MsFramesInNextPacket();
+  if (rtp_timestamps_.size() < static_cast<size_t>(frames_to_encode)) {
     return EncodedInfo();
   }
-  CHECK_LE(frames_in_buffer_ * 10, kMaxFrameSizeMs)
+  CHECK_LE(frames_to_encode * 10, kMaxFrameSizeMs)
       << "Frame size cannot be larger than " << kMaxFrameSizeMs
       << " ms when using VAD/CNG.";
-  const size_t samples_per_10ms_frame = 10 * SampleRateHz() / 1000;
-  CHECK_EQ(speech_buffer_.size(),
-           static_cast<size_t>(frames_in_buffer_) * samples_per_10ms_frame);
 
   // Group several 10 ms blocks per VAD call. Call VAD once or twice using the
   // following split sizes:
   // 10 ms = 10 + 0 ms; 20 ms = 20 + 0 ms; 30 ms = 30 + 0 ms;
   // 40 ms = 20 + 20 ms; 50 ms = 30 + 20 ms; 60 ms = 30 + 30 ms.
   int blocks_in_first_vad_call =
-      (frames_in_buffer_ > 3 ? 3 : frames_in_buffer_);
-  if (frames_in_buffer_ == 4)
+      (frames_to_encode > 3 ? 3 : frames_to_encode);
+  if (frames_to_encode == 4)
     blocks_in_first_vad_call = 2;
   const int blocks_in_second_vad_call =
-      frames_in_buffer_ - blocks_in_first_vad_call;
+      frames_to_encode - blocks_in_first_vad_call;
   CHECK_GE(blocks_in_second_vad_call, 0);
 
   // Check if all of the buffer is passive speech. Start with checking the first
@@ -161,12 +155,12 @@
   EncodedInfo info;
   switch (activity) {
     case Vad::kPassive: {
-      info = EncodePassive(max_encoded_bytes, encoded);
+      info = EncodePassive(frames_to_encode, max_encoded_bytes, encoded);
       last_frame_active_ = false;
       break;
     }
     case Vad::kActive: {
-      info = EncodeActive(max_encoded_bytes, encoded);
+      info = EncodeActive(frames_to_encode, max_encoded_bytes, encoded);
       last_frame_active_ = true;
       break;
     }
@@ -176,20 +170,24 @@
     }
   }
 
-  speech_buffer_.clear();
-  frames_in_buffer_ = 0;
+  speech_buffer_.erase(
+      speech_buffer_.begin(),
+      speech_buffer_.begin() + frames_to_encode * samples_per_10ms_frame);
+  rtp_timestamps_.erase(rtp_timestamps_.begin(),
+                        rtp_timestamps_.begin() + frames_to_encode);
   return info;
 }
 
 AudioEncoder::EncodedInfo AudioEncoderCng::EncodePassive(
+    int frames_to_encode,
     size_t max_encoded_bytes,
     uint8_t* encoded) {
   bool force_sid = last_frame_active_;
   bool output_produced = false;
   const size_t samples_per_10ms_frame = SamplesPer10msFrame();
-  CHECK_GE(max_encoded_bytes, frames_in_buffer_ * samples_per_10ms_frame);
+  CHECK_GE(max_encoded_bytes, frames_to_encode * samples_per_10ms_frame);
   AudioEncoder::EncodedInfo info;
-  for (int i = 0; i < frames_in_buffer_; ++i) {
+  for (int i = 0; i < frames_to_encode; ++i) {
     int16_t encoded_bytes_tmp = 0;
     CHECK_GE(WebRtcCng_Encode(cng_inst_.get(),
                               &speech_buffer_[i * samples_per_10ms_frame],
@@ -202,7 +200,7 @@
       force_sid = false;
     }
   }
-  info.encoded_timestamp = first_timestamp_in_buffer_;
+  info.encoded_timestamp = rtp_timestamps_.front();
   info.payload_type = cng_payload_type_;
   info.send_even_if_empty = true;
   info.speech = false;
@@ -210,15 +208,18 @@
 }
 
 AudioEncoder::EncodedInfo AudioEncoderCng::EncodeActive(
+    int frames_to_encode,
     size_t max_encoded_bytes,
     uint8_t* encoded) {
   const size_t samples_per_10ms_frame = SamplesPer10msFrame();
   AudioEncoder::EncodedInfo info;
-  for (int i = 0; i < frames_in_buffer_; ++i) {
+  for (int i = 0; i < frames_to_encode; ++i) {
     info = speech_encoder_->Encode(
-        first_timestamp_in_buffer_, &speech_buffer_[i * samples_per_10ms_frame],
+        rtp_timestamps_.front(), &speech_buffer_[i * samples_per_10ms_frame],
         samples_per_10ms_frame, max_encoded_bytes, encoded);
-    if (i < frames_in_buffer_ - 1) {
+    if (i == frames_to_encode - 1) {
+      CHECK_GT(info.encoded_bytes, 0u) << "Encoder didn't deliver data.";
+    } else {
       CHECK_EQ(info.encoded_bytes, 0u) << "Encoder delivered data too early.";
     }
   }
diff --git a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc
index a31f0de..8135b98 100644
--- a/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc
+++ b/webrtc/modules/audio_coding/codecs/cng/audio_encoder_cng_unittest.cc
@@ -80,6 +80,21 @@
     timestamp_ += num_audio_samples_10ms_;
   }
 
+  // Expect |num_calls| calls to the encoder, all successful. The last call
+  // claims to have encoded |kMockReturnEncodedBytes| bytes, and all the
+  // ones 0 bytes.
+  void ExpectEncodeCalls(int num_calls) {
+    InSequence s;
+    AudioEncoder::EncodedInfo info;
+    for (int j = 0; j < num_calls - 1; ++j) {
+      EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
+          .WillOnce(Return(info));
+    }
+    info.encoded_bytes = kMockReturnEncodedBytes;
+    EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
+        .WillOnce(Return(info));
+  }
+
   // Verifies that the cng_ object waits until it has collected
   // |blocks_per_frame| blocks of audio, and then dispatches all of them to
   // the underlying codec (speech or cng).
@@ -96,20 +111,8 @@
       Encode();
       EXPECT_EQ(0u, encoded_info_.encoded_bytes);
     }
-    if (active_speech) {
-      // Now expect |blocks_per_frame| calls to the encoder in sequence.
-      // Let the speech codec mock return true and set the number of encoded
-      // bytes to |kMockReturnEncodedBytes|.
-      InSequence s;
-      AudioEncoder::EncodedInfo info;
-      for (int j = 0; j < blocks_per_frame - 1; ++j) {
-        EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-            .WillOnce(Return(info));
-      }
-      info.encoded_bytes = kMockReturnEncodedBytes;
-      EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-          .WillOnce(Return(info));
-    }
+    if (active_speech)
+      ExpectEncodeCalls(blocks_per_frame);
     Encode();
     if (active_speech) {
       EXPECT_EQ(kMockReturnEncodedBytes, encoded_info_.encoded_bytes);
@@ -283,23 +286,17 @@
   CreateCng();
 
   // All of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
   EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kActive));
   EXPECT_TRUE(encoded_info_.speech);
 
   // First half of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
   EXPECT_TRUE(CheckMixedActivePassive(Vad::kActive, Vad::kPassive));
   EXPECT_TRUE(encoded_info_.speech);
 
   // Second half of the frame is active speech.
-  EXPECT_CALL(mock_encoder_, EncodeInternal(_, _, _, _))
-      .Times(6)
-      .WillRepeatedly(Return(AudioEncoder::EncodedInfo()));
+  ExpectEncodeCalls(6);
   EXPECT_TRUE(CheckMixedActivePassive(Vad::kPassive, Vad::kActive));
   EXPECT_TRUE(encoded_info_.speech);
 
diff --git a/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h b/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h
index 831758b..094b730 100644
--- a/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h
+++ b/webrtc/modules/audio_coding/codecs/cng/include/audio_encoder_cng.h
@@ -66,16 +66,19 @@
     inline void operator()(CNG_enc_inst* ptr) const { WebRtcCng_FreeEnc(ptr); }
   };
 
-  EncodedInfo EncodePassive(size_t max_encoded_bytes, uint8_t* encoded);
-  EncodedInfo EncodeActive(size_t max_encoded_bytes, uint8_t* encoded);
+  EncodedInfo EncodePassive(int frames_to_encode,
+                            size_t max_encoded_bytes,
+                            uint8_t* encoded);
+  EncodedInfo EncodeActive(int frames_to_encode,
+                           size_t max_encoded_bytes,
+                           uint8_t* encoded);
   size_t SamplesPer10msFrame() const;
 
   AudioEncoder* speech_encoder_;
   const int cng_payload_type_;
   const int num_cng_coefficients_;
   std::vector<int16_t> speech_buffer_;
-  uint32_t first_timestamp_in_buffer_;
-  int frames_in_buffer_;
+  std::vector<uint32_t> rtp_timestamps_;
   bool last_frame_active_;
   rtc::scoped_ptr<Vad> vad_;
   rtc::scoped_ptr<CNG_enc_inst, CngInstDeleter> cng_inst_;