Revert^2 "Reduce the number of calls to Jit::AddSamples."

The method is called for every invoke which is expensive.
Add samples, but don't check the consequences every time.

This reduces its cost from 3.5% to 1% (maps on device).

This reverts commit 0402f4b019c1d6c567b1c56589f1ea9170ab5dcc
and fixes 667-jit-jni-stub which relied on being able to set
the sample count to exactly one below the compilation threshold,
and then expected the compilation to happen on the next sample.

Test: ./art/test.py -b --host -r --ndebug
Test: ./art/test.py -b -r -t 570
Change-Id: I99c6d03f565f17fe6539ed89632d8f8bbda68107
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index a633a63..d1896e6 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -44,7 +44,7 @@
 #include "handle_scope-inl.h"
 #include "interpreter_mterp_impl.h"
 #include "interpreter_switch_impl.h"
-#include "jit/jit.h"
+#include "jit/jit-inl.h"
 #include "mirror/call_site.h"
 #include "mirror/class-inl.h"
 #include "mirror/dex_cache.h"
diff --git a/runtime/interpreter/interpreter_switch_impl-inl.h b/runtime/interpreter/interpreter_switch_impl-inl.h
index 94cb3de..aec2aa2 100644
--- a/runtime/interpreter/interpreter_switch_impl-inl.h
+++ b/runtime/interpreter/interpreter_switch_impl-inl.h
@@ -26,7 +26,7 @@
 #include "dex/dex_instruction_list.h"
 #include "experimental_flags.h"
 #include "interpreter_common.h"
-#include "jit/jit.h"
+#include "jit/jit-inl.h"
 #include "jvalue-inl.h"
 #include "mirror/string-alloc-inl.h"
 #include "nth_caller_visitor.h"
diff --git a/runtime/jit/jit-inl.h b/runtime/jit/jit-inl.h
new file mode 100644
index 0000000..80324ad
--- /dev/null
+++ b/runtime/jit/jit-inl.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_JIT_JIT_INL_H_
+#define ART_RUNTIME_JIT_JIT_INL_H_
+
+#include "jit/jit.h"
+
+#include "art_method.h"
+#include "base/bit_utils.h"
+#include "thread.h"
+#include "runtime-inl.h"
+
+namespace art {
+namespace jit {
+
+inline bool Jit::ShouldUsePriorityThreadWeight(Thread* self) {
+  return self->IsJitSensitiveThread() && Runtime::Current()->InJankPerceptibleProcessState();
+}
+
+inline void Jit::AddSamples(Thread* self,
+                            ArtMethod* method,
+                            uint16_t samples,
+                            bool with_backedges) {
+  if (Jit::ShouldUsePriorityThreadWeight(self)) {
+    samples *= PriorityThreadWeight();
+  }
+  uint32_t old_count = method->GetCounter();
+  uint32_t new_count = old_count + samples;
+
+  // The full check is fairly expensive so we just add to hotness most of the time,
+  // and we do the full check only when some of the higher bits of the count change.
+  // NB: The method needs to see the transitions of the counter past the thresholds.
+  uint32_t old_batch = RoundDown(old_count, kJitSamplesBatchSize);  // Clear lower bits.
+  uint32_t new_batch = RoundDown(new_count, kJitSamplesBatchSize);  // Clear lower bits.
+  if (UNLIKELY(old_batch == 0)) {
+    // For low sample counts, we check every time (which is important for tests).
+    if (!MaybeCompileMethod(self, method, old_count, new_count, with_backedges)) {
+      // Tests may check that the counter is 0 for methods that we never compile.
+      return;  // Ignore the samples for now and retry later.
+    }
+  } else if (UNLIKELY(old_batch != new_batch)) {
+    // For high sample counts, we check only when we move past the batch boundary.
+    if (!MaybeCompileMethod(self, method, old_batch, new_batch, with_backedges)) {
+      // OSR compilation will ignore the samples if they don't have backedges.
+      return;  // Ignore the samples for now and retry later.
+    }
+  }
+
+  method->SetCounter(new_count);
+}
+
+}  // namespace jit
+}  // namespace art
+
+#endif  // ART_RUNTIME_JIT_JIT_INL_H_
diff --git a/runtime/jit/jit.cc b/runtime/jit/jit.cc
index e43d771..8d542d355 100644
--- a/runtime/jit/jit.cc
+++ b/runtime/jit/jit.cc
@@ -28,6 +28,7 @@
 #include "debugger.h"
 #include "entrypoints/runtime_asm_entrypoints.h"
 #include "interpreter/interpreter.h"
+#include "jit-inl.h"
 #include "jit_code_cache.h"
 #include "jni/java_vm_ext.h"
 #include "mirror/method_handle_impl.h"
@@ -68,6 +69,14 @@
 };
 DEFINE_RUNTIME_DEBUG_FLAG(StressModeHelper, kSlowMode);
 
+uint32_t JitOptions::RoundUpThreshold(uint32_t threshold) {
+  if (threshold > kJitSamplesBatchSize) {
+    threshold = RoundUp(threshold, kJitSamplesBatchSize);
+  }
+  CHECK_LE(threshold, std::numeric_limits<uint16_t>::max());
+  return threshold;
+}
+
 JitOptions* JitOptions::CreateFromRuntimeArguments(const RuntimeArgumentMap& options) {
   auto* jit_options = new JitOptions;
   jit_options->use_jit_compilation_ = options.GetOrDefault(RuntimeArgumentMap::UseJitCompilation);
@@ -93,30 +102,25 @@
                    : kJitStressDefaultCompileThreshold)
             : kJitDefaultCompileThreshold;
   }
-  if (jit_options->compile_threshold_ > std::numeric_limits<uint16_t>::max()) {
-    LOG(FATAL) << "Method compilation threshold is above its internal limit.";
-  }
+  jit_options->compile_threshold_ = RoundUpThreshold(jit_options->compile_threshold_);
 
   if (options.Exists(RuntimeArgumentMap::JITWarmupThreshold)) {
     jit_options->warmup_threshold_ = *options.Get(RuntimeArgumentMap::JITWarmupThreshold);
-    if (jit_options->warmup_threshold_ > std::numeric_limits<uint16_t>::max()) {
-      LOG(FATAL) << "Method warmup threshold is above its internal limit.";
-    }
   } else {
     jit_options->warmup_threshold_ = jit_options->compile_threshold_ / 2;
   }
+  jit_options->warmup_threshold_ = RoundUpThreshold(jit_options->warmup_threshold_);
 
   if (options.Exists(RuntimeArgumentMap::JITOsrThreshold)) {
     jit_options->osr_threshold_ = *options.Get(RuntimeArgumentMap::JITOsrThreshold);
-    if (jit_options->osr_threshold_ > std::numeric_limits<uint16_t>::max()) {
-      LOG(FATAL) << "Method on stack replacement threshold is above its internal limit.";
-    }
   } else {
     jit_options->osr_threshold_ = jit_options->compile_threshold_ * 2;
     if (jit_options->osr_threshold_ > std::numeric_limits<uint16_t>::max()) {
-      jit_options->osr_threshold_ = std::numeric_limits<uint16_t>::max();
+      jit_options->osr_threshold_ =
+          RoundDown(std::numeric_limits<uint16_t>::max(), kJitSamplesBatchSize);
     }
   }
+  jit_options->osr_threshold_ = RoundUpThreshold(jit_options->osr_threshold_);
 
   if (options.Exists(RuntimeArgumentMap::JITPriorityThreadWeight)) {
     jit_options->priority_thread_weight_ =
@@ -149,10 +153,6 @@
   return jit_options;
 }
 
-bool Jit::ShouldUsePriorityThreadWeight(Thread* self) {
-  return self->IsJitSensitiveThread() && Runtime::Current()->InJankPerceptibleProcessState();
-}
-
 void Jit::DumpInfo(std::ostream& os) {
   code_cache_->Dump(os);
   cumulative_timings_.Dump(os);
@@ -630,20 +630,24 @@
   return false;
 }
 
-void Jit::AddSamples(Thread* self, ArtMethod* method, uint16_t count, bool with_backedges) {
+bool Jit::MaybeCompileMethod(Thread* self,
+                             ArtMethod* method,
+                             uint32_t old_count,
+                             uint32_t new_count,
+                             bool with_backedges) {
   if (thread_pool_ == nullptr) {
     // Should only see this when shutting down, starting up, or in safe mode.
     DCHECK(Runtime::Current()->IsShuttingDown(self) ||
            !Runtime::Current()->IsFinishedStarting() ||
            Runtime::Current()->IsSafeMode());
-    return;
+    return false;
   }
   if (IgnoreSamplesForMethod(method)) {
-    return;
+    return false;
   }
   if (HotMethodThreshold() == 0) {
     // Tests might request JIT on first use (compiled synchronously in the interpreter).
-    return;
+    return false;
   }
   DCHECK(thread_pool_ != nullptr);
   DCHECK_GT(WarmMethodThreshold(), 0);
@@ -652,15 +656,9 @@
   DCHECK_GE(PriorityThreadWeight(), 1);
   DCHECK_LE(PriorityThreadWeight(), HotMethodThreshold());
 
-  uint16_t starting_count = method->GetCounter();
-  if (Jit::ShouldUsePriorityThreadWeight(self)) {
-    count *= PriorityThreadWeight();
-  }
-  uint32_t new_count = starting_count + count;
-  // Note: Native method have no "warm" state or profiling info.
-  if (LIKELY(!method->IsNative()) && starting_count < WarmMethodThreshold()) {
-    if ((new_count >= WarmMethodThreshold()) &&
-        (method->GetProfilingInfo(kRuntimePointerSize) == nullptr)) {
+  if (old_count < WarmMethodThreshold() && new_count >= WarmMethodThreshold()) {
+    // Note: Native method have no "warm" state or profiling info.
+    if (!method->IsNative() && method->GetProfilingInfo(kRuntimePointerSize) == nullptr) {
       bool success = ProfilingInfo::Create(self, method, /* retry_allocation= */ false);
       if (success) {
         VLOG(jit) << "Start profiling " << method->PrettyMethod();
@@ -670,7 +668,7 @@
         // Calling ProfilingInfo::Create might put us in a suspended state, which could
         // lead to the thread pool being deleted when we are shutting down.
         DCHECK(Runtime::Current()->IsShuttingDown(self));
-        return;
+        return false;
       }
 
       if (!success) {
@@ -680,32 +678,27 @@
             self, new JitCompileTask(method, JitCompileTask::TaskKind::kAllocateProfile));
       }
     }
-    // Avoid jumping more than one state at a time.
-    new_count = std::min(new_count, static_cast<uint32_t>(HotMethodThreshold() - 1));
-  } else if (UseJitCompilation()) {
-    if (starting_count < HotMethodThreshold()) {
-      if ((new_count >= HotMethodThreshold()) &&
-          !code_cache_->ContainsPc(method->GetEntryPointFromQuickCompiledCode())) {
+  }
+  if (UseJitCompilation()) {
+    if (old_count < HotMethodThreshold() && new_count >= HotMethodThreshold()) {
+      if (!code_cache_->ContainsPc(method->GetEntryPointFromQuickCompiledCode())) {
         DCHECK(thread_pool_ != nullptr);
         thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::TaskKind::kCompile));
       }
-      // Avoid jumping more than one state at a time.
-      new_count = std::min(new_count, static_cast<uint32_t>(OSRMethodThreshold() - 1));
-    } else if (starting_count < OSRMethodThreshold()) {
+    }
+    if (old_count < OSRMethodThreshold() && new_count >= OSRMethodThreshold()) {
       if (!with_backedges) {
-        // If the samples don't contain any back edge, we don't increment the hotness.
-        return;
+        return false;
       }
       DCHECK(!method->IsNative());  // No back edges reported for native methods.
-      if ((new_count >= OSRMethodThreshold()) &&  !code_cache_->IsOsrCompiled(method)) {
+      if (!code_cache_->IsOsrCompiled(method)) {
         DCHECK(thread_pool_ != nullptr);
         thread_pool_->AddTask(
             self, new JitCompileTask(method, JitCompileTask::TaskKind::kCompileOsr));
       }
     }
   }
-  // Update hotness counter
-  method->SetCounter(new_count);
+  return true;
 }
 
 class ScopedSetRuntimeThread {
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index 7ce5f07..10f6094 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -47,6 +47,7 @@
 // At what priority to schedule jit threads. 9 is the lowest foreground priority on device.
 // See android/os/Process.java.
 static constexpr int kJitPoolThreadPthreadDefaultPriority = 9;
+static constexpr uint32_t kJitSamplesBatchSize = 32;  // Must be power of 2.
 
 class JitOptions {
  public:
@@ -122,12 +123,16 @@
   }
 
  private:
+  // We add the sample in batches of size kJitSamplesBatchSize.
+  // This method rounds the threshold so that it is multiple of the batch size.
+  static uint32_t RoundUpThreshold(uint32_t threshold);
+
   bool use_jit_compilation_;
   size_t code_cache_initial_capacity_;
   size_t code_cache_max_capacity_;
-  uint16_t compile_threshold_;
-  uint16_t warmup_threshold_;
-  uint16_t osr_threshold_;
+  uint32_t compile_threshold_;
+  uint32_t warmup_threshold_;
+  uint32_t osr_threshold_;
   uint16_t priority_thread_weight_;
   uint16_t invoke_transition_weight_;
   bool dump_info_on_shutdown_;
@@ -154,7 +159,7 @@
   static constexpr size_t kDefaultPriorityThreadWeightRatio = 1000;
   static constexpr size_t kDefaultInvokeTransitionWeightRatio = 500;
   // How frequently should the interpreter check to see if OSR compilation is ready.
-  static constexpr int16_t kJitRecheckOSRThreshold = 100;
+  static constexpr int16_t kJitRecheckOSRThreshold = 101;  // Prime number to avoid patterns.
 
   virtual ~Jit();
 
@@ -217,7 +222,10 @@
   void MethodEntered(Thread* thread, ArtMethod* method)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  void AddSamples(Thread* self, ArtMethod* method, uint16_t samples, bool with_backedges)
+  ALWAYS_INLINE void AddSamples(Thread* self,
+                                ArtMethod* method,
+                                uint16_t samples,
+                                bool with_backedges)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   void InvokeVirtualOrInterface(ObjPtr<mirror::Object> this_object,
@@ -297,6 +305,15 @@
  private:
   Jit(JitCodeCache* code_cache, JitOptions* options);
 
+  // Compile the method if the number of samples passes a threshold.
+  // Returns false if we can not compile now - don't increment the counter and retry later.
+  bool MaybeCompileMethod(Thread* self,
+                          ArtMethod* method,
+                          uint32_t old_count,
+                          uint32_t new_count,
+                          bool with_backedges)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   static bool BindCompilerMethods(std::string* error_msg);
 
   // JIT compiler