Revert "ART: Improve JitProfile perf in arm/arm64 mterp"

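This reverts commit c1d6b341eed646e5adafc6c4fd4e3748f0292368.

Summary of what this revert undoes (covers only the files shown in the diff below):
- art_method.h: removes ArtMethod::SetCounter()/GetCounter(); hotness_count_ is
  again only incremented (IncrementCounter) by the interpreter.
- asm_support.h: removes the SHADOWFRAME_CACHED_HOTNESS_COUNTDOWN_OFFSET and
  SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET constants, moves SHADOWFRAME_VREGS_OFFSET
  back to SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 8, and removes JIT_CHECK_OSR and
  JIT_HOTNESS_DISABLE.
- instrumentation.h: NonJitProfilingActive() no longer checks have_branch_listeners_.
- Switch and goto-table interpreters: remove HOTNESS_UPDATE() on backward branches
  and the cached jit_instrumentation_cache local; BRANCH_INSTRUMENTATION looks up
  the method itself again.
- arm mterp: drops the rPROFILE (r10) countdown register, MterpSetUpHotnessCountdown
  at entry, and the MterpCommonTakenBranch / MterpAddHotnessBatch paths; bincmp.S
  again uses the reverse-comparison form with MterpProfileBranch under
  MTERP_PROFILE_BRANCHES and MterpCheckSuspendAndContinue; op_cmp_long.S returns to
  the branching implementation.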
diff --git a/runtime/art_method.h b/runtime/art_method.h
index d1ef019..3dbcd58 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -545,9 +545,6 @@
   ALWAYS_INLINE GcRoot<mirror::Class>* GetDexCacheResolvedTypes(size_t pointer_size)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
-  // Note, hotness_counter_ updates are non-atomic but it doesn't need to be precise.  Also,
-  // given that the counter is only 16 bits wide we can expect wrap-around in some
-  // situations.  Consumers of hotness_count_ must be able to deal with that.
   uint16_t IncrementCounter() {
     return ++hotness_count_;
   }
@@ -556,14 +553,6 @@
     hotness_count_ = 0;
   }
 
-  void SetCounter(int16_t hotness_count) {
-    hotness_count_ = hotness_count;
-  }
-
-  uint16_t GetCounter() const {
-    return hotness_count_;
-  }
-
   const uint8_t* GetQuickenedInfo() SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Returns the method header for the compiled code containing 'pc'. Note that runtime
@@ -608,7 +597,7 @@
   // ifTable.
   uint16_t method_index_;
 
-  // The hotness we measure for this method. Managed by the interpreter. Not atomic, as we allow
+  // The hotness we measure for this method. Incremented by the interpreter. Not atomic, as we allow
   // missing increments: if the method is hot, we will see it eventually.
   uint16_t hotness_count_;
 
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index d27d2f6..942f9de 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -20,7 +20,6 @@
 #if defined(__cplusplus)
 #include "art_method.h"
 #include "gc/allocator/rosalloc.h"
-#include "jit/jit_instrumentation.h"
 #include "lock_word.h"
 #include "mirror/class.h"
 #include "mirror/string.h"
@@ -189,13 +188,7 @@
 #define SHADOWFRAME_DEX_PC_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 4)
 ADD_TEST_EQ(SHADOWFRAME_DEX_PC_OFFSET,
             static_cast<int32_t>(art::ShadowFrame::DexPCOffset()))
-#define SHADOWFRAME_CACHED_HOTNESS_COUNTDOWN_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 8)
-ADD_TEST_EQ(SHADOWFRAME_CACHED_HOTNESS_COUNTDOWN_OFFSET,
-            static_cast<int32_t>(art::ShadowFrame::CachedHotnessCountdownOffset()))
-#define SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 10)
-ADD_TEST_EQ(SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET,
-            static_cast<int32_t>(art::ShadowFrame::HotnessCountdownOffset()))
-#define SHADOWFRAME_VREGS_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 12)
+#define SHADOWFRAME_VREGS_OFFSET (SHADOWFRAME_NUMBER_OF_VREGS_OFFSET + 8)
 ADD_TEST_EQ(SHADOWFRAME_VREGS_OFFSET,
             static_cast<int32_t>(art::ShadowFrame::VRegsOffset()))
 
@@ -396,12 +389,6 @@
 #define THREAD_CHECKPOINT_REQUEST 2
 ADD_TEST_EQ(THREAD_CHECKPOINT_REQUEST, static_cast<int32_t>(art::kCheckpointRequest))
 
-#define JIT_CHECK_OSR -1
-ADD_TEST_EQ(JIT_CHECK_OSR, static_cast<int32_t>(art::jit::kJitCheckForOSR))
-
-#define JIT_HOTNESS_DISABLE -2
-ADD_TEST_EQ(JIT_HOTNESS_DISABLE, static_cast<int32_t>(art::jit::kJitHotnessDisabled))
-
 #if defined(__cplusplus)
 }  // End of CheckAsmSupportOffsets.
 #endif
diff --git a/runtime/instrumentation.h b/runtime/instrumentation.h
index a4c3d41..d07f47b 100644
--- a/runtime/instrumentation.h
+++ b/runtime/instrumentation.h
@@ -303,8 +303,7 @@
   bool NonJitProfilingActive() const SHARED_REQUIRES(Locks::mutator_lock_) {
     return have_dex_pc_listeners_ || have_method_exit_listeners_ ||
         have_field_read_listeners_ || have_field_write_listeners_ ||
-        have_exception_caught_listeners_ || have_method_unwind_listeners_ ||
-        have_branch_listeners_;
+        have_exception_caught_listeners_ || have_method_unwind_listeners_;
   }
 
   // Inform listeners that a method has been entered. A dex PC is provided as we may install
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index d70a7c4..12d6fdc 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -22,7 +22,6 @@
 #include "experimental_flags.h"
 #include "interpreter_common.h"
 #include "jit/jit.h"
-#include "jit/jit_instrumentation.h"
 #include "safe_math.h"
 
 #include <memory>  // std::unique_ptr
@@ -65,20 +64,15 @@
   currentHandlersTable = handlersTable[ \
       Runtime::Current()->GetInstrumentation()->GetInterpreterHandlerTable()]
 
-#define BRANCH_INSTRUMENTATION(offset)                                                          \
-  do {                                                                                          \
-    instrumentation->Branch(self, method, dex_pc, offset);                                      \
-    JValue result;                                                                              \
-    if (jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, &result)) {           \
-      return result;                                                                            \
-    }                                                                                           \
-  } while (false)
-
-#define HOTNESS_UPDATE()                                                                       \
-  do {                                                                                         \
-    if (jit_instrumentation_cache != nullptr) {                                                \
-      jit_instrumentation_cache->AddSamples(self, method, 1);                                  \
-    }                                                                                          \
+#define BRANCH_INSTRUMENTATION(offset)                                                            \
+  do {                                                                                            \
+    ArtMethod* method = shadow_frame.GetMethod();                                                 \
+    instrumentation::Instrumentation* instrumentation = Runtime::Current()->GetInstrumentation(); \
+    instrumentation->Branch(self, method, dex_pc, offset);                                        \
+    JValue result;                                                                                \
+    if (jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, &result)) {             \
+      return result;                                                                              \
+    }                                                                                             \
   } while (false)
 
 #define UNREACHABLE_CODE_CHECK()                \
@@ -192,13 +186,6 @@
   UPDATE_HANDLER_TABLE();
   std::unique_ptr<lambda::ClosureBuilder> lambda_closure_builder;
   size_t lambda_captured_variable_index = 0;
-  const auto* const instrumentation = Runtime::Current()->GetInstrumentation();
-  ArtMethod* method = shadow_frame.GetMethod();
-  jit::Jit* jit = Runtime::Current()->GetJit();
-  jit::JitInstrumentationCache* jit_instrumentation_cache = nullptr;
-  if (jit != nullptr) {
-    jit_instrumentation_cache = jit->GetInstrumentationCache();
-  }
 
   // Jump to first instruction.
   ADVANCE(0);
@@ -643,7 +630,6 @@
     int8_t offset = inst->VRegA_10t(inst_data);
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
-      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -657,7 +643,6 @@
     int16_t offset = inst->VRegA_20t();
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
-      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -671,7 +656,6 @@
     int32_t offset = inst->VRegA_30t();
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
-      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -685,7 +669,6 @@
     int32_t offset = DoPackedSwitch(inst, shadow_frame, inst_data);
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
-      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -699,7 +682,6 @@
     int32_t offset = DoSparseSwitch(inst, shadow_frame, inst_data);
     BRANCH_INSTRUMENTATION(offset);
     if (IsBackwardBranch(offset)) {
-      HOTNESS_UPDATE();
       if (UNLIKELY(self->TestAllFlags())) {
         self->CheckSuspend();
         UPDATE_HANDLER_TABLE();
@@ -803,7 +785,6 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -823,7 +804,6 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -843,7 +823,6 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -863,7 +842,6 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -883,7 +861,6 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -903,7 +880,6 @@
       int16_t offset = inst->VRegC_22t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -922,7 +898,6 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -941,7 +916,6 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -960,7 +934,6 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -979,7 +952,6 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -998,7 +970,6 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
@@ -1017,7 +988,6 @@
       int16_t offset = inst->VRegB_21t();
       BRANCH_INSTRUMENTATION(offset);
       if (IsBackwardBranch(offset)) {
-        HOTNESS_UPDATE();
         if (UNLIKELY(self->TestAllFlags())) {
           self->CheckSuspend();
           UPDATE_HANDLER_TABLE();
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index f9941d2..0488dbf 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -18,7 +18,6 @@
 #include "experimental_flags.h"
 #include "interpreter_common.h"
 #include "jit/jit.h"
-#include "jit/jit_instrumentation.h"
 #include "safe_math.h"
 
 #include <memory>  // std::unique_ptr
@@ -73,6 +72,7 @@
 
 #define BRANCH_INSTRUMENTATION(offset)                                                         \
   do {                                                                                         \
+    ArtMethod* method = shadow_frame.GetMethod();                                              \
     instrumentation->Branch(self, method, dex_pc, offset);                                     \
     JValue result;                                                                             \
     if (jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, &result)) {          \
@@ -80,13 +80,6 @@
     }                                                                                          \
   } while (false)
 
-#define HOTNESS_UPDATE()                                                                       \
-  do {                                                                                         \
-    if (jit_instrumentation_cache != nullptr) {                                                \
-      jit_instrumentation_cache->AddSamples(self, method, 1);                                  \
-    }                                                                                          \
-  } while (false)
-
 static bool IsExperimentalInstructionEnabled(const Instruction *inst) {
   DCHECK(inst->IsExperimental());
   return Runtime::Current()->AreExperimentalFlagsEnabled(ExperimentalFlags::kLambdas);
@@ -108,12 +101,6 @@
   const uint16_t* const insns = code_item->insns_;
   const Instruction* inst = Instruction::At(insns + dex_pc);
   uint16_t inst_data;
-  ArtMethod* method = shadow_frame.GetMethod();
-  jit::Jit* jit = Runtime::Current()->GetJit();
-  jit::JitInstrumentationCache* jit_instrumentation_cache = nullptr;
-  if (jit != nullptr) {
-    jit_instrumentation_cache = jit->GetInstrumentationCache();
-  }
 
   // TODO: collapse capture-variable+create-lambda into one opcode, then we won't need
   // to keep this live for the scope of the entire function call.
@@ -577,7 +564,6 @@
         int8_t offset = inst->VRegA_10t(inst_data);
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
-          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -588,7 +574,6 @@
         int16_t offset = inst->VRegA_20t();
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
-          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -599,7 +584,6 @@
         int32_t offset = inst->VRegA_30t();
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
-          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -610,7 +594,6 @@
         int32_t offset = DoPackedSwitch(inst, shadow_frame, inst_data);
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
-          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -621,7 +604,6 @@
         int32_t offset = DoSparseSwitch(inst, shadow_frame, inst_data);
         BRANCH_INSTRUMENTATION(offset);
         if (IsBackwardBranch(offset)) {
-          HOTNESS_UPDATE();
           self->AllowThreadSuspension();
         }
         inst = inst->RelativeAt(offset);
@@ -726,7 +708,6 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -743,7 +724,6 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -760,7 +740,6 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -777,7 +756,6 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -794,7 +772,6 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -811,7 +788,6 @@
           int16_t offset = inst->VRegC_22t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -827,7 +803,6 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -843,7 +818,6 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -859,7 +833,6 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -875,7 +848,6 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -891,7 +863,6 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
@@ -907,7 +878,6 @@
           int16_t offset = inst->VRegB_21t();
           BRANCH_INSTRUMENTATION(offset);
           if (IsBackwardBranch(offset)) {
-            HOTNESS_UPDATE();
             self->AllowThreadSuspension();
           }
           inst = inst->RelativeAt(offset);
diff --git a/runtime/interpreter/mterp/arm/bincmp.S b/runtime/interpreter/mterp/arm/bincmp.S
index 8fad42f..cfad714 100644
--- a/runtime/interpreter/mterp/arm/bincmp.S
+++ b/runtime/interpreter/mterp/arm/bincmp.S
@@ -1,6 +1,7 @@
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-le" you would use "gt".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -8,12 +9,23 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r0, r0                     @ r0<- vA
+    GET_VREG r2, r0                     @ r2<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, r3                      @ compare (vA, vB)
-    b${condition} MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, r3                      @ compare (vA, vB)
+    mov${revcmp} rINST, #2
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r2, rINST, rINST            @ convert to bytes, check sign
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
diff --git a/runtime/interpreter/mterp/arm/entry.S b/runtime/interpreter/mterp/arm/entry.S
index a6b131d..981c036 100644
--- a/runtime/interpreter/mterp/arm/entry.S
+++ b/runtime/interpreter/mterp/arm/entry.S
@@ -33,8 +33,10 @@
 
 ExecuteMterpImpl:
     .fnstart
-    .save {r3-r10,fp,lr}
-    stmfd   sp!, {r3-r10,fp,lr}         @ save 10 regs, (r3 just to align 64)
+    .save {r4-r10,fp,lr}
+    stmfd   sp!, {r4-r10,fp,lr}         @ save 9 regs
+    .pad    #4
+    sub     sp, sp, #4                  @ align 64
 
     /* Remember the return register */
     str     r3, [r2, #SHADOWFRAME_RESULT_REGISTER_OFFSET]
@@ -55,12 +57,6 @@
     /* Starting ibase */
     ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]
 
-    /* Set up for backwards branches & osr profiling */
-    ldr     r0, [rFP, #OFF_FP_METHOD]
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    bl      MterpSetUpHotnessCountdown
-    mov     rPROFILE, r0                @ Starting hotness countdown to rPROFILE
-
     /* start executing the instruction at rPC */
     FETCH_INST                          @ load rINST from rPC
     GET_INST_OPCODE ip                  @ extract opcode from rINST
diff --git a/runtime/interpreter/mterp/arm/footer.S b/runtime/interpreter/mterp/arm/footer.S
index 4f46bad..3456a75 100644
--- a/runtime/interpreter/mterp/arm/footer.S
+++ b/runtime/interpreter/mterp/arm/footer.S
@@ -114,111 +114,21 @@
     /* NOTE: no fallthrough */
 
 /*
- * Common handling for branches with support for Jit profiling.
- * On entry:
- *    rINST          <= signed offset
- *    rPROFILE       <= signed hotness countdown (expanded to 32 bits)
- *    condition bits <= set to establish sign of offset (use "NoFlags" entry if not)
- *
- * We have quite a few different cases for branch profiling, OSR detection and
- * suspend check support here.
- *
- * Taken backward branches:
- *    If profiling active, do hotness countdown and report if we hit zero.
- *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
- *    Is there a pending suspend request?  If so, suspend.
- *
- * Taken forward branches and not-taken backward branches:
- *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
- *
- * Our most common case is expected to be a taken backward branch with active jit profiling,
- * but no full OSR check and no pending suspend request.
- * Next most common case is not-taken branch with no full OSR check.
- *
+ * Check for suspend check request.  Assumes rINST already loaded, rPC advanced and
+ * still needs to get the opcode and branch to it, and flags are in lr.
  */
-MterpCommonTakenBranchNoFlags:
-    cmp     rINST, #0
-MterpCommonTakenBranch:
-    bgt     .L_forward_branch           @ don't add forward branches to hotness
-/*
- * We need to subtract 1 from positive values and we should not see 0 here,
- * so we may use the result of the comparison with -1.
- */
-#if JIT_CHECK_OSR != -1
-#  error "JIT_CHECK_OSR must be -1."
-#endif
-    cmp     rPROFILE, #JIT_CHECK_OSR
-    beq     .L_osr_check
-    subgts  rPROFILE, #1
-    beq     .L_add_batch                @ counted down to zero - report
-.L_resume_backward_branch:
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    REFRESH_IBASE
-    add     r2, rINST, rINST            @ r2<- byte offset
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+MterpCheckSuspendAndContinue:
+    ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]  @ refresh rIBASE
     ands    lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
-    bne     .L_suspend_request_pending
+    bne     1f
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
-
-.L_suspend_request_pending:
+1:
     EXPORT_PC
     mov     r0, rSELF
     bl      MterpSuspendCheck           @ (self)
     cmp     r0, #0
     bne     MterpFallback
-    REFRESH_IBASE                       @ might have changed during suspend
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-
-.L_no_count_backwards:
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    bne     .L_resume_backward_branch
-.L_osr_check:
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
-    bne     MterpOnStackReplacement
-    b       .L_resume_backward_branch
-
-.L_forward_branch:
-    cmp     rPROFILE, #JIT_CHECK_OSR @ possible OSR re-entry?
-    beq     .L_check_osr_forward
-.L_resume_forward_branch:
-    add     r2, rINST, rINST            @ r2<- byte offset
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-
-.L_check_osr_forward:
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
-    bne     MterpOnStackReplacement
-    b       .L_resume_forward_branch
-
-.L_add_batch:
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    strh    rPROFILE, [r1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
-    ldr     r0, [rFP, #OFF_FP_METHOD]
-    mov     r2, rSELF
-    bl      MterpAddHotnessBatch        @ (method, shadow_frame, self)
-    mov     rPROFILE, r0                @ restore new hotness countdown to rPROFILE
-    b       .L_no_count_backwards
-
-/*
- * Entered from the conditional branch handlers when OSR check request active on
- * not-taken path.  All Dalvik not-taken conditional branch offsets are 2.
- */
-.L_check_not_taken_osr:
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, #2
-    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
-    bne     MterpOnStackReplacement
-    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -266,27 +176,9 @@
     str     r1, [r2, #4]
     mov     r0, #1                                  @ signal return to caller.
 MterpDone:
-/*
- * At this point, we expect rPROFILE to be non-zero.  If negative, hotness is disabled or we're
- * checking for OSR.  If greater than zero, we might have unreported hotness to register
- * (the difference between the ending rPROFILE and the cached hotness counter).  rPROFILE
- * should only reach zero immediately after a hotness decrement, and is then reset to either
- * a negative special state or the new non-zero countdown value.
- */
-    cmp     rPROFILE, #0
-    bgt     MterpProfileActive                      @ if > 0, we may have some counts to report.
-    ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
+    add     sp, sp, #4                              @ un-align 64
+    ldmfd   sp!, {r4-r10,fp,pc}                     @ restore 9 regs and return
 
-MterpProfileActive:
-    mov     rINST, r0                               @ stash return value
-    /* Report cached hotness counts */
-    ldr     r0, [rFP, #OFF_FP_METHOD]
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rSELF
-    strh    rPROFILE, [r1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
-    bl      MterpAddHotnessBatch                    @ (method, shadow_frame, self)
-    mov     r0, rINST                               @ restore return value
-    ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
 
     .fnend
     .size   ExecuteMterpImpl, .-ExecuteMterpImpl
diff --git a/runtime/interpreter/mterp/arm/header.S b/runtime/interpreter/mterp/arm/header.S
index 039bcbe..298af8a 100644
--- a/runtime/interpreter/mterp/arm/header.S
+++ b/runtime/interpreter/mterp/arm/header.S
@@ -72,8 +72,7 @@
   r6  rSELF     self (Thread) pointer
   r7  rINST     first 16-bit code unit of current instruction
   r8  rIBASE    interpreted instruction base pointer, used for computed goto
-  r10 rPROFILE  branch profiling countdown
-  r11 rREFS     base of object references in shadow frame  (ideally, we'll get rid of this later).
+  r11 rREFS	base of object references in shadow frame  (ideally, we'll get rid of this later).
 
 Macros are provided for common operations.  Each macro MUST emit only
 one instruction to make instruction-counting easier.  They MUST NOT alter
@@ -91,13 +90,12 @@
 
 /* During bringup, we'll use the shadow frame model instead of rFP */
 /* single-purpose registers, given names for clarity */
-#define rPC      r4
-#define rFP      r5
-#define rSELF    r6
-#define rINST    r7
-#define rIBASE   r8
-#define rPROFILE r10
-#define rREFS    r11
+#define rPC     r4
+#define rFP     r5
+#define rSELF   r6
+#define rINST   r7
+#define rIBASE  r8
+#define rREFS   r11
 
 /*
  * Instead of holding a pointer to the shadow frame, we keep rFP at the base of the vregs.  So,
@@ -111,7 +109,7 @@
 #define OFF_FP_RESULT_REGISTER OFF_FP(SHADOWFRAME_RESULT_REGISTER_OFFSET)
 #define OFF_FP_DEX_PC_PTR OFF_FP(SHADOWFRAME_DEX_PC_PTR_OFFSET)
 #define OFF_FP_CODE_ITEM OFF_FP(SHADOWFRAME_CODE_ITEM_OFFSET)
-#define OFF_FP_SHADOWFRAME OFF_FP(0)
+#define OFF_FP_SHADOWFRAME (-SHADOWFRAME_VREGS_OFFSET)
 
 /*
  * "export" the PC to dex_pc field in the shadow frame, f/b/o future exception objects.  Must
diff --git a/runtime/interpreter/mterp/arm/op_cmp_long.S b/runtime/interpreter/mterp/arm/op_cmp_long.S
index 6626ff0..e57b19c 100644
--- a/runtime/interpreter/mterp/arm/op_cmp_long.S
+++ b/runtime/interpreter/mterp/arm/op_cmp_long.S
@@ -1,6 +1,22 @@
     /*
      * Compare two 64-bit values.  Puts 0, 1, or -1 into the destination
      * register based on the results of the comparison.
+     *
+     * We load the full values with LDM, but in practice many values could
+     * be resolved by only looking at the high word.  This could be made
+     * faster or slower by splitting the LDM into a pair of LDRs.
+     *
+     * If we just wanted to set condition flags, we could do this:
+     *  subs    ip, r0, r2
+     *  sbcs    ip, r1, r3
+     *  subeqs  ip, r0, r2
+     * Leaving { <0, 0, >0 } in ip.  However, we have to set it to a specific
+     * integer value, which we can do with 2 conditional mov/mvn instructions
+     * (set 1, set -1; if they're equal we already have 0 in ip), giving
+     * us a constant 5-cycle path plus a branch at the end to the
+     * instruction epilogue code.  The multi-compare approach below needs
+     * 2 or 3 cycles + branch if the high word doesn't match, 6 + branch
+     * in the worst case (the 64-bit values are equal).
      */
     /* cmp-long vAA, vBB, vCC */
     FETCH r0, 1                         @ r0<- CCBB
@@ -11,13 +27,30 @@
     VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
     ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
     ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
-    cmp     r0, r2
-    sbcs    ip, r1, r3                  @ Sets correct CCs for checking LT (but not EQ/NE)
-    mov     ip, #0
-    mvnlt   ip, #0                      @ -1
-    cmpeq   r0, r2                      @ For correct EQ/NE, we may need to repeat the first CMP
-    orrne   ip, #1
+    cmp     r1, r3                      @ compare (vBB+1, vCC+1)
+    blt     .L${opcode}_less            @ signed compare on high part
+    bgt     .L${opcode}_greater
+    subs    r1, r0, r2                  @ r1<- r0 - r2
+    bhi     .L${opcode}_greater         @ unsigned compare on low part
+    bne     .L${opcode}_less
+    b       .L${opcode}_finish          @ equal; r1 already holds 0
+%break
+
+.L${opcode}_less:
+    mvn     r1, #0                      @ r1<- -1
+    @ Want to cond code the next mov so we can avoid branch, but don't see it;
+    @ instead, we just replicate the tail end.
     FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
-    SET_VREG ip, r9                     @ vAA<- ip
+    SET_VREG r1, r9                     @ vAA<- r1
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+.L${opcode}_greater:
+    mov     r1, #1                      @ r1<- 1
+    @ fall through to _finish
+
+.L${opcode}_finish:
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    SET_VREG r1, r9                     @ vAA<- r1
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
diff --git a/runtime/interpreter/mterp/arm/op_goto.S b/runtime/interpreter/mterp/arm/op_goto.S
index aa42dfd..6861950 100644
--- a/runtime/interpreter/mterp/arm/op_goto.S
+++ b/runtime/interpreter/mterp/arm/op_goto.S
@@ -5,5 +5,32 @@
      * double to get a byte offset.
      */
     /* goto +AA */
-    sbfx    rINST, rINST, #8, #8           @ rINST<- ssssssAA (sign-extended)
-    b       MterpCommonTakenBranchNoFlags
+    /* tuning: use sbfx for 6t2+ targets */
+#if MTERP_PROFILE_BRANCHES
+    mov     r0, rINST, lsl #16          @ r0<- AAxx0000
+    movs    rINST, r0, asr #24          @ rINST<- ssssssAA (sign-extended)
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r2, rINST, rINST            @ r2<- byte offset, set flags
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+       @ If backwards branch refresh rIBASE
+    bmi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    mov     r0, rINST, lsl #16          @ r0<- AAxx0000
+    movs    rINST, r0, asr #24          @ rINST<- ssssssAA (sign-extended)
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r2, rINST, rINST            @ r2<- byte offset, set flags
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+       @ If backwards branch refresh rIBASE
+    bmi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
diff --git a/runtime/interpreter/mterp/arm/op_goto_16.S b/runtime/interpreter/mterp/arm/op_goto_16.S
index 12a6bc0..91639ca 100644
--- a/runtime/interpreter/mterp/arm/op_goto_16.S
+++ b/runtime/interpreter/mterp/arm/op_goto_16.S
@@ -5,5 +5,27 @@
      * double to get a byte offset.
      */
     /* goto/16 +AAAA */
+#if MTERP_PROFILE_BRANCHES
     FETCH_S rINST, 1                    @ rINST<- ssssAAAA (sign-extended)
-    b       MterpCommonTakenBranchNoFlags
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset, flags set
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    FETCH_S rINST, 1                    @ rINST<- ssssAAAA (sign-extended)
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset, flags set
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
diff --git a/runtime/interpreter/mterp/arm/op_goto_32.S b/runtime/interpreter/mterp/arm/op_goto_32.S
index 7325a1c..e730b52 100644
--- a/runtime/interpreter/mterp/arm/op_goto_32.S
+++ b/runtime/interpreter/mterp/arm/op_goto_32.S
@@ -10,7 +10,31 @@
      * offset to byte offset.
      */
     /* goto/32 +AAAAAAAA */
+#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- aaaa (lo)
-    FETCH r3, 2                         @ r1<- AAAA (hi)
-    orrs    rINST, r0, r3, lsl #16      @ rINST<- AAAAaaaa
-    b       MterpCommonTakenBranch
+    FETCH r1, 2                         @ r1<- AAAA (hi)
+    orr     rINST, r0, r1, lsl #16      @ rINST<- AAAAaaaa
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    FETCH r0, 1                         @ r0<- aaaa (lo)
+    FETCH r1, 2                         @ r1<- AAAA (hi)
+    orr     rINST, r0, r1, lsl #16      @ rINST<- AAAAaaaa
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
diff --git a/runtime/interpreter/mterp/arm/op_if_eq.S b/runtime/interpreter/mterp/arm/op_if_eq.S
index b8b6a6e..5685686 100644
--- a/runtime/interpreter/mterp/arm/op_if_eq.S
+++ b/runtime/interpreter/mterp/arm/op_if_eq.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "condition":"eq" }
+%include "arm/bincmp.S" { "revcmp":"ne" }
diff --git a/runtime/interpreter/mterp/arm/op_if_eqz.S b/runtime/interpreter/mterp/arm/op_if_eqz.S
index 7012f61..2a9c0f9 100644
--- a/runtime/interpreter/mterp/arm/op_if_eqz.S
+++ b/runtime/interpreter/mterp/arm/op_if_eqz.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "condition":"eq" }
+%include "arm/zcmp.S" { "revcmp":"ne" }
diff --git a/runtime/interpreter/mterp/arm/op_if_ge.S b/runtime/interpreter/mterp/arm/op_if_ge.S
index eb29e63..60a0307 100644
--- a/runtime/interpreter/mterp/arm/op_if_ge.S
+++ b/runtime/interpreter/mterp/arm/op_if_ge.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "condition":"ge" }
+%include "arm/bincmp.S" { "revcmp":"lt" }
diff --git a/runtime/interpreter/mterp/arm/op_if_gez.S b/runtime/interpreter/mterp/arm/op_if_gez.S
index d9da374..981cdec 100644
--- a/runtime/interpreter/mterp/arm/op_if_gez.S
+++ b/runtime/interpreter/mterp/arm/op_if_gez.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "condition":"ge" }
+%include "arm/zcmp.S" { "revcmp":"lt" }
diff --git a/runtime/interpreter/mterp/arm/op_if_gt.S b/runtime/interpreter/mterp/arm/op_if_gt.S
index a35eab8..ca50cd7 100644
--- a/runtime/interpreter/mterp/arm/op_if_gt.S
+++ b/runtime/interpreter/mterp/arm/op_if_gt.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "condition":"gt" }
+%include "arm/bincmp.S" { "revcmp":"le" }
diff --git a/runtime/interpreter/mterp/arm/op_if_gtz.S b/runtime/interpreter/mterp/arm/op_if_gtz.S
index 4ef4d8e..c621812 100644
--- a/runtime/interpreter/mterp/arm/op_if_gtz.S
+++ b/runtime/interpreter/mterp/arm/op_if_gtz.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "condition":"gt" }
+%include "arm/zcmp.S" { "revcmp":"le" }
diff --git a/runtime/interpreter/mterp/arm/op_if_le.S b/runtime/interpreter/mterp/arm/op_if_le.S
index c7c31bc..7e060f2 100644
--- a/runtime/interpreter/mterp/arm/op_if_le.S
+++ b/runtime/interpreter/mterp/arm/op_if_le.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "condition":"le" }
+%include "arm/bincmp.S" { "revcmp":"gt" }
diff --git a/runtime/interpreter/mterp/arm/op_if_lez.S b/runtime/interpreter/mterp/arm/op_if_lez.S
index 9fbf6c9..f92be23 100644
--- a/runtime/interpreter/mterp/arm/op_if_lez.S
+++ b/runtime/interpreter/mterp/arm/op_if_lez.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "condition":"le" }
+%include "arm/zcmp.S" { "revcmp":"gt" }
diff --git a/runtime/interpreter/mterp/arm/op_if_lt.S b/runtime/interpreter/mterp/arm/op_if_lt.S
index 9469fbb..213344d 100644
--- a/runtime/interpreter/mterp/arm/op_if_lt.S
+++ b/runtime/interpreter/mterp/arm/op_if_lt.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "condition":"lt" }
+%include "arm/bincmp.S" { "revcmp":"ge" }
diff --git a/runtime/interpreter/mterp/arm/op_if_ltz.S b/runtime/interpreter/mterp/arm/op_if_ltz.S
index a4fc1b8..dfd4e44 100644
--- a/runtime/interpreter/mterp/arm/op_if_ltz.S
+++ b/runtime/interpreter/mterp/arm/op_if_ltz.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "condition":"lt" }
+%include "arm/zcmp.S" { "revcmp":"ge" }
diff --git a/runtime/interpreter/mterp/arm/op_if_ne.S b/runtime/interpreter/mterp/arm/op_if_ne.S
index c945331..4a58b4a 100644
--- a/runtime/interpreter/mterp/arm/op_if_ne.S
+++ b/runtime/interpreter/mterp/arm/op_if_ne.S
@@ -1 +1 @@
-%include "arm/bincmp.S" { "condition":"ne" }
+%include "arm/bincmp.S" { "revcmp":"eq" }
diff --git a/runtime/interpreter/mterp/arm/op_if_nez.S b/runtime/interpreter/mterp/arm/op_if_nez.S
index 2d81fda..d864ef4 100644
--- a/runtime/interpreter/mterp/arm/op_if_nez.S
+++ b/runtime/interpreter/mterp/arm/op_if_nez.S
@@ -1 +1 @@
-%include "arm/zcmp.S" { "condition":"ne" }
+%include "arm/zcmp.S" { "revcmp":"eq" }
diff --git a/runtime/interpreter/mterp/arm/op_mul_long.S b/runtime/interpreter/mterp/arm/op_mul_long.S
index a13c803..8f40f19 100644
--- a/runtime/interpreter/mterp/arm/op_mul_long.S
+++ b/runtime/interpreter/mterp/arm/op_mul_long.S
@@ -24,13 +24,13 @@
     VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
     ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
     ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
-    mul     ip, r2, r1                  @ ip<- ZxW
-    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
-    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
+    mul     ip, r2, r1                  @  ip<- ZxW
+    umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
+    mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
     mov     r0, rINST, lsr #8           @ r0<- AA
-    add     r2, r2, lr                  @ r2<- lr + low(ZxW + (YxX))
+    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
     VREG_INDEX_TO_ADDR r0, r0           @ r0<- &fp[AA]
     FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
     GET_INST_OPCODE ip                  @ extract opcode from rINST
-    stmia   r0, {r1-r2 }                @ vAA/vAA+1<- r1/r2
+    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
     GOTO_OPCODE ip                      @ jump to next instruction
diff --git a/runtime/interpreter/mterp/arm/op_mul_long_2addr.S b/runtime/interpreter/mterp/arm/op_mul_long_2addr.S
index 4c1f058..7ef24c5 100644
--- a/runtime/interpreter/mterp/arm/op_mul_long_2addr.S
+++ b/runtime/interpreter/mterp/arm/op_mul_long_2addr.S
@@ -13,12 +13,12 @@
     VREG_INDEX_TO_ADDR rINST, r9        @ rINST<- &fp[A]
     ldmia   r1, {r2-r3}                 @ r2/r3<- vBB/vBB+1
     ldmia   rINST, {r0-r1}              @ r0/r1<- vAA/vAA+1
-    mul     ip, r2, r1                  @ ip<- ZxW
-    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
-    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
+    mul     ip, r2, r1                  @  ip<- ZxW
+    umull   r9, r10, r2, r0             @  r9/r10 <- ZxX
+    mla     r2, r0, r3, ip              @  r2<- YxX + (ZxW)
     mov     r0, rINST                   @ r0<- &fp[A] (free up rINST)
     FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
-    add     r2, r2, lr                  @ r2<- r2 + low(ZxW + (YxX))
+    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
     GET_INST_OPCODE ip                  @ extract opcode from rINST
-    stmia   r0, {r1-r2}                 @ vAA/vAA+1<- r1/r2
+    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
     GOTO_OPCODE ip                      @ jump to next instruction
diff --git a/runtime/interpreter/mterp/arm/op_packed_switch.S b/runtime/interpreter/mterp/arm/op_packed_switch.S
index 412c58f..4c369cb 100644
--- a/runtime/interpreter/mterp/arm/op_packed_switch.S
+++ b/runtime/interpreter/mterp/arm/op_packed_switch.S
@@ -9,6 +9,7 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
+#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- bbbb (lo)
     FETCH r1, 2                         @ r1<- BBBB (hi)
     mov     r3, rINST, lsr #8           @ r3<- AA
@@ -16,5 +17,33 @@
     GET_VREG r1, r3                     @ r1<- vAA
     add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
     bl      $func                       @ r0<- code-unit branch offset
-    movs    rINST, r0
-    b       MterpCommonTakenBranch
+    mov     rINST, r0
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    FETCH r0, 1                         @ r0<- bbbb (lo)
+    FETCH r1, 2                         @ r1<- BBBB (hi)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
+    GET_VREG r1, r3                     @ r1<- vAA
+    add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
+    bl      $func                       @ r0<- code-unit branch offset
+    mov     rINST, r0
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
diff --git a/runtime/interpreter/mterp/arm/zcmp.S b/runtime/interpreter/mterp/arm/zcmp.S
index 5db8b6c..3d7dec0 100644
--- a/runtime/interpreter/mterp/arm/zcmp.S
+++ b/runtime/interpreter/mterp/arm/zcmp.S
@@ -1,17 +1,29 @@
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-le" you would use "gt".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r0, r0                     @ r0<- vAA
+    GET_VREG r2, r0                     @ r2<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, #0                      @ compare (vA, 0)
-    b${condition} MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    cmp     r2, #0                      @ compare (vA, 0)
+    mov${revcmp} rINST, #2
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r1, rINST, rINST            @ convert to bytes & set flags
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/bincmp.S b/runtime/interpreter/mterp/arm64/bincmp.S
index 8dd4fed..2356ecb 100644
--- a/runtime/interpreter/mterp/arm64/bincmp.S
+++ b/runtime/interpreter/mterp/arm64/bincmp.S
@@ -1,6 +1,7 @@
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison to perform (not its reverse),
+     * e.g. for "if-le" you would use "le"; the csel below takes it directly.
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -9,11 +10,22 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Offset if branch not taken
     cmp     w2, w3                      // compare (vA, vB)
-    b.${condition} MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, ${condition} // Branch if true, stashing result in callee save reg.
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes, check sign
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/entry.S b/runtime/interpreter/mterp/arm64/entry.S
index 9fbbbd3..23e656e 100644
--- a/runtime/interpreter/mterp/arm64/entry.S
+++ b/runtime/interpreter/mterp/arm64/entry.S
@@ -31,12 +31,11 @@
 
 ExecuteMterpImpl:
     .cfi_startproc
-    stp     xPROFILE, x27, [sp, #-80]!
-    stp     xIBASE, xREFS, [sp, #16]
-    stp     xSELF, xINST, [sp, #32]
-    stp     xPC, xFP, [sp, #48]
-    stp     fp, lr, [sp, #64]
-    add     fp, sp, #64
+    stp     xIBASE, xREFS, [sp, #-64]!
+    stp     xSELF, xINST, [sp, #16]
+    stp     xPC, xFP, [sp, #32]
+    stp     fp, lr, [sp, #48]
+    add     fp, sp, #48
 
     /* Remember the return register */
     str     x3, [x2, #SHADOWFRAME_RESULT_REGISTER_OFFSET]
@@ -57,12 +56,6 @@
     /* Starting ibase */
     ldr     xIBASE, [xSELF, #THREAD_CURRENT_IBASE_OFFSET]
 
-    /* Set up for backwards branches & osr profiling */
-    ldr     x0, [xFP, #OFF_FP_METHOD]
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    bl      MterpSetUpHotnessCountdown
-    mov     wPROFILE, w0                // Starting hotness countdown to xPROFILE
-
     /* start executing the instruction at rPC */
     FETCH_INST                          // load wINST from rPC
     GET_INST_OPCODE ip                  // extract opcode from wINST
diff --git a/runtime/interpreter/mterp/arm64/footer.S b/runtime/interpreter/mterp/arm64/footer.S
index 98d6d58..aae78de 100644
--- a/runtime/interpreter/mterp/arm64/footer.S
+++ b/runtime/interpreter/mterp/arm64/footer.S
@@ -107,104 +107,6 @@
     GET_INST_OPCODE ip
     GOTO_OPCODE ip
     /* NOTE: no fallthrough */
-/*
- * Common handling for branches with support for Jit profiling.
- * On entry:
- *    wINST          <= signed offset
- *    wPROFILE       <= signed hotness countdown (expanded to 32 bits)
- *    condition bits <= set to establish sign of offset (use "NoFlags" entry if not)
- *
- * We have quite a few different cases for branch profiling, OSR detection and
- * suspend check support here.
- *
- * Taken backward branches:
- *    If profiling active, do hotness countdown and report if we hit zero.
- *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
- *    Is there a pending suspend request?  If so, suspend.
- *
- * Taken forward branches and not-taken backward branches:
- *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
- *
- * Our most common case is expected to be a taken backward branch with active jit profiling,
- * but no full OSR check and no pending suspend request.
- * Next most common case is not-taken branch with no full OSR check.
- *
- */
-MterpCommonTakenBranchNoFlags:
-    cmp     wINST, #0
-    b.gt    .L_forward_branch           // don't add forward branches to hotness
-    tbnz    wPROFILE, #31, .L_no_count_backwards  // go if negative
-    subs    wPROFILE, wPROFILE, #1      // countdown
-    b.eq    .L_add_batch                // counted down to zero - report
-.L_resume_backward_branch:
-    ldr     lr, [xSELF, #THREAD_FLAGS_OFFSET]
-    add     w2, wINST, wINST            // w2<- byte offset
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    REFRESH_IBASE
-    ands    lr, lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
-    b.ne    .L_suspend_request_pending
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
-
-.L_suspend_request_pending:
-    EXPORT_PC
-    mov     x0, xSELF
-    bl      MterpSuspendCheck           // (self)
-    cbnz    x0, MterpFallback
-    REFRESH_IBASE                       // might have changed during suspend
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
-
-.L_no_count_backwards:
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.ne    .L_resume_backward_branch
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, xINST
-    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
-    b.ne    MterpOnStackReplacement
-    b       .L_resume_backward_branch
-
-.L_forward_branch:
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_osr_forward
-.L_resume_forward_branch:
-    add     w2, wINST, wINST            // w2<- byte offset
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
-
-.L_check_osr_forward:
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, xINST
-    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
-    b.ne    MterpOnStackReplacement
-    b       .L_resume_forward_branch
-
-.L_add_batch:
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    strh    wPROFILE, [x1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
-    ldr     x0, [xFP, #OFF_FP_METHOD]
-    mov     x2, xSELF
-    bl      MterpAddHotnessBatch        // (method, shadow_frame, self)
-    mov     wPROFILE, w0                // restore new hotness countdown to wPROFILE
-    b       .L_no_count_backwards
-
-/*
- * Entered from the conditional branch handlers when OSR check request active on
- * not-taken path.  All Dalvik not-taken conditional branch offsets are 2.
- */
-.L_check_not_taken_osr:
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, #2
-    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
-    b.ne    MterpOnStackReplacement
-    FETCH_ADVANCE_INST 2
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
-
 
 /*
  * Check for suspend check request.  Assumes wINST already loaded, xPC advanced and
@@ -273,11 +175,10 @@
 check2:
     mov     x0, #1                                  // signal return to caller.
 MterpDone:
-    ldp     fp, lr, [sp, #64]
-    ldp     xPC, xFP, [sp, #48]
-    ldp     xSELF, xINST, [sp, #32]
-    ldp     xIBASE, xREFS, [sp, #16]
-    ldp     xPROFILE, x27, [sp], #80
+    ldp     fp, lr, [sp, #48]
+    ldp     xPC, xFP, [sp, #32]
+    ldp     xSELF, xINST, [sp, #16]
+    ldp     xIBASE, xREFS, [sp], #64
     ret
 
     .cfi_endproc
diff --git a/runtime/interpreter/mterp/arm64/header.S b/runtime/interpreter/mterp/arm64/header.S
index 4257200..7101ba9 100644
--- a/runtime/interpreter/mterp/arm64/header.S
+++ b/runtime/interpreter/mterp/arm64/header.S
@@ -74,7 +74,6 @@
   x23  xINST     first 16-bit code unit of current instruction
   x24  xIBASE    interpreted instruction base pointer, used for computed goto
   x25  xREFS     base of object references in shadow frame  (ideally, we'll get rid of this later).
-  x26  wPROFILE  jit profile hotness countdown
   x16  ip        scratch reg
   x17  ip2       scratch reg (used by macros)
 
@@ -93,17 +92,15 @@
 
 /* During bringup, we'll use the shadow frame model instead of xFP */
 /* single-purpose registers, given names for clarity */
-#define xPC      x20
-#define xFP      x21
-#define xSELF    x22
-#define xINST    x23
-#define wINST    w23
-#define xIBASE   x24
-#define xREFS    x25
-#define wPROFILE w26
-#define xPROFILE x26
-#define ip       x16
-#define ip2      x17
+#define xPC     x20
+#define xFP     x21
+#define xSELF   x22
+#define xINST   x23
+#define wINST   w23
+#define xIBASE  x24
+#define xREFS   x25
+#define ip      x16
+#define ip2     x17
 
 /*
  * Instead of holding a pointer to the shadow frame, we keep xFP at the base of the vregs.  So,
@@ -117,7 +114,7 @@
 #define OFF_FP_RESULT_REGISTER OFF_FP(SHADOWFRAME_RESULT_REGISTER_OFFSET)
 #define OFF_FP_DEX_PC_PTR OFF_FP(SHADOWFRAME_DEX_PC_PTR_OFFSET)
 #define OFF_FP_CODE_ITEM OFF_FP(SHADOWFRAME_CODE_ITEM_OFFSET)
-#define OFF_FP_SHADOWFRAME OFF_FP(0)
+#define OFF_FP_SHADOWFRAME (-SHADOWFRAME_VREGS_OFFSET)
 
 /*
  * "export" the PC to dex_pc field in the shadow frame, f/b/o future exception objects.  Must
diff --git a/runtime/interpreter/mterp/arm64/op_goto.S b/runtime/interpreter/mterp/arm64/op_goto.S
index 6381e94..7e2f6a9 100644
--- a/runtime/interpreter/mterp/arm64/op_goto.S
+++ b/runtime/interpreter/mterp/arm64/op_goto.S
@@ -5,5 +5,21 @@
      * double to get a byte offset.
      */
     /* goto +AA */
-    sbfx    wINST, wINST, #8, #8           // wINST<- ssssssAA (sign-extended)
-    b       MterpCommonTakenBranchNoFlags
+    /* tuning: the lsl/asr pair below could be a single sbfx (#8, #8) */
+    lsl     w0, wINST, #16              // w0<- AAxx0000
+    asr     wINST, w0, #24              // wINST<- ssssssAA (sign-extended)
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]  // Preload flags for MterpCheckSuspendAndContinue
+    adds    w1, wINST, wINST            // Convert dalvik offset to byte offset, setting flags
+    FETCH_ADVANCE_INST_RB w1            // load wINST and advance xPC
+       // If backwards branch, refresh xIBASE
+    b.mi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/op_goto_16.S b/runtime/interpreter/mterp/arm64/op_goto_16.S
index fb9a80a..b2b9924 100644
--- a/runtime/interpreter/mterp/arm64/op_goto_16.S
+++ b/runtime/interpreter/mterp/arm64/op_goto_16.S
@@ -6,4 +6,17 @@
      */
     /* goto/16 +AAAA */
     FETCH_S wINST, 1                    // wINST<- ssssAAAA (sign-extended)
-    b       MterpCommonTakenBranchNoFlags
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w1, wINST, wINST            // w1<- byte offset, flags set
+    FETCH_ADVANCE_INST_RB w1            // update rPC, load rINST
+    b.mi    MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  // extract opcode from rINST
+    GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/op_goto_32.S b/runtime/interpreter/mterp/arm64/op_goto_32.S
index b13cb41..b785857 100644
--- a/runtime/interpreter/mterp/arm64/op_goto_32.S
+++ b/runtime/interpreter/mterp/arm64/op_goto_32.S
@@ -13,4 +13,17 @@
     FETCH w0, 1                         // w0<- aaaa (lo)
     FETCH w1, 2                         // w1<- AAAA (hi)
     orr     wINST, w0, w1, lsl #16      // wINST<- AAAAaaaa
-    b       MterpCommonTakenBranchNoFlags
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w1, wINST, wINST            // w1<- byte offset
+    FETCH_ADVANCE_INST_RB w1            // update rPC, load xINST
+    b.le    MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  // extract opcode from xINST
+    GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/op_packed_switch.S b/runtime/interpreter/mterp/arm64/op_packed_switch.S
index 1456f1a..e8b4f04 100644
--- a/runtime/interpreter/mterp/arm64/op_packed_switch.S
+++ b/runtime/interpreter/mterp/arm64/op_packed_switch.S
@@ -17,4 +17,17 @@
     add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
     bl      $func                       // w0<- code-unit branch offset
     sbfm    xINST, x0, 0, 31
-    b       MterpCommonTakenBranchNoFlags
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xINST
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w1, wINST, wINST            // w1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB w1            // update rPC, load wINST
+    b.le    MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/zcmp.S b/runtime/interpreter/mterp/arm64/zcmp.S
index b303e6a..3f1e1b1 100644
--- a/runtime/interpreter/mterp/arm64/zcmp.S
+++ b/runtime/interpreter/mterp/arm64/zcmp.S
@@ -1,17 +1,29 @@
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a "condition"
+     * fragment that specifies the comparison under which the branch is taken,
+     * e.g. for "if-lez" you would use "le".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S wINST, 1                    // w1<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Branch offset if not taken
     cmp     w2, #0                      // compare (vA, 0)
-    b.${condition} MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, ${condition} // Branch if true, stashing result in callee save reg
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes & set flags
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi    MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/mterp.cc b/runtime/interpreter/mterp/mterp.cc
index 60e6266..10b19c5 100644
--- a/runtime/interpreter/mterp/mterp.cc
+++ b/runtime/interpreter/mterp/mterp.cc
@@ -21,7 +21,6 @@
 #include "entrypoints/entrypoint_utils-inl.h"
 #include "mterp.h"
 #include "jit/jit.h"
-#include "jit/jit_instrumentation.h"
 #include "debugger.h"
 
 namespace art {
@@ -433,7 +432,7 @@
 }
 
 extern "C" void MterpCheckBefore(Thread* self, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
   if (inst->Opcode(inst_data) == Instruction::MOVE_EXCEPTION) {
@@ -445,7 +444,7 @@
 }
 
 extern "C" void MterpLogDivideByZeroException(Thread* self, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -453,7 +452,7 @@
 }
 
 extern "C" void MterpLogArrayIndexException(Thread* self, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -461,7 +460,7 @@
 }
 
 extern "C" void MterpLogNegativeArraySizeException(Thread* self, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -469,7 +468,7 @@
 }
 
 extern "C" void MterpLogNoSuchMethodException(Thread* self, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -477,7 +476,7 @@
 }
 
 extern "C" void MterpLogExceptionThrownException(Thread* self, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -485,7 +484,7 @@
 }
 
 extern "C" void MterpLogNullObjectException(Thread* self, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -493,7 +492,7 @@
 }
 
 extern "C" void MterpLogFallback(Thread* self, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -502,7 +501,7 @@
 }
 
 extern "C" void MterpLogOSR(Thread* self, ShadowFrame* shadow_frame, int32_t offset)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -510,7 +509,7 @@
 }
 
 extern "C" void MterpLogSuspendFallback(Thread* self, ShadowFrame* shadow_frame, uint32_t flags)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   UNUSED(self);
   const Instruction* inst = Instruction::At(shadow_frame->GetDexPCPtr());
   uint16_t inst_data = inst->Fetch16(0);
@@ -522,7 +521,7 @@
 }
 
 extern "C" bool MterpSuspendCheck(Thread* self)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   self->AllowThreadSuspension();
   return MterpShouldSwitchInterpreters();
 }
@@ -618,7 +617,7 @@
 }
 
 extern "C" mirror::Object* artAGetObjectFromMterp(mirror::Object* arr, int32_t index)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   if (UNLIKELY(arr == nullptr)) {
     ThrowNullPointerExceptionFromInterpreter();
     return nullptr;
@@ -632,7 +631,7 @@
 }
 
 extern "C" mirror::Object* artIGetObjectFromMterp(mirror::Object* obj, uint32_t field_offset)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   if (UNLIKELY(obj == nullptr)) {
     ThrowNullPointerExceptionFromInterpreter();
     return nullptr;
@@ -640,82 +639,13 @@
   return obj->GetFieldObject<mirror::Object>(MemberOffset(field_offset));
 }
 
-/*
- * Create a hotness_countdown based on the current method hotness_count and profiling
- * mode.  In short, determine how many hotness events we hit before reporting back
- * to the full instrumentation via MterpAddHotnessBatch.  Called once on entry to the method,
- * and regenerated following batch updates.
- */
-extern "C" int MterpSetUpHotnessCountdown(ArtMethod* method, ShadowFrame* shadow_frame)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
-  uint16_t hotness_count = method->GetCounter();
-  int32_t countdown_value = jit::kJitHotnessDisabled;
-  jit::Jit* jit = Runtime::Current()->GetJit();
-  if (jit != nullptr) {
-    jit::JitInstrumentationCache* cache = jit->GetInstrumentationCache();
-    int32_t warm_threshold = cache->WarmMethodThreshold();
-    int32_t hot_threshold = cache->HotMethodThreshold();
-    if (hotness_count < warm_threshold) {
-      countdown_value  = warm_threshold - hotness_count;
-    } else if (hotness_count < hot_threshold) {
-      countdown_value = hot_threshold - hotness_count;
-    } else {
-      countdown_value = jit::kJitCheckForOSR;
-    }
-  }
-  /*
-   * The actual hotness threshold may exceed the range of our int16_t countdown value.  This is
-   * not a problem, though.  We can just break it down into smaller chunks.
-   */
-  countdown_value = std::min(countdown_value,
-                             static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
-  shadow_frame->SetCachedHotnessCountdown(countdown_value);
-  shadow_frame->SetHotnessCountdown(countdown_value);
-  return countdown_value;
-}
-
-/*
- * Report a batch of hotness events to the instrumentation and then return the new
- * countdown value to the next time we should report.
- */
-extern "C" int16_t MterpAddHotnessBatch(ArtMethod* method,
-                                        ShadowFrame* shadow_frame,
-                                        Thread* self)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
-  jit::Jit* jit = Runtime::Current()->GetJit();
-  if (jit != nullptr) {
-    int16_t count = shadow_frame->GetCachedHotnessCountdown() - shadow_frame->GetHotnessCountdown();
-    jit->GetInstrumentationCache()->AddSamples(self, method, count);
-  }
-  return MterpSetUpHotnessCountdown(method, shadow_frame);
-}
-
-// TUNING: Unused by arm/arm64.  Remove when x86/x86_64/mips/mips64 mterps support batch updates.
 extern "C" bool  MterpProfileBranch(Thread* self, ShadowFrame* shadow_frame, int32_t offset)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+  SHARED_REQUIRES(Locks::mutator_lock_) {
   ArtMethod* method = shadow_frame->GetMethod();
   JValue* result = shadow_frame->GetResultRegister();
   uint32_t dex_pc = shadow_frame->GetDexPC();
-  jit::Jit* jit = Runtime::Current()->GetJit();
-  if ((jit != nullptr) && (offset <= 0)) {
-    jit->GetInstrumentationCache()->AddSamples(self, method, 1);
-  }
-  int16_t countdown_value = MterpSetUpHotnessCountdown(method, shadow_frame);
-  if (countdown_value == jit::kJitCheckForOSR) {
-    return jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, result);
-  } else {
-    return false;
-  }
-}
-
-extern "C" bool MterpMaybeDoOnStackReplacement(Thread* self,
-                                               ShadowFrame* shadow_frame,
-                                               int32_t offset)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
-  ArtMethod* method = shadow_frame->GetMethod();
-  JValue* result = shadow_frame->GetResultRegister();
-  uint32_t dex_pc = shadow_frame->GetDexPC();
-  // Assumes caller has already determined that an OSR check is appropriate.
+  const auto* const instrumentation = Runtime::Current()->GetInstrumentation();
+  instrumentation->Branch(self, method, dex_pc, offset);
   return jit::Jit::MaybeDoOnStackReplacement(self, method, dex_pc, offset, result);
 }
 
diff --git a/runtime/interpreter/mterp/out/mterp_arm.S b/runtime/interpreter/mterp/out/mterp_arm.S
index d8b4700..092474d 100644
--- a/runtime/interpreter/mterp/out/mterp_arm.S
+++ b/runtime/interpreter/mterp/out/mterp_arm.S
@@ -79,8 +79,7 @@
   r6  rSELF     self (Thread) pointer
   r7  rINST     first 16-bit code unit of current instruction
   r8  rIBASE    interpreted instruction base pointer, used for computed goto
-  r10 rPROFILE  branch profiling countdown
-  r11 rREFS     base of object references in shadow frame  (ideally, we'll get rid of this later).
+  r11 rREFS	base of object references in shadow frame  (ideally, we'll get rid of this later).
 
 Macros are provided for common operations.  Each macro MUST emit only
 one instruction to make instruction-counting easier.  They MUST NOT alter
@@ -98,13 +97,12 @@
 
 /* During bringup, we'll use the shadow frame model instead of rFP */
 /* single-purpose registers, given names for clarity */
-#define rPC      r4
-#define rFP      r5
-#define rSELF    r6
-#define rINST    r7
-#define rIBASE   r8
-#define rPROFILE r10
-#define rREFS    r11
+#define rPC     r4
+#define rFP     r5
+#define rSELF   r6
+#define rINST   r7
+#define rIBASE  r8
+#define rREFS   r11
 
 /*
  * Instead of holding a pointer to the shadow frame, we keep rFP at the base of the vregs.  So,
@@ -118,7 +116,7 @@
 #define OFF_FP_RESULT_REGISTER OFF_FP(SHADOWFRAME_RESULT_REGISTER_OFFSET)
 #define OFF_FP_DEX_PC_PTR OFF_FP(SHADOWFRAME_DEX_PC_PTR_OFFSET)
 #define OFF_FP_CODE_ITEM OFF_FP(SHADOWFRAME_CODE_ITEM_OFFSET)
-#define OFF_FP_SHADOWFRAME OFF_FP(0)
+#define OFF_FP_SHADOWFRAME (-SHADOWFRAME_VREGS_OFFSET)
 
 /*
  * "export" the PC to dex_pc field in the shadow frame, f/b/o future exception objects.  Must
@@ -331,8 +329,10 @@
 
 ExecuteMterpImpl:
     .fnstart
-    .save {r3-r10,fp,lr}
-    stmfd   sp!, {r3-r10,fp,lr}         @ save 10 regs, (r3 just to align 64)
+    .save {r4-r10,fp,lr}
+    stmfd   sp!, {r4-r10,fp,lr}         @ save 9 regs
+    .pad    #4
+    sub     sp, sp, #4                  @ align 64
 
     /* Remember the return register */
     str     r3, [r2, #SHADOWFRAME_RESULT_REGISTER_OFFSET]
@@ -353,12 +353,6 @@
     /* Starting ibase */
     ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]
 
-    /* Set up for backwards branches & osr profiling */
-    ldr     r0, [rFP, #OFF_FP_METHOD]
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    bl      MterpSetUpHotnessCountdown
-    mov     rPROFILE, r0                @ Starting hotness countdown to rPROFILE
-
     /* start executing the instruction at rPC */
     FETCH_INST                          @ load rINST from rPC
     GET_INST_OPCODE ip                  @ extract opcode from rINST
@@ -1109,8 +1103,35 @@
      * double to get a byte offset.
      */
     /* goto +AA */
-    sbfx    rINST, rINST, #8, #8           @ rINST<- ssssssAA (sign-extended)
-    b       MterpCommonTakenBranchNoFlags
+    /* tuning: use sbfx for 6t2+ targets */
+#if MTERP_PROFILE_BRANCHES
+    mov     r0, rINST, lsl #16          @ r0<- AAxx0000
+    movs    rINST, r0, asr #24          @ rINST<- ssssssAA (sign-extended)
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r2, rINST, rINST            @ r2<- byte offset, set flags
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+       @ If backwards branch refresh rIBASE
+    bmi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    mov     r0, rINST, lsl #16          @ r0<- AAxx0000
+    movs    rINST, r0, asr #24          @ rINST<- ssssssAA (sign-extended)
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r2, rINST, rINST            @ r2<- byte offset, set flags
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+       @ If backwards branch refresh rIBASE
+    bmi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
 
 /* ------------------------------ */
     .balign 128
@@ -1123,8 +1144,30 @@
      * double to get a byte offset.
      */
     /* goto/16 +AAAA */
+#if MTERP_PROFILE_BRANCHES
     FETCH_S rINST, 1                    @ rINST<- ssssAAAA (sign-extended)
-    b       MterpCommonTakenBranchNoFlags
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset, flags set
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    FETCH_S rINST, 1                    @ rINST<- ssssAAAA (sign-extended)
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset, flags set
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
 
 /* ------------------------------ */
     .balign 128
@@ -1142,10 +1185,34 @@
      * offset to byte offset.
      */
     /* goto/32 +AAAAAAAA */
+#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- aaaa (lo)
-    FETCH r3, 2                         @ r1<- AAAA (hi)
-    orrs    rINST, r0, r3, lsl #16      @ rINST<- AAAAaaaa
-    b       MterpCommonTakenBranch
+    FETCH r1, 2                         @ r1<- AAAA (hi)
+    orr     rINST, r0, r1, lsl #16      @ rINST<- AAAAaaaa
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    FETCH r0, 1                         @ r0<- aaaa (lo)
+    FETCH r1, 2                         @ r1<- AAAA (hi)
+    orr     rINST, r0, r1, lsl #16      @ rINST<- AAAAaaaa
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
 
 /* ------------------------------ */
     .balign 128
@@ -1161,6 +1228,7 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
+#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- bbbb (lo)
     FETCH r1, 2                         @ r1<- BBBB (hi)
     mov     r3, rINST, lsr #8           @ r3<- AA
@@ -1168,8 +1236,36 @@
     GET_VREG r1, r3                     @ r1<- vAA
     add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
     bl      MterpDoPackedSwitch                       @ r0<- code-unit branch offset
-    movs    rINST, r0
-    b       MterpCommonTakenBranch
+    mov     rINST, r0
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    FETCH r0, 1                         @ r0<- bbbb (lo)
+    FETCH r1, 2                         @ r1<- BBBB (hi)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
+    GET_VREG r1, r3                     @ r1<- vAA
+    add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
+    bl      MterpDoPackedSwitch                       @ r0<- code-unit branch offset
+    mov     rINST, r0
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
 
 /* ------------------------------ */
     .balign 128
@@ -1186,6 +1282,7 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
+#if MTERP_PROFILE_BRANCHES
     FETCH r0, 1                         @ r0<- bbbb (lo)
     FETCH r1, 2                         @ r1<- BBBB (hi)
     mov     r3, rINST, lsr #8           @ r3<- AA
@@ -1193,8 +1290,36 @@
     GET_VREG r1, r3                     @ r1<- vAA
     add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
     bl      MterpDoSparseSwitch                       @ r0<- code-unit branch offset
-    movs    rINST, r0
-    b       MterpCommonTakenBranch
+    mov     rINST, r0
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#else
+    FETCH r0, 1                         @ r0<- bbbb (lo)
+    FETCH r1, 2                         @ r1<- BBBB (hi)
+    mov     r3, rINST, lsr #8           @ r3<- AA
+    orr     r0, r0, r1, lsl #16         @ r0<- BBBBbbbb
+    GET_VREG r1, r3                     @ r1<- vAA
+    add     r0, rPC, r0, lsl #1         @ r0<- PC + BBBBbbbb*2
+    bl      MterpDoSparseSwitch                       @ r0<- code-unit branch offset
+    mov     rINST, r0
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    adds    r1, rINST, rINST            @ r1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    ble     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+#endif
 
 
 /* ------------------------------ */
@@ -1360,6 +1485,22 @@
     /*
      * Compare two 64-bit values.  Puts 0, 1, or -1 into the destination
      * register based on the results of the comparison.
+     *
+     * We load the full values with LDM, but in practice many values could
+     * be resolved by only looking at the high word.  This could be made
+     * faster or slower by splitting the LDM into a pair of LDRs.
+     *
+     * If we just wanted to set condition flags, we could do this:
+     *  subs    ip, r0, r2
+     *  sbcs    ip, r1, r3
+     *  subeqs  ip, r0, r2
+     * Leaving { <0, 0, >0 } in ip.  However, we have to set it to a specific
+     * integer value, which we can do with 2 conditional mov/mvn instructions
+     * (set 1, set -1; if they're equal we already have 0 in ip), giving
+     * us a constant 5-cycle path plus a branch at the end to the
+     * instruction epilogue code.  The multi-compare approach below needs
+     * 2 or 3 cycles + branch if the high word doesn't match, 6 + branch
+     * in the worst case (the 64-bit values are equal).
      */
     /* cmp-long vAA, vBB, vCC */
     FETCH r0, 1                         @ r0<- CCBB
@@ -1370,16 +1511,13 @@
     VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
     ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
     ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
-    cmp     r0, r2
-    sbcs    ip, r1, r3                  @ Sets correct CCs for checking LT (but not EQ/NE)
-    mov     ip, #0
-    mvnlt   ip, #0                      @ -1
-    cmpeq   r0, r2                      @ For correct EQ/NE, we may need to repeat the first CMP
-    orrne   ip, #1
-    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
-    SET_VREG ip, r9                     @ vAA<- ip
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
+    cmp     r1, r3                      @ compare (vBB+1, vCC+1)
+    blt     .Lop_cmp_long_less            @ signed compare on high part
+    bgt     .Lop_cmp_long_greater
+    subs    r1, r0, r2                  @ r1<- r0 - r2
+    bhi     .Lop_cmp_long_greater         @ unsigned compare on low part
+    bne     .Lop_cmp_long_less
+    b       .Lop_cmp_long_finish          @ equal; r1 already holds 0
 
 /* ------------------------------ */
     .balign 128
@@ -1387,8 +1525,9 @@
 /* File: arm/op_if_eq.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-le" you would use "gt".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1396,13 +1535,24 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r0, r0                     @ r0<- vA
+    GET_VREG r2, r0                     @ r2<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, r3                      @ compare (vA, vB)
-    beq MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, r3                      @ compare (vA, vB)
+    movne rINST, #2
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r2, rINST, rINST            @ convert to bytes, check sign
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1413,8 +1563,9 @@
 /* File: arm/op_if_ne.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-le" you would use "gt".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1422,13 +1573,24 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r0, r0                     @ r0<- vA
+    GET_VREG r2, r0                     @ r2<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, r3                      @ compare (vA, vB)
-    bne MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, r3                      @ compare (vA, vB)
+    moveq rINST, #2
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r2, rINST, rINST            @ convert to bytes, check sign
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1439,8 +1601,9 @@
 /* File: arm/op_if_lt.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-le" you would use "gt".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1448,13 +1611,24 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r0, r0                     @ r0<- vA
+    GET_VREG r2, r0                     @ r2<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, r3                      @ compare (vA, vB)
-    blt MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, r3                      @ compare (vA, vB)
+    movge rINST, #2
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r2, rINST, rINST            @ convert to bytes, check sign
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1465,8 +1639,9 @@
 /* File: arm/op_if_ge.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-le" you would use "gt".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1474,13 +1649,24 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r0, r0                     @ r0<- vA
+    GET_VREG r2, r0                     @ r2<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, r3                      @ compare (vA, vB)
-    bge MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, r3                      @ compare (vA, vB)
+    movlt rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST                   @ r2<- offset (vA already consumed by the cmp)
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r2, rINST, rINST            @ convert to bytes, check sign
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; loaded after the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ negative offset: backward branch, check for suspend
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1491,8 +1677,9 @@
 /* File: arm/op_if_gt.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-le" you would use "gt".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1500,13 +1687,24 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r0, r0                     @ r0<- vA
+    GET_VREG r2, r0                     @ r2<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, r3                      @ compare (vA, vB)
-    bgt MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, r3                      @ compare (vA, vB)
+    movle rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST                   @ r2<- offset (vA already consumed by the cmp)
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r2, rINST, rINST            @ convert to bytes, check sign
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; loaded after the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ negative offset: backward branch, check for suspend
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1517,8 +1715,9 @@
 /* File: arm/op_if_le.S */
 /* File: arm/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-le" you would use "gt".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1526,13 +1725,24 @@
     mov     r1, rINST, lsr #12          @ r1<- B
     ubfx    r0, rINST, #8, #4           @ r0<- A
     GET_VREG r3, r1                     @ r3<- vB
-    GET_VREG r0, r0                     @ r0<- vA
+    GET_VREG r2, r0                     @ r2<- vA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, r3                      @ compare (vA, vB)
-    ble MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, r3                      @ compare (vA, vB)
+    movgt rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST                   @ r2<- offset (vA already consumed by the cmp)
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r2, rINST, rINST            @ convert to bytes, check sign
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; loaded after the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ negative offset: backward branch, check for suspend
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1543,20 +1753,32 @@
 /* File: arm/op_if_eqz.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-lez" you would use "gt".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r0, r0                     @ r0<- vAA
+    GET_VREG r2, r0                     @ r2<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, #0                      @ compare (vA, 0)
-    beq MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, #0                      @ compare (vA, 0)
+    movne rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r1, rINST, rINST            @ convert to bytes & set flags
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; must follow the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ backward branch: suspend check reads flags from lr
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1567,20 +1789,32 @@
 /* File: arm/op_if_nez.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-lez" you would use "gt".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r0, r0                     @ r0<- vAA
+    GET_VREG r2, r0                     @ r2<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, #0                      @ compare (vA, 0)
-    bne MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, #0                      @ compare (vA, 0)
+    moveq rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r1, rINST, rINST            @ convert to bytes & set flags
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; must follow the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ backward branch: suspend check reads flags from lr
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1591,20 +1825,32 @@
 /* File: arm/op_if_ltz.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-lez" you would use "gt".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r0, r0                     @ r0<- vAA
+    GET_VREG r2, r0                     @ r2<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, #0                      @ compare (vA, 0)
-    blt MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, #0                      @ compare (vA, 0)
+    movge rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r1, rINST, rINST            @ convert to bytes & set flags
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; must follow the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ backward branch: suspend check reads flags from lr
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1615,20 +1861,32 @@
 /* File: arm/op_if_gez.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-lez" you would use "gt".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r0, r0                     @ r0<- vAA
+    GET_VREG r2, r0                     @ r2<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, #0                      @ compare (vA, 0)
-    bge MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, #0                      @ compare (vA, 0)
+    movlt rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r1, rINST, rINST            @ convert to bytes & set flags
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; must follow the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ backward branch: suspend check reads flags from lr
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1639,20 +1897,32 @@
 /* File: arm/op_if_gtz.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-lez" you would use "gt".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r0, r0                     @ r0<- vAA
+    GET_VREG r2, r0                     @ r2<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, #0                      @ compare (vA, 0)
-    bgt MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, #0                      @ compare (vA, 0)
+    movle rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r1, rINST, rINST            @ convert to bytes & set flags
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; must follow the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ backward branch: suspend check reads flags from lr
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -1663,20 +1933,32 @@
 /* File: arm/op_if_lez.S */
 /* File: arm/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a "revcmp"
+     * fragment that specifies the *reverse* comparison to perform, e.g.
+     * for "if-lez" you would use "gt".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     mov     r0, rINST, lsr #8           @ r0<- AA
-    GET_VREG r0, r0                     @ r0<- vAA
+    GET_VREG r2, r0                     @ r2<- vAA
     FETCH_S rINST, 1                    @ rINST<- branch offset, in code units
-    cmp     r0, #0                      @ compare (vA, 0)
-    ble MterpCommonTakenBranchNoFlags
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    beq     .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    cmp     r2, #0                      @ compare (vA, 0)
+    movgt rINST, #2                     @ not taken: offset<- 2 code units (next instruction)
+#if MTERP_PROFILE_BRANCHES
+    @ TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     r0, rSELF
+    add     r1, rFP, #OFF_FP_SHADOWFRAME
+    mov     r2, rINST
+    bl      MterpProfileBranch          @ (self, shadow_frame, offset)
+    cmp     r0, #0
+    bne     MterpOnStackReplacement     @ Note: offset must be in rINST
+#endif
+    adds    r1, rINST, rINST            @ convert to bytes & set flags
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]  @ lr<- thread flags; must follow the bl, which overwrites lr
+    FETCH_ADVANCE_INST_RB r1            @ update rPC, load rINST
+    bmi     MterpCheckSuspendAndContinue @ backward branch: suspend check reads flags from lr
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -4429,15 +4711,15 @@
     VREG_INDEX_TO_ADDR r3, r3           @ r3<- &fp[CC]
     ldmia   r2, {r0-r1}                 @ r0/r1<- vBB/vBB+1
     ldmia   r3, {r2-r3}                 @ r2/r3<- vCC/vCC+1
-    mul     ip, r2, r1                  @ ip<- ZxW
-    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
-    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
+    mul     ip, r2, r1                  @ ip<- ZxW
+    umull   r9, r10, r2, r0             @ r9/r10<- ZxX (r9 = low word, r10 = high word)
+    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
     mov     r0, rINST, lsr #8           @ r0<- AA
-    add     r2, r2, lr                  @ r2<- lr + low(ZxW + (YxX))
+    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
     VREG_INDEX_TO_ADDR r0, r0           @ r0<- &fp[AA]
     FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
     GET_INST_OPCODE ip                  @ extract opcode from rINST
-    stmia   r0, {r1-r2 }                @ vAA/vAA+1<- r1/r2
+    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
     GOTO_OPCODE ip                      @ jump to next instruction
 
 /* ------------------------------ */
@@ -5595,14 +5877,14 @@
     VREG_INDEX_TO_ADDR rINST, r9        @ rINST<- &fp[A]
     ldmia   r1, {r2-r3}                 @ r2/r3<- vBB/vBB+1
     ldmia   rINST, {r0-r1}              @ r0/r1<- vAA/vAA+1
-    mul     ip, r2, r1                  @ ip<- ZxW
-    umull   r1, lr, r2, r0              @ r1/lr <- ZxX
-    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
+    mul     ip, r2, r1                  @ ip<- ZxW
+    umull   r9, r10, r2, r0             @ r9/r10<- ZxX (r9 = low word, r10 = high word)
+    mla     r2, r0, r3, ip              @ r2<- YxX + (ZxW)
     mov     r0, rINST                   @ r0<- &fp[A] (free up rINST)
     FETCH_ADVANCE_INST 1                @ advance rPC, load rINST
-    add     r2, r2, lr                  @ r2<- r2 + low(ZxW + (YxX))
+    add     r10, r2, r10                @  r10<- r10 + low(ZxW + (YxX))
     GET_INST_OPCODE ip                  @ extract opcode from rINST
-    stmia   r0, {r1-r2}                 @ vAA/vAA+1<- r1/r2
+    stmia   r0, {r9-r10}                @ vAA/vAA+1<- r9/r10
     GOTO_OPCODE ip                      @ jump to next instruction
 
 /* ------------------------------ */
@@ -7334,6 +7616,27 @@
     .balign 4
 artMterpAsmSisterStart:
 
+/* continuation for op_cmp_long: store the result held in r1 to vAA, then dispatch */
+
+.Lop_cmp_long_less:
+    mvn     r1, #0                      @ r1<- -1
+    @ Ideally the next mov would be conditional so the branch could be avoided,
+    @ but it is not obvious how; instead, the tail end is simply replicated.
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    SET_VREG r1, r9                     @ vAA<- r1
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+.Lop_cmp_long_greater:
+    mov     r1, #1                      @ r1<- 1
+    @ fall through to _finish
+
+.Lop_cmp_long_finish:
+    FETCH_ADVANCE_INST 2                @ advance rPC, load rINST
+    SET_VREG r1, r9                     @ vAA<- r1
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
 /* continuation for op_float_to_long */
 /*
  * Convert the float in r0 to a long in r0/r1.
@@ -11904,111 +12207,21 @@
     /* NOTE: no fallthrough */
 
 /*
- * Common handling for branches with support for Jit profiling.
- * On entry:
- *    rINST          <= signed offset
- *    rPROFILE       <= signed hotness countdown (expanded to 32 bits)
- *    condition bits <= set to establish sign of offset (use "NoFlags" entry if not)
- *
- * We have quite a few different cases for branch profiling, OSR detection and
- * suspend check support here.
- *
- * Taken backward branches:
- *    If profiling active, do hotness countdown and report if we hit zero.
- *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
- *    Is there a pending suspend request?  If so, suspend.
- *
- * Taken forward branches and not-taken backward branches:
- *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
- *
- * Our most common case is expected to be a taken backward branch with active jit profiling,
- * but no full OSR check and no pending suspend request.
- * Next most common case is not-taken branch with no full OSR check.
- *
+ * Check for a pending suspend request.  Assumes rINST is already loaded and rPC advanced;
+ * the opcode still has to be extracted and dispatched.  Thread flags are expected in lr.
  */
-MterpCommonTakenBranchNoFlags:
-    cmp     rINST, #0
-MterpCommonTakenBranch:
-    bgt     .L_forward_branch           @ don't add forward branches to hotness
-/*
- * We need to subtract 1 from positive values and we should not see 0 here,
- * so we may use the result of the comparison with -1.
- */
-#if JIT_CHECK_OSR != -1
-#  error "JIT_CHECK_OSR must be -1."
-#endif
-    cmp     rPROFILE, #JIT_CHECK_OSR
-    beq     .L_osr_check
-    subgts  rPROFILE, #1
-    beq     .L_add_batch                @ counted down to zero - report
-.L_resume_backward_branch:
-    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]
-    REFRESH_IBASE
-    add     r2, rINST, rINST            @ r2<- byte offset
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
+MterpCheckSuspendAndContinue:
+    ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]  @ refresh rIBASE
     ands    lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
-    bne     .L_suspend_request_pending
+    bne     1f                          @ suspend or checkpoint requested: take the slow path
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
-
-.L_suspend_request_pending:
+1:                                      @ NOTE(review): rIBASE is not re-read after MterpSuspendCheck, unlike the removed code - confirm
     EXPORT_PC
     mov     r0, rSELF
     bl      MterpSuspendCheck           @ (self)
     cmp     r0, #0
     bne     MterpFallback
-    REFRESH_IBASE                       @ might have changed during suspend
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-
-.L_no_count_backwards:
-    cmp     rPROFILE, #JIT_CHECK_OSR    @ possible OSR re-entry?
-    bne     .L_resume_backward_branch
-.L_osr_check:
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
-    bne     MterpOnStackReplacement
-    b       .L_resume_backward_branch
-
-.L_forward_branch:
-    cmp     rPROFILE, #JIT_CHECK_OSR @ possible OSR re-entry?
-    beq     .L_check_osr_forward
-.L_resume_forward_branch:
-    add     r2, rINST, rINST            @ r2<- byte offset
-    FETCH_ADVANCE_INST_RB r2            @ update rPC, load rINST
-    GET_INST_OPCODE ip                  @ extract opcode from rINST
-    GOTO_OPCODE ip                      @ jump to next instruction
-
-.L_check_osr_forward:
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rINST
-    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
-    bne     MterpOnStackReplacement
-    b       .L_resume_forward_branch
-
-.L_add_batch:
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    strh    rPROFILE, [r1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
-    ldr     r0, [rFP, #OFF_FP_METHOD]
-    mov     r2, rSELF
-    bl      MterpAddHotnessBatch        @ (method, shadow_frame, self)
-    mov     rPROFILE, r0                @ restore new hotness countdown to rPROFILE
-    b       .L_no_count_backwards
-
-/*
- * Entered from the conditional branch handlers when OSR check request active on
- * not-taken path.  All Dalvik not-taken conditional branch offsets are 2.
- */
-.L_check_not_taken_osr:
-    mov     r0, rSELF
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, #2
-    bl      MterpMaybeDoOnStackReplacement  @ (self, shadow_frame, offset)
-    bne     MterpOnStackReplacement
-    FETCH_ADVANCE_INST 2
     GET_INST_OPCODE ip                  @ extract opcode from rINST
     GOTO_OPCODE ip                      @ jump to next instruction
 
@@ -12056,27 +12269,9 @@
     str     r1, [r2, #4]
     mov     r0, #1                                  @ signal return to caller.
 MterpDone:
-/*
- * At this point, we expect rPROFILE to be non-zero.  If negative, hotness is disabled or we're
- * checking for OSR.  If greater than zero, we might have unreported hotness to register
- * (the difference between the ending rPROFILE and the cached hotness counter).  rPROFILE
- * should only reach zero immediately after a hotness decrement, and is then reset to either
- * a negative special state or the new non-zero countdown value.
- */
-    cmp     rPROFILE, #0
-    bgt     MterpProfileActive                      @ if > 0, we may have some counts to report.
-    ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
+    add     sp, sp, #4                              @ un-align 64: skip one 4-byte stack slot (presumably prologue padding - confirm)
+    ldmfd   sp!, {r4-r10,fp,pc}                     @ restore 9 regs and return
 
-MterpProfileActive:
-    mov     rINST, r0                               @ stash return value
-    /* Report cached hotness counts */
-    ldr     r0, [rFP, #OFF_FP_METHOD]
-    add     r1, rFP, #OFF_FP_SHADOWFRAME
-    mov     r2, rSELF
-    strh    rPROFILE, [r1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
-    bl      MterpAddHotnessBatch                    @ (method, shadow_frame, self)
-    mov     r0, rINST                               @ restore return value
-    ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
 
     .fnend
     .size   ExecuteMterpImpl, .-ExecuteMterpImpl
diff --git a/runtime/interpreter/mterp/out/mterp_arm64.S b/runtime/interpreter/mterp/out/mterp_arm64.S
index bcceea1..6ae59d8 100644
--- a/runtime/interpreter/mterp/out/mterp_arm64.S
+++ b/runtime/interpreter/mterp/out/mterp_arm64.S
@@ -81,7 +81,6 @@
   x23  xINST     first 16-bit code unit of current instruction
   x24  xIBASE    interpreted instruction base pointer, used for computed goto
   x25  xREFS     base of object references in shadow frame  (ideally, we'll get rid of this later).
-  x26  wPROFILE  jit profile hotness countdown
   x16  ip        scratch reg
   x17  ip2       scratch reg (used by macros)
 
@@ -100,17 +99,15 @@
 
 /* During bringup, we'll use the shadow frame model instead of xFP */
 /* single-purpose registers, given names for clarity */
-#define xPC      x20
-#define xFP      x21
-#define xSELF    x22
-#define xINST    x23
-#define wINST    w23
-#define xIBASE   x24
-#define xREFS    x25
-#define wPROFILE w26
-#define xPROFILE x26
-#define ip       x16
-#define ip2      x17
+#define xPC     x20
+#define xFP     x21
+#define xSELF   x22
+#define xINST   x23
+#define wINST   w23
+#define xIBASE  x24
+#define xREFS   x25
+#define ip      x16
+#define ip2     x17
 
 /*
  * Instead of holding a pointer to the shadow frame, we keep xFP at the base of the vregs.  So,
@@ -124,7 +121,7 @@
 #define OFF_FP_RESULT_REGISTER OFF_FP(SHADOWFRAME_RESULT_REGISTER_OFFSET)
 #define OFF_FP_DEX_PC_PTR OFF_FP(SHADOWFRAME_DEX_PC_PTR_OFFSET)
 #define OFF_FP_CODE_ITEM OFF_FP(SHADOWFRAME_CODE_ITEM_OFFSET)
-#define OFF_FP_SHADOWFRAME OFF_FP(0)
+#define OFF_FP_SHADOWFRAME (-SHADOWFRAME_VREGS_OFFSET)
 
 /*
  * "export" the PC to dex_pc field in the shadow frame, f/b/o future exception objects.  Must
@@ -326,12 +323,11 @@
 
 ExecuteMterpImpl:
     .cfi_startproc
-    stp     xPROFILE, x27, [sp, #-80]!
-    stp     xIBASE, xREFS, [sp, #16]
-    stp     xSELF, xINST, [sp, #32]
-    stp     xPC, xFP, [sp, #48]
-    stp     fp, lr, [sp, #64]
-    add     fp, sp, #64
+    stp     xIBASE, xREFS, [sp, #-64]!  // allocate a 64-byte frame; save xIBASE/xREFS at its base
+    stp     xSELF, xINST, [sp, #16]
+    stp     xPC, xFP, [sp, #32]
+    stp     fp, lr, [sp, #48]
+    add     fp, sp, #48                 // fp<- address of the saved fp/lr pair
 
     /* Remember the return register */
     str     x3, [x2, #SHADOWFRAME_RESULT_REGISTER_OFFSET]
@@ -352,12 +348,6 @@
     /* Starting ibase */
     ldr     xIBASE, [xSELF, #THREAD_CURRENT_IBASE_OFFSET]
 
-    /* Set up for backwards branches & osr profiling */
-    ldr     x0, [xFP, #OFF_FP_METHOD]
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    bl      MterpSetUpHotnessCountdown
-    mov     wPROFILE, w0                // Starting hotness countdown to xPROFILE
-
     /* start executing the instruction at rPC */
     FETCH_INST                          // load wINST from rPC
     GET_INST_OPCODE ip                  // extract opcode from wINST
@@ -1091,8 +1081,24 @@
      * double to get a byte offset.
      */
     /* goto +AA */
-    sbfx    wINST, wINST, #8, #8           // wINST<- ssssssAA (sign-extended)
-    b       MterpCommonTakenBranchNoFlags
+    /* tuning: a single sbfx (wINST, wINST, #8, #8) performs the same sign-extension */
+    lsl     w0, wINST, #16              // w0<- AAxx0000
+    asr     wINST, w0, #24              // wINST<- ssssssAA (sign-extended)
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // x2<- offset, sign-extended from 32 bits
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]  // Preload flags for MterpCheckSuspendAndContinue
+    adds    w1, wINST, wINST            // Convert dalvik offset to byte offset, setting flags
+    FETCH_ADVANCE_INST_RB w1            // load wINST and advance xPC
+    // Negative offset (backward branch): go through the suspend-check path
+    b.mi     MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
 
 /* ------------------------------ */
     .balign 128
@@ -1106,7 +1112,20 @@
      */
     /* goto/16 +AAAA */
     FETCH_S wINST, 1                    // wINST<- ssssAAAA (sign-extended)
-    b       MterpCommonTakenBranchNoFlags
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // x2<- offset, sign-extended from 32 bits
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]  // w7<- thread flags, loaded after the call above
+    adds    w1, wINST, wINST            // w1<- byte offset, flags set
+    FETCH_ADVANCE_INST_RB w1            // update xPC, load wINST
+    b.mi    MterpCheckSuspendAndContinue // negative offset: backward branch, check for suspend
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
 
 /* ------------------------------ */
     .balign 128
@@ -1127,7 +1146,20 @@
     FETCH w0, 1                         // w0<- aaaa (lo)
     FETCH w1, 2                         // w1<- AAAA (hi)
     orr     wINST, w0, w1, lsl #16      // wINST<- AAAAaaaa
-    b       MterpCommonTakenBranchNoFlags
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // x2<- offset, sign-extended from 32 bits
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]  // w7<- thread flags, loaded after the call above
+    adds    w1, wINST, wINST            // w1<- byte offset, flags set
+    FETCH_ADVANCE_INST_RB w1            // update xPC, load wINST
+    b.le    MterpCheckSuspendAndContinue // offset <= 0 (backward or zero offset): check for suspend
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
 
 /* ------------------------------ */
     .balign 128
@@ -1151,7 +1183,20 @@
     add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
     bl      MterpDoPackedSwitch                       // w0<- code-unit branch offset
     sbfm    xINST, x0, 0, 31
-    b       MterpCommonTakenBranchNoFlags
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xINST
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w1, wINST, wINST            // w1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB w1            // update rPC, load wINST
+    b.le    MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
 
 /* ------------------------------ */
     .balign 128
@@ -1176,7 +1221,20 @@
     add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
     bl      MterpDoSparseSwitch                       // w0<- code-unit branch offset
     sbfm    xINST, x0, 0, 31
-    b       MterpCommonTakenBranchNoFlags
+#if MTERP_PROFILE_BRANCHES
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    mov     x2, xINST
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w1, wINST, wINST            // w1<- byte offset; clear V
+    FETCH_ADVANCE_INST_RB w1            // update rPC, load wINST
+    b.le    MterpCheckSuspendAndContinue
+    GET_INST_OPCODE ip                  // extract opcode from wINST
+    GOTO_OPCODE ip                      // jump to next instruction
 
 
 /* ------------------------------ */
@@ -1307,8 +1365,9 @@
 /* File: arm64/op_if_eq.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-le" you would use "le".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1317,12 +1376,23 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Offset if branch not taken
     cmp     w2, w3                      // compare (vA, vB)
-    b.eq MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, eq // Branch if true, stashing result in callee save reg.
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes, check sign
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1333,8 +1403,9 @@
 /* File: arm64/op_if_ne.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-le" you would use "le".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1343,12 +1414,23 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Offset if branch not taken
     cmp     w2, w3                      // compare (vA, vB)
-    b.ne MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, ne // Branch if true, stashing result in callee save reg.
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes, check sign
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1359,8 +1441,9 @@
 /* File: arm64/op_if_lt.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-le" you would use "le".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1369,12 +1452,23 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Offset if branch not taken
     cmp     w2, w3                      // compare (vA, vB)
-    b.lt MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, lt // Branch if true, stashing result in callee save reg.
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes, check sign
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1385,8 +1479,9 @@
 /* File: arm64/op_if_ge.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-le" you would use "le".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1395,12 +1490,23 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Offset if branch not taken
     cmp     w2, w3                      // compare (vA, vB)
-    b.ge MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, ge // Branch if true, stashing result in callee save reg.
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes, check sign
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1411,8 +1517,9 @@
 /* File: arm64/op_if_gt.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-le" you would use "le".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1421,12 +1528,23 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Offset if branch not taken
     cmp     w2, w3                      // compare (vA, vB)
-    b.gt MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, gt // Branch if true, stashing result in callee save reg.
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes, check sign
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1437,8 +1555,9 @@
 /* File: arm64/op_if_le.S */
 /* File: arm64/bincmp.S */
     /*
-     * Generic two-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic two-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-le" you would use "le".
      *
      * For: if-eq, if-ne, if-lt, if-ge, if-gt, if-le
      */
@@ -1447,12 +1566,23 @@
     ubfx    w0, wINST, #8, #4           // w0<- A
     GET_VREG w3, w1                     // w3<- vB
     GET_VREG w2, w0                     // w2<- vA
-    FETCH_S wINST, 1                    // wINST<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Offset if branch not taken
     cmp     w2, w3                      // compare (vA, vB)
-    b.le MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, le // Branch if true, stashing result in callee save reg.
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31            // Sign extend branch offset
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in xINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes, check sign
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi     MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1463,20 +1593,32 @@
 /* File: arm64/op_if_eqz.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-lez" you would use "le".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S wINST, 1                    // w1<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Branch offset if not taken
     cmp     w2, #0                      // compare (vA, 0)
-    b.eq MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, eq // Branch if true, stashing result in callee save reg
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes & set flags
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi    MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1487,20 +1629,32 @@
 /* File: arm64/op_if_nez.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-lez" you would use "le".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S wINST, 1                    // w1<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Branch offset if not taken
     cmp     w2, #0                      // compare (vA, 0)
-    b.ne MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, ne // Branch if true, stashing result in callee save reg
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes & set flags
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi    MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1511,20 +1665,32 @@
 /* File: arm64/op_if_ltz.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-lez" you would use "le".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S wINST, 1                    // w1<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Branch offset if not taken
     cmp     w2, #0                      // compare (vA, 0)
-    b.lt MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, lt // Branch if true, stashing result in callee save reg
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes & set flags
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi    MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1535,20 +1701,32 @@
 /* File: arm64/op_if_gez.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-lez" you would use "le".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S wINST, 1                    // w1<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Branch offset if not taken
     cmp     w2, #0                      // compare (vA, 0)
-    b.ge MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, ge // Branch if true, stashing result in callee save reg
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes & set flags
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi    MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1559,20 +1737,32 @@
 /* File: arm64/op_if_gtz.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-lez" you would use "le".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S wINST, 1                    // w1<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Branch offset if not taken
     cmp     w2, #0                      // compare (vA, 0)
-    b.gt MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, gt // Branch if true, stashing result in callee save reg
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes & set flags
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi    MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -1583,20 +1773,32 @@
 /* File: arm64/op_if_lez.S */
 /* File: arm64/zcmp.S */
     /*
-     * Generic one-operand compare-and-branch operation.  Provide a "condition"
-     * fragment that specifies the comparison to perform.
+     * Generic one-operand compare-and-branch operation.  Provide a condition
+     * fragment that specifies the comparison to perform (used as-is by the
+     * csel below), e.g. for "if-lez" you would use "le".
      *
      * for: if-eqz, if-nez, if-ltz, if-gez, if-gtz, if-lez
      */
     /* if-cmp vAA, +BBBB */
     lsr     w0, wINST, #8               // w0<- AA
     GET_VREG w2, w0                     // w2<- vAA
-    FETCH_S wINST, 1                    // w1<- branch offset, in code units
+    FETCH_S w1, 1                       // w1<- branch offset, in code units
+    mov     w0, #2                      // Branch offset if not taken
     cmp     w2, #0                      // compare (vA, 0)
-    b.le MterpCommonTakenBranchNoFlags
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_not_taken_osr
-    FETCH_ADVANCE_INST 2
+    csel    wINST, w1, w0, le // Branch if true, stashing result in callee save reg
+#if MTERP_PROFILE_BRANCHES
+    // TUNING: once measurements are complete, remove #if and hand-schedule.
+    EXPORT_PC
+    mov     x0, xSELF
+    add     x1, xFP, #OFF_FP_SHADOWFRAME
+    sbfm    x2, xINST, 0, 31
+    bl      MterpProfileBranch          // (self, shadow_frame, offset)
+    cbnz    w0, MterpOnStackReplacement // Note: offset must be in wINST
+#endif
+    ldr     w7, [xSELF, #THREAD_FLAGS_OFFSET]
+    adds    w2, wINST, wINST            // convert to bytes & set flags
+    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
+    b.mi    MterpCheckSuspendAndContinue
     GET_INST_OPCODE ip                  // extract opcode from wINST
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -11394,104 +11596,6 @@
     GET_INST_OPCODE ip
     GOTO_OPCODE ip
     /* NOTE: no fallthrough */
-/*
- * Common handling for branches with support for Jit profiling.
- * On entry:
- *    wINST          <= signed offset
- *    wPROFILE       <= signed hotness countdown (expanded to 32 bits)
- *    condition bits <= set to establish sign of offset (use "NoFlags" entry if not)
- *
- * We have quite a few different cases for branch profiling, OSR detection and
- * suspend check support here.
- *
- * Taken backward branches:
- *    If profiling active, do hotness countdown and report if we hit zero.
- *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
- *    Is there a pending suspend request?  If so, suspend.
- *
- * Taken forward branches and not-taken backward branches:
- *    If in osr check mode, see if our target is a compiled loop header entry and do OSR if so.
- *
- * Our most common case is expected to be a taken backward branch with active jit profiling,
- * but no full OSR check and no pending suspend request.
- * Next most common case is not-taken branch with no full OSR check.
- *
- */
-MterpCommonTakenBranchNoFlags:
-    cmp     wINST, #0
-    b.gt    .L_forward_branch           // don't add forward branches to hotness
-    tbnz    wPROFILE, #31, .L_no_count_backwards  // go if negative
-    subs    wPROFILE, wPROFILE, #1      // countdown
-    b.eq    .L_add_batch                // counted down to zero - report
-.L_resume_backward_branch:
-    ldr     lr, [xSELF, #THREAD_FLAGS_OFFSET]
-    add     w2, wINST, wINST            // w2<- byte offset
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    REFRESH_IBASE
-    ands    lr, lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)
-    b.ne    .L_suspend_request_pending
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
-
-.L_suspend_request_pending:
-    EXPORT_PC
-    mov     x0, xSELF
-    bl      MterpSuspendCheck           // (self)
-    cbnz    x0, MterpFallback
-    REFRESH_IBASE                       // might have changed during suspend
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
-
-.L_no_count_backwards:
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.ne    .L_resume_backward_branch
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, xINST
-    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
-    b.ne    MterpOnStackReplacement
-    b       .L_resume_backward_branch
-
-.L_forward_branch:
-    cmp     wPROFILE, #JIT_CHECK_OSR    // possible OSR re-entry?
-    b.eq    .L_check_osr_forward
-.L_resume_forward_branch:
-    add     w2, wINST, wINST            // w2<- byte offset
-    FETCH_ADVANCE_INST_RB w2            // update rPC, load wINST
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
-
-.L_check_osr_forward:
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, xINST
-    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
-    b.ne    MterpOnStackReplacement
-    b       .L_resume_forward_branch
-
-.L_add_batch:
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    strh    wPROFILE, [x1, #SHADOWFRAME_HOTNESS_COUNTDOWN_OFFSET]
-    ldr     x0, [xFP, #OFF_FP_METHOD]
-    mov     x2, xSELF
-    bl      MterpAddHotnessBatch        // (method, shadow_frame, self)
-    mov     wPROFILE, w0                // restore new hotness countdown to wPROFILE
-    b       .L_no_count_backwards
-
-/*
- * Entered from the conditional branch handlers when OSR check request active on
- * not-taken path.  All Dalvik not-taken conditional branch offsets are 2.
- */
-.L_check_not_taken_osr:
-    mov     x0, xSELF
-    add     x1, xFP, #OFF_FP_SHADOWFRAME
-    mov     x2, #2
-    bl      MterpMaybeDoOnStackReplacement  // (self, shadow_frame, offset)
-    b.ne    MterpOnStackReplacement
-    FETCH_ADVANCE_INST 2
-    GET_INST_OPCODE ip                  // extract opcode from wINST
-    GOTO_OPCODE ip                      // jump to next instruction
-
 
 /*
  * Check for suspend check request.  Assumes wINST already loaded, xPC advanced and
@@ -11560,11 +11664,10 @@
 check2:
     mov     x0, #1                                  // signal return to caller.
 MterpDone:
-    ldp     fp, lr, [sp, #64]
-    ldp     xPC, xFP, [sp, #48]
-    ldp     xSELF, xINST, [sp, #32]
-    ldp     xIBASE, xREFS, [sp, #16]
-    ldp     xPROFILE, x27, [sp], #80
+    ldp     fp, lr, [sp, #48]
+    ldp     xPC, xFP, [sp, #32]
+    ldp     xSELF, xINST, [sp, #16]
+    ldp     xIBASE, xREFS, [sp], #64
     ret
 
     .cfi_endproc
diff --git a/runtime/jit/jit_instrumentation.cc b/runtime/jit/jit_instrumentation.cc
index cce2fb2..d751e5a 100644
--- a/runtime/jit/jit_instrumentation.cc
+++ b/runtime/jit/jit_instrumentation.cc
@@ -80,9 +80,9 @@
   DISALLOW_IMPLICIT_CONSTRUCTORS(JitCompileTask);
 };
 
-JitInstrumentationCache::JitInstrumentationCache(uint16_t hot_method_threshold,
-                                                 uint16_t warm_method_threshold,
-                                                 uint16_t osr_method_threshold)
+JitInstrumentationCache::JitInstrumentationCache(size_t hot_method_threshold,
+                                                 size_t warm_method_threshold,
+                                                 size_t osr_method_threshold)
     : hot_method_threshold_(hot_method_threshold),
       warm_method_threshold_(warm_method_threshold),
       osr_method_threshold_(osr_method_threshold),
@@ -130,62 +130,43 @@
   }
 }
 
-void JitInstrumentationCache::AddSamples(Thread* self, ArtMethod* method, uint16_t count) {
+void JitInstrumentationCache::AddSamples(Thread* self, ArtMethod* method, size_t) {
   // Since we don't have on-stack replacement, some methods can remain in the interpreter longer
   // than we want resulting in samples even after the method is compiled.
   if (method->IsClassInitializer() || method->IsNative()) {
     return;
   }
   DCHECK(thread_pool_ != nullptr);
-  DCHECK_GT(warm_method_threshold_, 0);
-  DCHECK_GT(hot_method_threshold_, warm_method_threshold_);
-  DCHECK_GT(osr_method_threshold_, hot_method_threshold_);
 
-  int32_t starting_count = method->GetCounter();
-  int32_t new_count = starting_count + count;   // int32 here to avoid wrap-around;
-  if (starting_count < warm_method_threshold_) {
-    if (new_count >= warm_method_threshold_) {
-      bool success = ProfilingInfo::Create(self, method, /* retry_allocation */ false);
-      if (success) {
-        VLOG(jit) << "Start profiling " << PrettyMethod(method);
-      }
+  uint16_t sample_count = method->IncrementCounter();
+  if (sample_count == warm_method_threshold_) {
+    bool success = ProfilingInfo::Create(self, method, /* retry_allocation */ false);
+    if (success) {
+      VLOG(jit) << "Start profiling " << PrettyMethod(method);
+    }
 
-      if (thread_pool_ == nullptr) {
-        // Calling ProfilingInfo::Create might put us in a suspended state, which could
-        // lead to the thread pool being deleted when we are shutting down.
-        DCHECK(Runtime::Current()->IsShuttingDown(self));
-        return;
-      }
+    if (thread_pool_ == nullptr) {
+      // Calling ProfilingInfo::Create might put us in a suspended state, which could
+      // lead to the thread pool being deleted when we are shutting down.
+      DCHECK(Runtime::Current()->IsShuttingDown(self));
+      return;
+    }
 
-      if (!success) {
-        // We failed allocating. Instead of doing the collection on the Java thread, we push
-        // an allocation to a compiler thread, that will do the collection.
-        thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kAllocateProfile));
-      }
+    if (!success) {
+      // We failed allocating. Instead of doing the collection on the Java thread, we push
+      // an allocation to a compiler thread, that will do the collection.
+      thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kAllocateProfile));
     }
-    // Avoid jumping more than one state at a time.
-    method->SetCounter(std::min(new_count, hot_method_threshold_ - 1));
-  } else if (starting_count < hot_method_threshold_) {
-    if (new_count >= hot_method_threshold_) {
-      DCHECK(thread_pool_ != nullptr);
-      thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompile));
-    }
-    // Avoid jumping more than one state at a time.
-    method->SetCounter(std::min(new_count, osr_method_threshold_ - 1));
-  } else if (starting_count < osr_method_threshold_) {
-    if (new_count >= osr_method_threshold_) {
-      DCHECK(thread_pool_ != nullptr);
-      thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompileOsr));
-      // Disable counting and enable OSR check.
-      // TUNING: might be better to disable counting here, and enable OSR check once OSR
-      // compilation is complete.  However, counting here does provide a signal that could
-      // be used to tell if the method is still hot.
-      method->SetCounter(kJitCheckForOSR);
-    }
-  } else {
-    // Make sure we don't wrap around.
-    method->SetCounter(
-        std::min(new_count, static_cast<int32_t>(std::numeric_limits<uint16_t>::max())));
+  }
+
+  if (sample_count == hot_method_threshold_) {
+    DCHECK(thread_pool_ != nullptr);
+    thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompile));
+  }
+
+  if (sample_count == osr_method_threshold_) {
+    DCHECK(thread_pool_ != nullptr);
+    thread_pool_->AddTask(self, new JitCompileTask(method, JitCompileTask::kCompileOsr));
   }
 }
 
diff --git a/runtime/jit/jit_instrumentation.h b/runtime/jit/jit_instrumentation.h
index 4775b5d..d1c5c44 100644
--- a/runtime/jit/jit_instrumentation.h
+++ b/runtime/jit/jit_instrumentation.h
@@ -40,8 +40,6 @@
 class Thread;
 
 namespace jit {
-static constexpr int16_t kJitCheckForOSR = -1;
-static constexpr int16_t kJitHotnessDisabled = -2;
 
 class JitInstrumentationCache;
 
@@ -86,6 +84,7 @@
 
   static constexpr uint32_t kJitEvents =
       instrumentation::Instrumentation::kMethodEntered |
+      instrumentation::Instrumentation::kBranch |
       instrumentation::Instrumentation::kInvokeVirtualOrInterface;
 
  private:
@@ -97,10 +96,10 @@
 // Keeps track of which methods are hot.
 class JitInstrumentationCache {
  public:
-  JitInstrumentationCache(uint16_t hot_method_threshold,
-                          uint16_t warm_method_threshold,
-                          uint16_t osr_method_threshold);
-  void AddSamples(Thread* self, ArtMethod* method, uint16_t samples)
+  JitInstrumentationCache(size_t hot_method_threshold,
+                          size_t warm_method_threshold,
+                          size_t osr_method_threshold);
+  void AddSamples(Thread* self, ArtMethod* method, size_t samples)
       SHARED_REQUIRES(Locks::mutator_lock_);
   void CreateThreadPool();
   void DeleteThreadPool(Thread* self);
@@ -109,17 +108,13 @@
     return hot_method_threshold_;
   }
 
-  size_t WarmMethodThreshold() const {
-    return warm_method_threshold_;
-  }
-
   // Wait until there is no more pending compilation tasks.
   void WaitForCompilationToFinish(Thread* self);
 
  private:
-  int16_t hot_method_threshold_;
-  int16_t warm_method_threshold_;
-  int16_t osr_method_threshold_;
+  size_t hot_method_threshold_;
+  size_t warm_method_threshold_;
+  size_t osr_method_threshold_;
   JitInstrumentationListener listener_;
   std::unique_ptr<ThreadPool> thread_pool_;
 
diff --git a/runtime/stack.h b/runtime/stack.h
index ec653e7..4fa1a4f 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -187,22 +187,6 @@
     return (dex_pc_ptr_ == nullptr) ? dex_pc_ : dex_pc_ptr_ - code_item_->insns_;
   }
 
-  int16_t GetCachedHotnessCountdown() const {
-    return cached_hotness_countdown_;
-  }
-
-  void SetCachedHotnessCountdown(int16_t cached_hotness_countdown) {
-    cached_hotness_countdown_ = cached_hotness_countdown;
-  }
-
-  int16_t GetHotnessCountdown() const {
-    return hotness_countdown_;
-  }
-
-  void SetHotnessCountdown(int16_t hotness_countdown) {
-    hotness_countdown_ = hotness_countdown;
-  }
-
   void SetDexPC(uint32_t dex_pc) {
     dex_pc_ = dex_pc;
     dex_pc_ptr_ = nullptr;
@@ -413,14 +397,6 @@
     return OFFSETOF_MEMBER(ShadowFrame, code_item_);
   }
 
-  static size_t CachedHotnessCountdownOffset() {
-    return OFFSETOF_MEMBER(ShadowFrame, cached_hotness_countdown_);
-  }
-
-  static size_t HotnessCountdownOffset() {
-    return OFFSETOF_MEMBER(ShadowFrame, hotness_countdown_);
-  }
-
   // Create ShadowFrame for interpreter using provided memory.
   static ShadowFrame* CreateShadowFrameImpl(uint32_t num_vregs,
                                             ShadowFrame* link,
@@ -430,7 +406,7 @@
     return new (memory) ShadowFrame(num_vregs, link, method, dex_pc, true);
   }
 
-  const uint16_t* GetDexPCPtr() {
+  uint16_t* GetDexPCPtr() {
     return dex_pc_ptr_;
   }
 
@@ -467,13 +443,11 @@
   ShadowFrame* link_;
   ArtMethod* method_;
   JValue* result_register_;
-  const uint16_t* dex_pc_ptr_;
+  uint16_t* dex_pc_ptr_;
   const DexFile::CodeItem* code_item_;
   LockCountData lock_count_data_;  // This may contain GC roots when lock counting is active.
   const uint32_t number_of_vregs_;
   uint32_t dex_pc_;
-  int16_t cached_hotness_countdown_;
-  int16_t hotness_countdown_;
 
   // This is a two-part array:
   //  - [0..number_of_vregs) holds the raw virtual registers, and each element here is always 4