Inlining a few small methods based on profiling dex2oat with perf.

Test: m test-art-host
Change-Id: I6313158e59592d8d132154523be9c82dda3c7eb8
diff --git a/compiler/optimizing/instruction_builder.cc b/compiler/optimizing/instruction_builder.cc
index c60f6e5..7dc2fa8 100644
--- a/compiler/optimizing/instruction_builder.cc
+++ b/compiler/optimizing/instruction_builder.cc
@@ -37,37 +37,45 @@
   return block_builder_->GetBlockAt(dex_pc);
 }
 
-ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsFor(HBasicBlock* block) {
+inline ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsFor(HBasicBlock* block) {
   ArenaVector<HInstruction*>* locals = &locals_for_[block->GetBlockId()];
   const size_t vregs = graph_->GetNumberOfVRegs();
-  if (locals->size() != vregs) {
-    locals->resize(vregs, nullptr);
+  if (locals->size() == vregs) {
+    return locals;
+  }
+  return GetLocalsForWithAllocation(block, locals, vregs);
+}
 
-    if (block->IsCatchBlock()) {
-      // We record incoming inputs of catch phis at throwing instructions and
-      // must therefore eagerly create the phis. Phis for undefined vregs will
-      // be deleted when the first throwing instruction with the vreg undefined
-      // is encountered. Unused phis will be removed by dead phi analysis.
-      for (size_t i = 0; i < vregs; ++i) {
-        // No point in creating the catch phi if it is already undefined at
-        // the first throwing instruction.
-        HInstruction* current_local_value = (*current_locals_)[i];
-        if (current_local_value != nullptr) {
-          HPhi* phi = new (arena_) HPhi(
-              arena_,
-              i,
-              0,
-              current_local_value->GetType());
-          block->AddPhi(phi);
-          (*locals)[i] = phi;
-        }
+ArenaVector<HInstruction*>* HInstructionBuilder::GetLocalsForWithAllocation(
+    HBasicBlock* block,
+    ArenaVector<HInstruction*>* locals,
+    const size_t vregs) {
+  DCHECK_NE(locals->size(), vregs);
+  locals->resize(vregs, nullptr);
+  if (block->IsCatchBlock()) {
+    // We record incoming inputs of catch phis at throwing instructions and
+    // must therefore eagerly create the phis. Phis for undefined vregs will
+    // be deleted when the first throwing instruction with the vreg undefined
+    // is encountered. Unused phis will be removed by dead phi analysis.
+    for (size_t i = 0; i < vregs; ++i) {
+      // No point in creating the catch phi if it is already undefined at
+      // the first throwing instruction.
+      HInstruction* current_local_value = (*current_locals_)[i];
+      if (current_local_value != nullptr) {
+        HPhi* phi = new (arena_) HPhi(
+            arena_,
+            i,
+            0,
+            current_local_value->GetType());
+        block->AddPhi(phi);
+        (*locals)[i] = phi;
       }
     }
   }
   return locals;
 }
 
-HInstruction* HInstructionBuilder::ValueOfLocalAt(HBasicBlock* block, size_t local) {
+inline HInstruction* HInstructionBuilder::ValueOfLocalAt(HBasicBlock* block, size_t local) {
   ArenaVector<HInstruction*>* locals = GetLocalsFor(block);
   return (*locals)[local];
 }
diff --git a/compiler/optimizing/instruction_builder.h b/compiler/optimizing/instruction_builder.h
index e735a0c..7fdc188 100644
--- a/compiler/optimizing/instruction_builder.h
+++ b/compiler/optimizing/instruction_builder.h
@@ -93,6 +93,10 @@
   HBasicBlock* FindBlockStartingAt(uint32_t dex_pc) const;
 
   ArenaVector<HInstruction*>* GetLocalsFor(HBasicBlock* block);
+  // Out of line version of GetLocalsFor(), which has a fast path that is
+  // beneficial to get inlined by callers.
+  ArenaVector<HInstruction*>* GetLocalsForWithAllocation(
+      HBasicBlock* block, ArenaVector<HInstruction*>* locals, const size_t vregs);
   HInstruction* ValueOfLocalAt(HBasicBlock* block, size_t local);
   HInstruction* LoadLocal(uint32_t register_index, Primitive::Type type) const;
   HInstruction* LoadNullCheckedLocal(uint32_t register_index, uint32_t dex_pc);
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 26c9ab8..bffda93 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -69,7 +69,7 @@
 }
 
 void HLoopOptimization::Run() {
-  // Well-behaved loops only.
+  // Skip if there is no loop or the graph has try-catch/irreducible loops.
   // TODO: make this less of a sledgehammer.
   if (!graph_->HasLoops() || graph_->HasTryCatch() || graph_->HasIrreducibleLoops()) {
     return;
@@ -85,6 +85,7 @@
   LocalRun();
 
   if (top_loop_ == nullptr) {
+    // All loops have been eliminated.
     graph_->SetHasLoops(false);
   }
 
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index c39aed2..11a0d35 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1734,11 +1734,11 @@
 // A HEnvironment object contains the values of virtual registers at a given location.
 class HEnvironment : public ArenaObject<kArenaAllocEnvironment> {
  public:
-  HEnvironment(ArenaAllocator* arena,
-               size_t number_of_vregs,
-               ArtMethod* method,
-               uint32_t dex_pc,
-               HInstruction* holder)
+  ALWAYS_INLINE HEnvironment(ArenaAllocator* arena,
+                             size_t number_of_vregs,
+                             ArtMethod* method,
+                             uint32_t dex_pc,
+                             HInstruction* holder)
      : vregs_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentVRegs)),
        locations_(number_of_vregs, arena->Adapter(kArenaAllocEnvironmentLocations)),
        parent_(nullptr),
@@ -1747,7 +1747,7 @@
        holder_(holder) {
   }
 
-  HEnvironment(ArenaAllocator* arena, const HEnvironment& to_copy, HInstruction* holder)
+  ALWAYS_INLINE HEnvironment(ArenaAllocator* arena, const HEnvironment& to_copy, HInstruction* holder)
       : HEnvironment(arena,
                      to_copy.Size(),
                      to_copy.GetMethod(),
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index b62bf4e..a239bd5 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -331,7 +331,7 @@
         instruction, /* environment */ nullptr, input_index, block->GetLifetimeEnd(), first_use_);
   }
 
-  void AddRange(size_t start, size_t end) {
+  ALWAYS_INLINE void AddRange(size_t start, size_t end) {
     if (first_range_ == nullptr) {
       first_range_ = last_range_ = range_search_start_ =
           new (allocator_) LiveRange(start, end, first_range_);
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index c59664b..65d35ea 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -516,12 +516,12 @@
 // construction and releases it upon destruction.
 class SCOPED_CAPABILITY ReaderMutexLock {
  public:
-  ReaderMutexLock(Thread* self, ReaderWriterMutex& mu) ACQUIRE(mu) :
+  ReaderMutexLock(Thread* self, ReaderWriterMutex& mu) ACQUIRE(mu) ALWAYS_INLINE :
       self_(self), mu_(mu) {
     mu_.SharedLock(self_);
   }
 
-  ~ReaderMutexLock() RELEASE() {
+  ~ReaderMutexLock() RELEASE() ALWAYS_INLINE {
     mu_.SharedUnlock(self_);
   }
 
diff --git a/runtime/scoped_thread_state_change.h b/runtime/scoped_thread_state_change.h
index a3286ac..5f03741 100644
--- a/runtime/scoped_thread_state_change.h
+++ b/runtime/scoped_thread_state_change.h
@@ -141,6 +141,8 @@
   ALWAYS_INLINE explicit ScopedObjectAccessUnchecked(Thread* self)
       REQUIRES(!Locks::thread_suspend_count_lock_);
 
+  ALWAYS_INLINE ~ScopedObjectAccessUnchecked() REQUIRES(!Locks::thread_suspend_count_lock_) {}
+
   // Used when we want a scoped JNI thread state but have no thread/JNIEnv. Consequently doesn't
   // change into Runnable or acquire a share on the mutator_lock_.
   explicit ScopedObjectAccessUnchecked(JavaVM* vm) ALWAYS_INLINE
diff --git a/runtime/stack_map.h b/runtime/stack_map.h
index 67f0b57..d936ce9 100644
--- a/runtime/stack_map.h
+++ b/runtime/stack_map.h
@@ -571,7 +571,7 @@
     }
   }
 
-  bool IsDexRegisterLive(uint16_t dex_register_number) const {
+  ALWAYS_INLINE bool IsDexRegisterLive(uint16_t dex_register_number) const {
     size_t live_bit_mask_offset_in_bits = GetLiveBitMaskOffset() * kBitsPerByte;
     return region_.LoadBit(live_bit_mask_offset_in_bits + dex_register_number);
   }