Avoid generating dead code on frame enter/exit.
This includes stack operations and, on x86, call/pop to read PC.

bug=26997690

Rationale:
(1) If a method is fully intrinsified, and neither makes calls in the
    slow path nor uses a special input, there is no need to require
    the current method.
(2) Invoke instructions with HasPcRelativeDexCache() generate code
    that reads the PC (call/pop) on x86. However, if the invoke is
    an intrinsic that is later replaced with actual code, this
    PC-reading code may be dead.

Example X86 (before/after):

0x0000108c: 83EC0C      sub esp, 12
0x0000108f: 890424      mov [esp], eax       <-- not needed
0x00001092: E800000000  call +0 (0x00001097)
0x00001097: 58          pop eax              <-- dead code to read PC
0x00001098: F30FB8C1    popcnt eax, ecx
0x0000109c: F30FB8DA    popcnt ebx, edx
0x000010a0: 03D8        add ebx, eax
0x000010a2: 89D8        mov eax, ebx
0x000010a4: 83C40C      add esp, 12          <-- not needed
0x000010a7: C3          ret

0x0000103c: F30FB8C1    popcnt eax, ecx
0x00001040: F30FB8DA    popcnt ebx, edx
0x00001044: 03D8        add ebx, eax
0x00001046: 89D8        mov eax, ebx
0x00001048: C3          ret

Example ARM64 (before/after):

0x0000103c: f81e0fe0      str x0, [sp, #-32]!
0x00001040: f9000ffe      str lr, [sp, #24]
0x00001044: dac01020      clz x0, x1
0x00001048: f9400ffe      ldr lr, [sp, #24]
0x0000104c: 910083ff      add sp, sp, #0x20 (32)
0x00001050: d65f03c0      ret

0x0000103c: dac01020      clz x0, x1
0x00001040: d65f03c0      ret

Change-Id: I8377db80c9a901a08fff4624927cf4a6e585da0c
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 967d156..af50363 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -543,8 +543,16 @@
   DCHECK(CheckTypeConsistency(instruction));
   LocationSummary* locations = instruction->GetLocations();
   if (!instruction->IsSuspendCheckEntry()) {
-    if (locations != nullptr && locations->CanCall()) {
-      MarkNotLeaf();
+    if (locations != nullptr) {
+      if (locations->CanCall()) {
+        MarkNotLeaf();
+      } else if (locations->Intrinsified() &&
+                 instruction->IsInvokeStaticOrDirect() &&
+                 !instruction->AsInvokeStaticOrDirect()->HasCurrentMethodInput()) {
+        // A static method call that has been fully intrinsified, and can neither call on the
+        // slow path nor refer to the current method directly, no longer needs the current method.
+        return;
+      }
     }
     if (instruction->NeedsCurrentMethod()) {
       SetRequiresCurrentMethod();
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 42f22af..4ecd1e6 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -20,7 +20,7 @@
 #include <memory>
 #include <stdint.h>
 
-#ifdef ART_ENABLE_CODEGEN_arm64
+#ifdef ART_ENABLE_CODEGEN_arm
 #include "dex_cache_array_fixups_arm.h"
 #endif
 
@@ -431,6 +431,7 @@
 
 static void RunArchOptimizations(InstructionSet instruction_set,
                                  HGraph* graph,
+                                 CodeGenerator* codegen,
                                  OptimizingCompilerStats* stats,
                                  PassObserver* pass_observer) {
   ArenaAllocator* arena = graph->GetArena();
@@ -466,7 +467,8 @@
 #endif
 #ifdef ART_ENABLE_CODEGEN_x86
     case kX86: {
-      x86::PcRelativeFixups* pc_relative_fixups = new (arena) x86::PcRelativeFixups(graph, stats);
+      x86::PcRelativeFixups* pc_relative_fixups =
+          new (arena) x86::PcRelativeFixups(graph, codegen, stats);
       HOptimization* x86_optimizations[] = {
           pc_relative_fixups
       };
@@ -561,7 +563,7 @@
   };
   RunOptimizations(optimizations2, arraysize(optimizations2), pass_observer);
 
-  RunArchOptimizations(driver->GetInstructionSet(), graph, stats, pass_observer);
+  RunArchOptimizations(driver->GetInstructionSet(), graph, codegen, stats, pass_observer);
   AllocateRegisters(graph, codegen, pass_observer);
 }
 
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index a6f1461..d281a9f 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -16,6 +16,7 @@
 
 #include "pc_relative_fixups_x86.h"
 #include "code_generator_x86.h"
+#include "intrinsics_x86.h"
 
 namespace art {
 namespace x86 {
@@ -25,7 +26,10 @@
  */
 class PCRelativeHandlerVisitor : public HGraphVisitor {
  public:
-  explicit PCRelativeHandlerVisitor(HGraph* graph) : HGraphVisitor(graph), base_(nullptr) {}
+  PCRelativeHandlerVisitor(HGraph* graph, CodeGenerator* codegen)
+      : HGraphVisitor(graph),
+        codegen_(down_cast<CodeGeneratorX86*>(codegen)),
+        base_(nullptr) {}
 
   void MoveBaseIfNeeded() {
     if (base_ != nullptr) {
@@ -146,7 +150,6 @@
     if (base_ != nullptr) {
       return;
     }
-
     // Insert the base at the start of the entry block, move it to a better
     // position later in MoveBaseIfNeeded().
     base_ = new (GetGraph()->GetArena()) HX86ComputeBaseMethodAddress();
@@ -180,7 +183,9 @@
     }
 
     bool base_added = false;
-    if (invoke_static_or_direct != nullptr && invoke_static_or_direct->HasPcRelativeDexCache()) {
+    if (invoke_static_or_direct != nullptr &&
+        invoke_static_or_direct->HasPcRelativeDexCache() &&
+        !WillHaveCallFreeIntrinsicsCodeGen(invoke)) {
       InitializePCRelativeBasePointer();
       // Add the extra parameter base_.
       invoke_static_or_direct->AddSpecialInput(base_);
@@ -215,6 +220,24 @@
     }
   }
 
+  bool WillHaveCallFreeIntrinsicsCodeGen(HInvoke* invoke) {
+    if (invoke->GetIntrinsic() != Intrinsics::kNone) {
+      // This invoke may have intrinsic code generation defined. However, we must
+      // now also determine if this code generation is truly there and call-free
+      // (not unimplemented, no bail on instruction features, or call on slow path).
+      // This is done by actually calling the locations builder on the instruction
+      // and clearing out the locations once result is known. We assume this
+      // call only has creating locations as side effects!
+      IntrinsicLocationsBuilderX86 builder(codegen_);
+      bool success = builder.TryDispatch(invoke) && !invoke->GetLocations()->CanCall();
+      invoke->SetLocations(nullptr);
+      return success;
+    }
+    return false;
+  }
+
+  CodeGeneratorX86* codegen_;
+
   // The generated HX86ComputeBaseMethodAddress in the entry block needed as an
   // input to the HX86LoadFromConstantTable instructions.
   HX86ComputeBaseMethodAddress* base_;
@@ -226,7 +249,7 @@
     // that can be live-in at the irreducible loop header.
     return;
   }
-  PCRelativeHandlerVisitor visitor(graph_);
+  PCRelativeHandlerVisitor visitor(graph_, codegen_);
   visitor.VisitInsertionOrder();
   visitor.MoveBaseIfNeeded();
 }
diff --git a/compiler/optimizing/pc_relative_fixups_x86.h b/compiler/optimizing/pc_relative_fixups_x86.h
index af708ac..03de2fc 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.h
+++ b/compiler/optimizing/pc_relative_fixups_x86.h
@@ -21,14 +21,21 @@
 #include "optimization.h"
 
 namespace art {
+
+class CodeGenerator;
+
 namespace x86 {
 
 class PcRelativeFixups : public HOptimization {
  public:
-  PcRelativeFixups(HGraph* graph, OptimizingCompilerStats* stats)
-      : HOptimization(graph, "pc_relative_fixups_x86", stats) {}
+  PcRelativeFixups(HGraph* graph, CodeGenerator* codegen, OptimizingCompilerStats* stats)
+      : HOptimization(graph, "pc_relative_fixups_x86", stats),
+        codegen_(codegen) {}
 
   void Run() OVERRIDE;
+
+ private:
+  CodeGenerator* codegen_;
 };
 
 }  // namespace x86