Small optimization for recursive calls: avoid the dex cache.

When a static or direct invoke targets the method being compiled, branch
directly to the method's own frame entry label instead of loading the
entry point through the dex cache. Also fix an off-by-one in the x86 and
x86-64 encoding of call(Label*), which the direct branch relies on.

Change-Id: I044757a2f06e535cdc1480c4fc8182b89635baf6
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 9c2facb..86567ed 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -613,9 +613,12 @@
     // Sharpening to kDirect only works if we compile PIC.
     DCHECK((optimized_invoke_type == invoke_type) || (optimized_invoke_type != kDirect)
            || compiler_driver_->GetCompilerOptions().GetCompilePic());
+    bool is_recursive =
+        (target_method.dex_method_index == outer_compilation_unit_->GetDexMethodIndex());
+    DCHECK(!is_recursive || (target_method.dex_file == outer_compilation_unit_->GetDexFile()));
     invoke = new (arena_) HInvokeStaticOrDirect(
         arena_, number_of_arguments, return_type, dex_pc, target_method.dex_method_index,
-        optimized_invoke_type);
+        is_recursive, optimized_invoke_type);
   }
 
   size_t start_index = 0;
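
The recursion test above, as a standalone sketch (hypothetical names and
simplified types; in the patch the dex-file equality is only DCHECKed,
since a method-index match across different dex files would indicate a
sharpening bug rather than a recursive call):

    #include <cstdint>

    struct DexFile {};  // stand-in for art::DexFile

    // An invoke is recursive exactly when it targets the method being
    // compiled: same method index within the same dex file (method
    // indices are only meaningful relative to one dex file).
    bool IsRecursiveCall(uint32_t target_method_index,
                         const DexFile* target_dex_file,
                         uint32_t outer_method_index,
                         const DexFile* outer_dex_file) {
      return target_method_index == outer_method_index &&
             target_dex_file == outer_dex_file;
    }
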
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 0fe28e8..7731a10 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -527,6 +527,8 @@
   bool skip_overflow_check =
       IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kArm);
   DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
+  __ Bind(&frame_entry_label_);
+
   if (!skip_overflow_check) {
     __ AddConstant(IP, SP, -static_cast<int32_t>(GetStackOverflowReservedBytes(kArm)));
     __ LoadFromOffset(kLoadWord, IP, IP, 0);
@@ -1185,18 +1187,22 @@
 
   // temp = method;
   codegen_->LoadCurrentMethod(temp);
-  // temp = temp->dex_cache_resolved_methods_;
-  __ LoadFromOffset(
-      kLoadWord, temp, temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value());
-  // temp = temp[index_in_cache]
-  __ LoadFromOffset(
-      kLoadWord, temp, temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex()));
-  // LR = temp[offset_of_quick_compiled_code]
-  __ LoadFromOffset(kLoadWord, LR, temp,
-                     mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-                         kArmWordSize).Int32Value());
-  // LR()
-  __ blx(LR);
+  if (!invoke->IsRecursive()) {
+    // temp = temp->dex_cache_resolved_methods_;
+    __ LoadFromOffset(
+        kLoadWord, temp, temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value());
+    // temp = temp[index_in_cache]
+    __ LoadFromOffset(
+        kLoadWord, temp, temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex()));
+    // LR = temp[offset_of_quick_compiled_code]
+    __ LoadFromOffset(kLoadWord, LR, temp,
+                      mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+                          kArmWordSize).Int32Value());
+    // LR()
+    __ blx(LR);
+  } else {
+    __ bl(codegen_->GetFrameEntryLabel());
+  }
 
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
   DCHECK(!codegen_->IsLeafMethod());
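
The branch target exists because GenerateFrameEntry() now binds
frame_entry_label_ at the method's first instruction, so a recursive
invoke can simply branch-and-link back to it and re-run the stack
overflow probe and frame setup for the new activation. A toy model of
the bind-then-branch-back pattern (not the ART assembler; all names
hypothetical):

    #include <cstdint>
    #include <vector>

    struct Label { int position = -1; };

    struct ToyAssembler {
      std::vector<int32_t> code;  // one slot per "instruction"
      void Bind(Label* label) { label->position = static_cast<int>(code.size()); }
      // Branch to an already-bound label; frame_entry_label_ is always
      // bound before any invoke is visited, so this toy needs no fixup
      // list for forward references.
      void BranchAndLink(const Label& label) {
        // pc-relative displacement from the slot after this instruction
        int32_t delta = label.position - (static_cast<int32_t>(code.size()) + 1);
        code.push_back(delta);  // toy encoding: store the raw displacement
      }
    };

Here Bind() at offset 0 plays the role of GenerateFrameEntry(), and each
recursive call site emits one backward branch to it instead of three
dependent loads plus an indirect blx.
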
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index dd69e4d..4b03dff 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -247,9 +247,12 @@
 
   void ComputeSpillMask() OVERRIDE;
 
+  Label* GetFrameEntryLabel() { return &frame_entry_label_; }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
+  Label frame_entry_label_;
   LocationsBuilderARM location_builder_;
   InstructionCodeGeneratorARM instruction_visitor_;
   ParallelMoveResolverARM move_resolver_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index cc7bf3c..0909424 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -442,6 +442,8 @@
 }
 
 void CodeGeneratorARM64::GenerateFrameEntry() {
+  __ Bind(&frame_entry_label_);
+
   bool do_overflow_check = FrameNeedsStackCheck(GetFrameSize(), kArm64) || !IsLeafMethod();
   if (do_overflow_check) {
     UseScratchRegisterScope temps(GetVIXLAssembler());
@@ -1845,17 +1847,21 @@
   //
   // Currently we implement the app -> app logic, which looks up in the resolve cache.
 
-  // temp = method;
-  LoadCurrentMethod(temp);
-  // temp = temp->dex_cache_resolved_methods_;
-  __ Ldr(temp, HeapOperand(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset()));
-  // temp = temp[index_in_cache];
-  __ Ldr(temp, HeapOperand(temp, index_in_cache));
-  // lr = temp->entry_point_from_quick_compiled_code_;
-  __ Ldr(lr, HeapOperand(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kArm64WordSize)));
-  // lr();
-  __ Blr(lr);
+  if (!invoke->IsRecursive()) {
+    // temp = method;
+    LoadCurrentMethod(temp);
+    // temp = temp->dex_cache_resolved_methods_;
+    __ Ldr(temp, HeapOperand(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset()));
+    // temp = temp[index_in_cache];
+    __ Ldr(temp, HeapOperand(temp, index_in_cache));
+    // lr = temp->entry_point_from_quick_compiled_code_;
+    __ Ldr(lr, HeapOperand(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+        kArm64WordSize)));
+    // lr();
+    __ Blr(lr);
+  } else {
+    __ Bl(&frame_entry_label_);
+  }
 
   RecordPcInfo(invoke, invoke->GetDexPc());
   DCHECK(!IsLeafMethod());
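
On ARM64 the direct Bl is always encodable: the imm26 field of BL holds
a signed word offset, giving a +/-128 MiB reach, and the frame entry is
in the same code blob as the call site. A quick sanity sketch of that
range arithmetic:

    #include <cstdint>

    // BL encodes a signed 26-bit *word* offset: 2^25 words of 4 bytes
    // on either side of the branch.
    constexpr int64_t kBlRangeBytes = (int64_t{1} << 25) * 4;
    static_assert(kBlRangeBytes == 128 * 1024 * 1024, "BL reaches +/-128 MiB");
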
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 100dafe..9a99dcc 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -297,6 +297,7 @@
  private:
   // Labels for each block that will be compiled.
   vixl::Label* block_labels_;
+  vixl::Label frame_entry_label_;
 
   LocationsBuilderARM64 location_builder_;
   InstructionCodeGeneratorARM64 instruction_visitor_;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 66f1d5e..a22c91a 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -449,6 +449,7 @@
         codegen_(codegen) {}
 
 void CodeGeneratorX86::GenerateFrameEntry() {
+  __ Bind(&frame_entry_label_);
   bool skip_overflow_check =
       IsLeafMethod() && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86);
   DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
@@ -1125,13 +1126,17 @@
 
   // temp = method;
   codegen_->LoadCurrentMethod(temp);
-  // temp = temp->dex_cache_resolved_methods_;
-  __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value()));
-  // temp = temp[index_in_cache]
-  __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex())));
-  // (temp + offset_of_quick_compiled_code)()
-  __ call(Address(
-      temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
+  if (!invoke->IsRecursive()) {
+    // temp = temp->dex_cache_resolved_methods_;
+    __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value()));
+    // temp = temp[index_in_cache]
+    __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex())));
+    // (temp + offset_of_quick_compiled_code)()
+    __ call(Address(
+        temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kX86WordSize).Int32Value()));
+  } else {
+    __ call(codegen_->GetFrameEntryLabel());
+  }
 
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 55d71e3..54cb6cd 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -241,9 +241,12 @@
     return type == Primitive::kPrimLong;
   }
 
+  Label* GetFrameEntryLabel() { return &frame_entry_label_; }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
+  Label frame_entry_label_;
   LocationsBuilderX86 location_builder_;
   InstructionCodeGeneratorX86 instruction_visitor_;
   ParallelMoveResolverX86 move_resolver_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 6bc28ff..90b7bda 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -361,13 +361,17 @@
 
   // temp = method;
   LoadCurrentMethod(temp);
-  // temp = temp->dex_cache_resolved_methods_;
-  __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().SizeValue()));
-  // temp = temp[index_in_cache]
-  __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex())));
-  // (temp + offset_of_quick_compiled_code)()
-  __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
-      kX86_64WordSize).SizeValue()));
+  if (!invoke->IsRecursive()) {
+    // temp = temp->dex_cache_resolved_methods_;
+    __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().SizeValue()));
+    // temp = temp[index_in_cache]
+    __ movl(temp, Address(temp, CodeGenerator::GetCacheOffset(invoke->GetDexMethodIndex())));
+    // (temp + offset_of_quick_compiled_code)()
+    __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(
+        kX86_64WordSize).SizeValue()));
+  } else {
+    __ call(&frame_entry_label_);
+  }
 
   DCHECK(!IsLeafMethod());
   RecordPcInfo(invoke, invoke->GetDexPc());
@@ -472,6 +476,7 @@
 }
 
 void CodeGeneratorX86_64::GenerateFrameEntry() {
+  __ Bind(&frame_entry_label_);
   bool skip_overflow_check = IsLeafMethod()
       && !FrameNeedsStackCheck(GetFrameSize(), InstructionSet::kX86_64);
   DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index c30f4c2..dbdbf86 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -248,6 +248,7 @@
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
+  Label frame_entry_label_;
   LocationsBuilderX86_64 location_builder_;
   InstructionCodeGeneratorX86_64 instruction_visitor_;
   ParallelMoveResolverX86_64 move_resolver_;
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 2cc021c..1a0ebe5 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1711,9 +1711,11 @@
                         Primitive::Type return_type,
                         uint32_t dex_pc,
                         uint32_t dex_method_index,
+                        bool is_recursive,
                         InvokeType invoke_type)
       : HInvoke(arena, number_of_arguments, return_type, dex_pc, dex_method_index),
-        invoke_type_(invoke_type) {}
+        invoke_type_(invoke_type),
+        is_recursive_(is_recursive) {}
 
   bool CanDoImplicitNullCheck() const OVERRIDE {
     // We access the method via the dex cache so we can't do an implicit null check.
@@ -1722,11 +1724,13 @@
   }
 
   InvokeType GetInvokeType() const { return invoke_type_; }
+  bool IsRecursive() const { return is_recursive_; }
 
   DECLARE_INSTRUCTION(InvokeStaticOrDirect);
 
  private:
   const InvokeType invoke_type_;
+  const bool is_recursive_;
 
   DISALLOW_COPY_AND_ASSIGN(HInvokeStaticOrDirect);
 };
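
The flag is computed once in the builder, where the outer compilation
unit is available, and stored immutably on the node; the four backends
then only read IsRecursive(). A compressed sketch of that split
(simplified, hypothetical types):

    #include <cstdint>

    class Invoke {
     public:
      Invoke(uint32_t dex_method_index, bool is_recursive)
          : dex_method_index_(dex_method_index), is_recursive_(is_recursive) {}
      bool IsRecursive() const { return is_recursive_; }
      uint32_t GetDexMethodIndex() const { return dex_method_index_; }
     private:
      const uint32_t dex_method_index_;
      const bool is_recursive_;  // decided at graph-building time
    };

    // What every code generator above boils down to:
    void GenerateCall(const Invoke& invoke) {
      if (!invoke.IsRecursive()) {
        // three loads through the dex cache, then an indirect call
      } else {
        // one direct branch to the frame entry label
      }
    }
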
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 1f0dba5..03744e4 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -51,7 +51,8 @@
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xE8);
   static const int kSize = 5;
-  EmitLabel(label, kSize);
+  // Offset by one because the opcode has already been emitted.
+  EmitLabel(label, kSize - 1);
 }
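
The off-by-one: call rel32 is five bytes (0xE8 plus a 32-bit
displacement), and the displacement is relative to the *end* of the
instruction. For a bound label, EmitLabel() subtracts its size argument
from label->Position() - buffer_.Size(), but by the time it runs the
opcode byte is already in the buffer, so only four of the five bytes
remain to be accounted for. A worked example with hypothetical offsets:

    #include <cassert>
    #include <cstdint>

    int main() {
      const int32_t label_position = 0;    // label bound at buffer offset 0
      const int32_t call_start     = 100;  // first byte of the call (0xE8)
      const int32_t kSize          = 5;

      // What the CPU needs: target relative to the next instruction.
      const int32_t wanted = label_position - (call_start + kSize);   // -105

      // What EmitLabel sees: the opcode is already emitted, so the
      // buffer is one byte past call_start.
      const int32_t offset = label_position - (call_start + 1);       // -101

      assert(offset - (kSize - 1) == wanted);  // the fix: subtract 4
      assert(offset - kSize == wanted - 1);    // old code: one byte short
      return 0;
    }

Without the fix, the recursive __ call(Label*) emitted above would land
one byte before the frame entry. The identical change in
assembler_x86_64.cc below follows the same arithmetic.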
 
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 5afa603..7e8e769 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -57,7 +57,8 @@
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xE8);
   static const int kSize = 5;
-  EmitLabel(label, kSize);
+  // Offset by one because the opcode has already been emitted.
+  EmitLabel(label, kSize - 1);
 }
 
 void X86_64Assembler::pushq(CpuRegister reg) {