Fix stack overflow for mutual recursion.

Previously the throw path could produce a PC inside the method that
generated the stack overflow. This didn't work because the stack
overflow check happens before the method is stored on the stack, so
the stack overflow handler could receive a PC that wasn't necessarily
in the method at the top of the stack. This is now fixed by always
restoring the link register before branching (rather than
branch-and-linking) to the throw entrypoint.
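
For illustration only, here is a minimal C++ model (hypothetical
Method/Frame types, not ART code) of the invariant the throw
entrypoint relies on: the faulting PC must map to the method in the
top stack frame.

    // Hypothetical sketch, not ART code.
    #include <cassert>

    struct Method { const char* name; };
    struct Frame  { const Method* method; };

    int main() {
      Method caller{"caller"}, callee{"callee"};
      // The overflow check fires in callee's prologue before the
      // prologue stores &callee, so the top frame still names caller.
      Frame top{&caller};

      // Old: a branch-and-link into the throw helper recorded a PC
      // inside callee, mismatching the top frame.
      const Method* method_for_pc_old = &callee;
      // New: LR is restored and a plain branch is used, so the PC is
      // the caller's return address and maps back to caller.
      const Method* method_for_pc_new = &caller;

      assert(method_for_pc_old != top.method);  // the old mismatch
      assert(method_for_pc_new == top.method);  // now consistent
      return 0;
    }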

Slight code size regression on ARM/Mips (unmeasured). On ARM the
regression is 4 bytes of code per stack overflow check; some of it is
mitigated by having one fewer GC safepoint.

Also adds a test case for the StackOverflowError issue (from bdc).

Tests passing: ARM, X86, Mips
Phone booting: ARM

Bug: https://code.google.com/p/android/issues/detail?id=66411
Bug: 12967914
Change-Id: I96fe667799458b58d1f86671e051968f7be78d5d

(cherry-picked from c0f96d03a1855fda7d94332331b94860404874dd)
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index 8b02a42..882a3bb 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -225,6 +225,9 @@
     case kOpBlx:
       opcode = kThumbBlxR;
       break;
+    case kOpBx:
+      opcode = kThumbBx;
+      break;
     default:
       LOG(FATAL) << "Bad opcode " << op;
   }
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 58db984..71cc0d9 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -16,6 +16,7 @@
 
 #include "dex/compiler_ir.h"
 #include "dex/compiler_internals.h"
+#include "dex/quick/arm/arm_lir.h"
 #include "dex/quick/mir_to_lir-inl.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "mirror/array.h"
@@ -627,7 +628,9 @@
     ThreadOffset func_offset(-1);
     int v1 = lab->operands[2];
     int v2 = lab->operands[3];
-    bool target_x86 = (cu_->instruction_set == kX86);
+    const bool target_x86 = cu_->instruction_set == kX86;
+    const bool target_arm = cu_->instruction_set == kArm || cu_->instruction_set == kThumb2;
+    const bool target_mips = cu_->instruction_set == kMips;
     switch (lab->operands[0]) {
       case kThrowNullPointer:
         func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowNullPointer);
@@ -685,21 +688,40 @@
         func_offset =
           QUICK_ENTRYPOINT_OFFSET(pThrowNoSuchMethod);
         break;
-      case kThrowStackOverflow:
+      case kThrowStackOverflow: {
         func_offset = QUICK_ENTRYPOINT_OFFSET(pThrowStackOverflow);
         // Restore stack alignment
+        int r_tgt = 0;
+        const int spill_size = (num_core_spills_ + num_fp_spills_) * 4;
         if (target_x86) {
-          OpRegImm(kOpAdd, TargetReg(kSp), frame_size_);
+          // - 4 to leave the return address on the stack.
+          OpRegImm(kOpAdd, TargetReg(kSp), frame_size_ - 4);
+          ClobberCallerSave();
+        } else if (target_arm) {
+          r_tgt = r12;
+          LoadWordDisp(TargetReg(kSp), spill_size - 4, TargetReg(kLr));
+          OpRegImm(kOpAdd, TargetReg(kSp), spill_size);
+          ClobberCallerSave();
+          LoadWordDisp(rARM_SELF, func_offset.Int32Value(), r_tgt);
         } else {
-          OpRegImm(kOpAdd, TargetReg(kSp), (num_core_spills_ + num_fp_spills_) * 4);
+          DCHECK(target_mips);
+          DCHECK_EQ(num_fp_spills_, 0);  // FP spills currently don't happen on mips.
+          // LR is offset 0 since we push in reverse order.
+          LoadWordDisp(TargetReg(kSp), 0, TargetReg(kLr));
+          OpRegImm(kOpAdd, TargetReg(kSp), spill_size);
+          ClobberCallerSave();
+          r_tgt = CallHelperSetup(func_offset);  // Doesn't clobber LR.
+          DCHECK_NE(r_tgt, TargetReg(kLr));
         }
-        break;
+        CallHelper(r_tgt, func_offset, false /* MarkSafepointPC */, false /* UseLink */);
+        continue;
+      }
       default:
         LOG(FATAL) << "Unexpected throw kind: " << lab->operands[0];
     }
     ClobberCallerSave();
     int r_tgt = CallHelperSetup(func_offset);
-    CallHelper(r_tgt, func_offset, true /* MarkSafepointPC */);
+    CallHelper(r_tgt, func_offset, true /* MarkSafepointPC */, true /* UseLink */);
   }
 }
 
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 4f02fd7..55d50ae 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -62,7 +62,7 @@
 
 /*
  * To save scheduling time, helper calls are broken into two parts: generation of
- * the helper target address, and the actuall call to the helper.  Because x86
+ * the helper target address, and the actual call to the helper.  Because x86
  * has a memory call operation, part 1 is a NOP for x86.  For other targets,
  * load arguments between the two parts.
  */
@@ -71,12 +71,13 @@
 }
 
 /* NOTE: if r_tgt is a temp, it will be freed following use */
-LIR* Mir2Lir::CallHelper(int r_tgt, ThreadOffset helper_offset, bool safepoint_pc) {
+LIR* Mir2Lir::CallHelper(int r_tgt, ThreadOffset helper_offset, bool safepoint_pc, bool use_link) {
   LIR* call_inst;
+  OpKind op = use_link ? kOpBlx : kOpBx;
   if (cu_->instruction_set == kX86) {
-    call_inst = OpThreadMem(kOpBlx, helper_offset);
+    call_inst = OpThreadMem(op, helper_offset);
   } else {
-    call_inst = OpReg(kOpBlx, r_tgt);
+    call_inst = OpReg(op, r_tgt);
     FreeTemp(r_tgt);
   }
   if (safepoint_pc) {
diff --git a/compiler/dex/quick/mips/mips_lir.h b/compiler/dex/quick/mips/mips_lir.h
index 59f442c..77ae337 100644
--- a/compiler/dex/quick/mips/mips_lir.h
+++ b/compiler/dex/quick/mips/mips_lir.h
@@ -138,7 +138,6 @@
 #define r_FRESULT1 r_F1
 
 // Regs not used for Mips.
-#define rMIPS_LR INVALID_REG
 #define rMIPS_PC INVALID_REG
 
 enum MipsResourceEncodingPos {
@@ -268,6 +267,7 @@
 #define rMIPS_RET1 r_RESULT1
 #define rMIPS_INVOKE_TGT r_T9
 #define rMIPS_COUNT INVALID_REG
+#define rMIPS_LR r_RA
 
 // RegisterLocation templates return values (r_V0, or r_V0/r_V1).
 const RegLocation mips_loc_c_return
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 42d7f59..5a1f6cd 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -602,7 +602,7 @@
 
     // Shared by all targets - implemented in gen_invoke.cc.
     int CallHelperSetup(ThreadOffset helper_offset);
-    LIR* CallHelper(int r_tgt, ThreadOffset helper_offset, bool safepoint_pc);
+    LIR* CallHelper(int r_tgt, ThreadOffset helper_offset, bool safepoint_pc, bool use_link = true);
     void CallRuntimeHelperImm(ThreadOffset helper_offset, int arg0, bool safepoint_pc);
     void CallRuntimeHelperReg(ThreadOffset helper_offset, int arg0, bool safepoint_pc);
     void CallRuntimeHelperRegLocation(ThreadOffset helper_offset, RegLocation arg0,
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index f6c8a00d..9cafcee 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -357,6 +357,7 @@
   { kX86Jmp32, kJmp,  IS_UNARY_OP  | IS_BRANCH | NEEDS_FIXUP,               { 0,             0, 0xE9, 0,    0, 0, 0, 0 }, "Jmp32", "!0t" },
   { kX86JmpR,  kJmp,  IS_UNARY_OP  | IS_BRANCH | REG_USE0,                  { 0,             0, 0xFF, 0,    0, 4, 0, 0 }, "JmpR",  "!0r" },
   { kX86Jecxz8, kJmp, NO_OPERAND   | IS_BRANCH | NEEDS_FIXUP | REG_USEC,    { 0,             0, 0xE3, 0,    0, 0, 0, 0 }, "Jecxz", "!0t" },
+  { kX86JmpT,  kJmp,  IS_UNARY_OP  | IS_BRANCH | IS_LOAD,                   { THREAD_PREFIX, 0, 0xFF, 0,    0, 4, 0, 0 }, "JmpT",  "fs:[!0d]" },
   { kX86CallR, kCall, IS_UNARY_OP  | IS_BRANCH | REG_USE0,                  { 0,             0, 0xE8, 0,    0, 0, 0, 0 }, "CallR", "!0r" },
   { kX86CallM, kCall, IS_BINARY_OP | IS_BRANCH | IS_LOAD | REG_USE0,        { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallM", "[!0r+!1d]" },
   { kX86CallA, kCall, IS_QUAD_OP   | IS_BRANCH | IS_LOAD | REG_USE01,       { 0,             0, 0xFF, 0,    0, 2, 0, 0 }, "CallA", "[!0r+!1r<<!2d+!3d]" },
@@ -499,6 +500,8 @@
         return 2;  // opcode + rel8
       } else if (lir->opcode == kX86Jmp32) {
         return 5;  // opcode + rel32
+      } else if (lir->opcode == kX86JmpT) {
+        return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
       } else {
         DCHECK(lir->opcode == kX86JmpR);
         return 2;  // opcode + modrm
@@ -1328,7 +1331,13 @@
         EmitRegRegCond(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
       case kJmp:  // lir operands - 0: rel
-        EmitJmp(entry, lir->operands[0]);
+        if (entry->opcode == kX86JmpT) {
+          // This works since the instruction format for jmp and call is basically the same and
+          // EmitCallThread loads opcode info.
+          EmitCallThread(entry, lir->operands[0]);
+        } else {
+          EmitJmp(entry, lir->operands[0]);
+        }
         break;
       case kJcc:  // lir operands - 0: rel, 1: CC, target assigned
         EmitJcc(entry, lir->operands[0], lir->operands[1]);
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index 577f216..72fc922 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -198,15 +198,15 @@
   LockTemp(rX86_ARG2);
 
   /* Build frame, return address already on stack */
+  // TODO: 64 bit.
   stack_decrement_ = OpRegImm(kOpSub, rX86_SP, frame_size_ - 4);
 
   /*
    * We can safely skip the stack overflow check if we're
    * a leaf *and* our frame size < fudge factor.
    */
-  bool skip_overflow_check = (mir_graph_->MethodIsLeaf() &&
-                (static_cast<size_t>(frame_size_) <
-                Thread::kStackOverflowReservedBytes));
+  const bool skip_overflow_check = (mir_graph_->MethodIsLeaf() &&
+      (static_cast<size_t>(frame_size_) < Thread::kStackOverflowReservedBytes));
   NewLIR0(kPseudoMethodEntry);
   /* Spill core callee saves */
   SpillCoreRegs();
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index d5d6b0e..bd82bf6 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -472,6 +472,7 @@
   X86OpCode opcode = kX86Bkpt;
   switch (op) {
     case kOpBlx: opcode = kX86CallT;  break;
+    case kOpBx: opcode = kX86JmpT;  break;
     default:
       LOG(FATAL) << "Bad opcode: " << op;
       break;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 9fb0044..abe1b3d 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -397,6 +397,8 @@
   kX86Jmp8, kX86Jmp32,  // jmp rel8/32; lir operands - 0: rel, target assigned
   kX86JmpR,             // jmp reg; lir operands - 0: reg
   kX86Jecxz8,           // jcexz rel8; jump relative if ECX is zero.
+  kX86JmpT,             // jmp fs:[disp]; fs: is equal to Thread::Current(); lir operands - 0: disp
+
   kX86CallR,            // call reg; lir operands - 0: reg
   kX86CallM,            // call [base + disp]; lir operands - 0: base, 1: disp
   kX86CallA,            // call [base + index * scale + disp]
diff --git a/runtime/oat.cc b/runtime/oat.cc
index d04514f..f970789 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '1', '8', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '1', '9', '\0' };
 
 OatHeader::OatHeader() {
   memset(this, 0, sizeof(*this));
diff --git a/test/018-stack-overflow/expected.txt b/test/018-stack-overflow/expected.txt
index 7797816..98b45b7 100644
--- a/test/018-stack-overflow/expected.txt
+++ b/test/018-stack-overflow/expected.txt
@@ -1,2 +1,3 @@
-caught SOE
+caught SOE in testSelfRecursion
+caught SOE in testMutualRecursion
 SOE test done
diff --git a/test/018-stack-overflow/src/Main.java b/test/018-stack-overflow/src/Main.java
index f79c269..41adabc 100644
--- a/test/018-stack-overflow/src/Main.java
+++ b/test/018-stack-overflow/src/Main.java
@@ -19,17 +19,46 @@
  */
 public class Main {
     public static void main(String args[]) {
+        testSelfRecursion();
+        testMutualRecursion();
+        System.out.println("SOE test done");
+    }
+
+    private static void testSelfRecursion() {
         try {
             stackOverflowTestSub(0.0, 0.0, 0.0);
         }
         catch (StackOverflowError soe) {
-            System.out.println("caught SOE");
+            System.out.println("caught SOE in testSelfRecursion");
         }
-        System.out.println("SOE test done");
     }
 
-    private static void stackOverflowTestSub(double pad1, double pad2,
-            double pad3) {
+    private static void stackOverflowTestSub(double pad1, double pad2, double pad3) {
         stackOverflowTestSub(pad1, pad2, pad3);
     }
+
+    private static void testMutualRecursion() {
+        try {
+            foo(0.0, 0.0, 0.0);
+        }
+        catch (StackOverflowError soe) {
+            System.out.println("caught SOE in testMutualRecursion");
+        }
+    }
+
+    private static void foo(double pad1, double pad2, double pad3) {
+        bar(pad1, pad2, pad3);
+    }
+
+    private static void bar(double pad1, double pad2, double pad3) {
+        baz(pad1, pad2, pad3);
+    }
+
+    private static void baz(double pad1, double pad2, double pad3) {
+        qux(pad1, pad2, pad3);
+    }
+
+    private static void qux(double pad1, double pad2, double pad3) {
+        foo(pad1, pad2, pad3);
+    }
 }