Suspend check reworking (ready for review)

I hate burning a register, but the cost of suspend checks was just too high
in our current environment.  There are things that can be done in future
releases to avoid the register burn, but for now it's worthwhile.

The general strategy is to reserve r4 as a suspend check counter.
Rather than poll the thread suspendPending counter, we instead simply
decrement the counter register.  When it rolls to zero, we check.  For
now I'm just using the counter scheme on backwards branches - we always
poll on returns (which is already heavyweight enough that the extra cost
isn't especially noticeable).

I've also added an optimization hint to the MIR in case we have enough
time to test and enable the existing loop analysis code that omits the
suspend check on smallish counted loops.

Change-Id: I82d8bad5882a4cf2ccff590942e2d1520d58969d
diff --git a/src/asm_support.h b/src/asm_support.h
index 6eda4bf..097ab7a 100644
--- a/src/asm_support.h
+++ b/src/asm_support.h
@@ -3,9 +3,16 @@
 #ifndef ART_SRC_ASM_SUPPORT_H_
 #define ART_SRC_ASM_SUPPORT_H_
 
+#if defined(__arm__)
+#define rSUSPEND r4
+#define rSELF r9
+#define rLR r14
+#define SUSPEND_CHECK_INTERVAL (1000)
+#endif
+
 #if defined(__i386__)
 // Offset of field Thread::self_ verified in InitCpu
-#define THREAD_SELF_OFFSET 0x161
+#define THREAD_SELF_OFFSET 0x165
 #endif
 
 #endif  // ART_SRC_ASM_SUPPORT_H_
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index b697292..0965c14 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -87,6 +87,7 @@
     kMIRInlined,                        // Invoke is inlined (ie dead)
     kMIRInlinedPred,                    // Invoke is inlined via prediction
     kMIRCallee,                         // Instruction is inlined from callee
+    kMIRIgnoreSuspendCheck,
 } MIROptimizationFlagPositons;
 
 #define MIR_IGNORE_NULL_CHECK           (1 << kMIRIgnoreNullCheck)
@@ -96,6 +97,7 @@
 #define MIR_INLINED                     (1 << kMIRInlined)
 #define MIR_INLINED_PRED                (1 << kMIRInlinedPred)
 #define MIR_CALLEE                      (1 << kMIRCallee)
+#define MIR_IGNORE_SUSPEND_CHECK        (1 << kMIRIgnoreSuspendCheck)
 
 typedef struct CallsiteInfo {
     const char* classDescriptor;
@@ -239,6 +241,7 @@
     GrowableList dfsOrder;
     GrowableList domPostOrderTraversal;
     GrowableList throwLaunchpads;
+    GrowableList suspendLaunchpads;
     ArenaBitVector* tryBlockAddr;
     ArenaBitVector** defBlockMatrix;    // numDalvikRegister x numBlocks
     ArenaBitVector* tempBlockV;
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index fdcce9c..6a01e36 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -725,6 +725,9 @@
     /* Intialize the throwLaunchpads list */
     oatInitGrowableList(&cUnit.throwLaunchpads, 4);
 
+    /* Intialize the suspendLaunchpads list */
+    oatInitGrowableList(&cUnit.suspendLaunchpads, 4);
+
     /* Allocate the bit-vector to track the beginning of basic blocks */
     ArenaBitVector *tryBlockAddr = oatAllocBitVector(cUnit.insnsSize,
                                                      true /* expandable */);
diff --git a/src/compiler/codegen/arm/ArchUtility.cc b/src/compiler/codegen/arm/ArchUtility.cc
index 1d6bb41..45e1b19 100644
--- a/src/compiler/codegen/arm/ArchUtility.cc
+++ b/src/compiler/codegen/arm/ArchUtility.cc
@@ -354,6 +354,9 @@
         case kArmPseudoThrowTarget:
             LOG(INFO) << "LT" << (intptr_t)lir << ":";
             break;
+        case kArmPseudoSuspendTarget:
+            LOG(INFO) << "LS" << (intptr_t)lir << ":";
+            break;
         case kArmPseudoCaseLabel:
             LOG(INFO) << "LC" << (intptr_t)lir << ": Case target 0x" <<
                 std::hex << lir->operands[0] << "|" << std::dec <<
diff --git a/src/compiler/codegen/arm/ArmLIR.h b/src/compiler/codegen/arm/ArmLIR.h
index 07e2e97..e436eea 100644
--- a/src/compiler/codegen/arm/ArmLIR.h
+++ b/src/compiler/codegen/arm/ArmLIR.h
@@ -28,7 +28,7 @@
  *        pointer in r0 as a hidden arg0. Otherwise used as codegen scratch
  *        registers.
  * r0-r1: As in C/C++ r0 is 32-bit return register and r0/r1 is 64-bit
- * r4   : Callee save (promotion target)
+ * r4   : (rSUSPEND) is reserved (suspend check assist)
  * r5   : Callee save (promotion target)
  * r6   : Callee save (promotion target)
  * r7   : Callee save (promotion target)
@@ -243,7 +243,7 @@
 
 /*
  * Annotate special-purpose core registers:
- *   - VM: r4PC, r5FP, and r6SELF
+ *   - VM: r6SELF
  *   - ARM architecture: r13sp, r14lr, and r15pc
  *
  * rPC, rFP, and rSELF are for architecture-independent code to use.
@@ -253,7 +253,7 @@
     r1     = 1,
     r2     = 2,
     r3     = 3,
-    r4     = 4,
+    rSUSPEND = 4,
     r5     = 5,
     r6     = 6,
     r7     = 7,
@@ -366,6 +366,7 @@
  * Assemble.c.
  */
 typedef enum ArmOpcode {
+    kArmPseudoSuspendTarget = -15,
     kArmPseudoThrowTarget = -14,
     kArmPseudoCaseLabel = -13,
     kArmPseudoMethodEntry = -12,
diff --git a/src/compiler/codegen/arm/MethodCodegenDriver.cc b/src/compiler/codegen/arm/MethodCodegenDriver.cc
index ce65803..41053a2 100644
--- a/src/compiler/codegen/arm/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/arm/MethodCodegenDriver.cc
@@ -1113,10 +1113,12 @@
 
         case OP_RETURN:
         case OP_RETURN_OBJECT:
+            genSuspendPoll(cUnit, mir);
             storeValue(cUnit, retLoc, rlSrc[0]);
             break;
 
         case OP_RETURN_WIDE:
+            genSuspendPoll(cUnit, mir);
             rlDest = retLocWide;
             rlDest.fp = rlSrc[0].fp;
             storeValueWide(cUnit, rlDest, rlSrc[0]);
@@ -1277,11 +1279,8 @@
         case OP_GOTO:
         case OP_GOTO_16:
         case OP_GOTO_32:
-            // TUNING: add MIR flag to disable when unnecessary
-            bool backwardBranch;
-            backwardBranch = (bb->taken->startOffset <= mir->offset);
-            if (backwardBranch) {
-                genSuspendPoll(cUnit, mir);
+            if (bb->taken->startOffset <= mir->offset) {
+                genSuspendTest(cUnit, mir);
             }
             genUnconditionalBranch(cUnit, &labelList[bb->taken->id]);
             break;
@@ -1315,7 +1314,7 @@
             ArmConditionCode cond;
             backwardBranch = (bb->taken->startOffset <= mir->offset);
             if (backwardBranch) {
-                genSuspendPoll(cUnit, mir);
+                genSuspendTest(cUnit, mir);
             }
             rlSrc[0] = loadValue(cUnit, rlSrc[0], kCoreReg);
             rlSrc[1] = loadValue(cUnit, rlSrc[1], kCoreReg);
@@ -1358,7 +1357,7 @@
             ArmConditionCode cond;
             backwardBranch = (bb->taken->startOffset <= mir->offset);
             if (backwardBranch) {
-                genSuspendPoll(cUnit, mir);
+                genSuspendTest(cUnit, mir);
             }
             rlSrc[0] = loadValue(cUnit, rlSrc[0], kCoreReg);
             opRegImm(cUnit, kOpCmp, rlSrc[0].lowReg, 0);
@@ -1999,6 +1998,27 @@
     }
 }
 
+static void handleSuspendLaunchpads(CompilationUnit *cUnit)
+{
+    ArmLIR** suspendLabel =
+        (ArmLIR **) cUnit->suspendLaunchpads.elemList;
+    int numElems = cUnit->suspendLaunchpads.numUsed;
+
+    for (int i = 0; i < numElems; i++) {
+        /* TUNING: move suspend count load into helper */
+        ArmLIR* lab = suspendLabel[i];
+        ArmLIR* resumeLab = (ArmLIR*)lab->operands[0];
+        cUnit->currentDalvikOffset = lab->operands[1];
+        oatAppendLIR(cUnit, (LIR *)lab);
+        loadWordDisp(cUnit, rSELF,
+                     OFFSETOF_MEMBER(Thread, pTestSuspendFromCode), rLR);
+        loadWordDisp(cUnit, rSELF,
+            art::Thread::SuspendCountOffset().Int32Value(), rSUSPEND);
+        opReg(cUnit, kOpBlx, rLR);
+        genUnconditionalBranch(cUnit, resumeLab);
+    }
+}
+
 static void handleThrowLaunchpads(CompilationUnit *cUnit)
 {
     ArmLIR** throwLabel =
@@ -2084,9 +2104,11 @@
 
     oatDataFlowAnalysisDispatcher(cUnit, methodBlockCodeGen,
                                   kPreOrderDFSTraversal, false /* Iterative */);
-    removeRedundantBranches(cUnit);
+    handleSuspendLaunchpads(cUnit);
 
     handleThrowLaunchpads(cUnit);
+
+    removeRedundantBranches(cUnit);
 }
 
 /* Common initialization routine for an architecture family */
diff --git a/src/compiler/codegen/arm/Thumb2/Factory.cc b/src/compiler/codegen/arm/Thumb2/Factory.cc
index 254802d..9321753 100644
--- a/src/compiler/codegen/arm/Thumb2/Factory.cc
+++ b/src/compiler/codegen/arm/Thumb2/Factory.cc
@@ -22,9 +22,9 @@
  *
  */
 
-static int coreRegs[] = {r0, r1, r2, r3, r4, r5, r6, r7, rSELF, r8, r10, r11,
-                         r12, rSP, rLR, rPC};
-static int reservedRegs[] = {rSELF, rSP, rLR, rPC};
+static int coreRegs[] = {r0, r1, r2, r3, rSUSPEND, r5, r6, r7, rSELF, r8, r10,
+                         r11, r12, rSP, rLR, rPC};
+static int reservedRegs[] = {rSUSPEND, rSELF, rSP, rLR, rPC};
 static int fpRegs[] = {fr0, fr1, fr2, fr3, fr4, fr5, fr6, fr7,
                        fr8, fr9, fr10, fr11, fr12, fr13, fr14, fr15,
                        fr16, fr17, fr18, fr19, fr20, fr21, fr22, fr23,
diff --git a/src/compiler/codegen/arm/Thumb2/Gen.cc b/src/compiler/codegen/arm/Thumb2/Gen.cc
index 2404ca7..76d8b45 100644
--- a/src/compiler/codegen/arm/Thumb2/Gen.cc
+++ b/src/compiler/codegen/arm/Thumb2/Gen.cc
@@ -1683,9 +1683,31 @@
     return false;
 }
 
+/* Check if we need to check for pending suspend request */
+static void genSuspendTest(CompilationUnit* cUnit, MIR* mir)
+{
+    if (mir->optimizationFlags & MIR_IGNORE_SUSPEND_CHECK) {
+        return;
+    }
+    newLIR2(cUnit, kThumbSubRI8, rSUSPEND, 1);
+    ArmLIR* branch = opCondBranch(cUnit, kArmCondEq);
+    ArmLIR* retLab = newLIR0(cUnit, kArmPseudoTargetLabel);
+    retLab->defMask = ENCODE_ALL;
+    ArmLIR* target = (ArmLIR*)oatNew(sizeof(ArmLIR), true);
+    target->generic.dalvikOffset = cUnit->currentDalvikOffset;
+    target->opcode = kArmPseudoSuspendTarget;
+    target->operands[0] = (intptr_t)retLab;
+    target->operands[1] = mir->offset;
+    branch->generic.target = (LIR*)target;
+    oatInsertGrowableList(&cUnit->suspendLaunchpads, (intptr_t)target);
+}
+
 /* Check for pending suspend request.  */
 static void genSuspendPoll(CompilationUnit* cUnit, MIR* mir)
 {
+    if (mir->optimizationFlags & MIR_IGNORE_SUSPEND_CHECK) {
+        return;
+    }
     oatLockCallTemps(cUnit);   // Explicit register usage
     int rSuspendCount = r1;
     ArmLIR* ld;
diff --git a/src/jni_internal_arm.cc b/src/jni_internal_arm.cc
index 65dc380..8ba5717 100644
--- a/src/jni_internal_arm.cc
+++ b/src/jni_internal_arm.cc
@@ -31,21 +31,21 @@
   UniquePtr<ArmAssembler> assembler(
       down_cast<ArmAssembler*>(Assembler::Create(kArm)));
 #define __ assembler->
-  // Size of frame - spill of R9/LR + Method* + possible receiver + arg array
-  size_t unpadded_frame_size = (3 * kPointerSize) +
+  // Size of frame - spill of R4,R9/LR + Method* + possible receiver + arg array
+  size_t unpadded_frame_size = (4 * kPointerSize) +
                                (method->IsStatic() ? 0 : kPointerSize) +
                                method->NumArgArrayBytes();
   size_t frame_size = RoundUp(unpadded_frame_size, kStackAlignment);
 
-  // Spill R9 and LR
-  RegList save = (1 << R9);
+  // Spill R4,R9 and LR
+  RegList save = (1 << R9) | (1 << R4);
   __ PushList(save | (1 << LR));
 
   // Move the managed thread pointer into R9.
   __ mov(R9, ShifterOperand(R2));
 
-  // Move frame down for arguments less 2 pushed values above
-  __ AddConstant(SP, -frame_size + (2 * kPointerSize));
+  // Move frame down for arguments less 3 pushed values above
+  __ AddConstant(SP, -frame_size + (3 * kPointerSize));
 
   // Can either get 3 or 2 arguments into registers
   size_t reg_bytes = (method->IsStatic() ? 3 : 2) * kPointerSize;
@@ -112,10 +112,10 @@
     }
   }
 
-  // Remove the frame less the spilled R9 and LR
-  __ AddConstant(SP, frame_size - (2 * kPointerSize));
+  // Remove the frame less the spilled R4, R9 and LR
+  __ AddConstant(SP, frame_size - (3 * kPointerSize));
 
-  // Pop R9 and the LR into PC
+  // Pop R4, R9 and the LR into PC
   __ PopList(save | (1 << PC));
   // TODO: store native_entry in the stub table
   ByteArray* code = ByteArray::Alloc(assembler->CodeSize());
diff --git a/src/runtime_support.S b/src/runtime_support.S
index 24883fc..6522243 100644
--- a/src/runtime_support.S
+++ b/src/runtime_support.S
@@ -159,6 +159,23 @@
     mov     r1, r1, lsr r2              @  r1<- r1 >>> r2
     bx      lr
 
+    .balign 4
+    .global art_test_suspend
+    .extern artCheckSuspendFromCode
+art_test_suspend:
+    /*
+     * Check to see if there's a pending suspend request on our thread.
+     * reset rSUSPEND to SUSPEND_CHECK_INTERVAL.
+     * On entry, rSUSPEND holds the suspend request value
+     * [TUNING: move load of suspend check value into this stub.]
+     */
+    cmp    rSUSPEND, #0
+    mov    rSUSPEND, #SUSPEND_CHECK_INTERVAL
+    bxeq   rLR
+    mov    r0, rSELF
+    b      artCheckSuspendFromCode
+
+
 #endif
 
 #if defined(__i386__)
diff --git a/src/runtime_support.h b/src/runtime_support.h
index d421fcc..ed047c9 100644
--- a/src/runtime_support.h
+++ b/src/runtime_support.h
@@ -15,6 +15,7 @@
   extern "C" void art_throw_div_zero_from_code();
   extern "C" void art_throw_array_bounds_from_code(int32_t index, int32_t limit);
   extern "C" void art_invoke_interface_trampoline(void*, void*, void*, void*);
+  extern "C" void art_test_suspend();
 
   /* Conversions */
   extern "C" float __aeabi_i2f(int op1);             // OP_INT_TO_FLOAT
diff --git a/src/thread.cc b/src/thread.cc
index d86a0c5..8ab10af 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -178,7 +178,7 @@
   // TODO: throw and unwind on failure.
 }
 
-void CheckSuspendFromCode(Thread* thread) {
+extern "C" void artCheckSuspendFromCode(Thread* thread) {
   Runtime::Current()->GetThreadList()->FullSuspendCheck(thread);
 }
 
@@ -367,6 +367,7 @@
   pThrowArrayBoundsFromCode = art_throw_array_bounds_from_code;
   pThrowDivZeroFromCode = art_throw_div_zero_from_code;
   pInvokeInterfaceTrampoline = art_invoke_interface_trampoline;
+  pTestSuspendFromCode = art_test_suspend;
 #endif
   pDeliverException = art_deliver_exception;
   pF2l = F2L;
@@ -391,7 +392,7 @@
   pLockObjectFromCode = LockObjectFromCode;
   pUnlockObjectFromCode = UnlockObjectFromCode;
   pFindInstanceFieldFromCode = Field::FindInstanceFieldFromCode;
-  pCheckSuspendFromCode = CheckSuspendFromCode;
+  pCheckSuspendFromCode = artCheckSuspendFromCode;
   pStackOverflowFromCode = StackOverflowFromCode;
   pThrowVerificationErrorFromCode = ThrowVerificationErrorFromCode;
   pThrowNegArraySizeFromCode = ThrowNegArraySizeFromCode;
diff --git a/src/thread.h b/src/thread.h
index 03f3ef6..212abdc 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -224,6 +224,7 @@
   StaticStorageBase* (*pInitializeStaticStorage)(uint32_t, const Method*);
   Field* (*pFindInstanceFieldFromCode)(uint32_t, const Method*);
   void (*pCheckSuspendFromCode)(Thread*);
+  void (*pTestSuspendFromCode)();
   void (*pStackOverflowFromCode)(Method*);
   void (*pThrowNullPointerFromCode)();
   void (*pThrowArrayBoundsFromCode)(int32_t, int32_t);