Dalvik: Add sdiv support in the JIT

This patch adds hardware integer divide support to the Dalvik JIT.
The SDIV instruction is available on newer ARMv7 CPUs such as the
Cortex-A15 and Cortex-A7.

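When built with __ARM_ARCH_EXT_IDIV__ defined, the following opcodes are
compiled to inline code using the SDIV instruction (the REM variants use
SDIV followed by MLS) instead of calling out to a runtime helper: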

        OP_DIV_INT
        OP_DIV_INT_2ADDR
        OP_REM_INT
        OP_REM_INT_2ADDR
        OP_DIV_INT_LIT16
        OP_DIV_INT_LIT8
        OP_REM_INT_LIT16
        OP_REM_INT_LIT8

Change-Id: I2b2f9f337f13b5c794df951c4929b6ca0ad583c4
Signed-off-by: Serban Constantinescu <serban.constantinescu@arm.com>
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index e159aec..d6d1757 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -613,8 +613,14 @@
                                   [10110000] imm4l[3-0] */
     kThumb2Mla,          /* mla [111110110000] rn[19-16] ra[15-12] rd[7-4]
                                   [0000] rm[3-0] */
+    kThumb2MlsRRRR,      /* mls [111110110000] rn[19-16] ra[15-12] rd[11-8]
+                                  [0001] rm[3-0] */
     kThumb2Umull,        /* umull [111110111010] rn[19-16], rdlo[15-12]
                                   rdhi[11-8] [0000] rm[3-0] */
+    kThumb2SdivRRR,      /* sdiv [111110111001] rn[19-16] [1111] rd[11-8]
+                                  [1111] rm[3-0] */
+    kThumb2UdivRRR,      /* udiv [111110111011] rn[19-16] [1111] rd[11-8]
+                                  [1111] rm[3-0] */
     kThumb2Ldrex,        /* ldrex [111010000101] rn[19-16] rt[11-8] [1111]
                                   imm8[7-0] */
     kThumb2Strex,        /* strex [111010000100] rn[19-16] rt[11-8] rd[11-8]
diff --git a/vm/compiler/codegen/arm/Assemble.cpp b/vm/compiler/codegen/arm/Assemble.cpp
index 10572eb..d470786 100644
--- a/vm/compiler/codegen/arm/Assemble.cpp
+++ b/vm/compiler/codegen/arm/Assemble.cpp
@@ -847,11 +847,26 @@
                  kFmtBitBlt, 15, 12,
                  IS_QUAD_OP | REG_DEF0 | REG_USE1 | REG_USE2 | REG_USE3,
                  "mla", "r!0d, r!1d, r!2d, r!3d", 2),
+    ENCODING_MAP(kThumb2MlsRRRR,  0xfb000010,
+                 kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
+                 kFmtBitBlt, 15, 12,
+                 IS_QUAD_OP | REG_DEF0 | REG_USE1 | REG_USE2 | REG_USE3,
+                 "mls", "r!0d, r!1d, r!2d, r!3d", 2),
     ENCODING_MAP(kThumb2Umull,  0xfba00000,
                  kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16,
                  kFmtBitBlt, 3, 0,
                  IS_QUAD_OP | REG_DEF0 | REG_DEF1 | REG_USE2 | REG_USE3,
                  "umull", "r!0d, r!1d, r!2d, r!3d", 2),
+    ENCODING_MAP(kThumb2SdivRRR,  0xfb90f0f0,
+                 kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
+                 kFmtUnused, -1, -1,
+                 IS_TERTIARY_OP | REG_DEF0_USE12,
+                 "sdiv", "r!0d, r!1d, r!2d", 2),
+    ENCODING_MAP(kThumb2UdivRRR,  0xfbb0f0f0,
+                 kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16, kFmtBitBlt, 3, 0,
+                 kFmtUnused, -1, -1,   /* operands: 0=rd 1=rn 2=rm */
+                 IS_TERTIARY_OP | REG_DEF0 | REG_USE1 | REG_USE2,
+                 "udiv", "r!0d, r!1d, r!2d", 2),
     ENCODING_MAP(kThumb2Ldrex,       0xe8500f00,
                  kFmtBitBlt, 15, 12, kFmtBitBlt, 19, 16, kFmtBitBlt, 7, 0,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE1 | IS_LOAD,
diff --git a/vm/compiler/codegen/arm/CodegenDriver.cpp b/vm/compiler/codegen/arm/CodegenDriver.cpp
index c2c112b..3f00f43 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.cpp
+++ b/vm/compiler/codegen/arm/CodegenDriver.cpp
@@ -783,6 +783,7 @@
     int (*callTgt)(int, int);
     RegLocation rlResult;
     bool shiftOp = false;
+    bool remOp = false;
 
     switch (mir->dalvikInsn.opcode) {
         case OP_NEG_INT:
@@ -807,18 +808,27 @@
             break;
         case OP_DIV_INT:
         case OP_DIV_INT_2ADDR:
+#ifdef __ARM_ARCH_EXT_IDIV__
+            op = kOpDiv;
+#else
             callOut = true;
-            checkZero = true;
             callTgt = __aeabi_idiv;
             retReg = r0;
+#endif
+            checkZero = true;
             break;
         /* NOTE: returns in r1 */
         case OP_REM_INT:
         case OP_REM_INT_2ADDR:
+#ifdef __ARM_ARCH_EXT_IDIV__
+            op = kOpRem;
+            remOp = true;
+#else
             callOut = true;
-            checkZero = true;
             callTgt = __aeabi_idivmod;
             retReg = r1;
+#endif
+            checkZero = true;
             break;
         case OP_AND_INT:
         case OP_AND_INT_2ADDR:
@@ -860,6 +870,11 @@
                      rlSrc1.lowReg);
         } else {
             rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
+#ifdef __ARM_ARCH_EXT_IDIV__
+            if (checkZero) {   /* zero-check the loaded divisor */
+                genNullCheck(cUnit, rlSrc2.sRegLow, rlSrc2.lowReg, mir->offset, NULL);
+            }
+#endif
             if (shiftOp) {
                 int tReg = dvmCompilerAllocTemp(cUnit);
                 opRegRegImm(cUnit, kOpAnd, tReg, rlSrc2.lowReg, 31);
@@ -867,6 +882,14 @@
                 opRegRegReg(cUnit, op, rlResult.lowReg,
                             rlSrc1.lowReg, tReg);
                 dvmCompilerFreeTemp(cUnit, tReg);
+            } else if(remOp) {
+                int tReg = dvmCompilerAllocTemp(cUnit);
+                opRegRegReg(cUnit, kOpDiv, tReg,
+                            rlSrc1.lowReg, rlSrc2.lowReg);
+                rlResult = dvmCompilerEvalLoc(cUnit, rlDest, kCoreReg, true);
+                opRegRegRegReg(cUnit, op, rlResult.lowReg,
+                               rlSrc2.lowReg, tReg, rlSrc1.lowReg);
+                dvmCompilerFreeTemp(cUnit, tReg);
             } else {
                 rlResult = dvmCompilerEvalLoc(cUnit, rlDest, kCoreReg, true);
                 opRegRegReg(cUnit, op, rlResult.lowReg,
@@ -2273,6 +2296,7 @@
     OpKind op = (OpKind)0;      /* Make gcc happy */
     int shiftOp = false;
     bool isDiv = false;
+    bool isRem = false;
 
     switch (dalvikOpcode) {
         case OP_RSUB_INT_LIT8:
@@ -2342,6 +2366,17 @@
             if (handleEasyDivide(cUnit, dalvikOpcode, rlSrc, rlDest, lit)) {
                 return false;
             }
+#ifdef __ARM_ARCH_EXT_IDIV__
+            if ((dalvikOpcode == OP_DIV_INT_LIT8) ||
+                (dalvikOpcode == OP_DIV_INT_LIT16)) {
+                op = kOpDiv;
+            }
+            else {
+                isRem = true;
+                op = kOpRem;
+            }
+            break;
+#endif
             dvmCompilerFlushAllRegs(cUnit);   /* Everything to home location */
             loadValueDirectFixed(cUnit, rlSrc, r0);
             dvmCompilerClobber(cUnit, r0);
@@ -2371,6 +2406,15 @@
     // Avoid shifts by literal 0 - no support in Thumb.  Change to copy
     if (shiftOp && (lit == 0)) {
         genRegCopy(cUnit, rlResult.lowReg, rlSrc.lowReg);
+    } else if(isRem) {
+        int tReg1 = dvmCompilerAllocTemp(cUnit);
+        int tReg2 = dvmCompilerAllocTemp(cUnit);
+
+        loadConstant(cUnit, tReg2, lit);
+        opRegRegReg(cUnit, kOpDiv, tReg1, rlSrc.lowReg, tReg2);
+        opRegRegRegReg(cUnit, op, rlResult.lowReg, tReg2, tReg1, rlSrc.lowReg);
+        dvmCompilerFreeTemp(cUnit, tReg1);
+        dvmCompilerFreeTemp(cUnit, tReg2);
     } else {
         opRegRegImm(cUnit, op, rlResult.lowReg, rlSrc.lowReg, lit);
     }
diff --git a/vm/compiler/codegen/arm/Thumb2/Factory.cpp b/vm/compiler/codegen/arm/Thumb2/Factory.cpp
index b9265e8..cc036cb 100644
--- a/vm/compiler/codegen/arm/Thumb2/Factory.cpp
+++ b/vm/compiler/codegen/arm/Thumb2/Factory.cpp
@@ -352,6 +352,10 @@
             assert(shift == 0);
             opcode = (thumbForm) ? kThumbMul : kThumb2MulRRR;
             break;
+        case kOpDiv:
+            assert(shift == 0);
+            opcode = kThumb2SdivRRR;
+            break;
         case kOpMvn:
             opcode = (thumbForm) ? kThumbMvn : kThumb2MnvRR;
             break;
@@ -454,6 +458,13 @@
             assert(shift == 0);
             opcode = kThumb2MulRRR;
             break;
+        case kOpDiv:
+            assert(shift == 0);
+            opcode = kThumb2SdivRRR;
+            break;
+        case kOpRem:
+            opcode = kThumb2MlsRRRR;
+            break;
         case kOpOr:
             opcode = kThumb2OrrRRR;
             break;
@@ -495,6 +506,12 @@
     return opRegRegRegShift(cUnit, op, rDest, rSrc1, rSrc2, 0);
 }
 
+static ArmLIR *opRegRegRegReg(CompilationUnit *cUnit, OpKind op, int rDest,
+                           int rSrc1, int rSrc2, int rSrc3)
+{   /* rSrc3 is forwarded as opRegRegRegShift's final argument */
+    return opRegRegRegShift(cUnit, op, rDest, rSrc1, rSrc2, rSrc3);
+}
+
 static ArmLIR *opRegRegImm(CompilationUnit *cUnit, OpKind op, int rDest,
                            int rSrc1, int value)
 {
@@ -586,6 +603,10 @@
             modImm = -1;
             altOpcode = kThumb2MulRRR;
             break;
+        case kOpDiv:
+            modImm = -1;
+            altOpcode = kThumb2SdivRRR;
+            break;
         case kOpCmp: {
             int modImm = modifiedImmediate(value);
             ArmLIR *res;