MIPS32: Pass more arguments in registers.

Specifically, use A0-A3 and T0-T1 for non-float (integer/reference) arguments
and F8-F19 for float/double arguments in the managed (non-JNI) calling
convention.
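
For illustration (derived from the new register tables in this change, not an
exhaustive spec): A0 still carries the ArtMethod*, core arguments fill
A1, A2, A3, T0, T1, and FP arguments fill F8, F10, F12, F14, F16, F18
(doubles use the even/odd pairs F8/F9 .. F18/F19). Longs must start on an
even/odd pair, so A1-A2 and A3-T0 are never paired; A2-A3 or T0-T1 are used
instead. For example, a static managed method taking (int, long, float, double)
would roughly get:
  int    -> A1
  long   -> A2/A3
  float  -> F8
  double -> F10/F11 (D5)
The JNI calling convention is unchanged (A0-A3 plus F12/F14 / D6/D7).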

Test: booted MIPS32R2 in QEMU
Test: test-art-target-run-test-optimizing (MIPS32R2) on CI20
Test: test-art-target-gtest (MIPS32R2) on CI20
Test: booted MIPS64 (with 2nd arch MIPS32R6) in QEMU
Test: test-art-target-run-test-optimizing (MIPS32R6) in QEMU
Test: test-art-target-gtest (MIPS32R6) in QEMU
Test: test-art-host-gtest

Change-Id: Ib8b0310a109d9f3d70119c1e605e54b013e60728
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index a205800..2710ae9 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -327,7 +327,7 @@
     0xC0, 0xFF, 0xBD, 0x27, 0x3C, 0x00, 0xBF, 0xAF, 0x38, 0x00, 0xBE, 0xAF,
     0x34, 0x00, 0xB7, 0xAF, 0x30, 0x00, 0xB6, 0xAF, 0x2C, 0x00, 0xB5, 0xAF,
     0x28, 0x00, 0xB4, 0xAF, 0x24, 0x00, 0xB3, 0xAF, 0x20, 0x00, 0xB2, 0xAF,
-    0x00, 0x00, 0xA4, 0xAF, 0x44, 0x00, 0xA5, 0xAF, 0x48, 0x00, 0xAC, 0xE7,
+    0x00, 0x00, 0xA4, 0xAF, 0x44, 0x00, 0xA5, 0xAF, 0x48, 0x00, 0xA8, 0xE7,
     0x4C, 0x00, 0xA6, 0xAF, 0x50, 0x00, 0xA7, 0xAF, 0xE0, 0xFF, 0xBD, 0x27,
     0x20, 0x00, 0xBD, 0x27, 0x20, 0x00, 0xB2, 0x8F, 0x24, 0x00, 0xB3, 0x8F,
     0x28, 0x00, 0xB4, 0x8F, 0x2C, 0x00, 0xB5, 0x8F, 0x30, 0x00, 0xB6, 0x8F,
@@ -361,7 +361,7 @@
 // 0x00000024: .cfi_offset: r18 at cfa-32
 // 0x00000024: sw r4, +0(r29)
 // 0x00000028: sw r5, +68(r29)
-// 0x0000002c: swc1 f12, +72(r29)
+// 0x0000002c: swc1 f8, +72(r29)
 // 0x00000030: sw r6, +76(r29)
 // 0x00000034: sw r7, +80(r29)
 // 0x00000038: addiu r29, r29, -32
diff --git a/compiler/jni/quick/mips/calling_convention_mips.cc b/compiler/jni/quick/mips/calling_convention_mips.cc
index e6948ec..0e0716e 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.cc
+++ b/compiler/jni/quick/mips/calling_convention_mips.cc
@@ -23,6 +23,10 @@
 namespace art {
 namespace mips {
 
+//
+// JNI calling convention constants.
+//
+
 // Up to how many float-like (float, double) args can be enregistered in floating-point registers.
 // The rest of the args must go in integer registers or on the stack.
 constexpr size_t kMaxFloatOrDoubleRegisterArguments = 2u;
@@ -30,9 +34,17 @@
 // enregistered. The rest of the args must go on the stack.
 constexpr size_t kMaxIntLikeRegisterArguments = 4u;
 
-static const Register kCoreArgumentRegisters[] = { A0, A1, A2, A3 };
-static const FRegister kFArgumentRegisters[] = { F12, F14 };
-static const DRegister kDArgumentRegisters[] = { D6, D7 };
+static const Register kJniCoreArgumentRegisters[] = { A0, A1, A2, A3 };
+static const FRegister kJniFArgumentRegisters[] = { F12, F14 };
+static const DRegister kJniDArgumentRegisters[] = { D6, D7 };
+
+//
+// Managed calling convention constants.
+//
+
+static const Register kManagedCoreArgumentRegisters[] = { A0, A1, A2, A3, T0, T1 };
+static const FRegister kManagedFArgumentRegisters[] = { F8, F10, F12, F14, F16, F18 };
+static const DRegister kManagedDArgumentRegisters[] = { D4, D5, D6, D7, D8, D9 };
 
 static constexpr ManagedRegister kCalleeSaveRegisters[] = {
     // Core registers.
@@ -133,30 +145,30 @@
     for (ResetIterator(FrameOffset(0)); HasNext(); Next()) {
       if (IsCurrentParamAFloatOrDouble()) {
         if (IsCurrentParamADouble()) {
-          if (fpr_index < arraysize(kDArgumentRegisters)) {
+          if (fpr_index < arraysize(kManagedDArgumentRegisters)) {
             entry_spills_.push_back(
-                MipsManagedRegister::FromDRegister(kDArgumentRegisters[fpr_index++]));
+                MipsManagedRegister::FromDRegister(kManagedDArgumentRegisters[fpr_index++]));
           } else {
             entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
           }
         } else {
-          if (fpr_index < arraysize(kFArgumentRegisters)) {
+          if (fpr_index < arraysize(kManagedFArgumentRegisters)) {
             entry_spills_.push_back(
-                MipsManagedRegister::FromFRegister(kFArgumentRegisters[fpr_index++]));
+                MipsManagedRegister::FromFRegister(kManagedFArgumentRegisters[fpr_index++]));
           } else {
             entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
           }
         }
       } else {
         if (IsCurrentParamALong() && !IsCurrentParamAReference()) {
-          if (gpr_index == 1) {
-            // Don't use a1-a2 as a register pair, move to a2-a3 instead.
+          if (gpr_index == 1 || gpr_index == 3) {
+            // Don't use A1-A2 (or A3-T0) as a register pair; move to A2-A3 (or T0-T1) instead.
             gpr_index++;
           }
-          if (gpr_index < arraysize(kCoreArgumentRegisters) - 1) {
+          if (gpr_index < arraysize(kManagedCoreArgumentRegisters) - 1) {
             entry_spills_.push_back(
-                MipsManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gpr_index++]));
-          } else if (gpr_index == arraysize(kCoreArgumentRegisters) - 1) {
+                MipsManagedRegister::FromCoreRegister(kManagedCoreArgumentRegisters[gpr_index++]));
+          } else if (gpr_index == arraysize(kManagedCoreArgumentRegisters) - 1) {
             gpr_index++;
             entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
           } else {
@@ -164,9 +176,9 @@
           }
         }
 
-        if (gpr_index < arraysize(kCoreArgumentRegisters)) {
+        if (gpr_index < arraysize(kManagedCoreArgumentRegisters)) {
           entry_spills_.push_back(
-            MipsManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gpr_index++]));
+              MipsManagedRegister::FromCoreRegister(kManagedCoreArgumentRegisters[gpr_index++]));
         } else {
           entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
         }
@@ -175,6 +187,7 @@
   }
   return entry_spills_;
 }
+
 // JNI calling convention
 
 MipsJniCallingConvention::MipsJniCallingConvention(bool is_static,
@@ -285,7 +298,7 @@
   //  | FLOAT | INT | DOUBLE  |
   //  |  F12  | A1  | A2 | A3 |
   // (c) first two arguments are floating-point (float, double)
-  //  | FLAOT | (PAD) | DOUBLE |  INT  |
+  //  | FLOAT | (PAD) | DOUBLE |  INT  |
   //  |  F12  |       |  F14   | SP+16 |
   // (d) first two arguments are floating-point (double, float)
   //  | DOUBLE | FLOAT | INT |
@@ -404,9 +417,9 @@
   if (use_fp_arg_registers_ && (itr_args_ < kMaxFloatOrDoubleRegisterArguments)) {
     if (IsCurrentParamAFloatOrDouble()) {
       if (IsCurrentParamADouble()) {
-        return MipsManagedRegister::FromDRegister(kDArgumentRegisters[itr_args_]);
+        return MipsManagedRegister::FromDRegister(kJniDArgumentRegisters[itr_args_]);
       } else {
-        return MipsManagedRegister::FromFRegister(kFArgumentRegisters[itr_args_]);
+        return MipsManagedRegister::FromFRegister(kJniFArgumentRegisters[itr_args_]);
       }
     }
   }
@@ -420,7 +433,7 @@
       return MipsManagedRegister::FromRegisterPair(A2_A3);
     }
   } else {
-    return MipsManagedRegister::FromCoreRegister(kCoreArgumentRegisters[itr_slots_]);
+    return MipsManagedRegister::FromCoreRegister(kJniCoreArgumentRegisters[itr_slots_]);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 8f94834..f0d4910 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -99,8 +99,9 @@
       uint32_t gp_index = gp_index_;
       gp_index_ += 2;
       if (gp_index + 1 < calling_convention.GetNumberOfRegisters()) {
-        if (calling_convention.GetRegisterAt(gp_index) == A1) {
-          gp_index_++;  // Skip A1, and use A2_A3 instead.
+        Register reg = calling_convention.GetRegisterAt(gp_index);
+        if (reg == A1 || reg == A3) {
+          gp_index_++;  // Skip A1 (or A3) and use A2_A3 (or T0_T1) instead.
           gp_index++;
         }
         Register low_even = calling_convention.GetRegisterAt(gp_index);
@@ -5085,9 +5086,9 @@
 
 void LocationsBuilderMIPS::VisitInvokeInterface(HInvokeInterface* invoke) {
   HandleInvoke(invoke);
-  // The register T0 is required to be used for the hidden argument in
+  // The register T7 is required to be used for the hidden argument in
   // art_quick_imt_conflict_trampoline, so add the hidden argument.
-  invoke->GetLocations()->AddTemp(Location::RegisterLocation(T0));
+  invoke->GetLocations()->AddTemp(Location::RegisterLocation(T7));
 }
 
 void InstructionCodeGeneratorMIPS::VisitInvokeInterface(HInvokeInterface* invoke) {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index e225d20..685e4a9 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -31,11 +31,11 @@
 // InvokeDexCallingConvention registers
 
 static constexpr Register kParameterCoreRegisters[] =
-    { A1, A2, A3 };
+    { A1, A2, A3, T0, T1 };
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
 
 static constexpr FRegister kParameterFpuRegisters[] =
-    { F12, F14 };
+    { F8, F10, F12, F14, F16, F18 };
 static constexpr size_t kParameterFpuRegistersLength = arraysize(kParameterFpuRegisters);
 
 
@@ -47,7 +47,7 @@
     arraysize(kRuntimeParameterCoreRegisters);
 
 static constexpr FRegister kRuntimeParameterFpuRegisters[] =
-    { F12, F14};
+    { F12, F14 };
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
 
diff --git a/compiler/optimizing/emit_swap_mips_test.cc b/compiler/optimizing/emit_swap_mips_test.cc
index 9dc53e6..0d4e1c5 100644
--- a/compiler/optimizing/emit_swap_mips_test.cc
+++ b/compiler/optimizing/emit_swap_mips_test.cc
@@ -154,54 +154,54 @@
 TEST_F(EmitSwapMipsTest, TwoFpuRegistersFloat) {
   moves_->AddMove(
       Location::FpuRegisterLocation(4),
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Primitive::kPrimFloat,
       nullptr);
   moves_->AddMove(
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Location::FpuRegisterLocation(4),
       Primitive::kPrimFloat,
       nullptr);
   const char* expected =
-      "mov.s $f8, $f6\n"
-      "mov.s $f6, $f4\n"
-      "mov.s $f4, $f8\n";
+      "mov.s $f6, $f2\n"
+      "mov.s $f2, $f4\n"
+      "mov.s $f4, $f6\n";
   DriverWrapper(moves_, expected, "TwoFpuRegistersFloat");
 }
 
 TEST_F(EmitSwapMipsTest, TwoFpuRegistersDouble) {
   moves_->AddMove(
       Location::FpuRegisterLocation(4),
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Primitive::kPrimDouble,
       nullptr);
   moves_->AddMove(
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Location::FpuRegisterLocation(4),
       Primitive::kPrimDouble,
       nullptr);
   const char* expected =
-      "mov.d $f8, $f6\n"
-      "mov.d $f6, $f4\n"
-      "mov.d $f4, $f8\n";
+      "mov.d $f6, $f2\n"
+      "mov.d $f2, $f4\n"
+      "mov.d $f4, $f6\n";
   DriverWrapper(moves_, expected, "TwoFpuRegistersDouble");
 }
 
 TEST_F(EmitSwapMipsTest, RegisterAndFpuRegister) {
   moves_->AddMove(
       Location::RegisterLocation(4),
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Primitive::kPrimFloat,
       nullptr);
   moves_->AddMove(
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Location::RegisterLocation(4),
       Primitive::kPrimFloat,
       nullptr);
   const char* expected =
       "or $t8, $a0, $zero\n"
-      "mfc1 $a0, $f6\n"
-      "mtc1 $t8, $f6\n";
+      "mfc1 $a0, $f2\n"
+      "mtc1 $t8, $f2\n";
   DriverWrapper(moves_, expected, "RegisterAndFpuRegister");
 }
 
@@ -327,9 +327,9 @@
       Primitive::kPrimFloat,
       nullptr);
   const char* expected =
-      "mov.s $f8, $f4\n"
+      "mov.s $f6, $f4\n"
       "lwc1 $f4, 48($sp)\n"
-      "swc1 $f8, 48($sp)\n";
+      "swc1 $f6, 48($sp)\n";
   DriverWrapper(moves_, expected, "FpuRegisterAndStackSlot");
 }
 
@@ -345,9 +345,9 @@
       Primitive::kPrimDouble,
       nullptr);
   const char* expected =
-      "mov.d $f8, $f4\n"
+      "mov.d $f6, $f4\n"
       "ldc1 $f4, 48($sp)\n"
-      "sdc1 $f8, 48($sp)\n";
+      "sdc1 $f6, 48($sp)\n";
   DriverWrapper(moves_, expected, "FpuRegisterAndDoubleStackSlot");
 }
 
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index b29974c..3dcad6a 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -3252,6 +3252,9 @@
       CHECK_EQ(kMipsDoublewordSize, size) << dst;
       LoadDFromOffset(dst.AsFRegister(), src_register, src_offset);
     }
+  } else if (dst.IsDRegister()) {
+    CHECK_EQ(kMipsDoublewordSize, size) << dst;
+    LoadDFromOffset(dst.AsOverlappingDRegisterLow(), src_register, src_offset);
   }
 }
 
@@ -3396,6 +3399,9 @@
       CHECK_EQ(kMipsDoublewordSize, size);
       StoreDToOffset(src.AsFRegister(), SP, dest.Int32Value());
     }
+  } else if (src.IsDRegister()) {
+    CHECK_EQ(kMipsDoublewordSize, size);
+    StoreDToOffset(src.AsOverlappingDRegisterLow(), SP, dest.Int32Value());
   }
 }
 
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 135b074..7437774 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -21,7 +21,7 @@
 
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVES 96
 #define FRAME_SIZE_SAVE_REFS_ONLY 48
-#define FRAME_SIZE_SAVE_REFS_AND_ARGS 80
+#define FRAME_SIZE_SAVE_REFS_AND_ARGS 112
 #define FRAME_SIZE_SAVE_EVERYTHING 256
 
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_H_
diff --git a/runtime/arch/mips/context_mips.cc b/runtime/arch/mips/context_mips.cc
index 375a03a..98ed5e6 100644
--- a/runtime/arch/mips/context_mips.cc
+++ b/runtime/arch/mips/context_mips.cc
@@ -75,11 +75,21 @@
   gprs_[A1] = nullptr;
   gprs_[A2] = nullptr;
   gprs_[A3] = nullptr;
+  gprs_[T0] = nullptr;
+  gprs_[T1] = nullptr;
 
+  fprs_[F8] = nullptr;
+  fprs_[F9] = nullptr;
+  fprs_[F10] = nullptr;
+  fprs_[F11] = nullptr;
   fprs_[F12] = nullptr;
   fprs_[F13] = nullptr;
   fprs_[F14] = nullptr;
   fprs_[F15] = nullptr;
+  fprs_[F16] = nullptr;
+  fprs_[F17] = nullptr;
+  fprs_[F18] = nullptr;
+  fprs_[F19] = nullptr;
 }
 
 extern "C" NO_RETURN void art_quick_do_long_jump(uint32_t*, uint32_t*);
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 34e34b4..3e8cdc9 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -167,50 +167,60 @@
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs).
-     * callee-save: $a1-$a3, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
+     * callee-save: $a1-$a3, $t0-$t1, $s2-$s8, $gp, $ra, $f8-$f19
+     *              (26 total + 1 word padding + method*)
      */
 .macro SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
-    addiu  $sp, $sp, -80
-    .cfi_adjust_cfa_offset 80
+    addiu  $sp, $sp, -112
+    .cfi_adjust_cfa_offset 112
 
     // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 80)
+#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 112)
 #error "FRAME_SIZE_SAVE_REFS_AND_ARGS(MIPS) size not as expected."
 #endif
 
-    sw     $ra, 76($sp)
-    .cfi_rel_offset 31, 76
-    sw     $s8, 72($sp)
-    .cfi_rel_offset 30, 72
-    sw     $gp, 68($sp)
-    .cfi_rel_offset 28, 68
-    sw     $s7, 64($sp)
-    .cfi_rel_offset 23, 64
-    sw     $s6, 60($sp)
-    .cfi_rel_offset 22, 60
-    sw     $s5, 56($sp)
-    .cfi_rel_offset 21, 56
-    sw     $s4, 52($sp)
-    .cfi_rel_offset 20, 52
-    sw     $s3, 48($sp)
-    .cfi_rel_offset 19, 48
-    sw     $s2, 44($sp)
-    .cfi_rel_offset 18, 44
-    sw     $a3, 40($sp)
-    .cfi_rel_offset 7, 40
-    sw     $a2, 36($sp)
-    .cfi_rel_offset 6, 36
-    sw     $a1, 32($sp)
-    .cfi_rel_offset 5, 32
-    SDu $f14, $f15, 24, $sp, $t0
-    SDu $f12, $f13, 16, $sp, $t0
+    sw     $ra, 108($sp)
+    .cfi_rel_offset 31, 108
+    sw     $s8, 104($sp)
+    .cfi_rel_offset 30, 104
+    sw     $gp, 100($sp)
+    .cfi_rel_offset 28, 100
+    sw     $s7, 96($sp)
+    .cfi_rel_offset 23, 96
+    sw     $s6, 92($sp)
+    .cfi_rel_offset 22, 92
+    sw     $s5, 88($sp)
+    .cfi_rel_offset 21, 88
+    sw     $s4, 84($sp)
+    .cfi_rel_offset 20, 84
+    sw     $s3, 80($sp)
+    .cfi_rel_offset 19, 80
+    sw     $s2, 76($sp)
+    .cfi_rel_offset 18, 76
+    sw     $t1, 72($sp)
+    .cfi_rel_offset 9, 72
+    sw     $t0, 68($sp)
+    .cfi_rel_offset 8, 68
+    sw     $a3, 64($sp)
+    .cfi_rel_offset 7, 64
+    sw     $a2, 60($sp)
+    .cfi_rel_offset 6, 60
+    sw     $a1, 56($sp)
+    .cfi_rel_offset 5, 56
+    SDu $f18, $f19, 48, $sp, $t8
+    SDu $f16, $f17, 40, $sp, $t8
+    SDu $f14, $f15, 32, $sp, $t8
+    SDu $f12, $f13, 24, $sp, $t8
+    SDu $f10, $f11, 16, $sp, $t8
+    SDu $f8, $f9, 8, $sp, $t8
     # bottom will hold Method*
 .endm
 
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs). Restoration assumes non-moving GC.
-     * callee-save: $a1-$a3, $f12-$f15, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
+     * callee-save: $a1-$a3, $t0-$t1, $s2-$s8, $gp, $ra, $f8-$f19
+     *              (26 total + 1 word padding + method*)
      * Clobbers $t0 and $sp
      * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
      * Reserves FRAME_SIZE_SAVE_REFS_AND_ARGS + ARG_SLOT_SIZE bytes on the stack
@@ -229,7 +239,8 @@
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs). Restoration assumes non-moving GC.
-     * callee-save: $a1-$a3, $f12-$f15, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
+     * callee-save: $a1-$a3, $t0-$t1, $s2-$s8, $gp, $ra, $f8-$f19
+     *              (26 total + 1 word padding + method*)
      * Clobbers $sp
      * Use $a0 as the Method* and loads it into bottom of stack.
      * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
@@ -246,34 +257,42 @@
 .macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
     addiu  $sp, $sp, ARG_SLOT_SIZE                # remove argument slots on the stack
     .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
-    lw     $ra, 76($sp)
+    lw     $ra, 108($sp)
     .cfi_restore 31
-    lw     $s8, 72($sp)
+    lw     $s8, 104($sp)
     .cfi_restore 30
-    lw     $gp, 68($sp)
+    lw     $gp, 100($sp)
     .cfi_restore 28
-    lw     $s7, 64($sp)
+    lw     $s7, 96($sp)
     .cfi_restore 23
-    lw     $s6, 60($sp)
+    lw     $s6, 92($sp)
     .cfi_restore 22
-    lw     $s5, 56($sp)
+    lw     $s5, 88($sp)
     .cfi_restore 21
-    lw     $s4, 52($sp)
+    lw     $s4, 84($sp)
     .cfi_restore 20
-    lw     $s3, 48($sp)
+    lw     $s3, 80($sp)
     .cfi_restore 19
-    lw     $s2, 44($sp)
+    lw     $s2, 76($sp)
     .cfi_restore 18
-    lw     $a3, 40($sp)
+    lw     $t1, 72($sp)
+    .cfi_restore 9
+    lw     $t0, 68($sp)
+    .cfi_restore 8
+    lw     $a3, 64($sp)
     .cfi_restore 7
-    lw     $a2, 36($sp)
+    lw     $a2, 60($sp)
     .cfi_restore 6
-    lw     $a1, 32($sp)
+    lw     $a1, 56($sp)
     .cfi_restore 5
-    LDu $f14, $f15, 24, $sp, $t1
-    LDu $f12, $f13, 16, $sp, $t1
-    addiu  $sp, $sp, 80           # pop frame
-    .cfi_adjust_cfa_offset -80
+    LDu $f18, $f19, 48, $sp, $t8
+    LDu $f16, $f17, 40, $sp, $t8
+    LDu $f14, $f15, 32, $sp, $t8
+    LDu $f12, $f13, 24, $sp, $t8
+    LDu $f10, $f11, 16, $sp, $t8
+    LDu $f8, $f9, 8, $sp, $t8
+    addiu  $sp, $sp, 112          # pop frame
+    .cfi_adjust_cfa_offset -112
 .endm
 
     /*
@@ -824,30 +843,56 @@
 INVOKE_TRAMPOLINE art_quick_invoke_super_trampoline_with_access_check, artInvokeSuperTrampolineWithAccessCheck
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
-.macro LOAD_WORD_TO_REG reg, next_arg, index, label
+// Each of the following macros occupies 16 bytes (at most four instructions,
+// padded with .balign). They are used to build indexable "tables" of code.
+
+.macro LOAD_WORD_TO_REG reg, next_arg, index_reg, label
     lw    $\reg, -4($\next_arg)   # next_arg points to argument after the current one (offset is 4)
     b     \label
-    addiu $\index, 1
+    addiu $\index_reg, 16
+    .balign 16
 .endm
 
-.macro LOAD_LONG_TO_REG reg1, reg2, next_arg, index, label
+.macro LOAD_LONG_TO_REG reg1, reg2, next_arg, index_reg, next_index, label
     lw    $\reg1, -8($\next_arg)  # next_arg points to argument after the current one (offset is 8)
     lw    $\reg2, -4($\next_arg)
     b     \label
-    li    $\index, 4              # long can be loaded only to a2_a3 pair so index will be always 4
+    li    $\index_reg, \next_index
+    .balign 16
 .endm
 
-.macro LOAD_FLOAT_TO_REG reg, next_arg, index, label
+.macro LOAD_FLOAT_TO_REG reg, next_arg, index_reg, label
     lwc1  $\reg, -4($\next_arg)   # next_arg points to argument after the current one (offset is 4)
     b     \label
-    addiu $\index, 1
+    addiu $\index_reg, 16
+    .balign 16
 .endm
 
-.macro LOAD_DOUBLE_TO_REG reg1, reg2, next_arg, index, tmp, label
+#if defined(__mips_isa_rev) && __mips_isa_rev > 2
+// LDu expands into 3 instructions for 64-bit FPU, so index_reg cannot be updated here.
+.macro LOAD_DOUBLE_TO_REG reg1, reg2, next_arg, index_reg, tmp, label
+    .set reorder                                # force use of the branch delay slot
     LDu  $\reg1, $\reg2, -8, $\next_arg, $\tmp  # next_arg points to argument after the current one
                                                 # (offset is 8)
     b     \label
-    addiu $\index, 1
+    .set noreorder
+    .balign 16
+.endm
+#else
+// LDu expands into 2 instructions for 32-bit FPU, so index_reg is updated here.
+.macro LOAD_DOUBLE_TO_REG reg1, reg2, next_arg, index_reg, tmp, label
+    LDu  $\reg1, $\reg2, -8, $\next_arg, $\tmp  # next_arg points to argument after the current one
+                                                # (offset is 8)
+    b     \label
+    addiu $\index_reg, 16
+    .balign 16
+.endm
+#endif
+
+.macro LOAD_END index_reg, next_index, label
+    b     \label
+    li    $\index_reg, \next_index
+    .balign 16
 .endm
 
 #define SPILL_SIZE    32
@@ -891,61 +936,63 @@
     lw    $gp, 16($fp)          # restore $gp
     lw    $a0, SPILL_SIZE($fp)  # restore ArtMethod*
     lw    $a1, 4($sp)           # a1 = this*
-    addiu $t0, $sp, 8           # t0 = pointer to the current argument (skip ArtMethod* and this*)
-    li    $t3, 2                # t3 = gpr_index = 2 (skip A0 and A1)
-    move  $t4, $zero            # t4 = fp_index = 0
-    lw    $t1, 20 + SPILL_SIZE($fp)  # get shorty (20 is offset from the $sp on entry + SPILL_SIZE
+    addiu $t8, $sp, 8           # t8 = pointer to the current argument (skip ArtMethod* and this*)
+    li    $t6, 0                # t6 = gpr_index = 0 (corresponds to A2; A0 and A1 are skipped)
+    li    $t7, 0                # t7 = fp_index = 0
+    lw    $t9, 20 + SPILL_SIZE($fp)  # get shorty (20 is offset from the $sp on entry + SPILL_SIZE
                                 # as the $fp is SPILL_SIZE bytes below the $sp on entry)
-    addiu $t1, 1                # t1 = shorty + 1 (skip 1 for return type)
+    addiu $t9, 1                # t9 = shorty + 1 (skip 1 for return type)
+
+    // Load the base addresses of tabInt ... tabDouble.
+    // We will use the register indices (gpr_index, fp_index) to branch.
+    // Note that the indices are scaled by 16, so they can be added to the bases directly.
+#if defined(__mips_isa_rev) && __mips_isa_rev >= 6
+    lapc  $t2, tabInt
+    lapc  $t3, tabLong
+    lapc  $t4, tabSingle
+    lapc  $t5, tabDouble
+#else
+    bltzal $zero, tabBase       # nal
+    addiu $t2, $ra, %lo(tabInt - tabBase)
+tabBase:
+    addiu $t3, $ra, %lo(tabLong - tabBase)
+    addiu $t4, $ra, %lo(tabSingle - tabBase)
+    addiu $t5, $ra, %lo(tabDouble - tabBase)
+#endif
+
 loop:
-    lbu   $t2, 0($t1)           # t2 = shorty[i]
-    beqz  $t2, loopEnd          # finish getting args when shorty[i] == '\0'
-    addiu $t1, 1
+    lbu   $ra, 0($t9)           # ra = shorty[i]
+    beqz  $ra, loopEnd          # finish getting args when shorty[i] == '\0'
+    addiu $t9, 1
 
-    li    $t9, 'J'              # put char 'J' into t9
-    beq   $t9, $t2, isLong      # branch if result type char == 'J'
-    li    $t9, 'D'              # put char 'D' into t9
-    beq   $t9, $t2, isDouble    # branch if result type char == 'D'
-    li    $t9, 'F'              # put char 'F' into t9
-    beq   $t9, $t2, isSingle    # branch if result type char == 'F'
-    addiu $t0, 4                # next_arg = curr_arg + 4 (in branch delay slot,
-                                # for both, int and single)
+    addiu $ra, -'J'
+    beqz  $ra, isLong           # branch if result type char == 'J'
+    addiu $ra, 'J' - 'D'
+    beqz  $ra, isDouble         # branch if result type char == 'D'
+    addiu $ra, 'D' - 'F'
+    beqz  $ra, isSingle         # branch if result type char == 'F'
 
-    li    $t5, 2                                   # skip a0 and a1 (ArtMethod* and this*)
-    bne   $t5, $t3, 1f                             # if (gpr_index == 2)
-    addiu $t5, 1
-    LOAD_WORD_TO_REG a2, t0, t3, loop              #   a2 = current argument, gpr_index++
-1:  bne   $t5, $t3, loop                           # else if (gpr_index == 3)
-    nop
-    LOAD_WORD_TO_REG a3, t0, t3, loop              #   a3 = current argument, gpr_index++
+    addu  $ra, $t2, $t6
+    jalr  $zero, $ra
+    addiu $t8, 4                # next_arg = curr_arg + 4
 
 isLong:
-    addiu $t0, 8                                   # next_arg = curr_arg + 8
-    slti  $t5, $t3, 3
-    beqz  $t5, 2f                                  # if (gpr_index < 3)
-    nop
-    LOAD_LONG_TO_REG a2, a3, t0, t3, loop          #   a2_a3 = curr_arg, gpr_index = 4
-2:  b     loop                                     # else
-    li    $t3, 4                                   #   gpr_index = 4
-
-isDouble:
-    addiu $t0, 8                                   # next_arg = curr_arg + 8
-    li    $t5, 0
-    bne   $t5, $t4, 3f                             # if (fp_index == 0)
-    addiu $t5, 1
-    LOAD_DOUBLE_TO_REG f12, f13, t0, t4, t9, loop  #   f12_f13 = curr_arg, fp_index++
-3:  bne   $t5, $t4, loop                           # else if (fp_index == 1)
-    nop
-    LOAD_DOUBLE_TO_REG f14, f15, t0, t4, t9, loop  #   f14_f15 = curr_arg, fp_index++
+    addu  $ra, $t3, $t6
+    jalr  $zero, $ra
+    addiu $t8, 8                # next_arg = curr_arg + 8
 
 isSingle:
-    li    $t5, 0
-    bne   $t5, $t4, 4f                             # if (fp_index == 0)
-    addiu $t5, 1
-    LOAD_FLOAT_TO_REG f12, t0, t4, loop            #   f12 = curr_arg, fp_index++
-4:  bne   $t5, $t4, loop                           # else if (fp_index == 1)
-    nop
-    LOAD_FLOAT_TO_REG f14, t0, t4, loop            #   f14 = curr_arg, fp_index++
+    addu  $ra, $t4, $t7
+    jalr  $zero, $ra
+    addiu $t8, 4                # next_arg = curr_arg + 4
+
+isDouble:
+    addu  $ra, $t5, $t7
+#if defined(__mips_isa_rev) && __mips_isa_rev > 2
+    addiu $t7, 16               # fp_index += 16 didn't fit into LOAD_DOUBLE_TO_REG
+#endif
+    jalr  $zero, $ra
+    addiu $t8, 8                # next_arg = curr_arg + 8
 
 loopEnd:
     lw    $t9, ART_METHOD_QUICK_CODE_OFFSET_32($a0)  # get pointer to the code
@@ -976,6 +1023,38 @@
     SDu   $f0, $f1, 0, $t0, $t1 # store floating point result
     jalr  $zero, $ra
     nop
+
+    // Note that gpr_index is kept within the range of tabInt and tabLong
+    // and fp_index is kept within the range of tabSingle and tabDouble.
+    .balign 16
+tabInt:
+    LOAD_WORD_TO_REG a2, t8, t6, loop             # a2 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG a3, t8, t6, loop             # a3 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG t0, t8, t6, loop             # t0 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG t1, t8, t6, loop             # t1 = current argument, gpr_index += 16
+    LOAD_END t6, 4*16, loop                       # no more GPR args, gpr_index = 4*16
+tabLong:
+    LOAD_LONG_TO_REG a2, a3, t8, t6, 2*16, loop   # a2_a3 = curr_arg, gpr_index = 2*16
+    LOAD_LONG_TO_REG t0, t1, t8, t6, 4*16, loop   # t0_t1 = curr_arg, gpr_index = 4*16
+    LOAD_LONG_TO_REG t0, t1, t8, t6, 4*16, loop   # t0_t1 = curr_arg, gpr_index = 4*16
+    LOAD_END t6, 4*16, loop                       # no more GPR args, gpr_index = 4*16
+    LOAD_END t6, 4*16, loop                       # no more GPR args, gpr_index = 4*16
+tabSingle:
+    LOAD_FLOAT_TO_REG f8, t8, t7, loop            # f8 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f10, t8, t7, loop           # f10 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f12, t8, t7, loop           # f12 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f14, t8, t7, loop           # f14 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f16, t8, t7, loop           # f16 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f18, t8, t7, loop           # f18 = curr_arg, fp_index += 16
+    LOAD_END t7, 6*16, loop                       # no more FPR args, fp_index = 6*16
+tabDouble:
+    LOAD_DOUBLE_TO_REG f8, f9, t8, t7, ra, loop   # f8_f9 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f10, f11, t8, t7, ra, loop # f10_f11 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f12, f13, t8, t7, ra, loop # f12_f13 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f14, f15, t8, t7, ra, loop # f14_f15 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f16, f17, t8, t7, ra, loop # f16_f17 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f18, f19, t8, t7, ra, loop # f18_f19 = curr_arg; if FPU32, fp_index += 16
+    LOAD_END t7, 6*16, loop                       # no more FPR args, fp_index = 6*16
 END art_quick_invoke_stub
 
     /*
@@ -1016,64 +1095,63 @@
     addiu $sp, $sp, 16          # restore stack after memcpy
     lw    $gp, 16($fp)          # restore $gp
     lw    $a0, SPILL_SIZE($fp)  # restore ArtMethod*
-    addiu $t0, $sp, 4           # t0 = pointer to the current argument (skip ArtMethod*)
-    li    $t3, 1                # t3 = gpr_index = 1 (skip A0)
-    move  $t4, $zero            # t4 = fp_index = 0
-    lw    $t1, 20 + SPILL_SIZE($fp)  # get shorty (20 is offset from the $sp on entry + SPILL_SIZE
+    addiu $t8, $sp, 4           # t8 = pointer to the current argument (skip ArtMethod*)
+    li    $t6, 0                # t6 = gpr_index = 0 (corresponds to A1; A0 is skipped)
+    li    $t7, 0                # t7 = fp_index = 0
+    lw    $t9, 20 + SPILL_SIZE($fp)  # get shorty (20 is offset from the $sp on entry + SPILL_SIZE
                                 # as the $fp is SPILL_SIZE bytes below the $sp on entry)
-    addiu $t1, 1                # t1 = shorty + 1 (skip 1 for return type)
+    addiu $t9, 1                # t9 = shorty + 1 (skip 1 for return type)
+
+    // Load the base addresses of tabIntS ... tabDoubleS.
+    // We will use the register indices (gpr_index, fp_index) to branch.
+    // Note that the indices are scaled by 16, so they can be added to the bases directly.
+#if defined(__mips_isa_rev) && __mips_isa_rev >= 6
+    lapc  $t2, tabIntS
+    lapc  $t3, tabLongS
+    lapc  $t4, tabSingleS
+    lapc  $t5, tabDoubleS
+#else
+    bltzal $zero, tabBaseS      # nal
+    addiu $t2, $ra, %lo(tabIntS - tabBaseS)
+tabBaseS:
+    addiu $t3, $ra, %lo(tabLongS - tabBaseS)
+    addiu $t4, $ra, %lo(tabSingleS - tabBaseS)
+    addiu $t5, $ra, %lo(tabDoubleS - tabBaseS)
+#endif
+
 loopS:
-    lbu   $t2, 0($t1)           # t2 = shorty[i]
-    beqz  $t2, loopEndS         # finish getting args when shorty[i] == '\0'
-    addiu $t1, 1
+    lbu   $ra, 0($t9)           # ra = shorty[i]
+    beqz  $ra, loopEndS         # finish getting args when shorty[i] == '\0'
+    addiu $t9, 1
 
-    li    $t9, 'J'              # put char 'J' into t9
-    beq   $t9, $t2, isLongS     # branch if result type char == 'J'
-    li    $t9, 'D'              # put char 'D' into t9
-    beq   $t9, $t2, isDoubleS   # branch if result type char == 'D'
-    li    $t9, 'F'              # put char 'F' into t9
-    beq   $t9, $t2, isSingleS   # branch if result type char == 'F'
-    addiu $t0, 4                # next_arg = curr_arg + 4 (in branch delay slot,
-                                # for both, int and single)
+    addiu $ra, -'J'
+    beqz  $ra, isLongS          # branch if result type char == 'J'
+    addiu $ra, 'J' - 'D'
+    beqz  $ra, isDoubleS        # branch if result type char == 'D'
+    addiu $ra, 'D' - 'F'
+    beqz  $ra, isSingleS        # branch if result type char == 'F'
 
-    li    $t5, 1                                    # skip a0 (ArtMethod*)
-    bne   $t5, $t3, 1f                              # if (gpr_index == 1)
-    addiu $t5, 1
-    LOAD_WORD_TO_REG a1, t0, t3, loopS              #   a1 = current argument, gpr_index++
-1:  bne   $t5, $t3, 2f                              # else if (gpr_index == 2)
-    addiu $t5, 1
-    LOAD_WORD_TO_REG a2, t0, t3, loopS              #   a2 = current argument, gpr_index++
-2:  bne   $t5, $t3, loopS                           # else if (gpr_index == 3)
-    nop
-    LOAD_WORD_TO_REG a3, t0, t3, loopS              #   a3 = current argument, gpr_index++
+    addu  $ra, $t2, $t6
+    jalr  $zero, $ra
+    addiu $t8, 4                # next_arg = curr_arg + 4
 
 isLongS:
-    addiu $t0, 8                                    # next_arg = curr_arg + 8
-    slti  $t5, $t3, 3
-    beqz  $t5, 3f                                   # if (gpr_index < 3)
-    nop
-    LOAD_LONG_TO_REG a2, a3, t0, t3, loopS          #   a2_a3 = curr_arg, gpr_index = 4
-3:  b     loopS                                     # else
-    li    $t3, 4                                    #   gpr_index = 4
-
-isDoubleS:
-    addiu $t0, 8                                    # next_arg = curr_arg + 8
-    li    $t5, 0
-    bne   $t5, $t4, 4f                              # if (fp_index == 0)
-    addiu $t5, 1
-    LOAD_DOUBLE_TO_REG f12, f13, t0, t4, t9, loopS  #   f12_f13 = curr_arg, fp_index++
-4:  bne   $t5, $t4, loopS                           # else if (fp_index == 1)
-    nop
-    LOAD_DOUBLE_TO_REG f14, f15, t0, t4, t9, loopS  #   f14_f15 = curr_arg, fp_index++
+    addu  $ra, $t3, $t6
+    jalr  $zero, $ra
+    addiu $t8, 8                # next_arg = curr_arg + 8
 
 isSingleS:
-    li    $t5, 0
-    bne   $t5, $t4, 5f                              # if (fp_index == 0)
-    addiu $t5, 1
-    LOAD_FLOAT_TO_REG f12, t0, t4, loopS            #   f12 = curr_arg, fp_index++
-5:  bne   $t5, $t4, loopS                           # else if (fp_index == 1)
-    nop
-    LOAD_FLOAT_TO_REG f14, t0, t4, loopS            #   f14 = curr_arg, fp_index++
+    addu  $ra, $t4, $t7
+    jalr  $zero, $ra
+    addiu $t8, 4                # next_arg = curr_arg + 4
+
+isDoubleS:
+    addu  $ra, $t5, $t7
+#if defined(__mips_isa_rev) && __mips_isa_rev > 2
+    addiu $t7, 16               # fp_index += 16 didn't fit into LOAD_DOUBLE_TO_REG
+#endif
+    jalr  $zero, $ra
+    addiu $t8, 8                # next_arg = curr_arg + 8
 
 loopEndS:
     lw    $t9, ART_METHOD_QUICK_CODE_OFFSET_32($a0)  # get pointer to the code
@@ -1104,6 +1182,40 @@
     SDu   $f0, $f1, 0, $t0, $t1 # store floating point result
     jalr  $zero, $ra
     nop
+
+    // Note that gpr_index is kept within the range of tabIntS and tabLongS
+    // and fp_index is kept within the range of tabSingleS and tabDoubleS.
+    .balign 16
+tabIntS:
+    LOAD_WORD_TO_REG a1, t8, t6, loopS             # a1 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG a2, t8, t6, loopS             # a2 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG a3, t8, t6, loopS             # a3 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG t0, t8, t6, loopS             # t0 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG t1, t8, t6, loopS             # t1 = current argument, gpr_index += 16
+    LOAD_END t6, 5*16, loopS                       # no more GPR args, gpr_index = 5*16
+tabLongS:
+    LOAD_LONG_TO_REG a2, a3, t8, t6, 3*16, loopS   # a2_a3 = curr_arg, gpr_index = 3*16
+    LOAD_LONG_TO_REG a2, a3, t8, t6, 3*16, loopS   # a2_a3 = curr_arg, gpr_index = 3*16
+    LOAD_LONG_TO_REG t0, t1, t8, t6, 5*16, loopS   # t0_t1 = curr_arg, gpr_index = 5*16
+    LOAD_LONG_TO_REG t0, t1, t8, t6, 5*16, loopS   # t0_t1 = curr_arg, gpr_index = 5*16
+    LOAD_END t6, 5*16, loopS                       # no more GPR args, gpr_index = 5*16
+    LOAD_END t6, 5*16, loopS                       # no more GPR args, gpr_index = 5*16
+tabSingleS:
+    LOAD_FLOAT_TO_REG f8, t8, t7, loopS            # f8 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f10, t8, t7, loopS           # f10 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f12, t8, t7, loopS           # f12 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f14, t8, t7, loopS           # f14 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f16, t8, t7, loopS           # f16 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f18, t8, t7, loopS           # f18 = curr_arg, fp_index += 16
+    LOAD_END t7, 6*16, loopS                       # no more FPR args, fp_index = 6*16
+tabDoubleS:
+    LOAD_DOUBLE_TO_REG f8, f9, t8, t7, ra, loopS   # f8_f9 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f10, f11, t8, t7, ra, loopS # f10_f11 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f12, f13, t8, t7, ra, loopS # f12_f13 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f14, f15, t8, t7, ra, loopS # f14_f15 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f16, f17, t8, t7, ra, loopS # f16_f17 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f18, f19, t8, t7, ra, loopS # f18_f19 = curr_arg; if FPU32, fp_index += 16
+    LOAD_END t7, 6*16, loopS                       # no more FPR args, fp_index = 6*16
 END art_quick_invoke_static_stub
 
 #undef SPILL_SIZE
@@ -1886,9 +1998,9 @@
     la      $t9, artQuickProxyInvokeHandler
     jalr    $t9                         # (Method* proxy method, receiver, Thread*, SP)
     addiu   $a3, $sp, ARG_SLOT_SIZE     # pass $sp (remove arg slots)
-    lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
+    lw      $t7, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    bnez    $t0, 1f
+    bnez    $t7, 1f
     # don't care if $v0 and/or $v1 are modified, when exception branch taken
     MTD     $v0, $v1, $f0, $f1          # move float value to return value
     jalr    $zero, $ra
@@ -1900,25 +2012,25 @@
     /*
      * Called to resolve an imt conflict.
      * a0 is the conflict ArtMethod.
-     * t0 is a hidden argument that holds the target interface method's dex method index.
+     * t7 is a hidden argument that holds the target interface method's dex method index.
      *
-     * Note that this stub writes to a0, t0 and t1.
+     * Note that this stub writes to a0, t7 and t8.
      */
 ENTRY art_quick_imt_conflict_trampoline
-    lw      $t1, 0($sp)                                      # Load referrer.
-    lw      $t1, ART_METHOD_DEX_CACHE_METHODS_OFFSET_32($t1) # Load dex cache methods array.
-    sll     $t0, $t0, POINTER_SIZE_SHIFT                     # Calculate offset.
-    addu    $t0, $t1, $t0                                    # Add offset to base.
-    lw      $t0, 0($t0)                                      # Load interface method.
+    lw      $t8, 0($sp)                                      # Load referrer.
+    lw      $t8, ART_METHOD_DEX_CACHE_METHODS_OFFSET_32($t8) # Load dex cache methods array.
+    sll     $t7, $t7, POINTER_SIZE_SHIFT                     # Calculate offset.
+    addu    $t7, $t8, $t7                                    # Add offset to base.
+    lw      $t7, 0($t7)                                      # Load interface method.
     lw      $a0, ART_METHOD_JNI_OFFSET_32($a0)               # Load ImtConflictTable.
 
 .Limt_table_iterate:
-    lw      $t1, 0($a0)                                      # Load next entry in ImtConflictTable.
+    lw      $t8, 0($a0)                                      # Load next entry in ImtConflictTable.
     # Branch if found.
-    beq     $t1, $t0, .Limt_table_found
+    beq     $t8, $t7, .Limt_table_found
     nop
     # If the entry is null, the interface method is not in the ImtConflictTable.
-    beqz    $t1, .Lconflict_trampoline
+    beqz    $t8, .Lconflict_trampoline
     nop
     # Iterate over the entries of the ImtConflictTable.
     b       .Limt_table_iterate
@@ -1928,7 +2040,7 @@
     # We successfully hit an entry in the table. Load the target method and jump to it.
     lw      $a0, __SIZEOF_POINTER__($a0)
     lw      $t9, ART_METHOD_QUICK_CODE_OFFSET_32($a0)
-    jr      $t9
+    jalr    $zero, $t9
     nop
 
 .Lconflict_trampoline:
@@ -1972,7 +2084,7 @@
     # The result of the call is:
     # v0: ptr to native code, 0 on error.
     # v1: ptr to the bottom of the used area of the alloca, can restore stack till here.
-    beq     $v0, $zero, 1f         # check entry error
+    beq     $v0, $zero, 2f         # check entry error
     move    $t9, $v0               # save the code ptr
     move    $sp, $v1               # release part of the alloca
 
@@ -1980,10 +2092,22 @@
     lw      $a0,   0($sp)
     lw      $a1,   4($sp)
     lw      $a2,   8($sp)
-
-    # Load FPRs the same as GPRs. Look at BuildNativeCallFrameStateMachine.
-    jalr    $t9                    # native call
     lw      $a3,  12($sp)
+
+    # artQuickGenericJniTrampoline sets bit 0 of the native code address to 1
+    # when the first two arguments are both single precision floats. This lets
+    # us extract them properly from the stack and load into floating point
+    # registers.
+    MTD     $a0, $a1, $f12, $f13
+    andi    $t0, $t9, 1
+    xor     $t9, $t9, $t0
+    bnez    $t0, 1f
+    mtc1    $a1, $f14
+    MTD     $a2, $a3, $f14, $f15
+
+1:
+    jalr    $t9                    # native call
+    nop
     addiu   $sp, $sp, 16           # remove arg slots
 
     move    $gp, $s3               # restore $gp from $s3
@@ -1999,18 +2123,18 @@
     s.d     $f0, 16($sp)           # pass result_f
 
     lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
-    bne     $t0, $zero, 1f         # check for pending exceptions
+    bne     $t0, $zero, 2f         # check for pending exceptions
 
     move    $sp, $s8               # tear down the alloca
 
-    # tear dpown the callee-save frame
+    # tear down the callee-save frame
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
 
     MTD     $v0, $v1, $f0, $f1     # move float value to return value
     jalr    $zero, $ra
     nop
 
-1:
+2:
     lw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)
     # This will create a new save-all frame, required by the runtime.
     DELIVER_PENDING_EXCEPTION
@@ -2023,9 +2147,9 @@
     la      $t9, artQuickToInterpreterBridge
     jalr    $t9                                 # (Method* method, Thread*, SP)
     addiu   $a2, $sp, ARG_SLOT_SIZE             # pass $sp (remove arg slots)
-    lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
+    lw      $t7, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    bnez    $t0, 1f
+    bnez    $t7, 1f
     # don't care if $v0 and/or $v1 are modified, when exception branch taken
     MTD     $v0, $v1, $f0, $f1                  # move float value to return value
     jalr    $zero, $ra
diff --git a/runtime/arch/mips/quick_method_frame_info_mips.h b/runtime/arch/mips/quick_method_frame_info_mips.h
index 90e7b20..6f16352 100644
--- a/runtime/arch/mips/quick_method_frame_info_mips.h
+++ b/runtime/arch/mips/quick_method_frame_info_mips.h
@@ -26,12 +26,13 @@
 namespace mips {
 
 static constexpr uint32_t kMipsCalleeSaveAlwaysSpills =
-    (1 << art::mips::RA);
+    (1u << art::mips::RA);
 static constexpr uint32_t kMipsCalleeSaveRefSpills =
     (1 << art::mips::S2) | (1 << art::mips::S3) | (1 << art::mips::S4) | (1 << art::mips::S5) |
     (1 << art::mips::S6) | (1 << art::mips::S7) | (1 << art::mips::GP) | (1 << art::mips::FP);
 static constexpr uint32_t kMipsCalleeSaveArgSpills =
-    (1 << art::mips::A1) | (1 << art::mips::A2) | (1 << art::mips::A3);
+    (1 << art::mips::A1) | (1 << art::mips::A2) | (1 << art::mips::A3) | (1 << art::mips::T0) |
+    (1 << art::mips::T1);
 static constexpr uint32_t kMipsCalleeSaveAllSpills =
     (1 << art::mips::S0) | (1 << art::mips::S1);
 static constexpr uint32_t kMipsCalleeSaveEverythingSpills =
@@ -44,11 +45,13 @@
 static constexpr uint32_t kMipsCalleeSaveFpAlwaysSpills = 0;
 static constexpr uint32_t kMipsCalleeSaveFpRefSpills = 0;
 static constexpr uint32_t kMipsCalleeSaveFpArgSpills =
-    (1 << art::mips::F12) | (1 << art::mips::F13) | (1 << art::mips::F14) | (1 << art::mips::F15);
+    (1 << art::mips::F8) | (1 << art::mips::F9) | (1 << art::mips::F10) | (1 << art::mips::F11) |
+    (1 << art::mips::F12) | (1 << art::mips::F13) | (1 << art::mips::F14) | (1 << art::mips::F15) |
+    (1 << art::mips::F16) | (1 << art::mips::F17) | (1 << art::mips::F18) | (1 << art::mips::F19);
 static constexpr uint32_t kMipsCalleeSaveAllFPSpills =
     (1 << art::mips::F20) | (1 << art::mips::F21) | (1 << art::mips::F22) | (1 << art::mips::F23) |
     (1 << art::mips::F24) | (1 << art::mips::F25) | (1 << art::mips::F26) | (1 << art::mips::F27) |
-    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1 << art::mips::F31);
+    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1u << art::mips::F31);
 static constexpr uint32_t kMipsCalleeSaveFpEverythingSpills =
     (1 << art::mips::F0) | (1 << art::mips::F1) | (1 << art::mips::F2) | (1 << art::mips::F3) |
     (1 << art::mips::F4) | (1 << art::mips::F5) | (1 << art::mips::F6) | (1 << art::mips::F7) |
@@ -57,7 +60,7 @@
     (1 << art::mips::F16) | (1 << art::mips::F17) | (1 << art::mips::F18) | (1 << art::mips::F19) |
     (1 << art::mips::F20) | (1 << art::mips::F21) | (1 << art::mips::F22) | (1 << art::mips::F23) |
     (1 << art::mips::F24) | (1 << art::mips::F25) | (1 << art::mips::F26) | (1 << art::mips::F27) |
-    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1 << art::mips::F31);
+    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1u << art::mips::F31);
 
 constexpr uint32_t MipsCalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
   return kMipsCalleeSaveAlwaysSpills | kMipsCalleeSaveRefSpills |
diff --git a/runtime/arch/mips/registers_mips.h b/runtime/arch/mips/registers_mips.h
index ae01bd5..555f3f0 100644
--- a/runtime/arch/mips/registers_mips.h
+++ b/runtime/arch/mips/registers_mips.h
@@ -35,9 +35,9 @@
   A1   =  5,
   A2   =  6,
   A3   =  7,
-  T0   =  8,  // Temporaries.
+  T0   =  8,  // Two extra arguments / temporaries.
   T1   =  9,
-  T2   = 10,
+  T2   = 10,  // Temporaries.
   T3   = 11,
   T4   = 12,
   T5   = 13,
@@ -100,7 +100,7 @@
   F29 = 29,
   F30 = 30,
   F31 = 31,
-  FTMP = F8,  // scratch register
+  FTMP = F6,  // scratch register
   kNumberOfFRegisters = 32,
   kNoFRegister = -1,
 };
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 6665897..9e385f83 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -355,7 +355,7 @@
         "lw $a2, 8($sp)\n\t"
         "lw $t9, 12($sp)\n\t"
         "lw $s1, 16($sp)\n\t"
-        "lw $t0, 20($sp)\n\t"
+        "lw $t7, 20($sp)\n\t"
         "addiu $sp, $sp, 24\n\t"
 
         "jalr $t9\n\t"             // Call the stub.
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index fe82878..bf1d4ea 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -134,13 +134,23 @@
   // | Method*    | ---
   // | RA         |
   // | ...        |    callee saves
+  // | T1         |    arg5
+  // | T0         |    arg4
   // | A3         |    arg3
   // | A2         |    arg2
   // | A1         |    arg1
+  // | F19        |
+  // | F18        |    f_arg5
+  // | F17        |
+  // | F16        |    f_arg4
   // | F15        |
-  // | F14        |    f_arg1
+  // | F14        |    f_arg3
   // | F13        |
-  // | F12        |    f_arg0
+  // | F12        |    f_arg2
+  // | F11        |
+  // | F10        |    f_arg1
+  // | F9         |
+  // | F8         |    f_arg0
   // |            |    padding
   // | A0/Method* |  <- sp
   static constexpr bool kSplitPairAcrossRegisterAndStack = false;
@@ -148,14 +158,14 @@
   static constexpr bool kQuickSoftFloatAbi = false;
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr bool kQuickSkipOddFpRegisters = true;
-  static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumQuickFprArgs = 4;  // 2 arguments passed in FPRs. Floats can be passed
-                                                 // only in even numbered registers and each double
-                                                 // occupies two registers.
+  static constexpr size_t kNumQuickGprArgs = 5;   // 5 arguments passed in GPRs.
+  static constexpr size_t kNumQuickFprArgs = 12;  // 6 arguments passed in FPRs. Floats can be
+                                                  // passed only in even numbered registers and each
+                                                  // double occupies two registers.
   static constexpr bool kGprFprLockstep = false;
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 32;  // Offset of first GPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 76;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 8;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 56;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 108;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
     return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
@@ -187,7 +197,7 @@
   // | F12        |    f_arg0
   // |            |    padding
   // | A0/Method* |  <- sp
-  // NOTE: for Mip64, when A0 is skipped, F0 is also skipped.
+  // NOTE: for MIPS64, when A0 is skipped, F12 is also skipped.
   static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;
@@ -197,7 +207,7 @@
   static constexpr size_t kNumQuickFprArgs = 7;  // 7 arguments passed in FPRs.
   static constexpr bool kGprFprLockstep = true;
 
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 24;  // Offset of first FPR arg (F1).
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 24;  // Offset of first FPR arg (F13).
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80;  // Offset of first GPR arg (A1).
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 200;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
@@ -501,10 +511,16 @@
         case Primitive::kPrimDouble:
         case Primitive::kPrimLong:
           if (kQuickSoftFloatAbi || (cur_type_ == Primitive::kPrimLong)) {
-            if (cur_type_ == Primitive::kPrimLong && kAlignPairRegister && gpr_index_ == 0) {
-              // Currently, this is only for ARM and MIPS, where the first available parameter
-              // register is R1 (on ARM) or A1 (on MIPS). So we skip it, and use R2 (on ARM) or
-              // A2 (on MIPS) instead.
+            if (cur_type_ == Primitive::kPrimLong &&
+#if defined(__mips__) && !defined(__LP64__)
+                (gpr_index_ == 0 || gpr_index_ == 2) &&
+#else
+                gpr_index_ == 0 &&
+#endif
+                kAlignPairRegister) {
+              // Currently, this is only for ARM and MIPS, where we align long parameters with
+              // even-numbered registers by skipping R1 (on ARM) or A1(A3) (on MIPS) and using
+              // R2 (on ARM) or A2(T0) (on MIPS) instead.
               IncGprIndex();
             }
             is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) &&
@@ -2086,6 +2102,41 @@
     // Note that the native code pointer will be automatically set by artFindNativeMethod().
   }
 
+#if defined(__mips__) && !defined(__LP64__)
+  // On MIPS32, if the first two arguments are floating-point, we need to know
+  // their types so that art_quick_generic_jni_trampoline can correctly extract
+  // them from the stack and load them into floating-point registers.
+  // Possible arrangements of first two floating-point arguments on the stack (32-bit FPU
+  // view):
+  // (1)
+  //  |     DOUBLE    |     DOUBLE    | other args, if any
+  //  |  F12  |  F13  |  F14  |  F15  |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (2)
+  //  |     DOUBLE    | FLOAT | (PAD) | other args, if any
+  //  |  F12  |  F13  |  F14  |       |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (3)
+  //  | FLOAT | (PAD) |     DOUBLE    | other args, if any
+  //  |  F12  |       |  F14  |  F15  |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (4)
+  //  | FLOAT | FLOAT | other args, if any
+  //  |  F12  |  F14  |
+  //  |  SP+0 |  SP+4 | SP+8
+  // As you can see, only the last case (4) is special. In all others we can just
+  // load F12/F13 and F14/F15 in the same manner.
+  // Set bit 0 of the native code address to 1 in this case (valid code addresses
+  // are always a multiple of 4 on MIPS32, so we have 2 spare bits available).
+  if (nativeCode != nullptr &&
+      shorty != nullptr &&
+      shorty_len >= 3 &&
+      shorty[1] == 'F' &&
+      shorty[2] == 'F') {
+    nativeCode = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(nativeCode) | 1);
+  }
+#endif
+
   // Return native code addr(lo) and bottom of alloca address(hi).
   return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(visitor.GetBottomOfUsedArea()),
                                 reinterpret_cast<uintptr_t>(nativeCode));
diff --git a/runtime/oat.h b/runtime/oat.h
index 8c84d42..0f4cbbb7 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '9', '2', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '9', '3', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";