ART: Implement hard float for X86

Use XMM0-XMM3 as parameter registers for float/double on X86.  X86_64
already uses XMM0-XMM7 for parameters.
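
For illustration only (the shorty "VFJD" is a made-up example): with this
change, calling an instance method taking a float, a long and a double
loads registers roughly as follows, while art_quick_invoke_stub still
copies the whole argument array onto the stack as before:

  EAX     = ArtMethod* of the callee
  ECX     = this
  XMM0    = float argument (F)
  EDX/EBX = long argument (J; low half in EDX, high half in EBX)
  XMM1    = double argument (D)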

Change the 'hidden' argument register from XMM0 to XMM7 to avoid a
conflict with the new floating point argument registers.

This change was requested to simplify the Optimizing compiler
implementation.

Change-Id: I89ba8ade99b9a8a5b1ad1ee5f5cbfd33d656bfaa
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index c7d83dd..b7fa2d2 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -39,22 +39,15 @@
     }
    protected:
     Mir2Lir* m2l_;
-   private:
     size_t cur_core_reg_;
     size_t cur_fp_reg_;
   };
 
-  class InToRegStorageX86Mapper : public InToRegStorageMapper {
+  class InToRegStorageX86Mapper : public InToRegStorageX86_64Mapper {
    public:
-    explicit InToRegStorageX86Mapper(Mir2Lir* m2l) : m2l_(m2l), cur_core_reg_(0) {}
+    explicit InToRegStorageX86Mapper(Mir2Lir* m2l)
+        : InToRegStorageX86_64Mapper(m2l) { }
     virtual RegStorage GetNextReg(ShortyArg arg);
-    virtual void Reset() OVERRIDE {
-      cur_core_reg_ = 0;
-    }
-   protected:
-    Mir2Lir* m2l_;
-   private:
-    size_t cur_core_reg_;
   };
 
   InToRegStorageX86_64Mapper in_to_reg_storage_x86_64_mapper_;
@@ -118,9 +111,12 @@
       if (cu_->target64) {
         return As64BitReg(TargetReg32(symbolic_reg));
       } else {
+        if (symbolic_reg >= kFArg0 && symbolic_reg <= kFArg3) {
+          // We want an XMM, not a pair.
+          return As64BitReg(TargetReg32(symbolic_reg));
+        }
         // x86: construct a pair.
         DCHECK((kArg0 <= symbolic_reg && symbolic_reg < kArg3) ||
-               (kFArg0 <= symbolic_reg && symbolic_reg < kFArg3) ||
                (kRet0 == symbolic_reg));
         return RegStorage::MakeRegPair(TargetReg32(symbolic_reg),
                                  TargetReg32(static_cast<SpecialTargetRegister>(symbolic_reg + 1)));
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 142acbc..bfa24cc 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -177,10 +177,10 @@
   RegStorage::InvalidReg(),  // kArg5
   RegStorage::InvalidReg(),  // kArg6
   RegStorage::InvalidReg(),  // kArg7
-  rs_rAX,                    // kFArg0
-  rs_rCX,                    // kFArg1
-  rs_rDX,                    // kFArg2
-  rs_rBX,                    // kFArg3
+  rs_fr0,                    // kFArg0
+  rs_fr1,                    // kFArg1
+  rs_fr2,                    // kFArg2
+  rs_fr3,                    // kFArg3
   RegStorage::InvalidReg(),  // kFArg4
   RegStorage::InvalidReg(),  // kFArg5
   RegStorage::InvalidReg(),  // kFArg6
@@ -197,7 +197,7 @@
   rs_rDX,                    // kRet1
   rs_rAX,                    // kInvokeTgt
   rs_rAX,                    // kHiddenArg - used to hold the method index before copying to fr0.
-  rs_fr0,                    // kHiddenFpArg
+  rs_fr7,                    // kHiddenFpArg
   rs_rCX,                    // kCount
 };
 
@@ -542,13 +542,13 @@
   LockTemp(TargetReg32(kArg1));
   LockTemp(TargetReg32(kArg2));
   LockTemp(TargetReg32(kArg3));
+  LockTemp(TargetReg32(kFArg0));
+  LockTemp(TargetReg32(kFArg1));
+  LockTemp(TargetReg32(kFArg2));
+  LockTemp(TargetReg32(kFArg3));
   if (cu_->target64) {
     LockTemp(TargetReg32(kArg4));
     LockTemp(TargetReg32(kArg5));
-    LockTemp(TargetReg32(kFArg0));
-    LockTemp(TargetReg32(kFArg1));
-    LockTemp(TargetReg32(kFArg2));
-    LockTemp(TargetReg32(kFArg3));
     LockTemp(TargetReg32(kFArg4));
     LockTemp(TargetReg32(kFArg5));
     LockTemp(TargetReg32(kFArg6));
@@ -563,13 +563,13 @@
   FreeTemp(TargetReg32(kArg2));
   FreeTemp(TargetReg32(kArg3));
   FreeTemp(TargetReg32(kHiddenArg));
+  FreeTemp(TargetReg32(kFArg0));
+  FreeTemp(TargetReg32(kFArg1));
+  FreeTemp(TargetReg32(kFArg2));
+  FreeTemp(TargetReg32(kFArg3));
   if (cu_->target64) {
     FreeTemp(TargetReg32(kArg4));
     FreeTemp(TargetReg32(kArg5));
-    FreeTemp(TargetReg32(kFArg0));
-    FreeTemp(TargetReg32(kFArg1));
-    FreeTemp(TargetReg32(kFArg2));
-    FreeTemp(TargetReg32(kFArg3));
     FreeTemp(TargetReg32(kFArg4));
     FreeTemp(TargetReg32(kFArg5));
     FreeTemp(TargetReg32(kFArg6));
@@ -2457,14 +2457,23 @@
 RegStorage X86Mir2Lir::InToRegStorageX86Mapper::GetNextReg(ShortyArg arg) {
   const SpecialTargetRegister coreArgMappingToPhysicalReg[] = {kArg1, kArg2, kArg3};
   const size_t coreArgMappingToPhysicalRegSize = arraysize(coreArgMappingToPhysicalReg);
+  const SpecialTargetRegister fpArgMappingToPhysicalReg[] = {kFArg0, kFArg1, kFArg2, kFArg3};
+  const size_t fpArgMappingToPhysicalRegSize = arraysize(fpArgMappingToPhysicalReg);
 
   RegStorage result = RegStorage::InvalidReg();
-  if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-    result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
-                          arg.IsRef() ? kRef : kNotWide);
-    if (arg.IsWide() && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-      result = RegStorage::MakeRegPair(
-          result, m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], kNotWide));
+  if (arg.IsFP()) {
+    if (cur_fp_reg_ < fpArgMappingToPhysicalRegSize) {
+      return m2l_->TargetReg(fpArgMappingToPhysicalReg[cur_fp_reg_++],
+                             arg.IsWide() ? kWide : kNotWide);
+    }
+  } else {
+    if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+      result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
+                               arg.IsRef() ? kRef : kNotWide);
+      if (arg.IsWide() && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+        result = RegStorage::MakeRegPair(
+            result, m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], kNotWide));
+      }
     }
   }
   return result;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 3e0a852..b48c4ad 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -56,15 +56,15 @@
  * x86-64/x32 gs: holds it.
  *
  * For floating point we don't support CPUs without SSE2 support (ie newer than PIII):
- *  Native: x86  | x86-64 / x32 | ART x86                    | ART x86-64
- *  XMM0: caller | caller, arg1 | caller, float return value | caller, arg1, float return value
- *  XMM1: caller | caller, arg2 | caller, scratch            | caller, arg2, scratch
- *  XMM2: caller | caller, arg3 | caller, scratch            | caller, arg3, scratch
- *  XMM3: caller | caller, arg4 | caller, scratch            | caller, arg4, scratch
- *  XMM4: caller | caller, arg5 | caller, scratch            | caller, arg5, scratch
- *  XMM5: caller | caller, arg6 | caller, scratch            | caller, arg6, scratch
- *  XMM6: caller | caller, arg7 | caller, scratch            | caller, arg7, scratch
- *  XMM7: caller | caller, arg8 | caller, scratch            | caller, arg8, scratch
+ *  Native: x86  | x86-64 / x32 | ART x86                          | ART x86-64
+ *  XMM0: caller | caller, arg1 | caller, arg1, float return value | caller, arg1, float return value
+ *  XMM1: caller | caller, arg2 | caller, arg2, scratch            | caller, arg2, scratch
+ *  XMM2: caller | caller, arg3 | caller, arg3, scratch            | caller, arg3, scratch
+ *  XMM3: caller | caller, arg4 | caller, arg4, scratch            | caller, arg4, scratch
+ *  XMM4: caller | caller, arg5 | caller, scratch                  | caller, arg5, scratch
+ *  XMM5: caller | caller, arg6 | caller, scratch                  | caller, arg6, scratch
+ *  XMM6: caller | caller, arg7 | caller, scratch                  | caller, arg7, scratch
+ *  XMM7: caller | caller, arg8 | caller, scratch                  | caller, arg8, scratch
  *  ---  x86-64/x32 registers
  *  XMM8 .. 11: caller save available as scratch registers for ART.
  *  XMM12 .. 15: callee save available as promoted registers for ART.
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index 9bf7d0f..b400f04 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -76,12 +76,34 @@
 }
 
 bool X86ManagedRuntimeCallingConvention::IsCurrentParamOnStack() {
-  return true;  // Everything is passed by stack
+  // We assume all parameters are on the stack; args coming in registers are spilled as entry_spills.
+  return true;
 }
 
 ManagedRegister X86ManagedRuntimeCallingConvention::CurrentParamRegister() {
-  LOG(FATAL) << "Should not reach here";
-  return ManagedRegister::NoRegister();
+  ManagedRegister res = ManagedRegister::NoRegister();
+  if (!IsCurrentParamAFloatOrDouble()) {
+    switch (itr_args_ + high_long_regs_used_ - itr_float_and_doubles_) {
+    case 0: res = X86ManagedRegister::FromCpuRegister(ECX); break;
+    case 1: res = X86ManagedRegister::FromCpuRegister(EDX); break;
+    case 2: res = X86ManagedRegister::FromCpuRegister(EBX); break;
+    }
+  } else if (itr_float_and_doubles_ < 4) {
+    // The first four float/double parameters are passed in XMM0..XMM3.
+    res = X86ManagedRegister::FromXmmRegister(
+                                 static_cast<XmmRegister>(XMM0 + itr_float_and_doubles_));
+  }
+  return res;
+}
+
+ManagedRegister X86ManagedRuntimeCallingConvention::CurrentParamHighLongRegister() {
+  ManagedRegister res = ManagedRegister::NoRegister();
+  DCHECK(IsCurrentParamALong());
+  switch (itr_args_ + high_long_regs_used_ - itr_float_and_doubles_) {
+  case 0: res = X86ManagedRegister::FromCpuRegister(EDX); break;
+  case 1: res = X86ManagedRegister::FromCpuRegister(EBX); break;
+  }
+  return res;
 }
 
 FrameOffset X86ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
@@ -94,15 +116,26 @@
   // We spill the argument registers on X86 to free them up for scratch use, we then assume
   // all arguments are on the stack.
   if (entry_spills_.size() == 0) {
-    size_t num_spills = NumArgs() + NumLongOrDoubleArgs();
-    if (num_spills > 0) {
-      entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(ECX));
-      if (num_spills > 1) {
-        entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EDX));
-        if (num_spills > 2) {
-          entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EBX));
+    ResetIterator(FrameOffset(0));
+    while (HasNext()) {
+      ManagedRegister in_reg = CurrentParamRegister();
+      if (!in_reg.IsNoRegister()) {
+        int32_t size = IsParamADouble(itr_args_) ? 8 : 4;
+        int32_t spill_offset = CurrentParamStackOffset().Uint32Value();
+        ManagedRegisterSpill spill(in_reg, size, spill_offset);
+        entry_spills_.push_back(spill);
+        if (IsCurrentParamALong() && !IsCurrentParamAReference()) {  // Long.
+          // Special case: we may need a second register for the high half of the long.
+          in_reg = CurrentParamHighLongRegister();
+          if (!in_reg.IsNoRegister()) {
+            // We have to spill the second half of the long.
+            ManagedRegisterSpill spill2(in_reg, size, spill_offset + 4);
+            entry_spills_.push_back(spill2);
+            high_long_regs_used_++;
+          }
         }
       }
+      Next();
     }
   }
   return entry_spills_;
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index 025eb6d..db34ea9 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -28,7 +28,8 @@
  public:
   explicit X86ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized,
                                               const char* shorty)
-      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
+      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize),
+        high_long_regs_used_(0) {}
   ~X86ManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
@@ -40,7 +41,10 @@
   ManagedRegister CurrentParamRegister() OVERRIDE;
   FrameOffset CurrentParamStackOffset() OVERRIDE;
   const ManagedRegisterEntrySpills& EntrySpills() OVERRIDE;
+
  private:
+  int high_long_regs_used_;
+  ManagedRegister CurrentParamHighLongRegister();
   ManagedRegisterEntrySpills entry_spills_;
   DISALLOW_COPY_AND_ASSIGN(X86ManagedRuntimeCallingConvention);
 };
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 83584a2..d66d773 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1501,8 +1501,12 @@
 
   uint32_t reg_offset = 1;
   CHECK_ALIGNED(frame_size, kStackAlignment);
+  int gpr_count = 0;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
-    pushl(spill_regs.at(i).AsX86().AsCpuRegister());
+    x86::X86ManagedRegister spill = spill_regs.at(i).AsX86();
+    DCHECK(spill.IsCpuRegister());
+    pushl(spill.AsCpuRegister());
+    gpr_count++;
 
     // DW_CFA_advance_loc
     DW_CFA_advance_loc(&cfi_info_, buffer_.Size() - cfi_pc_);
@@ -1516,7 +1520,7 @@
   }
 
   // return address then method on stack
-  int32_t adjust = frame_size - (spill_regs.size() * kFramePointerSize) -
+  int32_t adjust = frame_size - (gpr_count * kFramePointerSize) -
                    sizeof(StackReference<mirror::ArtMethod>) /*method*/ -
                    kFramePointerSize /*return address*/;
   addl(ESP, Immediate(-adjust));
@@ -1536,9 +1540,18 @@
   DW_CFA_def_cfa_offset(&cfi_info_, cfi_cfa_offset_);
 
   for (size_t i = 0; i < entry_spills.size(); ++i) {
-    movl(Address(ESP, frame_size + sizeof(StackReference<mirror::ArtMethod>) +
-                 (i * kFramePointerSize)),
-         entry_spills.at(i).AsX86().AsCpuRegister());
+    ManagedRegisterSpill spill = entry_spills.at(i);
+    if (spill.AsX86().IsCpuRegister()) {
+      movl(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsCpuRegister());
+    } else {
+      DCHECK(spill.AsX86().IsXmmRegister());
+      if (spill.getSize() == 8) {
+        movsd(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsXmmRegister());
+      } else {
+        CHECK_EQ(spill.getSize(), 4);
+        movss(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsXmmRegister());
+      }
+    }
   }
 }
 
@@ -1548,7 +1561,9 @@
   addl(ESP, Immediate(frame_size - (spill_regs.size() * kFramePointerSize) -
                       sizeof(StackReference<mirror::ArtMethod>)));
   for (size_t i = 0; i < spill_regs.size(); ++i) {
-    popl(spill_regs.at(i).AsX86().AsCpuRegister());
+    x86::X86ManagedRegister spill = spill_regs.at(i).AsX86();
+    DCHECK(spill.IsCpuRegister());
+    popl(spill.AsCpuRegister());
   }
   ret();
 }
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 285007c..180f860 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -302,7 +302,7 @@
 #if defined(__i386__)
     // TODO: Set the thread?
     __asm__ __volatile__(
-        "movd %[hidden], %%xmm0\n\t"
+        "movd %[hidden], %%xmm7\n\t"
         "subl $12, %%esp\n\t"       // Align stack.
         "pushl %[referrer]\n\t"     // Store referrer
         "call *%%edi\n\t"           // Call the stub
diff --git a/runtime/arch/x86/asm_support_x86.h b/runtime/arch/x86/asm_support_x86.h
index 5a88f80..b0a6017 100644
--- a/runtime/arch/x86/asm_support_x86.h
+++ b/runtime/arch/x86/asm_support_x86.h
@@ -21,6 +21,8 @@
 
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 32
 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 32
-#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 32
+
+// 32 bytes for GPRs and 32 bytes for FPRs.
+#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE (32 + 32)
 
 #endif  // ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_H_
diff --git a/runtime/arch/x86/context_x86.cc b/runtime/arch/x86/context_x86.cc
index 49aa326..3af7d7c 100644
--- a/runtime/arch/x86/context_x86.cc
+++ b/runtime/arch/x86/context_x86.cc
@@ -40,7 +40,7 @@
   mirror::ArtMethod* method = fr.GetMethod();
   const QuickMethodFrameInfo frame_info = method->GetQuickFrameInfo();
   size_t spill_count = POPCOUNT(frame_info.CoreSpillMask());
-  DCHECK_EQ(frame_info.FpSpillMask(), 0u);
+  // We don't have any callee save FPRs, so ignore them.
   if (spill_count > 0) {
     // Lowest number spill is farthest away, walk registers and fill into context.
     int j = 2;  // Offset j to skip return address spill.
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 302b9f8..05dc640 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -90,6 +90,15 @@
     PUSH ebx  // Save args
     PUSH edx
     PUSH ecx
+    // Create space for FPR args.
+    subl MACRO_LITERAL(4 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(4 * 8)
+    // Save FPRs.
+    movsd %xmm0, 0(%esp)
+    movsd %xmm1, 8(%esp)
+    movsd %xmm2, 16(%esp)
+    movsd %xmm3, 24(%esp)
+
     SETUP_GOT_NOSAVE VAR(got_reg, 0)
     // Load Runtime::instance_ from GOT.
     movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg, 0)), REG_VAR(temp_reg, 1)
@@ -102,7 +111,7 @@
 
     // Ugly compile-time check, but we only have the preprocessor.
     // Last +4: implicit return address pushed on stack when caller made call.
-#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 7*4 + 4)
+#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 7*4 + 4*8 + 4)
 #error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(X86) size not as expected."
 #endif
 END_MACRO
@@ -112,20 +121,39 @@
      * Runtime::CreateCalleeSaveMethod(kRefsAndArgs) where the method is passed in EAX.
      */
 MACRO0(SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_WITH_METHOD_IN_EAX)
+    // Save callee and GPR args, mixed together to agree with core spills bitmap.
     PUSH edi  // Save callee saves
     PUSH esi
     PUSH ebp
     PUSH ebx  // Save args
     PUSH edx
     PUSH ecx
+
+    // Create space for FPR args.
+    subl MACRO_LITERAL(32), %esp
+    CFI_ADJUST_CFA_OFFSET(32)
+
+    // Save FPRs.
+    movsd %xmm0, 0(%esp)
+    movsd %xmm1, 8(%esp)
+    movsd %xmm2, 16(%esp)
+    movsd %xmm3, 24(%esp)
+
     PUSH eax  // Store the ArtMethod reference at the bottom of the stack.
     // Store esp as the stop quick frame.
     movl %esp, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
 END_MACRO
 
 MACRO0(RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME)
-    addl MACRO_LITERAL(4), %esp  // Remove padding
-    CFI_ADJUST_CFA_OFFSET(-4)
+    // Restore FPRs. EAX is still on the stack.
+    movsd 4(%esp), %xmm0
+    movsd 12(%esp), %xmm1
+    movsd 20(%esp), %xmm2
+    movsd 28(%esp), %xmm3
+
+    addl MACRO_LITERAL(36), %esp  // Remove FPRs and EAX.
+    CFI_ADJUST_CFA_OFFSET(-36)
+
     POP ecx  // Restore args except eax
     POP edx
     POP ebx
@@ -134,6 +162,30 @@
     POP edi
 END_MACRO
 
+// Restore registers and jump to the routine whose code pointer is in EDI.
+// Inputs:  EDI contains pointer to code.
+// Note: also pops EAX (restoring Method*).
+MACRO0(RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME_AND_JUMP)
+    POP eax  // Restore Method*
+
+    // Restore FPRs.
+    movsd 0(%esp), %xmm0
+    movsd 8(%esp), %xmm1
+    movsd 16(%esp), %xmm2
+    movsd 24(%esp), %xmm3
+
+    addl MACRO_LITERAL(32), %esp  // Remove FPRs.
+    CFI_ADJUST_CFA_OFFSET(-32)
+
+    POP ecx  // Restore args except eax
+    POP edx
+    POP ebx
+    POP ebp  // Restore callee saves
+    POP esi
+    xchgl 0(%esp),%edi // restore EDI and place code pointer as only value on stack
+    ret
+END_MACRO
+
     /*
      * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
      * exception is Thread::Current()->exception_.
@@ -243,13 +295,14 @@
     DEFINE_FUNCTION RAW_VAR(c_name, 0)
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME ebx, ebx
     movl %esp, %edx  // remember SP
+
     // Outgoing argument set up
     subl MACRO_LITERAL(12), %esp  // alignment padding
     CFI_ADJUST_CFA_OFFSET(12)
     PUSH edx                      // pass SP
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
     CFI_ADJUST_CFA_OFFSET(4)
-    pushl 32(%edx)                // pass caller Method*
+    pushl 32+32(%edx)             // pass caller Method*
     CFI_ADJUST_CFA_OFFSET(4)
     PUSH ecx                      // pass arg2
     PUSH eax                      // pass arg1
@@ -257,6 +310,17 @@
     movl %edx, %edi               // save code pointer in EDI
     addl MACRO_LITERAL(36), %esp  // Pop arguments skip eax
     CFI_ADJUST_CFA_OFFSET(-36)
+
+    // Restore FPRs.
+    movsd 0(%esp), %xmm0
+    movsd 8(%esp), %xmm1
+    movsd 16(%esp), %xmm2
+    movsd 24(%esp), %xmm3
+
+    // Remove space for FPR args.
+    addl MACRO_LITERAL(4 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(-4 * 8)
+
     POP ecx  // Restore args except eax
     POP edx
     POP ebx
@@ -284,7 +348,63 @@
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
     /*
-     * Quick invocation stub.
+     * Helper for quick invocation stub to set up XMM registers. Assumes EBX == shorty,
+     * ECX == arg_array. Clobbers EBX, ECX and al. Branches to xmm_setup_finished if it encounters
+     * the end of the shorty.
+     */
+MACRO2(LOOP_OVER_SHORTY_LOADING_XMMS, xmm_reg, finished)
+1: // LOOP
+    movb (%ebx), %al              // al := *shorty
+    addl MACRO_LITERAL(1), %ebx   // shorty++
+    cmpb MACRO_LITERAL(0), %al    // if (al == '\0') goto xmm_setup_finished
+    je VAR(finished, 1)
+    cmpb MACRO_LITERAL(68), %al   // if (al == 'D') goto FOUND_DOUBLE
+    je 2f
+    cmpb MACRO_LITERAL(70), %al   // if (al == 'F') goto FOUND_FLOAT
+    je 3f
+    addl MACRO_LITERAL(4), %ecx   // arg_array++
+    //  Handle extra space in arg array taken by a long.
+    cmpb MACRO_LITERAL(74), %al   // if (al != 'J') goto LOOP
+    jne 1b
+    addl MACRO_LITERAL(4), %ecx   // arg_array++
+    jmp 1b                        // goto LOOP
+2:  // FOUND_DOUBLE
+    movsd (%ecx), REG_VAR(xmm_reg, 0)
+    addl MACRO_LITERAL(8), %ecx   // arg_array+=2
+    jmp 4f
+3:  // FOUND_FLOAT
+    movss (%ecx), REG_VAR(xmm_reg, 0)
+    addl MACRO_LITERAL(4), %ecx   // arg_array++
+4:
+END_MACRO
+
+    /*
+     * Helper for quick invocation stub to skip over float/double args in the shorty, advancing
+     * ESI (shorty) and EDI (arg_array) past them. Clobbers ESI, EDI and al. Branches to
+     * gpr_setup_finished if it encounters the end of the shorty.
+     */
+MACRO1(SKIP_OVER_FLOATS, finished)
+1: // LOOP
+    movb (%esi), %al              // al := *shorty
+    addl MACRO_LITERAL(1), %esi   // shorty++
+    cmpb MACRO_LITERAL(0), %al    // if (al == '\0') goto gpr_setup_finished
+    je  VAR(finished, 2)
+    cmpb MACRO_LITERAL(70), %al   // if (al == 'F') goto SKIP_FLOAT
+    je 3f
+    cmpb MACRO_LITERAL(68), %al   // if (al == 'D') goto SKIP_DOUBLE
+    je 4f
+    jmp 5f
+3:  // SKIP_FLOAT
+    addl MACRO_LITERAL(4), %edi   // arg_array++
+    jmp 1b
+4:  // SKIP_DOUBLE
+    addl MACRO_LITERAL(8), %edi   // arg_array+=2
+    jmp 1b
+5:
+END_MACRO
+
+    /*
+     * Quick invocation stub (non-static).
      * On entry:
      *   [sp] = return address
      *   [sp + 4] = method pointer
@@ -295,6 +415,17 @@
      *   [sp + 24] = shorty
      */
 DEFINE_FUNCTION art_quick_invoke_stub
+    // Set up argument XMM registers.
+    mov 24(%esp), %ebx           // EBX := shorty + 1  ; ie skip return arg character.
+    addl LITERAL(1), %ebx
+    mov 8(%esp), %ecx            // ECX := arg_array + 4 ; ie skip this pointer.
+    addl LITERAL(4), %ecx
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm0, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm1, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm2, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm3, .Lxmm_setup_finished
+    .balign 16
+.Lxmm_setup_finished:
     PUSH ebp                      // save ebp
     PUSH ebx                      // save ebx
     PUSH esi                      // save esi
@@ -308,18 +439,41 @@
     andl LITERAL(0xFFFFFFF0), %ebx
     subl LITERAL(20), %ebx        // remove space for return address, ebx, ebp, esi and edi
     subl %ebx, %esp               // reserve stack space for argument array
-    SETUP_GOT_NOSAVE ebx          // clobbers ebx (harmless here)
-    lea  4(%esp), %eax            // use stack pointer + method ptr as dest for memcpy
-    pushl 28(%ebp)                // push size of region to memcpy
-    pushl 24(%ebp)                // push arg array as source of memcpy
-    pushl %eax                    // push stack pointer as destination of memcpy
-    call PLT_SYMBOL(memcpy)       // (void*, const void*, size_t)
-    addl LITERAL(12), %esp        // pop arguments to memcpy
+
     movl LITERAL(0), (%esp)       // store NULL for method*
+
+    // Copy the arg array onto the stack.
+    movl 28(%ebp), %ecx           // ECX = size of args
+    movl 24(%ebp), %esi           // ESI = argument array
+    leal 4(%esp), %edi            // EDI = just after Method* in stack arguments
+    rep movsb                     // while (ecx--) { *edi++ = *esi++ }
+
+    mov 40(%ebp), %esi            // ESI := shorty + 1  ; ie skip return arg character.
+    addl LITERAL(1), %esi
+    mov 24(%ebp), %edi            // EDI := arg_array
+    mov 0(%edi), %ecx             // ECX := this pointer
+    addl LITERAL(4), %edi         // EDI := arg_array + 4 ; ie skip this pointer.
+
+    // Enumerate the possible cases for loading GPRs.
+    // edx (and maybe ebx):
+    SKIP_OVER_FLOATS .Lgpr_setup_finished
+    cmpb MACRO_LITERAL(74), %al   // if (al == 'J') goto .LfirstLong
+    je .LfirstLong
+    // Must be an integer value.
+    movl (%edi), %edx
+    addl LITERAL(4), %edi         // arg_array++
+
+    // Now check ebx
+    SKIP_OVER_FLOATS .Lgpr_setup_finished
+    // Must be first word of a long, or an integer.
+    movl (%edi), %ebx
+    jmp .Lgpr_setup_finished
+.LfirstLong:
+    movl (%edi), %edx
+    movl 4(%edi), %ebx
+    // Nothing left to load.
+.Lgpr_setup_finished:
     mov 20(%ebp), %eax            // move method pointer into eax
-    mov 4(%esp), %ecx             // copy arg1 into ecx
-    mov 8(%esp), %edx             // copy arg2 into edx
-    mov 12(%esp), %ebx            // copy arg3 into ebx
     call *MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32(%eax) // call the method
     mov %ebp, %esp                // restore stack pointer
     CFI_DEF_CFA_REGISTER(esp)
@@ -344,6 +498,119 @@
     ret
 END_FUNCTION art_quick_invoke_stub
 
+    /*
+     * Quick invocation stub (static).
+     * On entry:
+     *   [sp] = return address
+     *   [sp + 4] = method pointer
+     *   [sp + 8] = argument array or NULL for no argument methods
+     *   [sp + 12] = size of argument array in bytes
+     *   [sp + 16] = (managed) thread pointer
+     *   [sp + 20] = JValue* result
+     *   [sp + 24] = shorty
+     */
+DEFINE_FUNCTION art_quick_invoke_static_stub
+    // Set up argument XMM registers.
+    mov 24(%esp), %ebx           // EBX := shorty + 1  ; ie skip return arg character.
+    addl LITERAL(1), %ebx
+    mov 8(%esp), %ecx            // ECX := arg_array
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm0, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm1, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm2, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm3, .Lxmm_setup_finished2
+    .balign 16
+.Lxmm_setup_finished2:
+    PUSH ebp                      // save ebp
+    PUSH ebx                      // save ebx
+    PUSH esi                      // save esi
+    PUSH edi                      // save edi
+    mov %esp, %ebp                // copy value of stack pointer into base pointer
+    CFI_DEF_CFA_REGISTER(ebp)
+    mov 28(%ebp), %ebx            // get arg array size
+    // reserve space for return addr, method*, ebx, ebp, esi, and edi in frame
+    addl LITERAL(36), %ebx
+    // align frame size to 16 bytes
+    andl LITERAL(0xFFFFFFF0), %ebx
+    subl LITERAL(20), %ebx        // remove space for return address, ebx, ebp, esi and edi
+    subl %ebx, %esp               // reserve stack space for argument array
+
+    movl LITERAL(0), (%esp)       // store NULL for method*
+
+    // Copy the arg array onto the stack.
+    movl 28(%ebp), %ecx           // ECX = size of args
+    movl 24(%ebp), %esi           // ESI = argument array
+    leal 4(%esp), %edi            // EDI = just after Method* in stack arguments
+    rep movsb                     // while (ecx--) { *edi++ = *esi++ }
+
+    mov 40(%ebp), %esi            // ESI := shorty + 1  ; ie skip return arg character.
+    addl LITERAL(1), %esi
+    mov 24(%ebp), %edi            // EDI := arg_array
+
+    // Enumerate the possible cases for loading GPRs.
+    // ecx (and maybe edx)
+    SKIP_OVER_FLOATS .Lgpr_setup_finished2
+    cmpb MACRO_LITERAL(74), %al   // if (al == 'J') goto .LfirstLong2
+    je .LfirstLong2
+    // Must be an integer value.  Load into ECX.
+    movl (%edi), %ecx
+    addl LITERAL(4), %edi         // arg_array++
+
+    // Now check edx (and maybe ebx).
+    SKIP_OVER_FLOATS .Lgpr_setup_finished2
+    cmpb MACRO_LITERAL(74), %al   // if (al == 'J') goto .LSecondLong2
+    je .LSecondLong2
+    // Must be an integer.  Load into EDX.
+    movl (%edi), %edx
+    addl LITERAL(4), %edi         // arg_array++
+
+    // Is there anything for ebx?
+    SKIP_OVER_FLOATS .Lgpr_setup_finished2
+    // First word of long or integer.  Load into EBX.
+    movl (%edi), %ebx
+    jmp .Lgpr_setup_finished2
+.LSecondLong2:
+    // EDX:EBX is long.  That is all.
+    movl (%edi), %edx
+    movl 4(%edi), %ebx
+    jmp .Lgpr_setup_finished2
+.LfirstLong2:
+    // ECX:EDX is a long
+    movl (%edi), %ecx
+    movl 4(%edi), %edx
+    addl LITERAL(8), %edi         // arg_array += 2
+
+    // Anything for EBX?
+    SKIP_OVER_FLOATS .Lgpr_setup_finished2
+    // First word of long or integer.  Load into EBX.
+    movl (%edi), %ebx
+    jmp .Lgpr_setup_finished2
+    // Nothing left to load.
+.Lgpr_setup_finished2:
+    mov 20(%ebp), %eax            // move method pointer into eax
+    call *MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32(%eax) // call the method
+    mov %ebp, %esp                // restore stack pointer
+    CFI_DEF_CFA_REGISTER(esp)
+    POP edi                       // pop edi
+    POP esi                       // pop esi
+    POP ebx                       // pop ebx
+    POP ebp                       // pop ebp
+    mov 20(%esp), %ecx            // get result pointer
+    mov %eax, (%ecx)              // store the result assuming it's a long, int or Object*
+    mov %edx, 4(%ecx)             // store the other half of the result
+    mov 24(%esp), %edx            // get the shorty
+    cmpb LITERAL(68), (%edx)      // test if result type char == 'D'
+    je .Lreturn_double_quick2
+    cmpb LITERAL(70), (%edx)      // test if result type char == 'F'
+    je .Lreturn_float_quick2
+    ret
+.Lreturn_double_quick2:
+    movsd %xmm0, (%ecx)           // store the floating point result
+    ret
+.Lreturn_float_quick2:
+    movss %xmm0, (%ecx)           // store the floating point result
+    ret
+END_FUNCTION art_quick_invoke_static_stub
+
 MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION RAW_VAR(c_name, 0)
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME ebx, ebx  // save ref containing registers for GC
@@ -802,12 +1069,10 @@
 NO_ARG_DOWNCALL art_quick_test_suspend, artTestSuspendFromCode, ret
 
 DEFINE_FUNCTION art_quick_fmod
-    subl LITERAL(12), %esp        // alignment padding
-    CFI_ADJUST_CFA_OFFSET(12)
-    PUSH ebx                      // pass arg4 b.hi
-    PUSH edx                      // pass arg3 b.lo
-    PUSH ecx                      // pass arg2 a.hi
-    PUSH eax                      // pass arg1 a.lo
+    subl LITERAL(28), %esp        // alignment padding, room for arguments
+    CFI_ADJUST_CFA_OFFSET(28)
+    movsd %xmm0, 0(%esp)          // arg a
+    movsd %xmm1, 8(%esp)          // arg b
     SETUP_GOT_NOSAVE ebx          // clobbers EBX
     call PLT_SYMBOL(fmod)         // (jdouble a, jdouble b)
     fstpl (%esp)                  // pop return value off fp stack
@@ -818,9 +1083,10 @@
 END_FUNCTION art_quick_fmod
 
 DEFINE_FUNCTION art_quick_fmodf
-    PUSH eax                      // alignment padding
-    PUSH ecx                      // pass arg2 b
-    PUSH eax                      // pass arg1 a
+    subl LITERAL(12), %esp        // alignment padding, room for arguments
+    CFI_ADJUST_CFA_OFFSET(12)
+    movss %xmm0, 0(%esp)          // arg a
+    movss %xmm1, 4(%esp)          // arg b
     SETUP_GOT_NOSAVE ebx          // clobbers EBX
     call PLT_SYMBOL(fmodf)        // (jfloat a, jfloat b)
     fstps (%esp)                  // pop return value off fp stack
@@ -831,20 +1097,20 @@
 END_FUNCTION art_quick_fmodf
 
 DEFINE_FUNCTION art_quick_d2l
-    PUSH eax                      // alignment padding
-    PUSH ecx                      // pass arg2 a.hi
-    PUSH eax                      // pass arg1 a.lo
-    call SYMBOL(art_d2l)      // (jdouble a)
+    subl LITERAL(12), %esp        // alignment padding, room for argument
+    CFI_ADJUST_CFA_OFFSET(12)
+    movsd %xmm0, 0(%esp)          // arg a
+    call SYMBOL(art_d2l)          // (jdouble a)
     addl LITERAL(12), %esp        // pop arguments
     CFI_ADJUST_CFA_OFFSET(-12)
     ret
 END_FUNCTION art_quick_d2l
 
 DEFINE_FUNCTION art_quick_f2l
-    subl LITERAL(8), %esp         // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
-    PUSH eax                      // pass arg1 a
-    call SYMBOL(art_f2l)      // (jfloat a)
+    subl LITERAL(12), %esp        // alignment padding, room for argument
+    CFI_ADJUST_CFA_OFFSET(12)
+    movss %xmm0, 0(%esp)          // arg a
+    call SYMBOL(art_f2l)          // (jfloat a)
     addl LITERAL(12), %esp        // pop arguments
     CFI_ADJUST_CFA_OFFSET(-12)
     ret
@@ -1004,8 +1270,8 @@
     movd %eax, %xmm0              // place return value also into floating point return value
     movd %edx, %xmm1
     punpckldq %xmm1, %xmm0
-    addl LITERAL(44), %esp        // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-44)
+    addl LITERAL(76), %esp        // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-76)
     RETURN_OR_DELIVER_PENDING_EXCEPTION    // return or deliver exception
 END_FUNCTION art_quick_proxy_invoke_handler
 
@@ -1017,7 +1283,7 @@
     PUSH ecx
     movl 8(%esp), %eax            // load caller Method*
     movl MIRROR_ART_METHOD_DEX_CACHE_METHODS_OFFSET(%eax), %eax  // load dex_cache_resolved_methods
-    movd %xmm0, %ecx              // get target method index stored in xmm0
+    movd %xmm7, %ecx              // get target method index stored in xmm7
     movl MIRROR_OBJECT_ARRAY_DATA_OFFSET(%eax, %ecx, 4), %eax  // load the target method
     POP ecx
     jmp SYMBOL(art_quick_invoke_interface_trampoline)
@@ -1036,14 +1302,7 @@
     addl LITERAL(16), %esp        // pop arguments
     test %eax, %eax               // if code pointer is NULL goto deliver pending exception
     jz 1f
-    POP eax                       // called method
-    POP ecx                       // restore args
-    POP edx
-    POP ebx
-    POP ebp                       // restore callee saves except EDI
-    POP esi
-    xchgl 0(%esp),%edi            // restore EDI and place code pointer as only value on stack
-    ret                           // tail call into method
+    RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME_AND_JUMP
 1:
     RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     DELIVER_PENDING_EXCEPTION
@@ -1077,7 +1336,6 @@
     movl %edx, %esp
 
     // On x86 there are no registers passed, so nothing to pop here.
-
     // Native call.
     call *%eax
 
@@ -1104,8 +1362,10 @@
     jnz .Lexception_in_native
 
     // Tear down the callee-save frame.
-    addl LITERAL(4), %esp     // Remove padding
-    CFI_ADJUST_CFA_OFFSET(-4)
+    // Remove space for FPR args and EAX
+    addl MACRO_LITERAL(4 + 4 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(-(4 + 4 * 8))
+
     POP ecx
     addl LITERAL(4), %esp     // Avoid edx, as it may be part of the result.
     CFI_ADJUST_CFA_OFFSET(-4)
@@ -1135,12 +1395,21 @@
     CFI_ADJUST_CFA_OFFSET(4)
     PUSH eax                      // pass  method
     call SYMBOL(artQuickToInterpreterBridge)  // (method, Thread*, SP)
-    movd %eax, %xmm0              // place return value also into floating point return value
-    movd %edx, %xmm1
-    punpckldq %xmm1, %xmm0
     addl LITERAL(16), %esp        // pop arguments
     CFI_ADJUST_CFA_OFFSET(-16)
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+
+    // Return eax:edx in xmm0 also.
+    movd %eax, %xmm0
+    movd %edx, %xmm1
+    punpckldq %xmm1, %xmm0
+
+    addl MACRO_LITERAL(48), %esp  // Remove FPRs and EAX, ECX, EDX, EBX.
+    CFI_ADJUST_CFA_OFFSET(-48)
+
+    POP ebp  // Restore callee saves
+    POP esi
+    POP edi
+
     RETURN_OR_DELIVER_PENDING_EXCEPTION    // return or deliver exception
 END_FUNCTION art_quick_to_interpreter_bridge
 
@@ -1160,18 +1429,25 @@
     PUSH eax                      // Pass Method*.
     call SYMBOL(artInstrumentationMethodEntryFromCode) // (Method*, Object*, Thread*, LR)
     addl LITERAL(28), %esp        // Pop arguments upto saved Method*.
-    movl 28(%esp), %edi           // Restore edi.
-    movl %eax, 28(%esp)           // Place code* over edi, just under return pc.
+    movl 60(%esp), %edi           // Restore edi.
+    movl %eax, 60(%esp)           // Place code* over edi, just under return pc.
     movl SYMBOL(art_quick_instrumentation_exit)@GOT(%ebx), %ebx
     // Place instrumentation exit as return pc. ebx holds the GOT computed on entry.
-    movl %ebx, 32(%esp)
-    movl (%esp), %eax             // Restore eax.
-    movl 8(%esp), %ecx            // Restore ecx.
-    movl 12(%esp), %edx           // Restore edx.
-    movl 16(%esp), %ebx           // Restore ebx.
-    movl 20(%esp), %ebp           // Restore ebp.
-    movl 24(%esp), %esi           // Restore esi.
-    addl LITERAL(28), %esp        // Wind stack back upto code*.
+    movl %ebx, 64(%esp)
+    movl 0(%esp), %eax           // Restore eax.
+    // Restore FPRs (extra 4 bytes of offset due to EAX push at top).
+    movsd 8(%esp), %xmm0
+    movsd 16(%esp), %xmm1
+    movsd 24(%esp), %xmm2
+    movsd 32(%esp), %xmm3
+
+    // Restore GPRs.
+    movl 40(%esp), %ecx           // Restore ecx.
+    movl 44(%esp), %edx           // Restore edx.
+    movl 48(%esp), %ebx           // Restore ebx.
+    movl 52(%esp), %ebp           // Restore ebp.
+    movl 56(%esp), %esi           // Restore esi.
+    addl LITERAL(60), %esp        // Wind stack back up to code*.
     ret                           // Call method (and pop).
 END_FUNCTION art_quick_instrumentation_entry
 
diff --git a/runtime/arch/x86/quick_method_frame_info_x86.h b/runtime/arch/x86/quick_method_frame_info_x86.h
index b9dc0d8..9bba531 100644
--- a/runtime/arch/x86/quick_method_frame_info_x86.h
+++ b/runtime/arch/x86/quick_method_frame_info_x86.h
@@ -24,25 +24,44 @@
 namespace art {
 namespace x86 {
 
+enum XMM {
+  XMM0 = 0,
+  XMM1 = 1,
+  XMM2 = 2,
+  XMM3 = 3,
+  XMM4 = 4,
+  XMM5 = 5,
+  XMM6 = 6,
+  XMM7 = 7,
+};
+
 static constexpr uint32_t kX86CalleeSaveRefSpills =
     (1 << art::x86::EBP) | (1 << art::x86::ESI) | (1 << art::x86::EDI);
 static constexpr uint32_t kX86CalleeSaveArgSpills =
     (1 << art::x86::ECX) | (1 << art::x86::EDX) | (1 << art::x86::EBX);
+static constexpr uint32_t kX86CalleeSaveFpArgSpills =
+    (1 << art::x86::XMM0) | (1 << art::x86::XMM1) |
+    (1 << art::x86::XMM2) | (1 << art::x86::XMM3);
 
 constexpr uint32_t X86CalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
   return kX86CalleeSaveRefSpills | (type == Runtime::kRefsAndArgs ? kX86CalleeSaveArgSpills : 0) |
       (1 << art::x86::kNumberOfCpuRegisters);  // fake return address callee save
 }
 
+constexpr uint32_t X86CalleeSaveFpSpills(Runtime::CalleeSaveType type) {
+    return type == Runtime::kRefsAndArgs ? kX86CalleeSaveFpArgSpills : 0;
+}
+
 constexpr uint32_t X86CalleeSaveFrameSize(Runtime::CalleeSaveType type) {
   return RoundUp((POPCOUNT(X86CalleeSaveCoreSpills(type)) /* gprs */ +
+                  2 * POPCOUNT(X86CalleeSaveFpSpills(type)) /* fprs */ +
                   1 /* Method* */) * kX86PointerSize, kStackAlignment);
 }
 
 constexpr QuickMethodFrameInfo X86CalleeSaveMethodFrameInfo(Runtime::CalleeSaveType type) {
   return QuickMethodFrameInfo(X86CalleeSaveFrameSize(type),
                               X86CalleeSaveCoreSpills(type),
-                              0u);
+                              X86CalleeSaveFpSpills(type));
 }
 
 }  // namespace x86
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 9db1646..87dd790 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -145,14 +145,18 @@
   // | EBX         |    arg3
   // | EDX         |    arg2
   // | ECX         |    arg1
+  // | XMM3        |    float arg 4
+  // | XMM2        |    float arg 3
+  // | XMM1        |    float arg 2
+  // | XMM0        |    float arg 1
   // | EAX/Method* |  <- sp
-  static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4;  // Offset of first GPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 28;  // Offset of return address.
+  static constexpr size_t kNumQuickFprArgs = 4;  // 4 arguments passed in FPRs.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 4;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4 + 4*8;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 28 + 4*8;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
     return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc
index ff3822a..71f06db 100644
--- a/runtime/mirror/art_method.cc
+++ b/runtime/mirror/art_method.cc
@@ -41,7 +41,7 @@
 
 extern "C" void art_quick_invoke_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*,
                                       const char*);
-#if defined(__LP64__) || defined(__arm__)
+#if defined(__LP64__) || defined(__arm__) || defined(__i386__)
 extern "C" void art_quick_invoke_static_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*,
                                              const char*);
 #endif
@@ -417,7 +417,7 @@
             << "Don't call compiled code when -Xint " << PrettyMethod(this);
       }
 
-#if defined(__LP64__) || defined(__arm__)
+#if defined(__LP64__) || defined(__arm__) || defined(__i386__)
       if (!IsStatic()) {
         (*art_quick_invoke_stub)(this, args, args_size, self, result, shorty);
       } else {