Revert "Revert "ART: Implement X86 hard float (Quick/JNI/Baseline)""

This reverts commit 949c91fb91f40a4a80b2b492913cf8541008975e.

This time, don't clobber EBX before saving it.

Redo some of the macros to make register usage explicit.

Change-Id: I8db8662877cd006816e16a28f42444ab7c36bfef
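For reference, the managed x86 ABI after this change is: the first four
float/double arguments go in XMM0..XMM3, integer and reference arguments
keep ECX, EDX and EBX (a long takes a register pair), the remainder goes
to the stack, and the hidden argument for interface calls moves from XMM0
to XMM7 so it cannot collide with the first float argument. A minimal
illustrative sketch of the mapping (standalone C++; the names are not ART
API):

    #include <string>
    #include <vector>

    // Hypothetical model of the new convention: independent counters per
    // register class, stack for overflow. 'args' holds the shorty argument
    // characters only (no return type).
    std::vector<std::string> MapArgs(const std::string& args) {
      static const char* kGprs[] = {"ECX", "EDX", "EBX"};
      static const char* kFprs[] = {"XMM0", "XMM1", "XMM2", "XMM3"};
      size_t gpr = 0, fpr = 0;
      std::vector<std::string> out;
      for (char c : args) {
        if (c == 'F' || c == 'D') {
          out.push_back(fpr < 4 ? kFprs[fpr++] : "stack");
        } else if (c == 'J' && gpr + 1 < 3) {
          out.push_back(std::string(kGprs[gpr]) + ":" + kGprs[gpr + 1]);
          gpr += 2;  // a long consumes a GPR pair
        } else {
          out.push_back(gpr < 3 ? kGprs[gpr++] : "stack");
        }
      }
      return out;  // e.g. "JFI" -> {"ECX:EDX", "XMM0", "EBX"}
    }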
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 3815722..811d4f5 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -41,22 +41,15 @@
     }
    protected:
     Mir2Lir* m2l_;
-   private:
     size_t cur_core_reg_;
     size_t cur_fp_reg_;
   };
 
-  class InToRegStorageX86Mapper : public InToRegStorageMapper {
+  class InToRegStorageX86Mapper : public InToRegStorageX86_64Mapper {
    public:
-    explicit InToRegStorageX86Mapper(Mir2Lir* m2l) : m2l_(m2l), cur_core_reg_(0) {}
+    explicit InToRegStorageX86Mapper(Mir2Lir* m2l)
+        : InToRegStorageX86_64Mapper(m2l) { }
     virtual RegStorage GetNextReg(ShortyArg arg);
-    virtual void Reset() OVERRIDE {
-      cur_core_reg_ = 0;
-    }
-   protected:
-    Mir2Lir* m2l_;
-   private:
-    size_t cur_core_reg_;
   };
 
   InToRegStorageX86_64Mapper in_to_reg_storage_x86_64_mapper_;
@@ -120,9 +113,12 @@
       if (cu_->target64) {
         return As64BitReg(TargetReg32(symbolic_reg));
       } else {
+        if (symbolic_reg >= kFArg0 && symbolic_reg <= kFArg3) {
+          // We want an XMM, not a pair.
+          return As64BitReg(TargetReg32(symbolic_reg));
+        }
         // x86: construct a pair.
         DCHECK((kArg0 <= symbolic_reg && symbolic_reg < kArg3) ||
-               (kFArg0 <= symbolic_reg && symbolic_reg < kFArg3) ||
                (kRet0 == symbolic_reg));
         return RegStorage::MakeRegPair(TargetReg32(symbolic_reg),
                                  TargetReg32(static_cast<SpecialTargetRegister>(symbolic_reg + 1)));
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index bc64aad..0337096 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -180,10 +180,10 @@
   RegStorage::InvalidReg(),  // kArg5
   RegStorage::InvalidReg(),  // kArg6
   RegStorage::InvalidReg(),  // kArg7
-  rs_rAX,                    // kFArg0
-  rs_rCX,                    // kFArg1
-  rs_rDX,                    // kFArg2
-  rs_rBX,                    // kFArg3
+  rs_fr0,                    // kFArg0
+  rs_fr1,                    // kFArg1
+  rs_fr2,                    // kFArg2
+  rs_fr3,                    // kFArg3
   RegStorage::InvalidReg(),  // kFArg4
   RegStorage::InvalidReg(),  // kFArg5
   RegStorage::InvalidReg(),  // kFArg6
@@ -200,7 +200,7 @@
   rs_rDX,                    // kRet1
   rs_rAX,                    // kInvokeTgt
   rs_rAX,                    // kHiddenArg - used to hold the method index before copying to fr0.
-  rs_fr0,                    // kHiddenFpArg
+  rs_fr7,                    // kHiddenFpArg
   rs_rCX,                    // kCount
 };
 
@@ -545,13 +545,13 @@
   LockTemp(TargetReg32(kArg1));
   LockTemp(TargetReg32(kArg2));
   LockTemp(TargetReg32(kArg3));
+  LockTemp(TargetReg32(kFArg0));
+  LockTemp(TargetReg32(kFArg1));
+  LockTemp(TargetReg32(kFArg2));
+  LockTemp(TargetReg32(kFArg3));
   if (cu_->target64) {
     LockTemp(TargetReg32(kArg4));
     LockTemp(TargetReg32(kArg5));
-    LockTemp(TargetReg32(kFArg0));
-    LockTemp(TargetReg32(kFArg1));
-    LockTemp(TargetReg32(kFArg2));
-    LockTemp(TargetReg32(kFArg3));
     LockTemp(TargetReg32(kFArg4));
     LockTemp(TargetReg32(kFArg5));
     LockTemp(TargetReg32(kFArg6));
@@ -566,13 +566,13 @@
   FreeTemp(TargetReg32(kArg2));
   FreeTemp(TargetReg32(kArg3));
   FreeTemp(TargetReg32(kHiddenArg));
+  FreeTemp(TargetReg32(kFArg0));
+  FreeTemp(TargetReg32(kFArg1));
+  FreeTemp(TargetReg32(kFArg2));
+  FreeTemp(TargetReg32(kFArg3));
   if (cu_->target64) {
     FreeTemp(TargetReg32(kArg4));
     FreeTemp(TargetReg32(kArg5));
-    FreeTemp(TargetReg32(kFArg0));
-    FreeTemp(TargetReg32(kFArg1));
-    FreeTemp(TargetReg32(kFArg2));
-    FreeTemp(TargetReg32(kFArg3));
     FreeTemp(TargetReg32(kFArg4));
     FreeTemp(TargetReg32(kFArg5));
     FreeTemp(TargetReg32(kFArg6));
@@ -2460,14 +2460,23 @@
 RegStorage X86Mir2Lir::InToRegStorageX86Mapper::GetNextReg(ShortyArg arg) {
   const SpecialTargetRegister coreArgMappingToPhysicalReg[] = {kArg1, kArg2, kArg3};
   const size_t coreArgMappingToPhysicalRegSize = arraysize(coreArgMappingToPhysicalReg);
+  const SpecialTargetRegister fpArgMappingToPhysicalReg[] = {kFArg0, kFArg1, kFArg2, kFArg3};
+  const size_t fpArgMappingToPhysicalRegSize = arraysize(fpArgMappingToPhysicalReg);
 
   RegStorage result = RegStorage::InvalidReg();
-  if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-    result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
-                          arg.IsRef() ? kRef : kNotWide);
-    if (arg.IsWide() && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-      result = RegStorage::MakeRegPair(
-          result, m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], kNotWide));
+  if (arg.IsFP()) {
+    if (cur_fp_reg_ < fpArgMappingToPhysicalRegSize) {
+      return m2l_->TargetReg(fpArgMappingToPhysicalReg[cur_fp_reg_++],
+                             arg.IsWide() ? kWide : kNotWide);
+    }
+  } else {
+    if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+      result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
+                               arg.IsRef() ? kRef : kNotWide);
+      if (arg.IsWide() && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+        result = RegStorage::MakeRegPair(
+            result, m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], kNotWide));
+      }
     }
   }
   return result;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index bc4cb5a..7dea09a 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -57,15 +57,15 @@
  * x86-64/x32 gs: holds it.
  *
  * For floating point we don't support CPUs without SSE2 support (ie newer than PIII):
- *  Native: x86  | x86-64 / x32 | ART x86                    | ART x86-64
- *  XMM0: caller | caller, arg1 | caller, float return value | caller, arg1, float return value
- *  XMM1: caller | caller, arg2 | caller, scratch            | caller, arg2, scratch
- *  XMM2: caller | caller, arg3 | caller, scratch            | caller, arg3, scratch
- *  XMM3: caller | caller, arg4 | caller, scratch            | caller, arg4, scratch
- *  XMM4: caller | caller, arg5 | caller, scratch            | caller, arg5, scratch
- *  XMM5: caller | caller, arg6 | caller, scratch            | caller, arg6, scratch
- *  XMM6: caller | caller, arg7 | caller, scratch            | caller, arg7, scratch
- *  XMM7: caller | caller, arg8 | caller, scratch            | caller, arg8, scratch
+ *  Native: x86  | x86-64 / x32 | ART x86                          | ART x86-64
+ *  XMM0: caller | caller, arg1 | caller, arg1, float return value | caller, arg1, float return value
+ *  XMM1: caller | caller, arg2 | caller, arg2, scratch            | caller, arg2, scratch
+ *  XMM2: caller | caller, arg3 | caller, arg3, scratch            | caller, arg3, scratch
+ *  XMM3: caller | caller, arg4 | caller, arg4, scratch            | caller, arg4, scratch
+ *  XMM4: caller | caller, arg5 | caller, scratch                  | caller, arg5, scratch
+ *  XMM5: caller | caller, arg6 | caller, scratch                  | caller, arg6, scratch
+ *  XMM6: caller | caller, arg7 | caller, scratch                  | caller, arg7, scratch
+ *  XMM7: caller | caller, arg8 | caller, scratch                  | caller, arg8, scratch
  *  ---  x86-64/x32 registers
  *  XMM8 .. 11: caller save available as scratch registers for ART.
  *  XMM12 .. 15: callee save available as promoted registers for ART.
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index a5686e1..fc72e88 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -77,12 +77,34 @@
 }
 
 bool X86ManagedRuntimeCallingConvention::IsCurrentParamOnStack() {
-  return true;  // Everything is passed by stack
+  // Assume all parameters are on the stack; args arriving in registers are spilled as entry_spills.
+  return true;
 }
 
 ManagedRegister X86ManagedRuntimeCallingConvention::CurrentParamRegister() {
-  LOG(FATAL) << "Should not reach here";
-  return ManagedRegister::NoRegister();
+  ManagedRegister res = ManagedRegister::NoRegister();
+  if (!IsCurrentParamAFloatOrDouble()) {
+    switch (gpr_arg_count_) {
+      case 0: res = X86ManagedRegister::FromCpuRegister(ECX); break;
+      case 1: res = X86ManagedRegister::FromCpuRegister(EDX); break;
+      case 2: res = X86ManagedRegister::FromCpuRegister(EBX); break;
+    }
+  } else if (itr_float_and_doubles_ < 4) {
+    // First four float parameters are passed via XMM0..XMM3
+    res = X86ManagedRegister::FromXmmRegister(
+                                 static_cast<XmmRegister>(XMM0 + itr_float_and_doubles_));
+  }
+  return res;
+}
+
+ManagedRegister X86ManagedRuntimeCallingConvention::CurrentParamHighLongRegister() {
+  ManagedRegister res = ManagedRegister::NoRegister();
+  DCHECK(IsCurrentParamALong());
+  switch (gpr_arg_count_) {
+    case 0: res = X86ManagedRegister::FromCpuRegister(EDX); break;
+    case 1: res = X86ManagedRegister::FromCpuRegister(EBX); break;
+  }
+  return res;
 }
 
 FrameOffset X86ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
@@ -95,15 +117,32 @@
   // We spill the argument registers on X86 to free them up for scratch use, we then assume
   // all arguments are on the stack.
   if (entry_spills_.size() == 0) {
-    size_t num_spills = NumArgs() + NumLongOrDoubleArgs();
-    if (num_spills > 0) {
-      entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(ECX));
-      if (num_spills > 1) {
-        entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EDX));
-        if (num_spills > 2) {
-          entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EBX));
+    ResetIterator(FrameOffset(0));
+    while (HasNext()) {
+      ManagedRegister in_reg = CurrentParamRegister();
+      if (!in_reg.IsNoRegister()) {
+        int32_t size = IsParamADouble(itr_args_) ? 8 : 4;
+        int32_t spill_offset = CurrentParamStackOffset().Uint32Value();
+        ManagedRegisterSpill spill(in_reg, size, spill_offset);
+        entry_spills_.push_back(spill);
+        if (IsCurrentParamALong() && !IsCurrentParamAReference()) {  // Long.
+          // Special case: we may need a second register for the high half.
+          in_reg = CurrentParamHighLongRegister();
+          if (!in_reg.IsNoRegister()) {
+            // We have to spill the second half of the long.
+            ManagedRegisterSpill spill2(in_reg, size, spill_offset + 4);
+            entry_spills_.push_back(spill2);
+            // Long was allocated in 2 registers.
+            gpr_arg_count_++;
+          }
+        }
+
+        // Keep track of the number of GPRs allocated.
+        if (!IsCurrentParamAFloatOrDouble()) {
+          gpr_arg_count_++;
         }
       }
+      Next();
     }
   }
   return entry_spills_;
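The spill list is now built by walking the shorty rather than by counting
arguments: each parameter that arrives in a register records its size and
stack offset so BuildFrame can write it back with movl, movss or movsd. A
hypothetical walk-through for a non-static method taking (float, int, long),
assuming slot-sized offsets just past the ArtMethod* slot:

    #include <cstdio>

    // Illustrative only: the spills EntrySpills() would record for
    // (Object* this, float, int, long) under this convention.
    int main() {
      struct Spill { const char* reg; int size; int offset; };
      const Spill spills[] = {
          {"ECX",  4, 4},   // receiver (reference): first GPR
          {"XMM0", 4, 8},   // 'F': first FP register, written back via movss
          {"EDX",  4, 12},  // 'I': second GPR
          {"EBX",  4, 16},  // 'J' low half: third GPR; no GPR is left for
                            // the high half, which stays on the stack
      };
      for (const Spill& s : spills) {
        std::printf("%-4s -> [frame + %2d] (%d bytes)\n", s.reg, s.offset, s.size);
      }
      return 0;
    }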
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index 025eb6d..b1b3598 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -28,7 +28,8 @@
  public:
   explicit X86ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized,
                                               const char* shorty)
-      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
+      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize),
+        gpr_arg_count_(0) {}
   ~X86ManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
@@ -40,7 +41,10 @@
   ManagedRegister CurrentParamRegister() OVERRIDE;
   FrameOffset CurrentParamStackOffset() OVERRIDE;
   const ManagedRegisterEntrySpills& EntrySpills() OVERRIDE;
+
  private:
+  int gpr_arg_count_;
+  ManagedRegister CurrentParamHighLongRegister();
   ManagedRegisterEntrySpills entry_spills_;
   DISALLOW_COPY_AND_ASSIGN(X86ManagedRuntimeCallingConvention);
 };
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index c0fdcaa..66f1d5e 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -36,8 +36,9 @@
 static constexpr Register kRuntimeParameterCoreRegisters[] = { EAX, ECX, EDX, EBX };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
     arraysize(kRuntimeParameterCoreRegisters);
-static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { };
-static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
+static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { XMM0, XMM1, XMM2, XMM3 };
+static constexpr size_t kRuntimeParameterFpuRegistersLength =
+    arraysize(kRuntimeParameterFpuRegisters);
 
 static constexpr int kC2ConditionMask = 0x400;
 
@@ -504,30 +505,49 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-    case Primitive::kPrimFloat:
     case Primitive::kPrimNot: {
       uint32_t index = gp_index_++;
+      stack_index_++;
       if (index < calling_convention.GetNumberOfRegisters()) {
         return Location::RegisterLocation(calling_convention.GetRegisterAt(index));
       } else {
-        return Location::StackSlot(calling_convention.GetStackOffsetOf(index));
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 1));
       }
     }
 
-    case Primitive::kPrimLong:
-    case Primitive::kPrimDouble: {
+    case Primitive::kPrimLong: {
       uint32_t index = gp_index_;
       gp_index_ += 2;
+      stack_index_ += 2;
       if (index + 1 < calling_convention.GetNumberOfRegisters()) {
         X86ManagedRegister pair = X86ManagedRegister::FromRegisterPair(
             calling_convention.GetRegisterPairAt(index));
         return Location::RegisterPairLocation(pair.AsRegisterPairLow(), pair.AsRegisterPairHigh());
       } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
-        // On X86, the register index and stack index of a quick parameter is the same, since
-        // we are passing floating pointer values in core registers.
-        return Location::QuickParameter(index, index);
+        // stack_index_ is the right offset for the memory.
+        return Location::QuickParameter(index, stack_index_ - 2);
       } else {
-        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index));
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 2));
+      }
+    }
+
+    case Primitive::kPrimFloat: {
+      uint32_t index = fp_index_++;
+      stack_index_++;
+      if (index < calling_convention.GetNumberOfFpuRegisters()) {
+        return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(index));
+      } else {
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 1));
+      }
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t index = fp_index_++;
+      stack_index_ += 2;
+      if (index < calling_convention.GetNumberOfFpuRegisters()) {
+        return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(index));
+      } else {
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 2));
       }
     }
 
@@ -1186,7 +1206,7 @@
 void LocationsBuilderX86::VisitInvokeInterface(HInvokeInterface* invoke) {
   HandleInvoke(invoke);
   // Add the hidden argument.
-  invoke->GetLocations()->AddTemp(Location::FpuRegisterLocation(XMM0));
+  invoke->GetLocations()->AddTemp(Location::FpuRegisterLocation(XMM7));
 }
 
 void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) {
@@ -1388,31 +1408,17 @@
           locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
           break;
 
-        case Primitive::kPrimFloat: {
-          // Processing a Dex `float-to-long' instruction.
-          InvokeRuntimeCallingConvention calling_convention;
-          // Note that on x86 floating-point parameters are passed
-          // through core registers (here, EAX).
-          locations->SetInAt(0, Location::RegisterLocation(
-              calling_convention.GetRegisterAt(0)));
-          // The runtime helper puts the result in EAX, EDX.
-          locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
-          break;
-        }
-
+        case Primitive::kPrimFloat:
         case Primitive::kPrimDouble: {
-          // Processing a Dex `double-to-long' instruction.
+          // Processing a Dex `float-to-long' or `double-to-long' instruction.
           InvokeRuntimeCallingConvention calling_convention;
-          // Note that on x86 floating-point parameters are passed
-          // through core registers (here, EAX and ECX).
-          locations->SetInAt(0, Location::RegisterPairLocation(
-              calling_convention.GetRegisterAt(0),
-              calling_convention.GetRegisterAt(1)));
+          XmmRegister parameter = calling_convention.GetFpuRegisterAt(0);
+          locations->SetInAt(0, Location::FpuRegisterLocation(parameter));
+
           // The runtime helper puts the result in EAX, EDX.
           locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
-          break;
         }
-          break;
+        break;
 
         default:
           LOG(FATAL) << "Unexpected type conversion from " << input_type
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 73b647c..55d71e3 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -36,8 +36,8 @@
 static constexpr Register kParameterCoreRegisters[] = { ECX, EDX, EBX };
 static constexpr RegisterPair kParameterCorePairRegisters[] = { ECX_EDX, EDX_EBX };
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
-static constexpr XmmRegister kParameterFpuRegisters[] = { };
-static constexpr size_t kParameterFpuRegistersLength = 0;
+static constexpr XmmRegister kParameterFpuRegisters[] = { XMM0, XMM1, XMM2, XMM3 };
+static constexpr size_t kParameterFpuRegistersLength = arraysize(kParameterFpuRegisters);
 
 class InvokeDexCallingConvention : public CallingConvention<Register, XmmRegister> {
  public:
@@ -58,13 +58,18 @@
 
 class InvokeDexCallingConventionVisitor {
  public:
-  InvokeDexCallingConventionVisitor() : gp_index_(0) {}
+  InvokeDexCallingConventionVisitor() : gp_index_(0), fp_index_(0), stack_index_(0) {}
 
   Location GetNextLocation(Primitive::Type type);
 
  private:
   InvokeDexCallingConvention calling_convention;
+  // The current index for cpu registers.
   uint32_t gp_index_;
+  // The current index for fpu registers.
+  uint32_t fp_index_;
+  // The current stack index.
+  uint32_t stack_index_;
 
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
 };
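The three counters differ on purpose: gp_index_ and fp_index_ pick registers
independently, while stack_index_ advances for every argument (by two for
wide ones) so spilled arguments land in the correct outgoing slot. A minimal
simulation of the bookkeeping (not ART code; it omits the spill and
split-pair cases):

    #include <cstdio>
    #include <initializer_list>

    int main() {
      const char* gprs[] = {"ECX", "EDX", "EBX"};
      const char* fprs[] = {"XMM0", "XMM1", "XMM2", "XMM3"};
      unsigned gp = 0, fp = 0, stack = 0;
      for (char c : {'I', 'F', 'J', 'D'}) {
        if (c == 'F') {
          std::printf("%c -> %s (stack_index %u->%u)\n", c, fprs[fp++], stack, stack + 1);
          stack += 1;
        } else if (c == 'D') {
          std::printf("%c -> %s (stack_index %u->%u)\n", c, fprs[fp++], stack, stack + 2);
          stack += 2;
        } else if (c == 'J') {
          std::printf("%c -> %s:%s (stack_index %u->%u)\n", c, gprs[gp], gprs[gp + 1],
                      stack, stack + 2);
          gp += 2;
          stack += 2;
        } else {
          std::printf("%c -> %s (stack_index %u->%u)\n", c, gprs[gp++], stack, stack + 1);
          stack += 1;
        }
      }
      // Prints: I -> ECX, F -> XMM0, J -> EDX:EBX, D -> XMM1, with
      // stack_index_ ending at 6, the offset a fifth argument would use.
      return 0;
    }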
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 3f266fe..1f0dba5 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1537,8 +1537,12 @@
 
   uint32_t reg_offset = 1;
   CHECK_ALIGNED(frame_size, kStackAlignment);
+  int gpr_count = 0;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
-    pushl(spill_regs.at(i).AsX86().AsCpuRegister());
+    x86::X86ManagedRegister spill = spill_regs.at(i).AsX86();
+    DCHECK(spill.IsCpuRegister());
+    pushl(spill.AsCpuRegister());
+    gpr_count++;
 
     // DW_CFA_advance_loc
     DW_CFA_advance_loc(&cfi_info_, buffer_.Size() - cfi_pc_);
@@ -1552,7 +1556,7 @@
   }
 
   // return address then method on stack
-  int32_t adjust = frame_size - (spill_regs.size() * kFramePointerSize) -
+  int32_t adjust = frame_size - (gpr_count * kFramePointerSize) -
                    sizeof(StackReference<mirror::ArtMethod>) /*method*/ -
                    kFramePointerSize /*return address*/;
   addl(ESP, Immediate(-adjust));
@@ -1572,9 +1576,18 @@
   DW_CFA_def_cfa_offset(&cfi_info_, cfi_cfa_offset_);
 
   for (size_t i = 0; i < entry_spills.size(); ++i) {
-    movl(Address(ESP, frame_size + sizeof(StackReference<mirror::ArtMethod>) +
-                 (i * kFramePointerSize)),
-         entry_spills.at(i).AsX86().AsCpuRegister());
+    ManagedRegisterSpill spill = entry_spills.at(i);
+    if (spill.AsX86().IsCpuRegister()) {
+      movl(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsCpuRegister());
+    } else {
+      DCHECK(spill.AsX86().IsXmmRegister());
+      if (spill.getSize() == 8) {
+        movsd(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsXmmRegister());
+      } else {
+        CHECK_EQ(spill.getSize(), 4);
+        movss(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsXmmRegister());
+      }
+    }
   }
 }
 
@@ -1584,7 +1597,9 @@
   addl(ESP, Immediate(frame_size - (spill_regs.size() * kFramePointerSize) -
                       sizeof(StackReference<mirror::ArtMethod>)));
   for (size_t i = 0; i < spill_regs.size(); ++i) {
-    popl(spill_regs.at(i).AsX86().AsCpuRegister());
+    x86::X86ManagedRegister spill = spill_regs.at(i).AsX86();
+    DCHECK(spill.IsCpuRegister());
+    popl(spill.AsCpuRegister());
   }
   ret();
 }
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 4b67c83..6acc2a7 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -302,7 +302,7 @@
 #if defined(__i386__)
     // TODO: Set the thread?
     __asm__ __volatile__(
-        "movd %[hidden], %%xmm0\n\t"
+        "movd %[hidden], %%xmm7\n\t"
         "subl $12, %%esp\n\t"       // Align stack.
         "pushl %[referrer]\n\t"     // Store referrer
         "call *%%edi\n\t"           // Call the stub
diff --git a/runtime/arch/x86/asm_support_x86.S b/runtime/arch/x86/asm_support_x86.S
index fea16da..122428b 100644
--- a/runtime/arch/x86/asm_support_x86.S
+++ b/runtime/arch/x86/asm_support_x86.S
@@ -25,6 +25,8 @@
     #define MACRO1(macro_name, macro_arg1) .macro macro_name
     #define MACRO2(macro_name, macro_arg1, macro_args2) .macro macro_name
     #define MACRO3(macro_name, macro_arg1, macro_args2, macro_args3) .macro macro_name
+    #define MACRO4(macro_name, macro_arg1, macro_arg2, macro_arg3, macro_arg4) .macro macro_name
+    #define MACRO5(macro_name, macro_arg1, macro_arg2, macro_arg3, macro_arg4, macro_arg5) .macro macro_name
     #define END_MACRO .endmacro
 
     // Clang's as(1) uses $0, $1, and so on for macro arguments.
@@ -43,6 +45,8 @@
     #define MACRO1(macro_name, macro_arg1) .macro macro_name macro_arg1
     #define MACRO2(macro_name, macro_arg1, macro_arg2) .macro macro_name macro_arg1, macro_arg2
     #define MACRO3(macro_name, macro_arg1, macro_arg2, macro_arg3) .macro macro_name macro_arg1, macro_arg2, macro_arg3
+    #define MACRO4(macro_name, macro_arg1, macro_arg2, macro_arg3, macro_arg4) .macro macro_name macro_arg1, macro_arg2, macro_arg3, macro_arg4
+    #define MACRO5(macro_name, macro_arg1, macro_arg2, macro_arg3, macro_arg4, macro_arg5) .macro macro_name macro_arg1, macro_arg2, macro_arg3, macro_arg4, macro_arg5
     #define END_MACRO .endm
 
     // Regular gas(1) uses \argument_name for macro arguments.
diff --git a/runtime/arch/x86/asm_support_x86.h b/runtime/arch/x86/asm_support_x86.h
index 5a88f80..b0a6017 100644
--- a/runtime/arch/x86/asm_support_x86.h
+++ b/runtime/arch/x86/asm_support_x86.h
@@ -21,6 +21,8 @@
 
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 32
 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 32
-#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 32
+
+// 32 bytes for GPRs and 32 bytes for FPRs.
+#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE (32 + 32)
 
 #endif  // ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_H_
diff --git a/runtime/arch/x86/context_x86.cc b/runtime/arch/x86/context_x86.cc
index 2a6ff14..4ea4684 100644
--- a/runtime/arch/x86/context_x86.cc
+++ b/runtime/arch/x86/context_x86.cc
@@ -30,6 +30,9 @@
   for (size_t  i = 0; i < kNumberOfCpuRegisters; i++) {
     gprs_[i] = nullptr;
   }
+  for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+    fprs_[i] = nullptr;
+  }
   gprs_[ESP] = &esp_;
   // Initialize registers with easy to spot debug values.
   esp_ = X86Context::kBadGprBase + ESP;
@@ -40,7 +43,7 @@
   mirror::ArtMethod* method = fr.GetMethod();
   const QuickMethodFrameInfo frame_info = method->GetQuickFrameInfo();
   size_t spill_count = POPCOUNT(frame_info.CoreSpillMask());
-  DCHECK_EQ(frame_info.FpSpillMask(), 0u);
+  size_t fp_spill_count = POPCOUNT(frame_info.FpSpillMask());
   if (spill_count > 0) {
     // Lowest number spill is farthest away, walk registers and fill into context.
     int j = 2;  // Offset j to skip return address spill.
@@ -51,6 +54,24 @@
       }
     }
   }
+  if (fp_spill_count > 0) {
+    // Lowest number spill is farthest away, walk registers and fill into context.
+    size_t j = 2;  // Offset j to skip return address spill.
+    size_t fp_spill_size_in_words = fp_spill_count * 2;
+    for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+      if (((frame_info.FpSpillMask() >> i) & 1) != 0) {
+        // There are 2 pieces to each XMM register, to match VR size.
+        fprs_[2*i] = reinterpret_cast<uint32_t*>(
+            fr.CalleeSaveAddress(spill_count + fp_spill_size_in_words - j,
+                                 frame_info.FrameSizeInBytes()));
+        fprs_[2*i+1] = reinterpret_cast<uint32_t*>(
+            fr.CalleeSaveAddress(spill_count + fp_spill_size_in_words - j - 1,
+                                 frame_info.FrameSizeInBytes()));
+        // Two void* per XMM register.
+        j += 2;
+      }
+    }
+  }
 }
 
 void X86Context::SmashCallerSaves() {
@@ -59,6 +80,7 @@
   gprs_[EDX] = const_cast<uintptr_t*>(&gZero);
   gprs_[ECX] = nullptr;
   gprs_[EBX] = nullptr;
+  memset(&fprs_[0], '\0', sizeof(fprs_));
 }
 
 void X86Context::SetGPR(uint32_t reg, uintptr_t value) {
@@ -68,14 +90,11 @@
   *gprs_[reg] = value;
 }
 
-uintptr_t X86Context::GetFPR(uint32_t reg ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "Floating-point registers are all caller save in X86";
-  UNREACHABLE();
-}
-
-void X86Context::SetFPR(uint32_t reg ATTRIBUTE_UNUSED, uintptr_t value ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "Floating-point registers are all caller save in X86";
-  UNREACHABLE();
+void X86Context::SetFPR(uint32_t reg, uintptr_t value) {
+  CHECK_LT(reg, static_cast<uint32_t>(kNumberOfFloatRegisters));
+  DCHECK(IsAccessibleFPR(reg));
+  CHECK_NE(fprs_[reg], reinterpret_cast<const uint32_t*>(&gZero));
+  *fprs_[reg] = value;
 }
 
 void X86Context::DoLongJump() {
@@ -86,17 +105,30 @@
   for (size_t i = 0; i < kNumberOfCpuRegisters; ++i) {
     gprs[kNumberOfCpuRegisters - i - 1] = gprs_[i] != nullptr ? *gprs_[i] : X86Context::kBadGprBase + i;
   }
+  uint32_t fprs[kNumberOfFloatRegisters];
+  for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+    fprs[i] = fprs_[i] != nullptr ? *fprs_[i] : X86Context::kBadFprBase + i;
+  }
   // We want to load the stack pointer one slot below so that the ret will pop eip.
   uintptr_t esp = gprs[kNumberOfCpuRegisters - ESP - 1] - sizeof(intptr_t);
   gprs[kNumberOfCpuRegisters] = esp;
   *(reinterpret_cast<uintptr_t*>(esp)) = eip_;
   __asm__ __volatile__(
+      "movl %1, %%ebx\n\t"          // Address base of FPRs.
+      "movsd 0(%%ebx), %%xmm0\n\t"  // Load up XMM0-XMM7.
+      "movsd 8(%%ebx), %%xmm1\n\t"
+      "movsd 16(%%ebx), %%xmm2\n\t"
+      "movsd 24(%%ebx), %%xmm3\n\t"
+      "movsd 32(%%ebx), %%xmm4\n\t"
+      "movsd 40(%%ebx), %%xmm5\n\t"
+      "movsd 48(%%ebx), %%xmm6\n\t"
+      "movsd 56(%%ebx), %%xmm7\n\t"
       "movl %0, %%esp\n\t"  // ESP points to gprs.
       "popal\n\t"           // Load all registers except ESP and EIP with values in gprs.
       "popl %%esp\n\t"      // Load stack pointer.
       "ret\n\t"             // From higher in the stack pop eip.
       :  // output.
-      : "g"(&gprs[0])  // input.
+      : "g"(&gprs[0]), "g"(&fprs[0]) // input.
       :);  // clobber.
 #else
   UNIMPLEMENTED(FATAL);
diff --git a/runtime/arch/x86/context_x86.h b/runtime/arch/x86/context_x86.h
index 8b7804d..c66a9dc 100644
--- a/runtime/arch/x86/context_x86.h
+++ b/runtime/arch/x86/context_x86.h
@@ -61,11 +61,16 @@
 
   void SetGPR(uint32_t reg, uintptr_t value) OVERRIDE;
 
-  bool IsAccessibleFPR(uint32_t reg ATTRIBUTE_UNUSED) OVERRIDE {
-    return false;
+  bool IsAccessibleFPR(uint32_t reg) OVERRIDE {
+    DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfFloatRegisters));
+    return fprs_[reg] != nullptr;
   }
 
-  uintptr_t GetFPR(uint32_t reg) OVERRIDE;
+  uintptr_t GetFPR(uint32_t reg) OVERRIDE {
+    DCHECK_LT(reg, static_cast<uint32_t>(kNumberOfFloatRegisters));
+    DCHECK(IsAccessibleFPR(reg));
+    return *fprs_[reg];
+  }
 
   void SetFPR(uint32_t reg, uintptr_t value) OVERRIDE;
 
@@ -73,9 +78,22 @@
   void DoLongJump() OVERRIDE;
 
  private:
-  // Pointers to register locations, floating point registers are all caller save. Values are
-  // initialized to NULL or the special registers below.
+  // Pretend XMM registers are made of uint32_t pieces, because they are manipulated
+  // in uint32_t chunks.
+  enum {
+    XMM0_0 = 0, XMM0_1,
+    XMM1_0, XMM1_1,
+    XMM2_0, XMM2_1,
+    XMM3_0, XMM3_1,
+    XMM4_0, XMM4_1,
+    XMM5_0, XMM5_1,
+    XMM6_0, XMM6_1,
+    XMM7_0, XMM7_1,
+    kNumberOfFloatRegisters};
+
+  // Pointers to register locations. Values are initialized to NULL or the special registers below.
   uintptr_t* gprs_[kNumberOfCpuRegisters];
+  uint32_t* fprs_[kNumberOfFloatRegisters];
   // Hold values for esp and eip if they are not located within a stack frame. EIP is somewhat
   // special in that it cannot be encoded normally as a register operand to an instruction (except
   // in 64bit addressing modes).
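Since the context reads and writes registers through 32-bit pointers, each
64-bit XMM save slot is modeled as two uint32_t pieces, low half first. An
illustrative helper (not ART code) showing how the two pieces relate to the
double they encode:

    #include <cstdint>
    #include <cstring>

    // Reassemble a double from the two 32-bit pieces stored for one XMM
    // register (x86 is little-endian, so the low piece holds the low bits).
    double ReadXmmAsDouble(uint32_t lo, uint32_t hi) {
      uint64_t bits = (static_cast<uint64_t>(hi) << 32) | lo;
      double value;
      std::memcpy(&value, &bits, sizeof(value));  // bit-exact reinterpretation
      return value;
    }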
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 7153403..fd3a1cf 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -90,6 +90,15 @@
     PUSH ebx  // Save args
     PUSH edx
     PUSH ecx
+    // Create space for FPR args.
+    subl MACRO_LITERAL(4 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(4 * 8)
+    // Save FPRs.
+    movsd %xmm0, 0(%esp)
+    movsd %xmm1, 8(%esp)
+    movsd %xmm2, 16(%esp)
+    movsd %xmm3, 24(%esp)
+
     SETUP_GOT_NOSAVE VAR(got_reg, 0)
     // Load Runtime::instance_ from GOT.
     movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg, 0)), REG_VAR(temp_reg, 1)
@@ -102,7 +111,7 @@
 
     // Ugly compile-time check, but we only have the preprocessor.
     // Last +4: implicit return address pushed on stack when caller made call.
-#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 7*4 + 4)
+#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 7*4 + 4*8 + 4)
 #error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(X86) size not as expected."
 #endif
 END_MACRO
@@ -112,20 +121,39 @@
      * Runtime::CreateCalleeSaveMethod(kRefsAndArgs) where the method is passed in EAX.
      */
 MACRO0(SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_WITH_METHOD_IN_EAX)
+    // Save callee and GPR args, mixed together to agree with core spills bitmap.
     PUSH edi  // Save callee saves
     PUSH esi
     PUSH ebp
     PUSH ebx  // Save args
     PUSH edx
     PUSH ecx
+
+    // Create space for FPR args.
+    subl MACRO_LITERAL(32), %esp
+    CFI_ADJUST_CFA_OFFSET(32)
+
+    // Save FPRs.
+    movsd %xmm0, 0(%esp)
+    movsd %xmm1, 8(%esp)
+    movsd %xmm2, 16(%esp)
+    movsd %xmm3, 24(%esp)
+
     PUSH eax  // Store the ArtMethod reference at the bottom of the stack.
     // Store esp as the stop quick frame.
     movl %esp, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
 END_MACRO
 
 MACRO0(RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME)
-    addl MACRO_LITERAL(4), %esp  // Remove padding
-    CFI_ADJUST_CFA_OFFSET(-4)
+    // Restore FPRs. EAX is still on the stack.
+    movsd 4(%esp), %xmm0
+    movsd 12(%esp), %xmm1
+    movsd 20(%esp), %xmm2
+    movsd 28(%esp), %xmm3
+
+    addl MACRO_LITERAL(36), %esp  // Remove FPRs and EAX.
+    CFI_ADJUST_CFA_OFFSET(-36)
+
     POP ecx  // Restore args except eax
     POP edx
     POP ebx
@@ -134,6 +162,30 @@
     POP edi
 END_MACRO
 
+// Restore register and jump to routine
+// Inputs:  EDI contains pointer to code.
+// Notes: Need to pop EAX too (restores Method*)
+MACRO0(RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME_AND_JUMP)
+    POP eax  // Restore Method*
+
+    // Restore FPRs.
+    movsd 0(%esp), %xmm0
+    movsd 8(%esp), %xmm1
+    movsd 16(%esp), %xmm2
+    movsd 24(%esp), %xmm3
+
+    addl MACRO_LITERAL(32), %esp  // Remove FPRs.
+    CFI_ADJUST_CFA_OFFSET(-32)
+
+    POP ecx  // Restore args except eax
+    POP edx
+    POP ebx
+    POP ebp  // Restore callee saves
+    POP esi
+    xchgl 0(%esp),%edi // restore EDI and place code pointer as only value on stack
+    ret
+END_MACRO
+
     /*
      * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
      * exception is Thread::Current()->exception_.
@@ -243,13 +295,14 @@
     DEFINE_FUNCTION RAW_VAR(c_name, 0)
     SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME ebx, ebx
     movl %esp, %edx  // remember SP
+
     // Outgoing argument set up
     subl MACRO_LITERAL(12), %esp  // alignment padding
     CFI_ADJUST_CFA_OFFSET(12)
     PUSH edx                      // pass SP
     pushl %fs:THREAD_SELF_OFFSET  // pass Thread::Current()
     CFI_ADJUST_CFA_OFFSET(4)
-    pushl 32(%edx)                // pass caller Method*
+    pushl 32+32(%edx)             // pass caller Method*
     CFI_ADJUST_CFA_OFFSET(4)
     PUSH ecx                      // pass arg2
     PUSH eax                      // pass arg1
@@ -257,6 +310,17 @@
     movl %edx, %edi               // save code pointer in EDI
     addl MACRO_LITERAL(36), %esp  // Pop arguments skip eax
     CFI_ADJUST_CFA_OFFSET(-36)
+
+    // Restore FPRs.
+    movsd 0(%esp), %xmm0
+    movsd 8(%esp), %xmm1
+    movsd 16(%esp), %xmm2
+    movsd 24(%esp), %xmm3
+
+    // Remove space for FPR args.
+    addl MACRO_LITERAL(4 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(-4 * 8)
+
     POP ecx  // Restore args except eax
     POP edx
     POP ebx
@@ -284,7 +348,63 @@
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
     /*
-     * Quick invocation stub.
+     * Helper for quick invocation stub to set up XMM registers.
+     * Increments shorty and arg_array and clobbers temp_char.
+     * Branches to finished if it encounters the end of the shorty.
+     */
+MACRO5(LOOP_OVER_SHORTY_LOADING_XMMS, xmm_reg, shorty, arg_array, temp_char, finished)
+1: // LOOP
+    movb (REG_VAR(shorty, 1)), REG_VAR(temp_char, 3)  // temp_char := *shorty
+    addl MACRO_LITERAL(1), REG_VAR(shorty, 1)         // shorty++
+    cmpb MACRO_LITERAL(0), REG_VAR(temp_char, 3)      // if (temp_char == '\0')
+    je RAW_VAR(finished, 4)                           //   goto finished
+    cmpb MACRO_LITERAL(68), REG_VAR(temp_char, 3)     // if (temp_char == 'D')
+    je 2f                                             //   goto FOUND_DOUBLE
+    cmpb MACRO_LITERAL(70), REG_VAR(temp_char, 3)     // if (temp_char == 'F')
+    je 3f                                             //   goto FOUND_FLOAT
+    addl MACRO_LITERAL(4), REG_VAR(arg_array, 2)      // arg_array++
+    //  Handle extra space in arg array taken by a long.
+    cmpb MACRO_LITERAL(74), REG_VAR(temp_char, 3)     // if (temp_char != 'J')
+    jne 1b                                            //   goto LOOP
+    addl MACRO_LITERAL(4), REG_VAR(arg_array, 2)      // arg_array++
+    jmp 1b                                            // goto LOOP
+2:  // FOUND_DOUBLE
+    movsd (REG_VAR(arg_array, 2)), REG_VAR(xmm_reg, 0)
+    addl MACRO_LITERAL(8), REG_VAR(arg_array, 2)      // arg_array+=2
+    jmp 4f
+3:  // FOUND_FLOAT
+    movss (REG_VAR(arg_array, 2)), REG_VAR(xmm_reg, 0)
+    addl MACRO_LITERAL(4), REG_VAR(arg_array, 2)      // arg_array++
+4:
+END_MACRO
+
+    /*
+     * Helper for quick invocation stub to set up GPR registers. Increments
+     * shorty and arg_array, and returns the current shorty character in
+     * temp_char. Branches to finished if it encounters the end of the shorty.
+     */
+MACRO4(SKIP_OVER_FLOATS, shorty, arg_array, temp_char, finished)
+1: // LOOP:
+    movb (REG_VAR(shorty, 0)), REG_VAR(temp_char, 2)  // temp_char := *shorty
+    addl MACRO_LITERAL(1), REG_VAR(shorty, 0)         // shorty++
+    cmpb MACRO_LITERAL(0), REG_VAR(temp_char, 2)      // if (temp_char == '\0')
+    je RAW_VAR(finished, 3)                           //   goto finished
+    cmpb MACRO_LITERAL(70), REG_VAR(temp_char, 2)     // if (temp_char == 'F')
+    je 3f                                             //   goto SKIP_FLOAT
+    cmpb MACRO_LITERAL(68), REG_VAR(temp_char, 2)     // if (temp_char == 'D')
+    je 4f                                             //   goto SKIP_DOUBLE
+    jmp 5f                                            // goto end
+3:  // SKIP_FLOAT
+    addl MACRO_LITERAL(4), REG_VAR(arg_array, 1)      // arg_array++
+    jmp 1b                                            // goto LOOP
+4:  // SKIP_DOUBLE
+    addl MACRO_LITERAL(8), REG_VAR(arg_array, 1)      // arg_array+=2
+    jmp 1b                                            // goto LOOP
+5:
+END_MACRO
+
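The two shorty-scanning macros above are easier to follow in C++; an
illustrative rendering of the same control flow (not ART code; movss only
writes the low XMM lane, which a plain double cannot model exactly):

    #include <cstdint>
    #include <cstring>

    // LOOP_OVER_SHORTY_LOADING_XMMS: scan until the next 'F'/'D', load it
    // into one XMM register, and advance past any core args on the way.
    bool LoadOneXmm(const char*& shorty, const uint32_t*& arg_array,
                    double& xmm_reg) {
      while (char c = *shorty++) {
        if (c == 'D') {                    // FOUND_DOUBLE
          std::memcpy(&xmm_reg, arg_array, 8);
          arg_array += 2;
          return true;
        }
        if (c == 'F') {                    // FOUND_FLOAT
          float f;
          std::memcpy(&f, arg_array, 4);
          xmm_reg = f;                     // stand-in for movss
          arg_array += 1;
          return true;
        }
        arg_array += (c == 'J') ? 2 : 1;   // a long takes two words
      }
      return false;                        // end of shorty: 'finished'
    }

    // SKIP_OVER_FLOATS: skip FP args and return the next core shorty
    // character, or '\0' once the shorty is exhausted. The arg_array is
    // left pointing at that core argument for the caller to load.
    char NextCoreChar(const char*& shorty, const uint32_t*& arg_array) {
      while (char c = *shorty++) {
        if (c == 'F')      { arg_array += 1; }
        else if (c == 'D') { arg_array += 2; }
        else               { return c; }
      }
      return '\0';
    }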
+    /*
+     * Quick invocation stub (non-static).
      * On entry:
      *   [sp] = return address
      *   [sp + 4] = method pointer
@@ -295,10 +415,23 @@
      *   [sp + 24] = shorty
      */
 DEFINE_FUNCTION art_quick_invoke_stub
+    // Save the non-volatiles.
     PUSH ebp                      // save ebp
     PUSH ebx                      // save ebx
     PUSH esi                      // save esi
     PUSH edi                      // save edi
+    // Set up argument XMM registers.
+    mov 24+16(%esp), %esi         // ESI := shorty + 1  ; ie skip return arg character.
+    addl LITERAL(1), %esi
+    mov 8+16(%esp), %edi          // EDI := arg_array + 4 ; ie skip this pointer.
+    addl LITERAL(4), %edi
+    // Clobbers ESI, EDI, EAX.
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm0, esi, edi, al, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm1, esi, edi, al, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm2, esi, edi, al, .Lxmm_setup_finished
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm3, esi, edi, al, .Lxmm_setup_finished
+    .balign 16
+.Lxmm_setup_finished:
     mov %esp, %ebp                // copy value of stack pointer into base pointer
     CFI_DEF_CFA_REGISTER(ebp)
     mov 28(%ebp), %ebx            // get arg array size
@@ -308,18 +441,41 @@
     andl LITERAL(0xFFFFFFF0), %ebx
     subl LITERAL(20), %ebx        // remove space for return address, ebx, ebp, esi and edi
     subl %ebx, %esp               // reserve stack space for argument array
-    SETUP_GOT_NOSAVE ebx          // clobbers ebx (harmless here)
-    lea  4(%esp), %eax            // use stack pointer + method ptr as dest for memcpy
-    pushl 28(%ebp)                // push size of region to memcpy
-    pushl 24(%ebp)                // push arg array as source of memcpy
-    pushl %eax                    // push stack pointer as destination of memcpy
-    call PLT_SYMBOL(memcpy)       // (void*, const void*, size_t)
-    addl LITERAL(12), %esp        // pop arguments to memcpy
+
     movl LITERAL(0), (%esp)       // store NULL for method*
+
+    // Copy arg array into stack.
+    movl 28(%ebp), %ecx           // ECX = size of args
+    movl 24(%ebp), %esi           // ESI = argument array
+    leal 4(%esp), %edi            // EDI = just after Method* in stack arguments
+    rep movsb                     // while (ecx--) { *edi++ = *esi++ }
+
+    mov 40(%ebp), %esi            // ESI := shorty + 1  ; ie skip return arg character.
+    addl LITERAL(1), %esi
+    mov 24(%ebp), %edi            // EDI := arg_array
+    mov 0(%edi), %ecx             // ECX := this pointer
+    addl LITERAL(4), %edi         // EDI := arg_array + 4 ; ie skip this pointer.
+
+    // Enumerate the possible cases for loading GPRS.
+    // edx (and maybe ebx):
+    SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished
+    cmpb LITERAL(74), %al         // if (al == 'J') goto FOUND_LONG
+    je .LfirstLong
+    // Must be an integer value.
+    movl (%edi), %edx
+    addl LITERAL(4), %edi         // arg_array++
+
+    // Now check ebx
+    SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished
+    // Must be first word of a long, or an integer.
+    movl (%edi), %ebx
+    jmp .Lgpr_setup_finished
+.LfirstLong:
+    movl (%edi), %edx
+    movl 4(%edi), %ebx
+    // Nothing left to load.
+.Lgpr_setup_finished:
     mov 20(%ebp), %eax            // move method pointer into eax
-    mov 4(%esp), %ecx             // copy arg1 into ecx
-    mov 8(%esp), %edx             // copy arg2 into edx
-    mov 12(%esp), %ebx            // copy arg3 into ebx
     call *MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32(%eax) // call the method
     mov %ebp, %esp                // restore stack pointer
     CFI_DEF_CFA_REGISTER(esp)
@@ -344,6 +500,121 @@
     ret
 END_FUNCTION art_quick_invoke_stub
 
+    /*
+     * Quick invocation stub (static).
+     * On entry:
+     *   [sp] = return address
+     *   [sp + 4] = method pointer
+     *   [sp + 8] = argument array or NULL for no argument methods
+     *   [sp + 12] = size of argument array in bytes
+     *   [sp + 16] = (managed) thread pointer
+     *   [sp + 20] = JValue* result
+     *   [sp + 24] = shorty
+     */
+DEFINE_FUNCTION art_quick_invoke_static_stub
+    // Save the non-volatiles.
+    PUSH ebp                      // save ebp
+    PUSH ebx                      // save ebx
+    PUSH esi                      // save esi
+    PUSH edi                      // save edi
+    // Set up argument XMM registers.
+    mov 24+16(%esp), %esi         // ESI := shorty + 1  ; ie skip return arg character.
+    addl LITERAL(1), %esi
+    mov 8+16(%esp), %edi          // EDI := arg_array
+    // Clobbers ESI, EDI, EAX.
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm0, esi, edi, al, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm1, esi, edi, al, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm2, esi, edi, al, .Lxmm_setup_finished2
+    LOOP_OVER_SHORTY_LOADING_XMMS xmm3, esi, edi, al, .Lxmm_setup_finished2
+    .balign 16
+.Lxmm_setup_finished2:
+    mov %esp, %ebp                // copy value of stack pointer into base pointer
+    CFI_DEF_CFA_REGISTER(ebp)
+    mov 28(%ebp), %ebx            // get arg array size
+    // reserve space for return addr, method*, ebx, ebp, esi, and edi in frame
+    addl LITERAL(36), %ebx
+    // align frame size to 16 bytes
+    andl LITERAL(0xFFFFFFF0), %ebx
+    subl LITERAL(20), %ebx        // remove space for return address, ebx, ebp, esi and edi
+    subl %ebx, %esp               // reserve stack space for argument array
+
+    movl LITERAL(0), (%esp)       // store NULL for method*
+
+    // Copy arg array into stack.
+    movl 28(%ebp), %ecx           // ECX = size of args
+    movl 24(%ebp), %esi           // ESI = argument array
+    leal 4(%esp), %edi            // EDI = just after Method* in stack arguments
+    rep movsb                     // while (ecx--) { *edi++ = *esi++ }
+
+    mov 40(%ebp), %esi            // ESI := shorty + 1  ; ie skip return arg character.
+    addl LITERAL(1), %esi
+    mov 24(%ebp), %edi            // EDI := arg_array
+
+    // Enumerate the possible cases for loading GPRS.
+    // ecx (and maybe edx)
+    SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished2
+    cmpb LITERAL(74), %al         // if (al == 'J') goto FOUND_LONG
+    je .LfirstLong2
+    // Must be an integer value.  Load into ECX.
+    movl (%edi), %ecx
+    addl LITERAL(4), %edi         // arg_array++
+
+    // Now check edx (and maybe ebx).
+    SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished2
+    cmpb LITERAL(74), %al         // if (al == 'J') goto FOUND_LONG
+    je .LSecondLong2
+    // Must be an integer.  Load into EDX.
+    movl (%edi), %edx
+    addl LITERAL(4), %edi         // arg_array++
+
+    // Is there anything for ebx?
+    SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished2
+    // First word of long or integer.  Load into EBX.
+    movl (%edi), %ebx
+    jmp .Lgpr_setup_finished2
+.LSecondLong2:
+    // EDX:EBX is long.  That is all.
+    movl (%edi), %edx
+    movl 4(%edi), %ebx
+    jmp .Lgpr_setup_finished2
+.LfirstLong2:
+    // ECX:EDX is a long
+    movl (%edi), %ecx
+    movl 4(%edi), %edx
+    addl LITERAL(8), %edi         // arg_array += 2
+
+    // Anything for EBX?
+    SKIP_OVER_FLOATS esi, edi, al, .Lgpr_setup_finished2
+    // First word of long or integer.  Load into EBX.
+    movl (%edi), %ebx
+    jmp .Lgpr_setup_finished2
+    // Nothing left to load.
+.Lgpr_setup_finished2:
+    mov 20(%ebp), %eax            // move method pointer into eax
+    call *MIRROR_ART_METHOD_QUICK_CODE_OFFSET_32(%eax) // call the method
+    mov %ebp, %esp                // restore stack pointer
+    CFI_DEF_CFA_REGISTER(esp)
+    POP edi                       // pop edi
+    POP esi                       // pop esi
+    POP ebx                       // pop ebx
+    POP ebp                       // pop ebp
+    mov 20(%esp), %ecx            // get result pointer
+    mov %eax, (%ecx)              // store the result assuming it's a long, int or Object*
+    mov %edx, 4(%ecx)             // store the other half of the result
+    mov 24(%esp), %edx            // get the shorty
+    cmpb LITERAL(68), (%edx)      // test if result type char == 'D'
+    je .Lreturn_double_quick2
+    cmpb LITERAL(70), (%edx)      // test if result type char == 'F'
+    je .Lreturn_float_quick2
+    ret
+.Lreturn_double_quick2:
+    movsd %xmm0, (%ecx)           // store the floating point result
+    ret
+.Lreturn_float_quick2:
+    movss %xmm0, (%ecx)           // store the floating point result
+    ret
+END_FUNCTION art_quick_invoke_static_stub
+
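Compared to the non-static stub, the static stub has ECX free (there is no
receiver), so after FP args are routed to XMM0..XMM3 the leading core
arguments are enumerated as: first in ECX (or ECX:EDX for a long), second in
EDX (or EDX:EBX), third in EBX (first word only). A hypothetical model of
that assignment:

    #include <string>
    #include <vector>

    // Illustrative model (not ART code) of the GPR cases enumerated above.
    // All arguments are also copied to the stack first, so anything that
    // does not fit in a register is simply read from its stack slot.
    std::vector<std::string> StaticStubGprs(const std::string& core_args) {
      static const char* regs[] = {"ECX", "EDX", "EBX"};
      std::vector<std::string> out;
      size_t r = 0;
      for (char c : core_args) {
        if (r >= 3) break;                  // out of GPRs
        if (c == 'J' && r + 1 < 3) {
          out.push_back(std::string(regs[r]) + ":" + regs[r + 1]);
          r += 2;
        } else {
          out.push_back(regs[r++]);         // int, or first word of a long
        }
      }
      return out;  // e.g. "JI" -> {"ECX:EDX", "EBX"}
    }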
 MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION RAW_VAR(c_name, 0)
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME ebx, ebx  // save ref containing registers for GC
@@ -842,20 +1113,20 @@
 NO_ARG_DOWNCALL art_quick_test_suspend, artTestSuspendFromCode, ret
 
 DEFINE_FUNCTION art_quick_d2l
-    PUSH eax                      // alignment padding
-    PUSH ecx                      // pass arg2 a.hi
-    PUSH eax                      // pass arg1 a.lo
-    call SYMBOL(art_d2l)      // (jdouble a)
+    subl LITERAL(12), %esp        // alignment padding, room for argument
+    CFI_ADJUST_CFA_OFFSET(12)
+    movsd %xmm0, 0(%esp)          // arg a
+    call SYMBOL(art_d2l)          // (jdouble a)
     addl LITERAL(12), %esp        // pop arguments
     CFI_ADJUST_CFA_OFFSET(-12)
     ret
 END_FUNCTION art_quick_d2l
 
 DEFINE_FUNCTION art_quick_f2l
-    subl LITERAL(8), %esp         // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
-    PUSH eax                      // pass arg1 a
-    call SYMBOL(art_f2l)      // (jfloat a)
+    subl LITERAL(12), %esp        // alignment padding
+    CFI_ADJUST_CFA_OFFSET(12)
+    movss %xmm0, 0(%esp)          // arg a
+    call SYMBOL(art_f2l)          // (jfloat a)
     addl LITERAL(12), %esp        // pop arguments
     CFI_ADJUST_CFA_OFFSET(-12)
     ret
@@ -1015,8 +1286,8 @@
     movd %eax, %xmm0              // place return value also into floating point return value
     movd %edx, %xmm1
     punpckldq %xmm1, %xmm0
-    addl LITERAL(44), %esp        // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-44)
+    addl LITERAL(76), %esp        // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-76)
     RETURN_OR_DELIVER_PENDING_EXCEPTION    // return or deliver exception
 END_FUNCTION art_quick_proxy_invoke_handler
 
@@ -1028,7 +1299,7 @@
     PUSH ecx
     movl 8(%esp), %eax            // load caller Method*
     movl MIRROR_ART_METHOD_DEX_CACHE_METHODS_OFFSET(%eax), %eax  // load dex_cache_resolved_methods
-    movd %xmm0, %ecx              // get target method index stored in xmm0
+    movd %xmm7, %ecx              // get target method index stored in xmm7
     movl MIRROR_OBJECT_ARRAY_DATA_OFFSET(%eax, %ecx, 4), %eax  // load the target method
     POP ecx
     jmp SYMBOL(art_quick_invoke_interface_trampoline)
@@ -1047,14 +1318,7 @@
     addl LITERAL(16), %esp        // pop arguments
     test %eax, %eax               // if code pointer is NULL goto deliver pending exception
     jz 1f
-    POP eax                       // called method
-    POP ecx                       // restore args
-    POP edx
-    POP ebx
-    POP ebp                       // restore callee saves except EDI
-    POP esi
-    xchgl 0(%esp),%edi            // restore EDI and place code pointer as only value on stack
-    ret                           // tail call into method
+    RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME_AND_JUMP
 1:
     RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     DELIVER_PENDING_EXCEPTION
@@ -1088,7 +1352,6 @@
     movl %edx, %esp
 
     // On x86 there are no registers passed, so nothing to pop here.
-
     // Native call.
     call *%eax
 
@@ -1115,8 +1378,10 @@
     jnz .Lexception_in_native
 
     // Tear down the callee-save frame.
-    addl LITERAL(4), %esp     // Remove padding
-    CFI_ADJUST_CFA_OFFSET(-4)
+    // Remove space for FPR args and EAX.
+    addl LITERAL(4 + 4 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(-(4 + 4 * 8))
+
     POP ecx
     addl LITERAL(4), %esp     // Avoid edx, as it may be part of the result.
     CFI_ADJUST_CFA_OFFSET(-4)
@@ -1146,12 +1411,21 @@
     CFI_ADJUST_CFA_OFFSET(4)
     PUSH eax                      // pass  method
     call SYMBOL(artQuickToInterpreterBridge)  // (method, Thread*, SP)
-    movd %eax, %xmm0              // place return value also into floating point return value
-    movd %edx, %xmm1
-    punpckldq %xmm1, %xmm0
     addl LITERAL(16), %esp        // pop arguments
     CFI_ADJUST_CFA_OFFSET(-16)
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+
+    // Return eax:edx in xmm0 also.
+    movd %eax, %xmm0
+    movd %edx, %xmm1
+    punpckldq %xmm1, %xmm0
+
+    addl LITERAL(48), %esp        // Remove FPRs and EAX, ECX, EDX, EBX.
+    CFI_ADJUST_CFA_OFFSET(-48)
+
+    POP ebp  // Restore callee saves
+    POP esi
+    POP edi
+
     RETURN_OR_DELIVER_PENDING_EXCEPTION    // return or deliver exception
 END_FUNCTION art_quick_to_interpreter_bridge
 
@@ -1171,18 +1445,25 @@
     PUSH eax                      // Pass Method*.
     call SYMBOL(artInstrumentationMethodEntryFromCode) // (Method*, Object*, Thread*, LR)
     addl LITERAL(28), %esp        // Pop arguments upto saved Method*.
-    movl 28(%esp), %edi           // Restore edi.
-    movl %eax, 28(%esp)           // Place code* over edi, just under return pc.
+    movl 60(%esp), %edi           // Restore edi.
+    movl %eax, 60(%esp)           // Place code* over edi, just under return pc.
     movl SYMBOL(art_quick_instrumentation_exit)@GOT(%ebx), %ebx
     // Place instrumentation exit as return pc. ebx holds the GOT computed on entry.
-    movl %ebx, 32(%esp)
-    movl (%esp), %eax             // Restore eax.
-    movl 8(%esp), %ecx            // Restore ecx.
-    movl 12(%esp), %edx           // Restore edx.
-    movl 16(%esp), %ebx           // Restore ebx.
-    movl 20(%esp), %ebp           // Restore ebp.
-    movl 24(%esp), %esi           // Restore esi.
-    addl LITERAL(28), %esp        // Wind stack back upto code*.
+    movl %ebx, 64(%esp)
+    movl 0(%esp), %eax           // Restore eax.
+    // Restore FPRs (extra 4 bytes of offset due to EAX push at top).
+    movsd 8(%esp), %xmm0
+    movsd 16(%esp), %xmm1
+    movsd 24(%esp), %xmm2
+    movsd 32(%esp), %xmm3
+
+    // Restore GPRs.
+    movl 40(%esp), %ecx           // Restore ecx.
+    movl 44(%esp), %edx           // Restore edx.
+    movl 48(%esp), %ebx           // Restore ebx.
+    movl 52(%esp), %ebp           // Restore ebp.
+    movl 56(%esp), %esi           // Restore esi.
+    addl LITERAL(60), %esp        // Wind stack back upto code*.
     ret                           // Call method (and pop).
 END_FUNCTION art_quick_instrumentation_entry
 
diff --git a/runtime/arch/x86/quick_method_frame_info_x86.h b/runtime/arch/x86/quick_method_frame_info_x86.h
index b9dc0d8..9bba531 100644
--- a/runtime/arch/x86/quick_method_frame_info_x86.h
+++ b/runtime/arch/x86/quick_method_frame_info_x86.h
@@ -24,25 +24,44 @@
 namespace art {
 namespace x86 {
 
+enum XMM {
+  XMM0 = 0,
+  XMM1 = 1,
+  XMM2 = 2,
+  XMM3 = 3,
+  XMM4 = 4,
+  XMM5 = 5,
+  XMM6 = 6,
+  XMM7 = 7,
+};
+
 static constexpr uint32_t kX86CalleeSaveRefSpills =
     (1 << art::x86::EBP) | (1 << art::x86::ESI) | (1 << art::x86::EDI);
 static constexpr uint32_t kX86CalleeSaveArgSpills =
     (1 << art::x86::ECX) | (1 << art::x86::EDX) | (1 << art::x86::EBX);
+static constexpr uint32_t kX86CalleeSaveFpArgSpills =
+    (1 << art::x86::XMM0) | (1 << art::x86::XMM1) |
+    (1 << art::x86::XMM2) | (1 << art::x86::XMM3);
 
 constexpr uint32_t X86CalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
   return kX86CalleeSaveRefSpills | (type == Runtime::kRefsAndArgs ? kX86CalleeSaveArgSpills : 0) |
       (1 << art::x86::kNumberOfCpuRegisters);  // fake return address callee save
 }
 
+constexpr uint32_t X86CalleeSaveFpSpills(Runtime::CalleeSaveType type) {
+    return type == Runtime::kRefsAndArgs ? kX86CalleeSaveFpArgSpills : 0;
+}
+
 constexpr uint32_t X86CalleeSaveFrameSize(Runtime::CalleeSaveType type) {
   return RoundUp((POPCOUNT(X86CalleeSaveCoreSpills(type)) /* gprs */ +
+                  2 * POPCOUNT(X86CalleeSaveFpSpills(type)) /* fprs */ +
                   1 /* Method* */) * kX86PointerSize, kStackAlignment);
 }
 
 constexpr QuickMethodFrameInfo X86CalleeSaveMethodFrameInfo(Runtime::CalleeSaveType type) {
   return QuickMethodFrameInfo(X86CalleeSaveFrameSize(type),
                               X86CalleeSaveCoreSpills(type),
-                              0u);
+                              X86CalleeSaveFpSpills(type));
 }
 
 }  // namespace x86
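For kRefsAndArgs the new formula gives: 7 core spill words (EBP, ESI, EDI,
ECX, EDX, EBX plus the fake return-address slot) + 8 words for the four
double-width XMM spills + 1 word for Method*, i.e. 64 bytes, already
16-byte aligned and matching FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE. A
standalone restatement of that arithmetic:

    #include <cstdint>

    // Mirrors X86CalleeSaveFrameSize(kRefsAndArgs) without ART headers.
    constexpr uint32_t kCoreSpills = 3 /* refs */ + 3 /* args */ + 1 /* fake RA */;
    constexpr uint32_t kFpSpills = 4;  // XMM0..XMM3, two words each
    constexpr uint32_t kFrameSize =
        (kCoreSpills + 2 * kFpSpills + 1 /* Method* */) * 4;
    static_assert(kFrameSize == 32 + 32, "matches FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE");
    static_assert(kFrameSize % 16 == 0, "kStackAlignment holds without rounding");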
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 9947b55..8ab90eb 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -198,16 +198,20 @@
   // | EBX         |    arg3
   // | EDX         |    arg2
   // | ECX         |    arg1
+  // | XMM3        |    float arg 4
+  // | XMM2        |    float arg 3
+  // | XMM1        |    float arg 2
+  // | XMM0        |    float arg 1
   // | EAX/Method* |  <- sp
   static constexpr bool kAlignPairRegister = false;
-  static constexpr bool kQuickSoftFloatAbi = true;  // This is a soft float ABI.
+  static constexpr bool kQuickSoftFloatAbi = false;  // This is a hard float ABI.
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumQuickFprArgs = 0;  // 0 arguments passed in FPRs.
+  static constexpr size_t kNumQuickFprArgs = 4;  // 4 arguments passed in FPRs.
   static constexpr bool kGprFprLockstep = false;
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 0;  // Offset of first FPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4;  // Offset of first GPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 28;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 4;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 4 + 4*8;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 28 + 4*8;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
     return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
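With sp at the EAX/Method* slot in the diagram above, XMM0 sits at sp+4,
ECX at sp+36 and the return address at sp+60, which is exactly what the
three updated constants encode:

    // Standalone restatement of the offsets implied by the frame diagram.
    constexpr unsigned kFpr1Offset = 4;          // XMM0, just above Method*
    constexpr unsigned kGpr1Offset = 4 + 4 * 8;  // ECX, above the XMM slots
    constexpr unsigned kLrOffset   = 28 + 4 * 8; // return address
    static_assert(kGpr1Offset == 36 && kLrOffset == 60, "matches the diagram");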
diff --git a/runtime/mirror/art_method.cc b/runtime/mirror/art_method.cc
index 288f6a6..b2016dc 100644
--- a/runtime/mirror/art_method.cc
+++ b/runtime/mirror/art_method.cc
@@ -41,7 +41,7 @@
 
 extern "C" void art_quick_invoke_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*,
                                       const char*);
-#if defined(__LP64__) || defined(__arm__)
+#if defined(__LP64__) || defined(__arm__) || defined(__i386__)
 extern "C" void art_quick_invoke_static_stub(ArtMethod*, uint32_t*, uint32_t, Thread*, JValue*,
                                              const char*);
 #endif
@@ -415,7 +415,7 @@
             << "Don't call compiled code when -Xint " << PrettyMethod(this);
       }
 
-#if defined(__LP64__) || defined(__arm__)
+#if defined(__LP64__) || defined(__arm__) || defined(__i386__)
       if (!IsStatic()) {
         (*art_quick_invoke_stub)(this, args, args_size, self, result, shorty);
       } else {
diff --git a/runtime/oat.h b/runtime/oat.h
index 8e63d3a..3e28606 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '5', '3', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '5', '4', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";