Support hard float on ARM in the optimizing compiler.

Also bump the oat version, which is needed after the latest hard float switch.

Change-Id: Idf5acfb36c07e74acff00edab998419a3c6b2965
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index c4286a4..28ff1cf 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -111,6 +111,20 @@
     }
   }
   LOG(FATAL) << "Could not find a register in baseline register allocator";
+  UNREACHABLE();
+  return -1;
+}
+
+// Claims the first free pair (i, i + 1) with even i, so callers can map it to one D register.
+size_t CodeGenerator::FindTwoFreeConsecutiveEntries(bool* array, size_t length) {
+  for (size_t i = 0; i + 1 < length; i += 2) {  // `i + 1 < length` cannot wrap when length is 0.
+    if (!array[i] && !array[i + 1]) {
+      array[i] = array[i + 1] = true;
+      return i;
+    }
+  }
+  LOG(FATAL) << "Could not find a register in baseline register allocator";
+  UNREACHABLE();
   return -1;
 }
 
@@ -180,6 +194,11 @@
     } else if (loc.IsFpuRegister()) {
       DCHECK(!blocked_fpu_registers_[loc.reg()]);
       blocked_fpu_registers_[loc.reg()] = true;
+    } else if (loc.IsFpuRegisterPair()) {
+      DCHECK(!blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()]);
+      blocked_fpu_registers_[loc.AsFpuRegisterPairLow<int>()] = true;
+      DCHECK(!blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()]);
+      blocked_fpu_registers_[loc.AsFpuRegisterPairHigh<int>()] = true;
     } else if (loc.IsRegisterPair()) {
       DCHECK(!blocked_core_registers_[loc.AsRegisterPairLow<int>()]);
       blocked_core_registers_[loc.AsRegisterPairLow<int>()] = true;
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 220d745..1638869 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -185,6 +185,7 @@
   virtual Location AllocateFreeRegister(Primitive::Type type) const = 0;
 
   static size_t FindFreeEntry(bool* array, size_t length);
+  static size_t FindTwoFreeConsecutiveEntries(bool* array, size_t length);
 
   virtual Location GetStackLocation(HLoadLocal* load) const = 0;
 
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index a3b31d8..4733432 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -31,8 +31,9 @@
 
 namespace arm {
 
-static SRegister FromDToLowS(DRegister reg) {
-  return static_cast<SRegister>(reg * 2);
+static DRegister FromLowSToD(SRegister reg) {  // D register whose low S half is `reg`.
+  DCHECK_EQ(reg % 2, 0);  // Only even-numbered S registers are accepted.
+  return static_cast<DRegister>(reg / 2);
 }
 
 static constexpr bool kExplicitStackOverflowCheck = false;
@@ -43,10 +44,10 @@
 static constexpr Register kRuntimeParameterCoreRegisters[] = { R0, R1, R2 };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
     arraysize(kRuntimeParameterCoreRegisters);
-static constexpr DRegister kRuntimeParameterFpuRegisters[] = { };
+static constexpr SRegister kRuntimeParameterFpuRegisters[] = { };
 static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
 
-class InvokeRuntimeCallingConvention : public CallingConvention<Register, DRegister> {
+class InvokeRuntimeCallingConvention : public CallingConvention<Register, SRegister> {
  public:
   InvokeRuntimeCallingConvention()
       : CallingConvention(kRuntimeParameterCoreRegisters,
@@ -207,7 +208,7 @@
 }
 
 void CodeGeneratorARM::DumpFloatingPointRegister(std::ostream& stream, int reg) const {
-  stream << ArmManagedRegister::FromDRegister(DRegister(reg));
+  stream << ArmManagedRegister::FromSRegister(SRegister(reg));
 }
 
 size_t CodeGeneratorARM::SaveCoreRegister(size_t stack_index, uint32_t reg_id) {
@@ -221,7 +222,7 @@
 }
 
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph)
-    : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfDRegisters, kNumberOfRegisterPairs),
+    : CodeGenerator(graph, kNumberOfCoreRegisters, kNumberOfSRegisters, kNumberOfRegisterPairs),
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
@@ -265,12 +266,16 @@
       return Location::RegisterLocation(reg);
     }
 
-    case Primitive::kPrimFloat:
-    case Primitive::kPrimDouble: {
-      int reg = FindFreeEntry(blocked_fpu_registers_, kNumberOfDRegisters);
+    case Primitive::kPrimFloat: {
+      int reg = FindFreeEntry(blocked_fpu_registers_, kNumberOfSRegisters);
       return Location::FpuRegisterLocation(reg);
     }
 
+    case Primitive::kPrimDouble: {
+      int reg = FindTwoFreeConsecutiveEntries(blocked_fpu_registers_, kNumberOfSRegisters);
+      return Location::FpuRegisterPairLocation(reg, reg + 1);
+    }
+
     case Primitive::kPrimVoid:
       LOG(FATAL) << "Unreachable type " << type;
   }
@@ -304,14 +309,14 @@
   blocked_core_registers_[R10] = true;
   blocked_core_registers_[R11] = true;
 
-  blocked_fpu_registers_[D8] = true;
-  blocked_fpu_registers_[D9] = true;
-  blocked_fpu_registers_[D10] = true;
-  blocked_fpu_registers_[D11] = true;
-  blocked_fpu_registers_[D12] = true;
-  blocked_fpu_registers_[D13] = true;
-  blocked_fpu_registers_[D14] = true;
-  blocked_fpu_registers_[D15] = true;
+  blocked_fpu_registers_[S16] = true;
+  blocked_fpu_registers_[S17] = true;
+  blocked_fpu_registers_[S18] = true;
+  blocked_fpu_registers_[S19] = true;
+  blocked_fpu_registers_[S20] = true;
+  blocked_fpu_registers_[S21] = true;
+  blocked_fpu_registers_[S22] = true;
+  blocked_fpu_registers_[S23] = true;
 
   UpdateBlockedPairRegisters();
 }
@@ -397,28 +402,56 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-    case Primitive::kPrimFloat:
     case Primitive::kPrimNot: {
       uint32_t index = gp_index_++;
+      uint32_t stack_index = stack_index_++;
       if (index < calling_convention.GetNumberOfRegisters()) {
         return Location::RegisterLocation(calling_convention.GetRegisterAt(index));
       } else {
-        return Location::StackSlot(calling_convention.GetStackOffsetOf(index));
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index));
       }
     }
 
-    case Primitive::kPrimLong:
-    case Primitive::kPrimDouble: {
+    case Primitive::kPrimLong: {
       uint32_t index = gp_index_;
+      uint32_t stack_index = stack_index_;
       gp_index_ += 2;
+      stack_index_ += 2;
       if (index + 1 < calling_convention.GetNumberOfRegisters()) {
         ArmManagedRegister pair = ArmManagedRegister::FromRegisterPair(
             calling_convention.GetRegisterPairAt(index));
         return Location::RegisterPairLocation(pair.AsRegisterPairLow(), pair.AsRegisterPairHigh());
       } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
-        return Location::QuickParameter(index);
+        return Location::QuickParameter(stack_index);
       } else {
-        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index));
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index));
+      }
+    }
+
+    case Primitive::kPrimFloat: {
+      uint32_t stack_index = stack_index_++;
+      if (float_index_ % 2 == 0) {
+        float_index_ = std::max(double_index_, float_index_);
+      }
+      if (float_index_ < calling_convention.GetNumberOfFpuRegisters()) {
+        return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(float_index_++));
+      } else {
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index));
+      }
+    }
+
+    case Primitive::kPrimDouble: {
+      double_index_ = std::max(double_index_, RoundUp(float_index_, 2));
+      uint32_t stack_index = stack_index_;
+      stack_index_ += 2;
+      if (double_index_ + 1 < calling_convention.GetNumberOfFpuRegisters()) {
+        uint32_t index = double_index_;
+        double_index_ += 2;
+        return Location::FpuRegisterPairLocation(
+          calling_convention.GetFpuRegisterAt(index),
+          calling_convention.GetFpuRegisterAt(index + 1));
+      } else {
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index));
       }
     }
 
@@ -429,6 +462,36 @@
   return Location();
 }
 
+// Location in which a value of `type` is returned: R0 or R0/R1 for core values, S0 or S0/S1
+// for floating-point values, and a default-constructed (invalid) Location for void.
+Location InvokeDexCallingConventionVisitor::GetReturnLocation(Primitive::Type type) {
+  switch (type) {
+    // Nothing is returned.
+    case Primitive::kPrimVoid:
+      return Location();
+
+    // Values returned in core registers; a long takes the R0/R1 pair.
+    case Primitive::kPrimNot:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimBoolean:
+      return Location::RegisterLocation(R0);
+    case Primitive::kPrimLong:
+      return Location::RegisterPairLocation(R0, R1);
+
+    // Values returned in FPU registers; a double takes the S0/S1 pair.
+    case Primitive::kPrimFloat:
+      return Location::FpuRegisterLocation(S0);
+    case Primitive::kPrimDouble:
+      return Location::FpuRegisterPairLocation(S0, S1);
+  }
+  // Not expected to be reached: every case above returns.
+  UNREACHABLE();
+  return Location();
+}
+
 void CodeGeneratorARM::Move32(Location destination, Location source) {
   if (source.Equals(destination)) {
     return;
@@ -437,24 +500,24 @@
     if (source.IsRegister()) {
       __ Mov(destination.As<Register>(), source.As<Register>());
     } else if (source.IsFpuRegister()) {
-      __ vmovrs(destination.As<Register>(), FromDToLowS(source.As<DRegister>()));
+      __ vmovrs(destination.As<Register>(), source.As<SRegister>());
     } else {
       __ LoadFromOffset(kLoadWord, destination.As<Register>(), SP, source.GetStackIndex());
     }
   } else if (destination.IsFpuRegister()) {
     if (source.IsRegister()) {
-      __ vmovsr(FromDToLowS(destination.As<DRegister>()), source.As<Register>());
+      __ vmovsr(destination.As<SRegister>(), source.As<Register>());
     } else if (source.IsFpuRegister()) {
-      __ vmovs(FromDToLowS(destination.As<DRegister>()), FromDToLowS(source.As<DRegister>()));
+      __ vmovs(destination.As<SRegister>(), source.As<SRegister>());
     } else {
-      __ vldrs(FromDToLowS(destination.As<DRegister>()), Address(SP, source.GetStackIndex()));
+      __ LoadSFromOffset(destination.As<SRegister>(), SP, source.GetStackIndex());
     }
   } else {
     DCHECK(destination.IsStackSlot());
     if (source.IsRegister()) {
       __ StoreToOffset(kStoreWord, source.As<Register>(), SP, destination.GetStackIndex());
     } else if (source.IsFpuRegister()) {
-      __ vstrs(FromDToLowS(source.As<DRegister>()), Address(SP, destination.GetStackIndex()));
+      __ StoreSToOffset(source.As<SRegister>(), SP, destination.GetStackIndex());
     } else {
       DCHECK(source.IsStackSlot());
       __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
@@ -472,7 +535,7 @@
       __ Mov(destination.AsRegisterPairLow<Register>(), source.AsRegisterPairLow<Register>());
       __ Mov(destination.AsRegisterPairHigh<Register>(), source.AsRegisterPairHigh<Register>());
     } else if (source.IsFpuRegister()) {
-      LOG(FATAL) << "Unimplemented";
+      UNIMPLEMENTED(FATAL);
     } else if (source.IsQuickParameter()) {
       uint32_t argument_index = source.GetQuickParameterIndex();
       InvokeDexCallingConvention calling_convention;
@@ -491,11 +554,13 @@
                           SP, source.GetStackIndex());
       }
     }
-  } else if (destination.IsFpuRegister()) {
+  } else if (destination.IsFpuRegisterPair()) {
     if (source.IsDoubleStackSlot()) {
-      __ vldrd(destination.As<DRegister>(), Address(SP, source.GetStackIndex()));
+      __ LoadDFromOffset(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()),
+                         SP,
+                         source.GetStackIndex());
     } else {
-      LOG(FATAL) << "Unimplemented";
+      UNIMPLEMENTED(FATAL);
     }
   } else if (destination.IsQuickParameter()) {
     InvokeDexCallingConvention calling_convention;
@@ -506,10 +571,11 @@
       __ StoreToOffset(kStoreWord, source.AsRegisterPairHigh<Register>(),
              SP, calling_convention.GetStackOffsetOf(argument_index + 1));
     } else if (source.IsFpuRegister()) {
-      LOG(FATAL) << "Unimplemented";
+      UNIMPLEMENTED(FATAL);
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ LoadFromOffset(kLoadWord, calling_convention.GetRegisterAt(argument_index), SP, source.GetStackIndex());
+      __ LoadFromOffset(
+          kLoadWord, calling_convention.GetRegisterAt(argument_index), SP, source.GetStackIndex());
       __ LoadFromOffset(kLoadWord, R0, SP, source.GetHighStackIndex(kArmWordSize));
       __ StoreToOffset(kStoreWord, R0, SP, calling_convention.GetStackOffsetOf(argument_index + 1));
     }
@@ -532,8 +598,10 @@
       __ LoadFromOffset(kLoadWord, R0,
              SP, calling_convention.GetStackOffsetOf(argument_index + 1) + GetFrameSize());
       __ StoreToOffset(kStoreWord, R0, SP, destination.GetHighStackIndex(kArmWordSize));
-    } else if (source.IsFpuRegister()) {
-      __ vstrd(source.As<DRegister>(), Address(SP, destination.GetStackIndex()));
+    } else if (source.IsFpuRegisterPair()) {
+      __ StoreDToOffset(FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()),
+                        SP,
+                        destination.GetStackIndex());
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
@@ -892,50 +960,10 @@
 void LocationsBuilderARM::VisitReturn(HReturn* ret) {
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(ret, LocationSummary::kNoCall);
-  switch (ret->InputAt(0)->GetType()) {
-    case Primitive::kPrimBoolean:
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-    case Primitive::kPrimInt:
-    case Primitive::kPrimNot:
-    case Primitive::kPrimFloat:
-      locations->SetInAt(0, Location::RegisterLocation(R0));
-      break;
-
-    case Primitive::kPrimLong:
-    case Primitive::kPrimDouble:
-      locations->SetInAt(0, Location::RegisterPairLocation(R0, R1));
-      break;
-
-    default:
-      LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
-  }
+  locations->SetInAt(0, parameter_visitor_.GetReturnLocation(ret->InputAt(0)->GetType()));
 }
 
 void InstructionCodeGeneratorARM::VisitReturn(HReturn* ret) {
-  if (kIsDebugBuild) {
-    switch (ret->InputAt(0)->GetType()) {
-      case Primitive::kPrimBoolean:
-      case Primitive::kPrimByte:
-      case Primitive::kPrimChar:
-      case Primitive::kPrimShort:
-      case Primitive::kPrimInt:
-      case Primitive::kPrimNot:
-      case Primitive::kPrimFloat:
-        DCHECK_EQ(ret->GetLocations()->InAt(0).As<Register>(), R0);
-        break;
-
-      case Primitive::kPrimLong:
-      case Primitive::kPrimDouble:
-        DCHECK_EQ(ret->GetLocations()->InAt(0).AsRegisterPairLow<Register>(), R0);
-        DCHECK_EQ(ret->GetLocations()->InAt(0).AsRegisterPairHigh<Register>(), R1);
-        break;
-
-      default:
-        LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
-    }
-  }
   codegen_->GenerateFrameExit();
 }
 
@@ -991,25 +1019,7 @@
     locations->SetInAt(i, calling_convention_visitor.GetNextLocation(input->GetType()));
   }
 
-  switch (invoke->GetType()) {
-    case Primitive::kPrimBoolean:
-    case Primitive::kPrimByte:
-    case Primitive::kPrimChar:
-    case Primitive::kPrimShort:
-    case Primitive::kPrimInt:
-    case Primitive::kPrimNot:
-    case Primitive::kPrimFloat:
-      locations->SetOut(Location::RegisterLocation(R0));
-      break;
-
-    case Primitive::kPrimLong:
-    case Primitive::kPrimDouble:
-      locations->SetOut(Location::RegisterPairLocation(R0, R1));
-      break;
-
-    case Primitive::kPrimVoid:
-      break;
-  }
+  locations->SetOut(calling_convention_visitor.GetReturnLocation(invoke->GetType()));
 }
 
 
@@ -1153,13 +1163,13 @@
       break;
 
     case Primitive::kPrimFloat:
-      __ vadds(FromDToLowS(out.As<DRegister>()),
-               FromDToLowS(first.As<DRegister>()),
-               FromDToLowS(second.As<DRegister>()));
+      __ vadds(out.As<SRegister>(), first.As<SRegister>(), second.As<SRegister>());
       break;
 
     case Primitive::kPrimDouble:
-      __ vaddd(out.As<DRegister>(), first.As<DRegister>(), second.As<DRegister>());
+      __ vaddd(FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(first.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(second.AsFpuRegisterPairLow<SRegister>()));
       break;
 
     default:
@@ -1219,14 +1229,14 @@
     }
 
     case Primitive::kPrimFloat: {
-      __ vsubs(FromDToLowS(out.As<DRegister>()),
-               FromDToLowS(first.As<DRegister>()),
-               FromDToLowS(second.As<DRegister>()));
+      __ vsubs(out.As<SRegister>(), first.As<SRegister>(), second.As<SRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      __ vsubd(out.As<DRegister>(), first.As<DRegister>(), second.As<DRegister>());
+      __ vsubd(FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(first.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(second.AsFpuRegisterPairLow<SRegister>()));
       break;
     }
 
@@ -1303,14 +1313,14 @@
     }
 
     case Primitive::kPrimFloat: {
-      __ vmuls(FromDToLowS(out.As<DRegister>()),
-               FromDToLowS(first.As<DRegister>()),
-               FromDToLowS(second.As<DRegister>()));
+      __ vmuls(out.As<SRegister>(), first.As<SRegister>(), second.As<SRegister>());
       break;
     }
 
     case Primitive::kPrimDouble: {
-      __ vmuld(out.As<DRegister>(), first.As<DRegister>(), second.As<DRegister>());
+      __ vmuld(FromLowSToD(out.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(first.AsFpuRegisterPairLow<SRegister>()),
+               FromLowSToD(second.AsFpuRegisterPairLow<SRegister>()));
       break;
     }
 
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 1fe8a7e..b1c36c0 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -32,10 +32,11 @@
 static constexpr Register kParameterCoreRegisters[] = { R1, R2, R3 };
 static constexpr RegisterPair kParameterCorePairRegisters[] = { R1_R2, R2_R3 };
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
-static constexpr DRegister kParameterFpuRegisters[] = { };
-static constexpr size_t kParameterFpuRegistersLength = 0;
+static constexpr SRegister kParameterFpuRegisters[] =
+    { S0, S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15 };
+static constexpr size_t kParameterFpuRegistersLength = arraysize(kParameterFpuRegisters);
 
-class InvokeDexCallingConvention : public CallingConvention<Register, DRegister> {
+class InvokeDexCallingConvention : public CallingConvention<Register, SRegister> {
  public:
   InvokeDexCallingConvention()
       : CallingConvention(kParameterCoreRegisters,
@@ -54,13 +55,18 @@
 
 class InvokeDexCallingConventionVisitor {
  public:
-  InvokeDexCallingConventionVisitor() : gp_index_(0) {}
+  InvokeDexCallingConventionVisitor()
+      : gp_index_(0), float_index_(0), double_index_(0), stack_index_(0) {}
 
   Location GetNextLocation(Primitive::Type type);
+  Location GetReturnLocation(Primitive::Type type);
 
  private:
   InvokeDexCallingConvention calling_convention;
   uint32_t gp_index_;
+  uint32_t float_index_;
+  uint32_t double_index_;
+  uint32_t stack_index_;
 
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
 };
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 11bcd78..de8c78d 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -47,24 +47,26 @@
     // We do not use the value 5 because it conflicts with kLocationConstantMask.
     kDoNotUse5 = 5,
 
-    kFpuRegister = 6,  // Floating point processor.
+    kFpuRegister = 6,  // Float register.
 
-    kRegisterPair = 7,
+    kRegisterPair = 7,  // Long register.
+
+    kFpuRegisterPair = 8,  // Double register.
+
+    // We do not use the value 9 because it conflicts with kLocationConstantMask.
+    kDoNotUse9 = 9,
 
     // On 32bits architectures, quick can pass a long where the
     // low bits are in the last parameter register, and the high
     // bits are in a stack slot. The kQuickParameter kind is for
     // handling this special case.
-    kQuickParameter = 8,
-
-    // We do not use the value 9 because it conflicts with kLocationConstantMask.
-    kDoNotUse9 = 9,
+    kQuickParameter = 10,
 
     // Unallocated location represents a location that is not fixed and can be
     // allocated by a register allocator.  Each unallocated location has
     // a policy that specifies what kind of location is suitable. Payload
     // contains register allocation policy.
-    kUnallocated = 10,
+    kUnallocated = 11,
   };
 
   Location() : value_(kInvalid) {
@@ -77,6 +79,7 @@
     COMPILE_ASSERT((kQuickParameter & kLocationConstantMask) != kConstant, TagError);
     COMPILE_ASSERT((kFpuRegister & kLocationConstantMask) != kConstant, TagError);
     COMPILE_ASSERT((kRegisterPair & kLocationConstantMask) != kConstant, TagError);
+    COMPILE_ASSERT((kFpuRegisterPair & kLocationConstantMask) != kConstant, TagError);
     COMPILE_ASSERT((kConstant & kLocationConstantMask) == kConstant, TagError);
 
     DCHECK(!IsValid());
@@ -129,6 +132,10 @@
     return Location(kRegisterPair, low << 16 | high);
   }
 
+  static Location FpuRegisterPairLocation(int low, int high) {  // Pair of FPU registers.
+    return Location(kFpuRegisterPair, low << 16 | high);  // See AsFpuRegisterPairLow/High().
+  }
+
   bool IsRegister() const {
     return GetKind() == kRegister;
   }
@@ -141,6 +148,10 @@
     return GetKind() == kRegisterPair;
   }
 
+  bool IsFpuRegisterPair() const {  // True when the kind tag is kFpuRegisterPair.
+    return GetKind() == kFpuRegisterPair;
+  }
+
   int reg() const {
     DCHECK(IsRegister() || IsFpuRegister());
     return GetPayload();
@@ -163,6 +174,18 @@
     return static_cast<T>(GetPayload() & 0xFFFF);
   }
 
+  template <typename T>
+  T AsFpuRegisterPairLow() const {  // Low register of the pair: payload shifted right by 16.
+    DCHECK(IsFpuRegisterPair());
+    return static_cast<T>(GetPayload() >> 16);
+  }
+
+  template <typename T>
+  T AsFpuRegisterPairHigh() const {  // High register of the pair: low 16 bits of the payload.
+    DCHECK(IsFpuRegisterPair());
+    return static_cast<T>(GetPayload() & 0xFFFF);
+  }
+
   static uintptr_t EncodeStackIndex(intptr_t stack_index) {
     DCHECK(-kStackIndexBias <= stack_index);
     DCHECK(stack_index < kStackIndexBias);
@@ -237,6 +260,7 @@
       case kConstant: return "C";
       case kFpuRegister: return "F";
       case kRegisterPair: return "RP";
+      case kFpuRegisterPair: return "FP";
       case kDoNotUse5:  // fall-through
       case kDoNotUse9:
         LOG(FATAL) << "Should not use this location kind";
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 0555c00..5350dcb 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -199,6 +199,13 @@
   delegate_->InitCompilationUnit(cu);
 }
 
+// Whether the optimizing compiler attempts to compile for `instruction_set`.
+// Thumb2 is accepted only when quick code is not built for soft float.
+static bool IsInstructionSetSupported(InstructionSet instruction_set) {
+  if (instruction_set == kThumb2) return !kArm32QuickCodeUseSoftFloat;
+  return instruction_set == kArm64 || instruction_set == kX86 || instruction_set == kX86_64;
+}
+
 CompiledMethod* OptimizingCompiler::TryCompile(const DexFile::CodeItem* code_item,
                                                uint32_t access_flags,
                                                InvokeType invoke_type,
@@ -215,10 +222,7 @@
   }
 
   // Do not attempt to compile on architectures we do not support.
-  if (instruction_set != kArm64 &&
-      instruction_set != kThumb2 &&
-      instruction_set != kX86 &&
-      instruction_set != kX86_64) {
+  if (!IsInstructionSetSupported(instruction_set)) {
     return nullptr;
   }
 
@@ -233,17 +237,6 @@
   bool shouldOptimize =
       dex_compilation_unit.GetSymbol().find("00024reg_00024") != std::string::npos;
 
-  if (instruction_set == kThumb2 && !kArm32QuickCodeUseSoftFloat) {
-    uint32_t shorty_len;
-    const char* shorty = dex_compilation_unit.GetShorty(&shorty_len);
-    for (uint32_t i = 0; i < shorty_len; ++i) {
-      if (shorty[i] == 'D' || shorty[i] == 'F') {
-        CHECK(!shouldCompile) << "Hard float ARM32 parameters are not yet supported";
-        return nullptr;
-      }
-    }
-  }
-
   ArenaPool pool;
   ArenaAllocator arena(&pool);
   HGraphBuilder builder(&arena, &dex_compilation_unit, &dex_file, GetCompilerDriver());
diff --git a/runtime/oat.cc b/runtime/oat.cc
index 0d7fb01..95c4716 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -23,7 +23,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '4', '3', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '4', '4', '\0' };
 
 static size_t ComputeOatHeaderSize(const SafeMap<std::string, std::string>* variable_data) {
   size_t estimate = 0U;