[optimizing] Implement x86/x86_64 math intrinsics

Implement the MathFloor/MathCeil/MathRint/MathRoundFloat intrinsics on x86
and x86_64.  Implement MathRoundDouble on x86_64 only.

Add support for roundss and roundsd on both architectures.  Support them
in the disassembler as well.

Add the instruction set features to the x86 and x86_64 code generators, as
the roundss/roundsd instructions are only supported if SSE4.1 is supported.

Update the tests to pass the instruction set features to the x86 and
x86_64 code generators.

Add assembler tests for roundsd and roundss to x86_64 assembler tests.

Change-Id: I9742d5930befb0bbc23f3d6c83ce0183ed9fe04f
Signed-off-by: Mark Mendell <mark.p.mendell@intel.com>
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index bd6e943..9b1ef17 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -378,10 +378,14 @@
     case kMips:
       return nullptr;
     case kX86: {
-      return new x86::CodeGeneratorX86(graph, compiler_options);
+      return new x86::CodeGeneratorX86(graph,
+           *isa_features.AsX86InstructionSetFeatures(),
+           compiler_options);
     }
     case kX86_64: {
-      return new x86_64::CodeGeneratorX86_64(graph, compiler_options);
+      return new x86_64::CodeGeneratorX86_64(graph,
+          *isa_features.AsX86_64InstructionSetFeatures(),
+          compiler_options);
     }
     default:
       return nullptr;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 0d5fe49..224be0f 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -339,7 +339,9 @@
   return GetFloatingPointSpillSlotSize();
 }
 
-CodeGeneratorX86::CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options)
+CodeGeneratorX86::CodeGeneratorX86(HGraph* graph,
+                   const X86InstructionSetFeatures& isa_features,
+                   const CompilerOptions& compiler_options)
     : CodeGenerator(graph,
                     kNumberOfCpuRegisters,
                     kNumberOfXmmRegisters,
@@ -352,7 +354,8 @@
       block_labels_(graph->GetArena(), 0),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
-      move_resolver_(graph->GetArena(), this) {
+      move_resolver_(graph->GetArena(), this),
+      isa_features_(isa_features) {
   // Use a fake return address register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
@@ -1110,7 +1113,7 @@
 }
 
 void LocationsBuilderX86::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
-  IntrinsicLocationsBuilderX86 intrinsic(GetGraph()->GetArena());
+  IntrinsicLocationsBuilderX86 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     return;
   }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 6a4d42d..7cdbd62 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -185,7 +185,9 @@
 
 class CodeGeneratorX86 : public CodeGenerator {
  public:
-  CodeGeneratorX86(HGraph* graph, const CompilerOptions& compiler_options);
+  CodeGeneratorX86(HGraph* graph,
+                   const X86InstructionSetFeatures& isa_features,
+                   const CompilerOptions& compiler_options);
   virtual ~CodeGeneratorX86() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -271,6 +273,10 @@
 
   Label* GetFrameEntryLabel() { return &frame_entry_label_; }
 
+  const X86InstructionSetFeatures& GetInstructionSetFeatures() const {
+    return isa_features_;
+  }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
@@ -279,6 +285,7 @@
   InstructionCodeGeneratorX86 instruction_visitor_;
   ParallelMoveResolverX86 move_resolver_;
   X86Assembler assembler_;
+  const X86InstructionSetFeatures& isa_features_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86);
 };
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index ef60280..aac4c3a 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -390,7 +390,9 @@
 static constexpr int kNumberOfCpuRegisterPairs = 0;
 // Use a fake return address register to mimic Quick.
 static constexpr Register kFakeReturnRegister = Register(kLastCpuRegister + 1);
-CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options)
+CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph,
+                const X86_64InstructionSetFeatures& isa_features,
+                const CompilerOptions& compiler_options)
       : CodeGenerator(graph,
                       kNumberOfCpuRegisters,
                       kNumberOfFloatRegisters,
@@ -404,7 +406,8 @@
         block_labels_(graph->GetArena(), 0),
         location_builder_(graph, this),
         instruction_visitor_(graph, this),
-        move_resolver_(graph->GetArena(), this) {
+        move_resolver_(graph->GetArena(), this),
+        isa_features_(isa_features) {
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
 
@@ -1180,7 +1183,7 @@
 }
 
 void LocationsBuilderX86_64::VisitInvokeStaticOrDirect(HInvokeStaticOrDirect* invoke) {
-  IntrinsicLocationsBuilderX86_64 intrinsic(GetGraph()->GetArena());
+  IntrinsicLocationsBuilderX86_64 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     return;
   }
@@ -1241,7 +1244,7 @@
 }
 
 void LocationsBuilderX86_64::VisitInvokeVirtual(HInvokeVirtual* invoke) {
-  IntrinsicLocationsBuilderX86_64 intrinsic(GetGraph()->GetArena());
+  IntrinsicLocationsBuilderX86_64 intrinsic(codegen_);
   if (intrinsic.TryDispatch(invoke)) {
     return;
   }
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index a380b6a..26e913c 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -191,7 +191,9 @@
 
 class CodeGeneratorX86_64 : public CodeGenerator {
  public:
-  CodeGeneratorX86_64(HGraph* graph, const CompilerOptions& compiler_options);
+  CodeGeneratorX86_64(HGraph* graph,
+                  const X86_64InstructionSetFeatures& isa_features,
+                  const CompilerOptions& compiler_options);
   virtual ~CodeGeneratorX86_64() {}
 
   void GenerateFrameEntry() OVERRIDE;
@@ -264,6 +266,10 @@
 
   void GenerateStaticOrDirectCall(HInvokeStaticOrDirect* invoke, CpuRegister temp);
 
+  const X86_64InstructionSetFeatures& GetInstructionSetFeatures() const {
+    return isa_features_;
+  }
+
  private:
   // Labels for each block that will be compiled.
   GrowableArray<Label> block_labels_;
@@ -272,6 +278,7 @@
   InstructionCodeGeneratorX86_64 instruction_visitor_;
   ParallelMoveResolverX86_64 move_resolver_;
   X86_64Assembler assembler_;
+  const X86_64InstructionSetFeatures& isa_features_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64);
 };
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 6053ad5..2be117b 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -19,6 +19,8 @@
 #include "arch/instruction_set.h"
 #include "arch/arm/instruction_set_features_arm.h"
 #include "arch/arm64/instruction_set_features_arm64.h"
+#include "arch/x86/instruction_set_features_x86.h"
+#include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "base/macros.h"
 #include "builder.h"
 #include "code_generator_arm.h"
@@ -108,7 +110,9 @@
   InternalCodeAllocator allocator;
 
   CompilerOptions compiler_options;
-  x86::CodeGeneratorX86 codegenX86(graph, compiler_options);
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options);
   // We avoid doing a stack overflow check that requires the runtime being setup,
   // by making sure the compiler knows the methods we are running are leaf methods.
   codegenX86.CompileBaseline(&allocator, true);
@@ -124,7 +128,9 @@
     Run(allocator, codegenARM, has_result, expected);
   }
 
-  x86_64::CodeGeneratorX86_64 codegenX86_64(graph, compiler_options);
+  std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
+      X86_64InstructionSetFeatures::FromCppDefines());
+  x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options);
   codegenX86_64.CompileBaseline(&allocator, true);
   if (kRuntimeISA == kX86_64) {
     Run(allocator, codegenX86_64, has_result, expected);
@@ -175,10 +181,14 @@
                                            compiler_options);
     RunCodeOptimized(&codegenARM64, graph, hook_before_codegen, has_result, expected);
   } else if (kRuntimeISA == kX86) {
-    x86::CodeGeneratorX86 codegenX86(graph, compiler_options);
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options);
     RunCodeOptimized(&codegenX86, graph, hook_before_codegen, has_result, expected);
   } else if (kRuntimeISA == kX86_64) {
-    x86_64::CodeGeneratorX86_64 codegenX86_64(graph, compiler_options);
+    std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
+        X86_64InstructionSetFeatures::FromCppDefines());
+    x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options);
     RunCodeOptimized(&codegenX86_64, graph, hook_before_codegen, has_result, expected);
   }
 }
diff --git a/compiler/optimizing/constant_folding_test.cc b/compiler/optimizing/constant_folding_test.cc
index 6853d54..02ad675 100644
--- a/compiler/optimizing/constant_folding_test.cc
+++ b/compiler/optimizing/constant_folding_test.cc
@@ -16,6 +16,7 @@
 
 #include <functional>
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "code_generator_x86.h"
 #include "constant_folding.h"
 #include "dead_code_elimination.h"
@@ -46,7 +47,9 @@
   std::string actual_before = printer_before.str();
   ASSERT_EQ(expected_before, actual_before);
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), CompilerOptions());
   HConstantFolding(graph).Run();
   SSAChecker ssa_checker_cf(&allocator, graph);
   ssa_checker_cf.Run();
diff --git a/compiler/optimizing/dead_code_elimination_test.cc b/compiler/optimizing/dead_code_elimination_test.cc
index a6447196..98ae1ec 100644
--- a/compiler/optimizing/dead_code_elimination_test.cc
+++ b/compiler/optimizing/dead_code_elimination_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "code_generator_x86.h"
 #include "dead_code_elimination.h"
 #include "driver/compiler_options.h"
@@ -40,7 +41,9 @@
   std::string actual_before = printer_before.str();
   ASSERT_EQ(actual_before, expected_before);
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), CompilerOptions());
   HDeadCodeElimination(graph).Run();
   SSAChecker ssa_checker(&allocator, graph);
   ssa_checker.Run();
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 384737f..0740471 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -16,6 +16,7 @@
 
 #include "intrinsics_x86.h"
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "code_generator_x86.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "intrinsics.h"
@@ -34,6 +35,11 @@
 static constexpr int kDoubleNaNLow = 0x00000000;
 static constexpr int kFloatNaN = 0x7FC00000;
 
+IntrinsicLocationsBuilderX86::IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen)
+  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
+}
+
+
 X86Assembler* IntrinsicCodeGeneratorX86::GetAssembler() {
   return reinterpret_cast<X86Assembler*>(codegen_->GetAssembler());
 }
@@ -719,6 +725,148 @@
   GetAssembler()->sqrtsd(out, in);
 }
 
+static void InvokeOutOfLineIntrinsic(CodeGeneratorX86* codegen, HInvoke* invoke) {
+  MoveArguments(invoke, codegen->GetGraph()->GetArena(), codegen);
+
+  DCHECK(invoke->IsInvokeStaticOrDirect());
+  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), EAX);
+
+  // Copy the result back to the expected output.
+  Location out = invoke->GetLocations()->Out();
+  if (out.IsValid()) {
+    DCHECK(out.IsRegister());
+    MoveFromReturnRegister(out, invoke->GetType(), codegen);
+  }
+}
+
+static void CreateSSE41FPToFPLocations(ArenaAllocator* arena,
+                                      HInvoke* invoke,
+                                      CodeGeneratorX86* codegen) {
+  // Do we have instruction support?
+  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
+    CreateFPToFPLocations(arena, invoke);
+    return;
+  }
+
+  // We have to fall back to a call to the intrinsic.
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kCall);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
+  locations->SetOut(Location::FpuRegisterLocation(XMM0));
+  // Needs to be EAX for the invoke.
+  locations->AddTemp(Location::RegisterLocation(EAX));
+}
+
+static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86* codegen,
+                                   HInvoke* invoke,
+                                   X86Assembler* assembler,
+                                   int round_mode) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen, invoke);
+  } else {
+    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
+    __ roundsd(out, in, Immediate(round_mode));
+  }
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathCeil(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathCeil(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathFloor(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathFloor(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
+}
+
+void IntrinsicLocationsBuilderX86::VisitMathRint(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathRint(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
+}
+
+// Note that 32 bit x86 doesn't have the capability to inline MathRoundDouble,
+// as it needs 64 bit instructions.
+void IntrinsicLocationsBuilderX86::VisitMathRoundFloat(HInvoke* invoke) {
+  // With SSE4.1 (roundss available) the rounding is generated inline, without a call.
+  if (codegen_->GetInstructionSetFeatures().HasSSE4_1()) {
+    LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                              LocationSummary::kNoCall,
+                                                              kIntrinsified);
+    locations->SetInAt(0, Location::RequiresFpuRegister());
+    locations->SetOut(Location::RequiresRegister());  // Int result: read as a core register.
+    locations->AddTemp(Location::RequiresFpuRegister());  // maxInt in the code generator.
+    locations->AddTemp(Location::RequiresFpuRegister());  // inPlusPointFive in the code generator.
+    return;
+  }
+
+  // No SSE4.1: call instead. NOTE(review): confirm RegisterLocation is right for an FPU arg.
+  LocationSummary* locations = new (arena_) LocationSummary(invoke,
+                                                           LocationSummary::kCall);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
+  locations->SetOut(Location::RegisterLocation(EAX));
+  // Needs to be EAX for the invoke.
+  locations->AddTemp(Location::RegisterLocation(EAX));
+}
+
+void IntrinsicCodeGeneratorX86::VisitMathRoundFloat(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen_, invoke);
+    return;
+  }
+
+  // Implement RoundFloat as t1 = floor(input + 0.5f);  convert to int.
+  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  Register out = locations->Out().AsRegister<Register>();
+  XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+  Label done, nan;
+  X86Assembler* assembler = GetAssembler();
+
+  // Generate 0.5f into inPlusPointFive, using out as a scratch register.
+  __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f)));
+  __ movd(inPlusPointFive, out);
+
+  // Add in the input.
+  __ addss(inPlusPointFive, in);
+
+  // And round down (floor, the same mode VisitMathFloor passes) to an integral value.
+  __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1));
+
+  __ movl(out, Immediate(kPrimIntMax));
+  // maxInt = int-to-float(out)
+  __ cvtsi2ss(maxInt, out);
+
+  // if inPlusPointFive >= maxInt goto done (out already holds kPrimIntMax)
+  __ comiss(inPlusPointFive, maxInt);
+  __ j(kAboveEqual, &done);
+
+  // if the comparison was unordered (NaN operand) goto nan
+  __ j(kUnordered, &nan);
+
+  // output = float-to-int-truncate(inPlusPointFive)
+  __ cvttss2si(out, inPlusPointFive);
+  __ jmp(&done);
+  __ Bind(&nan);
+
+  // output = 0
+  __ xorl(out, out);
+  __ Bind(&done);
+}
+
 void IntrinsicLocationsBuilderX86::VisitStringCharAt(HInvoke* invoke) {
   // The inputs plus one temp.
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
@@ -1191,11 +1339,7 @@
 UNIMPLEMENTED_INTRINSIC(IntegerReverse)
 UNIMPLEMENTED_INTRINSIC(LongReverse)
 UNIMPLEMENTED_INTRINSIC(LongReverseBytes)
-UNIMPLEMENTED_INTRINSIC(MathFloor)
-UNIMPLEMENTED_INTRINSIC(MathCeil)
-UNIMPLEMENTED_INTRINSIC(MathRint)
 UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
-UNIMPLEMENTED_INTRINSIC(MathRoundFloat)
 UNIMPLEMENTED_INTRINSIC(StringIndexOf)
 UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
diff --git a/compiler/optimizing/intrinsics_x86.h b/compiler/optimizing/intrinsics_x86.h
index e1e8260..4292ec7 100644
--- a/compiler/optimizing/intrinsics_x86.h
+++ b/compiler/optimizing/intrinsics_x86.h
@@ -32,7 +32,7 @@
 
 class IntrinsicLocationsBuilderX86 FINAL : public IntrinsicVisitor {
  public:
-  explicit IntrinsicLocationsBuilderX86(ArenaAllocator* arena) : arena_(arena) {}
+  explicit IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen);
 
   // Define visitor methods.
 
@@ -50,6 +50,7 @@
 
  private:
   ArenaAllocator* arena_;
+  CodeGeneratorX86* codegen_;
 
   DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderX86);
 };
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index 736cea8..f6fa013 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -16,6 +16,7 @@
 
 #include "intrinsics_x86_64.h"
 
+#include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "code_generator_x86_64.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "intrinsics.h"
@@ -30,6 +31,11 @@
 
 namespace x86_64 {
 
+IntrinsicLocationsBuilderX86_64::IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen)
+  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
+}
+
+
 X86_64Assembler* IntrinsicCodeGeneratorX86_64::GetAssembler() {
   return reinterpret_cast<X86_64Assembler*>(codegen_->GetAssembler());
 }
@@ -614,6 +620,203 @@
   GetAssembler()->sqrtsd(out, in);
 }
 
+static void InvokeOutOfLineIntrinsic(CodeGeneratorX86_64* codegen, HInvoke* invoke) {
+  MoveArguments(invoke, codegen->GetGraph()->GetArena(), codegen);
+
+  DCHECK(invoke->IsInvokeStaticOrDirect());
+  codegen->GenerateStaticOrDirectCall(invoke->AsInvokeStaticOrDirect(), CpuRegister(RDI));
+  codegen->RecordPcInfo(invoke, invoke->GetDexPc());
+
+  // Copy the result back to the expected output.
+  Location out = invoke->GetLocations()->Out();
+  if (out.IsValid()) {
+    DCHECK(out.IsRegister());
+    MoveFromReturnRegister(out, invoke->GetType(), codegen);
+  }
+}
+
+static void CreateSSE41FPToFPLocations(ArenaAllocator* arena,
+                                      HInvoke* invoke,
+                                      CodeGeneratorX86_64* codegen) {
+  // Do we have instruction support?
+  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
+    CreateFPToFPLocations(arena, invoke);
+    return;
+  }
+
+  // We have to fall back to a call to the intrinsic.
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kCall);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
+  locations->SetOut(Location::FpuRegisterLocation(XMM0));
+  // Needs to be RDI for the invoke.
+  locations->AddTemp(Location::RegisterLocation(RDI));
+}
+
+static void GenSSE41FPToFPIntrinsic(CodeGeneratorX86_64* codegen,
+                                   HInvoke* invoke,
+                                   X86_64Assembler* assembler,
+                                   int round_mode) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen, invoke);
+  } else {
+    XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+    XmmRegister out = locations->Out().AsFpuRegister<XmmRegister>();
+    __ roundsd(out, in, Immediate(round_mode));
+  }
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathCeil(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathCeil(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 2);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathFloor(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathFloor(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 1);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathRint(HInvoke* invoke) {
+  CreateSSE41FPToFPLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathRint(HInvoke* invoke) {
+  GenSSE41FPToFPIntrinsic(codegen_, invoke, GetAssembler(), 0);
+}
+
+static void CreateSSE41FPToIntLocations(ArenaAllocator* arena,
+                                       HInvoke* invoke,
+                                       CodeGeneratorX86_64* codegen) {
+  // With SSE4.1 (roundss/roundsd available) the rounding is generated inline, without a call.
+  if (codegen->GetInstructionSetFeatures().HasSSE4_1()) {
+    LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                              LocationSummary::kNoCall,
+                                                              kIntrinsified);
+    locations->SetInAt(0, Location::RequiresFpuRegister());
+    locations->SetOut(Location::RequiresRegister());  // Int/long result: read as a CpuRegister.
+    locations->AddTemp(Location::RequiresFpuRegister());  // maxInt/maxLong in the code generators.
+    locations->AddTemp(Location::RequiresFpuRegister());  // inPlusPointFive in the code generators.
+    return;
+  }
+
+  // No SSE4.1: call instead. NOTE(review): confirm RegisterLocation is right for an FPU arg.
+  LocationSummary* locations = new (arena) LocationSummary(invoke,
+                                                           LocationSummary::kCall);
+  InvokeRuntimeCallingConvention calling_convention;
+  locations->SetInAt(0, Location::RegisterLocation(calling_convention.GetFpuRegisterAt(0)));
+  locations->SetOut(Location::RegisterLocation(RAX));
+  // Needs to be RDI for the invoke.
+  locations->AddTemp(Location::RegisterLocation(RDI));
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathRoundFloat(HInvoke* invoke) {
+  CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathRoundFloat(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen_, invoke);
+    return;
+  }
+
+  // Implement RoundFloat as t1 = floor(input + 0.5f);  convert to int.
+  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+  XmmRegister maxInt = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+  Label done, nan;
+  X86_64Assembler* assembler = GetAssembler();
+
+  // Generate 0.5f into inPlusPointFive, using out as a scratch register.
+  __ movl(out, Immediate(bit_cast<int32_t, float>(0.5f)));
+  __ movd(inPlusPointFive, out, false);
+
+  // Add in the input.
+  __ addss(inPlusPointFive, in);
+
+  // And round down (floor, the same mode VisitMathFloor passes) to an integral value.
+  __ roundss(inPlusPointFive, inPlusPointFive, Immediate(1));
+
+  __ movl(out, Immediate(kPrimIntMax));
+  // maxInt = int-to-float(out)
+  __ cvtsi2ss(maxInt, out);
+
+  // if inPlusPointFive >= maxInt goto done (out already holds kPrimIntMax)
+  __ comiss(inPlusPointFive, maxInt);
+  __ j(kAboveEqual, &done);
+
+  // if the comparison was unordered (NaN operand) goto nan
+  __ j(kUnordered, &nan);
+
+  // output = float-to-int-truncate(inPlusPointFive)
+  __ cvttss2si(out, inPlusPointFive);
+  __ jmp(&done);
+  __ Bind(&nan);
+
+  // output = 0
+  __ xorl(out, out);
+  __ Bind(&done);
+}
+
+void IntrinsicLocationsBuilderX86_64::VisitMathRoundDouble(HInvoke* invoke) {
+  CreateSSE41FPToIntLocations(arena_, invoke, codegen_);
+}
+
+void IntrinsicCodeGeneratorX86_64::VisitMathRoundDouble(HInvoke* invoke) {
+  LocationSummary* locations = invoke->GetLocations();
+  if (locations->WillCall()) {
+    InvokeOutOfLineIntrinsic(codegen_, invoke);
+    return;
+  }
+
+  // Implement RoundDouble as t1 = floor(input + 0.5);  convert to long.
+  XmmRegister in = locations->InAt(0).AsFpuRegister<XmmRegister>();
+  CpuRegister out = locations->Out().AsRegister<CpuRegister>();
+  XmmRegister maxLong = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  XmmRegister inPlusPointFive = locations->GetTemp(1).AsFpuRegister<XmmRegister>();
+  Label done, nan;
+  X86_64Assembler* assembler = GetAssembler();
+
+  // Generate 0.5 into inPlusPointFive, using out as a scratch register.
+  __ movq(out, Immediate(bit_cast<int64_t, double>(0.5)));
+  __ movd(inPlusPointFive, out, true);
+
+  // Add in the input.
+  __ addsd(inPlusPointFive, in);
+
+  // And round down (floor, the same mode VisitMathFloor passes) to an integral value.
+  __ roundsd(inPlusPointFive, inPlusPointFive, Immediate(1));
+
+  __ movq(out, Immediate(kPrimLongMax));
+  // maxLong = long-to-double(out)
+  __ cvtsi2sd(maxLong, out, true);
+
+  // if inPlusPointFive >= maxLong goto done (out already holds kPrimLongMax)
+  __ comisd(inPlusPointFive, maxLong);
+  __ j(kAboveEqual, &done);
+
+  // if the comparison was unordered (NaN operand) goto nan
+  __ j(kUnordered, &nan);
+
+  // output = double-to-long-truncate(inPlusPointFive)
+  __ cvttsd2si(out, inPlusPointFive, true);
+  __ jmp(&done);
+  __ Bind(&nan);
+
+  // output = 0
+  __ xorq(out, out);
+  __ Bind(&done);
+}
+
 void IntrinsicLocationsBuilderX86_64::VisitStringCharAt(HInvoke* invoke) {
   // The inputs plus one temp.
   LocationSummary* locations = new (arena_) LocationSummary(invoke,
@@ -1009,11 +1212,6 @@
 
 UNIMPLEMENTED_INTRINSIC(IntegerReverse)
 UNIMPLEMENTED_INTRINSIC(LongReverse)
-UNIMPLEMENTED_INTRINSIC(MathFloor)
-UNIMPLEMENTED_INTRINSIC(MathCeil)
-UNIMPLEMENTED_INTRINSIC(MathRint)
-UNIMPLEMENTED_INTRINSIC(MathRoundDouble)
-UNIMPLEMENTED_INTRINSIC(MathRoundFloat)
 UNIMPLEMENTED_INTRINSIC(StringIndexOf)
 UNIMPLEMENTED_INTRINSIC(StringIndexOfAfter)
 UNIMPLEMENTED_INTRINSIC(SystemArrayCopyChar)
diff --git a/compiler/optimizing/intrinsics_x86_64.h b/compiler/optimizing/intrinsics_x86_64.h
index dfae7fa..0e0e72c 100644
--- a/compiler/optimizing/intrinsics_x86_64.h
+++ b/compiler/optimizing/intrinsics_x86_64.h
@@ -32,7 +32,7 @@
 
 class IntrinsicLocationsBuilderX86_64 FINAL : public IntrinsicVisitor {
  public:
-  explicit IntrinsicLocationsBuilderX86_64(ArenaAllocator* arena) : arena_(arena) {}
+  explicit IntrinsicLocationsBuilderX86_64(CodeGeneratorX86_64* codegen);
 
   // Define visitor methods.
 
@@ -50,6 +50,7 @@
 
  private:
   ArenaAllocator* arena_;
+  CodeGeneratorX86_64* codegen_;
 
   DISALLOW_COPY_AND_ASSIGN(IntrinsicLocationsBuilderX86_64);
 };
diff --git a/compiler/optimizing/linearize_test.cc b/compiler/optimizing/linearize_test.cc
index f22b7a7..28c5555 100644
--- a/compiler/optimizing/linearize_test.cc
+++ b/compiler/optimizing/linearize_test.cc
@@ -16,6 +16,7 @@
 
 #include <fstream>
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "base/arena_allocator.h"
 #include "base/stringprintf.h"
 #include "builder.h"
@@ -46,7 +47,9 @@
 
   graph->TryBuildingSsa();
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/live_ranges_test.cc b/compiler/optimizing/live_ranges_test.cc
index c102c4f..61d6593 100644
--- a/compiler/optimizing/live_ranges_test.cc
+++ b/compiler/optimizing/live_ranges_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "base/arena_allocator.h"
 #include "builder.h"
 #include "code_generator.h"
@@ -65,7 +66,9 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -111,7 +114,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -160,7 +165,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -237,7 +244,9 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
   RemoveSuspendChecks(graph);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -315,7 +324,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -391,7 +402,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/liveness_test.cc b/compiler/optimizing/liveness_test.cc
index 0b0cfde..81250ca 100644
--- a/compiler/optimizing/liveness_test.cc
+++ b/compiler/optimizing/liveness_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "base/arena_allocator.h"
 #include "builder.h"
 #include "code_generator.h"
@@ -53,7 +54,9 @@
   graph->TryBuildingSsa();
   // `Inline` conditions into ifs.
   PrepareForRegisterAllocation(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc
index 7c3a035..3951439 100644
--- a/compiler/optimizing/register_allocator_test.cc
+++ b/compiler/optimizing/register_allocator_test.cc
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arch/x86/instruction_set_features_x86.h"
 #include "base/arena_allocator.h"
 #include "builder.h"
 #include "code_generator.h"
@@ -42,7 +43,9 @@
   const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
   builder.BuildGraph(*item);
   graph->TryBuildingSsa();
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -58,7 +61,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = new (&allocator) HGraph(&allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   GrowableArray<LiveInterval*> intervals(&allocator, 0);
 
   // Test with two intervals of the same range.
@@ -298,7 +303,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -330,7 +337,9 @@
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
 
@@ -383,7 +392,9 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
   SsaDeadPhiElimination(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -405,7 +416,9 @@
   ArenaAllocator allocator(&pool);
   HGraph* graph = BuildSSAGraph(data, &allocator);
   SsaDeadPhiElimination(graph).Run();
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
   liveness.Analyze();
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
@@ -507,7 +520,9 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -522,7 +537,9 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -539,7 +556,9 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -556,7 +575,9 @@
 
   {
     HGraph* graph = BuildIfElseWithPhi(&allocator, &phi, &input1, &input2);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -608,7 +629,9 @@
 
   {
     HGraph* graph = BuildFieldReturn(&allocator, &field, &ret);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -621,7 +644,9 @@
 
   {
     HGraph* graph = BuildFieldReturn(&allocator, &field, &ret);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -671,7 +696,9 @@
 
   {
     HGraph* graph = BuildTwoSubs(&allocator, &first_sub, &second_sub);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -685,7 +712,9 @@
 
   {
     HGraph* graph = BuildTwoSubs(&allocator, &first_sub, &second_sub);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -734,7 +763,9 @@
 
   {
     HGraph* graph = BuildDiv(&allocator, &div);
-    x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+    std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+        X86InstructionSetFeatures::FromCppDefines());
+    x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
     SsaLivenessAnalysis liveness(*graph, &codegen);
     liveness.Analyze();
 
@@ -822,7 +853,9 @@
   locations = new (&allocator) LocationSummary(fourth->GetDefinedBy(), LocationSummary::kNoCall);
   locations->SetOut(Location::RequiresRegister());
 
-  x86::CodeGeneratorX86 codegen(graph, CompilerOptions());
+  std::unique_ptr<const X86InstructionSetFeatures> features_x86(
+      X86InstructionSetFeatures::FromCppDefines());
+  x86::CodeGeneratorX86 codegen(graph, *features_x86.get(), CompilerOptions());
   SsaLivenessAnalysis liveness(*graph, &codegen);
 
   RegisterAllocator register_allocator(&allocator, &codegen, liveness);
diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h
index 6f8b301..b13edb6 100644
--- a/compiler/utils/assembler_test.h
+++ b/compiler/utils/assembler_test.h
@@ -123,6 +123,16 @@
                                                   fmt);
   }
 
+  std::string RepeatFFI(void (Ass::*f)(FPReg, FPReg, const Imm&), size_t imm_bytes, std::string fmt) {
+    return RepeatTemplatedRegistersImm<FPReg, FPReg>(f,  // FP reg, FP reg, immediate variant.
+                                                  GetFPRegisters(),  // Candidates for reg1.
+                                                  GetFPRegisters(),  // Candidates for reg2.
+                                                  &AssemblerTest::GetFPRegName,  // Names reg1.
+                                                  &AssemblerTest::GetFPRegName,  // Names reg2.
+                                                  imm_bytes,  // Forwarded unchanged.
+                                                  fmt);  // Expected-text template.
+  }
+
   std::string RepeatFR(void (Ass::*f)(FPReg, Reg), std::string fmt) {
     return RepeatTemplatedRegisters<FPReg, Reg>(f,
         GetFPRegisters(),
@@ -448,6 +458,57 @@
     return str;
   }
 
+  template <typename Reg1, typename Reg2>  // Builds one fmt line per (reg1, reg2, imm).
+  std::string RepeatTemplatedRegistersImm(void (Ass::*f)(Reg1, Reg2, const Imm&),
+                                          const std::vector<Reg1*> reg1_registers,  // By value.
+                                          const std::vector<Reg2*> reg2_registers,  // By value.
+                                          std::string (AssemblerTest::*GetName1)(const Reg1&),
+                                          std::string (AssemblerTest::*GetName2)(const Reg2&),
+                                          size_t imm_bytes,
+                                          std::string fmt) {
+    std::vector<int64_t> imms = CreateImmediateValues(imm_bytes);  // Immediates to iterate over.
+    WarnOnCombinations(reg1_registers.size() * reg2_registers.size() * imms.size());  // Total count.
+
+    std::string str;  // Accumulated result text, one line per combination.
+    for (auto reg1 : reg1_registers) {
+      for (auto reg2 : reg2_registers) {
+        for (int64_t imm : imms) {
+          Imm new_imm = CreateImmediate(imm);
+          (assembler_.get()->*f)(*reg1, *reg2, new_imm);  // Call f on the assembler under test.
+          std::string base = fmt;  // Fresh copy of the template for this combination.
+
+          std::string reg1_string = (this->*GetName1)(*reg1);
+          size_t reg1_index;
+          while ((reg1_index = base.find(REG1_TOKEN)) != std::string::npos) {  // Every occurrence.
+            base.replace(reg1_index, ConstexprStrLen(REG1_TOKEN), reg1_string);
+          }
+
+          std::string reg2_string = (this->*GetName2)(*reg2);
+          size_t reg2_index;
+          while ((reg2_index = base.find(REG2_TOKEN)) != std::string::npos) {  // Every occurrence.
+            base.replace(reg2_index, ConstexprStrLen(REG2_TOKEN), reg2_string);
+          }
+
+          size_t imm_index = base.find(IMM_TOKEN);  // Only the first IMM_TOKEN is replaced.
+          if (imm_index != std::string::npos) {
+            std::ostringstream sreg;
+            sreg << imm;  // Default stream formatting of the int64_t value.
+            std::string imm_string = sreg.str();
+            base.replace(imm_index, ConstexprStrLen(IMM_TOKEN), imm_string);
+          }
+
+          if (str.size() > 0) {  // Separator goes between lines, not before the first one.
+            str += "\n";
+          }
+          str += base;
+        }
+      }
+    }
+    // Add a newline at the end.
+    str += "\n";
+    return str;
+  }
+
   template <RegisterView kRegView>
   std::string GetRegName(const Reg& reg) {
     std::ostringstream sreg;
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 5773459..b3a1376 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -695,6 +695,28 @@
 }
 
 
+void X86Assembler::roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);  // Prefix byte.
+  EmitUint8(0x0F);  // Opcode bytes 0F 3A 0B.
+  EmitUint8(0x3A);
+  EmitUint8(0x0B);  // Final opcode byte; roundss uses 0x0A here instead.
+  EmitXmmRegisterOperand(dst, src);  // Register operands: dst, then src.
+  EmitUint8(imm.value());  // Immediate operand, emitted as a single byte.
+}
+
+
+void X86Assembler::roundss(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);  // Prefix byte.
+  EmitUint8(0x0F);  // Opcode bytes 0F 3A 0A.
+  EmitUint8(0x3A);
+  EmitUint8(0x0A);  // Final opcode byte; roundsd uses 0x0B here instead.
+  EmitXmmRegisterOperand(dst, src);  // Register operands: dst, then src.
+  EmitUint8(imm.value());  // Immediate operand, emitted as a single byte.
+}
+
+
 void X86Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 6ccf2e3..bdf8843 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -312,6 +312,9 @@
   void ucomiss(XmmRegister a, XmmRegister b);
   void ucomisd(XmmRegister a, XmmRegister b);
 
+  void roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm);  // Needs SSE4.1.
+  void roundss(XmmRegister dst, XmmRegister src, const Immediate& imm);  // Needs SSE4.1.
+
   void sqrtsd(XmmRegister dst, XmmRegister src);
   void sqrtss(XmmRegister dst, XmmRegister src);
 
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index bd155ed..e82d90c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -796,6 +796,30 @@
 }
 
 
+void X86_64Assembler::roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);  // Prefix byte; emitted before the optional REX.
+  EmitOptionalRex32(dst, src);  // Sits between the 0x66 prefix and the opcode bytes.
+  EmitUint8(0x0F);  // Opcode bytes 0F 3A 0B.
+  EmitUint8(0x3A);
+  EmitUint8(0x0B);  // Final opcode byte; roundss uses 0x0A here instead.
+  EmitXmmRegisterOperand(dst.LowBits(), src);  // dst is passed as LowBits(); src as-is.
+  EmitUint8(imm.value());  // Immediate operand, emitted as a single byte.
+}
+
+
+void X86_64Assembler::roundss(XmmRegister dst, XmmRegister src, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66);  // Prefix byte; emitted before the optional REX.
+  EmitOptionalRex32(dst, src);  // Sits between the 0x66 prefix and the opcode bytes.
+  EmitUint8(0x0F);  // Opcode bytes 0F 3A 0A.
+  EmitUint8(0x3A);
+  EmitUint8(0x0A);  // Final opcode byte; roundsd uses 0x0B here instead.
+  EmitXmmRegisterOperand(dst.LowBits(), src);  // dst is passed as LowBits(); src as-is.
+  EmitUint8(imm.value());  // Immediate operand, emitted as a single byte.
+}
+
+
 void X86_64Assembler::sqrtsd(XmmRegister dst, XmmRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0xF2);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 495f74f..39f781c 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -353,6 +353,9 @@
   void ucomiss(XmmRegister a, XmmRegister b);
   void ucomisd(XmmRegister a, XmmRegister b);
 
+  void roundsd(XmmRegister dst, XmmRegister src, const Immediate& imm);  // Needs SSE4.1.
+  void roundss(XmmRegister dst, XmmRegister src, const Immediate& imm);  // Needs SSE4.1.
+
   void sqrtsd(XmmRegister dst, XmmRegister src);
   void sqrtss(XmmRegister dst, XmmRegister src);
 
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 00f508b..4402dfc 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -692,6 +692,14 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::sqrtsd, "sqrtsd %{reg2}, %{reg1}"), "sqrtsd");
 }
 
+TEST_F(AssemblerX86_64Test, Roundss) {  // FP register pairs x immediates for imm_bytes = 1.
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundss, 1, "roundss ${imm}, %{reg2}, %{reg1}"), "roundss");
+}
+
+TEST_F(AssemblerX86_64Test, Roundsd) {  // FP register pairs x immediates for imm_bytes = 1.
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundsd, 1, "roundsd ${imm}, %{reg2}, %{reg1}"), "roundsd");
+}
+
 TEST_F(AssemblerX86_64Test, Xorps) {
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::xorps, "xorps %{reg2}, %{reg1}"), "xorps");
 }
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 203488d..a1834e1 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -561,6 +561,24 @@
         instr++;
         if (prefix[2] == 0x66) {
           switch (*instr) {
+            case 0x0A:
+              opcode1 = "roundss";
+              prefix[2] = 0;
+              has_modrm = true;
+              store = true;
+              src_reg_file = SSE;
+              dst_reg_file = SSE;
+              immediate_bytes = 1;
+              break;
+            case 0x0B:
+              opcode1 = "roundsd";
+              prefix[2] = 0;
+              has_modrm = true;
+              store = true;
+              src_reg_file = SSE;
+              dst_reg_file = SSE;
+              immediate_bytes = 1;
+              break;
             case 0x14:
               opcode1 = "pextrb";
               prefix[2] = 0;
diff --git a/runtime/arch/x86/instruction_set_features_x86.h b/runtime/arch/x86/instruction_set_features_x86.h
index 926fabb..7b61245 100644
--- a/runtime/arch/x86/instruction_set_features_x86.h
+++ b/runtime/arch/x86/instruction_set_features_x86.h
@@ -58,6 +58,8 @@
 
   virtual ~X86InstructionSetFeatures() {}
 
+  bool HasSSE4_1() const { return has_SSE4_1_; }  // Accessor for the stored SSE4.1 flag.
+
  protected:
   // Parse a string of the form "ssse3" adding these to a new InstructionSetFeatures.
   virtual const InstructionSetFeatures*