Merge "Incorrect transformation of (sub,neg) to (sub) for fp"
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 7283710..8c61871 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -171,6 +171,7 @@
   runtime/oat_file_test.cc \
   runtime/oat_file_assistant_test.cc \
   runtime/parsed_options_test.cc \
+  runtime/prebuilt_tools_test.cc \
   runtime/reference_table_test.cc \
   runtime/thread_pool_test.cc \
   runtime/transaction_test.cc \
diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc
index 96d90bb..8ffc86e 100644
--- a/compiler/common_compiler_test.cc
+++ b/compiler/common_compiler_test.cc
@@ -165,7 +165,7 @@
                                               method_inliner_map_.get(),
                                               compiler_kind, instruction_set,
                                               instruction_set_features_.get(),
-                                              true, new std::set<std::string>, nullptr,
+                                              true, new std::unordered_set<std::string>, nullptr,
                                               2, true, true, "", timer_.get(), -1, ""));
   }
   // We typically don't generate an image in unit tests, disable this optimization by default.
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 641d174..1832647 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -76,6 +76,10 @@
 // Whether to produce 64-bit ELF files for 64-bit targets. Leave this off for now.
 static constexpr bool kProduce64BitELFFiles = false;
 
+// Whether the classes-to-compile filter applies only to the boot image or, when given, to all
+// compilations.
+static constexpr bool kRestrictCompilationFiltersToImage = true;
+
 static double Percentage(size_t x, size_t y) {
   return 100.0 * (static_cast<double>(x)) / (static_cast<double>(x + y));
 }
@@ -343,9 +347,9 @@
                                Compiler::Kind compiler_kind,
                                InstructionSet instruction_set,
                                const InstructionSetFeatures* instruction_set_features,
-                               bool image, std::set<std::string>* image_classes,
-                               std::set<std::string>* compiled_classes, size_t thread_count,
-                               bool dump_stats, bool dump_passes,
+                               bool image, std::unordered_set<std::string>* image_classes,
+                               std::unordered_set<std::string>* compiled_classes,
+                               size_t thread_count, bool dump_stats, bool dump_passes,
                                const std::string& dump_cfg_file_name, CumulativeLogger* timer,
                                int swap_fd, const std::string& profile_file)
     : swap_space_(swap_fd == -1 ? nullptr : new SwapSpace(swap_fd, 10 * MB)),
@@ -656,14 +660,14 @@
 }
 
 bool CompilerDriver::IsClassToCompile(const char* descriptor) const {
-  if (!IsImage()) {
+  if (kRestrictCompilationFiltersToImage && !IsImage()) {
     return true;
-  } else {
-    if (classes_to_compile_ == nullptr) {
-      return true;
-    }
-    return classes_to_compile_->find(descriptor) != classes_to_compile_->end();
   }
+
+  if (classes_to_compile_ == nullptr) {
+    return true;
+  }
+  return classes_to_compile_->find(descriptor) != classes_to_compile_->end();
 }
 
 static void ResolveExceptionsForMethod(MutableHandle<mirror::ArtMethod> method_handle,
@@ -723,7 +727,8 @@
 
 static bool RecordImageClassesVisitor(mirror::Class* klass, void* arg)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-  std::set<std::string>* image_classes = reinterpret_cast<std::set<std::string>*>(arg);
+  std::unordered_set<std::string>* image_classes =
+      reinterpret_cast<std::unordered_set<std::string>*>(arg);
   std::string temp;
   image_classes->insert(klass->GetDescriptor(&temp));
   return true;
@@ -795,7 +800,8 @@
   CHECK_NE(image_classes_->size(), 0U);
 }
 
-static void MaybeAddToImageClasses(Handle<mirror::Class> c, std::set<std::string>* image_classes)
+static void MaybeAddToImageClasses(Handle<mirror::Class> c,
+                                   std::unordered_set<std::string>* image_classes)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   Thread* self = Thread::Current();
   StackHandleScope<1> hs(self);
@@ -804,7 +810,8 @@
   std::string temp;
   while (!klass->IsObjectClass()) {
     const char* descriptor = klass->GetDescriptor(&temp);
-    std::pair<std::set<std::string>::iterator, bool> result = image_classes->insert(descriptor);
+    std::pair<std::unordered_set<std::string>::iterator, bool> result =
+        image_classes->insert(descriptor);
     if (!result.second) {  // Previously inserted.
       break;
     }
@@ -826,8 +833,8 @@
 // Note: we can use object pointers because we suspend all threads.
 class ClinitImageUpdate {
  public:
-  static ClinitImageUpdate* Create(std::set<std::string>* image_class_descriptors, Thread* self,
-                                   ClassLinker* linker, std::string* error_msg) {
+  static ClinitImageUpdate* Create(std::unordered_set<std::string>* image_class_descriptors,
+                                   Thread* self, ClassLinker* linker, std::string* error_msg) {
     std::unique_ptr<ClinitImageUpdate> res(new ClinitImageUpdate(image_class_descriptors, self,
                                                                  linker));
     if (res->art_method_class_ == nullptr) {
@@ -867,7 +874,7 @@
   }
 
  private:
-  ClinitImageUpdate(std::set<std::string>* image_class_descriptors, Thread* self,
+  ClinitImageUpdate(std::unordered_set<std::string>* image_class_descriptors, Thread* self,
                     ClassLinker* linker)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) :
       image_class_descriptors_(image_class_descriptors), self_(self) {
@@ -933,7 +940,7 @@
   }
 
   mutable std::unordered_set<mirror::Object*> marked_objects_;
-  std::set<std::string>* const image_class_descriptors_;
+  std::unordered_set<std::string>* const image_class_descriptors_;
   std::vector<mirror::Class*> image_classes_;
   const mirror::Class* art_method_class_;
   const mirror::Class* dex_cache_class_;
@@ -2381,7 +2388,7 @@
 }
 
 bool CompilerDriver::RequiresConstructorBarrier(Thread* self, const DexFile* dex_file,
-                                                uint16_t class_def_index) {
+                                                uint16_t class_def_index) const {
   ReaderMutexLock mu(self, freezing_constructor_lock_);
   return freezing_constructor_classes_.count(ClassReference(dex_file, class_def_index)) != 0;
 }
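
The restructured IsClassToCompile above composes two gates: with kRestrictCompilationFiltersToImage set, the classes-to-compile filter constrains only boot-image compilations, and a null filter means every class is eligible. A minimal self-contained sketch of the same decision, with the driver's members passed in as parameters (names here are illustrative):

    #include <string>
    #include <unordered_set>

    static constexpr bool kRestrictCompilationFiltersToImage = true;

    // Sketch of CompilerDriver::IsClassToCompile; is_image and
    // classes_to_compile stand in for the driver's members.
    bool IsClassToCompile(bool is_image,
                          const std::unordered_set<std::string>* classes_to_compile,
                          const char* descriptor) {
      if (kRestrictCompilationFiltersToImage && !is_image) {
        return true;  // The filter only constrains boot-image compilations.
      }
      if (classes_to_compile == nullptr) {
        return true;  // No filter supplied: every class is eligible.
      }
      return classes_to_compile->count(descriptor) != 0;
    }
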
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index 1a4ae13..ce13a17 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -19,6 +19,7 @@
 
 #include <set>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "arch/instruction_set.h"
@@ -101,8 +102,8 @@
                           Compiler::Kind compiler_kind,
                           InstructionSet instruction_set,
                           const InstructionSetFeatures* instruction_set_features,
-                          bool image, std::set<std::string>* image_classes,
-                          std::set<std::string>* compiled_classes,
+                          bool image, std::unordered_set<std::string>* image_classes,
+                          std::unordered_set<std::string>* compiled_classes,
                           size_t thread_count, bool dump_stats, bool dump_passes,
                           const std::string& dump_cfg_file_name,
                           CumulativeLogger* timer, int swap_fd,
@@ -154,7 +155,7 @@
     return image_;
   }
 
-  const std::set<std::string>* GetImageClasses() const {
+  const std::unordered_set<std::string>* GetImageClasses() const {
     return image_classes_.get();
   }
 
@@ -187,7 +188,8 @@
 
   void AddRequiresConstructorBarrier(Thread* self, const DexFile* dex_file,
                                      uint16_t class_def_index);
-  bool RequiresConstructorBarrier(Thread* self, const DexFile* dex_file, uint16_t class_def_index);
+  bool RequiresConstructorBarrier(Thread* self, const DexFile* dex_file,
+                                  uint16_t class_def_index) const;
 
   // Callbacks from compiler to see what runtime checks must be generated.
 
@@ -397,6 +399,10 @@
     return thread_count_;
   }
 
+  bool GetDumpStats() const {
+    return dump_stats_;
+  }
+
   bool GetDumpPasses() const {
     return dump_passes_;
   }
@@ -419,7 +425,7 @@
   // Checks if class specified by type_idx is one of the image_classes_
   bool IsImageClass(const char* descriptor) const;
 
-  // Checks if the provided class should be compiled, i.e., is in classes_to_compile_.
+  // Checks whether the provided class should be compiled, i.e., is in classes_to_compile_.
   bool IsClassToCompile(const char* descriptor) const;
 
   void RecordClassStatus(ClassReference ref, mirror::Class::Status status)
@@ -584,12 +590,12 @@
   // If image_ is true, specifies the classes that will be included in
   // the image. Note if image_classes_ is nullptr, all classes are
   // included in the image.
-  std::unique_ptr<std::set<std::string>> image_classes_;
+  std::unique_ptr<std::unordered_set<std::string>> image_classes_;
 
-  // If image_ is true, specifies the classes that will be compiled in
-  // the image. Note if classes_to_compile_ is nullptr, all classes are
-  // included in the image.
-  std::unique_ptr<std::set<std::string>> classes_to_compile_;
+  // Specifies the classes that will be compiled. Note that if classes_to_compile_ is nullptr,
+  // all classes are eligible for compilation (duplication filters etc. will still apply).
+  // This option may be restricted to the boot image, depending on a flag in the implementation.
+  std::unique_ptr<std::unordered_set<std::string>> classes_to_compile_;
 
   bool had_hard_verifier_failure_;
 
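
Switching image_classes_ and classes_to_compile_ from std::set to std::unordered_set trades ordered iteration, which none of these call sites rely on, for average O(1) descriptor lookups instead of O(log n) tree walks. A minimal sketch of the lookup pattern, assuming only the standard library (the descriptors are illustrative):

    #include <cassert>
    #include <string>
    #include <unordered_set>

    int main() {
      // Hashed descriptor set: membership tests dominate, ordering is unused.
      std::unordered_set<std::string> image_classes{"Ljava/lang/Object;", "LFoo;"};
      assert(image_classes.find("LFoo;") != image_classes.end());
      assert(image_classes.count("LBar;") == 0);
      return 0;
    }
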
diff --git a/compiler/dwarf/dwarf_test.h b/compiler/dwarf/dwarf_test.h
index dd5e0c2..5a97c3b 100644
--- a/compiler/dwarf/dwarf_test.h
+++ b/compiler/dwarf/dwarf_test.h
@@ -55,36 +55,6 @@
     expected_lines_.push_back(ExpectedLine {substr, next, at_file, at_line});
   }
 
-  static std::string GetObjdumpPath() {
-    const char* android_build_top = getenv("ANDROID_BUILD_TOP");
-    if (android_build_top != nullptr) {
-      std::string host_prebuilts = std::string(android_build_top) +
-                                   "/prebuilts/gcc/linux-x86/host/";
-      // Read the content of the directory.
-      std::set<std::string> entries;
-      DIR* dir = opendir(host_prebuilts.c_str());
-      if (dir != nullptr) {
-        struct dirent* entry;
-        while ((entry = readdir(dir)) != nullptr) {
-          if (strstr(entry->d_name, "linux-glibc")) {
-            entries.insert(host_prebuilts + entry->d_name);
-          }
-        }
-        closedir(dir);
-      }
-      // Strings are sorted so the last one should be the most recent version.
-      if (!entries.empty()) {
-        std::string path = *entries.rbegin() + "/x86_64-linux/bin/objdump";
-        struct stat st;
-        if (stat(path.c_str(), &st) == 0) {
-          return path;  // File exists.
-        }
-      }
-    }
-    ADD_FAILURE() << "Can not find prebuild objdump.";
-    return "objdump";  // Use the system objdump as fallback.
-  }
-
   // Pretty-print the generated DWARF data using objdump.
   template<typename Elf_Word, typename Elf_Sword, typename Elf_Addr, typename Elf_Dyn,
            typename Elf_Sym, typename Elf_Ehdr, typename Elf_Phdr, typename Elf_Shdr>
@@ -130,8 +100,8 @@
 
     // Read the elf file back using objdump.
     std::vector<std::string> lines;
-    std::string cmd = GetObjdumpPath();
-    cmd = cmd + " " + args + " " + file.GetFilename() + " 2>&1";
+    std::string cmd = GetAndroidHostToolsDir();
+    cmd = cmd + "objdump " + args + " " + file.GetFilename() + " 2>&1";
     FILE* output = popen(cmd.data(), "r");
     char buffer[1024];
     const char* line;
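
The hand-rolled GetObjdumpPath() scan of the host prebuilts is replaced by the shared GetAndroidHostToolsDir() helper (covered by the new runtime/prebuilt_tools_test.cc above), so the test now only appends the tool name and its arguments. A self-contained sketch of the run-and-capture pattern used here, assuming a POSIX popen and a tools directory ending in '/':

    #include <cstdio>
    #include <string>
    #include <vector>

    // Runs objdump on `file` and returns its combined stdout/stderr line by
    // line; an empty result means the command could not be started.
    std::vector<std::string> RunObjdump(const std::string& tools_dir,
                                        const std::string& args,
                                        const std::string& file) {
      std::string cmd = tools_dir + "objdump " + args + " " + file + " 2>&1";
      std::vector<std::string> lines;
      FILE* output = popen(cmd.c_str(), "r");
      if (output == nullptr) {
        return lines;
      }
      char buffer[1024];
      while (fgets(buffer, sizeof(buffer), output) != nullptr) {
        lines.push_back(buffer);
      }
      pclose(output);
      return lines;
    }
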
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index cfd525c..8016831 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -124,7 +124,7 @@
   }
 
   ASSERT_TRUE(compiler_driver_->GetImageClasses() != NULL);
-  std::set<std::string> image_classes(*compiler_driver_->GetImageClasses());
+  std::unordered_set<std::string> image_classes(*compiler_driver_->GetImageClasses());
 
   // Need to delete the compiler since it has worker threads which are attached to runtime.
   compiler_driver_.reset();
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 670c897..a99ef34 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -776,7 +776,7 @@
 }
 
 void ImageWriter::DumpImageClasses() {
-  const std::set<std::string>* image_classes = compiler_driver_.GetImageClasses();
+  auto image_classes = compiler_driver_.GetImageClasses();
   CHECK(image_classes != NULL);
   for (const std::string& image_class : *image_classes) {
     LOG(INFO) << " " << image_class;
diff --git a/compiler/jit/jit_compiler.cc b/compiler/jit/jit_compiler.cc
index be2c8c6..9ff7ab8 100644
--- a/compiler/jit/jit_compiler.cc
+++ b/compiler/jit/jit_compiler.cc
@@ -94,7 +94,7 @@
   compiler_driver_.reset(new CompilerDriver(
       compiler_options_.get(), verification_results_.get(), method_inliner_map_.get(),
       Compiler::kQuick, instruction_set, instruction_set_features_.get(), false,
-      nullptr, new std::set<std::string>, 1, false, true,
+      nullptr, nullptr, 1, false, true,
       std::string(), cumulative_logger_.get(), -1, std::string()));
   // Disable dedupe so we can remove compiled methods.
   compiler_driver_->SetDedupeEnabled(false);
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 8a64d81..818d671 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -520,8 +520,24 @@
   UpdateLocal(instruction.VRegA(), current_block_->GetLastInstruction());
 }
 
+static bool RequiresConstructorBarrier(const DexCompilationUnit* cu, const CompilerDriver& driver) {
+  // The dex compilation unit is null only when unit testing.
+  if (cu == nullptr) {
+    return false;
+  }
+
+  Thread* self = Thread::Current();
+  return cu->IsConstructor()
+      && driver.RequiresConstructorBarrier(self, cu->GetDexFile(), cu->GetClassDefIndex());
+}
+
 void HGraphBuilder::BuildReturn(const Instruction& instruction, Primitive::Type type) {
   if (type == Primitive::kPrimVoid) {
+    // Note that we might insert redundant barriers when inlining `super` calls.
+    // TODO: add a data flow analysis to get rid of duplicate barriers.
+    if (RequiresConstructorBarrier(dex_compilation_unit_, *compiler_driver_)) {
+      current_block_->AddInstruction(new (arena_) HMemoryBarrier(kStoreStore));
+    }
     current_block_->AddInstruction(new (arena_) HReturnVoid());
   } else {
     HInstruction* value = LoadLocal(instruction.VRegA(), type);
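
The barrier added above enforces the Java memory model's final-field rule: stores performed by a constructor must become visible before the constructed object is published, so a kStoreStore barrier is emitted ahead of the void return. An illustrative analogue in portable C++ (a release fence is stronger than StoreStore, but it shows the same publication discipline; this is not ART code):

    #include <atomic>

    struct Point {
      int x;
      int y;
    };

    std::atomic<Point*> shared{nullptr};

    void Publish(Point* p) {
      p->x = 1;  // field stores performed by the "constructor"
      p->y = 2;
      // Analogue of HMemoryBarrier(kStoreStore) before the return: the field
      // stores above are ordered before the publishing store below.
      std::atomic_thread_fence(std::memory_order_release);
      shared.store(p, std::memory_order_relaxed);
    }
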
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index f7fa5db..8ab759d 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -36,6 +36,80 @@
 
 namespace art {
 
+// Return whether a location is consistent with a type.
+static bool CheckType(Primitive::Type type, Location location) {
+  if (location.IsFpuRegister()
+      || (location.IsUnallocated() && (location.GetPolicy() == Location::kRequiresFpuRegister))) {
+    return (type == Primitive::kPrimFloat) || (type == Primitive::kPrimDouble);
+  } else if (location.IsRegister() ||
+             (location.IsUnallocated() && (location.GetPolicy() == Location::kRequiresRegister))) {
+    return Primitive::IsIntegralType(type) || (type == Primitive::kPrimNot);
+  } else if (location.IsRegisterPair()) {
+    return type == Primitive::kPrimLong;
+  } else if (location.IsFpuRegisterPair()) {
+    return type == Primitive::kPrimDouble;
+  } else if (location.IsStackSlot()) {
+    return (Primitive::IsIntegralType(type) && type != Primitive::kPrimLong)
+           || (type == Primitive::kPrimFloat)
+           || (type == Primitive::kPrimNot);
+  } else if (location.IsDoubleStackSlot()) {
+    return (type == Primitive::kPrimLong) || (type == Primitive::kPrimDouble);
+  } else if (location.IsConstant()) {
+    if (location.GetConstant()->IsIntConstant()) {
+      return Primitive::IsIntegralType(type) && (type != Primitive::kPrimLong);
+    } else if (location.GetConstant()->IsNullConstant()) {
+      return type == Primitive::kPrimNot;
+    } else if (location.GetConstant()->IsLongConstant()) {
+      return type == Primitive::kPrimLong;
+    } else if (location.GetConstant()->IsFloatConstant()) {
+      return type == Primitive::kPrimFloat;
+    } else {
+      return location.GetConstant()->IsDoubleConstant()
+          && (type == Primitive::kPrimDouble);
+    }
+  } else {
+    return location.IsInvalid() || (location.GetPolicy() == Location::kAny);
+  }
+}
+
+// Check that a location summary is consistent with an instruction.
+static bool CheckTypeConsistency(HInstruction* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  if (locations == nullptr) {
+    return true;
+  }
+
+  if (locations->Out().IsUnallocated()
+      && (locations->Out().GetPolicy() == Location::kSameAsFirstInput)) {
+    DCHECK(CheckType(instruction->GetType(), locations->InAt(0)))
+        << instruction->GetType()
+        << " " << locations->InAt(0);
+  } else {
+    DCHECK(CheckType(instruction->GetType(), locations->Out()))
+        << instruction->GetType()
+        << " " << locations->Out();
+  }
+
+  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+    DCHECK(CheckType(instruction->InputAt(i)->GetType(), locations->InAt(i)))
+      << instruction->InputAt(i)->GetType()
+      << " " << locations->InAt(i);
+  }
+
+  HEnvironment* environment = instruction->GetEnvironment();
+  for (size_t i = 0; i < instruction->EnvironmentSize(); ++i) {
+    if (environment->GetInstructionAt(i) != nullptr) {
+      Primitive::Type type = environment->GetInstructionAt(i)->GetType();
+      DCHECK(CheckType(type, locations->GetEnvironmentAt(i)))
+        << type << " " << locations->GetEnvironmentAt(i);
+    } else {
+      DCHECK(locations->GetEnvironmentAt(i).IsInvalid())
+        << locations->GetEnvironmentAt(i);
+    }
+  }
+  return true;
+}
+
 size_t CodeGenerator::GetCacheOffset(uint32_t index) {
   return mirror::ObjectArray<mirror::Object>::OffsetOfElement(index).SizeValue();
 }
@@ -95,6 +169,7 @@
       if (is_baseline) {
         InitLocationsBaseline(current);
       }
+      DCHECK(CheckTypeConsistency(current));
       current->Accept(instruction_visitor);
     }
   }
@@ -347,6 +422,7 @@
 
 void CodeGenerator::AllocateLocations(HInstruction* instruction) {
   instruction->Accept(GetLocationBuilder());
+  DCHECK(CheckTypeConsistency(instruction));
   LocationSummary* locations = instruction->GetLocations();
   if (!instruction->IsSuspendCheckEntry()) {
     if (locations != nullptr && locations->CanCall()) {
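
CheckTypeConsistency always returns true and reports problems through its nested DCHECKs, so wrapping the call itself in DCHECK(...) makes the whole verification compile away in release builds. A minimal sketch of that idiom using plain assert (illustrative, not ART's macros):

    #include <cassert>

    // Always returns true; the nested assert carries the detailed check.
    // With NDEBUG defined, assert(CheckInvariants(v)) expands to nothing,
    // so the verification costs nothing in release builds.
    static bool CheckInvariants(int value) {
      assert(value >= 0 && "value must be non-negative");
      return true;
    }

    void Process(int value) {
      assert(CheckInvariants(value));
      // ... code that relies on the invariant ...
    }
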
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 2ea9203..38fa043 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -1214,6 +1214,14 @@
   UNUSED(constant);
 }
 
+void LocationsBuilderARM::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
+  memory_barrier->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorARM::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
+  GenerateMemoryBarrier(memory_barrier->GetBarrierKind());
+}
+
 void LocationsBuilderARM::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
@@ -2826,10 +2834,14 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
-
 
   Primitive::Type field_type = field_info.GetFieldType();
+  if (Primitive::IsFloatingPointType(field_type)) {
+    locations->SetInAt(1, Location::RequiresFpuRegister());
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
+
   bool is_wide = field_type == Primitive::kPrimLong || field_type == Primitive::kPrimDouble;
   bool generate_volatile = field_info.IsVolatile()
       && is_wide
@@ -2965,8 +2977,13 @@
       && (field_info.GetFieldType() == Primitive::kPrimDouble)
       && !codegen_->GetInstructionSetFeatures().HasAtomicLdrdAndStrd();
   bool overlap = field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong);
-  locations->SetOut(Location::RequiresRegister(),
-                    (overlap ? Location::kOutputOverlap : Location::kNoOutputOverlap));
+
+  if (Primitive::IsFloatingPointType(instruction->GetType())) {
+    locations->SetOut(Location::RequiresFpuRegister());
+  } else {
+    locations->SetOut(Location::RequiresRegister(),
+                      (overlap ? Location::kOutputOverlap : Location::kNoOutputOverlap));
+  }
   if (volatile_for_double) {
     // The ARM encoding has some additional constraints for ldrexd/strexd:
     // - registers need to be consecutive
@@ -3139,7 +3156,11 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  if (Primitive::IsFloatingPointType(instruction->GetType())) {
+    locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+  } else {
+    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  }
 }
 
 void InstructionCodeGeneratorARM::VisitArrayGet(HArrayGet* instruction) {
@@ -3286,7 +3307,11 @@
   } else {
     locations->SetInAt(0, Location::RequiresRegister());
     locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-    locations->SetInAt(2, Location::RequiresRegister());
+    if (Primitive::IsFloatingPointType(value_type)) {
+      locations->SetInAt(2, Location::RequiresFpuRegister());
+    } else {
+      locations->SetInAt(2, Location::RequiresRegister());
+    }
 
     if (needs_write_barrier) {
       // Temporary registers for the write barrier.
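
Each field and array visitor above now makes the same choice: floating-point values are allocated to FPU registers, everything else to core registers. A self-contained sketch of that predicate-driven selection (the enum and names are illustrative stand-ins for ART's Primitive and Location types):

    #include <iostream>

    enum class PrimType { kInt, kLong, kFloat, kDouble, kNot };

    bool IsFloatingPointType(PrimType type) {
      return type == PrimType::kFloat || type == PrimType::kDouble;
    }

    // Stand-in for the RequiresFpuRegister()/RequiresRegister() split above.
    const char* RegisterKindFor(PrimType type) {
      return IsFloatingPointType(type) ? "fpu register" : "core register";
    }

    int main() {
      std::cout << RegisterKindFor(PrimType::kDouble) << "\n";  // fpu register
      std::cout << RegisterKindFor(PrimType::kNot) << "\n";     // core register
      return 0;
    }
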
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 06f425e..6009036 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -96,10 +96,10 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
 };
 
-class ParallelMoveResolverARM : public ParallelMoveResolver {
+class ParallelMoveResolverARM : public ParallelMoveResolverWithSwap {
  public:
   ParallelMoveResolverARM(ArenaAllocator* allocator, CodeGeneratorARM* codegen)
-      : ParallelMoveResolver(allocator), codegen_(codegen) {}
+      : ParallelMoveResolverWithSwap(allocator), codegen_(codegen) {}
 
   void EmitMove(size_t index) OVERRIDE;
   void EmitSwap(size_t index) OVERRIDE;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index efc41e7..23ba339 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -425,30 +425,67 @@
   CodeGenerator::Finalize(allocator);
 }
 
+void ParallelMoveResolverARM64::PrepareForEmitNativeCode() {
+  // Note: There are 6 kinds of moves:
+  // 1. constant -> GPR/FPR (non-cycle)
+  // 2. constant -> stack (non-cycle)
+  // 3. GPR/FPR -> GPR/FPR
+  // 4. GPR/FPR -> stack
+  // 5. stack -> GPR/FPR
+  // 6. stack -> stack (non-cycle)
+  // Cases 1, 2, and 6 should never be included in a dependency cycle on ARM64. For cases 3, 4,
+  // and 5, VIXL uses at most 1 GPR. VIXL has 2 GPR and 1 FPR temps, and there should be no
+  // intersecting cycles on ARM64, so we always have 1 GPR and 1 FPR available as VIXL temps to
+  // resolve the dependency.
+  vixl_temps_.Open(GetVIXLAssembler());
+}
+
+void ParallelMoveResolverARM64::FinishEmitNativeCode() {
+  vixl_temps_.Close();
+}
+
+Location ParallelMoveResolverARM64::AllocateScratchLocationFor(Location::Kind kind) {
+  DCHECK(kind == Location::kRegister || kind == Location::kFpuRegister ||
+         kind == Location::kStackSlot || kind == Location::kDoubleStackSlot);
+  kind = (kind == Location::kFpuRegister) ? Location::kFpuRegister : Location::kRegister;
+  Location scratch = GetScratchLocation(kind);
+  if (!scratch.Equals(Location::NoLocation())) {
+    return scratch;
+  }
+  // Allocate from VIXL temp registers.
+  if (kind == Location::kRegister) {
+    scratch = LocationFrom(vixl_temps_.AcquireX());
+  } else {
+    DCHECK(kind == Location::kFpuRegister);
+    scratch = LocationFrom(vixl_temps_.AcquireD());
+  }
+  AddScratchLocation(scratch);
+  return scratch;
+}
+
+void ParallelMoveResolverARM64::FreeScratchLocation(Location loc) {
+  if (loc.IsRegister()) {
+    vixl_temps_.Release(XRegisterFrom(loc));
+  } else {
+    DCHECK(loc.IsFpuRegister());
+    vixl_temps_.Release(DRegisterFrom(loc));
+  }
+  RemoveScratchLocation(loc);
+}
+
 void ParallelMoveResolverARM64::EmitMove(size_t index) {
   MoveOperands* move = moves_.Get(index);
   codegen_->MoveLocation(move->GetDestination(), move->GetSource());
 }
 
-void ParallelMoveResolverARM64::EmitSwap(size_t index) {
-  MoveOperands* move = moves_.Get(index);
-  codegen_->SwapLocations(move->GetDestination(), move->GetSource());
-}
-
-void ParallelMoveResolverARM64::RestoreScratch(int reg) {
-  __ Pop(Register(VIXLRegCodeFromART(reg), kXRegSize));
-}
-
-void ParallelMoveResolverARM64::SpillScratch(int reg) {
-  __ Push(Register(VIXLRegCodeFromART(reg), kXRegSize));
-}
-
 void CodeGeneratorARM64::GenerateFrameEntry() {
+  MacroAssembler* masm = GetVIXLAssembler();
+  BlockPoolsScope block_pools(masm);
   __ Bind(&frame_entry_label_);
 
   bool do_overflow_check = FrameNeedsStackCheck(GetFrameSize(), kArm64) || !IsLeafMethod();
   if (do_overflow_check) {
-    UseScratchRegisterScope temps(GetVIXLAssembler());
+    UseScratchRegisterScope temps(masm);
     Register temp = temps.AcquireX();
     DCHECK(GetCompilerOptions().GetImplicitStackOverflowChecks());
     __ Sub(temp, sp, static_cast<int32_t>(GetStackOverflowReservedBytes(kArm64)));
@@ -474,6 +511,7 @@
 }
 
 void CodeGeneratorARM64::GenerateFrameExit() {
+  BlockPoolsScope block_pools(GetVIXLAssembler());
   GetAssembler()->cfi().RememberState();
   if (!HasEmptyFrame()) {
     int frame_size = GetFrameSize();
@@ -726,10 +764,10 @@
       if (destination.IsRegister()) {
         __ Mov(Register(dst), RegisterFrom(source, type));
       } else {
+        DCHECK(destination.IsFpuRegister());
         __ Fmov(FPRegister(dst), FPRegisterFrom(source, type));
       }
     }
-
   } else {  // The destination is not a register. It must be a stack slot.
     DCHECK(destination.IsStackSlot() || destination.IsDoubleStackSlot());
     if (source.IsRegister() || source.IsFpuRegister()) {
@@ -772,67 +810,6 @@
   }
 }
 
-void CodeGeneratorARM64::SwapLocations(Location loc1, Location loc2) {
-  DCHECK(!loc1.IsConstant());
-  DCHECK(!loc2.IsConstant());
-
-  if (loc1.Equals(loc2)) {
-    return;
-  }
-
-  UseScratchRegisterScope temps(GetAssembler()->vixl_masm_);
-
-  bool is_slot1 = loc1.IsStackSlot() || loc1.IsDoubleStackSlot();
-  bool is_slot2 = loc2.IsStackSlot() || loc2.IsDoubleStackSlot();
-  bool is_fp_reg1 = loc1.IsFpuRegister();
-  bool is_fp_reg2 = loc2.IsFpuRegister();
-
-  if (loc2.IsRegister() && loc1.IsRegister()) {
-    Register r1 = XRegisterFrom(loc1);
-    Register r2 = XRegisterFrom(loc2);
-    Register tmp = temps.AcquireSameSizeAs(r1);
-    __ Mov(tmp, r2);
-    __ Mov(r2, r1);
-    __ Mov(r1, tmp);
-  } else if (is_fp_reg2 && is_fp_reg1) {
-    FPRegister r1 = DRegisterFrom(loc1);
-    FPRegister r2 = DRegisterFrom(loc2);
-    FPRegister tmp = temps.AcquireSameSizeAs(r1);
-    __ Fmov(tmp, r2);
-    __ Fmov(r2, r1);
-    __ Fmov(r1, tmp);
-  } else if (is_slot1 != is_slot2) {
-    MemOperand mem = StackOperandFrom(is_slot1 ? loc1 : loc2);
-    Location reg_loc = is_slot1 ? loc2 : loc1;
-    CPURegister reg, tmp;
-    if (reg_loc.IsFpuRegister()) {
-      reg = DRegisterFrom(reg_loc);
-      tmp = temps.AcquireD();
-    } else {
-      reg = XRegisterFrom(reg_loc);
-      tmp = temps.AcquireX();
-    }
-    __ Ldr(tmp, mem);
-    __ Str(reg, mem);
-    if (reg_loc.IsFpuRegister()) {
-      __ Fmov(FPRegister(reg), FPRegister(tmp));
-    } else {
-      __ Mov(Register(reg), Register(tmp));
-    }
-  } else if (is_slot1 && is_slot2) {
-    MemOperand mem1 = StackOperandFrom(loc1);
-    MemOperand mem2 = StackOperandFrom(loc2);
-    Register tmp1 = loc1.IsStackSlot() ? temps.AcquireW() : temps.AcquireX();
-    Register tmp2 = temps.AcquireSameSizeAs(tmp1);
-    __ Ldr(tmp1, mem1);
-    __ Ldr(tmp2, mem2);
-    __ Str(tmp1, mem2);
-    __ Str(tmp2, mem1);
-  } else {
-    LOG(FATAL) << "Unimplemented";
-  }
-}
-
 void CodeGeneratorARM64::Load(Primitive::Type type,
                               CPURegister dst,
                               const MemOperand& src) {
@@ -865,7 +842,9 @@
 void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction,
                                      CPURegister dst,
                                      const MemOperand& src) {
-  UseScratchRegisterScope temps(GetVIXLAssembler());
+  MacroAssembler* masm = GetVIXLAssembler();
+  BlockPoolsScope block_pools(masm);
+  UseScratchRegisterScope temps(masm);
   Register temp_base = temps.AcquireX();
   Primitive::Type type = instruction->GetType();
 
@@ -995,6 +974,7 @@
                                        HInstruction* instruction,
                                        uint32_t dex_pc,
                                        SlowPathCode* slow_path) {
+  BlockPoolsScope block_pools(GetVIXLAssembler());
   __ Ldr(lr, MemOperand(tr, entry_point_offset));
   __ Blr(lr);
   if (instruction != nullptr) {
@@ -1130,6 +1110,83 @@
   }
 }
 
+void LocationsBuilderARM64::HandleFieldGet(HInstruction* instruction) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+  locations->SetInAt(0, Location::RequiresRegister());
+  if (Primitive::IsFloatingPointType(instruction->GetType())) {
+    locations->SetOut(Location::RequiresFpuRegister());
+  } else {
+    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  }
+}
+
+void InstructionCodeGeneratorARM64::HandleFieldGet(HInstruction* instruction,
+                                                   const FieldInfo& field_info) {
+  DCHECK(instruction->IsInstanceFieldGet() || instruction->IsStaticFieldGet());
+  BlockPoolsScope block_pools(GetVIXLAssembler());
+
+  MemOperand field = HeapOperand(InputRegisterAt(instruction, 0), field_info.GetFieldOffset());
+  bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
+
+  if (field_info.IsVolatile()) {
+    if (use_acquire_release) {
+      // NB: LoadAcquire will record the pc info if needed.
+      codegen_->LoadAcquire(instruction, OutputCPURegister(instruction), field);
+    } else {
+      codegen_->Load(field_info.GetFieldType(), OutputCPURegister(instruction), field);
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+      // For IRIW sequential consistency kLoadAny is not sufficient.
+      GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
+    }
+  } else {
+    codegen_->Load(field_info.GetFieldType(), OutputCPURegister(instruction), field);
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
+  }
+}
+
+void LocationsBuilderARM64::HandleFieldSet(HInstruction* instruction) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+  locations->SetInAt(0, Location::RequiresRegister());
+  if (Primitive::IsFloatingPointType(instruction->InputAt(1)->GetType())) {
+    locations->SetInAt(1, Location::RequiresFpuRegister());
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
+}
+
+void InstructionCodeGeneratorARM64::HandleFieldSet(HInstruction* instruction,
+                                                   const FieldInfo& field_info) {
+  DCHECK(instruction->IsInstanceFieldSet() || instruction->IsStaticFieldSet());
+  BlockPoolsScope block_pools(GetVIXLAssembler());
+
+  Register obj = InputRegisterAt(instruction, 0);
+  CPURegister value = InputCPURegisterAt(instruction, 1);
+  Offset offset = field_info.GetFieldOffset();
+  Primitive::Type field_type = field_info.GetFieldType();
+  bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
+
+  if (field_info.IsVolatile()) {
+    if (use_acquire_release) {
+      codegen_->StoreRelease(field_type, value, HeapOperand(obj, offset));
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+    } else {
+      GenerateMemoryBarrier(MemBarrierKind::kAnyStore);
+      codegen_->Store(field_type, value, HeapOperand(obj, offset));
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
+      GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
+    }
+  } else {
+    codegen_->Store(field_type, value, HeapOperand(obj, offset));
+    codegen_->MaybeRecordImplicitNullCheck(instruction);
+  }
+
+  if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
+    codegen_->MarkGCCard(obj, Register(value));
+  }
+}
+
 void InstructionCodeGeneratorARM64::HandleBinaryOp(HBinaryOperation* instr) {
   Primitive::Type type = instr->GetType();
 
@@ -1250,7 +1307,11 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-  locations->SetOut(Location::RequiresRegister());
+  if (Primitive::IsFloatingPointType(instruction->GetType())) {
+    locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+  } else {
+    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  }
 }
 
 void InstructionCodeGeneratorARM64::VisitArrayGet(HArrayGet* instruction) {
@@ -1260,7 +1321,9 @@
   Location index = locations->InAt(1);
   size_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(type)).Uint32Value();
   MemOperand source = HeapOperand(obj);
-  UseScratchRegisterScope temps(GetVIXLAssembler());
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope temps(masm);
+  BlockPoolsScope block_pools(masm);
 
   if (index.IsConstant()) {
     offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(type);
@@ -1283,55 +1346,71 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitArrayLength(HArrayLength* instruction) {
+  BlockPoolsScope block_pools(GetVIXLAssembler());
   __ Ldr(OutputRegister(instruction),
          HeapOperand(InputRegisterAt(instruction, 0), mirror::Array::LengthOffset()));
   codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderARM64::VisitArraySet(HArraySet* instruction) {
-  Primitive::Type value_type = instruction->GetComponentType();
-  bool is_object = value_type == Primitive::kPrimNot;
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(
-      instruction, is_object ? LocationSummary::kCall : LocationSummary::kNoCall);
-  if (is_object) {
+  if (instruction->NeedsTypeCheck()) {
+    LocationSummary* locations =
+        new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kCall);
     InvokeRuntimeCallingConvention calling_convention;
     locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0)));
     locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1)));
     locations->SetInAt(2, LocationFrom(calling_convention.GetRegisterAt(2)));
   } else {
+    LocationSummary* locations =
+        new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
     locations->SetInAt(0, Location::RequiresRegister());
     locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-    locations->SetInAt(2, Location::RequiresRegister());
+    if (Primitive::IsFloatingPointType(instruction->InputAt(2)->GetType())) {
+      locations->SetInAt(2, Location::RequiresFpuRegister());
+    } else {
+      locations->SetInAt(2, Location::RequiresRegister());
+    }
   }
 }
 
 void InstructionCodeGeneratorARM64::VisitArraySet(HArraySet* instruction) {
   Primitive::Type value_type = instruction->GetComponentType();
-  if (value_type == Primitive::kPrimNot) {
+  LocationSummary* locations = instruction->GetLocations();
+  bool needs_runtime_call = locations->WillCall();
+
+  if (needs_runtime_call) {
     codegen_->InvokeRuntime(
         QUICK_ENTRY_POINT(pAputObject), instruction, instruction->GetDexPc(), nullptr);
     CheckEntrypointTypes<kQuickAputObject, void, mirror::Array*, int32_t, mirror::Object*>();
   } else {
-    LocationSummary* locations = instruction->GetLocations();
     Register obj = InputRegisterAt(instruction, 0);
     CPURegister value = InputCPURegisterAt(instruction, 2);
     Location index = locations->InAt(1);
     size_t offset = mirror::Array::DataOffset(Primitive::ComponentSize(value_type)).Uint32Value();
     MemOperand destination = HeapOperand(obj);
-    UseScratchRegisterScope temps(GetVIXLAssembler());
+    MacroAssembler* masm = GetVIXLAssembler();
+    BlockPoolsScope block_pools(masm);
+    {
+      // We use a block to end the scratch scope before the write barrier, thus
+      // freeing the temporary registers so they can be used in `MarkGCCard`.
+      UseScratchRegisterScope temps(masm);
 
-    if (index.IsConstant()) {
-      offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(value_type);
-      destination = HeapOperand(obj, offset);
-    } else {
-      Register temp = temps.AcquireSameSizeAs(obj);
-      Register index_reg = InputRegisterAt(instruction, 1);
-      __ Add(temp, obj, Operand(index_reg, LSL, Primitive::ComponentSizeShift(value_type)));
-      destination = HeapOperand(temp, offset);
+      if (index.IsConstant()) {
+        offset += Int64ConstantFrom(index) << Primitive::ComponentSizeShift(value_type);
+        destination = HeapOperand(obj, offset);
+      } else {
+        Register temp = temps.AcquireSameSizeAs(obj);
+        Register index_reg = InputRegisterAt(instruction, 1);
+        __ Add(temp, obj, Operand(index_reg, LSL, Primitive::ComponentSizeShift(value_type)));
+        destination = HeapOperand(temp, offset);
+      }
+
+      codegen_->Store(value_type, value, destination);
+      codegen_->MaybeRecordImplicitNullCheck(instruction);
     }
-
-    codegen_->Store(value_type, value, destination);
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
+    if (CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue())) {
+      codegen_->MarkGCCard(obj, value.W());
+    }
   }
 }
 
@@ -1742,64 +1821,19 @@
 }
 
 void LocationsBuilderARM64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
-  LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  HandleFieldGet(instruction);
 }
 
 void InstructionCodeGeneratorARM64::VisitInstanceFieldGet(HInstanceFieldGet* instruction) {
-  MemOperand field = HeapOperand(InputRegisterAt(instruction, 0), instruction->GetFieldOffset());
-  bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
-
-  if (instruction->IsVolatile()) {
-    if (use_acquire_release) {
-      // NB: LoadAcquire will record the pc info if needed.
-      codegen_->LoadAcquire(instruction, OutputCPURegister(instruction), field);
-    } else {
-      codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
-      codegen_->MaybeRecordImplicitNullCheck(instruction);
-      // For IRIW sequential consistency kLoadAny is not sufficient.
-      GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
-    }
-  } else {
-    codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
-  }
+  HandleFieldGet(instruction, instruction->GetFieldInfo());
 }
 
 void LocationsBuilderARM64::VisitInstanceFieldSet(HInstanceFieldSet* instruction) {
-  LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  HandleFieldSet(instruction);
 }
 
 void InstructionCodeGeneratorARM64::VisitInstanceFieldSet(HInstanceFieldSet* instruction) {
-  Register obj = InputRegisterAt(instruction, 0);
-  CPURegister value = InputCPURegisterAt(instruction, 1);
-  Offset offset = instruction->GetFieldOffset();
-  Primitive::Type field_type = instruction->GetFieldType();
-  bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
-
-  if (instruction->IsVolatile()) {
-    if (use_acquire_release) {
-      codegen_->StoreRelease(field_type, value, HeapOperand(obj, offset));
-      codegen_->MaybeRecordImplicitNullCheck(instruction);
-    } else {
-      GenerateMemoryBarrier(MemBarrierKind::kAnyStore);
-      codegen_->Store(field_type, value, HeapOperand(obj, offset));
-      codegen_->MaybeRecordImplicitNullCheck(instruction);
-      GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
-    }
-  } else {
-    codegen_->Store(field_type, value, HeapOperand(obj, offset));
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
-  }
-
-  if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
-    codegen_->MarkGCCard(obj, Register(value));
-  }
+  HandleFieldSet(instruction, instruction->GetFieldInfo());
 }
 
 void LocationsBuilderARM64::VisitInstanceOf(HInstanceOf* instruction) {
@@ -1898,7 +1932,9 @@
 
   // The register ip1 is required to be used for the hidden argument in
   // art_quick_imt_conflict_trampoline, so prevent VIXL from using it.
-  UseScratchRegisterScope scratch_scope(GetVIXLAssembler());
+  MacroAssembler* masm = GetVIXLAssembler();
+  UseScratchRegisterScope scratch_scope(masm);
+  BlockPoolsScope block_pools(masm);
   scratch_scope.Exclude(ip1);
   __ Mov(ip1, invoke->GetDexMethodIndex());
 
@@ -1984,6 +2020,7 @@
     return;
   }
 
+  BlockPoolsScope block_pools(GetVIXLAssembler());
   Register temp = WRegisterFrom(invoke->GetLocations()->GetTemp(0));
   codegen_->GenerateStaticOrDirectCall(invoke, temp);
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
@@ -2002,6 +2039,8 @@
   Offset class_offset = mirror::Object::ClassOffset();
   Offset entry_point = mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset(kArm64WordSize);
 
+  BlockPoolsScope block_pools(GetVIXLAssembler());
+
   // temp = object->GetClass();
   if (receiver.IsStackSlot()) {
     __ Ldr(temp, MemOperand(sp, receiver.GetStackIndex()));
@@ -2302,8 +2341,9 @@
   if (codegen_->CanMoveNullCheckToUser(instruction)) {
     return;
   }
-  Location obj = instruction->GetLocations()->InAt(0);
 
+  BlockPoolsScope block_pools(GetVIXLAssembler());
+  Location obj = instruction->GetLocations()->InAt(0);
   __ Ldr(wzr, HeapOperandFrom(obj, Offset(0)));
   codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
 }
@@ -2430,6 +2470,14 @@
   }
 }
 
+void LocationsBuilderARM64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
+  memory_barrier->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorARM64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
+  GenerateMemoryBarrier(memory_barrier->GetBarrierKind());
+}
+
 void LocationsBuilderARM64::VisitReturn(HReturn* instruction) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
   Primitive::Type return_type = instruction->InputAt(0)->GetType();
@@ -2503,59 +2551,19 @@
 }
 
 void LocationsBuilderARM64::VisitStaticFieldGet(HStaticFieldGet* instruction) {
-  LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  HandleFieldGet(instruction);
 }
 
 void InstructionCodeGeneratorARM64::VisitStaticFieldGet(HStaticFieldGet* instruction) {
-  MemOperand field = HeapOperand(InputRegisterAt(instruction, 0), instruction->GetFieldOffset());
-  bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
-
-  if (instruction->IsVolatile()) {
-    if (use_acquire_release) {
-      // NB: LoadAcquire will record the pc info if needed.
-      codegen_->LoadAcquire(instruction, OutputCPURegister(instruction), field);
-    } else {
-      codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
-      // For IRIW sequential consistency kLoadAny is not sufficient.
-      GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
-    }
-  } else {
-    codegen_->Load(instruction->GetType(), OutputCPURegister(instruction), field);
-  }
+  HandleFieldGet(instruction, instruction->GetFieldInfo());
 }
 
 void LocationsBuilderARM64::VisitStaticFieldSet(HStaticFieldSet* instruction) {
-  LocationSummary* locations =
-      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  HandleFieldSet(instruction);
 }
 
 void InstructionCodeGeneratorARM64::VisitStaticFieldSet(HStaticFieldSet* instruction) {
-  Register cls = InputRegisterAt(instruction, 0);
-  CPURegister value = InputCPURegisterAt(instruction, 1);
-  Offset offset = instruction->GetFieldOffset();
-  Primitive::Type field_type = instruction->GetFieldType();
-  bool use_acquire_release = codegen_->GetInstructionSetFeatures().PreferAcquireRelease();
-
-  if (instruction->IsVolatile()) {
-    if (use_acquire_release) {
-      codegen_->StoreRelease(field_type, value, HeapOperand(cls, offset));
-    } else {
-      GenerateMemoryBarrier(MemBarrierKind::kAnyStore);
-      codegen_->Store(field_type, value, HeapOperand(cls, offset));
-      GenerateMemoryBarrier(MemBarrierKind::kAnyAny);
-    }
-  } else {
-    codegen_->Store(field_type, value, HeapOperand(cls, offset));
-  }
-
-  if (CodeGenerator::StoreNeedsWriteBarrier(field_type, instruction->InputAt(1))) {
-    codegen_->MarkGCCard(cls, Register(value));
-  }
+  HandleFieldSet(instruction, instruction->GetFieldInfo());
 }
 
 void LocationsBuilderARM64::VisitSuspendCheck(HSuspendCheck* instruction) {
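
Several hunks above bracket load/store emission in a BlockPoolsScope so VIXL cannot emit a literal pool between an instruction and the MaybeRecordImplicitNullCheck or RecordPcInfo that must point at it. A sketch of the underlying RAII pattern (this Assembler is a stand-in, not VIXL's MacroAssembler):

    // Pool emission is held off for the lifetime of the scope, so a PC
    // recorded inside the scope always refers to the intended instruction.
    class Assembler {
     public:
      void BlockPools() { ++pool_block_depth_; }
      void ReleasePools() { --pool_block_depth_; }
      bool PoolsBlocked() const { return pool_block_depth_ > 0; }
     private:
      int pool_block_depth_ = 0;
    };

    class BlockPoolsScope {
     public:
      explicit BlockPoolsScope(Assembler* masm) : masm_(masm) { masm_->BlockPools(); }
      ~BlockPoolsScope() { masm_->ReleasePools(); }
     private:
      Assembler* const masm_;
    };

    void EmitFieldLoad(Assembler* masm) {
      BlockPoolsScope block_pools(masm);
      // ... emit the load and record its PC while pools are blocked ...
    }
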
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 07c6dd0..5a35867 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -159,6 +159,8 @@
   void GenerateMemoryBarrier(MemBarrierKind kind);
   void GenerateSuspendCheck(HSuspendCheck* instruction, HBasicBlock* successor);
   void HandleBinaryOp(HBinaryOperation* instr);
+  void HandleFieldSet(HInstruction* instruction, const FieldInfo& field_info);
+  void HandleFieldGet(HInstruction* instruction, const FieldInfo& field_info);
   void HandleShift(HBinaryOperation* instr);
   void GenerateImplicitNullCheck(HNullCheck* instruction);
   void GenerateExplicitNullCheck(HNullCheck* instruction);
@@ -185,8 +187,10 @@
 
  private:
   void HandleBinaryOp(HBinaryOperation* instr);
-  void HandleShift(HBinaryOperation* instr);
+  void HandleFieldSet(HInstruction* instruction);
+  void HandleFieldGet(HInstruction* instruction);
   void HandleInvoke(HInvoke* instr);
+  void HandleShift(HBinaryOperation* instr);
 
   CodeGeneratorARM64* const codegen_;
   InvokeDexCallingConventionVisitor parameter_visitor_;
@@ -194,15 +198,17 @@
   DISALLOW_COPY_AND_ASSIGN(LocationsBuilderARM64);
 };
 
-class ParallelMoveResolverARM64 : public ParallelMoveResolver {
+class ParallelMoveResolverARM64 : public ParallelMoveResolverNoSwap {
  public:
   ParallelMoveResolverARM64(ArenaAllocator* allocator, CodeGeneratorARM64* codegen)
-      : ParallelMoveResolver(allocator), codegen_(codegen) {}
+      : ParallelMoveResolverNoSwap(allocator), codegen_(codegen), vixl_temps_() {}
 
+ protected:
+  void PrepareForEmitNativeCode() OVERRIDE;
+  void FinishEmitNativeCode() OVERRIDE;
+  Location AllocateScratchLocationFor(Location::Kind kind) OVERRIDE;
+  void FreeScratchLocation(Location loc) OVERRIDE;
   void EmitMove(size_t index) OVERRIDE;
-  void EmitSwap(size_t index) OVERRIDE;
-  void RestoreScratch(int reg) OVERRIDE;
-  void SpillScratch(int reg) OVERRIDE;
 
  private:
   Arm64Assembler* GetAssembler() const;
@@ -211,6 +217,7 @@
   }
 
   CodeGeneratorARM64* const codegen_;
+  vixl::UseScratchRegisterScope vixl_temps_;
 
   DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverARM64);
 };
@@ -318,7 +325,6 @@
   // locations, and is used for optimisation and debugging.
   void MoveLocation(Location destination, Location source,
                     Primitive::Type type = Primitive::kPrimVoid);
-  void SwapLocations(Location loc_1, Location loc_2);
   void Load(Primitive::Type type, vixl::CPURegister dst, const vixl::MemOperand& src);
   void Store(Primitive::Type type, vixl::CPURegister rt, const vixl::MemOperand& dst);
   void LoadCurrentMethod(vixl::Register current_method);
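
ARM, x86, and x86-64 keep the swap-based resolver, which breaks cycles in the parallel-move graph with EmitSwap, while ARM64 moves to a no-swap resolver that breaks cycles by copying through scratch locations drawn from VIXL's temp pool between PrepareForEmitNativeCode and FinishEmitNativeCode. An interface sketch contrasting the two flavors (the bodies and the Location type are illustrative):

    #include <cstddef>

    struct Location { /* register, FPU register, stack slot, ... */ };

    // Cycle resolution by swapping two locations in place (ARM, x86, x86-64).
    class ParallelMoveResolverWithSwap {
     public:
      virtual ~ParallelMoveResolverWithSwap() {}
      virtual void EmitMove(size_t index) = 0;
      virtual void EmitSwap(size_t index) = 0;
    };

    // Cycle resolution by copying through scratch locations (ARM64); emission
    // is bracketed so scratch registers can be pooled and reused.
    class ParallelMoveResolverNoSwap {
     public:
      virtual ~ParallelMoveResolverNoSwap() {}
      virtual void PrepareForEmitNativeCode() = 0;
      virtual void FinishEmitNativeCode() = 0;
      virtual Location AllocateScratchLocationFor(int kind) = 0;
      virtual void FreeScratchLocation(Location loc) = 0;
      virtual void EmitMove(size_t index) = 0;
    };
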
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 879216d..86e84ac 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -877,7 +877,7 @@
       if (rhs.IsRegister()) {
         __ cmpl(lhs.AsRegister<Register>(), rhs.AsRegister<Register>());
       } else if (rhs.IsConstant()) {
-        int32_t constant = rhs.GetConstant()->AsIntConstant()->GetValue();
+        int32_t constant = CodeGenerator::GetInt32ValueOf(rhs.GetConstant());
         if (constant == 0) {
           __ testl(lhs.AsRegister<Register>(), lhs.AsRegister<Register>());
         } else {
@@ -1120,6 +1120,14 @@
   UNUSED(constant);
 }
 
+void LocationsBuilderX86::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
+  memory_barrier->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
+  GenerateMemoryBarrier(memory_barrier->GetBarrierKind());
+}
+
 void LocationsBuilderX86::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
@@ -1212,6 +1220,7 @@
 
   codegen_->GenerateStaticOrDirectCall(
       invoke, invoke->GetLocations()->GetTemp(0).AsRegister<Register>());
+  codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
 void LocationsBuilderX86::VisitInvokeVirtual(HInvokeVirtual* invoke) {
@@ -3098,7 +3107,6 @@
   }
 
   DCHECK(!IsLeafMethod());
-  RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
 void CodeGeneratorX86::MarkGCCard(Register temp, Register card, Register object, Register value) {
@@ -3119,11 +3127,15 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
 
-  // The output overlaps in case of long: we don't want the low move to overwrite
-  // the object's location.
-  locations->SetOut(Location::RequiresRegister(),
-      (instruction->GetType() == Primitive::kPrimLong) ? Location::kOutputOverlap
-                                                       : Location::kNoOutputOverlap);
+  if (Primitive::IsFloatingPointType(instruction->GetType())) {
+    locations->SetOut(Location::RequiresFpuRegister());
+  } else {
+    // The output overlaps in case of long: we don't want the low move to overwrite
+    // the object's location.
+    locations->SetOut(Location::RequiresRegister(),
+        (instruction->GetType() == Primitive::kPrimLong) ? Location::kOutputOverlap
+                                                         : Location::kNoOutputOverlap);
+  }
 
   if (field_info.IsVolatile() && (field_info.GetFieldType() == Primitive::kPrimLong)) {
     // Long values can be loaded atomically into an XMM using movsd.
@@ -3229,6 +3241,8 @@
   if (is_byte_type) {
     // Ensure the value is in a byte register.
     locations->SetInAt(1, Location::RegisterLocation(EAX));
+  } else if (Primitive::IsFloatingPointType(field_type)) {
+    locations->SetInAt(1, Location::RequiresFpuRegister());
   } else {
     locations->SetInAt(1, Location::RequiresRegister());
   }
@@ -3418,11 +3432,15 @@
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
-  // The output overlaps in case of long: we don't want the low move to overwrite
-  // the array's location.
-  locations->SetOut(Location::RequiresRegister(),
-      (instruction->GetType() == Primitive::kPrimLong) ? Location::kOutputOverlap
-                                                       : Location::kNoOutputOverlap);
+  if (Primitive::IsFloatingPointType(instruction->GetType())) {
+    locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+  } else {
+    // The output overlaps in case of long: we don't want the low move to overwrite
+    // the array's location.
+    locations->SetOut(Location::RequiresRegister(),
+        (instruction->GetType() == Primitive::kPrimLong) ? Location::kOutputOverlap
+                                                         : Location::kNoOutputOverlap);
+  }
 }
 
 void InstructionCodeGeneratorX86::VisitArrayGet(HArrayGet* instruction) {
@@ -3578,14 +3596,10 @@
     if (is_byte_type) {
       // Ensure the value is in a byte register.
       locations->SetInAt(2, Location::ByteRegisterOrConstant(EAX, instruction->InputAt(2)));
+    } else if (Primitive::IsFloatingPointType(value_type)) {
+      locations->SetInAt(2, Location::RequiresFpuRegister());
     } else {
-      bool is_fp_type = (value_type == Primitive::kPrimFloat)
-          || (value_type == Primitive::kPrimDouble);
-      if (is_fp_type) {
-        locations->SetInAt(2, Location::RequiresFpuRegister());
-      } else {
-        locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2)));
-      }
+      locations->SetInAt(2, Location::RegisterOrConstant(instruction->InputAt(2)));
     }
     // Temporary registers for the write barrier.
     if (needs_write_barrier) {
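
The cmpl change above matters because the right-hand constant can now be a null reference constant as well as an int constant: GetInt32ValueOf folds both to an int32_t (null compares as 0), where AsIntConstant()->GetValue() would have been wrong for a null constant. A stand-in sketch of that accessor (the struct mimics, but is not, ART's HConstant hierarchy):

    #include <cstdint>

    struct Constant {
      bool is_null_reference;
      int32_t int_value;  // valid only when !is_null_reference
    };

    // Null references compare equal to zero, so both constant kinds can go
    // through the same integer comparison path.
    int32_t GetInt32ValueOf(const Constant& constant) {
      return constant.is_null_reference ? 0 : constant.int_value;
    }
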
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 368ae0f..07476c6 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -93,10 +93,10 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
 };
 
-class ParallelMoveResolverX86 : public ParallelMoveResolver {
+class ParallelMoveResolverX86 : public ParallelMoveResolverWithSwap {
  public:
   ParallelMoveResolverX86(ArenaAllocator* allocator, CodeGeneratorX86* codegen)
-      : ParallelMoveResolver(allocator), codegen_(codegen) {}
+      : ParallelMoveResolverWithSwap(allocator), codegen_(codegen) {}
 
   void EmitMove(size_t index) OVERRIDE;
   void EmitSwap(size_t index) OVERRIDE;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index a3d3490..d8d2ae3 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1145,6 +1145,14 @@
   UNUSED(constant);
 }
 
+void LocationsBuilderX86_64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
+  memory_barrier->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitMemoryBarrier(HMemoryBarrier* memory_barrier) {
+  GenerateMemoryBarrier(memory_barrier->GetBarrierKind());
+}
+
 void LocationsBuilderX86_64::VisitReturnVoid(HReturnVoid* ret) {
   ret->SetLocations(nullptr);
 }
@@ -3035,7 +3043,11 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  if (Primitive::IsFloatingPointType(instruction->GetType())) {
+    locations->SetOut(Location::RequiresFpuRegister());
+  } else {
+    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  }
 }
 
 void InstructionCodeGeneratorX86_64::HandleFieldGet(HInstruction* instruction,
@@ -3113,7 +3125,11 @@
       CodeGenerator::StoreNeedsWriteBarrier(field_info.GetFieldType(), instruction->InputAt(1));
 
   locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetInAt(1, Location::RequiresRegister());
+  if (Primitive::IsFloatingPointType(instruction->InputAt(1)->GetType())) {
+    locations->SetInAt(1, Location::RequiresFpuRegister());
+  } else {
+    locations->SetInAt(1, Location::RequiresRegister());
+  }
   if (needs_write_barrier) {
     // Temporary registers for the write barrier.
     locations->AddTemp(Location::RequiresRegister());
@@ -3277,7 +3293,11 @@
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(
       1, Location::RegisterOrConstant(instruction->InputAt(1)));
-  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  if (Primitive::IsFloatingPointType(instruction->GetType())) {
+    locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
+  } else {
+    locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitArrayGet(HArrayGet* instruction) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index b4876ef..6cdc822 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -102,10 +102,10 @@
   DISALLOW_COPY_AND_ASSIGN(SlowPathCodeX86_64);
 };
 
-class ParallelMoveResolverX86_64 : public ParallelMoveResolver {
+class ParallelMoveResolverX86_64 : public ParallelMoveResolverWithSwap {
  public:
   ParallelMoveResolverX86_64(ArenaAllocator* allocator, CodeGeneratorX86_64* codegen)
-      : ParallelMoveResolver(allocator), codegen_(codegen) {}
+      : ParallelMoveResolverWithSwap(allocator), codegen_(codegen) {}
 
   void EmitMove(size_t index) OVERRIDE;
   void EmitSwap(size_t index) OVERRIDE;
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index afcff1e..94f56e5 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -18,8 +18,10 @@
 
 #include "arch/instruction_set.h"
 #include "arch/arm/instruction_set_features_arm.h"
+#include "arch/arm/registers_arm.h"
 #include "arch/arm64/instruction_set_features_arm64.h"
 #include "arch/x86/instruction_set_features_x86.h"
+#include "arch/x86/registers_x86.h"
 #include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "base/macros.h"
 #include "builder.h"
@@ -37,6 +39,8 @@
 #include "register_allocator.h"
 #include "ssa_liveness_analysis.h"
 #include "utils.h"
+#include "utils/arm/managed_register_arm.h"
+#include "utils/x86/managed_register_x86.h"
 
 #include "gtest/gtest.h"
 
@@ -53,17 +57,42 @@
                        const ArmInstructionSetFeatures& isa_features,
                        const CompilerOptions& compiler_options)
       : arm::CodeGeneratorARM(graph, isa_features, compiler_options) {
-    AddAllocatedRegister(Location::RegisterLocation(6));
-    AddAllocatedRegister(Location::RegisterLocation(7));
+    AddAllocatedRegister(Location::RegisterLocation(arm::R6));
+    AddAllocatedRegister(Location::RegisterLocation(arm::R7));
   }
 
   void SetupBlockedRegisters(bool is_baseline) const OVERRIDE {
     arm::CodeGeneratorARM::SetupBlockedRegisters(is_baseline);
-    blocked_core_registers_[4] = true;
-    blocked_core_registers_[6] = false;
-    blocked_core_registers_[7] = false;
+    blocked_core_registers_[arm::R4] = true;
+    blocked_core_registers_[arm::R6] = false;
+    blocked_core_registers_[arm::R7] = false;
     // Makes pair R6-R7 available.
-    blocked_register_pairs_[6 >> 1] = false;
+    blocked_register_pairs_[arm::R6_R7] = false;
+  }
+};
+
+class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 {
+ public:
+  TestCodeGeneratorX86(HGraph* graph,
+                       const X86InstructionSetFeatures& isa_features,
+                       const CompilerOptions& compiler_options)
+      : x86::CodeGeneratorX86(graph, isa_features, compiler_options) {
+    // Save edi; we need it to have enough registers for long multiplication.
+    AddAllocatedRegister(Location::RegisterLocation(x86::EDI));
+  }
+
+  void SetupBlockedRegisters(bool is_baseline) const OVERRIDE {
+    x86::CodeGeneratorX86::SetupBlockedRegisters(is_baseline);
+    // ebx is a callee-save register in C, but caller-save for ART.
+    blocked_core_registers_[x86::EBX] = true;
+    blocked_register_pairs_[x86::EAX_EBX] = true;
+    blocked_register_pairs_[x86::EDX_EBX] = true;
+    blocked_register_pairs_[x86::ECX_EBX] = true;
+    blocked_register_pairs_[x86::EBX_EDI] = true;
+
+    // Make edi available.
+    blocked_core_registers_[x86::EDI] = false;
+    blocked_register_pairs_[x86::ECX_EDI] = false;
   }
 };
 
@@ -101,7 +130,7 @@
   }
   Expected result = f();
   if (has_result) {
-    ASSERT_EQ(result, expected);
+    ASSERT_EQ(expected, result);
   }
 }
 
@@ -112,7 +141,7 @@
   CompilerOptions compiler_options;
   std::unique_ptr<const X86InstructionSetFeatures> features_x86(
       X86InstructionSetFeatures::FromCppDefines());
-  x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options);
+  TestCodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options);
   // We avoid doing a stack overflow check that requires the runtime being setup,
   // by making sure the compiler knows the methods we are running are leaf methods.
   codegenX86.CompileBaseline(&allocator, true);
@@ -520,29 +549,49 @@
   RunCodeOptimized(graph, hook_before_codegen, true, 0);
 }
 
-#define MUL_TEST(TYPE, TEST_NAME)                     \
-  TEST(CodegenTest, Return ## TEST_NAME) {            \
-    const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(  \
-      Instruction::CONST_4 | 3 << 12 | 0,             \
-      Instruction::CONST_4 | 4 << 12 | 1 << 8,        \
-      Instruction::MUL_ ## TYPE, 1 << 8 | 0,          \
-      Instruction::RETURN);                           \
-                                                      \
-    TestCode(data, true, 12);                         \
-  }                                                   \
-                                                      \
-  TEST(CodegenTest, Return ## TEST_NAME ## 2addr) {   \
-    const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(  \
-      Instruction::CONST_4 | 3 << 12 | 0,             \
-      Instruction::CONST_4 | 4 << 12 | 1 << 8,        \
-      Instruction::MUL_ ## TYPE ## _2ADDR | 1 << 12,  \
-      Instruction::RETURN);                           \
-                                                      \
-    TestCode(data, true, 12);                         \
-  }
+TEST(CodegenTest, ReturnMulInt) {
+  const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 3 << 12 | 0,
+    Instruction::CONST_4 | 4 << 12 | 1 << 8,
+    Instruction::MUL_INT, 1 << 8 | 0,
+    Instruction::RETURN);
 
-MUL_TEST(INT, MulInt);
-MUL_TEST(LONG, MulLong);
+  TestCode(data, true, 12);
+}
+
+TEST(CodegenTest, ReturnMulInt2addr) {
+  const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 3 << 12 | 0,
+    Instruction::CONST_4 | 4 << 12 | 1 << 8,
+    Instruction::MUL_INT_2ADDR | 1 << 12,
+    Instruction::RETURN);
+
+  TestCode(data, true, 12);
+}
+
+TEST(CodegenTest, ReturnMulLong) {
+  const uint16_t data[] = FOUR_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 3 << 12 | 0,
+    Instruction::CONST_4 | 0 << 12 | 1 << 8,
+    Instruction::CONST_4 | 4 << 12 | 2 << 8,
+    Instruction::CONST_4 | 0 << 12 | 3 << 8,
+    Instruction::MUL_LONG, 2 << 8 | 0,
+    Instruction::RETURN_WIDE);
+
+  TestCodeLong(data, true, 12);
+}
+
+TEST(CodegenTest, ReturnMulLong2addr) {
+  const uint16_t data[] = FOUR_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 3 << 12 | 0 << 8,
+    Instruction::CONST_4 | 0 << 12 | 1 << 8,
+    Instruction::CONST_4 | 4 << 12 | 2 << 8,
+    Instruction::CONST_4 | 0 << 12 | 3 << 8,
+    Instruction::MUL_LONG_2ADDR | 2 << 12,
+    Instruction::RETURN_WIDE);
+
+  TestCodeLong(data, true, 12);
+}
 
 TEST(CodegenTest, ReturnMulIntLit8) {
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
diff --git a/compiler/optimizing/constant_folding_test.cc b/compiler/optimizing/constant_folding_test.cc
index 02ad675..e420a62 100644
--- a/compiler/optimizing/constant_folding_test.cc
+++ b/compiler/optimizing/constant_folding_test.cc
@@ -62,7 +62,7 @@
 
   check_after_cf(graph);
 
-  HDeadCodeElimination(graph).Run();
+  HDeadCodeElimination(graph, nullptr).Run();
   SSAChecker ssa_checker_dce(&allocator, graph);
   ssa_checker_dce.Run();
   ASSERT_TRUE(ssa_checker_dce.IsValid());
diff --git a/compiler/optimizing/dead_code_elimination.cc b/compiler/optimizing/dead_code_elimination.cc
index fc3dd01..8045cc5 100644
--- a/compiler/optimizing/dead_code_elimination.cc
+++ b/compiler/optimizing/dead_code_elimination.cc
@@ -38,8 +38,10 @@
       if (!inst->HasSideEffects()
           && !inst->CanThrow()
           && !inst->IsSuspendCheck()
+          && !inst->IsMemoryBarrier()  // If we added an explicit barrier then we should keep it.
           && !inst->HasUses()) {
         block->RemoveInstruction(inst);
+        MaybeRecordStat(MethodCompilationStat::kRemovedDeadInstruction);
       }
     }
   }
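
In isolation, the updated removability test reads as below; `Inst` and its fields are stand-ins for HInstruction, not the ART API. The point of the new clause is that an explicit barrier has no uses and no modeled side effects, yet must survive DCE.

    #include <cassert>

    // Stand-in for HInstruction; the field names are hypothetical.
    struct Inst {
      bool has_side_effects;
      bool can_throw;
      bool is_suspend_check;
      bool is_memory_barrier;
      bool has_uses;
    };

    // Dead only if removal is unobservable; explicit memory barriers are kept.
    static bool IsRemovableDead(const Inst& inst) {
      return !inst.has_side_effects && !inst.can_throw && !inst.is_suspend_check
          && !inst.is_memory_barrier && !inst.has_uses;
    }

    int main() {
      Inst barrier{false, false, false, true, false};
      assert(!IsRemovableDead(barrier));  // kept despite having no uses
      Inst dead_add{false, false, false, false, false};
      assert(IsRemovableDead(dead_add));
      return 0;
    }
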
diff --git a/compiler/optimizing/dead_code_elimination.h b/compiler/optimizing/dead_code_elimination.h
index 3db2c3f..3f309c5 100644
--- a/compiler/optimizing/dead_code_elimination.h
+++ b/compiler/optimizing/dead_code_elimination.h
@@ -19,6 +19,7 @@
 
 #include "nodes.h"
 #include "optimization.h"
+#include "optimizing_compiler_stats.h"
 
 namespace art {
 
@@ -28,8 +29,8 @@
  */
 class HDeadCodeElimination : public HOptimization {
  public:
-  explicit HDeadCodeElimination(HGraph* graph)
-      : HOptimization(graph, true, kDeadCodeEliminationPassName) {}
+  HDeadCodeElimination(HGraph* graph, OptimizingCompilerStats* stats)
+      : HOptimization(graph, true, kDeadCodeEliminationPassName, stats) {}
 
   void Run() OVERRIDE;
 
diff --git a/compiler/optimizing/dead_code_elimination_test.cc b/compiler/optimizing/dead_code_elimination_test.cc
index 98ae1ec..6350019 100644
--- a/compiler/optimizing/dead_code_elimination_test.cc
+++ b/compiler/optimizing/dead_code_elimination_test.cc
@@ -44,7 +44,7 @@
   std::unique_ptr<const X86InstructionSetFeatures> features_x86(
       X86InstructionSetFeatures::FromCppDefines());
   x86::CodeGeneratorX86 codegenX86(graph, *features_x86.get(), CompilerOptions());
-  HDeadCodeElimination(graph).Run();
+  HDeadCodeElimination(graph, nullptr).Run();
   SSAChecker ssa_checker(&allocator, graph);
   ssa_checker.Run();
   ASSERT_TRUE(ssa_checker.IsValid());
diff --git a/compiler/optimizing/graph_checker.cc b/compiler/optimizing/graph_checker.cc
index 3a56c6c..2216cec 100644
--- a/compiler/optimizing/graph_checker.cc
+++ b/compiler/optimizing/graph_checker.cc
@@ -393,8 +393,10 @@
           static_cast<int>(input_index),
           value));
     }
-  } else if (input->GetType() == Primitive::kPrimInt && input->IsPhi()) {
-    // TODO: We need a data-flow analysis which determines if the Phi is boolean.
+  } else if (input->GetType() == Primitive::kPrimInt
+             && (input->IsPhi() || input->IsAnd() || input->IsOr() || input->IsXor())) {
+    // TODO: We need a data-flow analysis to determine if the Phi or
+    //       binary operation is actually Boolean. Allow for now.
   } else if (input->GetType() != Primitive::kPrimBoolean) {
     AddError(StringPrintf(
         "%s instruction %d has a non-Boolean input %d whose type is: %s.",
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 4c28378..ca9cbc3 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -192,6 +192,10 @@
     output_ << " " << phi->GetRegNumber();
   }
 
+  void VisitMemoryBarrier(HMemoryBarrier* barrier) OVERRIDE {
+    output_ << " " << barrier->GetBarrierKind();
+  }
+
   bool IsPass(const char* name) {
     return strcmp(pass_name_, name) == 0;
   }
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 6d2a8d7..bffd639 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -190,7 +190,7 @@
   }
 
   // Run simple optimizations on the graph.
-  HDeadCodeElimination dce(callee_graph);
+  HDeadCodeElimination dce(callee_graph, stats_);
   HConstantFolding fold(callee_graph);
   InstructionSimplifier simplify(callee_graph, stats_);
 
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 180bdc9..f30c9a6 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -43,6 +43,8 @@
 
   void VisitSuspendCheck(HSuspendCheck* check) OVERRIDE;
   void VisitEqual(HEqual* equal) OVERRIDE;
+  void VisitNotEqual(HNotEqual* not_equal) OVERRIDE;
+  void VisitBooleanNot(HBooleanNot* bool_not) OVERRIDE;
   void VisitArraySet(HArraySet* equal) OVERRIDE;
   void VisitTypeConversion(HTypeConversion* instruction) OVERRIDE;
   void VisitNullCheck(HNullCheck* instruction) OVERRIDE;
@@ -195,21 +197,62 @@
 }
 
 void InstructionSimplifierVisitor::VisitEqual(HEqual* equal) {
-  HInstruction* input1 = equal->InputAt(0);
-  HInstruction* input2 = equal->InputAt(1);
-  if (input1->GetType() == Primitive::kPrimBoolean && input2->IsIntConstant()) {
-    if (input2->AsIntConstant()->GetValue() == 1) {
-      // Replace (bool_value == 1) with bool_value
-      equal->ReplaceWith(equal->InputAt(0));
-      equal->GetBlock()->RemoveInstruction(equal);
-    } else {
-      // We should replace (bool_value == 0) with !bool_value, but we unfortunately
-      // do not have such instruction.
-      DCHECK_EQ(input2->AsIntConstant()->GetValue(), 0);
+  HInstruction* input_const = equal->GetConstantRight();
+  if (input_const != nullptr) {
+    HInstruction* input_value = equal->GetLeastConstantLeft();
+    if (input_value->GetType() == Primitive::kPrimBoolean && input_const->IsIntConstant()) {
+      HBasicBlock* block = equal->GetBlock();
+      if (input_const->AsIntConstant()->IsOne()) {
+        // Replace (bool_value == true) with bool_value
+        equal->ReplaceWith(input_value);
+        block->RemoveInstruction(equal);
+        RecordSimplification();
+      } else {
+        // Replace (bool_value == false) with !bool_value
+        DCHECK(input_const->AsIntConstant()->IsZero());
+        block->ReplaceAndRemoveInstructionWith(
+            equal, new (block->GetGraph()->GetArena()) HBooleanNot(input_value));
+        RecordSimplification();
+      }
     }
   }
 }
 
+void InstructionSimplifierVisitor::VisitNotEqual(HNotEqual* not_equal) {
+  HInstruction* input_const = not_equal->GetConstantRight();
+  if (input_const != nullptr) {
+    HInstruction* input_value = not_equal->GetLeastConstantLeft();
+    if (input_value->GetType() == Primitive::kPrimBoolean && input_const->IsIntConstant()) {
+      HBasicBlock* block = not_equal->GetBlock();
+      if (input_const->AsIntConstant()->IsOne()) {
+        // Replace (bool_value != true) with !bool_value
+        block->ReplaceAndRemoveInstructionWith(
+            not_equal, new (block->GetGraph()->GetArena()) HBooleanNot(input_value));
+        RecordSimplification();
+      } else {
+        // Replace (bool_value != false) with bool_value
+        DCHECK(input_const->AsIntConstant()->IsZero());
+        not_equal->ReplaceWith(input_value);
+        block->RemoveInstruction(not_equal);
+        RecordSimplification();
+      }
+    }
+  }
+}
+
+void InstructionSimplifierVisitor::VisitBooleanNot(HBooleanNot* bool_not) {
+  HInstruction* parent = bool_not->InputAt(0);
+  if (parent->IsBooleanNot()) {
+    HInstruction* value = parent->InputAt(0);
+    // Replace (!(!bool_value)) with bool_value
+    bool_not->ReplaceWith(value);
+    bool_not->GetBlock()->RemoveInstruction(bool_not);
+    // It is possible that `parent` is dead at this point but we leave
+    // its removal to DCE for simplicity.
+    RecordSimplification();
+  }
+}
+
 void InstructionSimplifierVisitor::VisitArrayLength(HArrayLength* instruction) {
   HInstruction* input = instruction->InputAt(0);
   // If the array is a NewArray with constant size, replace the array length
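
Taken together, VisitEqual, VisitNotEqual and VisitBooleanNot implement four rewrites on a Boolean value compared against an int constant. A minimal standalone sketch of the rewrite table (std::function stands in for the rewritten HIR; this is not the ART API):

    #include <cassert>
    #include <functional>

    enum class Op { kEqual, kNotEqual };

    // Mirrors the simplifier: (b == 1) -> b, (b == 0) -> !b,
    //                         (b != 1) -> !b, (b != 0) -> b.
    static std::function<bool(bool)> Simplify(Op op, int constant) {
      bool keep = (op == Op::kEqual) == (constant == 1);
      if (keep) {
        return [](bool b) { return b; };  // the comparison folds away
      }
      return [](bool b) { return !b; };   // becomes an HBooleanNot
    }

    int main() {
      assert(Simplify(Op::kEqual, 1)(true) == true);
      assert(Simplify(Op::kEqual, 0)(true) == false);
      assert(Simplify(Op::kNotEqual, 1)(false) == true);
      assert(Simplify(Op::kNotEqual, 0)(false) == false);
      // VisitBooleanNot additionally folds double negation: !(!b) -> b.
      return 0;
    }
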
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 3c7a266..95ab90d 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -828,7 +828,7 @@
                                                               LocationSummary::kNoCall,
                                                               kIntrinsified);
     locations->SetInAt(0, Location::RequiresFpuRegister());
-    locations->SetOut(Location::RequiresFpuRegister());
+    locations->SetOut(Location::RequiresRegister());
     locations->AddTemp(Location::RequiresFpuRegister());
     locations->AddTemp(Location::RequiresFpuRegister());
     return;
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index de876be..c3a9915 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -285,17 +285,26 @@
   bool Contains(Location other) const {
     if (Equals(other)) {
       return true;
-    } else if (IsFpuRegisterPair() && other.IsFpuRegister()) {
-      return other.reg() == low() || other.reg() == high();
-    } else if (IsRegisterPair() && other.IsRegister()) {
-      return other.reg() == low() || other.reg() == high();
-    } else if (IsDoubleStackSlot() && other.IsStackSlot()) {
-      return (GetStackIndex() == other.GetStackIndex())
-          || (GetStackIndex() + 4 == other.GetStackIndex());
+    } else if (IsPair() || IsDoubleStackSlot()) {
+      return ToLow().Equals(other) || ToHigh().Equals(other);
     }
     return false;
   }
 
+  bool OverlapsWith(Location other) const {
+    // Only check the overlapping case that can happen with our register allocation algorithm.
+    bool overlap = Contains(other) || other.Contains(*this);
+    if (kIsDebugBuild && !overlap) {
+      // Note: These are also overlapping cases. But we are not able to handle them in
+      // ParallelMoveResolverWithSwap. Make sure we do not encounter such cases with our compiler.
+      if ((IsPair() && other.IsPair()) || (IsDoubleStackSlot() && other.IsDoubleStackSlot())) {
+        DCHECK(!Contains(other.ToLow()));
+        DCHECK(!Contains(other.ToHigh()));
+      }
+    }
+    return overlap;
+  }
+
   const char* DebugString() const {
     switch (GetKind()) {
       case kInvalid: return "I";
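
The new OverlapsWith() only needs Contains() in both directions because the allocator never produces partially overlapping pairs, as the DCHECKs assert. A small self-contained model of the register case (Loc is a simplified stand-in for Location):

    #include <cassert>

    // Simplified stand-in for Location: a register or an aligned register pair.
    struct Loc {
      int low;
      int high;  // equal to `low` for single registers
      bool IsPair() const { return low != high; }
      bool Equals(const Loc& o) const { return low == o.low && high == o.high; }
      Loc ToLow() const { return {low, low}; }
      Loc ToHigh() const { return {high, high}; }
      bool Contains(const Loc& o) const {
        if (Equals(o)) return true;
        if (IsPair()) return ToLow().Equals(o) || ToHigh().Equals(o);
        return false;
      }
      bool OverlapsWith(const Loc& o) const {
        return Contains(o) || o.Contains(*this);
      }
    };

    int main() {
      Loc pair{0, 1}, r0{0, 0}, r2{2, 2};
      assert(pair.OverlapsWith(r0));   // a pair contains its low half
      assert(r0.OverlapsWith(pair));   // symmetric
      assert(!pair.OverlapsWith(r2));  // disjoint registers
      return 0;
    }
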
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 649038b..1565f58 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -19,6 +19,7 @@
 
 #include "base/arena_containers.h"
 #include "base/arena_object.h"
+#include "dex/compiler_enums.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "handle.h"
 #include "handle_scope.h"
@@ -718,6 +719,7 @@
   M(LoadString, Instruction)                                            \
   M(Local, Instruction)                                                 \
   M(LongConstant, Constant)                                             \
+  M(MemoryBarrier, Instruction)                                         \
   M(MonitorOperation, Instruction)                                      \
   M(Mul, BinaryOperation)                                               \
   M(Neg, UnaryOperation)                                                \
@@ -908,6 +910,12 @@
   HUseListNode<T>* use_node_;
 };
 
+// TODO: Add better documentation to this class and maybe refactor with more suggestive names.
+// - Has(All)SideEffects suggests that all the side effects are present but only the
+//   ChangesSomething flag is considered.
+// - DependsOn suggests that there is a real dependency between side effects but it only
+//   checks the DependsOnSomething flag.
+//
 // Represents the side effects an instruction may have.
 class SideEffects : public ValueObject {
  public:
@@ -2105,7 +2113,7 @@
 
   friend class HGraph;
   ART_FRIEND_TEST(GraphTest, InsertInstructionBefore);
-  ART_FRIEND_TEST(ParallelMoveTest, ConstantLast);
+  ART_FRIEND_TYPED_TEST(ParallelMoveTest, ConstantLast);
   DISALLOW_COPY_AND_ASSIGN(HIntConstant);
 };
 
@@ -2162,7 +2170,7 @@
 
   uint32_t GetDexMethodIndex() const { return dex_method_index_; }
 
-  Intrinsics GetIntrinsic() {
+  Intrinsics GetIntrinsic() const {
     return intrinsic_;
   }
 
@@ -3437,6 +3445,22 @@
   DISALLOW_COPY_AND_ASSIGN(HCheckCast);
 };
 
+class HMemoryBarrier : public HTemplateInstruction<0> {
+ public:
+  explicit HMemoryBarrier(MemBarrierKind barrier_kind)
+      : HTemplateInstruction(SideEffects::None()),
+        barrier_kind_(barrier_kind) {}
+
+  MemBarrierKind GetBarrierKind() { return barrier_kind_; }
+
+  DECLARE_INSTRUCTION(MemoryBarrier);
+
+ private:
+  const MemBarrierKind barrier_kind_;
+
+  DISALLOW_COPY_AND_ASSIGN(HMemoryBarrier);
+};
+
 class HMonitorOperation : public HTemplateInstruction<1> {
  public:
   enum OperationKind {
@@ -3502,7 +3526,7 @@
 
   // True if this blocks a move from the given location.
   bool Blocks(Location loc) const {
-    return !IsEliminated() && (source_.Contains(loc) || loc.Contains(source_));
+    return !IsEliminated() && source_.OverlapsWith(loc);
   }
 
   // A move is redundant if it's been eliminated, if its source and
@@ -3571,8 +3595,8 @@
         }
       }
       for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
-        DCHECK(!destination.Equals(moves_.Get(i).GetDestination()))
-            << "Same destination for two moves in a parallel move.";
+        DCHECK(!destination.OverlapsWith(moves_.Get(i).GetDestination()))
+            << "Overlapped destination for two moves in a parallel move.";
       }
     }
     moves_.Add(MoveOperands(source, destination, type, instruction));
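
For intuition only: the MemBarrierKind values carried by HMemoryBarrier (from dex/compiler_enums.h) correspond roughly to C++11 fences as sketched below. This is an analogy, not what GenerateMemoryBarrier() emits; on x86-64 most kinds need no instruction at all and kAnyAny becomes an mfence.

    #include <atomic>

    // Enumerator names follow dex/compiler_enums.h; the mapping is approximate.
    enum MemBarrierKind { kAnyStore, kLoadAny, kStoreStore, kAnyAny };

    void EmitBarrier(MemBarrierKind kind) {
      switch (kind) {
        case kLoadAny:    // earlier loads complete before any later access
          std::atomic_thread_fence(std::memory_order_acquire);
          break;
        case kAnyStore:   // any earlier access completes before later stores
        case kStoreStore: // store-store ordering, conservatively a release fence
          std::atomic_thread_fence(std::memory_order_release);
          break;
        case kAnyAny:     // full two-way barrier
          std::atomic_thread_fence(std::memory_order_seq_cst);
          break;
      }
    }

    int main() {
      EmitBarrier(kAnyAny);
      return 0;
    }
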
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index a17d6e1..ab752c3 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -208,6 +208,12 @@
 
   void UnInit() const OVERRIDE;
 
+  void MaybeRecordStat(MethodCompilationStat compilation_stat) const {
+    if (compilation_stats_.get() != nullptr) {
+      compilation_stats_->RecordStat(compilation_stat);
+    }
+  }
+
  private:
   // Whether we should run any optimization or register allocation. If false, will
   // just run the code generation after the graph was built.
@@ -226,7 +232,7 @@
                                   CompilerDriver* driver,
                                   const DexCompilationUnit& dex_compilation_unit) const;
 
-  mutable OptimizingCompilerStats compilation_stats_;
+  std::unique_ptr<OptimizingCompilerStats> compilation_stats_;
 
   std::unique_ptr<std::ostream> visualizer_output_;
 
@@ -243,7 +249,6 @@
       run_optimizations_(
           (driver->GetCompilerOptions().GetCompilerFilter() != CompilerOptions::kTime)
           && !driver->GetCompilerOptions().GetDebuggable()),
-      compilation_stats_(),
       delegate_(Create(driver, Compiler::Kind::kQuick)) {}
 
 void OptimizingCompiler::Init() {
@@ -258,6 +263,9 @@
       << "Invoke the compiler with '-j1'.";
     visualizer_output_.reset(new std::ofstream(cfg_file_name));
   }
+  if (driver->GetDumpStats()) {
+    compilation_stats_.reset(new OptimizingCompilerStats());
+  }
 }
 
 void OptimizingCompiler::UnInit() const {
@@ -265,7 +273,9 @@
 }
 
 OptimizingCompiler::~OptimizingCompiler() {
-  compilation_stats_.Log();
+  if (compilation_stats_.get() != nullptr) {
+    compilation_stats_->Log();
+  }
 }
 
 void OptimizingCompiler::InitCompilationUnit(CompilationUnit& cu) const {
@@ -310,7 +320,8 @@
                              const DexCompilationUnit& dex_compilation_unit,
                              PassInfoPrinter* pass_info_printer,
                              StackHandleScopeCollection* handles) {
-  HDeadCodeElimination dce(graph);
+  HDeadCodeElimination dce1(graph, stats);
+  HDeadCodeElimination dce2(graph, stats);
   HConstantFolding fold1(graph);
   InstructionSimplifier simplify1(graph, stats);
   HBooleanSimplifier boolean_not(graph);
@@ -329,7 +340,7 @@
 
   HOptimization* optimizations[] = {
     &intrinsics,
-    &dce,
+    &dce1,
     &fold1,
     &simplify1,
     // BooleanSimplifier depends on the InstructionSimplifier removing redundant
@@ -342,7 +353,8 @@
     &licm,
     &bce,
     &type_propagation,
-    &simplify2
+    &simplify2,
+    &dce2,
   };
 
   RunOptimizations(optimizations, arraysize(optimizations), pass_info_printer);
@@ -381,7 +393,7 @@
                                                      const DexCompilationUnit& dex_compilation_unit,
                                                      PassInfoPrinter* pass_info_printer) const {
   StackHandleScopeCollection handles(Thread::Current());
-  RunOptimizations(graph, compiler_driver, &compilation_stats_,
+  RunOptimizations(graph, compiler_driver, compilation_stats_.get(),
                    dex_file, dex_compilation_unit, pass_info_printer, &handles);
 
   AllocateRegisters(graph, codegen, pass_info_printer);
@@ -397,7 +409,7 @@
   std::vector<uint8_t> stack_map;
   codegen->BuildStackMaps(&stack_map);
 
-  compilation_stats_.RecordStat(MethodCompilationStat::kCompiledOptimized);
+  MaybeRecordStat(MethodCompilationStat::kCompiledOptimized);
 
   return CompiledMethod::SwapAllocCompiledMethod(
       compiler_driver,
@@ -435,7 +447,7 @@
   std::vector<uint8_t> gc_map;
   codegen->BuildNativeGCMap(&gc_map, dex_compilation_unit);
 
-  compilation_stats_.RecordStat(MethodCompilationStat::kCompiledBaseline);
+  MaybeRecordStat(MethodCompilationStat::kCompiledBaseline);
   return CompiledMethod::SwapAllocCompiledMethod(
       compiler_driver,
       codegen->GetInstructionSet(),
@@ -463,7 +475,7 @@
                                                const DexFile& dex_file) const {
   UNUSED(invoke_type);
   std::string method_name = PrettyMethod(method_idx, dex_file);
-  compilation_stats_.RecordStat(MethodCompilationStat::kAttemptCompilation);
+  MaybeRecordStat(MethodCompilationStat::kAttemptCompilation);
   CompilerDriver* compiler_driver = GetCompilerDriver();
   InstructionSet instruction_set = compiler_driver->GetInstructionSet();
   // Always use the thumb2 assembler: some runtime functionality (like implicit stack
@@ -474,12 +486,12 @@
 
   // Do not attempt to compile on architectures we do not support.
   if (!IsInstructionSetSupported(instruction_set)) {
-    compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledUnsupportedIsa);
+    MaybeRecordStat(MethodCompilationStat::kNotCompiledUnsupportedIsa);
     return nullptr;
   }
 
   if (Compiler::IsPathologicalCase(*code_item, method_idx, dex_file)) {
-    compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledPathological);
+    MaybeRecordStat(MethodCompilationStat::kNotCompiledPathological);
     return nullptr;
   }
 
@@ -489,7 +501,7 @@
   const CompilerOptions& compiler_options = compiler_driver->GetCompilerOptions();
   if ((compiler_options.GetCompilerFilter() == CompilerOptions::kSpace)
       && (code_item->insns_size_in_code_units_ > kSpaceFilterOptimizingThreshold)) {
-    compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledSpaceFilter);
+    MaybeRecordStat(MethodCompilationStat::kNotCompiledSpaceFilter);
     return nullptr;
   }
 
@@ -514,7 +526,7 @@
                             compiler_driver->GetCompilerOptions()));
   if (codegen.get() == nullptr) {
     CHECK(!shouldCompile) << "Could not find code generator for optimizing compiler";
-    compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledNoCodegen);
+    MaybeRecordStat(MethodCompilationStat::kNotCompiledNoCodegen);
     return nullptr;
   }
   codegen->GetAssembler()->cfi().SetEnabled(
@@ -531,7 +543,7 @@
                         &dex_compilation_unit,
                         &dex_file,
                         compiler_driver,
-                        &compilation_stats_);
+                        compilation_stats_.get());
 
   VLOG(compiler) << "Building " << method_name;
 
@@ -558,7 +570,7 @@
       if (!graph->TryBuildingSsa()) {
         // We could not transform the graph to SSA, bailout.
         LOG(INFO) << "Skipping compilation of " << method_name << ": it contains a non natural loop";
-        compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledCannotBuildSSA);
+        MaybeRecordStat(MethodCompilationStat::kNotCompiledCannotBuildSSA);
         return nullptr;
       }
     }
@@ -576,11 +588,11 @@
     VLOG(compiler) << "Compile baseline " << method_name;
 
     if (!run_optimizations_) {
-      compilation_stats_.RecordStat(MethodCompilationStat::kNotOptimizedDisabled);
+      MaybeRecordStat(MethodCompilationStat::kNotOptimizedDisabled);
     } else if (!can_optimize) {
-      compilation_stats_.RecordStat(MethodCompilationStat::kNotOptimizedTryCatch);
+      MaybeRecordStat(MethodCompilationStat::kNotOptimizedTryCatch);
     } else if (!can_allocate_registers) {
-      compilation_stats_.RecordStat(MethodCompilationStat::kNotOptimizedRegisterAllocator);
+      MaybeRecordStat(MethodCompilationStat::kNotOptimizedRegisterAllocator);
     }
 
     return CompileBaseline(codegen.get(), compiler_driver, dex_compilation_unit);
@@ -603,9 +615,9 @@
                          method_idx, jclass_loader, dex_file);
   } else {
     if (compiler_driver->GetCompilerOptions().VerifyAtRuntime()) {
-      compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledVerifyAtRuntime);
+      MaybeRecordStat(MethodCompilationStat::kNotCompiledVerifyAtRuntime);
     } else {
-      compilation_stats_.RecordStat(MethodCompilationStat::kNotCompiledClassNotVerified);
+      MaybeRecordStat(MethodCompilationStat::kNotCompiledClassNotVerified);
     }
   }
 
@@ -616,7 +628,7 @@
                               jclass_loader, dex_file);
 
   if (method != nullptr) {
-    compilation_stats_.RecordStat(MethodCompilationStat::kCompiledQuick);
+    MaybeRecordStat(MethodCompilationStat::kCompiledQuick);
   }
   return method;
 }
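
The stats refactoring above follows a simple pattern: allocate the stats object only when dumping is requested, and funnel every recording through one null-checked helper. A minimal sketch of the pattern (names modeled on the code, not the real classes):

    #include <iostream>
    #include <memory>

    struct CompilerStats {
      int attempted = 0;
      void RecordAttempt() { ++attempted; }
      void Log() const { std::cout << "attempted: " << attempted << "\n"; }
    };

    class Compiler {
     public:
      void EnableStats() { stats_.reset(new CompilerStats()); }
      // Recording is a no-op unless stats were requested, so call sites need
      // no flag checks beyond one null test.
      void MaybeRecordAttempt() {
        if (stats_ != nullptr) {
          stats_->RecordAttempt();
        }
      }
      ~Compiler() {
        if (stats_ != nullptr) {
          stats_->Log();
        }
      }
     private:
      std::unique_ptr<CompilerStats> stats_;
    };

    int main() {
      Compiler c;
      c.EnableStats();
      c.MaybeRecordAttempt();
      return 0;  // destructor logs "attempted: 1"
    }
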
diff --git a/compiler/optimizing/optimizing_compiler_stats.h b/compiler/optimizing/optimizing_compiler_stats.h
index d4a936d..e6508c9 100644
--- a/compiler/optimizing/optimizing_compiler_stats.h
+++ b/compiler/optimizing/optimizing_compiler_stats.h
@@ -29,6 +29,7 @@
   kCompiledBaseline,
   kCompiledOptimized,
   kCompiledQuick,
+  kInstructionSimplifications,
   kInlinedInvoke,
   kNotCompiledUnsupportedIsa,
   kNotCompiledPathological,
@@ -48,8 +49,8 @@
   kNotCompiledVerifyAtRuntime,
   kNotCompiledClassNotVerified,
   kRemovedCheckedCast,
+  kRemovedDeadInstruction,
   kRemovedNullCheck,
-  kInstructionSimplifications,
   kLastStat
 };
 
@@ -82,7 +83,7 @@
 
       for (int i = 0; i < kLastStat; i++) {
         if (compile_stats_[i] != 0) {
-          VLOG(compiler) << PrintMethodCompilationStat(i) << ": " << compile_stats_[i];
+          LOG(INFO) << PrintMethodCompilationStat(i) << ": " << compile_stats_[i];
         }
       }
     }
@@ -96,6 +97,7 @@
       case kCompiledOptimized : return "kCompiledOptimized";
       case kCompiledQuick : return "kCompiledQuick";
       case kInlinedInvoke : return "kInlinedInvoke";
+      case kInstructionSimplifications: return "kInstructionSimplifications";
       case kNotCompiledUnsupportedIsa : return "kNotCompiledUnsupportedIsa";
       case kNotCompiledPathological : return "kNotCompiledPathological";
       case kNotCompiledHugeMethod : return "kNotCompiledHugeMethod";
@@ -114,8 +116,8 @@
       case kNotCompiledVerifyAtRuntime : return "kNotCompiledVerifyAtRuntime";
       case kNotCompiledClassNotVerified : return "kNotCompiledClassNotVerified";
       case kRemovedCheckedCast: return "kRemovedCheckedCast";
+      case kRemovedDeadInstruction: return "kRemovedDeadInstruction";
       case kRemovedNullCheck: return "kRemovedNullCheck";
-      case kInstructionSimplifications: return "kInstructionSimplifications";
       default: LOG(FATAL) << "invalid stat";
     }
     return "";
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index ad92ca5..54ea6f1 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -17,11 +17,23 @@
 
 #include "parallel_move_resolver.h"
 #include "nodes.h"
-#include "locations.h"
 
 namespace art {
 
-void ParallelMoveResolver::EmitNativeCode(HParallelMove* parallel_move) {
+void ParallelMoveResolver::BuildInitialMoveList(HParallelMove* parallel_move) {
+  // Perform a linear sweep of the moves to add them to the initial list of
+  // moves to perform, ignoring any move that is redundant (the source is
+  // the same as the destination, the destination is ignored and
+  // unallocated, or the move was already eliminated).
+  for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
+    MoveOperands* move = parallel_move->MoveOperandsAt(i);
+    if (!move->IsRedundant()) {
+      moves_.Add(move);
+    }
+  }
+}
+
+void ParallelMoveResolverWithSwap::EmitNativeCode(HParallelMove* parallel_move) {
   DCHECK(moves_.IsEmpty());
   // Build up a worklist of moves.
   BuildInitialMoveList(parallel_move);
@@ -50,20 +62,6 @@
   moves_.Reset();
 }
 
-
-void ParallelMoveResolver::BuildInitialMoveList(HParallelMove* parallel_move) {
-  // Perform a linear sweep of the moves to add them to the initial list of
-  // moves to perform, ignoring any move that is redundant (the source is
-  // the same as the destination, the destination is ignored and
-  // unallocated, or the move was already eliminated).
-  for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
-    MoveOperands* move = parallel_move->MoveOperandsAt(i);
-    if (!move->IsRedundant()) {
-      moves_.Add(move);
-    }
-  }
-}
-
 Location LowOf(Location location) {
   if (location.IsRegisterPair()) {
     return Location::RegisterLocation(location.low());
@@ -103,7 +101,7 @@
   }
 }
 
-MoveOperands* ParallelMoveResolver::PerformMove(size_t index) {
+MoveOperands* ParallelMoveResolverWithSwap::PerformMove(size_t index) {
   // Each call to this function performs a move and deletes it from the move
   // graph.  We first recursively perform any move blocking this one.  We
   // mark a move as "pending" on entry to PerformMove in order to detect
@@ -229,7 +227,7 @@
   }
 }
 
-bool ParallelMoveResolver::IsScratchLocation(Location loc) {
+bool ParallelMoveResolverWithSwap::IsScratchLocation(Location loc) {
   for (size_t i = 0; i < moves_.Size(); ++i) {
     if (moves_.Get(i)->Blocks(loc)) {
       return false;
@@ -245,10 +243,10 @@
   return false;
 }
 
-int ParallelMoveResolver::AllocateScratchRegister(int blocked,
-                                                  int register_count,
-                                                  int if_scratch,
-                                                  bool* spilled) {
+int ParallelMoveResolverWithSwap::AllocateScratchRegister(int blocked,
+                                                          int register_count,
+                                                          int if_scratch,
+                                                          bool* spilled) {
   DCHECK_NE(blocked, if_scratch);
   int scratch = -1;
   for (int reg = 0; reg < register_count; ++reg) {
@@ -269,8 +267,8 @@
 }
 
 
-ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
-    ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers)
+ParallelMoveResolverWithSwap::ScratchRegisterScope::ScratchRegisterScope(
+    ParallelMoveResolverWithSwap* resolver, int blocked, int if_scratch, int number_of_registers)
     : resolver_(resolver),
       reg_(kNoRegister),
       spilled_(false) {
@@ -282,10 +280,271 @@
 }
 
 
-ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
+ParallelMoveResolverWithSwap::ScratchRegisterScope::~ScratchRegisterScope() {
   if (spilled_) {
     resolver_->RestoreScratch(reg_);
   }
 }
 
+void ParallelMoveResolverNoSwap::EmitNativeCode(HParallelMove* parallel_move) {
+  DCHECK_EQ(GetNumberOfPendingMoves(), 0u);
+  DCHECK(moves_.IsEmpty());
+  DCHECK(scratches_.IsEmpty());
+
+  // Backend dependent initialization.
+  PrepareForEmitNativeCode();
+
+  // Build up a worklist of moves.
+  BuildInitialMoveList(parallel_move);
+
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    const MoveOperands& move = *moves_.Get(i);
+    // Skip constants to perform them last. They don't block other moves and
+    // skipping such moves with register destinations keeps those registers
+    // free for the whole algorithm.
+    if (!move.IsEliminated() && !move.GetSource().IsConstant()) {
+      PerformMove(i);
+    }
+  }
+
+  // Perform the moves with constant sources and register destinations with UpdateMoveSource()
+  // to reduce the number of literal loads. Stack destinations are skipped since we won't benefit
+  // from changing the constant sources to stack locations.
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    MoveOperands* move = moves_.Get(i);
+    Location destination = move->GetDestination();
+    if (!move->IsEliminated() && !destination.IsStackSlot() && !destination.IsDoubleStackSlot()) {
+      Location source = move->GetSource();
+      EmitMove(i);
+      move->Eliminate();
+      // This may introduce an additional instruction dependency, but it reduces
+      // the number of moves and possible literal loads. For example,
+      // Original moves:
+      //   1234.5678 -> D0
+      //   1234.5678 -> D1
+      // Updated moves:
+      //   1234.5678 -> D0
+      //   D0 -> D1
+      UpdateMoveSource(source, destination);
+    }
+  }
+
+  // Perform the rest of the moves.
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    MoveOperands* move = moves_.Get(i);
+    if (!move->IsEliminated()) {
+      EmitMove(i);
+      move->Eliminate();
+    }
+  }
+
+  // All pending moves that we added to resolve cycles should have been performed.
+  DCHECK_EQ(GetNumberOfPendingMoves(), 0u);
+
+  // Backend dependent cleanup.
+  FinishEmitNativeCode();
+
+  moves_.Reset();
+  scratches_.Reset();
+}
+
+Location ParallelMoveResolverNoSwap::GetScratchLocation(Location::Kind kind) {
+  for (size_t i = 0; i < scratches_.Size(); ++i) {
+    Location loc = scratches_.Get(i);
+    if (loc.GetKind() == kind && !IsBlockedByMoves(loc)) {
+      return loc;
+    }
+  }
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    Location loc = moves_.Get(i)->GetDestination();
+    if (loc.GetKind() == kind && !IsBlockedByMoves(loc)) {
+      return loc;
+    }
+  }
+  return Location::NoLocation();
+}
+
+void ParallelMoveResolverNoSwap::AddScratchLocation(Location loc) {
+  if (kIsDebugBuild) {
+    for (size_t i = 0; i < scratches_.Size(); ++i) {
+      DCHECK(!loc.Equals(scratches_.Get(i)));
+    }
+  }
+  scratches_.Add(loc);
+}
+
+void ParallelMoveResolverNoSwap::RemoveScratchLocation(Location loc) {
+  DCHECK(!IsBlockedByMoves(loc));
+  for (size_t i = 0; i < scratches_.Size(); ++i) {
+    if (loc.Equals(scratches_.Get(i))) {
+      scratches_.DeleteAt(i);
+      break;
+    }
+  }
+}
+
+void ParallelMoveResolverNoSwap::PerformMove(size_t index) {
+  // Each call to this function performs a move and deletes it from the move
+  // graph. We first recursively perform any move blocking this one. We mark
+  // a move as "pending" on entry to PerformMove in order to detect cycles
+  // in the move graph. We use scratch locations to resolve cycles, which may
+  // add further pending moves. After a move has been performed, we update the
+  // source operands of the remaining moves to reduce dependencies in the graph.
+
+  MoveOperands* move = moves_.Get(index);
+  DCHECK(!move->IsPending());
+  DCHECK(!move->IsEliminated());
+  if (move->IsRedundant()) {
+    // Previous operations on the list of moves have caused this particular move
+    // to become a no-op, so we can safely eliminate it. Consider for example
+    // (0 -> 1) (1 -> 0) (1 -> 2). There is a cycle (0 -> 1) (1 -> 0), that we will
+    // resolve as (1 -> scratch) (0 -> 1) (scratch -> 0). If, by chance, '2' is
+    // used as the scratch location, the move (1 -> 2) will occur while resolving
+    // the cycle. When that move is emitted, the code will update moves with a '1'
+    // as their source to use '2' instead (see `UpdateMoveSource()`). In our example
+    // the initial move (1 -> 2) would then become the no-op (2 -> 2) that can be
+    // eliminated here.
+    move->Eliminate();
+    return;
+  }
+
+  // Clear this move's destination to indicate a pending move. The actual
+  // destination is saved in a stack-allocated local. Recursion may allow
+  // multiple moves to be pending.
+  DCHECK(!move->GetSource().IsInvalid());
+  Location destination = move->MarkPending();
+
+  // Perform a depth-first traversal of the move graph to resolve
+  // dependencies. Any unperformed, unpending move with a source the same
+  // as this one's destination blocks this one so recursively perform all
+  // such moves.
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    const MoveOperands& other_move = *moves_.Get(i);
+    if (other_move.Blocks(destination) && !other_move.IsPending()) {
+      PerformMove(i);
+    }
+  }
+
+  // We are about to resolve this move and don't need it marked as
+  // pending, so restore its destination.
+  move->ClearPending(destination);
+
+  // No one else should write to the move destination while it is pending.
+  DCHECK(!move->IsRedundant());
+
+  Location source = move->GetSource();
+  // The move may be blocked on several pending moves, in case we have a cycle.
+  if (IsBlockedByMoves(destination)) {
+    // For a cycle like (A -> B) (B -> C) (C -> A), we change it to the following
+    // sequence:
+    // (C -> scratch)     # Emit right now.
+    // (A -> B) (B -> C)  # Unblocked.
+    // (scratch -> A)     # Add to pending_moves_, blocked by (A -> B).
+    Location::Kind kind = source.GetKind();
+    DCHECK_NE(kind, Location::kConstant);
+    Location scratch = AllocateScratchLocationFor(kind);
+    // We only care about the move size.
+    Primitive::Type type = move->Is64BitMove() ? Primitive::kPrimLong : Primitive::kPrimInt;
+    // Perform (C -> scratch)
+    move->SetDestination(scratch);
+    EmitMove(index);
+    move->Eliminate();
+    UpdateMoveSource(source, scratch);
+    // Add (scratch -> A).
+    AddPendingMove(scratch, destination, type);
+  } else {
+    // This move is not blocked.
+    EmitMove(index);
+    move->Eliminate();
+    UpdateMoveSource(source, destination);
+  }
+
+  // Moves in the pending list should not block any other moves. But performing
+  // unblocked moves in the pending list can free scratch registers, so we do this
+  // as early as possible.
+  MoveOperands* pending_move;
+  while ((pending_move = GetUnblockedPendingMove(source)) != nullptr) {
+    Location pending_source = pending_move->GetSource();
+    Location pending_destination = pending_move->GetDestination();
+    // We do not depend on the pending move index, so just delete the move instead
+    // of eliminating it to keep the pending list clean.
+    DeletePendingMove(pending_move);
+    move->SetSource(pending_source);
+    move->SetDestination(pending_destination);
+    EmitMove(index);
+    move->Eliminate();
+    UpdateMoveSource(pending_source, pending_destination);
+    // Free any unblocked locations in the scratch location list.
+    for (size_t i = 0; i < scratches_.Size(); ++i) {
+      Location scratch = scratches_.Get(i);
+      // Only a scratch location overlapping with the performed move's source can be unblocked.
+      if (scratch.OverlapsWith(pending_source) && !IsBlockedByMoves(scratch)) {
+        FreeScratchLocation(pending_source);
+      }
+    }
+  }
+}
+
+void ParallelMoveResolverNoSwap::UpdateMoveSource(Location from, Location to) {
+  // This function is used to reduce the dependencies in the graph after
+  // (from -> to) has been performed. Since we ensure there is no move with the same
+  // destination, (to -> X) cannot be blocked while (from -> X) might still be
+  // blocked. Consider for example the moves (0 -> 1) (1 -> 2) (1 -> 3). After
+  // (1 -> 2) has been performed, the moves left are (0 -> 1) and (1 -> 3). There is
+  // a dependency between the two. If we update the source location from 1 to 2, we
+  // will get (0 -> 1) and (2 -> 3). There is no dependency between the two.
+  //
+  // This is not something we must do, but we can use fewer scratch locations with
+  // this trick. For example, we can avoid using additional scratch locations for
+  // moves (0 -> 1), (1 -> 2), (1 -> 0).
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    MoveOperands* move = moves_.Get(i);
+    if (move->GetSource().Equals(from)) {
+      move->SetSource(to);
+    }
+  }
+}
+
+void ParallelMoveResolverNoSwap::AddPendingMove(Location source,
+    Location destination, Primitive::Type type) {
+  pending_moves_.Add(new (allocator_) MoveOperands(source, destination, type, nullptr));
+}
+
+void ParallelMoveResolverNoSwap::DeletePendingMove(MoveOperands* move) {
+  pending_moves_.Delete(move);
+}
+
+MoveOperands* ParallelMoveResolverNoSwap::GetUnblockedPendingMove(Location loc) {
+  for (size_t i = 0; i < pending_moves_.Size(); ++i) {
+    MoveOperands* move = pending_moves_.Get(i);
+    Location destination = move->GetDestination();
+    // Only moves with destination overlapping with input loc can be unblocked.
+    if (destination.OverlapsWith(loc) && !IsBlockedByMoves(destination)) {
+      return move;
+    }
+  }
+  return nullptr;
+}
+
+bool ParallelMoveResolverNoSwap::IsBlockedByMoves(Location loc) {
+  for (size_t i = 0; i < pending_moves_.Size(); ++i) {
+    if (pending_moves_.Get(i)->Blocks(loc)) {
+      return true;
+    }
+  }
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    if (moves_.Get(i)->Blocks(loc)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// So far it is only used for debugging purposes to make sure all pending moves
+// have been performed.
+size_t ParallelMoveResolverNoSwap::GetNumberOfPendingMoves() {
+  return pending_moves_.Size();
+}
+
 }  // namespace art
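
A hand trace of the no-swap algorithm on the three-cycle (0 -> 1) (1 -> 2) (2 -> 0): PerformMove recurses until it detects the cycle, emits (2 -> T0) through a scratch, unblocks (1 -> 2) and (0 -> 1), and finally flushes the pending (T0 -> 0). That emitted sequence can be checked with plain ints standing in for locations:

    #include <cassert>

    int main() {
      // Values initially held by locations 0, 1, 2; index 3 is the scratch T0.
      int loc[4] = {10, 11, 12, 0};
      loc[3] = loc[2];  // (2 -> T0)  break the cycle through the scratch
      loc[2] = loc[1];  // (1 -> 2)   now unblocked
      loc[1] = loc[0];  // (0 -> 1)
      loc[0] = loc[3];  // (T0 -> 0)  the pending move, emitted once unblocked
      // Every location now holds its predecessor's value, as the parallel
      // move (0 -> 1) (1 -> 2) (2 -> 0) requires.
      assert(loc[1] == 10 && loc[2] == 11 && loc[0] == 12);
      return 0;
    }
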
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index 95f8ad5..e89417d 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -19,30 +19,47 @@
 
 #include "base/value_object.h"
 #include "utils/growable_array.h"
+#include "locations.h"
 
 namespace art {
 
 class HParallelMove;
-class Location;
 class MoveOperands;
 
-/**
- * Helper class to resolve a set of parallel moves. Architecture dependent code
- * generator must have their own subclass that implements the `EmitMove` and `EmitSwap`
- * operations.
- */
+// Helper classes to resolve a set of parallel moves. Architecture-dependent code generators must
+// have their own subclass that implements the corresponding virtual functions.
 class ParallelMoveResolver : public ValueObject {
  public:
   explicit ParallelMoveResolver(ArenaAllocator* allocator) : moves_(allocator, 32) {}
   virtual ~ParallelMoveResolver() {}
 
   // Resolve a set of parallel moves, emitting assembler instructions.
-  void EmitNativeCode(HParallelMove* parallel_move);
+  virtual void EmitNativeCode(HParallelMove* parallel_move) = 0;
+
+ protected:
+  // Build the initial list of moves.
+  void BuildInitialMoveList(HParallelMove* parallel_move);
+
+  GrowableArray<MoveOperands*> moves_;
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolver);
+};
+
+// This helper class uses swaps to resolve dependencies and may emit swap instructions.
+class ParallelMoveResolverWithSwap : public ParallelMoveResolver {
+ public:
+  explicit ParallelMoveResolverWithSwap(ArenaAllocator* allocator)
+      : ParallelMoveResolver(allocator) {}
+  virtual ~ParallelMoveResolverWithSwap() {}
+
+  // Resolve a set of parallel moves, emitting assembler instructions.
+  void EmitNativeCode(HParallelMove* parallel_move) OVERRIDE;
 
  protected:
   class ScratchRegisterScope : public ValueObject {
    public:
-    ScratchRegisterScope(ParallelMoveResolver* resolver,
+    ScratchRegisterScope(ParallelMoveResolverWithSwap* resolver,
                          int blocked,
                          int if_scratch,
                          int number_of_registers);
@@ -52,11 +69,12 @@
     bool IsSpilled() const { return spilled_; }
 
    private:
-    ParallelMoveResolver* resolver_;
+    ParallelMoveResolverWithSwap* resolver_;
     int reg_;
     bool spilled_;
   };
 
+  // Return true if the location can be scratched.
   bool IsScratchLocation(Location loc);
 
   // Allocate a scratch register for performing a move. The method will try to use
@@ -72,15 +90,9 @@
   virtual void SpillScratch(int reg) = 0;
   virtual void RestoreScratch(int reg) = 0;
 
-  // List of moves not yet resolved.
-  GrowableArray<MoveOperands*> moves_;
-
   static constexpr int kNoRegister = -1;
 
  private:
-  // Build the initial list of moves.
-  void BuildInitialMoveList(HParallelMove* parallel_move);
-
   // Perform the move at the moves_ index in question (possibly requiring
   // other moves to satisfy dependencies).
   //
@@ -99,7 +111,83 @@
   //    the right value.
   MoveOperands* PerformMove(size_t index);
 
-  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolver);
+  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverWithSwap);
+};
+
+// This helper class uses additional scratch registers to resolve dependencies. It supports all
+// kinds of dependency cycles and does not care about the register layout.
+class ParallelMoveResolverNoSwap : public ParallelMoveResolver {
+ public:
+  explicit ParallelMoveResolverNoSwap(ArenaAllocator* allocator)
+      : ParallelMoveResolver(allocator), scratches_(allocator, 32),
+        pending_moves_(allocator, 8), allocator_(allocator) {}
+  virtual ~ParallelMoveResolverNoSwap() {}
+
+  // Resolve a set of parallel moves, emitting assembler instructions.
+  void EmitNativeCode(HParallelMove* parallel_move) OVERRIDE;
+
+ protected:
+  // Called at the beginning of EmitNativeCode(). A subclass may put some architecture dependent
+  // initialization here.
+  virtual void PrepareForEmitNativeCode() = 0;
+
+  // Called at the end of EmitNativeCode(). A subclass may put some architecture dependent cleanup
+  // here. All scratch locations will be removed after this call.
+  virtual void FinishEmitNativeCode() = 0;
+
+  // Allocate a scratch location to perform a move of the given kind. A subclass should
+  // implement this to return the best-fit location. If there is no suitable physical register,
+  // it can also return a stack slot.
+  virtual Location AllocateScratchLocationFor(Location::Kind kind) = 0;
+
+  // Called after a move which takes a scratch location as source. A subclass can defer the cleanup
+  // to FinishEmitNativeCode().
+  virtual void FreeScratchLocation(Location loc) = 0;
+
+  // Emit a move.
+  virtual void EmitMove(size_t index) = 0;
+
+  // Return a scratch location from the moves which exactly matches the kind.
+  // Return Location::NoLocation() if no matching scratch location can be found.
+  Location GetScratchLocation(Location::Kind kind);
+
+  // Add a location to the scratch list which can be returned from GetScratchLocation() to resolve
+  // dependency cycles.
+  void AddScratchLocation(Location loc);
+
+  // Remove a location from the scratch list.
+  void RemoveScratchLocation(Location loc);
+
+  // List of scratch locations.
+  GrowableArray<Location> scratches_;
+
+ private:
+  // Perform the move at the given index in `moves_` (possibly requiring other moves to satisfy
+  // dependencies).
+  void PerformMove(size_t index);
+
+  void UpdateMoveSource(Location from, Location to);
+
+  void AddPendingMove(Location source, Location destination, Primitive::Type type);
+
+  void DeletePendingMove(MoveOperands* move);
+
+  // Find a move that may be unblocked after (loc -> XXX) is performed.
+  MoveOperands* GetUnblockedPendingMove(Location loc);
+
+  // Return true if the location is blocked by outstanding moves.
+  bool IsBlockedByMoves(Location loc);
+
+  // Return the number of pending moves.
+  size_t GetNumberOfPendingMoves();
+
+  // Additional pending moves which might be added to resolve dependency cycles.
+  GrowableArray<MoveOperands*> pending_moves_;
+
+  // Used to allocate pending MoveOperands.
+  ArenaAllocator* const allocator_;
+
+  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverNoSwap);
 };
 
 }  // namespace art
diff --git a/compiler/optimizing/parallel_move_test.cc b/compiler/optimizing/parallel_move_test.cc
index 95cca51..f8f7010 100644
--- a/compiler/optimizing/parallel_move_test.cc
+++ b/compiler/optimizing/parallel_move_test.cc
@@ -19,27 +19,41 @@
 #include "parallel_move_resolver.h"
 
 #include "gtest/gtest.h"
+#include "gtest/gtest-typed-test.h"
 
 namespace art {
 
-class TestParallelMoveResolver : public ParallelMoveResolver {
- public:
-  explicit TestParallelMoveResolver(ArenaAllocator* allocator) : ParallelMoveResolver(allocator) {}
+constexpr int kScratchRegisterStartIndexForTest = 100;
 
-  void Dump(Location location) {
-    if (location.IsConstant()) {
-      message_ << "C";
-    } else if (location.IsPair()) {
-      message_ << location.low() << "," << location.high();
-    } else if (location.IsRegister()) {
-      message_ << location.reg();
-    } else if (location.IsStackSlot()) {
-      message_ << location.GetStackIndex() << "(sp)";
-    } else {
-      message_ << "2x" << location.GetStackIndex() << "(sp)";
-      DCHECK(location.IsDoubleStackSlot()) << location;
-    }
+static void DumpRegisterForTest(std::ostream& os, int reg) {
+  if (reg >= kScratchRegisterStartIndexForTest) {
+    os << "T" << reg - kScratchRegisterStartIndexForTest;
+  } else {
+    os << reg;
   }
+}
+
+static void DumpLocationForTest(std::ostream& os, Location location) {
+  if (location.IsConstant()) {
+    os << "C";
+  } else if (location.IsPair()) {
+    DumpRegisterForTest(os, location.low());
+    os << ",";
+    DumpRegisterForTest(os, location.high());
+  } else if (location.IsRegister()) {
+    DumpRegisterForTest(os, location.reg());
+  } else if (location.IsStackSlot()) {
+    os << location.GetStackIndex() << "(sp)";
+  } else {
+    DCHECK(location.IsDoubleStackSlot()) << location;
+    os << "2x" << location.GetStackIndex() << "(sp)";
+  }
+}
+
+class TestParallelMoveResolverWithSwap : public ParallelMoveResolverWithSwap {
+ public:
+  explicit TestParallelMoveResolverWithSwap(ArenaAllocator* allocator)
+      : ParallelMoveResolverWithSwap(allocator) {}
 
   void EmitMove(size_t index) OVERRIDE {
     MoveOperands* move = moves_.Get(index);
@@ -47,9 +61,9 @@
       message_ << " ";
     }
     message_ << "(";
-    Dump(move->GetSource());
+    DumpLocationForTest(message_, move->GetSource());
     message_ << " -> ";
-    Dump(move->GetDestination());
+    DumpLocationForTest(message_, move->GetDestination());
     message_ << ")";
   }
 
@@ -59,9 +73,9 @@
       message_ << " ";
     }
     message_ << "(";
-    Dump(move->GetSource());
+    DumpLocationForTest(message_, move->GetSource());
     message_ << " <-> ";
-    Dump(move->GetDestination());
+    DumpLocationForTest(message_, move->GetDestination());
     message_ << ")";
   }
 
@@ -76,7 +90,64 @@
   std::ostringstream message_;
 
 
-  DISALLOW_COPY_AND_ASSIGN(TestParallelMoveResolver);
+  DISALLOW_COPY_AND_ASSIGN(TestParallelMoveResolverWithSwap);
+};
+
+class TestParallelMoveResolverNoSwap : public ParallelMoveResolverNoSwap {
+ public:
+  explicit TestParallelMoveResolverNoSwap(ArenaAllocator* allocator)
+      : ParallelMoveResolverNoSwap(allocator), scratch_index_(kScratchRegisterStartIndexForTest) {}
+
+  void PrepareForEmitNativeCode() OVERRIDE {
+    scratch_index_ = kScratchRegisterStartIndexForTest;
+  }
+
+  void FinishEmitNativeCode() OVERRIDE {}
+
+  Location AllocateScratchLocationFor(Location::Kind kind) OVERRIDE {
+    if (kind == Location::kStackSlot || kind == Location::kFpuRegister ||
+        kind == Location::kRegister) {
+      kind = Location::kRegister;
+    } else {
+      // Allocate a register pair for a double stack slot, simulating a 32-bit backend.
+      kind = Location::kRegisterPair;
+    }
+    Location scratch = GetScratchLocation(kind);
+    if (scratch.Equals(Location::NoLocation())) {
+      AddScratchLocation(Location::RegisterLocation(scratch_index_));
+      AddScratchLocation(Location::RegisterLocation(scratch_index_ + 1));
+      AddScratchLocation(Location::RegisterPairLocation(scratch_index_, scratch_index_ + 1));
+      scratch = (kind == Location::kRegister) ? Location::RegisterLocation(scratch_index_)
+          : Location::RegisterPairLocation(scratch_index_, scratch_index_ + 1);
+      scratch_index_ += 2;
+    }
+    return scratch;
+  }
+
+  void FreeScratchLocation(Location loc ATTRIBUTE_UNUSED) OVERRIDE {}
+
+  void EmitMove(size_t index) OVERRIDE {
+    MoveOperands* move = moves_.Get(index);
+    if (!message_.str().empty()) {
+      message_ << " ";
+    }
+    message_ << "(";
+    DumpLocationForTest(message_, move->GetSource());
+    message_ << " -> ";
+    DumpLocationForTest(message_, move->GetDestination());
+    message_ << ")";
+  }
+
+  std::string GetMessage() const {
+    return message_.str();
+  }
+
+ private:
+  std::ostringstream message_;
+
+  int scratch_index_;
+
+  DISALLOW_COPY_AND_ASSIGN(TestParallelMoveResolverNoSwap);
 };
 
 static HParallelMove* BuildParallelMove(ArenaAllocator* allocator,
@@ -93,55 +164,102 @@
   return moves;
 }
 
-TEST(ParallelMoveTest, Dependency) {
+template <typename T>
+class ParallelMoveTest : public ::testing::Test {
+ public:
+  static const bool has_swap;
+};
+
+template<> const bool ParallelMoveTest<TestParallelMoveResolverWithSwap>::has_swap = true;
+template<> const bool ParallelMoveTest<TestParallelMoveResolverNoSwap>::has_swap = false;
+
+typedef ::testing::Types<TestParallelMoveResolverWithSwap, TestParallelMoveResolverNoSwap>
+    ParallelMoveResolverTestTypes;
+
+TYPED_TEST_CASE(ParallelMoveTest, ParallelMoveResolverTestTypes);
+
+TYPED_TEST(ParallelMoveTest, Dependency) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     static constexpr size_t moves[][2] = {{0, 1}, {1, 2}};
     resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
-    ASSERT_STREQ("(1 -> 2) (0 -> 1)", resolver.GetMessage().c_str());
+    // Both resolvers emit the same sequence here: there is no cycle, so
+    // neither a swap nor a scratch register is needed.
+    ASSERT_STREQ("(1 -> 2) (0 -> 1)", resolver.GetMessage().c_str());
   }
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {2, 3}, {1, 4}};
     resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
-    ASSERT_STREQ("(2 -> 3) (1 -> 2) (1 -> 4) (0 -> 1)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(2 -> 3) (1 -> 2) (1 -> 4) (0 -> 1)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(2 -> 3) (1 -> 2) (0 -> 1) (2 -> 4)", resolver.GetMessage().c_str());
+    }
   }
 }
 
-TEST(ParallelMoveTest, Swap) {
+TYPED_TEST(ParallelMoveTest, Cycle) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     static constexpr size_t moves[][2] = {{0, 1}, {1, 0}};
     resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
-    ASSERT_STREQ("(1 <-> 0)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(1 <-> 0)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(1 -> T0) (0 -> 1) (T0 -> 0)", resolver.GetMessage().c_str());
+    }
   }
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {1, 0}};
     resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
-    ASSERT_STREQ("(1 -> 2) (1 <-> 0)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(1 -> 2) (1 <-> 0)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(1 -> 2) (0 -> 1) (2 -> 0)", resolver.GetMessage().c_str());
+    }
   }
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
+    static constexpr size_t moves[][2] = {{0, 1}, {1, 0}, {0, 2}};
+    resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0 -> 2) (1 <-> 0)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(0 -> 2) (1 -> 0) (2 -> 1)", resolver.GetMessage().c_str());
+    }
+  }
+
+  {
+    TypeParam resolver(&allocator);
     static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 0}};
     resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
-    ASSERT_STREQ("(4 <-> 0) (3 <-> 4) (2 <-> 3) (1 <-> 2)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(4 <-> 0) (3 <-> 4) (2 <-> 3) (1 <-> 2)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(4 -> T0) (3 -> 4) (2 -> 3) (1 -> 2) (0 -> 1) (T0 -> 0)",
+          resolver.GetMessage().c_str());
+    }
   }
 }
 
-TEST(ParallelMoveTest, ConstantLast) {
+TYPED_TEST(ParallelMoveTest, ConstantLast) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
-  TestParallelMoveResolver resolver(&allocator);
+  TypeParam resolver(&allocator);
   HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
   moves->AddMove(
       Location::ConstantLocation(new (&allocator) HIntConstant(0)),
@@ -157,12 +275,12 @@
   ASSERT_STREQ("(1 -> 2) (C -> 0)", resolver.GetMessage().c_str());
 }
 
-TEST(ParallelMoveTest, Pairs) {
+TYPED_TEST(ParallelMoveTest, Pairs) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterLocation(2),
@@ -179,7 +297,7 @@
   }
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterPairLocation(0, 1),
@@ -196,7 +314,7 @@
   }
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterPairLocation(0, 1),
@@ -209,10 +327,14 @@
         Primitive::kPrimInt,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(2 -> T0) (0,1 -> 2,3) (T0 -> 0)", resolver.GetMessage().c_str());
+    }
   }
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterLocation(2),
@@ -230,10 +352,15 @@
         Primitive::kPrimLong,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(0,1 -> T0,T1) (7 -> 1) (2 -> 7) (T0,T1 -> 2,3)",
+          resolver.GetMessage().c_str());
+    }
   }
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterLocation(2),
@@ -251,10 +378,15 @@
         Primitive::kPrimInt,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(0,1 -> T0,T1) (7 -> 1) (2 -> 7) (T0,T1 -> 2,3)",
+          resolver.GetMessage().c_str());
+    }
   }
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterPairLocation(0, 1),
@@ -272,10 +404,14 @@
         Primitive::kPrimInt,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0,1 <-> 2,3) (7 -> 1) (0 -> 7)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(7 -> T0) (2 -> 7) (0,1 -> 2,3) (T0 -> 1)", resolver.GetMessage().c_str());
+    }
   }
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterPairLocation(0, 1),
@@ -288,10 +424,14 @@
         Primitive::kPrimLong,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(2,3 <-> 0,1)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(2,3 <-> 0,1)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(2,3 -> T0,T1) (0,1 -> 2,3) (T0,T1 -> 0,1)", resolver.GetMessage().c_str());
+    }
   }
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterPairLocation(2, 3),
@@ -304,12 +444,85 @@
         Primitive::kPrimLong,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(0,1 -> T0,T1) (2,3 -> 0,1) (T0,T1 -> 2,3)", resolver.GetMessage().c_str());
+    }
+  }
+}
+
+TYPED_TEST(ParallelMoveTest, MultiCycles) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  {
+    TypeParam resolver(&allocator);
+    static constexpr size_t moves[][2] = {{0, 1}, {1, 0}, {2, 3}, {3, 2}};
+    resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(1 <-> 0) (3 <-> 2)",  resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(1 -> T0) (0 -> 1) (T0 -> 0) (3 -> T0) (2 -> 3) (T0 -> 2)",
+          resolver.GetMessage().c_str());
+    }
+  }
+  {
+    TypeParam resolver(&allocator);
+    HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+    moves->AddMove(
+        Location::RegisterPairLocation(0, 1),
+        Location::RegisterPairLocation(2, 3),
+        Primitive::kPrimLong,
+        nullptr);
+    moves->AddMove(
+        Location::RegisterLocation(2),
+        Location::RegisterLocation(0),
+        Primitive::kPrimInt,
+        nullptr);
+    moves->AddMove(
+        Location::RegisterLocation(3),
+        Location::RegisterLocation(1),
+        Primitive::kPrimInt,
+        nullptr);
+    resolver.EmitNativeCode(moves);
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(2 -> T0) (3 -> T1) (0,1 -> 2,3) (T0 -> 0) (T1 -> 1)",
+          resolver.GetMessage().c_str());
+    }
+  }
+  {
+    TypeParam resolver(&allocator);
+    HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
+    moves->AddMove(
+        Location::RegisterLocation(2),
+        Location::RegisterLocation(0),
+        Primitive::kPrimInt,
+        nullptr);
+    moves->AddMove(
+        Location::RegisterLocation(3),
+        Location::RegisterLocation(1),
+        Primitive::kPrimInt,
+        nullptr);
+    moves->AddMove(
+        Location::RegisterPairLocation(0, 1),
+        Location::RegisterPairLocation(2, 3),
+        Primitive::kPrimLong,
+        nullptr);
+    resolver.EmitNativeCode(moves);
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0,1 <-> 2,3)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(3 -> T0) (0,1 -> T2,T3) (T0 -> 1) (2 -> 0) (T2,T3 -> 2,3)",
+          resolver.GetMessage().c_str());
+    }
   }
 
   {
     // Test involving registers used in single context and pair context.
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterLocation(10),
@@ -327,17 +540,22 @@
         Primitive::kPrimLong,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(2x32(sp) <-> 10,11) (4,5 <-> 2x32(sp)) (4 -> 5)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(2x32(sp) <-> 10,11) (4,5 <-> 2x32(sp)) (4 -> 5)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(2x32(sp) -> T0,T1) (4,5 -> 2x32(sp)) (10 -> 5) (T0,T1 -> 10,11)",
+          resolver.GetMessage().c_str());
+    }
   }
 }
 
 // Test that we do 64bits moves before 32bits moves.
-TEST(ParallelMoveTest, CyclesWith64BitsMoves) {
+TYPED_TEST(ParallelMoveTest, CyclesWith64BitsMoves) {
   ArenaPool pool;
   ArenaAllocator allocator(&pool);
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterLocation(0),
@@ -355,11 +573,16 @@
         Primitive::kPrimInt,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(0 <-> 1) (48(sp) <-> 0)", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(0 <-> 1) (48(sp) <-> 0)", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(48(sp) -> T0) (1 -> 48(sp)) (0 -> 1) (T0 -> 0)",
+          resolver.GetMessage().c_str());
+    }
   }
 
   {
-    TestParallelMoveResolver resolver(&allocator);
+    TypeParam resolver(&allocator);
     HParallelMove* moves = new (&allocator) HParallelMove(&allocator);
     moves->AddMove(
         Location::RegisterPairLocation(0, 1),
@@ -377,7 +600,12 @@
         Primitive::kPrimLong,
         nullptr);
     resolver.EmitNativeCode(moves);
-    ASSERT_STREQ("(2x32(sp) <-> 0,1) (2,3 <-> 2x32(sp))", resolver.GetMessage().c_str());
+    if (TestFixture::has_swap) {
+      ASSERT_STREQ("(2x32(sp) <-> 0,1) (2,3 <-> 2x32(sp))", resolver.GetMessage().c_str());
+    } else {
+      ASSERT_STREQ("(2x32(sp) -> T0,T1) (2,3 -> 2x32(sp)) (0,1 -> 2,3) (T0,T1 -> 0,1)",
+          resolver.GetMessage().c_str());
+    }
   }
 }
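
A parallel move resolver must order the moves of an HParallelMove so that no
source is clobbered before it is read, and a cycle such as {0 -> 1, 1 -> 0}
can only be broken with a swap or a scratch register. The following
standalone sketch (plain C++ with invented names, not ART code) shows the two
strategies the expectations above encode:

#include <cstdio>
#include <utility>

int main() {
  int regs[2] = {10, 20};

  // WithSwap strategy: break the cycle r0 <-> r1 with a single swap.
  std::swap(regs[0], regs[1]);
  std::printf("after swap:    r0=%d r1=%d\n", regs[0], regs[1]);

  // NoSwap strategy: route one value through a scratch temporary T0,
  // matching the expected sequence "(1 -> T0) (0 -> 1) (T0 -> 0)".
  int t0 = regs[1];   // (1 -> T0)
  regs[1] = regs[0];  // (0 -> 1)
  regs[0] = t0;       // (T0 -> 0)
  std::printf("after scratch: r0=%d r1=%d\n", regs[0], regs[1]);
  return 0;
}

This is also why TestParallelMoveResolverNoSwap hands out scratch registers
starting at kScratchRegisterStartIndexForTest: indices of 100 and above print
as T0, T1, ... and cannot collide with the small register numbers the tests
use.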
 
diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc
index 479b87f..de6941c 100644
--- a/compiler/optimizing/reference_type_propagation.cc
+++ b/compiler/optimizing/reference_type_propagation.cc
@@ -58,11 +58,11 @@
 }
 
 void ReferenceTypePropagation::BoundTypeForIfNotNull(HBasicBlock* block) {
-  HInstruction* lastInstruction = block->GetLastInstruction();
-  if (!lastInstruction->IsIf()) {
+  HIf* ifInstruction = block->GetLastInstruction()->AsIf();
+  if (ifInstruction == nullptr) {
     return;
   }
-  HInstruction* ifInput = lastInstruction->InputAt(0);
+  HInstruction* ifInput = ifInstruction->InputAt(0);
   if (!ifInput->IsNotEqual() && !ifInput->IsEqual()) {
     return;
   }
@@ -78,16 +78,20 @@
     return;
   }
 
-  HBoundType* bound_type =
-      new (graph_->GetArena()) HBoundType(obj, ReferenceTypeInfo::CreateTop(false));
-
-  block->InsertInstructionBefore(bound_type, lastInstruction);
+  // We only need to bound the type if we have uses in the relevant block.
+  // So start with null and create the HBoundType lazily, only if it's needed.
+  HBoundType* bound_type = nullptr;
   HBasicBlock* notNullBlock = ifInput->IsNotEqual()
-      ? lastInstruction->AsIf()->IfTrueSuccessor()
-      : lastInstruction->AsIf()->IfFalseSuccessor();
+      ? ifInstruction->IfTrueSuccessor()
+      : ifInstruction->IfFalseSuccessor();
+
   for (HUseIterator<HInstruction*> it(obj->GetUses()); !it.Done(); it.Advance()) {
     HInstruction* user = it.Current()->GetUser();
     if (notNullBlock->Dominates(user->GetBlock())) {
+      if (bound_type == nullptr) {
+        bound_type = new (graph_->GetArena()) HBoundType(obj, ReferenceTypeInfo::CreateTop(false));
+        notNullBlock->InsertInstructionBefore(bound_type, notNullBlock->GetFirstInstruction());
+      }
       user->ReplaceInput(bound_type, it.Current()->GetIndex());
     }
   }
@@ -98,49 +102,58 @@
 // If that's the case insert an HBoundType instruction to bound the type of `x`
 // to `ClassX` in the scope of the dominated blocks.
 void ReferenceTypePropagation::BoundTypeForIfInstanceOf(HBasicBlock* block) {
-  HInstruction* lastInstruction = block->GetLastInstruction();
-  if (!lastInstruction->IsIf()) {
+  HIf* ifInstruction = block->GetLastInstruction()->AsIf();
+  if (ifInstruction == nullptr) {
     return;
   }
-  HInstruction* ifInput = lastInstruction->InputAt(0);
-  // TODO: Handle more patterns here: HIf(bool) HIf(HNotEqual).
-  if (!ifInput->IsEqual()) {
+  HInstruction* ifInput = ifInstruction->InputAt(0);
+  HInstruction* instanceOf = nullptr;
+  HBasicBlock* instanceOfTrueBlock = nullptr;
+
+  // The instruction simplifier has transformed:
+  //   - `if (a instanceof A)` into an HIf with an HInstanceOf input
+  //   - `if (!(a instanceof A))` into an HIf with an HBooleanNot input (which in turn
+  //     has an HInstanceOf input)
+  // So we should not see the usual HEqual here.
+  if (ifInput->IsInstanceOf()) {
+    instanceOf = ifInput;
+    instanceOfTrueBlock = ifInstruction->IfTrueSuccessor();
+  } else if (ifInput->IsBooleanNot() && ifInput->InputAt(0)->IsInstanceOf()) {
+    instanceOf = ifInput->InputAt(0);
+    instanceOfTrueBlock = ifInstruction->IfFalseSuccessor();
+  } else {
     return;
   }
-  HInstruction* instanceOf = ifInput->InputAt(0);
-  HInstruction* comp_value = ifInput->InputAt(1);
-  if (!instanceOf->IsInstanceOf() || !comp_value->IsIntConstant()) {
-    return;
-  }
+
+  // We only need to bound the type if we have uses in the relevant block.
+  // So start with null and create the HBoundType lazily, only if it's needed.
+  HBoundType* bound_type = nullptr;
 
   HInstruction* obj = instanceOf->InputAt(0);
-  HLoadClass* load_class = instanceOf->InputAt(1)->AsLoadClass();
-
-  ReferenceTypeInfo obj_rti = obj->GetReferenceTypeInfo();
-  ReferenceTypeInfo class_rti = load_class->GetLoadedClassRTI();
-  HBoundType* bound_type = new (graph_->GetArena()) HBoundType(obj, class_rti);
-
-  // Narrow the type as much as possible.
-  {
-    ScopedObjectAccess soa(Thread::Current());
-    if (!load_class->IsResolved() || class_rti.IsSupertypeOf(obj_rti)) {
-      bound_type->SetReferenceTypeInfo(obj_rti);
-    } else {
-      bound_type->SetReferenceTypeInfo(
-          ReferenceTypeInfo::Create(class_rti.GetTypeHandle(), /* is_exact */ false));
-    }
-  }
-
-  block->InsertInstructionBefore(bound_type, lastInstruction);
-  // Pick the right successor based on the value we compare against.
-  HIntConstant* comp_value_int = comp_value->AsIntConstant();
-  HBasicBlock* instanceOfTrueBlock = comp_value_int->GetValue() == 0
-      ? lastInstruction->AsIf()->IfFalseSuccessor()
-      : lastInstruction->AsIf()->IfTrueSuccessor();
-
   for (HUseIterator<HInstruction*> it(obj->GetUses()); !it.Done(); it.Advance()) {
     HInstruction* user = it.Current()->GetUser();
     if (instanceOfTrueBlock->Dominates(user->GetBlock())) {
+      if (bound_type == nullptr) {
+        HLoadClass* load_class = instanceOf->InputAt(1)->AsLoadClass();
+
+        ReferenceTypeInfo obj_rti = obj->GetReferenceTypeInfo();
+        ReferenceTypeInfo class_rti = load_class->GetLoadedClassRTI();
+        bound_type = new (graph_->GetArena()) HBoundType(obj, class_rti);
+
+        // Narrow the type as much as possible.
+        {
+          ScopedObjectAccess soa(Thread::Current());
+          if (!load_class->IsResolved() || class_rti.IsSupertypeOf(obj_rti)) {
+            bound_type->SetReferenceTypeInfo(obj_rti);
+          } else {
+            bound_type->SetReferenceTypeInfo(
+                ReferenceTypeInfo::Create(class_rti.GetTypeHandle(), /* is_exact */ false));
+          }
+        }
+
+        instanceOfTrueBlock->InsertInstructionBefore(
+            bound_type, instanceOfTrueBlock->GetFirstInstruction());
+      }
       user->ReplaceInput(bound_type, it.Current()->GetIndex());
     }
   }
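
Both BoundTypeForIfNotNull and BoundTypeForIfInstanceOf now follow the same
pattern: the HBoundType is created at most once, and only when a use dominated
by the relevant successor block is actually found. A minimal sketch of that
lazy-creation pattern (hypothetical Node type, not ART's IR):

#include <cstdio>
#include <vector>

struct Node { int id; };  // stand-in for an IR instruction

int main() {
  std::vector<Node> uses = {{1}, {2}, {3}};
  auto dominated = [](const Node& n) { return n.id % 2 == 1; };

  Node storage{99};
  Node* bound = nullptr;  // like bound_type: starts null
  for (Node& use : uses) {
    if (!dominated(use)) {
      continue;  // uses outside the dominated block keep the original input
    }
    if (bound == nullptr) {
      bound = &storage;  // created and inserted only on the first relevant use
      std::printf("created bound node\n");
    }
    std::printf("rewired use %d -> node %d\n", use.id, bound->id);
  }
  return 0;
}

Besides avoiding dead instructions, inserting the HBoundType at the head of
the successor block rather than before the HIf keeps it off the path where
the condition does not hold.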
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index 6350b35..f8e00f6 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -903,6 +903,10 @@
     return false;
   }
 
+  // We use the first use to compare with other intervals. If this interval
+  // is used after any active intervals, we will spill this interval.
+  size_t first_use = current->FirstUseAfter(current->GetStart());
+
   // First set all registers as not being used.
   size_t* next_use = registers_array_;
   for (size_t i = 0; i < number_of_registers_; ++i) {
@@ -917,7 +921,7 @@
     if (active->IsFixed()) {
       next_use[active->GetRegister()] = current->GetStart();
     } else {
-      size_t use = active->FirstRegisterUseAfter(current->GetStart());
+      size_t use = active->FirstUseAfter(current->GetStart());
       if (use != kNoLifetime) {
         next_use[active->GetRegister()] = use;
       }
@@ -945,7 +949,7 @@
         next_use[inactive->GetRegister()] =
             std::min(next_intersection, next_use[inactive->GetRegister()]);
       } else {
-        size_t use = inactive->FirstRegisterUseAfter(current->GetStart());
+        size_t use = inactive->FirstUseAfter(current->GetStart());
         if (use != kNoLifetime) {
           next_use[inactive->GetRegister()] = std::min(use, next_use[inactive->GetRegister()]);
         }
@@ -959,16 +963,16 @@
     DCHECK(current->IsHighInterval());
     reg = current->GetRegister();
     // When allocating the low part, we made sure the high register was available.
-    DCHECK_LT(first_register_use, next_use[reg]);
+    DCHECK_LT(first_use, next_use[reg]);
   } else if (current->IsLowInterval()) {
     reg = FindAvailableRegisterPair(next_use, first_register_use);
     // We should spill if both registers are not available.
-    should_spill = (first_register_use >= next_use[reg])
-      || (first_register_use >= next_use[GetHighForLowRegister(reg)]);
+    should_spill = (first_use >= next_use[reg])
+      || (first_use >= next_use[GetHighForLowRegister(reg)]);
   } else {
     DCHECK(!current->IsHighInterval());
     reg = FindAvailableRegister(next_use);
-    should_spill = (first_register_use >= next_use[reg]);
+    should_spill = (first_use >= next_use[reg]);
   }
 
   DCHECK_NE(reg, kNoRegister);
@@ -998,10 +1002,12 @@
         DumpInterval(std::cerr, current);
         DumpAllIntervals(std::cerr);
         // This situation has the potential to infinite loop, so we make it a non-debug CHECK.
+        HInstruction* at = liveness_.GetInstructionFromPosition(first_register_use / 2);
         CHECK(false) << "There is not enough registers available for "
           << split->GetParent()->GetDefinedBy()->DebugName() << " "
           << split->GetParent()->GetDefinedBy()->GetId()
-          << " at " << first_register_use - 1;
+          << " at " << first_register_use - 1 << " "
+          << (at == nullptr ? "" : at->DebugName());
       }
       AddSorted(unhandled_, split);
     }
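
The register allocator now compares intervals by their first use of any kind
(FirstUseAfter) rather than their first register use when deciding whether to
spill. The heuristic itself, reduced to a standalone toy (invented data, not
ART's LiveInterval API):

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Position of the current interval's first use.
  size_t first_use = 30;
  // Next-use position recorded for each candidate register.
  std::vector<size_t> next_use = {12, 45, 20};

  // Pick the register whose next use is farthest away...
  size_t reg = static_cast<size_t>(
      std::max_element(next_use.begin(), next_use.end()) - next_use.begin());
  // ...and spill the current interval if even that register is needed
  // again before the current interval's own first use.
  bool should_spill = first_use >= next_use[reg];

  std::printf("chose r%zu (next use %zu), spill current: %s\n",
              reg, next_use[reg], should_spill ? "yes" : "no");
  return 0;
}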
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index 8eb98a1..03f5545 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -131,6 +131,9 @@
 
   void Dump(std::ostream& stream) const {
     stream << position_;
+    if (is_environment_) {
+      stream << " (env)";
+    }
   }
 
   UsePosition* Dup(ArenaAllocator* allocator) const {
@@ -366,6 +369,10 @@
 
   LiveInterval* GetParent() const { return parent_; }
 
+  // Returns whether this interval is the parent interval, that is, the interval
+  // that starts where the HInstruction is defined.
+  bool IsParent() const { return parent_ == this; }
+
   LiveRange* GetFirstRange() const { return first_range_; }
   LiveRange* GetLastRange() const { return last_range_; }
 
@@ -442,7 +449,7 @@
     if (is_temp_) {
       return position == GetStart() ? position : kNoLifetime;
     }
-    if (position == GetStart() && defined_by_ != nullptr) {
+    if (position == GetStart() && IsParent()) {
       LocationSummary* locations = defined_by_->GetLocations();
       Location location = locations->Out();
       // This interval is the first interval of the instruction. If the output
@@ -491,12 +498,19 @@
       return position == GetStart() ? position : kNoLifetime;
     }
 
+    if (position == GetStart() && IsParent()) {
+      if (defined_by_->GetLocations()->Out().IsValid()) {
+        return position;
+      }
+    }
+
     UsePosition* use = first_use_;
     size_t end = GetEnd();
     while (use != nullptr && use->GetPosition() <= end) {
       if (!use->GetIsEnvironment()) {
+        Location location = use->GetUser()->GetLocations()->InAt(use->GetInputIndex());
         size_t use_position = use->GetPosition();
-        if (use_position > position) {
+        if (use_position > position && location.IsValid()) {
           return use_position;
         }
       }
@@ -725,7 +739,7 @@
   }
 
   void AddHighInterval(bool is_temp = false) {
-    DCHECK_EQ(GetParent(), this);
+    DCHECK(IsParent());
     DCHECK(!HasHighInterval());
     DCHECK(!HasLowInterval());
     high_or_low_interval_ = new (allocator_) LiveInterval(
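
A toy model of the amended FirstUseAfter, with the two changes above spelled
out (invented Use struct; uses assumed ordered by position, as in the real
use list):

#include <cstdio>
#include <vector>

struct Use { size_t position; bool is_environment; bool location_valid; };

static const size_t kNoLifetime = static_cast<size_t>(-1);

size_t FirstUseAfter(const std::vector<Use>& uses, size_t position,
                     size_t start, bool defines_output) {
  // New: a parent interval whose instruction produces a valid output
  // counts as a use at its own start position.
  if (position == start && defines_output) {
    return position;
  }
  for (const Use& use : uses) {
    // Environment uses were already skipped; the change additionally skips
    // uses whose input location is invalid, since they need no register.
    if (use.is_environment || !use.location_valid) {
      continue;
    }
    if (use.position > position) {
      return use.position;
    }
  }
  return kNoLifetime;
}

int main() {
  std::vector<Use> uses = {{4, true, true}, {6, false, false}, {8, false, true}};
  std::printf("first use after 2: %zu\n", FirstUseAfter(uses, 2, 0, false));  // 8
  return 0;
}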
diff --git a/compiler/utils/assembler_thumb_test.cc b/compiler/utils/assembler_thumb_test.cc
index a171e59..772fa9a 100644
--- a/compiler/utils/assembler_thumb_test.cc
+++ b/compiler/utils/assembler_thumb_test.cc
@@ -43,8 +43,6 @@
 static constexpr bool kPrintResults = false;
 #endif
 
-static const char* TOOL_PREFIX = "arm-linux-androideabi-";
-
 void SetAndroidData() {
   const char* data = getenv("ANDROID_DATA");
   if (data == nullptr) {
@@ -65,87 +63,6 @@
   return *s1 - *s2;
 }
 
-std::string GetAndroidToolsDir() {
-  std::string root;
-  const char* android_build_top = getenv("ANDROID_BUILD_TOP");
-  if (android_build_top != nullptr) {
-    root += android_build_top;
-  } else {
-    // Not set by build server, so default to current directory
-    char* cwd = getcwd(nullptr, 0);
-    setenv("ANDROID_BUILD_TOP", cwd, 1);
-    root += cwd;
-    free(cwd);
-  }
-
-  // Look for "prebuilts"
-  std::string toolsdir = root;
-  struct stat st;
-  while (toolsdir != "") {
-    std::string prebuilts = toolsdir + "/prebuilts";
-    if (stat(prebuilts.c_str(), &st) == 0) {
-       // Found prebuilts.
-       toolsdir += "/prebuilts/gcc/linux-x86/arm";
-       break;
-    }
-    // Not present, move up one dir.
-    size_t slash = toolsdir.rfind('/');
-    if (slash == std::string::npos) {
-      toolsdir = "";
-    } else {
-      toolsdir = toolsdir.substr(0, slash-1);
-    }
-  }
-  bool statok = stat(toolsdir.c_str(), &st) == 0;
-  if (!statok) {
-    return "";      // Use path.
-  }
-
-  DIR* dir = opendir(toolsdir.c_str());
-  if (dir == nullptr) {
-    return "";      // Use path.
-  }
-
-  struct dirent* entry;
-  std::string founddir;
-  double maxversion  = 0;
-
-  // Find the latest version of the arm-eabi tools (biggest version number).
-  // Suffix on toolsdir will be something like "arm-eabi-4.8"
-  while ((entry = readdir(dir)) != nullptr) {
-    std::string subdir = toolsdir + std::string("/") + std::string(entry->d_name);
-    size_t eabi = subdir.find(TOOL_PREFIX);
-    if (eabi != std::string::npos) {
-      // Check if "bin/{as,objcopy,objdump}" exist under this folder.
-      struct stat exec_st;
-      std::string exec_path;
-      exec_path = subdir + "/bin/" + TOOL_PREFIX + "as";
-      if (stat(exec_path.c_str(), &exec_st) != 0)
-        continue;
-      exec_path = subdir + "/bin/" + TOOL_PREFIX + "objcopy";
-      if (stat(exec_path.c_str(), &exec_st) != 0)
-        continue;
-      exec_path = subdir + "/bin/" + TOOL_PREFIX + "objdump";
-      if (stat(exec_path.c_str(), &exec_st) != 0)
-        continue;
-
-      std::string suffix = subdir.substr(eabi + strlen(TOOL_PREFIX));
-      double version = strtod(suffix.c_str(), nullptr);
-      if (version > maxversion) {
-        maxversion = version;
-        founddir = subdir;
-      }
-    }
-  }
-  closedir(dir);
-  bool found = founddir != "";
-  if (!found) {
-    return "";      // Use path.
-  }
-
-  return founddir + "/bin/";
-}
-
 void dump(std::vector<uint8_t>& code, const char* testname) {
   // This will only work on the host.  There is no as, objcopy or objdump on the
   // device.
@@ -155,7 +72,7 @@
 
   if (!results_ok) {
     setup_results();
-    toolsdir = GetAndroidToolsDir();
+    toolsdir = CommonRuntimeTest::GetAndroidTargetToolsDir(kThumb2);
     SetAndroidData();
     results_ok = true;
   }
@@ -187,19 +104,18 @@
   char cmd[1024];
 
   // Assemble the .S
-  snprintf(cmd, sizeof(cmd), "%s%sas %s -o %s.o", toolsdir.c_str(), TOOL_PREFIX, filename, filename);
+  snprintf(cmd, sizeof(cmd), "%sas %s -o %s.o", toolsdir.c_str(), filename, filename);
   system(cmd);
 
   // Remove the $d symbols to prevent the disassembler dumping the instructions
   // as .word
-  snprintf(cmd, sizeof(cmd), "%s%sobjcopy -N '$d' %s.o %s.oo", toolsdir.c_str(), TOOL_PREFIX,
-    filename, filename);
+  snprintf(cmd, sizeof(cmd), "%sobjcopy -N '$d' %s.o %s.oo", toolsdir.c_str(), filename, filename);
   system(cmd);
 
   // Disassemble.
 
-  snprintf(cmd, sizeof(cmd), "%s%sobjdump -d %s.oo | grep '^  *[0-9a-f][0-9a-f]*:'",
-    toolsdir.c_str(), TOOL_PREFIX, filename);
+  snprintf(cmd, sizeof(cmd), "%sobjdump -d %s.oo | grep '^  *[0-9a-f][0-9a-f]*:'",
+    toolsdir.c_str(), filename);
   if (kPrintResults) {
     // Print the results only, don't check. This is used to generate new output for inserting
     // into the .inc file.
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 7e32b43..70b4213 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -23,6 +23,7 @@
 #include <iostream>
 #include <sstream>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #if defined(__linux__) && defined(__arm__)
@@ -1102,7 +1103,7 @@
         return false;
       }
     } else if (image_) {
-      image_classes_.reset(new std::set<std::string>);
+      image_classes_.reset(new std::unordered_set<std::string>);
     }
     // If --compiled-classes was specified, calculate the full list of classes to compile in the
     // image.
@@ -1615,20 +1616,22 @@
   }
 
   // Reads the class names (java.lang.Object) and returns a set of descriptors (Ljava/lang/Object;)
-  static std::set<std::string>* ReadImageClassesFromFile(const char* image_classes_filename) {
+  static std::unordered_set<std::string>* ReadImageClassesFromFile(
+      const char* image_classes_filename) {
     std::unique_ptr<std::ifstream> image_classes_file(new std::ifstream(image_classes_filename,
                                                                         std::ifstream::in));
     if (image_classes_file.get() == nullptr) {
       LOG(ERROR) << "Failed to open image classes file " << image_classes_filename;
       return nullptr;
     }
-    std::unique_ptr<std::set<std::string>> result(ReadImageClasses(*image_classes_file));
+    std::unique_ptr<std::unordered_set<std::string>> result(ReadImageClasses(*image_classes_file));
     image_classes_file->close();
     return result.release();
   }
 
-  static std::set<std::string>* ReadImageClasses(std::istream& image_classes_stream) {
-    std::unique_ptr<std::set<std::string>> image_classes(new std::set<std::string>);
+  static std::unordered_set<std::string>* ReadImageClasses(std::istream& image_classes_stream) {
+    std::unique_ptr<std::unordered_set<std::string>> image_classes(
+        new std::unordered_set<std::string>);
     while (image_classes_stream.good()) {
       std::string dot;
       std::getline(image_classes_stream, dot);
@@ -1642,9 +1645,10 @@
   }
 
   // Reads the class names (java.lang.Object) and returns a set of descriptors (Ljava/lang/Object;)
-  static std::set<std::string>* ReadImageClassesFromZip(const char* zip_filename,
-                                                        const char* image_classes_filename,
-                                                        std::string* error_msg) {
+  static std::unordered_set<std::string>* ReadImageClassesFromZip(
+      const char* zip_filename,
+      const char* image_classes_filename,
+      std::string* error_msg) {
     std::unique_ptr<ZipArchive> zip_archive(ZipArchive::Open(zip_filename, error_msg));
     if (zip_archive.get() == nullptr) {
       return nullptr;
@@ -1720,8 +1724,8 @@
   const char* image_classes_filename_;
   const char* compiled_classes_zip_filename_;
   const char* compiled_classes_filename_;
-  std::unique_ptr<std::set<std::string>> image_classes_;
-  std::unique_ptr<std::set<std::string>> compiled_classes_;
+  std::unique_ptr<std::unordered_set<std::string>> image_classes_;
+  std::unique_ptr<std::unordered_set<std::string>> compiled_classes_;
   bool image_;
   std::unique_ptr<ImageWriter> image_writer_;
   bool is_host_;
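
The std::set to std::unordered_set migration trades sorted iteration for
average O(1) membership tests, which is all that lookups in the style of
IsClassToCompile need. A minimal sketch of the pattern (invented data):

#include <cstdio>
#include <string>
#include <unordered_set>

int main() {
  // Hash-set membership is O(1) on average versus O(log n) for std::set.
  std::unordered_set<std::string> classes_to_compile = {
      "Ljava/lang/Object;", "Ljava/lang/String;"};

  const char* descriptor = "Ljava/lang/String;";
  bool compile =
      classes_to_compile.find(descriptor) != classes_to_compile.end();
  std::printf("%s -> %s\n", descriptor, compile ? "compile" : "skip");
  return 0;
}
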
diff --git a/runtime/base/macros.h b/runtime/base/macros.h
index 6c33232..c00ae78 100644
--- a/runtime/base/macros.h
+++ b/runtime/base/macros.h
@@ -46,6 +46,11 @@
 #define ART_FRIEND_TEST(test_set_name, individual_test)\
 friend class test_set_name##_##individual_test##_Test
 
+// Declare a friend relationship in a class with a typed test.
+#define ART_FRIEND_TYPED_TEST(test_set_name, individual_test)\
+template<typename T> ART_FRIEND_TEST(test_set_name, individual_test)
+
 // DISALLOW_COPY_AND_ASSIGN disallows the copy and operator= functions. It goes in the private:
 // declarations in a class.
 #if !defined(DISALLOW_COPY_AND_ASSIGN)
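
For a typed test, gtest generates a class template rather than a plain class
(approximately MySuite_MyTest_Test<T> for TYPED_TEST(MySuite, MyTest)), so the
friend declaration must itself be templated, which is all the new macro adds.
An illustrative expansion (hypothetical suite and class names):

class Widget {
 private:
  int secret_ = 42;
  // ART_FRIEND_TYPED_TEST(MySuite, MyTest) expands, via ART_FRIEND_TEST, to:
  template <typename T> friend class MySuite_MyTest_Test;
};
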
diff --git a/runtime/common_runtime_test.cc b/runtime/common_runtime_test.cc
index 60b7fa2..e17b885 100644
--- a/runtime/common_runtime_test.cc
+++ b/runtime/common_runtime_test.cc
@@ -16,6 +16,7 @@
 
 #include "common_runtime_test.h"
 
+#include <cstdio>
 #include <dirent.h>
 #include <dlfcn.h>
 #include <fcntl.h>
@@ -188,6 +189,82 @@
   }
 }
 
+// Helper: finds the directory with the following format:
+// ${ANDROID_BUILD_TOP}/${subdir1}/${subdir2}-${version}/${subdir3}/bin/
+static std::string GetAndroidToolsDir(const std::string& subdir1,
+                                      const std::string& subdir2,
+                                      const std::string& subdir3) {
+  std::string root;
+  const char* android_build_top = getenv("ANDROID_BUILD_TOP");
+  if (android_build_top != nullptr) {
+    root = android_build_top;
+  } else {
+    // Not set by the build server, so default to the current directory.
+    char* cwd = getcwd(nullptr, 0);
+    setenv("ANDROID_BUILD_TOP", cwd, 1);
+    root = cwd;
+    free(cwd);
+  }
+
+  std::string toolsdir = root + "/" + subdir1;
+  std::string founddir;
+  DIR* dir;
+  if ((dir = opendir(toolsdir.c_str())) != nullptr) {
+    float maxversion = 0;
+    struct dirent* entry;
+    while ((entry = readdir(dir)) != nullptr) {
+      std::string format = subdir2 + "-%f";
+      float version;
+      if (std::sscanf(entry->d_name, format.c_str(), &version) == 1) {
+        if (version > maxversion) {
+          maxversion = version;
+          founddir = toolsdir + "/" + entry->d_name + "/" + subdir3 + "/bin/";
+        }
+      }
+    }
+    closedir(dir);
+  }
+
+  if (founddir.empty()) {
+    ADD_FAILURE() << "Can not find Android tools directory.";
+  }
+  return founddir;
+}
+
+std::string CommonRuntimeTest::GetAndroidHostToolsDir() {
+  return GetAndroidToolsDir("prebuilts/gcc/linux-x86/host",
+                            "x86_64-linux-glibc2.15",
+                            "x86_64-linux");
+}
+
+std::string CommonRuntimeTest::GetAndroidTargetToolsDir(InstructionSet isa) {
+  switch (isa) {
+    case kArm:
+    case kThumb2:
+      return GetAndroidToolsDir("prebuilts/gcc/linux-x86/arm",
+                                "arm-linux-androideabi",
+                                "arm-linux-androideabi");
+    case kArm64:
+      return GetAndroidToolsDir("prebuilts/gcc/linux-x86/aarch64",
+                                "aarch64-linux-android",
+                                "aarch64-linux-android");
+    case kX86:
+    case kX86_64:
+      return GetAndroidToolsDir("prebuilts/gcc/linux-x86/x86",
+                                "x86_64-linux-android",
+                                "x86_64-linux-android");
+    case kMips:
+    case kMips64:
+      return GetAndroidToolsDir("prebuilts/gcc/linux-x86/mips",
+                                "mips64el-linux-android",
+                                "mips64el-linux-android");
+    case kNone:
+      break;
+  }
+  ADD_FAILURE() << "Invalid isa " << isa;
+  return "";
+}
+
 std::string CommonRuntimeTest::GetCoreArtLocation() {
   return GetCoreFileLocation("art");
 }
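
The directory scan relies on a small sscanf() trick: the literal part of the
entry name is baked into the format string and "%f" parses the trailing
version number, so non-matching entries simply fail the scan. A standalone
sketch with hard-coded entries in place of readdir() results:

#include <cstdio>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> entries = {
      "arm-linux-androideabi-4.8", "arm-linux-androideabi-4.9", "README"};
  const std::string format = std::string("arm-linux-androideabi") + "-%f";

  float maxversion = 0;
  std::string best;
  for (const std::string& name : entries) {
    float version;
    // sscanf returns 1 only if the literal prefix matched and a float was read.
    if (std::sscanf(name.c_str(), format.c_str(), &version) == 1 &&
        version > maxversion) {
      maxversion = version;
      best = name;
    }
  }
  std::printf("picked %s\n", best.c_str());  // arm-linux-androideabi-4.9
  return 0;
}

One caveat of "%f" worth knowing: it compares versions numerically, so a
hypothetical 4.10 would sort below 4.9.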
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index 5fbc2ee..9917378 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -22,6 +22,7 @@
 
 #include <string>
 
+#include "arch/instruction_set.h"
 #include "base/mutex.h"
 #include "globals.h"
 #include "os.h"
@@ -79,6 +80,12 @@
   // Gets the path of the libcore dex file.
   static std::string GetLibCoreDexFileName();
 
+  // Returns the bin directory which contains the host's prebuilt tools.
+  static std::string GetAndroidHostToolsDir();
+
+  // Returns the bin directory which contains the target's prebuilt tools.
+  static std::string GetAndroidTargetToolsDir(InstructionSet isa);
+
  protected:
   static bool IsHost() {
     return !kIsTargetBuild;
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index a4a9d80..7353c83 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -18,6 +18,7 @@
 
 #include <memory>
 
+#include "gc/accounting/heap_bitmap-inl.h"
 #include "gc/accounting/space_bitmap-inl.h"
 #include "base/logging.h"
 #include "base/mutex-inl.h"
@@ -127,8 +128,18 @@
     LOG(WARNING) << "Large object allocation failed: " << error_msg;
     return NULL;
   }
+  mirror::Object* const obj = reinterpret_cast<mirror::Object*>(mem_map->Begin());
+  if (kIsDebugBuild) {
+    ReaderMutexLock mu2(Thread::Current(), *Locks::heap_bitmap_lock_);
+    auto* heap = Runtime::Current()->GetHeap();
+    auto* live_bitmap = heap->GetLiveBitmap();
+    auto* space_bitmap = live_bitmap->GetContinuousSpaceBitmap(obj);
+    CHECK(space_bitmap == nullptr) << obj << " overlaps with bitmap " << *space_bitmap;
+    auto* obj_end = reinterpret_cast<mirror::Object*>(mem_map->End());
+    space_bitmap = live_bitmap->GetContinuousSpaceBitmap(obj_end - 1);
+    CHECK(space_bitmap == nullptr) << obj_end << " overlaps with bitmap " << *space_bitmap;
+  }
   MutexLock mu(self, lock_);
-  mirror::Object* obj = reinterpret_cast<mirror::Object*>(mem_map->Begin());
   large_objects_.push_back(obj);
   mem_maps_.Put(obj, mem_map);
   const size_t allocation_size = mem_map->BaseSize();
diff --git a/runtime/prebuilt_tools_test.cc b/runtime/prebuilt_tools_test.cc
new file mode 100644
index 0000000..453c0da
--- /dev/null
+++ b/runtime/prebuilt_tools_test.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common_runtime_test.h"
+
+#include <cstdio>
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+// Run the tests only on host.
+#ifndef HAVE_ANDROID_OS
+
+class PrebuiltToolsTest : public CommonRuntimeTest {
+};
+
+static void CheckToolsExist(const std::string& tools_dir) {
+  const char* tools[] { "as", "objcopy", "objdump" };  // NOLINT
+  for (const char* tool : tools) {
+    struct stat exec_st;
+    std::string exec_path = tools_dir + tool;
+    if (stat(exec_path.c_str(), &exec_st) != 0) {
+      ADD_FAILURE() << "Can not find " << tool << " in " << tools_dir;
+    }
+  }
+}
+
+TEST_F(PrebuiltToolsTest, CheckHostTools) {
+  std::string tools_dir = GetAndroidHostToolsDir();
+  if (tools_dir.empty()) {
+    ADD_FAILURE() << "Can not find Android tools directory for host";
+  } else {
+    CheckToolsExist(tools_dir);
+  }
+}
+
+TEST_F(PrebuiltToolsTest, CheckTargetTools) {
+  InstructionSet isas[] = { kArm, kArm64, kThumb2, kX86, kX86_64, kMips, kMips64 };  // NOLINT
+  for (InstructionSet isa : isas) {
+    std::string tools_dir = GetAndroidTargetToolsDir(isa);
+    if (tools_dir.empty()) {
+      ADD_FAILURE() << "Can not find Android tools directory for " << isa;
+    } else {
+      CheckToolsExist(tools_dir);
+    }
+  }
+}
+
+#endif  // HAVE_ANDROID_OS
+
+}  // namespace art
diff --git a/test/458-checker-instruction-simplification/src/Main.java b/test/458-checker-instruction-simplification/src/Main.java
index 3cbcebb..65be6cb 100644
--- a/test/458-checker-instruction-simplification/src/Main.java
+++ b/test/458-checker-instruction-simplification/src/Main.java
@@ -16,6 +16,12 @@
 
 public class Main {
 
+  public static void assertBooleanEquals(boolean expected, boolean result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
   public static void assertIntEquals(int expected, int result) {
     if (expected != result) {
       throw new Error("Expected: " + expected + ", found: " + result);
@@ -41,7 +47,7 @@
   // CHECK-START: long Main.Add0(long) instruction_simplifier (after)
   // CHECK-DAG:     [[Arg:j\d+]]     ParameterValue
   // CHECK-DAG:                      Return [ [[Arg]] ]
-  //
+
   // CHECK-START: long Main.Add0(long) instruction_simplifier (after)
   // CHECK-NOT:                        Add
 
@@ -760,6 +766,147 @@
     return res;
   }
 
+  // CHECK-START: int Main.EqualTrueRhs(boolean) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const1:i\d+]]   IntConstant 1
+  // CHECK-DAG:     [[Cond:z\d+]]     Equal [ [[Arg]] [[Const1]] ]
+  // CHECK-DAG:                       If [ [[Cond]] ]
+
+  // CHECK-START: int Main.EqualTrueRhs(boolean) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:                       If [ [[Arg]] ]
+
+  public static int EqualTrueRhs(boolean arg) {
+    return (arg != true) ? 3 : 5;
+  }
+
+  // CHECK-START: int Main.EqualTrueLhs(boolean) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const1:i\d+]]   IntConstant 1
+  // CHECK-DAG:     [[Cond:z\d+]]     Equal [ [[Const1]] [[Arg]] ]
+  // CHECK-DAG:                       If [ [[Cond]] ]
+
+  // CHECK-START: int Main.EqualTrueLhs(boolean) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:                       If [ [[Arg]] ]
+
+  public static int EqualTrueLhs(boolean arg) {
+    return (true != arg) ? 3 : 5;
+  }
+
+  // CHECK-START: int Main.EqualFalseRhs(boolean) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const0:i\d+]]   IntConstant 0
+  // CHECK-DAG:     [[Cond:z\d+]]     Equal [ [[Arg]] [[Const0]] ]
+  // CHECK-DAG:                       If [ [[Cond]] ]
+
+  // CHECK-START: int Main.EqualFalseRhs(boolean) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[NotArg:z\d+]]   BooleanNot [ [[Arg]] ]
+  // CHECK-DAG:                       If [ [[NotArg]] ]
+
+  public static int EqualFalseRhs(boolean arg) {
+    return (arg != false) ? 3 : 5;
+  }
+
+  // CHECK-START: int Main.EqualFalseLhs(boolean) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const0:i\d+]]   IntConstant 0
+  // CHECK-DAG:     [[Cond:z\d+]]     Equal [ [[Const0]] [[Arg]] ]
+  // CHECK-DAG:                       If [ [[Cond]] ]
+
+  // CHECK-START: int Main.EqualFalseLhs(boolean) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[NotArg:z\d+]]   BooleanNot [ [[Arg]] ]
+  // CHECK-DAG:                       If [ [[NotArg]] ]
+
+  public static int EqualFalseLhs(boolean arg) {
+    return (false != arg) ? 3 : 5;
+  }
+
+  // CHECK-START: int Main.NotEqualTrueRhs(boolean) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const1:i\d+]]   IntConstant 1
+  // CHECK-DAG:     [[Cond:z\d+]]     NotEqual [ [[Arg]] [[Const1]] ]
+  // CHECK-DAG:                       If [ [[Cond]] ]
+
+  // CHECK-START: int Main.NotEqualTrueRhs(boolean) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[NotArg:z\d+]]   BooleanNot [ [[Arg]] ]
+  // CHECK-DAG:                       If [ [[NotArg]] ]
+
+  public static int NotEqualTrueRhs(boolean arg) {
+    return (arg == true) ? 3 : 5;
+  }
+
+  // CHECK-START: int Main.NotEqualTrueLhs(boolean) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const1:i\d+]]   IntConstant 1
+  // CHECK-DAG:     [[Cond:z\d+]]     NotEqual [ [[Const1]] [[Arg]] ]
+  // CHECK-DAG:                       If [ [[Cond]] ]
+
+  // CHECK-START: int Main.NotEqualTrueLhs(boolean) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[NotArg:z\d+]]   BooleanNot [ [[Arg]] ]
+  // CHECK-DAG:                       If [ [[NotArg]] ]
+
+  public static int NotEqualTrueLhs(boolean arg) {
+    return (true == arg) ? 3 : 5;
+  }
+
+  // CHECK-START: int Main.NotEqualFalseRhs(boolean) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const0:i\d+]]   IntConstant 0
+  // CHECK-DAG:     [[Cond:z\d+]]     NotEqual [ [[Arg]] [[Const0]] ]
+  // CHECK-DAG:                       If [ [[Cond]] ]
+
+  // CHECK-START: int Main.NotEqualFalseRhs(boolean) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:                       If [ [[Arg]] ]
+
+  public static int NotEqualFalseRhs(boolean arg) {
+    return (arg == false) ? 3 : 5;
+  }
+
+  // CHECK-START: int Main.NotEqualFalseLhs(boolean) instruction_simplifier (before)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:     [[Const0:i\d+]]   IntConstant 0
+  // CHECK-DAG:     [[Cond:z\d+]]     NotEqual [ [[Const0]] [[Arg]] ]
+  // CHECK-DAG:                       If [ [[Cond]] ]
+
+  // CHECK-START: int Main.NotEqualFalseLhs(boolean) instruction_simplifier (after)
+  // CHECK-DAG:     [[Arg:z\d+]]      ParameterValue
+  // CHECK-DAG:                       If [ [[Arg]] ]
+
+  public static int NotEqualFalseLhs(boolean arg) {
+    return (false == arg) ? 3 : 5;
+  }
+
+  /*
+   * Test simplification of double Boolean negation. Note that sometimes
+   * both negations can be removed but we only expect the simplifier to
+   * remove the second.
+   */
+
+  // CHECK-START: boolean Main.NotNotBool(boolean) instruction_simplifier_after_types (before)
+  // CHECK-DAG:     [[Arg:z\d+]]       ParameterValue
+  // CHECK-DAG:     [[NotArg:z\d+]]    BooleanNot [ [[Arg]] ]
+  // CHECK-DAG:     [[NotNotArg:z\d+]] BooleanNot [ [[NotArg]] ]
+  // CHECK-DAG:                        Return [ [[NotNotArg]] ]
+
+  // CHECK-START: boolean Main.NotNotBool(boolean) instruction_simplifier_after_types (after)
+  // CHECK-DAG:     [[Arg:z\d+]]       ParameterValue
+  // CHECK-DAG:                        BooleanNot [ [[Arg]] ]
+  // CHECK-DAG:                        Return [ [[Arg]] ]
+
+  // CHECK-START: boolean Main.NotNotBool(boolean) instruction_simplifier_after_types (after)
+  // CHECK:                            BooleanNot
+  // CHECK-NOT:                        BooleanNot
+
+  public static boolean NotNotBool(boolean arg) {
+    return !(!arg);
+  }
+
   public static void main(String[] args) {
     int arg = 123456;
 
@@ -794,5 +941,16 @@
     assertIntEquals(SubNeg1(arg, arg + 1), -(arg + arg + 1));
     assertIntEquals(SubNeg2(arg, arg + 1), -(arg + arg + 1));
     assertLongEquals(SubNeg3(arg, arg + 1), -(2 * arg + 1));
+
+    assertIntEquals(EqualTrueRhs(true), 5);
+    assertIntEquals(EqualTrueLhs(true), 5);
+    assertIntEquals(EqualFalseRhs(true), 3);
+    assertIntEquals(EqualFalseLhs(true), 3);
+    assertIntEquals(NotEqualTrueRhs(true), 3);
+    assertIntEquals(NotEqualTrueLhs(true), 3);
+    assertIntEquals(NotEqualFalseRhs(true), 5);
+    assertIntEquals(NotEqualFalseLhs(true), 5);
+    assertBooleanEquals(NotNotBool(true), true);
+    assertBooleanEquals(NotNotBool(false), false);
   }
 }
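
The new tests pin down a small rewrite table: a comparison of a boolean b
against a constant folds to either b or its negation, so no Equal/NotEqual
node survives. A standalone sketch of that table (not the simplifier's actual
code):

#include <cstdio>

enum class Op { kEqual, kNotEqual };

// Returns true when cmp(b, constant) folds to !b rather than b.
bool FoldsToNegation(Op op, bool constant) {
  return (op == Op::kEqual) ? !constant : constant;
}

int main() {
  std::printf("Equal(b, 1)    -> %s\n", FoldsToNegation(Op::kEqual, true) ? "!b" : "b");
  std::printf("Equal(b, 0)    -> %s\n", FoldsToNegation(Op::kEqual, false) ? "!b" : "b");
  std::printf("NotEqual(b, 1) -> %s\n", FoldsToNegation(Op::kNotEqual, true) ? "!b" : "b");
  std::printf("NotEqual(b, 0) -> %s\n", FoldsToNegation(Op::kNotEqual, false) ? "!b" : "b");
  return 0;
}

The four outputs match the CHECK-START expectations above: Equal(b, 1) and
NotEqual(b, 0) reduce to b itself, while Equal(b, 0) and NotEqual(b, 1)
become BooleanNot(b).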
diff --git a/test/463-checker-boolean-simplifier/src/Main.java b/test/463-checker-boolean-simplifier/src/Main.java
index efe0d3f..3daf693 100644
--- a/test/463-checker-boolean-simplifier/src/Main.java
+++ b/test/463-checker-boolean-simplifier/src/Main.java
@@ -27,16 +27,15 @@
   }
 
   /*
-   * Elementary test negating a boolean. Verifies that the condition is replaced,
-   * blocks merged and empty branches removed.
+   * Elementary test negating a boolean. Verifies that blocks are merged and
+   * empty branches removed.
    */
 
   // CHECK-START: boolean Main.BooleanNot(boolean) boolean_simplifier (before)
   // CHECK-DAG:     [[Param:z\d+]]    ParameterValue
   // CHECK-DAG:     [[Const0:i\d+]]   IntConstant 0
   // CHECK-DAG:     [[Const1:i\d+]]   IntConstant 1
-  // CHECK-DAG:     [[NotEq:z\d+]]    NotEqual [ [[Param]] [[Const0]] ]
-  // CHECK-DAG:                       If [ [[NotEq]] ]
+  // CHECK-DAG:                       If [ [[Param]] ]
   // CHECK-DAG:     [[Phi:i\d+]]      Phi [ [[Const1]] [[Const0]] ]
   // CHECK-DAG:                       Return [ [[Phi]] ]
 
@@ -49,11 +48,10 @@
   // CHECK-START: boolean Main.BooleanNot(boolean) boolean_simplifier (after)
   // CHECK-DAG:     [[Param:z\d+]]    ParameterValue
   // CHECK-DAG:     [[Const0:i\d+]]   IntConstant 0
-  // CHECK-DAG:     [[Eq:z\d+]]       Equal [ [[Param]] [[Const0]] ]
-  // CHECK-DAG:                       Return [ [[Eq]] ]
+  // CHECK-DAG:     [[NotParam:z\d+]] BooleanNot [ [[Param]] ]
+  // CHECK-DAG:                       Return [ [[NotParam]] ]
 
   // CHECK-START: boolean Main.BooleanNot(boolean) boolean_simplifier (after)
-  // CHECK-NOT:                       NotEqual
   // CHECK-NOT:                       If
   // CHECK-NOT:                       Phi
 
@@ -115,6 +113,9 @@
   // CHECK-DAG:     [[Cond:z\d+]]     LessThan [ [[ParamX]] [[ParamY]] ]
   // CHECK-DAG:                       Return [ [[Cond]] ]
 
+  // CHECK-START: boolean Main.LessThan(int, int) boolean_simplifier (after)
+  // CHECK-NOT:                       GreaterThanOrEqual
+
   public static boolean LessThan(int x, int y) {
     return (x < y) ? true : false;
   }
diff --git a/test/474-checker-boolean-input/src/Main.java b/test/474-checker-boolean-input/src/Main.java
index 91e8d4f..1ebe14e 100644
--- a/test/474-checker-boolean-input/src/Main.java
+++ b/test/474-checker-boolean-input/src/Main.java
@@ -23,14 +23,14 @@
   }
 
   /*
-   * Test that zero/one constants are accepted as boolean inputs.
+   * Test that zero/one constants are accepted as Boolean inputs.
    */
 
-  // CHECK-START: boolean Main.TestIntAsBoolean() inliner (before)
+  // CHECK-START: boolean Main.TestConstAsBoolean() inliner (before)
   // CHECK-DAG:     [[Invoke:z\d+]]  InvokeStaticOrDirect
   // CHECK-DAG:                      BooleanNot [ [[Invoke]] ]
 
-  // CHECK-START: boolean Main.TestIntAsBoolean() inliner (after)
+  // CHECK-START: boolean Main.TestConstAsBoolean() inliner (after)
   // CHECK-DAG:     [[Const:i\d+]]   IntConstant 1
   // CHECK-DAG:                      BooleanNot [ [[Const]] ]
 
@@ -38,13 +38,13 @@
     return true;
   }
 
-  public static boolean TestIntAsBoolean() {
+  public static boolean TestConstAsBoolean() {
     return InlineConst() != true ? true : false;
   }
 
   /*
-   * Test that integer Phis are accepted as boolean inputs until we implement
-   * a suitable type analysis.
+   * Test that integer Phis are accepted as Boolean inputs until
+   * we implement a suitable type analysis.
    */
 
   // CHECK-START: boolean Main.TestPhiAsBoolean(int) inliner (before)
@@ -66,10 +66,80 @@
     return InlinePhi(x) != true ? true : false;
   }
 
+  /*
+   * Test that integer And is accepted as a Boolean input until
+   * we implement a suitable type analysis.
+   */
+
+  // CHECK-START: boolean Main.TestAndAsBoolean(boolean, boolean) inliner (before)
+  // CHECK-DAG:     [[Invoke:z\d+]]  InvokeStaticOrDirect
+  // CHECK-DAG:                      BooleanNot [ [[Invoke]] ]
+
+  // CHECK-START: boolean Main.TestAndAsBoolean(boolean, boolean) inliner (after)
+  // CHECK-DAG:     [[And:i\d+]]     And
+  // CHECK-DAG:                      BooleanNot [ [[And]] ]
+
+  public static boolean InlineAnd(boolean x, boolean y) {
+    return x & y;
+  }
+
+  public static boolean TestAndAsBoolean(boolean x, boolean y) {
+    return InlineAnd(x, y) != true ? true : false;
+  }
+
+  /*
+   * Test that integer Or is accepted as a Boolean input until
+   * we implement a suitable type analysis.
+   */
+
+  // CHECK-START: boolean Main.TestOrAsBoolean(boolean, boolean) inliner (before)
+  // CHECK-DAG:     [[Invoke:z\d+]]  InvokeStaticOrDirect
+  // CHECK-DAG:                      BooleanNot [ [[Invoke]] ]
+
+  // CHECK-START: boolean Main.TestOrAsBoolean(boolean, boolean) inliner (after)
+  // CHECK-DAG:     [[Or:i\d+]]      Or
+  // CHECK-DAG:                      BooleanNot [ [[Or]] ]
+
+  public static boolean InlineOr(boolean x, boolean y) {
+    return x | y;
+  }
+
+  public static boolean TestOrAsBoolean(boolean x, boolean y) {
+    return InlineOr(x, y) != true ? true : false;
+  }
+
+  /*
+   * Test that integer Xor is accepted as a Boolean input until
+   * we implement a suitable type analysis.
+   */
+
+  // CHECK-START: boolean Main.TestXorAsBoolean(boolean, boolean) inliner (before)
+  // CHECK-DAG:     [[Invoke:z\d+]]  InvokeStaticOrDirect
+  // CHECK-DAG:                      BooleanNot [ [[Invoke]] ]
+
+  // CHECK-START: boolean Main.TestXorAsBoolean(boolean, boolean) inliner (after)
+  // CHECK-DAG:     [[Xor:i\d+]]     Xor
+  // CHECK-DAG:                      BooleanNot [ [[Xor]] ]
+
+  public static boolean InlineXor(boolean x, boolean y) {
+    return x ^ y;
+  }
+
+  public static boolean TestXorAsBoolean(boolean x, boolean y) {
+    return InlineXor(x, y) != true ? true : false;
+  }
+
   public static void main(String[] args) {
     f1 = true;
     f2 = false;
+    assertBoolEquals(false, TestConstAsBoolean());
     assertBoolEquals(true, TestPhiAsBoolean(0));
     assertBoolEquals(false, TestPhiAsBoolean(42));
+    assertBoolEquals(true, TestAndAsBoolean(true, false));
+    assertBoolEquals(false, TestAndAsBoolean(true, true));
+    assertBoolEquals(true, TestOrAsBoolean(false, false));
+    assertBoolEquals(false, TestOrAsBoolean(true, true));
+    assertBoolEquals(true, TestXorAsBoolean(true, true));
+    assertBoolEquals(false, TestXorAsBoolean(true, false));
   }
 }
diff --git a/test/476-checker-ctor-memory-barrier/expected.txt b/test/476-checker-ctor-memory-barrier/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/476-checker-ctor-memory-barrier/expected.txt
diff --git a/test/476-checker-ctor-memory-barrier/info.txt b/test/476-checker-ctor-memory-barrier/info.txt
new file mode 100644
index 0000000..9bd311f
--- /dev/null
+++ b/test/476-checker-ctor-memory-barrier/info.txt
@@ -0,0 +1,2 @@
+Tests that we add memory barriers in constructors when needed (i.e. when the
+class has final fields).
diff --git a/test/476-checker-ctor-memory-barrier/src/Main.java b/test/476-checker-ctor-memory-barrier/src/Main.java
new file mode 100644
index 0000000..10aa2ab
--- /dev/null
+++ b/test/476-checker-ctor-memory-barrier/src/Main.java
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+class ClassWithoutFinals {
+  // CHECK-START: void ClassWithoutFinals.<init>() register (after)
+  // CHECK-NOT: MemoryBarrier {{StoreStore}}
+  public ClassWithoutFinals() {}
+}
+
+class ClassWithFinals {
+  public final int x;
+  public ClassWithFinals obj;
+
+  // CHECK-START: void ClassWithFinals.<init>(boolean) register (after)
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK-NOT: {{.*}}
+  // CHECK:     ReturnVoid
+  public ClassWithFinals(boolean cond) {
+    x = 0;
+    if (cond) {
+      // avoid inlining
+      throw new RuntimeException();
+    }
+  }
+
+  // CHECK-START: void ClassWithFinals.<init>() register (after)
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK-NOT: {{.*}}
+  // CHECK:     ReturnVoid
+  public ClassWithFinals() {
+    x = 0;
+  }
+
+  // CHECK-START: void ClassWithFinals.<init>(int) register (after)
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK-NOT: {{.*}}
+  // CHECK:     ReturnVoid
+  public ClassWithFinals(int x) {
+    // This should have two barriers:
+    //   - one for this constructor's own final field store, and
+    //   - one for the `new`, whose constructor should be inlined.
+    obj = new ClassWithFinals();
+    this.x = x;
+  }
+}
+
+class InheritFromClassWithFinals extends ClassWithFinals {
+  // CHECK-START: void InheritFromClassWithFinals.<init>() register (after)
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK-NOT: {{.*}}
+  // CHECK:     ReturnVoid
+
+  // CHECK-START: void InheritFromClassWithFinals.<init>() register (after)
+  // CHECK-NOT: InvokeStaticOrDirect
+  public InheritFromClassWithFinals() {
+    // Should inline the super constructor.
+  }
+
+  // CHECK-START: void InheritFromClassWithFinals.<init>(boolean) register (after)
+  // CHECK:     InvokeStaticOrDirect
+
+  // CHECK-START: void InheritFromClassWithFinals.<init>(boolean) register (after)
+  // CHECK-NOT: MemoryBarrier {{StoreStore}}
+  public InheritFromClassWithFinals(boolean cond) {
+    super(cond);
+    // Should not inline the super constructor.
+  }
+}
+
+class HaveFinalsAndInheritFromClassWithFinals extends ClassWithFinals {
+  final int y;
+
+  // CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>() register (after)
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK-NOT: {{.*}}
+  // CHECK:     ReturnVoid
+
+  // CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>() register (after)
+  // CHECK-NOT: InvokeStaticOrDirect
+  public HaveFinalsAndInheritFromClassWithFinals() {
+    // Should inline the super constructor.
+    y = 0;
+  }
+
+  // CHECK-START: void HaveFinalsAndInheritFromClassWithFinals.<init>(boolean) register (after)
+  // CHECK:     InvokeStaticOrDirect
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK-NOT: {{.*}}
+  // CHECK:     ReturnVoid
+  public HaveFinalsAndInheritFromClassWithFinals(boolean cond) {
+    super(cond);
+    // Should not inline the super constructor.
+    y = 0;
+  }
+}
+
+public class Main {
+
+  // CHECK-START: ClassWithFinals Main.noInlineNoConstructorBarrier() register (after)
+  // CHECK:     InvokeStaticOrDirect
+
+  // CHECK-START: ClassWithFinals Main.noInlineNoConstructorBarrier() register (after)
+  // CHECK-NOT: MemoryBarrier {{StoreStore}}
+  public static ClassWithFinals noInlineNoConstructorBarrier() {
+    return new ClassWithFinals(false);
+  }
+
+  // CHECK-START: ClassWithFinals Main.inlineConstructorBarrier() register (after)
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK-NOT: {{.*}}
+  // CHECK:     Return
+
+  // CHECK-START: ClassWithFinals Main.inlineConstructorBarrier() register (after)
+  // CHECK-NOT: InvokeStaticOrDirect
+  public static ClassWithFinals inlineConstructorBarrier() {
+    return new ClassWithFinals();
+  }
+
+  // CHECK-START: InheritFromClassWithFinals Main.doubleInlineConstructorBarrier() register (after)
+  // CHECK:     MemoryBarrier {{StoreStore}}
+  // CHECK-NOT: {{.*}}
+  // CHECK:     Return
+
+  // CHECK-START: InheritFromClassWithFinals Main.doubleInlineConstructorBarrier() register (after)
+  // CHECK-NOT: InvokeStaticOrDirect
+  public static InheritFromClassWithFinals doubleInlineConstructorBarrier() {
+    return new InheritFromClassWithFinals();
+  }
+
+  public static void main(String[] args) {  }
+}
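
To spell out the ClassWithFinals(int) case above: after the inliner
substitutes the body of the no-arg constructor for the nested `new`, the
method contains two groups of final-field stores, and each group must be
followed by its own StoreStore barrier; that is why the checker expects two
MemoryBarrier instructions. A compilable analogue (hypothetical class names,
not part of this patch):

    class TwoBarrierSketch {
      static class Inner {
        final int a;
        Inner() { a = 0; }    // barrier #1, kept when this constructor is inlined
      }

      static class Outer {
        final int b;
        Inner obj;
        Outer(int b) {
          obj = new Inner();  // the inlined <init> contributes its barrier
          this.b = b;         // barrier #2 covers Outer's own final field
        }
      }
    }
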
diff --git a/test/477-checker-bound-type/expected.txt b/test/477-checker-bound-type/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/477-checker-bound-type/expected.txt
diff --git a/test/477-checker-bound-type/info.txt b/test/477-checker-bound-type/info.txt
new file mode 100644
index 0000000..68c774a
--- /dev/null
+++ b/test/477-checker-bound-type/info.txt
@@ -0,0 +1,3 @@
+Tests that we generate a bound type only if it has relevant users.
+It also tests a code generator regression in GenerateTestAndBranch, which
+did not take NullConstants into account.
diff --git a/test/477-checker-bound-type/src/Main.java b/test/477-checker-bound-type/src/Main.java
new file mode 100644
index 0000000..b30028d
--- /dev/null
+++ b/test/477-checker-bound-type/src/Main.java
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+public class Main {
+
+  // CHECK-START: java.lang.Object Main.boundTypeForIf(java.lang.Object) reference_type_propagation (after)
+  // CHECK:     BoundType
+  public static Object boundTypeForIf(Object a) {
+    if (a != null) {
+      return a.toString();
+    } else {
+      return null;
+    }
+  }
+
+  // CHECK-START: java.lang.Object Main.boundTypeForInstanceOf(java.lang.Object) reference_type_propagation (after)
+  // CHECK:     BoundType
+  public static Object boundTypeForInstanceOf(Object a) {
+    if (a instanceof Main) {
+      return (Main)a;
+    } else {
+      return null;
+    }
+  }
+
+  // CHECK-START: java.lang.Object Main.noBoundTypeForIf(java.lang.Object) reference_type_propagation (after)
+  // CHECK-NOT: BoundType
+  public static Object noBoundTypeForIf(Object a) {
+    if (a == null) {
+      return new Object();
+    } else {
+      return null;
+    }
+  }
+
+  // CHECK-START: java.lang.Object Main.noBoundTypeForInstanceOf(java.lang.Object) reference_type_propagation (after)
+  // CHECK-NOT: BoundType
+  public static Object noBoundTypeForInstanceOf(Object a) {
+    if (a instanceof Main) {
+      return new Object();
+    } else {
+      return null;
+    }
+  }
+
+  public static void main(String[] args) {  }
+}
diff --git a/tools/run-jdwp-tests.sh b/tools/run-jdwp-tests.sh
index 90c01f5..301708b 100755
--- a/tools/run-jdwp-tests.sh
+++ b/tools/run-jdwp-tests.sh
@@ -19,8 +19,8 @@
   exit 1
 fi
 
-if [[ $ANDROID_SERIAL == 03a79ae90ae5889b ]] || [[ $ANDROID_SERIAL == HT4CTJT03670 ]] || [[ $ANDROID_SERIAL == HT49CJT00070 ]]; then
-  echo "Not run because of localhost failures. Investigating."
+if [[ $ANDROID_SERIAL == HT4CTJT03670 ]] || [[ $ANDROID_SERIAL == HT49CJT00070 ]]; then
+  echo "Not running on buildbot because of failures on volantis. Investigating."
   exit 0
 fi