Merge "Allocate large enough space bitmaps for malloc spaces."
diff --git a/Android.mk b/Android.mk
index 3324458..a63c29b 100644
--- a/Android.mk
+++ b/Android.mk
@@ -401,15 +401,27 @@
 .PHONY: dump-oat-core-target
 ifeq ($(ART_BUILD_TARGET),true)
 dump-oat-core-target: $(TARGET_CORE_IMG_OUT) $(OATDUMP)
-	$(OATDUMP) --image=$(TARGET_CORE_IMG_OUT) --output=$(ART_DUMP_OAT_PATH)/core.target.oatdump.txt
+	$(OATDUMP) --image=$(TARGET_CORE_IMG_LOCATION) --output=$(ART_DUMP_OAT_PATH)/core.target.oatdump.txt --instruction-set=$(TARGET_ARCH)
 	@echo Output in $(ART_DUMP_OAT_PATH)/core.target.oatdump.txt
 endif
 
-.PHONY: dump-oat-boot
+.PHONY: dump-oat-boot-$(TARGET_ARCH)
 ifeq ($(ART_BUILD_TARGET_NDEBUG),true)
-dump-oat-boot: $(DEFAULT_DEX_PREOPT_BUILT_IMAGE) $(OATDUMP)
-	$(OATDUMP) --image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE) --output=$(ART_DUMP_OAT_PATH)/boot.oatdump.txt
-	@echo Output in $(ART_DUMP_OAT_PATH)/boot.oatdump.txt
+dump-oat-boot-$(TARGET_ARCH): $(DEFAULT_DEX_PREOPT_BUILT_IMAGE_FILENAME) $(OATDUMP)
+	$(OATDUMP) --image=$(DEFAULT_DEX_PREOPT_BUILT_IMAGE_LOCATION) --output=$(ART_DUMP_OAT_PATH)/boot.$(TARGET_ARCH).oatdump.txt --instruction-set=$(TARGET_ARCH)
+	@echo Output in $(ART_DUMP_OAT_PATH)/boot.$(TARGET_ARCH).oatdump.txt
+endif
+
+ifdef TARGET_2ND_ARCH
+dump-oat-boot-$(TARGET_2ND_ARCH): $(2ND_DEFAULT_DEX_PREOPT_BUILT_IMAGE_FILENAME) $(OATDUMP)
+	$(OATDUMP) --image=$(2ND_DEFAULT_DEX_PREOPT_BUILT_IMAGE_LOCATION) --output=$(ART_DUMP_OAT_PATH)/boot.$(TARGET_2ND_ARCH).oatdump.txt --instruction-set=$(TARGET_2ND_ARCH)
+	@echo Output in $(ART_DUMP_OAT_PATH)/boot.$(TARGET_2ND_ARCH).oatdump.txt
+endif
+
+.PHONY: dump-oat-boot
+dump-oat-boot: dump-oat-boot-$(TARGET_ARCH)
+ifdef TARGET_2ND_ARCH
+dump-oat-boot: dump-oat-boot-$(TARGET_2ND_ARCH)
 endif
 
 .PHONY: dump-oat-Calculator
diff --git a/build/Android.common.mk b/build/Android.common.mk
index ae54efb..83c536f 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -402,5 +402,6 @@
 endif
 
 HOST_CORE_IMG_LOCATION := $(HOST_OUT_JAVA_LIBRARIES)/core.art
+TARGET_CORE_IMG_LOCATION := $(ART_TEST_OUT)/core.art
 
 endif # ANDROID_COMMON_MK
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 765216b..9f1d0f1 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -81,8 +81,11 @@
 	compiler/optimizing/find_loops_test.cc \
 	compiler/optimizing/linearize_test.cc \
 	compiler/optimizing/liveness_test.cc \
+	compiler/optimizing/live_interval_test.cc \
 	compiler/optimizing/live_ranges_test.cc \
+	compiler/optimizing/parallel_move_test.cc \
 	compiler/optimizing/pretty_printer_test.cc \
+	compiler/optimizing/register_allocator_test.cc \
 	compiler/optimizing/ssa_test.cc \
 	compiler/output_stream_test.cc \
 	compiler/utils/arena_allocator_test.cc \
diff --git a/compiler/Android.mk b/compiler/Android.mk
index cb9e41a..f297213 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -61,7 +61,6 @@
 	dex/mir_optimization.cc \
 	dex/bb_optimizations.cc \
 	dex/pass_driver_me.cc \
-	dex/bit_vector_block_iterator.cc \
 	dex/frontend.cc \
 	dex/mir_graph.cc \
 	dex/mir_analysis.cc \
@@ -83,8 +82,11 @@
 	optimizing/code_generator_arm.cc \
 	optimizing/code_generator_x86.cc \
 	optimizing/graph_visualizer.cc \
+	optimizing/locations.cc \
 	optimizing/nodes.cc \
 	optimizing/optimizing_compiler.cc \
+	optimizing/parallel_move_resolver.cc \
+	optimizing/register_allocator.cc \
 	optimizing/ssa_builder.cc \
 	optimizing/ssa_liveness_analysis.cc \
 	trampolines/trampoline_compiler.cc \
diff --git a/compiler/dex/bb_optimizations.cc b/compiler/dex/bb_optimizations.cc
index 1852f80..8b5eba0 100644
--- a/compiler/dex/bb_optimizations.cc
+++ b/compiler/dex/bb_optimizations.cc
@@ -38,6 +38,13 @@
 /*
  * SSATransformation pass implementation start.
  */
+void SSATransformation::Start(const PassDataHolder* data) const {
+  DCHECK(data != nullptr);
+  CompilationUnit* cUnit = down_cast<const PassMEDataHolder*>(data)->c_unit;
+  DCHECK(cUnit != nullptr);
+  cUnit->mir_graph->SSATransformationStart();
+}
+
 bool SSATransformation::Worker(const PassDataHolder* data) const {
   DCHECK(data != nullptr);
   const PassMEDataHolder* pass_me_data_holder = down_cast<const PassMEDataHolder*>(data);
@@ -54,10 +61,7 @@
   DCHECK(data != nullptr);
   CompilationUnit* cUnit = down_cast<const PassMEDataHolder*>(data)->c_unit;
   DCHECK(cUnit != nullptr);
-  // Verify the dataflow information after the pass.
-  if (cUnit->enable_debug & (1 << kDebugVerifyDataflow)) {
-    cUnit->mir_graph->VerifyDataflow();
-  }
+  cUnit->mir_graph->SSATransformationEnd();
 }
 
 /*
diff --git a/compiler/dex/bb_optimizations.h b/compiler/dex/bb_optimizations.h
index 43dcdf4..3a529f2 100644
--- a/compiler/dex/bb_optimizations.h
+++ b/compiler/dex/bb_optimizations.h
@@ -143,12 +143,7 @@
 
   bool Worker(const PassDataHolder* data) const;
 
-  void Start(const PassDataHolder* data) const {
-    DCHECK(data != nullptr);
-    CompilationUnit* cUnit = down_cast<const PassMEDataHolder*>(data)->c_unit;
-    DCHECK(cUnit != nullptr);
-    cUnit->mir_graph->InitializeSSATransformation();
-  }
+  void Start(const PassDataHolder* data) const;
 
   void End(const PassDataHolder* data) const;
 };
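
For orientation, a minimal sketch of how the ME pass driver dispatches a pass such as SSATransformation, assuming the usual Start/Worker/End sequence; ApplyPassSketch and the exact loop are illustrative, not the verbatim PassDriver code:

    // Simplified sketch only; assumes PassMEDataHolder exposes c_unit and bb as in pass_me.h.
    void ApplyPassSketch(const Pass* pass, PassMEDataHolder* data, DataflowIterator* iter) {
      pass->Start(data);                  // E.g. triggers MIRGraph::SSATransformationStart().
      bool change = false;
      for (BasicBlock* bb = iter->Next(change); bb != nullptr; bb = iter->Next(change)) {
        data->bb = bb;                    // Hand the current block to the pass.
        change = pass->Worker(data);      // Per-block work.
      }
      pass->End(data);                    // E.g. triggers MIRGraph::SSATransformationEnd().
    }
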
diff --git a/compiler/dex/bit_vector_block_iterator.h b/compiler/dex/bit_vector_block_iterator.h
deleted file mode 100644
index 0f1c2b6..0000000
--- a/compiler/dex/bit_vector_block_iterator.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_DEX_BIT_VECTOR_BLOCK_ITERATOR_H_
-#define ART_COMPILER_DEX_BIT_VECTOR_BLOCK_ITERATOR_H_
-
-#include "base/bit_vector.h"
-#include "compiler_enums.h"
-#include "utils/arena_bit_vector.h"
-#include "utils/arena_allocator.h"
-#include "compiler_ir.h"
-
-namespace art {
-
-class MIRGraph;
-
-/**
- * @class BasicBlockIterator
- * @brief Helper class to get the BasicBlocks when iterating through the ArenaBitVector.
- */
-class BitVectorBlockIterator {
-  public:
-    explicit BitVectorBlockIterator(BitVector* bv, MIRGraph* mir_graph)
-      : mir_graph_(mir_graph),
-        internal_iterator_(bv) {}
-
-    explicit BitVectorBlockIterator(BitVector* bv, CompilationUnit* c_unit)
-      : mir_graph_(c_unit->mir_graph.get()),
-        internal_iterator_(bv) {}
-
-    BasicBlock* Next();
-
-    void* operator new(size_t size, ArenaAllocator* arena) {
-      return arena->Alloc(size, kArenaAllocGrowableArray);
-    };
-    void operator delete(void* p) {}  // Nop.
-
-  private:
-    MIRGraph* const mir_graph_;
-    BitVector::Iterator internal_iterator_;
-};
-
-}  // namespace art
-
-#endif  // ART_COMPILER_DEX_BIT_VECTOR_BLOCK_ITERATOR_H_
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 5b4492f..767ffbf 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -22,6 +22,7 @@
 namespace art {
 
 enum RegisterClass {
+  kInvalidRegClass,
   kCoreReg,
   kFPReg,
   kAnyReg,
diff --git a/compiler/dex/dataflow_iterator.h b/compiler/dex/dataflow_iterator.h
index b45d6a4..62973af 100644
--- a/compiler/dex/dataflow_iterator.h
+++ b/compiler/dex/dataflow_iterator.h
@@ -326,6 +326,81 @@
       GrowableArray<BasicBlock*>::Iterator all_nodes_iterator_;    /**< @brief The list of all the nodes */
   };
 
+  /**
+   * @class TopologicalSortIterator
+   * @brief Used to perform a Topological Sort Iteration of a MIRGraph.
+   */
+  class TopologicalSortIterator : public DataflowIterator {
+    public:
+      /**
+       * @brief The constructor, using all of the reachable blocks of the MIRGraph.
+       * @param mir_graph The MIRGraph considered.
+       */
+      explicit TopologicalSortIterator(MIRGraph* mir_graph)
+          : DataflowIterator(mir_graph, 0, mir_graph->GetTopologicalSortOrder() != nullptr ?
+            mir_graph->GetTopologicalSortOrder()->Size() : 0) {
+        // Extra setup for TopologicalSortIterator.
+        idx_ = start_idx_;
+        block_id_list_ = mir_graph->GetTopologicalSortOrder();
+
+        if (mir_graph->GetTopologicalSortOrder() == nullptr) {
+          /* Compute the topological order */
+          mir_graph->ComputeTopologicalSortOrder();
+        }
+      }
+
+      /**
+       * @brief Get the next BasicBlock depending on iteration order.
+       * @param had_change whether the user of the iteration changed the previous BasicBlock.
+       * @return the next BasicBlock following the iteration order, 0 if finished.
+       */
+      virtual BasicBlock* Next(bool had_change = false) {
+        // Update changed: if had_change is true, we remember it for the whole iteration.
+        changed_ |= had_change;
+
+        return ForwardSingleNext();
+      }
+  };
+
+  /**
+   * @class RepeatingTopologicalSortIterator
+   * @brief Used to perform a Topological Sort Iteration of a MIRGraph.
+   * @details If there is a change during an iteration, the iteration starts over at the end of the
+   *          iteration.
+   */
+  class RepeatingTopologicalSortIterator : public DataflowIterator {
+    public:
+     /**
+      * @brief The constructor, using all of the reachable blocks of the MIRGraph.
+      * @param mir_graph The MIRGraph considered.
+      */
+     explicit RepeatingTopologicalSortIterator(MIRGraph* mir_graph)
+         : DataflowIterator(mir_graph, 0, mir_graph->GetTopologicalSortOrder() != nullptr ?
+           mir_graph->GetTopologicalSortOrder()->Size() : 0) {
+       // Extra setup for RepeatingTopologicalSortIterator.
+       idx_ = start_idx_;
+       block_id_list_ = mir_graph->GetTopologicalSortOrder();
+
+       if (mir_graph->GetTopologicalSortOrder() == nullptr) {
+         /* Compute the topological order */
+         mir_graph->ComputeTopologicalSortOrder();
+       }
+     }
+
+     /**
+      * @brief Get the next BasicBlock depending on iteration order.
+      * @param had_change whether the user of the iteration changed the previous BasicBlock.
+      * @return the next BasicBlock following the iteration order, 0 if finished.
+      */
+     virtual BasicBlock* Next(bool had_change = false) {
+       // Update changed: if had_change is true, we remember it for the whole iteration.
+       changed_ |= had_change;
+
+       return ForwardRepeatNext();
+     }
+  };
+
+
 }  // namespace art
 
 #endif  // ART_COMPILER_DEX_DATAFLOW_ITERATOR_H_
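
A hypothetical usage sketch of the new iterators, following the same loop shape as the other DataflowIterator subclasses in this header; DoWork is a made-up per-block worker and mir_graph is assumed to be an existing MIRGraph*:

    TopologicalSortIterator iter(mir_graph);
    bool change = false;
    for (BasicBlock* bb = iter.Next(change); bb != nullptr; bb = iter.Next(change)) {
      change = DoWork(bb);  // Report whether this block was modified.
    }
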
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index ec2556b..c3f694d 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -136,22 +136,22 @@
 // TODO: Remove this when we are able to compile everything.
 int arm64_support_list[] = {
     Instruction::NOP,
-    // Instruction::MOVE,
-    // Instruction::MOVE_FROM16,
-    // Instruction::MOVE_16,
-    // Instruction::MOVE_WIDE,
-    // Instruction::MOVE_WIDE_FROM16,
-    // Instruction::MOVE_WIDE_16,
-    // Instruction::MOVE_OBJECT,
-    // Instruction::MOVE_OBJECT_FROM16,
-    // Instruction::MOVE_OBJECT_16,
+    Instruction::MOVE,
+    Instruction::MOVE_FROM16,
+    Instruction::MOVE_16,
+    Instruction::MOVE_WIDE,
+    Instruction::MOVE_WIDE_FROM16,
+    Instruction::MOVE_WIDE_16,
+    Instruction::MOVE_OBJECT,
+    Instruction::MOVE_OBJECT_FROM16,
+    Instruction::MOVE_OBJECT_16,
     // Instruction::MOVE_RESULT,
     // Instruction::MOVE_RESULT_WIDE,
     // Instruction::MOVE_RESULT_OBJECT,
     Instruction::MOVE_EXCEPTION,
     Instruction::RETURN_VOID,
-    // Instruction::RETURN,
-    // Instruction::RETURN_WIDE,
+    Instruction::RETURN,
+    Instruction::RETURN_WIDE,
     // Instruction::RETURN_OBJECT,
     // Instruction::CONST_4,
     // Instruction::CONST_16,
@@ -180,11 +180,11 @@
     // Instruction::GOTO_32,
     // Instruction::PACKED_SWITCH,
     // Instruction::SPARSE_SWITCH,
-    // Instruction::CMPL_FLOAT,
-    // Instruction::CMPG_FLOAT,
-    // Instruction::CMPL_DOUBLE,
-    // Instruction::CMPG_DOUBLE,
-    // Instruction::CMP_LONG,
+    Instruction::CMPL_FLOAT,
+    Instruction::CMPG_FLOAT,
+    Instruction::CMPL_DOUBLE,
+    Instruction::CMPG_DOUBLE,
+    Instruction::CMP_LONG,
     // Instruction::IF_EQ,
     // Instruction::IF_NE,
     // Instruction::IF_LT,
@@ -258,110 +258,110 @@
     // Instruction::INVOKE_INTERFACE_RANGE,
     // Instruction::UNUSED_79,
     // Instruction::UNUSED_7A,
-    // Instruction::NEG_INT,
-    // Instruction::NOT_INT,
-    // Instruction::NEG_LONG,
-    // Instruction::NOT_LONG,
-    // Instruction::NEG_FLOAT,
-    // Instruction::NEG_DOUBLE,
-    // Instruction::INT_TO_LONG,
-    // Instruction::INT_TO_FLOAT,
-    // Instruction::INT_TO_DOUBLE,
-    // Instruction::LONG_TO_INT,
-    // Instruction::LONG_TO_FLOAT,
-    // Instruction::LONG_TO_DOUBLE,
-    // Instruction::FLOAT_TO_INT,
-    // Instruction::FLOAT_TO_LONG,
-    // Instruction::FLOAT_TO_DOUBLE,
-    // Instruction::DOUBLE_TO_INT,
-    // Instruction::DOUBLE_TO_LONG,
-    // Instruction::DOUBLE_TO_FLOAT,
-    // Instruction::INT_TO_BYTE,
-    // Instruction::INT_TO_CHAR,
-    // Instruction::INT_TO_SHORT,
-    // Instruction::ADD_INT,
-    // Instruction::SUB_INT,
-    // Instruction::MUL_INT,
-    // Instruction::DIV_INT,
-    // Instruction::REM_INT,
-    // Instruction::AND_INT,
-    // Instruction::OR_INT,
-    // Instruction::XOR_INT,
-    // Instruction::SHL_INT,
-    // Instruction::SHR_INT,
-    // Instruction::USHR_INT,
-    // Instruction::ADD_LONG,
-    // Instruction::SUB_LONG,
-    // Instruction::MUL_LONG,
-    // Instruction::DIV_LONG,
-    // Instruction::REM_LONG,
-    // Instruction::AND_LONG,
-    // Instruction::OR_LONG,
-    // Instruction::XOR_LONG,
-    // Instruction::SHL_LONG,
-    // Instruction::SHR_LONG,
-    // Instruction::USHR_LONG,
-    // Instruction::ADD_FLOAT,
-    // Instruction::SUB_FLOAT,
-    // Instruction::MUL_FLOAT,
-    // Instruction::DIV_FLOAT,
+    Instruction::NEG_INT,
+    Instruction::NOT_INT,
+    Instruction::NEG_LONG,
+    Instruction::NOT_LONG,
+    Instruction::NEG_FLOAT,
+    Instruction::NEG_DOUBLE,
+    Instruction::INT_TO_LONG,
+    Instruction::INT_TO_FLOAT,
+    Instruction::INT_TO_DOUBLE,
+    Instruction::LONG_TO_INT,
+    Instruction::LONG_TO_FLOAT,
+    Instruction::LONG_TO_DOUBLE,
+    Instruction::FLOAT_TO_INT,
+    Instruction::FLOAT_TO_LONG,
+    Instruction::FLOAT_TO_DOUBLE,
+    Instruction::DOUBLE_TO_INT,
+    Instruction::DOUBLE_TO_LONG,
+    Instruction::DOUBLE_TO_FLOAT,
+    Instruction::INT_TO_BYTE,
+    Instruction::INT_TO_CHAR,
+    Instruction::INT_TO_SHORT,
+    Instruction::ADD_INT,
+    Instruction::SUB_INT,
+    Instruction::MUL_INT,
+    Instruction::DIV_INT,
+    Instruction::REM_INT,
+    Instruction::AND_INT,
+    Instruction::OR_INT,
+    Instruction::XOR_INT,
+    Instruction::SHL_INT,
+    Instruction::SHR_INT,
+    Instruction::USHR_INT,
+    Instruction::ADD_LONG,
+    Instruction::SUB_LONG,
+    Instruction::MUL_LONG,
+    Instruction::DIV_LONG,
+    Instruction::REM_LONG,
+    Instruction::AND_LONG,
+    Instruction::OR_LONG,
+    Instruction::XOR_LONG,
+    Instruction::SHL_LONG,
+    Instruction::SHR_LONG,
+    Instruction::USHR_LONG,
+    Instruction::ADD_FLOAT,
+    Instruction::SUB_FLOAT,
+    Instruction::MUL_FLOAT,
+    Instruction::DIV_FLOAT,
     // Instruction::REM_FLOAT,
-    // Instruction::ADD_DOUBLE,
-    // Instruction::SUB_DOUBLE,
-    // Instruction::MUL_DOUBLE,
-    // Instruction::DIV_DOUBLE,
+    Instruction::ADD_DOUBLE,
+    Instruction::SUB_DOUBLE,
+    Instruction::MUL_DOUBLE,
+    Instruction::DIV_DOUBLE,
     // Instruction::REM_DOUBLE,
-    // Instruction::ADD_INT_2ADDR,
-    // Instruction::SUB_INT_2ADDR,
-    // Instruction::MUL_INT_2ADDR,
-    // Instruction::DIV_INT_2ADDR,
-    // Instruction::REM_INT_2ADDR,
-    // Instruction::AND_INT_2ADDR,
-    // Instruction::OR_INT_2ADDR,
-    // Instruction::XOR_INT_2ADDR,
-    // Instruction::SHL_INT_2ADDR,
-    // Instruction::SHR_INT_2ADDR,
-    // Instruction::USHR_INT_2ADDR,
-    // Instruction::ADD_LONG_2ADDR,
-    // Instruction::SUB_LONG_2ADDR,
-    // Instruction::MUL_LONG_2ADDR,
-    // Instruction::DIV_LONG_2ADDR,
-    // Instruction::REM_LONG_2ADDR,
-    // Instruction::AND_LONG_2ADDR,
-    // Instruction::OR_LONG_2ADDR,
-    // Instruction::XOR_LONG_2ADDR,
-    // Instruction::SHL_LONG_2ADDR,
-    // Instruction::SHR_LONG_2ADDR,
-    // Instruction::USHR_LONG_2ADDR,
-    // Instruction::ADD_FLOAT_2ADDR,
-    // Instruction::SUB_FLOAT_2ADDR,
-    // Instruction::MUL_FLOAT_2ADDR,
-    // Instruction::DIV_FLOAT_2ADDR,
+    Instruction::ADD_INT_2ADDR,
+    Instruction::SUB_INT_2ADDR,
+    Instruction::MUL_INT_2ADDR,
+    Instruction::DIV_INT_2ADDR,
+    Instruction::REM_INT_2ADDR,
+    Instruction::AND_INT_2ADDR,
+    Instruction::OR_INT_2ADDR,
+    Instruction::XOR_INT_2ADDR,
+    Instruction::SHL_INT_2ADDR,
+    Instruction::SHR_INT_2ADDR,
+    Instruction::USHR_INT_2ADDR,
+    Instruction::ADD_LONG_2ADDR,
+    Instruction::SUB_LONG_2ADDR,
+    Instruction::MUL_LONG_2ADDR,
+    Instruction::DIV_LONG_2ADDR,
+    Instruction::REM_LONG_2ADDR,
+    Instruction::AND_LONG_2ADDR,
+    Instruction::OR_LONG_2ADDR,
+    Instruction::XOR_LONG_2ADDR,
+    Instruction::SHL_LONG_2ADDR,
+    Instruction::SHR_LONG_2ADDR,
+    Instruction::USHR_LONG_2ADDR,
+    Instruction::ADD_FLOAT_2ADDR,
+    Instruction::SUB_FLOAT_2ADDR,
+    Instruction::MUL_FLOAT_2ADDR,
+    Instruction::DIV_FLOAT_2ADDR,
     // Instruction::REM_FLOAT_2ADDR,
-    // Instruction::ADD_DOUBLE_2ADDR,
-    // Instruction::SUB_DOUBLE_2ADDR,
-    // Instruction::MUL_DOUBLE_2ADDR,
-    // Instruction::DIV_DOUBLE_2ADDR,
+    Instruction::ADD_DOUBLE_2ADDR,
+    Instruction::SUB_DOUBLE_2ADDR,
+    Instruction::MUL_DOUBLE_2ADDR,
+    Instruction::DIV_DOUBLE_2ADDR,
     // Instruction::REM_DOUBLE_2ADDR,
-    // Instruction::ADD_INT_LIT16,
-    // Instruction::RSUB_INT,
-    // Instruction::MUL_INT_LIT16,
-    // Instruction::DIV_INT_LIT16,
-    // Instruction::REM_INT_LIT16,
-    // Instruction::AND_INT_LIT16,
-    // Instruction::OR_INT_LIT16,
-    // Instruction::XOR_INT_LIT16,
+    Instruction::ADD_INT_LIT16,
+    Instruction::RSUB_INT,
+    Instruction::MUL_INT_LIT16,
+    Instruction::DIV_INT_LIT16,
+    Instruction::REM_INT_LIT16,
+    Instruction::AND_INT_LIT16,
+    Instruction::OR_INT_LIT16,
+    Instruction::XOR_INT_LIT16,
     Instruction::ADD_INT_LIT8,
-    // Instruction::RSUB_INT_LIT8,
-    // Instruction::MUL_INT_LIT8,
-    // Instruction::DIV_INT_LIT8,
-    // Instruction::REM_INT_LIT8,
-    // Instruction::AND_INT_LIT8,
-    // Instruction::OR_INT_LIT8,
-    // Instruction::XOR_INT_LIT8,
-    // Instruction::SHL_INT_LIT8,
-    // Instruction::SHR_INT_LIT8,
-    // Instruction::USHR_INT_LIT8,
+    Instruction::RSUB_INT_LIT8,
+    Instruction::MUL_INT_LIT8,
+    Instruction::DIV_INT_LIT8,
+    Instruction::REM_INT_LIT8,
+    Instruction::AND_INT_LIT8,
+    Instruction::OR_INT_LIT8,
+    Instruction::XOR_INT_LIT8,
+    Instruction::SHL_INT_LIT8,
+    Instruction::SHR_INT_LIT8,
+    Instruction::USHR_INT_LIT8,
     // Instruction::IGET_QUICK,
     // Instruction::IGET_WIDE_QUICK,
     // Instruction::IGET_OBJECT_QUICK,
@@ -403,7 +403,7 @@
     // kMirOpNop,
     // kMirOpNullCheck,
     // kMirOpRangeCheck,
-    // kMirOpDivZeroCheck,
+    kMirOpDivZeroCheck,
     kMirOpCheck,
     // kMirOpCheckPart2,
     // kMirOpSelect,
@@ -692,14 +692,14 @@
 // S : short
 // C : char
 // I : int
-// L : long
+// J : long
 // F : float
 // D : double
 // L : reference(object, array)
 // V : void
 // (ARM64) Current calling convention only supports 32bit softfp
 //         which has problems with long, float, double
-constexpr char arm64_supported_types[] = "ZBSCILV";
+constexpr char arm64_supported_types[] = "ZBSCILVJFD";
 // (x86_64) We still have trouble compiling longs/doubles/floats
 constexpr char x86_64_supported_types[] = "ZBSCILV";
 
@@ -879,8 +879,9 @@
         (1 << kPromoteCompilerTemps));
   }
 
-  if (cu.instruction_set == kArm64) {
+  if (cu.instruction_set == kArm64 || cu.instruction_set == kX86_64) {
     // TODO(Arm64): enable optimizations once backend is mature enough.
+    // TODO(X86_64): enable optimizations once backend is mature enough.
     cu.disable_opt = ~(uint32_t)0;
     cu.enable_debug |= (1 << kDebugCodegenDump);
   }
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index ed7e1f5..47b233b 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -953,18 +953,34 @@
   defs[reg_index] = ssa_reg;
 }
 
+void MIRGraph::AllocateSSAUseData(MIR *mir, int num_uses) {
+  mir->ssa_rep->num_uses = num_uses;
+
+  if (mir->ssa_rep->num_uses_allocated < num_uses) {
+    mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses, kArenaAllocDFInfo));
+    // NOTE: will be filled in during type & size inference pass
+    mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, kArenaAllocDFInfo));
+  }
+}
+
+void MIRGraph::AllocateSSADefData(MIR *mir, int num_defs) {
+  mir->ssa_rep->num_defs = num_defs;
+
+  if (mir->ssa_rep->num_defs_allocated < num_defs) {
+    mir->ssa_rep->defs = static_cast<int*>(arena_->Alloc(sizeof(int) * num_defs,
+          kArenaAllocDFInfo));
+    mir->ssa_rep->fp_def = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_defs,
+          kArenaAllocDFInfo));
+  }
+}
+
 /* Look up new SSA names for format_35c instructions */
 void MIRGraph::DataFlowSSAFormat35C(MIR* mir) {
   MIR::DecodedInstruction* d_insn = &mir->dalvikInsn;
   int num_uses = d_insn->vA;
   int i;
 
-  mir->ssa_rep->num_uses = num_uses;
-  mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses,
-                                                       kArenaAllocDFInfo));
-  // NOTE: will be filled in during type & size inference pass
-  mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses,
-                                                          kArenaAllocDFInfo));
+  AllocateSSAUseData(mir, num_uses);
 
   for (i = 0; i < num_uses; i++) {
     HandleSSAUse(mir->ssa_rep->uses, d_insn->arg[i], i);
@@ -977,12 +993,7 @@
   int num_uses = d_insn->vA;
   int i;
 
-  mir->ssa_rep->num_uses = num_uses;
-  mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses,
-                                                       kArenaAllocDFInfo));
-  // NOTE: will be filled in during type & size inference pass
-  mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses,
-                                                          kArenaAllocDFInfo));
+  AllocateSSAUseData(mir, num_uses);
 
   for (i = 0; i < num_uses; i++) {
     HandleSSAUse(mir->ssa_rep->uses, d_insn->vC+i, i);
@@ -999,6 +1010,7 @@
     mir->ssa_rep =
         static_cast<struct SSARepresentation *>(arena_->Alloc(sizeof(SSARepresentation),
                                                               kArenaAllocDFInfo));
+    memset(mir->ssa_rep, 0, sizeof(*mir->ssa_rep));
 
     uint64_t df_attributes = GetDataFlowAttributes(mir);
 
@@ -1045,13 +1057,7 @@
       }
     }
 
-    if (num_uses) {
-      mir->ssa_rep->num_uses = num_uses;
-      mir->ssa_rep->uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses,
-                                                           kArenaAllocDFInfo));
-      mir->ssa_rep->fp_use = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses,
-                                                              kArenaAllocDFInfo));
-    }
+    AllocateSSAUseData(mir, num_uses);
 
     int num_defs = 0;
 
@@ -1062,13 +1068,7 @@
       }
     }
 
-    if (num_defs) {
-      mir->ssa_rep->num_defs = num_defs;
-      mir->ssa_rep->defs = static_cast<int*>(arena_->Alloc(sizeof(int) * num_defs,
-                                                           kArenaAllocDFInfo));
-      mir->ssa_rep->fp_def = static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_defs,
-                                                              kArenaAllocDFInfo));
-    }
+    AllocateSSADefData(mir, num_defs);
 
     MIR::DecodedInstruction* d_insn = &mir->dalvikInsn;
 
@@ -1114,11 +1114,11 @@
    * input to PHI nodes can be derived from the snapshot of all
    * predecessor blocks.
    */
-  bb->data_flow_info->vreg_to_ssa_map =
+  bb->data_flow_info->vreg_to_ssa_map_exit =
       static_cast<int*>(arena_->Alloc(sizeof(int) * cu_->num_dalvik_registers,
                                       kArenaAllocDFInfo));
 
-  memcpy(bb->data_flow_info->vreg_to_ssa_map, vreg_to_ssa_map_,
+  memcpy(bb->data_flow_info->vreg_to_ssa_map_exit, vreg_to_ssa_map_,
          sizeof(int) * cu_->num_dalvik_registers);
   return true;
 }
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index 4ba6677..0fffa01 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -17,6 +17,7 @@
 #include "mir_graph.h"
 
 #include <inttypes.h>
+#include <queue>
 
 #include "base/stl_util.h"
 #include "compiler_internals.h"
@@ -73,12 +74,13 @@
       use_counts_(arena, 256, kGrowableArrayMisc),
       raw_use_counts_(arena, 256, kGrowableArrayMisc),
       num_reachable_blocks_(0),
+      max_num_reachable_blocks_(0),
       dfs_order_(NULL),
       dfs_post_order_(NULL),
       dom_post_order_traversal_(NULL),
+      topological_order_(nullptr),
       i_dom_list_(NULL),
       def_block_matrix_(NULL),
-      temp_dalvik_register_v_(NULL),
       temp_scoped_alloc_(),
       temp_insn_data_(nullptr),
       temp_bit_vector_size_(0u),
@@ -149,7 +151,7 @@
   if (insn == NULL) {
     LOG(FATAL) << "Break split failed";
   }
-  BasicBlock *bottom_block = NewMemBB(kDalvikByteCode, num_blocks_++);
+  BasicBlock* bottom_block = NewMemBB(kDalvikByteCode, num_blocks_++);
   block_list_.Insert(bottom_block);
 
   bottom_block->start_offset = code_offset;
@@ -187,9 +189,9 @@
     orig_block->successor_blocks = NULL;
     GrowableArray<SuccessorBlockInfo*>::Iterator iterator(bottom_block->successor_blocks);
     while (true) {
-      SuccessorBlockInfo *successor_block_info = iterator.Next();
+      SuccessorBlockInfo* successor_block_info = iterator.Next();
       if (successor_block_info == NULL) break;
-      BasicBlock *bb = GetBasicBlock(successor_block_info->block);
+      BasicBlock* bb = GetBasicBlock(successor_block_info->block);
       bb->predecessors->Delete(orig_block->id);
       bb->predecessors->Insert(bottom_block->id);
     }
@@ -297,7 +299,7 @@
     }
   }
 
-  // Iterate over each of the handlers to enqueue the empty Catch blocks
+  // Iterate over each of the handlers to enqueue the empty Catch blocks.
   const byte* handlers_ptr = DexFile::GetCatchHandlerData(*current_code_item_, 0);
   uint32_t handlers_size = DecodeUnsignedLeb128(&handlers_ptr);
   for (uint32_t idx = 0; idx < handlers_size; idx++) {
@@ -344,7 +346,7 @@
       LOG(FATAL) << "Unexpected opcode(" << insn->dalvikInsn.opcode << ") with kBranch set";
   }
   CountBranch(target);
-  BasicBlock *taken_block = FindBlock(target, /* split */ true, /* create */ true,
+  BasicBlock* taken_block = FindBlock(target, /* split */ true, /* create */ true,
                                       /* immed_pred_block_p */ &cur_block);
   cur_block->taken = taken_block->id;
   taken_block->predecessors->Insert(cur_block->id);
@@ -404,7 +406,7 @@
     size = switch_data[1];
     first_key = switch_data[2] | (switch_data[3] << 16);
     target_table = reinterpret_cast<const int*>(&switch_data[4]);
-    keyTable = NULL;        // Make the compiler happy
+    keyTable = NULL;        // Make the compiler happy.
   /*
    * Sparse switch data format:
    *  ushort ident = 0x0200   magic value
@@ -420,7 +422,7 @@
     size = switch_data[1];
     keyTable = reinterpret_cast<const int*>(&switch_data[2]);
     target_table = reinterpret_cast<const int*>(&switch_data[2 + size*2]);
-    first_key = 0;   // To make the compiler happy
+    first_key = 0;   // To make the compiler happy.
   }
 
   if (cur_block->successor_block_list_type != kNotUsed) {
@@ -433,9 +435,9 @@
       new (arena_) GrowableArray<SuccessorBlockInfo*>(arena_, size, kGrowableArraySuccessorBlocks);
 
   for (i = 0; i < size; i++) {
-    BasicBlock *case_block = FindBlock(cur_offset + target_table[i], /* split */ true,
+    BasicBlock* case_block = FindBlock(cur_offset + target_table[i], /* split */ true,
                                       /* create */ true, /* immed_pred_block_p */ &cur_block);
-    SuccessorBlockInfo *successor_block_info =
+    SuccessorBlockInfo* successor_block_info =
         static_cast<SuccessorBlockInfo*>(arena_->Alloc(sizeof(SuccessorBlockInfo),
                                                        kArenaAllocSuccessor));
     successor_block_info->block = case_block->id;
@@ -478,13 +480,13 @@
         new (arena_) GrowableArray<SuccessorBlockInfo*>(arena_, 2, kGrowableArraySuccessorBlocks);
 
     for (; iterator.HasNext(); iterator.Next()) {
-      BasicBlock *catch_block = FindBlock(iterator.GetHandlerAddress(), false /* split*/,
+      BasicBlock* catch_block = FindBlock(iterator.GetHandlerAddress(), false /* split*/,
                                          false /* creat */, NULL  /* immed_pred_block_p */);
       catch_block->catch_entry = true;
       if (kIsDebugBuild) {
         catches_.insert(catch_block->start_offset);
       }
-      SuccessorBlockInfo *successor_block_info = reinterpret_cast<SuccessorBlockInfo*>
+      SuccessorBlockInfo* successor_block_info = reinterpret_cast<SuccessorBlockInfo*>
           (arena_->Alloc(sizeof(SuccessorBlockInfo), kArenaAllocSuccessor));
       successor_block_info->block = catch_block->id;
       successor_block_info->key = iterator.GetHandlerTypeIndex();
@@ -492,7 +494,7 @@
       catch_block->predecessors->Insert(cur_block->id);
     }
   } else if (build_all_edges) {
-    BasicBlock *eh_block = NewMemBB(kExceptionHandling, num_blocks_++);
+    BasicBlock* eh_block = NewMemBB(kExceptionHandling, num_blocks_++);
     cur_block->taken = eh_block->id;
     block_list_.Insert(eh_block);
     eh_block->start_offset = cur_offset;
@@ -502,7 +504,7 @@
   if (is_throw) {
     cur_block->explicit_throw = true;
     if (code_ptr < code_end) {
-      // Force creation of new block following THROW via side-effect
+      // Force creation of new block following THROW via side-effect.
       FindBlock(cur_offset + width, /* split */ false, /* create */ true,
                 /* immed_pred_block_p */ NULL);
     }
@@ -548,7 +550,7 @@
   *new_insn = *insn;
   insn->dalvikInsn.opcode =
       static_cast<Instruction::Code>(kMirOpCheck);
-  // Associate the two halves
+  // Associate the two halves.
   insn->meta.throw_insn = new_insn;
   new_block->AppendMIR(new_insn);
   return new_block;
@@ -615,7 +617,7 @@
   }
 
   /* Current block to record parsed instructions */
-  BasicBlock *cur_block = NewMemBB(kDalvikByteCode, num_blocks_++);
+  BasicBlock* cur_block = NewMemBB(kDalvikByteCode, num_blocks_++);
   DCHECK_EQ(current_offset_, 0U);
   cur_block->start_offset = current_offset_;
   block_list_.Insert(cur_block);
@@ -653,7 +655,7 @@
       cur_block->use_lvn = true;  // Run local value numbering on this basic block.
     }
 
-    // Check for inline data block signatures
+    // Check for inline data block signatures.
     if (opcode == Instruction::NOP) {
       // A simple NOP will have a width of 1 at this point, embedded data NOP > 1.
       if ((width == 1) && ((current_offset_ & 0x1) == 0x1) && ((code_end - code_ptr) > 1)) {
@@ -797,7 +799,7 @@
 
   for (idx = 0; idx < num_blocks; idx++) {
     int block_idx = all_blocks ? idx : dfs_order_->Get(idx);
-    BasicBlock *bb = GetBasicBlock(block_idx);
+    BasicBlock* bb = GetBasicBlock(block_idx);
     if (bb == NULL) continue;
     if (bb->block_type == kDead) continue;
     if (bb->block_type == kEntryBlock) {
@@ -807,7 +809,7 @@
     } else if (bb->block_type == kDalvikByteCode) {
       fprintf(file, "  block%04x_%d [shape=record,label = \"{ \\\n",
               bb->start_offset, bb->id);
-      const MIR *mir;
+      const MIR* mir;
         fprintf(file, "    {block id %d\\l}%s\\\n", bb->id,
                 bb->first_mir_insn ? " | " : " ");
         for (mir = bb->first_mir_insn; mir; mir = mir->next) {
@@ -869,13 +871,13 @@
               bb->start_offset, bb->id,
               (bb->successor_block_list_type == kCatch) ?  "Mrecord" : "record");
       GrowableArray<SuccessorBlockInfo*>::Iterator iterator(bb->successor_blocks);
-      SuccessorBlockInfo *successor_block_info = iterator.Next();
+      SuccessorBlockInfo* successor_block_info = iterator.Next();
 
       int succ_id = 0;
       while (true) {
         if (successor_block_info == NULL) break;
 
-        BasicBlock *dest_block = GetBasicBlock(successor_block_info->block);
+        BasicBlock* dest_block = GetBasicBlock(successor_block_info->block);
         SuccessorBlockInfo *next_successor_block_info = iterator.Next();
 
         fprintf(file, "    {<f%d> %04x: %04x\\l}%s\\\n",
@@ -897,7 +899,7 @@
 
       succ_id = 0;
       while (true) {
-        SuccessorBlockInfo *successor_block_info = iter.Next();
+        SuccessorBlockInfo* successor_block_info = iter.Next();
         if (successor_block_info == NULL) break;
 
         BasicBlock* dest_block = GetBasicBlock(successor_block_info->block);
@@ -926,44 +928,78 @@
 
 /* Insert an MIR instruction to the end of a basic block. */
 void BasicBlock::AppendMIR(MIR* mir) {
-  if (first_mir_insn == nullptr) {
-    DCHECK(last_mir_insn == nullptr);
-    last_mir_insn = first_mir_insn = mir;
-    mir->next = nullptr;
-  } else {
-    last_mir_insn->next = mir;
-    mir->next = nullptr;
-    last_mir_insn = mir;
-  }
-
-  mir->bb = id;
+  // Insert it after the last MIR.
+  InsertMIRListAfter(last_mir_insn, mir, mir);
 }
 
-/* Insert an MIR instruction to the head of a basic block. */
-void BasicBlock::PrependMIR(MIR* mir) {
-  if (first_mir_insn == nullptr) {
-    DCHECK(last_mir_insn == nullptr);
-    last_mir_insn = first_mir_insn = mir;
-    mir->next = nullptr;
-  } else {
-    mir->next = first_mir_insn;
-    first_mir_insn = mir;
-  }
+void BasicBlock::AppendMIRList(MIR* first_list_mir, MIR* last_list_mir) {
+  // Insert it after the last MIR.
+  InsertMIRListAfter(last_mir_insn, first_list_mir, last_list_mir);
+}
 
-  mir->bb = id;
+void BasicBlock::AppendMIRList(const std::vector<MIR*>& insns) {
+  for (std::vector<MIR*>::const_iterator it = insns.begin(); it != insns.end(); it++) {
+    MIR* new_mir = *it;
+
+    // Append each MIR to the end of the block.
+    InsertMIRListAfter(last_mir_insn, new_mir, new_mir);
+  }
 }
 
 /* Insert a MIR instruction after the specified MIR. */
 void BasicBlock::InsertMIRAfter(MIR* current_mir, MIR* new_mir) {
-  new_mir->next = current_mir->next;
-  current_mir->next = new_mir;
+  InsertMIRListAfter(current_mir, new_mir, new_mir);
+}
 
-  if (last_mir_insn == current_mir) {
-    /* Is the last MIR in the block? */
-    last_mir_insn = new_mir;
+void BasicBlock::InsertMIRListAfter(MIR* insert_after, MIR* first_list_mir, MIR* last_list_mir) {
+  // If no MIR, we are done.
+  if (first_list_mir == nullptr || last_list_mir == nullptr) {
+    return;
   }
 
-  new_mir->bb = id;
+  // If insert_after is null, assume BB is empty.
+  if (insert_after == nullptr) {
+    first_mir_insn = first_list_mir;
+    last_mir_insn = last_list_mir;
+    last_list_mir->next = nullptr;
+  } else {
+    MIR* after_list = insert_after->next;
+    insert_after->next = first_list_mir;
+    last_list_mir->next = after_list;
+    if (after_list == nullptr) {
+      last_mir_insn = last_list_mir;
+    }
+  }
+
+  // Set this BB to be the basic block of the MIRs.
+  MIR* last = last_list_mir->next;
+  for (MIR* mir = first_list_mir; mir != last; mir = mir->next) {
+    mir->bb = id;
+  }
+}
+
+/* Insert an MIR instruction to the head of a basic block. */
+void BasicBlock::PrependMIR(MIR* mir) {
+  InsertMIRListBefore(first_mir_insn, mir, mir);
+}
+
+void BasicBlock::PrependMIRList(MIR* first_list_mir, MIR* last_list_mir) {
+  // Insert it before the first MIR.
+  InsertMIRListBefore(first_mir_insn, first_list_mir, last_list_mir);
+}
+
+void BasicBlock::PrependMIRList(const std::vector<MIR*>& to_add) {
+  for (std::vector<MIR*>::const_iterator it = to_add.begin(); it != to_add.end(); it++) {
+    MIR* mir = *it;
+
+    InsertMIRListBefore(first_mir_insn, mir, mir);
+  }
+}
+
+/* Insert a MIR instruction before the specified MIR. */
+void BasicBlock::InsertMIRBefore(MIR* current_mir, MIR* new_mir) {
+  // Insert as a single element list.
+  return InsertMIRListBefore(current_mir, new_mir, new_mir);
 }
 
 MIR* BasicBlock::FindPreviousMIR(MIR* mir) {
@@ -982,20 +1018,79 @@
   return nullptr;
 }
 
-void BasicBlock::InsertMIRBefore(MIR* current_mir, MIR* new_mir) {
-  if (first_mir_insn == current_mir) {
-    /* Is the first MIR in the block? */
-    first_mir_insn = new_mir;
-    new_mir->bb = id;
+void BasicBlock::InsertMIRListBefore(MIR* insert_before, MIR* first_list_mir, MIR* last_list_mir) {
+  // If no MIR, we are done.
+  if (first_list_mir == nullptr || last_list_mir == nullptr) {
+    return;
   }
 
-  MIR* prev = FindPreviousMIR(current_mir);
-
-  if (prev != nullptr) {
-    prev->next = new_mir;
-    new_mir->next = current_mir;
-    new_mir->bb = id;
+  // If insert_before is null, assume BB is empty.
+  if (insert_before == nullptr) {
+    first_mir_insn = first_list_mir;
+    last_mir_insn = last_list_mir;
+    last_list_mir->next = nullptr;
+  } else {
+    if (first_mir_insn == insert_before) {
+      last_list_mir->next = first_mir_insn;
+      first_mir_insn = first_list_mir;
+    } else {
+      // Find the preceding MIR.
+      MIR* before_list = FindPreviousMIR(insert_before);
+      DCHECK(before_list != nullptr);
+      before_list->next = first_list_mir;
+      last_list_mir->next = insert_before;
+    }
   }
+
+  // Set this BB to be the basic block of the MIRs.
+  for (MIR* mir = first_list_mir; mir != last_list_mir->next; mir = mir->next) {
+    mir->bb = id;
+  }
+}
+
+bool BasicBlock::RemoveMIR(MIR* mir) {
+  // Remove as a single element list.
+  return RemoveMIRList(mir, mir);
+}
+
+bool BasicBlock::RemoveMIRList(MIR* first_list_mir, MIR* last_list_mir) {
+  if (first_list_mir == nullptr) {
+    return false;
+  }
+
+  // Try to find the MIR.
+  MIR* before_list = nullptr;
+  MIR* after_list = nullptr;
+
+  // If we are removing from the beginning of the MIR list.
+  if (first_mir_insn == first_list_mir) {
+    before_list = nullptr;
+  } else {
+    before_list = FindPreviousMIR(first_list_mir);
+    if (before_list == nullptr) {
+      // We did not find the mir.
+      return false;
+    }
+  }
+
+  // Remove the BB information and also find the after_list.
+  for (MIR* mir = first_list_mir; mir != last_list_mir; mir = mir->next) {
+    mir->bb = NullBasicBlockId;
+  }
+
+  after_list = last_list_mir->next;
+
+  // If there was nothing before the removed list, after_list becomes the new first MIR.
+  if (before_list == nullptr) {
+    first_mir_insn = after_list;
+  }
+
+  // If there was nothing after the removed list, before_list becomes the new last MIR.
+  if (after_list == nullptr) {
+    last_mir_insn = before_list;
+  }
+
+  return true;
 }
 
 MIR* BasicBlock::GetNextUnconditionalMir(MIRGraph* mir_graph, MIR* current) {
@@ -1023,7 +1118,7 @@
   char* ret;
   bool nop = false;
   SSARepresentation* ssa_rep = mir->ssa_rep;
-  Instruction::Format dalvik_format = Instruction::k10x;  // Default to no-operand format
+  Instruction::Format dalvik_format = Instruction::k10x;  // Default to no-operand format.
   int defs = (ssa_rep != NULL) ? ssa_rep->num_defs : 0;
   int uses = (ssa_rep != NULL) ? ssa_rep->num_uses : 0;
 
@@ -1031,7 +1126,7 @@
   if ((opcode == kMirOpCheck) || (opcode == kMirOpCheckPart2)) {
     str.append(extended_mir_op_names_[opcode - kMirOpFirst]);
     str.append(": ");
-    // Recover the original Dex instruction
+    // Recover the original Dex instruction.
     insn = mir->meta.throw_insn->dalvikInsn;
     ssa_rep = mir->meta.throw_insn->ssa_rep;
     defs = ssa_rep->num_defs;
@@ -1090,7 +1185,7 @@
     str.append(StringPrintf(" 0x%x (%c%x)", mir->offset + offset,
                             offset > 0 ? '+' : '-', offset > 0 ? offset : -offset));
   } else {
-    // For invokes-style formats, treat wide regs as a pair of singles
+    // For invokes-style formats, treat wide regs as a pair of singles.
     bool show_singles = ((dalvik_format == Instruction::k35c) ||
                          (dalvik_format == Instruction::k3rc));
     if (defs != 0) {
@@ -1111,28 +1206,28 @@
       }
     }
     switch (dalvik_format) {
-      case Instruction::k11n:  // Add one immediate from vB
+      case Instruction::k11n:  // Add one immediate from vB.
       case Instruction::k21s:
       case Instruction::k31i:
       case Instruction::k21h:
         str.append(StringPrintf(", #%d", insn.vB));
         break;
-      case Instruction::k51l:  // Add one wide immediate
+      case Instruction::k51l:  // Add one wide immediate.
         str.append(StringPrintf(", #%" PRId64, insn.vB_wide));
         break;
-      case Instruction::k21c:  // One register, one string/type/method index
+      case Instruction::k21c:  // One register, one string/type/method index.
       case Instruction::k31c:
         str.append(StringPrintf(", index #%d", insn.vB));
         break;
-      case Instruction::k22c:  // Two registers, one string/type/method index
+      case Instruction::k22c:  // Two registers, one string/type/method index.
         str.append(StringPrintf(", index #%d", insn.vC));
         break;
-      case Instruction::k22s:  // Add one immediate from vC
+      case Instruction::k22s:  // Add one immediate from vC.
       case Instruction::k22b:
         str.append(StringPrintf(", #%d", insn.vC));
         break;
       default: {
-        // Nothing left to print
+        // Nothing left to print.
       }
     }
   }
@@ -1166,7 +1261,7 @@
 // Similar to GetSSAName, but if ssa name represents an immediate show that as well.
 std::string MIRGraph::GetSSANameWithConst(int ssa_reg, bool singles_only) {
   if (reg_location_ == NULL) {
-    // Pre-SSA - just use the standard name
+    // Pre-SSA - just use the standard name.
     return GetSSAName(ssa_reg);
   }
   if (IsConst(reg_location_[ssa_reg])) {
@@ -1286,8 +1381,8 @@
 
 // Allocate a new basic block.
 BasicBlock* MIRGraph::NewMemBB(BBType block_type, int block_id) {
-  BasicBlock* bb = static_cast<BasicBlock*>(arena_->Alloc(sizeof(BasicBlock),
-                                                          kArenaAllocBB));
+  BasicBlock* bb = new (arena_) BasicBlock();
+
   bb->block_type = block_type;
   bb->id = block_id;
   // TUNING: better estimate of the exit block predecessors?
@@ -1305,18 +1400,24 @@
 }
 
 void MIRGraph::InitializeMethodUses() {
-  // The gate starts by initializing the use counts
+  // The gate starts by initializing the use counts.
   int num_ssa_regs = GetNumSSARegs();
   use_counts_.Resize(num_ssa_regs + 32);
   raw_use_counts_.Resize(num_ssa_regs + 32);
-  // Initialize list
+  // Initialize list.
   for (int i = 0; i < num_ssa_regs; i++) {
     use_counts_.Insert(0);
     raw_use_counts_.Insert(0);
   }
 }
 
-void MIRGraph::InitializeSSATransformation() {
+void MIRGraph::SSATransformationStart() {
+  DCHECK(temp_scoped_alloc_.get() == nullptr);
+  temp_scoped_alloc_.reset(ScopedArenaAllocator::Create(&cu_->arena_stack));
+  temp_bit_vector_size_ = cu_->num_dalvik_registers;
+  temp_bit_vector_ = new (temp_scoped_alloc_.get()) ArenaBitVector(
+      temp_scoped_alloc_.get(), temp_bit_vector_size_, false, kBitMapRegisterV);
+
   /* Compute the DFS order */
   ComputeDFSOrders();
 
@@ -1335,6 +1436,119 @@
   /* Rename register names by local defs and phi nodes */
   ClearAllVisitedFlags();
   DoDFSPreOrderSSARename(GetEntryBlock());
+
+  // Update the maximum number of reachable blocks.
+  max_num_reachable_blocks_ = num_reachable_blocks_;
+}
+
+void MIRGraph::SSATransformationEnd() {
+  // Verify the dataflow information after the pass.
+  if (cu_->enable_debug & (1 << kDebugVerifyDataflow)) {
+    VerifyDataflow();
+  }
+
+  temp_bit_vector_size_ = 0u;
+  temp_bit_vector_ = nullptr;
+  DCHECK(temp_scoped_alloc_.get() != nullptr);
+  temp_scoped_alloc_.reset();
+}
+
+void MIRGraph::ComputeTopologicalSortOrder() {
+  std::queue<BasicBlock*> q;
+  std::map<int, int> visited_cnt_values;
+
+  // Clear the nodes.
+  ClearAllVisitedFlags();
+
+  // Create the topological order if need be.
+  if (topological_order_ == nullptr) {
+    topological_order_ = new (arena_) GrowableArray<BasicBlockId>(arena_, 0);
+  }
+  topological_order_->Reset();
+
+  // Set up the visited_cnt_values map for all BBs. The default value for these counters in the map is zero.
+  // Also fill the initial queue.
+  GrowableArray<BasicBlock*>::Iterator iterator(&block_list_);
+
+  while (true) {
+    BasicBlock* bb = iterator.Next();
+
+    if (bb == nullptr) {
+      break;
+    }
+
+    if (bb->hidden == true) {
+      continue;
+    }
+
+    visited_cnt_values[bb->id] = bb->predecessors->Size();
+
+    GrowableArray<BasicBlockId>::Iterator pred_iterator(bb->predecessors);
+    // To process loops we should not wait for dominators.
+    while (true) {
+      BasicBlock* pred_bb = GetBasicBlock(pred_iterator.Next());
+
+      if (pred_bb == nullptr) {
+        break;
+      }
+
+      if (pred_bb->dominators == nullptr || pred_bb->hidden == true) {
+        continue;
+      }
+
+      // Skip the backward branch.
+      if (pred_bb->dominators->IsBitSet(bb->id) != 0) {
+        visited_cnt_values[bb->id]--;
+      }
+    }
+
+    // Add entry block to queue.
+    if (visited_cnt_values[bb->id] == 0) {
+      q.push(bb);
+    }
+  }
+
+  while (q.size() > 0) {
+    // Get the head of the queue.
+    BasicBlock* bb = q.front();
+    q.pop();
+
+    DCHECK_EQ(bb->hidden, false);
+
+    if (bb->IsExceptionBlock() == true) {
+      continue;
+    }
+
+    // We've visited all the predecessors. So, we can visit bb.
+    if (bb->visited == false) {
+      bb->visited = true;
+
+      // Now add the basic block.
+      topological_order_->Insert(bb->id);
+
+      // Reduce the visited count for all successors and add to the queue the ones whose count reaches zero.
+      ChildBlockIterator succIter(bb, this);
+      BasicBlock* successor = succIter.Next();
+      while (successor != nullptr) {
+        // One more predecessor was visited.
+        visited_cnt_values[successor->id]--;
+
+        if (visited_cnt_values[successor->id] <= 0 && successor->visited == false && successor->hidden == false) {
+          q.push(successor);
+        }
+
+        // Take next successor.
+        successor = succIter.Next();
+      }
+    }
+  }
+}
+
+bool BasicBlock::IsExceptionBlock() const {
+  if (block_type == kExceptionHandling) {
+    return true;
+  }
+  return false;
 }
 
 ChildBlockIterator::ChildBlockIterator(BasicBlock* bb, MIRGraph* mir_graph)
@@ -1388,46 +1602,52 @@
   return nullptr;
 }
 
-bool BasicBlock::RemoveMIR(MIR* mir) {
-  if (mir == nullptr) {
-    return false;
-  }
+BasicBlock* BasicBlock::Copy(CompilationUnit* c_unit) {
+  MIRGraph* mir_graph = c_unit->mir_graph.get();
+  return Copy(mir_graph);
+}
 
-  // Find the MIR, and the one before it if they exist.
-  MIR* current = nullptr;
-  MIR* prev = nullptr;
+BasicBlock* BasicBlock::Copy(MIRGraph* mir_graph) {
+  BasicBlock* result_bb = mir_graph->CreateNewBB(block_type);
 
-  // Find the mir we are looking for.
-  for (current = first_mir_insn; current != nullptr; prev = current, current = current->next) {
-    if (current == mir) {
-      break;
+  // We don't do a memcpy style copy here because it would lead to a lot of things
+  // to clean up. Let us do it by hand instead.
+  // Copy in taken and fallthrough.
+  result_bb->fall_through = fall_through;
+  result_bb->taken = taken;
+
+  // Copy successor links if needed.
+  ArenaAllocator* arena = mir_graph->GetArena();
+
+  result_bb->successor_block_list_type = successor_block_list_type;
+  if (result_bb->successor_block_list_type != kNotUsed) {
+    size_t size = successor_blocks->Size();
+    result_bb->successor_blocks = new (arena) GrowableArray<SuccessorBlockInfo*>(arena, size, kGrowableArraySuccessorBlocks);
+    GrowableArray<SuccessorBlockInfo*>::Iterator iterator(successor_blocks);
+    while (true) {
+      SuccessorBlockInfo* sbi_old = iterator.Next();
+      if (sbi_old == nullptr) {
+        break;
+      }
+      SuccessorBlockInfo* sbi_new = static_cast<SuccessorBlockInfo*>(arena->Alloc(sizeof(SuccessorBlockInfo), kArenaAllocSuccessor));
+      memcpy(sbi_new, sbi_old, sizeof(SuccessorBlockInfo));
+      result_bb->successor_blocks->Insert(sbi_new);
     }
   }
 
-  // Did we find it?
-  if (current != nullptr) {
-    MIR* next = current->next;
+  // Copy offset, method.
+  result_bb->start_offset = start_offset;
 
-    // Just update the links of prev and next and current is almost gone.
-    if (prev != nullptr) {
-      prev->next = next;
-    }
+  // Now copy instructions.
+  for (MIR* mir = first_mir_insn; mir != 0; mir = mir->next) {
+    // Get a copy first.
+    MIR* copy = mir->Copy(mir_graph);
 
-    // Exceptions are if first or last mirs are invoke.
-    if (first_mir_insn == current) {
-      first_mir_insn = next;
-    }
-
-    if (last_mir_insn == current) {
-      last_mir_insn = prev;
-    }
-
-    // Found it and removed it.
-    return true;
+    // Append it.
+    result_bb->AppendMIR(copy);
   }
 
-  // We did not find it.
-  return false;
+  return result_bb;
 }
 
 MIR* MIR::Copy(MIRGraph* mir_graph) {
@@ -1490,4 +1710,208 @@
   return res;
 }
 
+/**
+ * @brief Given a decoded instruction, check whether the instruction
+ * sets a constant and, if it does, provide more information about the
+ * constant being set.
+ * @param ptr_value pointer to a 64-bit holder for the constant.
+ * @param wide updated by the function to indicate whether a wide constant is being set.
+ * @return Returns false if the decoded instruction does not set a constant.
+ */
+bool MIR::DecodedInstruction::GetConstant(int64_t* ptr_value, bool* wide) const {
+  bool sets_const = true;
+  int64_t value = vB;
+
+  DCHECK(ptr_value != nullptr);
+  DCHECK(wide != nullptr);
+
+  switch (opcode) {
+    case Instruction::CONST_4:
+    case Instruction::CONST_16:
+    case Instruction::CONST:
+      *wide = false;
+      value <<= 32;      // In order to get the sign extend.
+      value >>= 32;
+      break;
+    case Instruction::CONST_HIGH16:
+      *wide = false;
+      value <<= 48;      // In order to get the sign extend.
+      value >>= 32;
+      break;
+    case Instruction::CONST_WIDE_16:
+    case Instruction::CONST_WIDE_32:
+      *wide = true;
+      value <<= 32;      // In order to get the sign extend.
+      value >>= 32;
+      break;
+    case Instruction::CONST_WIDE:
+      *wide = true;
+      value = vB_wide;
+      break;
+    case Instruction::CONST_WIDE_HIGH16:
+      *wide = true;
+      value <<= 48;      // In order to get the sign extend.
+      break;
+    default:
+      sets_const = false;
+      break;
+  }
+
+  if (sets_const) {
+    *ptr_value = value;
+  }
+
+  return sets_const;
+}
+
+void BasicBlock::ResetOptimizationFlags(uint16_t reset_flags) {
+  // Reset flags for all MIRs in bb.
+  for (MIR* mir = first_mir_insn; mir != NULL; mir = mir->next) {
+    mir->optimization_flags &= (~reset_flags);
+  }
+}
+
+void BasicBlock::Hide(CompilationUnit* c_unit) {
+  // First, let's make it a Dalvik bytecode block so it doesn't have any special meaning.
+  block_type = kDalvikByteCode;
+
+  // Mark it as hidden.
+  hidden = true;
+
+  // Detach it from its MIRs so we don't generate code for them. Also detached MIRs
+  // are updated to know that they no longer have a parent.
+  for (MIR* mir = first_mir_insn; mir != nullptr; mir = mir->next) {
+    mir->bb = NullBasicBlockId;
+  }
+  first_mir_insn = nullptr;
+  last_mir_insn = nullptr;
+
+  GrowableArray<BasicBlockId>::Iterator iterator(predecessors);
+
+  MIRGraph* mir_graph = c_unit->mir_graph.get();
+  while (true) {
+    BasicBlock* pred_bb = mir_graph->GetBasicBlock(iterator.Next());
+    if (pred_bb == nullptr) {
+      break;
+    }
+
+    // Sadly we have to go through the children by hand here.
+    pred_bb->ReplaceChild(id, NullBasicBlockId);
+  }
+
+  // Iterate through children of bb we are hiding.
+  ChildBlockIterator successorChildIter(this, mir_graph);
+
+  for (BasicBlock* childPtr = successorChildIter.Next(); childPtr != 0; childPtr = successorChildIter.Next()) {
+    // Replace child with null child.
+    childPtr->predecessors->Delete(id);
+  }
+}
+
+bool BasicBlock::IsSSALiveOut(const CompilationUnit* c_unit, int ssa_reg) {
+  // In order to determine if the ssa reg is live out, we scan all the MIRs. We remember
+  // the last SSA number of the same dalvik register. At the end, if it is different than ssa_reg,
+  // then it is not live out of this BB.
+  int dalvik_reg = c_unit->mir_graph->SRegToVReg(ssa_reg);
+
+  int last_ssa_reg = -1;
+
+  // Walk through the MIRs, remembering the last def of the register.
+  for (MIR* mir = first_mir_insn; mir != nullptr; mir = mir->next) {
+    // Get ssa rep.
+    SSARepresentation *ssa_rep = mir->ssa_rep;
+
+    // Go through the defines for this MIR.
+    for (int i = 0; i < ssa_rep->num_defs; i++) {
+      DCHECK(ssa_rep->defs != nullptr);
+
+      // Get the ssa reg.
+      int def_ssa_reg = ssa_rep->defs[i];
+
+      // Get dalvik reg.
+      int def_dalvik_reg = c_unit->mir_graph->SRegToVReg(def_ssa_reg);
+
+      // Compare dalvik regs.
+      if (dalvik_reg == def_dalvik_reg) {
+        // We found a def of the register that we are being asked about.
+        // Remember it.
+        last_ssa_reg = def_ssa_reg;
+      }
+    }
+  }
+
+  if (last_ssa_reg == -1) {
+    // If we get to this point we couldn't find a define of register user asked about.
+    // Let's assume the user knows what he's doing so we can be safe and say that if we
+    // couldn't find a def, it is live out.
+    return true;
+  }
+
+  // If it is not -1, we found a match, is it ssa_reg?
+  return (ssa_reg == last_ssa_reg);
+}
+
+bool BasicBlock::ReplaceChild(BasicBlockId old_bb, BasicBlockId new_bb) {
+  // We need to check taken, fall_through, and successor_blocks to replace.
+  bool found = false;
+  if (taken == old_bb) {
+    taken = new_bb;
+    found = true;
+  }
+
+  if (fall_through == old_bb) {
+    fall_through = new_bb;
+    found = true;
+  }
+
+  if (successor_block_list_type != kNotUsed) {
+    GrowableArray<SuccessorBlockInfo*>::Iterator iterator(successor_blocks);
+    while (true) {
+      SuccessorBlockInfo* successor_block_info = iterator.Next();
+      if (successor_block_info == nullptr) {
+        break;
+      }
+      if (successor_block_info->block == old_bb) {
+        successor_block_info->block = new_bb;
+        found = true;
+      }
+    }
+  }
+
+  return found;
+}
+
+void BasicBlock::UpdatePredecessor(BasicBlockId old_parent, BasicBlockId new_parent) {
+  GrowableArray<BasicBlockId>::Iterator iterator(predecessors);
+  bool found = false;
+
+  while (true) {
+    BasicBlockId pred_bb_id = iterator.Next();
+
+    if (pred_bb_id == NullBasicBlockId) {
+      break;
+    }
+
+    if (pred_bb_id == old_parent) {
+      size_t idx = iterator.GetIndex() - 1;
+      predecessors->Put(idx, new_parent);
+      found = true;
+      break;
+    }
+  }
+
+  // If not found, add it.
+  if (found == false) {
+    predecessors->Insert(new_parent);
+  }
+}
+
+// Create a new basic block whose block_id is num_blocks_, which is
+// then post-incremented.
+BasicBlock* MIRGraph::CreateNewBB(BBType block_type) {
+  BasicBlock* res = NewMemBB(block_type, num_blocks_++);
+  block_list_.Insert(res);
+  return res;
+}
+
 }  // namespace art
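
The shift pairs in DecodedInstruction::GetConstant() above implement sign extension of the narrow const literals. A small self-contained example of the CONST_HIGH16 case; the literal 0x1234 is chosen purely for illustration (a literal with bit 15 set, such as 0x8000, would come out as 0xFFFFFFFF80000000, i.e. the 32-bit value 0x80000000 sign-extended to 64 bits):

    #include <cassert>
    #include <cstdint>

    int main() {
      // CONST_HIGH16 carries a 16-bit literal that forms the upper half of a
      // 32-bit constant; the result is then sign-extended to 64 bits.
      int64_t value = 0x1234;  // The decoded vB.
      value <<= 48;            // The literal now sits in bits [63:48].
      value >>= 32;            // Arithmetic shift: literal lands in bits [47:32], sign-extended.
      assert(value == INT64_C(0x12340000));
      return 0;
    }
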
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index 0bb8265..3655125 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -223,7 +223,7 @@
   ArenaBitVector* def_v;
   ArenaBitVector* live_in_v;
   ArenaBitVector* phi_v;
-  int32_t* vreg_to_ssa_map;
+  int32_t* vreg_to_ssa_map_exit;
   ArenaBitVector* ending_check_v;  // For null check and class init check elimination.
 };
 
@@ -234,14 +234,21 @@
  * Following SSA renaming, this is the primary struct used by code generators to locate
  * operand and result registers.  This is a somewhat confusing and unhelpful convention that
  * we may want to revisit in the future.
+ *
+ * TODO:
+ *  1. Add accessors for uses/defs and make data private
+ *  2. Change fp_use/fp_def to a bit array (could help memory usage)
+ *  3. Combine array storage into internal array and handle it via accessors from 1.
  */
 struct SSARepresentation {
-  int16_t num_uses;
-  int16_t num_defs;
   int32_t* uses;
   bool* fp_use;
   int32_t* defs;
   bool* fp_def;
+  int16_t num_uses_allocated;
+  int16_t num_defs_allocated;
+  int16_t num_uses;
+  int16_t num_defs;
 
   static uint32_t GetStartUseIndex(Instruction::Code opcode);
 };
@@ -266,6 +273,56 @@
 
     explicit DecodedInstruction():vA(0), vB(0), vB_wide(0), vC(0), opcode(Instruction::NOP) {
     }
+
+    /*
+     * Given a decoded instruction representing a const bytecode, it updates
+     * the out arguments with proper values as dictated by the constant bytecode.
+     */
+    bool GetConstant(int64_t* ptr_value, bool* wide) const;
+
+    bool IsStore() const {
+      return ((Instruction::FlagsOf(opcode) & Instruction::kStore) == Instruction::kStore);
+    }
+
+    bool IsLoad() const {
+      return ((Instruction::FlagsOf(opcode) & Instruction::kLoad) == Instruction::kLoad);
+    }
+
+    bool IsConditionalBranch() const {
+      return (Instruction::FlagsOf(opcode) == (Instruction::kContinue | Instruction::kBranch));
+    }
+
+    /**
+     * @brief Is the register C component of the decoded instruction a field or constant?
+     */
+    bool IsCFieldOrConstant() const {
+      return ((Instruction::FlagsOf(opcode) & Instruction::kRegCFieldOrConstant) == Instruction::kRegCFieldOrConstant);
+    }
+
+    /**
+     * @brief Is the register B component of the decoded instruction a field or constant?
+     */
+    bool IsBFieldOrConstant() const {
+      return ((Instruction::FlagsOf(opcode) & Instruction::kRegBFieldOrConstant) == Instruction::kRegBFieldOrConstant);
+    }
+
+    bool IsCast() const {
+      return ((Instruction::FlagsOf(opcode) & Instruction::kCast) == Instruction::kCast);
+    }
+
+    /**
+     * @brief Does the instruction clobber memory?
+     * @details Clobber means that the instruction may change memory in a non-local way.
+     *          Therefore any assumption about memory aliasing or memory contents should be
+     *          discarded when crossing such an instruction.
+     */
+    bool Clobbers() const {
+      return ((Instruction::FlagsOf(opcode) & Instruction::kClobber) == Instruction::kClobber);
+    }
+
+    bool IsLinear() const {
+      return (Instruction::FlagsOf(opcode) & (Instruction::kAdd | Instruction::kSubtract)) != 0;
+    }
   } dalvikInsn;
 
   NarrowDexOffset offset;         // Offset of the instruction in code units.
@@ -339,10 +396,49 @@
   GrowableArray<SuccessorBlockInfo*>* successor_blocks;
 
   void AppendMIR(MIR* mir);
+  void AppendMIRList(MIR* first_list_mir, MIR* last_list_mir);
+  void AppendMIRList(const std::vector<MIR*>& insns);
   void PrependMIR(MIR* mir);
+  void PrependMIRList(MIR* first_list_mir, MIR* last_list_mir);
+  void PrependMIRList(const std::vector<MIR*>& to_add);
   void InsertMIRAfter(MIR* current_mir, MIR* new_mir);
-  void InsertMIRBefore(MIR* current_mir, MIR* new_mir);
+  void InsertMIRListAfter(MIR* insert_after, MIR* first_list_mir, MIR* last_list_mir);
   MIR* FindPreviousMIR(MIR* mir);
+  void InsertMIRBefore(MIR* insert_before, MIR* list);
+  void InsertMIRListBefore(MIR* insert_before, MIR* first_list_mir, MIR* last_list_mir);
+  bool RemoveMIR(MIR* mir);
+  bool RemoveMIRList(MIR* first_list_mir, MIR* last_list_mir);
+
+  BasicBlock* Copy(CompilationUnit* c_unit);
+  BasicBlock* Copy(MIRGraph* mir_graph);
+
+  /**
+   * @brief Reset the optimization_flags field of each MIR.
+   */
+  void ResetOptimizationFlags(uint16_t reset_flags);
+
+  /**
+   * @brief Hide the BasicBlock.
+   * @details Set it to kDalvikByteCode, set hidden to true, remove all MIRs,
+   *          remove itself from any predecessor edges, remove itself from any
+   *          child's predecessor growable array.
+   */
+  void Hide(CompilationUnit* c_unit);
+
+  /**
+   * @brief Is ssa_reg the last SSA definition of that VR in the block?
+   */
+  bool IsSSALiveOut(const CompilationUnit* c_unit, int ssa_reg);
+
+  /**
+   * @brief Replace the edge going to old_bb to now go towards new_bb.
+   */
+  bool ReplaceChild(BasicBlockId old_bb, BasicBlockId new_bb);
+
+  /**
+   * @brief Update the predecessor growable array from old_pred to new_pred.
+   */
+  void UpdatePredecessor(BasicBlockId old_pred, BasicBlockId new_pred);
 
   /**
    * @brief Used to obtain the next MIR that follows unconditionally.
@@ -353,12 +449,17 @@
    * @return Returns the following MIR if one can be found.
    */
   MIR* GetNextUnconditionalMir(MIRGraph* mir_graph, MIR* current);
-  bool RemoveMIR(MIR* mir);
+  bool IsExceptionBlock() const;
+
+  static void* operator new(size_t size, ArenaAllocator* arena) {
+    return arena->Alloc(sizeof(BasicBlock), kArenaAllocBB);
+  }
+  static void operator delete(void* p) {}  // Nop.
 };
 
 /*
  * The "blocks" field in "successor_block_list" points to an array of elements with the type
- * "SuccessorBlockInfo".  For catch blocks, key is type index for the exception.  For swtich
+ * "SuccessorBlockInfo".  For catch blocks, key is type index for the exception.  For switch
  * blocks, key is the case value.
  */
 struct SuccessorBlockInfo {
@@ -598,6 +699,10 @@
 
   void BasicBlockOptimization();
 
+  GrowableArray<BasicBlockId>* GetTopologicalSortOrder() {
+    return topological_order_;
+  }
+
   bool IsConst(int32_t s_reg) const {
     return is_constant_v_->IsBitSet(s_reg);
   }
@@ -865,6 +970,8 @@
   MIR* AdvanceMIR(BasicBlock** p_bb, MIR* mir);
   BasicBlock* NextDominatedBlock(BasicBlock* bb);
   bool LayoutBlocks(BasicBlock* bb);
+  void ComputeTopologicalSortOrder();
+  BasicBlock* CreateNewBB(BBType block_type);
 
   bool InlineCallsGate();
   void InlineCallsStart();
@@ -884,7 +991,7 @@
   /**
    * @brief Perform the initial preparation for the SSA Transformation.
    */
-  void InitializeSSATransformation();
+  void SSATransformationStart();
 
   /**
   * @brief Insert the operands for the Phi nodes.
@@ -894,6 +1001,11 @@
   bool InsertPhiNodeOperands(BasicBlock* bb);
 
   /**
+   * @brief Perform the cleanup after the SSA Transformation.
+   */
+  void SSATransformationEnd();
+
+  /**
    * @brief Perform constant propagation on a BasicBlock.
    * @param bb the considered BasicBlock.
    */
@@ -915,6 +1027,10 @@
   void CombineBlocks(BasicBlock* bb);
 
   void ClearAllVisitedFlags();
+
+  void AllocateSSAUseData(MIR *mir, int num_uses);
+  void AllocateSSADefData(MIR *mir, int num_defs);
+
   /*
    * IsDebugBuild sanity check: keep track of the Dex PCs for catch entries so that later on
    * we can verify that all catch entries have native PC entries.
@@ -1000,12 +1116,13 @@
   GrowableArray<uint32_t> use_counts_;      // Weighted by nesting depth
   GrowableArray<uint32_t> raw_use_counts_;  // Not weighted
   unsigned int num_reachable_blocks_;
+  unsigned int max_num_reachable_blocks_;
   GrowableArray<BasicBlockId>* dfs_order_;
   GrowableArray<BasicBlockId>* dfs_post_order_;
   GrowableArray<BasicBlockId>* dom_post_order_traversal_;
+  GrowableArray<BasicBlockId>* topological_order_;
   int* i_dom_list_;
   ArenaBitVector** def_block_matrix_;    // num_dalvik_register x num_blocks.
-  ArenaBitVector* temp_dalvik_register_v_;
   std::unique_ptr<ScopedArenaAllocator> temp_scoped_alloc_;
   uint16_t* temp_insn_data_;
   uint32_t temp_bit_vector_size_;
diff --git a/compiler/dex/mir_optimization.cc b/compiler/dex/mir_optimization.cc
index 749a235..1d4aef2 100644
--- a/compiler/dex/mir_optimization.cc
+++ b/compiler/dex/mir_optimization.cc
@@ -268,13 +268,22 @@
     DCHECK_EQ(ct_type, kCompilerTempVR);
 
     // The new non-special compiler temp must receive a unique v_reg with a negative value.
-    compiler_temp->v_reg = static_cast<int>(kVRegNonSpecialTempBaseReg) - num_non_special_compiler_temps_;
+    compiler_temp->v_reg = static_cast<int>(kVRegNonSpecialTempBaseReg) -
+        num_non_special_compiler_temps_;
     compiler_temp->s_reg_low = AddNewSReg(compiler_temp->v_reg);
     num_non_special_compiler_temps_++;
 
     if (wide) {
-      // Ensure that the two registers are consecutive. Since the virtual registers used for temps grow in a
-      // negative fashion, we need the smaller to refer to the low part. Thus, we redefine the v_reg and s_reg_low.
+      // Create a new CompilerTemp for the high part.
+      CompilerTemp *compiler_temp_high =
+          static_cast<CompilerTemp *>(arena_->Alloc(sizeof(CompilerTemp), kArenaAllocRegAlloc));
+      compiler_temp_high->v_reg = compiler_temp->v_reg;
+      compiler_temp_high->s_reg_low = compiler_temp->s_reg_low;
+      compiler_temps_.Insert(compiler_temp_high);
+
+      // Ensure that the two registers are consecutive. Since the virtual registers used for temps
+      // grow in a negative fashion, we need the smaller to refer to the low part. Thus, we
+      // redefine the v_reg and s_reg_low.
       compiler_temp->v_reg--;
       int ssa_reg_high = compiler_temp->s_reg_low;
       compiler_temp->s_reg_low = AddNewSReg(compiler_temp->v_reg);
diff --git a/compiler/dex/pass.h b/compiler/dex/pass.h
index ac22294..4ce040e 100644
--- a/compiler/dex/pass.h
+++ b/compiler/dex/pass.h
@@ -22,6 +22,11 @@
 #include "base/macros.h"
 namespace art {
 
+// Forward declarations.
+struct BasicBlock;
+struct CompilationUnit;
+class Pass;
+
 // Empty Pass Data Class, can be extended by any pass extending the base Pass class.
 class PassDataHolder {
 };
diff --git a/compiler/dex/pass_driver.cc b/compiler/dex/pass_driver.cc
deleted file mode 100644
index 999ed2a..0000000
--- a/compiler/dex/pass_driver.cc
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <dlfcn.h>
-
-#include "base/logging.h"
-#include "base/macros.h"
-#include "bb_optimizations.h"
-#include "compiler_internals.h"
-#include "dataflow_iterator.h"
-#include "dataflow_iterator-inl.h"
-#include "pass.h"
-#include "pass_driver.h"
-
-namespace art {
-
-namespace {  // anonymous namespace
-
-/**
- * @brief Helper function to create a single instance of a given Pass and can be shared across
- * the threads.
- */
-template <typename PassType>
-const Pass* GetPassInstance() {
-  static const PassType pass;
-  return &pass;
-}
-
-void DoWalkBasicBlocks(CompilationUnit* c_unit, const Pass* pass, DataflowIterator* iterator) {
-  // Paranoid: Check the iterator before walking the BasicBlocks.
-  DCHECK(iterator != nullptr);
-
-  bool change = false;
-  for (BasicBlock *bb = iterator->Next(change); bb != 0; bb = iterator->Next(change)) {
-    change = pass->WalkBasicBlocks(c_unit, bb);
-  }
-}
-
-template <typename Iterator>
-inline void DoWalkBasicBlocks(CompilationUnit* c_unit, const Pass* pass) {
-  Iterator iterator(c_unit->mir_graph.get());
-  DoWalkBasicBlocks(c_unit, pass, &iterator);
-}
-
-}  // anonymous namespace
-
-PassDriver::PassDriver(CompilationUnit* cu, bool create_default_passes)
-    : cu_(cu), dump_cfg_folder_("/sdcard/") {
-  DCHECK(cu != nullptr);
-
-  // If need be, create the default passes.
-  if (create_default_passes) {
-    CreatePasses();
-  }
-}
-
-PassDriver::~PassDriver() {
-}
-
-void PassDriver::InsertPass(const Pass* new_pass) {
-  DCHECK(new_pass != nullptr);
-  DCHECK(new_pass->GetName() != nullptr && new_pass->GetName()[0] != 0);
-
-  // It is an error to override an existing pass.
-  DCHECK(GetPass(new_pass->GetName()) == nullptr)
-      << "Pass name " << new_pass->GetName() << " already used.";
-
-  // Now add to the list.
-  pass_list_.push_back(new_pass);
-}
-
-/*
- * Create the pass list. These passes are immutable and are shared across the threads.
- *
- * Advantage is that there will be no race conditions here.
- * Disadvantage is the passes can't change their internal states depending on CompilationUnit:
- *   - This is not yet an issue: no current pass would require it.
- */
-static const Pass* const gPasses[] = {
-  GetPassInstance<CacheFieldLoweringInfo>(),
-  GetPassInstance<CacheMethodLoweringInfo>(),
-  GetPassInstance<CallInlining>(),
-  GetPassInstance<CodeLayout>(),
-  GetPassInstance<SSATransformation>(),
-  GetPassInstance<ConstantPropagation>(),
-  GetPassInstance<InitRegLocations>(),
-  GetPassInstance<MethodUseCount>(),
-  GetPassInstance<NullCheckEliminationAndTypeInference>(),
-  GetPassInstance<ClassInitCheckElimination>(),
-  GetPassInstance<BBCombine>(),
-  GetPassInstance<BBOptimizations>(),
-};
-
-// The default pass list is used by CreatePasses to initialize pass_list_.
-static std::vector<const Pass*> gDefaultPassList(gPasses, gPasses + arraysize(gPasses));
-
-void PassDriver::CreateDefaultPassList(const std::string& disable_passes) {
-  // Insert each pass from gPasses into gDefaultPassList.
-  gDefaultPassList.clear();
-  gDefaultPassList.reserve(arraysize(gPasses));
-  for (const Pass* pass : gPasses) {
-    // Check if we should disable this pass.
-    if (disable_passes.find(pass->GetName()) != std::string::npos) {
-      LOG(INFO) << "Skipping " << pass->GetName();
-    } else {
-      gDefaultPassList.push_back(pass);
-    }
-  }
-}
-
-void PassDriver::CreatePasses() {
-  // Insert each pass into the list via the InsertPass method.
-  pass_list_.reserve(gDefaultPassList.size());
-  for (const Pass* pass : gDefaultPassList) {
-    InsertPass(pass);
-  }
-}
-
-void PassDriver::HandlePassFlag(CompilationUnit* c_unit, const Pass* pass) {
-  // Unused parameters for the moment.
-  UNUSED(c_unit);
-  UNUSED(pass);
-}
-
-void PassDriver::DispatchPass(CompilationUnit* c_unit, const Pass* curPass) {
-  VLOG(compiler) << "Dispatching " << curPass->GetName();
-
-  DataFlowAnalysisMode mode = curPass->GetTraversal();
-
-  switch (mode) {
-    case kPreOrderDFSTraversal:
-      DoWalkBasicBlocks<PreOrderDfsIterator>(c_unit, curPass);
-      break;
-    case kRepeatingPreOrderDFSTraversal:
-      DoWalkBasicBlocks<RepeatingPreOrderDfsIterator>(c_unit, curPass);
-      break;
-    case kRepeatingPostOrderDFSTraversal:
-      DoWalkBasicBlocks<RepeatingPostOrderDfsIterator>(c_unit, curPass);
-      break;
-    case kReversePostOrderDFSTraversal:
-      DoWalkBasicBlocks<ReversePostOrderDfsIterator>(c_unit, curPass);
-      break;
-    case kRepeatingReversePostOrderDFSTraversal:
-      DoWalkBasicBlocks<RepeatingReversePostOrderDfsIterator>(c_unit, curPass);
-      break;
-    case kPostOrderDOMTraversal:
-      DoWalkBasicBlocks<PostOrderDOMIterator>(c_unit, curPass);
-      break;
-    case kAllNodes:
-      DoWalkBasicBlocks<AllNodesIterator>(c_unit, curPass);
-      break;
-    case kNoNodes:
-      break;
-    default:
-      LOG(FATAL) << "Iterator mode not handled in dispatcher: " << mode;
-      break;
-  }
-}
-
-void PassDriver::ApplyPass(CompilationUnit* c_unit, const Pass* curPass) {
-  curPass->Start(c_unit);
-  DispatchPass(c_unit, curPass);
-  curPass->End(c_unit);
-}
-
-bool PassDriver::RunPass(CompilationUnit* c_unit, const Pass* pass, bool time_split) {
-  // Paranoid: c_unit and pass cannot be nullptr, and the pass should have a name.
-  DCHECK(c_unit != nullptr);
-  DCHECK(pass != nullptr);
-  DCHECK(pass->GetName() != nullptr && pass->GetName()[0] != 0);
-
-  // Do we perform a time split
-  if (time_split) {
-    c_unit->NewTimingSplit(pass->GetName());
-  }
-
-  // Check the pass gate first.
-  bool should_apply_pass = pass->Gate(c_unit);
-
-  if (should_apply_pass) {
-    // Applying the pass: first start, doWork, and end calls.
-    ApplyPass(c_unit, pass);
-
-    // Clean up if need be.
-    HandlePassFlag(c_unit, pass);
-
-    // Do we want to log it?
-    if ((c_unit->enable_debug&  (1 << kDebugDumpCFG)) != 0) {
-      // Do we have a pass folder?
-      const char* passFolder = pass->GetDumpCFGFolder();
-      DCHECK(passFolder != nullptr);
-
-      if (passFolder[0] != 0) {
-        // Create directory prefix.
-        std::string prefix = GetDumpCFGFolder();
-        prefix += passFolder;
-        prefix += "/";
-
-        c_unit->mir_graph->DumpCFG(prefix.c_str(), false);
-      }
-    }
-  }
-
-  // If the pass gate passed, we can declare success.
-  return should_apply_pass;
-}
-
-bool PassDriver::RunPass(CompilationUnit* c_unit, const char* pass_name) {
-  // Paranoid: c_unit cannot be nullptr and we need a pass name.
-  DCHECK(c_unit != nullptr);
-  DCHECK(pass_name != nullptr && pass_name[0] != 0);
-
-  const Pass* cur_pass = GetPass(pass_name);
-
-  if (cur_pass != nullptr) {
-    return RunPass(c_unit, cur_pass);
-  }
-
-  // Return false, we did not find the pass.
-  return false;
-}
-
-void PassDriver::Launch() {
-  for (const Pass* cur_pass : pass_list_) {
-    RunPass(cu_, cur_pass, true);
-  }
-}
-
-void PassDriver::PrintPassNames() {
-  LOG(INFO) << "Loop Passes are:";
-
-  for (const Pass* cur_pass : gPasses) {
-    LOG(INFO) << "\t-" << cur_pass->GetName();
-  }
-}
-
-const Pass* PassDriver::GetPass(const char* name) const {
-  for (const Pass* cur_pass : pass_list_) {
-    if (strcmp(name, cur_pass->GetName()) == 0) {
-      return cur_pass;
-    }
-  }
-  return nullptr;
-}
-
-}  // namespace art
diff --git a/compiler/dex/pass_me.h b/compiler/dex/pass_me.h
index 1132166..069fb45 100644
--- a/compiler/dex/pass_me.h
+++ b/compiler/dex/pass_me.h
@@ -49,6 +49,8 @@
   kRepeatingPostOrderDFSTraversal,         /**< @brief Depth-First-Search / Repeating Post-Order. */
   kRepeatingReversePostOrderDFSTraversal,  /**< @brief Depth-First-Search / Repeating Reverse Post-Order. */
   kPostOrderDOMTraversal,                  /**< @brief Dominator tree / Post-Order. */
+  kTopologicalSortTraversal,               /**< @brief Topological Order traversal. */
+  kRepeatingTopologicalSortTraversal,      /**< @brief Repeating Topological Order traversal. */
   kNoNodes,                                /**< @brief Skip BasicBlock traversal. */
 };
 
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 2d1c19e..f0a9ca4 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -120,6 +120,7 @@
     bool GenInlinedSqrt(CallInfo* info);
     bool GenInlinedPeek(CallInfo* info, OpSize size);
     bool GenInlinedPoke(CallInfo* info, OpSize size);
+    void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
     void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
     void GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                    RegLocation rl_src2);
@@ -127,6 +128,8 @@
                     RegLocation rl_src2);
     void GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                     RegLocation rl_src2);
+    void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+                       RegLocation rl_src2, bool is_div);
     RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
     RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
     void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
diff --git a/compiler/dex/quick/arm/fp_arm.cc b/compiler/dex/quick/arm/fp_arm.cc
index bb02f74..dde8ff0 100644
--- a/compiler/dex/quick/arm/fp_arm.cc
+++ b/compiler/dex/quick/arm/fp_arm.cc
@@ -141,8 +141,11 @@
       break;
     case Instruction::LONG_TO_DOUBLE: {
       rl_src = LoadValueWide(rl_src, kFPReg);
-      RegStorage src_low = rl_src.reg.DoubleToLowSingle();
-      RegStorage src_high = rl_src.reg.DoubleToHighSingle();
+      RegisterInfo* info = GetRegInfo(rl_src.reg);
+      RegStorage src_low = info->FindMatchingView(RegisterInfo::kLowSingleStorageMask)->GetReg();
+      DCHECK(src_low.Valid());
+      RegStorage src_high = info->FindMatchingView(RegisterInfo::kHighSingleStorageMask)->GetReg();
+      DCHECK(src_high.Valid());
       rl_result = EvalLoc(rl_dest, kFPReg, true);
       RegStorage tmp1 = AllocTempDouble();
       RegStorage tmp2 = AllocTempDouble();
@@ -161,8 +164,11 @@
       return;
     case Instruction::LONG_TO_FLOAT: {
       rl_src = LoadValueWide(rl_src, kFPReg);
-      RegStorage src_low = rl_src.reg.DoubleToLowSingle();
-      RegStorage src_high = rl_src.reg.DoubleToHighSingle();
+      RegisterInfo* info = GetRegInfo(rl_src.reg);
+      RegStorage src_low = info->FindMatchingView(RegisterInfo::kLowSingleStorageMask)->GetReg();
+      DCHECK(src_low.Valid());
+      RegStorage src_high = info->FindMatchingView(RegisterInfo::kHighSingleStorageMask)->GetReg();
+      DCHECK(src_high.Valid());
       rl_result = EvalLoc(rl_dest, kFPReg, true);
       // Allocate temp registers.
       RegStorage high_val = AllocTempDouble();
@@ -334,22 +340,11 @@
 
 bool ArmMir2Lir::GenInlinedSqrt(CallInfo* info) {
   DCHECK_EQ(cu_->instruction_set, kThumb2);
-  LIR *branch;
   RegLocation rl_src = info->args[0];
   RegLocation rl_dest = InlineTargetWide(info);  // double place for result
   rl_src = LoadValueWide(rl_src, kFPReg);
   RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
   NewLIR2(kThumb2Vsqrtd, rl_result.reg.GetReg(), rl_src.reg.GetReg());
-  NewLIR2(kThumb2Vcmpd, rl_result.reg.GetReg(), rl_result.reg.GetReg());
-  NewLIR0(kThumb2Fmstat);
-  branch = NewLIR2(kThumbBCond, 0, kArmCondEq);
-  ClobberCallerSave();
-  LockCallTemps();  // Using fixed registers
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(4, pSqrt));
-  NewLIR3(kThumb2Fmrrd, rs_r0.GetReg(), rs_r1.GetReg(), rl_src.reg.GetReg());
-  NewLIR1(kThumbBlxR, r_tgt.GetReg());
-  NewLIR3(kThumb2Fmdrr, rl_result.reg.GetReg(), rs_r0.GetReg(), rs_r1.GetReg());
-  branch->target = NewLIR0(kPseudoTargetLabel);
   StoreValueWide(rl_dest, rl_result);
   return true;
 }
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 384a008..2556788 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -998,6 +998,15 @@
 #endif
 }
 
+void ArmMir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) {
+  LOG(FATAL) << "Unexpected use of GenNotLong()";
+}
+
+void ArmMir2Lir::GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+                           RegLocation rl_src2, bool is_div) {
+  LOG(FATAL) << "Unexpected use of GenDivRemLong()";
+}
+
 void ArmMir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
   rl_src = LoadValueWide(rl_src, kCoreReg);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
diff --git a/compiler/dex/quick/arm/target_arm.cc b/compiler/dex/quick/arm/target_arm.cc
index 1520c52..309f676 100644
--- a/compiler/dex/quick/arm/target_arm.cc
+++ b/compiler/dex/quick/arm/target_arm.cc
@@ -575,10 +575,10 @@
     // Redirect single precision's master storage to master.
     info->SetMaster(dp_reg_info);
     // Singles should show a single 32-bit mask bit, at first referring to the low half.
-    DCHECK_EQ(info->StorageMask(), 0x1U);
+    DCHECK_EQ(info->StorageMask(), RegisterInfo::kLowSingleStorageMask);
     if (sp_reg_num & 1) {
-      // For odd singles, change to user the high word of the backing double.
-      info->SetStorageMask(0x2);
+      // For odd singles, change to use the high word of the backing double.
+      info->SetStorageMask(RegisterInfo::kHighSingleStorageMask);
     }
   }
 
@@ -786,10 +786,13 @@
     }
   }
   if (res.Valid()) {
+    RegisterInfo* info = GetRegInfo(res);
     promotion_map_[p_map_idx].fp_location = kLocPhysReg;
-    promotion_map_[p_map_idx].FpReg = res.DoubleToLowSingle().GetReg();
+    promotion_map_[p_map_idx].FpReg =
+        info->FindMatchingView(RegisterInfo::kLowSingleStorageMask)->GetReg().GetReg();
     promotion_map_[p_map_idx+1].fp_location = kLocPhysReg;
-    promotion_map_[p_map_idx+1].FpReg = res.DoubleToHighSingle().GetReg();
+    promotion_map_[p_map_idx+1].FpReg =
+        info->FindMatchingView(RegisterInfo::kHighSingleStorageMask)->GetReg().GetReg();
   }
   return res;
 }
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index c3b23fd..6a6b0f6 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -298,6 +298,7 @@
   kA64Mov2rr,        // mov [00101010000] rm[20-16] [000000] [11111] rd[4-0].
   kA64Mvn2rr,        // mov [00101010001] rm[20-16] [000000] [11111] rd[4-0].
   kA64Mul3rrr,       // mul [00011011000] rm[20-16] [011111] rn[9-5] rd[4-0].
+  kA64Msub4rrrr,     // msub[s0011011000] rm[20-16] [1] ra[14-10] rn[9-5] rd[4-0].
   kA64Neg3rro,       // neg alias of "sub arg0, rzr, arg1, arg2".
   kA64Orr3Rrl,       // orr [s01100100] N[22] imm_r[21-16] imm_s[15-10] rn[9-5] rd[4-0].
   kA64Orr4rrro,      // orr [s0101010] shift[23-22] [0] rm[20-16] imm_6[15-10] rn[9-5] rd[4-0].
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 656f8fd..4a0c055 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -422,6 +422,10 @@
                  kFmtRegR, 4, 0, kFmtRegR, 9, 5, kFmtRegR, 20, 16,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE12,
                  "mul", "!0r, !1r, !2r", kFixupNone),
+    ENCODING_MAP(WIDE(kA64Msub4rrrr), SF_VARIANTS(0x1b008000),
+                 kFmtRegR, 4, 0, kFmtRegR, 9, 5, kFmtRegR, 14, 10,
+                 kFmtRegR, 20, 16, IS_QUAD_OP | REG_DEF0_USE123,
+                 "msub", "!0r, !1r, !3r, !2r", kFixupNone),
     ENCODING_MAP(WIDE(kA64Neg3rro), SF_VARIANTS(0x4b0003e0),
                  kFmtRegR, 4, 0, kFmtRegR, 20, 16, kFmtShift, -1, -1,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE1,
diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc
index f7a0199..2e3ef86 100644
--- a/compiler/dex/quick/arm64/call_arm64.cc
+++ b/compiler/dex/quick/arm64/call_arm64.cc
@@ -94,8 +94,7 @@
   tab_rec->anchor = switch_label;
 
   // Add displacement to base branch address and go!
-  OpRegRegRegShift(kOpAdd, r_base.GetReg(), r_base.GetReg(), r_disp.GetReg(),
-                   ENCODE_NO_SHIFT, true);
+  OpRegRegRegShift(kOpAdd, r_base, r_base, r_disp, ENCODE_NO_SHIFT);
   NewLIR1(kA64Br1x, r_base.GetReg());
 
   // Loop exit label.
@@ -148,8 +147,7 @@
   tab_rec->anchor = switch_label;
 
   // Add displacement to base branch address and go!
-  OpRegRegRegShift(kOpAdd, branch_reg.GetReg(), branch_reg.GetReg(), disp_reg.GetReg(),
-                   ENCODE_NO_SHIFT, true);
+  OpRegRegRegShift(kOpAdd, branch_reg, branch_reg, disp_reg, ENCODE_NO_SHIFT);
   NewLIR1(kA64Br1x, branch_reg.GetReg());
 
   // branch_over target here
@@ -334,7 +332,7 @@
 
   if (!skip_overflow_check) {
     LoadWordDisp(rs_rA64_SELF, Thread::StackEndOffset<8>().Int32Value(), rs_x12);
-    OpRegImm64(kOpSub, rs_rA64_SP, frame_size_, /*is_wide*/true);
+    OpRegImm64(kOpSub, rs_rA64_SP, frame_size_);
     if (Runtime::Current()->ExplicitStackOverflowChecks()) {
       /* Load stack limit */
       // TODO(Arm64): fix the line below:
@@ -348,7 +346,7 @@
       MarkPossibleStackOverflowException();
     }
   } else if (frame_size_ > 0) {
-    OpRegImm64(kOpSub, rs_rA64_SP, frame_size_, /*is_wide*/true);
+    OpRegImm64(kOpSub, rs_rA64_SP, frame_size_);
   }
 
   /* Need to spill any FP regs? */
@@ -391,7 +389,7 @@
     UnSpillCoreRegs(rs_rA64_SP, spill_offset, core_spill_mask_);
   }
 
-  OpRegImm64(kOpAdd, rs_rA64_SP, frame_size_, /*is_wide*/true);
+  OpRegImm64(kOpAdd, rs_rA64_SP, frame_size_);
   NewLIR0(kA64Ret);
 }
 
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 350e483..fddbfd7 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -93,6 +93,8 @@
     RegisterClass RegClassForFieldLoadStore(OpSize size, bool is_volatile) OVERRIDE;
 
     // Required for target - Dalvik-level generators.
+    void GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+                        RegLocation lr_shift);
     void GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                            RegLocation rl_src1, RegLocation rl_src2);
     void GenArrayGet(int opt_flags, OpSize size, RegLocation rl_array,
@@ -120,6 +122,8 @@
     bool GenInlinedSqrt(CallInfo* info);
     bool GenInlinedPeek(CallInfo* info, OpSize size);
     bool GenInlinedPoke(CallInfo* info, OpSize size);
+    void GenIntToLong(RegLocation rl_dest, RegLocation rl_src);
+    void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
     void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
     void GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                    RegLocation rl_src2);
@@ -127,6 +131,8 @@
                     RegLocation rl_src2);
     void GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                     RegLocation rl_src2);
+    void GenDivRemLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
+                       RegLocation rl_src2, bool is_div);
     RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
     RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
     void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
@@ -170,7 +176,7 @@
     LIR* OpReg(OpKind op, RegStorage r_dest_src);
     void OpRegCopy(RegStorage r_dest, RegStorage r_src);
     LIR* OpRegCopyNoInsert(RegStorage r_dest, RegStorage r_src);
-    LIR* OpRegImm64(OpKind op, RegStorage r_dest_src1, int64_t value, bool is_wide);
+    LIR* OpRegImm64(OpKind op, RegStorage r_dest_src1, int64_t value);
     LIR* OpRegImm(OpKind op, RegStorage r_dest_src1, int value);
     LIR* OpRegMem(OpKind op, RegStorage r_dest, RegStorage r_base, int offset);
     LIR* OpRegReg(OpKind op, RegStorage r_dest_src1, RegStorage r_src2);
@@ -191,8 +197,8 @@
 
     LIR* LoadBaseDispBody(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size);
     LIR* StoreBaseDispBody(RegStorage r_base, int displacement, RegStorage r_src, OpSize size);
-    LIR* OpRegRegRegShift(OpKind op, int r_dest, int r_src1, int r_src2, int shift,
-                          bool is_wide = false);
+    LIR* OpRegRegRegShift(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2,
+                          int shift);
     LIR* OpRegRegShift(OpKind op, RegStorage r_dest_src1, RegStorage r_src2, int shift);
     static const ArmEncodingMap EncodingMap[kA64Last];
     int EncodeShift(int code, int amount);
diff --git a/compiler/dex/quick/arm64/fp_arm64.cc b/compiler/dex/quick/arm64/fp_arm64.cc
index 87ab6fe..882ee66 100644
--- a/compiler/dex/quick/arm64/fp_arm64.cc
+++ b/compiler/dex/quick/arm64/fp_arm64.cc
@@ -25,10 +25,6 @@
   int op = kA64Brk1d;
   RegLocation rl_result;
 
-  /*
-   * Don't attempt to optimize register usage since these opcodes call out to
-   * the handlers.
-   */
   switch (opcode) {
     case Instruction::ADD_FLOAT_2ADDR:
     case Instruction::ADD_FLOAT:
@@ -119,49 +115,75 @@
                                  RegLocation rl_dest, RegLocation rl_src) {
   int op = kA64Brk1d;
   RegLocation rl_result;
+  RegisterClass src_reg_class = kInvalidRegClass;
+  RegisterClass dst_reg_class = kInvalidRegClass;
 
   switch (opcode) {
     case Instruction::INT_TO_FLOAT:
       op = kA64Scvtf2fw;
+      src_reg_class = kCoreReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::FLOAT_TO_INT:
       op = kA64Fcvtzs2wf;
+      src_reg_class = kFPReg;
+      dst_reg_class = kCoreReg;
       break;
     case Instruction::DOUBLE_TO_FLOAT:
       op = kA64Fcvt2sS;
+      src_reg_class = kFPReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::FLOAT_TO_DOUBLE:
       op = kA64Fcvt2Ss;
+      src_reg_class = kFPReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::INT_TO_DOUBLE:
       op = FWIDE(kA64Scvtf2fw);
+      src_reg_class = kCoreReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::DOUBLE_TO_INT:
       op = FWIDE(kA64Fcvtzs2wf);
+      src_reg_class = kFPReg;
+      dst_reg_class = kCoreReg;
       break;
     case Instruction::LONG_TO_DOUBLE:
       op = FWIDE(kA64Scvtf2fx);
+      src_reg_class = kCoreReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::FLOAT_TO_LONG:
       op = kA64Fcvtzs2xf;
+      src_reg_class = kFPReg;
+      dst_reg_class = kCoreReg;
       break;
     case Instruction::LONG_TO_FLOAT:
       op = kA64Scvtf2fx;
+      src_reg_class = kCoreReg;
+      dst_reg_class = kFPReg;
       break;
     case Instruction::DOUBLE_TO_LONG:
       op = FWIDE(kA64Fcvtzs2xf);
+      src_reg_class = kFPReg;
+      dst_reg_class = kCoreReg;
       break;
     default:
       LOG(FATAL) << "Unexpected opcode: " << opcode;
   }
 
+  DCHECK_NE(src_reg_class, kInvalidRegClass);
+  DCHECK_NE(dst_reg_class, kInvalidRegClass);
+  DCHECK_NE(op, kA64Brk1d);
+
   if (rl_src.wide) {
-    rl_src = LoadValueWide(rl_src, kFPReg);
+    rl_src = LoadValueWide(rl_src, src_reg_class);
   } else {
-    rl_src = LoadValue(rl_src, kFPReg);
+    rl_src = LoadValue(rl_src, src_reg_class);
   }
 
-  rl_result = EvalLoc(rl_dest, kFPReg, true);
+  rl_result = EvalLoc(rl_dest, dst_reg_class, true);
   NewLIR2(op, rl_result.reg.GetReg(), rl_src.reg.GetReg());
 
   if (rl_dest.wide) {
@@ -296,25 +318,11 @@
 }
 
 bool Arm64Mir2Lir::GenInlinedSqrt(CallInfo* info) {
-  // TODO(Arm64): implement this.
-  UNIMPLEMENTED(FATAL) << "GenInlinedSqrt not implemented for Arm64";
-
-  DCHECK_EQ(cu_->instruction_set, kArm64);
-  LIR *branch;
   RegLocation rl_src = info->args[0];
   RegLocation rl_dest = InlineTargetWide(info);  // double place for result
   rl_src = LoadValueWide(rl_src, kFPReg);
   RegLocation rl_result = EvalLoc(rl_dest, kFPReg, true);
   NewLIR2(FWIDE(kA64Fsqrt2ff), rl_result.reg.GetReg(), rl_src.reg.GetReg());
-  NewLIR2(FWIDE(kA64Fcmp2ff), rl_result.reg.GetReg(), rl_result.reg.GetReg());
-  branch = NewLIR2(kA64B2ct, kArmCondEq, 0);
-  ClobberCallerSave();
-  LockCallTemps();  // Using fixed registers
-  RegStorage r_tgt = LoadHelper(QUICK_ENTRYPOINT_OFFSET(8, pSqrt));
-  // NewLIR3(kThumb2Fmrrd, r0, r1, rl_src.reg.GetReg());
-  NewLIR1(kA64Blr1x, r_tgt.GetReg());
-  // NewLIR3(kThumb2Fmdrr, rl_result.reg.GetReg(), r0, r1);
-  branch->target = NewLIR0(kPseudoTargetLabel);
   StoreValueWide(rl_dest, rl_result);
   return true;
 }
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index b0f5904..8dad90a 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -53,10 +53,36 @@
   rl_result = EvalLoc(rl_dest, kCoreReg, true);
 
   OpRegReg(kOpCmp, rl_src1.reg, rl_src2.reg);
-  NewLIR4(kA64Csinc4rrrc, rl_result.reg.GetReg(), rwzr, rwzr, kArmCondEq);
-  NewLIR4(kA64Csneg4rrrc, rl_result.reg.GetReg(), rl_result.reg.GetReg(),
+  NewLIR4(WIDE(kA64Csinc4rrrc), rl_result.reg.GetReg(), rxzr, rxzr, kArmCondEq);
+  NewLIR4(WIDE(kA64Csneg4rrrc), rl_result.reg.GetReg(), rl_result.reg.GetReg(),
           rl_result.reg.GetReg(), kArmCondLe);
-  StoreValue(rl_dest, rl_result);
+  StoreValueWide(rl_dest, rl_result);
+}
+
+void Arm64Mir2Lir::GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
+                             RegLocation rl_src1, RegLocation rl_shift) {
+  OpKind op = kOpBkpt;
+  switch (opcode) {
+  case Instruction::SHL_LONG:
+  case Instruction::SHL_LONG_2ADDR:
+    op = kOpLsl;
+    break;
+  case Instruction::SHR_LONG:
+  case Instruction::SHR_LONG_2ADDR:
+    op = kOpAsr;
+    break;
+  case Instruction::USHR_LONG:
+  case Instruction::USHR_LONG_2ADDR:
+    op = kOpLsr;
+    break;
+  default:
+    LOG(FATAL) << "Unexpected case: " << opcode;
+  }
+  rl_shift = LoadValueWide(rl_shift, kCoreReg);
+  rl_src1 = LoadValueWide(rl_src1, kCoreReg);
+  RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+  OpRegRegReg(op, rl_result.reg, rl_src1.reg, rl_shift.reg);
+  StoreValueWide(rl_dest, rl_result);
 }
 
 void Arm64Mir2Lir::GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1,
@@ -69,7 +95,7 @@
     LIR* branch = NewLIR2(WIDE(opcode), rl_src1.reg.GetLowReg(), 0);
     branch->target = taken;
   } else {
-    OpRegImm64(kOpCmp, rl_src1.reg, val, /*is_wide*/true);
+    OpRegImm64(kOpCmp, rl_src1.reg, val);
     OpCondBranch(ccode, taken);
   }
 }
@@ -219,7 +245,8 @@
   ArmConditionCode arm_cond = ArmConditionEncoding(cond);
   if (check_value == 0 && (arm_cond == kArmCondEq || arm_cond == kArmCondNe)) {
     ArmOpcode opcode = (arm_cond == kArmCondEq) ? kA64Cbz2rt : kA64Cbnz2rt;
-    branch = NewLIR2(opcode, reg.GetReg(), 0);
+    ArmOpcode wide = reg.Is64Bit() ? WIDE(0) : UNWIDE(0);
+    branch = NewLIR2(opcode | wide, reg.GetReg(), 0);
   } else {
     OpRegImm(kOpCmp, reg, check_value);
     branch = NewLIR2(kA64B2ct, arm_cond, 0);
@@ -354,19 +381,16 @@
   NewLIR4(kA64Smaddl4xwwx, r_lo.GetReg(), r_magic.GetReg(), rl_src.reg.GetReg(), rxzr);
   switch (pattern) {
     case Divide3:
-      OpRegRegRegShift(kOpSub, rl_result.reg.GetReg(), r_hi.GetReg(),
-               rl_src.reg.GetReg(), EncodeShift(kA64Asr, 31));
+      OpRegRegRegShift(kOpSub, rl_result.reg, r_hi, rl_src.reg, EncodeShift(kA64Asr, 31));
       break;
     case Divide5:
       OpRegRegImm(kOpAsr, r_lo, rl_src.reg, 31);
-      OpRegRegRegShift(kOpRsub, rl_result.reg.GetReg(), r_lo.GetReg(), r_hi.GetReg(),
-               EncodeShift(kA64Asr, magic_table[lit].shift));
+      OpRegRegRegShift(kOpRsub, rl_result.reg, r_lo, r_hi, EncodeShift(kA64Asr, magic_table[lit].shift));
       break;
     case Divide7:
       OpRegReg(kOpAdd, r_hi, rl_src.reg);
       OpRegRegImm(kOpAsr, r_lo, rl_src.reg, 31);
-      OpRegRegRegShift(kOpRsub, rl_result.reg.GetReg(), r_lo.GetReg(), r_hi.GetReg(),
-               EncodeShift(kA64Asr, magic_table[lit].shift));
+      OpRegRegRegShift(kOpRsub, rl_result.reg, r_lo, r_hi, EncodeShift(kA64Asr, magic_table[lit].shift));
       break;
     default:
       LOG(FATAL) << "Unexpected pattern: " << pattern;
@@ -405,25 +429,30 @@
   return rl_result;
 }
 
-RegLocation Arm64Mir2Lir::GenDivRem(RegLocation rl_dest, RegStorage reg1, RegStorage reg2,
+RegLocation Arm64Mir2Lir::GenDivRem(RegLocation rl_dest, RegStorage r_src1, RegStorage r_src2,
                                   bool is_div) {
+  CHECK_EQ(r_src1.Is64Bit(), r_src2.Is64Bit());
+
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (is_div) {
-    // Simple case, use sdiv instruction.
-    OpRegRegReg(kOpDiv, rl_result.reg, reg1, reg2);
+    OpRegRegReg(kOpDiv, rl_result.reg, r_src1, r_src2);
   } else {
-    // Remainder case, use the following code:
-    // temp = reg1 / reg2      - integer division
-    // temp = temp * reg2
-    // dest = reg1 - temp
-
-    RegStorage temp = AllocTemp();
-    OpRegRegReg(kOpDiv, temp, reg1, reg2);
-    OpRegReg(kOpMul, temp, reg2);
-    OpRegRegReg(kOpSub, rl_result.reg, reg1, temp);
+    // temp = r_src1 / r_src2
+    // dest = r_src1 - temp * r_src2
+    RegStorage temp;
+    ArmOpcode wide;
+    if (rl_result.reg.Is64Bit()) {
+      temp = AllocTempWide();
+      wide = WIDE(0);
+    } else {
+      temp = AllocTemp();
+      wide = UNWIDE(0);
+    }
+    OpRegRegReg(kOpDiv, temp, r_src1, r_src2);
+    NewLIR4(kA64Msub4rrrr | wide, rl_result.reg.GetReg(), temp.GetReg(),
+            r_src1.GetReg(), r_src2.GetReg());
     FreeTemp(temp);
   }
-
   return rl_result;
 }
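As an illustrative aside, the sdiv + msub sequence above relies on the identity rem = src1 - (src1 / src2) * src2, which matches C++ signed division semantics (truncation toward zero). A small standalone sketch, independent of the LIR machinery (the helper name is hypothetical):

#include <cassert>
#include <cstdint>

// What the generated sdiv + msub pair computes: dest = src1 - (src1 / src2) * src2.
int64_t RemainderViaMsub(int64_t src1, int64_t src2) {
  int64_t temp = src1 / src2;   // sdiv: truncating division
  return src1 - temp * src2;    // msub: rd = ra - rn * rm
}

int main() {
  assert(RemainderViaMsub(7, 3) == 7 % 3);    // 1
  assert(RemainderViaMsub(-7, 3) == -7 % 3);  // -1, matching truncating division
  assert(RemainderViaMsub(7, -3) == 7 % -3);  // 1
  return 0;
}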
 
@@ -684,32 +713,22 @@
 void Arm64Mir2Lir::GenMultiplyByTwoBitMultiplier(RegLocation rl_src,
                                                RegLocation rl_result, int lit,
                                                int first_bit, int second_bit) {
-  OpRegRegRegShift(kOpAdd, rl_result.reg.GetReg(), rl_src.reg.GetReg(), rl_src.reg.GetReg(),
-                   EncodeShift(kA64Lsl, second_bit - first_bit));
+  OpRegRegRegShift(kOpAdd, rl_result.reg, rl_src.reg, rl_src.reg, EncodeShift(kA64Lsl, second_bit - first_bit));
   if (first_bit != 0) {
     OpRegRegImm(kOpLsl, rl_result.reg, rl_result.reg, first_bit);
   }
 }
 
 void Arm64Mir2Lir::GenDivZeroCheckWide(RegStorage reg) {
-  DCHECK(reg.IsPair());   // TODO: support k64BitSolo.
-  OpRegImm64(kOpCmp, reg, 0, /*is_wide*/true);
-  GenDivZeroCheck(kCondEq);
+  LOG(FATAL) << "Unexpected use of GenDivZero for Arm64";
 }
 
 // Test suspend flag, return target of taken suspend branch
 LIR* Arm64Mir2Lir::OpTestSuspend(LIR* target) {
-  // TODO(Arm64): re-enable suspend checks, once art_quick_test_suspend is implemented and
-  //   the suspend register is properly handled in the trampolines.
-#if 0
+  // FIXME: Define rA64_SUSPEND as w19 once we no longer need two copies of the reserved register.
+  // Note: The opcode is not marked wide, so we are actually using the 32-bit view of the register.
   NewLIR3(kA64Subs3rRd, rA64_SUSPEND, rA64_SUSPEND, 1);
   return OpCondBranch((target == NULL) ? kCondEq : kCondNe, target);
-#else
-  // TODO(Arm64): Fake suspend check. Will always fail to branch. Remove this.
-  LIR* branch = NewLIR2((target == NULL) ? kA64Cbnz2rt : kA64Cbz2rt, rwzr, 0);
-  branch->target = target;
-  return branch;
-#endif
 }
 
 // Decrement register and branch on condition
@@ -756,33 +775,51 @@
 #endif
 }
 
-void Arm64Mir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
-  rl_src = LoadValueWide(rl_src, kCoreReg);
-  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  RegStorage z_reg = AllocTemp();
-  LoadConstantNoClobber(z_reg, 0);
-  // Check for destructive overlap
-  if (rl_result.reg.GetLowReg() == rl_src.reg.GetHighReg()) {
-    RegStorage t_reg = AllocTemp();
-    OpRegRegReg(kOpSub, rl_result.reg.GetLow(), z_reg, rl_src.reg.GetLow());
-    OpRegRegReg(kOpSbc, rl_result.reg.GetHigh(), z_reg, t_reg);
-    FreeTemp(t_reg);
-  } else {
-    OpRegRegReg(kOpSub, rl_result.reg.GetLow(), z_reg, rl_src.reg.GetLow());
-    OpRegRegReg(kOpSbc, rl_result.reg.GetHigh(), z_reg, rl_src.reg.GetHigh());
-  }
-  FreeTemp(z_reg);
+void Arm64Mir2Lir::GenIntToLong(RegLocation rl_dest, RegLocation rl_src) {
+  RegLocation rl_result;
+
+  rl_src = LoadValue(rl_src, kCoreReg);
+  rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+  NewLIR4(WIDE(kA64Sbfm4rrdd), rl_result.reg.GetReg(), rl_src.reg.GetReg(), 0, 31);
+  StoreValueWide(rl_dest, rl_result);
+}
+
+void Arm64Mir2Lir::GenDivRemLong(Instruction::Code opcode, RegLocation rl_dest,
+                                 RegLocation rl_src1, RegLocation rl_src2, bool is_div) {
+  RegLocation rl_result;
+  rl_src1 = LoadValueWide(rl_src1, kCoreReg);
+  rl_src2 = LoadValueWide(rl_src2, kCoreReg);
+  GenDivZeroCheck(rl_src2.reg);
+  rl_result = GenDivRem(rl_dest, rl_src1.reg, rl_src2.reg, is_div);
   StoreValueWide(rl_dest, rl_result);
 }
 
 void Arm64Mir2Lir::GenLongOp(OpKind op, RegLocation rl_dest, RegLocation rl_src1,
                              RegLocation rl_src2) {
   RegLocation rl_result;
+
   rl_src1 = LoadValueWide(rl_src1, kCoreReg);
   rl_src2 = LoadValueWide(rl_src2, kCoreReg);
   rl_result = EvalLocWide(rl_dest, kCoreReg, true);
-  OpRegRegRegShift(op, rl_result.reg.GetReg(), rl_src1.reg.GetReg(), rl_src2.reg.GetReg(),
-                   ENCODE_NO_SHIFT, /*is_wide*/ true);
+  OpRegRegRegShift(op, rl_result.reg, rl_src1.reg, rl_src2.reg, ENCODE_NO_SHIFT);
+  StoreValueWide(rl_dest, rl_result);
+}
+
+void Arm64Mir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
+  RegLocation rl_result;
+
+  rl_src = LoadValueWide(rl_src, kCoreReg);
+  rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+  OpRegRegShift(kOpNeg, rl_result.reg, rl_src.reg, ENCODE_NO_SHIFT);
+  StoreValueWide(rl_dest, rl_result);
+}
+
+void Arm64Mir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) {
+  RegLocation rl_result;
+
+  rl_src = LoadValueWide(rl_src, kCoreReg);
+  rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+  OpRegRegShift(kOpMvn, rl_result.reg, rl_src.reg, ENCODE_NO_SHIFT);
   StoreValueWide(rl_dest, rl_result);
 }
 
@@ -865,8 +902,7 @@
     } else {
       // No special indexed operation, lea + load w/ displacement
       reg_ptr = AllocTemp();
-      OpRegRegRegShift(kOpAdd, reg_ptr.GetReg(), rl_array.reg.GetReg(), rl_index.reg.GetReg(),
-                       EncodeShift(kA64Lsl, scale));
+      OpRegRegRegShift(kOpAdd, reg_ptr, rl_array.reg, rl_index.reg, EncodeShift(kA64Lsl, scale));
       FreeTemp(rl_index.reg);
     }
     rl_result = EvalLoc(rl_dest, reg_class, true);
@@ -971,8 +1007,7 @@
       rl_src = LoadValue(rl_src, reg_class);
     }
     if (!constant_index) {
-      OpRegRegRegShift(kOpAdd, reg_ptr.GetReg(), rl_array.reg.GetReg(), rl_index.reg.GetReg(),
-                       EncodeShift(kA64Lsl, scale));
+      OpRegRegRegShift(kOpAdd, reg_ptr, rl_array.reg, rl_index.reg, EncodeShift(kA64Lsl, scale));
     }
     if (needs_range_check) {
       if (constant_index) {
@@ -1004,167 +1039,84 @@
   }
 }
 
-
 void Arm64Mir2Lir::GenShiftImmOpLong(Instruction::Code opcode,
                                    RegLocation rl_dest, RegLocation rl_src, RegLocation rl_shift) {
-  // TODO(Arm64): check this.
-  UNIMPLEMENTED(WARNING);
-
-  rl_src = LoadValueWide(rl_src, kCoreReg);
+  OpKind op = kOpBkpt;
   // Per spec, we only care about low 6 bits of shift amount.
   int shift_amount = mir_graph_->ConstantValue(rl_shift) & 0x3f;
+  rl_src = LoadValueWide(rl_src, kCoreReg);
   if (shift_amount == 0) {
     StoreValueWide(rl_dest, rl_src);
     return;
   }
-  if (BadOverlap(rl_src, rl_dest)) {
-    GenShiftOpLong(opcode, rl_dest, rl_src, rl_shift);
-    return;
-  }
-  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+
+  RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
   switch (opcode) {
     case Instruction::SHL_LONG:
     case Instruction::SHL_LONG_2ADDR:
-      if (shift_amount == 1) {
-        OpRegRegReg(kOpAdd, rl_result.reg.GetLow(), rl_src.reg.GetLow(), rl_src.reg.GetLow());
-        OpRegRegReg(kOpAdc, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), rl_src.reg.GetHigh());
-      } else if (shift_amount == 32) {
-        OpRegCopy(rl_result.reg.GetHigh(), rl_src.reg);
-        LoadConstant(rl_result.reg.GetLow(), 0);
-      } else if (shift_amount > 31) {
-        OpRegRegImm(kOpLsl, rl_result.reg.GetHigh(), rl_src.reg.GetLow(), shift_amount - 32);
-        LoadConstant(rl_result.reg.GetLow(), 0);
-      } else {
-        OpRegRegImm(kOpLsl, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), shift_amount);
-        OpRegRegRegShift(kOpOr, rl_result.reg.GetHighReg(), rl_result.reg.GetHighReg(), rl_src.reg.GetLowReg(),
-                         EncodeShift(kA64Lsr, 32 - shift_amount));
-        OpRegRegImm(kOpLsl, rl_result.reg.GetLow(), rl_src.reg.GetLow(), shift_amount);
-      }
+      op = kOpLsl;
       break;
     case Instruction::SHR_LONG:
     case Instruction::SHR_LONG_2ADDR:
-      if (shift_amount == 32) {
-        OpRegCopy(rl_result.reg.GetLow(), rl_src.reg.GetHigh());
-        OpRegRegImm(kOpAsr, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), 31);
-      } else if (shift_amount > 31) {
-        OpRegRegImm(kOpAsr, rl_result.reg.GetLow(), rl_src.reg.GetHigh(), shift_amount - 32);
-        OpRegRegImm(kOpAsr, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), 31);
-      } else {
-        RegStorage t_reg = AllocTemp();
-        OpRegRegImm(kOpLsr, t_reg, rl_src.reg.GetLow(), shift_amount);
-        OpRegRegRegShift(kOpOr, rl_result.reg.GetLowReg(), t_reg.GetReg(), rl_src.reg.GetHighReg(),
-                         EncodeShift(kA64Lsl, 32 - shift_amount));
-        FreeTemp(t_reg);
-        OpRegRegImm(kOpAsr, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), shift_amount);
-      }
+      op = kOpAsr;
       break;
     case Instruction::USHR_LONG:
     case Instruction::USHR_LONG_2ADDR:
-      if (shift_amount == 32) {
-        OpRegCopy(rl_result.reg.GetLow(), rl_src.reg.GetHigh());
-        LoadConstant(rl_result.reg.GetHigh(), 0);
-      } else if (shift_amount > 31) {
-        OpRegRegImm(kOpLsr, rl_result.reg.GetLow(), rl_src.reg.GetHigh(), shift_amount - 32);
-        LoadConstant(rl_result.reg.GetHigh(), 0);
-      } else {
-        RegStorage t_reg = AllocTemp();
-        OpRegRegImm(kOpLsr, t_reg, rl_src.reg.GetLow(), shift_amount);
-        OpRegRegRegShift(kOpOr, rl_result.reg.GetLowReg(), t_reg.GetReg(), rl_src.reg.GetHighReg(),
-                         EncodeShift(kA64Lsl, 32 - shift_amount));
-        FreeTemp(t_reg);
-        OpRegRegImm(kOpLsr, rl_result.reg.GetHigh(), rl_src.reg.GetHigh(), shift_amount);
-      }
+      op = kOpLsr;
       break;
     default:
       LOG(FATAL) << "Unexpected case";
   }
+  OpRegRegImm(op, rl_result.reg, rl_src.reg, shift_amount);
   StoreValueWide(rl_dest, rl_result);
 }
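As an illustrative aside, the 0x3f mask above reflects the rule that only the low six bits of the shift amount are significant for 64-bit shifts, which is what the single OpRegRegImm relies on. A tiny standalone sketch under that assumption (the helper name is hypothetical):

#include <cassert>
#include <cstdint>

// Long shifts only honor the low 6 bits of the shift amount,
// so a requested shift of 65 behaves like a shift of 1.
uint64_t ShlLong(uint64_t value, int32_t requested_shift) {
  int shift_amount = requested_shift & 0x3f;
  return value << shift_amount;
}

int main() {
  assert(ShlLong(1, 65) == ShlLong(1, 1));  // both are 2
  assert(ShlLong(1, 64) == 1);              // masked to a shift of 0
  return 0;
}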
 
 void Arm64Mir2Lir::GenArithImmOpLong(Instruction::Code opcode, RegLocation rl_dest,
                                      RegLocation rl_src1, RegLocation rl_src2) {
-  // TODO(Arm64): implement this.
-  UNIMPLEMENTED(WARNING);
-
-  if ((opcode == Instruction::SUB_LONG_2ADDR) || (opcode == Instruction::SUB_LONG)) {
+  if ((opcode == Instruction::SUB_LONG) || (opcode == Instruction::SUB_LONG_2ADDR)) {
     if (!rl_src2.is_const) {
-      // Don't bother with special handling for subtract from immediate.
-      GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
-      return;
+      return GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
     }
   } else {
-    // Normalize
+    // Commutativity: make sure the constant operand ends up in rl_src2.
     if (!rl_src2.is_const) {
       DCHECK(rl_src1.is_const);
       std::swap(rl_src1, rl_src2);
     }
   }
-  if (BadOverlap(rl_src1, rl_dest)) {
-    GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
-    return;
-  }
   DCHECK(rl_src2.is_const);
-  // TODO(Arm64): implement this.
-  //  int64_t val = mir_graph_->ConstantValueWide(rl_src2);
-  int32_t mod_imm_lo = -1;  // ModifiedImmediate(val_lo);
-  int32_t mod_imm_hi = -1;  // ModifiedImmediate(val_hi);
 
-  // Only a subset of add/sub immediate instructions set carry - so bail if we don't fit
+  OpKind op = kOpBkpt;
+  int64_t val = mir_graph_->ConstantValueWide(rl_src2);
+
   switch (opcode) {
     case Instruction::ADD_LONG:
     case Instruction::ADD_LONG_2ADDR:
+      op = kOpAdd;
+      break;
     case Instruction::SUB_LONG:
     case Instruction::SUB_LONG_2ADDR:
-      if ((mod_imm_lo < 0) || (mod_imm_hi < 0)) {
-        GenArithOpLong(opcode, rl_dest, rl_src1, rl_src2);
-        return;
-      }
-      break;
-    default:
-      break;
-  }
-  rl_src1 = LoadValueWide(rl_src1, kCoreReg);
-  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  // NOTE: once we've done the EvalLoc on dest, we can no longer bail.
-  switch (opcode) {
-#if 0
-    case Instruction::ADD_LONG:
-    case Instruction::ADD_LONG_2ADDR:
-      NewLIR3(kThumb2AddRRI8M, rl_result.reg.GetLowReg(), rl_src1.reg.GetLowReg(), mod_imm_lo);
-      NewLIR3(kThumb2AdcRRI8M, rl_result.reg.GetHighReg(), rl_src1.reg.GetHighReg(), mod_imm_hi);
-      break;
-    case Instruction::OR_LONG:
-    case Instruction::OR_LONG_2ADDR:
-      if ((val_lo != 0) || (rl_result.reg.GetLowReg() != rl_src1.reg.GetLowReg())) {
-        OpRegRegImm(kOpOr, rl_result.reg.GetLow(), rl_src1.reg.GetLow(), val_lo);
-      }
-      if ((val_hi != 0) || (rl_result.reg.GetHighReg() != rl_src1.reg.GetHighReg())) {
-        OpRegRegImm(kOpOr, rl_result.reg.GetHigh(), rl_src1.reg.GetHigh(), val_hi);
-      }
-      break;
-    case Instruction::XOR_LONG:
-    case Instruction::XOR_LONG_2ADDR:
-      OpRegRegImm(kOpXor, rl_result.reg.GetLow(), rl_src1.reg.GetLow(), val_lo);
-      OpRegRegImm(kOpXor, rl_result.reg.GetHigh(), rl_src1.reg.GetHigh(), val_hi);
+      op = kOpSub;
       break;
     case Instruction::AND_LONG:
     case Instruction::AND_LONG_2ADDR:
-      if ((val_lo != 0xffffffff) || (rl_result.reg.GetLowReg() != rl_src1.reg.GetLowReg())) {
-        OpRegRegImm(kOpAnd, rl_result.reg.GetLow(), rl_src1.reg.GetLow(), val_lo);
-      }
-      if ((val_hi != 0xffffffff) || (rl_result.reg.GetHighReg() != rl_src1.reg.GetHighReg())) {
-        OpRegRegImm(kOpAnd, rl_result.reg.GetHigh(), rl_src1.reg.GetHigh(), val_hi);
-      }
+      op = kOpAnd;
       break;
-    case Instruction::SUB_LONG_2ADDR:
-    case Instruction::SUB_LONG:
-      NewLIR3(kThumb2SubRRI8M, rl_result.reg.GetLowReg(), rl_src1.reg.GetLowReg(), mod_imm_lo);
-      NewLIR3(kThumb2SbcRRI8M, rl_result.reg.GetHighReg(), rl_src1.reg.GetHighReg(), mod_imm_hi);
+    case Instruction::OR_LONG:
+    case Instruction::OR_LONG_2ADDR:
+      op = kOpOr;
       break;
-#endif
+    case Instruction::XOR_LONG:
+    case Instruction::XOR_LONG_2ADDR:
+      op = kOpXor;
+      break;
     default:
-      LOG(FATAL) << "Unexpected opcode " << opcode;
+      LOG(FATAL) << "Unexpected opcode";
   }
+
+  rl_src1 = LoadValueWide(rl_src1, kCoreReg);
+  RegLocation rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+  OpRegRegImm(op, rl_result.reg, rl_src1.reg, val);
   StoreValueWide(rl_dest, rl_result);
 }
 
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 2b1c5e8..0222447 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -606,7 +606,7 @@
   GrowableArray<RegisterInfo*>::Iterator fp_it(&reg_pool_->sp_regs_);
   for (RegisterInfo* info = fp_it.Next(); info != nullptr; info = fp_it.Next()) {
     int fp_reg_num = info->GetReg().GetRegNum();
-    RegStorage dp_reg = RegStorage::Solo64(RegStorage::kFloatingPoint | fp_reg_num);
+    RegStorage dp_reg = RegStorage::FloatSolo64(fp_reg_num);
     RegisterInfo* dp_reg_info = GetRegInfo(dp_reg);
     // Double precision register's master storage should refer to itself.
     DCHECK_EQ(dp_reg_info, dp_reg_info->Master());
@@ -616,10 +616,18 @@
     DCHECK_EQ(info->StorageMask(), 0x1U);
   }
 
-  // TODO: re-enable this when we can safely save r4 over the suspension code path.
-  bool no_suspend = NO_SUSPEND;  // || !Runtime::Current()->ExplicitSuspendChecks();
-  if (no_suspend) {
-    GetRegInfo(rs_rA64_SUSPEND)->MarkFree();
+  // Alias 32bit W registers to corresponding 64bit X registers.
+  GrowableArray<RegisterInfo*>::Iterator w_it(&reg_pool_->core_regs_);
+  for (RegisterInfo* info = w_it.Next(); info != nullptr; info = w_it.Next()) {
+    int x_reg_num = info->GetReg().GetRegNum();
+    RegStorage x_reg = RegStorage::Solo64(x_reg_num);
+    RegisterInfo* x_reg_info = GetRegInfo(x_reg);
+    // 64bit X register's master storage should refer to itself.
+    DCHECK_EQ(x_reg_info, x_reg_info->Master());
+    // Redirect 32bit W master storage to 64bit X.
+    info->SetMaster(x_reg_info);
+    // 32bit W should show a single 32-bit mask bit, at first referring to the low half.
+    DCHECK_EQ(info->StorageMask(), 0x1U);
   }
 
   // Don't start allocating temps at r0/s0/d0 or you may clobber return regs in early-exit methods.
diff --git a/compiler/dex/quick/arm64/utility_arm64.cc b/compiler/dex/quick/arm64/utility_arm64.cc
index 39e9fad..eca0d2f 100644
--- a/compiler/dex/quick/arm64/utility_arm64.cc
+++ b/compiler/dex/quick/arm64/utility_arm64.cc
@@ -408,7 +408,7 @@
       DCHECK_EQ(shift, ENCODE_NO_SHIFT);
       return NewLIR4(kA64Ubfm4rrdd | wide, r_dest_src1.GetReg(), r_src2.GetReg(), 0, 15);
     default:
-      return OpRegRegRegShift(op, r_dest_src1.GetReg(), r_dest_src1.GetReg(), r_src2.GetReg(), shift);
+      return OpRegRegRegShift(op, r_dest_src1, r_dest_src1, r_src2, shift);
   }
 
   DCHECK(!IsPseudoLirOp(opcode));
@@ -445,8 +445,8 @@
   return NULL;
 }
 
-LIR* Arm64Mir2Lir::OpRegRegRegShift(OpKind op, int r_dest, int r_src1,
-                                    int r_src2, int shift, bool is_wide) {
+LIR* Arm64Mir2Lir::OpRegRegRegShift(OpKind op, RegStorage r_dest, RegStorage r_src1,
+                                    RegStorage r_src2, int shift) {
   ArmOpcode opcode = kA64Brk1d;
 
   switch (op) {
@@ -500,21 +500,24 @@
   // The instructions above belong to two kinds:
   // - 4-operands instructions, where the last operand is a shift/extend immediate,
   // - 3-operands instructions with no shift/extend.
-  ArmOpcode widened_opcode = (is_wide) ? WIDE(opcode) : opcode;
+  ArmOpcode widened_opcode = r_dest.Is64Bit() ? WIDE(opcode) : opcode;
+  CHECK_EQ(r_dest.Is64Bit(), r_src1.Is64Bit());
+  CHECK_EQ(r_dest.Is64Bit(), r_src2.Is64Bit());
   if (EncodingMap[opcode].flags & IS_QUAD_OP) {
     DCHECK_EQ(shift, ENCODE_NO_SHIFT);
-    return NewLIR4(widened_opcode, r_dest, r_src1, r_src2, shift);
+    return NewLIR4(widened_opcode, r_dest.GetReg(), r_src1.GetReg(), r_src2.GetReg(), shift);
   } else {
     DCHECK(EncodingMap[opcode].flags & IS_TERTIARY_OP);
     DCHECK_EQ(shift, ENCODE_NO_SHIFT);
-    return NewLIR3(widened_opcode, r_dest, r_src1, r_src2);
+    return NewLIR3(widened_opcode, r_dest.GetReg(), r_src1.GetReg(), r_src2.GetReg());
   }
 }
 
 LIR* Arm64Mir2Lir::OpRegRegReg(OpKind op, RegStorage r_dest, RegStorage r_src1, RegStorage r_src2) {
-  return OpRegRegRegShift(op, r_dest.GetReg(), r_src1.GetReg(), r_src2.GetReg(), ENCODE_NO_SHIFT);
+  return OpRegRegRegShift(op, r_dest, r_src1, r_src2, ENCODE_NO_SHIFT);
 }
 
+// TODO: Should this take an int64_t value?
 LIR* Arm64Mir2Lir::OpRegRegImm(OpKind op, RegStorage r_dest, RegStorage r_src1, int value) {
   LIR* res;
   bool neg = (value < 0);
@@ -523,6 +526,7 @@
   ArmOpcode alt_opcode = kA64Brk1d;
   int32_t log_imm = -1;
   bool is_wide = r_dest.Is64Bit();
+  CHECK_EQ(r_dest.Is64Bit(), r_src1.Is64Bit());
   ArmOpcode wide = (is_wide) ? WIDE(0) : UNWIDE(0);
 
   switch (op) {
@@ -610,11 +614,11 @@
 }
 
 LIR* Arm64Mir2Lir::OpRegImm(OpKind op, RegStorage r_dest_src1, int value) {
-  return OpRegImm64(op, r_dest_src1, static_cast<int64_t>(value), /*is_wide*/false);
+  return OpRegImm64(op, r_dest_src1, static_cast<int64_t>(value));
 }
 
-LIR* Arm64Mir2Lir::OpRegImm64(OpKind op, RegStorage r_dest_src1, int64_t value, bool is_wide) {
-  ArmOpcode wide = (is_wide) ? WIDE(0) : UNWIDE(0);
+LIR* Arm64Mir2Lir::OpRegImm64(OpKind op, RegStorage r_dest_src1, int64_t value) {
+  ArmOpcode wide = (r_dest_src1.Is64Bit()) ? WIDE(0) : UNWIDE(0);
   ArmOpcode opcode = kA64Brk1d;
   ArmOpcode neg_opcode = kA64Brk1d;
   bool shift;
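A short illustration of the new calling convention: the wide/narrow choice is now read off the RegStorage operands rather than passed as a separate is_wide flag. A sketch only, with arbitrary registers:

    RegStorage x0 = RegStorage::Solo64(0);
    RegStorage x1 = RegStorage::Solo64(1);
    RegStorage x2 = RegStorage::Solo64(2);
    OpRegRegRegShift(kOpAdd, x0, x1, x2, ENCODE_NO_SHIFT);  // 64-bit operands select the WIDE encoding
    // Mixing 32-bit and 64-bit operands now trips the CHECK_EQ(..., Is64Bit()) guards above.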
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 256135d..3fbbc4e 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1201,21 +1201,27 @@
 }
 
 RegLocation Mir2Lir::NarrowRegLoc(RegLocation loc) {
-  loc.wide = false;
   if (loc.location == kLocPhysReg) {
+    DCHECK(!loc.reg.Is32Bit());
     if (loc.reg.IsPair()) {
-      loc.reg = loc.reg.GetLow();
+      RegisterInfo* info_lo = GetRegInfo(loc.reg.GetLow());
+      RegisterInfo* info_hi = GetRegInfo(loc.reg.GetHigh());
+      info_lo->SetIsWide(false);
+      info_hi->SetIsWide(false);
+      loc.reg = info_lo->GetReg();
     } else {
-      // FIXME: temp workaround.
-      // Issue here: how do we narrow to a 32-bit value in 64-bit container?
-      // Probably the wrong thing to narrow the RegStorage container here.  That
-      // should be a target decision.  At the RegLocation level, we're only
-      // modifying the view of the Dalvik value - this is orthogonal to the storage
-      // container size.  Consider this a temp workaround.
-      DCHECK(loc.reg.IsDouble());
-      loc.reg = loc.reg.DoubleToLowSingle();
+      RegisterInfo* info = GetRegInfo(loc.reg);
+      RegisterInfo* info_new = info->FindMatchingView(RegisterInfo::k32SoloStorageMask);
+      DCHECK(info_new != nullptr);
+      if (info->IsLive() && (info->SReg() == loc.s_reg_low)) {
+        info->MarkDead();
+        info_new->MarkLive(loc.s_reg_low);
+      }
+      loc.reg = info_new->GetReg();
     }
+    DCHECK(loc.reg.Valid());
   }
+  loc.wide = false;
   return loc;
 }
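In short, narrowing a wide RegLocation now follows two bookkeeping paths; a sketch of the intent, using the RegisterInfo API this patch adds in mir_to_lir.h:

    // Pair case (32-bit targets): keep the low register and clear the wide flag on
    // both halves' RegisterInfo.
    // Solo case (64-bit targets): switch to the 32-bit view of the same physical
    // register via FindMatchingView(RegisterInfo::k32SoloStorageMask) and, if the
    // 64-bit view was live for this s_reg, move that liveness onto the 32-bit view.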
 
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index de55a05..7e3c8ce 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -1595,7 +1595,7 @@
       rl_result = EvalLoc(rl_dest, kCoreReg, true);
       OpRegReg(op, rl_result.reg, rl_src1.reg);
     } else {
-      if (shift_op) {
+      if ((shift_op) && (cu_->instruction_set != kArm64)) {
         rl_src2 = LoadValue(rl_src2, kCoreReg);
         RegStorage t_reg = AllocTemp();
         OpRegRegImm(kOpAnd, t_reg, rl_src2.reg, 31);
@@ -1613,7 +1613,7 @@
     StoreValue(rl_dest, rl_result);
   } else {
     bool done = false;      // Set to true if we happen to find a way to use a real instruction.
-    if (cu_->instruction_set == kMips) {
+    if (cu_->instruction_set == kMips || cu_->instruction_set == kArm64) {
       rl_src1 = LoadValue(rl_src1, kCoreReg);
       rl_src2 = LoadValue(rl_src2, kCoreReg);
       if (check_zero) {
@@ -1889,7 +1889,7 @@
       }
 
       bool done = false;
-      if (cu_->instruction_set == kMips) {
+      if (cu_->instruction_set == kMips || cu_->instruction_set == kArm64) {
         rl_src = LoadValue(rl_src, kCoreReg);
         rl_result = GenDivRemLit(rl_dest, rl_src.reg, lit, is_div);
         done = true;
@@ -1952,6 +1952,10 @@
 
   switch (opcode) {
     case Instruction::NOT_LONG:
+      if (cu->instruction_set == kArm64) {
+        mir_to_lir->GenNotLong(rl_dest, rl_src2);
+        return;
+      }
       rl_src2 = mir_to_lir->LoadValueWide(rl_src2, kCoreReg);
       rl_result = mir_to_lir->EvalLoc(rl_dest, kCoreReg, true);
       // Check for destructive overlap
@@ -1998,6 +2002,10 @@
       break;
     case Instruction::DIV_LONG:
     case Instruction::DIV_LONG_2ADDR:
+      if (cu->instruction_set == kArm64) {
+        mir_to_lir->GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ true);
+        return;
+      }
       call_out = true;
       check_zero = true;
       ret_reg = mir_to_lir->TargetReg(kRet0).GetReg();
@@ -2005,6 +2013,10 @@
       break;
     case Instruction::REM_LONG:
     case Instruction::REM_LONG_2ADDR:
+      if (cu->instruction_set == kArm64) {
+        mir_to_lir->GenDivRemLong(opcode, rl_dest, rl_src1, rl_src2, /*is_div*/ false);
+        return;
+      }
       call_out = true;
       check_zero = true;
       func_offset = QUICK_ENTRYPOINT_OFFSET(pointer_size, pLmod);
@@ -2014,7 +2026,8 @@
       break;
     case Instruction::AND_LONG_2ADDR:
     case Instruction::AND_LONG:
-      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64) {
+      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64 ||
+          cu->instruction_set == kArm64) {
         return mir_to_lir->GenAndLong(opcode, rl_dest, rl_src1, rl_src2);
       }
       first_op = kOpAnd;
@@ -2022,7 +2035,8 @@
       break;
     case Instruction::OR_LONG:
     case Instruction::OR_LONG_2ADDR:
-      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64) {
+      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64 ||
+          cu->instruction_set == kArm64) {
         mir_to_lir->GenOrLong(opcode, rl_dest, rl_src1, rl_src2);
         return;
       }
@@ -2031,7 +2045,8 @@
       break;
     case Instruction::XOR_LONG:
     case Instruction::XOR_LONG_2ADDR:
-      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64) {
+      if (cu->instruction_set == kX86 || cu->instruction_set == kX86_64 ||
+          cu->instruction_set == kArm64) {
         mir_to_lir->GenXorLong(opcode, rl_dest, rl_src1, rl_src2);
         return;
       }
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 721b345..a6d56bd 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -864,8 +864,17 @@
       // Wide spans, we need the 2nd half of uses[2].
       rl_arg = UpdateLocWide(rl_use2);
       if (rl_arg.location == kLocPhysReg) {
-        // NOTE: not correct for 64-bit core regs, but this needs rewriting for hard-float.
-        reg = rl_arg.reg.IsPair() ? rl_arg.reg.GetHigh() : rl_arg.reg.DoubleToHighSingle();
+        if (rl_arg.reg.IsPair()) {
+          reg = rl_arg.reg.GetHigh();
+        } else {
+          RegisterInfo* info = GetRegInfo(rl_arg.reg);
+          info = info->FindMatchingView(RegisterInfo::kHighSingleStorageMask);
+          if (info == nullptr) {
+            // NOTE: For the hard-float convention we won't split arguments across reg/mem.
+            UNIMPLEMENTED(FATAL) << "Needs hard float API.";
+          }
+          reg = info->GetReg();
+        }
       } else {
         // kArg2 & rArg3 can safely be used here
         reg = TargetReg(kArg3);
@@ -1659,9 +1668,13 @@
     return;
   }
   DCHECK(cu_->compiler_driver->GetMethodInlinerMap() != nullptr);
-  if (cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(cu_->dex_file)
-      ->GenIntrinsic(this, info)) {
-    return;
+  // TODO: Enable intrinsics for x86_64.
+  // Intrinsics are temporarily disabled for x86_64; they will be re-enabled step by step.
+  if (cu_->instruction_set != kX86_64) {
+    if (cu_->compiler_driver->GetMethodInlinerMap()->GetMethodInliner(cu_->dex_file)
+        ->GenIntrinsic(this, info)) {
+      return;
+    }
   }
   GenInvokeNoInline(info);
 }
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 2b57b35..e462173 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -118,6 +118,7 @@
     bool GenInlinedSqrt(CallInfo* info);
     bool GenInlinedPeek(CallInfo* info, OpSize size);
     bool GenInlinedPoke(CallInfo* info, OpSize size);
+    void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
     void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
     void GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                    RegLocation rl_src2);
@@ -125,6 +126,8 @@
                     RegLocation rl_src2);
     void GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                     RegLocation rl_src2);
+    void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+                       RegLocation rl_src2, bool is_div);
     RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
     RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
     void GenCmpLong(RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2);
diff --git a/compiler/dex/quick/mips/int_mips.cc b/compiler/dex/quick/mips/int_mips.cc
index 55e93d7..beaf6bb 100644
--- a/compiler/dex/quick/mips/int_mips.cc
+++ b/compiler/dex/quick/mips/int_mips.cc
@@ -431,6 +431,15 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+void MipsMir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) {
+  LOG(FATAL) << "Unexpected use GenNotLong()";
+}
+
+void MipsMir2Lir::GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+                                RegLocation rl_src2, bool is_div) {
+  LOG(FATAL) << "Unexpected use of GenDivRemLong()";
+}
+
 void MipsMir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
   rl_src = LoadValueWide(rl_src, kCoreReg);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 3584c33..361aba8 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -332,6 +332,15 @@
         return arena->Alloc(size, kArenaAllocRegAlloc);
       }
 
+      static const uint32_t k32SoloStorageMask     = 0x00000001;
+      static const uint32_t kLowSingleStorageMask  = 0x00000001;
+      static const uint32_t kHighSingleStorageMask = 0x00000002;
+      static const uint32_t k64SoloStorageMask     = 0x00000003;
+      static const uint32_t k128SoloStorageMask    = 0x0000000f;
+      static const uint32_t k256SoloStorageMask    = 0x000000ff;
+      static const uint32_t k512SoloStorageMask    = 0x0000ffff;
+      static const uint32_t k1024SoloStorageMask   = 0xffffffff;
+
       bool InUse() { return (storage_mask_ & master_->used_storage_) != 0; }
       void MarkInUse() { master_->used_storage_ |= storage_mask_; }
       void MarkFree() { master_->used_storage_ &= ~storage_mask_; }
@@ -389,7 +398,15 @@
       LIR* DefEnd() { return def_end_; }
       void SetDefEnd(LIR* def_end) { def_end_ = def_end; }
       void ResetDefBody() { def_start_ = def_end_ = nullptr; }
-
+      // Find member of aliased set matching storage_used; return nullptr if none.
+      RegisterInfo* FindMatchingView(uint32_t storage_used) {
+        RegisterInfo* res = Master();
+        for (; res != nullptr; res = res->GetAliasChain()) {
+          if (res->StorageMask() == storage_used)
+            break;
+        }
+        return res;
+      }
 
      private:
       RegStorage reg_;
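A minimal sketch of how the new storage masks compose; the values are the constants defined above, and the overlap test mirrors the one ClobberAliases gains later in this patch:

    uint32_t low    = RegisterInfo::kLowSingleStorageMask;   // 0x1: low 32-bit view
    uint32_t high   = RegisterInfo::kHighSingleStorageMask;  // 0x2: high 32-bit view
    uint32_t solo64 = RegisterInfo::k64SoloStorageMask;      // 0x3 == low | high
    bool overlaps = (solo64 & high) != 0;  // true: the 64-bit view covers the high single
    bool disjoint = (low & high) == 0;     // true: the two 32-bit views never conflict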
@@ -648,7 +665,7 @@
     virtual void EndInvoke(CallInfo* info) {}
 
 
-    // Handle bookkeeping to convert a wide RegLocation to a narow RegLocation.  No code generated.
+    // Handle bookkeeping to convert a wide RegLocation to a narrow RegLocation.  No code generated.
     RegLocation NarrowRegLoc(RegLocation loc);
 
     // Shared by all targets - implemented in local_optimizations.cc
@@ -670,7 +687,7 @@
     /* Mark a temp register as dead.  Does not affect allocation state. */
     void Clobber(RegStorage reg);
     void ClobberSReg(int s_reg);
-    void ClobberAliases(RegisterInfo* info);
+    void ClobberAliases(RegisterInfo* info, uint32_t clobber_mask);
     int SRegToPMap(int s_reg);
     void RecordCorePromotion(RegStorage reg, int s_reg);
     RegStorage AllocPreservedCoreReg(int s_reg);
@@ -775,7 +792,7 @@
                              RegLocation rl_src2, LIR* taken, LIR* fall_through);
     void GenCompareZeroAndBranch(Instruction::Code opcode, RegLocation rl_src,
                                  LIR* taken, LIR* fall_through);
-    void GenIntToLong(RegLocation rl_dest, RegLocation rl_src);
+    virtual void GenIntToLong(RegLocation rl_dest, RegLocation rl_src);
     void GenIntNarrowing(Instruction::Code opcode, RegLocation rl_dest,
                          RegLocation rl_src);
     void GenNewArray(uint32_t type_idx, RegLocation rl_dest,
@@ -800,7 +817,7 @@
     void GenCheckCast(uint32_t insn_idx, uint32_t type_idx, RegLocation rl_src);
     void GenLong3Addr(OpKind first_op, OpKind second_op, RegLocation rl_dest,
                       RegLocation rl_src1, RegLocation rl_src2);
-    void GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
+    virtual void GenShiftOpLong(Instruction::Code opcode, RegLocation rl_dest,
                         RegLocation rl_src1, RegLocation rl_shift);
     void GenArithOpIntLit(Instruction::Code opcode, RegLocation rl_dest,
                           RegLocation rl_src, int lit);
@@ -1170,6 +1187,7 @@
     virtual bool GenInlinedSqrt(CallInfo* info) = 0;
     virtual bool GenInlinedPeek(CallInfo* info, OpSize size) = 0;
     virtual bool GenInlinedPoke(CallInfo* info, OpSize size) = 0;
+    virtual void GenNotLong(RegLocation rl_dest, RegLocation rl_src) = 0;
     virtual void GenNegLong(RegLocation rl_dest, RegLocation rl_src) = 0;
     virtual void GenOrLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
                            RegLocation rl_src2) = 0;
@@ -1177,6 +1195,8 @@
                             RegLocation rl_src2) = 0;
     virtual void GenXorLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
                             RegLocation rl_src2) = 0;
+    virtual void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+                            RegLocation rl_src2, bool is_div) = 0;
     virtual RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi,
                                   bool is_div) = 0;
     virtual RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit,
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index 2c51c1f..e5ca460 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -173,22 +173,26 @@
       }
       ClobberBody(info);
       if (info->IsAliased()) {
-        ClobberAliases(info);
+        ClobberAliases(info, info->StorageMask());
       } else {
         RegisterInfo* master = info->Master();
         if (info != master) {
           ClobberBody(info->Master());
+          ClobberAliases(info->Master(), info->StorageMask());
         }
       }
     }
   }
 }
 
-void Mir2Lir::ClobberAliases(RegisterInfo* info) {
+void Mir2Lir::ClobberAliases(RegisterInfo* info, uint32_t clobber_mask) {
   for (RegisterInfo* alias = info->GetAliasChain(); alias != nullptr;
        alias = alias->GetAliasChain()) {
     DCHECK(!alias->IsAliased());  // Only the master should be marked as aliased.
-    ClobberBody(alias);
+    // Only clobber if we have overlap.
+    if ((alias->StorageMask() & clobber_mask) != 0) {
+      ClobberBody(alias);
+    }
   }
 }
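For illustration, the practical effect of the new clobber_mask, using the mask values from mir_to_lir.h; the d0/s0/s1 naming is ARM-style and purely illustrative:

    // d0 is the master; s0 aliases its low half (mask 0x1), s1 its high half (mask 0x2).
    // Clobbering s0 now reaches ClobberAliases(master, 0x1):
    //   s1's mask 0x2 does not overlap 0x1, so s1 stays live.
    // Clobbering d0 reaches ClobberAliases(master, 0x3):
    //   both singles overlap the mask and are clobbered, as before.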
 
@@ -218,7 +222,7 @@
         }
         ClobberBody(info);
         if (info->IsAliased()) {
-          ClobberAliases(info);
+          ClobberAliases(info, info->StorageMask());
         }
       }
     }
@@ -447,8 +451,11 @@
     reg = FindLiveReg(wide ? reg_pool_->dp_regs_ : reg_pool_->sp_regs_, s_reg);
   }
   if (!reg.Valid() && (reg_class != kFPReg)) {
-    // TODO: add 64-bit core pool similar to above.
-    reg = FindLiveReg(reg_pool_->core_regs_, s_reg);
+    if (Is64BitInstructionSet(cu_->instruction_set)) {
+      reg = FindLiveReg(wide ? reg_pool_->core64_regs_ : reg_pool_->core_regs_, s_reg);
+    } else {
+      reg = FindLiveReg(reg_pool_->core_regs_, s_reg);
+    }
   }
   if (reg.Valid()) {
     if (wide && !reg.IsFloat() && !Is64BitInstructionSet(cu_->instruction_set)) {
@@ -950,11 +957,8 @@
         // If I'm live, master should not be live, but should show liveness in alias set.
         DCHECK_EQ(info->Master()->SReg(), INVALID_SREG);
         DCHECK(!info->Master()->IsDead());
-      } else if (!info->IsDead()) {
-        // If I'm not live, but there is liveness in the set master must be live.
-        DCHECK_EQ(info->SReg(), INVALID_SREG);
-        DCHECK(info->Master()->IsLive());
       }
+      // TODO: Add checks in !info->IsDead() case to ensure every live bit is owned by exactly 1 reg.
     }
     if (info->IsAliased()) {
       // Has child aliases.
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 9200106..b8222ef 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -63,27 +63,36 @@
 { kX86 ## opname ## 16TI8, kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0x66, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "16TI8", "fs:[!0d],!1d" }, \
   \
 { kX86 ## opname ## 32MR,  kMemReg,    mem_use | IS_TERTIARY_OP |           REG_USE02  | SETS_CCODES | uses_ccodes, { 0,             0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "32MR", "[!0r+!1d],!2r" }, \
-{ kX86 ## opname ## 64MR,  kMemReg64,  mem_use | IS_TERTIARY_OP |           REG_USE02  | SETS_CCODES | uses_ccodes, { REX_W,         0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64MR", "[!0r+!1d],!2r" }, \
 { kX86 ## opname ## 32AR,  kArrayReg,  mem_use | IS_QUIN_OP     |           REG_USE014 | SETS_CCODES | uses_ccodes, { 0,             0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "32AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
-{ kX86 ## opname ## 64AR,  kArrayReg64, mem_use | IS_QUIN_OP     |           REG_USE014 | SETS_CCODES | uses_ccodes, { REX_W,         0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
 { kX86 ## opname ## 32TR,  kThreadReg, mem_use | IS_BINARY_OP   |           REG_USE1   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "32TR", "fs:[!0d],!1r" }, \
 { kX86 ## opname ## 32RR,  kRegReg,              IS_BINARY_OP   | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { 0,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "32RR", "!0r,!1r" }, \
 { kX86 ## opname ## 32RM,  kRegMem,    IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { 0,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "32RM", "!0r,[!1r+!2d]" }, \
-{ kX86 ## opname ## 64RM,  kRegMem,    IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,         0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RM", "!0r,[!1r+!2d]" }, \
 { kX86 ## opname ## 32RA,  kRegArray,  IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE012 | SETS_CCODES | uses_ccodes, { 0,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "32RA", "!0r,[!1r+!2r<<!3d+!4d]" }, \
-{ kX86 ## opname ## 64RA,  kRegArray,  IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE012 | SETS_CCODES | uses_ccodes, { REX_W,         0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RA", "!0r,[!1r+!2r<<!3d+!4d]" }, \
 { kX86 ## opname ## 32RT,  kRegThread, IS_LOAD | IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "32RT", "!0r,fs:[!1d]" }, \
-{ kX86 ## opname ## 64RT,  kReg64Thread, IS_LOAD | IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RT", "!0r,fs:[!1d]" }, \
 { kX86 ## opname ## 32RI,  kRegImm,              IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i32, 0, 0, rm32_i32_modrm, ax32_i32, 4 }, #opname "32RI", "!0r,!1d" }, \
-{ kX86 ## opname ## 64RI,  kReg64Imm,            IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,         0, rm32_i32, 0, 0, rm32_i32_modrm, ax32_i32, 4 }, #opname "32RI", "!0r,!1d" }, \
 { kX86 ## opname ## 32MI,  kMemImm,    mem_use | IS_TERTIARY_OP |           REG_USE0   | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "32MI", "[!0r+!1d],!2d" }, \
 { kX86 ## opname ## 32AI,  kArrayImm,  mem_use | IS_QUIN_OP     |           REG_USE01  | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "32AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
 { kX86 ## opname ## 32TI,  kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "32TI", "fs:[!0d],!1d" }, \
 { kX86 ## opname ## 32RI8, kRegImm,              IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32RI8", "!0r,!1d" }, \
-{ kX86 ## opname ## 64RI8, kReg64Imm,            IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,         0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64RI8", "!0r,!1d" }, \
 { kX86 ## opname ## 32MI8, kMemImm,    mem_use | IS_TERTIARY_OP |           REG_USE0   | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32MI8", "[!0r+!1d],!2d" }, \
 { kX86 ## opname ## 32AI8, kArrayImm,  mem_use | IS_QUIN_OP     |           REG_USE01  | SETS_CCODES | uses_ccodes, { 0,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32AI8", "[!0r+!1r<<!2d+!3d],!4d" }, \
-{ kX86 ## opname ## 32TI8, kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32TI8", "fs:[!0d],!1d" }
+{ kX86 ## opname ## 32TI8, kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, 0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "32TI8", "fs:[!0d],!1d" }, \
+  \
+{ kX86 ## opname ## 64MR,  kMemReg,    mem_use | IS_TERTIARY_OP |           REG_USE02  | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64MR", "[!0r+!1d],!2r" }, \
+{ kX86 ## opname ## 64AR,  kArrayReg,  mem_use | IS_QUIN_OP     |           REG_USE014 | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64AR", "[!0r+!1r<<!2d+!3d],!4r" }, \
+{ kX86 ## opname ## 64TR,  kThreadReg, mem_use | IS_BINARY_OP   |           REG_USE1   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, rm32_r32, 0, 0, 0,              0,        0 }, #opname "64TR", "fs:[!0d],!1r" }, \
+{ kX86 ## opname ## 64RR,  kRegReg,              IS_BINARY_OP   | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RR", "!0r,!1r" }, \
+{ kX86 ## opname ## 64RM,  kRegMem,    IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RM", "!0r,[!1r+!2d]" }, \
+{ kX86 ## opname ## 64RA,  kRegArray,  IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE012 | SETS_CCODES | uses_ccodes, { REX_W,             0, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RA", "!0r,[!1r+!2r<<!3d+!4d]" }, \
+{ kX86 ## opname ## 64RT,  kRegThread, IS_LOAD | IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, r32_rm32, 0, 0, 0,              0,        0 }, #opname "64RT", "!0r,fs:[!1d]" }, \
+{ kX86 ## opname ## 64RI,  kRegImm,              IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i32, 0, 0, rm32_i32_modrm, ax32_i32, 4 }, #opname "64RI", "!0r,!1d" }, \
+{ kX86 ## opname ## 64MI,  kMemImm,    mem_use | IS_TERTIARY_OP |           REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "64MI", "[!0r+!1d],!2d" }, \
+{ kX86 ## opname ## 64AI,  kArrayImm,  mem_use | IS_QUIN_OP     |           REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "64AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
+{ kX86 ## opname ## 64TI,  kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, rm32_i32, 0, 0, rm32_i32_modrm, 0,        4 }, #opname "64TI", "fs:[!0d],!1d" }, \
+{ kX86 ## opname ## 64RI8, kRegImm,              IS_BINARY_OP   | reg_def | REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64RI8", "!0r,!1d" }, \
+{ kX86 ## opname ## 64MI8, kMemImm,    mem_use | IS_TERTIARY_OP |           REG_USE0   | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64MI8", "[!0r+!1d],!2d" }, \
+{ kX86 ## opname ## 64AI8, kArrayImm,  mem_use | IS_QUIN_OP     |           REG_USE01  | SETS_CCODES | uses_ccodes, { REX_W,             0, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64AI8", "[!0r+!1r<<!2d+!3d],!4d" }, \
+{ kX86 ## opname ## 64TI8, kThreadImm, mem_use | IS_BINARY_OP   |                        SETS_CCODES | uses_ccodes, { THREAD_PREFIX, REX_W, rm32_i8,  0, 0, rm32_i8_modrm,  0,        1 }, #opname "64TI8", "fs:[!0d],!1d" }
 
 ENCODING_MAP(Add, IS_LOAD | IS_STORE, REG_DEF0, 0,
   0x00 /* RegMem8/Reg8 */,     0x01 /* RegMem32/Reg32 */,
@@ -146,6 +155,13 @@
   { kX86Imul32RMI8,  kRegMemImm,   IS_LOAD | IS_QUAD_OP     | REG_DEF0_USE1  | SETS_CCODES, { 0, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul32RMI8", "!0r,[!1r+!2d],!3d" },
   { kX86Imul32RAI8,  kRegArrayImm, IS_LOAD | IS_SEXTUPLE_OP | REG_DEF0_USE12 | SETS_CCODES, { 0, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul32RAI8", "!0r,[!1r+!2r<<!3d+!4d],!5d" },
 
+  { kX86Imul64RRI,   kRegRegImm,             IS_TERTIARY_OP | REG_DEF0_USE1  | SETS_CCODES, { REX_W, 0, 0x69, 0, 0, 0, 0, 8 }, "Imul64RRI", "!0r,!1r,!2d" },
+  { kX86Imul64RMI,   kRegMemImm,   IS_LOAD | IS_QUAD_OP     | REG_DEF0_USE1  | SETS_CCODES, { REX_W, 0, 0x69, 0, 0, 0, 0, 8 }, "Imul64RMI", "!0r,[!1r+!2d],!3d" },
+  { kX86Imul64RAI,   kRegArrayImm, IS_LOAD | IS_SEXTUPLE_OP | REG_DEF0_USE12 | SETS_CCODES, { REX_W, 0, 0x69, 0, 0, 0, 0, 8 }, "Imul64RAI", "!0r,[!1r+!2r<<!3d+!4d],!5d" },
+  { kX86Imul64RRI8,  kRegRegImm,             IS_TERTIARY_OP | REG_DEF0_USE1  | SETS_CCODES, { REX_W, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul64RRI8", "!0r,!1r,!2d" },
+  { kX86Imul64RMI8,  kRegMemImm,   IS_LOAD | IS_QUAD_OP     | REG_DEF0_USE1  | SETS_CCODES, { REX_W, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul64RMI8", "!0r,[!1r+!2d],!3d" },
+  { kX86Imul64RAI8,  kRegArrayImm, IS_LOAD | IS_SEXTUPLE_OP | REG_DEF0_USE12 | SETS_CCODES, { REX_W, 0, 0x6B, 0, 0, 0, 0, 1 }, "Imul64RAI8", "!0r,[!1r+!2r<<!3d+!4d],!5d" },
+
   { kX86Mov8MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,      { 0,             0, 0x88, 0, 0, 0, 0, 0 }, "Mov8MR", "[!0r+!1d],!2r" },
   { kX86Mov8AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,     { 0,             0, 0x88, 0, 0, 0, 0, 0 }, "Mov8AR", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86Mov8TR, kThreadReg, IS_STORE | IS_BINARY_OP   | REG_USE1,       { THREAD_PREFIX, 0, 0x88, 0, 0, 0, 0, 0 }, "Mov8TR", "fs:[!0d],!1r" },
@@ -171,30 +187,42 @@
   { kX86Mov16TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, 0x66, 0xC7, 0, 0, 0, 0, 2 }, "Mov16TI", "fs:[!0d],!1d" },
 
   { kX86Mov32MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,      { 0,             0, 0x89, 0, 0, 0, 0, 0 }, "Mov32MR", "[!0r+!1d],!2r" },
-  { kX86Mov64MR, kMemReg64,  IS_STORE | IS_TERTIARY_OP | REG_USE02,      { REX_W,         0, 0x89, 0, 0, 0, 0, 0 }, "Mov64MR", "[!0r+!1d],!2r" },
   { kX86Mov32AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,     { 0,             0, 0x89, 0, 0, 0, 0, 0 }, "Mov32AR", "[!0r+!1r<<!2d+!3d],!4r" },
-  { kX86Mov64AR, kArrayReg64, IS_STORE | IS_QUIN_OP     | REG_USE014,     { REX_W,        0, 0x89, 0, 0, 0, 0, 0 }, "Mov64AR", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86Mov32TR, kThreadReg, IS_STORE | IS_BINARY_OP   | REG_USE1,       { THREAD_PREFIX, 0, 0x89, 0, 0, 0, 0, 0 }, "Mov32TR", "fs:[!0d],!1r" },
   { kX86Mov32RR, kRegReg,               IS_BINARY_OP   | REG_DEF0_USE1,  { 0,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov32RR", "!0r,!1r" },
   { kX86Mov32RM, kRegMem,    IS_LOAD  | IS_TERTIARY_OP | REG_DEF0_USE1,  { 0,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov32RM", "!0r,[!1r+!2d]" },
-  { kX86Mov64RM, kRegMem,    IS_LOAD  | IS_TERTIARY_OP | REG_DEF0_USE1,  { REX_W,         0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RM", "!0r,[!1r+!2d]" },
   { kX86Mov32RA, kRegArray,  IS_LOAD  | IS_QUIN_OP     | REG_DEF0_USE12, { 0,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov32RA", "!0r,[!1r+!2r<<!3d+!4d]" },
-  { kX86Mov64RA, kRegArray,  IS_LOAD  | IS_QUIN_OP     | REG_DEF0_USE12, { REX_W,         0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RA", "!0r,[!1r+!2r<<!3d+!4d]" },
   { kX86Mov32RT, kRegThread, IS_LOAD  | IS_BINARY_OP   | REG_DEF0,       { THREAD_PREFIX, 0, 0x8B, 0, 0, 0, 0, 0 }, "Mov32RT", "!0r,fs:[!1d]" },
-  { kX86Mov64RT, kRegThread, IS_LOAD  | IS_BINARY_OP   | REG_DEF0,       { THREAD_PREFIX, REX_W, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RT", "!0r,fs:[!1d]" },
   { kX86Mov32RI, kMovRegImm,            IS_BINARY_OP   | REG_DEF0,       { 0,             0, 0xB8, 0, 0, 0, 0, 4 }, "Mov32RI", "!0r,!1d" },
   { kX86Mov32MI, kMemImm,    IS_STORE | IS_TERTIARY_OP | REG_USE0,       { 0,             0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32MI", "[!0r+!1d],!2d" },
   { kX86Mov32AI, kArrayImm,  IS_STORE | IS_QUIN_OP     | REG_USE01,      { 0,             0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32AI", "[!0r+!1r<<!2d+!3d],!4d" },
   { kX86Mov32TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, 0, 0xC7, 0, 0, 0, 0, 4 }, "Mov32TI", "fs:[!0d],!1d" },
-  { kX86Mov64TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, REX_W, 0xC7, 0, 0, 0, 0, 4 }, "Mov64TI", "fs:[!0d],!1d" },
 
-  { kX86Lea32RM, kRegMem, IS_TERTIARY_OP | IS_LOAD | REG_DEF0_USE1,      { 0, 0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RM", "!0r,[!1r+!2d]" },
+  { kX86Lea32RM, kRegMem, IS_TERTIARY_OP | IS_LOAD | REG_DEF0_USE1,      { 0,             0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RM", "!0r,[!1r+!2d]" },
 
-  { kX86Lea32RA, kRegArray, IS_QUIN_OP | REG_DEF0_USE12, { 0, 0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RA", "!0r,[!1r+!2r<<!3d+!4d]" },
+  { kX86Lea32RA, kRegArray, IS_QUIN_OP | REG_DEF0_USE12,                 { 0,             0, 0x8D, 0, 0, 0, 0, 0 }, "Lea32RA", "!0r,[!1r+!2r<<!3d+!4d]" },
 
-  { kX86Cmov32RRC, kRegRegCond, IS_TERTIARY_OP | REG_DEF0_USE01 | USES_CCODES, {0, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RR", "!2c !0r,!1r" },
+  { kX86Mov64MR, kMemReg,    IS_STORE | IS_TERTIARY_OP | REG_USE02,      { REX_W,             0, 0x89, 0, 0, 0, 0, 0 }, "Mov64MR", "[!0r+!1d],!2r" },
+  { kX86Mov64AR, kArrayReg,  IS_STORE | IS_QUIN_OP     | REG_USE014,     { REX_W,             0, 0x89, 0, 0, 0, 0, 0 }, "Mov64AR", "[!0r+!1r<<!2d+!3d],!4r" },
+  { kX86Mov64TR, kThreadReg, IS_STORE | IS_BINARY_OP   | REG_USE1,       { THREAD_PREFIX, REX_W, 0x89, 0, 0, 0, 0, 0 }, "Mov64TR", "fs:[!0d],!1r" },
+  { kX86Mov64RR, kRegReg,               IS_BINARY_OP   | REG_DEF0_USE1,  { REX_W,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RR", "!0r,!1r" },
+  { kX86Mov64RM, kRegMem,    IS_LOAD  | IS_TERTIARY_OP | REG_DEF0_USE1,  { REX_W,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RM", "!0r,[!1r+!2d]" },
+  { kX86Mov64RA, kRegArray,  IS_LOAD  | IS_QUIN_OP     | REG_DEF0_USE12, { REX_W,             0, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RA", "!0r,[!1r+!2r<<!3d+!4d]" },
+  { kX86Mov64RT, kRegThread, IS_LOAD  | IS_BINARY_OP   | REG_DEF0,       { THREAD_PREFIX, REX_W, 0x8B, 0, 0, 0, 0, 0 }, "Mov64RT", "!0r,fs:[!1d]" },
+  { kX86Mov64RI, kMovRegImm,            IS_BINARY_OP   | REG_DEF0,       { REX_W,             0, 0xB8, 0, 0, 0, 0, 8 }, "Mov64RI", "!0r,!1d" },
+  { kX86Mov64MI, kMemImm,    IS_STORE | IS_TERTIARY_OP | REG_USE0,       { REX_W,             0, 0xC7, 0, 0, 0, 0, 8 }, "Mov64MI", "[!0r+!1d],!2d" },
+  { kX86Mov64AI, kArrayImm,  IS_STORE | IS_QUIN_OP     | REG_USE01,      { REX_W,             0, 0xC7, 0, 0, 0, 0, 8 }, "Mov64AI", "[!0r+!1r<<!2d+!3d],!4d" },
+  { kX86Mov64TI, kThreadImm, IS_STORE | IS_BINARY_OP,                    { THREAD_PREFIX, REX_W, 0xC7, 0, 0, 0, 0, 8 }, "Mov64TI", "fs:[!0d],!1d" },
 
-  { kX86Cmov32RMC, kRegMemCond, IS_QUAD_OP | IS_LOAD | REG_DEF0_USE01 | USES_CCODES, {0, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RM", "!3c !0r,[!1r+!2d]" },
+  { kX86Lea64RM, kRegMem, IS_TERTIARY_OP | IS_LOAD | REG_DEF0_USE1,      { REX_W,             0, 0x8D, 0, 0, 0, 0, 0 }, "Lea64RM", "!0r,[!1r+!2d]" },
+
+  { kX86Lea64RA, kRegArray, IS_QUIN_OP | REG_DEF0_USE12,                 { REX_W,             0, 0x8D, 0, 0, 0, 0, 0 }, "Lea64RA", "!0r,[!1r+!2r<<!3d+!4d]" },
+
+  { kX86Cmov32RRC, kRegRegCond, IS_TERTIARY_OP | REG_DEF0_USE01 | USES_CCODES, {0,     0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RR", "!2c !0r,!1r" },
+  { kX86Cmov64RRC, kRegRegCond, IS_TERTIARY_OP | REG_DEF0_USE01 | USES_CCODES, {REX_W, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc64RR", "!2c !0r,!1r" },
+
+  { kX86Cmov32RMC, kRegMemCond, IS_QUAD_OP | IS_LOAD | REG_DEF0_USE01 | USES_CCODES, {0,     0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc32RM", "!3c !0r,[!1r+!2d]" },
+  { kX86Cmov64RMC, kRegMemCond, IS_QUAD_OP | IS_LOAD | REG_DEF0_USE01 | USES_CCODES, {REX_W, 0, 0x0F, 0x40, 0, 0, 0, 0}, "Cmovcc64RM", "!3c !0r,[!1r+!2d]" },
 
 #define SHIFT_ENCODING_MAP(opname, modrm_opcode) \
 { kX86 ## opname ## 8RI, kShiftRegImm,                        IS_BINARY_OP   | REG_DEF0_USE0 |            SETS_CCODES, { 0,    0, 0xC0, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "8RI", "!0r,!1d" }, \
@@ -216,7 +244,14 @@
 { kX86 ## opname ## 32AI, kShiftArrayImm, IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     |            SETS_CCODES, { 0,    0, 0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "32AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
 { kX86 ## opname ## 32RC, kShiftRegCl,                         IS_BINARY_OP   | REG_DEF0_USE0 | REG_USEC | SETS_CCODES, { 0,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32RC", "!0r,cl" }, \
 { kX86 ## opname ## 32MC, kShiftMemCl,    IS_LOAD | IS_STORE | IS_TERTIARY_OP | REG_USE0      | REG_USEC | SETS_CCODES, { 0,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32MC", "[!0r+!1d],cl" }, \
-{ kX86 ## opname ## 32AC, kShiftArrayCl,  IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     | REG_USEC | SETS_CCODES, { 0,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32AC", "[!0r+!1r<<!2d+!3d],cl" }
+{ kX86 ## opname ## 32AC, kShiftArrayCl,  IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     | REG_USEC | SETS_CCODES, { 0,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "32AC", "[!0r+!1r<<!2d+!3d],cl" }, \
+  \
+{ kX86 ## opname ## 64RI, kShiftRegImm,                        IS_BINARY_OP   | REG_DEF0_USE0 |            SETS_CCODES, { REX_W,    0, 0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "64RI", "!0r,!1d" }, \
+{ kX86 ## opname ## 64MI, kShiftMemImm,   IS_LOAD | IS_STORE | IS_TERTIARY_OP | REG_USE0      |            SETS_CCODES, { REX_W,    0, 0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "64MI", "[!0r+!1d],!2d" }, \
+{ kX86 ## opname ## 64AI, kShiftArrayImm, IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     |            SETS_CCODES, { REX_W,    0, 0xC1, 0, 0, modrm_opcode, 0xD1, 1 }, #opname "64AI", "[!0r+!1r<<!2d+!3d],!4d" }, \
+{ kX86 ## opname ## 64RC, kShiftRegCl,                         IS_BINARY_OP   | REG_DEF0_USE0 | REG_USEC | SETS_CCODES, { REX_W,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "64RC", "!0r,cl" }, \
+{ kX86 ## opname ## 64MC, kShiftMemCl,    IS_LOAD | IS_STORE | IS_TERTIARY_OP | REG_USE0      | REG_USEC | SETS_CCODES, { REX_W,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "64MC", "[!0r+!1d],cl" }, \
+{ kX86 ## opname ## 64AC, kShiftArrayCl,  IS_LOAD | IS_STORE | IS_QUIN_OP     | REG_USE01     | REG_USEC | SETS_CCODES, { REX_W,    0, 0xD3, 0, 0, modrm_opcode, 0,    0 }, #opname "64AC", "[!0r+!1r<<!2d+!3d],cl" }
 
   SHIFT_ENCODING_MAP(Rol, 0x0),
   SHIFT_ENCODING_MAP(Ror, 0x1),
@@ -232,6 +267,10 @@
   { kX86Shld32MRI,  kMemRegImm,    IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { 0,    0, 0x0F, 0xA4, 0, 0, 0, 1}, "Shld32MRI", "[!0r+!1d],!2r,!3d" },
   { kX86Shrd32RRI,  kRegRegImmRev, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES, { 0,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd32RRI", "!0r,!1r,!2d" },
   { kX86Shrd32MRI,  kMemRegImm,    IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { 0,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd32MRI", "[!0r+!1d],!2r,!3d" },
+  { kX86Shld64RRI,  kRegRegImmRev, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES, { REX_W,    0, 0x0F, 0xA4, 0, 0, 0, 1}, "Shld64RRI", "!0r,!1r,!2d" },
+  { kX86Shld64MRI,  kMemRegImm,    IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { REX_W,    0, 0x0F, 0xA4, 0, 0, 0, 1}, "Shld64MRI", "[!0r+!1d],!2r,!3d" },
+  { kX86Shrd64RRI,  kRegRegImmRev, IS_TERTIARY_OP | REG_DEF0_USE01  | SETS_CCODES, { REX_W,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd64RRI", "!0r,!1r,!2d" },
+  { kX86Shrd64MRI,  kMemRegImm,    IS_QUAD_OP | REG_USE02 | IS_LOAD | IS_STORE | SETS_CCODES, { REX_W,    0, 0x0F, 0xAC, 0, 0, 0, 1}, "Shrd64MRI", "[!0r+!1d],!2r,!3d" },
 
   { kX86Test8RI,  kRegImm,             IS_BINARY_OP   | REG_USE0  | SETS_CCODES, { 0,    0, 0xF6, 0, 0, 0, 0, 1}, "Test8RI", "!0r,!1d" },
   { kX86Test8MI,  kMemImm,   IS_LOAD | IS_TERTIARY_OP | REG_USE0  | SETS_CCODES, { 0,    0, 0xF6, 0, 0, 0, 0, 1}, "Test8MI", "[!0r+!1d],!2d" },
@@ -242,7 +281,12 @@
   { kX86Test32RI, kRegImm,             IS_BINARY_OP   | REG_USE0  | SETS_CCODES, { 0,    0, 0xF7, 0, 0, 0, 0, 4}, "Test32RI", "!0r,!1d" },
   { kX86Test32MI, kMemImm,   IS_LOAD | IS_TERTIARY_OP | REG_USE0  | SETS_CCODES, { 0,    0, 0xF7, 0, 0, 0, 0, 4}, "Test32MI", "[!0r+!1d],!2d" },
   { kX86Test32AI, kArrayImm, IS_LOAD | IS_QUIN_OP     | REG_USE01 | SETS_CCODES, { 0,    0, 0xF7, 0, 0, 0, 0, 4}, "Test32AI", "[!0r+!1r<<!2d+!3d],!4d" },
+  { kX86Test64RI, kRegImm,             IS_BINARY_OP   | REG_USE0  | SETS_CCODES, { REX_W, 0, 0xF7, 0, 0, 0, 0, 8}, "Test64RI", "!0r,!1d" },
+  { kX86Test64MI, kMemImm,   IS_LOAD | IS_TERTIARY_OP | REG_USE0  | SETS_CCODES, { REX_W, 0, 0xF7, 0, 0, 0, 0, 8}, "Test64MI", "[!0r+!1d],!2d" },
+  { kX86Test64AI, kArrayImm, IS_LOAD | IS_QUIN_OP     | REG_USE01 | SETS_CCODES, { REX_W, 0, 0xF7, 0, 0, 0, 0, 8}, "Test64AI", "[!0r+!1r<<!2d+!3d],!4d" },
+
   { kX86Test32RR, kRegReg,             IS_BINARY_OP   | REG_USE01 | SETS_CCODES, { 0,    0, 0x85, 0, 0, 0, 0, 0}, "Test32RR", "!0r,!1r" },
+  { kX86Test64RR, kRegReg,             IS_BINARY_OP   | REG_USE01 | SETS_CCODES, { REX_W, 0, 0x85, 0, 0, 0, 0, 0}, "Test64RR", "!0r,!1r" },
 
 #define UNARY_ENCODING_MAP(opname, modrm, is_store, sets_ccodes, \
                            reg, reg_kind, reg_flags, \
@@ -258,7 +302,10 @@
 { kX86 ## opname ## 16 ## arr, arr_kind, IS_LOAD | is_store | arr_flags | hw_flags | sets_ccodes, { 0x66, 0, 0xF7, 0, 0, modrm, 0, imm << 1}, #opname "16" #arr, hw_format "[!0r+!1r<<!2d+!3d]" }, \
 { kX86 ## opname ## 32 ## reg, reg_kind,                      reg_flags | w_flags  | sets_ccodes, { 0,    0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "32" #reg, w_format "!0r" }, \
 { kX86 ## opname ## 32 ## mem, mem_kind, IS_LOAD | is_store | mem_flags | w_flags  | sets_ccodes, { 0,    0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "32" #mem, w_format "[!0r+!1d]" }, \
-{ kX86 ## opname ## 32 ## arr, arr_kind, IS_LOAD | is_store | arr_flags | w_flags  | sets_ccodes, { 0,    0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "32" #arr, w_format "[!0r+!1r<<!2d+!3d]" }
+{ kX86 ## opname ## 32 ## arr, arr_kind, IS_LOAD | is_store | arr_flags | w_flags  | sets_ccodes, { 0,    0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "32" #arr, w_format "[!0r+!1r<<!2d+!3d]" }, \
+{ kX86 ## opname ## 64 ## reg, reg_kind,                      reg_flags | w_flags  | sets_ccodes, { REX_W, 0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "64" #reg, w_format "!0r" }, \
+{ kX86 ## opname ## 64 ## mem, mem_kind, IS_LOAD | is_store | mem_flags | w_flags  | sets_ccodes, { REX_W, 0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "64" #mem, w_format "[!0r+!1d]" }, \
+{ kX86 ## opname ## 64 ## arr, arr_kind, IS_LOAD | is_store | arr_flags | w_flags  | sets_ccodes, { REX_W, 0, 0xF7, 0, 0, modrm, 0, imm << 2}, #opname "64" #arr, w_format "[!0r+!1r<<!2d+!3d]" }
 
   UNARY_ENCODING_MAP(Not, 0x2, IS_STORE, 0,           R, kReg, IS_UNARY_OP | REG_DEF0_USE0, M, kMem, IS_BINARY_OP | REG_USE0, A, kArray, IS_QUAD_OP | REG_USE01, 0, 0, 0, 0, "", "", ""),
   UNARY_ENCODING_MAP(Neg, 0x3, IS_STORE, SETS_CCODES, R, kReg, IS_UNARY_OP | REG_DEF0_USE0, M, kMem, IS_BINARY_OP | REG_USE0, A, kArray, IS_QUAD_OP | REG_USE01, 0, 0, 0, 0, "", "", ""),
@@ -279,6 +326,11 @@
 { kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, 0, 0, 0, 0 }, #opname "RM", "!0r,[!1r+!2d]" }, \
 { kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, 0, 0, 0, 0 }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
 
+#define EXT_0F_ENCODING2_MAP(opname, prefix, opcode, opcode2, reg_def) \
+{ kX86 ## opname ## RR, kRegReg,             IS_BINARY_OP   | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RR", "!0r,!1r" }, \
+{ kX86 ## opname ## RM, kRegMem,   IS_LOAD | IS_TERTIARY_OP | reg_def | REG_USE1,  { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RM", "!0r,[!1r+!2d]" }, \
+{ kX86 ## opname ## RA, kRegArray, IS_LOAD | IS_QUIN_OP     | reg_def | REG_USE12, { prefix, 0, 0x0F, opcode, opcode2, 0, 0, 0 }, #opname "RA", "!0r,[!1r+!2r<<!3d+!4d]" }
+
   EXT_0F_ENCODING_MAP(Movsd, 0xF2, 0x10, REG_DEF0),
   { kX86MovsdMR, kMemReg,   IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0xF2, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovsdMR", "[!0r+!1d],!2r" },
   { kX86MovsdAR, kArrayReg, IS_STORE | IS_QUIN_OP     | REG_USE014, { 0xF2, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovsdAR", "[!0r+!1r<<!2d+!3d],!4r" },
@@ -310,10 +362,42 @@
   EXT_0F_ENCODING_MAP(Divsd,     0xF2, 0x5E, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Divss,     0xF3, 0x5E, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Sqrtsd,    0xF2, 0x51, REG_DEF0_USE0),
+  EXT_0F_ENCODING2_MAP(Pmulld,   0x66, 0x38, 0x40, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Pmullw,    0x66, 0xD5, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Mulps,     0x00, 0x59, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Mulpd,     0x66, 0x59, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Paddb,     0x66, 0xFC, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Paddw,     0x66, 0xFD, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Paddd,     0x66, 0xFE, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Addps,     0x00, 0x58, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Addpd,     0x66, 0x58, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Psubb,     0x66, 0xF8, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Psubw,     0x66, 0xF9, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Psubd,     0x66, 0xFA, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Subps,     0x00, 0x5C, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Subpd,     0x66, 0x5C, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Pand,      0x66, 0xDB, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Por,       0x66, 0xEB, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Pxor,      0x66, 0xEF, REG_DEF0_USE0),
+  EXT_0F_ENCODING2_MAP(Phaddw,   0x66, 0x38, 0x01, REG_DEF0_USE0),
+  EXT_0F_ENCODING2_MAP(Phaddd,   0x66, 0x38, 0x02, REG_DEF0_USE0),
 
+  { kX86PextrbRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x14, 0, 0, 1 }, "PextrbRRI", "!0r,!1r,!2d" },
+  { kX86PextrwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0xC5, 0x00, 0, 0, 1 }, "PextrwRRI", "!0r,!1r,!2d" },
+  { kX86PextrdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x3A, 0x16, 0, 0, 1 }, "PextrdRRI", "!0r,!1r,!2d" },
+
+  { kX86PshuflwRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0xF2, 0, 0x0F, 0x70, 0, 0, 0, 1 }, "PshuflwRRI", "!0r,!1r,!2d" },
+  { kX86PshufdRRI, kRegRegImm, IS_TERTIARY_OP | REG_DEF0 | REG_USE1, { 0x66, 0, 0x0F, 0x70, 0, 0, 0, 1 }, "PshufdRRI", "!0r,!1r,!2d" },
+
+  { kX86PsrawRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 4, 0, 1 }, "PsrawRI", "!0r,!1d" },
+  { kX86PsradRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 4, 0, 1 }, "PsradRI", "!0r,!1d" },
+  { kX86PsrlwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 2, 0, 1 }, "PsrlwRI", "!0r,!1d" },
+  { kX86PsrldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 2, 0, 1 }, "PsrldRI", "!0r,!1d" },
   { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
+  { kX86PsllwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 6, 0, 1 }, "PsllwRI", "!0r,!1d" },
+  { kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1 }, "PslldRI", "!0r,!1d" },
   { kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
-  { kX86SqrtsdRR, kRegReg, IS_BINARY_OP | REG_DEF0_USE1, { 0xF2, 0, 0x0F, 0x51, 0, 0, 0, 0 }, "SqrtsdRR", "!0r,!1r" },
 
   { kX86Fild32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDB, 0x00, 0, 0, 0, 0 }, "Fild32M", "[!0r,!1d]" },
   { kX86Fild64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0 | USE_FP_STACK, { 0x0, 0, 0xDF, 0x00, 0, 5, 0, 0 }, "Fild64M", "[!0r,!1d]" },
@@ -441,19 +525,16 @@
       return 3;  // 1 byte of opcode + 2 prefixes
     case kRegOpcode:  // lir operands - 0: reg
       return ComputeSize(entry, 0, 0, false) - 1;  // subtract 1 for modrm
-    case kReg64:
     case kReg:  // lir operands - 0: reg
       return ComputeSize(entry, 0, 0, false);
     case kMem:  // lir operands - 0: base, 1: disp
       return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
     case kArray:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
       return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
-    case kMemReg64:
     case kMemReg:  // lir operands - 0: base, 1: disp, 2: reg
       return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
     case kMemRegImm:  // lir operands - 0: base, 1: disp, 2: reg 3: immediate
       return ComputeSize(entry, lir->operands[0], lir->operands[1], false);
-    case kArrayReg64:
     case kArrayReg:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
       return ComputeSize(entry, lir->operands[0], lir->operands[3], true);
     case kThreadReg:  // lir operands - 0: disp, 1: reg
@@ -466,10 +547,8 @@
       return ComputeSize(entry, lir->operands[1], lir->operands[2], false);
     case kRegArray:   // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp
       return ComputeSize(entry, lir->operands[1], lir->operands[4], true);
-    case kReg64Thread:  // lir operands - 0: reg, 1: disp
     case kRegThread:  // lir operands - 0: reg, 1: disp
       return ComputeSize(entry, 0, 0x12345678, false);  // displacement size is always 32bit
-    case kReg64Imm:
     case kRegImm: {  // lir operands - 0: reg, 1: immediate
       size_t size = ComputeSize(entry, 0, 0, false);
       if (entry->skeleton.ax_opcode == 0) {
@@ -494,7 +573,7 @@
     case kRegArrayImm:  // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp, 5: imm
       return ComputeSize(entry, lir->operands[1], lir->operands[4], true);
     case kMovRegImm:  // lir operands - 0: reg, 1: immediate
-      return 1 + entry->skeleton.immediate_bytes;
+      return (entry->skeleton.prefix1 != 0 ? 1 : 0) + 1 + entry->skeleton.immediate_bytes;  // TODO(64): reg1
     case kShiftRegImm:  // lir operands - 0: reg, 1: immediate
       // Shift by immediate one has a shorter opcode.
       return ComputeSize(entry, 0, 0, false) - (lir->operands[1] == 1 ? 1 : 0);
@@ -675,7 +754,7 @@
   EmitDisp(base, disp);
 }
 
-void X86Mir2Lir::EmitImm(const X86EncodingMap* entry, int imm) {
+void X86Mir2Lir::EmitImm(const X86EncodingMap* entry, int64_t imm) {
   switch (entry->skeleton.immediate_bytes) {
     case 1:
       DCHECK(IS_SIMM8(imm));
@@ -692,6 +771,16 @@
       code_buffer_.push_back((imm >> 16) & 0xFF);
       code_buffer_.push_back((imm >> 24) & 0xFF);
       break;
+    case 8:
+      code_buffer_.push_back(imm & 0xFF);
+      code_buffer_.push_back((imm >> 8) & 0xFF);
+      code_buffer_.push_back((imm >> 16) & 0xFF);
+      code_buffer_.push_back((imm >> 24) & 0xFF);
+      code_buffer_.push_back((imm >> 32) & 0xFF);
+      code_buffer_.push_back((imm >> 40) & 0xFF);
+      code_buffer_.push_back((imm >> 48) & 0xFF);
+      code_buffer_.push_back((imm >> 56) & 0xFF);
+      break;
     default:
       LOG(FATAL) << "Unexpected immediate bytes (" << entry->skeleton.immediate_bytes
                  << ") for instruction: " << entry->name;
@@ -881,13 +970,29 @@
   DCHECK_EQ(entry->skeleton.ax_opcode, 0);
 }
 
-void X86Mir2Lir::EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int imm) {
+void X86Mir2Lir::EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int64_t imm) {
   DCHECK_LT(RegStorage::RegNum(reg), 8);
   code_buffer_.push_back(0xB8 + RegStorage::RegNum(reg));
-  code_buffer_.push_back(imm & 0xFF);
-  code_buffer_.push_back((imm >> 8) & 0xFF);
-  code_buffer_.push_back((imm >> 16) & 0xFF);
-  code_buffer_.push_back((imm >> 24) & 0xFF);
+  switch (entry->skeleton.immediate_bytes) {
+    case 4:
+      code_buffer_.push_back(imm & 0xFF);
+      code_buffer_.push_back((imm >> 8) & 0xFF);
+      code_buffer_.push_back((imm >> 16) & 0xFF);
+      code_buffer_.push_back((imm >> 24) & 0xFF);
+      break;
+    case 8:
+      code_buffer_.push_back(imm & 0xFF);
+      code_buffer_.push_back((imm >> 8) & 0xFF);
+      code_buffer_.push_back((imm >> 16) & 0xFF);
+      code_buffer_.push_back((imm >> 24) & 0xFF);
+      code_buffer_.push_back((imm >> 32) & 0xFF);
+      code_buffer_.push_back((imm >> 40) & 0xFF);
+      code_buffer_.push_back((imm >> 48) & 0xFF);
+      code_buffer_.push_back((imm >> 56) & 0xFF);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported immediate size for EmitMovRegImm: " << static_cast<uint32_t>(entry->skeleton.immediate_bytes);
+  }
 }
 
 void X86Mir2Lir::EmitShiftRegImm(const X86EncodingMap* entry, uint8_t reg, int imm) {
@@ -1343,7 +1448,6 @@
       case kRegOpcode:  // lir operands - 0: reg
         EmitOpRegOpcode(entry, lir->operands[0]);
         break;
-      case kReg64:
       case kReg:  // lir operands - 0: reg
         EmitOpReg(entry, lir->operands[0]);
         break;
@@ -1353,7 +1457,6 @@
       case kArray:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
         EmitOpArray(entry, lir->operands[0], lir->operands[1], lir->operands[2], lir->operands[3]);
         break;
-      case kMemReg64:
       case kMemReg:  // lir operands - 0: base, 1: disp, 2: reg
         EmitMemReg(entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
@@ -1364,7 +1467,6 @@
         EmitArrayImm(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                      lir->operands[3], lir->operands[4]);
         break;
-      case kArrayReg64:
       case kArrayReg:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
         EmitArrayReg(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                      lir->operands[3], lir->operands[4]);
@@ -1376,7 +1478,6 @@
         EmitRegArray(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                      lir->operands[3], lir->operands[4]);
         break;
-      case kReg64Thread:  // lir operands - 0: reg, 1: disp
       case kRegThread:  // lir operands - 0: reg, 1: disp
         EmitRegThread(entry, lir->operands[0], lir->operands[1]);
         break;
@@ -1400,7 +1501,6 @@
         EmitRegMemImm(entry, lir->operands[0], lir->operands[1], lir->operands[2],
                       lir->operands[3]);
         break;
-      case kReg64Imm:
       case kRegImm:  // lir operands - 0: reg, 1: immediate
         EmitRegImm(entry, lir->operands[0], lir->operands[1]);
         break;
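As a concrete reading of the Mov64RI entry above (REX_W prefix, opcode 0xB8 plus the register number, 8 immediate bytes emitted least-significant byte first by EmitMovRegImm), loading a 64-bit constant into rax should assemble to the bytes below; an illustration only, not output produced by this patch:

    // mov rax, 0x1122334455667788
    //   48                         REX.W prefix: 64-bit operand size
    //   b8                         opcode 0xB8 + register number (rax = 0)
    //   88 77 66 55 44 33 22 11    immediate, least-significant byte first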
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 3070edd..fcc846f 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -118,6 +118,7 @@
     bool GenInlinedSqrt(CallInfo* info);
     bool GenInlinedPeek(CallInfo* info, OpSize size);
     bool GenInlinedPoke(CallInfo* info, OpSize size);
+    void GenNotLong(RegLocation rl_dest, RegLocation rl_src);
     void GenNegLong(RegLocation rl_dest, RegLocation rl_src);
     void GenOrLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                    RegLocation rl_src2);
@@ -125,6 +126,8 @@
                     RegLocation rl_src2);
     void GenXorLong(Instruction::Code opcode, RegLocation rl_dest, RegLocation rl_src1,
                     RegLocation rl_src2);
+    void GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+                       RegLocation rl_src2, bool is_div);
     // TODO: collapse reg_lo, reg_hi
     RegLocation GenDivRem(RegLocation rl_dest, RegStorage reg_lo, RegStorage reg_hi, bool is_div);
     RegLocation GenDivRemLit(RegLocation rl_dest, RegStorage reg_lo, int lit, bool is_div);
@@ -336,7 +339,7 @@
     void EmitModrmThread(uint8_t reg_or_opcode);
     void EmitModrmDisp(uint8_t reg_or_opcode, uint8_t base, int disp);
     void EmitModrmSibDisp(uint8_t reg_or_opcode, uint8_t base, uint8_t index, int scale, int disp);
-    void EmitImm(const X86EncodingMap* entry, int imm);
+    void EmitImm(const X86EncodingMap* entry, int64_t imm);
     void EmitOpRegOpcode(const X86EncodingMap* entry, uint8_t reg);
     void EmitOpReg(const X86EncodingMap* entry, uint8_t reg);
     void EmitOpMem(const X86EncodingMap* entry, uint8_t base, int disp);
@@ -359,7 +362,7 @@
     void EmitMemRegImm(const X86EncodingMap* entry, uint8_t base, int disp, uint8_t reg1, int32_t imm);
     void EmitRegImm(const X86EncodingMap* entry, uint8_t reg, int imm);
     void EmitThreadImm(const X86EncodingMap* entry, int disp, int imm);
-    void EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int imm);
+    void EmitMovRegImm(const X86EncodingMap* entry, uint8_t reg, int64_t imm);
     void EmitShiftRegImm(const X86EncodingMap* entry, uint8_t reg, int imm);
     void EmitShiftMemImm(const X86EncodingMap* entry, uint8_t base, int disp, int imm);
     void EmitShiftMemCl(const X86EncodingMap* entry, uint8_t base, int displacement, uint8_t cl);
@@ -426,6 +429,136 @@
     void GenConst128(BasicBlock* bb, MIR* mir);
 
     /*
+     * @brief Move one vectorized register to another.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirConstVector.
+     * @note vA: TypeSize
+     * @note vB: destination
+     * @note vC: source
+     */
+    void GenMoveVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed multiply of units in two vector registers: vB = vB .* vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedMultiply.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenMultiplyVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed addition of units in two vector registers: vB = vB .+ vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedAddition.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenAddVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed subtraction of units in two vector registers: vB = vB .- vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedSubtract.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenSubtractVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed shift left of units in two vector registers: vB = vB .<< vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedShiftLeft.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: immediate
+     */
+    void GenShiftLeftVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed signed shift right of units in two vector registers: vB = vB .>> vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedSignedShiftRight.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: immediate
+     */
+    void GenSignedShiftRightVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed unsigned shift right of units in two vector registers: vB = vB .>>> vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedUnsignedShiftRight.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: immediate
+     */
+    void GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed bitwise and of units in two vector registers: vB = vB .& vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedAnd.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenAndVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed bitwise or of units in two vector registers: vB = vB .| vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedOr.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenOrVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Packed bitwise xor of units in two vector registers: vB = vB .^ vC using vA to know the type of the vector.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedXor.
+     * @note vA: TypeSize
+     * @note vB: destination and source
+     * @note vC: source
+     */
+    void GenXorVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Reduce a 128-bit packed element into a single VR by taking lower bits
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedAddReduce.
+     * @details The instruction does a horizontal addition of the packed elements and then adds the result to the VR.
+     * @note vA: TypeSize
+     * @note vB: destination and source VR (not vector register)
+     * @note vC: source (vector register)
+     */
+    void GenAddReduceVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Extract a packed element into a single VR.
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedReduce.
+     * @note vA: TypeSize
+     * @note vB: destination VR (not vector register)
+     * @note vC: source (vector register)
+     * @note arg[0]: The index to use for extraction from vector register (which packed element).
+     */
+    void GenReduceVector(BasicBlock *bb, MIR *mir);
+
+    /*
+     * @brief Create a vector value, with all TypeSize values equal to vC
+     * @param bb The basic block in which the MIR is from.
+     * @param mir The MIR whose opcode is kMirOpPackedSet.
+     * @note vA: TypeSize.
+     * @note vB: destination vector register.
+     * @note vC: source VR (not vector register).
+     */
+    void GenSetVector(BasicBlock *bb, MIR *mir);
+
+    /*
      * @brief Generate code for a vector opcode.
      * @param bb The basic block in which the MIR is from.
      * @param mir The MIR whose opcode is a non-standard opcode.
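    // A worked reading of the vA "TypeSize" operand used by the Gen*Vector methods
    // above, inferred from the checks in int_x86.cc: the low 16 bits carry the total
    // vector width (always 128 here) and the high 16 bits carry the OpSize of one
    // packed lane. For four 32-bit lanes one would therefore expect, illustratively:
    //   uint32_t type_size = (k32 << 16) | 128;
    //   OpSize opsize = static_cast<OpSize>(type_size >> 16);  // k32
    //   DCHECK_EQ(type_size & 0xFFFF, 128U);
    // vB and vC then name 128-bit xmm registers, materialized via RegStorage::Solo128().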
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index a6ccc99..48bff6e 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -1372,6 +1372,15 @@
   GenLongArith(rl_dest, rl_src1, rl_src2, opcode, true);
 }
 
+void X86Mir2Lir::GenNotLong(RegLocation rl_dest, RegLocation rl_src) {
+  LOG(FATAL) << "Unexpected use GenNotLong()";
+}
+
+void X86Mir2Lir::GenDivRemLong(Instruction::Code, RegLocation rl_dest, RegLocation rl_src1,
+                           RegLocation rl_src2, bool is_div) {
+  LOG(FATAL) << "Unexpected use GenDivRemLong()";
+}
+
 void X86Mir2Lir::GenNegLong(RegLocation rl_dest, RegLocation rl_src) {
   rl_src = LoadValueWide(rl_src, kCoreReg);
   RegLocation rl_result = ForceTempWide(rl_src);
@@ -2189,6 +2198,10 @@
           }
         }
         rl_rhs = LoadValue(rl_rhs, kCoreReg);
+        // rl_rhs and rl_dest might refer to the same VR, in which case
+        // rl_dest is already in a register after the LoadValue above but
+        // rl_result has not been refreshed yet, so update it here.
+        rl_result = UpdateLocTyped(rl_dest, kCoreReg);
         if (rl_result.location != kLocPhysReg) {
           // Okay, we can do this into memory.
           OpMemReg(op, rl_result, rl_rhs.reg.GetReg());
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index e7a629a..19ad2f8 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -35,6 +35,12 @@
     rs_r8, rs_r9, rs_r10, rs_r11, rs_r12, rs_r13, rs_r14, rs_r15
 #endif
 };
+static const RegStorage core_regs_arr_64q[] = {
+    rs_r0q, rs_r1q, rs_r2q, rs_r3q, rs_rX86_SP_64, rs_r5q, rs_r6q, rs_r7q,
+#ifdef TARGET_REX_SUPPORT
+    rs_r8, rs_r9, rs_r10, rs_r11, rs_r12, rs_r13, rs_r14, rs_r15
+#endif
+};
 static const RegStorage sp_regs_arr_32[] = {
     rs_fr0, rs_fr1, rs_fr2, rs_fr3, rs_fr4, rs_fr5, rs_fr6, rs_fr7,
 };
@@ -55,6 +61,7 @@
 };
 static const RegStorage reserved_regs_arr_32[] = {rs_rX86_SP_32};
 static const RegStorage reserved_regs_arr_64[] = {rs_rX86_SP_64};
+static const RegStorage reserved_regs_arr_64q[] = {rs_rX86_SP_64};
 static const RegStorage core_temps_arr_32[] = {rs_rAX, rs_rCX, rs_rDX, rs_rBX};
 static const RegStorage core_temps_arr_64[] = {
     rs_rAX, rs_rCX, rs_rDX, rs_rSI, rs_rDI,
@@ -62,6 +69,12 @@
     rs_r8, rs_r9, rs_r10, rs_r11
 #endif
 };
+static const RegStorage core_temps_arr_64q[] = {
+    rs_r0q, rs_r1q, rs_r2q, rs_r6q, rs_r7q,
+#ifdef TARGET_REX_SUPPORT
+    rs_r8q, rs_r9q, rs_r10q, rs_r11q
+#endif
+};
 static const RegStorage sp_temps_arr_32[] = {
     rs_fr0, rs_fr1, rs_fr2, rs_fr3, rs_fr4, rs_fr5, rs_fr6, rs_fr7,
 };
@@ -81,11 +94,23 @@
 #endif
 };
 
+static const RegStorage xp_temps_arr_32[] = {
+    rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
+};
+static const RegStorage xp_temps_arr_64[] = {
+    rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
+#ifdef TARGET_REX_SUPPORT
+    rs_xr8, rs_xr9, rs_xr10, rs_xr11, rs_xr12, rs_xr13, rs_xr14, rs_xr15
+#endif
+};
+
 static const std::vector<RegStorage> empty_pool;
 static const std::vector<RegStorage> core_regs_32(core_regs_arr_32,
     core_regs_arr_32 + sizeof(core_regs_arr_32) / sizeof(core_regs_arr_32[0]));
 static const std::vector<RegStorage> core_regs_64(core_regs_arr_64,
     core_regs_arr_64 + sizeof(core_regs_arr_64) / sizeof(core_regs_arr_64[0]));
+static const std::vector<RegStorage> core_regs_64q(core_regs_arr_64q,
+    core_regs_arr_64q + sizeof(core_regs_arr_64q) / sizeof(core_regs_arr_64q[0]));
 static const std::vector<RegStorage> sp_regs_32(sp_regs_arr_32,
     sp_regs_arr_32 + sizeof(sp_regs_arr_32) / sizeof(sp_regs_arr_32[0]));
 static const std::vector<RegStorage> sp_regs_64(sp_regs_arr_64,
@@ -98,10 +123,14 @@
     reserved_regs_arr_32 + sizeof(reserved_regs_arr_32) / sizeof(reserved_regs_arr_32[0]));
 static const std::vector<RegStorage> reserved_regs_64(reserved_regs_arr_64,
     reserved_regs_arr_64 + sizeof(reserved_regs_arr_64) / sizeof(reserved_regs_arr_64[0]));
+static const std::vector<RegStorage> reserved_regs_64q(reserved_regs_arr_64q,
+    reserved_regs_arr_64q + sizeof(reserved_regs_arr_64q) / sizeof(reserved_regs_arr_64q[0]));
 static const std::vector<RegStorage> core_temps_32(core_temps_arr_32,
     core_temps_arr_32 + sizeof(core_temps_arr_32) / sizeof(core_temps_arr_32[0]));
 static const std::vector<RegStorage> core_temps_64(core_temps_arr_64,
     core_temps_arr_64 + sizeof(core_temps_arr_64) / sizeof(core_temps_arr_64[0]));
+static const std::vector<RegStorage> core_temps_64q(core_temps_arr_64q,
+    core_temps_arr_64q + sizeof(core_temps_arr_64q) / sizeof(core_temps_arr_64q[0]));
 static const std::vector<RegStorage> sp_temps_32(sp_temps_arr_32,
     sp_temps_arr_32 + sizeof(sp_temps_arr_32) / sizeof(sp_temps_arr_32[0]));
 static const std::vector<RegStorage> sp_temps_64(sp_temps_arr_64,
@@ -111,6 +140,11 @@
 static const std::vector<RegStorage> dp_temps_64(dp_temps_arr_64,
     dp_temps_arr_64 + sizeof(dp_temps_arr_64) / sizeof(dp_temps_arr_64[0]));
 
+static const std::vector<RegStorage> xp_temps_32(xp_temps_arr_32,
+    xp_temps_arr_32 + sizeof(xp_temps_arr_32) / sizeof(xp_temps_arr_32[0]));
+static const std::vector<RegStorage> xp_temps_64(xp_temps_arr_64,
+    xp_temps_arr_64 + sizeof(xp_temps_arr_64) / sizeof(xp_temps_arr_64[0]));
+
 RegStorage rs_rX86_SP;
 
 X86NativeRegisterPool rX86_ARG0;
@@ -209,7 +243,7 @@
   /* Double registers in x86 are just a single FP register */
   seed = 1;
   /* FP register starts at bit position 16 */
-  shift = reg.IsFloat() ? kX86FPReg0 : 0;
+  shift = (reg.IsFloat() || reg.StorageSize() > 8) ? kX86FPReg0 : 0;
   /* Expand the double register id into single offset */
   shift += reg_id;
   return (seed << shift);
@@ -531,9 +565,9 @@
 
 void X86Mir2Lir::CompilerInitializeRegAlloc() {
   if (Gen64Bit()) {
-    reg_pool_ = new (arena_) RegisterPool(this, arena_, empty_pool, core_regs_64, sp_regs_64,
-                                          dp_regs_64, empty_pool, reserved_regs_64,
-                                          empty_pool, core_temps_64, sp_temps_64, dp_temps_64);
+    reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs_64, empty_pool/*core_regs_64q*/, sp_regs_64,
+                                          dp_regs_64, reserved_regs_64, empty_pool/*reserved_regs_64q*/,
+                                          core_temps_64, empty_pool/*core_temps_64q*/, sp_temps_64, dp_temps_64);
   } else {
     reg_pool_ = new (arena_) RegisterPool(this, arena_, core_regs_32, empty_pool, sp_regs_32,
                                           dp_regs_32, reserved_regs_32, empty_pool,
@@ -542,17 +576,31 @@
 
   // Target-specific adjustments.
 
+  // Add in XMM registers.
+  const std::vector<RegStorage> *xp_temps = Gen64Bit() ? &xp_temps_64 : &xp_temps_32;
+  for (RegStorage reg : *xp_temps) {
+    RegisterInfo* info = new (arena_) RegisterInfo(reg, GetRegMaskCommon(reg));
+    reginfo_map_.Put(reg.GetReg(), info);
+    info->SetIsTemp(true);
+  }
+
   // Alias single precision xmm to double xmms.
   // TODO: as needed, add larger vector sizes - alias all to the largest.
   GrowableArray<RegisterInfo*>::Iterator it(&reg_pool_->sp_regs_);
   for (RegisterInfo* info = it.Next(); info != nullptr; info = it.Next()) {
     int sp_reg_num = info->GetReg().GetRegNum();
+    RegStorage xp_reg = RegStorage::Solo128(sp_reg_num);
+    RegisterInfo* xp_reg_info = GetRegInfo(xp_reg);
+    // 128-bit xmm vector register's master storage should refer to itself.
+    DCHECK_EQ(xp_reg_info, xp_reg_info->Master());
+
+    // Redirect 32-bit vector's master storage to 128-bit vector.
+    info->SetMaster(xp_reg_info);
+
     RegStorage dp_reg = RegStorage::Solo64(RegStorage::kFloatingPoint | sp_reg_num);
     RegisterInfo* dp_reg_info = GetRegInfo(dp_reg);
-    // 64-bit xmm vector register's master storage should refer to itself.
-    DCHECK_EQ(dp_reg_info, dp_reg_info->Master());
-    // Redirect 32-bit vector's master storage to 64-bit vector.
-    info->SetMaster(dp_reg_info);
+    // Redirect 64-bit vector's master storage to 128-bit vector.
+    dp_reg_info->SetMaster(xp_reg_info);
   }
 
   // Don't start allocating temps at r0/s0/d0 or you may clobber return regs in early-exit methods.
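  // With the aliasing above, each physical xmm register has a single 128-bit master:
  // e.g. fr0 (32-bit view) and dr0 (64-bit view) both point at xr0, so clobbering any
  // view marks the whole register. A minimal sketch of how a view reaches its master,
  // using only calls that appear in this function:
  //   RegisterInfo* xp_info = GetRegInfo(RegStorage::Solo128(0));
  //   DCHECK_EQ(xp_info, xp_info->Master());  // the 128-bit view is its own master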
@@ -1240,6 +1288,45 @@
     case kMirOpConstVector:
       GenConst128(bb, mir);
       break;
+    case kMirOpMoveVector:
+      GenMoveVector(bb, mir);
+      break;
+    case kMirOpPackedMultiply:
+      GenMultiplyVector(bb, mir);
+      break;
+    case kMirOpPackedAddition:
+      GenAddVector(bb, mir);
+      break;
+    case kMirOpPackedSubtract:
+      GenSubtractVector(bb, mir);
+      break;
+    case kMirOpPackedShiftLeft:
+      GenShiftLeftVector(bb, mir);
+      break;
+    case kMirOpPackedSignedShiftRight:
+      GenSignedShiftRightVector(bb, mir);
+      break;
+    case kMirOpPackedUnsignedShiftRight:
+      GenUnsignedShiftRightVector(bb, mir);
+      break;
+    case kMirOpPackedAnd:
+      GenAndVector(bb, mir);
+      break;
+    case kMirOpPackedOr:
+      GenOrVector(bb, mir);
+      break;
+    case kMirOpPackedXor:
+      GenXorVector(bb, mir);
+      break;
+    case kMirOpPackedAddReduce:
+      GenAddReduceVector(bb, mir);
+      break;
+    case kMirOpPackedReduce:
+      GenReduceVector(bb, mir);
+      break;
+    case kMirOpPackedSet:
+      GenSetVector(bb, mir);
+      break;
     default:
       break;
   }
@@ -1249,9 +1336,9 @@
   int type_size = mir->dalvikInsn.vA;
   // We support 128 bit vectors.
   DCHECK_EQ(type_size & 0xFFFF, 128);
-  int reg = mir->dalvikInsn.vB;
-  DCHECK_LT(reg, 8);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
   uint32_t *args = mir->dalvikInsn.arg;
+  int reg = rs_dest.GetReg();
   // Check for all 0 case.
   if (args[0] == 0 && args[1] == 0 && args[2] == 0 && args[3] == 0) {
     NewLIR2(kX86XorpsRR, reg, reg);
@@ -1277,6 +1364,287 @@
   SetMemRefType(load, true, kLiteral);
 }
 
+void X86Mir2Lir::GenMoveVector(BasicBlock *bb, MIR *mir) {
+  // We only support 128 bit registers.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vC);
+  NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg());
+}
+
+void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PmulldRR;
+      break;
+    case kSignedHalf:
+      opcode = kX86PmullwRR;
+      break;
+    case kSingle:
+      opcode = kX86MulpsRR;
+      break;
+    case kDouble:
+      opcode = kX86MulpdRR;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector multiply " << opsize;
+      break;
+  }
+  NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenAddVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PadddRR;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PaddwRR;
+      break;
+    case kUnsignedByte:
+    case kSignedByte:
+      opcode = kX86PaddbRR;
+      break;
+    case kSingle:
+      opcode = kX86AddpsRR;
+      break;
+    case kDouble:
+      opcode = kX86AddpdRR;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector addition " << opsize;
+      break;
+  }
+  NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenSubtractVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PsubdRR;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PsubwRR;
+      break;
+    case kUnsignedByte:
+    case kSignedByte:
+      opcode = kX86PsubbRR;
+      break;
+    case kSingle:
+      opcode = kX86SubpsRR;
+      break;
+    case kDouble:
+      opcode = kX86SubpdRR;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector subtraction " << opsize;
+      break;
+  }
+  NewLIR2(opcode, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  int imm = mir->dalvikInsn.vC;
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PslldRI;
+      break;
+    case k64:
+      opcode = kX86PsllqRI;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PsllwRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector shift left " << opsize;
+      break;
+  }
+  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+}
+
+void X86Mir2Lir::GenSignedShiftRightVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  int imm = mir->dalvikInsn.vC;
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PsradRI;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PsrawRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector signed shift right " << opsize;
+      break;
+  }
+  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+}
+
+void X86Mir2Lir::GenUnsignedShiftRightVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  int imm = mir->dalvikInsn.vC;
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PsrldRI;
+      break;
+    case k64:
+      opcode = kX86PsrlqRI;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PsrlwRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector unsigned shift right " << opsize;
+      break;
+  }
+  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+}
+
+void X86Mir2Lir::GenAndVector(BasicBlock *bb, MIR *mir) {
+  // We only support 128 bit registers.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenOrVector(BasicBlock *bb, MIR *mir) {
+  // We only support 128 bit registers.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenXorVector(BasicBlock *bb, MIR *mir) {
+  // We only support 128 bit registers.
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vC);
+  NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+}
+
+void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  int imm = mir->dalvikInsn.vC;
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PhadddRR;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PhaddwRR;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector add reduce " << opsize;
+      break;
+  }
+  NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
+}
+
+void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
+  int index = mir->dalvikInsn.arg[0];
+  int opcode = 0;
+  switch (opsize) {
+    case k32:
+      opcode = kX86PextrdRRI;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      opcode = kX86PextrwRRI;
+      break;
+    case kUnsignedByte:
+    case kSignedByte:
+      opcode = kX86PextrbRRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector reduce " << opsize;
+      break;
+  }
+  // We need to extract to a GPR.
+  RegStorage temp = AllocTemp();
+  NewLIR3(opcode, temp.GetReg(), rs_src.GetReg(), index);
+
+  // Assume that the destination VR is in the def for the mir.
+  RegLocation rl_dest = mir_graph_->GetDest(mir);
+  RegLocation rl_temp =
+    {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, temp, INVALID_SREG, INVALID_SREG};
+  StoreValue(rl_dest, rl_temp);
+}
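// Illustrative reading of GenReduceVector above: for a kMirOpPackedReduce whose
// vA encodes k32 lanes, vB names vector register 1 and arg[0] == 2, the emitted
// code is roughly "pextrd temp, xmm1, 2"; the synthesized rl_temp location then
// lets StoreValue() move the extracted lane into the destination Dalvik VR.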
+
+void X86Mir2Lir::GenSetVector(BasicBlock *bb, MIR *mir) {
+  DCHECK_EQ(mir->dalvikInsn.vA & 0xFFFF, 128U);
+  OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vA >> 16);
+  RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vB);
+  int op_low = 0, op_high = 0;
+  switch (opsize) {
+    case k32:
+      op_low = kX86PshufdRRI;
+      break;
+    case kSignedHalf:
+    case kUnsignedHalf:
+      // Handles low quadword.
+      op_low = kX86PshuflwRRI;
+      // Handles upper quadword.
+      op_high = kX86PshufdRRI;
+      break;
+    default:
+      LOG(FATAL) << "Unsupported vector set " << opsize;
+      break;
+  }
+
+  // Load the value from the VR into a GPR.
+  RegLocation rl_src = mir_graph_->GetSrc(mir, 0);
+  rl_src = LoadValue(rl_src, kCoreReg);
+
+  // Load the value into the XMM register.
+  NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), rl_src.reg.GetReg());
+
+  // Now shuffle the value across the destination.
+  NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+
+  // And then repeat as needed.
+  if (op_high != 0) {
+    NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+  }
+}
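// Read concretely for a 32-bit kMirOpPackedSet: movd copies the GPR into the low
// lane of the destination xmm register, and "pshufd xmm, xmm, 0" then replicates
// lane 0 into all four lanes. For half-word elements the pshuflw pass first
// replicates the low word across the low quadword and the following pshufd copies
// that quadword across the register, which is why op_high is only set in that case.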
+
+
 LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
   int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
   for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index fed31c1..ee02257 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -89,7 +89,11 @@
     res = NewLIR2(kX86Xor32RR, r_dest.GetReg(), r_dest.GetReg());
   } else {
     // Note, there is no byte immediate form of a 32 bit immediate move.
-    res = NewLIR2(kX86Mov32RI, r_dest.GetReg(), value);
+    if (r_dest.Is64Bit()) {
+      res = NewLIR2(kX86Mov64RI, r_dest.GetReg(), value);
+    } else {
+      res = NewLIR2(kX86Mov32RI, r_dest.GetReg(), value);
+    }
   }
 
   if (r_dest_save.IsFloat()) {
@@ -181,7 +185,6 @@
         LOG(FATAL) << "Bad case in OpRegImm " << op;
     }
   }
-  CHECK(!r_dest_src1.Is64Bit() || X86Mir2Lir::EncodingMap[opcode].kind == kReg64Imm) << "OpRegImm(" << op << ")";
   return NewLIR2(opcode, r_dest_src1.GetReg(), value);
 }
 
@@ -559,7 +562,7 @@
         // We don't know the proper offset for the value, so pick one that will force
         // 4 byte offset.  We will fix this up in the assembler later to have the right
         // value.
-        res = LoadBaseDisp(rl_method.reg, 256 /* bogus */, RegStorage::Solo64(low_reg_val),
+        res = LoadBaseDisp(rl_method.reg, 256 /* bogus */, RegStorage::FloatSolo64(low_reg_val),
                            kDouble);
         res->target = data_target;
         res->flags.fixup = kFixupLoad;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index adfed0c..24c64cd 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -117,41 +117,56 @@
 // FIXME: for 64-bit, perhaps add an X86_64NativeRegisterPool enum?
 enum X86NativeRegisterPool {
   r0             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 0,
+  r0q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 0,
   rAX            = r0,
   r1             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 1,
+  r1q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 1,
   rCX            = r1,
   r2             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 2,
+  r2q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 2,
   rDX            = r2,
   r3             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 3,
+  r3q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 3,
   rBX            = r3,
   r4sp_32        = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 4,
   rX86_SP_32     = r4sp_32,
   r4sp_64        = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 4,
   rX86_SP_64     = r4sp_64,
   r5             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 5,
+  r5q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 5,
   rBP            = r5,
   r5sib_no_base  = r5,
   r6             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 6,
+  r6q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 6,
   rSI            = r6,
   r7             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 7,
+  r7q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 7,
   rDI            = r7,
 #ifndef TARGET_REX_SUPPORT
   // fake return address register for core spill mask.
   rRET           = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 8,
 #else
   r8             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 8,
+  r8q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 8,
   r9             = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 9,
+  r9q            = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 9,
   r10            = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 10,
+  r10q           = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 10,
   r11            = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 11,
+  r11q           = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 11,
   r12            = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 12,
+  r12q           = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 12,
   r13            = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 13,
+  r13q           = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 13,
   r14            = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 14,
+  r14q           = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 14,
   r15            = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 15,
+  r15q           = RegStorage::k64BitSolo | RegStorage::kCoreRegister | 15,
   // fake return address register for core spill mask.
   rRET           = RegStorage::k32BitSolo | RegStorage::kCoreRegister | 16,
 #endif
 
-  // xmm registers, single precision view
+  // xmm registers, single precision view.
   fr0  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 0,
   fr1  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 1,
   fr2  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 2,
@@ -161,7 +176,7 @@
   fr6  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 6,
   fr7  = RegStorage::k32BitSolo | RegStorage::kFloatingPoint | 7,
 
-  // xmm registers, double precision alises
+  // xmm registers, double precision aliases.
   dr0  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 0,
   dr1  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 1,
   dr2  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 2,
@@ -171,37 +186,62 @@
   dr6  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 6,
   dr7  = RegStorage::k64BitSolo | RegStorage::kFloatingPoint | 7,
 
-  // xmm registers, quad precision alises
-  qr0  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 0,
-  qr1  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 1,
-  qr2  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 2,
-  qr3  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 3,
-  qr4  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 4,
-  qr5  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 5,
-  qr6  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 6,
-  qr7  = RegStorage::k128BitSolo | RegStorage::kFloatingPoint | 7,
+  // xmm registers aliases.
+  xr0  = RegStorage::k128BitSolo | 0,
+  xr1  = RegStorage::k128BitSolo | 1,
+  xr2  = RegStorage::k128BitSolo | 2,
+  xr3  = RegStorage::k128BitSolo | 3,
+  xr4  = RegStorage::k128BitSolo | 4,
+  xr5  = RegStorage::k128BitSolo | 5,
+  xr6  = RegStorage::k128BitSolo | 6,
+  xr7  = RegStorage::k128BitSolo | 7,
 
   // TODO: as needed, add 256, 512 and 1024-bit xmm views.
 };
 
 constexpr RegStorage rs_r0(RegStorage::kValid | r0);
+constexpr RegStorage rs_r0q(RegStorage::kValid | r0q);
 constexpr RegStorage rs_rAX = rs_r0;
 constexpr RegStorage rs_r1(RegStorage::kValid | r1);
+constexpr RegStorage rs_r1q(RegStorage::kValid | r1q);
 constexpr RegStorage rs_rCX = rs_r1;
 constexpr RegStorage rs_r2(RegStorage::kValid | r2);
+constexpr RegStorage rs_r2q(RegStorage::kValid | r2q);
 constexpr RegStorage rs_rDX = rs_r2;
 constexpr RegStorage rs_r3(RegStorage::kValid | r3);
+constexpr RegStorage rs_r3q(RegStorage::kValid | r3q);
 constexpr RegStorage rs_rBX = rs_r3;
 constexpr RegStorage rs_rX86_SP_64(RegStorage::kValid | r4sp_64);
 constexpr RegStorage rs_rX86_SP_32(RegStorage::kValid | r4sp_32);
 extern RegStorage rs_rX86_SP;
 constexpr RegStorage rs_r5(RegStorage::kValid | r5);
+constexpr RegStorage rs_r5q(RegStorage::kValid | r5q);
 constexpr RegStorage rs_rBP = rs_r5;
 constexpr RegStorage rs_r6(RegStorage::kValid | r6);
+constexpr RegStorage rs_r6q(RegStorage::kValid | r6q);
 constexpr RegStorage rs_rSI = rs_r6;
 constexpr RegStorage rs_r7(RegStorage::kValid | r7);
+constexpr RegStorage rs_r7q(RegStorage::kValid | r7q);
 constexpr RegStorage rs_rDI = rs_r7;
 constexpr RegStorage rs_rRET(RegStorage::kValid | rRET);
+#ifdef TARGET_REX_SUPPORT
+constexpr RegStorage rs_r8(RegStorage::kValid | r8);
+constexpr RegStorage rs_r8q(RegStorage::kValid | r8q);
+constexpr RegStorage rs_r9(RegStorage::kValid | r9);
+constexpr RegStorage rs_r9q(RegStorage::kValid | r9q);
+constexpr RegStorage rs_r10(RegStorage::kValid | r10);
+constexpr RegStorage rs_r10q(RegStorage::kValid | r10q);
+constexpr RegStorage rs_r11(RegStorage::kValid | r11);
+constexpr RegStorage rs_r11q(RegStorage::kValid | r11q);
+constexpr RegStorage rs_r12(RegStorage::kValid | r12);
+constexpr RegStorage rs_r12q(RegStorage::kValid | r12q);
+constexpr RegStorage rs_r13(RegStorage::kValid | r13);
+constexpr RegStorage rs_r13q(RegStorage::kValid | r13q);
+constexpr RegStorage rs_r14(RegStorage::kValid | r14);
+constexpr RegStorage rs_r14q(RegStorage::kValid | r14q);
+constexpr RegStorage rs_r15(RegStorage::kValid | r15);
+constexpr RegStorage rs_r15q(RegStorage::kValid | r15q);
+#endif
 
 constexpr RegStorage rs_fr0(RegStorage::kValid | fr0);
 constexpr RegStorage rs_fr1(RegStorage::kValid | fr1);
@@ -221,14 +261,14 @@
 constexpr RegStorage rs_dr6(RegStorage::kValid | dr6);
 constexpr RegStorage rs_dr7(RegStorage::kValid | dr7);
 
-constexpr RegStorage rs_qr0(RegStorage::kValid | qr0);
-constexpr RegStorage rs_qr1(RegStorage::kValid | qr1);
-constexpr RegStorage rs_qr2(RegStorage::kValid | qr2);
-constexpr RegStorage rs_qr3(RegStorage::kValid | qr3);
-constexpr RegStorage rs_qr4(RegStorage::kValid | qr4);
-constexpr RegStorage rs_qr5(RegStorage::kValid | qr5);
-constexpr RegStorage rs_qr6(RegStorage::kValid | qr6);
-constexpr RegStorage rs_qr7(RegStorage::kValid | qr7);
+constexpr RegStorage rs_xr0(RegStorage::kValid | xr0);
+constexpr RegStorage rs_xr1(RegStorage::kValid | xr1);
+constexpr RegStorage rs_xr2(RegStorage::kValid | xr2);
+constexpr RegStorage rs_xr3(RegStorage::kValid | xr3);
+constexpr RegStorage rs_xr4(RegStorage::kValid | xr4);
+constexpr RegStorage rs_xr5(RegStorage::kValid | xr5);
+constexpr RegStorage rs_xr6(RegStorage::kValid | xr6);
+constexpr RegStorage rs_xr7(RegStorage::kValid | xr7);
 
 extern X86NativeRegisterPool rX86_ARG0;
 extern X86NativeRegisterPool rX86_ARG1;
@@ -311,10 +351,14 @@
   opcode ## 16RR, opcode ## 16RM, opcode ## 16RA, opcode ## 16RT, \
   opcode ## 16RI, opcode ## 16MI, opcode ## 16AI, opcode ## 16TI, \
   opcode ## 16RI8, opcode ## 16MI8, opcode ## 16AI8, opcode ## 16TI8, \
-  opcode ## 32MR, opcode ## 64MR, opcode ## 32AR, opcode ## 64AR, opcode ## 32TR,  \
-  opcode ## 32RR, opcode ## 32RM, opcode ## 64RM, opcode ## 32RA, opcode ## 64RA, opcode ## 32RT, opcode ## 64RT, \
-  opcode ## 32RI, opcode ## 64RI, opcode ## 32MI, opcode ## 32AI, opcode ## 32TI, \
-  opcode ## 32RI8, opcode ## 64RI8, opcode ## 32MI8, opcode ## 32AI8, opcode ## 32TI8
+  opcode ## 32MR, opcode ## 32AR, opcode ## 32TR,  \
+  opcode ## 32RR, opcode ## 32RM, opcode ## 32RA, opcode ## 32RT, \
+  opcode ## 32RI, opcode ## 32MI, opcode ## 32AI, opcode ## 32TI, \
+  opcode ## 32RI8, opcode ## 32MI8, opcode ## 32AI8, opcode ## 32TI8, \
+  opcode ## 64MR, opcode ## 64AR, opcode ## 64TR,  \
+  opcode ## 64RR, opcode ## 64RM, opcode ## 64RA, opcode ## 64RT, \
+  opcode ## 64RI, opcode ## 64MI, opcode ## 64AI, opcode ## 64TI, \
+  opcode ## 64RI8, opcode ## 64MI8, opcode ## 64AI8, opcode ## 64TI8
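  // For illustration, BinaryOpCode(kX86Add) now expands to the 8-, 16-, 32- and
  // 64-bit families in that order (kX86Add8MR ... kX86Add32TI8, kX86Add64MR ...
  // kX86Add64TI8), so each ALU opcode gets a contiguous block of 64-bit forms
  // instead of interleaving them with the 32-bit ones as before.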
   BinaryOpCode(kX86Add),
   BinaryOpCode(kX86Or),
   BinaryOpCode(kX86Adc),
@@ -327,23 +371,32 @@
   kX86Imul16RRI, kX86Imul16RMI, kX86Imul16RAI,
   kX86Imul32RRI, kX86Imul32RMI, kX86Imul32RAI,
   kX86Imul32RRI8, kX86Imul32RMI8, kX86Imul32RAI8,
+  kX86Imul64RRI, kX86Imul64RMI, kX86Imul64RAI,
+  kX86Imul64RRI8, kX86Imul64RMI8, kX86Imul64RAI8,
   kX86Mov8MR, kX86Mov8AR, kX86Mov8TR,
   kX86Mov8RR, kX86Mov8RM, kX86Mov8RA, kX86Mov8RT,
   kX86Mov8RI, kX86Mov8MI, kX86Mov8AI, kX86Mov8TI,
   kX86Mov16MR, kX86Mov16AR, kX86Mov16TR,
   kX86Mov16RR, kX86Mov16RM, kX86Mov16RA, kX86Mov16RT,
   kX86Mov16RI, kX86Mov16MI, kX86Mov16AI, kX86Mov16TI,
-  kX86Mov32MR, kX86Mov64MR, kX86Mov32AR, kX86Mov64AR, kX86Mov32TR,
-  kX86Mov32RR, kX86Mov32RM, kX86Mov64RM, kX86Mov32RA, kX86Mov64RA, kX86Mov32RT, kX86Mov64RT,
-  kX86Mov32RI, kX86Mov32MI, kX86Mov32AI, kX86Mov32TI, kX86Mov64TI,
+  kX86Mov32MR, kX86Mov32AR, kX86Mov32TR,
+  kX86Mov32RR, kX86Mov32RM, kX86Mov32RA, kX86Mov32RT,
+  kX86Mov32RI, kX86Mov32MI, kX86Mov32AI, kX86Mov32TI,
   kX86Lea32RM,
   kX86Lea32RA,
+  kX86Mov64MR, kX86Mov64AR, kX86Mov64TR,
+  kX86Mov64RR, kX86Mov64RM, kX86Mov64RA, kX86Mov64RT,
+  kX86Mov64RI, kX86Mov64MI, kX86Mov64AI, kX86Mov64TI,
+  kX86Lea64RM,
+  kX86Lea64RA,
   // RRC - Register Register ConditionCode - cond_opcode reg1, reg2
   //             - lir operands - 0: reg1, 1: reg2, 2: CC
   kX86Cmov32RRC,
+  kX86Cmov64RRC,
   // RMC - Register Memory ConditionCode - cond_opcode reg1, [base + disp]
   //             - lir operands - 0: reg1, 1: base, 2: disp 3: CC
   kX86Cmov32RMC,
+  kX86Cmov64RMC,
 
   // RC - Register CL - opcode reg, CL
   //          - lir operands - 0: reg, 1: CL
@@ -357,7 +410,9 @@
   opcode ## 16RI, opcode ## 16MI, opcode ## 16AI, \
   opcode ## 16RC, opcode ## 16MC, opcode ## 16AC, \
   opcode ## 32RI, opcode ## 32MI, opcode ## 32AI, \
-  opcode ## 32RC, opcode ## 32MC, opcode ## 32AC
+  opcode ## 32RC, opcode ## 32MC, opcode ## 32AC, \
+  opcode ## 64RI, opcode ## 64MI, opcode ## 64AI, \
+  opcode ## 64RC, opcode ## 64MC, opcode ## 64AC
   BinaryShiftOpCode(kX86Rol),
   BinaryShiftOpCode(kX86Ror),
   BinaryShiftOpCode(kX86Rcl),
@@ -371,12 +426,18 @@
   kX86Shld32MRI,
   kX86Shrd32RRI,
   kX86Shrd32MRI,
+  kX86Shld64RRI,
+  kX86Shld64MRI,
+  kX86Shrd64RRI,
+  kX86Shrd64MRI,
 #define UnaryOpcode(opcode, reg, mem, array) \
   opcode ## 8 ## reg, opcode ## 8 ## mem, opcode ## 8 ## array, \
   opcode ## 16 ## reg, opcode ## 16 ## mem, opcode ## 16 ## array, \
-  opcode ## 32 ## reg, opcode ## 32 ## mem, opcode ## 32 ## array
+  opcode ## 32 ## reg, opcode ## 32 ## mem, opcode ## 32 ## array, \
+  opcode ## 64 ## reg, opcode ## 64 ## mem, opcode ## 64 ## array
   UnaryOpcode(kX86Test, RI, MI, AI),
   kX86Test32RR,
+  kX86Test64RR,
   UnaryOpcode(kX86Not, R, M, A),
   UnaryOpcode(kX86Neg, R, M, A),
   UnaryOpcode(kX86Mul,  DaR, DaM, DaA),
@@ -418,9 +479,39 @@
   Binary0fOpCode(kX86Divsd),    // double divide
   Binary0fOpCode(kX86Divss),    // float divide
   Binary0fOpCode(kX86Punpckldq),  // Interleave low-order double words
-  kX86PsrlqRI,                  // right shift of floating point registers
-  kX86PsllqRI,                  // left shift of floating point registers
-  kX86SqrtsdRR,                 // sqrt of floating point register
+  Binary0fOpCode(kX86Sqrtsd),   // square root
+  Binary0fOpCode(kX86Pmulld),   // parallel integer multiply 32 bits x 4
+  Binary0fOpCode(kX86Pmullw),   // parallel integer multiply 16 bits x 8
+  Binary0fOpCode(kX86Mulps),    // parallel FP multiply 32 bits x 4
+  Binary0fOpCode(kX86Mulpd),    // parallel FP multiply 64 bits x 2
+  Binary0fOpCode(kX86Paddb),    // parallel integer addition 8 bits x 16
+  Binary0fOpCode(kX86Paddw),    // parallel integer addition 16 bits x 8
+  Binary0fOpCode(kX86Paddd),    // parallel integer addition 32 bits x 4
+  Binary0fOpCode(kX86Addps),    // parallel FP addition 32 bits x 4
+  Binary0fOpCode(kX86Addpd),    // parallel FP addition 64 bits x 2
+  Binary0fOpCode(kX86Psubb),    // parallel integer subtraction 8 bits x 16
+  Binary0fOpCode(kX86Psubw),    // parallel integer subtraction 16 bits x 8
+  Binary0fOpCode(kX86Psubd),    // parallel integer subtraction 32 bits x 4
+  Binary0fOpCode(kX86Subps),    // parallel FP subtraction 32 bits x 4
+  Binary0fOpCode(kX86Subpd),    // parallel FP subtraction 64 bits x 2
+  Binary0fOpCode(kX86Pand),     // parallel AND 128 bits x 1
+  Binary0fOpCode(kX86Por),      // parallel OR 128 bits x 1
+  Binary0fOpCode(kX86Pxor),     // parallel XOR 128 bits x 1
+  Binary0fOpCode(kX86Phaddw),   // parallel horizontal addition 16 bits x 8
+  Binary0fOpCode(kX86Phaddd),   // parallel horizontal addition 32 bits x 4
+  kX86PextrbRRI,                // Extract 8 bits from XMM into GPR
+  kX86PextrwRRI,                // Extract 16 bits from XMM into GPR
+  kX86PextrdRRI,                // Extract 32 bits from XMM into GPR
+  kX86PshuflwRRI,               // Shuffle 16 bits in lower 64 bits of XMM.
+  kX86PshufdRRI,                // Shuffle 32 bits in XMM.
+  kX86PsrawRI,                  // signed right shift of floating point registers 16 bits x 8
+  kX86PsradRI,                  // signed right shift of floating point registers 32 bits x 4
+  kX86PsrlwRI,                  // logical right shift of floating point registers 16 bits x 8
+  kX86PsrldRI,                  // logical right shift of floating point registers 32 bits x 4
+  kX86PsrlqRI,                  // logical right shift of floating point registers 64 bits x 2
+  kX86PsllwRI,                  // left shift of floating point registers 16 bits x 8
+  kX86PslldRI,                  // left shift of floating point registers 32 bits x 4
+  kX86PsllqRI,                  // left shift of floating point registers 64 bits x 2
   kX86Fild32M,                  // push 32-bit integer on x87 stack
   kX86Fild64M,                  // push 64-bit integer on x87 stack
   kX86Fstp32M,                  // pop top x87 fp stack and do 32-bit store
@@ -474,20 +565,20 @@
 
 /* Instruction assembly field_loc kind */
 enum X86EncodingKind {
-  kData,                                   // Special case for raw data.
-  kNop,                                    // Special case for variable length nop.
-  kNullary,                                // Opcode that takes no arguments.
-  kPrefix2Nullary,                         // Opcode that takes no arguments, but 2 prefixes.
-  kRegOpcode,                              // Shorter form of R instruction kind (opcode+rd)
-  kReg, kReg64, kMem, kArray,              // R, M and A instruction kinds.
-  kMemReg, kMemReg64, kArrayReg, kArrayReg64, kThreadReg,          // MR, AR and TR instruction kinds.
-  kRegReg, kRegMem, kRegArray, kRegThread, kReg64Thread,  // RR, RM, RA and RT instruction kinds.
-  kRegRegStore,                            // RR following the store modrm reg-reg encoding rather than the load.
-  kRegImm, kReg64Imm, kMemImm, kArrayImm, kThreadImm,  // RI, MI, AI and TI instruction kinds.
-  kRegRegImm, kRegMemImm, kRegArrayImm,    // RRI, RMI and RAI instruction kinds.
-  kMovRegImm,                              // Shorter form move RI.
-  kRegRegImmRev,                           // RRI with first reg in r/m
-  kMemRegImm,                              // MRI instruction kinds.
+  kData,                                    // Special case for raw data.
+  kNop,                                     // Special case for variable length nop.
+  kNullary,                                 // Opcode that takes no arguments.
+  kPrefix2Nullary,                          // Opcode that takes no arguments, but 2 prefixes.
+  kRegOpcode,                               // Shorter form of R instruction kind (opcode+rd)
+  kReg, kMem, kArray,                       // R, M and A instruction kinds.
+  kMemReg, kArrayReg, kThreadReg,           // MR, AR and TR instruction kinds.
+  kRegReg, kRegMem, kRegArray, kRegThread,  // RR, RM, RA and RT instruction kinds.
+  kRegRegStore,                             // RR following the store modrm reg-reg encoding rather than the load.
+  kRegImm, kMemImm, kArrayImm, kThreadImm,  // RI, MI, AI and TI instruction kinds.
+  kRegRegImm, kRegMemImm, kRegArrayImm,     // RRI, RMI and RAI instruction kinds.
+  kMovRegImm,                               // Shorter form move RI.
+  kRegRegImmRev,                            // RRI with first reg in r/m
+  kMemRegImm,                               // MRI instruction kinds.
   kShiftRegImm, kShiftMemImm, kShiftArrayImm,  // Shift opcode with immediate.
   kShiftRegCl, kShiftMemCl, kShiftArrayCl,     // Shift opcode with register CL.
   kRegRegReg, kRegRegMem, kRegRegArray,    // RRR, RRM, RRA instruction kinds.
diff --git a/compiler/dex/reg_storage.h b/compiler/dex/reg_storage.h
index 3387c50..7e50c31 100644
--- a/compiler/dex/reg_storage.h
+++ b/compiler/dex/reg_storage.h
@@ -225,24 +225,6 @@
     return reg_ & kRegNumMask;
   }
 
-  // Aliased double to low single.
-  RegStorage DoubleToLowSingle() const {
-    DCHECK(IsDouble());
-    return FloatSolo32(GetRegNum() << 1);
-  }
-
-  // Aliased double to high single.
-  RegStorage DoubleToHighSingle() const {
-    DCHECK(IsDouble());
-    return FloatSolo32((GetRegNum() << 1) + 1);
-  }
-
-  // Single to aliased double.
-  RegStorage SingleToDouble() const {
-    DCHECK(IsSingle());
-    return FloatSolo64(GetRegNum() >> 1);
-  }
-
   // Is register number in 0..7?
   bool Low8() const {
     return GetRegNum() < 8;
@@ -280,6 +262,11 @@
     return RegStorage(k32BitSolo, (reg_num & kRegNumMask) | kFloatingPoint);
   }
 
+  // Create a 128-bit solo.
+  static RegStorage Solo128(int reg_num) {
+    return RegStorage(k128BitSolo, reg_num & kRegTypeMask);
+  }
+
   // Create a 64-bit solo.
   static RegStorage Solo64(int reg_num) {
     return RegStorage(k64BitSolo, reg_num & kRegTypeMask);
@@ -312,7 +299,7 @@
       case k256BitSolo: return 32;
       case k512BitSolo: return 64;
       case k1024BitSolo: return 128;
-      default: LOG(FATAL) << "Unexpected shap";
+      default: LOG(FATAL) << "Unexpected shape";
     }
     return 0;
   }
diff --git a/compiler/dex/ssa_transformation.cc b/compiler/dex/ssa_transformation.cc
index 865311b..0c5a4ca 100644
--- a/compiler/dex/ssa_transformation.cc
+++ b/compiler/dex/ssa_transformation.cc
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-#include "bit_vector_block_iterator.h"
 #include "compiler_internals.h"
 #include "dataflow_iterator-inl.h"
 
@@ -127,12 +126,7 @@
     return false;
   }
 
-  ArenaBitVector::Iterator iterator(bb->data_flow_info->def_v);
-  while (true) {
-    int idx = iterator.Next();
-    if (idx == -1) {
-      break;
-    }
+  for (uint32_t idx : bb->data_flow_info->def_v->Indexes()) {
     /* Block bb defines register idx */
     def_block_matrix_[idx]->SetBit(bb->id);
   }
@@ -173,8 +167,8 @@
 }
 
 void MIRGraph::ComputeDomPostOrderTraversal(BasicBlock* bb) {
-  if (dom_post_order_traversal_ == NULL) {
-    // First time - create the array.
+  if (dom_post_order_traversal_ == NULL || max_num_reachable_blocks_ < num_reachable_blocks_) {
+    // First time or too small - create the array.
     dom_post_order_traversal_ =
         new (arena_) GrowableArray<BasicBlockId>(arena_, num_reachable_blocks_,
                                         kGrowableArrayDomPostOrderTraversal);
@@ -182,22 +176,22 @@
     dom_post_order_traversal_->Reset();
   }
   ClearAllVisitedFlags();
-  std::vector<std::pair<BasicBlock*, ArenaBitVector::Iterator*>> work_stack;
+  std::vector<std::pair<BasicBlock*, ArenaBitVector::IndexIterator>> work_stack;
   bb->visited = true;
-  work_stack.push_back(std::make_pair(bb, bb->i_dominated->GetIterator()));
+  work_stack.push_back(std::make_pair(bb, bb->i_dominated->Indexes().begin()));
   while (!work_stack.empty()) {
-    const std::pair<BasicBlock*, ArenaBitVector::Iterator*>& curr = work_stack.back();
-    BasicBlock* curr_bb = curr.first;
-    ArenaBitVector::Iterator* curr_idom_iter = curr.second;
-    int bb_idx = curr_idom_iter->Next();
-    while ((bb_idx != -1) && (NeedsVisit(GetBasicBlock(bb_idx)) == NULL)) {
-      bb_idx = curr_idom_iter->Next();
+    std::pair<BasicBlock*, ArenaBitVector::IndexIterator>* curr = &work_stack.back();
+    BasicBlock* curr_bb = curr->first;
+    ArenaBitVector::IndexIterator* curr_idom_iter = &curr->second;
+    while (!curr_idom_iter->Done() && (NeedsVisit(GetBasicBlock(**curr_idom_iter)) == nullptr)) {
+      ++*curr_idom_iter;
     }
-    if (bb_idx != -1) {
-      BasicBlock* new_bb = GetBasicBlock(bb_idx);
+    // NOTE: work_stack.push_back()/pop_back() invalidate curr and curr_idom_iter.
+    if (!curr_idom_iter->Done()) {
+      BasicBlock* new_bb = GetBasicBlock(**curr_idom_iter);
+      ++*curr_idom_iter;
       new_bb->visited = true;
-      work_stack.push_back(
-          std::make_pair(new_bb, new_bb->i_dominated->GetIterator()));
+      work_stack.push_back(std::make_pair(new_bb, new_bb->i_dominated->Indexes().begin()));
     } else {
       // no successor/next
       if (curr_bb->id != NullBasicBlockId) {
@@ -249,11 +243,10 @@
   }
 
   /* Calculate DF_up */
-  BitVectorBlockIterator it(bb->i_dominated, cu_);
-  for (BasicBlock *dominated_bb = it.Next(); dominated_bb != nullptr; dominated_bb = it.Next()) {
-    BitVectorBlockIterator inner_it(dominated_bb->dom_frontier, cu_);
-    for (BasicBlock *df_up_block = inner_it.Next(); df_up_block != nullptr;
-         df_up_block = inner_it.Next()) {
+  for (uint32_t dominated_idx : bb->i_dominated->Indexes()) {
+    BasicBlock *dominated_bb = GetBasicBlock(dominated_idx);
+    for (uint32_t df_up_block_idx : dominated_bb->dom_frontier->Indexes()) {
+      BasicBlock *df_up_block = GetBasicBlock(df_up_block_idx);
       CheckForDominanceFrontier(bb, df_up_block);
     }
   }
@@ -380,8 +373,8 @@
     InitializeDominationInfo(bb);
   }
 
-  /* Initalize & Clear i_dom_list */
-  if (i_dom_list_ == NULL) {
+  /* Initialize & Clear i_dom_list */
+  if (max_num_reachable_blocks_ < num_reachable_blocks_) {
     i_dom_list_ = static_cast<int*>(arena_->Alloc(sizeof(int) * num_reachable_blocks,
                                                   kArenaAllocDFInfo));
   }
@@ -449,7 +442,8 @@
  * insert a phi node if the variable is live-in to the block.
  */
 bool MIRGraph::ComputeBlockLiveIns(BasicBlock* bb) {
-  ArenaBitVector* temp_dalvik_register_v = temp_dalvik_register_v_;
+  DCHECK_EQ(temp_bit_vector_size_, cu_->num_dalvik_registers);
+  ArenaBitVector* temp_dalvik_register_v = temp_bit_vector_;
 
   if (bb->data_flow_info == NULL) {
     return false;
@@ -487,15 +481,10 @@
 /* Insert phi nodes for each variable into the dominance frontiers */
 void MIRGraph::InsertPhiNodes() {
   int dalvik_reg;
-  ArenaBitVector* phi_blocks =
-      new (arena_) ArenaBitVector(arena_, GetNumBlocks(), false, kBitMapPhi);
-  ArenaBitVector* tmp_blocks =
-      new (arena_) ArenaBitVector(arena_, GetNumBlocks(), false, kBitMapTmpBlocks);
-  ArenaBitVector* input_blocks =
-      new (arena_) ArenaBitVector(arena_, GetNumBlocks(), false, kBitMapInputBlocks);
-
-  temp_dalvik_register_v_ =
-      new (arena_) ArenaBitVector(arena_, cu_->num_dalvik_registers, false, kBitMapRegisterV);
+  ArenaBitVector* phi_blocks = new (temp_scoped_alloc_.get()) ArenaBitVector(
+      temp_scoped_alloc_.get(), GetNumBlocks(), false, kBitMapPhi);
+  ArenaBitVector* input_blocks = new (temp_scoped_alloc_.get()) ArenaBitVector(
+      temp_scoped_alloc_.get(), GetNumBlocks(), false, kBitMapInputBlocks);
 
   RepeatingPostOrderDfsIterator iter(this);
   bool change = false;
@@ -505,53 +494,23 @@
 
   /* Iterate through each Dalvik register */
   for (dalvik_reg = cu_->num_dalvik_registers - 1; dalvik_reg >= 0; dalvik_reg--) {
-    bool change;
-
     input_blocks->Copy(def_block_matrix_[dalvik_reg]);
     phi_blocks->ClearAllBits();
-
-    /* Calculate the phi blocks for each Dalvik register */
     do {
-      change = false;
-      tmp_blocks->ClearAllBits();
-      ArenaBitVector::Iterator iterator(input_blocks);
-
-      while (true) {
-        int idx = iterator.Next();
-        if (idx == -1) {
-          break;
-        }
+      // TUNING: When we repeat this, we could skip indexes from the previous pass.
+      for (uint32_t idx : input_blocks->Indexes()) {
         BasicBlock* def_bb = GetBasicBlock(idx);
-
-        /* Merge the dominance frontier to tmp_blocks */
-        // TUNING: hot call to Union().
-        if (def_bb->dom_frontier != NULL) {
-          tmp_blocks->Union(def_bb->dom_frontier);
+        if (def_bb->dom_frontier != nullptr) {
+          phi_blocks->Union(def_bb->dom_frontier);
         }
       }
-      if (!phi_blocks->Equal(tmp_blocks)) {
-        change = true;
-        phi_blocks->Copy(tmp_blocks);
-
-        /*
-         * Iterate through the original blocks plus the new ones in
-         * the dominance frontier.
-         */
-        input_blocks->Copy(phi_blocks);
-        input_blocks->Union(def_block_matrix_[dalvik_reg]);
-      }
-    } while (change);
+    } while (input_blocks->Union(phi_blocks));
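    // The loop above relies on Union() reporting whether it changed its receiver:
    // each pass folds the dominance frontiers of the current input blocks into
    // phi_blocks, and input_blocks->Union(phi_blocks) both widens the next pass's
    // work set and signals whether any new bit was added, so iteration stops once
    // the iterated dominance frontier for dalvik_reg stops growing.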
 
     /*
      * Insert a phi node for dalvik_reg in the phi_blocks if the Dalvik
      * register is in the live-in set.
      */
-    ArenaBitVector::Iterator iterator(phi_blocks);
-    while (true) {
-      int idx = iterator.Next();
-      if (idx == -1) {
-        break;
-      }
+    for (uint32_t idx : phi_blocks->Indexes()) {
       BasicBlock* phi_bb = GetBasicBlock(idx);
       /* Variable will be clobbered before being used - no need for phi */
       if (!phi_bb->data_flow_info->live_in_v->IsBitSet(dalvik_reg)) {
@@ -583,12 +542,8 @@
     /* Iterate through the predecessors */
     GrowableArray<BasicBlockId>::Iterator iter(bb->predecessors);
     size_t num_uses = bb->predecessors->Size();
-    mir->ssa_rep->num_uses = num_uses;
-    int* uses = static_cast<int*>(arena_->Alloc(sizeof(int) * num_uses,
-                                                kArenaAllocDFInfo));
-    mir->ssa_rep->uses = uses;
-    mir->ssa_rep->fp_use =
-        static_cast<bool*>(arena_->Alloc(sizeof(bool) * num_uses, kArenaAllocDFInfo));
+    AllocateSSAUseData(mir, num_uses);
+    int* uses = mir->ssa_rep->uses;
     BasicBlockId* incoming =
         static_cast<BasicBlockId*>(arena_->Alloc(sizeof(BasicBlockId) * num_uses,
                                                  kArenaAllocDFInfo));
@@ -597,9 +552,9 @@
     while (true) {
       BasicBlock* pred_bb = GetBasicBlock(iter.Next());
       if (!pred_bb) {
-        break;
+       break;
       }
-      int ssa_reg = pred_bb->data_flow_info->vreg_to_ssa_map[v_reg];
+      int ssa_reg = pred_bb->data_flow_info->vreg_to_ssa_map_exit[v_reg];
       uses[idx] = ssa_reg;
       incoming[idx] = pred_bb->id;
       idx++;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 3304561..8d4e283 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1135,7 +1135,7 @@
       if (dex_method_idx != DexFile::kDexNoIndex) {
         target_method->dex_method_index = dex_method_idx;
       } else {
-        if (compiling_boot) {
+        if (compiling_boot && !use_dex_cache) {
           target_method->dex_method_index = method->GetDexMethodIndex();
           target_method->dex_file = method->GetDeclaringClass()->GetDexCache()->GetDexFile();
         }
diff --git a/compiler/oat_test.cc b/compiler/oat_test.cc
index 6812f3c..49cf71b 100644
--- a/compiler/oat_test.cc
+++ b/compiler/oat_test.cc
@@ -180,7 +180,7 @@
   EXPECT_EQ(80U, sizeof(OatHeader));
   EXPECT_EQ(8U, sizeof(OatMethodOffsets));
   EXPECT_EQ(24U, sizeof(OatQuickMethodHeader));
-  EXPECT_EQ(80 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
+  EXPECT_EQ(79 * GetInstructionSetPointerSize(kRuntimeISA), sizeof(QuickEntryPoints));
 }
 
 TEST_F(OatTest, OatHeaderIsValid) {
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index aafd801..e197ccd 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -20,6 +20,7 @@
 #include "base/bit_field.h"
 #include "globals.h"
 #include "instruction_set.h"
+#include "locations.h"
 #include "memory_region.h"
 #include "nodes.h"
 #include "utils/assembler.h"
@@ -46,267 +47,6 @@
   uintptr_t native_pc;
 };
 
-/**
- * A Location is an abstraction over the potential location
- * of an instruction. It could be in register or stack.
- */
-class Location : public ValueObject {
- public:
-  enum Kind {
-    kInvalid = 0,
-    kStackSlot = 1,  // Word size slot.
-    kDoubleStackSlot = 2,  // 64bit stack slot.
-    kRegister = 3,
-    // On 32bits architectures, quick can pass a long where the
-    // low bits are in the last parameter register, and the high
-    // bits are in a stack slot. The kQuickParameter kind is for
-    // handling this special case.
-    kQuickParameter = 4,
-
-    // Unallocated location represents a location that is not fixed and can be
-    // allocated by a register allocator.  Each unallocated location has
-    // a policy that specifies what kind of location is suitable. Payload
-    // contains register allocation policy.
-    kUnallocated = 5,
-  };
-
-  Location() : value_(kInvalid) {
-    DCHECK(!IsValid());
-  }
-
-  Location(const Location& other) : ValueObject(), value_(other.value_) {}
-
-  Location& operator=(const Location& other) {
-    value_ = other.value_;
-    return *this;
-  }
-
-  bool IsValid() const {
-    return value_ != kInvalid;
-  }
-
-  // Register locations.
-  static Location RegisterLocation(ManagedRegister reg) {
-    return Location(kRegister, reg.RegId());
-  }
-
-  bool IsRegister() const {
-    return GetKind() == kRegister;
-  }
-
-  ManagedRegister reg() const {
-    DCHECK(IsRegister());
-    return static_cast<ManagedRegister>(GetPayload());
-  }
-
-  static uword EncodeStackIndex(intptr_t stack_index) {
-    DCHECK(-kStackIndexBias <= stack_index);
-    DCHECK(stack_index < kStackIndexBias);
-    return static_cast<uword>(kStackIndexBias + stack_index);
-  }
-
-  static Location StackSlot(intptr_t stack_index) {
-    uword payload = EncodeStackIndex(stack_index);
-    Location loc(kStackSlot, payload);
-    // Ensure that sign is preserved.
-    DCHECK_EQ(loc.GetStackIndex(), stack_index);
-    return loc;
-  }
-
-  bool IsStackSlot() const {
-    return GetKind() == kStackSlot;
-  }
-
-  static Location DoubleStackSlot(intptr_t stack_index) {
-    uword payload = EncodeStackIndex(stack_index);
-    Location loc(kDoubleStackSlot, payload);
-    // Ensure that sign is preserved.
-    DCHECK_EQ(loc.GetStackIndex(), stack_index);
-    return loc;
-  }
-
-  bool IsDoubleStackSlot() const {
-    return GetKind() == kDoubleStackSlot;
-  }
-
-  intptr_t GetStackIndex() const {
-    DCHECK(IsStackSlot() || IsDoubleStackSlot());
-    // Decode stack index manually to preserve sign.
-    return GetPayload() - kStackIndexBias;
-  }
-
-  intptr_t GetHighStackIndex(uintptr_t word_size) const {
-    DCHECK(IsDoubleStackSlot());
-    // Decode stack index manually to preserve sign.
-    return GetPayload() - kStackIndexBias + word_size;
-  }
-
-  static Location QuickParameter(uint32_t parameter_index) {
-    return Location(kQuickParameter, parameter_index);
-  }
-
-  uint32_t GetQuickParameterIndex() const {
-    DCHECK(IsQuickParameter());
-    return GetPayload();
-  }
-
-  bool IsQuickParameter() const {
-    return GetKind() == kQuickParameter;
-  }
-
-  arm::ArmManagedRegister AsArm() const;
-  x86::X86ManagedRegister AsX86() const;
-
-  Kind GetKind() const {
-    return KindField::Decode(value_);
-  }
-
-  bool Equals(Location other) const {
-    return value_ == other.value_;
-  }
-
-  const char* DebugString() const {
-    switch (GetKind()) {
-      case kInvalid: return "?";
-      case kRegister: return "R";
-      case kStackSlot: return "S";
-      case kDoubleStackSlot: return "DS";
-      case kQuickParameter: return "Q";
-      case kUnallocated: return "U";
-    }
-    return "?";
-  }
-
-  // Unallocated locations.
-  enum Policy {
-    kAny,
-    kRequiresRegister,
-    kSameAsFirstInput,
-  };
-
-  bool IsUnallocated() const {
-    return GetKind() == kUnallocated;
-  }
-
-  static Location UnallocatedLocation(Policy policy) {
-    return Location(kUnallocated, PolicyField::Encode(policy));
-  }
-
-  // Any free register is suitable to replace this unallocated location.
-  static Location Any() {
-    return UnallocatedLocation(kAny);
-  }
-
-  static Location RequiresRegister() {
-    return UnallocatedLocation(kRequiresRegister);
-  }
-
-  // The location of the first input to the instruction will be
-  // used to replace this unallocated location.
-  static Location SameAsFirstInput() {
-    return UnallocatedLocation(kSameAsFirstInput);
-  }
-
-  Policy GetPolicy() const {
-    DCHECK(IsUnallocated());
-    return PolicyField::Decode(GetPayload());
-  }
-
-  uword GetEncoding() const {
-    return GetPayload();
-  }
-
- private:
-  // Number of bits required to encode Kind value.
-  static constexpr uint32_t kBitsForKind = 4;
-  static constexpr uint32_t kBitsForPayload = kWordSize * kBitsPerByte - kBitsForKind;
-
-  explicit Location(uword value) : value_(value) {}
-
-  Location(Kind kind, uword payload)
-      : value_(KindField::Encode(kind) | PayloadField::Encode(payload)) {}
-
-  uword GetPayload() const {
-    return PayloadField::Decode(value_);
-  }
-
-  typedef BitField<Kind, 0, kBitsForKind> KindField;
-  typedef BitField<uword, kBitsForKind, kBitsForPayload> PayloadField;
-
-  // Layout for kUnallocated locations payload.
-  typedef BitField<Policy, 0, 3> PolicyField;
-
-  // Layout for stack slots.
-  static const intptr_t kStackIndexBias =
-      static_cast<intptr_t>(1) << (kBitsForPayload - 1);
-
-  // Location either contains kind and payload fields or a tagged handle for
-  // a constant locations. Values of enumeration Kind are selected in such a
-  // way that none of them can be interpreted as a kConstant tag.
-  uword value_;
-};
-
-/**
- * The code generator computes LocationSummary for each instruction so that
- * the instruction itself knows what code to generate: where to find the inputs
- * and where to place the result.
- *
- * The intent is to have the code for generating the instruction independent of
- * register allocation. A register allocator just has to provide a LocationSummary.
- */
-class LocationSummary : public ArenaObject {
- public:
-  explicit LocationSummary(HInstruction* instruction)
-      : inputs_(instruction->GetBlock()->GetGraph()->GetArena(), instruction->InputCount()),
-        temps_(instruction->GetBlock()->GetGraph()->GetArena(), 0) {
-    inputs_.SetSize(instruction->InputCount());
-    for (size_t i = 0; i < instruction->InputCount(); i++) {
-      inputs_.Put(i, Location());
-    }
-  }
-
-  void SetInAt(uint32_t at, Location location) {
-    inputs_.Put(at, location);
-  }
-
-  Location InAt(uint32_t at) const {
-    return inputs_.Get(at);
-  }
-
-  size_t GetInputCount() const {
-    return inputs_.Size();
-  }
-
-  void SetOut(Location location) {
-    output_ = Location(location);
-  }
-
-  void AddTemp(Location location) {
-    temps_.Add(location);
-  }
-
-  Location GetTemp(uint32_t at) const {
-    return temps_.Get(at);
-  }
-
-  void SetTempAt(uint32_t at, Location location) {
-    temps_.Put(at, location);
-  }
-
-  size_t GetTempCount() const {
-    return temps_.Size();
-  }
-
-  Location Out() const { return output_; }
-
- private:
-  GrowableArray<Location> inputs_;
-  GrowableArray<Location> temps_;
-  Location output_;
-
-  DISALLOW_COPY_AND_ASSIGN(LocationSummary);
-};
-
 class CodeGenerator : public ArenaObject {
  public:
   // Compiles the graph to executable instructions. Returns whether the compilation
@@ -334,6 +74,13 @@
   void SetFrameSize(uint32_t size) { frame_size_ = size; }
   uint32_t GetCoreSpillMask() const { return core_spill_mask_; }
 
+  virtual size_t GetNumberOfCoreRegisters() const = 0;
+  virtual size_t GetNumberOfFloatingPointRegisters() const = 0;
+  virtual size_t GetNumberOfRegisters() const = 0;
+  virtual void SetupBlockedRegisters(bool* blocked_registers) const = 0;
+  virtual void DumpCoreRegister(std::ostream& stream, int reg) const = 0;
+  virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const = 0;
+
   void RecordPcInfo(uint32_t dex_pc) {
     struct PcInfo pc_info;
     pc_info.dex_pc = dex_pc;
@@ -352,8 +99,7 @@
         graph_(graph),
         block_labels_(graph->GetArena(), 0),
         pc_infos_(graph->GetArena(), 32),
-        blocked_registers_(static_cast<bool*>(
-            graph->GetArena()->Alloc(number_of_registers * sizeof(bool), kArenaAllocData))) {
+        blocked_registers_(graph->GetArena()->AllocArray<bool>(number_of_registers)) {
     block_labels_.SetSize(graph->GetBlocks().Size());
   }
   ~CodeGenerator() { }
@@ -369,9 +115,6 @@
   // the first available register.
   size_t AllocateFreeRegisterInternal(bool* blocked_registers, size_t number_of_registers) const;
 
-  virtual void SetupBlockedRegisters(bool* blocked_registers) const = 0;
-  virtual size_t GetNumberOfRegisters() const = 0;
-
   virtual Location GetStackLocation(HLoadLocal* load) const = 0;
 
   // Frame size required for this method.
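Note on the new pure virtuals above: they let backend-agnostic code (the graph visualizer below, and the register allocator added in this change) query register counts and print register names without knowing the concrete backend. A minimal sketch of a consumer, using only the methods declared here (the helper itself is illustrative, not part of the patch):

    // Print every core register name the backend reports, e.g. for debug output.
    void DumpAllCoreRegisters(std::ostream& stream, const CodeGenerator& codegen) {
      for (size_t reg = 0; reg < codegen.GetNumberOfCoreRegisters(); ++reg) {
        codegen.DumpCoreRegister(stream, static_cast<int>(reg));
        stream << " ";
      }
    }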
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index be51232..ed3f43c 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -37,6 +37,14 @@
 static constexpr int kNumberOfPushedRegistersAtEntry = 1;
 static constexpr int kCurrentMethodStackOffset = 0;
 
+void CodeGeneratorARM::DumpCoreRegister(std::ostream& stream, int reg) const {
+  stream << ArmManagedRegister::FromCoreRegister(Register(reg));
+}
+
+void CodeGeneratorARM::DumpFloatingPointRegister(std::ostream& stream, int reg) const {
+  stream << ArmManagedRegister::FromDRegister(DRegister(reg));
+}
+
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph)
     : CodeGenerator(graph, kNumberOfRegIds),
       location_builder_(graph, this),
@@ -793,5 +801,13 @@
   LOG(FATAL) << "Unimplemented";
 }
 
+void LocationsBuilderARM::VisitParallelMove(HParallelMove* instruction) {
+  LOG(FATAL) << "Unimplemented";
+}
+
+void InstructionCodeGeneratorARM::VisitParallelMove(HParallelMove* instruction) {
+  LOG(FATAL) << "Unimplemented";
+}
+
 }  // namespace arm
 }  // namespace art
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 2405d4b..423b13e 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -133,6 +133,17 @@
   int32_t GetStackSlot(HLocal* local) const;
   virtual Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
 
+  virtual size_t GetNumberOfCoreRegisters() const OVERRIDE {
+    return kNumberOfCoreRegisters;
+  }
+
+  virtual size_t GetNumberOfFloatingPointRegisters() const OVERRIDE {
+    return kNumberOfDRegisters;
+  }
+
+  virtual void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
+  virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
+
  private:
   // Helper method to move a 32bits value between two locations.
   void Move32(Location destination, Location source);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index e4f95c7..8bfd8d6 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -37,6 +37,14 @@
 static constexpr int kNumberOfPushedRegistersAtEntry = 1;
 static constexpr int kCurrentMethodStackOffset = 0;
 
+void CodeGeneratorX86::DumpCoreRegister(std::ostream& stream, int reg) const {
+  stream << X86ManagedRegister::FromCpuRegister(Register(reg));
+}
+
+void CodeGeneratorX86::DumpFloatingPointRegister(std::ostream& stream, int reg) const {
+  stream << X86ManagedRegister::FromXmmRegister(XmmRegister(reg));
+}
+
 CodeGeneratorX86::CodeGeneratorX86(HGraph* graph)
     : CodeGenerator(graph, kNumberOfRegIds),
       location_builder_(graph, this),
@@ -813,5 +821,13 @@
   LOG(FATAL) << "Unimplemented";
 }
 
+void LocationsBuilderX86::VisitParallelMove(HParallelMove* instruction) {
+  LOG(FATAL) << "Unimplemented";
+}
+
+void InstructionCodeGeneratorX86::VisitParallelMove(HParallelMove* instruction) {
+  LOG(FATAL) << "Unimplemented";
+}
+
 }  // namespace x86
 }  // namespace art
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 1ee11bf..4a70636 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -134,6 +134,17 @@
   int32_t GetStackSlot(HLocal* local) const;
   virtual Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
 
+  virtual size_t GetNumberOfCoreRegisters() const OVERRIDE {
+    return kNumberOfCpuRegisters;
+  }
+
+  virtual size_t GetNumberOfFloatingPointRegisters() const OVERRIDE {
+    return kNumberOfXmmRegisters;
+  }
+
+  virtual void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
+  virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
+
  private:
   // Helper method to move a 32bits value between two locations.
   void Move32(Location destination, Location source);
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 52e3e37..5c5042e 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -16,6 +16,7 @@
 
 #include "graph_visualizer.h"
 
+#include "code_generator.h"
 #include "driver/dex_compilation_unit.h"
 #include "nodes.h"
 #include "ssa_liveness_analysis.h"
@@ -27,8 +28,8 @@
  */
 class HGraphVisualizerPrinter : public HGraphVisitor {
  public:
-  HGraphVisualizerPrinter(HGraph* graph, std::ostream& output)
-      : HGraphVisitor(graph), output_(output), indent_(0) {}
+  HGraphVisualizerPrinter(HGraph* graph, std::ostream& output, const CodeGenerator& codegen)
+      : HGraphVisitor(graph), output_(output), codegen_(codegen), indent_(0) {}
 
   void StartTag(const char* name) {
     AddIndent();
@@ -107,17 +108,18 @@
       output_ << " (liveness: " << instruction->GetLifetimePosition();
       if (instruction->HasLiveInterval()) {
         output_ << " ";
-        const GrowableArray<LiveRange>& ranges = instruction->GetLiveInterval()->GetRanges();
-        size_t i = ranges.Size() - 1;
-        do {
-          output_ << "[" << ranges.Get(i).GetStart() << "," << ranges.Get(i).GetEnd() << "[";
-          if (i == 0) {
-            break;
+        const LiveInterval& interval = *instruction->GetLiveInterval();
+        interval.Dump(output_);
+        if (interval.HasRegister()) {
+          int reg = interval.GetRegister();
+          output_ << " ";
+          if (instruction->GetType() == Primitive::kPrimFloat
+              || instruction->GetType() == Primitive::kPrimDouble) {
+            codegen_.DumpFloatingPointRegister(output_, reg);
           } else {
-            --i;
-            output_ << ",";
+            codegen_.DumpCoreRegister(output_, reg);
           }
-        } while (true);
+        }
       }
       output_ << ")";
     }
@@ -186,6 +188,7 @@
 
  private:
   std::ostream& output_;
+  const CodeGenerator& codegen_;
   size_t indent_;
 
   DISALLOW_COPY_AND_ASSIGN(HGraphVisualizerPrinter);
@@ -194,8 +197,9 @@
 HGraphVisualizer::HGraphVisualizer(std::ostream* output,
                                    HGraph* graph,
                                    const char* string_filter,
+                                   const CodeGenerator& codegen,
                                    const DexCompilationUnit& cu)
-    : output_(output), graph_(graph), is_enabled_(false) {
+    : output_(output), graph_(graph), codegen_(codegen), is_enabled_(false) {
   if (output == nullptr) {
     return;
   }
@@ -205,7 +209,7 @@
   }
 
   is_enabled_ = true;
-  HGraphVisualizerPrinter printer(graph, *output_);
+  HGraphVisualizerPrinter printer(graph, *output_, codegen_);
   printer.StartTag("compilation");
   printer.PrintProperty("name", pretty_name.c_str());
   printer.PrintProperty("method", pretty_name.c_str());
@@ -215,14 +219,15 @@
 
 HGraphVisualizer::HGraphVisualizer(std::ostream* output,
                                    HGraph* graph,
+                                   const CodeGenerator& codegen,
                                    const char* name)
-    : output_(output), graph_(graph), is_enabled_(false) {
+    : output_(output), graph_(graph), codegen_(codegen), is_enabled_(false) {
   if (output == nullptr) {
     return;
   }
 
   is_enabled_ = true;
-  HGraphVisualizerPrinter printer(graph, *output_);
+  HGraphVisualizerPrinter printer(graph, *output_, codegen_);
   printer.StartTag("compilation");
   printer.PrintProperty("name", name);
   printer.PrintProperty("method", name);
@@ -234,7 +239,7 @@
   if (!is_enabled_) {
     return;
   }
-  HGraphVisualizerPrinter printer(graph_, *output_);
+  HGraphVisualizerPrinter printer(graph_, *output_, codegen_);
   printer.Run(pass_name);
 }
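For reference, the unit-test constructor now takes a code generator as well: a visualizer can no longer be built from a graph alone, since register names are backend-specific. A hedged usage sketch (assuming `graph` and `codegen` are set up as in the existing codegen tests):

    std::ofstream out("graph.cfg");
    // Third argument is new: any concrete CodeGenerator (e.g. the x86 one).
    HGraphVisualizer visualizer(&out, graph, *codegen, "test-method");
    visualizer.DumpGraph("builder");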
 
diff --git a/compiler/optimizing/graph_visualizer.h b/compiler/optimizing/graph_visualizer.h
index 2b88e65..2638cf5 100644
--- a/compiler/optimizing/graph_visualizer.h
+++ b/compiler/optimizing/graph_visualizer.h
@@ -21,6 +21,7 @@
 
 namespace art {
 
+class CodeGenerator;
 class DexCompilationUnit;
 class HGraph;
 
@@ -39,13 +40,17 @@
   HGraphVisualizer(std::ostream* output,
                    HGraph* graph,
                    const char* string_filter,
+                   const CodeGenerator& codegen,
                    const DexCompilationUnit& cu);
 
   /**
    * Version of `HGraphVisualizer` for unit testing, that is when a
    * `DexCompilationUnit` is not available.
    */
-  HGraphVisualizer(std::ostream* output, HGraph* graph, const char* name);
+  HGraphVisualizer(std::ostream* output,
+                   HGraph* graph,
+                   const CodeGenerator& codegen,
+                   const char* name);
 
   /**
    * If this visualizer is enabled, emit the compilation information
@@ -56,6 +61,7 @@
  private:
   std::ostream* const output_;
   HGraph* const graph_;
+  const CodeGenerator& codegen_;
 
   // Is true when `output_` is not null, and the compiled method's name
   // contains the string_filter given in the constructor.
diff --git a/compiler/optimizing/live_interval_test.cc b/compiler/optimizing/live_interval_test.cc
new file mode 100644
index 0000000..3e4b83b
--- /dev/null
+++ b/compiler/optimizing/live_interval_test.cc
@@ -0,0 +1,281 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "optimizing_unit_test.h"
+#include "ssa_liveness_analysis.h"
+#include "utils/arena_allocator.h"
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+TEST(LiveInterval, GetStart) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  {
+    static constexpr size_t ranges[][2] = {{0, 42}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    ASSERT_EQ(0u, interval->GetStart());
+  }
+
+  {
+    static constexpr size_t ranges[][2] = {{4, 12}, {14, 16}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    ASSERT_EQ(4u, interval->GetStart());
+  }
+}
+
+TEST(LiveInterval, IsDeadAt) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  {
+    static constexpr size_t ranges[][2] = {{0, 42}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    ASSERT_TRUE(interval->IsDeadAt(42));
+    ASSERT_TRUE(interval->IsDeadAt(43));
+    ASSERT_FALSE(interval->IsDeadAt(41));
+    ASSERT_FALSE(interval->IsDeadAt(0));
+    ASSERT_FALSE(interval->IsDeadAt(22));
+  }
+
+  {
+    static constexpr size_t ranges[][2] = {{4, 12}, {14, 16}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    ASSERT_TRUE(interval->IsDeadAt(16));
+    ASSERT_TRUE(interval->IsDeadAt(32));
+    ASSERT_FALSE(interval->IsDeadAt(0));
+    ASSERT_FALSE(interval->IsDeadAt(4));
+    ASSERT_FALSE(interval->IsDeadAt(12));
+    ASSERT_FALSE(interval->IsDeadAt(13));
+    ASSERT_FALSE(interval->IsDeadAt(14));
+    ASSERT_FALSE(interval->IsDeadAt(15));
+  }
+}
+
+TEST(LiveInterval, Covers) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  {
+    static constexpr size_t ranges[][2] = {{0, 42}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    ASSERT_TRUE(interval->Covers(0));
+    ASSERT_TRUE(interval->Covers(4));
+    ASSERT_TRUE(interval->Covers(41));
+    ASSERT_FALSE(interval->Covers(42));
+    ASSERT_FALSE(interval->Covers(54));
+  }
+
+  {
+    static constexpr size_t ranges[][2] = {{4, 12}, {14, 16}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    ASSERT_TRUE(interval->Covers(4));
+    ASSERT_TRUE(interval->Covers(11));
+    ASSERT_TRUE(interval->Covers(14));
+    ASSERT_TRUE(interval->Covers(15));
+    ASSERT_FALSE(interval->Covers(0));
+    ASSERT_FALSE(interval->Covers(12));
+    ASSERT_FALSE(interval->Covers(13));
+    ASSERT_FALSE(interval->Covers(16));
+  }
+}
+
+TEST(LiveInterval, FirstIntersectionWith) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  {
+    static constexpr size_t ranges1[][2] = {{0, 4}, {8, 10}};
+    LiveInterval* interval1 = BuildInterval(ranges1, arraysize(ranges1), &allocator);
+    static constexpr size_t ranges2[][2] = {{5, 6}};
+    LiveInterval* interval2 = BuildInterval(ranges2, arraysize(ranges2), &allocator);
+
+    ASSERT_EQ(kNoLifetime, interval1->FirstIntersectionWith(interval2));
+  }
+
+  {
+    static constexpr size_t ranges1[][2] = {{0, 4}, {8, 10}};
+    LiveInterval* interval1 = BuildInterval(ranges1, arraysize(ranges1), &allocator);
+    static constexpr size_t ranges2[][2] = {{5, 42}};
+    LiveInterval* interval2 = BuildInterval(ranges2, arraysize(ranges2), &allocator);
+
+    ASSERT_EQ(8u, interval1->FirstIntersectionWith(interval2));
+  }
+
+  {
+    static constexpr size_t ranges1[][2] = {{0, 4}, {8, 10}};
+    LiveInterval* interval1 = BuildInterval(ranges1, arraysize(ranges1), &allocator);
+    static constexpr size_t ranges2[][2] = {{5, 6}, {7, 8}, {11, 12}};
+    LiveInterval* interval2 = BuildInterval(ranges2, arraysize(ranges2), &allocator);
+
+    ASSERT_EQ(kNoLifetime, interval1->FirstIntersectionWith(interval2));
+  }
+
+  {
+    static constexpr size_t ranges1[][2] = {{0, 4}, {8, 10}};
+    LiveInterval* interval1 = BuildInterval(ranges1, arraysize(ranges1), &allocator);
+    static constexpr size_t ranges2[][2] = {{5, 6}, {7, 8}, {9, 10}};
+    LiveInterval* interval2 = BuildInterval(ranges2, arraysize(ranges2), &allocator);
+
+    ASSERT_EQ(9u, interval1->FirstIntersectionWith(interval2));
+  }
+
+  {
+    static constexpr size_t ranges1[][2] = {{0, 1}, {2, 7}, {8, 10}};
+    LiveInterval* interval1 = BuildInterval(ranges1, arraysize(ranges1), &allocator);
+    static constexpr size_t ranges2[][2] = {{1, 2}, {6, 7}, {9, 10}};
+    LiveInterval* interval2 = BuildInterval(ranges2, arraysize(ranges2), &allocator);
+
+    ASSERT_EQ(6u, interval1->FirstIntersectionWith(interval2));
+  }
+
+  {
+    static constexpr size_t ranges1[][2] = {{0, 1}, {2, 8}, {55, 58}};
+    LiveInterval* interval1 = BuildInterval(ranges1, arraysize(ranges1), &allocator);
+    static constexpr size_t ranges2[][2] = {{1, 2}, {11, 42}, {43, 48}, {54, 56}};
+    LiveInterval* interval2 = BuildInterval(ranges2, arraysize(ranges2), &allocator);
+
+    ASSERT_EQ(55u, interval1->FirstIntersectionWith(interval2));
+  }
+
+  {
+    static constexpr size_t ranges1[][2] = {{0, 1}, {2, 8}, {15, 18}, {27, 32}, {41, 53}, {54, 60}};
+    LiveInterval* interval1 = BuildInterval(ranges1, arraysize(ranges1), &allocator);
+    static constexpr size_t ranges2[][2] = {{1, 2}, {11, 12}, {19, 25}, {34, 42}, {52, 60}};
+    LiveInterval* interval2 = BuildInterval(ranges2, arraysize(ranges2), &allocator);
+
+    ASSERT_EQ(41u, interval1->FirstIntersectionWith(interval2));
+  }
+}
+
+static bool RangesEquals(LiveInterval* interval,
+                         const size_t expected[][2],
+                         size_t number_of_expected_ranges) {
+  LiveRange* current = interval->GetFirstRange();
+
+  size_t i = 0;
+  for (;
+       i < number_of_expected_ranges && current != nullptr;
+       ++i, current = current->GetNext()) {
+    if (expected[i][0] != current->GetStart()) {
+      return false;
+    }
+    if (expected[i][1] != current->GetEnd()) {
+      return false;
+    }
+  }
+
+  if (current != nullptr || i != number_of_expected_ranges) {
+    return false;
+  }
+
+  return true;
+}
+
+TEST(LiveInterval, SplitAt) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  {
+    // Test within one range.
+    static constexpr size_t ranges[][2] = {{0, 4}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    LiveInterval* split = interval->SplitAt(1);
+    static constexpr size_t expected[][2] = {{0, 1}};
+    ASSERT_TRUE(RangesEquals(interval, expected, arraysize(expected)));
+    static constexpr size_t expected_split[][2] = {{1, 4}};
+    ASSERT_TRUE(RangesEquals(split, expected_split, arraysize(expected_split)));
+  }
+
+  {
+    // Test just before the end of one range.
+    static constexpr size_t ranges[][2] = {{0, 4}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    LiveInterval* split = interval->SplitAt(3);
+    static constexpr size_t expected[][2] = {{0, 3}};
+    ASSERT_TRUE(RangesEquals(interval, expected, arraysize(expected)));
+    static constexpr size_t expected_split[][2] = {{3, 4}};
+    ASSERT_TRUE(RangesEquals(split, expected_split, arraysize(expected_split)));
+  }
+
+  {
+    // Test within the first range.
+    static constexpr size_t ranges[][2] = {{0, 4}, {8, 12}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    LiveInterval* split = interval->SplitAt(1);
+    static constexpr size_t expected[][2] = {{0, 1}};
+    ASSERT_TRUE(RangesEquals(interval, expected, arraysize(expected)));
+    static constexpr size_t expected_split[][2] = {{1, 4}, {8, 12}};
+    ASSERT_TRUE(RangesEquals(split, expected_split, arraysize(expected_split)));
+  }
+
+  {
+    // Test in a hole.
+    static constexpr size_t ranges[][2] = {{0, 4}, {8, 12}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    LiveInterval* split = interval->SplitAt(5);
+    static constexpr size_t expected[][2] = {{0, 4}};
+    ASSERT_TRUE(RangesEquals(interval, expected, arraysize(expected)));
+    static constexpr size_t expected_split[][2] = {{8, 12}};
+    ASSERT_TRUE(RangesEquals(split, expected_split, arraysize(expected_split)));
+  }
+
+  {
+    // Test within the second range.
+    static constexpr size_t ranges[][2] = {{0, 4}, {8, 12}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    LiveInterval* split = interval->SplitAt(9);
+    static constexpr size_t expected[][2] = {{0, 4}, {8, 9}};
+    ASSERT_TRUE(RangesEquals(interval, expected, arraysize(expected)));
+    static constexpr size_t expected_split[][2] = {{9, 12}};
+    ASSERT_TRUE(RangesEquals(split, expected_split, arraysize(expected_split)));
+  }
+
+  {
+    // Test at the beginning of the second range.
+    static constexpr size_t ranges[][2] = {{0, 4}, {6, 10}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    LiveInterval* split = interval->SplitAt(6);
+    static constexpr size_t expected[][2] = {{0, 4}};
+    ASSERT_TRUE(RangesEquals(interval, expected, arraysize(expected)));
+    static constexpr size_t expected_split[][2] = {{6, 10}};
+    ASSERT_TRUE(RangesEquals(split, expected_split, arraysize(expected_split)));
+  }
+
+  {
+    // Test at the end of the first range.
+    static constexpr size_t ranges[][2] = {{0, 4}, {6, 10}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    LiveInterval* split = interval->SplitAt(4);
+    static constexpr size_t expected[][2] = {{0, 4}};
+    ASSERT_TRUE(RangesEquals(interval, expected, arraysize(expected)));
+    static constexpr size_t expected_split[][2] = {{6, 10}};
+    ASSERT_TRUE(RangesEquals(split, expected_split, arraysize(expected_split)));
+  }
+
+  {
+    // Test that we get null if we split at a position where the interval is dead.
+    static constexpr size_t ranges[][2] = {{0, 4}};
+    LiveInterval* interval = BuildInterval(ranges, arraysize(ranges), &allocator);
+    LiveInterval* split = interval->SplitAt(5);
+    ASSERT_TRUE(split == nullptr);
+    ASSERT_TRUE(RangesEquals(interval, ranges, arraysize(ranges)));
+  }
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/live_ranges_test.cc b/compiler/optimizing/live_ranges_test.cc
index 9849388..c797497 100644
--- a/compiler/optimizing/live_ranges_test.cc
+++ b/compiler/optimizing/live_ranges_test.cc
@@ -43,11 +43,11 @@
    *
    * Which becomes the following graph (numbered by lifetime position):
    *       2: constant0
-   *       3: goto
+   *       4: goto
    *           |
-   *       6: return
+   *       8: return
    *           |
-   *       9: exit
+   *       12: exit
    */
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
@@ -60,14 +60,14 @@
   liveness.Analyze();
 
   LiveInterval* interval = liveness.GetInstructionFromSsaIndex(0)->GetLiveInterval();
-  ASSERT_EQ(1u, interval->GetRanges().Size());
-  LiveRange range = interval->GetRanges().Get(0);
-  ASSERT_EQ(2u, range.GetStart());
+  LiveRange* range = interval->GetFirstRange();
+  ASSERT_EQ(2u, range->GetStart());
   // Last use is the return instruction.
-  ASSERT_EQ(6u, range.GetEnd());
+  ASSERT_EQ(8u, range->GetEnd());
   HBasicBlock* block = graph->GetBlocks().Get(1);
   ASSERT_TRUE(block->GetLastInstruction()->AsReturn() != nullptr);
-  ASSERT_EQ(6u, block->GetLastInstruction()->GetLifetimePosition());
+  ASSERT_EQ(8u, block->GetLastInstruction()->GetLifetimePosition());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 }
 
 TEST(LiveRangesTest, CFG2) {
@@ -81,16 +81,16 @@
    *
    * Which becomes the following graph (numbered by lifetime position):
    *       2: constant0
-   *       3: goto
+   *       4: goto
    *           |
-   *       6: equal
-   *       7: if
+   *       8: equal
+   *       10: if
    *       /       \
-   *   10: goto   13: goto
+   *   14: goto   18: goto
    *       \       /
-   *       16: return
+   *       22: return
    *         |
-   *       19: exit
+   *       26: exit
    */
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
@@ -105,14 +105,14 @@
   liveness.Analyze();
 
   LiveInterval* interval = liveness.GetInstructionFromSsaIndex(0)->GetLiveInterval();
-  ASSERT_EQ(1u, interval->GetRanges().Size());
-  LiveRange range = interval->GetRanges().Get(0);
-  ASSERT_EQ(2u, range.GetStart());
+  LiveRange* range = interval->GetFirstRange();
+  ASSERT_EQ(2u, range->GetStart());
   // Last use is the return instruction.
-  ASSERT_EQ(16u, range.GetEnd());
+  ASSERT_EQ(22u, range->GetEnd());
   HBasicBlock* block = graph->GetBlocks().Get(3);
   ASSERT_TRUE(block->GetLastInstruction()->AsReturn() != nullptr);
-  ASSERT_EQ(16u, block->GetLastInstruction()->GetLifetimePosition());
+  ASSERT_EQ(22u, block->GetLastInstruction()->GetLifetimePosition());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 }
 
 TEST(LiveRangesTest, CFG3) {
@@ -127,18 +127,18 @@
    *
    * Which becomes the following graph (numbered by lifetime position):
    *       2: constant0
-   *       3: constant4
-   *       4: goto
+   *       4: constant4
+   *       6: goto
    *           |
-   *       7: equal
-   *       8: if
+   *       10: equal
+   *       12: if
    *       /       \
-   *   11: goto   14: goto
+   *   16: goto   20: goto
    *       \       /
-   *       16: phi
-   *       17: return
+   *       22: phi
+   *       24: return
    *         |
-   *       20: exit
+   *       28: exit
    */
   const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
     Instruction::CONST_4 | 0 | 0,
@@ -154,34 +154,34 @@
 
   // Test for the 0 constant.
   LiveInterval* interval = liveness.GetInstructionFromSsaIndex(0)->GetLiveInterval();
-  ASSERT_EQ(1u, interval->GetRanges().Size());
-  LiveRange range = interval->GetRanges().Get(0);
-  ASSERT_EQ(2u, range.GetStart());
+  LiveRange* range = interval->GetFirstRange();
+  ASSERT_EQ(2u, range->GetStart());
   // Last use is the phi at the return block so instruction is live until
   // the end of the then block.
-  ASSERT_EQ(12u, range.GetEnd());
+  ASSERT_EQ(18u, range->GetEnd());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 
   // Test for the 4 constant.
   interval = liveness.GetInstructionFromSsaIndex(1)->GetLiveInterval();
   // The then branch is a hole for this constant, therefore its interval has 2 ranges.
-  ASSERT_EQ(2u, interval->GetRanges().Size());
-  // First range is the else block.
-  range = interval->GetRanges().Get(0);
-  ASSERT_EQ(13u, range.GetStart());
-  // Last use is the phi at the return block.
-  ASSERT_EQ(15u, range.GetEnd());
-  // Second range starts from the definition and ends at the if block.
-  range = interval->GetRanges().Get(1);
-  ASSERT_EQ(3u, range.GetStart());
+  // First range starts from the definition and ends at the if block.
+  range = interval->GetFirstRange();
+  ASSERT_EQ(4u, range->GetStart());
-  // 9 is the end of the if block.
+  // 14 is the end of the if block.
-  ASSERT_EQ(9u, range.GetEnd());
+  ASSERT_EQ(14u, range->GetEnd());
+  // Second range is the else block.
+  range = range->GetNext();
+  ASSERT_EQ(18u, range->GetStart());
+  // Last use is the phi at the return block.
+  ASSERT_EQ(22u, range->GetEnd());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 
   // Test for the phi.
   interval = liveness.GetInstructionFromSsaIndex(3)->GetLiveInterval();
-  ASSERT_EQ(1u, interval->GetRanges().Size());
-  range = interval->GetRanges().Get(0);
-  ASSERT_EQ(16u, range.GetStart());
-  ASSERT_EQ(17u, range.GetEnd());
+  range = interval->GetFirstRange();
+  ASSERT_EQ(22u, range->GetStart());
+  ASSERT_EQ(24u, range->GetEnd());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 }
 
 TEST(LiveRangesTest, Loop) {
@@ -195,21 +195,21 @@
    *
    * Which becomes the following graph (numbered by lifetime position):
    *       2: constant0
-   *       3: constant4
-   *       4: constant5
-   *       5: goto
-   *           |
+   *       4: constant4
+   *       6: constant5
    *       8: goto
    *           |
-   *       10: phi
-   *       11: equal
-   *       12: if +++++
+   *       12: goto
+   *           |
+   *       14: phi
+   *       16: equal
+   *       18: if +++++
    *        |       \ +
-   *        |     15: goto
+   *        |     22: goto
    *        |
-   *       18: return
+   *       26: return
    *         |
-   *       21: exit
+   *       30: exit
    */
 
   const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(
@@ -228,36 +228,36 @@
 
   // Test for the 0 constant.
   LiveInterval* interval = liveness.GetInstructionFromSsaIndex(0)->GetLiveInterval();
-  ASSERT_EQ(1u, interval->GetRanges().Size());
-  LiveRange range = interval->GetRanges().Get(0);
-  ASSERT_EQ(2u, range.GetStart());
+  LiveRange* range = interval->GetFirstRange();
+  ASSERT_EQ(2u, range->GetStart());
   // Last use is the loop phi so instruction is live until
   // the end of the pre loop header.
-  ASSERT_EQ(9u, range.GetEnd());
+  ASSERT_EQ(14u, range->GetEnd());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 
   // Test for the 4 constant.
   interval = liveness.GetInstructionFromSsaIndex(1)->GetLiveInterval();
+  range = interval->GetFirstRange();
   // The instruction is live until the end of the loop.
-  ASSERT_EQ(1u, interval->GetRanges().Size());
-  range = interval->GetRanges().Get(0);
-  ASSERT_EQ(3u, range.GetStart());
-  ASSERT_EQ(16u, range.GetEnd());
+  ASSERT_EQ(4u, range->GetStart());
+  ASSERT_EQ(24u, range->GetEnd());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 
   // Test for the 5 constant.
   interval = liveness.GetInstructionFromSsaIndex(2)->GetLiveInterval();
-  // The instruction is live until the return instruction of the loop.
-  ASSERT_EQ(1u, interval->GetRanges().Size());
-  range = interval->GetRanges().Get(0);
-  ASSERT_EQ(4u, range.GetStart());
-  ASSERT_EQ(18u, range.GetEnd());
+  range = interval->GetFirstRange();
+  // The instruction is live until the return instruction after the loop.
+  ASSERT_EQ(6u, range->GetStart());
+  ASSERT_EQ(26u, range->GetEnd());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 
   // Test for the phi.
   interval = liveness.GetInstructionFromSsaIndex(3)->GetLiveInterval();
-  ASSERT_EQ(1u, interval->GetRanges().Size());
-  range = interval->GetRanges().Get(0);
+  range = interval->GetFirstRange();
   // Instruction is consumed by the if.
-  ASSERT_EQ(10u, range.GetStart());
-  ASSERT_EQ(11u, range.GetEnd());
+  ASSERT_EQ(14u, range->GetStart());
+  ASSERT_EQ(16u, range->GetEnd());
+  ASSERT_TRUE(range->GetNext() == nullptr);
 }
 
 }  // namespace art
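A quick worked comparison of the renumbering these tests now expect, taken from the CFG1 diagram and assertions above:

    // Old -> new lifetime positions (CFG1):
    //   constant0: 2 -> 2
    //   goto:      3 -> 4
    //   return:    6 -> 8
    //   exit:      9 -> 12
    // Instructions now sit on even positions, two apart, with an extra gap at
    // block boundaries; a plausible reason (not stated in the patch) is to
    // leave the odd positions free for moves inserted by the register allocator.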
diff --git a/compiler/dex/bit_vector_block_iterator.cc b/compiler/optimizing/locations.cc
similarity index 61%
rename from compiler/dex/bit_vector_block_iterator.cc
rename to compiler/optimizing/locations.cc
index 32d7d71..98766d2 100644
--- a/compiler/dex/bit_vector_block_iterator.cc
+++ b/compiler/optimizing/locations.cc
@@ -1,4 +1,4 @@
-/*
+  /*
  * Copyright (C) 2014 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,19 +14,19 @@
  * limitations under the License.
  */
 
-#include "bit_vector_block_iterator.h"
-#include "mir_graph.h"
+#include "locations.h"
+
+#include "nodes.h"
 
 namespace art {
 
-BasicBlock* BitVectorBlockIterator::Next() {
-  int idx = internal_iterator_.Next();
-
-  if (idx == -1) {
-    return nullptr;
+LocationSummary::LocationSummary(HInstruction* instruction)
+    : inputs_(instruction->GetBlock()->GetGraph()->GetArena(), instruction->InputCount()),
+      temps_(instruction->GetBlock()->GetGraph()->GetArena(), 0) {
+  inputs_.SetSize(instruction->InputCount());
+  for (size_t i = 0; i < instruction->InputCount(); i++) {
+    inputs_.Put(i, Location());
   }
-
-  return mir_graph_->GetBasicBlock(idx);
 }
 
 }  // namespace art
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
new file mode 100644
index 0000000..3c60d3c
--- /dev/null
+++ b/compiler/optimizing/locations.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_LOCATIONS_H_
+#define ART_COMPILER_OPTIMIZING_LOCATIONS_H_
+
+#include "base/bit_field.h"
+#include "utils/allocation.h"
+#include "utils/growable_array.h"
+#include "utils/managed_register.h"
+
+namespace art {
+
+class HInstruction;
+
+/**
+ * A Location is an abstraction over the potential location
+ * of an instruction. It could be in register or stack.
+ */
+class Location : public ValueObject {
+ public:
+  enum Kind {
+    kInvalid = 0,
+    kStackSlot = 1,  // Word size slot.
+    kDoubleStackSlot = 2,  // 64bit stack slot.
+    kRegister = 3,
+    // On 32-bit architectures, quick can pass a long where the
+    // low bits are in the last parameter register, and the high
+    // bits are in a stack slot. The kQuickParameter kind is for
+    // handling this special case.
+    kQuickParameter = 4,
+
+    // Unallocated location represents a location that is not fixed and can be
+    // allocated by a register allocator.  Each unallocated location has
+    // a policy that specifies what kind of location is suitable. Payload
+    // contains register allocation policy.
+    kUnallocated = 5,
+  };
+
+  Location() : value_(kInvalid) {
+    DCHECK(!IsValid());
+  }
+
+  Location(const Location& other) : ValueObject(), value_(other.value_) {}
+
+  Location& operator=(const Location& other) {
+    value_ = other.value_;
+    return *this;
+  }
+
+  bool IsValid() const {
+    return value_ != kInvalid;
+  }
+
+  bool IsInvalid() const {
+    return !IsValid();
+  }
+
+  bool IsConstant() const {
+    // TODO: support constants.
+    return false;
+  }
+
+  // Empty location. Used when the location should be ignored.
+  static Location NoLocation() {
+    return Location();
+  }
+
+  // Register locations.
+  static Location RegisterLocation(ManagedRegister reg) {
+    return Location(kRegister, reg.RegId());
+  }
+
+  bool IsRegister() const {
+    return GetKind() == kRegister;
+  }
+
+  ManagedRegister reg() const {
+    DCHECK(IsRegister());
+    return static_cast<ManagedRegister>(GetPayload());
+  }
+
+  static uword EncodeStackIndex(intptr_t stack_index) {
+    DCHECK(-kStackIndexBias <= stack_index);
+    DCHECK(stack_index < kStackIndexBias);
+    return static_cast<uword>(kStackIndexBias + stack_index);
+  }
+
+  static Location StackSlot(intptr_t stack_index) {
+    uword payload = EncodeStackIndex(stack_index);
+    Location loc(kStackSlot, payload);
+    // Ensure that sign is preserved.
+    DCHECK_EQ(loc.GetStackIndex(), stack_index);
+    return loc;
+  }
+
+  bool IsStackSlot() const {
+    return GetKind() == kStackSlot;
+  }
+
+  static Location DoubleStackSlot(intptr_t stack_index) {
+    uword payload = EncodeStackIndex(stack_index);
+    Location loc(kDoubleStackSlot, payload);
+    // Ensure that sign is preserved.
+    DCHECK_EQ(loc.GetStackIndex(), stack_index);
+    return loc;
+  }
+
+  bool IsDoubleStackSlot() const {
+    return GetKind() == kDoubleStackSlot;
+  }
+
+  intptr_t GetStackIndex() const {
+    DCHECK(IsStackSlot() || IsDoubleStackSlot());
+    // Decode stack index manually to preserve sign.
+    return GetPayload() - kStackIndexBias;
+  }
+
+  intptr_t GetHighStackIndex(uintptr_t word_size) const {
+    DCHECK(IsDoubleStackSlot());
+    // Decode stack index manually to preserve sign.
+    return GetPayload() - kStackIndexBias + word_size;
+  }
+
+  static Location QuickParameter(uint32_t parameter_index) {
+    return Location(kQuickParameter, parameter_index);
+  }
+
+  uint32_t GetQuickParameterIndex() const {
+    DCHECK(IsQuickParameter());
+    return GetPayload();
+  }
+
+  bool IsQuickParameter() const {
+    return GetKind() == kQuickParameter;
+  }
+
+  arm::ArmManagedRegister AsArm() const;
+  x86::X86ManagedRegister AsX86() const;
+
+  Kind GetKind() const {
+    return KindField::Decode(value_);
+  }
+
+  bool Equals(Location other) const {
+    return value_ == other.value_;
+  }
+
+  const char* DebugString() const {
+    switch (GetKind()) {
+      case kInvalid: return "?";
+      case kRegister: return "R";
+      case kStackSlot: return "S";
+      case kDoubleStackSlot: return "DS";
+      case kQuickParameter: return "Q";
+      case kUnallocated: return "U";
+    }
+    return "?";
+  }
+
+  // Unallocated locations.
+  enum Policy {
+    kAny,
+    kRequiresRegister,
+    kSameAsFirstInput,
+  };
+
+  bool IsUnallocated() const {
+    return GetKind() == kUnallocated;
+  }
+
+  static Location UnallocatedLocation(Policy policy) {
+    return Location(kUnallocated, PolicyField::Encode(policy));
+  }
+
+  // Any free register is suitable to replace this unallocated location.
+  static Location Any() {
+    return UnallocatedLocation(kAny);
+  }
+
+  static Location RequiresRegister() {
+    return UnallocatedLocation(kRequiresRegister);
+  }
+
+  // The location of the first input to the instruction will be
+  // used to replace this unallocated location.
+  static Location SameAsFirstInput() {
+    return UnallocatedLocation(kSameAsFirstInput);
+  }
+
+  Policy GetPolicy() const {
+    DCHECK(IsUnallocated());
+    return PolicyField::Decode(GetPayload());
+  }
+
+  uword GetEncoding() const {
+    return GetPayload();
+  }
+
+ private:
+  // Number of bits required to encode Kind value.
+  static constexpr uint32_t kBitsForKind = 4;
+  static constexpr uint32_t kBitsForPayload = kWordSize * kBitsPerByte - kBitsForKind;
+
+  explicit Location(uword value) : value_(value) {}
+
+  Location(Kind kind, uword payload)
+      : value_(KindField::Encode(kind) | PayloadField::Encode(payload)) {}
+
+  uword GetPayload() const {
+    return PayloadField::Decode(value_);
+  }
+
+  typedef BitField<Kind, 0, kBitsForKind> KindField;
+  typedef BitField<uword, kBitsForKind, kBitsForPayload> PayloadField;
+
+  // Layout for kUnallocated locations payload.
+  typedef BitField<Policy, 0, 3> PolicyField;
+
+  // Layout for stack slots.
+  static const intptr_t kStackIndexBias =
+      static_cast<intptr_t>(1) << (kBitsForPayload - 1);
+
+  // Location either contains kind and payload fields or a tagged handle for
+  // a constant location. Values of enumeration Kind are selected in such a
+  // way that none of them can be interpreted as a kConstant tag.
+  uword value_;
+};
+
+/**
+ * The code generator computes LocationSummary for each instruction so that
+ * the instruction itself knows what code to generate: where to find the inputs
+ * and where to place the result.
+ *
+ * The intent is to have the code for generating the instruction independent of
+ * register allocation. A register allocator just has to provide a LocationSummary.
+ */
+class LocationSummary : public ArenaObject {
+ public:
+  explicit LocationSummary(HInstruction* instruction);
+
+  void SetInAt(uint32_t at, Location location) {
+    inputs_.Put(at, location);
+  }
+
+  Location InAt(uint32_t at) const {
+    return inputs_.Get(at);
+  }
+
+  size_t GetInputCount() const {
+    return inputs_.Size();
+  }
+
+  void SetOut(Location location) {
+    output_ = Location(location);
+  }
+
+  void AddTemp(Location location) {
+    temps_.Add(location);
+  }
+
+  Location GetTemp(uint32_t at) const {
+    return temps_.Get(at);
+  }
+
+  void SetTempAt(uint32_t at, Location location) {
+    temps_.Put(at, location);
+  }
+
+  size_t GetTempCount() const {
+    return temps_.Size();
+  }
+
+  Location Out() const { return output_; }
+
+ private:
+  GrowableArray<Location> inputs_;
+  GrowableArray<Location> temps_;
+  Location output_;
+
+  DISALLOW_COPY_AND_ASSIGN(LocationSummary);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_LOCATIONS_H_
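To make the encoding above concrete, here is a small sketch using only the public API of this header; the DCHECKs spell out what the bit fields guarantee (the values are arbitrary examples):

    // Kind lives in the low 4 bits, the payload in the remaining bits.
    Location slot = Location::StackSlot(48);
    DCHECK(slot.IsStackSlot());
    DCHECK_EQ(slot.GetStackIndex(), 48);      // The bias preserves the index, sign included.

    Location any = Location::Any();           // Unallocated, to be fixed by the allocator.
    DCHECK(any.IsUnallocated());
    DCHECK_EQ(any.GetPolicy(), Location::kAny);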
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index afaedd7..752466b 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -291,6 +291,20 @@
   return false;
 }
 
+void HBasicBlock::InsertInstructionBefore(HInstruction* instruction, HInstruction* cursor) {
+  DCHECK(cursor->AsPhi() == nullptr);
+  DCHECK(instruction->AsPhi() == nullptr);
+  instruction->next_ = cursor;
+  instruction->previous_ = cursor->previous_;
+  if (cursor->previous_ != nullptr) {
+    cursor->previous_->next_ = instruction;
+  }
+  cursor->previous_ = instruction;
+  if (GetFirstInstruction() == cursor) {
+    instructions_.first_instruction_ = instruction;
+  }
+}
+
 static void Add(HInstructionList* instruction_list,
                 HBasicBlock* block,
                 HInstruction* instruction) {
@@ -377,6 +388,7 @@
 }
 
 void HInstruction::ReplaceWith(HInstruction* other) {
+  DCHECK(other != nullptr);
   for (HUseIterator<HInstruction> it(GetUses()); !it.Done(); it.Advance()) {
     HUseListNode<HInstruction>* current = it.Current();
     HInstruction* user = current->GetUser();
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index a2cb1c4..b1c8016 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -17,6 +17,7 @@
 #ifndef ART_COMPILER_OPTIMIZING_NODES_H_
 #define ART_COMPILER_OPTIMIZING_NODES_H_
 
+#include "locations.h"
 #include "utils/allocation.h"
 #include "utils/arena_bit_vector.h"
 #include "utils/growable_array.h"
@@ -275,6 +276,7 @@
   HInstruction* GetLastInstruction() const { return instructions_.last_instruction_; }
   const HInstructionList& GetInstructions() const { return instructions_; }
   const HInstructionList& GetPhis() const { return phis_; }
+  HInstruction* GetFirstPhi() const { return phis_.first_instruction_; }
 
   void AddSuccessor(HBasicBlock* block) {
     successors_.Add(block);
@@ -315,6 +317,7 @@
 
   void AddInstruction(HInstruction* instruction);
   void RemoveInstruction(HInstruction* instruction);
+  void InsertInstructionBefore(HInstruction* instruction, HInstruction* cursor);
   void AddPhi(HPhi* phi);
   void RemovePhi(HPhi* phi);
 
@@ -383,6 +386,7 @@
   M(NewInstance)                                           \
   M(Not)                                                   \
   M(ParameterValue)                                        \
+  M(ParallelMove)                                          \
   M(Phi)                                                   \
   M(Return)                                                \
   M(ReturnVoid)                                            \
@@ -394,9 +398,9 @@
 #undef FORWARD_DECLARATION
 
 #define DECLARE_INSTRUCTION(type)                          \
-  virtual void Accept(HGraphVisitor* visitor);             \
   virtual const char* DebugName() const { return #type; }  \
   virtual H##type* As##type() { return this; }             \
+  virtual void Accept(HGraphVisitor* visitor)              \
 
 template <typename T>
 class HUseListNode : public ArenaObject {
@@ -731,7 +735,7 @@
  public:
   HReturnVoid() { }
 
-  DECLARE_INSTRUCTION(ReturnVoid)
+  DECLARE_INSTRUCTION(ReturnVoid);
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HReturnVoid);
@@ -745,7 +749,7 @@
     SetRawInputAt(0, value);
   }
 
-  DECLARE_INSTRUCTION(Return)
+  DECLARE_INSTRUCTION(Return);
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HReturn);
@@ -758,7 +762,7 @@
  public:
   HExit() { }
 
-  DECLARE_INSTRUCTION(Exit)
+  DECLARE_INSTRUCTION(Exit);
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HExit);
@@ -773,7 +777,7 @@
     return GetBlock()->GetSuccessors().Get(0);
   }
 
-  DECLARE_INSTRUCTION(Goto)
+  DECLARE_INSTRUCTION(Goto);
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HGoto);
@@ -795,7 +799,7 @@
     return GetBlock()->GetSuccessors().Get(1);
   }
 
-  DECLARE_INSTRUCTION(If)
+  DECLARE_INSTRUCTION(If);
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HIf);
@@ -834,7 +838,7 @@
 
   virtual Primitive::Type GetType() const { return Primitive::kPrimBoolean; }
 
-  DECLARE_INSTRUCTION(Equal)
+  DECLARE_INSTRUCTION(Equal);
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HEqual);
@@ -845,7 +849,7 @@
  public:
   explicit HLocal(uint16_t reg_number) : reg_number_(reg_number) { }
 
-  DECLARE_INSTRUCTION(Local)
+  DECLARE_INSTRUCTION(Local);
 
   uint16_t GetRegNumber() const { return reg_number_; }
 
@@ -867,7 +871,7 @@
 
   HLocal* GetLocal() const { return reinterpret_cast<HLocal*>(InputAt(0)); }
 
-  DECLARE_INSTRUCTION(LoadLocal)
+  DECLARE_INSTRUCTION(LoadLocal);
 
  private:
   const Primitive::Type type_;
@@ -886,7 +890,7 @@
 
   HLocal* GetLocal() const { return reinterpret_cast<HLocal*>(InputAt(0)); }
 
-  DECLARE_INSTRUCTION(StoreLocal)
+  DECLARE_INSTRUCTION(StoreLocal);
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HStoreLocal);
@@ -901,7 +905,7 @@
   int32_t GetValue() const { return value_; }
   virtual Primitive::Type GetType() const { return Primitive::kPrimInt; }
 
-  DECLARE_INSTRUCTION(IntConstant)
+  DECLARE_INSTRUCTION(IntConstant);
 
  private:
   const int32_t value_;
@@ -917,7 +921,7 @@
 
   virtual Primitive::Type GetType() const { return Primitive::kPrimLong; }
 
-  DECLARE_INSTRUCTION(LongConstant)
+  DECLARE_INSTRUCTION(LongConstant);
 
  private:
   const int64_t value_;
@@ -977,7 +981,7 @@
 
   uint32_t GetIndexInDexCache() const { return index_in_dex_cache_; }
 
-  DECLARE_INSTRUCTION(InvokeStatic)
+  DECLARE_INSTRUCTION(InvokeStatic);
 
  private:
   const uint32_t index_in_dex_cache_;
@@ -997,7 +1001,7 @@
   // Calls runtime so needs an environment.
   virtual bool NeedsEnvironment() const { return true; }
 
-  DECLARE_INSTRUCTION(NewInstance)
+  DECLARE_INSTRUCTION(NewInstance);
 
  private:
   const uint32_t dex_pc_;
@@ -1088,20 +1092,103 @@
   void AddInput(HInstruction* input);
 
   virtual Primitive::Type GetType() const { return type_; }
+  void SetType(Primitive::Type type) { type_ = type; }
 
   uint32_t GetRegNumber() const { return reg_number_; }
 
-  DECLARE_INSTRUCTION(Phi)
+  DECLARE_INSTRUCTION(Phi);
 
  protected:
   GrowableArray<HInstruction*> inputs_;
   const uint32_t reg_number_;
-  const Primitive::Type type_;
+  Primitive::Type type_;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(HPhi);
 };
 
+class MoveOperands : public ArenaObject {
+ public:
+  MoveOperands(Location source, Location destination)
+      : source_(source), destination_(destination) {}
+
+  Location GetSource() const { return source_; }
+  Location GetDestination() const { return destination_; }
+
+  void SetSource(Location value) { source_ = value; }
+  void SetDestination(Location value) { destination_ = value; }
+
+  // The parallel move resolver marks moves as "in-progress" by clearing the
+  // destination (but not the source).
+  Location MarkPending() {
+    DCHECK(!IsPending());
+    Location dest = destination_;
+    destination_ = Location::NoLocation();
+    return dest;
+  }
+
+  void ClearPending(Location dest) {
+    DCHECK(IsPending());
+    destination_ = dest;
+  }
+
+  bool IsPending() const {
+    DCHECK(!source_.IsInvalid() || destination_.IsInvalid());
+    return destination_.IsInvalid() && !source_.IsInvalid();
+  }
+
+  // True if this blocks a move from the given location.
+  bool Blocks(Location loc) const {
+    return !IsEliminated() && source_.Equals(loc);
+  }
+
+  // A move is redundant if it's been eliminated, if its source and
+  // destination are the same, or if its destination is unneeded.
+  bool IsRedundant() const {
+    return IsEliminated() || destination_.IsInvalid() || source_.Equals(destination_);
+  }
+
+  // We clear both operands to indicate a move that has been eliminated.
+  void Eliminate() {
+    source_ = destination_ = Location::NoLocation();
+  }
+
+  bool IsEliminated() const {
+    DCHECK(!source_.IsInvalid() || destination_.IsInvalid());
+    return source_.IsInvalid();
+  }
+
+ private:
+  Location source_;
+  Location destination_;
+
+  DISALLOW_COPY_AND_ASSIGN(MoveOperands);
+};
+
+static constexpr size_t kDefaultNumberOfMoves = 4;
+
+class HParallelMove : public HTemplateInstruction<0> {
+ public:
+  explicit HParallelMove(ArenaAllocator* arena) : moves_(arena, kDefaultNumberOfMoves) {}
+
+  void AddMove(MoveOperands* move) {
+    moves_.Add(move);
+  }
+
+  MoveOperands* MoveOperandsAt(size_t index) const {
+    return moves_.Get(index);
+  }
+
+  size_t NumMoves() const { return moves_.Size(); }
+
+  DECLARE_INSTRUCTION(ParallelMove);
+
+ private:
+  GrowableArray<MoveOperands*> moves_;
+
+  DISALLOW_COPY_AND_ASSIGN(HParallelMove);
+};
+
 class HGraphVisitor : public ValueObject {
  public:
   explicit HGraphVisitor(HGraph* graph) : graph_(graph) { }
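The MoveOperands states defined above drive the parallel move resolver added in this change. A short illustrative walkthrough (`arena` is assumed to be an ArenaAllocator*, and the locations are arbitrary):

    MoveOperands* move = new (arena) MoveOperands(Location::StackSlot(0),
                                                  Location::StackSlot(4));
    DCHECK(!move->IsPending());
    Location saved = move->MarkPending();   // Destination cleared: move is "in progress".
    DCHECK(move->IsPending());
    move->ClearPending(saved);              // Destination restored: move can be emitted.
    move->Eliminate();                      // Both operands cleared once the move is done.
    DCHECK(move->IsEliminated());
    DCHECK(move->IsRedundant());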
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 286f48a..dfbb488 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -24,6 +24,7 @@
 #include "driver/dex_compilation_unit.h"
 #include "graph_visualizer.h"
 #include "nodes.h"
+#include "register_allocator.h"
 #include "ssa_liveness_analysis.h"
 #include "utils/arena_allocator.h"
 
@@ -96,8 +97,6 @@
     }
     return nullptr;
   }
-  HGraphVisualizer visualizer(visualizer_output_.get(), graph, kStringFilter, dex_compilation_unit);
-  visualizer.DumpGraph("builder");
 
   InstructionSet instruction_set = GetCompilerDriver()->GetInstructionSet();
   // The optimizing compiler currently does not have a Thumb2 assembler.
@@ -112,6 +111,10 @@
     return nullptr;
   }
 
+  HGraphVisualizer visualizer(
+      visualizer_output_.get(), graph, kStringFilter, *codegen, dex_compilation_unit);
+  visualizer.DumpGraph("builder");
+
   CodeVectorAllocator allocator;
   codegen->Compile(&allocator);
 
@@ -128,9 +131,13 @@
   visualizer.DumpGraph("ssa");
 
   graph->FindNaturalLoops();
-  SsaLivenessAnalysis(*graph).Analyze();
+  SsaLivenessAnalysis liveness(*graph);
+  liveness.Analyze();
   visualizer.DumpGraph("liveness");
 
+  RegisterAllocator(graph->GetArena(), *codegen).AllocateRegisters(liveness);
+  visualizer.DumpGraph("register");
+
   return new CompiledMethod(GetCompilerDriver(),
                             instruction_set,
                             allocator.GetMemory(),
diff --git a/compiler/optimizing/optimizing_unit_test.h b/compiler/optimizing/optimizing_unit_test.h
index 67c4850..36a6a21 100644
--- a/compiler/optimizing/optimizing_unit_test.h
+++ b/compiler/optimizing/optimizing_unit_test.h
@@ -17,6 +17,10 @@
 #ifndef ART_COMPILER_OPTIMIZING_OPTIMIZING_UNIT_TEST_H_
 #define ART_COMPILER_OPTIMIZING_OPTIMIZING_UNIT_TEST_H_
 
+#include "ssa_liveness_analysis.h"
+
+namespace art {
+
 #define NUM_INSTRUCTIONS(...)  \
   (sizeof((uint16_t[]) {__VA_ARGS__}) /sizeof(uint16_t))
 
@@ -29,4 +33,21 @@
 #define TWO_REGISTERS_CODE_ITEM(...)                                       \
     { 2, 0, 0, 0, 0, 0, NUM_INSTRUCTIONS(__VA_ARGS__), 0, __VA_ARGS__ }
 
+#define THREE_REGISTERS_CODE_ITEM(...)                                     \
+    { 3, 0, 0, 0, 0, 0, NUM_INSTRUCTIONS(__VA_ARGS__), 0, __VA_ARGS__ }
+
+LiveInterval* BuildInterval(const size_t ranges[][2],
+                            size_t number_of_ranges,
+                            ArenaAllocator* allocator,
+                            int reg = -1) {
+  LiveInterval* interval = new (allocator) LiveInterval(allocator, Primitive::kPrimInt);
+  for (size_t i = number_of_ranges; i > 0; --i) {
+    interval->AddRange(ranges[i - 1][0], ranges[i - 1][1]);
+  }
+  interval->SetRegister(reg);
+  return interval;
+}
+
+}  // namespace art
+
 #endif  // ART_COMPILER_OPTIMIZING_OPTIMIZING_UNIT_TEST_H_
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
new file mode 100644
index 0000000..3d2d136
--- /dev/null
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "parallel_move_resolver.h"
+#include "nodes.h"
+#include "locations.h"
+
+namespace art {
+
+void ParallelMoveResolver::EmitNativeCode(HParallelMove* parallel_move) {
+  DCHECK(moves_.IsEmpty());
+  // Build up a worklist of moves.
+  BuildInitialMoveList(parallel_move);
+
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    const MoveOperands& move = *moves_.Get(i);
+    // Skip constants to perform them last.  They don't block other moves
+    // and skipping such moves with register destinations keeps those
+    // registers free for the whole algorithm.
+    if (!move.IsEliminated() && !move.GetSource().IsConstant()) {
+      PerformMove(i);
+    }
+  }
+
+  // Perform the moves with constant sources.
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    const MoveOperands& move = *moves_.Get(i);
+    if (!move.IsEliminated()) {
+      DCHECK(move.GetSource().IsConstant());
+      EmitMove(i);
+    }
+  }
+
+  moves_.Reset();
+}
+
+
+void ParallelMoveResolver::BuildInitialMoveList(HParallelMove* parallel_move) {
+  // Perform a linear sweep of the moves to add them to the initial list of
+  // moves to perform, ignoring any move that is redundant (the source is
+  // the same as the destination, the destination is ignored and
+  // unallocated, or the move was already eliminated).
+  for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
+    MoveOperands* move = parallel_move->MoveOperandsAt(i);
+    if (!move->IsRedundant()) {
+      moves_.Add(move);
+    }
+  }
+}
+
+
+void ParallelMoveResolver::PerformMove(size_t index) {
+  // Each call to this function performs a move and deletes it from the move
+  // graph.  We first recursively perform any move blocking this one.  We
+  // mark a move as "pending" on entry to PerformMove in order to detect
+  // cycles in the move graph.  We use operand swaps to resolve cycles,
+  // which means that a call to PerformMove could change any source operand
+  // in the move graph.
+
+  DCHECK(!moves_.Get(index)->IsPending());
+  DCHECK(!moves_.Get(index)->IsRedundant());
+
+  // Clear this move's destination to indicate a pending move.  The actual
+  // destination is saved in a stack-allocated local.  Recursion may allow
+  // multiple moves to be pending.
+  DCHECK(!moves_.Get(index)->GetSource().IsInvalid());
+  Location destination = moves_.Get(index)->MarkPending();
+
+  // Perform a depth-first traversal of the move graph to resolve
+  // dependencies.  Any unperformed, unpending move with a source the same
+  // as this one's destination blocks this one so recursively perform all
+  // such moves.
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    const MoveOperands& other_move = *moves_.Get(i);
+    if (other_move.Blocks(destination) && !other_move.IsPending()) {
+      // Though PerformMove can change any source operand in the move graph,
+      // this call cannot create a blocking move via a swap (this loop does
+      // not miss any).  Assume there is a non-blocking move with source A
+      // and this move is blocked on source B and there is a swap of A and
+      // B.  Then A and B must be involved in the same cycle (or they would
+      // not be swapped).  Since this move's destination is B and there is
+      // only a single incoming edge to an operand, this move must also be
+      // involved in the same cycle.  In that case, the blocking move will
+      // be created but will be "pending" when we return from PerformMove.
+      PerformMove(i);
+    }
+  }
+  MoveOperands* move = moves_.Get(index);
+
+  // We are about to resolve this move and don't need it marked as
+  // pending, so restore its destination.
+  move->ClearPending(destination);
+
+  // This move's source may have changed due to swaps to resolve cycles and
+  // so it may now be the last move in the cycle.  If so remove it.
+  if (move->GetSource().Equals(destination)) {
+    move->Eliminate();
+    return;
+  }
+
+  // The move may be blocked on a (at most one) pending move, in which case
+  // we have a cycle.  Search for such a blocking move and perform a swap to
+  // resolve it.
+  bool do_swap = false;
+  for (size_t i = 0; i < moves_.Size(); ++i) {
+    const MoveOperands& other_move = *moves_.Get(i);
+    if (other_move.Blocks(destination)) {
+      DCHECK(other_move.IsPending());
+      do_swap = true;
+      break;
+    }
+  }
+
+  if (do_swap) {
+    EmitSwap(index);
+    // Any unperformed (including pending) move with a source of either
+    // this move's source or destination needs to have its source
+    // changed to reflect the state of affairs after the swap.
+    Location source = move->GetSource();
+    Location destination = move->GetDestination();
+    move->Eliminate();
+    for (size_t i = 0; i < moves_.Size(); ++i) {
+      const MoveOperands& other_move = *moves_.Get(i);
+      if (other_move.Blocks(source)) {
+        moves_.Get(i)->SetSource(destination);
+      } else if (other_move.Blocks(destination)) {
+        moves_.Get(i)->SetSource(source);
+      }
+    }
+  } else {
+    // This move is not blocked.
+    EmitMove(index);
+    move->Eliminate();
+  }
+}
+
+}  // namespace art
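
As a side note to this patch, the cycle-breaking in PerformMove is easier to follow on a concrete example. The sketch below is illustrative only and not part of the change: the Move struct, the Perform function and the register numbers are invented here, and it omits the constant-source pass and the "clear the destination to mark pending" trick used by the real resolver.

// Illustrative sketch only -- not part of the patch above.
#include <cstddef>
#include <cstdio>
#include <vector>

struct Move {
  int src;
  int dst;
  bool pending = false;
  bool done = false;
};

static void Perform(std::vector<Move>& moves, size_t index) {
  Move& move = moves[index];
  move.pending = true;
  // Depth-first: first perform every unperformed, unpending move that reads
  // this move's destination.
  for (size_t i = 0; i < moves.size(); ++i) {
    if (!moves[i].done && !moves[i].pending && moves[i].src == move.dst) {
      Perform(moves, i);
    }
  }
  move.pending = false;
  // A swap performed while resolving the recursion may have made this move
  // redundant; if so, it was the last move of a cycle.
  if (move.src == move.dst) {
    move.done = true;
    return;
  }
  // If a pending move still reads our destination, we are in a cycle:
  // resolve it with a swap and retarget the remaining readers.
  for (size_t i = 0; i < moves.size(); ++i) {
    if (i != index && !moves[i].done && moves[i].src == move.dst) {
      std::printf("(%d <-> %d) ", move.src, move.dst);
      int src = move.src;
      int dst = move.dst;
      move.done = true;
      for (Move& other : moves) {
        if (other.done) continue;
        if (other.src == src) {
          other.src = dst;
        } else if (other.src == dst) {
          other.src = src;
        }
      }
      return;
    }
  }
  // Not blocked: emit the move as-is.
  std::printf("(%d -> %d) ", move.src, move.dst);
  move.done = true;
}

int main() {
  // Same shape as one of the ParallelMoveTest cases further down: {0->1, 1->2, 1->0}.
  std::vector<Move> moves = {{0, 1}, {1, 2}, {1, 0}};
  for (size_t i = 0; i < moves.size(); ++i) {
    if (!moves[i].done) {
      Perform(moves, i);
    }
  }
  std::printf("\n");  // Prints: (1 -> 2) (1 <-> 0)
  return 0;
}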
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
new file mode 100644
index 0000000..ff20cb0
--- /dev/null
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_PARALLEL_MOVE_RESOLVER_H_
+#define ART_COMPILER_OPTIMIZING_PARALLEL_MOVE_RESOLVER_H_
+
+#include "utils/allocation.h"
+#include "utils/growable_array.h"
+
+namespace art {
+
+class HParallelMove;
+class MoveOperands;
+
+/**
+ * Helper class to resolve a set of parallel moves. Architecture-dependent code
+ * generators must have their own subclass that implements the `EmitMove` and
+ * `EmitSwap` operations.
+ */
+class ParallelMoveResolver : public ValueObject {
+ public:
+  explicit ParallelMoveResolver(ArenaAllocator* allocator) : moves_(allocator, 32) {}
+  virtual ~ParallelMoveResolver() {}
+
+  // Resolve a set of parallel moves, emitting assembler instructions.
+  void EmitNativeCode(HParallelMove* parallel_move);
+
+ protected:
+  // Emit a move.
+  virtual void EmitMove(size_t index) = 0;
+
+  // Execute a move by emitting a swap of two operands.
+  virtual void EmitSwap(size_t index) = 0;
+
+  // List of moves not yet resolved.
+  GrowableArray<MoveOperands*> moves_;
+
+ private:
+  // Build the initial list of moves.
+  void BuildInitialMoveList(HParallelMove* parallel_move);
+
+  // Perform the move at the moves_ index in question (possibly requiring
+  // other moves to satisfy dependencies).
+  void PerformMove(size_t index);
+
+  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolver);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_PARALLEL_MOVE_RESOLVER_H_
diff --git a/compiler/optimizing/parallel_move_test.cc b/compiler/optimizing/parallel_move_test.cc
new file mode 100644
index 0000000..88df24d
--- /dev/null
+++ b/compiler/optimizing/parallel_move_test.cc
@@ -0,0 +1,128 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nodes.h"
+#include "parallel_move_resolver.h"
+#include "utils/arena_allocator.h"
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+class TestParallelMoveResolver : public ParallelMoveResolver {
+ public:
+  explicit TestParallelMoveResolver(ArenaAllocator* allocator) : ParallelMoveResolver(allocator) {}
+
+  virtual void EmitMove(size_t index) {
+    MoveOperands* move = moves_.Get(index);
+    if (!message_.str().empty()) {
+      message_ << " ";
+    }
+    message_ << "("
+             << move->GetSource().reg().RegId()
+             << " -> "
+             << move->GetDestination().reg().RegId()
+             << ")";
+  }
+
+  virtual void EmitSwap(size_t index) {
+    MoveOperands* move = moves_.Get(index);
+    if (!message_.str().empty()) {
+      message_ << " ";
+    }
+    message_ << "("
+             << move->GetSource().reg().RegId()
+             << " <-> "
+             << move->GetDestination().reg().RegId()
+             << ")";
+  }
+
+  std::string GetMessage() const {
+    return message_.str();
+  }
+
+ private:
+  std::ostringstream message_;
+
+  DISALLOW_COPY_AND_ASSIGN(TestParallelMoveResolver);
+};
+
+static HParallelMove* BuildParallelMove(ArenaAllocator* allocator,
+                                        const size_t operands[][2],
+                                        size_t number_of_moves) {
+  HParallelMove* moves = new (allocator) HParallelMove(allocator);
+  for (size_t i = 0; i < number_of_moves; ++i) {
+    moves->AddMove(new (allocator) MoveOperands(
+        Location::RegisterLocation(ManagedRegister(operands[i][0])),
+        Location::RegisterLocation(ManagedRegister(operands[i][1]))));
+  }
+  return moves;
+}
+
+TEST(ParallelMoveTest, Dependency) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    static constexpr size_t moves[][2] = {{0, 1}, {1, 2}};
+    resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+    ASSERT_STREQ("(1 -> 2) (0 -> 1)", resolver.GetMessage().c_str());
+  }
+
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {2, 3}, {1, 4}};
+    resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+    ASSERT_STREQ("(2 -> 3) (1 -> 2) (1 -> 4) (0 -> 1)", resolver.GetMessage().c_str());
+  }
+}
+
+TEST(ParallelMoveTest, Swap) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    static constexpr size_t moves[][2] = {{0, 1}, {1, 0}};
+    resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+    ASSERT_STREQ("(1 <-> 0)", resolver.GetMessage().c_str());
+  }
+
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {1, 0}};
+    resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+    ASSERT_STREQ("(1 -> 2) (1 <-> 0)", resolver.GetMessage().c_str());
+  }
+
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 1}};
+    resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+    ASSERT_STREQ("(4 <-> 1) (3 <-> 4) (2 <-> 3) (0 -> 1)", resolver.GetMessage().c_str());
+  }
+
+  {
+    TestParallelMoveResolver resolver(&allocator);
+    static constexpr size_t moves[][2] = {{0, 1}, {1, 2}, {2, 3}, {3, 4}, {4, 1}, {5, 4}};
+    resolver.EmitNativeCode(BuildParallelMove(&allocator, moves, arraysize(moves)));
+    ASSERT_STREQ("(4 <-> 1) (3 <-> 4) (2 <-> 3) (0 -> 1) (5 -> 4)", resolver.GetMessage().c_str());
+  }
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
new file mode 100644
index 0000000..dd175d2
--- /dev/null
+++ b/compiler/optimizing/register_allocator.cc
@@ -0,0 +1,378 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "register_allocator.h"
+
+#include "code_generator.h"
+#include "ssa_liveness_analysis.h"
+
+namespace art {
+
+static constexpr size_t kMaxLifetimePosition = -1;
+
+RegisterAllocator::RegisterAllocator(ArenaAllocator* allocator, const CodeGenerator& codegen)
+      : allocator_(allocator),
+        codegen_(codegen),
+        unhandled_(allocator, 0),
+        handled_(allocator, 0),
+        active_(allocator, 0),
+        inactive_(allocator, 0),
+        processing_core_registers_(false),
+        number_of_registers_(-1),
+        registers_array_(nullptr),
+        blocked_registers_(allocator->AllocArray<bool>(codegen.GetNumberOfRegisters())) {
+  codegen.SetupBlockedRegisters(blocked_registers_);
+}
+
+static bool ShouldProcess(bool processing_core_registers, HInstruction* instruction) {
+  bool is_core_register = (instruction->GetType() != Primitive::kPrimDouble)
+      && (instruction->GetType() != Primitive::kPrimFloat);
+  return processing_core_registers == is_core_register;
+}
+
+void RegisterAllocator::AllocateRegistersInternal(const SsaLivenessAnalysis& liveness) {
+  number_of_registers_ = processing_core_registers_
+      ? codegen_.GetNumberOfCoreRegisters()
+      : codegen_.GetNumberOfFloatingPointRegisters();
+
+  registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_);
+
+  // Iterate post-order, to ensure the list is sorted, and the last added interval
+  // is the one with the lowest start position.
+  for (size_t i = liveness.GetNumberOfSsaValues(); i > 0; --i) {
+    HInstruction* instruction = liveness.GetInstructionFromSsaIndex(i - 1);
+    if (ShouldProcess(processing_core_registers_, instruction)) {
+      LiveInterval* current = instruction->GetLiveInterval();
+      DCHECK(unhandled_.IsEmpty() || current->StartsBefore(unhandled_.Peek()));
+      unhandled_.Add(current);
+    }
+  }
+
+  LinearScan();
+  if (kIsDebugBuild) {
+    ValidateInternal(liveness, true);
+  }
+}
+
+bool RegisterAllocator::ValidateInternal(const SsaLivenessAnalysis& liveness,
+                                         bool log_fatal_on_failure) const {
+  // To simplify unit testing, we eagerly create the array of intervals, and
+  // call the helper method.
+  GrowableArray<LiveInterval*> intervals(allocator_, 0);
+  for (size_t i = 0; i < liveness.GetNumberOfSsaValues(); ++i) {
+    HInstruction* instruction = liveness.GetInstructionFromSsaIndex(i);
+    if (ShouldProcess(processing_core_registers_, instruction)) {
+      intervals.Add(instruction->GetLiveInterval());
+    }
+  }
+  return ValidateIntervals(intervals, codegen_, allocator_, processing_core_registers_,
+                           log_fatal_on_failure);
+}
+
+bool RegisterAllocator::ValidateIntervals(const GrowableArray<LiveInterval*>& ranges,
+                                          const CodeGenerator& codegen,
+                                          ArenaAllocator* allocator,
+                                          bool processing_core_registers,
+                                          bool log_fatal_on_failure) {
+  size_t number_of_registers = processing_core_registers
+      ? codegen.GetNumberOfCoreRegisters()
+      : codegen.GetNumberOfFloatingPointRegisters();
+  GrowableArray<ArenaBitVector*> bit_vectors(allocator, number_of_registers);
+
+  // Allocate a bit vector per register. A live interval that has a register
+  // allocated will populate the associated bit vector based on its live ranges.
+  for (size_t i = 0; i < number_of_registers; i++) {
+    bit_vectors.Add(new (allocator) ArenaBitVector(allocator, 0, true));
+  }
+
+  for (size_t i = 0, e = ranges.Size(); i < e; ++i) {
+    LiveInterval* current = ranges.Get(i);
+    do {
+      if (!current->HasRegister()) {
+        continue;
+      }
+      BitVector* vector = bit_vectors.Get(current->GetRegister());
+      LiveRange* range = current->GetFirstRange();
+      do {
+        for (size_t j = range->GetStart(); j < range->GetEnd(); ++j) {
+          if (vector->IsBitSet(j)) {
+            if (log_fatal_on_failure) {
+              std::ostringstream message;
+              message << "Register conflict at " << j << " for ";
+              if (processing_core_registers) {
+                codegen.DumpCoreRegister(message, current->GetRegister());
+              } else {
+                codegen.DumpFloatingPointRegister(message, current->GetRegister());
+              }
+              LOG(FATAL) << message.str();
+            } else {
+              return false;
+            }
+          } else {
+            vector->SetBit(j);
+          }
+        }
+      } while ((range = range->GetNext()) != nullptr);
+    } while ((current = current->GetNextSibling()) != nullptr);
+  }
+  return true;
+}
+
+void RegisterAllocator::DumpInterval(std::ostream& stream, LiveInterval* interval) {
+  interval->Dump(stream);
+  stream << ": ";
+  if (interval->HasRegister()) {
+    if (processing_core_registers_) {
+      codegen_.DumpCoreRegister(stream, interval->GetRegister());
+    } else {
+      codegen_.DumpFloatingPointRegister(stream, interval->GetRegister());
+    }
+  } else {
+    stream << "spilled";
+  }
+  stream << std::endl;
+}
+
+// A by-the-book implementation of a linear scan register allocator.
+void RegisterAllocator::LinearScan() {
+  while (!unhandled_.IsEmpty()) {
+    // (1) Remove interval with the lowest start position from unhandled.
+    LiveInterval* current = unhandled_.Pop();
+    size_t position = current->GetStart();
+
+    // (2) Remove currently active intervals that are dead at this position.
+    //     Move active intervals that have a lifetime hole at this position
+    //     to inactive.
+    for (size_t i = 0; i < active_.Size(); ++i) {
+      LiveInterval* interval = active_.Get(i);
+      if (interval->IsDeadAt(position)) {
+        active_.Delete(interval);
+        --i;
+        handled_.Add(interval);
+      } else if (!interval->Covers(position)) {
+        active_.Delete(interval);
+        --i;
+        inactive_.Add(interval);
+      }
+    }
+
+    // (3) Remove currently inactive intervals that are dead at this position.
+    //     Move inactive intervals that cover this position to active.
+    for (size_t i = 0; i < inactive_.Size(); ++i) {
+      LiveInterval* interval = inactive_.Get(i);
+      if (interval->IsDeadAt(position)) {
+        inactive_.Delete(interval);
+        --i;
+        handled_.Add(interval);
+      } else if (interval->Covers(position)) {
+        inactive_.Delete(interval);
+        --i;
+        active_.Add(interval);
+      }
+    }
+
+    // (4) Try to find an available register.
+    bool success = TryAllocateFreeReg(current);
+
+    // (5) If no register could be found, we need to spill.
+    if (!success) {
+      success = AllocateBlockedReg(current);
+    }
+
+    // (6) If the interval had a register allocated, add it to the list of active
+    //     intervals.
+    if (success) {
+      active_.Add(current);
+    }
+  }
+}
+
+// Find a free register. If multiple are found, pick the register that
+// is free the longest.
+bool RegisterAllocator::TryAllocateFreeReg(LiveInterval* current) {
+  size_t* free_until = registers_array_;
+
+  // First set all registers to be free.
+  for (size_t i = 0; i < number_of_registers_; ++i) {
+    free_until[i] = kMaxLifetimePosition;
+  }
+
+  // For each active interval, set its register to not free.
+  for (size_t i = 0, e = active_.Size(); i < e; ++i) {
+    LiveInterval* interval = active_.Get(i);
+    DCHECK(interval->HasRegister());
+    free_until[interval->GetRegister()] = 0;
+  }
+
+  // For each inactive interval, set its register to be free until
+  // the next intersection with `current`.
+  // Thanks to SSA, this should only be needed for intervals
+  // that are the result of a split.
+  for (size_t i = 0, e = inactive_.Size(); i < e; ++i) {
+    LiveInterval* inactive = inactive_.Get(i);
+    DCHECK(inactive->HasRegister());
+    size_t next_intersection = inactive->FirstIntersectionWith(current);
+    if (next_intersection != kNoLifetime) {
+      free_until[inactive->GetRegister()] = next_intersection;
+    }
+  }
+
+  // Pick the register that is free the longest.
+  int reg = -1;
+  for (size_t i = 0; i < number_of_registers_; ++i) {
+    if (IsBlocked(i)) continue;
+    if (reg == -1 || free_until[i] > free_until[reg]) {
+      reg = i;
+      if (free_until[i] == kMaxLifetimePosition) break;
+    }
+  }
+
+  // If we could not find a register, we need to spill.
+  if (reg == -1 || free_until[reg] == 0) {
+    return false;
+  }
+
+  current->SetRegister(reg);
+  if (!current->IsDeadAt(free_until[reg])) {
+    // If the register is only available for a subset of live ranges
+    // covered by `current`, split `current` at the position where
+    // the register is not available anymore.
+    LiveInterval* split = Split(current, free_until[reg]);
+    DCHECK(split != nullptr);
+    AddToUnhandled(split);
+  }
+  return true;
+}
+
+bool RegisterAllocator::IsBlocked(int reg) const {
+  // TODO: This only works for core registers and needs to be adjusted for
+  // floating point registers.
+  DCHECK(processing_core_registers_);
+  return blocked_registers_[reg];
+}
+
+// Find the register that is used the last, and spill the interval
+// that holds it. If the first register use of `current` comes after that
+// register's next use, we spill `current` instead.
+bool RegisterAllocator::AllocateBlockedReg(LiveInterval* current) {
+  size_t first_register_use = current->FirstRegisterUse();
+  if (current->FirstRegisterUse() == kNoLifetime) {
+    // TODO: Allocate spill slot for `current`.
+    return false;
+  }
+
+  // First set all registers as not being used.
+  size_t* next_use = registers_array_;
+  for (size_t i = 0; i < number_of_registers_; ++i) {
+    next_use[i] = kMaxLifetimePosition;
+  }
+
+  // For each active interval, find the next use of its register after the
+  // start of current.
+  for (size_t i = 0, e = active_.Size(); i < e; ++i) {
+    LiveInterval* active = active_.Get(i);
+    DCHECK(active->HasRegister());
+    size_t use = active->FirstRegisterUseAfter(current->GetStart());
+    if (use != kNoLifetime) {
+      next_use[active->GetRegister()] = use;
+    }
+  }
+
+  // For each inactive interval, find the next use of its register after the
+  // start of current.
+  // Thanks to SSA, this should only be needed for intervals
+  // that are the result of a split.
+  for (size_t i = 0, e = inactive_.Size(); i < e; ++i) {
+    LiveInterval* inactive = inactive_.Get(i);
+    DCHECK(inactive->HasRegister());
+    size_t use = inactive->FirstRegisterUseAfter(current->GetStart());
+    if (use != kNoLifetime) {
+      next_use[inactive->GetRegister()] = use;
+    }
+  }
+
+  // Pick the register that is used the last.
+  int reg = -1;
+  for (size_t i = 0; i < number_of_registers_; ++i) {
+    if (IsBlocked(i)) continue;
+    if (reg == -1 || next_use[i] > next_use[reg]) {
+      reg = i;
+      if (next_use[i] == kMaxLifetimePosition) break;
+    }
+  }
+
+  if (first_register_use >= next_use[reg]) {
+    // If the first register use of `current` comes after the next use of the
+    // found register, we split this interval just before its first register use.
+    LiveInterval* split = Split(current, first_register_use - 1);
+    AddToUnhandled(split);
+    return false;
+  } else {
+    // Use this register and spill the active and inactive intervals that
+    // hold that register.
+    current->SetRegister(reg);
+
+    for (size_t i = 0, e = active_.Size(); i < e; ++i) {
+      LiveInterval* active = active_.Get(i);
+      if (active->GetRegister() == reg) {
+        LiveInterval* split = Split(active, current->GetStart());
+        active_.DeleteAt(i);
+        handled_.Add(active);
+        AddToUnhandled(split);
+        break;
+      }
+    }
+
+    for (size_t i = 0; i < inactive_.Size(); ++i) {
+      LiveInterval* inactive = inactive_.Get(i);
+      if (inactive->GetRegister() == reg) {
+        LiveInterval* split = Split(inactive, current->GetStart());
+        inactive_.DeleteAt(i);
+        handled_.Add(inactive);
+        AddToUnhandled(split);
+        --i;
+      }
+    }
+
+    return true;
+  }
+}
+
+void RegisterAllocator::AddToUnhandled(LiveInterval* interval) {
+  for (size_t i = unhandled_.Size(); i > 0; --i) {
+    LiveInterval* current = unhandled_.Get(i - 1);
+    if (current->StartsAfter(interval)) {
+      unhandled_.InsertAt(i, interval);
+      break;
+    }
+  }
+}
+
+LiveInterval* RegisterAllocator::Split(LiveInterval* interval, size_t position) {
+  DCHECK(position >= interval->GetStart());
+  DCHECK(!interval->IsDeadAt(position));
+  if (position == interval->GetStart()) {
+    // Spill slot will be allocated when handling `interval` again.
+    interval->ClearRegister();
+    return interval;
+  } else {
+    LiveInterval* new_interval = interval->SplitAt(position);
+    // TODO: Allocate spill slot for `interval`.
+    return new_interval;
+  }
+}
+
+}  // namespace art
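
The free-register selection in TryAllocateFreeReg above is the heart of the scan, so here is a self-contained distillation of it. This is illustrative only and not part of the change: Interval, PickFreeRegister and the positions in main are made up, and the inactive-interval handling, blocked registers and the actual splitting are omitted.

// Illustrative sketch only -- not part of the patch above.
#include <cstddef>
#include <cstdio>
#include <vector>

static constexpr size_t kMaxLifetimePosition = static_cast<size_t>(-1);

struct Interval {
  size_t start;
  size_t end;  // Exclusive, as in LiveRange.
  int reg;
};

// Returns the chosen register, or -1 if `current` would have to go through
// the blocked-register path (AllocateBlockedReg in the real allocator).
static int PickFreeRegister(const Interval& current,
                            const std::vector<Interval>& active,
                            size_t number_of_registers) {
  // Start with every register free forever.
  std::vector<size_t> free_until(number_of_registers, kMaxLifetimePosition);
  // Registers held by active intervals are not free at all.
  for (const Interval& interval : active) {
    free_until[interval.reg] = 0;
  }
  // (The real allocator also caps free_until for inactive intervals at their
  // next intersection with `current`, and skips blocked registers.)
  // Pick the register that stays free the longest.
  int reg = -1;
  for (size_t i = 0; i < number_of_registers; ++i) {
    if (reg == -1 || free_until[i] > free_until[reg]) {
      reg = static_cast<int>(i);
      if (free_until[i] == kMaxLifetimePosition) break;
    }
  }
  if (reg == -1 || free_until[reg] == 0) {
    return -1;
  }
  if (free_until[reg] < current.end) {
    // The real allocator would split `current` at free_until[reg] and push
    // the tail back onto the unhandled list.
    std::printf("register r%d only free until %zu\n", reg, free_until[reg]);
  }
  return reg;
}

int main() {
  // Two registers; r0 is held by an active interval, so [10, 20) gets r1.
  std::vector<Interval> active = {{0, 30, /* reg= */ 0}};
  Interval current = {10, 20, /* reg= */ -1};
  std::printf("allocated r%d\n", PickFreeRegister(current, active, 2));
  return 0;
}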
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
new file mode 100644
index 0000000..e575b96
--- /dev/null
+++ b/compiler/optimizing/register_allocator.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_H_
+#define ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_H_
+
+#include "base/macros.h"
+#include "utils/growable_array.h"
+
+namespace art {
+
+class CodeGenerator;
+class LiveInterval;
+class SsaLivenessAnalysis;
+
+/**
+ * An implementation of a linear scan register allocator on an `HGraph` with SSA form.
+ */
+class RegisterAllocator {
+ public:
+  RegisterAllocator(ArenaAllocator* allocator, const CodeGenerator& codegen);
+
+  // Main entry point for the register allocator. Given the liveness analysis,
+  // allocates registers to live intervals.
+  void AllocateRegisters(const SsaLivenessAnalysis& liveness) {
+    processing_core_registers_ = true;
+    AllocateRegistersInternal(liveness);
+    processing_core_registers_ = false;
+    AllocateRegistersInternal(liveness);
+  }
+
+  // Validate that the register allocator did not allocate the same register to
+  // intervals that intersect each other. Returns false if a conflict was found.
+  bool Validate(const SsaLivenessAnalysis& liveness, bool log_fatal_on_failure) {
+    processing_core_registers_ = true;
+    if (!ValidateInternal(liveness, log_fatal_on_failure)) {
+      return false;
+    }
+    processing_core_registers_ = false;
+    return ValidateInternal(liveness, log_fatal_on_failure);
+  }
+
+  // Helper method for validation. Used by unit testing.
+  static bool ValidateIntervals(const GrowableArray<LiveInterval*>& intervals,
+                                const CodeGenerator& codegen,
+                                ArenaAllocator* allocator,
+                                bool processing_core_registers,
+                                bool log_fatal_on_failure);
+
+ private:
+  // Main methods of the allocator.
+  void LinearScan();
+  bool TryAllocateFreeReg(LiveInterval* interval);
+  bool AllocateBlockedReg(LiveInterval* interval);
+
+  // Add `interval` in the sorted list of unhandled intervals.
+  void AddToUnhandled(LiveInterval* interval);
+
+  // Split `interval` at the position `at`. The new interval starts at `at`.
+  LiveInterval* Split(LiveInterval* interval, size_t at);
+
+  // Returns whether `reg` is blocked by the code generator.
+  bool IsBlocked(int reg) const;
+
+  // Helper methods.
+  void AllocateRegistersInternal(const SsaLivenessAnalysis& liveness);
+  bool ValidateInternal(const SsaLivenessAnalysis& liveness, bool log_fatal_on_failure) const;
+  void DumpInterval(std::ostream& stream, LiveInterval* interval);
+
+  ArenaAllocator* const allocator_;
+  const CodeGenerator& codegen_;
+
+  // List of intervals that must be processed, ordered by start position. Last entry
+  // is the interval that has the lowest start position.
+  GrowableArray<LiveInterval*> unhandled_;
+
+  // List of intervals that have been processed.
+  GrowableArray<LiveInterval*> handled_;
+
+  // List of intervals that are currently active when processing a new live interval.
+  // That is, they have a live range that spans the start of the new interval.
+  GrowableArray<LiveInterval*> active_;
+
+  // List of intervals that are currently inactive when processing a new live interval.
+  // That is, they have a lifetime hole that spans the start of the new interval.
+  GrowableArray<LiveInterval*> inactive_;
+
+  // True if processing core registers. False if processing floating
+  // point registers.
+  bool processing_core_registers_;
+
+  // Number of registers for the current register kind (core or floating point).
+  size_t number_of_registers_;
+
+  // Temporary array, allocated ahead of time for simplicity.
+  size_t* registers_array_;
+
+  // Blocked registers, as decided by the code generator.
+  bool* const blocked_registers_;
+
+  DISALLOW_COPY_AND_ASSIGN(RegisterAllocator);
+};
+
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_REGISTER_ALLOCATOR_H_
diff --git a/compiler/optimizing/register_allocator_test.cc b/compiler/optimizing/register_allocator_test.cc
new file mode 100644
index 0000000..019d0f8
--- /dev/null
+++ b/compiler/optimizing/register_allocator_test.cc
@@ -0,0 +1,310 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "builder.h"
+#include "code_generator.h"
+#include "dex_file.h"
+#include "dex_instruction.h"
+#include "nodes.h"
+#include "optimizing_unit_test.h"
+#include "register_allocator.h"
+#include "ssa_liveness_analysis.h"
+#include "utils/arena_allocator.h"
+
+#include "gtest/gtest.h"
+
+namespace art {
+
+// Note: the register allocator tests rely on the fact that constants have live
+// intervals and registers get allocated to them.
+
+static bool Check(const uint16_t* data) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HGraphBuilder builder(&allocator);
+  const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
+  HGraph* graph = builder.BuildGraph(*item);
+  graph->BuildDominatorTree();
+  graph->TransformToSSA();
+  graph->FindNaturalLoops();
+  SsaLivenessAnalysis liveness(*graph);
+  liveness.Analyze();
+  CodeGenerator* codegen = CodeGenerator::Create(&allocator, graph, kX86);
+  RegisterAllocator register_allocator(&allocator, *codegen);
+  register_allocator.AllocateRegisters(liveness);
+  return register_allocator.Validate(liveness, false);
+}
+
+/**
+ * Unit testing of RegisterAllocator::ValidateIntervals. Register allocator
+ * tests are based on this validation method.
+ */
+TEST(RegisterAllocatorTest, ValidateIntervals) {
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HGraph* graph = new (&allocator) HGraph(&allocator);
+  CodeGenerator* codegen = CodeGenerator::Create(&allocator, graph, kX86);
+  GrowableArray<LiveInterval*> intervals(&allocator, 0);
+
+  // Test with two intervals of the same range.
+  {
+    static constexpr size_t ranges[][2] = {{0, 42}};
+    intervals.Add(BuildInterval(ranges, arraysize(ranges), &allocator, 0));
+    intervals.Add(BuildInterval(ranges, arraysize(ranges), &allocator, 1));
+    ASSERT_TRUE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+
+    intervals.Get(1)->SetRegister(0);
+    ASSERT_FALSE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+    intervals.Reset();
+  }
+
+  // Test with two non-intersecting intervals.
+  {
+    static constexpr size_t ranges1[][2] = {{0, 42}};
+    intervals.Add(BuildInterval(ranges1, arraysize(ranges1), &allocator, 0));
+    static constexpr size_t ranges2[][2] = {{42, 43}};
+    intervals.Add(BuildInterval(ranges2, arraysize(ranges2), &allocator, 1));
+    ASSERT_TRUE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+
+    intervals.Get(1)->SetRegister(0);
+    ASSERT_TRUE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+    intervals.Reset();
+  }
+
+  // Test with two non-intersecting intervals, with one with a lifetime hole.
+  {
+    static constexpr size_t ranges1[][2] = {{0, 42}, {45, 48}};
+    intervals.Add(BuildInterval(ranges1, arraysize(ranges1), &allocator, 0));
+    static constexpr size_t ranges2[][2] = {{42, 43}};
+    intervals.Add(BuildInterval(ranges2, arraysize(ranges2), &allocator, 1));
+    ASSERT_TRUE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+
+    intervals.Get(1)->SetRegister(0);
+    ASSERT_TRUE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+    intervals.Reset();
+  }
+
+  // Test with intersecting intervals.
+  {
+    static constexpr size_t ranges1[][2] = {{0, 42}, {44, 48}};
+    intervals.Add(BuildInterval(ranges1, arraysize(ranges1), &allocator, 0));
+    static constexpr size_t ranges2[][2] = {{42, 47}};
+    intervals.Add(BuildInterval(ranges2, arraysize(ranges2), &allocator, 1));
+    ASSERT_TRUE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+
+    intervals.Get(1)->SetRegister(0);
+    ASSERT_FALSE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+    intervals.Reset();
+  }
+
+  // Test with siblings.
+  {
+    static constexpr size_t ranges1[][2] = {{0, 42}, {44, 48}};
+    intervals.Add(BuildInterval(ranges1, arraysize(ranges1), &allocator, 0));
+    intervals.Get(0)->SplitAt(43);
+    static constexpr size_t ranges2[][2] = {{42, 47}};
+    intervals.Add(BuildInterval(ranges2, arraysize(ranges2), &allocator, 1));
+    ASSERT_TRUE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+
+    intervals.Get(1)->SetRegister(0);
+    // Sibling of the first interval has no register allocated to it.
+    ASSERT_TRUE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+
+    intervals.Get(0)->GetNextSibling()->SetRegister(0);
+    ASSERT_FALSE(RegisterAllocator::ValidateIntervals(intervals, *codegen, &allocator, true, false));
+  }
+}
+
+TEST(RegisterAllocatorTest, CFG1) {
+  /*
+   * Test the following snippet:
+   *  return 0;
+   *
+   * Which becomes the following graph:
+   *       constant0
+   *       goto
+   *        |
+   *       return
+   *        |
+   *       exit
+   */
+  const uint16_t data[] = ONE_REGISTER_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::RETURN);
+
+  ASSERT_TRUE(Check(data));
+}
+
+TEST(RegisterAllocatorTest, Loop1) {
+  /*
+   * Test the following snippet:
+   *  int a = 0;
+   *  while (a == a) {
+   *    a = 4;
+   *  }
+   *  return 5;
+   *
+   * Which becomes the following graph:
+   *       constant0
+   *       constant4
+   *       constant5
+   *       goto
+   *        |
+   *       goto
+   *        |
+   *       phi
+   *       equal
+   *       if +++++
+   *        |       \ +
+   *        |     goto
+   *        |
+   *       return
+   *        |
+   *       exit
+   */
+
+  const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::IF_EQ, 4,
+    Instruction::CONST_4 | 4 << 12 | 0,
+    Instruction::GOTO | 0xFD00,
+    Instruction::CONST_4 | 5 << 12 | 1 << 8,
+    Instruction::RETURN | 1 << 8);
+
+  ASSERT_TRUE(Check(data));
+}
+
+TEST(RegisterAllocatorTest, Loop2) {
+  /*
+   * Test the following snippet:
+   *  int a = 0;
+   *  while (a == 8) {
+   *    a = 4 + 5;
+   *  }
+   *  return 6 + 7;
+   *
+   * Which becomes the following graph:
+   *       constant0
+   *       constant4
+   *       constant5
+   *       constant6
+   *       constant7
+   *       constant8
+   *       goto
+   *        |
+   *       goto
+   *        |
+   *       phi
+   *       equal
+   *       if +++++
+   *        |       \ +
+   *        |      4 + 5
+   *        |      goto
+   *        |
+   *       6 + 7
+   *       return
+   *        |
+   *       exit
+   */
+
+  const uint16_t data[] = TWO_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::CONST_4 | 8 << 12 | 1 << 8,
+    Instruction::IF_EQ | 1 << 8, 7,
+    Instruction::CONST_4 | 4 << 12 | 0 << 8,
+    Instruction::CONST_4 | 5 << 12 | 1 << 8,
+    Instruction::ADD_INT, 1 << 8 | 0,
+    Instruction::GOTO | 0xFA00,
+    Instruction::CONST_4 | 6 << 12 | 1 << 8,
+    Instruction::CONST_4 | 7 << 12 | 1 << 8,
+    Instruction::ADD_INT, 1 << 8 | 0,
+    Instruction::RETURN | 1 << 8);
+
+  ASSERT_TRUE(Check(data));
+}
+
+static HGraph* BuildSSAGraph(const uint16_t* data, ArenaAllocator* allocator) {
+  HGraphBuilder builder(allocator);
+  const DexFile::CodeItem* item = reinterpret_cast<const DexFile::CodeItem*>(data);
+  HGraph* graph = builder.BuildGraph(*item);
+  graph->BuildDominatorTree();
+  graph->TransformToSSA();
+  graph->FindNaturalLoops();
+  return graph;
+}
+
+TEST(RegisterAllocatorTest, Loop3) {
+  /*
+   * Test the following snippet:
+   *  int a = 0;
+   *  do {
+   *    b = a;
+   *    a++;
+   *  } while (a != 5);
+   *  return b;
+   *
+   * Which becomes the following graph:
+   *       constant0
+   *       constant1
+   *       constant5
+   *       goto
+   *        |
+   *       goto
+   *        |++++++++++++
+   *       phi          +
+   *       a++          +
+   *       equals       +
+   *       if           +
+   *        |++++++++++++
+   *       return
+   *        |
+   *       exit
+   */
+
+  const uint16_t data[] = THREE_REGISTERS_CODE_ITEM(
+    Instruction::CONST_4 | 0 | 0,
+    Instruction::ADD_INT_LIT8 | 1 << 8, 1 << 8,
+    Instruction::CONST_4 | 5 << 12 | 2 << 8,
+    Instruction::IF_NE | 1 << 8 | 2 << 12, 3,
+    Instruction::RETURN | 0 << 8,
+    Instruction::MOVE | 1 << 12 | 0 << 8,
+    Instruction::GOTO | 0xF900);
+
+  ArenaPool pool;
+  ArenaAllocator allocator(&pool);
+  HGraph* graph = BuildSSAGraph(data, &allocator);
+  SsaLivenessAnalysis liveness(*graph);
+  liveness.Analyze();
+  CodeGenerator* codegen = CodeGenerator::Create(&allocator, graph, kX86);
+  RegisterAllocator register_allocator(&allocator, *codegen);
+  register_allocator.AllocateRegisters(liveness);
+  ASSERT_TRUE(register_allocator.Validate(liveness, false));
+
+  HBasicBlock* loop_header = graph->GetBlocks().Get(2);
+  HPhi* phi = loop_header->GetFirstPhi()->AsPhi();
+
+  LiveInterval* phi_interval = phi->GetLiveInterval();
+  LiveInterval* loop_update = phi->InputAt(1)->GetLiveInterval();
+  ASSERT_TRUE(phi_interval->HasRegister());
+  ASSERT_TRUE(loop_update->HasRegister());
+  ASSERT_NE(phi_interval->GetRegister(), loop_update->GetRegister());
+
+  HBasicBlock* return_block = graph->GetBlocks().Get(3);
+  HReturn* ret = return_block->GetFirstInstruction()->AsReturn();
+  ASSERT_EQ(phi_interval->GetRegister(), ret->InputAt(0)->GetLiveInterval()->GetRegister());
+}
+
+}  // namespace art
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index 50e3254..33084df 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -19,6 +19,18 @@
 
 namespace art {
 
+static Primitive::Type MergeTypes(Primitive::Type existing, Primitive::Type new_type) {
+  // We trust the verifier has already done the necessary checking.
+  switch (existing) {
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+    case Primitive::kPrimNot:
+      return existing;
+    default:
+      return new_type;
+  }
+}
+
 void SsaBuilder::BuildSsa() {
   // 1) Visit in reverse post order. We need to have all predecessors of a block visited
   // (with the exception of loops) in order to create the right environment for that
@@ -32,11 +44,16 @@
     HBasicBlock* block = loop_headers_.Get(i);
     for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
       HPhi* phi = it.Current()->AsPhi();
+      Primitive::Type type = Primitive::kPrimVoid;
       for (size_t pred = 0; pred < block->GetPredecessors().Size(); pred++) {
-        phi->AddInput(ValueOfLocal(block->GetPredecessors().Get(pred), phi->GetRegNumber()));
+        HInstruction* input = ValueOfLocal(block->GetPredecessors().Get(pred), phi->GetRegNumber());
+        phi->AddInput(input);
+        type = MergeTypes(type, input->GetType());
       }
+      phi->SetType(type);
     }
   }
+  // TODO: Now that the type of loop phis is set, we need a type propagation phase.
 
   // 3) Clear locals.
   // TODO: Move this to a dead code eliminator phase.
@@ -65,7 +82,6 @@
     for (size_t local = 0; local < current_locals_->Size(); local++) {
       HInstruction* incoming = ValueOfLocal(block->GetLoopInformation()->GetPreHeader(), local);
       if (incoming != nullptr) {
-        // TODO: Compute union type.
         HPhi* phi = new (GetGraph()->GetArena()) HPhi(
             GetGraph()->GetArena(), local, 0, Primitive::kPrimVoid);
         block->AddPhi(phi);
@@ -88,12 +104,18 @@
         }
       }
       if (is_different) {
-        // TODO: Compute union type.
         HPhi* phi = new (GetGraph()->GetArena()) HPhi(
             GetGraph()->GetArena(), local, block->GetPredecessors().Size(), Primitive::kPrimVoid);
+        Primitive::Type type = Primitive::kPrimVoid;
         for (size_t i = 0; i < block->GetPredecessors().Size(); i++) {
-          phi->SetRawInputAt(i, ValueOfLocal(block->GetPredecessors().Get(i), local));
+          HInstruction* value = ValueOfLocal(block->GetPredecessors().Get(i), local);
+          // We need to merge the incoming types, as the Dex format does not
+          // guarantee the inputs have the same type. In particular the 0 constant is
+          // used for all types, but the graph builder treats it as an int.
+          type = MergeTypes(type, value->GetType());
+          phi->SetRawInputAt(i, value);
         }
+        phi->SetType(type);
         block->AddPhi(phi);
         value = phi;
       }
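
The MergeTypes rule introduced above is easy to misread, so here is a tiny standalone sketch showing how folding it over a phi's inputs keeps a reference type even when one input is the int-typed zero constant mentioned in the comment. This is illustrative only and not part of the change; the Type enum is a stand-in for Primitive::Type.

// Illustrative sketch only -- not part of the patch above.
#include <cstdio>

enum class Type { kVoid, kInt, kFloat, kDouble, kNot /* reference */ };

// Same rule as SsaBuilder's MergeTypes: float, double and reference win over
// anything else the verifier allowed to flow into the same register.
static Type MergeTypes(Type existing, Type new_type) {
  switch (existing) {
    case Type::kFloat:
    case Type::kDouble:
    case Type::kNot:
      return existing;
    default:
      return new_type;
  }
}

int main() {
  // A phi merging `const 0` (built as an int) with a reference input.
  Type inputs[] = { Type::kInt, Type::kNot };
  Type type = Type::kVoid;
  for (Type input : inputs) {
    type = MergeTypes(type, input);
  }
  // Prints 1: the phi ends up typed as a reference, not as an int.
  std::printf("phi is a reference: %d\n", type == Type::kNot);
  return 0;
}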
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index 0f16ad2..dc4b2e5 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -122,20 +122,27 @@
 void SsaLivenessAnalysis::NumberInstructions() {
   int ssa_index = 0;
   size_t lifetime_position = 0;
-  // Each instruction gets an individual lifetime position, and a block gets a lifetime
+  // Each instruction gets a lifetime position, and a block gets a lifetime
   // start and end position. Non-phi instructions have a distinct lifetime position than
   // the block they are in. Phi instructions have the lifetime start of their block as
-  // lifetime position
+  // lifetime position.
+  //
+  // Because the register allocator will insert moves in the graph, we need
+  // to differentiate between the start and end of an instruction. Adding 2 to
+  // the lifetime position for each instruction ensures the start of an
+  // instruction is different than the end of the previous instruction.
   for (HLinearOrderIterator it(linear_post_order_); !it.Done(); it.Advance()) {
     HBasicBlock* block = it.Current();
-    block->SetLifetimeStart(++lifetime_position);
+    block->SetLifetimeStart(lifetime_position);
+    lifetime_position += 2;
 
     for (HInstructionIterator it(block->GetPhis()); !it.Done(); it.Advance()) {
       HInstruction* current = it.Current();
       if (current->HasUses()) {
         instructions_from_ssa_index_.Add(current);
         current->SetSsaIndex(ssa_index++);
-        current->SetLiveInterval(new (graph_.GetArena()) LiveInterval(graph_.GetArena()));
+        current->SetLiveInterval(
+            new (graph_.GetArena()) LiveInterval(graph_.GetArena(), current->GetType()));
       }
       current->SetLifetimePosition(lifetime_position);
     }
@@ -145,12 +152,14 @@
       if (current->HasUses()) {
         instructions_from_ssa_index_.Add(current);
         current->SetSsaIndex(ssa_index++);
-        current->SetLiveInterval(new (graph_.GetArena()) LiveInterval(graph_.GetArena()));
+        current->SetLiveInterval(
+            new (graph_.GetArena()) LiveInterval(graph_.GetArena(), current->GetType()));
       }
-      current->SetLifetimePosition(++lifetime_position);
+      current->SetLifetimePosition(lifetime_position);
+      lifetime_position += 2;
     }
 
-    block->SetLifetimeEnd(++lifetime_position);
+    block->SetLifetimeEnd(lifetime_position);
   }
   number_of_ssa_values_ = ssa_index;
 }
@@ -174,28 +183,6 @@
   ComputeLiveInAndLiveOutSets();
 }
 
-class InstructionBitVectorIterator : public ValueObject {
- public:
-  InstructionBitVectorIterator(const BitVector& vector,
-                               const GrowableArray<HInstruction*>& instructions)
-        : instructions_(instructions),
-          iterator_(BitVector::Iterator(&vector)),
-          current_bit_index_(iterator_.Next()) {}
-
-  bool Done() const { return current_bit_index_ == -1; }
-  HInstruction* Current() const { return instructions_.Get(current_bit_index_); }
-  void Advance() {
-    current_bit_index_ = iterator_.Next();
-  }
-
- private:
-  const GrowableArray<HInstruction*> instructions_;
-  BitVector::Iterator iterator_;
-  int32_t current_bit_index_;
-
-  DISALLOW_COPY_AND_ASSIGN(InstructionBitVectorIterator);
-};
-
 void SsaLivenessAnalysis::ComputeLiveRanges() {
   // Do a post order visit, adding inputs of instructions live in the block where
   // that instruction is defined, and killing instructions that are being visited.
@@ -212,16 +199,19 @@
       live_in->Union(GetLiveInSet(*successor));
       size_t phi_input_index = successor->GetPredecessorIndexOf(block);
       for (HInstructionIterator it(successor->GetPhis()); !it.Done(); it.Advance()) {
-        HInstruction* input = it.Current()->InputAt(phi_input_index);
+        HInstruction* phi = it.Current();
+        HInstruction* input = phi->InputAt(phi_input_index);
+        input->GetLiveInterval()->AddPhiUse(phi, block);
+        // A phi input whose last user is the phi dies at the end of the predecessor block,
+        // and not at the phi's lifetime position.
         live_in->SetBit(input->GetSsaIndex());
       }
     }
 
     // Add a range that covers this block to all instructions live_in because of successors.
-    for (InstructionBitVectorIterator it(*live_in, instructions_from_ssa_index_);
-         !it.Done();
-         it.Advance()) {
-      it.Current()->GetLiveInterval()->AddRange(block->GetLifetimeStart(), block->GetLifetimeEnd());
+    for (uint32_t idx : live_in->Indexes()) {
+      HInstruction* current = instructions_from_ssa_index_.Get(idx);
+      current->GetLiveInterval()->AddRange(block->GetLifetimeStart(), block->GetLifetimeEnd());
     }
 
     for (HBackwardInstructionIterator it(block->GetInstructions()); !it.Done(); it.Advance()) {
@@ -268,11 +258,10 @@
       HBasicBlock* back_edge = block->GetLoopInformation()->GetBackEdges().Get(0);
       // For all live_in instructions at the loop header, we need to create a range
       // that covers the full loop.
-      for (InstructionBitVectorIterator it(*live_in, instructions_from_ssa_index_);
-           !it.Done();
-           it.Advance()) {
-        it.Current()->GetLiveInterval()->AddLoopRange(block->GetLifetimeStart(),
-                                                      back_edge->GetLifetimeEnd());
+      for (uint32_t idx : live_in->Indexes()) {
+        HInstruction* current = instructions_from_ssa_index_.Get(idx);
+        current->GetLiveInterval()->AddLoopRange(block->GetLifetimeStart(),
+                                                 back_edge->GetLifetimeEnd());
       }
     }
   }
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index 2d91436..4d56e1f 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -48,21 +48,68 @@
  * A live range contains the start and end of a range where an instruction
  * is live.
  */
-class LiveRange : public ValueObject {
+class LiveRange : public ArenaObject {
  public:
-  LiveRange(size_t start, size_t end) : start_(start), end_(end) {
+  LiveRange(size_t start, size_t end, LiveRange* next) : start_(start), end_(end), next_(next) {
     DCHECK_LT(start, end);
+    DCHECK(next_ == nullptr || next_->GetStart() > GetEnd());
   }
 
   size_t GetStart() const { return start_; }
   size_t GetEnd() const { return end_; }
+  LiveRange* GetNext() const { return next_; }
+
+  bool IntersectsWith(const LiveRange& other) {
+    return (start_ >= other.start_ && start_ < other.end_)
+        || (other.start_ >= start_ && other.start_ < end_);
+  }
+
+  bool IsBefore(const LiveRange& other) {
+    return end_ <= other.start_;
+  }
+
+  void Dump(std::ostream& stream) {
+    stream << "[" << start_ << ", " << end_ << ")";
+  }
 
  private:
   size_t start_;
   size_t end_;
+  LiveRange* next_;
+
+  friend class LiveInterval;
+
+  DISALLOW_COPY_AND_ASSIGN(LiveRange);
 };
 
-static constexpr int kDefaultNumberOfRanges = 3;
+/**
+ * A use position represents a live interval use at a given position.
+ */
+class UsePosition : public ArenaObject {
+ public:
+  UsePosition(HInstruction* user, size_t position, UsePosition* next)
+      : user_(user), position_(position), next_(next) {
+    DCHECK(user->AsPhi() != nullptr || GetPosition() == user->GetLifetimePosition());
+    DCHECK(next_ == nullptr || next->GetPosition() >= GetPosition());
+  }
+
+  size_t GetPosition() const { return position_; }
+
+  UsePosition* GetNext() const { return next_; }
+
+  HInstruction* GetUser() const { return user_; }
+
+  void Dump(std::ostream& stream) {
+    stream << position_;
+  }
+
+ private:
+  HInstruction* const user_;
+  const size_t position_;
+  UsePosition* const next_;
+
+  DISALLOW_COPY_AND_ASSIGN(UsePosition);
+};
 
 /**
  * An interval is a list of disjoint live ranges where an instruction is live.
@@ -70,67 +117,276 @@
  */
 class LiveInterval : public ArenaObject {
  public:
-  explicit LiveInterval(ArenaAllocator* allocator) : ranges_(allocator, kDefaultNumberOfRanges) {}
+  LiveInterval(ArenaAllocator* allocator, Primitive::Type type)
+      : allocator_(allocator),
+        first_range_(nullptr),
+        last_range_(nullptr),
+        first_use_(nullptr),
+        type_(type),
+        next_sibling_(nullptr),
+        register_(kNoRegister) {}
 
   void AddUse(HInstruction* instruction) {
     size_t position = instruction->GetLifetimePosition();
     size_t start_block_position = instruction->GetBlock()->GetLifetimeStart();
     size_t end_block_position = instruction->GetBlock()->GetLifetimeEnd();
-    if (ranges_.IsEmpty()) {
+    if (first_range_ == nullptr) {
       // First time we see a use of that interval.
-      ranges_.Add(LiveRange(start_block_position, position));
-    } else if (ranges_.Peek().GetStart() == start_block_position) {
+      first_range_ = last_range_ = new (allocator_) LiveRange(start_block_position, position, nullptr);
+    } else if (first_range_->GetStart() == start_block_position) {
       // There is a use later in the same block.
-      DCHECK_LE(position, ranges_.Peek().GetEnd());
-    } else if (ranges_.Peek().GetStart() == end_block_position + 1) {
-      // Last use is in a following block.
-      LiveRange existing = ranges_.Pop();
-      ranges_.Add(LiveRange(start_block_position, existing.GetEnd()));
+      DCHECK_LE(position, first_range_->GetEnd());
+    } else if (first_range_->GetStart() == end_block_position) {
+      // Last use is in the following block.
+      first_range_->start_ = start_block_position;
     } else {
       // There is a hole in the interval. Create a new range.
-      ranges_.Add(LiveRange(start_block_position, position));
+      first_range_ = new (allocator_) LiveRange(start_block_position, position, first_range_);
     }
+    first_use_ = new (allocator_) UsePosition(instruction, position, first_use_);
+  }
+
+  void AddPhiUse(HInstruction* instruction, HBasicBlock* block) {
+    DCHECK(instruction->AsPhi() != nullptr);
+    first_use_ = new (allocator_) UsePosition(instruction, block->GetLifetimeEnd(), first_use_);
   }
 
   void AddRange(size_t start, size_t end) {
-    if (ranges_.IsEmpty()) {
-      ranges_.Add(LiveRange(start, end));
-    } else if (ranges_.Peek().GetStart() == end + 1) {
+    if (first_range_ == nullptr) {
+      first_range_ = last_range_ = new (allocator_) LiveRange(start, end, first_range_);
+    } else if (first_range_->GetStart() == end) {
       // There is a use in the following block.
-      LiveRange existing = ranges_.Pop();
-      ranges_.Add(LiveRange(start, existing.GetEnd()));
+      first_range_->start_ = start;
     } else {
       // There is a hole in the interval. Create a new range.
-      ranges_.Add(LiveRange(start, end));
+      first_range_ = new (allocator_) LiveRange(start, end, first_range_);
     }
   }
 
   void AddLoopRange(size_t start, size_t end) {
-    DCHECK(!ranges_.IsEmpty());
-    while (!ranges_.IsEmpty() && ranges_.Peek().GetEnd() < end) {
-      DCHECK_LE(start, ranges_.Peek().GetStart());
-      ranges_.Pop();
+    DCHECK(first_range_ != nullptr);
+    while (first_range_ != nullptr && first_range_->GetEnd() < end) {
+      DCHECK_LE(start, first_range_->GetStart());
+      first_range_ = first_range_->GetNext();
     }
-    if (ranges_.IsEmpty()) {
+    if (first_range_ == nullptr) {
       // Uses are only in the loop.
-      ranges_.Add(LiveRange(start, end));
+      first_range_ = last_range_ = new (allocator_) LiveRange(start, end, nullptr);
     } else {
       // There are uses after the loop.
-      LiveRange range = ranges_.Pop();
-      ranges_.Add(LiveRange(start, range.GetEnd()));
+      first_range_->start_ = start;
     }
   }
 
   void SetFrom(size_t from) {
-    DCHECK(!ranges_.IsEmpty());
-    LiveRange existing = ranges_.Pop();
-    ranges_.Add(LiveRange(from, existing.GetEnd()));
+    DCHECK(first_range_ != nullptr);
+    first_range_->start_ = from;
   }
 
-  const GrowableArray<LiveRange>& GetRanges() const { return ranges_; }
+  LiveRange* GetFirstRange() const { return first_range_; }
+
+  int GetRegister() const { return register_; }
+  void SetRegister(int reg) { register_ = reg; }
+  void ClearRegister() { register_ = kNoRegister; }
+  bool HasRegister() const { return register_ != kNoRegister; }
+
+  bool IsDeadAt(size_t position) {
+    return last_range_->GetEnd() <= position;
+  }
+
+  bool Covers(size_t position) {
+    LiveRange* current = first_range_;
+    while (current != nullptr) {
+      if (position >= current->GetStart() && position < current->GetEnd()) {
+        return true;
+      }
+      current = current->GetNext();
+    }
+    return false;
+  }
+
+  /**
+   * Returns the first intersection of this interval with `other`.
+   */
+  size_t FirstIntersectionWith(LiveInterval* other) {
+    // We only call this method if there is a lifetime hole in this interval
+    // at the start of `other`.
+    DCHECK(!Covers(other->GetStart()));
+    DCHECK_LE(GetStart(), other->GetStart());
+    // Move to the range in this interval that starts after the other interval.
+    size_t other_start = other->GetStart();
+    LiveRange* my_range = first_range_;
+    while (my_range != nullptr) {
+      if (my_range->GetStart() >= other_start) {
+        break;
+      } else {
+        my_range = my_range->GetNext();
+      }
+    }
+    if (my_range == nullptr) {
+      return kNoLifetime;
+    }
+
+    // Advance both intervals and find the first matching range start in
+    // this interval.
+    LiveRange* other_range = other->first_range_;
+    do {
+      if (my_range->IntersectsWith(*other_range)) {
+        return std::max(my_range->GetStart(), other_range->GetStart());
+      } else if (my_range->IsBefore(*other_range)) {
+        my_range = my_range->GetNext();
+        if (my_range == nullptr) {
+          return kNoLifetime;
+        }
+      } else {
+        DCHECK(other_range->IsBefore(*my_range));
+        other_range = other_range->GetNext();
+        if (other_range == nullptr) {
+          return kNoLifetime;
+        }
+      }
+    } while (true);
+  }
+
+  size_t GetStart() const {
+    return first_range_->GetStart();
+  }
+
+  size_t FirstRegisterUseAfter(size_t position) const {
+    UsePosition* use = first_use_;
+    while (use != nullptr) {
+      size_t use_position = use->GetPosition();
+      // TODO: Once we plug the Locations builder of the code generator
+      // into the register allocator, this method must be adjusted. We
+      // test if there is an environment, because these are currently the only
+      // instructions that could have more uses than the number of registers.
+      if (use_position >= position && !use->GetUser()->NeedsEnvironment()) {
+        return use_position;
+      }
+      use = use->GetNext();
+    }
+    return kNoLifetime;
+  }
+
+  size_t FirstRegisterUse() const {
+    return FirstRegisterUseAfter(GetStart());
+  }
+
+  Primitive::Type GetType() const {
+    return type_;
+  }
+
+  /**
+   * Split this interval at `position`. This interval is changed to:
+   * [start ... position).
+   *
+   * The new interval covers:
+   * [position ... end)
+   */
+  LiveInterval* SplitAt(size_t position) {
+    DCHECK(next_sibling_ == nullptr);
+    DCHECK_GT(position, GetStart());
+
+    if (last_range_->GetEnd() <= position) {
+      // This range dies before `position`, no need to split.
+      return nullptr;
+    }
+
+    LiveInterval* new_interval = new (allocator_) LiveInterval(allocator_, type_);
+    next_sibling_ = new_interval;
+
+    new_interval->first_use_ = first_use_;
+    LiveRange* current = first_range_;
+    LiveRange* previous = nullptr;
+    // Iterate over the ranges, and either find a range that covers this position, or
+    // two ranges on either side of this position (that is, the position is in a lifetime hole).
+    do {
+      if (position >= current->GetEnd()) {
+        // Move to next range.
+        previous = current;
+        current = current->next_;
+      } else if (position <= current->GetStart()) {
+        // If the previous range did not cover this position, we know position is in
+        // a lifetime hole. We can just break the first_range_ and last_range_ links
+        // and return the new interval.
+        DCHECK(previous != nullptr);
+        DCHECK(current != first_range_);
+        new_interval->last_range_ = last_range_;
+        last_range_ = previous;
+        previous->next_ = nullptr;
+        new_interval->first_range_ = current;
+        return new_interval;
+      } else {
+        // This range covers position. We create a new last_range_ for this interval
+        // that covers [current->GetStart(), position). We also shorten the current
+        // range and make it the first range of the new interval.
+        DCHECK(position < current->GetEnd() && position > current->GetStart());
+        new_interval->last_range_ = last_range_;
+        last_range_ = new (allocator_) LiveRange(current->start_, position, nullptr);
+        if (previous != nullptr) {
+          previous->next_ = last_range_;
+        } else {
+          first_range_ = last_range_;
+        }
+        new_interval->first_range_ = current;
+        current->start_ = position;
+        return new_interval;
+      }
+    } while (current != nullptr);
+
+    LOG(FATAL) << "Unreachable";
+    return nullptr;
+  }
+
+  bool StartsBefore(LiveInterval* other) const {
+    return GetStart() <= other->GetStart();
+  }
+
+  bool StartsAfter(LiveInterval* other) const {
+    return GetStart() >= other->GetStart();
+  }
+
+  void Dump(std::ostream& stream) const {
+    stream << "ranges: { ";
+    LiveRange* current = first_range_;
+    do {
+      current->Dump(stream);
+      stream << " ";
+    } while ((current = current->GetNext()) != nullptr);
+    stream << "}, uses: { ";
+    UsePosition* use = first_use_;
+    if (use != nullptr) {
+      do {
+        use->Dump(stream);
+        stream << " ";
+      } while ((use = use->GetNext()) != nullptr);
+    }
+    stream << "}";
+  }
+
+  LiveInterval* GetNextSibling() const { return next_sibling_; }
 
  private:
-  GrowableArray<LiveRange> ranges_;
+  ArenaAllocator* const allocator_;
+
+  // Ranges of this interval. We need quick access to the last range to test
+  // for liveness (see `IsDeadAt`).
+  LiveRange* first_range_;
+  LiveRange* last_range_;
+
+  // Uses of this interval. Note that this linked list is shared amongst siblings.
+  UsePosition* first_use_;
+
+  // The instruction type this interval corresponds to.
+  const Primitive::Type type_;
+
+  // Live interval that is the result of a split.
+  LiveInterval* next_sibling_;
+
+  // The register allocated to this interval.
+  int register_;
+
+  static constexpr int kNoRegister = -1;
 
   DISALLOW_COPY_AND_ASSIGN(LiveInterval);
 };
@@ -164,10 +420,14 @@
     return linear_post_order_;
   }
 
-  HInstruction* GetInstructionFromSsaIndex(size_t index) {
+  HInstruction* GetInstructionFromSsaIndex(size_t index) const {
     return instructions_from_ssa_index_.Get(index);
   }
 
+  size_t GetNumberOfSsaValues() const {
+    return number_of_ssa_values_;
+  }
+
  private:
   // Linearize the graph so that:
   // (1): a block is always after its dominator,
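A minimal sketch of how the LiveInterval ranges and SplitAt behave after this change (the arena pointer and positions are hypothetical, for illustration only):

    // Sketch only: assumes an ArenaAllocator* named `arena`.
    LiveInterval* interval = new (arena) LiveInterval(arena, Primitive::kPrimInt);
    interval->AddRange(4, 20);                   // live over [4, 20)
    LiveInterval* tail = interval->SplitAt(10);  // `interval` now covers [4, 10)
    // `tail` covers [10, 20) and is reachable via interval->GetNextSibling();
    // the UsePosition list (first_use_) is shared between the two siblings.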
diff --git a/compiler/utils/arena_allocator.h b/compiler/utils/arena_allocator.h
index 032eabc..dbe482d 100644
--- a/compiler/utils/arena_allocator.h
+++ b/compiler/utils/arena_allocator.h
@@ -170,6 +170,10 @@
     return ret;
   }
 
+  template <typename T> T* AllocArray(size_t length) {
+    return static_cast<T*>(Alloc(length * sizeof(T), kArenaAllocMisc));
+  }
+
   void* AllocValgrind(size_t bytes, ArenaAllocKind kind);
   void ObtainNewArenaForAllocation(size_t allocation_size);
   size_t BytesAllocated() const;
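A hedged usage sketch for the AllocArray helper added above (the allocator pointer and element count are placeholders):

    // Sketch only: `allocator` is an ArenaAllocator*.
    size_t count = 16;
    uint32_t* words = allocator->AllocArray<uint32_t>(count);  // count * sizeof(uint32_t) bytes from the arena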
diff --git a/compiler/utils/growable_array.h b/compiler/utils/growable_array.h
index e703d8e..a1a3312 100644
--- a/compiler/utils/growable_array.h
+++ b/compiler/utils/growable_array.h
@@ -169,6 +169,13 @@
       num_used_--;
     };
 
+    void DeleteAt(size_t index) {
+      for (size_t i = index; i < num_used_ - 1; i++) {
+        elem_list_[i] = elem_list_[i + 1];
+      }
+      num_used_--;
+    };
+
     size_t GetNumAllocated() const { return num_allocated_; }
 
     size_t Size() const { return num_used_; }
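DeleteAt removes the element at `index` by shifting the tail one slot to the left, so it is linear in the number of trailing elements; a small illustration with hypothetical contents:

    // Sketch only: a GrowableArray<int> currently holding {3, 5, 7}.
    list.DeleteAt(1);  // shifts the tail left; the array is now {3, 7} and Size() == 2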
diff --git a/compiler/utils/x86/managed_register_x86.cc b/compiler/utils/x86/managed_register_x86.cc
index 034a795..021fe88 100644
--- a/compiler/utils/x86/managed_register_x86.cc
+++ b/compiler/utils/x86/managed_register_x86.cc
@@ -95,11 +95,11 @@
   if (!IsValidManagedRegister()) {
     os << "No Register";
   } else if (IsXmmRegister()) {
-    os << "XMM: " << static_cast<int>(AsXmmRegister());
+    os << "XMM: " << AsXmmRegister();
   } else if (IsX87Register()) {
-    os << "X87: " << static_cast<int>(AsX87Register());
+    os << "X87: " << AsX87Register();
   } else if (IsCpuRegister()) {
-    os << "CPU: " << static_cast<int>(AsCpuRegister());
+    os << "CPU: " << AsCpuRegister();
   } else if (IsRegisterPair()) {
     os << "Pair: " << AsRegisterPairLow() << ", " << AsRegisterPairHigh();
   } else {
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index f0b5750..2126625 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -295,8 +295,9 @@
                                 zip_filename, error_msg->c_str());
       return nullptr;
     }
-    std::unique_ptr<MemMap> image_classes_file(zip_entry->ExtractToMemMap(image_classes_filename,
-                                                                    error_msg));
+    std::unique_ptr<MemMap> image_classes_file(zip_entry->ExtractToMemMap(zip_filename,
+                                                                          image_classes_filename,
+                                                                          error_msg));
     if (image_classes_file.get() == nullptr) {
       *error_msg = StringPrintf("Failed to extract '%s' from '%s': %s", image_classes_filename,
                                 zip_filename, error_msg->c_str());
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 5cc6acf..cba4ebf 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -363,10 +363,49 @@
         src_reg_file = dst_reg_file = SSE;
         break;
       case 0x38:  // 3 byte extended opcode
-        opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+        instr++;
+        if (prefix[2] == 0x66) {
+          switch (*instr) {
+            case 0x40:
+              opcode << "pmulld";
+              prefix[2] = 0;
+              has_modrm = true;
+              load = true;
+              src_reg_file = dst_reg_file = SSE;
+              break;
+            default:
+              opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+          }
+        } else {
+          opcode << StringPrintf("unknown opcode '0F 38 %02X'", *instr);
+        }
         break;
       case 0x3A:  // 3 byte extended opcode
-        opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+        instr++;
+        if (prefix[2] == 0x66) {
+          switch (*instr) {
+            case 0x14:
+              opcode << "pextrb";
+              prefix[2] = 0;
+              has_modrm = true;
+              store = true;
+              dst_reg_file = SSE;
+              immediate_bytes = 1;
+              break;
+            case 0x16:
+              opcode << "pextrd";
+              prefix[2] = 0;
+              has_modrm = true;
+              store = true;
+              dst_reg_file = SSE;
+              immediate_bytes = 1;
+              break;
+            default:
+              opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+          }
+        } else {
+          opcode << StringPrintf("unknown opcode '0F 3A %02X'", *instr);
+        }
         break;
       case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
       case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C: case 0x4D: case 0x4E: case 0x4F:
@@ -467,11 +506,11 @@
         break;
       case 0x6F:
         if (prefix[2] == 0x66) {
-          dst_reg_file = SSE;
+          src_reg_file = dst_reg_file = SSE;
           opcode << "movdqa";
           prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else if (prefix[0] == 0xF3) {
-          dst_reg_file = SSE;
+          src_reg_file = dst_reg_file = SSE;
           opcode << "movdqu";
           prefix[0] = 0;  // clear prefix now it's served its purpose as part of the opcode
         } else {
@@ -481,6 +520,25 @@
         load = true;
         has_modrm = true;
         break;
+      case 0x70:
+        if (prefix[2] == 0x66) {
+          opcode << "pshufd";
+          prefix[2] = 0;
+          has_modrm = true;
+          store = true;
+          src_reg_file = dst_reg_file = SSE;
+          immediate_bytes = 1;
+        } else if (prefix[0] == 0xF2) {
+          opcode << "pshuflw";
+          prefix[0] = 0;
+          has_modrm = true;
+          store = true;
+          src_reg_file = dst_reg_file = SSE;
+          immediate_bytes = 1;
+        } else {
+          opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+        }
+        break;
       case 0x71:
         if (prefix[2] == 0x66) {
           dst_reg_file = SSE;
@@ -603,6 +661,18 @@
       case 0xB7: opcode << "movzxw"; has_modrm = true; load = true; break;
       case 0xBE: opcode << "movsxb"; has_modrm = true; load = true; break;
       case 0xBF: opcode << "movsxw"; has_modrm = true; load = true; break;
+      case 0xC5:
+        if (prefix[2] == 0x66) {
+          opcode << "pextrw";
+          prefix[2] = 0;
+          has_modrm = true;
+          store = true;
+          src_reg_file = dst_reg_file = SSE;
+          immediate_bytes = 1;
+        } else {
+          opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+        }
+        break;
       case 0xC7:
         static const char* x0FxC7_opcodes[] = { "unknown-0f-c7", "cmpxchg8b", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7", "unknown-0f-c7" };
         modrm_opcodes = x0FxC7_opcodes;
@@ -614,6 +684,125 @@
         opcode << "bswap";
         reg_in_opcode = true;
         break;
+      case 0xDB:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "pand";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xD5:
+        if (prefix[2] == 0x66) {
+          opcode << "pmullw";
+          prefix[2] = 0;
+          has_modrm = true;
+          load = true;
+          src_reg_file = dst_reg_file = SSE;
+        } else {
+          opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
+        }
+        break;
+      case 0xEB:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "por";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xEF:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "pxor";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xF8:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "psubb";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xF9:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "psubw";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xFA:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "psubd";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xFC:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "paddb";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xFD:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "paddw";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
+      case 0xFE:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "paddd";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
       default:
         opcode << StringPrintf("unknown opcode '0F %02X'", *instr);
         break;
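For orientation, a few byte sequences the new table entries above are meant to decode (ModRM and immediate bytes chosen arbitrarily; the exact printed operands depend on the surrounding ModRM handling):

    66 0F 38 40 C1       pmulld   (0F 38 map, packed 32-bit multiply)
    66 0F 3A 16 C8 01    pextrd   (0F 3A map, one immediate byte)
    66 0F FE C1          paddd    (packed 32-bit add)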
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 8fd6b6d..7c76b3c 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -76,6 +76,13 @@
           "      Example: --boot-image=/system/framework/boot.art\n"
           "\n");
   fprintf(stderr,
+          "  --instruction-set=(arm|arm64|mips|x86|x86_64): for locating the image file based on the image location\n"
+          "      set.\n"
+          "      Example: --instruction-set=x86\n"
+          "      Default: %s\n"
+          "\n",
+          GetInstructionSetString(kRuntimeISA));
+  fprintf(stderr,
           "  --output=<file> may be used to send the output to a file.\n"
           "      Example: --output=/tmp/oatdump.txt\n"
           "\n");
@@ -423,7 +430,7 @@
         NullHandle<mirror::ClassLoader> class_loader;
         verifier::MethodVerifier verifier(&dex_file, &dex_cache, &class_loader, &class_def,
                                           code_item, dex_method_idx, nullptr, method_access_flags,
-                                          true, true);
+                                          true, true, true);
         verifier.Verify();
         DumpCode(indent2_os, &verifier, oat_method, code_item);
       } else {
@@ -1461,8 +1468,9 @@
   }
 
   const char* oat_filename = NULL;
-  const char* image_filename = NULL;
-  const char* boot_image_filename = NULL;
+  const char* image_location = NULL;
+  const char* boot_image_location = NULL;
+  InstructionSet instruction_set = kRuntimeISA;
   std::string elf_filename_prefix;
   std::ostream* os = &std::cout;
   std::unique_ptr<std::ofstream> out;
@@ -1474,9 +1482,22 @@
     if (option.starts_with("--oat-file=")) {
       oat_filename = option.substr(strlen("--oat-file=")).data();
     } else if (option.starts_with("--image=")) {
-      image_filename = option.substr(strlen("--image=")).data();
+      image_location = option.substr(strlen("--image=")).data();
     } else if (option.starts_with("--boot-image=")) {
-      boot_image_filename = option.substr(strlen("--boot-image=")).data();
+      boot_image_location = option.substr(strlen("--boot-image=")).data();
+    } else if (option.starts_with("--instruction-set=")) {
+      StringPiece instruction_set_str = option.substr(strlen("--instruction-set=")).data();
+      if (instruction_set_str == "arm") {
+        instruction_set = kThumb2;
+      } else if (instruction_set_str == "arm64") {
+        instruction_set = kArm64;
+      } else if (instruction_set_str == "mips") {
+        instruction_set = kMips;
+      } else if (instruction_set_str == "x86") {
+        instruction_set = kX86;
+      } else if (instruction_set_str == "x86_64") {
+        instruction_set = kX86_64;
+      }
     } else if (option.starts_with("--dump:")) {
         if (option == "--dump:raw_mapping_table") {
           dump_raw_mapping_table = true;
@@ -1500,12 +1521,12 @@
     }
   }
 
-  if (image_filename == NULL && oat_filename == NULL) {
+  if (image_location == NULL && oat_filename == NULL) {
     fprintf(stderr, "Either --image or --oat must be specified\n");
     return EXIT_FAILURE;
   }
 
-  if (image_filename != NULL && oat_filename != NULL) {
+  if (image_location != NULL && oat_filename != NULL) {
     fprintf(stderr, "Either --image or --oat must be specified but not both\n");
     return EXIT_FAILURE;
   }
@@ -1533,16 +1554,19 @@
   NoopCompilerCallbacks callbacks;
   options.push_back(std::make_pair("compilercallbacks", &callbacks));
 
-  if (boot_image_filename != NULL) {
+  if (boot_image_location != NULL) {
     boot_image_option += "-Ximage:";
-    boot_image_option += boot_image_filename;
+    boot_image_option += boot_image_location;
     options.push_back(std::make_pair(boot_image_option.c_str(), reinterpret_cast<void*>(NULL)));
   }
-  if (image_filename != NULL) {
+  if (image_location != NULL) {
     image_option += "-Ximage:";
-    image_option += image_filename;
+    image_option += image_location;
     options.push_back(std::make_pair(image_option.c_str(), reinterpret_cast<void*>(NULL)));
   }
+  options.push_back(
+      std::make_pair("imageinstructionset",
+                     reinterpret_cast<const void*>(GetInstructionSetString(instruction_set))));
 
   if (!Runtime::Create(options, false)) {
     fprintf(stderr, "Failed to create runtime\n");
@@ -1558,7 +1582,7 @@
   CHECK(image_space != NULL);
   const ImageHeader& image_header = image_space->GetImageHeader();
   if (!image_header.IsValid()) {
-    fprintf(stderr, "Invalid image header %s\n", image_filename);
+    fprintf(stderr, "Invalid image header %s\n", image_location);
     return EXIT_FAILURE;
   }
   ImageDumper image_dumper(os, *image_space, image_header,
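An illustrative oatdump invocation combining the new flag with the existing options (paths are placeholders):

    oatdump --image=/data/art-test/core.art --instruction-set=arm --output=/tmp/core.arm.oatdump.txt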
diff --git a/runtime/Android.mk b/runtime/Android.mk
index 296cec9..c2507b1 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -308,6 +308,12 @@
   LIBART_CFLAGS += -DART_USE_PORTABLE_COMPILER=1
 endif
 
+ifeq ($(MALLOC_IMPL),jemalloc)
+  LIBART_CFLAGS += -DUSE_JEMALLOC
+else
+  LIBART_CFLAGS += -DUSE_DLMALLOC
+endif
+
 # $(1): target or host
 # $(2): ndebug or debug
 # $(3): true or false for LOCAL_CLANG
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index 23e3433..340a83e 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -196,7 +196,6 @@
   qpoints->pCmplDouble = CmplDouble;
   qpoints->pCmplFloat = CmplFloat;
   qpoints->pFmod = fmod;
-  qpoints->pSqrt = sqrt;
   qpoints->pL2d = __aeabi_l2d;
   qpoints->pFmodf = fmodf;
   qpoints->pL2f = __aeabi_l2f;
diff --git a/runtime/arch/arm64/asm_support_arm64.S b/runtime/arch/arm64/asm_support_arm64.S
index 9614c29..b94375e 100644
--- a/runtime/arch/arm64/asm_support_arm64.S
+++ b/runtime/arch/arm64/asm_support_arm64.S
@@ -21,6 +21,9 @@
 
 // Define special registers.
 
+// Register holding suspend check count down.
+// 32-bit is enough for the suspend register.
+#define wSUSPEND w19
 // Register holding Thread::Current().
 #define xSELF x18
 // Frame Pointer
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 2a5c7d1..46e819e 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -84,12 +84,6 @@
 // Double-precision FP arithmetics.
 extern "C" double fmod(double a, double b);         // REM_DOUBLE[_2ADDR]
 
-// Long long arithmetics - REM_LONG[_2ADDR] and DIV_LONG[_2ADDR]
-extern "C" int64_t art_quick_mul_long(int64_t, int64_t);
-extern "C" uint64_t art_quick_shl_long(uint64_t, uint32_t);
-extern "C" uint64_t art_quick_shr_long(uint64_t, uint32_t);
-extern "C" uint64_t art_quick_ushr_long(uint64_t, uint32_t);
-
 // Intrinsic entrypoints.
 extern "C" int32_t __memcmp16(void*, void*, int32_t);
 extern "C" int32_t art_quick_indexof(void*, uint32_t, uint32_t, uint32_t);
@@ -188,7 +182,6 @@
   qpoints->pCmplDouble = CmplDouble;
   qpoints->pCmplFloat = CmplFloat;
   qpoints->pFmod = fmod;
-  qpoints->pSqrt = sqrt;
   qpoints->pL2d = NULL;
   qpoints->pFmodf = fmodf;
   qpoints->pL2f = NULL;
@@ -199,10 +192,10 @@
   qpoints->pF2l = NULL;
   qpoints->pLdiv = NULL;
   qpoints->pLmod = NULL;
-  qpoints->pLmul = art_quick_mul_long;
-  qpoints->pShlLong = art_quick_shl_long;
-  qpoints->pShrLong = art_quick_shr_long;
-  qpoints->pUshrLong = art_quick_ushr_long;
+  qpoints->pLmul = NULL;
+  qpoints->pShlLong = NULL;
+  qpoints->pShrLong = NULL;
+  qpoints->pUshrLong = NULL;
 
   // Intrinsics
   qpoints->pIndexOf = art_quick_indexof;
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index ac922dd..97caa1f 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -197,7 +197,8 @@
 .endm
 
 .macro RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
-    brk 0
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME
+    ret
 .endm
 
 
@@ -561,32 +562,33 @@
 SAVE_SIZE=5*8   // x4, x5, SP, LR & FP saved.
 SAVE_SIZE_AND_METHOD=SAVE_SIZE+8
 
-    mov x9, sp                          // Save stack pointer.
+    mov x9, sp                             // Save stack pointer.
     .cfi_register sp,x9
 
-    add x10, x2, # SAVE_SIZE_AND_METHOD // calculate size of frame.
-    sub x10, sp, x10                    // Calculate SP position - saves + ArtMethod* +  args
-    and x10, x10, # ~0xf                // Enforce 16 byte stack alignment.
-    mov sp, x10                         // Set new SP.
+    add x10, x2, # SAVE_SIZE_AND_METHOD    // calculate size of frame.
+    sub x10, sp, x10                       // Calculate SP position - saves + ArtMethod* +  args
+    and x10, x10, # ~0xf                   // Enforce 16 byte stack alignment.
+    mov sp, x10                            // Set new SP.
 
-    sub x10, x9, #SAVE_SIZE             // Calculate new FP (later). Done here as we must move SP
-    .cfi_def_cfa_register x10           // before this.
+    sub x10, x9, #SAVE_SIZE                // Calculate new FP (later). Done here as we must move SP
+    .cfi_def_cfa_register x10              // before this.
     .cfi_adjust_cfa_offset SAVE_SIZE
 
-    str x9, [x10, #32]                  // Save old stack pointer.
+    str x9, [x10, #32]                     // Save old stack pointer.
     .cfi_rel_offset sp, 32
 
-    stp x4, x5, [x10, #16]              // Save result and shorty addresses.
+    stp x4, x5, [x10, #16]                 // Save result and shorty addresses.
     .cfi_rel_offset x4, 16
     .cfi_rel_offset x5, 24
 
-    stp xFP, xLR, [x10]                 // Store LR & FP.
+    stp xFP, xLR, [x10]                    // Store LR & FP.
     .cfi_rel_offset x29, 0
     .cfi_rel_offset x30, 8
 
-    mov xFP, x10                        // Use xFP now, as it's callee-saved.
+    mov xFP, x10                           // Use xFP now, as it's callee-saved.
     .cfi_def_cfa_register x29
-    mov xSELF, x3                       // Move thread pointer into SELF register.
+    mov xSELF, x3                          // Move thread pointer into SELF register.
+    mov wSUSPEND, #SUSPEND_CHECK_INTERVAL  // reset wSUSPEND to suspend check interval
 
     // Copy arguments into stack frame.
     // Use simple copy routine for now.
@@ -595,7 +597,7 @@
     // W2 - args length
     // X9 - destination address.
     // W10 - temporary
-    add x9, sp, #8     // Destination address is bottom of stack + NULL.
+    add x9, sp, #8                         // Destination address is bottom of stack + NULL.
 
     // Use \@ to differentiate between macro invocations.
 .LcopyParams\@:
@@ -693,6 +695,7 @@
  *  x1-x7 - integer parameters.
  *  d0-d7 - Floating point parameters.
  *  xSELF = self
+ *  wSUSPEND = suspend count
  *  SP = & of ArtMethod*
  *  x1 = "this" pointer.
  *
@@ -1373,7 +1376,22 @@
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALL_ALLOC_ENTRYPOINTS
 
-UNIMPLEMENTED art_quick_test_suspend
+    /*
+     * Called by managed code when the value in wSUSPEND has been decremented to 0.
+     */
+    .extern artTestSuspendFromCode
+ENTRY art_quick_test_suspend
+    ldrh   w0, [xSELF, #THREAD_FLAGS_OFFSET]  // get xSELF->state_and_flags.as_struct.flags
+    mov    wSUSPEND, #SUSPEND_CHECK_INTERVAL  // reset wSUSPEND to SUSPEND_CHECK_INTERVAL
+    cbnz   w0, .Lneed_suspend                 // check flags == 0
+    ret                                       // return if flags == 0
+.Lneed_suspend:
+    mov    x0, xSELF
+    SETUP_REF_ONLY_CALLEE_SAVE_FRAME          // save callee saves for stack crawl
+    mov    x1, sp
+    bl     artTestSuspendFromCode             // (Thread*, SP)
+    RESTORE_REF_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+END art_quick_test_suspend
 
      /*
      * Called by managed code that is attempting to call a method on a proxy class. On entry
@@ -1611,10 +1629,6 @@
 UNIMPLEMENTED art_quick_instrumentation_entry
 UNIMPLEMENTED art_quick_instrumentation_exit
 UNIMPLEMENTED art_quick_deoptimize
-UNIMPLEMENTED art_quick_mul_long
-UNIMPLEMENTED art_quick_shl_long
-UNIMPLEMENTED art_quick_shr_long
-UNIMPLEMENTED art_quick_ushr_long
 UNIMPLEMENTED art_quick_indexof
 
    /*
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 95fcd73..96e0afd 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -298,7 +298,11 @@
     l.s     $f29, 116($a1)
     l.s     $f30, 120($a1)
     l.s     $f31, 124($a1)
+    .set push
+    .set nomacro
+    .set noat
     lw      $at, 4($a0)
+    .set pop
     lw      $v0, 8($a0)
     lw      $v1, 12($a0)
     lw      $a1, 20($a0)
@@ -322,8 +326,6 @@
     lw      $s7, 92($a0)
     lw      $t8, 96($a0)
     lw      $t9, 100($a0)
-    lw      $k0, 104($a0)
-    lw      $k1, 108($a0)
     lw      $gp, 112($a0)
     lw      $sp, 116($a0)
     lw      $fp, 120($a0)
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index c4a7b1b..c53fa1e 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -71,11 +71,8 @@
 // Math entrypoints.
 extern "C" double art_quick_fmod(double, double);
 extern "C" float art_quick_fmodf(float, float);
-extern "C" double art_quick_l2d(int64_t);
-extern "C" float art_quick_l2f(int64_t);
 extern "C" int64_t art_quick_d2l(double);
 extern "C" int64_t art_quick_f2l(float);
-extern "C" int32_t art_quick_idivmod(int32_t, int32_t);
 extern "C" int64_t art_quick_ldiv(int64_t, int64_t);
 extern "C" int64_t art_quick_lmod(int64_t, int64_t);
 extern "C" int64_t art_quick_lmul(int64_t, int64_t);
@@ -180,13 +177,12 @@
   // points->pCmplDouble = NULL;  // Not needed on x86.
   // points->pCmplFloat = NULL;  // Not needed on x86.
   qpoints->pFmod = art_quick_fmod;
-  // qpoints->pSqrt = NULL;  // Not needed on x86.
-  qpoints->pL2d = art_quick_l2d;
+  // qpoints->pL2d = NULL;  // Not needed on x86.
   qpoints->pFmodf = art_quick_fmodf;
-  qpoints->pL2f = art_quick_l2f;
+  // qpoints->pL2f = NULL;  // Not needed on x86.
   // points->pD2iz = NULL;  // Not needed on x86.
   // points->pF2iz = NULL;  // Not needed on x86.
-  qpoints->pIdivmod = art_quick_idivmod;
+  // qpoints->pIdivmod = NULL;  // Not needed on x86.
   qpoints->pD2l = art_quick_d2l;
   qpoints->pF2l = art_quick_f2l;
   qpoints->pLdiv = art_quick_ldiv;
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 339ed2e..b311ea5 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -763,28 +763,6 @@
     ret
 END_FUNCTION art_quick_fmodf
 
-DEFINE_FUNCTION art_quick_l2d
-    PUSH ecx                      // push arg2 a.hi
-    PUSH eax                      // push arg1 a.lo
-    fildll (%esp)                 // load as integer and push into st0
-    fstpl (%esp)                  // pop value off fp stack as double
-    movsd (%esp), %xmm0           // place into %xmm0
-    addl LITERAL(8), %esp         // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-8)
-    ret
-END_FUNCTION art_quick_l2d
-
-DEFINE_FUNCTION art_quick_l2f
-    PUSH ecx                      // push arg2 a.hi
-    PUSH eax                      // push arg1 a.lo
-    fildll (%esp)                 // load as integer and push into st0
-    fstps (%esp)                  // pop value off fp stack as a single
-    movss (%esp), %xmm0           // place into %xmm0
-    addl LITERAL(8), %esp         // pop argument
-    CFI_ADJUST_CFA_OFFSET(-8)
-    ret
-END_FUNCTION art_quick_l2f
-
 DEFINE_FUNCTION art_quick_d2l
     PUSH eax                      // alignment padding
     PUSH ecx                      // pass arg2 a.hi
@@ -807,20 +785,6 @@
     ret
 END_FUNCTION art_quick_f2l
 
-DEFINE_FUNCTION art_quick_idivmod
-    cmpl LITERAL(0x80000000), %eax
-    je .Lcheck_arg2  // special case
-.Largs_ok:
-    cdq         // edx:eax = sign extend eax
-    idiv %ecx   // (edx,eax) = (edx:eax % ecx, edx:eax / ecx)
-    ret
-.Lcheck_arg2:
-    cmpl LITERAL(-1), %ecx
-    jne .Largs_ok
-    xorl %edx, %edx
-    ret         // eax already holds min int
-END_FUNCTION art_quick_idivmod
-
 DEFINE_FUNCTION art_quick_ldiv
     subl LITERAL(12), %esp       // alignment padding
     CFI_ADJUST_CFA_OFFSET(12)
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index 30067cf..aeda072 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -18,6 +18,7 @@
 #include "entrypoints/quick/quick_alloc_entrypoints.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "entrypoints/entrypoint_utils.h"
+#include "entrypoints/math_entrypoints.h"
 
 namespace art {
 
@@ -34,8 +35,8 @@
 extern "C" void art_portable_to_interpreter_bridge(mirror::ArtMethod*);
 
 // Cast entrypoints.
-extern "C" uint32_t art_quick_is_assignable(const mirror::Class* klass,
-                                                const mirror::Class* ref_class);
+extern "C" uint32_t artIsAssignableFromCode(const mirror::Class* klass,
+                                            const mirror::Class* ref_class);
 extern "C" void art_quick_check_cast(void*, void*);
 
 // DexCache entrypoints.
@@ -69,13 +70,8 @@
 extern "C" void art_quick_unlock_object(void*);
 
 // Math entrypoints.
-extern "C" double art_quick_fmod(double, double);
-extern "C" float art_quick_fmodf(float, float);
-extern "C" double art_quick_l2d(int64_t);
-extern "C" float art_quick_l2f(int64_t);
 extern "C" int64_t art_quick_d2l(double);
 extern "C" int64_t art_quick_f2l(float);
-extern "C" int32_t art_quick_idivmod(int32_t, int32_t);
 extern "C" int64_t art_quick_ldiv(int64_t, int64_t);
 extern "C" int64_t art_quick_lmod(int64_t, int64_t);
 extern "C" int64_t art_quick_lmul(int64_t, int64_t);
@@ -85,7 +81,6 @@
 
 // Intrinsic entrypoints.
 extern "C" int32_t art_quick_memcmp16(void*, void*, int32_t);
-extern "C" int32_t art_quick_indexof(void*, uint32_t, uint32_t, uint32_t);
 extern "C" int32_t art_quick_string_compareto(void*, void*);
 extern "C" void* art_quick_memcpy(void*, const void*, size_t);
 
@@ -133,7 +128,7 @@
   ResetQuickAllocEntryPoints(qpoints);
 
   // Cast
-  qpoints->pInstanceofNonTrivial = art_quick_is_assignable;
+  qpoints->pInstanceofNonTrivial = artIsAssignableFromCode;
   qpoints->pCheckCast = art_quick_check_cast;
 
   // DexCache
@@ -180,16 +175,15 @@
   // points->pCmpgFloat = NULL;  // Not needed on x86.
   // points->pCmplDouble = NULL;  // Not needed on x86.
   // points->pCmplFloat = NULL;  // Not needed on x86.
-  qpoints->pFmod = art_quick_fmod;
-  // qpoints->pSqrt = NULL;  // Not needed on x86.
-  qpoints->pL2d = art_quick_l2d;
-  qpoints->pFmodf = art_quick_fmodf;
-  qpoints->pL2f = art_quick_l2f;
+  qpoints->pFmod = fmod;
+  // qpoints->pL2d = NULL;  // Not needed on x86.
+  qpoints->pFmodf = fmodf;
+  // qpoints->pL2f = NULL;  // Not needed on x86.
   // points->pD2iz = NULL;  // Not needed on x86.
   // points->pF2iz = NULL;  // Not needed on x86.
-  qpoints->pIdivmod = art_quick_idivmod;
-  qpoints->pD2l = art_quick_d2l;
-  qpoints->pF2l = art_quick_f2l;
+  // qpoints->pIdivmod = NULL;  // Not needed on x86.
+  qpoints->pD2l = art_d2l;
+  qpoints->pF2l = art_f2l;
   qpoints->pLdiv = art_quick_ldiv;
   qpoints->pLmod = art_quick_lmod;
   qpoints->pLmul = art_quick_lmul;
@@ -198,7 +192,7 @@
   qpoints->pUshrLong = art_quick_lushr;
 
   // Intrinsics
-  qpoints->pIndexOf = art_quick_indexof;
+  // qpoints->pIndexOf = NULL;  // Not needed on x86.
   qpoints->pMemcmp16 = art_quick_memcmp16;
   qpoints->pStringCompareTo = art_quick_string_compareto;
   qpoints->pMemcpy = art_quick_memcpy;
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index ed7f246..971688d 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -737,11 +737,6 @@
     RETURN_IF_EAX_ZERO
 END_FUNCTION art_quick_unlock_object
 
-DEFINE_FUNCTION art_quick_is_assignable
-    int3
-    int3
-END_FUNCTION art_quick_is_assignable
-
 DEFINE_FUNCTION art_quick_check_cast
     PUSH rdi                          // Save args for exc
     PUSH rsi
@@ -876,13 +871,6 @@
 
 NO_ARG_DOWNCALL art_quick_test_suspend, artTestSuspendFromCode, ret
 
-UNIMPLEMENTED art_quick_fmod
-UNIMPLEMENTED art_quick_fmodf
-UNIMPLEMENTED art_quick_l2d
-UNIMPLEMENTED art_quick_l2f
-UNIMPLEMENTED art_quick_d2l
-UNIMPLEMENTED art_quick_f2l
-UNIMPLEMENTED art_quick_idivmod
 UNIMPLEMENTED art_quick_ldiv
 UNIMPLEMENTED art_quick_lmod
 UNIMPLEMENTED art_quick_lmul
@@ -1301,8 +1289,6 @@
      */
 UNIMPLEMENTED art_quick_deoptimize
 
-UNIMPLEMENTED art_quick_indexof
-
     /*
      * String's compareTo.
      *
diff --git a/runtime/arch/x86_64/thread_x86_64.cc b/runtime/arch/x86_64/thread_x86_64.cc
index de4c56a..b7a5c43 100644
--- a/runtime/arch/x86_64/thread_x86_64.cc
+++ b/runtime/arch/x86_64/thread_x86_64.cc
@@ -31,8 +31,7 @@
   syscall(__NR_arch_prctl, code, val);
 }
 void Thread::InitCpu() {
-  static Mutex modify_ldt_lock("modify_ldt lock");
-  MutexLock mu(Thread::Current(), modify_ldt_lock);
+  MutexLock mu(nullptr, *Locks::modify_ldt_lock_);
   arch_prctl(ARCH_SET_GS, this);
 
   // Allow easy indirection back to Thread*.
diff --git a/runtime/base/bit_vector.cc b/runtime/base/bit_vector.cc
index a3e2b15..0053389 100644
--- a/runtime/base/bit_vector.cc
+++ b/runtime/base/bit_vector.cc
@@ -45,10 +45,11 @@
     storage_size_(storage_size),
     storage_(storage),
     number_of_bits_(start_bits) {
-  DCHECK_EQ(sizeof(*storage_), 4U);  // Assuming 32-bit units.
+  COMPILE_ASSERT(sizeof(*storage_) == kWordBytes, check_word_bytes);
+  COMPILE_ASSERT(sizeof(*storage_) * 8u == kWordBits, check_word_bits);
   if (storage_ == nullptr) {
     storage_size_ = BitsToWords(start_bits);
-    storage_ = static_cast<uint32_t*>(allocator_->Alloc(storage_size_ * sizeof(*storage_)));
+    storage_ = static_cast<uint32_t*>(allocator_->Alloc(storage_size_ * kWordBytes));
   }
 }
 
@@ -61,7 +62,7 @@
  */
 bool BitVector::IsBitSet(uint32_t num) const {
   // If the index is over the size:
-  if (num >= storage_size_ * sizeof(*storage_) * 8) {
+  if (num >= storage_size_ * kWordBits) {
     // Whether it is expandable or not, this bit does not exist: thus it is not set.
     return false;
   }
@@ -71,7 +72,7 @@
 
 // Mark all bits bit as "clear".
 void BitVector::ClearAllBits() {
-  memset(storage_, 0, storage_size_ * sizeof(*storage_));
+  memset(storage_, 0, storage_size_ * kWordBytes);
 }
 
 // Mark the specified bit as "set".
@@ -80,17 +81,17 @@
  * not using it badly or change resize mechanism.
  */
 void BitVector::SetBit(uint32_t num) {
-  if (num >= storage_size_ * sizeof(*storage_) * 8) {
+  if (num >= storage_size_ * kWordBits) {
     DCHECK(expandable_) << "Attempted to expand a non-expandable bitmap to position " << num;
 
     /* Round up to word boundaries for "num+1" bits */
     uint32_t new_size = BitsToWords(num + 1);
     DCHECK_GT(new_size, storage_size_);
     uint32_t *new_storage =
-        static_cast<uint32_t*>(allocator_->Alloc(new_size * sizeof(*storage_)));
-    memcpy(new_storage, storage_, storage_size_ * sizeof(*storage_));
+        static_cast<uint32_t*>(allocator_->Alloc(new_size * kWordBytes));
+    memcpy(new_storage, storage_, storage_size_ * kWordBytes);
     // Zero out the new storage words.
-    memset(&new_storage[storage_size_], 0, (new_size - storage_size_) * sizeof(*storage_));
+    memset(&new_storage[storage_size_], 0, (new_size - storage_size_) * kWordBytes);
     // TODO: collect stats on space wasted because of resize.
     storage_ = new_storage;
     storage_size_ = new_size;
@@ -103,7 +104,7 @@
 // Mark the specified bit as "unset".
 void BitVector::ClearBit(uint32_t num) {
   // If the index is over the size, we don't have to do anything, it is cleared.
-  if (num < storage_size_ * sizeof(*storage_) * 8) {
+  if (num < storage_size_ * kWordBits) {
     // Otherwise, go ahead and clear it.
     storage_[num >> 5] &= ~check_masks[num & 0x1f];
   }
@@ -132,7 +133,7 @@
   //   - Therefore, min_size goes up to at least that, we are thus comparing at least what we need to, but not less.
   //      ie. we are comparing all storage cells that could have difference, if both vectors have cells above our_highest_index,
   //          they are automatically at 0.
-  return (memcmp(storage_, src->GetRawStorage(), our_highest_index * sizeof(*storage_)) == 0);
+  return (memcmp(storage_, src->GetRawStorage(), our_highest_index * kWordBytes) == 0);
 }
 
 // Intersect with another bit vector.
@@ -180,7 +181,7 @@
     SetBit(highest_bit);
 
     // Paranoid: storage size should be big enough to hold this bit now.
-    DCHECK_LT(static_cast<uint32_t> (highest_bit), storage_size_ * sizeof(*(storage_)) * 8);
+    DCHECK_LT(static_cast<uint32_t> (highest_bit), storage_size_ * kWordBits);
   }
 
   for (uint32_t idx = 0; idx < src_size; idx++) {
@@ -215,7 +216,7 @@
     SetBit(highest_bit);
 
     // Paranoid: storage size should be big enough to hold this bit now.
-    DCHECK_LT(static_cast<uint32_t> (highest_bit), storage_size_ * sizeof(*(storage_)) * 8);
+    DCHECK_LT(static_cast<uint32_t> (highest_bit), storage_size_ * kWordBits);
   }
 
   uint32_t not_in_size = not_in->GetStorageSize();
@@ -268,14 +269,10 @@
 
 // Count the number of bits that are set in range [0, end).
 uint32_t BitVector::NumSetBits(uint32_t end) const {
-  DCHECK_LE(end, storage_size_ * sizeof(*storage_) * 8);
+  DCHECK_LE(end, storage_size_ * kWordBits);
   return NumSetBits(storage_, end);
 }
 
-BitVector::Iterator* BitVector::GetIterator() const {
-  return new (allocator_) Iterator(this);
-}
-
 /*
  * Mark specified number of bits as "set". Cannot set all bits like ClearAll
  * since there might be unused bits - setting those to one will confuse the
@@ -329,7 +326,7 @@
       }
 
       // Return cnt + how many storage units still remain * the number of bits per unit.
-      int res = cnt + (idx * (sizeof(*storage_) * 8));
+      int res = cnt + (idx * kWordBits);
       return res;
     }
   }
@@ -369,14 +366,14 @@
   SetBit(highest_bit);
 
   // Now set until highest bit's storage.
-  uint32_t size = 1 + (highest_bit / (sizeof(*storage_) * 8));
-  memcpy(storage_, src->GetRawStorage(), sizeof(*storage_) * size);
+  uint32_t size = 1 + (highest_bit / kWordBits);
+  memcpy(storage_, src->GetRawStorage(), kWordBytes * size);
 
   // Set upper bits to 0.
   uint32_t left = storage_size_ - size;
 
   if (left > 0) {
-    memset(storage_ + size, 0, sizeof(*storage_) * left);
+    memset(storage_ + size, 0, kWordBytes * left);
   }
 }
 
@@ -401,14 +398,12 @@
 
 void BitVector::Dump(std::ostream& os, const char *prefix) const {
   std::ostringstream buffer;
-  DumpHelper(buffer, prefix);
+  DumpHelper(prefix, buffer);
   os << buffer.str() << std::endl;
 }
 
-void BitVector::DumpDot(FILE* file, const char* prefix, bool last_entry) const {
-  std::ostringstream buffer;
-  Dump(buffer, prefix);
 
+void BitVector::DumpDotHelper(bool last_entry, FILE* file, std::ostringstream& buffer) const {
   // Now print it to the file.
   fprintf(file, "    {%s}", buffer.str().c_str());
 
@@ -421,7 +416,32 @@
   fprintf(file, "\\\n");
 }
 
-void BitVector::DumpHelper(std::ostringstream& buffer, const char* prefix) const {
+void BitVector::DumpDot(FILE* file, const char* prefix, bool last_entry) const {
+  std::ostringstream buffer;
+  DumpHelper(prefix, buffer);
+  DumpDotHelper(last_entry, file, buffer);
+}
+
+void BitVector::DumpIndicesDot(FILE* file, const char* prefix, bool last_entry) const {
+  std::ostringstream buffer;
+  DumpIndicesHelper(prefix, buffer);
+  DumpDotHelper(last_entry, file, buffer);
+}
+
+void BitVector::DumpIndicesHelper(const char* prefix, std::ostringstream& buffer) const {
+  // Initialize it.
+  if (prefix != nullptr) {
+    buffer << prefix;
+  }
+
+  for (size_t i = 0; i < number_of_bits_; i++) {
+    if (IsBitSet(i)) {
+      buffer << i << " ";
+    }
+  }
+}
+
+void BitVector::DumpHelper(const char* prefix, std::ostringstream& buffer) const {
   // Initialize it.
   if (prefix != nullptr) {
     buffer << prefix;
diff --git a/runtime/base/bit_vector.h b/runtime/base/bit_vector.h
index 2a68396..8f9afff 100644
--- a/runtime/base/bit_vector.h
+++ b/runtime/base/bit_vector.h
@@ -32,59 +32,115 @@
  */
 class BitVector {
   public:
-    class Iterator {
+    class IndexContainer;
+
+    /**
+     * @brief Convenient iterator across the indexes of the BitVector's set bits.
+     *
+     * @details IndexIterator is a Forward iterator (C++11: 24.2.5) from the lowest
+     * to the highest index of the BitVector's set bits. Instances can be retrieved
+     * only through BitVector::Indexes() which returns an IndexContainer wrapper
+     * object with begin() and end() suitable for range-based loops:
+     *   for (uint32_t idx : bit_vector.Indexes()) {
+     *     // Use idx.
+     *   }
+     */
+    class IndexIterator
+        : std::iterator<std::forward_iterator_tag, uint32_t, ptrdiff_t, void, uint32_t> {
       public:
-        explicit Iterator(const BitVector* bit_vector)
-          : p_bits_(bit_vector),
-            bit_storage_(bit_vector->GetRawStorage()),
-            bit_index_(0),
-            bit_size_(p_bits_->storage_size_ * sizeof(uint32_t) * 8) {}
-
-        // Return the position of the next set bit.  -1 means end-of-element reached.
-        int32_t Next() {
-          // Did anything obviously change since we started?
-          DCHECK_EQ(bit_size_, p_bits_->GetStorageSize() * sizeof(uint32_t) * 8);
-          DCHECK_EQ(bit_storage_, p_bits_->GetRawStorage());
-
-          if (UNLIKELY(bit_index_ >= bit_size_)) {
-            return -1;
-          }
-
-          uint32_t word_index = bit_index_ / 32;
-          uint32_t word = bit_storage_[word_index];
-          // Mask out any bits in the first word we've already considered.
-          word >>= bit_index_ & 0x1f;
-          if (word == 0) {
-            bit_index_ &= ~0x1f;
-            do {
-              word_index++;
-              if (UNLIKELY((word_index * 32) >= bit_size_)) {
-                bit_index_ = bit_size_;
-                return -1;
-              }
-              word = bit_storage_[word_index];
-              bit_index_ += 32;
-            } while (word == 0);
-          }
-          bit_index_ += CTZ(word) + 1;
-          return bit_index_ - 1;
+        bool operator==(const IndexIterator& other) const {
+          DCHECK(bit_storage_ == other.bit_storage_);
+          DCHECK_EQ(storage_size_, other.storage_size_);
+          return bit_index_ == other.bit_index_;
         }
 
-        static void* operator new(size_t size, Allocator* allocator) {
-          return allocator->Alloc(sizeof(BitVector::Iterator));
-        };
-        static void operator delete(void* p) {
-          Iterator* it = reinterpret_cast<Iterator*>(p);
-          it->p_bits_->allocator_->Free(p);
+        bool operator!=(const IndexIterator& other) const {
+          return !(*this == other);
+        }
+
+        int operator*() const {
+          DCHECK_LT(bit_index_, BitSize());
+          return bit_index_;
+        }
+
+        IndexIterator& operator++() {
+          DCHECK_LT(bit_index_, BitSize());
+          bit_index_ = FindIndex(bit_index_ + 1u);
+          return *this;
+        }
+
+        IndexIterator operator++(int) {
+          IndexIterator result(*this);
+          ++*this;
+          return result;
+        }
+
+        // Helper function to check for end without comparing with bit_vector.Indexes().end().
+        bool Done() const {
+          return bit_index_ == BitSize();
         }
 
       private:
-        const BitVector* const p_bits_;
-        const uint32_t* const bit_storage_;
-        uint32_t bit_index_;           // Current index (size in bits).
-        const uint32_t bit_size_;      // Size of vector in bits.
+        struct begin_tag { };
+        struct end_tag { };
 
-        friend class BitVector;
+        IndexIterator(const BitVector* bit_vector, begin_tag)
+          : bit_storage_(bit_vector->GetRawStorage()),
+            storage_size_(bit_vector->storage_size_),
+            bit_index_(FindIndex(0u)) { }
+
+        IndexIterator(const BitVector* bit_vector, end_tag)
+          : bit_storage_(bit_vector->GetRawStorage()),
+            storage_size_(bit_vector->storage_size_),
+            bit_index_(BitSize()) { }
+
+        uint32_t BitSize() const {
+          return storage_size_ * kWordBits;
+        }
+
+        uint32_t FindIndex(uint32_t start_index) const {
+          DCHECK_LE(start_index, BitSize());
+          uint32_t word_index = start_index / kWordBits;
+          if (UNLIKELY(word_index == storage_size_)) {
+            return start_index;
+          }
+          uint32_t word = bit_storage_[word_index];
+          // Mask out any bits in the first word we've already considered.
+          word &= static_cast<uint32_t>(-1) << (start_index & 0x1f);
+          while (word == 0u) {
+            ++word_index;
+            if (UNLIKELY(word_index == storage_size_)) {
+              return BitSize();
+            }
+            word = bit_storage_[word_index];
+          }
+          return word_index * 32u + CTZ(word);
+        }
+
+        const uint32_t* const bit_storage_;
+        const uint32_t storage_size_;  // Size of vector in words.
+        uint32_t bit_index_;           // Current index (size in bits).
+
+        friend class BitVector::IndexContainer;
+    };
+
+    /**
+     * @brief BitVector wrapper class for iteration across indexes of set bits.
+     */
+    class IndexContainer {
+     public:
+      explicit IndexContainer(const BitVector* bit_vector) : bit_vector_(bit_vector) { }
+
+      IndexIterator begin() const {
+        return IndexIterator(bit_vector_, IndexIterator::begin_tag());
+      }
+
+      IndexIterator end() const {
+        return IndexIterator(bit_vector_, IndexIterator::end_tag());
+      }
+
+     private:
+      const BitVector* const bit_vector_;
     };
 
     BitVector(uint32_t start_bits,
@@ -127,14 +183,16 @@
     // Number of bits set in range [0, end).
     uint32_t NumSetBits(uint32_t end) const;
 
-    Iterator* GetIterator() const;
+    IndexContainer Indexes() const {
+      return IndexContainer(this);
+    }
 
     uint32_t GetStorageSize() const { return storage_size_; }
     bool IsExpandable() const { return expandable_; }
     uint32_t GetRawStorageWord(size_t idx) const { return storage_[idx]; }
     uint32_t* GetRawStorage() { return storage_; }
     const uint32_t* GetRawStorage() const { return storage_; }
-    size_t GetSizeOf() const { return storage_size_ * sizeof(uint32_t); }
+    size_t GetSizeOf() const { return storage_size_ * kWordBytes; }
 
     /**
      * @return the highest bit set, -1 if none are set
@@ -149,12 +207,42 @@
     bool EnsureSizeAndClear(unsigned int num);
 
     void Dump(std::ostream& os, const char* prefix) const;
+
+    /**
+     * @brief Dump the bitvector in .dot format; last_entry indicates whether this is the last entry dumped.
+     * @details If it is not the last entry, a "|" is appended to the dump.
+     */
     void DumpDot(FILE* file, const char* prefix, bool last_entry = false) const;
 
+    /**
+     * @brief Dump the set-bit indices in .dot format; last_entry indicates whether this is the last entry dumped.
+     * @details If it is not the last entry, a "|" is appended to the dump.
+     */
+    void DumpIndicesDot(FILE* file, const char* prefix, bool last_entry = false) const;
+
   protected:
-    void DumpHelper(std::ostringstream& buffer, const char* prefix) const;
+    /**
+     * @brief Dump the bitvector into buffer in a 00101..01 format.
+     * @param buffer the ostringstream used to dump the bitvector into.
+     */
+    void DumpHelper(const char* prefix, std::ostringstream& buffer) const;
+
+    /**
+     * @brief Dump the bitvector in a 1 2 5 8 format, where the numbers are the indices of the set bits.
+     * @param buffer the ostringstream used to dump the bitvector into.
+     */
+    void DumpIndicesHelper(const char* prefix, std::ostringstream& buffer) const;
+
+    /**
+     * @brief Wrapper to perform the bitvector dumping with the .dot format.
+     * @param buffer the ostringstream used to dump the bitvector into.
+     */
+    void DumpDotHelper(bool last_entry, FILE* file, std::ostringstream& buffer) const;
 
   private:
+    static constexpr uint32_t kWordBytes = sizeof(uint32_t);
+    static constexpr uint32_t kWordBits = kWordBytes * 8;
+
     Allocator* const allocator_;
     const bool expandable_;         // expand bitmap if we run out?
     uint32_t   storage_size_;       // current size, in 32-bit words.
diff --git a/runtime/base/bit_vector_test.cc b/runtime/base/bit_vector_test.cc
index 0f866a4..1403f50 100644
--- a/runtime/base/bit_vector_test.cc
+++ b/runtime/base/bit_vector_test.cc
@@ -38,11 +38,8 @@
   EXPECT_EQ(0U, bv.GetRawStorageWord(0));
   EXPECT_EQ(0U, *bv.GetRawStorage());
 
-  BitVector::Iterator empty_iterator(&bv);
-  EXPECT_EQ(-1, empty_iterator.Next());
-
-  std::unique_ptr<BitVector::Iterator> empty_iterator_on_heap(bv.GetIterator());
-  EXPECT_EQ(-1, empty_iterator_on_heap->Next());
+  EXPECT_TRUE(bv.Indexes().begin().Done());
+  EXPECT_TRUE(bv.Indexes().begin() == bv.Indexes().end());
 
   bv.SetBit(0);
   bv.SetBit(kBits - 1);
@@ -57,10 +54,14 @@
   EXPECT_EQ(0x80000001U, bv.GetRawStorageWord(0));
   EXPECT_EQ(0x80000001U, *bv.GetRawStorage());
 
-  BitVector::Iterator iterator(&bv);
-  EXPECT_EQ(0, iterator.Next());
-  EXPECT_EQ(static_cast<int>(kBits - 1), iterator.Next());
-  EXPECT_EQ(-1, iterator.Next());
+  BitVector::IndexIterator iterator = bv.Indexes().begin();
+  EXPECT_TRUE(iterator != bv.Indexes().end());
+  EXPECT_EQ(0, *iterator);
+  ++iterator;
+  EXPECT_TRUE(iterator != bv.Indexes().end());
+  EXPECT_EQ(static_cast<int>(kBits - 1), *iterator);
+  ++iterator;
+  EXPECT_TRUE(iterator == bv.Indexes().end());
 }
 
 TEST(BitVector, NoopAllocator) {
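For reference, the new IndexContainer/IndexIterator pair follows the usual begin()/end() protocol, so callers can walk the set bits with an ordinary loop instead of the removed GetIterator()/Next() idiom. A minimal sketch (not part of this change, assuming range-based for works through IndexContainer::begin()/end() exactly as the updated test exercises):

    #include <vector>
    #include "base/bit_vector.h"

    // Collect the indices of all set bits via the new Indexes() API; for the
    // 32-bit vector built in the test above this would yield {0, 31}.
    std::vector<uint32_t> CollectSetBits(const art::BitVector& bv) {
      std::vector<uint32_t> indexes;
      for (uint32_t idx : bv.Indexes()) {  // replaces the old GetIterator()/Next() loop
        indexes.push_back(idx);
      }
      return indexes;
    }
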
diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h
index 6c415e7..a9472f7 100644
--- a/runtime/base/mutex-inl.h
+++ b/runtime/base/mutex-inl.h
@@ -146,7 +146,7 @@
           // Ignore logging which may or may not have set up thread data structures.
           level == kLoggingLock ||
           // Avoid recursive death.
-          level == kAbortLock);
+          level == kAbortLock) << level;
   }
 }
 
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 705be40..d2b4e01 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -816,7 +816,7 @@
 void Locks::Init() {
   if (logging_lock_ != nullptr) {
     // Already initialized.
-    if (kRuntimeISA == kX86) {
+    if (kRuntimeISA == kX86 || kRuntimeISA == kX86_64) {
       DCHECK(modify_ldt_lock_ != nullptr);
     } else {
       DCHECK(modify_ldt_lock_ == nullptr);
@@ -877,7 +877,7 @@
     DCHECK(allocated_thread_ids_lock_ == nullptr);
     allocated_thread_ids_lock_ =  new Mutex("allocated thread ids lock", current_lock_level);
 
-    if (kRuntimeISA == kX86) {
+    if (kRuntimeISA == kX86 || kRuntimeISA == kX86_64) {
       UPDATE_CURRENT_LOCK_LEVEL(kModifyLdtLock);
       DCHECK(modify_ldt_lock_ == nullptr);
       modify_ldt_lock_ = new Mutex("modify_ldt lock", current_lock_level);
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index afff7a2..b9c42ee 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -563,8 +563,7 @@
                                   const char* oat_cache_filename,
                                   std::string* error_msg) {
   Locks::mutator_lock_->AssertNotHeld(Thread::Current());  // Avoid starving GC.
-  std::string dex2oat(GetAndroidRoot());
-  dex2oat += (kIsDebugBuild ? "/bin/dex2oatd" : "/bin/dex2oat");
+  std::string dex2oat(Runtime::Current()->GetCompilerExecutable());
 
   gc::Heap* heap = Runtime::Current()->GetHeap();
   std::string boot_image_option("--boot-image=");
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 7136c67..984f287 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -2881,7 +2881,7 @@
   Handle<mirror::ClassLoader> class_loader(hs.NewHandle(mh.GetClassLoader()));
   verifier::MethodVerifier verifier(&mh.GetDexFile(), &dex_cache, &class_loader,
                                     &mh.GetClassDef(), code_item, m->GetDexMethodIndex(), m,
-                                    m->GetAccessFlags(), false, true);
+                                    m->GetAccessFlags(), false, true, false);
   // Note: we don't need to verify the method.
   return InlineMethodAnalyser::AnalyseMethodCode(&verifier, nullptr);
 }
@@ -4039,7 +4039,11 @@
   // Send a series of heap segment chunks.
   HeapChunkContext context((what == HPSG_WHAT_MERGED_OBJECTS), native);
   if (native) {
+#ifdef USE_DLMALLOC
     dlmalloc_inspect_all(HeapChunkContext::HeapChunkCallback, &context);
+#else
+    UNIMPLEMENTED(WARNING) << "Native heap inspection is only supported with dlmalloc";
+#endif
   } else {
     gc::Heap* heap = Runtime::Current()->GetHeap();
     const std::vector<gc::space::ContinuousSpace*>& spaces = heap->GetContinuousSpaces();
diff --git a/runtime/deoptimize_stack_visitor.cc b/runtime/deoptimize_stack_visitor.cc
index c7fbc87..449ccce 100644
--- a/runtime/deoptimize_stack_visitor.cc
+++ b/runtime/deoptimize_stack_visitor.cc
@@ -55,7 +55,7 @@
   Handle<mirror::ClassLoader> class_loader(hs.NewHandle(mh.GetClassLoader()));
   verifier::MethodVerifier verifier(&mh.GetDexFile(), &dex_cache, &class_loader,
                                     &mh.GetClassDef(), code_item, m->GetDexMethodIndex(), m,
-                                    m->GetAccessFlags(), false, true);
+                                    m->GetAccessFlags(), false, true, true);
   verifier.Verify();
   std::vector<int32_t> kinds = verifier.DescribeVRegs(dex_pc);
   for (uint16_t reg = 0; reg < num_regs; ++reg) {
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index 43ae308..3ff55ab 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -245,7 +245,7 @@
   if (zip_entry.get() == NULL) {
     return nullptr;
   }
-  std::unique_ptr<MemMap> map(zip_entry->ExtractToMemMap(kClassesDex, error_msg));
+  std::unique_ptr<MemMap> map(zip_entry->ExtractToMemMap(location.c_str(), kClassesDex, error_msg));
   if (map.get() == NULL) {
     *error_msg = StringPrintf("Failed to extract '%s' from '%s': %s", kClassesDex, location.c_str(),
                               error_msg->c_str());
@@ -258,15 +258,15 @@
                               error_msg->c_str());
     return nullptr;
   }
-  if (!DexFileVerifier::Verify(dex_file.get(), dex_file->Begin(), dex_file->Size(),
-                               location.c_str(), error_msg)) {
-    return nullptr;
-  }
   if (!dex_file->DisableWrite()) {
     *error_msg = StringPrintf("Failed to make dex file '%s' read only", location.c_str());
     return nullptr;
   }
   CHECK(dex_file->IsReadOnly()) << location;
+  if (!DexFileVerifier::Verify(dex_file.get(), dex_file->Begin(), dex_file->Size(),
+                               location.c_str(), error_msg)) {
+    return nullptr;
+  }
   return dex_file.release();
 }
 
diff --git a/runtime/dex_file_verifier.cc b/runtime/dex_file_verifier.cc
index a1c8c71..17d1ffc 100644
--- a/runtime/dex_file_verifier.cc
+++ b/runtime/dex_file_verifier.cc
@@ -698,6 +698,7 @@
   const byte* file_end = begin_ + size_;
 
   for (uint32_t i = 0; i < size; i++) {
+    CHECK_LT(i, size);  // b/15014252 Prevents hitting the impossible case below
     if (UNLIKELY(ptr_ >= file_end)) {
       ErrorStringPrintf("String data would go beyond end-of-file");
       return false;
@@ -710,6 +711,7 @@
       case 0x00:
         // Special case of bit pattern 0xxx.
         if (UNLIKELY(byte == 0)) {
+          CHECK_LT(i, size);  // b/15014252 Actually hit this impossible case with clang
           ErrorStringPrintf("String data shorter than indicated utf16_size %x", size);
           return false;
         }
diff --git a/runtime/dex_instruction.h b/runtime/dex_instruction.h
index 560e5ff..1ff5c19 100644
--- a/runtime/dex_instruction.h
+++ b/runtime/dex_instruction.h
@@ -118,13 +118,30 @@
   };
 
   enum Flags {
-    kBranch   = 0x01,  // conditional or unconditional branch
-    kContinue = 0x02,  // flow can continue to next statement
-    kSwitch   = 0x04,  // switch statement
-    kThrow    = 0x08,  // could cause an exception to be thrown
-    kReturn   = 0x10,  // returns, no additional statements
-    kInvoke   = 0x20,  // a flavor of invoke
-    kUnconditional = 0x40,  // unconditional branch
+    kBranch              = 0x000001,  // conditional or unconditional branch
+    kContinue            = 0x000002,  // flow can continue to next statement
+    kSwitch              = 0x000004,  // switch statement
+    kThrow               = 0x000008,  // could cause an exception to be thrown
+    kReturn              = 0x000010,  // returns, no additional statements
+    kInvoke              = 0x000020,  // a flavor of invoke
+    kUnconditional       = 0x000040,  // unconditional branch
+    kAdd                 = 0x000080,  // addition
+    kSubtract            = 0x000100,  // subtract
+    kMultiply            = 0x000200,  // multiply
+    kDivide              = 0x000400,  // division
+    kRemainder           = 0x000800,  // remainder
+    kAnd                 = 0x001000,  // and
+    kOr                  = 0x002000,  // or
+    kXor                 = 0x004000,  // xor
+    kShl                 = 0x008000,  // shl
+    kShr                 = 0x010000,  // shr
+    kUshr                = 0x020000,  // ushr
+    kCast                = 0x040000,  // cast
+    kStore               = 0x080000,  // store opcode
+    kLoad                = 0x100000,  // load opcode
+    kClobber             = 0x200000,  // clobbers memory in a big way (not just a write)
+    kRegCFieldOrConstant = 0x400000,  // is the third virtual register a field or literal constant (vC)
+    kRegBFieldOrConstant = 0x800000,  // is the second virtual register a field or literal constant (vB)
   };
 
   enum VerifyFlag {
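The widened Flags enum lets analysis passes ask semantic questions about an opcode (is it an add, a load, a divide that can throw, does vB or vC hold a field index or literal) with plain bitmask tests. A hedged sketch of such queries; opcode_flags stands for the per-opcode Flags value however the caller obtains it, since no accessor is added in this hunk:

    // Integer div/rem carry kThrow (divide-by-zero); the float/double variants do not.
    static bool IsThrowingDivOrRem(uint32_t opcode_flags) {
      return (opcode_flags & (Instruction::kDivide | Instruction::kRemainder)) != 0 &&
             (opcode_flags & Instruction::kThrow) != 0;
    }

    // True for opcodes such as add-int/lit8 or iget, where vC names a literal
    // constant or a field index rather than an ordinary register.
    static bool RegCIsFieldOrConstant(uint32_t opcode_flags) {
      return (opcode_flags & Instruction::kRegCFieldOrConstant) != 0;
    }
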
diff --git a/runtime/dex_instruction_list.h b/runtime/dex_instruction_list.h
index c2cd65a..f43e42f 100644
--- a/runtime/dex_instruction_list.h
+++ b/runtime/dex_instruction_list.h
@@ -36,27 +36,27 @@
   V(0x0F, RETURN, "return", k11x, false, kNone, kReturn, kVerifyRegA) \
   V(0x10, RETURN_WIDE, "return-wide", k11x, false, kNone, kReturn, kVerifyRegAWide) \
   V(0x11, RETURN_OBJECT, "return-object", k11x, false, kNone, kReturn, kVerifyRegA) \
-  V(0x12, CONST_4, "const/4", k11n, true, kNone, kContinue, kVerifyRegA) \
-  V(0x13, CONST_16, "const/16", k21s, true, kNone, kContinue, kVerifyRegA) \
-  V(0x14, CONST, "const", k31i, true, kNone, kContinue, kVerifyRegA) \
-  V(0x15, CONST_HIGH16, "const/high16", k21h, true, kNone, kContinue, kVerifyRegA) \
-  V(0x16, CONST_WIDE_16, "const-wide/16", k21s, true, kNone, kContinue, kVerifyRegAWide) \
-  V(0x17, CONST_WIDE_32, "const-wide/32", k31i, true, kNone, kContinue, kVerifyRegAWide) \
-  V(0x18, CONST_WIDE, "const-wide", k51l, true, kNone, kContinue, kVerifyRegAWide) \
-  V(0x19, CONST_WIDE_HIGH16, "const-wide/high16", k21h, true, kNone, kContinue, kVerifyRegAWide) \
+  V(0x12, CONST_4, "const/4", k11n, true, kNone, kContinue | kRegBFieldOrConstant, kVerifyRegA) \
+  V(0x13, CONST_16, "const/16", k21s, true, kNone, kContinue | kRegBFieldOrConstant, kVerifyRegA) \
+  V(0x14, CONST, "const", k31i, true, kNone, kContinue | kRegBFieldOrConstant, kVerifyRegA) \
+  V(0x15, CONST_HIGH16, "const/high16", k21h, true, kNone, kContinue | kRegBFieldOrConstant, kVerifyRegA) \
+  V(0x16, CONST_WIDE_16, "const-wide/16", k21s, true, kNone, kContinue | kRegBFieldOrConstant, kVerifyRegAWide) \
+  V(0x17, CONST_WIDE_32, "const-wide/32", k31i, true, kNone, kContinue | kRegBFieldOrConstant, kVerifyRegAWide) \
+  V(0x18, CONST_WIDE, "const-wide", k51l, true, kNone, kContinue | kRegBFieldOrConstant, kVerifyRegAWide) \
+  V(0x19, CONST_WIDE_HIGH16, "const-wide/high16", k21h, true, kNone, kContinue | kRegBFieldOrConstant, kVerifyRegAWide) \
   V(0x1A, CONST_STRING, "const-string", k21c, true, kStringRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBString) \
   V(0x1B, CONST_STRING_JUMBO, "const-string/jumbo", k31c, true, kStringRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBString) \
   V(0x1C, CONST_CLASS, "const-class", k21c, true, kTypeRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBType) \
-  V(0x1D, MONITOR_ENTER, "monitor-enter", k11x, false, kNone, kContinue | kThrow, kVerifyRegA) \
-  V(0x1E, MONITOR_EXIT, "monitor-exit", k11x, false, kNone, kContinue | kThrow, kVerifyRegA) \
+  V(0x1D, MONITOR_ENTER, "monitor-enter", k11x, false, kNone, kContinue | kThrow | kClobber, kVerifyRegA) \
+  V(0x1E, MONITOR_EXIT, "monitor-exit", k11x, false, kNone, kContinue | kThrow | kClobber, kVerifyRegA) \
   V(0x1F, CHECK_CAST, "check-cast", k21c, true, kTypeRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBType) \
   V(0x20, INSTANCE_OF, "instance-of", k22c, true, kTypeRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCType) \
   V(0x21, ARRAY_LENGTH, "array-length", k12x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0x22, NEW_INSTANCE, "new-instance", k21c, true, kTypeRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBNewInstance) \
-  V(0x23, NEW_ARRAY, "new-array", k22c, true, kTypeRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCNewArray) \
-  V(0x24, FILLED_NEW_ARRAY, "filled-new-array", k35c, false, kTypeRef, kContinue | kThrow, kVerifyRegBType | kVerifyVarArg) \
-  V(0x25, FILLED_NEW_ARRAY_RANGE, "filled-new-array/range", k3rc, false, kTypeRef, kContinue | kThrow, kVerifyRegBType | kVerifyVarArgRange) \
-  V(0x26, FILL_ARRAY_DATA, "fill-array-data", k31t, false, kNone, kContinue | kThrow, kVerifyRegA | kVerifyArrayData) \
+  V(0x22, NEW_INSTANCE, "new-instance", k21c, true, kTypeRef, kContinue | kThrow | kClobber, kVerifyRegA | kVerifyRegBNewInstance) \
+  V(0x23, NEW_ARRAY, "new-array", k22c, true, kTypeRef, kContinue | kThrow | kClobber, kVerifyRegA | kVerifyRegB | kVerifyRegCNewArray) \
+  V(0x24, FILLED_NEW_ARRAY, "filled-new-array", k35c, false, kTypeRef, kContinue | kThrow | kClobber, kVerifyRegBType | kVerifyVarArg) \
+  V(0x25, FILLED_NEW_ARRAY_RANGE, "filled-new-array/range", k3rc, false, kTypeRef, kContinue | kThrow | kClobber, kVerifyRegBType | kVerifyVarArgRange) \
+  V(0x26, FILL_ARRAY_DATA, "fill-array-data", k31t, false, kNone, kContinue | kThrow | kClobber, kVerifyRegA | kVerifyArrayData) \
   V(0x27, THROW, "throw", k11x, false, kNone, kThrow, kVerifyRegA) \
   V(0x28, GOTO, "goto", k10t, false, kNone, kBranch | kUnconditional, kVerifyBranchTarget) \
   V(0x29, GOTO_16, "goto/16", k20t, false, kNone, kBranch | kUnconditional, kVerifyBranchTarget) \
@@ -86,48 +86,48 @@
   V(0x41, UNUSED_41, "unused-41", k10x, false, kUnknown, 0, kVerifyError) \
   V(0x42, UNUSED_42, "unused-42", k10x, false, kUnknown, 0, kVerifyError) \
   V(0x43, UNUSED_43, "unused-43", k10x, false, kUnknown, 0, kVerifyError) \
-  V(0x44, AGET, "aget", k23x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x45, AGET_WIDE, "aget-wide", k23x, true, kNone, kContinue | kThrow, kVerifyRegAWide | kVerifyRegB | kVerifyRegC) \
-  V(0x46, AGET_OBJECT, "aget-object", k23x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x47, AGET_BOOLEAN, "aget-boolean", k23x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x48, AGET_BYTE, "aget-byte", k23x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x49, AGET_CHAR, "aget-char", k23x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4A, AGET_SHORT, "aget-short", k23x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4B, APUT, "aput", k23x, false, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4C, APUT_WIDE, "aput-wide", k23x, false, kNone, kContinue | kThrow, kVerifyRegAWide | kVerifyRegB | kVerifyRegC) \
-  V(0x4D, APUT_OBJECT, "aput-object", k23x, false, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4E, APUT_BOOLEAN, "aput-boolean", k23x, false, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x4F, APUT_BYTE, "aput-byte", k23x, false, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x50, APUT_CHAR, "aput-char", k23x, false, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x51, APUT_SHORT, "aput-short", k23x, false, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x52, IGET, "iget", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x53, IGET_WIDE, "iget-wide", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegAWide | kVerifyRegB | kVerifyRegCField) \
-  V(0x54, IGET_OBJECT, "iget-object", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x55, IGET_BOOLEAN, "iget-boolean", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x56, IGET_BYTE, "iget-byte", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x57, IGET_CHAR, "iget-char", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x58, IGET_SHORT, "iget-short", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x59, IPUT, "iput", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5A, IPUT_WIDE, "iput-wide", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegAWide | kVerifyRegB | kVerifyRegCField) \
-  V(0x5B, IPUT_OBJECT, "iput-object", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5C, IPUT_BOOLEAN, "iput-boolean", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5D, IPUT_BYTE, "iput-byte", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5E, IPUT_CHAR, "iput-char", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x5F, IPUT_SHORT, "iput-short", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
-  V(0x60, SGET, "sget", k21c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x61, SGET_WIDE, "sget-wide", k21c, true, kFieldRef, kContinue | kThrow, kVerifyRegAWide | kVerifyRegBField) \
-  V(0x62, SGET_OBJECT, "sget-object", k21c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x63, SGET_BOOLEAN, "sget-boolean", k21c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x64, SGET_BYTE, "sget-byte", k21c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x65, SGET_CHAR, "sget-char", k21c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x66, SGET_SHORT, "sget-short", k21c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x67, SPUT, "sput", k21c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x68, SPUT_WIDE, "sput-wide", k21c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x69, SPUT_OBJECT, "sput-object", k21c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x6A, SPUT_BOOLEAN, "sput-boolean", k21c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x6B, SPUT_BYTE, "sput-byte", k21c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x6C, SPUT_CHAR, "sput-char", k21c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
-  V(0x6D, SPUT_SHORT, "sput-short", k21c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegBField) \
+  V(0x44, AGET, "aget", k23x, true, kNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x45, AGET_WIDE, "aget-wide", k23x, true, kNone, kContinue | kThrow | kLoad, kVerifyRegAWide | kVerifyRegB | kVerifyRegC) \
+  V(0x46, AGET_OBJECT, "aget-object", k23x, true, kNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x47, AGET_BOOLEAN, "aget-boolean", k23x, true, kNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x48, AGET_BYTE, "aget-byte", k23x, true, kNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x49, AGET_CHAR, "aget-char", k23x, true, kNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4A, AGET_SHORT, "aget-short", k23x, true, kNone, kContinue | kThrow | kLoad, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4B, APUT, "aput", k23x, false, kNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4C, APUT_WIDE, "aput-wide", k23x, false, kNone, kContinue | kThrow | kStore, kVerifyRegAWide | kVerifyRegB | kVerifyRegC) \
+  V(0x4D, APUT_OBJECT, "aput-object", k23x, false, kNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4E, APUT_BOOLEAN, "aput-boolean", k23x, false, kNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x4F, APUT_BYTE, "aput-byte", k23x, false, kNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x50, APUT_CHAR, "aput-char", k23x, false, kNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x51, APUT_SHORT, "aput-short", k23x, false, kNone, kContinue | kThrow | kStore, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x52, IGET, "iget", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x53, IGET_WIDE, "iget-wide", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRegCField) \
+  V(0x54, IGET_OBJECT, "iget-object", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x55, IGET_BOOLEAN, "iget-boolean", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x56, IGET_BYTE, "iget-byte", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x57, IGET_CHAR, "iget-char", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x58, IGET_SHORT, "iget-short", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x59, IPUT, "iput", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5A, IPUT_WIDE, "iput-wide", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRegCField) \
+  V(0x5B, IPUT_OBJECT, "iput-object", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5C, IPUT_BOOLEAN, "iput-boolean", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5D, IPUT_BYTE, "iput-byte", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5E, IPUT_CHAR, "iput-char", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x5F, IPUT_SHORT, "iput-short", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRegCField) \
+  V(0x60, SGET, "sget", k21c, true, kFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x61, SGET_WIDE, "sget-wide", k21c, true, kFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegAWide | kVerifyRegBField) \
+  V(0x62, SGET_OBJECT, "sget-object", k21c, true, kFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x63, SGET_BOOLEAN, "sget-boolean", k21c, true, kFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x64, SGET_BYTE, "sget-byte", k21c, true, kFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x65, SGET_CHAR, "sget-char", k21c, true, kFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x66, SGET_SHORT, "sget-short", k21c, true, kFieldRef, kContinue | kThrow | kLoad | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x67, SPUT, "sput", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x68, SPUT_WIDE, "sput-wide", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x69, SPUT_OBJECT, "sput-object", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6A, SPUT_BOOLEAN, "sput-boolean", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6B, SPUT_BYTE, "sput-byte", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6C, SPUT_CHAR, "sput-char", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
+  V(0x6D, SPUT_SHORT, "sput-short", k21c, false, kFieldRef, kContinue | kThrow | kStore | kRegBFieldOrConstant, kVerifyRegA | kVerifyRegBField) \
   V(0x6E, INVOKE_VIRTUAL, "invoke-virtual", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \
   V(0x6F, INVOKE_SUPER, "invoke-super", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \
   V(0x70, INVOKE_DIRECT, "invoke-direct", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyRegBMethod | kVerifyVarArg) \
@@ -147,110 +147,110 @@
   V(0x7E, NOT_LONG, "not-long", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
   V(0x7F, NEG_FLOAT, "neg-float", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
   V(0x80, NEG_DOUBLE, "neg-double", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x81, INT_TO_LONG, "int-to-long", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegB) \
-  V(0x82, INT_TO_FLOAT, "int-to-float", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x83, INT_TO_DOUBLE, "int-to-double", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegB) \
-  V(0x84, LONG_TO_INT, "long-to-int", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegBWide) \
-  V(0x85, LONG_TO_FLOAT, "long-to-float", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegBWide) \
-  V(0x86, LONG_TO_DOUBLE, "long-to-double", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x87, FLOAT_TO_INT, "float-to-int", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x88, FLOAT_TO_LONG, "float-to-long", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegB) \
-  V(0x89, FLOAT_TO_DOUBLE, "float-to-double", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegB) \
-  V(0x8A, DOUBLE_TO_INT, "double-to-int", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegBWide) \
-  V(0x8B, DOUBLE_TO_LONG, "double-to-long", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0x8C, DOUBLE_TO_FLOAT, "double-to-float", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegBWide) \
-  V(0x8D, INT_TO_BYTE, "int-to-byte", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x8E, INT_TO_CHAR, "int-to-char", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x8F, INT_TO_SHORT, "int-to-short", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0x90, ADD_INT, "add-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x91, SUB_INT, "sub-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x92, MUL_INT, "mul-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x93, DIV_INT, "div-int", k23x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x94, REM_INT, "rem-int", k23x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x95, AND_INT, "and-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x96, OR_INT, "or-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x97, XOR_INT, "xor-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x98, SHL_INT, "shl-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x99, SHR_INT, "shr-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x9A, USHR_INT, "ushr-int", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0x9B, ADD_LONG, "add-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x9C, SUB_LONG, "sub-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x9D, MUL_LONG, "mul-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x9E, DIV_LONG, "div-long", k23x, true, kNone, kContinue | kThrow, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0x9F, REM_LONG, "rem-long", k23x, true, kNone, kContinue | kThrow, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xA0, AND_LONG, "and-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xA1, OR_LONG, "or-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xA2, XOR_LONG, "xor-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xA3, SHL_LONG, "shl-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
-  V(0xA4, SHR_LONG, "shr-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
-  V(0xA5, USHR_LONG, "ushr-long", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
-  V(0xA6, ADD_FLOAT, "add-float", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xA7, SUB_FLOAT, "sub-float", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xA8, MUL_FLOAT, "mul-float", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xA9, DIV_FLOAT, "div-float", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xAA, REM_FLOAT, "rem-float", k23x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
-  V(0xAB, ADD_DOUBLE, "add-double", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xAC, SUB_DOUBLE, "sub-double", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xAD, MUL_DOUBLE, "mul-double", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xAE, DIV_DOUBLE, "div-double", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xAF, REM_DOUBLE, "rem-double", k23x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
-  V(0xB0, ADD_INT_2ADDR, "add-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xB1, SUB_INT_2ADDR, "sub-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xB2, MUL_INT_2ADDR, "mul-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xB3, DIV_INT_2ADDR, "div-int/2addr", k12x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xB4, REM_INT_2ADDR, "rem-int/2addr", k12x, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xB5, AND_INT_2ADDR, "and-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xB6, OR_INT_2ADDR, "or-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xB7, XOR_INT_2ADDR, "xor-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xB8, SHL_INT_2ADDR, "shl-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xB9, SHR_INT_2ADDR, "shr-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xBA, USHR_INT_2ADDR, "ushr-int/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xBB, ADD_LONG_2ADDR, "add-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xBC, SUB_LONG_2ADDR, "sub-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xBD, MUL_LONG_2ADDR, "mul-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xBE, DIV_LONG_2ADDR, "div-long/2addr", k12x, true, kNone, kContinue | kThrow, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xBF, REM_LONG_2ADDR, "rem-long/2addr", k12x, true, kNone, kContinue | kThrow, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xC0, AND_LONG_2ADDR, "and-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xC1, OR_LONG_2ADDR, "or-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xC2, XOR_LONG_2ADDR, "xor-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xC3, SHL_LONG_2ADDR, "shl-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegB) \
-  V(0xC4, SHR_LONG_2ADDR, "shr-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegB) \
-  V(0xC5, USHR_LONG_2ADDR, "ushr-long/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegB) \
-  V(0xC6, ADD_FLOAT_2ADDR, "add-float/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xC7, SUB_FLOAT_2ADDR, "sub-float/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xC8, MUL_FLOAT_2ADDR, "mul-float/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xC9, DIV_FLOAT_2ADDR, "div-float/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xCA, REM_FLOAT_2ADDR, "rem-float/2addr", k12x, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xCB, ADD_DOUBLE_2ADDR, "add-double/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xCC, SUB_DOUBLE_2ADDR, "sub-double/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xCD, MUL_DOUBLE_2ADDR, "mul-double/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xCE, DIV_DOUBLE_2ADDR, "div-double/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xCF, REM_DOUBLE_2ADDR, "rem-double/2addr", k12x, true, kNone, kContinue, kVerifyRegAWide | kVerifyRegBWide) \
-  V(0xD0, ADD_INT_LIT16, "add-int/lit16", k22s, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xD1, RSUB_INT, "rsub-int", k22s, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xD2, MUL_INT_LIT16, "mul-int/lit16", k22s, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xD3, DIV_INT_LIT16, "div-int/lit16", k22s, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xD4, REM_INT_LIT16, "rem-int/lit16", k22s, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xD5, AND_INT_LIT16, "and-int/lit16", k22s, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xD6, OR_INT_LIT16, "or-int/lit16", k22s, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xD7, XOR_INT_LIT16, "xor-int/lit16", k22s, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xD8, ADD_INT_LIT8, "add-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xD9, RSUB_INT_LIT8, "rsub-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xDA, MUL_INT_LIT8, "mul-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xDB, DIV_INT_LIT8, "div-int/lit8", k22b, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xDC, REM_INT_LIT8, "rem-int/lit8", k22b, true, kNone, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xDD, AND_INT_LIT8, "and-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xDE, OR_INT_LIT8, "or-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xDF, XOR_INT_LIT8, "xor-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xE0, SHL_INT_LIT8, "shl-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xE1, SHR_INT_LIT8, "shr-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xE2, USHR_INT_LIT8, "ushr-int/lit8", k22b, true, kNone, kContinue, kVerifyRegA | kVerifyRegB) \
-  V(0xE3, IGET_QUICK, "iget-quick", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xE4, IGET_WIDE_QUICK, "iget-wide-quick", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegAWide | kVerifyRegB) \
-  V(0xE5, IGET_OBJECT_QUICK, "iget-object-quick", k22c, true, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xE6, IPUT_QUICK, "iput-quick", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
-  V(0xE7, IPUT_WIDE_QUICK, "iput-wide-quick", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegAWide | kVerifyRegB) \
-  V(0xE8, IPUT_OBJECT_QUICK, "iput-object-quick", k22c, false, kFieldRef, kContinue | kThrow, kVerifyRegA | kVerifyRegB) \
+  V(0x81, INT_TO_LONG, "int-to-long", k12x, true, kNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegB) \
+  V(0x82, INT_TO_FLOAT, "int-to-float", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x83, INT_TO_DOUBLE, "int-to-double", k12x, true, kNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegB) \
+  V(0x84, LONG_TO_INT, "long-to-int", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegBWide) \
+  V(0x85, LONG_TO_FLOAT, "long-to-float", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegBWide) \
+  V(0x86, LONG_TO_DOUBLE, "long-to-double", k12x, true, kNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x87, FLOAT_TO_INT, "float-to-int", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x88, FLOAT_TO_LONG, "float-to-long", k12x, true, kNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegB) \
+  V(0x89, FLOAT_TO_DOUBLE, "float-to-double", k12x, true, kNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegB) \
+  V(0x8A, DOUBLE_TO_INT, "double-to-int", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegBWide) \
+  V(0x8B, DOUBLE_TO_LONG, "double-to-long", k12x, true, kNone, kContinue | kCast, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0x8C, DOUBLE_TO_FLOAT, "double-to-float", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegBWide) \
+  V(0x8D, INT_TO_BYTE, "int-to-byte", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x8E, INT_TO_CHAR, "int-to-char", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x8F, INT_TO_SHORT, "int-to-short", k12x, true, kNone, kContinue | kCast, kVerifyRegA | kVerifyRegB) \
+  V(0x90, ADD_INT, "add-int", k23x, true, kNone, kContinue | kAdd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x91, SUB_INT, "sub-int", k23x, true, kNone, kContinue | kSubtract, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x92, MUL_INT, "mul-int", k23x, true, kNone, kContinue | kMultiply, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x93, DIV_INT, "div-int", k23x, true, kNone, kContinue | kThrow | kDivide, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x94, REM_INT, "rem-int", k23x, true, kNone, kContinue | kThrow | kRemainder, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x95, AND_INT, "and-int", k23x, true, kNone, kContinue | kAnd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x96, OR_INT, "or-int", k23x, true, kNone, kContinue | kOr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x97, XOR_INT, "xor-int", k23x, true, kNone, kContinue | kXor, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x98, SHL_INT, "shl-int", k23x, true, kNone, kContinue | kShl, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x99, SHR_INT, "shr-int", k23x, true, kNone, kContinue | kShr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x9A, USHR_INT, "ushr-int", k23x, true, kNone, kContinue | kUshr, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0x9B, ADD_LONG, "add-long", k23x, true, kNone, kContinue | kAdd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x9C, SUB_LONG, "sub-long", k23x, true, kNone, kContinue | kSubtract, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x9D, MUL_LONG, "mul-long", k23x, true, kNone, kContinue | kMultiply, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x9E, DIV_LONG, "div-long", k23x, true, kNone, kContinue | kThrow | kDivide, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0x9F, REM_LONG, "rem-long", k23x, true, kNone, kContinue | kThrow | kRemainder, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xA0, AND_LONG, "and-long", k23x, true, kNone, kContinue | kAnd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xA1, OR_LONG, "or-long", k23x, true, kNone, kContinue | kOr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xA2, XOR_LONG, "xor-long", k23x, true, kNone, kContinue | kXor, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xA3, SHL_LONG, "shl-long", k23x, true, kNone, kContinue | kShl, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
+  V(0xA4, SHR_LONG, "shr-long", k23x, true, kNone, kContinue | kShr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
+  V(0xA5, USHR_LONG, "ushr-long", k23x, true, kNone, kContinue | kUshr, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegC) \
+  V(0xA6, ADD_FLOAT, "add-float", k23x, true, kNone, kContinue | kAdd, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xA7, SUB_FLOAT, "sub-float", k23x, true, kNone, kContinue | kSubtract, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xA8, MUL_FLOAT, "mul-float", k23x, true, kNone, kContinue | kMultiply, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xA9, DIV_FLOAT, "div-float", k23x, true, kNone, kContinue | kDivide, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xAA, REM_FLOAT, "rem-float", k23x, true, kNone, kContinue | kRemainder, kVerifyRegA | kVerifyRegB | kVerifyRegC) \
+  V(0xAB, ADD_DOUBLE, "add-double", k23x, true, kNone, kContinue | kAdd, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xAC, SUB_DOUBLE, "sub-double", k23x, true, kNone, kContinue | kSubtract, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xAD, MUL_DOUBLE, "mul-double", k23x, true, kNone, kContinue | kMultiply, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xAE, DIV_DOUBLE, "div-double", k23x, true, kNone, kContinue | kDivide, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xAF, REM_DOUBLE, "rem-double", k23x, true, kNone, kContinue | kRemainder, kVerifyRegAWide | kVerifyRegBWide | kVerifyRegCWide) \
+  V(0xB0, ADD_INT_2ADDR, "add-int/2addr", k12x, true, kNone, kContinue | kAdd, kVerifyRegA | kVerifyRegB) \
+  V(0xB1, SUB_INT_2ADDR, "sub-int/2addr", k12x, true, kNone, kContinue | kSubtract, kVerifyRegA | kVerifyRegB) \
+  V(0xB2, MUL_INT_2ADDR, "mul-int/2addr", k12x, true, kNone, kContinue | kMultiply, kVerifyRegA | kVerifyRegB) \
+  V(0xB3, DIV_INT_2ADDR, "div-int/2addr", k12x, true, kNone, kContinue | kThrow | kDivide, kVerifyRegA | kVerifyRegB) \
+  V(0xB4, REM_INT_2ADDR, "rem-int/2addr", k12x, true, kNone, kContinue | kThrow | kRemainder, kVerifyRegA | kVerifyRegB) \
+  V(0xB5, AND_INT_2ADDR, "and-int/2addr", k12x, true, kNone, kContinue | kAnd, kVerifyRegA | kVerifyRegB) \
+  V(0xB6, OR_INT_2ADDR, "or-int/2addr", k12x, true, kNone, kContinue | kOr, kVerifyRegA | kVerifyRegB) \
+  V(0xB7, XOR_INT_2ADDR, "xor-int/2addr", k12x, true, kNone, kContinue | kXor, kVerifyRegA | kVerifyRegB) \
+  V(0xB8, SHL_INT_2ADDR, "shl-int/2addr", k12x, true, kNone, kContinue | kShl, kVerifyRegA | kVerifyRegB) \
+  V(0xB9, SHR_INT_2ADDR, "shr-int/2addr", k12x, true, kNone, kContinue | kShr, kVerifyRegA | kVerifyRegB) \
+  V(0xBA, USHR_INT_2ADDR, "ushr-int/2addr", k12x, true, kNone, kContinue | kUshr, kVerifyRegA | kVerifyRegB) \
+  V(0xBB, ADD_LONG_2ADDR, "add-long/2addr", k12x, true, kNone, kContinue | kAdd, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xBC, SUB_LONG_2ADDR, "sub-long/2addr", k12x, true, kNone, kContinue | kSubtract, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xBD, MUL_LONG_2ADDR, "mul-long/2addr", k12x, true, kNone, kContinue | kMultiply, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xBE, DIV_LONG_2ADDR, "div-long/2addr", k12x, true, kNone, kContinue | kThrow | kDivide, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xBF, REM_LONG_2ADDR, "rem-long/2addr", k12x, true, kNone, kContinue | kThrow | kRemainder, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xC0, AND_LONG_2ADDR, "and-long/2addr", k12x, true, kNone, kContinue | kAnd, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xC1, OR_LONG_2ADDR, "or-long/2addr", k12x, true, kNone, kContinue | kOr, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xC2, XOR_LONG_2ADDR, "xor-long/2addr", k12x, true, kNone, kContinue | kXor, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xC3, SHL_LONG_2ADDR, "shl-long/2addr", k12x, true, kNone, kContinue | kShl, kVerifyRegAWide | kVerifyRegB) \
+  V(0xC4, SHR_LONG_2ADDR, "shr-long/2addr", k12x, true, kNone, kContinue | kShr, kVerifyRegAWide | kVerifyRegB) \
+  V(0xC5, USHR_LONG_2ADDR, "ushr-long/2addr", k12x, true, kNone, kContinue | kUshr, kVerifyRegAWide | kVerifyRegB) \
+  V(0xC6, ADD_FLOAT_2ADDR, "add-float/2addr", k12x, true, kNone, kContinue | kAdd, kVerifyRegA | kVerifyRegB) \
+  V(0xC7, SUB_FLOAT_2ADDR, "sub-float/2addr", k12x, true, kNone, kContinue | kSubtract, kVerifyRegA | kVerifyRegB) \
+  V(0xC8, MUL_FLOAT_2ADDR, "mul-float/2addr", k12x, true, kNone, kContinue | kMultiply, kVerifyRegA | kVerifyRegB) \
+  V(0xC9, DIV_FLOAT_2ADDR, "div-float/2addr", k12x, true, kNone, kContinue | kDivide, kVerifyRegA | kVerifyRegB) \
+  V(0xCA, REM_FLOAT_2ADDR, "rem-float/2addr", k12x, true, kNone, kContinue | kRemainder, kVerifyRegA | kVerifyRegB) \
+  V(0xCB, ADD_DOUBLE_2ADDR, "add-double/2addr", k12x, true, kNone, kContinue | kAdd, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xCC, SUB_DOUBLE_2ADDR, "sub-double/2addr", k12x, true, kNone, kContinue | kSubtract, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xCD, MUL_DOUBLE_2ADDR, "mul-double/2addr", k12x, true, kNone, kContinue | kMultiply, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xCE, DIV_DOUBLE_2ADDR, "div-double/2addr", k12x, true, kNone, kContinue | kDivide, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xCF, REM_DOUBLE_2ADDR, "rem-double/2addr", k12x, true, kNone, kContinue | kRemainder, kVerifyRegAWide | kVerifyRegBWide) \
+  V(0xD0, ADD_INT_LIT16, "add-int/lit16", k22s, true, kNone, kContinue | kAdd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD1, RSUB_INT, "rsub-int", k22s, true, kNone, kContinue | kSubtract | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD2, MUL_INT_LIT16, "mul-int/lit16", k22s, true, kNone, kContinue | kMultiply | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD3, DIV_INT_LIT16, "div-int/lit16", k22s, true, kNone, kContinue | kThrow | kDivide | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD4, REM_INT_LIT16, "rem-int/lit16", k22s, true, kNone, kContinue | kThrow | kRemainder | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD5, AND_INT_LIT16, "and-int/lit16", k22s, true, kNone, kContinue | kAnd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD6, OR_INT_LIT16, "or-int/lit16", k22s, true, kNone, kContinue | kOr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD7, XOR_INT_LIT16, "xor-int/lit16", k22s, true, kNone, kContinue | kXor | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD8, ADD_INT_LIT8, "add-int/lit8", k22b, true, kNone, kContinue | kAdd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xD9, RSUB_INT_LIT8, "rsub-int/lit8", k22b, true, kNone, kContinue | kSubtract | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDA, MUL_INT_LIT8, "mul-int/lit8", k22b, true, kNone, kContinue | kMultiply | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDB, DIV_INT_LIT8, "div-int/lit8", k22b, true, kNone, kContinue | kThrow | kDivide | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDC, REM_INT_LIT8, "rem-int/lit8", k22b, true, kNone, kContinue | kThrow | kRemainder | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDD, AND_INT_LIT8, "and-int/lit8", k22b, true, kNone, kContinue | kAnd | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDE, OR_INT_LIT8, "or-int/lit8", k22b, true, kNone, kContinue | kOr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xDF, XOR_INT_LIT8, "xor-int/lit8", k22b, true, kNone, kContinue | kXor | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE0, SHL_INT_LIT8, "shl-int/lit8", k22b, true, kNone, kContinue | kShl | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE1, SHR_INT_LIT8, "shr-int/lit8", k22b, true, kNone, kContinue | kShr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE2, USHR_INT_LIT8, "ushr-int/lit8", k22b, true, kNone, kContinue | kUshr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE3, IGET_QUICK, "iget-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE4, IGET_WIDE_QUICK, "iget-wide-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB) \
+  V(0xE5, IGET_OBJECT_QUICK, "iget-object-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE6, IPUT_QUICK, "iput-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
+  V(0xE7, IPUT_WIDE_QUICK, "iput-wide-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB) \
+  V(0xE8, IPUT_OBJECT_QUICK, "iput-object-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
   V(0xE9, INVOKE_VIRTUAL_QUICK, "invoke-virtual-quick", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArg) \
   V(0xEA, INVOKE_VIRTUAL_RANGE_QUICK, "invoke-virtual/range-quick", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArgRange) \
   V(0xEB, UNUSED_EB, "unused-eb", k10x, false, kUnknown, 0, kVerifyError) \
diff --git a/runtime/entrypoints/quick/quick_entrypoints.h b/runtime/entrypoints/quick/quick_entrypoints.h
index ec69e28..7bd1582 100644
--- a/runtime/entrypoints/quick/quick_entrypoints.h
+++ b/runtime/entrypoints/quick/quick_entrypoints.h
@@ -98,7 +98,6 @@
   int32_t (*pCmplDouble)(double, double);
   int32_t (*pCmplFloat)(float, float);
   double (*pFmod)(double, double);
-  double (*pSqrt)(double);
   double (*pL2d)(int64_t);
   float (*pFmodf)(float, float);
   float (*pL2f)(int64_t);
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 2c920de..554bff4 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -1314,7 +1314,7 @@
                       &cur_stack_arg_, &cur_gpr_reg_, &cur_fpr_reg_, &code_return_,
                       &alloca_used_size_);
     handle_scope_number_of_references_ = 0;
-    cur_hs_entry_ = reinterpret_cast<StackReference<mirror::Object>*>(GetFirstHandleScopeEntry());
+    cur_hs_entry_ = GetFirstHandleScopeEntry();
 
     // jni environment is always first argument
     sm_.AdvancePointer(self->GetJniEnv());
@@ -1328,7 +1328,13 @@
 
   void FinalizeHandleScope(Thread* self) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  jobject GetFirstHandleScopeEntry() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  StackReference<mirror::Object>* GetFirstHandleScopeEntry()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return handle_scope_->GetHandle(0).GetReference();
+  }
+
+  jobject GetFirstHandleScopeJObject()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return handle_scope_->GetHandle(0).ToJObject();
   }
 
@@ -1503,7 +1509,7 @@
   // Start JNI, save the cookie.
   uint32_t cookie;
   if (called->IsSynchronized()) {
-    cookie = JniMethodStartSynchronized(visitor.GetFirstHandleScopeEntry(), self);
+    cookie = JniMethodStartSynchronized(visitor.GetFirstHandleScopeJObject(), self);
     if (self->IsExceptionPending()) {
       self->PopHandleScope();
       // A negative value denotes an error.
@@ -1531,7 +1537,7 @@
       DCHECK(self->IsExceptionPending());    // There should be an exception pending now.
 
       // End JNI, as the assembly will move to deliver the exception.
-      jobject lock = called->IsSynchronized() ? visitor.GetFirstHandleScopeEntry() : nullptr;
+      jobject lock = called->IsSynchronized() ? visitor.GetFirstHandleScopeJObject() : nullptr;
       if (mh.GetShorty()[0] == 'L') {
         artQuickGenericJniEndJNIRef(self, cookie, nullptr, lock);
       } else {
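The visitor now exposes the first handle-scope slot in two forms; the sketch below (not part of the patch) shows the intended split, with both calls referring to the same slot:

    // Raw stack reference, used when the trampoline needs the slot itself.
    StackReference<mirror::Object>* slot = visitor.GetFirstHandleScopeEntry();
    // Indirect reference (jobject), what JniMethodStartSynchronized() expects as the lock object.
    jobject lock = visitor.GetFirstHandleScopeJObject();
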
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index 979970c..bd04473 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -35,8 +35,8 @@
 class AtomicStack {
  public:
   // Capacity is how many elements we can store in the stack.
-  static AtomicStack* Create(const std::string& name, size_t capacity) {
-    std::unique_ptr<AtomicStack> mark_stack(new AtomicStack(name, capacity));
+  static AtomicStack* Create(const std::string& name, size_t growth_limit, size_t capacity) {
+    std::unique_ptr<AtomicStack> mark_stack(new AtomicStack(name, growth_limit, capacity));
     mark_stack->Init();
     return mark_stack.release();
   }
@@ -44,7 +44,7 @@
   ~AtomicStack() {}
 
   void Reset() {
-    DCHECK(mem_map_.get() != NULL);
+    DCHECK(mem_map_.get() != nullptr);
     DCHECK(begin_ != NULL);
     front_index_.StoreRelaxed(0);
     back_index_.StoreRelaxed(0);
@@ -58,20 +58,13 @@
   // Beware: Mixing atomic pushes and atomic pops will cause ABA problem.
 
   // Returns false if we overflowed the stack.
+  bool AtomicPushBackIgnoreGrowthLimit(const T& value) {
+    return AtomicPushBackInternal(value, capacity_);
+  }
+
+  // Returns false if we overflowed the stack.
   bool AtomicPushBack(const T& value) {
-    if (kIsDebugBuild) {
-      debug_is_sorted_ = false;
-    }
-    int32_t index;
-    do {
-      index = back_index_.LoadRelaxed();
-      if (UNLIKELY(static_cast<size_t>(index) >= capacity_)) {
-        // Stack overflow.
-        return false;
-      }
-    } while (!back_index_.CompareExchangeWeakRelaxed(index, index + 1));
-    begin_[index] = value;
-    return true;
+    return AtomicPushBackInternal(value, growth_limit_);
   }
 
   // Atomically bump the back index by the given number of
@@ -85,7 +78,7 @@
     do {
       index = back_index_.LoadRelaxed();
       new_index = index + num_slots;
-      if (UNLIKELY(static_cast<size_t>(new_index) >= capacity_)) {
+      if (UNLIKELY(static_cast<size_t>(new_index) >= growth_limit_)) {
         // Stack overflow.
         return false;
       }
@@ -115,7 +108,7 @@
       debug_is_sorted_ = false;
     }
     int32_t index = back_index_.LoadRelaxed();
-    DCHECK_LT(static_cast<size_t>(index), capacity_);
+    DCHECK_LT(static_cast<size_t>(index), growth_limit_);
     back_index_.StoreRelaxed(index + 1);
     begin_[index] = value;
   }
@@ -165,6 +158,7 @@
   // Will clear the stack.
   void Resize(size_t new_capacity) {
     capacity_ = new_capacity;
+    growth_limit_ = new_capacity;
     Init();
   }
 
@@ -189,15 +183,33 @@
   }
 
  private:
-  AtomicStack(const std::string& name, const size_t capacity)
+  AtomicStack(const std::string& name, size_t growth_limit, size_t capacity)
       : name_(name),
         back_index_(0),
         front_index_(0),
-        begin_(NULL),
+        begin_(nullptr),
+        growth_limit_(growth_limit),
         capacity_(capacity),
         debug_is_sorted_(true) {
   }
 
+  // Returns false if we overflowed the stack.
+  bool AtomicPushBackInternal(const T& value, size_t limit) ALWAYS_INLINE {
+    if (kIsDebugBuild) {
+      debug_is_sorted_ = false;
+    }
+    int32_t index;
+    do {
+      index = back_index_.LoadRelaxed();
+      if (UNLIKELY(static_cast<size_t>(index) >= limit)) {
+        // Stack overflow.
+        return false;
+      }
+    } while (!back_index_.CompareExchangeWeakRelaxed(index, index + 1));
+    begin_[index] = value;
+    return true;
+  }
+
   // Size in number of elements.
   void Init() {
     std::string error_msg;
@@ -213,22 +225,18 @@
 
   // Name of the mark stack.
   std::string name_;
-
   // Memory mapping of the atomic stack.
   std::unique_ptr<MemMap> mem_map_;
-
   // Back index (index after the last element pushed).
   AtomicInteger back_index_;
-
   // Front index, used for implementing PopFront.
   AtomicInteger front_index_;
-
   // Base of the atomic stack.
   T* begin_;
-
+  // Current maximum which we can push back to; must be <= capacity_.
+  size_t growth_limit_;
   // Maximum number of elements.
   size_t capacity_;
-
   // Whether or not the stack is sorted, only updated in debug mode to avoid performance overhead.
   bool debug_is_sorted_;
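With growth_limit_ split from capacity_, AtomicPushBack stops at the soft limit while AtomicPushBackIgnoreGrowthLimit may spill into the reserve up to the hard capacity. A usage sketch; the sizes and fallback policy here are illustrative, not taken from this patch:

    // growth_limit is the soft bound for ordinary pushes; capacity adds a small
    // reserve that only the IgnoreGrowthLimit path may consume.
    const size_t kGrowthLimit = 1024;
    const size_t kCapacity = kGrowthLimit + 64;
    std::unique_ptr<accounting::ObjectStack> stack(
        accounting::ObjectStack::Create("example stack", kGrowthLimit, kCapacity));
    if (!stack->AtomicPushBack(obj)) {
      // Soft limit reached: dip into the reserve, e.g. until a GC drains the stack.
      CHECK(stack->AtomicPushBackIgnoreGrowthLimit(obj));
    }
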
 
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 03b72b6..58ba61b 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -137,33 +137,11 @@
 
 inline void Heap::PushOnAllocationStack(Thread* self, mirror::Object** obj) {
   if (kUseThreadLocalAllocationStack) {
-    bool success = self->PushOnThreadLocalAllocationStack(*obj);
-    if (UNLIKELY(!success)) {
-      // Slow path. Allocate a new thread-local allocation stack.
-      mirror::Object** start_address;
-      mirror::Object** end_address;
-      while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize,
-                                                &start_address, &end_address)) {
-        // TODO: Add handle VerifyObject.
-        StackHandleScope<1> hs(self);
-        HandleWrapper<mirror::Object> wrapper(hs.NewHandleWrapper(obj));
-        CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
-      }
-      self->SetThreadLocalAllocationStack(start_address, end_address);
-      // Retry on the new thread-local allocation stack.
-      success = self->PushOnThreadLocalAllocationStack(*obj);
-      // Must succeed.
-      CHECK(success);
+    if (UNLIKELY(!self->PushOnThreadLocalAllocationStack(*obj))) {
+      PushOnThreadLocalAllocationStackWithInternalGC(self, obj);
     }
-  } else {
-    // This is safe to do since the GC will never free objects which are neither in the allocation
-    // stack or the live bitmap.
-    while (!allocation_stack_->AtomicPushBack(*obj)) {
-      // TODO: Add handle VerifyObject.
-      StackHandleScope<1> hs(self);
-      HandleWrapper<mirror::Object> wrapper(hs.NewHandleWrapper(obj));
-      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
-    }
+  } else if (UNLIKELY(!allocation_stack_->AtomicPushBack(*obj))) {
+    PushOnAllocationStackWithInternalGC(self, obj);
   }
 }
 
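
The rewritten PushOnAllocationStack keeps only the cheap push attempt in the inline fast path and moves the rare overflow handling into an out-of-line helper. A hedged, generic sketch of that split (hypothetical names, not the heap's actual types):

```cpp
#include <cstddef>
#include <vector>

// Hypothetical sketch: the common case stays small enough to inline, while the
// rare recovery work (a GC, in the heap code) lives out of line.
class EventBuffer {
 public:
  explicit EventBuffer(size_t capacity) : slots_(capacity) {}

  void Push(int value) {
    if (__builtin_expect(count_ == slots_.size(), 0)) {
      PushSlowPath(value);       // Rare: make room, then retry.
      return;
    }
    slots_[count_++] = value;    // Common: one bounds check plus a store.
  }

 private:
  __attribute__((noinline)) void PushSlowPath(int value) {
    count_ = 0;                  // Stand-in for the expensive recovery step.
    slots_[count_++] = value;
  }

  std::vector<int> slots_;
  size_t count_ = 0;
};
```
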
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index ea1ccdd..a6093ca 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -84,9 +84,14 @@
 static constexpr double kStickyGcThroughputAdjustment = 1.0;
 // Whether or not we use the free list large object space.
 static constexpr bool kUseFreeListSpaceForLOS = false;
-// Whtehr or not we compact the zygote in PreZygoteFork.
+// Whether or not we compact the zygote in PreZygoteFork.
 static constexpr bool kCompactZygote = kMovingCollector;
 static constexpr size_t kNonMovingSpaceCapacity = 64 * MB;
+// How many reserve entries are at the end of the allocation stack; these are only needed if the
+// allocation stack overflows.
+static constexpr size_t kAllocationStackReserveSize = 1024;
+// Default mark stack size in bytes.
+static const size_t kDefaultMarkStackSize = 64 * KB;
 
 Heap::Heap(size_t initial_size, size_t growth_limit, size_t min_free, size_t max_free,
            double target_utilization, double foreground_heap_growth_multiplier, size_t capacity,
@@ -295,13 +300,13 @@
   // TODO: Count objects in the image space here.
   num_bytes_allocated_.StoreRelaxed(0);
 
-  // Default mark stack size in bytes.
-  static const size_t default_mark_stack_size = 64 * KB;
-  mark_stack_.reset(accounting::ObjectStack::Create("mark stack", default_mark_stack_size));
-  allocation_stack_.reset(accounting::ObjectStack::Create("allocation stack",
-                                                          max_allocation_stack_size_));
-  live_stack_.reset(accounting::ObjectStack::Create("live stack",
-                                                    max_allocation_stack_size_));
+  mark_stack_.reset(accounting::ObjectStack::Create("mark stack", kDefaultMarkStackSize,
+                                                    kDefaultMarkStackSize));
+  const size_t alloc_stack_capacity = max_allocation_stack_size_ + kAllocationStackReserveSize;
+  allocation_stack_.reset(accounting::ObjectStack::Create(
+      "allocation stack", max_allocation_stack_size_, alloc_stack_capacity));
+  live_stack_.reset(accounting::ObjectStack::Create(
+      "live stack", max_allocation_stack_size_, alloc_stack_capacity));
 
   // It's still too early to take a lock because there are no threads yet, but we can create locks
   // now. We don't create it earlier to make it clear that you can't use locks during heap
@@ -893,10 +898,16 @@
   uint64_t gc_heap_end_ns = NanoTime();
   // We never move things in the native heap, so we can finish the GC at this point.
   FinishGC(self, collector::kGcTypeNone);
+  size_t native_reclaimed = 0;
+#if defined(USE_DLMALLOC)
   // Trim the native heap.
   dlmalloc_trim(0);
-  size_t native_reclaimed = 0;
   dlmalloc_inspect_all(DlmallocMadviseCallback, &native_reclaimed);
+#elif defined(USE_JEMALLOC)
+  // Jemalloc does its own internal trimming.
+#else
+  UNIMPLEMENTED(WARNING) << "Add trimming support";
+#endif
   uint64_t end_ns = NanoTime();
   VLOG(heap) << "Heap trim of managed (duration=" << PrettyDuration(gc_heap_end_ns - start_ns)
       << ", advised=" << PrettySize(managed_reclaimed) << ") and native (duration="
@@ -2029,6 +2040,43 @@
   const bool verify_referent_;
 };
 
+void Heap::PushOnAllocationStackWithInternalGC(Thread* self, mirror::Object** obj) {
+  // Slow path: the allocation stack push back must have already failed.
+  DCHECK(!allocation_stack_->AtomicPushBack(*obj));
+  do {
+    // TODO: Add handle VerifyObject.
+    StackHandleScope<1> hs(self);
+    HandleWrapper<mirror::Object> wrapper(hs.NewHandleWrapper(obj));
+    // Push our object into the reserve region of the allocation stack. This is only required due
+    // to heap verification requiring that roots are live (either in the live bitmap or in the
+    // allocation stack).
+    CHECK(allocation_stack_->AtomicPushBackIgnoreGrowthLimit(*obj));
+    CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+  } while (!allocation_stack_->AtomicPushBack(*obj));
+}
+
+void Heap::PushOnThreadLocalAllocationStackWithInternalGC(Thread* self, mirror::Object** obj) {
+  // Slow path: the allocation stack push back must have already failed.
+  DCHECK(!self->PushOnThreadLocalAllocationStack(*obj));
+  mirror::Object** start_address;
+  mirror::Object** end_address;
+  while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize, &start_address,
+                                            &end_address)) {
+    // TODO: Add handle VerifyObject.
+    StackHandleScope<1> hs(self);
+    HandleWrapper<mirror::Object> wrapper(hs.NewHandleWrapper(obj));
+    // Push our object into the reserve region of the allocation stack. This is only required due
+    // to heap verification requiring that roots are live (either in the live bitmap or in the
+    // allocation stack).
+    CHECK(allocation_stack_->AtomicPushBackIgnoreGrowthLimit(*obj));
+    // Now collect garbage, which should make room in the regular allocation stack region.
+    CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+  }
+  self->SetThreadLocalAllocationStack(start_address, end_address);
+  // Retry on the new thread-local allocation stack.
+  CHECK(self->PushOnThreadLocalAllocationStack(*obj));  // Must succeed.
+}
+
 // Must do this with mutators suspended since we are directly accessing the allocation stacks.
 size_t Heap::VerifyHeapReferences(bool verify_referents) {
   Thread* self = Thread::Current();
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 887b17e..e11671b 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -698,6 +698,10 @@
   // Push an object onto the allocation stack.
   void PushOnAllocationStack(Thread* self, mirror::Object** obj)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void PushOnAllocationStackWithInternalGC(Thread* self, mirror::Object** obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+  void PushOnThreadLocalAllocationStackWithInternalGC(Thread* thread, mirror::Object** obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // What kind of concurrency behavior is the runtime after? Currently true for concurrent mark
   // sweep GC, false for other GC types.
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 45fee14..3d35c00 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -53,8 +53,7 @@
 
   std::vector<std::string> arg_vector;
 
-  std::string dex2oat(GetAndroidRoot());
-  dex2oat += (kIsDebugBuild ? "/bin/dex2oatd" : "/bin/dex2oat");
+  std::string dex2oat(Runtime::Current()->GetCompilerExecutable());
   arg_vector.push_back(dex2oat);
 
   std::string image_option_string("--image=");
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index e63cc39..54a63f0 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -141,8 +141,10 @@
 size_t LargeObjectMapSpace::Free(Thread* self, mirror::Object* ptr) {
   MutexLock mu(self, lock_);
   MemMaps::iterator found = mem_maps_.find(ptr);
-  CHECK(found != mem_maps_.end()) << "Attempted to free large object" << ptr
-      << "which was not live";
+  if (UNLIKELY(found == mem_maps_.end())) {
+    Runtime::Current()->GetHeap()->DumpSpaces(LOG(ERROR));
+    LOG(FATAL) << "Attempted to free large object " << ptr << " which was not live";
+  }
   DCHECK_GE(num_bytes_allocated_, found->second->Size());
   size_t allocation_size = found->second->Size();
   num_bytes_allocated_ -= allocation_size;
diff --git a/runtime/handle.h b/runtime/handle.h
index c4e9285..b70f651 100644
--- a/runtime/handle.h
+++ b/runtime/handle.h
@@ -73,7 +73,12 @@
       : reference_(reinterpret_cast<StackReference<T>*>(handle.reference_)) {
   }
 
+  StackReference<T>* GetReference() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) ALWAYS_INLINE {
+    return reference_;
+  }
+
  private:
+  friend class BuildGenericJniFrameVisitor;
   template<class S> friend class Handle;
   friend class HandleScope;
   template<class S> friend class HandleWrapper;
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 2dbcc80..194cb18 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -180,13 +180,6 @@
 
     virtual bool VisitFrame() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
       mirror::ArtMethod* m = GetMethod();
-      if (GetCurrentQuickFrame() == NULL) {
-        if (kVerboseInstrumentation) {
-          LOG(INFO) << "  Ignoring a shadow frame. Frame " << GetFrameId()
-                    << " Method=" << PrettyMethod(m);
-        }
-        return true;  // Ignore shadow frames.
-      }
       if (m == NULL) {
         if (kVerboseInstrumentation) {
           LOG(INFO) << "  Skipping upcall. Frame " << GetFrameId();
@@ -204,6 +197,14 @@
       if (kVerboseInstrumentation) {
         LOG(INFO) << "  Installing exit stub in " << DescribeLocation();
       }
+      if (GetCurrentQuickFrame() == NULL) {
+        InstrumentationStackFrame instrumentation_frame(GetThisObject(), m, 0, GetFrameId(), false);
+        if (kVerboseInstrumentation) {
+          LOG(INFO) << "Pushing shadow frame " << instrumentation_frame.Dump();
+        }
+        shadow_stack_.push_back(instrumentation_frame);
+        return true;  // Continue.
+      }
       uintptr_t return_pc = GetReturnPc();
       if (return_pc == instrumentation_exit_pc_) {
         // We've reached a frame which has already been installed with instrumentation exit stub.
@@ -238,6 +239,7 @@
       return true;  // Continue.
     }
     std::deque<InstrumentationStackFrame>* const instrumentation_stack_;
+    std::vector<InstrumentationStackFrame> shadow_stack_;
     const size_t existing_instrumentation_frames_count_;
     std::vector<uint32_t> dex_pcs_;
     const uintptr_t instrumentation_exit_pc_;
@@ -261,14 +263,16 @@
   if (instrumentation->ShouldNotifyMethodEnterExitEvents()) {
     // Create method enter events for all methods currently on the thread's stack. We only do this
     // if no debugger is attached to prevent from posting events twice.
-    typedef std::deque<InstrumentationStackFrame>::const_reverse_iterator It;
-    for (It it = thread->GetInstrumentationStack()->rbegin(),
-        end = thread->GetInstrumentationStack()->rend(); it != end; ++it) {
-      mirror::Object* this_object = (*it).this_object_;
-      mirror::ArtMethod* method = (*it).method_;
+    auto ssi = visitor.shadow_stack_.rbegin();
+    for (auto isi = thread->GetInstrumentationStack()->rbegin(),
+        end = thread->GetInstrumentationStack()->rend(); isi != end; ++isi) {
+      while (ssi != visitor.shadow_stack_.rend() && (*ssi).frame_id_ < (*isi).frame_id_) {
+        instrumentation->MethodEnterEvent(thread, (*ssi).this_object_, (*ssi).method_, 0);
+        ++ssi;
+      }
       uint32_t dex_pc = visitor.dex_pcs_.back();
       visitor.dex_pcs_.pop_back();
-      instrumentation->MethodEnterEvent(thread, this_object, method, dex_pc);
+      instrumentation->MethodEnterEvent(thread, (*isi).this_object_, (*isi).method_, dex_pc);
     }
   }
   thread->VerifyStack();
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index 817d104..339eb36 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -84,7 +84,8 @@
 
 mirror::String* InternTable::Lookup(Table& table, mirror::String* s, int32_t hash_code) {
   Locks::intern_table_lock_->AssertHeld(Thread::Current());
-  for (auto it = table.find(hash_code), end = table.end(); it != end; ++it) {
+  for (auto it = table.lower_bound(hash_code), end = table.end();
+       it != end && it->first == hash_code; ++it) {
     mirror::String* existing_string = it->second;
     if (existing_string->Equals(s)) {
       return existing_string;
@@ -123,7 +124,8 @@
 }
 
 void InternTable::Remove(Table& table, mirror::String* s, int32_t hash_code) {
-  for (auto it = table.find(hash_code), end = table.end(); it != end; ++it) {
+  for (auto it = table.lower_bound(hash_code), end = table.end();
+       it != end && it->first == hash_code; ++it) {
     if (it->second == s) {
       table.erase(it);
       return;
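
The intern table fix above matters because iterating from multimap::find(hash_code) to table.end() also walks entries stored under other hash codes; bounding the loop with lower_bound plus a key check (or equal_range) restricts it to the matching bucket. A standalone illustration, unrelated to ART's Table type:

```cpp
#include <cassert>
#include <iterator>
#include <map>
#include <string>

int main() {
  std::multimap<int, std::string> table;
  table.insert({42, "a"});
  table.insert({42, "b"});
  table.insert({99, "c"});  // Different key; must not be visited.

  // Visit exactly the entries whose key is 42, as the patched loops do.
  int visited = 0;
  for (auto it = table.lower_bound(42), end = table.end();
       it != end && it->first == 42; ++it) {
    ++visited;
  }
  assert(visited == 2);

  // equal_range expresses the same bound more directly.
  auto range = table.equal_range(42);
  assert(std::distance(range.first, range.second) == 2);
  return 0;
}
```
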
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index cc38c59..58e6dd4 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -226,9 +226,9 @@
     // Do this before releasing the lock so that we don't get deflated.
     ++num_waiters_;
     monitor_lock_.Unlock(self);  // Let go of locks in order.
+    self->SetMonitorEnterObject(GetObject());
     {
       ScopedThreadStateChange tsc(self, kBlocked);  // Change to blocked and give up mutator_lock_.
-      self->SetMonitorEnterObject(GetObject());
       MutexLock mu2(self, monitor_lock_);  // Reacquire monitor_lock_ without mutator_lock_ for Wait.
       if (owner_ != NULL) {  // Did the owner_ give the lock up?
         monitor_contenders_.Wait(self);  // Still contended so wait.
@@ -249,8 +249,8 @@
           }
         }
       }
-      self->SetMonitorEnterObject(nullptr);
     }
+    self->SetMonitorEnterObject(nullptr);
     monitor_lock_.Lock(self);  // Reacquire locks in order.
     --num_waiters_;
   }
@@ -983,7 +983,7 @@
   // Ask the verifier for the dex pcs of all the monitor-enter instructions corresponding to
   // the locks held in this stack frame.
   std::vector<uint32_t> monitor_enter_dex_pcs;
-  verifier::MethodVerifier::FindLocksAtDexPc(m, stack_visitor->GetDexPc(), monitor_enter_dex_pcs);
+  verifier::MethodVerifier::FindLocksAtDexPc(m, stack_visitor->GetDexPc(), &monitor_enter_dex_pcs);
   if (monitor_enter_dex_pcs.empty()) {
     return;
   }
diff --git a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
index 5d90f1a..e17e60a 100644
--- a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
+++ b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
@@ -52,9 +52,15 @@
     jobject internal_trace = self->CreateInternalStackTrace<false>(soa);
     trace = Thread::InternalStackTraceToStackTraceElementArray(soa, internal_trace);
   } else {
-    // Suspend thread to build stack trace.
     ThreadList* thread_list = Runtime::Current()->GetThreadList();
     bool timed_out;
+
+    // Check for a valid thread id.
+    if (thin_lock_id == ThreadList::kInvalidThreadId) {
+      return nullptr;
+    }
+
+    // Suspend thread to build stack trace.
     Thread* thread = thread_list->SuspendThreadByThreadId(thin_lock_id, false, &timed_out);
     if (thread != nullptr) {
       {
diff --git a/runtime/oat.cc b/runtime/oat.cc
index 10d335e..9c14a4f 100644
--- a/runtime/oat.cc
+++ b/runtime/oat.cc
@@ -22,7 +22,7 @@
 namespace art {
 
 const uint8_t OatHeader::kOatMagic[] = { 'o', 'a', 't', '\n' };
-const uint8_t OatHeader::kOatVersion[] = { '0', '2', '9', '\0' };
+const uint8_t OatHeader::kOatVersion[] = { '0', '3', '0', '\0' };
 
 OatHeader::OatHeader() {
   memset(this, 0, sizeof(*this));
diff --git a/runtime/parsed_options.cc b/runtime/parsed_options.cc
index 4330d27..db2a61b 100644
--- a/runtime/parsed_options.cc
+++ b/runtime/parsed_options.cc
@@ -15,6 +15,7 @@
  */
 
 #include "parsed_options.h"
+#include "utils.h"
 #ifdef HAVE_ANDROID_OS
 #include "cutils/properties.h"
 #endif
@@ -604,6 +605,10 @@
           return false;
         }
       }
+    } else if (StartsWith(option, "-Xcompiler:")) {
+      if (!ParseStringAfterChar(option, ':', &compiler_executable_)) {
+        return false;
+      }
     } else if (option == "-Xcompiler-option") {
       i++;
       if (i == options.size()) {
@@ -790,7 +795,8 @@
   UsageMessage(stream, "  -Xprofile-period:integervalue\n");
   UsageMessage(stream, "  -Xprofile-duration:integervalue\n");
   UsageMessage(stream, "  -Xprofile-interval:integervalue\n");
-  UsageMessage(stream, "  -Xprofile-backoff:integervalue\n");
+  UsageMessage(stream, "  -Xprofile-backoff:doublevalue\n");
+  UsageMessage(stream, "  -Xcompiler:filename\n");
   UsageMessage(stream, "  -Xcompiler-option dex2oat-option\n");
   UsageMessage(stream, "  -Ximage-compiler-option dex2oat-option\n");
   UsageMessage(stream, "\n");
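
The new -Xcompiler flag is a prefix option split at the first ':' after the prefix. A small sketch of that parsing with simplified stand-ins for StartsWith and ParseStringAfterChar (these are not ART's utils.h implementations):

```cpp
#include <cassert>
#include <cstddef>
#include <string>

// Simplified stand-ins, for illustration only.
static bool StartsWith(const std::string& s, const char* prefix) {
  return s.compare(0, std::string(prefix).size(), prefix) == 0;
}

static bool ParseStringAfterChar(const std::string& option, char after_char, std::string* out) {
  size_t pos = option.find(after_char);
  if (pos == std::string::npos) {
    return false;
  }
  *out = option.substr(pos + 1);
  return true;
}

int main() {
  std::string compiler_executable;
  const std::string option = "-Xcompiler:/system/bin/dex2oatd";  // Hypothetical value.
  if (StartsWith(option, "-Xcompiler:")) {
    assert(ParseStringAfterChar(option, ':', &compiler_executable));
  }
  assert(compiler_executable == "/system/bin/dex2oatd");
  return 0;
}
```
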
diff --git a/runtime/parsed_options.h b/runtime/parsed_options.h
index e0b0fb5..25fc12a 100644
--- a/runtime/parsed_options.h
+++ b/runtime/parsed_options.h
@@ -74,6 +74,7 @@
   void (*hook_exit_)(jint status);
   void (*hook_abort_)();
   std::vector<std::string> properties_;
+  std::string compiler_executable_;
   std::vector<std::string> compiler_options_;
   std::vector<std::string> image_compiler_options_;
   bool profile_;
diff --git a/runtime/profiler.cc b/runtime/profiler.cc
index 6e33f9d..5459ce3 100644
--- a/runtime/profiler.cc
+++ b/runtime/profiler.cc
@@ -193,7 +193,7 @@
 
       valid_samples += barrier_count;
 
-      ThreadState old_state = self->SetState(kWaitingForCheckPointsToRun);
+      ScopedThreadStateChange tsc(self, kWaitingForCheckPointsToRun);
 
       // Wait for the barrier to be crossed by all runnable threads.  This wait
       // is done with a timeout so that we can detect problems with the checkpoint
@@ -211,13 +211,11 @@
       // code.  Crash the process in this case.
       CHECK_LT(waitdiff_us, kWaitTimeoutUs);
 
-      self->SetState(old_state);
-
       // Update the current time.
       now_us = MicroTime();
     }
 
-    if (valid_samples > 0 && !ShuttingDown(self)) {
+    if (valid_samples > 0) {
       // After the profile has been taken, write it out.
       ScopedObjectAccess soa(self);   // Acquire the mutator lock.
       uint32_t size = profiler->WriteProfile();
@@ -335,6 +333,7 @@
   pthread_t profiler_pthread = 0U;
   {
     MutexLock trace_mu(Thread::Current(), *Locks::profiler_lock_);
+    CHECK(!shutting_down_);
     profiler = profiler_;
     shutting_down_ = true;
     profiler_pthread = profiler_pthread_;
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index dcbf42d..23a49cb 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -164,6 +164,11 @@
     }
     shutting_down_ = true;
   }
+  // Shut down background profiler before the runtime exits.
+  if (profile_) {
+    BackgroundMethodSamplingProfiler::Shutdown();
+  }
+
   Trace::Shutdown();
 
   // Make sure to let the GC complete if it is running.
@@ -365,6 +370,15 @@
   return env->NewGlobalRef(system_class_loader.get());
 }
 
+std::string Runtime::GetCompilerExecutable() const {
+  if (!compiler_executable_.empty()) {
+    return compiler_executable_;
+  }
+  std::string compiler_executable(GetAndroidRoot());
+  compiler_executable += (kIsDebugBuild ? "/bin/dex2oatd" : "/bin/dex2oat");
+  return compiler_executable;
+}
+
 bool Runtime::Start() {
   VLOG(startup) << "Runtime::Start entering";
 
@@ -531,6 +545,7 @@
   default_stack_size_ = options->stack_size_;
   stack_trace_file_ = options->stack_trace_file_;
 
+  compiler_executable_ = options->compiler_executable_;
   compiler_options_ = options->compiler_options_;
   image_compiler_options_ = options->image_compiler_options_;
 
diff --git a/runtime/runtime.h b/runtime/runtime.h
index f7074f6..261429e 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -104,6 +104,8 @@
     return is_explicit_gc_disabled_;
   }
 
+  std::string GetCompilerExecutable() const;
+
   const std::vector<std::string>& GetCompilerOptions() const {
     return compiler_options_;
   }
@@ -482,6 +484,7 @@
   bool is_concurrent_gc_enabled_;
   bool is_explicit_gc_disabled_;
 
+  std::string compiler_executable_;
   std::vector<std::string> compiler_options_;
   std::vector<std::string> image_compiler_options_;
 
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 41cfc58..55bec1e 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1816,7 +1816,6 @@
   QUICK_ENTRY_POINT_INFO(pCmplDouble)
   QUICK_ENTRY_POINT_INFO(pCmplFloat)
   QUICK_ENTRY_POINT_INFO(pFmod)
-  QUICK_ENTRY_POINT_INFO(pSqrt)
   QUICK_ENTRY_POINT_INFO(pL2d)
   QUICK_ENTRY_POINT_INFO(pFmodf)
   QUICK_ENTRY_POINT_INFO(pL2f)
diff --git a/runtime/thread.h b/runtime/thread.h
index de054ee..9a7cb48 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -396,11 +396,11 @@
   // Convert a jobject into a Object*
   mirror::Object* DecodeJObject(jobject obj) const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  mirror::Object* GetMonitorEnterObject() const {
+  mirror::Object* GetMonitorEnterObject() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return tlsPtr_.monitor_enter_object;
   }
 
-  void SetMonitorEnterObject(mirror::Object* obj) {
+  void SetMonitorEnterObject(mirror::Object* obj) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     tlsPtr_.monitor_enter_object = obj;
   }
 
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 2293ad7..b5c07aa 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -166,7 +166,8 @@
                                                       it.GetMethodCodeItem(),
                                                       method,
                                                       it.GetMemberAccessFlags(),
-                                                      allow_soft_failures);
+                                                      allow_soft_failures,
+                                                      false);
     if (result != kNoFailure) {
       if (result == kHardFailure) {
         hard_fail = true;
@@ -209,7 +210,8 @@
                                                       it.GetMethodCodeItem(),
                                                       method,
                                                       it.GetMemberAccessFlags(),
-                                                      allow_soft_failures);
+                                                      allow_soft_failures,
+                                                      false);
     if (result != kNoFailure) {
       if (result == kHardFailure) {
         hard_fail = true;
@@ -240,32 +242,34 @@
                                                          const DexFile::CodeItem* code_item,
                                                          mirror::ArtMethod* method,
                                                          uint32_t method_access_flags,
-                                                         bool allow_soft_failures) {
+                                                         bool allow_soft_failures,
+                                                         bool need_precise_constants) {
   MethodVerifier::FailureKind result = kNoFailure;
   uint64_t start_ns = NanoTime();
 
-  MethodVerifier verifier_(dex_file, &dex_cache, &class_loader, class_def, code_item,
-                           method_idx, method, method_access_flags, true, allow_soft_failures);
-  if (verifier_.Verify()) {
+  MethodVerifier verifier(dex_file, &dex_cache, &class_loader, class_def, code_item,
+                           method_idx, method, method_access_flags, true, allow_soft_failures,
+                           need_precise_constants);
+  if (verifier.Verify()) {
     // Verification completed, however failures may be pending that didn't cause the verification
     // to hard fail.
-    CHECK(!verifier_.have_pending_hard_failure_);
-    if (verifier_.failures_.size() != 0) {
+    CHECK(!verifier.have_pending_hard_failure_);
+    if (verifier.failures_.size() != 0) {
       if (VLOG_IS_ON(verifier)) {
-          verifier_.DumpFailures(VLOG_STREAM(verifier) << "Soft verification failures in "
+          verifier.DumpFailures(VLOG_STREAM(verifier) << "Soft verification failures in "
                                 << PrettyMethod(method_idx, *dex_file) << "\n");
       }
       result = kSoftFailure;
     }
   } else {
     // Bad method data.
-    CHECK_NE(verifier_.failures_.size(), 0U);
-    CHECK(verifier_.have_pending_hard_failure_);
-    verifier_.DumpFailures(LOG(INFO) << "Verification error in "
+    CHECK_NE(verifier.failures_.size(), 0U);
+    CHECK(verifier.have_pending_hard_failure_);
+    verifier.DumpFailures(LOG(INFO) << "Verification error in "
                                     << PrettyMethod(method_idx, *dex_file) << "\n");
     if (gDebugVerify) {
-      std::cout << "\n" << verifier_.info_messages_.str();
-      verifier_.Dump(std::cout);
+      std::cout << "\n" << verifier.info_messages_.str();
+      verifier.Dump(std::cout);
     }
     result = kHardFailure;
   }
@@ -286,7 +290,7 @@
                                          mirror::ArtMethod* method,
                                          uint32_t method_access_flags) {
   MethodVerifier verifier(dex_file, &dex_cache, &class_loader, class_def, code_item,
-                          dex_method_idx, method, method_access_flags, true, true);
+                          dex_method_idx, method, method_access_flags, true, true, true);
   verifier.Verify();
   verifier.DumpFailures(os);
   os << verifier.info_messages_.str();
@@ -298,7 +302,8 @@
                                const DexFile::ClassDef* class_def,
                                const DexFile::CodeItem* code_item, uint32_t dex_method_idx,
                                mirror::ArtMethod* method, uint32_t method_access_flags,
-                               bool can_load_classes, bool allow_soft_failures)
+                               bool can_load_classes, bool allow_soft_failures,
+                               bool need_precise_constants)
     : reg_types_(can_load_classes),
       work_insn_idx_(-1),
       dex_method_idx_(dex_method_idx),
@@ -319,6 +324,7 @@
       monitor_enter_count_(0),
       can_load_classes_(can_load_classes),
       allow_soft_failures_(allow_soft_failures),
+      need_precise_constants_(need_precise_constants),
       has_check_casts_(false),
       has_virtual_or_interface_invokes_(false) {
   Runtime::Current()->AddMethodVerifier(this);
@@ -331,16 +337,16 @@
 }
 
 void MethodVerifier::FindLocksAtDexPc(mirror::ArtMethod* m, uint32_t dex_pc,
-                                      std::vector<uint32_t>& monitor_enter_dex_pcs) {
+                                      std::vector<uint32_t>* monitor_enter_dex_pcs) {
   MethodHelper mh(m);
   StackHandleScope<2> hs(Thread::Current());
   Handle<mirror::DexCache> dex_cache(hs.NewHandle(mh.GetDexCache()));
   Handle<mirror::ClassLoader> class_loader(hs.NewHandle(mh.GetClassLoader()));
   MethodVerifier verifier(&mh.GetDexFile(), &dex_cache, &class_loader, &mh.GetClassDef(),
                           mh.GetCodeItem(), m->GetDexMethodIndex(), m, m->GetAccessFlags(), false,
-                          true);
+                          true, false);
   verifier.interesting_dex_pc_ = dex_pc;
-  verifier.monitor_enter_dex_pcs_ = &monitor_enter_dex_pcs;
+  verifier.monitor_enter_dex_pcs_ = monitor_enter_dex_pcs;
   verifier.FindLocksAtDexPc();
 }
 
@@ -356,14 +362,14 @@
 }
 
 mirror::ArtField* MethodVerifier::FindAccessedFieldAtDexPc(mirror::ArtMethod* m,
-                                                        uint32_t dex_pc) {
+                                                           uint32_t dex_pc) {
   MethodHelper mh(m);
   StackHandleScope<2> hs(Thread::Current());
   Handle<mirror::DexCache> dex_cache(hs.NewHandle(mh.GetDexCache()));
   Handle<mirror::ClassLoader> class_loader(hs.NewHandle(mh.GetClassLoader()));
   MethodVerifier verifier(&mh.GetDexFile(), &dex_cache, &class_loader, &mh.GetClassDef(),
                           mh.GetCodeItem(), m->GetDexMethodIndex(), m, m->GetAccessFlags(), true,
-                          true);
+                          true, false);
   return verifier.FindAccessedFieldAtDexPc(dex_pc);
 }
 
@@ -394,7 +400,7 @@
   Handle<mirror::ClassLoader> class_loader(hs.NewHandle(mh.GetClassLoader()));
   MethodVerifier verifier(&mh.GetDexFile(), &dex_cache, &class_loader, &mh.GetClassDef(),
                           mh.GetCodeItem(), m->GetDexMethodIndex(), m, m->GetAccessFlags(), true,
-                          true);
+                          true, false);
   return verifier.FindInvokedMethodAtDexPc(dex_pc);
 }
 
@@ -1438,9 +1444,6 @@
   std::unique_ptr<RegisterLine> branch_line;
   std::unique_ptr<RegisterLine> fallthrough_line;
 
-  // We need precise constant types only for deoptimization which happens at runtime.
-  const bool need_precise_constant = !Runtime::Current()->IsCompiler();
-
   switch (inst->Opcode()) {
     case Instruction::NOP:
       /*
@@ -1592,25 +1595,25 @@
     case Instruction::CONST_4: {
       int32_t val = static_cast<int32_t>(inst->VRegB_11n() << 28) >> 28;
       work_line_->SetRegisterType(inst->VRegA_11n(),
-                                  DetermineCat1Constant(val, need_precise_constant));
+                                  DetermineCat1Constant(val, need_precise_constants_));
       break;
     }
     case Instruction::CONST_16: {
       int16_t val = static_cast<int16_t>(inst->VRegB_21s());
       work_line_->SetRegisterType(inst->VRegA_21s(),
-                                  DetermineCat1Constant(val, need_precise_constant));
+                                  DetermineCat1Constant(val, need_precise_constants_));
       break;
     }
     case Instruction::CONST: {
       int32_t val = inst->VRegB_31i();
       work_line_->SetRegisterType(inst->VRegA_31i(),
-                                  DetermineCat1Constant(val, need_precise_constant));
+                                  DetermineCat1Constant(val, need_precise_constants_));
       break;
     }
     case Instruction::CONST_HIGH16: {
       int32_t val = static_cast<int32_t>(inst->VRegB_21h() << 16);
       work_line_->SetRegisterType(inst->VRegA_21h(),
-                                  DetermineCat1Constant(val, need_precise_constant));
+                                  DetermineCat1Constant(val, need_precise_constants_));
       break;
     }
       /* could be long or double; resolved upon use */
diff --git a/runtime/verifier/method_verifier.h b/runtime/verifier/method_verifier.h
index f614425..a23e80d 100644
--- a/runtime/verifier/method_verifier.h
+++ b/runtime/verifier/method_verifier.h
@@ -185,7 +185,7 @@
   // Fills 'monitor_enter_dex_pcs' with the dex pcs of the monitor-enter instructions corresponding
   // to the locks held at 'dex_pc' in method 'm'.
   static void FindLocksAtDexPc(mirror::ArtMethod* m, uint32_t dex_pc,
-                               std::vector<uint32_t>& monitor_enter_dex_pcs)
+                               std::vector<uint32_t>* monitor_enter_dex_pcs)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Returns the accessed field corresponding to the quick instruction's field
@@ -208,7 +208,8 @@
   MethodVerifier(const DexFile* dex_file, Handle<mirror::DexCache>* dex_cache,
                  Handle<mirror::ClassLoader>* class_loader, const DexFile::ClassDef* class_def,
                  const DexFile::CodeItem* code_item, uint32_t method_idx, mirror::ArtMethod* method,
-                 uint32_t access_flags, bool can_load_classes, bool allow_soft_failures)
+                 uint32_t access_flags, bool can_load_classes, bool allow_soft_failures,
+                 bool need_precise_constants)
           SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   ~MethodVerifier();
@@ -260,7 +261,7 @@
                                   const DexFile::ClassDef* class_def_idx,
                                   const DexFile::CodeItem* code_item,
                                   mirror::ArtMethod* method, uint32_t method_access_flags,
-                                  bool allow_soft_failures)
+                                  bool allow_soft_failures, bool need_precise_constants)
           SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void FindLocksAtDexPc() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
@@ -672,6 +673,12 @@
   // running and the verifier is called from the class linker.
   const bool allow_soft_failures_;
 
+  // An optimization where instead of generating unique RegTypes for constants we use imprecise
+  // constants that cover a range of values. This isn't good enough for deoptimization, which
+  // avoids reloading constants from registers, because the dex instruction set has lost the
+  // notion of whether a value belongs in a floating point or general purpose register file.
+  const bool need_precise_constants_;
+
   // Indicates the method being verified contains at least one check-cast or aput-object
   // instruction. Aput-object operations implicitly check for array-store exceptions, similar to
   // check-cast.
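
The need_precise_constants_ comment hinges on the dex format recording a constant only as raw bits: whether those bits are an int or a float is not encoded, so deoptimization cannot tell which register file the value belongs in unless the verifier keeps the constant precise. A standalone illustration of the ambiguity (plain C++, no verifier types):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t raw = 0x3f800000;     // One possible dex const payload.
  int32_t as_int;
  float as_float;
  std::memcpy(&as_int, &raw, sizeof(raw));
  std::memcpy(&as_float, &raw, sizeof(raw));
  assert(as_int == 1065353216);  // The same bits read as an integer...
  assert(as_float == 1.0f);      // ...or as a float.
  return 0;
}
```
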
diff --git a/runtime/zip_archive.cc b/runtime/zip_archive.cc
index 841c01a..c02f310 100644
--- a/runtime/zip_archive.cc
+++ b/runtime/zip_archive.cc
@@ -50,13 +50,14 @@
   return true;
 }
 
-MemMap* ZipEntry::ExtractToMemMap(const char* entry_filename, std::string* error_msg) {
+MemMap* ZipEntry::ExtractToMemMap(const char* zip_filename, const char* entry_filename,
+                                  std::string* error_msg) {
   std::string name(entry_filename);
   name += " extracted in memory from ";
-  name += entry_filename;
+  name += zip_filename;
   std::unique_ptr<MemMap> map(MemMap::MapAnonymous(name.c_str(),
-                                             NULL, GetUncompressedLength(),
-                                             PROT_READ | PROT_WRITE, false, error_msg));
+                                                   NULL, GetUncompressedLength(),
+                                                   PROT_READ | PROT_WRITE, false, error_msg));
   if (map.get() == nullptr) {
     DCHECK(!error_msg->empty());
     return nullptr;
diff --git a/runtime/zip_archive.h b/runtime/zip_archive.h
index c0e2f2f..865af51 100644
--- a/runtime/zip_archive.h
+++ b/runtime/zip_archive.h
@@ -37,7 +37,8 @@
 class ZipEntry {
  public:
   bool ExtractToFile(File& file, std::string* error_msg);
-  MemMap* ExtractToMemMap(const char* entry_filename, std::string* error_msg);
+  MemMap* ExtractToMemMap(const char* zip_filename, const char* entry_filename,
+                          std::string* error_msg);
   virtual ~ZipEntry();
 
   uint32_t GetUncompressedLength();
diff --git a/test/079-phantom/src/Bitmap.java b/test/079-phantom/src/Bitmap.java
index 9d03cbd..85eb3cc 100644
--- a/test/079-phantom/src/Bitmap.java
+++ b/test/079-phantom/src/Bitmap.java
@@ -29,6 +29,7 @@
             new ReferenceQueue<PhantomWrapper>();
     private static BitmapWatcher sWatcher = new BitmapWatcher(sPhantomQueue);
     static {
+        sWatcher.setDaemon(true);
         sWatcher.start();
     };