ART: Vectorization opcode implementation fixes

This patch fixes and extends the implementation of the x86 vectorization
opcodes:
- kMirOpReserveVectorRegisters / kMirOpReturnVectorRegisters now take an
  inclusive range of vector registers (vA..vB) instead of a count.
- New extended MIRs kMirOpPackedArrayGet / kMirOpPackedArrayPut are defined
  (currently unimplemented on x86).
- The packed reduce/set MIRs use the extended dataflow format so that wide
  (long/double) operands get correct SSA use/def information.
- Long (64-bit) support is added for packed multiply (emulated), add,
  subtract, set, reduce and add-reduce.
- Byte-vector multiply and shift emulation is fixed, Mova128 is renamed to
  Movdqa, and missing SSE encodings (punpcklbw/wd/qdq, pmuludq, paddq,
  psubq, psadbw, psrldq) are added.
- Destination vector registers are clobbered, and the 16-byte alignment of
  the vector literal pool now accounts for the OatQuickMethodHeader.
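
The vector-literal alignment change is easiest to see as a small sketch.
The helper name and the example header size below are illustrative only;
the patch inlines the expression at both sites (offset assignment in
assemble_x86.cc and code emission in target_x86.cc):

    // Padding needed so that the vector literal pool following `offset`
    // bytes of code starts on a 16-byte boundary. The method code itself
    // is 16-byte aligned on x86, but it is preceded by an
    // OatQuickMethodHeader, which must be accounted for.
    static uint32_t VectorLiteralPadding(uint32_t offset) {
      return (0x10 - ((offset + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF;
    }

    // Example (illustrative header size of 28 bytes): offset == 100 gives
    // (100 + 28) % 16 == 0, so no padding; offset == 106 gives
    // (106 + 28) % 16 == 6, so 10 bytes of zero padding are emitted
    // before the literals.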

Change-Id: I0028d54a9fa6edce791b7e3a053002d076798748
Signed-off-by: Razvan A Lupusoru <razvan.a.lupusoru@intel.com>
Signed-off-by: Udayan Banerji <udayan.banerji@intel.com>
Signed-off-by: Philbert Lin <philbert.lin@intel.com>
diff --git a/compiler/dex/backend.h b/compiler/dex/backend.h
index 1f24849..cab3427 100644
--- a/compiler/dex/backend.h
+++ b/compiler/dex/backend.h
@@ -38,14 +38,15 @@
 
     /*
      * Return the number of reservable vector registers supported
-     * @param fp_used  ‘true’ if floating point computations will be
-     * executed while vector registers are reserved.
+     * @param long_or_fp ‘true’ if floating point or long (64-bit)
+     * computations will be executed while vector registers are
+     * reserved.
      * @return the number of vector registers that are available
      * @note The backend should ensure that sufficient vector registers
      * are held back to generate scalar code without exhausting vector
      * registers, if scalar code also uses the vector registers.
      */
-    virtual int NumReservableVectorRegisters(bool fp_used) { return 0; }
+    virtual int NumReservableVectorRegisters(bool long_or_fp) { return 0; }
 
   protected:
     explicit Backend(ArenaAllocator* arena) : arena_(arena) {}
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 9c2a8ba..e4003bf 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -256,13 +256,16 @@
   // vC: TypeSize
   kMirOpPackedSet,
 
-  // @brief Reserve N vector registers (named 0..N-1)
-  // vA: Number of registers
+  // @brief Reserve a range of vector registers.
+  // vA: Start vector register to reserve.
+  // vB: Inclusive end vector register to reserve.
   // @note: The backend may choose to map vector numbers used in vector opcodes.
   //  Reserved registers are removed from the list of backend temporary pool.
   kMirOpReserveVectorRegisters,
 
-  // @brief Free Reserved vector registers
+  // @brief Free a range of reserved vector registers.
+  // vA: Start vector register to unreserve.
+  // vB: Inclusive end vector register to unreserve.
   // @note: All currently reserved vector registers are returned to the temporary pool.
   kMirOpReturnVectorRegisters,
 
@@ -270,6 +273,30 @@
   // vA: a constant defined by enum MemBarrierKind.
   kMirOpMemBarrier,
 
+  // @brief Used to fill a vector register with array values.
+  // @details Just as with normal arrays, an access through a null object register must throw a
+  // NullPointerException and an invalid index must throw an ArrayIndexOutOfBoundsException. The
+  // exception behavior must be the same as that of the aget it replaced and must happen at the
+  // same index. Therefore, it is generally recommended that this MIR only be used once it has been
+  // proven that no exception can be thrown, and marked with MIR_IGNORE_NULL_CHECK and MIR_IGNORE_RANGE_CHECK.
+  // vA: destination vector register
+  // vB: array register
+  // vC: index register
+  // arg[0]: TypeSize (most other vector opcodes have this in vC)
+  kMirOpPackedArrayGet,
+
+  // @brief Used to store a vector register into an array.
+  // @details Just as with normal arrays, an access through a null object register must throw a
+  // NullPointerException and an invalid index must throw an ArrayIndexOutOfBoundsException. The
+  // exception behavior must be the same as that of the aput it replaced and must happen at the
+  // same index. Therefore, it is generally recommended that this MIR only be used once it has been
+  // proven that no exception can be thrown, and marked with MIR_IGNORE_NULL_CHECK and MIR_IGNORE_RANGE_CHECK.
+  // vA: source vector register
+  // vB: array register
+  // vC: index register
+  // arg[0]: TypeSize (most other vector opcodes have this in vC)
+  kMirOpPackedArrayPut,
+
   kMirOpLast,
 };
 
diff --git a/compiler/dex/mir_analysis.cc b/compiler/dex/mir_analysis.cc
index a8af92c..b265ee7 100644
--- a/compiler/dex/mir_analysis.cc
+++ b/compiler/dex/mir_analysis.cc
@@ -830,68 +830,74 @@
   // 109 MIR_RANGE_CHECK
   AN_NONE,
 
-  // 110 MIR_DIV_ZERO_CHECK
+  // 10A MIR_DIV_ZERO_CHECK
   AN_NONE,
 
-  // 111 MIR_CHECK
+  // 10B MIR_CHECK
   AN_NONE,
 
-  // 112 MIR_CHECKPART2
+  // 10C MIR_CHECKPART2
   AN_NONE,
 
-  // 113 MIR_SELECT
+  // 10D MIR_SELECT
   AN_NONE,
 
-  // 114 MirOpConstVector
+  // 10E MirOpConstVector
   AN_NONE,
 
-  // 115 MirOpMoveVector
+  // 10F MirOpMoveVector
   AN_NONE,
 
-  // 116 MirOpPackedMultiply
+  // 110 MirOpPackedMultiply
   AN_NONE,
 
-  // 117 MirOpPackedAddition
+  // 111 MirOpPackedAddition
   AN_NONE,
 
-  // 118 MirOpPackedSubtract
+  // 112 MirOpPackedSubtract
   AN_NONE,
 
-  // 119 MirOpPackedShiftLeft
+  // 113 MirOpPackedShiftLeft
   AN_NONE,
 
-  // 120 MirOpPackedSignedShiftRight
+  // 114 MirOpPackedSignedShiftRight
   AN_NONE,
 
-  // 121 MirOpPackedUnsignedShiftRight
+  // 115 MirOpPackedUnsignedShiftRight
   AN_NONE,
 
-  // 122 MirOpPackedAnd
+  // 116 MirOpPackedAnd
   AN_NONE,
 
-  // 123 MirOpPackedOr
+  // 117 MirOpPackedOr
   AN_NONE,
 
-  // 124 MirOpPackedXor
+  // 118 MirOpPackedXor
   AN_NONE,
 
-  // 125 MirOpPackedAddReduce
+  // 119 MirOpPackedAddReduce
   AN_NONE,
 
-  // 126 MirOpPackedReduce
+  // 11A MirOpPackedReduce
   AN_NONE,
 
-  // 127 MirOpPackedSet
+  // 11B MirOpPackedSet
   AN_NONE,
 
-  // 128 MirOpReserveVectorRegisters
+  // 11C MirOpReserveVectorRegisters
   AN_NONE,
 
-  // 129 MirOpReturnVectorRegisters
+  // 11D MirOpReturnVectorRegisters
   AN_NONE,
 
-  // 130 MirOpMemBarrier
+  // 11E MirOpMemBarrier
   AN_NONE,
+
+  // 11F MirOpPackedArrayGet
+  AN_ARRAYOP,
+
+  // 120 MirOpPackedArrayPut
+  AN_ARRAYOP,
 };
 
 struct MethodStats {
diff --git a/compiler/dex/mir_dataflow.cc b/compiler/dex/mir_dataflow.cc
index 4c906b0..d9531fb 100644
--- a/compiler/dex/mir_dataflow.cc
+++ b/compiler/dex/mir_dataflow.cc
@@ -829,68 +829,74 @@
   // 109 MIR_RANGE_CHECK
   0,
 
-  // 110 MIR_DIV_ZERO_CHECK
+  // 10A MIR_DIV_ZERO_CHECK
   0,
 
-  // 111 MIR_CHECK
+  // 10B MIR_CHECK
   0,
 
-  // 112 MIR_CHECKPART2
+  // 10C MIR_CHECKPART2
   0,
 
-  // 113 MIR_SELECT
+  // 10D MIR_SELECT
   DF_DA | DF_UB,
 
-  // 114 MirOpConstVector
-  DF_DA,
-
-  // 115 MirOpMoveVector
+  // 10E MirOpConstVector
   0,
 
-  // 116 MirOpPackedMultiply
+  // 10F MirOpMoveVector
   0,
 
-  // 117 MirOpPackedAddition
+  // 110 MirOpPackedMultiply
   0,
 
-  // 118 MirOpPackedSubtract
+  // 111 MirOpPackedAddition
   0,
 
-  // 119 MirOpPackedShiftLeft
+  // 112 MirOpPackedSubtract
   0,
 
-  // 120 MirOpPackedSignedShiftRight
+  // 113 MirOpPackedShiftLeft
   0,
 
-  // 121 MirOpPackedUnsignedShiftRight
+  // 114 MirOpPackedSignedShiftRight
   0,
 
-  // 122 MirOpPackedAnd
+  // 115 MirOpPackedUnsignedShiftRight
   0,
 
-  // 123 MirOpPackedOr
+  // 116 MirOpPackedAnd
   0,
 
-  // 124 MirOpPackedXor
+  // 117 MirOpPackedOr
   0,
 
-  // 125 MirOpPackedAddReduce
-  DF_DA | DF_UA,
-
-  // 126 MirOpPackedReduce
-  DF_DA,
-
-  // 127 MirOpPackedSet
-  DF_UB,
-
-  // 128 MirOpReserveVectorRegisters
+  // 118 MirOpPackedXor
   0,
 
-  // 129 MirOpReturnVectorRegisters
+  // 119 MirOpPackedAddReduce
+  DF_FORMAT_EXTENDED,
+
+  // 11A MirOpPackedReduce
+  DF_FORMAT_EXTENDED,
+
+  // 11B MirOpPackedSet
+  DF_FORMAT_EXTENDED,
+
+  // 11C MirOpReserveVectorRegisters
   0,
 
-  // 130 MirOpMemBarrier
+  // 11D MirOpReturnVectorRegisters
   0,
+
+  // 11E MirOpMemBarrier
+  0,
+
+  // 11F MirOpPackedArrayGet
+  DF_UB | DF_UC | DF_NULL_CHK_0 | DF_RANGE_CHK_1 | DF_REF_B | DF_CORE_C | DF_LVN,
+
+  // 120 MirOpPackedArrayPut
+  DF_UB | DF_UC | DF_NULL_CHK_0 | DF_RANGE_CHK_1 | DF_REF_B | DF_CORE_C | DF_LVN,
 };
 
 /* Return the base virtual register for a SSA name */
@@ -915,7 +921,36 @@
 void MIRGraph::HandleExtended(ArenaBitVector* use_v, ArenaBitVector* def_v,
                             ArenaBitVector* live_in_v,
                             const MIR::DecodedInstruction& d_insn) {
+  // For vector MIRs, vC contains type information
+  bool is_vector_type_wide = false;
+  int type_size = d_insn.vC >> 16;
+  if (type_size == k64 || type_size == kDouble) {
+    is_vector_type_wide = true;
+  }
+
   switch (static_cast<int>(d_insn.opcode)) {
+    case kMirOpPackedAddReduce:
+      HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vA);
+      if (is_vector_type_wide == true) {
+        HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vA + 1);
+      }
+      HandleDef(def_v, d_insn.vA);
+      if (is_vector_type_wide == true) {
+        HandleDef(def_v, d_insn.vA + 1);
+      }
+      break;
+    case kMirOpPackedReduce:
+      HandleDef(def_v, d_insn.vA);
+      if (is_vector_type_wide == true) {
+        HandleDef(def_v, d_insn.vA + 1);
+      }
+      break;
+    case kMirOpPackedSet:
+      HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vB);
+      if (is_vector_type_wide == true) {
+        HandleLiveInUse(use_v, def_v, live_in_v, d_insn.vB + 1);
+      }
+      break;
     default:
       LOG(ERROR) << "Unexpected Extended Opcode " << d_insn.opcode;
       break;
@@ -1064,7 +1099,46 @@
 }
 
 void MIRGraph::DataFlowSSAFormatExtended(MIR* mir) {
+  const MIR::DecodedInstruction& d_insn = mir->dalvikInsn;
+  // For vector MIRs, vC contains type information
+  bool is_vector_type_wide = false;
+  int type_size = d_insn.vC >> 16;
+  if (type_size == k64 || type_size == kDouble) {
+    is_vector_type_wide = true;
+  }
+
   switch (static_cast<int>(mir->dalvikInsn.opcode)) {
+    case kMirOpPackedAddReduce:
+      // We have one use, plus one more for wide
+      AllocateSSAUseData(mir, is_vector_type_wide ? 2 : 1);
+      HandleSSAUse(mir->ssa_rep->uses, d_insn.vA, 0);
+      if (is_vector_type_wide == true) {
+        HandleSSAUse(mir->ssa_rep->uses, d_insn.vA + 1, 1);
+      }
+
+      // We have a def, plus one more for wide
+      AllocateSSADefData(mir, is_vector_type_wide ? 2 : 1);
+      HandleSSADef(mir->ssa_rep->defs, d_insn.vA, 0);
+      if (is_vector_type_wide == true) {
+        HandleSSADef(mir->ssa_rep->defs, d_insn.vA + 1, 1);
+      }
+      break;
+    case kMirOpPackedReduce:
+      // We have a def, plus one more for wide
+      AllocateSSADefData(mir, is_vector_type_wide ? 2 : 1);
+      HandleSSADef(mir->ssa_rep->defs, d_insn.vA, 0);
+      if (is_vector_type_wide == true) {
+        HandleSSADef(mir->ssa_rep->defs, d_insn.vA + 1, 1);
+      }
+      break;
+    case kMirOpPackedSet:
+      // We have one use, plus one more for wide
+      AllocateSSAUseData(mir, is_vector_type_wide ? 2 : 1);
+      HandleSSAUse(mir->ssa_rep->uses, d_insn.vB, 0);
+      if (is_vector_type_wide == true) {
+        HandleSSAUse(mir->ssa_rep->uses, d_insn.vB + 1, 1);
+      }
+      break;
     default:
       LOG(ERROR) << "Missing case for extended MIR: " << mir->dalvikInsn.opcode;
       break;
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index e77be5d..62a8f26 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -68,6 +68,8 @@
   "ReserveVectorRegisters",
   "ReturnVectorRegisters",
   "MemBarrier",
+  "PackedArrayGet",
+  "PackedArrayPut",
 };
 
 MIRGraph::MIRGraph(CompilationUnit* cu, ArenaAllocator* arena)
@@ -1386,6 +1388,18 @@
       decoded_mir->append(ss.str());
       break;
     }
+    case kMirOpPackedArrayGet:
+    case kMirOpPackedArrayPut:
+      decoded_mir->append(StringPrintf(" vect%d", mir->dalvikInsn.vA));
+      if (ssa_rep != nullptr) {
+        decoded_mir->append(StringPrintf(", %s[%s]",
+                                        GetSSANameWithConst(ssa_rep->uses[0], false).c_str(),
+                                        GetSSANameWithConst(ssa_rep->uses[1], false).c_str()));
+      } else {
+        decoded_mir->append(StringPrintf(", v%d[v%d]", mir->dalvikInsn.vB, mir->dalvikInsn.vC));
+      }
+      FillTypeSizeString(mir->dalvikInsn.arg[0], decoded_mir);
+      break;
     default:
       break;
   }
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 46f5dd3..9935a22 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -16,6 +16,7 @@
 
 #include "codegen_x86.h"
 #include "dex/quick/mir_to_lir-inl.h"
+#include "oat.h"
 #include "x86_lir.h"
 
 namespace art {
@@ -389,20 +390,27 @@
   EXT_0F_ENCODING_MAP(Subss,     0xF3, 0x5C, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Divsd,     0xF2, 0x5E, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Divss,     0xF3, 0x5E, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Punpcklbw, 0x66, 0x60, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Punpcklwd, 0x66, 0x61, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Punpcklqdq, 0x66, 0x6C, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Sqrtsd,    0xF2, 0x51, REG_DEF0_USE0),
   EXT_0F_ENCODING2_MAP(Pmulld,   0x66, 0x38, 0x40, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Pmullw,    0x66, 0xD5, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Pmuludq,   0x66, 0xF4, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Mulps,     0x00, 0x59, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Mulpd,     0x66, 0x59, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Paddb,     0x66, 0xFC, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Paddw,     0x66, 0xFD, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Paddd,     0x66, 0xFE, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Paddq,     0x66, 0xD4, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Psadbw,    0x66, 0xF6, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Addps,     0x00, 0x58, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Addpd,     0xF2, 0x58, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Psubb,     0x66, 0xF8, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Psubw,     0x66, 0xF9, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Psubd,     0x66, 0xFA, REG_DEF0_USE0),
+  EXT_0F_ENCODING_MAP(Psubq,     0x66, 0xFB, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Subps,     0x00, 0x5C, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Subpd,     0x66, 0x5C, REG_DEF0_USE0),
   EXT_0F_ENCODING_MAP(Pand,      0x66, 0xDB, REG_DEF0_USE0),
@@ -431,6 +439,7 @@
   { kX86PsrlwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 2, 0, 1, false }, "PsrlwRI", "!0r,!1d" },
   { kX86PsrldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 2, 0, 1, false }, "PsrldRI", "!0r,!1d" },
   { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1, false }, "PsrlqRI", "!0r,!1d" },
+  { kX86PsrldqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 3, 0, 1, false }, "PsrldqRI", "!0r,!1d" },
   { kX86PsllwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 6, 0, 1, false }, "PsllwRI", "!0r,!1d" },
   { kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1, false }, "PslldRI", "!0r,!1d" },
   { kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1, false }, "PsllqRI", "!0r,!1d" },
@@ -447,9 +456,9 @@
   { kX86Fucompp,  kNullary, NO_OPERAND | USE_FP_STACK,                          { 0xDA, 0,    0xE9, 0,    0, 0, 0, 0, false }, "Fucompp",  "" },
   { kX86Fstsw16R, kNullary, NO_OPERAND | REG_DEFA | USE_FP_STACK,               { 0x9B, 0xDF, 0xE0, 0,    0, 0, 0, 0, false }, "Fstsw16R", "ax" },
 
-  EXT_0F_ENCODING_MAP(Mova128,    0x66, 0x6F, REG_DEF0),
-  { kX86Mova128MR, kMemReg,   IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128MR", "[!0r+!1d],!2r" },
-  { kX86Mova128AR, kArrayReg, IS_STORE | IS_QUIN_OP     | REG_USE014, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128AR", "[!0r+!1r<<!2d+!3d],!4r" },
+  EXT_0F_ENCODING_MAP(Movdqa,    0x66, 0x6F, REG_DEF0),
+  { kX86MovdqaMR, kMemReg,   IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "MovdqaMR", "[!0r+!1d],!2r" },
+  { kX86MovdqaAR, kArrayReg, IS_STORE | IS_QUIN_OP     | REG_USE014, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "MovdqaAR", "[!0r+!1r<<!2d+!3d],!4r" },
 
 
   EXT_0F_ENCODING_MAP(Movups,    0x0, 0x10, REG_DEF0),
@@ -1956,17 +1965,12 @@
   int offset = AssignInsnOffsets();
 
   if (const_vectors_ != nullptr) {
-    /* assign offsets to vector literals */
-
-    // First, get offset to 12 mod 16 to align to 16 byte boundary.
-    // This will ensure that the vector is 16 byte aligned, as the procedure is
-    // always aligned at at 4 mod 16.
-    int align_size = (16-4) - (offset & 0xF);
-    if (align_size < 0) {
-      align_size += 16;
-    }
-
-    offset += align_size;
+    // Vector literals must be 16-byte aligned. The OatQuickMethodHeader placed
+    // before the code causes misalignment, so we take it into account here.
+    // Apart from the header, the x86 method start itself is 16-byte aligned.
+    DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u);
+    uint32_t bytes_to_fill = (0x10 - ((offset + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF;
+    offset += bytes_to_fill;
 
     // Now assign each literal the right offset.
     for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 7ad917d..a85e02f 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -125,7 +125,7 @@
 
   void CompilerInitializeRegAlloc() OVERRIDE;
   int VectorRegisterSize() OVERRIDE;
-  int NumReservableVectorRegisters(bool fp_used) OVERRIDE;
+  int NumReservableVectorRegisters(bool long_or_fp) OVERRIDE;
 
   // Required for target - miscellaneous.
   void AssembleLIR() OVERRIDE;
@@ -479,7 +479,8 @@
   void GenFusedLongCmpImmBranch(BasicBlock* bb, RegLocation rl_src1,
                                 int64_t val, ConditionCode ccode);
   void GenConstWide(RegLocation rl_dest, int64_t value);
-  void GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir);
+  void GenMultiplyVectorSignedByte(RegStorage rs_dest_src1, RegStorage rs_src2);
+  void GenMultiplyVectorLong(RegStorage rs_dest_src1, RegStorage rs_src2);
   void GenShiftByteVector(BasicBlock *bb, MIR *mir);
   void AndMaskVectorRegister(RegStorage rs_src1, uint32_t m1, uint32_t m2, uint32_t m3,
                              uint32_t m4);
@@ -521,20 +522,18 @@
   bool GenInlinedIndexOf(CallInfo* info, bool zero_based);
 
   /**
-   * @brief Reserve a fixed number of vector  registers from the register pool
-   * @details The mir->dalvikInsn.vA specifies an N such that vector registers
-   * [0..N-1] are removed from the temporary pool. The caller must call
-   * ReturnVectorRegisters before calling ReserveVectorRegisters again.
-   * Also sets the num_reserved_vector_regs_ to the specified value
-   * @param mir whose vA specifies the number of registers to reserve
+   * @brief Used to reserve a range of vector registers.
+   * @see kMirOpReserveVectorRegisters
+   * @param mir The extended MIR for reservation.
    */
   void ReserveVectorRegisters(MIR* mir);
 
   /**
-   * @brief Return all the reserved vector registers to the temp pool
-   * @details Returns [0..num_reserved_vector_regs_]
+   * @brief Used to return a range of vector registers.
+   * @see kMirOpReturnVectorRegisters
+   * @param mir The extended MIR for returning vector regs.
    */
-  void ReturnVectorRegisters();
+  void ReturnVectorRegisters(MIR* mir);
 
   /*
    * @brief Load 128 bit constant into vector register.
@@ -684,6 +683,20 @@
    */
   void GenSetVector(BasicBlock *bb, MIR *mir);
 
+  /**
+   * @brief Used to generate code for kMirOpPackedArrayGet.
+   * @param bb The basic block of MIR.
+   * @param mir The mir whose opcode is kMirOpPackedArrayGet.
+   */
+  void GenPackedArrayGet(BasicBlock *bb, MIR *mir);
+
+  /**
+   * @brief Used to generate code for kMirOpPackedArrayPut.
+   * @param bb The basic block of MIR.
+   * @param mir The mir whose opcode is kMirOpPackedArrayPut.
+   */
+  void GenPackedArrayPut(BasicBlock *bb, MIR *mir);
+
   /*
    * @brief Generate code for a vector opcode.
    * @param bb The basic block in which the MIR is from.
@@ -937,20 +950,20 @@
   LIR* stack_increment_;
 
   // The list of const vector literals.
-  LIR *const_vectors_;
+  LIR* const_vectors_;
 
   /*
    * @brief Search for a matching vector literal
-   * @param mir A kMirOpConst128b MIR instruction to match.
+   * @param constants An array of four 32-bit constants to match.
    * @returns pointer to matching LIR constant, or nullptr if not found.
    */
-  LIR *ScanVectorLiteral(MIR *mir);
+  LIR* ScanVectorLiteral(int32_t* constants);
 
   /*
    * @brief Add a constant vector literal
-   * @param mir A kMirOpConst128b MIR instruction to match.
+   * @param constants An array of four 32-bit constants to add.
    */
-  LIR *AddVectorLiteral(MIR *mir);
+  LIR* AddVectorLiteral(int32_t* constants);
 
   InToRegStorageMapping in_to_reg_storage_mapping_;
 
@@ -970,9 +983,6 @@
   static const X86EncodingMap EncodingMap[kX86Last];
 
  private:
-  // The number of vector registers [0..N] reserved by a call to ReserveVectorRegisters
-  int num_reserved_vector_regs_;
-
   void SwapBits(RegStorage result_reg, int shift, int32_t value);
   void SwapBits64(RegStorage result_reg, int shift, int64_t value);
 };
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 68c1633..ffe6702 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -24,6 +24,7 @@
 #include "dex/reg_storage_eq.h"
 #include "mirror/array.h"
 #include "mirror/string.h"
+#include "oat.h"
 #include "x86_lir.h"
 #include "utils/dwarf_cfi.h"
 
@@ -454,7 +455,7 @@
 }
 
 RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) {
-  return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg();
+  return GetRegInfo(reg)->Master()->GetReg();
 }
 
 bool X86Mir2Lir::IsByteRegister(RegStorage reg) {
@@ -689,8 +690,11 @@
   return 128;
 }
 
-int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) {
-  return fp_used ? 5 : 7;
+int X86Mir2Lir::NumReservableVectorRegisters(bool long_or_fp) {
+  int num_vector_temps = cu_->target64 ? xp_temps_64.size() : xp_temps_32.size();
+
+  // Leave a few temps for the backend to use as scratch.
+  return long_or_fp ? num_vector_temps - 2 : num_vector_temps - 1;
 }
 
 void X86Mir2Lir::SpillCoreRegs() {
@@ -864,9 +868,6 @@
   rX86_RET1 = rDX;
   rX86_INVOKE_TGT = rAX;
   rX86_COUNT = rCX;
-
-  // Initialize the number of reserved vector registers
-  num_reserved_vector_regs_ = -1;
 }
 
 Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph,
@@ -1022,19 +1023,18 @@
   DCHECK(method_literal_list_ == nullptr);
   DCHECK(class_literal_list_ == nullptr);
 
-  // Align to 16 byte boundary.  We have implicit knowledge that the start of the method is
-  // on a 4 byte boundary.   How can I check this if it changes (other than aligned loads
-  // will fail at runtime)?
+
   if (const_vectors_ != nullptr) {
-    int align_size = (16-4) - (code_buffer_.size() & 0xF);
-    if (align_size < 0) {
-      align_size += 16;
+    // Vector literals must be 16-byte aligned. The OatQuickMethodHeader placed
+    // before the code causes misalignment, so we take it into account here.
+    // Apart from the header, the x86 method start itself is 16-byte aligned.
+    DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u);
+    uint32_t bytes_to_fill = (0x10 - ((code_buffer_.size() + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF;
+    while (bytes_to_fill > 0) {
+      code_buffer_.push_back(0);
+      bytes_to_fill--;
     }
 
-    while (align_size > 0) {
-      code_buffer_.push_back(0);
-      align_size--;
-    }
     for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
       PushWord(&code_buffer_, p->operands[0]);
       PushWord(&code_buffer_, p->operands[1]);
@@ -1489,7 +1489,7 @@
       ReserveVectorRegisters(mir);
       break;
     case kMirOpReturnVectorRegisters:
-      ReturnVectorRegisters();
+      ReturnVectorRegisters(mir);
       break;
     case kMirOpConstVector:
       GenConst128(bb, mir);
@@ -1536,17 +1536,19 @@
     case kMirOpMemBarrier:
       GenMemBarrier(static_cast<MemBarrierKind>(mir->dalvikInsn.vA));
       break;
+    case kMirOpPackedArrayGet:
+      GenPackedArrayGet(bb, mir);
+      break;
+    case kMirOpPackedArrayPut:
+      GenPackedArrayPut(bb, mir);
+      break;
     default:
       break;
   }
 }
 
 void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) {
-  // We should not try to reserve twice without returning the registers
-  DCHECK_NE(num_reserved_vector_regs_, -1);
-
-  int num_vector_reg = mir->dalvikInsn.vA;
-  for (int i = 0; i < num_vector_reg; i++) {
+  for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) {
     RegStorage xp_reg = RegStorage::Solo128(i);
     RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
     Clobber(xp_reg);
@@ -1561,13 +1563,10 @@
       }
     }
   }
-
-  num_reserved_vector_regs_ = num_vector_reg;
 }
 
-void X86Mir2Lir::ReturnVectorRegisters() {
-  // Return all the reserved registers
-  for (int i = 0; i < num_reserved_vector_regs_; i++) {
+void X86Mir2Lir::ReturnVectorRegisters(MIR* mir) {
+  for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) {
     RegStorage xp_reg = RegStorage::Solo128(i);
     RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
 
@@ -1581,17 +1580,12 @@
       }
     }
   }
-
-  // We don't have anymore reserved vector registers
-  num_reserved_vector_regs_ = -1;
 }
 
 void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
-  store_method_addr_used_ = true;
-  int type_size = mir->dalvikInsn.vB;
-  // We support 128 bit vectors.
-  DCHECK_EQ(type_size & 0xFFFF, 128);
   RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest);
+
   uint32_t *args = mir->dalvikInsn.arg;
   int reg = rs_dest.GetReg();
   // Check for all 0 case.
@@ -1601,14 +1595,24 @@
   }
 
   // Append the mov const vector to reg opcode.
-  AppendOpcodeWithConst(kX86MovupsRM, reg, mir);
+  AppendOpcodeWithConst(kX86MovdqaRM, reg, mir);
 }
 
 void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) {
-  // Okay, load it from the constant vector area.
-  LIR *data_target = ScanVectorLiteral(mir);
+  // The literal pool needs position independent logic.
+  store_method_addr_used_ = true;
+
+  // Reverse the order of the constants so they get the correct in-memory layout.
+  int32_t constants[4];
+  constants[3] = mir->dalvikInsn.arg[0];
+  constants[2] = mir->dalvikInsn.arg[1];
+  constants[1] = mir->dalvikInsn.arg[2];
+  constants[0] = mir->dalvikInsn.arg[3];
+
+  // Search if there is already a constant in pool with this value.
+  LIR *data_target = ScanVectorLiteral(constants);
   if (data_target == nullptr) {
-    data_target = AddVectorLiteral(mir);
+    data_target = AddVectorLiteral(constants);
   }
 
   // Address the start of the method.
@@ -1624,7 +1628,7 @@
   // 4 byte offset.  We will fix this up in the assembler later to have the right
   // value.
   ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
-  LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg());
+  LIR *load = NewLIR3(opcode, reg, rl_method.reg.GetReg(), 256 /* bogus */);
   load->flags.fixup = kFixupLoad;
   load->target = data_target;
 }
@@ -1633,16 +1637,12 @@
   // We only support 128 bit registers.
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest);
   RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
-  NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg());
+  NewLIR2(kX86MovdqaRR, rs_dest.GetReg(), rs_src.GetReg());
 }
 
-void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) {
-  const int BYTE_SIZE = 8;
-  RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
-  RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempWide());
-
+void X86Mir2Lir::GenMultiplyVectorSignedByte(RegStorage rs_dest_src1, RegStorage rs_src2) {
   /*
    * Emulate the behavior of a kSignedByte by separating out the 16 values in the two XMM
    * and multiplying 8 at a time before recombining back into one XMM register.
@@ -1660,29 +1660,100 @@
    */
 
   // Copy xmm1.
-  NewLIR2(kX86Mova128RR, rs_src1_high_tmp.GetReg(), rs_dest_src1.GetReg());
+  RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempDouble());
+  RegStorage rs_dest_high_tmp = Get128BitRegister(AllocTempDouble());
+  NewLIR2(kX86MovdqaRR, rs_src1_high_tmp.GetReg(), rs_src2.GetReg());
+  NewLIR2(kX86MovdqaRR, rs_dest_high_tmp.GetReg(), rs_dest_src1.GetReg());
 
   // Multiply low bits.
+  // x7 *= x3
   NewLIR2(kX86PmullwRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 
   // xmm1 now has low bits.
   AndMaskVectorRegister(rs_dest_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
 
   // Prepare high bits for multiplication.
-  NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), BYTE_SIZE);
-  AndMaskVectorRegister(rs_src2, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+  NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), 0x8);
+  AndMaskVectorRegister(rs_dest_high_tmp,  0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
 
   // Multiply high bits and xmm2 now has high bits.
-  NewLIR2(kX86PmullwRR, rs_src2.GetReg(), rs_src1_high_tmp.GetReg());
+  NewLIR2(kX86PmullwRR, rs_src1_high_tmp.GetReg(), rs_dest_high_tmp.GetReg());
 
   // Combine back into dest XMM register.
-  NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+  NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src1_high_tmp.GetReg());
+}
+
+void X86Mir2Lir::GenMultiplyVectorLong(RegStorage rs_dest_src1, RegStorage rs_src2) {
+  /*
+   * We need to emulate the packed long multiply.
+   * For kMirOpPackedMultiply xmm1, xmm0:
+   * - xmm1 is src/dest
+   * - xmm0 is src
+   * - Get xmm2 and xmm3 as temp
+   * - The idea is to multiply the lower 32 bits of each operand by the higher 32 bits of the other.
+   * - Then add the two results.
+   * - Move the sum to the upper 32 bits of the destination.
+   * - Then multiply the lower 32 bits of the operands and add the result to the destination.
+   *
+   * (op     dest   src )
+   * movdqa  %xmm2, %xmm1
+   * movdqa  %xmm3, %xmm0
+   * psrlq   %xmm3, $0x20
+   * pmuludq %xmm3, %xmm2
+   * psrlq   %xmm1, $0x20
+   * pmuludq %xmm1, %xmm0
+   * paddq   %xmm1, %xmm3
+   * psllq   %xmm1, $0x20
+   * pmuludq %xmm2, %xmm0
+   * paddq   %xmm1, %xmm2
+   *
+   * When both operands are the same, the lower-32 * higher-32 product only needs to be
+   * calculated once, so the xmm3 temp above is not needed. That sequence becomes:
+   *
+   * (op     dest   src )
+   * movdqa  %xmm2, %xmm1
+   * psrlq   %xmm1, $0x20
+   * pmuludq %xmm1, %xmm0
+   * paddq   %xmm1, %xmm1
+   * psllq   %xmm1, $0x20
+   * pmuludq %xmm2, %xmm0
+   * paddq   %xmm1, %xmm2
+   *
+   */
+
+  bool both_operands_same = (rs_dest_src1.GetReg() == rs_src2.GetReg());
+
+  RegStorage rs_tmp_vector_1;
+  RegStorage rs_tmp_vector_2;
+  rs_tmp_vector_1 = Get128BitRegister(AllocTempDouble());
+  NewLIR2(kX86MovdqaRR, rs_tmp_vector_1.GetReg(), rs_dest_src1.GetReg());
+
+  if (both_operands_same == false) {
+    rs_tmp_vector_2 = Get128BitRegister(AllocTempDouble());
+    NewLIR2(kX86MovdqaRR, rs_tmp_vector_2.GetReg(), rs_src2.GetReg());
+    NewLIR2(kX86PsrlqRI, rs_tmp_vector_2.GetReg(), 0x20);
+    NewLIR2(kX86PmuludqRR, rs_tmp_vector_2.GetReg(), rs_tmp_vector_1.GetReg());
+  }
+
+  NewLIR2(kX86PsrlqRI, rs_dest_src1.GetReg(), 0x20);
+  NewLIR2(kX86PmuludqRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+
+  if (both_operands_same == false) {
+    NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_2.GetReg());
+  } else {
+    NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg());
+  }
+
+  NewLIR2(kX86PsllqRI, rs_dest_src1.GetReg(), 0x20);
+  NewLIR2(kX86PmuludqRR, rs_tmp_vector_1.GetReg(), rs_src2.GetReg());
+  NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_1.GetReg());
 }
 
 void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
@@ -1700,7 +1771,10 @@
       break;
     case kSignedByte:
       // HW doesn't support 16x16 byte multiplication so emulate it.
-      GenMultiplyVectorSignedByte(bb, mir);
+      GenMultiplyVectorSignedByte(rs_dest_src1, rs_src2);
+      return;
+    case k64:
+      GenMultiplyVectorLong(rs_dest_src1, rs_src2);
       return;
     default:
       LOG(FATAL) << "Unsupported vector multiply " << opsize;
@@ -1713,12 +1787,16 @@
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
     case k32:
       opcode = kX86PadddRR;
       break;
+    case k64:
+      opcode = kX86PaddqRR;
+      break;
     case kSignedHalf:
     case kUnsignedHalf:
       opcode = kX86PaddwRR;
@@ -1744,12 +1822,16 @@
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   int opcode = 0;
   switch (opsize) {
     case k32:
       opcode = kX86PsubdRR;
       break;
+    case k64:
+      opcode = kX86PsubqRR;
+      break;
     case kSignedHalf:
     case kUnsignedHalf:
       opcode = kX86PsubwRR;
@@ -1772,58 +1854,54 @@
 }
 
 void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) {
+  // The destination does not need to be clobbered here because that has already
+  // been done by the general packed shift handler (the caller of this method).
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
-  RegStorage rs_tmp = Get128BitRegister(AllocTempWide());
 
   int opcode = 0;
-  int imm = mir->dalvikInsn.vB;
-
   switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
     case kMirOpPackedShiftLeft:
       opcode = kX86PsllwRI;
       break;
     case kMirOpPackedSignedShiftRight:
-      opcode = kX86PsrawRI;
-      break;
     case kMirOpPackedUnsignedShiftRight:
-      opcode = kX86PsrlwRI;
-      break;
+      // TODO Add support for emulated byte shifts.
     default:
       LOG(FATAL) << "Unsupported shift operation on byte vector " << opcode;
       break;
   }
 
-  /*
-   * xmm1 will have low bits
-   * xmm2 will have high bits
-   *
-   * xmm2 = xmm1
-   * xmm1 = xmm1 .<< N
-   * xmm2 = xmm2 && 0xFF00FF00FF00FF00FF00FF00FF00FF00
-   * xmm2 = xmm2 .<< N
-   * xmm1 = xmm1 | xmm2
-   */
-
-  // Copy xmm1.
-  NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg());
+  // Clear the xmm register and return if shifting by more than the byte length.
+  int imm = mir->dalvikInsn.vB;
+  if (imm >= 8) {
+    NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg());
+    return;
+  }
 
   // Shift lower values.
   NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
 
-  // Mask bottom bits.
-  AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+  /*
+   * The above shift operates on whole 16-bit words, so bits shifted out of
+   * the lower byte spill into the upper byte. To emulate a byte-level shift,
+   * we just throw away the lower N bits of each byte (in the lower byte they
+   * are already zero after the shift), and we are done.
+   */
+  uint8_t byte_mask = 0xFF << imm;
+  uint32_t int_mask = byte_mask;
+  int_mask = int_mask << 8 | byte_mask;
+  int_mask = int_mask << 8 | byte_mask;
+  int_mask = int_mask << 8 | byte_mask;
 
-  // Shift higher values.
-  NewLIR2(opcode, rs_tmp.GetReg(), imm);
-
-  // Combine back into dest XMM register.
-  NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg());
+  // AND the destination with the mask.
+  AndMaskVectorRegister(rs_dest_src1, int_mask, int_mask, int_mask, int_mask);
 }
 
 void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
@@ -1852,6 +1930,7 @@
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
@@ -1866,6 +1945,8 @@
     case kUnsignedByte:
       GenShiftByteVector(bb, mir);
       return;
+    case k64:
+      // TODO Implement emulated shift algorithm.
     default:
       LOG(FATAL) << "Unsupported vector signed shift right " << opsize;
       break;
@@ -1877,6 +1958,7 @@
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   int imm = mir->dalvikInsn.vB;
   int opcode = 0;
   switch (opsize) {
@@ -1905,6 +1987,7 @@
   // We only support 128 bit registers.
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
@@ -1913,6 +1996,7 @@
   // We only support 128 bit registers.
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
@@ -1921,6 +2005,7 @@
   // We only support 128 bit registers.
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+  Clobber(rs_dest_src1);
   RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
   NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
 }
@@ -1945,134 +2030,240 @@
 
 void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
-  RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
-  RegLocation rl_dest = mir_graph_->GetDest(mir);
-  RegStorage rs_tmp;
+  RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB);
+  bool is_wide = opsize == k64 || opsize == kDouble;
 
-  int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;
-  int vec_unit_size = 0;
-  int opcode = 0;
-  int extr_opcode = 0;
-  RegLocation rl_result;
-
-  switch (opsize) {
-    case k32:
-      extr_opcode = kX86PextrdRRI;
-      opcode = kX86PhadddRR;
-      vec_unit_size = 4;
-      break;
-    case kSignedByte:
-    case kUnsignedByte:
-      extr_opcode = kX86PextrbRRI;
-      opcode = kX86PhaddwRR;
-      vec_unit_size = 2;
-      break;
-    case kSignedHalf:
-    case kUnsignedHalf:
-      extr_opcode = kX86PextrwRRI;
-      opcode = kX86PhaddwRR;
-      vec_unit_size = 2;
-      break;
-    case kSingle:
-      rl_result = EvalLoc(rl_dest, kFPReg, true);
-      vec_unit_size = 4;
-      for (int i = 0; i < 3; i++) {
-        NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
-        NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39);
-      }
-      NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
-      StoreValue(rl_dest, rl_result);
-
-      // For single-precision floats, we are done here
-      return;
-    default:
-      LOG(FATAL) << "Unsupported vector add reduce " << opsize;
-      break;
-  }
-
-  int elems = vec_bytes / vec_unit_size;
-
-  // Emulate horizontal add instruction by reducing 2 vectors with 8 values before adding them again
-  // TODO is overflow handled correctly?
-  if (opsize == kSignedByte || opsize == kUnsignedByte) {
-    rs_tmp = Get128BitRegister(AllocTempWide());
-
-    // tmp = xmm1 .>> 8.
-    NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg());
-    NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8);
-
-    // Zero extend low bits in xmm1.
-    AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
-  }
-
-  while (elems > 1) {
-    if (opsize == kSignedByte || opsize == kUnsignedByte) {
-      NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg());
-    }
-    NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg());
-    elems >>= 1;
-  }
-
-  // Combine the results if we separated them.
-  if (opsize == kSignedByte || opsize == kUnsignedByte) {
-    NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg());
-  }
-
-  // We need to extract to a GPR.
-  RegStorage temp = AllocTemp();
-  NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0);
-
-  // Can we do this directly into memory?
-  rl_result = UpdateLocTyped(rl_dest, kCoreReg);
-  if (rl_result.location == kLocPhysReg) {
-    // Ensure res is in a core reg
-    rl_result = EvalLoc(rl_dest, kCoreReg, true);
-    OpRegReg(kOpAdd, rl_result.reg, temp);
-    StoreFinalValue(rl_dest, rl_result);
+  // Get the location of the virtual register. Since this bytecode is overloaded
+  // for different types (and sizes), we need different logic for each path.
+  // By design, this bytecode uses the same VR for source and destination.
+  RegLocation rl_src, rl_dest, rl_result;
+  if (is_wide) {
+    rl_src = mir_graph_->GetSrcWide(mir, 0);
+    rl_dest = mir_graph_->GetDestWide(mir);
   } else {
-    OpMemReg(kOpAdd, rl_result, temp.GetReg());
+    rl_src = mir_graph_->GetSrc(mir, 0);
+    rl_dest = mir_graph_->GetDest(mir);
   }
 
-  FreeTemp(temp);
+  // We need a GP temp for the integral (byte, short, int) cases.
+  RegStorage temp;
+
+  // There is a different path depending on type and size.
+  if (opsize == kSingle) {
+    // Handle float case.
+    // TODO Add support for fast math (not value safe) and do horizontal add in that case.
+
+    rl_src = LoadValue(rl_src, kFPReg);
+    rl_result = EvalLoc(rl_dest, kFPReg, true);
+
+    // Since we are doing an add-reduce, we move the reg holding the VR
+    // into the result so we include it in result.
+    OpRegCopy(rl_result.reg, rl_src.reg);
+    NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+
+    // Since FP must keep order of operation for value safety, we shift to low
+    // 32-bits and add to result.
+    for (int i = 0; i < 3; i++) {
+      NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39);
+      NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+    }
+
+    StoreValue(rl_dest, rl_result);
+  } else if (opsize == kDouble) {
+    // Handle double case.
+    rl_src = LoadValueWide(rl_src, kFPReg);
+    rl_result = EvalLocWide(rl_dest, kFPReg, true);
+    LOG(FATAL) << "Unsupported vector add reduce for double.";
+  } else if (opsize == k64) {
+    /*
+     * Handle long case:
+     * 1) Reduce the vector register to lower half (with addition).
+     * 1-1) Get an xmm temp and fill it with vector register.
+     * 1-2) Shift the xmm temp by 8-bytes.
+     * 1-3) Add the xmm temp to vector register that is being reduced.
+     * 2) Allocate temp GP / GP pair.
+     * 2-1) In 64-bit case, use movq to move result to a 64-bit GP.
+     * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair.
+     * 3) Finish the add reduction by doing what add-long/2addr does,
+     * but instead of having a VR as one of the sources, we have our temp GP.
+     */
+    RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble());
+    NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg());
+    NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8);
+    NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg());
+    FreeTemp(rs_tmp_vector);
+
+    // We would like to be able to reuse the add-long implementation, so set up a fake
+    // register location to pass it.
+    RegLocation temp_loc = mir_graph_->GetBadLoc();
+    temp_loc.core = 1;
+    temp_loc.wide = 1;
+    temp_loc.location = kLocPhysReg;
+    temp_loc.reg = AllocTempWide();
+
+    if (cu_->target64) {
+      DCHECK(!temp_loc.reg.IsPair());
+      NewLIR2(kX86MovqrxRR, temp_loc.reg.GetReg(), vector_src.GetReg());
+    } else {
+      NewLIR2(kX86MovdrxRR, temp_loc.reg.GetLowReg(), vector_src.GetReg());
+      NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20);
+      NewLIR2(kX86MovdrxRR, temp_loc.reg.GetHighReg(), vector_src.GetReg());
+    }
+
+    GenArithOpLong(Instruction::ADD_LONG_2ADDR, rl_dest, temp_loc, temp_loc);
+  } else if (opsize == kSignedByte || opsize == kUnsignedByte) {
+    RegStorage rs_tmp = Get128BitRegister(AllocTempDouble());
+    NewLIR2(kX86PxorRR, rs_tmp.GetReg(), rs_tmp.GetReg());
+    NewLIR2(kX86PsadbwRR, vector_src.GetReg(), rs_tmp.GetReg());
+    NewLIR3(kX86PshufdRRI, rs_tmp.GetReg(), vector_src.GetReg(), 0x4e);
+    NewLIR2(kX86PaddbRR, vector_src.GetReg(), rs_tmp.GetReg());
+    // Move to a GPR
+    temp = AllocTemp();
+    NewLIR2(kX86MovdrxRR, temp.GetReg(), vector_src.GetReg());
+  } else {
+    // Handle the int and short cases together.
+
+    // Initialize as if we were handling the int case. Below we update
+    // the opcodes if we are handling the short case instead.
+    int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;
+    int vec_unit_size;
+    int horizontal_add_opcode;
+    int extract_opcode;
+
+    if (opsize == kSignedHalf || opsize == kUnsignedHalf) {
+      extract_opcode = kX86PextrwRRI;
+      horizontal_add_opcode = kX86PhaddwRR;
+      vec_unit_size = 2;
+    } else if (opsize == k32) {
+      vec_unit_size = 4;
+      horizontal_add_opcode = kX86PhadddRR;
+      extract_opcode = kX86PextrdRRI;
+    } else {
+      LOG(FATAL) << "Unsupported vector add reduce " << opsize;
+      return;
+    }
+
+    int elems = vec_bytes / vec_unit_size;
+
+    while (elems > 1) {
+      NewLIR2(horizontal_add_opcode, vector_src.GetReg(), vector_src.GetReg());
+      elems >>= 1;
+    }
+
+    // Handle this as an arithmetic unary case.
+    ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
+
+    // Extract to a GP register because this is integral typed.
+    temp = AllocTemp();
+    NewLIR3(extract_opcode, temp.GetReg(), vector_src.GetReg(), 0);
+  }
+
+  if (opsize != k64 && opsize != kSingle && opsize != kDouble) {
+    // The logic below looks very similar to the handling of ADD_INT_2ADDR
+    // except the rhs is not a VR but a physical register allocated above.
+    // The source VR is not loaded because rl_result is assumed to share
+    // its physical register / memory location.
+    rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+    if (rl_result.location == kLocPhysReg) {
+      // Ensure res is in a core reg.
+      rl_result = EvalLoc(rl_dest, kCoreReg, true);
+      OpRegReg(kOpAdd, rl_result.reg, temp);
+      StoreFinalValue(rl_dest, rl_result);
+    } else {
+      // Do the addition directly to memory.
+      OpMemReg(kOpAdd, rl_result, temp.GetReg());
+    }
+  }
 }
 
 void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
   RegLocation rl_dest = mir_graph_->GetDest(mir);
-  RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+  RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB);
   int extract_index = mir->dalvikInsn.arg[0];
   int extr_opcode = 0;
   RegLocation rl_result;
   bool is_wide = false;
 
-  switch (opsize) {
-    case k32:
-      rl_result = UpdateLocTyped(rl_dest, kCoreReg);
-      extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI;
-      break;
-    case kSignedHalf:
-    case kUnsignedHalf:
-      rl_result= UpdateLocTyped(rl_dest, kCoreReg);
-      extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI;
-      break;
-    default:
-      LOG(FATAL) << "Unsupported vector add reduce " << opsize;
-      return;
-      break;
-  }
+  // There is a different path depending on type and size.
+  if (opsize == kSingle) {
+    // Handle float case.
+    // TODO Add support for fast math (not value safe) and do horizontal add in that case.
 
-  if (rl_result.location == kLocPhysReg) {
-    NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index);
-    if (is_wide == true) {
-      StoreFinalValue(rl_dest, rl_result);
-    } else {
-      StoreFinalValueWide(rl_dest, rl_result);
+    rl_result = EvalLoc(rl_dest, kFPReg, true);
+    NewLIR2(kX86PxorRR, rl_result.reg.GetReg(), rl_result.reg.GetReg());
+    NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+
+    // Since FP must keep order of operation for value safety, we shift to low
+    // 32-bits and add to result.
+    for (int i = 0; i < 3; i++) {
+      NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39);
+      NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
     }
+
+    StoreValue(rl_dest, rl_result);
+  } else if (opsize == kDouble) {
+    // TODO Handle double case.
+    LOG(FATAL) << "Unsupported add reduce for double.";
+  } else if (opsize == k64) {
+    /*
+     * Handle long case:
+     * 1) Reduce the vector register to lower half (with addition).
+     * 1-1) Get an xmm temp and fill it with vector register.
+     * 1-2) Shift the xmm temp by 8-bytes.
+     * 1-3) Add the xmm temp to vector register that is being reduced.
+     * 2) Evaluate destination to a GP / GP pair.
+     * 2-1) In 64-bit case, use movq to move result to a 64-bit GP.
+     * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair.
+     * 3) Store the result to the final destination.
+     */
+    RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble());
+    NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg());
+    NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8);
+    NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg());
+    FreeTemp(rs_tmp_vector);
+
+    rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+    if (cu_->target64) {
+      DCHECK(!rl_result.reg.IsPair());
+      NewLIR2(kX86MovqrxRR, rl_result.reg.GetReg(), vector_src.GetReg());
+    } else {
+      NewLIR2(kX86MovdrxRR, rl_result.reg.GetLowReg(), vector_src.GetReg());
+      NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20);
+      NewLIR2(kX86MovdrxRR, rl_result.reg.GetHighReg(), vector_src.GetReg());
+    }
+
+    StoreValueWide(rl_dest, rl_result);
   } else {
-    int displacement = SRegOffset(rl_result.s_reg_low);
-    LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg());
-    AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
-    AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+    // Handle the rest of integral types now.
+    switch (opsize) {
+      case k32:
+        rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+        extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI;
+        break;
+      case kSignedHalf:
+      case kUnsignedHalf:
+        rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+        extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI;
+        break;
+      default:
+        LOG(FATAL) << "Unsupported vector reduce " << opsize;
+        return;
+    }
+
+    if (rl_result.location == kLocPhysReg) {
+      NewLIR3(extr_opcode, rl_result.reg.GetReg(), vector_src.GetReg(), extract_index);
+      if (is_wide == true) {
+        StoreFinalValue(rl_dest, rl_result);
+      } else {
+        StoreFinalValueWide(rl_dest, rl_result);
+      }
+    } else {
+      int displacement = SRegOffset(rl_result.s_reg_low);
+      LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, vector_src.GetReg());
+      AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
+      AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+    }
   }
 }
 
@@ -2080,96 +2271,113 @@
   DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
   OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
   RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
-  int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR;
+  Clobber(rs_dest);
+  int op_shuffle = 0, op_shuffle_high = 0, op_mov = kX86MovdxrRR;
   RegisterClass reg_type = kCoreReg;
+  bool is_wide = false;
 
   switch (opsize) {
     case k32:
-      op_low = kX86PshufdRRI;
+      op_shuffle = kX86PshufdRRI;
       break;
     case kSingle:
-      op_low = kX86PshufdRRI;
-      op_mov = kX86Mova128RR;
+      op_shuffle = kX86PshufdRRI;
+      op_mov = kX86MovdqaRR;
       reg_type = kFPReg;
       break;
     case k64:
-      op_low = kX86PshufdRRI;
-      imm = 0x44;
-      break;
-    case kDouble:
-      op_low = kX86PshufdRRI;
-      op_mov = kX86Mova128RR;
-      reg_type = kFPReg;
-      imm = 0x44;
+      op_shuffle = kX86PunpcklqdqRR;
+      op_mov = kX86MovqrxRR;
+      is_wide = true;
       break;
     case kSignedByte:
     case kUnsignedByte:
-      // Shuffle 8 bit value into 16 bit word.
-      // We set val = val + (val << 8) below and use 16 bit shuffle.
+      // The source byte will have been replicated into a
+      // double-word (see below) before we use this shuffle.
+      op_shuffle = kX86PshufdRRI;
+      break;
     case kSignedHalf:
     case kUnsignedHalf:
       // Handles low quadword.
-      op_low = kX86PshuflwRRI;
+      op_shuffle = kX86PshuflwRRI;
       // Handles upper quadword.
-      op_high = kX86PshufdRRI;
+      op_shuffle_high = kX86PshufdRRI;
       break;
     default:
       LOG(FATAL) << "Unsupported vector set " << opsize;
       break;
   }
 
-  RegLocation rl_src = mir_graph_->GetSrc(mir, 0);
-
-  // Load the value from the VR into the reg.
-  if (rl_src.wide == 0) {
+  // Load the value from the VR into a physical register.
+  RegLocation rl_src;
+  if (!is_wide) {
+    rl_src = mir_graph_->GetSrc(mir, 0);
     rl_src = LoadValue(rl_src, reg_type);
   } else {
+    rl_src = mir_graph_->GetSrcWide(mir, 0);
     rl_src = LoadValueWide(rl_src, reg_type);
   }
-
-  // If opsize is 8 bits wide then double value and use 16 bit shuffle instead.
-  if (opsize == kSignedByte || opsize == kUnsignedByte) {
-    RegStorage temp = AllocTemp();
-    // val = val + (val << 8).
-    NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg());
-    NewLIR2(kX86Sal32RI, temp.GetReg(), 8);
-    NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg());
-    FreeTemp(temp);
-  }
+  RegStorage reg_to_shuffle = rl_src.reg;
 
   // Load the value into the XMM register.
-  NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg());
+  if (!cu_->target64 && opsize == k64) {
+    // Logic assumes that longs are loaded in GP register pairs.
+    NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), reg_to_shuffle.GetLowReg());
+    RegStorage r_tmp = AllocTempDouble();
+    NewLIR2(kX86MovdxrRR, r_tmp.GetReg(), reg_to_shuffle.GetHighReg());
+    NewLIR2(kX86PunpckldqRR, rs_dest.GetReg(), r_tmp.GetReg());
+    FreeTemp(r_tmp);
+  } else {
+    NewLIR2(op_mov, rs_dest.GetReg(), reg_to_shuffle.GetReg());
+  }
+
+  if (opsize == kSignedByte || opsize == kUnsignedByte) {
+    // In the byte case, first duplicate it to be a word,
+    // then duplicate it to be a double-word.
+    NewLIR2(kX86PunpcklbwRR, rs_dest.GetReg(), rs_dest.GetReg());
+    NewLIR2(kX86PunpcklwdRR, rs_dest.GetReg(), rs_dest.GetReg());
+  }
 
   // Now shuffle the value across the destination.
-  NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm);
+  if (op_shuffle == kX86PunpcklqdqRR) {
+    NewLIR2(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg());
+  } else {
+    NewLIR3(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+  }
 
   // And then repeat as needed.
-  if (op_high != 0) {
-    NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm);
+  if (op_shuffle_high != 0) {
+    NewLIR3(op_shuffle_high, rs_dest.GetReg(), rs_dest.GetReg(), 0);
   }
 }
 
-LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
-  int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
+void X86Mir2Lir::GenPackedArrayGet(BasicBlock *bb, MIR *mir) {
+  UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayGet not supported.";
+}
+
+void X86Mir2Lir::GenPackedArrayPut(BasicBlock *bb, MIR *mir) {
+  UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayPut not supported.";
+}
+
+LIR* X86Mir2Lir::ScanVectorLiteral(int32_t* constants) {
   for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
-    if (args[0] == p->operands[0] && args[1] == p->operands[1] &&
-        args[2] == p->operands[2] && args[3] == p->operands[3]) {
+    if (constants[0] == p->operands[0] && constants[1] == p->operands[1] &&
+        constants[2] == p->operands[2] && constants[3] == p->operands[3]) {
       return p;
     }
   }
   return nullptr;
 }
 
-LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) {
+LIR* X86Mir2Lir::AddVectorLiteral(int32_t* constants) {
   LIR* new_value = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocData));
-  int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
-  new_value->operands[0] = args[0];
-  new_value->operands[1] = args[1];
-  new_value->operands[2] = args[2];
-  new_value->operands[3] = args[3];
+  new_value->operands[0] = constants[0];
+  new_value->operands[1] = constants[1];
+  new_value->operands[2] = constants[2];
+  new_value->operands[3] = constants[3];
   new_value->next = const_vectors_;
   if (const_vectors_ == nullptr) {
-    estimated_native_code_size_ += 12;  // Amount needed to align to 16 byte boundary.
+    estimated_native_code_size_ += 12;  // Maximum needed to align to 16 byte boundary.
   }
   estimated_native_code_size_ += 16;  // Space for one vector.
   const_vectors_ = new_value;
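
The refactored literal helpers implement a simple scan-or-append pool keyed on four 32-bit constants; a simplified standalone sketch of that pattern (not ART code, names are illustrative) is:

    // Scan a singly linked pool for a node whose four constants match,
    // and prepend a new node when none does, as Scan/AddVectorLiteral do.
    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    struct VectorLiteral {
      int32_t operands[4];
      VectorLiteral* next;
    };

    static VectorLiteral* Scan(VectorLiteral* pool, const int32_t* constants) {
      for (VectorLiteral* p = pool; p != nullptr; p = p->next) {
        if (std::memcmp(p->operands, constants, sizeof(p->operands)) == 0) {
          return p;
        }
      }
      return nullptr;
    }

    static VectorLiteral* Add(VectorLiteral** pool, const int32_t* constants) {
      VectorLiteral* node = new VectorLiteral();
      std::memcpy(node->operands, constants, sizeof(node->operands));
      node->next = *pool;  // Prepend, like const_vectors_.
      *pool = node;
      return node;
    }

    int main() {
      VectorLiteral* pool = nullptr;
      const int32_t c[4] = {1, 2, 3, 4};
      if (Scan(pool, c) == nullptr) Add(&pool, c);
      std::printf("reused: %s\n", Scan(pool, c) != nullptr ? "yes" : "no");
      delete pool;
      return 0;
    }
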
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index 4f65a0f..30384ec 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -990,6 +990,17 @@
     case kMirOpConstVector:
       store_method_addr_ = true;
       break;
+    case kMirOpPackedMultiply:
+    case kMirOpPackedShiftLeft:
+    case kMirOpPackedSignedShiftRight:
+    case kMirOpPackedUnsignedShiftRight: {
+      // Byte emulation requires constants from the literal pool.
+      OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
+      if (opsize == kSignedByte || opsize == kUnsignedByte) {
+        store_method_addr_ = true;
+      }
+      break;
+    }
     default:
       // Ignore the rest.
       break;
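
Background on the byte case: SSE2 has no packed byte shift instruction, so byte shifts (and byte multiplies) must be emulated, and the helper constants for that emulation come from the literal pool, which is why these opcodes need the method address available just like kMirOpConstVector. One common left-shift emulation, sketched with intrinsics (the backend's exact sequence is not shown in this hunk), is a 16-bit shift followed by a per-byte mask:

    // Emulated packed byte left shift: shift the 16-bit lanes, then AND with a
    // mask that clears the bits leaked in from the neighbouring byte. In the
    // JIT, a mask constant like this is what lands in the literal pool.
    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    template <int kShift>
    static __m128i ShiftLeftBytes(__m128i v) {
      const char mask_byte = static_cast<char>(static_cast<uint8_t>(0xFFu << kShift));
      return _mm_and_si128(_mm_slli_epi16(v, kShift), _mm_set1_epi8(mask_byte));
    }

    int main() {
      alignas(16) uint8_t out[16];
      _mm_store_si128(reinterpret_cast<__m128i*>(out),
                      ShiftLeftBytes<3>(_mm_set1_epi8(-1)));  // 0xFF << 3 per byte
      std::printf("%02x\n", out[0]);  // f8 in every byte
      return 0;
    }
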
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index e3ef8c1..22a2f30 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -555,20 +555,27 @@
   Binary0fOpCode(kX86Subss),    // float subtract
   Binary0fOpCode(kX86Divsd),    // double divide
   Binary0fOpCode(kX86Divss),    // float divide
-  Binary0fOpCode(kX86Punpckldq),  // Interleave low-order double words
+  Binary0fOpCode(kX86Punpcklbw),  // Interleave low-order bytes
+  Binary0fOpCode(kX86Punpcklwd),  // Interleave low-order words (16-bit)
+  Binary0fOpCode(kX86Punpckldq),  // Interleave low-order double words (32-bit)
+  Binary0fOpCode(kX86Punpcklqdq),  // Interleave low-order quad words (64-bit)
   Binary0fOpCode(kX86Sqrtsd),   // square root
   Binary0fOpCode(kX86Pmulld),   // parallel integer multiply 32 bits x 4
   Binary0fOpCode(kX86Pmullw),   // parallel integer multiply 16 bits x 8
+  Binary0fOpCode(kX86Pmuludq),   // parallel unsigned multiply 32 bits x 2, storing 64-bit results
   Binary0fOpCode(kX86Mulps),    // parallel FP multiply 32 bits x 4
   Binary0fOpCode(kX86Mulpd),    // parallel FP multiply 64 bits x 2
   Binary0fOpCode(kX86Paddb),    // parallel integer addition 8 bits x 16
   Binary0fOpCode(kX86Paddw),    // parallel integer addition 16 bits x 8
   Binary0fOpCode(kX86Paddd),    // parallel integer addition 32 bits x 4
+  Binary0fOpCode(kX86Paddq),    // parallel integer addition 64 bits x 2
+  Binary0fOpCode(kX86Psadbw),   // computes sum of absolute differences for unsigned byte integers
   Binary0fOpCode(kX86Addps),    // parallel FP addition 32 bits x 4
   Binary0fOpCode(kX86Addpd),    // parallel FP addition 64 bits x 2
   Binary0fOpCode(kX86Psubb),    // parallel integer subtraction 8 bits x 16
   Binary0fOpCode(kX86Psubw),    // parallel integer subtraction 16 bits x 8
   Binary0fOpCode(kX86Psubd),    // parallel integer subtraction 32 bits x 4
+  Binary0fOpCode(kX86Psubq),    // parallel integer subtraction 64 bits x 2
   Binary0fOpCode(kX86Subps),    // parallel FP subtraction 32 bits x 4
   Binary0fOpCode(kX86Subpd),    // parallel FP subtraction 64 bits x 2
   Binary0fOpCode(kX86Pand),     // parallel AND 128 bits x 1
@@ -593,6 +600,7 @@
   kX86PsrlwRI,                  // logical right shift of floating point registers 16 bits x 8
   kX86PsrldRI,                  // logical right shift of floating point registers 32 bits x 4
   kX86PsrlqRI,                  // logical right shift of floating point registers 64 bits x 2
+  kX86PsrldqRI,                 // logical right shift of 128-bit vector register, shift amount in bytes
   kX86PsllwRI,                  // left shift of floating point registers 16 bits x 8
   kX86PslldRI,                  // left shift of floating point registers 32 bits x 4
   kX86PsllqRI,                  // left shift of floating point registers 64 bits x 2
@@ -607,8 +615,8 @@
   kX86Fprem,                    // remainder from dividing of two floating point values
   kX86Fucompp,                  // compare floating point values and pop x87 fp stack twice
   kX86Fstsw16R,                 // store FPU status word
-  Binary0fOpCode(kX86Mova128),  // move 128 bits aligned
-  kX86Mova128MR, kX86Mova128AR,  // store 128 bit aligned from xmm1 to m128
+  Binary0fOpCode(kX86Movdqa),   // move 128 bits aligned
+  kX86MovdqaMR, kX86MovdqaAR,   // store 128 bit aligned from xmm1 to m128
   Binary0fOpCode(kX86Movups),   // load unaligned packed single FP values from xmm2/m128 to xmm1
   kX86MovupsMR, kX86MovupsAR,   // store unaligned packed single FP values from xmm1 to m128
   Binary0fOpCode(kX86Movaps),   // load aligned packed single FP values from xmm2/m128 to xmm1
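
Of the opcodes added above, pmuludq and psadbw have the least obvious semantics; the following intrinsics sketch (illustrative only, not part of the patch) shows what each computes:

    // pmuludq multiplies the low unsigned 32-bit element of each 64-bit lane
    // into a full 64-bit product; psadbw sums absolute byte differences into
    // one total per 64-bit lane.
    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      __m128i a = _mm_set_epi32(0, 0x10000, 0, 3);   // low dwords of the lanes: 3, 0x10000
      __m128i b = _mm_set_epi32(0, 5, 0, 7);         // low dwords of the lanes: 7, 5
      alignas(16) uint64_t prod[2];
      _mm_store_si128(reinterpret_cast<__m128i*>(prod), _mm_mul_epu32(a, b));  // pmuludq
      std::printf("%llu %llu\n",                     // 21 327680
                  static_cast<unsigned long long>(prod[0]),
                  static_cast<unsigned long long>(prod[1]));

      alignas(16) uint64_t sad[2];
      _mm_store_si128(reinterpret_cast<__m128i*>(sad),
                      _mm_sad_epu8(_mm_set1_epi8(9), _mm_set1_epi8(2)));        // psadbw
      std::printf("%llu %llu\n",                     // 56 56 (eight bytes * |9 - 2|)
                  static_cast<unsigned long long>(sad[0]),
                  static_cast<unsigned long long>(sad[1]));
      return 0;
    }
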
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 7551add..1848abe 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -558,14 +558,19 @@
         has_modrm = true;
         src_reg_file = dst_reg_file = SSE;
         break;
-      case 0x62:
+      case 0x60: case 0x61: case 0x62: case 0x6C:
         if (prefix[2] == 0x66) {
           src_reg_file = dst_reg_file = SSE;
           prefix[2] = 0;  // Clear prefix now. It has served its purpose as part of the opcode.
         } else {
           src_reg_file = dst_reg_file = MMX;
         }
-        opcode << "punpckldq";
+        switch (*instr) {
+          case 0x60: opcode << "punpcklbw"; break;
+          case 0x61: opcode << "punpcklwd"; break;
+          case 0x62: opcode << "punpckldq"; break;
+          case 0x6c: opcode << "punpcklqdq"; break;
+        }
         load = true;
         has_modrm = true;
         break;
@@ -650,7 +655,7 @@
         } else {
           dst_reg_file = MMX;
         }
-        static const char* x73_opcodes[] = {"unknown-73", "unknown-73", "psrlq", "unknown-73", "unknown-73", "unknown-73", "psllq", "unknown-73"};
+        static const char* x73_opcodes[] = {"unknown-73", "unknown-73", "psrlq", "psrldq", "unknown-73", "unknown-73", "psllq", "unknown-73"};
         modrm_opcodes = x73_opcodes;
         reg_is_opcode = true;
         has_modrm = true;
@@ -800,6 +805,18 @@
         opcode << "bswap";
         reg_in_opcode = true;
         break;
+      case 0xD4:
+        if (prefix[2] == 0x66) {
+          src_reg_file = dst_reg_file = SSE;
+          prefix[2] = 0;
+        } else {
+          src_reg_file = dst_reg_file = MMX;
+        }
+        opcode << "paddq";
+        prefix[2] = 0;
+        has_modrm = true;
+        load = true;
+        break;
       case 0xDB:
         if (prefix[2] == 0x66) {
           src_reg_file = dst_reg_file = SSE;
@@ -847,66 +864,14 @@
         has_modrm = true;
         load = true;
         break;
+      case 0xF4:
+      case 0xF6:
       case 0xF8:
-        if (prefix[2] == 0x66) {
-          src_reg_file = dst_reg_file = SSE;
-          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-        } else {
-          src_reg_file = dst_reg_file = MMX;
-        }
-        opcode << "psubb";
-        prefix[2] = 0;
-        has_modrm = true;
-        load = true;
-        break;
       case 0xF9:
-        if (prefix[2] == 0x66) {
-          src_reg_file = dst_reg_file = SSE;
-          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-        } else {
-          src_reg_file = dst_reg_file = MMX;
-        }
-        opcode << "psubw";
-        prefix[2] = 0;
-        has_modrm = true;
-        load = true;
-        break;
       case 0xFA:
-        if (prefix[2] == 0x66) {
-          src_reg_file = dst_reg_file = SSE;
-          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-        } else {
-          src_reg_file = dst_reg_file = MMX;
-        }
-        opcode << "psubd";
-        prefix[2] = 0;
-        has_modrm = true;
-        load = true;
-        break;
+      case 0xFB:
       case 0xFC:
-        if (prefix[2] == 0x66) {
-          src_reg_file = dst_reg_file = SSE;
-          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-        } else {
-          src_reg_file = dst_reg_file = MMX;
-        }
-        opcode << "paddb";
-        prefix[2] = 0;
-        has_modrm = true;
-        load = true;
-        break;
       case 0xFD:
-        if (prefix[2] == 0x66) {
-          src_reg_file = dst_reg_file = SSE;
-          prefix[2] = 0;  // clear prefix now it's served its purpose as part of the opcode
-        } else {
-          src_reg_file = dst_reg_file = MMX;
-        }
-        opcode << "paddw";
-        prefix[2] = 0;
-        has_modrm = true;
-        load = true;
-        break;
       case 0xFE:
         if (prefix[2] == 0x66) {
           src_reg_file = dst_reg_file = SSE;
@@ -914,7 +879,17 @@
         } else {
           src_reg_file = dst_reg_file = MMX;
         }
-        opcode << "paddd";
+        switch (*instr) {
+          case 0xF4: opcode << "pmuludq"; break;
+          case 0xF6: opcode << "psadbw"; break;
+          case 0xF8: opcode << "psubb"; break;
+          case 0xF9: opcode << "psubw"; break;
+          case 0xFA: opcode << "psubd"; break;
+          case 0xFB: opcode << "psubq"; break;
+          case 0xFC: opcode << "paddb"; break;
+          case 0xFD: opcode << "paddw"; break;
+          case 0xFE: opcode << "paddd"; break;
+        }
         prefix[2] = 0;
         has_modrm = true;
         load = true;
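
The consolidated 0xF4/0xF6/0xF8-0xFE handling reduces nine near-identical cases to one prefix check plus a mnemonic switch; a standalone sketch of that decode pattern (not the disassembler's real interfaces) is:

    // Grouped decode: the 0x66 prefix selects the SSE register file (MMX
    // otherwise) and is then cleared, and the opcode byte picks the mnemonic.
    #include <cstdint>
    #include <cstdio>

    enum RegFile { MMX, SSE };

    static const char* DecodePackedOp(uint8_t opcode_byte, uint8_t* prefix66, RegFile* reg_file) {
      *reg_file = (*prefix66 == 0x66) ? SSE : MMX;
      *prefix66 = 0;  // The prefix has served its purpose as part of the opcode.
      switch (opcode_byte) {
        case 0xF4: return "pmuludq";
        case 0xF6: return "psadbw";
        case 0xF8: return "psubb";
        case 0xF9: return "psubw";
        case 0xFA: return "psubd";
        case 0xFB: return "psubq";
        case 0xFC: return "paddb";
        case 0xFD: return "paddw";
        case 0xFE: return "paddd";
        default:   return "unknown";
      }
    }

    int main() {
      uint8_t prefix66 = 0x66;
      RegFile rf;
      const char* name = DecodePackedOp(0xFB, &prefix66, &rf);
      std::printf("%s (%s)\n", name, rf == SSE ? "xmm" : "mm");  // psubq (xmm)
      return 0;
    }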