Merge "Revert^2 "Verify profile wrt dex file in dex2oat"""
diff --git a/compiler/Android.bp b/compiler/Android.bp
index 1475679..c50c197 100644
--- a/compiler/Android.bp
+++ b/compiler/Android.bp
@@ -204,10 +204,10 @@
     cmd: "$(location generate-operator-out.py) art/compiler $(in) > $(out)",
     tool_files: ["generate-operator-out.py"],
     srcs: [
-        "compiled_method.h",
         "dex/dex_to_dex_compiler.h",
         "driver/compiler_driver.h",
         "driver/compiler_options.h",
+        "linker/linker_patch.h",
         "optimizing/locations.h",
 
         "utils/arm/constants_arm.h",
@@ -310,13 +310,14 @@
         "art_gtest_defaults",
     ],
     srcs: [
-        "compiled_method_test.cc",
         "debug/dwarf/dwarf_test.cc",
+        "debug/src_map_elem_test.cc",
         "dex/dex_to_dex_decompiler_test.cc",
         "driver/compiled_method_storage_test.cc",
         "driver/compiler_driver_test.cc",
         "exception_test.cc",
         "jni/jni_compiler_test.cc",
+        "linker/linker_patch_test.cc",
         "linker/method_bss_mapping_encoder_test.cc",
         "linker/output_stream_test.cc",
         "optimizing/bounds_check_elimination_test.cc",
diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc
index 0d38620..500fc4a 100644
--- a/compiler/common_compiler_test.cc
+++ b/compiler/common_compiler_test.cc
@@ -22,7 +22,7 @@
 #include "base/callee_save_type.h"
 #include "base/enums.h"
 #include "class_linker.h"
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "dex/quick_compiler_callbacks.h"
 #include "dex/verification_results.h"
 #include "driver/compiler_driver.h"
diff --git a/compiler/compiled_method-inl.h b/compiler/compiled_method-inl.h
new file mode 100644
index 0000000..c432747
--- /dev/null
+++ b/compiler/compiled_method-inl.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_COMPILED_METHOD_INL_H_
+#define ART_COMPILER_COMPILED_METHOD_INL_H_
+
+#include "compiled_method.h"
+
+#include "base/array_ref.h"
+#include "base/length_prefixed_array.h"
+#include "linker/linker_patch.h"
+
+namespace art {
+
+inline ArrayRef<const uint8_t> CompiledCode::GetQuickCode() const {
+  return GetArray(quick_code_);  // Null quick_code_ yields a default-constructed ArrayRef.
+}
+
+template <typename T>
+inline ArrayRef<const T> CompiledCode::GetArray(const LengthPrefixedArray<T>* array) {
+  if (array != nullptr) {
+    DCHECK_NE(array->size(), 0u);  // A non-null array is expected to hold data.
+    return ArrayRef<const T>(&array->At(0), array->size());
+  }
+  return ArrayRef<const T>();  // Null input maps to a default-constructed ArrayRef.
+}
+
+inline ArrayRef<const uint8_t> CompiledMethod::GetMethodInfo() const {
+  return GetArray(method_info_);  // Null method_info_ yields a default-constructed ArrayRef.
+}
+
+inline ArrayRef<const uint8_t> CompiledMethod::GetVmapTable() const {
+  return GetArray(vmap_table_);  // Null vmap_table_ yields a default-constructed ArrayRef.
+}
+
+inline ArrayRef<const uint8_t> CompiledMethod::GetCFIInfo() const {
+  return GetArray(cfi_info_);  // Null cfi_info_ yields a default-constructed ArrayRef.
+}
+
+inline ArrayRef<const linker::LinkerPatch> CompiledMethod::GetPatches() const {
+  return GetArray(patches_);  // Null patches_ yields a default-constructed ArrayRef.
+}
+
+}  // namespace art
+
+#endif  // ART_COMPILER_COMPILED_METHOD_INL_H_
diff --git a/compiler/compiled_method.cc b/compiler/compiled_method.cc
index 0d9021f..111469f 100644
--- a/compiler/compiled_method.cc
+++ b/compiler/compiled_method.cc
@@ -22,7 +22,8 @@
 
 namespace art {
 
-CompiledCode::CompiledCode(CompilerDriver* compiler_driver, InstructionSet instruction_set,
+CompiledCode::CompiledCode(CompilerDriver* compiler_driver,
+                           InstructionSet instruction_set,
                            const ArrayRef<const uint8_t>& quick_code)
     : compiler_driver_(compiler_driver),
       instruction_set_(instruction_set),
@@ -77,8 +78,7 @@
   }
 }
 
-const void* CompiledCode::CodePointer(const void* code_pointer,
-                                      InstructionSet instruction_set) {
+const void* CompiledCode::CodePointer(const void* code_pointer, InstructionSet instruction_set) {
   switch (instruction_set) {
     case kArm:
     case kArm64:
@@ -108,7 +108,7 @@
                                const ArrayRef<const uint8_t>& method_info,
                                const ArrayRef<const uint8_t>& vmap_table,
                                const ArrayRef<const uint8_t>& cfi_info,
-                               const ArrayRef<const LinkerPatch>& patches)
+                               const ArrayRef<const linker::LinkerPatch>& patches)
     : CompiledCode(driver, instruction_set, quick_code),
       frame_size_in_bytes_(frame_size_in_bytes),
       core_spill_mask_(core_spill_mask),
@@ -129,7 +129,7 @@
     const ArrayRef<const uint8_t>& method_info,
     const ArrayRef<const uint8_t>& vmap_table,
     const ArrayRef<const uint8_t>& cfi_info,
-    const ArrayRef<const LinkerPatch>& patches) {
+    const ArrayRef<const linker::LinkerPatch>& patches) {
   SwapAllocator<CompiledMethod> alloc(driver->GetCompiledMethodStorage()->GetSwapSpaceAllocator());
   CompiledMethod* ret = alloc.allocate(1);
   alloc.construct(ret,
diff --git a/compiler/compiled_method.h b/compiler/compiled_method.h
index 5ef6cbf..892bc59 100644
--- a/compiler/compiled_method.h
+++ b/compiler/compiled_method.h
@@ -17,27 +17,28 @@
 #ifndef ART_COMPILER_COMPILED_METHOD_H_
 #define ART_COMPILER_COMPILED_METHOD_H_
 
-#include <iosfwd>
 #include <memory>
 #include <string>
 #include <vector>
 
 #include "arch/instruction_set.h"
-#include "base/array_ref.h"
-#include "base/bit_utils.h"
-#include "base/length_prefixed_array.h"
-#include "dex_file_types.h"
-#include "method_reference.h"
 
 namespace art {
 
+template <typename T> class ArrayRef;
 class CompilerDriver;
 class CompiledMethodStorage;
+template<typename T> class LengthPrefixedArray;
+
+namespace linker {
+class LinkerPatch;
+}  // namespace linker
 
 class CompiledCode {
  public:
   // For Quick to supply an code blob
-  CompiledCode(CompilerDriver* compiler_driver, InstructionSet instruction_set,
+  CompiledCode(CompilerDriver* compiler_driver,
+               InstructionSet instruction_set,
                const ArrayRef<const uint8_t>& quick_code);
 
   virtual ~CompiledCode();
@@ -46,9 +47,7 @@
     return instruction_set_;
   }
 
-  ArrayRef<const uint8_t> GetQuickCode() const {
-    return GetArray(quick_code_);
-  }
+  ArrayRef<const uint8_t> GetQuickCode() const;
 
   bool operator==(const CompiledCode& rhs) const;
 
@@ -66,18 +65,11 @@
   // Returns a pointer suitable for invoking the code at the argument
   // code_pointer address.  Mainly to cope with kThumb2 where the
   // lower bit must be set to indicate Thumb mode.
-  static const void* CodePointer(const void* code_pointer,
-                                 InstructionSet instruction_set);
+  static const void* CodePointer(const void* code_pointer, InstructionSet instruction_set);
 
  protected:
   template <typename T>
-  static ArrayRef<const T> GetArray(const LengthPrefixedArray<T>* array) {
-    if (array == nullptr) {
-      return ArrayRef<const T>();
-    }
-    DCHECK_NE(array->size(), 0u);
-    return ArrayRef<const T>(&array->At(0), array->size());
-  }
+  static ArrayRef<const T> GetArray(const LengthPrefixedArray<T>* array);
 
   CompilerDriver* GetCompilerDriver() {
     return compiler_driver_;
@@ -92,298 +84,6 @@
   const LengthPrefixedArray<uint8_t>* const quick_code_;
 };
 
-class SrcMapElem {
- public:
-  uint32_t from_;
-  int32_t to_;
-};
-
-inline bool operator<(const SrcMapElem& lhs, const SrcMapElem& rhs) {
-  if (lhs.from_ != rhs.from_) {
-    return lhs.from_ < rhs.from_;
-  }
-  return lhs.to_ < rhs.to_;
-}
-
-inline bool operator==(const SrcMapElem& lhs, const SrcMapElem& rhs) {
-  return lhs.from_ == rhs.from_ && lhs.to_ == rhs.to_;
-}
-
-class LinkerPatch {
- public:
-  // Note: We explicitly specify the underlying type of the enum because GCC
-  // would otherwise select a bigger underlying type and then complain that
-  //     'art::LinkerPatch::patch_type_' is too small to hold all
-  //     values of 'enum class art::LinkerPatch::Type'
-  // which is ridiculous given we have only a handful of values here. If we
-  // choose to squeeze the Type into fewer than 8 bits, we'll have to declare
-  // patch_type_ as an uintN_t and do explicit static_cast<>s.
-  enum class Type : uint8_t {
-    kMethodRelative,          // NOTE: Actual patching is instruction_set-dependent.
-    kMethodBssEntry,          // NOTE: Actual patching is instruction_set-dependent.
-    kCall,
-    kCallRelative,            // NOTE: Actual patching is instruction_set-dependent.
-    kTypeRelative,            // NOTE: Actual patching is instruction_set-dependent.
-    kTypeClassTable,          // NOTE: Actual patching is instruction_set-dependent.
-    kTypeBssEntry,            // NOTE: Actual patching is instruction_set-dependent.
-    kStringRelative,          // NOTE: Actual patching is instruction_set-dependent.
-    kStringInternTable,       // NOTE: Actual patching is instruction_set-dependent.
-    kStringBssEntry,          // NOTE: Actual patching is instruction_set-dependent.
-    kBakerReadBarrierBranch,  // NOTE: Actual patching is instruction_set-dependent.
-  };
-
-  static LinkerPatch RelativeMethodPatch(size_t literal_offset,
-                                         const DexFile* target_dex_file,
-                                         uint32_t pc_insn_offset,
-                                         uint32_t target_method_idx) {
-    LinkerPatch patch(literal_offset, Type::kMethodRelative, target_dex_file);
-    patch.method_idx_ = target_method_idx;
-    patch.pc_insn_offset_ = pc_insn_offset;
-    return patch;
-  }
-
-  static LinkerPatch MethodBssEntryPatch(size_t literal_offset,
-                                         const DexFile* target_dex_file,
-                                         uint32_t pc_insn_offset,
-                                         uint32_t target_method_idx) {
-    LinkerPatch patch(literal_offset, Type::kMethodBssEntry, target_dex_file);
-    patch.method_idx_ = target_method_idx;
-    patch.pc_insn_offset_ = pc_insn_offset;
-    return patch;
-  }
-
-  static LinkerPatch CodePatch(size_t literal_offset,
-                               const DexFile* target_dex_file,
-                               uint32_t target_method_idx) {
-    LinkerPatch patch(literal_offset, Type::kCall, target_dex_file);
-    patch.method_idx_ = target_method_idx;
-    return patch;
-  }
-
-  static LinkerPatch RelativeCodePatch(size_t literal_offset,
-                                       const DexFile* target_dex_file,
-                                       uint32_t target_method_idx) {
-    LinkerPatch patch(literal_offset, Type::kCallRelative, target_dex_file);
-    patch.method_idx_ = target_method_idx;
-    return patch;
-  }
-
-  static LinkerPatch RelativeTypePatch(size_t literal_offset,
-                                       const DexFile* target_dex_file,
-                                       uint32_t pc_insn_offset,
-                                       uint32_t target_type_idx) {
-    LinkerPatch patch(literal_offset, Type::kTypeRelative, target_dex_file);
-    patch.type_idx_ = target_type_idx;
-    patch.pc_insn_offset_ = pc_insn_offset;
-    return patch;
-  }
-
-  static LinkerPatch TypeClassTablePatch(size_t literal_offset,
-                                         const DexFile* target_dex_file,
-                                         uint32_t pc_insn_offset,
-                                         uint32_t target_type_idx) {
-    LinkerPatch patch(literal_offset, Type::kTypeClassTable, target_dex_file);
-    patch.type_idx_ = target_type_idx;
-    patch.pc_insn_offset_ = pc_insn_offset;
-    return patch;
-  }
-
-  static LinkerPatch TypeBssEntryPatch(size_t literal_offset,
-                                       const DexFile* target_dex_file,
-                                       uint32_t pc_insn_offset,
-                                       uint32_t target_type_idx) {
-    LinkerPatch patch(literal_offset, Type::kTypeBssEntry, target_dex_file);
-    patch.type_idx_ = target_type_idx;
-    patch.pc_insn_offset_ = pc_insn_offset;
-    return patch;
-  }
-
-  static LinkerPatch RelativeStringPatch(size_t literal_offset,
-                                         const DexFile* target_dex_file,
-                                         uint32_t pc_insn_offset,
-                                         uint32_t target_string_idx) {
-    LinkerPatch patch(literal_offset, Type::kStringRelative, target_dex_file);
-    patch.string_idx_ = target_string_idx;
-    patch.pc_insn_offset_ = pc_insn_offset;
-    return patch;
-  }
-
-  static LinkerPatch StringInternTablePatch(size_t literal_offset,
-                                            const DexFile* target_dex_file,
-                                            uint32_t pc_insn_offset,
-                                            uint32_t target_string_idx) {
-    LinkerPatch patch(literal_offset, Type::kStringInternTable, target_dex_file);
-    patch.string_idx_ = target_string_idx;
-    patch.pc_insn_offset_ = pc_insn_offset;
-    return patch;
-  }
-
-  static LinkerPatch StringBssEntryPatch(size_t literal_offset,
-                                         const DexFile* target_dex_file,
-                                         uint32_t pc_insn_offset,
-                                         uint32_t target_string_idx) {
-    LinkerPatch patch(literal_offset, Type::kStringBssEntry, target_dex_file);
-    patch.string_idx_ = target_string_idx;
-    patch.pc_insn_offset_ = pc_insn_offset;
-    return patch;
-  }
-
-  static LinkerPatch BakerReadBarrierBranchPatch(size_t literal_offset,
-                                                 uint32_t custom_value1 = 0u,
-                                                 uint32_t custom_value2 = 0u) {
-    LinkerPatch patch(literal_offset, Type::kBakerReadBarrierBranch, nullptr);
-    patch.baker_custom_value1_ = custom_value1;
-    patch.baker_custom_value2_ = custom_value2;
-    return patch;
-  }
-
-  LinkerPatch(const LinkerPatch& other) = default;
-  LinkerPatch& operator=(const LinkerPatch& other) = default;
-
-  size_t LiteralOffset() const {
-    return literal_offset_;
-  }
-
-  Type GetType() const {
-    return patch_type_;
-  }
-
-  bool IsPcRelative() const {
-    switch (GetType()) {
-      case Type::kMethodRelative:
-      case Type::kMethodBssEntry:
-      case Type::kCallRelative:
-      case Type::kTypeRelative:
-      case Type::kTypeClassTable:
-      case Type::kTypeBssEntry:
-      case Type::kStringRelative:
-      case Type::kStringInternTable:
-      case Type::kStringBssEntry:
-      case Type::kBakerReadBarrierBranch:
-        return true;
-      default:
-        return false;
-    }
-  }
-
-  MethodReference TargetMethod() const {
-    DCHECK(patch_type_ == Type::kMethodRelative ||
-           patch_type_ == Type::kMethodBssEntry ||
-           patch_type_ == Type::kCall ||
-           patch_type_ == Type::kCallRelative);
-    return MethodReference(target_dex_file_, method_idx_);
-  }
-
-  const DexFile* TargetTypeDexFile() const {
-    DCHECK(patch_type_ == Type::kTypeRelative ||
-           patch_type_ == Type::kTypeClassTable ||
-           patch_type_ == Type::kTypeBssEntry);
-    return target_dex_file_;
-  }
-
-  dex::TypeIndex TargetTypeIndex() const {
-    DCHECK(patch_type_ == Type::kTypeRelative ||
-           patch_type_ == Type::kTypeClassTable ||
-           patch_type_ == Type::kTypeBssEntry);
-    return dex::TypeIndex(type_idx_);
-  }
-
-  const DexFile* TargetStringDexFile() const {
-    DCHECK(patch_type_ == Type::kStringRelative ||
-           patch_type_ == Type::kStringInternTable ||
-           patch_type_ == Type::kStringBssEntry);
-    return target_dex_file_;
-  }
-
-  dex::StringIndex TargetStringIndex() const {
-    DCHECK(patch_type_ == Type::kStringRelative ||
-           patch_type_ == Type::kStringInternTable ||
-           patch_type_ == Type::kStringBssEntry);
-    return dex::StringIndex(string_idx_);
-  }
-
-  uint32_t PcInsnOffset() const {
-    DCHECK(patch_type_ == Type::kMethodRelative ||
-           patch_type_ == Type::kMethodBssEntry ||
-           patch_type_ == Type::kTypeRelative ||
-           patch_type_ == Type::kTypeClassTable ||
-           patch_type_ == Type::kTypeBssEntry ||
-           patch_type_ == Type::kStringRelative ||
-           patch_type_ == Type::kStringInternTable ||
-           patch_type_ == Type::kStringBssEntry);
-    return pc_insn_offset_;
-  }
-
-  uint32_t GetBakerCustomValue1() const {
-    DCHECK(patch_type_ == Type::kBakerReadBarrierBranch);
-    return baker_custom_value1_;
-  }
-
-  uint32_t GetBakerCustomValue2() const {
-    DCHECK(patch_type_ == Type::kBakerReadBarrierBranch);
-    return baker_custom_value2_;
-  }
-
- private:
-  LinkerPatch(size_t literal_offset, Type patch_type, const DexFile* target_dex_file)
-      : target_dex_file_(target_dex_file),
-        literal_offset_(literal_offset),
-        patch_type_(patch_type) {
-    cmp1_ = 0u;
-    cmp2_ = 0u;
-    // The compiler rejects methods that are too big, so the compiled code
-    // of a single method really shouln't be anywhere close to 16MiB.
-    DCHECK(IsUint<24>(literal_offset));
-  }
-
-  const DexFile* target_dex_file_;
-  // TODO: Clean up naming. Some patched locations are literals but others are not.
-  uint32_t literal_offset_ : 24;  // Method code size up to 16MiB.
-  Type patch_type_ : 8;
-  union {
-    uint32_t cmp1_;             // Used for relational operators.
-    uint32_t method_idx_;       // Method index for Call/Method patches.
-    uint32_t type_idx_;         // Type index for Type patches.
-    uint32_t string_idx_;       // String index for String patches.
-    uint32_t baker_custom_value1_;
-    static_assert(sizeof(method_idx_) == sizeof(cmp1_), "needed by relational operators");
-    static_assert(sizeof(type_idx_) == sizeof(cmp1_), "needed by relational operators");
-    static_assert(sizeof(string_idx_) == sizeof(cmp1_), "needed by relational operators");
-    static_assert(sizeof(baker_custom_value1_) == sizeof(cmp1_), "needed by relational operators");
-  };
-  union {
-    // Note: To avoid uninitialized padding on 64-bit systems, we use `size_t` for `cmp2_`.
-    // This allows a hashing function to treat an array of linker patches as raw memory.
-    size_t cmp2_;             // Used for relational operators.
-    // Literal offset of the insn loading PC (same as literal_offset if it's the same insn,
-    // may be different if the PC-relative addressing needs multiple insns).
-    uint32_t pc_insn_offset_;
-    uint32_t baker_custom_value2_;
-    static_assert(sizeof(pc_insn_offset_) <= sizeof(cmp2_), "needed by relational operators");
-    static_assert(sizeof(baker_custom_value2_) <= sizeof(cmp2_), "needed by relational operators");
-  };
-
-  friend bool operator==(const LinkerPatch& lhs, const LinkerPatch& rhs);
-  friend bool operator<(const LinkerPatch& lhs, const LinkerPatch& rhs);
-};
-std::ostream& operator<<(std::ostream& os, const LinkerPatch::Type& type);
-
-inline bool operator==(const LinkerPatch& lhs, const LinkerPatch& rhs) {
-  return lhs.literal_offset_ == rhs.literal_offset_ &&
-      lhs.patch_type_ == rhs.patch_type_ &&
-      lhs.target_dex_file_ == rhs.target_dex_file_ &&
-      lhs.cmp1_ == rhs.cmp1_ &&
-      lhs.cmp2_ == rhs.cmp2_;
-}
-
-inline bool operator<(const LinkerPatch& lhs, const LinkerPatch& rhs) {
-  return (lhs.literal_offset_ != rhs.literal_offset_) ? lhs.literal_offset_ < rhs.literal_offset_
-      : (lhs.patch_type_ != rhs.patch_type_) ? lhs.patch_type_ < rhs.patch_type_
-      : (lhs.target_dex_file_ != rhs.target_dex_file_) ? lhs.target_dex_file_ < rhs.target_dex_file_
-      : (lhs.cmp1_ != rhs.cmp1_) ? lhs.cmp1_ < rhs.cmp1_
-      : lhs.cmp2_ < rhs.cmp2_;
-}
-
 class CompiledMethod FINAL : public CompiledCode {
  public:
   // Constructs a CompiledMethod.
@@ -398,7 +98,7 @@
                  const ArrayRef<const uint8_t>& method_info,
                  const ArrayRef<const uint8_t>& vmap_table,
                  const ArrayRef<const uint8_t>& cfi_info,
-                 const ArrayRef<const LinkerPatch>& patches);
+                 const ArrayRef<const linker::LinkerPatch>& patches);
 
   virtual ~CompiledMethod();
 
@@ -412,7 +112,7 @@
       const ArrayRef<const uint8_t>& method_info,
       const ArrayRef<const uint8_t>& vmap_table,
       const ArrayRef<const uint8_t>& cfi_info,
-      const ArrayRef<const LinkerPatch>& patches);
+      const ArrayRef<const linker::LinkerPatch>& patches);
 
   static void ReleaseSwapAllocatedCompiledMethod(CompilerDriver* driver, CompiledMethod* m);
 
@@ -428,21 +128,13 @@
     return fp_spill_mask_;
   }
 
-  ArrayRef<const uint8_t> GetMethodInfo() const {
-    return GetArray(method_info_);
-  }
+  ArrayRef<const uint8_t> GetMethodInfo() const;
 
-  ArrayRef<const uint8_t> GetVmapTable() const {
-    return GetArray(vmap_table_);
-  }
+  ArrayRef<const uint8_t> GetVmapTable() const;
 
-  ArrayRef<const uint8_t> GetCFIInfo() const {
-    return GetArray(cfi_info_);
-  }
+  ArrayRef<const uint8_t> GetCFIInfo() const;
 
-  ArrayRef<const LinkerPatch> GetPatches() const {
-    return GetArray(patches_);
-  }
+  ArrayRef<const linker::LinkerPatch> GetPatches() const;
 
  private:
   // For quick code, the size of the activation used by the code.
@@ -458,7 +150,7 @@
   // For quick code, a FDE entry for the debug_frame section.
   const LengthPrefixedArray<uint8_t>* const cfi_info_;
   // For quick code, linker patches needed by the method.
-  const LengthPrefixedArray<LinkerPatch>* const patches_;
+  const LengthPrefixedArray<linker::LinkerPatch>* const patches_;
 };
 
 }  // namespace art
diff --git a/compiler/debug/elf_debug_line_writer.h b/compiler/debug/elf_debug_line_writer.h
index cf5d65e..49d52c4 100644
--- a/compiler/debug/elf_debug_line_writer.h
+++ b/compiler/debug/elf_debug_line_writer.h
@@ -20,10 +20,10 @@
 #include <unordered_set>
 #include <vector>
 
-#include "compiled_method.h"
 #include "debug/dwarf/debug_line_opcode_writer.h"
 #include "debug/dwarf/headers.h"
 #include "debug/elf_compilation_unit.h"
+#include "debug/src_map_elem.h"
 #include "dex_file-inl.h"
 #include "linker/elf_builder.h"
 #include "stack_map.h"
diff --git a/compiler/debug/method_debug_info.h b/compiler/debug/method_debug_info.h
index 5678910..a8225fa 100644
--- a/compiler/debug/method_debug_info.h
+++ b/compiler/debug/method_debug_info.h
@@ -19,7 +19,8 @@
 
 #include <string>
 
-#include "compiled_method.h"
+#include "arch/instruction_set.h"
+#include "base/array_ref.h"
 #include "dex_file.h"
 
 namespace art {
diff --git a/compiler/debug/src_map_elem.h b/compiler/debug/src_map_elem.h
new file mode 100644
index 0000000..5286b8c
--- /dev/null
+++ b/compiler/debug/src_map_elem.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_DEBUG_SRC_MAP_ELEM_H_
+#define ART_COMPILER_DEBUG_SRC_MAP_ELEM_H_
+
+#include <stdint.h>
+
+namespace art {
+
+class SrcMapElem {  // Pair of public fields; ordered and compared by the operators below.
+ public:
+  uint32_t from_;  // Compared first by operator<.
+  int32_t to_;     // Breaks ties in operator< when the from_ values match.
+};
+
+inline bool operator<(const SrcMapElem& lhs, const SrcMapElem& rhs) {
+  // Lexicographic order: from_ is the primary key, to_ breaks ties.
+  const bool same_from = (lhs.from_ == rhs.from_);
+  return same_from ? (lhs.to_ < rhs.to_)
+                   : (lhs.from_ < rhs.from_);
+}
+
+inline bool operator==(const SrcMapElem& lhs, const SrcMapElem& rhs) {
+  return lhs.from_ == rhs.from_ && lhs.to_ == rhs.to_;  // Equal only if both fields match.
+}
+
+}  // namespace art
+
+#endif  // ART_COMPILER_DEBUG_SRC_MAP_ELEM_H_
diff --git a/compiler/debug/src_map_elem_test.cc b/compiler/debug/src_map_elem_test.cc
new file mode 100644
index 0000000..ceaa53f
--- /dev/null
+++ b/compiler/debug/src_map_elem_test.cc
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "src_map_elem.h"
+
+#include "base/macros.h"
+
+namespace art {
+namespace debug {
+
+TEST(SrcMapElem, Operators) {
+  SrcMapElem elems[] = {
+      { 1u, -1 },
+      { 1u, 0 },
+      { 1u, 1 },
+      { 2u, -1 },
+      { 2u, 0 },    // Index 4.
+      { 2u, 1 },
+      { 2u, 0 },    // Index 6: Deliberate duplicate of the SrcMapElem at index 4.
+  };
+  // Entries 0-5 are in strictly increasing order; index 6 is ranked as index 4.
+  for (size_t i = 0; i != arraysize(elems); ++i) {
+    for (size_t j = 0; j != arraysize(elems); ++j) {
+      bool expected = (i != 6u ? i : 4u) == (j != 6u ? j : 4u);
+      EXPECT_EQ(expected, elems[i] == elems[j]) << i << " " << j;
+    }
+  }
+  // operator< must agree with the order of those ranks.
+  for (size_t i = 0; i != arraysize(elems); ++i) {
+    for (size_t j = 0; j != arraysize(elems); ++j) {
+      bool expected = (i != 6u ? i : 4u) < (j != 6u ? j : 4u);
+      EXPECT_EQ(expected, elems[i] < elems[j]) << i << " " << j;
+    }
+  }
+}
+
+}  // namespace debug
+}  // namespace art
diff --git a/compiler/dex/dex_to_dex_compiler.cc b/compiler/dex/dex_to_dex_compiler.cc
index 9d57b96..e49f83f 100644
--- a/compiler/dex/dex_to_dex_compiler.cc
+++ b/compiler/dex/dex_to_dex_compiler.cc
@@ -395,7 +395,7 @@
         ArrayRef<const uint8_t>(),                   // method_info
         ArrayRef<const uint8_t>(quicken_data),       // vmap_table
         ArrayRef<const uint8_t>(),                   // cfi data
-        ArrayRef<const LinkerPatch>());
+        ArrayRef<const linker::LinkerPatch>());
   }
   return nullptr;
 }
diff --git a/compiler/dex/dex_to_dex_decompiler_test.cc b/compiler/dex/dex_to_dex_decompiler_test.cc
index e36d416..6637be2 100644
--- a/compiler/dex/dex_to_dex_decompiler_test.cc
+++ b/compiler/dex/dex_to_dex_decompiler_test.cc
@@ -18,7 +18,7 @@
 
 #include "class_linker.h"
 #include "common_compiler_test.h"
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "compiler_callbacks.h"
 #include "dex_file.h"
 #include "driver/compiler_driver.h"
diff --git a/compiler/driver/compiled_method_storage.cc b/compiler/driver/compiled_method_storage.cc
index 528b0a2..c739333 100644
--- a/compiler/driver/compiled_method_storage.cc
+++ b/compiler/driver/compiled_method_storage.cc
@@ -21,6 +21,7 @@
 
 #include "base/logging.h"
 #include "compiled_method.h"
+#include "linker/linker_patch.h"
 #include "thread-current-inl.h"
 #include "utils.h"
 #include "utils/dedupe_set-inl.h"
@@ -178,7 +179,7 @@
                          LengthPrefixedArrayAlloc<uint8_t>(swap_space_.get())),
       dedupe_cfi_info_("dedupe cfi info", LengthPrefixedArrayAlloc<uint8_t>(swap_space_.get())),
       dedupe_linker_patches_("dedupe cfi info",
-                             LengthPrefixedArrayAlloc<LinkerPatch>(swap_space_.get())) {
+                             LengthPrefixedArrayAlloc<linker::LinkerPatch>(swap_space_.get())) {
 }
 
 CompiledMethodStorage::~CompiledMethodStorage() {
@@ -234,13 +235,13 @@
   ReleaseArrayIfNotDeduplicated(cfi_info);
 }
 
-const LengthPrefixedArray<LinkerPatch>* CompiledMethodStorage::DeduplicateLinkerPatches(
-    const ArrayRef<const LinkerPatch>& linker_patches) {
+const LengthPrefixedArray<linker::LinkerPatch>* CompiledMethodStorage::DeduplicateLinkerPatches(
+    const ArrayRef<const linker::LinkerPatch>& linker_patches) {
   return AllocateOrDeduplicateArray(linker_patches, &dedupe_linker_patches_);
 }
 
 void CompiledMethodStorage::ReleaseLinkerPatches(
-    const LengthPrefixedArray<LinkerPatch>* linker_patches) {
+    const LengthPrefixedArray<linker::LinkerPatch>* linker_patches) {
   ReleaseArrayIfNotDeduplicated(linker_patches);
 }
 
diff --git a/compiler/driver/compiled_method_storage.h b/compiler/driver/compiled_method_storage.h
index 27011e8..249f06c 100644
--- a/compiler/driver/compiled_method_storage.h
+++ b/compiler/driver/compiled_method_storage.h
@@ -28,7 +28,9 @@
 
 namespace art {
 
+namespace linker {
 class LinkerPatch;
+}  // namespace linker
 
 class CompiledMethodStorage {
  public:
@@ -61,9 +63,9 @@
   const LengthPrefixedArray<uint8_t>* DeduplicateCFIInfo(const ArrayRef<const uint8_t>& cfi_info);
   void ReleaseCFIInfo(const LengthPrefixedArray<uint8_t>* cfi_info);
 
-  const LengthPrefixedArray<LinkerPatch>* DeduplicateLinkerPatches(
-      const ArrayRef<const LinkerPatch>& linker_patches);
-  void ReleaseLinkerPatches(const LengthPrefixedArray<LinkerPatch>* linker_patches);
+  const LengthPrefixedArray<linker::LinkerPatch>* DeduplicateLinkerPatches(
+      const ArrayRef<const linker::LinkerPatch>& linker_patches);
+  void ReleaseLinkerPatches(const LengthPrefixedArray<linker::LinkerPatch>* linker_patches);
 
  private:
   template <typename T, typename DedupeSetType>
@@ -98,7 +100,7 @@
   ArrayDedupeSet<uint8_t> dedupe_method_info_;
   ArrayDedupeSet<uint8_t> dedupe_vmap_table_;
   ArrayDedupeSet<uint8_t> dedupe_cfi_info_;
-  ArrayDedupeSet<LinkerPatch> dedupe_linker_patches_;
+  ArrayDedupeSet<linker::LinkerPatch> dedupe_linker_patches_;
 
   DISALLOW_COPY_AND_ASSIGN(CompiledMethodStorage);
 };
diff --git a/compiler/driver/compiled_method_storage_test.cc b/compiler/driver/compiled_method_storage_test.cc
index 2ec2af5..e1ea630 100644
--- a/compiler/driver/compiled_method_storage_test.cc
+++ b/compiler/driver/compiled_method_storage_test.cc
@@ -18,7 +18,7 @@
 
 #include <gtest/gtest.h>
 
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "compiler_driver.h"
 #include "compiler_options.h"
 #include "dex/verification_results.h"
@@ -70,17 +70,17 @@
       ArrayRef<const uint8_t>(raw_cfi_info1),
       ArrayRef<const uint8_t>(raw_cfi_info2),
   };
-  const LinkerPatch raw_patches1[] = {
-      LinkerPatch::CodePatch(0u, nullptr, 1u),
-      LinkerPatch::RelativeMethodPatch(4u, nullptr, 0u, 1u),
+  const linker::LinkerPatch raw_patches1[] = {
+      linker::LinkerPatch::CodePatch(0u, nullptr, 1u),
+      linker::LinkerPatch::RelativeMethodPatch(4u, nullptr, 0u, 1u),
   };
-  const LinkerPatch raw_patches2[] = {
-      LinkerPatch::CodePatch(0u, nullptr, 1u),
-      LinkerPatch::RelativeMethodPatch(4u, nullptr, 0u, 2u),
+  const linker::LinkerPatch raw_patches2[] = {
+      linker::LinkerPatch::CodePatch(0u, nullptr, 1u),
+      linker::LinkerPatch::RelativeMethodPatch(4u, nullptr, 0u, 2u),
   };
-  ArrayRef<const LinkerPatch> patches[] = {
-      ArrayRef<const LinkerPatch>(raw_patches1),
-      ArrayRef<const LinkerPatch>(raw_patches2),
+  ArrayRef<const linker::LinkerPatch> patches[] = {
+      ArrayRef<const linker::LinkerPatch>(raw_patches1),
+      ArrayRef<const linker::LinkerPatch>(raw_patches2),
   };
 
   std::vector<CompiledMethod*> compiled_methods;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 678f090..03d8ef5 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -37,7 +37,7 @@
 #include "base/time_utils.h"
 #include "base/timing_logger.h"
 #include "class_linker-inl.h"
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "compiler.h"
 #include "compiler_callbacks.h"
 #include "compiler_driver-inl.h"
@@ -55,6 +55,7 @@
 #include "handle_scope-inl.h"
 #include "intrinsics_enum.h"
 #include "jni_internal.h"
+#include "linker/linker_patch.h"
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
 #include "mirror/dex_cache-inl.h"
@@ -618,7 +619,7 @@
   if (compiled_method != nullptr) {
     // Count non-relative linker patches.
     size_t non_relative_linker_patch_count = 0u;
-    for (const LinkerPatch& patch : compiled_method->GetPatches()) {
+    for (const linker::LinkerPatch& patch : compiled_method->GetPatches()) {
       if (!patch.IsPcRelative()) {
         ++non_relative_linker_patch_count;
       }
diff --git a/compiler/jni/quick/jni_compiler.cc b/compiler/jni/quick/jni_compiler.cc
index e7e4647..c66a2a6 100644
--- a/compiler/jni/quick/jni_compiler.cc
+++ b/compiler/jni/quick/jni_compiler.cc
@@ -665,7 +665,7 @@
                                                  /* method_info */ ArrayRef<const uint8_t>(),
                                                  /* vmap_table */ ArrayRef<const uint8_t>(),
                                                  ArrayRef<const uint8_t>(*jni_asm->cfi().data()),
-                                                 ArrayRef<const LinkerPatch>());
+                                                 ArrayRef<const linker::LinkerPatch>());
 }
 
 // Copy a single parameter from the managed to the JNI calling convention.
diff --git a/compiler/linker/arm/relative_patcher_arm_base.cc b/compiler/linker/arm/relative_patcher_arm_base.cc
index cb6522c..2cb23d1 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.cc
+++ b/compiler/linker/arm/relative_patcher_arm_base.cc
@@ -17,9 +17,10 @@
 #include "linker/arm/relative_patcher_arm_base.h"
 
 #include "base/stl_util.h"
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "debug/method_debug_info.h"
 #include "dex_file_types.h"
+#include "linker/linker_patch.h"
 #include "linker/output_stream.h"
 #include "oat.h"
 #include "oat_quick_method_header.h"
diff --git a/compiler/linker/arm/relative_patcher_thumb2.cc b/compiler/linker/arm/relative_patcher_thumb2.cc
index 704feeb..f84fea3 100644
--- a/compiler/linker/arm/relative_patcher_thumb2.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2.cc
@@ -21,6 +21,7 @@
 #include "base/bit_utils.h"
 #include "compiled_method.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
+#include "linker/linker_patch.h"
 #include "lock_word.h"
 #include "mirror/array-inl.h"
 #include "mirror/object.h"
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index 82f502a..828c99b 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -20,10 +20,11 @@
 #include "arch/arm64/instruction_set_features_arm64.h"
 #include "art_method.h"
 #include "base/bit_utils.h"
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "driver/compiler_driver.h"
 #include "entrypoints/quick/quick_entrypoints_enum.h"
 #include "heap_poisoning.h"
+#include "linker/linker_patch.h"
 #include "linker/output_stream.h"
 #include "lock_word.h"
 #include "mirror/array-inl.h"
diff --git a/compiler/linker/linker_patch.h b/compiler/linker/linker_patch.h
new file mode 100644
index 0000000..0ac1490
--- /dev/null
+++ b/compiler/linker/linker_patch.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_LINKER_LINKER_PATCH_H_
+#define ART_COMPILER_LINKER_LINKER_PATCH_H_
+
+#include <iosfwd>
+#include <stdint.h>
+
+#include "base/bit_utils.h"
+#include "base/logging.h"
+#include "method_reference.h"
+
+namespace art {
+
+class DexFile;
+
+namespace linker {
+
+class LinkerPatch {
+ public:
+  // Note: We explicitly specify the underlying type of the enum because GCC
+  // would otherwise select a bigger underlying type and then complain that
+  //     'art::linker::LinkerPatch::patch_type_' is too small to hold all
+  //     values of 'enum class art::linker::LinkerPatch::Type'
+  // which is ridiculous given we have only a handful of values here. If we
+  // choose to squeeze the Type into fewer than 8 bits, we'll have to declare
+  // patch_type_ as a uintN_t and do explicit static_cast<>s.
+  enum class Type : uint8_t {
+    kMethodRelative,          // NOTE: Actual patching is instruction_set-dependent.
+    kMethodBssEntry,          // NOTE: Actual patching is instruction_set-dependent.
+    kCall,
+    kCallRelative,            // NOTE: Actual patching is instruction_set-dependent.
+    kTypeRelative,            // NOTE: Actual patching is instruction_set-dependent.
+    kTypeClassTable,          // NOTE: Actual patching is instruction_set-dependent.
+    kTypeBssEntry,            // NOTE: Actual patching is instruction_set-dependent.
+    kStringRelative,          // NOTE: Actual patching is instruction_set-dependent.
+    kStringInternTable,       // NOTE: Actual patching is instruction_set-dependent.
+    kStringBssEntry,          // NOTE: Actual patching is instruction_set-dependent.
+    kBakerReadBarrierBranch,  // NOTE: Actual patching is instruction_set-dependent.
+  };
+
+  static LinkerPatch RelativeMethodPatch(size_t literal_offset,
+                                         const DexFile* target_dex_file,
+                                         uint32_t pc_insn_offset,
+                                         uint32_t target_method_idx) {
+    LinkerPatch patch(literal_offset, Type::kMethodRelative, target_dex_file);
+    patch.method_idx_ = target_method_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
+  static LinkerPatch MethodBssEntryPatch(size_t literal_offset,
+                                         const DexFile* target_dex_file,
+                                         uint32_t pc_insn_offset,
+                                         uint32_t target_method_idx) {
+    LinkerPatch patch(literal_offset, Type::kMethodBssEntry, target_dex_file);
+    patch.method_idx_ = target_method_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
+  static LinkerPatch CodePatch(size_t literal_offset,
+                               const DexFile* target_dex_file,
+                               uint32_t target_method_idx) {
+    LinkerPatch patch(literal_offset, Type::kCall, target_dex_file);
+    patch.method_idx_ = target_method_idx;
+    return patch;
+  }
+
+  static LinkerPatch RelativeCodePatch(size_t literal_offset,
+                                       const DexFile* target_dex_file,
+                                       uint32_t target_method_idx) {
+    LinkerPatch patch(literal_offset, Type::kCallRelative, target_dex_file);
+    patch.method_idx_ = target_method_idx;
+    return patch;
+  }
+
+  static LinkerPatch RelativeTypePatch(size_t literal_offset,
+                                       const DexFile* target_dex_file,
+                                       uint32_t pc_insn_offset,
+                                       uint32_t target_type_idx) {
+    LinkerPatch patch(literal_offset, Type::kTypeRelative, target_dex_file);
+    patch.type_idx_ = target_type_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
+  static LinkerPatch TypeClassTablePatch(size_t literal_offset,
+                                         const DexFile* target_dex_file,
+                                         uint32_t pc_insn_offset,
+                                         uint32_t target_type_idx) {
+    LinkerPatch patch(literal_offset, Type::kTypeClassTable, target_dex_file);
+    patch.type_idx_ = target_type_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
+  static LinkerPatch TypeBssEntryPatch(size_t literal_offset,
+                                       const DexFile* target_dex_file,
+                                       uint32_t pc_insn_offset,
+                                       uint32_t target_type_idx) {
+    LinkerPatch patch(literal_offset, Type::kTypeBssEntry, target_dex_file);
+    patch.type_idx_ = target_type_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
+  static LinkerPatch RelativeStringPatch(size_t literal_offset,
+                                         const DexFile* target_dex_file,
+                                         uint32_t pc_insn_offset,
+                                         uint32_t target_string_idx) {
+    LinkerPatch patch(literal_offset, Type::kStringRelative, target_dex_file);
+    patch.string_idx_ = target_string_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
+  static LinkerPatch StringInternTablePatch(size_t literal_offset,
+                                            const DexFile* target_dex_file,
+                                            uint32_t pc_insn_offset,
+                                            uint32_t target_string_idx) {
+    LinkerPatch patch(literal_offset, Type::kStringInternTable, target_dex_file);
+    patch.string_idx_ = target_string_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
+  static LinkerPatch StringBssEntryPatch(size_t literal_offset,
+                                         const DexFile* target_dex_file,
+                                         uint32_t pc_insn_offset,
+                                         uint32_t target_string_idx) {
+    LinkerPatch patch(literal_offset, Type::kStringBssEntry, target_dex_file);
+    patch.string_idx_ = target_string_idx;
+    patch.pc_insn_offset_ = pc_insn_offset;
+    return patch;
+  }
+
+  static LinkerPatch BakerReadBarrierBranchPatch(size_t literal_offset,
+                                                 uint32_t custom_value1 = 0u,
+                                                 uint32_t custom_value2 = 0u) {
+    LinkerPatch patch(literal_offset, Type::kBakerReadBarrierBranch, nullptr);
+    patch.baker_custom_value1_ = custom_value1;
+    patch.baker_custom_value2_ = custom_value2;
+    return patch;
+  }
+
+  LinkerPatch(const LinkerPatch& other) = default;
+  LinkerPatch& operator=(const LinkerPatch& other) = default;
+
+  size_t LiteralOffset() const {
+    return literal_offset_;
+  }
+
+  Type GetType() const {
+    return patch_type_;
+  }
+
+  bool IsPcRelative() const {
+    switch (GetType()) {
+      case Type::kMethodRelative:
+      case Type::kMethodBssEntry:
+      case Type::kCallRelative:
+      case Type::kTypeRelative:
+      case Type::kTypeClassTable:
+      case Type::kTypeBssEntry:
+      case Type::kStringRelative:
+      case Type::kStringInternTable:
+      case Type::kStringBssEntry:
+      case Type::kBakerReadBarrierBranch:
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  MethodReference TargetMethod() const {
+    DCHECK(patch_type_ == Type::kMethodRelative ||
+           patch_type_ == Type::kMethodBssEntry ||
+           patch_type_ == Type::kCall ||
+           patch_type_ == Type::kCallRelative);
+    return MethodReference(target_dex_file_, method_idx_);
+  }
+
+  const DexFile* TargetTypeDexFile() const {
+    DCHECK(patch_type_ == Type::kTypeRelative ||
+           patch_type_ == Type::kTypeClassTable ||
+           patch_type_ == Type::kTypeBssEntry);
+    return target_dex_file_;
+  }
+
+  dex::TypeIndex TargetTypeIndex() const {
+    DCHECK(patch_type_ == Type::kTypeRelative ||
+           patch_type_ == Type::kTypeClassTable ||
+           patch_type_ == Type::kTypeBssEntry);
+    return dex::TypeIndex(type_idx_);
+  }
+
+  const DexFile* TargetStringDexFile() const {
+    DCHECK(patch_type_ == Type::kStringRelative ||
+           patch_type_ == Type::kStringInternTable ||
+           patch_type_ == Type::kStringBssEntry);
+    return target_dex_file_;
+  }
+
+  dex::StringIndex TargetStringIndex() const {
+    DCHECK(patch_type_ == Type::kStringRelative ||
+           patch_type_ == Type::kStringInternTable ||
+           patch_type_ == Type::kStringBssEntry);
+    return dex::StringIndex(string_idx_);
+  }
+
+  uint32_t PcInsnOffset() const {
+    DCHECK(patch_type_ == Type::kMethodRelative ||
+           patch_type_ == Type::kMethodBssEntry ||
+           patch_type_ == Type::kTypeRelative ||
+           patch_type_ == Type::kTypeClassTable ||
+           patch_type_ == Type::kTypeBssEntry ||
+           patch_type_ == Type::kStringRelative ||
+           patch_type_ == Type::kStringInternTable ||
+           patch_type_ == Type::kStringBssEntry);
+    return pc_insn_offset_;
+  }
+
+  uint32_t GetBakerCustomValue1() const {
+    DCHECK(patch_type_ == Type::kBakerReadBarrierBranch);
+    return baker_custom_value1_;
+  }
+
+  uint32_t GetBakerCustomValue2() const {
+    DCHECK(patch_type_ == Type::kBakerReadBarrierBranch);
+    return baker_custom_value2_;
+  }
+
+ private:
+  LinkerPatch(size_t literal_offset, Type patch_type, const DexFile* target_dex_file)
+      : target_dex_file_(target_dex_file),
+        literal_offset_(literal_offset),
+        patch_type_(patch_type) {
+    cmp1_ = 0u;
+    cmp2_ = 0u;
+    // The compiler rejects methods that are too big, so the compiled code
+  // of a single method really shouldn't be anywhere close to 16MiB.
+    DCHECK(IsUint<24>(literal_offset));
+  }
+
+  const DexFile* target_dex_file_;
+  // TODO: Clean up naming. Some patched locations are literals but others are not.
+  uint32_t literal_offset_ : 24;  // Method code size up to 16MiB.
+  Type patch_type_ : 8;
+  union {
+    uint32_t cmp1_;             // Used for relational operators.
+    uint32_t method_idx_;       // Method index for Call/Method patches.
+    uint32_t type_idx_;         // Type index for Type patches.
+    uint32_t string_idx_;       // String index for String patches.
+    uint32_t baker_custom_value1_;
+    static_assert(sizeof(method_idx_) == sizeof(cmp1_), "needed by relational operators");
+    static_assert(sizeof(type_idx_) == sizeof(cmp1_), "needed by relational operators");
+    static_assert(sizeof(string_idx_) == sizeof(cmp1_), "needed by relational operators");
+    static_assert(sizeof(baker_custom_value1_) == sizeof(cmp1_), "needed by relational operators");
+  };
+  union {
+    // Note: To avoid uninitialized padding on 64-bit systems, we use `size_t` for `cmp2_`.
+    // This allows a hashing function to treat an array of linker patches as raw memory.
+    size_t cmp2_;             // Used for relational operators.
+    // Literal offset of the insn loading PC (same as literal_offset if it's the same insn,
+    // may be different if the PC-relative addressing needs multiple insns).
+    uint32_t pc_insn_offset_;
+    uint32_t baker_custom_value2_;
+    static_assert(sizeof(pc_insn_offset_) <= sizeof(cmp2_), "needed by relational operators");
+    static_assert(sizeof(baker_custom_value2_) <= sizeof(cmp2_), "needed by relational operators");
+  };
+
+  friend bool operator==(const LinkerPatch& lhs, const LinkerPatch& rhs);
+  friend bool operator<(const LinkerPatch& lhs, const LinkerPatch& rhs);
+};
+std::ostream& operator<<(std::ostream& os, const LinkerPatch::Type& type);
+
+inline bool operator==(const LinkerPatch& lhs, const LinkerPatch& rhs) {
+  return lhs.literal_offset_ == rhs.literal_offset_ &&
+      lhs.patch_type_ == rhs.patch_type_ &&
+      lhs.target_dex_file_ == rhs.target_dex_file_ &&
+      lhs.cmp1_ == rhs.cmp1_ &&
+      lhs.cmp2_ == rhs.cmp2_;
+}
+
+inline bool operator<(const LinkerPatch& lhs, const LinkerPatch& rhs) {
+  return (lhs.literal_offset_ != rhs.literal_offset_) ? lhs.literal_offset_ < rhs.literal_offset_
+      : (lhs.patch_type_ != rhs.patch_type_) ? lhs.patch_type_ < rhs.patch_type_
+      : (lhs.target_dex_file_ != rhs.target_dex_file_) ? lhs.target_dex_file_ < rhs.target_dex_file_
+      : (lhs.cmp1_ != rhs.cmp1_) ? lhs.cmp1_ < rhs.cmp1_
+      : lhs.cmp2_ < rhs.cmp2_;
+}
+
+}  // namespace linker
+}  // namespace art
+
+#endif  // ART_COMPILER_LINKER_LINKER_PATCH_H_
diff --git a/compiler/compiled_method_test.cc b/compiler/linker/linker_patch_test.cc
similarity index 90%
rename from compiler/compiled_method_test.cc
rename to compiler/linker/linker_patch_test.cc
index f4a72cf..e87dc8d 100644
--- a/compiler/compiled_method_test.cc
+++ b/compiler/linker/linker_patch_test.cc
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2015 The Android Open Source Project
+ * Copyright (C) 2017 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,37 +16,12 @@
 
 #include <gtest/gtest.h>
 
-#include "compiled_method.h"
+#include "linker_patch.h"
 
 namespace art {
+namespace linker {
 
-TEST(CompiledMethod, SrcMapElemOperators) {
-  SrcMapElem elems[] = {
-      { 1u, -1 },
-      { 1u, 0 },
-      { 1u, 1 },
-      { 2u, -1 },
-      { 2u, 0 },    // Index 4.
-      { 2u, 1 },
-      { 2u, 0u },   // Index 6: Arbitrarily add identical SrcMapElem with index 4.
-  };
-
-  for (size_t i = 0; i != arraysize(elems); ++i) {
-    for (size_t j = 0; j != arraysize(elems); ++j) {
-      bool expected = (i != 6u ? i : 4u) == (j != 6u ? j : 4u);
-      EXPECT_EQ(expected, elems[i] == elems[j]) << i << " " << j;
-    }
-  }
-
-  for (size_t i = 0; i != arraysize(elems); ++i) {
-    for (size_t j = 0; j != arraysize(elems); ++j) {
-      bool expected = (i != 6u ? i : 4u) < (j != 6u ? j : 4u);
-      EXPECT_EQ(expected, elems[i] < elems[j]) << i << " " << j;
-    }
-  }
-}
-
-TEST(CompiledMethod, LinkerPatchOperators) {
+TEST(LinkerPatch, LinkerPatchOperators) {
   const DexFile* dex_file1 = reinterpret_cast<const DexFile*>(1);
   const DexFile* dex_file2 = reinterpret_cast<const DexFile*>(2);
   LinkerPatch patches[] = {
@@ -191,4 +166,5 @@
   }
 }
 
+}  // namespace linker
 }  // namespace art
diff --git a/compiler/linker/mips/relative_patcher_mips.cc b/compiler/linker/mips/relative_patcher_mips.cc
index 408ac22..69e0846 100644
--- a/compiler/linker/mips/relative_patcher_mips.cc
+++ b/compiler/linker/mips/relative_patcher_mips.cc
@@ -18,6 +18,7 @@
 
 #include "compiled_method.h"
 #include "debug/method_debug_info.h"
+#include "linker/linker_patch.h"
 
 namespace art {
 namespace linker {
diff --git a/compiler/linker/mips64/relative_patcher_mips64.cc b/compiler/linker/mips64/relative_patcher_mips64.cc
index 2bcd98a..aae5746 100644
--- a/compiler/linker/mips64/relative_patcher_mips64.cc
+++ b/compiler/linker/mips64/relative_patcher_mips64.cc
@@ -18,6 +18,7 @@
 
 #include "compiled_method.h"
 #include "debug/method_debug_info.h"
+#include "linker/linker_patch.h"
 
 namespace art {
 namespace linker {
diff --git a/compiler/linker/relative_patcher.h b/compiler/linker/relative_patcher.h
index e079946..548e128 100644
--- a/compiler/linker/relative_patcher.h
+++ b/compiler/linker/relative_patcher.h
@@ -28,7 +28,6 @@
 namespace art {
 
 class CompiledMethod;
-class LinkerPatch;
 
 namespace debug {
 struct MethodDebugInfo;
@@ -36,6 +35,7 @@
 
 namespace linker {
 
+class LinkerPatch;
 class OutputStream;
 
 /**
diff --git a/compiler/linker/relative_patcher_test.h b/compiler/linker/relative_patcher_test.h
index f7dbc1e..6297dd0 100644
--- a/compiler/linker/relative_patcher_test.h
+++ b/compiler/linker/relative_patcher_test.h
@@ -21,7 +21,7 @@
 #include "arch/instruction_set_features.h"
 #include "base/array_ref.h"
 #include "base/macros.h"
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "dex/verification_results.h"
 #include "driver/compiler_driver.h"
 #include "driver/compiler_options.h"
diff --git a/compiler/linker/x86/relative_patcher_x86.cc b/compiler/linker/x86/relative_patcher_x86.cc
index 6967b0b..cdd2cef 100644
--- a/compiler/linker/x86/relative_patcher_x86.cc
+++ b/compiler/linker/x86/relative_patcher_x86.cc
@@ -17,6 +17,7 @@
 #include "linker/x86/relative_patcher_x86.h"
 
 #include "compiled_method.h"
+#include "linker/linker_patch.h"
 
 namespace art {
 namespace linker {
diff --git a/compiler/linker/x86_64/relative_patcher_x86_64.cc b/compiler/linker/x86_64/relative_patcher_x86_64.cc
index 156ece9..9633564 100644
--- a/compiler/linker/x86_64/relative_patcher_x86_64.cc
+++ b/compiler/linker/x86_64/relative_patcher_x86_64.cc
@@ -17,6 +17,7 @@
 #include "linker/x86_64/relative_patcher_x86_64.h"
 
 #include "compiled_method.h"
+#include "linker/linker_patch.h"
 
 namespace art {
 namespace linker {
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 1e5f1ec..6533e2b 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -288,7 +288,8 @@
   GetAssembler()->FinalizeInstructions(code);
 }
 
-void CodeGenerator::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches ATTRIBUTE_UNUSED) {
+void CodeGenerator::EmitLinkerPatches(
+    ArenaVector<linker::LinkerPatch>* linker_patches ATTRIBUTE_UNUSED) {
   // No linker patches by default.
 }
 
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 30c2b52..4b4abdf 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -61,9 +61,12 @@
 class CodeGenerator;
 class CompilerDriver;
 class CompilerOptions;
-class LinkerPatch;
 class ParallelMoveResolver;
 
+namespace linker {
+class LinkerPatch;
+}  // namespace linker
+
 class CodeAllocator {
  public:
   CodeAllocator() {}
@@ -205,7 +208,7 @@
 
   virtual void Initialize() = 0;
   virtual void Finalize(CodeAllocator* allocator);
-  virtual void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches);
+  virtual void EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches);
   virtual void GenerateFrameEntry() = 0;
   virtual void GenerateFrameExit() = 0;
   virtual void Bind(HBasicBlock* block) = 0;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 8814cfc..aaea7c1 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -31,6 +31,7 @@
 #include "intrinsics.h"
 #include "intrinsics_arm64.h"
 #include "linker/arm64/relative_patcher_arm64.h"
+#include "linker/linker_patch.h"
 #include "lock_word.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
@@ -4754,10 +4755,10 @@
   __ ldr(out, MemOperand(base, /* offset placeholder */ 0));
 }
 
-template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
 inline void CodeGeneratorARM64::EmitPcRelativeLinkerPatches(
     const ArenaDeque<PcRelativePatchInfo>& infos,
-    ArenaVector<LinkerPatch>* linker_patches) {
+    ArenaVector<linker::LinkerPatch>* linker_patches) {
   for (const PcRelativePatchInfo& info : infos) {
     linker_patches->push_back(Factory(info.label.GetLocation(),
                                       &info.target_dex_file,
@@ -4766,7 +4767,7 @@
   }
 }
 
-void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
+void CodeGeneratorARM64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_method_patches_.size() +
@@ -4778,28 +4779,28 @@
       baker_read_barrier_patches_.size();
   linker_patches->reserve(size);
   if (GetCompilerOptions().IsBootImage()) {
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
-                                                                linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
-                                                                  linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeMethodPatch>(
+        pc_relative_method_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeTypePatch>(
+        pc_relative_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
+        pc_relative_string_patches_, linker_patches);
   } else {
     DCHECK(pc_relative_method_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::TypeClassTablePatch>(pc_relative_type_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringInternTablePatch>(pc_relative_string_patches_,
-                                                                     linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
+        pc_relative_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
+        pc_relative_string_patches_, linker_patches);
   }
-  EmitPcRelativeLinkerPatches<LinkerPatch::MethodBssEntryPatch>(method_bss_entry_patches_,
-                                                                linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
-                                                              linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_bss_entry_patches_,
-                                                                linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>(
+      method_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeBssEntryPatch>(
+      type_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringBssEntryPatch>(
+      string_bss_entry_patches_, linker_patches);
   for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
-    linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(),
-                                                                       info.custom_data));
+    linker_patches->push_back(linker::LinkerPatch::BakerReadBarrierBranchPatch(
+        info.label.GetLocation(), info.custom_data));
   }
   DCHECK_EQ(size, linker_patches->size());
 }
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 69c5119..cebdaa1 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -627,7 +627,7 @@
                                 vixl::aarch64::Register out,
                                 vixl::aarch64::Register base);
 
-  void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
+  void EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) OVERRIDE;
 
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
@@ -805,9 +805,9 @@
 
   void EmitJumpTables();
 
-  template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+  template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
   static void EmitPcRelativeLinkerPatches(const ArenaDeque<PcRelativePatchInfo>& infos,
-                                          ArenaVector<LinkerPatch>* linker_patches);
+                                          ArenaVector<linker::LinkerPatch>* linker_patches);
 
   // Labels for each block that will be compiled.
   // We use a deque so that the `vixl::aarch64::Label` objects do not move in memory.
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index baf68c4..e1ea080 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -30,6 +30,7 @@
 #include "heap_poisoning.h"
 #include "intrinsics_arm_vixl.h"
 #include "linker/arm/relative_patcher_thumb2.h"
+#include "linker/linker_patch.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "thread.h"
@@ -9191,10 +9192,10 @@
       });
 }
 
-template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
 inline void CodeGeneratorARMVIXL::EmitPcRelativeLinkerPatches(
     const ArenaDeque<PcRelativePatchInfo>& infos,
-    ArenaVector<LinkerPatch>* linker_patches) {
+    ArenaVector<linker::LinkerPatch>* linker_patches) {
   for (const PcRelativePatchInfo& info : infos) {
     const DexFile& dex_file = info.target_dex_file;
     size_t offset_or_index = info.offset_or_index;
@@ -9211,7 +9212,7 @@
   }
 }
 
-void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
+void CodeGeneratorARMVIXL::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       /* MOVW+MOVT for each entry */ 2u * pc_relative_method_patches_.size() +
@@ -9223,28 +9224,28 @@
       baker_read_barrier_patches_.size();
   linker_patches->reserve(size);
   if (GetCompilerOptions().IsBootImage()) {
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
-                                                                linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
-                                                                  linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeMethodPatch>(
+        pc_relative_method_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeTypePatch>(
+        pc_relative_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
+        pc_relative_string_patches_, linker_patches);
   } else {
     DCHECK(pc_relative_method_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::TypeClassTablePatch>(pc_relative_type_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringInternTablePatch>(pc_relative_string_patches_,
-                                                                     linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
+        pc_relative_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
+        pc_relative_string_patches_, linker_patches);
   }
-  EmitPcRelativeLinkerPatches<LinkerPatch::MethodBssEntryPatch>(method_bss_entry_patches_,
-                                                                linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
-                                                              linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_bss_entry_patches_,
-                                                                linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>(
+      method_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeBssEntryPatch>(
+      type_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringBssEntryPatch>(
+      string_bss_entry_patches_, linker_patches);
   for (const BakerReadBarrierPatchInfo& info : baker_read_barrier_patches_) {
-    linker_patches->push_back(LinkerPatch::BakerReadBarrierBranchPatch(info.label.GetLocation(),
-                                                                       info.custom_data));
+    linker_patches->push_back(linker::LinkerPatch::BakerReadBarrierBranchPatch(
+        info.label.GetLocation(), info.custom_data));
   }
   DCHECK_EQ(size, linker_patches->size());
 }
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index e78bc15..337ecf1 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -594,7 +594,7 @@
                                                 dex::TypeIndex type_index,
                                                 Handle<mirror::Class> handle);
 
-  void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
+  void EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) OVERRIDE;
 
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
@@ -778,9 +778,9 @@
   PcRelativePatchInfo* NewPcRelativePatch(const DexFile& dex_file,
                                           uint32_t offset_or_index,
                                           ArenaDeque<PcRelativePatchInfo>* patches);
-  template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+  template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
   static void EmitPcRelativeLinkerPatches(const ArenaDeque<PcRelativePatchInfo>& infos,
-                                          ArenaVector<LinkerPatch>* linker_patches);
+                                          ArenaVector<linker::LinkerPatch>* linker_patches);
 
   // Labels for each block that will be compiled.
   // We use a deque so that the `vixl::aarch32::Label` objects do not move in memory.
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 6256722..8ada76a 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -29,6 +29,7 @@
 #include "heap_poisoning.h"
 #include "intrinsics.h"
 #include "intrinsics_mips.h"
+#include "linker/linker_patch.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "offsets.h"
@@ -1628,10 +1629,10 @@
   }
 }
 
-template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
 inline void CodeGeneratorMIPS::EmitPcRelativeLinkerPatches(
     const ArenaDeque<PcRelativePatchInfo>& infos,
-    ArenaVector<LinkerPatch>* linker_patches) {
+    ArenaVector<linker::LinkerPatch>* linker_patches) {
   for (const PcRelativePatchInfo& info : infos) {
     const DexFile& dex_file = info.target_dex_file;
     size_t offset_or_index = info.offset_or_index;
@@ -1647,7 +1648,7 @@
   }
 }
 
-void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
+void CodeGeneratorMIPS::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_method_patches_.size() +
@@ -1658,25 +1659,25 @@
       string_bss_entry_patches_.size();
   linker_patches->reserve(size);
   if (GetCompilerOptions().IsBootImage()) {
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
-                                                                linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
-                                                                  linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeMethodPatch>(
+        pc_relative_method_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeTypePatch>(
+        pc_relative_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
+        pc_relative_string_patches_, linker_patches);
   } else {
     DCHECK(pc_relative_method_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::TypeClassTablePatch>(pc_relative_type_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringInternTablePatch>(pc_relative_string_patches_,
-                                                                     linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
+        pc_relative_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
+        pc_relative_string_patches_, linker_patches);
   }
-  EmitPcRelativeLinkerPatches<LinkerPatch::MethodBssEntryPatch>(method_bss_entry_patches_,
-                                                                linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
-                                                              linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_bss_entry_patches_,
-                                                                linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>(
+      method_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeBssEntryPatch>(
+      type_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringBssEntryPatch>(
+      string_bss_entry_patches_, linker_patches);
   DCHECK_EQ(size, linker_patches->size());
 }
 
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index f15f8c6..2b1075d 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -395,7 +395,7 @@
   const MipsAssembler& GetAssembler() const OVERRIDE { return assembler_; }
 
   // Emit linker patches.
-  void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
+  void EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) OVERRIDE;
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
   // Fast path implementation of ReadBarrier::Barrier for a heap
@@ -679,9 +679,9 @@
                                           const PcRelativePatchInfo* info_high,
                                           ArenaDeque<PcRelativePatchInfo>* patches);
 
-  template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+  template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
   void EmitPcRelativeLinkerPatches(const ArenaDeque<PcRelativePatchInfo>& infos,
-                                   ArenaVector<LinkerPatch>* linker_patches);
+                                   ArenaVector<linker::LinkerPatch>* linker_patches);
 
   // Labels for each block that will be compiled.
   MipsLabel* block_labels_;
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index e8ae2db..119e0f6 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -27,6 +27,7 @@
 #include "heap_poisoning.h"
 #include "intrinsics.h"
 #include "intrinsics_mips64.h"
+#include "linker/linker_patch.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "offsets.h"
@@ -1541,10 +1542,10 @@
   }
 }
 
-template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
 inline void CodeGeneratorMIPS64::EmitPcRelativeLinkerPatches(
     const ArenaDeque<PcRelativePatchInfo>& infos,
-    ArenaVector<LinkerPatch>* linker_patches) {
+    ArenaVector<linker::LinkerPatch>* linker_patches) {
   for (const PcRelativePatchInfo& info : infos) {
     const DexFile& dex_file = info.target_dex_file;
     size_t offset_or_index = info.offset_or_index;
@@ -1556,7 +1557,7 @@
   }
 }
 
-void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
+void CodeGeneratorMIPS64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       pc_relative_method_patches_.size() +
@@ -1567,25 +1568,25 @@
       string_bss_entry_patches_.size();
   linker_patches->reserve(size);
   if (GetCompilerOptions().IsBootImage()) {
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(pc_relative_method_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(pc_relative_type_patches_,
-                                                                linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(pc_relative_string_patches_,
-                                                                  linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeMethodPatch>(
+        pc_relative_method_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeTypePatch>(
+        pc_relative_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
+        pc_relative_string_patches_, linker_patches);
   } else {
     DCHECK(pc_relative_method_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::TypeClassTablePatch>(pc_relative_type_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringInternTablePatch>(pc_relative_string_patches_,
-                                                                     linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
+        pc_relative_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
+        pc_relative_string_patches_, linker_patches);
   }
-  EmitPcRelativeLinkerPatches<LinkerPatch::MethodBssEntryPatch>(method_bss_entry_patches_,
-                                                                linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
-                                                              linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_bss_entry_patches_,
-                                                                linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>(
+      method_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeBssEntryPatch>(
+      type_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringBssEntryPatch>(
+      string_bss_entry_patches_, linker_patches);
   DCHECK_EQ(size, linker_patches->size());
 }
 
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index 3035621..9fe47ee 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -374,7 +374,7 @@
   const Mips64Assembler& GetAssembler() const OVERRIDE { return assembler_; }
 
   // Emit linker patches.
-  void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
+  void EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) OVERRIDE;
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
   // Fast path implementation of ReadBarrier::Barrier for a heap
@@ -643,9 +643,9 @@
                                           const PcRelativePatchInfo* info_high,
                                           ArenaDeque<PcRelativePatchInfo>* patches);
 
-  template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+  template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
   void EmitPcRelativeLinkerPatches(const ArenaDeque<PcRelativePatchInfo>& infos,
-                                   ArenaVector<LinkerPatch>* linker_patches);
+                                   ArenaVector<linker::LinkerPatch>* linker_patches);
 
   // Labels for each block that will be compiled.
   Mips64Label* block_labels_;  // Indexed by block id.
diff --git a/compiler/optimizing/code_generator_vector_arm64.cc b/compiler/optimizing/code_generator_vector_arm64.cc
index 18a55c8..3f576c8 100644
--- a/compiler/optimizing/code_generator_vector_arm64.cc
+++ b/compiler/optimizing/code_generator_vector_arm64.cc
@@ -949,20 +949,18 @@
   }
 }
 
-void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
-  switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
-      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
     default:
@@ -971,18 +969,25 @@
   }
 }
 
+void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // Shared accumulator locations.
+}
+
 // Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
 // 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
 // However vector MultiplyAccumulate instruction is not affected.
-void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = instr->GetLocations();
-  VRegister acc = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
-  VRegister left = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
-  VRegister right = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
-  switch (instr->GetPackedType()) {
+void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister acc = VRegisterFrom(locations->InAt(0));
+  VRegister left = VRegisterFrom(locations->InAt(1));
+  VRegister right = VRegisterFrom(locations->InAt(2));
+
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.V16B(), left.V16B(), right.V16B());
       } else {
         __ Mls(acc.V16B(), left.V16B(), right.V16B());
@@ -990,16 +995,16 @@
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.V8H(), left.V8H(), right.V8H());
       } else {
         __ Mls(acc.V8H(), left.V8H(), right.V8H());
       }
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(4u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ Mla(acc.V4S(), left.V4S(), right.V4S());
       } else {
         __ Mls(acc.V4S(), left.V4S(), right.V4S());
@@ -1007,6 +1012,186 @@
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+void LocationsBuilderARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);
+  // Add the scratch vector registers the code generator reads via GetTemp() for each type pair.
+  LocationSummary* locations = instruction->GetLocations();
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();  // First SAD operand.
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();  // Second SAD operand.
+  DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+  switch (a->GetPackedType()) {
+    case Primitive::kPrimByte:
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimLong:  // byte -> long: four temps (two here, two below).
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          FALLTHROUGH_INTENDED;
+        case Primitive::kPrimInt:  // byte -> int: two temps.
+          locations->AddTemp(Location::RequiresFpuRegister());
+          locations->AddTemp(Location::RequiresFpuRegister());
+          break;
+        default:  // No temps for other accumulator types.
+          break;
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:  // Only accumulating into long needs (two) temps.
+      if (instruction->GetPackedType() == Primitive::kPrimLong) {
+        locations->AddTemp(Location::RequiresFpuRegister());
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
+      break;
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:  // Same-type accumulation needs one temp.
+      if (instruction->GetPackedType() == a->GetPackedType()) {
+        locations->AddTemp(Location::RequiresFpuRegister());
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+void InstructionCodeGeneratorARM64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VRegister acc = VRegisterFrom(locations->InAt(0));  // Accumulator, doubles as output.
+  VRegister left = VRegisterFrom(locations->InAt(1));  // First SAD operand.
+  VRegister right = VRegisterFrom(locations->InAt(2));  // Second SAD operand.
+
+  DCHECK(locations->InAt(0).Equals(locations->Out()));
+
+  // Handle all feasible acc_T += sad(a_S, b_S) type combinations (T x S).
+  HVecOperation* a = instruction->InputAt(1)->AsVecOperation();
+  HVecOperation* b = instruction->InputAt(2)->AsVecOperation();
+  DCHECK_EQ(a->GetPackedType(), b->GetPackedType());
+  switch (a->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort:  // 8-bit operands, 16-bit sums.
+          DCHECK_EQ(8u, instruction->GetVectorLength());
+          __ Sabal(acc.V8H(), left.V8B(), right.V8B());  // Lower eight byte lanes.
+          __ Sabal2(acc.V8H(), left.V16B(), right.V16B());  // Upper eight byte lanes.
+          break;
+        case Primitive::kPrimInt: {  // 8-bit operands, 32-bit sums.
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+          __ Sxtl(tmp1.V8H(), left.V8B());  // Sign-extend the lower eight bytes to 16 bits.
+          __ Sxtl(tmp2.V8H(), right.V8B());
+          __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
+          __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
+          __ Sxtl2(tmp1.V8H(), left.V16B());  // Same again for the upper eight bytes.
+          __ Sxtl2(tmp2.V8H(), right.V16B());
+          __ Sabal(acc.V4S(), tmp1.V4H(), tmp2.V4H());
+          __ Sabal2(acc.V4S(), tmp1.V8H(), tmp2.V8H());
+          break;
+        }
+        case Primitive::kPrimLong: {  // 8-bit operands, 64-bit sums.
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+          VRegister tmp3 = VRegisterFrom(locations->GetTemp(2));
+          VRegister tmp4 = VRegisterFrom(locations->GetTemp(3));
+          __ Sxtl(tmp1.V8H(), left.V8B());  // Lower eight bytes -> 16 bits.
+          __ Sxtl(tmp2.V8H(), right.V8B());
+          __ Sxtl(tmp3.V4S(), tmp1.V4H());  // Lower four of those -> 32 bits.
+          __ Sxtl(tmp4.V4S(), tmp2.V4H());
+          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+          __ Sxtl2(tmp3.V4S(), tmp1.V8H());  // Upper four of those -> 32 bits.
+          __ Sxtl2(tmp4.V4S(), tmp2.V8H());
+          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+          __ Sxtl2(tmp1.V8H(), left.V16B());  // Upper eight bytes -> 16 bits, then repeat.
+          __ Sxtl2(tmp2.V8H(), right.V16B());
+          __ Sxtl(tmp3.V4S(), tmp1.V4H());
+          __ Sxtl(tmp4.V4S(), tmp2.V4H());
+          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+          __ Sxtl2(tmp3.V4S(), tmp1.V8H());
+          __ Sxtl2(tmp4.V4S(), tmp2.V8H());
+          __ Sabal(acc.V2D(), tmp3.V2S(), tmp4.V2S());
+          __ Sabal2(acc.V2D(), tmp3.V4S(), tmp4.V4S());
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimInt:  // 16-bit operands, 32-bit sums.
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          __ Sabal(acc.V4S(), left.V4H(), right.V4H());
+          __ Sabal2(acc.V4S(), left.V8H(), right.V8H());
+          break;
+        case Primitive::kPrimLong: {  // 16-bit operands, 64-bit sums.
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VRegister tmp1 = VRegisterFrom(locations->GetTemp(0));
+          VRegister tmp2 = VRegisterFrom(locations->GetTemp(1));
+          __ Sxtl(tmp1.V4S(), left.V4H());  // Lower four halfwords -> 32 bits.
+          __ Sxtl(tmp2.V4S(), right.V4H());
+          __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
+          __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
+          __ Sxtl2(tmp1.V4S(), left.V8H());  // Upper four halfwords -> 32 bits.
+          __ Sxtl2(tmp2.V4S(), right.V8H());
+          __ Sabal(acc.V2D(), tmp1.V2S(), tmp2.V2S());
+          __ Sabal2(acc.V2D(), tmp1.V4S(), tmp2.V4S());
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimInt: {  // Same width: acc += |left - right| via a temp.
+          DCHECK_EQ(4u, instruction->GetVectorLength());
+          VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+          __ Sub(tmp.V4S(), left.V4S(), right.V4S());
+          __ Abs(tmp.V4S(), tmp.V4S());
+          __ Add(acc.V4S(), acc.V4S(), tmp.V4S());
+          break;
+        }
+        case Primitive::kPrimLong:  // 32-bit operands, 64-bit sums.
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          __ Sabal(acc.V2D(), left.V2S(), right.V2S());
+          __ Sabal2(acc.V2D(), left.V4S(), right.V4S());
+          break;
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    case Primitive::kPrimLong:
+      DCHECK_EQ(2u, a->GetVectorLength());
+      switch (instruction->GetPackedType()) {
+        case Primitive::kPrimLong: {  // Same width: acc += |left - right| via a temp.
+          DCHECK_EQ(2u, instruction->GetVectorLength());
+          VRegister tmp = VRegisterFrom(locations->GetTemp(0));
+          __ Sub(tmp.V2D(), left.V2D(), right.V2D());
+          __ Abs(tmp.V2D(), tmp.V2D());
+          __ Add(acc.V2D(), acc.V2D(), tmp.V2D());
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unsupported SIMD type";
+          UNREACHABLE();
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
   }
 }
 
diff --git a/compiler/optimizing/code_generator_vector_arm_vixl.cc b/compiler/optimizing/code_generator_vector_arm_vixl.cc
index 7a11dff..069054c 100644
--- a/compiler/optimizing/code_generator_vector_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_vector_arm_vixl.cc
@@ -629,12 +629,40 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+// Helper to set up locations for vector accumulations (three FPU inputs, output reuses input 0).
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
+      locations->SetOut(Location::SameAsFirstInput());  // Result overwrites input 0.
+      break;
+    default:  // Any other packed type is rejected.
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
 }
 
-void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // Shared accumulator locations.
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // No code is emitted here.
+}
+
+void LocationsBuilderARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // Shared accumulator locations.
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // No code is emitted here.
 }
 
 // Return whether the vector memory access operation is guaranteed to be word-aligned (ARM word
diff --git a/compiler/optimizing/code_generator_vector_mips.cc b/compiler/optimizing/code_generator_vector_mips.cc
index c2fbf7f..0bedafc 100644
--- a/compiler/optimizing/code_generator_vector_mips.cc
+++ b/compiler/optimizing/code_generator_vector_mips.cc
@@ -826,21 +826,18 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
-  switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
-      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
     default:
@@ -849,18 +846,19 @@
   }
 }
 
-void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = instr->GetLocations();
-  VectorRegister acc =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
-  VectorRegister left =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
-  VectorRegister right =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
-  switch (instr->GetPackedType()) {
+void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // Shared accumulator locations.
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister acc = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvB(acc, left, right);
       } else {
         __ MsubvB(acc, left, right);
@@ -868,24 +866,24 @@
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvH(acc, left, right);
       } else {
         __ MsubvH(acc, left, right);
       }
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(4u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvW(acc, left, right);
       } else {
         __ MsubvW(acc, left, right);
       }
       break;
     case Primitive::kPrimLong:
-      DCHECK_EQ(2u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvD(acc, left, right);
       } else {
         __ MsubvD(acc, left, right);
@@ -897,6 +895,15 @@
   }
 }
 
+void LocationsBuilderMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // Shared accumulator locations.
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // No code is emitted here yet.
+  // TODO: Implement this; the location helper (shared with MulAcc) is already filled out.
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_mips64.cc b/compiler/optimizing/code_generator_vector_mips64.cc
index 9d3a777..db31bdc 100644
--- a/compiler/optimizing/code_generator_vector_mips64.cc
+++ b/compiler/optimizing/code_generator_vector_mips64.cc
@@ -830,21 +830,18 @@
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
-void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
-  switch (instr->GetPackedType()) {
+// Helper to set up locations for vector accumulations.
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
     case Primitive::kPrimLong:
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
-      locations->SetInAt(
-          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
-      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+      locations->SetInAt(0, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(2, Location::RequiresFpuRegister());
       locations->SetOut(Location::SameAsFirstInput());
       break;
     default:
@@ -853,18 +850,19 @@
   }
 }
 
-void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LocationSummary* locations = instr->GetLocations();
-  VectorRegister acc =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
-  VectorRegister left =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
-  VectorRegister right =
-      VectorRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
-  switch (instr->GetPackedType()) {
+void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // shared accumulation helper
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  VectorRegister acc = VectorRegisterFrom(locations->InAt(0));
+  VectorRegister left = VectorRegisterFrom(locations->InAt(1));
+  VectorRegister right = VectorRegisterFrom(locations->InAt(2));
+  switch (instruction->GetPackedType()) {
     case Primitive::kPrimByte:
-      DCHECK_EQ(16u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(16u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvB(acc, left, right);
       } else {
         __ MsubvB(acc, left, right);
@@ -872,24 +870,24 @@
       break;
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
-      DCHECK_EQ(8u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(8u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvH(acc, left, right);
       } else {
         __ MsubvH(acc, left, right);
       }
       break;
     case Primitive::kPrimInt:
-      DCHECK_EQ(4u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(4u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvW(acc, left, right);
       } else {
         __ MsubvW(acc, left, right);
       }
       break;
     case Primitive::kPrimLong:
-      DCHECK_EQ(2u, instr->GetVectorLength());
-      if (instr->GetOpKind() == HInstruction::kAdd) {
+      DCHECK_EQ(2u, instruction->GetVectorLength());
+      if (instruction->GetOpKind() == HInstruction::kAdd) {
         __ MaddvD(acc, left, right);
       } else {
         __ MsubvD(acc, left, right);
@@ -901,6 +899,15 @@
   }
 }
 
+void LocationsBuilderMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // same helper as MulAcc
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();  // no code is emitted for SAD yet
+  // TODO: implement this; the location helper is already filled out (shared with MulAcc).
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
diff --git a/compiler/optimizing/code_generator_vector_x86.cc b/compiler/optimizing/code_generator_vector_x86.cc
index 37190f8..5a012e7 100644
--- a/compiler/optimizing/code_generator_vector_x86.cc
+++ b/compiler/optimizing/code_generator_vector_x86.cc
@@ -51,7 +51,6 @@
                                     : Location::RequiresFpuRegister());
       locations->SetOut(is_zero ? Location::RequiresFpuRegister()
                                 : Location::SameAsFirstInput());
-
       break;
     default:
       LOG(FATAL) << "Unsupported SIMD type";
@@ -1033,12 +1032,42 @@
   }
 }
 
-void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+// Helper to set up locations for vector accumulations (three FPU inputs, output same as first).
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());  // input 0, reused as output below
+      locations->SetInAt(1, Location::RequiresFpuRegister());  // input 1
+      locations->SetInAt(2, Location::RequiresFpuRegister());  // input 2
+      locations->SetOut(Location::SameAsFirstInput());  // result is placed in input 0's location
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";  // any packed type not listed above aborts
+      UNREACHABLE();
+  }
 }
 
-void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
-  LOG(FATAL) << "No SIMD for " << instr->GetId();
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // shared accumulation helper
+}
+
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  // TODO: implement code generation (perhaps with pmaddwd?); until then this always aborts.
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // same helper as MulAcc
+}
+
+void InstructionCodeGeneratorX86::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  // TODO: implement code generation (perhaps psadbw for unsigned?); until then this aborts.
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
 // Helper to set up locations for vector memory operations.
diff --git a/compiler/optimizing/code_generator_vector_x86_64.cc b/compiler/optimizing/code_generator_vector_x86_64.cc
index edd0209..3698b7f 100644
--- a/compiler/optimizing/code_generator_vector_x86_64.cc
+++ b/compiler/optimizing/code_generator_vector_x86_64.cc
@@ -1005,11 +1005,41 @@
   }
 }
 
+// Helper to set up locations for vector accumulations (three FPU inputs, output same as first).
+static void CreateVecAccumLocations(ArenaAllocator* arena, HVecOperation* instruction) {
+  LocationSummary* locations = new (arena) LocationSummary(instruction);
+  switch (instruction->GetPackedType()) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, Location::RequiresFpuRegister());  // input 0, reused as output below
+      locations->SetInAt(1, Location::RequiresFpuRegister());  // input 1
+      locations->SetInAt(2, Location::RequiresFpuRegister());  // input 2
+      locations->SetOut(Location::SameAsFirstInput());  // result is placed in input 0's location
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";  // any packed type not listed above aborts
+      UNREACHABLE();
+  }
+}
+
 void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
-  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // shared accumulation helper
 }
 
 void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) {
+  // TODO: implement code generation (perhaps with pmaddwd?); until then this always aborts.
+  LOG(FATAL) << "No SIMD for " << instruction->GetId();
+}
+
+void LocationsBuilderX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  CreateVecAccumLocations(GetGraph()->GetArena(), instruction);  // same helper as MulAcc
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecSADAccumulate(HVecSADAccumulate* instruction) {
+  // TODO: implement code generation (perhaps psadbw for unsigned?); until then this aborts.
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 0b9130f..99581ee 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -26,6 +26,7 @@
 #include "heap_poisoning.h"
 #include "intrinsics.h"
 #include "intrinsics_x86.h"
+#include "linker/linker_patch.h"
 #include "lock_word.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
@@ -4675,10 +4676,10 @@
 // for method patch needs to point to the embedded constant which occupies the last 4 bytes.
 constexpr uint32_t kLabelPositionToLiteralOffsetAdjustment = 4u;
 
-template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
 inline void CodeGeneratorX86::EmitPcRelativeLinkerPatches(
     const ArenaDeque<X86PcRelativePatchInfo>& infos,
-    ArenaVector<LinkerPatch>* linker_patches) {
+    ArenaVector<linker::LinkerPatch>* linker_patches) {
   for (const X86PcRelativePatchInfo& info : infos) {
     uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
     linker_patches->push_back(Factory(
@@ -4686,7 +4687,7 @@
   }
 }
 
-void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
+void CodeGeneratorX86::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       boot_image_method_patches_.size() +
@@ -4697,24 +4698,25 @@
       string_bss_entry_patches_.size();
   linker_patches->reserve(size);
   if (GetCompilerOptions().IsBootImage()) {
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(boot_image_method_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(boot_image_type_patches_,
-                                                                linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(string_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeMethodPatch>(
+        boot_image_method_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeTypePatch>(
+        boot_image_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
+        string_patches_, linker_patches);
   } else {
     DCHECK(boot_image_method_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::TypeClassTablePatch>(boot_image_type_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringInternTablePatch>(string_patches_,
-                                                                     linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
+        boot_image_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
+        string_patches_, linker_patches);
   }
-  EmitPcRelativeLinkerPatches<LinkerPatch::MethodBssEntryPatch>(method_bss_entry_patches_,
-                                                                linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
-                                                              linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_bss_entry_patches_,
-                                                                linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>(
+      method_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeBssEntryPatch>(
+      type_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringBssEntryPatch>(
+      string_bss_entry_patches_, linker_patches);
   DCHECK_EQ(size, linker_patches->size());
 }
 
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index b32d57a..e8f919d 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -431,7 +431,7 @@
   void MoveFromReturnRegister(Location trg, Primitive::Type type) OVERRIDE;
 
   // Emit linker patches.
-  void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
+  void EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) OVERRIDE;
 
   void PatchJitRootUse(uint8_t* code,
                        const uint8_t* roots_data,
@@ -617,9 +617,9 @@
     HX86ComputeBaseMethodAddress* method_address;
   };
 
-  template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+  template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
   void EmitPcRelativeLinkerPatches(const ArenaDeque<X86PcRelativePatchInfo>& infos,
-                                   ArenaVector<LinkerPatch>* linker_patches);
+                                   ArenaVector<linker::LinkerPatch>* linker_patches);
 
   Register GetInvokeStaticOrDirectExtraParameter(HInvokeStaticOrDirect* invoke, Register temp);
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 39a6580..65b3f62 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -25,6 +25,7 @@
 #include "heap_poisoning.h"
 #include "intrinsics.h"
 #include "intrinsics_x86_64.h"
+#include "linker/linker_patch.h"
 #include "lock_word.h"
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
@@ -1106,10 +1107,10 @@
 // for method patch needs to point to the embedded constant which occupies the last 4 bytes.
 constexpr uint32_t kLabelPositionToLiteralOffsetAdjustment = 4u;
 
-template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
 inline void CodeGeneratorX86_64::EmitPcRelativeLinkerPatches(
     const ArenaDeque<PatchInfo<Label>>& infos,
-    ArenaVector<LinkerPatch>* linker_patches) {
+    ArenaVector<linker::LinkerPatch>* linker_patches) {
   for (const PatchInfo<Label>& info : infos) {
     uint32_t literal_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
     linker_patches->push_back(
@@ -1117,7 +1118,7 @@
   }
 }
 
-void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) {
+void CodeGeneratorX86_64::EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) {
   DCHECK(linker_patches->empty());
   size_t size =
       boot_image_method_patches_.size() +
@@ -1128,24 +1129,25 @@
       string_bss_entry_patches_.size();
   linker_patches->reserve(size);
   if (GetCompilerOptions().IsBootImage()) {
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeMethodPatch>(boot_image_method_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeTypePatch>(boot_image_type_patches_,
-                                                                linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::RelativeStringPatch>(string_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeMethodPatch>(
+        boot_image_method_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeTypePatch>(
+        boot_image_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::RelativeStringPatch>(
+        string_patches_, linker_patches);
   } else {
     DCHECK(boot_image_method_patches_.empty());
-    EmitPcRelativeLinkerPatches<LinkerPatch::TypeClassTablePatch>(boot_image_type_patches_,
-                                                                  linker_patches);
-    EmitPcRelativeLinkerPatches<LinkerPatch::StringInternTablePatch>(string_patches_,
-                                                                     linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeClassTablePatch>(
+        boot_image_type_patches_, linker_patches);
+    EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringInternTablePatch>(
+        string_patches_, linker_patches);
   }
-  EmitPcRelativeLinkerPatches<LinkerPatch::MethodBssEntryPatch>(method_bss_entry_patches_,
-                                                                linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::TypeBssEntryPatch>(type_bss_entry_patches_,
-                                                              linker_patches);
-  EmitPcRelativeLinkerPatches<LinkerPatch::StringBssEntryPatch>(string_bss_entry_patches_,
-                                                                linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::MethodBssEntryPatch>(
+      method_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::TypeBssEntryPatch>(
+      type_bss_entry_patches_, linker_patches);
+  EmitPcRelativeLinkerPatches<linker::LinkerPatch::StringBssEntryPatch>(
+      string_bss_entry_patches_, linker_patches);
   DCHECK_EQ(size, linker_patches->size());
 }
 
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index f5fa86b..8e8e695 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -424,7 +424,7 @@
 
   void MoveFromReturnRegister(Location trg, Primitive::Type type) OVERRIDE;
 
-  void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
+  void EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) OVERRIDE;
 
   void PatchJitRootUse(uint8_t* code,
                        const uint8_t* roots_data,
@@ -586,9 +586,9 @@
   static constexpr int32_t kDummy32BitOffset = 256;
 
  private:
-  template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
+  template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
   static void EmitPcRelativeLinkerPatches(const ArenaDeque<PatchInfo<Label>>& infos,
-                                          ArenaVector<LinkerPatch>* linker_patches);
+                                          ArenaVector<linker::LinkerPatch>* linker_patches);
 
   // Labels for each block that will be compiled.
   Label* block_labels_;  // Indexed by block id.
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index baa0453..6f8743b 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -71,10 +71,13 @@
   return false;
 }
 
-// Detect a sign extension from the given type. Returns the promoted operand on success.
+// Detect a sign extension in instruction from the given type. The to64 parameter
+// denotes if result is long, and thus sign extension from int can be included.
+// Returns the promoted operand on success.
 static bool IsSignExtensionAndGet(HInstruction* instruction,
                                   Primitive::Type type,
-                                  /*out*/ HInstruction** operand) {
+                                  /*out*/ HInstruction** operand,
+                                  bool to64 = false) {
   // Accept any already wider constant that would be handled properly by sign
   // extension when represented in the *width* of the given narrower data type
   // (the fact that char normally zero extends does not matter here).
@@ -82,20 +85,24 @@
   if (IsInt64AndGet(instruction, /*out*/ &value)) {
     switch (type) {
       case Primitive::kPrimByte:
-        if (std::numeric_limits<int8_t>::min() <= value &&
-            std::numeric_limits<int8_t>::max() >= value) {
+        if (IsInt<8>(value)) {
           *operand = instruction;
           return true;
         }
         return false;
       case Primitive::kPrimChar:
       case Primitive::kPrimShort:
-        if (std::numeric_limits<int16_t>::min() <= value &&
-            std::numeric_limits<int16_t>::max() <= value) {
+        if (IsInt<16>(value)) {
           *operand = instruction;
           return true;
         }
         return false;
+      case Primitive::kPrimInt:
+        if (IsInt<32>(value)) {
+          *operand = instruction;
+          return to64;
+        }
+        return false;
       default:
         return false;
     }
@@ -110,40 +117,52 @@
       case Primitive::kPrimShort:
         *operand = instruction;
         return true;
+      case Primitive::kPrimInt:
+        *operand = instruction;
+        return to64;
       default:
         return false;
     }
   }
-  // TODO: perhaps explicit conversions later too?
-  //       (this may return something different from instruction)
+  // Explicit type conversion to long.
+  if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+    return IsSignExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+  }
   return false;
 }
 
-// Detect a zero extension from the given type. Returns the promoted operand on success.
+// Detect a zero extension in instruction from the given type. The to64 parameter
+// denotes if result is long, and thus zero extension from int can be included.
+// Returns the promoted operand on success.
 static bool IsZeroExtensionAndGet(HInstruction* instruction,
                                   Primitive::Type type,
-                                  /*out*/ HInstruction** operand) {
+                                  /*out*/ HInstruction** operand,
+                                  bool to64 = false) {
   // Accept any already wider constant that would be handled properly by zero
   // extension when represented in the *width* of the given narrower data type
-  // (the fact that byte/short normally sign extend does not matter here).
+  // (the fact that byte/short/int normally sign extend does not matter here).
   int64_t value = 0;
   if (IsInt64AndGet(instruction, /*out*/ &value)) {
     switch (type) {
       case Primitive::kPrimByte:
-        if (std::numeric_limits<uint8_t>::min() <= value &&
-            std::numeric_limits<uint8_t>::max() >= value) {
+        if (IsUint<8>(value)) {
           *operand = instruction;
           return true;
         }
         return false;
       case Primitive::kPrimChar:
       case Primitive::kPrimShort:
-        if (std::numeric_limits<uint16_t>::min() <= value &&
-            std::numeric_limits<uint16_t>::max() <= value) {
+        if (IsUint<16>(value)) {
           *operand = instruction;
           return true;
         }
         return false;
+      case Primitive::kPrimInt:
+        if (IsUint<32>(value)) {
+          *operand = instruction;
+          return to64;
+        }
+        return false;
       default:
         return false;
     }
@@ -170,14 +189,21 @@
         (IsInt64AndGet(b, /*out*/ &mask) && (IsSignExtensionAndGet(a, type, /*out*/ operand) ||
                                              IsZeroExtensionAndGet(a, type, /*out*/ operand)))) {
       switch ((*operand)->GetType()) {
-        case Primitive::kPrimByte:  return mask == std::numeric_limits<uint8_t>::max();
+        case Primitive::kPrimByte:
+          return mask == std::numeric_limits<uint8_t>::max();
         case Primitive::kPrimChar:
-        case Primitive::kPrimShort: return mask == std::numeric_limits<uint16_t>::max();
+        case Primitive::kPrimShort:
+          return mask == std::numeric_limits<uint16_t>::max();
+        case Primitive::kPrimInt:
+          return mask == std::numeric_limits<uint32_t>::max() && to64;
         default: return false;
       }
     }
   }
-  // TODO: perhaps explicit conversions later too?
+  // Explicit type conversion to long.
+  if (instruction->IsTypeConversion() && instruction->GetType() == Primitive::kPrimLong) {
+    return IsZeroExtensionAndGet(instruction->InputAt(0), type, /*out*/ operand, /*to64*/ true);
+  }
   return false;
 }
 
@@ -214,6 +240,55 @@
   return false;
 }
 
+// Compute vector length for other_type given vl lanes of vector_type (same or narrower type).
+static size_t GetOtherVL(Primitive::Type other_type, Primitive::Type vector_type, size_t vl) {
+  switch (other_type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+      switch (vector_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte: return vl;
+        default: break;
+      }
+      break;  // wider vector_type is unsupported here, as in the cases below: reach LOG(FATAL)
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      switch (vector_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte: return vl >> 1;  // 2x wider lanes
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort: return vl;
+        default: break;
+      }
+      break;
+    case Primitive::kPrimInt:
+      switch (vector_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte: return vl >> 2;  // 4x wider lanes
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort: return vl >> 1;  // 2x wider lanes
+        case Primitive::kPrimInt: return vl;
+        default: break;
+      }
+      break;
+    case Primitive::kPrimLong:
+      switch (vector_type) {
+        case Primitive::kPrimBoolean:
+        case Primitive::kPrimByte: return vl >> 3;  // 8x wider lanes
+        case Primitive::kPrimChar:
+        case Primitive::kPrimShort: return vl >> 2;  // 4x wider lanes
+        case Primitive::kPrimInt: return vl >> 1;  // 2x wider lanes
+        case Primitive::kPrimLong: return vl;
+        default: break;
+      }
+      break;
+    default:
+      break;
+  }
+  LOG(FATAL) << "Unsupported idiom conversion";  // every pair not returned above ends up here
+  UNREACHABLE();
+}
+
 // Detect up to two instructions a and b, and an acccumulated constant c.
 static bool IsAddConstHelper(HInstruction* instruction,
                              /*out*/ HInstruction** a,
@@ -260,16 +335,16 @@
 }
 
 // Detect reductions of the following forms,
-// under assumption phi has only *one* use:
 //   x = x_phi + ..
 //   x = x_phi - ..
 //   x = max(x_phi, ..)
 //   x = min(x_phi, ..)
 static bool HasReductionFormat(HInstruction* reduction, HInstruction* phi) {
   if (reduction->IsAdd()) {
-    return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+    return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+           (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
   } else if (reduction->IsSub()) {
-    return reduction->InputAt(0) == phi;
+    return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi);
   } else if (reduction->IsInvokeStaticOrDirect()) {
     switch (reduction->AsInvokeStaticOrDirect()->GetIntrinsic()) {
       case Intrinsics::kMathMinIntInt:
@@ -280,7 +355,8 @@
       case Intrinsics::kMathMaxLongLong:
       case Intrinsics::kMathMaxFloatFloat:
       case Intrinsics::kMathMaxDoubleDouble:
-        return reduction->InputAt(0) == phi || reduction->InputAt(1) == phi;
+        return (reduction->InputAt(0) == phi && reduction->InputAt(1) != phi) ||
+               (reduction->InputAt(0) != phi && reduction->InputAt(1) == phi);
       default:
         return false;
     }
@@ -288,9 +364,9 @@
   return false;
 }
 
-// Translates operation to reduction kind.
-static HVecReduce::ReductionKind GetReductionKind(HInstruction* reduction) {
-  if (reduction->IsVecAdd() || reduction->IsVecSub()) {
+// Translates vector operation to reduction kind.
+static HVecReduce::ReductionKind GetReductionKind(HVecOperation* reduction) {
+  if (reduction->IsVecAdd() || reduction->IsVecSub() || reduction->IsVecSADAccumulate()) {
     return HVecReduce::kSum;
   } else if (reduction->IsVecMin()) {
     return HVecReduce::kMin;
@@ -720,7 +796,6 @@
                                   HBasicBlock* block,
                                   HBasicBlock* exit,
                                   int64_t trip_count) {
-  Primitive::Type induc_type = Primitive::kPrimInt;
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
 
@@ -739,6 +814,10 @@
   vector_header_ = header;
   vector_body_ = block;
 
+  // Loop induction type.
+  Primitive::Type induc_type = main_phi->GetType();
+  DCHECK(induc_type == Primitive::kPrimInt || induc_type == Primitive::kPrimLong) << induc_type;
+
   // Generate dynamic loop peeling trip count, if needed, under the assumption
   // that the Android runtime guarantees at least "component size" alignment:
   // ptc = (ALIGN - (&a[initial] % ALIGN)) / type-size
@@ -767,10 +846,10 @@
     HInstruction* rem = Insert(
         preheader, new (global_allocator_) HAnd(induc_type,
                                                 diff,
-                                                graph_->GetIntConstant(chunk - 1)));
+                                                graph_->GetConstant(induc_type, chunk - 1)));
     vtc = Insert(preheader, new (global_allocator_) HSub(induc_type, stc, rem));
   }
-  vector_index_ = graph_->GetIntConstant(0);
+  vector_index_ = graph_->GetConstant(induc_type, 0);
 
   // Generate runtime disambiguation test:
   // vtc = a != b ? vtc : 0;
@@ -779,7 +858,8 @@
         preheader,
         new (global_allocator_) HNotEqual(vector_runtime_test_a_, vector_runtime_test_b_));
     vtc = Insert(preheader,
-                 new (global_allocator_) HSelect(rt, vtc, graph_->GetIntConstant(0), kNoDexPc));
+                 new (global_allocator_)
+                 HSelect(rt, vtc, graph_->GetConstant(induc_type, 0), kNoDexPc));
     needs_cleanup = true;
   }
 
@@ -793,7 +873,7 @@
                     graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
                     vector_index_,
                     ptc,
-                    graph_->GetIntConstant(1),
+                    graph_->GetConstant(induc_type, 1),
                     kNoUnrollingFactor);
   }
 
@@ -806,7 +886,7 @@
                   graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
                   vector_index_,
                   vtc,
-                  graph_->GetIntConstant(vector_length_),  // increment per unroll
+                  graph_->GetConstant(induc_type, vector_length_),  // increment per unroll
                   unroll);
   HLoopInformation* vloop = vector_header_->GetLoopInformation();
 
@@ -820,14 +900,20 @@
                     graph_->TransformLoopForVectorization(vector_header_, vector_body_, exit),
                     vector_index_,
                     stc,
-                    graph_->GetIntConstant(1),
+                    graph_->GetConstant(induc_type, 1),
                     kNoUnrollingFactor);
   }
 
   // Link reductions to their final uses.
   for (auto i = reductions_->begin(); i != reductions_->end(); ++i) {
     if (i->first->IsPhi()) {
-      i->first->ReplaceWith(ReduceAndExtractIfNeeded(i->second));
+      HInstruction* phi = i->first;
+      HInstruction* repl = ReduceAndExtractIfNeeded(i->second);
+      // Deal with regular uses.
+      for (const HUseListNode<HInstruction*>& use : phi->GetUses()) {
+        induction_range_.Replace(use.GetUser(), phi, repl);  // update induction use
+      }
+      phi->ReplaceWith(repl);
     }
   }
 
@@ -853,7 +939,7 @@
                                         HInstruction* step,
                                         uint32_t unroll) {
   DCHECK(unroll == 1 || vector_mode_ == kVector);
-  Primitive::Type induc_type = Primitive::kPrimInt;
+  Primitive::Type induc_type = lo->GetType();
   // Prepare new loop.
   vector_preheader_ = new_preheader,
   vector_header_ = vector_preheader_->GetSingleSuccessor();
@@ -942,8 +1028,10 @@
   auto redit = reductions_->find(instruction);
   if (redit != reductions_->end()) {
     Primitive::Type type = instruction->GetType();
-    if (TrySetVectorType(type, &restrictions) &&
-        VectorizeUse(node, instruction, generate_code, type, restrictions)) {
+    // Recognize SAD idiom or direct reduction.
+    if (VectorizeSADIdiom(node, instruction, generate_code, type, restrictions) ||
+        (TrySetVectorType(type, &restrictions) &&
+         VectorizeUse(node, instruction, generate_code, type, restrictions))) {
       if (generate_code) {
         HInstruction* new_red = vector_map_->Get(instruction);
         vector_permanent_map_->Put(new_red, vector_map_->Get(redit->second));
@@ -1029,14 +1117,20 @@
     HInstruction* opa = conversion->InputAt(0);
     Primitive::Type from = conversion->GetInputType();
     Primitive::Type to = conversion->GetResultType();
-    if ((to == Primitive::kPrimByte ||
-         to == Primitive::kPrimChar ||
-         to == Primitive::kPrimShort) && from == Primitive::kPrimInt) {
-      // Accept a "narrowing" type conversion from a "wider" computation for
-      // (1) conversion into final required type,
-      // (2) vectorizable operand,
-      // (3) "wider" operations cannot bring in higher order bits.
-      if (to == type && VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) {
+    if (Primitive::IsIntegralType(from) && Primitive::IsIntegralType(to)) {
+      size_t size_vec = Primitive::ComponentSize(type);
+      size_t size_from = Primitive::ComponentSize(from);
+      size_t size_to = Primitive::ComponentSize(to);
+      // Accept an integral conversion
+      // (1a) narrowing into vector type, "wider" operations cannot bring in higher order bits, or
+      // (1b) widening from at least vector type, and
+      // (2) vectorizable operand.
+      if ((size_to < size_from &&
+           size_to == size_vec &&
+           VectorizeUse(node, opa, generate_code, type, restrictions | kNoHiBits)) ||
+          (size_to >= size_from &&
+           size_from >= size_vec &&
+           VectorizeUse(node, opa, generate_code, type, restrictions))) {
         if (generate_code) {
           if (vector_mode_ == kVector) {
             vector_map_->Put(instruction, vector_map_->Get(opa));  // operand pass-through
@@ -1088,7 +1182,7 @@
       return true;
     }
   } else if (instruction->IsShl() || instruction->IsShr() || instruction->IsUShr()) {
-    // Recognize vectorization idioms.
+    // Recognize halving add idiom.
     if (VectorizeHalvingAddIdiom(node, instruction, generate_code, type, restrictions)) {
       return true;
     }
@@ -1181,7 +1275,8 @@
           return false;  // reject, unless all operands are same-extension narrower
         }
         // Accept MIN/MAX(x, y) for vectorizable operands.
-        DCHECK(r != nullptr && s != nullptr);
+        DCHECK(r != nullptr);
+        DCHECK(s != nullptr);
         if (generate_code && vector_mode_ != kVector) {  // de-idiom
           r = opa;
           s = opb;
@@ -1232,11 +1327,11 @@
       switch (type) {
         case Primitive::kPrimBoolean:
         case Primitive::kPrimByte:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv;
           return TrySetVectorLength(16);
         case Primitive::kPrimChar:
         case Primitive::kPrimShort:
-          *restrictions |= kNoDiv | kNoReduction;
+          *restrictions |= kNoDiv;
           return TrySetVectorLength(8);
         case Primitive::kPrimInt:
           *restrictions |= kNoDiv;
@@ -1261,17 +1356,17 @@
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
             *restrictions |=
-                kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+                kNoMul | kNoDiv | kNoShift | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoReduction;
+            *restrictions |= kNoDiv | kNoAbs | kNoSignedHAdd | kNoUnroundedHAdd | kNoSAD;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
-            *restrictions |= kNoDiv;
+            *restrictions |= kNoDiv | kNoSAD;
             return TrySetVectorLength(4);
           case Primitive::kPrimLong:
-            *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax;
+            *restrictions |= kNoMul | kNoDiv | kNoShr | kNoAbs | kNoMinMax | kNoSAD;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
             *restrictions |= kNoMinMax | kNoReduction;  // minmax: -0.0 vs +0.0
@@ -1289,17 +1384,17 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoDiv | kNoReduction;
+            *restrictions |= kNoDiv | kNoReduction | kNoSAD;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
-            *restrictions |= kNoDiv | kNoReduction;
+            *restrictions |= kNoDiv | kNoReduction | kNoSAD;
             return TrySetVectorLength(4);
           case Primitive::kPrimLong:
-            *restrictions |= kNoDiv | kNoReduction;
+            *restrictions |= kNoDiv | kNoReduction | kNoSAD;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
             *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
@@ -1317,17 +1412,17 @@
         switch (type) {
           case Primitive::kPrimBoolean:
           case Primitive::kPrimByte:
-            *restrictions |= kNoDiv | kNoReduction;
+            *restrictions |= kNoDiv | kNoReduction | kNoSAD;
             return TrySetVectorLength(16);
           case Primitive::kPrimChar:
           case Primitive::kPrimShort:
-            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction;
+            *restrictions |= kNoDiv | kNoStringCharAt | kNoReduction | kNoSAD;
             return TrySetVectorLength(8);
           case Primitive::kPrimInt:
-            *restrictions |= kNoDiv | kNoReduction;
+            *restrictions |= kNoDiv | kNoReduction | kNoSAD;
             return TrySetVectorLength(4);
           case Primitive::kPrimLong:
-            *restrictions |= kNoDiv | kNoReduction;
+            *restrictions |= kNoDiv | kNoReduction | kNoSAD;
             return TrySetVectorLength(2);
           case Primitive::kPrimFloat:
             *restrictions |= kNoMinMax | kNoReduction;  // min/max(x, NaN)
@@ -1371,8 +1466,16 @@
     if (it != vector_permanent_map_->end()) {
       vector = it->second;  // reuse during unrolling
     } else {
-      vector = new (global_allocator_) HVecReplicateScalar(
-          global_allocator_, org, type, vector_length_);
+      // Generates ReplicateScalar( (optional_type_conv) org ).
+      HInstruction* input = org;
+      Primitive::Type input_type = input->GetType();
+      if (type != input_type && (type == Primitive::kPrimLong ||
+                                 input_type == Primitive::kPrimLong)) {
+        input = Insert(vector_preheader_,
+                       new (global_allocator_) HTypeConversion(type, input, kNoDexPc));
+      }
+      vector = new (global_allocator_)
+          HVecReplicateScalar(global_allocator_, input, type, vector_length_);
       vector_permanent_map_->Put(org, Insert(vector_preheader_, vector));
     }
     vector_map_->Put(org, vector);
@@ -1465,10 +1568,15 @@
   // Prepare the new initialization.
   if (vector_mode_ == kVector) {
     // Generate a [initial, 0, .., 0] vector.
-    new_init = Insert(
-            vector_preheader_,
-            new (global_allocator_) HVecSetScalars(
-                global_allocator_, &new_init, phi->GetType(), vector_length_, 1));
+    HVecOperation* red_vector = new_red->AsVecOperation();
+    size_t vector_length = red_vector->GetVectorLength();
+    Primitive::Type type = red_vector->GetPackedType();
+    new_init = Insert(vector_preheader_,
+                      new (global_allocator_) HVecSetScalars(global_allocator_,
+                                                             &new_init,
+                                                             type,
+                                                             vector_length,
+                                                             1));
   } else {
     new_init = ReduceAndExtractIfNeeded(new_init);
   }
@@ -1484,18 +1592,20 @@
   if (instruction->IsPhi()) {
     HInstruction* input = instruction->InputAt(1);
     if (input->IsVecOperation()) {
-      Primitive::Type type = input->AsVecOperation()->GetPackedType();
+      HVecOperation* input_vector = input->AsVecOperation();
+      size_t vector_length = input_vector->GetVectorLength();
+      Primitive::Type type = input_vector->GetPackedType();
+      HVecReduce::ReductionKind kind = GetReductionKind(input_vector);
       HBasicBlock* exit = instruction->GetBlock()->GetSuccessors()[0];
       // Generate a vector reduction and scalar extract
       //    x = REDUCE( [x_1, .., x_n] )
       //    y = x_1
       // along the exit of the defining loop.
-      HVecReduce::ReductionKind kind = GetReductionKind(input);
       HInstruction* reduce = new (global_allocator_) HVecReduce(
-          global_allocator_, instruction, type, vector_length_, kind);
+          global_allocator_, instruction, type, vector_length, kind);
       exit->InsertInstructionBefore(reduce, exit->GetFirstInstruction());
       instruction = new (global_allocator_) HVecExtractScalar(
-          global_allocator_, reduce, type, vector_length_, 0);
+          global_allocator_, reduce, type, vector_length, 0);
       exit->InsertInstructionAfter(instruction, reduce);
     }
   }
@@ -1516,27 +1626,19 @@
                                       HInstruction* opb,
                                       Primitive::Type type,
                                       bool is_unsigned) {
-  if (vector_mode_ == kSequential) {
-    // Non-converting scalar code follows implicit integral promotion.
-    if (!org->IsTypeConversion() && (type == Primitive::kPrimBoolean ||
-                                     type == Primitive::kPrimByte ||
-                                     type == Primitive::kPrimChar ||
-                                     type == Primitive::kPrimShort)) {
-      type = Primitive::kPrimInt;
-    }
-  }
   HInstruction* vector = nullptr;
+  Primitive::Type org_type = org->GetType();
   switch (org->GetKind()) {
     case HInstruction::kNeg:
       DCHECK(opb == nullptr);
       GENERATE_VEC(
           new (global_allocator_) HVecNeg(global_allocator_, opa, type, vector_length_),
-          new (global_allocator_) HNeg(type, opa));
+          new (global_allocator_) HNeg(org_type, opa));
     case HInstruction::kNot:
       DCHECK(opb == nullptr);
       GENERATE_VEC(
           new (global_allocator_) HVecNot(global_allocator_, opa, type, vector_length_),
-          new (global_allocator_) HNot(type, opa));
+          new (global_allocator_) HNot(org_type, opa));
     case HInstruction::kBooleanNot:
       DCHECK(opb == nullptr);
       GENERATE_VEC(
@@ -1546,47 +1648,47 @@
       DCHECK(opb == nullptr);
       GENERATE_VEC(
           new (global_allocator_) HVecCnv(global_allocator_, opa, type, vector_length_),
-          new (global_allocator_) HTypeConversion(type, opa, kNoDexPc));
+          new (global_allocator_) HTypeConversion(org_type, opa, kNoDexPc));
     case HInstruction::kAdd:
       GENERATE_VEC(
           new (global_allocator_) HVecAdd(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HAdd(type, opa, opb));
+          new (global_allocator_) HAdd(org_type, opa, opb));
     case HInstruction::kSub:
       GENERATE_VEC(
           new (global_allocator_) HVecSub(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HSub(type, opa, opb));
+          new (global_allocator_) HSub(org_type, opa, opb));
     case HInstruction::kMul:
       GENERATE_VEC(
           new (global_allocator_) HVecMul(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HMul(type, opa, opb));
+          new (global_allocator_) HMul(org_type, opa, opb));
     case HInstruction::kDiv:
       GENERATE_VEC(
           new (global_allocator_) HVecDiv(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HDiv(type, opa, opb, kNoDexPc));
+          new (global_allocator_) HDiv(org_type, opa, opb, kNoDexPc));
     case HInstruction::kAnd:
       GENERATE_VEC(
           new (global_allocator_) HVecAnd(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HAnd(type, opa, opb));
+          new (global_allocator_) HAnd(org_type, opa, opb));
     case HInstruction::kOr:
       GENERATE_VEC(
           new (global_allocator_) HVecOr(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HOr(type, opa, opb));
+          new (global_allocator_) HOr(org_type, opa, opb));
     case HInstruction::kXor:
       GENERATE_VEC(
           new (global_allocator_) HVecXor(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HXor(type, opa, opb));
+          new (global_allocator_) HXor(org_type, opa, opb));
     case HInstruction::kShl:
       GENERATE_VEC(
           new (global_allocator_) HVecShl(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HShl(type, opa, opb));
+          new (global_allocator_) HShl(org_type, opa, opb));
     case HInstruction::kShr:
       GENERATE_VEC(
           new (global_allocator_) HVecShr(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HShr(type, opa, opb));
+          new (global_allocator_) HShr(org_type, opa, opb));
     case HInstruction::kUShr:
       GENERATE_VEC(
           new (global_allocator_) HVecUShr(global_allocator_, opa, opb, type, vector_length_),
-          new (global_allocator_) HUShr(type, opa, opb));
+          new (global_allocator_) HUShr(org_type, opa, opb));
     case HInstruction::kInvokeStaticOrDirect: {
       HInvokeStaticOrDirect* invoke = org->AsInvokeStaticOrDirect();
       if (vector_mode_ == kVector) {
@@ -1667,8 +1769,8 @@
 //
 
 // Method recognizes the following idioms:
-//   rounding halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
-//   regular  halving add (a + b)     >> 1 for unsigned/signed operands a, b
+//   rounding  halving add (a + b + 1) >> 1 for unsigned/signed operands a, b
+//   truncated halving add (a + b)     >> 1 for unsigned/signed operands a, b
 // Provided that the operands are promoted to a wider form to do the arithmetic and
 // then cast back to narrower form, the idioms can be mapped into efficient SIMD
 // implementation that operates directly in narrower form (plus one extra bit).
@@ -1712,7 +1814,8 @@
       }
       // Accept recognized halving add for vectorizable operands. Vectorized code uses the
       // shorthand idiomatic operation. Sequential code uses the original scalar expressions.
-      DCHECK(r != nullptr && s != nullptr);
+      DCHECK(r != nullptr);
+      DCHECK(s != nullptr);
       if (generate_code && vector_mode_ != kVector) {  // de-idiom
         r = instruction->InputAt(0);
         s = instruction->InputAt(1);
@@ -1741,6 +1844,88 @@
   return false;
 }
 
+// Method recognizes the following idiom:
+//   q += ABS(a - b) for signed operands a, b
+// Provided that the operands have the same type or are promoted to a wider form.
+// Since this may involve a vector length change, the idiom is handled by going directly
+// to a sad-accumulate node (rather than relying on combining finer grained nodes later).
+// TODO: unsigned SAD too?
+bool HLoopOptimization::VectorizeSADIdiom(LoopNode* node,
+                                          HInstruction* instruction,
+                                          bool generate_code,
+                                          Primitive::Type reduction_type,
+                                          uint64_t restrictions) {
+  // Filter integral "q += ABS(a - b);" reduction, where ABS and SUB
+  // are done in the same precision (either int or long).
+  if (!instruction->IsAdd() ||
+      (reduction_type != Primitive::kPrimInt && reduction_type != Primitive::kPrimLong)) {
+    return false;
+  }
+  HInstruction* q = instruction->InputAt(0);
+  HInstruction* v = instruction->InputAt(1);
+  HInstruction* a = nullptr;
+  HInstruction* b = nullptr;
+  if (v->IsInvokeStaticOrDirect() &&
+       (v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsInt ||
+        v->AsInvokeStaticOrDirect()->GetIntrinsic() == Intrinsics::kMathAbsLong)) {
+    HInstruction* x = v->InputAt(0);
+    if (x->IsSub() && x->GetType() == reduction_type) {
+      a = x->InputAt(0);
+      b = x->InputAt(1);
+    }
+  }
+  if (a == nullptr || b == nullptr) {
+    return false;
+  }
+  // Accept same-type or consistent sign extension for narrower-type on operands a and b.
+  // The same-type or narrower operands are called r (a or lower) and s (b or lower).
+  HInstruction* r = a;
+  HInstruction* s = b;
+  bool is_unsigned = false;
+  Primitive::Type sub_type = a->GetType();
+  if (a->IsTypeConversion()) {
+    sub_type = a->InputAt(0)->GetType();
+  } else if (b->IsTypeConversion()) {
+    sub_type = b->InputAt(0)->GetType();
+  }
+  if (reduction_type != sub_type &&
+      (!IsNarrowerOperands(a, b, sub_type, &r, &s, &is_unsigned) || is_unsigned)) {
+    return false;
+  }
+  // Try same/narrower type and deal with vector restrictions.
+  if (!TrySetVectorType(sub_type, &restrictions) || HasVectorRestrictions(restrictions, kNoSAD)) {
+    return false;
+  }
+  // Accept SAD idiom for vectorizable operands. Vectorized code uses the shorthand
+  // idiomatic operation. Sequential code uses the original scalar expressions.
+  DCHECK(r != nullptr);
+  DCHECK(s != nullptr);
+  if (generate_code && vector_mode_ != kVector) {  // de-idiom
+    r = s = v->InputAt(0);
+  }
+  if (VectorizeUse(node, q, generate_code, sub_type, restrictions) &&
+      VectorizeUse(node, r, generate_code, sub_type, restrictions) &&
+      VectorizeUse(node, s, generate_code, sub_type, restrictions)) {
+    if (generate_code) {
+      if (vector_mode_ == kVector) {
+        vector_map_->Put(instruction, new (global_allocator_) HVecSADAccumulate(
+            global_allocator_,
+            vector_map_->Get(q),
+            vector_map_->Get(r),
+            vector_map_->Get(s),
+            reduction_type,
+            GetOtherVL(reduction_type, sub_type, vector_length_)));
+        MaybeRecordStat(stats_, MethodCompilationStat::kLoopVectorizedIdiom);
+      } else {
+        GenerateVecOp(v, vector_map_->Get(r), nullptr, reduction_type);
+        GenerateVecOp(instruction, vector_map_->Get(q), vector_map_->Get(v), reduction_type);
+      }
+    }
+    return true;
+  }
+  return false;
+}
+
 //
 // Vectorization heuristics.
 //
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index f347518..ae2ea76 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -75,6 +75,7 @@
     kNoMinMax        = 1 << 8,   // no min/max
     kNoStringCharAt  = 1 << 9,   // no StringCharAt
     kNoReduction     = 1 << 10,  // no reduction
+    kNoSAD           = 1 << 11,  // no sum of absolute differences (SAD)
   };
 
   /*
@@ -172,6 +173,11 @@
                                 bool generate_code,
                                 Primitive::Type type,
                                 uint64_t restrictions);
+  bool VectorizeSADIdiom(LoopNode* node,
+                         HInstruction* instruction,
+                         bool generate_code,
+                         Primitive::Type type,
+                         uint64_t restrictions);
 
   // Vectorization heuristics.
   bool IsVectorizationProfitable(int64_t trip_count);
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index a6d0da1..6bc5111 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1396,6 +1396,7 @@
   M(VecUShr, VecBinaryOperation)                                        \
   M(VecSetScalars, VecOperation)                                        \
   M(VecMultiplyAccumulate, VecOperation)                                \
+  M(VecSADAccumulate, VecOperation)                                     \
   M(VecLoad, VecMemoryOperation)                                        \
   M(VecStore, VecMemoryOperation)                                       \
 
diff --git a/compiler/optimizing/nodes_vector.h b/compiler/optimizing/nodes_vector.h
index 886d75e..1488b70 100644
--- a/compiler/optimizing/nodes_vector.h
+++ b/compiler/optimizing/nodes_vector.h
@@ -461,8 +461,8 @@
 };
 
 // Performs halving add on every component in the two vectors, viz.
-// rounded [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
-// or      [ x1, .. , xn ] hadd  [ y1, .. , yn ] = [ (x1 + y1)     >> 1, .. , (xn + yn )    >> 1 ]
+// rounded   [ x1, .. , xn ] hradd [ y1, .. , yn ] = [ (x1 + y1 + 1) >> 1, .. , (xn + yn + 1) >> 1 ]
+// truncated [ x1, .. , xn ] hadd  [ y1, .. , yn ] = [ (x1 + y1)     >> 1, .. , (xn + yn )    >> 1 ]
 // for signed operands x, y (sign extension) or unsigned operands x, y (zero extension).
 class HVecHalvingAdd FINAL : public HVecBinaryOperation {
  public:
@@ -810,12 +810,12 @@
 //
 
 // Assigns the given scalar elements to a vector,
-// viz. set( array(x1, .., xn) ) = [ x1, .. ,           xn ] if n == m,
-//      set( array(x1, .., xm) ) = [ x1, .. , xm, 0, .., 0 ] if m <  n.
+// viz. set( array(x1, .. , xn) ) = [ x1, .. ,            xn ] if n == m,
+//      set( array(x1, .. , xm) ) = [ x1, .. , xm, 0, .. , 0 ] if m <  n.
 class HVecSetScalars FINAL : public HVecOperation {
  public:
   HVecSetScalars(ArenaAllocator* arena,
-                 HInstruction** scalars,  // array
+                 HInstruction* scalars[],
                  Primitive::Type packed_type,
                  size_t vector_length,
                  size_t number_of_scalars,
@@ -827,7 +827,7 @@
                       vector_length,
                       dex_pc) {
     for (size_t i = 0; i < number_of_scalars; i++) {
-      DCHECK(!scalars[i]->IsVecOperation());
+      DCHECK(!scalars[i]->IsVecOperation() || scalars[i]->IsVecExtractScalar());
       SetRawInputAt(0, scalars[i]);
     }
   }
@@ -842,9 +842,8 @@
   DISALLOW_COPY_AND_ASSIGN(HVecSetScalars);
 };
 
-// Multiplies every component in the two vectors, adds the result vector to the accumulator vector.
-// viz. [ acc1, .., accn ] + [ x1, .. , xn ] * [ y1, .. , yn ] =
-//     [ acc1 + x1 * y1, .. , accn + xn * yn ].
+// Multiplies every component in the two vectors, adds the result vector to the accumulator vector,
+// viz. [ a1, .. , an ] + [ x1, .. , xn ] * [ y1, .. , yn ] = [ a1 + x1 * y1, .. , an + xn * yn ].
 class HVecMultiplyAccumulate FINAL : public HVecOperation {
  public:
   HVecMultiplyAccumulate(ArenaAllocator* arena,
@@ -866,15 +865,11 @@
     DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
     DCHECK(HasConsistentPackedTypes(mul_left, packed_type));
     DCHECK(HasConsistentPackedTypes(mul_right, packed_type));
-    SetRawInputAt(kInputAccumulatorIndex, accumulator);
-    SetRawInputAt(kInputMulLeftIndex, mul_left);
-    SetRawInputAt(kInputMulRightIndex, mul_right);
+    SetRawInputAt(0, accumulator);
+    SetRawInputAt(1, mul_left);
+    SetRawInputAt(2, mul_right);
   }
 
-  static constexpr int kInputAccumulatorIndex = 0;
-  static constexpr int kInputMulLeftIndex = 1;
-  static constexpr int kInputMulRightIndex = 2;
-
   bool CanBeMoved() const OVERRIDE { return true; }
 
   bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
@@ -894,6 +889,42 @@
   DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate);
 };
 
+// Takes the absolute difference of two vectors, and adds the results to
+// same-precision or wider-precision components in the accumulator,
+// viz. SAD([ a1, .. , am ], [ x1, .. , xn ], [ y1, .. , yn ]) =
+//          [ a1 + sum abs(xi-yi), .. , am + sum abs(xj-yj) ],
+//      for m <= n and non-overlapping sums.
+class HVecSADAccumulate FINAL : public HVecOperation {
+ public:
+  HVecSADAccumulate(ArenaAllocator* arena,
+                    HInstruction* accumulator,
+                    HInstruction* sad_left,
+                    HInstruction* sad_right,
+                    Primitive::Type packed_type,
+                    size_t vector_length,
+                    uint32_t dex_pc = kNoDexPc)
+      : HVecOperation(arena,
+                      packed_type,
+                      SideEffects::None(),
+                      /* number_of_inputs */ 3,
+                      vector_length,
+                      dex_pc) {
+    DCHECK(HasConsistentPackedTypes(accumulator, packed_type));
+    DCHECK(sad_left->IsVecOperation());
+    DCHECK(sad_right->IsVecOperation());
+    DCHECK_EQ(sad_left->AsVecOperation()->GetPackedType(),
+              sad_right->AsVecOperation()->GetPackedType());
+    SetRawInputAt(0, accumulator);
+    SetRawInputAt(1, sad_left);
+    SetRawInputAt(2, sad_right);
+  }
+
+  DECLARE_INSTRUCTION(VecSADAccumulate);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HVecSADAccumulate);
+};
+
 // Loads a vector from memory, viz. load(mem, 1)
 // yield the vector [ mem(1), .. , mem(n) ].
 class HVecLoad FINAL : public HVecMemoryOperation {
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 8dd2762..7451196 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -77,6 +77,7 @@
 #include "jit/jit_logger.h"
 #include "jni/quick/jni_compiler.h"
 #include "licm.h"
+#include "linker/linker_patch.h"
 #include "load_store_analysis.h"
 #include "load_store_elimination.h"
 #include "loop_optimization.h"
@@ -833,13 +834,13 @@
   RunArchOptimizations(driver->GetInstructionSet(), graph, codegen, pass_observer);
 }
 
-static ArenaVector<LinkerPatch> EmitAndSortLinkerPatches(CodeGenerator* codegen) {
-  ArenaVector<LinkerPatch> linker_patches(codegen->GetGraph()->GetArena()->Adapter());
+static ArenaVector<linker::LinkerPatch> EmitAndSortLinkerPatches(CodeGenerator* codegen) {
+  ArenaVector<linker::LinkerPatch> linker_patches(codegen->GetGraph()->GetArena()->Adapter());
   codegen->EmitLinkerPatches(&linker_patches);
 
   // Sort patches by literal offset. Required for .oat_patches encoding.
   std::sort(linker_patches.begin(), linker_patches.end(),
-            [](const LinkerPatch& lhs, const LinkerPatch& rhs) {
+            [](const linker::LinkerPatch& lhs, const linker::LinkerPatch& rhs) {
     return lhs.LiteralOffset() < rhs.LiteralOffset();
   });
 
@@ -851,7 +852,7 @@
                                          CodeGenerator* codegen,
                                          CompilerDriver* compiler_driver,
                                          const DexFile::CodeItem* code_item) const {
-  ArenaVector<LinkerPatch> linker_patches = EmitAndSortLinkerPatches(codegen);
+  ArenaVector<linker::LinkerPatch> linker_patches = EmitAndSortLinkerPatches(codegen);
   ArenaVector<uint8_t> stack_map(arena->Adapter(kArenaAllocStackMaps));
   ArenaVector<uint8_t> method_info(arena->Adapter(kArenaAllocStackMaps));
   size_t stack_map_size = 0;
@@ -876,7 +877,7 @@
       ArrayRef<const uint8_t>(method_info),
       ArrayRef<const uint8_t>(stack_map),
       ArrayRef<const uint8_t>(*codegen->GetAssembler()->cfi().data()),
-      ArrayRef<const LinkerPatch>(linker_patches));
+      ArrayRef<const linker::LinkerPatch>(linker_patches));
 
   return compiled_method;
 }
diff --git a/compiler/optimizing/ssa_liveness_analysis.cc b/compiler/optimizing/ssa_liveness_analysis.cc
index 185303b..754a762 100644
--- a/compiler/optimizing/ssa_liveness_analysis.cc
+++ b/compiler/optimizing/ssa_liveness_analysis.cc
@@ -474,7 +474,9 @@
   // For a SIMD operation, compute the number of needed spill slots.
   // TODO: do through vector type?
   HInstruction* definition = GetParent()->GetDefinedBy();
-  if (definition != nullptr && definition->IsVecOperation()) {
+  if (definition != nullptr &&
+      definition->IsVecOperation() &&
+      !definition->IsVecExtractScalar()) {
     return definition->AsVecOperation()->GetVectorNumberOfBytes() / kVRegSize;
   }
   // Return number of needed spill slots based on type.
diff --git a/compiler/utils/assembler_test.h b/compiler/utils/assembler_test.h
index 12954a4..227954e 100644
--- a/compiler/utils/assembler_test.h
+++ b/compiler/utils/assembler_test.h
@@ -114,6 +114,24 @@
         fmt);
   }
 
+  std::string Repeatww(void (Ass::*f)(Reg, Reg), const std::string& fmt) {
+    return RepeatTemplatedRegisters<Reg, Reg>(f,
+        GetRegisters(),
+        GetRegisters(),
+        &AssemblerTest::GetRegName<RegisterView::kUseTertiaryName>,
+        &AssemblerTest::GetRegName<RegisterView::kUseTertiaryName>,
+        fmt);
+  }
+
+  std::string Repeatbb(void (Ass::*f)(Reg, Reg), const std::string& fmt) {
+    return RepeatTemplatedRegisters<Reg, Reg>(f,
+        GetRegisters(),
+        GetRegisters(),
+        &AssemblerTest::GetRegName<RegisterView::kUseQuaternaryName>,
+        &AssemblerTest::GetRegName<RegisterView::kUseQuaternaryName>,
+        fmt);
+  }
+
   std::string RepeatRRR(void (Ass::*f)(Reg, Reg, Reg), const std::string& fmt) {
     return RepeatTemplatedRegisters<Reg, Reg, Reg>(f,
         GetRegisters(),
@@ -147,10 +165,18 @@
     return RepeatRegisterImm<RegisterView::kUsePrimaryName>(f, imm_bytes, fmt);
   }
 
-  std::string Repeatri(void (Ass::*f)(Reg, const Imm&), size_t imm_bytes, const std::string& fmt) {
+  std::string RepeatrI(void (Ass::*f)(Reg, const Imm&), size_t imm_bytes, const std::string& fmt) {
     return RepeatRegisterImm<RegisterView::kUseSecondaryName>(f, imm_bytes, fmt);
   }
 
+  std::string RepeatwI(void (Ass::*f)(Reg, const Imm&), size_t imm_bytes, const std::string& fmt) {
+    return RepeatRegisterImm<RegisterView::kUseTertiaryName>(f, imm_bytes, fmt);
+  }
+
+  std::string RepeatbI(void (Ass::*f)(Reg, const Imm&), size_t imm_bytes, const std::string& fmt) {
+    return RepeatRegisterImm<RegisterView::kUseQuaternaryName>(f, imm_bytes, fmt);
+  }
+
   template <typename Reg1, typename Reg2, typename ImmType>
   std::string RepeatTemplatedRegistersImmBits(void (Ass::*f)(Reg1, Reg2, ImmType),
                                               int imm_bits,
@@ -909,6 +935,63 @@
         fmt);
   }
 
+  // Repeats over secondary registers and addresses provided by fixture.
+  std::string RepeatrA(void (Ass::*f)(Reg, const Addr&), const std::string& fmt) {
+    return RepeatrA(f, GetAddresses(), fmt);
+  }
+
+  // Variant that takes explicit vector of addresses
+  // (to test restricted addressing modes set).
+  std::string RepeatrA(void (Ass::*f)(Reg, const Addr&),
+                       const std::vector<Addr>& a,
+                       const std::string& fmt) {
+    return RepeatTemplatedRegMem<Reg, Addr>(
+        f,
+        GetRegisters(),
+        a,
+        &AssemblerTest::GetRegName<RegisterView::kUseSecondaryName>,
+        &AssemblerTest::GetAddrName,
+        fmt);
+  }
+
+  // Repeats over tertiary registers and addresses provided by fixture.
+  std::string RepeatwA(void (Ass::*f)(Reg, const Addr&), const std::string& fmt) {
+    return RepeatwA(f, GetAddresses(), fmt);
+  }
+
+  // Variant that takes explicit vector of addresses
+  // (to test restricted addressing modes set).
+  std::string RepeatwA(void (Ass::*f)(Reg, const Addr&),
+                       const std::vector<Addr>& a,
+                       const std::string& fmt) {
+    return RepeatTemplatedRegMem<Reg, Addr>(
+        f,
+        GetRegisters(),
+        a,
+        &AssemblerTest::GetRegName<RegisterView::kUseTertiaryName>,
+        &AssemblerTest::GetAddrName,
+        fmt);
+  }
+
+  // Repeats over quaternary registers and addresses provided by fixture.
+  std::string RepeatbA(void (Ass::*f)(Reg, const Addr&), const std::string& fmt) {
+    return RepeatbA(f, GetAddresses(), fmt);
+  }
+
+  // Variant that takes explicit vector of addresses
+  // (to test restricted addressing modes set).
+  std::string RepeatbA(void (Ass::*f)(Reg, const Addr&),
+                       const std::vector<Addr>& a,
+                       const std::string& fmt) {
+    return RepeatTemplatedRegMem<Reg, Addr>(
+        f,
+        GetRegisters(),
+        a,
+        &AssemblerTest::GetRegName<RegisterView::kUseQuaternaryName>,
+        &AssemblerTest::GetAddrName,
+        fmt);
+  }
+
   // Repeats over fp-registers and addresses provided by fixture.
   std::string RepeatFA(void (Ass::*f)(FPReg, const Addr&), const std::string& fmt) {
     return RepeatFA(f, GetAddresses(), fmt);
@@ -947,6 +1030,63 @@
         fmt);
   }
 
+  // Repeats over all addresses returned by GetAddresses() and secondary register names.
+  std::string RepeatAr(void (Ass::*f)(const Addr&, Reg), const std::string& fmt) {
+    return RepeatAr(f, GetAddresses(), fmt);
+  }
+
+  // Variant that takes an explicit vector of addresses
+  // (to test a restricted set of addressing modes).
+  std::string RepeatAr(void (Ass::*f)(const Addr&, Reg),
+                       const std::vector<Addr>& a,
+                       const std::string& fmt) {
+    return RepeatTemplatedMemReg<Addr, Reg>(
+        f,
+        a,
+        GetRegisters(),
+        &AssemblerTest::GetAddrName,
+        &AssemblerTest::GetRegName<RegisterView::kUseSecondaryName>,
+        fmt);
+  }
+
+  // Repeats over all addresses returned by GetAddresses() and tertiary register names.
+  std::string RepeatAw(void (Ass::*f)(const Addr&, Reg), const std::string& fmt) {
+    return RepeatAw(f, GetAddresses(), fmt);
+  }
+
+  // Variant that takes an explicit vector of addresses
+  // (to test a restricted set of addressing modes).
+  std::string RepeatAw(void (Ass::*f)(const Addr&, Reg),
+                       const std::vector<Addr>& a,
+                       const std::string& fmt) {
+    return RepeatTemplatedMemReg<Addr, Reg>(
+        f,
+        a,
+        GetRegisters(),
+        &AssemblerTest::GetAddrName,
+        &AssemblerTest::GetRegName<RegisterView::kUseTertiaryName>,
+        fmt);
+  }
+
+  // Repeats over all addresses returned by GetAddresses() and quaternary register names.
+  std::string RepeatAb(void (Ass::*f)(const Addr&, Reg), const std::string& fmt) {
+    return RepeatAb(f, GetAddresses(), fmt);
+  }
+
+  // Variant that takes an explicit vector of addresses
+  // (to test a restricted set of addressing modes).
+  std::string RepeatAb(void (Ass::*f)(const Addr&, Reg),
+                       const std::vector<Addr>& a,
+                       const std::string& fmt) {
+    return RepeatTemplatedMemReg<Addr, Reg>(
+        f,
+        a,
+        GetRegisters(),
+        &AssemblerTest::GetAddrName,
+        &AssemblerTest::GetRegName<RegisterView::kUseQuaternaryName>,
+        fmt);
+  }
+
   // Repeats over addresses and fp-registers provided by fixture.
   std::string RepeatAF(void (Ass::*f)(const Addr&, FPReg), const std::string& fmt) {
     return RepeatAF(f, GetAddresses(), fmt);
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 3162a32..9fcede5 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -35,25 +35,25 @@
 std::ostream& operator<<(std::ostream& os, const Address& addr) {
   switch (addr.mod()) {
     case 0:
-      if (addr.rm() == ESP && addr.index() != ESP) {
-        return os << "(%" << addr.base() << ",%"
-                  << addr.index() << "," << (1 << addr.scale()) << ")";
+      if (addr.rm() != ESP || addr.index() == ESP) {
+        return os << "(%" << addr.rm() << ")";
+      } else if (addr.base() == EBP) {
+        return os << static_cast<int>(addr.disp32()) << "(,%" << addr.index()
+                  << "," << (1 << addr.scale()) << ")";
       }
-      return os << "(%" << addr.rm() << ")";
+      return os << "(%" << addr.base() << ",%" << addr.index() << "," << (1 << addr.scale()) << ")";
     case 1:
-      if (addr.rm() == ESP && addr.index() != ESP) {
-        return os << static_cast<int>(addr.disp8())
-                  << "(%" << addr.base() << ",%"
-                  << addr.index() << "," << (1 << addr.scale()) << ")";
+      if (addr.rm() != ESP || addr.index() == ESP) {
+        return os << static_cast<int>(addr.disp8()) << "(%" << addr.rm() << ")";
       }
-      return os << static_cast<int>(addr.disp8()) << "(%" << addr.rm() << ")";
+      return os << static_cast<int>(addr.disp8()) << "(%" << addr.base() << ",%"
+                << addr.index() << "," << (1 << addr.scale()) << ")";
     case 2:
-      if (addr.rm() == ESP && addr.index() != ESP) {
-        return os << static_cast<int>(addr.disp32())
-                  << "(%" << addr.base() << ",%"
-                  << addr.index() << "," << (1 << addr.scale()) << ")";
+      if (addr.rm() != ESP || addr.index() == ESP) {
+        return os << static_cast<int>(addr.disp32()) << "(%" << addr.rm() << ")";
       }
-      return os << static_cast<int>(addr.disp32()) << "(%" << addr.rm() << ")";
+      return os << static_cast<int>(addr.disp32()) << "(%" << addr.base() << ",%"
+                << addr.index() << "," << (1 << addr.scale()) << ")";
     default:
       return os << "<address?>";
   }
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index c28ed3b..cccde37 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -148,21 +148,14 @@
 };
 
 //
-// Test repeat drivers used in the tests.
+// Test some repeat drivers used in the tests.
 //
 
 TEST_F(AssemblerX86Test, RepeatRR) {
-  EXPECT_EQ("%eax %eax\n%eax %ebx\n%eax %ecx\n%eax %edx\n%eax %ebp\n%eax %esp\n%eax %esi\n"
-            "%eax %edi\n%ebx %eax\n%ebx %ebx\n%ebx %ecx\n%ebx %edx\n%ebx %ebp\n%ebx %esp\n"
-            "%ebx %esi\n%ebx %edi\n%ecx %eax\n%ecx %ebx\n%ecx %ecx\n%ecx %edx\n%ecx %ebp\n"
-            "%ecx %esp\n%ecx %esi\n%ecx %edi\n%edx %eax\n%edx %ebx\n%edx %ecx\n%edx %edx\n"
-            "%edx %ebp\n%edx %esp\n%edx %esi\n%edx %edi\n%ebp %eax\n%ebp %ebx\n%ebp %ecx\n"
-            "%ebp %edx\n%ebp %ebp\n%ebp %esp\n%ebp %esi\n%ebp %edi\n%esp %eax\n%esp %ebx\n"
-            "%esp %ecx\n%esp %edx\n%esp %ebp\n%esp %esp\n%esp %esi\n%esp %edi\n%esi %eax\n"
-            "%esi %ebx\n%esi %ecx\n%esi %edx\n%esi %ebp\n%esi %esp\n%esi %esi\n%esi %edi\n"
-            "%edi %eax\n%edi %ebx\n%edi %ecx\n%edi %edx\n%edi %ebp\n%edi %esp\n%edi %esi\n"
-            "%edi %edi\n",
-            RepeatRR(/*f*/ nullptr, "%{reg1} %{reg2}"));
+  EXPECT_NE(RepeatRR(/*f*/ nullptr, "%{reg1} %{reg2}")
+            .find("%eax %eax\n%eax %ebx\n%eax %ecx\n%eax %edx\n%eax %ebp\n%eax %esp\n%eax %esi\n"
+                  "%eax %edi\n%ebx %eax\n%ebx %ebx\n%ebx %ecx\n%ebx %edx\n%ebx %ebp\n%ebx %esp\n"),
+            std::string::npos);
 }
 
 TEST_F(AssemblerX86Test, RepeatRI) {
@@ -173,18 +166,10 @@
 }
 
 TEST_F(AssemblerX86Test, RepeatFF) {
-  EXPECT_EQ("%XMM0 %XMM0\n%XMM0 %XMM1\n%XMM0 %XMM2\n%XMM0 %XMM3\n%XMM0 %XMM4\n%XMM0 %XMM5\n"
-            "%XMM0 %XMM6\n%XMM0 %XMM7\n%XMM1 %XMM0\n%XMM1 %XMM1\n%XMM1 %XMM2\n%XMM1 %XMM3\n"
-            "%XMM1 %XMM4\n%XMM1 %XMM5\n%XMM1 %XMM6\n%XMM1 %XMM7\n%XMM2 %XMM0\n%XMM2 %XMM1\n"
-            "%XMM2 %XMM2\n%XMM2 %XMM3\n%XMM2 %XMM4\n%XMM2 %XMM5\n%XMM2 %XMM6\n%XMM2 %XMM7\n"
-            "%XMM3 %XMM0\n%XMM3 %XMM1\n%XMM3 %XMM2\n%XMM3 %XMM3\n%XMM3 %XMM4\n%XMM3 %XMM5\n"
-            "%XMM3 %XMM6\n%XMM3 %XMM7\n%XMM4 %XMM0\n%XMM4 %XMM1\n%XMM4 %XMM2\n%XMM4 %XMM3\n"
-            "%XMM4 %XMM4\n%XMM4 %XMM5\n%XMM4 %XMM6\n%XMM4 %XMM7\n%XMM5 %XMM0\n%XMM5 %XMM1\n"
-            "%XMM5 %XMM2\n%XMM5 %XMM3\n%XMM5 %XMM4\n%XMM5 %XMM5\n%XMM5 %XMM6\n%XMM5 %XMM7\n"
-            "%XMM6 %XMM0\n%XMM6 %XMM1\n%XMM6 %XMM2\n%XMM6 %XMM3\n%XMM6 %XMM4\n%XMM6 %XMM5\n"
-            "%XMM6 %XMM6\n%XMM6 %XMM7\n%XMM7 %XMM0\n%XMM7 %XMM1\n%XMM7 %XMM2\n%XMM7 %XMM3\n"
-            "%XMM7 %XMM4\n%XMM7 %XMM5\n%XMM7 %XMM6\n%XMM7 %XMM7\n",
-            RepeatFF(/*f*/ nullptr, "%{reg1} %{reg2}"));
+  EXPECT_NE(RepeatFF(/*f*/ nullptr, "%{reg1} %{reg2}")
+            .find("%XMM0 %XMM0\n%XMM0 %XMM1\n%XMM0 %XMM2\n%XMM0 %XMM3\n%XMM0 %XMM4\n%XMM0 %XMM5\n"
+                  "%XMM0 %XMM6\n%XMM0 %XMM7\n%XMM1 %XMM0\n%XMM1 %XMM1\n%XMM1 %XMM2\n%XMM1 %XMM3\n"),
+            std::string::npos);
 }
 
 TEST_F(AssemblerX86Test, RepeatFFI) {
@@ -235,6 +220,36 @@
 // Actual x86 instruction assembler tests.
 //
 
+TEST_F(AssemblerX86Test, PoplAllAddresses) {
+  // Make sure all addressing mode combinations are tested at least once.
+  std::vector<x86::Address> all_addresses;
+  for (x86::Register* base : GetRegisters()) {
+    // Base only, with negative, zero, small and large displacements.
+    all_addresses.push_back(x86::Address(*base, -1));
+    all_addresses.push_back(x86::Address(*base, 0));
+    all_addresses.push_back(x86::Address(*base, 1));
+    all_addresses.push_back(x86::Address(*base, 123456789));
+    for (x86::Register* index : GetRegisters()) {
+      if (*index == x86::ESP) {
+        // Index cannot be ESP.
+        continue;
+      } else if (*base == *index) {
+        // Index only; added once per index register (on the base == index iteration).
+        all_addresses.push_back(x86::Address(*index, x86::TIMES_1, -1));
+        all_addresses.push_back(x86::Address(*index, x86::TIMES_2, 0));
+        all_addresses.push_back(x86::Address(*index, x86::TIMES_4, 1));
+        all_addresses.push_back(x86::Address(*index, x86::TIMES_8, 123456789));
+      }
+      // Base and index, one address per scale factor.
+      all_addresses.push_back(x86::Address(*base, *index, x86::TIMES_1, -1));
+      all_addresses.push_back(x86::Address(*base, *index, x86::TIMES_2, 0));
+      all_addresses.push_back(x86::Address(*base, *index, x86::TIMES_4, 1));
+      all_addresses.push_back(x86::Address(*base, *index, x86::TIMES_8, 123456789));
+    }
+  }
+  DriverStr(RepeatA(&x86::X86Assembler::popl, all_addresses, "popl {mem}"), "popl");
+}
+
 TEST_F(AssemblerX86Test, Movl) {
   DriverStr(RepeatRR(&x86::X86Assembler::movl, "movl %{reg2}, %{reg1}"), "movl");
 }
@@ -370,7 +385,7 @@
 }
 
 TEST_F(AssemblerX86Test, RorlImm) {
-  DriverStr(RepeatRI(&x86::X86Assembler::rorl, 1U, "rorl ${imm}, %{reg}"), "rorli");
+  DriverStr(RepeatRI(&x86::X86Assembler::rorl, /*imm_bytes*/ 1U, "rorl ${imm}, %{reg}"), "rorli");
 }
 
 // Roll only allows CL as the shift count.
@@ -390,7 +405,7 @@
 }
 
 TEST_F(AssemblerX86Test, RollImm) {
-  DriverStr(RepeatRI(&x86::X86Assembler::roll, 1U, "roll ${imm}, %{reg}"), "rolli");
+  DriverStr(RepeatRI(&x86::X86Assembler::roll, /*imm_bytes*/ 1U, "roll ${imm}, %{reg}"), "rolli");
 }
 
 TEST_F(AssemblerX86Test, Cvtdq2ps) {
@@ -418,12 +433,12 @@
 }
 
 TEST_F(AssemblerX86Test, RoundSS) {
-  DriverStr(RepeatFFI(&x86::X86Assembler::roundss, 1U,
+  DriverStr(RepeatFFI(&x86::X86Assembler::roundss, /*imm_bytes*/ 1U,
                       "roundss ${imm}, %{reg2}, %{reg1}"), "roundss");
 }
 
 TEST_F(AssemblerX86Test, RoundSD) {
-  DriverStr(RepeatFFI(&x86::X86Assembler::roundsd, 1U,
+  DriverStr(RepeatFFI(&x86::X86Assembler::roundsd, /*imm_bytes*/ 1U,
                       "roundsd ${imm}, %{reg2}, %{reg1}"), "roundsd");
 }
 
@@ -896,7 +911,15 @@
 }
 
 TEST_F(AssemblerX86Test, Cmpb) {
-  DriverStr(RepeatAI(&x86::X86Assembler::cmpb, /*imm_bytes*/ 1U, "cmpb ${imm}, {mem}"), "cmpb");
+  DriverStr(RepeatAI(&x86::X86Assembler::cmpb,
+                     /*imm_bytes*/ 1U,
+                     "cmpb ${imm}, {mem}"), "cmpb");
+}
+
+TEST_F(AssemblerX86Test, Cmpw) {
+  DriverStr(RepeatAI(&x86::X86Assembler::cmpw,
+                     /*imm_bytes*/ 1U,
+                     "cmpw ${imm}, {mem}"), "cmpw");  // TODO: only imm8?
 }
 
 }  // namespace art
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 3bff67d..51f61ca 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -36,6 +36,34 @@
   return os << "ST" << static_cast<int>(reg);
 }
 
+std::ostream& operator<<(std::ostream& os, const Address& addr) {
+  // True when rm() is RSP and the index register is not RSP: the base+index form is printed.
+  auto has_index = [&addr] { return addr.rm() == RSP && addr.cpu_index().AsRegister() != RSP; };
+  // Writes the "(%base,%index,scale)" text shared by every displacement size.
+  auto base_index = [&os, &addr]() -> std::ostream& {
+    return os << "(%" << addr.cpu_base() << ",%"
+              << addr.cpu_index() << "," << (1 << addr.scale()) << ")";
+  };
+  switch (addr.mod()) {
+    case 0:
+      if (!has_index()) {
+        return os << "(%" << addr.cpu_rm() << ")";
+      } else if (addr.base() == RBP) {
+        return os << static_cast<int>(addr.disp32()) << "(,%" << addr.cpu_index()
+                  << "," << (1 << addr.scale()) << ")";
+      }
+      return base_index();
+    case 1:
+      os << static_cast<int>(addr.disp8());
+      return has_index() ? base_index() : os << "(%" << addr.cpu_rm() << ")";
+    case 2:
+      os << static_cast<int>(addr.disp32());
+      return has_index() ? base_index() : os << "(%" << addr.cpu_rm() << ")";
+    default:
+      return os << "<address?>";
+  }
+}
+
 void X86_64Assembler::call(CpuRegister reg) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(reg);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index fc0839b5a8..1130444 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -80,6 +80,21 @@
     return static_cast<Register>(encoding_at(1) & 7);
   }
 
+  CpuRegister cpu_rm() const {
+    const bool upper = (rex_ & 1) != 0;  // Bit 0 of rex_ set: add the R8 offset to rm().
+    return static_cast<CpuRegister>(rm() + (upper ? x86_64::R8 : x86_64::RAX));
+  }
+
+  CpuRegister cpu_index() const {
+    const bool upper = (rex_ & 2) != 0;  // Bit 1 of rex_ set: add the R8 offset to index().
+    return static_cast<CpuRegister>(index() + (upper ? x86_64::R8 : x86_64::RAX));
+  }
+
+  CpuRegister cpu_base() const {
+    const bool upper = (rex_ & 1) != 0;  // Bit 0 of rex_ set: add the R8 offset to base().
+    return static_cast<CpuRegister>(base() + (upper ? x86_64::R8 : x86_64::RAX));
+  }
+
   uint8_t rex() const {
     return rex_;
   }
@@ -268,6 +283,7 @@
   Address() {}
 };
 
+std::ostream& operator<<(std::ostream& os, const Address& addr);
 
 /**
  * Class to handle constant area values.
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 3e6110d..aff8871 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -153,6 +153,55 @@
   }
 
   void SetUpHelpers() OVERRIDE {
+    if (addresses_singleton_.size() == 0) {
+      // One addressing mode to test the repeat drivers.
+      addresses_singleton_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RAX),
+                          x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_1, -1));
+    }
+
+    if (addresses_.size() == 0) {
+      // Several addressing modes.
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
+                          x86_64::CpuRegister(x86_64::RAX), x86_64::TIMES_1, 15));
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
+                          x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_2, 16));
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
+                          x86_64::CpuRegister(x86_64::RCX), x86_64::TIMES_4, 17));
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
+                          x86_64::CpuRegister(x86_64::RDX), x86_64::TIMES_8, 18));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::RAX), -1));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::RBX), 0));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::RSI), 1));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::RDI), 987654321));
+      // Several addressing modes with the special RSP.
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RSP),
+                          x86_64::CpuRegister(x86_64::RAX), x86_64::TIMES_1, 15));
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RSP),
+                          x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_2, 16));
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RSP),
+                          x86_64::CpuRegister(x86_64::RCX), x86_64::TIMES_4, 17));
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::RSP),
+                          x86_64::CpuRegister(x86_64::RDX), x86_64::TIMES_8, 18));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), -1));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 0));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 1));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 987654321));
+      // Several addressing modes with the higher registers.
+      addresses_.push_back(
+          x86_64::Address(x86_64::CpuRegister(x86_64::R8),
+                          x86_64::CpuRegister(x86_64::R15), x86_64::TIMES_2, -1));
+      addresses_.push_back(x86_64::Address(x86_64::CpuRegister(x86_64::R15), 123456789));
+    }
+
     if (registers_.size() == 0) {
       registers_.push_back(new x86_64::CpuRegister(x86_64::RAX));
       registers_.push_back(new x86_64::CpuRegister(x86_64::RBX));
@@ -248,8 +297,7 @@
   }
 
   std::vector<x86_64::Address> GetAddresses() {
-    UNIMPLEMENTED(FATAL) << "Feature not implemented yet";
-    UNREACHABLE();
+    return addresses_;
   }
 
   std::vector<x86_64::CpuRegister*> GetRegisters() OVERRIDE {
@@ -279,29 +327,31 @@
     return quaternary_register_names_[reg];
   }
 
+  std::vector<x86_64::Address> addresses_singleton_;
+
  private:
+  std::vector<x86_64::Address> addresses_;
   std::vector<x86_64::CpuRegister*> registers_;
   std::map<x86_64::CpuRegister, std::string, X86_64CpuRegisterCompare> secondary_register_names_;
   std::map<x86_64::CpuRegister, std::string, X86_64CpuRegisterCompare> tertiary_register_names_;
   std::map<x86_64::CpuRegister, std::string, X86_64CpuRegisterCompare> quaternary_register_names_;
-
   std::vector<x86_64::XmmRegister*> fp_registers_;
 };
 
 //
-// Test repeat drivers used in the tests.
+// Test some repeat drivers used in the tests.
 //
 
 TEST_F(AssemblerX86_64Test, RepeatI4) {
-  EXPECT_EQ("%0\n%-1\n%18\n%4660\n%-4660\n%305419896\n%-305419896\n",
-            RepeatI(/*f*/ nullptr, /*imm_bytes*/ 4U, "%{imm}"));
+  EXPECT_EQ("$0\n$-1\n$18\n$4660\n$-4660\n$305419896\n$-305419896\n",
+            RepeatI(/*f*/ nullptr, /*imm_bytes*/ 4U, "${imm}"));
 }
 
 TEST_F(AssemblerX86_64Test, RepeatI8) {
-  EXPECT_EQ("%0\n%-1\n%18\n%4660\n%-4660\n%305419896\n%-305419896\n"
-            "%20015998343868\n%-20015998343868\n%1311768467463790320\n"
-            "%-1311768467463790320\n",
-            RepeatI(/*f*/ nullptr, /*imm_bytes*/ 8U, "%{imm}"));
+  EXPECT_EQ("$0\n$-1\n$18\n$4660\n$-4660\n$305419896\n$-305419896\n"
+            "$20015998343868\n$-20015998343868\n$1311768467463790320\n"
+            "$-1311768467463790320\n",
+            RepeatI(/*f*/ nullptr, /*imm_bytes*/ 8U, "${imm}"));
 }
 
 TEST_F(AssemblerX86_64Test, Repeatr) {
@@ -310,10 +360,10 @@
             Repeatr(/*f*/ nullptr, "%{reg}"));
 }
 
-TEST_F(AssemblerX86_64Test, Repeatri) {
-  EXPECT_NE(Repeatri(/*f*/ nullptr, /*imm_bytes*/ 1U, "%{reg} %{imm}").
-            find("%eax %0\n%eax %-1\n%eax %18\n%ebx %0\n%ebx %-1\n%ebx %18\n"
-                 "%ecx %0\n%ecx %-1\n%ecx %18\n%edx %0\n%edx %-1\n%edx %18\n"),
+TEST_F(AssemblerX86_64Test, RepeatrI) {
+  EXPECT_NE(RepeatrI(/*f*/ nullptr, /*imm_bytes*/ 1U, "%{reg} ${imm}").
+            find("%eax $0\n%eax $-1\n%eax $18\n%ebx $0\n%ebx $-1\n%ebx $18\n"
+                 "%ecx $0\n%ecx $-1\n%ecx $18\n%edx $0\n%edx $-1\n%edx $18\n"),
             std::string::npos);
 }
 
@@ -334,10 +384,7 @@
 TEST_F(AssemblerX86_64Test, RepeatrF) {
   EXPECT_NE(RepeatrF(/*f*/ nullptr, "%{reg1} %{reg2}")
             .find("%eax %xmm0\n%eax %xmm1\n%eax %xmm2\n%eax %xmm3\n"
-                  "%eax %xmm4\n%eax %xmm5\n%eax %xmm6\n%eax %xmm7\n"
-                  "%eax %xmm8\n%eax %xmm9\n%eax %xmm10\n%eax %xmm11\n"
-                  "%eax %xmm12\n%eax %xmm13\n%eax %xmm14\n%eax %xmm15\n"
-                  "%ebx %xmm0\n%ebx %xmm1\n%ebx %xmm2\n%ebx %xmm3\n%ebx %xmm4\n"),
+                  "%eax %xmm4\n%eax %xmm5\n%eax %xmm6\n%eax %xmm7\n"),
             std::string::npos);
 }
 
@@ -348,59 +395,103 @@
 }
 
 TEST_F(AssemblerX86_64Test, RepeatRI) {
-  EXPECT_EQ("%rax %0\n%rax %-1\n%rax %18\n%rbx %0\n%rbx %-1\n%rbx %18\n"
-            "%rcx %0\n%rcx %-1\n%rcx %18\n%rdx %0\n%rdx %-1\n%rdx %18\n"
-            "%rbp %0\n%rbp %-1\n%rbp %18\n%rsp %0\n%rsp %-1\n%rsp %18\n"
-            "%rsi %0\n%rsi %-1\n%rsi %18\n%rdi %0\n%rdi %-1\n%rdi %18\n"
-            "%r8 %0\n%r8 %-1\n%r8 %18\n%r9 %0\n%r9 %-1\n%r9 %18\n"
-            "%r10 %0\n%r10 %-1\n%r10 %18\n%r11 %0\n%r11 %-1\n%r11 %18\n"
-            "%r12 %0\n%r12 %-1\n%r12 %18\n%r13 %0\n%r13 %-1\n%r13 %18\n"
-            "%r14 %0\n%r14 %-1\n%r14 %18\n%r15 %0\n%r15 %-1\n%r15 %18\n",
-            RepeatRI(/*f*/ nullptr, /*imm_bytes*/ 1U, "%{reg} %{imm}"));
+  EXPECT_NE(RepeatRI(/*f*/ nullptr, /*imm_bytes*/ 1U, "%{reg} ${imm}")
+            .find("%rax $0\n%rax $-1\n%rax $18\n%rbx $0\n%rbx $-1\n%rbx $18\n"
+                  "%rcx $0\n%rcx $-1\n%rcx $18\n%rdx $0\n%rdx $-1\n%rdx $18\n"),
+            std::string::npos);
 }
 
 TEST_F(AssemblerX86_64Test, RepeatRr) {
   EXPECT_NE(RepeatRr(/*f*/ nullptr, "%{reg1} %{reg2}")
             .find("%rax %eax\n%rax %ebx\n%rax %ecx\n%rax %edx\n%rax %ebp\n"
-                  "%rax %esp\n%rax %esi\n%rax %edi\n%rax %r8d\n%rax %r9d\n"
-                  "%rax %r10d\n%rax %r11d\n%rax %r12d\n%rax %r13d\n%rax %r14d\n"
-                  "%rax %r15d\n%rbx %eax\n%rbx %ebx\n%rbx %ecx\n%rbx %edx\n"),
+                  "%rax %esp\n%rax %esi\n%rax %edi\n%rax %r8d\n%rax %r9d\n"),
             std::string::npos);
 }
 
 TEST_F(AssemblerX86_64Test, RepeatRR) {
   EXPECT_NE(RepeatRR(/*f*/ nullptr, "%{reg1} %{reg2}")
             .find("%rax %rax\n%rax %rbx\n%rax %rcx\n%rax %rdx\n%rax %rbp\n"
-                  "%rax %rsp\n%rax %rsi\n%rax %rdi\n%rax %r8\n%rax %r9\n"
-                  "%rax %r10\n%rax %r11\n%rax %r12\n%rax %r13\n%rax %r14\n"
-                  "%rax %r15\n%rbx %rax\n%rbx %rbx\n%rbx %rcx\n%rbx %rdx\n"),
+                  "%rax %rsp\n%rax %rsi\n%rax %rdi\n%rax %r8\n%rax %r9\n"),
             std::string::npos);
 }
 
 TEST_F(AssemblerX86_64Test, RepeatRF) {
   EXPECT_NE(RepeatRF(/*f*/ nullptr, "%{reg1} %{reg2}")
             .find("%rax %xmm0\n%rax %xmm1\n%rax %xmm2\n%rax %xmm3\n%rax %xmm4\n"
-                  "%rax %xmm5\n%rax %xmm6\n%rax %xmm7\n%rax %xmm8\n%rax %xmm9\n"
-                  "%rax %xmm10\n%rax %xmm11\n%rax %xmm12\n%rax %xmm13\n%rax %xmm14\n"
-                  "%rax %xmm15\n%rbx %xmm0\n%rbx %xmm1\n%rbx %xmm2\n%rbx %xmm3\n"),
+                  "%rax %xmm5\n%rax %xmm6\n%rax %xmm7\n%rax %xmm8\n%rax %xmm9\n"),
             std::string::npos);
 }
 
 TEST_F(AssemblerX86_64Test, RepeatFF) {
   EXPECT_NE(RepeatFF(/*f*/ nullptr, "%{reg1} %{reg2}")
             .find("%xmm0 %xmm0\n%xmm0 %xmm1\n%xmm0 %xmm2\n%xmm0 %xmm3\n%xmm0 %xmm4\n"
-                  "%xmm0 %xmm5\n%xmm0 %xmm6\n%xmm0 %xmm7\n%xmm0 %xmm8\n%xmm0 %xmm9\n"
-                  "%xmm0 %xmm10\n%xmm0 %xmm11\n%xmm0 %xmm12\n%xmm0 %xmm13\n%xmm0 %xmm14\n"
-                  "%xmm0 %xmm15\n%xmm1 %xmm0\n%xmm1 %xmm1\n%xmm1 %xmm2\n%xmm1 %xmm3\n"),
+                  "%xmm0 %xmm5\n%xmm0 %xmm6\n%xmm0 %xmm7\n%xmm0 %xmm8\n%xmm0 %xmm9\n"),
             std::string::npos);
 }
 
 TEST_F(AssemblerX86_64Test, RepeatFFI) {
-  EXPECT_NE(RepeatFFI(/*f*/ nullptr, /*imm_bytes*/ 1U, "%{reg1} %{reg2} %{imm}")
-            .find("%xmm0 %xmm0 %0\n%xmm0 %xmm0 %-1\n%xmm0 %xmm0 %18\n"
-                  "%xmm0 %xmm1 %0\n%xmm0 %xmm1 %-1\n%xmm0 %xmm1 %18\n"
-                  "%xmm0 %xmm2 %0\n%xmm0 %xmm2 %-1\n%xmm0 %xmm2 %18\n"
-                  "%xmm0 %xmm3 %0\n%xmm0 %xmm3 %-1\n%xmm0 %xmm3 %18\n"),
+  EXPECT_NE(RepeatFFI(/*f*/ nullptr, /*imm_bytes*/ 1U, "%{reg1} %{reg2} ${imm}")
+            .find("%xmm0 %xmm0 $0\n%xmm0 %xmm0 $-1\n%xmm0 %xmm0 $18\n"
+                  "%xmm0 %xmm1 $0\n%xmm0 %xmm1 $-1\n%xmm0 %xmm1 $18\n"),
+            std::string::npos);
+}
+
+TEST_F(AssemblerX86_64Test, RepeatA) {
+  EXPECT_EQ("-1(%rax,%rbx,1)\n", RepeatA(/*f*/ nullptr, addresses_singleton_, "{mem}"));
+}
+
+TEST_F(AssemblerX86_64Test, RepeatAFull) {
+  EXPECT_EQ("15(%rdi,%rax,1)\n16(%rdi,%rbx,2)\n17(%rdi,%rcx,4)\n18(%rdi,%rdx,8)\n"
+            "-1(%rax)\n(%rbx)\n1(%rsi)\n987654321(%rdi)\n15(%rsp,%rax,1)\n"
+            "16(%rsp,%rbx,2)\n17(%rsp,%rcx,4)\n18(%rsp,%rdx,8)\n-1(%rsp)\n"
+            "(%rsp)\n1(%rsp)\n987654321(%rsp)\n-1(%r8,%r15,2)\n123456789(%r15)\n",
+            RepeatA(/*f*/ nullptr, "{mem}"));
+}
+
+TEST_F(AssemblerX86_64Test, RepeatAI) {
+  EXPECT_EQ("-1(%rax,%rbx,1) $0\n-1(%rax,%rbx,1) $-1\n-1(%rax,%rbx,1) $18\n",
+            RepeatAI(/*f*/ nullptr, /*imm_bytes*/ 1U, addresses_singleton_, "{mem} ${imm}"));
+}
+
+TEST_F(AssemblerX86_64Test, RepeatRA) {
+  EXPECT_NE(RepeatRA(/*f*/ nullptr, addresses_singleton_, "%{reg} {mem}")
+            .find("%rax -1(%rax,%rbx,1)\n%rbx -1(%rax,%rbx,1)\n%rcx -1(%rax,%rbx,1)\n"
+                  "%rdx -1(%rax,%rbx,1)\n%rbp -1(%rax,%rbx,1)\n%rsp -1(%rax,%rbx,1)\n"),
+            std::string::npos);
+}
+
+TEST_F(AssemblerX86_64Test, RepeatrA) {
+  EXPECT_NE(RepeatrA(/*f*/ nullptr, addresses_singleton_, "%{reg} {mem}")
+            .find("%eax -1(%rax,%rbx,1)\n%ebx -1(%rax,%rbx,1)\n%ecx -1(%rax,%rbx,1)\n"
+                  "%edx -1(%rax,%rbx,1)\n%ebp -1(%rax,%rbx,1)\n%esp -1(%rax,%rbx,1)\n"),
+            std::string::npos);
+}
+
+TEST_F(AssemblerX86_64Test, RepeatAR) {
+  EXPECT_NE(RepeatAR(/*f*/ nullptr, addresses_singleton_, "{mem} %{reg}")
+            .find("-1(%rax,%rbx,1) %rax\n-1(%rax,%rbx,1) %rbx\n-1(%rax,%rbx,1) %rcx\n"
+                  "-1(%rax,%rbx,1) %rdx\n-1(%rax,%rbx,1) %rbp\n-1(%rax,%rbx,1) %rsp\n"),
+            std::string::npos);
+}
+
+TEST_F(AssemblerX86_64Test, RepeatAr) {
+  EXPECT_NE(RepeatAr(/*f*/ nullptr, addresses_singleton_, "{mem} %{reg}")
+            .find("-1(%rax,%rbx,1) %eax\n-1(%rax,%rbx,1) %ebx\n-1(%rax,%rbx,1) %ecx\n"
+                  "-1(%rax,%rbx,1) %edx\n-1(%rax,%rbx,1) %ebp\n-1(%rax,%rbx,1) %esp\n"),
+            std::string::npos);
+}
+
+TEST_F(AssemblerX86_64Test, RepeatFA) {
+  EXPECT_NE(RepeatFA(/*f*/ nullptr, addresses_singleton_, "%{reg} {mem}").
+            find("%xmm0 -1(%rax,%rbx,1)\n%xmm1 -1(%rax,%rbx,1)\n%xmm2 -1(%rax,%rbx,1)\n"
+                 "%xmm3 -1(%rax,%rbx,1)\n%xmm4 -1(%rax,%rbx,1)\n%xmm5 -1(%rax,%rbx,1)\n"),
+            std::string::npos);
+}
+
+TEST_F(AssemblerX86_64Test, RepeatAF) {
+  EXPECT_NE(RepeatAF(/*f*/ nullptr, addresses_singleton_, "{mem} %{reg}")
+            .find("-1(%rax,%rbx,1) %xmm0\n-1(%rax,%rbx,1) %xmm1\n-1(%rax,%rbx,1) %xmm2\n"
+                  "-1(%rax,%rbx,1) %xmm3\n-1(%rax,%rbx,1) %xmm4\n-1(%rax,%rbx,1) %xmm5\n"),
             std::string::npos);
 }
 
@@ -412,12 +503,43 @@
   EXPECT_TRUE(CheckTools());
 }
 
+TEST_F(AssemblerX86_64Test, PopqAllAddresses) {
+  // Build base-only, index-only and base+index addresses for every register, then pop each.
+  std::vector<x86_64::Address> modes;
+  for (x86_64::CpuRegister* base_reg : GetRegisters()) {
+    // Displacement off a lone base register.
+    modes.push_back(x86_64::Address(*base_reg, -1));
+    modes.push_back(x86_64::Address(*base_reg, 0));
+    modes.push_back(x86_64::Address(*base_reg, 1));
+    modes.push_back(x86_64::Address(*base_reg, 123456789));
+    for (x86_64::CpuRegister* index_reg : GetRegisters()) {
+      if (index_reg->AsRegister() == x86_64::RSP) {
+        continue;  // RSP cannot be used as an index register.
+      }
+      if (base_reg->AsRegister() == index_reg->AsRegister()) {
+        // Scaled index without a base; emitted once per index register.
+        modes.push_back(x86_64::Address(*index_reg, x86_64::TIMES_1, -1));
+        modes.push_back(x86_64::Address(*index_reg, x86_64::TIMES_2, 0));
+        modes.push_back(x86_64::Address(*index_reg, x86_64::TIMES_4, 1));
+        modes.push_back(x86_64::Address(*index_reg, x86_64::TIMES_8, 123456789));
+      }
+      // Base plus scaled index.
+      modes.push_back(x86_64::Address(*base_reg, *index_reg, x86_64::TIMES_1, -1));
+      modes.push_back(x86_64::Address(*base_reg, *index_reg, x86_64::TIMES_2, 0));
+      modes.push_back(x86_64::Address(*base_reg, *index_reg, x86_64::TIMES_4, 1));
+      modes.push_back(x86_64::Address(*base_reg, *index_reg, x86_64::TIMES_8, 123456789));
+    }
+  }
+  DriverStr(RepeatA(&x86_64::X86_64Assembler::popq, modes, "popq {mem}"), "popq");
+}
+
 TEST_F(AssemblerX86_64Test, PushqRegs) {
   DriverStr(RepeatR(&x86_64::X86_64Assembler::pushq, "pushq %{reg}"), "pushq");
 }
 
 TEST_F(AssemblerX86_64Test, PushqImm) {
-  DriverStr(RepeatI(&x86_64::X86_64Assembler::pushq, 4U, "pushq ${imm}"), "pushqi");
+  DriverStr(RepeatI(&x86_64::X86_64Assembler::pushq, /*imm_bytes*/ 4U,
+                    "pushq ${imm}"), "pushqi");
 }
 
 TEST_F(AssemblerX86_64Test, MovqRegs) {
@@ -425,7 +547,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, MovqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::movq, 8U, "movq ${imm}, %{reg}"), "movqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::movq, /*imm_bytes*/ 8U,
+                     "movq ${imm}, %{reg}"), "movqi");
 }
 
 TEST_F(AssemblerX86_64Test, MovlRegs) {
@@ -433,7 +556,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, MovlImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::movl, 4U, "mov ${imm}, %{reg}"), "movli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::movl, /*imm_bytes*/ 4U,
+                     "mov ${imm}, %{reg}"), "movli");
 }
 
 TEST_F(AssemblerX86_64Test, AddqRegs) {
@@ -441,7 +565,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, AddqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::addq, 4U, "addq ${imm}, %{reg}"), "addqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::addq, /*imm_bytes*/ 4U,
+                     "addq ${imm}, %{reg}"), "addqi");
 }
 
 TEST_F(AssemblerX86_64Test, AddlRegs) {
@@ -449,7 +574,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, AddlImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::addl, 4U, "add ${imm}, %{reg}"), "addli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::addl, /*imm_bytes*/ 4U,
+                     "add ${imm}, %{reg}"), "addli");
 }
 
 TEST_F(AssemblerX86_64Test, ImulqReg1) {
@@ -461,7 +587,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, ImulqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::imulq, 4U, "imulq ${imm}, %{reg}, %{reg}"),
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::imulq, /*imm_bytes*/ 4U,
+                     "imulq ${imm}, %{reg}, %{reg}"),
             "imulqi");
 }
 
@@ -470,7 +597,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, ImullImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::imull, 4U, "imull ${imm}, %{reg}, %{reg}"),
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::imull, /*imm_bytes*/ 4U,
+                     "imull ${imm}, %{reg}, %{reg}"),
             "imulli");
 }
 
@@ -483,7 +611,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, SubqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::subq, 4U, "subq ${imm}, %{reg}"), "subqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::subq, /*imm_bytes*/ 4U,
+                     "subq ${imm}, %{reg}"), "subqi");
 }
 
 TEST_F(AssemblerX86_64Test, SublRegs) {
@@ -491,21 +620,19 @@
 }
 
 TEST_F(AssemblerX86_64Test, SublImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::subl, 4U, "sub ${imm}, %{reg}"), "subli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::subl, /*imm_bytes*/ 4U,
+                     "sub ${imm}, %{reg}"), "subli");
 }
 
 // Shll only allows CL as the shift count.
 std::string shll_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->shll(*reg, shifter);
     str << "shll %cl, %" << assembler_test->GetSecondaryRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -514,21 +641,19 @@
 }
 
 TEST_F(AssemblerX86_64Test, ShllImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::shll, 1U, "shll ${imm}, %{reg}"), "shlli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::shll, /*imm_bytes*/ 1U,
+                     "shll ${imm}, %{reg}"), "shlli");
 }
 
 // Shlq only allows CL as the shift count.
 std::string shlq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->shlq(*reg, shifter);
     str << "shlq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -537,21 +662,19 @@
 }
 
 TEST_F(AssemblerX86_64Test, ShlqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::shlq, 1U, "shlq ${imm}, %{reg}"), "shlqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::shlq, /*imm_bytes*/ 1U,
+                     "shlq ${imm}, %{reg}"), "shlqi");
 }
 
 // Shrl only allows CL as the shift count.
 std::string shrl_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->shrl(*reg, shifter);
     str << "shrl %cl, %" << assembler_test->GetSecondaryRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -560,21 +683,18 @@
 }
 
 TEST_F(AssemblerX86_64Test, ShrlImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::shrl, 1U, "shrl ${imm}, %{reg}"), "shrli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::shrl, /*imm_bytes*/ 1U, "shrl ${imm}, %{reg}"), "shrli");
 }
 
 // Shrq only allows CL as the shift count.
 std::string shrq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->shrq(*reg, shifter);
     str << "shrq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -583,21 +703,18 @@
 }
 
 TEST_F(AssemblerX86_64Test, ShrqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::shrq, 1U, "shrq ${imm}, %{reg}"), "shrqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::shrq, /*imm_bytes*/ 1U, "shrq ${imm}, %{reg}"), "shrqi");
 }
 
 // Sarl only allows CL as the shift count.
 std::string sarl_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->sarl(*reg, shifter);
     str << "sarl %cl, %" << assembler_test->GetSecondaryRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -606,21 +723,18 @@
 }
 
 TEST_F(AssemblerX86_64Test, SarlImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::sarl, 1U, "sarl ${imm}, %{reg}"), "sarli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::sarl, /*imm_bytes*/ 1U, "sarl ${imm}, %{reg}"), "sarli");
 }
 
 // Sarq only allows CL as the shift count.
 std::string sarq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->sarq(*reg, shifter);
     str << "sarq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -629,21 +743,18 @@
 }
 
 TEST_F(AssemblerX86_64Test, SarqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::sarq, 1U, "sarq ${imm}, %{reg}"), "sarqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::sarq, /*imm_bytes*/ 1U, "sarq ${imm}, %{reg}"), "sarqi");
 }
 
 // Rorl only allows CL as the shift count.
 std::string rorl_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->rorl(*reg, shifter);
     str << "rorl %cl, %" << assembler_test->GetSecondaryRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -652,21 +763,18 @@
 }
 
 TEST_F(AssemblerX86_64Test, RorlImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::rorl, 1U, "rorl ${imm}, %{reg}"), "rorli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::rorl, /*imm_bytes*/ 1U, "rorl ${imm}, %{reg}"), "rorli");
 }
 
 // Roll only allows CL as the shift count.
 std::string roll_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->roll(*reg, shifter);
     str << "roll %cl, %" << assembler_test->GetSecondaryRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -675,21 +783,18 @@
 }
 
 TEST_F(AssemblerX86_64Test, RollImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::roll, 1U, "roll ${imm}, %{reg}"), "rolli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::roll, /*imm_bytes*/ 1U, "roll ${imm}, %{reg}"), "rolli");
 }
 
 // Rorq only allows CL as the shift count.
 std::string rorq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->rorq(*reg, shifter);
     str << "rorq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -698,21 +803,18 @@
 }
 
 TEST_F(AssemblerX86_64Test, RorqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::rorq, 1U, "rorq ${imm}, %{reg}"), "rorqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::rorq, /*imm_bytes*/ 1U, "rorq ${imm}, %{reg}"), "rorqi");
 }
 
 // Rolq only allows CL as the shift count.
 std::string rolq_fn(AssemblerX86_64Test::Base* assembler_test, x86_64::X86_64Assembler* assembler) {
   std::ostringstream str;
-
   std::vector<x86_64::CpuRegister*> registers = assembler_test->GetRegisters();
-
   x86_64::CpuRegister shifter(x86_64::RCX);
   for (auto reg : registers) {
     assembler->rolq(*reg, shifter);
     str << "rolq %cl, %" << assembler_test->GetRegisterName(*reg) << "\n";
   }
-
   return str.str();
 }
 
@@ -721,7 +823,7 @@
 }
 
 TEST_F(AssemblerX86_64Test, RolqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::rolq, 1U, "rolq ${imm}, %{reg}"), "rolqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::rolq, /*imm_bytes*/ 1U, "rolq ${imm}, %{reg}"), "rolqi");
 }
 
 TEST_F(AssemblerX86_64Test, CmpqRegs) {
@@ -729,8 +831,9 @@
 }
 
 TEST_F(AssemblerX86_64Test, CmpqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::cmpq, 4U  /* cmpq only supports 32b imm */,
-                     "cmpq ${imm}, %{reg}"), "cmpqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::cmpq,
+                     /*imm_bytes*/ 4U,
+                     "cmpq ${imm}, %{reg}"), "cmpqi");  // only imm32
 }
 
 TEST_F(AssemblerX86_64Test, CmplRegs) {
@@ -738,7 +841,7 @@
 }
 
 TEST_F(AssemblerX86_64Test, CmplImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::cmpl, 4U, "cmpl ${imm}, %{reg}"), "cmpli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::cmpl, /*imm_bytes*/ 4U, "cmpl ${imm}, %{reg}"), "cmpli");
 }
 
 TEST_F(AssemblerX86_64Test, Testl) {
@@ -768,8 +871,9 @@
 }
 
 TEST_F(AssemblerX86_64Test, AndqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::andq, 4U  /* andq only supports 32b imm */,
-                     "andq ${imm}, %{reg}"), "andqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::andq,
+                     /*imm_bytes*/ 4U,
+                     "andq ${imm}, %{reg}"), "andqi");  // only imm32
 }
 
 TEST_F(AssemblerX86_64Test, AndlRegs) {
@@ -777,7 +881,9 @@
 }
 
 TEST_F(AssemblerX86_64Test, AndlImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::andl, 4U, "andl ${imm}, %{reg}"), "andli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::andl,
+                     /*imm_bytes*/ 4U,
+                     "andl ${imm}, %{reg}"), "andli");
 }
 
 TEST_F(AssemblerX86_64Test, OrqRegs) {
@@ -789,7 +895,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, OrlImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::orl, 4U, "orl ${imm}, %{reg}"), "orli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::orl,
+                     /*imm_bytes*/ 4U, "orl ${imm}, %{reg}"), "orli");
 }
 
 TEST_F(AssemblerX86_64Test, XorqRegs) {
@@ -797,7 +904,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, XorqImm) {
-  DriverStr(RepeatRI(&x86_64::X86_64Assembler::xorq, 4U, "xorq ${imm}, %{reg}"), "xorqi");
+  DriverStr(RepeatRI(&x86_64::X86_64Assembler::xorq,
+                     /*imm_bytes*/ 4U, "xorq ${imm}, %{reg}"), "xorqi");
 }
 
 TEST_F(AssemblerX86_64Test, XorlRegs) {
@@ -805,7 +913,8 @@
 }
 
 TEST_F(AssemblerX86_64Test, XorlImm) {
-  DriverStr(Repeatri(&x86_64::X86_64Assembler::xorl, 4U, "xor ${imm}, %{reg}"), "xorli");
+  DriverStr(RepeatrI(&x86_64::X86_64Assembler::xorl,
+                     /*imm_bytes*/ 4U, "xor ${imm}, %{reg}"), "xorli");
 }
 
 TEST_F(AssemblerX86_64Test, Xchgq) {
@@ -813,167 +922,87 @@
 }
 
 TEST_F(AssemblerX86_64Test, Xchgl) {
-  // Test is disabled because GCC generates 0x87 0xC0 for xchgl eax, eax. All other cases are the
-  // same. Anyone know why it doesn't emit a simple 0x90? It does so for xchgq rax, rax...
+  // TODO: Test is disabled because GCC generates 0x87 0xC0 for xchgl eax, eax. All other cases
+  // are the same. Anyone know why it doesn't emit a simple 0x90? It does so for xchgq rax, rax...
   // DriverStr(Repeatrr(&x86_64::X86_64Assembler::xchgl, "xchgl %{reg2}, %{reg1}"), "xchgl");
 }
 
 TEST_F(AssemblerX86_64Test, LockCmpxchgl) {
-  GetAssembler()->LockCmpxchgl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12),
-      x86_64::CpuRegister(x86_64::RSI));
-  GetAssembler()->LockCmpxchgl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12),
-      x86_64::CpuRegister(x86_64::RSI));
-  GetAssembler()->LockCmpxchgl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12),
-      x86_64::CpuRegister(x86_64::R8));
-  GetAssembler()->LockCmpxchgl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), 0), x86_64::CpuRegister(x86_64::RSI));
-  GetAssembler()->LockCmpxchgl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0),
-      x86_64::CpuRegister(x86_64::RSI));
-  const char* expected =
-    "lock cmpxchgl %ESI, 0xc(%RDI,%RBX,4)\n"
-    "lock cmpxchgl %ESI, 0xc(%RDI,%R9,4)\n"
-    "lock cmpxchgl %R8d, 0xc(%RDI,%R9,4)\n"
-    "lock cmpxchgl %ESI, (%R13)\n"
-    "lock cmpxchgl %ESI, (%R13,%R9,1)\n";
-
-  DriverStr(expected, "lock_cmpxchgl");
+  DriverStr(RepeatAr(&x86_64::X86_64Assembler::LockCmpxchgl,
+                     "lock cmpxchgl %{reg}, {mem}"), "lock_cmpxchgl");
 }
 
 TEST_F(AssemblerX86_64Test, LockCmpxchgq) {
-  GetAssembler()->LockCmpxchgq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12),
-      x86_64::CpuRegister(x86_64::RSI));
-  GetAssembler()->LockCmpxchgq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12),
-      x86_64::CpuRegister(x86_64::RSI));
-  GetAssembler()->LockCmpxchgq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12),
-      x86_64::CpuRegister(x86_64::R8));
-  GetAssembler()->LockCmpxchgq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), 0), x86_64::CpuRegister(x86_64::RSI));
-  GetAssembler()->LockCmpxchgq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0),
-      x86_64::CpuRegister(x86_64::RSI));
-  const char* expected =
-    "lock cmpxchg %RSI, 0xc(%RDI,%RBX,4)\n"
-    "lock cmpxchg %RSI, 0xc(%RDI,%R9,4)\n"
-    "lock cmpxchg %R8, 0xc(%RDI,%R9,4)\n"
-    "lock cmpxchg %RSI, (%R13)\n"
-    "lock cmpxchg %RSI, (%R13,%R9,1)\n";
-
-  DriverStr(expected, "lock_cmpxchg");
+  DriverStr(RepeatAR(&x86_64::X86_64Assembler::LockCmpxchgq,
+                     "lock cmpxchg %{reg}, {mem}"), "lock_cmpxchg");
 }
 
-TEST_F(AssemblerX86_64Test, Movl) {
-  GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  GetAssembler()->movl(x86_64::CpuRegister(x86_64::R8), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), 0));
-  GetAssembler()->movl(x86_64::CpuRegister(x86_64::RAX), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0));
-  const char* expected =
-    "movl 0xc(%RDI,%RBX,4), %EAX\n"
-    "movl 0xc(%RDI,%R9,4), %EAX\n"
-    "movl 0xc(%RDI,%R9,4), %R8d\n"
-    "movl (%R13), %EAX\n"
-    "movl (%R13,%R9,1), %EAX\n";
-
-  DriverStr(expected, "movl");
+TEST_F(AssemblerX86_64Test, MovqStore) {
+  DriverStr(RepeatAR(&x86_64::X86_64Assembler::movq, "movq %{reg}, {mem}"), "movq_s");
 }
 
-TEST_F(AssemblerX86_64Test, Movw) {
-  GetAssembler()->movw(x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
-                       x86_64::CpuRegister(x86_64::R9));
-  GetAssembler()->movw(x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
-                       x86_64::Immediate(0));
-  GetAssembler()->movw(x86_64::Address(x86_64::CpuRegister(x86_64::R9), 0),
-                       x86_64::Immediate(0));
-  GetAssembler()->movw(x86_64::Address(x86_64::CpuRegister(x86_64::R14), 0),
-                       x86_64::Immediate(0));
-  const char* expected =
-      "movw %R9w, 0(%RAX)\n"
-      "movw $0, 0(%RAX)\n"
-      "movw $0, 0(%R9)\n"
-      "movw $0, 0(%R14)\n";
-  DriverStr(expected, "movw");
+TEST_F(AssemblerX86_64Test, MovqLoad) {
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::movq, "movq {mem}, %{reg}"), "movq_l");
+}
+
+TEST_F(AssemblerX86_64Test, MovlStore) {
+  DriverStr(RepeatAr(&x86_64::X86_64Assembler::movl, "movl %{reg}, {mem}"), "movl_s");
+}
+
+TEST_F(AssemblerX86_64Test, MovlLoad) {
+  DriverStr(RepeatrA(&x86_64::X86_64Assembler::movl, "movl {mem}, %{reg}"), "movl_l");
+}
+
+TEST_F(AssemblerX86_64Test, MovwStore) {
+  DriverStr(RepeatAw(&x86_64::X86_64Assembler::movw, "movw %{reg}, {mem}"), "movw_s");
+}
+
+TEST_F(AssemblerX86_64Test, MovbStore) {
+  DriverStr(RepeatAb(&x86_64::X86_64Assembler::movb, "movb %{reg}, {mem}"), "movb_s");
 }
 
 TEST_F(AssemblerX86_64Test, Cmpw) {
-  GetAssembler()->cmpw(x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
-                       x86_64::Immediate(0));
-  GetAssembler()->cmpw(x86_64::Address(x86_64::CpuRegister(x86_64::R9), 0),
-                       x86_64::Immediate(0));
-  GetAssembler()->cmpw(x86_64::Address(x86_64::CpuRegister(x86_64::R14), 0),
-                       x86_64::Immediate(0));
-  const char* expected =
-      "cmpw $0, 0(%RAX)\n"
-      "cmpw $0, 0(%R9)\n"
-      "cmpw $0, 0(%R14)\n";
-  DriverStr(expected, "cmpw");
+  DriverStr(RepeatAI(&x86_64::X86_64Assembler::cmpw,
+                     /*imm_bytes*/ 1U,
+                     "cmpw ${imm}, {mem}"), "cmpw");  // TODO: only imm8?
 }
 
 TEST_F(AssemblerX86_64Test, MovqAddrImm) {
-  GetAssembler()->movq(x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
-                       x86_64::Immediate(-5));
-  const char* expected = "movq $-5, 0(%RAX)\n";
-  DriverStr(expected, "movq");
+  DriverStr(RepeatAI(&x86_64::X86_64Assembler::movq,
+                     /*imm_bytes*/ 4U,
+                     "movq ${imm}, {mem}"), "movq");  // only imm32
+}
+
+TEST_F(AssemblerX86_64Test, MovlAddrImm) {
+  DriverStr(RepeatAI(&x86_64::X86_64Assembler::movl,
+                     /*imm_bytes*/ 4U, "movl ${imm}, {mem}"), "movl");
+}
+
+TEST_F(AssemblerX86_64Test, MovwAddrImm) {
+  DriverStr(RepeatAI(&x86_64::X86_64Assembler::movw,
+                     /*imm_bytes*/ 2U, "movw ${imm}, {mem}"), "movw");
+}
+
+TEST_F(AssemblerX86_64Test, MovbAddrImm) {
+  DriverStr(RepeatAI(&x86_64::X86_64Assembler::movb,
+                     /*imm_bytes*/ 1U, "movb ${imm}, {mem}"), "movb");
 }
 
 TEST_F(AssemblerX86_64Test, Movntl) {
-  GetAssembler()->movntl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
-  GetAssembler()->movntl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
-  GetAssembler()->movntl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
-  GetAssembler()->movntl(x86_64::Address(x86_64::CpuRegister(x86_64::R13), 0), x86_64::CpuRegister(x86_64::RAX));
-  GetAssembler()->movntl(x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0), x86_64::CpuRegister(x86_64::R9));
-  const char* expected =
-    "movntil %EAX, 0xc(%RDI,%RBX,4)\n"
-    "movntil %EAX, 0xc(%RDI,%R9,4)\n"
-    "movntil %EAX, 0xc(%RDI,%R9,4)\n"
-    "movntil %EAX, (%R13)\n"
-    "movntil %R9d, (%R13,%R9,1)\n";
-
-  DriverStr(expected, "movntl");
+  DriverStr(RepeatAr(&x86_64::X86_64Assembler::movntl, "movntil %{reg}, {mem}"), "movntl");
 }
 
 TEST_F(AssemblerX86_64Test, Movntq) {
-  GetAssembler()->movntq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
-  GetAssembler()->movntq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
-  GetAssembler()->movntq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12), x86_64::CpuRegister(x86_64::RAX));
-  GetAssembler()->movntq(x86_64::Address(x86_64::CpuRegister(x86_64::R13), 0), x86_64::CpuRegister(x86_64::RAX));
-  GetAssembler()->movntq(x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0), x86_64::CpuRegister(x86_64::R9));
-  const char* expected =
-    "movntiq %RAX, 0xc(%RDI,%RBX,4)\n"
-    "movntiq %RAX, 0xc(%RDI,%R9,4)\n"
-    "movntiq %RAX, 0xc(%RDI,%R9,4)\n"
-    "movntiq %RAX, (%R13)\n"
-    "movntiq %R9, (%R13,%R9,1)\n";
-
-  DriverStr(expected, "movntq");
+  DriverStr(RepeatAR(&x86_64::X86_64Assembler::movntq, "movntiq %{reg}, {mem}"), "movntq");
 }
 
 TEST_F(AssemblerX86_64Test, Cvtsi2ssAddr) {
   GetAssembler()->cvtsi2ss(x86_64::XmmRegister(x86_64::XMM0),
                            x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
-                           false);
+                           /*is64bit*/ false);
   GetAssembler()->cvtsi2ss(x86_64::XmmRegister(x86_64::XMM0),
                            x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
-                           true);
+                           /*is64bit*/ true);
   const char* expected = "cvtsi2ss 0(%RAX), %xmm0\n"
                          "cvtsi2ssq 0(%RAX), %xmm0\n";
   DriverStr(expected, "cvtsi2ss");
@@ -982,111 +1011,69 @@
 TEST_F(AssemblerX86_64Test, Cvtsi2sdAddr) {
   GetAssembler()->cvtsi2sd(x86_64::XmmRegister(x86_64::XMM0),
                            x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
-                           false);
+                           /*is64bit*/ false);
   GetAssembler()->cvtsi2sd(x86_64::XmmRegister(x86_64::XMM0),
                            x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0),
-                           true);
+                           /*is64bit*/ true);
   const char* expected = "cvtsi2sd 0(%RAX), %xmm0\n"
                          "cvtsi2sdq 0(%RAX), %xmm0\n";
   DriverStr(expected, "cvtsi2sd");
 }
 
 TEST_F(AssemblerX86_64Test, CmpqAddr) {
-  GetAssembler()->cmpq(x86_64::CpuRegister(x86_64::R12),
-                       x86_64::Address(x86_64::CpuRegister(x86_64::R9), 0));
-  const char* expected = "cmpq 0(%R9), %R12\n";
-  DriverStr(expected, "cmpq");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::cmpq, "cmpq {mem}, %{reg}"), "cmpq");
 }
 
 TEST_F(AssemblerX86_64Test, MovsxdAddr) {
-  GetAssembler()->movsxd(x86_64::CpuRegister(x86_64::R12),
-                       x86_64::Address(x86_64::CpuRegister(x86_64::R9), 0));
-  const char* expected = "movslq 0(%R9), %R12\n";
-  DriverStr(expected, "movsxd");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::movsxd, "movslq {mem}, %{reg}"), "movsxd");
 }
 
 TEST_F(AssemblerX86_64Test, TestqAddr) {
-  GetAssembler()->testq(x86_64::CpuRegister(x86_64::R12),
-                        x86_64::Address(x86_64::CpuRegister(x86_64::R9), 0));
-  const char* expected = "testq 0(%R9), %R12\n";
-  DriverStr(expected, "testq");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::testq, "testq {mem}, %{reg}"), "testq");
 }
 
 TEST_F(AssemblerX86_64Test, AddqAddr) {
-  GetAssembler()->addq(x86_64::CpuRegister(x86_64::R12),
-                        x86_64::Address(x86_64::CpuRegister(x86_64::R9), 0));
-  const char* expected = "addq 0(%R9), %R12\n";
-  DriverStr(expected, "addq");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::addq, "addq {mem}, %{reg}"), "addq");
 }
 
 TEST_F(AssemblerX86_64Test, SubqAddr) {
-  GetAssembler()->subq(x86_64::CpuRegister(x86_64::R12),
-                        x86_64::Address(x86_64::CpuRegister(x86_64::R9), 0));
-  const char* expected = "subq 0(%R9), %R12\n";
-  DriverStr(expected, "subq");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::subq, "subq {mem}, %{reg}"), "subq");
 }
 
 TEST_F(AssemblerX86_64Test, Cvtss2sdAddr) {
-  GetAssembler()->cvtss2sd(x86_64::XmmRegister(x86_64::XMM0),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0));
-  const char* expected = "cvtss2sd 0(%RAX), %xmm0\n";
-  DriverStr(expected, "cvtss2sd");
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::cvtss2sd, "cvtss2sd {mem}, %{reg}"), "cvtss2sd");
 }
 
 TEST_F(AssemblerX86_64Test, Cvtsd2ssAddr) {
-  GetAssembler()->cvtsd2ss(x86_64::XmmRegister(x86_64::XMM0),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0));
-  const char* expected = "cvtsd2ss 0(%RAX), %xmm0\n";
-  DriverStr(expected, "cvtsd2ss");
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::cvtsd2ss, "cvtsd2ss {mem}, %{reg}"), "cvtsd2ss");
 }
 
 TEST_F(AssemblerX86_64Test, ComissAddr) {
-  GetAssembler()->comiss(x86_64::XmmRegister(x86_64::XMM14),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0));
-  const char* expected = "comiss 0(%RAX), %xmm14\n";
-  DriverStr(expected, "comiss");
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::comiss, "comiss {mem}, %{reg}"), "comiss");
 }
 
 TEST_F(AssemblerX86_64Test, ComisdAddr) {
-  GetAssembler()->comisd(x86_64::XmmRegister(x86_64::XMM0),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::R9), 0));
-  const char* expected = "comisd 0(%R9), %xmm0\n";
-  DriverStr(expected, "comisd");
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::comisd, "comisd {mem}, %{reg}"), "comisd");
 }
 
 TEST_F(AssemblerX86_64Test, UComissAddr) {
-  GetAssembler()->ucomiss(x86_64::XmmRegister(x86_64::XMM0),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0));
-  const char* expected = "ucomiss 0(%RAX), %xmm0\n";
-  DriverStr(expected, "ucomiss");
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::ucomiss, "ucomiss {mem}, %{reg}"), "ucomiss");
 }
 
 TEST_F(AssemblerX86_64Test, UComisdAddr) {
-  GetAssembler()->ucomisd(x86_64::XmmRegister(x86_64::XMM0),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0));
-  const char* expected = "ucomisd 0(%RAX), %xmm0\n";
-  DriverStr(expected, "ucomisd");
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::ucomisd, "ucomisd {mem}, %{reg}"), "ucomisd");
 }
 
 TEST_F(AssemblerX86_64Test, Andq) {
-  GetAssembler()->andq(x86_64::CpuRegister(x86_64::R9),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0));
-  const char* expected = "andq 0(%RAX), %r9\n";
-  DriverStr(expected, "andq");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::andq, "andq {mem}, %{reg}"), "andq");
 }
 
 TEST_F(AssemblerX86_64Test, Orq) {
-  GetAssembler()->orq(x86_64::CpuRegister(x86_64::R9),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0));
-  const char* expected = "orq 0(%RAX), %r9\n";
-  DriverStr(expected, "orq");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::orq, "orq {mem}, %{reg}"), "orq");
 }
 
 TEST_F(AssemblerX86_64Test, Xorq) {
-  GetAssembler()->xorq(x86_64::CpuRegister(x86_64::R9),
-                           x86_64::Address(x86_64::CpuRegister(x86_64::RAX), 0));
-  const char* expected = "xorq 0(%RAX), %r9\n";
-  DriverStr(expected, "xorq");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::xorq, "xorq {mem}, %{reg}"), "xorq");
 }
 
 TEST_F(AssemblerX86_64Test, RepneScasb) {
@@ -1115,22 +1102,20 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::movaps, "movaps %{reg2}, %{reg1}"), "movaps");
 }
 
-TEST_F(AssemblerX86_64Test, MovapsAddr) {
-  GetAssembler()->movaps(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 4));
-  GetAssembler()->movaps(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 2), x86_64::XmmRegister(x86_64::XMM1));
-  const char* expected =
-    "movaps 0x4(%RSP), %xmm0\n"
-    "movaps %xmm1, 0x2(%RSP)\n";
-  DriverStr(expected, "movaps_address");
+TEST_F(AssemblerX86_64Test, MovapsStore) {
+  DriverStr(RepeatAF(&x86_64::X86_64Assembler::movaps, "movaps %{reg}, {mem}"), "movaps_s");
 }
 
-TEST_F(AssemblerX86_64Test, MovupsAddr) {
-  GetAssembler()->movups(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 4));
-  GetAssembler()->movups(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 2), x86_64::XmmRegister(x86_64::XMM1));
-  const char* expected =
-    "movups 0x4(%RSP), %xmm0\n"
-    "movups %xmm1, 0x2(%RSP)\n";
-  DriverStr(expected, "movups_address");
+TEST_F(AssemblerX86_64Test, MovapsLoad) {
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::movaps, "movaps {mem}, %{reg}"), "movaps_l");
+}
+
+TEST_F(AssemblerX86_64Test, MovupsStore) {
+  DriverStr(RepeatAF(&x86_64::X86_64Assembler::movups, "movups %{reg}, {mem}"), "movups_s");
+}
+
+TEST_F(AssemblerX86_64Test, MovupsLoad) {
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::movups, "movups {mem}, %{reg}"), "movups_l");
 }
 
 TEST_F(AssemblerX86_64Test, Movss) {
@@ -1141,22 +1126,20 @@
   DriverStr(RepeatFF(&x86_64::X86_64Assembler::movapd, "movapd %{reg2}, %{reg1}"), "movapd");
 }
 
-TEST_F(AssemblerX86_64Test, MovapdAddr) {
-  GetAssembler()->movapd(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 4));
-  GetAssembler()->movapd(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 2), x86_64::XmmRegister(x86_64::XMM1));
-  const char* expected =
-    "movapd 0x4(%RSP), %xmm0\n"
-    "movapd %xmm1, 0x2(%RSP)\n";
-  DriverStr(expected, "movapd_address");
+TEST_F(AssemblerX86_64Test, MovapdStore) {
+  DriverStr(RepeatAF(&x86_64::X86_64Assembler::movapd, "movapd %{reg}, {mem}"), "movapd_s");
 }
 
-TEST_F(AssemblerX86_64Test, MovupdAddr) {
-  GetAssembler()->movupd(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 4));
-  GetAssembler()->movupd(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 2), x86_64::XmmRegister(x86_64::XMM1));
-  const char* expected =
-    "movupd 0x4(%RSP), %xmm0\n"
-    "movupd %xmm1, 0x2(%RSP)\n";
-  DriverStr(expected, "movupd_address");
+TEST_F(AssemblerX86_64Test, MovapdLoad) {
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::movapd, "movapd {mem}, %{reg}"), "movapd_l");
+}
+
+TEST_F(AssemblerX86_64Test, MovupdStore) {
+  DriverStr(RepeatAF(&x86_64::X86_64Assembler::movupd, "movupd %{reg}, {mem}"), "movupd_s");
+}
+
+TEST_F(AssemblerX86_64Test, MovupdLoad) {
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::movupd, "movupd {mem}, %{reg}"), "movupd_l");
 }
 
 TEST_F(AssemblerX86_64Test, Movsd) {
@@ -1164,25 +1147,23 @@
 }
 
 TEST_F(AssemblerX86_64Test, Movdqa) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::movdqa, "movdqa %{reg2}, %{reg1}"), "movapd");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::movdqa, "movdqa %{reg2}, %{reg1}"), "movdqa");
 }
 
-TEST_F(AssemblerX86_64Test, MovdqaAddr) {
-  GetAssembler()->movdqa(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 4));
-  GetAssembler()->movdqa(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 2), x86_64::XmmRegister(x86_64::XMM1));
-  const char* expected =
-    "movdqa 0x4(%RSP), %xmm0\n"
-    "movdqa %xmm1, 0x2(%RSP)\n";
-  DriverStr(expected, "movdqa_address");
+TEST_F(AssemblerX86_64Test, MovdqaStore) {
+  DriverStr(RepeatAF(&x86_64::X86_64Assembler::movdqa, "movdqa %{reg}, {mem}"), "movdqa_s");
 }
 
-TEST_F(AssemblerX86_64Test, MovdquAddr) {
-  GetAssembler()->movdqu(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 4));
-  GetAssembler()->movdqu(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 2), x86_64::XmmRegister(x86_64::XMM1));
-  const char* expected =
-    "movdqu 0x4(%RSP), %xmm0\n"
-    "movdqu %xmm1, 0x2(%RSP)\n";
-  DriverStr(expected, "movdqu_address");
+TEST_F(AssemblerX86_64Test, MovdqaLoad) {
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::movdqa, "movdqa {mem}, %{reg}"), "movdqa_l");
+}
+
+TEST_F(AssemblerX86_64Test, MovdquStore) {
+  DriverStr(RepeatAF(&x86_64::X86_64Assembler::movdqu, "movdqu %{reg}, {mem}"), "movdqu_s");
+}
+
+TEST_F(AssemblerX86_64Test, MovdquLoad) {
+  DriverStr(RepeatFA(&x86_64::X86_64Assembler::movdqu, "movdqu {mem}, %{reg}"), "movdqu_l");
 }
 
 TEST_F(AssemblerX86_64Test, Movd1) {
@@ -1364,11 +1345,13 @@
 }
 
 TEST_F(AssemblerX86_64Test, Roundss) {
-  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundss, 1, "roundss ${imm}, %{reg2}, %{reg1}"), "roundss");
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundss, /*imm_bytes*/ 1U,
+                      "roundss ${imm}, %{reg2}, %{reg1}"), "roundss");
 }
 
 TEST_F(AssemblerX86_64Test, Roundsd) {
-  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundsd, 1, "roundsd ${imm}, %{reg2}, %{reg1}"), "roundsd");
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::roundsd, /*imm_bytes*/ 1U,
+                      "roundsd ${imm}, %{reg2}, %{reg1}"), "roundsd");
 }
 
 TEST_F(AssemblerX86_64Test, Xorps) {
@@ -1564,47 +1547,58 @@
 }
 
 TEST_F(AssemblerX86_64Test, Shufps) {
-  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::shufps, 1, "shufps ${imm}, %{reg2}, %{reg1}"), "shufps");
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::shufps, /*imm_bytes*/ 1U,
+                      "shufps ${imm}, %{reg2}, %{reg1}"), "shufps");
 }
 
 TEST_F(AssemblerX86_64Test, Shufpd) {
-  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::shufpd, 1, "shufpd ${imm}, %{reg2}, %{reg1}"), "shufpd");
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::shufpd, /*imm_bytes*/ 1U,
+                      "shufpd ${imm}, %{reg2}, %{reg1}"), "shufpd");
 }
 
 TEST_F(AssemblerX86_64Test, PShufd) {
-  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::pshufd, 1, "pshufd ${imm}, %{reg2}, %{reg1}"), "pshufd");
+  DriverStr(RepeatFFI(&x86_64::X86_64Assembler::pshufd, /*imm_bytes*/ 1U,
+                      "pshufd ${imm}, %{reg2}, %{reg1}"), "pshufd");
 }
 
 TEST_F(AssemblerX86_64Test, Punpcklbw) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpcklbw, "punpcklbw %{reg2}, %{reg1}"), "punpcklbw");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpcklbw,
+                     "punpcklbw %{reg2}, %{reg1}"), "punpcklbw");
 }
 
 TEST_F(AssemblerX86_64Test, Punpcklwd) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpcklwd, "punpcklwd %{reg2}, %{reg1}"), "punpcklwd");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpcklwd,
+                     "punpcklwd %{reg2}, %{reg1}"), "punpcklwd");
 }
 
 TEST_F(AssemblerX86_64Test, Punpckldq) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckldq, "punpckldq %{reg2}, %{reg1}"), "punpckldq");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckldq,
+                     "punpckldq %{reg2}, %{reg1}"), "punpckldq");
 }
 
 TEST_F(AssemblerX86_64Test, Punpcklqdq) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpcklqdq, "punpcklqdq %{reg2}, %{reg1}"), "punpcklqdq");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpcklqdq,
+                     "punpcklqdq %{reg2}, %{reg1}"), "punpcklqdq");
 }
 
 TEST_F(AssemblerX86_64Test, Punpckhbw) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckhbw, "punpckhbw %{reg2}, %{reg1}"), "punpckhbw");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckhbw,
+                     "punpckhbw %{reg2}, %{reg1}"), "punpckhbw");
 }
 
 TEST_F(AssemblerX86_64Test, Punpckhwd) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckhwd, "punpckhwd %{reg2}, %{reg1}"), "punpckhwd");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckhwd,
+                     "punpckhwd %{reg2}, %{reg1}"), "punpckhwd");
 }
 
 TEST_F(AssemblerX86_64Test, Punpckhdq) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckhdq, "punpckhdq %{reg2}, %{reg1}"), "punpckhdq");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckhdq,
+                     "punpckhdq %{reg2}, %{reg1}"), "punpckhdq");
 }
 
 TEST_F(AssemblerX86_64Test, Punpckhqdq) {
-  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckhqdq, "punpckhqdq %{reg2}, %{reg1}"), "punpckhqdq");
+  DriverStr(RepeatFF(&x86_64::X86_64Assembler::punpckhqdq,
+                     "punpckhqdq %{reg2}, %{reg1}"), "punpckhqdq");
 }
 
 TEST_F(AssemblerX86_64Test, Psllw) {
@@ -1653,63 +1647,21 @@
   GetAssembler()->psrld(x86_64::XmmRegister(x86_64::XMM0),  x86_64::Immediate(1));
   GetAssembler()->psrld(x86_64::XmmRegister(x86_64::XMM15), x86_64::Immediate(2));
   DriverStr("psrld $1, %xmm0\n"
-            "psrld $2, %xmm15\n", "pslldi");
+            "psrld $2, %xmm15\n", "psrldi");
 }
 
 TEST_F(AssemblerX86_64Test, Psrlq) {
   GetAssembler()->psrlq(x86_64::XmmRegister(x86_64::XMM0),  x86_64::Immediate(1));
   GetAssembler()->psrlq(x86_64::XmmRegister(x86_64::XMM15), x86_64::Immediate(2));
   DriverStr("psrlq $1, %xmm0\n"
-            "psrlq $2, %xmm15\n", "pslrqi");
+            "psrlq $2, %xmm15\n", "psrlqi");
 }
 
 TEST_F(AssemblerX86_64Test, Psrldq) {
   GetAssembler()->psrldq(x86_64::XmmRegister(x86_64::XMM0),  x86_64::Immediate(1));
   GetAssembler()->psrldq(x86_64::XmmRegister(x86_64::XMM15), x86_64::Immediate(2));
   DriverStr("psrldq $1, %xmm0\n"
-            "psrldq $2, %xmm15\n", "pslrdqi");
-}
-
-TEST_F(AssemblerX86_64Test, UcomissAddress) {
-  GetAssembler()->ucomiss(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->ucomiss(x86_64::XmmRegister(x86_64::XMM1), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  GetAssembler()->ucomiss(x86_64::XmmRegister(x86_64::XMM2), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  GetAssembler()->ucomiss(x86_64::XmmRegister(x86_64::XMM3), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), 0));
-  GetAssembler()->ucomiss(x86_64::XmmRegister(x86_64::XMM4), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0));
-  const char* expected =
-    "ucomiss 0xc(%RDI,%RBX,4), %xmm0\n"
-    "ucomiss 0xc(%RDI,%R9,4), %xmm1\n"
-    "ucomiss 0xc(%RDI,%R9,4), %xmm2\n"
-    "ucomiss (%R13), %xmm3\n"
-    "ucomiss (%R13,%R9,1), %xmm4\n";
-
-  DriverStr(expected, "ucomiss_address");
-}
-
-TEST_F(AssemblerX86_64Test, UcomisdAddress) {
-  GetAssembler()->ucomisd(x86_64::XmmRegister(x86_64::XMM0), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->ucomisd(x86_64::XmmRegister(x86_64::XMM1), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  GetAssembler()->ucomisd(x86_64::XmmRegister(x86_64::XMM2), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  GetAssembler()->ucomisd(x86_64::XmmRegister(x86_64::XMM3), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), 0));
-  GetAssembler()->ucomisd(x86_64::XmmRegister(x86_64::XMM4), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R13), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_1, 0));
-  const char* expected =
-    "ucomisd 0xc(%RDI,%RBX,4), %xmm0\n"
-    "ucomisd 0xc(%RDI,%R9,4), %xmm1\n"
-    "ucomisd 0xc(%RDI,%R9,4), %xmm2\n"
-    "ucomisd (%R13), %xmm3\n"
-    "ucomisd (%R13,%R9,1), %xmm4\n";
-
-  DriverStr(expected, "ucomisd_address");
+            "psrldq $2, %xmm15\n", "psrldqi");
 }
 
 std::string x87_fn(AssemblerX86_64Test::Base* assembler_test ATTRIBUTE_UNUSED,
@@ -1735,22 +1687,28 @@
   DriverFn(&x87_fn, "x87");
 }
 
-TEST_F(AssemblerX86_64Test, FPUIntegerLoad) {
-  GetAssembler()->filds(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 4));
-  GetAssembler()->fildl(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 12));
-  const char* expected =
-      "fildl 0x4(%RSP)\n"
-      "fildll 0xc(%RSP)\n";
-  DriverStr(expected, "FPUIntegerLoad");
+TEST_F(AssemblerX86_64Test, FPUIntegerLoads) {
+  DriverStr(RepeatA(&x86_64::X86_64Assembler::filds,
+                    addresses_singleton_,  // no ext addressing
+                    "fildl {mem}"), "filds");
 }
 
-TEST_F(AssemblerX86_64Test, FPUIntegerStore) {
-  GetAssembler()->fistps(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 16));
-  GetAssembler()->fistpl(x86_64::Address(x86_64::CpuRegister(x86_64::RSP), 24));
-  const char* expected =
-      "fistpl 0x10(%RSP)\n"
-      "fistpll 0x18(%RSP)\n";
-  DriverStr(expected, "FPUIntegerStore");
+TEST_F(AssemblerX86_64Test, FPUIntegerLoadl) {
+  DriverStr(RepeatA(&x86_64::X86_64Assembler::fildl,
+                    addresses_singleton_,  // no ext addressing
+                    "fildll {mem}"), "fildl");
+}
+
+TEST_F(AssemblerX86_64Test, FPUIntegerStores) {
+  DriverStr(RepeatA(&x86_64::X86_64Assembler::fistps,
+                    addresses_singleton_,  // no ext addressing
+                    "fistpl {mem}"), "fistps");
+}
+
+TEST_F(AssemblerX86_64Test, FPUIntegerStorel) {
+  DriverStr(RepeatA(&x86_64::X86_64Assembler::fistpl,
+                    addresses_singleton_,  // no ext addressing
+                    "fistpll {mem}"), "fistpl");
 }
 
 TEST_F(AssemblerX86_64Test, Call) {
@@ -1762,13 +1720,15 @@
 }
 
 TEST_F(AssemblerX86_64Test, Enter) {
-  DriverStr(RepeatI(&x86_64::X86_64Assembler::enter, 2U  /* 16b immediate */, "enter ${imm}, $0",
-                    true  /* Only non-negative number */), "enter");
+  DriverStr(RepeatI(&x86_64::X86_64Assembler::enter,
+                    /*imm_bytes*/ 2U,
+                    "enter ${imm}, $0", /*non-negative*/ true), "enter");
 }
 
 TEST_F(AssemblerX86_64Test, RetImm) {
-  DriverStr(RepeatI(&x86_64::X86_64Assembler::ret, 2U  /* 16b immediate */, "ret ${imm}",
-                    true  /* Only non-negative number */), "reti");
+  DriverStr(RepeatI(&x86_64::X86_64Assembler::ret,
+                    /*imm_bytes*/ 2U,
+                    "ret ${imm}", /*non-negative*/ true), "ret");
 }
 
 std::string ret_and_leave_fn(AssemblerX86_64Test::Base* assembler_test ATTRIBUTE_UNUSED,
@@ -1801,18 +1761,7 @@
 }
 
 TEST_F(AssemblerX86_64Test, BsflAddress) {
-  GetAssembler()->bsfl(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->bsfl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->bsfl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  const char* expected =
-    "bsfl 0xc(%RDI,%RBX,4), %R10d\n"
-    "bsfl 0xc(%R10,%RBX,4), %edi\n"
-    "bsfl 0xc(%RDI,%R9,4), %edi\n";
-
-  DriverStr(expected, "bsfl_address");
+  DriverStr(RepeatrA(&x86_64::X86_64Assembler::bsfl, "bsfl {mem}, %{reg}"), "bsfl_address");
 }
 
 TEST_F(AssemblerX86_64Test, Bsfq) {
@@ -1820,18 +1769,7 @@
 }
 
 TEST_F(AssemblerX86_64Test, BsfqAddress) {
-  GetAssembler()->bsfq(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->bsfq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->bsfq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  const char* expected =
-    "bsfq 0xc(%RDI,%RBX,4), %R10\n"
-    "bsfq 0xc(%R10,%RBX,4), %RDI\n"
-    "bsfq 0xc(%RDI,%R9,4), %RDI\n";
-
-  DriverStr(expected, "bsfq_address");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::bsfq, "bsfq {mem}, %{reg}"), "bsfq_address");
 }
 
 TEST_F(AssemblerX86_64Test, Bsrl) {
@@ -1839,18 +1777,7 @@
 }
 
 TEST_F(AssemblerX86_64Test, BsrlAddress) {
-  GetAssembler()->bsrl(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->bsrl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->bsrl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  const char* expected =
-    "bsrl 0xc(%RDI,%RBX,4), %R10d\n"
-    "bsrl 0xc(%R10,%RBX,4), %edi\n"
-    "bsrl 0xc(%RDI,%R9,4), %edi\n";
-
-  DriverStr(expected, "bsrl_address");
+  DriverStr(RepeatrA(&x86_64::X86_64Assembler::bsrl, "bsrl {mem}, %{reg}"), "bsrl_address");
 }
 
 TEST_F(AssemblerX86_64Test, Bsrq) {
@@ -1858,18 +1785,7 @@
 }
 
 TEST_F(AssemblerX86_64Test, BsrqAddress) {
-  GetAssembler()->bsrq(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->bsrq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->bsrq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  const char* expected =
-    "bsrq 0xc(%RDI,%RBX,4), %R10\n"
-    "bsrq 0xc(%R10,%RBX,4), %RDI\n"
-    "bsrq 0xc(%RDI,%R9,4), %RDI\n";
-
-  DriverStr(expected, "bsrq_address");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::bsrq, "bsrq {mem}, %{reg}"), "bsrq_address");
 }
 
 TEST_F(AssemblerX86_64Test, Popcntl) {
@@ -1877,18 +1793,7 @@
 }
 
 TEST_F(AssemblerX86_64Test, PopcntlAddress) {
-  GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->popcntl(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  const char* expected =
-    "popcntl 0xc(%RDI,%RBX,4), %R10d\n"
-    "popcntl 0xc(%R10,%RBX,4), %edi\n"
-    "popcntl 0xc(%RDI,%R9,4), %edi\n";
-
-  DriverStr(expected, "popcntl_address");
+  DriverStr(RepeatrA(&x86_64::X86_64Assembler::popcntl, "popcntl {mem}, %{reg}"), "popcntl_address");
 }
 
 TEST_F(AssemblerX86_64Test, Popcntq) {
@@ -1896,18 +1801,7 @@
 }
 
 TEST_F(AssemblerX86_64Test, PopcntqAddress) {
-  GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::R10), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::R10), x86_64::CpuRegister(x86_64::RBX), x86_64::TIMES_4, 12));
-  GetAssembler()->popcntq(x86_64::CpuRegister(x86_64::RDI), x86_64::Address(
-      x86_64::CpuRegister(x86_64::RDI), x86_64::CpuRegister(x86_64::R9), x86_64::TIMES_4, 12));
-  const char* expected =
-    "popcntq 0xc(%RDI,%RBX,4), %R10\n"
-    "popcntq 0xc(%R10,%RBX,4), %RDI\n"
-    "popcntq 0xc(%RDI,%R9,4), %RDI\n";
-
-  DriverStr(expected, "popcntq_address");
+  DriverStr(RepeatRA(&x86_64::X86_64Assembler::popcntq, "popcntq {mem}, %{reg}"), "popcntq_address");
 }
 
 TEST_F(AssemblerX86_64Test, CmovlAddress) {
@@ -1921,7 +1815,6 @@
     "cmovzl 0xc(%RDI,%RBX,4), %R10d\n"
     "cmovnzl 0xc(%R10,%RBX,4), %edi\n"
     "cmovzl 0xc(%RDI,%R9,4), %edi\n";
-
   DriverStr(expected, "cmovl_address");
 }
 
@@ -1936,7 +1829,6 @@
     "cmovzq 0xc(%RDI,%RBX,4), %R10\n"
     "cmovnzq 0xc(%R10,%RBX,4), %rdi\n"
     "cmovzq 0xc(%RDI,%R9,4), %rdi\n";
-
   DriverStr(expected, "cmovq_address");
 }
 
@@ -2050,52 +1942,21 @@
 }
 
 TEST_F(AssemblerX86_64Test, Cmpb) {
-  GetAssembler()->cmpb(x86_64::Address(x86_64::CpuRegister(x86_64::RDI), 128),
-                       x86_64::Immediate(0));
-  const char* expected = "cmpb $0, 128(%RDI)\n";
-  DriverStr(expected, "cmpb");
+  DriverStr(RepeatAI(&x86_64::X86_64Assembler::cmpb,
+                     /*imm_bytes*/ 1U,
+                     "cmpb ${imm}, {mem}"), "cmpb");
 }
 
 TEST_F(AssemblerX86_64Test, TestbAddressImmediate) {
-  GetAssembler()->testb(
-      x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
-                      x86_64::CpuRegister(x86_64::RBX),
-                      x86_64::TIMES_4,
-                      12),
-      x86_64::Immediate(1));
-  GetAssembler()->testb(
-      x86_64::Address(x86_64::CpuRegister(x86_64::RSP), FrameOffset(7)),
-      x86_64::Immediate(-128));
-  GetAssembler()->testb(
-      x86_64::Address(x86_64::CpuRegister(x86_64::RBX), MemberOffset(130)),
-      x86_64::Immediate(127));
-  const char* expected =
-      "testb $1, 0xc(%RDI,%RBX,4)\n"
-      "testb $-128, 0x7(%RSP)\n"
-      "testb $127, 0x82(%RBX)\n";
-
-  DriverStr(expected, "TestbAddressImmediate");
+  DriverStr(RepeatAI(&x86_64::X86_64Assembler::testb,
+                     /*imm_bytes*/ 1U,
+                     "testb ${imm}, {mem}"), "testbi");
 }
 
 TEST_F(AssemblerX86_64Test, TestlAddressImmediate) {
-  GetAssembler()->testl(
-      x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
-                      x86_64::CpuRegister(x86_64::RBX),
-                      x86_64::TIMES_4,
-                      12),
-      x86_64::Immediate(1));
-  GetAssembler()->testl(
-      x86_64::Address(x86_64::CpuRegister(x86_64::RSP), FrameOffset(7)),
-      x86_64::Immediate(-100000));
-  GetAssembler()->testl(
-      x86_64::Address(x86_64::CpuRegister(x86_64::RBX), MemberOffset(130)),
-      x86_64::Immediate(77777777));
-  const char* expected =
-      "testl $1, 0xc(%RDI,%RBX,4)\n"
-      "testl $-100000, 0x7(%RSP)\n"
-      "testl $77777777, 0x82(%RBX)\n";
-
-  DriverStr(expected, "TestlAddressImmediate");
+  DriverStr(RepeatAI(&x86_64::X86_64Assembler::testl,
+                     /*imm_bytes*/ 4U,
+                     "testl ${imm}, {mem}"), "testli");
 }
 
 class JNIMacroAssemblerX86_64Test : public JNIMacroAssemblerTest<x86_64::X86_64JNIMacroAssembler> {
@@ -2150,15 +2011,15 @@
 
   // Construct assembly text counterpart.
   std::ostringstream str;
-  // 1) Push the spill_regs.
+  // (1) Push the spill_regs.
   str << "pushq %rsi\n";
   str << "pushq %r10\n";
-  // 2) Move down the stack pointer.
+  // (2) Move down the stack pointer.
   ssize_t displacement = static_cast<ssize_t>(frame_size) - (spill_regs.size() * 8 + 8);
   str << "subq $" << displacement << ", %rsp\n";
-  // 3) Store method reference.
+  // (3) Store method reference.
   str << "movq %rdi, (%rsp)\n";
-  // 4) Entry spills.
+  // (4) Entry spills.
   str << "movq %rax, " << frame_size + 0 << "(%rsp)\n";
   str << "movq %rbx, " << frame_size + 8 << "(%rsp)\n";
   str << "movsd %xmm1, " << frame_size + 16 << "(%rsp)\n";
@@ -2186,10 +2047,10 @@
 
   // Construct assembly text counterpart.
   std::ostringstream str;
-  // 1) Move up the stack pointer.
+  // (1) Move up the stack pointer.
   ssize_t displacement = static_cast<ssize_t>(frame_size) - spill_regs.size() * 8 - 8;
   str << "addq $" << displacement << ", %rsp\n";
-  // 2) Pop spill regs.
+  // (2) Pop spill regs.
   str << "popq %r10\n";
   str << "popq %rsi\n";
   str << "ret\n";
diff --git a/dex2oat/linker/image_test.h b/dex2oat/linker/image_test.h
index 71f1fa6..492c76b 100644
--- a/dex2oat/linker/image_test.h
+++ b/dex2oat/linker/image_test.h
@@ -302,8 +302,8 @@
       }
 
       for (size_t i = 0, size = oat_files.size(); i != size; ++i) {
-        linker::MultiOatRelativePatcher patcher(driver->GetInstructionSet(),
-                                                driver->GetInstructionSetFeatures());
+        MultiOatRelativePatcher patcher(driver->GetInstructionSet(),
+                                        driver->GetInstructionSetFeatures());
         OatWriter* const oat_writer = oat_writers[i].get();
         ElfWriter* const elf_writer = elf_writers[i].get();
         std::vector<const DexFile*> cur_dex_files(1u, class_path[i]);
diff --git a/dex2oat/linker/multi_oat_relative_patcher_test.cc b/dex2oat/linker/multi_oat_relative_patcher_test.cc
index 1b2d43e..ca9c5f1 100644
--- a/dex2oat/linker/multi_oat_relative_patcher_test.cc
+++ b/dex2oat/linker/multi_oat_relative_patcher_test.cc
@@ -19,6 +19,7 @@
 #include "compiled_method.h"
 #include "debug/method_debug_info.h"
 #include "gtest/gtest.h"
+#include "linker/linker_patch.h"
 #include "linker/vector_output_stream.h"
 
 namespace art {
diff --git a/dex2oat/linker/oat_writer.cc b/dex2oat/linker/oat_writer.cc
index 51c2a03..305d4f6 100644
--- a/dex2oat/linker/oat_writer.cc
+++ b/dex2oat/linker/oat_writer.cc
@@ -29,7 +29,7 @@
 #include "base/unix_file/fd_file.h"
 #include "class_linker.h"
 #include "class_table-inl.h"
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "debug/method_debug_info.h"
 #include "dex/verification_results.h"
 #include "dex_file-inl.h"
@@ -43,6 +43,7 @@
 #include "image_writer.h"
 #include "linker/buffered_output_stream.h"
 #include "linker/file_output_stream.h"
+#include "linker/linker_patch.h"
 #include "linker/method_bss_mapping_encoder.h"
 #include "linker/multi_oat_relative_patcher.h"
 #include "linker/output_stream.h"
@@ -1063,7 +1064,7 @@
   SafeMap<const CompiledMethod*, uint32_t, CodeOffsetsKeyComparator> dedupe_map_;
 
   // Cache writer_'s members and compiler options.
-  linker::MultiOatRelativePatcher* relative_patcher_;
+  MultiOatRelativePatcher* relative_patcher_;
   uint32_t executable_offset_;
   const bool debuggable_;
   const bool native_debuggable_;
@@ -1920,7 +1921,7 @@
       DCHECK_ALIGNED(offset, 4u);
       oat_dex_files_[i].method_bss_mapping_offset_ = offset;
 
-      linker::MethodBssMappingEncoder encoder(
+      MethodBssMappingEncoder encoder(
           GetInstructionSetPointerSize(oat_header_->GetInstructionSet()));
       size_t number_of_entries = 0u;
       bool first_index = true;
@@ -2593,7 +2594,7 @@
                     "MethodBssMapping alignment check.");
       DCHECK_ALIGNED(relative_offset, sizeof(uint32_t));
 
-      linker::MethodBssMappingEncoder encoder(
+      MethodBssMappingEncoder encoder(
           GetInstructionSetPointerSize(oat_header_->GetInstructionSet()));
       // Allocate a sufficiently large MethodBssMapping.
       size_t number_of_method_indexes = method_indexes.NumSetBits();
diff --git a/dex2oat/linker/oat_writer_test.cc b/dex2oat/linker/oat_writer_test.cc
index 33d1491..d89d9f0 100644
--- a/dex2oat/linker/oat_writer_test.cc
+++ b/dex2oat/linker/oat_writer_test.cc
@@ -23,7 +23,7 @@
 #include "base/unix_file/fd_file.h"
 #include "class_linker.h"
 #include "common_compiler_test.h"
-#include "compiled_method.h"
+#include "compiled_method-inl.h"
 #include "compiler.h"
 #include "debug/method_debug_info.h"
 #include "dex/quick_compiler_callbacks.h"
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 36bd4bc..be78136 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -36,6 +36,7 @@
 #include "base/unix_file/fd_file.h"
 #include "class_linker-inl.h"
 #include "class_linker.h"
+#include "compiled_method.h"
 #include "debug/elf_debug_writer.h"
 #include "debug/method_debug_info.h"
 #include "dex_file-inl.h"
diff --git a/runtime/oat_file_assistant.cc b/runtime/oat_file_assistant.cc
index 83a8e09..f3a0725 100644
--- a/runtime/oat_file_assistant.cc
+++ b/runtime/oat_file_assistant.cc
@@ -645,6 +645,30 @@
   return true;
 }
 
+// Owns a dex2oat output File (may be null) and unlinks it on destruction unless disabled.
+class Dex2oatFileWrapper {
+ public:
+  explicit Dex2oatFileWrapper(File* file)
+      : file_(file), unlink_file_at_destruction_(true) {}
+
+  // Erases and unlinks the file unless DisableUnlinkAtDestruction() was called.
+  ~Dex2oatFileWrapper() {
+    if (unlink_file_at_destruction_ && (file_ != nullptr)) {
+      file_->Erase(/*unlink*/ true);
+    }
+  }
+
+  // Returns the wrapped file without giving up ownership; null if none was supplied.
+  File* GetFile() { return file_.get(); }
+
+  // Cancels the unlink-on-destruction behavior so that the file is kept.
+  void DisableUnlinkAtDestruction() { unlink_file_at_destruction_ = false; }
+
+ private:
+  std::unique_ptr<File> file_;
+  bool unlink_file_at_destruction_;
+};
+
 OatFileAssistant::ResultOfAttemptToUpdate OatFileAssistant::GenerateOatFileNoChecks(
       OatFileAssistant::OatFileInfo& info,
       CompilerFilter::Filter filter,
@@ -690,8 +714,9 @@
       (dex_path_stat.st_mode & S_IRGRP) |
       (dex_path_stat.st_mode & S_IROTH);
 
-  std::unique_ptr<File> vdex_file(OS::CreateEmptyFile(vdex_file_name.c_str()));
-  if (vdex_file.get() == nullptr) {
+  Dex2oatFileWrapper vdex_file_wrapper(OS::CreateEmptyFile(vdex_file_name.c_str()));
+  File* vdex_file = vdex_file_wrapper.GetFile();
+  if (vdex_file == nullptr) {
     *error_msg = "Generation of oat file " + oat_file_name
       + " not attempted because the vdex file " + vdex_file_name
       + " could not be opened.";
@@ -705,8 +730,9 @@
     return kUpdateNotAttempted;
   }
 
-  std::unique_ptr<File> oat_file(OS::CreateEmptyFile(oat_file_name.c_str()));
-  if (oat_file.get() == nullptr) {
+  Dex2oatFileWrapper oat_file_wrapper(OS::CreateEmptyFile(oat_file_name.c_str()));
+  File* oat_file = oat_file_wrapper.GetFile();
+  if (oat_file == nullptr) {
     *error_msg = "Generation of oat file " + oat_file_name
       + " not attempted because the oat file could not be created.";
     return kUpdateNotAttempted;
@@ -715,7 +741,6 @@
   if (fchmod(oat_file->Fd(), file_mode) != 0) {
     *error_msg = "Generation of oat file " + oat_file_name
       + " not attempted because the oat file could not be made world readable.";
-    oat_file->Erase();
     return kUpdateNotAttempted;
   }
 
@@ -731,29 +756,25 @@
   args.push_back("--class-loader-context=" + dex2oat_context);
 
   if (!Dex2Oat(args, error_msg)) {
-    // Manually delete the oat and vdex files. This ensures there is no garbage
-    // left over if the process unexpectedly died.
-    vdex_file->Erase();
-    unlink(vdex_file_name.c_str());
-    oat_file->Erase();
-    unlink(oat_file_name.c_str());
     return kUpdateFailed;
   }
 
   if (vdex_file->FlushCloseOrErase() != 0) {
     *error_msg = "Unable to close vdex file " + vdex_file_name;
-    unlink(vdex_file_name.c_str());
     return kUpdateFailed;
   }
 
   if (oat_file->FlushCloseOrErase() != 0) {
     *error_msg = "Unable to close oat file " + oat_file_name;
-    unlink(oat_file_name.c_str());
     return kUpdateFailed;
   }
 
   // Mark that the odex file has changed and we should try to reload.
   info.Reset();
+  // We have compiled successfully. Disable the auto-unlink.
+  vdex_file_wrapper.DisableUnlinkAtDestruction();
+  oat_file_wrapper.DisableUnlinkAtDestruction();
+
   return kUpdateSucceeded;
 }
 
diff --git a/test/442-checker-constant-folding/smali/TestCmp.smali b/test/442-checker-constant-folding/smali/TestCmp.smali
index df631bc..f55c837 100644
--- a/test/442-checker-constant-folding/smali/TestCmp.smali
+++ b/test/442-checker-constant-folding/smali/TestCmp.smali
@@ -330,3 +330,141 @@
    cmpl-double v0, v1, v3
    return v0
 .end method
+
+
+##  CHECK-START: int TestCmp.IntAddition2() constant_folding (before)
+##  CHECK-DAG:     <<Const1:i\d+>>  IntConstant 1
+##  CHECK-DAG:     <<Const2:i\d+>>  IntConstant 2
+##  CHECK-DAG:     <<Const5:i\d+>>  IntConstant 5
+##  CHECK-DAG:     <<Const6:i\d+>>  IntConstant 6
+##  CHECK-DAG:     <<Add1:i\d+>>    Add [<<Const1>>,<<Const2>>]
+##  CHECK-DAG:     <<Add2:i\d+>>    Add [<<Const5>>,<<Const6>>]
+##  CHECK-DAG:     <<Add3:i\d+>>    Add [<<Add1>>,<<Add2>>]
+##  CHECK-DAG:                      Return [<<Add3>>]
+
+##  CHECK-START: int TestCmp.IntAddition2() constant_folding (after)
+##  CHECK-DAG:     <<Const14:i\d+>> IntConstant 14
+##  CHECK-DAG:                      Return [<<Const14>>]
+
+##  CHECK-START: int TestCmp.IntAddition2() constant_folding (after)
+##  CHECK-NOT:                      Add
+.method public static IntAddition2()I
+    # A more direct translation from Java.
+
+    # int a, b, c;
+    .registers 3
+
+    # a = 1;
+    const/4 v0, 1
+    # b = 2;
+    const/4 v1, 2
+
+    # a += b;
+    add-int/2addr v0, v1
+
+    # b = 5;
+    const/4 v1, 5
+    # c = 6;
+    const/4 v2, 6
+
+    # b += c;
+    add-int/2addr v1, v2
+    # c = a + b;
+    add-int v2, v0, v1
+
+    # return c;
+    return v2
+.end method
+
+
+##  CHECK-START: int TestCmp.IntAddition2AddAndMove() constant_folding (before)
+##  CHECK-DAG:     <<Const1:i\d+>>  IntConstant 1
+##  CHECK-DAG:     <<Const2:i\d+>>  IntConstant 2
+##  CHECK-DAG:     <<Const5:i\d+>>  IntConstant 5
+##  CHECK-DAG:     <<Const6:i\d+>>  IntConstant 6
+##  CHECK-DAG:     <<Add1:i\d+>>    Add [<<Const1>>,<<Const2>>]
+##  CHECK-DAG:     <<Add2:i\d+>>    Add [<<Const5>>,<<Const6>>]
+##  CHECK-DAG:     <<Add3:i\d+>>    Add [<<Add1>>,<<Add2>>]
+##  CHECK-DAG:                      Return [<<Add3>>]
+
+##  CHECK-START: int TestCmp.IntAddition2AddAndMove() constant_folding (after)
+##  CHECK-DAG:     <<Const14:i\d+>> IntConstant 14
+##  CHECK-DAG:                      Return [<<Const14>>]
+
+##  CHECK-START: int TestCmp.IntAddition2AddAndMove() constant_folding (after)
+##  CHECK-NOT:                      Add
+
+#   D8 uses 3 registers for += when local variable info is present.
+.method public static IntAddition2AddAndMove()I
+    .registers 4
+
+    # a = 1;
+    const/4 v0, 1
+    # b = 2;
+    const/4 v1, 2
+
+    # a += b;
+    add-int v2, v0, v1
+    move v0, v2
+
+    # b = 5;
+    const/4 v2, 5
+    move v1, v2
+
+    # c = 6;
+    const/4 v2, 6
+
+    # b += c;
+    add-int v3, v1, v2
+    move v1, v3
+
+    # c = a + b;
+    add-int v3, v0, v1
+    move v2, v3
+
+    # return c;
+    return v2
+.end method
+
+
+## CHECK-START: int TestCmp.JumpsAndConditionals(boolean) constant_folding (before)
+## CHECK-DAG:     <<Const2:i\d+>>  IntConstant 2
+## CHECK-DAG:     <<Const5:i\d+>>  IntConstant 5
+## CHECK-DAG:     <<Add:i\d+>>     Add [<<Const5>>,<<Const2>>]
+## CHECK-DAG:     <<Sub:i\d+>>     Sub [<<Const5>>,<<Const2>>]
+## CHECK-DAG:     <<Phi:i\d+>>     Phi [<<Add>>,<<Sub>>]
+## CHECK-DAG:                      Return [<<Phi>>]
+
+## CHECK-START: int TestCmp.JumpsAndConditionals(boolean) constant_folding (after)
+## CHECK-DAG:     <<Const3:i\d+>>  IntConstant 3
+## CHECK-DAG:     <<Const7:i\d+>>  IntConstant 7
+## CHECK-DAG:     <<Phi:i\d+>>     Phi [<<Const7>>,<<Const3>>]
+## CHECK-DAG:                      Return [<<Phi>>]
+
+## CHECK-START: int TestCmp.JumpsAndConditionals(boolean) constant_folding (after)
+## CHECK-NOT:                      Add
+## CHECK-NOT:                      Sub
+.method public static JumpsAndConditionals(Z)I
+    # int a, b, c;
+    # a = 5;
+    # b = 2;
+    # if (cond)
+    #   c = a + b;
+    # else
+    #   c = a - b;
+    # return c;
+    .registers 4
+
+    const/4 v0, 5
+    const/4 v1, 2
+
+    if-eqz p0, :cond_7
+    add-int v2, v0, v1
+
+    :goto_6
+    return v2
+
+    :cond_7
+    sub-int v2, v0, v1
+    goto :goto_6
+.end method
diff --git a/test/442-checker-constant-folding/src/Main.java b/test/442-checker-constant-folding/src/Main.java
index eba5137..95c19ea 100644
--- a/test/442-checker-constant-folding/src/Main.java
+++ b/test/442-checker-constant-folding/src/Main.java
@@ -113,6 +113,19 @@
     return (Integer)m.invoke(null);
   }
 
+  public static int smaliIntAddition2() throws Exception {
+    Method m = Class.forName("TestCmp").getMethod("IntAddition2");
+    return (Integer)m.invoke(null);
+  }
+  public static int smaliIntAddition2AddAndMove() throws Exception {
+    Method m = Class.forName("TestCmp").getMethod("IntAddition2AddAndMove");
+    return (Integer)m.invoke(null);
+  }
+  public static int smaliJumpsAndConditionals(boolean cond) throws Exception {
+    Method m = Class.forName("TestCmp").getMethod("JumpsAndConditionals", boolean.class);
+    return (Integer)m.invoke(null, cond);
+  }
+
 
   /**
    * Exercise constant folding on negation.
@@ -225,11 +238,8 @@
   /// CHECK-DAG:     <<Const2:i\d+>>  IntConstant 2
   /// CHECK-DAG:     <<Const5:i\d+>>  IntConstant 5
   /// CHECK-DAG:     <<Const6:i\d+>>  IntConstant 6
-  /// CHECK-DAG:     <<Const11:i\d+>> IntConstant 11
   /// CHECK-DAG:     <<Add1:i\d+>>    Add [<<Const1>>,<<Const2>>]
   /// CHECK-DAG:                      Add [<<Const5>>,<<Const6>>]
-  /// CHECK-DAG:     <<Add3:i\d+>>    Add [<<Add1>>,<<Const11>>]
-  /// CHECK-DAG:                      Return [<<Add3>>]
 
   /// CHECK-START: int Main.IntAddition2() constant_folding (after)
   /// CHECK-DAG:     <<Const14:i\d+>> IntConstant 14
@@ -1520,6 +1530,8 @@
 
     assertIntEquals(3, IntAddition1());
     assertIntEquals(14, IntAddition2());
+    assertIntEquals(14, smaliIntAddition2());
+    assertIntEquals(14, smaliIntAddition2AddAndMove());
     assertLongEquals(3L, LongAddition());
     assertFloatEquals(3F, FloatAddition());
     assertDoubleEquals(3D, DoubleAddition());
@@ -1567,6 +1579,8 @@
 
     assertIntEquals(7, JumpsAndConditionals(true));
     assertIntEquals(3, JumpsAndConditionals(false));
+    assertIntEquals(7, smaliJumpsAndConditionals(true));
+    assertIntEquals(3, smaliJumpsAndConditionals(false));
 
     int arbitrary = 123456;  // Value chosen arbitrarily.
 
diff --git a/test/552-checker-sharpening/src/Main.java b/test/552-checker-sharpening/src/Main.java
index 1f1920c..55873ea 100644
--- a/test/552-checker-sharpening/src/Main.java
+++ b/test/552-checker-sharpening/src/Main.java
@@ -63,16 +63,12 @@
   /// CHECK-START-X86_64: int Main.testSimple(int) sharpening (after)
   /// CHECK:                InvokeStaticOrDirect method_load_kind:BssEntry
 
-  /// CHECK-START-MIPS: int Main.testSimple(int) pc_relative_fixups_mips (after)
-  /// CHECK:                MipsComputeBaseMethodAddress
-  /// CHECK-NOT:            MipsComputeBaseMethodAddress
-
   /// CHECK-START-X86: int Main.testSimple(int) pc_relative_fixups_x86 (after)
   /// CHECK:                X86ComputeBaseMethodAddress
   /// CHECK-NOT:            X86ComputeBaseMethodAddress
 
   public static int testSimple(int x) {
-    // This call should use PC-relative dex cache array load to retrieve the target method.
+    // This call should use PC-relative .bss array load to retrieve the target method.
     return $noinline$foo(x);
   }
 
@@ -104,14 +100,6 @@
   /// CHECK:                InvokeStaticOrDirect method_load_kind:BssEntry
   /// CHECK:                InvokeStaticOrDirect method_load_kind:BssEntry
 
-  /// CHECK-START-MIPS: int Main.testDiamond(boolean, int) pc_relative_fixups_mips (after)
-  /// CHECK:                MipsComputeBaseMethodAddress
-  /// CHECK-NOT:            MipsComputeBaseMethodAddress
-
-  /// CHECK-START-MIPS: int Main.testDiamond(boolean, int) pc_relative_fixups_mips (after)
-  /// CHECK:                MipsComputeBaseMethodAddress
-  /// CHECK-NEXT:           If
-
   /// CHECK-START-X86: int Main.testDiamond(boolean, int) pc_relative_fixups_x86 (after)
   /// CHECK:                X86ComputeBaseMethodAddress
   /// CHECK-NOT:            X86ComputeBaseMethodAddress
@@ -122,7 +110,7 @@
 
   public static int testDiamond(boolean negate, int x) {
     // These calls should use PC-relative loads to retrieve the target method.
-    // PC-relative bases used by MIPS and X86 should be pulled before the If.
+    // PC-relative bases used by MIPS32R2 and X86 should be pulled before the If.
     if (negate) {
       return $noinline$foo(-x);
     } else {
@@ -130,24 +118,6 @@
     }
   }
 
-  /// CHECK-START-MIPS: int Main.testLoop(int[], int) pc_relative_fixups_mips (before)
-  /// CHECK-NOT:            MipsComputeBaseMethodAddress
-
-  /// CHECK-START-MIPS: int Main.testLoop(int[], int) pc_relative_fixups_mips (after)
-  /// CHECK:                MipsComputeBaseMethodAddress
-  /// CHECK-NOT:            MipsComputeBaseMethodAddress
-
-  /// CHECK-START-MIPS: int Main.testLoop(int[], int) pc_relative_fixups_mips (after)
-  /// CHECK:                InvokeStaticOrDirect
-  /// CHECK-NOT:            InvokeStaticOrDirect
-
-  /// CHECK-START-MIPS: int Main.testLoop(int[], int) pc_relative_fixups_mips (after)
-  /// CHECK:                ArrayLength
-  /// CHECK-NEXT:           MipsComputeBaseMethodAddress
-  /// CHECK-NEXT:           Goto
-  /// CHECK:                begin_block
-  /// CHECK:                InvokeStaticOrDirect method_load_kind:BssEntry
-
   /// CHECK-START-X86: int Main.testLoop(int[], int) pc_relative_fixups_x86 (before)
   /// CHECK-NOT:            X86ComputeBaseMethodAddress
 
@@ -167,23 +137,13 @@
   /// CHECK:                InvokeStaticOrDirect method_load_kind:BssEntry
 
   public static int testLoop(int[] array, int x) {
-    // PC-relative bases used by MIPS and X86 should be pulled before the loop.
+    // PC-relative bases used by MIPS32R2 and X86 should be pulled before the loop.
     for (int i : array) {
       x += $noinline$foo(i);
     }
     return x;
   }
 
-  /// CHECK-START-MIPS: int Main.testLoopWithDiamond(int[], boolean, int) pc_relative_fixups_mips (before)
-  /// CHECK-NOT:            MipsComputeBaseMethodAddress
-
-  /// CHECK-START-MIPS: int Main.testLoopWithDiamond(int[], boolean, int) pc_relative_fixups_mips (after)
-  /// CHECK:                If
-  /// CHECK:                begin_block
-  /// CHECK:                ArrayLength
-  /// CHECK-NEXT:           MipsComputeBaseMethodAddress
-  /// CHECK-NEXT:           Goto
-
   /// CHECK-START-X86: int Main.testLoopWithDiamond(int[], boolean, int) pc_relative_fixups_x86 (before)
   /// CHECK-NOT:            X86ComputeBaseMethodAddress
 
@@ -195,7 +155,7 @@
   /// CHECK-NEXT:           Goto
 
   public static int testLoopWithDiamond(int[] array, boolean negate, int x) {
-    // PC-relative bases used by MIPS and X86 should be pulled before the loop
+    // PC-relative bases used by MIPS32R2 and X86 should be pulled before the loop
     // but not outside the if.
     if (array != null) {
       for (int i : array) {
diff --git a/test/651-checker-byte-simd-minmax/src/Main.java b/test/651-checker-byte-simd-minmax/src/Main.java
index e018b56..9643b90 100644
--- a/test/651-checker-byte-simd-minmax/src/Main.java
+++ b/test/651-checker-byte-simd-minmax/src/Main.java
@@ -165,6 +165,28 @@
     }
   }
 
+  /// CHECK-START: void Main.doitMin100(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                     loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:b\d+>>  ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<Get>>,<<I100>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:b\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMin100(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                     loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]       loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] unsigned:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin100(byte[] x, byte[] y) {  // x[i] = min(y[i], 100).
+    int min = Math.min(x.length, y.length);  // Stay within both arrays.
+    for (int i = 0; i < min; i++) {
+      x[i] = (byte) Math.min(y[i], 100);  // y[i] is sign-extended to int before the min.
+    }
+  }
+
   public static void main(String[] args) {
     // Initialize cross-values for all possible values.
     int total = 256 * 256;
@@ -202,6 +224,11 @@
       byte expected = (byte) Math.max(y[i] & 0xff, z[i] & 0xff);
       expectEquals(expected, x[i]);
     }
+    doitMin100(x, y);
+    for (int i = 0; i < total; i++) {
+      byte expected = (byte) Math.min(y[i], 100);
+      expectEquals(expected, x[i]);
+    }
 
     System.out.println("passed");
   }
diff --git a/test/651-checker-char-simd-minmax/src/Main.java b/test/651-checker-char-simd-minmax/src/Main.java
index 57cad9b..8a0262c 100644
--- a/test/651-checker-char-simd-minmax/src/Main.java
+++ b/test/651-checker-char-simd-minmax/src/Main.java
@@ -89,6 +89,28 @@
     }
   }
 
+  /// CHECK-START: void Main.doitMin100(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                     loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:c\d+>>  ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<Get>>,<<I100>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:c\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMin100(char[], char[]) loop_optimization (after)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                     loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]       loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] unsigned:true loop:<<Loop>>  outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin100(char[] x, char[] y) {  // x[i] = min(y[i], 100).
+    int min = Math.min(x.length, y.length);  // Stay within both arrays.
+    for (int i = 0; i < min; i++) {
+      x[i] = (char) Math.min(y[i], 100);  // y[i] is zero-extended to int before the min.
+    }
+  }
+
   public static void main(String[] args) {
     char[] interesting = {
       0x0000, 0x0001, 0x007f, 0x0080, 0x0081, 0x00ff,
@@ -124,6 +146,11 @@
       char expected = (char) Math.max(y[i], z[i]);
       expectEquals(expected, x[i]);
     }
+    doitMin100(x, y);
+    for (int i = 0; i < total; i++) {
+      char expected = (char) Math.min(y[i], 100);
+      expectEquals(expected, x[i]);
+    }
 
     System.out.println("passed");
   }
diff --git a/test/651-checker-short-simd-minmax/src/Main.java b/test/651-checker-short-simd-minmax/src/Main.java
index 4f2a7a4..ffbf73b 100644
--- a/test/651-checker-short-simd-minmax/src/Main.java
+++ b/test/651-checker-short-simd-minmax/src/Main.java
@@ -165,6 +165,28 @@
     }
   }
 
+  /// CHECK-START: void Main.doitMin100(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                     loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:s\d+>>  ArrayGet                            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:i\d+>>  InvokeStaticOrDirect [<<Get>>,<<I100>>] intrinsic:MathMinIntInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:s\d+>>  TypeConversion [<<Min>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.doitMin100(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<I100:i\d+>> IntConstant 100                     loop:none
+  /// CHECK-DAG: <<Repl:d\d+>> VecReplicateScalar [<<I100>>]       loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>  Phi                                 loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:d\d+>>  VecLoad                             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Min:d\d+>>  VecMin [<<Get>>,<<Repl>>] unsigned:false loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:               VecStore [{{l\d+}},<<Phi>>,<<Min>>] loop:<<Loop>>      outer_loop:none
+  private static void doitMin100(short[] x, short[] y) {  // x[i] = min(y[i], 100).
+    int min = Math.min(x.length, y.length);  // Stay within both arrays.
+    for (int i = 0; i < min; i++) {
+      x[i] = (short) Math.min(y[i], 100);  // y[i] is sign-extended to int before the min.
+    }
+  }
+
   public static void main(String[] args) {
     short[] interesting = {
       (short) 0x0000, (short) 0x0001, (short) 0x007f,
@@ -216,6 +238,11 @@
       short expected = (short) Math.max(y[i] & 0xffff, z[i] & 0xffff);
       expectEquals(expected, x[i]);
     }
+    doitMin100(x, y);
+    for (int i = 0; i < total; i++) {
+      short expected = (short) Math.min(y[i], 100);
+      expectEquals(expected, x[i]);
+    }
 
     System.out.println("passed");
   }
diff --git a/test/656-checker-simd-opt/src/Main.java b/test/656-checker-simd-opt/src/Main.java
index 091633f..39a126f 100644
--- a/test/656-checker-simd-opt/src/Main.java
+++ b/test/656-checker-simd-opt/src/Main.java
@@ -92,7 +92,91 @@
     }
   }
 
-  public static void main(String[] args) {
+  /// CHECK-START: long Main.longInductionReduction(long[]) loop_optimization (before)
+  /// CHECK-DAG: <<L0:j\d+>>    LongConstant 0             loop:none
+  /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1             loop:none
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0              loop:none
+  /// CHECK-DAG: <<Get:j\d+>>   ArrayGet [{{l\d+}},<<I0>>] loop:none
+  /// CHECK-DAG: <<Phi1:j\d+>>  Phi [<<L0>>,<<Add1:j\d+>>] loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>  Phi [<<L1>>,<<Add2:j\d+>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add2>>       Add [<<Phi2>>,<<Get>>]     loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add1>>       Add [<<Phi1>>,<<L1>>]      loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.longInductionReduction(long[]) loop_optimization (after)
+  /// CHECK-DAG: <<L0:j\d+>>    LongConstant 0               loop:none
+  /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1               loop:none
+  /// CHECK-DAG: <<L2:j\d+>>    LongConstant 2               loop:none
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                loop:none
+  /// CHECK-DAG: <<Get:j\d+>>   ArrayGet [{{l\d+}},<<I0>>]   loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Get>>] loop:none
+  /// CHECK-DAG: <<Set:d\d+>>   VecSetScalars [<<L1>>]       loop:none
+  /// CHECK-DAG: <<Phi1:j\d+>>  Phi [<<L0>>,{{j\d+}}]        loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>  Phi [<<Set>>,{{d\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                VecAdd [<<Phi2>>,<<Rep>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                Add [<<Phi1>>,<<L2>>]        loop:<<Loop>>      outer_loop:none
+  static long longInductionReduction(long[] y) {  // Returns 1 + 10 * y[0].
+    long x = 1;  // The reduction starts at 1, not 0.
+    for (long i = 0; i < 10; i++) {  // The induction variable is a long.
+      x += y[0];  // Same element every iteration; i is never used as an index.
+    }
+    return x;
+  }
+
+  /// CHECK-START: void Main.intVectorLongInvariant(int[], long[]) loop_optimization (before)
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
+  /// CHECK-DAG: <<I1:i\d+>>    IntConstant 1                       loop:none
+  /// CHECK-DAG: <<Get:j\d+>>   ArrayGet [{{l\d+}},<<I0>>]          loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,<<Add:i\d+>>]           loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Cnv:i\d+>>   TypeConversion [<<Get>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                ArraySet [{{l\d+}},<<Phi>>,<<Cnv>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add>>        Add [<<Phi>>,<<I1>>]                loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.intVectorLongInvariant(int[], long[]) loop_optimization (after)
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
+  /// CHECK-DAG: <<I1:i\d+>>    IntConstant 1                       loop:none
+  /// CHECK-DAG: <<I4:i\d+>>    IntConstant 4                       loop:none
+  /// CHECK-DAG: <<Get:j\d+>>   ArrayGet [{{l\d+}},<<I0>>]          loop:none
+  /// CHECK-DAG: <<Cnv:i\d+>>   TypeConversion [<<Get>>]            loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Cnv>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,{{i\d+}}]               loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:                VecStore [{{l\d+}},<<Phi>>,<<Rep>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                Add [<<Phi>>,<<I4>>]                loop:<<Loop>>      outer_loop:none
+  static void intVectorLongInvariant(int[] x, long[] y) {  // Fills x[0..99] with (int) y[0].
+    for (int i = 0; i < 100; i++) {  // Fixed trip count: x needs at least 100 elements.
+      x[i] = (int) y[0];  // Narrows the same long element on every iteration.
+    }
+  }
+
+  /// CHECK-START: void Main.longCanBeDoneWithInt(int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                        loop:none
+  /// CHECK-DAG: <<I1:i\d+>>    IntConstant 1                        loop:none
+  /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1                       loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,<<Add:i\d+>>]            loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>   ArrayGet [{{l\d+}},<<Phi>>]          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>  TypeConversion [<<Get>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<AddL:j\d+>>  Add [<<Cnv1>>,<<L1>>]                loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:i\d+>>  TypeConversion [<<AddL>>]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                ArraySet [{{l\d+}},<<Phi>>,<<Cnv2>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add>>        Add [<<Phi>>,<<I1>>]                 loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: void Main.longCanBeDoneWithInt(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<I0:i\d+>>    IntConstant 0                       loop:none
+  /// CHECK-DAG: <<I4:i\d+>>    IntConstant 4                       loop:none
+  /// CHECK-DAG: <<L1:j\d+>>    LongConstant 1                      loop:none
+  /// CHECK-DAG: <<Cnv:i\d+>>   TypeConversion [<<L1>>]             loop:none
+  /// CHECK-DAG: <<Rep:d\d+>>   VecReplicateScalar [<<Cnv>>]        loop:none
+  /// CHECK-DAG: <<Phi:i\d+>>   Phi [<<I0>>,{{i\d+}}]               loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Load:d\d+>>  VecLoad [{{l\d+}},<<Phi>>]          loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Add:d\d+>>   VecAdd [<<Load>>,<<Rep>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                VecStore [{{l\d+}},<<Phi>>,<<Add>>] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                Add [<<Phi>>,<<I4>>]                loop:<<Loop>>      outer_loop:none
+  static void longCanBeDoneWithInt(int[] x, int[] y) {  // x[i] = y[i] + 1 for i in 0..99.
+    for (int i = 0; i < 100; i++) {  // Fixed trip count: both arrays need 100 elements.
+      x[i] = (int) (y[i] + 1L);  // Long add truncated to int: same low 32 bits as an int add.
+    }
+  }
+
+  static void testUnroll() {
     float[] x = new float[100];
     float[] y = new float[100];
     for (int i = 0; i < 100; i++) {
@@ -104,51 +188,89 @@
       expectEquals(5.0f, x[i]);
       expectEquals(2.0f, y[i]);
     }
-    {
-      int[] a = new int[100];
-      int[] b = new int[100];
-      for (int i = 0; i < 100; i++) {
-        a[i] = 0;
-        b[i] = i;
-      }
-      stencil(a, b, 100);
-      for (int i = 1; i < 99; i++) {
-        int e = i + i + i;
-        expectEquals(e, a[i]);
-        expectEquals(i, b[i]);
-      }
+  }
+
+  static void testStencil1() {
+    int[] a = new int[100];
+    int[] b = new int[100];
+    for (int i = 0; i < 100; i++) {
+      a[i] = 0;
+      b[i] = i;
     }
-    {
-      int[] a = new int[100];
-      int[] b = new int[100];
-      for (int i = 0; i < 100; i++) {
-        a[i] = 0;
-        b[i] = i;
-      }
-      stencilSubInt(a, b, 100);
-      for (int i = 1; i < 99; i++) {
-        int e = i + i + i;
-        expectEquals(e, a[i]);
-        expectEquals(i, b[i]);
-      }
+    stencil(a, b, 100);
+    for (int i = 1; i < 99; i++) {
+      int e = i + i + i;
+      expectEquals(e, a[i]);
+      expectEquals(i, b[i]);
     }
-    {
-      int[] a = new int[100];
-      int[] b = new int[100];
-      for (int i = 0; i < 100; i++) {
-        a[i] = 0;
-        b[i] = i;
-      }
-      stencilAddInt(a, b, 100);
-      for (int i = 1; i < 99; i++) {
-        int e = i + i + i;
-        expectEquals(e, a[i]);
-        expectEquals(i, b[i]);
-      }
+  }
+
+  static void testStencil2() {
+    int[] a = new int[100];
+    int[] b = new int[100];
+    for (int i = 0; i < 100; i++) {
+      a[i] = 0;
+      b[i] = i;
     }
+    stencilSubInt(a, b, 100);
+    for (int i = 1; i < 99; i++) {
+      int e = i + i + i;
+      expectEquals(e, a[i]);
+      expectEquals(i, b[i]);
+    }
+  }
+
+  static void testStencil3() {
+    int[] a = new int[100];
+    int[] b = new int[100];
+    for (int i = 0; i < 100; i++) {
+      a[i] = 0;
+      b[i] = i;
+    }
+    stencilAddInt(a, b, 100);
+    for (int i = 1; i < 99; i++) {
+      int e = i + i + i;
+      expectEquals(e, a[i]);
+      expectEquals(i, b[i]);
+    }
+  }
+
+  static void testTypes() {  // Runs the three mixed int/long kernels and checks their results.
+    long[] l = { 3 };
+    int[] a = new int[100];
+    int[] b = new int[100];
+    expectEquals(31, longInductionReduction(l));
+    intVectorLongInvariant(a, l);
+    for (int value : a) {
+      expectEquals(3, value);
+    }
+    longCanBeDoneWithInt(b, a);
+    for (int value : b) {
+      expectEquals(4, value);
+    }
+  }
+
+  public static void main(String[] args) {
+    testUnroll();
+    testStencil1();
+    testStencil2();
+    testStencil3();
+    testTypes();
     System.out.println("passed");
   }
 
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
   private static void expectEquals(float expected, float result) {
     if (expected != result) {
       throw new Error("Expected: " + expected + ", found: " + result);
diff --git a/test/660-checker-simd-sad-byte/expected.txt b/test/660-checker-simd-sad-byte/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/660-checker-simd-sad-byte/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/660-checker-simd-sad-byte/info.txt b/test/660-checker-simd-sad-byte/info.txt
new file mode 100644
index 0000000..b56c119
--- /dev/null
+++ b/test/660-checker-simd-sad-byte/info.txt
@@ -0,0 +1 @@
+Functional tests on SAD vectorization.
diff --git a/test/660-checker-simd-sad-byte/src/Main.java b/test/660-checker-simd-sad-byte/src/Main.java
new file mode 100644
index 0000000..72d1c24
--- /dev/null
+++ b/test/660-checker-simd-sad-byte/src/Main.java
@@ -0,0 +1,332 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for SAD (sum of absolute differences).
+ */
+public class Main {
+
+  // TODO: lower precision still coming, b/64091002
+
+  private static byte sadByte2Byte(byte[] b1, byte[] b2) {  // SAD via Math.abs, byte sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    byte sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(b1[i] - b2[i]);  // int difference; += narrows the sum back to byte.
+    }
+    return sad;
+  }
+
+  private static byte sadByte2ByteAlt(byte[] b1, byte[] b2) {  // SAD via ternary, byte sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    byte sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      byte s = b1[i];
+      byte p = b2[i];
+      sad += s >= p ? s - p : p - s;  // Larger minus smaller; += narrows the sum to byte.
+    }
+    return sad;
+  }
+
+  private static byte sadByte2ByteAlt2(byte[] b1, byte[] b2) {  // SAD via if-negate, byte sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    byte sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      byte s = b1[i];
+      byte p = b2[i];
+      int x = s - p;  // Difference computed as an int.
+      if (x < 0) x = -x;  // Manual absolute value of the difference.
+      sad += x;  // Compound assignment narrows the sum back to byte.
+    }
+    return sad;
+  }
+
+  private static short sadByte2Short(byte[] b1, byte[] b2) {  // SAD via Math.abs, short sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    short sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(b1[i] - b2[i]);  // int difference; += narrows the sum back to short.
+    }
+    return sad;
+  }
+
+  private static short sadByte2ShortAlt(byte[] b1, byte[] b2) {  // SAD via ternary, short sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    short sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      byte s = b1[i];
+      byte p = b2[i];
+      sad += s >= p ? s - p : p - s;  // Larger minus smaller; += narrows the sum to short.
+    }
+    return sad;
+  }
+
+  private static short sadByte2ShortAlt2(byte[] b1, byte[] b2) {  // SAD via if-negate, short sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    short sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      byte s = b1[i];
+      byte p = b2[i];
+      int x = s - p;  // Difference computed as an int.
+      if (x < 0) x = -x;  // Manual absolute value of the difference.
+      sad += x;  // Compound assignment narrows the sum back to short.
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadByte2Int(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadByte2Int(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  private static int sadByte2Int(byte[] b1, byte[] b2) {  // SAD via Math.abs, int sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    int sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(b1[i] - b2[i]);  // Adds |b1[i] - b2[i]|; no narrowing with an int sum.
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadByte2IntAlt(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get2>>,<<Get1>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadByte2IntAlt(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  private static int sadByte2IntAlt(byte[] b1, byte[] b2) {  // SAD via ternary, int sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    int sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      byte s = b1[i];
+      byte p = b2[i];
+      sad += s >= p ? s - p : p - s;  // Larger minus smaller, i.e. |s - p| as an int.
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadByte2IntAlt2(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadByte2IntAlt2(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  private static int sadByte2IntAlt2(byte[] b1, byte[] b2) {  // SAD via if-negate, int sum.
+    int min_length = Math.min(b1.length, b2.length);  // Only the common prefix is compared.
+    int sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      byte s = b1[i];
+      byte p = b2[i];
+      int x = s - p;  // Difference computed as an int.
+      if (x < 0) x = -x;  // Manual absolute value of the difference.
+      sad += x;
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadByte2Long(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>   TypeConversion [<<Get1>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:j\d+>>   TypeConversion [<<Get2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Cnv1>>,<<Cnv2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadByte2Long(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  private static long sadByte2Long(byte[] b1, byte[] b2) {
+    int min_length = Math.min(b1.length, b2.length);  // compare only the common prefix
+    long sad = 0;  // running sum of absolute differences, accumulated as long
+    for (int i = 0; i < min_length; i++) {
+      long x = b1[i];  // widen each byte to long first, so the subtraction is done in long
+      long y = b2[i];
+      sad += Math.abs(x - y);
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadByte2LongAt1(byte[], byte[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:b\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>   TypeConversion [<<Get1>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:j\d+>>   TypeConversion [<<Get2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Cnv1>>,<<Cnv2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadByte2LongAt1(byte[], byte[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons16:i\d+>> IntConstant 16                 loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons16>>]      loop:<<Loop>>      outer_loop:none
+  private static long sadByte2LongAt1(byte[] b1, byte[] b2) {
+    int min_length = Math.min(b1.length, b2.length);  // compare only the common prefix
+    long sad = 1;  // accumulator starts at 1 rather than 0
+    for (int i = 0; i < min_length; i++) {
+      long x = b1[i];  // widen each byte to long first, so the subtraction is done in long
+      long y = b2[i];
+      sad += Math.abs(x - y);
+    }
+    return sad;
+  }
+
+  public static void main(String[] args) {
+    // Cross-test the two most extreme byte values (-128 vs. 127), in both argument orders.
+    byte[] b1 = { 0, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    byte[] b2 = { 0,  127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    expectEquals(-1, sadByte2Byte(b1, b2));
+    expectEquals(-1, sadByte2Byte(b2, b1));
+    expectEquals(-1, sadByte2ByteAlt(b1, b2));
+    expectEquals(-1, sadByte2ByteAlt(b2, b1));
+    expectEquals(-1, sadByte2ByteAlt2(b1, b2));
+    expectEquals(-1, sadByte2ByteAlt2(b2, b1));
+    expectEquals(255, sadByte2Short(b1, b2));
+    expectEquals(255, sadByte2Short(b2, b1));
+    expectEquals(255, sadByte2ShortAlt(b1, b2));
+    expectEquals(255, sadByte2ShortAlt(b2, b1));
+    expectEquals(255, sadByte2ShortAlt2(b1, b2));
+    expectEquals(255, sadByte2ShortAlt2(b2, b1));
+    expectEquals(255, sadByte2Int(b1, b2));
+    expectEquals(255, sadByte2Int(b2, b1));
+    expectEquals(255, sadByte2IntAlt(b1, b2));
+    expectEquals(255, sadByte2IntAlt(b2, b1));
+    expectEquals(255, sadByte2IntAlt2(b1, b2));
+    expectEquals(255, sadByte2IntAlt2(b2, b1));
+    expectEquals(255L, sadByte2Long(b1, b2));
+    expectEquals(255L, sadByte2Long(b2, b1));
+    expectEquals(256L, sadByte2LongAt1(b1, b2));
+    expectEquals(256L, sadByte2LongAt1(b2, b1));
+
+    // Use cross-values to test all cases: every pair of the 256 possible byte values,
+    // plus one extra trailing element (10 vs. 2) for scalar cleanup.
+    int n = 256;
+    int m = n * n + 1;
+    int k = 0;
+    b1 = new byte[m];
+    b2 = new byte[m];
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < n; j++) {
+        b1[k] = (byte) i;
+        b2[k] = (byte) j;
+        k++;
+      }
+    }
+    b1[k] = 10;
+    b2[k] = 2;
+    expectEquals(8, sadByte2Byte(b1, b2));
+    expectEquals(8, sadByte2ByteAlt(b1, b2));
+    expectEquals(8, sadByte2ByteAlt2(b1, b2));
+    expectEquals(21768, sadByte2Short(b1, b2));
+    expectEquals(21768, sadByte2ShortAlt(b1, b2));
+    expectEquals(21768, sadByte2ShortAlt2(b1, b2));
+    expectEquals(5592328, sadByte2Int(b1, b2));
+    expectEquals(5592328, sadByte2IntAlt(b1, b2));
+    expectEquals(5592328, sadByte2IntAlt2(b1, b2));
+    expectEquals(5592328L, sadByte2Long(b1, b2));
+    expectEquals(5592329L, sadByte2LongAt1(b1, b2));
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected == result) return;  // values agree: nothing to report
+    // Any mismatch aborts the test with both values in the message.
+    throw new Error("Expected: " + expected + ", found: " + result);
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected == result) return;  // values agree: nothing to report
+    // Any mismatch aborts the test with both values in the message.
+    throw new Error("Expected: " + expected + ", found: " + result);
+  }
+}
diff --git a/test/660-checker-simd-sad-char/expected.txt b/test/660-checker-simd-sad-char/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/660-checker-simd-sad-char/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/660-checker-simd-sad-char/info.txt b/test/660-checker-simd-sad-char/info.txt
new file mode 100644
index 0000000..b56c119
--- /dev/null
+++ b/test/660-checker-simd-sad-char/info.txt
@@ -0,0 +1 @@
+Functional tests on SAD vectorization.
diff --git a/test/660-checker-simd-sad-char/src/Main.java b/test/660-checker-simd-sad-char/src/Main.java
new file mode 100644
index 0000000..bb0c58f
--- /dev/null
+++ b/test/660-checker-simd-sad-char/src/Main.java
@@ -0,0 +1,259 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for SAD (sum of absolute differences).
+ */
+public class Main {
+
+  // TODO: lower precision still coming, b/64091002
+
+  // TODO: consider unsigned SAD too, b/64091002
+
+  private static char sadShort2Short(char[] s1, char[] s2) {
+    int min_length = Math.min(s1.length, s2.length);  // compare only the common prefix
+    char sad = 0;  // 16-bit accumulator
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(s1[i] - s2[i]);  // compound assignment narrows the int sum back to char
+    }
+    return sad;
+  }
+
+  private static char sadShort2ShortAlt(char[] s1, char[] s2) {
+    int min_length = Math.min(s1.length, s2.length);  // compare only the common prefix
+    char sad = 0;  // 16-bit accumulator
+    for (int i = 0; i < min_length; i++) {
+      char s = s1[i];
+      char p = s2[i];
+      sad += s >= p ? s - p : p - s;  // absolute difference via a ternary, narrowed to char
+    }
+    return sad;
+  }
+
+  private static char sadShort2ShortAlt2(char[] s1, char[] s2) {
+    int min_length = Math.min(s1.length, s2.length);  // compare only the common prefix
+    char sad = 0;  // 16-bit accumulator
+    for (int i = 0; i < min_length; i++) {
+      char s = s1[i];
+      char p = s2[i];
+      int x = s - p;  // char operands are promoted to int before subtracting
+      if (x < 0) x = -x;  // open-coded absolute value instead of Math.abs()
+      sad += x;  // compound assignment narrows the int sum back to char
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadShort2Int(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadShort2Int(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT: VecSADAccumulate
+  private static int sadShort2Int(char[] s1, char[] s2) {
+    int min_length = Math.min(s1.length, s2.length);  // compare only the common prefix
+    int sad = 0;  // running sum of absolute differences
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(s1[i] - s2[i]);  // chars are promoted to int, so the difference is exact
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadShort2IntAlt(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get2>>,<<Get1>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadShort2IntAlt(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT: VecSADAccumulate
+  private static int sadShort2IntAlt(char[] s1, char[] s2) {
+    int min_length = Math.min(s1.length, s2.length);  // compare only the common prefix
+    int sad = 0;  // running sum of absolute differences
+    for (int i = 0; i < min_length; i++) {
+      char s = s1[i];
+      char p = s2[i];
+      sad += s >= p ? s - p : p - s;  // absolute difference written as a ternary
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadShort2IntAlt2(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadShort2IntAlt2(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT: VecSADAccumulate
+  private static int sadShort2IntAlt2(char[] s1, char[] s2) {
+    int min_length = Math.min(s1.length, s2.length);  // compare only the common prefix
+    int sad = 0;  // running sum of absolute differences
+    for (int i = 0; i < min_length; i++) {
+      char s = s1[i];
+      char p = s2[i];
+      int x = s - p;  // char operands are promoted to int before subtracting
+      if (x < 0) x = -x;  // open-coded absolute value instead of Math.abs()
+      sad += x;
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadShort2Long(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>   TypeConversion [<<Get1>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:j\d+>>   TypeConversion [<<Get2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Cnv1>>,<<Cnv2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadShort2Long(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT: VecSADAccumulate
+  private static long sadShort2Long(char[] s1, char[] s2) {
+    int min_length = Math.min(s1.length, s2.length);  // compare only the common prefix
+    long sad = 0;  // running sum of absolute differences, accumulated as long
+    for (int i = 0; i < min_length; i++) {
+      long x = s1[i];  // widen each char to long first, so the subtraction is done in long
+      long y = s2[i];
+      sad += Math.abs(x - y);
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadShort2LongAt1(char[], char[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:c\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>   TypeConversion [<<Get1>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:j\d+>>   TypeConversion [<<Get2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Cnv1>>,<<Cnv2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadShort2LongAt1(char[], char[]) loop_optimization (after)
+  /// CHECK-NOT: VecSADAccumulate
+  private static long sadShort2LongAt1(char[] s1, char[] s2) {
+    int min_length = Math.min(s1.length, s2.length);  // compare only the common prefix
+    long sad = 1;  // accumulator starts at 1 rather than 0
+    for (int i = 0; i < min_length; i++) {
+      long x = s1[i];  // widen each char to long first, so the subtraction is done in long
+      long y = s2[i];
+      sad += Math.abs(x - y);
+    }
+    return sad;
+  }
+
+  public static void main(String[] args) {
+    // Start with one pair of boundary values (0x8000 vs. 0x7fff), in both argument orders.
+    char[] lhs = { 0, 0x8000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    char[] rhs = { 0, 0x7fff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    expectEquals(1, sadShort2Short(lhs, rhs));
+    expectEquals(1, sadShort2Short(rhs, lhs));
+    expectEquals(1, sadShort2ShortAlt(lhs, rhs));
+    expectEquals(1, sadShort2ShortAlt(rhs, lhs));
+    expectEquals(1, sadShort2ShortAlt2(lhs, rhs));
+    expectEquals(1, sadShort2ShortAlt2(rhs, lhs));
+    expectEquals(1, sadShort2Int(lhs, rhs));
+    expectEquals(1, sadShort2Int(rhs, lhs));
+    expectEquals(1, sadShort2IntAlt(lhs, rhs));
+    expectEquals(1, sadShort2IntAlt(rhs, lhs));
+    expectEquals(1, sadShort2IntAlt2(lhs, rhs));
+    expectEquals(1, sadShort2IntAlt2(rhs, lhs));
+    expectEquals(1L, sadShort2Long(lhs, rhs));
+    expectEquals(1L, sadShort2Long(rhs, lhs));
+    expectEquals(2L, sadShort2LongAt1(lhs, rhs));
+    expectEquals(2L, sadShort2LongAt1(rhs, lhs));
+
+    // Then pair every sample value with every sample value (cross-values), plus one extra pair.
+    char[] samples = {
+      (char) 0x0000,
+      (char) 0x0001,
+      (char) 0x0002,
+      (char) 0x1234,
+      (char) 0x8000,
+      (char) 0x8001,
+      (char) 0x7fff,
+      (char) 0xffff
+    };
+    int count = samples.length;
+    int total = count * count + 1;
+    lhs = new char[total];
+    rhs = new char[total];
+    int next = 0;
+    for (char first : samples) {
+      for (char second : samples) {
+        lhs[next] = first;
+        rhs[next] = second;
+        next++;
+      }
+    }
+    lhs[next] = 10;
+    rhs[next] = 2;
+    expectEquals(56196, sadShort2Short(lhs, rhs));
+    expectEquals(56196, sadShort2ShortAlt(lhs, rhs));
+    expectEquals(56196, sadShort2ShortAlt2(lhs, rhs));
+    expectEquals(1497988, sadShort2Int(lhs, rhs));
+    expectEquals(1497988, sadShort2IntAlt(lhs, rhs));
+    expectEquals(1497988, sadShort2IntAlt2(lhs, rhs));
+    expectEquals(1497988L, sadShort2Long(lhs, rhs));
+    expectEquals(1497989L, sadShort2LongAt1(lhs, rhs));
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected == result) return;  // values agree: nothing to report
+    // Any mismatch aborts the test with both values in the message.
+    throw new Error("Expected: " + expected + ", found: " + result);
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected == result) return;  // values agree: nothing to report
+    // Any mismatch aborts the test with both values in the message.
+    throw new Error("Expected: " + expected + ", found: " + result);
+  }
+}
diff --git a/test/660-checker-simd-sad-int/expected.txt b/test/660-checker-simd-sad-int/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/660-checker-simd-sad-int/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/660-checker-simd-sad-int/info.txt b/test/660-checker-simd-sad-int/info.txt
new file mode 100644
index 0000000..b56c119
--- /dev/null
+++ b/test/660-checker-simd-sad-int/info.txt
@@ -0,0 +1 @@
+Functional tests on SAD vectorization.
diff --git a/test/660-checker-simd-sad-int/src/Main.java b/test/660-checker-simd-sad-int/src/Main.java
new file mode 100644
index 0000000..0daeedd
--- /dev/null
+++ b/test/660-checker-simd-sad-int/src/Main.java
@@ -0,0 +1,248 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for SAD (sum of absolute differences).
+ */
+public class Main {
+
+  /// CHECK-START: int Main.sadInt2Int(int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadInt2Int(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
+  private static int sadInt2Int(int[] x, int[] y) {
+    int min_length = Math.min(x.length, y.length);  // compare only the common prefix
+    int sad = 0;  // running sum of absolute differences
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(x[i] - y[i]);  // the int subtraction may wrap around before abs is taken
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadInt2IntAlt(int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                       loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                       loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]            loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub1:i\d+>>   Sub [<<Get2>>,<<Get1>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub2:i\d+>>   Sub [<<Get1>>,<<Get2>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Select:i\d+>> Select [<<Sub2>>,<<Sub1>>,{{z\d+}}] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Select>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]            loop:<<Loop>>      outer_loop:none
+  //
+  // The ternary shows up as a Select rather than an ABS intrinsic. No ABS? No SAD!
+  //
+  /// CHECK-START-ARM64: int Main.sadInt2IntAlt(int[], int[]) loop_optimization (after)
+  /// CHECK-NOT: VecSADAccumulate
+  private static int sadInt2IntAlt(int[] x, int[] y) {
+    int min_length = Math.min(x.length, y.length);  // compare only the common prefix
+    int sad = 0;  // running sum of the per-element differences
+    for (int i = 0; i < min_length; i++) {
+      int s = x[i];
+      int p = y[i];
+      sad += s >= p ? s - p : p - s;  // ternary form; the chosen subtraction may wrap around
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadInt2IntAlt2(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
+  private static int sadInt2IntAlt2(int[] x, int[] y) {
+    int min_length = Math.min(x.length, y.length);  // compare only the common prefix
+    int sad = 0;  // running sum of absolute differences
+    for (int i = 0; i < min_length; i++) {
+      int s = x[i];
+      int p = y[i];
+      int m = s - p;  // the int subtraction may wrap around
+      if (m < 0) m = -m;  // open-coded absolute value instead of Math.abs()
+      sad += m;
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadInt2Long(int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>   TypeConversion [<<Get1>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:j\d+>>   TypeConversion [<<Get2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Cnv1>>,<<Cnv2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadInt2Long(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
+  private static long sadInt2Long(int[] x, int[] y) {
+    int min_length = Math.min(x.length, y.length);  // compare only the common prefix
+    long sad = 0;  // running sum of absolute differences, accumulated as long
+    for (int i = 0; i < min_length; i++) {
+      long s = x[i];  // widen each int to long first, so the difference cannot overflow
+      long p = y[i];
+      sad += Math.abs(s - p);
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadInt2LongAt1(int[], int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>   TypeConversion [<<Get1>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:j\d+>>   TypeConversion [<<Get2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Cnv1>>,<<Cnv2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadInt2LongAt1(int[], int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]       loop:<<Loop>>      outer_loop:none
+  private static long sadInt2LongAt1(int[] x, int[] y) {
+    int min_length = Math.min(x.length, y.length);  // compare only the common prefix
+    long sad = 1;  // accumulator starts at 1 rather than 0
+    for (int i = 0; i < min_length; i++) {
+      long s = x[i];  // widen each int to long first, so the difference cannot overflow
+      long p = y[i];
+      sad += Math.abs(s - p);
+    }
+    return sad;
+  }
+
+  public static void main(String[] args) {
+    // Start with the two extreme int values against each other, in both argument orders.
+    int[] lhs = { 0, Integer.MAX_VALUE, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    int[] rhs = { 0, Integer.MIN_VALUE, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    expectEquals(1, sadInt2Int(lhs, rhs));
+    expectEquals(1, sadInt2Int(rhs, lhs));
+    expectEquals(-1, sadInt2IntAlt(lhs, rhs));
+    expectEquals(-1, sadInt2IntAlt(rhs, lhs));
+    expectEquals(1, sadInt2IntAlt2(lhs, rhs));
+    expectEquals(1, sadInt2IntAlt2(rhs, lhs));
+    expectEquals(4294967295L, sadInt2Long(lhs, rhs));
+    expectEquals(4294967295L, sadInt2Long(rhs, lhs));
+    expectEquals(4294967296L, sadInt2LongAt1(lhs, rhs));
+    expectEquals(4294967296L, sadInt2LongAt1(rhs, lhs));
+
+    // Then pair every sample value with every sample value (cross-values), plus one extra pair.
+    int[] samples = {
+      0x00000000, 0x00000001, 0x00007fff, 0x00008000, 0x00008001, 0x0000ffff,
+      0x00010000, 0x00010001, 0x00017fff, 0x00018000, 0x00018001, 0x0001ffff,
+      0x7fff0000, 0x7fff0001, 0x7fff7fff, 0x7fff8000, 0x7fff8001, 0x7fffffff,
+      0x80000000, 0x80000001, 0x80007fff, 0x80008000, 0x80008001, 0x8000ffff,
+      0x80010000, 0x80010001, 0x80017fff, 0x80018000, 0x80018001, 0x8001ffff,
+      0xffff0000, 0xffff0001, 0xffff7fff, 0xffff8000, 0xffff8001, 0xffffffff
+    };
+    int count = samples.length;
+    int total = count * count + 1;
+    lhs = new int[total];
+    rhs = new int[total];
+    int next = 0;
+    for (int first : samples) {
+      for (int second : samples) {
+        lhs[next] = first;
+        rhs[next] = second;
+        next++;
+      }
+    }
+    lhs[next] = 10;
+    rhs[next] = 2;
+    expectEquals(8, sadInt2Int(lhs, rhs));
+    expectEquals(-13762600, sadInt2IntAlt(lhs, rhs));
+    expectEquals(8, sadInt2IntAlt2(lhs, rhs));
+    expectEquals(2010030931928L, sadInt2Long(lhs, rhs));
+    expectEquals(2010030931929L, sadInt2LongAt1(lhs, rhs));
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected == result) return;  // values agree: nothing to report
+    // Any mismatch aborts the test with both values in the message.
+    throw new Error("Expected: " + expected + ", found: " + result);
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected == result) return;  // values agree: nothing to report
+    // Any mismatch aborts the test with both values in the message.
+    throw new Error("Expected: " + expected + ", found: " + result);
+  }
+}
diff --git a/test/660-checker-simd-sad-long/expected.txt b/test/660-checker-simd-sad-long/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/660-checker-simd-sad-long/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/660-checker-simd-sad-long/info.txt b/test/660-checker-simd-sad-long/info.txt
new file mode 100644
index 0000000..b56c119
--- /dev/null
+++ b/test/660-checker-simd-sad-long/info.txt
@@ -0,0 +1 @@
+Functional tests on SAD vectorization.
diff --git a/test/660-checker-simd-sad-long/src/Main.java b/test/660-checker-simd-sad-long/src/Main.java
new file mode 100644
index 0000000..06f62bd
--- /dev/null
+++ b/test/660-checker-simd-sad-long/src/Main.java
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for SAD (sum of absolute differences).
+ */
+public class Main {
+
+  /// CHECK-START: long Main.sadLong2Long(long[], long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:j\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:j\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadLong2Long(long[], long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
+  private static long sadLong2Long(long[] x, long[] y) {  // sums Math.abs(x[i] - y[i])
+    int min_length = Math.min(x.length, y.length);  // visit only the common prefix
+    long sad = 0;  // running sum, accumulated as a long
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(x[i] - y[i]);  // long subtraction: may wrap before abs is taken
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadLong2LongAlt(long[], long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                       loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                       loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]            loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]            loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:j\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:j\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub1:j\d+>>   Sub [<<Get2>>,<<Get1>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub2:j\d+>>   Sub [<<Get1>>,<<Get2>>]             loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Select:j\d+>> Select [<<Sub2>>,<<Sub1>>,{{z\d+}}] loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Select>>]           loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]            loop:<<Loop>>      outer_loop:none
+  //
+  // No ABS? No SAD!
+  //
+  /// CHECK-START: long Main.sadLong2LongAlt(long[], long[]) loop_optimization (after)
+  /// CHECK-NOT: VecSADAccumulate
+  private static long sadLong2LongAlt(long[] x, long[] y) {  // sums (larger - smaller)
+    int min_length = Math.min(x.length, y.length);  // visit only the common prefix
+    long sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      long s = x[i];
+      long p = y[i];
+      sad += s >= p ? s - p : p - s;  // no Math.abs here; a wrapped difference stays negative
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadLong2LongAlt2(long[], long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:j\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:j\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadLong2LongAlt2(long[], long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
+  private static long sadLong2LongAlt2(long[] x, long[] y) {  // sums hand-negated differences
+    int min_length = Math.min(x.length, y.length);  // visit only the common prefix
+    long sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      long s = x[i];
+      long p = y[i];
+      long m = s - p;  // long subtraction: may wrap
+      if (m < 0) m = -m;  // negate by hand instead of calling Math.abs
+      sad += m;
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadLong2LongAt1(long[], long[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:j\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:j\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadLong2LongAt1(long[], long[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons2:i\d+>>  IntConstant 2                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons2>>]       loop:<<Loop>>      outer_loop:none
+  private static long sadLong2LongAt1(long[] x, long[] y) {  // sadLong2Long with a seed of 1
+    int min_length = Math.min(x.length, y.length);  // visit only the common prefix
+    long sad = 1;  // starts at 1, so the result is the summed differences plus one
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(x[i] - y[i]);  // long subtraction: may wrap before abs is taken
+    }
+    return sad;
+  }
+
+  public static void main(String[] args) {
+    // The arrays agree everywhere except slot 1, which pairs the two extreme values.
+    long[] x = { 0, Long.MIN_VALUE, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    long[] y = { 0, Long.MAX_VALUE, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    expectEquals(1L, sadLong2Long(x, y));
+    expectEquals(1L, sadLong2Long(y, x));
+    expectEquals(-1L, sadLong2LongAlt(x, y));
+    expectEquals(-1L, sadLong2LongAlt(y, x));
+    expectEquals(1L, sadLong2LongAlt2(x, y));
+    expectEquals(1L, sadLong2LongAlt2(y, x));
+    expectEquals(2L, sadLong2LongAt1(x, y));
+    expectEquals(2L, sadLong2LongAt1(y, x));
+
+    // Pair every corner value with every corner value (full cross product).
+    final long[] corners = {
+      0x0000000000000000L, 0x0000000000000001L, 0x000000007fffffffL,
+      0x0000000080000000L, 0x0000000080000001L, 0x00000000ffffffffL,
+      0x0000000100000000L, 0x0000000100000001L, 0x000000017fffffffL,
+      0x0000000180000000L, 0x0000000180000001L, 0x00000001ffffffffL,
+      0x7fffffff00000000L, 0x7fffffff00000001L, 0x7fffffff7fffffffL,
+      0x7fffffff80000000L, 0x7fffffff80000001L, 0x7fffffffffffffffL,
+      0x8000000000000000L, 0x8000000000000001L, 0x800000007fffffffL,
+      0x8000000080000000L, 0x8000000080000001L, 0x80000000ffffffffL,
+      0x8000000100000000L, 0x8000000100000001L, 0x800000017fffffffL,
+      0x8000000180000000L, 0x8000000180000001L, 0x80000001ffffffffL,
+      0xffffffff00000000L, 0xffffffff00000001L, 0xffffffff7fffffffL,
+      0xffffffff80000000L, 0xffffffff80000001L, 0xffffffffffffffffL
+    };
+    final int count = corners.length;
+    final int total = count * count + 1;
+    x = new long[total];
+    y = new long[total];
+    for (int row = 0; row < count; row++) {
+      final int base = row * count;  // first slot belonging to this row
+      for (int col = 0; col < count; col++) {
+        x[base + col] = corners[row];
+        y[base + col] = corners[col];
+      }
+    }
+    // The single slot past the grid holds an ordinary pair.
+    x[total - 1] = 10;
+    y[total - 1] = 2;
+    expectEquals(8L, sadLong2Long(x, y));
+    expectEquals(-901943132200L, sadLong2LongAlt(x, y));
+    expectEquals(8L, sadLong2LongAlt2(x, y));
+    expectEquals(9L, sadLong2LongAt1(x, y));
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
diff --git a/test/660-checker-simd-sad-short/expected.txt b/test/660-checker-simd-sad-short/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/660-checker-simd-sad-short/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/660-checker-simd-sad-short/info.txt b/test/660-checker-simd-sad-short/info.txt
new file mode 100644
index 0000000..b56c119
--- /dev/null
+++ b/test/660-checker-simd-sad-short/info.txt
@@ -0,0 +1 @@
+Functional tests on SAD vectorization.
diff --git a/test/660-checker-simd-sad-short/src/Main.java b/test/660-checker-simd-sad-short/src/Main.java
new file mode 100644
index 0000000..d94308e
--- /dev/null
+++ b/test/660-checker-simd-sad-short/src/Main.java
@@ -0,0 +1,299 @@
+/*
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tests for SAD (sum of absolute differences).
+ */
+public class Main {
+
+  // TODO: lower precision still coming, b/64091002
+
+  private static short sadShort2Short(short[] s1, short[] s2) {
+    int min_length = Math.min(s1.length, s2.length);
+    short sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(s1[i] - s2[i]);
+    }
+    return sad;
+  }
+
+  private static short sadShort2ShortAlt(short[] s1, short[] s2) {
+    int min_length = Math.min(s1.length, s2.length);
+    short sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      short s = s1[i];
+      short p = s2[i];
+      sad += s >= p ? s - p : p - s;
+    }
+    return sad;
+  }
+
+  private static short sadShort2ShortAlt2(short[] s1, short[] s2) {
+    int min_length = Math.min(s1.length, s2.length);
+    short sad = 0;
+    for (int i = 0; i < min_length; i++) {
+      short s = s1[i];
+      short p = s2[i];
+      int x = s - p;
+      if (x < 0) x = -x;
+      sad += x;
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadShort2Int(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadShort2Int(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  private static int sadShort2Int(short[] s1, short[] s2) {  // sums Math.abs(s1[i] - s2[i])
+    int min_length = Math.min(s1.length, s2.length);  // visit only the common prefix
+    int sad = 0;  // running sum, accumulated as an int
+    for (int i = 0; i < min_length; i++) {
+      sad += Math.abs(s1[i] - s2[i]);  // shorts are promoted to int before subtracting
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadShort2IntAlt(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get2>>,<<Get1>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadShort2IntAlt(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load2>>,<<Load1>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  private static int sadShort2IntAlt(short[] s1, short[] s2) {  // sums (larger - smaller)
+    int min_length = Math.min(s1.length, s2.length);  // visit only the common prefix
+    int sad = 0;  // running sum, accumulated as an int
+    for (int i = 0; i < min_length; i++) {
+      short s = s1[i];
+      short p = s2[i];
+      sad += s >= p ? s - p : p - s;  // conditional in place of Math.abs; int arithmetic
+    }
+    return sad;
+  }
+
+  /// CHECK-START: int Main.sadShort2IntAlt2(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:i\d+>>    Sub [<<Get1>>,<<Get2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:i\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsInt loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: int Main.sadShort2IntAlt2(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  private static int sadShort2IntAlt2(short[] s1, short[] s2) {  // sums hand-negated differences
+    int min_length = Math.min(s1.length, s2.length);  // visit only the common prefix
+    int sad = 0;  // running sum, accumulated as an int
+    for (int i = 0; i < min_length; i++) {
+      short s = s1[i];
+      short p = s2[i];
+      int x = s - p;  // shorts are promoted to int before subtracting
+      if (x < 0) x = -x;  // negate by hand instead of calling Math.abs
+      sad += x;
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadShort2Long(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>   TypeConversion [<<Get1>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:j\d+>>   TypeConversion [<<Get2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Cnv1>>,<<Cnv2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadShort2Long(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 0                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  private static long sadShort2Long(short[] s1, short[] s2) {  // sums Math.abs of long differences
+    int min_length = Math.min(s1.length, s2.length);  // visit only the common prefix
+    long sad = 0;  // running sum, accumulated as a long
+    for (int i = 0; i < min_length; i++) {
+      long x = s1[i];  // widened to long, so the subtraction below is 64-bit
+      long y = s2[i];
+      sad += Math.abs(x - y);
+    }
+    return sad;
+  }
+
+  /// CHECK-START: long Main.sadShort2LongAt1(short[], short[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:j\d+>>   Phi [<<ConsL>>,{{j\d+}}]       loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:s\d+>>   ArrayGet [{{l\d+}},<<Phi1>>]   loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv1:j\d+>>   TypeConversion [<<Get1>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Cnv2:j\d+>>   TypeConversion [<<Get2>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Sub:j\d+>>    Sub [<<Cnv1>>,<<Cnv2>>]        loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Intrin:j\d+>> InvokeStaticOrDirect [<<Sub>>] intrinsic:MathAbsLong loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Intrin>>]      loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]       loop:<<Loop>>      outer_loop:none
+  //
+  /// CHECK-START-ARM64: long Main.sadShort2LongAt1(short[], short[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                  loop:none
+  /// CHECK-DAG: <<Cons8:i\d+>>  IntConstant 8                  loop:none
+  /// CHECK-DAG: <<ConsL:j\d+>>  LongConstant 1                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<ConsL>>]      loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]       loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]    loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<SAD:d\d+>>    VecSADAccumulate [<<Phi2>>,<<Load1>>,<<Load2>>] loop:<<Loop>> outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons8>>]       loop:<<Loop>>      outer_loop:none
+  private static long sadShort2LongAt1(short[] s1, short[] s2) {  // sadShort2Long seeded with 1
+    int min_length = Math.min(s1.length, s2.length);  // visit only the common prefix
+    long sad = 1;  // starts at 1, so the result is the summed differences plus one
+    for (int i = 0; i < min_length; i++) {
+      long x = s1[i];  // widened to long, so the subtraction below is 64-bit
+      long y = s2[i];
+      sad += Math.abs(x - y);
+    }
+    return sad;
+  }
+
+  public static void main(String[] args) {
+    // The arrays agree everywhere except slot 1, which pairs the two extreme values.
+    short[] s1 = { 0, -32768, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    short[] s2 = { 0,  32767, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+    expectEquals(-1, sadShort2Short(s1, s2));
+    expectEquals(-1, sadShort2Short(s2, s1));
+    expectEquals(-1, sadShort2ShortAlt(s1, s2));
+    expectEquals(-1, sadShort2ShortAlt(s2, s1));
+    expectEquals(-1, sadShort2ShortAlt2(s1, s2));
+    expectEquals(-1, sadShort2ShortAlt2(s2, s1));
+    expectEquals(65535, sadShort2Int(s1, s2));
+    expectEquals(65535, sadShort2Int(s2, s1));
+    expectEquals(65535, sadShort2IntAlt(s1, s2));
+    expectEquals(65535, sadShort2IntAlt(s2, s1));
+    expectEquals(65535, sadShort2IntAlt2(s1, s2));
+    expectEquals(65535, sadShort2IntAlt2(s2, s1));
+    expectEquals(65535L, sadShort2Long(s1, s2));
+    expectEquals(65535L, sadShort2Long(s2, s1));
+    expectEquals(65536L, sadShort2LongAt1(s1, s2));
+    expectEquals(65536L, sadShort2LongAt1(s2, s1));
+
+    // Pair every corner value with every corner value (full cross product).
+    final short[] corners = {
+      (short) 0x0000,
+      (short) 0x0001,
+      (short) 0x0002,
+      (short) 0x1234,
+      (short) 0x8000,
+      (short) 0x8001,
+      (short) 0x7fff,
+      (short) 0xffff
+    };
+    final int count = corners.length;
+    final int total = count * count + 1;
+    s1 = new short[total];
+    s2 = new short[total];
+    for (int row = 0; row < count; row++) {
+      final int base = row * count;  // first slot belonging to this row
+      for (int col = 0; col < count; col++) {
+        s1[base + col] = corners[row];
+        s2[base + col] = corners[col];
+      }
+    }
+    // The single slot past the grid holds an ordinary pair.
+    s1[total - 1] = 10;
+    s2[total - 1] = 2;
+    expectEquals(-18932, sadShort2Short(s1, s2));
+    expectEquals(-18932, sadShort2ShortAlt(s1, s2));
+    expectEquals(-18932, sadShort2ShortAlt2(s1, s2));
+    expectEquals(1291788, sadShort2Int(s1, s2));
+    expectEquals(1291788, sadShort2IntAlt(s1, s2));
+    expectEquals(1291788, sadShort2IntAlt2(s1, s2));
+    expectEquals(1291788L, sadShort2Long(s1, s2));
+    expectEquals(1291789L, sadShort2LongAt1(s1, s2));
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+
+  private static void expectEquals(long expected, long result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
diff --git a/test/661-checker-simd-reduc/src/Main.java b/test/661-checker-simd-reduc/src/Main.java
index 71eb3cd..bcfa968 100644
--- a/test/661-checker-simd-reduc/src/Main.java
+++ b/test/661-checker-simd-reduc/src/Main.java
@@ -80,6 +80,101 @@
     return sum;
   }
 
+  /// CHECK-START: int Main.reductionIntChain() loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons1>>,{{i\d+}}]      loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Get1:i\d+>>   ArrayGet [{{l\d+}},<<Phi2>>]  loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Get1>>]       loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Cons1>>]      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi4:i\d+>>   Phi [<<Phi1>>,{{i\d+}}]       loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Get2:i\d+>>   ArrayGet [{{l\d+}},<<Phi3>>]  loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi4>>,<<Get2>>]       loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi3>>,<<Cons1>>]      loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:                 Return [<<Phi4>>]             loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  /// CHECK-START-ARM64: int Main.reductionIntChain() loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
+  /// CHECK-DAG: <<Set1:d\d+>>   VecSetScalars [<<Cons1>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set1>>,{{d\d+}}]       loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load1>>]   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Red1:d\d+>>   VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr1:i\d+>>  VecExtractScalar [<<Red1>>]   loop:none
+  /// CHECK-DAG: <<Set2:d\d+>>   VecSetScalars [<<Extr1>>]     loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi4:d\d+>>   Phi [<<Set2>>,{{d\d+}}]       loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Load2:d\d+>>  VecLoad [{{l\d+}},<<Phi3>>]   loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi4>>,<<Load2>>]   loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi3>>,<<Cons4>>]      loop:<<Loop2>>      outer_loop:none
+  /// CHECK-DAG: <<Red2:d\d+>>   VecReduce [<<Phi4>>]          loop:none
+  /// CHECK-DAG: <<Extr2:i\d+>>  VecExtractScalar [<<Red2>>]   loop:none
+  /// CHECK-DAG:                 Return [<<Extr2>>]            loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  // NOTE: pattern is robust with respect to vector loop unrolling.
+  private static int reductionIntChain() {  // Chains two sum reductions through r.
+    int[] x = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
+    int r = 1;  // Accumulator is seeded with 1 rather than 0.
+    for (int i = 0; i < 16; i++) {  // First reduction: adds every element of x to r.
+      r += x[i];
+    }
+    for (int i = 0; i < 16; i++) {  // Second reduction: keeps accumulating into the same r.
+      r += x[i];
+    }
+    return r;  // 1 + 136 + 136 = 273.
+  }
+
+  /// CHECK-START: int Main.reductionIntToLoop(int[]) loop_optimization (before)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Get:i\d+>>    ArrayGet [{{l\d+}},<<Phi1>>]  loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi2>>,<<Get>>]        loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons1>>]      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>   Phi [<<Phi2>>,{{i\d+}}]       loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi4:i\d+>>   Phi [<<Phi2>>,{{i\d+}}]       loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  /// CHECK-START-ARM64: int Main.reductionIntToLoop(int[]) loop_optimization (after)
+  /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
+  /// CHECK-DAG: <<Cons1:i\d+>>  IntConstant 1                 loop:none
+  /// CHECK-DAG: <<Cons4:i\d+>>  IntConstant 4                 loop:none
+  /// CHECK-DAG: <<Set:d\d+>>    VecSetScalars [<<Cons0>>]     loop:none
+  /// CHECK-DAG: <<Phi1:i\d+>>   Phi [<<Cons0>>,{{i\d+}}]      loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:d\d+>>   Phi [<<Set>>,{{d\d+}}]        loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Load1:d\d+>>  VecLoad [{{l\d+}},<<Phi1>>]   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 VecAdd [<<Phi2>>,<<Load1>>]   loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG:                 Add [<<Phi1>>,<<Cons4>>]      loop:<<Loop1>>      outer_loop:none
+  /// CHECK-DAG: <<Red:d\d+>>    VecReduce [<<Phi2>>]          loop:none
+  /// CHECK-DAG: <<Extr:i\d+>>   VecExtractScalar [<<Red>>]    loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>>   Phi [<<Extr>>,{{i\d+}}]       loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi4:i\d+>>   Phi [<<Extr>>,{{i\d+}}]       loop:<<Loop2>>      outer_loop:none
+  //
+  /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
+  //
+  private static int reductionIntToLoop(int[] x) {  // Reduction result feeds a later loop.
+    int r = 0;
+    for (int i = 0; i < 4; i++) {  // Sums x[0] through x[3] into r.
+      r += x[i];
+    }
+    for (int i = r; i < 16; i++) {  // The reduced sum is the starting index of this loop.
+      r += i;  // r keeps accumulating, now adding the index values themselves.
+    }
+    return r;
+  }
+
   /// CHECK-START: long Main.reductionLong(long[]) loop_optimization (before)
   /// CHECK-DAG: <<Cons0:i\d+>>  IntConstant 0                 loop:none
   /// CHECK-DAG: <<Long0:j\d+>>  LongConstant 0                loop:none
@@ -468,10 +563,28 @@
     }
 
     // Test various reductions in loops.
+    int[] x0 = { 0, 0, 0, 0 };
+    int[] x1 = { 0, 0, 0, 1 };
+    int[] x2 = { 1, 1, 1, 1 };
     expectEquals(-74, reductionByte(xb));
     expectEquals(-27466, reductionShort(xs));
     expectEquals(38070, reductionChar(xc));
     expectEquals(365750, reductionInt(xi));
+    expectEquals(273, reductionIntChain());
+    expectEquals(120, reductionIntToLoop(x0));
+    expectEquals(121, reductionIntToLoop(x1));
+    expectEquals(118, reductionIntToLoop(x2));
+    expectEquals(-1205, reductionIntToLoop(xi));
+    expectEquals(365750L, reductionLong(xl));
+    expectEquals(-75, reductionByteM1(xb));
+    expectEquals(-27467, reductionShortM1(xs));
+    expectEquals(38069, reductionCharM1(xc));
+    expectEquals(365749, reductionIntM1(xi));
+    expectEquals(365749L, reductionLongM1(xl));
+    expectEquals(74, reductionMinusByte(xb));
+    expectEquals(27466, reductionMinusShort(xs));
+    expectEquals(27466, reductionMinusChar(xc));
+    expectEquals(-365750, reductionMinusInt(xi));
     expectEquals(365750L, reductionLong(xl));
     expectEquals(-75, reductionByteM1(xb));
     expectEquals(-27467, reductionShortM1(xs));