arm64: Implement VarHandle intrinsics for byte array views.

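The intrinsics cover accesses through byte array view VarHandles
created by java.lang.invoke.MethodHandles.byteArrayViewVarHandle(),
for example (illustrative snippet, not taken from the benchmark):

    VarHandle vh =
        MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.BIG_ENDIAN);
    int value = (int) vh.get(byteArray, byteIndex);
    vh.set(byteArray, byteIndex, value + 1);
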
Using benchmarks provided by
    https://android-review.googlesource.com/1420959
on blueline little cores with the frequency fixed at 1420800:
                             before after
GetByteArrayViewInt          27.093 0.024
SetByteArrayViewInt          28.067 0.024
GetByteArrayViewBigEndianInt 27.142 0.026
SetByteArrayViewBigEndianInt 28.040 0.025

Test: testrunner.py --target --64 --optimizing
Bug: 71781600
Change-Id: I604326675042bd63dce8ec15075714003ca9915d
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 23ac91b..68120e2 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -38,6 +38,7 @@
 #include "base/casts.h"
 #include "base/leb128.h"
 #include "class_linker.h"
+#include "class_root-inl.h"
 #include "compiled_method.h"
 #include "dex/bytecode_utils.h"
 #include "dex/code_item_accessors-inl.h"
@@ -932,6 +933,12 @@
   return GetBootImageOffsetImpl(method, ImageHeader::kSectionArtMethods);
 }
 
+// NO_THREAD_SAFETY_ANALYSIS: Avoid taking the mutator lock, boot image objects are non-moveable.
+uint32_t CodeGenerator::GetBootImageOffset(ClassRoot class_root) NO_THREAD_SAFETY_ANALYSIS {
+  ObjPtr<mirror::Class> klass = GetClassRoot<kWithoutReadBarrier>(class_root);
+  return GetBootImageOffsetImpl(klass.Ptr(), ImageHeader::kSectionObjects);
+}
+
 // NO_THREAD_SAFETY_ANALYSIS: Avoid taking the mutator lock, boot image classes are non-moveable.
 uint32_t CodeGenerator::GetBootImageOffsetOfIntrinsicDeclaringClass(HInvoke* invoke)
     NO_THREAD_SAFETY_ANALYSIS {
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index bd5483c..338aac0 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -27,6 +27,7 @@
 #include "base/enums.h"
 #include "base/globals.h"
 #include "base/memory_region.h"
+#include "class_root.h"
 #include "dex/string_reference.h"
 #include "dex/type_reference.h"
 #include "graph_visualizer.h"
@@ -635,6 +636,7 @@
   static uint32_t GetBootImageOffset(HLoadClass* load_class);
   static uint32_t GetBootImageOffset(HLoadString* load_string);
   static uint32_t GetBootImageOffset(HInvoke* invoke);
+  static uint32_t GetBootImageOffset(ClassRoot class_root);
   static uint32_t GetBootImageOffsetOfIntrinsicDeclaringClass(HInvoke* invoke);
 
   static void CreateSystemArrayCopyLocationSummary(HInvoke* invoke);
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 8f4979f..a9f03b0 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -22,6 +22,7 @@
 #include "art_method-inl.h"
 #include "base/bit_utils.h"
 #include "base/bit_utils_iterator.h"
+#include "class_root-inl.h"
 #include "class_table.h"
 #include "code_generator_utils.h"
 #include "compiled_method.h"
@@ -1652,13 +1653,13 @@
 }
 
 void CodeGeneratorARM64::LoadAcquire(HInstruction* instruction,
+                                     DataType::Type type,
                                      CPURegister dst,
                                      const MemOperand& src,
                                      bool needs_null_check) {
   MacroAssembler* masm = GetVIXLAssembler();
   UseScratchRegisterScope temps(masm);
   Register temp_base = temps.AcquireX();
-  DataType::Type type = instruction->GetType();
 
   DCHECK(!src.IsPreIndex());
   DCHECK(!src.IsPostIndex());
@@ -2067,8 +2068,11 @@
       // Note that a potential implicit null check is handled in this
       // CodeGeneratorARM64::LoadAcquire call.
       // NB: LoadAcquire will record the pc info if needed.
-      codegen_->LoadAcquire(
-          instruction, OutputCPURegister(instruction), field, /* needs_null_check= */ true);
+      codegen_->LoadAcquire(instruction,
+                            load_type,
+                            OutputCPURegister(instruction),
+                            field,
+                            /* needs_null_check= */ true);
     } else {
       // Ensure that between load and MaybeRecordImplicitNullCheck there are no pools emitted.
       EmissionCheckScope guard(GetVIXLAssembler(), kMaxMacroInstructionSizeInBytes);
@@ -4972,25 +4976,45 @@
   }
 }
 
+void CodeGeneratorARM64::LoadTypeForBootImageIntrinsic(vixl::aarch64::Register reg,
+                                                       TypeReference target_type) {
+  // Load the class the same way as for HLoadClass::LoadKind::kBootImageLinkTimePcRelative.
+  DCHECK(GetCompilerOptions().IsBootImage());
+  // Add ADRP with its PC-relative type patch.
+  vixl::aarch64::Label* adrp_label =
+      NewBootImageTypePatch(*target_type.dex_file, target_type.TypeIndex());
+  EmitAdrpPlaceholder(adrp_label, reg.X());
+  // Add ADD with its PC-relative type patch.
+  vixl::aarch64::Label* add_label =
+      NewBootImageTypePatch(*target_type.dex_file, target_type.TypeIndex(), adrp_label);
+  EmitAddPlaceholder(add_label, reg.X(), reg.X());
+}
+
 void CodeGeneratorARM64::LoadIntrinsicDeclaringClass(vixl::aarch64::Register reg, HInvoke* invoke) {
   DCHECK_NE(invoke->GetIntrinsic(), Intrinsics::kNone);
   if (GetCompilerOptions().IsBootImage()) {
-    // Load the class the same way as for HLoadClass::LoadKind::kBootImageLinkTimePcRelative.
     MethodReference target_method = invoke->GetResolvedMethodReference();
     dex::TypeIndex type_idx = target_method.dex_file->GetMethodId(target_method.index).class_idx_;
-    // Add ADRP with its PC-relative type patch.
-    vixl::aarch64::Label* adrp_label = NewBootImageTypePatch(*target_method.dex_file, type_idx);
-    EmitAdrpPlaceholder(adrp_label, reg.X());
-    // Add ADD with its PC-relative type patch.
-    vixl::aarch64::Label* add_label =
-        NewBootImageTypePatch(*target_method.dex_file, type_idx, adrp_label);
-    EmitAddPlaceholder(add_label, reg.X(), reg.X());
+    LoadTypeForBootImageIntrinsic(reg, TypeReference(target_method.dex_file, type_idx));
   } else {
     uint32_t boot_image_offset = GetBootImageOffsetOfIntrinsicDeclaringClass(invoke);
     LoadBootImageAddress(reg, boot_image_offset);
   }
 }
 
+void CodeGeneratorARM64::LoadClassRootForIntrinsic(vixl::aarch64::Register reg,
+                                                   ClassRoot class_root) {
+  if (GetCompilerOptions().IsBootImage()) {
+    ScopedObjectAccess soa(Thread::Current());
+    ObjPtr<mirror::Class> klass = GetClassRoot(class_root);
+    TypeReference target_type(&klass->GetDexFile(), klass->GetDexTypeIndex());
+    LoadTypeForBootImageIntrinsic(reg, target_type);
+  } else {
+    uint32_t boot_image_offset = GetBootImageOffset(class_root);
+    LoadBootImageAddress(reg, boot_image_offset);
+  }
+}
+
 template <linker::LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
 inline void CodeGeneratorARM64::EmitPcRelativeLinkerPatches(
     const ArenaDeque<PcRelativePatchInfo>& infos,
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index c7c11f5..affc640 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -18,6 +18,7 @@
 #define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_ARM64_H_
 
 #include "base/bit_field.h"
+#include "class_root.h"
 #include "code_generator.h"
 #include "common_arm64.h"
 #include "dex/dex_file_types.h"
@@ -646,6 +647,7 @@
              vixl::aarch64::CPURegister src,
              const vixl::aarch64::MemOperand& dst);
   void LoadAcquire(HInstruction* instruction,
+                   DataType::Type type,
                    vixl::aarch64::CPURegister dst,
                    const vixl::aarch64::MemOperand& src,
                    bool needs_null_check);
@@ -787,7 +789,9 @@
                                 vixl::aarch64::Register base);
 
   void LoadBootImageAddress(vixl::aarch64::Register reg, uint32_t boot_image_reference);
+  void LoadTypeForBootImageIntrinsic(vixl::aarch64::Register reg, TypeReference type_reference);
   void LoadIntrinsicDeclaringClass(vixl::aarch64::Register reg, HInvoke* invoke);
+  void LoadClassRootForIntrinsic(vixl::aarch64::Register reg, ClassRoot class_root);
 
   void EmitLinkerPatches(ArenaVector<linker::LinkerPatch>* linker_patches) override;
   bool NeedsThunkCode(const linker::LinkerPatch& patch) const override;
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index fd4992a..64d49aa 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -50,6 +50,7 @@
 
 namespace arm64 {
 
+using helpers::CPURegisterFrom;
 using helpers::DRegisterFrom;
 using helpers::HeapOperand;
 using helpers::LocationFrom;
@@ -248,25 +249,44 @@
   locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap);
 }
 
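+// Byte-swap the value in `in` into `out`. For FP types, `in` is a core register holding the
+// raw bits (and is clobbered); the swapped bits are then moved to the FP register `out`.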
+static void GenerateReverseBytes(MacroAssembler* masm,
+                                 DataType::Type type,
+                                 CPURegister in,
+                                 CPURegister out) {
+  switch (type) {
+    case DataType::Type::kUint16:
+      __ Rev16(out.W(), in.W());
+      break;
+    case DataType::Type::kInt16:
+      __ Rev16(out.W(), in.W());
+      __ Sxth(out.W(), out.W());
+      break;
+    case DataType::Type::kInt32:
+      __ Rev(out.W(), in.W());
+      break;
+    case DataType::Type::kInt64:
+      __ Rev(out.X(), in.X());
+      break;
+    case DataType::Type::kFloat32:
+      __ Rev(in.W(), in.W());  // Note: Clobbers `in`.
+      __ Fmov(out.S(), in.W());
+      break;
+    case DataType::Type::kFloat64:
+      __ Rev(in.X(), in.X());  // Note: Clobbers `in`.
+      __ Fmov(out.D(), in.X());
+      break;
+    default:
+      LOG(FATAL) << "Unexpected type for reverse-bytes: " << type;
+      UNREACHABLE();
+  }
+}
+
 static void GenReverseBytes(LocationSummary* locations,
                             DataType::Type type,
                             MacroAssembler* masm) {
   Location in = locations->InAt(0);
   Location out = locations->Out();
-
-  switch (type) {
-    case DataType::Type::kInt16:
-      __ Rev16(WRegisterFrom(out), WRegisterFrom(in));
-      __ Sxth(WRegisterFrom(out), WRegisterFrom(out));
-      break;
-    case DataType::Type::kInt32:
-    case DataType::Type::kInt64:
-      __ Rev(RegisterFrom(out, type), RegisterFrom(in, type));
-      break;
-    default:
-      LOG(FATAL) << "Unexpected size for reverse-bytes: " << type;
-      UNREACHABLE();
-  }
+  GenerateReverseBytes(masm, type, CPURegisterFrom(in, type), CPURegisterFrom(out, type));
 }
 
 void IntrinsicLocationsBuilderARM64::VisitIntegerReverseBytes(HInvoke* invoke) {
@@ -707,7 +727,7 @@
     // Other cases.
     MemOperand mem_op(base.X(), offset);
     if (is_volatile) {
-      codegen->LoadAcquire(invoke, trg, mem_op, /* needs_null_check= */ true);
+      codegen->LoadAcquire(invoke, type, trg, mem_op, /* needs_null_check= */ true);
     } else {
       codegen->Load(type, trg, mem_op);
     }
@@ -951,6 +971,7 @@
   MacroAssembler* masm = assembler->GetVIXLAssembler();
   switch (type) {
     case DataType::Type::kBool:
+    case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       if (use_load_acquire) {
         __ Ldaxrb(old_value, MemOperand(ptr));
@@ -1007,6 +1028,7 @@
   }
   switch (type) {
     case DataType::Type::kBool:
+    case DataType::Type::kUint8:
     case DataType::Type::kInt8:
       if (use_store_release) {
         __ Stlxrb(store_result, new_value, MemOperand(ptr));
@@ -1363,6 +1385,7 @@
 enum class GetAndUpdateOp {
   kSet,
   kAdd,
+  kAddWithByteSwap,
   kAnd,
   kOr,
   kXor
@@ -1387,6 +1410,7 @@
       old_value_reg = old_value.IsX() ? old_value.X() : old_value.W();
       new_value = arg.IsX() ? arg.X() : arg.W();
       break;
+    case GetAndUpdateOp::kAddWithByteSwap:
     case GetAndUpdateOp::kAdd:
       if (arg.IsVRegister()) {
         old_value_reg = arg.IsD() ? temps.AcquireX() : temps.AcquireW();
@@ -1414,6 +1438,12 @@
   switch (get_and_update_op) {
     case GetAndUpdateOp::kSet:
       break;
+    case GetAndUpdateOp::kAddWithByteSwap:
+      // To avoid unnecessary sign extension before REV16, the caller must specify `kUint16`
+      // instead of `kInt16` and do the sign-extension explicitly afterwards.
+      DCHECK_NE(load_store_type, DataType::Type::kInt16);
+      GenerateReverseBytes(masm, load_store_type, old_value_reg, old_value_reg);
+      FALLTHROUGH_INTENDED;
     case GetAndUpdateOp::kAdd:
       if (arg.IsVRegister()) {
         VRegister old_value_vreg = old_value.IsD() ? old_value.D() : old_value.S();
@@ -1424,6 +1454,9 @@
       } else {
         __ Add(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
       }
+      if (get_and_update_op == GetAndUpdateOp::kAddWithByteSwap) {
+        GenerateReverseBytes(masm, load_store_type, new_value, new_value);
+      }
       break;
     case GetAndUpdateOp::kAnd:
       __ And(new_value, old_value_reg, arg.IsX() ? arg.X() : arg.W());
@@ -3252,7 +3285,8 @@
                                                     /*use_load_acquire=*/ true);
   } else {
     MemOperand field = HeapOperand(WRegisterFrom(obj), referent_offset);
-    codegen_->LoadAcquire(invoke, WRegisterFrom(out), field, /*needs_null_check=*/ true);
+    codegen_->LoadAcquire(
+        invoke, DataType::Type::kReference, WRegisterFrom(out), field, /*needs_null_check=*/ true);
     codegen_->MaybeGenerateReadBarrierSlow(invoke, out, out, obj, referent_offset);
   }
   __ Bind(slow_path->GetExitLabel());
@@ -3766,6 +3800,68 @@
   GenerateDivideUnsigned(invoke, codegen_);
 }
 
+class VarHandleSlowPathARM64 : public IntrinsicSlowPathARM64 {
+ public:
+  VarHandleSlowPathARM64(HInvoke* invoke, std::memory_order order)
+      : IntrinsicSlowPathARM64(invoke),
+        order_(order),
+        return_success_(false),
+        strong_(false),
+        get_and_update_op_(GetAndUpdateOp::kAdd) {
+  }
+
+  vixl::aarch64::Label* GetByteArrayViewCheckLabel() {
+    return &byte_array_view_check_label_;
+  }
+
+  vixl::aarch64::Label* GetNativeByteOrderLabel() {
+    return &native_byte_order_label_;
+  }
+
+  void SetCompareAndSetOrExchangeArgs(bool return_success, bool strong) {
+    if (return_success) {
+      DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kCompareAndSet);
+    } else {
+      DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kCompareAndExchange);
+    }
+    return_success_ = return_success;
+    strong_ = strong;
+  }
+
+  void SetGetAndUpdateOp(GetAndUpdateOp get_and_update_op) {
+    DCHECK(GetAccessModeTemplate() == mirror::VarHandle::AccessModeTemplate::kGetAndUpdate);
+    get_and_update_op_ = get_and_update_op;
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen_in) override {
+    if (GetByteArrayViewCheckLabel()->IsLinked()) {
+      EmitByteArrayViewCode(codegen_in);
+    }
+    IntrinsicSlowPathARM64::EmitNativeCode(codegen_in);
+  }
+
+ private:
+  HInvoke* GetInvoke() const {
+    return GetInstruction()->AsInvoke();
+  }
+
+  mirror::VarHandle::AccessModeTemplate GetAccessModeTemplate() const {
+    return mirror::VarHandle::GetAccessModeTemplateByIntrinsic(GetInvoke()->GetIntrinsic());
+  }
+
+  void EmitByteArrayViewCode(CodeGenerator* codegen_in);
+
+  vixl::aarch64::Label byte_array_view_check_label_;
+  vixl::aarch64::Label native_byte_order_label_;
+  // Shared parameter for all VarHandle intrinsics.
+  std::memory_order order_;
+  // Extra arguments for GenerateVarHandleCompareAndSetOrExchange().
+  bool return_success_;
+  bool strong_;
+  // Extra argument for GenerateVarHandleGetAndUpdate().
+  GetAndUpdateOp get_and_update_op_;
+};
+
 // Generate subtype check without read barriers.
 static void GenerateSubTypeObjectCheckNoReadBarrier(CodeGeneratorARM64* codegen,
                                                     SlowPathCodeARM64* slow_path,
@@ -3914,13 +4010,13 @@
   if (number_of_arguments == /* VarHandle object */ 1u + expected_coordinates_count) {
     return invoke->GetType();
   } else {
-    return invoke->InputAt(number_of_arguments - 1u)->GetType();
+    return GetDataTypeFromShorty(invoke, number_of_arguments - 1u);
   }
 }
 
 static void GenerateVarHandleArrayChecks(HInvoke* invoke,
                                          CodeGeneratorARM64* codegen,
-                                         SlowPathCodeARM64* slow_path) {
+                                         VarHandleSlowPathARM64* slow_path) {
   MacroAssembler* masm = codegen->GetVIXLAssembler();
   Register varhandle = InputRegisterAt(invoke, 0);
   Register object = InputRegisterAt(invoke, 1);
@@ -3952,23 +4048,6 @@
   // No need for read barrier or unpoisoning of coordinateType1 for comparison with null.
   __ Cbz(temp2, slow_path->GetEntryLabel());
 
-  // Check that the coordinateType0 is an array type. We do not need a read barrier
-  // for loading constant reference fields (or chains of them) for comparison with null,
-  // or for finally loading a constant primitive field (primitive type) below.
-  __ Ldr(temp2, HeapOperand(temp, component_type_offset.Int32Value()));
-  codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
-  __ Cbz(temp2, slow_path->GetEntryLabel());
-
-  // Check that the array component type matches the primitive type.
-  __ Ldrh(temp2, HeapOperand(temp2, primitive_type_offset.Int32Value()));
-  if (primitive_type == Primitive::kPrimNot) {
-    static_assert(Primitive::kPrimNot == 0);
-    __ Cbnz(temp2, slow_path->GetEntryLabel());
-  } else {
-    __ Cmp(temp2, static_cast<uint16_t>(primitive_type));
-    __ B(slow_path->GetEntryLabel(), ne);
-  }
-
   // Check object class against componentType0.
   //
   // This is an exact check and we defer other cases to the runtime. This includes
@@ -3984,6 +4063,34 @@
   __ Cmp(temp, temp2);
   __ B(slow_path->GetEntryLabel(), ne);
 
+  // Check that the coordinateType0 is an array type. We do not need a read barrier
+  // for loading constant reference fields (or chains of them) for comparison with null,
+  // or for finally loading a constant primitive field (primitive type) below.
+  __ Ldr(temp2, HeapOperand(temp, component_type_offset.Int32Value()));
+  codegen->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
+  __ Cbz(temp2, slow_path->GetEntryLabel());
+
+  // Check that the array component type matches the primitive type.
+  __ Ldrh(temp2, HeapOperand(temp2, primitive_type_offset.Int32Value()));
+  if (primitive_type == Primitive::kPrimNot) {
+    static_assert(Primitive::kPrimNot == 0);
+    __ Cbnz(temp2, slow_path->GetEntryLabel());
+  } else {
+    // With the exception of `kPrimNot` (handled above), `kPrimByte` and `kPrimBoolean`,
+    // we shall check for a byte array view in the slow path.
+    // The check requires the ByteArrayViewVarHandle.class to be in the boot image,
+    // so we cannot emit that if we're JITting without boot image.
+    bool boot_image_available =
+        codegen->GetCompilerOptions().IsBootImage() ||
+        !Runtime::Current()->GetHeap()->GetBootImageSpaces().empty();
+    DCHECK(boot_image_available || codegen->GetCompilerOptions().IsJitCompiler());
+    bool can_be_view = (DataType::Size(value_type) != 1u) && boot_image_available;
+    vixl::aarch64::Label* slow_path_label =
+        can_be_view ? slow_path->GetByteArrayViewCheckLabel() : slow_path->GetEntryLabel();
+    __ Cmp(temp2, static_cast<uint16_t>(primitive_type));
+    __ B(slow_path_label, ne);
+  }
+
   // Check for array index out of bounds.
   __ Ldr(temp, HeapOperand(object, array_length_offset.Int32Value()));
   __ Cmp(index, temp);
@@ -3992,7 +4099,7 @@
 
 static void GenerateVarHandleCoordinateChecks(HInvoke* invoke,
                                               CodeGeneratorARM64* codegen,
-                                              SlowPathCodeARM64* slow_path) {
+                                              VarHandleSlowPathARM64* slow_path) {
   size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
   if (expected_coordinates_count == 0u) {
     GenerateVarHandleStaticFieldCheck(invoke, codegen, slow_path);
@@ -4004,24 +4111,45 @@
   }
 }
 
+static VarHandleSlowPathARM64* GenerateVarHandleChecks(HInvoke* invoke,
+                                                       CodeGeneratorARM64* codegen,
+                                                       std::memory_order order,
+                                                       DataType::Type type) {
+  VarHandleSlowPathARM64* slow_path =
+      new (codegen->GetScopedAllocator()) VarHandleSlowPathARM64(invoke, order);
+  codegen->AddSlowPath(slow_path);
+
+  GenerateVarHandleAccessModeAndVarTypeChecks(invoke, codegen, slow_path, type);
+  GenerateVarHandleCoordinateChecks(invoke, codegen, slow_path);
+
+  return slow_path;
+}
+
 struct VarHandleTarget {
   Register object;  // The object holding the value to operate on.
   Register offset;  // The offset of the value to operate on.
 };
 
-static VarHandleTarget GenerateVarHandleTarget(HInvoke* invoke, CodeGeneratorARM64* codegen) {
-  MacroAssembler* masm = codegen->GetVIXLAssembler();
-  Register varhandle = InputRegisterAt(invoke, 0);
+static VarHandleTarget GetVarHandleTarget(HInvoke* invoke) {
   size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
   LocationSummary* locations = invoke->GetLocations();
 
   VarHandleTarget target;
   // The temporary allocated for loading the offset.
-  target.offset = WRegisterFrom(locations->GetTemp((expected_coordinates_count == 0u) ? 1u : 0u));
+  target.offset = WRegisterFrom(locations->GetTemp(0u));
   // The reference to the object that holds the value to operate on.
   target.object = (expected_coordinates_count == 0u)
-      ? WRegisterFrom(locations->GetTemp(0u))
+      ? WRegisterFrom(locations->GetTemp(1u))
       : InputRegisterAt(invoke, 1);
+  return target;
+}
+
+static void GenerateVarHandleTarget(HInvoke* invoke,
+                                    const VarHandleTarget& target,
+                                    CodeGeneratorARM64* codegen) {
+  MacroAssembler* masm = codegen->GetVIXLAssembler();
+  Register varhandle = InputRegisterAt(invoke, 0);
+  size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
 
   if (expected_coordinates_count <= 1u) {
     // For static fields, we need to fill the `target.object` with the declaring class,
@@ -4059,8 +4187,6 @@
     }
     __ Add(target.offset, shifted_index, data_offset.Int32Value());
   }
-
-  return target;
 }
 
 static bool HasVarHandleIntrinsicImplementation(HInvoke* invoke) {
@@ -4161,10 +4287,6 @@
   for (size_t i = 0; i != expected_coordinates_count; ++i) {
     locations->SetInAt(/* VarHandle object */ 1u + i, Location::RequiresRegister());
   }
-  if (expected_coordinates_count == 0u) {
-    // Add a temporary to hold the declaring class.
-    locations->AddTemp(Location::RequiresRegister());
-  }
   if (return_type != DataType::Type::kVoid) {
     if (DataType::IsFloatingPointType(return_type)) {
       locations->SetOut(Location::RequiresFpuRegister());
@@ -4195,6 +4317,10 @@
   } else {
     locations->AddTemp(Location::RequiresRegister());
   }
+  if (expected_coordinates_count == 0u) {
+    // Add a temporary to hold the declaring class.
+    locations->AddTemp(Location::RequiresRegister());
+  }
 
   return locations;
 }
@@ -4219,7 +4345,8 @@
 
 static void GenerateVarHandleGet(HInvoke* invoke,
                                  CodeGeneratorARM64* codegen,
-                                 bool use_load_acquire) {
+                                 std::memory_order order,
+                                 bool byte_swap = false) {
   DataType::Type type = invoke->GetType();
   DCHECK_NE(type, DataType::Type::kVoid);
 
@@ -4227,14 +4354,18 @@
   MacroAssembler* masm = codegen->GetVIXLAssembler();
   CPURegister out = helpers::OutputCPURegister(invoke);
 
-  SlowPathCodeARM64* slow_path =
-      new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
-  codegen->AddSlowPath(slow_path);
+  VarHandleTarget target = GetVarHandleTarget(invoke);
+  VarHandleSlowPathARM64* slow_path = nullptr;
+  if (!byte_swap) {
+    slow_path = GenerateVarHandleChecks(invoke, codegen, order, type);
+    GenerateVarHandleTarget(invoke, target, codegen);
+    __ Bind(slow_path->GetNativeByteOrderLabel());
+  }
 
-  GenerateVarHandleCoordinateChecks(invoke, codegen, slow_path);
-  GenerateVarHandleAccessModeAndVarTypeChecks(invoke, codegen, slow_path, type);
-
-  VarHandleTarget target = GenerateVarHandleTarget(invoke, codegen);
+  // ARM64 load-acquire instructions are implicitly sequentially consistent.
+  bool use_load_acquire =
+      (order == std::memory_order_acquire) || (order == std::memory_order_seq_cst);
+  DCHECK(use_load_acquire || order == std::memory_order_relaxed);
 
   // Load the value from the target location.
   if (type == DataType::Type::kReference && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
@@ -4248,23 +4379,44 @@
                                                    MemOperand(tmp_ptr),
                                                    /*needs_null_check=*/ false,
                                                    use_load_acquire);
+    DCHECK(!byte_swap);
   } else {
     MemOperand address(target.object.X(), target.offset.X());
+    CPURegister load_reg = out;
+    DataType::Type load_type = type;
+    UseScratchRegisterScope temps(masm);
+    if (byte_swap) {
+      if (type == DataType::Type::kInt16) {
+        // Avoid unnecessary sign extension before REV16.
+        load_type = DataType::Type::kUint16;
+      } else if (type == DataType::Type::kFloat32) {
+        load_type = DataType::Type::kInt32;
+        load_reg = target.offset.W();
+      } else if (type == DataType::Type::kFloat64) {
+        load_type = DataType::Type::kInt64;
+        load_reg = target.offset.X();
+      }
+    }
     if (use_load_acquire) {
-      codegen->LoadAcquire(invoke, out, address, /*needs_null_check=*/ false);
+      codegen->LoadAcquire(invoke, load_type, load_reg, address, /*needs_null_check=*/ false);
     } else {
-      codegen->Load(type, out, address);
+      codegen->Load(load_type, load_reg, address);
     }
     if (type == DataType::Type::kReference) {
+      DCHECK(!byte_swap);
       DCHECK(out.IsW());
       Location out_loc = locations->Out();
       Location object_loc = LocationFrom(target.object);
       Location offset_loc = LocationFrom(target.offset);
       codegen->MaybeGenerateReadBarrierSlow(invoke, out_loc, out_loc, object_loc, 0u, offset_loc);
+    } else if (byte_swap) {
+      GenerateReverseBytes(masm, type, load_reg, out);
     }
   }
 
-  __ Bind(slow_path->GetExitLabel());
+  if (!byte_swap) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleGet(HInvoke* invoke) {
@@ -4272,7 +4424,7 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitVarHandleGet(HInvoke* invoke) {
-  GenerateVarHandleGet(invoke, codegen_, /*use_load_acquire=*/ false);
+  GenerateVarHandleGet(invoke, codegen_, std::memory_order_relaxed);
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetOpaque(HInvoke* invoke) {
@@ -4280,7 +4432,7 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetOpaque(HInvoke* invoke) {
-  GenerateVarHandleGet(invoke, codegen_, /*use_load_acquire=*/ false);
+  GenerateVarHandleGet(invoke, codegen_, std::memory_order_relaxed);
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAcquire(HInvoke* invoke) {
@@ -4288,7 +4440,7 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetAcquire(HInvoke* invoke) {
-  GenerateVarHandleGet(invoke, codegen_, /*use_load_acquire=*/ true);
+  GenerateVarHandleGet(invoke, codegen_, std::memory_order_acquire);
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetVolatile(HInvoke* invoke) {
@@ -4296,8 +4448,7 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitVarHandleGetVolatile(HInvoke* invoke) {
-  // ARM64 load-acquire instructions are implicitly sequentially consistent.
-  GenerateVarHandleGet(invoke, codegen_, /*use_load_acquire=*/ true);
+  GenerateVarHandleGet(invoke, codegen_, std::memory_order_seq_cst);
 }
 
 static void CreateVarHandleSetLocations(HInvoke* invoke) {
@@ -4310,21 +4461,26 @@
 
 static void GenerateVarHandleSet(HInvoke* invoke,
                                  CodeGeneratorARM64* codegen,
-                                 bool use_store_release) {
+                                 std::memory_order order,
+                                 bool byte_swap = false) {
   uint32_t value_index = invoke->GetNumberOfArguments() - 1;
   DataType::Type value_type = GetDataTypeFromShorty(invoke, value_index);
 
   MacroAssembler* masm = codegen->GetVIXLAssembler();
   CPURegister value = InputCPURegisterOrZeroRegAt(invoke, value_index);
 
-  SlowPathCodeARM64* slow_path =
-      new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
-  codegen->AddSlowPath(slow_path);
+  VarHandleTarget target = GetVarHandleTarget(invoke);
+  VarHandleSlowPathARM64* slow_path = nullptr;
+  if (!byte_swap) {
+    slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
+    GenerateVarHandleTarget(invoke, target, codegen);
+    __ Bind(slow_path->GetNativeByteOrderLabel());
+  }
 
-  GenerateVarHandleCoordinateChecks(invoke, codegen, slow_path);
-  GenerateVarHandleAccessModeAndVarTypeChecks(invoke, codegen, slow_path, value_type);
-
-  VarHandleTarget target = GenerateVarHandleTarget(invoke, codegen);
+  // ARM64 store-release instructions are implicitly sequentially consistent.
+  bool use_store_release =
+      (order == std::memory_order_release) || (order == std::memory_order_seq_cst);
+  DCHECK(use_store_release || order == std::memory_order_relaxed);
 
   // Store the value to the target location.
   {
@@ -4337,6 +4493,20 @@
       codegen->GetAssembler()->PoisonHeapReference(temp);
       source = temp;
     }
+    if (byte_swap) {
+      DCHECK(!source.IsZero());  // We use the main path for zero as it does not need a byte swap.
+      Register temp = source.Is64Bits() ? temps.AcquireX() : temps.AcquireW();
+      if (value_type == DataType::Type::kInt16) {
+        // Avoid unnecessary sign extension before storing.
+        value_type = DataType::Type::kUint16;
+      } else if (DataType::IsFloatingPointType(value_type)) {
+        __ Fmov(temp, source.Is64Bits() ? source.D() : source.S());
+        value_type = source.Is64Bits() ? DataType::Type::kInt64 : DataType::Type::kInt32;
+        source = temp;  // Source for the `GenerateReverseBytes()` below.
+      }
+      GenerateReverseBytes(masm, value_type, source, temp);
+      source = temp;
+    }
     MemOperand address(target.object.X(), target.offset.X());
     if (use_store_release) {
       codegen->StoreRelease(invoke, value_type, source, address, /*needs_null_check=*/ false);
@@ -4349,7 +4519,9 @@
     codegen->MarkGCCard(target.object, Register(value), /*value_can_be_null=*/ true);
   }
 
-  __ Bind(slow_path->GetExitLabel());
+  if (!byte_swap) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleSet(HInvoke* invoke) {
@@ -4357,7 +4529,7 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitVarHandleSet(HInvoke* invoke) {
-  GenerateVarHandleSet(invoke, codegen_, /*use_store_release=*/ false);
+  GenerateVarHandleSet(invoke, codegen_, std::memory_order_relaxed);
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetOpaque(HInvoke* invoke) {
@@ -4365,7 +4537,7 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetOpaque(HInvoke* invoke) {
-  GenerateVarHandleSet(invoke, codegen_, /*use_store_release=*/ false);
+  GenerateVarHandleSet(invoke, codegen_, std::memory_order_relaxed);
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetRelease(HInvoke* invoke) {
@@ -4373,7 +4545,7 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetRelease(HInvoke* invoke) {
-  GenerateVarHandleSet(invoke, codegen_, /*use_store_release=*/ true);
+  GenerateVarHandleSet(invoke, codegen_, std::memory_order_release);
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleSetVolatile(HInvoke* invoke) {
@@ -4381,8 +4553,7 @@
 }
 
 void IntrinsicCodeGeneratorARM64::VisitVarHandleSetVolatile(HInvoke* invoke) {
-  // ARM64 store-release instructions are implicitly sequentially consistent.
-  GenerateVarHandleSet(invoke, codegen_, /*use_store_release=*/ true);
+  GenerateVarHandleSet(invoke, codegen_, std::memory_order_seq_cst);
 }
 
 static void CreateVarHandleCompareAndSetOrExchangeLocations(HInvoke* invoke, bool return_success) {
@@ -4391,7 +4562,7 @@
   }
 
   uint32_t number_of_arguments = invoke->GetNumberOfArguments();
-  DataType::Type value_type = invoke->InputAt(number_of_arguments - 1u)->GetType();
+  DataType::Type value_type = GetDataTypeFromShorty(invoke, number_of_arguments - 1u);
   if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
       value_type == DataType::Type::kReference) {
     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
@@ -4423,12 +4594,29 @@
       locations->SetTempAt(0u, Location::RegisterLocation(first_callee_save));
     }
   }
-  if (!return_success && DataType::IsFloatingPointType(value_type)) {
-    // Add a temporary for old value and exclusive store result if floating point
-    // `expected` and/or `new_value` take scratch registers.
-    locations->AddRegisterTemps(
-        (IsConstantZeroBitPattern(invoke->InputAt(number_of_arguments - 1u)) ? 0u : 1u) +
-        (IsConstantZeroBitPattern(invoke->InputAt(number_of_arguments - 2u)) ? 0u : 1u));
+  size_t old_temp_count = locations->GetTempCount();
+  DCHECK_EQ(old_temp_count, (GetExpectedVarHandleCoordinatesCount(invoke) == 0) ? 2u : 1u);
+  if (!return_success) {
+    if (DataType::IsFloatingPointType(value_type)) {
+      // Add a temporary for old value and exclusive store result if floating point
+      // `expected` and/or `new_value` take scratch registers.
+      size_t available_scratch_registers =
+          (IsConstantZeroBitPattern(invoke->InputAt(number_of_arguments - 1u)) ? 1u : 0u) +
+          (IsConstantZeroBitPattern(invoke->InputAt(number_of_arguments - 2u)) ? 1u : 0u);
+      size_t temps_needed = /* pointer, old value, store result */ 3u - available_scratch_registers;
+      // We can reuse the declaring class (if present) and offset temporary.
+      if (temps_needed > old_temp_count) {
+        locations->AddRegisterTemps(temps_needed - old_temp_count);
+      }
+    } else if ((value_type != DataType::Type::kReference && DataType::Size(value_type) != 1u) &&
+               !IsConstantZeroBitPattern(invoke->InputAt(number_of_arguments - 2u)) &&
+               !IsConstantZeroBitPattern(invoke->InputAt(number_of_arguments - 1u)) &&
+               GetExpectedVarHandleCoordinatesCount(invoke) == 2u) {
+      // Allocate a normal temporary for store result in the non-native byte order path
+      // because scratch registers are used by the byte-swapped `expected` and `new_value`.
+      DCHECK_EQ(old_temp_count, 1u);
+      locations->AddTemp(Location::RequiresRegister());
+    }
   }
   if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
     // Add a temporary for the `old_value_temp` in slow path.
@@ -4450,10 +4638,8 @@
     Register reg = temps->AcquireX();
     __ Fmov(reg, cpu_reg.D());
     return reg;
-  } else if (DataType::Is64BitType(type)) {
-    return cpu_reg.X();
   } else {
-    return cpu_reg.W();
+    return DataType::Is64BitType(type) ? cpu_reg.X() : cpu_reg.W();
   }
 }
 
@@ -4461,10 +4647,10 @@
                                                      CodeGeneratorARM64* codegen,
                                                      std::memory_order order,
                                                      bool return_success,
-                                                     bool strong) {
+                                                     bool strong,
+                                                     bool byte_swap = false) {
   DCHECK(return_success || strong);
 
-  size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
   uint32_t expected_index = invoke->GetNumberOfArguments() - 2;
   uint32_t new_value_index = invoke->GetNumberOfArguments() - 1;
   DataType::Type value_type = GetDataTypeFromShorty(invoke, new_value_index);
@@ -4476,14 +4662,14 @@
   CPURegister new_value = InputCPURegisterOrZeroRegAt(invoke, new_value_index);
   CPURegister out = helpers::OutputCPURegister(invoke);
 
-  SlowPathCodeARM64* slow_path =
-      new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
-  codegen->AddSlowPath(slow_path);
-
-  GenerateVarHandleCoordinateChecks(invoke, codegen, slow_path);
-  GenerateVarHandleAccessModeAndVarTypeChecks(invoke, codegen, slow_path, value_type);
-
-  VarHandleTarget target = GenerateVarHandleTarget(invoke, codegen);
+  VarHandleTarget target = GetVarHandleTarget(invoke);
+  VarHandleSlowPathARM64* slow_path = nullptr;
+  if (!byte_swap) {
+    slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
+    slow_path->SetCompareAndSetOrExchangeArgs(return_success, strong);
+    GenerateVarHandleTarget(invoke, target, codegen);
+    __ Bind(slow_path->GetNativeByteOrderLabel());
+  }
 
   // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
   if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(new_value_index))) {
@@ -4501,13 +4687,44 @@
   }
   __ Add(tmp_ptr, target.object.X(), target.offset.X());
 
-  // Move floating point values to temporaries.
+  // Move floating point values to scratch registers.
   // Note that float/double CAS uses bitwise comparison, rather than the operator==.
   Register expected_reg = MoveToTempIfFpRegister(expected, value_type, masm, &temps);
   Register new_value_reg = MoveToTempIfFpRegister(new_value, value_type, masm, &temps);
-  DataType::Type cas_type = DataType::IsFloatingPointType(value_type)
-      ? ((value_type == DataType::Type::kFloat32) ? DataType::Type::kInt32 : DataType::Type::kInt64)
+  bool is_fp = DataType::IsFloatingPointType(value_type);
+  DataType::Type cas_type = is_fp
+      ? ((value_type == DataType::Type::kFloat64) ? DataType::Type::kInt64 : DataType::Type::kInt32)
       : value_type;
+  // Avoid sign extension in the CAS loop by zero-extending `expected` before the loop. This adds
+  // one instruction for CompareAndExchange as we shall need to sign-extend the returned value.
+  if (value_type == DataType::Type::kInt16 && !expected.IsZero()) {
+    Register temp = temps.AcquireW();
+    __ Uxth(temp, expected_reg);
+    expected_reg = temp;
+    cas_type = DataType::Type::kUint16;
+  } else if (value_type == DataType::Type::kInt8 && !expected.IsZero()) {
+    Register temp = temps.AcquireW();
+    __ Uxtb(temp, expected_reg);
+    expected_reg = temp;
+    cas_type = DataType::Type::kUint8;
+  }
+
+  if (byte_swap) {
+    // Do the byte swap and move values to scratch registers if needed.
+    // Non-zero FP values and non-zero `expected` for `kInt16` are already in scratch registers.
+    DCHECK_NE(value_type, DataType::Type::kInt8);
+    if (!expected.IsZero()) {
+      bool is_scratch = is_fp || (value_type == DataType::Type::kInt16);
+      Register temp = is_scratch ? expected_reg : temps.AcquireSameSizeAs(expected_reg);
+      GenerateReverseBytes(masm, cas_type, expected_reg, temp);
+      expected_reg = temp;
+    }
+    if (!new_value.IsZero()) {
+      Register temp = is_fp ? new_value_reg : temps.AcquireSameSizeAs(new_value_reg);
+      GenerateReverseBytes(masm, cas_type, new_value_reg, temp);
+      new_value_reg = temp;
+    }
+  }
 
   // Prepare registers for old value and the result of the exclusive store.
   Register old_value;
@@ -4520,7 +4737,7 @@
     // We need two temporary registers but we have already used scratch registers for
     // holding the expected and new value unless they are zero bit pattern (+0.0f or
     // +0.0). We have allocated sufficient normal temporaries to handle that.
-    size_t next_temp = (expected_coordinates_count == 0u) ? 2u : 1u;
+    size_t next_temp = 1u;
     if (expected.IsZero()) {
       old_value = (cas_type == DataType::Type::kInt64) ? temps.AcquireX() : temps.AcquireW();
     } else {
@@ -4530,10 +4747,18 @@
     }
     store_result =
         new_value.IsZero() ? temps.AcquireW() : WRegisterFrom(locations->GetTemp(next_temp));
+    DCHECK(!old_value.Is(tmp_ptr));
+    DCHECK(!store_result.Is(tmp_ptr));
   } else {
-    // Use the output register for the old value and a scratch register for the store result.
+    // Use the output register for the old value.
     old_value = (cas_type == DataType::Type::kInt64) ? out.X() : out.W();
-    store_result = temps.AcquireW();
+    // Use scratch register for the store result, except when we have used up
+    // scratch registers for byte-swapped `expected` and `new_value`.
+    // In that case, we have allocated a normal temporary.
+    store_result = (byte_swap && !expected.IsZero() && !new_value.IsZero())
+        ? WRegisterFrom(locations->GetTemp(1))
+        : temps.AcquireW();
+    DCHECK(!store_result.Is(tmp_ptr));
   }
 
   vixl::aarch64::Label exit_loop_label;
@@ -4543,6 +4768,7 @@
   if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
     // The `old_value_temp` is used first for the marked `old_value` and then for the unmarked
     // reloaded old value for subsequent CAS in the slow path. It cannot be a scratch register.
+    size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
     Register old_value_temp =
         WRegisterFrom(locations->GetTemp((expected_coordinates_count == 0u) ? 2u : 1u));
     // For strong CAS, use a scratch register for the store result in slow path.
@@ -4588,10 +4814,20 @@
       // Determine the final success value with a CSEL.
       __ Csel(out.W(), store_result, wzr, eq);
     }
+  } else if (byte_swap) {
+    // Also handles moving to FP registers.
+    GenerateReverseBytes(masm, value_type, old_value, out);
   } else if (DataType::IsFloatingPointType(value_type)) {
     __ Fmov((value_type == DataType::Type::kFloat64) ? out.D() : out.S(), old_value);
+  } else if (value_type == DataType::Type::kInt8) {
+    __ Sxtb(out.W(), old_value);
+  } else if (value_type == DataType::Type::kInt16) {
+    __ Sxth(out.W(), old_value);
   }
-  __ Bind(slow_path->GetExitLabel());
+
+  if (!byte_swap) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleCompareAndExchange(HInvoke* invoke) {
@@ -4682,25 +4918,41 @@
 
   LocationSummary* locations = CreateVarHandleCommonLocations(invoke);
 
+  size_t old_temp_count = locations->GetTempCount();
+  DCHECK_EQ(old_temp_count, (GetExpectedVarHandleCoordinatesCount(invoke) == 0) ? 2u : 1u);
   if (DataType::IsFloatingPointType(invoke->GetType())) {
     if (get_and_update_op == GetAndUpdateOp::kAdd) {
       // For ADD, do not use ZR for zero bit pattern (+0.0f or +0.0).
       locations->SetInAt(invoke->GetNumberOfArguments() - 1u, Location::RequiresFpuRegister());
     } else {
       DCHECK(get_and_update_op == GetAndUpdateOp::kSet);
-      if (!IsConstantZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
+      // We can reuse the declaring class temporary if present.
+      if (old_temp_count == 1u &&
+          !IsConstantZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
         // Add a temporary for `old_value` if floating point `new_value` takes a scratch register.
         locations->AddTemp(Location::RequiresRegister());
       }
     }
   }
+  // We need a temporary for the byte-swap path for bitwise operations unless the argument is a
+  // zero which does not need a byte-swap. We can reuse the declaring class temporary if present.
+  if (old_temp_count == 1u &&
+      (get_and_update_op != GetAndUpdateOp::kSet && get_and_update_op != GetAndUpdateOp::kAdd) &&
+      GetExpectedVarHandleCoordinatesCount(invoke) == 2u &&
+      !IsConstantZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
+    DataType::Type value_type =
+        GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
+    if (value_type != DataType::Type::kReference && DataType::Size(value_type) != 1u) {
+      locations->AddTemp(Location::RequiresRegister());
+    }
+  }
 }
 
 static void GenerateVarHandleGetAndUpdate(HInvoke* invoke,
                                           CodeGeneratorARM64* codegen,
                                           GetAndUpdateOp get_and_update_op,
-                                          std::memory_order order) {
-  size_t expected_coordinates_count = GetExpectedVarHandleCoordinatesCount(invoke);
+                                          std::memory_order order,
+                                          bool byte_swap = false) {
   uint32_t arg_index = invoke->GetNumberOfArguments() - 1;
   DataType::Type value_type = GetDataTypeFromShorty(invoke, arg_index);
 
@@ -4709,14 +4961,14 @@
   CPURegister arg = InputCPURegisterOrZeroRegAt(invoke, arg_index);
   CPURegister out = helpers::OutputCPURegister(invoke);
 
-  SlowPathCodeARM64* slow_path =
-      new (codegen->GetScopedAllocator()) IntrinsicSlowPathARM64(invoke);
-  codegen->AddSlowPath(slow_path);
-
-  GenerateVarHandleCoordinateChecks(invoke, codegen, slow_path);
-  GenerateVarHandleAccessModeAndVarTypeChecks(invoke, codegen, slow_path, value_type);
-
-  VarHandleTarget target = GenerateVarHandleTarget(invoke, codegen);
+  VarHandleTarget target = GetVarHandleTarget(invoke);
+  VarHandleSlowPathARM64* slow_path = nullptr;
+  if (!byte_swap) {
+    slow_path = GenerateVarHandleChecks(invoke, codegen, order, value_type);
+    slow_path->SetGetAndUpdateOp(get_and_update_op);
+    GenerateVarHandleTarget(invoke, target, codegen);
+    __ Bind(slow_path->GetNativeByteOrderLabel());
+  }
 
   // This needs to be before the temp registers, as MarkGCCard also uses VIXL temps.
   if (CodeGenerator::StoreNeedsWriteBarrier(value_type, invoke->InputAt(arg_index))) {
@@ -4737,9 +4989,17 @@
   __ Add(tmp_ptr, target.object.X(), target.offset.X());
 
   // The load/store type is never floating point.
-  DataType::Type load_store_type = DataType::IsFloatingPointType(value_type)
+  bool is_fp = DataType::IsFloatingPointType(value_type);
+  DataType::Type load_store_type = is_fp
       ? ((value_type == DataType::Type::kFloat32) ? DataType::Type::kInt32 : DataType::Type::kInt64)
       : value_type;
+  // Avoid sign extension in the CAS loop. Sign-extend after the loop.
+  // Note: Using unsigned values yields the same value to store (we do not store higher bits).
+  if (value_type == DataType::Type::kInt8) {
+    load_store_type = DataType::Type::kUint8;
+  } else if (value_type == DataType::Type::kInt16) {
+    load_store_type = DataType::Type::kUint16;
+  }
 
   // Prepare register for old value.
   CPURegister old_value = out;
@@ -4751,9 +5011,7 @@
       // We need a temporary register but we have already used a scratch register for
       // the new value unless it is zero bit pattern (+0.0f or +0.0) and need another one
       // in GenerateGetAndUpdate(). We have allocated a normal temporary to handle that.
-      Location temp = locations->GetTemp((expected_coordinates_count == 0u) ? 2u : 1u);
-      old_value =
-          (load_store_type == DataType::Type::kInt64) ? XRegisterFrom(temp) : WRegisterFrom(temp);
+      old_value = CPURegisterFrom(locations->GetTemp(1u), load_store_type);
     } else if ((kEmitCompilerReadBarrier && kUseBakerReadBarrier) &&
                value_type == DataType::Type::kReference) {
       // Load the old value initially to a scratch register.
@@ -4762,14 +5020,46 @@
     }
   }
 
+  if (byte_swap) {
+    DCHECK_NE(value_type, DataType::Type::kReference);
+    DCHECK_NE(DataType::Size(value_type), 1u);
+    if (get_and_update_op == GetAndUpdateOp::kAdd) {
+      // We need to do the byte swapping in the CAS loop for GetAndAdd.
+      get_and_update_op = GetAndUpdateOp::kAddWithByteSwap;
+    } else if (!arg.IsZero()) {
+      // For other operations, avoid byte swap inside the CAS loop by providing an adjusted `arg`.
+      // For GetAndSet use a scratch register; FP argument is already in a scratch register.
+      // For bitwise operations GenerateGetAndUpdate() needs both scratch registers;
+      // we have allocated a normal temporary to handle that.
+      CPURegister temp = (get_and_update_op == GetAndUpdateOp::kSet)
+          ? (is_fp ? arg : (arg.Is64Bits() ? temps.AcquireX() : temps.AcquireW()))
+          : CPURegisterFrom(locations->GetTemp(1u), load_store_type);
+      GenerateReverseBytes(masm, load_store_type, arg, temp);
+      arg = temp;
+    }
+  }
+
   GenerateGetAndUpdate(codegen, get_and_update_op, load_store_type, order, tmp_ptr, arg, old_value);
 
-  if (get_and_update_op == GetAndUpdateOp::kSet && DataType::IsFloatingPointType(value_type)) {
-    if (value_type == DataType::Type::kFloat64) {
-      __ Fmov(out.D(), old_value.X());
-    } else {
-      __ Fmov(out.S(), old_value.W());
+  if (get_and_update_op == GetAndUpdateOp::kAddWithByteSwap) {
+    // The only adjustment needed is sign-extension for `kInt16`.
+    // Everything else has been done by the `GenerateGetAndUpdate()`.
+    DCHECK(byte_swap);
+    if (value_type == DataType::Type::kInt16) {
+      DCHECK_EQ(load_store_type, DataType::Type::kUint16);
+      __ Sxth(out.W(), old_value.W());
     }
+  } else if (byte_swap) {
+    // Also handles moving to FP registers.
+    GenerateReverseBytes(masm, value_type, old_value, out);
+  } else if (get_and_update_op == GetAndUpdateOp::kSet && value_type == DataType::Type::kFloat64) {
+    __ Fmov(out.D(), old_value.X());
+  } else if (get_and_update_op == GetAndUpdateOp::kSet && value_type == DataType::Type::kFloat32) {
+    __ Fmov(out.S(), old_value.W());
+  } else if (value_type == DataType::Type::kInt8) {
+    __ Sxtb(out.W(), old_value.W());
+  } else if (value_type == DataType::Type::kInt16) {
+    __ Sxth(out.W(), old_value.W());
   } else if (kEmitCompilerReadBarrier && value_type == DataType::Type::kReference) {
     if (kUseBakerReadBarrier) {
       codegen->GenerateIntrinsicCasMoveWithBakerReadBarrier(out.W(), old_value.W());
@@ -4783,7 +5073,10 @@
           /*index=*/ Location::RegisterLocation(target.offset.GetCode()));
     }
   }
-  __ Bind(slow_path->GetExitLabel());
+
+  if (!byte_swap) {
+    __ Bind(slow_path->GetExitLabel());
+  }
 }
 
 void IntrinsicLocationsBuilderARM64::VisitVarHandleGetAndSet(HInvoke* invoke) {
@@ -4906,6 +5199,92 @@
   GenerateVarHandleGetAndUpdate(invoke, codegen_, GetAndUpdateOp::kXor, std::memory_order_release);
 }
 
+void VarHandleSlowPathARM64::EmitByteArrayViewCode(CodeGenerator* codegen_in) {
+  DCHECK(GetByteArrayViewCheckLabel()->IsLinked());
+  CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
+  MacroAssembler* masm = codegen->GetVIXLAssembler();
+  HInvoke* invoke = GetInvoke();
+  mirror::VarHandle::AccessModeTemplate access_mode_template = GetAccessModeTemplate();
+  DataType::Type value_type =
+      GetVarHandleExpectedValueType(invoke, /*expected_coordinates_count=*/ 2u);
+  DCHECK_NE(value_type, DataType::Type::kReference);
+  size_t size = DataType::Size(value_type);
+  DCHECK_GT(size, 1u);
+  Register varhandle = InputRegisterAt(invoke, 0);
+  Register object = InputRegisterAt(invoke, 1);
+  Register index = InputRegisterAt(invoke, 2);
+
+  MemberOffset class_offset = mirror::Object::ClassOffset();
+  MemberOffset array_length_offset = mirror::Array::LengthOffset();
+  MemberOffset data_offset = mirror::Array::DataOffset(Primitive::kPrimByte);
+  MemberOffset native_byte_order_offset = mirror::ByteArrayViewVarHandle::NativeByteOrderOffset();
+
+  __ Bind(GetByteArrayViewCheckLabel());
+
+  VarHandleTarget target = GetVarHandleTarget(invoke);
+  {
+    UseScratchRegisterScope temps(masm);
+    Register temp = temps.AcquireW();
+    Register temp2 = temps.AcquireW();
+
+    // The main path checked that the coordinateType0 is an array class that matches
+    // the class of the actual coordinate argument, but its component type does not match
+    // the value type.
+    // Check if the `varhandle` references a ByteArrayViewVarHandle instance.
+    __ Ldr(temp, HeapOperand(varhandle, class_offset.Int32Value()));
+    codegen->LoadClassRootForIntrinsic(temp2, ClassRoot::kJavaLangInvokeByteArrayViewVarHandle);
+    __ Cmp(temp, temp2);
+    __ B(GetEntryLabel(), ne);
+
+    // Check for array index out of bounds.
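+    // The access covers `size` bytes starting at byte offset `index`, so it is in bounds
+    // if `index <= length` and `length - index >= size`.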
+    __ Ldr(temp, HeapOperand(object, array_length_offset.Int32Value()));
+    __ Subs(temp, temp, index);
+    __ Ccmp(temp, size, NoFlag, hs);  // If SUBS yields LO (C=false), keep the C flag clear.
+    __ B(GetEntryLabel(), lo);
+
+    // Construct the target.
+    __ Add(target.offset, index, data_offset.Int32Value());
+
+    // Alignment check. For unaligned access, go to the runtime.
+    DCHECK(IsPowerOfTwo(size));
+    if (size == 2u) {
+      __ Tbnz(target.offset, 0, GetEntryLabel());
+    } else {
+      __ Tst(target.offset, size - 1u);
+      __ B(GetEntryLabel(), ne);
+    }
+
+    // Byte order check. For native byte order return to the main path.
+    if (access_mode_template == mirror::VarHandle::AccessModeTemplate::kSet &&
+        IsConstantZeroBitPattern(invoke->InputAt(invoke->GetNumberOfArguments() - 1u))) {
+      // There is no reason to differentiate between native byte order and byte-swap
+      // for setting a zero bit pattern. Just return to the main path.
+      __ B(GetNativeByteOrderLabel());
+      return;
+    }
+    __ Ldr(temp, HeapOperand(varhandle, native_byte_order_offset.Int32Value()));
+    __ Cbnz(temp, GetNativeByteOrderLabel());
+  }
+
+  switch (access_mode_template) {
+    case mirror::VarHandle::AccessModeTemplate::kGet:
+      GenerateVarHandleGet(invoke, codegen, order_, /*byte_swap=*/ true);
+      break;
+    case mirror::VarHandle::AccessModeTemplate::kSet:
+      GenerateVarHandleSet(invoke, codegen, order_, /*byte_swap=*/ true);
+      break;
+    case mirror::VarHandle::AccessModeTemplate::kCompareAndSet:
+    case mirror::VarHandle::AccessModeTemplate::kCompareAndExchange:
+      GenerateVarHandleCompareAndSetOrExchange(
+          invoke, codegen, order_, return_success_, strong_, /*byte_swap=*/ true);
+      break;
+    case mirror::VarHandle::AccessModeTemplate::kGetAndUpdate:
+      GenerateVarHandleGetAndUpdate(
+          invoke, codegen, get_and_update_op_, order_, /*byte_swap=*/ true);
+      break;
+  }
+  __ B(GetExitLabel());
+}
+
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOf);
 UNIMPLEMENTED_INTRINSIC(ARM64, StringStringIndexOfAfter);
 UNIMPLEMENTED_INTRINSIC(ARM64, StringBufferAppend);
diff --git a/compiler/optimizing/intrinsics_arm_vixl.cc b/compiler/optimizing/intrinsics_arm_vixl.cc
index 94e207b..df214cc 100644
--- a/compiler/optimizing/intrinsics_arm_vixl.cc
+++ b/compiler/optimizing/intrinsics_arm_vixl.cc
@@ -3755,7 +3755,7 @@
   if (number_of_arguments == /* VarHandle object */ 1u + expected_coordinates_count) {
     return invoke->GetType();
   } else {
-    return invoke->InputAt(number_of_arguments - 1u)->GetType();
+    return GetDataTypeFromShorty(invoke, number_of_arguments - 1u);
   }
 }
 
@@ -4265,7 +4265,7 @@
   }
 
   uint32_t number_of_arguments = invoke->GetNumberOfArguments();
-  DataType::Type value_type = invoke->InputAt(number_of_arguments - 1u)->GetType();
+  DataType::Type value_type = GetDataTypeFromShorty(invoke, number_of_arguments - 1u);
   if ((kEmitCompilerReadBarrier && !kUseBakerReadBarrier) &&
       value_type == DataType::Type::kReference) {
     // Unsupported for non-Baker read barrier because the artReadBarrierSlow() ignores
diff --git a/runtime/mirror/var_handle.h b/runtime/mirror/var_handle.h
index c1dcb94..64da936 100644
--- a/runtime/mirror/var_handle.h
+++ b/runtime/mirror/var_handle.h
@@ -258,11 +258,11 @@
 
   bool GetNativeByteOrder() REQUIRES_SHARED(Locks::mutator_lock_);
 
- private:
   static MemberOffset NativeByteOrderOffset() {
     return MemberOffset(OFFSETOF_MEMBER(ByteArrayViewVarHandle, native_byte_order_));
   }
 
+ private:
   // Flag indicating that accessors should use native byte-ordering.
   uint8_t native_byte_order_;