Relaxed memory barriers for x86

x86 provides stronger memory-ordering guarantees than the Java Memory Model
requires for most barrier kinds, so its memory barriers can be relaxed. This
patch ensures that all memory barriers for x86 are treated as scheduling
barriers, and that an mfence is emitted only in the one case that requires a
hardware fence (StoreLoad).
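
For reference, a minimal sketch of the intended mapping (illustrative only;
the helper name below is hypothetical, while the MemBarrierKind values are
the quick compiler's):

    // Which barrier kinds need a hardware fence on x86; everything else is
    // satisfied by a compiler-only scheduling barrier.
    static bool BarrierNeedsMfence(MemBarrierKind kind) {
      // x86 only permits store-load reordering, so kStoreLoad is the only
      // kind that must emit an mfence (unless the preceding instruction,
      // e.g. a locked cmpxchg, already acts as a full barrier).
      return kind == kStoreLoad;
    }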

Change-Id: I13d02bf3f152083ba9f358052aedb583b0d48640
Signed-off-by: Razvan A Lupusoru <razvan.a.lupusoru@intel.com>
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index 718468f..eb4a336 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -339,7 +339,16 @@
 
 std::ostream& operator<<(std::ostream& os, const DividePattern& pattern);
 
-// Memory barrier types (see "The JSR-133 Cookbook for Compiler Writers").
+/**
+ * @brief Memory barrier types (see "The JSR-133 Cookbook for Compiler Writers").
+ * @details Without context sensitive analysis, the most conservative set of barriers
+ * must be issued to ensure the semantics of the Java Memory Model. Thus the recipe is as follows:
+ * -# Use StoreStore barrier before volatile store.
+ * -# Use StoreLoad barrier after volatile store.
+ * -# Use LoadLoad and LoadStore barrier after each volatile load.
+ * -# Use StoreStore barrier after all stores but before return from any constructor whose
+ * class has final fields.
+ */
 enum MemBarrierKind {
   kLoadStore,
   kLoadLoad,
@@ -364,6 +373,7 @@
   kPCRelFixup,  // x86 FIXME: add NEEDS_FIXUP to instruction attributes.
   kRegDef0,
   kRegDef1,
+  kRegDef2,
   kRegDefA,
   kRegDefD,
   kRegDefFPCSList0,
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 1d959fa..93d48d6 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -777,6 +777,9 @@
 
 void ArmMir2Lir::GenMemBarrier(MemBarrierKind barrier_kind) {
 #if ANDROID_SMP != 0
+  // Start off by using the last LIR as the barrier. If it is not sufficient, we will generate a new one.
+  LIR* barrier = last_lir_insn_;
+
   int dmb_flavor;
   // TODO: revisit Arm barrier kinds
   switch (barrier_kind) {
@@ -789,8 +792,16 @@
       dmb_flavor = kSY;  // quiet gcc.
       break;
   }
-  LIR* dmb = NewLIR1(kThumb2Dmb, dmb_flavor);
-  dmb->u.m.def_mask = ENCODE_ALL;
+
+  // If the same barrier already exists, don't generate another.
+  if (barrier == nullptr || barrier->opcode != kThumb2Dmb
+      || barrier->operands[0] != dmb_flavor) {
+    barrier = NewLIR1(kThumb2Dmb, dmb_flavor);
+  }
+
+  // At this point we must have a memory barrier. Mark it as a scheduling barrier as well.
+  DCHECK(!barrier->flags.use_def_invalid);
+  barrier->u.m.def_mask = ENCODE_ALL;
 #endif
 }
 
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 8c3a11fb..3235977 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -479,6 +479,7 @@
       rl_src = LoadValue(rl_src, kAnyReg);
     }
     if (field_info.IsVolatile()) {
+      // There might have been a store before this volatile one, so insert a StoreStore barrier.
       GenMemBarrier(kStoreStore);
     }
     if (is_long_or_double) {
@@ -488,6 +489,7 @@
       StoreWordDisp(r_base, field_info.FieldOffset().Int32Value(), rl_src.reg.GetReg());
     }
     if (field_info.IsVolatile()) {
+      // A load might follow the volatile store, so insert a StoreLoad barrier.
       GenMemBarrier(kStoreLoad);
     }
     if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) {
@@ -559,9 +561,7 @@
     }
     // r_base now holds static storage base
     RegLocation rl_result = EvalLoc(rl_dest, kAnyReg, true);
-    if (field_info.IsVolatile()) {
-      GenMemBarrier(kLoadLoad);
-    }
+
     if (is_long_or_double) {
       LoadBaseDispWide(r_base, field_info.FieldOffset().Int32Value(), rl_result.reg.GetReg(),
                        rl_result.reg.GetHighReg(), INVALID_SREG);
@@ -569,6 +569,14 @@
       LoadWordDisp(r_base, field_info.FieldOffset().Int32Value(), rl_result.reg.GetReg());
     }
     FreeTemp(r_base);
+
+    if (field_info.IsVolatile()) {
+      // Without context sensitive analysis, we must issue the most conservative barriers.
+      // In this case, either a load or store may follow so we issue both barriers.
+      GenMemBarrier(kLoadLoad);
+      GenMemBarrier(kLoadStore);
+    }
+
     if (is_long_or_double) {
       StoreValueWide(rl_dest, rl_result);
     } else {
@@ -716,7 +724,10 @@
                          rl_result.reg.GetHighReg(), rl_obj.s_reg_low);
         MarkPossibleNullPointerException(opt_flags);
         if (field_info.IsVolatile()) {
+          // Without context sensitive analysis, we must issue the most conservative barriers.
+          // In this case, either a load or store may follow so we issue both barriers.
           GenMemBarrier(kLoadLoad);
+          GenMemBarrier(kLoadStore);
         }
       } else {
         int reg_ptr = AllocTemp();
@@ -725,7 +736,10 @@
         LoadBaseDispWide(reg_ptr, 0, rl_result.reg.GetReg(), rl_result.reg.GetHighReg(),
                          INVALID_SREG);
         if (field_info.IsVolatile()) {
+          // Without context sensitive analysis, we must issue the most conservative barriers.
+          // In this case, either a load or store may follow so we issue both barriers.
           GenMemBarrier(kLoadLoad);
+          GenMemBarrier(kLoadStore);
         }
         FreeTemp(reg_ptr);
       }
@@ -737,7 +751,10 @@
                    rl_result.reg.GetReg(), kWord, rl_obj.s_reg_low);
       MarkPossibleNullPointerException(opt_flags);
       if (field_info.IsVolatile()) {
+        // Without context sensitive analysis, we must issue the most conservative barriers.
+        // In this case, either a load or store may follow so we issue both barriers.
         GenMemBarrier(kLoadLoad);
+        GenMemBarrier(kLoadStore);
       }
       StoreValue(rl_dest, rl_result);
     }
@@ -773,25 +790,29 @@
       reg_ptr = AllocTemp();
       OpRegRegImm(kOpAdd, reg_ptr, rl_obj.reg.GetReg(), field_info.FieldOffset().Int32Value());
       if (field_info.IsVolatile()) {
+        // There might have been a store before this volatile one, so insert a StoreStore barrier.
         GenMemBarrier(kStoreStore);
       }
       StoreBaseDispWide(reg_ptr, 0, rl_src.reg.GetReg(), rl_src.reg.GetHighReg());
       MarkPossibleNullPointerException(opt_flags);
       if (field_info.IsVolatile()) {
-        GenMemBarrier(kLoadLoad);
+        // A load might follow the volatile store, so insert a StoreLoad barrier.
+        GenMemBarrier(kStoreLoad);
       }
       FreeTemp(reg_ptr);
     } else {
       rl_src = LoadValue(rl_src, reg_class);
       GenNullCheck(rl_obj.reg.GetReg(), opt_flags);
       if (field_info.IsVolatile()) {
+        // There might have been a store before this volatile one, so insert a StoreStore barrier.
         GenMemBarrier(kStoreStore);
       }
       StoreBaseDisp(rl_obj.reg.GetReg(), field_info.FieldOffset().Int32Value(),
         rl_src.reg.GetReg(), kWord);
       MarkPossibleNullPointerException(opt_flags);
       if (field_info.IsVolatile()) {
-        GenMemBarrier(kLoadLoad);
+        // A load might follow the volatile store, so insert a StoreLoad barrier.
+        GenMemBarrier(kStoreLoad);
       }
       if (is_object && !mir_graph_->IsConstantNullRef(rl_src)) {
         MarkGCCard(rl_src.reg.GetReg(), rl_obj.reg.GetReg());
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index f3c5a34..9e50749 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -1356,18 +1356,27 @@
   RegLocation rl_src_offset = info->args[2];  // long low
   rl_src_offset.wide = 0;  // ignore high half in info->args[3]
   RegLocation rl_dest = is_long ? InlineTargetWide(info) : InlineTarget(info);  // result reg
-  if (is_volatile) {
-    GenMemBarrier(kLoadLoad);
-  }
+
   RegLocation rl_object = LoadValue(rl_src_obj, kCoreReg);
   RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (is_long) {
     OpRegReg(kOpAdd, rl_object.reg.GetReg(), rl_offset.reg.GetReg());
     LoadBaseDispWide(rl_object.reg.GetReg(), 0, rl_result.reg.GetReg(), rl_result.reg.GetHighReg(), INVALID_SREG);
-    StoreValueWide(rl_dest, rl_result);
   } else {
     LoadBaseIndexed(rl_object.reg.GetReg(), rl_offset.reg.GetReg(), rl_result.reg.GetReg(), 0, kWord);
+  }
+
+  if (is_volatile) {
+    // Without context sensitive analysis, we must issue the most conservative barriers.
+    // In this case, either a load or store may follow so we issue both barriers.
+    GenMemBarrier(kLoadLoad);
+    GenMemBarrier(kLoadStore);
+  }
+
+  if (is_long) {
+    StoreValueWide(rl_dest, rl_result);
+  } else {
     StoreValue(rl_dest, rl_result);
   }
   return true;
@@ -1385,6 +1394,7 @@
   rl_src_offset.wide = 0;  // ignore high half in info->args[3]
   RegLocation rl_src_value = info->args[4];  // value to store
   if (is_volatile || is_ordered) {
+    // There might have been a store before this volatile one, so insert a StoreStore barrier.
     GenMemBarrier(kStoreStore);
   }
   RegLocation rl_object = LoadValue(rl_src_obj, kCoreReg);
@@ -1401,7 +1411,9 @@
 
   // Free up the temp early, to ensure x86 doesn't run out of temporaries in MarkGCCard.
   FreeTemp(rl_offset.reg.GetReg());
+
   if (is_volatile) {
+    // A load might follow the volatile store, so insert a StoreLoad barrier.
     GenMemBarrier(kStoreLoad);
   }
   if (is_object) {
diff --git a/compiler/dex/quick/local_optimizations.cc b/compiler/dex/quick/local_optimizations.cc
index 6df91e6..dd4af9c 100644
--- a/compiler/dex/quick/local_optimizations.cc
+++ b/compiler/dex/quick/local_optimizations.cc
@@ -92,7 +92,10 @@
         ((target_flags & (REG_DEF0 | REG_DEF1)) == (REG_DEF0 | REG_DEF1)) ||  // Skip wide loads.
         ((target_flags & (REG_USE0 | REG_USE1 | REG_USE2)) ==
          (REG_USE0 | REG_USE1 | REG_USE2)) ||  // Skip wide stores.
-        !(target_flags & (IS_LOAD | IS_STORE))) {
+        // Skip instructions that are neither loads nor stores.
+        !(target_flags & (IS_LOAD | IS_STORE)) ||
+        // Skip instructions that both load and store.
+        ((target_flags & (IS_STORE | IS_LOAD)) == (IS_STORE | IS_LOAD))) {
       continue;
     }
 
@@ -293,7 +296,8 @@
     /* Skip non-interesting instructions */
     if (!(target_flags & IS_LOAD) ||
         (this_lir->flags.is_nop == true) ||
-        ((target_flags & (REG_DEF0 | REG_DEF1)) == (REG_DEF0 | REG_DEF1))) {
+        ((target_flags & (REG_DEF0 | REG_DEF1)) == (REG_DEF0 | REG_DEF1)) ||
+        ((target_flags & (IS_STORE | IS_LOAD)) == (IS_STORE | IS_LOAD))) {
       continue;
     }
 
diff --git a/compiler/dex/quick/mir_to_lir-inl.h b/compiler/dex/quick/mir_to_lir-inl.h
index 8b1f81d..b2362fc 100644
--- a/compiler/dex/quick/mir_to_lir-inl.h
+++ b/compiler/dex/quick/mir_to_lir-inl.h
@@ -192,6 +192,10 @@
     SetupRegMask(&lir->u.m.def_mask, lir->operands[1]);
   }
 
+  if (flags & REG_DEF2) {
+    SetupRegMask(&lir->u.m.def_mask, lir->operands[2]);
+  }
+
   if (flags & REG_USE0) {
     SetupRegMask(&lir->u.m.use_mask, lir->operands[0]);
   }
diff --git a/compiler/dex/quick/mir_to_lir.cc b/compiler/dex/quick/mir_to_lir.cc
index 82664e2..657c3d9 100644
--- a/compiler/dex/quick/mir_to_lir.cc
+++ b/compiler/dex/quick/mir_to_lir.cc
@@ -138,7 +138,10 @@
     LoadBaseDisp(reg_obj, data.field_offset, rl_dest.reg.GetReg(), kWord, INVALID_SREG);
   }
   if (data.is_volatile) {
+    // Without context sensitive analysis, we must issue the most conservative barriers.
+    // In this case, either a load or store may follow so we issue both barriers.
     GenMemBarrier(kLoadLoad);
+    GenMemBarrier(kLoadStore);
   }
   return true;
 }
@@ -160,6 +163,7 @@
   int reg_obj = LoadArg(data.object_arg);
   int reg_src = LoadArg(data.src_arg, wide);
   if (data.is_volatile) {
+    // There might have been a store before this volatile one, so insert a StoreStore barrier.
     GenMemBarrier(kStoreStore);
   }
   if (wide) {
@@ -170,7 +174,8 @@
     StoreBaseDisp(reg_obj, data.field_offset, reg_src, kWord);
   }
   if (data.is_volatile) {
-    GenMemBarrier(kLoadLoad);
+    // A load might follow the volatile store, so insert a StoreLoad barrier.
+    GenMemBarrier(kStoreLoad);
   }
   if (data.op_variant == InlineMethodAnalyser::IPutVariant(Instruction::IPUT_OBJECT)) {
     MarkGCCard(reg_src, reg_obj);
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 5a1f6cd..51fe8f1 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -56,6 +56,7 @@
 #define NO_OPERAND           (1ULL << kNoOperand)
 #define REG_DEF0             (1ULL << kRegDef0)
 #define REG_DEF1             (1ULL << kRegDef1)
+#define REG_DEF2             (1ULL << kRegDef2)
 #define REG_DEFA             (1ULL << kRegDefA)
 #define REG_DEFD             (1ULL << kRegDefD)
 #define REG_DEF_FPCS_LIST0   (1ULL << kRegDefFPCSList0)
@@ -953,7 +954,16 @@
      */
     virtual void GenSelect(BasicBlock* bb, MIR* mir) = 0;
 
+    /**
+     * @brief Used to generate a memory barrier in an architecture-specific way.
+     * @details The last generated LIR is considered for reuse as the barrier. Namely,
+     * if that LIR can be updated so that it also provides the required barrier
+     * semantics, it is used as the barrier. Otherwise, a new LIR that provides
+     * those semantics is generated.
+     * @param barrier_kind The kind of memory barrier to generate.
+     */
     virtual void GenMemBarrier(MemBarrierKind barrier_kind) = 0;
+
     virtual void GenMoveException(RegLocation rl_dest) = 0;
     virtual void GenMultiplyByTwoBitMultiplier(RegLocation rl_src,
                                                RegLocation rl_result, int lit, int first_bit,
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 9cafcee..e7a1a69 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -344,6 +344,7 @@
   { kX86LockCmpxchgAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014 | REG_DEFA_USEA | SETS_CCODES, { 0xF0, 0, 0x0F, 0xB1, 0, 0, 0, 0 }, "Lock Cmpxchg", "[!0r+!1r<<!2d+!3d],!4r" },
   { kX86LockCmpxchg8bM, kMem,   IS_STORE | IS_BINARY_OP | REG_USE0 | REG_DEFAD_USEAD | REG_USEC | REG_USEB | SETS_CCODES, { 0xF0, 0, 0x0F, 0xC7, 0, 1, 0, 0 }, "Lock Cmpxchg8b", "[!0r+!1d]" },
   { kX86LockCmpxchg8bA, kArray, IS_STORE | IS_QUAD_OP | REG_USE01 | REG_DEFAD_USEAD | REG_USEC | REG_USEB | SETS_CCODES, { 0xF0, 0, 0x0F, 0xC7, 0, 1, 0, 0 }, "Lock Cmpxchg8b", "[!0r+!1r<<!2d+!3d]" },
+  { kX86XchgMR, kMemReg, IS_STORE | IS_LOAD | IS_TERTIARY_OP | REG_DEF2 | REG_USE02, { 0, 0, 0x87, 0, 0, 0, 0, 0 }, "Xchg", "[!0r+!1d],!2r" },
 
   EXT_0F_ENCODING_MAP(Movzx8,  0x00, 0xB6, REG_DEF0),
   EXT_0F_ENCODING_MAP(Movzx16, 0x00, 0xB7, REG_DEF0),
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 275a2d9..f2654b9 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -372,6 +372,8 @@
     void OpVectorRegCopyWide(uint8_t fp_reg, uint8_t low_reg, uint8_t high_reg);
     void GenConstWide(RegLocation rl_dest, int64_t value);
 
+    static bool ProvidesFullMemoryBarrier(X86OpCode opcode);
+
     /*
      * @brief generate inline code for fast case of Strng.indexOf.
      * @param info Call parameters
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index c929265..14278a4 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -772,6 +772,11 @@
                    : (SRegOffset(rl_src_offset.s_reg_low) + push_offset));
     LoadWordDisp(TargetReg(kSp), srcOffsetSp, rSI);
     NewLIR4(kX86LockCmpxchg8bA, rDI, rSI, 0, 0);
+
+    // After a store we need a barrier in case a load follows. Since the locked
+    // cmpxchg has full barrier semantics, only a scheduling barrier will be generated.
+    GenMemBarrier(kStoreLoad);
+
     FreeTemp(rSI);
     UnmarkTemp(rSI);
     NewLIR1(kX86Pop32R, rSI);
@@ -784,9 +789,6 @@
     FlushReg(r0);
     LockTemp(r0);
 
-    // Release store semantics, get the barrier out of the way.  TODO: revisit
-    GenMemBarrier(kStoreLoad);
-
     RegLocation rl_object = LoadValue(rl_src_obj, kCoreReg);
     RegLocation rl_new_value = LoadValue(rl_src_new_value, kCoreReg);
 
@@ -801,6 +803,10 @@
     LoadValueDirect(rl_src_expected, r0);
     NewLIR5(kX86LockCmpxchgAR, rl_object.reg.GetReg(), rl_offset.reg.GetReg(), 0, 0, rl_new_value.reg.GetReg());
 
+    // After a store we need a barrier in case a load follows. Since the locked
+    // cmpxchg has full barrier semantics, only a scheduling barrier will be generated.
+    GenMemBarrier(kStoreLoad);
+
     FreeTemp(r0);
   }
 
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index 9994927..9588b17 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -425,10 +425,54 @@
   FreeTemp(rX86_ARG3);
 }
 
+bool X86Mir2Lir::ProvidesFullMemoryBarrier(X86OpCode opcode) {
+  switch (opcode) {
+    case kX86LockCmpxchgMR:
+    case kX86LockCmpxchgAR:
+    case kX86LockCmpxchg8bM:
+    case kX86LockCmpxchg8bA:
+    case kX86XchgMR:
+    case kX86Mfence:
+      // These instructions act as full memory barriers.
+      return true;
+    default:
+      break;
+  }
+
+  // Be conservative if we cannot prove that the instruction provides a full barrier.
+  return false;
+}
+
 void X86Mir2Lir::GenMemBarrier(MemBarrierKind barrier_kind) {
 #if ANDROID_SMP != 0
-  // TODO: optimize fences
-  NewLIR0(kX86Mfence);
+  // Start off by using the last LIR as the barrier. If it is not sufficient, we will generate a new one.
+  LIR* mem_barrier = last_lir_insn_;
+
+  /*
+   * According to the JSR-133 Cookbook, on x86 only StoreLoad barriers require a memory fence. All other
+   * barriers (LoadLoad, LoadStore, StoreStore) are no-ops under the x86 memory model. For those cases, all
+   * we need to ensure is that a scheduling barrier is in place.
+   */
+  if (barrier_kind == kStoreLoad) {
+    // If no LIR already exists that can be used as a barrier, then generate an mfence.
+    if (mem_barrier == nullptr) {
+      mem_barrier = NewLIR0(kX86Mfence);
+    }
+
+    // If the last instruction does not provide a full barrier, then insert an mfence.
+    if (!ProvidesFullMemoryBarrier(static_cast<X86OpCode>(mem_barrier->opcode))) {
+      mem_barrier = NewLIR0(kX86Mfence);
+    }
+  }
+
+  // Now ensure that a scheduling barrier is in place.
+  if (mem_barrier == nullptr) {
+    GenBarrier();
+  } else {
+    // Mark as a scheduling barrier.
+    DCHECK(!mem_barrier->flags.use_def_invalid);
+    mem_barrier->u.m.def_mask = ENCODE_ALL;
+  }
 #endif
 }
 
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index abe1b3d..b72e60d 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -388,6 +388,7 @@
   kX86CmpxchgRR, kX86CmpxchgMR, kX86CmpxchgAR,  // compare and exchange
   kX86LockCmpxchgMR, kX86LockCmpxchgAR,  // locked compare and exchange
   kX86LockCmpxchg8bM, kX86LockCmpxchg8bA,  // locked compare and exchange
+  kX86XchgMR,  // exchange memory with register (automatically locked)
   Binary0fOpCode(kX86Movzx8),   // zero-extend 8-bit value
   Binary0fOpCode(kX86Movzx16),  // zero-extend 16-bit value
   Binary0fOpCode(kX86Movsx8),   // sign-extend 8-bit value
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index ab0ee52..4a03ebe 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -226,6 +226,12 @@
     opcode << "j" << condition_codes[*instr & 0xF];
     branch_bytes = 1;
     break;
+  case 0x86: case 0x87:
+    opcode << "xchg";
+    store = true;
+    has_modrm = true;
+    byte_operand = (*instr == 0x86);
+    break;
   case 0x88: opcode << "mov"; store = true; has_modrm = true; byte_operand = true; break;
   case 0x89: opcode << "mov"; store = true; has_modrm = true; break;
   case 0x8A: opcode << "mov"; load = true; has_modrm = true; byte_operand = true; break;