MIPS32: Fill branch delay slots

Test: booted MIPS32 in QEMU
Test: test-art-host-gtest
Test: test-art-target-gtest
Test: test-art-target-run-test-optimizing on CI20

Change-Id: I727e80753395ab99fff004cb5d2e0a06409150d7
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index da72c75..a205800 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -332,14 +332,14 @@
     0x20, 0x00, 0xBD, 0x27, 0x20, 0x00, 0xB2, 0x8F, 0x24, 0x00, 0xB3, 0x8F,
     0x28, 0x00, 0xB4, 0x8F, 0x2C, 0x00, 0xB5, 0x8F, 0x30, 0x00, 0xB6, 0x8F,
     0x34, 0x00, 0xB7, 0x8F, 0x38, 0x00, 0xBE, 0x8F, 0x3C, 0x00, 0xBF, 0x8F,
-    0x40, 0x00, 0xBD, 0x27, 0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00,
+    0x09, 0x00, 0xE0, 0x03, 0x40, 0x00, 0xBD, 0x27,
 };
 static constexpr uint8_t expected_cfi_kMips[] = {
     0x44, 0x0E, 0x40, 0x44, 0x9F, 0x01, 0x44, 0x9E, 0x02, 0x44, 0x97, 0x03,
     0x44, 0x96, 0x04, 0x44, 0x95, 0x05, 0x44, 0x94, 0x06, 0x44, 0x93, 0x07,
     0x44, 0x92, 0x08, 0x58, 0x0E, 0x60, 0x44, 0x0E, 0x40, 0x0A, 0x44, 0xD2,
     0x44, 0xD3, 0x44, 0xD4, 0x44, 0xD5, 0x44, 0xD6, 0x44, 0xD7, 0x44, 0xDE,
-    0x44, 0xDF, 0x44, 0x0E, 0x00, 0x48, 0x0B, 0x0E, 0x40,
+    0x44, 0xDF, 0x48, 0x0E, 0x00, 0x0B, 0x0E, 0x40,
 };
 // 0x00000000: addiu r29, r29, -64
 // 0x00000004: .cfi_def_cfa_offset: 64
@@ -385,12 +385,11 @@
 // 0x0000005c: .cfi_restore: r30
 // 0x0000005c: lw r31, +60(r29)
 // 0x00000060: .cfi_restore: r31
-// 0x00000060: addiu r29, r29, 64
-// 0x00000064: .cfi_def_cfa_offset: 0
-// 0x00000064: jr r31
-// 0x00000068: nop
-// 0x0000006c: .cfi_restore_state
-// 0x0000006c: .cfi_def_cfa_offset: 64
+// 0x00000060: jr r31
+// 0x00000064: addiu r29, r29, 64
+// 0x00000068: .cfi_def_cfa_offset: 0
+// 0x00000068: .cfi_restore_state
+// 0x00000068: .cfi_def_cfa_offset: 64
 
 static constexpr uint8_t expected_asm_kMips64[] = {
     0x90, 0xFF, 0xBD, 0x67, 0x68, 0x00, 0xBF, 0xFF, 0x60, 0x00, 0xBE, 0xFF,
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 0d3f849..b0de964 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -753,7 +753,7 @@
   }
 
   // Collect PC infos for the mapping table.
-  uint32_t native_pc = GetAssembler()->CodeSize();
+  uint32_t native_pc = GetAssembler()->CodePosition();
 
   if (instruction == nullptr) {
     // For stack overflow checks and native-debug-info entries without dex register
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 8a2f90d..e0de03b 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -792,12 +792,24 @@
       // TODO: __ cfi().Restore(DWARFReg(reg));
     }
 
-    __ DecreaseFrameSize(GetFrameSize());
+    size_t frame_size = GetFrameSize();
+    // Adjust the stack pointer in the delay slot if doing so doesn't break CFI.
+    bool exchange = IsInt<16>(static_cast<int32_t>(frame_size));
+    bool reordering = __ SetReorder(false);
+    if (exchange) {
+      __ Jr(RA);
+      __ DecreaseFrameSize(frame_size);  // Single instruction in delay slot.
+    } else {
+      __ DecreaseFrameSize(frame_size);
+      __ Jr(RA);
+      __ Nop();  // In delay slot.
+    }
+    __ SetReorder(reordering);
+  } else {
+    __ Jr(RA);
+    __ NopIfNoReordering();
   }
 
-  __ Jr(RA);
-  __ Nop();
-
   __ cfi().RestoreState();
   __ cfi().DefCFAOffset(GetFrameSize());
 }
@@ -1251,6 +1263,7 @@
                                       uint32_t dex_pc,
                                       SlowPathCode* slow_path,
                                       bool is_direct_entrypoint) {
+  bool reordering = __ SetReorder(false);
   __ LoadFromOffset(kLoadWord, T9, TR, entry_point_offset);
   __ Jalr(T9);
   if (is_direct_entrypoint) {
@@ -1262,6 +1275,7 @@
   } else {
     __ Nop();  // In delay slot.
   }
+  __ SetReorder(reordering);
   RecordPcInfo(instruction, dex_pc, slow_path);
 }
 
@@ -3953,7 +3967,7 @@
   __ LoadFromOffset(kLoadWord, T9, temp, entry_point.Int32Value());
   // T9();
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   DCHECK(!codegen_->IsLeafMethod());
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
@@ -4254,7 +4268,7 @@
       // T9 prepared above for better instruction scheduling.
       // T9()
       __ Jalr(T9);
-      __ Nop();
+      __ NopIfNoReordering();
       break;
     case HInvokeStaticOrDirect::CodePtrLocation::kCallPCRelative:
       // TODO: Implement this type.
@@ -4270,7 +4284,7 @@
                             kMipsPointerSize).Int32Value());
       // T9()
       __ Jalr(T9);
-      __ Nop();
+      __ NopIfNoReordering();
       break;
   }
   DCHECK(!IsLeafMethod());
@@ -4312,7 +4326,7 @@
   __ LoadFromOffset(kLoadWord, T9, temp, entry_point.Int32Value());
   // T9();
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
 }
 
 void InstructionCodeGeneratorMIPS::VisitInvokeVirtual(HInvokeVirtual* invoke) {
@@ -4421,6 +4435,7 @@
       DCHECK(!kEmitCompilerReadBarrier);
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewPcRelativeTypePatch(cls->GetDexFile(), cls->GetTypeIndex());
+      bool reordering = __ SetReorder(false);
       if (isR6) {
         __ Bind(&info->high_label);
         __ Bind(&info->pc_rel_label);
@@ -4436,6 +4451,7 @@
         // Add a 32-bit offset to PC.
         __ Addu(out, out, base_or_current_method_reg);
       }
+      __ SetReorder(reordering);
       break;
     }
     case HLoadClass::LoadKind::kBootImageAddress: {
@@ -4579,6 +4595,7 @@
       DCHECK(!kEmitCompilerReadBarrier);
       CodeGeneratorMIPS::PcRelativePatchInfo* info =
           codegen_->NewPcRelativeStringPatch(load->GetDexFile(), load->GetStringIndex());
+      bool reordering = __ SetReorder(false);
       if (isR6) {
         __ Bind(&info->high_label);
         __ Bind(&info->pc_rel_label);
@@ -4594,6 +4611,7 @@
         // Add a 32-bit offset to PC.
         __ Addu(out, out, base_or_current_method_reg);
       }
+      __ SetReorder(reordering);
       return;  // No dex cache slow path.
     }
     case HLoadString::LoadKind::kBootImageAddress: {
@@ -4851,7 +4869,7 @@
     __ LoadFromOffset(kLoadWord, temp, TR, QUICK_ENTRY_POINT(pNewEmptyString));
     __ LoadFromOffset(kLoadWord, T9, temp, code_offset.Int32Value());
     __ Jalr(T9);
-    __ Nop();
+    __ NopIfNoReordering();
     codegen_->RecordPcInfo(instruction, instruction->GetDexPc());
   } else {
     codegen_->InvokeRuntime(
@@ -5751,7 +5769,7 @@
   Register reg = base->GetLocations()->Out().AsRegister<Register>();
   CodeGeneratorMIPS::PcRelativePatchInfo* info =
       codegen_->NewPcRelativeDexCacheArrayPatch(base->GetDexFile(), base->GetElementOffset());
-
+  bool reordering = __ SetReorder(false);
   if (codegen_->GetInstructionSetFeatures().IsR6()) {
     __ Bind(&info->high_label);
     __ Bind(&info->pc_rel_label);
@@ -5769,6 +5787,7 @@
     __ Addu(reg, reg, RA);
     // TODO: Can we share this code with that of VisitMipsComputeBaseMethodAddress()?
   }
+  __ SetReorder(reordering);
 }
 
 void LocationsBuilderMIPS::VisitInvokeUnresolved(HInvokeUnresolved* invoke) {
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 6e5eb66..862a93f 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -1901,7 +1901,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pStringCompareTo).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   __ Bind(slow_path->GetExitLabel());
 }
 
@@ -2060,7 +2060,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pIndexOf).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
 
   if (slow_path != nullptr) {
     __ Bind(slow_path->GetExitLabel());
@@ -2146,7 +2146,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromBytes).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
   __ Bind(slow_path->GetExitLabel());
 }
@@ -2179,7 +2179,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromChars).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
 }
 
@@ -2208,7 +2208,7 @@
                     TR,
                     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pAllocStringFromString).Int32Value());
   __ Jalr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   codegen_->RecordPcInfo(invoke, invoke->GetDexPc());
   __ Bind(slow_path->GetExitLabel());
 }
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index 05eb063..6c5030c 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -144,12 +144,12 @@
     0x34, 0x00, 0xB0, 0xAF, 0x28, 0x00, 0xB6, 0xF7, 0x20, 0x00, 0xB4, 0xF7,
     0x00, 0x00, 0xA4, 0xAF, 0x3C, 0x00, 0xBF, 0x8F, 0x38, 0x00, 0xB1, 0x8F,
     0x34, 0x00, 0xB0, 0x8F, 0x28, 0x00, 0xB6, 0xD7, 0x20, 0x00, 0xB4, 0xD7,
-    0x40, 0x00, 0xBD, 0x27, 0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00,
+    0x09, 0x00, 0xE0, 0x03, 0x40, 0x00, 0xBD, 0x27,
 };
 static constexpr uint8_t expected_cfi_kMips[] = {
     0x44, 0x0E, 0x40, 0x44, 0x9F, 0x01, 0x44, 0x91, 0x02, 0x44, 0x90, 0x03,
-    0x4C, 0x0A, 0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x4C, 0x0E, 0x00, 0x48,
-    0x0B, 0x0E, 0x40,
+    0x4C, 0x0A, 0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x50, 0x0E, 0x00, 0x0B,
+    0x0E, 0x40,
 };
 // 0x00000000: addiu r29, r29, -64
 // 0x00000004: .cfi_def_cfa_offset: 64
@@ -171,12 +171,11 @@
 // 0x00000028: .cfi_restore: r16
 // 0x00000028: ldc1 f22, +40(r29)
 // 0x0000002c: ldc1 f20, +32(r29)
-// 0x00000030: addiu r29, r29, 64
-// 0x00000034: .cfi_def_cfa_offset: 0
-// 0x00000034: jr r31
-// 0x00000038: nop
-// 0x0000003c: .cfi_restore_state
-// 0x0000003c: .cfi_def_cfa_offset: 64
+// 0x00000030: jr r31
+// 0x00000034: addiu r29, r29, 64
+// 0x00000038: .cfi_def_cfa_offset: 0
+// 0x00000038: .cfi_restore_state
+// 0x00000038: .cfi_def_cfa_offset: 64
 
 static constexpr uint8_t expected_asm_kMips64[] = {
     0xD8, 0xFF, 0xBD, 0x67, 0x20, 0x00, 0xBF, 0xFF, 0x18, 0x00, 0xB1, 0xFF,
@@ -348,14 +347,13 @@
 };
 static constexpr uint8_t expected_asm_kMips_adjust_tail[] = {
     0x3C, 0x00, 0xBF, 0x8F, 0x38, 0x00, 0xB1, 0x8F, 0x34, 0x00, 0xB0, 0x8F,
-    0x28, 0x00, 0xB6, 0xD7, 0x20, 0x00, 0xB4, 0xD7, 0x40, 0x00, 0xBD, 0x27,
-    0x09, 0x00, 0xE0, 0x03, 0x00, 0x00, 0x00, 0x00,
+    0x28, 0x00, 0xB6, 0xD7, 0x20, 0x00, 0xB4, 0xD7, 0x09, 0x00, 0xE0, 0x03,
+    0x40, 0x00, 0xBD, 0x27,
 };
 static constexpr uint8_t expected_cfi_kMips_adjust[] = {
     0x44, 0x0E, 0x40, 0x44, 0x9F, 0x01, 0x44, 0x91, 0x02, 0x44, 0x90, 0x03,
     0x54, 0x0E, 0x44, 0x60, 0x0E, 0x40, 0x04, 0x04, 0x00, 0x02, 0x00, 0x0A,
-    0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x4C, 0x0E, 0x00, 0x48, 0x0B, 0x0E,
-    0x40,
+    0x44, 0xDF, 0x44, 0xD1, 0x44, 0xD0, 0x50, 0x0E, 0x00, 0x0B, 0x0E, 0x40,
 };
 // 0x00000000: addiu r29, r29, -64
 // 0x00000004: .cfi_def_cfa_offset: 64
@@ -392,12 +390,11 @@
 // 0x00020054: .cfi_restore: r16
 // 0x00020054: ldc1 f22, +40(r29)
 // 0x00020058: ldc1 f20, +32(r29)
-// 0x0002005c: addiu r29, r29, 64
-// 0x00020060: .cfi_def_cfa_offset: 0
-// 0x00020060: jr r31
-// 0x00020064: nop
-// 0x00020068: .cfi_restore_state
-// 0x00020068: .cfi_def_cfa_offset: 64
+// 0x0002005c: jr r31
+// 0x00020060: addiu r29, r29, 64
+// 0x00020064: .cfi_def_cfa_offset: 0
+// 0x00020064: .cfi_restore_state
+// 0x00020064: .cfi_def_cfa_offset: 64
 
 static constexpr uint8_t expected_asm_kMips64_adjust_head[] = {
     0xD8, 0xFF, 0xBD, 0x67, 0x20, 0x00, 0xBF, 0xFF, 0x18, 0x00, 0xB1, 0xFF,
diff --git a/compiler/trampolines/trampoline_compiler.cc b/compiler/trampolines/trampoline_compiler.cc
index 55835e7..70f290d 100644
--- a/compiler/trampolines/trampoline_compiler.cc
+++ b/compiler/trampolines/trampoline_compiler.cc
@@ -152,7 +152,7 @@
       __ LoadFromOffset(kLoadWord, T9, S1, offset.Int32Value());
   }
   __ Jr(T9);
-  __ Nop();
+  __ NopIfNoReordering();
   __ Break();
 
   __ FinalizeCode();
diff --git a/compiler/utils/assembler.h b/compiler/utils/assembler.h
index 8981776..b616057 100644
--- a/compiler/utils/assembler.h
+++ b/compiler/utils/assembler.h
@@ -362,6 +362,16 @@
   // Size of generated code
   virtual size_t CodeSize() const { return buffer_.Size(); }
   virtual const uint8_t* CodeBufferBaseAddress() const { return buffer_.contents(); }
+  // CodePosition() is a non-const counterpart of CodeSize(). Code generators
+  // call it to record positions within the code buffer for the purpose of
+  // signal handling (stack overflow checks and implicit null checks may trigger
+  // signals and the signal handlers expect them right before the recorded
+  // positions). On most architectures CodePosition() should be equivalent to
+  // CodeSize(), but the MIPS assembler needs to be aware of this recording so
+  // that it doesn't put instructions that can trigger signals into branch delay
+  // slots. Handling signals from instructions in delay slots is problematic and
+  // should be avoided.
+  virtual size_t CodePosition() { return CodeSize(); }
 
   // Copy instructions out of assembly buffer into the given region of memory
   virtual void FinalizeInstructions(const MemoryRegion& region) {
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index bfc63d1..4b580b6 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -40,10 +40,195 @@
   return os;
 }
 
+MipsAssembler::DelaySlot::DelaySlot()  // Starts out empty: no instruction, all masks clear.
+    : instruction_(0),  // 0 means there is no instruction available for a delay slot.
+      gpr_outs_mask_(0),  // GPRs written by the instruction (bit n = register n).
+      gpr_ins_mask_(0),  // GPRs read by the instruction.
+      fpr_outs_mask_(0),  // FPRs written by the instruction.
+      fpr_ins_mask_(0),  // FPRs read by the instruction.
+      cc_outs_mask_(0),  // cc bits written (presumably FP condition codes - confirm).
+      cc_ins_mask_(0) {}  // cc bits read.
+
+void MipsAssembler::DsFsmInstr(uint32_t instruction,  // Encoding; 0 = unusable in delay slots.
+                               uint32_t gpr_outs_mask,  // GPRs written (bit n = register n).
+                               uint32_t gpr_ins_mask,  // GPRs read.
+                               uint32_t fpr_outs_mask,  // FPRs written.
+                               uint32_t fpr_ins_mask,  // FPRs read.
+                               uint32_t cc_outs_mask,  // cc bits written.
+                               uint32_t cc_ins_mask) {  // cc bits read.
+  if (!reordering_) {  // With reordering off nothing is tracked; only check the FSM is idle.
+    CHECK_EQ(ds_fsm_state_, kExpectingLabel);
+    CHECK_EQ(delay_slot_.instruction_, 0u);
+    return;
+  }
+  switch (ds_fsm_state_) {
+    case kExpectingLabel:  // No pending label: only the delay slot record is updated below.
+      break;
+    case kExpectingInstruction:  // Exactly one instruction has been emitted since the label.
+      CHECK_EQ(ds_fsm_target_pc_ + sizeof(uint32_t), buffer_.Size());
+      // If the last instruction is not suitable for delay slots, drop
+      // the PC of the label preceding it so that no unconditional branch
+      // uses this instruction to fill its delay slot.
+      if (instruction == 0) {
+        DsFsmDropLabel();  // Sets ds_fsm_state_ = kExpectingLabel.
+      } else {
+        // Otherwise wait for another instruction or label before we can
+        // commit the label PC. The label PC will be dropped if instead
+        // of another instruction or label there's a call from the code
+        // generator to CodePosition() to record the buffer size.
+        // Instructions after which the buffer size is recorded cannot
+        // be moved into delay slots or anywhere else because they may
+        // trigger signals and the signal handlers expect these signals
+        // to be coming from the instructions immediately preceding the
+        // recorded buffer locations.
+        ds_fsm_state_ = kExpectingCommit;
+      }
+      break;
+    case kExpectingCommit:  // Two instructions have been emitted since the label.
+      CHECK_EQ(ds_fsm_target_pc_ + 2 * sizeof(uint32_t), buffer_.Size());
+      DsFsmCommitLabel();  // Sets ds_fsm_state_ = kExpectingLabel.
+      break;
+  }
+  delay_slot_.instruction_ = instruction;  // Becomes the latest delay slot candidate.
+  delay_slot_.gpr_outs_mask_ = gpr_outs_mask & ~1u;  // Ignore register ZERO.
+  delay_slot_.gpr_ins_mask_ = gpr_ins_mask & ~1u;  // Ignore register ZERO.
+  delay_slot_.fpr_outs_mask_ = fpr_outs_mask;
+  delay_slot_.fpr_ins_mask_ = fpr_ins_mask;
+  delay_slot_.cc_outs_mask_ = cc_outs_mask;
+  delay_slot_.cc_ins_mask_ = cc_ins_mask;
+}
+
+void MipsAssembler::DsFsmLabel() {  // Feeds a label at the current buffer end into the FSM.
+  if (!reordering_) {  // With reordering off nothing is tracked; only check the FSM is idle.
+    CHECK_EQ(ds_fsm_state_, kExpectingLabel);
+    CHECK_EQ(delay_slot_.instruction_, 0u);
+    return;
+  }
+  switch (ds_fsm_state_) {
+    case kExpectingLabel:  // Idle: start tracking this label.
+      ds_fsm_target_pc_ = buffer_.Size();
+      ds_fsm_state_ = kExpectingInstruction;
+      break;
+    case kExpectingInstruction:
+      // Allow consecutive labels.
+      CHECK_EQ(ds_fsm_target_pc_, buffer_.Size());
+      break;
+    case kExpectingCommit:  // One instruction since the previous label: commit it, then restart.
+      CHECK_EQ(ds_fsm_target_pc_ + sizeof(uint32_t), buffer_.Size());
+      DsFsmCommitLabel();
+      ds_fsm_target_pc_ = buffer_.Size();
+      ds_fsm_state_ = kExpectingInstruction;
+      break;
+  }
+  // We cannot move instructions into delay slots across labels.
+  delay_slot_.instruction_ = 0;
+}
+
+void MipsAssembler::DsFsmCommitLabel() {  // Records the pending label PC, if it is eligible.
+  if (ds_fsm_state_ == kExpectingCommit) {  // Only a label followed by one usable instruction.
+    ds_fsm_target_pcs_.emplace_back(ds_fsm_target_pc_);
+  }
+  ds_fsm_state_ = kExpectingLabel;  // In every case the FSM returns to the idle state.
+}
+
+void MipsAssembler::DsFsmDropLabel() {  // Forgets the pending label without recording its PC.
+  ds_fsm_state_ = kExpectingLabel;
+}
+
+bool MipsAssembler::SetReorder(bool enable) {  // Returns the previous reordering setting.
+  bool last_state = reordering_;
+  if (last_state != enable) {  // Only an actual change resets the tracking state.
+    DsFsmCommitLabel();  // Commits a label awaiting commit (if any) and idles the FSM.
+    DsFsmInstrNop(0);  // While reordering is still on, clears the delay slot candidate.
+  }
+  reordering_ = enable;
+  return last_state;
+}
+
+size_t MipsAssembler::CodePosition() {  // Returns buffer_.Size(); also resets delay slot state.
+  // The last instruction cannot be used in a delay slot, do not commit
+  // the label before it (if any) and clear the delay slot.
+  DsFsmDropLabel();
+  DsFsmInstrNop(0);
+  size_t size = buffer_.Size();
+  // In theory we can get the following sequence:
+  //   label1:
+  //     instr
+  //   label2: # label1 gets committed when label2 is seen
+  //     CodePosition() call
+  // and we need to uncommit label1.
+  if (ds_fsm_target_pcs_.size() != 0 && ds_fsm_target_pcs_.back() + sizeof(uint32_t) == size) {
+    ds_fsm_target_pcs_.pop_back();  // The last committed label is one instruction back.
+  }
+  return size;
+}
+
+void MipsAssembler::DsFsmInstrNop(uint32_t instruction ATTRIBUTE_UNUSED) {
+  DsFsmInstr(0, 0, 0, 0, 0, 0, 0);  // Encoding 0: never a delay slot candidate, no registers.
+}
+
+void MipsAssembler::DsFsmInstrRrr(uint32_t instruction, Register out, Register in1, Register in2) {
+  DsFsmInstr(instruction, (1u << out), (1u << in1) | (1u << in2), 0, 0, 0, 0);  // 1 GPR out, 2 in.
+}
+
+void MipsAssembler::DsFsmInstrRrrr(uint32_t instruction,
+                                   Register in1_out,  // GPR that is both read and written.
+                                   Register in2,  // GPR that is only read.
+                                   Register in3) {  // GPR that is only read.
+  DsFsmInstr(instruction, (1u << in1_out), (1u << in1_out) | (1u << in2) | (1u << in3), 0, 0, 0, 0);
+}
+
+void MipsAssembler::DsFsmInstrFff(uint32_t instruction,
+                                  FRegister out,  // FPR that is written.
+                                  FRegister in1,  // FPR that is read.
+                                  FRegister in2) {  // FPR that is read.
+  DsFsmInstr(instruction, 0, 0, (1u << out), (1u << in1) | (1u << in2), 0, 0);
+}
+
+void MipsAssembler::DsFsmInstrFfff(uint32_t instruction,
+                                   FRegister in1_out,  // FPR that is both read and written.
+                                   FRegister in2,  // FPR that is only read.
+                                   FRegister in3) {  // FPR that is only read.
+  DsFsmInstr(instruction, 0, 0, (1u << in1_out), (1u << in1_out) | (1u << in2) | (1u << in3), 0, 0);
+}
+
+void MipsAssembler::DsFsmInstrRf(uint32_t instruction, Register out, FRegister in) {
+  DsFsmInstr(instruction, (1u << out), 0, 0, (1u << in), 0, 0);  // Writes a GPR, reads an FPR.
+}
+
+void MipsAssembler::DsFsmInstrFr(uint32_t instruction, FRegister out, Register in) {
+  DsFsmInstr(instruction, 0, (1u << in), (1u << out), 0, 0, 0);  // Writes an FPR, reads a GPR.
+}
+
+void MipsAssembler::DsFsmInstrFR(uint32_t instruction, FRegister in1, Register in2) {
+  DsFsmInstr(instruction, 0, (1u << in2), 0, (1u << in1), 0, 0);  // Reads an FPR and a GPR only.
+}
+
+void MipsAssembler::DsFsmInstrCff(uint32_t instruction, int cc_out, FRegister in1, FRegister in2) {
+  DsFsmInstr(instruction, 0, 0, 0, (1u << in1) | (1u << in2), (1u << cc_out), 0);  // Writes cc.
+}
+
+void MipsAssembler::DsFsmInstrRrrc(uint32_t instruction,
+                                   Register in1_out,  // GPR that is both read and written.
+                                   Register in2,  // GPR that is only read.
+                                   int cc_in) {  // Index of the cc bit that is read.
+  DsFsmInstr(instruction, (1u << in1_out), (1u << in1_out) | (1u << in2), 0, 0, 0, (1u << cc_in));
+}
+
+void MipsAssembler::DsFsmInstrFffc(uint32_t instruction,
+                                   FRegister in1_out,  // FPR that is both read and written.
+                                   FRegister in2,  // FPR that is only read.
+                                   int cc_in) {  // Index of the cc bit that is read.
+  DsFsmInstr(instruction, 0, 0, (1u << in1_out), (1u << in1_out) | (1u << in2), 0, (1u << cc_in));
+}
+
 void MipsAssembler::FinalizeCode() {
   for (auto& exception_block : exception_blocks_) {
     EmitExceptionPoll(&exception_block);
   }
+  // Commit the last branch target label (if any) and disable instruction reordering.
+  DsFsmCommitLabel();
+  SetReorder(false);
   EmitLiterals();
   PromoteBranches();
 }
@@ -107,6 +292,12 @@
 
 void MipsAssembler::EmitBranches() {
   CHECK(!overwriting_);
+  CHECK(!reordering_);
+  // Now that everything has its final position in the buffer (the branches have
+  // been promoted), adjust the target label PCs.
+  for (size_t cnt = ds_fsm_target_pcs_.size(), i = 0; i < cnt; i++) {
+    ds_fsm_target_pcs_[i] = GetAdjustedPosition(ds_fsm_target_pcs_[i]);
+  }
   // Switch from appending instructions at the end of the buffer to overwriting
   // existing instructions (branch placeholders) in the buffer.
   overwriting_ = true;
@@ -128,7 +319,12 @@
   }
 }
 
-void MipsAssembler::EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct) {
+uint32_t MipsAssembler::EmitR(int opcode,
+                              Register rs,
+                              Register rt,
+                              Register rd,
+                              int shamt,
+                              int funct) {
   CHECK_NE(rs, kNoRegister);
   CHECK_NE(rt, kNoRegister);
   CHECK_NE(rd, kNoRegister);
@@ -139,9 +335,10 @@
                       shamt << kShamtShift |
                       funct;
   Emit(encoding);
+  return encoding;
 }
 
-void MipsAssembler::EmitI(int opcode, Register rs, Register rt, uint16_t imm) {
+uint32_t MipsAssembler::EmitI(int opcode, Register rs, Register rt, uint16_t imm) {
   CHECK_NE(rs, kNoRegister);
   CHECK_NE(rt, kNoRegister);
   uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift |
@@ -149,25 +346,32 @@
                       static_cast<uint32_t>(rt) << kRtShift |
                       imm;
   Emit(encoding);
+  return encoding;
 }
 
-void MipsAssembler::EmitI21(int opcode, Register rs, uint32_t imm21) {
+uint32_t MipsAssembler::EmitI21(int opcode, Register rs, uint32_t imm21) {
   CHECK_NE(rs, kNoRegister);
   CHECK(IsUint<21>(imm21)) << imm21;
   uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift |
                       static_cast<uint32_t>(rs) << kRsShift |
                       imm21;
   Emit(encoding);
+  return encoding;
 }
 
-void MipsAssembler::EmitI26(int opcode, uint32_t imm26) {
+uint32_t MipsAssembler::EmitI26(int opcode, uint32_t imm26) {
   CHECK(IsUint<26>(imm26)) << imm26;
   uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift | imm26;
   Emit(encoding);
+  return encoding;
 }
 
-void MipsAssembler::EmitFR(int opcode, int fmt, FRegister ft, FRegister fs, FRegister fd,
-                           int funct) {
+uint32_t MipsAssembler::EmitFR(int opcode,
+                               int fmt,
+                               FRegister ft,
+                               FRegister fs,
+                               FRegister fd,
+                               int funct) {
   CHECK_NE(ft, kNoFRegister);
   CHECK_NE(fs, kNoFRegister);
   CHECK_NE(fd, kNoFRegister);
@@ -178,52 +382,54 @@
                       static_cast<uint32_t>(fd) << kFdShift |
                       funct;
   Emit(encoding);
+  return encoding;
 }
 
-void MipsAssembler::EmitFI(int opcode, int fmt, FRegister ft, uint16_t imm) {
+uint32_t MipsAssembler::EmitFI(int opcode, int fmt, FRegister ft, uint16_t imm) {
   CHECK_NE(ft, kNoFRegister);
   uint32_t encoding = static_cast<uint32_t>(opcode) << kOpcodeShift |
                       fmt << kFmtShift |
                       static_cast<uint32_t>(ft) << kFtShift |
                       imm;
   Emit(encoding);
+  return encoding;
 }
 
 void MipsAssembler::Addu(Register rd, Register rs, Register rt) {
-  EmitR(0, rs, rt, rd, 0, 0x21);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x21), rd, rs, rt);
 }
 
 void MipsAssembler::Addiu(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x9, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x9, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Subu(Register rd, Register rs, Register rt) {
-  EmitR(0, rs, rt, rd, 0, 0x23);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x23), rd, rs, rt);
 }
 
 void MipsAssembler::MultR2(Register rs, Register rt) {
   CHECK(!IsR6());
-  EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x18);
+  DsFsmInstrRrr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x18), ZERO, rs, rt);
 }
 
 void MipsAssembler::MultuR2(Register rs, Register rt) {
   CHECK(!IsR6());
-  EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x19);
+  DsFsmInstrRrr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x19), ZERO, rs, rt);
 }
 
 void MipsAssembler::DivR2(Register rs, Register rt) {
   CHECK(!IsR6());
-  EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x1a);
+  DsFsmInstrRrr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x1a), ZERO, rs, rt);
 }
 
 void MipsAssembler::DivuR2(Register rs, Register rt) {
   CHECK(!IsR6());
-  EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x1b);
+  DsFsmInstrRrr(EmitR(0, rs, rt, static_cast<Register>(0), 0, 0x1b), ZERO, rs, rt);
 }
 
 void MipsAssembler::MulR2(Register rd, Register rs, Register rt) {
   CHECK(!IsR6());
-  EmitR(0x1c, rs, rt, rd, 0, 2);
+  DsFsmInstrRrr(EmitR(0x1c, rs, rt, rd, 0, 2), rd, rs, rt);
 }
 
 void MipsAssembler::DivR2(Register rd, Register rs, Register rt) {
@@ -252,308 +458,307 @@
 
 void MipsAssembler::MulR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 2, 0x18);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 2, 0x18), rd, rs, rt);
 }
 
 void MipsAssembler::MuhR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 3, 0x18);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 3, 0x18), rd, rs, rt);
 }
 
 void MipsAssembler::MuhuR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 3, 0x19);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 3, 0x19), rd, rs, rt);
 }
 
 void MipsAssembler::DivR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 2, 0x1a);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 2, 0x1a), rd, rs, rt);
 }
 
 void MipsAssembler::ModR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 3, 0x1a);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 3, 0x1a), rd, rs, rt);
 }
 
 void MipsAssembler::DivuR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 2, 0x1b);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 2, 0x1b), rd, rs, rt);
 }
 
 void MipsAssembler::ModuR6(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 3, 0x1b);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 3, 0x1b), rd, rs, rt);
 }
 
 void MipsAssembler::And(Register rd, Register rs, Register rt) {
-  EmitR(0, rs, rt, rd, 0, 0x24);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x24), rd, rs, rt);
 }
 
 void MipsAssembler::Andi(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0xc, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0xc, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Or(Register rd, Register rs, Register rt) {
-  EmitR(0, rs, rt, rd, 0, 0x25);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x25), rd, rs, rt);
 }
 
 void MipsAssembler::Ori(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0xd, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0xd, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Xor(Register rd, Register rs, Register rt) {
-  EmitR(0, rs, rt, rd, 0, 0x26);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x26), rd, rs, rt);
 }
 
 void MipsAssembler::Xori(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0xe, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0xe, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Nor(Register rd, Register rs, Register rt) {
-  EmitR(0, rs, rt, rd, 0, 0x27);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x27), rd, rs, rt);
 }
 
 void MipsAssembler::Movz(Register rd, Register rs, Register rt) {
   CHECK(!IsR6());
-  EmitR(0, rs, rt, rd, 0, 0x0A);
+  DsFsmInstrRrrr(EmitR(0, rs, rt, rd, 0, 0x0A), rd, rs, rt);
 }
 
 void MipsAssembler::Movn(Register rd, Register rs, Register rt) {
   CHECK(!IsR6());
-  EmitR(0, rs, rt, rd, 0, 0x0B);
+  DsFsmInstrRrrr(EmitR(0, rs, rt, rd, 0, 0x0B), rd, rs, rt);
 }
 
 void MipsAssembler::Seleqz(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 0, 0x35);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x35), rd, rs, rt);
 }
 
 void MipsAssembler::Selnez(Register rd, Register rs, Register rt) {
   CHECK(IsR6());
-  EmitR(0, rs, rt, rd, 0, 0x37);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x37), rd, rs, rt);
 }
 
 void MipsAssembler::ClzR6(Register rd, Register rs) {
   CHECK(IsR6());
-  EmitR(0, rs, static_cast<Register>(0), rd, 0x01, 0x10);
+  DsFsmInstrRrr(EmitR(0, rs, static_cast<Register>(0), rd, 0x01, 0x10), rd, rs, rs);
 }
 
 void MipsAssembler::ClzR2(Register rd, Register rs) {
   CHECK(!IsR6());
-  EmitR(0x1C, rs, rd, rd, 0, 0x20);
+  DsFsmInstrRrr(EmitR(0x1C, rs, rd, rd, 0, 0x20), rd, rs, rs);
 }
 
 void MipsAssembler::CloR6(Register rd, Register rs) {
   CHECK(IsR6());
-  EmitR(0, rs, static_cast<Register>(0), rd, 0x01, 0x11);
+  DsFsmInstrRrr(EmitR(0, rs, static_cast<Register>(0), rd, 0x01, 0x11), rd, rs, rs);
 }
 
 void MipsAssembler::CloR2(Register rd, Register rs) {
   CHECK(!IsR6());
-  EmitR(0x1C, rs, rd, rd, 0, 0x21);
+  DsFsmInstrRrr(EmitR(0x1C, rs, rd, rd, 0, 0x21), rd, rs, rs);
 }
 
 void MipsAssembler::Seb(Register rd, Register rt) {
-  EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x10, 0x20);
+  DsFsmInstrRrr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x10, 0x20), rd, rt, rt);
 }
 
 void MipsAssembler::Seh(Register rd, Register rt) {
-  EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x18, 0x20);
+  DsFsmInstrRrr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x18, 0x20), rd, rt, rt);
 }
 
 void MipsAssembler::Wsbh(Register rd, Register rt) {
-  EmitR(0x1f, static_cast<Register>(0), rt, rd, 2, 0x20);
+  DsFsmInstrRrr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 2, 0x20), rd, rt, rt);
 }
 
 void MipsAssembler::Bitswap(Register rd, Register rt) {
   CHECK(IsR6());
-  EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x0, 0x20);
+  DsFsmInstrRrr(EmitR(0x1f, static_cast<Register>(0), rt, rd, 0x0, 0x20), rd, rt, rt);
 }
 
 void MipsAssembler::Sll(Register rd, Register rt, int shamt) {
   CHECK(IsUint<5>(shamt)) << shamt;
-  EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x00);
+  DsFsmInstrRrr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x00), rd, rt, rt);
 }
 
 void MipsAssembler::Srl(Register rd, Register rt, int shamt) {
   CHECK(IsUint<5>(shamt)) << shamt;
-  EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x02);
+  DsFsmInstrRrr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x02), rd, rt, rt);
 }
 
 void MipsAssembler::Rotr(Register rd, Register rt, int shamt) {
   CHECK(IsUint<5>(shamt)) << shamt;
-  EmitR(0, static_cast<Register>(1), rt, rd, shamt, 0x02);
+  DsFsmInstrRrr(EmitR(0, static_cast<Register>(1), rt, rd, shamt, 0x02), rd, rt, rt);
 }
 
 void MipsAssembler::Sra(Register rd, Register rt, int shamt) {
   CHECK(IsUint<5>(shamt)) << shamt;
-  EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x03);
+  DsFsmInstrRrr(EmitR(0, static_cast<Register>(0), rt, rd, shamt, 0x03), rd, rt, rt);
 }
 
 void MipsAssembler::Sllv(Register rd, Register rt, Register rs) {
-  EmitR(0, rs, rt, rd, 0, 0x04);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x04), rd, rs, rt);
 }
 
 void MipsAssembler::Srlv(Register rd, Register rt, Register rs) {
-  EmitR(0, rs, rt, rd, 0, 0x06);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x06), rd, rs, rt);
 }
 
 void MipsAssembler::Rotrv(Register rd, Register rt, Register rs) {
-  EmitR(0, rs, rt, rd, 1, 0x06);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 1, 0x06), rd, rs, rt);
 }
 
 void MipsAssembler::Srav(Register rd, Register rt, Register rs) {
-  EmitR(0, rs, rt, rd, 0, 0x07);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x07), rd, rs, rt);
 }
 
 void MipsAssembler::Ext(Register rd, Register rt, int pos, int size) {
   CHECK(IsUint<5>(pos)) << pos;
   CHECK(0 < size && size <= 32) << size;
   CHECK(0 < pos + size && pos + size <= 32) << pos << " + " << size;
-  EmitR(0x1f, rt, rd, static_cast<Register>(size - 1), pos, 0x00);
+  DsFsmInstrRrr(EmitR(0x1f, rt, rd, static_cast<Register>(size - 1), pos, 0x00), rd, rt, rt);
 }
 
 void MipsAssembler::Ins(Register rd, Register rt, int pos, int size) {
   CHECK(IsUint<5>(pos)) << pos;
   CHECK(0 < size && size <= 32) << size;
   CHECK(0 < pos + size && pos + size <= 32) << pos << " + " << size;
-  EmitR(0x1f, rt, rd, static_cast<Register>(pos + size - 1), pos, 0x04);
+  DsFsmInstrRrr(EmitR(0x1f, rt, rd, static_cast<Register>(pos + size - 1), pos, 0x04), rd, rd, rt);
 }
 
 void MipsAssembler::Lb(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x20, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x20, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Lh(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x21, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x21, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Lw(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x23, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x23, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Lwl(Register rt, Register rs, uint16_t imm16) {
   CHECK(!IsR6());
-  EmitI(0x22, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x22, rs, rt, imm16), rt, rt, rs);
 }
 
 void MipsAssembler::Lwr(Register rt, Register rs, uint16_t imm16) {
   CHECK(!IsR6());
-  EmitI(0x26, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x26, rs, rt, imm16), rt, rt, rs);
 }
 
 void MipsAssembler::Lbu(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x24, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x24, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Lhu(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x25, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x25, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Lwpc(Register rs, uint32_t imm19) {
   CHECK(IsR6());
   CHECK(IsUint<19>(imm19)) << imm19;
-  EmitI21(0x3B, rs, (0x01 << 19) | imm19);
+  DsFsmInstrNop(EmitI21(0x3B, rs, (0x01 << 19) | imm19));
 }
 
 void MipsAssembler::Lui(Register rt, uint16_t imm16) {
-  EmitI(0xf, static_cast<Register>(0), rt, imm16);
+  DsFsmInstrRrr(EmitI(0xf, static_cast<Register>(0), rt, imm16), rt, ZERO, ZERO);
 }
 
 void MipsAssembler::Aui(Register rt, Register rs, uint16_t imm16) {
   CHECK(IsR6());
-  EmitI(0xf, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0xf, rs, rt, imm16), rt, rt, rs);
 }
 
 void MipsAssembler::Sync(uint32_t stype) {
-  EmitR(0, static_cast<Register>(0), static_cast<Register>(0), static_cast<Register>(0),
-        stype & 0x1f, 0xf);
+  DsFsmInstrNop(EmitR(0, ZERO, ZERO, ZERO, stype & 0x1f, 0xf));
 }
 
 void MipsAssembler::Mfhi(Register rd) {
   CHECK(!IsR6());
-  EmitR(0, static_cast<Register>(0), static_cast<Register>(0), rd, 0, 0x10);
+  DsFsmInstrRrr(EmitR(0, ZERO, ZERO, rd, 0, 0x10), rd, ZERO, ZERO);
 }
 
 void MipsAssembler::Mflo(Register rd) {
   CHECK(!IsR6());
-  EmitR(0, static_cast<Register>(0), static_cast<Register>(0), rd, 0, 0x12);
+  DsFsmInstrRrr(EmitR(0, ZERO, ZERO, rd, 0, 0x12), rd, ZERO, ZERO);
 }
 
 void MipsAssembler::Sb(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x28, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x28, rs, rt, imm16), ZERO, rt, rs);
 }
 
 void MipsAssembler::Sh(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x29, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x29, rs, rt, imm16), ZERO, rt, rs);
 }
 
 void MipsAssembler::Sw(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0x2b, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x2b, rs, rt, imm16), ZERO, rt, rs);
 }
 
 void MipsAssembler::Swl(Register rt, Register rs, uint16_t imm16) {
   CHECK(!IsR6());
-  EmitI(0x2a, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x2a, rs, rt, imm16), ZERO, rt, rs);
 }
 
 void MipsAssembler::Swr(Register rt, Register rs, uint16_t imm16) {
   CHECK(!IsR6());
-  EmitI(0x2e, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x2e, rs, rt, imm16), ZERO, rt, rs);
 }
 
 void MipsAssembler::LlR2(Register rt, Register base, int16_t imm16) {
   CHECK(!IsR6());
-  EmitI(0x30, base, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x30, base, rt, imm16), rt, base, base);
 }
 
 void MipsAssembler::ScR2(Register rt, Register base, int16_t imm16) {
   CHECK(!IsR6());
-  EmitI(0x38, base, rt, imm16);
+  DsFsmInstrRrr(EmitI(0x38, base, rt, imm16), rt, rt, base);
 }
 
 void MipsAssembler::LlR6(Register rt, Register base, int16_t imm9) {
   CHECK(IsR6());
   CHECK(IsInt<9>(imm9));
-  EmitI(0x1f, base, rt, ((imm9 & 0x1ff) << 7) | 0x36);
+  DsFsmInstrRrr(EmitI(0x1f, base, rt, ((imm9 & 0x1ff) << 7) | 0x36), rt, base, base);
 }
 
 void MipsAssembler::ScR6(Register rt, Register base, int16_t imm9) {
   CHECK(IsR6());
   CHECK(IsInt<9>(imm9));
-  EmitI(0x1f, base, rt, ((imm9 & 0x1ff) << 7) | 0x26);
+  DsFsmInstrRrr(EmitI(0x1f, base, rt, ((imm9 & 0x1ff) << 7) | 0x26), rt, rt, base);
 }
 
 void MipsAssembler::Slt(Register rd, Register rs, Register rt) {
-  EmitR(0, rs, rt, rd, 0, 0x2a);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x2a), rd, rs, rt);
 }
 
 void MipsAssembler::Sltu(Register rd, Register rs, Register rt) {
-  EmitR(0, rs, rt, rd, 0, 0x2b);
+  DsFsmInstrRrr(EmitR(0, rs, rt, rd, 0, 0x2b), rd, rs, rt);
 }
 
 void MipsAssembler::Slti(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0xa, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0xa, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::Sltiu(Register rt, Register rs, uint16_t imm16) {
-  EmitI(0xb, rs, rt, imm16);
+  DsFsmInstrRrr(EmitI(0xb, rs, rt, imm16), rt, rs, rs);
 }
 
 void MipsAssembler::B(uint16_t imm16) {
-  EmitI(0x4, static_cast<Register>(0), static_cast<Register>(0), imm16);
+  DsFsmInstrNop(EmitI(0x4, static_cast<Register>(0), static_cast<Register>(0), imm16));
 }
 
 void MipsAssembler::Bal(uint16_t imm16) {
-  EmitI(0x1, static_cast<Register>(0), static_cast<Register>(0x11), imm16);
+  DsFsmInstrNop(EmitI(0x1, static_cast<Register>(0), static_cast<Register>(0x11), imm16));
 }
 
 void MipsAssembler::Beq(Register rs, Register rt, uint16_t imm16) {
-  EmitI(0x4, rs, rt, imm16);
+  DsFsmInstrNop(EmitI(0x4, rs, rt, imm16));
 }
 
 void MipsAssembler::Bne(Register rs, Register rt, uint16_t imm16) {
-  EmitI(0x5, rs, rt, imm16);
+  DsFsmInstrNop(EmitI(0x5, rs, rt, imm16));
 }
 
 void MipsAssembler::Beqz(Register rt, uint16_t imm16) {
@@ -565,19 +770,19 @@
 }
 
 void MipsAssembler::Bltz(Register rt, uint16_t imm16) {
-  EmitI(0x1, rt, static_cast<Register>(0), imm16);
+  DsFsmInstrNop(EmitI(0x1, rt, static_cast<Register>(0), imm16));
 }
 
 void MipsAssembler::Bgez(Register rt, uint16_t imm16) {
-  EmitI(0x1, rt, static_cast<Register>(0x1), imm16);
+  DsFsmInstrNop(EmitI(0x1, rt, static_cast<Register>(0x1), imm16));
 }
 
 void MipsAssembler::Blez(Register rt, uint16_t imm16) {
-  EmitI(0x6, rt, static_cast<Register>(0), imm16);
+  DsFsmInstrNop(EmitI(0x6, rt, static_cast<Register>(0), imm16));
 }
 
 void MipsAssembler::Bgtz(Register rt, uint16_t imm16) {
-  EmitI(0x7, rt, static_cast<Register>(0), imm16);
+  DsFsmInstrNop(EmitI(0x7, rt, static_cast<Register>(0), imm16));
 }
 
 void MipsAssembler::Bc1f(uint16_t imm16) {
@@ -587,7 +792,7 @@
 void MipsAssembler::Bc1f(int cc, uint16_t imm16) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitI(0x11, static_cast<Register>(0x8), static_cast<Register>(cc << 2), imm16);
+  DsFsmInstrNop(EmitI(0x11, static_cast<Register>(0x8), static_cast<Register>(cc << 2), imm16));
 }
 
 void MipsAssembler::Bc1t(uint16_t imm16) {
@@ -597,19 +802,45 @@
 void MipsAssembler::Bc1t(int cc, uint16_t imm16) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitI(0x11, static_cast<Register>(0x8), static_cast<Register>((cc << 2) | 1), imm16);
+  DsFsmInstrNop(EmitI(0x11,
+                      static_cast<Register>(0x8),
+                      static_cast<Register>((cc << 2) | 1),
+                      imm16));
 }
 
 void MipsAssembler::J(uint32_t addr26) {
-  EmitI26(0x2, addr26);
+  DsFsmInstrNop(EmitI26(0x2, addr26));
 }
 
 void MipsAssembler::Jal(uint32_t addr26) {
-  EmitI26(0x3, addr26);
+  DsFsmInstrNop(EmitI26(0x3, addr26));
 }
 
 void MipsAssembler::Jalr(Register rd, Register rs) {
-  EmitR(0, rs, static_cast<Register>(0), rd, 0, 0x09);
+  uint32_t last_instruction = delay_slot_.instruction_;
+  bool exchange = (last_instruction != 0 &&
+      (delay_slot_.gpr_outs_mask_ & (1u << rs)) == 0 &&
+      ((delay_slot_.gpr_ins_mask_ | delay_slot_.gpr_outs_mask_) & (1u << rd)) == 0);
+  if (exchange) {
+    // The last instruction is moved into this jump's delay slot below, so it cannot be used
+    // in a different delay slot; do not commit the label before it (if any).
+    DsFsmDropLabel();
+  }
+  DsFsmInstrNop(EmitR(0, rs, static_cast<Register>(0), rd, 0, 0x09));
+  if (exchange) {
+    // Exchange the last two instructions in the assembler buffer.
+    size_t size = buffer_.Size();
+    CHECK_GE(size, 2 * sizeof(uint32_t));
+    size_t pos1 = size - 2 * sizeof(uint32_t);
+    size_t pos2 = size - sizeof(uint32_t);
+    uint32_t instr1 = buffer_.Load<uint32_t>(pos1);
+    uint32_t instr2 = buffer_.Load<uint32_t>(pos2);
+    CHECK_EQ(instr1, last_instruction);
+    buffer_.Store<uint32_t>(pos1, instr2);
+    buffer_.Store<uint32_t>(pos2, instr1);
+  } else if (reordering_) {
+    Nop();
+  }
 }
 
 void MipsAssembler::Jalr(Register rs) {
@@ -621,38 +852,38 @@
 }
 
 void MipsAssembler::Nal() {
-  EmitI(0x1, static_cast<Register>(0), static_cast<Register>(0x10), 0);
+  DsFsmInstrNop(EmitI(0x1, static_cast<Register>(0), static_cast<Register>(0x10), 0));
 }
 
 void MipsAssembler::Auipc(Register rs, uint16_t imm16) {
   CHECK(IsR6());
-  EmitI(0x3B, rs, static_cast<Register>(0x1E), imm16);
+  DsFsmInstrNop(EmitI(0x3B, rs, static_cast<Register>(0x1E), imm16));
 }
 
 void MipsAssembler::Addiupc(Register rs, uint32_t imm19) {
   CHECK(IsR6());
   CHECK(IsUint<19>(imm19)) << imm19;
-  EmitI21(0x3B, rs, imm19);
+  DsFsmInstrNop(EmitI21(0x3B, rs, imm19));
 }
 
 void MipsAssembler::Bc(uint32_t imm26) {
   CHECK(IsR6());
-  EmitI26(0x32, imm26);
+  DsFsmInstrNop(EmitI26(0x32, imm26));
 }
 
 void MipsAssembler::Balc(uint32_t imm26) {
   CHECK(IsR6());
-  EmitI26(0x3A, imm26);
+  DsFsmInstrNop(EmitI26(0x3A, imm26));
 }
 
 void MipsAssembler::Jic(Register rt, uint16_t imm16) {
   CHECK(IsR6());
-  EmitI(0x36, static_cast<Register>(0), rt, imm16);
+  DsFsmInstrNop(EmitI(0x36, static_cast<Register>(0), rt, imm16));
 }
 
 void MipsAssembler::Jialc(Register rt, uint16_t imm16) {
   CHECK(IsR6());
-  EmitI(0x3E, static_cast<Register>(0), rt, imm16);
+  DsFsmInstrNop(EmitI(0x3E, static_cast<Register>(0), rt, imm16));
 }
 
 void MipsAssembler::Bltc(Register rs, Register rt, uint16_t imm16) {
@@ -660,19 +891,19 @@
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x17, rs, rt, imm16);
+  DsFsmInstrNop(EmitI(0x17, rs, rt, imm16));
 }
 
 void MipsAssembler::Bltzc(Register rt, uint16_t imm16) {
   CHECK(IsR6());
   CHECK_NE(rt, ZERO);
-  EmitI(0x17, rt, rt, imm16);
+  DsFsmInstrNop(EmitI(0x17, rt, rt, imm16));
 }
 
 void MipsAssembler::Bgtzc(Register rt, uint16_t imm16) {
   CHECK(IsR6());
   CHECK_NE(rt, ZERO);
-  EmitI(0x17, static_cast<Register>(0), rt, imm16);
+  DsFsmInstrNop(EmitI(0x17, static_cast<Register>(0), rt, imm16));
 }
 
 void MipsAssembler::Bgec(Register rs, Register rt, uint16_t imm16) {
@@ -680,19 +911,19 @@
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x16, rs, rt, imm16);
+  DsFsmInstrNop(EmitI(0x16, rs, rt, imm16));
 }
 
 void MipsAssembler::Bgezc(Register rt, uint16_t imm16) {
   CHECK(IsR6());
   CHECK_NE(rt, ZERO);
-  EmitI(0x16, rt, rt, imm16);
+  DsFsmInstrNop(EmitI(0x16, rt, rt, imm16));
 }
 
 void MipsAssembler::Blezc(Register rt, uint16_t imm16) {
   CHECK(IsR6());
   CHECK_NE(rt, ZERO);
-  EmitI(0x16, static_cast<Register>(0), rt, imm16);
+  DsFsmInstrNop(EmitI(0x16, static_cast<Register>(0), rt, imm16));
 }
 
 void MipsAssembler::Bltuc(Register rs, Register rt, uint16_t imm16) {
@@ -700,7 +931,7 @@
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x7, rs, rt, imm16);
+  DsFsmInstrNop(EmitI(0x7, rs, rt, imm16));
 }
 
 void MipsAssembler::Bgeuc(Register rs, Register rt, uint16_t imm16) {
@@ -708,7 +939,7 @@
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x6, rs, rt, imm16);
+  DsFsmInstrNop(EmitI(0x6, rs, rt, imm16));
 }
 
 void MipsAssembler::Beqc(Register rs, Register rt, uint16_t imm16) {
@@ -716,7 +947,7 @@
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x8, std::min(rs, rt), std::max(rs, rt), imm16);
+  DsFsmInstrNop(EmitI(0x8, std::min(rs, rt), std::max(rs, rt), imm16));
 }
 
 void MipsAssembler::Bnec(Register rs, Register rt, uint16_t imm16) {
@@ -724,29 +955,29 @@
   CHECK_NE(rs, ZERO);
   CHECK_NE(rt, ZERO);
   CHECK_NE(rs, rt);
-  EmitI(0x18, std::min(rs, rt), std::max(rs, rt), imm16);
+  DsFsmInstrNop(EmitI(0x18, std::min(rs, rt), std::max(rs, rt), imm16));
 }
 
 void MipsAssembler::Beqzc(Register rs, uint32_t imm21) {
   CHECK(IsR6());
   CHECK_NE(rs, ZERO);
-  EmitI21(0x36, rs, imm21);
+  DsFsmInstrNop(EmitI21(0x36, rs, imm21));
 }
 
 void MipsAssembler::Bnezc(Register rs, uint32_t imm21) {
   CHECK(IsR6());
   CHECK_NE(rs, ZERO);
-  EmitI21(0x3E, rs, imm21);
+  DsFsmInstrNop(EmitI21(0x3E, rs, imm21));
 }
 
 void MipsAssembler::Bc1eqz(FRegister ft, uint16_t imm16) {
   CHECK(IsR6());
-  EmitFI(0x11, 0x9, ft, imm16);
+  DsFsmInstrNop(EmitFI(0x11, 0x9, ft, imm16));
 }
 
 void MipsAssembler::Bc1nez(FRegister ft, uint16_t imm16) {
   CHECK(IsR6());
-  EmitFI(0x11, 0xD, ft, imm16);
+  DsFsmInstrNop(EmitFI(0x11, 0xD, ft, imm16));
 }
 
 void MipsAssembler::EmitBcondR2(BranchCondition cond, Register rs, Register rt, uint16_t imm16) {
@@ -868,67 +1099,67 @@
 }
 
 void MipsAssembler::AddS(FRegister fd, FRegister fs, FRegister ft) {
-  EmitFR(0x11, 0x10, ft, fs, fd, 0x0);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x0), fd, fs, ft);
 }
 
 void MipsAssembler::SubS(FRegister fd, FRegister fs, FRegister ft) {
-  EmitFR(0x11, 0x10, ft, fs, fd, 0x1);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x1), fd, fs, ft);
 }
 
 void MipsAssembler::MulS(FRegister fd, FRegister fs, FRegister ft) {
-  EmitFR(0x11, 0x10, ft, fs, fd, 0x2);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x2), fd, fs, ft);
 }
 
 void MipsAssembler::DivS(FRegister fd, FRegister fs, FRegister ft) {
-  EmitFR(0x11, 0x10, ft, fs, fd, 0x3);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x3), fd, fs, ft);
 }
 
 void MipsAssembler::AddD(FRegister fd, FRegister fs, FRegister ft) {
-  EmitFR(0x11, 0x11, ft, fs, fd, 0x0);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x0), fd, fs, ft);
 }
 
 void MipsAssembler::SubD(FRegister fd, FRegister fs, FRegister ft) {
-  EmitFR(0x11, 0x11, ft, fs, fd, 0x1);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x1), fd, fs, ft);
 }
 
 void MipsAssembler::MulD(FRegister fd, FRegister fs, FRegister ft) {
-  EmitFR(0x11, 0x11, ft, fs, fd, 0x2);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x2), fd, fs, ft);
 }
 
 void MipsAssembler::DivD(FRegister fd, FRegister fs, FRegister ft) {
-  EmitFR(0x11, 0x11, ft, fs, fd, 0x3);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x3), fd, fs, ft);
 }
 
 void MipsAssembler::SqrtS(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x4);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x4), fd, fs, fs);
 }
 
 void MipsAssembler::SqrtD(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x4);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x4), fd, fs, fs);
 }
 
 void MipsAssembler::AbsS(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x5);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x5), fd, fs, fs);
 }
 
 void MipsAssembler::AbsD(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x5);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x5), fd, fs, fs);
 }
 
 void MipsAssembler::MovS(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x6);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x6), fd, fs, fs);
 }
 
 void MipsAssembler::MovD(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x6);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x6), fd, fs, fs);
 }
 
 void MipsAssembler::NegS(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x7);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x7), fd, fs, fs);
 }
 
 void MipsAssembler::NegD(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x7);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x7), fd, fs, fs);
 }
 
 void MipsAssembler::CunS(FRegister fs, FRegister ft) {
@@ -938,7 +1169,7 @@
 void MipsAssembler::CunS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x31);
+  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x31), cc, fs, ft);
 }
 
 void MipsAssembler::CeqS(FRegister fs, FRegister ft) {
@@ -948,7 +1179,7 @@
 void MipsAssembler::CeqS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x32);
+  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x32), cc, fs, ft);
 }
 
 void MipsAssembler::CueqS(FRegister fs, FRegister ft) {
@@ -958,7 +1189,7 @@
 void MipsAssembler::CueqS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x33);
+  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x33), cc, fs, ft);
 }
 
 void MipsAssembler::ColtS(FRegister fs, FRegister ft) {
@@ -968,7 +1199,7 @@
 void MipsAssembler::ColtS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x34);
+  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x34), cc, fs, ft);
 }
 
 void MipsAssembler::CultS(FRegister fs, FRegister ft) {
@@ -978,7 +1209,7 @@
 void MipsAssembler::CultS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x35);
+  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x35), cc, fs, ft);
 }
 
 void MipsAssembler::ColeS(FRegister fs, FRegister ft) {
@@ -988,7 +1219,7 @@
 void MipsAssembler::ColeS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x36);
+  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x36), cc, fs, ft);
 }
 
 void MipsAssembler::CuleS(FRegister fs, FRegister ft) {
@@ -998,7 +1229,7 @@
 void MipsAssembler::CuleS(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x37);
+  DsFsmInstrCff(EmitFR(0x11, 0x10, ft, fs, static_cast<FRegister>(cc << 2), 0x37), cc, fs, ft);
 }
 
 void MipsAssembler::CunD(FRegister fs, FRegister ft) {
@@ -1008,7 +1239,7 @@
 void MipsAssembler::CunD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x31);
+  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x31), cc, fs, ft);
 }
 
 void MipsAssembler::CeqD(FRegister fs, FRegister ft) {
@@ -1018,7 +1249,7 @@
 void MipsAssembler::CeqD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x32);
+  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x32), cc, fs, ft);
 }
 
 void MipsAssembler::CueqD(FRegister fs, FRegister ft) {
@@ -1028,7 +1259,7 @@
 void MipsAssembler::CueqD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x33);
+  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x33), cc, fs, ft);
 }
 
 void MipsAssembler::ColtD(FRegister fs, FRegister ft) {
@@ -1038,7 +1269,7 @@
 void MipsAssembler::ColtD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x34);
+  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x34), cc, fs, ft);
 }
 
 void MipsAssembler::CultD(FRegister fs, FRegister ft) {
@@ -1048,7 +1279,7 @@
 void MipsAssembler::CultD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x35);
+  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x35), cc, fs, ft);
 }
 
 void MipsAssembler::ColeD(FRegister fs, FRegister ft) {
@@ -1058,7 +1289,7 @@
 void MipsAssembler::ColeD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x36);
+  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x36), cc, fs, ft);
 }
 
 void MipsAssembler::CuleD(FRegister fs, FRegister ft) {
@@ -1068,247 +1299,261 @@
 void MipsAssembler::CuleD(int cc, FRegister fs, FRegister ft) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x37);
+  DsFsmInstrCff(EmitFR(0x11, 0x11, ft, fs, static_cast<FRegister>(cc << 2), 0x37), cc, fs, ft);
 }
 
 void MipsAssembler::CmpUnS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x01);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x01), fd, fs, ft);
 }
 
 void MipsAssembler::CmpEqS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x02);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x02), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUeqS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x03);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x03), fd, fs, ft);
 }
 
 void MipsAssembler::CmpLtS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x04);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x04), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUltS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x05);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x05), fd, fs, ft);
 }
 
 void MipsAssembler::CmpLeS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x06);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x06), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUleS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x07);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x07), fd, fs, ft);
 }
 
 void MipsAssembler::CmpOrS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x11);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x11), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUneS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x12);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x12), fd, fs, ft);
 }
 
 void MipsAssembler::CmpNeS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x14, ft, fs, fd, 0x13);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, ft, fs, fd, 0x13), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUnD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x01);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x01), fd, fs, ft);
 }
 
 void MipsAssembler::CmpEqD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x02);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x02), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUeqD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x03);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x03), fd, fs, ft);
 }
 
 void MipsAssembler::CmpLtD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x04);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x04), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUltD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x05);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x05), fd, fs, ft);
 }
 
 void MipsAssembler::CmpLeD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x06);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x06), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUleD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x07);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x07), fd, fs, ft);
 }
 
 void MipsAssembler::CmpOrD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x11);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x11), fd, fs, ft);
 }
 
 void MipsAssembler::CmpUneD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x12);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x12), fd, fs, ft);
 }
 
 void MipsAssembler::CmpNeD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x15, ft, fs, fd, 0x13);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, ft, fs, fd, 0x13), fd, fs, ft);
 }
 
 void MipsAssembler::Movf(Register rd, Register rs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitR(0, rs, static_cast<Register>(cc << 2), rd, 0, 0x01);
+  DsFsmInstrRrrc(EmitR(0, rs, static_cast<Register>(cc << 2), rd, 0, 0x01), rd, rs, cc);
 }
 
 void MipsAssembler::Movt(Register rd, Register rs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitR(0, rs, static_cast<Register>((cc << 2) | 1), rd, 0, 0x01);
+  DsFsmInstrRrrc(EmitR(0, rs, static_cast<Register>((cc << 2) | 1), rd, 0, 0x01), rd, rs, cc);
 }
 
 void MipsAssembler::MovfS(FRegister fd, FRegister fs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, static_cast<FRegister>(cc << 2), fs, fd, 0x11);
+  DsFsmInstrFffc(EmitFR(0x11, 0x10, static_cast<FRegister>(cc << 2), fs, fd, 0x11), fd, fs, cc);
 }
 
 void MipsAssembler::MovfD(FRegister fd, FRegister fs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, static_cast<FRegister>(cc << 2), fs, fd, 0x11);
+  DsFsmInstrFffc(EmitFR(0x11, 0x11, static_cast<FRegister>(cc << 2), fs, fd, 0x11), fd, fs, cc);
 }
 
 void MipsAssembler::MovtS(FRegister fd, FRegister fs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x10, static_cast<FRegister>((cc << 2) | 1), fs, fd, 0x11);
+  DsFsmInstrFffc(EmitFR(0x11, 0x10, static_cast<FRegister>((cc << 2) | 1), fs, fd, 0x11),
+                 fd,
+                 fs,
+                 cc);
 }
 
 void MipsAssembler::MovtD(FRegister fd, FRegister fs, int cc) {
   CHECK(!IsR6());
   CHECK(IsUint<3>(cc)) << cc;
-  EmitFR(0x11, 0x11, static_cast<FRegister>((cc << 2) | 1), fs, fd, 0x11);
+  DsFsmInstrFffc(EmitFR(0x11, 0x11, static_cast<FRegister>((cc << 2) | 1), fs, fd, 0x11),
+                 fd,
+                 fs,
+                 cc);
 }
 
 void MipsAssembler::SelS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x10, ft, fs, fd, 0x10);
+  DsFsmInstrFfff(EmitFR(0x11, 0x10, ft, fs, fd, 0x10), fd, fs, ft);
 }
 
 void MipsAssembler::SelD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x11, ft, fs, fd, 0x10);
+  DsFsmInstrFfff(EmitFR(0x11, 0x11, ft, fs, fd, 0x10), fd, fs, ft);
 }
 
 void MipsAssembler::ClassS(FRegister fd, FRegister fs) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x1b);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x1b), fd, fs, fs);
 }
 
 void MipsAssembler::ClassD(FRegister fd, FRegister fs) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x1b);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x1b), fd, fs, fs);
 }
 
 void MipsAssembler::MinS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x10, ft, fs, fd, 0x1c);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x1c), fd, fs, ft);
 }
 
 void MipsAssembler::MinD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x11, ft, fs, fd, 0x1c);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x1c), fd, fs, ft);
 }
 
 void MipsAssembler::MaxS(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x10, ft, fs, fd, 0x1e);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, ft, fs, fd, 0x1e), fd, fs, ft);
 }
 
 void MipsAssembler::MaxD(FRegister fd, FRegister fs, FRegister ft) {
   CHECK(IsR6());
-  EmitFR(0x11, 0x11, ft, fs, fd, 0x1e);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, ft, fs, fd, 0x1e), fd, fs, ft);
 }
 
 void MipsAssembler::TruncLS(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x09);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x09), fd, fs, fs);
 }
 
 void MipsAssembler::TruncLD(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x09);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x09), fd, fs, fs);
 }
 
 void MipsAssembler::TruncWS(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x0D);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x0D), fd, fs, fs);
 }
 
 void MipsAssembler::TruncWD(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x0D);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x0D), fd, fs, fs);
 }
 
 void MipsAssembler::Cvtsw(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x14, static_cast<FRegister>(0), fs, fd, 0x20);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, static_cast<FRegister>(0), fs, fd, 0x20), fd, fs, fs);
 }
 
 void MipsAssembler::Cvtdw(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x14, static_cast<FRegister>(0), fs, fd, 0x21);
+  DsFsmInstrFff(EmitFR(0x11, 0x14, static_cast<FRegister>(0), fs, fd, 0x21), fd, fs, fs);
 }
 
 void MipsAssembler::Cvtsd(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x20);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0x20), fd, fs, fs);
 }
 
 void MipsAssembler::Cvtds(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x21);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0x21), fd, fs, fs);
 }
 
 void MipsAssembler::Cvtsl(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x15, static_cast<FRegister>(0), fs, fd, 0x20);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, static_cast<FRegister>(0), fs, fd, 0x20), fd, fs, fs);
 }
 
 void MipsAssembler::Cvtdl(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x15, static_cast<FRegister>(0), fs, fd, 0x21);
+  DsFsmInstrFff(EmitFR(0x11, 0x15, static_cast<FRegister>(0), fs, fd, 0x21), fd, fs, fs);
 }
 
 void MipsAssembler::FloorWS(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0xf);
+  DsFsmInstrFff(EmitFR(0x11, 0x10, static_cast<FRegister>(0), fs, fd, 0xf), fd, fs, fs);
 }
 
 void MipsAssembler::FloorWD(FRegister fd, FRegister fs) {
-  EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0xf);
+  DsFsmInstrFff(EmitFR(0x11, 0x11, static_cast<FRegister>(0), fs, fd, 0xf), fd, fs, fs);
 }
 
 void MipsAssembler::Mfc1(Register rt, FRegister fs) {
-  EmitFR(0x11, 0x00, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0);
+  DsFsmInstrRf(EmitFR(0x11, 0x00, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0),
+               rt,
+               fs);
 }
 
 void MipsAssembler::Mtc1(Register rt, FRegister fs) {
-  EmitFR(0x11, 0x04, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0);
+  DsFsmInstrFr(EmitFR(0x11, 0x04, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0),
+               fs,
+               rt);
 }
 
 void MipsAssembler::Mfhc1(Register rt, FRegister fs) {
-  EmitFR(0x11, 0x03, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0);
+  DsFsmInstrRf(EmitFR(0x11, 0x03, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0),
+               rt,
+               fs);
 }
 
 void MipsAssembler::Mthc1(Register rt, FRegister fs) {
-  EmitFR(0x11, 0x07, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0);
+  DsFsmInstrFr(EmitFR(0x11, 0x07, static_cast<FRegister>(rt), fs, static_cast<FRegister>(0), 0x0),
+               fs,
+               rt);
 }
 
 void MipsAssembler::MoveFromFpuHigh(Register rt, FRegister fs) {
@@ -1330,28 +1575,33 @@
 }
 
 void MipsAssembler::Lwc1(FRegister ft, Register rs, uint16_t imm16) {
-  EmitI(0x31, rs, static_cast<Register>(ft), imm16);
+  DsFsmInstrFr(EmitI(0x31, rs, static_cast<Register>(ft), imm16), ft, rs);
 }
 
 void MipsAssembler::Ldc1(FRegister ft, Register rs, uint16_t imm16) {
-  EmitI(0x35, rs, static_cast<Register>(ft), imm16);
+  DsFsmInstrFr(EmitI(0x35, rs, static_cast<Register>(ft), imm16), ft, rs);
 }
 
 void MipsAssembler::Swc1(FRegister ft, Register rs, uint16_t imm16) {
-  EmitI(0x39, rs, static_cast<Register>(ft), imm16);
+  DsFsmInstrFR(EmitI(0x39, rs, static_cast<Register>(ft), imm16), ft, rs);
 }
 
 void MipsAssembler::Sdc1(FRegister ft, Register rs, uint16_t imm16) {
-  EmitI(0x3d, rs, static_cast<Register>(ft), imm16);
+  DsFsmInstrFR(EmitI(0x3d, rs, static_cast<Register>(ft), imm16), ft, rs);
 }
 
 void MipsAssembler::Break() {
-  EmitR(0, static_cast<Register>(0), static_cast<Register>(0),
-        static_cast<Register>(0), 0, 0xD);
+  DsFsmInstrNop(EmitR(0, ZERO, ZERO, ZERO, 0, 0xD));
 }
 
 void MipsAssembler::Nop() {
-  EmitR(0x0, static_cast<Register>(0), static_cast<Register>(0), static_cast<Register>(0), 0, 0x0);
+  DsFsmInstrNop(EmitR(0x0, ZERO, ZERO, ZERO, 0, 0x0));
+}
+
+void MipsAssembler::NopIfNoReordering() {
+  if (!reordering_) {
+    Nop();
+  }
 }
 
 void MipsAssembler::Move(Register rd, Register rs) {
@@ -1377,9 +1627,11 @@
 }
 
 void MipsAssembler::PopAndReturn(Register rd, Register rt) {
+  bool reordering = SetReorder(false);
   Lw(rd, SP, 0);
   Jr(rt);
-  DecreaseFrameSize(kMipsWordSize);
+  DecreaseFrameSize(kMipsWordSize);  // Single instruction in delay slot.
+  SetReorder(reordering);
 }
 
 void MipsAssembler::LoadConst32(Register rd, int32_t value) {
@@ -1550,7 +1802,8 @@
       target_(target),
       lhs_reg_(0),
       rhs_reg_(0),
-      condition_(kUncond) {
+      condition_(kUncond),
+      delayed_instruction_(kUnfilledDelaySlot) {
   InitializeType(is_call, /* is_literal */ false, is_r6);
 }
 
@@ -1565,7 +1818,8 @@
       target_(target),
       lhs_reg_(lhs_reg),
       rhs_reg_(rhs_reg),
-      condition_(condition) {
+      condition_(condition),
+      delayed_instruction_(kUnfilledDelaySlot) {
   CHECK_NE(condition, kUncond);
   switch (condition) {
     case kCondLT:
@@ -1617,7 +1871,8 @@
       target_(kUnresolved),
       lhs_reg_(dest_reg),
       rhs_reg_(base_reg),
-      condition_(kUncond) {
+      condition_(kUncond),
+      delayed_instruction_(kUnfilledDelaySlot) {
   CHECK_NE(dest_reg, ZERO);
   if (is_r6) {
     CHECK_EQ(base_reg, ZERO);
@@ -1696,12 +1951,38 @@
   return old_location_;
 }
 
+uint32_t MipsAssembler::Branch::GetPrecedingInstructionLength(Type type) const {
+  // Short branches with delay slots always consist of two instructions, the branch
+  // and the delay slot, irrespective of whether the delay slot is filled with a
+  // useful instruction or not.
+  // Long composite branches may have a length longer by one instruction than
+  // specified in branch_info_[].length. This happens when an instruction is taken
+  // to fill the short branch delay slot, but the branch eventually becomes long
+  // and formally has no delay slot to fill. This instruction is placed at the
+  // beginning of the long composite branch and this needs to be accounted for in
+  // the branch length and the location of the offset encoded in the branch.
+  switch (type) {
+    case kLongUncondBranch:
+    case kLongCondBranch:
+    case kLongCall:
+    case kR6LongCondBranch:
+      return (delayed_instruction_ != kUnfilledDelaySlot &&
+          delayed_instruction_ != kUnfillableDelaySlot) ? 1 : 0;
+    default:
+      return 0;
+  }
+}
+
+uint32_t MipsAssembler::Branch::GetPrecedingInstructionSize(Type type) const {
+  return GetPrecedingInstructionLength(type) * sizeof(uint32_t);
+}
+
 uint32_t MipsAssembler::Branch::GetLength() const {
-  return branch_info_[type_].length;
+  return GetPrecedingInstructionLength(type_) + branch_info_[type_].length;
 }
 
 uint32_t MipsAssembler::Branch::GetOldLength() const {
-  return branch_info_[old_type_].length;
+  return GetPrecedingInstructionLength(old_type_) + branch_info_[old_type_].length;
 }
 
 uint32_t MipsAssembler::Branch::GetSize() const {
@@ -1883,7 +2164,8 @@
 }
 
 uint32_t MipsAssembler::Branch::GetOffsetLocation() const {
-  return location_ + branch_info_[type_].instr_offset * sizeof(uint32_t);
+  return location_ + GetPrecedingInstructionSize(type_) +
+      branch_info_[type_].instr_offset * sizeof(uint32_t);
 }
 
 uint32_t MipsAssembler::GetBranchOrPcRelBaseForEncoding(const MipsAssembler::Branch* branch) const {
@@ -1925,6 +2207,9 @@
   CHECK(!label->IsBound());
   uint32_t bound_pc = buffer_.Size();
 
+  // Make the delay slot FSM aware of the new label.
+  DsFsmLabel();
+
   // Walk the list of branches referring to and preceding this label.
   // Store the previously unknown target addresses in them.
   while (label->IsLinked()) {
@@ -1997,11 +2282,15 @@
 
 void MipsAssembler::FinalizeLabeledBranch(MipsLabel* label) {
   uint32_t length = branches_.back().GetLength();
+  // Commit the last branch target label (if any).
+  DsFsmCommitLabel();
   if (!label->IsBound()) {
     // Branch forward (to a following label), distance is unknown.
     // The first branch forward will contain 0, serving as the terminator of
     // the list of forward-reaching branches.
     Emit(label->position_);
+    // Nothing for the delay slot (yet).
+    DsFsmInstrNop(0);
     length--;
     // Now make the label object point to this branch
     // (this forms a linked list of branches preceding this label).
@@ -2014,9 +2303,139 @@
   }
 }
 
+bool MipsAssembler::Branch::CanHaveDelayedInstruction(const DelaySlot& delay_slot) const {
+  if (delay_slot.instruction_ == 0) {
+    // NOP or no instruction for the delay slot.
+    return false;
+  }
+  switch (type_) {
+    // R2 unconditional branches.
+    case kUncondBranch:
+    case kLongUncondBranch:
+      // There are no register interdependencies.
+      return true;
+
+    // R2 calls.
+    case kCall:
+    case kLongCall:
+      // Instructions depending on or modifying RA should not be moved into delay slots
+      // of branches modifying RA.
+      return ((delay_slot.gpr_ins_mask_ | delay_slot.gpr_outs_mask_) & (1u << RA)) == 0;
+
+    // R2 conditional branches.
+    case kCondBranch:
+    case kLongCondBranch:
+      switch (condition_) {
+        // Branches with one GPR source.
+        case kCondLTZ:
+        case kCondGEZ:
+        case kCondLEZ:
+        case kCondGTZ:
+        case kCondEQZ:
+        case kCondNEZ:
+          return (delay_slot.gpr_outs_mask_ & (1u << lhs_reg_)) == 0;
+
+        // Branches with two GPR sources.
+        case kCondEQ:
+        case kCondNE:
+          return (delay_slot.gpr_outs_mask_ & ((1u << lhs_reg_) | (1u << rhs_reg_))) == 0;
+
+        // Branches with one FPU condition code source.
+        case kCondF:
+        case kCondT:
+          return (delay_slot.cc_outs_mask_ & (1u << lhs_reg_)) == 0;
+
+        default:
+          // We don't support synthetic R2 branches (preceded with slt[u]) at this level
+          // (R2 doesn't have branches to compare 2 registers using <, <=, >=, >).
+          LOG(FATAL) << "Unexpected branch condition " << condition_;
+          UNREACHABLE();
+      }
+
+    // R6 unconditional branches.
+    case kR6UncondBranch:
+    case kR6LongUncondBranch:
+    // R6 calls.
+    case kR6Call:
+    case kR6LongCall:
+      // There are no delay slots.
+      return false;
+
+    // R6 conditional branches.
+    case kR6CondBranch:
+    case kR6LongCondBranch:
+      switch (condition_) {
+        // Branches with one FPU register source.
+        case kCondF:
+        case kCondT:
+          return (delay_slot.fpr_outs_mask_ & (1u << lhs_reg_)) == 0;
+        // Others have a forbidden slot instead of a delay slot.
+        default:
+          return false;
+      }
+
+    // Literals.
+    default:
+      LOG(FATAL) << "Unexpected branch type " << type_;
+      UNREACHABLE();
+  }
+}
+
+uint32_t MipsAssembler::Branch::GetDelayedInstruction() const {
+  return delayed_instruction_;
+}
+
+void MipsAssembler::Branch::SetDelayedInstruction(uint32_t instruction) {
+  CHECK_NE(instruction, kUnfilledDelaySlot);
+  CHECK_EQ(delayed_instruction_, kUnfilledDelaySlot);
+  delayed_instruction_ = instruction;
+}
+
+void MipsAssembler::Branch::DecrementLocations() {
+  // We first create a branch object, which gets its type and locations initialized,
+  // and then we check if the branch can actually have the preceding instruction moved
+  // into its delay slot. If it can, the branch locations need to be decremented.
+  //
+  // We could make the check before creating the branch object and avoid the location
+  // adjustment, but the check is cleaner when performed on an initialized branch
+  // object.
+  //
+  // If the branch is backwards (to a previously bound label), reducing the locations
+  // cannot cause a short branch to exceed its offset range because the offset reduces.
+  // And this is not at all a problem for a long branch backwards.
+  //
+  // If the branch is forward (not linked to any label yet), reducing the locations
+  // is harmless. The branch will be promoted to long if needed when the target is known.
+  CHECK_EQ(location_, old_location_);
+  CHECK_GE(old_location_, sizeof(uint32_t));
+  old_location_ -= sizeof(uint32_t);
+  location_ = old_location_;
+}
+
+void MipsAssembler::MoveInstructionToDelaySlot(Branch& branch) {
+  if (branch.CanHaveDelayedInstruction(delay_slot_)) {
+    // The last instruction cannot be used in a different delay slot,
+    // do not commit the label before it (if any).
+    DsFsmDropLabel();
+    // Remove the last emitted instruction.
+    size_t size = buffer_.Size();
+    CHECK_GE(size, sizeof(uint32_t));
+    size -= sizeof(uint32_t);
+    CHECK_EQ(buffer_.Load<uint32_t>(size), delay_slot_.instruction_);
+    buffer_.Resize(size);
+    // Attach it to the branch and adjust the branch locations.
+    branch.DecrementLocations();
+    branch.SetDelayedInstruction(delay_slot_.instruction_);
+  } else if (!reordering_ && branch.GetType() == Branch::kUncondBranch) {
+    // If reordering is disabled, prevent absorption of the target instruction.
+    branch.SetDelayedInstruction(Branch::kUnfillableDelaySlot);
+  }
+}
+
 void MipsAssembler::Buncond(MipsLabel* label) {
   uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
   branches_.emplace_back(IsR6(), buffer_.Size(), target, /* is_call */ false);
+  MoveInstructionToDelaySlot(branches_.back());
   FinalizeLabeledBranch(label);
 }
 
@@ -2027,12 +2446,14 @@
   }
   uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
   branches_.emplace_back(IsR6(), buffer_.Size(), target, condition, lhs, rhs);
+  MoveInstructionToDelaySlot(branches_.back());
   FinalizeLabeledBranch(label);
 }
 
 void MipsAssembler::Call(MipsLabel* label) {
   uint32_t target = label->IsBound() ? GetLabelLocation(label) : Branch::kUnresolved;
   branches_.emplace_back(IsR6(), buffer_.Size(), target, /* is_call */ true);
+  MoveInstructionToDelaySlot(branches_.back());
   FinalizeLabeledBranch(label);
 }
 
@@ -2104,6 +2525,7 @@
     uint32_t end = old_size;
     for (size_t i = branch_count; i > 0; ) {
       Branch& branch = branches_[--i];
+      CHECK_GE(end, branch.GetOldEndLocation());
       uint32_t size = end - branch.GetOldEndLocation();
       buffer_.Move(branch.GetEndLocation(), branch.GetOldEndLocation(), size);
       end = branch.GetOldLocation();
@@ -2148,26 +2570,53 @@
   BranchCondition condition = branch->GetCondition();
   Register lhs = branch->GetLeftRegister();
   Register rhs = branch->GetRightRegister();
+  uint32_t delayed_instruction = branch->GetDelayedInstruction();
   switch (branch->GetType()) {
     // R2 short branches.
     case Branch::kUncondBranch:
+      if (delayed_instruction == Branch::kUnfillableDelaySlot) {
+        // The branch was created when reordering was disabled, do not absorb the target
+        // instruction.
+        delayed_instruction = 0;  // NOP.
+      } else if (delayed_instruction == Branch::kUnfilledDelaySlot) {
+        // Try to absorb the target instruction into the delay slot.
+        delayed_instruction = 0;  // NOP.
+        // Incrementing the signed 16-bit offset past the target instruction must not
+        // cause overflow into the negative subrange, check for the max offset.
+        if (offset != 0x7FFF) {
+          uint32_t target = branch->GetTarget();
+          if (std::binary_search(ds_fsm_target_pcs_.begin(), ds_fsm_target_pcs_.end(), target)) {
+            delayed_instruction = buffer_.Load<uint32_t>(target);
+            offset++;
+          }
+        }
+      }
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       B(offset);
-      Nop();  // TODO: improve by filling the delay slot.
+      Emit(delayed_instruction);
       break;
     case Branch::kCondBranch:
+      DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);
+      if (delayed_instruction == Branch::kUnfilledDelaySlot) {
+        delayed_instruction = 0;  // NOP.
+      }
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       EmitBcondR2(condition, lhs, rhs, offset);
-      Nop();  // TODO: improve by filling the delay slot.
+      Emit(delayed_instruction);
       break;
     case Branch::kCall:
+      DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);
+      if (delayed_instruction == Branch::kUnfilledDelaySlot) {
+        delayed_instruction = 0;  // NOP.
+      }
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Bal(offset);
-      Nop();  // TODO: improve by filling the delay slot.
+      Emit(delayed_instruction);
       break;
 
     // R2 near literal.
     case Branch::kLiteral:
+      DCHECK_EQ(delayed_instruction, Branch::kUnfilledDelaySlot);
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Lw(lhs, rhs, offset);
       break;
@@ -2192,6 +2641,12 @@
       // For now simply use the stack for RA. This should be OK since for the
       // vast majority of code a short PC-relative branch is sufficient.
       // TODO: can this be improved?
+      // TODO: consider generation of a shorter sequence when we know that RA
+      // is explicitly preserved by the method entry/exit code.
+      if (delayed_instruction != Branch::kUnfilledDelaySlot &&
+          delayed_instruction != Branch::kUnfillableDelaySlot) {
+        Emit(delayed_instruction);
+      }
       Push(RA);
       Nal();
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
@@ -2204,6 +2659,10 @@
       break;
     case Branch::kLongCondBranch:
       // The comment on case 'Branch::kLongUncondBranch' applies here as well.
+      DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);
+      if (delayed_instruction != Branch::kUnfilledDelaySlot) {
+        Emit(delayed_instruction);
+      }
       // Note: the opposite condition branch encodes 8 as the distance, which is equal to the
       // number of instructions skipped:
       // (PUSH(IncreaseFrameSize(ADDIU) + SW) + NAL + LUI + ORI + ADDU + LW + JR).
@@ -2219,6 +2678,10 @@
       DecreaseFrameSize(kMipsWordSize);
       break;
     case Branch::kLongCall:
+      DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);
+      if (delayed_instruction != Branch::kUnfilledDelaySlot) {
+        Emit(delayed_instruction);
+      }
       Nal();
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Lui(AT, High16Bits(offset));
@@ -2230,6 +2693,7 @@
 
     // R2 far literal.
     case Branch::kFarLiteral:
+      DCHECK_EQ(delayed_instruction, Branch::kUnfilledDelaySlot);
       offset += (offset & 0x8000) << 1;  // Account for sign extension in lw.
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Lui(AT, High16Bits(offset));
@@ -2239,33 +2703,48 @@
 
     // R6 short branches.
     case Branch::kR6UncondBranch:
+      DCHECK_EQ(delayed_instruction, Branch::kUnfilledDelaySlot);
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Bc(offset);
       break;
     case Branch::kR6CondBranch:
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       EmitBcondR6(condition, lhs, rhs, offset);
-      Nop();  // TODO: improve by filling the forbidden/delay slot.
+      DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);
+      if (delayed_instruction != Branch::kUnfilledDelaySlot) {
+        Emit(delayed_instruction);
+      } else {
+        // TODO: improve by filling the forbidden slot (IFF this is
+        // a forbidden and not a delay slot).
+        Nop();
+      }
       break;
     case Branch::kR6Call:
+      DCHECK_EQ(delayed_instruction, Branch::kUnfilledDelaySlot);
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Balc(offset);
       break;
 
     // R6 near literal.
     case Branch::kR6Literal:
+      DCHECK_EQ(delayed_instruction, Branch::kUnfilledDelaySlot);
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Lwpc(lhs, offset);
       break;
 
     // R6 long branches.
     case Branch::kR6LongUncondBranch:
+      DCHECK_EQ(delayed_instruction, Branch::kUnfilledDelaySlot);
       offset += (offset & 0x8000) << 1;  // Account for sign extension in jic.
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Auipc(AT, High16Bits(offset));
       Jic(AT, Low16Bits(offset));
       break;
     case Branch::kR6LongCondBranch:
+      DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);
+      if (delayed_instruction != Branch::kUnfilledDelaySlot) {
+        Emit(delayed_instruction);
+      }
       EmitBcondR6(Branch::OppositeCondition(condition), lhs, rhs, 2);
       offset += (offset & 0x8000) << 1;  // Account for sign extension in jic.
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
@@ -2273,6 +2752,7 @@
       Jic(AT, Low16Bits(offset));
       break;
     case Branch::kR6LongCall:
+      DCHECK_EQ(delayed_instruction, Branch::kUnfilledDelaySlot);
       offset += (offset & 0x8000) << 1;  // Account for sign extension in jialc.
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Auipc(AT, High16Bits(offset));
@@ -2281,6 +2761,7 @@
 
     // R6 far literal.
     case Branch::kR6FarLiteral:
+      DCHECK_EQ(delayed_instruction, Branch::kUnfilledDelaySlot);
       offset += (offset & 0x8000) << 1;  // Account for sign extension in lw.
       CHECK_EQ(overwrite_location_, branch->GetOffsetLocation());
       Auipc(AT, High16Bits(offset));
@@ -2331,12 +2812,60 @@
   Bcond(label, kCondGTZ, rt);
 }
 
+bool MipsAssembler::CanExchangeWithSlt(Register rs, Register rt) const {
+  // If the instruction modifies AT, `rs` or `rt`, it can't be exchanged with the slt[u]
+  // instruction because either slt[u] depends on `rs` or `rt` or the following
+  // conditional branch depends on AT set by slt[u].
+  // Likewise, if the instruction depends on AT, it can't be exchanged with slt[u]
+  // because slt[u] changes AT.
+  return (delay_slot_.instruction_ != 0 &&
+      (delay_slot_.gpr_outs_mask_ & ((1u << AT) | (1u << rs) | (1u << rt))) == 0 &&
+      (delay_slot_.gpr_ins_mask_ & (1u << AT)) == 0);
+}
+
+void MipsAssembler::ExchangeWithSlt(const DelaySlot& forwarded_slot) {
+  // Exchange the last two instructions in the assembler buffer.
+  size_t size = buffer_.Size();
+  CHECK_GE(size, 2 * sizeof(uint32_t));
+  size_t pos1 = size - 2 * sizeof(uint32_t);
+  size_t pos2 = size - sizeof(uint32_t);
+  uint32_t instr1 = buffer_.Load<uint32_t>(pos1);
+  uint32_t instr2 = buffer_.Load<uint32_t>(pos2);
+  CHECK_EQ(instr1, forwarded_slot.instruction_);
+  CHECK_EQ(instr2, delay_slot_.instruction_);
+  buffer_.Store<uint32_t>(pos1, instr2);
+  buffer_.Store<uint32_t>(pos2, instr1);
+  // Set the current delay slot information to that of the last instruction
+  // in the buffer.
+  delay_slot_ = forwarded_slot;
+}
+
+void MipsAssembler::GenerateSltForCondBranch(bool unsigned_slt, Register rs, Register rt) {
+  // If possible, exchange the slt[u] instruction with the preceding instruction,
+  // so it can fill the delay slot.
+  DelaySlot forwarded_slot = delay_slot_;
+  bool exchange = CanExchangeWithSlt(rs, rt);
+  if (exchange) {
+    // The last instruction cannot be used in a different delay slot,
+    // do not commit the label before it (if any).
+    DsFsmDropLabel();
+  }
+  if (unsigned_slt) {
+    Sltu(AT, rs, rt);
+  } else {
+    Slt(AT, rs, rt);
+  }
+  if (exchange) {
+    ExchangeWithSlt(forwarded_slot);
+  }
+}
+
 void MipsAssembler::Blt(Register rs, Register rt, MipsLabel* label) {
   if (IsR6()) {
     Bcond(label, kCondLT, rs, rt);
   } else if (!Branch::IsNop(kCondLT, rs, rt)) {
     // Synthesize the instruction (not available on R2).
-    Slt(AT, rs, rt);
+    GenerateSltForCondBranch(/* unsigned_slt */ false, rs, rt);
     Bnez(AT, label);
   }
 }
@@ -2348,7 +2877,7 @@
     B(label);
   } else {
     // Synthesize the instruction (not available on R2).
-    Slt(AT, rs, rt);
+    GenerateSltForCondBranch(/* unsigned_slt */ false, rs, rt);
     Beqz(AT, label);
   }
 }
@@ -2358,7 +2887,7 @@
     Bcond(label, kCondLTU, rs, rt);
   } else if (!Branch::IsNop(kCondLTU, rs, rt)) {
     // Synthesize the instruction (not available on R2).
-    Sltu(AT, rs, rt);
+    GenerateSltForCondBranch(/* unsigned_slt */ true, rs, rt);
     Bnez(AT, label);
   }
 }
@@ -2370,7 +2899,7 @@
     B(label);
   } else {
     // Synthesize the instruction (not available on R2).
-    Sltu(AT, rs, rt);
+    GenerateSltForCondBranch(/* unsigned_slt */ true, rs, rt);
     Beqz(AT, label);
   }
 }
@@ -2613,12 +3142,22 @@
   LoadFromOffset(kLoadWord, RA, SP, stack_offset);
   cfi_.Restore(DWARFReg(RA));
 
-  // Decrease frame to required size.
-  DecreaseFrameSize(frame_size);
-
-  // Then jump to the return address.
-  Jr(RA);
-  Nop();
+  // Adjust the stack pointer in the delay slot if doing so doesn't break CFI.
+  bool exchange = IsInt<16>(static_cast<int32_t>(frame_size));
+  bool reordering = SetReorder(false);
+  if (exchange) {
+    // Jump to the return address.
+    Jr(RA);
+    // Decrease frame to required size.
+    DecreaseFrameSize(frame_size);  // Single instruction in delay slot.
+  } else {
+    // Decrease frame to required size.
+    DecreaseFrameSize(frame_size);
+    // Jump to the return address.
+    Jr(RA);
+    Nop();  // In delay slot.
+  }
+  SetReorder(reordering);
 
   // The CFI should be restored for any code that follows the exit block.
   cfi_.RestoreState();
@@ -2963,7 +3502,7 @@
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
                  base.AsCoreRegister(), offset.Int32Value());
   Jalr(scratch.AsCoreRegister());
-  Nop();
+  NopIfNoReordering();
   // TODO: place reference map on call.
 }
 
@@ -2975,7 +3514,7 @@
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
                  scratch.AsCoreRegister(), offset.Int32Value());
   Jalr(scratch.AsCoreRegister());
-  Nop();
+  NopIfNoReordering();
   // TODO: place reference map on call.
 }
 
@@ -2998,9 +3537,6 @@
   exception_blocks_.emplace_back(scratch, stack_adjust);
   LoadFromOffset(kLoadWord, scratch.AsCoreRegister(),
                  S1, Thread::ExceptionOffset<kMipsPointerSize>().Int32Value());
-  // TODO: on MIPS32R6 prefer Bnezc(scratch.AsCoreRegister(), slow.Entry());
-  // as the NAL instruction (occurring in long R2 branches) may become deprecated.
-  // For now use common for R2 and R6 instructions as this code must execute on both.
   Bnez(scratch.AsCoreRegister(), exception_blocks_.back().Entry());
 }
 
@@ -3017,7 +3553,7 @@
   LoadFromOffset(kLoadWord, T9, S1,
     QUICK_ENTRYPOINT_OFFSET(kMipsPointerSize, pDeliverException).Int32Value());
   Jr(T9);
-  Nop();
+  NopIfNoReordering();
 
   // Call never returns.
   Break();
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 434ca67..d50c439 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -154,6 +154,9 @@
       : Assembler(arena),
         overwriting_(false),
         overwrite_location_(0),
+        reordering_(true),
+        ds_fsm_state_(kExpectingLabel),
+        ds_fsm_target_pc_(0),
         literals_(arena->Adapter(kArenaAllocAssembler)),
         last_position_adjustment_(0),
         last_old_position_(0),
@@ -163,6 +166,7 @@
   }
 
   size_t CodeSize() const OVERRIDE { return Assembler::CodeSize(); }
+  size_t CodePosition() OVERRIDE;
   DebugFrameOpCodeWriterForAssembler& cfi() { return Assembler::cfi(); }
 
   virtual ~MipsAssembler() {
@@ -256,6 +260,11 @@
   void Slti(Register rt, Register rs, uint16_t imm16);
   void Sltiu(Register rt, Register rs, uint16_t imm16);
 
+  // Branches and jumps to immediate offsets/addresses do not take care of their
+  // delay/forbidden slots and generally should not be used directly. This applies
+  // to the following R2 and R6 branch/jump instructions with imm16, imm21, addr26
+  // offsets/addresses.
+  // Use branches/jumps to labels instead.
   void B(uint16_t imm16);
   void Bal(uint16_t imm16);
   void Beq(Register rs, Register rt, uint16_t imm16);
@@ -272,9 +281,13 @@
   void Bc1t(int cc, uint16_t imm16);  // R2
   void J(uint32_t addr26);
   void Jal(uint32_t addr26);
+  // Jalr() and Jr() fill their delay slots when reordering is enabled.
+  // When reordering is disabled, the delay slots must be filled manually.
+  // You may use NopIfNoReordering() to fill them when reordering is disabled.
   void Jalr(Register rd, Register rs);
   void Jalr(Register rs);
   void Jr(Register rs);
+  // Nal() does not fill its delay slot. It must be filled manually.
   void Nal();
   void Auipc(Register rs, uint16_t imm16);  // R6
   void Addiupc(Register rs, uint32_t imm19);  // R6
@@ -403,6 +416,7 @@
 
   void Break();
   void Nop();
+  void NopIfNoReordering();
   void Move(Register rd, Register rs);
   void Clear(Register rd);
   void Not(Register rd, Register rs);
@@ -414,7 +428,8 @@
   void LoadSConst32(FRegister r, int32_t value, Register temp);
   void Addiu32(Register rt, Register rs, int32_t value, Register rtmp = AT);
 
-  // These will generate R2 branches or R6 branches as appropriate.
+  // These will generate R2 branches or R6 branches as appropriate and take care of
+  // the delay/forbidden slots.
   void Bind(MipsLabel* label);
   void B(MipsLabel* label);
   void Bal(MipsLabel* label);
@@ -868,7 +883,51 @@
   };
   friend std::ostream& operator<<(std::ostream& os, const BranchCondition& rhs);
 
+  // Enables or disables instruction reordering (IOW, automatic filling of delay slots)
+  // similarly to ".set reorder" / ".set noreorder" in traditional MIPS assembly.
+  // Returns the last state, which may be useful for temporary enabling/disabling of
+  // reordering.
+  bool SetReorder(bool enable);
+
  private:
+  // Describes the most recently emitted instruction by the registers it reads and writes.
+  // Consulted when deciding whether that instruction may be moved into a delay slot.
+  struct DelaySlot {
+    DelaySlot();
+    // Encoded instruction that may be used to fill the delay slot, or 0 if there
+    // is none (0 conveniently is also the encoding of NOP).
+    uint32_t instruction_;
+    // GPRs written by the instruction (bit N is set for register number N).
+    uint32_t gpr_outs_mask_;
+    // GPRs read by the instruction (bit N is set for register number N).
+    uint32_t gpr_ins_mask_;
+    // Mask of FPRs written by the instruction.
+    uint32_t fpr_outs_mask_;
+    // Mask of FPRs read by the instruction.
+    uint32_t fpr_ins_mask_;
+    // Mask of FPU condition code flags written by the instruction.
+    uint32_t cc_outs_mask_;
+    // Mask of FPU condition code flags read by the instruction.
+    uint32_t cc_ins_mask_;
+    // Branches never operate on the LO and HI registers, hence there's
+    // no mask for LO and HI.
+  };
+
+  // Delay slot finite state machine's (DS FSM's) state. The FSM state is updated
+  // upon every new instruction and label generated. The FSM detects instructions
+  // suitable for delay slots and immediately preceded with labels. These are target
+  // instructions for branches. If an unconditional R2 branch does not get its delay
+  // slot filled with the immediately preceding instruction, it may instead get the
+  // slot filled with the target instruction (the branch will need its offset
+  // incremented past the target instruction). We call this "absorption". The FSM
+  // records PCs of the target instructions suitable for this optimization.
+  enum DsFsmState {
+    kExpectingLabel,  // Initial state, set by the MipsAssembler constructor.
+    kExpectingInstruction,  // NOTE(review): presumably entered after a label -- confirm.
+    kExpectingCommit  // NOTE(review): presumably a labeled instruction was seen -- confirm.
+  };
+  friend std::ostream& operator<<(std::ostream& os, const DsFsmState& rhs);
+
   class Branch {
    public:
     enum Type {
@@ -910,6 +969,17 @@
     static constexpr uint32_t kUnresolved = 0xffffffff;  // Unresolved target_
     static constexpr int32_t kMaxBranchLength = 32;
     static constexpr int32_t kMaxBranchSize = kMaxBranchLength * sizeof(uint32_t);
+    // The following two instruction encodings can never legally occur in branch delay
+    // slots and are used as markers.
+    //
+    // kUnfilledDelaySlot means that the branch may use either the preceding or the target
+    // instruction to fill its delay slot (the latter is only possible with unconditional
+    // R2 branches and is termed here as "absorption").
+    static constexpr uint32_t kUnfilledDelaySlot = 0x10000000;  // beq zero, zero, 0.
+    // kUnfillableDelaySlot means that the branch cannot use an instruction (other than NOP)
+    // to fill its delay slot. This is only used for unconditional R2 branches to prevent
+    // absorption of the target instruction when reordering is disabled.
+    static constexpr uint32_t kUnfillableDelaySlot = 0x13FF0000;  // beq ra, ra, 0.
 
     struct BranchInfo {
       // Branch length as a number of 4-byte-long instructions.
@@ -958,6 +1028,8 @@
     uint32_t GetTarget() const;
     uint32_t GetLocation() const;
     uint32_t GetOldLocation() const;
+    uint32_t GetPrecedingInstructionLength(Type type) const;
+    uint32_t GetPrecedingInstructionSize(Type type) const;
     uint32_t GetLength() const;
     uint32_t GetOldLength() const;
     uint32_t GetSize() const;
@@ -967,6 +1039,12 @@
     bool IsLong() const;
     bool IsResolved() const;
 
+    // Various helpers for branch delay slot management.
+    bool CanHaveDelayedInstruction(const DelaySlot& delay_slot) const;
+    void SetDelayedInstruction(uint32_t instruction);
+    uint32_t GetDelayedInstruction() const;
+    void DecrementLocations();
+
     // Returns the bit size of the signed offset that the branch instruction can handle.
     OffsetBits GetOffsetSize() const;
 
@@ -1031,27 +1109,34 @@
     // Helper for the above.
     void InitShortOrLong(OffsetBits ofs_size, Type short_type, Type long_type);
 
-    uint32_t old_location_;      // Offset into assembler buffer in bytes.
-    uint32_t location_;          // Offset into assembler buffer in bytes.
-    uint32_t target_;            // Offset into assembler buffer in bytes.
+    uint32_t old_location_;         // Offset into assembler buffer in bytes.
+    uint32_t location_;             // Offset into assembler buffer in bytes.
+    uint32_t target_;               // Offset into assembler buffer in bytes.
 
-    uint32_t lhs_reg_;           // Left-hand side register in conditional branches or
-                                 // indirect call register.
-    uint32_t rhs_reg_;           // Right-hand side register in conditional branches.
-    BranchCondition condition_;  // Condition for conditional branches.
+    uint32_t lhs_reg_;              // Left-hand side register in conditional branches or
+                                    // FPU condition code. Destination register in literals.
+    uint32_t rhs_reg_;              // Right-hand side register in conditional branches.
+                                    // Base register in literals (ZERO on R6).
+    BranchCondition condition_;     // Condition for conditional branches.
 
-    Type type_;                  // Current type of the branch.
-    Type old_type_;              // Initial type of the branch.
+    Type type_;                     // Current type of the branch.
+    Type old_type_;                 // Initial type of the branch.
+
+    uint32_t delayed_instruction_;  // Encoded instruction for the delay slot or
+                                    // kUnfilledDelaySlot if none but fillable or
+                                    // kUnfillableDelaySlot if none and unfillable
+                                    // (the latter is only used for unconditional R2
+                                    // branches).
   };
   friend std::ostream& operator<<(std::ostream& os, const Branch::Type& rhs);
   friend std::ostream& operator<<(std::ostream& os, const Branch::OffsetBits& rhs);
 
-  void EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
-  void EmitI(int opcode, Register rs, Register rt, uint16_t imm);
-  void EmitI21(int opcode, Register rs, uint32_t imm21);
-  void EmitI26(int opcode, uint32_t imm26);
-  void EmitFR(int opcode, int fmt, FRegister ft, FRegister fs, FRegister fd, int funct);
-  void EmitFI(int opcode, int fmt, FRegister rt, uint16_t imm);
+  uint32_t EmitR(int opcode, Register rs, Register rt, Register rd, int shamt, int funct);
+  uint32_t EmitI(int opcode, Register rs, Register rt, uint16_t imm);
+  uint32_t EmitI21(int opcode, Register rs, uint32_t imm21);
+  uint32_t EmitI26(int opcode, uint32_t imm26);
+  uint32_t EmitFR(int opcode, int fmt, FRegister ft, FRegister fs, FRegister fd, int funct);
+  uint32_t EmitFI(int opcode, int fmt, FRegister rt, uint16_t imm);
   void EmitBcondR2(BranchCondition cond, Register rs, Register rt, uint16_t imm16);
   void EmitBcondR6(BranchCondition cond, Register rs, Register rt, uint32_t imm16_21);
 
@@ -1060,6 +1145,33 @@
   void Call(MipsLabel* label);
   void FinalizeLabeledBranch(MipsLabel* label);
 
+  // Various helpers for branch delay slot management.
+  void DsFsmInstr(uint32_t instruction,
+                  uint32_t gpr_outs_mask,
+                  uint32_t gpr_ins_mask,
+                  uint32_t fpr_outs_mask,
+                  uint32_t fpr_ins_mask,
+                  uint32_t cc_outs_mask,
+                  uint32_t cc_ins_mask);
+  void DsFsmInstrNop(uint32_t instruction);
+  void DsFsmInstrRrr(uint32_t instruction, Register out, Register in1, Register in2);
+  void DsFsmInstrRrrr(uint32_t instruction, Register in1_out, Register in2, Register in3);
+  void DsFsmInstrFff(uint32_t instruction, FRegister out, FRegister in1, FRegister in2);
+  void DsFsmInstrFfff(uint32_t instruction, FRegister in1_out, FRegister in2, FRegister in3);
+  void DsFsmInstrRf(uint32_t instruction, Register out, FRegister in);
+  void DsFsmInstrFr(uint32_t instruction, FRegister out, Register in);
+  void DsFsmInstrFR(uint32_t instruction, FRegister in1, Register in2);
+  void DsFsmInstrCff(uint32_t instruction, int cc_out, FRegister in1, FRegister in2);
+  void DsFsmInstrRrrc(uint32_t instruction, Register in1_out, Register in2, int cc_in);
+  void DsFsmInstrFffc(uint32_t instruction, FRegister in1_out, FRegister in2, int cc_in);
+  void DsFsmLabel();
+  void DsFsmCommitLabel();
+  void DsFsmDropLabel();
+  void MoveInstructionToDelaySlot(Branch& branch);
+  bool CanExchangeWithSlt(Register rs, Register rt) const;
+  void ExchangeWithSlt(const DelaySlot& forwarded_slot);
+  void GenerateSltForCondBranch(bool unsigned_slt, Register rs, Register rt);
+
   Branch* GetBranch(uint32_t branch_id);
   const Branch* GetBranch(uint32_t branch_id) const;
   uint32_t GetBranchLocationOrPcRelBase(const MipsAssembler::Branch* branch) const;
@@ -1100,6 +1212,17 @@
   // The current overwrite location.
   uint32_t overwrite_location_;
 
+  // Whether instruction reordering (IOW, automatic filling of delay slots) is enabled.
+  bool reordering_;
+  // Information about the last instruction that may be used to fill a branch delay slot.
+  DelaySlot delay_slot_;
+  // Delay slot FSM state.
+  DsFsmState ds_fsm_state_;
+  // PC of the current labeled target instruction.
+  uint32_t ds_fsm_target_pc_;
+  // PCs of labeled target instructions.
+  std::vector<uint32_t> ds_fsm_target_pcs_;
+
   // Use std::deque<> for literal labels to allow insertions at the end
   // without invalidating pointers and references to existing elements.
   ArenaDeque<Literal> literals_;
@@ -1109,7 +1232,7 @@
   // that PC (from NAL) points to.
   MipsLabel pc_rel_base_label_;
 
-  // Data for AdjustedPosition(), see the description there.
+  // Data for GetAdjustedPosition(), see the description there.
   uint32_t last_position_adjustment_;
   uint32_t last_old_position_;
   uint32_t last_branch_id_;
diff --git a/compiler/utils/mips/assembler_mips32r6_test.cc b/compiler/utils/mips/assembler_mips32r6_test.cc
index 49ef272..fabb096 100644
--- a/compiler/utils/mips/assembler_mips32r6_test.cc
+++ b/compiler/utils/mips/assembler_mips32r6_test.cc
@@ -673,6 +673,144 @@
 // BRANCHES //
 //////////////
 
+TEST_F(AssemblerMIPS32r6Test, ImpossibleReordering) {
+  mips::MipsLabel label;
+  __ SetReorder(true);  // Enable automatic filling of delay slots.
+  __ Bind(&label);
+
+  __ CmpLtD(mips::F0, mips::F2, mips::F4);
+  __ Bc1nez(mips::F0, &label);  // F0 dependency: the branch reads what CmpLtD writes.
+
+  __ MulD(mips::F10, mips::F2, mips::F4);
+  __ Bc1eqz(mips::F10, &label);  // F10 dependency: the branch reads what MulD writes.
+
+  std::string expected =
+      ".set noreorder\n"
+      "1:\n"
+
+      "cmp.lt.d $f0, $f2, $f4\n"
+      "bc1nez $f0, 1b\n"
+      "nop\n"  // cmp.lt.d must stay ahead of the branch, so the slot holds a nop.
+
+      "mul.d $f10, $f2, $f4\n"
+      "bc1eqz $f10, 1b\n"
+      "nop\n";  // mul.d must stay ahead of the branch, so the slot holds a nop.
+  DriverStr(expected, "ImpossibleReordering");
+}
+
+TEST_F(AssemblerMIPS32r6Test, Reordering) {
+  mips::MipsLabel label;
+  __ SetReorder(true);  // Enable automatic filling of delay slots.
+  __ Bind(&label);
+
+  __ CmpLtD(mips::F0, mips::F2, mips::F4);
+  __ Bc1nez(mips::F2, &label);  // Reads F2 only; CmpLtD writes F0.
+
+  __ MulD(mips::F0, mips::F2, mips::F4);
+  __ Bc1eqz(mips::F4, &label);  // Reads F4 only; MulD writes F0.
+
+  std::string expected =
+      ".set noreorder\n"
+      "1:\n"
+
+      "bc1nez $f2, 1b\n"
+      "cmp.lt.d $f0, $f2, $f4\n"  // Moved into the delay slot.
+
+      "bc1eqz $f4, 1b\n"
+      "mul.d $f0, $f2, $f4\n";  // Moved into the delay slot.
+  DriverStr(expected, "Reordering");
+}
+
+TEST_F(AssemblerMIPS32r6Test, SetReorder) {
+  mips::MipsLabel label1, label2, label3, label4;
+
+  __ SetReorder(true);  // Reordering on: Addu is expected in the delay slot.
+  __ Bind(&label1);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Bc1nez(mips::F0, &label1);
+
+  __ SetReorder(false);  // Reordering off: Addu stays put, the slot gets a nop.
+  __ Bind(&label2);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Bc1nez(mips::F0, &label2);
+
+  __ SetReorder(true);  // Same as the first case, with Bc1eqz.
+  __ Bind(&label3);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Bc1eqz(mips::F0, &label3);
+
+  __ SetReorder(false);  // Same as the second case, with Bc1eqz.
+  __ Bind(&label4);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Bc1eqz(mips::F0, &label4);
+
+  std::string expected =
+      ".set noreorder\n"
+      "1:\n"
+      "bc1nez $f0, 1b\n"
+      "addu $t0, $t1, $t2\n"
+
+      "2:\n"
+      "addu $t0, $t1, $t2\n"
+      "bc1nez $f0, 2b\n"
+      "nop\n"
+
+      "3:\n"
+      "bc1eqz $f0, 3b\n"
+      "addu $t0, $t1, $t2\n"
+
+      "4:\n"
+      "addu $t0, $t1, $t2\n"
+      "bc1eqz $f0, 4b\n"
+      "nop\n";
+  DriverStr(expected, "SetReorder");
+}
+
+TEST_F(AssemblerMIPS32r6Test, LongBranchReorder) {
+  mips::MipsLabel label;
+  __ SetReorder(true);
+  __ Subu(mips::T0, mips::T1, mips::T2);
+  __ Bc1nez(mips::F0, &label);  // Forward branch, expected in its long form below.
+  constexpr uint32_t kAdduCount1 = (1u << 15) + 1;
+  for (uint32_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips::ZERO, mips::ZERO, mips::ZERO);
+  }
+  __ Bind(&label);
+  constexpr uint32_t kAdduCount2 = (1u << 15) + 1;
+  for (uint32_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips::ZERO, mips::ZERO, mips::ZERO);
+  }
+  __ Subu(mips::T0, mips::T1, mips::T2);
+  __ Bc1eqz(mips::F0, &label);  // Backward branch, expected in its long form below.
+
+  uint32_t offset_forward = 2 + kAdduCount1;  // 2: account for auipc and jic.
+  offset_forward <<= 2;
+  offset_forward += (offset_forward & 0x8000) << 1;  // Account for sign extension in jic.
+
+  uint32_t offset_back = -(kAdduCount2 + 2);  // 2: account for subu and bc1nez.
+  offset_back <<= 2;
+  offset_back += (offset_back & 0x8000) << 1;  // Account for sign extension in jic.
+
+  std::ostringstream oss;
+  oss <<
+      ".set noreorder\n"
+      "subu $t0, $t1, $t2\n"  // Stays ahead of the long branch sequence.
+      "bc1eqz $f0, 1f\n"  // Inverted condition skips over the auipc/jic pair.
+      "auipc $at, 0x" << std::hex << High16Bits(offset_forward) << "\n"
+      "jic $at, 0x" << std::hex << Low16Bits(offset_forward) << "\n"
+      "1:\n" <<
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") <<
+      "2:\n" <<
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") <<
+      "subu $t0, $t1, $t2\n"  // Stays ahead of the long branch sequence.
+      "bc1nez $f0, 3f\n"  // Inverted condition skips over the auipc/jic pair.
+      "auipc $at, 0x" << std::hex << High16Bits(offset_back) << "\n"
+      "jic $at, 0x" << std::hex << Low16Bits(offset_back) << "\n"
+      "3:\n";
+  std::string expected = oss.str();
+  DriverStr(expected, "LongBranchReorder");  // Test name, matching the TEST_F name.
+}
+
 // TODO: MipsAssembler::Addiupc
 //       MipsAssembler::Bc
 //       MipsAssembler::Jic
diff --git a/compiler/utils/mips/assembler_mips_test.cc b/compiler/utils/mips/assembler_mips_test.cc
index 50a8dc2..708bc3d 100644
--- a/compiler/utils/mips/assembler_mips_test.cc
+++ b/compiler/utils/mips/assembler_mips_test.cc
@@ -2009,14 +2009,17 @@
 }
 
 TEST_F(AssemblerMIPSTest, Beq) {
+  __ SetReorder(false);
   BranchCondTwoRegsHelper(&mips::MipsAssembler::Beq, "Beq");
 }
 
 TEST_F(AssemblerMIPSTest, Bne) {
+  __ SetReorder(false);
   BranchCondTwoRegsHelper(&mips::MipsAssembler::Bne, "Bne");
 }
 
 TEST_F(AssemblerMIPSTest, Beqz) {
+  __ SetReorder(false);
   mips::MipsLabel label;
   __ Beqz(mips::A0, &label);
   constexpr size_t kAdduCount1 = 63;
@@ -2043,6 +2046,7 @@
 }
 
 TEST_F(AssemblerMIPSTest, Bnez) {
+  __ SetReorder(false);
   mips::MipsLabel label;
   __ Bnez(mips::A0, &label);
   constexpr size_t kAdduCount1 = 63;
@@ -2069,22 +2073,27 @@
 }
 
 TEST_F(AssemblerMIPSTest, Bltz) {
+  __ SetReorder(false);
   BranchCondOneRegHelper(&mips::MipsAssembler::Bltz, "Bltz");
 }
 
 TEST_F(AssemblerMIPSTest, Bgez) {
+  __ SetReorder(false);
   BranchCondOneRegHelper(&mips::MipsAssembler::Bgez, "Bgez");
 }
 
 TEST_F(AssemblerMIPSTest, Blez) {
+  __ SetReorder(false);
   BranchCondOneRegHelper(&mips::MipsAssembler::Blez, "Blez");
 }
 
 TEST_F(AssemblerMIPSTest, Bgtz) {
+  __ SetReorder(false);
   BranchCondOneRegHelper(&mips::MipsAssembler::Bgtz, "Bgtz");
 }
 
 TEST_F(AssemblerMIPSTest, Blt) {
+  __ SetReorder(false);
   mips::MipsLabel label;
   __ Blt(mips::A0, mips::A1, &label);
   constexpr size_t kAdduCount1 = 63;
@@ -2113,6 +2122,7 @@
 }
 
 TEST_F(AssemblerMIPSTest, Bge) {
+  __ SetReorder(false);
   mips::MipsLabel label;
   __ Bge(mips::A0, mips::A1, &label);
   constexpr size_t kAdduCount1 = 63;
@@ -2141,6 +2151,7 @@
 }
 
 TEST_F(AssemblerMIPSTest, Bltu) {
+  __ SetReorder(false);
   mips::MipsLabel label;
   __ Bltu(mips::A0, mips::A1, &label);
   constexpr size_t kAdduCount1 = 63;
@@ -2169,6 +2180,7 @@
 }
 
 TEST_F(AssemblerMIPSTest, Bgeu) {
+  __ SetReorder(false);
   mips::MipsLabel label;
   __ Bgeu(mips::A0, mips::A1, &label);
   constexpr size_t kAdduCount1 = 63;
@@ -2197,6 +2209,7 @@
 }
 
 TEST_F(AssemblerMIPSTest, Bc1f) {
+  __ SetReorder(false);
   mips::MipsLabel label;
   __ Bc1f(0, &label);
   constexpr size_t kAdduCount1 = 63;
@@ -2223,6 +2236,7 @@
 }
 
 TEST_F(AssemblerMIPSTest, Bc1t) {
+  __ SetReorder(false);
   mips::MipsLabel label;
   __ Bc1t(0, &label);
   constexpr size_t kAdduCount1 = 63;
@@ -2331,6 +2345,410 @@
   DriverStr(expected, "LoadNearestFarLiteral");
 }
 
+TEST_F(AssemblerMIPSTest, ImpossibleReordering) {
+  mips::MipsLabel label1, label2;
+  __ SetReorder(true);  // Reordering is on; every case below must still end in a nop.
+
+  __ B(&label1);  // No preceding or target instruction for the delay slot.
+
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Bind(&label1);
+  __ B(&label1);  // The preceding label prevents moving Addu into the delay slot.
+  __ B(&label1);  // No preceding or target instruction for the delay slot.
+
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Beqz(mips::T0, &label1);  // T0 dependency.
+
+  __ Or(mips::T1, mips::T2, mips::T3);
+  __ Bne(mips::T2, mips::T1, &label1);  // T1 dependency.
+
+  __ And(mips::T0, mips::T1, mips::T2);
+  __ Blt(mips::T1, mips::T0, &label1);  // T0 dependency.
+
+  __ Xor(mips::AT, mips::T0, mips::T1);
+  __ Bge(mips::T1, mips::T0, &label1);  // AT dependency.
+
+  __ Subu(mips::T0, mips::T1, mips::AT);
+  __ Bltu(mips::T1, mips::T0, &label1);  // AT dependency (Subu reads AT); also T0.
+
+  __ ColtS(1, mips::F2, mips::F4);
+  __ Bc1t(1, &label1);  // cc1 dependency.
+
+  __ Move(mips::T0, mips::RA);
+  __ Bal(&label1);  // RA dependency.
+
+  __ Lw(mips::RA, mips::T0, 0);
+  __ Bal(&label1);  // RA dependency.
+
+  __ LlR2(mips::T9, mips::T0, 0);
+  __ Jalr(mips::T9);  // T9 dependency.
+
+  __ Sw(mips::RA, mips::T0, 0);
+  __ Jalr(mips::T9);  // RA dependency.
+
+  __ Lw(mips::T1, mips::T0, 0);
+  __ Jalr(mips::T1, mips::T9);  // T1 dependency.
+
+  __ ScR2(mips::T9, mips::T0, 0);
+  __ Jr(mips::T9);  // T9 dependency.
+
+  __ Bind(&label2);
+
+  __ Bnez(mips::T0, &label2);  // No preceding instruction for the delay slot.
+
+  __ Bgeu(mips::T1, mips::T0, &label2);  // No preceding instruction for the delay slot.
+
+  __ Bc1f(2, &label2);  // No preceding instruction for the delay slot.
+
+  __ Bal(&label2);  // No preceding instruction for the delay slot.
+
+  __ Jalr(mips::T9);  // No preceding instruction for the delay slot.
+
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ CodePosition();  // Drops the delay slot candidate (the last instruction).
+  __ Beq(mips::T1, mips::T2, &label2);  // No preceding or target instruction for the delay slot.
+
+  std::string expected =
+      ".set noreorder\n"
+      "b 1f\n"
+      "nop\n"
+
+      "addu $t0, $t1, $t2\n"
+      "1:\n"
+      "b 1b\n"
+      "nop\n"
+      "b 1b\n"
+      "nop\n"
+
+      "addu $t0, $t1, $t2\n"
+      "beq $zero, $t0, 1b\n"
+      "nop\n"
+
+      "or $t1, $t2, $t3\n"
+      "bne $t2, $t1, 1b\n"
+      "nop\n"
+
+      "and $t0, $t1, $t2\n"
+      "slt $at, $t1, $t0\n"
+      "bne $zero, $at, 1b\n"
+      "nop\n"
+
+      "xor $at, $t0, $t1\n"
+      "slt $at, $t1, $t0\n"
+      "beq $zero, $at, 1b\n"
+      "nop\n"
+
+      "subu $t0, $t1, $at\n"
+      "sltu $at, $t1, $t0\n"
+      "bne $zero, $at, 1b\n"
+      "nop\n"
+
+      "c.olt.s $fcc1, $f2, $f4\n"
+      "bc1t $fcc1, 1b\n"
+      "nop\n"
+
+      "or $t0, $ra, $zero\n"
+      "bal 1b\n"
+      "nop\n"
+
+      "lw $ra, 0($t0)\n"
+      "bal 1b\n"
+      "nop\n"
+
+      "ll $t9, 0($t0)\n"
+      "jalr $t9\n"
+      "nop\n"
+
+      "sw $ra, 0($t0)\n"
+      "jalr $t9\n"
+      "nop\n"
+
+      "lw $t1, 0($t0)\n"
+      "jalr $t1, $t9\n"
+      "nop\n"
+
+      "sc $t9, 0($t0)\n"
+      "jalr $zero, $t9\n"
+      "nop\n"
+
+      "2:\n"
+
+      "bne $zero, $t0, 2b\n"
+      "nop\n"
+
+      "sltu $at, $t1, $t0\n"
+      "beq $zero, $at, 2b\n"
+      "nop\n"
+
+      "bc1f $fcc2, 2b\n"
+      "nop\n"
+
+      "bal 2b\n"
+      "nop\n"
+
+      "jalr $t9\n"
+      "nop\n"
+
+      "addu $t0, $t1, $t2\n"
+      "beq $t1, $t2, 2b\n"
+      "nop\n";
+  DriverStr(expected, "ImpossibleReordering");
+}
+
+TEST_F(AssemblerMIPSTest, Reordering) {
+  mips::MipsLabel label1, label2;
+  __ SetReorder(true);  // Reordering is on; every case below fills the delay slot.
+
+  __ Bind(&label1);
+  __ Bind(&label2);
+
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Beqz(mips::T1, &label1);  // Reads T1 only; Addu writes T0.
+
+  __ Or(mips::T1, mips::T2, mips::T3);
+  __ Bne(mips::T2, mips::T3, &label1);  // Reads T2 and T3; Or writes T1.
+
+  __ And(mips::T0, mips::T1, mips::T2);
+  __ Blt(mips::T1, mips::T2, &label1);  // And is exchanged with the synthesized slt.
+
+  __ Xor(mips::T2, mips::T0, mips::T1);
+  __ Bge(mips::T1, mips::T0, &label1);  // Xor is exchanged with the synthesized slt.
+
+  __ Subu(mips::T2, mips::T1, mips::T0);
+  __ Bltu(mips::T1, mips::T0, &label1);  // Subu is exchanged with the synthesized sltu.
+
+  __ ColtS(0, mips::F2, mips::F4);
+  __ Bc1t(1, &label1);  // Reads cc1; ColtS writes cc0.
+
+  __ Move(mips::T0, mips::T1);
+  __ Bal(&label1);
+
+  __ LlR2(mips::T1, mips::T0, 0);
+  __ Jalr(mips::T9);
+
+  __ ScR2(mips::T1, mips::T0, 0);
+  __ Jr(mips::T9);
+
+  std::string expected =
+      ".set noreorder\n"
+      "1:\n"
+
+      "beq $zero, $t1, 1b\n"
+      "addu $t0, $t1, $t2\n"
+
+      "bne $t2, $t3, 1b\n"
+      "or $t1, $t2, $t3\n"
+
+      "slt $at, $t1, $t2\n"
+      "bne $zero, $at, 1b\n"
+      "and $t0, $t1, $t2\n"
+
+      "slt $at, $t1, $t0\n"
+      "beq $zero, $at, 1b\n"
+      "xor $t2, $t0, $t1\n"
+
+      "sltu $at, $t1, $t0\n"
+      "bne $zero, $at, 1b\n"
+      "subu $t2, $t1, $t0\n"
+
+      "bc1t $fcc1, 1b\n"
+      "c.olt.s $fcc0, $f2, $f4\n"
+
+      "bal 1b\n"
+      "or $t0, $t1, $zero\n"
+
+      "jalr $t9\n"
+      "ll $t1, 0($t0)\n"
+
+      "jalr $zero, $t9\n"
+      "sc $t1, 0($t0)\n";
+  DriverStr(expected, "Reordering");
+}
+
+TEST_F(AssemblerMIPSTest, AbsorbTargetInstruction) {
+  mips::MipsLabel label1, label2, label3, label4, label5, label6;
+  __ SetReorder(true);
+
+  __ B(&label1);  // Forward branch; absorbs the ADDU bound at label1.
+  __ Bind(&label1);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+
+  __ Bind(&label2);
+  __ Xor(mips::T0, mips::T1, mips::T2);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Bind(&label3);  // Prevents reordering ADDU above with B below.
+  __ B(&label2);  // Backward branch; absorbs the XOR bound at label2.
+
+  __ B(&label4);
+  __ Bind(&label4);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ CodePosition();  // Prevents absorbing ADDU above.
+
+  __ B(&label5);
+  __ Bind(&label5);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ Bind(&label6);
+  __ CodePosition();  // Even across Bind(), CodePosition() prevents absorbing the ADDU above.
+
+  std::string expected =
+      ".set noreorder\n"
+      "b 1f\n"
+      "addu $t0, $t1, $t2\n"  // Copy of the target instruction in the delay slot.
+      "addu $t0, $t1, $t2\n"  // The original target instruction.
+      "1:\n"  // The branch now lands past the absorbed instruction.
+
+      "xor $t0, $t1, $t2\n"
+      "2:\n"  // The branch now lands past the absorbed instruction.
+      "addu $t0, $t1, $t2\n"
+      "b 2b\n"
+      "xor $t0, $t1, $t2\n"  // Copy of the target instruction in the delay slot.
+
+      "b 4f\n"
+      "nop\n"
+      "4:\n"
+      "addu $t0, $t1, $t2\n"
+
+      "b 5f\n"
+      "nop\n"
+      "5:\n"
+      "addu $t0, $t1, $t2\n";
+  DriverStr(expected, "AbsorbTargetInstruction");
+}
+
+TEST_F(AssemblerMIPSTest, SetReorder) {
+  mips::MipsLabel label1, label2, label3, label4, label5, label6;
+
+  __ SetReorder(true);  // Branches emitted with reordering enabled.
+  __ Bind(&label1);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ B(&label1);
+  __ B(&label5);
+  __ B(&label6);
+
+  __ SetReorder(false);  // Branches emitted with reordering disabled: all get nops.
+  __ Bind(&label2);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ B(&label2);
+  __ B(&label5);
+  __ B(&label6);
+
+  __ SetReorder(true);  // Same as the first group.
+  __ Bind(&label3);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ B(&label3);
+  __ B(&label5);
+  __ B(&label6);
+
+  __ SetReorder(false);  // Same as the second group.
+  __ Bind(&label4);
+  __ Addu(mips::T0, mips::T1, mips::T2);
+  __ B(&label4);
+  __ B(&label5);
+  __ B(&label6);
+
+  __ SetReorder(true);  // Target instruction emitted with reordering enabled.
+  __ Bind(&label5);
+  __ Subu(mips::T0, mips::T1, mips::T2);
+
+  __ SetReorder(false);  // Target instruction emitted with reordering disabled.
+  __ Bind(&label6);
+  __ Xor(mips::T0, mips::T1, mips::T2);
+
+  std::string expected =
+      ".set noreorder\n"
+      "1:\n"
+      "b 1b\n"
+      "addu $t0, $t1, $t2\n"
+      "b 55f\n"  // Lands past the absorbed subu.
+      "subu $t0, $t1, $t2\n"
+      "b 6f\n"
+      "nop\n"  // The xor at label6 is not absorbed.
+
+      "2:\n"
+      "addu $t0, $t1, $t2\n"
+      "b 2b\n"
+      "nop\n"
+      "b 5f\n"
+      "nop\n"
+      "b 6f\n"
+      "nop\n"
+
+      "3:\n"
+      "b 3b\n"
+      "addu $t0, $t1, $t2\n"
+      "b 55f\n"  // Lands past the absorbed subu.
+      "subu $t0, $t1, $t2\n"
+      "b 6f\n"
+      "nop\n"  // The xor at label6 is not absorbed.
+
+      "4:\n"
+      "addu $t0, $t1, $t2\n"
+      "b 4b\n"
+      "nop\n"
+      "b 5f\n"
+      "nop\n"
+      "b 6f\n"
+      "nop\n"
+
+      "5:\n"
+      "subu $t0, $t1, $t2\n"
+      "55:\n"
+      "6:\n"
+      "xor $t0, $t1, $t2\n";
+  DriverStr(expected, "SetReorder");
+}
+
+TEST_F(AssemblerMIPSTest, LongBranchReorder) {
+  mips::MipsLabel label;
+  __ SetReorder(true);
+  __ Subu(mips::T0, mips::T1, mips::T2);
+  __ B(&label);  // Forward branch, expected in its long form below.
+  constexpr uint32_t kAdduCount1 = (1u << 15) + 1;
+  for (size_t i = 0; i != kAdduCount1; ++i) {
+    __ Addu(mips::ZERO, mips::ZERO, mips::ZERO);
+  }
+  __ Bind(&label);
+  constexpr uint32_t kAdduCount2 = (1u << 15) + 1;
+  for (size_t i = 0; i != kAdduCount2; ++i) {
+    __ Addu(mips::ZERO, mips::ZERO, mips::ZERO);
+  }
+  __ Subu(mips::T0, mips::T1, mips::T2);
+  __ B(&label);  // Backward branch, expected in its long form below.
+
+  // Account for 5 extra instructions: ori, addu, lw, jalr, addiu.
+  uint32_t offset_forward = (kAdduCount1 + 5) * sizeof(uint32_t);
+  // Account for 5 extra instructions: subu, addiu, sw, nal, lui.
+  uint32_t offset_back = -(kAdduCount2 + 5) * sizeof(uint32_t);  // Spans the second filler.
+
+  std::ostringstream oss;
+  oss <<
+      ".set noreorder\n"
+      "subu $t0, $t1, $t2\n"  // Stays ahead of the long branch sequence.
+      "addiu $sp, $sp, -4\n"
+      "sw $ra, 0($sp)\n"
+      "bltzal $zero, .+4\n"
+      "lui $at, 0x" << std::hex << High16Bits(offset_forward) << "\n"
+      "ori $at, $at, 0x" << std::hex << Low16Bits(offset_forward) << "\n"
+      "addu $at, $at, $ra\n"
+      "lw $ra, 0($sp)\n"
+      "jalr $zero, $at\n"
+      "addiu $sp, $sp, 4\n" <<  // Follows the jalr, i.e. sits in its delay slot.
+      RepeatInsn(kAdduCount1, "addu $zero, $zero, $zero\n") <<
+      RepeatInsn(kAdduCount2, "addu $zero, $zero, $zero\n") <<
+      "subu $t0, $t1, $t2\n"  // Stays ahead of the long branch sequence.
+      "addiu $sp, $sp, -4\n"
+      "sw $ra, 0($sp)\n"
+      "bltzal $zero, .+4\n"
+      "lui $at, 0x" << std::hex << High16Bits(offset_back) << "\n"
+      "ori $at, $at, 0x" << std::hex << Low16Bits(offset_back) << "\n"
+      "addu $at, $at, $ra\n"
+      "lw $ra, 0($sp)\n"
+      "jalr $zero, $at\n"
+      "addiu $sp, $sp, 4\n";  // Follows the jalr, i.e. sits in its delay slot.
+  std::string expected = oss.str();
+  DriverStr(expected, "LongBranchReorder");
+}
+
 #undef __
 
 }  // namespace art