MIPS64: Implement table-based packed switch

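Large packed switches are now lowered to a PC-relative jump table
emitted after the code (and before the literals) instead of a chain
of compares and branches. The table holds 32-bit offsets of the case
targets relative to the table start; the generated code normalizes
the switch value to a zero-based index, range-checks it against the
number of entries, and indexes the table to compute the absolute
target address. Switches with up to kPackedSwitchJumpTableThreshold
(6) cases keep the existing compare-and-jump lowering, which is
smaller for few cases (approx. 3 + 2.5 * N words vs. 11 + N words
for the table-based method).

For illustration, the table-based case emits roughly the following
sequence (sketch only: instruction spellings are approximate, the
exact expansions of Addiu32, LoadConst32 and LoadLabelAddress depend
on the operand values and on the distance to the table, and
<default>/<table> are placeholder labels):

    addiu   TMP, value_reg, -lower_bound  # Addiu32, may expand to aui + addiu
    ori     AT, ZERO, num_entries         # LoadConst32
    bgeuc   TMP, AT, <default>            # unsigned check also catches value < lower_bound
    addiupc AT, <table>                   # LoadLabelAddress, PC-relative table address
    sll     TMP, TMP, 2
    daddu   TMP, TMP, AT
    lw      TMP, 0(TMP)                   # 32-bit offset relative to the table start
    daddu   TMP, TMP, AT                  # absolute target address
    jr      TMP
    nop

To support this, the assembler gains Aui and Addiu32 (add an
arbitrary 32-bit immediate) and JumpTable handling: table space is
reserved before branch promotion and the offsets are filled in once
all labels are bound.
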
Test: booted MIPS64 (with 2nd arch MIPS32R6) in QEMU
Test: test-art-target-run-test-optimizing (MIPS64R6) in QEMU
Test: test-art-host-gtest

Change-Id: I333dca43fca57ae7e6021bb84585487c889417c3
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index cf9f9d4..aedf31e 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -4482,27 +4482,20 @@
   locations->SetInAt(0, Location::RequiresRegister());
 }
 
-void InstructionCodeGeneratorMIPS64::VisitPackedSwitch(HPackedSwitch* switch_instr) {
-  int32_t lower_bound = switch_instr->GetStartValue();
-  int32_t num_entries = switch_instr->GetNumEntries();
-  LocationSummary* locations = switch_instr->GetLocations();
-  GpuRegister value_reg = locations->InAt(0).AsRegister<GpuRegister>();
-  HBasicBlock* default_block = switch_instr->GetDefaultBlock();
-
+void InstructionCodeGeneratorMIPS64::GenPackedSwitchWithCompares(GpuRegister value_reg,
+                                                                 int32_t lower_bound,
+                                                                 uint32_t num_entries,
+                                                                 HBasicBlock* switch_block,
+                                                                 HBasicBlock* default_block) {
   // Create a set of compare/jumps.
   GpuRegister temp_reg = TMP;
-  if (IsInt<16>(-lower_bound)) {
-    __ Addiu(temp_reg, value_reg, -lower_bound);
-  } else {
-    __ LoadConst32(AT, -lower_bound);
-    __ Addu(temp_reg, value_reg, AT);
-  }
+  __ Addiu32(temp_reg, value_reg, -lower_bound);
   // Jump to default if index is negative
   // Note: We don't check the case that index is positive while value < lower_bound, because in
   // this case, index >= num_entries must be true. So that we can save one branch instruction.
   __ Bltzc(temp_reg, codegen_->GetLabelOf(default_block));
 
-  const ArenaVector<HBasicBlock*>& successors = switch_instr->GetBlock()->GetSuccessors();
+  const ArenaVector<HBasicBlock*>& successors = switch_block->GetSuccessors();
   // Jump to successors[0] if value == lower_bound.
   __ Beqzc(temp_reg, codegen_->GetLabelOf(successors[0]));
   int32_t last_index = 0;
@@ -4520,11 +4513,66 @@
   }
 
   // And the default for any other value.
-  if (!codegen_->GoesToNextBlock(switch_instr->GetBlock(), default_block)) {
+  if (!codegen_->GoesToNextBlock(switch_block, default_block)) {
     __ Bc(codegen_->GetLabelOf(default_block));
   }
 }
 
+void InstructionCodeGeneratorMIPS64::GenTableBasedPackedSwitch(GpuRegister value_reg,
+                                                               int32_t lower_bound,
+                                                               uint32_t num_entries,
+                                                               HBasicBlock* switch_block,
+                                                               HBasicBlock* default_block) {
+  // Create a jump table.
+  std::vector<Mips64Label*> labels(num_entries);
+  const ArenaVector<HBasicBlock*>& successors = switch_block->GetSuccessors();
+  for (uint32_t i = 0; i < num_entries; i++) {
+    labels[i] = codegen_->GetLabelOf(successors[i]);
+  }
+  JumpTable* table = __ CreateJumpTable(std::move(labels));
+
+  // Is the value in range?
+  __ Addiu32(TMP, value_reg, -lower_bound);
+  __ LoadConst32(AT, num_entries);
+  __ Bgeuc(TMP, AT, codegen_->GetLabelOf(default_block));
+
+  // We are in the range of the table.
+  // Load the target offset from the jump table, indexing by the value.
+  __ LoadLabelAddress(AT, table->GetLabel());
+  __ Sll(TMP, TMP, 2);
+  __ Daddu(TMP, TMP, AT);
+  __ Lw(TMP, TMP, 0);
+  // Compute the absolute target address by adding the table start address
+  // (the table contains offsets to targets relative to its start).
+  __ Daddu(TMP, TMP, AT);
+  // And jump.
+  __ Jr(TMP);
+  __ Nop();
+}
+
+void InstructionCodeGeneratorMIPS64::VisitPackedSwitch(HPackedSwitch* switch_instr) {
+  int32_t lower_bound = switch_instr->GetStartValue();
+  uint32_t num_entries = switch_instr->GetNumEntries();
+  LocationSummary* locations = switch_instr->GetLocations();
+  GpuRegister value_reg = locations->InAt(0).AsRegister<GpuRegister>();
+  HBasicBlock* switch_block = switch_instr->GetBlock();
+  HBasicBlock* default_block = switch_instr->GetDefaultBlock();
+
+  if (num_entries > kPackedSwitchJumpTableThreshold) {
+    GenTableBasedPackedSwitch(value_reg,
+                              lower_bound,
+                              num_entries,
+                              switch_block,
+                              default_block);
+  } else {
+    GenPackedSwitchWithCompares(value_reg,
+                                lower_bound,
+                                num_entries,
+                                switch_block,
+                                default_block);
+  }
+}
+
 void LocationsBuilderMIPS64::VisitClassTableGet(HClassTableGet*) {
   UNIMPLEMENTED(FATAL) << "ClassTableGet is unimplemented on mips64";
 }
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index cbd4957..99f11a9 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -217,6 +217,14 @@
 
   Mips64Assembler* GetAssembler() const { return assembler_; }
 
+  // Compare-and-jump packed switch generates approx. 3 + 2.5 * N 32-bit
+  // instructions for N cases.
+  // Table-based packed switch generates approx. 11 32-bit instructions
+  // and N 32-bit data words for N cases.
+  // At N = 6 they come out as 18 and 17 32-bit words respectively.
+  // We switch to the table-based method starting with 7 cases.
+  static constexpr uint32_t kPackedSwitchJumpTableThreshold = 6;
+
  private:
   void GenerateClassInitializationCheck(SlowPathCodeMIPS64* slow_path, GpuRegister class_reg);
   void GenerateMemoryBarrier(MemBarrierKind kind);
@@ -256,6 +264,16 @@
                                   LocationSummary* locations,
                                   Mips64Label* label);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
+  void GenPackedSwitchWithCompares(GpuRegister value_reg,
+                                   int32_t lower_bound,
+                                   uint32_t num_entries,
+                                   HBasicBlock* switch_block,
+                                   HBasicBlock* default_block);
+  void GenTableBasedPackedSwitch(GpuRegister value_reg,
+                                 int32_t lower_bound,
+                                 uint32_t num_entries,
+                                 HBasicBlock* switch_block,
+                                 HBasicBlock* default_block);
 
   Mips64Assembler* const assembler_;
   CodeGeneratorMIPS64* const codegen_;
diff --git a/compiler/utils/mips64/assembler_mips64.cc b/compiler/utils/mips64/assembler_mips64.cc
index 04430b1..2419b6b 100644
--- a/compiler/utils/mips64/assembler_mips64.cc
+++ b/compiler/utils/mips64/assembler_mips64.cc
@@ -35,12 +35,14 @@
   for (auto& exception_block : exception_blocks_) {
     EmitExceptionPoll(&exception_block);
   }
+  ReserveJumpTableSpace();
   EmitLiterals();
   PromoteBranches();
 }
 
 void Mips64Assembler::FinalizeInstructions(const MemoryRegion& region) {
   EmitBranches();
+  EmitJumpTables();
   Assembler::FinalizeInstructions(region);
   PatchCFI();
 }
@@ -470,6 +472,10 @@
   EmitI(0xf, static_cast<GpuRegister>(0), rt, imm16);
 }
 
+void Mips64Assembler::Aui(GpuRegister rt, GpuRegister rs, uint16_t imm16) {
+  EmitI(0xf, rs, rt, imm16);
+}
+
 void Mips64Assembler::Dahi(GpuRegister rs, uint16_t imm16) {
   EmitI(1, rs, static_cast<GpuRegister>(6), imm16);
 }
@@ -1069,6 +1075,20 @@
   TemplateLoadConst64(this, rd, value);
 }
 
+void Mips64Assembler::Addiu32(GpuRegister rt, GpuRegister rs, int32_t value) {
+  if (IsInt<16>(value)) {
+    Addiu(rt, rs, value);
+  } else {
+    int16_t high = High16Bits(value);
+    int16_t low = Low16Bits(value);
+    high += (low < 0) ? 1 : 0;  // Account for sign extension in addiu.
+    Aui(rt, rs, high);
+    if (low != 0) {
+      Addiu(rt, rt, low);
+    }
+  }
+}
+
 void Mips64Assembler::Daddiu64(GpuRegister rt, GpuRegister rs, int64_t value, GpuRegister rtmp) {
   if (IsInt<16>(value)) {
     Daddiu(rt, rs, value);
@@ -1641,6 +1661,67 @@
   FinalizeLabeledBranch(label);
 }
 
+JumpTable* Mips64Assembler::CreateJumpTable(std::vector<Mips64Label*>&& labels) {
+  jump_tables_.emplace_back(std::move(labels));
+  JumpTable* table = &jump_tables_.back();
+  DCHECK(!table->GetLabel()->IsBound());
+  return table;
+}
+
+void Mips64Assembler::ReserveJumpTableSpace() {
+  if (!jump_tables_.empty()) {
+    for (JumpTable& table : jump_tables_) {
+      Mips64Label* label = table.GetLabel();
+      Bind(label);
+
+      // Bulk ensure capacity, as this may be large.
+      size_t orig_size = buffer_.Size();
+      size_t required_capacity = orig_size + table.GetSize();
+      if (required_capacity > buffer_.Capacity()) {
+        buffer_.ExtendCapacity(required_capacity);
+      }
+#ifndef NDEBUG
+      buffer_.has_ensured_capacity_ = true;
+#endif
+
+      // Fill the space with dummy data, as the data is not final
+      // until the branches have been promoted, and we shouldn't
+      // be moving uninitialized data during branch promotion.
+      for (size_t cnt = table.GetData().size(), i = 0; i < cnt; i++) {
+        buffer_.Emit<uint32_t>(0x1abe1234u);
+      }
+
+#ifndef NDEBUG
+      buffer_.has_ensured_capacity_ = false;
+#endif
+    }
+  }
+}
+
+void Mips64Assembler::EmitJumpTables() {
+  if (!jump_tables_.empty()) {
+    CHECK(!overwriting_);
+    // Switch from appending instructions at the end of the buffer to overwriting
+    // existing instructions (here, jump tables) in the buffer.
+    overwriting_ = true;
+
+    for (JumpTable& table : jump_tables_) {
+      Mips64Label* table_label = table.GetLabel();
+      uint32_t start = GetLabelLocation(table_label);
+      overwrite_location_ = start;
+
+      for (Mips64Label* target : table.GetData()) {
+        CHECK_EQ(buffer_.Load<uint32_t>(overwrite_location_), 0x1abe1234u);
+        // The table will contain target addresses relative to the table start.
+        uint32_t offset = GetLabelLocation(target) - start;
+        Emit(offset);
+      }
+    }
+
+    overwriting_ = false;
+  }
+}
+
 void Mips64Assembler::EmitLiterals() {
   if (!literals_.empty()) {
     for (Literal& literal : literals_) {
diff --git a/compiler/utils/mips64/assembler_mips64.h b/compiler/utils/mips64/assembler_mips64.h
index 08a55ed..c6d3119 100644
--- a/compiler/utils/mips64/assembler_mips64.h
+++ b/compiler/utils/mips64/assembler_mips64.h
@@ -357,6 +357,36 @@
   DISALLOW_COPY_AND_ASSIGN(Literal);
 };
 
+// Jump table: table of labels emitted after the code and before the literals. Similar to literals.
+class JumpTable {
+ public:
+  explicit JumpTable(std::vector<Mips64Label*>&& labels)
+      : label_(), labels_(std::move(labels)) {
+  }
+
+  size_t GetSize() const {
+    return labels_.size() * sizeof(uint32_t);
+  }
+
+  const std::vector<Mips64Label*>& GetData() const {
+    return labels_;
+  }
+
+  Mips64Label* GetLabel() {
+    return &label_;
+  }
+
+  const Mips64Label* GetLabel() const {
+    return &label_;
+  }
+
+ private:
+  Mips64Label label_;
+  std::vector<Mips64Label*> labels_;
+
+  DISALLOW_COPY_AND_ASSIGN(JumpTable);
+};
+
 // Slowpath entered when Thread::Current()->_exception is non-null.
 class Mips64ExceptionSlowPath {
  public:
@@ -388,6 +418,7 @@
         overwrite_location_(0),
         literals_(arena->Adapter(kArenaAllocAssembler)),
         long_literals_(arena->Adapter(kArenaAllocAssembler)),
+        jump_tables_(arena->Adapter(kArenaAllocAssembler)),
         last_position_adjustment_(0),
         last_old_position_(0),
         last_branch_id_(0) {
@@ -478,6 +509,7 @@
   void Lwupc(GpuRegister rs, uint32_t imm19);  // MIPS64
   void Ldpc(GpuRegister rs, uint32_t imm18);  // MIPS64
   void Lui(GpuRegister rt, uint16_t imm16);
+  void Aui(GpuRegister rt, GpuRegister rs, uint16_t imm16);
   void Dahi(GpuRegister rs, uint16_t imm16);  // MIPS64
   void Dati(GpuRegister rs, uint16_t imm16);  // MIPS64
   void Sync(uint32_t stype);
@@ -617,6 +649,7 @@
   // This function is only used for testing purposes.
   void RecordLoadConst64Path(int value);
 
+  void Addiu32(GpuRegister rt, GpuRegister rs, int32_t value);
   void Daddiu64(GpuRegister rt, GpuRegister rs, int64_t value, GpuRegister rtmp = AT);  // MIPS64
 
   void Bind(Label* label) OVERRIDE {
@@ -674,6 +707,12 @@
   // Load literal using PC-relative loads.
   void LoadLiteral(GpuRegister dest_reg, LoadOperandType load_type, Literal* literal);
 
+  // Create a jump table for the given labels that will be emitted when finalizing.
+  // When the table is emitted, offsets will be relative to the location of the table.
+  // The table location is determined by the location of its label (the label precedes
+  // the table data) and should be loaded using LoadLabelAddress().
+  JumpTable* CreateJumpTable(std::vector<Mips64Label*>&& labels);
+
   void Bc(Mips64Label* label);
   void Balc(Mips64Label* label);
   void Bltc(GpuRegister rs, GpuRegister rt, Mips64Label* label);
@@ -1048,6 +1087,8 @@
   const Branch* GetBranch(uint32_t branch_id) const;
 
   void EmitLiterals();
+  void ReserveJumpTableSpace();
+  void EmitJumpTables();
   void PromoteBranches();
   void EmitBranch(Branch* branch);
   void EmitBranches();
@@ -1071,6 +1112,9 @@
   ArenaDeque<Literal> literals_;
   ArenaDeque<Literal> long_literals_;  // 64-bit literals separated for alignment reasons.
 
+  // Jump table list.
+  ArenaDeque<JumpTable> jump_tables_;
+
   // Data for AdjustedPosition(), see the description there.
   uint32_t last_position_adjustment_;
   uint32_t last_old_position_;
diff --git a/compiler/utils/mips64/assembler_mips64_test.cc b/compiler/utils/mips64/assembler_mips64_test.cc
index 9d0d0fc..e6fc6fb 100644
--- a/compiler/utils/mips64/assembler_mips64_test.cc
+++ b/compiler/utils/mips64/assembler_mips64_test.cc
@@ -1888,9 +1888,9 @@
   DriverStr(expected, "StoreFpuToOffset");
 }
 
-///////////////////////
-// Loading Constants //
-///////////////////////
+//////////////////////////////
+// Loading/adding Constants //
+//////////////////////////////
 
 TEST_F(AssemblerMIPS64Test, LoadConst32) {
   // IsUint<16>(value)
@@ -1933,6 +1933,31 @@
   DriverStr(expected, "LoadConst32");
 }
 
+TEST_F(AssemblerMIPS64Test, Addiu32) {
+  __ Addiu32(mips64::A1, mips64::A2, -0x8000);
+  __ Addiu32(mips64::A1, mips64::A2, +0);
+  __ Addiu32(mips64::A1, mips64::A2, +0x7FFF);
+  __ Addiu32(mips64::A1, mips64::A2, -0x8001);
+  __ Addiu32(mips64::A1, mips64::A2, +0x8000);
+  __ Addiu32(mips64::A1, mips64::A2, -0x10000);
+  __ Addiu32(mips64::A1, mips64::A2, +0x10000);
+  __ Addiu32(mips64::A1, mips64::A2, +0x12345678);
+
+  const char* expected =
+      "addiu $a1, $a2, -0x8000\n"
+      "addiu $a1, $a2, 0\n"
+      "addiu $a1, $a2, 0x7FFF\n"
+      "aui $a1, $a2, 0xFFFF\n"
+      "addiu $a1, $a1, 0x7FFF\n"
+      "aui $a1, $a2, 1\n"
+      "addiu $a1, $a1, -0x8000\n"
+      "aui $a1, $a2, 0xFFFF\n"
+      "aui $a1, $a2, 1\n"
+      "aui $a1, $a2, 0x1234\n"
+      "addiu $a1, $a1, 0x5678\n";
+  DriverStr(expected, "Addiu32");
+}
+
 static uint64_t SignExtend16To64(uint16_t n) {
   return static_cast<int16_t>(n);
 }