[intrinsics] Emit opcodes for Mem operands and fully support FCsr

This also enables the FCsr tests now that the intrinsics are turned on.

Bug: 291126259
Test: mm and berberis_host_tests
(cherry picked from https://googleplex-android-review.googlesource.com/q/commit:728d170ac92a1dd3a5b14e249e191b75e8add1f9)

Merged-In: I194d5897e4ef57f655d2957a6b01974837eb277c
Change-Id: I194d5897e4ef57f655d2957a6b01974837eb277c
diff --git a/backend/Android.bp b/backend/Android.bp
index f05d744..5481d72 100644
--- a/backend/Android.bp
+++ b/backend/Android.bp
@@ -149,6 +149,7 @@
     name: "libberberis_backend_headers_riscv64_to_x86_64",
     defaults: ["berberis_defaults_64"],
     host_supported: true,
+    export_include_dirs: ["riscv64_to_x86_64/include"],
     header_libs: [
         "libberberis_backend_headers",
         "libberberis_guest_state_riscv64_headers",
diff --git a/backend/include/berberis/backend/x86_64/machine_ir.h b/backend/include/berberis/backend/x86_64/machine_ir.h
index d8d4a67..259c9fd 100644
--- a/backend/include/berberis/backend/x86_64/machine_ir.h
+++ b/backend/include/berberis/backend/x86_64/machine_ir.h
@@ -43,6 +43,7 @@
   kMachineOpPseudoJump,
   kMachineOpPseudoReadFlags,
   kMachineOpPseudoWriteFlags,
+#include "berberis/backend/x86_64/machine_opcode_guest-inl.h"
 #include "machine_opcode_x86_64-inl.h"  // NOLINT generated file!
 };
 
diff --git a/backend/riscv64_to_x86_64/include/berberis/backend/x86_64/machine_opcode_guest-inl.h b/backend/riscv64_to_x86_64/include/berberis/backend/x86_64/machine_opcode_guest-inl.h
new file mode 100644
index 0000000..7707972
--- /dev/null
+++ b/backend/riscv64_to_x86_64/include/berberis/backend/x86_64/machine_opcode_guest-inl.h
@@ -0,0 +1,7 @@
+
+kMachineOpMacroFeGetExceptionsTranslateMemBaseDispReg,
+    kMachineOpMacroFeSetExceptionsAndRoundImmTranslateMemBaseDispImm,
+    kMachineOpMacroFeSetExceptionsAndRoundTranslateRegMemBaseDispRegReg,
+    kMachineOpMacroFeSetExceptionsImmTranslateMemBaseDispImm,
+    kMachineOpMacroFeSetExceptionsTranslateRegMemBaseDispReg,
+    kMachineOpMacroFeSetRoundImmTranslateMemBaseDispMemBaseDispImm,
diff --git a/heavy_optimizer/riscv64/frontend.h b/heavy_optimizer/riscv64/frontend.h
index 7294bec..54cbe09 100644
--- a/heavy_optimizer/riscv64/frontend.h
+++ b/heavy_optimizer/riscv64/frontend.h
@@ -499,9 +499,8 @@
 HeavyOptimizerFrontend::GetCsr<CsrName::kFCsr>() {
   auto csr_reg = AllocTempReg();
   auto tmp = AllocTempReg();
-  bool inline_successful = TryInlineIntrinsicForHeavyOptimizer<&intrinsics::FeGetExceptions>(
+  InlineIntrinsicForHeavyOptimizer<&intrinsics::FeGetExceptions>(
       &builder_, tmp, GetFlagsRegister());
-  CHECK(inline_successful);
   Gen<x86_64::MovzxbqRegMemBaseDisp>(
       csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>);
   Gen<x86_64::ShlbRegImm>(csr_reg, 5, GetFlagsRegister());
@@ -540,52 +539,38 @@
 }
 
 template <>
-inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(uint8_t /* imm */) {
-  Unimplemented();
-  // TODO(b/291126436) Figure out how to pass Mem arg to FeSetExceptionsAndRoundImmTranslate.
-  // // Note: instructions Csrrci or Csrrsi couldn't affect Frm because immediate only has five
-  // bits.
-  // // But these instruction don't pass their immediate-specified argument into `SetCsr`, they
-  // combine
-  // // it with register first. Fixing that can only be done by changing code in the semantics
-  // player.
-  // //
-  // // But Csrrwi may clear it.  And we actually may only arrive here from Csrrwi.
-  // // Thus, technically, we know that imm >> 5 is always zero, but it doesn't look like a good
-  // idea
-  // // to rely on that: it's very subtle and it only affects code generation speed.
-  // Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>,
-  // static_cast<int8_t>(imm >> 5)); bool successful =
-  // TryInlineIntrinsicForHeavyOptimizer<&intrinsics::FeSetExceptionsAndRoundImmTranslate>(
-  //     &builder_,
-  //     GetFlagsRegister(),
-  //     x86_64::kMachineRegRBP,
-  //     static_cast<int>(offsetof(ThreadState, intrinsics_scratch_area)),
-  //     imm);
-  // CHECK(successful);
+inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(uint8_t imm) {
+  // Note: instructions Csrrci or Csrrsi couldn't affect Frm because immediate only has five bits.
+  // But these instructions don't pass their immediate-specified argument to `SetCsr`; they combine
+  // it with register first. Fixing that can only be done by changing code in the semantics player.
+  //
+  // But Csrrwi may clear it.  And we actually may only arrive here from Csrrwi.
+  // Thus, technically, we know that imm >> 5 is always zero, but it doesn't look like a good idea
+  // to rely on that: it's very subtle and it only affects code generation speed.
+  Gen<x86_64::MovbMemBaseDispImm>(
+      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, static_cast<int8_t>(imm >> 5));
+  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRoundImm>(
+      &builder_, GetFlagsRegister(), imm);
 }
 
 template <>
-inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(Register /* arg */) {
-  Unimplemented();
-  // TODO(b/291126436) Figure out how to pass Mem arg to FeSetExceptionsAndRoundTranslate.
-  // auto tmp1 = AllocTempReg();
-  // auto tmp2 = AllocTempReg();
-  // Gen<PseudoCopy>(tmp1, arg, 1);
-  // Gen<x86_64::AndlRegImm>(tmp1, 0b1'1111, GetFlagsRegister());
-  // Gen<x86_64::ShldlRegRegImm>(tmp2, arg, int8_t{32 - 5}, GetFlagsRegister());
-  // Gen<x86_64::AndbRegImm>(tmp2, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
-  // Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>,
-  //                  tmp2);
-  // bool successful =
-  // TryInlineIntrinsicForHeavyOptimizer<&intrinsics::FeSetExceptionsAndRoundTranslate>(
-  //     &builder_,
-  //     GetFlagsRegister(),
-  //     tmp1,
-  //     x86_64::kMachineRegRBP,
-  //     static_cast<int>(offsetof(ThreadState, intrinsics_scratch_area)),
-  //     tmp1);
-  // CHECK(successful);
+inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(Register arg) {
+  // Check size to be sure we can use Andb and Movb below.
+  static_assert(sizeof(kCsrMask<CsrName::kFrm>) == 1);
+
+  auto exceptions = AllocTempReg();
+  auto rounding_mode = AllocTempReg();
+  Gen<PseudoCopy>(exceptions, arg, 1);
+  Gen<x86_64::AndlRegImm>(exceptions, 0b1'1111, GetFlagsRegister());
+  // We don't care about the data in rounding_mode because we will shift in the
+  // data we need.
+  Gen<PseudoDefReg>(rounding_mode);
+  Gen<x86_64::ShldlRegRegImm>(rounding_mode, arg, int8_t{32 - 5}, GetFlagsRegister());
+  Gen<x86_64::AndbRegImm>(rounding_mode, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
+  Gen<x86_64::MovbMemBaseDispReg>(
+      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, rounding_mode);
+  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRound>(
+      &builder_, GetFlagsRegister(), exceptions, rounding_mode);
 }
 
 template <>
diff --git a/heavy_optimizer/riscv64/inline_intrinsic.h b/heavy_optimizer/riscv64/inline_intrinsic.h
index 443a1f7..a869f6f 100644
--- a/heavy_optimizer/riscv64/inline_intrinsic.h
+++ b/heavy_optimizer/riscv64/inline_intrinsic.h
@@ -29,6 +29,7 @@
 #include "berberis/backend/x86_64/machine_insn_intrinsics.h"
 #include "berberis/backend/x86_64/machine_ir.h"
 #include "berberis/backend/x86_64/machine_ir_builder.h"
+#include "berberis/base/checks.h"
 #include "berberis/base/config.h"
 #include "berberis/base/dependent_false.h"
 #include "berberis/intrinsics/common_to_x86/intrinsics_bindings.h"
@@ -490,6 +491,16 @@
       builder, result, flag_register, args...);
 }
 
+template <auto kFunction, typename ResType, typename FlagRegister, typename... ArgType>
+void InlineIntrinsicForHeavyOptimizer(x86_64::MachineIRBuilder* builder,
+                                      ResType result,
+                                      FlagRegister flag_register,
+                                      ArgType... args) {
+  bool success = TryInlineIntrinsicForHeavyOptimizer<kFunction, ResType, FlagRegister, ArgType...>(
+      builder, result, flag_register, args...);
+  CHECK(success);
+}
+
 template <auto kFunction, typename FlagRegister, typename... ArgType>
 bool TryInlineIntrinsicForHeavyOptimizerVoid(x86_64::MachineIRBuilder* builder,
                                              FlagRegister flag_register,
@@ -505,6 +516,15 @@
       builder, std::monostate{}, flag_register, args...);
 }
 
+template <auto kFunction, typename FlagRegister, typename... ArgType>
+void InlineIntrinsicForHeavyOptimizerVoid(x86_64::MachineIRBuilder* builder,
+                                          FlagRegister flag_register,
+                                          ArgType... args) {
+  bool success = TryInlineIntrinsicForHeavyOptimizerVoid<kFunction, FlagRegister, ArgType...>(
+      builder, flag_register, args...);
+  CHECK(success);
+}
+
 }  // namespace berberis
 
 #endif  // BERBERIS_HEAVY_OPTIMIZER_RISCV64_INLINE_INTRINSIC_H_
diff --git a/intrinsics/gen_intrinsics.py b/intrinsics/gen_intrinsics.py
index 2d56a66..74d2fed 100755
--- a/intrinsics/gen_intrinsics.py
+++ b/intrinsics/gen_intrinsics.py
@@ -785,7 +785,9 @@
 
 def _gen_opcode_generator(asm, opcode_generators):
   name = asm['name']
-  opcode = 'Undefined' if any([arg.get('class').startswith("Mem") and arg.get('usage') == 'def_early_clobber' for arg in asm['args']]) else name
+  num_mem_args = sum(1 for arg in asm['args'] if arg.get('class').startswith("Mem") and arg.get('usage') == 'def_early_clobber')
+  opcode = 'Undefined' if num_mem_args > 2 else (asm_defs.get_mem_macro_name(asm, '').replace("Mem", "MemBaseDisp")) if num_mem_args > 0 else name
+
   if name not in opcode_generators:
     opcode_generators[name] = True
     yield """
diff --git a/test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h b/test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h
index f3196ee..cb5c7fe 100644
--- a/test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h
+++ b/test_utils/include/berberis/test_utils/insn_tests_riscv64-inl.h
@@ -1126,8 +1126,6 @@
   TestFrm(0x0020f173, 0, 0);
 }
 
-#if defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR)
-
 TEST_F(TESTSUITE, FCsrRegister) {
   fenv_t saved_environment;
   EXPECT_EQ(fegetenv(&saved_environment), 0);
@@ -1171,8 +1169,6 @@
   EXPECT_EQ(fesetenv(&saved_environment), 0);
 }
 
-#endif  // defined(TESTING_INTERPRETER) || defined(TESTING_LITE_TRANSLATOR)
-
 TEST_F(TESTSUITE, FFlagsRegister) {
   fenv_t saved_environment;
   EXPECT_EQ(fegetenv(&saved_environment), 0);