blob: 231f49acb1f4ae6b9fb766857959b50c43a524f2 [file] [log] [blame]
/*
* Copyright (C) 2020 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_
#define RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_
#include "berberis/base/bit_util.h"
#include "berberis/intrinsics/macro_assembler.h"
#include "berberis/intrinsics/macro_assembler_constants_pool.h"
namespace berberis {
// Exceptions are at position 0 in both x87 status word and mxcsr.
// But rounding mode is at a different position in each register.
//
// Note: these used to live in an anonymous namespace, but an anonymous namespace in a
// header gives every translation unit its own copy of each entity, which is an ODR hazard
// when they are referenced from the templates below.  C++17 `inline constexpr` guarantees
// a single definition across all translation units.
inline constexpr int8_t kX87RmPosition = 10;
inline constexpr int8_t kMxcsrRmPosition = 13;
// Mask to clean exceptions and rm fields.
// Bit 1 (denormal flag) is deliberately excluded: RISC-V doesn't have denormal exceptions.
inline constexpr int8_t kX87MxcsrExceptionBits = 0b11'1101;
inline constexpr int16_t kX87RoundingBits = 0b11 << kX87RmPosition;
inline constexpr int16_t kMxcsrRoundingBits = 0b11 << kMxcsrRmPosition;
// Because rounding mode is only two bits on x86 we can compress the table which converts from
// RISC-V rounding mode to x87/SSE rounding mode into one integer.
// Each element of the table is two bits here:
// FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table.
// Note: we never convert from x86 rounding mode to RISC-V rounding mode because there are
// more rounding modes on RISC-V which means we have to keep these in the emulated CPU state.
inline constexpr int32_t kRiscVRoundingModes = 0b1110'0111'00;
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroCanonicalizeNan(XMMRegister result, XMMRegister src) {
  // Replaces any NaN in src with the canonical NaN pattern for FloatType; non-NaN values
  // pass through unchanged.  Note: src is clobbered.
  Pmov(result, src);
  // result becomes an all-ones mask in lanes where src is ordered (not NaN), zeros otherwise.
  Cmpords<FloatType>(result, src);
  // Keep the original value in non-NaN lanes.
  Pand(src, result);
  // Select the canonical NaN bit pattern in NaN lanes.
  Pandn(result, {.disp = constants_pool::kCanonicalNans<FloatType>});
  // Merge the two selections into result.
  Por(result, src);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroCanonicalizeNanAVX(XMMRegister result, XMMRegister src) {
  // AVX version of MacroCanonicalizeNan: replaces any NaN in src with the canonical NaN
  // pattern for FloatType; non-NaN values pass through unchanged.  Note: src is clobbered.
  // result becomes an all-ones mask in lanes where src is ordered (not NaN), zeros otherwise.
  Vcmpords<FloatType>(result, src, src);
  // Keep the original value in non-NaN lanes.
  Vpand(src, src, result);
  // Select the canonical NaN bit pattern in NaN lanes.
  Vpandn(result, result, {.disp = constants_pool::kCanonicalNans<FloatType>});
  // Merge the two selections into result.
  Vpor(result, result, src);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFeq(Register result, XMMRegister src1, XMMRegister src2) {
  // Implements RISC-V FEQ: result = (src1 == src2) ? 1 : 0.  Note: src1 is clobbered.
  // src1 becomes an all-ones mask if equal, all-zeros otherwise.
  Cmpeqs<FloatType>(src1, src2);
  // Move the (low element of the) mask into the integer register.
  Mov<FloatType>(result, src1);
  // Normalize the all-ones mask to exactly 1.
  And<int32_t>(result, 1);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFeqAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  // AVX version of MacroFeq: result = (src1 == src2) ? 1 : 0.  Unlike the SSE version the
  // source registers are preserved; the comparison mask is built in tmp.
  Vcmpeqs<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  // Normalize the all-ones mask to exactly 1.
  And<int32_t>(result, 1);
}
// Note: result is returned in %rax which is implicit argument of that macro-instruction.
// Explicit argument is temporary needed to handle Stmxcsr instruction.
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeGetExceptionsTranslate(const Operand& mxcsr_scratch) {
  // Reads the accrued FP exception flags from both the x87 and SSE units and translates the
  // union of them into RISC-V fflags format via the kX87ToRiscVExceptions table.
  // Store x87 status word in the AX.
  Fnstsw();
  // Store MXCSR in scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Merge x87 status word and MXCSR.
  Or<uint32_t>(gpr_a, mxcsr_scratch);
  // Leave only exceptions (this also clears any stale upper bits of the register).
  And<uint32_t>(gpr_a, kX87MxcsrExceptionBits);
  // Convert exception bits with a table lookup: gpr_a = kX87ToRiscVExceptions[gpr_a].
  Expand<uint64_t, uint8_t>(gpr_a,
                            {.index = gpr_a,
                             .scale = Assembler::kTimesOne,
                             .disp = constants_pool::kX87ToRiscVExceptions});
}
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsAndRoundImmTranslate(
    const Operand& fenv_scratch,
    int8_t exceptions_and_rm) {
  // Sets both the accrued FP exception flags and the rounding mode of the x87 and SSE units
  // from an immediate: low five bits are RISC-V exception flags, top three bits are the
  // RISC-V rounding mode.
  int8_t riscv_exceptions = exceptions_and_rm & 0b1'1111;
  int8_t rounding_mode = static_cast<uint8_t>(exceptions_and_rm) >> 5;
  // The x87 status word lives at offset 4 in the 32bit/64bit environment layout — not at
  // offset 2 as one may imagine: the two bytes after the control word are ignored.
  Operand status_word{.base = fenv_scratch.base,
                      .index = fenv_scratch.index,
                      .scale = fenv_scratch.scale,
                      .disp = fenv_scratch.disp + 4};
  // MXCSR is kept right behind the 28-byte x87 environment.
  Operand mxcsr_slot{.base = fenv_scratch.base,
                     .index = fenv_scratch.index,
                     .scale = fenv_scratch.scale,
                     .disp = fenv_scratch.disp + 28};
  // Translate RISC-V exception flags into their x87 encoding at translation time.
  uint8_t host_exceptions = bit_cast<unsigned char*>(
      static_cast<uintptr_t>(constants_pool::kRiscVToX87Exceptions))[riscv_exceptions];
  // The whole environment must be spilled: there is no instruction that updates only the
  // status word without touching the rest of the x87 state.
  Fnstenv(fenv_scratch);
  // Spill MXCSR as well.
  Stmxcsr(mxcsr_slot);
  // Drop stale exception flags from the saved x87 status word.
  And<uint8_t>(status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Drop the rounding-mode field from the saved x87 control word.
  And<uint16_t>(fenv_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Drop both the rounding-mode field and the exception flags from the saved MXCSR.
  And<uint32_t>(mxcsr_slot, static_cast<uint32_t>(~(kX87MxcsrExceptionBits | kMxcsrRoundingBits)));
  if (host_exceptions) {
    // Inject the requested exception flags into the x87 status word.
    Or<uint8_t>(status_word, host_exceptions);
  }
  if (rounding_mode) {
    // Inject the translated rounding mode into the x87 control word.
    Or<uint16_t>(fenv_scratch,
                 (((kRiscVRoundingModes << kX87RmPosition) >> (rounding_mode * 2)) &
                  kX87RoundingBits));
  }
  if (exceptions_and_rm) {
    // Inject both the exception flags and the translated rounding mode into MXCSR.
    Or<uint32_t>(mxcsr_slot,
                 host_exceptions | (((kRiscVRoundingModes << kMxcsrRmPosition) >>
                                     (rounding_mode * 2)) &
                                    kMxcsrRoundingBits));
  }
  // Reload the patched x87 environment.
  Fldenv(fenv_scratch);
  // Reload the patched MXCSR.
  Ldmxcsr(mxcsr_slot);
}
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsAndRoundTranslate(Register exceptions,
                                                                      const Operand& fenv_scratch,
                                                                      Register scratch_register) {
  // Sets the accrued FP exception flags (from `exceptions`, in RISC-V format) and the
  // rounding mode (implicitly expected in %cl — see the ShrByCl uses below) of both the x87
  // and SSE units.
  // The x87 status word lives at offset 4 in the 32bit/64bit environment layout — not at
  // offset 2 as one may imagine: the two bytes after the control word are ignored.
  Operand status_word{.base = fenv_scratch.base,
                      .index = fenv_scratch.index,
                      .scale = fenv_scratch.scale,
                      .disp = fenv_scratch.disp + 4};
  // MXCSR is kept right behind the 28-byte x87 environment.
  Operand mxcsr_slot{.base = fenv_scratch.base,
                     .index = fenv_scratch.index,
                     .scale = fenv_scratch.scale,
                     .disp = fenv_scratch.disp + 28};
  // The whole environment must be spilled: there is no instruction that updates only the
  // status word without touching the rest of the x87 state.
  Fnstenv(fenv_scratch);
  // Spill MXCSR as well.
  Stmxcsr(mxcsr_slot);
  // scratch_register = kRiscVToX87Exceptions[exceptions]: translate flags to x87 encoding.
  Mov<uint8_t>(scratch_register,
               {.index = exceptions,
                .scale = Assembler::kTimesOne,
                .disp = constants_pool::kRiscVToX87Exceptions});
  // Drop stale exception flags from the saved x87 status word.
  And<uint8_t>(status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Drop the rounding-mode field from the saved x87 control word.
  And<uint16_t>(fenv_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Drop both the rounding-mode field and the exception flags from the saved MXCSR.
  And<uint32_t>(mxcsr_slot, static_cast<uint32_t>(~(kX87MxcsrExceptionBits | kMxcsrRoundingBits)));
  // Inject the translated exception flags into the x87 status word.
  Or<uint8_t>(status_word, scratch_register);
  // Inject the translated exception flags into MXCSR.
  Or<uint8_t>(mxcsr_slot, scratch_register);
  // Load the packed rounding-mode table (FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD,
  // FE_UPWARD, FE_TOWARDZERO) pre-shifted to the x87 RM field at bits 10-11.
  Mov<uint32_t>(scratch_register, kRiscVRoundingModes << kX87RmPosition);
  // Each table entry is two bits, so shift right by %cl twice to select entry "rm".
  // Two shifts keep "rm" in CL intact (and cost the same as one shift on most CPUs).
  ShrByCl<uint32_t>(scratch_register);
  ShrByCl<uint32_t>(scratch_register);
  // Keep only the x87-RM bits.
  And<uint32_t>(scratch_register, kX87RoundingBits);
  // Inject the x87-RM field into the x87 control word.
  Or<uint16_t>(fenv_scratch, scratch_register);
  // MXCSR keeps its RM field three bits higher than the x87 control word does.
  Shl<uint32_t>(scratch_register, int8_t{3});
  // Inject the MXCSR-RM field into MXCSR.
  Or<uint32_t>(mxcsr_slot, scratch_register);
  // Reload the patched x87 environment.
  Fldenv(fenv_scratch);
  // Reload the patched MXCSR.
  Ldmxcsr(mxcsr_slot);
}
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsImmTranslate(const Operand& fenv_scratch,
                                                                 int8_t exceptions) {
  // Sets the accrued FP exception flags of both the x87 and SSE units to the immediate
  // RISC-V flag set `exceptions`, leaving rounding mode and all other state untouched.
  // The x87 status word lives at offset 4 in the 32bit/64bit environment layout — not at
  // offset 2 as one may imagine: the two bytes after the control word are ignored.
  Operand status_word{.base = fenv_scratch.base,
                      .index = fenv_scratch.index,
                      .scale = fenv_scratch.scale,
                      .disp = fenv_scratch.disp + 4};
  // MXCSR is kept right behind the 28-byte x87 environment.
  Operand mxcsr_slot{.base = fenv_scratch.base,
                     .index = fenv_scratch.index,
                     .scale = fenv_scratch.scale,
                     .disp = fenv_scratch.disp + 28};
  // Translate RISC-V exception flags into their x87 encoding at translation time.
  uint8_t host_exceptions = bit_cast<unsigned char*>(
      static_cast<uintptr_t>(constants_pool::kRiscVToX87Exceptions))[exceptions];
  // The whole environment must be spilled: there is no instruction that updates only the
  // status word without touching the rest of the x87 state.
  Fnstenv(fenv_scratch);
  // Spill MXCSR as well.
  Stmxcsr(mxcsr_slot);
  // Drop stale exception flags from the saved x87 status word.
  And<uint8_t>(status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Drop stale exception flags from the saved MXCSR.
  And<uint8_t>(mxcsr_slot, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  if (host_exceptions) {
    // Inject the requested flags into the x87 status word.
    Or<uint8_t>(status_word, host_exceptions);
    // Inject the requested flags into MXCSR.
    Or<uint8_t>(mxcsr_slot, host_exceptions);
  }
  // Reload the patched x87 environment.
  Fldenv(fenv_scratch);
  // Reload the patched MXCSR.
  Ldmxcsr(mxcsr_slot);
}
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetExceptionsTranslate(Register exceptions,
                                                              const Operand& fenv_scratch,
                                                              Register x87_exceptions) {
  // Sets the accrued FP exception flags of both the x87 and SSE units from the RISC-V flag
  // set held in `exceptions`; `x87_exceptions` receives the translated x87 encoding.
  // The x87 status word lives at offset 4 in the 32bit/64bit environment layout — not at
  // offset 2 as one may imagine: the two bytes after the control word are ignored.
  Operand status_word{.base = fenv_scratch.base,
                      .index = fenv_scratch.index,
                      .scale = fenv_scratch.scale,
                      .disp = fenv_scratch.disp + 4};
  // MXCSR is kept right behind the 28-byte x87 environment.
  Operand mxcsr_slot{.base = fenv_scratch.base,
                     .index = fenv_scratch.index,
                     .scale = fenv_scratch.scale,
                     .disp = fenv_scratch.disp + 28};
  // The whole environment must be spilled: there is no instruction that updates only the
  // status word without touching the rest of the x87 state.
  Fnstenv(fenv_scratch);
  // Spill MXCSR as well.
  Stmxcsr(mxcsr_slot);
  // x87_exceptions = kRiscVToX87Exceptions[exceptions]: translate flags to x87 encoding.
  Mov<uint8_t>(x87_exceptions,
               {.index = exceptions,
                .scale = Assembler::kTimesOne,
                .disp = constants_pool::kRiscVToX87Exceptions});
  // Drop stale exception flags from the saved x87 status word.
  And<uint8_t>(status_word, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Drop stale exception flags from the saved MXCSR.
  And<uint8_t>(mxcsr_slot, static_cast<uint8_t>(~kX87MxcsrExceptionBits));
  // Inject the translated flags into the x87 status word.
  Or<uint8_t>(status_word, x87_exceptions);
  // Inject the translated flags into MXCSR.
  Or<uint8_t>(mxcsr_slot, x87_exceptions);
  // Reload the patched x87 environment.
  Fldenv(fenv_scratch);
  // Reload the patched MXCSR.
  Ldmxcsr(mxcsr_slot);
}
// Note: actual rounding mode comes in %cl which is implicit argument of that macro-instruction.
// All explicit arguments are temporaries.
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetRound(Register x87_sse_round,
                                                const Operand& cw_scratch,
                                                const Operand& mxcsr_scratch) {
  // Sets the rounding mode of both the x87 and SSE units from the RISC-V rounding mode
  // in %cl, leaving exception flags and all other control bits untouched.
  // Store x87 control word in first scratch slot.
  Fnstcw(cw_scratch);
  // Store MXCSR in second scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Clean-out x87-RM field in x87 control word.
  And<uint16_t>(cw_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean-out MXCSR-RM field in MXCSR.
  And<uint32_t>(mxcsr_scratch, static_cast<uint32_t>(~kMxcsrRoundingBits));
  // FE_TONEAREST, FE_TOWARDZERO, FE_DOWNWARD, FE_UPWARD, FE_TOWARDZERO table from bits 10-11:
  Mov<uint32_t>(x87_sse_round, kRiscVRoundingModes << kX87RmPosition);
  // Shift by "rm" to get appropriate bits, suitable for x87 FPU control word.
  ShrByCl<uint32_t>(x87_sse_round);
  // Each field is two bits so we need to shift by "rm" twice.
  // By doing it with 2x shifts we keep "rm" in CL intact (and speed is the same on most CPUs).
  ShrByCl<uint32_t>(x87_sse_round);
  // Mask only x87-RM bits.
  And<uint32_t>(x87_sse_round, kX87RoundingBits);
  // Push x87-RM field into x87 control word.
  Or<uint16_t>(cw_scratch, x87_sse_round);
  // Move x87-RM field (bits 10-11) into MXCSR-RM field (bits 13-14).
  Shl<uint32_t>(x87_sse_round, int8_t{3});
  // Push MXCSR-RM field into MXCSR.
  Or<uint32_t>(mxcsr_scratch, x87_sse_round);
  // Load new control word into x87 FPU.
  Fldcw(cw_scratch);
  // Load Mxcsr.
  Ldmxcsr(mxcsr_scratch);
}
template <typename Assembler>
void MacroAssembler<Assembler>::MacroFeSetRoundImmTranslate(const Operand& cw_scratch,
                                                            const Operand& mxcsr_scratch,
                                                            int8_t rm) {
  // Sets the rounding mode of both the x87 and SSE units from the immediate RISC-V rounding
  // mode `rm` (translated at translation time), leaving all other control bits untouched.
  // Store x87 control word in first scratch slot.
  Fnstcw(cw_scratch);
  // Store MXCSR in second scratch slot.
  Stmxcsr(mxcsr_scratch);
  // Clean-out x87-RM field in x87 control word.
  And<uint16_t>(cw_scratch, static_cast<uint16_t>(~kX87RoundingBits));
  // Clean-out MXCSR-RM field in MXCSR.
  And<uint32_t>(mxcsr_scratch, static_cast<uint32_t>(~kMxcsrRoundingBits));
  // rm == 0 maps to round-to-nearest which is also the all-zeroes x86 encoding, so the
  // cleared fields are already correct in that case.
  if (rm) {
    // If rounding mode is not zero then convert RISC-V rounding mode and store it in control
    // word (each table entry is two bits, hence the shift by rm * 2).
    Or<uint16_t>(cw_scratch,
                 (((kRiscVRoundingModes << kX87RmPosition) >> (rm * 2)) & kX87RoundingBits));
    // If rounding mode is not zero then convert RISC-V rounding mode and store it in MXCSR.
    Or<uint32_t>(mxcsr_scratch,
                 ((kRiscVRoundingModes << kMxcsrRmPosition) >> (rm * 2)) & kMxcsrRoundingBits);
  }
  // Load new control word into x87 FPU.
  Fldcw(cw_scratch);
  // Load Mxcsr.
  Ldmxcsr(mxcsr_scratch);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFle(Register result, XMMRegister src1, XMMRegister src2) {
  // Implements RISC-V FLE: result = (src1 <= src2) ? 1 : 0.  Note: src1 is clobbered.
  // src1 becomes an all-ones mask if less-or-equal, all-zeros otherwise.
  Cmples<FloatType>(src1, src2);
  // Move the (low element of the) mask into the integer register.
  Mov<FloatType>(result, src1);
  // Normalize the all-ones mask to exactly 1.
  And<int32_t>(result, 1);
}
template <typename Assembler>
template <typename FormatTo, typename FormatFrom>
void MacroAssembler<Assembler>::MacroFCvtFloatToInteger(Register result, XMMRegister src) {
  // Converts the FormatFrom floating-point value in src to a FormatTo integer in result,
  // delegating to the Cvt macro-instruction for the actual conversion.
  Cvt<FormatFrom, FormatTo>(result, src);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFleAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  // AVX version of MacroFle: result = (src1 <= src2) ? 1 : 0.  Unlike the SSE version the
  // source registers are preserved; the comparison mask is built in tmp.
  Vcmples<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  // Normalize the all-ones mask to exactly 1.
  And<int32_t>(result, 1);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFlt(Register result, XMMRegister src1, XMMRegister src2) {
  // Implements RISC-V FLT: result = (src1 < src2) ? 1 : 0.  Note: src1 is clobbered.
  // src1 becomes an all-ones mask if less-than, all-zeros otherwise.
  Cmplts<FloatType>(src1, src2);
  // Move the (low element of the) mask into the integer register.
  Mov<FloatType>(result, src1);
  // Normalize the all-ones mask to exactly 1.
  And<int32_t>(result, 1);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroFltAVX(Register result,
                                            XMMRegister src1,
                                            XMMRegister src2,
                                            XMMRegister tmp) {
  // AVX version of MacroFlt: result = (src1 < src2) ? 1 : 0.  Unlike the SSE version the
  // source registers are preserved; the comparison mask is built in tmp.
  Vcmplts<FloatType>(tmp, src1, src2);
  Vmov<FloatType>(result, tmp);
  // Normalize the all-ones mask to exactly 1.
  And<int32_t>(result, 1);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroNanBox(XMMRegister arg) {
  // NaN-boxes the Float32 in arg in place by ORing in the kNanBox<Float32> pattern,
  // producing the boxed representation RISC-V expects for narrow floats in wide registers.
  // Only Float32 needs boxing, hence the static_assert.
  static_assert(std::is_same_v<FloatType, Float32>);
  Por(arg, {.disp = constants_pool::kNanBox<Float32>});
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroNanBoxAVX(XMMRegister result, XMMRegister src) {
  // AVX version of MacroNanBox: result = src OR kNanBox<Float32> pattern; src is preserved.
  // Only Float32 needs boxing, hence the static_assert.
  static_assert(std::is_same_v<FloatType, Float32>);
  Vpor(result, src, {.disp = constants_pool::kNanBox<Float32>});
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroUnboxNan(XMMRegister result, XMMRegister src) {
  // Unboxes a NaN-boxed Float32: a properly boxed value passes through, while an improperly
  // boxed one is replaced with the kNanBoxedNans<Float32> pattern.  Note: src is clobbered.
  static_assert(std::is_same_v<FloatType, Float32>);
  Pmov(result, src);
  // Compare 32-bit lanes against the NaN-box pattern; matching lanes become all-ones.
  Pcmpeq<typename TypeTraits<FloatType>::Int>(result, {.disp = constants_pool::kNanBox<Float32>});
  // Spread the comparison result of the upper half of each 64-bit element over the whole
  // element (kShuffleDDBB duplicates the odd dwords) to build a box-is-valid mask.
  Pshufd(result, result, kShuffleDDBB);
  // Keep the original value where the box is valid.
  Pand(src, result);
  // Select the boxed-NaN replacement where it is not.
  Pandn(result, {.disp = constants_pool::kNanBoxedNans<Float32>});
  // Merge the two selections into result.
  Por(result, src);
}
template <typename Assembler>
template <typename FloatType>
void MacroAssembler<Assembler>::MacroUnboxNanAVX(XMMRegister result, XMMRegister src) {
  // AVX version of MacroUnboxNan: a properly boxed value passes through, while an improperly
  // boxed one is replaced with the kNanBoxedNans<Float32> pattern.  Note: src is clobbered.
  static_assert(std::is_same_v<FloatType, Float32>);
  // Compare 32-bit lanes against the NaN-box pattern; matching lanes become all-ones.
  Vpcmpeq<typename TypeTraits<FloatType>::Int>(
      result, src, {.disp = constants_pool::kNanBox<Float32>});
  // Spread the comparison result of the upper half of each 64-bit element over the whole
  // element (kShuffleDDBB duplicates the odd dwords) to build a box-is-valid mask.
  Vpshufd(result, result, kShuffleDDBB);
  // Keep the original value where the box is valid.
  Vpand(src, src, result);
  // Select the boxed-NaN replacement where it is not.
  Vpandn(result, result, {.disp = constants_pool::kNanBoxedNans<Float32>});
  // Merge the two selections into result.
  Vpor(result, result, src);
}
} // namespace berberis
#endif // RISCV64_TO_X86_64_BERBERIS_INTRINSICS_MACRO_ASSEMBLER_FLOATING_POINT_IMPL_H_