| /* |
| * Copyright (C) 2023 The Android Open Source Project |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| #include "berberis/interpreter/riscv64/interpreter.h" |
| |
| #include <atomic> |
| #include <cfenv> |
#include <cinttypes>
#include <cstdint>
| #include <cstring> |
| |
| #include "berberis/base/bit_util.h" |
| #include "berberis/base/checks.h" |
| #include "berberis/base/macros.h" |
| #include "berberis/decoder/riscv64/decoder.h" |
| #include "berberis/decoder/riscv64/semantics_player.h" |
| #include "berberis/guest_state/guest_addr.h" |
| #include "berberis/guest_state/guest_state.h" |
| #include "berberis/intrinsics/guest_fp_flags.h" // ToHostRoundingMode |
| #include "berberis/intrinsics/intrinsics.h" |
| #include "berberis/intrinsics/intrinsics_float.h" |
| #include "berberis/intrinsics/type_traits.h" |
| #include "berberis/kernel_api/run_guest_syscall.h" |
| #include "berberis/runtime_primitives/memory_region_reservation.h" |
| #include "berberis/runtime_primitives/recovery_code.h" |
| |
| #include "fp_regs.h" |
| |
| namespace berberis { |
| |
| namespace { |
| |
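// Maps the acquire/release bits of RISC-V atomic instructions onto the closest C++
// std::memory_order: both bits set yields acquire-release, a single bit yields acquire or
// release, and neither yields relaxed ordering.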
| inline constexpr std::memory_order AqRlToStdMemoryOrder(bool aq, bool rl) { |
| if (aq) { |
| if (rl) { |
| return std::memory_order_acq_rel; |
| } else { |
| return std::memory_order_acquire; |
| } |
| } else { |
| if (rl) { |
| return std::memory_order_release; |
| } else { |
| return std::memory_order_relaxed; |
| } |
| } |
| } |
| |
| class Interpreter { |
| public: |
| using CsrName = berberis::CsrName; |
| using Decoder = Decoder<SemanticsPlayer<Interpreter>>; |
| using Register = uint64_t; |
| using FpRegister = uint64_t; |
| using Float32 = intrinsics::Float32; |
| using Float64 = intrinsics::Float64; |
| |
| explicit Interpreter(ThreadState* state) : state_(state), branch_taken_(false) {} |
| |
| // |
| // Instruction implementations. |
| // |
| |
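  // Computes the read-modify-write value for the CSRRS/CSRRC family: per the RISC-V Zicsr spec,
  // CSRRS sets the bits that are set in arg while CSRRC clears them. The immediate flavor below
  // reuses this logic since the opcode values correspond.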
| Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr) { |
| switch (opcode) { |
| case Decoder::CsrOpcode::kCsrrs: |
| return arg | csr; |
| case Decoder::CsrOpcode::kCsrrc: |
| return ~arg & csr; |
| default: |
| Unimplemented(); |
| return {}; |
| } |
| } |
| |
| Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr) { |
| return UpdateCsr(static_cast<Decoder::CsrOpcode>(opcode), imm, csr); |
| } |
| |
  // Note: we prefer not to use the C11/C++ atomic_thread_fence or even the gcc/clang builtin
  // __atomic_thread_fence because all of these functions rely on the fact that the compiler never
  // uses non-temporal loads and stores, and they only issue “mfence” when sequentially consistent
  // ordering is requested. They never issue “lfence” or “sfence”.
  // Instead we take a page from the Linux kernel's book and map read ordering to “lfence”, write
  // ordering to “sfence”, and read-write ordering to “mfence”.
  // This may become important in the future if we start using non-temporal moves in manually
  // written assembly code.
  // Ordering that affects I/O devices is not relevant to user-space code, thus we simply ignore
  // the bits related to device I/O.
| void Fence(Decoder::FenceOpcode /*opcode*/, |
| Register /*src*/, |
| bool sw, |
| bool sr, |
| bool /*so*/, |
| bool /*si*/, |
| bool pw, |
| bool pr, |
| bool /*po*/, |
| bool /*pi*/) { |
| bool read_fence = sr | pr; |
| bool write_fence = sw | pw; |
    // Two types of fences (total store ordering fence and normal fence) are supposed to be
    // processed differently, but only in the “read_fence && write_fence” case (otherwise a total
    // store ordering fence becomes a normal fence for “forward compatibility”). Since x86 doesn't
    // distinguish between these two types of fences, and since we are supposed to map all
    // not-yet-defined fences to a normal fence (again, for “forward compatibility”), it's okay to
    // simply ignore the opcode field.
| if (read_fence) { |
| if (write_fence) { |
| asm volatile("mfence" ::: "memory"); |
| } else { |
| asm volatile("lfence" ::: "memory"); |
| } |
| } else if (write_fence) { |
| asm volatile("sfence" ::: "memory"); |
| } |
| } |
| |
| void FenceI(Register /*arg*/, int16_t /*imm*/) { |
    // For interpreter-only mode we don't need to do anything here, but once we have a
    // translator we will need to flush cached translations here.
| } |
| |
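  // Load-reserved/store-conditional pair backing the RISC-V A extension. Per the spec, Sc only
  // performs the store if the reservation taken by the matching Lr is still intact, and it
  // returns zero on success and nonzero on failure; MemoryRegionReservation tracks the
  // reservation itself.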
| template <typename IntType, bool aq, bool rl> |
| Register Lr(int64_t addr) { |
| static_assert(std::is_integral_v<IntType>, "Lr: IntType must be integral"); |
| static_assert(std::is_signed_v<IntType>, "Lr: IntType must be signed"); |
| // Address must be aligned on size of IntType. |
| CHECK((addr % sizeof(IntType)) == 0ULL); |
| return MemoryRegionReservation::Load<IntType>(&state_->cpu, addr, AqRlToStdMemoryOrder(aq, rl)); |
| } |
| |
| template <typename IntType, bool aq, bool rl> |
| Register Sc(int64_t addr, IntType val) { |
| static_assert(std::is_integral_v<IntType>, "Sc: IntType must be integral"); |
| static_assert(std::is_signed_v<IntType>, "Sc: IntType must be signed"); |
| // Address must be aligned on size of IntType. |
| CHECK((addr % sizeof(IntType)) == 0ULL); |
| return static_cast<Register>(MemoryRegionReservation::Store<IntType>( |
| &state_->cpu, addr, val, AqRlToStdMemoryOrder(aq, rl))); |
| } |
| |
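  // The MULH/MULHSU/MULHU cases below need the upper 64 bits of the full 128-bit product, so
  // they widen both operands (signed, signed/unsigned, or unsigned) to 128 bits and shift the
  // result right by 64.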
| Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2) { |
| using uint128_t = unsigned __int128; |
| switch (opcode) { |
| case Decoder::OpOpcode::kAdd: |
| return arg1 + arg2; |
| case Decoder::OpOpcode::kSub: |
| return arg1 - arg2; |
| case Decoder::OpOpcode::kAnd: |
| return arg1 & arg2; |
| case Decoder::OpOpcode::kOr: |
| return arg1 | arg2; |
| case Decoder::OpOpcode::kXor: |
| return arg1 ^ arg2; |
      case Decoder::OpOpcode::kSll:
        // Per the RISC-V spec only the low 6 bits of the shift amount are used for RV64 shifts;
        // masking also avoids undefined behavior in C++ for shift amounts >= 64.
        return arg1 << (arg2 & 63);
      case Decoder::OpOpcode::kSrl:
        return arg1 >> (arg2 & 63);
      case Decoder::OpOpcode::kSra:
        return bit_cast<int64_t>(arg1) >> (arg2 & 63);
| case Decoder::OpOpcode::kSlt: |
| return bit_cast<int64_t>(arg1) < bit_cast<int64_t>(arg2) ? 1 : 0; |
| case Decoder::OpOpcode::kSltu: |
| return arg1 < arg2 ? 1 : 0; |
| case Decoder::OpOpcode::kMul: |
| return arg1 * arg2; |
| case Decoder::OpOpcode::kMulh: |
| return (__int128{bit_cast<int64_t>(arg1)} * __int128{bit_cast<int64_t>(arg2)}) >> 64; |
| case Decoder::OpOpcode::kMulhsu: |
| return (__int128{bit_cast<int64_t>(arg1)} * uint128_t{arg2}) >> 64; |
| case Decoder::OpOpcode::kMulhu: |
| return (uint128_t{arg1} * uint128_t{arg2}) >> 64; |
| case Decoder::OpOpcode::kDiv: |
| return bit_cast<int64_t>(arg1) / bit_cast<int64_t>(arg2); |
| case Decoder::OpOpcode::kDivu: |
| return arg1 / arg2; |
| case Decoder::OpOpcode::kRem: |
| return bit_cast<int64_t>(arg1) % bit_cast<int64_t>(arg2); |
| case Decoder::OpOpcode::kRemu: |
| return arg1 % arg2; |
| case Decoder::OpOpcode::kAndn: |
| return arg1 & (~arg2); |
| case Decoder::OpOpcode::kOrn: |
| return arg1 | (~arg2); |
| case Decoder::OpOpcode::kXnor: |
| return ~(arg1 ^ arg2); |
| default: |
| Unimplemented(); |
| return {}; |
| } |
| } |
| |
| Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2) { |
| switch (opcode) { |
| case Decoder::Op32Opcode::kAddw: |
| return int32_t(arg1) + int32_t(arg2); |
| case Decoder::Op32Opcode::kSubw: |
| return int32_t(arg1) - int32_t(arg2); |
      case Decoder::Op32Opcode::kSllw:
        // Only the low 5 bits of the shift amount are used for the 32-bit shifts.
        return int32_t(arg1) << (int32_t(arg2) & 31);
      case Decoder::Op32Opcode::kSrlw:
        return bit_cast<int32_t>(uint32_t(arg1) >> (uint32_t(arg2) & 31));
      case Decoder::Op32Opcode::kSraw:
        return int32_t(arg1) >> (int32_t(arg2) & 31);
| case Decoder::Op32Opcode::kMulw: |
| return int32_t(arg1) * int32_t(arg2); |
| case Decoder::Op32Opcode::kDivw: |
| return int32_t(arg1) / int32_t(arg2); |
| case Decoder::Op32Opcode::kDivuw: |
| return static_cast<int32_t>(uint32_t(arg1) / uint32_t(arg2)); |
| case Decoder::Op32Opcode::kRemw: |
| return int32_t(arg1) % int32_t(arg2); |
| case Decoder::Op32Opcode::kRemuw: |
| return static_cast<int32_t>(uint32_t(arg1) % uint32_t(arg2)); |
| default: |
| Unimplemented(); |
| return {}; |
| } |
| } |
| |
| Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset) { |
| void* ptr = ToHostAddr<void>(arg + offset); |
| switch (operand_type) { |
| case Decoder::LoadOperandType::k8bitUnsigned: |
| return Load<uint8_t>(ptr); |
| case Decoder::LoadOperandType::k16bitUnsigned: |
| return Load<uint16_t>(ptr); |
| case Decoder::LoadOperandType::k32bitUnsigned: |
| return Load<uint32_t>(ptr); |
| case Decoder::LoadOperandType::k64bit: |
| return Load<uint64_t>(ptr); |
| case Decoder::LoadOperandType::k8bitSigned: |
| return Load<int8_t>(ptr); |
| case Decoder::LoadOperandType::k16bitSigned: |
| return Load<int16_t>(ptr); |
| case Decoder::LoadOperandType::k32bitSigned: |
| return Load<int32_t>(ptr); |
| default: |
| Unimplemented(); |
| return {}; |
| } |
| } |
| |
| template <typename DataType> |
| FpRegister LoadFp(Register arg, int16_t offset) { |
| static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>); |
| DataType* ptr = ToHostAddr<DataType>(arg + offset); |
| FpRegister reg = 0; |
| memcpy(®, ptr, sizeof(DataType)); |
| return reg; |
| } |
| |
| Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm) { |
| switch (opcode) { |
| case Decoder::OpImmOpcode::kAddi: |
| return arg + int64_t{imm}; |
| case Decoder::OpImmOpcode::kSlti: |
| return bit_cast<int64_t>(arg) < int64_t{imm} ? 1 : 0; |
| case Decoder::OpImmOpcode::kSltiu: |
| return arg < bit_cast<uint64_t>(int64_t{imm}) ? 1 : 0; |
| case Decoder::OpImmOpcode::kXori: |
        return arg ^ int64_t{imm};
| case Decoder::OpImmOpcode::kOri: |
| return arg | int64_t{imm}; |
| case Decoder::OpImmOpcode::kAndi: |
| return arg & int64_t{imm}; |
| default: |
| Unimplemented(); |
| return {}; |
| } |
| } |
| |
| Register Lui(int32_t imm) { return int64_t{imm}; } |
| |
| Register Auipc(int32_t imm) { |
| uint64_t pc = state_->cpu.insn_addr; |
| return pc + int64_t{imm}; |
| } |
| |
| Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm) { |
| switch (opcode) { |
| case Decoder::OpImm32Opcode::kAddiw: |
| return int32_t(arg) + int32_t{imm}; |
| default: |
| Unimplemented(); |
| return {}; |
| } |
| } |
| |
| Register Ecall(Register syscall_nr, Register arg0, Register arg1, Register arg2, Register arg3, |
| Register arg4, Register arg5) { |
| return RunGuestSyscall(syscall_nr, arg0, arg1, arg2, arg3, arg4, arg5); |
| } |
| |
| Register Slli(Register arg, int8_t imm) { return arg << imm; } |
| |
| Register Srli(Register arg, int8_t imm) { return arg >> imm; } |
| |
| Register Srai(Register arg, int8_t imm) { return bit_cast<int64_t>(arg) >> imm; } |
| |
| Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm) { |
| switch (opcode) { |
| case Decoder::ShiftImm32Opcode::kSlliw: |
| return int32_t(arg) << int32_t{imm}; |
| case Decoder::ShiftImm32Opcode::kSrliw: |
| return bit_cast<int32_t>(uint32_t(arg) >> uint32_t{imm}); |
| case Decoder::ShiftImm32Opcode::kSraiw: |
| return int32_t(arg) >> int32_t{imm}; |
| default: |
| Unimplemented(); |
| return {}; |
| } |
| } |
| |
  Register Rori(Register arg, int8_t shamt) {
    CheckShamtIsValid(shamt);
    if (shamt == 0) {
      return arg;  // Avoids the undefined shift by 64 below.
    }
    return (uint64_t(arg) >> shamt) | (uint64_t(arg) << (64 - shamt));
  }
| |
  Register Roriw(Register arg, int8_t shamt) {
    CheckShamt32IsValid(shamt);
    if (shamt == 0) {
      return int32_t(arg);  // Avoids the undefined shift by 32 below.
    }
    return int32_t((uint32_t(arg) >> shamt) | (uint32_t(arg) << (32 - shamt)));
  }
| |
| void Store(Decoder::StoreOperandType operand_type, Register arg, int16_t offset, Register data) { |
| void* ptr = ToHostAddr<void>(arg + offset); |
| switch (operand_type) { |
| case Decoder::StoreOperandType::k8bit: |
| Store<uint8_t>(ptr, data); |
| break; |
| case Decoder::StoreOperandType::k16bit: |
| Store<uint16_t>(ptr, data); |
| break; |
| case Decoder::StoreOperandType::k32bit: |
| Store<uint32_t>(ptr, data); |
| break; |
| case Decoder::StoreOperandType::k64bit: |
| Store<uint64_t>(ptr, data); |
| break; |
| default: |
| return Unimplemented(); |
| } |
| } |
| |
| template <typename DataType> |
| void StoreFp(Register arg, int16_t offset, FpRegister data) { |
| static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>); |
| DataType* ptr = ToHostAddr<DataType>(arg + offset); |
| memcpy(ptr, &data, sizeof(DataType)); |
| } |
| |
| void CompareAndBranch(Decoder::BranchOpcode opcode, |
| Register arg1, |
| Register arg2, |
| int16_t offset) { |
| bool cond_value; |
| switch (opcode) { |
| case Decoder::BranchOpcode::kBeq: |
| cond_value = arg1 == arg2; |
| break; |
| case Decoder::BranchOpcode::kBne: |
| cond_value = arg1 != arg2; |
| break; |
| case Decoder::BranchOpcode::kBltu: |
| cond_value = arg1 < arg2; |
| break; |
| case Decoder::BranchOpcode::kBgeu: |
| cond_value = arg1 >= arg2; |
| break; |
| case Decoder::BranchOpcode::kBlt: |
| cond_value = bit_cast<int64_t>(arg1) < bit_cast<int64_t>(arg2); |
| break; |
| case Decoder::BranchOpcode::kBge: |
| cond_value = bit_cast<int64_t>(arg1) >= bit_cast<int64_t>(arg2); |
| break; |
| default: |
| return Unimplemented(); |
| } |
| |
| if (cond_value) { |
| state_->cpu.insn_addr += offset; |
| branch_taken_ = true; |
| } |
| } |
| |
| void Branch(int32_t offset) { |
| state_->cpu.insn_addr += offset; |
| branch_taken_ = true; |
| } |
| |
| void BranchRegister(Register base, int16_t offset) { |
| state_->cpu.insn_addr = (base + offset) & ~uint64_t{1}; |
| branch_taken_ = true; |
| } |
| |
| FpRegister Fmv(FpRegister arg) { return arg; } |
| |
| // |
| // V extensions. |
| // |
| |
| using TailProcessing = intrinsics::TailProcessing; |
| using InactiveProcessing = intrinsics::InactiveProcessing; |
| |
  enum class VectorSelectElementWidth {
| k8bit = 0b000, |
| k16bit = 0b001, |
| k32bit = 0b010, |
| k64bit = 0b011, |
| kMaxValue = 0b111, |
| }; |
| |
| enum class VectorRegisterGroupMultiplier { |
| k1register = 0b000, |
| k2registers = 0b001, |
| k4registers = 0b010, |
| k8registers = 0b011, |
    kEighthOfRegister = 0b101,
| kQuarterOfRegister = 0b110, |
| kHalfOfRegister = 0b111, |
| kMaxValue = 0b111, |
| }; |
| |
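  // Returns how many whole vector registers form one register group for the given multiplier.
  // Fractional multipliers (1/8, 1/4, 1/2) still occupy a single register, hence the default.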
| static constexpr size_t NumberOfRegistersInvolved(VectorRegisterGroupMultiplier vlmul) { |
| switch (vlmul) { |
| case VectorRegisterGroupMultiplier::k2registers: |
| return 2; |
| case VectorRegisterGroupMultiplier::k4registers: |
| return 4; |
| case VectorRegisterGroupMultiplier::k8registers: |
| return 8; |
| default: |
| return 1; |
| } |
| } |
| |
| void OpVector(const Decoder::VOpArgs& args) { |
    // The RISC-V V extension uses the 8-bit vtype CSR as an “opcode extension” to keep the
    // instruction encoding within 32 bits.
    //
    // Great care is taken to ensure that vector code rarely needs to change the vtype CSR (e.g.
    // there are special mask instructions which allow one to manipulate masks without the need
    // to change the CPU mode).
    //
    // Currently we don't have support for multiple CPU modes in Berberis, thus we can only handle
    // these instructions in the interpreter.
    //
    // TODO(300690740): develop and implement a strategy which would allow us to support vector
    // intrinsics not just in the interpreter. Move code from this function to the semantics
    // player.
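    // vtype layout (per the RVV 1.0 spec): bit 63 is vill, bit 7 is vma, bit 6 is vta,
    // bits 5:3 are vsew and bits 2:0 are vlmul. The dispatch chain below peels these fields
    // off one by one to select a fully specialized template instantiation.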
| Register vtype = GetCsr<CsrName::kVtype>(); |
| if (static_cast<std::make_signed_t<Register>>(vtype) < 0) { |
| return Unimplemented(); |
| } |
    switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
      case VectorSelectElementWidth::k8bit:
        return OpVector<uint8_t>(args, vtype);
      case VectorSelectElementWidth::k16bit:
        return OpVector<uint16_t>(args, vtype);
      case VectorSelectElementWidth::k32bit:
        return OpVector<uint32_t>(args, vtype);
      case VectorSelectElementWidth::k64bit:
        return OpVector<uint64_t>(args, vtype);
| default: |
| return Unimplemented(); |
| } |
| } |
| |
| template <typename ElementType> |
| void OpVector(const Decoder::VOpArgs& args, Register vtype) { |
| switch (static_cast<VectorRegisterGroupMultiplier>(vtype & 0b111)) { |
| case VectorRegisterGroupMultiplier::k1register: |
| return OpVector<ElementType, VectorRegisterGroupMultiplier::k1register>(args, vtype); |
| case VectorRegisterGroupMultiplier::k2registers: |
| return OpVector<ElementType, VectorRegisterGroupMultiplier::k2registers>(args, vtype); |
| case VectorRegisterGroupMultiplier::k4registers: |
| return OpVector<ElementType, VectorRegisterGroupMultiplier::k4registers>(args, vtype); |
| case VectorRegisterGroupMultiplier::k8registers: |
| return OpVector<ElementType, VectorRegisterGroupMultiplier::k8registers>(args, vtype); |
      case VectorRegisterGroupMultiplier::kEighthOfRegister:
        return OpVector<ElementType, VectorRegisterGroupMultiplier::kEighthOfRegister>(args,
                                                                                       vtype);
| case VectorRegisterGroupMultiplier::kQuarterOfRegister: |
| return OpVector<ElementType, VectorRegisterGroupMultiplier::kQuarterOfRegister>(args, |
| vtype); |
| case VectorRegisterGroupMultiplier::kHalfOfRegister: |
| return OpVector<ElementType, VectorRegisterGroupMultiplier::kHalfOfRegister>(args, vtype); |
| default: |
| return Unimplemented(); |
| } |
| } |
| |
| template <typename ElementType, VectorRegisterGroupMultiplier vlmul> |
| void OpVector(const Decoder::VOpArgs& args, Register vtype) { |
| if ((vtype >> 6) & 1) { |
| return OpVector<ElementType, vlmul, TailProcessing::kAgnostic>(args, vtype); |
| } |
| return OpVector<ElementType, vlmul, TailProcessing::kUndisturbed>(args, vtype); |
| } |
| |
| template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta> |
| void OpVector(const Decoder::VOpArgs& args, Register vtype) { |
| if (args.vm) { |
| return OpVector<ElementType, vlmul, vta>(args); |
| } |
| if (vtype >> 7) { |
| return OpVector<ElementType, vlmul, vta, InactiveProcessing::kAgnostic>(args); |
| } |
| return OpVector<ElementType, vlmul, vta, InactiveProcessing::kUndisturbed>(args); |
| } |
| |
| template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta> |
| void OpVector(const Decoder::VOpArgs& args) { |
| struct { |
| int8_t val : 5; |
    } imm = {.val = static_cast<int8_t>(args.src1)};
| switch (args.opcode) { |
| case Decoder::VOpOpcode::kVaddvi: |
| return OpVectorvx<intrinsics::Vaddvx<ElementType, vta>, ElementType, vlmul, vta>(args, |
| imm.val); |
| case Decoder::VOpOpcode::kVaddvv: |
| return OpVectorvv<intrinsics::Vaddvv<ElementType, vta>, ElementType, vlmul, vta>(args); |
| case Decoder::VOpOpcode::kVaddvx: |
| return OpVectorvx<intrinsics::Vaddvx<ElementType, vta>, ElementType, vlmul, vta>( |
| args, args.src1 ? GetReg(args.src1) : 0); |
      case Decoder::VOpOpcode::kVsubvv:
        return OpVectorvv<intrinsics::Vsubvv<ElementType, vta>, ElementType, vlmul, vta>(args);
      case Decoder::VOpOpcode::kVsubvx:
        return OpVectorvx<intrinsics::Vsubvx<ElementType, vta>, ElementType, vlmul, vta>(
            args, args.src1 ? GetReg(args.src1) : 0);
| default: |
| Unimplemented(); |
| } |
| } |
| |
| template <auto Intrinsic, |
| typename ElementType, |
| VectorRegisterGroupMultiplier vlmul, |
| TailProcessing vta> |
| void OpVectorvv(const Decoder::VOpArgs& args) { |
| constexpr size_t registers_involved = NumberOfRegistersInvolved(vlmul); |
| if ((args.dst & (registers_involved - 1)) != 0 || (args.src1 & (registers_involved - 1)) != 0 || |
| (args.src2 & (registers_involved - 1)) != 0) { |
| return Unimplemented(); |
| } |
| int vstart = GetCsr<CsrName::kVstart>(); |
| int vl = GetCsr<CsrName::kVl>(); |
| SIMD128Register result, src1, src2; |
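    // Every guest vector register is 128 bits wide here (see GetCsr<CsrName::kVlenb>), so each
    // iteration processes 16 / sizeof(ElementType) elements; vstart and vl are rebased per
    // register so the intrinsic sees indices relative to the current 128-bit chunk.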
| for (size_t index = 0; index < registers_involved; ++index) { |
| result.Set(state_->cpu.v[args.dst + index]); |
| src1.Set(state_->cpu.v[args.src1 + index]); |
| src2.Set(state_->cpu.v[args.src2 + index]); |
| std::tie(result) = Intrinsic(vstart - index * (16 / sizeof(ElementType)), |
| vl - index * (16 / sizeof(ElementType)), |
| result, |
| src1, |
| src2); |
| state_->cpu.v[args.dst + index] = result.Get<__uint128_t>(); |
| } |
| SetCsr<CsrName::kVstart>(0); |
| } |
| |
| template <auto Intrinsic, |
| typename ElementType, |
| VectorRegisterGroupMultiplier vlmul, |
| TailProcessing vta> |
| void OpVectorvx(const Decoder::VOpArgs& args, ElementType arg1) { |
| constexpr size_t registers_involved = NumberOfRegistersInvolved(vlmul); |
| if ((args.dst & (registers_involved - 1)) != 0 || (args.src2 & (registers_involved - 1)) != 0) { |
| return Unimplemented(); |
| } |
| int vstart = GetCsr<CsrName::kVstart>(); |
| int vl = GetCsr<CsrName::kVl>(); |
| SIMD128Register result, src2; |
| for (size_t index = 0; index < registers_involved; ++index) { |
| result.Set(state_->cpu.v[args.dst + index]); |
| src2.Set(state_->cpu.v[args.src2 + index]); |
| std::tie(result) = Intrinsic(vstart - index * (16 / sizeof(ElementType)), |
| vl - index * (16 / sizeof(ElementType)), |
| result, |
| src2, |
| arg1); |
| state_->cpu.v[args.dst + index] = result.Get<__uint128_t>(); |
| } |
| SetCsr<CsrName::kVstart>(0); |
| } |
| |
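  // Masked variants of the handlers above: v0 always supplies the mask, and
  // MaskForRegisterInSequence extracts the slice of v0 that applies to each 128-bit register
  // of the group.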
| template <typename ElementType, |
| VectorRegisterGroupMultiplier vlmul, |
| TailProcessing vta, |
| InactiveProcessing vma> |
| void OpVector(const Decoder::VOpArgs& args) { |
| struct { |
| int8_t val : 5; |
    } imm = {.val = static_cast<int8_t>(args.src1)};
| switch (args.opcode) { |
| case Decoder::VOpOpcode::kVaddvi: |
| return OpVectorvx<intrinsics::Vaddvxm<ElementType, vta, vma>, ElementType, vlmul, vta, vma>( |
| args, imm.val); |
| case Decoder::VOpOpcode::kVaddvv: |
| return OpVectorvv<intrinsics::Vaddvvm<ElementType, vta, vma>, ElementType, vlmul, vta, vma>( |
| args); |
| case Decoder::VOpOpcode::kVaddvx: |
| return OpVectorvx<intrinsics::Vaddvxm<ElementType, vta, vma>, ElementType, vlmul, vta, vma>( |
| args, args.src1 ? GetReg(args.src1) : 0); |
      case Decoder::VOpOpcode::kVsubvv:
        return OpVectorvv<intrinsics::Vsubvvm<ElementType, vta, vma>, ElementType, vlmul, vta, vma>(
            args);
      case Decoder::VOpOpcode::kVsubvx:
        return OpVectorvx<intrinsics::Vsubvxm<ElementType, vta, vma>, ElementType, vlmul, vta, vma>(
            args, args.src1 ? GetReg(args.src1) : 0);
| default: |
| Unimplemented(); |
| } |
| } |
| |
| template <auto Intrinsic, |
| typename ElementType, |
| VectorRegisterGroupMultiplier vlmul, |
| TailProcessing vta, |
| InactiveProcessing vma> |
| void OpVectorvv(const Decoder::VOpArgs& args) { |
| constexpr size_t registers_involved = NumberOfRegistersInvolved(vlmul); |
| if ((args.dst & (registers_involved - 1)) != 0 || (args.src1 & (registers_involved - 1)) != 0 || |
| (args.src2 & (registers_involved - 1)) != 0) { |
| return Unimplemented(); |
| } |
| int vstart = GetCsr<CsrName::kVstart>(); |
| int vl = GetCsr<CsrName::kVl>(); |
| SIMD128Register mask, result, src1, src2; |
| mask.Set(state_->cpu.v[0]); |
| for (size_t index = 0; index < registers_involved; ++index) { |
| result.Set(state_->cpu.v[args.dst + index]); |
| src1.Set(state_->cpu.v[args.src1 + index]); |
| src2.Set(state_->cpu.v[args.src2 + index]); |
| std::tie(result) = Intrinsic(vstart - index * (16 / sizeof(ElementType)), |
| vl - index * (16 / sizeof(ElementType)), |
| intrinsics::MaskForRegisterInSequence<ElementType>(mask, index), |
| result, |
| src1, |
| src2); |
| state_->cpu.v[args.dst + index] = result.Get<__uint128_t>(); |
| } |
| SetCsr<CsrName::kVstart>(0); |
| } |
| |
| template <auto Intrinsic, |
| typename ElementType, |
| VectorRegisterGroupMultiplier vlmul, |
| TailProcessing vta, |
| InactiveProcessing vma> |
| void OpVectorvx(const Decoder::VOpArgs& args, ElementType arg1) { |
| constexpr size_t registers_involved = NumberOfRegistersInvolved(vlmul); |
| if ((args.dst & (registers_involved - 1)) != 0 || (args.src2 & (registers_involved - 1)) != 0) { |
| return Unimplemented(); |
| } |
| int vstart = GetCsr<CsrName::kVstart>(); |
| int vl = GetCsr<CsrName::kVl>(); |
| SIMD128Register mask, result, src2; |
| mask.Set(state_->cpu.v[0]); |
| for (size_t index = 0; index < registers_involved; ++index) { |
| result.Set(state_->cpu.v[args.dst + index]); |
| src2.Set(state_->cpu.v[args.src2 + index]); |
| std::tie(result) = Intrinsic(vstart - index * (16 / sizeof(ElementType)), |
| vl - index * (16 / sizeof(ElementType)), |
| intrinsics::MaskForRegisterInSequence<ElementType>(mask, index), |
| result, |
| src2, |
| arg1); |
| state_->cpu.v[args.dst + index] = result.Get<__uint128_t>(); |
| } |
| SetCsr<CsrName::kVstart>(0); |
| } |
| |
| void Nop() {} |
| |
| void Unimplemented() { |
| auto* addr = ToHostAddr<const uint16_t>(GetInsnAddr()); |
| uint8_t size = Decoder::GetInsnSize(addr); |
| if (size == 2) { |
| FATAL("Unimplemented riscv64 instruction 0x%" PRIx16 " at %p", *addr, addr); |
| } else { |
| CHECK_EQ(size, 4); |
      // Warning: do not cast and dereference the pointer
      // since the address may not be 4-byte aligned.
| uint32_t code; |
| memcpy(&code, addr, sizeof(code)); |
| FATAL("Unimplemented riscv64 instruction 0x%" PRIx32 " at %p", code, addr); |
| } |
| } |
| |
| // |
| // Guest state getters/setters. |
| // |
| |
| Register GetReg(uint8_t reg) const { |
| CheckRegIsValid(reg); |
| return state_->cpu.x[reg]; |
| } |
| |
| void SetReg(uint8_t reg, Register value) { |
| CheckRegIsValid(reg); |
| state_->cpu.x[reg] = value; |
| } |
| |
| FpRegister GetFpReg(uint8_t reg) const { |
| CheckFpRegIsValid(reg); |
| return state_->cpu.f[reg]; |
| } |
| |
| template <typename FloatType> |
| FpRegister GetFRegAndUnboxNan(uint8_t reg); |
| |
| template <typename FloatType> |
| void NanBoxAndSetFpReg(uint8_t reg, FpRegister value); |
| |
| // |
| // Various helper methods. |
| // |
| |
| template <CsrName kName> |
| [[nodiscard]] Register GetCsr() const { |
| return state_->cpu.*CsrFieldAddr<kName>; |
| } |
| |
| template <CsrName kName> |
| void SetCsr(Register arg) { |
| state_->cpu.*CsrFieldAddr<kName> = arg & kCsrMask<kName>; |
| } |
| |
| [[nodiscard]] uint64_t GetImm(uint64_t imm) const { return imm; } |
| |
| [[nodiscard]] Register Copy(Register value) const { return value; } |
| |
| [[nodiscard]] GuestAddr GetInsnAddr() const { return state_->cpu.insn_addr; } |
| |
| void FinalizeInsn(uint8_t insn_len) { |
| if (!branch_taken_) { |
| state_->cpu.insn_addr += insn_len; |
| } |
| } |
| |
| #include "berberis/intrinsics/interpreter_intrinsics_hooks-inl.h" |
| |
| private: |
| template <typename DataType> |
| Register Load(const void* ptr) const { |
| static_assert(std::is_integral_v<DataType>); |
| DataType data; |
| memcpy(&data, ptr, sizeof(data)); |
| // Signed types automatically sign-extend to int64_t. |
| return static_cast<uint64_t>(data); |
| } |
| |
| template <typename DataType> |
| void Store(void* ptr, uint64_t data) const { |
| static_assert(std::is_integral_v<DataType>); |
| memcpy(ptr, &data, sizeof(DataType)); |
| } |
| |
| template <typename DataType> |
| void StoreFp(void* ptr, uint64_t data) const { |
| static_assert(std::is_floating_point_v<DataType>); |
| memcpy(ptr, &data, sizeof(DataType)); |
| } |
| |
| void CheckShamtIsValid(int8_t shamt) const { |
| CHECK_GE(shamt, 0); |
| CHECK_LT(shamt, 64); |
| } |
| |
| void CheckShamt32IsValid(int8_t shamt) const { |
| CHECK_GE(shamt, 0); |
| CHECK_LT(shamt, 32); |
| } |
| |
  void CheckRegIsValid(uint8_t reg) const {
    CHECK_GT(reg, 0u);
    // Note: x[reg] is indexed directly, so reg must be strictly less than the array size.
    CHECK_LT(reg, arraysize(state_->cpu.x));
  }
| |
| void CheckFpRegIsValid(uint8_t reg) const { CHECK_LT(reg, arraysize(state_->cpu.f)); } |
| |
| ThreadState* state_; |
| bool branch_taken_; |
| }; |
| |
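// vlenb is the vector register length in bytes; this implementation models VLEN = 128 bits,
// hence the constant 16.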
| template <> |
| [[nodiscard]] Interpreter::Register Interpreter::GetCsr<CsrName::kVlenb>() const { |
| return 16; |
| } |
| |
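// vxrm and vxsat are views into vcsr: vcsr packs vxrm into bits 1:0 and vxsat into bit 2, so
// the accessors below mask and shift the shared field.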
| template <> |
| [[nodiscard]] Interpreter::Register Interpreter::GetCsr<CsrName::kVxrm>() const { |
| return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11; |
| } |
| |
| template <> |
| [[nodiscard]] Interpreter::Register Interpreter::GetCsr<CsrName::kVxsat>() const { |
| return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> >> 2; |
| } |
| |
| template <> |
| void Interpreter::SetCsr<CsrName::kFrm>(Register arg) { |
| arg &= kCsrMask<CsrName::kFrm>; |
| state_->cpu.frm = arg; |
| FeSetRound(arg); |
| } |
| |
| template <> |
| void Interpreter::SetCsr<CsrName::kVxrm>(Register arg) { |
| state_->cpu.*CsrFieldAddr<CsrName::kVcsr> = |
| (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b100) | (arg & 0b11); |
| } |
| |
| template <> |
| void Interpreter::SetCsr<CsrName::kVxsat>(Register arg) { |
| state_->cpu.*CsrFieldAddr<CsrName::kVcsr> = |
| (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11) | ((arg & 0b1) << 2); |
| } |
| |
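// RISC-V NaN boxing: a 32-bit float kept in a 64-bit FP register must have its upper 32 bits
// set to all ones, and reading an improperly boxed value must yield the canonical NaN.
// Float64 values fill the whole register and need no boxing.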
| template <> |
| Interpreter::FpRegister Interpreter::GetFRegAndUnboxNan<Interpreter::Float32>(uint8_t reg) { |
| CheckFpRegIsValid(reg); |
| FpRegister value = state_->cpu.f[reg]; |
| return UnboxNan<Float32>(value); |
| } |
| |
| template <> |
| Interpreter::FpRegister Interpreter::GetFRegAndUnboxNan<Interpreter::Float64>(uint8_t reg) { |
| CheckFpRegIsValid(reg); |
| return state_->cpu.f[reg]; |
| } |
| |
| template <> |
| void Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegister value) { |
| CheckFpRegIsValid(reg); |
| state_->cpu.f[reg] = NanBox<Float32>(value); |
| } |
| |
| template <> |
| void Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegister value) { |
| CheckFpRegIsValid(reg); |
| state_->cpu.f[reg] = value; |
| } |
| |
| } // namespace |
| |
| void InitInterpreter() { |
  // TODO(b/232598137): Currently we just call this to initialize the recovery map.
  // We need to add real faulting instructions with recovery code here.
| InitExtraRecoveryCodeUnsafe({}); |
| } |
| |
| void InterpretInsn(ThreadState* state) { |
| GuestAddr pc = state->cpu.insn_addr; |
| |
| Interpreter interpreter(state); |
| SemanticsPlayer sem_player(&interpreter); |
| Decoder decoder(&sem_player); |
| uint8_t insn_len = decoder.Decode(ToHostAddr<const uint16_t>(pc)); |
| interpreter.FinalizeInsn(insn_len); |
| } |
| |
| } // namespace berberis |