| /* |
| * Copyright © 2018 Valve Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| */ |
| |
| #ifndef ACO_IR_H |
| #define ACO_IR_H |
| |
| #include <vector> |
| #include <set> |
| #include <unordered_set> |
| #include <bitset> |
| #include <memory> |
| |
| #include "nir.h" |
| #include "ac_binary.h" |
| #include "amd_family.h" |
| #include "aco_opcodes.h" |
| #include "aco_util.h" |
| |
| #include "vulkan/radv_shader.h" |
| |
| struct radv_nir_compiler_options; |
| struct radv_shader_args; |
| struct radv_shader_info; |
| |
| namespace aco { |
| |
| extern uint64_t debug_flags; |
| |
/* Bit values for the global debug_flags bitmask declared above (each flag is
 * a distinct power of two so that flags can be OR'd together). */
enum {
   DEBUG_VALIDATE_IR = 0x1,
   DEBUG_VALIDATE_RA = 0x2,
   DEBUG_PERFWARN = 0x4,
   DEBUG_FORCE_WAITCNT = 0x8,
   DEBUG_NO_VN = 0x10,
   DEBUG_NO_OPT = 0x20,
   DEBUG_NO_SCHED = 0x40,
};
| |
/**
 * Representation of the instruction's microcode encoding format
 * Note: Some Vector ALU Formats can be combined, such that:
 * - VOP2* | VOP3A represents a VOP2 instruction in VOP3A encoding
 * - VOP2* | DPP represents a VOP2 instruction with data parallel primitive.
 * - VOP2* | SDWA represents a VOP2 instruction with sub-dword addressing.
 *
 * (*) The same is applicable for VOP1 and VOPC instructions.
 */
enum class Format : std::uint16_t {
   /* Pseudo Instruction Format */
   PSEUDO = 0,
   /* Scalar ALU & Control Formats */
   SOP1 = 1,
   SOP2 = 2,
   SOPK = 3,
   SOPP = 4,
   SOPC = 5,
   /* Scalar Memory Format */
   SMEM = 6,
   /* LDS/GDS Format (7 is unused) */
   DS = 8,
   /* Vector Memory Buffer Formats */
   MTBUF = 9,
   MUBUF = 10,
   /* Vector Memory Image Format */
   MIMG = 11,
   /* Export Format */
   EXP = 12,
   /* Flat Formats */
   FLAT = 13,
   GLOBAL = 14,
   SCRATCH = 15,

   PSEUDO_BRANCH = 16,
   PSEUDO_BARRIER = 17,
   PSEUDO_REDUCTION = 18,

   /* Vector ALU Formats */
   VOP3P = 19,
   /* Each combinable format below occupies its own bit so it can be OR'd
    * with VOP3/DPP/SDWA (see the note above and asVOP3()/asSDWA()). */
   VOP1 = 1 << 8,
   VOP2 = 1 << 9,
   VOPC = 1 << 10,
   VOP3 = 1 << 11,
   /* VOP3A and VOP3B alias the same bit as VOP3 */
   VOP3A = 1 << 11,
   VOP3B = 1 << 11,
   /* Vector Parameter Interpolation Format */
   VINTRP = 1 << 12,
   DPP = 1 << 13,
   SDWA = 1 << 14,
};
| |
/* Kinds of memory an instruction may access, used as a bitmask. */
enum storage_class : uint8_t {
   storage_none = 0x0, /* no synchronization and can be reordered around aliasing stores */
   storage_buffer = 0x1, /* SSBOs and global memory */
   storage_atomic_counter = 0x2, /* not used for Vulkan */
   storage_image = 0x4,
   storage_shared = 0x8, /* or TCS output */
   storage_vmem_output = 0x10, /* GS or TCS output stores using VMEM */
   storage_scratch = 0x20,
   storage_vgpr_spill = 0x40,
   storage_count = 8,
};

/* Ordering/visibility semantics attached to a memory access or barrier. */
enum memory_semantics : uint8_t {
   semantic_none = 0x0,
   /* for loads: don't move any access after this load to before this load (even other loads)
    * for barriers: don't move any access after the barrier to before any
    * atomics/control_barriers/sendmsg_gs_done before the barrier */
   semantic_acquire = 0x1,
   /* for stores: don't move any access before this store to after this store
    * for barriers: don't move any access before the barrier to after any
    * atomics/control_barriers/sendmsg_gs_done after the barrier */
   semantic_release = 0x2,

   /* the rest are for load/stores/atomics only */
   /* cannot be DCE'd or CSE'd */
   semantic_volatile = 0x4,
   /* does not interact with barriers and assumes this lane is the only lane
    * accessing this memory */
   semantic_private = 0x8,
   /* this operation can be reordered around operations of the same storage. says nothing about barriers */
   semantic_can_reorder = 0x10,
   /* this is a atomic instruction (may only read or write memory) */
   semantic_atomic = 0x20,
   /* this is instruction both reads and writes memory */
   semantic_rmw = 0x40,

   semantic_acqrel = semantic_acquire | semantic_release,
   semantic_atomicrmw = semantic_volatile | semantic_atomic | semantic_rmw,
};

/* How far a synchronization operation reaches. */
enum sync_scope : uint8_t {
   scope_invocation = 0,
   scope_subgroup = 1,
   scope_workgroup = 2,
   scope_queuefamily = 3,
   scope_device = 4,
};

/* Compact (3-byte) description of an access's storage, semantics and scope. */
struct memory_sync_info {
   memory_sync_info() : storage(storage_none), semantics(semantic_none), scope(scope_invocation) {}
   memory_sync_info(int storage, int semantics=0, sync_scope scope=scope_invocation)
      : storage((storage_class)storage), semantics((memory_semantics)semantics), scope(scope) {}

   storage_class storage:8;
   memory_semantics semantics:8;
   sync_scope scope:8;

   bool operator == (const memory_sync_info& rhs) const {
      return storage == rhs.storage && semantics == rhs.semantics && scope == rhs.scope;
   }

   bool can_reorder() const {
      /* acquire/release ordering rules out reordering entirely */
      if (semantics & semantic_acqrel)
         return false;
      /* volatile accesses must stay put as well */
      if (semantics & semantic_volatile)
         return false;
      /* zero-initialized infos (storage_none) are freely reorderable */
      return storage == storage_none || (semantics & semantic_can_reorder);
   }
};
static_assert(sizeof(memory_sync_info) == 3, "Unexpected padding");
| |
/* Rounding modes, encoded as in the hardware MODE register. */
enum fp_round {
   fp_round_ne = 0,
   fp_round_pi = 1,
   fp_round_ni = 2,
   fp_round_tz = 3,
};

/* Denormal handling, encoded as in the hardware MODE register. */
enum fp_denorm {
   /* Note that v_rcp_f32, v_exp_f32, v_log_f32, v_sqrt_f32, v_rsq_f32 and
    * v_mad_f32/v_madak_f32/v_madmk_f32/v_mac_f32 always flush denormals. */
   fp_denorm_flush = 0x0,
   fp_denorm_keep = 0x3,
};

/* Floating-point environment of an instruction: the MODE register bits plus
 * flags telling the optimizer which properties must be preserved. */
struct float_mode {
   /* matches encoding of the MODE register */
   union {
      struct {
         fp_round round32:2;
         fp_round round16_64:2;
         unsigned denorm32:2;
         unsigned denorm16_64:2;
      };
      struct {
         uint8_t round:4;
         uint8_t denorm:4;
      };
      uint8_t val = 0;
   };
   /* if false, optimizations which may remove infs/nan/-0.0 can be done */
   bool preserve_signed_zero_inf_nan32:1;
   bool preserve_signed_zero_inf_nan16_64:1;
   /* if false, optimizations which may remove denormal flushing can be done */
   bool must_flush_denorms32:1;
   bool must_flush_denorms16_64:1;
   bool care_about_round32:1;
   bool care_about_round16_64:1;

   /* Returns true if instructions using the mode "other" can safely use the
    * current one instead: the MODE bits must match exactly and every
    * guarantee "other" relies on must also be provided here. */
   bool canReplace(float_mode other) const noexcept {
      if (val != other.val)
         return false;
      if (other.preserve_signed_zero_inf_nan32 && !preserve_signed_zero_inf_nan32)
         return false;
      if (other.preserve_signed_zero_inf_nan16_64 && !preserve_signed_zero_inf_nan16_64)
         return false;
      if (other.must_flush_denorms32 && !must_flush_denorms32)
         return false;
      if (other.must_flush_denorms16_64 && !must_flush_denorms16_64)
         return false;
      if (other.care_about_round32 && !care_about_round32)
         return false;
      return care_about_round16_64 || !other.care_about_round16_64;
   }
};
| |
| constexpr Format asVOP3(Format format) { |
| return (Format) ((uint32_t) Format::VOP3 | (uint32_t) format); |
| }; |
| |
| constexpr Format asSDWA(Format format) { |
| assert(format == Format::VOP1 || format == Format::VOP2 || format == Format::VOPC); |
| return (Format) ((uint32_t) Format::SDWA | (uint32_t) format); |
| } |
| |
/* Which register file a value lives in. linear_vgpr marks vgprs with linear
 * live ranges (used for WWM and vgpr spills — see the RegClass comment). */
enum class RegType {
   none = 0,
   sgpr,
   vgpr,
   linear_vgpr,
};
| |
struct RegClass {

   /* RC packs the register class into one byte:
    *   bits 0-4: size (in dwords, or in bytes when bit 7 is set)
    *   bit 5:    vgpr (otherwise sgpr)
    *   bit 6:    linear vgpr (sgprs are inherently linear — see is_linear())
    *   bit 7:    sub-dword class, i.e. the size field counts bytes
    */
   enum RC : uint8_t {
      s1 = 1,
      s2 = 2,
      s3 = 3,
      s4 = 4,
      s6 = 6,
      s8 = 8,
      s16 = 16,
      v1 = s1 | (1 << 5),
      v2 = s2 | (1 << 5),
      v3 = s3 | (1 << 5),
      v4 = s4 | (1 << 5),
      v5 = 5 | (1 << 5),
      v6 = 6 | (1 << 5),
      v7 = 7 | (1 << 5),
      v8 = 8 | (1 << 5),
      /* byte-sized register class */
      v1b = v1 | (1 << 7),
      v2b = v2 | (1 << 7),
      v3b = v3 | (1 << 7),
      v4b = v4 | (1 << 7),
      v6b = v6 | (1 << 7),
      v8b = v8 | (1 << 7),
      /* these are used for WWM and spills to vgpr */
      v1_linear = v1 | (1 << 6),
      v2_linear = v2 | (1 << 6),
   };

   RegClass() = default;
   constexpr RegClass(RC rc)
      : rc(rc) {}
   /* construct from a type and a size in dwords */
   constexpr RegClass(RegType type, unsigned size)
      : rc((RC) ((type == RegType::vgpr ? 1 << 5 : 0) | size)) {}

   constexpr operator RC() const { return rc; }
   explicit operator bool() = delete;

   /* all sgpr classes are <= s16; anything larger has the vgpr bit set */
   constexpr RegType type() const { return rc <= RC::s16 ? RegType::sgpr : RegType::vgpr; }
   constexpr bool is_subdword() const { return rc & (1 << 7); }
   /* the size field counts bytes for sub-dword classes, dwords otherwise */
   constexpr unsigned bytes() const { return ((unsigned) rc & 0x1F) * (is_subdword() ? 1 : 4); }
   //TODO: use size() less in favor of bytes()
   /* size in dwords, rounded up */
   constexpr unsigned size() const { return (bytes() + 3) >> 2; }
   constexpr bool is_linear() const { return rc <= RC::s16 || rc & (1 << 6); }
   constexpr RegClass as_linear() const { return RegClass((RC) (rc | (1 << 6))); }
   constexpr RegClass as_subdword() const { return RegClass((RC) (rc | 1 << 7)); }

   /* Build a register class from a size in bytes: sgprs round up to whole
    * dwords, vgprs become sub-dword classes when not dword-aligned. */
   static constexpr RegClass get(RegType type, unsigned bytes) {
      if (type == RegType::sgpr) {
         return RegClass(type, DIV_ROUND_UP(bytes, 4u));
      } else {
         return bytes % 4u ? RegClass(type, bytes).as_subdword() :
                             RegClass(type, bytes / 4u);
      }
   }

private:
   RC rc;
};
| |
/* transitional helper expressions: shorthand constants so code can write
 * e.g. v1 or s2 instead of RegClass::v1 / RegClass::s2 */
static constexpr RegClass s1{RegClass::s1};
static constexpr RegClass s2{RegClass::s2};
static constexpr RegClass s3{RegClass::s3};
static constexpr RegClass s4{RegClass::s4};
static constexpr RegClass s8{RegClass::s8};
static constexpr RegClass s16{RegClass::s16};
static constexpr RegClass v1{RegClass::v1};
static constexpr RegClass v2{RegClass::v2};
static constexpr RegClass v3{RegClass::v3};
static constexpr RegClass v4{RegClass::v4};
static constexpr RegClass v5{RegClass::v5};
static constexpr RegClass v6{RegClass::v6};
static constexpr RegClass v7{RegClass::v7};
static constexpr RegClass v8{RegClass::v8};
static constexpr RegClass v1b{RegClass::v1b};
static constexpr RegClass v2b{RegClass::v2b};
static constexpr RegClass v3b{RegClass::v3b};
static constexpr RegClass v4b{RegClass::v4b};
static constexpr RegClass v6b{RegClass::v6b};
static constexpr RegClass v8b{RegClass::v8b};
| |
| /** |
| * Temp Class |
| * Each temporary virtual register has a |
| * register class (i.e. size and type) |
| * and SSA id. |
| */ |
| struct Temp { |
| Temp() noexcept : id_(0), reg_class(0) {} |
| constexpr Temp(uint32_t id, RegClass cls) noexcept |
| : id_(id), reg_class(uint8_t(cls)) {} |
| |
| constexpr uint32_t id() const noexcept { return id_; } |
| constexpr RegClass regClass() const noexcept { return (RegClass::RC)reg_class; } |
| |
| constexpr unsigned bytes() const noexcept { return regClass().bytes(); } |
| constexpr unsigned size() const noexcept { return regClass().size(); } |
| constexpr RegType type() const noexcept { return regClass().type(); } |
| constexpr bool is_linear() const noexcept { return regClass().is_linear(); } |
| |
| constexpr bool operator <(Temp other) const noexcept { return id() < other.id(); } |
| constexpr bool operator==(Temp other) const noexcept { return id() == other.id(); } |
| constexpr bool operator!=(Temp other) const noexcept { return id() != other.id(); } |
| |
| private: |
| uint32_t id_: 24; |
| uint32_t reg_class : 8; |
| }; |
| |
/**
 * PhysReg
 * The physical register assigned to an Operand or Definition, stored with
 * byte granularity: reg_b = register index * 4 + byte offset.
 */
struct PhysReg {
   constexpr PhysReg() = default;
   explicit constexpr PhysReg(unsigned r) : reg_b(r << 2) {}
   constexpr unsigned reg() const { return reg_b >> 2; }
   constexpr unsigned byte() const { return reg_b & 0x3; }
   constexpr operator unsigned() const { return reg(); }
   constexpr bool operator==(PhysReg other) const { return reg_b == other.reg_b; }
   constexpr bool operator!=(PhysReg other) const { return reg_b != other.reg_b; }
   constexpr bool operator <(PhysReg other) const { return reg_b < other.reg_b; }
   /* returns a copy advanced by the given number of bytes */
   constexpr PhysReg advance(int bytes) const { PhysReg next = *this; next.reg_b += bytes; return next; }

   uint16_t reg_b = 0;
};
| |
/* helper expressions for special registers
 * (indices follow the AMD scalar operand encoding; e.g. 106/107 are the
 * low/high halves of VCC) */
static constexpr PhysReg m0{124};
static constexpr PhysReg vcc{106}; /* = vcc_lo */
static constexpr PhysReg vcc_hi{107};
static constexpr PhysReg tba{108}; /* GFX6-GFX8 */
static constexpr PhysReg tma{110}; /* GFX6-GFX8 */
static constexpr PhysReg ttmp0{112};
static constexpr PhysReg ttmp1{113};
static constexpr PhysReg ttmp2{114};
static constexpr PhysReg ttmp3{115};
static constexpr PhysReg ttmp4{116};
static constexpr PhysReg ttmp5{117};
static constexpr PhysReg ttmp6{118};
static constexpr PhysReg ttmp7{119};
static constexpr PhysReg ttmp8{120};
static constexpr PhysReg ttmp9{121};
static constexpr PhysReg ttmp10{122};
static constexpr PhysReg ttmp11{123};
static constexpr PhysReg sgpr_null{125}; /* GFX10+ */
static constexpr PhysReg exec{126}; /* exec and exec_lo share the same index */
static constexpr PhysReg exec_lo{126};
static constexpr PhysReg exec_hi{127};
static constexpr PhysReg vccz{251};
static constexpr PhysReg execz{252};
static constexpr PhysReg scc{253};
| |
| /** |
| * Operand Class |
| * Initially, each Operand refers to either |
| * a temporary virtual register |
| * or to a constant value |
| * Temporary registers get mapped to physical register during RA |
| * Constant values are inlined into the instruction sequence. |
| */ |
| class Operand final |
| { |
| public: |
| constexpr Operand() |
| : reg_(PhysReg{128}), isTemp_(false), isFixed_(true), isConstant_(false), |
| isKill_(false), isUndef_(true), isFirstKill_(false), constSize(0), |
| isLateKill_(false) {} |
| |
| explicit Operand(Temp r) noexcept |
| { |
| data_.temp = r; |
| if (r.id()) { |
| isTemp_ = true; |
| } else { |
| isUndef_ = true; |
| setFixed(PhysReg{128}); |
| } |
| }; |
| explicit Operand(uint8_t v) noexcept |
| { |
| /* 8-bit constants are only used for copies and copies from any 8-bit |
| * constant can be implemented with a SDWA v_mul_u32_u24. So consider all |
| * to be inline constants. */ |
| data_.i = v; |
| isConstant_ = true; |
| constSize = 0; |
| setFixed(PhysReg{0u}); |
| }; |
| explicit Operand(uint16_t v) noexcept |
| { |
| data_.i = v; |
| isConstant_ = true; |
| constSize = 1; |
| if (v <= 64) |
| setFixed(PhysReg{128u + v}); |
| else if (v >= 0xFFF0) /* [-16 .. -1] */ |
| setFixed(PhysReg{192u + (0xFFFF - v)}); |
| else if (v == 0x3800) /* 0.5 */ |
| setFixed(PhysReg{240}); |
| else if (v == 0xB800) /* -0.5 */ |
| setFixed(PhysReg{241}); |
| else if (v == 0x3C00) /* 1.0 */ |
| setFixed(PhysReg{242}); |
| else if (v == 0xBC00) /* -1.0 */ |
| setFixed(PhysReg{243}); |
| else if (v == 0x4000) /* 2.0 */ |
| setFixed(PhysReg{244}); |
| else if (v == 0xC000) /* -2.0 */ |
| setFixed(PhysReg{245}); |
| else if (v == 0x4400) /* 4.0 */ |
| setFixed(PhysReg{246}); |
| else if (v == 0xC400) /* -4.0 */ |
| setFixed(PhysReg{247}); |
| else if (v == 0x3118) /* 1/2 PI */ |
| setFixed(PhysReg{248}); |
| else /* Literal Constant */ |
| setFixed(PhysReg{255}); |
| }; |
| explicit Operand(uint32_t v, bool is64bit = false) noexcept |
| { |
| data_.i = v; |
| isConstant_ = true; |
| constSize = is64bit ? 3 : 2; |
| if (v <= 64) |
| setFixed(PhysReg{128 + v}); |
| else if (v >= 0xFFFFFFF0) /* [-16 .. -1] */ |
| setFixed(PhysReg{192 - v}); |
| else if (v == 0x3f000000) /* 0.5 */ |
| setFixed(PhysReg{240}); |
| else if (v == 0xbf000000) /* -0.5 */ |
| setFixed(PhysReg{241}); |
| else if (v == 0x3f800000) /* 1.0 */ |
| setFixed(PhysReg{242}); |
| else if (v == 0xbf800000) /* -1.0 */ |
| setFixed(PhysReg{243}); |
| else if (v == 0x40000000) /* 2.0 */ |
| setFixed(PhysReg{244}); |
| else if (v == 0xc0000000) /* -2.0 */ |
| setFixed(PhysReg{245}); |
| else if (v == 0x40800000) /* 4.0 */ |
| setFixed(PhysReg{246}); |
| else if (v == 0xc0800000) /* -4.0 */ |
| setFixed(PhysReg{247}); |
| else { /* Literal Constant */ |
| assert(!is64bit && "attempt to create a 64-bit literal constant"); |
| setFixed(PhysReg{255}); |
| } |
| }; |
| explicit Operand(uint64_t v) noexcept |
| { |
| isConstant_ = true; |
| constSize = 3; |
| if (v <= 64) { |
| data_.i = (uint32_t) v; |
| setFixed(PhysReg{128 + (uint32_t) v}); |
| } else if (v >= 0xFFFFFFFFFFFFFFF0) { /* [-16 .. -1] */ |
| data_.i = (uint32_t) v; |
| setFixed(PhysReg{192 - (uint32_t) v}); |
| } else if (v == 0x3FE0000000000000) { /* 0.5 */ |
| data_.i = 0x3f000000; |
| setFixed(PhysReg{240}); |
| } else if (v == 0xBFE0000000000000) { /* -0.5 */ |
| data_.i = 0xbf000000; |
| setFixed(PhysReg{241}); |
| } else if (v == 0x3FF0000000000000) { /* 1.0 */ |
| data_.i = 0x3f800000; |
| setFixed(PhysReg{242}); |
| } else if (v == 0xBFF0000000000000) { /* -1.0 */ |
| data_.i = 0xbf800000; |
| setFixed(PhysReg{243}); |
| } else if (v == 0x4000000000000000) { /* 2.0 */ |
| data_.i = 0x40000000; |
| setFixed(PhysReg{244}); |
| } else if (v == 0xC000000000000000) { /* -2.0 */ |
| data_.i = 0xc0000000; |
| setFixed(PhysReg{245}); |
| } else if (v == 0x4010000000000000) { /* 4.0 */ |
| data_.i = 0x40800000; |
| setFixed(PhysReg{246}); |
| } else if (v == 0xC010000000000000) { /* -4.0 */ |
| data_.i = 0xc0800000; |
| setFixed(PhysReg{247}); |
| } else { /* Literal Constant: we don't know if it is a long or double.*/ |
| isConstant_ = 0; |
| assert(false && "attempt to create a 64-bit literal constant"); |
| } |
| }; |
| explicit Operand(RegClass type) noexcept |
| { |
| isUndef_ = true; |
| data_.temp = Temp(0, type); |
| setFixed(PhysReg{128}); |
| }; |
| explicit Operand(PhysReg reg, RegClass type) noexcept |
| { |
| data_.temp = Temp(0, type); |
| setFixed(reg); |
| } |
| |
| constexpr bool isTemp() const noexcept |
| { |
| return isTemp_; |
| } |
| |
| constexpr void setTemp(Temp t) noexcept { |
| assert(!isConstant_); |
| isTemp_ = true; |
| data_.temp = t; |
| } |
| |
| constexpr Temp getTemp() const noexcept |
| { |
| return data_.temp; |
| } |
| |
| constexpr uint32_t tempId() const noexcept |
| { |
| return data_.temp.id(); |
| } |
| |
| constexpr bool hasRegClass() const noexcept |
| { |
| return isTemp() || isUndefined(); |
| } |
| |
| constexpr RegClass regClass() const noexcept |
| { |
| return data_.temp.regClass(); |
| } |
| |
| constexpr unsigned bytes() const noexcept |
| { |
| if (isConstant()) |
| return 1 << constSize; |
| else |
| return data_.temp.bytes(); |
| } |
| |
| constexpr unsigned size() const noexcept |
| { |
| if (isConstant()) |
| return constSize > 2 ? 2 : 1; |
| else |
| return data_.temp.size(); |
| } |
| |
| constexpr bool isFixed() const noexcept |
| { |
| return isFixed_; |
| } |
| |
| constexpr PhysReg physReg() const noexcept |
| { |
| return reg_; |
| } |
| |
| constexpr void setFixed(PhysReg reg) noexcept |
| { |
| isFixed_ = reg != unsigned(-1); |
| reg_ = reg; |
| } |
| |
| constexpr bool isConstant() const noexcept |
| { |
| return isConstant_; |
| } |
| |
| constexpr bool isLiteral() const noexcept |
| { |
| return isConstant() && reg_ == 255; |
| } |
| |
| constexpr bool isUndefined() const noexcept |
| { |
| return isUndef_; |
| } |
| |
| constexpr uint32_t constantValue() const noexcept |
| { |
| return data_.i; |
| } |
| |
| constexpr bool constantEquals(uint32_t cmp) const noexcept |
| { |
| return isConstant() && constantValue() == cmp; |
| } |
| |
| constexpr uint64_t constantValue64(bool signext=false) const noexcept |
| { |
| if (constSize == 3) { |
| if (reg_ <= 192) |
| return reg_ - 128; |
| else if (reg_ <= 208) |
| return 0xFFFFFFFFFFFFFFFF - (reg_ - 193); |
| |
| switch (reg_) { |
| case 240: |
| return 0x3FE0000000000000; |
| case 241: |
| return 0xBFE0000000000000; |
| case 242: |
| return 0x3FF0000000000000; |
| case 243: |
| return 0xBFF0000000000000; |
| case 244: |
| return 0x4000000000000000; |
| case 245: |
| return 0xC000000000000000; |
| case 246: |
| return 0x4010000000000000; |
| case 247: |
| return 0xC010000000000000; |
| } |
| } else if (constSize == 1) { |
| return (signext && (data_.i & 0x8000u) ? 0xffffffffffff0000ull : 0ull) | data_.i; |
| } else if (constSize == 0) { |
| return (signext && (data_.i & 0x80u) ? 0xffffffffffffff00ull : 0ull) | data_.i; |
| } |
| return (signext && (data_.i & 0x80000000u) ? 0xffffffff00000000ull : 0ull) | data_.i; |
| } |
| |
| constexpr bool isOfType(RegType type) const noexcept |
| { |
| return hasRegClass() && regClass().type() == type; |
| } |
| |
| /* Indicates that the killed operand's live range intersects with the |
| * instruction's definitions. Unlike isKill() and isFirstKill(), this is |
| * not set by liveness analysis. */ |
| constexpr void setLateKill(bool flag) noexcept |
| { |
| isLateKill_ = flag; |
| } |
| |
| constexpr bool isLateKill() const noexcept |
| { |
| return isLateKill_; |
| } |
| |
| constexpr void setKill(bool flag) noexcept |
| { |
| isKill_ = flag; |
| if (!flag) |
| setFirstKill(false); |
| } |
| |
| constexpr bool isKill() const noexcept |
| { |
| return isKill_ || isFirstKill(); |
| } |
| |
| constexpr void setFirstKill(bool flag) noexcept |
| { |
| isFirstKill_ = flag; |
| if (flag) |
| setKill(flag); |
| } |
| |
| /* When there are multiple operands killing the same temporary, |
| * isFirstKill() is only returns true for the first one. */ |
| constexpr bool isFirstKill() const noexcept |
| { |
| return isFirstKill_; |
| } |
| |
| constexpr bool isKillBeforeDef() const noexcept |
| { |
| return isKill() && !isLateKill(); |
| } |
| |
| constexpr bool isFirstKillBeforeDef() const noexcept |
| { |
| return isFirstKill() && !isLateKill(); |
| } |
| |
| constexpr bool operator == (Operand other) const noexcept |
| { |
| if (other.size() != size()) |
| return false; |
| if (isFixed() != other.isFixed() || isKillBeforeDef() != other.isKillBeforeDef()) |
| return false; |
| if (isFixed() && other.isFixed() && physReg() != other.physReg()) |
| return false; |
| if (isLiteral()) |
| return other.isLiteral() && other.constantValue() == constantValue(); |
| else if (isConstant()) |
| return other.isConstant() && other.physReg() == physReg(); |
| else if (isUndefined()) |
| return other.isUndefined() && other.regClass() == regClass(); |
| else |
| return other.isTemp() && other.getTemp() == getTemp(); |
| } |
| private: |
| union { |
| uint32_t i; |
| float f; |
| Temp temp = Temp(0, s1); |
| } data_; |
| PhysReg reg_; |
| union { |
| struct { |
| uint8_t isTemp_:1; |
| uint8_t isFixed_:1; |
| uint8_t isConstant_:1; |
| uint8_t isKill_:1; |
| uint8_t isUndef_:1; |
| uint8_t isFirstKill_:1; |
| uint8_t constSize:2; |
| uint8_t isLateKill_:1; |
| }; |
| /* can't initialize bit-fields in c++11, so work around using a union */ |
| uint16_t control_ = 0; |
| }; |
| }; |
| |
| /** |
| * Definition Class |
| * Definitions are the results of Instructions |
| * and refer to temporary virtual registers |
| * which are later mapped to physical registers |
| */ |
| class Definition final |
| { |
| public: |
| constexpr Definition() : temp(Temp(0, s1)), reg_(0), isFixed_(0), hasHint_(0), |
| isKill_(0), isPrecise_(0), isNUW_(0) {} |
| Definition(uint32_t index, RegClass type) noexcept |
| : temp(index, type) {} |
| explicit Definition(Temp tmp) noexcept |
| : temp(tmp) {} |
| Definition(PhysReg reg, RegClass type) noexcept |
| : temp(Temp(0, type)) |
| { |
| setFixed(reg); |
| } |
| Definition(uint32_t tmpId, PhysReg reg, RegClass type) noexcept |
| : temp(Temp(tmpId, type)) |
| { |
| setFixed(reg); |
| } |
| |
| constexpr bool isTemp() const noexcept |
| { |
| return tempId() > 0; |
| } |
| |
| constexpr Temp getTemp() const noexcept |
| { |
| return temp; |
| } |
| |
| constexpr uint32_t tempId() const noexcept |
| { |
| return temp.id(); |
| } |
| |
| constexpr void setTemp(Temp t) noexcept { |
| temp = t; |
| } |
| |
| constexpr RegClass regClass() const noexcept |
| { |
| return temp.regClass(); |
| } |
| |
| constexpr unsigned bytes() const noexcept |
| { |
| return temp.bytes(); |
| } |
| |
| constexpr unsigned size() const noexcept |
| { |
| return temp.size(); |
| } |
| |
| constexpr bool isFixed() const noexcept |
| { |
| return isFixed_; |
| } |
| |
| constexpr PhysReg physReg() const noexcept |
| { |
| return reg_; |
| } |
| |
| constexpr void setFixed(PhysReg reg) noexcept |
| { |
| isFixed_ = 1; |
| reg_ = reg; |
| } |
| |
| constexpr void setHint(PhysReg reg) noexcept |
| { |
| hasHint_ = 1; |
| reg_ = reg; |
| } |
| |
| constexpr bool hasHint() const noexcept |
| { |
| return hasHint_; |
| } |
| |
| constexpr void setKill(bool flag) noexcept |
| { |
| isKill_ = flag; |
| } |
| |
| constexpr bool isKill() const noexcept |
| { |
| return isKill_; |
| } |
| |
| constexpr void setPrecise(bool precise) noexcept |
| { |
| isPrecise_ = precise; |
| } |
| |
| constexpr bool isPrecise() const noexcept |
| { |
| return isPrecise_; |
| } |
| |
| /* No Unsigned Wrap */ |
| constexpr void setNUW(bool nuw) noexcept |
| { |
| isNUW_ = nuw; |
| } |
| |
| constexpr bool isNUW() const noexcept |
| { |
| return isNUW_; |
| } |
| |
| private: |
| Temp temp = Temp(0, s1); |
| PhysReg reg_; |
| union { |
| struct { |
| uint8_t isFixed_:1; |
| uint8_t hasHint_:1; |
| uint8_t isKill_:1; |
| uint8_t isPrecise_:1; |
| uint8_t isNUW_:1; |
| }; |
| /* can't initialize bit-fields in c++11, so work around using a union */ |
| uint8_t control_ = 0; |
| }; |
| }; |
| |
| struct Block; |
| |
/* Base type for all IR instructions; the Format-specific structs below add
 * their encoding fields on top of this. */
struct Instruction {
   aco_opcode opcode;
   Format format;
   /* scratch field for compilation passes; nothing in this struct interprets it */
   uint32_t pass_flags;

   /* non-owning views of the instruction's sources and results (the backing
    * storage is allocated where the instruction is created) */
   aco::span<Operand> operands;
   aco::span<Definition> definitions;

   /* The VOP1/VOP2/VOPC bits can be combined with the VOP3/DPP/SDWA bits
    * (see Format), so VALU detection needs masked comparisons rather than
    * straight equality. */
   constexpr bool isVALU() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP1) == (uint16_t) Format::VOP1
          || ((uint16_t) format & (uint16_t) Format::VOP2) == (uint16_t) Format::VOP2
          || ((uint16_t) format & (uint16_t) Format::VOPC) == (uint16_t) Format::VOPC
          || ((uint16_t) format & (uint16_t) Format::VOP3A) == (uint16_t) Format::VOP3A
          || ((uint16_t) format & (uint16_t) Format::VOP3B) == (uint16_t) Format::VOP3B
          || format == Format::VOP3P;
   }

   constexpr bool isSALU() const noexcept
   {
      return format == Format::SOP1 ||
             format == Format::SOP2 ||
             format == Format::SOPC ||
             format == Format::SOPK ||
             format == Format::SOPP;
   }

   /* buffer/image accesses; FLAT/GLOBAL/SCRATCH are checked separately */
   constexpr bool isVMEM() const noexcept
   {
      return format == Format::MTBUF ||
             format == Format::MUBUF ||
             format == Format::MIMG;
   }

   constexpr bool isDPP() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::DPP;
   }

   /* VOP3A and VOP3B alias the same format bit, so this catches both */
   constexpr bool isVOP3() const noexcept
   {
      return ((uint16_t) format & (uint16_t) Format::VOP3A) ||
             ((uint16_t) format & (uint16_t) Format::VOP3B);
   }

   constexpr bool isSDWA() const noexcept
   {
      return (uint16_t) format & (uint16_t) Format::SDWA;
   }

   constexpr bool isFlatOrGlobal() const noexcept
   {
      return format == Format::FLAT || format == Format::GLOBAL;
   }

   /* whether any input/output modifiers are in use; see the out-of-line
    * definition (it needs the derived instruction structs) */
   constexpr bool usesModifiers() const noexcept;

   /* true if any operand is fixed to the exec mask register */
   constexpr bool reads_exec() const noexcept
   {
      for (const Operand& op : operands) {
         if (op.isFixed() && op.physReg() == exec)
            return true;
      }
      return false;
   }
};
static_assert(sizeof(Instruction) == 16, "Unexpected padding");
| |
/* SOPK: scalar op with a 16-bit immediate in the encoding */
struct SOPK_instruction : public Instruction {
   uint16_t imm;
   uint16_t padding;
};
static_assert(sizeof(SOPK_instruction) == sizeof(Instruction) + 4, "Unexpected padding");

/* SOPP: scalar program-control op (branches, waitcnt, sendmsg, ...) */
struct SOPP_instruction : public Instruction {
   uint32_t imm;
   /* NOTE(review): looks like a block index used by branch instructions —
    * confirm against the branch emission/lowering code */
   int block;
};
static_assert(sizeof(SOPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/* the remaining scalar formats carry no extra encoding fields */
struct SOPC_instruction : public Instruction {
};
static_assert(sizeof(SOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP1_instruction : public Instruction {
};
static_assert(sizeof(SOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct SOP2_instruction : public Instruction {
};
static_assert(sizeof(SOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");
| |
/**
 * Scalar Memory Format:
 * For s_(buffer_)load_dword*:
 * Operand(0): SBASE - SGPR-pair which provides base address
 * Operand(1): Offset - immediate (un)signed offset or SGPR
 * Operand(2) / Definition(0): SDATA - SGPR for read / write result
 * Operand(n-1): SOffset - SGPR offset (Vega only)
 *
 * Having no operands is also valid for instructions such as s_dcache_inv.
 *
 */
struct SMEM_instruction : public Instruction {
   /* memory ordering/aliasing info for this access */
   memory_sync_info sync;
   bool glc : 1; /* VI+: globally coherent */
   bool dlc : 1; /* NAVI: device level coherent */
   bool nv : 1; /* VEGA only: Non-volatile */
   /* NOTE(review): presumably excludes this load from WQM — confirm with the
    * WQM pass */
   bool disable_wqm : 1;
   bool prevent_overflow : 1; /* avoid overflow when combining additions */
   uint32_t padding: 3;
};
static_assert(sizeof(SMEM_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
| |
/* plain VOP1/VOP2/VOPC carry no extra encoding fields */
struct VOP1_instruction : public Instruction {
};
static_assert(sizeof(VOP1_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOP2_instruction : public Instruction {
};
static_assert(sizeof(VOP2_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

struct VOPC_instruction : public Instruction {
};
static_assert(sizeof(VOPC_instruction) == sizeof(Instruction) + 0, "Unexpected padding");

/* VOP3A encoding: per-source input modifiers plus output modifiers */
struct VOP3A_instruction : public Instruction {
   bool abs[3]; /* absolute-value modifier, one per source */
   bool neg[3]; /* negate modifier, one per source */
   uint8_t opsel : 4; /* operand (half-)selection bits — see ISA docs */
   uint8_t omod : 2; /* output modifier */
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3A_instruction) == sizeof(Instruction) + 8, "Unexpected padding");

/* VOP3P encoding (packed 16-bit math): separate lo/hi modifiers */
struct VOP3P_instruction : public Instruction {
   bool neg_lo[3];
   bool neg_hi[3];
   uint8_t opsel_lo : 3;
   uint8_t opsel_hi : 3;
   bool clamp : 1;
   uint32_t padding : 9;
};
static_assert(sizeof(VOP3P_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
| |
/**
 * Data Parallel Primitives Format:
 * This format can be used for VOP1, VOP2 or VOPC instructions.
 * The swizzle applies to the src0 operand.
 *
 */
struct DPP_instruction : public Instruction {
   bool abs[2]; /* absolute-value modifier, one per source */
   bool neg[2]; /* negate modifier, one per source */
   uint16_t dpp_ctrl; /* DPP control value (swizzle selection) — see ISA docs */
   uint8_t row_mask : 4;
   uint8_t bank_mask : 4;
   bool bound_ctrl : 1;
   uint32_t padding : 7;
};
static_assert(sizeof(DPP_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
| |
/* SDWA source/destination selection. Encoding (derived from the masks and
 * flags below):
 *   bits [1:0]: byte/word index — or, with sdwa_isra, the size in bytes
 *   bit 2:      word-granularity selection (sdwa_isword)
 *   bit 3:      sign-extend (sdwa_sext)
 *   bit 4:      size is register-allocated (sdwa_isra)
 */
enum sdwa_sel : uint8_t {
    /* masks */
    sdwa_wordnum = 0x1,
    sdwa_bytenum = 0x3,
    sdwa_asuint = 0x7 | 0x10,
    sdwa_rasize = 0x3,

    /* flags */
    sdwa_isword = 0x4,
    sdwa_sext = 0x8,
    sdwa_isra = 0x10,

    /* specific values */
    sdwa_ubyte0 = 0,
    sdwa_ubyte1 = 1,
    sdwa_ubyte2 = 2,
    sdwa_ubyte3 = 3,
    sdwa_uword0 = sdwa_isword | 0,
    sdwa_uword1 = sdwa_isword | 1,
    sdwa_udword = 6,

    sdwa_sbyte0 = sdwa_ubyte0 | sdwa_sext,
    sdwa_sbyte1 = sdwa_ubyte1 | sdwa_sext,
    sdwa_sbyte2 = sdwa_ubyte2 | sdwa_sext,
    sdwa_sbyte3 = sdwa_ubyte3 | sdwa_sext,
    sdwa_sword0 = sdwa_uword0 | sdwa_sext,
    sdwa_sword1 = sdwa_uword1 | sdwa_sext,
    sdwa_sdword = sdwa_udword | sdwa_sext,

    /* register-allocated */
    sdwa_ubyte = 1 | sdwa_isra,
    sdwa_uword = 2 | sdwa_isra,
    sdwa_sbyte = sdwa_ubyte | sdwa_sext,
    sdwa_sword = sdwa_uword | sdwa_sext,
};
| |
| /** |
| * Sub-Dword Addressing Format: |
| * This format can be used for VOP1, VOP2 or VOPC instructions. |
| * |
| * omod and SGPR/constant operands are only available on GFX9+. For VOPC, |
| * the definition doesn't have to be VCC on GFX9+. |
| * |
| */ |
| struct SDWA_instruction : public Instruction { |
| /* these destination modifiers aren't available with VOPC except for |
| * clamp on GFX8 */ |
| uint8_t sel[2]; |
| uint8_t dst_sel; |
| bool neg[2]; |
| bool abs[2]; |
| bool dst_preserve : 1; |
| bool clamp : 1; |
| uint8_t omod : 2; /* GFX9+ */ |
| uint32_t padding : 4; |
| }; |
| static_assert(sizeof(SDWA_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
/* Vertex parameter interpolation (v_interp_*) used in fragment shaders. */
struct Interp_instruction : public Instruction {
   uint8_t attribute; /* index of the attribute (parameter) to interpolate */
   uint8_t component; /* component (x/y/z/w) of that attribute */
   uint16_t padding;
};
static_assert(sizeof(Interp_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
| |
| /** |
| * Local and Global Data Sharing instructions |
| * Operand(0): ADDR - VGPR which supplies the address. |
| * Operand(1): DATA0 - First data VGPR. |
| * Operand(2): DATA1 - Second data VGPR. |
| * Operand(n-1): M0 - LDS size. |
| * Definition(0): VDST - Destination VGPR when results returned to VGPRs. |
| * |
| */ |
| struct DS_instruction : public Instruction { |
| memory_sync_info sync; |
| bool gds; |
| int16_t offset0; |
| int8_t offset1; |
| uint8_t padding; |
| }; |
| static_assert(sizeof(DS_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * Vector Memory Untyped-buffer Instructions |
| * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) |
| * Operand(1): VADDR - Address source. Can carry an index and/or offset |
| * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) |
| * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data |
| * |
| */ |
| struct MUBUF_instruction : public Instruction { |
| memory_sync_info sync; |
| bool offen : 1; /* Supply an offset from VGPR (VADDR) */ |
| bool idxen : 1; /* Supply an index from VGPR (VADDR) */ |
| bool addr64 : 1; /* SI, CIK: Address size is 64-bit */ |
| bool glc : 1; /* globally coherent */ |
| bool dlc : 1; /* NAVI: device level coherent */ |
| bool slc : 1; /* system level coherent */ |
| bool tfe : 1; /* texture fail enable */ |
| bool lds : 1; /* Return read-data to LDS instead of VGPRs */ |
| bool disable_wqm : 1; /* Require an exec mask without helper invocations */ |
| uint16_t offset : 12; /* Unsigned byte offset - 12 bit */ |
| bool swizzled : 1; |
| uint32_t padding1 : 18; |
| }; |
| static_assert(sizeof(MUBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * Vector Memory Typed-buffer Instructions |
| * Operand(0): SRSRC - Specifies which SGPR supplies T# (resource constant) |
| * Operand(1): VADDR - Address source. Can carry an index and/or offset |
| * Operand(2): SOFFSET - SGPR to supply unsigned byte offset. (SGPR, M0, or inline constant) |
| * Operand(3) / Definition(0): VDATA - Vector GPR for write result / read data |
| * |
| */ |
| struct MTBUF_instruction : public Instruction { |
| memory_sync_info sync; |
| uint8_t dfmt : 4; /* Data Format of data in memory buffer */ |
| uint8_t nfmt : 3; /* Numeric format of data in memory */ |
| bool offen : 1; /* Supply an offset from VGPR (VADDR) */ |
| bool idxen : 1; /* Supply an index from VGPR (VADDR) */ |
| bool glc : 1; /* globally coherent */ |
| bool dlc : 1; /* NAVI: device level coherent */ |
| bool slc : 1; /* system level coherent */ |
| bool tfe : 1; /* texture fail enable */ |
| bool disable_wqm : 1; /* Require an exec mask without helper invocations */ |
| uint32_t padding : 10; |
| uint16_t offset; /* Unsigned byte offset - 12 bit */ |
| }; |
| static_assert(sizeof(MTBUF_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * Vector Memory Image Instructions |
| * Operand(0) SRSRC - Scalar GPR that specifies the resource constant. |
| * Operand(1): SSAMP - Scalar GPR that specifies sampler constant. |
| * or VDATA - Vector GPR for write data. |
| * Operand(2): VADDR - Address source. Can carry an offset or an index. |
| * Definition(0): VDATA - Vector GPR for read result. |
| * |
| */ |
| struct MIMG_instruction : public Instruction { |
| memory_sync_info sync; |
| uint8_t dmask; /* Data VGPR enable mask */ |
| uint8_t dim : 3; /* NAVI: dimensionality */ |
| bool unrm : 1; /* Force address to be un-normalized */ |
| bool dlc : 1; /* NAVI: device level coherent */ |
| bool glc : 1; /* globally coherent */ |
| bool slc : 1; /* system level coherent */ |
| bool tfe : 1; /* texture fail enable */ |
| bool da : 1; /* declare an array */ |
| bool lwe : 1; /* Force data to be un-normalized */ |
| bool r128 : 1; /* NAVI: Texture resource size */ |
| bool a16 : 1; /* VEGA, NAVI: Address components are 16-bits */ |
| bool d16 : 1; /* Convert 32-bit data to 16-bit data */ |
| bool disable_wqm : 1; /* Require an exec mask without helper invocations */ |
| uint32_t padding : 18; |
| }; |
| static_assert(sizeof(MIMG_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
| /** |
| * Flat/Scratch/Global Instructions |
| * Operand(0): ADDR |
| * Operand(1): SADDR |
| * Operand(2) / Definition(0): DATA/VDST |
| * |
| */ |
| struct FLAT_instruction : public Instruction { |
| memory_sync_info sync; |
| bool slc : 1; /* system level coherent */ |
| bool glc : 1; /* globally coherent */ |
| bool dlc : 1; /* NAVI: device level coherent */ |
| bool lds : 1; |
| bool nv : 1; |
| bool disable_wqm : 1; /* Require an exec mask without helper invocations */ |
| uint32_t padding0 : 2; |
| uint16_t offset; /* Vega/Navi only */ |
| uint16_t padding1; |
| }; |
| static_assert(sizeof(FLAT_instruction) == sizeof(Instruction) + 8, "Unexpected padding"); |
| |
/* Export (exp) instruction: writes shader results to a color/position/parameter target. */
struct Export_instruction : public Instruction {
   uint8_t enabled_mask; /* which components of the operands are written */
   uint8_t dest; /* export target (MRT/Z, position or parameter) */
   bool compressed : 1; /* data is 16-bit, packed two components per operand */
   bool done : 1; /* this is the final export of its kind in the shader */
   bool valid_mask : 1; /* hardware VM bit */
   uint32_t padding : 13;
};
static_assert(sizeof(Export_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
| |
/* Generic pseudo instruction (parallelcopies, phis, ...), lowered before assembly. */
struct Pseudo_instruction : public Instruction {
   PhysReg scratch_sgpr; /* might not be valid if it's not needed */
   bool tmp_in_scc; /* presumably: scc must be preserved via scratch_sgpr during lowering — see lower_to_hw_instr */
   uint8_t padding;
};
static_assert(sizeof(Pseudo_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
| |
/* Pseudo branch, lowered to a hardware branch late in compilation. */
struct Pseudo_branch_instruction : public Instruction {
   /* target[0] is the block index of the branch target.
    * For conditional branches, target[1] contains the fall-through alternative.
    * A value of 0 means the target has not been initialized (BB0 cannot be a branch target).
    */
   uint32_t target[2];
};
static_assert(sizeof(Pseudo_branch_instruction) == sizeof(Instruction) + 8, "Unexpected padding");
| |
/* Barrier pseudo instruction carrying memory and execution synchronization scopes. */
struct Pseudo_barrier_instruction : public Instruction {
   memory_sync_info sync; /* memory semantics the barrier enforces */
   sync_scope exec_scope; /* execution scope synchronized by the barrier */
};
static_assert(sizeof(Pseudo_barrier_instruction) == sizeof(Instruction) + 4, "Unexpected padding");
| |
/* Reduction operations, named <op><bitsize> with prefixes
 * i = signed integer, u = unsigned integer, f = float. */
enum ReduceOp : uint16_t {
   iadd8, iadd16, iadd32, iadd64,
   imul8, imul16, imul32, imul64,
   fadd16, fadd32, fadd64,
   fmul16, fmul32, fmul64,
   imin8, imin16, imin32, imin64,
   imax8, imax16, imax32, imax64,
   umin8, umin16, umin32, umin64,
   umax8, umax16, umax32, umax64,
   fmin16, fmin32, fmin64,
   fmax16, fmax32, fmax64,
   iand8, iand16, iand32, iand64,
   ior8, ior16, ior32, ior64,
   ixor8, ixor16, ixor32, ixor64,
};
| |
| /** |
| * Subgroup Reduction Instructions, everything except for the data to be |
| * reduced and the result as inserted by setup_reduce_temp(). |
| * Operand(0): data to be reduced |
| * Operand(1): reduce temporary |
| * Operand(2): vector temporary |
| * Definition(0): result |
| * Definition(1): scalar temporary |
| * Definition(2): scalar identity temporary (not used to store identity on GFX10) |
| * Definition(3): scc clobber |
| * Definition(4): vcc clobber |
| * |
| */ |
| struct Pseudo_reduction_instruction : public Instruction { |
| ReduceOp reduce_op; |
| uint16_t cluster_size; // must be 0 for scans |
| }; |
| static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, "Unexpected padding"); |
| |
/* Instructions are allocated with calloc() (see create_instruction()), so the
 * owning smart pointer must release them with free() rather than delete. */
struct instr_deleter_functor {
   void operator()(void* ptr)
   {
      free(ptr);
   }
};

/* Owning pointer type for instructions; frees the calloc'ed storage on destruction. */
template<typename T>
using aco_ptr = std::unique_ptr<T, instr_deleter_functor>;
| |
/* Allocates and zero-initializes an instruction of type T together with its
 * trailing Operand and Definition arrays in one calloc'ed allocation
 * (released by instr_deleter_functor via aco_ptr).
 * The operands/definitions spans are offset-based: they store a 16-bit byte
 * offset relative to the span object itself, pointing into the memory that
 * directly follows the instruction struct. */
template<typename T>
T* create_instruction(aco_opcode opcode, Format format, uint32_t num_operands, uint32_t num_definitions)
{
   std::size_t size = sizeof(T) + num_operands * sizeof(Operand) + num_definitions * sizeof(Definition);
   char *data = (char*) calloc(1, size); /* zeroed, so all flags/padding start at 0 */
   T* inst = (T*) data;

   inst->opcode = opcode;
   inst->format = format;

   /* operands live right after T; definitions right after the operands */
   uint16_t operands_offset = data + sizeof(T) - (char*)&inst->operands;
   inst->operands = aco::span<Operand>(operands_offset, num_operands);
   uint16_t definitions_offset = (char*)inst->operands.end() - (char*)&inst->definitions;
   inst->definitions = aco::span<Definition>(definitions_offset, num_definitions);

   return inst;
}
| |
/* Returns true if the instruction uses any input/output modifiers
 * (neg/abs/opsel/omod/clamp) or a modifier-implying format (DPP/SDWA). */
constexpr bool Instruction::usesModifiers() const noexcept
{
   /* DPP and SDWA always carry swizzle/selection state */
   if (isDPP() || isSDWA())
      return true;

   if (format == Format::VOP3P) {
      const VOP3P_instruction *vop3p = static_cast<const VOP3P_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3p->neg_lo[i] || vop3p->neg_hi[i])
            return true;
      }
      return vop3p->opsel_lo || vop3p->opsel_hi || vop3p->clamp;
   } else if (isVOP3()) {
      const VOP3A_instruction *vop3 = static_cast<const VOP3A_instruction*>(this);
      for (unsigned i = 0; i < operands.size(); i++) {
         if (vop3->abs[i] || vop3->neg[i])
            return true;
      }
      return vop3->opsel || vop3->clamp || vop3->omod;
   }
   return false;
}
| |
| constexpr bool is_phi(Instruction* instr) |
| { |
| return instr->opcode == aco_opcode::p_phi || instr->opcode == aco_opcode::p_linear_phi; |
| } |
| |
/* Convenience overload of is_phi() for owned instruction pointers. */
static inline bool is_phi(aco_ptr<Instruction>& instr)
{
   return is_phi(instr.get());
}
| |
/* Returns the memory synchronization info of the given instruction. */
memory_sync_info get_sync_info(const Instruction* instr);

/* Whether "instr" can be removed; "uses" is presumably a use count indexed by temp id. */
bool is_dead(const std::vector<uint16_t>& uses, Instruction *instr);

/* Whether opsel can select the high half of operand "idx" of opcode "op" on "chip". */
bool can_use_opsel(chip_class chip, aco_opcode op, int idx, bool high);
/* Whether the instruction can be represented in the SDWA format on "chip". */
bool can_use_SDWA(chip_class chip, const aco_ptr<Instruction>& instr);
/* updates "instr" and returns the old instruction (or NULL if no update was needed) */
aco_ptr<Instruction> convert_to_SDWA(chip_class chip, aco_ptr<Instruction>& instr);
| |
/* Flags describing the role of a basic block in the CFG (stored in Block::kind). */
enum block_kind {
   /* uniform indicates that leaving this block,
    * all actives lanes stay active */
   block_kind_uniform = 1 << 0,
   block_kind_top_level = 1 << 1,
   block_kind_loop_preheader = 1 << 2,
   block_kind_loop_header = 1 << 3,
   block_kind_loop_exit = 1 << 4,
   block_kind_continue = 1 << 5,
   block_kind_break = 1 << 6,
   block_kind_continue_or_break = 1 << 7,
   block_kind_discard = 1 << 8,
   block_kind_branch = 1 << 9,
   block_kind_merge = 1 << 10,
   block_kind_invert = 1 << 11,
   block_kind_uses_discard_if = 1 << 12,
   block_kind_needs_lowering = 1 << 13,
   block_kind_uses_demote = 1 << 14,
   block_kind_export_end = 1 << 15,
};
| |
| |
| struct RegisterDemand { |
| constexpr RegisterDemand() = default; |
| constexpr RegisterDemand(const int16_t v, const int16_t s) noexcept |
| : vgpr{v}, sgpr{s} {} |
| int16_t vgpr = 0; |
| int16_t sgpr = 0; |
| |
| constexpr friend bool operator==(const RegisterDemand a, const RegisterDemand b) noexcept { |
| return a.vgpr == b.vgpr && a.sgpr == b.sgpr; |
| } |
| |
| constexpr bool exceeds(const RegisterDemand other) const noexcept { |
| return vgpr > other.vgpr || sgpr > other.sgpr; |
| } |
| |
| constexpr RegisterDemand operator+(const Temp t) const noexcept { |
| if (t.type() == RegType::sgpr) |
| return RegisterDemand( vgpr, sgpr + t.size() ); |
| else |
| return RegisterDemand( vgpr + t.size(), sgpr ); |
| } |
| |
| constexpr RegisterDemand operator+(const RegisterDemand other) const noexcept { |
| return RegisterDemand(vgpr + other.vgpr, sgpr + other.sgpr); |
| } |
| |
| constexpr RegisterDemand operator-(const RegisterDemand other) const noexcept { |
| return RegisterDemand(vgpr - other.vgpr, sgpr - other.sgpr); |
| } |
| |
| constexpr RegisterDemand& operator+=(const RegisterDemand other) noexcept { |
| vgpr += other.vgpr; |
| sgpr += other.sgpr; |
| return *this; |
| } |
| |
| constexpr RegisterDemand& operator-=(const RegisterDemand other) noexcept { |
| vgpr -= other.vgpr; |
| sgpr -= other.sgpr; |
| return *this; |
| } |
| |
| constexpr RegisterDemand& operator+=(const Temp t) noexcept { |
| if (t.type() == RegType::sgpr) |
| sgpr += t.size(); |
| else |
| vgpr += t.size(); |
| return *this; |
| } |
| |
| constexpr RegisterDemand& operator-=(const Temp t) noexcept { |
| if (t.type() == RegType::sgpr) |
| sgpr -= t.size(); |
| else |
| vgpr -= t.size(); |
| return *this; |
| } |
| |
| constexpr void update(const RegisterDemand other) noexcept { |
| vgpr = std::max(vgpr, other.vgpr); |
| sgpr = std::max(sgpr, other.sgpr); |
| } |
| |
| }; |
| |
/* CFG: a basic block of the program. */
struct Block {
   float_mode fp_mode; /* floating-point behavior for instructions in this block */
   unsigned index; /* index of this block in Program::blocks */
   unsigned offset = 0;
   std::vector<aco_ptr<Instruction>> instructions;
   /* predecessor/successor block indices in the logical (divergence-aware)
    * and linear (physical control flow) CFGs */
   std::vector<unsigned> logical_preds;
   std::vector<unsigned> linear_preds;
   std::vector<unsigned> logical_succs;
   std::vector<unsigned> linear_succs;
   RegisterDemand register_demand = RegisterDemand(); /* register demand of this block */
   uint16_t loop_nest_depth = 0; /* how deeply this block is nested in loops */
   uint16_t kind = 0; /* mask of block_kind flags */
   int logical_idom = -1; /* immediate dominator in the logical CFG (-1 = none) */
   int linear_idom = -1; /* immediate dominator in the linear CFG (-1 = none) */
   Temp live_out_exec = Temp();

   /* this information is needed for predecessors to blocks with phis when
    * moving out of ssa */
   bool scc_live_out = false;
   PhysReg scratch_sgpr = PhysReg(); /* only needs to be valid if scc_live_out != false */

   Block(unsigned idx) : index(idx) {}
   Block() : index(0) {}
};
| |
/* A Stage is a bitmask combining the software (API-level) shader stages that
 * were merged into this shader with the hardware stage it executes as. */
using Stage = uint16_t;

/* software stages */
static constexpr Stage sw_vs = 1 << 0;
static constexpr Stage sw_gs = 1 << 1;
static constexpr Stage sw_tcs = 1 << 2;
static constexpr Stage sw_tes = 1 << 3;
static constexpr Stage sw_fs = 1 << 4;
static constexpr Stage sw_cs = 1 << 5;
static constexpr Stage sw_gs_copy = 1 << 6;
static constexpr Stage sw_mask = 0x7f;

/* hardware stages (can't be OR'd, just a mask for convenience when testing multiple) */
static constexpr Stage hw_vs = 1 << 7;
static constexpr Stage hw_es = 1 << 8; /* Export shader: pre-GS (VS or TES) on GFX6-8. Combined into GS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_gs = 1 << 9; /* Geometry shader on GFX10/legacy and GFX6-9. */
static constexpr Stage hw_ngg_gs = 1 << 10; /* Geometry shader on GFX10/NGG. */
static constexpr Stage hw_ls = 1 << 11; /* Local shader: pre-TCS (VS) on GFX6-8. Combined into HS on GFX9 (and GFX10/legacy). */
static constexpr Stage hw_hs = 1 << 12; /* Hull shader: TCS on GFX6-8. Merged VS and TCS on GFX9-10. */
static constexpr Stage hw_fs = 1 << 13;
static constexpr Stage hw_cs = 1 << 14;
static constexpr Stage hw_mask = 0xff << 7;

/* possible settings of Program::stage */
static constexpr Stage vertex_vs = sw_vs | hw_vs;
static constexpr Stage fragment_fs = sw_fs | hw_fs;
static constexpr Stage compute_cs = sw_cs | hw_cs;
static constexpr Stage tess_eval_vs = sw_tes | hw_vs;
static constexpr Stage gs_copy_vs = sw_gs_copy | hw_vs;
/* GFX10/NGG */
static constexpr Stage ngg_vertex_gs = sw_vs | hw_ngg_gs;
static constexpr Stage ngg_vertex_geometry_gs = sw_vs | sw_gs | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_gs = sw_tes | hw_ngg_gs;
static constexpr Stage ngg_tess_eval_geometry_gs = sw_tes | sw_gs | hw_ngg_gs;
/* GFX9 (and GFX10 if NGG isn't used) */
static constexpr Stage vertex_geometry_gs = sw_vs | sw_gs | hw_gs;
static constexpr Stage vertex_tess_control_hs = sw_vs | sw_tcs | hw_hs;
static constexpr Stage tess_eval_geometry_gs = sw_tes | sw_gs | hw_gs;
/* pre-GFX9 */
static constexpr Stage vertex_ls = sw_vs | hw_ls; /* vertex before tesselation control */
static constexpr Stage vertex_es = sw_vs | hw_es; /* vertex before geometry */
static constexpr Stage tess_control_hs = sw_tcs | hw_hs;
static constexpr Stage tess_eval_es = sw_tes | hw_es; /* tesselation evaluation before geometry */
static constexpr Stage geometry_gs = sw_gs | hw_gs;
| |
/* Shader statistics gathered when Program::collect_statistics is enabled. */
enum statistic {
   statistic_hash,
   statistic_instructions,
   statistic_copies,
   statistic_branches,
   statistic_cycles,
   statistic_vmem_clauses,
   statistic_smem_clauses,
   statistic_vmem_score,
   statistic_smem_score,
   statistic_sgpr_presched,
   statistic_vgpr_presched,
   num_statistics
};
| |
/* A shader program: the list of basic blocks plus the target/stage state
 * shared by all compilation passes. */
class Program final {
public:
   float_mode next_fp_mode; /* fp_mode assigned to blocks created from now on */
   std::vector<Block> blocks;
   RegisterDemand max_reg_demand = RegisterDemand(); /* maximum demand over all blocks */
   uint16_t num_waves = 0;
   uint16_t max_waves = 0; /* maximum number of waves, regardless of register usage */
   ac_shader_config* config;
   struct radv_shader_info *info;
   enum chip_class chip_class;
   enum radeon_family family;
   unsigned wave_size; /* lanes per wave */
   RegClass lane_mask; /* register class used for lane masks (depends on wave_size) */
   Stage stage; /* Stage: software/hardware stage combination */
   bool needs_exact = false; /* there exists an instruction with disable_wqm = true */
   bool needs_wqm = false; /* there exists a p_wqm instruction */
   bool wb_smem_l1_on_end = false;

   std::vector<uint8_t> constant_data; /* constants emitted alongside the code */
   Temp private_segment_buffer;
   Temp scratch_offset;

   /* target register/LDS budget information */
   uint16_t min_waves = 0; /* lower bound on waves (computed by calc_min_waves()) */
   uint16_t lds_alloc_granule;
   uint32_t lds_limit; /* in bytes */
   bool has_16bank_lds;
   uint16_t vgpr_limit;
   uint16_t sgpr_limit;
   uint16_t physical_sgprs;
   uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
   uint16_t vgpr_alloc_granule; /* minus one. must be power of two */
   unsigned workgroup_size; /* if known; otherwise UINT_MAX */

   bool xnack_enabled = false;
   bool sram_ecc_enabled = false;
   bool has_fast_fma32 = false;

   bool needs_vcc = false;
   bool needs_flat_scr = false;

   bool collect_statistics = false; /* if set, passes fill in "statistics" */
   uint32_t statistics[num_statistics];

   /* callback used to report compiler messages back to the driver */
   struct {
      void (*func)(void *private_data,
                   enum radv_compiler_debug_level level,
                   const char *message);
      void *private_data;
   } debug;

   /* Returns a fresh temp id; ids are limited to 24 bits. */
   uint32_t allocateId()
   {
      assert(allocationID <= 16777215);
      return allocationID++;
   }

   /* Returns the id the next allocateId() call would hand out. */
   uint32_t peekAllocationId()
   {
      return allocationID;
   }

   void setAllocationId(uint32_t id)
   {
      allocationID = id;
   }

   /* Appends a new empty block (inheriting next_fp_mode) and returns it. */
   Block* create_and_insert_block() {
      blocks.emplace_back(blocks.size());
      blocks.back().fp_mode = next_fp_mode;
      return &blocks.back();
   }

   /* Moves "block" to the end of the block list, fixing up its index. */
   Block* insert_block(Block&& block) {
      block.index = blocks.size();
      block.fp_mode = next_fp_mode;
      blocks.emplace_back(std::move(block));
      return &blocks.back();
   }

private:
   uint32_t allocationID = 1; /* next unused temp id; starts at 1 */
};
| |
/* Hash functor for Temp: temp ids are unique, so the id itself is a perfect hash. */
struct TempHash {
   std::size_t operator()(Temp t) const {
      return t.id();
   }
};
using TempSet = std::unordered_set<Temp, TempHash>;
| |
/* Result of liveness analysis (see live_var_analysis()). */
struct live {
   /* live temps out per block */
   std::vector<TempSet> live_out;
   /* register demand (sgpr/vgpr) per instruction per block */
   std::vector<std::vector<RegisterDemand>> register_demand;
};
| |
/* One-time global initialization of the compiler. */
void init();

/* Initializes "program" for the given stage and target. */
void init_program(Program *program, Stage stage, struct radv_shader_info *info,
                  enum chip_class chip_class, enum radeon_family family,
                  ac_shader_config *config);

/* Instruction selection: builds the ACO IR from one or more NIR shaders. */
void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args);
/* Instruction selection for the GS copy shader. */
void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args);
/* Instruction selection for the trap handler shader. */
void select_trap_handler_shader(Program *program, struct nir_shader *shader,
                                ac_shader_config* config,
                                struct radv_shader_args *args);
| |
/* The compilation passes, roughly in the order they are run: */
void lower_wqm(Program* program, live& live_vars,
               const struct radv_nir_compiler_options *options);
void lower_phis(Program* program);
/* Computes Program::min_waves from the target's resource budget. */
void calc_min_waves(Program* program);
/* Recomputes wave counts after the register demand changed. */
void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
live live_var_analysis(Program* program, const struct radv_nir_compiler_options *options);
/* Returns use counts (presumably indexed by temp id) consumed by is_dead(). */
std::vector<uint16_t> dead_code_analysis(Program *program);
void dominator_tree(Program* program);
void insert_exec_mask(Program *program);
void value_numbering(Program* program);
void optimize(Program* program);
void setup_reduce_temp(Program* program);
void lower_to_cssa(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block);
void ssa_elimination(Program* program);
void lower_to_hw_instr(Program* program);
void schedule_program(Program* program, live& live_vars);
void spill(Program* program, live& live_vars, const struct radv_nir_compiler_options *options);
void insert_wait_states(Program* program);
void insert_NOPs(Program* program);
/* Assembles the program into "code"; the return value feeds print_asm()'s exec_size. */
unsigned emit_program(Program* program, std::vector<uint32_t>& code);
void print_asm(Program *program, std::vector<uint32_t>& binary,
               unsigned exec_size, std::ostream& out);
bool validate_ir(Program* program);
bool validate_ra(Program* program, const struct radv_nir_compiler_options *options);
#ifndef NDEBUG
void perfwarn(Program *program, bool cond, const char *msg, Instruction *instr=NULL);
#else
#define perfwarn(program, cond, msg, ...) do {} while(0)
#endif
| |
/* Statistic collection at various points in the pipeline (see enum statistic). */
void collect_presched_stats(Program *program);
void collect_preasm_stats(Program *program);
void collect_postasm_stats(Program *program, const std::vector<uint32_t>& code);

/* Debug printing of the IR. */
void aco_print_instr(const Instruction *instr, FILE *output);
void aco_print_program(const Program *program, FILE *output);

/* Reporting helpers; use the aco_perfwarn/aco_err macros below so file and
 * line information is filled in automatically. */
void _aco_perfwarn(Program *program, const char *file, unsigned line,
                   const char *fmt, ...);
void _aco_err(Program *program, const char *file, unsigned line,
              const char *fmt, ...);

#define aco_perfwarn(program, ...) _aco_perfwarn(program, __FILE__, __LINE__, __VA_ARGS__)
#define aco_err(program, ...) _aco_err(program, __FILE__, __LINE__, __VA_ARGS__)
| |
| /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ |
| uint16_t get_extra_sgprs(Program *program); |
| |
| /* get number of sgprs/vgprs allocated required to address a number of sgprs/vgprs */ |
| uint16_t get_sgpr_alloc(Program *program, uint16_t addressable_sgprs); |
| uint16_t get_vgpr_alloc(Program *program, uint16_t addressable_vgprs); |
| |
| /* return number of addressable sgprs/vgprs for max_waves */ |
| uint16_t get_addr_sgpr_from_waves(Program *program, uint16_t max_waves); |
| uint16_t get_addr_vgpr_from_waves(Program *program, uint16_t max_waves); |
| |
/* Static per-opcode information tables: hardware opcode numbers per chip
 * generation, modifier support, names, formats and operand/definition sizes. */
typedef struct {
   const int16_t opcode_gfx7[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx9[static_cast<int>(aco_opcode::num_opcodes)];
   const int16_t opcode_gfx10[static_cast<int>(aco_opcode::num_opcodes)];
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_input_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> can_use_output_modifiers;
   const std::bitset<static_cast<int>(aco_opcode::num_opcodes)> is_atomic;
   const char *name[static_cast<int>(aco_opcode::num_opcodes)];
   const aco::Format format[static_cast<int>(aco_opcode::num_opcodes)];
   /* sizes used for input/output modifiers and constants */
   const unsigned operand_size[static_cast<int>(aco_opcode::num_opcodes)];
   const unsigned definition_size[static_cast<int>(aco_opcode::num_opcodes)];
} Info;

extern const Info instr_info;
| |
| } |
| |
| #endif /* ACO_IR_H */ |
| |