| #ifndef CAFFE2_UTILS_GPU_DEFS_H_ |
| #define CAFFE2_UTILS_GPU_DEFS_H_ |
| |
| #include <cuda_runtime.h> |
| |
| namespace caffe2 { |
| |
| // Static definition of GPU warp size for unrolling and code generation |
| |
| #ifdef __CUDA_ARCH__ |
| #if __CUDA_ARCH__ <= 800 |
| constexpr int kWarpSize = 32; |
| #else |
| #error Unknown __CUDA_ARCH__; please define parameters for compute capability |
| #endif // __CUDA_ARCH__ types |
| #elif defined(__HIP_PLATFORM_HCC__) |
| constexpr int kWarpSize = warpSize; // = 64 (Defined in hip_runtime.h) |
| #else |
| // dummy value for host compiler |
| constexpr int kWarpSize = 32; |
| #endif // __CUDA_ARCH__ |
| |
| // |
| // Interfaces to PTX instructions for which there appears to be no |
| // intrinsic |
| // |
| |
| template <typename T> |
| struct Bitfield {}; |
| |
| template <> |
| struct Bitfield<unsigned int> { |
| static __device__ __forceinline__ |
| unsigned int getBitfield(unsigned int val, int pos, int len) { |
| #if defined(__HIP_PLATFORM_HCC__) |
| pos &= 0xff; |
| len &= 0xff; |
| |
| unsigned int m = (1u << len) - 1u; |
| return (val >> pos) & m; |
| #else |
| unsigned int ret; |
| asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(val), "r"(pos), "r"(len)); |
| return ret; |
| #endif // __HIP_PLATFORM_HCC__ |
| } |
| |
| static __device__ __forceinline__ |
| unsigned int setBitfield(unsigned int val, unsigned int toInsert, int pos, int len) { |
| #if defined(__HIP_PLATFORM_HCC__) |
| pos &= 0xff; |
| len &= 0xff; |
| |
| unsigned int m = (1u << len) - 1u; |
| toInsert &= m; |
| toInsert <<= pos; |
| m <<= pos; |
| |
| return (val & ~m) | toInsert; |
| #else |
| unsigned int ret; |
| asm("bfi.b32 %0, %1, %2, %3, %4;" : |
| "=r"(ret) : "r"(toInsert), "r"(val), "r"(pos), "r"(len)); |
| return ret; |
| #endif // __HIP_PLATFORM_HCC__ |
| } |
| }; |
| |
| template <> |
| struct Bitfield<unsigned long long int> { |
| static __device__ __forceinline__ |
| unsigned long long int getBitfield(unsigned long long int val, int pos, int len) { |
| #if defined(__HIP_PLATFORM_HCC__) |
| pos &= 0xff; |
| len &= 0xff; |
| |
| unsigned long long int m = (1u << len) - 1u; |
| return (val >> pos) & m; |
| #else |
| unsigned long long int ret; |
| asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); |
| return ret; |
| #endif // __HIP_PLATFORM_HCC__ |
| } |
| |
| static __device__ __forceinline__ |
| unsigned long long int setBitfield(unsigned long long int val, unsigned long long int toInsert, int pos, int len) { |
| #if defined(__HIP_PLATFORM_HCC__) |
| pos &= 0xff; |
| len &= 0xff; |
| |
| unsigned long long int m = (1u << len) - 1u; |
| toInsert &= m; |
| toInsert <<= pos; |
| m <<= pos; |
| |
| return (val & ~m) | toInsert; |
| #else |
| unsigned long long int ret; |
| asm("bfi.b64 %0, %1, %2, %3, %4;" : |
| "=l"(ret) : "l"(toInsert), "l"(val), "r"(pos), "r"(len)); |
| return ret; |
| #endif // __HIP_PLATFORM_HCC__ |
| } |
| }; |
| |
| __device__ __forceinline__ int getLaneId() { |
| #if defined(__HIP_PLATFORM_HCC__) |
| return __lane_id(); |
| #else |
| int laneId; |
| asm("mov.s32 %0, %%laneid;" : "=r"(laneId) ); |
| return laneId; |
| #endif // __HIP_PLATFORM_HCC__ |
| } |
| |
| #if defined(__HIP_PLATFORM_HCC__) |
| __device__ __forceinline__ unsigned long long int getLaneMaskLt() { |
| unsigned long long int m = (1ull << getLaneId()) - 1ull; |
| return m; |
| } |
| |
| __device__ __forceinline__ unsigned long long int getLaneMaskLe() { |
| unsigned long long int m = UINT64_MAX >> (sizeof(std::uint64_t) * CHAR_BIT - (getLaneId() + 1)); |
| return m; |
| } |
| |
| __device__ __forceinline__ unsigned long long int getLaneMaskGt() { |
| unsigned long long int m = getLaneMaskLe(); |
| return m ? ~m : m; |
| } |
| |
| __device__ __forceinline__ unsigned long long int getLaneMaskGe() { |
| unsigned long long int m = getLaneMaskLt(); |
| return ~m; |
| } |
| #else |
| __device__ __forceinline__ unsigned getLaneMaskLt() { |
| unsigned mask; |
| asm("mov.u32 %0, %%lanemask_lt;" : "=r"(mask)); |
| return mask; |
| } |
| |
| __device__ __forceinline__ unsigned getLaneMaskLe() { |
| unsigned mask; |
| asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); |
| return mask; |
| } |
| |
| __device__ __forceinline__ unsigned getLaneMaskGt() { |
| unsigned mask; |
| asm("mov.u32 %0, %%lanemask_gt;" : "=r"(mask)); |
| return mask; |
| } |
| |
| __device__ __forceinline__ unsigned getLaneMaskGe() { |
| unsigned mask; |
| asm("mov.u32 %0, %%lanemask_ge;" : "=r"(mask)); |
| return mask; |
| } |
| #endif // __HIP_PLATFORM_HCC__ |
| |
| } // namespace caffe2 |
| |
| #endif // CAFFE2_UTILS_GPU_DEFS_H_ |