|  | #ifndef CAFFE2_CUDA_RTC_COMMON_RTC_H_ | 
|  | #define CAFFE2_CUDA_RTC_COMMON_RTC_H_ | 
|  |  | 
|  | #include <sstream> | 
|  | #include <string> | 
|  |  | 
|  | #include <cuda.h> | 
|  | #include <nvrtc.h> | 
|  |  | 
|  | #define NVRTC_CHECK(condition)                                          \ | 
|  | do {                                                                  \ | 
|  | nvrtcResult result = condition;                                     \ | 
|  | if (result != NVRTC_SUCCESS) {                                      \ | 
|  | LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": " \ | 
|  | << nvrtcGetErrorString(result);                        \ | 
|  | }                                                                   \ | 
|  | } while (0) | 
|  |  | 
|  | namespace caffe2 { | 
|  |  | 
|  | template <typename Derived> | 
|  | class CudaRTCFunction { | 
|  | public: | 
|  | CudaRTCFunction() : module_loaded_(false) {} | 
|  | ~CudaRTCFunction() { | 
|  | if (module_loaded_) { | 
|  | CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_)); | 
|  | } | 
|  | } | 
|  |  | 
|  | // TODO: this function is nontrivial and since CudaRTCFunction uses CRTP, it | 
|  | // may potentially increase the binary size. In that case, move common parts | 
|  | // into a separate function. | 
|  | template <typename... Args> | 
|  | void Compile(Args... args) { | 
|  | string src = static_cast<Derived*>(this)->GetSource(args...); | 
|  | string name = static_cast<Derived*>(this)->KernelName(args...); | 
|  | VLOG(1) << "function name: " << name; | 
|  | VLOG(1) << "function src:\n" << src; | 
|  | // Actually do the compiling. | 
|  | nvrtcProgram prog; | 
|  | NVRTC_CHECK( | 
|  | nvrtcCreateProgram(&prog, src.c_str(), nullptr, 0, nullptr, nullptr)); | 
|  | // Compile the program. | 
|  | // TODO(Yangqing): how to find the current gpu architecture instead of hard | 
|  | // coding it? | 
|  | const char* nvrtc_opts[] = { | 
|  | "--gpu-architecture=compute_35", "--use_fast_math"}; | 
|  | nvrtcResult compile_result = nvrtcCompileProgram(prog, 2, nvrtc_opts); | 
|  | if (compile_result != NVRTC_SUCCESS) { | 
|  | size_t log_size; | 
|  | NVRTC_CHECK(nvrtcGetProgramLogSize(prog, &log_size)); | 
|  | vector<char> nvrtc_log(log_size); | 
|  | NVRTC_CHECK(nvrtcGetProgramLog(prog, nvrtc_log.data())); | 
|  | LOG(FATAL) << "Compilation failure for nvrtc(" | 
|  | << nvrtcGetErrorString(compile_result) << "): \n" | 
|  | << nvrtc_log.data(); | 
|  | } | 
|  | size_t ptx_size; | 
|  | NVRTC_CHECK(nvrtcGetPTXSize(prog, &ptx_size)); | 
|  | vector<char> nvrtc_ptx(ptx_size); | 
|  | NVRTC_CHECK(nvrtcGetPTX(prog, nvrtc_ptx.data())); | 
|  | NVRTC_CHECK(nvrtcDestroyProgram(&prog)); | 
|  | // After compilation, load the module. | 
|  | if (module_loaded_) { | 
|  | CUDA_DRIVERAPI_ENFORCE(cuModuleUnload(module_)); | 
|  | } | 
|  | CUDA_DRIVERAPI_ENFORCE( | 
|  | cuModuleLoadDataEx(&module_, nvrtc_ptx.data(), 0, 0, 0)); | 
|  | module_loaded_ = true; | 
|  | CUDA_DRIVERAPI_ENFORCE( | 
|  | cuModuleGetFunction(&kernel_, module_, name.c_str())); | 
|  | } | 
|  |  | 
|  | template <typename... Args> | 
|  | void Launch( | 
|  | unsigned int gx, | 
|  | unsigned int gy, | 
|  | unsigned int gz, | 
|  | unsigned int bx, | 
|  | unsigned int by, | 
|  | unsigned int bz, | 
|  | unsigned int shared_mem, | 
|  | cudaStream_t stream, | 
|  | Args... args) { | 
|  | CAFFE_ENFORCE( | 
|  | module_loaded_, "Cannot call Launch before a module is loaded."); | 
|  | void* args_voidp[] = {&args...}; | 
|  | CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel( | 
|  | kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, args_voidp, 0)); | 
|  | } | 
|  |  | 
|  | void LaunchEx( | 
|  | unsigned int gx, | 
|  | unsigned int gy, | 
|  | unsigned int gz, | 
|  | unsigned int bx, | 
|  | unsigned int by, | 
|  | unsigned int bz, | 
|  | unsigned int shared_mem, | 
|  | cudaStream_t stream, | 
|  | void** extra) { | 
|  | CAFFE_ENFORCE( | 
|  | module_loaded_, "Cannot call Launch before a module is loaded."); | 
|  | CUDA_DRIVERAPI_ENFORCE(cuLaunchKernel( | 
|  | kernel_, gx, gy, gz, bx, by, bz, shared_mem, stream, nullptr, extra)); | 
|  | } | 
|  |  | 
|  | private: | 
|  | bool module_loaded_; | 
|  | CUmodule module_; | 
|  | CUfunction kernel_; | 
|  | }; | 
|  |  | 
|  | // TODO: this is in no way unique and is just a hack right now. | 
|  | inline std::string GetUniqueName() { | 
|  | static constexpr int len = 20; | 
|  | static const char alpha[] = | 
|  | "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; | 
|  |  | 
|  | std::stringstream ss; | 
|  | ss << "_cuda_kernel_"; | 
|  | for (const auto i : c10::irange(len)) { | 
|  | ss << alpha[rand() % (sizeof(alpha) - 1)]; | 
|  | } | 
|  | return ss.str(); | 
|  | } | 
|  |  | 
|  | } // namespace caffe2 | 
|  |  | 
|  | #endif // CAFFE2_CUDA_RTC_COMMON_RTC_H_ |