|  | #include "caffe2/core/common_gpu.h" | 
|  | #include "caffe2/core/context_gpu.h" | 
|  | #include "caffe2/core/operator.h" | 
|  | #include "caffe2/cuda_rtc/common_rtc.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  | namespace { | 
// An NVRTC-compiled function whose CUDA source is generated at runtime from a
// caller-provided elementwise statement (see the GetSource specialization in
// this file). Each instance gets a unique kernel name via GetUniqueName() so
// multiple independently-compiled kernels can coexist.
class ElementwiseRTCFunction : public CudaRTCFunction<ElementwiseRTCFunction> {
 public:
  ElementwiseRTCFunction() : CudaRTCFunction(), name_(GetUniqueName()) {}

  // Returns the kernel name fixed at construction time; all arguments are
  // ignored (the variadic signature exists to satisfy the CRTP base's calling
  // convention).
  template <typename... Args>
  string KernelName(Args... /*args*/) {
    return name_;
  }

  // Produces the CUDA source string for the kernel. Only the
  // (int input_size, int output_size, const string command_string)
  // specialization is defined.
  template <typename... Args>
  string GetSource(Args... args);

 private:
  // Unique, per-instance kernel name emitted into the generated source.
  string name_;
};
|  |  | 
|  | template <> | 
|  | string ElementwiseRTCFunction::GetSource( | 
|  | int input_size, | 
|  | int output_size, | 
|  | const string command_string) { | 
|  | std::stringstream ss; | 
|  | ss << "extern \"C\" __global__ void " << name_ | 
|  | << "(const size_t nthreads, \n"; | 
|  | // Insert the parameter list. | 
|  | int remain_params = input_size + output_size; | 
|  | for (int i = 0; i < input_size; ++i) { | 
|  | ss << "const float* in" << i << ((remain_params--) ? ", \n" : ""); | 
|  | } | 
|  | for (int i = 0; i < output_size; ++i) { | 
|  | ss << "float* out" << i << ((remain_params--) ? ", \n" : ""); | 
|  | } | 
|  | ss << ") {\n" | 
|  | "for (int index = blockIdx.x * blockDim.x + threadIdx.x;\n" | 
|  | "index < nthreads; index += blockDim.x * gridDim.x) {\n" | 
|  | << command_string << "\n" | 
|  | << "}\n}"; | 
|  | return ss.str(); | 
|  | } | 
|  | } // namespace | 
|  |  | 
|  | /** | 
|  | * A GPU operator that can generate limited elementwise operations. | 
|  | * | 
|  | * ElementwiseRTCOp allows one to do a simple and limited thing: it takes in | 
|  | * multiple inputs and multiple outputs, as well as a raw string argument | 
|  | * rtc_src. The runtime then generates the following kernel code: | 
|  | * | 
|  | *   __global__ void kernel_name(const size_t nthreads, ...) { | 
|  | *     for(int index = blockIdx.x * blockDim.x + threadIdx.x; | 
|  | *         index < nthreads; index += blockDim.x * gridDim.x) { | 
|  | *       rtc_src | 
|  | *     } | 
|  | *   } | 
|  | * where the "..." part is auto generated, so one can refer to the input and | 
|  | * output as in0, in1, ..., out0, out1... in the rtc_src string. | 
|  | * | 
|  | * For example, if one wants to do a vector multiplication, one can take two | 
 * inputs and one output, and write rtc_src as
|  | *     out0[index] = in0[index] * in1[index]; | 
|  | * | 
|  | * This op is currently highly experimental. We do not have a gradient | 
|  | * registered for it either. | 
|  | */ | 
|  | class ElementwiseRTCOp final : public Operator<CUDAContext> { | 
|  | public: | 
|  | ElementwiseRTCOp(const OperatorDef& operator_def, Workspace* ws) | 
|  | : Operator<CUDAContext>(operator_def, ws) { | 
|  | const string src = OperatorBase::GetSingleArgument<string>("rtc_src", ""); | 
|  | CAFFE_ENFORCE(src.size(), "Op should have a non-zero source code size."); | 
|  | func_.Compile(InputSize(), OutputSize(), src); | 
|  | } | 
|  | ~ElementwiseRTCOp() override {} | 
|  |  | 
|  | bool RunOnDevice() override { | 
|  | static_assert( | 
|  | sizeof(void*) == sizeof(size_t), | 
|  | "The argbuffer relies on the assumption that void* and " | 
|  | "size_t have the same size."); | 
|  | vector<size_t> argBuffer_vec(InputSize() + OutputSize() + 1); | 
|  | size_t* argBuffer = argBuffer_vec.data(); | 
|  | CAFFE_ENFORCE( | 
|  | Input(0).numel() < std::numeric_limits<int>::max(), | 
|  | "The kernel function currently only supports int index."); | 
|  | argBuffer[0] = Input(0).numel(); | 
|  | void** ptr_buffer = reinterpret_cast<void**>(argBuffer + 1); | 
|  | for (int i = 0; i < InputSize(); ++i) { | 
|  | ptr_buffer[i] = const_cast<float*>(Input(i).data<float>()); | 
|  | } | 
|  | for (int i = 0; i < OutputSize(); ++i) { | 
|  | Output(i)->ResizeLike(Input(0)); | 
|  | ptr_buffer[i + InputSize()] = Output(i)->mutable_data<float>(); | 
|  | } | 
|  | size_t argBufferSize = sizeof(argBuffer); | 
|  | void* config[] = { | 
|  | CU_LAUNCH_PARAM_BUFFER_POINTER, | 
|  | argBuffer, | 
|  | CU_LAUNCH_PARAM_BUFFER_SIZE, | 
|  | &argBufferSize, | 
|  | CU_LAUNCH_PARAM_END}; | 
|  | func_.LaunchEx( | 
|  | CAFFE_GET_BLOCKS(Input(0).numel()), | 
|  | 1, | 
|  | 1, | 
|  | CAFFE_CUDA_NUM_THREADS, | 
|  | 1, | 
|  | 1, | 
|  | 0, | 
|  | context_.cuda_stream(), | 
|  | config); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | private: | 
|  | ElementwiseRTCFunction func_; | 
|  | }; | 
|  |  | 
namespace {
// Registered under the NVRTC engine: the kernel body is supplied at runtime
// via the "rtc_src" argument and compiled on the fly, so it cannot use the
// default ahead-of-time-compiled CUDA engine.
REGISTER_CUDA_OPERATOR_WITH_ENGINE(ElementwiseRTC, NVRTC, ElementwiseRTCOp);
}
|  |  | 
|  | } // namespace caffe2 |