|  | #include "caffe2/core/context_gpu.h" | 
|  | #include "caffe2/core/operator.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  | namespace { | 
|  |  | 
|  | class GetGPUMemoryUsageOp final : public Operator<CUDAContext> { | 
|  | public: | 
|  | template<class... Args> explicit GetGPUMemoryUsageOp(Args&&... args) | 
|  | : Operator<CUDAContext>(std::forward<Args>(args)...) {} | 
|  | ~GetGPUMemoryUsageOp() override {} | 
|  |  | 
|  | bool RunOnDevice() override { | 
|  | TORCH_CHECK_EQ(InputSize(), 0); | 
|  | TORCH_CHECK_EQ(OutputSize(), 1); | 
|  | std::vector<long> total_by_gpu = CUDAContext::TotalMemoryByGpu(); | 
|  | std::vector<long> max_by_gpu = CUDAContext::MaxMemoryByGpu(); | 
|  | TORCH_CHECK_EQ(total_by_gpu.size(), max_by_gpu.size()); | 
|  |  | 
|  |  | 
|  | auto* stats = Output(0, {2, static_cast<int64_t>(total_by_gpu.size())}, at::dtype<long>()); | 
|  | context_.CopyFromCPU<long>( | 
|  | total_by_gpu.size(), | 
|  | total_by_gpu.data(), | 
|  | stats->template mutable_data<long>()); | 
|  | context_.CopyFromCPU<long>( | 
|  | max_by_gpu.size(), | 
|  | max_by_gpu.data(), | 
|  | stats->template mutable_data<long>() + total_by_gpu.size()); | 
|  | return true; | 
|  | } | 
|  | }; | 
|  |  | 
// Schema: zero inputs, one output of shape (2, num_gpus); see doc string
// below for the row layout. NOTE(review): the stats are only populated when
// the --caffe2_gpu_memory_tracking flag is enabled, per the doc string.
OPERATOR_SCHEMA(GetGPUMemoryUsage)
    .NumInputs(0)
    .NumOutputs(1)
    .SetDoc(R"DOC(Fetches GPU memory stats from CUDAContext. Result is stored
in output blob with shape (2, num_gpus). First row contains the total
current memory usage, and the second row the maximum usage during
this execution.

NOTE: --caffe2_gpu_memory_tracking flag must be enabled to use this op.
)DOC");

// Register the op for the CUDA backend only; it reads CUDAContext counters
// and writes its output into device memory.
REGISTER_CUDA_OPERATOR(GetGPUMemoryUsage, GetGPUMemoryUsageOp);
|  | } | 
|  |  | 
|  | } // namespace caffe2 |