Unify cuda and hip device types in Caffe2 python front end (#14221)

Summary:
The goal of this PR is to unify the CUDA and HIP device types in the Caffe2 Python front end.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/14221

Differential Revision: D13148564

Pulled By: bddppq

fbshipit-source-id: ef9bd2c7d238200165f217097ac5727e686d887b
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py
index f9ccf92..aead1d5 100644
--- a/caffe2/python/cnn.py
+++ b/caffe2/python/cnn.py
@@ -5,7 +5,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-from caffe2.python import brew
+from caffe2.python import brew, workspace
 from caffe2.python.model_helper import ModelHelper
 from caffe2.proto import caffe2_pb2
 import logging
@@ -235,6 +235,6 @@
     @property
     def GPU(self, gpu_id=0):
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = gpu_id
         return device_option
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 4f683da..6cab923 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -82,6 +82,10 @@
     return C.op_registry_key(op_type, engine) in _REGISTERED_OPERATORS
 
 
+def IsGPUDeviceType(device_type):
+    return device_type in {caffe2_pb2.CUDA, caffe2_pb2.HIP}
+
+
 def DeviceOption(
     device_type,
     device_id=0,
@@ -2110,7 +2114,7 @@
     def RunAllOnGPU(self, gpu_id=0, use_cudnn=False):
         """A convenient function to run everything on the GPU."""
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = gpu_id
         self._net.device_option.CopyFrom(device_option)
         if use_cudnn:
@@ -2280,12 +2284,13 @@
 
 def copy_func_between_devices(src, dst):
     CPU = caffe2_pb2.CPU
-    CUDA = caffe2_pb2.CUDA
+    is_src_gpu = IsGPUDeviceType(src.device_type)
+    is_dst_gpu = IsGPUDeviceType(dst.device_type)
 
     if src.device_type == CPU and dst.device_type == CPU:
         return None
 
-    if src.device_type == CUDA and dst.device_type == CUDA:
+    if is_src_gpu and is_dst_gpu:
         if src.device_id == dst.device_id:
             return None
         else:
@@ -2294,13 +2299,13 @@
                     return net.Copy(*args, **kw)
             return fun
 
-    if src.device_type == CUDA and dst.device_type == CPU:
+    if is_src_gpu and dst.device_type == CPU:
         def fun(net, *args, **kw):
             with DeviceScope(src):
                 return net.CopyGPUToCPU(*args, **kw)
         return fun
 
-    if src.device_type == CPU and dst.device_type == CUDA:
+    if src.device_type == CPU and is_dst_gpu:
         def fun(net, *args, **kw):
             with DeviceScope(dst):
                 return net.CopyCPUToGPU(*args, **kw)
@@ -2425,11 +2430,10 @@
 
                     def _gen_new_name(blob, device_option):
                         CPU = caffe2_pb2.CPU
-                        CUDA = caffe2_pb2.CUDA
                         if device_option.device_type == CPU:
                             suffix = '_cpu'
-                        elif device_option.device_type == CUDA:
-                            suffix = '_cuda_' + str(device_option.device_id)
+                        elif IsGPUDeviceType(device_option.device_type):
+                            suffix = '_gpu_' + str(device_option.device_id)
                         else:
                             raise RuntimeError(
                                 "Unknown device type: {}".
diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py
index bf25806..75c2689 100644
--- a/caffe2/python/core_gradients_test.py
+++ b/caffe2/python/core_gradients_test.py
@@ -9,9 +9,8 @@
 import unittest
 
 from caffe2.proto import caffe2_pb2
-from caffe2.python import core, test_util
+from caffe2.python import core, test_util, workspace
 from caffe2.python.core import CreateOperator, GradientRegistry
-from caffe2.python import workspace
 
 import numpy as np
 
@@ -94,7 +93,7 @@
 
     @given(device_option=st.sampled_from([
         None,
-        core.DeviceOption(caffe2_pb2.CUDA, 1)]))
+        core.DeviceOption(workspace.GpuDeviceType, 1)]))
     def testDirect(self, device_option):
         operators = [
             CreateOperator('Direct', 'in', 'hidden'),
@@ -279,7 +278,7 @@
 
     @given(device_option=st.sampled_from([
         None,
-        core.DeviceOption(caffe2_pb2.CUDA, 1)]))
+        core.DeviceOption(workspace.GpuDeviceType, 1)]))
     def testMultiUseInput(self, device_option):
         """Test gradient for the following case:
 
diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py
index 6c23d88..641a109 100644
--- a/caffe2/python/core_test.py
+++ b/caffe2/python/core_test.py
@@ -82,17 +82,17 @@
         self.assertFalse(op.HasField('device_option'))
         # explicitly setting a device
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
         op = core.CreateOperator("Relu", "x", "y", device_option=device_option)
         self.assertTrue(op.HasField('device_option'))
-        self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
         with core.DeviceScope(device_option):
             # from device scope
             op = core.CreateOperator("Relu", "x", "y")
             self.assertTrue(op.HasField('device_option'))
-            self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
+            self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
             self.assertEqual(op.device_option.device_id, 1)
             # from an overridden device option
             override_device = caffe2_pb2.DeviceOption()
@@ -108,13 +108,13 @@
 
     def testNameAndDeviceScopeTogether(self):
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
         with core.DeviceScope(device_option):
             with core.NameScope("foo"):
                 op = core.CreateOperator("Relu", "x", "y")
                 self.assertTrue(op.HasField('device_option'))
-                self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
+                self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
                 self.assertEqual(op.device_option.device_id, 1)
                 self.assertEqual(len(op.input), 1)
                 self.assertEqual(op.input[0], "foo/x")
@@ -254,7 +254,7 @@
 class TestCreateOperator(test_util.TestCase):
     def testCreate(self):
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
         op = core.CreateOperator(
             "Ludicrous", "x", "y", name="ludicrous",
@@ -270,7 +270,7 @@
         self.assertEqual(len(op.control_input), 1)
         self.assertEqual(op.control_input[0], "z")
         self.assertTrue(op.HasField('device_option'))
-        self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
         self.assertTrue(len(op.arg), 3)
 
@@ -643,14 +643,15 @@
         self.assertEqual(op.input[2], "fc_b")
 
 
-@unittest.skipIf(not workspace.has_gpu_support, 'No GPU support')
+@unittest.skipIf(not workspace.has_gpu_support
+                and not workspace.has_hip_support, 'No GPU support')
 class TestInferDevice(test_util.TestCase):
 
     def setUp(self):
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
-        self.cuda_option = device_option
+        self.gpu_option = device_option
         self.cpu_option = caffe2_pb2.DeviceOption()
 
     def _test_op(
@@ -662,7 +663,7 @@
         inputs=None,
         outputs=None
     ):
-        op_option = self.cuda_option if not op_option else op_option
+        op_option = self.gpu_option if not op_option else op_option
         inputs = ["blob_1"] if not inputs else inputs
         outputs = ["blob_2"] if not outputs else outputs
         with core.DeviceScope(op_option):
@@ -690,9 +691,9 @@
     def test_infer_device(self):
         self._test_op(
             "FC",
-            self.cuda_option,
-            self.cuda_option,
-            op_option=self.cuda_option,
+            self.gpu_option,
+            self.gpu_option,
+            op_option=self.gpu_option,
             inputs=["data", "fc_w", "fc_b"],
             outputs=["fc_1"]
         )
@@ -700,31 +701,31 @@
     def test_infer_device_split_by_lengths(self):
         self._test_op(
             "SplitByLengths",
-            [self.cuda_option, self.cpu_option],
-            self.cuda_option,
-            op_option=self.cuda_option,
+            [self.gpu_option, self.cpu_option],
+            self.gpu_option,
+            op_option=self.gpu_option,
             inputs=["data", "fc_w"],
             outputs=["fc_1"]
         )
 
     def test_infer_device_adam(self):
-        in_options = [self.cuda_option] * 6
+        in_options = [self.gpu_option] * 6
         in_options[5] = self.cpu_option
-        out_options = [self.cuda_option] * 4
+        out_options = [self.gpu_option] * 4
         self._test_op(
             "Adam",
             in_options,
             out_options,
-            op_option=self.cuda_option,
+            op_option=self.gpu_option,
             inputs=["param", "moment_1", "moment_2", "grad", "lr", "iter"],
             outputs=["output_param", "output_moment_1", "output_moment_2",
                 "output_grad"]
         )
 
     def test_infer_device_cross_device(self):
-        self._test_op("CopyGPUToCPU", self.cuda_option, self.cpu_option)
-        self._test_op("CopyCPUToGPU", self.cpu_option, self.cuda_option)
-        self._test_op("CopyFromCPUInput", self.cpu_option, self.cuda_option)
+        self._test_op("CopyGPUToCPU", self.gpu_option, self.cpu_option)
+        self._test_op("CopyCPUToGPU", self.cpu_option, self.gpu_option)
+        self._test_op("CopyFromCPUInput", self.cpu_option, self.gpu_option)
         self._test_op(
             "CopyFromCPUInput",
             self.cpu_option,
@@ -734,7 +735,7 @@
 
     def test_device_inference_function(self):
         # ConcatOp.
-        op_option = self.cuda_option
+        op_option = self.gpu_option
         with core.DeviceScope(op_option):
             op = core.CreateOperator(
                 'Concat',
@@ -746,7 +747,7 @@
         self.assertEqual(output_dev[1], self.cpu_option)
 
         #SplitOp.
-        op_option = self.cuda_option
+        op_option = self.gpu_option
         with core.DeviceScope(op_option):
             op = core.CreateOperator(
                 'Split',
@@ -761,7 +762,7 @@
         net = core.Net("test")
         init_net = core.Net("init")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
         weight = init_net.XavierFill([], 'fc_w', shape=[10, 100])
         bias = init_net.ConstantFill([], 'fc_b', shape=[10, ])
@@ -775,10 +776,10 @@
         )
         op = new_net._net.op[-1]
         self.assertEqual(op.type, "FC")
-        self.assertEqual(op.input[0], "data_cuda_1")
-        self.assertEqual(op.input[1], "fc_w_cuda_1")
-        self.assertEqual(op.input[2], "fc_b_cuda_1")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.input[0], "data_gpu_1")
+        self.assertEqual(op.input[1], "fc_w_gpu_1")
+        self.assertEqual(op.input[2], "fc_b_gpu_1")
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
         self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU")
         self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU")
@@ -788,7 +789,7 @@
         net = core.Net("test")
         init_net = core.Net("init")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
         weight = init_net.XavierFill([], 'fc_w', shape=[10, 100])
         bias = init_net.ConstantFill([], 'fc_b', shape=[10, ])
@@ -804,34 +805,34 @@
         )
         op = nets[1]._net.op[0]
         self.assertEqual(op.type, "CopyCPUToGPU")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
-        self.assertEqual(op.output[0], "fc_w_cuda_1")
+        self.assertEqual(op.output[0], "fc_w_gpu_1")
         op = nets[1]._net.op[1]
         self.assertEqual(op.type, "CopyCPUToGPU")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
-        self.assertEqual(op.output[0], "fc_b_cuda_1")
+        self.assertEqual(op.output[0], "fc_b_gpu_1")
         op = nets[1]._net.op[2]
         self.assertEqual(op.type, "FC")
         self.assertEqual(op.input[0], "data")
-        self.assertEqual(op.input[1], "fc_w_cuda_1")
-        self.assertEqual(op.input[2], "fc_b_cuda_1")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.input[1], "fc_w_gpu_1")
+        self.assertEqual(op.input[2], "fc_b_gpu_1")
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
         op = nets[1]._net.op[3]
         self.assertEqual(op.type, "Add")
         self.assertEqual(op.input[0], "fc1")
-        self.assertEqual(op.input[1], "const_cuda_1")
+        self.assertEqual(op.input[1], "const_gpu_1")
         # check that moved blob is in input to the new net
-        for c in ["data", "fc_w", "fc_b", "const_cuda_1"]:
+        for c in ["data", "fc_w", "fc_b", "const_gpu_1"]:
             self.assertTrue(c in nets[1]._net.external_input)
         """
 For reference, net.Proto() should be like:
 name: ""
 op {
   input: "fc_w"
-  output: "fc_w_cuda_1"
+  output: "fc_w_gpu_1"
   name: ""
   type: "CopyCPUToGPU"
   device_option {
@@ -841,7 +842,7 @@
 }
 op {
   input: "fc_b"
-  output: "fc_b_cuda_1"
+  output: "fc_b_gpu_1"
   name: ""
   type: "CopyCPUToGPU"
   device_option {
@@ -851,8 +852,8 @@
 }
 op {
   input: "data"
-  input: "fc_w_cuda_1"
-  input: "fc_b_cuda_1"
+  input: "fc_w_gpu_1"
+  input: "fc_b_gpu_1"
   output: "fc1"
   name: ""
   type: "FC"
@@ -863,7 +864,7 @@
 }
 op {
   input: "fc1"
-  input: "const_cuda_1"
+  input: "const_gpu_1"
   output: "fc1"
   name: ""
   type: "Add"
@@ -876,14 +877,14 @@
 external_input: "fc_w"
 external_input: "fc_b"
 external_input: "const"
-external_input: "const_cuda_1"
+external_input: "const_gpu_1"
 """
 
     def test_cross_nets_no_change(self):
         net = core.Net("test")
         init_net = core.Net("init")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
 
         with core.DeviceScope(device_option):
@@ -900,7 +901,7 @@
         self.assertEqual(op.input[0], "data")
         self.assertEqual(op.input[1], "fc_w")
         self.assertEqual(op.input[2], "fc_b")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
         """
 For reference, net.Proto() should be like:
@@ -925,7 +926,7 @@
     def test_inject_copy_multi_use(self):
         net = core.Net("test")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
 
         with core.DeviceScope(device_option):
@@ -944,12 +945,12 @@
         new_net, _ = core.InjectCrossDeviceCopies(net)
         op = new_net._net.op[0]
         self.assertEqual(op.type, "CopyCPUToGPU")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
-        self.assertEqual(op.output[0], "data_cuda_1")
+        self.assertEqual(op.output[0], "data_gpu_1")
         op = new_net._net.op[1]
         self.assertEqual(op.type, "Relu")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
         self.assertEqual(op.output[0], "relu1")
         op = new_net._net.op[2]
@@ -958,9 +959,9 @@
         self.assertEqual(op.output[0], "relu2")
         op = new_net._net.op[3]
         self.assertEqual(op.type, "Relu")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
-        self.assertEqual(op.input[0], "data_cuda_1")
+        self.assertEqual(op.input[0], "data_gpu_1")
         self.assertEqual(op.output[0], "relu3")
         op = new_net._net.op[4]
         self.assertEqual(op.type, "Relu")
@@ -968,27 +969,27 @@
         self.assertEqual(op.output[0], "relu4")
         op = new_net._net.op[5]
         self.assertEqual(op.type, "CopyCPUToGPU")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 0)
-        self.assertEqual(op.output[0], "data_cuda_0")
+        self.assertEqual(op.output[0], "data_gpu_0")
         op = new_net._net.op[6]
         self.assertEqual(op.type, "Relu")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 0)
-        self.assertEqual(op.input[0], "data_cuda_0")
+        self.assertEqual(op.input[0], "data_gpu_0")
         self.assertEqual(op.output[0], "relu5")
         op = new_net._net.op[7]
         self.assertEqual(op.type, "Relu")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 1)
-        self.assertEqual(op.input[0], "data_cuda_1")
+        self.assertEqual(op.input[0], "data_gpu_1")
         self.assertEqual(op.output[0], "relu6")
         """
 For reference, net.Proto() should be like:
 name: ""
 op {
   input: "data"
-  output: "data_cuda_1"
+  output: "data_gpu_1"
   name: ""
   type: "CopyCPUToGPU"
   device_option {
@@ -997,7 +998,7 @@
   }
 }
 op {
-  input: "data_cuda_1"
+  input: "data_gpu_1"
   output: "relu1"
   name: ""
   type: "Relu"
@@ -1013,7 +1014,7 @@
   type: "Relu"
 }
 op {
-  input: "data_cuda_1"
+  input: "data_gpu_1"
   output: "relu3"
   name: ""
   type: "Relu"
@@ -1030,7 +1031,7 @@
 }
 op {
   input: "data"
-  output: "data_cuda_0"
+  output: "data_gpu_0"
   name: ""
   type: "CopyCPUToGPU"
   device_option {
@@ -1039,7 +1040,7 @@
   }
 }
 op {
-  input: "data_cuda_0"
+  input: "data_gpu_0"
   output: "relu5"
   name: ""
   type: "Relu"
@@ -1049,7 +1050,7 @@
   }
 }
 op {
-  input: "data_cuda_1"
+  input: "data_gpu_1"
   output: "relu6"
   name: ""
   type: "Relu"
@@ -1073,7 +1074,7 @@
             cpu_device.append(caffe2_pb2.DeviceOption())
             cpu_device[i].node_name = 'node:' + str(i)
             gpu_device.append(caffe2_pb2.DeviceOption())
-            gpu_device[i].device_type = caffe2_pb2.CUDA
+            gpu_device[i].device_type = workspace.GpuDeviceType
             gpu_device[i].device_id = 0
             gpu_device[i].node_name = 'node:' + str(i)
         send_node = 'node:0'
@@ -1113,12 +1114,12 @@
         # Verify (init_net)
         op = init_net._net.op[2]
         self.assertEqual(op.type, "CopyGPUToCPU")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 0)
         self.assertEqual(op.output[0], "fc_w_cpu")
         op = init_net._net.op[3]
         self.assertEqual(op.type, "CopyGPUToCPU")
-        self.assertEqual(op.device_option.device_type, 1)
+        self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType)
         self.assertEqual(op.device_option.device_id, 0)
         self.assertEqual(op.output[0], "fc_b_cpu")
         op = init_net._net.op[4]
@@ -1141,7 +1142,7 @@
     def test_blob_inplace(self):
         net = core.Net("test")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = workspace.GpuDeviceType
         device_option.device_id = 1
 
         net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment'])
@@ -1151,9 +1152,9 @@
         op = net._net.op[1]
         self.assertEqual(op.type, 'CopyCPUToGPU')
         self.assertEqual(op.input[0], 'param')
-        self.assertEqual(op.output[0], 'param_cuda_1')
+        self.assertEqual(op.output[0], 'param_gpu_1')
         op = net._net.op[2]
-        self.assertEqual(op.input[0], 'param_cuda_1')
+        self.assertEqual(op.input[0], 'param_gpu_1')
 
         net.Relu('nonsense_input', 'moment')
         # should not raise inplace error
diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py
index 1ea110e..7a76545 100644
--- a/caffe2/python/data_parallel_model.py
+++ b/caffe2/python/data_parallel_model.py
@@ -136,17 +136,17 @@
 
     if devices is None:
         if not cpu_device:
-            devices = list(range(0, workspace.NumCudaDevices()))
+            devices = list(range(0, workspace.NumGpuDevices()))
         else:
             devices = list(range(0, cpu_count()))
 
     if not cpu_device:
         for gpu in devices:
-            if gpu >= workspace.NumCudaDevices():
+            if gpu >= workspace.NumGpuDevices():
                 log.warning("** Only {} GPUs available, GPUs {} requested".format(
-                    workspace.NumCudaDevices(), devices))
+                    workspace.NumGpuDevices(), devices))
                 break
-        model_helper_obj._device_type = caffe2_pb2.CUDA
+        model_helper_obj._device_type = workspace.GpuDeviceType
         model_helper_obj._device_prefix = "gpu"
         model_helper_obj._shared_model = False
         device_name = "GPU"
@@ -447,17 +447,17 @@
     assert isinstance(model_helper_obj, model_helper.ModelHelper)
 
     if devices is None:
-        devices = list(range(0, workspace.NumCudaDevices()))
+        devices = list(range(0, workspace.NumGpuDevices()))
     if master_device is None:
         master_device = devices[0]
 
     if not cpu_device:
         for gpu in devices:
-            if gpu >= workspace.NumCudaDevices():
+            if gpu >= workspace.NumGpuDevices():
                 log.warning("** Only {} GPUs available, GPUs {} requested".format(
-                    workspace.NumCudaDevices(), devices))
+                    workspace.NumGpuDevices(), devices))
                 break
-        model_helper_obj._device_type = caffe2_pb2.CUDA
+        model_helper_obj._device_type = workspace.GpuDeviceType
         model_helper_obj._device_prefix = "gpu"
     else:
         model_helper_obj._device_type = caffe2_pb2.CPU
@@ -812,7 +812,7 @@
     if device is None:
         device = scope.CurrentDeviceScope()
 
-    device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu"
+    device_prefix = "gpu" if core.IsGPUDeviceType(device.device_type) else "cpu"
 
     namescope = "{}_{}/".format(device_prefix, device.device_id)
     for op in mnet.Proto().op:
@@ -971,7 +971,7 @@
     if model._optimizer is not None:
         if model._device_type == caffe2_pb2.CPU:
             return [model._optimizer.get_cpu_blob_name('lr')]
-        elif model._device_type == caffe2_pb2.CUDA:
+        elif core.IsGPUDeviceType(model._device_type):
             return [model._optimizer.get_gpu_blob_name('lr', gpu, '')
                     for gpu in model._devices]
         else:
@@ -1006,7 +1006,7 @@
 
     for dev_idx in devices[1:]:
         if _IsGPUBlob(model, param):
-            device_opt = core.DeviceOption(caffe2_pb2.CUDA, dev_idx)
+            device_opt = core.DeviceOption(workspace.GpuDeviceType, dev_idx)
         else:
             device_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
         with core.DeviceScope(device_opt):
@@ -1025,8 +1025,8 @@
         )
         return
 
-    if model._device_type == caffe2_pb2.CUDA:
-        p2p_access_pattern = workspace.GetCudaPeerAccessPattern()
+    if model._device_type == workspace.GpuDeviceType:
+        p2p_access_pattern = workspace.GetGpuPeerAccessPattern()
     else:
         p2p_access_pattern = None
 
@@ -1546,7 +1546,7 @@
         op_gpu = op_dev.device_id
 
         # This avoids failing on operators that are only for CPU
-        if op_dev.device_type != caffe2_pb2.CUDA:
+        if not core.IsGPUDeviceType(op_dev.device_type):
             continue
 
         namescope = "{}_{}/".format(model._device_prefix, op_gpu)
@@ -1589,14 +1589,14 @@
 
 def _IsGPUBlob(model, blob_name):
     if blob_name in model._blob_to_device:
-        return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA
+        return core.IsGPUDeviceType(model._blob_to_device[blob_name].device_type)
     else:
         blob_name = "{}_{}/{}".format(
             model._device_prefix, model._devices[0], blob_name
         )
         if blob_name not in model._blob_to_device:
-            return model._device_type == caffe2_pb2.CUDA
-        return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA
+            return core.IsGPUDeviceType(model._device_type)
+        return core.IsGPUDeviceType(model._blob_to_device[blob_name].device_type)
 
 
 def _GroupByDevice(model, devices, params, non_data_params):
diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py
index 16d1d94..c28dc07 100644
--- a/caffe2/python/data_parallel_model_test.py
+++ b/caffe2/python/data_parallel_model_test.py
@@ -409,7 +409,7 @@
 
     def test_device_scope_check(self):
         with self.assertRaises(AssertionError):
-            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
                 data_parallel_model.Parallelize_GPU(None, None, None)
 
     def test_net_transformer_function(self):
@@ -984,7 +984,7 @@
                                                   self.LR],
                                                   self.vecs)
         else:
-            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+            with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
                 model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)
 
         np.random.seed(2603)
@@ -1006,12 +1006,12 @@
 
                 device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                 if not cpu_indices:
-                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)
+                    device_for_indices = core.DeviceOption(workspace.GpuDeviceType, g)
 
                 with core.DeviceScope(device_for_indices):
                     workspace.FeedBlob("gpu_{}/indices".format(g), indices)
 
-                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
+                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, g)):
                     workspace.FeedBlob("gpu_{}/label".format(g), labels)
 
             if i == 0:
@@ -1027,7 +1027,7 @@
                         workspace.FeedBlob(
                             "gpu_{}/gpuvecs".format(g),
                             orig_vecs,
-                            device_option=core.DeviceOption(caffe2_pb2.CUDA, g),
+                            device_option=core.DeviceOption(workspace.GpuDeviceType, g),
                         )
                 workspace.CreateNet(model.net)
 
@@ -1073,7 +1073,8 @@
         self._test_equiv_sparse(False)
 
 
-@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
+@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
+@unittest.skipIf(workspace.NumGpuDevices() < 2, "Need at least 2 GPUs.")
 class ParallelizeBMUFTest(TestCase):
 
     def _run_model(self, gpu_devices):
@@ -1132,7 +1133,7 @@
         cpu_device=st.booleans()
     )
     def test_parallelize_bmuf(self, cpu_device):
-        assume(cpu_device or workspace.has_gpu_support)
+        assume(cpu_device or workspace.has_gpu_support or workspace.has_hip_support)
 
         workspace.ResetWorkspace()
 
@@ -1146,7 +1147,7 @@
             return None
 
         if not cpu_device:
-            device_type = caffe2_pb2.CUDA
+            device_type = workspace.GpuDeviceType
             device_prefix = "gpu"
         else:
             device_type = caffe2_pb2.CPU
@@ -1220,7 +1221,7 @@
 
 
 @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
-@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.")
+@unittest.skipIf(workspace.NumGpuDevices() < 2, "Need at least 2 GPUs.")
 class SparseDataParallelModelTestWithSharedIndices(TestCase):
 
     '''
@@ -1336,7 +1337,7 @@
         )
 
         # Update the vecs
-        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
             for num, vec in enumerate(self.vecs[:-1]):
                 model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)
 
@@ -1354,7 +1355,7 @@
                 indices = full_indices[st:en].astype(np.int32)
                 labels = full_labels[st:en].astype(np.int32)
 
-                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
+                with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, g)):
                     workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                     workspace.FeedBlob("gpu_{}/label".format(g), labels)
 
@@ -1377,7 +1378,7 @@
                             "gpu_{}/gpuvec_{}".format(g, num),
                             orig_vec,
                             device_option=core.DeviceOption(
-                                caffe2_pb2.CUDA, g),
+                                workspace.GpuDeviceType, g),
                         )
                 workspace.CreateNet(model.net)
 
@@ -1407,10 +1408,10 @@
         self.run_model(V, [0, 1])
         self.run_model(V, [0])
 
-        if workspace.NumCudaDevices() >= 4:
+        if workspace.NumGpuDevices() >= 4:
             self.run_model(V, list(range(4)))
 
-        if workspace.NumCudaDevices() >= 8:
+        if workspace.NumGpuDevices() >= 8:
             self.run_model(V, list(range(8)))
 
 
diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py
index a74d489..fb2059f 100644
--- a/caffe2/python/examples/char_rnn.py
+++ b/caffe2/python/examples/char_rnn.py
@@ -264,7 +264,7 @@
     args = parser.parse_args()
 
     device = core.DeviceOption(
-        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 0)
+        workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 0)
     with core.DeviceScope(device):
         model = CharRNN(args)
         model.CreateModel()
diff --git a/caffe2/python/examples/resnet50_trainer.py b/caffe2/python/examples/resnet50_trainer.py
index 05b753b..307d7b2 100644
--- a/caffe2/python/examples/resnet50_trainer.py
+++ b/caffe2/python/examples/resnet50_trainer.py
@@ -67,7 +67,7 @@
         reader, ["data", "label"],
         batch_size=batch_size,
         output_type=dtype,
-        use_gpu_transform=True if model._device_type == 1 else False,
+        use_gpu_transform=True if core.IsGPUDeviceType(model._device_type) else False,
         use_caffe_datum=True,
         mean_per_channel=mean_per_channel,
         std_per_channel=std_per_channel,
diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py
index b67c173..7d88efc 100644
--- a/caffe2/python/gradient_check_test.py
+++ b/caffe2/python/gradient_check_test.py
@@ -23,9 +23,9 @@
 import unittest
 
 
-if workspace.has_gpu_support and workspace.NumCudaDevices() > 0:
+if (workspace.has_gpu_support or workspace.has_hip_support) and workspace.NumGpuDevices() > 0:
     gpu_device_option = caffe2_pb2.DeviceOption()
-    gpu_device_option.device_type = caffe2_pb2.CUDA
+    gpu_device_option.device_type = workspace.GpuDeviceType
     cpu_device_option = caffe2_pb2.DeviceOption()
     gpu_device_checker = device_checker.DeviceChecker(
         0.01, [gpu_device_option]
diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py
index b6f3b49..1f6ae91 100644
--- a/caffe2/python/hypothesis_test_util.py
+++ b/caffe2/python/hypothesis_test_util.py
@@ -259,10 +259,9 @@
 device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else [])
 
 # Include device option for each GPU
-expanded_device_options = [cpu_do] + (
-    [caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA, device_id=i)
-     for i in range(workspace.NumCudaDevices())]
-    if workspace.has_gpu_support else [])
+expanded_device_options = [cpu_do] + [
+    caffe2_pb2.DeviceOption(device_type=workspace.GpuDeviceType, device_id=i)
+    for i in range(workspace.NumGpuDevices())]
 
 
 def device_checker_device_options():
diff --git a/caffe2/python/lstm_benchmark.py b/caffe2/python/lstm_benchmark.py
index deefb12..a66967e 100644
--- a/caffe2/python/lstm_benchmark.py
+++ b/caffe2/python/lstm_benchmark.py
@@ -341,7 +341,7 @@
         '--caffe2_gpu_memory_tracking=1'] + extra_args)
 
     device = core.DeviceOption(
-        caffe2_pb2.CUDA if args.gpu else caffe2_pb2.CPU, 4)
+        workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 4)
 
     with core.DeviceScope(device):
         Benchmark(args)
diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py
index 6536280..c3e6168 100644
--- a/caffe2/python/memonger_test.py
+++ b/caffe2/python/memonger_test.py
@@ -223,13 +223,14 @@
         np.testing.assert_almost_equal(loss, optimized_loss)
         np.testing.assert_almost_equal(grad, optimized_grad)
 
-    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
+    @unittest.skipIf(not workspace.has_gpu_support
+                    and not workspace.has_hip_support, "No gpu support.")
     def test_memonger_mix_cpu_gpu(self):
         '''
         Check that memonger does not make blobs cross CPU/GPU boundary
         '''
         m = model_helper.ModelHelper()
-        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
             fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
             fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
             fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2)
@@ -259,7 +260,7 @@
 
         # Create set of blobs on CPU side and GPU side and check they don't
         # overlap
-        device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()}
+        device_blobs = {caffe2_pb2.CPU: set(), workspace.GpuDeviceType: set()}
         for op in optim_proto.op:
             if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]:
                 dev = op.device_option.device_type
@@ -267,7 +268,7 @@
                     device_blobs[dev].add(b)
 
         device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
-            device_blobs[caffe2_pb2.CUDA]
+            device_blobs[workspace.GpuDeviceType]
         )
         self.assertEquals(device_crossers, set())
 
diff --git a/caffe2/python/model_device_test.py b/caffe2/python/model_device_test.py
index 31cba3f..86c46ea 100644
--- a/caffe2/python/model_device_test.py
+++ b/caffe2/python/model_device_test.py
@@ -124,7 +124,7 @@
         cpu_device = caffe2_pb2.DeviceOption()
         cpu_device.device_type = caffe2_pb2.CPU
         gpu_device = caffe2_pb2.DeviceOption()
-        gpu_device.device_type = caffe2_pb2.CUDA
+        gpu_device.device_type = workspace.GpuDeviceType
 
         checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device])
         ret = checker.CheckNet(
@@ -136,7 +136,8 @@
         )
         self.assertEqual(ret, True)
 
-    @unittest.skipIf(not workspace.has_gpu_support,
+    @unittest.skipIf(not workspace.has_gpu_support
+                    and not workspace.has_hip_support,
                      "No GPU support. Skipping test.")
     def testMiniAlexNetNCHW(self):
         self._testMiniAlexNet("NCHW")
diff --git a/caffe2/python/models/seq2seq/train.py b/caffe2/python/models/seq2seq/train.py
index 1e5b286..df68e3e 100644
--- a/caffe2/python/models/seq2seq/train.py
+++ b/caffe2/python/models/seq2seq/train.py
@@ -537,7 +537,7 @@
                     if batch_obj_name in ['encoder_inputs', 'decoder_inputs']:
                         dev = core.DeviceOption(caffe2_pb2.CPU)
                     else:
-                        dev = core.DeviceOption(caffe2_pb2.CUDA, i)
+                        dev = core.DeviceOption(workspace.GpuDeviceType, i)
                     workspace.FeedBlob(name, batch_obj_value, device_option=dev)
 
         if forward_only:
diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py
index 2f2b5ac..3464739 100644
--- a/caffe2/python/muji.py
+++ b/caffe2/python/muji.py
@@ -25,7 +25,7 @@
   specified gpu id.
   """
     device_option = caffe2_pb2.DeviceOption()
-    device_option.device_type = caffe2_pb2.CUDA
+    device_option.device_type = workspace.GpuDeviceType
     device_option.device_id = gpu_id
     return device_option
 
@@ -39,7 +39,7 @@
 def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
     """The general Allreduce interface that reroutes the function calls.
     CPUs and AMD GPUs are not supported because
-    GetCudaPeerAccessPattern is called to get gpu peer access pattern.
+    GetGpuPeerAccessPattern is called to get gpu peer access pattern.
   """
     if gpu_indices is None:
         gpu_indices = list(range(len(blobs)))
@@ -48,7 +48,7 @@
             "gpu_indices length and blobs length mismatch: %d vs %d" %
             (len(gpu_indices), len(blobs))
         )
-    pattern = workspace.GetCudaPeerAccessPattern()
+    pattern = workspace.GetGpuPeerAccessPattern()
     if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]):
         return Allreduce2(net, blobs, reduced_affix, gpu_indices)
     elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]):
diff --git a/caffe2/python/muji_test.py b/caffe2/python/muji_test.py
index cca0ca0..8adc2da 100644
--- a/caffe2/python/muji_test.py
+++ b/caffe2/python/muji_test.py
@@ -38,36 +38,36 @@
 
     def testAllreduceFallback(self):
         self.RunningAllreduceWithGPUs(
-            list(range(workspace.NumCudaDevices())), muji.AllreduceFallback
+            list(range(workspace.NumGpuDevices())), muji.AllreduceFallback
         )
 
     def testAllreduceSingleGPU(self):
-        for i in range(workspace.NumCudaDevices()):
+        for i in range(workspace.NumGpuDevices()):
             self.RunningAllreduceWithGPUs([i], muji.Allreduce)
 
     def testAllreduceWithTwoGPUs(self):
-        pattern = workspace.GetCudaPeerAccessPattern()
+        pattern = workspace.GetGpuPeerAccessPattern()
         if pattern.shape[0] >= 2 and np.all(pattern[:2, :2]):
             self.RunningAllreduceWithGPUs([0, 1], muji.Allreduce2)
         else:
             print('Skipping allreduce with 2 gpus. Not peer access ready.')
 
     def testAllreduceWithFourGPUs(self):
-        pattern = workspace.GetCudaPeerAccessPattern()
+        pattern = workspace.GetGpuPeerAccessPattern()
         if pattern.shape[0] >= 4 and np.all(pattern[:4, :4]):
             self.RunningAllreduceWithGPUs([0, 1, 2, 3], muji.Allreduce4)
         else:
             print('Skipping allreduce with 4 gpus. Not peer access ready.')
 
     def testAllreduceWithFourGPUsAndTwoGroups(self):
-        pattern = workspace.GetCudaPeerAccessPattern()
+        pattern = workspace.GetGpuPeerAccessPattern()
         if pattern.shape[0] >= 4 and np.all(pattern[:2, :2]) and np.all(pattern[2:4, 2:4]):
             self.RunningAllreduceWithGPUs([0, 1, 2, 3], muji.Allreduce4Group2)
         else:
             print('Skipping allreduce with 4 gpus and 2 groups. Not peer access ready.')
 
     def testAllreduceWithEightGPUs(self):
-        pattern = workspace.GetCudaPeerAccessPattern()
+        pattern = workspace.GetGpuPeerAccessPattern()
         if (
             pattern.shape[0] >= 8 and np.all(pattern[:4, :4]) and
             np.all(pattern[4:, 4:])
diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py
index 05a018f..04e9358 100644
--- a/caffe2/python/operator_test/copy_ops_test.py
+++ b/caffe2/python/operator_test/copy_ops_test.py
@@ -40,21 +40,21 @@
     def test_copy_gradient_cpu(self):
         self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0))
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
     def test_copy_gradient_gpu(self):
-        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CUDA, 0))
+        self.run_test_copy_gradient(core.DeviceOption(workspace.GpuDeviceType, 0))
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPU.")
+    @unittest.skipIf(workspace.NumGpuDevices() < 2, "Need at least 2 GPU.")
     def test_copy_gradient_multiple_gpus(self):
         model = model_helper.ModelHelper(name="copy_test")
 
         with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
             x_cpu = model.net.AddExternalInputs("x_cpu")
 
-        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
             x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1")
 
-        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 1)):
+        with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 1)):
             x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2")
             loss = model.AveragedLoss(x_gpu_2, "loss")
             gradient_map = model.AddGradientOperators([loss])
@@ -80,20 +80,20 @@
 
         self.assertEqual(
             get_op_with_output(model, "x_gpu_2_grad").device_option,
-            core.DeviceOption(caffe2_pb2.CUDA, 1),
+            core.DeviceOption(workspace.GpuDeviceType, 1),
         )
         self.assertEqual(
             get_op_with_output(model, "x_cpu_grad").device_option,
-            core.DeviceOption(caffe2_pb2.CUDA, 0),
+            core.DeviceOption(workspace.GpuDeviceType, 0),
         )
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
     def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
         model = model_helper.ModelHelper(name="copy_test")
         v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
         indices = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
         cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
-        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
 
         with core.DeviceScope(gpu_opt):
             vcpu = model.CopyGPUToCPU(v, "vcpu")
@@ -112,13 +112,13 @@
         self.assertTrue("v" in gradient_map)
         self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice))
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.")
     def test_cpu2gpu_gpu2cpu_gradients(self):
         model = model_helper.ModelHelper(name="copy_test")
 
         batch = 32
         cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
-        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
 
         with core.NameScope("cpu"):
             with core.DeviceScope(cpu_opt):
diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py
index 8e38170..1f3f930 100644
--- a/caffe2/python/operator_test/load_save_test.py
+++ b/caffe2/python/operator_test/load_save_test.py
@@ -15,8 +15,8 @@
 from caffe2.python import core, test_util, workspace
 
 if workspace.has_gpu_support:
-    DEVICES = [caffe2_pb2.CPU, caffe2_pb2.CUDA]
-    max_gpuid = workspace.NumCudaDevices() - 1
+    DEVICES = [caffe2_pb2.CPU, workspace.GpuDeviceType]
+    max_gpuid = workspace.NumGpuDevices() - 1
 else:
     DEVICES = [caffe2_pb2.CPU]
     max_gpuid = 0
@@ -42,8 +42,8 @@
                   np.int16, np.int32, np.int64, np.uint8, np.uint16]
         arrays = [np.random.permutation(6).reshape(2, 3).astype(T)
                   for T in dtypes]
-        assume(src_device_type == caffe2_pb2.CUDA or src_gpu_id == 0)
-        assume(dst_device_type == caffe2_pb2.CUDA or dst_gpu_id == 0)
+        assume(core.IsGPUDeviceType(src_device_type) or src_gpu_id == 0)
+        assume(core.IsGPUDeviceType(dst_device_type) or dst_gpu_id == 0)
         src_device_option = core.DeviceOption(
             src_device_type, src_gpu_id)
         dst_device_option = core.DeviceOption(
@@ -90,7 +90,7 @@
                     self.assertTrue(proto.HasField('tensor'))
                     self.assertEqual(proto.tensor.device_detail.device_type,
                                      device_type)
-                    if device_type == caffe2_pb2.CUDA:
+                    if core.IsGPUDeviceType(device_type):
                         self.assertEqual(proto.tensor.device_detail.device_id,
                                          gpu_id)
 
diff --git a/caffe2/python/operator_test/prepend_dim_test.py b/caffe2/python/operator_test/prepend_dim_test.py
index a5b7d01..6cf8e7a 100644
--- a/caffe2/python/operator_test/prepend_dim_test.py
+++ b/caffe2/python/operator_test/prepend_dim_test.py
@@ -38,8 +38,8 @@
 
     def test_prepend_dim(self):
         devices = [core.DeviceOption(caffe2_pb2.CPU, 0)]
-        if workspace.NumCudaDevices() > 0:
-            devices.append(core.DeviceOption(caffe2_pb2.CUDA, 0))
+        if workspace.NumGpuDevices() > 0:
+            devices.append(core.DeviceOption(workspace.GpuDeviceType, 0))
 
         for device_opt in devices:
             with core.DeviceScope(device_opt):
diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py
index be006f4..98189b8 100644
--- a/caffe2/python/operator_test/reshape_ops_test.py
+++ b/caffe2/python/operator_test/reshape_ops_test.py
@@ -115,8 +115,8 @@
 def _test_reshape(old_shape, new_shape, expected_shape=None, arg_shape=True,
                  in_place=False):
     devices = [core.DeviceOption(caffe2_pb2.CPU, 0)]
-    if workspace.NumCudaDevices() > 0:
-        devices.append(core.DeviceOption(caffe2_pb2.CUDA, 0))
+    if workspace.NumGpuDevices() > 0:
+        devices.append(core.DeviceOption(workspace.GpuDeviceType, 0))
 
     for device_opt in devices:
         with core.DeviceScope(device_opt):
diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py
index 9263791..0487d96 100644
--- a/caffe2/python/operator_test/roi_align_rotated_op_test.py
+++ b/caffe2/python/operator_test/roi_align_rotated_op_test.py
@@ -77,7 +77,7 @@
         self.assertReferenceChecks(
             device_option=gc, op=op, inputs=[X, R], reference=roialign_ref
         )
-        if gc.device_type == caffe2_pb2.CUDA:
+        if core.IsGPUDeviceType(gc.device_type):
             self.assertGradientChecks(gc, op, [X, R], 0, [0])
 
     @given(
@@ -202,7 +202,7 @@
         self.assertReferenceChecks(
             device_option=gc, op=op, inputs=[X, R], reference=roialign_ref
         )
-        if gc.device_type == caffe2_pb2.CUDA:
+        if core.IsGPUDeviceType(gc.device_type):
             self.assertGradientChecks(gc, op, [X, R], 0, [0])
 
 
diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py
index b84f852..0aa0201 100644
--- a/caffe2/python/optimizer.py
+++ b/caffe2/python/optimizer.py
@@ -81,7 +81,7 @@
         if current_scope is None:
             return self.get_cpu_blob_name(base_str)
 
-        if current_scope.device_type == caffe2_pb2.CUDA:
+        if core.IsGPUDeviceType(current_scope.device_type):
             return self.get_gpu_blob_name(
                 base_str, current_scope.device_id, current_scope.node_name
             )
@@ -127,7 +127,7 @@
         if self._local_lr_multiplier is not None:
             current_scope = scope.CurrentDeviceScope()
             if (current_scope is not None
-                    and current_scope.device_type == caffe2_pb2.CUDA
+                    and core.IsGPUDeviceType(current_scope.device_type)
                     and not self._local_lr_multiplier_on_gpu):
                 local_lr_multiplier = net.CopyFromCPUInput(
                     self._local_lr_multiplier,
@@ -258,7 +258,7 @@
             self._add_local_lr_multiplier(
                 lr_lars_multiplier,
                 is_gpu_blob=(current_scope is not None
-                    and current_scope.device_type == caffe2_pb2.CUDA),
+                    and core.IsGPUDeviceType(current_scope.device_type)),
             )
 
         # We need negative sign for LR when used directly with WeightedSum
@@ -549,7 +549,7 @@
             self._add_local_lr_multiplier(
                 lr_lars_multiplier,
                 is_gpu_blob=(current_scope is not None
-                    and current_scope.device_type == caffe2_pb2.CUDA),
+                    and core.IsGPUDeviceType(current_scope.device_type)),
             )
 
         lr, _ = self.build_lr(
@@ -688,7 +688,7 @@
             self._add_local_lr_multiplier(
                 lr_lars_multiplier,
                 is_gpu_blob=(current_scope is not None
-                    and current_scope.device_type == caffe2_pb2.CUDA),
+                    and core.IsGPUDeviceType(current_scope.device_type)),
             )
 
         lr, _ = self.build_lr(
diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py
index 0dc8e55..5ca6af4 100644
--- a/caffe2/python/optimizer_test.py
+++ b/caffe2/python/optimizer_test.py
@@ -454,11 +454,12 @@
                 )
 
     @unittest.skip("Results might vary too much. Only for individual use.")
-    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
+    @unittest.skipIf(not workspace.has_gpu_support
+                    and not workspace.has_hip_support, "No gpu support")
     def test_caffe2_gpu_vs_numpy(self):
         n_dim = 1000000
         n_iter = 50
-        gpu_device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_device_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
         with core.DeviceScope(gpu_device_opt):
             for zero_debias in [False, True]:
                 for grad_coef in [1.0, 0.1, 0.01]:
diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py
index dbb0dbe..685782d 100644
--- a/caffe2/python/optimizer_test_util.py
+++ b/caffe2/python/optimizer_test_util.py
@@ -70,7 +70,7 @@
 
     @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
     def testGPUDense(self, dtype=core.DataType.FLOAT):
-        device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        device_opt = core.DeviceOption(workspace.GpuDeviceType, 0)
         with core.DeviceScope(device_opt):
             model, _perfect_model, data, label = self._createDense(dtype)
             if dtype == core.DataType.FLOAT16:
diff --git a/caffe2/python/parallelize_bmuf_distributed_test.py b/caffe2/python/parallelize_bmuf_distributed_test.py
index afce7c6..a930c15 100644
--- a/caffe2/python/parallelize_bmuf_distributed_test.py
+++ b/caffe2/python/parallelize_bmuf_distributed_test.py
@@ -25,10 +25,10 @@
     dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")
 
     if not cpu_device:
-        if not workspace.has_gpu_support:
+        if not workspace.has_gpu_support and not workspace.has_hip_support:
             log.info('No GPU support test is Ignored.')
             return
-        if workspace.NumCudaDevices() < 4:
+        if workspace.NumGpuDevices() < 4:
             log.info('Not enough GPU support, test IGNORED')
             return
 
@@ -37,7 +37,7 @@
         name="test"
     )
     if not cpu_device:
-        device_type = caffe2_pb2.CUDA
+        device_type = workspace.GpuDeviceType
         device_prefix = "gpu"
     else:
         device_type = caffe2_pb2.CPU
diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py
index 2d44e09..c3bf9b3 100644
--- a/caffe2/python/rnn/lstm_comparison.py
+++ b/caffe2/python/rnn/lstm_comparison.py
@@ -11,7 +11,7 @@
     results = []
     num_iters = 1000
     args.gpu = True
-    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+    with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)):
         for batch_size in [64, 128, 256]:
             for seq_length in [20, 100]:
                 for hidden_dim in [40, 100, 400, 800]:
diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py
index 4010502..108b749 100644
--- a/caffe2/python/rnn_cell.py
+++ b/caffe2/python/rnn_cell.py
@@ -1314,7 +1314,7 @@
         )
         if (
             scope.CurrentDeviceScope() is not None and
-            scope.CurrentDeviceScope().device_type == caffe2_pb2.CUDA
+            core.IsGPUDeviceType(scope.CurrentDeviceScope().device_type)
         ):
             encoder_length = model.net.CopyGPUToCPU(
                 encoder_length,
diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py
index 11f7a2c..d51488d 100644
--- a/caffe2/python/scope_test.py
+++ b/caffe2/python/scope_test.py
@@ -3,7 +3,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-from caffe2.python import scope, core
+from caffe2.python import scope, core, workspace
 from caffe2.proto import caffe2_pb2
 
 import unittest
@@ -18,7 +18,7 @@
     testobj.assertEquals(scope.CurrentNameScope(), "")
     testobj.assertEquals(scope.CurrentDeviceScope(), None)
     namescope = "namescope_{}".format(idx)
-    dsc = core.DeviceOption(caffe2_pb2.CUDA, idx)
+    dsc = core.DeviceOption(workspace.GpuDeviceType, idx)
     with scope.DeviceScope(dsc):
         with scope.NameScope(namescope):
             testobj.assertEquals(scope.CurrentNameScope(), namescope + "/")
@@ -58,7 +58,7 @@
     def testDevicescopeBasic(self):
         self.assertEquals(scope.CurrentDeviceScope(), None)
 
-        dsc = core.DeviceOption(caffe2_pb2.CUDA, 9)
+        dsc = core.DeviceOption(workspace.GpuDeviceType, 9)
         with scope.DeviceScope(dsc):
             self.assertEquals(scope.CurrentDeviceScope(), dsc)
 
@@ -67,7 +67,7 @@
     def testEmptyDevicescopeBasic(self):
         self.assertEquals(scope.CurrentDeviceScope(), None)
 
-        dsc = core.DeviceOption(caffe2_pb2.CUDA, 9)
+        dsc = core.DeviceOption(workspace.GpuDeviceType, 9)
         with scope.DeviceScope(dsc):
             self.assertEquals(scope.CurrentDeviceScope(), dsc)
             with scope.EmptyDeviceScope():
@@ -78,7 +78,7 @@
     def testDevicescopeAssertion(self):
         self.assertEquals(scope.CurrentDeviceScope(), None)
 
-        dsc = core.DeviceOption(caffe2_pb2.CUDA, 9)
+        dsc = core.DeviceOption(workspace.GpuDeviceType, 9)
 
         try:
             with scope.DeviceScope(dsc):
diff --git a/caffe2/python/test/executor_test.py b/caffe2/python/test/executor_test.py
index bee45e1..d4ff0c3 100644
--- a/caffe2/python/test/executor_test.py
+++ b/caffe2/python/test/executor_test.py
@@ -47,14 +47,15 @@
         )
 
 
-@unittest.skipIf(not workspace.has_gpu_support, "no gpu")
+@unittest.skipIf(not workspace.has_gpu_support
+                and not workspace.has_hip_support, "no gpu")
 class ExecutorGPUResNetTest(ExecutorTestBase):
     @given(executor=st.sampled_from(EXECUTORS),
            num_workers=st.sampled_from([8]))
     @executor_test_settings
     def test_executor(self, executor, num_workers):
         model = build_resnet50_dataparallel_model(
-            num_gpus=workspace.NumCudaDevices(), batch_size=8, epoch_size=8)
+            num_gpus=workspace.NumGpuDevices(), batch_size=8, epoch_size=8)
         model.Proto().num_workers = num_workers
 
         def run_model():
diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py
index f9e87c1..a4ec59e 100644
--- a/caffe2/python/utils.py
+++ b/caffe2/python/utils.py
@@ -237,7 +237,7 @@
 
 
 def GetGPUMemoryUsageStats():
-    """Get GPU memory usage stats from CUDAContext. This requires flag
+    """Get GPU memory usage stats from CUDAContext/HIPContext. This requires flag
        --caffe2_gpu_memory_tracking to be enabled"""
     from caffe2.python import workspace, core
     workspace.RunOperatorOnce(
@@ -245,7 +245,7 @@
             "GetGPUMemoryUsage",
             [],
             ["____mem____"],
-            device_option=core.DeviceOption(caffe2_pb2.CUDA, 0),
+            device_option=core.DeviceOption(workspace.GpuDeviceType, 0),
         ),
     )
     b = workspace.FetchBlob("____mem____")
diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index 8551ac5..88aece2 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -44,11 +44,15 @@
 has_gpu_support = C.has_gpu_support
 has_hip_support = C.has_hip_support
 if has_gpu_support:
+    GpuDeviceType = caffe2_pb2.CUDA
     NumCudaDevices = C.num_cuda_devices
+    # NumGpuDevices duplicates NumCudaDevices. Remove NumCudaDevices
+    # once all callers have been migrated to NumGpuDevices.
+    NumGpuDevices = C.num_cuda_devices
     GetCUDAVersion = C.get_cuda_version
     GetCuDNNVersion = C.get_cudnn_version
 
-    def GetCudaPeerAccessPattern():
+    def GetGpuPeerAccessPattern():
         return np.asarray(C.get_cuda_peer_access_pattern())
 
     GetDeviceProperties = C.get_device_properties
@@ -56,8 +60,22 @@
     NumCudaDevices = lambda: 0 # noqa
     GetCUDAVersion = lambda: 0 # noqa
     GetCuDNNVersion = lambda: 0 # noqa
-    GetCudaPeerAccessPattern = lambda: np.array([]) # noqa
+
+if has_hip_support:
+    GpuDeviceType = caffe2_pb2.HIP
+    NumGpuDevices = C.num_hip_devices
+
+    def GetGpuPeerAccessPattern():
+        return np.asarray(C.get_hip_peer_access_pattern())
+    GetDeviceProperties = C.get_device_properties
+
+if not has_gpu_support and not has_hip_support:
+    # Default GpuDeviceType to CUDA because some tests (e.g. the core and
+    # scope tests) reference GpuDeviceType even without GPU support.
+    GpuDeviceType = caffe2_pb2.CUDA
+    NumGpuDevices = lambda: 0 # noqa
     GetDeviceProperties = lambda x: None # noqa
+    GetGpuPeerAccessPattern = lambda: np.array([]) # noqa
 
 IsNUMAEnabled = C.is_numa_enabled
 GetNumNUMANodes = C.get_num_numa_nodes
@@ -82,7 +100,6 @@
         # rather than 24x7 service.
         return port
 
-
 def StartMint(root_folder=None, port=None):
     """Start a mint instance.
 
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index a248d62..93bcb11 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -320,7 +320,8 @@
         self.assertTrue("test" in workspaces)
 
 
-@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
+@unittest.skipIf(not workspace.has_gpu_support
+                and not workspace.has_hip_support, "No gpu support.")
 class TestWorkspaceGPU(test_util.TestCase):
 
     def setUp(self):
@@ -342,12 +343,12 @@
         self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
         np.testing.assert_array_equal(fetched_again, 2.0)
 
-    def testGetCudaPeerAccessPattern(self):
-        pattern = workspace.GetCudaPeerAccessPattern()
+    def testGetGpuPeerAccessPattern(self):
+        pattern = workspace.GetGpuPeerAccessPattern()
         self.assertEqual(type(pattern), np.ndarray)
         self.assertEqual(pattern.ndim, 2)
         self.assertEqual(pattern.shape[0], pattern.shape[1])
-        self.assertEqual(pattern.shape[0], workspace.NumCudaDevices())
+        self.assertEqual(pattern.shape[0], workspace.NumGpuDevices())
 
 
 @unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.")