[quant][core][gpu][feature] Added support for float->quantized cuda tensor copying (#76177)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/76177

Previously, support for copying a floating-point tensor to a quantized
tensor was limited to CPU tensors. This PR extends that support to GPU
tensors. A corresponding test for CUDA tensors was added to
test_qtensor_float_assignment.
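
For reference, a minimal sketch of the newly supported operation
(assumes a CUDA-capable build; the scale/zero_point values mirror the
updated test below):
```
import torch

# Float -> quantized copy on a CUDA tensor, previously a CPU-only path.
r = torch.ones(4, dtype=torch.float, device="cuda")
qr = torch.quantize_per_tensor(r, scale=1.0, zero_point=2, dtype=torch.quint8)
x = torch.full((4,), 15.3, device="cuda")
qr[:] = x  # dispatches quantized_copy_from_float_ on the CUDA backend
print(qr.dequantize())  # tensor([15., 15., 15., 15.], device='cuda:0')
```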

Test Plan:
```
python test/test_quantization.py -k test_qtensor_float_assignment
```
Imported from OSS

Differential Revision:
D35817832

Reviewed By: jerryzh168

Pulled By: dzdang

fbshipit-source-id: e5a4a0bb2d8a56f3f1a88806a534b5cb38275cf2
(cherry picked from commit 9173e07b51bb1b853244b205ddf3e36000f01b64)
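
In the copy path below, the per-tensor branch now calls
quantize_tensor_per_tensor_affine (which has per-backend kernels)
instead of a hand-rolled CPU loop, and the per-channel branch also
accepts kPerChannelAffineFloatQParams and kPerChannelSymmetric. A
hedged sketch of the per-channel case, shown on CPU where per-channel
quantization support is well established:
```
import torch

# Per-channel float assignment; routed through
# quantize_tensor_per_channel_affine in the copy path.
scales = torch.tensor([0.1, 0.2])
zero_points = torch.tensor([0, 0])
r = torch.zeros(2, 3)
qr = torch.quantize_per_channel(r, scales, zero_points, axis=0, dtype=torch.qint8)
qr[:] = torch.ones(2, 3)  # float -> per-channel quantized copy
print(qr.dequantize())    # each row quantized with its own scale
```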
diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
index 7d16603..66c646c 100644
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -185,7 +185,7 @@
   }
 
   if (self.is_quantized() && !src.is_quantized()) {
-    return quantized_copy_from_float_cpu_(self, src);
+    return quantized_copy_from_float_(self, src);
   }
 
   if (self.is_quantized() && src.is_quantized()) {
diff --git a/aten/src/ATen/native/quantized/Copy.cpp b/aten/src/ATen/native/quantized/Copy.cpp
index ac3f5e9..e3b6bd8 100644
--- a/aten/src/ATen/native/quantized/Copy.cpp
+++ b/aten/src/ATen/native/quantized/Copy.cpp
@@ -13,7 +13,7 @@
 // This means that assignment of a non-contiguous quantized subtensor is currently not supported in pytorch
 // e.g., Consider a 2x2 quantized tensor qt1 and a non-quantized tensor t2. The operation
 // `qt1[:, 0] = t2[:, 0]` would trigger the exception b/c neither the LHS nor RHS is contiguous
-Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src) {
+Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src) {
   TORCH_CHECK(
       src.scalar_type() == at::kFloat,
       "Quantized copy only works with kFloat as source Tensor");
@@ -23,21 +23,14 @@
   TORCH_CHECK(
       self.sizes().equals(src.sizes()),
       "Quantized copy only works with Tensors with the same shape");
-  TORCH_CHECK(
-      self.device().type() == kCPU,
-      "Quantized copy only works with QuantizedCPU Tensors");
   AT_DISPATCH_QINT_TYPES(self.scalar_type(), "Copy", [&]() {
-    if (self.qscheme() == kPerChannelAffine) {
+    if (self.qscheme() == kPerChannelAffine || self.qscheme() == kPerChannelAffineFloatQParams
+        || self.qscheme() == kPerChannelSymmetric) {
       quantize_tensor_per_channel_affine(src, self, self.q_per_channel_scales(),
                                          self.q_per_channel_zero_points(),
                                          self.q_per_channel_axis());
     } else {
-      float* src_data = src.data_ptr<float>();
-      scalar_t* self_data = self.data_ptr<scalar_t>();
-      for (const auto i : c10::irange(self.numel())) {
-        self_data[i] = quantize_val<scalar_t>(
-            self.q_scale(), self.q_zero_point(), src_data[i]);
-      }
+      quantize_tensor_per_tensor_affine(src, self, self.q_scale(), self.q_zero_point());
     }
   });
   return self;
diff --git a/aten/src/ATen/native/quantized/Copy.h b/aten/src/ATen/native/quantized/Copy.h
index 65dabd2..d52c8ff 100644
--- a/aten/src/ATen/native/quantized/Copy.h
+++ b/aten/src/ATen/native/quantized/Copy.h
@@ -5,6 +5,6 @@
 namespace at {
 namespace native {
 
-Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src);
+Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src);
 }
 } // namespace at
diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 6ab45e3..f54388d 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -354,25 +354,33 @@
         # item
         scale = 1.0
         zero_point = 2
-        r = torch.ones(1, dtype=torch.float)
-        for dtype in [torch.qint8, torch.quint8, torch.qint32]:
-            qr = torch.quantize_per_tensor(r, scale, zero_point, dtype=dtype)
-            self.assertEqual(qr.item(), 1)
-            self.assertEqual(qr[0].item(), 1)
-            # assignment
-            self.assertTrue(qr[0].is_quantized)
-            qr[0] = 11.3  # float assignment
-            self.assertEqual(qr.item(), 11)
-            x = torch.ones(1, dtype=torch.float) * 15.3
-            # Copying from a float Tensor
-            qr[:] = x
-            self.assertEqual(qr.item(), 15)
+        devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+        for device in devices:
+            r = torch.ones(1, dtype=torch.float).to(device=device)
+            for dtype in [torch.qint8, torch.quint8, torch.qint32]:
+                qr = torch.quantize_per_tensor(r, scale, zero_point, dtype=dtype)
+                self.assertEqual(qr.item(), 1)
+                self.assertEqual(qr[0].item(), 1)
+                # assignment
+                self.assertTrue(qr[0].is_quantized)
+                qr[0] = torch.Tensor([11.3]).to(device=device)  # float assignment
+                self.assertEqual(qr.item(), 11)
+                x = torch.ones(1, dtype=torch.float).to(device=device) * 15.3
+                # Copying from a float Tensor
+                qr[:] = x
+                self.assertEqual(qr.item(), 15)
 
-            dtype_msg = str(dtype) + ", "
-            self.assertEqual(' '.join(str(qr).split()),
-                             "tensor([15.], size=(1,), dtype=" + dtype_msg +
-                             "quantization_scheme=torch.per_tensor_affine, " +
-                             "scale=1.0, zero_point=2)")
+                dtype_msg = str(dtype) + ", "
+                if device == "cuda":
+                    self.assertEqual(' '.join(str(qr).split()),
+                                     "tensor([15.], device='" + str(qr.device) + "', size=(1,), dtype=" + dtype_msg +
+                                     "quantization_scheme=torch.per_tensor_affine, " +
+                                     "scale=1.0, zero_point=2)")
+                else:
+                    self.assertEqual(' '.join(str(qr).split()),
+                                     "tensor([15.], size=(1,), dtype=" + dtype_msg +
+                                     "quantization_scheme=torch.per_tensor_affine, " +
+                                     "scale=1.0, zero_point=2)")
 
     def test_qtensor_quant_dequant(self):
         scale = 0.02