[quant][core][gpu][feature] Added support for float->quantized cuda tensor copying (#76177)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/76177
Previously, copying a floating-point tensor into a quantized tensor was only supported for CPU tensors. This PR extends that support to GPU tensors: the CPU-only device check is removed, `quantized_copy_from_float_cpu_` is renamed to `quantized_copy_from_float_`, and the hand-rolled per-tensor quantization loop is replaced with a call to `quantize_tensor_per_tensor_affine`, which dispatches to device-specific kernels. A corresponding CUDA case was added to `test_qtensor_float_assignment`.
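For context, a minimal sketch of the behavior this change enables (the scale, zero point, and values here are illustrative, mirroring the updated test; assumes a CUDA device is available):
```
import torch

if torch.cuda.is_available():
    r = torch.ones(1, dtype=torch.float, device="cuda")
    # Per-tensor affine quantized tensor on the GPU
    qr = torch.quantize_per_tensor(r, scale=1.0, zero_point=2, dtype=torch.quint8)
    x = torch.ones(1, dtype=torch.float, device="cuda") * 15.3
    # Copying from a float CUDA tensor now routes through quantized_copy_from_float_
    qr[:] = x
    print(qr.item())  # 15.0 after the quantize/dequantize round trip
```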
Test Plan:
```
python test/test_quantization.py -k test_qtensor_float_assignment
```
Imported from OSS
Differential Revision: D35817832
Reviewed By: jerryzh168
Pulled By: dzdang
fbshipit-source-id: e5a4a0bb2d8a56f3f1a88806a534b5cb38275cf2
(cherry picked from commit 9173e07b51bb1b853244b205ddf3e36000f01b64)
diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp
index 7d16603..66c646c 100644
--- a/aten/src/ATen/native/Copy.cpp
+++ b/aten/src/ATen/native/Copy.cpp
@@ -185,7 +185,7 @@
}
if (self.is_quantized() && !src.is_quantized()) {
- return quantized_copy_from_float_cpu_(self, src);
+ return quantized_copy_from_float_(self, src);
}
if (self.is_quantized() && src.is_quantized()) {
diff --git a/aten/src/ATen/native/quantized/Copy.cpp b/aten/src/ATen/native/quantized/Copy.cpp
index ac3f5e9..e3b6bd8 100644
--- a/aten/src/ATen/native/quantized/Copy.cpp
+++ b/aten/src/ATen/native/quantized/Copy.cpp
@@ -13,7 +13,7 @@
// This means that assignment of a non-contiguous quantized subtensor is currently not supported in pytorch
// e.g., Consider a 2x2 quantized tensor qt1 and a non-quantized tensor t2. The operation
// `qt1[:, 0] = t2[:, 0]` would trigger the exception b/c neither the LHS nor RHS is contiguous
-Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src) {
+Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src) {
TORCH_CHECK(
src.scalar_type() == at::kFloat,
"Quantized copy only works with kFloat as source Tensor");
@@ -23,21 +23,14 @@
TORCH_CHECK(
self.sizes().equals(src.sizes()),
"Quantized copy only works with Tensors with the same shape");
- TORCH_CHECK(
- self.device().type() == kCPU,
- "Quantized copy only works with QuantizedCPU Tensors");
AT_DISPATCH_QINT_TYPES(self.scalar_type(), "Copy", [&]() {
- if (self.qscheme() == kPerChannelAffine) {
+ if (self.qscheme() == kPerChannelAffine || self.qscheme() == kPerChannelAffineFloatQParams
+ || self.qscheme() == kPerChannelSymmetric) {
quantize_tensor_per_channel_affine(src, self, self.q_per_channel_scales(),
self.q_per_channel_zero_points(),
self.q_per_channel_axis());
} else {
- float* src_data = src.data_ptr<float>();
- scalar_t* self_data = self.data_ptr<scalar_t>();
- for (const auto i : c10::irange(self.numel())) {
- self_data[i] = quantize_val<scalar_t>(
- self.q_scale(), self.q_zero_point(), src_data[i]);
- }
+ quantize_tensor_per_tensor_affine(src, self, self.q_scale(), self.q_zero_point());
}
});
return self;
diff --git a/aten/src/ATen/native/quantized/Copy.h b/aten/src/ATen/native/quantized/Copy.h
index 65dabd2..d52c8ff 100644
--- a/aten/src/ATen/native/quantized/Copy.h
+++ b/aten/src/ATen/native/quantized/Copy.h
@@ -5,6 +5,6 @@
namespace at {
namespace native {
-Tensor& quantized_copy_from_float_cpu_(Tensor& self, const Tensor& src);
+Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src);
}
} // namespace at
diff --git a/test/quantization/core/test_quantized_tensor.py b/test/quantization/core/test_quantized_tensor.py
index 6ab45e3..f54388d 100644
--- a/test/quantization/core/test_quantized_tensor.py
+++ b/test/quantization/core/test_quantized_tensor.py
@@ -354,25 +354,33 @@
# item
scale = 1.0
zero_point = 2
- r = torch.ones(1, dtype=torch.float)
- for dtype in [torch.qint8, torch.quint8, torch.qint32]:
- qr = torch.quantize_per_tensor(r, scale, zero_point, dtype=dtype)
- self.assertEqual(qr.item(), 1)
- self.assertEqual(qr[0].item(), 1)
- # assignment
- self.assertTrue(qr[0].is_quantized)
- qr[0] = 11.3 # float assignment
- self.assertEqual(qr.item(), 11)
- x = torch.ones(1, dtype=torch.float) * 15.3
- # Copying from a float Tensor
- qr[:] = x
- self.assertEqual(qr.item(), 15)
+ devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
+ for device in devices:
+ r = torch.ones(1, dtype=torch.float).to(device=device)
+ for dtype in [torch.qint8, torch.quint8, torch.qint32]:
+ qr = torch.quantize_per_tensor(r, scale, zero_point, dtype=dtype)
+ self.assertEqual(qr.item(), 1)
+ self.assertEqual(qr[0].item(), 1)
+ # assignment
+ self.assertTrue(qr[0].is_quantized)
+ qr[0] = torch.Tensor([11.3]).to(device=device) # float assignment
+ self.assertEqual(qr.item(), 11)
+ x = torch.ones(1, dtype=torch.float).to(device=device) * 15.3
+ # Copying from a float Tensor
+ qr[:] = x
+ self.assertEqual(qr.item(), 15)
- dtype_msg = str(dtype) + ", "
- self.assertEqual(' '.join(str(qr).split()),
- "tensor([15.], size=(1,), dtype=" + dtype_msg +
- "quantization_scheme=torch.per_tensor_affine, " +
- "scale=1.0, zero_point=2)")
+ dtype_msg = str(dtype) + ", "
+ if device == "cuda":
+ self.assertEqual(' '.join(str(qr).split()),
+ "tensor([15.], device='" + str(qr.device) + "', size=(1,), dtype=" + dtype_msg +
+ "quantization_scheme=torch.per_tensor_affine, " +
+ "scale=1.0, zero_point=2)")
+ else:
+ self.assertEqual(' '.join(str(qr).split()),
+ "tensor([15.], size=(1,), dtype=" + dtype_msg +
+ "quantization_scheme=torch.per_tensor_affine, " +
+ "scale=1.0, zero_point=2)")
def test_qtensor_quant_dequant(self):
scale = 0.02