| import torch |
| import torch.cuda |
| import torch.jit |
| import numpy as np |
| from hypothesis import given |
| from hypothesis import strategies as st |
| import torch.testing._internal.hypothesis_utils as hu |
| hu.assert_deadline_disabled() |
| from torch.testing._internal.common_utils import run_tests, TestCase |
| from torch.quantization import FakeQuantize |
| from torch.quantization import default_observer, default_per_channel_weight_observer |
| import io |
| import unittest |
| |
| # Reference method for fake quantize |
| def _fake_quantize_per_tensor_affine_reference(X, scale, zero_point, quant_min, quant_max): |
| res = (torch.clamp(torch.round(X * (1.0 / scale) + zero_point), quant_min, quant_max) - zero_point) * scale |
| return res |
| |
| # Reference method for the gradient of the fake quantize operator |
| def _fake_quantize_per_tensor_affine_grad_reference(dY, X, scale, zero_point, quant_min, quant_max): |
| Xq = torch.round(X * (1.0 / scale) + zero_point) |
| mask = (Xq >= quant_min) * (Xq <= quant_max) |
| res = torch.zeros_like(dY) |
| res[mask] = dY[mask] |
| return res |
| |
| # Helper function used to simulate per-channel fake-quant against any axis |
| def _permute_to_axis_zero(X, axis): |
| new_axis_list = list(range(X.dim())) |
| new_axis_list[axis] = 0 |
| new_axis_list[0] = axis |
| y = X.permute(tuple(new_axis_list)) |
| return y, new_axis_list |
| |
| # Reference method for fake quantize |
| def _fake_quantize_per_channel_affine_reference(X, per_channel_scale, per_channel_zero_point, axis, quant_min, quant_max): |
| X, permute_axis_list = _permute_to_axis_zero(X, axis) |
| res = torch.zeros_like(X) |
| |
| for i in range(X.size()[0]): |
| res[i] = (torch.clamp(torch.round(X[i] * (1.0 / per_channel_scale[i]) + |
| per_channel_zero_point[i]), quant_min, quant_max) - per_channel_zero_point[i]) * per_channel_scale[i] |
| |
| out = res.permute(tuple(permute_axis_list)) |
| return out |
| |
| # Reference method for the gradient of the fake quantize operator |
| def _fake_quantize_per_channel_affine_grad_reference(dY, X, per_channel_scale, per_channel_zero_point, axis, quant_min, quant_max): |
| X, permute_axis_list = _permute_to_axis_zero(X, axis) |
| Xq = torch.zeros_like(X) |
| for i in range(X.size()[0]): |
| Xq[i] = torch.round(X[i] * (1.0 / per_channel_scale[i]) + per_channel_zero_point[i]) |
| Xq = Xq.permute(tuple(permute_axis_list)) |
| mask = (Xq >= quant_min) * (Xq <= quant_max) |
| res = torch.zeros_like(dY) |
| res[mask] = dY[mask] |
| return res |
| |
| def to_tensor(X, device): |
| return torch.tensor(X).to(device=torch.device(device), dtype=torch.float32) |
| |
| NP_RANDOM_SEED = 19 |
| tolerance = 1e-6 |
| |
| class TestFakeQuantizePerTensor(TestCase): |
| |
| @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), |
| X=hu.tensor(shapes=hu.array_shapes(1, 5,), |
| qparams=hu.qparams(dtypes=torch.quint8))) |
| def test_forward_per_tensor(self, device, X): |
| r"""Tests the forward path of the FakeQuantizePerTensorAffine op. |
| """ |
| np.random.seed(NP_RANDOM_SEED) |
| X, (scale, zero_point, torch_type) = X |
| quant_min = torch.iinfo(torch_type).min |
| quant_max = torch.iinfo(torch_type).max |
| |
| X = to_tensor(X, device) |
| Y = _fake_quantize_per_tensor_affine_reference(X.cpu(), scale, zero_point, quant_min, quant_max) |
| Y_prime = torch.fake_quantize_per_tensor_affine( |
| X, scale, zero_point, quant_min, quant_max) |
| np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) |
| |
| @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), |
| X=hu.tensor(shapes=hu.array_shapes(1, 5,), |
| qparams=hu.qparams(dtypes=torch.quint8))) |
| @unittest.skip("temporarily disable the test") |
| def test_backward_per_tensor(self, device, X): |
| r"""Tests the backward method. |
| """ |
| np.random.seed(NP_RANDOM_SEED) |
| X, (scale, zero_point, torch_type) = X |
| quant_min = torch.iinfo(torch_type).min |
| quant_max = torch.iinfo(torch_type).max |
| |
| X = to_tensor(X, device) |
| X.requires_grad_() |
| Y = _fake_quantize_per_tensor_affine_reference(X.cpu(), scale, zero_point, quant_min, quant_max) |
| Y_prime = torch.fake_quantize_per_tensor_affine( |
| X, scale, zero_point, quant_min, quant_max) |
| dout = torch.rand(X.shape, dtype=torch.float).to(device) |
| dX = _fake_quantize_per_tensor_affine_grad_reference( |
| dout, X, scale, zero_point, quant_min, quant_max) |
| Y_prime.backward(dout) |
| np.testing.assert_allclose(dX.cpu(), X.grad.cpu().detach().numpy(), rtol=tolerance, atol=tolerance) |
| |
| @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), |
| X=hu.tensor(shapes=hu.array_shapes(1, 5,), |
| qparams=hu.qparams(dtypes=torch.quint8))) |
| # https://github.com/pytorch/pytorch/issues/30604 |
| @unittest.skip("temporarily disable the test") |
| def test_numerical_consistency_per_tensor(self, device, X): |
| r"""Comparing numerical consistency between CPU quantize/dequantize op and the CPU fake quantize op |
| """ |
| np.random.seed(NP_RANDOM_SEED) |
| X, (scale, zero_point, torch_type) = X |
| quant_min = torch.iinfo(torch_type).min |
| quant_max = torch.iinfo(torch_type).max |
| |
| X = to_tensor(X, device) |
| # quantize_per_tensor and dequantize are only implemented in CPU |
| Y = torch.dequantize(torch.quantize_per_tensor(X.cpu(), scale, zero_point, torch_type)) |
| Y_prime = torch.fake_quantize_per_tensor_affine( |
| X, scale, zero_point, quant_min, quant_max) |
| np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) |
| |
| @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), |
| X=hu.tensor(shapes=hu.array_shapes(1, 5,), |
| qparams=hu.qparams(dtypes=[torch.quint8])), |
| ) |
| def test_fq_module(self, device, X): |
| np.random.seed(NP_RANDOM_SEED) |
| X, (scale, zero_point, torch_type) = X |
| quant_min = torch.iinfo(torch_type).min |
| quant_max = torch.iinfo(torch_type).max |
| |
| X = to_tensor(X, device) |
| X.requires_grad_() |
| fq_module = torch.quantization.default_fake_quant().to(device) |
| Y_prime = fq_module(X) |
| assert fq_module.scale is not None |
| assert fq_module.zero_point is not None |
| Y = _fake_quantize_per_tensor_affine_reference(X, fq_module.scale, fq_module.zero_point, quant_min, quant_max) |
| np.testing.assert_allclose(Y.cpu().detach().numpy(), Y_prime.cpu().detach().numpy(), rtol=tolerance, atol=tolerance) |
| |
| # Test backward |
| dout = torch.rand(X.shape, dtype=torch.float, device=device) |
| Y_prime.backward(dout) |
| dX = _fake_quantize_per_tensor_affine_grad_reference(dout, X, fq_module.scale, fq_module.zero_point, quant_min, quant_max) |
| np.testing.assert_allclose(dX.cpu().numpy(), X.grad.cpu().detach().numpy(), rtol=tolerance, atol=tolerance) |
| |
| def test_fq_serializable(self): |
| observer = default_observer |
| quant_min = 0 |
| quant_max = 255 |
| fq_module = FakeQuantize(observer, quant_min, quant_max) |
| X = torch.tensor([-5, -3.5, -2, 0, 3, 5, 7], dtype=torch.float32) |
| y_ref = fq_module(X) |
| state_dict = fq_module.state_dict() |
| self.assertEqual(state_dict['scale'], 0.094488) |
| self.assertEqual(state_dict['zero_point'], 53) |
| b = io.BytesIO() |
| torch.save(state_dict, b) |
| b.seek(0) |
| loaded_dict = torch.load(b) |
| loaded_fq_module = FakeQuantize(observer, quant_min, quant_max) |
| loaded_fq_module.load_state_dict(loaded_dict) |
| for key in state_dict: |
| self.assertEqual(state_dict[key], loaded_fq_module.state_dict()[key]) |
| |
| self.assertEqual(loaded_fq_module.calculate_qparams(), fq_module.calculate_qparams()) |
| |
| def test_fake_quant_control(self): |
| torch.manual_seed(42) |
| X = torch.rand(20, 10, dtype=torch.float32) |
| fq_module = torch.quantization.default_fake_quant() |
| # Output of fake quant is not identical to input |
| Y = fq_module(X) |
| self.assertNotEqual(Y, X) |
| torch.quantization.disable_fake_quant(fq_module) |
| X = torch.rand(20, 10, dtype=torch.float32) |
| Y = fq_module(X) |
| # Fake quant is disabled,output is identical to input |
| self.assertEqual(Y, X) |
| scale = fq_module.scale |
| zero_point = fq_module.zero_point |
| torch.quantization.disable_observer(fq_module) |
| torch.quantization.enable_fake_quant(fq_module) |
| X = 10.0 * torch.rand(20, 10, dtype=torch.float32) - 5.0 |
| Y = fq_module(X) |
| self.assertNotEqual(Y, X) |
| # Observer is disabled, scale and zero-point do not change |
| self.assertEqual(fq_module.scale, scale) |
| self.assertEqual(fq_module.zero_point, zero_point) |
| torch.quantization.enable_observer(fq_module) |
| Y = fq_module(X) |
| self.assertNotEqual(Y, X) |
| # Observer is enabled, scale and zero-point are different |
| self.assertNotEqual(fq_module.scale, scale) |
| self.assertNotEqual(fq_module.zero_point, zero_point) |
| |
| |
| |
| class TestFakeQuantizePerChannel(TestCase): |
| |
| @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), |
| X=hu.per_channel_tensor(shapes=hu.array_shapes(1, 5,), |
| qparams=hu.qparams(dtypes=torch.quint8))) |
| def test_forward_per_channel(self, device, X): |
| r"""Tests the forward path of the FakeQuantizePerTensorAffine op. |
| """ |
| np.random.seed(NP_RANDOM_SEED) |
| X, (scale, zero_point, axis, torch_type) = X |
| quant_min = torch.iinfo(torch_type).min |
| quant_max = torch.iinfo(torch_type).max |
| |
| X = to_tensor(X, device) |
| scale = to_tensor(scale, device) |
| zero_point = torch.tensor(zero_point).to(dtype=torch.int64, device=device) |
| Y = _fake_quantize_per_channel_affine_reference(X.cpu(), scale.cpu(), zero_point.cpu(), axis, quant_min, quant_max) |
| Y_prime = torch.fake_quantize_per_channel_affine( |
| X, scale, zero_point, axis, quant_min, quant_max) |
| np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) |
| |
| @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), |
| X=hu.per_channel_tensor(shapes=hu.array_shapes(1, 5,), |
| qparams=hu.qparams(dtypes=torch.quint8))) |
| def test_backward_per_channel(self, device, X): |
| r"""Tests the backward method. |
| """ |
| np.random.seed(NP_RANDOM_SEED) |
| X, (scale, zero_point, axis, torch_type) = X |
| quant_min = torch.iinfo(torch_type).min |
| quant_max = torch.iinfo(torch_type).max |
| |
| X = to_tensor(X, device) |
| scale = to_tensor(scale, device) |
| zero_point = torch.tensor(zero_point).to(dtype=torch.int64, device=device) |
| X.requires_grad_() |
| Y_prime = torch.fake_quantize_per_channel_affine( |
| X, scale, zero_point, axis, quant_min, quant_max) |
| dout = torch.rand(X.shape, dtype=torch.float).to(device) |
| dX = _fake_quantize_per_channel_affine_grad_reference( |
| dout, X, scale, zero_point, axis, quant_min, quant_max) |
| Y_prime.backward(dout) |
| np.testing.assert_allclose(dX.cpu().detach().numpy(), X.grad.cpu().detach().numpy(), rtol=tolerance, atol=tolerance) |
| |
| @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), |
| X=hu.per_channel_tensor(shapes=hu.array_shapes(1, 5,), |
| qparams=hu.qparams(dtypes=torch.quint8))) |
| @unittest.skip("temporarily disable the test") |
| def test_numerical_consistency_per_channel(self, device, X): |
| r"""Comparing numerical consistency between CPU quantize/dequantize op and the CPU fake quantize op |
| """ |
| np.random.seed(NP_RANDOM_SEED) |
| X, (scale, zero_point, axis, torch_type) = X |
| quant_min = torch.iinfo(torch_type).min |
| quant_max = torch.iinfo(torch_type).max |
| |
| X = to_tensor(X, device) |
| scale = to_tensor(scale, device) |
| zero_point = torch.tensor(zero_point).to(dtype=torch.int64, device=device) |
| # quantize_linear and dequantize are only implemented in CPU |
| Y = torch.dequantize(torch.quantize_per_channel(X.cpu(), scale.cpu(), zero_point.cpu(), axis, torch_type)) |
| Y_prime = torch.fake_quantize_per_channel_affine( |
| X, scale, zero_point, axis, quant_min, quant_max) |
| np.testing.assert_allclose(Y, Y_prime.cpu(), rtol=tolerance, atol=tolerance) |
| |
| @given(device=st.sampled_from(['cpu', 'cuda'] if torch.cuda.is_available() else ['cpu']), |
| X=hu.per_channel_tensor(shapes=hu.array_shapes(2, 5,), |
| qparams=hu.qparams(dtypes=torch.qint8))) |
| def test_fq_module(self, device, X): |
| np.random.seed(NP_RANDOM_SEED) |
| X, (scale, zero_point, axis, torch_type) = X |
| quant_min = torch.iinfo(torch_type).min |
| quant_max = torch.iinfo(torch_type).max |
| |
| X = to_tensor(X, device) |
| X.requires_grad_() |
| fq_module = FakeQuantize(default_per_channel_weight_observer, quant_min, quant_max, ch_axis=axis).to(device) |
| Y_prime = fq_module(X) |
| assert fq_module.scale is not None |
| assert fq_module.zero_point is not None |
| Y = _fake_quantize_per_channel_affine_reference(X, fq_module.scale, |
| fq_module.zero_point, axis, quant_min, quant_max) |
| np.testing.assert_allclose(Y.cpu().detach().numpy(), Y_prime.cpu().detach().numpy(), rtol=tolerance, atol=tolerance) |
| |
| # Test backward |
| dout = torch.rand(X.shape, dtype=torch.float, device=device) |
| Y_prime.backward(dout) |
| dX = _fake_quantize_per_channel_affine_grad_reference(dout, X, fq_module.scale, |
| fq_module.zero_point, axis, quant_min, quant_max) |
| np.testing.assert_allclose(dX.cpu().numpy(), X.grad.cpu().detach().numpy(), rtol=tolerance, atol=tolerance) |
| |
| def test_fq_serializable(self): |
| observer = default_per_channel_weight_observer |
| quant_min = -128 |
| quant_max = 127 |
| fq_module = FakeQuantize(observer, quant_min, quant_max) |
| X = torch.tensor([[-5, -3.5, -2, 0, 3, 5, 7], [1, 3, 2, 5, 6.5, 8, 10]], dtype=torch.float32) |
| y_ref = fq_module(X) |
| state_dict = fq_module.state_dict() |
| self.assertEqual(state_dict['scale'], [0.054902, 0.078431]) |
| self.assertEqual(state_dict['zero_point'], [0, 0]) |
| b = io.BytesIO() |
| torch.save(state_dict, b) |
| b.seek(0) |
| loaded_dict = torch.load(b) |
| for key in state_dict: |
| self.assertEqual(state_dict[key], loaded_dict[key]) |
| |
| if __name__ == '__main__': |
| run_tests() |