[caffe2] allow dropout to take 1.0 as dropout ratio to zero-out a layer (#72741)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72741

As titled: allow the Dropout operator to accept ratio = 1.0, which zeroes out its entire output.

Context:
This is useful for quickly mitigating feature-induced overfitting: we can take a trained model, do omni-transfer on it, and apply dropout with ratio = 1 to the features that cause the overfitting. Directly removing those features is not feasible in omni-transfer scenarios because the downstream FC sizes would change.
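For illustration, a minimal sketch of the intended use via the caffe2 Python API (blob names here are made up; `ratio=1.0` is what this diff newly permits):

```
from caffe2.python import core, workspace
import numpy as np

# Pretend "feat" is the embedding of a feature we want to neutralize.
workspace.FeedBlob("feat", np.random.rand(4, 16).astype(np.float32))

op = core.CreateOperator(
    "Dropout",
    ["feat"],
    ["feat_zeroed", "mask"],
    ratio=1.0,  # drop everything; previously rejected by CAFFE_ENFORCE_LT(ratio_, 1)
    is_test=0,  # train mode; in test mode Dropout is a pass-through
)
workspace.RunOperatorOnce(op)

# Same shape as "feat", so downstream FC sizes are untouched, but all zeros.
print(workspace.FetchBlob("feat_zeroed"))
```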

Experimental records:
https://fb.quip.com/npIkAgRc8jl9#temp:C:DWC050ceaba14424d23a78462c01
Applying dropout with ratio = 1 to the selected features improves the eval NE (normalized entropy) over the next few hours compared to the v0 baseline, as shown in the figures.
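Why a guard is needed at all: in train mode the kept activations are rescaled by `1 / (1 - ratio)`, which divides by zero at `ratio = 1`. The diff below clamps the scale to 0 in that case, which is also the correct limit, since the mask drops every element anyway. A numpy sketch of the guarded math (an illustration of the formula, not the actual kernel):

```
import numpy as np

rng = np.random.default_rng(0)

def dropout_reference(x, ratio):
    # Guarded scale: would be inf at ratio = 1 without the clamp.
    scale = 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)
    # mask=True means keep; the keep probability is 1 - ratio.
    mask = rng.random(x.shape) < (1.0 - ratio)
    return x * mask * scale, mask

y, mask = dropout_reference(np.ones((2, 3), dtype=np.float32), ratio=1.0)
assert not y.any() and not mask.any()  # everything dropped, output all zeros
```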

Test Plan:
```
buck test caffe2/caffe2/python/operator_test:dropout_op_test
```

Reviewed By: ustctf

Differential Revision: D34178732

fbshipit-source-id: 533feebe21bc582eefd756de397d5c7807c7438d
(cherry picked from commit 5dabf9c484c0bc5410e3700e3010cdabb4bf903c)
diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc
index 6f37407..bbd1eb1 100644
--- a/caffe2/operators/dropout_op.cc
+++ b/caffe2/operators/dropout_op.cc
@@ -15,13 +15,13 @@
     return true;
   } else {
     // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-    float scale = 1. / (1. - ratio_);
+    // clamp: 1 / (1 - ratio_) would be inf at ratio_ == 1
+    float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
     // mask=true means keep, and mask=false means not keep, so we will
     // generate probability depending on 1-ratio.
     at::bernoulli_distribution<double> dist(1. - ratio_);
     const float* Xdata = X.data<float>();
     float* Ydata = Y->template mutable_data<float>();
-
     auto mask = Output(1, X.sizes(), at::dtype<bool>());
     bool* mask_data = mask->template mutable_data<bool>();
     auto* gen = context_.RandGenerator();
@@ -52,7 +52,7 @@
     const bool* mask_data = mask.data<bool>();
     float* dXdata = dX->template mutable_data<float>();
     // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
-    float scale = 1. / (1. - ratio_);
+    float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
     for (int i = 0; i < dY.numel(); ++i) {
       // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
       dXdata[i] = dYdata[i] * mask_data[i] * scale;
diff --git a/caffe2/operators/dropout_op.h b/caffe2/operators/dropout_op.h
index aff0528..ae8f0ff 100644
--- a/caffe2/operators/dropout_op.h
+++ b/caffe2/operators/dropout_op.h
@@ -19,7 +19,6 @@
         is_test_(
             this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
     CAFFE_ENFORCE_GE(ratio_, 0);
-    CAFFE_ENFORCE_LT(ratio_, 1);
   }
 
   bool RunOnDevice() override;
@@ -41,7 +40,6 @@
         is_test_(
             this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
     CAFFE_ENFORCE_GE(ratio_, 0);
-    CAFFE_ENFORCE_LT(ratio_, 1);
   }
 
   bool RunOnDevice() override;
diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py
index d3a5c83..ad2b620 100644
--- a/caffe2/python/operator_test/dropout_op_test.py
+++ b/caffe2/python/operator_test/dropout_op_test.py
@@ -74,3 +74,35 @@
             gc, op, [X], reference_dropout_ratio0,
             # Don't check the mask with cuDNN because it's packed data
             outputs_to_check=None if engine != 'CUDNN' else [0])
+
+
+    @given(X=hu.tensor(),
+           in_place=st.booleans(),
+           output_mask=st.booleans(),
+           engine=st.sampled_from(["", "CUDNN"]),
+           **hu.gcs)
+    @settings(deadline=10000)
+    def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc):
+        """Test with ratio=0 for a deterministic reference impl."""
+        if in_place:
+            # Skip if trying in-place on GPU
+            assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP})
+            # If in-place on CPU, don't compare with GPU
+            dc = dc[:1]
+        is_test = not output_mask
+        op = core.CreateOperator("Dropout", ["X"],
+                                 ["X" if in_place else "Y"] +
+                                 (["mask"] if output_mask else []),
+                                 ratio=1.0, engine=engine,
+                                 is_test=is_test)
+
+        self.assertDeviceChecks(dc, op, [X], [0])
+        if not is_test:
+            self.assertGradientChecks(gc, op, [X], 0, [0])
+
+        def reference_dropout_ratio1(x):
+            return (x,) if is_test else (np.zeros(x.shape, dtype=np.float32), np.zeros(x.shape, dtype=np.bool_))
+        self.assertReferenceChecks(
+            gc, op, [X], reference_dropout_ratio1,
+            # Don't check the mask with cuDNN because it's packed data
+            outputs_to_check=None if engine != 'CUDNN' else [0])