[caffe2] allow dropout to take 1.0 as dropout ratio to zero-out a layer (#72741)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/72741
as titled.
Context:
This is useful for quickly mitigating feature-induced overfitting: we can do omni-transfer on a trained model and apply dropout with ratio = 1 to the features that cause overfitting. Directly removing those features is not feasible in omni-transfer scenarios, since the sizes of the downstream FC layers would change.
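For illustration, a minimal sketch (caffe2 Python API; the blob names here are hypothetical) of how a ratio-1.0 Dropout op could be used to zero out one feature's contribution after this change:
```python
from caffe2.python import core, workspace
import numpy as np

# Hypothetical feature blob we want to effectively remove from the model.
workspace.FeedBlob("overfitting_feature", np.random.rand(4, 8).astype(np.float32))

# With this change, ratio=1.0 is accepted (previously the op enforced ratio < 1)
# and zeroes out the blob entirely in training mode.
op = core.CreateOperator(
    "Dropout",
    ["overfitting_feature"],
    ["overfitting_feature_dropped", "mask"],
    ratio=1.0,
    is_test=0,
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("overfitting_feature_dropped"))  # all zeros
```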
Experimental records:
https://fb.quip.com/npIkAgRc8jl9#temp:C:DWC050ceaba14424d23a78462c01
Applying dropout with ratio = 1 to the selected features improves the eval NE over the next few hours (compared to the v0 baseline), as shown in the figures.
Test Plan:
```
buck test caffe2/caffe2/python/operator_test:dropout_op_test
```
Reviewed By: ustctf
Differential Revision: D34178732
fbshipit-source-id: 533feebe21bc582eefd756de397d5c7807c7438d
(cherry picked from commit 5dabf9c484c0bc5410e3700e3010cdabb4bf903c)
diff --git a/caffe2/operators/dropout_op.cc b/caffe2/operators/dropout_op.cc
index 6f37407..bbd1eb1 100644
--- a/caffe2/operators/dropout_op.cc
+++ b/caffe2/operators/dropout_op.cc
@@ -15,13 +15,12 @@
return true;
} else {
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
- float scale = 1. / (1. - ratio_);
+ float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
// mask=true means keep, and mask=false means not keep, so we will
// generate probability depending on 1-ratio.
at::bernoulli_distribution<double> dist(1. - ratio_);
const float* Xdata = X.data<float>();
float* Ydata = Y->template mutable_data<float>();
-
auto mask = Output(1, X.sizes(), at::dtype<bool>());
bool* mask_data = mask->template mutable_data<bool>();
auto* gen = context_.RandGenerator();
@@ -52,7 +51,7 @@
const bool* mask_data = mask.data<bool>();
float* dXdata = dX->template mutable_data<float>();
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
- float scale = 1. / (1. - ratio_);
+ float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
for (int i = 0; i < dY.numel(); ++i) {
// NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
dXdata[i] = dYdata[i] * mask_data[i] * scale;
diff --git a/caffe2/operators/dropout_op.h b/caffe2/operators/dropout_op.h
index aff0528..ae8f0ff 100644
--- a/caffe2/operators/dropout_op.h
+++ b/caffe2/operators/dropout_op.h
@@ -19,7 +19,6 @@
is_test_(
this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
CAFFE_ENFORCE_GE(ratio_, 0);
- CAFFE_ENFORCE_LT(ratio_, 1);
}
bool RunOnDevice() override;
@@ -41,7 +40,6 @@
is_test_(
this->template GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)) {
CAFFE_ENFORCE_GE(ratio_, 0);
- CAFFE_ENFORCE_LT(ratio_, 1);
}
bool RunOnDevice() override;
diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py
index d3a5c83..ad2b620 100644
--- a/caffe2/python/operator_test/dropout_op_test.py
+++ b/caffe2/python/operator_test/dropout_op_test.py
@@ -74,3 +74,35 @@
gc, op, [X], reference_dropout_ratio0,
# Don't check the mask with cuDNN because it's packed data
outputs_to_check=None if engine != 'CUDNN' else [0])
+
+
+ @given(X=hu.tensor(),
+ in_place=st.booleans(),
+ output_mask=st.booleans(),
+ engine=st.sampled_from(["", "CUDNN"]),
+ **hu.gcs)
+ @settings(deadline=10000)
+ def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc):
+ """Test with ratio=0 for a deterministic reference impl."""
+ if in_place:
+ # Skip if trying in-place on GPU
+ assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP})
+ # If in-place on CPU, don't compare with GPU
+ dc = dc[:1]
+ is_test = not output_mask
+ op = core.CreateOperator("Dropout", ["X"],
+ ["X" if in_place else "Y"] +
+ (["mask"] if output_mask else []),
+ ratio=1.0, engine=engine,
+ is_test=is_test)
+
+ self.assertDeviceChecks(dc, op, [X], [0])
+ if not is_test:
+ self.assertGradientChecks(gc, op, [X], 0, [0])
+
+ def reference_dropout_ratio1(x):
+ return (x,) if is_test else (np.zeros(x.shape, dtype=np.float32), np.zeros(x.shape, dtype=np.bool_))
+ self.assertReferenceChecks(
+ gc, op, [X], reference_dropout_ratio1,
+ # Don't check the mask with cuDNN because it's packed data
+ outputs_to_check=None if engine != 'CUDNN' else [0])
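For reference, a small sketch (plain Python, mirroring the updated C++ expression above, not part of the patch) of why the guard on `scale` is needed: without it, ratio = 1.0 would divide by zero, whereas the ternary clamps the scale to 0 so both the forward output and the gradient become exact zeros.
```python
def dropout_scale(ratio):
    # Mirrors the updated C++ expression:
    #   float scale = ratio_ >= 1.0 ? 0.0 : 1. / (1. - ratio_);
    return 0.0 if ratio >= 1.0 else 1.0 / (1.0 - ratio)

print(dropout_scale(0.5))  # 2.0 -> kept units are scaled up by 1/(1-ratio)
print(dropout_scale(1.0))  # 0.0 -> everything dropped; avoids division by zero
```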