Optimize max_pooling for inference on the MKL-DNN/IDEEP device (#10156)

Summary:
Optimize the max_pooling operation for the inference path by passing the "inference" flag to the underlying MKL-DNN primitive, which skips computing and storing the max indices that are only needed for training. To keep the API compatible, training mode remains the default and inference mode is enabled in the optimizeForIdeep pass.
Tests show a speed-up of up to 7x for a single max_pooling operation on BDW (Broadwell).
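
For nets that are not rewritten by optimizeForIdeep, a minimal sketch of setting the new argument by hand is shown below. This is illustrative only and not part of the change; the header path and helper name are assumptions, but the protobuf calls mirror the setPoolingInferenceMode pass in the diff.

#include "caffe2/proto/caffe2.pb.h"

// Flip an IDEEP MaxPool OperatorDef to the inference path. The operator
// defaults to training mode (training_mode = 1) when the argument is absent.
void MarkMaxPoolForInference(caffe2::OperatorDef* op) {
  for (auto& arg : *op->mutable_arg()) {
    if (arg.name() == "training_mode") {
      arg.set_i(0);
      return;
    }
  }
  auto* arg = op->add_arg();
  arg->set_name("training_mode");
  arg->set_i(0);
}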
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10156

Differential Revision: D9276755

Pulled By: yinghai

fbshipit-source-id: ad533d53aabb8ccb3b592da984d6269d9b794a8a
diff --git a/caffe2/ideep/operators/pool_op.cc b/caffe2/ideep/operators/pool_op.cc
index ace88cf..6117060 100644
--- a/caffe2/ideep/operators/pool_op.cc
+++ b/caffe2/ideep/operators/pool_op.cc
@@ -8,7 +8,9 @@
   USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
 
   IDEEPPoolOp(const OperatorDef& operator_def, Workspace* ws)
-      : IDEEPConvPoolOpBase(operator_def, ws) {
+      : IDEEPConvPoolOpBase(operator_def, ws),
+        training_mode_(
+            OperatorBase::GetSingleArgument<int>("training_mode", 1)) {
     CAFFE_ENFORCE(
         (dilation_h() == 1) && (dilation_w() == 1),
         "Pooling op does not support dilation right now.");
@@ -33,15 +35,18 @@
     auto& X = Input(INPUT);
     auto* Y = Output(OUTPUT);
     auto Y_dims = CalcOutputDims(X, X.get_dim(1));
+    mkldnn::prop_kind pk = training_mode_ ?
+      mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_inference;
 
     ideep::pooling_forward::compute(X, Y_dims, *Y,
-        stride_, kernel_, pad_tl(), pad_br(), algo_);
+        stride_, kernel_, pad_tl(), pad_br(), algo_, pk);
 
     return true;
   }
 
  private:
   ialgo algo_;
+  bool training_mode_;
 
   INPUT_TAGS(INPUT);
   OUTPUT_TAGS(OUTPUT);
diff --git a/caffe2/opt/optimize_ideep.cc b/caffe2/opt/optimize_ideep.cc
index d880987..af35304 100644
--- a/caffe2/opt/optimize_ideep.cc
+++ b/caffe2/opt/optimize_ideep.cc
@@ -363,6 +363,35 @@
   }
 }
 
+void setPoolingInferenceMode(repr::NNModule *nn) {
+  for (auto node_pair : repr::nn::dataIterator<repr::MaxPool>(nn->dataFlow)) {
+    repr::NNGraph::NodeRef maxPoolNode;
+    repr::MaxPool *maxPool;
+    std::tie(maxPool, maxPoolNode) = node_pair;
+
+    if (!isOnIdeepDevice(*maxPool)) {
+      LOG(WARNING) << "Not an IDEEP operator";
+      continue;
+    }
+
+    auto *op = getMutableOpDef(*maxPool);
+    bool found_training_mode = false;
+    for (auto &arg : *op->mutable_arg()) {
+      if (arg.name() == "training_mode") {
+        arg.set_i(0);
+        found_training_mode = true;
+        break;
+      }
+    }
+
+    if (!found_training_mode) {
+      auto *arg = op->add_arg();
+      arg->set_name("training_mode");
+      arg->set_i(0);
+    }
+  }
+}
+
 void OptimizeForIdeep(
     repr::NNModule* nn,
     caffe2::Workspace* ws,
@@ -379,6 +408,8 @@
   fuseActivationForIdeep(nn);
 
   enforceFusionInplaceForIdeep(nn);
+
+  setPoolingInferenceMode(nn);
 }
 
 #endif // CAFFE2_USE_IDEEP
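
A hedged end-to-end sketch of running the pass over a predict net so that MaxPool ops pick up training_mode = 0. The converter helpers (convertToNNModule / convertToCaffe2Proto), the header paths, and the third OptimizeForIdeep argument are assumptions based on the caffe2/opt utilities of this period, not part of this diff.

#include "caffe2/core/workspace.h"
#include "caffe2/opt/converter.h"
#include "caffe2/opt/optimize_ideep.h"

caffe2::NetDef OptimizePredictNet(caffe2::NetDef net, caffe2::Workspace* ws) {
  // Convert the NetDef into a nomnigraph NNModule for graph rewrites.
  auto nn = caffe2::convertToNNModule(net);
  // Runs the IDEEP passes, including the setPoolingInferenceMode pass added here.
  caffe2::opt::OptimizeForIdeep(&nn, ws, /*training_mode=*/false);
  // Convert back to a NetDef with the training_mode arguments applied.
  return caffe2::convertToCaffe2Proto(nn);
}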