Optimize max_pooling for inference on the MKL-DNN/IDEEP device (#10156)

Summary:
Optimize the max_pooling operation for the inference path by passing the "inference" flag to the underlying MKL-DNN primitive, which skips computing and storing the max indices that are only needed for training. To keep the API compatible, training mode remains the default and inference mode is enabled in the optimizeForIdeep pass.
Tests show a speed-up of up to 7x for a single max_pooling operation on BDW (Broadwell).
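
For nets that are not rewritten by optimizeForIdeep, a minimal sketch of setting the new argument by hand is shown below. This is illustrative only and not part of the change; the header path and helper name are assumptions, but the protobuf calls mirror the setPoolingInferenceMode pass in the diff.

#include "caffe2/proto/caffe2.pb.h"

// Flip an IDEEP MaxPool OperatorDef to the inference path. The operator
// defaults to training mode (training_mode = 1) when the argument is absent.
void MarkMaxPoolForInference(caffe2::OperatorDef* op) {
  for (auto& arg : *op->mutable_arg()) {
    if (arg.name() == "training_mode") {
      arg.set_i(0);
      return;
    }
  }
  auto* arg = op->add_arg();
  arg->set_name("training_mode");
  arg->set_i(0);
}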
Pull Request resolved: https://github.com/pytorch/pytorch/pull/10156

Differential Revision: D9276755

Pulled By: yinghai

fbshipit-source-id: ad533d53aabb8ccb3b592da984d6269d9b794a8a
diff --git a/caffe2/ideep/operators/pool_op.cc b/caffe2/ideep/operators/pool_op.cc
index ace88cf..6117060 100644
--- a/caffe2/ideep/operators/pool_op.cc
+++ b/caffe2/ideep/operators/pool_op.cc
@@ -8,7 +8,9 @@
   USE_IDEEP_CONV_POOL_BASE_FUNCTIONS();
 
   IDEEPPoolOp(const OperatorDef& operator_def, Workspace* ws)
-      : IDEEPConvPoolOpBase(operator_def, ws) {
+      : IDEEPConvPoolOpBase(operator_def, ws),
+        training_mode_(
+            OperatorBase::GetSingleArgument<int>("training_mode", 1)) {
     CAFFE_ENFORCE(
         (dilation_h() == 1) && (dilation_w() == 1),
         "Pooling op does not support dilation right now.");
@@ -33,15 +35,18 @@
     auto& X = Input(INPUT);
     auto* Y = Output(OUTPUT);
     auto Y_dims = CalcOutputDims(X, X.get_dim(1));
+    mkldnn::prop_kind pk = training_mode_ ?
+      mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_inference;
 
     ideep::pooling_forward::compute(X, Y_dims, *Y,
-        stride_, kernel_, pad_tl(), pad_br(), algo_);
+        stride_, kernel_, pad_tl(), pad_br(), algo_, pk);
 
     return true;
   }
 
  private:
   ialgo algo_;
+  bool training_mode_;
 
   INPUT_TAGS(INPUT);
   OUTPUT_TAGS(OUTPUT);
diff --git a/caffe2/opt/optimize_ideep.cc b/caffe2/opt/optimize_ideep.cc
index d880987..af35304 100644
--- a/caffe2/opt/optimize_ideep.cc
+++ b/caffe2/opt/optimize_ideep.cc
@@ -363,6 +363,35 @@
   }
 }
 
+void setPoolingInferenceMode(repr::NNModule *nn) {
+  for (auto node_pair : repr::nn::dataIterator<repr::MaxPool>(nn->dataFlow)) {
+    repr::NNGraph::NodeRef maxPoolNode;
+    repr::MaxPool *maxPool;
+    std::tie(maxPool, maxPoolNode) = node_pair;
+
+    if (!isOnIdeepDevice(*maxPool)) {
+      LOG(WARNING) << "Not an IDEEP operator";
+      continue;
+    }
+
+    auto *op = getMutableOpDef(*maxPool);
+    bool found_training_mode = false;
+    for (auto &arg : *op->mutable_arg()) {
+      if (arg.name() == "training_mode") {
+        arg.set_i(0);
+        found_training_mode = true;
+        break;
+      }
+    }
+
+    if (!found_training_mode) {
+      auto *arg = op->add_arg();
+      arg->set_name("training_mode");
+      arg->set_i(0);
+    }
+  }
+}
+
 void OptimizeForIdeep(
     repr::NNModule* nn,
     caffe2::Workspace* ws,
@@ -379,6 +408,8 @@
   fuseActivationForIdeep(nn);
 
   enforceFusionInplaceForIdeep(nn);
+
+  setPoolingInferenceMode(nn);
 }
 
 #endif // CAFFE2_USE_IDEEP
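
A hedged end-to-end sketch of running the pass over a predict net so that MaxPool ops pick up training_mode = 0. The converter helpers (convertToNNModule / convertToCaffe2Proto), the header paths, and the third OptimizeForIdeep argument are assumptions based on the caffe2/opt utilities of this period, not part of this diff.

#include "caffe2/core/workspace.h"
#include "caffe2/opt/converter.h"
#include "caffe2/opt/optimize_ideep.h"

caffe2::NetDef OptimizePredictNet(caffe2::NetDef net, caffe2::Workspace* ws) {
  // Convert the NetDef into a nomnigraph NNModule for graph rewrites.
  auto nn = caffe2::convertToNNModule(net);
  // Runs the IDEEP passes, including the setPoolingInferenceMode pass added here.
  caffe2::opt::OptimizeForIdeep(&nn, ws, /*training_mode=*/false);
  // Convert back to a NetDef with the training_mode arguments applied.
  return caffe2::convertToCaffe2Proto(nn);
}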