[IDEEP] [fix bug] Fix bug in ideep SkipOutputCopy strategy (#8372)

* Fix a bug in SkipIndices: ContainsInternal combined the per-index checks with && instead of ||

* Fix an IDEEP bug: outputs selected by the SkipOutputCopy strategy keep their original blob name, revising them to CPUTensor

* [IDEEP] Add IDEEP fallbacks for Style-Transfer ops
diff --git a/caffe2/core/common.h b/caffe2/core/common.h
index c30d502..8d75628 100644
--- a/caffe2/core/common.h
+++ b/caffe2/core/common.h
@@ -258,7 +258,7 @@
   }
   template <int First, int Second, int... Rest>
   static inline bool ContainsInternal(const int i) {
-    return (i == First) && ContainsInternal<Second, Rest...>(i);
+    return (i == First) || ContainsInternal<Second, Rest...>(i);
   }
 
  public:
diff --git a/caffe2/ideep/operators/operator_fallback_ideep.cc b/caffe2/ideep/operators/operator_fallback_ideep.cc
index d79557d..7491601 100644
--- a/caffe2/ideep/operators/operator_fallback_ideep.cc
+++ b/caffe2/ideep/operators/operator_fallback_ideep.cc
@@ -15,6 +15,8 @@
 #include <caffe2/operators/given_tensor_fill_op.h>
 #include <caffe2/operators/load_save_op.h>
 #include <caffe2/operators/loss_op.h>
+#include <caffe2/operators/pad_op.h>
+#include <caffe2/operators/prelu_op.h>
 #include <caffe2/operators/reshape_op.h>
 #include <caffe2/operators/roi_align_op.h>
 #include <caffe2/operators/softmax_op.h>
@@ -94,9 +96,16 @@
     IDEEPFallbackOp<CollectAndDistributeFpnRpnProposalsOp<CPUContext>>);
 REGISTER_IDEEP_OPERATOR(
     BoxWithNMSLimit,
-    IDEEPFallbackOp<BoxWithNMSLimitOp<CPUContext>>);
+    IDEEPFallbackOp<BoxWithNMSLimitOp<CPUContext>, SkipIndices<0,1,2>>);
 REGISTER_IDEEP_OPERATOR(
     BBoxTransform,
     IDEEPFallbackOp<BBoxTransformOp<float, CPUContext>>);
 
+REGISTER_IDEEP_OPERATOR(
+    PadImage,
+    IDEEPFallbackOp<PadImageOp<float, CPUContext>>);
+REGISTER_IDEEP_OPERATOR(
+    PRelu,
+    IDEEPFallbackOp<PReluOp<float, CPUContext>>);
+
 } // namespace caffe2
diff --git a/caffe2/ideep/operators/operator_fallback_ideep.h b/caffe2/ideep/operators/operator_fallback_ideep.h
index 6c428e7..97bc8d1 100644
--- a/caffe2/ideep/operators/operator_fallback_ideep.h
+++ b/caffe2/ideep/operators/operator_fallback_ideep.h
@@ -52,11 +52,14 @@
     // Create output blobs in parent workspace,
     // then forward output blobs to local workspace.
     std::unordered_map<string, string> forwarded_output_blobs;
-    for (const string& name : base_def_.output()) {
-      string parent_name(name + "_cpu_output_blob_" + base_def_.type());
+    for (int i = 0; i < base_def_.output_size(); i++) {
+      string parent_name(base_def_.output(i));
+      if (!SkipOutputCopy::Contains(i)) {
+        parent_name += "_cpu_output_blob_" + base_def_.type();
+      }
       local_output_blobs_.push_back(ws->CreateBlob(parent_name));
       CHECK_NOTNULL(local_output_blobs_.back());
-      forwarded_output_blobs[name] = parent_name;
+      forwarded_output_blobs[base_def_.output(i)] = parent_name;
     }
     local_ws_.reset(new Workspace(ws, forwarded_output_blobs));
     // Set up the symbols for the local workspace.
diff --git a/caffe2/operators/stylizer_ops.cc b/caffe2/operators/stylizer_ops.cc
index 80dcaaa..ca4a762 100644
--- a/caffe2/operators/stylizer_ops.cc
+++ b/caffe2/operators/stylizer_ops.cc
@@ -2,6 +2,11 @@
 #include "caffe2/utils/cpu_neon.h"
 #include "caffe2/utils/math.h"
 
+#ifdef CAFFE2_USE_IDEEP
+#include <caffe2/ideep/operators/operator_fallback_ideep.h>
+#include <caffe2/ideep/utils/ideep_operator.h>
+#endif
+
 namespace caffe2 {
 
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
@@ -580,5 +585,14 @@
 OPERATOR_SCHEMA(BRGNCHWCToPackedInt8BGRAStylizerDeprocess)
     .NumInputs(2)
     .NumOutputs(1);
+
+#ifdef CAFFE2_USE_IDEEP
+REGISTER_IDEEP_OPERATOR(
+    BRGNCHWCToPackedInt8BGRAStylizerDeprocess,
+    IDEEPFallbackOp<BRGNCHWCToPackedInt8BGRAStylizerDeprocessOp, SkipIndices<0>>);
+REGISTER_IDEEP_OPERATOR(
+    PackedInt8BGRANHWCToNCHWCStylizerPreprocess,
+    IDEEPFallbackOp<PackedInt8BGRANHWCToNCHWCStylizerPreprocessOp>);
+#endif
 } // namespace
 } // namespace caffe2