use CUDA version of AccuracyOp with top_k=1

Summary: D4348953 added support for accuracy with top_k>1, which is only supported on CPU, so on GPU the inputs have to be copied from CUDA to the host. But that diff did not take into account that we have a top_k=1 version of AccuracyOp for CUDA. This diff ensures we use the CUDA version for top_k=1.

Differential Revision: D4607767

fbshipit-source-id: 8becda23890343043eb79ad04e4c6196e9010f0c
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py
index 5ae7a7a..31ec3e9 100644
--- a/caffe2/python/cnn.py
+++ b/caffe2/python/cnn.py
@@ -636,20 +636,41 @@
         return self.net.Iter(blob_out, blob_out, **kwargs)
 
     def Accuracy(self, blob_in, blob_out, **kwargs):
-        dev = kwargs['device_option'] if 'device_option' in kwargs else scope.CurrentDeviceScope()
+        """Add an Accuracy operator to the net.
+
+        The CUDA AccuracyOp only handles top_k=1, so when the target device
+        is not CPU and top_k > 1 the inputs are copied to the host and the
+        CPU operator is used instead. In every other case the operator is
+        added directly, with all keyword arguments forwarded.
+
+        blob_in is indexed as [prediction, label]; kwargs are passed through
+        to the operator (e.g. top_k, device_option).
+        """
+        dev = kwargs['device_option'] if 'device_option' in kwargs \
+            else scope.CurrentDeviceScope()
+        is_cpu = dev is None or dev.device_type == caffe2_pb2.CPU
 
-        blobs_in_dev = []
-        # if device_option is CPU (or None, so assumed to be CPU), nothing needs to be done
-        if dev == None or dev.device_type == caffe2_pb2.CPU:
-            blobs_in_dev = blob_in
+        # We support top_k > 1 only on CPU
+        if not is_cpu and 'top_k' in kwargs and kwargs['top_k'] > 1:
+            pred_host = self.net.CopyGPUToCPU(blob_in[0], blob_in[0] + "_host")
+            label_host = self.net.CopyGPUToCPU(blob_in[1], blob_in[1] + "_host")
+
+            # Drop any caller-supplied device_option: the host op below sets
+            # its own, and passing the keyword twice raises TypeError.
+            host_kwargs = {
+                k: v for k, v in kwargs.items() if k != 'device_option'
+            }
+
+            # Now use the Host version of the accuracy op
+            return self.net.Accuracy(
+                [pred_host, label_host],
+                blob_out,
+                device_option=core.DeviceOption(caffe2_pb2.CPU, 0),
+                **host_kwargs
+            )
         else:
-            # Otherwise insert copy operators
-            pred_host = self.net.CopyGPUToCPU(blob_in[0], blob_in[0]+"_host")
-            label_host = self.net.CopyGPUToCPU(blob_in[1], blob_in[1]+"_host")
-            blobs_in_dev = [pred_host, label_host]
-
-        # Now use the Host version of the accuracy op
-        self.net.Accuracy(blobs_in_dev, blob_out, device_option=core.DeviceOption(caffe2_pb2.CPU, 0), **kwargs)
+            # Forward kwargs so top_k (on CPU) and device_option are honoured
+            return self.net.Accuracy(blob_in, blob_out, **kwargs)
 
     def PadImage(
         self, blob_in, blob_out, **kwargs