Vectorize SigmoidOp on CPU

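Summary: I noticed that Sigmoid was taking an inordinate amount of time in our NMT benchmark, so I looked at the implementation and it didn't seem optimal: both the forward and the gradient CPU functors were plain element-by-element scalar loops. I replaced both implementations with Eigen array expressions so that, once the Eigen update goes through, we will get proper AVX/AVX2 vectorization. This change also adds a Python test (test_sigmoid) that checks the operator against a NumPy reference and runs a gradient check.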

Differential Revision: D5082464

fbshipit-source-id: aa951f7d730fc05198f7dd04076ec58d471b74c8
diff --git a/caffe2/operators/sigmoid_op.cc b/caffe2/operators/sigmoid_op.cc
index 64deb93..2734f00 100644
--- a/caffe2/operators/sigmoid_op.cc
+++ b/caffe2/operators/sigmoid_op.cc
@@ -1,6 +1,5 @@
-#include <cmath>
-
 #include "caffe2/operators/elementwise_op.h"
+#include "caffe2/utils/math.h"
 
 namespace caffe2 {
 
@@ -8,9 +7,8 @@
   template <typename T>
   inline void operator()(const int n, const T* x,
                          T* y, CPUContext* device_context) {
-    for (int i = 0; i < n; ++i) {
-      y[i] = 1. / (1. + exp(-x[i]));
-    }
+    ConstEigenVectorArrayMap<T> xM(x, n);
+    EigenVectorArrayMap<T>(y, n) = 1. / (1. + (-xM).exp());
   }
 };
 
@@ -18,9 +16,8 @@
   template <typename T>
   inline void
   Run(const int n, const T* y, const T* dy, T* dx, CPUContext* device_context) {
-    for (int i = 0; i < n; ++i) {
-      dx[i] = dy[i] * y[i] * (1. - y[i]);
-    }
+    ConstEigenVectorArrayMap<T> yM(y, n), dyM(dy, n);
+    EigenVectorArrayMap<T>(dx, n) = dyM * yM * (1. - yM);
   }
 };
 
diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py
index 8184265..d5d5c2b 100644
--- a/caffe2/python/operator_test/elementwise_ops_test.py
+++ b/caffe2/python/operator_test/elementwise_ops_test.py
@@ -82,3 +82,26 @@
 
         self.assertGradientChecks(
             gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2)
+
+    @given(n=st.integers(5, 6), m=st.integers(4, 6), **hu.gcs)
+    def test_sigmoid(self, n, m, gc, dc):
+        X = np.random.rand(n, m).astype(np.float32)
+
+        def sigmoid(X):
+            return [1. / (1. + np.exp(-X))]
+
+        op = core.CreateOperator(
+            "Sigmoid",
+            ["X"],
+            ["Z"]
+        )
+
+        self.assertReferenceChecks(
+            device_option=gc,
+            op=op,
+            inputs=[X],
+            reference=sigmoid,
+        )
+
+        self.assertGradientChecks(
+            gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2)