Vectorize SigmoidOp on CPU
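Summary: I noticed that Sigmoid was taking an inordinate amount of time in our NMT benchmark, so I looked at the implementation and found a plain scalar loop calling exp() once per element. I replaced the forward and gradient loops with Eigen array expressions so that, once the Eigen update goes through, we will get proper AVX/AVX2 vectorization. Also added a test_sigmoid case to elementwise_ops_test.py that checks the operator against a NumPy reference and runs a gradient check.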
Differential Revision: D5082464
fbshipit-source-id: aa951f7d730fc05198f7dd04076ec58d471b74c8
diff --git a/caffe2/operators/sigmoid_op.cc b/caffe2/operators/sigmoid_op.cc
index 64deb93..2734f00 100644
--- a/caffe2/operators/sigmoid_op.cc
+++ b/caffe2/operators/sigmoid_op.cc
@@ -1,6 +1,5 @@
-#include <cmath>
-
#include "caffe2/operators/elementwise_op.h"
+#include "caffe2/utils/math.h"
namespace caffe2 {
@@ -8,9 +7,8 @@
template <typename T>
inline void operator()(const int n, const T* x,
T* y, CPUContext* device_context) {
- for (int i = 0; i < n; ++i) {
- y[i] = 1. / (1. + exp(-x[i]));
- }
+ ConstEigenVectorArrayMap<T> xM(x, n);
+ EigenVectorArrayMap<T>(y, n) = 1. / (1. + (-xM).exp());
}
};
@@ -18,9 +16,8 @@
template <typename T>
inline void
Run(const int n, const T* y, const T* dy, T* dx, CPUContext* device_context) {
- for (int i = 0; i < n; ++i) {
- dx[i] = dy[i] * y[i] * (1. - y[i]);
- }
+ ConstEigenVectorArrayMap<T> yM(y, n), dyM(dy, n);
+ EigenVectorArrayMap<T>(dx, n) = dyM * yM * (1. - yM);
}
};
diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py
index 8184265..d5d5c2b 100644
--- a/caffe2/python/operator_test/elementwise_ops_test.py
+++ b/caffe2/python/operator_test/elementwise_ops_test.py
@@ -82,3 +82,26 @@
self.assertGradientChecks(
gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2)
+
+ @given(n=st.integers(5, 6), m=st.integers(4, 6), **hu.gcs)
+ def test_sigmoid(self, n, m, gc, dc):
+ X = np.random.rand(n, m).astype(np.float32)
+
+ def sigmoid(X):
+ return [1. / (1. + np.exp(-X))]
+
+ op = core.CreateOperator(
+ "Sigmoid",
+ ["X"],
+ ["Z"]
+ )
+
+ self.assertReferenceChecks(
+ device_option=gc,
+ op=op,
+ inputs=[X],
+ reference=sigmoid,
+ )
+
+ self.assertGradientChecks(
+ gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2)