#include "caffe2/utils/math/elementwise.h"

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>

#ifdef CAFFE2_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif // CAFFE2_USE_ACCELERATE

#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL

#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
| |
| namespace caffe2 { |
| namespace math { |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // MKL VML alternatives. |
| // Depending on whether we are using MKL, we will delegate the Caffe2 math |
| // functions that are VML-related to either the VML call or the Eigen |
| // implementation. If you are setting the flags (such as AVX) right for your CPU |
| // architecture, usually Eigen will deliver a throughput as fast as the VML |
| // functions. |
| //////////////////////////////////////////////////////////////////////////////// |
| #ifdef CAFFE2_USE_MKL |
| |
| #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, MKLFunc, ...) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| MKLFunc(N, X, Y, ##__VA_ARGS__); \ |
| } |
| DELEGATE_SIMPLE_UNARY_FUNCTION( |
| float, |
| Exp, |
| vmsExp, |
| VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE) |
| DELEGATE_SIMPLE_UNARY_FUNCTION( |
| double, |
| Exp, |
| vmdExp, |
| VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, vsLn) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, vdLn) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log1p, vsLog1p) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log1p, vdLog1p) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, vsSin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, vdSin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Asin, vsAsin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Asin, vdAsin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, vsCos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, vdCos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Acos, vsAcos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Acos, vdAcos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tan, vsTan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tan, vdTan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Atan, vsAtan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Atan, vdAtan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sinh, vsSinh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sinh, vdSinh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cosh, vsCosh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cosh, vdCosh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, vsAbs) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Abs, vdAbs) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, vsSqr) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, vdSqr) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, vsSqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, vdSqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Rsqrt, vsInvSqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, vdInvSqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cbrt, vsCbrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cbrt, vdCbrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Inv, vsInv) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, vdInv) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Erf, vsErf) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Erf, vdErf) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, CdfNorm, vsCdfNorm) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, CdfNorm, vdCdfNorm) |
| #undef DELEGATE_SIMPLE_UNARY_FUNCTION |
| |
| #define DELEGATE_SINCOS(T, MKLFunc) \ |
| template <> \ |
| C10_EXPORT void SinCos<T, CPUContext>( \ |
| const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \ |
| MKLFunc(N, X, S, C); \ |
| } |
| DELEGATE_SINCOS(float, vsSinCos) |
| DELEGATE_SINCOS(double, vdSinCos) |
| #undef DELEGATE_SINCOS |
| |
| #define DELEGATE_POWX(T, MKLFunc) \ |
| template <> \ |
| C10_EXPORT void Powx<T, CPUContext>( \ |
| const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \ |
| MKLFunc(N, A, b, Y); \ |
| } |
| DELEGATE_POWX(float, vsPowx) |
| DELEGATE_POWX(double, vdPowx) |
| #undef DELEGATE_POWX |
| |
| #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, MKLFunc) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \ |
| MKLFunc(N, A, B, C); \ |
| } |
| DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) |
| #undef DELEGATE_SIMPLE_BINARY_FUNCTION |
| |
| #define DELEGATE_AXPBY(TAlpha, TData, MKLFunc) \ |
| template <> \ |
| C10_EXPORT void Axpby<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha alpha, \ |
| const TData* X, \ |
| const TAlpha beta, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| MKLFunc( \ |
| N, static_cast<TData>(alpha), X, 1, static_cast<TData>(beta), Y, 1); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpby<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha* alpha, \ |
| const TData* X, \ |
| const TAlpha* beta, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| MKLFunc( \ |
| N, static_cast<TData>(*alpha), X, 1, static_cast<TData>(*beta), Y, 1); \ |
| } |
| DELEGATE_AXPBY(float, float, cblas_saxpby) |
| #undef DELEGATE_AXPBY |
| |
| #else // CAFFE2_USE_MKL |
| |
| #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| EigenVectorArrayMap<T>(Y, N) = \ |
| ConstEigenVectorArrayMap<T>(X, N).EigenFunc(); \ |
| } |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, log) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log1p, log1p) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log1p, log1p) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, sin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, sin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Asin, asin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Asin, asin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, cos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, cos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Acos, acos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Acos, acos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tan, tan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tan, tan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Atan, atan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Atan, atan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, abs) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Abs, abs) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, square) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, sqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, sqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Rsqrt, rsqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, rsqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Inv, inverse) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, inverse) |
| #undef DELEGATE_SIMPLE_UNARY_FUNCTION |
| |
| #define CAFFE2_SPECIALIZED_SINH(T) \ |
| template <> \ |
| C10_EXPORT void Sinh<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| ConstEigenVectorArrayMap<T> X_arr(X, N); \ |
| EigenVectorArrayMap<T>(Y, N) = (X_arr.exp() - (-X_arr).exp()) / T(2); \ |
| } |
| CAFFE2_SPECIALIZED_SINH(float) |
| CAFFE2_SPECIALIZED_SINH(double) |
| #undef CAFFE2_SPECIALIZED_SINH |
| |
| #define CAFFE2_SPECIALIZED_COSH(T) \ |
| template <> \ |
| C10_EXPORT void Cosh<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| ConstEigenVectorArrayMap<T> X_arr(X, N); \ |
| EigenVectorArrayMap<T>(Y, N) = (X_arr.exp() + (-X_arr).exp()) / T(2); \ |
| } |
| CAFFE2_SPECIALIZED_COSH(float) |
| CAFFE2_SPECIALIZED_COSH(double) |
| #undef CAFFE2_SPECIALIZED_COSH |
| |
| #define CAFFE2_SPECIALIZED_SINCOS(T) \ |
| template <> \ |
| C10_EXPORT void SinCos<T, CPUContext>( \ |
| const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \ |
| EigenVectorArrayMap<T>(S, N) = ConstEigenVectorArrayMap<T>(X, N).sin(); \ |
| EigenVectorArrayMap<T>(C, N) = ConstEigenVectorArrayMap<T>(X, N).cos(); \ |
| } |
| CAFFE2_SPECIALIZED_SINCOS(float) |
| CAFFE2_SPECIALIZED_SINCOS(double) |
| #undef CAFFE2_SPECIALIZED_SINCOS |
| |
| #define CAFFE2_SPECIALIZED_POWX(T) \ |
| template <> \ |
| C10_EXPORT void Powx<T, CPUContext>( \ |
| const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \ |
| EigenVectorArrayMap<T>(Y, N) = ConstEigenVectorArrayMap<T>(A, N).pow(b); \ |
| } |
| CAFFE2_SPECIALIZED_POWX(float) |
| CAFFE2_SPECIALIZED_POWX(double) |
| #undef CAFFE2_SPECIALIZED_POWX |
| |
| #define CAFFE2_SPECIALIZED_CBRT(T) \ |
| template <> \ |
| C10_EXPORT void Cbrt<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| std::transform(X, X + N, Y, [](const T x) { return cbrt(x); }); \ |
| } |
| CAFFE2_SPECIALIZED_CBRT(float) |
| CAFFE2_SPECIALIZED_CBRT(double) |
| #undef CAFFE2_SPECIALIZED_CBRT |
| |
| #define CAFFE2_SPECIALIZED_ERF(T) \ |
| template <> \ |
| C10_EXPORT void Erf<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| std::transform(X, X + N, Y, [](const T x) { return erf(x); }); \ |
| } |
| CAFFE2_SPECIALIZED_ERF(float) |
| CAFFE2_SPECIALIZED_ERF(double) |
| #undef CAFFE2_SPECIALIZED_ERF |
| |
| #define CAFFE2_SPECIALIZED_CDF_NORM(T) \ |
| template <> \ |
| C10_EXPORT void CdfNorm<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| std::transform(X, X + N, Y, [](const T x) { \ |
| constexpr T kRsqrt2 = 0.7071067811865475; \ |
| return (T(1) + erf(x * kRsqrt2)) * static_cast<T>(0.5); \ |
| }); \ |
| } |
| CAFFE2_SPECIALIZED_CDF_NORM(float) |
| CAFFE2_SPECIALIZED_CDF_NORM(double) |
| #undef CAFFE2_SPECIALIZED_CDF_NORM |
| |
| #define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \ |
| EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \ |
| EigenOp ConstEigenVectorArrayMap<T>(B, N); \ |
| } |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Add, +) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Add, +) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Sub, -) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Sub, -) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Mul, *) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Mul, *) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(float, Div, /) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(double, Div, /) |
| #undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR |
| |
| #define CAFFE2_SPECIALIZED_AXPBY(TAlpha, TData) \ |
| template <> \ |
| C10_EXPORT void Axpby<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha alpha, \ |
| const TData* X, \ |
| const TAlpha beta, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| EigenVectorArrayMap<TData> Y_arr(Y, N); \ |
| Y_arr = Y_arr * static_cast<TData>(beta) + \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpby<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha* alpha, \ |
| const TData* X, \ |
| const TAlpha* beta, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| EigenVectorArrayMap<TData> Y_arr(Y, N); \ |
| Y_arr = Y_arr * static_cast<TData>(*beta) + \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \ |
| } |
| CAFFE2_SPECIALIZED_AXPBY(float, float) |
| #undef CAFFE2_SPECIALIZED_AXPBY |
| |
| #endif // CAFFE2_USE_MKL |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // BLAS alternatives. |
| // Depending on whether we have specified an external BLAS library or not, we |
| // will delegate the Caffe math functions that are BLAS-related to either the |
| // CBLAS call or the Eigen implementation. |
| //////////////////////////////////////////////////////////////////////////////// |
| #ifdef CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| #define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| if (X == Y) { \ |
| EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha); \ |
| } else { \ |
| EigenVectorArrayMap<TData>(Y, N) = \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha* alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| if (X == Y) { \ |
| EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha); \ |
| } else { \ |
| EigenVectorArrayMap<TData>(Y, N) = \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_SCALE(float, float) |
| CAFFE2_SPECIALIZED_SCALE(double, double) |
| CAFFE2_SPECIALIZED_SCALE(float, double) |
| #undef CAFFE2_SPECIALIZED_SCALE |
| |
| #define CAFFE2_SPECIALIZED_AXPY(TAlpha, TData) \ |
| template <> \ |
| C10_EXPORT void Axpy<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| EigenVectorArrayMap<TData>(Y, N) += \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpy<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha* alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| EigenVectorArrayMap<TData>(Y, N) += \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \ |
| } |
| CAFFE2_SPECIALIZED_AXPY(float, float) |
| CAFFE2_SPECIALIZED_AXPY(float, double) |
| #undef CAFFE2_SPECIALIZED_AXPY |
| |
| #else // CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| #ifdef CAFFE2_USE_MKL |
| |
| #define DELEGATE_SCALE(TAlpha, TData, MKLFunc1, MKLFunc2) \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| const int max_int = std::numeric_limits<int32_t>::max(); \ |
| int batch = N / max_int; \ |
| int remainder = N % max_int; \ |
| std::int64_t offset = 0; \ |
| for (int i = 0; i < batch; i ++) { \ |
| if (Y == X) { \ |
| MKLFunc1(max_int, static_cast<TData>(alpha), Y + offset, 1); \ |
| } else { \ |
| MKLFunc2(max_int, static_cast<TData>(alpha), X + offset, 1, TData(0), Y + offset, 1); \ |
| } \ |
| offset += max_int; \ |
| } \ |
| if (remainder != 0) { \ |
| if (Y == X) { \ |
| MKLFunc1(remainder, static_cast<TData>(alpha), Y + offset, 1); \ |
| } else { \ |
| MKLFunc2(remainder, static_cast<TData>(alpha), X + offset, 1, TData(0), Y + offset, 1); \ |
| } \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha* alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| const int max_int = std::numeric_limits<int32_t>::max(); \ |
| int batch = N / max_int; \ |
| int remainder = N % max_int; \ |
| std::int64_t offset = 0; \ |
| for (int i = 0; i < batch; i ++) { \ |
| if (Y == X) { \ |
| MKLFunc1(max_int, static_cast<TData>(*alpha), Y + offset, 1); \ |
| } else { \ |
| MKLFunc2(max_int, static_cast<TData>(*alpha), X + offset, 1, TData(0), Y + offset, 1); \ |
| } \ |
| offset += max_int; \ |
| } \ |
| if (remainder != 0) { \ |
| if (Y == X) { \ |
| MKLFunc1(remainder, static_cast<TData>(*alpha), Y + offset, 1); \ |
| } else { \ |
| MKLFunc2(remainder, static_cast<TData>(*alpha), X + offset, 1, TData(0), Y + offset, 1); \ |
| } \ |
| } \ |
| } |
| DELEGATE_SCALE(float, float, cblas_sscal, cblas_saxpby) |
| DELEGATE_SCALE(double, double, cblas_dscal, cblas_daxpby) |
| DELEGATE_SCALE(float, double, cblas_dscal, cblas_daxpby) |
| #undef DELEGATE_SCALE |
| |
| #else // CAFFE2_USE_MKL |
| |
| #define DELEGATE_SCALE(TAlpha, TData, BLASFunc) \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| if (Y == X) { \ |
| BLASFunc(N, static_cast<TData>(alpha), Y, 1); \ |
| } else { \ |
| EigenVectorArrayMap<TData>(Y, N) = \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha* alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| if (Y == X) { \ |
| BLASFunc(N, static_cast<TData>(*alpha), Y, 1); \ |
| } else { \ |
| EigenVectorArrayMap<TData>(Y, N) = \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \ |
| } \ |
| } |
| DELEGATE_SCALE(float, float, cblas_sscal) |
| DELEGATE_SCALE(double, double, cblas_dscal) |
| DELEGATE_SCALE(float, double, cblas_dscal) |
| #undef DELEGATE_SCALE |
| |
| #endif // CAFFE2_USE_MKL |
| |
| #define DELEGATE_AXPY(TAlpha, TData, BLASFunc) \ |
| template <> \ |
| C10_EXPORT void Axpy<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| BLASFunc(N, static_cast<TData>(alpha), X, 1, Y, 1); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpy<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha* alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| BLASFunc(N, static_cast<TData>(*alpha), X, 1, Y, 1); \ |
| } |
| DELEGATE_AXPY(float, float, cblas_saxpy) |
| DELEGATE_AXPY(float, double, cblas_daxpy) |
| #undef DELEGATE_AXPY |
| |
| #endif // CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // Common math functions being used in Caffe that do not have a BLAS or MKL |
| // equivalent. For all these functions, we will simply implement them either via |
| // Eigen or via custom code. |
| //////////////////////////////////////////////////////////////////////////////// |
| |
| #define CAFFE2_SPECIALIZED_SET(T) \ |
| template <> \ |
| C10_EXPORT void Set<T, CPUContext>( \ |
| const std::int64_t N, const T alpha, T* Y, CPUContext* /* context */) { \ |
| if (N == 0) { \ |
| return; \ |
| } \ |
| if (alpha == T(0)) { \ |
| std::memset(Y, 0, N * sizeof(T)); \ |
| } else { \ |
| EigenVectorArrayMap<T>(Y, N).setConstant(alpha); \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_SET(float) |
| CAFFE2_SPECIALIZED_SET(double) |
| CAFFE2_SPECIALIZED_SET(int) |
| CAFFE2_SPECIALIZED_SET(std::int8_t) |
| CAFFE2_SPECIALIZED_SET(std::int16_t) |
| CAFFE2_SPECIALIZED_SET(std::int64_t) |
| CAFFE2_SPECIALIZED_SET(bool) |
| CAFFE2_SPECIALIZED_SET(char) |
| CAFFE2_SPECIALIZED_SET(std::uint8_t) |
| CAFFE2_SPECIALIZED_SET(std::uint16_t) |
| #undef CAFFE2_SPECIALIZED_SET |
| |
| #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| EigenVectorArrayMap<T>(Y, N) = \ |
| ConstEigenVectorArrayMap<T>(X, N).EigenFunc(); \ |
| } |
| // Eigen's Tanh implementation is faster than MKL, so use Eigen here. |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tanh, tanh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tanh, tanh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(std::int32_t, Sign, sign) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(std::int64_t, Sign, sign) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sign, sign) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sign, sign) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(std::int32_t, Abs, abs) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(std::int64_t, Abs, abs) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(std::int32_t, Cube, cube) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(std::int64_t, Cube, cube) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cube, cube) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cube, cube) |
| #undef DELEGATE_SIMPLE_UNARY_FUNCTION |
| |
| #define CAFFE2_SPECIALIZED_NEG(T) \ |
| template <> \ |
| C10_EXPORT void Neg<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext* /* context */) { \ |
| EigenVectorArrayMap<T>(Y, N) = -ConstEigenVectorArrayMap<T>(X, N); \ |
| } |
| CAFFE2_SPECIALIZED_NEG(std::int32_t) |
| CAFFE2_SPECIALIZED_NEG(std::int64_t) |
| CAFFE2_SPECIALIZED_NEG(float) |
| CAFFE2_SPECIALIZED_NEG(double) |
| #undef CAFFE2_SPECIALIZED_NEG |
| |
| #define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| if (X == Y) { \ |
| EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(alpha); \ |
| } else { \ |
| EigenVectorArrayMap<TData>(Y, N) = \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha); \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const std::int64_t N, \ |
| const TAlpha* alpha, \ |
| const TData* X, \ |
| TData* Y, \ |
| CPUContext* /* context */) { \ |
| if (X == Y) { \ |
| EigenVectorArrayMap<TData>(Y, N) *= static_cast<TData>(*alpha); \ |
| } else { \ |
| EigenVectorArrayMap<TData>(Y, N) = \ |
| ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t) |
| CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t) |
| #undef CAFFE2_SPECIALIZED_SCALE |
| |
| #define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \ |
| EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \ |
| EigenOp ConstEigenVectorArrayMap<T>(B, N); \ |
| } |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, Add, +) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Add, +) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, Sub, -) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Sub, -) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, Mul, *) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Mul, *) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, Div, /) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, Div, /) |
| #undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_OPERATOR |
| |
| #define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(T, Func, EigenFunc) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \ |
| EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N).EigenFunc( \ |
| ConstEigenVectorArrayMap<T>(B, N)); \ |
| } |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Min, min) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Min, min) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Min, min) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Min, min) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int32_t, Max, max) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(std::int64_t, Max, max) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(float, Max, max) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION(double, Max, max) |
| #undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_EIGEN_FUNCTION |
| |
| #define DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(T, Func, StdFunc) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \ |
| std::transform(A, A + N, B, C, StdFunc); \ |
| } |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| bool, |
| And, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::logical_and<bool>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| bool, |
| Or, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::logical_or<bool>()) |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION(bool, Xor, std::bit_xor<bool>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| bool, |
| BitwiseAnd, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_and<bool>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| std::int32_t, |
| BitwiseAnd, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_and<std::int32_t>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| std::int64_t, |
| BitwiseAnd, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_and<std::int64_t>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| bool, |
| BitwiseOr, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_or<bool>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| std::int32_t, |
| BitwiseOr, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_or<std::int32_t>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| std::int64_t, |
| BitwiseOr, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_or<std::int64_t>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| bool, |
| BitwiseXor, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_xor<bool>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| std::int32_t, |
| BitwiseXor, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_xor<std::int32_t>()) |
| DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION( |
| std::int64_t, |
| BitwiseXor, |
| // NOLINTNEXTLINE(modernize-use-transparent-functors) |
| std::bit_xor<std::int64_t>()) |
| #undef DELEGATE_SIMPLE_BINARY_FUNCTION_BY_STD_FUNCTION |
| |
| #define DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(T, Func, EigenOp) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, \ |
| const T* A, \ |
| const T* B, \ |
| bool* C, \ |
| CPUContext* /* context */) { \ |
| EigenVectorArrayMap<bool>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \ |
| EigenOp ConstEigenVectorArrayMap<T>(B, N); \ |
| } |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, EQ, ==) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, EQ, ==) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, EQ, ==) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, EQ, ==) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, EQ, ==) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, NE, !=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, NE, !=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, NE, !=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, NE, !=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, NE, !=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, LT, <) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, LT, <) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, LT, <) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, LT, <) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, LT, <) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, LE, <=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, LE, <=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, LE, <=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, LE, <=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, LE, <=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, GT, >) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, GT, >) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, GT, >) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, GT, >) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, GT, >) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(bool, GE, >=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int32_t, GE, >=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(std::int64_t, GE, >=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(float, GE, >=) |
| DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR(double, GE, >=) |
| #undef DELEGATE_SIMPLE_COMPARE_FUNCTION_BY_EIGEN_OPERATOR |
| |
| } // namespace math |
| } // namespace caffe2 |