blob: e33b73d1c4f9686604032e0eb8603274c1adfe88 [file] [log] [blame]
#include "caffe2/utils/math/elementwise.h"
#include <algorithm>
#include <cmath>
#include <functional>
#include <limits>
#ifdef CAFFE2_USE_ACCELERATE
#include <Accelerate/Accelerate.h>
#endif // CAFFE2_USE_ACCELERATE
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
#include "caffe2/core/context.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace math {
////////////////////////////////////////////////////////////////////////////////
// MKL VML alternatives.
// Depending on whether MKL is enabled, the VML-related Caffe2 math functions
// are delegated either to the corresponding VML call or to an Eigen
// implementation. If the compiler flags (such as AVX) are set correctly for
// your CPU architecture, Eigen usually delivers throughput comparable to the
// VML functions.
////////////////////////////////////////////////////////////////////////////////
#ifdef CAFFE2_USE_MKL
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, MKLFunc, ...) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int N, const T* X, T* Y, CPUContext* /* context */) { \
MKLFunc(N, X, Y, ##__VA_ARGS__); \
}
DELEGATE_SIMPLE_UNARY_FUNCTION(
float,
Exp,
vmsExp,
VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
DELEGATE_SIMPLE_UNARY_FUNCTION(
double,
Exp,
vmdExp,
VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, vsLn)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, vdLn)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log1p, vsLog1p)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log1p, vdLog1p)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, vsSin)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, vdSin)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Asin, vsAsin)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Asin, vdAsin)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, vsCos)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, vdCos)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Acos, vsAcos)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Acos, vdAcos)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tan, vsTan)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tan, vdTan)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Atan, vsAtan)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Atan, vdAtan)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sinh, vsSinh)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sinh, vdSinh)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cosh, vsCosh)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cosh, vdCosh)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, vsAbs)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Abs, vdAbs)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, vsSqr)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, vdSqr)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, vsSqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, vdSqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Rsqrt, vsInvSqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, vdInvSqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cbrt, vsCbrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cbrt, vdCbrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Inv, vsInv)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, vdInv)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Erf, vsErf)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Erf, vdErf)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, CdfNorm, vsCdfNorm)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, CdfNorm, vdCdfNorm)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
// Specializes SinCos<T, CPUContext> by forwarding to the VML routine VmlFn,
// which receives the input X and the two outputs S and C in one call.
// The context argument is unused.
#define CAFFE2_MKL_SINCOS(T, VmlFn)                                     \
  template <>                                                           \
  C10_EXPORT void SinCos<T, CPUContext>(                                \
      const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \
    VmlFn(N, X, S, C);                                                  \
  }
CAFFE2_MKL_SINCOS(float, vsSinCos)
CAFFE2_MKL_SINCOS(double, vdSinCos)
#undef CAFFE2_MKL_SINCOS
// Specializes Powx<T, CPUContext> by forwarding the input array A, the scalar
// exponent b and the output Y to the VML routine VmlFn. The context argument
// is unused.
#define CAFFE2_MKL_POWX(T, VmlFn)                                            \
  template <>                                                                \
  C10_EXPORT void Powx<T, CPUContext>(                                       \
      const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \
    VmlFn(N, A, b, Y);                                                       \
  }
CAFFE2_MKL_POWX(float, vsPowx)
CAFFE2_MKL_POWX(double, vdPowx)
#undef CAFFE2_MKL_POWX
// MKL-backed two-input elementwise kernels.
//
// CAFFE2_MKL_BINARY specializes Func<T, CPUContext> so that it passes
// (N, A, B, C) directly to the VML routine VmlFn. The context argument is
// unused.
#define CAFFE2_MKL_BINARY(T, Func, VmlFn)                                     \
  template <>                                                                 \
  C10_EXPORT void Func<T, CPUContext>(                                        \
      const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
    VmlFn(N, A, B, C);                                                        \
  }
// Emits the float and double specializations of Func, backed by the VML
// routines vs<VmlName> and vd<VmlName> respectively.
#define CAFFE2_MKL_BINARY_FP(Func, VmlName)   \
  CAFFE2_MKL_BINARY(float, Func, vs##VmlName) \
  CAFFE2_MKL_BINARY(double, Func, vd##VmlName)
CAFFE2_MKL_BINARY_FP(Add, Add)
CAFFE2_MKL_BINARY_FP(Sub, Sub)
CAFFE2_MKL_BINARY_FP(Mul, Mul)
CAFFE2_MKL_BINARY_FP(Div, Div)
#undef CAFFE2_MKL_BINARY_FP
#undef CAFFE2_MKL_BINARY
// Specializes Axpby<TAlpha, TData, CPUContext> (both the by-value and the
// by-pointer alpha/beta overloads) on top of the BLAS-style routine MKLFunc,
// which is called with unit strides on X and Y.
//
// N is 64-bit but is handed to MKLFunc as its element count. To avoid
// silently truncating large N when that count is int-sized, the range is
// processed in pieces of at most std::numeric_limits<int>::max() elements.
// The operation is elementwise, so splitting does not change the result.
// The context argument is unused.
#define DELEGATE_AXPBY(TAlpha, TData, MKLFunc)                             \
  template <>                                                              \
  C10_EXPORT void Axpby<TAlpha, TData, CPUContext>(                        \
      const std::int64_t N,                                                \
      const TAlpha alpha,                                                  \
      const TData* X,                                                      \
      const TAlpha beta,                                                   \
      TData* Y,                                                            \
      CPUContext* /* context */) {                                         \
    constexpr std::int64_t kMaxChunk = std::numeric_limits<int>::max();    \
    std::int64_t done = 0;                                                 \
    while (done < N) {                                                     \
      const int len =                                                      \
          static_cast<int>(std::min<std::int64_t>(N - done, kMaxChunk));   \
      MKLFunc(                                                             \
          len,                                                             \
          static_cast<TData>(alpha),                                       \
          X + done,                                                        \
          1,                                                               \
          static_cast<TData>(beta),                                        \
          Y + done,                                                        \
          1);                                                              \
      done += len;                                                         \
    }                                                                      \
  }                                                                        \
  template <>                                                              \
  C10_EXPORT void Axpby<TAlpha, TData, CPUContext>(                        \
      const std::int64_t N,                                                \
      const TAlpha* alpha,                                                 \
      const TData* X,                                                      \
      const TAlpha* beta,                                                  \
      TData* Y,                                                            \
      CPUContext* /* context */) {                                         \
    constexpr std::int64_t kMaxChunk = std::numeric_limits<int>::max();    \
    std::int64_t done = 0;                                                 \
    while (done < N) {                                                     \
      const int len =                                                      \
          static_cast<int>(std::min<std::int64_t>(N - done, kMaxChunk));   \
      MKLFunc(                                                             \
          len,                                                             \
          static_cast<TData>(*alpha),                                      \
          X + done,                                                        \
          1,                                                               \
          static_cast<TData>(*beta),                                       \
          Y + done,                                                        \
          1);                                                              \
      done += len;                                                         \
    }                                                                      \
  }
DELEGATE_AXPBY(float, float, cblas_saxpby)
#undef DELEGATE_AXPBY
#else // CAFFE2_USE_MKL
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Func, EigenFunc) \
template <> \
C10_EXPORT void Func<T, CPUContext>( \
const int N, const T* X, T* Y, CPUContext* /* context */) { \
EigenVectorArrayMap<T>(Y, N) = \
ConstEigenVectorArrayMap<T>(X, N).EigenFunc(); \
}
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, log)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log1p, log1p)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log1p, log1p)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, sin)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, sin)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Asin, asin)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Asin, asin)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, cos)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, cos)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Acos, acos)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Acos, acos)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tan, tan)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tan, tan)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Atan, atan)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Atan, atan)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, abs)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Abs, abs)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, square)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, sqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, sqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Rsqrt, rsqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, rsqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Inv, inverse)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, inverse)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
// Specializes Sinh<T, CPUContext> with Eigen, evaluating each element as
// (exp(x) - exp(-x)) / 2. The context argument is unused.
#define CAFFE2_EIGEN_SINH(T)                                      \
  template <>                                                     \
  C10_EXPORT void Sinh<T, CPUContext>(                            \
      const int N, const T* X, T* Y, CPUContext* /* context */) { \
    ConstEigenVectorArrayMap<T> x_arr(X, N);                      \
    EigenVectorArrayMap<T> y_arr(Y, N);                           \
    y_arr = (x_arr.exp() - (-x_arr).exp()) / T(2);                \
  }
CAFFE2_EIGEN_SINH(float)
CAFFE2_EIGEN_SINH(double)
#undef CAFFE2_EIGEN_SINH
// Specializes Cosh<T, CPUContext> with Eigen, evaluating each element as
// (exp(x) + exp(-x)) / 2. The context argument is unused.
#define CAFFE2_EIGEN_COSH(T)                                      \
  template <>                                                     \
  C10_EXPORT void Cosh<T, CPUContext>(                            \
      const int N, const T* X, T* Y, CPUContext* /* context */) { \
    ConstEigenVectorArrayMap<T> x_arr(X, N);                      \
    EigenVectorArrayMap<T> y_arr(Y, N);                           \
    y_arr = (x_arr.exp() + (-x_arr).exp()) / T(2);                \
  }
CAFFE2_EIGEN_COSH(float)
CAFFE2_EIGEN_COSH(double)
#undef CAFFE2_EIGEN_COSH
// Specializes SinCos<T, CPUContext> with Eigen: S receives sin(X) first, then
// C receives cos(X), each computed in its own pass over X. The context
// argument is unused.
#define CAFFE2_EIGEN_SINCOS(T)                                          \
  template <>                                                           \
  C10_EXPORT void SinCos<T, CPUContext>(                                \
      const int N, const T* X, T* S, T* C, CPUContext* /* context */) { \
    EigenVectorArrayMap<T> sin_out(S, N);                               \
    sin_out = ConstEigenVectorArrayMap<T>(X, N).sin();                  \
    EigenVectorArrayMap<T> cos_out(C, N);                               \
    cos_out = ConstEigenVectorArrayMap<T>(X, N).cos();                  \
  }
CAFFE2_EIGEN_SINCOS(float)
CAFFE2_EIGEN_SINCOS(double)
#undef CAFFE2_EIGEN_SINCOS
// Specializes Powx<T, CPUContext> with Eigen: every element of A is raised to
// the scalar exponent b and stored in Y. The context argument is unused.
#define CAFFE2_EIGEN_POWX(T)                                                 \
  template <>                                                                \
  C10_EXPORT void Powx<T, CPUContext>(                                       \
      const int N, const T* A, const T b, T* Y, CPUContext* /* context */) { \
    ConstEigenVectorArrayMap<T> base_arr(A, N);                              \
    EigenVectorArrayMap<T>(Y, N) = base_arr.pow(b);                          \
  }
CAFFE2_EIGEN_POWX(float)
CAFFE2_EIGEN_POWX(double)
#undef CAFFE2_EIGEN_POWX
// Specializes Cbrt<T, CPUContext>: writes the cube root of each of the N
// elements of X into Y. The context argument is unused.
//
// std::cbrt is named explicitly so the overload matching T is always chosen;
// an unqualified cbrt can resolve to the C double-only function, which would
// silently round-trip float inputs through double.
#define CAFFE2_SPECIALIZED_CBRT(T)                                       \
  template <>                                                            \
  C10_EXPORT void Cbrt<T, CPUContext>(                                   \
      const int N, const T* X, T* Y, CPUContext* /* context */) {        \
    std::transform(X, X + N, Y, [](const T x) { return std::cbrt(x); }); \
  }
CAFFE2_SPECIALIZED_CBRT(float)
CAFFE2_SPECIALIZED_CBRT(double)
#undef CAFFE2_SPECIALIZED_CBRT
// Specializes Erf<T, CPUContext>: writes the error function of each of the N
// elements of X into Y. The context argument is unused.
//
// std::erf is named explicitly so the overload matching T is always chosen;
// an unqualified erf can resolve to the C double-only function, which would
// silently round-trip float inputs through double.
#define CAFFE2_SPECIALIZED_ERF(T)                                       \
  template <>                                                           \
  C10_EXPORT void Erf<T, CPUContext>(                                   \
      const int N, const T* X, T* Y, CPUContext* /* context */) {       \
    std::transform(X, X + N, Y, [](const T x) { return std::erf(x); }); \
  }
CAFFE2_SPECIALIZED_ERF(float)
CAFFE2_SPECIALIZED_ERF(double)
#undef CAFFE2_SPECIALIZED_ERF
// Specializes CdfNorm<T, CPUContext>: for each of the N elements x of X,
// stores (1 + erf(x * kRsqrt2)) * 0.5 into Y, where kRsqrt2 is the constant
// 0.7071067811865475. The context argument is unused.
//
// std::erf is named explicitly so the overload matching T is always chosen
// rather than the C double-only function.
#define CAFFE2_SPECIALIZED_CDF_NORM(T)                                 \
  template <>                                                          \
  C10_EXPORT void CdfNorm<T, CPUContext>(                              \
      const int N, const T* X, T* Y, CPUContext* /* context */) {      \
    std::transform(X, X + N, Y, [](const T x) {                        \
      constexpr T kRsqrt2 = 0.7071067811865475;                        \
      return (T(1) + std::erf(x * kRsqrt2)) * static_cast<T>(0.5);     \
    });                                                                \
  }
CAFFE2_SPECIALIZED_CDF_NORM(float)
CAFFE2_SPECIALIZED_CDF_NORM(double)
#undef CAFFE2_SPECIALIZED_CDF_NORM
// Eigen-backed two-input arithmetic kernels for floating-point types (used
// when MKL is absent).
//
// CAFFE2_EIGEN_BINARY_OP specializes Func<T, CPUContext> by viewing A and B
// as length-N Eigen arrays and storing the coefficient-wise `A Op B` into C.
// The context argument is unused.
#define CAFFE2_EIGEN_BINARY_OP(T, Func, Op)                                   \
  template <>                                                                 \
  C10_EXPORT void Func<T, CPUContext>(                                        \
      const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
    ConstEigenVectorArrayMap<T> lhs(A, N);                                    \
    ConstEigenVectorArrayMap<T> rhs(B, N);                                    \
    EigenVectorMap<T>(C, N) = lhs Op rhs;                                     \
  }
// Emits the float and double specializations of Func.
#define CAFFE2_EIGEN_BINARY_OP_FP(Func, Op) \
  CAFFE2_EIGEN_BINARY_OP(float, Func, Op)   \
  CAFFE2_EIGEN_BINARY_OP(double, Func, Op)
CAFFE2_EIGEN_BINARY_OP_FP(Add, +)
CAFFE2_EIGEN_BINARY_OP_FP(Sub, -)
CAFFE2_EIGEN_BINARY_OP_FP(Mul, *)
CAFFE2_EIGEN_BINARY_OP_FP(Div, /)
#undef CAFFE2_EIGEN_BINARY_OP_FP
#undef CAFFE2_EIGEN_BINARY_OP
// Specializes Axpby<TAlpha, TData, CPUContext> with Eigen, updating Y in
// place as Y = Y * beta + X * alpha over N elements. Two overloads are
// emitted: one taking alpha/beta by value and one reading them through
// pointers. Both scalars are converted to TData before use. The context
// argument is unused.
#define CAFFE2_EIGEN_AXPBY(TAlpha, TData)               \
  template <>                                           \
  C10_EXPORT void Axpby<TAlpha, TData, CPUContext>(     \
      const std::int64_t N,                             \
      const TAlpha alpha,                               \
      const TData* X,                                   \
      const TAlpha beta,                                \
      TData* Y,                                         \
      CPUContext* /* context */) {                      \
    const TData y_scale = static_cast<TData>(beta);     \
    const TData x_scale = static_cast<TData>(alpha);    \
    ConstEigenVectorArrayMap<TData> x_arr(X, N);        \
    EigenVectorArrayMap<TData> y_arr(Y, N);             \
    y_arr = y_arr * y_scale + x_arr * x_scale;          \
  }                                                     \
  template <>                                           \
  C10_EXPORT void Axpby<TAlpha, TData, CPUContext>(     \
      const std::int64_t N,                             \
      const TAlpha* alpha,                              \
      const TData* X,                                   \
      const TAlpha* beta,                               \
      TData* Y,                                         \
      CPUContext* /* context */) {                      \
    const TData y_scale = static_cast<TData>(*beta);    \
    const TData x_scale = static_cast<TData>(*alpha);   \
    ConstEigenVectorArrayMap<TData> x_arr(X, N);        \
    EigenVectorArrayMap<TData> y_arr(Y, N);             \
    y_arr = y_arr * y_scale + x_arr * x_scale;          \
  }
CAFFE2_EIGEN_AXPBY(float, float)
#undef CAFFE2_EIGEN_AXPBY
#endif // CAFFE2_USE_MKL
////////////////////////////////////////////////////////////////////////////////
// BLAS alternatives.
// Depending on whether an external BLAS library has been specified, the
// BLAS-related Caffe2 math functions are delegated either to the CBLAS call
// or to the Eigen implementation.
////////////////////////////////////////////////////////////////////////////////
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
// Specializes Scale<TAlpha, TData, CPUContext> with Eigen: Y = X * alpha over
// N elements, with alpha converted to TData. When X and Y are the same
// buffer the multiplication is applied in place. Two overloads are emitted:
// alpha by value and alpha through a pointer. The context argument is
// unused.
#define CAFFE2_EIGEN_SCALE(TAlpha, TData)                      \
  template <>                                                  \
  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(            \
      const std::int64_t N,                                    \
      const TAlpha alpha,                                      \
      const TData* X,                                          \
      TData* Y,                                                \
      CPUContext* /* context */) {                             \
    const TData factor = static_cast<TData>(alpha);            \
    EigenVectorArrayMap<TData> y_arr(Y, N);                    \
    if (X != Y) {                                              \
      y_arr = ConstEigenVectorArrayMap<TData>(X, N) * factor;  \
      return;                                                  \
    }                                                          \
    y_arr *= factor;                                           \
  }                                                            \
  template <>                                                  \
  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(            \
      const std::int64_t N,                                    \
      const TAlpha* alpha,                                     \
      const TData* X,                                          \
      TData* Y,                                                \
      CPUContext* /* context */) {                             \
    const TData factor = static_cast<TData>(*alpha);           \
    EigenVectorArrayMap<TData> y_arr(Y, N);                    \
    if (X != Y) {                                              \
      y_arr = ConstEigenVectorArrayMap<TData>(X, N) * factor;  \
      return;                                                  \
    }                                                          \
    y_arr *= factor;                                           \
  }
CAFFE2_EIGEN_SCALE(float, float)
CAFFE2_EIGEN_SCALE(double, double)
CAFFE2_EIGEN_SCALE(float, double)
#undef CAFFE2_EIGEN_SCALE
// Specializes Axpy<TAlpha, TData, CPUContext> with Eigen, accumulating
// Y += X * alpha over N elements with alpha converted to TData. Two
// overloads are emitted: alpha by value and alpha through a pointer. The
// context argument is unused.
#define CAFFE2_EIGEN_AXPY(TAlpha, TData)              \
  template <>                                         \
  C10_EXPORT void Axpy<TAlpha, TData, CPUContext>(    \
      const std::int64_t N,                           \
      const TAlpha alpha,                             \
      const TData* X,                                 \
      TData* Y,                                       \
      CPUContext* /* context */) {                    \
    const TData factor = static_cast<TData>(alpha);   \
    ConstEigenVectorArrayMap<TData> x_arr(X, N);      \
    EigenVectorArrayMap<TData> y_arr(Y, N);           \
    y_arr += x_arr * factor;                          \
  }                                                   \
  template <>                                         \
  C10_EXPORT void Axpy<TAlpha, TData, CPUContext>(    \
      const std::int64_t N,                           \
      const TAlpha* alpha,                            \
      const TData* X,                                 \
      TData* Y,                                       \
      CPUContext* /* context */) {                    \
    const TData factor = static_cast<TData>(*alpha);  \
    ConstEigenVectorArrayMap<TData> x_arr(X, N);      \
    EigenVectorArrayMap<TData> y_arr(Y, N);           \
    y_arr += x_arr * factor;                          \
  }
CAFFE2_EIGEN_AXPY(float, float)
CAFFE2_EIGEN_AXPY(float, double)
#undef CAFFE2_EIGEN_AXPY
#else // CAFFE2_USE_EIGEN_FOR_BLAS
#ifdef CAFFE2_USE_MKL
// Shared function body for the MKL Scale specializations below.
//
// The N elements are processed as N / max whole pieces of
// max = std::numeric_limits<int32_t>::max() elements, followed by one
// trailing piece of N % max elements when that is non-zero (presumably
// because the BLAS routines take an int-sized length -- confirm against the
// MKL interface in use). For every piece:
//   - Y == X: InplaceFn scales Y in place by AlphaExpr;
//   - otherwise: AxpbyFn is called with (AlphaExpr, X) and (TData(0), Y).
// AlphaExpr is converted to TData at each call.
#define CAFFE2_MKL_SCALE_BODY(TData, AlphaExpr, InplaceFn, AxpbyFn)      \
  const int piece_max = std::numeric_limits<int32_t>::max();             \
  const int whole_pieces = N / piece_max;                                \
  const int tail_len = N % piece_max;                                    \
  const auto scale_piece = [&](const int len, const std::int64_t at) {   \
    if (Y != X) {                                                        \
      AxpbyFn(                                                           \
          len,                                                           \
          static_cast<TData>(AlphaExpr),                                 \
          X + at,                                                        \
          1,                                                             \
          TData(0),                                                      \
          Y + at,                                                        \
          1);                                                            \
    } else {                                                             \
      InplaceFn(len, static_cast<TData>(AlphaExpr), Y + at, 1);          \
    }                                                                    \
  };                                                                     \
  std::int64_t cursor = 0;                                               \
  for (int piece = 0; piece < whole_pieces; ++piece) {                   \
    scale_piece(piece_max, cursor);                                      \
    cursor += piece_max;                                                 \
  }                                                                      \
  if (tail_len != 0) {                                                   \
    scale_piece(tail_len, cursor);                                       \
  }
// Specializes Scale<TAlpha, TData, CPUContext> on top of two BLAS-style
// routines: InplaceFn for the aliased case and AxpbyFn for distinct buffers.
// Two overloads are emitted: alpha by value and alpha through a pointer. The
// context argument is unused.
#define CAFFE2_MKL_SCALE(TAlpha, TData, InplaceFn, AxpbyFn)      \
  template <>                                                    \
  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(              \
      const std::int64_t N,                                      \
      const TAlpha alpha,                                        \
      const TData* X,                                            \
      TData* Y,                                                  \
      CPUContext* /* context */) {                               \
    CAFFE2_MKL_SCALE_BODY(TData, alpha, InplaceFn, AxpbyFn)      \
  }                                                              \
  template <>                                                    \
  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(              \
      const std::int64_t N,                                      \
      const TAlpha* alpha,                                       \
      const TData* X,                                            \
      TData* Y,                                                  \
      CPUContext* /* context */) {                               \
    CAFFE2_MKL_SCALE_BODY(TData, *alpha, InplaceFn, AxpbyFn)     \
  }
CAFFE2_MKL_SCALE(float, float, cblas_sscal, cblas_saxpby)
CAFFE2_MKL_SCALE(double, double, cblas_dscal, cblas_daxpby)
CAFFE2_MKL_SCALE(float, double, cblas_dscal, cblas_daxpby)
#undef CAFFE2_MKL_SCALE
#undef CAFFE2_MKL_SCALE_BODY
#else // CAFFE2_USE_MKL
// Specializes Scale<TAlpha, TData, CPUContext> for a generic CBLAS backend:
// Y = X * alpha over N elements, with alpha converted to TData.
//   - Y == X: the in-place BLAS routine BLASFunc scales Y with unit stride.
//   - otherwise: the product is computed with Eigen.
// Two overloads are emitted: alpha by value and alpha through a pointer. The
// context argument is unused.
//
// N is 64-bit but BLASFunc receives its length as an int-sized count, so the
// in-place path walks the buffer in pieces of at most
// std::numeric_limits<int>::max() elements instead of passing N directly
// (which would silently truncate large N). Scaling is elementwise, so
// splitting does not change the result.
#define DELEGATE_SCALE(TAlpha, TData, BLASFunc)                               \
  template <>                                                                 \
  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                           \
      const std::int64_t N,                                                   \
      const TAlpha alpha,                                                     \
      const TData* X,                                                         \
      TData* Y,                                                               \
      CPUContext* /* context */) {                                            \
    if (Y == X) {                                                             \
      constexpr std::int64_t kMaxChunk = std::numeric_limits<int>::max();     \
      std::int64_t done = 0;                                                  \
      while (done < N) {                                                      \
        const int len =                                                       \
            static_cast<int>(std::min<std::int64_t>(N - done, kMaxChunk));    \
        BLASFunc(len, static_cast<TData>(alpha), Y + done, 1);                \
        done += len;                                                          \
      }                                                                       \
    } else {                                                                  \
      EigenVectorArrayMap<TData>(Y, N) =                                      \
          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(alpha);  \
    }                                                                         \
  }                                                                           \
  template <>                                                                 \
  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(                           \
      const std::int64_t N,                                                   \
      const TAlpha* alpha,                                                    \
      const TData* X,                                                         \
      TData* Y,                                                               \
      CPUContext* /* context */) {                                            \
    if (Y == X) {                                                             \
      constexpr std::int64_t kMaxChunk = std::numeric_limits<int>::max();     \
      std::int64_t done = 0;                                                  \
      while (done < N) {                                                      \
        const int len =                                                       \
            static_cast<int>(std::min<std::int64_t>(N - done, kMaxChunk));    \
        BLASFunc(len, static_cast<TData>(*alpha), Y + done, 1);               \
        done += len;                                                          \
      }                                                                       \
    } else {                                                                  \
      EigenVectorArrayMap<TData>(Y, N) =                                      \
          ConstEigenVectorArrayMap<TData>(X, N) * static_cast<TData>(*alpha); \
    }                                                                         \
  }
DELEGATE_SCALE(float, float, cblas_sscal)
DELEGATE_SCALE(double, double, cblas_dscal)
DELEGATE_SCALE(float, double, cblas_dscal)
#undef DELEGATE_SCALE
#endif // CAFFE2_USE_MKL
// Specializes Axpy<TAlpha, TData, CPUContext> on top of the BLAS routine
// BLASFunc, accumulating Y += alpha * X with unit strides and alpha converted
// to TData. Two overloads are emitted: alpha by value and alpha through a
// pointer. The context argument is unused.
//
// N is 64-bit but BLASFunc receives its length as an int-sized count, so the
// range is processed in pieces of at most std::numeric_limits<int>::max()
// elements instead of passing N directly (which would silently truncate
// large N). The update is elementwise, so splitting does not change the
// result.
#define DELEGATE_AXPY(TAlpha, TData, BLASFunc)                              \
  template <>                                                               \
  C10_EXPORT void Axpy<TAlpha, TData, CPUContext>(                          \
      const std::int64_t N,                                                 \
      const TAlpha alpha,                                                   \
      const TData* X,                                                       \
      TData* Y,                                                             \
      CPUContext* /* context */) {                                          \
    constexpr std::int64_t kMaxChunk = std::numeric_limits<int>::max();     \
    std::int64_t done = 0;                                                  \
    while (done < N) {                                                      \
      const int len =                                                       \
          static_cast<int>(std::min<std::int64_t>(N - done, kMaxChunk));    \
      BLASFunc(len, static_cast<TData>(alpha), X + done, 1, Y + done, 1);   \
      done += len;                                                          \
    }                                                                       \
  }                                                                         \
  template <>                                                               \
  C10_EXPORT void Axpy<TAlpha, TData, CPUContext>(                          \
      const std::int64_t N,                                                 \
      const TAlpha* alpha,                                                  \
      const TData* X,                                                       \
      TData* Y,                                                             \
      CPUContext* /* context */) {                                          \
    constexpr std::int64_t kMaxChunk = std::numeric_limits<int>::max();     \
    std::int64_t done = 0;                                                  \
    while (done < N) {                                                      \
      const int len =                                                       \
          static_cast<int>(std::min<std::int64_t>(N - done, kMaxChunk));    \
      BLASFunc(len, static_cast<TData>(*alpha), X + done, 1, Y + done, 1);  \
      done += len;                                                          \
    }                                                                       \
  }
DELEGATE_AXPY(float, float, cblas_saxpy)
DELEGATE_AXPY(float, double, cblas_daxpy)
#undef DELEGATE_AXPY
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
////////////////////////////////////////////////////////////////////////////////
// Common math functions used in Caffe2 that do not have a BLAS or MKL
// equivalent. All of these functions are implemented either via Eigen or via
// custom code.
////////////////////////////////////////////////////////////////////////////////
// Specializes Set<T, CPUContext>: fills the N elements of Y with alpha.
//   - N == 0 returns immediately without touching Y.
//   - alpha equal to T(0) zeroes the buffer bytewise with std::memset.
//   - any other value is written through Eigen's setConstant.
// The context argument is unused.
#define CAFFE2_FILL_WITH_CONSTANT(T)                                          \
  template <>                                                                 \
  C10_EXPORT void Set<T, CPUContext>(                                         \
      const std::int64_t N, const T alpha, T* Y, CPUContext* /* context */) { \
    if (N == 0) {                                                             \
      return;                                                                 \
    }                                                                         \
    if (alpha != T(0)) {                                                      \
      EigenVectorArrayMap<T>(Y, N).setConstant(alpha);                        \
      return;                                                                 \
    }                                                                         \
    std::memset(Y, 0, N * sizeof(T));                                         \
  }
CAFFE2_FILL_WITH_CONSTANT(float)
CAFFE2_FILL_WITH_CONSTANT(double)
CAFFE2_FILL_WITH_CONSTANT(int)
CAFFE2_FILL_WITH_CONSTANT(std::int8_t)
CAFFE2_FILL_WITH_CONSTANT(std::int16_t)
CAFFE2_FILL_WITH_CONSTANT(std::int64_t)
CAFFE2_FILL_WITH_CONSTANT(bool)
CAFFE2_FILL_WITH_CONSTANT(char)
CAFFE2_FILL_WITH_CONSTANT(std::uint8_t)
CAFFE2_FILL_WITH_CONSTANT(std::uint16_t)
#undef CAFFE2_FILL_WITH_CONSTANT
// Eigen-backed single-input kernels that are built regardless of the MKL
// setting.
//
// CAFFE2_EIGEN_UNARY specializes Func<T, CPUContext> by viewing X and Y as
// length-N Eigen arrays and assigning X.EigenFn() into Y. The context
// argument is unused.
#define CAFFE2_EIGEN_UNARY(T, Func, EigenFn)                      \
  template <>                                                     \
  C10_EXPORT void Func<T, CPUContext>(                            \
      const int N, const T* X, T* Y, CPUContext* /* context */) { \
    ConstEigenVectorArrayMap<T> x_arr(X, N);                      \
    EigenVectorArrayMap<T>(Y, N) = x_arr.EigenFn();               \
  }
// Emits the std::int32_t and std::int64_t specializations of Func.
#define CAFFE2_EIGEN_UNARY_INT(Func, EigenFn)     \
  CAFFE2_EIGEN_UNARY(std::int32_t, Func, EigenFn) \
  CAFFE2_EIGEN_UNARY(std::int64_t, Func, EigenFn)
// Emits the float and double specializations of Func.
#define CAFFE2_EIGEN_UNARY_FP(Func, EigenFn) \
  CAFFE2_EIGEN_UNARY(float, Func, EigenFn)   \
  CAFFE2_EIGEN_UNARY(double, Func, EigenFn)
// Eigen's Tanh implementation is faster than MKL, so use Eigen here.
CAFFE2_EIGEN_UNARY_FP(Tanh, tanh)
CAFFE2_EIGEN_UNARY_INT(Sign, sign)
CAFFE2_EIGEN_UNARY_FP(Sign, sign)
CAFFE2_EIGEN_UNARY_INT(Abs, abs)
CAFFE2_EIGEN_UNARY_INT(Cube, cube)
CAFFE2_EIGEN_UNARY_FP(Cube, cube)
#undef CAFFE2_EIGEN_UNARY_FP
#undef CAFFE2_EIGEN_UNARY_INT
#undef CAFFE2_EIGEN_UNARY
// Specializes Neg<T, CPUContext> with Eigen: Y receives the elementwise
// negation of the N elements of X. The context argument is unused.
#define CAFFE2_EIGEN_NEG(T)                                       \
  template <>                                                     \
  C10_EXPORT void Neg<T, CPUContext>(                             \
      const int N, const T* X, T* Y, CPUContext* /* context */) { \
    ConstEigenVectorArrayMap<T> x_arr(X, N);                      \
    EigenVectorArrayMap<T>(Y, N) = -x_arr;                        \
  }
CAFFE2_EIGEN_NEG(std::int32_t)
CAFFE2_EIGEN_NEG(std::int64_t)
CAFFE2_EIGEN_NEG(float)
CAFFE2_EIGEN_NEG(double)
#undef CAFFE2_EIGEN_NEG
// Specializes Scale<TAlpha, TData, CPUContext> for integer types with Eigen:
// Y = X * alpha over N elements, with alpha converted to TData. When X and Y
// are the same buffer the multiplication is applied in place. Two overloads
// are emitted: alpha by value and alpha through a pointer. The context
// argument is unused.
#define CAFFE2_EIGEN_INT_SCALE(TAlpha, TData)                  \
  template <>                                                  \
  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(            \
      const std::int64_t N,                                    \
      const TAlpha alpha,                                      \
      const TData* X,                                          \
      TData* Y,                                                \
      CPUContext* /* context */) {                             \
    const TData factor = static_cast<TData>(alpha);            \
    EigenVectorArrayMap<TData> y_arr(Y, N);                    \
    if (X != Y) {                                              \
      y_arr = ConstEigenVectorArrayMap<TData>(X, N) * factor;  \
      return;                                                  \
    }                                                          \
    y_arr *= factor;                                           \
  }                                                            \
  template <>                                                  \
  C10_EXPORT void Scale<TAlpha, TData, CPUContext>(            \
      const std::int64_t N,                                    \
      const TAlpha* alpha,                                     \
      const TData* X,                                          \
      TData* Y,                                                \
      CPUContext* /* context */) {                             \
    const TData factor = static_cast<TData>(*alpha);           \
    EigenVectorArrayMap<TData> y_arr(Y, N);                    \
    if (X != Y) {                                              \
      y_arr = ConstEigenVectorArrayMap<TData>(X, N) * factor;  \
      return;                                                  \
    }                                                          \
    y_arr *= factor;                                           \
  }
CAFFE2_EIGEN_INT_SCALE(std::int32_t, std::int32_t)
CAFFE2_EIGEN_INT_SCALE(std::int64_t, std::int64_t)
#undef CAFFE2_EIGEN_INT_SCALE
// Eigen-backed two-input arithmetic kernels for integer types.
//
// CAFFE2_EIGEN_BINARY_OP specializes Func<T, CPUContext> by viewing A and B
// as length-N Eigen arrays and storing the coefficient-wise `A Op B` into C.
// The context argument is unused.
#define CAFFE2_EIGEN_BINARY_OP(T, Func, Op)                                   \
  template <>                                                                 \
  C10_EXPORT void Func<T, CPUContext>(                                        \
      const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
    ConstEigenVectorArrayMap<T> lhs(A, N);                                    \
    ConstEigenVectorArrayMap<T> rhs(B, N);                                    \
    EigenVectorMap<T>(C, N) = lhs Op rhs;                                     \
  }
// Emits the std::int32_t and std::int64_t specializations of Func.
#define CAFFE2_EIGEN_BINARY_OP_INT(Func, Op)     \
  CAFFE2_EIGEN_BINARY_OP(std::int32_t, Func, Op) \
  CAFFE2_EIGEN_BINARY_OP(std::int64_t, Func, Op)
CAFFE2_EIGEN_BINARY_OP_INT(Add, +)
CAFFE2_EIGEN_BINARY_OP_INT(Sub, -)
CAFFE2_EIGEN_BINARY_OP_INT(Mul, *)
CAFFE2_EIGEN_BINARY_OP_INT(Div, /)
#undef CAFFE2_EIGEN_BINARY_OP_INT
#undef CAFFE2_EIGEN_BINARY_OP
// Eigen-backed two-input kernels expressed as an array member function.
//
// CAFFE2_EIGEN_BINARY_FN specializes Func<T, CPUContext> by viewing A and B
// as length-N Eigen arrays and storing A.EigenFn(B) into C. The context
// argument is unused.
#define CAFFE2_EIGEN_BINARY_FN(T, Func, EigenFn)                              \
  template <>                                                                 \
  C10_EXPORT void Func<T, CPUContext>(                                        \
      const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
    ConstEigenVectorArrayMap<T> lhs(A, N);                                    \
    ConstEigenVectorArrayMap<T> rhs(B, N);                                    \
    EigenVectorMap<T>(C, N) = lhs.EigenFn(rhs);                               \
  }
// Emits the std::int32_t, std::int64_t, float and double specializations.
#define CAFFE2_EIGEN_BINARY_FN_ALL(Func, EigenFn)     \
  CAFFE2_EIGEN_BINARY_FN(std::int32_t, Func, EigenFn) \
  CAFFE2_EIGEN_BINARY_FN(std::int64_t, Func, EigenFn) \
  CAFFE2_EIGEN_BINARY_FN(float, Func, EigenFn)        \
  CAFFE2_EIGEN_BINARY_FN(double, Func, EigenFn)
CAFFE2_EIGEN_BINARY_FN_ALL(Min, min)
CAFFE2_EIGEN_BINARY_FN_ALL(Max, max)
#undef CAFFE2_EIGEN_BINARY_FN_ALL
#undef CAFFE2_EIGEN_BINARY_FN
// Two-input kernels built on a standard-library function object.
//
// CAFFE2_STD_BINARY specializes Func<T, CPUContext> by applying Functor to
// each pair (A[i], B[i]) for i in [0, N) via std::transform and writing the
// result to C[i]. The context argument is unused.
#define CAFFE2_STD_BINARY(T, Func, Functor)                                   \
  template <>                                                                 \
  C10_EXPORT void Func<T, CPUContext>(                                        \
      const int N, const T* A, const T* B, T* C, CPUContext* /* context */) { \
    const T* const a_end = A + N;                                             \
    std::transform(A, a_end, B, C, Functor);                                  \
  }
// Logical operations on bool.
CAFFE2_STD_BINARY(
    bool,
    And,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::logical_and<bool>())
CAFFE2_STD_BINARY(
    bool,
    Or,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::logical_or<bool>())
// NOLINTNEXTLINE(modernize-use-transparent-functors)
CAFFE2_STD_BINARY(bool, Xor, std::bit_xor<bool>())
// Bitwise AND.
CAFFE2_STD_BINARY(
    bool,
    BitwiseAnd,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_and<bool>())
CAFFE2_STD_BINARY(
    std::int32_t,
    BitwiseAnd,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_and<std::int32_t>())
CAFFE2_STD_BINARY(
    std::int64_t,
    BitwiseAnd,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_and<std::int64_t>())
// Bitwise OR.
CAFFE2_STD_BINARY(
    bool,
    BitwiseOr,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_or<bool>())
CAFFE2_STD_BINARY(
    std::int32_t,
    BitwiseOr,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_or<std::int32_t>())
CAFFE2_STD_BINARY(
    std::int64_t,
    BitwiseOr,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_or<std::int64_t>())
// Bitwise XOR.
CAFFE2_STD_BINARY(
    bool,
    BitwiseXor,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_xor<bool>())
CAFFE2_STD_BINARY(
    std::int32_t,
    BitwiseXor,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_xor<std::int32_t>())
CAFFE2_STD_BINARY(
    std::int64_t,
    BitwiseXor,
    // NOLINTNEXTLINE(modernize-use-transparent-functors)
    std::bit_xor<std::int64_t>())
#undef CAFFE2_STD_BINARY
// Eigen-backed elementwise comparison kernels.
//
// CAFFE2_EIGEN_COMPARE specializes Func<T, CPUContext> by viewing A and B as
// length-N Eigen arrays and storing the coefficient-wise `A Op B` into the
// bool output C. The context argument is unused.
#define CAFFE2_EIGEN_COMPARE(T, Func, Op)      \
  template <>                                  \
  C10_EXPORT void Func<T, CPUContext>(         \
      const int N,                             \
      const T* A,                              \
      const T* B,                              \
      bool* C,                                 \
      CPUContext* /* context */) {             \
    ConstEigenVectorArrayMap<T> lhs(A, N);     \
    ConstEigenVectorArrayMap<T> rhs(B, N);     \
    EigenVectorArrayMap<bool>(C, N) = lhs Op rhs; \
  }
// Emits the bool, std::int32_t, std::int64_t, float and double
// specializations of Func.
#define CAFFE2_EIGEN_COMPARE_ALL(Func, Op)     \
  CAFFE2_EIGEN_COMPARE(bool, Func, Op)         \
  CAFFE2_EIGEN_COMPARE(std::int32_t, Func, Op) \
  CAFFE2_EIGEN_COMPARE(std::int64_t, Func, Op) \
  CAFFE2_EIGEN_COMPARE(float, Func, Op)        \
  CAFFE2_EIGEN_COMPARE(double, Func, Op)
CAFFE2_EIGEN_COMPARE_ALL(EQ, ==)
CAFFE2_EIGEN_COMPARE_ALL(NE, !=)
CAFFE2_EIGEN_COMPARE_ALL(LT, <)
CAFFE2_EIGEN_COMPARE_ALL(LE, <=)
CAFFE2_EIGEN_COMPARE_ALL(GT, >)
CAFFE2_EIGEN_COMPARE_ALL(GE, >=)
#undef CAFFE2_EIGEN_COMPARE_ALL
#undef CAFFE2_EIGEN_COMPARE
} // namespace math
} // namespace caffe2