// Implements the math functions for CPU.
// The implementation in this file allows us to route the underlying numerical
// computation library to different backends. Notably:
// (1) For all BLAS-related functions, one can explicitly request a BLAS
// backend such as MKL, OpenBLAS, or ATLAS. For the set of backends
// currently supported, check //third_party/blas/.
// (2) If one chooses to link against MKL, we utilize MKL's vector math library
// (VML) for a few functions such as Exp and Log.
// (3) Fallback implementations are provided in Eigen for cross-platform
// support. Since Eigen is a header-only library and supports a number of
// platforms, it allows one to quickly port Caffe2 to different platforms
// where BLAS may not be present.
#include "caffe2/utils/math.h"
#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <limits>
#include <numeric>
#include <random>
#include <tuple>
#include <unordered_set>
#include <vector>
#include "caffe2/core/context.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/fixed_divisor.h"
#include "Eigen/Core"
#include "Eigen/Dense"
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif // CAFFE2_USE_MKL
#ifdef CAFFE2_USE_HPTT
#include <hptt.h>
#endif // CAFFE2_USE_HPTT
#if defined(_MSC_VER)
#include <process.h>
#endif
namespace caffe2 {
namespace math {
////////////////////////////////////////////////////////////////////////////////
// BLAS alternatives.
// Depending on whether an external BLAS library has been specified, the
// BLAS-related Caffe2 math functions are delegated to either the CBLAS
// call or the Eigen implementation.
////////////////////////////////////////////////////////////////////////////////
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
// Caffe2 gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
//
// The gemm call implements the following operation:
//
// C = alpha * op(A) * op(B) + beta * C
//
// where op(A) has size M x K, op(B) has size K x N, and C has size M x N. Each
// of A, B, and C is a matrix, and alpha and beta are scalars. Note that the
// most common use case of gemm will involve setting alpha to 1 and beta to 0.
//
// op(A) and op(B) represent the transformations applied to A and B before the
// matrix multiply: op(A) equals A when trans_A is CblasNoTrans and equals A^T
// (the transpose) when trans_A is CblasTrans, and likewise for op(B) with
// trans_B.
template <>
void Gemm<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const float* B,
const float beta,
float* C,
CPUContext* context,
TensorProto::DataType math_type) {
auto C_mat = EigenMatrixMap<float>(C, N, M);
if (beta == 0) {
C_mat.setZero();
} else {
C_mat *= beta;
}
switch (trans_A) {
case CblasNoTrans: {
switch (trans_B) {
case CblasNoTrans:
C_mat.noalias() += alpha *
(ConstEigenMatrixMap<float>(B, N, K) *
ConstEigenMatrixMap<float>(A, K, M));
return;
case CblasTrans:
C_mat.noalias() += alpha *
(ConstEigenMatrixMap<float>(B, K, N).transpose() *
ConstEigenMatrixMap<float>(A, K, M));
return;
default:
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
}
}
case CblasTrans: {
switch (trans_B) {
case CblasNoTrans:
C_mat.noalias() += alpha *
(ConstEigenMatrixMap<float>(B, N, K) *
ConstEigenMatrixMap<float>(A, M, K).transpose());
return;
case CblasTrans:
C_mat.noalias() += alpha *
(ConstEigenMatrixMap<float>(B, K, N).transpose() *
ConstEigenMatrixMap<float>(A, M, K).transpose());
return;
default:
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
}
}
default:
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A";
}
}
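// Example usage (a sketch, not part of the library; assumes a CPUContext
// named `ctx` and that the trailing math_type parameter takes its default):
//
//   // C (2 x 2) = 1.0f * A (2 x 3) * B (3 x 2) + 0.0f * C
//   float A[6] = {1, 2, 3, 4, 5, 6};
//   float B[6] = {1, 0, 0, 1, 1, 1};
//   float C[4];
//   Gemm<float, CPUContext>(
//       CblasNoTrans, CblasNoTrans, 2, 2, 3, 1.0f, A, B, 0.0f, C, &ctx);
//   // C == {4, 5, 10, 11} in row-major order.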
template <>
void GemmEx<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const int lda,
const float* B,
const int ldb,
const float beta,
float* C,
const int ldc,
CPUContext*) {
EigenOuterStridedMatrixMap<float> C_mat(C, N, M, EigenOuterStride(ldc));
if (beta == 0) {
C_mat.setZero();
} else {
C_mat *= beta;
}
switch (trans_A) {
case CblasNoTrans: {
switch (trans_B) {
case CblasNoTrans:
C_mat.noalias() += alpha *
(ConstEigenOuterStridedMatrixMap<float>(
B, N, K, EigenOuterStride(ldb)) *
ConstEigenOuterStridedMatrixMap<float>(
A, K, M, EigenOuterStride(lda)));
return;
case CblasTrans:
C_mat.noalias() += alpha *
(ConstEigenOuterStridedMatrixMap<float>(
B, K, N, EigenOuterStride(ldb))
.transpose() *
ConstEigenOuterStridedMatrixMap<float>(
A, K, M, EigenOuterStride(lda)));
return;
default:
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
}
}
case CblasTrans: {
switch (trans_B) {
case CblasNoTrans:
C_mat.noalias() += alpha *
(ConstEigenOuterStridedMatrixMap<float>(
B, N, K, EigenOuterStride(ldb)) *
ConstEigenOuterStridedMatrixMap<float>(
A, M, K, EigenOuterStride(lda))
.transpose());
return;
case CblasTrans:
C_mat.noalias() += alpha *
(ConstEigenOuterStridedMatrixMap<float>(
B, K, N, EigenOuterStride(ldb))
.transpose() *
ConstEigenOuterStridedMatrixMap<float>(
A, M, K, EigenOuterStride(lda))
.transpose());
return;
default:
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B";
}
}
default:
LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A";
}
}
template <>
void Gemv<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
const int M,
const int N,
const float alpha,
const float* A,
const float* x,
const float beta,
float* y,
CPUContext* context,
TensorProto::DataType math_type) {
EigenVectorMap<float> y_vec(y, trans_A == CblasNoTrans ? M : N);
if (beta == 0) {
    // In Caffe2 we often use lazy initialization, so y may contain NaNs.
    // Since beta * NaN is NaN even when beta is 0, we explicitly zero the
    // output instead of scaling it.
y_vec.setZero();
} else {
y_vec *= beta;
}
switch (trans_A) {
case CblasNoTrans: {
y_vec.noalias() += alpha *
(ConstEigenMatrixMap<float>(A, N, M).transpose() *
ConstEigenVectorMap<float>(x, N));
return;
}
case CblasTrans: {
y_vec.noalias() += alpha *
(ConstEigenMatrixMap<float>(A, N, M) *
ConstEigenVectorMap<float>(x, M));
return;
}
default:
LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input.";
}
}
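// Example usage (a sketch, assuming a CPUContext named `ctx` and that the
// trailing math_type parameter takes its default):
//
//   // y (2) = 1.0f * A (2 x 3) * x (3) + 0.0f * y
//   float A[6] = {1, 2, 3, 4, 5, 6};
//   float x[3] = {1, 1, 1};
//   float y[2];
//   Gemv<float, CPUContext>(CblasNoTrans, 2, 3, 1.0f, A, x, 0.0f, y, &ctx);
//   // y == {6, 15}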
#define CAFFE2_SPECIALIZED_DOT(T) \
template <> \
void Dot<T, CPUContext>( \
const int N, const T* a, const T* b, T* y, CPUContext* context) { \
*y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \
}
CAFFE2_SPECIALIZED_DOT(float)
#undef CAFFE2_SPECIALIZED_DOT
#define CAFFE2_SPECIALIZED_AXPY(T) \
template <> \
void Axpy<T, CPUContext>( \
const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \
EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \
} \
template <> \
void Axpy<T, CPUContext>( \
const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \
EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha); \
}
CAFFE2_SPECIALIZED_AXPY(float)
#undef CAFFE2_SPECIALIZED_AXPY
#define CAFFE2_SPECIALIZED_AXPBY(T) \
template <> \
void Axpby<T, CPUContext>( \
const int N, \
const T alpha, \
const T* x, \
const T beta, \
T* y, \
CPUContext* context) { \
EigenVectorMap<T> y_vec(y, N); \
y_vec = y_vec * beta + ConstEigenVectorMap<T>(x, N) * alpha; \
}
CAFFE2_SPECIALIZED_AXPBY(float)
#undef CAFFE2_SPECIALIZED_AXPBY
#else // CAFFE2_USE_EIGEN_FOR_BLAS
template <>
void Gemm<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const float* B,
const float beta,
float* C,
CPUContext* /*context*/,
TensorProto::DataType /*math_type*/) {
const int lda = (trans_A == CblasNoTrans) ? K : M;
const int ldb = (trans_B == CblasNoTrans) ? N : K;
cblas_sgemm(
CblasRowMajor,
trans_A,
trans_B,
M,
N,
K,
alpha,
A,
lda,
B,
ldb,
beta,
C,
N);
}
template <>
void GemmEx<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const int lda,
const float* B,
const int ldb,
const float beta,
float* C,
const int ldc,
CPUContext* /*context*/) {
cblas_sgemm(
CblasRowMajor,
trans_A,
trans_B,
M,
N,
K,
alpha,
A,
lda,
B,
ldb,
beta,
C,
ldc);
}
template <>
void Gemv<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
const int M,
const int N,
const float alpha,
const float* A,
const float* x,
const float beta,
float* y,
CPUContext* /*context*/,
TensorProto::DataType /*math_type*/) {
cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1);
}
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix) \
template <> \
void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha alpha, \
const TData* x, \
TData* y, \
CPUContext*) { \
if (y != x) { \
cblas_##prefix##copy(n, x, 1, y, 1); \
} \
if (alpha != TAlpha(1)) { \
cblas_##prefix##scal(n, static_cast<TData>(alpha), y, 1); \
} \
} \
template <> \
void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha* alpha, \
const TData* x, \
TData* y, \
CPUContext*) { \
if (y != x) { \
cblas_##prefix##copy(n, x, 1, y, 1); \
} \
if (*alpha != TAlpha(1)) { \
cblas_##prefix##scal(n, static_cast<TData>(*alpha), y, 1); \
} \
}
CAFFE2_SPECIALIZED_SCALE(float, float, s)
CAFFE2_SPECIALIZED_SCALE(double, double, d)
CAFFE2_SPECIALIZED_SCALE(float, double, d)
#undef CAFFE2_SPECIALIZED_SCALE
#define CAFFE2_SPECIALIZED_DOT(T, prefix) \
template <> \
void Dot<T, CPUContext>( \
const int N, const T* a, const T* b, T* y, CPUContext*) { \
*y = cblas_##prefix##dot(N, a, 1, b, 1); \
}
CAFFE2_SPECIALIZED_DOT(float, s)
#undef CAFFE2_SPECIALIZED_DOT
#define CAFFE2_SPECIALIZED_AXPY(T, prefix) \
template <> \
void Axpy<T, CPUContext>( \
const int N, const T alpha, const T* x, T* y, CPUContext*) { \
cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
} \
template <> \
void Axpy<T, CPUContext>( \
const int N, const T* alpha, const T* x, T* y, CPUContext*) { \
cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \
}
CAFFE2_SPECIALIZED_AXPY(float, s)
#undef CAFFE2_SPECIALIZED_AXPY
// cblas_[sd]axpby is not a standard BLAS function; when MKL is not
// present, we emulate it below with a scal followed by an axpy.
#ifdef CAFFE2_USE_MKL
#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
template <> \
void Axpby<T, CPUContext>( \
const int N, \
const T alpha, \
const T* x, \
const T beta, \
T* y, \
CPUContext*) { \
cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \
}
#else // CAFFE2_USE_MKL
#define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \
template <> \
void Axpby<T, CPUContext>( \
const int N, \
const T alpha, \
const T* x, \
const T beta, \
T* y, \
CPUContext*) { \
cblas_##prefix##scal(N, beta, y, 1); \
cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \
}
#endif // CAFFE2_USE_MKL
CAFFE2_SPECIALIZED_AXPBY(float, s)
#undef CAFFE2_SPECIALIZED_AXPBY
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
#define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \
template <> \
void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha alpha, \
const TData* x, \
TData* y, \
CPUContext* /* context */) { \
EigenVectorMap<TData>(y, n) = \
ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(alpha); \
} \
template <> \
void Scale<TAlpha, TData, CPUContext>( \
const int n, \
const TAlpha* alpha, \
const TData* x, \
TData* y, \
CPUContext* /* context */) { \
EigenVectorMap<TData>(y, n) = \
ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(*alpha); \
}
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS
CAFFE2_SPECIALIZED_SCALE(float, float)
CAFFE2_SPECIALIZED_SCALE(double, double)
CAFFE2_SPECIALIZED_SCALE(float, double)
#endif // CAFFE2_USE_EIGEN_FOR_BLAS
CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t)
CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t)
#undef CAFFE2_SPECIALIZED_SCALE
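// Example usage of Scale (a sketch, assuming a CPUContext named `ctx`):
// Scale computes y = alpha * x elementwise, and x and y may alias.
//
//   float x[3] = {1.0f, 2.0f, 3.0f};
//   float y[3];
//   Scale<float, float, CPUContext>(3, 2.0f, x, y, &ctx);
//   // y == {2, 4, 6}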
template <>
void GemmBatched<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha,
const float** A,
const float** B,
const float beta,
float** C,
CPUContext* context,
TensorProto::DataType /* math_type */) {
#ifdef CAFFE2_USE_MKL
(void)context;
const int lda = (trans_A == CblasNoTrans) ? K : M;
const int ldb = (trans_B == CblasNoTrans) ? N : K;
const int ldc = N;
cblas_sgemm_batch(
CblasRowMajor,
&trans_A,
&trans_B,
&M,
&N,
&K,
&alpha,
A,
&lda,
B,
&ldb,
&beta,
C,
&ldc,
1,
&batch_size);
#else // CAFFE2_USE_MKL
// loop over matrices in the batch
for (int i = 0; i < batch_size; ++i) {
math::Gemm<float, CPUContext>(
trans_A, trans_B, M, N, K, alpha, A[i], B[i], beta, C[i], context);
}
#endif // CAFFE2_USE_MKL
}
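// Example usage (a sketch, assuming a CPUContext named `ctx`, operand
// buffers filled elsewhere, and that the trailing math_type parameter takes
// its default): two independent 2 x 2 products in a single call.
//
//   float a0[4], a1[4], b0[4], b1[4], c0[4], c1[4];
//   const float* A[2] = {a0, a1};
//   const float* B[2] = {b0, b1};
//   float* C[2] = {c0, c1};
//   GemmBatched<float, CPUContext>(
//       CblasNoTrans, CblasNoTrans, /* batch_size */ 2, 2, 2, 2,
//       1.0f, A, B, 0.0f, C, &ctx);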
template <>
void GemmStridedBatched<float, CPUContext>(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const int A_stride,
const float* B,
const int B_stride,
const float beta,
float* C,
const int C_stride,
CPUContext* context,
TensorProto::DataType /* math_type */) {
#ifdef CAFFE2_USE_MKL
(void)context;
const int lda = (trans_A == CblasNoTrans) ? K : M;
const int ldb = (trans_B == CblasNoTrans) ? N : K;
const int ldc = N;
std::vector<const float*> A_array(batch_size);
std::vector<const float*> B_array(batch_size);
std::vector<float*> C_array(batch_size);
for (int i = 0; i < batch_size; ++i) {
A_array[i] = A + i * A_stride;
B_array[i] = B + i * B_stride;
C_array[i] = C + i * C_stride;
}
cblas_sgemm_batch(
CblasRowMajor,
&trans_A,
&trans_B,
&M,
&N,
&K,
&alpha,
A_array.data(),
&lda,
B_array.data(),
&ldb,
&beta,
C_array.data(),
&ldc,
1,
&batch_size);
#else // CAFFE2_USE_MKL
// loop over matrices in the batch
for (int i = 0; i < batch_size; ++i) {
math::Gemm<float, CPUContext>(
trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context);
A += A_stride;
B += B_stride;
C += C_stride;
}
#endif
}
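// Note: GemmStridedBatched is the packed-buffer variant of GemmBatched;
// batch i reads A + i * A_stride and B + i * B_stride and writes
// C + i * C_stride, so a fully packed layout uses A_stride = M * K,
// B_stride = K * N, and C_stride = M * N.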
////////////////////////////////////////////////////////////////////////////////
// MKL VML alternatives.
// Depending on whether we are using MKL, the VML-related Caffe math
// functions are delegated to either the VML call or the Eigen
// implementation. If the compiler flags (such as AVX) are set correctly
// for your CPU architecture, Eigen usually delivers throughput comparable
// to the VML functions.
////////////////////////////////////////////////////////////////////////////////
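// Example usage (a sketch, assuming a CPUContext named `ctx`): every unary
// delegate in this section has the same call shape, e.g. for Exp:
//
//   float x[4] = {0.0f, 1.0f, 2.0f, 3.0f};
//   float y[4];
//   Exp<float, CPUContext>(4, x, y, &ctx);  // y[i] == std::exp(x[i])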
#ifdef CAFFE2_USE_MKL
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) \
template <> \
void Funcname<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
OriginalFunc(N, x, y, ##__VA_ARGS__); \
}
DELEGATE_SIMPLE_UNARY_FUNCTION(
float,
Exp,
vmsExp,
VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
DELEGATE_SIMPLE_UNARY_FUNCTION(
double,
Exp,
vmdExp,
VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, vsLn)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, vdLn)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, vsCos)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, vdCos)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Acos, vsAcos)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Acos, vdAcos)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, vsSin)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, vdSin)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Asin, vsAsin)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Asin, vdAsin)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tan, vsTan)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tan, vdTan)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Atan, vsAtan)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Atan, vdAtan)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sinh, vsSinh)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sinh, vdSinh)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cosh, vsCosh)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cosh, vdCosh)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tanh, vsTanh)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tanh, vdTanh)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, vsAbs)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Abs, vdAbs)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, vsSqr)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, vdSqr)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, vsSqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, vdSqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Rsqrt, vsInvSqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, vdInvSqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cbrt, vsCbrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cbrt, vdCbrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Inv, vsInv)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, vdInv)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
#define DELEGATE_SINCOS_FUNCTION(T, OriginalFunc) \
template <> \
void SinCos<T, CPUContext>( \
const int N, const T* a, T* ys, T* yc, CPUContext*) { \
OriginalFunc(N, a, ys, yc); \
}
DELEGATE_SINCOS_FUNCTION(float, vsSinCos)
DELEGATE_SINCOS_FUNCTION(double, vdSinCos)
#undef DELEGATE_SINCOS_FUNCTION
#define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \
template <> \
void Powx<T, CPUContext>(const int N, const T* a, T b, T* y, CPUContext*) { \
OriginalFunc(N, a, b, y); \
}
DELEGATE_POWX_FUNCTION(float, vsPowx)
DELEGATE_POWX_FUNCTION(double, vdPowx)
#undef DELEGATE_POWX_FUNCTION
#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, FuncImpl) \
template <> \
void Func<T, CPUContext>( \
const int N, const T* A, const T* B, T* C, CPUContext*) { \
FuncImpl(N, A, B, C); \
}
DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd)
DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub)
DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul)
DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv)
#undef DELEGATE_SIMPLE_BINARY_FUNCTION
#else // CAFFE2_USE_MKL
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \
template <> \
void Funcname<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(x, N).expr(); \
}
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, log)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, cos)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, cos)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Acos, acos)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Acos, acos)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, sin)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, sin)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Asin, asin)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Asin, asin)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tan, tan)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tan, tan)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Atan, atan)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Atan, atan)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, square)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, sqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, sqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Rsqrt, rsqrt)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, rsqrt)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
#define DELEGATE_SINCOS_FUNCTION(T) \
template <> \
void SinCos<T, CPUContext>( \
const int N, const T* x, T* ys, T* yc, CPUContext*) { \
EigenVectorMap<T>(ys, N) = ConstEigenVectorArrayMap<T>(x, N).sin(); \
EigenVectorMap<T>(yc, N) = ConstEigenVectorArrayMap<T>(x, N).cos(); \
}
DELEGATE_SINCOS_FUNCTION(float)
DELEGATE_SINCOS_FUNCTION(double)
#undef DELEGATE_SINCOS_FUNCTION
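// Tanh is computed via the identity
//   tanh(x) = 1 - 2 / (exp(2 * x) + 1),
// which needs a single exp() per element and approaches +/-1 for large |x|,
// whereas the naive (e^x - e^-x) / (e^x + e^-x) form yields inf / inf = NaN
// once exp(x) overflows.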
#define DELEGATE_TANH_FUNCTION(T) \
template <> \
void Tanh<T, CPUContext>(const int N, const T* X, T* Y, CPUContext*) { \
EigenVectorMap<T>(Y, N) = T(1) - \
((ConstEigenVectorArrayMap<T>(X, N) * T(2)).exp() + T(1)).inverse() * \
T(2); \
}
DELEGATE_TANH_FUNCTION(float)
DELEGATE_TANH_FUNCTION(double)
#undef DELEGATE_TANH_FUNCTION
#define DELEGATE_CBRT_FUNCTION(T) \
template <> \
void Cbrt<T, CPUContext>(const int N, const T* X, T* Y, CPUContext*) { \
    std::transform(X, X + N, Y, [](const T x) { return std::cbrt(x); }); \
}
DELEGATE_CBRT_FUNCTION(float)
DELEGATE_CBRT_FUNCTION(double)
#undef DELEGATE_CBRT_FUNCTION
#define DELEGATE_POWX_FUNCTION(T) \
template <> \
void Powx<T, CPUContext>( \
const int N, const T* a, const T b, T* y, CPUContext*) { \
EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(a, N).pow(b); \
}
DELEGATE_POWX_FUNCTION(float)
#undef DELEGATE_POWX_FUNCTION
#define DELEGATE_SINH_FUNCTION(T) \
template <> \
void Sinh<T, CPUContext>(const int N, const T* X, T* Y, CPUContext*) { \
ConstEigenVectorArrayMap<T> X_arr(X, N); \
EigenVectorMap<T>(Y, N) = (X_arr.exp() - (-X_arr).exp()) / 2; \
}
DELEGATE_SINH_FUNCTION(float)
DELEGATE_SINH_FUNCTION(double)
#undef DELEGATE_SINH_FUNCTION
#define DELEGATE_COSH_FUNCTION(T) \
template <> \
void Cosh<T, CPUContext>(const int N, const T* X, T* Y, CPUContext*) { \
ConstEigenVectorArrayMap<T> X_arr(X, N); \
EigenVectorMap<T>(Y, N) = (X_arr.exp() + (-X_arr).exp()) / 2; \
}
DELEGATE_COSH_FUNCTION(float)
DELEGATE_COSH_FUNCTION(double)
#undef DELEGATE_COSH_FUNCTION
#define DELEGATE_INV_FUNCTION(T) \
template <> \
void Inv<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(x, N).inverse(); \
}
DELEGATE_INV_FUNCTION(float)
DELEGATE_INV_FUNCTION(double)
#undef DELEGATE_INV_FUNCTION
#endif // CAFFE2_USE_MKL
#define DELEGATE_NEG_FUNCTION(T) \
template <> \
void Neg<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
EigenVectorMap<T>(y, N) = -ConstEigenVectorMap<T>(x, N); \
}
DELEGATE_NEG_FUNCTION(float)
DELEGATE_NEG_FUNCTION(double)
DELEGATE_NEG_FUNCTION(std::int32_t)
DELEGATE_NEG_FUNCTION(std::int64_t)
#undef DELEGATE_NEG_FUNCTION
#define DELEGATE_SIGN_FUNCTION(T) \
template <> \
void Sign<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(x, N).sign(); \
}
DELEGATE_SIGN_FUNCTION(float)
DELEGATE_SIGN_FUNCTION(double)
DELEGATE_SIGN_FUNCTION(std::int32_t)
DELEGATE_SIGN_FUNCTION(std::int64_t)
#undef DELEGATE_SIGN_FUNCTION
#define DELEGATE_ABS_FUNCTION(T) \
template <> \
void Abs<T, CPUContext>(const int N, const T* x, T* y, CPUContext*) { \
EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(x, N).abs(); \
}
#ifndef CAFFE2_USE_MKL
DELEGATE_ABS_FUNCTION(float)
DELEGATE_ABS_FUNCTION(double)
#endif // CAFFE2_USE_MKL
DELEGATE_ABS_FUNCTION(std::int32_t)
DELEGATE_ABS_FUNCTION(std::int64_t)
#undef DELEGATE_ABS_FUNCTION
#define DELEGATE_CUBE_FUNCTION(T) \
template <> \
void Cube<T, CPUContext>(const int N, const T* X, T* Y, CPUContext*) { \
EigenVectorMap<T>(Y, N) = ConstEigenVectorArrayMap<T>(X, N).cube(); \
}
DELEGATE_CUBE_FUNCTION(float)
DELEGATE_CUBE_FUNCTION(double)
DELEGATE_CUBE_FUNCTION(std::int32_t)
DELEGATE_CUBE_FUNCTION(std::int64_t)
#undef DELEGATE_CUBE_FUNCTION
#define EIGEN_SIMPLE_BINARY_FUNCTION(T, Func, expr) \
template <> \
void Func<T, CPUContext>( \
const int N, const T* A, const T* B, T* C, CPUContext*) { \
EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \
expr ConstEigenVectorArrayMap<T>(B, N); \
}
#ifdef CAFFE2_USE_MKL
#define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr)
#else
#define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(float, Func, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(double, Func, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \
EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr)
#endif
DEFINE_SIMPLE_BINARY_FUNCTION(Add, +)
DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -)
DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *)
DEFINE_SIMPLE_BINARY_FUNCTION(Div, /)
#undef DEFINE_SIMPLE_BINARY_FUNCTION
#undef EIGEN_SIMPLE_BINARY_FUNCTION
////////////////////////////////////////////////////////////////////////////////
// Common math functions used in Caffe2 that do not have a BLAS or MKL
// equivalent. These functions are implemented either via Eigen or via
// custom code.
////////////////////////////////////////////////////////////////////////////////
#define CAFFE2_SPECIALIZED_SET(T) \
template <> \
void Set<T, CPUContext>(const size_t N, const T alpha, T* Y, CPUContext*) { \
if (N == 0) { \
return; \
} \
if (alpha == (T)0) { \
if (Y != nullptr) { \
std::memset(Y, 0, N * sizeof(T)); \
} \
} else { \
EigenVectorMap<T>(Y, N).setConstant(alpha); \
} \
}
CAFFE2_SPECIALIZED_SET(float);
CAFFE2_SPECIALIZED_SET(double);
CAFFE2_SPECIALIZED_SET(int8_t);
CAFFE2_SPECIALIZED_SET(int16_t);
CAFFE2_SPECIALIZED_SET(int);
CAFFE2_SPECIALIZED_SET(int64_t);
CAFFE2_SPECIALIZED_SET(bool);
CAFFE2_SPECIALIZED_SET(char);
CAFFE2_SPECIALIZED_SET(uint8_t);
CAFFE2_SPECIALIZED_SET(uint16_t);
#undef CAFFE2_SPECIALIZED_SET
#define CAFFE2_SPECIALIZED_REDUCEMIN(T) \
template <> \
void ReduceMin<T, CPUContext>( \
const int N, \
const T* x, \
T* y, \
Tensor* /*scratch_ptr*/, \
CPUContext* /*context*/) { \
*y = ConstEigenVectorArrayMap<T>(x, N).minCoeff(); \
}
CAFFE2_SPECIALIZED_REDUCEMIN(float)
#undef CAFFE2_SPECIALIZED_REDUCEMIN
#define CAFFE2_SPECIALIZED_REDUCEMAX(T) \
template <> \
void ReduceMax<T, CPUContext>( \
const int N, \
const T* x, \
T* y, \
Tensor* /*scratch_ptr*/, \
CPUContext* /*context*/) { \
*y = ConstEigenVectorArrayMap<T>(x, N).maxCoeff(); \
}
CAFFE2_SPECIALIZED_REDUCEMAX(float)
CAFFE2_SPECIALIZED_REDUCEMAX(int32_t)
CAFFE2_SPECIALIZED_REDUCEMAX(int64_t)
#undef CAFFE2_SPECIALIZED_REDUCEMAX
namespace {
template <typename T>
struct MinFunctor {
inline T operator()(const T a, const T b) const {
return std::min(a, b);
}
};
template <typename T>
struct MaxFunctor {
inline T operator()(const T a, const T b) const {
return std::max(a, b);
}
};
template <typename T>
struct L1NormFunctor {
inline T operator()(const T a, const T b) const {
return a + std::abs(b);
}
};
template <typename T>
struct SquaredL2NormFunctor {
inline T operator()(const T a, const T b) const {
return a + b * b;
}
};
#define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp) \
template <typename T> \
void Rowwise##Func( \
const int rows, const int cols, const T alpha, const T* X, T* Y) { \
EigenVectorMap<T>(Y, rows) = \
ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenOp() * alpha; \
}
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_ROWWISE_REDUCE_FUNCTION
#define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp) \
template <typename T> \
void Colwise##Func( \
const int rows, const int cols, const T alpha, const T* X, T* Y) { \
EigenVectorMap<T>(Y, cols) = \
ConstEigenMatrixMap<T>(X, cols, rows).rowwise().EigenOp() * alpha; \
}
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, minCoeff)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, sum)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMean, mean)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>)
DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm)
#undef DELEGATE_COLWISE_REDUCE_FUNCTION
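// The BothEndsReduce* helpers reduce a tensor viewed as a (pre, mid, nxt)
// volume over both the outer "pre" and inner "nxt" dimensions, leaving a
// vector of length "mid" in Y.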
template <typename T>
void BothEndsReduceMin(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().minCoeff();
const T* X_ptr = X + mid * nxt;
  // Eigen's Array min appears to have a bug that prevents in-place
  // accumulation, so unlike BothEndsReduceSum below, the remaining slices
  // are folded in with an explicit std::min loop.
for (int i = 1; i < pre; ++i) {
for (int j = 0; j < mid; ++j) {
Y[j] = std::min(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).minCoeff());
X_ptr += nxt;
}
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
void BothEndsReduceMax(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().maxCoeff();
const T* X_ptr = X + mid * nxt;
for (int i = 1; i < pre; ++i) {
for (int j = 0; j < mid; ++j) {
Y[j] = std::max(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).maxCoeff());
X_ptr += nxt;
}
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
void BothEndsReduceSum(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().sum();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().sum();
X_ptr += stride;
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
void BothEndsReduceMean(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().mean();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().mean();
X_ptr += stride;
}
if (alpha / static_cast<T>(pre) != 1) {
Y_arr *= alpha / static_cast<T>(pre);
}
}
template <typename T>
void BothEndsReduceL1(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid)
.colwise()
.template lpNorm<1>()
.array();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr += ConstEigenMatrixMap<T>(X_ptr, nxt, mid)
.colwise()
.template lpNorm<1>()
.array();
X_ptr += stride;
}
if (alpha != T(1)) {
Y_arr *= alpha;
}
}
template <typename T>
void BothEndsReduceL2(
const int pre,
const int mid,
const int nxt,
const T alpha,
const T* X,
T* Y) {
EigenVectorArrayMap<T> Y_arr(Y, mid);
Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid).colwise().squaredNorm().array();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
Y_arr +=
ConstEigenMatrixMap<T>(X_ptr, nxt, mid).colwise().squaredNorm().array();
X_ptr += stride;
}
Y_arr = Y_arr.sqrt() * alpha;
}
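// Generic fallback for reductions with no specialized shape: walk X
// linearly while tracking its multi-dimensional index, map each position to
// the corresponding (reduced) index in Y, and accumulate with the supplied
// reducer.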
template <typename T, class Reducer>
void ReduceTensor(
const int ndim,
const int* X_dims,
const int* Y_dims,
const Reducer& reducer,
const T init,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
const int X_size =
std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>());
Set<T, CPUContext>(Y_size, init, Y, context);
std::vector<int> index(ndim, 0);
for (int X_index = 0; X_index < X_size; ++X_index) {
const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data());
Y[Y_index] = reducer(Y[Y_index], X[X_index]);
utils::IncreaseIndexInDims(ndim, X_dims, index.data());
}
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
} // namespace
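// The reduce functions below dispatch on the shape of the reduction:
// identity (a pure scale), rowwise, colwise, and both-ends cases each take
// a specialized Eigen path, and only the general case falls back to the
// index-walking ReduceTensor.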
#define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm) \
template <> \
void Func<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, alpha * init, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
if (is_norm) { \
Abs<T, CPUContext>(X_size, X, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
} else { \
Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \
} \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
Rowwise##Func<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
Colwise##Func<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEnds##Func<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
ReduceTensor( \
num_dims, X_dims, Y_dims, reducer, init, alpha, X, Y, context); \
}
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMin,
MinFunctor<float>(),
std::numeric_limits<float>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMin,
MinFunctor<double>(),
std::numeric_limits<double>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMin,
MinFunctor<std::int32_t>(),
std::numeric_limits<std::int32_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMin,
MinFunctor<std::int64_t>(),
std::numeric_limits<std::int64_t>::max(),
false)
DELEGATE_REDUCE_FUNCTION(
float,
ReduceMax,
MaxFunctor<float>(),
std::numeric_limits<float>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
double,
ReduceMax,
MaxFunctor<double>(),
std::numeric_limits<double>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceMax,
MaxFunctor<std::int32_t>(),
std::numeric_limits<std::int32_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceMax,
MaxFunctor<std::int64_t>(),
std::numeric_limits<std::int64_t>::lowest(),
false)
DELEGATE_REDUCE_FUNCTION(float, ReduceSum, std::plus<float>(), 0.0f, false)
DELEGATE_REDUCE_FUNCTION(double, ReduceSum, std::plus<double>(), 0.0, false)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceSum,
std::plus<std::int32_t>(),
0,
false)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceSum,
std::plus<std::int64_t>(),
std::int64_t(0),
false)
DELEGATE_REDUCE_FUNCTION(float, ReduceL1, L1NormFunctor<float>(), 0.0f, true)
DELEGATE_REDUCE_FUNCTION(double, ReduceL1, L1NormFunctor<double>(), 0.0, true)
DELEGATE_REDUCE_FUNCTION(
std::int32_t,
ReduceL1,
L1NormFunctor<std::int32_t>(),
0,
true)
DELEGATE_REDUCE_FUNCTION(
std::int64_t,
ReduceL1,
L1NormFunctor<std::int64_t>(),
std::int64_t(0),
true)
#undef DELEGATE_REDUCE_FUNCTION
#define CAFFE2_SPECIALIZED_REDUCE_MEAN(T) \
template <> \
void ReduceMean<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
Scale<T, T, CPUContext>(X_size, alpha, X, Y, context); \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
RowwiseReduceMean<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
ColwiseReduceMean<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEndsReduceMean<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
const int scale = X_size / Y_size; \
ReduceTensor( \
num_dims, \
X_dims, \
Y_dims, \
std::plus<T>(), \
T(0), \
alpha / static_cast<T>(scale), \
X, \
Y, \
context); \
}
CAFFE2_SPECIALIZED_REDUCE_MEAN(float)
CAFFE2_SPECIALIZED_REDUCE_MEAN(double)
#undef CAFFE2_SPECIALIZED_REDUCE_MEAN
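// Example usage (a sketch, assuming a CPUContext named `ctx`): reducing a
// 2 x 3 tensor over axis 1 yields the per-row means.
//
//   const int dims[2] = {2, 3};
//   const int axes[1] = {1};
//   float X[6] = {1, 2, 3, 4, 5, 6};
//   float Y[2];
//   ReduceMean<float, CPUContext>(2, dims, 1, axes, 1.0f, X, Y, &ctx);
//   // Y == {2, 5}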
#define CAFFE2_SPECIALIZED_REDUCE_L2(T) \
template <> \
void ReduceL2<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
CAFFE_ENFORCE_LE(num_axes, num_dims); \
std::vector<int> Y_dims_vector(dims, dims + num_dims); \
for (int i = 0; i < num_axes; ++i) { \
Y_dims_vector[axes[i]] = 1; \
} \
const int* X_dims = dims; \
const int* Y_dims = Y_dims_vector.data(); \
const int X_size = \
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \
const int Y_size = \
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \
if (X_size == 0) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (alpha == T(0)) { \
Set<T, CPUContext>(Y_size, 0, Y, context); \
return; \
} \
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \
Abs<T, CPUContext>(X_size, X, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
return; \
} \
int rows; \
int cols; \
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
RowwiseReduceL2<T>(rows, cols, alpha, X, Y); \
return; \
} \
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \
ColwiseReduceL2<T>(rows, cols, alpha, X, Y); \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \
BothEndsReduceL2<T>(pre, mid, nxt, alpha, X, Y); \
return; \
} \
ReduceTensor( \
num_dims, \
X_dims, \
Y_dims, \
SquaredL2NormFunctor<T>(), \
T(0), \
T(1), \
X, \
Y, \
context); \
Sqrt<T, CPUContext>(Y_size, Y, Y, context); \
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \
}
CAFFE2_SPECIALIZED_REDUCE_L2(float)
CAFFE2_SPECIALIZED_REDUCE_L2(double)
#undef CAFFE2_SPECIALIZED_REDUCE_L2
namespace {
template <typename T>
void BroadcastImpl(
const int X_ndim,
const int* X_dims,
const int Y_ndim,
const int* Y_dims,
const T alpha,
const T* X,
T* Y,
CPUContext* context) {
CAFFE_ENFORCE_LE(X_ndim, Y_ndim);
std::vector<int> X_dims_vector(Y_ndim);
const int d = Y_ndim - X_ndim;
std::fill(X_dims_vector.begin(), X_dims_vector.begin() + d, 1);
for (int i = d; i < Y_ndim; ++i) {
CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]);
X_dims_vector[i] = X_dims[i - d];
}
X_dims = X_dims_vector.data();
const int Y_size =
std::accumulate(Y_dims, Y_dims + Y_ndim, 1, std::multiplies<int>());
std::vector<int> index(Y_ndim, 0);
for (int Y_index = 0; Y_index < Y_size; ++Y_index) {
const int X_index = utils::GetIndexFromDims(Y_ndim, X_dims, index.data());
Y[Y_index] = X[X_index];
utils::IncreaseIndexInDims(Y_ndim, Y_dims, index.data());
}
Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context);
}
} // namespace
#define CAFFE2_SPECIALIZED_BROADCAST(T) \
template <> \
void Broadcast<T, CPUContext>( \
const int X_ndim, \
const int* X_dims, \
const int Y_ndim, \
const int* Y_dims, \
const T alpha, \
const T* X, \
T* Y, \
CPUContext* context) { \
BroadcastImpl<T>(X_ndim, X_dims, Y_ndim, Y_dims, alpha, X, Y, context); \
}
CAFFE2_SPECIALIZED_BROADCAST(std::int32_t)
CAFFE2_SPECIALIZED_BROADCAST(std::int64_t)
CAFFE2_SPECIALIZED_BROADCAST(float)
CAFFE2_SPECIALIZED_BROADCAST(double)
#undef CAFFE2_SPECIALIZED_BROADCAST
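// Example usage (a sketch, assuming a CPUContext named `ctx`): broadcasting
// a length-3 vector into a 2 x 3 matrix repeats it along the new axis.
//
//   const int X_dims[1] = {3};
//   const int Y_dims[2] = {2, 3};
//   float X[3] = {1, 2, 3};
//   float Y[6];
//   Broadcast<float, CPUContext>(1, X_dims, 2, Y_dims, 1.0f, X, Y, &ctx);
//   // Y == {1, 2, 3, 1, 2, 3}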
namespace {
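// The moment helpers below compute the mean and the (biased) variance in a
// single pass using the identity Var[X] = E[X^2] - (E[X])^2.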
template <typename T>
void RowwiseMoments(
const int rows,
const int cols,
const T* X,
T* mean,
T* variance) {
ConstEigenArrayMap<T> X_mat(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, rows);
EigenVectorArrayMap<T> variance_arr(variance, rows);
mean_arr = X_mat.colwise().mean();
variance_arr = X_mat.array().square().colwise().mean();
variance_arr -= mean_arr.square();
}
template <typename T>
void ColwiseMoments(
const int rows,
const int cols,
const T* X,
T* mean,
T* variance) {
ConstEigenArrayMap<T> X_mat(X, cols, rows);
EigenVectorArrayMap<T> mean_arr(mean, cols);
EigenVectorArrayMap<T> variance_arr(variance, cols);
mean_arr = X_mat.rowwise().mean();
variance_arr = X_mat.array().square().rowwise().mean();
variance_arr -= mean_arr.square();
}
template <typename T>
void BothEndsMoments(
const int pre,
const int mid,
const int nxt,
const T* X,
T* mean,
T* variance) {
EigenVectorArrayMap<T> mean_arr(mean, mid);
EigenVectorArrayMap<T> variance_arr(variance, mid);
mean_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().mean();
variance_arr = ConstEigenArrayMap<T>(X, nxt, mid).square().colwise().mean();
const int stride = mid * nxt;
const T* X_ptr = X + stride;
for (int i = 1; i < pre; ++i) {
mean_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().mean();
variance_arr +=
ConstEigenArrayMap<T>(X_ptr, nxt, mid).square().colwise().mean();
X_ptr += stride;
}
if (pre > 1) {
mean_arr /= static_cast<T>(pre);
variance_arr /= static_cast<T>(pre);
}
variance_arr -= mean_arr.square();
}
template <typename T>
void MomentsImpl(
const int num_dims,
const int* dims,
const int num_axes,
const int* axes,
const T* X,
T* mean,
T* variance,
CPUContext* context) {
std::vector<int> Y_dims_vector(dims, dims + num_dims);
for (int i = 0; i < num_axes; ++i) {
Y_dims_vector[axes[i]] = 1;
}
const int* X_dims = dims;
const int* Y_dims = Y_dims_vector.data();
const int X_size =
std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>());
const int Y_size =
std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>());
if (X_size == 0) {
if (Y_size > 0) {
memset(mean, 0, sizeof(T) * Y_size);
memset(variance, 0, sizeof(T) * Y_size);
}
return;
}
if (std::equal(X_dims, X_dims + num_dims, Y_dims)) {
memcpy(mean, X, sizeof(T) * Y_size);
memset(variance, 0, sizeof(T) * Y_size);
return;
}
int rows;
int cols;
if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
RowwiseMoments<T>(rows, cols, X, mean, variance);
return;
}
if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) {
ColwiseMoments<T>(rows, cols, X, mean, variance);
return;
}
int pre;
int mid;
int nxt;
if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) {
BothEndsMoments<T>(pre, mid, nxt, X, mean, variance);
return;
}
Set<T, CPUContext>(Y_size, T(0), mean, context);
Set<T, CPUContext>(Y_size, T(0), variance, context);
std::vector<int> index(num_dims, 0);
for (int X_index = 0; X_index < X_size; ++X_index) {
const int Y_index = utils::GetIndexFromDims(num_dims, Y_dims, index.data());
mean[Y_index] += X[X_index];
variance[Y_index] += X[X_index] * X[X_index];
utils::IncreaseIndexInDims(num_dims, dims, index.data());
}
const T scale = static_cast<T>(Y_size) / static_cast<T>(X_size);
Scale<T, T, CPUContext>(Y_size, scale, mean, mean, context);
EigenVectorArrayMap<T> variance_arr(variance, Y_size);
variance_arr =
variance_arr * scale - ConstEigenVectorArrayMap<T>(mean, Y_size).square();
}
} // namespace
#define CAFFE2_SPECIALIZED_MOMENTS(T) \
template <> \
void Moments<T, CPUContext>( \
const int num_dims, \
const int* dims, \
const int num_axes, \
const int* axes, \
const T* X, \
T* mean, \
T* variance, \
CPUContext* context) { \
MomentsImpl<T>( \
num_dims, dims, num_axes, axes, X, mean, variance, context); \
}
CAFFE2_SPECIALIZED_MOMENTS(float)
#undef CAFFE2_SPECIALIZED_MOMENTS
#define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \
template <> \
void RowwiseMax<T, CPUContext>( \
const int N, const int D, const T* x, T* y, CPUContext*) { \
EigenVectorMap<T>(y, N) = \
ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \
}
CAFFE2_SPECIALIZED_ROWWISEMAX(float)
#undef CAFFE2_SPECIALIZED_ROWWISEMAX
#define CAFFE2_SPECIALIZED_COLWISEMAX(T) \
template <> \
void ColwiseMax<T, CPUContext>( \
const int N, const int D, const T* x, T* y, CPUContext*) { \
EigenVectorMap<T>(y, D) = \
ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \
}
CAFFE2_SPECIALIZED_COLWISEMAX(float)
#undef CAFFE2_SPECIALIZED_COLWISEMAX
#define CAFFE2_SPECIALIZED_ELEMWISEMAX(T) \
template <> \
void ElemwiseMax<T, CPUContext>( \
const int N, const T* x, const T* y, T* z, CPUContext* /*context*/) { \
std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { \
return std::max(x_i, y_i); \
}); \
}
CAFFE2_SPECIALIZED_ELEMWISEMAX(float)
#undef CAFFE2_SPECIALIZED_ELEMWISEMAX
#define CAFFE2_SPECIALIZED_MAXIMUM(T) \
template <> \
void Maximum<T, CPUContext>( \
const int N, const float alpha, const T* x, T* y, CPUContext* context) { \
std::transform( \
x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \
}
CAFFE2_SPECIALIZED_MAXIMUM(float)
#undef CAFFE2_SPECIALIZED_MAXIMUM
// The implementations below use Eigen, which is column-major, so note the
// row/column swap relative to Caffe2's row-major layout.
#define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \
template <> \
void Rowwise##Func<T, CPUContext, true>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
if (C == B) { \
EigenArrayMap<T>(C, cols, rows).colwise() expr## = \
ConstEigenVectorArrayMap<T>(A, cols); \
} else { \
EigenArrayMap<T>(C, cols, rows) = \
ConstEigenArrayMap<T>(B, cols, rows) \
.colwise() expr ConstEigenVectorArrayMap<T>(A, cols); \
} \
} \
template <> \
void Colwise##Func<T, CPUContext, true>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
if (C == B) { \
EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \
ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
} else { \
EigenArrayMap<T>(C, cols, rows) = \
ConstEigenArrayMap<T>(B, cols, rows) \
.rowwise() expr ConstEigenVectorArrayMap<T>(A, rows) \
.transpose(); \
} \
}
#define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \
template <> \
void Rowwise##Func<T, CPUContext, false>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
if (C == A) { \
EigenArrayMap<T>(C, cols, rows).colwise() expr## = \
ConstEigenVectorArrayMap<T>(B, cols); \
} else { \
EigenArrayMap<T>(C, cols, rows) = \
ConstEigenArrayMap<T>(A, cols, rows) \
.colwise() expr ConstEigenVectorArrayMap<T>(B, cols); \
} \
} \
template <> \
void Colwise##Func<T, CPUContext, false>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
if (C == A) { \
EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \
ConstEigenVectorArrayMap<T>(B, rows).transpose(); \
} else { \
EigenArrayMap<T>(C, cols, rows) = \
ConstEigenArrayMap<T>(A, cols, rows) \
.rowwise() expr ConstEigenVectorArrayMap<T>(B, rows) \
.transpose(); \
} \
}
#define DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(T, Func, expr) \
DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr)
#define DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Func, expr) \
DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(float, Func, expr) \
DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(double, Func, expr) \
DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, Func, expr) \
DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, Func, expr)
DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Add, +)
DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *)
#undef DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION
#undef DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION
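// Sub with the first operand broadcast cannot reuse the generic macro above,
// since Eigen's broadcast expressions place the matrix operand on the left;
// A - B is therefore rewritten as (-B) + A.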
#define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \
template <> \
void RowwiseSub<T, CPUContext, true>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
EigenArrayMap<T>(C, cols, rows) = \
(-ConstEigenArrayMap<T>(B, cols, rows)).colwise() + \
ConstEigenVectorArrayMap<T>(A, cols); \
} \
template <> \
void ColwiseSub<T, CPUContext, true>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
EigenArrayMap<T>(C, cols, rows) = \
(-ConstEigenArrayMap<T>(B, cols, rows)).rowwise() + \
ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
} \
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Sub, -)
DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(float)
DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(double)
DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int32_t)
DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t)
#undef DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION
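// Likewise for Div with the first operand broadcast: A / B is computed as
// B.inverse() * A, i.e. an elementwise reciprocal followed by a broadcast
// multiply. This only makes sense for floating-point types, so the integer
// instantiations further below use an explicit division loop instead.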
#define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \
template <> \
void RowwiseDiv<T, CPUContext, true>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
EigenArrayMap<T>(C, cols, rows) = \
ConstEigenArrayMap<T>(B, cols, rows).inverse().colwise() * \
ConstEigenVectorArrayMap<T>(A, cols); \
} \
template <> \
void ColwiseDiv<T, CPUContext, true>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
EigenArrayMap<T>(C, cols, rows) = \
ConstEigenArrayMap<T>(B, cols, rows).inverse().rowwise() * \
ConstEigenVectorArrayMap<T>(A, rows).transpose(); \
} \
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Div, /)
DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(float)
DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(double)
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int32_t, Div, /)
DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /)
#undef DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION
#undef DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION
#undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION
template <>
void Not<bool, CPUContext>(
const int N,
const bool* x,
bool* y,
CPUContext* /*context*/) {
for (int i = 0; i < N; ++i) {
y[i] = !x[i];
}
}
#define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \
template <> \
void AddStripedBatch( \
const int N, \
const T* first, \
T* y, \
const int stripe, \
const int batch, \
CPUContext* context) { \
for (int j = 0; j < batch; j++) { \
Add<T, CPUContext>(N, first + j * stripe, y, y, context); \
} \
}
CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float);
#undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH
namespace {
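// RowwiseBinaryOp and ColwiseBinaryOp compute C = op(A, B) where one
// operand is a full rows x cols matrix and the other is a vector broadcast
// along rows or columns; kBroadcast1st selects whether A (true) or B
// (false) is the broadcast operand. BroadcastBinaryOpImpl handles general
// numpy-style broadcasting by walking the output index.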
template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st>
void RowwiseBinaryOp(
const int rows,
const int cols,
const BinaryOperator& op,
const TIn* A,
const TIn* B,
TOut* C) {
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
const int C_index = i * cols + j;
const int A_index = kBroadcast1st ? j : C_index;
const int B_index = kBroadcast1st ? C_index : j;
C[C_index] = op(A[A_index], B[B_index]);
}
}
}
template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st>
void ColwiseBinaryOp(
const int rows,
const int cols,
const BinaryOperator& op,
const TIn* A,
const TIn* B,
TOut* C) {
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
const int C_index = i * cols + j;
const int A_index = kBroadcast1st ? i : C_index;
const int B_index = kBroadcast1st ? C_index : i;
C[C_index] = op(A[A_index], B[B_index]);
}
}
}
template <typename TIn, typename TOut, class BinaryOperator>
void BroadcastBinaryOpImpl(
const int ndim,
const int* A_dims,
const int* B_dims,
const int* C_dims,
const BinaryOperator& op,
const TIn* A,
const TIn* B,
TOut* C) {
std::vector<int> index(ndim, 0);
const int C_size =
std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies<int>());
for (int C_index = 0; C_index < C_size; ++C_index) {
const int A_index = utils::GetIndexFromDims(ndim, A_dims, index.data());
const int B_index = utils::GetIndexFromDims(ndim, B_dims, index.data());
C[C_index] = op(A[A_index], B[B_index]);
utils::IncreaseIndexInDims(ndim, C_dims, index.data());
}
}
} // namespace
#define DELEGATE_1D_BINARY_FUNCTION(TIn, TOut, Func, Op) \
template <> \
void Func<TIn, CPUContext>( \
const int N, const TIn* A, const TIn* B, TOut* C, CPUContext*) { \
std::transform(A, A + N, B, C, Op<TIn>()); \
}
#define DEFINE_1D_COMPARE_FUNCTION(Func, Op) \
DELEGATE_1D_BINARY_FUNCTION(float, bool, Func, Op) \
DELEGATE_1D_BINARY_FUNCTION(double, bool, Func, Op) \
DELEGATE_1D_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
DELEGATE_1D_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, Op)
DEFINE_1D_COMPARE_FUNCTION(EQ, std::equal_to)
DEFINE_1D_COMPARE_FUNCTION(NE, std::not_equal_to)
DEFINE_1D_COMPARE_FUNCTION(LT, std::less)
DEFINE_1D_COMPARE_FUNCTION(LE, std::less_equal)
DEFINE_1D_COMPARE_FUNCTION(GT, std::greater)
DEFINE_1D_COMPARE_FUNCTION(GE, std::greater_equal)
#undef DEFINE_1D_COMPARE_FUNCTION
DELEGATE_1D_BINARY_FUNCTION(bool, bool, And, std::logical_and)
DELEGATE_1D_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
DELEGATE_1D_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
#define DEFINE_1D_BITWISE_BINARY_FUNCTION(Func, op) \
DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, op) \
DELEGATE_1D_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, op) \
DELEGATE_1D_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, op)
DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
#undef DEFINE_1D_BITWISE_BINARY_FUNCTION
#undef DELEGATE_1D_BINARY_FUNCTION
#define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \
template <> \
void Rowwise##Func<TIn, CPUContext, true>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CPUContext*) { \
RowwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \
} \
template <> \
void Rowwise##Func<TIn, CPUContext, false>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CPUContext*) { \
RowwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \
rows, cols, Op<TIn>(), A, B, C); \
} \
template <> \
void Colwise##Func<TIn, CPUContext, true>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CPUContext*) { \
ColwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \
} \
template <> \
void Colwise##Func<TIn, CPUContext, false>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CPUContext*) { \
ColwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \
rows, cols, Op<TIn>(), A, B, C); \
}
#define DEFINE_2D_COMPARE_FUNCTION(Func, Op) \
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op)
DEFINE_2D_COMPARE_FUNCTION(EQ, std::equal_to)
DEFINE_2D_COMPARE_FUNCTION(NE, std::not_equal_to)
DEFINE_2D_COMPARE_FUNCTION(LT, std::less)
DEFINE_2D_COMPARE_FUNCTION(LE, std::less_equal)
DEFINE_2D_COMPARE_FUNCTION(GT, std::greater)
DEFINE_2D_COMPARE_FUNCTION(GE, std::greater_equal)
#undef DEFINE_2D_COMPARE_FUNCTION
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and)
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
#define DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
#undef DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION
#undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION
#define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \
template <> \
void RowwiseDiv<T, CPUContext, true>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
RowwiseBinaryOp<T, T, std::divides<T>, true>( \
rows, cols, std::divides<T>(), A, B, C); \
} \
template <> \
void ColwiseDiv<T, CPUContext, true>( \
const int rows, \
const int cols, \
const T* A, \
const T* B, \
T* C, \
CPUContext*) { \
ColwiseBinaryOp<T, T, std::divides<T>, true>( \
rows, cols, std::divides<T>(), A, B, C); \
}
DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t)
DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t)
#undef DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION
#define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \
template <> \
void Func<TIn, CPUContext>( \
const int A_ndim, \
const int* A_dims, \
const int B_ndim, \
const int* B_dims, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CPUContext* context) { \
const int ndim = std::max(A_ndim, B_ndim); \
std::vector<int> A_dims_array(ndim); \
std::vector<int> B_dims_array(ndim); \
std::vector<int> C_dims_array(ndim); \
utils::ComputeBroadcastBinaryOpDims( \
A_ndim, \
A_dims, \
B_ndim, \
B_dims, \
A_dims_array.data(), \
B_dims_array.data(), \
C_dims_array.data()); \
if (A_dims_array == B_dims_array) { \
const int size = std::accumulate( \
C_dims_array.cbegin(), \
C_dims_array.cend(), \
1, \
std::multiplies<int>()); \
Func<TIn, CPUContext>(size, A, B, C, context); \
return; \
} \
int rows; \
int cols; \
bool broadcast_1st; \
if (utils::IsRowwiseBroadcastBinaryOp( \
ndim, \
A_dims_array.data(), \
B_dims_array.data(), \
&rows, \
&cols, \
&broadcast_1st)) { \
if (broadcast_1st) { \
Rowwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \
} else { \
Rowwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \
} \
return; \
} \
if (utils::IsColwiseBroadcastBinaryOp( \
ndim, \
A_dims_array.data(), \
B_dims_array.data(), \
&rows, \
&cols, \
&broadcast_1st)) { \
if (broadcast_1st) { \
Colwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \
} else { \
Colwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \
} \
return; \
} \
int pre; \
int mid; \
int nxt; \
if (utils::IsBothEndsBroadcastBinaryOp( \
ndim, \
A_dims_array.data(), \
B_dims_array.data(), \
&pre, \
&mid, \
&nxt, \
&broadcast_1st)) { \
const int stride = mid * nxt; \
for (int i = 0; i < pre; ++i) { \
if (broadcast_1st) { \
Colwise##Func<TIn, CPUContext, true>( \
mid, nxt, A, B + i * stride, C + i * stride, context); \
} else { \
Colwise##Func<TIn, CPUContext, false>( \
mid, nxt, A + i * stride, B, C + i * stride, context); \
} \
} \
return; \
} \
BroadcastBinaryOpImpl( \
ndim, \
A_dims_array.data(), \
B_dims_array.data(), \
C_dims_array.data(), \
Op<TIn>(), \
A, \
B, \
C); \
}
#define DEFINE_BROADCAST_COMPARE_FUNCTION(Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op)
DEFINE_BROADCAST_COMPARE_FUNCTION(EQ, std::equal_to)
DEFINE_BROADCAST_COMPARE_FUNCTION(NE, std::not_equal_to)
DEFINE_BROADCAST_COMPARE_FUNCTION(LT, std::less)
DEFINE_BROADCAST_COMPARE_FUNCTION(LE, std::less_equal)
DEFINE_BROADCAST_COMPARE_FUNCTION(GT, std::greater)
DEFINE_BROADCAST_COMPARE_FUNCTION(GE, std::greater_equal)
#undef DEFINE_BROADCAST_COMPARE_FUNCTION
#define DEFINE_BROADCAST_BINARY_FUNCTION(Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(float, float, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(double, double, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
DEFINE_BROADCAST_BINARY_FUNCTION(Add, std::plus)
DEFINE_BROADCAST_BINARY_FUNCTION(Sub, std::minus)
DEFINE_BROADCAST_BINARY_FUNCTION(Mul, std::multiplies)
DEFINE_BROADCAST_BINARY_FUNCTION(Div, std::divides)
#undef DEFINE_BROADCAST_BINARY_FUNCTION
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and)
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or)
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor)
#define DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and)
DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or)
DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor)
#undef DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION
#undef DELEGATE_BROADCAST_BINARY_FUNCTION
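// A minimal usage sketch of the broadcast binary API (illustrative only; the
// buffers A, B, C and the context are assumed to exist):
//
//   const std::array<int, 2> A_dims = {2, 3};
//   const std::array<int, 1> B_dims = {3};
//   // C[i][j] = A[i][j] + B[j]
//   math::Add<float, CPUContext>(
//       2, A_dims.data(), 1, B_dims.data(), A, B, C, &context);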
#define CAFFE2_RAND_UNIFORM_REAL(T) \
template <> \
void RandUniform<T, CPUContext>( \
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
std::uniform_real_distribution<T> distribution(a, b); \
for (size_t i = 0; i < n; ++i) { \
r[i] = distribution(context->RandGenerator()); \
} \
}
CAFFE2_RAND_UNIFORM_REAL(float);
CAFFE2_RAND_UNIFORM_REAL(double);
#undef CAFFE2_RAND_UNIFORM_REAL
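// std::uniform_int_distribution is not specified for char types, so the
// 8-bit specializations below draw from a short distribution and narrow the
// result.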
#define CAFFE2_RAND_UNIFORM_CHAR(T) \
template <> \
void RandUniform<T, CPUContext>( \
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
std::uniform_int_distribution<short> distribution((short)a, (short)b); \
for (size_t i = 0; i < n; ++i) { \
r[i] = static_cast<T>(distribution(context->RandGenerator())); \
} \
}
CAFFE2_RAND_UNIFORM_CHAR(int8_t);
CAFFE2_RAND_UNIFORM_CHAR(uint8_t);
#undef CAFFE2_RAND_UNIFORM_CHAR
#define CAFFE2_RAND_UNIFORM_INT(T) \
template <> \
void RandUniform<T, CPUContext>( \
const size_t n, const T a, const T b, T* r, CPUContext* context) { \
std::uniform_int_distribution<T> distribution(a, b); \
for (size_t i = 0; i < n; ++i) { \
r[i] = distribution(context->RandGenerator()); \
} \
}
CAFFE2_RAND_UNIFORM_INT(int16_t);
CAFFE2_RAND_UNIFORM_INT(int32_t);
CAFFE2_RAND_UNIFORM_INT(int64_t);
CAFFE2_RAND_UNIFORM_INT(uint16_t);
CAFFE2_RAND_UNIFORM_INT(uint32_t);
CAFFE2_RAND_UNIFORM_INT(uint64_t);
#undef CAFFE2_RAND_UNIFORM_INT
// Note: the values produced by RandFixedSum are not uniformly distributed
// between a and b. The implementation draws from a normal distribution so
// that the generated numbers have mean = sum / n, then clamps each draw into
// [a, b]. Ideally the algorithm would generate n numbers between 0 and 1,
// sum them up as scaled_sum, and use sum / scaled_sum to rescale the values
// into [a, b]; that algorithm is non-trivial, given that the adjustment
// would differ for each value.
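// A minimal usage sketch (illustrative only; the context is assumed):
//
//   std::vector<float> r(4);
//   // Four values in [0, 10] that total exactly 20; the last entry absorbs
//   // whatever remainder is left after the first three draws.
//   math::RandFixedSum<float, CPUContext>(
//       4, 0.0f, 10.0f, 20.0f, r.data(), &context);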
#define CAFFE2_RAND_FIXED_SUM(T) \
template <> \
void RandFixedSum<T, CPUContext>( \
const size_t n, \
const T a, \
const T b, \
const T sum, \
T* r, \
CPUContext* context) { \
CAFFE_ENFORCE_GE(a, 0); \
CAFFE_ENFORCE_GE(sum / (double)n, a); \
CAFFE_ENFORCE_LE(sum / (double)n, b); \
T current_sum = 0; \
for (size_t i = 0; i < n - 1; ++i) { \
auto remaining_numbers = n - 1 - i; \
/* remaining_numbers + 1 values (including r[i]) are still unassigned. */ \
double mean = static_cast<double>(sum - current_sum) / (remaining_numbers + 1); \
double stdev = std::min(mean - a, b - mean); \
std::normal_distribution<double> distribution{mean, stdev / 4.0}; \
T value = distribution(context->RandGenerator()); \
auto remaining_sum = sum - current_sum - value; \
/* Clamp out-of-range draws toward feasibility: if the remainder left */ \
/* for the other slots falls below a * remaining_numbers, the draw was */ \
/* too large (snap to a); if it exceeds b * remaining_numbers, the draw */ \
/* was too small (snap to b). */ \
if (value < a || remaining_sum < a * remaining_numbers) { \
value = a; \
} else if (value > b || remaining_sum > b * remaining_numbers) { \
value = b; \
} \
r[i] = value; \
CAFFE_ENFORCE(a <= value && value <= b); \
current_sum += value; \
} \
r[n - 1] = sum - current_sum; \
CAFFE_ENFORCE(a <= r[n - 1] && r[n - 1] <= b); \
}
CAFFE2_RAND_FIXED_SUM(float);
CAFFE2_RAND_FIXED_SUM(double);
CAFFE2_RAND_FIXED_SUM(int8_t);
CAFFE2_RAND_FIXED_SUM(int16_t);
CAFFE2_RAND_FIXED_SUM(int32_t);
CAFFE2_RAND_FIXED_SUM(int64_t);
CAFFE2_RAND_FIXED_SUM(uint8_t);
CAFFE2_RAND_FIXED_SUM(uint16_t);
CAFFE2_RAND_FIXED_SUM(uint32_t);
CAFFE2_RAND_FIXED_SUM(uint64_t);
#undef CAFFE2_RAND_FIXED_SUM
#define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \
template <> \
void RandUniformUnique<T, CPUContext>( \
const size_t n, \
const T a, \
const T b, \
T* r, \
const size_t m, \
const T* avoid, \
CPUContext* context) { \
CAFFE_ENFORCE_LE( \
n, b - a - m + 1, "Cannot satisfy the unique requirement"); \
std::unordered_set<T> avoid_set(n); \
if (m) { \
avoid_set.insert(avoid, avoid + m); \
CAFFE_ENFORCE_EQ(m, avoid_set.size(), "Avoid should be unique"); \
} \
std::uniform_int_distribution<T> distribution(a, b); \
T v = 0; \
for (size_t i = 0; i < n; ++i) { \
do { \
v = distribution(context->RandGenerator()); \
} while (avoid_set.count(v)); \
r[i] = v; \
avoid_set.insert(v); \
} \
}
CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t);
CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t);
#undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE
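// A minimal usage sketch (illustrative only; the context is assumed):
//
//   const int32_t avoid[2] = {3, 7};
//   std::vector<int32_t> r(5);
//   // Five distinct values drawn from [0, 99], none equal to 3 or 7.
//   math::RandUniformUnique<int32_t, CPUContext>(
//       5, 0, 99, r.data(), 2, avoid, &context);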
template <>
void RandGaussian<float, CPUContext>(
const size_t n,
const float mean,
const float std,
float* r,
CPUContext* context) {
std::normal_distribution<float> distribution(mean, std);
for (size_t i = 0; i < n; ++i) {
r[i] = distribution(context->RandGenerator());
}
}
#define CAFFE2_SPECIALIZED_SUM(T) \
template <> \
void Sum<T, CPUContext>( \
const int N, \
const T* x, \
T* y, \
CPUContext* /* unused */, \
Tensor* /* unused */) { \
*y = ConstEigenVectorMap<T>(x, N).sum(); \
}
CAFFE2_SPECIALIZED_SUM(float);
CAFFE2_SPECIALIZED_SUM(int32_t);
CAFFE2_SPECIALIZED_SUM(int64_t);
#undef CAFFE2_SPECIALIZED_SUM
template <>
void SumSqr<float, CPUContext>(
const int N,
const float* x,
float* y,
CPUContext* /* unused */,
Tensor* /* unused */) {
*y = ConstEigenVectorMap<float>(x, N).squaredNorm();
}
template <>
void Select<float, CPUContext>(
const int N,
const int D,
const float* x,
const int* idx,
float* y,
CPUContext* /*context*/) {
for (int i = 0; i < N; ++i) {
DCHECK_LT(idx[i], D);
y[i] = x[i * D + idx[i]];
}
}
template <>
void CopyMatrix<CPUContext>(
const size_t itemsize,
const int M,
const int N,
const void* A,
const int lda,
void* B,
const int ldb,
CPUContext* /*context*/,
TypeMeta::TypedCopy copy) {
if (A == nullptr || B == nullptr) {
return;
}
if (lda == N && ldb == N) {
// The rows are contiguous, so we can coalesce into a single copy of all
// M * N elements.
if (copy) {
copy(static_cast<const char*>(A), static_cast<char*>(B), N * M);
} else {
memcpy(
static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M);
}
return;
}
for (int i = 0; i < M; ++i) {
if (copy) {
copy(
static_cast<const char*>(A) + lda * i * itemsize,
static_cast<char*>(B) + ldb * i * itemsize,
N);
} else {
memcpy(
static_cast<char*>(B) + ldb * i * itemsize,
static_cast<const char*>(A) + lda * i * itemsize,
itemsize * N);
}
}
}
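// A minimal usage sketch (illustrative; src and dst are assumed to be float
// buffers): copy the 2 x 3 block whose top-left corner is element (1, 1) out
// of a row-major 4 x 5 matrix into a dense 2 x 3 destination.
//
//   CopyMatrix(
//       sizeof(float), 2, 3, src + 1 * 5 + 1, 5, dst, 3, &context, nullptr);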
#ifdef CAFFE2_USE_MKL
#define DELEGATE_COPY_MATRIX_FUNCTION(T, Func) \
template <> \
void CopyMatrix<T, CPUContext>( \
const int M, \
const int N, \
const T* A, \
const int lda, \
T* B, \
const int ldb, \
CPUContext* /* context */) { \
Func('R', 'N', M, N, T(1), A, lda, B, ldb); \
} \
template <> \
void CopyMatrix<T, CPUContext>( \
const int M, \
const int N, \
const T* A, \
const int A_outer_stride, \
const int A_inner_stride, \
T* B, \
const int B_outer_stride, \
const int B_inner_stride, \
CPUContext* /* context */) { \
Func##2( \
'R', \
'N', \
M, \
N, \
T(1), \
A, \
A_outer_stride, \
A_inner_stride, \
B, \
B_outer_stride, \
B_inner_stride); \
}
DELEGATE_COPY_MATRIX_FUNCTION(float, mkl_somatcopy)
DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy)
#undef DELEGATE_COPY_MATRIX_FUNCTION
#endif // CAFFE2_USE_MKL
#define CAFFE2_SPECIALIZED_COPY_MATRIX(T) \
template <> \
void CopyMatrix<T, CPUContext>( \
const int M, \
const int N, \
const T* A, \
const int lda, \
T* B, \
const int ldb, \
CPUContext* /* context */) { \
if (M == 0 || N == 0) { \
return; \
} \
if (lda == N) { \
if (ldb == N) { \
std::memcpy(B, A, sizeof(T) * M * N); \
} else { \
EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \
ConstEigenMatrixMap<T>(A, N, M); \
} \
} else { \
if (ldb == N) { \
EigenMatrixMap<T>(B, N, M) = ConstEigenOuterStridedMatrixMap<T>( \
A, N, M, EigenOuterStride(lda)); \
} else { \
EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \
ConstEigenOuterStridedMatrixMap<T>( \
A, N, M, EigenOuterStride(lda)); \
} \
} \
} \
template <> \
void CopyMatrix<T, CPUContext>( \
const int M, \
const int N, \
const T* A, \
const int A_outer_stride, \
const int A_inner_stride, \
T* B, \
const int B_outer_stride, \
const int B_inner_stride, \
CPUContext* context) { \
if (A_inner_stride == 1 && B_inner_stride == 1) { \
CopyMatrix<T, CPUContext>( \
M, N, A, A_outer_stride, B, B_outer_stride, context); \
return; \
} \
EigenStridedMatrixMap<T>( \
B, N, M, EigenStride(B_outer_stride, B_inner_stride)) = \
ConstEigenStridedMatrixMap<T>( \
A, N, M, EigenStride(A_outer_stride, A_inner_stride)); \
}
#ifndef CAFFE2_USE_MKL
CAFFE2_SPECIALIZED_COPY_MATRIX(float)
CAFFE2_SPECIALIZED_COPY_MATRIX(double)
#endif // CAFFE2_USE_MKL
CAFFE2_SPECIALIZED_COPY_MATRIX(int)
CAFFE2_SPECIALIZED_COPY_MATRIX(TIndex)
#ifdef CAFFE2_UNIQUE_LONG_TYPEMETA
CAFFE2_SPECIALIZED_COPY_MATRIX(long)
#endif
CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint8_t)
CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint16_t)
#undef CAFFE2_SPECIALIZED_COPY_MATRIX
namespace {
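// With zero padding and no dilation, im2col decomposes into plain strided
// matrix copies: for each (c, kh, kw) offset, the corresponding block of the
// column buffer is an output_h x output_w submatrix of the image read with
// row stride stride_h * W and column stride stride_w. A rough example:
// H == W == 4 with a 2 x 2 kernel and stride 1 yields a 3 x 3 output, i.e.
// four 3 x 3 copies per channel.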
template <typename T>
void Im2ColZeroPaddingAndNoDilationNCHW(
const int C,
const int H,
const int W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const T* img_data,
T* col_data,
CPUContext* context) {
const int output_h = (H - kernel_h) / stride_h + 1;
const int output_w = (W - kernel_w) / stride_w + 1;
const int output_size = output_h * output_w;
for (int c = 0; c < C; ++c) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
const T* src = img_data + kh * W + kw;
if (stride_w == 1) {
CopyMatrix<T, CPUContext>(
output_h,
output_w,
src,
stride_h * W,
col_data,
output_w,
context);
} else {
CopyMatrix<T, CPUContext>(
output_h,
output_w,
src,
stride_h * W,
stride_w,
col_data,
output_w,
1,
context);
}
col_data += output_size;
}
}
img_data += H * W;
}
}
template <typename T>
void Col2ImZeroPaddingAndNoDilationNCHW(
const int C,
const int H,
const int W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const T* col_data,
T* img_data,
CPUContext* context) {
Set<T, CPUContext>(C * H * W, T(0), img_data, context);
const int output_h = (H - kernel_h) / stride_h + 1;
const int output_w = (W - kernel_w) / stride_w + 1;
const int output_size = output_h * output_w;
for (int c = 0; c < C; ++c) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
T* dst = img_data + kh * W + kw;
if (stride_w == 1) {
EigenOuterStridedArrayMap<T>(
dst, output_w, output_h, EigenOuterStride(stride_h * W)) +=
ConstEigenArrayMap<T>(col_data, output_w, output_h);
} else {
EigenStridedArrayMap<T>(
dst, output_w, output_h, EigenStride(stride_h * W, stride_w)) +=
ConstEigenArrayMap<T>(col_data, output_w, output_h);
}
col_data += output_size;
}
}
img_data += H * W;
}
}
template <typename T>
void Im2ColZeroPaddingAndNoDilationNHWC(
const int C,
const int H,
const int W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const T* img_data,
T* col_data,
CPUContext* context) {
const int output_h = (H - kernel_h) / stride_h + 1;
const int output_w = (W - kernel_w) / stride_w + 1;
const int kernel_size = kernel_h * kernel_w;
for (int yh = 0; yh < output_h; ++yh) {
for (int yw = 0; yw < output_w; ++yw) {
const T* src = img_data + (yh * stride_h * W + yw * stride_w) * C;
CopyMatrix<T, CPUContext>(
kernel_h, kernel_w * C, src, W * C, col_data, kernel_w * C, context);
col_data += kernel_size * C;
}
}
}
template <typename T>
void Col2ImZeroPaddingAndNoDilationNHWC(
const int C,
const int H,
const int W,
const int kernel_h,
const int kernel_w,
const int stride_h,
const int stride_w,
const T* col_data,
T* img_data,
CPUContext* context) {
Set<T, CPUContext>(H * W * C, T(0), img_data, context);
const int output_h = (H - kernel_h) / stride_h + 1;
const int output_w = (W - kernel_w) / stride_w + 1;
const int kernel_size = kernel_h * kernel_w;
for (int yh = 0; yh < output_h; ++yh) {
for (int yw = 0; yw < output_w; ++yw) {
T* dst = img_data + (yh * stride_h * W + yw * stride_w) * C;
EigenOuterStridedArrayMap<T>(
dst, kernel_w * C, kernel_h, EigenOuterStride(W * C)) +=
ConstEigenArrayMap<T>(col_data, kernel_w * C, kernel_h);
col_data += kernel_size * C;
}
}
}
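// Im2ColNdNCHWImpl handles the general N-dimensional NCHW case (and, with
// kCol2Im == true, the reverse scatter). The column buffer is laid out as
// outer_size == col_shape[0] == C * prod(kernel_shape) rows of
// inner_size == prod(output dims) entries each. A rough example: a {1, 4, 4}
// image with a {2, 2} kernel, stride 1, and no padding or dilation gives
// col_shape == {4, 3, 3}, i.e. outer_size == 4 and inner_size == 9.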
template <typename T, bool kCol2Im>
void Im2ColNdNCHWImpl(
const int N,
const int img_size,
const int col_size,
const int* img_shape,
const int* col_shape,
const int* kernel_shape,
const int* stride,
const int* dilation,
const int* pad,
const float* X_data,
float* Y_data) {
if (kCol2Im) {
std::memset(Y_data, 0, img_size * sizeof(float));
}
const int outer_size = col_shape[0];
const int inner_size = col_size / outer_size;
const int kernel_size = std::accumulate(
kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
std::vector<FixedDivisor<int>> kernel_shape_div(N);
for (int i = 0; i < N; ++i) {
kernel_shape_div[i] = FixedDivisor<int>(kernel_shape[i]);
}
std::vector<int> d_offset(N, 0);
std::vector<int> d_iter(N, 0);
for (int i = 0; i < outer_size; ++i) {
// Loop over spatial axes in reverse order to compute a per-axis offset.
int offset = i;
for (int d_i = N - 1; d_i >= 0; --d_i) {
kernel_shape_div[d_i].DivMod(offset, &offset, &d_offset[d_i]);
}
for (int j = 0; j < inner_size; ++j) {
// Loop over spatial axes in forward order to compute the indices in the
// image and column, and whether the index lies in the padding.
const int col_index = i * inner_size + j;
int img_index = i / kernel_size;
bool is_padding = false;
for (int d_i = 0; d_i < N; ++d_i) {
const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] +
d_offset[d_i] * dilation[d_i];
is_padding |= !utils::IsAGeZeroAndALtB(d_img, img_shape[d_i + 1]);
img_index = img_index * img_shape[d_i + 1] + d_img;
}
if (!kCol2Im) {
Y_data[col_index] = is_padding ? 0 : X_data[img_index];
} else if (!is_padding) {
Y_data[img_index] += X_data[col_index];
}
utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data());
}
}
}
} // namespace
template <>
void Im2ColNd<float, CPUContext, StorageOrder::NCHW>(
const int N,
const int img_size,
const int col_size,
const int* img_shape,
const int* col_shape,
const int* kernel_shape,
const int* stride,
const int* dilation,
const int* pad,
const float* img_data,
float* col_data,
CPUContext* /* context */) {
Im2ColNdNCHWImpl<float, false>(
N,
img_size,
col_size,
img_shape,
col_shape,
kernel_shape,
stride,
dilation,
pad,
img_data,
col_data);
}
template <>
void Col2ImNd<float, CPUContext, StorageOrder::NCHW>(
const int N,
const int img_size,
const int col_size,
const int* img_shape,
const int* col_shape,
const int* kernel_shape,
const int* stride,
const int* dilation,
const int* pad,
const float* col_data,
float* img_data,
CPUContext* /* context */) {
Im2ColNdNCHWImpl<float, true>(
N,
img_size,
col_size,
img_shape,
col_shape,
kernel_shape,
stride,
dilation,
pad,
col_data,
img_data);
}
template <>
void Im2Col<float, CPUContext, StorageOrder::NCHW>(
const int C,
const int H,
const int W,
const int kernel_h,
const int kernel_w,
const int dilation_h,
const int dilation_w,
const int pad_t,
const int pad_l,
const int pad_b,
const int pad_r,
const int stride_h,
const int stride_w,
const float* img_data,
float* col_data,
CPUContext* context) {
// Fast path for zero padding and no dilation
if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
dilation_w == 1) {
Im2ColZeroPaddingAndNoDilationNCHW<float>(
C,
H,
W,
kernel_h,
kernel_w,
stride_h,
stride_w,
img_data,
col_data,
context);
return;
}
// Baseline
const int output_h =
(H + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int output_w =
(W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
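// A worked example of the output-size formula: H == 5, pad_t == pad_b == 1,
// kernel_h == 3, and dilation_h == 2 give an effective kernel extent of
// 2 * (3 - 1) + 1 == 5, so with stride_h == 1 we get
// output_h == (5 + 1 + 1 - 5) / 1 + 1 == 3.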
const int output_size = output_h * output_w;
for (int c = 0; c < C; ++c) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
for (int h = 0; h < output_h; ++h) {
const int h_pad = h * stride_h - pad_t + kh * dilation_h;
if (!utils::IsAGeZeroAndALtB(h_pad, H)) {
std::memset(col_data + h * output_w, 0, output_w * sizeof(float));
continue;
}
for (int w = 0; w < output_w; ++w) {
const int w_pad = w * stride_w - pad_l + kw * dilation_w;
col_data[h * output_w + w] = utils::IsAGeZeroAndALtB(w_pad, W)
? img_data[(c * H + h_pad) * W + w_pad]
: 0;
}
}
col_data += output_size;
}
}
}
}
template <>
void Im2Col<float, CPUContext, StorageOrder::NHWC>(
const int C,
const int H,
const int W,
const int kernel_h,
const int kernel_w,
const int dilation_h,
const int dilation_w,
const int pad_t,
const int pad_l,
const int pad_b,
const int pad_r,
const int stride_h,
const int stride_w,
const float* img_data,
float* col_data,
CPUContext* context) {
// Fast path for zero padding and no dilation
if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
dilation_w == 1) {
Im2ColZeroPaddingAndNoDilationNHWC<float>(
C,
H,
W,
kernel_h,
kernel_w,
stride_h,
stride_w,
img_data,
col_data,
context);
return;
}
const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
const int output_h = (H + pad_b + pad_t - dkernel_h) / stride_h + 1;
const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
int h_pad = -pad_t;
for (int h = 0; h < output_h; ++h) {
int w_pad = -pad_l;
for (int w = 0; w < output_w; ++w) {
for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
if (!utils::IsAGeZeroAndALtB(ih, H)) {
std::memset(col_data, 0, sizeof(float) * kernel_w * C);
col_data += kernel_w * C;
continue;
}
for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
if (utils::IsAGeZeroAndALtB(iw, W)) {
std::memcpy(
col_data, img_data + (ih * W + iw) * C, sizeof(float) * C);
} else {
std::memset(col_data, 0, sizeof(float) * C);
}
col_data += C;
}
}
w_pad += stride_w;
}
h_pad += stride_h;
}
}
template <>
void Col2Im<float, CPUContext, StorageOrder::NCHW>(
const int C,
const int H,
const int W,
const int kernel_h,
const int kernel_w,
const int dilation_h,
const int dilation_w,
const int pad_t,
const int pad_l,
const int pad_b,
const int pad_r,
const int stride_h,
const int stride_w,
const float* col_data,
float* img_data,
CPUContext* context) {
// Fast path for zero padding and no dilation
if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
dilation_w == 1) {
Col2ImZeroPaddingAndNoDilationNCHW<float>(
C,
H,
W,
kernel_h,
kernel_w,
stride_h,
stride_w,
col_data,
img_data,
context);
return;
}
// Fallback
Set<float, CPUContext>(C * H * W, 0.0f, img_data, context);
const int output_h =
(H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
const int output_w =
(W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
const int output_size = output_h * output_w;
for (int c = 0; c < C; ++c) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
for (int h = 0; h < output_h; ++h) {
const int h_pad = h * stride_h - pad_t + kh * dilation_h;
if (!utils::IsAGeZeroAndALtB(h_pad, H)) {
continue;
}
for (int w = 0; w < output_w; ++w) {
const int w_pad = w * stride_w - pad_l + kw * dilation_w;
if (utils::IsAGeZeroAndALtB(w_pad, W)) {
img_data[(c * H + h_pad) * W + w_pad] +=
col_data[h * output_w + w];
}
}
}
col_data += output_size;
}
}
}
}
template <>
void Col2Im<float, CPUContext, StorageOrder::NHWC>(
const int C,
const int H,
const int W,
const int kernel_h,
const int kernel_w,
const int dilation_h,
const int dilation_w,
const int pad_t,
const int pad_l,
const int pad_b,
const int pad_r,
const int stride_h,
const int stride_w,
const float* col_data,
float* img_data,
CPUContext* context) {
// Fast path for zero padding and no dilation
if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 &&
dilation_w == 1) {
Col2ImZeroPaddingAndNoDilationNHWC<float>(
C,
H,
W,
kernel_h,
kernel_w,
stride_h,
stride_w,
col_data,
img_data,
context);
return;
}
Set<float, CPUContext>(H * W * C, 0.0f, img_data, context);
const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1;
const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1;
int h_pad = -pad_t;
for (int h = 0; h < output_h; ++h) {
int w_pad = -pad_l;
for (int w = 0; w < output_w; ++w) {
for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
if (!utils::IsAGeZeroAndALtB(ih, H)) {
col_data += kernel_w * C;
continue;
}
for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
if (utils::IsAGeZeroAndALtB(iw, W)) {
float* img_data_patch = img_data + (ih * W + iw) * C;
Add<float, CPUContext>(
C, img_data_patch, col_data, img_data_patch, context);
}
col_data += C;
}
}
w_pad += stride_w;
}
h_pad += stride_h;
}
}
template <>
void BiasCHW<float, CPUContext>(
const float* bias,
const float* /*bias_multiplier*/,
const int bias_channels,
const int image_size,
float* image,
CPUContext* /*context*/) {
// Sum the per-channel bias into every image plane
for (int c = 0; c < bias_channels; ++c) {
float b = bias[c];
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
float32x4_t vBias = vdupq_n_f32(b);
// We give alignment hints for additional speed, so handle the
// non-vectorizable prologue separately
constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);
// FIXME: if image_size < kVecSizeInFloat, we can't vectorize at all, and
// the prologue loop below may overrun the image buffer.
int prologue = kVecSizeInFloat -
// remainder in floats
(((uintptr_t)image) % (sizeof(float32x4_t))) / sizeof(float);
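// For example, if image % 16 == 8 (two floats past a 16-byte boundary), the
// remainder is 2 and prologue == 4 - 2 == 2, so two scalar adds bring
// image + i to a 16-byte boundary for the aligned loads below. (If image is
// already aligned, the prologue still runs kVecSizeInFloat scalar
// iterations; wasteful but correct.)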
int i = 0;
// Prologue loop
for (; i < prologue; ++i) {
image[i] += b;
}
// The loop is manually unrolled by 8
constexpr int kUnroll = 8;
constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;
int remainder = image_size - prologue;
int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;
// Vectorizable body
for (; i < vectorizable; i += kFloatsPerLoop) {
// Manually unrolled
float32x4_t v0 = vld1q_f32_aligned(image + i + 0);
float32x4_t v1 = vld1q_f32_aligned(image + i + 4);
float32x4_t v2 = vld1q_f32_aligned(image + i + 8);
float32x4_t v3 = vld1q_f32_aligned(image + i + 12);
float32x4_t v4 = vld1q_f32_aligned(image + i + 16);
float32x4_t v5 = vld1q_f32_aligned(image + i + 20);
float32x4_t v6 = vld1q_f32_aligned(image + i + 24);
float32x4_t v7 = vld1q_f32_aligned(image + i + 28);
v0 = vaddq_f32(v0, vBias);
v1 = vaddq_f32(v1, vBias);
v2 = vaddq_f32(v2, vBias);
v3 = vaddq_f32(v3, vBias);
v4 = vaddq_f32(v4, vBias);
v5 = vaddq_f32(v5, vBias);
v6 = vaddq_f32(v6, vBias);
v7 = vaddq_f32(v7, vBias);
vst1q_f32_aligned(image + i + 0, v0);
vst1q_f32_aligned(image + i + 4, v1);
vst1q_f32_aligned(image + i + 8, v2);
vst1q_f32_aligned(image + i + 12, v3);
vst1q_f32_aligned(image + i + 16, v4);
vst1q_f32_aligned(image + i + 20, v5);
vst1q_f32_aligned(image + i + 24, v6);
vst1q_f32_aligned(image + i + 28, v7);
}
// Non-vectorizable epilogue
for (; i < image_size; ++i) {
image[i] += b;
}
#else
// Non-NEON CPU implementation
for (int i = 0; i < image_size; ++i) {
image[i] += b;
}
#endif // defined(__ARM_NEON__) || defined(__ARM_NEON)
image += image_size;
}
}
#define CAFFE2_SPECIALIZED_COPYVECTOR(T) \
template <> \
void CopyVector<T, CPUContext>( \
const int N, const T* src, T* dst, CPUContext* /*context*/) { \
if (src != dst && N > 0) { \
memcpy(dst, src, sizeof(T) * N); \
} \
}
CAFFE2_SPECIALIZED_COPYVECTOR(float)
#undef CAFFE2_SPECIALIZED_COPYVECTOR
namespace {
#ifdef CAFFE2_USE_HPTT
bool TransposeWithHPTT(
const int ndim,
const int* dims,
const int* axes,
const float* X,
float* Y) {
std::vector<int> axes_cm(ndim);
std::vector<int> dims_cm(ndim);
// HPTT assumes column-major storage, so convert the row-major axes and dims
// to column-major order.
const auto cm_fn = [ndim](const int i) { return ndim - i - 1; };
for (int i = 0; i < ndim; ++i) {
axes_cm[i] = cm_fn(axes[cm_fn(i)]);
dims_cm[i] = dims[cm_fn(i)];
}
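// A worked example (a sketch): with ndim == 3 and axes == {0, 2, 1}, the
// loop above produces axes_cm == {1, 0, 2} and dims_cm equal to dims
// reversed.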
// HPTT does not handle zero-sized inputs.
for (auto dim : dims_cm) {
if (dim <= 0) {
return false;
}
}
auto plan = hptt::create_plan(
axes_cm.data(),
ndim,
1.0,
X,
dims_cm.data(),
nullptr,
0.0,
Y,
nullptr,
hptt::ESTIMATE,
1);
if (plan == nullptr) {
return false;
}
plan->execute();
return true;
}
#endif // CAFFE2_USE_HPTT
template <typename T>
void Transpose2D(const int rows, const int cols, const T* X, T* Y);
#ifdef CAFFE2_USE_MKL
#define DELEGATE_TRANSPOSE_2D_FUNCTION(T, Func) \
template <> \
void Transpose2D<T>(const int rows, const int cols, const T* X, T* Y) { \
Func('R', 'T', rows, cols, T(1), X, cols, Y, rows); \
}
DELEGATE_TRANSPOSE_2D_FUNCTION(float, mkl_somatcopy);
DELEGATE_TRANSPOSE_2D_FUNCTION(double, mkl_domatcopy);
#undef DELEGATE_TRANSPOSE_2D_FUNCTION
#endif // CAFFE2_USE_MKL
#define CAFFE2_SPECIALIZED_TRANSPOSE_2D(T) \
template <> \
void Transpose2D<T>(const int rows, const int cols, const T* X, T* Y) { \
EigenMatrixMap<T>(Y, rows, cols) = \
ConstEigenMatrixMap<T>(X, cols, rows).transpose(); \
}
#ifndef CAFFE2_USE_MKL
template <>
void Transpose2D<float>(
const int rows,
const int cols,
const float* X,
float* Y) {
#ifdef CAFFE2_USE_HPTT
const std::array<int, 2> dims = {rows, cols};
const std::array<int, 2> axes = {1, 0};
if (TransposeWithHPTT(2, dims.data(), axes.data(), X, Y)) {
return;
}
#endif // CAFFE2_USE_HPTT
EigenMatrixMap<float>(Y, rows, cols) =
ConstEigenMatrixMap<float>(X, cols, rows).transpose();
}
CAFFE2_SPECIALIZED_TRANSPOSE_2D(double)
#endif // CAFFE2_USE_MKL
CAFFE2_SPECIALIZED_TRANSPOSE_2D(int)
CAFFE2_SPECIALIZED_TRANSPOSE_2D(TIndex)
#ifdef CAFFE2_UNIQUE_LONG_TYPEMETA
CAFFE2_SPECIALIZED_TRANSPOSE_2D(long)
#endif
CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint8_t)
CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint16_t)
#undef CAFFE2_SPECIALIZED_TRANSPOSE_2D
std::vector<int>
ComputeXStrides(const int ndim, const int* dims, const int* axes) {
std::vector<int> x_strides(ndim);
std::vector<int> buff(ndim);
int cur_stride = 1;
for (int i = ndim - 1; i >= 0; --i) {
buff[i] = cur_stride;
cur_stride *= dims[i];
}
for (int i = 0; i < ndim; ++i) {
x_strides[i] = buff[axes[i]];
}
return x_strides;
}
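// A worked example (a sketch): dims == {2, 3, 4} has row-major strides
// {12, 4, 1}; with axes == {2, 0, 1} the result is x_strides == {1, 12, 4},
// i.e. the stride of X along each axis of the transposed output Y.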
template <typename T>
void TransposeND(
const int ndim,
const int* dims,
const int* axes,
const T* X,
T* Y) {
std::vector<int> Y_dims(ndim);
for (int i = 0; i < ndim; ++i) {
Y_dims[i] = dims[axes[i]];
}
// Measure the amount of contiguous data we can copy in one shot: the
// trailing axes that the permutation leaves in place.
int block_size = 1;
int num_shared_idx = 0;
for (int i = ndim - 1; i >= 0 && axes[i] == i; --i) {
block_size *= Y_dims[i];
++num_shared_idx;
}
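// For example (a sketch): dims == {5, 6, 7} with axes == {1, 0, 2} leaves
// the trailing axis in place, so block_size == 7, itr_axes == 2, and the
// loop below performs 6 * 5 == 30 contiguous copies of 7 elements each.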
const int itr_axes = ndim - num_shared_idx;
const int num_blocks = std::accumulate(
Y_dims.cbegin(), Y_dims.cbegin() + itr_axes, 1, std::multiplies<int>());
const std::vector<int> X_strides = ComputeXStrides(itr_axes, dims, axes);
std::vector<int> index(itr_axes, 0);
for (int Y_index = 0; Y_index < num_blocks; ++Y_index) {
const int X_index = std::inner_product(
X_strides.cbegin(), X_strides.cend(), index.cbegin(), 0);
if (block_size == 1) {
Y[Y_index] = X[X_index];
} else {
std::memcpy(
Y + block_size * Y_index,
X + block_size * X_index,
block_size * sizeof(T));
}
utils::IncreaseIndexInDims(itr_axes, Y_dims.data(), index.data());
}
}
template <typename T>
void TransposeCPUImpl(
const int ndim,
const int* dims,
const int* axes,
const T* X,
T* Y) {
if (utils::IsIdentityPermutation(ndim, axes)) {
const int size =
std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());
std::memcpy(Y, X, size * sizeof(T));
return;
}
if (ndim == 2) {
Transpose2D<T>(dims[0], dims[1], X, Y);
} else {
TransposeND<T>(ndim, dims, axes, X, Y);
}
}
template <>
void TransposeCPUImpl<float>(
const int ndim,
const int* dims,
const int* axes,
const float* X,
float* Y) {
if (utils::IsIdentityPermutation(ndim, axes)) {
const int size =
std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());
std::memcpy(Y, X, size * sizeof(float));
return;
}
if (ndim == 2) {
Transpose2D<float>(dims[0], dims[1], X, Y);
} else {
#ifdef CAFFE2_USE_HPTT
if (TransposeWithHPTT(ndim, dims, axes, X, Y)) {
return;
}
#endif
TransposeND<float>(ndim, dims, axes, X, Y);
}
}
} // namespace
#define CAFFE2_SPECIALIZED_TRANSPOSE(T) \
template <> \
void Transpose<T, CPUContext>( \
const int ndim, \
const int* dims, \
const int* axes, \
const T* X, \
T* Y, \
CPUContext* /* context */) { \
TransposeCPUImpl(ndim, dims, axes, X, Y); \
}
CAFFE2_SPECIALIZED_TRANSPOSE(float)
CAFFE2_SPECIALIZED_TRANSPOSE(double)
CAFFE2_SPECIALIZED_TRANSPOSE(int)
CAFFE2_SPECIALIZED_TRANSPOSE(TIndex)
#ifdef CAFFE2_UNIQUE_LONG_TYPEMETA
CAFFE2_SPECIALIZED_TRANSPOSE(long)
#endif
CAFFE2_SPECIALIZED_TRANSPOSE(std::uint8_t)
CAFFE2_SPECIALIZED_TRANSPOSE(std::uint16_t)
#undef CAFFE2_SPECIALIZED_TRANSPOSE
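// A minimal usage sketch (illustrative only; X, Y, and the context are
// assumed):
//
//   const std::array<int, 3> dims = {2, 3, 4};
//   const std::array<int, 3> axes = {2, 0, 1};
//   // Y[k][i][j] = X[i][j][k]
//   math::Transpose<float, CPUContext>(
//       3, dims.data(), axes.data(), X, Y, &context);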
} // namespace math
} // namespace caffe2