| // Implements the math functions for CPU. |
// This file allows the underlying numerical computation to be routed to
// different backends. Notably:
// (1) For all BLAS-related functions, one can explicitly request a BLAS backend
// such as MKL, OpenBLAS, or ATLAS. To see the set of currently supported
// backends, check //third_party/blas/.
| // (2) If one chooses to link against MKL, we utilize MKL's vector math library |
| // (VML) for a few functions such as Exp and Log. |
| // (3) Fallback implementations are provided in Eigen for cross-platform |
| // support. Since Eigen is a header-only library and supports a number of |
| // platforms, it allows one to quickly port Caffe2 to different platforms |
| // where BLAS may not be present. |
| |
| #include "caffe2/utils/math.h" |
| |
| #include <algorithm> |
| #include <array> |
| #include <atomic> |
| #include <chrono> |
| #include <cmath> |
| #include <cstdlib> |
| #include <cstring> |
| #include <functional> |
| #include <limits> |
| #include <numeric> |
| #include <random> |
| #include <tuple> |
| #include <unordered_set> |
| #include <vector> |
| |
| #include "caffe2/core/context.h" |
| #include "caffe2/utils/cpu_neon.h" |
| #include "caffe2/utils/eigen_utils.h" |
| #include "caffe2/utils/fixed_divisor.h" |
| |
| #include "Eigen/Core" |
| #include "Eigen/Dense" |
| |
| #ifdef CAFFE2_USE_MKL |
| #include <mkl.h> |
| #endif // CAFFE2_USE_MKL |
| |
| #ifdef CAFFE2_USE_HPTT |
| #include <hptt.h> |
| #endif // CAFFE2_USE_HPTT |
| |
| #if defined(_MSC_VER) |
| #include <process.h> |
| #endif |
| |
| namespace caffe2 { |
| |
| namespace math { |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // BLAS alternatives. |
// Depending on whether an external BLAS library has been specified, the
// BLAS-related Caffe2 math functions are delegated to either the CBLAS call or
// the Eigen implementation.
| //////////////////////////////////////////////////////////////////////////////// |
| #ifdef CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| // Caffe2 gemm provides a simpler interface to the gemm functions, with the |
| // limitation that the data has to be contiguous in memory. |
| // |
| // The gemm call implements the following operation: |
| // |
| // C = alpha * op(A) * op(B) + beta * C |
| // |
// where op(A) has size M x K, op(B) has size K x N, and C has size M x N. Each
// of A, B, and C is a matrix, and alpha and beta are scalars. Note that the
// most common use of gemm sets alpha to 1 and beta to 0.
| // |
// op(A) and op(B) represent the transformations applied to A and B before the
// matrix multiply: op(A) equals A when trans_A is CblasNoTrans and A^T
// (transpose) when trans_A is CblasTrans, and likewise op(B) is determined by
// trans_B.
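//
// A minimal usage sketch (buffer names, sizes, and the context object are
// illustrative only, not taken from real call sites):
//
//   // C (2 x 4) = 1.0 * A (2 x 3) * B (3 x 4) + 0.0 * C
//   std::vector<float> A(2 * 3), B(3 * 4), C(2 * 4);
//   CPUContext context;
//   math::Gemm<float, CPUContext>(
//       CblasNoTrans, CblasNoTrans, 2, 4, 3, 1.0f, A.data(), B.data(), 0.0f,
//       C.data(), &context);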
| template <> |
| C10_EXPORT void Gemm<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CPUContext* context, |
| TensorProto::DataType math_type) { |
| auto C_mat = EigenMatrixMap<float>(C, N, M); |
| if (beta == 0) { |
| C_mat.setZero(); |
| } else { |
| C_mat *= beta; |
| } |
| switch (trans_A) { |
| case CblasNoTrans: { |
| switch (trans_B) { |
| case CblasNoTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(B, N, K) * |
| ConstEigenMatrixMap<float>(A, K, M)); |
| return; |
| case CblasTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(B, K, N).transpose() * |
| ConstEigenMatrixMap<float>(A, K, M)); |
| return; |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B"; |
| } |
| } |
| case CblasTrans: { |
| switch (trans_B) { |
| case CblasNoTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(B, N, K) * |
| ConstEigenMatrixMap<float>(A, M, K).transpose()); |
| return; |
| case CblasTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(B, K, N).transpose() * |
| ConstEigenMatrixMap<float>(A, M, K).transpose()); |
| return; |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B"; |
| } |
| } |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A"; |
| } |
| } |
| |
| template <> |
| C10_EXPORT void GemmEx<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int lda, |
| const float* B, |
| const int ldb, |
| const float beta, |
| float* C, |
| const int ldc, |
| CPUContext*) { |
| EigenOuterStridedMatrixMap<float> C_mat(C, N, M, EigenOuterStride(ldc)); |
| if (beta == 0) { |
| C_mat.setZero(); |
| } else { |
| C_mat *= beta; |
| } |
| switch (trans_A) { |
| case CblasNoTrans: { |
| switch (trans_B) { |
| case CblasNoTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenOuterStridedMatrixMap<float>( |
| B, N, K, EigenOuterStride(ldb)) * |
| ConstEigenOuterStridedMatrixMap<float>( |
| A, K, M, EigenOuterStride(lda))); |
| return; |
| case CblasTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenOuterStridedMatrixMap<float>( |
| B, K, N, EigenOuterStride(ldb)) |
| .transpose() * |
| ConstEigenOuterStridedMatrixMap<float>( |
| A, K, M, EigenOuterStride(lda))); |
| return; |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B"; |
| } |
| } |
| case CblasTrans: { |
| switch (trans_B) { |
| case CblasNoTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenOuterStridedMatrixMap<float>( |
| B, N, K, EigenOuterStride(ldb)) * |
| ConstEigenOuterStridedMatrixMap<float>( |
| A, M, K, EigenOuterStride(lda)) |
| .transpose()); |
| return; |
| case CblasTrans: |
| C_mat.noalias() += alpha * |
| (ConstEigenOuterStridedMatrixMap<float>( |
| B, K, N, EigenOuterStride(ldb)) |
| .transpose() * |
| ConstEigenOuterStridedMatrixMap<float>( |
| A, M, K, EigenOuterStride(lda)) |
| .transpose()); |
| return; |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_B"; |
| } |
| } |
| default: |
| LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for trans_A"; |
| } |
| } |
| |
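// Gemv computes the matrix-vector product
//
//   y = alpha * op(A) * x + beta * y
//
// where A is an M x N row-major matrix, op(A) is A or A^T depending on
// trans_A, x has length N (or M when trans_A is CblasTrans), and y has length
// M (or N). A sketch of a call, with hypothetical buffers:
//
//   math::Gemv<float, CPUContext>(
//       CblasNoTrans, M, N, 1.0f, A, x, 0.0f, y, &context);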
| template <> |
| C10_EXPORT void Gemv<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const int M, |
| const int N, |
| const float alpha, |
| const float* A, |
| const float* x, |
| const float beta, |
| float* y, |
| CPUContext* context, |
| TensorProto::DataType math_type) { |
| EigenVectorMap<float> y_vec(y, trans_A == CblasNoTrans ? M : N); |
| if (beta == 0) { |
// Caffe2 often lazily initializes buffers, so y may contain garbage such as
// NaNs. Therefore, when beta is 0, we explicitly zero y instead of scaling it.
| y_vec.setZero(); |
| } else { |
| y_vec *= beta; |
| } |
| switch (trans_A) { |
| case CblasNoTrans: { |
| y_vec.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(A, N, M).transpose() * |
| ConstEigenVectorMap<float>(x, N)); |
| return; |
| } |
| case CblasTrans: { |
| y_vec.noalias() += alpha * |
| (ConstEigenMatrixMap<float>(A, N, M) * |
| ConstEigenVectorMap<float>(x, M)); |
| return; |
| } |
| default: |
| LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input."; |
| } |
| } |
| |
| #define CAFFE2_SPECIALIZED_DOT(T) \ |
| template <> \ |
| C10_EXPORT void Dot<T, CPUContext>( \ |
| const int N, const T* a, const T* b, T* y, CPUContext* context) { \ |
| *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N)); \ |
| } |
| CAFFE2_SPECIALIZED_DOT(float) |
| #undef CAFFE2_SPECIALIZED_DOT |
| |
| #define CAFFE2_SPECIALIZED_AXPY(T) \ |
| template <> \ |
| C10_EXPORT void Axpy<T, CPUContext>( \ |
| const int N, const T alpha, const T* x, T* Y, CPUContext* context) { \ |
| EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha; \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpy<T, CPUContext>( \ |
| const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \ |
| EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha); \ |
| } |
| CAFFE2_SPECIALIZED_AXPY(float) |
| #undef CAFFE2_SPECIALIZED_AXPY |
| |
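// Axpby computes y = alpha * x + beta * y elementwise; the second overload
// takes alpha and beta by pointer rather than by value.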
| #define CAFFE2_SPECIALIZED_AXPBY(T) \ |
| template <> \ |
| C10_EXPORT void Axpby<T, T, CPUContext>( \ |
| const int N, \ |
| const T alpha, \ |
| const T* x, \ |
| const T beta, \ |
| T* y, \ |
| CPUContext* context) { \ |
| EigenVectorArrayMap<T> y_arr(y, N); \ |
| y_arr = y_arr * beta + ConstEigenVectorArrayMap<T>(x, N) * alpha; \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpby<T, T, CPUContext>( \ |
| const int N, \ |
| const T* alpha, \ |
| const T* x, \ |
| const T* beta, \ |
| T* y, \ |
| CPUContext* context) { \ |
| EigenVectorArrayMap<T> y_arr(y, N); \ |
| y_arr = y_arr * *beta + ConstEigenVectorArrayMap<T>(x, N) * *alpha; \ |
| } |
| CAFFE2_SPECIALIZED_AXPBY(float) |
| #undef CAFFE2_SPECIALIZED_AXPBY |
| |
| #else // CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| template <> |
| C10_EXPORT void Gemm<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CPUContext* /*context*/, |
| TensorProto::DataType /*math_type*/) { |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| cblas_sgemm( |
| CblasRowMajor, |
| trans_A, |
| trans_B, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| lda, |
| B, |
| ldb, |
| beta, |
| C, |
| N); |
| } |
| |
| template <> |
| C10_EXPORT void GemmEx<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int lda, |
| const float* B, |
| const int ldb, |
| const float beta, |
| float* C, |
| const int ldc, |
| CPUContext* /*context*/) { |
| cblas_sgemm( |
| CblasRowMajor, |
| trans_A, |
| trans_B, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| lda, |
| B, |
| ldb, |
| beta, |
| C, |
| ldc); |
| } |
| |
| template <> |
| C10_EXPORT void Gemv<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const int M, |
| const int N, |
| const float alpha, |
| const float* A, |
| const float* x, |
| const float beta, |
| float* y, |
| CPUContext* /*context*/, |
| TensorProto::DataType /*math_type*/) { |
| cblas_sgemv(CblasRowMajor, trans_A, M, N, alpha, A, N, x, 1, beta, y, 1); |
| } |
| |
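// Scale computes y = alpha * x. When y and x are distinct buffers, x is first
// copied into y with cblas_?copy; the scaling itself is then done in place
// with cblas_?scal, and is skipped entirely when alpha is 1.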
| #define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData, prefix) \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const int n, \ |
| const TAlpha alpha, \ |
| const TData* x, \ |
| TData* y, \ |
| CPUContext*) { \ |
| if (y != x) { \ |
| cblas_##prefix##copy(n, x, 1, y, 1); \ |
| } \ |
| if (alpha != TAlpha(1)) { \ |
| cblas_##prefix##scal(n, static_cast<TData>(alpha), y, 1); \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const int n, \ |
| const TAlpha* alpha, \ |
| const TData* x, \ |
| TData* y, \ |
| CPUContext*) { \ |
| if (y != x) { \ |
| cblas_##prefix##copy(n, x, 1, y, 1); \ |
| } \ |
| if (*alpha != TAlpha(1)) { \ |
| cblas_##prefix##scal(n, static_cast<TData>(*alpha), y, 1); \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_SCALE(float, float, s) |
| CAFFE2_SPECIALIZED_SCALE(double, double, d) |
| CAFFE2_SPECIALIZED_SCALE(float, double, d) |
| #undef CAFFE2_SPECIALIZED_SCALE |
| |
| #define CAFFE2_SPECIALIZED_DOT(T, prefix) \ |
| template <> \ |
| C10_EXPORT void Dot<T, CPUContext>( \ |
| const int N, const T* a, const T* b, T* y, CPUContext*) { \ |
| *y = cblas_##prefix##dot(N, a, 1, b, 1); \ |
| } |
| CAFFE2_SPECIALIZED_DOT(float, s) |
| #undef CAFFE2_SPECIALIZED_DOT |
| |
| #define CAFFE2_SPECIALIZED_AXPY(T, prefix) \ |
| template <> \ |
| C10_EXPORT void Axpy<T, CPUContext>( \ |
| const int N, const T alpha, const T* x, T* y, CPUContext*) { \ |
| cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpy<T, CPUContext>( \ |
| const int N, const T* alpha, const T* x, T* y, CPUContext*) { \ |
| cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ |
| } |
| CAFFE2_SPECIALIZED_AXPY(float, s) |
| #undef CAFFE2_SPECIALIZED_AXPY |
| |
// cblas_[sd]axpby is not a standard BLAS function; when MKL is not present, we
// emulate it with a scal followed by an axpy.
| #ifdef CAFFE2_USE_MKL |
| #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ |
| template <> \ |
| C10_EXPORT void Axpby<T, T, CPUContext>( \ |
| const int N, \ |
| const T alpha, \ |
| const T* x, \ |
| const T beta, \ |
| T* y, \ |
| CPUContext*) { \ |
| cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpby<T, T, CPUContext>( \ |
| const int N, \ |
| const T* alpha, \ |
| const T* x, \ |
| const T* beta, \ |
| T* y, \ |
| CPUContext*) { \ |
| cblas_##prefix##axpby(N, *alpha, x, 1, *beta, y, 1); \ |
| } |
| #else // CAFFE2_USE_MKL |
| #define CAFFE2_SPECIALIZED_AXPBY(T, prefix) \ |
| template <> \ |
| C10_EXPORT void Axpby<T, T, CPUContext>( \ |
| const int N, \ |
| const T alpha, \ |
| const T* x, \ |
| const T beta, \ |
| T* y, \ |
| CPUContext*) { \ |
| cblas_##prefix##scal(N, beta, y, 1); \ |
| cblas_##prefix##axpy(N, alpha, x, 1, y, 1); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Axpby<T, T, CPUContext>( \ |
| const int N, \ |
| const T* alpha, \ |
| const T* x, \ |
| const T* beta, \ |
| T* y, \ |
| CPUContext*) { \ |
| cblas_##prefix##scal(N, *beta, y, 1); \ |
| cblas_##prefix##axpy(N, *alpha, x, 1, y, 1); \ |
| } |
| #endif // CAFFE2_USE_MKL |
| CAFFE2_SPECIALIZED_AXPBY(float, s) |
| #undef CAFFE2_SPECIALIZED_AXPBY |
| |
| #endif // CAFFE2_USE_EIGEN_FOR_BLAS |
| |
| #define CAFFE2_SPECIALIZED_SCALE(TAlpha, TData) \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const int n, \ |
| const TAlpha alpha, \ |
| const TData* x, \ |
| TData* y, \ |
| CPUContext* /* context */) { \ |
| EigenVectorMap<TData>(y, n) = \ |
| ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(alpha); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Scale<TAlpha, TData, CPUContext>( \ |
| const int n, \ |
| const TAlpha* alpha, \ |
| const TData* x, \ |
| TData* y, \ |
| CPUContext* /* context */) { \ |
| EigenVectorMap<TData>(y, n) = \ |
| ConstEigenVectorMap<TData>(x, n) * static_cast<TData>(*alpha); \ |
| } |
| #ifdef CAFFE2_USE_EIGEN_FOR_BLAS |
| CAFFE2_SPECIALIZED_SCALE(float, float) |
| CAFFE2_SPECIALIZED_SCALE(double, double) |
| CAFFE2_SPECIALIZED_SCALE(float, double) |
| #endif // CAFFE2_USE_EIGEN_FOR_BLAS |
| CAFFE2_SPECIALIZED_SCALE(std::int32_t, std::int32_t) |
| CAFFE2_SPECIALIZED_SCALE(std::int64_t, std::int64_t) |
| #undef CAFFE2_SPECIALIZED_SCALE |
| |
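// GemmBatched performs batch_size independent GEMMs of identical shape,
// C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i], where A, B, and C are
// arrays of pointers to the per-batch matrices. With MKL this maps onto a
// single cblas_sgemm_batch call; otherwise it falls back to a loop over Gemm.
//
// A hypothetical call with two 2 x 2 matrices per operand:
//
//   const float* A_array[2] = {A0, A1};
//   const float* B_array[2] = {B0, B1};
//   float* C_array[2] = {C0, C1};
//   math::GemmBatched<float, CPUContext>(
//       CblasNoTrans, CblasNoTrans, 2, 2, 2, 2, 1.0f, A_array, B_array, 0.0f,
//       C_array, &context);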
| template <> |
| C10_EXPORT void GemmBatched<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float** A, |
| const float** B, |
| const float beta, |
| float** C, |
| CPUContext* context, |
| TensorProto::DataType /* math_type */) { |
| #ifdef CAFFE2_USE_MKL |
| (void)context; |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const int ldc = N; |
| cblas_sgemm_batch( |
| CblasRowMajor, |
| &trans_A, |
| &trans_B, |
| &M, |
| &N, |
| &K, |
| &alpha, |
| A, |
| &lda, |
| B, |
| &ldb, |
| &beta, |
| C, |
| &ldc, |
| 1, |
| &batch_size); |
| #else // CAFFE2_USE_MKL |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float, CPUContext>( |
| trans_A, trans_B, M, N, K, alpha, A[i], B[i], beta, C[i], context); |
| } |
| #endif // CAFFE2_USE_MKL |
| } |
| |
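// GemmStridedBatched is the strided variant of GemmBatched: the i-th matrices
// are located at A + i * A_stride, B + i * B_stride, and C + i * C_stride
// inside single contiguous buffers, so no pointer arrays are needed.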
| template <> |
| C10_EXPORT void GemmStridedBatched<float, CPUContext>( |
| const CBLAS_TRANSPOSE trans_A, |
| const CBLAS_TRANSPOSE trans_B, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int A_stride, |
| const float* B, |
| const int B_stride, |
| const float beta, |
| float* C, |
| const int C_stride, |
| CPUContext* context, |
| TensorProto::DataType /* math_type */) { |
| #ifdef CAFFE2_USE_MKL |
| (void)context; |
| const int lda = (trans_A == CblasNoTrans) ? K : M; |
| const int ldb = (trans_B == CblasNoTrans) ? N : K; |
| const int ldc = N; |
| std::vector<const float*> A_array(batch_size); |
| std::vector<const float*> B_array(batch_size); |
| std::vector<float*> C_array(batch_size); |
| for (int i = 0; i < batch_size; ++i) { |
| A_array[i] = A + i * A_stride; |
| B_array[i] = B + i * B_stride; |
| C_array[i] = C + i * C_stride; |
| } |
| cblas_sgemm_batch( |
| CblasRowMajor, |
| &trans_A, |
| &trans_B, |
| &M, |
| &N, |
| &K, |
| &alpha, |
| A_array.data(), |
| &lda, |
| B_array.data(), |
| &ldb, |
| &beta, |
| C_array.data(), |
| &ldc, |
| 1, |
| &batch_size); |
| #else // CAFFE2_USE_MKL |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float, CPUContext>( |
| trans_A, trans_B, M, N, K, alpha, A, B, beta, C, context); |
| A += A_stride; |
| B += B_stride; |
| C += C_stride; |
| } |
| #endif |
| } |
| |
| //////////////////////////////////////////////////////////////////////////////// |
| // MKL VML alternatives. |
// Depending on whether MKL is in use, the VML-related Caffe2 math functions are
// delegated to either the VML call or the Eigen implementation. With the right
// compiler flags for your CPU architecture (such as AVX), Eigen usually
// delivers throughput comparable to the VML functions.
| //////////////////////////////////////////////////////////////////////////////// |
| #ifdef CAFFE2_USE_MKL |
| |
| #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) \ |
| template <> \ |
| C10_EXPORT void Funcname<T, CPUContext>( \ |
| const int N, const T* x, T* y, CPUContext*) { \ |
| OriginalFunc(N, x, y, ##__VA_ARGS__); \ |
| } |
| DELEGATE_SIMPLE_UNARY_FUNCTION( |
| float, |
| Exp, |
| vmsExp, |
| VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE) |
| DELEGATE_SIMPLE_UNARY_FUNCTION( |
| double, |
| Exp, |
| vmdExp, |
| VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, vsLn) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, vdLn) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, vsCos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, vdCos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Acos, vsAcos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Acos, vdAcos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, vsSin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, vdSin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Asin, vsAsin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Asin, vdAsin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tan, vsTan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tan, vdTan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Atan, vsAtan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Atan, vdAtan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sinh, vsSinh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sinh, vdSinh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cosh, vsCosh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cosh, vdCosh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tanh, vsTanh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tanh, vdTanh) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Abs, vsAbs) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Abs, vdAbs) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, vsSqr) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, vdSqr) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, vsSqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, vdSqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Rsqrt, vsInvSqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, vdInvSqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cbrt, vsCbrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cbrt, vdCbrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Inv, vsInv) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Inv, vdInv) |
| #undef DELEGATE_SIMPLE_UNARY_FUNCTION |
| |
| #define DELEGATE_SINCOS_FUNCTION(T, OriginalFunc) \ |
| template <> \ |
| C10_EXPORT void SinCos<T, CPUContext>( \ |
| const int N, const T* a, T* ys, T* yc, CPUContext*) { \ |
| OriginalFunc(N, a, ys, yc); \ |
| } |
| DELEGATE_SINCOS_FUNCTION(float, vsSinCos) |
| DELEGATE_SINCOS_FUNCTION(double, vdSinCos) |
| #undef DELEGATE_SINCOS_FUNCTION |
| |
| #define DELEGATE_POWX_FUNCTION(T, OriginalFunc) \ |
| template <> \ |
| C10_EXPORT void Powx<T, CPUContext>( \ |
| const int N, const T* a, T b, T* y, CPUContext*) { \ |
| OriginalFunc(N, a, b, y); \ |
| } |
| DELEGATE_POWX_FUNCTION(float, vsPowx) |
| DELEGATE_POWX_FUNCTION(double, vdPowx) |
| #undef DELEGATE_POWX_FUNCTION |
| |
| #define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Func, FuncImpl) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* A, const T* B, T* C, CPUContext*) { \ |
| FuncImpl(N, A, B, C); \ |
| } |
| DELEGATE_SIMPLE_BINARY_FUNCTION(float, Add, vsAdd) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(float, Sub, vsSub) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(float, Mul, vsMul) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(float, Div, vsDiv) |
| DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv) |
| #undef DELEGATE_SIMPLE_BINARY_FUNCTION |
| |
| #else // CAFFE2_USE_MKL |
| |
| #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ |
| template <> \ |
| C10_EXPORT void Funcname<T, CPUContext>( \ |
| const int N, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(x, N).expr(); \ |
| } |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, log) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Cos, cos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Cos, cos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Acos, acos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Acos, acos) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sin, sin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sin, sin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Asin, asin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Asin, asin) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Tan, tan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Tan, tan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Atan, atan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Atan, atan) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, square) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqrt, sqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqrt, sqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(float, Rsqrt, rsqrt) |
| DELEGATE_SIMPLE_UNARY_FUNCTION(double, Rsqrt, rsqrt) |
| |
| #undef DELEGATE_SIMPLE_UNARY_FUNCTION |
| |
| #define DELEGATE_SINCOS_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void SinCos<T, CPUContext>( \ |
| const int N, const T* x, T* ys, T* yc, CPUContext*) { \ |
| EigenVectorMap<T>(ys, N) = ConstEigenVectorArrayMap<T>(x, N).sin(); \ |
| EigenVectorMap<T>(yc, N) = ConstEigenVectorArrayMap<T>(x, N).cos(); \ |
| } |
| DELEGATE_SINCOS_FUNCTION(float) |
| DELEGATE_SINCOS_FUNCTION(double) |
| #undef DELEGATE_SINCOS_FUNCTION |
| |
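// Tanh is computed via the identity tanh(x) = 1 - 2 / (exp(2 * x) + 1), which
// needs only a single exp evaluation per element.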
| #define DELEGATE_TANH_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Tanh<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext*) { \ |
| EigenVectorMap<T>(Y, N) = T(1) - \ |
| ((ConstEigenVectorArrayMap<T>(X, N) * T(2)).exp() + T(1)).inverse() * \ |
| T(2); \ |
| } |
| DELEGATE_TANH_FUNCTION(float) |
| DELEGATE_TANH_FUNCTION(double) |
| #undef DELEGATE_TANH_FUNCTION |
| |
| #define DELEGATE_CBRT_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Cbrt<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext*) { \ |
| std::transform(X, X + N, Y, [](const T x) { return cbrt(x); }); \ |
| } |
| DELEGATE_CBRT_FUNCTION(float) |
| DELEGATE_CBRT_FUNCTION(double) |
| #undef DELEGATE_CBRT_FUNCTION |
| |
| #define DELEGATE_POWX_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Powx<T, CPUContext>( \ |
| const int N, const T* a, const T b, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(a, N).pow(b); \ |
| } |
| DELEGATE_POWX_FUNCTION(float) |
| #undef DELEGATE_POWX_FUNCTION |
| |
| #define DELEGATE_SINH_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Sinh<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext*) { \ |
| ConstEigenVectorArrayMap<T> X_arr(X, N); \ |
| EigenVectorMap<T>(Y, N) = (X_arr.exp() - (-X_arr).exp()) / 2; \ |
| } |
| DELEGATE_SINH_FUNCTION(float) |
| DELEGATE_SINH_FUNCTION(double) |
| #undef DELEGATE_SINH_FUNCTION |
| |
| #define DELEGATE_COSH_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Cosh<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext*) { \ |
| ConstEigenVectorArrayMap<T> X_arr(X, N); \ |
| EigenVectorMap<T>(Y, N) = (X_arr.exp() + (-X_arr).exp()) / 2; \ |
| } |
| DELEGATE_COSH_FUNCTION(float) |
| DELEGATE_COSH_FUNCTION(double) |
| #undef DELEGATE_COSH_FUNCTION |
| |
| #define DELEGATE_INV_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Inv<T, CPUContext>( \ |
| const int N, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(x, N).inverse(); \ |
| } |
| DELEGATE_INV_FUNCTION(float) |
| DELEGATE_INV_FUNCTION(double) |
| #undef DELEGATE_INV_FUNCTION |
| |
| #endif // CAFFE2_USE_MKL |
| |
| #define DELEGATE_NEG_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Neg<T, CPUContext>( \ |
| const int N, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, N) = -ConstEigenVectorMap<T>(x, N); \ |
| } |
| DELEGATE_NEG_FUNCTION(float) |
| DELEGATE_NEG_FUNCTION(double) |
| DELEGATE_NEG_FUNCTION(std::int32_t) |
| DELEGATE_NEG_FUNCTION(std::int64_t) |
| #undef DELEGATE_NEG_FUNCTION |
| |
| #define DELEGATE_SIGN_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Sign<T, CPUContext>( \ |
| const int N, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(x, N).sign(); \ |
| } |
| DELEGATE_SIGN_FUNCTION(float) |
| DELEGATE_SIGN_FUNCTION(double) |
| DELEGATE_SIGN_FUNCTION(std::int32_t) |
| DELEGATE_SIGN_FUNCTION(std::int64_t) |
| #undef DELEGATE_SIGN_FUNCTION |
| |
| #define DELEGATE_ABS_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Abs<T, CPUContext>( \ |
| const int N, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, N) = ConstEigenVectorArrayMap<T>(x, N).abs(); \ |
| } |
| #ifndef CAFFE2_USE_MKL |
| DELEGATE_ABS_FUNCTION(float) |
| DELEGATE_ABS_FUNCTION(double) |
| #endif // CAFFE2_USE_MKL |
| DELEGATE_ABS_FUNCTION(std::int32_t) |
| DELEGATE_ABS_FUNCTION(std::int64_t) |
| #undef DELEGATE_ABS_FUNCTION |
| |
| #define DELEGATE_CUBE_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void Cube<T, CPUContext>( \ |
| const int N, const T* X, T* Y, CPUContext*) { \ |
| EigenVectorMap<T>(Y, N) = ConstEigenVectorArrayMap<T>(X, N).cube(); \ |
| } |
| DELEGATE_CUBE_FUNCTION(float) |
| DELEGATE_CUBE_FUNCTION(double) |
| DELEGATE_CUBE_FUNCTION(std::int32_t) |
| DELEGATE_CUBE_FUNCTION(std::int64_t) |
| #undef DELEGATE_CUBE_FUNCTION |
| |
| #define EIGEN_SIMPLE_BINARY_FUNCTION(T, Func, expr) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int N, const T* A, const T* B, T* C, CPUContext*) { \ |
| EigenVectorMap<T>(C, N) = ConstEigenVectorArrayMap<T>(A, N) \ |
| expr ConstEigenVectorArrayMap<T>(B, N); \ |
| } |
| |
| #ifdef CAFFE2_USE_MKL |
| |
| #define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \ |
| EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \ |
| EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr) |
| |
| #else |
| |
| #define DEFINE_SIMPLE_BINARY_FUNCTION(Func, expr) \ |
| EIGEN_SIMPLE_BINARY_FUNCTION(float, Func, expr) \ |
| EIGEN_SIMPLE_BINARY_FUNCTION(double, Func, expr) \ |
| EIGEN_SIMPLE_BINARY_FUNCTION(std::int32_t, Func, expr) \ |
| EIGEN_SIMPLE_BINARY_FUNCTION(std::int64_t, Func, expr) |
| |
| #endif |
| |
| DEFINE_SIMPLE_BINARY_FUNCTION(Add, +) |
| DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -) |
| DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *) |
| DEFINE_SIMPLE_BINARY_FUNCTION(Div, /) |
| |
| #undef DEFINE_SIMPLE_BINARY_FUNCTION |
| #undef EIGEN_SIMPLE_BINARY_FUNCTION |
| |
| //////////////////////////////////////////////////////////////////////////////// |
// Common math functions used in Caffe2 that do not have a BLAS or MKL
// equivalent. These functions are implemented either via Eigen or via custom
// code.
| //////////////////////////////////////////////////////////////////////////////// |
| |
| #define CAFFE2_SPECIALIZED_SET(T) \ |
| template <> \ |
| C10_EXPORT void Set<T, CPUContext>( \ |
| const size_t N, const T alpha, T* Y, CPUContext*) { \ |
| if (N == 0) { \ |
| return; \ |
| } \ |
| if (alpha == (T)0) { \ |
| if (Y != nullptr) { \ |
| std::memset(Y, 0, N * sizeof(T)); \ |
| } \ |
| } else { \ |
| EigenVectorMap<T>(Y, N).setConstant(alpha); \ |
| } \ |
| } |
| |
| CAFFE2_SPECIALIZED_SET(float); |
| CAFFE2_SPECIALIZED_SET(double); |
| CAFFE2_SPECIALIZED_SET(int8_t); |
| CAFFE2_SPECIALIZED_SET(int16_t); |
| CAFFE2_SPECIALIZED_SET(int); |
| CAFFE2_SPECIALIZED_SET(int64_t); |
| CAFFE2_SPECIALIZED_SET(bool); |
| CAFFE2_SPECIALIZED_SET(char); |
| CAFFE2_SPECIALIZED_SET(uint8_t); |
| CAFFE2_SPECIALIZED_SET(uint16_t); |
| #undef CAFFE2_SPECIALIZED_SET |
| |
| #define CAFFE2_SPECIALIZED_REDUCEMIN(T) \ |
| template <> \ |
| C10_EXPORT void ReduceMin<T, CPUContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| Tensor* /*scratch_ptr*/, \ |
| CPUContext* /*context*/) { \ |
| *y = ConstEigenVectorArrayMap<T>(x, N).minCoeff(); \ |
| } |
| CAFFE2_SPECIALIZED_REDUCEMIN(float) |
| #undef CAFFE2_SPECIALIZED_REDUCEMIN |
| |
| #define CAFFE2_SPECIALIZED_REDUCEMAX(T) \ |
| template <> \ |
| C10_EXPORT void ReduceMax<T, CPUContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| Tensor* /*scratch_ptr*/, \ |
| CPUContext* /*context*/) { \ |
| *y = ConstEigenVectorArrayMap<T>(x, N).maxCoeff(); \ |
| } |
| CAFFE2_SPECIALIZED_REDUCEMAX(float) |
| CAFFE2_SPECIALIZED_REDUCEMAX(int32_t) |
| CAFFE2_SPECIALIZED_REDUCEMAX(int64_t) |
| |
| #undef CAFFE2_SPECIALIZED_REDUCEMAX |
| |
| namespace { |
| |
| template <typename T> |
| struct MinFunctor { |
| inline T operator()(const T a, const T b) const { |
| return std::min(a, b); |
| } |
| }; |
| |
| template <typename T> |
| struct MaxFunctor { |
| inline T operator()(const T a, const T b) const { |
| return std::max(a, b); |
| } |
| }; |
| |
| template <typename T> |
| struct L1NormFunctor { |
| inline T operator()(const T a, const T b) const { |
| return a + std::abs(b); |
| } |
| }; |
| |
| template <typename T> |
| struct SquaredL2NormFunctor { |
| inline T operator()(const T a, const T b) const { |
| return a + b * b; |
| } |
| }; |
| |
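// The row-major X of shape (rows, cols) is mapped as a column-major Eigen
// matrix of shape (cols, rows), so a rowwise reduction of X corresponds to
// Eigen's colwise() and a colwise reduction to Eigen's rowwise().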
| #define DELEGATE_ROWWISE_REDUCE_FUNCTION(Func, EigenOp) \ |
| template <typename T> \ |
| C10_EXPORT void Rowwise##Func( \ |
| const int rows, const int cols, const T alpha, const T* X, T* Y) { \ |
| EigenVectorMap<T>(Y, rows) = \ |
| ConstEigenMatrixMap<T>(X, cols, rows).colwise().EigenOp() * alpha; \ |
| } |
| DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMin, minCoeff) |
| DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff) |
| DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceSum, sum) |
| DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceMean, mean) |
| DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>); |
| DELEGATE_ROWWISE_REDUCE_FUNCTION(ReduceL2, norm) |
| #undef DELEGATE_ROWWISE_REDUCE_FUNCTION |
| |
| #define DELEGATE_COLWISE_REDUCE_FUNCTION(Func, EigenOp) \ |
| template <typename T> \ |
| C10_EXPORT void Colwise##Func( \ |
| const int rows, const int cols, const T alpha, const T* X, T* Y) { \ |
| EigenVectorMap<T>(Y, cols) = \ |
| ConstEigenMatrixMap<T>(X, cols, rows).rowwise().EigenOp() * alpha; \ |
| } |
| DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMin, minCoeff) |
| DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMax, maxCoeff) |
| DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceSum, sum) |
| DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceMean, mean) |
| DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL1, template lpNorm<1>); |
| DELEGATE_COLWISE_REDUCE_FUNCTION(ReduceL2, norm) |
| #undef DELEGATE_COLWISE_REDUCE_FUNCTION |
| |
| template <typename T> |
| C10_EXPORT void BothEndsReduceMin( |
| const int pre, |
| const int mid, |
| const int nxt, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| EigenVectorArrayMap<T> Y_arr(Y, mid); |
| Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().minCoeff(); |
| const T* X_ptr = X + mid * nxt; |
// There appears to be a bug in Eigen's array min reduction, so this cannot be
// written the same way as BothEndsReduceSum below; instead each remaining
// slice is reduced with minCoeff and combined via std::min.
| for (int i = 1; i < pre; ++i) { |
| for (int j = 0; j < mid; ++j) { |
| Y[j] = std::min(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).minCoeff()); |
| X_ptr += nxt; |
| } |
| } |
| if (alpha != T(1)) { |
| Y_arr *= alpha; |
| } |
| } |
| |
| template <typename T> |
| C10_EXPORT void BothEndsReduceMax( |
| const int pre, |
| const int mid, |
| const int nxt, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| EigenVectorArrayMap<T> Y_arr(Y, mid); |
| Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().maxCoeff(); |
| const T* X_ptr = X + mid * nxt; |
| for (int i = 1; i < pre; ++i) { |
| for (int j = 0; j < mid; ++j) { |
| Y[j] = std::max(Y[j], ConstEigenVectorArrayMap<T>(X_ptr, nxt).maxCoeff()); |
| X_ptr += nxt; |
| } |
| } |
| if (alpha != T(1)) { |
| Y_arr *= alpha; |
| } |
| } |
| |
| template <typename T> |
| C10_EXPORT void BothEndsReduceSum( |
| const int pre, |
| const int mid, |
| const int nxt, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| EigenVectorArrayMap<T> Y_arr(Y, mid); |
| Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().sum(); |
| const int stride = mid * nxt; |
| const T* X_ptr = X + stride; |
| for (int i = 1; i < pre; ++i) { |
| Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().sum(); |
| X_ptr += stride; |
| } |
| if (alpha != T(1)) { |
| Y_arr *= alpha; |
| } |
| } |
| |
| template <typename T> |
| C10_EXPORT void BothEndsReduceMean( |
| const int pre, |
| const int mid, |
| const int nxt, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| EigenVectorArrayMap<T> Y_arr(Y, mid); |
| Y_arr = ConstEigenArrayMap<T>(X, nxt, mid).colwise().mean(); |
| const int stride = mid * nxt; |
| const T* X_ptr = X + stride; |
| for (int i = 1; i < pre; ++i) { |
| Y_arr += ConstEigenArrayMap<T>(X_ptr, nxt, mid).colwise().mean(); |
| X_ptr += stride; |
| } |
| if (alpha / static_cast<T>(pre) != 1) { |
| Y_arr *= alpha / static_cast<T>(pre); |
| } |
| } |
| |
| template <typename T> |
| C10_EXPORT void BothEndsReduceL1( |
| const int pre, |
| const int mid, |
| const int nxt, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| EigenVectorArrayMap<T> Y_arr(Y, mid); |
| Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid) |
| .colwise() |
| .template lpNorm<1>() |
| .array(); |
| const int stride = mid * nxt; |
| const T* X_ptr = X + stride; |
| for (int i = 1; i < pre; ++i) { |
| Y_arr += ConstEigenMatrixMap<T>(X_ptr, nxt, mid) |
| .colwise() |
| .template lpNorm<1>() |
| .array(); |
| X_ptr += stride; |
| } |
| if (alpha != T(1)) { |
| Y_arr *= alpha; |
| } |
| } |
| |
| template <typename T> |
| C10_EXPORT void BothEndsReduceL2( |
| const int pre, |
| const int mid, |
| const int nxt, |
| const T alpha, |
| const T* X, |
| T* Y) { |
| EigenVectorArrayMap<T> Y_arr(Y, mid); |
| Y_arr = ConstEigenMatrixMap<T>(X, nxt, mid).colwise().squaredNorm().array(); |
| const int stride = mid * nxt; |
| const T* X_ptr = X + stride; |
| for (int i = 1; i < pre; ++i) { |
| Y_arr += |
| ConstEigenMatrixMap<T>(X_ptr, nxt, mid).colwise().squaredNorm().array(); |
| X_ptr += stride; |
| } |
| Y_arr = Y_arr.sqrt() * alpha; |
| } |
| |
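// Generic fallback: Y is initialized to `init`, then every element of X is
// folded into the output position obtained by broadcasting its index against
// Y_dims, and the result is finally scaled by alpha.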
| template <typename T, class Reducer> |
| C10_EXPORT void ReduceTensor( |
| const int ndim, |
| const int* X_dims, |
| const int* Y_dims, |
| const Reducer& reducer, |
| const T init, |
| const T alpha, |
| const T* X, |
| T* Y, |
| CPUContext* context) { |
| const int X_size = |
| std::accumulate(X_dims, X_dims + ndim, 1, std::multiplies<int>()); |
| const int Y_size = |
| std::accumulate(Y_dims, Y_dims + ndim, 1, std::multiplies<int>()); |
| Set<T, CPUContext>(Y_size, init, Y, context); |
| std::vector<int> index(ndim, 0); |
| for (int X_index = 0; X_index < X_size; ++X_index) { |
| const int Y_index = utils::GetIndexFromDims(ndim, Y_dims, index.data()); |
| Y[Y_index] = reducer(Y[Y_index], X[X_index]); |
| utils::IncreaseIndexInDims(ndim, X_dims, index.data()); |
| } |
| Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); |
| } |
| |
| } // namespace |
| |
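// Each reduction first handles the degenerate cases (empty input, zero alpha,
// or no reduced dimensions), then tries the specialized rowwise, colwise, and
// both-ends kernels, and only falls back to the generic ReduceTensor loop when
// none of those shapes apply.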
| #define DELEGATE_REDUCE_FUNCTION(T, Func, reducer, init, is_norm) \ |
| template <> \ |
| C10_EXPORT void Func<T, CPUContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T alpha, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* context) { \ |
| CAFFE_ENFORCE_LE(num_axes, num_dims); \ |
| std::vector<int> Y_dims_vector(dims, dims + num_dims); \ |
| for (int i = 0; i < num_axes; ++i) { \ |
| Y_dims_vector[axes[i]] = 1; \ |
| } \ |
| const int* X_dims = dims; \ |
| const int* Y_dims = Y_dims_vector.data(); \ |
| const int X_size = \ |
| std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \ |
| const int Y_size = \ |
| std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \ |
| if (X_size == 0) { \ |
| Set<T, CPUContext>(Y_size, alpha * init, Y, context); \ |
| return; \ |
| } \ |
| if (alpha == T(0)) { \ |
| Set<T, CPUContext>(Y_size, 0, Y, context); \ |
| return; \ |
| } \ |
| if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \ |
| if (is_norm) { \ |
| Abs<T, CPUContext>(X_size, X, Y, context); \ |
| Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \ |
| } else { \ |
| Scale<T, T, CPUContext>(Y_size, alpha, X, Y, context); \ |
| } \ |
| return; \ |
| } \ |
| int rows; \ |
| int cols; \ |
| if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \ |
| Rowwise##Func<T>(rows, cols, alpha, X, Y); \ |
| return; \ |
| } \ |
| if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \ |
| Colwise##Func<T>(rows, cols, alpha, X, Y); \ |
| return; \ |
| } \ |
| int pre; \ |
| int mid; \ |
| int nxt; \ |
| if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \ |
| BothEnds##Func<T>(pre, mid, nxt, alpha, X, Y); \ |
| return; \ |
| } \ |
| ReduceTensor( \ |
| num_dims, X_dims, Y_dims, reducer, init, alpha, X, Y, context); \ |
| } |
| |
| DELEGATE_REDUCE_FUNCTION( |
| float, |
| ReduceMin, |
| MinFunctor<float>(), |
| std::numeric_limits<float>::max(), |
| false) |
| DELEGATE_REDUCE_FUNCTION( |
| double, |
| ReduceMin, |
| MinFunctor<double>(), |
| std::numeric_limits<double>::max(), |
| false) |
| DELEGATE_REDUCE_FUNCTION( |
| std::int32_t, |
| ReduceMin, |
| MinFunctor<std::int32_t>(), |
| std::numeric_limits<std::int32_t>::max(), |
| false) |
| DELEGATE_REDUCE_FUNCTION( |
| std::int64_t, |
| ReduceMin, |
| MinFunctor<std::int64_t>(), |
| std::numeric_limits<std::int64_t>::max(), |
| false) |
| |
| DELEGATE_REDUCE_FUNCTION( |
| float, |
| ReduceMax, |
| MaxFunctor<float>(), |
| std::numeric_limits<float>::lowest(), |
| false) |
| DELEGATE_REDUCE_FUNCTION( |
| double, |
| ReduceMax, |
| MaxFunctor<double>(), |
| std::numeric_limits<double>::lowest(), |
| false) |
| DELEGATE_REDUCE_FUNCTION( |
| std::int32_t, |
| ReduceMax, |
| MaxFunctor<std::int32_t>(), |
| std::numeric_limits<std::int32_t>::lowest(), |
| false) |
| DELEGATE_REDUCE_FUNCTION( |
| std::int64_t, |
| ReduceMax, |
| MaxFunctor<std::int64_t>(), |
| std::numeric_limits<std::int64_t>::lowest(), |
| false) |
| |
| DELEGATE_REDUCE_FUNCTION(float, ReduceSum, std::plus<float>(), 0.0f, false) |
| DELEGATE_REDUCE_FUNCTION(double, ReduceSum, std::plus<double>(), 0.0, false) |
| DELEGATE_REDUCE_FUNCTION( |
| std::int32_t, |
| ReduceSum, |
| std::plus<std::int32_t>(), |
| 0, |
| false) |
| DELEGATE_REDUCE_FUNCTION( |
| std::int64_t, |
| ReduceSum, |
| std::plus<std::int64_t>(), |
| std::int64_t(0), |
| false) |
| |
| DELEGATE_REDUCE_FUNCTION(float, ReduceL1, L1NormFunctor<float>(), 0.0f, true) |
| DELEGATE_REDUCE_FUNCTION(double, ReduceL1, L1NormFunctor<double>(), 0.0, true) |
| DELEGATE_REDUCE_FUNCTION( |
| std::int32_t, |
| ReduceL1, |
| L1NormFunctor<std::int32_t>(), |
| 0, |
| true) |
| DELEGATE_REDUCE_FUNCTION( |
| std::int64_t, |
| ReduceL1, |
| L1NormFunctor<std::int64_t>(), |
| std::int64_t(0), |
| true) |
| |
| #undef DELEGATE_REDUCE_FUNCTION |
| |
| #define CAFFE2_SPECIALIZED_REDUCE_MEAN(T) \ |
| template <> \ |
| C10_EXPORT void ReduceMean<T, CPUContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T alpha, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* context) { \ |
| CAFFE_ENFORCE_LE(num_axes, num_dims); \ |
| std::vector<int> Y_dims_vector(dims, dims + num_dims); \ |
| for (int i = 0; i < num_axes; ++i) { \ |
| Y_dims_vector[axes[i]] = 1; \ |
| } \ |
| const int* X_dims = dims; \ |
| const int* Y_dims = Y_dims_vector.data(); \ |
| const int X_size = \ |
| std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \ |
| const int Y_size = \ |
| std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \ |
| if (X_size == 0) { \ |
| Set<T, CPUContext>(Y_size, 0, Y, context); \ |
| return; \ |
| } \ |
| if (alpha == T(0)) { \ |
| Set<T, CPUContext>(Y_size, 0, Y, context); \ |
| return; \ |
| } \ |
| if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \ |
| Scale<T, T, CPUContext>(X_size, alpha, X, Y, context); \ |
| return; \ |
| } \ |
| int rows; \ |
| int cols; \ |
| if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \ |
| RowwiseReduceMean<T>(rows, cols, alpha, X, Y); \ |
| return; \ |
| } \ |
| if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \ |
| ColwiseReduceMean<T>(rows, cols, alpha, X, Y); \ |
| return; \ |
| } \ |
| int pre; \ |
| int mid; \ |
| int nxt; \ |
| if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \ |
| BothEndsReduceMean<T>(pre, mid, nxt, alpha, X, Y); \ |
| return; \ |
| } \ |
| const int scale = X_size / Y_size; \ |
| ReduceTensor( \ |
| num_dims, \ |
| X_dims, \ |
| Y_dims, \ |
| std::plus<T>(), \ |
| T(0), \ |
| alpha / static_cast<T>(scale), \ |
| X, \ |
| Y, \ |
| context); \ |
| } |
| CAFFE2_SPECIALIZED_REDUCE_MEAN(float) |
| CAFFE2_SPECIALIZED_REDUCE_MEAN(double) |
| #undef CAFFE2_SPECIALIZED_REDUCE_MEAN |
| |
| #define CAFFE2_SPECIALIZED_REDUCE_L2(T) \ |
| template <> \ |
| C10_EXPORT void ReduceL2<T, CPUContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T alpha, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* context) { \ |
| CAFFE_ENFORCE_LE(num_axes, num_dims); \ |
| std::vector<int> Y_dims_vector(dims, dims + num_dims); \ |
| for (int i = 0; i < num_axes; ++i) { \ |
| Y_dims_vector[axes[i]] = 1; \ |
| } \ |
| const int* X_dims = dims; \ |
| const int* Y_dims = Y_dims_vector.data(); \ |
| const int X_size = \ |
| std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); \ |
| const int Y_size = \ |
| std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); \ |
| if (X_size == 0) { \ |
| Set<T, CPUContext>(Y_size, 0, Y, context); \ |
| return; \ |
| } \ |
| if (alpha == T(0)) { \ |
| Set<T, CPUContext>(Y_size, 0, Y, context); \ |
| return; \ |
| } \ |
| if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { \ |
| Abs<T, CPUContext>(X_size, X, Y, context); \ |
| Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \ |
| return; \ |
| } \ |
| int rows; \ |
| int cols; \ |
| if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \ |
| RowwiseReduceL2<T>(rows, cols, alpha, X, Y); \ |
| return; \ |
| } \ |
| if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { \ |
| ColwiseReduceL2<T>(rows, cols, alpha, X, Y); \ |
| return; \ |
| } \ |
| int pre; \ |
| int mid; \ |
| int nxt; \ |
| if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { \ |
| BothEndsReduceL2<T>(pre, mid, nxt, alpha, X, Y); \ |
| return; \ |
| } \ |
| ReduceTensor( \ |
| num_dims, \ |
| X_dims, \ |
| Y_dims, \ |
| SquaredL2NormFunctor<T>(), \ |
| T(0), \ |
| T(1), \ |
| X, \ |
| Y, \ |
| context); \ |
| Sqrt<T, CPUContext>(Y_size, Y, Y, context); \ |
| Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); \ |
| } |
| CAFFE2_SPECIALIZED_REDUCE_L2(float) |
| CAFFE2_SPECIALIZED_REDUCE_L2(double) |
| #undef CAFFE2_SPECIALIZED_REDUCE_L2 |
| |
| namespace { |
| |
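// BroadcastImpl left-pads X_dims with 1s to the rank of Y, checks that every
// dimension either matches Y or is 1, copies X into Y following numpy-style
// broadcasting, and finally scales Y by alpha.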
| template <typename T> |
| C10_EXPORT void BroadcastImpl( |
| const int X_ndim, |
| const int* X_dims, |
| const int Y_ndim, |
| const int* Y_dims, |
| const T alpha, |
| const T* X, |
| T* Y, |
| CPUContext* context) { |
| CAFFE_ENFORCE_LE(X_ndim, Y_ndim); |
| std::vector<int> X_dims_vector(Y_ndim); |
| const int d = Y_ndim - X_ndim; |
| std::fill(X_dims_vector.begin(), X_dims_vector.begin() + d, 1); |
| for (int i = d; i < Y_ndim; ++i) { |
| CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]); |
| X_dims_vector[i] = X_dims[i - d]; |
| } |
| X_dims = X_dims_vector.data(); |
| const int Y_size = |
| std::accumulate(Y_dims, Y_dims + Y_ndim, 1, std::multiplies<int>()); |
| std::vector<int> index(Y_ndim, 0); |
| for (int Y_index = 0; Y_index < Y_size; ++Y_index) { |
| const int X_index = utils::GetIndexFromDims(Y_ndim, X_dims, index.data()); |
| Y[Y_index] = X[X_index]; |
| utils::IncreaseIndexInDims(Y_ndim, Y_dims, index.data()); |
| } |
| Scale<T, T, CPUContext>(Y_size, alpha, Y, Y, context); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_BROADCAST(T) \ |
| template <> \ |
| C10_EXPORT void Broadcast<T, CPUContext>( \ |
| const int X_ndim, \ |
| const int* X_dims, \ |
| const int Y_ndim, \ |
| const int* Y_dims, \ |
| const T alpha, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* context) { \ |
| BroadcastImpl<T>(X_ndim, X_dims, Y_ndim, Y_dims, alpha, X, Y, context); \ |
| } |
| CAFFE2_SPECIALIZED_BROADCAST(std::int32_t) |
| CAFFE2_SPECIALIZED_BROADCAST(std::int64_t) |
| CAFFE2_SPECIALIZED_BROADCAST(float) |
| CAFFE2_SPECIALIZED_BROADCAST(double) |
| #undef CAFFE2_SPECIALIZED_BROADCAST |
| |
| namespace { |
| |
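// The moment helpers below compute the mean and the (biased) variance of X
// over the reduced dimensions using the identity Var[X] = E[X^2] - E[X]^2.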
| template <typename T> |
| C10_EXPORT void RowwiseMoments( |
| const int rows, |
| const int cols, |
| const T* X, |
| T* mean, |
| T* variance) { |
| ConstEigenArrayMap<T> X_arr(X, cols, rows); |
| EigenVectorArrayMap<T> mean_arr(mean, rows); |
| EigenVectorArrayMap<T> var_arr(variance, rows); |
| mean_arr = X_arr.colwise().mean(); |
| var_arr = X_arr.square().colwise().mean() - mean_arr.square().transpose(); |
| } |
| |
| template <typename T> |
| C10_EXPORT void ColwiseMoments( |
| const int rows, |
| const int cols, |
| const T* X, |
| T* mean, |
| T* variance) { |
| std::memset(mean, 0, sizeof(T) * cols); |
| std::memset(variance, 0, sizeof(T) * cols); |
| ConstEigenArrayMap<T> X_arr(X, cols, rows); |
| EigenVectorArrayMap<T> mean_arr(mean, cols); |
| EigenVectorArrayMap<T> var_arr(variance, cols); |
| // Eigen rowwise reduction is about 10 times slower than this for-loop. |
| for (int i = 0; i < rows; ++i) { |
| mean_arr += X_arr.col(i); |
| var_arr += X_arr.col(i).square(); |
| } |
| const T scale = T(1) / static_cast<T>(rows); |
| mean_arr *= scale; |
| var_arr = var_arr * scale - mean_arr.square(); |
| } |
| |
| template <typename T> |
| C10_EXPORT void BothEndsMoments( |
| const int pre, |
| const int mid, |
| const int nxt, |
| const T* X, |
| T* mean, |
| T* variance) { |
| std::memset(mean, 0, sizeof(T) * mid); |
| std::memset(variance, 0, sizeof(T) * mid); |
| EigenVectorArrayMap<T> mean_arr(mean, mid); |
| EigenVectorArrayMap<T> var_arr(variance, mid); |
| ConstEigenArrayMap<T> X_arr(X, nxt, pre * mid); |
| for (int i = 0; i < pre; ++i) { |
| for (int j = 0; j < mid; ++j) { |
| const int c = i * mid + j; |
| mean_arr(j) += X_arr.col(c).sum(); |
| var_arr(j) += X_arr.col(c).square().sum(); |
| } |
| } |
| const T scale = T(1) / static_cast<T>(pre * nxt); |
| mean_arr *= scale; |
| var_arr = var_arr * scale - mean_arr.square(); |
| } |
| |
| template <typename T> |
| C10_EXPORT void MomentsImpl( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const T* X, |
| T* mean, |
| T* variance, |
| CPUContext* context) { |
| std::vector<int> Y_dims_vector(dims, dims + num_dims); |
| for (int i = 0; i < num_axes; ++i) { |
| Y_dims_vector[axes[i]] = 1; |
| } |
| const int* X_dims = dims; |
| const int* Y_dims = Y_dims_vector.data(); |
| const int X_size = |
| std::accumulate(X_dims, X_dims + num_dims, 1, std::multiplies<int>()); |
| const int Y_size = |
| std::accumulate(Y_dims, Y_dims + num_dims, 1, std::multiplies<int>()); |
| if (X_size == 0) { |
| std::memset(mean, 0, sizeof(T) * Y_size); |
| std::memset(variance, 0, sizeof(T) * Y_size); |
| return; |
| } |
| if (std::equal(X_dims, X_dims + num_dims, Y_dims)) { |
| std::memcpy(mean, X, sizeof(T) * Y_size); |
| std::memset(variance, 0, sizeof(T) * Y_size); |
| return; |
| } |
| int rows; |
| int cols; |
| if (utils::IsRowwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { |
| RowwiseMoments<T>(rows, cols, X, mean, variance); |
| return; |
| } |
| if (utils::IsColwiseReduce(num_dims, X_dims, Y_dims, &rows, &cols)) { |
| ColwiseMoments<T>(rows, cols, X, mean, variance); |
| return; |
| } |
| int pre; |
| int mid; |
| int nxt; |
| if (utils::IsBothEndsReduce(num_dims, X_dims, Y_dims, &pre, &mid, &nxt)) { |
| BothEndsMoments<T>(pre, mid, nxt, X, mean, variance); |
| return; |
| } |
| Set<T, CPUContext>(Y_size, T(0), mean, context); |
| Set<T, CPUContext>(Y_size, T(0), variance, context); |
| std::vector<int> index(num_dims, 0); |
| for (int X_index = 0; X_index < X_size; ++X_index) { |
| const int Y_index = utils::GetIndexFromDims(num_dims, Y_dims, index.data()); |
| mean[Y_index] += X[X_index]; |
| variance[Y_index] += X[X_index] * X[X_index]; |
| utils::IncreaseIndexInDims(num_dims, dims, index.data()); |
| } |
| const T scale = static_cast<T>(Y_size) / static_cast<T>(X_size); |
| EigenVectorArrayMap<T> mean_arr(mean, Y_size); |
| EigenVectorArrayMap<T> var_arr(variance, Y_size); |
| mean_arr *= scale; |
| var_arr = |
| var_arr * scale - ConstEigenVectorArrayMap<T>(mean, Y_size).square(); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_MOMENTS(T) \ |
| template <> \ |
| C10_EXPORT void Moments<T, CPUContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* mean, \ |
| T* variance, \ |
| CPUContext* context) { \ |
| MomentsImpl<T>( \ |
| num_dims, dims, num_axes, axes, X, mean, variance, context); \ |
| } |
| CAFFE2_SPECIALIZED_MOMENTS(float) |
| #undef CAFFE2_SPECIALIZED_MOMENTS |
| |
| #define CAFFE2_SPECIALIZED_INV_STD(T) \ |
| template <> \ |
| void InvStd<T, CPUContext>( \ |
| const int N, \ |
| const T epsilon, \ |
| const T* var, \ |
| T* inv_std, \ |
| CPUContext* context) { \ |
| EigenVectorArrayMap<T>(inv_std, N) = \ |
| (ConstEigenVectorArrayMap<T>(var, N) + epsilon).rsqrt(); \ |
| } |
| CAFFE2_SPECIALIZED_INV_STD(float) |
| #undef CAFFE2_SPECIALIZED_INV_STD |
| |
| #define CAFFE2_SPECIALIZED_ROWWISEMAX(T) \ |
| template <> \ |
| C10_EXPORT void RowwiseMax<T, CPUContext>( \ |
| const int N, const int D, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, N) = \ |
| ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff(); \ |
| } |
| CAFFE2_SPECIALIZED_ROWWISEMAX(float) |
| #undef CAFFE2_SPECIALIZED_ROWWISEMAX |
| |
| #define CAFFE2_SPECIALIZED_COLWISEMAX(T) \ |
| template <> \ |
| C10_EXPORT void ColwiseMax<T, CPUContext>( \ |
| const int N, const int D, const T* x, T* y, CPUContext*) { \ |
| EigenVectorMap<T>(y, D) = \ |
| ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff(); \ |
| } |
| CAFFE2_SPECIALIZED_COLWISEMAX(float) |
| #undef CAFFE2_SPECIALIZED_COLWISEMAX |
| |
| #define CAFFE2_SPECIALIZED_ELEMWISEMAX(T) \ |
| template <> \ |
| C10_EXPORT void ElemwiseMax<T, CPUContext>( \ |
| const int N, const T* x, const T* y, T* z, CPUContext* /*context*/) { \ |
| std::transform(x, x + N, y, z, [](const T& x_i, const T& y_i) { \ |
| return std::max(x_i, y_i); \ |
| }); \ |
| } |
| CAFFE2_SPECIALIZED_ELEMWISEMAX(float) |
| #undef CAFFE2_SPECIALIZED_ELEMWISEMAX |
| |
| #define CAFFE2_SPECIALIZED_MAXIMUM(T) \ |
| template <> \ |
| C10_EXPORT void Maximum<T, CPUContext>( \ |
| const int N, const float alpha, const T* x, T* y, CPUContext* context) { \ |
| std::transform( \ |
| x, x + N, y, [&alpha](const T& x_i) { return std::max(x_i, alpha); }); \ |
| } |
| CAFFE2_SPECIALIZED_MAXIMUM(float) |
| #undef CAFFE2_SPECIALIZED_MAXIMUM |
| |
// The implementations below use Eigen, which is column-major, while Caffe2
// tensors are row-major; note the resulting row/column swap in the Eigen maps.
| |
| #define DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ |
| template <> \ |
| C10_EXPORT void Rowwise##Func<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| if (C == B) { \ |
| EigenArrayMap<T>(C, cols, rows).colwise() expr## = \ |
| ConstEigenVectorArrayMap<T>(A, cols); \ |
| } else { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(B, cols, rows) \ |
| .colwise() expr ConstEigenVectorArrayMap<T>(A, cols); \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Colwise##Func<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| if (C == B) { \ |
| EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \ |
| ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ |
| } else { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(B, cols, rows) \ |
| .rowwise() expr ConstEigenVectorArrayMap<T>(A, rows) \ |
| .transpose(); \ |
| } \ |
| } |
| |
| #define DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) \ |
| template <> \ |
| C10_EXPORT void Rowwise##Func<T, CPUContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| if (C == A) { \ |
| EigenArrayMap<T>(C, cols, rows).colwise() expr## = \ |
| ConstEigenVectorArrayMap<T>(B, cols); \ |
| } else { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(A, cols, rows) \ |
| .colwise() expr ConstEigenVectorArrayMap<T>(B, cols); \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void Colwise##Func<T, CPUContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| if (C == A) { \ |
| EigenArrayMap<T>(C, cols, rows).rowwise() expr## = \ |
| ConstEigenVectorArrayMap<T>(B, rows).transpose(); \ |
| } else { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(A, cols, rows) \ |
| .rowwise() expr ConstEigenVectorArrayMap<T>(B, rows) \ |
| .transpose(); \ |
| } \ |
| } |
| |
| #define DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(T, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION(T, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Func, expr) |
| |
| #define DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(float, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(double, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, Func, expr) \ |
| DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, Func, expr) |
| |
| DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Add, +) |
| DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION(Mul, *) |
| |
| #undef DEFINE_EIGEN_2D_BROADCAST_BINARY_FUNCTION |
| #undef DELEGATE_EIGEN_2D_BROADCAST_BINARY_FUNCTION |
| |
| #define DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void RowwiseSub<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| (-ConstEigenArrayMap<T>(B, cols, rows)).colwise() + \ |
| ConstEigenVectorArrayMap<T>(A, cols); \ |
| } \ |
| template <> \ |
| C10_EXPORT void ColwiseSub<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| (-ConstEigenArrayMap<T>(B, cols, rows)).rowwise() + \ |
| ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ |
| } \ |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Sub, -) |
| |
| DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(float) |
| DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(double) |
| DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int32_t) |
| DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION(std::int64_t) |
| |
| #undef DEFINE_EIGEN_2D_BROADCAST_SUB_FUNCTION |
| |
| #define DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(B, cols, rows).inverse().colwise() * \ |
| ConstEigenVectorArrayMap<T>(A, cols); \ |
| } \ |
| template <> \ |
| C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| EigenArrayMap<T>(C, cols, rows) = \ |
| ConstEigenArrayMap<T>(B, cols, rows).inverse().rowwise() * \ |
| ConstEigenVectorArrayMap<T>(A, rows).transpose(); \ |
| } \ |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(T, Div, /) |
| |
| DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(float) |
| DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION(double) |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int32_t, Div, /) |
| DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION(std::int64_t, Div, /) |
| |
| #undef DEFINE_EIGEN_2D_BROADCAST_DIV_FUNCTION |
| |
| #undef DELEGATE_EIGEN_2D_BROADCAST_1ST_BINARY_FUNCTION |
| #undef DELEGATE_EIGEN_2D_BROADCAST_2ND_BINARY_FUNCTION |
| |
| template <> |
| C10_EXPORT void Not<bool, CPUContext>( |
| const int N, |
| const bool* x, |
| bool* y, |
| CPUContext* /*context*/) { |
| for (int i = 0; i < N; ++i) { |
| y[i] = !x[i]; |
| } |
| } |
| |
| #undef C10_DEFINE_BINARY_OP |
| #undef CAFFE2_INSTANTIATE_BINARY_OP |
| |
| #define CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(T) \ |
| template <> \ |
| C10_EXPORT void AddStripedBatch( \ |
| const int N, \ |
| const T* first, \ |
| T* y, \ |
| const int stripe, \ |
| const int batch, \ |
| CPUContext* context) { \ |
| for (int j = 0; j < batch; j++) { \ |
| Add<T, CPUContext>(N, first + j * stripe, y, y, context); \ |
| } \ |
| } |
| |
| CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH(float); |
| #undef CAFFE2_SPECIALIZED_CPU_ADD_STRIPED_BATCH |
| |
| namespace { |
| |
| template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st> |
| C10_EXPORT void RowwiseBinaryOp( |
| const int rows, |
| const int cols, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| for (int i = 0; i < rows; ++i) { |
| for (int j = 0; j < cols; ++j) { |
| const int C_index = i * cols + j; |
| const int A_index = kBroadcast1st ? j : C_index; |
| const int B_index = kBroadcast1st ? C_index : j; |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator, bool kBroadcast1st> |
| C10_EXPORT void ColwiseBinaryOp( |
| const int rows, |
| const int cols, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| for (int i = 0; i < rows; ++i) { |
| for (int j = 0; j < cols; ++j) { |
| const int C_index = i * cols + j; |
| const int A_index = kBroadcast1st ? i : C_index; |
| const int B_index = kBroadcast1st ? C_index : i; |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator> |
| C10_EXPORT void BroadcastBinaryOpImpl( |
| const int ndim, |
| const int* A_dims, |
| const int* B_dims, |
| const int* C_dims, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| std::vector<int> index(ndim, 0); |
| const int C_size = |
| std::accumulate(C_dims, C_dims + ndim, 1, std::multiplies<int>()); |
| for (int C_index = 0; C_index < C_size; ++C_index) { |
| const int A_index = utils::GetIndexFromDims(ndim, A_dims, index.data()); |
| const int B_index = utils::GetIndexFromDims(ndim, B_dims, index.data()); |
| C[C_index] = op(A[A_index], B[B_index]); |
| utils::IncreaseIndexInDims(ndim, C_dims, index.data()); |
| } |
| } |
| |
| } // namespace |
| |
| #define DELEGATE_1D_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| C10_EXPORT void Func<TIn, CPUContext>( \ |
| const int N, const TIn* A, const TIn* B, TOut* C, CPUContext*) { \ |
| std::transform(A, A + N, B, C, Op<TIn>()); \ |
| } |
| |
| #define DEFINE_1D_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_1D_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_1D_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_1D_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_1D_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| DEFINE_1D_COMPARE_FUNCTION(EQ, std::equal_to) |
| DEFINE_1D_COMPARE_FUNCTION(NE, std::not_equal_to) |
| DEFINE_1D_COMPARE_FUNCTION(LT, std::less) |
| DEFINE_1D_COMPARE_FUNCTION(LE, std::less_equal) |
| DEFINE_1D_COMPARE_FUNCTION(GT, std::greater) |
| DEFINE_1D_COMPARE_FUNCTION(GE, std::greater_equal) |
| |
| #undef DEFINE_1D_COMPARE_FUNCTION |
| |
| DELEGATE_1D_BINARY_FUNCTION(bool, bool, And, std::logical_and) |
| DELEGATE_1D_BINARY_FUNCTION(bool, bool, Or, std::logical_or) |
| DELEGATE_1D_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) |
| |
| #define DEFINE_1D_BITWISE_BINARY_FUNCTION(Func, op) \ |
| DELEGATE_1D_BINARY_FUNCTION(bool, bool, Func, op) \ |
| DELEGATE_1D_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, op) \ |
| DELEGATE_1D_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, op) |
| |
| DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) |
| DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) |
| DEFINE_1D_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) |
| |
| #undef DEFINE_1D_BITWISE_BINARY_FUNCTION |
| |
| #undef DELEGATE_1D_BINARY_FUNCTION |
| |
| #define DELEGATE_2D_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| C10_EXPORT void Rowwise##Func<TIn, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext*) { \ |
| RowwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Rowwise##Func<TIn, CPUContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext*) { \ |
| RowwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \ |
| rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Colwise##Func<TIn, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext*) { \ |
| ColwiseBinaryOp<TIn, TOut, Op<TIn>, true>(rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| C10_EXPORT void Colwise##Func<TIn, CPUContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext*) { \ |
| ColwiseBinaryOp<TIn, TOut, Op<TIn>, false>( \ |
| rows, cols, Op<TIn>(), A, B, C); \ |
| } |
| |
| #define DEFINE_2D_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| DEFINE_2D_COMPARE_FUNCTION(EQ, std::equal_to) |
| DEFINE_2D_COMPARE_FUNCTION(NE, std::not_equal_to) |
| DEFINE_2D_COMPARE_FUNCTION(LT, std::less) |
| DEFINE_2D_COMPARE_FUNCTION(LE, std::less_equal) |
| DEFINE_2D_COMPARE_FUNCTION(GT, std::greater) |
| DEFINE_2D_COMPARE_FUNCTION(GE, std::greater_equal) |
| |
| #undef DEFINE_2D_COMPARE_FUNCTION |
| |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and) |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or) |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) |
| |
| #define DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_2D_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) |
| DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) |
| DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) |
| |
| #undef DEFINE_2D_BROADCAST_BITWISE_BINARY_FUNCTION |
| |
| #undef DELEGATE_2D_BROADCAST_BINARY_FUNCTION |
| |
| #define DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(T) \ |
| template <> \ |
| C10_EXPORT void RowwiseDiv<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| RowwiseBinaryOp<T, T, std::divides<T>, true>( \ |
| rows, cols, std::divides<T>(), A, B, C); \ |
| } \ |
| template <> \ |
| C10_EXPORT void ColwiseDiv<T, CPUContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const T* A, \ |
| const T* B, \ |
| T* C, \ |
| CPUContext*) { \ |
| ColwiseBinaryOp<T, T, std::divides<T>, true>( \ |
| rows, cols, std::divides<T>(), A, B, C); \ |
| } |
| DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int32_t) |
| DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION(std::int64_t) |
| #undef DEFINE_2D_BROADCAST_1ST_DIV_FUNCTION |
| |
| #define DELEGATE_BROADCAST_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| C10_EXPORT void Func<TIn, CPUContext>( \ |
| const int A_ndim, \ |
| const int* A_dims, \ |
| const int B_ndim, \ |
| const int* B_dims, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CPUContext* context) { \ |
| const int ndim = std::max(A_ndim, B_ndim); \ |
| std::vector<int> A_dims_array(ndim); \ |
| std::vector<int> B_dims_array(ndim); \ |
| std::vector<int> C_dims_array(ndim); \ |
| utils::ComputeBroadcastBinaryOpDims( \ |
| A_ndim, \ |
| A_dims, \ |
| B_ndim, \ |
| B_dims, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| C_dims_array.data()); \ |
| if (A_dims_array == B_dims_array) { \ |
| const int size = std::accumulate( \ |
| C_dims_array.cbegin(), \ |
| C_dims_array.cend(), \ |
| 1, \ |
| std::multiplies<int>()); \ |
| Func<TIn, CPUContext>(size, A, B, C, context); \ |
| return; \ |
| } \ |
| int rows; \ |
| int cols; \ |
| bool broadcast_1st; \ |
| if (utils::IsRowwiseBroadcastBinaryOp( \ |
| ndim, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| &rows, \ |
| &cols, \ |
| &broadcast_1st)) { \ |
| if (broadcast_1st) { \ |
| Rowwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \ |
| } else { \ |
| Rowwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \ |
| } \ |
| return; \ |
| } \ |
| if (utils::IsColwiseBroadcastBinaryOp( \ |
| ndim, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| &rows, \ |
| &cols, \ |
| &broadcast_1st)) { \ |
| if (broadcast_1st) { \ |
| Colwise##Func<TIn, CPUContext, true>(rows, cols, A, B, C, context); \ |
| } else { \ |
| Colwise##Func<TIn, CPUContext, false>(rows, cols, A, B, C, context); \ |
| } \ |
| return; \ |
| } \ |
| int pre; \ |
| int mid; \ |
| int nxt; \ |
| if (utils::IsBothEndsBroadcastBinaryOp( \ |
| ndim, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| &pre, \ |
| &mid, \ |
| &nxt, \ |
| &broadcast_1st)) { \ |
| const int stride = mid * nxt; \ |
| for (int i = 0; i < pre; ++i) { \ |
| if (broadcast_1st) { \ |
| Colwise##Func<TIn, CPUContext, true>( \ |
| mid, nxt, A, B + i * stride, C + i * stride, context); \ |
| } else { \ |
| Colwise##Func<TIn, CPUContext, false>( \ |
| mid, nxt, A + i * stride, B, C + i * stride, context); \ |
| } \ |
| } \ |
| return; \ |
| } \ |
| BroadcastBinaryOpImpl( \ |
| ndim, \ |
| A_dims_array.data(), \ |
| B_dims_array.data(), \ |
| C_dims_array.data(), \ |
| Op<TIn>(), \ |
| A, \ |
| B, \ |
| C); \ |
| } |
| |
| #define DEFINE_BROADCAST_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| DEFINE_BROADCAST_COMPARE_FUNCTION(EQ, std::equal_to) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(NE, std::not_equal_to) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(LT, std::less) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(LE, std::less_equal) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(GT, std::greater) |
| DEFINE_BROADCAST_COMPARE_FUNCTION(GE, std::greater_equal) |
| |
| #undef DEFINE_BROADCAST_COMPARE_FUNCTION |
| |
| #define DEFINE_BROADCAST_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(float, float, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(double, double, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| DEFINE_BROADCAST_BINARY_FUNCTION(Add, std::plus) |
| DEFINE_BROADCAST_BINARY_FUNCTION(Sub, std::minus) |
| DEFINE_BROADCAST_BINARY_FUNCTION(Mul, std::multiplies) |
| DEFINE_BROADCAST_BINARY_FUNCTION(Div, std::divides) |
| |
| #undef DEFINE_BROADCAST_BINARY_FUNCTION |
| |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, And, std::logical_and) |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Or, std::logical_or) |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Xor, std::bit_xor) |
| |
| #define DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_BROADCAST_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseAnd, std::bit_and) |
| DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseOr, std::bit_or) |
| DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION(BitwiseXor, std::bit_xor) |
| |
#undef DEFINE_BROADCAST_BITWISE_BINARY_FUNCTION
| |
| #undef DELEGATE_BROADCAST_BINARY_FUNCTION |
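
// Dispatch example (illustrative): calling Add<float, CPUContext> with
// A_dims = {2, 3} and B_dims = {3} is recognized by
// utils::IsRowwiseBroadcastBinaryOp as a rowwise broadcast with rows = 2,
// cols = 3 and broadcast_1st = false (B is the broadcast operand), so it is
// forwarded to RowwiseAdd<float, CPUContext, false>(2, 3, A, B, C, context).
// Identical post-broadcast shapes short-circuit to the plain 1-D elementwise
// Add, while shapes such as A_dims = {2, 3, 4}, B_dims = {2, 1, 4} match none
// of the fast paths and fall through to the generic BroadcastBinaryOpImpl.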
| |
| #define CAFFE2_RAND_UNIFORM_REAL(T) \ |
| template <> \ |
| C10_EXPORT void RandUniform<T, CPUContext>( \ |
| const size_t n, const T a, const T b, T* r, CPUContext* context) { \ |
| std::uniform_real_distribution<T> distribution(a, b); \ |
| for (size_t i = 0; i < n; ++i) { \ |
| r[i] = distribution(context->RandGenerator()); \ |
| } \ |
| } |
| CAFFE2_RAND_UNIFORM_REAL(float); |
| CAFFE2_RAND_UNIFORM_REAL(double); |
| #undef CAFFE2_RAND_UNIFORM_REAL |
| |
| #define CAFFE2_RAND_UNIFORM_CHAR(T) \ |
| template <> \ |
| C10_EXPORT void RandUniform<T, CPUContext>( \ |
| const size_t n, const T a, const T b, T* r, CPUContext* context) { \ |
    /* char types are not valid IntTypes for uniform_int_distribution, so   \
       sample as short and narrow the result back to T. */                  \
    std::uniform_int_distribution<short> distribution((short)a, (short)b); \
| for (size_t i = 0; i < n; ++i) { \ |
| r[i] = static_cast<T>(distribution(context->RandGenerator())); \ |
| } \ |
| } |
| CAFFE2_RAND_UNIFORM_CHAR(int8_t); |
| CAFFE2_RAND_UNIFORM_CHAR(uint8_t); |
| #undef CAFFE2_RAND_UNIFORM_CHAR |
| |
| #define CAFFE2_RAND_UNIFORM_INT(T) \ |
| template <> \ |
| C10_EXPORT void RandUniform<T, CPUContext>( \ |
| const size_t n, const T a, const T b, T* r, CPUContext* context) { \ |
| std::uniform_int_distribution<T> distribution(a, b); \ |
| for (size_t i = 0; i < n; ++i) { \ |
| r[i] = distribution(context->RandGenerator()); \ |
| } \ |
| } |
| |
| CAFFE2_RAND_UNIFORM_INT(int16_t); |
| CAFFE2_RAND_UNIFORM_INT(int32_t); |
| CAFFE2_RAND_UNIFORM_INT(int64_t); |
| CAFFE2_RAND_UNIFORM_INT(uint16_t); |
| CAFFE2_RAND_UNIFORM_INT(uint32_t); |
| CAFFE2_RAND_UNIFORM_INT(uint64_t); |
| #undef CAFFE2_RAND_UNIFORM_INT |
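
// Usage sketch (illustrative only; assumes a default-constructed CPUContext
// is acceptable for seeding the generator):
//
//   CPUContext context;
//   std::vector<float> buf(16);
//   math::RandUniform<float, CPUContext>(
//       buf.size(), 0.0f, 1.0f, buf.data(), &context);  // uniform in [0, 1)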
| |
// Note: the values produced here are not uniformly distributed between a and
// b. The implementation draws each value from a normal distribution whose
// mean is the remaining sum divided by the remaining count, so that the
// running total stays on track for the requested sum (the last element is set
// to the exact remainder). Ideally the algorithm would generate n numbers in
// [0, 1], add them up as scaled_sum, and rescale each value by
// sum / scaled_sum into [a, b]; that algorithm is non-trivial because the
// adjustment differs for each value. See the usage sketch after the
// instantiations below.
| #define CAFFE2_RAND_FIXED_SUM(T) \ |
| template <> \ |
| C10_EXPORT void RandFixedSum<T, CPUContext>( \ |
| const size_t n, \ |
| const T a, \ |
| const T b, \ |
| const T sum, \ |
| T* r, \ |
| CPUContext* context) { \ |
| CAFFE_ENFORCE_GE(a, 0); \ |
| CAFFE_ENFORCE_GE(sum / (double)n, a); \ |
| CAFFE_ENFORCE_LE(sum / (double)n, b); \ |
| T current_sum = 0; \ |
| for (size_t i = 0; i < n - 1; ++i) { \ |
| auto remaining_numbers = n - 1 - i; \ |
| double mean = (sum - current_sum) / remaining_numbers; \ |
| double stdev = std::min(mean - a, b - mean); \ |
| std::normal_distribution<double> distribution{mean, stdev / 4.0}; \ |
| T value = distribution(context->RandGenerator()); \ |
| auto remaining_sum = sum - current_sum - value; \ |
| if (value < a || remaining_sum > b * remaining_numbers) { \ |
| value = a; \ |
| } else if (value > b || remaining_sum < a * remaining_numbers) { \ |
| value = b; \ |
| } \ |
| r[i] = value; \ |
| CAFFE_ENFORCE(a <= value && value <= b); \ |
| current_sum += value; \ |
| } \ |
| r[n - 1] = sum - current_sum; \ |
| CAFFE_ENFORCE(a <= r[n - 1] && r[n - 1] <= b); \ |
| } |
| CAFFE2_RAND_FIXED_SUM(float); |
| CAFFE2_RAND_FIXED_SUM(double); |
| CAFFE2_RAND_FIXED_SUM(int8_t); |
| CAFFE2_RAND_FIXED_SUM(int16_t); |
| CAFFE2_RAND_FIXED_SUM(int32_t); |
| CAFFE2_RAND_FIXED_SUM(int64_t); |
| CAFFE2_RAND_FIXED_SUM(uint8_t); |
| CAFFE2_RAND_FIXED_SUM(uint16_t); |
| CAFFE2_RAND_FIXED_SUM(uint32_t); |
| CAFFE2_RAND_FIXED_SUM(uint64_t); |
| #undef CAFFE2_RAND_FIXED_SUM |
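
// Usage sketch (illustrative only; assumes a default-constructed CPUContext):
//
//   CPUContext context;
//   std::vector<float> r(8);
//   // Draw 8 values, each in [0, 2], whose total equals the requested sum 8
//   // (the last element is set to the exact remainder).
//   math::RandFixedSum<float, CPUContext>(
//       r.size(), 0.0f, 2.0f, 8.0f, r.data(), &context);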
| |
| template <class Type, class Val_t, class Ind_t, class Context_t, bool cdf_app> |
| Ind_t generate_stack_distance( |
| std::vector<Ind_t>& cum_val, |
| std::vector<Val_t>& cum_dis, |
| std::vector<Ind_t>& cum_map, |
| Ind_t max_i, |
| Ind_t i, |
| Context_t* context) { |
  /* Description:
     Inverse transform sampling to generate values of a random variable X
     described by the cumulative distribution F (cum_val, cum_dis).
     Note that we may choose to use the inverse map of F (cum_map) as an
     approximation to avoid searching. The probability is also rescaled so
     that the generated values stay within max_i refs, because a stack
     distance cannot exceed the number of already generated refs (max_i).
     (A worked example follows this function.)
  */
| Ind_t j, k, n; |
| Val_t u, f, fi; |
| |
| // generate a random number u in [0,1] from a uniform distribution U |
| math::RandUniform<Val_t, Context_t>(1, 0, 1, &u, context); |
| |
| // scale the random number u to be within range [0,f(i)], if needed |
| if (i < max_i) { |
| // approach 2: allows gaps in the distribution |
| j = (std::upper_bound(cum_val.begin(), cum_val.end(), i) - |
| cum_val.begin()) - |
| 1; |
| fi = cum_dis[j]; |
| u *= fi; |
| } |
  // compute the stack distance x such that F(x) = u; note that the cumulative
  // distribution F increases monotonically up to 1
| if (cdf_app) { |
| // look up cum_val corresponding to u <= cum_dis[j] |
| k = cum_map.size(); |
| n = (Ind_t)round(u * k); |
| j = cum_map[n]; |
| return cum_val[j]; |
| } else { |
| // iterate until you find the cum_val corresponding to u <= cum_dis[j] |
| for (j = 0; j < cum_dis.size(); j++) { |
| f = cum_dis[j]; |
| if (u <= f) { |
| return cum_val[j]; |
| } |
| } |
| return cum_val[j - 1]; |
| } |
| } |
| |
| template <class Type, class Val_t, class Ind_t, class Context_t, bool cdf_app> |
| C10_EXPORT void generate_trace_lru( |
| std::vector<Ind_t>& uni_ref, |
| std::vector<Ind_t>& cum_val, |
| std::vector<Val_t>& cum_dis, |
| std::vector<Ind_t>& cum_map, |
| Context_t* context, |
| Ind_t cache_line_size, |
| Ind_t n, |
| Type min, |
| Type max, |
| Type* syn_ref) { |
  /* Description:
     Generate a synthetic trace from a list of unique accesses (uni_ref) and a
     cumulative distribution of reuse distances (cum_val, cum_dis) between
     them. Optionally, the cum_map approximation can be used to avoid
     searching. (A worked example follows this function.)
  */
| Ind_t i, j, k, sd, line_ref, mem_ref, mem_ref_within_line; |
| Ind_t max_sd = cum_val.back(); |
| Ind_t l = uni_ref.size(); |
| |
| for (i = 0, j = 0; j < n; j++) { |
| // generate stack distance |
| sd = generate_stack_distance<Type, Val_t, Ind_t, Context_t, cdf_app>( |
| cum_val, cum_dis, cum_map, max_sd, i, context); |
| // fixed access within cache line |
| mem_ref_within_line = 0; |
| // random access within cache line |
| // Val_t r; |
| // math::RandUniform<Val_t, Context_t>(1, 0, 1, &r, context); |
| // mem_ref_within_line = floor(r*cache_line_size); |
| |
| // generate memory reference |
| if (sd == 0) { |
| k = 0; /// new reference /// |
| i++; |
| } else { |
| k = l - sd; /// existing reference /// |
| } |
| line_ref = uni_ref[k]; // pop k-th element |
| uni_ref.erase(uni_ref.begin() + k); |
| uni_ref.push_back(line_ref); // append it back |
| mem_ref = line_ref * cache_line_size + mem_ref_within_line; |
| /* |
| //debug prints |
| if ((mem_ref < min) || (mem_ref > max)) { |
| //printf("mem_ref[%d]=%d (%ld) \n",j,mem_ref,syn_ref[j]); |
| std::cout << "syn_ref[" << j << "]=" << (Type)mem_ref << " "; |
| std::cout << "(" << mem_ref << ") "; |
| std::cout << "[" << min << "," << max << "]" << std::endl; |
| int scanf_temp; |
| scanf("%d",&scanf_temp); |
| } |
| */ |
| |
    // clamp mem_ref into the valid range
    // WARNING: this should not be needed if the instantiation type and
    // distribution choice are correct; it only remedies a symptom of earlier
    // mistakes.
| if (mem_ref < min) { |
| mem_ref = min; |
| // std::cout << "clamping (min) mem_ref=" << mem_ref << std::endl; |
| } |
| if (mem_ref > max) { |
| mem_ref = max; // mem_ref % max; |
| // std::cout << "clamping (max) mem_ref=" << mem_ref << std::endl; |
| } |
| |
| // save generated memory reference |
| syn_ref[j] = (Type)mem_ref; |
| } |
| } |
| |
// Generate n values from a synthetic data distribution defined by unique
// accesses and stack distances.
// WARNING: this could be created once for all tables or per table; in the
// latter case we would need to know the table id in order to sample from the
// right distribution.
| #define CAFFE2_RAND_SYNTHETIC_DATA(T) \ |
| template <> \ |
| C10_EXPORT void RandSyntheticData<T, CPUContext>( \ |
| const size_t n, const T a, const T b, T* r, CPUContext* context) { \ |
| /* unique memory references */ \ |
| std::vector<int> mem_ref = {1, 2, 3, 4, 5, 6}; \ |
| /* cumulative distribution of distances */ \ |
| std::vector<int> cum_val = {0, 1, 3, 4, 5}; \ |
| std::vector<double> cum_dis = {0.55, 0.64, 0.82, 0.91, 1.0}; \ |
| /* inverse map of cumulative distribution (for O(1) lookup) */ \ |
| /* std::vector<int> cum_map = {0, 0, 0, 0, 0, 1, 2, 2, 3, 4}; */ \ |
| int k = 10; /* 100; */ \ |
| std::vector<int> cum_map(k, 0); \ |
| for (int j = 0; j < cum_dis.size();) { \ |
| int sz = (int)round(cum_dis[j] * k); \ |
| for (int i = 0; i < sz; i++) { \ |
| cum_map[j + i] = j; \ |
| } \ |
| j += sz; \ |
| } \ |
| \ |
| /* code to generate the synthetic data from the above values */ \ |
| const int cache_line = 1; /* 64; */ \ |
| generate_trace_lru<T, double, int, CPUContext, false>( \ |
| mem_ref, cum_val, cum_dis, cum_map, context, cache_line, n, a, b, r); \ |
| } |
| |
| CAFFE2_RAND_SYNTHETIC_DATA(float); |
| CAFFE2_RAND_SYNTHETIC_DATA(double); |
| CAFFE2_RAND_SYNTHETIC_DATA(int8_t); |
| CAFFE2_RAND_SYNTHETIC_DATA(int16_t); |
| CAFFE2_RAND_SYNTHETIC_DATA(int32_t); |
| CAFFE2_RAND_SYNTHETIC_DATA(int64_t); |
| CAFFE2_RAND_SYNTHETIC_DATA(uint8_t); |
| CAFFE2_RAND_SYNTHETIC_DATA(uint16_t); |
| CAFFE2_RAND_SYNTHETIC_DATA(uint32_t); |
| CAFFE2_RAND_SYNTHETIC_DATA(uint64_t); |
| #undef CAFFE2_RAND_SYNTHETIC_DATA |
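
// Usage sketch (illustrative only; assumes a default-constructed CPUContext):
//
//   CPUContext context;
//   std::vector<int32_t> refs(1000);
//   // Fill refs with 1000 synthetic references whose reuse distances follow
//   // the hard-coded distribution above; values are clamped into [0, 1023].
//   math::RandSyntheticData<int32_t, CPUContext>(
//       refs.size(), 0, 1023, refs.data(), &context);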
| |
| #define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T) \ |
| template <> \ |
| C10_EXPORT void RandUniformUnique<T, CPUContext>( \ |
| const size_t n, \ |
| const T a, \ |
| const T b, \ |
| T* r, \ |
| const size_t m, \ |
| const T* avoid, \ |
| CPUContext* context) { \ |
| CAFFE_ENFORCE_LE( \ |
| n, b - a - m + 1, "Cannot satisfy the unique requirement"); \ |
| std::unordered_set<T> avoid_set(n); \ |
| if (m) { \ |
| avoid_set.insert(avoid, avoid + m); \ |
| CAFFE_ENFORCE_EQ( \ |
          m, avoid_set.size(), "Avoid should be unique");              \
| } \ |
| std::uniform_int_distribution<T> distribution(a, b); \ |
| T v = 0; \ |
| for (size_t i = 0; i < n; ++i) { \ |
| do { \ |
| v = distribution(context->RandGenerator()); \ |
| } while (avoid_set.count(v)); \ |
| r[i] = v; \ |
| avoid_set.insert(v); \ |
| } \ |
| } |
| |
| CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t); |
| CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t); |
| #undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE |
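
// Usage sketch (illustrative only): sample 5 distinct integers in [0, 9] that
// also avoid the values already present in `taken`; 5 <= 9 - 0 - 2 + 1, so
// the uniqueness requirement can be satisfied.
//
//   CPUContext context;
//   const std::vector<int32_t> taken = {3, 7};
//   std::vector<int32_t> r(5);
//   math::RandUniformUnique<int32_t, CPUContext>(
//       r.size(), 0, 9, r.data(), taken.size(), taken.data(), &context);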
| |
| template <> |
| C10_EXPORT void RandGaussian<float, CPUContext>( |
| const size_t n, |
| const float mean, |
| const float std, |
| float* r, |
| CPUContext* context) { |
| std::normal_distribution<float> distribution(mean, std); |
| for (size_t i = 0; i < n; ++i) { |
| r[i] = distribution(context->RandGenerator()); |
| } |
| } |
| |
| #define CAFFE2_SPECIALIZED_SUM(T) \ |
| template <> \ |
| C10_EXPORT void Sum<T, CPUContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| CPUContext* /* unused */, \ |
| Tensor* /* unused */) { \ |
| *y = ConstEigenVectorMap<T>(x, N).sum(); \ |
| } |
| |
| CAFFE2_SPECIALIZED_SUM(float); |
| CAFFE2_SPECIALIZED_SUM(int32_t); |
| CAFFE2_SPECIALIZED_SUM(int64_t); |
| |
| #undef CAFFE2_SPECIALIZED_SUM |
| |
| template <> |
| C10_EXPORT void SumSqr<float, CPUContext>( |
| const int N, |
| const float* x, |
| float* y, |
| CPUContext* /*context*/ /* unused */, |
| Tensor* /*scratch_ptr*/ /* unused */) { |
| *y = ConstEigenVectorMap<float>(x, N).squaredNorm(); |
| } |
| |
| template <> |
| C10_EXPORT void Select<float, CPUContext>( |
| const int N, |
| const int D, |
| const float* x, |
| const int* idx, |
| float* y, |
| CPUContext* /*context*/) { |
| for (int i = 0; i < N; ++i) { |
| DCHECK_LT(idx[i], D); |
| y[i] = x[i * D + idx[i]]; |
| } |
| } |
| |
| template <> |
| C10_EXPORT void CopyMatrix<CPUContext>( |
| const size_t itemsize, |
| const int M, |
| const int N, |
| const void* A, |
| const int lda, |
| void* B, |
| const int ldb, |
| CPUContext* /*context*/, |
| TypeMeta::Copy copy) { |
| if (A == nullptr || B == nullptr) { |
| return; |
| } |
| if (lda == N && ldb == N) { |
    // can coalesce into a single copy of M * N elements
| if (copy) { |
| copy(static_cast<const char*>(A), static_cast<char*>(B), N * M); |
| } else { |
| memcpy( |
| static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M); |
| } |
| return; |
| } |
| |
| for (int i = 0; i < M; ++i) { |
| if (copy) { |
| copy( |
| static_cast<const char*>(A) + lda * i * itemsize, |
| static_cast<char*>(B) + ldb * i * itemsize, |
| N); |
| } else { |
| memcpy( |
| static_cast<char*>(B) + ldb * i * itemsize, |
| static_cast<const char*>(A) + lda * i * itemsize, |
| itemsize * N); |
| } |
| } |
| } |
| |
| #ifdef CAFFE2_USE_MKL |
| |
| #define DELEGATE_COPY_MATRIX_FUNCTION(T, Func) \ |
| template <> \ |
| C10_EXPORT void CopyMatrix<T, CPUContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int lda, \ |
| T* B, \ |
| const int ldb, \ |
| CPUContext* /* context */) { \ |
| Func('R', 'N', M, N, T(1), A, lda, B, ldb); \ |
| } \ |
| template <> \ |
| C10_EXPORT void CopyMatrix<T, CPUContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int A_outer_stride, \ |
| const int A_inner_stride, \ |
| T* B, \ |
| const int B_outer_stride, \ |
| const int B_inner_stride, \ |
| CPUContext* /* context */) { \ |
| Func##2( \ |
| 'R', \ |
| 'N', \ |
| M, \ |
| N, \ |
| T(1), \ |
| A, \ |
| A_outer_stride, \ |
| A_inner_stride, \ |
| B, \ |
| B_outer_stride, \ |
| B_inner_stride); \ |
| } |
| DELEGATE_COPY_MATRIX_FUNCTION(float, mkl_somatcopy) |
| DELEGATE_COPY_MATRIX_FUNCTION(double, mkl_domatcopy) |
| #undef DELEGATE_COPY_MATRIX_FUNCTION |
| |
| #endif // CAFFE2_USE_MKL |
| |
| #define CAFFE2_SPECIALIZED_COPY_MATRIX(T) \ |
| template <> \ |
| C10_EXPORT void CopyMatrix<T, CPUContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int lda, \ |
| T* B, \ |
| const int ldb, \ |
| CPUContext* /* context */) { \ |
| if (M == 0 || N == 0) { \ |
| return; \ |
| } \ |
| if (lda == N) { \ |
| if (ldb == N) { \ |
| std::memcpy(B, A, sizeof(T) * M * N); \ |
| } else { \ |
| EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \ |
| ConstEigenMatrixMap<T>(A, N, M); \ |
| } \ |
| } else { \ |
| if (ldb == N) { \ |
| EigenMatrixMap<T>(B, N, M) = ConstEigenOuterStridedMatrixMap<T>( \ |
| A, N, M, EigenOuterStride(lda)); \ |
| } else { \ |
| EigenOuterStridedMatrixMap<T>(B, N, M, EigenOuterStride(ldb)) = \ |
| ConstEigenOuterStridedMatrixMap<T>( \ |
| A, N, M, EigenOuterStride(lda)); \ |
| } \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void CopyMatrix<T, CPUContext>( \ |
| const int M, \ |
| const int N, \ |
| const T* A, \ |
| const int A_outer_stride, \ |
| const int A_inner_stride, \ |
| T* B, \ |
| const int B_outer_stride, \ |
| const int B_inner_stride, \ |
| CPUContext* context) { \ |
| if (A_inner_stride == 1 && B_inner_stride == 1) { \ |
| CopyMatrix<T, CPUContext>( \ |
| M, N, A, A_outer_stride, B, B_outer_stride, context); \ |
| return; \ |
| } \ |
| EigenStridedMatrixMap<T>( \ |
| B, N, M, EigenStride(B_outer_stride, B_inner_stride)) = \ |
| ConstEigenStridedMatrixMap<T>( \ |
| A, N, M, EigenStride(A_outer_stride, A_inner_stride)); \ |
| } |
| |
| #ifndef CAFFE2_USE_MKL |
| CAFFE2_SPECIALIZED_COPY_MATRIX(float) |
| CAFFE2_SPECIALIZED_COPY_MATRIX(double) |
| #endif // CAFFE2_USE_MKL |
| |
| CAFFE2_SPECIALIZED_COPY_MATRIX(int) |
| CAFFE2_SPECIALIZED_COPY_MATRIX(int64_t) |
| CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint8_t) |
| CAFFE2_SPECIALIZED_COPY_MATRIX(std::uint16_t) |
| |
#undef CAFFE2_SPECIALIZED_COPY_MATRIX
| |
| namespace { |
| |
| template <typename T> |
| C10_EXPORT void Im2ColZeroPaddingAndNoDilationNCHW( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const T* img_data, |
| T* col_data, |
| CPUContext* context) { |
| const int output_h = (H - kernel_h) / stride_h + 1; |
| const int output_w = (W - kernel_w) / stride_w + 1; |
| const int output_size = output_h * output_w; |
| for (int c = 0; c < C; ++c) { |
| for (int kh = 0; kh < kernel_h; ++kh) { |
| for (int kw = 0; kw < kernel_w; ++kw) { |
| const T* src = img_data + kh * W + kw; |
| if (stride_w == 1) { |
| CopyMatrix<T, CPUContext>( |
| output_h, |
| output_w, |
| src, |
| stride_h * W, |
| col_data, |
| output_w, |
| context); |
| } else { |
| CopyMatrix<T, CPUContext>( |
| output_h, |
| output_w, |
| src, |
| stride_h * W, |
| stride_w, |
| col_data, |
| output_w, |
| 1, |
| context); |
| } |
| col_data += output_size; |
| } |
| } |
| img_data += H * W; |
| } |
| } |
| |
| template <typename T> |
| C10_EXPORT void Col2ImZeroPaddingAndNoDilationNCHW( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const T* col_data, |
| T* img_data, |
| CPUContext* context) { |
| Set<T, CPUContext>(C * H * W, T(0), img_data, context); |
| const int output_h = (H - kernel_h) / stride_h + 1; |
| const int output_w = (W - kernel_w) / stride_w + 1; |
| const int output_size = output_h * output_w; |
| for (int c = 0; c < C; ++c) { |
| for (int kh = 0; kh < kernel_h; ++kh) { |
| for (int kw = 0; kw < kernel_w; ++kw) { |
| T* dst = img_data + kh * W + kw; |
| if (stride_w == 1) { |
| EigenOuterStridedArrayMap<T>( |
| dst, output_w, output_h, EigenOuterStride(stride_h * W)) += |
| ConstEigenArrayMap<T>(col_data, output_w, output_h); |
| } else { |
| EigenStridedArrayMap<T>( |
| dst, output_w, output_h, EigenStride(stride_h * W, stride_w)) += |
| ConstEigenArrayMap<T>(col_data, output_w, output_h); |
| } |
| col_data += output_size; |
| } |
| } |
| img_data += H * W; |
| } |
| } |
| |
| template <typename T> |
| C10_EXPORT void Im2ColZeroPaddingAndNoDilationNHWC( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const T* img_data, |
| T* col_data, |
| CPUContext* context) { |
| const int output_h = (H - kernel_h) / stride_h + 1; |
| const int output_w = (W - kernel_w) / stride_w + 1; |
| const int kernel_size = kernel_h * kernel_w; |
| for (int yh = 0; yh < output_h; ++yh) { |
| for (int yw = 0; yw < output_w; ++yw) { |
| const T* src = img_data + (yh * stride_h * W + yw * stride_w) * C; |
| CopyMatrix<T, CPUContext>( |
| kernel_h, kernel_w * C, src, W * C, col_data, kernel_w * C, context); |
| col_data += kernel_size * C; |
| } |
| } |
| } |
| |
| template <typename T> |
| C10_EXPORT void Col2ImZeroPaddingAndNoDilationNHWC( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int stride_h, |
| const int stride_w, |
| const T* col_data, |
| T* img_data, |
| CPUContext* context) { |
| Set<T, CPUContext>(H * W * C, T(0), img_data, context); |
| const int output_h = (H - kernel_h) / stride_h + 1; |
| const int output_w = (W - kernel_w) / stride_w + 1; |
| const int kernel_size = kernel_h * kernel_w; |
| for (int yh = 0; yh < output_h; ++yh) { |
| for (int yw = 0; yw < output_w; ++yw) { |
| T* dst = img_data + (yh * stride_h * W + yw * stride_w) * C; |
| EigenOuterStridedArrayMap<T>( |
| dst, kernel_w * C, kernel_h, EigenOuterStride(W * C)) += |
| ConstEigenArrayMap<T>(col_data, kernel_w * C, kernel_h); |
| col_data += kernel_size * C; |
| } |
| } |
| } |
| |
| template <typename T, bool kCol2Im> |
| C10_EXPORT void Im2ColNdNCHWImpl( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* X_data, |
| float* Y_data) { |
| if (kCol2Im) { |
| std::memset(Y_data, 0, img_size * sizeof(float)); |
| } |
| const int outer_size = col_shape[0]; |
| const int inner_size = col_size / outer_size; |
| const int kernel_size = std::accumulate( |
| kernel_shape, kernel_shape + N, 1, std::multiplies<int>()); |
| std::vector<FixedDivisor<int>> kernel_shape_div(N); |
| for (int i = 0; i < N; ++i) { |
| kernel_shape_div[i] = FixedDivisor<int>(kernel_shape[i]); |
| } |
| std::vector<int> d_offset(N, 0); |
| std::vector<int> d_iter(N, 0); |
| for (int i = 0; i < outer_size; ++i) { |
| // Loop over spatial axes in reverse order to compute a per-axis offset. |
| int offset = i; |
| for (int d_i = N - 1; d_i >= 0; --d_i) { |
| kernel_shape_div[d_i].DivMod(offset, &offset, &d_offset[d_i]); |
| } |
| for (int j = 0; j < inner_size; ++j) { |
| // Loop over spatial axes in forward order to compute the indices in the |
| // image and column, and whether the index lies in the padding. |
| const int col_index = i * inner_size + j; |
| int img_index = i / kernel_size; |
| bool is_padding = false; |
| for (int d_i = 0; d_i < N; ++d_i) { |
| const int d_img = d_iter[d_i] * stride[d_i] - pad[d_i] + |
| d_offset[d_i] * dilation[d_i]; |
| is_padding |= !utils::IsAGeZeroAndALtB(d_img, img_shape[d_i + 1]); |
| img_index = img_index * img_shape[d_i + 1] + d_img; |
| } |
| if (!kCol2Im) { |
| Y_data[col_index] = is_padding ? 0 : X_data[img_index]; |
| } else if (!is_padding) { |
| Y_data[img_index] += X_data[col_index]; |
| } |
| utils::IncreaseIndexInDims(N, col_shape + 1, d_iter.data()); |
| } |
| } |
| } |
| |
| template <typename T> |
| void Im2Col3dNCHWImpl( |
| const int channels, |
| const int clip_len, |
| const int height, |
| const int width, |
| const int kernel_t, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_t, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_p, |
| const int pad_t, |
| const int pad_l, |
| const int pad_a, |
| const int pad_b, |
| const int pad_r, |
| const int stride_t, |
| const int stride_h, |
| const int stride_w, |
| const T* img_data, |
| T* col_data) { |
| const int output_t = |
| (clip_len + pad_p + pad_a - (dilation_t * (kernel_t - 1) + 1)) / |
| stride_t + |
| 1; |
| const int output_h = |
| (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h + |
| 1; |
| const int output_w = |
| (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + |
| 1; |
| const int kernel_size = kernel_t * kernel_h * kernel_w; |
| const int kernel_hw_size = kernel_h * kernel_w; |
| const int output_size = output_t * output_h * output_w; |
| const int channel_size = clip_len * height * width; |
| const int output_hw_size = output_h * output_w; |
| const int channel_hw_size = height * width; |
| |
| // Fast path for zero padding and no dilation |
| // From Torch, THNN_(unfolded_copy) |
| if (dilation_t == 1 && dilation_h == 1 && dilation_w == 1 && pad_a == 0 && |
| pad_p == 0 && pad_l == 0 && pad_r == 0 && pad_t == 0 && pad_b == 0) { |
| for (auto k = 0; k < channels * kernel_size; k++) { |
| const auto nip = k / kernel_size; |
| const auto rest = k % kernel_size; |
| const auto kt = rest / kernel_hw_size; |
| const auto rest_hw = rest % kernel_hw_size; |
| const auto kh = rest_hw / kernel_w; |
| const auto kw = rest_hw % kernel_w; |
| auto* dst = col_data + nip * (kernel_size * output_size) + |
| kt * (kernel_hw_size * output_size) + kh * (kernel_w * output_size) + |
| kw * output_size; |
| const auto* src = img_data + nip * channel_size; |
| for (auto t = 0; t < output_t; t++) { |
| const auto it = t * stride_t + kt; |
| for (auto y = 0; y < output_h; y++) { |
| const auto iy = y * stride_h + kh; |
| const auto ix = kw; |
| if (stride_w == 1) { |
| memcpy( |
| dst + (t * output_hw_size + y * output_w), |
| src + (it * channel_hw_size + iy * width + ix), |
| sizeof(T) * output_w); |
| } else { |
| for (auto x = 0; x < output_w; x++) { |
| memcpy( |
| dst + (t * output_hw_size + y * output_w + x), |
| src + (it * channel_hw_size + iy * width + ix + x * stride_w), |
| sizeof(T)); |
| } |
| } |
| } |
| } |
| } |
| return; |
| } |
| // Fast path for equal padding |
| if (pad_a == pad_p && pad_l == pad_r && pad_t == pad_b) { |
| const int pad_f = pad_a; |
| const int pad_h = pad_t; |
| const int pad_w = pad_l; |
| for (int channel = channels; channel--; img_data += channel_size) { |
| for (int kernel_frame = 0; kernel_frame < kernel_t; kernel_frame++) { |
| for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { |
| for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { |
| int input_frame = -pad_f + kernel_frame * dilation_t; |
| for (int output_frames = output_t; output_frames; output_frames--) { |
| if (!utils::IsAGeZeroAndALtB(input_frame, clip_len)) { |
| for (int output_rows = output_h; output_rows; output_rows--) { |
| for (int output_cols = output_w; output_cols; output_cols--) { |
| *(col_data++) = 0; |
| } |
| } |
| } else { |
| int input_row = -pad_h + kernel_row * dilation_h; |
| for (int output_rows = output_h; output_rows; output_rows--) { |
| if (!utils::IsAGeZeroAndALtB(input_row, height)) { |
| for (int output_cols = output_w; output_cols; |
| output_cols--) { |
| *(col_data++) = 0; |
| } |
| } else { |
| int input_col = -pad_w + kernel_col * dilation_w; |
| for (int output_col = output_w; output_col; output_col--) { |
| if (utils::IsAGeZeroAndALtB(input_col, width)) { |
| *(col_data++) = img_data |
| [(input_frame * height + input_row) * width + |
| input_col]; |
| } else { |
| *(col_data++) = 0; |
| } |
| input_col += stride_w; |
| } |
| } |
| input_row += stride_h; |
| } |
| } |
| input_frame += stride_t; |
| } |
| } |
| } |
| } |
| } |
| return; |
| } |
| |
| // Baseline |
| const int dkernel_t = dilation_t * (kernel_t - 1) + 1; |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
| int clip_col = (clip_len + pad_p + pad_a - dkernel_t) / stride_t + 1; |
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| |
| int channels_col = channels * kernel_t * kernel_h * kernel_w; |
| for (int c = 0; c < channels_col; ++c) { |
| int w_offset = c % kernel_w; |
| int h_offset = (c / kernel_w) % kernel_h; |
| int t_offset = (c / kernel_w / kernel_h) % kernel_t; |
| int c_im = c / kernel_h / kernel_w / kernel_t; |
| for (int t = 0; t < clip_col; ++t) { |
| for (int h = 0; h < height_col; ++h) { |
| for (int w = 0; w < width_col; ++w) { |
| int t_pad = t * stride_t - pad_p + t_offset * dilation_t; |
| int h_pad = h * stride_h - pad_t + h_offset * dilation_h; |
| int w_pad = w * stride_w - pad_l + w_offset * dilation_w; |
| if (t_pad >= 0 && t_pad < clip_len && h_pad >= 0 && h_pad < height && |
| w_pad >= 0 && w_pad < width) { |
| col_data[((c * clip_col + t) * height_col + h) * width_col + w] = |
| img_data |
| [((c_im * clip_len + t_pad) * height + h_pad) * width + |
| w_pad]; |
| } else { |
| col_data[((c * clip_col + t) * height_col + h) * width_col + w] = 0; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| } // namespace |
| |
| template <> |
| C10_EXPORT void Im2ColNd<float, CPUContext, StorageOrder::NCHW>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* img_data, |
| float* col_data, |
| CPUContext* /* context */, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Im2Col. |
| if (N == 3) { |
| const int channels = |
| col_shape[0] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2]; |
| Im2Col3dNCHWImpl<float>( |
| channels, |
| img_shape[1], |
| img_shape[2], |
| img_shape[3], |
| kernel_shape[0], |
| kernel_shape[1], |
| kernel_shape[2], |
| dilation[0], |
| dilation[1], |
| dilation[2], |
| pad[0], |
| pad[1], |
| pad[2], |
| pad[3], |
| pad[4], |
| pad[5], |
| stride[0], |
| stride[1], |
| stride[2], |
| img_data, |
| col_data); |
| } else { |
| Im2ColNdNCHWImpl<float, false>( |
| N, |
| img_size, |
| col_size, |
| img_shape, |
| col_shape, |
| kernel_shape, |
| stride, |
| dilation, |
| pad, |
| img_data, |
| col_data); |
| } |
| } |
| |
| template <> |
| C10_EXPORT void Col2ImNd<float, CPUContext, StorageOrder::NCHW>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* col_data, |
| float* img_data, |
| CPUContext* /* context */, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Col2Im. |
| Im2ColNdNCHWImpl<float, true>( |
| N, |
| img_size, |
| col_size, |
| img_shape, |
| col_shape, |
| kernel_shape, |
| stride, |
| dilation, |
| pad, |
| col_data, |
| img_data); |
| } |
| |
| template <> |
| C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NCHW>( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* img_data, |
| float* col_data, |
| CPUContext* context, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Im2Col. |
| |
| // Fast path for zero padding and no dilation |
| if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 && |
| dilation_w == 1) { |
| Im2ColZeroPaddingAndNoDilationNCHW<float>( |
| C, |
| H, |
| W, |
| kernel_h, |
| kernel_w, |
| stride_h, |
| stride_w, |
| img_data, |
| col_data, |
| context); |
| return; |
| } |
| |
| // Baseline |
| const int output_h = |
| (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; |
| const int output_w = |
| (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; |
| const int output_size = output_h * output_w; |
| for (int c = 0; c < C; ++c) { |
| for (int kh = 0; kh < kernel_h; ++kh) { |
| for (int kw = 0; kw < kernel_w; ++kw) { |
| for (int h = 0; h < output_h; ++h) { |
| const int h_pad = h * stride_h - pad_t + kh * dilation_h; |
| if (!utils::IsAGeZeroAndALtB(h_pad, H)) { |
| std::memset(col_data + h * output_w, 0, output_w * sizeof(float)); |
| continue; |
| } |
| for (int w = 0; w < output_w; ++w) { |
| const int w_pad = w * stride_w - pad_l + kw * dilation_w; |
| col_data[h * output_w + w] = utils::IsAGeZeroAndALtB(w_pad, W) |
| ? img_data[(c * H + h_pad) * W + w_pad] |
| : 0; |
| } |
| } |
| col_data += output_size; |
| } |
| } |
| } |
| } |
| |
| template <> |
| C10_EXPORT void Im2Col<float, CPUContext, StorageOrder::NHWC>( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* img_data, |
| float* col_data, |
| CPUContext* context, |
| const int groups) { |
| // Fast path for zero padding and no dilation |
| if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 && |
| dilation_w == 1 && groups == 1) { |
| Im2ColZeroPaddingAndNoDilationNHWC<float>( |
| C, |
| H, |
| W, |
| kernel_h, |
| kernel_w, |
| stride_h, |
| stride_w, |
| img_data, |
| col_data, |
| context); |
| return; |
| } |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int h_pad = -pad_t; |
| if (groups == 1) { |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) { |
| if (!utils::IsAGeZeroAndALtB(ih, H)) { |
| std::memset(col_data, 0, sizeof(float) * kernel_w * C); |
| col_data += kernel_w * C; |
| continue; |
| } |
| for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) { |
| if (utils::IsAGeZeroAndALtB(iw, W)) { |
| std::memcpy( |
| col_data, img_data + (ih * W + iw) * C, sizeof(float) * C); |
| } else { |
| std::memset(col_data, 0, sizeof(float) * C); |
| } |
| col_data += C; |
| } // iw |
| } // ih |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| } else { |
| /** |
| * img_data in N H W G C/G layout |
| * col_data in N G H W R S C/G layout |
| * Note that groups are pulled out to an outer dimension so that we can use |
| * GEMMs efficiently. |
| */ |
| const int C_per_G = C / groups; |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| int r = 0; |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { |
| int s = 0; |
| for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) { |
| if (utils::IsAGeZeroAndALtB(ih, H) && |
| utils::IsAGeZeroAndALtB(iw, W)) { |
| for (int g = 0; g < groups; ++g) { |
| std::memcpy( |
| col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G, |
| img_data + (ih * W + iw) * C + g * C_per_G, |
| sizeof(float) * C_per_G); |
| } |
| } else { |
| for (int g = 0; g < groups; ++g) { |
| std::memset( |
| col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G, |
| 0, |
| sizeof(float) * C_per_G); |
| } |
| } |
| } // iw |
| } // ih |
| col_data += kernel_h * kernel_w * C; |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| } |
| } |
| |
| /** |
| * The layout of the result is N H W G R S C/G. |
| * Note that groups are pulled out to an outer dimension so that we can use |
| * GEMMs efficiently. |
| */ |
| template <typename TData> |
| C10_EXPORT void Im2Col3dNHWCImpl( |
| const int C, |
| const int T, |
| const int H, |
| const int W, |
| const int kernel_t, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_t, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_p, // previous frame |
| const int pad_t, // top |
| const int pad_l, // left |
| const int pad_n, // next frame |
| const int pad_b, // bottom |
| const int pad_r, // right |
| const int stride_t, |
| const int stride_h, |
| const int stride_w, |
| const TData* img_data, |
| TData* col_data, |
| const int groups) { |
| const int dkernel_t = dilation_t * (kernel_t - 1) + 1; |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1; |
| const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int C_per_G = C / groups; |
| int t_pad = -pad_p; |
| for (int t = 0; t < output_t; ++t) { |
| int h_pad = -pad_t; |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| int q = 0; |
| for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) { |
| int r = 0; |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { |
| int s = 0; |
| for (int iw = w_pad; iw < w_pad + dkernel_w; |
| iw += dilation_w, ++s) { |
| if (utils::IsAGeZeroAndALtB(it, T) && |
| utils::IsAGeZeroAndALtB(ih, H) && |
| utils::IsAGeZeroAndALtB(iw, W)) { |
| for (int g = 0; g < groups; ++g) { |
| std::memcpy( |
| col_data + |
| (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) * |
| C_per_G, |
| img_data + ((it * H + ih) * W + iw) * C + g * C_per_G, |
| sizeof(TData) * C_per_G); |
| } |
| } else { |
| for (int g = 0; g < groups; ++g) { |
| std::memset( |
| col_data + |
| (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) * |
| C_per_G, |
| 0, |
| sizeof(TData) * C_per_G); |
| } |
| } |
| } // iw |
| } // ih |
| } // it |
| col_data += kernel_t * kernel_h * kernel_w * C; |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| t_pad += stride_t; |
| } // t |
| } |
| |
| template <> |
| C10_EXPORT void Im2ColNd<float, CPUContext, StorageOrder::NHWC>( |
| const int N, |
| const int /*img_size*/, |
| const int /*col_size*/, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* img_data, |
| float* col_data, |
| CPUContext* /* context */, |
| const int groups) { |
| if (N == 3) { |
| const int channels = |
| col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2]; |
| Im2Col3dNHWCImpl<float>( |
| channels, |
| img_shape[0], |
| img_shape[1], |
| img_shape[2], |
| kernel_shape[0], |
| kernel_shape[1], |
| kernel_shape[2], |
| dilation[0], |
| dilation[1], |
| dilation[2], |
| pad[0], |
| pad[1], |
| pad[2], |
| pad[3], |
| pad[4], |
| pad[5], |
| stride[0], |
| stride[1], |
| stride[2], |
| img_data, |
| col_data, |
| groups); |
| } else { |
| CAFFE_NOT_IMPLEMENTED; |
| } |
| } |
| |
| template <> |
| C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NCHW>( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* col_data, |
| float* img_data, |
| CPUContext* context, |
| const int /* groups */) { |
| // In NCHW, the number of groups doesn't affect Col2Im. |
| |
| // Fast path for zero padding and no dilation |
| if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 && |
| dilation_w == 1) { |
| Col2ImZeroPaddingAndNoDilationNCHW<float>( |
| C, |
| H, |
| W, |
| kernel_h, |
| kernel_w, |
| stride_h, |
| stride_w, |
| col_data, |
| img_data, |
| context); |
| return; |
| } |
| |
| // Fallback |
| Set<float, CPUContext>(C * H * W, 0.0f, img_data, context); |
| const int output_h = |
| (H + pad_t + pad_b - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; |
| const int output_w = |
| (W + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; |
| const int output_size = output_h * output_w; |
| for (int c = 0; c < C; ++c) { |
| for (int kh = 0; kh < kernel_h; ++kh) { |
| for (int kw = 0; kw < kernel_w; ++kw) { |
| for (int h = 0; h < output_h; ++h) { |
| const int h_pad = h * stride_h - pad_t + kh * dilation_h; |
| if (!utils::IsAGeZeroAndALtB(h_pad, H)) { |
| continue; |
| } |
| for (int w = 0; w < output_w; ++w) { |
| const int w_pad = w * stride_w - pad_l + kw * dilation_w; |
| if (utils::IsAGeZeroAndALtB(w_pad, W)) { |
| img_data[(c * H + h_pad) * W + w_pad] += |
| col_data[h * output_w + w]; |
| } |
| } |
| } |
| col_data += output_size; |
| } |
| } |
| } |
| } |
| |
| template <> |
| C10_EXPORT void Col2Im<float, CPUContext, StorageOrder::NHWC>( |
| const int C, |
| const int H, |
| const int W, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* col_data, |
| float* img_data, |
| CPUContext* context, |
| const int groups) { |
| // Fast path for zero padding and no dilation |
| if (pad_t == 0 && pad_l == 0 && pad_b == 0 && pad_r == 0 && dilation_h == 1 && |
| dilation_w == 1 && groups == 1) { |
| Col2ImZeroPaddingAndNoDilationNHWC<float>( |
| C, |
| H, |
| W, |
| kernel_h, |
| kernel_w, |
| stride_h, |
| stride_w, |
| col_data, |
| img_data, |
| context); |
| return; |
| } |
| |
| Set<float, CPUContext>(H * W * C, 0, img_data, context); |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| |
| int h_pad = -pad_t; |
| if (groups == 1) { |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) { |
| if (!utils::IsAGeZeroAndALtB(ih, H)) { |
| col_data += kernel_w * C; |
| continue; |
| } |
| for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) { |
| if (utils::IsAGeZeroAndALtB(iw, W)) { |
| float* img_data_patch = img_data + (ih * W + iw) * C; |
| Add<float, CPUContext>( |
| C, img_data_patch, col_data, img_data_patch, context); |
| } |
| col_data += C; |
| } // iw |
| } // ih |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| } else { |
| const int C_per_G = C / groups; |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| int r = 0; |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { |
| int s = 0; |
| for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w, ++s) { |
| if (utils::IsAGeZeroAndALtB(ih, H) && |
| utils::IsAGeZeroAndALtB(iw, W)) { |
| float* img_data_patch = img_data + (ih * W + iw) * C; |
| for (int g = 0; g < groups; ++g) { |
| Add<float, CPUContext>( |
| C_per_G, |
| img_data_patch + g * C_per_G, |
| col_data + ((g * kernel_h + r) * kernel_w + s) * C_per_G, |
| img_data_patch + g * C_per_G, |
| context); |
| } |
| } |
| } // iw |
| } // ih |
| col_data += kernel_h * kernel_w * C; |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| } |
| } |
| |
| /** |
| * The layout of the result is N H W G R S C/G. |
| * Note that groups are pulled out to an outer dimension so that we can use |
| * GEMMs efficiently. |
| */ |
| template <typename TData> |
| C10_EXPORT void Col2Im3dNHWCImpl( |
| const int C, |
| const int T, |
| const int H, |
| const int W, |
| const int kernel_t, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_t, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_p, // previous frame |
| const int pad_t, // top |
| const int pad_l, // left |
| const int pad_n, // next frame |
| const int pad_b, // bottom |
| const int pad_r, // right |
| const int stride_t, |
| const int stride_h, |
| const int stride_w, |
| const TData* col_data, |
| TData* img_data, |
| CPUContext* context, |
| const int groups) { |
| Set<float, CPUContext>(T * H * W * C, 0, img_data, context); |
| const int dkernel_t = dilation_t * (kernel_t - 1) + 1; |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_t = (T + pad_p + pad_n - dkernel_t) / stride_t + 1; |
| const int output_h = (H + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (W + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int C_per_G = C / groups; |
| |
| int t_pad = -pad_p; |
| for (int t = 0; t < output_t; ++t) { |
| int h_pad = -pad_t; |
| for (int h = 0; h < output_h; ++h) { |
| int w_pad = -pad_l; |
| for (int w = 0; w < output_w; ++w) { |
| int q = 0; |
| for (int it = t_pad; it < t_pad + dkernel_t; it += dilation_t, ++q) { |
| int r = 0; |
| for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h, ++r) { |
| int s = 0; |
| for (int iw = w_pad; iw < w_pad + dkernel_w; |
| iw += dilation_w, ++s) { |
| if (utils::IsAGeZeroAndALtB(it, T) && |
| utils::IsAGeZeroAndALtB(ih, H) && |
| utils::IsAGeZeroAndALtB(iw, W)) { |
| float* img_data_patch = img_data + ((it * H + ih) * W + iw) * C; |
| for (int g = 0; g < groups; ++g) { |
| Add<float, CPUContext>( |
| C_per_G, |
| img_data_patch + g * C_per_G, |
| col_data + |
| (((g * kernel_t + q) * kernel_h + r) * kernel_w + s) * |
| C_per_G, |
| img_data_patch + g * C_per_G, |
| context); |
| } |
| } |
| } // iw |
| } // ih |
| } // it |
| col_data += kernel_t * kernel_h * kernel_w * C; |
| w_pad += stride_w; |
| } // w |
| h_pad += stride_h; |
| } // h |
| t_pad += stride_t; |
| } // t |
| } |
| |
| template <> |
| C10_EXPORT void Col2ImNd<float, CPUContext, StorageOrder::NHWC>( |
| const int N, |
| const int /*img_size*/, |
| const int /*col_size*/, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* col_data, |
| float* img_data, |
| CPUContext* context, |
| const int groups) { |
| if (N == 3) { |
| const int channels = |
| col_shape[3] / kernel_shape[0] / kernel_shape[1] / kernel_shape[2]; |
| Col2Im3dNHWCImpl<float>( |
| channels, |
| img_shape[0], |
| img_shape[1], |
| img_shape[2], |
| kernel_shape[0], |
| kernel_shape[1], |
| kernel_shape[2], |
| dilation[0], |
| dilation[1], |
| dilation[2], |
| pad[0], |
| pad[1], |
| pad[2], |
| pad[3], |
| pad[4], |
| pad[5], |
| stride[0], |
| stride[1], |
| stride[2], |
| col_data, |
| img_data, |
| context, |
| groups); |
| } else { |
| CAFFE_NOT_IMPLEMENTED; |
| } |
| } |
| |
| template <> |
| C10_EXPORT void BiasCHW<float, CPUContext>( |
| const float* bias, |
| const float* /*bias_multiplier*/, |
| const int bias_channels, |
| const int image_size, |
| float* image, |
| CPUContext* /*context*/) { |
| // Sum the per-channel bias into every image plane |
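| // i.e. for a CHW image stored contiguously as bias_channels planes of |
| // image_size floats, this computes image[c * image_size + i] += bias[c]. |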
| for (int c = 0; c < bias_channels; ++c) { |
| float b = bias[c]; |
| |
| #if defined(__ARM_NEON__) || defined(__ARM_NEON) |
| float32x4_t vBias = vdupq_n_f32(b); |
| |
| // We give alignment hints for additional speed, so handle the |
| // non-vectorizable prologue separately. |
| constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float); |
| |
| // If image_size < kVecSizeInFloat we cannot vectorize at all; clamping the |
| // prologue to image_size keeps the scalar prologue loop in bounds. |
| const int prologue = std::min<int>( |
| image_size, |
| kVecSizeInFloat - |
| // remainder in floats |
| (((uintptr_t)image) % (sizeof(float32x4_t))) / sizeof(float)); |
| |
| int i = 0; |
| // Prologue loop |
| for (; i < prologue; ++i) { |
| image[i] += b; |
| } |
| |
| // The loop is manually unrolled by 8 |
| constexpr int kUnroll = 8; |
| constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat; |
| |
| int remainder = image_size - prologue; |
| int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop; |
| |
| // Vectorizable body |
| for (; i < vectorizable; i += kFloatsPerLoop) { |
| // Manually unrolled |
| float32x4_t v0 = vld1q_f32_aligned(image + i + 0); |
| float32x4_t v1 = vld1q_f32_aligned(image + i + 4); |
| float32x4_t v2 = vld1q_f32_aligned(image + i + 8); |
| float32x4_t v3 = vld1q_f32_aligned(image + i + 12); |
| float32x4_t v4 = vld1q_f32_aligned(image + i + 16); |
| float32x4_t v5 = vld1q_f32_aligned(image + i + 20); |
| float32x4_t v6 = vld1q_f32_aligned(image + i + 24); |
| float32x4_t v7 = vld1q_f32_aligned(image + i + 28); |
| |
| v0 = vaddq_f32(v0, vBias); |
| v1 = vaddq_f32(v1, vBias); |
| v2 = vaddq_f32(v2, vBias); |
| v3 = vaddq_f32(v3, vBias); |
| v4 = vaddq_f32(v4, vBias); |
| v5 = vaddq_f32(v5, vBias); |
| v6 = vaddq_f32(v6, vBias); |
| v7 = vaddq_f32(v7, vBias); |
| |
| vst1q_f32_aligned(image + i + 0, v0); |
| vst1q_f32_aligned(image + i + 4, v1); |
| vst1q_f32_aligned(image + i + 8, v2); |
| vst1q_f32_aligned(image + i + 12, v3); |
| vst1q_f32_aligned(image + i + 16, v4); |
| vst1q_f32_aligned(image + i + 20, v5); |
| vst1q_f32_aligned(image + i + 24, v6); |
| vst1q_f32_aligned(image + i + 28, v7); |
| } |
| |
| // Non-vectorizable epilogue |
| for (; i < image_size; ++i) { |
| image[i] += b; |
| } |
| #else |
| // Non-NEON CPU implementation |
| for (int i = 0; i < image_size; ++i) { |
| image[i] += b; |
| } |
| #endif // defined(__ARM_NEON__) || defined(__ARM_NEON) |
| |
| image += image_size; |
| } |
| } |
| |
| #define CAFFE2_SPECIALIZED_COPYVECTOR(T) \ |
| template <> \ |
| C10_EXPORT void CopyVector<T, CPUContext>( \ |
| const int N, const T* src, T* dst, CPUContext* /*context*/) { \ |
| if (src != dst && N > 0) { \ |
| memcpy(dst, src, sizeof(T) * N); \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_COPYVECTOR(float) |
| #undef CAFFE2_SPECIALIZED_COPYVECTOR |
| |
| namespace { |
| |
| #ifdef CAFFE2_USE_HPTT |
| |
| bool TransposeWithHPTT( |
| const int ndim, |
| const int* dims, |
| const int* axes, |
| const float* X, |
| float* Y) { |
| std::vector<int> axes_cm(ndim); |
| std::vector<int> dims_cm(ndim); |
| // Convert row-major index to column-major. |
| const auto cm_fn = [ndim](const int i) { return ndim - i - 1; }; |
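| // HPTT expects a column-major description of the permutation, so both the |
| // position inside axes and the axis id stored there have to be mirrored, |
| // hence cm_fn is applied twice below. |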
| for (int i = 0; i < ndim; ++i) { |
| axes_cm[i] = cm_fn(axes[cm_fn(i)]); |
| dims_cm[i] = dims[cm_fn(i)]; |
| } |
| |
| // HPTT doesn't handle zero-sized inputs. |
| for (auto dim : dims_cm) { |
| if (dim <= 0) { |
| return false; |
| } |
| } |
| auto plan = hptt::create_plan( |
| axes_cm.data(), |
| ndim, |
| 1.0, |
| X, |
| dims_cm.data(), |
| nullptr, |
| 0.0, |
| Y, |
| nullptr, |
| hptt::ESTIMATE, |
| 1); |
| if (plan == nullptr) { |
| return false; |
| } |
| plan->execute(); |
| return true; |
| } |
| |
| #endif // CAFFE2_USE_HPTT |
| |
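| // Transpose2D transposes a row-major rows x cols matrix X into a row-major |
| // cols x rows matrix Y. The definition below is picked at build time: MKL's |
| // omatcopy when MKL is linked, otherwise HPTT (for float, when available) or |
| // a plain Eigen transpose. |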
| template <typename T> |
| void Transpose2D(const int rows, const int cols, const T* X, T* Y); |
| |
| #ifdef CAFFE2_USE_MKL |
| |
| #define DELEGATE_TRANSPOSE_2D_FUNCTION(T, Func) \ |
| template <> \ |
| void Transpose2D<T>(const int rows, const int cols, const T* X, T* Y) { \ |
| Func('R', 'T', rows, cols, T(1), X, cols, Y, rows); \ |
| } |
| DELEGATE_TRANSPOSE_2D_FUNCTION(float, mkl_somatcopy); |
| DELEGATE_TRANSPOSE_2D_FUNCTION(double, mkl_domatcopy); |
| #undef DELEGATE_TRANSPOSE_2D_FUNCTION |
| |
| #endif // CAFFE2_USE_MKL |
| |
| #define CAFFE2_SPECIALIZED_TRANSPOSE_2D(T) \ |
| template <> \ |
| void Transpose2D<T>(const int rows, const int cols, const T* X, T* Y) { \ |
| EigenMatrixMap<T>(Y, rows, cols) = \ |
| ConstEigenMatrixMap<T>(X, cols, rows).transpose(); \ |
| } |
| |
| #ifndef CAFFE2_USE_MKL |
| |
| template <> |
| void Transpose2D<float>( |
| const int rows, |
| const int cols, |
| const float* X, |
| float* Y) { |
| #ifdef CAFFE2_USE_HPTT |
| const std::array<int, 2> dims = {rows, cols}; |
| const std::array<int, 2> axes = {1, 0}; |
| if (TransposeWithHPTT(2, dims.data(), axes.data(), X, Y)) { |
| return; |
| } |
| #endif // CAFFE2_USE_HPTT |
| EigenMatrixMap<float>(Y, rows, cols) = |
| ConstEigenMatrixMap<float>(X, cols, rows).transpose(); |
| } |
| |
| CAFFE2_SPECIALIZED_TRANSPOSE_2D(double) |
| |
| #endif // CAFFE2_USE_MKL |
| |
| CAFFE2_SPECIALIZED_TRANSPOSE_2D(int) |
| CAFFE2_SPECIALIZED_TRANSPOSE_2D(int64_t) |
| CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint8_t) |
| CAFFE2_SPECIALIZED_TRANSPOSE_2D(std::uint16_t) |
| |
| #undef CAFFE2_SPECIALIZED_TRANSPOSE_2D |
| |
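| // For Y = transpose(X, axes), returns the row-major stride in X of the axis |
| // that each Y axis is taken from, so that an offset into X can be recovered |
| // from a (partial) Y index. |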
| std::vector<int> |
| ComputeXStrides(const int ndim, const int* dims, const int* axes) { |
| std::vector<int> x_strides(ndim); |
| std::vector<int> buff(ndim); |
| int cur_stride = 1; |
| for (int i = ndim - 1; i >= 0; --i) { |
| buff[i] = cur_stride; |
| cur_stride *= dims[i]; |
| } |
| for (int i = 0; i < ndim; ++i) { |
| x_strides[i] = buff[axes[i]]; |
| } |
| return x_strides; |
| } |
| |
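| // Generic N-dimensional transpose: iterate over Y in row-major order and |
| // gather from X, copying the largest contiguous trailing block in a single |
| // memcpy whenever the trailing axes are left untouched by the permutation. |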
| template <typename T> |
| void TransposeND( |
| const int ndim, |
| const int* dims, |
| const int* axes, |
| const T* X, |
| T* Y) { |
| std::vector<int> Y_dims(ndim); |
| for (int i = 0; i < ndim; ++i) { |
| Y_dims[i] = dims[axes[i]]; |
| } |
| // Measure the size of the contiguous trailing block that can be copied at once. |
| int block_size = 1; |
| int num_shared_idx = 0; |
| for (int i = ndim - 1; i >= 0 && axes[i] == i; --i) { |
| block_size *= Y_dims[i]; |
| ++num_shared_idx; |
| } |
| const int itr_axes = ndim - num_shared_idx; |
| const int num_blocks = std::accumulate( |
| Y_dims.cbegin(), Y_dims.cbegin() + itr_axes, 1, std::multiplies<int>()); |
| const std::vector<int> X_strides = ComputeXStrides(itr_axes, dims, axes); |
| std::vector<int> index(itr_axes, 0); |
| for (int Y_index = 0; Y_index < num_blocks; ++Y_index) { |
| const int X_index = std::inner_product( |
| X_strides.cbegin(), X_strides.cend(), index.cbegin(), 0); |
| if (block_size == 1) { |
| Y[Y_index] = X[X_index]; |
| } else { |
| std::memcpy( |
| Y + block_size * Y_index, |
| X + block_size * X_index, |
| block_size * sizeof(T)); |
| } |
| utils::IncreaseIndexInDims(itr_axes, Y_dims.data(), index.data()); |
| } |
| } |
| |
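| // Dispatch: identity permutations degenerate to a memcpy, permutations that |
| // only swap the last two axes become a batch of 2D transposes, and anything |
| // else falls through to TransposeND. |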
| template <typename T> |
| void TransposeCPUImpl( |
| const int ndim, |
| const int* dims, |
| const int* axes, |
| const T* X, |
| T* Y) { |
| if (utils::IsIdentityPermutation(ndim, axes)) { |
| const int size = |
| std::accumulate(dims, dims + ndim, 1, std::multiplies<int>()); |
| std::memcpy(Y, X, size * sizeof(T)); |
| return; |
| } |
| if (utils::IsBatchTranspose2D(ndim, axes)) { |
| const int N = |
| std::accumulate(dims, dims + ndim - 2, 1, std::multiplies<int>()); |
| const int H = dims[ndim - 2]; |
| const int W = dims[ndim - 1]; |
| for (int i = 0; i < N; ++i) { |
| Transpose2D<T>(H, W, X + i * H * W, Y + i * H * W); |
| } |
| return; |
| } |
| TransposeND<T>(ndim, dims, axes, X, Y); |
| } |
| |
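| // The float specialization follows the same dispatch but, when HPTT is |
| // available, tries it before falling back to the generic TransposeND path. |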
| template <> |
| void TransposeCPUImpl( |
| const int ndim, |
| const int* dims, |
| const int* axes, |
| const float* X, |
| float* Y) { |
| if (utils::IsIdentityPermutation(ndim, axes)) { |
| const int size = |
| std::accumulate(dims, dims + ndim, 1, std::multiplies<int>()); |
| std::memcpy(Y, X, size * sizeof(float)); |
| return; |
| } |
| if (utils::IsBatchTranspose2D(ndim, axes)) { |
| const int N = |
| std::accumulate(dims, dims + ndim - 2, 1, std::multiplies<int>()); |
| const int H = dims[ndim - 2]; |
| const int W = dims[ndim - 1]; |
| for (int i = 0; i < N; ++i) { |
| Transpose2D<float>(H, W, X + i * H * W, Y + i * H * W); |
| } |
| return; |
| } |
| #ifdef CAFFE2_USE_HPTT |
| if (TransposeWithHPTT(ndim, dims, axes, X, Y)) { |
| return; |
| } |
| #endif |
| TransposeND<float>(ndim, dims, axes, X, Y); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_TRANSPOSE(T) \ |
| template <> \ |
| C10_EXPORT void Transpose<T, CPUContext>( \ |
| const int ndim, \ |
| const int* dims, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* /* context */) { \ |
| TransposeCPUImpl(ndim, dims, axes, X, Y); \ |
| } |
| CAFFE2_SPECIALIZED_TRANSPOSE(float) |
| CAFFE2_SPECIALIZED_TRANSPOSE(double) |
| CAFFE2_SPECIALIZED_TRANSPOSE(int) |
| CAFFE2_SPECIALIZED_TRANSPOSE(int64_t) |
| CAFFE2_SPECIALIZED_TRANSPOSE(std::uint8_t) |
| CAFFE2_SPECIALIZED_TRANSPOSE(std::uint16_t) |
| #undef CAFFE2_SPECIALIZED_TRANSPOSE |
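| |
| // A minimal usage sketch (illustrative only; X, Y and context stand for |
| // caller-provided buffers and a CPUContext): permuting a 2 x 3 x 4 float |
| // tensor X into a 4 x 2 x 3 tensor Y with Y(k, i, j) == X(i, j, k) would |
| // look like: |
| // const int dims[3] = {2, 3, 4}; |
| // const int axes[3] = {2, 0, 1}; |
| // math::Transpose<float, CPUContext>(3, dims, axes, X, Y, &context); |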
| |
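| // AffineChannel applies a per-channel affine transform, |
| // Y[n][c][i] = X[n][c][i] * scale[c] + bias[c], |
| // using a rowwise Eigen expression for NCHW and a colwise one for NHWC. |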
| #define CAFFE2_SPECIALIZED_AFFINE_CHANNEL(T) \ |
| template <> \ |
| C10_EXPORT void AffineChannel<T, CPUContext, StorageOrder::NCHW>( \ |
| const int N, \ |
| const int C, \ |
| const int HxW, \ |
| const T* X, \ |
| const T* scale, \ |
| const T* bias, \ |
| T* Y, \ |
| CPUContext* /* context */) { \ |
| ConstEigenVectorArrayMap<T> scale_arr(scale, C); \ |
| ConstEigenVectorArrayMap<T> bias_arr(bias, C); \ |
| const int stride = C * HxW; \ |
| const T* X_ptr = X; \ |
| T* Y_ptr = Y; \ |
| for (int i = 0; i < N; ++i) { \ |
| EigenArrayMap<T>(Y_ptr, HxW, C) = \ |
| (ConstEigenArrayMap<T>(X_ptr, HxW, C).rowwise() * \ |
| scale_arr.transpose()) \ |
| .rowwise() + \ |
| bias_arr.transpose(); \ |
| X_ptr += stride; \ |
| Y_ptr += stride; \ |
| } \ |
| } \ |
| template <> \ |
| C10_EXPORT void AffineChannel<T, CPUContext, StorageOrder::NHWC>( \ |
| const int N, \ |
| const int C, \ |
| const int HxW, \ |
| const T* X, \ |
| const T* scale, \ |
| const T* bias, \ |
| T* Y, \ |
| CPUContext* /* context */) { \ |
| EigenArrayMap<T>(Y, C, N * HxW) = \ |
| (ConstEigenArrayMap<T>(X, C, N * HxW).colwise() * \ |
| ConstEigenVectorArrayMap<T>(scale, C)) \ |
| .colwise() + \ |
| ConstEigenVectorArrayMap<T>(bias, C); \ |
| } |
| CAFFE2_SPECIALIZED_AFFINE_CHANNEL(float) |
| #undef CAFFE2_SPECIALIZED_AFFINE_CHANNEL |
| |
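| // NCHW2NHWC and NHWC2NCHW are per-image 2D transposes: each image is viewed |
| // as a C x HxW (respectively HxW x C) row-major matrix and transposed. |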
| #define CAFFE2_SPECIALIZED_NCHW2NHWC(T) \ |
| template <> \ |
| C10_EXPORT void NCHW2NHWC<T, CPUContext>( \ |
| const int N, \ |
| const int C, \ |
| const int HxW, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* /* context */) { \ |
| const int stride = C * HxW; \ |
| for (int i = 0; i < N; ++i) { \ |
| Transpose2D<T>(C, HxW, X + i * stride, Y + i * stride); \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_NCHW2NHWC(float) |
| #undef CAFFE2_SPECIALIZED_NCHW2NHWC |
| |
| #define CAFFE2_SPECIALIZED_NHWC2NCHW(T) \ |
| template <> \ |
| C10_EXPORT void NHWC2NCHW<T, CPUContext>( \ |
| const int N, \ |
| const int C, \ |
| const int HxW, \ |
| const T* X, \ |
| T* Y, \ |
| CPUContext* /* context */) { \ |
| const int stride = HxW * C; \ |
| for (int i = 0; i < N; ++i) { \ |
| Transpose2D<T>(HxW, C, X + i * stride, Y + i * stride); \ |
| } \ |
| } |
| CAFFE2_SPECIALIZED_NHWC2NCHW(float) |
| #undef CAFFE2_SPECIALIZED_NHWC2NCHW |
| |
| } // namespace math |
| } // namespace caffe2 |