// caffe2/utils/math/transpose.cc - platform/external/pytorch - Git at Google

 #include "caffe2/utils/math/transpose.h"

 #include <algorithm>
 #include <cstdint>
 #include <cstring>
 #include <functional>
 #include <limits>
 #include <numeric>
 #include <vector>

 #ifdef CAFFE2_USE_MKL
 #include <mkl.h>
 #endif // CAFFE2_USE_MKL

 #ifdef CAFFE2_USE_HPTT
 #include <hptt.h>
 #endif // CAFFE2_USE_HPTT

 #include "caffe2/core/context.h"
 #include "caffe2/utils/eigen_utils.h"
 #include "caffe2/utils/math/utils.h"

 namespace caffe2 {
 namespace math {

 namespace {

 template <typename TIndex, typename TData>
 void Transpose2D(
     const TIndex rows,
     const TIndex cols,
     const TData* X,
     TData* Y) {
   EigenMatrixMap<TData>(Y, rows, cols) =
       ConstEigenMatrixMap<TData>(X, cols, rows).transpose();
 }

 #ifdef CAFFE2_USE_MKL

 // Generates a full specialization of Transpose2D<TIndex, TData> that forwards
 // to the given MKL out-of-place matrix-copy routine instead of using Eigen.
 // The call passes 'R' and 'T' as the ordering/operation flags, TData(1) as
 // the scaling factor, `cols` as the leading dimension of X and `rows` as the
 // leading dimension of Y. Only float (mkl_somatcopy) and double
 // (mkl_domatcopy) are delegated; other element types keep the generic
 // Transpose2D.
 #define DELEGATE_TRANSPOSE_2D(TIndex, TData, MKLFunc)                   \
   template <>                                                           \
   void Transpose2D<TIndex, TData>(                                      \
       const TIndex rows, const TIndex cols, const TData* X, TData* Y) { \
     MKLFunc('R', 'T', rows, cols, TData(1), X, cols, Y, rows);          \
   }
 DELEGATE_TRANSPOSE_2D(std::int32_t, float, mkl_somatcopy);
 DELEGATE_TRANSPOSE_2D(std::int64_t, float, mkl_somatcopy);
 DELEGATE_TRANSPOSE_2D(std::int32_t, double, mkl_domatcopy);
 DELEGATE_TRANSPOSE_2D(std::int64_t, double, mkl_domatcopy);
 #undef DELEGATE_TRANSPOSE_2D

 #endif // CAFFE2_USE_MKL

 #ifdef CAFFE2_USE_HPTT

 // Tries to perform the N-d transpose X -> Y with the HPTT library.
 //
 // ndim: number of dimensions.
 // dims: extents of X, indexed in row-major order (ndim entries).
 // axes: permutation such that output dimension i is input dimension axes[i].
 // X, Y: source and destination buffers.
 //
 // Returns true when HPTT executed the transpose. Returns false, leaving Y
 // untouched, when a dimension is non-positive or does not fit in an int, or
 // when HPTT does not hand back a plan; the caller is expected to fall back to
 // another implementation.
 template <typename TIndex, typename TData>
 bool TransposeByHPTT(
     const int ndim,
     const TIndex* dims,
     const int* axes,
     const TData* X,
     TData* Y) {
   // HPTT takes sizes as int, so every extent must be in (0, INT_MAX].
   const auto fits_in_int = [](const TIndex extent) {
     return extent > 0 && extent <= std::numeric_limits<int>::max();
   };
   for (int d = 0; d < ndim; ++d) {
     if (!fits_in_int(dims[d])) {
       return false;
     }
   }

   // Convert row-major index to column-major: position k in the column-major
   // description mirrors position (ndim - k - 1) of the row-major one, and
   // the axis numbers stored in the permutation are mirrored the same way.
   std::vector<int> perm_cm(ndim);
   std::vector<int> sizes_cm(ndim);
   for (int k = 0; k < ndim; ++k) {
     const int mirrored = ndim - k - 1;
     perm_cm[k] = ndim - axes[mirrored] - 1;
     sizes_cm[k] = static_cast<int>(dims[mirrored]);
   }

   const auto plan = hptt::create_plan(
       perm_cm.data(),
       ndim,
       TData(1),
       X,
       sizes_cm.data(),
       nullptr,
       TData(0),
       Y,
       nullptr,
       hptt::ESTIMATE,
       1 /* num_threads */);
   if (!plan) {
     return false;
   }
   plan->execute();
   return true;
 }

 #endif // CAFFE2_USE_HPTT

 // Fallback N-d transpose that copies X into Y one contiguous block at a time.
 //
 // ndim: number of dimensions.
 // dims: extents of X (ndim entries).
 // axes: permutation; output dimension i has extent dims[axes[i]].
 // X, Y: source and destination buffers (must not overlap).
 //
 // Trailing axes that the permutation leaves in place (axes[i] == i) form a
 // contiguous block in both X and Y, so they are moved together; only the
 // leading `outer_rank` dimensions are actually permuted.
 template <typename TIndex, typename TData>
 void TransposeND(
     const int ndim,
     const TIndex* dims,
     const int* axes,
     const TData* X,
     TData* Y) {
   std::vector<TIndex> out_dims(ndim);
   for (int d = 0; d < ndim; ++d) {
     out_dims[d] = dims[axes[d]];
   }

   // Peel off the untouched trailing axes and accumulate their element count.
   int outer_rank = ndim;
   TIndex chunk = 1;
   while (outer_rank > 0 && axes[outer_rank - 1] == outer_rank - 1) {
     --outer_rank;
     chunk *= out_dims[outer_rank];
   }

   // Number of blocks to move = product of the permuted (leading) extents.
   TIndex chunk_count = 1;
   for (int d = 0; d < outer_rank; ++d) {
     chunk_count *= out_dims[d];
   }

   // Strides (in units of blocks) of X, reordered to follow Y's dimensions.
   std::vector<TIndex> src_strides(outer_rank);
   utils::ComputeTransposedStrides<TIndex>(
       outer_rank, dims, axes, src_strides.data());

   // Multi-dimensional counter over Y's leading dimensions.
   std::vector<TIndex> cursor(outer_rank, 0);
   for (TIndex dst_block = 0; dst_block < chunk_count; ++dst_block) {
     TIndex src_block = 0;
     for (int d = 0; d < outer_rank; ++d) {
       src_block += src_strides[d] * cursor[d];
     }
     if (chunk == 1) {
       // Single element per block: plain assignment instead of memcpy.
       Y[dst_block] = X[src_block];
     } else {
       std::memcpy(
           Y + chunk * dst_block,
           X + chunk * src_block,
           chunk * sizeof(TData));
     }
     utils::IncreaseIndexInDims<TIndex>(
         outer_rank, out_dims.data(), cursor.data());
   }
 }

 // Generic transpose dispatcher, tried in this order:
 //   1. empty tensor (product of dims is 0): nothing to do;
 //   2. utils::IsIdentityPermutation(ndim, axes): one memcpy of the buffer;
 //   3. utils::IsBatchTranspose2D(ndim, axes): the last two dimensions are
 //      transposed with Transpose2D for each leading slice;
 //   4. otherwise the general TransposeND.
 //
 // ndim/dims/axes describe X and the permutation; Y receives the result.
 template <typename TIndex, typename TData>
 void TransposeImpl(
     const int ndim,
     const TIndex* dims,
     const int* axes,
     const TData* X,
     TData* Y) {
   const TIndex total = std::accumulate(
       dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());
   if (total == 0) {
     return;
   }
   if (utils::IsIdentityPermutation(ndim, axes)) {
     std::memcpy(Y, X, total * sizeof(TData));
     return;
   }
   if (!utils::IsBatchTranspose2D(ndim, axes)) {
     TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);
     return;
   }

   // Batched 2D case: walk both buffers one (rows x cols) plane at a time.
   const TIndex rows = dims[ndim - 2];
   const TIndex cols = dims[ndim - 1];
   const TIndex plane = rows * cols;
   const TIndex batch = total / plane;
   const TData* src = X;
   TData* dst = Y;
   for (TIndex b = 0; b < batch; ++b, src += plane, dst += plane) {
     Transpose2D<TIndex, TData>(rows, cols, src, dst);
   }
 }

 #ifdef CAFFE2_USE_HPTT

 // Generates a full specialization of TransposeImpl<TIndex, TData> for builds
 // with HPTT. It mirrors the generic TransposeImpl (empty tensor -> return,
 // identity permutation -> memcpy) but tries TransposeByHPTT before falling
 // back to the batched Transpose2D path and finally TransposeND.
 //
 // The output parameter must be named Y: the body refers to it by that name
 // (it was previously declared as `T`, which left `Y` undeclared and broke
 // every build with CAFFE2_USE_HPTT defined).
 #define CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(TIndex, TData)                \
   template <>                                                           \
   void TransposeImpl<TIndex, TData>(                                    \
       const int ndim,                                                   \
       const TIndex* dims,                                               \
       const int* axes,                                                  \
       const TData* X,                                                   \
       TData* Y) {                                                       \
     const TIndex size = std::accumulate(                                \
         dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());       \
     if (size == 0) {                                                    \
       return;                                                           \
     }                                                                   \
     if (utils::IsIdentityPermutation(ndim, axes)) {                     \
       std::memcpy(Y, X, size * sizeof(TData));                          \
       return;                                                           \
     }                                                                   \
     if (TransposeByHPTT(ndim, dims, axes, X, Y)) {                      \
       return;                                                           \
     }                                                                   \
     if (utils::IsBatchTranspose2D(ndim, axes)) {                        \
       const TIndex H = dims[ndim - 2];                                  \
       const TIndex W = dims[ndim - 1];                                  \
       const TIndex N = size / (H * W);                                  \
       for (TIndex i = 0; i < N; ++i) {                                  \
         Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W); \
       }                                                                 \
       return;                                                           \
     }                                                                   \
     TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);                 \
   }
 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, float)
 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, float)
 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, double)
 CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, double)
 #undef CAFFE2_SPECIALIZED_TRANSPOSE_IMPL

 #endif // CAFFE2_USE_HPTT

 } // namespace

 // Generates the exported CPUContext specialization of
 // math::Transpose<TIndex, TData, CPUContext>. Each specialization simply
 // forwards to the file-local TransposeImpl; the context argument is unused.
 // Instantiated below for 32- and 64-bit index types combined with float,
 // double, int32, int64, uint8 and uint16 element types.
 #define CAFFE2_SPECIALIZED_TRANSPOSE(TIndex, TData)       \
   template <>                                             \
   C10_EXPORT void Transpose<TIndex, TData, CPUContext>(   \
       const int ndim,                                     \
       const TIndex* dims,                                 \
       const int* axes,                                    \
       const TData* X,                                     \
       TData* Y,                                           \
       CPUContext* /* context */) {                        \
     TransposeImpl<TIndex, TData>(ndim, dims, axes, X, Y); \
   }
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, float)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, float)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, double)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, double)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int32_t)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int32_t)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int64_t)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int64_t)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint8_t)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint8_t)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint16_t)
 CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint16_t)
 #undef CAFFE2_SPECIALIZED_TRANSPOSE

 // Generates the exported CPUContext specialization of NCHW2NHWC<T>. For each
 // of the N images, the (C x HxW) slice of X is transposed into the matching
 // slice of Y with Transpose2D. The context argument is unused.
 //
 // The per-image stride and the pointer offsets are computed in 64 bits: with
 // plain int, `C * HxW` and `i * stride` overflow (undefined behaviour) once
 // the tensor holds more than INT_MAX elements.
 #define CAFFE2_SPECIALIZED_NCHW2NHWC(T)                    \
   template <>                                              \
   C10_EXPORT void NCHW2NHWC<T, CPUContext>(                \
       const int N,                                         \
       const int C,                                         \
       const int HxW,                                       \
       const T* X,                                          \
       T* Y,                                                \
       CPUContext* /* context */) {                         \
     const std::int64_t stride = std::int64_t{C} * HxW;     \
     for (int i = 0; i < N; ++i) {                          \
       Transpose2D(C, HxW, X + i * stride, Y + i * stride); \
     }                                                      \
   }
 CAFFE2_SPECIALIZED_NCHW2NHWC(float)
 #undef CAFFE2_SPECIALIZED_NCHW2NHWC

 // Generates the exported CPUContext specialization of NHWC2NCHW<T>. For each
 // of the N images, the (HxW x C) slice of X is transposed into the matching
 // slice of Y with Transpose2D. The context argument is unused.
 //
 // The per-image stride and the pointer offsets are computed in 64 bits: with
 // plain int, `HxW * C` and `i * stride` overflow (undefined behaviour) once
 // the tensor holds more than INT_MAX elements.
 #define CAFFE2_SPECIALIZED_NHWC2NCHW(T)                    \
   template <>                                              \
   C10_EXPORT void NHWC2NCHW<T, CPUContext>(                \
       const int N,                                         \
       const int C,                                         \
       const int HxW,                                       \
       const T* X,                                          \
       T* Y,                                                \
       CPUContext* /* context */) {                         \
     const std::int64_t stride = std::int64_t{HxW} * C;     \
     for (int i = 0; i < N; ++i) {                          \
       Transpose2D(HxW, C, X + i * stride, Y + i * stride); \
     }                                                      \
   }
 CAFFE2_SPECIALIZED_NHWC2NCHW(float)
 #undef CAFFE2_SPECIALIZED_NHWC2NCHW

 } // namespace math
 } // namespace caffe2
	#include "caffe2/utils/math/transpose.h"

	#include <algorithm>
	#include <functional>
	#include <limits>
	#include <numeric>

	#ifdef CAFFE2_USE_MKL
	#include <mkl.h>
	#endif // CAFFE2_USE_MKL

	#ifdef CAFFE2_USE_HPTT
	#include <hptt.h>
	#endif // CAFFE2_USE_HPTT

	#include "caffe2/core/context.h"
	#include "caffe2/utils/eigen_utils.h"
	#include "caffe2/utils/math/utils.h"

	namespace caffe2 {
	namespace math {

	namespace {

	template <typename TIndex, typename TData>
	void Transpose2D(
	const TIndex rows,
	const TIndex cols,
	const TData* X,
	TData* Y) {
	EigenMatrixMap<TData>(Y, rows, cols) =
	ConstEigenMatrixMap<TData>(X, cols, rows).transpose();
	}

	#ifdef CAFFE2_USE_MKL

	// Generates a full specialization of Transpose2D<TIndex, TData> that
	// forwards to the given MKL out-of-place matrix-copy routine instead of
	// using Eigen. The call passes 'R' and 'T' as the ordering/operation flags,
	// TData(1) as the scaling factor, `cols` as the leading dimension of X and
	// `rows` as the leading dimension of Y. Only float (mkl_somatcopy) and
	// double (mkl_domatcopy) are delegated.
	#define DELEGATE_TRANSPOSE_2D(TIndex, TData, MKLFunc) \
	template <> \
	void Transpose2D<TIndex, TData>( \
	const TIndex rows, const TIndex cols, const TData* X, TData* Y) { \
	MKLFunc('R', 'T', rows, cols, TData(1), X, cols, Y, rows); \
	}
	DELEGATE_TRANSPOSE_2D(std::int32_t, float, mkl_somatcopy);
	DELEGATE_TRANSPOSE_2D(std::int64_t, float, mkl_somatcopy);
	DELEGATE_TRANSPOSE_2D(std::int32_t, double, mkl_domatcopy);
	DELEGATE_TRANSPOSE_2D(std::int64_t, double, mkl_domatcopy);
	#undef DELEGATE_TRANSPOSE_2D

	#endif // CAFFE2_USE_MKL

	#ifdef CAFFE2_USE_HPTT

	// Tries to perform the N-d transpose X -> Y with the HPTT library.
	//
	// ndim: number of dimensions.
	// dims: extents of X, indexed in row-major order (ndim entries).
	// axes: permutation such that output dimension i is input dimension axes[i].
	// X, Y: source and destination buffers.
	//
	// Returns true when HPTT executed the transpose. Returns false, leaving Y
	// untouched, when a dimension is non-positive or does not fit in an int, or
	// when HPTT does not hand back a plan.
	template <typename TIndex, typename TData>
	bool TransposeByHPTT(
	    const int ndim,
	    const TIndex* dims,
	    const int* axes,
	    const TData* X,
	    TData* Y) {
	  // HPTT takes sizes as int, so every extent must be in (0, INT_MAX].
	  // (The condition previously contained the escaped text `\|\|`, which is
	  // not valid C++; it is the logical-or operator.)
	  for (int i = 0; i < ndim; ++i) {
	    if (dims[i] <= 0 || dims[i] > std::numeric_limits<int>::max()) {
	      return false;
	    }
	  }

	  std::vector<int> axes_cm(ndim);
	  std::vector<int> dims_cm(ndim);
	  // Convert row-major index to column-major.
	  const auto cm_fn = [ndim](const int i) { return ndim - i - 1; };
	  for (int i = 0; i < ndim; ++i) {
	    axes_cm[i] = cm_fn(axes[cm_fn(i)]);
	    dims_cm[i] = static_cast<int>(dims[cm_fn(i)]);
	  }
	  auto plan = hptt::create_plan(
	      axes_cm.data(),
	      ndim,
	      TData(1),
	      X,
	      dims_cm.data(),
	      nullptr,
	      TData(0),
	      Y,
	      nullptr,
	      hptt::ESTIMATE,
	      1 /* num_threads */);
	  if (plan == nullptr) {
	    return false;
	  }
	  plan->execute();
	  return true;
	}

	#endif // CAFFE2_USE_HPTT

	// Fallback N-d transpose that copies X into Y one contiguous block at a
	// time.
	//
	// ndim: number of dimensions.
	// dims: extents of X (ndim entries).
	// axes: permutation; output dimension i has extent dims[axes[i]].
	// X, Y: source and destination buffers (must not overlap).
	//
	// Trailing axes that the permutation leaves in place (axes[i] == i) form a
	// contiguous block in both X and Y, so they are moved together; only the
	// leading `outer_rank` dimensions are actually permuted.
	template <typename TIndex, typename TData>
	void TransposeND(
	    const int ndim,
	    const TIndex* dims,
	    const int* axes,
	    const TData* X,
	    TData* Y) {
	  std::vector<TIndex> out_dims(ndim);
	  for (int d = 0; d < ndim; ++d) {
	    out_dims[d] = dims[axes[d]];
	  }

	  // Peel off the untouched trailing axes and accumulate their element count.
	  int outer_rank = ndim;
	  TIndex chunk = 1;
	  while (outer_rank > 0 && axes[outer_rank - 1] == outer_rank - 1) {
	    --outer_rank;
	    chunk *= out_dims[outer_rank];
	  }

	  // Number of blocks to move = product of the permuted (leading) extents.
	  TIndex chunk_count = 1;
	  for (int d = 0; d < outer_rank; ++d) {
	    chunk_count *= out_dims[d];
	  }

	  // Strides (in units of blocks) of X, reordered to follow Y's dimensions.
	  std::vector<TIndex> src_strides(outer_rank);
	  utils::ComputeTransposedStrides<TIndex>(
	      outer_rank, dims, axes, src_strides.data());

	  // Multi-dimensional counter over Y's leading dimensions.
	  std::vector<TIndex> cursor(outer_rank, 0);
	  for (TIndex dst_block = 0; dst_block < chunk_count; ++dst_block) {
	    TIndex src_block = 0;
	    for (int d = 0; d < outer_rank; ++d) {
	      src_block += src_strides[d] * cursor[d];
	    }
	    if (chunk == 1) {
	      // Single element per block: plain assignment instead of memcpy.
	      Y[dst_block] = X[src_block];
	    } else {
	      std::memcpy(
	          Y + chunk * dst_block,
	          X + chunk * src_block,
	          chunk * sizeof(TData));
	    }
	    utils::IncreaseIndexInDims<TIndex>(
	        outer_rank, out_dims.data(), cursor.data());
	  }
	}

	// Generic transpose dispatcher, tried in this order:
	//   1. empty tensor (product of dims is 0): nothing to do;
	//   2. utils::IsIdentityPermutation(ndim, axes): one memcpy of the buffer;
	//   3. utils::IsBatchTranspose2D(ndim, axes): the last two dimensions are
	//      transposed with Transpose2D for each leading slice;
	//   4. otherwise the general TransposeND.
	//
	// ndim/dims/axes describe X and the permutation; Y receives the result.
	template <typename TIndex, typename TData>
	void TransposeImpl(
	    const int ndim,
	    const TIndex* dims,
	    const int* axes,
	    const TData* X,
	    TData* Y) {
	  const TIndex total = std::accumulate(
	      dims, dims + ndim, TIndex(1), std::multiplies<TIndex>());
	  if (total == 0) {
	    return;
	  }
	  if (utils::IsIdentityPermutation(ndim, axes)) {
	    std::memcpy(Y, X, total * sizeof(TData));
	    return;
	  }
	  if (!utils::IsBatchTranspose2D(ndim, axes)) {
	    TransposeND<TIndex, TData>(ndim, dims, axes, X, Y);
	    return;
	  }

	  // Batched 2D case: walk both buffers one (rows x cols) plane at a time.
	  const TIndex rows = dims[ndim - 2];
	  const TIndex cols = dims[ndim - 1];
	  const TIndex plane = rows * cols;
	  const TIndex batch = total / plane;
	  const TData* src = X;
	  TData* dst = Y;
	  for (TIndex b = 0; b < batch; ++b, src += plane, dst += plane) {
	    Transpose2D<TIndex, TData>(rows, cols, src, dst);
	  }
	}

	#ifdef CAFFE2_USE_HPTT

	// Generates a full specialization of TransposeImpl<TIndex, TData> for
	// builds with HPTT. It mirrors the generic TransposeImpl (empty tensor ->
	// return, identity permutation -> memcpy) but tries TransposeByHPTT before
	// falling back to the batched Transpose2D path and finally TransposeND.
	//
	// The output parameter must be named Y: the body refers to it by that name
	// (it was previously declared as `T`, which left `Y` undeclared).
	#define CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(TIndex, TData) \
	  template <> \
	  void TransposeImpl<TIndex, TData>( \
	      const int ndim, \
	      const TIndex* dims, \
	      const int* axes, \
	      const TData* X, \
	      TData* Y) { \
	    const TIndex size = std::accumulate( \
	        dims, dims + ndim, TIndex(1), std::multiplies<TIndex>()); \
	    if (size == 0) { \
	      return; \
	    } \
	    if (utils::IsIdentityPermutation(ndim, axes)) { \
	      std::memcpy(Y, X, size * sizeof(TData)); \
	      return; \
	    } \
	    if (TransposeByHPTT(ndim, dims, axes, X, Y)) { \
	      return; \
	    } \
	    if (utils::IsBatchTranspose2D(ndim, axes)) { \
	      const TIndex H = dims[ndim - 2]; \
	      const TIndex W = dims[ndim - 1]; \
	      const TIndex N = size / (H * W); \
	      for (TIndex i = 0; i < N; ++i) { \
	        Transpose2D<TIndex, TData>(H, W, X + i * H * W, Y + i * H * W); \
	      } \
	      return; \
	    } \
	    TransposeND<TIndex, TData>(ndim, dims, axes, X, Y); \
	  }
	CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, float)
	CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, float)
	CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int32_t, double)
	CAFFE2_SPECIALIZED_TRANSPOSE_IMPL(std::int64_t, double)
	#undef CAFFE2_SPECIALIZED_TRANSPOSE_IMPL

	#endif // CAFFE2_USE_HPTT

	} // namespace

	// Generates the exported CPUContext specialization of
	// math::Transpose<TIndex, TData, CPUContext>. Each specialization simply
	// forwards to the file-local TransposeImpl; the context argument is unused.
	// Instantiated below for 32- and 64-bit index types combined with float,
	// double, int32, int64, uint8 and uint16 element types.
	#define CAFFE2_SPECIALIZED_TRANSPOSE(TIndex, TData) \
	template <> \
	C10_EXPORT void Transpose<TIndex, TData, CPUContext>( \
	const int ndim, \
	const TIndex* dims, \
	const int* axes, \
	const TData* X, \
	TData* Y, \
	CPUContext* /* context */) { \
	TransposeImpl<TIndex, TData>(ndim, dims, axes, X, Y); \
	}
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, float)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, float)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, double)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, double)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int32_t)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int32_t)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::int64_t)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::int64_t)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint8_t)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint8_t)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int32_t, std::uint16_t)
	CAFFE2_SPECIALIZED_TRANSPOSE(std::int64_t, std::uint16_t)
	#undef CAFFE2_SPECIALIZED_TRANSPOSE

	// Generates the exported CPUContext specialization of NCHW2NHWC<T>. For
	// each of the N images, the (C x HxW) slice of X is transposed into the
	// matching slice of Y with Transpose2D. The context argument is unused.
	//
	// The per-image stride and the pointer offsets are computed in 64 bits:
	// with plain int, `C * HxW` and `i * stride` overflow (undefined behaviour)
	// once the tensor holds more than INT_MAX elements.
	#define CAFFE2_SPECIALIZED_NCHW2NHWC(T) \
	  template <> \
	  C10_EXPORT void NCHW2NHWC<T, CPUContext>( \
	      const int N, \
	      const int C, \
	      const int HxW, \
	      const T* X, \
	      T* Y, \
	      CPUContext* /* context */) { \
	    const std::int64_t stride = std::int64_t{C} * HxW; \
	    for (int i = 0; i < N; ++i) { \
	      Transpose2D(C, HxW, X + i * stride, Y + i * stride); \
	    } \
	  }
	CAFFE2_SPECIALIZED_NCHW2NHWC(float)
	#undef CAFFE2_SPECIALIZED_NCHW2NHWC

	// Generates the exported CPUContext specialization of NHWC2NCHW<T>. For
	// each of the N images, the (HxW x C) slice of X is transposed into the
	// matching slice of Y with Transpose2D. The context argument is unused.
	//
	// The per-image stride and the pointer offsets are computed in 64 bits:
	// with plain int, `HxW * C` and `i * stride` overflow (undefined behaviour)
	// once the tensor holds more than INT_MAX elements.
	#define CAFFE2_SPECIALIZED_NHWC2NCHW(T) \
	  template <> \
	  C10_EXPORT void NHWC2NCHW<T, CPUContext>( \
	      const int N, \
	      const int C, \
	      const int HxW, \
	      const T* X, \
	      T* Y, \
	      CPUContext* /* context */) { \
	    const std::int64_t stride = std::int64_t{HxW} * C; \
	    for (int i = 0; i < N; ++i) { \
	      Transpose2D(HxW, C, X + i * stride, Y + i * stride); \
	    } \
	  }
	CAFFE2_SPECIALIZED_NHWC2NCHW(float)
	#undef CAFFE2_SPECIALIZED_NHWC2NCHW

	} // namespace math
	} // namespace caffe2