|  | #include "Eigen/Core" | 
|  | #include "caffe2/utils/eigen_utils.h" | 
|  |  | 
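// Everything below relies on Eigen's unsupported CXX11 Tensor module, which
// requires Eigen 3.3 or newer; older versions skip this file entirely.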
#if EIGEN_VERSION_AT_LEAST(3, 3, 0)

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"

#include "unsupported/Eigen/CXX11/Tensor"

namespace caffe2 {

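// EigenConvOp implements 2-D convolution with Eigen's tensor library: input
// patches are gathered with extract_image_patches and multiplied against the
// flattened filter via a tensor contraction, i.e. the familiar im2col + GEMM
// formulation of convolution.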
template <typename T>
class EigenConvOp final : public ConvPoolOpBase<CPUContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  EigenConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<CPUContext>(operator_def, ws) {
    OPERATOR_NEEDS_FEATURE(group_ == 1, "Group convolution not supported yet.");
  }
  ~EigenConvOp() {}

  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;

 private:
  INPUT_TAGS(INPUT, FILTER, BIAS);
};

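// A minimal sketch of how a net would select this implementation. This is
// illustrative only and not part of this file; it assumes the standard
// caffe2 OperatorDef protobuf API:
//
//   OperatorDef def;
//   def.set_type("Conv");
//   def.set_engine("EIGEN"); // dispatches to EigenConvOp, registered below
//   def.add_input("X");
//   def.add_input("W");
//   def.add_input("b"); // optional BIAS input
//   def.add_output("Y");
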
// The NCHW implementation: we do explicit transposes before and after, which
// is not ideal but provides a compatible path instead of throwing an error.
template <typename T>
bool EigenConvOp<T>::RunOnDeviceWithOrderNCHW() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto* Y = Output(0);
  const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
  CAFFE_ENFORCE(4 == filter.ndim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) == C);
  CAFFE_ENFORCE(filter.dim32(2) == kernel_h());
  CAFFE_ENFORCE(filter.dim32(3) == kernel_w());
  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
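  // Shuffle the filter from (M, C, kH, kW) to (kH, kW, C, M) and the input
  // from NCHW to NHWC, the layout Eigen's patch extraction below expects.
  // In a shuffle, entry i names the input dimension that becomes output
  // dimension i.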
  Eigen::array<TIndex, 4> kernel_shuffles{
      {TIndex(2), TIndex(3), TIndex(1), TIndex(0)}};
  Eigen::array<TIndex, 4> input_shuffles{
      {TIndex(0), TIndex(2), TIndex(3), TIndex(1)}};

  Eigen::Tensor<T, 4, Eigen::RowMajor> filter_tensor =
      Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
          const_cast<T*>(filter.template data<T>()),
          M,
          C,
          kernel_h(),
          kernel_w())
          .shuffle(kernel_shuffles);
  Eigen::Tensor<T, 4, Eigen::RowMajor> X_tensor =
      Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
          const_cast<T*>(X.template data<T>()), N, C, H, W)
          .shuffle(input_shuffles);

  // For Eigen, the definitions of row and col actually correspond to width
  // and height instead of the other way round, so note the order in which we
  // pass the kernel, stride, pad, and dilation values below.
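  //
  // The convolution is phrased as a single matrix product: the extracted
  // patches form an (output pixels) x (kH * kW * C) matrix, the filter is
  // reshaped to (kH * kW * C) x M, and the contraction multiplies the two
  // along the shared dimension. As a purely illustrative example: X of shape
  // (2, 3, 8, 8) with a 3x3 filter, M = 16, stride 1 and no padding gives a
  // 72 x 27 patch matrix (72 = 2 * 6 * 6 output pixels), a 27 x 16 kernel
  // matrix, and a 72 x 16 product that is reshaped to NHWC (2, 6, 6, 16).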
  typedef typename Eigen::internal::traits<
      Eigen::Tensor<T, 4, Eigen::RowMajor>>::Index TensorIndex;
  Eigen::array<Eigen::IndexPair<TensorIndex>, 1> contract_dims;
  contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);

  Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
  pre_contract_dims[1] = kernel_h() * kernel_w() * C;
  pre_contract_dims[0] = Y->size() / M;

  Eigen::DSizes<TensorIndex, 2> kernel_dims;
  kernel_dims[0] = kernel_h() * kernel_w() * C;
  kernel_dims[1] = M;

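  // Compute into a temporary tensor in NHWC order; the result is shuffled
  // back to NCHW after the (optional) bias is added.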
  Eigen::Tensor<T, 4, Eigen::RowMajor> Y_tensor(
      Y->dim32(0), Y->dim32(2), Y->dim32(3), Y->dim32(1));
  Y_tensor = X_tensor
                 .extract_image_patches(
                     kernel_w(),
                     kernel_h(),
                     stride_w(),
                     stride_h(),
                     dilation_w(),
                     dilation_h(),
                     1, // row inflate stride
                     1, // col inflate stride
                     pad_l(),
                     pad_r(),
                     pad_t(),
                     pad_b(),
                     0) // value for padded elements
                 .reshape(pre_contract_dims)
                 .contract(filter_tensor.reshape(kernel_dims), contract_dims)
                 .reshape(Y_tensor.dimensions());
  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE(1 == bias.ndim());
    CAFFE_ENFORCE(bias.dim32(0) == M);
    // It seems that the bias broadcast is still slower, so let's do the
    // following for now. Y_tensor is row-major NHWC, so a column-major map
    // with M rows lines up each output pixel's channels as one column.
    EigenArrayMap<T> Y_arr(
        Y_tensor.data(), static_cast<TIndex>(M), Y->size() / M);
    ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
    Y_arr = Y_arr.colwise() + bias_arr;
  }

  // Do a final transpose back to NCHW.
  Eigen::array<TIndex, 4> output_shuffles{
      {TIndex(0), TIndex(3), TIndex(1), TIndex(2)}};

  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>>(
      Y->template mutable_data<T>(), N, M, Y->dim32(2), Y->dim32(3)) =
      Y_tensor.shuffle(output_shuffles);
  return true;
}

template <typename T>
bool EigenConvOp<T>::RunOnDeviceWithOrderNHWC() {
  auto& X = Input(INPUT);
  auto& filter = Input(FILTER);
  auto* Y = Output(0);
  const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
  CAFFE_ENFORCE(4 == filter.ndim());
  const int M = filter.dim32(0);
  CAFFE_ENFORCE(filter.dim32(1) == kernel_h());
  CAFFE_ENFORCE(filter.dim32(2) == kernel_w());
  CAFFE_ENFORCE(filter.dim32(3) == C);
  ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
  // Eigen expects the filter to be of shape (kernel_h, kernel_w, C, M) for
  // optimization purposes, so we create a temporary transposed copy.
  Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> temp_filter(
      M, kernel_h() * kernel_w() * C);
  temp_filter = ConstEigenArrayMap<T>(
                    filter.template data<T>(), kernel_h() * kernel_w() * C, M)
                    .transpose();
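  // temp_filter is column-major (M, kH * kW * C), so its raw buffer has
  // exactly the layout of a row-major (kH, kW, C, M) tensor, which is what
  // the filter_tensor map below assumes.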

  // Create the tensor maps and run the convolution as patch extraction
  // followed by a contraction, as in the NCHW path above.
  // TODO(jiayq): right now we const_cast away the const pointer, but we will
  // need to figure out how to properly do a const TensorMap.
  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> X_tensor(
      const_cast<T*>(X.template data<T>()), N, H, W, C);
  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> Y_tensor(
      Y->template mutable_data<T>(), N, Y->dim32(1), Y->dim32(2), M);
  Eigen::TensorMap<Eigen::Tensor<T, 4, Eigen::RowMajor>> filter_tensor(
      const_cast<T*>(temp_filter.data()), kernel_h(), kernel_w(), C, M);
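  // Unlike the NCHW path, Y_tensor maps the output buffer directly, so no
  // final transpose is needed here.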

  // For Eigen, the definitions of row and col actually correspond to width
  // and height instead of the other way round, so note the order in which we
  // pass the kernel, stride, pad, and dilation values below.
  typedef typename Eigen::internal::traits<
      Eigen::Tensor<T, 4, Eigen::RowMajor>>::Index TensorIndex;
  Eigen::array<Eigen::IndexPair<TensorIndex>, 1> contract_dims;
  contract_dims[0] = Eigen::IndexPair<TensorIndex>(1, 0);

  Eigen::DSizes<TensorIndex, 2> pre_contract_dims;
  pre_contract_dims[1] = kernel_h() * kernel_w() * C;
  pre_contract_dims[0] = Y->size() / M;

  Eigen::DSizes<TensorIndex, 2> kernel_dims;
  kernel_dims[0] = kernel_h() * kernel_w() * C;
  kernel_dims[1] = M;

  Y_tensor = X_tensor
                 .extract_image_patches(
                     kernel_w(),
                     kernel_h(),
                     stride_w(),
                     stride_h(),
                     dilation_w(),
                     dilation_h(),
                     1, // row inflate stride
                     1, // col inflate stride
                     pad_l(),
                     pad_r(),
                     pad_t(),
                     pad_b(),
                     0) // value for padded elements
                 .reshape(pre_contract_dims)
                 .contract(filter_tensor.reshape(kernel_dims), contract_dims)
                 .reshape(Y_tensor.dimensions());

  if (InputSize() == 3) {
    auto& bias = Input(BIAS);
    CAFFE_ENFORCE(1 == bias.ndim());
    CAFFE_ENFORCE(bias.dim32(0) == M);
    // It seems that the bias broadcast is still slower, so let's do the
    // following for now. Y is row-major NHWC, so a column-major map with M
    // rows lines up each output pixel's channels as one column.
    EigenArrayMap<T> Y_arr(
        Y->template mutable_data<T>(), static_cast<TIndex>(M), Y->size() / M);
    ConstEigenVectorArrayMap<T> bias_arr(bias.template data<T>(), M);
    Y_arr = Y_arr.colwise() + bias_arr;
  }
  return true;
}

// EigenConvOp only implements 2-D convolution (the filter is enforced to be
// 4-dimensional above), so it is registered only for the 2-D conv operators.
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, EIGEN, EigenConvOp<float>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv2D, EIGEN, EigenConvOp<float>);

} // namespace caffe2

#endif // EIGEN_VERSION_AT_LEAST(3, 3, 0)