| #include "caffe2/operators/pool_op.h" |
| |
| namespace caffe2 { |
| |
| using std::max; |
| using std::min; |
| |
| namespace { |
// These two classes are only used as template arguments passed to the
// PoolGradientOp template to instantiate the different pooling gradient
// algorithms.
| template <typename T> |
| class AveragePool { |
| public: |
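  // Scalar form used by the NCHW path: every input element in the pooling
  // window receives the output gradient scaled by 1 / (window size), since
  // each element contributed equally to the average.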
| static void process_grad( |
| const T& /*x_data*/, |
| const T& /*y_data*/, |
| const T& dy_data, |
| const T& scale, |
| T& dx_data) { |
| dx_data += (scale * dy_data); |
| } |
| |
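  // Column-wise form used by the NHWC path: all channels of input column
  // x_col accumulate the scaled gradient of output column y_col at once.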
| static void process_grad( |
| const int y_col, |
| const int x_col, |
| const float scale, |
| ConstEigenArrayMap<float>& /*x_data*/, |
| ConstEigenArrayMap<float>& /*y_data*/, |
| ConstEigenArrayMap<float>& dy_data, |
| EigenArrayMap<float>& dx_data) { |
| dx_data.col(x_col) += scale * dy_data.col(y_col); |
| } |
| }; |
| |
| template <typename T> |
| class MaxPool { |
| public: |
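  // Scalar form used by the NCHW path: the gradient is routed only to the
  // input element(s) equal to the pooled maximum. Note that if several
  // inputs in a window tie for the maximum, each of them receives the full
  // output gradient.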
| static void process_grad( |
| const T& x_data, |
| const T& y_data, |
| const T& dy_data, |
| const T& /*scale*/, |
| T& dx_data) { |
| if (x_data == y_data) { |
| dx_data += dy_data; |
| } |
| } |
| |
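  // Column-wise form used by the NHWC path: cwiseEqual builds a per-channel
  // 0/1 mask selecting the positions where the input equals the pooled
  // maximum, so the gradient flows only through those positions.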
| static void process_grad( |
| const int y_col, |
| const int x_col, |
| const float /*scale*/, |
| ConstEigenArrayMap<float>& x_data, |
| ConstEigenArrayMap<float>& y_data, |
| ConstEigenArrayMap<float>& dy_data, |
| EigenArrayMap<float>& dx_data) { |
| dx_data.col(x_col) += |
| dy_data.col(y_col) * (x_data.col(x_col) |
| .cwiseEqual(y_data.col(y_col)) |
| .template cast<float>()); |
| } |
| }; |
| } |
| |
| template <typename T, class Context, typename PoolType> |
| bool PoolGradientOp<T, Context, PoolType>::RunOnDeviceWithOrderNCHW() { |
| auto& X = Input(0); |
| auto& Y = Input(1); |
| auto& dY = Input(2); |
| auto* dX = Output(0); |
| // TODO(Yangqing): Add shape checks. |
| dX->ResizeLike(X); |
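  // Zero-initialize dX: overlapping pooling windows accumulate into the same
  // input positions via +=, so the buffer must start at zero.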
| math::Set<float, CPUContext>( |
| X.size(), 0, dX->template mutable_data<float>(), &context_); |
| const float* Xdata = X.template data<float>(); |
| const float* Ydata = Y.template data<float>(); |
| const float* dYdata = dY.template data<float>(); |
| float* dXdata = dX->template mutable_data<float>(); |
| int channels = X.dim32(1); |
| CAFFE_ENFORCE_EQ(channels, dY.dim32(1)); |
| int height = X.dim32(2); |
| int width = kernel_.size() > 1 ? X.dim32(3) : 1; |
| int depth = kernel_.size() > 2 ? X.dim32(4) : 1; |
| vector<int> dims(X.dims().begin() + 2, X.dims().end()); |
| ConvPoolOpBase<CPUContext>::ComputePads(dims); |
| int pooled_height = dY.dim32(2); |
| int pooled_width = kernel_.size() > 1 ? dY.dim32(3) : 1; |
| int pooled_depth = kernel_.size() > 2 ? dY.dim32(4) : 1; |
  // The main loop. Each case handles one spatial rank (1D, 2D, 3D). The
  // window bounds are clipped to the input extent, so for average pooling
  // `scale` divides by the number of in-bounds elements only; padded
  // positions do not count toward the average. MaxPool ignores `scale`.
| switch (kernel_.size()) { |
| case 1: |
| for (int n = 0; n < X.dim32(0); ++n) { |
| for (int c = 0; c < channels; ++c) { |
| for (int ph = 0; ph < pooled_height; ++ph) { |
| int hstart = ph * stride_h() - pad_t(); |
| int hend = min(hstart + kernel_h(), height); |
| hstart = max(hstart, 0); |
| float scale = 1. / (hend - hstart); |
| for (int h = hstart; h < hend; ++h) { |
| PoolType::process_grad( |
| Xdata[h], Ydata[ph], dYdata[ph], scale, dXdata[h]); |
| } |
| } |
          // Advance data pointers to the next channel.
| Xdata += height; |
| dXdata += height; |
| Ydata += pooled_height; |
| dYdata += pooled_height; |
| } |
| } |
| break; |
| case 2: |
| for (int n = 0; n < X.dim32(0); ++n) { |
| for (int c = 0; c < channels; ++c) { |
| for (int ph = 0; ph < pooled_height; ++ph) { |
| int hstart = ph * stride_h() - pad_t(); |
| int hend = min(hstart + kernel_h(), height); |
| hstart = max(hstart, 0); |
| for (int pw = 0; pw < pooled_width; ++pw) { |
| int wstart = pw * stride_w() - pad_l(); |
| int wend = min(wstart + kernel_w(), width); |
| wstart = max(wstart, 0); |
| float scale = 1. / (hend - hstart) / (wend - wstart); |
| const int pooled_index = ph * pooled_width + pw; |
| for (int h = hstart; h < hend; ++h) { |
| for (int w = wstart; w < wend; ++w) { |
| const int index = h * width + w; |
| PoolType::process_grad( |
| Xdata[index], |
| Ydata[pooled_index], |
| dYdata[pooled_index], |
| scale, |
| dXdata[index]); |
| } |
| } |
| } |
| } |
          // Advance data pointers to the next channel.
| Xdata += height * width; |
| dXdata += height * width; |
| Ydata += pooled_height * pooled_width; |
| dYdata += pooled_height * pooled_width; |
| } |
| } |
| break; |
| case 3: |
| for (int n = 0; n < X.dim32(0); ++n) { |
| for (int c = 0; c < channels; ++c) { |
| for (int ph = 0; ph < pooled_height; ++ph) { |
| int hstart = ph * stride_h() - pad_t(); |
| int hend = min(hstart + kernel_h(), height); |
| hstart = max(hstart, 0); |
| for (int pw = 0; pw < pooled_width; ++pw) { |
| int wstart = pw * stride_w() - pad_l(); |
| int wend = min(wstart + kernel_w(), width); |
| wstart = max(wstart, 0); |
| for (int pd = 0; pd < pooled_depth; ++pd) { |
| int dstart = pd * stride_[2] - pads_[2]; |
| int dend = min(dstart + kernel_[2], depth); |
| dstart = max(dstart, 0); |
| float scale = |
| 1. / (hend - hstart) / (wend - wstart) / (dend - dstart); |
| const int pooled_index = |
| ph * pooled_width * pooled_depth + pw * pooled_depth + pd; |
| for (int h = hstart; h < hend; ++h) { |
| for (int w = wstart; w < wend; ++w) { |
| for (int d = dstart; d < dend; ++d) { |
| const int index = h * width * depth + w * depth + d; |
| PoolType::process_grad( |
| Xdata[index], |
| Ydata[pooled_index], |
| dYdata[pooled_index], |
| scale, |
| dXdata[index]); |
| } |
| } |
| } |
| } |
| } |
| } |
          // Advance data pointers to the next channel.
| Xdata += height * width * depth; |
| dXdata += height * width * depth; |
| Ydata += pooled_height * pooled_width * pooled_depth; |
| dYdata += pooled_height * pooled_width * pooled_depth; |
| } |
| } |
| break; |
| default: |
      CAFFE_THROW("Unsupported pooling size: ", kernel_.size());
| return false; |
| } |
| return true; |
| } |
| |
| template <typename T, class Context, typename PoolType> |
| bool PoolGradientOp<T, Context, PoolType>::RunOnDeviceWithOrderNHWC() { |
| auto& X = Input(0); |
| auto& Y = Input(1); |
| auto& dY = Input(2); |
| DCHECK_EQ(dY.ndim(), kernel_.size() + 2); |
| auto* dX = Output(0); |
| dX->ResizeLike(X); |
| |
| int channels = X.dim32(X.ndim() - 1); |
| CAFFE_ENFORCE_EQ(channels, dY.dim32(dY.ndim() - 1)); |
  ConstEigenArrayMap<float> Ymat(
      Y.template data<float>(), channels, Y.size() / channels);
  ConstEigenArrayMap<float> dYmat(
      dY.template data<float>(), channels, dY.size() / channels);
| ConstEigenArrayMap<float> Xmat( |
| X.template data<float>(), channels, X.size() / channels); |
| EigenArrayMap<float> dXmat( |
| dX->template mutable_data<float>(), channels, X.size() / channels); |
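  // As in the NCHW path, gradients accumulate into dX across overlapping
  // windows, so start from zero.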
| dXmat.setZero(); |
| int height = X.dim32(1); |
| int width = kernel_.size() > 1 ? X.dim32(2) : 1; |
| int depth = kernel_.size() > 2 ? X.dim32(3) : 1; |
| vector<int> dims(X.dims().begin() + 1, X.dims().end() - 1); |
| ConvPoolOpBase<CPUContext>::ComputePads(dims); |
| int pooled_height = dY.dim32(1); |
| int pooled_width = kernel_.size() > 1 ? dY.dim32(2) : 1; |
| int pooled_depth = kernel_.size() > 2 ? dY.dim32(3) : 1; |
| |
  // The main loop.
  // Do not use OpenMP here: the loops below iterate over the pooled output,
  // and overlapping pooling windows write to the same dX columns, so
  // parallelizing the outer loops would introduce data races in the inner
  // loops.
| switch (kernel_.size()) { |
| case 1: |
| for (int n = 0; n < X.dim32(0); ++n) { |
| for (int ph = 0; ph < pooled_height; ++ph) { |
| int hstart = ph * stride_h() - pad_t(); |
| int hend = min(hstart + kernel_h(), height); |
| hstart = max(hstart, 0); |
| const int pool_index = n * pooled_height + ph; |
| const float scale = 1. / (hend - hstart); |
| for (int h = hstart; h < hend; ++h) { |
| const int input_index = n * height + h; |
| PoolType::process_grad( |
| pool_index, input_index, scale, Xmat, Ymat, dYmat, dXmat); |
| } |
| } |
| } |
| break; |
| case 2: |
| for (int n = 0; n < X.dim32(0); ++n) { |
| for (int ph = 0; ph < pooled_height; ++ph) { |
| int hstart = ph * stride_h() - pad_t(); |
| int hend = min(hstart + kernel_h(), height); |
| hstart = max(hstart, 0); |
| for (int pw = 0; pw < pooled_width; ++pw) { |
| int wstart = pw * stride_w() - pad_l(); |
| int wend = min(wstart + kernel_w(), width); |
| wstart = max(wstart, 0); |
| const int pool_index = (n * pooled_height + ph) * pooled_width + pw; |
| const float scale = 1. / (hend - hstart) / (wend - wstart); |
| for (int h = hstart; h < hend; ++h) { |
| for (int w = wstart; w < wend; ++w) { |
| const int input_index = (n * height + h) * width + w; |
| PoolType::process_grad( |
| pool_index, input_index, scale, Xmat, Ymat, dYmat, dXmat); |
| } |
| } |
| } |
| } |
| } |
| break; |
| case 3: |
| for (int n = 0; n < X.dim32(0); ++n) { |
| for (int ph = 0; ph < pooled_height; ++ph) { |
| int hstart = ph * stride_h() - pad_t(); |
| int hend = min(hstart + kernel_h(), height); |
| hstart = max(hstart, 0); |
| for (int pw = 0; pw < pooled_width; ++pw) { |
| int wstart = pw * stride_w() - pad_l(); |
| int wend = min(wstart + kernel_w(), width); |
| wstart = max(wstart, 0); |
| for (int pd = 0; pd < pooled_depth; ++pd) { |
| int dstart = pd * stride_[2] - pads_[2]; |
| int dend = min(dstart + kernel_[2], depth); |
| dstart = max(dstart, 0); |
| const int pool_index = |
| ((n * pooled_height + ph) * pooled_width + pw) * |
| pooled_depth + |
| pd; |
| const float scale = |
| 1. / (hend - hstart) / (wend - wstart) / (dend - dstart); |
| for (int h = hstart; h < hend; ++h) { |
| for (int w = wstart; w < wend; ++w) { |
| for (int d = dstart; d < dend; ++d) { |
| const int input_index = |
| ((n * height + h) * width + w) * depth + d; |
| PoolType::process_grad( |
| pool_index, |
| input_index, |
| scale, |
| Xmat, |
| Ymat, |
| dYmat, |
| dXmat); |
| } |
| } |
| } |
| } |
| } |
| } |
| } |
| break; |
| default: |
      CAFFE_THROW("Unsupported pooling size: ", kernel_.size());
| return false; |
| } |
| return true; |
| } |
| |
| REGISTER_CPU_OPERATOR( |
| AveragePoolGradient, |
| PoolGradientOp<float, CPUContext, AveragePool<float>>); |
| OPERATOR_SCHEMA(AveragePoolGradient).NumInputs(3).NumOutputs(1); |
| |
| REGISTER_CPU_OPERATOR( |
| AveragePool1DGradient, |
| PoolGradientOp<float, CPUContext, AveragePool<float>>); |
| OPERATOR_SCHEMA(AveragePool1DGradient).NumInputs(3).NumOutputs(1); |
| |
| REGISTER_CPU_OPERATOR( |
| AveragePool2DGradient, |
| PoolGradientOp<float, CPUContext, AveragePool<float>>); |
| OPERATOR_SCHEMA(AveragePool2DGradient).NumInputs(3).NumOutputs(1); |
| |
| REGISTER_CPU_OPERATOR( |
| AveragePool3DGradient, |
| PoolGradientOp<float, CPUContext, AveragePool<float>>); |
| OPERATOR_SCHEMA(AveragePool3DGradient).NumInputs(3).NumOutputs(1); |
| |
| REGISTER_CPU_OPERATOR( |
| MaxPoolGradient, |
| PoolGradientOp<float, CPUContext, MaxPool<float>>); |
| OPERATOR_SCHEMA(MaxPoolGradient).NumInputs(3).NumOutputs(1); |
| |
| REGISTER_CPU_OPERATOR( |
| MaxPool1DGradient, |
| PoolGradientOp<float, CPUContext, MaxPool<float>>); |
| OPERATOR_SCHEMA(MaxPool1DGradient).NumInputs(3).NumOutputs(1); |
| |
| REGISTER_CPU_OPERATOR( |
| MaxPool2DGradient, |
| PoolGradientOp<float, CPUContext, MaxPool<float>>); |
| OPERATOR_SCHEMA(MaxPool2DGradient).NumInputs(3).NumOutputs(1); |
| |
| REGISTER_CPU_OPERATOR( |
| MaxPool3DGradient, |
| PoolGradientOp<float, CPUContext, MaxPool<float>>); |
| OPERATOR_SCHEMA(MaxPool3DGradient).NumInputs(3).NumOutputs(1); |
| |
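// The gradient maker maps a pooling op onto its corresponding gradient op by
// appending "Gradient" to the forward op's type and wiring up the three
// inputs the gradient op expects: the forward input X, the forward output Y,
// and the gradient of the output dY.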
| class GetPoolGradient : public GradientMakerBase { |
| using GradientMakerBase::GradientMakerBase; |
| vector<OperatorDef> GetGradientDefs() override { |
| return SingleGradientDef( |
| def_.type() + "Gradient", |
| "", |
| vector<string>{I(0), O(0), GO(0)}, |
| vector<string>{GI(0)}); |
| } |
| }; |
| REGISTER_GRADIENT(AveragePool, GetPoolGradient); |
| REGISTER_GRADIENT(AveragePool1D, GetPoolGradient); |
| REGISTER_GRADIENT(AveragePool2D, GetPoolGradient); |
| REGISTER_GRADIENT(AveragePool3D, GetPoolGradient); |
| REGISTER_GRADIENT(MaxPool, GetPoolGradient); |
| REGISTER_GRADIENT(MaxPool1D, GetPoolGradient); |
| REGISTER_GRADIENT(MaxPool2D, GetPoolGradient); |
| REGISTER_GRADIENT(MaxPool3D, GetPoolGradient); |
| } |