| #include "THCUNN.h" |
| #include "THCHalf.h" |
| #include "THCHalfAutoNumerics.cuh" |
| #include "common.h" |
| |
// Forward average pooling over an NCHW input: one thread per element of the
// (num, channels, pooled_height, pooled_width) output tensor.
template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD>
__global__ void AvePoolForward(const int nthreads,
    const Dtype* const bottom_data, const int num, const int channels,
    const int height, const int width, const int pooled_height,
    const int pooled_width, const int kernel_h, const int kernel_w,
    const int stride_h, const int stride_w, const int pad_h, const int pad_w,
    Dtype* const top_data) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Decompose the linear output index into (n, c, ph, pw).
    const int pw = index % pooled_width;
    const int ph = (index / pooled_width) % pooled_height;
    const int c = (index / pooled_width / pooled_height) % channels;
    const int n = index / pooled_width / pooled_height / channels;
    int hstart = ph * stride_h - pad_h;
    int wstart = pw * stride_w - pad_w;
    int hend = min(hstart + kernel_h, height + pad_h);
    int wend = min(wstart + kernel_w, width + pad_w);
    // Window size including padded positions, taken before clamping.
    const int pool_size = (hend - hstart) * (wend - wstart);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    hend = min(hend, height);
    wend = min(wend, width);
    Acctype aveval = Acctype(0);
    const Dtype* const bottom_slice =
        bottom_data + (n * channels + c) * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        aveval += bottom_slice[h * width + w];
      }
    }
    // With COUNT_INCLUDE_PAD the divisor counts padded positions; otherwise
    // only the positions that actually overlap the input are counted.
    if (COUNT_INCLUDE_PAD)
      top_data[index] = ScalarConvert<Acctype, Dtype>::to(aveval / pool_size);
    else
      top_data[index] = ScalarConvert<Acctype, Dtype>::to(
          aveval / ((hend - hstart) * (wend - wstart)));
  }
}

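// Illustration only, not part of the original THCUNN source: a minimal
// host-side sketch of how the forward kernel above can be launched for
// contiguous NCHW float buffers already resident on the device. It assumes
// the GET_BLOCKS/CUDA_NUM_THREADS helpers from common.h; the function name
// is hypothetical, and the real entry points are instantiated from
// generic/SpatialAveragePooling.cu at the bottom of this file.
static void launchAvePoolForwardFloatExample(
    const float* input, float* output,
    int num, int channels, int height, int width,
    int pooled_height, int pooled_width,
    int kernel_h, int kernel_w, int stride_h, int stride_w,
    int pad_h, int pad_w, cudaStream_t stream) {
  // One thread per output element.
  const int count = num * channels * pooled_height * pooled_width;
  AvePoolForward<float, float, /*COUNT_INCLUDE_PAD=*/true>
      <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
          count, input, num, channels, height, width,
          pooled_height, pooled_width, kernel_h, kernel_w,
          stride_h, stride_w, pad_h, pad_w, output);
}
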
// Backward average pooling: one thread per input element accumulates the
// gradient from every pooled window whose receptive field covers it.
template <typename Dtype, typename Acctype, bool COUNT_INCLUDE_PAD>
__global__ void AvePoolBackward(const int nthreads, const Dtype* const top_diff,
    const int num, const int channels, const int height,
    const int width, const int pooled_height, const int pooled_width,
    const int kernel_h, const int kernel_w, const int stride_h,
    const int stride_w, const int pad_h, const int pad_w,
    Dtype* const bottom_diff) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    // Decompose the linear input index into (n, c, h, w), with h and w
    // shifted into padded coordinates.
    const int w = index % width + pad_w;
    const int h = (index / width) % height + pad_h;
    const int c = (index / width / height) % channels;
    const int n = index / width / height / channels;
    // Range of pooled windows containing (h, w), i.e. all ph with
    // ph * stride_h <= h < ph * stride_h + kernel_h, and likewise for pw.
    const int phstart = (h < kernel_h) ? 0 : (h - kernel_h) / stride_h + 1;
    const int phend = min(h / stride_h + 1, pooled_height);
    const int pwstart = (w < kernel_w) ? 0 : (w - kernel_w) / stride_w + 1;
    const int pwend = min(w / stride_w + 1, pooled_width);
    Acctype gradient = Acctype(0);
    const Dtype* const top_diff_slice =
        top_diff + (n * channels + c) * pooled_height * pooled_width;
    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
        // Recompute the divisor the forward pass used for this window.
        int hstart = ph * stride_h - pad_h;
        int wstart = pw * stride_w - pad_w;
        int hend = min(hstart + kernel_h, height + pad_h);
        int wend = min(wstart + kernel_w, width + pad_w);
        int pool_size = (hend - hstart) * (wend - wstart);
        hstart = max(hstart, 0);
        wstart = max(wstart, 0);
        hend = min(hend, height);
        wend = min(wend, width);
        if (COUNT_INCLUDE_PAD)
          gradient += top_diff_slice[ph * pooled_width + pw] / pool_size;
        else
          gradient += top_diff_slice[ph * pooled_width + pw] /
              ((hend - hstart) * (wend - wstart));
      }
    }
    bottom_diff[index] = ScalarConvert<Acctype, Dtype>::to(gradient);
  }
}

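// Worked example (illustrative, not in the original file): with kernel_h = 3,
// stride_h = 2, pad_h = 1 and input row h_in = 4, the padded coordinate is
// h = h_in + pad_h = 5, so AvePoolBackward visits
//   phstart = (5 - 3) / 2 + 1 = 2,
//   phend   = min(5 / 2 + 1, pooled_height) = 3  (for pooled_height >= 3),
// i.e. only pooled row ph = 2 (padded rows 4..6) contributes gradient here.
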
| #include "generic/SpatialAveragePooling.cu" |
| #include "THCGenerateFloatTypes.h" |