im2col.h - platform/external/pytorch - Git at Google

 #ifndef THCUNN_IM2COL_H
 #define THCUNN_IM2COL_H

 #include "common.h"

 // Kernel for fast unfold+copy
 // (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
 //
 // Every work item `index` in [0, n) is decoded into a (channel_in, h_out, w_out)
 // triple and copies the ksize_h x ksize_w window whose top-left corner sits at
 // (h_out * stride_h - pad_h, w_out * stride_w - pad_w) of that channel.
 // Window taps outside [0, height) x [0, width) are written as 0.
 //
 // Layouts implied by the index arithmetic below:
 //   data_im  : (channel * height + h) * width + w
 //   data_col : ((channel_in * ksize_h * ksize_w + i * ksize_w + j) * height_col
 //               + h_out) * width_col + w_out
 //
 // The loop variable and the pointer parameters are never modified inside the
 // loop body. CUDA_KERNEL_LOOP is defined outside this file; if it lets one
 // thread process several indices, rewriting `index`, `data_im` or `data_col`
 // in place would make every iteration after the first start from corrupted
 // state, so all per-iteration state lives in locals.
 template <typename Dtype>
 __global__ void im2col_kernel(const int n, const Dtype* data_im,
     const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
     const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col,
     Dtype* data_col) {
   CUDA_KERNEL_LOOP(index, n) {
     // Decode the flat index without touching `index` itself.
     const int w_out = index % width_col;
     const int h_index = index / width_col;
     const int h_out = h_index % height_col;
     const int channel_in = h_index / height_col;
     const int channel_out = channel_in * ksize_h * ksize_w;
     // Top-left corner of the window in image coordinates (may be negative
     // because of padding).
     const int h_in = h_out * stride_h - pad_h;
     const int w_in = w_out * stride_w - pad_w;
     // Per-iteration write cursor; consecutive window taps are
     // height_col * width_col elements apart in data_col.
     Dtype* col_ptr = data_col + (channel_out * height_col + h_out) * width_col + w_out;
     // Offset of the window corner inside data_im. Kept as an integer so no
     // out-of-range pointer is formed when the corner lies in the padding.
     const int im_offset = (channel_in * height + h_in) * width + w_in;
     for (int i = 0; i < ksize_h; ++i) {
       for (int j = 0; j < ksize_w; ++j) {
         const int h = h_in + i;
         const int w = w_in + j;
         // Only dereference data_im for taps that are inside the image.
         *col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
           data_im[im_offset + i * width + j] : 0;
         col_ptr += height_col * width_col;
       }
     }
   }
 }

 // Host-side launcher: enqueues im2col_kernel on `stream`.
 //
 //   stream               CUDA stream the kernel is launched on
 //   data_im              input pointer handed to the kernel unchanged
 //   channels             number of input channels
 //   height, width        spatial size of one input channel
 //   ksize_h, ksize_w     window size
 //   pad_h, pad_w         padding on each side
 //   stride_h, stride_w   window step
 //   data_col             output pointer handed to the kernel unchanged
 //
 // The launch is asynchronous; no error check or synchronization is done here.
 template <typename Dtype>
 void im2col(cudaStream_t stream, const Dtype* data_im, const int channels,
     const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
     const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) {
   // Number of window positions along each axis.
   int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;
   int width_col = (width + 2 * pad_w - ksize_w) / stride_w + 1;
   // One work item per (channel, h_out, w_out); each work item copies one
   // ksize_h x ksize_w window of a single channel.
   int num_kernels = channels * height_col * width_col;
   // Nothing to copy (no channels, or the window does not fit in the padded
   // input). Skip the launch: a grid with zero blocks is not a valid launch
   // configuration. NOTE(review): GET_BLOCKS is defined elsewhere; this
   // assumes it yields a non-positive block count for a non-positive n.
   if (num_kernels <= 0) {
     return;
   }
   // Launch
   im2col_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
       num_kernels, data_im, height, width, ksize_h, ksize_w,
       pad_h, pad_w, stride_h, stride_w,
       height_col, width_col, data_col
   );
 }

 // Fold-back kernel: for every image element `index` in [0, n) it sums all
 // data_col entries that refer to that element and stores the sum in
 // data_im[index]. `index` is decoded as (c * height + h) * width + w.
 // The `channels` parameter is not read by the kernel body.
 //
 // Reference form of the accumulation (h, w in padded coordinates):
 //   c_col = c * patch_h * patch_w + (h - h_col * stride_h) * patch_w
 //           + (w - w_col * stride_w)
 //   val  += data_col[(c_col * height_col + h_col) * width_col + w_col]
 // Expanding that index gives base + h_col * step_h + w_col * step_w with the
 // three constants computed below, which is what the loops use.
 template <typename Dtype>
 __global__ void col2im_kernel(const int n, const Dtype* data_col,
     const int height, const int width, const int channels, const int patch_h, const int patch_w,
     const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col,
     Dtype* data_im) {
   CUDA_KERNEL_LOOP(index, n) {
     // Element position shifted into padded coordinates.
     const int w_pad = index % width + pad_w;
     const int h_pad = (index / width) % height + pad_h;
     const int chan = index / (width * height);
     // Half-open ranges [first, last) of window positions that cover this
     // element along each axis.
     int w_col_first = 0;
     if (w_pad >= patch_w) {
       w_col_first = (w_pad - patch_w) / stride_w + 1;
     }
     const int w_col_last = min(w_pad / stride_w + 1, width_col);
     int h_col_first = 0;
     if (h_pad >= patch_h) {
       h_col_first = (h_pad - patch_h) / stride_h + 1;
     }
     const int h_col_last = min(h_pad / stride_h + 1, height_col);
     // Constants of the expanded data_col index (see header comment).
     const int base = (chan * patch_h * patch_w + h_pad * patch_w + w_pad) * height_col * width_col;
     const int step_h = (1 - stride_h * patch_w * height_col) * width_col;
     const int step_w = (1 - stride_w * height_col * width_col);
     Dtype sum = 0;
     for (int h_col = h_col_first; h_col < h_col_last; ++h_col) {
       const int row = base + h_col * step_h;
       for (int w_col = w_col_first; w_col < w_col_last; ++w_col) {
         sum += data_col[row + w_col * step_w];
       }
     }
     data_im[index] = sum;
   }
 }

 // Host-side launcher: enqueues col2im_kernel on `stream`.
 //
 //   stream               CUDA stream the kernel is launched on
 //   data_col             input pointer handed to the kernel unchanged
 //   channels             number of image channels
 //   height, width        spatial size of one image channel
 //   patch_h, patch_w     window size
 //   pad_h, pad_w         padding on each side
 //   stride_h, stride_w   window step
 //   data_im              output pointer handed to the kernel unchanged
 //
 // The launch is asynchronous; no error check or synchronization is done here.
 template <typename Dtype>
 void col2im(cudaStream_t stream, const Dtype* data_col, const int channels,
     const int height, const int width, const int patch_h, const int patch_w, const int pad_h,
     const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) {
   // Number of window positions along each axis.
   int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
   int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
   // One work item per image element.
   int num_kernels = channels * height * width;
   // Empty image: nothing to write. Skip the launch: a grid with zero blocks
   // is not a valid launch configuration. NOTE(review): GET_BLOCKS is defined
   // elsewhere; this assumes it yields a non-positive block count for a
   // non-positive n.
   if (num_kernels <= 0) {
     return;
   }
   // To avoid atomic operations, each work item owns exactly one image
   // element and sums every data_col entry that contributes to it.
   col2im_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
       num_kernels, data_col, height, width, channels,
       patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
       height_col, width_col, data_im
   );
 }

 #endif
	#ifndef THCUNN_IM2COL_H
	#define THCUNN_IM2COL_H

	#include "common.h"

	// Kernel for fast unfold+copy
	// (borrowed from Caffe: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu)
	//
	// Every work item `index` in [0, n) is decoded into a (channel_in, h_out, w_out)
	// triple and copies the ksize_h x ksize_w window whose top-left corner sits at
	// (h_out * stride_h - pad_h, w_out * stride_w - pad_w) of that channel.
	// Window taps outside [0, height) x [0, width) are written as 0.
	//
	// Layouts implied by the index arithmetic below:
	//   data_im  : (channel * height + h) * width + w
	//   data_col : ((channel_in * ksize_h * ksize_w + i * ksize_w + j) * height_col
	//               + h_out) * width_col + w_out
	//
	// The loop variable and the pointer parameters are never modified inside the
	// loop body. CUDA_KERNEL_LOOP is defined outside this file; if it lets one
	// thread process several indices, rewriting `index`, `data_im` or `data_col`
	// in place would make every iteration after the first start from corrupted
	// state, so all per-iteration state lives in locals.
	template <typename Dtype>
	__global__ void im2col_kernel(const int n, const Dtype* data_im,
	    const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
	    const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col,
	    Dtype* data_col) {
	  CUDA_KERNEL_LOOP(index, n) {
	    // Decode the flat index without touching `index` itself.
	    const int w_out = index % width_col;
	    const int h_index = index / width_col;
	    const int h_out = h_index % height_col;
	    const int channel_in = h_index / height_col;
	    const int channel_out = channel_in * ksize_h * ksize_w;
	    // Top-left corner of the window in image coordinates (may be negative
	    // because of padding).
	    const int h_in = h_out * stride_h - pad_h;
	    const int w_in = w_out * stride_w - pad_w;
	    // Per-iteration write cursor; consecutive window taps are
	    // height_col * width_col elements apart in data_col.
	    Dtype* col_ptr = data_col + (channel_out * height_col + h_out) * width_col + w_out;
	    // Offset of the window corner inside data_im. Kept as an integer so no
	    // out-of-range pointer is formed when the corner lies in the padding.
	    const int im_offset = (channel_in * height + h_in) * width + w_in;
	    for (int i = 0; i < ksize_h; ++i) {
	      for (int j = 0; j < ksize_w; ++j) {
	        const int h = h_in + i;
	        const int w = w_in + j;
	        // Only dereference data_im for taps that are inside the image.
	        *col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ?
	          data_im[im_offset + i * width + j] : 0;
	        col_ptr += height_col * width_col;
	      }
	    }
	  }
	}

	// Host-side launcher: enqueues im2col_kernel on `stream`.
	//
	//   stream               CUDA stream the kernel is launched on
	//   data_im              input pointer handed to the kernel unchanged
	//   channels             number of input channels
	//   height, width        spatial size of one input channel
	//   ksize_h, ksize_w     window size
	//   pad_h, pad_w         padding on each side
	//   stride_h, stride_w   window step
	//   data_col             output pointer handed to the kernel unchanged
	//
	// The launch is asynchronous; no error check or synchronization is done here.
	template <typename Dtype>
	void im2col(cudaStream_t stream, const Dtype* data_im, const int channels,
	    const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
	    const int pad_w, const int stride_h, const int stride_w, Dtype* data_col) {
	  // Number of window positions along each axis.
	  int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;
	  int width_col = (width + 2 * pad_w - ksize_w) / stride_w + 1;
	  // One work item per (channel, h_out, w_out); each work item copies one
	  // ksize_h x ksize_w window of a single channel.
	  int num_kernels = channels * height_col * width_col;
	  // Nothing to copy (no channels, or the window does not fit in the padded
	  // input). Skip the launch: a grid with zero blocks is not a valid launch
	  // configuration. NOTE(review): GET_BLOCKS is defined elsewhere; this
	  // assumes it yields a non-positive block count for a non-positive n.
	  if (num_kernels <= 0) {
	    return;
	  }
	  // Launch
	  im2col_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
	      num_kernels, data_im, height, width, ksize_h, ksize_w,
	      pad_h, pad_w, stride_h, stride_w,
	      height_col, width_col, data_col
	  );
	}

	// Fold-back kernel: for every image element `index` in [0, n) it sums all
	// data_col entries that refer to that element and stores the sum in
	// data_im[index]. `index` is decoded as (c * height + h) * width + w.
	// The `channels` parameter is not read by the kernel body.
	//
	// Reference form of the accumulation (h, w in padded coordinates):
	//   c_col = c * patch_h * patch_w + (h - h_col * stride_h) * patch_w
	//           + (w - w_col * stride_w)
	//   val  += data_col[(c_col * height_col + h_col) * width_col + w_col]
	// Expanding that index gives base + h_col * step_h + w_col * step_w with the
	// three constants computed below, which is what the loops use.
	template <typename Dtype>
	__global__ void col2im_kernel(const int n, const Dtype* data_col,
	    const int height, const int width, const int channels, const int patch_h, const int patch_w,
	    const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int height_col, const int width_col,
	    Dtype* data_im) {
	  CUDA_KERNEL_LOOP(index, n) {
	    // Element position shifted into padded coordinates.
	    const int w_pad = index % width + pad_w;
	    const int h_pad = (index / width) % height + pad_h;
	    const int chan = index / (width * height);
	    // Half-open ranges [first, last) of window positions that cover this
	    // element along each axis.
	    int w_col_first = 0;
	    if (w_pad >= patch_w) {
	      w_col_first = (w_pad - patch_w) / stride_w + 1;
	    }
	    const int w_col_last = min(w_pad / stride_w + 1, width_col);
	    int h_col_first = 0;
	    if (h_pad >= patch_h) {
	      h_col_first = (h_pad - patch_h) / stride_h + 1;
	    }
	    const int h_col_last = min(h_pad / stride_h + 1, height_col);
	    // Constants of the expanded data_col index (see header comment).
	    const int base = (chan * patch_h * patch_w + h_pad * patch_w + w_pad) * height_col * width_col;
	    const int step_h = (1 - stride_h * patch_w * height_col) * width_col;
	    const int step_w = (1 - stride_w * height_col * width_col);
	    Dtype sum = 0;
	    for (int h_col = h_col_first; h_col < h_col_last; ++h_col) {
	      const int row = base + h_col * step_h;
	      for (int w_col = w_col_first; w_col < w_col_last; ++w_col) {
	        sum += data_col[row + w_col * step_w];
	      }
	    }
	    data_im[index] = sum;
	  }
	}

	// Host-side launcher: enqueues col2im_kernel on `stream`.
	//
	//   stream               CUDA stream the kernel is launched on
	//   data_col             input pointer handed to the kernel unchanged
	//   channels             number of image channels
	//   height, width        spatial size of one image channel
	//   patch_h, patch_w     window size
	//   pad_h, pad_w         padding on each side
	//   stride_h, stride_w   window step
	//   data_im              output pointer handed to the kernel unchanged
	//
	// The launch is asynchronous; no error check or synchronization is done here.
	template <typename Dtype>
	void col2im(cudaStream_t stream, const Dtype* data_col, const int channels,
	    const int height, const int width, const int patch_h, const int patch_w, const int pad_h,
	    const int pad_w, const int stride_h, const int stride_w, Dtype* data_im) {
	  // Number of window positions along each axis.
	  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
	  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
	  // One work item per image element.
	  int num_kernels = channels * height * width;
	  // Empty image: nothing to write. Skip the launch: a grid with zero blocks
	  // is not a valid launch configuration. NOTE(review): GET_BLOCKS is defined
	  // elsewhere; this assumes it yields a non-positive block count for a
	  // non-positive n.
	  if (num_kernels <= 0) {
	    return;
	  }
	  // To avoid atomic operations, each work item owns exactly one image
	  // element and sums every data_col entry that contributes to it.
	  col2im_kernel <<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, stream>>> (
	      num_kernels, data_col, height, width, channels,
	      patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
	      height_col, width_col, data_im
	  );
	}

	#endif