caffe2/image/transform_gpu.cu - platform/external/pytorch - Git at Google

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/image/transform_gpu.h"

 /**
  *
  * Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
  * Distributed under 2-clause BSD license; see accompanying LICENSE file
  *
  **/

 namespace caffe2 {

 namespace {

 // input in (int8, NHWC), output in (fp32, NCHW)
 template <typename In, typename Out>
 __global__
 void transform_kernel(const int N, const int C, const int H, const int W,
                       const float mean, const float std, const In* in, Out* out) {
   const int n = blockIdx.x;

   const int nStride = C*H*W;

   // pointers to data for this image
   const In* input_ptr = &in[n*nStride];
   Out* output_ptr = &out[n*nStride];

   // either read or write uncoalesced - try reading
   for (int c=0; c < C; ++c) {
     for (int h=threadIdx.y; h < H; h += blockDim.y) {
       for (int w=threadIdx.x; w < W; w += blockDim.x) {
         int in_idx = c + C*w + C*W*h;  // HWC
         int out_idx = c*H*W + h*W + w;  // CHW

         //out[out_idx] = static_cast<Out>(
         //                static_cast<In>(in[in_idx]-mean)/std);
         output_ptr[out_idx] = (static_cast<Out>(input_ptr[in_idx])-mean) / std;
       }
     }
   }
 }

 }

 template <typename T_IN, typename T_OUT, class Context>
 bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> *Y, T_OUT mean, T_OUT std, Context *context) {
   return true;
 };

 template <>
 bool TransformOnGPU<uint8_t, float, CUDAContext>(Tensor<CUDAContext>& X, Tensor<CUDAContext> *Y, float std, float mean, CUDAContext *context)
 {
   // data comes in as NHWC
   const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
   // data goes out as NCHW
   Y->Resize(std::vector<int>{N,C,H,W});

   auto* input_data = X.data<uint8_t>();
   auto* output_data = Y->mutable_data<float>();

   transform_kernel<uint8_t,float><<<N, dim3(16,16), 0, context->cuda_stream()>>>(N,C,H,W, mean, std, input_data, output_data);
   return true;
 }

 }  // namespace caffe2
	#include "caffe2/core/context_gpu.h"
	#include "caffe2/image/transform_gpu.h"

	/**
	*
	* Copyright (c) 2016, NVIDIA CORPORATION, All rights reserved
	* Distributed under 2-clause BSD license; see accompanying LICENSE file
	*
	**/

	namespace caffe2 {

	namespace {

	// input in (int8, NHWC), output in (fp32, NCHW)
	template <typename In, typename Out>
	__global__
	void transform_kernel(const int N, const int C, const int H, const int W,
	const float mean, const float std, const In* in, Out* out) {
	const int n = blockIdx.x;

	const int nStride = CHW;

	// pointers to data for this image
	const In* input_ptr = &in[n*nStride];
	Out* output_ptr = &out[n*nStride];

	// either read or write uncoalesced - try reading
	for (int c=0; c < C; ++c) {
	for (int h=threadIdx.y; h < H; h += blockDim.y) {
	for (int w=threadIdx.x; w < W; w += blockDim.x) {
	int in_idx = c + Cw + CW*h; // HWC
	int out_idx = cHW + h*W + w; // CHW

	//out[out_idx] = static_cast<Out>(
	// static_cast<In>(in[in_idx]-mean)/std);
	output_ptr[out_idx] = (static_cast<Out>(input_ptr[in_idx])-mean) / std;
	}
	}
	}
	}

	}

	template <typename T_IN, typename T_OUT, class Context>
	bool TransformOnGPU(Tensor<Context>& X, Tensor<Context> Y, T_OUT mean, T_OUT std, Context context) {
	return true;
	};

	template <>
	bool TransformOnGPU<uint8_t, float, CUDAContext>(Tensor<CUDAContext>& X, Tensor<CUDAContext> Y, float std, float mean, CUDAContext context)
	{
	// data comes in as NHWC
	const int N = X.dim32(0), C = X.dim32(3), H = X.dim32(1), W = X.dim32(2);
	// data goes out as NCHW
	Y->Resize(std::vector<int>{N,C,H,W});

	auto* input_data = X.data<uint8_t>();
	auto* output_data = Y->mutable_data<float>();

	transform_kernel<uint8_t,float><<<N, dim3(16,16), 0, context->cuda_stream()>>>(N,C,H,W, mean, std, input_data, output_data);
	return true;
	}

	} // namespace caffe2