// Copyright 2004-present Facebook. All Rights Reserved.
#ifndef CAFFE2_UTILS_MIXED_UTILS_H
#define CAFFE2_UTILS_MIXED_UTILS_H

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"

// Defines helpers for add/mult/store operations on inputs/outputs with
// mixed precisions.
namespace caffe2 {

// Generic fallbacks, used only when no specialized version is available.
template <typename T, typename T2>
inline __device__ T mixed_mult(T data1, T2 data2) {
  return data1 * data2;
}

template <typename T, typename T2>
inline __device__ T mixed_add(T data1, T2 data2) {
  return data1 + data2;
}

template <typename TIN, typename TOUT>
inline __device__ void mixed_store(TIN* data_in, TOUT* data_out) {
  *data_out = *data_in;
  return;
}

template <typename T>
inline __device__ void mixed_store(T* data_in, T* data_out) {
  *data_out = *data_in;
  return;
}
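
// Example (an illustrative sketch, not part of the original library): an
// element-wise kernel that scales a float input by a second input of
// arbitrary precision TIN and writes the result to an output of arbitrary
// precision TOUT via the helpers above. The kernel name and signature are
// hypothetical; CUDA_1D_KERNEL_LOOP is the grid-stride loop macro from
// caffe2/core/common_gpu.h.
template <typename TIN, typename TOUT>
__global__ void MixedScaleExampleKernel(
    const int n,
    const float* x,
    const TIN* scale,
    TOUT* y) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    // Accumulate in float regardless of the input/output precision.
    float acc = mixed_mult(x[i], scale[i]);
    mixed_store(&acc, &y[i]);
  }
}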

#ifdef CAFFE_HAS_CUDA_FP16
// Specializations for mixed-precision computation with CUDA FP16 support.
template <>
inline __device__ float mixed_mult(float data1, float data2) {
  return data1 * data2;
}

template <>
inline __device__ float mixed_mult(float data1, half data2) {
  return data1 * __half2float(data2);
}

template <>
inline __device__ float mixed_mult(float data1, float16 data2) {
  // float16 has the same bit layout as half; reinterpret to convert.
  half* data2_half = reinterpret_cast<half*>(&data2);
  return data1 * __half2float(*data2_half);
}

template <>
inline __device__ float mixed_add(float data1, float data2) {
  return data1 + data2;
}

template <>
inline __device__ float mixed_add(float data1, half data2) {
  return data1 + __half2float(data2);
}

template <>
inline __device__ float mixed_add(float data1, float16 data2) {
  half* data2_half = reinterpret_cast<half*>(&data2);
  return data1 + __half2float(*data2_half);
}

template <>
inline __device__ void mixed_store(float* data_in, float* data_out) {
  *data_out = *data_in;
  return;
}

template <>
inline __device__ void mixed_store(half* data_in, float* data_out) {
  *data_out = __half2float(*data_in);
  return;
}

template <>
inline __device__ void mixed_store(float16* data_in, float* data_out) {
  // Reinterpret the float16 storage as half to use the CUDA conversion.
  half* data_in_half = reinterpret_cast<half*>(data_in);
  *data_out = __half2float(*data_in_half);
  return;
}

template <>
inline __device__ void mixed_store(float* data_in, float16* data_out) {
  // Convert to half, then reinterpret the bits as float16 for storage.
  half data_in_half = __float2half(*data_in);
  float16* data_in_float16 = reinterpret_cast<float16*>(&data_in_half);
  *data_out = *data_in_float16;
  return;
}

template <>
inline __device__ void mixed_store(float* data_in, half* data_out) {
  half data_in_half = __float2half(*data_in);
  *data_out = data_in_half;
  return;
}
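
// Example (an illustrative sketch; the kernel name is hypothetical):
// accumulating a half/float16 input into a float buffer with mixed_add,
// so the addition itself happens in full precision.
template <typename TIN>
__global__ void MixedAccumulateExampleKernel(
    const int n,
    const TIN* x,
    float* y) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    // Dispatches to the (float, half) or (float, float16) specializations.
    y[i] = mixed_add(y[i], x[i]);
  }
}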
#endif // CAFFE_HAS_CUDA_FP16
} // namespace caffe2
#endif // CAFFE2_UTILS_MIXED_UTILS_H