blob: 0b63b47ade3938f17c3f5acd8cc44ae9e90df7c1 [file] [log] [blame]
#pragma once
#include <caffe2/core/types.h>
#ifdef __CUDA_ARCH__
// Proxy for including cuda_fp16.h, because common_gpu.h
// has necessary diagnostic guards.
#include <caffe2/core/common_gpu.h>
#endif
#if __HIP_DEVICE_COMPILE__
#include <caffe2/core/hip/common_hip.h>
#endif
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
#define CONVERSIONS_DECL __host__ __device__ inline
#else
#define CONVERSIONS_DECL inline
#endif
namespace caffe2 {
namespace convert {
namespace {
inline float16 cpu_float2half_rn(float f) {
float16 ret;
static_assert(
sizeof(unsigned int) == sizeof(float),
"Programming error sizeof(unsigned int) != sizeof(float)");
unsigned* xp = reinterpret_cast<unsigned int*>(&f);
unsigned x = *xp;
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned sign, exponent, mantissa;
// Get rid of +NaN/-NaN case first.
if (u > 0x7f800000) {
ret.x = 0x7fffU;
return ret;
}
sign = ((x >> 16) & 0x8000);
// Get rid of +Inf/-Inf, +0/-0.
if (u > 0x477fefff) {
ret.x = sign | 0x7c00U;
return ret;
}
if (u < 0x33000001) {
ret.x = (sign | 0x0000);
return ret;
}
exponent = ((u >> 23) & 0xff);
mantissa = (u & 0x7fffff);
if (exponent > 0x70) {
shift = 13;
exponent -= 0x70;
} else {
shift = 0x7e - exponent;
exponent = 0;
mantissa |= 0x800000;
}
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
// Round to nearest even.
remainder = (mantissa & lsb_m1);
mantissa >>= shift;
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
ret.x = (sign | (exponent << 10) | mantissa);
return ret;
}
inline float cpu_half2float(float16 h) {
unsigned sign = ((h.x >> 15) & 1);
unsigned exponent = ((h.x >> 10) & 0x1f);
unsigned mantissa = ((h.x & 0x3ff) << 13);
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
exponent = 0xff;
} else if (!exponent) { /* Denorm or Zero */
if (mantissa) {
unsigned int msb;
exponent = 0x71;
do {
msb = (mantissa & 0x400000);
mantissa <<= 1; /* normalize */
--exponent;
} while (!msb);
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
}
} else {
exponent += 0x70;
}
unsigned i = ((sign << 31) | (exponent << 23) | mantissa);
float ret;
memcpy(&ret, &i, sizeof(i));
return ret;
}
}; // anonymous
#if __CUDACC__
#if CUDA_VERSION >= 9000
CONVERSIONS_DECL float16 halfToFloat16(half x) {
#ifdef __GNUC__
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC diagnostic push
#endif
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif // __GNUC__
float16 r = *reinterpret_cast<float16*>(&x);
#ifdef __GNUC__
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC diagnostic pop
#endif
#endif // __GNUC__
return r;
}
inline half float16ToHalf(const float16 x) {
__half_raw hr;
hr.x = x.x;
half r(hr);
return r;
}
inline half floatToHalf(const float x) {
float16 xh = cpu_float2half_rn(x);
return float16ToHalf(xh);
}
#else
inline float16 halfToFloat16(__half x) {
float16 r;
r.x = x.x;
return r;
}
inline __half float16ToHalf(const float16 x) {
__half r;
r.x = x.x;
return r;
}
inline half floatToHalf(const float x) {
float16 xh = cpu_float2half_rn(x);
return float16ToHalf(xh);
}
#endif // CUDA_VERSION
#endif // __CUDACC__
// general version: defer to static_cast
template <typename IN, typename OUT>
CONVERSIONS_DECL OUT To(const IN in) {
return static_cast<OUT>(in);
}
// explicit for fp16
template <>
CONVERSIONS_DECL float16 To(const float in) {
#if __CUDA_ARCH__
// hacky interface between C2 fp16 and CUDA
#if CUDA_VERSION >= 9000
half rh = __float2half(in);
return halfToFloat16(rh);
#else
float16 ret;
ret.x = __float2half(in).x;
return ret;
#endif // CUDA_VERSION >= 9000
#elif __HIP_DEVICE_COMPILE__
float16 ret;
ret.x = __half_as_ushort(__float2half(in));
return ret;
#else
return cpu_float2half_rn(in);
#endif
}
template <>
CONVERSIONS_DECL float To(const float16 in) {
#if __CUDA_ARCH__
#if CUDA_VERSION >= 9000
__half_raw tmp;
#else
__half tmp;
#endif
tmp.x = in.x;
return __half2float(tmp);
#elif __HIP_DEVICE_COMPILE__
__half tmp;
tmp = __ushort_as_half(in.x);
return __half2float(tmp);
#else
return cpu_half2float(in);
#endif
};
template <>
CONVERSIONS_DECL float To(const float in) {
return in;
}
template <typename OUT, typename IN>
CONVERSIONS_DECL OUT Get(IN x) {
return static_cast<OUT>(x);
}
template <>
CONVERSIONS_DECL float Get(float16 x) {
return To<float16, float>(x);
}
template <>
CONVERSIONS_DECL float16 Get(float x) {
return To<float, float16>(x);
}
}; // namespace convert
}; // namespace caffe2
#undef CONVERSIONS_DECL