#ifndef THC_ATOMICS_INC
#define THC_ATOMICS_INC

#include <c10/util/complex.h>
#include <THC/THC.h>
#include <TH/THHalf.h>
#include <THC/THCNumerics.cuh>
#include <ATen/ATen.h>

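// AtomicFPOp implements a CAS-based read-modify-write for floating point
// types that lack a suitable hardware atomic: the current value is read,
// combined with `val` by `func`, and published with atomicCAS, retrying until
// no other thread has intervened. The at::Half and at::BFloat16
// specializations operate on the 32-bit word containing the 16-bit value;
// the double specialization operates on the value's own 64-bit word.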
template <typename T>
struct AtomicFPOp;

template <>
struct AtomicFPOp<at::Half> {
  template <typename func_t>
  inline __device__ at::Half operator() (at::Half *address, at::Half val, const func_t& func) {
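    // Round the address down to the 4-byte word that contains it; the target
    // half occupies either the low or the high 16 bits of that word.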
    unsigned int * address_as_ui =
      (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    at::Half hsum;
    do {
      assumed = old;
      hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
      hsum = func(hsum, val);
      old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
      old = atomicCAS(address_as_ui, assumed, old);
    } while (assumed != old);
    hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
    return hsum;
  }
};

template <>
struct AtomicFPOp<at::BFloat16> {
  template <typename func_t>
  inline __device__ at::BFloat16 operator() (at::BFloat16 *address, at::BFloat16 val, const func_t& func) {
    unsigned int * address_as_ui =
      (unsigned int *) ((char *)address - ((size_t)address & 2));
    unsigned int old = *address_as_ui;
    unsigned int assumed;

    at::BFloat16 bsum;
    do {
      assumed = old;
      bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
      bsum = func(bsum, val);
      old = (size_t)address & 2 ? (old & 0xffff) | (bsum.x << 16) : (old & 0xffff0000) | bsum.x;
      old = atomicCAS(address_as_ui, assumed, old);
    } while (assumed != old);
    bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
    return bsum;
  }
};

template <>
struct AtomicFPOp<double> {
  template <typename func_t>
  inline __device__ double operator() (double * address, double val, const func_t& func) {
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull;
    unsigned long long int assumed;

    do {
      assumed = old;
      old = atomicCAS(address_as_ull, assumed, func(val, assumed));
      // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
    } while (assumed != old);

    return __longlong_as_double(old);
  }
};

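// AtomicAddIntegerImpl emulates atomicAdd for integer types that the hardware
// atomicAdd overloads do not cover (e.g. int8_t, int16_t, int64_t), keyed on
// sizeof(T). The 1- and 2-byte versions CAS the aligned 32-bit word containing
// the target value; the 4- and 8-byte versions CAS the value itself,
// reinterpreted as the matching unsigned word.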
template <typename T, size_t n>
struct AtomicAddIntegerImpl;

template<typename T>
struct AtomicAddIntegerImpl<T, 1> {
  inline __device__ void operator()(T *address, T val) {
    size_t offset = (size_t)address & 3;
    uint32_t * address_as_ui = (uint32_t *)((char *)address - offset);
    uint32_t old = *address_as_ui;
    uint32_t shift = offset * 8;
    uint32_t old_byte;
    uint32_t newval;
    uint32_t assumed;

    do {
      assumed = old;
      old_byte = (old >> shift) & 0xff;
      // preserve size in initial cast. Casting directly to uint32_t pads
      // negative signed values with 1's (e.g. signed -1 = unsigned ~0).
      newval = static_cast<uint8_t>(THCNumerics<T>::add(val, old_byte));
      newval = (old & ~(0x000000ff << shift)) | (newval << shift);
      old = atomicCAS(address_as_ui, assumed, newval);
    } while (assumed != old);
  }
};

template<typename T>
struct AtomicAddIntegerImpl<T, 2> {
  inline __device__ void operator()(T *address, T val) {
    size_t offset = (size_t)address & 2;
    uint32_t * address_as_ui = (uint32_t *)((char *)address - offset);
    bool is_32_align = offset;
    uint32_t old = *address_as_ui;
    uint32_t old_bytes;
    uint32_t newval;
    uint32_t assumed;

    do {
      assumed = old;
      old_bytes = is_32_align ? old >> 16 : old & 0xffff;
      // preserve size in initial cast. Casting directly to uint32_t pads
      // negative signed values with 1's (e.g. signed -1 = unsigned ~0).
      newval = static_cast<uint16_t>(THCNumerics<T>::add(val, old_bytes));
      newval = is_32_align ? (old & 0xffff) | (newval << 16) : (old & 0xffff0000) | newval;
      old = atomicCAS(address_as_ui, assumed, newval);
    } while (assumed != old);
  }
};

template<typename T>
struct AtomicAddIntegerImpl<T, 4> {
  inline __device__ void operator()(T *address, T val) {
    uint32_t * address_as_ui = (uint32_t *) (address);
    uint32_t old = *address_as_ui;
    uint32_t newval;
    uint32_t assumed;

    do {
      assumed = old;
      newval = val + (T)old;
      old = atomicCAS(address_as_ui, assumed, newval);
    } while (assumed != old);
  }
};

template<typename T>
struct AtomicAddIntegerImpl<T, 8> {
  inline __device__ void operator()(T *address, T val) {
    unsigned long long * address_as_ull = (unsigned long long *) (address);
    unsigned long long old = *address_as_ull;
    unsigned long long newval;
    unsigned long long assumed;

    do {
      assumed = old;
      newval = val + (T)old;
      old = atomicCAS(address_as_ull, assumed, newval);
    } while (assumed != old);
  }
};

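// gpuAtomicAdd overloads, one per supported data type. Sub-32-bit integers
// (and int64_t on CUDA) go through the CAS emulation above, 32-bit int and
// float use the hardware atomicAdd, and double falls back to a CAS loop on
// targets without a native double-precision atomicAdd (see below).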
static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) {
  AtomicAddIntegerImpl<uint8_t, sizeof(uint8_t)>()(address, val);
}

static inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) {
  AtomicAddIntegerImpl<int8_t, sizeof(int8_t)>()(address, val);
}

static inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) {
  AtomicAddIntegerImpl<int16_t, sizeof(int16_t)>()(address, val);
}

static inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) {
  return atomicAdd(address, val);
}

static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) {
#ifdef __HIP_PLATFORM_HCC__
  __atomic_fetch_add(address, val, __ATOMIC_RELAXED);
#else
  AtomicAddIntegerImpl<int64_t, sizeof(int64_t)>()(address, val);
#endif
}

static inline __device__ void gpuAtomicAdd(bool *address, bool val) {
  // Boolean addition saturates: once any true value has been added, the
  // accumulated value stays true.
  *address = *address || val;
}

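// A native atomicAdd on __half requires CUDA 10 and compute capability 7.0;
// older configurations fall back to the CAS-based AtomicFPOp above.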
static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) {
#if ((CUDA_VERSION < 10000) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
  return AtomicFPOp<at::Half>()(address, val,
                                [](at::Half hsum, at::Half val) {
                                  return THCNumerics<at::Half>::add(hsum, val);
                                });
#else
  return atomicAdd(reinterpret_cast<__half*>(address), val);
#endif
}

static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) {
  return AtomicFPOp<at::BFloat16>()(address, val,
                                    [](at::BFloat16 bsum, at::BFloat16 val) {
                                      return THCNumerics<at::BFloat16>::add(bsum, val);
                                    });
}

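// atomicAdd on double is only a hardware instruction on compute capability
// >= 6.0 with CUDA >= 8.0; for older device targets we provide the CAS-based
// fallback below.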
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600 || CUDA_VERSION < 8000)
// From the CUDA C Programming Guide
static inline __device__ double atomicAdd(double* address, double val)
#if defined(__clang__) && defined(__CUDA__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wgcc-compat"
__attribute__((enable_if(true, "")))
#pragma GCC diagnostic pop
#endif
{
  return AtomicFPOp<double>()(address, val,
                              [](double val, unsigned long long int assumed) {
                                return __double_as_longlong(val + __longlong_as_double(assumed));
                              });
}
#elif !defined(__CUDA_ARCH__) && (CUDA_VERSION < 8000) || defined(__HIP_PLATFORM_HCC__)

/* Note [hip-clang differences to hcc]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * The upcoming hip-clang compiler for ROCm differs from hcc in a few details.
 * It exports the __HIP__ macro, which lets us differentiate between hcc and
 * hip-clang. Of relevance below: hcc only gained support for atomicAdd on
 * double after work week 18312, whereas hip-clang has supported it from its
 * first version. In general, the code-visible differences between hip-clang
 * and hcc are minimal.
 */

#if defined(__HIP_PLATFORM_HCC__) && __hcc_workweek__ < 18312 && !__HIP__
// This needs to be defined for the host side pass
static inline __device__ double atomicAdd(double *address, double val) { }
#endif
#endif

static inline __device__ double gpuAtomicAdd(double *address, double val) {
  return atomicAdd(address, val);
}

static inline __device__ float gpuAtomicAdd(float *address, float val) {
  return atomicAdd(address, val);
}

template<typename T>
static inline __device__ void gpuAtomicAdd(c10::complex<T> *address, c10::complex<T> val) {
  gpuAtomicAdd(&address->real_, val.real_);
  gpuAtomicAdd(&address->imag_, val.imag_);
}
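
// Illustrative only, not part of the original header: a minimal sketch of how
// the gpuAtomicAdd overloads above are typically used, here to accumulate a
// histogram in which many threads may hit the same bin. The kernel name and
// parameters are hypothetical, and being a template it is not compiled unless
// explicitly instantiated.
template <typename scalar_t>
__global__ void gpuAtomicAddExampleKernel(scalar_t *bins, const int64_t *labels, int64_t n) {
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) {
    // Concurrent updates to the same bin are serialized by gpuAtomicAdd.
    gpuAtomicAdd(&bins[labels[i]], static_cast<scalar_t>(1));
  }
}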

/* Note [gpuAtomicAdd vs atomicAdd]
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * Some extensions, such as torchvision, call atomicAdd() directly on data
 * types for which no library-provided overload exists. Only for those types
 * do we continue to provide atomicAdd overloads here.
 */
static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) {
  return gpuAtomicAdd(address, val);
}

static inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) {
  return gpuAtomicAdd(address, val);
}

static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) {
  gpuAtomicAdd(address, val);
}

static inline __device__ void atomicAdd(int8_t *address, int8_t val) {
  gpuAtomicAdd(address, val);
}

static inline __device__ void atomicAdd(int16_t *address, int16_t val) {
  gpuAtomicAdd(address, val);
}

static inline __device__ void atomicAdd(int64_t *address, int64_t val) {
  gpuAtomicAdd(address, val);
}

static inline __device__ void atomicAdd(bool *address, bool val) {
  gpuAtomicAdd(address, val);
}

// Atomic multiplication implementation. CUDA provides no hardware atomic
// multiply, so the overloads below are all emulated with atomicCAS loops.

inline __device__ at::Half gpuAtomicMul(at::Half * address, at::Half val) {
  return AtomicFPOp<at::Half>()(address, val,
                                [](at::Half bsum, at::Half val) {
                                  return THCNumerics<at::Half>::mul(bsum, val);
                                });
}

inline __device__ at::BFloat16 gpuAtomicMul(at::BFloat16 * address, at::BFloat16 val) {
  return AtomicFPOp<at::BFloat16>()(address, val,
                                    [](at::BFloat16 bsum, at::BFloat16 val) {
                                      return THCNumerics<at::BFloat16>::mul(bsum, val);
                                    });
}

inline __device__ double gpuAtomicMul(double * address, double val) {
  return AtomicFPOp<double>()(address, val,
                              [](double val, unsigned long long int assumed) {
                                return __double_as_longlong(val * __longlong_as_double(assumed));
                              });
}

// Don't use a templated function for this since, for addition, float already
// defaults to the CUDA built-in atomicAdd and therefore no AtomicFPOp<float>
// specialization exists to reuse; spell out the CAS loop here instead.
inline __device__ float gpuAtomicMul (float * address, float val) {
  unsigned int* address_as_ui = (unsigned int*)address;
  unsigned int old = *address_as_ui;
  unsigned int assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ui, assumed,
                    __float_as_int(val * __int_as_float(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);

  return __int_as_float(old);
}
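
// Illustrative only, not part of the original header: a minimal sketch of a
// product reduction into a single accumulator via the CAS-based gpuAtomicMul
// overloads above. The kernel name is hypothetical, the caller is assumed to
// initialize *out to 1 before the launch, and as a template it is not compiled
// unless explicitly instantiated.
template <typename scalar_t>
__global__ void gpuAtomicMulExampleKernel(scalar_t *out, const scalar_t *in, int64_t n) {
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) {
    gpuAtomicMul(out, in[i]);
  }
}
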
#endif // THC_ATOMICS_INC