#ifndef THC_TENSORMATH_REDUCE_CUH
#define THC_TENSORMATH_REDUCE_CUH

#include <THC/THCTensorMath.h>
#include <THC/THCGeneral.h>
#include <THC/THCNumerics.cuh>
#include <THC/THCReduce.cuh>
#include <THC/THCTensorCopy.hpp>

/*
Reductions that (only) operate on accumulate types.
*/

// Binary addition functor over THCNumerics, for device-side reductions.
template <typename T>
struct ReduceAdd {
  inline __device__ T operator()(const T a, const T b) const {
    return THCNumerics<T>::add(a, b);
  }
};
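
/*
  Usage sketch (illustrative only; CUB is an assumption here, not a
  dependency of this header): ReduceAdd can serve as the operator of a
  device-wide reduction, e.g.

    #include <cub/cub.cuh>
    void *d_temp = nullptr;
    size_t temp_bytes = 0;
    // first call only queries the temporary-storage size
    cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n,
                              ReduceAdd<float>(), 0.0f);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::Reduce(d_temp, temp_bytes, d_in, d_out, n,
                              ReduceAdd<float>(), 0.0f);
*/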

// Returns the larger of a and b, compared via THCNumerics.
template<typename T>
inline __device__ T THCMax(const T a, const T b) {
  return THCNumerics<T>::gt(a, b) ? a : b;
}

// Renormalizes each contiguous row of `data` (one block per row) so that its
// `value`-norm does not exceed `maxnorm`; `value` may be INFINITY for the
// max norm.
template<typename T, typename AccT>
__global__ void THCTensor_kernel_renorm(T *data,
                                        const AccT value,
                                        const ptrdiff_t size,
                                        const AccT maxnorm) {
  // One slot per thread: the kernel assumes blockDim.x <= 32 and a power of
  // two, so that the tree reductions below cover every slot exactly.
  __shared__ AccT buffer[32];
  int64_t tx = threadIdx.x;
  int64_t bx = blockIdx.x;
  int64_t step = blockDim.x;
  T *row = data + size * bx;

  buffer[tx] = scalar_cast<AccT>(0);
  AccT norm;

  if (THCNumerics<AccT>::eq(value, scalar_cast<AccT, float>(INFINITY))) {
    // infinity norm: each thread takes the max |x| over its strided slice
    for (ptrdiff_t i = tx; i < size; i += step) {
      const AccT val = scalar_cast<AccT>(row[i]);
      buffer[tx] = THCMax<AccT>(buffer[tx], THCNumerics<AccT>::abs(val));
    }
    // max (reduce): tree reduction of the per-thread maxima
    for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
      __syncthreads();
      if (tx < stride)
        buffer[tx] = THCMax<AccT>(buffer[tx], buffer[tx+stride]);
    }
    // buffer[0] now holds the row's norm
    __syncthreads();
    norm = buffer[0];
  } else {
    // p-norm: each thread accumulates |x|^value over its strided slice
    for (ptrdiff_t i = tx; i < size; i += step) {
      const AccT val = scalar_cast<AccT>(row[i]);
      buffer[tx] = THCNumerics<AccT>::add(
        buffer[tx],
        THCNumerics<AccT>::pow(THCNumerics<AccT>::abs(val), value)
      );
    }
    // add (reduce): tree reduction of the per-thread partial sums
    for (unsigned int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
      __syncthreads();
      if (tx < stride)
        buffer[tx] = THCNumerics<AccT>::add(buffer[tx], buffer[tx+stride]);
    }
    // take the 1/value power to finish the p-norm
    __syncthreads();
    norm = THCNumerics<AccT>::pow(buffer[0], static_cast<AccT>(1) / value);
  }

  if (THCNumerics<AccT>::gt(norm, maxnorm)) {
    // rescale the row by maxnorm / norm; the small epsilon guards the division
    norm = THCNumerics<AccT>::div(
      maxnorm,
      THCNumerics<AccT>::add(norm, scalar_cast<AccT>(1e-7))
    );
    // renormalize
    for (ptrdiff_t i = tx; i < size; i += step) {
      const AccT val = scalar_cast<AccT>(row[i]);
      row[i] = scalar_cast<T>(THCNumerics<AccT>::mul(val, norm));
    }
  }
}
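
/*
  Launch sketch (illustrative only; the grid/block shapes are assumptions
  consistent with the 32-slot shared buffer above, not an API this header
  exports). Renormalizes `rows` contiguous rows of length `size` in-place:

    dim3 grid(rows);   // one block per row
    dim3 block(32);    // must be <= 32 and a power of two
    THCTensor_kernel_renorm<float, float><<<grid, block, 0, stream>>>(
        d_data, 2.0f, size, maxnorm);   // value = 2, i.e. the L2 norm
*/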

// Maps an element to 1 if it is nonzero and to 0 otherwise; reducing the
// results with addition yields a count of the nonzero elements.
template <typename T>
struct TensorNonZeroOp {
  __host__ __device__ T operator()(const T lhs) const {
    const T zero = scalar_cast<T>(0);
    if (THCNumerics<T>::eq(lhs, zero)) return zero;

    return scalar_cast<T>(1);
  }
};
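
/*
  Usage sketch (illustrative only; Thrust is an assumption here, not a
  dependency of this header): counting nonzeros by mapping each element to
  0/1 and summing. thrust::plus is used as the host/device-safe sum.

    #include <thrust/transform_reduce.h>
    #include <thrust/functional.h>
    #include <thrust/device_ptr.h>
    #include <thrust/execution_policy.h>
    thrust::device_ptr<float> p(d_data);
    float count = thrust::transform_reduce(thrust::device, p, p + n,
                                           TensorNonZeroOp<float>(),
                                           0.0f, thrust::plus<float>());
*/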

// Binary addition functor over THCNumerics, usable as a reduction or scan
// operator.
template <typename T>
struct AddOp {
  __device__ __forceinline__ T operator()(T const &lhs, T const &rhs) const {
    return THCNumerics<T>::add(lhs, rhs);
  }
};

// Binary multiplication functor over THCNumerics, usable as a reduction or
// scan operator (e.g. cumulative products).
template <typename T>
struct MulOp {
  __device__ __forceinline__ T operator()(T const &lhs, T const &rhs) const {
    return THCNumerics<T>::mul(lhs, rhs);
  }
};
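
/*
  Usage sketch (illustrative only; Thrust is an assumption here): AddOp and
  MulOp are shaped for prefix scans, e.g. cumulative sums and products over
  a device array.

    #include <thrust/scan.h>
    #include <thrust/device_ptr.h>
    #include <thrust/execution_policy.h>
    thrust::device_ptr<float> in(d_in), out(d_out);
    thrust::inclusive_scan(thrust::device, in, in + n, out, AddOp<float>());
    thrust::inclusive_scan(thrust::device, in, in + n, out, MulOp<float>());
*/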

#endif // THC_TENSORMATH_REDUCE_CUH