#include "gloo/math.h"
#include <algorithm>
#include <cassert>
#ifdef GLOO_USE_AVX
#include <immintrin.h>
#endif
#include "gloo/types.h"
#define is_aligned(POINTER, BYTE_COUNT) \
(((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0)
namespace gloo {
#ifdef GLOO_USE_AVX

// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
void sum<float16>(float16* x, const float16* y, size_t n) {
  // Handle unaligned data at the beginning of the buffer
  while (!is_aligned(x, 32) && n > 0) {
    *x += *y;
    x++;
    y++;
    n--;
  }
  // By the shared-offset assumption, y is aligned once x is
  assert(is_aligned(y, 32));
  size_t i;
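  // Main loop: widen 8 float16 values to fp32, add, and narrow back to
  // float16 with round-to-nearest-even (imm8 == 0). &x[i] stays 16-byte
  // aligned here, so the aligned store below is safe.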
  for (i = 0; i < (n / 8) * 8; i += 8) {
    __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
    __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
    __m128i vc16 = _mm256_cvtps_ph(_mm256_add_ps(va32, vb32), 0);
    _mm_store_si128((__m128i*)(&x[i]), vc16);
  }
  // Leftovers
  for (; i < n; i++) {
    x[i] += y[i];
  }
}

// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
void product<float16>(float16* x, const float16* y, size_t n) {
  // Handle unaligned data at the beginning of the buffer
  while (!is_aligned(x, 32) && n > 0) {
    *x *= *y;
    x++;
    y++;
    n--;
  }
  assert(is_aligned(y, 32));
  size_t i;
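  // Same widen/operate/narrow scheme as sum<float16>, using _mm256_mul_ps.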
  for (i = 0; i < (n / 8) * 8; i += 8) {
    __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
    __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
    __m128i vc16 = _mm256_cvtps_ph(_mm256_mul_ps(va32, vb32), 0);
    _mm_store_si128((__m128i*)(&x[i]), vc16);
  }
  // Leftovers
  for (; i < n; i++) {
    x[i] *= y[i];
  }
}

// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
void max<float16>(float16* x, const float16* y, size_t n) {
  // Handle unaligned data at the beginning of the buffer
  while (!is_aligned(x, 32) && n > 0) {
    *x = std::max(*x, *y);
    x++;
    y++;
    n--;
  }
  assert(is_aligned(y, 32));
  size_t i;
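  // Same widen/operate/narrow scheme as sum<float16>, using _mm256_max_ps.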
  for (i = 0; i < (n / 8) * 8; i += 8) {
    __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
    __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
    __m128i vc16 = _mm256_cvtps_ph(_mm256_max_ps(va32, vb32), 0);
    _mm_store_si128((__m128i*)(&x[i]), vc16);
  }
  // Leftovers
  for (; i < n; i++) {
    x[i] = std::max(x[i], y[i]);
  }
}

// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
void min<float16>(float16* x, const float16* y, size_t n) {
  // Handle unaligned data at the beginning of the buffer
  while (!is_aligned(x, 32) && n > 0) {
    *x = std::min(*x, *y);
    x++;
    y++;
    n--;
  }
  assert(is_aligned(y, 32));
  size_t i;
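  // Same widen/operate/narrow scheme as sum<float16>, using _mm256_min_ps.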
  for (i = 0; i < (n / 8) * 8; i += 8) {
    __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
    __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
    __m128i vc16 = _mm256_cvtps_ph(_mm256_min_ps(va32, vb32), 0);
    _mm_store_si128((__m128i*)(&x[i]), vc16);
  }
  // Leftovers
  for (; i < n; i++) {
    x[i] = std::min(x[i], y[i]);
  }
}

#endif

} // namespace gloo