Don't propagate `-mavx` flag to dependents
Summary:
Previously, `gloo/math.h` defined inline functions that use AVX intrinsics,
which required propagating the `-mavx` flag to every dependent target.
This diff moves those definitions out of the header and into a source
file, so only that file needs to be compiled with `-mavx`.
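
For illustration, here is a minimal sketch of the same declare-in-header,
define-in-source pattern. The names (`ops.h`/`ops.cc`, a plain `float`
specialization) are hypothetical and not the actual gloo code; only `ops.cc`
needs `-mavx`, files that merely include `ops.h` do not.

  // ops.h -- generic fallback stays inline; the AVX specialization is only declared.
  #pragma once
  #include <cstddef>

  template <typename T>
  void sum(T* x, const T* y, size_t n) {
    for (size_t i = 0; i < n; i++) {
      x[i] += y[i];
    }
  }

  // Declaration only; the AVX definition lives in ops.cc.
  template <>
  void sum<float>(float* x, const float* y, size_t n);

  // ops.cc -- the one translation unit built with -mavx.
  #include "ops.h"
  #include <immintrin.h>

  template <>
  void sum<float>(float* x, const float* y, size_t n) {
    size_t i = 0;
    // Process 8 floats per iteration with 256-bit adds.
    for (; i + 8 <= n; i += 8) {
      __m256 a = _mm256_loadu_ps(&x[i]);
      __m256 b = _mm256_loadu_ps(&y[i]);
      _mm256_storeu_ps(&x[i], _mm256_add_ps(a, b));
    }
    // Leftovers
    for (; i < n; i++) {
      x[i] += y[i];
    }
  }

With this split, targets that link against the library get the AVX
implementation without themselves needing the `-mavx` flag.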
Reviewed By: pixelb
Differential Revision: D5271043
fbshipit-source-id: dde4dc560dfb557b46d1a582a8b38e7cb8eb0c37
diff --git a/gloo/math.cc b/gloo/math.cc
new file mode 100644
index 0000000..2bdb211
--- /dev/null
+++ b/gloo/math.cc
@@ -0,0 +1,121 @@
+#include "gloo/math.h"
+
+#include <algorithm>
+#include <cassert>
+
+#ifdef GLOO_USE_AVX
+#include <immintrin.h>
+#endif
+
+#include "gloo/types.h"
+
+#define is_aligned(POINTER, BYTE_COUNT) \
+ (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0)
+
+namespace gloo {
+
+#ifdef GLOO_USE_AVX
+
+// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
+// offset, as would happen when reducing at an offset within an aligned buffer
+template <>
+void sum<float16>(float16* x, const float16* y, size_t n) {
+ // Handle unaligned data at the beginning of the buffer
+ while (!is_aligned(x, 32)) {
+ *x += *y;
+ x++;
+ y++;
+ n--;
+ }
+ assert(is_aligned(y, 32));
+ size_t i;
+ for (i = 0; i < (n / 8) * 8; i += 8) {
+ __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
+ __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
+ __m128i vc16 = _mm256_cvtps_ph(_mm256_add_ps(va32, vb32), 0);
+ _mm_store_si128((__m128i*)(&x[i]), vc16);
+ }
+ // Leftovers
+ for (; i < n; i++) {
+ x[i] += y[i];
+ }
+}
+
+// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
+// offset, as would happen when reducing at an offset within an aligned buffer
+template <>
+void product<float16>(float16* x, const float16* y, size_t n) {
+ // Handle unaligned data at the beginning of the buffer
+ while (!is_aligned(x, 32)) {
+ *x *= *y;
+ x++;
+ y++;
+ n--;
+ }
+ assert(is_aligned(y, 32));
+ size_t i;
+ for (i = 0; i < (n / 8) * 8; i += 8) {
+ __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
+ __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
+ __m128i vc16 = _mm256_cvtps_ph(_mm256_mul_ps(va32, vb32), 0);
+ _mm_store_si128((__m128i*)(&x[i]), vc16);
+ }
+ // Leftovers
+ for (; i < n; i++) {
+ x[i] *= y[i];
+ }
+}
+
+// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
+// offset, as would happen when reducing at an offset within an aligned buffer
+template <>
+void max<float16>(float16* x, const float16* y, size_t n) {
+ // Handle unaligned data at the beginning of the buffer
+ while (!is_aligned(x, 32)) {
+ *x = std::max(*x, *y);
+ x++;
+ y++;
+ n--;
+ }
+ assert(is_aligned(y, 32));
+ size_t i;
+ for (i = 0; i < (n / 8) * 8; i += 8) {
+ __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
+ __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
+ __m128i vc16 = _mm256_cvtps_ph(_mm256_max_ps(va32, vb32), 0);
+ _mm_store_si128((__m128i*)(&x[i]), vc16);
+ }
+ // Leftovers
+ for (; i < n; i++) {
+ x[i] = std::max(x[i], y[i]);
+ }
+}
+
+// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
+// offset, as would happen when reducing at an offset within an aligned buffer
+template <>
+void min<float16>(float16* x, const float16* y, size_t n) {
+ // Handle unaligned data at the beginning of the buffer
+ while (!is_aligned(x, 32)) {
+ *x = std::min(*x, *y);
+ x++;
+ y++;
+ n--;
+ }
+ assert(is_aligned(y, 32));
+ size_t i;
+ for (i = 0; i < (n / 8) * 8; i += 8) {
+ __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
+ __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
+ __m128i vc16 = _mm256_cvtps_ph(_mm256_min_ps(va32, vb32), 0);
+ _mm_store_si128((__m128i*)(&x[i]), vc16);
+ }
+ // Leftovers
+ for (; i < n; i++) {
+ x[i] = std::min(x[i], y[i]);
+ }
+}
+
+#endif
+
+} // namespace gloo
diff --git a/gloo/math.h b/gloo/math.h
index d46c732..889ac4a 100644
--- a/gloo/math.h
+++ b/gloo/math.h
@@ -9,12 +9,6 @@
#pragma once
-#include <algorithm>
-#include <cassert>
-
-#ifdef __AVX2__
-#include <immintrin.h>
-#endif
#ifdef GLOO_USE_EIGEN
#include <Eigen/Core>
#endif
@@ -88,109 +82,36 @@
#endif
-#ifdef __AVX2__
-#define is_aligned(POINTER, BYTE_COUNT) \
- (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0)
+#ifdef GLOO_USE_AVX
// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
-inline void sum<float16>(float16* x, const float16* y, size_t n) {
- // Handle unaligned data at the beginning of the buffer
- while (!is_aligned(x, 32)) {
- *x += *y;
- x++;
- y++;
- n--;
- }
- assert(is_aligned(y, 32));
- size_t i;
- for (i = 0; i < (n / 8) * 8; i += 8) {
- __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
- __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
- __m128i vc16 = _mm256_cvtps_ph(_mm256_add_ps(va32, vb32), 0);
- _mm_store_si128((__m128i*)(&x[i]), vc16);
- }
- // Leftovers
- for (; i < n; i++) {
- x[i] += y[i];
- }
-}
+void sum<float16>(float16* x, const float16* y, size_t n);
+extern template
+void sum<float16>(float16* x, const float16* y, size_t n);
// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
-inline void product<float16>(float16* x, const float16* y, size_t n) {
- // Handle unaligned data at the beginning of the buffer
- while (!is_aligned(x, 32)) {
- *x *= *y;
- x++;
- y++;
- n--;
- }
- assert(is_aligned(y, 32));
- size_t i;
- for (i = 0; i < (n / 8) * 8; i += 8) {
- __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
- __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
- __m128i vc16 = _mm256_cvtps_ph(_mm256_mul_ps(va32, vb32), 0);
- _mm_store_si128((__m128i*)(&x[i]), vc16);
- }
- // Leftovers
- for (; i < n; i++) {
- x[i] *= y[i];
- }
-}
+void product<float16>(float16* x, const float16* y, size_t n);
+extern template
+void product<float16>(float16* x, const float16* y, size_t n);
// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
-inline void max<float16>(float16* x, const float16* y, size_t n) {
- // Handle unaligned data at the beginning of the buffer
- while (!is_aligned(x, 32)) {
- *x = std::max(*x, *y);
- x++;
- y++;
- n--;
- }
- assert(is_aligned(y, 32));
- size_t i;
- for (i = 0; i < (n / 8) * 8; i += 8) {
- __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
- __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
- __m128i vc16 = _mm256_cvtps_ph(_mm256_max_ps(va32, vb32), 0);
- _mm_store_si128((__m128i*)(&x[i]), vc16);
- }
- // Leftovers
- for (; i < n; i++) {
- x[i] = std::max(x[i], y[i]);
- }
-}
+void max<float16>(float16* x, const float16* y, size_t n);
+extern template
+void max<float16>(float16* x, const float16* y, size_t n);
// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
-inline void min<float16>(float16* x, const float16* y, size_t n) {
- // Handle unaligned data at the beginning of the buffer
- while (!is_aligned(x, 32)) {
- *x = std::min(*x, *y);
- x++;
- y++;
- n--;
- }
- assert(is_aligned(y, 32));
- size_t i;
- for (i = 0; i < (n / 8) * 8; i += 8) {
- __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
- __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
- __m128i vc16 = _mm256_cvtps_ph(_mm256_min_ps(va32, vb32), 0);
- _mm_store_si128((__m128i*)(&x[i]), vc16);
- }
- // Leftovers
- for (; i < n; i++) {
- x[i] = std::min(x[i], y[i]);
- }
-}
+void min<float16>(float16* x, const float16* y, size_t n);
+extern template
+void min<float16>(float16* x, const float16* y, size_t n);
+
#endif
} // namespace gloo