Don't propagate `-mavx` flag to dependents
Summary:
Previously, `gloo/math.h` defined inline functions that use AVX intrinsics,
which required propagating the `-mavx` flag to every dependent target.
This diff moves those definitions out of the header and into a source
file, so only that file needs to be compiled with `-mavx`.
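
For illustration, here is a minimal sketch of the same declare-in-header,
define-in-source pattern. The names (`ops.h`/`ops.cc`, a plain `float`
specialization) are hypothetical and not the actual gloo code; only `ops.cc`
needs `-mavx`, files that merely include `ops.h` do not.

  // ops.h -- generic fallback stays inline; the AVX specialization is only declared.
  #pragma once
  #include <cstddef>

  template <typename T>
  void sum(T* x, const T* y, size_t n) {
    for (size_t i = 0; i < n; i++) {
      x[i] += y[i];
    }
  }

  // Declaration only; the AVX definition lives in ops.cc.
  template <>
  void sum<float>(float* x, const float* y, size_t n);

  // ops.cc -- the one translation unit built with -mavx.
  #include "ops.h"
  #include <immintrin.h>

  template <>
  void sum<float>(float* x, const float* y, size_t n) {
    size_t i = 0;
    // Process 8 floats per iteration with 256-bit adds.
    for (; i + 8 <= n; i += 8) {
      __m256 a = _mm256_loadu_ps(&x[i]);
      __m256 b = _mm256_loadu_ps(&y[i]);
      _mm256_storeu_ps(&x[i], _mm256_add_ps(a, b));
    }
    // Leftovers
    for (; i < n; i++) {
      x[i] += y[i];
    }
  }

With this split, targets that link against the library get the AVX
implementation without themselves needing the `-mavx` flag.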
Reviewed By: pixelb
Differential Revision: D5271043
fbshipit-source-id: dde4dc560dfb557b46d1a582a8b38e7cb8eb0c37
diff --git a/gloo/math.cc b/gloo/math.cc
new file mode 100644
index 0000000..2bdb211
--- /dev/null
+++ b/gloo/math.cc
@@ -0,0 +1,121 @@
+#include "gloo/math.h"
+
+#include <algorithm>
+#include <cassert>
+
+#ifdef GLOO_USE_AVX
+#include <immintrin.h>
+#endif
+
+#include "gloo/types.h"
+
+#define is_aligned(POINTER, BYTE_COUNT) \
+ (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0)
+
+namespace gloo {
+
+#ifdef GLOO_USE_AVX
+
+// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
+// offset, as would happen when reducing at an offset within an aligned buffer
+template <>
+void sum<float16>(float16* x, const float16* y, size_t n) {
+ // Handle unaligned data at the beginning of the buffer
+ while (!is_aligned(x, 32)) {
+ *x += *y;
+ x++;
+ y++;
+ n--;
+ }
+ assert(is_aligned(y, 32));
+ size_t i;
+ for (i = 0; i < (n / 8) * 8; i += 8) {
+ __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
+ __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
+ __m128i vc16 = _mm256_cvtps_ph(_mm256_add_ps(va32, vb32), 0);
+ _mm_store_si128((__m128i*)(&x[i]), vc16);
+ }
+ // Leftovers
+ for (; i < n; i++) {
+ x[i] += y[i];
+ }
+}
+
+// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
+// offset, as would happen when reducing at an offset within an aligned buffer
+template <>
+void product<float16>(float16* x, const float16* y, size_t n) {
+ // Handle unaligned data at the beginning of the buffer
+ while (!is_aligned(x, 32)) {
+ *x *= *y;
+ x++;
+ y++;
+ n--;
+ }
+ assert(is_aligned(y, 32));
+ size_t i;
+ for (i = 0; i < (n / 8) * 8; i += 8) {
+ __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
+ __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
+ __m128i vc16 = _mm256_cvtps_ph(_mm256_mul_ps(va32, vb32), 0);
+ _mm_store_si128((__m128i*)(&x[i]), vc16);
+ }
+ // Leftovers
+ for (; i < n; i++) {
+ x[i] *= y[i];
+ }
+}
+
+// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
+// offset, as would happen when reducing at an offset within an aligned buffer
+template <>
+void max<float16>(float16* x, const float16* y, size_t n) {
+ // Handle unaligned data at the beginning of the buffer
+ while (!is_aligned(x, 32)) {
+ *x = std::max(*x, *y);
+ x++;
+ y++;
+ n--;
+ }
+ assert(is_aligned(y, 32));
+ size_t i;
+ for (i = 0; i < (n / 8) * 8; i += 8) {
+ __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
+ __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
+ __m128i vc16 = _mm256_cvtps_ph(_mm256_max_ps(va32, vb32), 0);
+ _mm_store_si128((__m128i*)(&x[i]), vc16);
+ }
+ // Leftovers
+ for (; i < n; i++) {
+ x[i] = std::max(x[i], y[i]);
+ }
+}
+
+// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
+// offset, as would happen when reducing at an offset within an aligned buffer
+template <>
+void min<float16>(float16* x, const float16* y, size_t n) {
+ // Handle unaligned data at the beginning of the buffer
+ while (!is_aligned(x, 32)) {
+ *x = std::min(*x, *y);
+ x++;
+ y++;
+ n--;
+ }
+ assert(is_aligned(y, 32));
+ size_t i;
+ for (i = 0; i < (n / 8) * 8; i += 8) {
+ __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
+ __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
+ __m128i vc16 = _mm256_cvtps_ph(_mm256_min_ps(va32, vb32), 0);
+ _mm_store_si128((__m128i*)(&x[i]), vc16);
+ }
+ // Leftovers
+ for (; i < n; i++) {
+ x[i] = std::min(x[i], y[i]);
+ }
+}
+
+#endif
+
+} // namespace gloo
diff --git a/gloo/math.h b/gloo/math.h
index d46c732..889ac4a 100644
--- a/gloo/math.h
+++ b/gloo/math.h
@@ -9,12 +9,6 @@
#pragma once
-#include <algorithm>
-#include <cassert>
-
-#ifdef __AVX2__
-#include <immintrin.h>
-#endif
#ifdef GLOO_USE_EIGEN
#include <Eigen/Core>
#endif
@@ -88,109 +82,36 @@
#endif
-#ifdef __AVX2__
-#define is_aligned(POINTER, BYTE_COUNT) \
- (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0)
+#ifdef GLOO_USE_AVX
// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
-inline void sum<float16>(float16* x, const float16* y, size_t n) {
- // Handle unaligned data at the beginning of the buffer
- while (!is_aligned(x, 32)) {
- *x += *y;
- x++;
- y++;
- n--;
- }
- assert(is_aligned(y, 32));
- size_t i;
- for (i = 0; i < (n / 8) * 8; i += 8) {
- __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
- __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
- __m128i vc16 = _mm256_cvtps_ph(_mm256_add_ps(va32, vb32), 0);
- _mm_store_si128((__m128i*)(&x[i]), vc16);
- }
- // Leftovers
- for (; i < n; i++) {
- x[i] += y[i];
- }
-}
+void sum<float16>(float16* x, const float16* y, size_t n);
+extern template
+void sum<float16>(float16* x, const float16* y, size_t n);
// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
-inline void product<float16>(float16* x, const float16* y, size_t n) {
- // Handle unaligned data at the beginning of the buffer
- while (!is_aligned(x, 32)) {
- *x *= *y;
- x++;
- y++;
- n--;
- }
- assert(is_aligned(y, 32));
- size_t i;
- for (i = 0; i < (n / 8) * 8; i += 8) {
- __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
- __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
- __m128i vc16 = _mm256_cvtps_ph(_mm256_mul_ps(va32, vb32), 0);
- _mm_store_si128((__m128i*)(&x[i]), vc16);
- }
- // Leftovers
- for (; i < n; i++) {
- x[i] *= y[i];
- }
-}
+void product<float16>(float16* x, const float16* y, size_t n);
+extern template
+void product<float16>(float16* x, const float16* y, size_t n);
// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
-inline void max<float16>(float16* x, const float16* y, size_t n) {
- // Handle unaligned data at the beginning of the buffer
- while (!is_aligned(x, 32)) {
- *x = std::max(*x, *y);
- x++;
- y++;
- n--;
- }
- assert(is_aligned(y, 32));
- size_t i;
- for (i = 0; i < (n / 8) * 8; i += 8) {
- __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
- __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
- __m128i vc16 = _mm256_cvtps_ph(_mm256_max_ps(va32, vb32), 0);
- _mm_store_si128((__m128i*)(&x[i]), vc16);
- }
- // Leftovers
- for (; i < n; i++) {
- x[i] = std::max(x[i], y[i]);
- }
-}
+void max<float16>(float16* x, const float16* y, size_t n);
+extern template
+void max<float16>(float16* x, const float16* y, size_t n);
// Assumes x and y are either both aligned to 32 bytes or unaligned by the same
// offset, as would happen when reducing at an offset within an aligned buffer
template <>
-inline void min<float16>(float16* x, const float16* y, size_t n) {
- // Handle unaligned data at the beginning of the buffer
- while (!is_aligned(x, 32)) {
- *x = std::min(*x, *y);
- x++;
- y++;
- n--;
- }
- assert(is_aligned(y, 32));
- size_t i;
- for (i = 0; i < (n / 8) * 8; i += 8) {
- __m256 va32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&x[i])));
- __m256 vb32 = _mm256_cvtph_ps(_mm_loadu_si128((__m128i*)(&y[i])));
- __m128i vc16 = _mm256_cvtps_ph(_mm256_min_ps(va32, vb32), 0);
- _mm_store_si128((__m128i*)(&x[i]), vc16);
- }
- // Leftovers
- for (; i < n; i++) {
- x[i] = std::min(x[i], y[i]);
- }
-}
+void min<float16>(float16* x, const float16* y, size_t n);
+extern template
+void min<float16>(float16* x, const float16* y, size_t n);
+
#endif
} // namespace gloo