Add bf16 and change header file include path (#91838)

# Motivation
We would like to add the bfloat16 header file to PyTorch to make PyTorch and Intel extension for PyTorch support the bfloat16 data type.

# Solution
- Because bfloat16 is an Intel extension implemented in the DPC++ compiler rather than part of standard SYCL, we must guarantee that the bfloat16 header is included only when compiling with the DPC++ compiler. Please refer to [sycl 2020 feature test macros](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#_feature_test_macros). The Intel DPC++ compiler uses [SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc) to indicate that the bfloat16 feature is available.
- Refer to [intel/llvm](https://github.com/intel/llvm/blob/59dd38795c7557a779ca38cd27df21a40c7b3c45/clang/lib/Basic/Version.cpp#L129). SYCL_LANGUAGE_VERSION is defined in both SYCL 1.2.1 and SYCL 2020, whereas CL_SYCL_LANGUAGE_VERSION is defined only in SYCL 1.2.1. So we check CL_SYCL_LANGUAGE_VERSION first to detect SYCL 1.2.1; if it is not defined, we then check SYCL_LANGUAGE_VERSION to detect SYCL 2020. This guarantees compatibility with both SYCL 1.2.1 and SYCL 2020.

# Additional
No unit test is needed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/91838
Approved by: https://github.com/ezyang
diff --git a/c10/util/BFloat16-inl.h b/c10/util/BFloat16-inl.h
index 8add799..c2ad4d2 100644
--- a/c10/util/BFloat16-inl.h
+++ b/c10/util/BFloat16-inl.h
@@ -8,6 +8,15 @@
 C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion")
 #endif
 
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+#if defined(CL_SYCL_LANGUAGE_VERSION)
+#include <CL/sycl.hpp> // for SYCL 1.2.1
+#else
+#include <sycl/sycl.hpp> // for SYCL 2020
+#endif
+#include <ext/oneapi/bfloat16.hpp>
+#endif
+
 namespace c10 {
 
 /// Constructors
@@ -16,6 +25,9 @@
 #if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \
     __CUDA_ARCH__ >= 800
       x(__bfloat16_as_ushort(__float2bfloat16(value)))
+#elif defined(__SYCL_DEVICE_ONLY__) && \
+    defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+      x(sycl::bit_cast<uint16_t>(sycl::ext::oneapi::bfloat16(value)))
 #else
       // RNE by default
       x(detail::round_to_nearest_even(value))
@@ -27,6 +39,9 @@
 inline C10_HOST_DEVICE BFloat16::operator float() const {
 #if defined(__CUDACC__) && !defined(USE_ROCM)
   return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&x));
+#elif defined(__SYCL_DEVICE_ONLY__) && \
+    defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+  return float(*reinterpret_cast<const sycl::ext::oneapi::bfloat16*>(&x));
 #else
   return detail::f32_from_bits(x);
 #endif
@@ -41,6 +56,16 @@
 }
 #endif
 
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+inline C10_HOST_DEVICE BFloat16::BFloat16(
+    const sycl::ext::oneapi::bfloat16& value) {
+  x = sycl::bit_cast<uint16_t>(value); // copy raw bits; avoids aliasing UB
+}
+inline C10_HOST_DEVICE BFloat16::operator sycl::ext::oneapi::bfloat16() const {
+  return sycl::bit_cast<sycl::ext::oneapi::bfloat16>(x); // bit-exact copy
+}
+#endif
+
 // CUDA intrinsics
 
 #if defined(__CUDACC__) || defined(__HIPCC__)
diff --git a/c10/util/BFloat16.h b/c10/util/BFloat16.h
index 64cda2b..cf20f7a 100644
--- a/c10/util/BFloat16.h
+++ b/c10/util/BFloat16.h
@@ -11,6 +11,15 @@
 #include <cuda_bf16.h>
 #endif
 
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+#if defined(CL_SYCL_LANGUAGE_VERSION)
+#include <CL/sycl.hpp> // for SYCL 1.2.1
+#else
+#include <sycl/sycl.hpp> // for SYCL 2020
+#endif
+#include <ext/oneapi/bfloat16.hpp>
+#endif
+
 namespace c10 {
 
 namespace detail {
@@ -94,6 +103,11 @@
   inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value);
   explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const;
 #endif
+
+#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS)
+  inline C10_HOST_DEVICE BFloat16(const sycl::ext::oneapi::bfloat16& value);
+  explicit inline C10_HOST_DEVICE operator sycl::ext::oneapi::bfloat16() const;
+#endif
 };
 
 } // namespace c10
diff --git a/c10/util/Half-inl.h b/c10/util/Half-inl.h
index 8e83027..601dec1 100644
--- a/c10/util/Half-inl.h
+++ b/c10/util/Half-inl.h
@@ -12,10 +12,10 @@
 #include <hip/hip_fp16.h>
 #endif
 
-#if defined(SYCL_LANGUAGE_VERSION)
-#include <sycl/sycl.hpp> // for SYCL 2020
-#elif defined(CL_SYCL_LANGUAGE_VERSION)
+#if defined(CL_SYCL_LANGUAGE_VERSION)
 #include <CL/sycl.hpp> // for SYCL 1.2.1
+#elif defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp> // for SYCL 2020
 #endif
 
 C10_CLANG_DIAGNOSTIC_PUSH()
diff --git a/c10/util/Half.h b/c10/util/Half.h
index a786db9..bdafa5a 100644
--- a/c10/util/Half.h
+++ b/c10/util/Half.h
@@ -45,10 +45,10 @@
 #include <hip/hip_fp16.h>
 #endif
 
-#if defined(SYCL_LANGUAGE_VERSION)
-#include <sycl/sycl.hpp> // for SYCL 2020
-#elif defined(CL_SYCL_LANGUAGE_VERSION)
+#if defined(CL_SYCL_LANGUAGE_VERSION)
 #include <CL/sycl.hpp> // for SYCL 1.2.1
+#elif defined(SYCL_LANGUAGE_VERSION)
+#include <sycl/sycl.hpp> // for SYCL 2020
 #endif
 
 // Standard check for compiling CUDA with clang