[AOTI] Move c10/util ostream function implementations to their headers (#123847)

Summary: AOTInductor-generated code for CPU models may directly reference these c10-implemented data types; see _inductor/codegen/cpp_prefix.h. To keep the AOTI-generated code ABI backward compatible, these headers need to be changed to header-only implementations. The next PR in this stack will add tests that use those data types without linking against libtorch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123847
Approved by: https://github.com/jansel
diff --git a/c10/util/BFloat16.h b/c10/util/BFloat16.h
index ab65eeb..95bc5f9 100644
--- a/c10/util/BFloat16.h
+++ b/c10/util/BFloat16.h
@@ -7,8 +7,8 @@
 #include <cmath>
 #include <cstdint>
 #include <cstring>
-
 #include <iosfwd>
+#include <ostream>
 
 #if defined(__CUDACC__) && !defined(USE_ROCM)
 #include <cuda_bf16.h>
@@ -114,7 +114,12 @@
 #endif
 };
 
-C10_API std::ostream& operator<<(std::ostream& out, const BFloat16& value);
+C10_API inline std::ostream& operator<<(
+    std::ostream& out,
+    const BFloat16& value) {
+  out << (float)value;
+  return out;
+}
 
 } // namespace c10
 
diff --git a/c10/util/Bfloat16.cpp b/c10/util/Bfloat16.cpp
index 2ea7f74..ae6a839 100644
--- a/c10/util/Bfloat16.cpp
+++ b/c10/util/Bfloat16.cpp
@@ -1,5 +1,4 @@
 #include <c10/util/BFloat16.h>
-#include <ostream>
 #include <type_traits>
 
 namespace c10 {
@@ -8,8 +7,4 @@
     std::is_standard_layout_v<BFloat16>,
     "c10::BFloat16 must be standard layout.");
 
-std::ostream& operator<<(std::ostream& out, const BFloat16& value) {
-  out << (float)value;
-  return out;
-}
 } // namespace c10
diff --git a/c10/util/Float8_e4m3fn.cpp b/c10/util/Float8_e4m3fn.cpp
index 2a4ed51..9cb648b 100644
--- a/c10/util/Float8_e4m3fn.cpp
+++ b/c10/util/Float8_e4m3fn.cpp
@@ -1,5 +1,4 @@
 #include <c10/util/Float8_e4m3fn.h>
-#include <ostream>
 #include <type_traits>
 
 namespace c10 {
@@ -8,8 +7,4 @@
     std::is_standard_layout_v<Float8_e4m3fn>,
     "c10::Float8_e4m3fn must be standard layout.");
 
-std::ostream& operator<<(std::ostream& out, const Float8_e4m3fn& value) {
-  out << (float)value;
-  return out;
-}
 } // namespace c10
diff --git a/c10/util/Float8_e4m3fn.h b/c10/util/Float8_e4m3fn.h
index a952b83..d51feab 100644
--- a/c10/util/Float8_e4m3fn.h
+++ b/c10/util/Float8_e4m3fn.h
@@ -239,7 +239,12 @@
   inline C10_HOST_DEVICE bool isnan() const;
 };
 
-C10_API std::ostream& operator<<(std::ostream& out, const Float8_e4m3fn& value);
+C10_API inline std::ostream& operator<<(
+    std::ostream& out,
+    const Float8_e4m3fn& value) {
+  out << (float)value;
+  return out;
+}
 
 } // namespace c10
 
diff --git a/c10/util/Float8_e4m3fnuz.cpp b/c10/util/Float8_e4m3fnuz.cpp
index 5c790e3..b18167f 100644
--- a/c10/util/Float8_e4m3fnuz.cpp
+++ b/c10/util/Float8_e4m3fnuz.cpp
@@ -1,5 +1,4 @@
 #include <c10/util/Float8_e4m3fnuz.h>
-#include <ostream>
 
 namespace c10 {
 
@@ -7,9 +6,4 @@
     std::is_standard_layout_v<Float8_e4m3fnuz>,
     "c10::Float8_e4m3fnuz must be standard layout.");
 
-std::ostream& operator<<(std::ostream& out, const Float8_e4m3fnuz& value) {
-  out << (float)value;
-  return out;
-}
-
 } // namespace c10
diff --git a/c10/util/Float8_e4m3fnuz.h b/c10/util/Float8_e4m3fnuz.h
index e51630d..bed2989 100644
--- a/c10/util/Float8_e4m3fnuz.h
+++ b/c10/util/Float8_e4m3fnuz.h
@@ -127,9 +127,12 @@
   inline C10_HOST_DEVICE bool isnan() const;
 };
 
-C10_API std::ostream& operator<<(
+C10_API inline std::ostream& operator<<(
     std::ostream& out,
-    const Float8_e4m3fnuz& value);
+    const Float8_e4m3fnuz& value) {
+  out << (float)value;
+  return out;
+}
 
 } // namespace c10
 
diff --git a/c10/util/Float8_e5m2.cpp b/c10/util/Float8_e5m2.cpp
index 8833283..3a9fc99 100644
--- a/c10/util/Float8_e5m2.cpp
+++ b/c10/util/Float8_e5m2.cpp
@@ -1,5 +1,4 @@
 #include <c10/util/Float8_e5m2.h>
-#include <ostream>
 
 namespace c10 {
 
@@ -7,8 +6,4 @@
     std::is_standard_layout<Float8_e5m2>::value,
     "c10::Float8_e5m2 must be standard layout.");
 
-std::ostream& operator<<(std::ostream& out, const Float8_e5m2& value) {
-  out << (float)value;
-  return out;
-}
 } // namespace c10
diff --git a/c10/util/Float8_e5m2.h b/c10/util/Float8_e5m2.h
index c05f974..442b7ee 100644
--- a/c10/util/Float8_e5m2.h
+++ b/c10/util/Float8_e5m2.h
@@ -136,7 +136,12 @@
   inline C10_HOST_DEVICE bool isinf() const;
 };
 
-C10_API std::ostream& operator<<(std::ostream& out, const Float8_e5m2& value);
+C10_API inline std::ostream& operator<<(
+    std::ostream& out,
+    const Float8_e5m2& value) {
+  out << (float)value;
+  return out;
+}
 
 } // namespace c10
 
diff --git a/c10/util/Float8_e5m2fnuz.cpp b/c10/util/Float8_e5m2fnuz.cpp
index 2613550..e3349b5 100644
--- a/c10/util/Float8_e5m2fnuz.cpp
+++ b/c10/util/Float8_e5m2fnuz.cpp
@@ -1,5 +1,4 @@
 #include <c10/util/Float8_e5m2fnuz.h>
-#include <ostream>
 
 namespace c10 {
 
@@ -7,9 +6,4 @@
     std::is_standard_layout_v<Float8_e5m2fnuz>,
     "c10::Float8_e5m2 must be standard layout.");
 
-std::ostream& operator<<(std::ostream& out, const Float8_e5m2fnuz& value) {
-  out << (float)value;
-  return out;
-}
-
 } // namespace c10
diff --git a/c10/util/Float8_e5m2fnuz.h b/c10/util/Float8_e5m2fnuz.h
index f43a912..f637739 100644
--- a/c10/util/Float8_e5m2fnuz.h
+++ b/c10/util/Float8_e5m2fnuz.h
@@ -126,9 +126,12 @@
   inline C10_HOST_DEVICE bool isinf() const;
 };
 
-C10_API std::ostream& operator<<(
+C10_API inline std::ostream& operator<<(
     std::ostream& out,
-    const Float8_e5m2fnuz& value);
+    const Float8_e5m2fnuz& value) {
+  out << (float)value;
+  return out;
+}
 
 } // namespace c10
 
diff --git a/c10/util/Half.cpp b/c10/util/Half.cpp
index 7c75112..e977aed 100644
--- a/c10/util/Half.cpp
+++ b/c10/util/Half.cpp
@@ -1,5 +1,4 @@
 #include <c10/util/Half.h>
-#include <ostream>
 #include <type_traits>
 
 namespace c10 {
@@ -8,8 +7,4 @@
     std::is_standard_layout_v<Half>,
     "c10::Half must be standard layout.");
 
-std::ostream& operator<<(std::ostream& out, const Half& value) {
-  out << (float)value;
-  return out;
-}
 } // namespace c10
diff --git a/c10/util/Half.h b/c10/util/Half.h
index 979e0d8..3d5a38c 100644
--- a/c10/util/Half.h
+++ b/c10/util/Half.h
@@ -30,6 +30,7 @@
 #include <cstring>
 #include <iosfwd>
 #include <limits>
+#include <ostream>
 
 #ifdef __CUDACC__
 #include <cuda_fp16.h>
@@ -531,7 +532,10 @@
              typename From::value_type>(f.imag());
 }
 
-C10_API std::ostream& operator<<(std::ostream& out, const Half& value);
+C10_API inline std::ostream& operator<<(std::ostream& out, const Half& value) {
+  out << (float)value;
+  return out;
+}
 
 } // namespace c10