From a6971bb4775a22209924f1a6435ce3cc4fd34684 Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Thu, 17 Aug 2023 17:00:46 +0200 Subject: [PATCH 01/38] [test] Test gt::half scalar arithmetic (+,-,*,/) --- tests/test_half.cxx | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/test_half.cxx diff --git a/tests/test_half.cxx b/tests/test_half.cxx new file mode 100644 index 00000000..43a4aafe --- /dev/null +++ b/tests/test_half.cxx @@ -0,0 +1,30 @@ +#include + +#include + +#include + +TEST(half, ScalarArithmetic) +{ + gt::half a{1.0}; + gt::half b{2.0}; + + gt::half c{0.0}; + gt::half ref{0.0}; + + c = a + b; + ref = 3.0; + EXPECT_EQ(c, ref); + + c = a - b; + ref = -1.0; + EXPECT_EQ(c, ref); + + c = a * b; + ref = 2.0; + EXPECT_EQ(c, ref); + + c = a / b; + ref = 0.5; + EXPECT_EQ(c, ref); +} From f402b3e5dd17fdde279368511ae465c4d5a802ba Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Thu, 17 Aug 2023 17:07:55 +0200 Subject: [PATCH 02/38] [test] Register test_half --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a5e59062..90538c02 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -43,6 +43,7 @@ add_gtensor_test(test_space) add_gtensor_test(test_stream) add_gtensor_test(test_gtest_predicates) add_gtensor_test(test_sparse) +add_gtensor_test(test_half) if (GTENSOR_ENABLE_CLIB) add_executable(test_clib) From 49b1c3f6ff39c60d1dcfd873f6f74606d35e05c9 Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Thu, 17 Aug 2023 17:18:58 +0200 Subject: [PATCH 03/38] [feat] Prototype for gt::half as of #271 --- include/gtensor/half.h | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 include/gtensor/half.h diff --git a/include/gtensor/half.h b/include/gtensor/half.h new file mode 100644 index 00000000..8fa7a44a --- /dev/null +++ b/include/gtensor/half.h @@ -0,0 +1,46 @@ +// half_wrapper.hxx +#ifndef HALF_WRAPPER +#define HALF_WRAPPER + +#include +#include + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#define TARGET_ARCH __host__ __device__ +#else +#define TARGET_ARCH +#endif + +class HalfWrapper +{ +public: + TARGET_ARCH HalfWrapper(float x) : x(x) {}; + TARGET_ARCH HalfWrapper(half x) : x(x) {}; + TARGET_ARCH const HalfWrapper& operator=(const float f) { x = f; return *this; } + TARGET_ARCH const half& Get() const { return x; } +private: + half x; +}; + +TARGET_ARCH const HalfWrapper operator+(const HalfWrapper& lhs, const HalfWrapper& rhs) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +return HalfWrapper( lhs.Get() + rhs.Get() ); +#else +return HalfWrapper( float(lhs.Get()) + float(rhs.Get()) ); +#endif +} + +TARGET_ARCH const HalfWrapper operator*(const HalfWrapper& lhs, const HalfWrapper& rhs) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +return HalfWrapper( lhs.Get() * rhs.Get() ); +#else +return HalfWrapper( float(lhs.Get()) * float(rhs.Get()) ); +#endif +} + +std::ostream& operator<<(std::ostream& s, const HalfWrapper& h) +{ s << (float) h.Get(); return s; } + +#endif From e26f092e16181e7a7d27c640a22982662e537ede Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Thu, 17 Aug 2023 18:02:35 +0200 Subject: [PATCH 04/38] [clean] Namespace and renaming --- include/gtensor/half.h | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index 8fa7a44a..748177b0 100644 --- 
a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -1,46 +1,52 @@ -// half_wrapper.hxx -#ifndef HALF_WRAPPER -#define HALF_WRAPPER +#ifndef GTENSOR_HALF_H +#define GTENSOR_HALF_H #include #include +namespace gt +{ + +// ====================================================================== +// half #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) #define TARGET_ARCH __host__ __device__ #else #define TARGET_ARCH #endif -class HalfWrapper +class half { public: - TARGET_ARCH HalfWrapper(float x) : x(x) {}; - TARGET_ARCH HalfWrapper(half x) : x(x) {}; - TARGET_ARCH const HalfWrapper& operator=(const float f) { x = f; return *this; } - TARGET_ARCH const half& Get() const { return x; } + TARGET_ARCH half(float x) : x(x) {}; + TARGET_ARCH half(__half x) : x(x) {}; + TARGET_ARCH const half& operator=(const float f) { x = f; return *this; } + TARGET_ARCH const __half& Get() const { return x; } private: - half x; + __half x; }; -TARGET_ARCH const HalfWrapper operator+(const HalfWrapper& lhs, const HalfWrapper& rhs) +TARGET_ARCH const half operator+(const half& lhs, const half& rhs) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) -return HalfWrapper( lhs.Get() + rhs.Get() ); + return half( lhs.Get() + rhs.Get() ); #else -return HalfWrapper( float(lhs.Get()) + float(rhs.Get()) ); + return half( float(lhs.Get()) + float(rhs.Get()) ); #endif } -TARGET_ARCH const HalfWrapper operator*(const HalfWrapper& lhs, const HalfWrapper& rhs) +TARGET_ARCH const half operator*(const half& lhs, const half& rhs) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) -return HalfWrapper( lhs.Get() * rhs.Get() ); + return half( lhs.Get() * rhs.Get() ); #else -return HalfWrapper( float(lhs.Get()) * float(rhs.Get()) ); + return half( float(lhs.Get()) * float(rhs.Get()) ); #endif } -std::ostream& operator<<(std::ostream& s, const HalfWrapper& h) +std::ostream& operator<<(std::ostream& s, const half& h) { s << (float) h.Get(); return s; } +} // namespace gt + #endif From 93effea725cf630d49e87695c269294a8d5d3713 Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Thu, 17 Aug 2023 18:04:24 +0200 Subject: [PATCH 05/38] [feat] Further operators -,/,== for test_half --- include/gtensor/half.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index 748177b0..c46f1851 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -35,6 +35,15 @@ TARGET_ARCH const half operator+(const half& lhs, const half& rhs) #endif } +TARGET_ARCH const half operator-(const half& lhs, const half& rhs) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + return half( lhs.Get() - rhs.Get() ); +#else + return half( float(lhs.Get()) - float(rhs.Get()) ); +#endif +} + TARGET_ARCH const half operator*(const half& lhs, const half& rhs) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) @@ -44,6 +53,24 @@ TARGET_ARCH const half operator*(const half& lhs, const half& rhs) #endif } +TARGET_ARCH const half operator/(const half& lhs, const half& rhs) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + return half( lhs.Get() / rhs.Get() ); +#else + return half( float(lhs.Get()) / float(rhs.Get()) ); +#endif +} + +TARGET_ARCH bool operator==(const half& lhs, const half& rhs) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + return lhs.Get() == rhs.Get(); +#else + return float(lhs.Get()) == float(rhs.Get()); +#endif +} + std::ostream& operator<<(std::ostream& s, const half& h) { s << (float) h.Get(); return s; } From 
465a4653b6a3801b1d80cae48f8a71686aecd741 Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Fri, 18 Aug 2023 15:14:42 +0200 Subject: [PATCH 06/38] [test] Host auto initialize; test needs operator!= + NOTE: Straightforward auto-initialization doesn't work on device, i.e., doesn't compile for gt::gtensor --> will be documented in one of the next commits --- include/gtensor/half.h | 5 +++++ tests/test_half.cxx | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index c46f1851..e54abb03 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -71,6 +71,11 @@ TARGET_ARCH bool operator==(const half& lhs, const half& rhs) #endif } +TARGET_ARCH bool operator!=(const half& lhs, const half& rhs) +{ + return !(lhs == rhs); +} + std::ostream& operator<<(std::ostream& s, const half& h) { s << (float) h.Get(); return s; } diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 43a4aafe..da4945d6 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -28,3 +28,22 @@ TEST(half, ScalarArithmetic) ref = 0.5; EXPECT_EQ(c, ref); } + +template +void generic_fill_1d(gt::gtensor& x, const gt::half& fill_value) +{ + auto k_x = x.to_kernel(); + + gt::launch<1, S>(x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); +} + +TEST(half, AutoInitHost) +{ + gt::half fill_value{1.25}; + gt::gtensor a(gt::shape(5), fill_value); + gt::gtensor b(a.shape()); + + generic_fill_1d(b, fill_value); + + EXPECT_EQ(a, b); +} From 4d2852a63b2147d261b8d4c6a4aa5eca8d818df6 Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Fri, 18 Aug 2023 15:37:21 +0200 Subject: [PATCH 07/38] [test] Explicit kernel haxpy on host + device + NOTE: Need to use cmake with CMAKE_CUDA_ARCHITECTURES flag. E.g., -DCMAKE_CUDA_ARCHITECTURES=80 for A100 [some value >= 53 to properly use CUDA __half type] + TODO: Later maybe this can be taken care of in CMakeLists.txt --- tests/test_half.cxx | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index da4945d6..3aab6aee 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -47,3 +47,57 @@ TEST(half, AutoInitHost) EXPECT_EQ(a, b); } + +void host_explicit_haxpy_1d(const gt::half& a, + const gt::gtensor& x, + gt::gtensor& y) +{ + auto k_x = x.to_kernel(); + auto k_y = y.to_kernel(); + + gt::launch_host<1>( + y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); +} + +TEST(half, HaxpyExplicit1dHost) +{ + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::half a{0.5}; + gt::gtensor ref(x.shape(), 3.25); + + host_explicit_haxpy_1d(a, x, y); + + EXPECT_EQ(y, ref); +} + +template +void generic_explicit_haxpy_1d( const gt::half& a, + const gt::gtensor& x, + gt::gtensor& y) +{ + auto k_x = x.to_kernel(); + auto k_y = y.to_kernel(); + + gt::launch<1, S>( + y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); +} + +TEST(half, HaxpyExplicit1dDevice) +{ + gt::gtensor h_x(gt::shape(3), gt::half{1.5}); + gt::gtensor h_y(h_x.shape(), gt::half{2.5}); + gt::half a{0.5}; + gt::gtensor ref(h_x.shape(), 3.25); + + gt::gtensor d_x(h_x.shape()); + gt::gtensor d_y(h_y.shape()); + gt::copy(h_x, d_x); + gt::copy(h_y, d_y); + + generic_explicit_haxpy_1d(a, d_x, d_y); + + gt::copy(d_y, h_y); + + EXPECT_EQ(h_y, ref); +} From 3856889dfa95c557729be7b72b3d77a6c3893df1 Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Fri, 18 Aug 2023 16:13:53 +0200 Subject: [PATCH 08/38] [test] Documents failure of 
device half auto init + See NOTE in commit db3eec8 + Works for host+half, host+float, device+float + Fails for device+half --- tests/CMakeLists.txt | 1 + tests/test_half_failing.cxx | 75 +++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 tests/test_half_failing.cxx diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 90538c02..396b5caa 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -44,6 +44,7 @@ add_gtensor_test(test_stream) add_gtensor_test(test_gtest_predicates) add_gtensor_test(test_sparse) add_gtensor_test(test_half) +add_gtensor_test(test_half_failing) if (GTENSOR_ENABLE_CLIB) add_executable(test_clib) diff --git a/tests/test_half_failing.cxx b/tests/test_half_failing.cxx new file mode 100644 index 00000000..6a5c82b8 --- /dev/null +++ b/tests/test_half_failing.cxx @@ -0,0 +1,75 @@ +#include + +#include + +#include + +template +void generic_fill_1d(gt::gtensor& x, const fp_type& fill_value) +{ + auto k_x = x.to_kernel(); + + gt::launch<1, S>( + x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); +} + +TEST(halfFailing, AutoInitHostFloat) +{ + float fill_value{1.25}; + gt::gtensor a(gt::shape(5), fill_value); + gt::gtensor b(a.shape()); + + generic_fill_1d(b, fill_value); + + EXPECT_EQ(a, b); +} + +TEST(halfFailing, AutoInitHostHalf) +{ + gt::half fill_value{1.25}; + gt::gtensor a(gt::shape(5), fill_value); + gt::gtensor b(a.shape()); + + generic_fill_1d(b, fill_value); + + EXPECT_EQ(a, b); +} + +TEST(halfFailing, AutoInitDeviceFloat) +{ + float fill_value{1.25}; + gt::gtensor d_a(gt::shape(5), fill_value); + gt::gtensor d_b(d_a.shape()); + + generic_fill_1d(d_b, fill_value); + + gt::gtensor h_a(d_a.shape()); + gt::copy(d_a, h_a); + gt::gtensor h_b(d_a.shape()); + gt::copy(d_b, h_b); + + EXPECT_EQ(h_a, h_b); +} + +TEST(halfFailing, AutoInitDeviceHalf) +{ + gt::half fill_value{1.25}; + // DOES NOT COMPILE !!! 
---------------------------------------------------- + // gt::gtensor d_a(gt::shape(5), fill_value); + EXPECT_EQ(true, false); + // temporary workaround: + gt::gtensor d_a(gt::shape(5)); + generic_fill_1d(d_a, fill_value); + // ------------------------------------------------------------------------- + + gt::gtensor d_b(d_a.shape()); + + generic_fill_1d(d_b, fill_value); + + gt::gtensor h_a(d_a.shape()); + gt::copy(d_a, h_a); + gt::gtensor h_b(d_a.shape()); + gt::copy(d_b, h_b); + + EXPECT_EQ(h_a, h_b); +} From f6d04c0b59180dd629598c2838af448511a55ae4 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 21 Aug 2023 13:32:55 +0200 Subject: [PATCH 09/38] [test] Implicit kernel haxpy on host + device --- tests/test_half.cxx | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 3aab6aee..dbd60bd8 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -101,3 +101,34 @@ TEST(half, HaxpyExplicit1dDevice) EXPECT_EQ(h_y, ref); } + +TEST(half, HaxpyImplicit1dHost) +{ + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::half a{0.5}; + gt::gtensor ref(x.shape(), 3.25); + + y = a * x + y; + + EXPECT_EQ(y, ref); +} + +TEST(half, HaxpyImplicit1dDevice) +{ + gt::gtensor h_x(gt::shape(3), 1.5); + gt::gtensor h_y(h_x.shape(), 2.5); + gt::half a{0.5}; + gt::gtensor ref(h_x.shape(), 3.25); + + gt::gtensor d_x(h_x.shape()); + gt::gtensor d_y(h_y.shape()); + gt::copy(h_x, d_x); + gt::copy(h_y, d_y); + + d_y = a * d_x + d_y; + + gt::copy(d_y, h_y); + + EXPECT_EQ(h_y, ref); +} From f8998920909430c9f20ac700f5f09d0c0242f501 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 21 Aug 2023 14:15:10 +0200 Subject: [PATCH 10/38] [fix/test] Fix failure of device half auto init + Fixes problem decribed in e5ea095 by enabling default constructor of gt::half --- include/gtensor/half.h | 1 + tests/test_half_failing.cxx | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index e54abb03..742d23ce 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -18,6 +18,7 @@ namespace gt class half { public: + half() = default; TARGET_ARCH half(float x) : x(x) {}; TARGET_ARCH half(__half x) : x(x) {}; TARGET_ARCH const half& operator=(const float f) { x = f; return *this; } diff --git a/tests/test_half_failing.cxx b/tests/test_half_failing.cxx index 6a5c82b8..a02b3807 100644 --- a/tests/test_half_failing.cxx +++ b/tests/test_half_failing.cxx @@ -55,11 +55,11 @@ TEST(halfFailing, AutoInitDeviceHalf) { gt::half fill_value{1.25}; // DOES NOT COMPILE !!! 
---------------------------------------------------- - // gt::gtensor d_a(gt::shape(5), fill_value); - EXPECT_EQ(true, false); - // temporary workaround: - gt::gtensor d_a(gt::shape(5)); - generic_fill_1d(d_a, fill_value); + gt::gtensor d_a(gt::shape(5), fill_value); +// EXPECT_EQ(true, false); +// // temporary workaround: +// gt::gtensor d_a(gt::shape(5)); +// generic_fill_1d(d_a, fill_value); // ------------------------------------------------------------------------- gt::gtensor d_b(d_a.shape()); From fb56bb594ccc9611d0ddb7a5e54dc163e53ffc9c Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 21 Aug 2023 14:26:11 +0200 Subject: [PATCH 11/38] [test] Remove test_half_failing + Since device half auto init works since last commit + Corresponding test added to test_half --- tests/CMakeLists.txt | 1 - tests/test_half.cxx | 16 ++++++++ tests/test_half_failing.cxx | 75 ------------------------------------- 3 files changed, 16 insertions(+), 76 deletions(-) delete mode 100644 tests/test_half_failing.cxx diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 396b5caa..90538c02 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -44,7 +44,6 @@ add_gtensor_test(test_stream) add_gtensor_test(test_gtest_predicates) add_gtensor_test(test_sparse) add_gtensor_test(test_half) -add_gtensor_test(test_half_failing) if (GTENSOR_ENABLE_CLIB) add_executable(test_clib) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index dbd60bd8..b262c29e 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -48,6 +48,22 @@ TEST(half, AutoInitHost) EXPECT_EQ(a, b); } +TEST(half, AutoInitDevice) +{ + gt::half fill_value{1.25}; + gt::gtensor d_a(gt::shape(5), fill_value); + gt::gtensor d_b(d_a.shape()); + + generic_fill_1d(d_b, fill_value); + + gt::gtensor h_a(d_a.shape()); + gt::gtensor h_b(d_b.shape()); + gt::copy(d_a, h_a); + gt::copy(d_b, h_b); + + EXPECT_EQ(h_a, h_b); +} + void host_explicit_haxpy_1d(const gt::half& a, const gt::gtensor& x, gt::gtensor& y) diff --git a/tests/test_half_failing.cxx b/tests/test_half_failing.cxx deleted file mode 100644 index a02b3807..00000000 --- a/tests/test_half_failing.cxx +++ /dev/null @@ -1,75 +0,0 @@ -#include - -#include - -#include - -template -void generic_fill_1d(gt::gtensor& x, const fp_type& fill_value) -{ - auto k_x = x.to_kernel(); - - gt::launch<1, S>( - x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); -} - -TEST(halfFailing, AutoInitHostFloat) -{ - float fill_value{1.25}; - gt::gtensor a(gt::shape(5), fill_value); - gt::gtensor b(a.shape()); - - generic_fill_1d(b, fill_value); - - EXPECT_EQ(a, b); -} - -TEST(halfFailing, AutoInitHostHalf) -{ - gt::half fill_value{1.25}; - gt::gtensor a(gt::shape(5), fill_value); - gt::gtensor b(a.shape()); - - generic_fill_1d(b, fill_value); - - EXPECT_EQ(a, b); -} - -TEST(halfFailing, AutoInitDeviceFloat) -{ - float fill_value{1.25}; - gt::gtensor d_a(gt::shape(5), fill_value); - gt::gtensor d_b(d_a.shape()); - - generic_fill_1d(d_b, fill_value); - - gt::gtensor h_a(d_a.shape()); - gt::copy(d_a, h_a); - gt::gtensor h_b(d_a.shape()); - gt::copy(d_b, h_b); - - EXPECT_EQ(h_a, h_b); -} - -TEST(halfFailing, AutoInitDeviceHalf) -{ - gt::half fill_value{1.25}; - // DOES NOT COMPILE !!! 
---------------------------------------------------- - gt::gtensor d_a(gt::shape(5), fill_value); -// EXPECT_EQ(true, false); -// // temporary workaround: -// gt::gtensor d_a(gt::shape(5)); -// generic_fill_1d(d_a, fill_value); - // ------------------------------------------------------------------------- - - gt::gtensor d_b(d_a.shape()); - - generic_fill_1d(d_b, fill_value); - - gt::gtensor h_a(d_a.shape()); - gt::copy(d_a, h_a); - gt::gtensor h_b(d_a.shape()); - gt::copy(d_b, h_b); - - EXPECT_EQ(h_a, h_b); -} From 25608b829773ac7396576cede608e02e2704b919 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 21 Aug 2023 14:47:48 +0200 Subject: [PATCH 12/38] [test] Cleaner device tests with auto init --- tests/test_half.cxx | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index b262c29e..4ff22826 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -101,15 +101,11 @@ void generic_explicit_haxpy_1d( const gt::half& a, TEST(half, HaxpyExplicit1dDevice) { - gt::gtensor h_x(gt::shape(3), gt::half{1.5}); - gt::gtensor h_y(h_x.shape(), gt::half{2.5}); + gt::gtensor d_x(gt::shape(3), 1.5); + gt::gtensor d_y(d_x.shape(), 2.5); gt::half a{0.5}; - gt::gtensor ref(h_x.shape(), 3.25); - - gt::gtensor d_x(h_x.shape()); - gt::gtensor d_y(h_y.shape()); - gt::copy(h_x, d_x); - gt::copy(h_y, d_y); + gt::gtensor ref(d_y.shape(), 3.25); + gt::gtensor h_y(d_y.shape()); generic_explicit_haxpy_1d(a, d_x, d_y); @@ -132,15 +128,11 @@ TEST(half, HaxpyImplicit1dHost) TEST(half, HaxpyImplicit1dDevice) { - gt::gtensor h_x(gt::shape(3), 1.5); - gt::gtensor h_y(h_x.shape(), 2.5); + gt::gtensor d_x(gt::shape(3), 1.5); + gt::gtensor d_y(d_x.shape(), 2.5); gt::half a{0.5}; - gt::gtensor ref(h_x.shape(), 3.25); - - gt::gtensor d_x(h_x.shape()); - gt::gtensor d_y(h_y.shape()); - gt::copy(h_x, d_x); - gt::copy(h_y, d_y); + gt::gtensor ref(d_y.shape(), 3.25); + gt::gtensor h_y(d_y.shape()); d_y = a * d_x + d_y; From 8356a023e600110e1feea756e7a0fef86f01f642 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 21 Aug 2023 14:50:06 +0200 Subject: [PATCH 13/38] [clean] Uniform indentation in test_half --- tests/test_half.cxx | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 4ff22826..a084ae4e 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -32,9 +32,9 @@ TEST(half, ScalarArithmetic) template void generic_fill_1d(gt::gtensor& x, const gt::half& fill_value) { - auto k_x = x.to_kernel(); + auto k_x = x.to_kernel(); - gt::launch<1, S>(x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); + gt::launch<1, S>(x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); } TEST(half, AutoInitHost) @@ -68,11 +68,11 @@ void host_explicit_haxpy_1d(const gt::half& a, const gt::gtensor& x, gt::gtensor& y) { - auto k_x = x.to_kernel(); - auto k_y = y.to_kernel(); + auto k_x = x.to_kernel(); + auto k_y = y.to_kernel(); - gt::launch_host<1>( - y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); + gt::launch_host<1>( + y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } TEST(half, HaxpyExplicit1dHost) @@ -92,11 +92,11 @@ void generic_explicit_haxpy_1d( const gt::half& a, const gt::gtensor& x, gt::gtensor& y) { - auto k_x = x.to_kernel(); - auto k_y = y.to_kernel(); + auto k_x = x.to_kernel(); + auto k_y = y.to_kernel(); - gt::launch<1, S>( - y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); + gt::launch<1, S>( + y.shape(), 
GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } TEST(half, HaxpyExplicit1dDevice) From 59d3c90cfcc3138acbbfef7db2104085360a7a33 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 21 Aug 2023 16:01:48 +0200 Subject: [PATCH 14/38] [refactor] Reduce # of directives --- include/gtensor/half.h | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index 742d23ce..3a45a2c7 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -11,8 +11,10 @@ namespace gt // half #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) #define TARGET_ARCH __host__ __device__ +using compute_type = __half; #else #define TARGET_ARCH +using compute_type = float; #endif class half @@ -22,54 +24,34 @@ class half TARGET_ARCH half(float x) : x(x) {}; TARGET_ARCH half(__half x) : x(x) {}; TARGET_ARCH const half& operator=(const float f) { x = f; return *this; } - TARGET_ARCH const __half& Get() const { return x; } + TARGET_ARCH compute_type Get() const { return static_cast(x); } private: __half x; }; TARGET_ARCH const half operator+(const half& lhs, const half& rhs) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) return half( lhs.Get() + rhs.Get() ); -#else - return half( float(lhs.Get()) + float(rhs.Get()) ); -#endif } TARGET_ARCH const half operator-(const half& lhs, const half& rhs) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) return half( lhs.Get() - rhs.Get() ); -#else - return half( float(lhs.Get()) - float(rhs.Get()) ); -#endif } TARGET_ARCH const half operator*(const half& lhs, const half& rhs) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) return half( lhs.Get() * rhs.Get() ); -#else - return half( float(lhs.Get()) * float(rhs.Get()) ); -#endif } TARGET_ARCH const half operator/(const half& lhs, const half& rhs) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) return half( lhs.Get() / rhs.Get() ); -#else - return half( float(lhs.Get()) / float(rhs.Get()) ); -#endif } TARGET_ARCH bool operator==(const half& lhs, const half& rhs) { -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) return lhs.Get() == rhs.Get(); -#else - return float(lhs.Get()) == float(rhs.Get()); -#endif } TARGET_ARCH bool operator!=(const half& lhs, const half& rhs) From 7937c8ca5efd8768c30ba76ae1abc7b2e81a5677 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Tue, 22 Aug 2023 10:31:58 +0200 Subject: [PATCH 15/38] [refactor] Macro for arithmetic operators --- include/gtensor/half.h | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index 3a45a2c7..fed3c745 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -23,31 +23,21 @@ class half half() = default; TARGET_ARCH half(float x) : x(x) {}; TARGET_ARCH half(__half x) : x(x) {}; + TARGET_ARCH const half& operator=(const float f) { x = f; return *this; } TARGET_ARCH compute_type Get() const { return static_cast(x); } private: __half x; }; -TARGET_ARCH const half operator+(const half& lhs, const half& rhs) -{ - return half( lhs.Get() + rhs.Get() ); -} - -TARGET_ARCH const half operator-(const half& lhs, const half& rhs) -{ - return half( lhs.Get() - rhs.Get() ); -} +#define PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(op) \ + TARGET_ARCH const half operator op(const half& lhs, const half& rhs) \ + { return half( lhs.Get() op rhs.Get() ); } -TARGET_ARCH const half operator*(const half& lhs, const half& rhs) -{ - return half( lhs.Get() * rhs.Get() ); -} - -TARGET_ARCH 
const half operator/(const half& lhs, const half& rhs) -{ - return half( lhs.Get() / rhs.Get() ); -} +PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(+); +PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(-); +PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(*); +PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(/); TARGET_ARCH bool operator==(const half& lhs, const half& rhs) { From f868ce8d42ae4029bb4244261a747be25887ede9 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Tue, 22 Aug 2023 10:37:49 +0200 Subject: [PATCH 16/38] [test] Test gt::half comparisons (==,!=,<,<=,>,>=) --- tests/test_half.cxx | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index a084ae4e..576139dd 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -29,6 +29,46 @@ TEST(half, ScalarArithmetic) EXPECT_EQ(c, ref); } +TEST(half, BinaryComparisonOperators) +{ + gt::half a{1.0}; + gt::half b{2.0}; + gt::half c{2.0}; + + EXPECT_EQ(a, a); + EXPECT_EQ(b, b); + EXPECT_EQ(b, c); + EXPECT_EQ(c, b); + EXPECT_EQ(c, c); + + EXPECT_NE(a, b); + EXPECT_NE(a, c); + EXPECT_NE(b, a); + EXPECT_NE(c, a); + + EXPECT_LT(a, b); + EXPECT_LT(a, c); + + EXPECT_LE(a, a); + EXPECT_LE(a, b); + EXPECT_LE(a, c); + EXPECT_LE(b, b); + EXPECT_LE(b, c); + EXPECT_LE(c, b); + EXPECT_LE(c, c); + + EXPECT_GT(b, a); + EXPECT_GT(c, a); + + EXPECT_GE(a, a); + EXPECT_GE(b, a); + EXPECT_GE(b, b); + EXPECT_GE(b, c); + EXPECT_GE(c, a); + EXPECT_GE(c, b); + EXPECT_GE(c, c); +} + template void generic_fill_1d(gt::gtensor& x, const gt::half& fill_value) { From 2a18632193a4e429791cfa6d536732ae38300f50 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Tue, 22 Aug 2023 10:38:49 +0200 Subject: [PATCH 17/38] [feat] Comparisons gt::half (==,!=,<,<=,>,>=) --- include/gtensor/half.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index fed3c745..9d76ad2c 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -49,6 +49,26 @@ TARGET_ARCH bool operator!=(const half& lhs, const half& rhs) return !(lhs == rhs); } +TARGET_ARCH bool operator<(const half& lhs, const half& rhs) +{ + return lhs.Get() < rhs.Get(); +} + +TARGET_ARCH bool operator<=(const half& lhs, const half& rhs) +{ + return lhs.Get() <= rhs.Get(); +} + +TARGET_ARCH bool operator>(const half& lhs, const half& rhs) +{ + return lhs.Get() > rhs.Get(); +} + +TARGET_ARCH bool operator>=(const half& lhs, const half& rhs) +{ + return lhs.Get() >= rhs.Get(); +} + std::ostream& operator<<(std::ostream& s, const half& h) { s << (float) h.Get(); return s; } From c4eeb25bf5ea1b449cdaab047f00557a516b2ef7 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Tue, 22 Aug 2023 10:42:32 +0200 Subject: [PATCH 18/38] [refactor] Macro for comparison operators --- include/gtensor/half.h | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index 9d76ad2c..54bfd550 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -39,35 +39,16 @@ PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(-); PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(*); PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(/); -TARGET_ARCH bool operator==(const half& lhs, const half& rhs) -{ - return lhs.Get() == rhs.Get(); -} - -TARGET_ARCH bool operator!=(const half& lhs, const half& rhs) -{ - return !(lhs == rhs); -} - -TARGET_ARCH bool operator<(const half& lhs, const half& rhs) -{ - return lhs.Get() < rhs.Get(); -} - -TARGET_ARCH bool operator<=(const half& 
lhs, const half& rhs) -{ - return lhs.Get() <= rhs.Get(); -} - -TARGET_ARCH bool operator>(const half& lhs, const half& rhs) -{ - return lhs.Get() > rhs.Get(); -} - -TARGET_ARCH bool operator>=(const half& lhs, const half& rhs) -{ - return lhs.Get() >= rhs.Get(); -} +#define PROVIDE_HALF_COMPARISON_OPERATOR(op) \ + TARGET_ARCH bool operator op(const half& lhs, const half& rhs) \ + { return lhs.Get() op rhs.Get(); } + +PROVIDE_HALF_COMPARISON_OPERATOR(==); +PROVIDE_HALF_COMPARISON_OPERATOR(!=); +PROVIDE_HALF_COMPARISON_OPERATOR(<); +PROVIDE_HALF_COMPARISON_OPERATOR(<=); +PROVIDE_HALF_COMPARISON_OPERATOR(>); +PROVIDE_HALF_COMPARISON_OPERATOR(>=); std::ostream& operator<<(std::ostream& s, const half& h) { s << (float) h.Get(); return s; } From 4077de47496991ee19dcbebba34bf52c209a9155 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Wed, 23 Aug 2023 13:37:43 +0200 Subject: [PATCH 19/38] [test] Larger custom kernel using +,-,*,/ + Test scalar, implicit/explicit kernel, host/device results match --- tests/test_half.cxx | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 576139dd..4c5f94c1 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -180,3 +180,64 @@ TEST(half, HaxpyImplicit1dDevice) EXPECT_EQ(h_y, ref); } + +template +void generic_explicit_custom_kernel_1d( const gt::half& s1, + const gt::half& s2, + const gt::gtensor& a, + const gt::gtensor& b, + const gt::gtensor& c, + const gt::gtensor& d, + const gt::gtensor& e, + gt::gtensor& result) +{ + auto k_a = a.to_kernel(); + auto k_b = b.to_kernel(); + auto k_c = c.to_kernel(); + auto k_d = d.to_kernel(); + auto k_e = e.to_kernel(); + auto k_r = result.to_kernel(); + + gt::launch<1, S>(result.shape(), GT_LAMBDA(int i) + { k_r(i) = s2 - k_e(i) * ((k_a(i) - s1 * k_b(i)) / k_c(i) + k_d(i)); }); +} + +TEST(half, CustomKernelExplicitImplicitHostDevice) +{ + gt::half a_val{12.34}, b_val{2.345}, c_val{0.987}, d_val{0.67}, e_val{3.14}; + gt::half s1{0.1}, s2{4.56}; + + gt::half r = s2 - e_val * ((a_val - s1 * b_val) / c_val + d_val); + + auto shape = gt::shape(3); + + gt::gtensor h_a(shape, a_val); + gt::gtensor h_b(shape, b_val); + gt::gtensor h_c(shape, c_val); + gt::gtensor h_d(shape, d_val); + gt::gtensor h_e(shape, e_val); + gt::gtensor h_r_expl(shape); + gt::gtensor h_r_impl(shape); + + gt::gtensor d_a(shape, a_val); + gt::gtensor d_b(shape, b_val); + gt::gtensor d_c(shape, c_val); + gt::gtensor d_d(shape, d_val); + gt::gtensor d_e(shape, e_val); + gt::gtensor d_r_expl(shape); + gt::gtensor d_r_impl(shape); + + h_r_impl = s2 - h_e * ((h_a - s1 * h_b) / h_c + h_d); + d_r_impl = s2 - d_e * ((d_a - s1 * d_b) / d_c + d_d); + + generic_explicit_custom_kernel_1d(s1, s2, + h_a, h_b, h_c, h_d, h_e, h_r_expl); + + generic_explicit_custom_kernel_1d(s1, s2, + d_a, d_b, d_c, d_d, d_e, d_r_expl); + + EXPECT_EQ(h_r_impl(2), r); + EXPECT_EQ(h_r_impl, h_r_expl); + EXPECT_EQ(h_r_impl, d_r_expl); + EXPECT_EQ(h_r_impl, d_r_impl); +} From 558b9e0fa5fb58898aa2c31cae95169c205923d4 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Wed, 23 Aug 2023 16:00:58 +0200 Subject: [PATCH 20/38] [clean] Replace C-cast by static_cast --- include/gtensor/half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index 54bfd550..ec3b2664 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -51,7 +51,7 @@ PROVIDE_HALF_COMPARISON_OPERATOR(>); PROVIDE_HALF_COMPARISON_OPERATOR(>=); std::ostream& 
operator<<(std::ostream& s, const half& h) -{ s << (float) h.Get(); return s; } +{ s << static_cast(h.Get()); return s; } } // namespace gt From 796d394ab9955999be88dc5182ee5704d8482637 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Wed, 23 Aug 2023 16:01:43 +0200 Subject: [PATCH 21/38] [clean] Remove const on arithm. return type --- include/gtensor/half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index ec3b2664..ded36a8a 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -31,7 +31,7 @@ class half }; #define PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(op) \ - TARGET_ARCH const half operator op(const half& lhs, const half& rhs) \ + TARGET_ARCH half operator op(const half& lhs, const half& rhs) \ { return half( lhs.Get() op rhs.Get() ); } PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(+); From 5d74de64f1f1b1af931f771fa9805e37d37db458 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Wed, 23 Aug 2023 16:22:29 +0200 Subject: [PATCH 22/38] [test/clean] Shorten test by comparing device vecs --- tests/test_half.cxx | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 4c5f94c1..7c25eb53 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -91,17 +91,12 @@ TEST(half, AutoInitHost) TEST(half, AutoInitDevice) { gt::half fill_value{1.25}; - gt::gtensor d_a(gt::shape(5), fill_value); - gt::gtensor d_b(d_a.shape()); + gt::gtensor a(gt::shape(5), fill_value); + gt::gtensor b(a.shape()); - generic_fill_1d(d_b, fill_value); + generic_fill_1d(b, fill_value); - gt::gtensor h_a(d_a.shape()); - gt::gtensor h_b(d_b.shape()); - gt::copy(d_a, h_a); - gt::copy(d_b, h_b); - - EXPECT_EQ(h_a, h_b); + EXPECT_EQ(a, b); } void host_explicit_haxpy_1d(const gt::half& a, @@ -141,17 +136,14 @@ void generic_explicit_haxpy_1d( const gt::half& a, TEST(half, HaxpyExplicit1dDevice) { - gt::gtensor d_x(gt::shape(3), 1.5); - gt::gtensor d_y(d_x.shape(), 2.5); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); gt::half a{0.5}; - gt::gtensor ref(d_y.shape(), 3.25); - gt::gtensor h_y(d_y.shape()); + gt::gtensor ref(y.shape(), 3.25); - generic_explicit_haxpy_1d(a, d_x, d_y); + generic_explicit_haxpy_1d(a, x, y); - gt::copy(d_y, h_y); - - EXPECT_EQ(h_y, ref); + EXPECT_EQ(y, ref); } TEST(half, HaxpyImplicit1dHost) @@ -168,17 +160,14 @@ TEST(half, HaxpyImplicit1dHost) TEST(half, HaxpyImplicit1dDevice) { - gt::gtensor d_x(gt::shape(3), 1.5); - gt::gtensor d_y(d_x.shape(), 2.5); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); gt::half a{0.5}; - gt::gtensor ref(d_y.shape(), 3.25); - gt::gtensor h_y(d_y.shape()); + gt::gtensor ref(y.shape(), 3.25); - d_y = a * d_x + d_y; - - gt::copy(d_y, h_y); + y = a * x + y; - EXPECT_EQ(h_y, ref); + EXPECT_EQ(y, ref); } template From 2bf15eb1797e12d4fae4a3b31c92aee1217bb20a Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Thu, 24 Aug 2023 17:00:52 +0200 Subject: [PATCH 23/38] [test/feat] Mix precision gt::half w. 
float/double --- include/gtensor/half.h | 40 ++++++++++++++++++++++++++ tests/test_half.cxx | 64 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index ded36a8a..bd85e6fa 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -34,15 +34,41 @@ class half TARGET_ARCH half operator op(const half& lhs, const half& rhs) \ { return half( lhs.Get() op rhs.Get() ); } +#define PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(op, fp_type) \ + \ + TARGET_ARCH fp_type operator op(const half& lhs, const fp_type& rhs) \ + { return static_cast(lhs.Get()) op rhs; } \ + \ + TARGET_ARCH fp_type operator op(const fp_type& lhs, const half& rhs) \ + { return lhs op static_cast(rhs.Get()); } + PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(+); PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(-); PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(*); PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(/); +PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(+, float); +PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(-, float); +PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(*, float); +PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(/, float); + +PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(+, double); +PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(-, double); +PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(*, double); +PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(/, double); + #define PROVIDE_HALF_COMPARISON_OPERATOR(op) \ TARGET_ARCH bool operator op(const half& lhs, const half& rhs) \ { return lhs.Get() op rhs.Get(); } +#define PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(op, fp_type) \ + \ + TARGET_ARCH bool operator op(const half& lhs, const fp_type& rhs) \ + { return static_cast(lhs.Get()) op rhs; } \ + \ + TARGET_ARCH bool operator op(const fp_type& lhs, const half& rhs) \ + { return lhs op static_cast(rhs.Get()); } + PROVIDE_HALF_COMPARISON_OPERATOR(==); PROVIDE_HALF_COMPARISON_OPERATOR(!=); PROVIDE_HALF_COMPARISON_OPERATOR(<); @@ -50,6 +76,20 @@ PROVIDE_HALF_COMPARISON_OPERATOR(<=); PROVIDE_HALF_COMPARISON_OPERATOR(>); PROVIDE_HALF_COMPARISON_OPERATOR(>=); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(==, float); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(!=, float); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(<, float); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(<=, float); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>, float); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>=, float); + +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(==, double); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(!=, double); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(<, double); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(<=, double); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>, double); +PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>=, double); + std::ostream& operator<<(std::ostream& s, const half& h) { s << static_cast(h.Get()); return s; } diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 7c25eb53..21d41923 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -230,3 +230,67 @@ TEST(half, CustomKernelExplicitImplicitHostDevice) EXPECT_EQ(h_r_impl, d_r_expl); EXPECT_EQ(h_r_impl, d_r_impl); } + +TEST(half, MixedPrecisionScalar) +{ + gt::half a_half{1.0}; + + gt::half b_half{2.0}; + float b_float{2.0}; + double b_double{2.0}; + + auto c_half = a_half + b_half; + auto c_float = a_half + b_float; + auto c_double = a_half + b_double; + + EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); + + EXPECT_EQ(c_half, c_float); + EXPECT_EQ(c_half, c_double); +} + 
+TEST(half, MixedPrecisionHost) +{ + auto shape = gt::shape(3); + gt::gtensor vh(shape, 4.0); + gt::gtensor vf(shape, 3.0); + gt::gtensor vd(shape, 2.0); + + gt::gtensor rh(shape); + gt::gtensor rf(shape); + gt::gtensor rd(shape); + + gt::gtensor ref(shape, 10.0); + + rh = (vh * vf) - (vh / vd); + rf = (vh * vf) - (vh / vd); + rd = (vh * vf) - (vh / vd); + + EXPECT_EQ(ref, rh); + EXPECT_EQ(ref, rf); + EXPECT_EQ(ref, rd); +} + +TEST(half, MixedPrecisionDevice) +{ + auto shape = gt::shape(3); + gt::gtensor vh(shape, 4.0); + gt::gtensor vf(shape, 3.0); + gt::gtensor vd(shape, 2.0); + + gt::gtensor rh(shape); + gt::gtensor rf(shape); + gt::gtensor rd(shape); + + gt::gtensor ref(shape, 10.0); + + rh = (vh * vf) - (vh / vd); + rf = (vh * vf) - (vh / vd); + rd = (vh * vf) - (vh / vd); + + EXPECT_EQ(ref, rh); + EXPECT_EQ(ref, rf); + EXPECT_EQ(ref, rd); +} From 20ebf2ff082044dad355b655e070cf3520938ef1 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Thu, 24 Aug 2023 17:06:32 +0200 Subject: [PATCH 24/38] [fix] #undef macros in half.h --- include/gtensor/half.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index bd85e6fa..6ae3212c 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -93,6 +93,12 @@ PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>=, double); std::ostream& operator<<(std::ostream& s, const half& h) { s << static_cast(h.Get()); return s; } +#undef TARGET_ARCH +#undef PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR +#undef PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR +#undef PROVIDE_HALF_COMPARISON_OPERATOR +#undef PROVIDE_MIXED_HALF_COMPARISON_OPERATOR + } // namespace gt #endif From e887e9c56e22664456fecef0a14d308ac329bc94 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Fri, 25 Aug 2023 10:18:20 +0200 Subject: [PATCH 25/38] [fix] Build cuda for compute capability < 5.3 --- include/gtensor/half.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index 6ae3212c..dd3e6a7b 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -9,11 +9,15 @@ namespace gt // ====================================================================== // half -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#ifdef __CUDA_ARCH__ #define TARGET_ARCH __host__ __device__ -using compute_type = __half; #else #define TARGET_ARCH +#endif + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +using compute_type = __half; +#else using compute_type = float; #endif From 7a05b882d2b2160983f58bdada25040fb6a15b53 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 28 Aug 2023 13:56:37 +0200 Subject: [PATCH 26/38] [fix] Inline gt::half methods/ops via GT_INLINE --- include/gtensor/half.h | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index dd3e6a7b..f84e9858 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -3,17 +3,13 @@ #include #include +#include namespace gt { // ====================================================================== // half -#ifdef __CUDA_ARCH__ -#define TARGET_ARCH __host__ __device__ -#else -#define TARGET_ARCH -#endif #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) using compute_type = __half; @@ -25,25 +21,25 @@ class half { public: half() = default; - TARGET_ARCH half(float x) : x(x) {}; - TARGET_ARCH half(__half x) : x(x) {}; + GT_INLINE half(float x) : x(x) {}; + GT_INLINE half(__half x) : x(x) {}; - TARGET_ARCH const half& operator=(const float 
f) { x = f; return *this; } - TARGET_ARCH compute_type Get() const { return static_cast(x); } + GT_INLINE const half& operator=(const float f) { x = f; return *this; } + GT_INLINE compute_type Get() const { return static_cast(x); } private: __half x; }; #define PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(op) \ - TARGET_ARCH half operator op(const half& lhs, const half& rhs) \ + GT_INLINE half operator op(const half& lhs, const half& rhs) \ { return half( lhs.Get() op rhs.Get() ); } #define PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(op, fp_type) \ \ - TARGET_ARCH fp_type operator op(const half& lhs, const fp_type& rhs) \ + GT_INLINE fp_type operator op(const half& lhs, const fp_type& rhs) \ { return static_cast(lhs.Get()) op rhs; } \ \ - TARGET_ARCH fp_type operator op(const fp_type& lhs, const half& rhs) \ + GT_INLINE fp_type operator op(const fp_type& lhs, const half& rhs) \ { return lhs op static_cast(rhs.Get()); } PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(+); @@ -62,15 +58,15 @@ PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(*, double); PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(/, double); #define PROVIDE_HALF_COMPARISON_OPERATOR(op) \ - TARGET_ARCH bool operator op(const half& lhs, const half& rhs) \ + GT_INLINE bool operator op(const half& lhs, const half& rhs) \ { return lhs.Get() op rhs.Get(); } #define PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(op, fp_type) \ \ - TARGET_ARCH bool operator op(const half& lhs, const fp_type& rhs) \ + GT_INLINE bool operator op(const half& lhs, const fp_type& rhs) \ { return static_cast(lhs.Get()) op rhs; } \ \ - TARGET_ARCH bool operator op(const fp_type& lhs, const half& rhs) \ + GT_INLINE bool operator op(const fp_type& lhs, const half& rhs) \ { return lhs op static_cast(rhs.Get()); } PROVIDE_HALF_COMPARISON_OPERATOR(==); @@ -97,7 +93,6 @@ PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>=, double); std::ostream& operator<<(std::ostream& s, const half& h) { s << static_cast(h.Get()); return s; } -#undef TARGET_ARCH #undef PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR #undef PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR #undef PROVIDE_HALF_COMPARISON_OPERATOR From 7572edc7ed2fffad74071a6e589fa9920f81fba0 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 28 Aug 2023 14:05:33 +0200 Subject: [PATCH 27/38] [test/clean] Use snake_case for test names --- tests/test_half.cxx | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 21d41923..3689ac46 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -4,7 +4,7 @@ #include -TEST(half, ScalarArithmetic) +TEST(half, scalar_arithmetic) { gt::half a{1.0}; gt::half b{2.0}; @@ -29,7 +29,7 @@ TEST(half, ScalarArithmetic) EXPECT_EQ(c, ref); } -TEST(half, BinaryComparisonOperators) +TEST(half, binary_comparison_operators) { gt::half a{1.0}; gt::half b{2.0}; @@ -77,7 +77,7 @@ void generic_fill_1d(gt::gtensor& x, const gt::half& fill_value) gt::launch<1, S>(x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); } -TEST(half, AutoInitHost) +TEST(half, auto_init_host) { gt::half fill_value{1.25}; gt::gtensor a(gt::shape(5), fill_value); @@ -88,7 +88,7 @@ TEST(half, AutoInitHost) EXPECT_EQ(a, b); } -TEST(half, AutoInitDevice) +TEST(half, auto_init_device) { gt::half fill_value{1.25}; gt::gtensor a(gt::shape(5), fill_value); @@ -110,7 +110,7 @@ void host_explicit_haxpy_1d(const gt::half& a, y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } -TEST(half, HaxpyExplicit1dHost) +TEST(half, haxpy_explicit_1d_host) { gt::gtensor 
x(gt::shape(3), 1.5); gt::gtensor y(x.shape(), 2.5); @@ -134,7 +134,7 @@ void generic_explicit_haxpy_1d( const gt::half& a, y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } -TEST(half, HaxpyExplicit1dDevice) +TEST(half, haxpy_explicit_1d_device) { gt::gtensor x(gt::shape(3), 1.5); gt::gtensor y(x.shape(), 2.5); @@ -146,7 +146,7 @@ TEST(half, HaxpyExplicit1dDevice) EXPECT_EQ(y, ref); } -TEST(half, HaxpyImplicit1dHost) +TEST(half, haxpy_implicit_1d_host) { gt::gtensor x(gt::shape(3), 1.5); gt::gtensor y(x.shape(), 2.5); @@ -158,7 +158,7 @@ TEST(half, HaxpyImplicit1dHost) EXPECT_EQ(y, ref); } -TEST(half, HaxpyImplicit1dDevice) +TEST(half, haxpy_implicit_1d_device) { gt::gtensor x(gt::shape(3), 1.5); gt::gtensor y(x.shape(), 2.5); @@ -191,7 +191,7 @@ void generic_explicit_custom_kernel_1d( const gt::half& s1, { k_r(i) = s2 - k_e(i) * ((k_a(i) - s1 * k_b(i)) / k_c(i) + k_d(i)); }); } -TEST(half, CustomKernelExplicitImplicitHostDevice) +TEST(half, custom_kernel_explicit_implicit_host_device) { gt::half a_val{12.34}, b_val{2.345}, c_val{0.987}, d_val{0.67}, e_val{3.14}; gt::half s1{0.1}, s2{4.56}; @@ -231,7 +231,7 @@ TEST(half, CustomKernelExplicitImplicitHostDevice) EXPECT_EQ(h_r_impl, d_r_impl); } -TEST(half, MixedPrecisionScalar) +TEST(half, mixed_precision_scalar) { gt::half a_half{1.0}; @@ -251,7 +251,7 @@ TEST(half, MixedPrecisionScalar) EXPECT_EQ(c_half, c_double); } -TEST(half, MixedPrecisionHost) +TEST(half, mixed_precision_host) { auto shape = gt::shape(3); gt::gtensor vh(shape, 4.0); @@ -273,7 +273,7 @@ TEST(half, MixedPrecisionHost) EXPECT_EQ(ref, rd); } -TEST(half, MixedPrecisionDevice) +TEST(half, mixed_precision_device) { auto shape = gt::shape(3); gt::gtensor vh(shape, 4.0); From e3bb3be6e01102af65719dd72773b51da58284d3 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 28 Aug 2023 14:08:02 +0200 Subject: [PATCH 28/38] [clean] Rename 1d by 1D in tests --- tests/test_half.cxx | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 3689ac46..0f97ced8 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -70,7 +70,7 @@ TEST(half, binary_comparison_operators) } template -void generic_fill_1d(gt::gtensor& x, const gt::half& fill_value) +void generic_fill_1D(gt::gtensor& x, const gt::half& fill_value) { auto k_x = x.to_kernel(); @@ -83,7 +83,7 @@ TEST(half, auto_init_host) gt::gtensor a(gt::shape(5), fill_value); gt::gtensor b(a.shape()); - generic_fill_1d(b, fill_value); + generic_fill_1D(b, fill_value); EXPECT_EQ(a, b); } @@ -94,12 +94,12 @@ TEST(half, auto_init_device) gt::gtensor a(gt::shape(5), fill_value); gt::gtensor b(a.shape()); - generic_fill_1d(b, fill_value); + generic_fill_1D(b, fill_value); EXPECT_EQ(a, b); } -void host_explicit_haxpy_1d(const gt::half& a, +void host_explicit_haxpy_1D(const gt::half& a, const gt::gtensor& x, gt::gtensor& y) { @@ -110,20 +110,20 @@ void host_explicit_haxpy_1d(const gt::half& a, y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } -TEST(half, haxpy_explicit_1d_host) +TEST(half, haxpy_explicit_1D_host) { gt::gtensor x(gt::shape(3), 1.5); gt::gtensor y(x.shape(), 2.5); gt::half a{0.5}; gt::gtensor ref(x.shape(), 3.25); - host_explicit_haxpy_1d(a, x, y); + host_explicit_haxpy_1D(a, x, y); EXPECT_EQ(y, ref); } template -void generic_explicit_haxpy_1d( const gt::half& a, +void generic_explicit_haxpy_1D( const gt::half& a, const gt::gtensor& x, gt::gtensor& y) { @@ -134,19 +134,19 @@ void generic_explicit_haxpy_1d( const 
gt::half& a, y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } -TEST(half, haxpy_explicit_1d_device) +TEST(half, haxpy_explicit_1D_device) { gt::gtensor x(gt::shape(3), 1.5); gt::gtensor y(x.shape(), 2.5); gt::half a{0.5}; gt::gtensor ref(y.shape(), 3.25); - generic_explicit_haxpy_1d(a, x, y); + generic_explicit_haxpy_1D(a, x, y); EXPECT_EQ(y, ref); } -TEST(half, haxpy_implicit_1d_host) +TEST(half, haxpy_implicit_1D_host) { gt::gtensor x(gt::shape(3), 1.5); gt::gtensor y(x.shape(), 2.5); @@ -158,7 +158,7 @@ TEST(half, haxpy_implicit_1d_host) EXPECT_EQ(y, ref); } -TEST(half, haxpy_implicit_1d_device) +TEST(half, haxpy_implicit_1D_device) { gt::gtensor x(gt::shape(3), 1.5); gt::gtensor y(x.shape(), 2.5); @@ -171,7 +171,7 @@ TEST(half, haxpy_implicit_1d_device) } template -void generic_explicit_custom_kernel_1d( const gt::half& s1, +void generic_explicit_custom_kernel_1D( const gt::half& s1, const gt::half& s2, const gt::gtensor& a, const gt::gtensor& b, @@ -219,10 +219,10 @@ TEST(half, custom_kernel_explicit_implicit_host_device) h_r_impl = s2 - h_e * ((h_a - s1 * h_b) / h_c + h_d); d_r_impl = s2 - d_e * ((d_a - s1 * d_b) / d_c + d_d); - generic_explicit_custom_kernel_1d(s1, s2, + generic_explicit_custom_kernel_1D(s1, s2, h_a, h_b, h_c, h_d, h_e, h_r_expl); - generic_explicit_custom_kernel_1d(s1, s2, + generic_explicit_custom_kernel_1D(s1, s2, d_a, d_b, d_c, d_d, d_e, d_r_expl); EXPECT_EQ(h_r_impl(2), r); From 996f052a247210e8a5dbe42a8c8aa5a531dbec40 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 28 Aug 2023 14:16:43 +0200 Subject: [PATCH 29/38] [clean] Helper function to reduce code duplication --- tests/test_half.cxx | 41 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 26 deletions(-) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 0f97ced8..2b0e7928 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -251,18 +251,19 @@ TEST(half, mixed_precision_scalar) EXPECT_EQ(c_half, c_double); } -TEST(half, mixed_precision_host) +template +void test_mixed_precision_helper() { auto shape = gt::shape(3); - gt::gtensor vh(shape, 4.0); - gt::gtensor vf(shape, 3.0); - gt::gtensor vd(shape, 2.0); + gt::gtensor vh(shape, 4.0); + gt::gtensor vf(shape, 3.0); + gt::gtensor vd(shape, 2.0); - gt::gtensor rh(shape); - gt::gtensor rf(shape); - gt::gtensor rd(shape); + gt::gtensor rh(shape); + gt::gtensor rf(shape); + gt::gtensor rd(shape); - gt::gtensor ref(shape, 10.0); + gt::gtensor ref(shape, 10.0); rh = (vh * vf) - (vh / vd); rf = (vh * vf) - (vh / vd); @@ -273,24 +274,12 @@ TEST(half, mixed_precision_host) EXPECT_EQ(ref, rd); } -TEST(half, mixed_precision_device) +TEST(half, mixed_precision_host) { - auto shape = gt::shape(3); - gt::gtensor vh(shape, 4.0); - gt::gtensor vf(shape, 3.0); - gt::gtensor vd(shape, 2.0); - - gt::gtensor rh(shape); - gt::gtensor rf(shape); - gt::gtensor rd(shape); - - gt::gtensor ref(shape, 10.0); - - rh = (vh * vf) - (vh / vd); - rf = (vh * vf) - (vh / vd); - rd = (vh * vf) - (vh / vd); + test_mixed_precision_helper(); +} - EXPECT_EQ(ref, rh); - EXPECT_EQ(ref, rf); - EXPECT_EQ(ref, rd); +TEST(half, mixed_precision_device) +{ + test_mixed_precision_helper(); } From 37bbab0151456dae3b2d7e4a33a98e614de175df Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Mon, 28 Aug 2023 14:30:42 +0200 Subject: [PATCH 30/38] [clean] Remove host-helper: covered by generic one --- tests/test_half.cxx | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 
2b0e7928..9705c381 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -99,14 +99,15 @@ TEST(half, auto_init_device) EXPECT_EQ(a, b); } -void host_explicit_haxpy_1D(const gt::half& a, - const gt::gtensor& x, - gt::gtensor& y) +template +void generic_explicit_haxpy_1D( const gt::half& a, + const gt::gtensor& x, + gt::gtensor& y) { auto k_x = x.to_kernel(); auto k_y = y.to_kernel(); - gt::launch_host<1>( + gt::launch<1, S>( y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } @@ -117,23 +118,11 @@ TEST(half, haxpy_explicit_1D_host) gt::half a{0.5}; gt::gtensor ref(x.shape(), 3.25); - host_explicit_haxpy_1D(a, x, y); + generic_explicit_haxpy_1D(a, x, y); EXPECT_EQ(y, ref); } -template -void generic_explicit_haxpy_1D( const gt::half& a, - const gt::gtensor& x, - gt::gtensor& y) -{ - auto k_x = x.to_kernel(); - auto k_y = y.to_kernel(); - - gt::launch<1, S>( - y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); -} - TEST(half, haxpy_explicit_1D_device) { gt::gtensor x(gt::shape(3), 1.5); From ad4968f1dcbdd7c361854b8fcc1fd05d3ed69817 Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Tue, 29 Aug 2023 15:28:10 +0200 Subject: [PATCH 31/38] [clean] Undef helper macros at end of file --- include/gtensor/half.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index f84e9858..ef49ad21 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -93,11 +93,11 @@ PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>=, double); std::ostream& operator<<(std::ostream& s, const half& h) { s << static_cast(h.Get()); return s; } +} // namespace gt + #undef PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR #undef PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR #undef PROVIDE_HALF_COMPARISON_OPERATOR #undef PROVIDE_MIXED_HALF_COMPARISON_OPERATOR -} // namespace gt - #endif From dffc73076a7da1c28a1e86160daa84faa4d9cddf Mon Sep 17 00:00:00 2001 From: Carl-Martin Pfeiler Date: Tue, 29 Aug 2023 18:08:36 +0200 Subject: [PATCH 32/38] [fix] Keep cuda_fp16.h out of default incl. chain + TODO Alternative gt::half storage_type (e.g., _Float16) --- include/gtensor/half.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index ef49ad21..a9dc4b4f 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -2,7 +2,15 @@ #define GTENSOR_HALF_H #include + +#if __has_include() #include +#define GTENSOR_FP16_CUDA_HEADER +#elif 0 // TODO check if other fp16 type available, e.g., _Float16 +#else +#error "No half precision floating point type available." +#endif + #include namespace gt @@ -11,7 +19,15 @@ namespace gt // ====================================================================== // half -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) +#if defined(GTENSOR_FP16_CUDA_HEADER) +using storage_type = __half; +#else +#error "No half precision floating point type available." 
+#endif + +#if defined(GTENSOR_FP16_CUDA_HEADER) \ + && defined(__CUDA_ARCH__) \ + && (__CUDA_ARCH__ >= 530) using compute_type = __half; #else using compute_type = float; @@ -22,12 +38,12 @@ class half public: half() = default; GT_INLINE half(float x) : x(x) {}; - GT_INLINE half(__half x) : x(x) {}; + GT_INLINE half(storage_type x) : x(x) {}; GT_INLINE const half& operator=(const float f) { x = f; return *this; } GT_INLINE compute_type Get() const { return static_cast(x); } private: - __half x; + storage_type x; }; #define PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(op) \ From 0e79cd026bce46c99583d661effa1c9a0fc1a058 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Wed, 30 Aug 2023 12:41:59 +0200 Subject: [PATCH 33/38] [build] Added cmake option GTENSOR_ENABLE_FP16 --- CMakeLists.txt | 8 ++++++++ include/gtensor/gtensor.h | 4 ++++ include/gtensor/half.h | 4 ++-- tests/CMakeLists.txt | 5 ++++- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b58305a..ca21e8cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,6 +51,7 @@ option(GTENSOR_ALLOCATOR_CACHING "Enable naive caching allocators" ON) option(GTENSOR_BOUNDS_CHECK "Enable per access bounds checking" OFF) option(GTENSOR_ADDRESS_CHECK "Enable address checking for device spans" OFF) option(GTENSOR_SYNC_KERNELS "Enable host sync after assign and launch kernels" OFF) +option(GTENSOR_ENABLE_FP16 "Enable 16-bit floating point type gt::half" OFF) if (GTENSOR_ENABLE_FORTRAN) # do this early (here) since later the `enable_language(Fortran)` gives me trouble @@ -335,6 +336,13 @@ else() message(STATUS "${PROJECT_NAME}: sync kernels is OFF") endif() +if (GTENSOR_ENABLE_FP16) + message(STATUS "${PROJECT_NAME}: gt::half is ENABLED") + target_compile_definitions(gtensor_${GTENSOR_DEVICE} + INTERFACE GTENSOR_ENABLE_FP16) +endif() + + target_compile_definitions(gtensor_${GTENSOR_DEVICE} INTERFACE GTENSOR_MANAGED_MEMORY_TYPE_DEFAULT=${GTENSOR_MANAGED_MEMORY_TYPE_DEFAULT}) message(STATUS "${PROJECT_NAME}: default managed memory type '${GTENSOR_MANAGED_MEMORY_TYPE_DEFAULT}'") diff --git a/include/gtensor/gtensor.h b/include/gtensor/gtensor.h index 507f0d20..70dee7d3 100644 --- a/include/gtensor/gtensor.h +++ b/include/gtensor/gtensor.h @@ -19,6 +19,10 @@ #include "operator.h" #include "space.h" +#if defined(GTENSOR_ENABLE_FP16) +#include "half.h" +#endif + namespace gt { diff --git a/include/gtensor/half.h b/include/gtensor/half.h index a9dc4b4f..6b29c1fe 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -8,7 +8,7 @@ #define GTENSOR_FP16_CUDA_HEADER #elif 0 // TODO check if other fp16 type available, e.g., _Float16 #else -#error "No half precision floating point type available." +#error "GTENSOR_ENABLE_FP16=ON, but no 16-bit FP type available!" #endif #include @@ -22,7 +22,7 @@ namespace gt #if defined(GTENSOR_FP16_CUDA_HEADER) using storage_type = __half; #else -#error "No half precision floating point type available." +#error "GTENSOR_ENABLE_FP16=ON, but no 16-bit FP type available!" 
#endif #if defined(GTENSOR_FP16_CUDA_HEADER) \ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 90538c02..41a4f10d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -43,7 +43,6 @@ add_gtensor_test(test_space) add_gtensor_test(test_stream) add_gtensor_test(test_gtest_predicates) add_gtensor_test(test_sparse) -add_gtensor_test(test_half) if (GTENSOR_ENABLE_CLIB) add_executable(test_clib) @@ -81,3 +80,7 @@ if (GTENSOR_ENABLE_FFT) add_gtensor_test(test_fft) target_link_libraries(test_fft gtfft) endif() + +if (GTENSOR_ENABLE_FP16) + add_gtensor_test(test_half) +endif() From 1708453e68a30dfc3a45878b33bcdca5cdc22b03 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Thu, 7 Sep 2023 14:11:39 +0200 Subject: [PATCH 34/38] [rename] Change gt::half to gt::float16_t --- include/gtensor/half.h | 114 +++++++++++++-------------- tests/test_half.cxx | 170 ++++++++++++++++++++--------------------- 2 files changed, 142 insertions(+), 142 deletions(-) diff --git a/include/gtensor/half.h b/include/gtensor/half.h index 6b29c1fe..5e21701f 100644 --- a/include/gtensor/half.h +++ b/include/gtensor/half.h @@ -1,5 +1,5 @@ -#ifndef GTENSOR_HALF_H -#define GTENSOR_HALF_H +#ifndef GTENSOR_FLOAT16T_H +#define GTENSOR_FLOAT16T_H #include @@ -17,7 +17,7 @@ namespace gt { // ====================================================================== -// half +// float16_t #if defined(GTENSOR_FP16_CUDA_HEADER) using storage_type = __half; @@ -33,87 +33,87 @@ using compute_type = __half; using compute_type = float; #endif -class half +class float16_t { public: - half() = default; - GT_INLINE half(float x) : x(x) {}; - GT_INLINE half(storage_type x) : x(x) {}; + float16_t() = default; + GT_INLINE float16_t(float x) : x(x) {}; + GT_INLINE float16_t(storage_type x) : x(x) {}; - GT_INLINE const half& operator=(const float f) { x = f; return *this; } + GT_INLINE const float16_t& operator=(const float f) { x = f; return *this; } GT_INLINE compute_type Get() const { return static_cast(x); } private: storage_type x; }; -#define PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(op) \ - GT_INLINE half operator op(const half& lhs, const half& rhs) \ - { return half( lhs.Get() op rhs.Get() ); } +#define PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(op) \ + GT_INLINE float16_t operator op(const float16_t& lhs, const float16_t& rhs) \ + { return float16_t( lhs.Get() op rhs.Get() ); } -#define PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(op, fp_type) \ +#define PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(op, fp_type) \ \ - GT_INLINE fp_type operator op(const half& lhs, const fp_type& rhs) \ + GT_INLINE fp_type operator op(const float16_t& lhs, const fp_type& rhs) \ { return static_cast(lhs.Get()) op rhs; } \ \ - GT_INLINE fp_type operator op(const fp_type& lhs, const half& rhs) \ + GT_INLINE fp_type operator op(const fp_type& lhs, const float16_t& rhs) \ { return lhs op static_cast(rhs.Get()); } -PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(+); -PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(-); -PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(*); -PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR(/); +PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(+); +PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(-); +PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(*); +PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(/); -PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(+, float); -PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(-, float); -PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(*, float); -PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(/, float); 
+PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(+, float); +PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(-, float); +PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(*, float); +PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(/, float); -PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(+, double); -PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(-, double); -PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(*, double); -PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR(/, double); +PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(+, double); +PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(-, double); +PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(*, double); +PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(/, double); -#define PROVIDE_HALF_COMPARISON_OPERATOR(op) \ - GT_INLINE bool operator op(const half& lhs, const half& rhs) \ +#define PROVIDE_FLOAT16T_COMPARISON_OPERATOR(op) \ + GT_INLINE bool operator op(const float16_t& lhs, const float16_t& rhs) \ { return lhs.Get() op rhs.Get(); } -#define PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(op, fp_type) \ +#define PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(op, fp_type) \ \ - GT_INLINE bool operator op(const half& lhs, const fp_type& rhs) \ + GT_INLINE bool operator op(const float16_t& lhs, const fp_type& rhs) \ { return static_cast(lhs.Get()) op rhs; } \ \ - GT_INLINE bool operator op(const fp_type& lhs, const half& rhs) \ + GT_INLINE bool operator op(const fp_type& lhs, const float16_t& rhs) \ { return lhs op static_cast(rhs.Get()); } -PROVIDE_HALF_COMPARISON_OPERATOR(==); -PROVIDE_HALF_COMPARISON_OPERATOR(!=); -PROVIDE_HALF_COMPARISON_OPERATOR(<); -PROVIDE_HALF_COMPARISON_OPERATOR(<=); -PROVIDE_HALF_COMPARISON_OPERATOR(>); -PROVIDE_HALF_COMPARISON_OPERATOR(>=); - -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(==, float); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(!=, float); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(<, float); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(<=, float); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>, float); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>=, float); - -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(==, double); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(!=, double); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(<, double); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(<=, double); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>, double); -PROVIDE_MIXED_HALF_COMPARISON_OPERATOR(>=, double); - -std::ostream& operator<<(std::ostream& s, const half& h) +PROVIDE_FLOAT16T_COMPARISON_OPERATOR(==); +PROVIDE_FLOAT16T_COMPARISON_OPERATOR(!=); +PROVIDE_FLOAT16T_COMPARISON_OPERATOR(<); +PROVIDE_FLOAT16T_COMPARISON_OPERATOR(<=); +PROVIDE_FLOAT16T_COMPARISON_OPERATOR(>); +PROVIDE_FLOAT16T_COMPARISON_OPERATOR(>=); + +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(==, float); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(!=, float); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(<, float); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(<=, float); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(>, float); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(>=, float); + +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(==, double); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(!=, double); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(<, double); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(<=, double); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(>, double); +PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(>=, double); + +std::ostream& operator<<(std::ostream& s, const float16_t& h) { s << static_cast(h.Get()); return s; } } // namespace gt -#undef PROVIDE_HALF_BINARY_ARITHMETIC_OPERATOR -#undef 
PROVIDE_MIXED_HALF_BINARY_ARITHMETIC_OPERATOR -#undef PROVIDE_HALF_COMPARISON_OPERATOR -#undef PROVIDE_MIXED_HALF_COMPARISON_OPERATOR +#undef PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR +#undef PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR +#undef PROVIDE_FLOAT16T_COMPARISON_OPERATOR +#undef PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR #endif diff --git a/tests/test_half.cxx b/tests/test_half.cxx index 9705c381..4cc8f5b7 100644 --- a/tests/test_half.cxx +++ b/tests/test_half.cxx @@ -4,13 +4,13 @@ #include -TEST(half, scalar_arithmetic) +TEST(float16_t, scalar_arithmetic) { - gt::half a{1.0}; - gt::half b{2.0}; + gt::float16_t a{1.0}; + gt::float16_t b{2.0}; - gt::half c{0.0}; - gt::half ref{0.0}; + gt::float16_t c{0.0}; + gt::float16_t ref{0.0}; c = a + b; ref = 3.0; @@ -29,11 +29,11 @@ TEST(half, scalar_arithmetic) EXPECT_EQ(c, ref); } -TEST(half, binary_comparison_operators) +TEST(float16_t, binary_comparison_operators) { - gt::half a{1.0}; - gt::half b{2.0}; - gt::half c{2.0}; + gt::float16_t a{1.0}; + gt::float16_t b{2.0}; + gt::float16_t c{2.0}; EXPECT_EQ(a, a); EXPECT_EQ(b, b); @@ -70,29 +70,29 @@ TEST(half, binary_comparison_operators) } template -void generic_fill_1D(gt::gtensor& x, const gt::half& fill_value) +void generic_fill_1D(gt::gtensor& x, const gt::float16_t& fill_value) { auto k_x = x.to_kernel(); gt::launch<1, S>(x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); } -TEST(half, auto_init_host) +TEST(float16_t, auto_init_host) { - gt::half fill_value{1.25}; - gt::gtensor a(gt::shape(5), fill_value); - gt::gtensor b(a.shape()); + gt::float16_t fill_value{1.25}; + gt::gtensor a(gt::shape(5), fill_value); + gt::gtensor b(a.shape()); generic_fill_1D(b, fill_value); EXPECT_EQ(a, b); } -TEST(half, auto_init_device) +TEST(float16_t, auto_init_device) { - gt::half fill_value{1.25}; - gt::gtensor a(gt::shape(5), fill_value); - gt::gtensor b(a.shape()); + gt::float16_t fill_value{1.25}; + gt::gtensor a(gt::shape(5), fill_value); + gt::gtensor b(a.shape()); generic_fill_1D(b, fill_value); @@ -100,9 +100,9 @@ TEST(half, auto_init_device) } template -void generic_explicit_haxpy_1D( const gt::half& a, - const gt::gtensor& x, - gt::gtensor& y) +void generic_explicit_haxpy_1D( const gt::float16_t& a, + const gt::gtensor& x, + gt::gtensor& y) { auto k_x = x.to_kernel(); auto k_y = y.to_kernel(); @@ -111,48 +111,48 @@ void generic_explicit_haxpy_1D( const gt::half& a, y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } -TEST(half, haxpy_explicit_1D_host) +TEST(float16_t, haxpy_explicit_1D_host) { - gt::gtensor x(gt::shape(3), 1.5); - gt::gtensor y(x.shape(), 2.5); - gt::half a{0.5}; - gt::gtensor ref(x.shape(), 3.25); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::float16_t a{0.5}; + gt::gtensor ref(x.shape(), 3.25); generic_explicit_haxpy_1D(a, x, y); EXPECT_EQ(y, ref); } -TEST(half, haxpy_explicit_1D_device) +TEST(float16_t, haxpy_explicit_1D_device) { - gt::gtensor x(gt::shape(3), 1.5); - gt::gtensor y(x.shape(), 2.5); - gt::half a{0.5}; - gt::gtensor ref(y.shape(), 3.25); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::float16_t a{0.5}; + gt::gtensor ref(y.shape(), 3.25); generic_explicit_haxpy_1D(a, x, y); EXPECT_EQ(y, ref); } -TEST(half, haxpy_implicit_1D_host) +TEST(float16_t, haxpy_implicit_1D_host) { - gt::gtensor x(gt::shape(3), 1.5); - gt::gtensor y(x.shape(), 2.5); - gt::half a{0.5}; - gt::gtensor ref(x.shape(), 3.25); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::float16_t 
a{0.5}; + gt::gtensor ref(x.shape(), 3.25); y = a * x + y; EXPECT_EQ(y, ref); } -TEST(half, haxpy_implicit_1D_device) +TEST(float16_t, haxpy_implicit_1D_device) { - gt::gtensor x(gt::shape(3), 1.5); - gt::gtensor y(x.shape(), 2.5); - gt::half a{0.5}; - gt::gtensor ref(y.shape(), 3.25); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::float16_t a{0.5}; + gt::gtensor ref(y.shape(), 3.25); y = a * x + y; @@ -160,14 +160,14 @@ TEST(half, haxpy_implicit_1D_device) } template -void generic_explicit_custom_kernel_1D( const gt::half& s1, - const gt::half& s2, - const gt::gtensor& a, - const gt::gtensor& b, - const gt::gtensor& c, - const gt::gtensor& d, - const gt::gtensor& e, - gt::gtensor& result) +void generic_explicit_custom_kernel_1D( const gt::float16_t& s1, + const gt::float16_t& s2, + const gt::gtensor& a, + const gt::gtensor& b, + const gt::gtensor& c, + const gt::gtensor& d, + const gt::gtensor& e, + gt::gtensor& result) { auto k_a = a.to_kernel(); auto k_b = b.to_kernel(); @@ -180,30 +180,30 @@ void generic_explicit_custom_kernel_1D( const gt::half& s1, { k_r(i) = s2 - k_e(i) * ((k_a(i) - s1 * k_b(i)) / k_c(i) + k_d(i)); }); } -TEST(half, custom_kernel_explicit_implicit_host_device) +TEST(float16_t, custom_kernel_explicit_implicit_host_device) { - gt::half a_val{12.34}, b_val{2.345}, c_val{0.987}, d_val{0.67}, e_val{3.14}; - gt::half s1{0.1}, s2{4.56}; + gt::float16_t a_val{12.34}, b_val{2.345}, c_val{0.987}, d_val{0.67}, e_val{3.14}; + gt::float16_t s1{0.1}, s2{4.56}; - gt::half r = s2 - e_val * ((a_val - s1 * b_val) / c_val + d_val); + gt::float16_t r = s2 - e_val * ((a_val - s1 * b_val) / c_val + d_val); auto shape = gt::shape(3); - gt::gtensor h_a(shape, a_val); - gt::gtensor h_b(shape, b_val); - gt::gtensor h_c(shape, c_val); - gt::gtensor h_d(shape, d_val); - gt::gtensor h_e(shape, e_val); - gt::gtensor h_r_expl(shape); - gt::gtensor h_r_impl(shape); - - gt::gtensor d_a(shape, a_val); - gt::gtensor d_b(shape, b_val); - gt::gtensor d_c(shape, c_val); - gt::gtensor d_d(shape, d_val); - gt::gtensor d_e(shape, e_val); - gt::gtensor d_r_expl(shape); - gt::gtensor d_r_impl(shape); + gt::gtensor h_a(shape, a_val); + gt::gtensor h_b(shape, b_val); + gt::gtensor h_c(shape, c_val); + gt::gtensor h_d(shape, d_val); + gt::gtensor h_e(shape, e_val); + gt::gtensor h_r_expl(shape); + gt::gtensor h_r_impl(shape); + + gt::gtensor d_a(shape, a_val); + gt::gtensor d_b(shape, b_val); + gt::gtensor d_c(shape, c_val); + gt::gtensor d_d(shape, d_val); + gt::gtensor d_e(shape, e_val); + gt::gtensor d_r_expl(shape); + gt::gtensor d_r_impl(shape); h_r_impl = s2 - h_e * ((h_a - s1 * h_b) / h_c + h_d); d_r_impl = s2 - d_e * ((d_a - s1 * d_b) / d_c + d_d); @@ -220,35 +220,35 @@ TEST(half, custom_kernel_explicit_implicit_host_device) EXPECT_EQ(h_r_impl, d_r_impl); } -TEST(half, mixed_precision_scalar) +TEST(float16_t, mixed_precision_scalar) { - gt::half a_half{1.0}; + gt::float16_t a_16{1.0}; - gt::half b_half{2.0}; - float b_float{2.0}; - double b_double{2.0}; + gt::float16_t b_16{2.0}; + float b_32{2.0}; + double b_64{2.0}; - auto c_half = a_half + b_half; - auto c_float = a_half + b_float; - auto c_double = a_half + b_double; + auto c_16 = a_16 + b_16; + auto c_32 = a_16 + b_32; + auto c_64 = a_16 + b_64; - EXPECT_TRUE((std::is_same::value)); - EXPECT_TRUE((std::is_same::value)); - EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); - EXPECT_EQ(c_half, c_float); - EXPECT_EQ(c_half, 
c_double); + EXPECT_EQ(c_16, c_32); + EXPECT_EQ(c_16, c_64); } template void test_mixed_precision_helper() { auto shape = gt::shape(3); - gt::gtensor vh(shape, 4.0); + gt::gtensor vh(shape, 4.0); gt::gtensor vf(shape, 3.0); gt::gtensor vd(shape, 2.0); - gt::gtensor rh(shape); + gt::gtensor rh(shape); gt::gtensor rf(shape); gt::gtensor rd(shape); @@ -263,12 +263,12 @@ void test_mixed_precision_helper() EXPECT_EQ(ref, rd); } -TEST(half, mixed_precision_host) +TEST(float16_t, mixed_precision_host) { test_mixed_precision_helper(); } -TEST(half, mixed_precision_device) +TEST(float16_t, mixed_precision_device) { test_mixed_precision_helper(); } From f014cce09e50cad41271cff3cd141e5d076cfe14 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Thu, 7 Sep 2023 14:20:39 +0200 Subject: [PATCH 35/38] [rename] Filenames & includes: half --> float16_t --- include/gtensor/{half.h => float16_t.h} | 0 include/gtensor/gtensor.h | 2 +- tests/CMakeLists.txt | 2 +- tests/{test_half.cxx => test_float16_t.cxx} | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename include/gtensor/{half.h => float16_t.h} (100%) rename tests/{test_half.cxx => test_float16_t.cxx} (99%) diff --git a/include/gtensor/half.h b/include/gtensor/float16_t.h similarity index 100% rename from include/gtensor/half.h rename to include/gtensor/float16_t.h diff --git a/include/gtensor/gtensor.h b/include/gtensor/gtensor.h index 70dee7d3..feac235d 100644 --- a/include/gtensor/gtensor.h +++ b/include/gtensor/gtensor.h @@ -20,7 +20,7 @@ #include "space.h" #if defined(GTENSOR_ENABLE_FP16) -#include "half.h" +#include "float16_t.h" #endif namespace gt diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 41a4f10d..b68cdeee 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -82,5 +82,5 @@ if (GTENSOR_ENABLE_FFT) endif() if (GTENSOR_ENABLE_FP16) - add_gtensor_test(test_half) + add_gtensor_test(test_float16_t) endif() diff --git a/tests/test_half.cxx b/tests/test_float16_t.cxx similarity index 99% rename from tests/test_half.cxx rename to tests/test_float16_t.cxx index 4cc8f5b7..b8687595 100644 --- a/tests/test_half.cxx +++ b/tests/test_float16_t.cxx @@ -2,7 +2,7 @@ #include -#include +#include TEST(float16_t, scalar_arithmetic) { From 278d552b71462f07922ad1d75622fe0b45fdc77f Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Thu, 7 Sep 2023 14:40:14 +0200 Subject: [PATCH 36/38] [clean] Remove macros.h include --- include/gtensor/float16_t.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/gtensor/float16_t.h b/include/gtensor/float16_t.h index 5e21701f..26a4f367 100644 --- a/include/gtensor/float16_t.h +++ b/include/gtensor/float16_t.h @@ -11,8 +11,6 @@ #error "GTENSOR_ENABLE_FP16=ON, but no 16-bit FP type available!" #endif -#include - namespace gt { From f53673fa8257b767f05f7e3e317e6186c78a1e47 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Thu, 7 Sep 2023 14:50:10 +0200 Subject: [PATCH 37/38] [clean] Applied clang-format --- include/gtensor/float16_t.h | 83 +++++---- tests/test_float16_t.cxx | 361 ++++++++++++++++++------------------ 2 files changed, 234 insertions(+), 210 deletions(-) diff --git a/include/gtensor/float16_t.h b/include/gtensor/float16_t.h index 26a4f367..1b65f83d 100644 --- a/include/gtensor/float16_t.h +++ b/include/gtensor/float16_t.h @@ -23,9 +23,8 @@ using storage_type = __half; #error "GTENSOR_ENABLE_FP16=ON, but no 16-bit FP type available!" 
#endif -#if defined(GTENSOR_FP16_CUDA_HEADER) \ - && defined(__CUDA_ARCH__) \ - && (__CUDA_ARCH__ >= 530) +#if defined(GTENSOR_FP16_CUDA_HEADER) && defined(__CUDA_ARCH__) && \ + (__CUDA_ARCH__ >= 530) using compute_type = __half; #else using compute_type = float; @@ -34,27 +33,38 @@ using compute_type = float; class float16_t { public: - float16_t() = default; - GT_INLINE float16_t(float x) : x(x) {}; - GT_INLINE float16_t(storage_type x) : x(x) {}; + float16_t() = default; + GT_INLINE float16_t(float x) : x(x){}; + GT_INLINE float16_t(storage_type x) : x(x){}; + + GT_INLINE const float16_t& operator=(const float f) + { + x = f; + return *this; + } + GT_INLINE compute_type Get() const { return static_cast(x); } - GT_INLINE const float16_t& operator=(const float f) { x = f; return *this; } - GT_INLINE compute_type Get() const { return static_cast(x); } private: - storage_type x; + storage_type x; }; -#define PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(op) \ - GT_INLINE float16_t operator op(const float16_t& lhs, const float16_t& rhs) \ - { return float16_t( lhs.Get() op rhs.Get() ); } - -#define PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(op, fp_type) \ - \ - GT_INLINE fp_type operator op(const float16_t& lhs, const fp_type& rhs) \ - { return static_cast(lhs.Get()) op rhs; } \ - \ - GT_INLINE fp_type operator op(const fp_type& lhs, const float16_t& rhs) \ - { return lhs op static_cast(rhs.Get()); } +#define PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(op) \ + GT_INLINE float16_t operator op(const float16_t& lhs, const float16_t& rhs) \ + { \ + return float16_t(lhs.Get() op rhs.Get()); \ + } + +#define PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(op, fp_type) \ + \ + GT_INLINE fp_type operator op(const float16_t& lhs, const fp_type& rhs) \ + { \ + return static_cast(lhs.Get()) op rhs; \ + } \ + \ + GT_INLINE fp_type operator op(const fp_type& lhs, const float16_t& rhs) \ + { \ + return lhs op static_cast(rhs.Get()); \ + } PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(+); PROVIDE_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(-); @@ -71,17 +81,23 @@ PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(-, double); PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(*, double); PROVIDE_MIXED_FLOAT16T_BINARY_ARITHMETIC_OPERATOR(/, double); -#define PROVIDE_FLOAT16T_COMPARISON_OPERATOR(op) \ - GT_INLINE bool operator op(const float16_t& lhs, const float16_t& rhs) \ - { return lhs.Get() op rhs.Get(); } - -#define PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(op, fp_type) \ - \ - GT_INLINE bool operator op(const float16_t& lhs, const fp_type& rhs) \ - { return static_cast(lhs.Get()) op rhs; } \ - \ - GT_INLINE bool operator op(const fp_type& lhs, const float16_t& rhs) \ - { return lhs op static_cast(rhs.Get()); } +#define PROVIDE_FLOAT16T_COMPARISON_OPERATOR(op) \ + GT_INLINE bool operator op(const float16_t& lhs, const float16_t& rhs) \ + { \ + return lhs.Get() op rhs.Get(); \ + } + +#define PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(op, fp_type) \ + \ + GT_INLINE bool operator op(const float16_t& lhs, const fp_type& rhs) \ + { \ + return static_cast(lhs.Get()) op rhs; \ + } \ + \ + GT_INLINE bool operator op(const fp_type& lhs, const float16_t& rhs) \ + { \ + return lhs op static_cast(rhs.Get()); \ + } PROVIDE_FLOAT16T_COMPARISON_OPERATOR(==); PROVIDE_FLOAT16T_COMPARISON_OPERATOR(!=); @@ -105,7 +121,10 @@ PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(>, double); PROVIDE_MIXED_FLOAT16T_COMPARISON_OPERATOR(>=, double); std::ostream& operator<<(std::ostream& s, const float16_t& h) -{ s << static_cast(h.Get()); 
return s; } +{ + s << static_cast(h.Get()); + return s; +} } // namespace gt diff --git a/tests/test_float16_t.cxx b/tests/test_float16_t.cxx index b8687595..469b4725 100644 --- a/tests/test_float16_t.cxx +++ b/tests/test_float16_t.cxx @@ -6,269 +6,274 @@ TEST(float16_t, scalar_arithmetic) { - gt::float16_t a{1.0}; - gt::float16_t b{2.0}; + gt::float16_t a{1.0}; + gt::float16_t b{2.0}; - gt::float16_t c{0.0}; - gt::float16_t ref{0.0}; + gt::float16_t c{0.0}; + gt::float16_t ref{0.0}; - c = a + b; - ref = 3.0; - EXPECT_EQ(c, ref); + c = a + b; + ref = 3.0; + EXPECT_EQ(c, ref); - c = a - b; - ref = -1.0; - EXPECT_EQ(c, ref); + c = a - b; + ref = -1.0; + EXPECT_EQ(c, ref); - c = a * b; - ref = 2.0; - EXPECT_EQ(c, ref); + c = a * b; + ref = 2.0; + EXPECT_EQ(c, ref); - c = a / b; - ref = 0.5; - EXPECT_EQ(c, ref); + c = a / b; + ref = 0.5; + EXPECT_EQ(c, ref); } TEST(float16_t, binary_comparison_operators) { - gt::float16_t a{1.0}; - gt::float16_t b{2.0}; - gt::float16_t c{2.0}; - - EXPECT_EQ(a, a); - EXPECT_EQ(b, b); - EXPECT_EQ(b, c); - EXPECT_EQ(c, b); - EXPECT_EQ(c, c); - - EXPECT_NE(a, b); - EXPECT_NE(a, c); - EXPECT_NE(b, a); - EXPECT_NE(c, a); - - EXPECT_LT(a, b); - EXPECT_LT(a, c); - - EXPECT_LE(a, a); - EXPECT_LE(a, b); - EXPECT_LE(a, c); - EXPECT_LE(b, b); - EXPECT_LE(b, c); - EXPECT_LE(c, b); - EXPECT_LE(c, c); - - EXPECT_GT(b, a); - EXPECT_GT(c, a); - - EXPECT_GE(a, a); - EXPECT_GE(b, a); - EXPECT_GE(b, b); - EXPECT_GE(b, c); - EXPECT_GE(c, a); - EXPECT_GE(c, b); - EXPECT_GE(c, c); + gt::float16_t a{1.0}; + gt::float16_t b{2.0}; + gt::float16_t c{2.0}; + + EXPECT_EQ(a, a); + EXPECT_EQ(b, b); + EXPECT_EQ(b, c); + EXPECT_EQ(c, b); + EXPECT_EQ(c, c); + + EXPECT_NE(a, b); + EXPECT_NE(a, c); + EXPECT_NE(b, a); + EXPECT_NE(c, a); + + EXPECT_LT(a, b); + EXPECT_LT(a, c); + + EXPECT_LE(a, a); + EXPECT_LE(a, b); + EXPECT_LE(a, c); + EXPECT_LE(b, b); + EXPECT_LE(b, c); + EXPECT_LE(c, b); + EXPECT_LE(c, c); + + EXPECT_GT(b, a); + EXPECT_GT(c, a); + + EXPECT_GE(a, a); + EXPECT_GE(b, a); + EXPECT_GE(b, b); + EXPECT_GE(b, c); + EXPECT_GE(c, a); + EXPECT_GE(c, b); + EXPECT_GE(c, c); } template -void generic_fill_1D(gt::gtensor& x, const gt::float16_t& fill_value) +void generic_fill_1D(gt::gtensor& x, + const gt::float16_t& fill_value) { - auto k_x = x.to_kernel(); + auto k_x = x.to_kernel(); - gt::launch<1, S>(x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); + gt::launch<1, S>( + x.shape(), GT_LAMBDA(int i) { k_x(i) = fill_value; }); } TEST(float16_t, auto_init_host) { - gt::float16_t fill_value{1.25}; - gt::gtensor a(gt::shape(5), fill_value); - gt::gtensor b(a.shape()); + gt::float16_t fill_value{1.25}; + gt::gtensor a(gt::shape(5), fill_value); + gt::gtensor b(a.shape()); - generic_fill_1D(b, fill_value); + generic_fill_1D(b, fill_value); - EXPECT_EQ(a, b); + EXPECT_EQ(a, b); } TEST(float16_t, auto_init_device) { - gt::float16_t fill_value{1.25}; - gt::gtensor a(gt::shape(5), fill_value); - gt::gtensor b(a.shape()); + gt::float16_t fill_value{1.25}; + gt::gtensor a(gt::shape(5), fill_value); + gt::gtensor b(a.shape()); - generic_fill_1D(b, fill_value); + generic_fill_1D(b, fill_value); - EXPECT_EQ(a, b); + EXPECT_EQ(a, b); } template -void generic_explicit_haxpy_1D( const gt::float16_t& a, - const gt::gtensor& x, - gt::gtensor& y) +void generic_explicit_haxpy_1D(const gt::float16_t& a, + const gt::gtensor& x, + gt::gtensor& y) { - auto k_x = x.to_kernel(); - auto k_y = y.to_kernel(); + auto k_x = x.to_kernel(); + auto k_y = y.to_kernel(); - gt::launch<1, S>( - y.shape(), GT_LAMBDA(int i) { 
k_y(i) = k_y(i) + a * k_x(i); }); + gt::launch<1, S>( + y.shape(), GT_LAMBDA(int i) { k_y(i) = k_y(i) + a * k_x(i); }); } TEST(float16_t, haxpy_explicit_1D_host) { - gt::gtensor x(gt::shape(3), 1.5); - gt::gtensor y(x.shape(), 2.5); - gt::float16_t a{0.5}; - gt::gtensor ref(x.shape(), 3.25); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::float16_t a{0.5}; + gt::gtensor ref(x.shape(), 3.25); - generic_explicit_haxpy_1D(a, x, y); + generic_explicit_haxpy_1D(a, x, y); - EXPECT_EQ(y, ref); + EXPECT_EQ(y, ref); } TEST(float16_t, haxpy_explicit_1D_device) { - gt::gtensor x(gt::shape(3), 1.5); - gt::gtensor y(x.shape(), 2.5); - gt::float16_t a{0.5}; - gt::gtensor ref(y.shape(), 3.25); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::float16_t a{0.5}; + gt::gtensor ref(y.shape(), 3.25); - generic_explicit_haxpy_1D(a, x, y); + generic_explicit_haxpy_1D(a, x, y); - EXPECT_EQ(y, ref); + EXPECT_EQ(y, ref); } TEST(float16_t, haxpy_implicit_1D_host) { - gt::gtensor x(gt::shape(3), 1.5); - gt::gtensor y(x.shape(), 2.5); - gt::float16_t a{0.5}; - gt::gtensor ref(x.shape(), 3.25); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::float16_t a{0.5}; + gt::gtensor ref(x.shape(), 3.25); - y = a * x + y; + y = a * x + y; - EXPECT_EQ(y, ref); + EXPECT_EQ(y, ref); } TEST(float16_t, haxpy_implicit_1D_device) { - gt::gtensor x(gt::shape(3), 1.5); - gt::gtensor y(x.shape(), 2.5); - gt::float16_t a{0.5}; - gt::gtensor ref(y.shape(), 3.25); + gt::gtensor x(gt::shape(3), 1.5); + gt::gtensor y(x.shape(), 2.5); + gt::float16_t a{0.5}; + gt::gtensor ref(y.shape(), 3.25); - y = a * x + y; + y = a * x + y; - EXPECT_EQ(y, ref); + EXPECT_EQ(y, ref); } template -void generic_explicit_custom_kernel_1D( const gt::float16_t& s1, - const gt::float16_t& s2, - const gt::gtensor& a, - const gt::gtensor& b, - const gt::gtensor& c, - const gt::gtensor& d, - const gt::gtensor& e, - gt::gtensor& result) +void generic_explicit_custom_kernel_1D( + const gt::float16_t& s1, const gt::float16_t& s2, + const gt::gtensor& a, + const gt::gtensor& b, + const gt::gtensor& c, + const gt::gtensor& d, + const gt::gtensor& e, + gt::gtensor& result) { - auto k_a = a.to_kernel(); - auto k_b = b.to_kernel(); - auto k_c = c.to_kernel(); - auto k_d = d.to_kernel(); - auto k_e = e.to_kernel(); - auto k_r = result.to_kernel(); - - gt::launch<1, S>(result.shape(), GT_LAMBDA(int i) - { k_r(i) = s2 - k_e(i) * ((k_a(i) - s1 * k_b(i)) / k_c(i) + k_d(i)); }); + auto k_a = a.to_kernel(); + auto k_b = b.to_kernel(); + auto k_c = c.to_kernel(); + auto k_d = d.to_kernel(); + auto k_e = e.to_kernel(); + auto k_r = result.to_kernel(); + + gt::launch<1, S>( + result.shape(), GT_LAMBDA(int i) { + k_r(i) = s2 - k_e(i) * ((k_a(i) - s1 * k_b(i)) / k_c(i) + k_d(i)); + }); } TEST(float16_t, custom_kernel_explicit_implicit_host_device) { - gt::float16_t a_val{12.34}, b_val{2.345}, c_val{0.987}, d_val{0.67}, e_val{3.14}; - gt::float16_t s1{0.1}, s2{4.56}; - - gt::float16_t r = s2 - e_val * ((a_val - s1 * b_val) / c_val + d_val); - - auto shape = gt::shape(3); - - gt::gtensor h_a(shape, a_val); - gt::gtensor h_b(shape, b_val); - gt::gtensor h_c(shape, c_val); - gt::gtensor h_d(shape, d_val); - gt::gtensor h_e(shape, e_val); - gt::gtensor h_r_expl(shape); - gt::gtensor h_r_impl(shape); - - gt::gtensor d_a(shape, a_val); - gt::gtensor d_b(shape, b_val); - gt::gtensor d_c(shape, c_val); - gt::gtensor d_d(shape, d_val); - gt::gtensor d_e(shape, e_val); - gt::gtensor d_r_expl(shape); - gt::gtensor 
d_r_impl(shape); - - h_r_impl = s2 - h_e * ((h_a - s1 * h_b) / h_c + h_d); - d_r_impl = s2 - d_e * ((d_a - s1 * d_b) / d_c + d_d); - - generic_explicit_custom_kernel_1D(s1, s2, - h_a, h_b, h_c, h_d, h_e, h_r_expl); - - generic_explicit_custom_kernel_1D(s1, s2, - d_a, d_b, d_c, d_d, d_e, d_r_expl); - - EXPECT_EQ(h_r_impl(2), r); - EXPECT_EQ(h_r_impl, h_r_expl); - EXPECT_EQ(h_r_impl, d_r_expl); - EXPECT_EQ(h_r_impl, d_r_impl); + gt::float16_t a_val{12.34}, b_val{2.345}, c_val{0.987}, d_val{0.67}, + e_val{3.14}; + gt::float16_t s1{0.1}, s2{4.56}; + + gt::float16_t r = s2 - e_val * ((a_val - s1 * b_val) / c_val + d_val); + + auto shape = gt::shape(3); + + gt::gtensor h_a(shape, a_val); + gt::gtensor h_b(shape, b_val); + gt::gtensor h_c(shape, c_val); + gt::gtensor h_d(shape, d_val); + gt::gtensor h_e(shape, e_val); + gt::gtensor h_r_expl(shape); + gt::gtensor h_r_impl(shape); + + gt::gtensor d_a(shape, a_val); + gt::gtensor d_b(shape, b_val); + gt::gtensor d_c(shape, c_val); + gt::gtensor d_d(shape, d_val); + gt::gtensor d_e(shape, e_val); + gt::gtensor d_r_expl(shape); + gt::gtensor d_r_impl(shape); + + h_r_impl = s2 - h_e * ((h_a - s1 * h_b) / h_c + h_d); + d_r_impl = s2 - d_e * ((d_a - s1 * d_b) / d_c + d_d); + + generic_explicit_custom_kernel_1D(s1, s2, h_a, h_b, h_c, h_d, + h_e, h_r_expl); + + generic_explicit_custom_kernel_1D(s1, s2, d_a, d_b, d_c, + d_d, d_e, d_r_expl); + + EXPECT_EQ(h_r_impl(2), r); + EXPECT_EQ(h_r_impl, h_r_expl); + EXPECT_EQ(h_r_impl, d_r_expl); + EXPECT_EQ(h_r_impl, d_r_impl); } TEST(float16_t, mixed_precision_scalar) { - gt::float16_t a_16{1.0}; + gt::float16_t a_16{1.0}; - gt::float16_t b_16{2.0}; - float b_32{2.0}; - double b_64{2.0}; + gt::float16_t b_16{2.0}; + float b_32{2.0}; + double b_64{2.0}; - auto c_16 = a_16 + b_16; - auto c_32 = a_16 + b_32; - auto c_64 = a_16 + b_64; + auto c_16 = a_16 + b_16; + auto c_32 = a_16 + b_32; + auto c_64 = a_16 + b_64; - EXPECT_TRUE((std::is_same::value)); - EXPECT_TRUE((std::is_same::value)); - EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); + EXPECT_TRUE((std::is_same::value)); - EXPECT_EQ(c_16, c_32); - EXPECT_EQ(c_16, c_64); + EXPECT_EQ(c_16, c_32); + EXPECT_EQ(c_16, c_64); } -template +template void test_mixed_precision_helper() { - auto shape = gt::shape(3); - gt::gtensor vh(shape, 4.0); - gt::gtensor vf(shape, 3.0); - gt::gtensor vd(shape, 2.0); + auto shape = gt::shape(3); + gt::gtensor vh(shape, 4.0); + gt::gtensor vf(shape, 3.0); + gt::gtensor vd(shape, 2.0); - gt::gtensor rh(shape); - gt::gtensor rf(shape); - gt::gtensor rd(shape); + gt::gtensor rh(shape); + gt::gtensor rf(shape); + gt::gtensor rd(shape); - gt::gtensor ref(shape, 10.0); + gt::gtensor ref(shape, 10.0); - rh = (vh * vf) - (vh / vd); - rf = (vh * vf) - (vh / vd); - rd = (vh * vf) - (vh / vd); + rh = (vh * vf) - (vh / vd); + rf = (vh * vf) - (vh / vd); + rd = (vh * vf) - (vh / vd); - EXPECT_EQ(ref, rh); - EXPECT_EQ(ref, rf); - EXPECT_EQ(ref, rd); + EXPECT_EQ(ref, rh); + EXPECT_EQ(ref, rf); + EXPECT_EQ(ref, rd); } TEST(float16_t, mixed_precision_host) { - test_mixed_precision_helper(); + test_mixed_precision_helper(); } TEST(float16_t, mixed_precision_device) { - test_mixed_precision_helper(); + test_mixed_precision_helper(); } From 88f100d2ec14694439a3fdc32da1d94ab0c186c7 Mon Sep 17 00:00:00 2001 From: cmpfeil Date: Thu, 7 Sep 2023 15:45:59 +0200 Subject: [PATCH 38/38] [rename] Missed some gt::half --> gt::float16_t --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca21e8cb..fe17a217 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ option(GTENSOR_ALLOCATOR_CACHING "Enable naive caching allocators" ON) option(GTENSOR_BOUNDS_CHECK "Enable per access bounds checking" OFF) option(GTENSOR_ADDRESS_CHECK "Enable address checking for device spans" OFF) option(GTENSOR_SYNC_KERNELS "Enable host sync after assign and launch kernels" OFF) -option(GTENSOR_ENABLE_FP16 "Enable 16-bit floating point type gt::half" OFF) +option(GTENSOR_ENABLE_FP16 "Enable 16-bit floating point type gt::float16_t" OFF) if (GTENSOR_ENABLE_FORTRAN) # do this early (here) since later the `enable_language(Fortran)` gives me trouble @@ -337,7 +337,7 @@ else() endif() if (GTENSOR_ENABLE_FP16) - message(STATUS "${PROJECT_NAME}: gt::half is ENABLED") + message(STATUS "${PROJECT_NAME}: gt::float16_t is ENABLED") target_compile_definitions(gtensor_${GTENSOR_DEVICE} INTERFACE GTENSOR_ENABLE_FP16) endif()
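
For reference, below is a minimal usage sketch of the gt::float16_t interface this series converges on, written against the behaviour exercised in tests/test_float16_t.cxx. It is illustrative only and not part of the patch series: the include path, the gtensor template parameters (element type, rank, memory space), and the main() scaffolding are assumptions, since only the behaviour — 16-bit storage with float fallback for computation, implicit expression evaluation, and mixed-precision promotion — is established by the patches above. A build configured with -DGTENSOR_ENABLE_FP16=ON (the CMake option added in patch 33) is assumed.

// Hypothetical example, not part of the series: exercising gt::float16_t
// on the host. Include path and template arguments are assumed.
#include <iostream>

#include <gtensor/gtensor.h>

int main()
{
  // Scalar arithmetic: values are stored in 16 bits; on hosts without
  // native half support the operators compute in float (compute_type).
  gt::float16_t a{1.5};
  gt::float16_t b{2.5};
  gt::float16_t c = a * b + a; // 1.5 * 2.5 + 1.5 = 5.25

  // 1-D containers, analogous to the haxpy_implicit_1D_host test.
  auto shape = gt::shape(3);
  gt::gtensor<gt::float16_t, 1, gt::space::host> x(shape, 1.5);
  gt::gtensor<gt::float16_t, 1, gt::space::host> y(shape, 2.5);
  y = a * x + y; // each element becomes 1.5 * 1.5 + 2.5 = 4.75

  // Mixed-precision scalar operations promote to the wider type,
  // as provided by the PROVIDE_MIXED_FLOAT16T_* operators.
  double d = 2.0;
  auto mixed = a + d; // deduced as double

  std::cout << "c = " << c << ", y(0) = " << y(0)
            << ", mixed = " << mixed << "\n";
  return 0;
}

On a CUDA build targeting __CUDA_ARCH__ >= 530 the same code computes in native __half; on all other targets the operators fall back to float, as selected by the compute_type alias introduced earlier in the series.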