7d #289

Merged: 9 commits, Sep 18, 2024

Changes from 3 commits
66 changes: 65 additions & 1 deletion include/gtensor/assign.h
@@ -134,6 +134,31 @@ struct assigner<6, space::host>
}
};

template <>
struct assigner<7, space::host>
{
  template <typename E1, typename E2>
  static void run(E1& lhs, const E2& rhs, stream_view stream)
  {
    // printf("assigner<7, host>\n");
    for (int o = 0; o < lhs.shape(6); o++) {
      for (int n = 0; n < lhs.shape(5); n++) {
        for (int m = 0; m < lhs.shape(4); m++) {
          for (int l = 0; l < lhs.shape(3); l++) {
            for (int k = 0; k < lhs.shape(2); k++) {
              for (int j = 0; j < lhs.shape(1); j++) {
                for (int i = 0; i < lhs.shape(0); i++) {
                  lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
                }
              }
            }
          }
        }
      }
    }
  }
};

#if defined(GTENSOR_DEVICE_CUDA) || defined(GTENSOR_DEVICE_HIP)

#ifdef GTENSOR_PER_DIM_KERNELS
@@ -219,6 +244,24 @@ __global__ void kernel_assign_6(Elhs lhs, Erhs _rhs)
}
}

template <typename Elhs, typename Erhs>
__global__ void kernel_assign_7(Elhs lhs, Erhs _rhs)
{
  auto rhs = _rhs;
  int tidx = threadIdx.x + blockIdx.x * blockDim.x;
  int tidy = threadIdx.y + blockIdx.y * blockDim.y;
  int tidz = blockIdx.z;
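  // blockIdx.z linearizes dims 4-6: tidz = m + shape(4) * (n + shape(5) * o)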
  if (tidx < lhs.shape(0) * lhs.shape(1) &&
      tidy < lhs.shape(2) * lhs.shape(3)) {
    int j = tidx / lhs.shape(0), i = tidx % lhs.shape(0);
    int l = tidy / lhs.shape(2), k = tidy % lhs.shape(2);
    int m = tidz % lhs.shape(4);
    int n = (tidz / lhs.shape(4)) % lhs.shape(5);
    int o = tidz / (lhs.shape(4) * lhs.shape(5));

    lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
  }
}

template <>
struct assigner<1, space::device>
{
@@ -305,7 +348,7 @@ struct assigner<5, space::device>
  template <typename E1, typename E2>
  static void run(E1& lhs, const E2& rhs, stream_view stream)
  {
-   // printf("assigner<6, device>\n");
+   // printf("assigner<5, device>\n");
    dim3 numThreads(BS_X, BS_Y);
    dim3 numBlocks((lhs.shape(0) * lhs.shape(1) + BS_X - 1) / BS_X,
                   (lhs.shape(2) * lhs.shape(3) + BS_Y - 1) / BS_Y,
@@ -341,6 +384,27 @@ struct assigner<6, space::device>
}
};

template <>
struct assigner<7, space::device>
{
  template <typename E1, typename E2>
  static void run(E1& lhs, const E2& rhs, stream_view stream)
  {
    // printf("assigner<7, device>\n");
    dim3 numThreads(BS_X, BS_Y);
    dim3 numBlocks((lhs.shape(0) * lhs.shape(1) + BS_X - 1) / BS_X,
                   (lhs.shape(2) * lhs.shape(3) + BS_Y - 1) / BS_Y,
                   lhs.shape(4) * lhs.shape(5) * lhs.shape(6));

    gpuSyncIfEnabledStream(stream);
    // std::cout << "rhs " << typeid(rhs.to_kernel()).name() << "\n";
    gtLaunchKernel(kernel_assign_7, numBlocks, numThreads, 0,
                   stream.get_backend_stream(), lhs.to_kernel(),
                   rhs.to_kernel());
    gpuSyncIfEnabledStream(stream);
  }
};

#else // not defined GTENSOR_PER_DIM_KERNELS

template <typename Elhs, typename Erhs, size_type N>
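As a quick sketch of what the new specializations enable (assuming only the gt::gtensor constructors, element access, and expression operators already used in this PR's tests), a 7-d expression assignment now dispatches to assigner<7, ...> on the matching space:

gt::gtensor<double, 7> a(gt::shape(2, 2, 2, 2, 2, 2, 2));
gt::gtensor<double, 7> b(a.shape());
a(0, 0, 0, 0, 0, 0, 0) = 1.0; // element access, as in the tests below
b = 2 * a; // 7-d expression assignment -> assigner<7, space::host>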
6 changes: 6 additions & 0 deletions include/gtensor/expression.h
@@ -124,6 +124,12 @@ GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<6> idx)
  return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]);
}

template <typename E>
GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<7> idx)
{
  return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6]);
}

} // namespace gt

#endif
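The new overload just forwards a seven-element index; a minimal sketch, assuming a 7-d expression e and an aggregate-initializable gt::shape_type<7> (index values here are hypothetical):

gt::shape_type<7> idx = {0, 1, 0, 1, 0, 1, 0}; // hypothetical index values
auto v = gt::index_expression(e, idx);         // same as e(0, 1, 0, 1, 0, 1, 0)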
60 changes: 60 additions & 0 deletions include/gtensor/gtensor.h
@@ -303,6 +303,27 @@ __global__ void kernel_launch(gt::shape_type<6> shape, F f)
}
}

template <typename F>
__global__ void kernel_launch(gt::shape_type<7> shape, F f)
{
  int i = threadIdx.x + blockIdx.x * BS_X;
  int j = threadIdx.y + blockIdx.y * BS_Y;
  int b = blockIdx.z;
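  // blockIdx.z linearizes dims 2-6; peel off o, n, m, l in turn, leaving k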
  int o = b / (shape[2] * shape[3] * shape[4] * shape[5]);
  b -= o * (shape[2] * shape[3] * shape[4] * shape[5]);
  int n = b / (shape[2] * shape[3] * shape[4]);
  b -= n * (shape[2] * shape[3] * shape[4]);
  int m = b / (shape[2] * shape[3]);
  b -= m * (shape[2] * shape[3]);
  int l = b / shape[2];
  b -= l * shape[2];
  int k = b;

  if (i < shape[0] && j < shape[1]) {
    f(i, j, k, l, m, n, o);
  }
}

#else // not GTENSOR_PER_DIM_KERNELS

template <typename F, size_type N>
@@ -428,6 +449,30 @@ struct launch<6, space::host>
}
};

template <>
struct launch<7, space::host>
{
  template <typename F>
  static void run(const gt::shape_type<7>& shape, F&& f, gt::stream_view stream)
  {
    for (int o = 0; o < shape[6]; o++) {
      for (int n = 0; n < shape[5]; n++) {
        for (int m = 0; m < shape[4]; m++) {
          for (int l = 0; l < shape[3]; l++) {
            for (int k = 0; k < shape[2]; k++) {
              for (int j = 0; j < shape[1]; j++) {
                for (int i = 0; i < shape[0]; i++) {
                  std::forward<F>(f)(i, j, k, l, m, n, o);
                }
              }
            }
          }
        }
      }
    }
  }
};

#if defined(GTENSOR_DEVICE_CUDA) || defined(GTENSOR_DEVICE_HIP)

#ifdef GTENSOR_PER_DIM_KERNELS
@@ -529,6 +574,21 @@ struct launch<6, space::device>
}
};

template <>
struct launch<7, space::device>
{
  template <typename F>
  static void run(const gt::shape_type<7>& shape, F&& f, gt::stream_view stream)
  {
    dim3 numThreads(BS_X, BS_Y);
    dim3 numBlocks((shape[0] + BS_X - 1) / BS_X, (shape[1] + BS_Y - 1) / BS_Y,
                   shape[2] * shape[3] * shape[4] * shape[5] * shape[6]);

    gtLaunchKernel(kernel_launch, numBlocks, numThreads, 0,
                   stream.get_backend_stream(), shape, std::forward<F>(f));
  }
};

#else // not GTENSOR_PER_DIM_KERNELS

template <int N>
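To sanity-check the blockIdx.z peel-off in kernel_launch(gt::shape_type<7>, F) above, the same arithmetic can be exercised standalone; a sketch with hypothetical extents:

#include <array>
#include <cassert>

int main()
{
  std::array<int, 7> shape = {2, 2, 3, 4, 5, 6, 7}; // hypothetical extents
  for (int o = 0; o < shape[6]; o++)
    for (int n = 0; n < shape[5]; n++)
      for (int m = 0; m < shape[4]; m++)
        for (int l = 0; l < shape[3]; l++)
          for (int k = 0; k < shape[2]; k++) {
            // linearize dims 2-6 the way the launch packs blockIdx.z
            int b = k + shape[2] * (l + shape[3] * (m + shape[4] * (n + shape[5] * o)));
            // peel off exactly as the kernel does
            int o2 = b / (shape[2] * shape[3] * shape[4] * shape[5]);
            b -= o2 * (shape[2] * shape[3] * shape[4] * shape[5]);
            int n2 = b / (shape[2] * shape[3] * shape[4]);
            b -= n2 * (shape[2] * shape[3] * shape[4]);
            int m2 = b / (shape[2] * shape[3]);
            b -= m2 * (shape[2] * shape[3]);
            int l2 = b / shape[2];
            int k2 = b - l2 * shape[2];
            assert(k2 == k && l2 == l && m2 == m && n2 == n && o2 == o);
          }
}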
31 changes: 31 additions & 0 deletions include/gtensor/operator.h
@@ -355,6 +355,37 @@ struct equals<6, 6, space::host, space::host>
}
};

template <>
struct equals<7, 7, space::host, space::host>
{
  template <typename E1, typename E2>
  static bool run(const E1& e1, const E2& e2)
  {
    if (e1.shape() != e2.shape()) {
      return false;
    }

    for (int v = 0; v < e1.shape(6); v++) {
      for (int z = 0; z < e1.shape(5); z++) {
        for (int y = 0; y < e1.shape(4); y++) {
          for (int x = 0; x < e1.shape(3); x++) {
            for (int k = 0; k < e1.shape(2); k++) {
              for (int j = 0; j < e1.shape(1); j++) {
                for (int i = 0; i < e1.shape(0); i++) {
                  if (e1(i, j, k, x, y, z, v) != e2(i, j, k, x, y, z, v)) {
                    return false;
                  }
                }
              }
            }
          }
        }
      }
    }
    return true;
  }
};

#ifdef GTENSOR_HAVE_DEVICE

template <size_type N1, size_type N2>
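For completeness, a sketch of the comparison this backs, assuming the gt::zeros helper (not part of this diff) and operator== dispatching to equals<7, 7, ...> as it does for lower ranks:

auto a = gt::zeros<double>(gt::shape(2, 2, 2, 2, 2, 2, 2));
auto b = gt::zeros<double>(a.shape());
a(0, 1, 0, 1, 0, 1, 0) = 5.0;
b = 1.0 * a;          // copy through a 7-d expression
bool same = (a == b); // element-wise compare via equals<7, 7, host, host>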
48 changes: 48 additions & 0 deletions tests/test_launch.cxx
@@ -184,6 +184,22 @@ void device_double_add_6d(gt::gtensor_device<double, 6>& a,
gt::copy(b, out);
}

void device_double_add_7d(gt::gtensor_device<double, 7>& a,
                          gt::gtensor<double, 7>& out)
{
  auto b = gt::empty_like(a);

  auto k_a = a.to_kernel();
  auto k_b = b.to_kernel();

  gt::launch<7>(
    a.shape(), GT_LAMBDA(int i, int j, int k, int l, int m, int n, int o) {
      k_b(i, j, k, l, m, n, o) =
        k_a(i, j, k, l, m, n, o) + k_a(i, j, k, l, m, n, o);
    });
  gt::copy(b, out);
}

TEST(gtensor, device_launch_5d)
{
gt::gtensor<double, 5> h_a(gt::shape(2, 2, 2, 2, 2));
@@ -242,4 +258,36 @@ TEST(gtensor, device_launch_6d)
EXPECT_EQ(h_b, h_b_expected);
}

TEST(gtensor, device_launch_7d)
{
  gt::gtensor<double, 7> h_a(gt::shape(2, 2, 2, 2, 2, 2, 2));
  gt::gtensor_device<double, 7> a(h_a.shape());
  gt::gtensor<double, 7> h_b(h_a.shape());
  gt::gtensor<double, 7> h_b_expected(h_a.shape());

  for (int i = 0; i < h_a.shape(0); i++) {
    for (int j = 0; j < h_a.shape(1); j++) {
      for (int k = 0; k < h_a.shape(2); k++) {
        for (int l = 0; l < h_a.shape(3); l++) {
          for (int m = 0; m < h_a.shape(4); m++) {
            for (int n = 0; n < h_a.shape(5); n++) {
              for (int o = 0; o < h_a.shape(6); o++) {
                h_a(i, j, k, l, m, n, o) = i + j + k + l + m + n + o;
              }
            }
          }
        }
      }
    }
  }

  h_b_expected = 2 * h_a;

  gt::copy(h_a, a);

  device_double_add_7d(a, h_b);

  EXPECT_EQ(h_b, h_b_expected);
}

#endif // GTENSOR_HAVE_DEVICE