diff --git a/include/gtensor/assign.h b/include/gtensor/assign.h
index d8f56398..cc622468 100644
--- a/include/gtensor/assign.h
+++ b/include/gtensor/assign.h
@@ -134,6 +134,31 @@ struct assigner<6, space::host>
   }
 };
 
+template <>
+struct assigner<7, space::host>
+{
+  template <typename E1, typename E2>
+  static void run(E1& lhs, const E2& rhs, stream_view stream)
+  {
+    // printf("assigner<7, host>\n");
+    for (int o = 0; o < lhs.shape(6); o++) {
+      for (int n = 0; n < lhs.shape(5); n++) {
+        for (int m = 0; m < lhs.shape(4); m++) {
+          for (int l = 0; l < lhs.shape(3); l++) {
+            for (int k = 0; k < lhs.shape(2); k++) {
+              for (int j = 0; j < lhs.shape(1); j++) {
+                for (int i = 0; i < lhs.shape(0); i++) {
+                  lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 #if defined(GTENSOR_DEVICE_CUDA) || defined(GTENSOR_DEVICE_HIP)
 
 #ifdef GTENSOR_PER_DIM_KERNELS
@@ -219,6 +244,27 @@ __global__ void kernel_assign_6(Elhs lhs, Erhs _rhs)
   }
 }
 
+template <typename Elhs, typename Erhs>
+__global__ void kernel_assign_7(Elhs lhs, Erhs _rhs)
+{
+  auto rhs = _rhs;
+  int tidx = threadIdx.x + blockIdx.x * blockDim.x;
+  int tidy = threadIdx.y + blockIdx.y * blockDim.y;
+  int tidz = blockIdx.z;
+  if (tidx < lhs.shape(0) * lhs.shape(1) &&
+      tidy < lhs.shape(2) * lhs.shape(3)) {
+    int j = tidx / lhs.shape(0);
+    int i = tidx % lhs.shape(0);
+    int l = tidy / lhs.shape(2);
+    int k = tidy % lhs.shape(2);
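+    // dims 0-1 are linearized into the x thread index and dims 2-3 into
+    // the y thread index (unpacked above); dims 4-6 are linearized into
+    // blockIdx.z and unpacked below, with m varying fastest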
+    int m = tidz % lhs.shape(4);
+    int n = (tidz / lhs.shape(4)) % lhs.shape(5);
+    int o = tidz / (lhs.shape(4) * lhs.shape(5));
+
+    lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
+  }
+}
+
 template <>
 struct assigner<1, space::device>
 {
@@ -305,7 +351,7 @@ struct assigner<5, space::device>
   template <typename E1, typename E2>
   static void run(E1& lhs, const E2& rhs, stream_view stream)
   {
-    // printf("assigner<6, device>\n");
+    // printf("assigner<5, device>\n");
     dim3 numThreads(BS_X, BS_Y);
     dim3 numBlocks((lhs.shape(0) * lhs.shape(1) + BS_X - 1) / BS_X,
                    (lhs.shape(2) * lhs.shape(3) + BS_Y - 1) / BS_Y,
@@ -341,6 +387,27 @@ struct assigner<6, space::device>
   }
 };
 
+template <>
+struct assigner<7, space::device>
+{
+  template <typename E1, typename E2>
+  static void run(E1& lhs, const E2& rhs, stream_view stream)
+  {
+    // printf("assigner<7, device>\n");
+    dim3 numThreads(BS_X, BS_Y);
+    dim3 numBlocks((lhs.shape(0) * lhs.shape(1) + BS_X - 1) / BS_X,
+                   (lhs.shape(2) * lhs.shape(3) + BS_Y - 1) / BS_Y,
+                   lhs.shape(4) * lhs.shape(5) * lhs.shape(6));
+
+    gpuSyncIfEnabledStream(stream);
+    // std::cout << "rhs " << typeid(rhs.to_kernel()).name() << "\n";
+    gtLaunchKernel(kernel_assign_7, numBlocks, numThreads, 0,
+                   stream.get_backend_stream(), lhs.to_kernel(),
+                   rhs.to_kernel());
+    gpuSyncIfEnabledStream(stream);
+  }
+};
+
 #else // not defined GTENSOR_PER_DIM_KERNELS
 
 template <typename E1, typename E2>
diff --git a/include/gtensor/expression.h b/include/gtensor/expression.h
index dc72b036..c3580254 100644
--- a/include/gtensor/expression.h
+++ b/include/gtensor/expression.h
@@ -124,6 +124,12 @@ GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<6> idx)
   return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]);
 }
 
+template <typename E>
+GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<7> idx)
+{
+  return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6]);
+}
+
 } // namespace gt
 
 #endif
diff --git a/include/gtensor/gtensor.h b/include/gtensor/gtensor.h
index 443dd583..a7d5637d 100644
--- a/include/gtensor/gtensor.h
+++ b/include/gtensor/gtensor.h
@@ -303,6 +303,27 @@ __global__ void kernel_launch(gt::shape_type<6> shape, F f)
   }
 }
 
+template <typename F>
+__global__ void kernel_launch(gt::shape_type<7> shape, F f)
+{
+  int i = threadIdx.x + blockIdx.x * BS_X;
+  int j = threadIdx.y + blockIdx.y * BS_Y;
+  int b = blockIdx.z;
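+  // peel dims 2-6 off the flat z block index, slowest-varying (o) first:
+  // b = k + shape[2] * (l + shape[3] * (m + shape[4] * (n + shape[5] * o)))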
+  int o = b / (shape[2] * shape[3] * shape[4] * shape[5]);
+  b -= o * (shape[2] * shape[3] * shape[4] * shape[5]);
+  int n = b / (shape[2] * shape[3] * shape[4]);
+  b -= n * (shape[2] * shape[3] * shape[4]);
+  int m = b / (shape[2] * shape[3]);
+  b -= m * (shape[2] * shape[3]);
+  int l = b / shape[2];
+  b -= l * shape[2];
+  int k = b;
+
+  if (i < shape[0] && j < shape[1]) {
+    f(i, j, k, l, m, n, o);
+  }
+}
+
 #else // not GTENSOR_PER_DIM_KERNELS
 
 template <typename F>
@@ -428,6 +449,30 @@ struct launch<6, space::host>
   }
 };
 
+template <>
+struct launch<7, space::host>
+{
+  template <typename F>
+  static void run(const gt::shape_type<7>& shape, F&& f, gt::stream_view stream)
+  {
+    for (int o = 0; o < shape[6]; o++) {
+      for (int n = 0; n < shape[5]; n++) {
+        for (int m = 0; m < shape[4]; m++) {
+          for (int l = 0; l < shape[3]; l++) {
+            for (int k = 0; k < shape[2]; k++) {
+              for (int j = 0; j < shape[1]; j++) {
+                for (int i = 0; i < shape[0]; i++) {
+                  std::forward<F>(f)(i, j, k, l, m, n, o);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
 #if defined(GTENSOR_DEVICE_CUDA) || defined(GTENSOR_DEVICE_HIP)
 
 #ifdef GTENSOR_PER_DIM_KERNELS
@@ -529,6 +574,21 @@ struct launch<6, space::device>
   }
 };
 
+template <>
+struct launch<7, space::device>
+{
+  template <typename F>
+  static void run(const gt::shape_type<7>& shape, F&& f, gt::stream_view stream)
+  {
+    dim3 numThreads(BS_X, BS_Y);
+    dim3 numBlocks((shape[0] + BS_X - 1) / BS_X, (shape[1] + BS_Y - 1) / BS_Y,
+                   shape[2] * shape[3] * shape[4] * shape[5] * shape[6]);
+
+    gtLaunchKernel(kernel_launch, numBlocks, numThreads, 0,
+                   stream.get_backend_stream(), shape, std::forward<F>(f));
+  }
+};
+
 #else // not GTENSOR_PER_DIM_KERNELS
 
 template <typename F>
diff --git a/include/gtensor/operator.h b/include/gtensor/operator.h
index 774dc6d6..3b8bb5f7 100644
--- a/include/gtensor/operator.h
+++ b/include/gtensor/operator.h
@@ -355,6 +355,37 @@ struct equals<6, 6, space::host, space::host>
   }
 };
 
+template <>
+struct equals<7, 7, space::host, space::host>
+{
+  template <typename E1, typename E2>
+  static bool run(const E1& e1, const E2& e2)
+  {
+    if (e1.shape() != e2.shape()) {
+      return false;
+    }
+
+    for (int v = 0; v < e1.shape(6); v++) {
+      for (int z = 0; z < e1.shape(5); z++) {
+        for (int y = 0; y < e1.shape(4); y++) {
+          for (int x = 0; x < e1.shape(3); x++) {
+            for (int k = 0; k < e1.shape(2); k++) {
+              for (int j = 0; j < e1.shape(1); j++) {
+                for (int i = 0; i < e1.shape(0); i++) {
+                  if (e1(i, j, k, x, y, z, v) != e2(i, j, k, x, y, z, v)) {
+                    return false;
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    return true;
+  }
+};
+
 #ifdef GTENSOR_HAVE_DEVICE
 
 template <typename E1, typename E2>
diff --git a/tests/test_assign.cxx b/tests/test_assign.cxx
index 6d43111d..06f76660 100644
--- a/tests/test_assign.cxx
+++ b/tests/test_assign.cxx
@@ -23,6 +23,22 @@ TEST(assign, gtensor_6d)
   EXPECT_EQ(a, b);
 }
 
+TEST(assign, gtensor_7d)
+{
+  gt::gtensor<int, 7> a(gt::shape(2, 3, 4, 5, 6, 7, 8));
+  gt::gtensor<int, 7> b(a.shape());
+
+  int* adata = a.data();
+
+  for (int i = 0; i < a.size(); i++) {
+    adata[i] = i;
+  }
+
+  EXPECT_NE(a, b);
+  b = a;
+  EXPECT_EQ(a, b);
+}
+
 TEST(assign, gview_1d_scalar)
 {
   auto a = gt::empty<double>(gt::shape(5));
@@ -81,6 +97,23 @@ TEST(assign, broadcast_6d)
   }
 }
 
+TEST(assign, broadcast_7d)
+{
+  gt::gtensor<int, 7> a(gt::shape(8, 1, 2, 4, 1, 1, 6), 0);
+  gt::gtensor<int, 7> b(gt::shape(8, 1, 2, 1, 1, 1, 6), -7);
+
+  gt::assign(a, b);
+
+  for (int i = 0; i < a.shape(0); i++) {
+    for (int j = 0; j < a.shape(2); j++) {
+      for (int k = 0; k < a.shape(3); k++) {
+        for (int l = 0; l < a.shape(6); l++) {
+          EXPECT_EQ(a(i, 0, j, k, 0, 0, l), -7);
+        }
+      }
+    }
+  }
+}
+
 #ifdef GTENSOR_HAVE_DEVICE
 
 TEST(assign, device_gtensor_6d)
 {
   gt::gtensor_device<int, 6> a(gt::shape(2, 3, 4, 5, 6, 7));
   gt::gtensor_device<int, 6> b(a.shape());
   gt::gtensor<int, 6> h_a(a.shape());
   gt::gtensor<int, 6> h_b(a.shape());
 
   int* adata = h_a.data();
 
   for (int i = 0; i < a.size(); i++) {
     adata[i] = i;
   }
 
   gt::copy(h_a, a);
   b = a;
   gt::copy(b, h_b);
 
@@ -103,6 +136,26 @@ TEST(assign, device_gtensor_6d)
   EXPECT_EQ(h_a, h_b);
 }
 
+TEST(assign, device_gtensor_7d)
+{
+  gt::gtensor_device<int, 7> a(gt::shape(2, 3, 4, 5, 6, 7, 8));
+  gt::gtensor_device<int, 7> b(a.shape());
+  gt::gtensor<int, 7> h_a(a.shape());
+  gt::gtensor<int, 7> h_b(a.shape());
+
+  int* adata = h_a.data();
+
+  for (int i = 0; i < a.size(); i++) {
+    adata[i] = i;
+  }
+
+  gt::copy(h_a, a);
+  b = a;
+  gt::copy(b, h_b);
+
+  EXPECT_EQ(h_a, h_b);
+}
+
 TEST(assign, device_gtensor_fill)
 {
   gt::gtensor_device<double, 2> a(gt::shape(2, 3));
@@ -403,4 +456,26 @@ TEST(assign, device_broadcast_6d)
   }
 }
 
+TEST(assign, device_broadcast_7d)
+{
+  gt::gtensor_device<int, 7> a(gt::shape(8, 1, 2, 4, 1, 1, 4), 0);
+  gt::gtensor_device<int, 7> b(gt::shape(8, 1, 2, 1, 1, 1, 4), -7);
+
+  gt::gtensor<int, 7> h_a(a.shape());
+
+  gt::assign(a, b);
+
+  gt::copy(a, h_a);
+
+  for (int i = 0; i < h_a.shape(0); i++) {
+    for (int j = 0; j < h_a.shape(2); j++) {
+      for (int k = 0; k < h_a.shape(3); k++) {
+        for (int l = 0; l < h_a.shape(6); l++) {
+          EXPECT_EQ(h_a(i, 0, j, k, 0, 0, l), -7);
+        }
+      }
+    }
+  }
+}
+
 #endif // GTENSOR_HAVE_DEVICE
diff --git a/tests/test_launch.cxx b/tests/test_launch.cxx
index 78e192c4..6ad2a378 100644
--- a/tests/test_launch.cxx
+++ b/tests/test_launch.cxx
@@ -184,6 +184,22 @@ void device_double_add_6d(gt::gtensor_device<double, 6>& a,
   gt::copy(b, out);
 }
 
+void device_double_add_7d(gt::gtensor_device<double, 7>& a,
+                          gt::gtensor<double, 7>& out)
+{
+  auto b = gt::empty_like(a);
+
+  auto k_a = a.to_kernel();
+  auto k_b = b.to_kernel();
+
+  gt::launch<7>(
+    a.shape(), GT_LAMBDA(int i, int j, int k, int l, int m, int n, int o) {
+      k_b(i, j, k, l, m, n, o) =
+        k_a(i, j, k, l, m, n, o) + k_a(i, j, k, l, m, n, o);
+    });
+  gt::copy(b, out);
+}
+
 TEST(gtensor, device_launch_5d)
 {
   gt::gtensor<double, 5> h_a(gt::shape(2, 2, 2, 2, 2));
@@ -242,4 +258,36 @@ TEST(gtensor, device_launch_6d)
   EXPECT_EQ(h_b, h_b_expected);
 }
 
+TEST(gtensor, device_launch_7d)
+{
+  gt::gtensor<double, 7> h_a(gt::shape(2, 2, 2, 2, 2, 2, 2));
+  gt::gtensor_device<double, 7> a(h_a.shape());
+  gt::gtensor<double, 7> h_b(h_a.shape());
+  gt::gtensor<double, 7> h_b_expected(h_a.shape());
+
+  for (int i = 0; i < h_a.shape(0); i++) {
+    for (int j = 0; j < h_a.shape(1); j++) {
+      for (int k = 0; k < h_a.shape(2); k++) {
+        for (int l = 0; l < h_a.shape(3); l++) {
+          for (int m = 0; m < h_a.shape(4); m++) {
+            for (int n = 0; n < h_a.shape(5); n++) {
+              for (int o = 0; o < h_a.shape(6); o++) {
+                h_a(i, j, k, l, m, n, o) = i + j + k + l + m + n + o;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  h_b_expected = 2 * h_a;
+
+  gt::copy(h_a, a);
+
+  device_double_add_7d(a, h_b);
+
+  EXPECT_EQ(h_b, h_b_expected);
+}
+
 #endif // GTENSOR_HAVE_DEVICE