7d #289

Merged: 9 commits, Sep 18, 2024

Changes from 3 commits
66 changes: 65 additions & 1 deletion include/gtensor/assign.h
@@ -134,6 +134,31 @@ struct assigner<6, space::host>
}
};

template <>
struct assigner<7, space::host>
{
  template <typename E1, typename E2>
  static void run(E1& lhs, const E2& rhs, stream_view stream)
  {
    // printf("assigner<7, host>\n");
    for (int o = 0; o < lhs.shape(6); o++) {
      for (int n = 0; n < lhs.shape(5); n++) {
        for (int m = 0; m < lhs.shape(4); m++) {
          for (int l = 0; l < lhs.shape(3); l++) {
            for (int k = 0; k < lhs.shape(2); k++) {
              for (int j = 0; j < lhs.shape(1); j++) {
                for (int i = 0; i < lhs.shape(0); i++) {
                  lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
                }
              }
            }
          }
        }
      }
    }
  }
};

#if defined(GTENSOR_DEVICE_CUDA) || defined(GTENSOR_DEVICE_HIP)

#ifdef GTENSOR_PER_DIM_KERNELS
@@ -219,6 +244,24 @@ __global__ void kernel_assign_6(Elhs lhs, Erhs _rhs)
}
}

template <typename Elhs, typename Erhs>
__global__ void kernel_assign_7(Elhs lhs, Erhs _rhs)
{
  auto rhs = _rhs;
  int tidx = threadIdx.x + blockIdx.x * blockDim.x;
  int tidy = threadIdx.y + blockIdx.y * blockDim.y;
  int tidz = blockIdx.z;
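  // blockIdx.z linearizes dims 4-6: tidz = m + shape(4) * (n + shape(5) * o)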
  if (tidx < lhs.shape(0) * lhs.shape(1) &&
      tidy < lhs.shape(2) * lhs.shape(3)) {
    int j = tidx / lhs.shape(0), i = tidx % lhs.shape(0);
    int l = tidy / lhs.shape(2), k = tidy % lhs.shape(2);
    int m = tidz % lhs.shape(4);
    int n = (tidz / lhs.shape(4)) % lhs.shape(5);
    int o = tidz / (lhs.shape(4) * lhs.shape(5));

    lhs(i, j, k, l, m, n, o) = rhs(i, j, k, l, m, n, o);
  }
}

template <>
struct assigner<1, space::device>
{
@@ -305,7 +348,7 @@ struct assigner<5, space::device>
  template <typename E1, typename E2>
  static void run(E1& lhs, const E2& rhs, stream_view stream)
  {
-   // printf("assigner<6, device>\n");
+   // printf("assigner<5, device>\n");
    dim3 numThreads(BS_X, BS_Y);
    dim3 numBlocks((lhs.shape(0) * lhs.shape(1) + BS_X - 1) / BS_X,
                   (lhs.shape(2) * lhs.shape(3) + BS_Y - 1) / BS_Y,
@@ -341,6 +384,27 @@ struct assigner<6, space::device>
}
};

template <>
struct assigner<7, space::device>
{
  template <typename E1, typename E2>
  static void run(E1& lhs, const E2& rhs, stream_view stream)
  {
    // printf("assigner<7, device>\n");
    dim3 numThreads(BS_X, BS_Y);
    dim3 numBlocks((lhs.shape(0) * lhs.shape(1) + BS_X - 1) / BS_X,
                   (lhs.shape(2) * lhs.shape(3) + BS_Y - 1) / BS_Y,
                   lhs.shape(4) * lhs.shape(5) * lhs.shape(6));

    gpuSyncIfEnabledStream(stream);
    // std::cout << "rhs " << typeid(rhs.to_kernel()).name() << "\n";
    gtLaunchKernel(kernel_assign_7, numBlocks, numThreads, 0,
                   stream.get_backend_stream(), lhs.to_kernel(),
                   rhs.to_kernel());
    gpuSyncIfEnabledStream(stream);
  }
};

#else // not defined GTENSOR_PER_DIM_KERNELS

template <typename Elhs, typename Erhs, size_type N>
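As a quick sketch of what the new specializations enable (assuming only the gt::gtensor constructors, element access, and expression operators already used in this PR's tests), a 7-d expression assignment now dispatches to assigner<7, ...> on the matching space:

gt::gtensor<double, 7> a(gt::shape(2, 2, 2, 2, 2, 2, 2));
gt::gtensor<double, 7> b(a.shape());
a(0, 0, 0, 0, 0, 0, 0) = 1.0; // element access, as in the tests below
b = 2 * a; // 7-d expression assignment -> assigner<7, space::host>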
6 changes: 6 additions & 0 deletions include/gtensor/expression.h
@@ -124,6 +124,12 @@ GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<6> idx)
  return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5]);
}

template <typename E>
GT_INLINE decltype(auto) index_expression(E&& expr, shape_type<7> idx)
{
  return expr(idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6]);
}

} // namespace gt

#endif
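The new overload just forwards a seven-element index; a minimal sketch, assuming a 7-d expression e and an aggregate-initializable gt::shape_type<7> (index values here are hypothetical):

gt::shape_type<7> idx = {0, 1, 0, 1, 0, 1, 0}; // hypothetical index values
auto v = gt::index_expression(e, idx);         // same as e(0, 1, 0, 1, 0, 1, 0)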
60 changes: 60 additions & 0 deletions include/gtensor/gtensor.h
@@ -303,6 +303,27 @@ __global__ void kernel_launch(gt::shape_type<6> shape, F f)
}
}

template <typename F>
__global__ void kernel_launch(gt::shape_type<7> shape, F f)
{
  int i = threadIdx.x + blockIdx.x * BS_X;
  int j = threadIdx.y + blockIdx.y * BS_Y;
  int b = blockIdx.z;
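  // blockIdx.z linearizes dims 2-6; peel off o, n, m, l in turn, leaving k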
  int o = b / (shape[2] * shape[3] * shape[4] * shape[5]);
  b -= o * (shape[2] * shape[3] * shape[4] * shape[5]);
  int n = b / (shape[2] * shape[3] * shape[4]);
  b -= n * (shape[2] * shape[3] * shape[4]);
  int m = b / (shape[2] * shape[3]);
  b -= m * (shape[2] * shape[3]);
  int l = b / shape[2];
  b -= l * shape[2];
  int k = b;

  if (i < shape[0] && j < shape[1]) {
    f(i, j, k, l, m, n, o);
  }
}

#else // not GTENSOR_PER_DIM_KERNELS

template <typename F, size_type N>
@@ -428,6 +449,30 @@ struct launch<6, space::host>
}
};

template <>
struct launch<7, space::host>
{
  template <typename F>
  static void run(const gt::shape_type<7>& shape, F&& f, gt::stream_view stream)
  {
    for (int o = 0; o < shape[6]; o++) {
      for (int n = 0; n < shape[5]; n++) {
        for (int m = 0; m < shape[4]; m++) {
          for (int l = 0; l < shape[3]; l++) {
            for (int k = 0; k < shape[2]; k++) {
              for (int j = 0; j < shape[1]; j++) {
                for (int i = 0; i < shape[0]; i++) {
                  std::forward<F>(f)(i, j, k, l, m, n, o);
                }
              }
            }
          }
        }
      }
    }
  }
};

#if defined(GTENSOR_DEVICE_CUDA) || defined(GTENSOR_DEVICE_HIP)

#ifdef GTENSOR_PER_DIM_KERNELS
@@ -529,6 +574,21 @@ struct launch<6, space::device>
}
};

template <>
struct launch<7, space::device>
{
  template <typename F>
  static void run(const gt::shape_type<7>& shape, F&& f, gt::stream_view stream)
  {
    dim3 numThreads(BS_X, BS_Y);
    dim3 numBlocks((shape[0] + BS_X - 1) / BS_X, (shape[1] + BS_Y - 1) / BS_Y,
                   shape[2] * shape[3] * shape[4] * shape[5] * shape[6]);

    gtLaunchKernel(kernel_launch, numBlocks, numThreads, 0,
                   stream.get_backend_stream(), shape, std::forward<F>(f));
  }
};

#else // not GTENSOR_PER_DIM_KERNELS

template <int N>
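To sanity-check the blockIdx.z peel-off in kernel_launch(gt::shape_type<7>, F) above, the same arithmetic can be exercised standalone; a sketch with hypothetical extents:

#include <array>
#include <cassert>

int main()
{
  std::array<int, 7> shape = {2, 2, 3, 4, 5, 6, 7}; // hypothetical extents
  for (int o = 0; o < shape[6]; o++)
    for (int n = 0; n < shape[5]; n++)
      for (int m = 0; m < shape[4]; m++)
        for (int l = 0; l < shape[3]; l++)
          for (int k = 0; k < shape[2]; k++) {
            // linearize dims 2-6 the way the launch packs blockIdx.z
            int b = k + shape[2] * (l + shape[3] * (m + shape[4] * (n + shape[5] * o)));
            // peel off exactly as the kernel does
            int o2 = b / (shape[2] * shape[3] * shape[4] * shape[5]);
            b -= o2 * (shape[2] * shape[3] * shape[4] * shape[5]);
            int n2 = b / (shape[2] * shape[3] * shape[4]);
            b -= n2 * (shape[2] * shape[3] * shape[4]);
            int m2 = b / (shape[2] * shape[3]);
            b -= m2 * (shape[2] * shape[3]);
            int l2 = b / shape[2];
            int k2 = b - l2 * shape[2];
            assert(k2 == k && l2 == l && m2 == m && n2 == n && o2 == o);
          }
}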
31 changes: 31 additions & 0 deletions include/gtensor/operator.h
@@ -355,6 +355,37 @@ struct equals<6, 6, space::host, space::host>
}
};

template <>
struct equals<7, 7, space::host, space::host>
{
  template <typename E1, typename E2>
  static bool run(const E1& e1, const E2& e2)
  {
    if (e1.shape() != e2.shape()) {
      return false;
    }

    for (int v = 0; v < e1.shape(6); v++) {
      for (int z = 0; z < e1.shape(5); z++) {
        for (int y = 0; y < e1.shape(4); y++) {
          for (int x = 0; x < e1.shape(3); x++) {
            for (int k = 0; k < e1.shape(2); k++) {
              for (int j = 0; j < e1.shape(1); j++) {
                for (int i = 0; i < e1.shape(0); i++) {
                  if (e1(i, j, k, x, y, z, v) != e2(i, j, k, x, y, z, v)) {
                    return false;
                  }
                }
              }
            }
          }
        }
      }
    }
    return true;
  }
};

#ifdef GTENSOR_HAVE_DEVICE

template <size_type N1, size_type N2>
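For completeness, a sketch of the comparison this backs, assuming the gt::zeros helper (not part of this diff) and operator== dispatching to equals<7, 7, ...> as it does for lower ranks:

auto a = gt::zeros<double>(gt::shape(2, 2, 2, 2, 2, 2, 2));
auto b = gt::zeros<double>(a.shape());
a(0, 1, 0, 1, 0, 1, 0) = 5.0;
b = 1.0 * a;          // copy through a 7-d expression
bool same = (a == b); // element-wise compare via equals<7, 7, host, host>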
48 changes: 48 additions & 0 deletions tests/test_launch.cxx
@@ -184,6 +184,22 @@ void device_double_add_6d(gt::gtensor_device<double, 6>& a,
gt::copy(b, out);
}

void device_double_add_7d(gt::gtensor_device<double, 7>& a,
                          gt::gtensor<double, 7>& out)
{
  auto b = gt::empty_like(a);

  auto k_a = a.to_kernel();
  auto k_b = b.to_kernel();

  gt::launch<7>(
    a.shape(), GT_LAMBDA(int i, int j, int k, int l, int m, int n, int o) {
      k_b(i, j, k, l, m, n, o) =
        k_a(i, j, k, l, m, n, o) + k_a(i, j, k, l, m, n, o);
    });
  gt::copy(b, out);
}

TEST(gtensor, device_launch_5d)
{
gt::gtensor<double, 5> h_a(gt::shape(2, 2, 2, 2, 2));
@@ -242,4 +258,36 @@ TEST(gtensor, device_launch_6d)
EXPECT_EQ(h_b, h_b_expected);
}

TEST(gtensor, device_launch_7d)
{
  gt::gtensor<double, 7> h_a(gt::shape(2, 2, 2, 2, 2, 2, 2));
  gt::gtensor_device<double, 7> a(h_a.shape());
  gt::gtensor<double, 7> h_b(h_a.shape());
  gt::gtensor<double, 7> h_b_expected(h_a.shape());

  for (int i = 0; i < h_a.shape(0); i++) {
    for (int j = 0; j < h_a.shape(1); j++) {
      for (int k = 0; k < h_a.shape(2); k++) {
        for (int l = 0; l < h_a.shape(3); l++) {
          for (int m = 0; m < h_a.shape(4); m++) {
            for (int n = 0; n < h_a.shape(5); n++) {
              for (int o = 0; o < h_a.shape(6); o++) {
                h_a(i, j, k, l, m, n, o) = i + j + k + l + m + n + o;
              }
            }
          }
        }
      }
    }
  }

  h_b_expected = 2 * h_a;

  gt::copy(h_a, a);

  device_double_add_7d(a, h_b);

  EXPECT_EQ(h_b, h_b_expected);
}

#endif // GTENSOR_HAVE_DEVICE