diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index 5aaa7e938d279..f09541b0d5580 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -41,7 +41,9 @@ static void call_fini_array_callbacks() { } // namespace LIBC_NAMESPACE_DECL -extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void +extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel, + clang::amdgpu_flat_work_group_size(1, 1), + clang::amdgpu_max_num_work_groups(1)]] void _begin(int argc, char **argv, char **env) { __atomic_store_n(&LIBC_NAMESPACE::app.env_ptr, reinterpret_cast(env), __ATOMIC_RELAXED); @@ -60,7 +62,9 @@ _start(int argc, char **argv, char **envp, int *ret) { __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED); } -extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void +extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel, + clang::amdgpu_flat_work_group_size(1, 1), + clang::amdgpu_max_num_work_groups(1)]] void _end(int retval) { // Only a single thread should call `exit` here, the rest should gracefully // return from the kernel. This is so only one thread calls the destructors