forked from GPUOpen-Drivers/llvm-project
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[libomptarget] Implement host plugin for amdgpu
[libomptarget] Implement host plugin for amdgpu Replacement for D71384. Primary difference is inlining the dependency on atmi followed by extensive simplification and bugfixes. This is the latest version from https://github.com/ROCm-Developer-Tools/amd-llvm-project/tree/aomp12 with minor patches and a rename from hsa to amdgpu, on the basis that this can't be used by other implementations of hsa without additional work. This will not build unless the ROCM_DIR variable is passed so won't break other builds. That variable is used to locate two amdgpu specific libraries that ship as part of rocm: libhsakmt at https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface libhsa-runtime64 at https://github.com/RadeonOpenCompute/ROCR-Runtime These libraries build from source. The build scripts in those repos are for shared libraries, but can be adapted to statically link both into this plugin. There are caveats. - This works well enough to run various tests and benchmarks, and will be used to support the current clang bring up - It is adequately thread safe for the above but there will be races remaining - It is not stylistically correct for llvm, though has had clang-format run - It has suboptimal memory management and locking strategies - The debug printing / error handling is inconsistent I would like to contribute this pretty much as-is and then improve it in-tree. This would be advantageous because the aomp12 branch that was in use for fixing this codebase has just been joined with the amd internal rocm dev process. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D85742
- Loading branch information
1 parent
a49b05b
commit d0b3129
Showing
21 changed files
with
5,193 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
##===----------------------------------------------------------------------===## | ||
# | ||
# The LLVM Compiler Infrastructure | ||
# | ||
# This file is dual licensed under the MIT and the University of Illinois Open | ||
# Source Licenses. See LICENSE.txt for details. | ||
# | ||
##===----------------------------------------------------------------------===## | ||
# | ||
# Build a plugin for an AMDGPU machine if available. | ||
# | ||
##===----------------------------------------------------------------------===## | ||
|
||
################################################################################ | ||
|
||
# libelf is linked into the plugin below, so skip the plugin when it was not
# found.
if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
  libomptarget_say("Not building AMDGPU plugin: LIBELF not found")
  return()
endif()

# ROCM_DIR is the root from which the HSA headers and libraries are located.
if(NOT ROCM_DIR)
  libomptarget_say("Not building AMDGPU plugin: ROCM_DIR is not set")
  return()
endif()

set(LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS
  "${ROCM_DIR}/hsa/include"
  "${ROCM_DIR}/hsa/include/hsa"
)
set(LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS "${ROCM_DIR}/hsa/lib")
set(LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS "${ROCM_DIR}/lib")

mark_as_advanced(
  LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS
  LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS
)

# Skip unless the host is Linux on one of the supported processors. CMake's
# if() evaluates NOT before AND, so the combined test must be parenthesized:
# without the parentheses the plugin was only skipped on Linux hosts with an
# unsupported processor and was still attempted on every non-Linux host.
# The processor pattern is anchored on both ends so that it is an exact match.
if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|ppc64le|aarch64)$"
        AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
  libomptarget_say("Not building amdgpu plugin: only support amdgpu in Linux x86_64, ppc64le, or aarch64 hosts.")
  return()
endif()
libomptarget_say("Building amdgpu offloading plugin using ROCM_DIR = ${ROCM_DIR}")

libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS: ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}")
libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS ${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS}")
libomptarget_say("LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS: ${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}")

################################################################################
# The plugin itself: a shared library built from the inlined ATMI sources and
# the runtime library entry points in src/rtl.cpp.
add_library(omptarget.rtl.amdgpu SHARED
  impl/atmi.cpp
  impl/atmi_interop_hsa.cpp
  impl/data.cpp
  impl/machine.cpp
  impl/system.cpp
  impl/utils.cpp
  impl/msgpack.cpp
  src/rtl.cpp
)

# Define the suffix for the runtime messaging dumps. DEBUG is added through a
# generator expression so it also works with multi-config generators, where
# CMAKE_BUILD_TYPE is empty at configure time.
target_compile_definitions(omptarget.rtl.amdgpu PRIVATE
  TARGET_NAME=AMDGPU
  $<$<CONFIG:Debug>:DEBUG>
)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
  target_compile_definitions(omptarget.rtl.amdgpu PRIVATE LITTLEENDIAN_CPU=1)
endif()

# HSA headers from ROCM_DIR plus the plugin's own impl/ headers; scoped to this
# target only.
target_include_directories(omptarget.rtl.amdgpu PRIVATE
  ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}
  "${CMAKE_CURRENT_SOURCE_DIR}/impl"
)

# Install plugin under the lib destination folder.
# When we build for debug, OPENMP_LIBDIR_SUFFIX get set to -debug
install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "lib${OPENMP_LIBDIR_SUFFIX}")

# Link against the HSA runtime and thunk libraries found under ROCM_DIR and
# record their directories as rpaths. The version script limits the exported
# symbols, and -z,defs makes unresolved symbols a link-time error.
target_link_libraries(omptarget.rtl.amdgpu PRIVATE
  -lpthread
  -ldl
  -Wl,-rpath,${OPENMP_INSTALL_LIBDIR}
  -L${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS}
  -L${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}
  -lhsa-runtime64
  -lhsakmt
  -Wl,-rpath,${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS},-rpath,${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}
  -lelf
  "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
  "-Wl,-z,defs"
)

# Report to the parent scope that we are building a plugin for amdgpu
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
/*===-------------------------------------------------------------------------- | ||
* ATMI (Asynchronous Task and Memory Interface) | ||
* | ||
* This file is distributed under the MIT License. See LICENSE.txt for details. | ||
*===------------------------------------------------------------------------*/ | ||
#include "rt.h" | ||
/* | ||
* Initialize/Finalize | ||
*/ | ||
// C-style entry point: forwards to core::Runtime::Initialize() and hands its
// status back unchanged.
atmi_status_t atmi_init() {
  const atmi_status_t status = core::Runtime::Initialize();
  return status;
}
|
||
// C-style entry point: forwards to core::Runtime::Finalize() and hands its
// status back unchanged.
atmi_status_t atmi_finalize() {
  const atmi_status_t status = core::Runtime::Finalize();
  return status;
}
|
||
/* | ||
* Machine Info | ||
*/ | ||
// Returns whatever machine description core::Runtime::GetMachineInfo() yields;
// no copy or validation is performed here.
atmi_machine_t *atmi_machine_get_info() {
  atmi_machine_t *machine = core::Runtime::GetMachineInfo();
  return machine;
}
|
||
/* | ||
* Modules | ||
*/ | ||
// Registers an in-memory module for the given place by forwarding every
// argument, in order, to RegisterModuleFromMemory on the runtime instance.
//
//   module_bytes / module_size  - the module image and its size
//   place                       - where the module is registered
//   on_deserialized_data        - callback passed through to the runtime
//   cb_state                    - opaque pointer passed through alongside it
//
// The runtime's status is returned unchanged.
atmi_status_t atmi_module_register_from_memory_to_place(
    void *module_bytes, size_t module_size, atmi_place_t place,
    atmi_status_t (*on_deserialized_data)(void *data, size_t size,
                                          void *cb_state),
    void *cb_state) {
  auto &&runtime = core::Runtime::getInstance();
  return runtime.RegisterModuleFromMemory(module_bytes, module_size, place,
                                          on_deserialized_data, cb_state);
}
|
||
/* | ||
* Data | ||
*/ | ||
// Copies `size` bytes from `src` to `dest` via core::Runtime::Memcpy and
// returns its status unchanged.
atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size) {
  const atmi_status_t status = core::Runtime::Memcpy(dest, src, size);
  return status;
}
|
||
// Releases `ptr` through core::Runtime::Memfree and returns its status
// unchanged.
atmi_status_t atmi_free(void *ptr) {
  const atmi_status_t status = core::Runtime::Memfree(ptr);
  return status;
}
|
||
// Requests `size` bytes at memory place `place` through core::Runtime::Malloc;
// `ptr` is passed through as the out-parameter and the runtime's status is
// returned unchanged.
atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) {
  const atmi_status_t status = core::Runtime::Malloc(ptr, size, place);
  return status;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
/*===-------------------------------------------------------------------------- | ||
* ATMI (Asynchronous Task and Memory Interface) | ||
* | ||
* This file is distributed under the MIT License. See LICENSE.txt for details. | ||
*===------------------------------------------------------------------------*/ | ||
#ifndef INCLUDE_ATMI_H_ | ||
#define INCLUDE_ATMI_H_ | ||
|
||
#define ROCM_VERSION_MAJOR 3 | ||
#define ROCM_VERSION_MINOR 2 | ||
|
||
/** \defgroup enumerations Enumerated Types | ||
* @{ | ||
*/ | ||
|
||
/** | ||
* @brief Status codes. | ||
*/ | ||
typedef enum atmi_status_t {
  ATMI_STATUS_SUCCESS = 0, // The function has been executed successfully.
  ATMI_STATUS_UNKNOWN = 1, // An undocumented error has occurred.
  ATMI_STATUS_ERROR = 2,   // A generic error has occurred.
} atmi_status_t;
|
||
/** | ||
* @brief Device Types. | ||
*/ | ||
typedef enum atmi_devtype_s {
  // Each basic device type occupies its own hex digit so that types can be
  // OR-ed together into masks.
  ATMI_DEVTYPE_CPU = 1 << 0,  // 0x0001
  ATMI_DEVTYPE_iGPU = 1 << 4, // 0x0010, integrated GPU
  ATMI_DEVTYPE_dGPU = 1 << 8, // 0x0100, discrete GPU
  // Any GPU: integrated or discrete.
  ATMI_DEVTYPE_GPU = ATMI_DEVTYPE_iGPU | ATMI_DEVTYPE_dGPU,
  // Union of all device types (0x111).
  ATMI_DEVTYPE_ALL = ATMI_DEVTYPE_CPU | ATMI_DEVTYPE_GPU
} atmi_devtype_t;
|
||
/** | ||
* @brief Memory Access Type. | ||
*/ | ||
typedef enum atmi_memtype_s {
  ATMI_MEMTYPE_FINE_GRAINED = 0,
  ATMI_MEMTYPE_COARSE_GRAINED = 1,
  ATMI_MEMTYPE_ANY = 2 // follows COARSE_GRAINED; value spelled out explicitly
} atmi_memtype_t;
|
||
/** | ||
* @brief ATMI Memory Fences for Tasks. | ||
*/ | ||
typedef enum atmi_task_fence_scope_s {
  // No memory fence applied; external fences have to be applied around the
  // task launch/completion.
  ATMI_FENCE_SCOPE_NONE = 0,
  // The fence is applied to the device.
  ATMI_FENCE_SCOPE_DEVICE = 1,
  // The fence is applied to the entire system.
  ATMI_FENCE_SCOPE_SYSTEM = 2
} atmi_task_fence_scope_t;
|
||
/** @} */ | ||
|
||
/** \defgroup common Common ATMI Structures | ||
* @{ | ||
*/ | ||
|
||
/** | ||
* @brief ATMI Compute Place | ||
*/ | ||
typedef struct atmi_place_s { | ||
/** | ||
* The node in a cluster where computation should occur. | ||
* Default is node_id = 0 for local computations. | ||
*/ | ||
unsigned int node_id; | ||
/** | ||
* Device type: CPU, GPU or DSP | ||
*/ | ||
atmi_devtype_t type; | ||
/** | ||
* The device ordinal number ordered by runtime; -1 for any | ||
*/ | ||
int device_id; | ||
} atmi_place_t; | ||
|
||
/** | ||
* @brief ATMI Memory Place | ||
*/ | ||
typedef struct atmi_mem_place_s { | ||
/** | ||
* The node in a cluster where computation should occur. | ||
* Default is node_id = 0 for local computations. | ||
*/ | ||
unsigned int node_id; | ||
/** | ||
* Device type: CPU, GPU or DSP | ||
*/ | ||
atmi_devtype_t dev_type; | ||
/** | ||
* The device ordinal number ordered by runtime; -1 for any | ||
*/ | ||
int dev_id; | ||
// atmi_memtype_t mem_type; // Fine grained or Coarse grained | ||
/** | ||
* The memory space/region ordinal number ordered by runtime; -1 for any | ||
*/ | ||
int mem_id; | ||
} atmi_mem_place_t; | ||
|
||
/** | ||
* @brief ATMI Memory Space/region Structure | ||
*/ | ||
typedef struct atmi_memory_s { | ||
/** | ||
* Memory capacity | ||
*/ | ||
unsigned long int capacity; | ||
/** | ||
* Memory type | ||
*/ | ||
atmi_memtype_t type; | ||
} atmi_memory_t; | ||
|
||
/** | ||
* @brief ATMI Device Structure | ||
*/ | ||
typedef struct atmi_device_s { | ||
/** | ||
* Device type: CPU, GPU or DSP | ||
*/ | ||
atmi_devtype_t type; | ||
/** | ||
* The number of compute cores | ||
*/ | ||
unsigned int core_count; | ||
/** | ||
* The number of memory spaces/regions that are accessible | ||
* from this device | ||
*/ | ||
unsigned int memory_count; | ||
/** | ||
* Array of memory spaces/regions that are accessible | ||
* from this device. | ||
*/ | ||
atmi_memory_t *memories; | ||
} atmi_device_t; | ||
|
||
/** | ||
* @brief ATMI Machine Structure | ||
*/ | ||
typedef struct atmi_machine_s { | ||
/** | ||
* The number of devices categorized by the device type | ||
*/ | ||
unsigned int device_count_by_type[ATMI_DEVTYPE_ALL]; | ||
/** | ||
* The device structures categorized by the device type | ||
*/ | ||
atmi_device_t *devices_by_type[ATMI_DEVTYPE_ALL]; | ||
} atmi_machine_t; | ||
|
||
// Below are some helper macros that can be used to setup | ||
// some of the ATMI data structures. | ||
// Compute places: brace initializers for atmi_place_t.
#define ATMI_PLACE_CPU(node, cpu_id)                                           \
  { .node_id = node, .type = ATMI_DEVTYPE_CPU, .device_id = cpu_id }
#define ATMI_PLACE_GPU(node, gpu_id)                                           \
  { .node_id = node, .type = ATMI_DEVTYPE_GPU, .device_id = gpu_id }

// Memory places with an explicit memory region: brace initializers for
// atmi_mem_place_t.
#define ATMI_MEM_PLACE_CPU_MEM(node, cpu_id, cpu_mem_id)                       \
  {                                                                            \
    .node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id,           \
    .mem_id = cpu_mem_id                                                       \
  }
#define ATMI_MEM_PLACE_GPU_MEM(node, gpu_id, gpu_mem_id)                       \
  {                                                                            \
    .node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id,           \
    .mem_id = gpu_mem_id                                                       \
  }

// Memory places for "any region" on a device: the same initializers with
// mem_id fixed to -1.
#define ATMI_MEM_PLACE_CPU(node, cpu_id) ATMI_MEM_PLACE_CPU_MEM(node, cpu_id, -1)
#define ATMI_MEM_PLACE_GPU(node, gpu_id) ATMI_MEM_PLACE_GPU_MEM(node, gpu_id, -1)

// Fully explicit memory place on node 0.
#define ATMI_MEM_PLACE(d_type, d_id, m_id)                                     \
  { .node_id = 0, .dev_type = d_type, .dev_id = d_id, .mem_id = m_id }
|
||
#endif // INCLUDE_ATMI_H_ |
Oops, something went wrong.