Skip to content

Commit

Permalink
[libomptarget] Implement host plugin for amdgpu
Browse files Browse the repository at this point in the history
[libomptarget] Implement host plugin for amdgpu

Replacement for D71384. Primary difference is inlining the dependency on atmi
followed by extensive simplification and bugfixes. This is the latest version
from https://github.com/ROCm-Developer-Tools/amd-llvm-project/tree/aomp12 with
minor patches and a rename from hsa to amdgpu, on the basis that this can't be
used by other implementations of hsa without additional work.

This will not build unless the ROCM_DIR variable is passed so won't break other
builds. That variable is used to locate two amdgpu specific libraries that ship
as part of rocm:
libhsakmt at https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface
libhsa-runtime64 at https://github.com/RadeonOpenCompute/ROCR-Runtime
These libraries build from source. The build scripts in those repos are for
shared libraries, but can be adapted to statically link both into this plugin.

There are caveats.
- This works well enough to run various tests and benchmarks, and will be used
  to support the current clang bring up
- It is adequately thread safe for the above but there will be races remaining
- It is not stylistically correct for llvm, though has had clang-format run
- It has suboptimal memory management and locking strategies
- The debug printing / error handling is inconsistent

I would like to contribute this pretty much as-is and then improve it in-tree.
This would be advantagous because the aomp12 branch that was in use for fixing
this codebase has just been joined with the amd internal rocm dev process.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D85742
  • Loading branch information
JonChesterfield committed Aug 15, 2020
1 parent a49b05b commit d0b3129
Show file tree
Hide file tree
Showing 21 changed files with 5,193 additions and 1 deletion.
1 change: 1 addition & 0 deletions openmp/libomptarget/plugins/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ endif()
endmacro()

add_subdirectory(aarch64)
add_subdirectory(amdgpu)
add_subdirectory(cuda)
add_subdirectory(ppc64)
add_subdirectory(ppc64le)
Expand Down
84 changes: 84 additions & 0 deletions openmp/libomptarget/plugins/amdgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
##===----------------------------------------------------------------------===##
#
# The LLVM Compiler Infrastructure
#
# This file is dual licensed under the MIT and the University of Illinois Open
# Source Licenses. See LICENSE.txt for details.
#
##===----------------------------------------------------------------------===##
#
# Build a plugin for an AMDGPU machine if available.
#
##===----------------------------------------------------------------------===##

################################################################################

if(NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
libomptarget_say("Not building AMDGPU plugin: LIBELF not found")
return()
endif()

if(NOT ROCM_DIR)
libomptarget_say("Not building AMDGPU plugin: ROCM_DIR is not set")
return()
endif()

set(LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS ${ROCM_DIR}/hsa/include ${ROCM_DIR}/hsa/include/hsa)
set(LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS ${ROCM_DIR}/hsa/lib)
set(LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS ${ROCM_DIR}/lib)

mark_as_advanced( LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS)

if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")
libomptarget_say("Not building amdgpu plugin: only support amdgpu in Linux x86_64, ppc64le, or aarch64 hosts.")
return()
endif()
libomptarget_say("Building amdgpu offloading plugin using ROCM_DIR = ${ROCM_DIR}")

libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS: ${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}")
libomptarget_say("LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS ${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS}")
libomptarget_say("LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS: ${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}")

################################################################################
# Define the suffix for the runtime messaging dumps.
add_definitions(-DTARGET_NAME=AMDGPU)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "(ppc64le)|(aarch64)$")
add_definitions(-DLITTLEENDIAN_CPU=1)
endif()

if(CMAKE_BUILD_TYPE MATCHES Debug)
add_definitions(-DDEBUG)
endif()

include_directories(
${LIBOMPTARGET_DEP_LIBHSA_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/impl
)

add_library(omptarget.rtl.amdgpu SHARED
impl/atmi.cpp
impl/atmi_interop_hsa.cpp
impl/data.cpp
impl/machine.cpp
impl/system.cpp
impl/utils.cpp
impl/msgpack.cpp
src/rtl.cpp
)

# Install plugin under the lib destination folder.
# When we build for debug, OPENMP_LIBDIR_SUFFIX get set to -debug
install(TARGETS omptarget.rtl.amdgpu LIBRARY DESTINATION "lib${OPENMP_LIBDIR_SUFFIX}")

target_link_libraries(
omptarget.rtl.amdgpu
-lpthread -ldl -Wl,-rpath,${OPENMP_INSTALL_LIBDIR}
-L${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS} -L${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS} -lhsa-runtime64 -lhsakmt -Wl,-rpath,${LIBOMPTARGET_DEP_LIBHSA_LIBRARIES_DIRS},-rpath,${LIBOMPTARGET_DEP_LIBHSAKMT_LIBRARIES_DIRS}
-lelf
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
"-Wl,-z,defs"
)

# Report to the parent scope that we are building a plugin for amdgpu
set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa" PARENT_SCOPE)

44 changes: 44 additions & 0 deletions openmp/libomptarget/plugins/amdgpu/impl/atmi.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*===--------------------------------------------------------------------------
* ATMI (Asynchronous Task and Memory Interface)
*
* This file is distributed under the MIT License. See LICENSE.txt for details.
*===------------------------------------------------------------------------*/
#include "rt.h"
/*
* Initialize/Finalize
*/
atmi_status_t atmi_init() { return core::Runtime::Initialize(); }

atmi_status_t atmi_finalize() { return core::Runtime::Finalize(); }

/*
* Machine Info
*/
atmi_machine_t *atmi_machine_get_info() {
return core::Runtime::GetMachineInfo();
}

/*
* Modules
*/
atmi_status_t atmi_module_register_from_memory_to_place(
void *module_bytes, size_t module_size, atmi_place_t place,
atmi_status_t (*on_deserialized_data)(void *data, size_t size,
void *cb_state),
void *cb_state) {
return core::Runtime::getInstance().RegisterModuleFromMemory(
module_bytes, module_size, place, on_deserialized_data, cb_state);
}

/*
* Data
*/
atmi_status_t atmi_memcpy(void *dest, const void *src, size_t size) {
return core::Runtime::Memcpy(dest, src, size);
}

atmi_status_t atmi_free(void *ptr) { return core::Runtime::Memfree(ptr); }

atmi_status_t atmi_malloc(void **ptr, size_t size, atmi_mem_place_t place) {
return core::Runtime::Malloc(ptr, size, place);
}
203 changes: 203 additions & 0 deletions openmp/libomptarget/plugins/amdgpu/impl/atmi.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
/*===--------------------------------------------------------------------------
* ATMI (Asynchronous Task and Memory Interface)
*
* This file is distributed under the MIT License. See LICENSE.txt for details.
*===------------------------------------------------------------------------*/
#ifndef INCLUDE_ATMI_H_
#define INCLUDE_ATMI_H_

#define ROCM_VERSION_MAJOR 3
#define ROCM_VERSION_MINOR 2

/** \defgroup enumerations Enumerated Types
* @{
*/

/**
* @brief Status codes.
*/
typedef enum atmi_status_t {
/**
* The function has been executed successfully.
*/
ATMI_STATUS_SUCCESS = 0,
/**
* A undocumented error has occurred.
*/
ATMI_STATUS_UNKNOWN = 1,
/**
* A generic error has occurred.
*/
ATMI_STATUS_ERROR = 2,
} atmi_status_t;

/**
* @brief Device Types.
*/
typedef enum atmi_devtype_s {
ATMI_DEVTYPE_CPU = 0x0001,
ATMI_DEVTYPE_iGPU = 0x0010, // Integrated GPU
ATMI_DEVTYPE_dGPU = 0x0100, // Discrete GPU
ATMI_DEVTYPE_GPU = ATMI_DEVTYPE_iGPU | ATMI_DEVTYPE_dGPU, // Any GPU
ATMI_DEVTYPE_ALL = 0x111 // Union of all device types
} atmi_devtype_t;

/**
* @brief Memory Access Type.
*/
typedef enum atmi_memtype_s {
ATMI_MEMTYPE_FINE_GRAINED = 0,
ATMI_MEMTYPE_COARSE_GRAINED = 1,
ATMI_MEMTYPE_ANY
} atmi_memtype_t;

/**
* @brief ATMI Memory Fences for Tasks.
*/
typedef enum atmi_task_fence_scope_s {
/**
* No memory fence applied; external fences have to be applied around the task
* launch/completion.
*/
ATMI_FENCE_SCOPE_NONE = 0,
/**
* The fence is applied to the device.
*/
ATMI_FENCE_SCOPE_DEVICE = 1,
/**
* The fence is applied to the entire system.
*/
ATMI_FENCE_SCOPE_SYSTEM = 2
} atmi_task_fence_scope_t;

/** @} */

/** \defgroup common Common ATMI Structures
* @{
*/

/**
* @brief ATMI Compute Place
*/
typedef struct atmi_place_s {
/**
* The node in a cluster where computation should occur.
* Default is node_id = 0 for local computations.
*/
unsigned int node_id;
/**
* Device type: CPU, GPU or DSP
*/
atmi_devtype_t type;
/**
* The device ordinal number ordered by runtime; -1 for any
*/
int device_id;
} atmi_place_t;

/**
* @brief ATMI Memory Place
*/
typedef struct atmi_mem_place_s {
/**
* The node in a cluster where computation should occur.
* Default is node_id = 0 for local computations.
*/
unsigned int node_id;
/**
* Device type: CPU, GPU or DSP
*/
atmi_devtype_t dev_type;
/**
* The device ordinal number ordered by runtime; -1 for any
*/
int dev_id;
// atmi_memtype_t mem_type; // Fine grained or Coarse grained
/**
* The memory space/region ordinal number ordered by runtime; -1 for any
*/
int mem_id;
} atmi_mem_place_t;

/**
* @brief ATMI Memory Space/region Structure
*/
typedef struct atmi_memory_s {
/**
* Memory capacity
*/
unsigned long int capacity;
/**
* Memory type
*/
atmi_memtype_t type;
} atmi_memory_t;

/**
* @brief ATMI Device Structure
*/
typedef struct atmi_device_s {
/**
* Device type: CPU, GPU or DSP
*/
atmi_devtype_t type;
/**
* The number of compute cores
*/
unsigned int core_count;
/**
* The number of memory spaces/regions that are accessible
* from this device
*/
unsigned int memory_count;
/**
* Array of memory spaces/regions that are accessible
* from this device.
*/
atmi_memory_t *memories;
} atmi_device_t;

/**
* @brief ATMI Machine Structure
*/
typedef struct atmi_machine_s {
/**
* The number of devices categorized by the device type
*/
unsigned int device_count_by_type[ATMI_DEVTYPE_ALL];
/**
* The device structures categorized by the device type
*/
atmi_device_t *devices_by_type[ATMI_DEVTYPE_ALL];
} atmi_machine_t;

// Below are some helper macros that can be used to setup
// some of the ATMI data structures.
#define ATMI_PLACE_CPU(node, cpu_id) \
{ .node_id = node, .type = ATMI_DEVTYPE_CPU, .device_id = cpu_id }
#define ATMI_PLACE_GPU(node, gpu_id) \
{ .node_id = node, .type = ATMI_DEVTYPE_GPU, .device_id = gpu_id }
#define ATMI_MEM_PLACE_CPU(node, cpu_id) \
{ \
.node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \
.mem_id = -1 \
}
#define ATMI_MEM_PLACE_GPU(node, gpu_id) \
{ \
.node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \
.mem_id = -1 \
}
#define ATMI_MEM_PLACE_CPU_MEM(node, cpu_id, cpu_mem_id) \
{ \
.node_id = node, .dev_type = ATMI_DEVTYPE_CPU, .dev_id = cpu_id, \
.mem_id = cpu_mem_id \
}
#define ATMI_MEM_PLACE_GPU_MEM(node, gpu_id, gpu_mem_id) \
{ \
.node_id = node, .dev_type = ATMI_DEVTYPE_GPU, .dev_id = gpu_id, \
.mem_id = gpu_mem_id \
}
#define ATMI_MEM_PLACE(d_type, d_id, m_id) \
{ .node_id = 0, .dev_type = d_type, .dev_id = d_id, .mem_id = m_id }

#endif // INCLUDE_ATMI_H_
Loading

0 comments on commit d0b3129

Please sign in to comment.