Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CI] Update CI docker and suppress warnings #2333

Merged
merged 23 commits into from
Aug 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ set(BOOST_COMPONENTS filesystem)
add_definitions(-DBOOST_ALL_NO_LIB=1)
find_package(Boost REQUIRED COMPONENTS ${BOOST_COMPONENTS})

find_path(HALF_INCLUDE_DIR half.hpp)
find_path(HALF_INCLUDE_DIR half/half.hpp)
message(STATUS "HALF_INCLUDE_DIR: ${HALF_INCLUDE_DIR}")

option( MIOPEN_DEBUG_FIND_DB_CACHING "Use system find-db caching" ON)
Expand Down Expand Up @@ -707,6 +707,13 @@ enable_clang_tidy(
-cppcoreguidelines-avoid-const-or-ref-data-members
-cppcoreguidelines-avoid-do-while
-misc-use-anonymous-namespace
###################################################################
# TODO Code Quality WORKAROUND ROCm 5.7
###################################################################
-llvmlibc-inline-function-decl
-cppcoreguidelines-avoid-capture-default-when-capturing-this
-cppcoreguidelines-rvalue-reference-param-not-moved
-readability-avoid-unconditional-preprocessor-if
${MIOPEN_TIDY_CHECKS}
${MIOPEN_TIDY_ERRORS}
HEADER_FILTER
Expand Down
10 changes: 5 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@ DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
ENV APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=DontWarn
RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/trusted.gpg.d/rocm-keyring.gpg

RUN wget https://repo.radeon.com/amdgpu-install/5.6/ubuntu/focal/amdgpu-install_5.6.50600-1_all.deb --no-check-certificate
RUN wget https://repo.radeon.com/amdgpu-install/.5.7/ubuntu/focal/amdgpu-install_5.7.50700-1_all.deb --no-check-certificate
RUN apt-get update && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
./amdgpu-install_5.6.50600-1_all.deb
./amdgpu-install_5.7.50700-1_all.deb

# Add rocm repository
RUN export ROCM_APT_VER=5.6;\
RUN export ROCM_APT_VER=5.7;\
echo $ROCM_APT_VER &&\
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/$ROCM_APT_VER/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list' &&\
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/$ROCM_APT_VER focal main > /etc/apt/sources.list.d/rocm.list'
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/amdgpu/.$ROCM_APT_VER/ubuntu focal main > /etc/apt/sources.list.d/amdgpu.list' &&\
sh -c 'echo deb [arch=amd64 signed-by=/etc/apt/trusted.gpg.d/rocm-keyring.gpg] https://repo.radeon.com/rocm/apt/.apt_$ROCM_APT_VER focal main > /etc/apt/sources.list.d/rocm.list'
RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list"

RUN amdgpu-install -y --usecase=rocm --no-dkms
Expand Down
24 changes: 12 additions & 12 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def cmake_build(Map conf=[:]){
def fin_build_cmd = cmake_fin_build_cmd(miopen_install_path)
cmd += """
export RETDIR=\$PWD
cd ${env.WORKSPACE}/fin
cd ${env.WORKSPACE}/fin
${fin_build_cmd}
cd \$RETDIR
"""
Expand Down Expand Up @@ -191,7 +191,7 @@ def getDockerImage(Map conf=[:])
{
env.DOCKER_BUILDKIT=1
def prefixpath = conf.get("prefixpath", "/opt/rocm") // one image for each prefix 1: /usr/local 2:/opt/rocm
def gpu_arch = "gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx1102" // prebuilt dockers should have all the architectures enabled so one image can be used for all stages
def gpu_arch = "gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" // prebuilt dockers should have all the architectures enabled so one image can be used for all stages
def miotensile_version = conf.get("miotensile_version", "default") // deprecated
def target_id = conf.get("target_id", "OFF") // deprecated
def mlir_build = conf.get("mlir_build", "ON") // always ON
Expand All @@ -203,7 +203,7 @@ def getDockerImage(Map conf=[:])
{
echo "FOUND CCACHE SERVER: ${CCACHE_HOST}"
}
else
else
{
echo "CCACHE SERVER: ${CCACHE_HOST} NOT FOUND, got ${check_host} response"
}
Expand All @@ -230,7 +230,7 @@ def getDockerImage(Map conf=[:])
dockerImage = docker.build("${image}", "${dockerArgs} .")
withDockerRegistry([ credentialsId: "docker_test_cred", url: "" ]) {
dockerImage.push()
}
}
}
return [dockerImage, image]
}
Expand Down Expand Up @@ -622,7 +622,7 @@ pipeline {
stage('Fp32 Hip AnyGPU') {
when {
beforeAgent true
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A || params.TARGET_NAVI21 }
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A }
}
options {
retry(2)
Expand All @@ -635,7 +635,7 @@ pipeline {
stage('Fp32 Hip Debug AnyGPU') {
when {
beforeAgent true
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A || params.TARGET_NAVI21 }
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A }
}
options {
retry(2)
Expand Down Expand Up @@ -681,7 +681,7 @@ pipeline {
stage('Fp32 Hip Debug NOCOMGR AnyGPU') {
when {
beforeAgent true
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A || params.TARGET_NAVI21 }
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A }
}
options {
retry(2)
Expand Down Expand Up @@ -714,7 +714,7 @@ pipeline {
stage('Fp32 Hip Static AnyGPU') {
when {
beforeAgent true
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A || params.TARGET_NAVI21 }
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A }
}
options {
retry(2)
Expand All @@ -727,7 +727,7 @@ pipeline {
stage('Fp32 Hip Normal-Find AnyGPU') {
when {
beforeAgent true
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A || params.TARGET_NAVI21 }
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A }
}
options {
retry(2)
Expand All @@ -744,7 +744,7 @@ pipeline {
stage('Fp32 Hip Fast-Find AnyGPU') {
when {
beforeAgent true
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A || params.TARGET_NAVI21 }
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A }
}
options {
retry(2)
Expand All @@ -761,7 +761,7 @@ pipeline {
stage('Fp32 Hip AnyGPU') {
when {
beforeAgent true
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A || params.TARGET_NAVI21 }
expression { params.TARGET_VEGA20 || params.TARGET_VEGA10 || params.TARGET_GFX908 || params.TARGET_GFX90A }
}
options {
retry(2)
Expand Down Expand Up @@ -917,7 +917,7 @@ pipeline {
}
agent{ label rocmnode("navi21") }
steps{
buildHipClangJobAndReboot(setup_flags: Full_test + Fp16_flags)
buildHipClangJobAndReboot(setup_flags: Full_test + Fp16_flags, build_cmd: Navi21_build_cmd)
}
}
stage('Fp32 Hip All gfx908') {
Expand Down
2 changes: 1 addition & 1 deletion fin
Submodule fin updated from ebf9b3 to b2f3f4
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
boost@1.79 -DCMAKE_POSITION_INDEPENDENT_CODE=On --build -DCMAKE_CXX_FLAGS=" -std=c++14 -Wno-enum-constexpr-conversion "
ROCmSoftwarePlatform/half@4f19ce3e56f3d3a17cf69f9db4ff3722f7445b0d --build
boost@1.79 -DCMAKE_POSITION_INDEPENDENT_CODE=On --build -DCMAKE_CXX_FLAGS=" -std=c++14 -Wno-enum-constexpr-conversion -Wno-deprecated-builtins -Wno-deprecated-declarations "
ROCmSoftwarePlatform/half@10abd99e7815f0ca5d892f58dd7d15a23b7cf92c --build
ROCmSoftwarePlatform/rocMLIR@rocm-5.5.0 -H sha256:a5f62769d28a73e60bc8d61022820f050e97c977c8f6f6275488db31512e1f42 -DBUILD_FAT_LIBROCKCOMPILER=1 -DCMAKE_IGNORE_PATH=/opt/conda/envs/py_3.9 -DCMAKE_IGNORE_PREFIX_PATH=/opt/conda
nlohmann/json@v3.9.1 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off
ROCmSoftwarePlatform/FunctionalPlus@v0.2.18-p0
ROCmSoftwarePlatform/eigen@3.4.0
ROCmSoftwarePlatform/frugally-deep@9683d557eb672ee2304f80f6682c51242d748a50
ROCmSoftwarePlatform/composable_kernel@f0831350d15c3d368d7ae321dd08441d6569086e -DDTYPES="fp16;fp32;bf16" -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON
ROCmSoftwarePlatform/composable_kernel@0629870d27397ab10a90fde6d7050f3e3d65fc2f -DDTYPES="fp16;fp32;bf16" -DCMAKE_BUILD_TYPE=Release -DINSTANCES_ONLY=ON
1 change: 1 addition & 0 deletions src/comgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,7 @@ void BuildHip(const std::string& name,
opts.push_back("-Wno-newline-eof");
opts.push_back("-Wno-reserved-identifier");
opts.push_back("-Wno-old-style-cast");
opts.push_back("-Wno-extra-semi-stmt");
#endif
#if WORKAROUND_ISSUE_1674
opts.push_back("-Wno-gnu-line-marker");
Expand Down
26 changes: 23 additions & 3 deletions src/hip/handlehip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@
#define MIOPEN_WORKAROUND_ROCM_COMPILER_SUPPORT_ISSUE_30 \
(MIOPEN_USE_COMGR && BUILD_SHARED_LIBS && (HIP_PACKAGE_VERSION_FLAT < 4003000000ULL))

/// hipMemGetInfo constantly fails on gfx906/900 and Navi21.
/// Brute-force W/A: return fixed values.
#define WORKAROUND_FAULTY_HIPMEMGETINFO_VEGA_NAVI2X (ROCM_FEATURE_DEPRECATED_VEGA_NAVI2X)

MIOPEN_DECLARE_ENV_VAR(MIOPEN_DEVICE_CU)

namespace miopen {
Expand All @@ -72,10 +76,26 @@ void toCallHipInit() __attribute__((constructor(1000)));
void toCallHipInit() { hipInit(0); }
#endif

hipError_t hip_mem_get_info_wrapper(std::size_t* const free, std::size_t* const total)
{
#if WORKAROUND_FAULTY_HIPMEMGETINFO_VEGA_NAVI2X
const auto status = hipMemGetInfo(free, total);
if(status == hipSuccess)
return status;
MIOPEN_LOG_W("hipMemGetInfo error, status: " << status);
assert(free != nullptr && total != nullptr);
*free = 16ULL * 1024 * 1024 * 1024; // 16 GiB
*total = *free;
return hipSuccess;
#else
return hipMemGetInfo(free, total);
#endif
}

std::size_t GetAvailableMemory()
{
size_t free, total;
auto status = hipMemGetInfo(&free, &total);
auto status = hip_mem_get_info_wrapper(&free, &total);
if(status != hipSuccess)
MIOPEN_THROW_HIP_STATUS(status, "Failed getting available memory");
return free;
Expand Down Expand Up @@ -289,7 +309,7 @@ Handle::Handle() : impl(std::make_unique<HandleImpl>())
this->impl->device = set_default_device();
this->impl->root_stream = impl->create_stream();
#else
this->impl->device = get_device_id();
this->impl->device = get_device_id();
this->impl->root_stream = HandleImpl::reference_stream(nullptr);
#endif
auto root_stream = this->impl->root_stream.get();
Expand Down Expand Up @@ -640,7 +660,7 @@ std::size_t Handle::GetMaxMemoryAllocSize()
if(m_MaxMemoryAllocSizeCached == 0)
{
size_t free, total;
auto status = hipMemGetInfo(&free, &total);
auto status = hip_mem_get_info_wrapper(&free, &total);
if(status != hipSuccess)
MIOPEN_THROW_HIP_STATUS(status, "Failed getting available memory");
m_MaxMemoryAllocSizeCached = floor(total * 0.85);
Expand Down
9 changes: 4 additions & 5 deletions src/include/miopen/hipoc_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,22 +45,21 @@ inline HipEventPtr make_hip_event()
return HipEventPtr{result};
}

#if 1
#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017

#if 1
#if 1 // Keep around other storage techinques -- @pfultz2 27.03.2017
template <class T, class U>
struct KernelArgsPair
{
static const int alignment = sizeof(U);
static const int padding = (alignment - (sizeof(T) % alignment)) % alignment;
static const int padding = (alignment - sizeof(T) % alignment) % alignment;
static const int second_index = sizeof(T) + padding;
KernelArgsPair(T x, U y)
{

new(buffer) T(x); // NOLINT (clang-analyzer-cplusplus.PlacementNew)
new(buffer + second_index) U(y);
}
char buffer[second_index + sizeof(U)] = {};
alignas(U) char buffer[second_index + sizeof(U)] = {};
Copy link
Contributor

@atamazov atamazov Aug 22, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This may change the layout of kernel arguments in memory. Without alignas(U), the required buffer alignment is 1 (i.e., no alignment is required). Therefore, the alignment of KernelArgsPair instances is also 1, and several such instances reside in memory without any gaps.

With alignas(U), the padding required for KernelArgsPair is alginof(U), which may lead to gaps between instances of KernelArgsPair.

If you see kernel failures, then please revert this change.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@atamazov I do see lots of failures on gfx906 with

handlehip.cpp:80: Failed getting available memory: invalid argument

However, it looks more likely a runtime issue since it cannot be reproduced on other newer ASICs. But I will revert this change and try again.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@junliume This is not related and I see the same on navi21.

Reverting this change won't resolve the issue with getting available memory.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@junliume It seems like this change is indeed correct and should be kept.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@atamazov can you verify it happens on Navi21?
For Vega nodes some have problems but some other nodes do not.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@junliume Of course, this happens on Navi21, as I've reported a while ago at #2307 (comment) where you can find the dirty hacks for this. I am working on more or less regular W/A which should be suitable for merging into develop.

};
#else
template <class T, class U>
Expand Down
3 changes: 3 additions & 0 deletions src/include/miopen/rocm_features.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,7 @@
#define ROCM_FEATURE_LLVM_AMDGCN_BUFFER_ATOMIC_FADD_F32_RETURNS_FLOAT \
(HIP_PACKAGE_VERSION_FLAT >= 4001021072ULL)

/// GFX906 and GFX103X are deprecated since 5.7 RC.
#define ROCM_FEATURE_DEPRECATED_VEGA_NAVI2X (HIP_PACKAGE_VERSION_FLAT >= 5007000000ULL)

#endif // GUARD_ROCM_FEATURES_HPP_