From 779a444009da190c47a2f820395ca001abc29b62 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Wed, 11 Sep 2024 12:22:35 -0400 Subject: [PATCH 01/94] [libc] fix tls teardown while being used (#108229) The call chain to `Mutex:lock` can be polluted by stack protector. For completely safe, let's postpone the main TLS tearing down to a separate phase. fix #108030 --- libc/src/stdlib/atexit.cpp | 6 +++++- libc/src/stdlib/quick_exit.cpp | 3 +++ libc/startup/linux/do_start.cpp | 11 ++++++----- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp index c8a15dd3cfef2df..799aad136bda5c1 100644 --- a/libc/src/stdlib/atexit.cpp +++ b/libc/src/stdlib/atexit.cpp @@ -16,6 +16,7 @@ namespace LIBC_NAMESPACE_DECL { constinit ExitCallbackList atexit_callbacks; Mutex handler_list_mtx(false, false, false, false); +[[gnu::weak]] extern void teardown_main_tls(); extern "C" { @@ -24,8 +25,11 @@ int __cxa_atexit(AtExitCallback *callback, void *payload, void *) { } void __cxa_finalize(void *dso) { - if (!dso) + if (!dso) { call_exit_callbacks(atexit_callbacks); + if (teardown_main_tls) + teardown_main_tls(); + } } } // extern "C" diff --git a/libc/src/stdlib/quick_exit.cpp b/libc/src/stdlib/quick_exit.cpp index a5abf3e05d1a13a..29110b33afcf509 100644 --- a/libc/src/stdlib/quick_exit.cpp +++ b/libc/src/stdlib/quick_exit.cpp @@ -16,9 +16,12 @@ namespace LIBC_NAMESPACE_DECL { extern ExitCallbackList at_quick_exit_callbacks; +[[gnu::weak]] extern void teardown_main_tls(); [[noreturn]] LLVM_LIBC_FUNCTION(void, quick_exit, (int status)) { call_exit_callbacks(at_quick_exit_callbacks); + if (teardown_main_tls) + teardown_main_tls(); internal::exit(status); } diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp index 72060b4adb21483..ff104c7f0d1d2f7 100644 --- a/libc/startup/linux/do_start.cpp +++ b/libc/startup/linux/do_start.cpp @@ -6,6 +6,7 @@ // 
//===----------------------------------------------------------------------===// #include "startup/linux/do_start.h" +#include "config/linux/app.h" #include "include/llvm-libc-macros/link-macros.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/macros/config.h" @@ -60,6 +61,10 @@ static void call_fini_array_callbacks() { } static ThreadAttributes main_thread_attrib; +static TLSDescriptor tls; +// We separate teardown_main_tls from callbacks as callback function themselves +// may require TLS. +void teardown_main_tls() { cleanup_tls(tls.addr, tls.size); } [[noreturn]] void do_start() { auto tid = syscall_impl(SYS_gettid); @@ -122,7 +127,6 @@ static ThreadAttributes main_thread_attrib; // This descriptor has to be static since its cleanup function cannot // capture the context. - static TLSDescriptor tls; init_tls(tls); if (tls.size != 0 && !set_thread_ptr(tls.tp)) syscall_impl(SYS_exit, 1); @@ -130,10 +134,7 @@ static ThreadAttributes main_thread_attrib; self.attrib = &main_thread_attrib; main_thread_attrib.atexit_callback_mgr = internal::get_thread_atexit_callback_mgr(); - // We register the cleanup_tls function to be the last atexit callback to be - // invoked. It will tear down the TLS. Other callbacks may depend on TLS (such - // as the stack protector canary). - atexit([]() { cleanup_tls(tls.addr, tls.size); }); + // We want the fini array callbacks to be run after other atexit // callbacks are run. So, we register them before running the init // array callbacks as they can potentially register their own atexit From fa4a631fc63bdd9ffe5598bcc744656cea6fdb56 Mon Sep 17 00:00:00 2001 From: Chris B Date: Wed, 11 Sep 2024 11:49:44 -0500 Subject: [PATCH 02/94] [NFC] [HLSL] Update test for HLSL 202x (#108097) HLSL 202x inherits from C++11, which generates additional loop hint information for loops that must progress. Since HLSL 202x is going to be the default for Clang we want to make sure all our tests pass with it. 
Required for https://github.com/llvm/llvm-project/issues/108044 --- clang/test/CodeGenHLSL/loops/unroll.hlsl | 46 +++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/clang/test/CodeGenHLSL/loops/unroll.hlsl b/clang/test/CodeGenHLSL/loops/unroll.hlsl index 7389f21dd3472ba..efca0747805d4bb 100644 --- a/clang/test/CodeGenHLSL/loops/unroll.hlsl +++ b/clang/test/CodeGenHLSL/loops/unroll.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-library -disable-llvm-passes %s -emit-llvm -o - | FileCheck %s /*** for ***/ @@ -35,8 +35,8 @@ void for_nested_one_unroll_enable() for( int j = 0; j < 10; ++j) s += i + j; } -// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE:.*]] -// CHECK-NOT: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_1_ENABLE:.*]] +// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE_INNER:.*]] +// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_NESTED_ENABLE_OUTER:.*]] } void for_nested_two_unroll_enable() @@ -111,20 +111,26 @@ void do_enable() } -// CHECK: ![[FOR_DISTINCT]] = distinct !{![[FOR_DISTINCT]], ![[FOR_COUNT:.*]]} -// CHECK: ![[FOR_COUNT]] = !{!"llvm.loop.unroll.count", i32 8} -// CHECK: ![[FOR_DISABLE]] = distinct !{![[FOR_DISABLE]], ![[DISABLE:.*]]} -// CHECK: ![[DISABLE]] = !{!"llvm.loop.unroll.disable"} -// CHECK: ![[FOR_ENABLE]] = distinct !{![[FOR_ENABLE]], ![[ENABLE:.*]]} -// CHECK: ![[ENABLE]] = !{!"llvm.loop.unroll.enable"} -// CHECK: ![[FOR_NESTED_ENABLE]] = distinct !{![[FOR_NESTED_ENABLE]], ![[ENABLE]]} -// CHECK: ![[FOR_NESTED2_ENABLE]] = distinct !{![[FOR_NESTED2_ENABLE]], ![[ENABLE]]} -// CHECK: ![[FOR_NESTED2_1_ENABLE]] = distinct !{![[FOR_NESTED2_1_ENABLE]], ![[ENABLE]]} -// CHECK: ![[WHILE_DISTINCT]] = distinct !{![[WHILE_DISTINCT]], ![[WHILE_COUNT:.*]]} -// CHECK: ![[WHILE_COUNT]] = !{!"llvm.loop.unroll.count", i32 4} -// CHECK: 
![[WHILE_DISABLE]] = distinct !{![[WHILE_DISABLE]], ![[DISABLE]]} -// CHECK: ![[WHILE_ENABLE]] = distinct !{![[WHILE_ENABLE]], ![[ENABLE]]} -// CHECK: ![[DO_DISTINCT]] = distinct !{![[DO_DISTINCT]], ![[DO_COUNT:.*]]} -// CHECK: ![[DO_COUNT]] = !{!"llvm.loop.unroll.count", i32 16} -// CHECK: ![[DO_DISABLE]] = distinct !{![[DO_DISABLE]], ![[DISABLE]]} -// CHECK: ![[DO_ENABLE]] = distinct !{![[DO_ENABLE]], ![[ENABLE]]} +// CHECK-DAG: [[MUST_PROGRESS:.*]] = !{!"llvm.loop.mustprogress"} +// CHECK-DAG: [[DISABLE:.*]] = !{!"llvm.loop.unroll.disable"} +// CHECK-DAG: [[FOR_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 8} +// CHECK-DAG: [[ENABLE:.*]] = !{!"llvm.loop.unroll.enable"} +// CHECK-DAG: [[WHILE_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 4} +// CHECK-DAG: [[DO_COUNT:.*]] = !{!"llvm.loop.unroll.count", i32 16} + +// CHECK-DAG: ![[FOR_DISTINCT]] = distinct !{![[FOR_DISTINCT]], [[MUST_PROGRESS]], [[FOR_COUNT]]} +// CHECK-DAG: ![[FOR_DISABLE]] = distinct !{![[FOR_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]} +// CHECK-DAG: ![[FOR_ENABLE]] = distinct !{![[FOR_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]} + +// CHECK-DAG: ![[FOR_NESTED_ENABLE_INNER]] = distinct !{![[FOR_NESTED_ENABLE_INNER]], [[MUST_PROGRESS]]} +// CHECK-DAG: ![[FOR_NESTED_ENABLE_OUTER]] = distinct !{![[FOR_NESTED_ENABLE_OUTER]], [[MUST_PROGRESS]], [[ENABLE]]} +// CHECK-DAG: ![[FOR_NESTED2_ENABLE]] = distinct !{![[FOR_NESTED2_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]} +// CHECK-DAG: ![[FOR_NESTED2_1_ENABLE]] = distinct !{![[FOR_NESTED2_1_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]} +// CHECK-DAG: ![[WHILE_DISTINCT]] = distinct !{![[WHILE_DISTINCT]], [[MUST_PROGRESS]], [[WHILE_COUNT]]} + +// CHECK-DAG: ![[WHILE_DISABLE]] = distinct !{![[WHILE_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]} +// CHECK-DAG: ![[WHILE_ENABLE]] = distinct !{![[WHILE_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]} +// CHECK-DAG: ![[DO_DISTINCT]] = distinct !{![[DO_DISTINCT]], [[MUST_PROGRESS]], [[DO_COUNT]]} + +// CHECK-DAG: ![[DO_DISABLE]] = distinct 
!{![[DO_DISABLE]], [[MUST_PROGRESS]], [[DISABLE]]} +// CHECK-DAG: ![[DO_ENABLE]] = distinct !{![[DO_ENABLE]], [[MUST_PROGRESS]], [[ENABLE]]} From d8e124dffaaea142d17b9911fc4de91039c8d1b1 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Wed, 11 Sep 2024 12:51:11 -0400 Subject: [PATCH 03/94] [libc] implement vdso (#91572) --- libc/hdr/CMakeLists.txt | 18 ++ libc/hdr/link_macros.h | 22 ++ libc/hdr/sys_auxv_macros.h | 22 ++ .../src/__support/OSUtil/linux/CMakeLists.txt | 30 +++ .../OSUtil/linux/aarch64/CMakeLists.txt | 10 + .../src/__support/OSUtil/linux/aarch64/vdso.h | 37 +++ .../__support/OSUtil/linux/arm/CMakeLists.txt | 10 + libc/src/__support/OSUtil/linux/arm/vdso.h | 37 +++ .../OSUtil/linux/riscv/CMakeLists.txt | 10 + libc/src/__support/OSUtil/linux/riscv/vdso.h | 43 ++++ libc/src/__support/OSUtil/linux/vdso.cpp | 237 ++++++++++++++++++ libc/src/__support/OSUtil/linux/vdso.h | 81 ++++++ libc/src/__support/OSUtil/linux/vdso_sym.h | 70 ++++++ .../OSUtil/linux/x86_64/CMakeLists.txt | 10 + libc/src/__support/OSUtil/linux/x86_64/vdso.h | 43 ++++ libc/src/sys/auxv/getauxval.h | 2 +- .../src/__support/OSUtil/linux/CMakeLists.txt | 18 ++ .../src/__support/OSUtil/linux/vdso_test.cpp | 162 ++++++++++++ 18 files changed, 861 insertions(+), 1 deletion(-) create mode 100644 libc/hdr/link_macros.h create mode 100644 libc/hdr/sys_auxv_macros.h create mode 100644 libc/src/__support/OSUtil/linux/aarch64/vdso.h create mode 100644 libc/src/__support/OSUtil/linux/arm/vdso.h create mode 100644 libc/src/__support/OSUtil/linux/riscv/vdso.h create mode 100644 libc/src/__support/OSUtil/linux/vdso.cpp create mode 100644 libc/src/__support/OSUtil/linux/vdso.h create mode 100644 libc/src/__support/OSUtil/linux/vdso_sym.h create mode 100644 libc/src/__support/OSUtil/linux/x86_64/vdso.h create mode 100644 libc/test/src/__support/OSUtil/linux/vdso_test.cpp diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt index a2fad9b473ed7e3..e0b65b7c2eb02df 100644 --- 
a/libc/hdr/CMakeLists.txt +++ b/libc/hdr/CMakeLists.txt @@ -143,4 +143,22 @@ add_proxy_header_library( libc.include.llvm-libc-macros.limits_macros ) +add_proxy_header_library( + link_macros + HDRS + link_macros.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-macros.link_macros + libc.include.link +) + +add_proxy_header_library( + sys_auxv_macros + HDRS + sys_auxv_macros.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-macros.sys_auxv_macros + libc.include.sys_auxv +) + add_subdirectory(types) diff --git a/libc/hdr/link_macros.h b/libc/hdr/link_macros.h new file mode 100644 index 000000000000000..8a78a864e6ce4cd --- /dev/null +++ b/libc/hdr/link_macros.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from link.h ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_LINK_MACROS_H +#define LLVM_LIBC_HDR_LINK_MACROS_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-macros/link-macros.h" + +#else // Overlay mode + +#include + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_LINK_MACROS_H diff --git a/libc/hdr/sys_auxv_macros.h b/libc/hdr/sys_auxv_macros.h new file mode 100644 index 000000000000000..c04011baedb8606 --- /dev/null +++ b/libc/hdr/sys_auxv_macros.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from sys/auxv.h ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_SYS_AUXV_MACROS_H +#define LLVM_LIBC_HDR_SYS_AUXV_MACROS_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-macros/sys-auxv-macros.h" + +#else // Overlay mode + +#include + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_SYS_AUXV_MACROS_H diff --git a/libc/src/__support/OSUtil/linux/CMakeLists.txt b/libc/src/__support/OSUtil/linux/CMakeLists.txt index 089cad454d534d7..6c7014940407d83 100644 --- a/libc/src/__support/OSUtil/linux/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/CMakeLists.txt @@ -23,3 +23,33 @@ add_object_library( libc.hdr.types.struct_f_owner_ex libc.hdr.types.off_t ) + +add_header_library( + vdso_sym + HDRS + vdso_sym.h + DEPENDS + libc.src.__support.common +) + +add_object_library( + vdso + HDRS + vdso.h + SRCS + vdso.cpp + DEPENDS + .${LIBC_TARGET_ARCHITECTURE}.vdso + libc.src.__support.CPP.array + libc.src.__support.CPP.optional + libc.src.__support.CPP.string_view + libc.src.__support.threads.callonce + libc.src.__support.threads.linux.futex_word_type + libc.hdr.types.struct_timeval + libc.hdr.types.struct_timespec + libc.hdr.types.clockid_t + libc.hdr.types.time_t + libc.hdr.link_macros + libc.src.errno.errno + libc.src.sys.auxv.getauxval +) diff --git a/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt b/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt index eea9badc46cae68..d9451a1af1df350 100644 --- a/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/aarch64/CMakeLists.txt @@ -5,3 +5,13 @@ add_header_library( DEPENDS libc.src.__support.common ) + +add_header_library( + vdso + HDRS + vdso.h + DEPENDS + libc.src.__support.common + libc.src.__support.CPP.string_view + libc.src.__support.OSUtil.linux.vdso_sym +) diff --git a/libc/src/__support/OSUtil/linux/aarch64/vdso.h 
b/libc/src/__support/OSUtil/linux/aarch64/vdso.h new file mode 100644 index 000000000000000..3c4c6205071da2c --- /dev/null +++ b/libc/src/__support/OSUtil/linux/aarch64/vdso.h @@ -0,0 +1,37 @@ +//===---------- aarch64 vdso configuration ------------------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/linux/vdso_sym.h" +namespace LIBC_NAMESPACE_DECL { +namespace vdso { +// translate VDSOSym to symbol names +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/kernel/vdso/vdso.lds.S +LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) { + switch (sym) { + case VDSOSym::RTSigReturn: + return "__kernel_rt_sigreturn"; + case VDSOSym::GetTimeOfDay: + return "__kernel_gettimeofday"; + case VDSOSym::ClockGetTime: + return "__kernel_clock_gettime"; + case VDSOSym::ClockGetRes: + return "__kernel_clock_getres"; + default: + return ""; + } +} + +// symbol versions +LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) { + return "LINUX_2.6.39"; +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_AARCH64_VDSO_H diff --git a/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt b/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt index 733366f6d4a2e38..d991f7e0914796b 100644 --- a/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/arm/CMakeLists.txt @@ -5,3 +5,13 @@ add_header_library( DEPENDS libc.src.__support.common ) + +add_header_library( + vdso + HDRS + vdso.h + DEPENDS + 
libc.src.__support.common + libc.src.__support.CPP.string_view + libc.src.__support.OSUtil.linux.vdso_sym +) diff --git a/libc/src/__support/OSUtil/linux/arm/vdso.h b/libc/src/__support/OSUtil/linux/arm/vdso.h new file mode 100644 index 000000000000000..3de5860359c1551 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/arm/vdso.h @@ -0,0 +1,37 @@ +//===---------- arm vdso configuration ----------------------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/linux/vdso_sym.h" +namespace LIBC_NAMESPACE_DECL { +namespace vdso { +// translate VDSOSym to symbol names +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm/vdso/vdso.lds.S +LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) { + switch (sym) { + case VDSOSym::ClockGetTime: + return "__vdso_clock_gettime"; + case VDSOSym::GetTimeOfDay: + return "__vdso_gettimeofday"; + case VDSOSym::ClockGetRes: + return "__vdso_clock_getres"; + case VDSOSym::ClockGetTime64: + return "__vdso_clock_gettime64"; + default: + return ""; + } +} + +// symbol versions +LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) { + return "LINUX_2.6"; +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_ARM_VDSO_H diff --git a/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt b/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt index e271204f519820c..eb93dd4af35ce7d 100644 --- a/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt +++ 
b/libc/src/__support/OSUtil/linux/riscv/CMakeLists.txt @@ -5,3 +5,13 @@ add_header_library( DEPENDS libc.src.__support.common ) + +add_header_library( + vdso + HDRS + vdso.h + DEPENDS + libc.src.__support.common + libc.src.__support.CPP.string_view + libc.src.__support.OSUtil.linux.vdso_sym +) diff --git a/libc/src/__support/OSUtil/linux/riscv/vdso.h b/libc/src/__support/OSUtil/linux/riscv/vdso.h new file mode 100644 index 000000000000000..24ddb25ea980a5a --- /dev/null +++ b/libc/src/__support/OSUtil/linux/riscv/vdso.h @@ -0,0 +1,43 @@ +//===---------- RISC-V vdso configuration -------------------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/linux/vdso_sym.h" +namespace LIBC_NAMESPACE_DECL { +namespace vdso { +// translate VDSOSym to symbol names +// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/riscv/kernel/vdso/vdso.lds.S +LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) { + switch (sym) { + case VDSOSym::RTSigReturn: + return "__vdso_rt_sigreturn"; + case VDSOSym::GetTimeOfDay: + return "__vdso_gettimeofday"; + case VDSOSym::ClockGetTime: + return "__vdso_clock_gettime"; + case VDSOSym::ClockGetRes: + return "__vdso_clock_getres"; + case VDSOSym::GetCpu: + return "__vdso_getcpu"; + case VDSOSym::FlushICache: + return "__vdso_flush_icache"; + case VDSOSym::RiscvHwProbe: + return "__vdso_riscv_hwprobe"; + default: + return ""; + } +} + +// symbol versions +LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) { + return "LINUX_4.15"; +} +} // namespace vdso 
+} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_RISCV_VDSO_H diff --git a/libc/src/__support/OSUtil/linux/vdso.cpp b/libc/src/__support/OSUtil/linux/vdso.cpp new file mode 100644 index 000000000000000..cb43764badad1fd --- /dev/null +++ b/libc/src/__support/OSUtil/linux/vdso.cpp @@ -0,0 +1,237 @@ +//===------------- Linux VDSO Implementation --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "src/__support/OSUtil/linux/vdso.h" +#include "hdr/link_macros.h" +#include "hdr/sys_auxv_macros.h" +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/threads/callonce.h" +#include "src/__support/threads/linux/futex_word.h" +#include "src/errno/libc_errno.h" +#include "src/sys/auxv/getauxval.h" +#include + +// TODO: This is a temporary workaround to avoid including elf.h +// Include our own headers for ElfW and friends once we have them. 
+namespace LIBC_NAMESPACE_DECL { + +namespace vdso { + +Symbol::VDSOArray Symbol::global_cache{}; +CallOnceFlag Symbol::once_flag = callonce_impl::NOT_CALLED; + +namespace { +// See https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/symverdefs.html +struct Verdaux { + ElfW(Word) vda_name; /* Version or dependency names */ + ElfW(Word) vda_next; /* Offset in bytes to next verdaux + entry */ +}; +struct Verdef { + ElfW(Half) vd_version; /* Version revision */ + ElfW(Half) vd_flags; /* Version information */ + ElfW(Half) vd_ndx; /* Version Index */ + ElfW(Half) vd_cnt; /* Number of associated aux entries */ + ElfW(Word) vd_hash; /* Version name hash value */ + ElfW(Word) vd_aux; /* Offset in bytes to verdaux array */ + ElfW(Word) vd_next; /* Offset in bytes to next verdef entry */ + Verdef *next() const { + if (vd_next == 0) + return nullptr; + return reinterpret_cast(reinterpret_cast(this) + + vd_next); + } + Verdaux *aux() const { + return reinterpret_cast(reinterpret_cast(this) + + vd_aux); + } +}; + +// version search procedure specified by +// https://refspecs.linuxfoundation.org/LSB_1.3.0/gLSB/gLSB/symversion.html#SYMVERTBL +cpp::string_view find_version(Verdef *verdef, ElfW(Half) * versym, + const char *strtab, size_t idx) { + constexpr ElfW(Half) VER_FLG_BASE = 0x1; + if (!versym) + return ""; + ElfW(Half) identifier = versym[idx] & 0x7FFF; + // iterate through all version definitions + for (Verdef *def = verdef; def != nullptr; def = def->next()) { + // skip if this is a file-level version + if (def->vd_flags & VER_FLG_BASE) + continue; + // check if the version identifier matches. Highest bit is used to determine + // whether the symbol is local. Only lower 15 bits are used for version + // identifier. 
+ if ((def->vd_ndx & 0x7FFF) == identifier) { + Verdaux *aux = def->aux(); + return strtab + aux->vda_name; + } + } + return ""; +} + +size_t shdr_get_symbol_count(ElfW(Shdr) * vdso_shdr, size_t e_shnum) { + if (!vdso_shdr) + return 0; + // iterate all sections until we locate the dynamic symbol section + for (size_t i = 0; i < e_shnum; ++i) { + // dynamic symbol section is a table section + // therefore, the number of entries can be computed as the ratio + // of the section size to the size of a single entry + if (vdso_shdr[i].sh_type == SHT_DYNSYM) + return vdso_shdr[i].sh_size / vdso_shdr[i].sh_entsize; + } + return 0; +} + +struct VDSOSymbolTable { + const char *strtab; + ElfW(Sym) * symtab; + // The following can be nullptr if the vDSO does not have versioning + ElfW(Half) * versym; + Verdef *verdef; + + void populate_symbol_cache(Symbol::VDSOArray &symbol_table, + size_t symbol_count, ElfW(Addr) vdso_addr) { + for (size_t i = 0, e = symbol_table.size(); i < e; ++i) { + Symbol sym = i; + cpp::string_view name = sym.name(); + cpp::string_view version = sym.version(); + if (name.empty()) + continue; + + for (size_t j = 0; j < symbol_count; ++j) { + if (name == strtab + symtab[j].st_name) { + // we find a symbol with desired name + // now we need to check if it has the right version + if (versym && verdef && + version != find_version(verdef, versym, strtab, j)) + continue; + + // put the symbol address into the symbol table + symbol_table[i] = + reinterpret_cast(vdso_addr + symtab[j].st_value); + } + } + } + } +}; + +struct PhdrInfo { + ElfW(Addr) vdso_addr; + ElfW(Dyn) * vdso_dyn; + static cpp::optional from(ElfW(Phdr) * vdso_phdr, size_t e_phnum, + uintptr_t vdso_ehdr_addr) { + constexpr ElfW(Addr) INVALID_ADDR = static_cast(-1); + ElfW(Addr) vdso_addr = INVALID_ADDR; + ElfW(Dyn) *vdso_dyn = nullptr; + if (!vdso_phdr) + return cpp::nullopt; + // iterate through all the program headers until we get the desired pieces + for (size_t i = 0; i < e_phnum; ++i) { + if 
(vdso_phdr[i].p_type == PT_DYNAMIC) + vdso_dyn = reinterpret_cast(vdso_ehdr_addr + + vdso_phdr[i].p_offset); + + if (vdso_phdr[i].p_type == PT_LOAD) + vdso_addr = + vdso_ehdr_addr + vdso_phdr[i].p_offset - vdso_phdr[i].p_vaddr; + + if (vdso_addr && vdso_dyn) + return PhdrInfo{vdso_addr, vdso_dyn}; + } + + return cpp::nullopt; + } + + cpp::optional populate_symbol_table() { + const char *strtab = nullptr; + ElfW(Sym) *symtab = nullptr; + ElfW(Half) *versym = nullptr; + Verdef *verdef = nullptr; + for (ElfW(Dyn) *d = vdso_dyn; d->d_tag != DT_NULL; ++d) { + switch (d->d_tag) { + case DT_STRTAB: + strtab = reinterpret_cast(vdso_addr + d->d_un.d_ptr); + break; + case DT_SYMTAB: + symtab = reinterpret_cast(vdso_addr + d->d_un.d_ptr); + break; + case DT_VERSYM: + versym = reinterpret_cast(vdso_addr + d->d_un.d_ptr); + break; + case DT_VERDEF: + verdef = reinterpret_cast(vdso_addr + d->d_un.d_ptr); + break; + } + if (strtab && symtab && versym && verdef) + break; + } + if (strtab == nullptr || symtab == nullptr) + return cpp::nullopt; + + return VDSOSymbolTable{strtab, symtab, versym, verdef}; + } +}; +} // namespace + +void Symbol::initialize_vdso_global_cache() { + // first clear the symbol table + for (auto &i : global_cache) + i = nullptr; + + // get the address of the VDSO, protect errno since getauxval may change + // it + int errno_backup = libc_errno; + uintptr_t vdso_ehdr_addr = getauxval(AT_SYSINFO_EHDR); + // Get the memory address of the vDSO ELF header. 
+ auto vdso_ehdr = reinterpret_cast(vdso_ehdr_addr); + // leave the table unpopulated if we don't have vDSO + if (vdso_ehdr == nullptr) { + libc_errno = errno_backup; + return; + } + + // locate the section header inside the elf using the section header + // offset + auto vdso_shdr = + reinterpret_cast(vdso_ehdr_addr + vdso_ehdr->e_shoff); + size_t symbol_count = shdr_get_symbol_count(vdso_shdr, vdso_ehdr->e_shnum); + + // early return if no symbol is found + if (symbol_count == 0) + return; + + // We need to find both the loadable segment and the dynamic linking of + // the vDSO. compute vdso_phdr as the program header using the program + // header offset + ElfW(Phdr) *vdso_phdr = + reinterpret_cast(vdso_ehdr_addr + vdso_ehdr->e_phoff); + cpp::optional phdr_info = + PhdrInfo::from(vdso_phdr, vdso_ehdr->e_phnum, vdso_ehdr_addr); + // early return if either the dynamic linking or the loadable segment is + // not found + if (!phdr_info.has_value()) + return; + + // now, locate several more tables inside the dynmaic linking section + cpp::optional vdso_symbol_table = + phdr_info->populate_symbol_table(); + + // early return if we can't find any required fields of the symbol table + if (!vdso_symbol_table.has_value()) + return; + + // finally, populate the global symbol table cache + vdso_symbol_table->populate_symbol_cache(global_cache, symbol_count, + phdr_info->vdso_addr); +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/OSUtil/linux/vdso.h b/libc/src/__support/OSUtil/linux/vdso.h new file mode 100644 index 000000000000000..a5108b3a1fb5d3e --- /dev/null +++ b/libc/src/__support/OSUtil/linux/vdso.h @@ -0,0 +1,81 @@ +//===------------- Linux VDSO Header ----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H +#include "src/__support/CPP/array.h" +#include "src/__support/common.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/properties/architectures.h" +#include "src/__support/threads/callonce.h" + +#if defined(LIBC_TARGET_ARCH_IS_X86) +#include "x86_64/vdso.h" +#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) +#include "aarch64/vdso.h" +#elif defined(LIBC_TARGET_ARCH_IS_ARM) +#include "arm/vdso.h" +#elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV) +#include "riscv/vdso.h" +#else +#error "unknown arch" +#endif + +namespace LIBC_NAMESPACE_DECL { +namespace vdso { + +class Symbol { + VDSOSym sym; + +public: + LIBC_INLINE_VAR static constexpr size_t COUNT = + static_cast(VDSOSym::VDSOSymCount); + LIBC_INLINE constexpr explicit Symbol(VDSOSym sym) : sym(sym) {} + LIBC_INLINE constexpr Symbol(size_t idx) : sym(static_cast(idx)) {} + LIBC_INLINE constexpr cpp::string_view name() const { + return symbol_name(sym); + } + LIBC_INLINE constexpr cpp::string_view version() const { + return symbol_version(sym); + } + LIBC_INLINE constexpr operator size_t() const { + return static_cast(sym); + } + LIBC_INLINE constexpr bool is_valid() const { + return *this < Symbol::global_cache.size(); + } + using VDSOArray = cpp::array; + +private: + static CallOnceFlag once_flag; + static VDSOArray global_cache; + static void initialize_vdso_global_cache(); + + LIBC_INLINE void *get() const { + if (name().empty() || !is_valid()) + return nullptr; + + callonce(&once_flag, Symbol::initialize_vdso_global_cache); + return (global_cache[*this]); + } + template friend struct TypedSymbol; +}; + +template struct TypedSymbol { + LIBC_INLINE constexpr operator VDSOSymType() const { + return cpp::bit_cast>(Symbol{sym}.get()); + } + 
template + LIBC_INLINE auto operator()(Args &&...args) const { + return this->operator VDSOSymType()(cpp::forward(args)...); + } +}; + +} // namespace vdso + +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_VDSO_H diff --git a/libc/src/__support/OSUtil/linux/vdso_sym.h b/libc/src/__support/OSUtil/linux/vdso_sym.h new file mode 100644 index 000000000000000..eb5f204a82f3048 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/vdso_sym.h @@ -0,0 +1,70 @@ +//===------------- Linux VDSO Symbols ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "hdr/types/clock_t.h" +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_timespec.h" +#include "hdr/types/struct_timeval.h" +#include "hdr/types/time_t.h" +#include "src/__support/common.h" +#include // For size_t. + +// NOLINTBEGIN(llvmlibc-implementation-in-namespace) +// TODO: some of the following can be defined via proxy headers. 
+struct __kernel_timespec; +struct timezone; +struct riscv_hwprobe; +struct getcpu_cache; +struct cpu_set_t; +// NOLINTEND(llvmlibc-implementation-in-namespace) + +namespace LIBC_NAMESPACE_DECL { +namespace vdso { + +enum class VDSOSym { + ClockGetTime, + ClockGetTime64, + GetTimeOfDay, + GetCpu, + Time, + ClockGetRes, + RTSigReturn, + FlushICache, + RiscvHwProbe, + VDSOSymCount +}; + +template LIBC_INLINE constexpr auto dispatcher() { + if constexpr (sym == VDSOSym::ClockGetTime) + return static_cast(nullptr); + else if constexpr (sym == VDSOSym::ClockGetTime64) + return static_cast(nullptr); + else if constexpr (sym == VDSOSym::GetTimeOfDay) + return static_cast( + nullptr); + else if constexpr (sym == VDSOSym::GetCpu) + return static_cast( + nullptr); + else if constexpr (sym == VDSOSym::Time) + return static_cast(nullptr); + else if constexpr (sym == VDSOSym::ClockGetRes) + return static_cast(nullptr); + else if constexpr (sym == VDSOSym::RTSigReturn) + return static_cast(nullptr); + else if constexpr (sym == VDSOSym::FlushICache) + return static_cast(nullptr); + else if constexpr (sym == VDSOSym::RiscvHwProbe) + return static_cast(nullptr); + else + return static_cast(nullptr); +} + +template using VDSOSymType = decltype(dispatcher()); + +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt b/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt index a7f2d74e6353e07..1324491f37b76b4 100644 --- a/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/x86_64/CMakeLists.txt @@ -5,3 +5,13 @@ add_header_library( DEPENDS libc.src.__support.common ) + +add_header_library( + vdso + HDRS + vdso.h + DEPENDS + libc.src.__support.common + libc.src.__support.CPP.string_view + libc.src.__support.OSUtil.linux.vdso_sym +) diff --git a/libc/src/__support/OSUtil/linux/x86_64/vdso.h b/libc/src/__support/OSUtil/linux/x86_64/vdso.h new file mode 100644 index 
000000000000000..abe7c33e07cfaba --- /dev/null +++ b/libc/src/__support/OSUtil/linux/x86_64/vdso.h @@ -0,0 +1,43 @@ +//===---------- x86/x86_64 vdso configuration ---------------------* C++ *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/linux/vdso_sym.h" +namespace LIBC_NAMESPACE_DECL { +namespace vdso { +// translate VDSOSym to symbol names +// On x86, there are symbols defined without the __vdso_ prefix, however, +// it is suggested that one should use the __vdso_ prefix. +// Additionally, there is also an __vdso_sgx_enter_enclave, it is for the SGX +// support, we do not include it here for now. 
+// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/entry/vdso/vdso.lds.S +LIBC_INLINE constexpr cpp::string_view symbol_name(VDSOSym sym) { + switch (sym) { + case VDSOSym::ClockGetTime: + return "__vdso_clock_gettime"; + case VDSOSym::GetTimeOfDay: + return "__vdso_gettimeofday"; + case VDSOSym::GetCpu: + return "__vdso_getcpu"; + case VDSOSym::Time: + return "__vdso_time"; + case VDSOSym::ClockGetRes: + return "__vdso_clock_getres"; + default: + return ""; + } +} + +// symbol versions +LIBC_INLINE constexpr cpp::string_view symbol_version(VDSOSym) { + return "LINUX_2.6"; +} +} // namespace vdso +} // namespace LIBC_NAMESPACE_DECL +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_LINUX_X86_64_VDSO_H diff --git a/libc/src/sys/auxv/getauxval.h b/libc/src/sys/auxv/getauxval.h index 3e6971340bbef15..d9da45ff0839810 100644 --- a/libc/src/sys/auxv/getauxval.h +++ b/libc/src/sys/auxv/getauxval.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H #define LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H +#include "hdr/sys_auxv_macros.h" #include "src/__support/macros/config.h" -#include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/test/src/__support/OSUtil/linux/CMakeLists.txt b/libc/test/src/__support/OSUtil/linux/CMakeLists.txt index bfb072c03e97125..ff82616cc4a701f 100644 --- a/libc/test/src/__support/OSUtil/linux/CMakeLists.txt +++ b/libc/test/src/__support/OSUtil/linux/CMakeLists.txt @@ -1,3 +1,21 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE}) add_subdirectory(${LIBC_TARGET_ARCHITECTURE}) endif() + +add_libc_test( + vdso_test + SUITE libc-osutil-tests + SRCS vdso_test.cpp + DEPENDS + libc.src.__support.OSUtil.linux.vdso + libc.src.__support.OSUtil.osutil + libc.hdr.types.struct_sigaction + libc.hdr.types.struct_timeval + libc.hdr.types.struct_timespec + libc.hdr.types.clockid_t + libc.hdr.types.time_t + libc.hdr.time_macros + libc.hdr.signal_macros + libc.src.signal.sigaction + libc.src.signal.raise +) diff --git 
a/libc/test/src/__support/OSUtil/linux/vdso_test.cpp b/libc/test/src/__support/OSUtil/linux/vdso_test.cpp new file mode 100644 index 000000000000000..2363db69c02f97b --- /dev/null +++ b/libc/test/src/__support/OSUtil/linux/vdso_test.cpp @@ -0,0 +1,162 @@ +//===-- Unittests for VDSO ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/signal_macros.h" +#include "hdr/time_macros.h" +#include "hdr/types/clockid_t.h" +#include "hdr/types/struct_sigaction.h" +#include "hdr/types/struct_timespec.h" +#include "hdr/types/struct_timeval.h" +#include "hdr/types/time_t.h" +#include "src/__support/OSUtil/linux/vdso.h" +#include "src/__support/OSUtil/syscall.h" +#include "src/__support/macros/properties/architectures.h" +#include "src/signal/raise.h" +#include "src/signal/sigaction.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/LibcTest.h" +#include "test/UnitTest/Test.h" +#include +#include + +struct riscv_hwprobe { + int64_t key; + uint64_t value; +}; + +namespace LIBC_NAMESPACE_DECL { +// For x86_64, we explicitly test some traditional vdso symbols are indeed +// available. + +TEST(LlvmLibcOSUtilVDSOTest, GetTimeOfDay) { + vdso::TypedSymbol symbol; + if (!symbol) + return; + timeval tv; + EXPECT_EQ(symbol(&tv, nullptr), 0); + // hopefully people are not building time machines using our libc. 
+ EXPECT_GT(tv.tv_sec, static_cast(0)); +} + +TEST(LlvmLibcOSUtilVDSOTest, Time) { + vdso::TypedSymbol symbol; + if (!symbol) + return; + time_t a, b; + EXPECT_GT(symbol(&a), static_cast(0)); + EXPECT_GT(symbol(&b), static_cast(0)); + EXPECT_GE(b, a); +} + +TEST(LlvmLibcOSUtilVDSOTest, ClockGetTime) { + vdso::TypedSymbol symbol; + if (!symbol) + return; + timespec a, b; + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &a), 0); + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &b), 0); + if (a.tv_sec == b.tv_sec) { + EXPECT_LT(a.tv_nsec, b.tv_nsec); + } else { + EXPECT_LT(a.tv_sec, b.tv_sec); + } +} + +TEST(LlvmLibcOSUtilVDSOTest, ClockGetTime64) { + vdso::TypedSymbol symbol; + if (!symbol) + return; + // See kernel API at + // https://elixir.bootlin.com/linux/latest/source/tools/testing/selftests/vDSO/vdso_test_correctness.c#L155 + __kernel_timespec a, b; + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &a), 0); + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &b), 0); + if (a.tv_sec == b.tv_sec) { + EXPECT_LT(a.tv_nsec, b.tv_nsec); + } else { + EXPECT_LT(a.tv_sec, b.tv_sec); + } +} + +TEST(LlvmLibcOSUtilVDSOTest, ClockGetRes) { + vdso::TypedSymbol symbol; + if (!symbol) + return; + timespec res{}; + EXPECT_EQ(symbol(CLOCK_MONOTONIC, &res), 0); + EXPECT_TRUE(res.tv_sec > 0 || res.tv_nsec > 0); +} + +TEST(LlvmLibcOSUtilVDSOTest, GetCpu) { + // The kernel system call has a third argument, which should be passed as + // nullptr. + vdso::TypedSymbol symbol; + if (!symbol) + return; + unsigned cpu = static_cast(-1), node = static_cast(-1); + EXPECT_EQ(symbol(&cpu, &node, nullptr), 0); + EXPECT_GE(cpu, 0u); + EXPECT_GE(node, 0u); +} + +static bool flag = false; +static void sigprof_handler [[gnu::used]] (int) { flag = true; } + +TEST(LlvmLibcOSUtilVDSOTest, RtSigReturn) { + using namespace testing::ErrnoSetterMatcher; + // must use struct since there is a function of the same name in the same + // scope. 
+ struct sigaction sa {}; + struct sigaction old_sa {}; + sa.sa_handler = sigprof_handler; + sa.sa_flags = SA_RESTORER; + vdso::TypedSymbol symbol; + if (!symbol) + return; + sa.sa_restorer = symbol; + ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGPROF, &sa, &old_sa), Succeeds()); + raise(SIGPROF); + ASSERT_TRUE(flag); + flag = false; + ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGPROF, &old_sa, nullptr), Succeeds()); +} + +TEST(LlvmLibcOSUtilVDSOTest, FlushICache) { + vdso::TypedSymbol symbol; + if (!symbol) + return; + char buf[512]; + // we just check that the flush will not panic the program. + // the flags part only take 0/1 as up to kernel 6.10, which is used to + // indicate whether the flush is local to the core or global. + symbol(buf, buf + sizeof(buf), 0); + symbol(buf, buf + sizeof(buf), 1); +} + +// https://docs.kernel.org/6.5/riscv/hwprobe.html +TEST(LlvmLibcOSUtilVDSOTest, RiscvHwProbe) { + using namespace testing::ErrnoSetterMatcher; + vdso::TypedSymbol symbol; + if (!symbol) + return; + // If a key is unknown to the kernel, its key field will be cleared to -1, and + // its value set to 0. We expect probes.value are all 0. + // Usermode can supply NULL for cpus and 0 for cpu_count as a shortcut for all + // online CPUs + riscv_hwprobe probes[2] = {{-1, 1}, {-1, 1}}; + ASSERT_THAT(symbol(/*pairs=*/probes, /*count=*/2, /*cpusetsize=*/0, + /*cpuset=*/nullptr, + /*flags=*/0), + Succeeds()); + for (auto &probe : probes) { + EXPECT_EQ(probe.key, static_cast(-1)); + EXPECT_EQ(probe.value, static_cast(0)); + } +} + +} // namespace LIBC_NAMESPACE_DECL From 2f321fac722e6c7913825f003c194b923d027354 Mon Sep 17 00:00:00 2001 From: Julian Schmidt Date: Wed, 11 Sep 2024 18:52:41 +0200 Subject: [PATCH 04/94] [NFC][clang-tidy] fix tests of deleted functions for missing-std-forward (#106861) Since #87832, unnamed identifiers are excluded from being diagnosed. 
As a result, the tests that were supposed to test that deleted functions are correctly ignored, are ignored because of the unnamed identifiers instead of the deleted function. This change simply introduces names for the parameters of the deleted functions. --- .../checkers/cppcoreguidelines/missing-std-forward.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp index 8116db58c937d44..98c592db7ce2268 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/missing-std-forward.cpp @@ -187,14 +187,14 @@ void lambda_value_reference_auxiliary_var(T&& t) { namespace deleted_functions { template -void f(T &&) = delete; +void f(T &&t) = delete; struct S { template - S(T &&) = delete; + S(T &&t) = delete; template - void operator&(T &&) = delete; + void operator&(T &&t) = delete; }; } // namespace deleted_functions From 866b93e6b33fac9a4bc62bbc32199bd98f434784 Mon Sep 17 00:00:00 2001 From: Jonathon Penix Date: Wed, 11 Sep 2024 09:53:11 -0700 Subject: [PATCH 05/94] [RISCV] Don't outline pcrel_lo when the function has a section prefix (#107943) GNU ld will error when encountering a pcrel_lo whose corresponding pcrel_hi is in a different section. [1] introduced a check to help prevent this issue by preventing outlining in a few circumstances. However, we can also hit this same issue when outlining from functions with prefixes ("hot"/"unlikely"/"unknown" from profile information, for example) as the outlined function might not have the same prefix, possibly resulting in a "paired" pcrel_lo and pcrel_hi ending up in different sections. To prevent this issue, take a similar approach as [1] and additionally prevent outlining when we see a pcrel_lo and the function has a prefix. 
[1] https://github.com/llvm/llvm-project/commit/96c85f80f0d615ffde0f85d8270e0a8c9f4e5430 Fixes #107520 --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 +- .../RISCV/machineoutliner-pcrel-lo.mir | 104 +++++++++++++++++- 2 files changed, 99 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 13212c2aea5ddeb..02f65ffcea64547 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2918,7 +2918,7 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI, // if any possible. if (MO.getTargetFlags() == RISCVII::MO_PCREL_LO && (MI.getMF()->getTarget().getFunctionSections() || F.hasComdat() || - F.hasSection())) + F.hasSection() || F.getSectionPrefix())) return outliner::InstrType::Illegal; } diff --git a/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir b/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir index 8a83543b0280fd5..fd3630bcfad2560 100644 --- a/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir +++ b/llvm/test/CodeGen/RISCV/machineoutliner-pcrel-lo.mir @@ -18,6 +18,9 @@ define i32 @foo2(i32 %a, i32 %b) comdat { ret i32 0 } define i32 @foo3(i32 %a, i32 %b) section ".abc" { ret i32 0 } + + define i32 @foo4(i32 %a, i32 %b) !section_prefix !0 { ret i32 0 } + !0 = !{!"function_section_prefix", !"myprefix"} ... 
--- name: foo @@ -27,23 +30,24 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11, implicit $x13 ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: PseudoRET + ; ; CHECK-FS-LABEL: name: foo ; CHECK-FS: bb.0: ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 @@ -109,26 +113,27 @@ body: | ; CHECK: bb.0: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def 
$x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: liveins: $x10, $x11, $x13 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_1, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) ; CHECK-NEXT: PseudoBR %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: PseudoRET + ; ; CHECK-FS-LABEL: name: foo2 ; CHECK-FS: bb.0: ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 @@ -223,6 +228,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: PseudoRET + ; ; CHECK-FS-LABEL: name: foo3 ; CHECK-FS: bb.0: ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 
@@ -289,3 +295,89 @@ body: | bb.3: PseudoRET ... +--- +name: foo4 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: foo4 + ; CHECK: bb.0: + ; CHECK-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + ; CHECK-NEXT: PseudoBR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + ; CHECK-NEXT: PseudoBR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + ; CHECK-NEXT: PseudoBR %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: PseudoRET + ; + ; CHECK-FS-LABEL: name: foo4 + ; CHECK-FS: bb.0: + ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + ; CHECK-FS-NEXT: PseudoBR %bb.3 + ; 
CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: bb.1: + ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + ; CHECK-FS-NEXT: PseudoBR %bb.3 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: bb.2: + ; CHECK-FS-NEXT: liveins: $x10, $x11, $x13 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11 + ; CHECK-FS-NEXT: $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + ; CHECK-FS-NEXT: PseudoBR %bb.3 + ; CHECK-FS-NEXT: {{ $}} + ; CHECK-FS-NEXT: bb.3: + ; CHECK-FS-NEXT: PseudoRET + bb.0: + liveins: $x10, $x11, $x13 + + $x11 = ORI $x11, 1023 + $x12 = ADDI $x10, 17 + $x11 = AND $x12, $x11 + $x10 = SUB $x10, $x11 + $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + PseudoBR %bb.3 + + bb.1: + liveins: $x10, $x11, $x13 + + $x11 = ORI $x11, 1023 + $x12 = ADDI $x10, 17 + $x11 = AND $x12, $x11 + $x10 = SUB $x10, $x11 + $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + PseudoBR %bb.3 + + bb.2: + liveins: $x10, $x11, $x13 + + $x11 = ORI $x11, 1023 + $x12 = ADDI $x10, 17 + $x11 = AND $x12, $x11 + $x10 = SUB $x10, $x11 + $x11 = LW killed renamable $x13, target-flags(riscv-pcrel-lo) :: (dereferenceable load (s32) from @bar) + PseudoBR %bb.3 + + bb.3: + PseudoRET +... 
From 415288a2a7db0f55f5f6f0866e8f61faf86bf6fd Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Wed, 11 Sep 2024 09:53:53 -0700 Subject: [PATCH 06/94] [WebAssembly] Add load and store patterns for V8F16. (#108119) --- .../WebAssembly/WebAssemblyISelLowering.cpp | 4 ++++ .../WebAssembly/WebAssemblyInstrSIMD.td | 4 ++-- .../CodeGen/WebAssembly/half-precision.ll | 22 ++++++++++++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 5cc084f3ab13879..5971194a045b985 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -96,6 +96,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setOperationAction(ISD::STORE, T, Custom); } } + if (Subtarget->hasFP16()) { + setOperationAction(ISD::LOAD, MVT::v8f16, Custom); + setOperationAction(ISD::STORE, MVT::v8f16, Custom); + } if (Subtarget->hasReferenceTypes()) { // We need custom load and store lowering for both externref, funcref and // Other. The MVT::Other here represents tables of reference types. 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 9d17d90f5305415..9be23dacf75013e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -189,7 +189,7 @@ defm LOAD_V128_A64 : } // Def load patterns from WebAssemblyInstrMemory.td for vector types -foreach vec = StdVecs in { +foreach vec = AllVecs in { defm : LoadPat; } @@ -390,7 +390,7 @@ defm STORE_V128_A64 : } // Def store patterns from WebAssemblyInstrMemory.td for vector types -foreach vec = StdVecs in { +foreach vec = AllVecs in { defm : StorePat; } diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index c0b14d2064d5eb8..185b86488747d0f 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -1,5 +1,4 @@ ; RUN: llc < %s --mtriple=wasm32-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s -; RUN: llc < %s --mtriple=wasm64-unknown-unknown -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128 | FileCheck %s declare float @llvm.wasm.loadf32.f16(ptr) declare void @llvm.wasm.storef16.f32(float, ptr) @@ -308,3 +307,24 @@ define <8 x i16> @trunc_sat_u_v8i16_sat(<8 x half> %x) { %a = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> %x) ret <8 x i16> %a } + +; ============================================================================== +; Load and Store +; ============================================================================== +define <8 x half> @load_v8f16(ptr %p) { +; CHECK-LABEL: load_v8f16: +; CHECK: .functype load_v8f16 (i32) -> (v128) +; CHECK-NEXT: v128.load $push0=, 0($0) +; CHECK-NEXT: return $pop0 + %v = load <8 x half>, ptr %p + ret <8 x half> %v +} + 
+define void @store_v8f16(<8 x half> %v, ptr %p) { +; CHECK-LABEL: store_v8f16: +; CHECK: .functype store_v8f16 (v128, i32) -> () +; CHECK-NEXT: v128.store 0($1), $0 +; CHECK-NEXT: return + store <8 x half> %v , ptr %p + ret void +} From c076638c702b1d43e8f1c4a813deb3c09b748abb Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Wed, 11 Sep 2024 10:00:10 -0700 Subject: [PATCH 07/94] [WebAssembly] Support BUILD_VECTOR with F16x8. (#108117) Convert BUILD_VECTORS with FP16x8 to I16x8 since there's no FP16 scalar value to intialize v128.const. --- .../WebAssembly/WebAssemblyISelLowering.cpp | 15 +++++++++++++++ llvm/test/CodeGen/WebAssembly/half-precision.ll | 7 +++++++ 2 files changed, 22 insertions(+) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 5971194a045b985..1875a8fd4c4404a 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -212,6 +212,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( MVT::v2f64}) setOperationAction(ISD::BUILD_VECTOR, T, Custom); + if (Subtarget->hasFP16()) + setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom); + // We have custom shuffle lowering to expose the shuffle mask for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64, MVT::v2f64}) @@ -2059,6 +2062,18 @@ static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) { SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { + MVT VT = Op.getSimpleValueType(); + if (VT == MVT::v8f16) { + // BUILD_VECTOR can't handle FP16 operands since Wasm doesn't have a scaler + // FP16 type, so cast them to I16s. 
+ MVT IVT = VT.changeVectorElementType(MVT::i16); + SmallVector NewOps; + for (unsigned I = 0, E = Op.getNumOperands(); I < E; ++I) + NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I))); + SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps); + return DAG.getBitcast(VT, Res); + } + if (auto ConvertLow = LowerConvertLow(Op, DAG)) return ConvertLow; diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll index 185b86488747d0f..5f0ba4aa9c3c4fe 100644 --- a/llvm/test/CodeGen/WebAssembly/half-precision.ll +++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll @@ -27,6 +27,13 @@ define <8 x half> @splat_v8f16(float %x) { ret <8 x half> %v } +; CHECK-LABEL: const_splat_v8f16: +; CHECK: v128.const $push0=, 20800, 0, 0, 0, 0, 0, 0, 20800 +; CHECK-NEXT: return $pop0 +define <8 x half> @const_splat_v8f16() { + ret <8 x half> +} + ; CHECK-LABEL: extract_lane_v8f16: ; CHECK: f16x8.extract_lane $push0=, $0, 1 ; CHECK-NEXT: return $pop0 From 7721db489630166a220cfc27051d6259588229e1 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Wed, 11 Sep 2024 10:08:30 -0700 Subject: [PATCH 08/94] [WebKit Static Analyzer] Treat WTFReportBacktrace as a trivial function. (#108167) Treat WTFReportBacktrace, which prints out the backtrace, as trivial. 
--- clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp | 1 + clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 49bbff1942167ba..2b9b7883c978bad 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -397,6 +397,7 @@ class TrivialFunctionAnalysisVisitor return true; if (Name == "WTFCrashWithInfo" || Name == "WTFBreakpointTrap" || + Name == "WTFReportBacktrace" || Name == "WTFCrashWithSecurityImplication" || Name == "WTFCrash" || Name == "WTFReportAssertionFailure" || Name == "isMainThread" || Name == "isMainThreadOrGCThread" || Name == "isMainRunLoop" || diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index a98c6eb9c84d97d..424ebd349e955a4 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -6,6 +6,7 @@ void WTFBreakpointTrap(); void WTFCrashWithInfo(int, const char*, const char*, int); void WTFReportAssertionFailure(const char* file, int line, const char* function, const char* assertion); +void WTFReportBacktrace(void); void WTFCrash(void); void WTFCrashWithSecurityImplication(void); @@ -334,6 +335,7 @@ class RefCounted { } unsigned trivial60() { return ObjectWithNonTrivialDestructor { 5 }.value(); } unsigned trivial61() { return DerivedNumber('7').value(); } + void trivial62() { WTFReportBacktrace(); } static RefCounted& singleton() { static RefCounted s_RefCounted; @@ -506,6 +508,7 @@ class UnrelatedClass { getFieldTrivial().trivial59(); // no-warning getFieldTrivial().trivial60(); // no-warning getFieldTrivial().trivial61(); // no-warning + getFieldTrivial().trivial62(); // no-warning 
RefCounted::singleton().trivial18(); // no-warning RefCounted::singleton().someFunction(); // no-warning From 0d48d4d835ec7a2e4d59a8fe4c26dc9823cee56a Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 11 Sep 2024 12:12:17 -0500 Subject: [PATCH 09/94] [mlir][AMDGPU] Support vector<2xf16> inputs to buffer atomic fadd (#108238) Extend the lowering of atomic.fadd to support the v2f16 variant avaliable on some AMDGPU chips. Co-authored-by: Giuseppe Rossini --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 ++-- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 7 +++++-- .../Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 11 +++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 8a1ef94c853a587..64db4448bc2f2b6 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -253,8 +253,8 @@ def AMDGPU_RawBufferAtomicCmpswapOp : // Raw buffer atomic floating point add def AMDGPU_RawBufferAtomicFaddOp : AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>, - AttrSizedOperandSegments]>, - Arguments<(ins F32:$value, + AttrSizedOperandSegments]>, + Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16]>]>:$value, Arg:$memref, Variadic:$indices, DefaultValuedAttr:$boundsCheck, diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 96b433294d258ab..fc5dd7c56021292 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -115,15 +115,18 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { rewriter.getIntegerType(floatType.getWidth())); } if (auto dataVector = dyn_cast(wantedDataType)) { + uint32_t vecLen = dataVector.getNumElements(); uint32_t elemBits = dataVector.getElementTypeBitWidth(); - uint32_t totalBits = 
elemBits * dataVector.getNumElements(); + uint32_t totalBits = elemBits * vecLen; + bool usePackedFp16 = + dyn_cast_or_null(*gpuOp) && vecLen == 2; if (totalBits > maxVectorOpWidth) return gpuOp.emitOpError( "Total width of loads or stores must be no more than " + Twine(maxVectorOpWidth) + " bits, but we call for " + Twine(totalBits) + " bits. This should've been caught in validation"); - if (elemBits < 32) { + else if (!usePackedFp16 && elemBits < 32) { if (totalBits > 32) { if (totalBits % 32 != 0) return gpuOp.emitOpError("Load or store of more than 32-bits that " diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index 717667c22af8009..cc51a8c40942f98 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -151,6 +151,17 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, func.return } +// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2f16 +func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: memref<64xf16>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32) + // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xf16> + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xf16> -> memref<64xf16>, i32 + func.return +} + // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32 func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) From cb031267bd7a5946dfd6e46e9a5441ddca057b47 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: 
Wed, 11 Sep 2024 12:28:15 -0500 Subject: [PATCH 10/94] Revert "[mlir][AMDGPU] Support vector<2xf16> inputs to buffer atomic fadd (#108238)" (#108256) This reverts commit 0d48d4d835ec7a2e4d59a8fe4c26dc9823cee56a. Mistakenly landed without approval --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 4 ++-- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 7 ++----- .../Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 11 ----------- 3 files changed, 4 insertions(+), 18 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 64db4448bc2f2b6..8a1ef94c853a587 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -253,8 +253,8 @@ def AMDGPU_RawBufferAtomicCmpswapOp : // Raw buffer atomic floating point add def AMDGPU_RawBufferAtomicFaddOp : AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>, - AttrSizedOperandSegments]>, - Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16]>]>:$value, + AttrSizedOperandSegments]>, + Arguments<(ins F32:$value, Arg:$memref, Variadic:$indices, DefaultValuedAttr:$boundsCheck, diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index fc5dd7c56021292..96b433294d258ab 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -115,18 +115,15 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { rewriter.getIntegerType(floatType.getWidth())); } if (auto dataVector = dyn_cast(wantedDataType)) { - uint32_t vecLen = dataVector.getNumElements(); uint32_t elemBits = dataVector.getElementTypeBitWidth(); - uint32_t totalBits = elemBits * vecLen; - bool usePackedFp16 = - dyn_cast_or_null(*gpuOp) && vecLen == 2; + uint32_t totalBits = elemBits * dataVector.getNumElements(); if (totalBits > maxVectorOpWidth) return gpuOp.emitOpError( "Total 
width of loads or stores must be no more than " + Twine(maxVectorOpWidth) + " bits, but we call for " + Twine(totalBits) + " bits. This should've been caught in validation"); - else if (!usePackedFp16 && elemBits < 32) { + if (elemBits < 32) { if (totalBits > 32) { if (totalBits % 32 != 0) return gpuOp.emitOpError("Load or store of more than 32-bits that " diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index cc51a8c40942f98..717667c22af8009 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -151,17 +151,6 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, func.return } -// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2f16 -func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: memref<64xf16>, %idx: i32) { - // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32) - // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) - // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) - // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] - // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xf16> - amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xf16> -> memref<64xf16>, i32 - func.return -} - // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32 func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) From b7b28e770c461b2513ddc98953c6e019cb2f29a4 Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 11 Sep 2024 10:29:23 -0700 Subject: [PATCH 11/94] [NVPTX] Improve copy avoidance during lowering. 
(#106423) On newer GPUs, where `cvta.param` instruction is available we can avoid making byval arguments when their pointers are used in a few more cases, even when `__grid_constant__` is not specified. - phi - select - memcpy from the parameter. Switched pointer traversal from a DIY implementation to PtrUseVisitor. --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 232 +++++-- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 1 + .../CodeGen/NVPTX/lower-args-gridconstant.ll | 556 +++++++++++------ llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 573 +++++++++++++----- 4 files changed, 984 insertions(+), 378 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 1205ad4c6b008fd..082546c4dd72f88 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -139,16 +139,21 @@ #include "NVPTX.h" #include "NVPTXTargetMachine.h" #include "NVPTXUtilities.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/PtrUseVisitor.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include #include @@ -217,7 +222,8 @@ INITIALIZE_PASS_END(NVPTXLowerArgs, "nvptx-lower-args", // pointer in parameter AS. // For "escapes" (to memory, a function call, or a ptrtoint), cast the OldUse to // generic using cvta.param. 
-static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) { +static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam, + bool IsGridConstant) { Instruction *I = dyn_cast(OldUse->getUser()); assert(I && "OldUse must be in an instruction"); struct IP { @@ -228,7 +234,8 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) { SmallVector ItemsToConvert = {{OldUse, I, Param}}; SmallVector InstructionsToDelete; - auto CloneInstInParamAS = [GridConstant](const IP &I) -> Value * { + auto CloneInstInParamAS = [HasCvtaParam, + IsGridConstant](const IP &I) -> Value * { if (auto *LI = dyn_cast(I.OldInstruction)) { LI->setOperand(0, I.NewParam); return LI; @@ -252,8 +259,25 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) { // Just pass through the argument, the old ASC is no longer needed. return I.NewParam; } + if (auto *MI = dyn_cast(I.OldInstruction)) { + if (MI->getRawSource() == I.OldUse->get()) { + // convert to memcpy/memmove from param space. 
+ IRBuilder<> Builder(I.OldInstruction); + Intrinsic::ID ID = MI->getIntrinsicID(); + + CallInst *B = Builder.CreateMemTransferInst( + ID, MI->getRawDest(), MI->getDestAlign(), I.NewParam, + MI->getSourceAlign(), MI->getLength(), MI->isVolatile()); + for (unsigned I : {0, 1}) + if (uint64_t Bytes = MI->getParamDereferenceableBytes(I)) + B->addDereferenceableParamAttr(I, Bytes); + return B; + } + // We may be able to handle other cases if the argument is + // __grid_constant__ + } - if (GridConstant) { + if (HasCvtaParam) { auto GetParamAddrCastToGeneric = [](Value *Addr, Instruction *OriginalUser) -> Value * { PointerType *ReturnTy = @@ -269,24 +293,44 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool GridConstant) { OriginalUser->getIterator()); return CvtToGenCall; }; - - if (auto *CI = dyn_cast(I.OldInstruction)) { - I.OldUse->set(GetParamAddrCastToGeneric(I.NewParam, CI)); - return CI; + auto *ParamInGenericAS = + GetParamAddrCastToGeneric(I.NewParam, I.OldInstruction); + + // phi/select could use generic arg pointers w/o __grid_constant__ + if (auto *PHI = dyn_cast(I.OldInstruction)) { + for (auto [Idx, V] : enumerate(PHI->incoming_values())) { + if (V.get() == I.OldUse->get()) + PHI->setIncomingValue(Idx, ParamInGenericAS); + } } - if (auto *SI = dyn_cast(I.OldInstruction)) { - // byval address is being stored, cast it to generic - if (SI->getValueOperand() == I.OldUse->get()) - SI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, SI)); - return SI; + if (auto *SI = dyn_cast(I.OldInstruction)) { + if (SI->getTrueValue() == I.OldUse->get()) + SI->setTrueValue(ParamInGenericAS); + if (SI->getFalseValue() == I.OldUse->get()) + SI->setFalseValue(ParamInGenericAS); } - if (auto *PI = dyn_cast(I.OldInstruction)) { - if (PI->getPointerOperand() == I.OldUse->get()) - PI->setOperand(0, GetParamAddrCastToGeneric(I.NewParam, PI)); - return PI; + + // Escapes or writes can only use generic param pointers if + // __grid_constant__ is in effect. 
+ if (IsGridConstant) { + if (auto *CI = dyn_cast(I.OldInstruction)) { + I.OldUse->set(ParamInGenericAS); + return CI; + } + if (auto *SI = dyn_cast(I.OldInstruction)) { + // byval address is being stored, cast it to generic + if (SI->getValueOperand() == I.OldUse->get()) + SI->setOperand(0, ParamInGenericAS); + return SI; + } + if (auto *PI = dyn_cast(I.OldInstruction)) { + if (PI->getPointerOperand() == I.OldUse->get()) + PI->setOperand(0, ParamInGenericAS); + return PI; + } + // TODO: iIf we allow stores, we should allow memcpy/memset to + // parameter, too. } - llvm_unreachable( - "Instruction unsupported even for grid_constant argument"); } llvm_unreachable("Unsupported instruction"); @@ -409,49 +453,110 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, } } +namespace { +struct ArgUseChecker : PtrUseVisitor { + using Base = PtrUseVisitor; + + bool IsGridConstant; + // Set of phi/select instructions using the Arg + SmallPtrSet Conditionals; + + ArgUseChecker(const DataLayout &DL, bool IsGridConstant) + : PtrUseVisitor(DL), IsGridConstant(IsGridConstant) {} + + PtrInfo visitArgPtr(Argument &A) { + assert(A.getType()->isPointerTy()); + IntegerType *IntIdxTy = cast(DL.getIndexType(A.getType())); + IsOffsetKnown = false; + Offset = APInt(IntIdxTy->getBitWidth(), 0); + PI.reset(); + Conditionals.clear(); + + LLVM_DEBUG(dbgs() << "Checking Argument " << A << "\n"); + // Enqueue the uses of this pointer. + enqueueUsers(A); + + // Visit all the uses off the worklist until it is empty. + // Note that unlike PtrUseVisitor we intentionally do not track offsets. + // We're only interested in how we use the pointer. 
+ while (!(Worklist.empty() || PI.isAborted())) { + UseToVisit ToVisit = Worklist.pop_back_val(); + U = ToVisit.UseAndIsOffsetKnown.getPointer(); + Instruction *I = cast(U->getUser()); + if (isa(I) || isa(I)) + Conditionals.insert(I); + LLVM_DEBUG(dbgs() << "Processing " << *I << "\n"); + Base::visit(I); + } + if (PI.isEscaped()) + LLVM_DEBUG(dbgs() << "Argument pointer escaped: " << *PI.getEscapingInst() + << "\n"); + else if (PI.isAborted()) + LLVM_DEBUG(dbgs() << "Pointer use needs a copy: " << *PI.getAbortingInst() + << "\n"); + LLVM_DEBUG(dbgs() << "Traversed " << Conditionals.size() + << " conditionals\n"); + return PI; + } + + void visitStoreInst(StoreInst &SI) { + // Storing the pointer escapes it. + if (U->get() == SI.getValueOperand()) + return PI.setEscapedAndAborted(&SI); + // Writes to the pointer are UB w/ __grid_constant__, but do not force a + // copy. + if (!IsGridConstant) + return PI.setAborted(&SI); + } + + void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) { + // ASC to param space are no-ops and do not need a copy + if (ASC.getDestAddressSpace() != ADDRESS_SPACE_PARAM) + return PI.setEscapedAndAborted(&ASC); + Base::visitAddrSpaceCastInst(ASC); + } + + void visitPtrToIntInst(PtrToIntInst &I) { + if (IsGridConstant) + return; + Base::visitPtrToIntInst(I); + } + void visitPHINodeOrSelectInst(Instruction &I) { + assert(isa(I) || isa(I)); + } + // PHI and select just pass through the pointers. + void visitPHINode(PHINode &PN) { enqueueUsers(PN); } + void visitSelectInst(SelectInst &SI) { enqueueUsers(SI); } + + void visitMemTransferInst(MemTransferInst &II) { + if (*U == II.getRawDest() && !IsGridConstant) + PI.setAborted(&II); + // memcpy/memmove are OK when the pointer is source. We can convert them to + // AS-specific memcpy. 
+ } + + void visitMemSetInst(MemSetInst &II) { + if (!IsGridConstant) + PI.setAborted(&II); + } +}; // struct ArgUseChecker +} // namespace + void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) { - bool IsGridConstant = isParamGridConstant(*Arg); Function *Func = Arg->getParent(); + bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam(); + bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg); + const DataLayout &DL = Func->getDataLayout(); BasicBlock::iterator FirstInst = Func->getEntryBlock().begin(); Type *StructType = Arg->getParamByValType(); assert(StructType && "Missing byval type"); - auto AreSupportedUsers = [&](Value *Start) { - SmallVector ValuesToCheck = {Start}; - auto IsSupportedUse = [IsGridConstant](Value *V) -> bool { - if (isa(V) || isa(V) || isa(V)) - return true; - // ASC to param space are OK, too -- we'll just strip them. - if (auto *ASC = dyn_cast(V)) { - if (ASC->getDestAddressSpace() == ADDRESS_SPACE_PARAM) - return true; - } - // Simple calls and stores are supported for grid_constants - // writes to these pointers are undefined behaviour - if (IsGridConstant && - (isa(V) || isa(V) || isa(V))) - return true; - return false; - }; - - while (!ValuesToCheck.empty()) { - Value *V = ValuesToCheck.pop_back_val(); - if (!IsSupportedUse(V)) { - LLVM_DEBUG(dbgs() << "Need a " - << (isParamGridConstant(*Arg) ? "cast " : "copy ") - << "of " << *Arg << " because of " << *V << "\n"); - (void)Arg; - return false; - } - if (!isa(V) && !isa(V) && !isa(V) && - !isa(V)) - llvm::append_range(ValuesToCheck, V->users()); - } - return true; - }; - - if (llvm::all_of(Arg->users(), AreSupportedUsers)) { + ArgUseChecker AUC(DL, IsGridConstant); + ArgUseChecker::PtrInfo PI = AUC.visitArgPtr(*Arg); + bool ArgUseIsReadOnly = !(PI.isEscaped() || PI.isAborted()); + // Easy case, accessing parameter directly is fine. 
+ if (ArgUseIsReadOnly && AUC.Conditionals.empty()) { // Convert all loads and intermediate operations to use parameter AS and // skip creation of a local copy of the argument. SmallVector UsesToUpdate; @@ -462,7 +567,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, Arg, PointerType::get(StructType, ADDRESS_SPACE_PARAM), Arg->getName(), FirstInst); for (Use *U : UsesToUpdate) - convertToParamAS(U, ArgInParamAS, IsGridConstant); + convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant); LLVM_DEBUG(dbgs() << "No need to copy or cast " << *Arg << "\n"); const auto *TLI = @@ -473,13 +578,17 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, return; } - const DataLayout &DL = Func->getDataLayout(); + // We can't access byval arg directly and need a pointer. on sm_70+ we have + // ability to take a pointer to the argument without making a local copy. + // However, we're still not allowed to write to it. If the user specified + // `__grid_constant__` for the argument, we'll consider escaped pointer as + // read-only. unsigned AS = DL.getAllocaAddrSpace(); - if (isParamGridConstant(*Arg)) { - // Writes to a grid constant are undefined behaviour. We do not need a - // temporary copy. When a pointer might have escaped, conservatively replace - // all of its uses (which might include a device function call) with a cast - // to the generic address space. + if (HasCvtaParam && (ArgUseIsReadOnly || IsGridConstant)) { + LLVM_DEBUG(dbgs() << "Using non-copy pointer to " << *Arg << "\n"); + // Replace all argument pointer uses (which might include a device function + // call) with a cast to the generic address space using cvta.param + // instruction, which avoids a local copy. 
IRBuilder<> IRB(&Func->getEntryBlock().front()); // Cast argument to param address space @@ -500,6 +609,7 @@ void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM, // Do not replace Arg in the cast to param space CastToParam->setOperand(0, Arg); } else { + LLVM_DEBUG(dbgs() << "Creating a local copy of " << *Arg << "\n"); // Otherwise we have to create a temporary copy. AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst); diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 0591782e8148b9b..457f10f1d64a260 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -94,6 +94,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool hasDotInstructions() const { return SmVersion >= 61 && PTXVersion >= 50; } + bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; } unsigned int getFullSmVersion() const { return FullSmVersion; } unsigned int getSmVersion() const { return getFullSmVersion() / 10; } // GPUs with "a" suffix have include architecture-accelerated features that diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index f6db9c429dba576..176dfee11cfb090 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -1,18 +1,30 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes OPT -; RUN: llc < %s -mcpu=sm_70 --mtriple nvptx64-nvidia-cuda | FileCheck %s --check-prefixes PTX +; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT +; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX define void @grid_const_int(ptr 
byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) { ; PTX-LABEL: grid_const_int( -; PTX-NOT: ld.u32 -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_int_param_0]; -; +; PTX: { +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1]; +; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0]; +; PTX-NEXT: add.s32 %r3, %r2, %r1; +; PTX-NEXT: st.global.u32 [%rd2], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_int( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) { -; OPT-NOT: alloca -; OPT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 -; +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr +; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]] +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT3]], align 4 +; OPT-NEXT: ret void %tmp = load i32, ptr %input1, align 4 %add = add i32 %tmp, %input2 store i32 %add, ptr %out @@ -24,19 +36,29 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou define void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){ ; PTX-LABEL: grid_const_struct( ; PTX: { -; PTX-NOT: ld.u32 -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_struct_param_0]; -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_struct_param_0+4]; -; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 
%rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [grid_const_struct_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_struct_param_0]; +; PTX-NEXT: ld.param.u32 %r2, [grid_const_struct_param_0+4]; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: st.global.u32 [%rd2], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_struct( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) { -; OPT-NOT: alloca -; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 -; OPT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 -; OPT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 -; OPT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUT4:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUT4]] to ptr +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4 +; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]] +; OPT-NEXT: store i32 [[ADD]], ptr [[OUT5]], align 4 +; OPT-NEXT: ret void %gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 %int1 = load i32, ptr %gep1 @@ -49,41 +71,85 @@ define void @grid_const_struct(ptr 
byval(%struct.s) align 4 %input, ptr %out){ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { ; PTX-LABEL: grid_const_escape( ; PTX: { -; PTX-NOT: .local -; PTX: cvta.param.{{.*}} +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_escape_param_0; +; PTX-NEXT: mov.u64 %rd2, %rd1; +; PTX-NEXT: cvta.param.u64 %rd3, %rd2; +; PTX-NEXT: { // callseq 0, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd3; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r1, [retval0+0]; +; PTX-NEXT: } // callseq 0 +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]]) +; OPT-NEXT: ret void %call = call i32 @escape(ptr %input) ret void } define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) { ; PTX-LABEL: multiple_grid_const_escape( -; PTX: mov.{{.*}} [[RD1:%.*]], multiple_grid_const_escape_param_0; -; PTX: mov.{{.*}} [[RD2:%.*]], multiple_grid_const_escape_param_2; -; PTX: mov.{{.*}} [[RD3:%.*]], [[RD2]]; -; PTX: mov.{{.*}} [[RD4:%.*]], [[RD1]]; -; PTX: cvta.param.{{.*}} 
[[RD5:%.*]], [[RD4]]; -; PTX: cvta.param.{{.*}} [[RD6:%.*]], [[RD3]]; -; PTX: { -; PTX: st.param.b64 [param0+0], [[RD5]]; -; PTX: st.param.b64 [param2+0], [[RD6]]; -; +; PTX: { +; PTX-NEXT: .local .align 4 .b8 __local_depot3[4]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.u64 %SPL, __local_depot3; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: mov.b64 %rd1, multiple_grid_const_escape_param_0; +; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_2; +; PTX-NEXT: mov.u64 %rd3, %rd2; +; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1]; +; PTX-NEXT: cvta.param.u64 %rd4, %rd3; +; PTX-NEXT: mov.u64 %rd5, %rd1; +; PTX-NEXT: cvta.param.u64 %rd6, %rd5; +; PTX-NEXT: add.u64 %rd7, %SP, 0; +; PTX-NEXT: add.u64 %rd8, %SPL, 0; +; PTX-NEXT: st.local.u32 [%rd8], %r1; +; PTX-NEXT: { // callseq 1, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd6; +; PTX-NEXT: .param .b64 param1; +; PTX-NEXT: st.param.b64 [param1+0], %rd7; +; PTX-NEXT: .param .b64 param2; +; PTX-NEXT: st.param.b64 [param2+0], %rd4; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape3, +; PTX-NEXT: ( +; PTX-NEXT: param0, +; PTX-NEXT: param1, +; PTX-NEXT: param2 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r2, [retval0+0]; +; PTX-NEXT: } // callseq 1 +; PTX-NEXT: ret; ; OPT-LABEL: define void @multiple_grid_const_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) { -; OPT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT-NOT: alloca %struct.s -; OPT: [[A_ADDR:%.*]] = alloca i32, align 4 -; OPT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: [[B_PARAM_GEN:%.*]] = call ptr 
@llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]]) +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[B_PARAM:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; OPT-NEXT: [[B_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[B_PARAM]]) +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 +; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 ; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]]) -; +; OPT-NEXT: ret void %a.addr = alloca i32, align 4 store i32 %a, ptr %a.addr, align 4 %call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b) @@ -92,40 +158,58 @@ define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 define void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) { ; PTX-LABEL: grid_const_memory_escape( -; PTX-NOT: .local -; PTX: mov.b64 [[RD1:%.*]], grid_const_memory_escape_param_0; -; PTX: cvta.param.u64 [[RD3:%.*]], [[RD2:%.*]]; -; PTX: st.global.u64 [[[RD4:%.*]]], [[RD3]]; -; +; PTX: { +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: st.global.u64 [%rd3], %rd5; +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_memory_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: 
[[INPUT_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) -; OPT: store ptr [[INPUT_PARAM_GEN]], ptr {{.*}}, align 8 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[ADDR4:%.*]] = addrspacecast ptr [[ADDR]] to ptr addrspace(1) +; OPT-NEXT: [[ADDR5:%.*]] = addrspacecast ptr addrspace(1) [[ADDR4]] to ptr +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR5]], align 8 +; OPT-NEXT: ret void store ptr %input, ptr %addr, align 8 ret void } define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) { ; PTX-LABEL: grid_const_inlineasm_escape( -; PTX-NOT .local -; PTX: add.{{.*}} [[RD2:%.*]], [[RD1:%.*]], 4; -; PTX: cvta.param.u64 [[RD4:%.*]], [[RD2]] -; PTX: cvta.param.u64 [[RD3:%.*]], [[RD1]] -; PTX: add.s64 [[RD5:%.*]], [[RD3]], [[RD4]]; -; +; PTX: { +; PTX-NEXT: .reg .b64 %rd<8>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0; +; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5; +; PTX-NEXT: mov.u64 %rd7, %rd4; +; PTX-NEXT: cvta.param.u64 %rd2, %rd7; +; PTX-NEXT: add.s64 %rd3, %rd2, 4; +; PTX-NEXT: // begin inline asm +; PTX-NEXT: add.s64 %rd1, %rd2, %rd3; +; PTX-NEXT: // end inline asm +; PTX-NEXT: st.global.u64 [%rd6], %rd1; +; PTX-NEXT: ret; +; PTX-NOT .local ; OPT-LABEL: define void @grid_const_inlineasm_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) { -; OPT-NOT: alloca [[STRUCT_S]] -; OPT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[TMPPTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 0 -; OPT: 
[[TMPPTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT_PARAM]], i32 0, i32 1 -; OPT: [[TMPPTR22_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR22]]) -; OPT: [[TMPPTR13_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[TMPPTR13]]) -; OPT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[RESULT4:%.*]] = addrspacecast ptr [[RESULT]] to ptr addrspace(1) +; OPT-NEXT: [[RESULT5:%.*]] = addrspacecast ptr addrspace(1) [[RESULT4]] to ptr +; OPT-NEXT: [[INPUT_PARAM:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT_PARAM]]) +; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2 +; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT5]], align 8 +; OPT-NEXT: ret void %tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 %1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1 @@ -135,24 +219,42 @@ define void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, pt define void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escape( -; PTX-NOT: .local -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escape_param_0]; -; PTX: add.{{.*}} -; PTX: cvta.param.u64 [[RD3:%.*]], {{%.*}} -; PTX: st.param.{{.*}} [param0+0], [[RD3]] -; PTX: call -; +; PTX: { +; PTX-NEXT: .reg .b32 %r<5>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escape_param_0; +; 
PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escape_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: ld.u32 %r1, [%rd5]; +; PTX-NEXT: add.s32 %r2, %r1, %r1; +; PTX-NEXT: st.global.u32 [%rd3], %r2; +; PTX-NEXT: { // callseq 2, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd5; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r3, [retval0+0]; +; PTX-NEXT: } // callseq 2 +; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_partial_escape( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]], ptr {{%.*}}) { -; OPT-NOT: alloca -; OPT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT1]], align 4 -; OPT: [[TWICE:%.*]] = add i32 [[VAL]], [[VAL]] -; OPT: store i32 [[TWICE]] -; OPT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) -; OPT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) -; OPT: ret void -; +; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr +; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1]]) +; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4 +; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]] +; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT5]], align 4 +; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]]) +; OPT-NEXT: ret void %val = load i32, ptr %input %twice = add i32 %val, %val store i32 %twice, ptr %output @@ -163,27 +265,46 @@ define void 
@grid_const_partial_escape(ptr byval(i32) %input, ptr %output) { define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) { ; PTX-LABEL: grid_const_partial_escapemem( ; PTX: { -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_partial_escapemem_param_0]; -; PTX: ld.param.{{.*}} [[R2:%.*]], [grid_const_partial_escapemem_param_0+4]; -; PTX: cvta.param.{{.*}} [[RD5:%.*]], {{%.*}}; -; PTX: st.global.{{.*}} [{{.*}}], [[RD5]]; -; PTX: add.s32 [[R3:%.*]], [[R1]], [[R2]] -; PTX: st.param.{{.*}} [param0+0], [[RD5]] -; PTX: escape +; PTX-NEXT: .reg .b32 %r<6>; +; PTX-NEXT: .reg .b64 %rd<6>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escapemem_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_partial_escapemem_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.u64 %rd4, %rd1; +; PTX-NEXT: cvta.param.u64 %rd5, %rd4; +; PTX-NEXT: ld.u32 %r1, [%rd5]; +; PTX-NEXT: ld.u32 %r2, [%rd5+4]; +; PTX-NEXT: st.global.u64 [%rd3], %rd5; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: { // callseq 3, 0 +; PTX-NEXT: .param .b64 param0; +; PTX-NEXT: st.param.b64 [param0+0], %rd5; +; PTX-NEXT: .param .b32 retval0; +; PTX-NEXT: call.uni (retval0), +; PTX-NEXT: escape, +; PTX-NEXT: ( +; PTX-NEXT: param0 +; PTX-NEXT: ); +; PTX-NEXT: ld.param.b32 %r4, [retval0+0]; +; PTX-NEXT: } // callseq 3 +; PTX-NEXT: st.param.b32 [func_retval0+0], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define i32 @grid_const_partial_escapemem( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr {{%.*}}) { -; OPT-NOT: alloca -; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[PTR13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 0 -; OPT: [[VAL1:%.*]] = load i32, ptr addrspace(101) [[PTR13]], align 4 -; OPT: [[PTR22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT2]], i32 0, i32 1 -; OPT: [[VAL2:%.*]] = load i32, ptr addrspace(101) 
[[PTR22]], align 4 -; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) -; OPT: store ptr [[INPUT1]] -; OPT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] -; OPT: [[PTR1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[PTR13]]) -; OPT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) -; +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[OUTPUT4:%.*]] = addrspacecast ptr [[OUTPUT]] to ptr addrspace(1) +; OPT-NEXT: [[OUTPUT5:%.*]] = addrspacecast ptr addrspace(1) [[OUTPUT4]] to ptr +; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0 +; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4 +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1 +; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4 +; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT5]], align 8 +; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]] +; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]]) +; OPT-NEXT: ret i32 [[ADD]] %ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0 %val1 = load i32, ptr %ptr1 %ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1 @@ -194,29 +315,48 @@ define i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %outpu ret i32 %add } -define void @grid_const_phi_escape(ptr byval(%struct.s) align 4 %input1, ptr %inout) { -; PTX-LABEL: grid_const_phi_escape( -; PTX: cvta.param.{{.*}} [[RD1:%.*]], {{.*}} -; PTX: @[[P1:%.*]] bra $L__BB[[TARGET_LABEL:[_0-9]+]]; -; PTX: $L__BB[[TARGET_LABEL]]: -; PTX: ld.{{.*}} [[R1:%.*]], [[[RD1]]]; -; -; OPT-LABEL: define void @grid_const_phi_escape( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], 
ptr {{%.*}}) { -; OPT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) -; OPT: br i1 {{.*}}, label %[[FIRST:.*]], label %[[SECOND:.*]] +define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) { +; PTX-LABEL: grid_const_phi( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd5, grid_const_phi_param_0; +; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_param_1]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6; +; PTX-NEXT: mov.u64 %rd7, %rd5; +; PTX-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX-NEXT: ld.global.u32 %r1, [%rd1]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: @%p1 bra $L__BB8_2; +; PTX-NEXT: // %bb.1: // %second +; PTX-NEXT: add.s64 %rd8, %rd8, 4; +; PTX-NEXT: $L__BB8_2: // %merge +; PTX-NEXT: ld.u32 %r2, [%rd8]; +; PTX-NEXT: st.global.u32 [%rd1], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_phi( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: -; OPT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 -; OPT: br label %[[MERGE:.*]] +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 
0 +; OPT-NEXT: br label %[[MERGE:.*]] ; OPT: [[SECOND]]: -; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1 -; OPT: br label %[[MERGE]] +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1 +; OPT-NEXT: br label %[[MERGE]] ; OPT: [[MERGE]]: -; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] -; OPT-NOT: load i32, ptr addrspace(101) -; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; +; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 @@ -235,32 +375,53 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define void @grid_const_phi_escape2(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { -; PTX-LABEL: grid_const_phi_escape2( -; PTX: ld.param.{{.*}} [[R1:%.*]], [grid_const_phi_escape2_param_1+4]; -; PTX: @[[P1:%.*]] bra $L__BB[[LABEL:[_0-9]+]]; -; PTX: cvta.param.u64 [[RD1:%.*]], [[RD2:%.*]]; -; PTX: ld.u32 [[R1]], [[[RD1]]]; -; PTX: $L__BB[[LABEL]]: -; PTX: st.global.u32 [[[RD3:%.*]]], [[R1]] -; OPT-LABEL: define void @grid_const_phi_escape2( -; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr {{%.*}}) { -; OPT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 -; OPT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) -; OPT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 -; OPT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 -; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]]) -; OPT: br i1 [[LESS:%.*]], label 
%[[FIRST:.*]], label %[[SECOND:.*]] +define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) { +; PTX-LABEL: grid_const_phi_ngc( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<12>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd6, grid_const_phi_ngc_param_0; +; PTX-NEXT: ld.param.u64 %rd7, [grid_const_phi_ngc_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd1, %rd7; +; PTX-NEXT: mov.u64 %rd10, %rd6; +; PTX-NEXT: cvta.param.u64 %rd11, %rd10; +; PTX-NEXT: ld.global.u32 %r1, [%rd1]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: @%p1 bra $L__BB9_2; +; PTX-NEXT: // %bb.1: // %second +; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1; +; PTX-NEXT: mov.u64 %rd9, %rd8; +; PTX-NEXT: cvta.param.u64 %rd2, %rd9; +; PTX-NEXT: add.s64 %rd11, %rd2, 4; +; PTX-NEXT: $L__BB9_2: // %merge +; PTX-NEXT: ld.u32 %r2, [%rd11]; +; PTX-NEXT: st.global.u32 [%rd1], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_phi_ngc( +; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]] ; OPT: [[FIRST]]: -; OPT: [[PTR1:%.*]] = getelementptr 
inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 -; OPT: br label %[[MERGE:.*]] +; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; OPT-NEXT: br label %[[MERGE:.*]] ; OPT: [[SECOND]]: -; OPT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 -; OPT: br label %[[MERGE]] +; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1 +; OPT-NEXT: br label %[[MERGE]] ; OPT: [[MERGE]]: -; OPT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] -; +; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 br i1 %less, label %first, label %second @@ -278,22 +439,42 @@ merge: } ; NOTE: %input2 is *not* grid_constant -define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) { -; PTX-LABEL: grid_const_select_escape( -; PTX: cvta.param.{{.*}} [[RD2:%.*]], [[RD1:%.*]] -; PTX: setp.lt.{{.*}} [[P1:%.*]], {{%.*}}, 0 -; PTX: add.{{.*}} [[RD3:%.*]], %SP, 0; -; PTX: selp.{{.*}} [[RD4:%.*]], [[RD2]], [[RD3]], [[P1]]; -; PTX: ld.u32 {{%.*}}, [[[RD4]]]; -; OPT-LABEL: define void @grid_const_select_escape( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) { -; OPT: [[INPUT24:%.*]] = alloca i32, align 4 -; OPT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) -; OPT: [[INPUT11:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT12]]) -; OPT: load i32, ptr [[INOUT]] -; OPT: [[PTRNEW:%.*]] = select i1 [[LESS:%.*]], ptr [[INPUT11]], ptr [[INPUT24]] -; OPT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 -; +define void @grid_const_select(ptr 
byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) { +; PTX-LABEL: grid_const_select( +; PTX: { +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b32 %r<3>; +; PTX-NEXT: .reg .b64 %rd<10>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0; +; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2]; +; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; +; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1; +; PTX-NEXT: mov.u64 %rd5, %rd4; +; PTX-NEXT: cvta.param.u64 %rd6, %rd5; +; PTX-NEXT: mov.u64 %rd7, %rd1; +; PTX-NEXT: cvta.param.u64 %rd8, %rd7; +; PTX-NEXT: ld.global.u32 %r1, [%rd3]; +; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; +; PTX-NEXT: selp.b64 %rd9, %rd8, %rd6, %p1; +; PTX-NEXT: ld.u32 %r2, [%rd9]; +; PTX-NEXT: st.global.u32 [%rd3], %r2; +; PTX-NEXT: ret; +; OPT-LABEL: define void @grid_const_select( +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; OPT-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; OPT-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; OPT-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT2]], align 4 +; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0 +; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] +; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; OPT-NEXT: ret void %val = load i32, ptr %inout %less = icmp slt i32 %val, 0 %ptrnew = select i1 %less, 
ptr %input1, ptr %input2 @@ -304,16 +485,27 @@ define void @grid_const_select_escape(ptr byval(i32) align 4 %input1, ptr byval( define i32 @grid_const_ptrtoint(ptr byval(i32) %input) { ; PTX-LABEL: grid_const_ptrtoint( -; PTX-NOT: .local -; PTX: ld.param.{{.*}} {{%.*}}, [grid_const_ptrtoint_param_0]; -; PTX: cvta.param.u64 [[RD1:%.*]], {{%.*}} -; PTX: cvt.u32.u64 {{%.*}}, [[RD1]] +; PTX: { +; PTX-NEXT: .reg .b32 %r<4>; +; PTX-NEXT: .reg .b64 %rd<4>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0; +; PTX-NEXT: mov.u64 %rd2, %rd1; +; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0]; +; PTX-NEXT: cvta.param.u64 %rd3, %rd2; +; PTX-NEXT: cvt.u32.u64 %r2, %rd3; +; PTX-NEXT: add.s32 %r3, %r1, %r2; +; PTX-NEXT: st.param.b32 [func_retval0+0], %r3; +; PTX-NEXT: ret; ; OPT-LABEL: define i32 @grid_const_ptrtoint( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) { -; OPT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) -; OPT: [[VAL:%.*]] = load i32, ptr addrspace(101) [[INPUT2]] -; OPT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) -; OPT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 +; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] { +; OPT-NEXT: [[INPUT2:%.*]] = addrspacecast ptr [[INPUT]] to ptr addrspace(101) +; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4 +; OPT-NEXT: [[INPUT1:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2]]) +; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32 +; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]] +; OPT-NEXT: ret i32 [[KEEPALIVE]] %val = load i32, ptr %input %ptrval = ptrtoint ptr %input to i32 %keepalive = add i32 %val, %ptrval @@ -352,13 +544,13 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr !14 = !{ptr @grid_const_partial_escapemem, !"kernel", i32 1, !"grid_constant", !15} !15 = !{i32 1} -!16 = 
!{ptr @grid_const_phi_escape, !"kernel", i32 1, !"grid_constant", !17} +!16 = !{ptr @grid_const_phi, !"kernel", i32 1, !"grid_constant", !17} !17 = !{i32 1} -!18 = !{ptr @grid_const_phi_escape2, !"kernel", i32 1, !"grid_constant", !19} +!18 = !{ptr @grid_const_phi_ngc, !"kernel", i32 1, !"grid_constant", !19} !19 = !{i32 1} -!20 = !{ptr @grid_const_select_escape, !"kernel", i32 1, !"grid_constant", !21} +!20 = !{ptr @grid_const_select, !"kernel", i32 1, !"grid_constant", !21} !21 = !{i32 1} !22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23} diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index f041f202777f61a..a414a6c41cd5b25 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -1,166 +1,469 @@ -; RUN: llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK32 -; RUN: llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} -; RUN: %if ptxas %{ llc < %s -mtriple nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %} - -%struct.ham = type { [4 x i32] } - -; // Verify that load with static offset into parameter is done directly. 
-; CHECK-LABEL: .visible .entry static_offset -; CHECK-NOT: .local -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1 -; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]] -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1 -; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]] -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; -; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_addr1]]+12]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @static_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 { -bb: - %tmp = icmp eq i32 %arg2, 3 - br i1 %tmp, label %bb3, label %bb6 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --scrub-attributes --version 5 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_60 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_60 +; RUN: opt < %s -mtriple nvptx64 -mcpu=sm_70 -mattr=ptx77 -nvptx-lower-args -S | FileCheck %s --check-prefixes=COMMON,SM_70 +source_filename = "" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +%struct.S = type { i32, i32 } + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +declare dso_local void @_Z6escapePv(ptr noundef) local_unnamed_addr #0 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +declare dso_local void @_Z6escapei(i32 noundef) local_unnamed_addr #0 -bb3: ; preds = %bb - %tmp4 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 3 - 
%tmp5 = load i32, ptr %tmp4, align 4 - store i32 %tmp5, ptr %arg, align 4 - br label %bb6 +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1 -bb6: ; preds = %bb3, %bb +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite) +declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg) #1 + +; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write) +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #2 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @read_only( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void +; +entry: + %i = load i32, ptr %s, align 4 + store i32 %i, ptr %out, align 4 ret void } -; // Verify that load with dynamic offset into parameter is also done directly. 
-; CHECK-LABEL: .visible .entry dynamic_offset -; CHECK-NOT: .local -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: mov.b64 %[[param_addr:rd[0-9]+]], {{.*}}_param_1 -; CHECK64: mov.u64 %[[param_addr1:rd[0-9]+]], %[[param_addr]] -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; CHECK64: add.s64 %[[param_w_offset:rd[0-9]+]], %[[param_addr1]], +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only_gep(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @read_only_gep( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void ; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: mov.b32 %[[param_addr:r[0-9]+]], {{.*}}_param_1 -; CHECK32: mov.u32 %[[param_addr1:r[0-9]+]], %[[param_addr]] -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; CHECK32: add.s32 %[[param_w_offset:r[0-9]+]], %[[param_addr1]], +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %i = load i32, ptr %b, align 4 + store i32 %i, ptr %out, align 4 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void 
@read_only_gep_asc(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @read_only_gep_asc( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4 +; COMMON-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void ; -; CHECK: ld.param.u32 [[value:%r[0-9]+]], [%[[param_w_offset]]]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %asc = addrspacecast ptr %b to ptr addrspace(101) + %i = load i32, ptr addrspace(101) %asc, align 4 + store i32 %i, ptr %out, align 4 + ret void +} -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @dynamic_offset(ptr nocapture %arg, ptr nocapture readonly byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #0 { -bb: - %tmp = sext i32 %arg2 to i64 - %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp - %tmp4 = load i32, ptr %tmp3, align 4 - store i32 %tmp4, ptr %arg, align 4 +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @read_only_gep_asc0(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @read_only_gep_asc0( +; COMMON-SAME: ptr nocapture noundef writeonly 
[[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: [[ASC:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(101) +; COMMON-NEXT: [[ASC0:%.*]] = addrspacecast ptr addrspace(101) [[ASC]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[ASC0]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[OUT2]], align 4 +; COMMON-NEXT: ret void +; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + %asc = addrspacecast ptr %b to ptr addrspace(101) + %asc0 = addrspacecast ptr addrspace(101) %asc to ptr + %i = load i32, ptr %asc0, align 4 + store i32 %i, ptr %out, align 4 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptr( +; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = 
addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S3]]) +; COMMON-NEXT: ret void +; +entry: + call void @_Z6escapePv(ptr noundef nonnull %s) #0 ret void } -; Same as above, but with a bitcast present in the chain -; CHECK-LABEL:.visible .entry gep_bitcast -; CHECK-NOT: .local -; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_param_0] -; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_param_1 +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr_gep(ptr nocapture noundef readnone %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptr_gep( +; COMMON-SAME: ptr nocapture noundef readnone [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) +; COMMON-NEXT: ret void ; -; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_param_0] -; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_param_1 +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + call void @_Z6escapePv(ptr noundef nonnull %b) #0 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local 
void @escape_ptr_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptr_store( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: store ptr [[S3]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void ; -; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_param_2] -; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}] -; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]]; -; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}] -; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]]; +entry: + store ptr %s, ptr %out, align 8 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptr_gep_store(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptr_gep_store( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; 
COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S3]], i64 4 +; COMMON-NEXT: store ptr [[B]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void +; +entry: + %b = getelementptr inbounds nuw i8, ptr %s, i64 4 + store ptr %b, ptr %out, align 8 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @escape_ptrtoint(ptr nocapture noundef writeonly %out, ptr noundef byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @escape_ptrtoint( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S3]] to i64 +; COMMON-NEXT: store i64 [[I]], ptr [[OUT2]], align 8 +; COMMON-NEXT: ret void +; +entry: + %i = ptrtoint ptr %s to i64 + store i64 %i, ptr %out, align 8 + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @memcpy_from_param(ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @memcpy_from_param( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly 
byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; COMMON-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @memcpy_to_param( +; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef readnone byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S3]], ptr [[IN2]], i64 16, i1 true) +; COMMON-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) + ret void +} + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @copy_on_store(ptr nocapture noundef readonly %in, ptr nocapture noundef byval(%struct.S) align 4 %s, i1 noundef zeroext %b) 
local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @copy_on_store( +; COMMON-SAME: ptr nocapture noundef readonly [[IN:%.*]], ptr nocapture noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[S3:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[S4:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[S5:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[S4]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[S5]], ptr [[S3]], align 4 +; COMMON-NEXT: [[IN1:%.*]] = addrspacecast ptr [[IN]] to ptr addrspace(1) +; COMMON-NEXT: [[IN2:%.*]] = addrspacecast ptr addrspace(1) [[IN1]] to ptr +; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN2]], align 4 +; COMMON-NEXT: store i32 [[I]], ptr [[S3]], align 4 +; COMMON-NEXT: ret void ; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @gep_bitcast(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 { bb: - %n64 = sext i32 %n to i64 - %gep = getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64 - %load = load i8, ptr %gep, align 4 - store i8 %load, ptr %out, align 4 + %i = load i32, ptr %in, align 4 + store i32 %i, ptr %s, align 4 ret void } -; Same as above, but with an ASC(101) present in the chain -; CHECK-LABEL:.visible .entry gep_bitcast_asc -; CHECK-NOT: .local -; CHECK64-DAG: ld.param.u64 [[out:%rd[0-9]+]], [gep_bitcast_asc_param_0] -; CHECK64-DAG: mov.b64 {{%rd[0-9]+}}, gep_bitcast_asc_param_1 +define void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { +; SM_60-LABEL: define void @test_select( +; SM_60-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; SM_60-NEXT: [[BB:.*:]] +; SM_60-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_60-NEXT: 
[[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr +; SM_60-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 +; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 +; SM_60-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] +; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[OUT8]], align 4 +; SM_60-NEXT: ret void ; -; CHECK32-DAG: ld.param.u32 [[out:%r[0-9]+]], [gep_bitcast_asc_param_0] -; CHECK32-DAG: mov.b32 {{%r[0-9]+}}, gep_bitcast_asc_param_1 +; SM_70-LABEL: define void @test_select( +; SM_70-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; SM_70-NEXT: [[BB:.*:]] +; SM_70-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; SM_70-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; SM_70-NEXT: [[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]] +; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_70-NEXT: store i32 
[[VALLOADED]], ptr [[OUT2]], align 4 +; SM_70-NEXT: ret void ; -; CHECK-DAG: ld.param.u32 {{%r[0-9]+}}, [gep_bitcast_asc_param_2] -; CHECK64: ld.param.u8 [[value:%rs[0-9]+]], [{{%rd[0-9]+}}] -; CHECK64: st.global.u8 [{{%rd[0-9]+}}], [[value]]; -; CHECK32: ld.param.u8 [[value:%rs[0-9]+]], [{{%r[0-9]+}}] -; CHECK32: st.global.u8 [{{%r[0-9]+}}], [[value]]; +bb: + %ptrnew = select i1 %cond, ptr %input1, ptr %input2 + %valloaded = load i32, ptr %ptrnew, align 4 + store i32 %valloaded, ptr %out, align 4 + ret void +} + +define void @test_select_write(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %out, i1 %cond) { +; COMMON-LABEL: define void @test_select_write( +; COMMON-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[OUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3:[0-9]+]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[OUT7:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT8:%.*]] = addrspacecast ptr addrspace(1) [[OUT7]] to ptr +; COMMON-NEXT: [[INPUT24:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT26:%.*]] = load i32, ptr addrspace(101) [[INPUT25]], align 4 +; COMMON-NEXT: store i32 [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4 +; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT13:%.*]] = load i32, ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store i32 [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]] +; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COMMON-NEXT: ret void ; -; Function Attrs: nofree norecurse nounwind willreturn mustprogress -define dso_local void @gep_bitcast_asc(ptr nocapture %out, ptr nocapture readonly byval(%struct.ham) align 4 %in, i32 %n) local_unnamed_addr #0 { bb: - %n64 = sext i32 %n to i64 - %gep = 
getelementptr inbounds %struct.ham, ptr %in, i64 0, i32 0, i64 %n64 - %asc = addrspacecast ptr %gep to ptr addrspace(101) - %load = load i8, ptr addrspace(101) %asc, align 4 - store i8 %load, ptr %out, align 4 - ret void -} - - -; Verify that if the pointer escapes, then we do fall back onto using a temp copy. -; CHECK-LABEL: .visible .entry pointer_escapes -; CHECK: .local .align 4 .b8 __local_depot{{.*}} -; CHECK64: ld.param.u64 [[result_addr:%rd[0-9]+]], [{{.*}}_param_0] -; CHECK64: add.u64 %[[copy_addr:rd[0-9]+]], %SPL, 0; -; CHECK32: ld.param.u32 [[result_addr:%r[0-9]+]], [{{.*}}_param_0] -; CHECK32: add.u32 %[[copy_addr:r[0-9]+]], %SPL, 0; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+12]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+8]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1+4]; -; CHECK-DAG: ld.param.u32 %{{.*}}, [pointer_escapes_param_1]; -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+12], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+8], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]+4], -; CHECK-DAG: st.local.u32 [%[[copy_addr]]], -; CHECK64: cvta.to.global.u64 [[result_addr_g:%rd[0-9]+]], [[result_addr]] -; CHECK64: add.s64 %[[copy_w_offset:rd[0-9]+]], %[[copy_addr]], -; CHECK32: cvta.to.global.u32 [[result_addr_g:%r[0-9]+]], [[result_addr]] -; CHECK32: add.s32 %[[copy_w_offset:r[0-9]+]], %[[copy_addr]], -; CHECK: ld.local.u32 [[value:%r[0-9]+]], [%[[copy_w_offset]]]; -; CHECK: st.global.u32 [[[result_addr_g]]], [[value]]; - -; Function Attrs: convergent norecurse nounwind mustprogress -define dso_local void @pointer_escapes(ptr nocapture %arg, ptr byval(%struct.ham) align 4 %arg1, i32 %arg2) local_unnamed_addr #1 { + %ptrnew = select i1 %cond, ptr %input1, ptr %input2 + store i32 1, ptr %ptrnew, align 4 + ret void +} + +define void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, ptr %inout, i1 %cond) { +; SM_60-LABEL: define void @test_phi( +; SM_60-SAME: ptr 
byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; SM_60-NEXT: [[BB:.*:]] +; SM_60-NEXT: [[INOUT7:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; SM_60-NEXT: [[INOUT8:%.*]] = addrspacecast ptr addrspace(1) [[INOUT7]] to ptr +; SM_60-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 +; SM_60-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 +; SM_60-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 +; SM_60-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_60-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4 +; SM_60-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4 +; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; SM_60: [[FIRST]]: +; SM_60-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 +; SM_60-NEXT: br label %[[MERGE:.*]] +; SM_60: [[SECOND]]: +; SM_60-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 +; SM_60-NEXT: br label %[[MERGE]] +; SM_60: [[MERGE]]: +; SM_60-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_60-NEXT: store i32 [[VALLOADED]], ptr [[INOUT8]], align 4 +; SM_60-NEXT: ret void +; +; SM_70-LABEL: define void @test_phi( +; SM_70-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; SM_70-NEXT: [[BB:.*:]] +; SM_70-NEXT: [[INOUT1:%.*]] = addrspacecast ptr [[INOUT]] to ptr addrspace(1) +; SM_70-NEXT: [[INOUT2:%.*]] = addrspacecast ptr addrspace(1) [[INOUT1]] to ptr +; SM_70-NEXT: 
[[INPUT2_PARAM:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT2_PARAM]]) +; SM_70-NEXT: [[INPUT1_PARAM:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = call ptr @llvm.nvvm.ptr.param.to.gen.p0.p101(ptr addrspace(101) [[INPUT1_PARAM]]) +; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; SM_70: [[FIRST]]: +; SM_70-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0 +; SM_70-NEXT: br label %[[MERGE:.*]] +; SM_70: [[SECOND]]: +; SM_70-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1 +; SM_70-NEXT: br label %[[MERGE]] +; SM_70: [[MERGE]]: +; SM_70-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4 +; SM_70-NEXT: store i32 [[VALLOADED]], ptr [[INOUT2]], align 4 +; SM_70-NEXT: ret void +; bb: - %tmp = sext i32 %arg2 to i64 - %tmp3 = getelementptr inbounds %struct.ham, ptr %arg1, i64 0, i32 0, i64 %tmp - %tmp4 = load i32, ptr %tmp3, align 4 - store i32 %tmp4, ptr %arg, align 4 - %tmp5 = call ptr @escape(ptr nonnull %tmp3) #3 + br i1 %cond, label %first, label %second + +first: ; preds = %bb + %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0 + br label %merge + +second: ; preds = %bb + %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1 + br label %merge + +merge: ; preds = %second, %first + %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ] + %valloaded = load i32, ptr %ptrnew, align 4 + store i32 %valloaded, ptr %inout, align 4 ret void } -; Function Attrs: convergent nounwind -declare dso_local ptr @escape(ptr) local_unnamed_addr +define void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr byval(%struct.S) %input2, i1 %cond) { +; 
COMMON-LABEL: define void @test_phi_write( +; COMMON-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], i1 [[COND:%.*]]) #[[ATTR3]] { +; COMMON-NEXT: [[BB:.*:]] +; COMMON-NEXT: [[INPUT24:%.*]] = alloca [[STRUCT_S]], align 8 +; COMMON-NEXT: [[INPUT25:%.*]] = addrspacecast ptr [[INPUT2]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT26:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT25]], align 8 +; COMMON-NEXT: store [[STRUCT_S]] [[INPUT26]], ptr [[INPUT24]], align 4 +; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4 +; COMMON-NEXT: [[INPUT12:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) +; COMMON-NEXT: [[INPUT13:%.*]] = load [[STRUCT_S]], ptr addrspace(101) [[INPUT12]], align 4 +; COMMON-NEXT: store [[STRUCT_S]] [[INPUT13]], ptr [[INPUT11]], align 4 +; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]] +; COMMON: [[FIRST]]: +; COMMON-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT11]], i32 0, i32 0 +; COMMON-NEXT: br label %[[MERGE:.*]] +; COMMON: [[SECOND]]: +; COMMON-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT24]], i32 0, i32 1 +; COMMON-NEXT: br label %[[MERGE]] +; COMMON: [[MERGE]]: +; COMMON-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ] +; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4 +; COMMON-NEXT: ret void +; +bb: + br i1 %cond, label %first, label %second + +first: ; preds = %bb + %ptr1 = getelementptr inbounds %struct.S, ptr %input1, i32 0, i32 0 + br label %merge + +second: ; preds = %bb + %ptr2 = getelementptr inbounds %struct.S, ptr %input2, i32 0, i32 1 + br label %merge + +merge: ; preds = %second, %first + %ptrnew = phi ptr [ %ptr1, %first ], [ %ptr2, %second ] + store i32 1, ptr %ptrnew, align 4 + ret void +} +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) "no-trapping-math"="true" "target-cpu"="sm_60" 
"target-features"="+ptx78,+sm_60" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } -!llvm.module.flags = !{!0, !1, !2} -!nvvm.annotations = !{!3, !4, !5, !6, !7} +!llvm.module.flags = !{!0, !1, !2, !3} +!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19} +!llvm.ident = !{!20, !21} -!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 9, i32 1]} +!0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} !1 = !{i32 1, !"wchar_size", i32 4} !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0} -!3 = !{ptr @static_offset, !"kernel", i32 1} -!4 = !{ptr @dynamic_offset, !"kernel", i32 1} -!5 = !{ptr @pointer_escapes, !"kernel", i32 1} -!6 = !{ptr @gep_bitcast, !"kernel", i32 1} -!7 = !{ptr @gep_bitcast_asc, !"kernel", i32 1} +!3 = !{i32 7, !"frame-pointer", i32 2} +!4 = !{ptr @read_only, !"kernel", i32 1} +!5 = !{ptr @escape_ptr, !"kernel", i32 1} +!6 = !{ptr @escape_ptr_gep, !"kernel", i32 1} +!7 = !{ptr @escape_ptr_store, !"kernel", i32 1} +!8 = !{ptr @escape_ptr_gep_store, !"kernel", i32 1} +!9 = !{ptr @escape_ptrtoint, !"kernel", i32 1} +!10 = !{ptr @memcpy_from_param, !"kernel", i32 1} +!11 = !{ptr @memcpy_to_param, !"kernel", i32 1} +!12 = !{ptr @copy_on_store, !"kernel", i32 1} +!13 = !{ptr @read_only_gep, !"kernel", i32 1} +!14 = !{ptr @read_only_gep_asc, !"kernel", i32 1} +!15 = !{ptr @read_only_gep_asc0, !"kernel", i32 1} +!16 = !{ptr @test_select, !"kernel", i32 1} +!17 = !{ptr @test_phi, !"kernel", i32 1} +!18 = !{ptr @test_phi_write, !"kernel", i32 1} +!19 = !{ptr @test_select_write, !"kernel", i32 1} +!20 = !{!"clang version 20.0.0git"} +!21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} From 96b7c64b8a874584a9dad44bb8901904c14701c0 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Wed, 11 Sep 2024 10:33:19 -0700 Subject: [PATCH 12/94] [LLDB] Reapply SBSaveCore Add Memory List 
(#107937) Recently in #107731 this change was revereted due to excess memory size in `TestSkinnyCore`. This was due to a bug where a range's end was being passed as size. Creating massive memory ranges. Additionally, and requiring additional review, I added more unit tests and more verbose logic to the merging of save core memory regions. @jasonmolenda as an FYI. --- lldb/include/lldb/API/SBMemoryRegionInfo.h | 2 +- lldb/include/lldb/API/SBSaveCoreOptions.h | 11 + lldb/include/lldb/Symbol/SaveCoreOptions.h | 11 +- .../lldb/Target/CoreFileMemoryRanges.h | 50 +++++ lldb/include/lldb/Target/Process.h | 25 +-- lldb/include/lldb/Utility/RangeMap.h | 6 + lldb/include/lldb/lldb-enumerations.h | 1 + lldb/include/lldb/lldb-forward.h | 1 + lldb/include/lldb/lldb-private-interfaces.h | 1 - lldb/source/API/SBSaveCoreOptions.cpp | 11 + lldb/source/Commands/CommandObjectProcess.cpp | 1 + .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 6 +- .../ObjectFile/Mach-O/ObjectFileMachO.h | 1 + .../Minidump/MinidumpFileBuilder.cpp | 35 +-- .../ObjectFile/Minidump/MinidumpFileBuilder.h | 5 +- .../ObjectFile/Minidump/ObjectFileMinidump.h | 1 + .../ObjectFile/PECOFF/ObjectFilePECOFF.cpp | 1 + .../ObjectFile/PECOFF/ObjectFilePECOFF.h | 1 + lldb/source/Symbol/SaveCoreOptions.cpp | 14 ++ lldb/source/Target/CMakeLists.txt | 1 + lldb/source/Target/CoreFileMemoryRanges.cpp | 86 ++++++++ lldb/source/Target/Process.cpp | 76 +++++-- .../TestProcessSaveCoreMinidump.py | 149 +++++++++++++ lldb/unittests/Process/Utility/CMakeLists.txt | 1 + .../Utility/CoreFileMemoryRangesTest.cpp | 205 ++++++++++++++++++ 25 files changed, 635 insertions(+), 67 deletions(-) create mode 100644 lldb/include/lldb/Target/CoreFileMemoryRanges.h create mode 100644 lldb/source/Target/CoreFileMemoryRanges.cpp create mode 100644 lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp diff --git a/lldb/include/lldb/API/SBMemoryRegionInfo.h b/lldb/include/lldb/API/SBMemoryRegionInfo.h index be55de4ead1fa8c..f9a5dc993d7cb6f 100644 
--- a/lldb/include/lldb/API/SBMemoryRegionInfo.h +++ b/lldb/include/lldb/API/SBMemoryRegionInfo.h @@ -120,7 +120,7 @@ class LLDB_API SBMemoryRegionInfo { private: friend class SBProcess; friend class SBMemoryRegionInfoList; - + friend class SBSaveCoreOptions; friend class lldb_private::ScriptInterpreter; lldb_private::MemoryRegionInfo &ref(); diff --git a/lldb/include/lldb/API/SBSaveCoreOptions.h b/lldb/include/lldb/API/SBSaveCoreOptions.h index ba48ba5eaea5a0b..c076d3ce6f75752 100644 --- a/lldb/include/lldb/API/SBSaveCoreOptions.h +++ b/lldb/include/lldb/API/SBSaveCoreOptions.h @@ -80,6 +80,17 @@ class LLDB_API SBSaveCoreOptions { /// \return True if the thread was removed, false if it was not in the list. bool RemoveThread(lldb::SBThread thread); + /// Add a memory region to save in the core file. + /// + /// \param region The memory region to save. + /// \returns An empty SBError upon success, or an error if the region is + /// invalid. + /// \note Ranges that overlapped will be unioned into a single region, this + /// also supercedes stack minification. Specifying full regions and a + /// non-custom core style will include the specified regions and union them + /// with all style specific regions. + SBError AddMemoryRegionToSave(const SBMemoryRegionInfo ®ion); + /// Reset all options. 
void Clear(); diff --git a/lldb/include/lldb/Symbol/SaveCoreOptions.h b/lldb/include/lldb/Symbol/SaveCoreOptions.h index f4fed4676fa4ae1..d90d08026016dc6 100644 --- a/lldb/include/lldb/Symbol/SaveCoreOptions.h +++ b/lldb/include/lldb/Symbol/SaveCoreOptions.h @@ -10,13 +10,15 @@ #define LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H #include "lldb/Utility/FileSpec.h" -#include "lldb/lldb-forward.h" -#include "lldb/lldb-types.h" +#include "lldb/Utility/RangeMap.h" #include +#include #include #include +using MemoryRanges = lldb_private::RangeVector; + namespace lldb_private { class SaveCoreOptions { @@ -38,8 +40,12 @@ class SaveCoreOptions { Status AddThread(lldb::ThreadSP thread_sp); bool RemoveThread(lldb::ThreadSP thread_sp); bool ShouldThreadBeSaved(lldb::tid_t tid) const; + bool HasSpecifiedThreads() const; Status EnsureValidConfiguration(lldb::ProcessSP process_sp) const; + const MemoryRanges &GetCoreFileMemoryRanges() const; + + void AddMemoryRegionToSave(const lldb_private::MemoryRegionInfo ®ion); void Clear(); @@ -51,6 +57,7 @@ class SaveCoreOptions { std::optional m_style; lldb::ProcessSP m_process_sp; std::unordered_set m_threads_to_save; + MemoryRanges m_regions_to_save; }; } // namespace lldb_private diff --git a/lldb/include/lldb/Target/CoreFileMemoryRanges.h b/lldb/include/lldb/Target/CoreFileMemoryRanges.h new file mode 100644 index 000000000000000..503ecd691e59486 --- /dev/null +++ b/lldb/include/lldb/Target/CoreFileMemoryRanges.h @@ -0,0 +1,50 @@ +//===-- CoreFileMemoryRanges.h ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Utility/RangeMap.h" +#include "lldb/Utility/Status.h" + +#include "llvm/ADT/AddressRanges.h" + +#ifndef LLDB_TARGET_COREFILEMEMORYRANGES_H +#define LLDB_TARGET_COREFILEMEMORYRANGES_H + +namespace lldb_private { + +struct CoreFileMemoryRange { + llvm::AddressRange range; /// The address range to save into the core file. + uint32_t lldb_permissions; /// A bit set of lldb::Permissions bits. + + bool operator==(const CoreFileMemoryRange &rhs) const { + return range == rhs.range && lldb_permissions == rhs.lldb_permissions; + } + + bool operator!=(const CoreFileMemoryRange &rhs) const { + return !(*this == rhs); + } + + bool operator<(const CoreFileMemoryRange &rhs) const { + if (range < rhs.range) + return true; + if (range == rhs.range) + return lldb_permissions < rhs.lldb_permissions; + return false; + } +}; + +class CoreFileMemoryRanges + : public lldb_private::RangeDataVector { +public: + /// Finalize and merge all overlapping ranges in this collection. Ranges + /// will be seperated based on permissions. + Status FinalizeCoreFileSaveRanges(); +}; +} // namespace lldb_private + +#endif // LLDB_TARGET_COREFILEMEMORYRANGES_H diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index c66cfb2c245efda..b8c53a474ba6b95 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -35,6 +35,8 @@ #include "lldb/Host/ProcessLaunchInfo.h" #include "lldb/Host/ProcessRunLock.h" #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" +#include "lldb/Target/CoreFileMemoryRanges.h" #include "lldb/Target/ExecutionContextScope.h" #include "lldb/Target/InstrumentationRuntime.h" #include "lldb/Target/Memory.h" @@ -710,29 +712,6 @@ class Process : public std::enable_shared_from_this, /// is not supported by the plugin, error otherwise. 
virtual llvm::Expected SaveCore(llvm::StringRef outfile); - struct CoreFileMemoryRange { - llvm::AddressRange range; /// The address range to save into the core file. - uint32_t lldb_permissions; /// A bit set of lldb::Permissions bits. - - bool operator==(const CoreFileMemoryRange &rhs) const { - return range == rhs.range && lldb_permissions == rhs.lldb_permissions; - } - - bool operator!=(const CoreFileMemoryRange &rhs) const { - return !(*this == rhs); - } - - bool operator<(const CoreFileMemoryRange &rhs) const { - if (range < rhs.range) - return true; - if (range == rhs.range) - return lldb_permissions < rhs.lldb_permissions; - return false; - } - }; - - using CoreFileMemoryRanges = std::vector; - /// Helper function for Process::SaveCore(...) that calculates the address /// ranges that should be saved. This allows all core file plug-ins to save /// consistent memory ranges given a \a core_style. diff --git a/lldb/include/lldb/Utility/RangeMap.h b/lldb/include/lldb/Utility/RangeMap.h index 8cc382bcc046ce1..433466eebced8b8 100644 --- a/lldb/include/lldb/Utility/RangeMap.h +++ b/lldb/include/lldb/Utility/RangeMap.h @@ -450,6 +450,12 @@ class RangeDataVector { void Append(const Entry &entry) { m_entries.emplace_back(entry); } + /// Append a range with data to the vector + /// \param B The base of the memory range + /// \param S The size of the memory range + /// \param T The data associated with the memory range + void Append(B &&b, S &&s, T &&t) { m_entries.emplace_back(Entry(b, s, t)); } + bool Erase(uint32_t start, uint32_t end) { if (start >= end || end > m_entries.size()) return false; diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 7bfde8b9de1271c..938f6e3abe8f2a6 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -1222,6 +1222,7 @@ enum SaveCoreStyle { eSaveCoreFull = 1, eSaveCoreDirtyOnly = 2, eSaveCoreStackOnly = 3, + eSaveCoreCustomOnly = 4, }; /// Events 
that might happen during a trace session. diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index 337eff696fcf3f5..5fb288ad43af488 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -207,6 +207,7 @@ class StackFrameRecognizer; class StackFrameRecognizerManager; class StackID; class Status; +class SaveCoreOptions; class StopInfo; class Stoppoint; class StoppointCallbackContext; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index b3c8cda899b95ef..5bac5cd3e86b59d 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -9,7 +9,6 @@ #ifndef LLDB_LLDB_PRIVATE_INTERFACES_H #define LLDB_LLDB_PRIVATE_INTERFACES_H -#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/lldb-enumerations.h" #include "lldb/lldb-forward.h" #include "lldb/lldb-private-enumerations.h" diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp index ef82b0253f11997..c79b57fa62c2be8 100644 --- a/lldb/source/API/SBSaveCoreOptions.cpp +++ b/lldb/source/API/SBSaveCoreOptions.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/API/SBSaveCoreOptions.h" +#include "lldb/API/SBMemoryRegionInfo.h" #include "lldb/Host/FileSystem.h" #include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Utility/Instrumentation.h" @@ -89,6 +90,16 @@ bool SBSaveCoreOptions::RemoveThread(lldb::SBThread thread) { return m_opaque_up->RemoveThread(thread.GetSP()); } +lldb::SBError +SBSaveCoreOptions::AddMemoryRegionToSave(const SBMemoryRegionInfo ®ion) { + LLDB_INSTRUMENT_VA(this, region); + // Currently add memory region can't fail, so we always return a success + // SBerror, but because these API's live forever, this is the most future + // proof thing to do. 
+ m_opaque_up->AddMemoryRegionToSave(region.ref()); + return SBError(); +} + void SBSaveCoreOptions::Clear() { LLDB_INSTRUMENT_VA(this); m_opaque_up->Clear(); diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 25eb633f1e6dad9..5b0f4f66f248b64 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -25,6 +25,7 @@ #include "lldb/Interpreter/OptionArgParser.h" #include "lldb/Interpreter/OptionGroupPythonClassWithDict.h" #include "lldb/Interpreter/Options.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Target/Platform.h" #include "lldb/Target/Process.h" #include "lldb/Target/StopInfo.h" diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index b28beab117cca4b..06da83e26a26a5b 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -6562,13 +6562,15 @@ bool ObjectFileMachO::SaveCore(const lldb::ProcessSP &process_sp, } if (make_core) { - Process::CoreFileMemoryRanges core_ranges; + CoreFileMemoryRanges core_ranges; error = process_sp->CalculateCoreFileSaveRanges(options, core_ranges); if (error.Success()) { const uint32_t addr_byte_size = target_arch.GetAddressByteSize(); const ByteOrder byte_order = target_arch.GetByteOrder(); std::vector segment_load_commands; - for (const auto &core_range : core_ranges) { + for (const auto &core_range_info : core_ranges) { + // TODO: Refactor RangeDataVector to have a data iterator. 
+ const auto &core_range = core_range_info.data; uint32_t cmd_type = LC_SEGMENT_64; uint32_t segment_size = sizeof(llvm::MachO::segment_command_64); if (addr_byte_size == 4) { diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h index 27bc237aaac48d8..be87112df7d8984 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.h @@ -12,6 +12,7 @@ #include "lldb/Core/Address.h" #include "lldb/Host/SafeMachO.h" #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/FileSpecList.h" #include "lldb/Utility/RangeMap.h" diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index 5c9ba223ad143e3..edc568a6b47e00d 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -831,25 +831,32 @@ Status MinidumpFileBuilder::AddMemoryList() { // bytes of the core file. Thread structures in minidump files can only use // 32 bit memory descriptiors, so we emit them first to ensure the memory is // in accessible with a 32 bit offset. - Process::CoreFileMemoryRanges ranges_32; - Process::CoreFileMemoryRanges ranges_64; - Process::CoreFileMemoryRanges all_core_memory_ranges; + std::vector ranges_32; + std::vector ranges_64; + CoreFileMemoryRanges all_core_memory_ranges; error = m_process_sp->CalculateCoreFileSaveRanges(m_save_core_options, all_core_memory_ranges); + + std::vector all_core_memory_vec; + // Extract all the data into just a vector of data. So we can mutate this in + // place. 
+ for (const auto &core_range : all_core_memory_ranges) + all_core_memory_vec.push_back(core_range.data); + if (error.Fail()) return error; // Start by saving all of the stacks and ensuring they fit under the 32b // limit. uint64_t total_size = GetCurrentDataEndOffset(); - auto iterator = all_core_memory_ranges.begin(); - while (iterator != all_core_memory_ranges.end()) { + auto iterator = all_core_memory_vec.begin(); + while (iterator != all_core_memory_vec.end()) { if (m_saved_stack_ranges.count(iterator->range.start()) > 0) { // We don't save stacks twice. ranges_32.push_back(*iterator); total_size += iterator->range.size() + sizeof(llvm::minidump::MemoryDescriptor); - iterator = all_core_memory_ranges.erase(iterator); + iterator = all_core_memory_vec.erase(iterator); } else { iterator++; } @@ -869,11 +876,11 @@ Status MinidumpFileBuilder::AddMemoryList() { // Then anything overflow extends into 64b addressable space. // All core memeroy ranges will either container nothing on stacks only // or all the memory ranges including stacks - if (!all_core_memory_ranges.empty()) - total_size += 256 + (all_core_memory_ranges.size() * + if (!all_core_memory_vec.empty()) + total_size += 256 + (all_core_memory_vec.size() * sizeof(llvm::minidump::MemoryDescriptor_64)); - for (const auto &core_range : all_core_memory_ranges) { + for (const auto &core_range : all_core_memory_vec) { const addr_t range_size = core_range.range.size(); // We don't need to check for stacks here because we already removed them // from all_core_memory_ranges. 
@@ -958,15 +965,15 @@ Status MinidumpFileBuilder::DumpDirectories() const { } static uint64_t -GetLargestRangeSize(const Process::CoreFileMemoryRanges &ranges) { +GetLargestRangeSize(const std::vector &ranges) { uint64_t max_size = 0; for (const auto &core_range : ranges) max_size = std::max(max_size, core_range.range.size()); return max_size; } -Status -MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) { +Status MinidumpFileBuilder::AddMemoryList_32( + std::vector &ranges) { std::vector descriptors; Status error; if (ranges.size() == 0) @@ -1042,8 +1049,8 @@ MinidumpFileBuilder::AddMemoryList_32(Process::CoreFileMemoryRanges &ranges) { return error; } -Status -MinidumpFileBuilder::AddMemoryList_64(Process::CoreFileMemoryRanges &ranges) { +Status MinidumpFileBuilder::AddMemoryList_64( + std::vector &ranges) { Status error; if (ranges.empty()) return error; diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h index 762de83db5a39ce..71001e26c00e91c 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h @@ -23,6 +23,7 @@ #include #include +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Target/Process.h" #include "lldb/Target/Target.h" #include "lldb/Utility/DataBufferHeap.h" @@ -120,9 +121,9 @@ class MinidumpFileBuilder { lldb_private::Status AddData(const void *data, uint64_t size); // Add MemoryList stream, containing dumps of important memory segments lldb_private::Status - AddMemoryList_64(lldb_private::Process::CoreFileMemoryRanges &ranges); + AddMemoryList_64(std::vector &ranges); lldb_private::Status - AddMemoryList_32(lldb_private::Process::CoreFileMemoryRanges &ranges); + AddMemoryList_32(std::vector &ranges); // Update the thread list on disk with the newly emitted stack RVAs. 
lldb_private::Status FixThreadStacks(); lldb_private::Status FlushBufferToDisk(); diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h index b76fcd0052a8a8c..2f45f01558e6675 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h +++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.h @@ -21,6 +21,7 @@ #define LLDB_SOURCE_PLUGINS_OBJECTFILE_MINIDUMP_OBJECTFILEMINIDUMP_H #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Utility/ArchSpec.h" class ObjectFileMinidump : public lldb_private::PluginInterface { diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index 9d01089745dfc91..8d9c919bc9b1013 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -17,6 +17,7 @@ #include "lldb/Interpreter/OptionValueDictionary.h" #include "lldb/Interpreter/OptionValueProperties.h" #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "lldb/Target/Process.h" #include "lldb/Target/SectionLoadList.h" #include "lldb/Target/Target.h" diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h index 8bccf3be3e5f636..4f4dedf773c5ba8 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h @@ -13,6 +13,7 @@ #include #include "lldb/Symbol/ObjectFile.h" +#include "lldb/Symbol/SaveCoreOptions.h" #include "llvm/Object/COFF.h" class ObjectFilePECOFF : public lldb_private::ObjectFile { diff --git a/lldb/source/Symbol/SaveCoreOptions.cpp b/lldb/source/Symbol/SaveCoreOptions.cpp index 35943726f2e4ef6..8d9aadece2152dd 100644 --- a/lldb/source/Symbol/SaveCoreOptions.cpp +++ 
b/lldb/source/Symbol/SaveCoreOptions.cpp @@ -102,6 +102,19 @@ bool SaveCoreOptions::ShouldThreadBeSaved(lldb::tid_t tid) const { return m_threads_to_save.count(tid) > 0; } +bool SaveCoreOptions::HasSpecifiedThreads() const { + return !m_threads_to_save.empty(); +} + +void SaveCoreOptions::AddMemoryRegionToSave( + const lldb_private::MemoryRegionInfo ®ion) { + m_regions_to_save.Insert(region.GetRange(), /*combine=*/true); +} + +const MemoryRanges &SaveCoreOptions::GetCoreFileMemoryRanges() const { + return m_regions_to_save; +} + Status SaveCoreOptions::EnsureValidConfiguration( lldb::ProcessSP process_sp) const { Status error; @@ -131,4 +144,5 @@ void SaveCoreOptions::Clear() { m_style = std::nullopt; m_threads_to_save.clear(); m_process_sp.reset(); + m_regions_to_save.Clear(); } diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index a42c44b761dc56e..a6d2eace975420f 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -11,6 +11,7 @@ add_lldb_library(lldbTarget ABI.cpp AssertFrameRecognizer.cpp DynamicRegisterInfo.cpp + CoreFileMemoryRanges.cpp ExecutionContext.cpp InstrumentationRuntime.cpp InstrumentationRuntimeStopInfo.cpp diff --git a/lldb/source/Target/CoreFileMemoryRanges.cpp b/lldb/source/Target/CoreFileMemoryRanges.cpp new file mode 100644 index 000000000000000..6e4ca4995915c33 --- /dev/null +++ b/lldb/source/Target/CoreFileMemoryRanges.cpp @@ -0,0 +1,86 @@ +//===-- CoreFileMemoryRanges.cpp --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Target/CoreFileMemoryRanges.h" + +using namespace lldb; +using namespace lldb_private; + +using Entry = CoreFileMemoryRanges::Entry; + +static bool Overlaps(const Entry *region_one, const Entry *region_two) { + return !(region_one->GetRangeEnd() < region_two->GetRangeBase() || + region_two->GetRangeEnd() < region_one->GetRangeBase()); +} + +static bool IntersectHelper(const Entry *region_one, const Entry *region_two) { + return region_one->GetRangeBase() == region_two->GetRangeEnd() || + region_one->GetRangeEnd() == region_two->GetRangeBase(); +} + +static bool OnlyIntersects(const Entry *region_one, const Entry *region_two) { + return IntersectHelper(region_one, region_two) || + IntersectHelper(region_two, region_one); +} + +static bool PermissionsMatch(const Entry *region_one, const Entry *region_two) { + return region_one->data.lldb_permissions == region_two->data.lldb_permissions; +} + +// This assumes any overlapping ranges will share the same permissions +// and that adjacent ranges could have different permissions. +Status CoreFileMemoryRanges::FinalizeCoreFileSaveRanges() { + Status error; + this->Sort(); + for (size_t i = this->GetSize() - 1; i > 0; i--) { + auto region_one = this->GetMutableEntryAtIndex(i); + auto region_two = this->GetMutableEntryAtIndex(i - 1); + if (Overlaps(region_one, region_two)) { + // It's okay for interesecting regions to have different permissions but + // if they overlap we fail because we don't know what to do with them. + if (!PermissionsMatch(region_one, region_two)) { + // Permissions mismatch and it's not a simple intersection. 
+ if (!OnlyIntersects(region_one, region_two)) { + error = Status::FromErrorStringWithFormatv( + "Memory region at {0}::{1} has different permssions than " + "overlapping region at {2}::{3}", + region_one->GetRangeBase(), region_one->GetRangeEnd(), + region_two->GetRangeBase(), region_two->GetRangeEnd()); + return error; + } + // Simple intersection, we can just not merge these. + else + continue; + } + const addr_t base = + std::min(region_one->GetRangeBase(), region_two->GetRangeBase()); + const addr_t byte_size = + std::max(region_one->GetRangeEnd(), region_two->GetRangeEnd()) - base; + + region_two->SetRangeBase(base); + region_two->SetByteSize(byte_size); + + // Because this is a range data vector, the entry has a base as well + // as the data contained in the entry. So we have to update both. + // And llvm::AddressRange isn't mutable so we have to create a new one. + llvm::AddressRange range(base, base + byte_size); + const CoreFileMemoryRange core_range = { + range, region_two->data.lldb_permissions}; + region_two->data = core_range; + // Erase is delete from [Inclusive, exclusive index). 
+ if (!this->Erase(i, i + 1)) { + error = Status::FromErrorStringWithFormat( + "Core file memory ranges mutated outside of " + "CalculateCoreFileSaveRanges"); + return error; + } + } + } + + return error; +} diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 40f3115f1ff6de9..aca08972811470c 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -6463,7 +6463,7 @@ Status Process::WriteMemoryTags(lldb::addr_t addr, size_t len, } // Create a CoreFileMemoryRange from a MemoryRegionInfo -static Process::CoreFileMemoryRange +static CoreFileMemoryRange CreateCoreFileMemoryRange(const MemoryRegionInfo ®ion) { const addr_t addr = region.GetRange().GetRangeBase(); llvm::AddressRange range(addr, addr + region.GetRange().GetByteSize()); @@ -6474,7 +6474,7 @@ CreateCoreFileMemoryRange(const MemoryRegionInfo ®ion) { // were added. Return false if the dirty page information is not valid or in // the region. static bool AddDirtyPages(const MemoryRegionInfo ®ion, - Process::CoreFileMemoryRanges &ranges) { + CoreFileMemoryRanges &ranges) { const auto &dirty_page_list = region.GetDirtyPageList(); if (!dirty_page_list) return false; @@ -6494,14 +6494,14 @@ static bool AddDirtyPages(const MemoryRegionInfo ®ion, } else { // Add previous contiguous range and init the new range with the // current dirty page. - ranges.push_back({range, lldb_permissions}); + ranges.Append(range.start(), range.size(), {range, lldb_permissions}); range = llvm::AddressRange(page_addr, page_addr + page_size); } } } // The last range if (!range.empty()) - ranges.push_back({range, lldb_permissions}); + ranges.Append(range.start(), range.size(), {range, lldb_permissions}); return true; } @@ -6513,7 +6513,7 @@ static bool AddDirtyPages(const MemoryRegionInfo ®ion, // will be added to \a ranges, else the entire range will be added to \a // ranges. 
static void AddRegion(const MemoryRegionInfo ®ion, bool try_dirty_pages, - Process::CoreFileMemoryRanges &ranges) { + CoreFileMemoryRanges &ranges) { // Don't add empty ranges. if (region.GetRange().GetByteSize() == 0) return; @@ -6522,13 +6522,17 @@ static void AddRegion(const MemoryRegionInfo ®ion, bool try_dirty_pages, return; if (try_dirty_pages && AddDirtyPages(region, ranges)) return; - ranges.push_back(CreateCoreFileMemoryRange(region)); + + ranges.Append(region.GetRange().GetRangeBase(), + region.GetRange().GetByteSize(), + CreateCoreFileMemoryRange(region)); } -static void SaveOffRegionsWithStackPointers( - Process &process, const SaveCoreOptions &core_options, - const MemoryRegionInfos ®ions, Process::CoreFileMemoryRanges &ranges, - std::set &stack_ends) { +static void SaveOffRegionsWithStackPointers(Process &process, + const SaveCoreOptions &core_options, + const MemoryRegionInfos ®ions, + CoreFileMemoryRanges &ranges, + std::set &stack_ends) { const bool try_dirty_pages = true; // Before we take any dump, we want to save off the used portions of the @@ -6568,11 +6572,11 @@ static void SaveOffRegionsWithStackPointers( // for a full core file style. static void GetCoreFileSaveRangesFull(Process &process, const MemoryRegionInfos ®ions, - Process::CoreFileMemoryRanges &ranges, + CoreFileMemoryRanges &ranges, std::set &stack_ends) { // Don't add only dirty pages, add full regions. -const bool try_dirty_pages = false; + const bool try_dirty_pages = false; for (const auto ®ion : regions) if (stack_ends.count(region.GetRange().GetRangeEnd()) == 0) AddRegion(region, try_dirty_pages, ranges); @@ -6582,9 +6586,10 @@ const bool try_dirty_pages = false; // least some dirty pages, as some OS versions don't support reporting what // pages are dirty within an memory region. If no memory regions have dirty // page information fall back to saving out all ranges with write permissions. 
-static void GetCoreFileSaveRangesDirtyOnly( - Process &process, const MemoryRegionInfos ®ions, - Process::CoreFileMemoryRanges &ranges, std::set &stack_ends) { +static void GetCoreFileSaveRangesDirtyOnly(Process &process, + const MemoryRegionInfos ®ions, + CoreFileMemoryRanges &ranges, + std::set &stack_ends) { // Iterate over the regions and find all dirty pages. bool have_dirty_page_info = false; @@ -6613,9 +6618,10 @@ static void GetCoreFileSaveRangesDirtyOnly( // dirty regions as this will make the core file smaller. If the process // doesn't support dirty regions, then it will fall back to adding the full // stack region. -static void GetCoreFileSaveRangesStackOnly( - Process &process, const MemoryRegionInfos ®ions, - Process::CoreFileMemoryRanges &ranges, std::set &stack_ends) { +static void GetCoreFileSaveRangesStackOnly(Process &process, + const MemoryRegionInfos ®ions, + CoreFileMemoryRanges &ranges, + std::set &stack_ends) { const bool try_dirty_pages = true; // Some platforms support annotating the region information that tell us that // it comes from a thread stack. So look for those regions first. 
@@ -6628,6 +6634,24 @@ static void GetCoreFileSaveRangesStackOnly( } } +static void GetUserSpecifiedCoreFileSaveRanges(Process &process, + const MemoryRegionInfos ®ions, + const SaveCoreOptions &options, + CoreFileMemoryRanges &ranges) { + const auto &option_ranges = options.GetCoreFileMemoryRanges(); + if (option_ranges.IsEmpty()) + return; + + for (const auto &range : regions) { + auto entry = option_ranges.FindEntryThatContains(range.GetRange()); + if (entry) { + ranges.Append(range.GetRange().GetRangeBase(), + range.GetRange().GetByteSize(), + CreateCoreFileMemoryRange(range)); + } + } +} + Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options, CoreFileMemoryRanges &ranges) { lldb_private::MemoryRegionInfos regions; @@ -6643,11 +6667,18 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options, "callers must set the core_style to something other than " "eSaveCoreUnspecified"); + GetUserSpecifiedCoreFileSaveRanges(*this, regions, options, ranges); + std::set stack_ends; - SaveOffRegionsWithStackPointers(*this, options, regions, ranges, stack_ends); + // For fully custom set ups, we don't want to even look at threads if there + // are no threads specified. + if (core_style != lldb::eSaveCoreCustomOnly || options.HasSpecifiedThreads()) + SaveOffRegionsWithStackPointers(*this, options, regions, ranges, + stack_ends); switch (core_style) { case eSaveCoreUnspecified: + case eSaveCoreCustomOnly: break; case eSaveCoreFull: @@ -6666,10 +6697,11 @@ Status Process::CalculateCoreFileSaveRanges(const SaveCoreOptions &options, if (err.Fail()) return err; - if (ranges.empty()) - return Status("no valid address ranges found for core style"); + if (ranges.IsEmpty()) + return Status::FromErrorStringWithFormat( + "no valid address ranges found for core style"); - return Status(); // Success! 
+ return ranges.FinalizeCoreFileSaveRanges(); } std::vector diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index ed15793b527fc91..2cbe20ee10b1af5 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -344,3 +344,152 @@ def test_save_linux_mini_dump_default_options(self): self.assertTrue(self.dbg.DeleteTarget(target)) if os.path.isfile(default_value_file): os.unlink(default_value_file) + + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test_save_linux_minidump_one_region(self): + """Test that we can save a Linux mini dump with one region in sbsavecore regions""" + + self.build() + exe = self.getBuildArtifact("a.out") + one_region_file = self.getBuildArtifact("core.one_region.dmp") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + + memory_region = lldb.SBMemoryRegionInfo() + memory_list = process.GetMemoryRegions() + memory_list.GetMemoryRegionAtIndex(0, memory_region) + + # This is almost identical to the single thread test case because + # minidump defaults to stacks only, so we want to see if the + # default options work as expected. 
+ options = lldb.SBSaveCoreOptions() + file_spec = lldb.SBFileSpec(one_region_file) + options.SetOutputFile(file_spec) + options.SetPluginName("minidump") + options.AddMemoryRegionToSave(memory_region) + options.SetStyle(lldb.eSaveCoreCustomOnly) + error = process.SaveCore(options) + print(f"Error: {error.GetCString()}") + self.assertTrue(error.Success(), error.GetCString()) + + core_target = self.dbg.CreateTarget(None) + core_proc = core_target.LoadCore(one_region_file) + core_memory_list = core_proc.GetMemoryRegions() + # Note because the /proc/pid maps are included on linux, we can't + # depend on size for validation, so we'll ensure the first region + # is present and then assert we fail on the second. + core_memory_region = lldb.SBMemoryRegionInfo() + core_memory_list.GetMemoryRegionAtIndex(0, core_memory_region) + self.assertEqual( + core_memory_region.GetRegionBase(), memory_region.GetRegionBase() + ) + self.assertEqual( + core_memory_region.GetRegionEnd(), memory_region.GetRegionEnd() + ) + + region_two = lldb.SBMemoryRegionInfo() + core_memory_list.GetMemoryRegionAtIndex(1, region_two) + err = lldb.SBError() + content = core_proc.ReadMemory(region_two.GetRegionBase(), 1, err) + self.assertTrue(err.Fail(), "Should fail to read memory") + + finally: + self.assertTrue(self.dbg.DeleteTarget(target)) + if os.path.isfile(one_region_file): + os.unlink(one_region_file) + + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test_save_minidump_custom_save_style(self): + """Test that verifies a custom and unspecified save style fails for + containing no data to save""" + + self.build() + exe = self.getBuildArtifact("a.out") + custom_file = self.getBuildArtifact("core.custom.dmp") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + + options = lldb.SBSaveCoreOptions() + 
options.SetOutputFile(lldb.SBFileSpec(custom_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreCustomOnly) + + error = process.SaveCore(options) + self.assertTrue(error.Fail()) + self.assertEqual( + error.GetCString(), "no valid address ranges found for core style" + ) + + finally: + self.assertTrue(self.dbg.DeleteTarget(target)) + if os.path.isfile(custom_file): + os.unlink(custom_file) + + def save_core_with_region(self, process, region_index): + try: + custom_file = self.getBuildArtifact("core.custom.dmp") + memory_region = lldb.SBMemoryRegionInfo() + memory_list = process.GetMemoryRegions() + memory_list.GetMemoryRegionAtIndex(0, memory_region) + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(custom_file)) + options.SetPluginName("minidump") + options.SetStyle(lldb.eSaveCoreFull) + + error = process.SaveCore(options) + self.assertTrue(error.Success()) + core_target = self.dbg.CreateTarget(None) + core_proc = core_target.LoadCore(custom_file) + core_memory_list = core_proc.GetMemoryRegions() + # proc/pid/ maps are included on linux, so we can't depend on size + # for validation, we make a set of all the ranges, + # and ensure no duplicates! 
+ range_set = set() + for x in range(core_memory_list.GetSize()): + core_memory_region = lldb.SBMemoryRegionInfo() + core_memory_list.GetMemoryRegionAtIndex(x, core_memory_region) + mem_tuple = ( + core_memory_region.GetRegionBase(), + core_memory_region.GetRegionEnd(), + ) + self.assertTrue( + mem_tuple not in range_set, "Duplicate memory region found" + ) + range_set.add(mem_tuple) + finally: + if os.path.isfile(custom_file): + os.unlink(custom_file) + + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test_save_minidump_custom_save_style_duplicated_regions(self): + """Test that verifies a custom and unspecified save style fails for + containing no data to save""" + + self.build() + exe = self.getBuildArtifact("a.out") + try: + target = self.dbg.CreateTarget(exe) + process = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + self.assertState(process.GetState(), lldb.eStateStopped) + + memory_list = process.GetMemoryRegions() + # Test that we don't duplicate regions, by duplicating regions + # at various indices. 
+ self.save_core_with_region(process, 0) + self.save_core_with_region(process, len(memory_list) - 1) + + finally: + self.assertTrue(self.dbg.DeleteTarget(target)) diff --git a/lldb/unittests/Process/Utility/CMakeLists.txt b/lldb/unittests/Process/Utility/CMakeLists.txt index 651f871621fdfcd..ec0ff95d073b920 100644 --- a/lldb/unittests/Process/Utility/CMakeLists.txt +++ b/lldb/unittests/Process/Utility/CMakeLists.txt @@ -18,6 +18,7 @@ add_lldb_unittest(ProcessUtilityTests LinuxProcMapsTest.cpp MemoryTagManagerAArch64MTETest.cpp RegisterContextTest.cpp + CoreFileMemoryRangesTest.cpp ${PLATFORM_SOURCES} LINK_LIBS diff --git a/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp b/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp new file mode 100644 index 000000000000000..6d514b11323864e --- /dev/null +++ b/lldb/unittests/Process/Utility/CoreFileMemoryRangesTest.cpp @@ -0,0 +1,205 @@ +//===-- CoreFileMemoryRangesTests.cpp +//---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include "lldb/Target/CoreFileMemoryRanges.h" +#include "lldb/lldb-types.h" + +using namespace lldb_private; + +TEST(CoreFileMemoryRangesTest, MapOverlappingRanges) { + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t start_addr = 0x1000; + const lldb::addr_t increment_addr = 0x1000; + const size_t iterations = 10; + for (size_t i = 0; i < iterations; i++) { + const lldb::addr_t start = start_addr + (i * increment_addr); + const lldb::addr_t end = start + increment_addr; + // Arbitrary value + const uint32_t permissions = 0x3; + llvm::AddressRange range(start, end); + const CoreFileMemoryRange core_range = {range, permissions}; + // The range data is Start, Size, While the range is start-end. + CoreFileMemoryRanges::Entry entry = {start, end - start, core_range}; + ranges.Append(entry); + } + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Success()); + ASSERT_THAT(1, ranges.GetSize()); + const auto range = ranges.GetEntryAtIndex(0); + ASSERT_TRUE(range); + ASSERT_THAT(start_addr, range->GetRangeBase()); + ASSERT_THAT(start_addr + (iterations * increment_addr), range->GetRangeEnd()); +} + +TEST(CoreFileMemoryRangesTest, RangesSplitByPermissions) { + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t start_addr = 0x1000; + const lldb::addr_t increment_addr = 0x1000; + const size_t iterations = 10; + for (size_t i = 0; i < iterations; i++) { + const lldb::addr_t start = start_addr + (i * increment_addr); + const lldb::addr_t end = start + increment_addr; + const uint32_t permissions = i; + llvm::AddressRange range(start, end); + const CoreFileMemoryRange core_range = {range, permissions}; + // The range data is Start, Size, While the range is start-end. 
+ CoreFileMemoryRanges::Entry entry = {start, end - start, core_range}; + ranges.Append(entry); + } + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Success()); + ASSERT_THAT(10, ranges.GetSize()); + const auto range = ranges.GetEntryAtIndex(0); + ASSERT_TRUE(range); + ASSERT_THAT(start_addr, range->GetRangeBase()); + ASSERT_THAT(start_addr + increment_addr, range->GetRangeEnd()); +} + +TEST(CoreFileMemoryRangesTest, MapPartialOverlappingRanges) { + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t start_addr = 0x1000; + const lldb::addr_t increment_addr = 0x1000; + const size_t iterations = 10; + for (size_t i = 0; i < iterations; i++) { + const lldb::addr_t start = start_addr + (i * increment_addr); + const lldb::addr_t end = start + increment_addr; + // Arbitrary value + const uint32_t permissions = 0x3; + llvm::AddressRange range(start, end); + const CoreFileMemoryRange core_range = {range, permissions}; + // The range data is Start, Size, While the range is start-end. + CoreFileMemoryRanges::Entry entry = {start, end - start, core_range}; + ranges.Append(entry); + } + + const lldb::addr_t unique_start = 0x7fff0000; + const lldb::addr_t unique_end = unique_start + increment_addr; + llvm::AddressRange range(unique_start, unique_end); + const uint32_t permissions = 0x3; + const CoreFileMemoryRange core_range = {range, permissions}; + // The range data is Start, Size, While the range is start-end. 
+ CoreFileMemoryRanges::Entry entry = {unique_start, unique_end - unique_start, + core_range}; + ranges.Append(entry); + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Success()); + ASSERT_THAT(2, ranges.GetSize()); + const auto merged_range = ranges.GetEntryAtIndex(0); + ASSERT_TRUE(merged_range); + ASSERT_THAT(start_addr, merged_range->GetRangeBase()); + ASSERT_THAT(start_addr + (iterations * increment_addr), + merged_range->GetRangeEnd()); + const auto unique_range = ranges.GetEntryAtIndex(1); + ASSERT_TRUE(unique_range); + ASSERT_THAT(unique_start, unique_range->GetRangeBase()); + ASSERT_THAT(unique_end, unique_range->GetRangeEnd()); +} + +TEST(CoreFileMemoryRangesTest, SuperiorAndInferiorRanges_SamePermissions) { + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t start_addr = 0x1000; + const lldb::addr_t increment_addr = 0x1000; + const lldb::addr_t superior_region_end = start_addr + increment_addr * 10; + llvm::AddressRange range(start_addr, superior_region_end); + const CoreFileMemoryRange core_range = {range, 0x3}; + CoreFileMemoryRanges::Entry entry = { + start_addr, superior_region_end - start_addr, core_range}; + ranges.Append(entry); + const lldb::addr_t inferior_region_end = start_addr + increment_addr; + llvm::AddressRange inferior_range(start_addr, inferior_region_end); + const CoreFileMemoryRange inferior_core_range = {inferior_range, 0x3}; + CoreFileMemoryRanges::Entry inferior_entry = { + start_addr, inferior_region_end - start_addr, inferior_core_range}; + ranges.Append(inferior_entry); + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Success()); + ASSERT_THAT(1, ranges.GetSize()); + const auto searched_range = ranges.GetEntryAtIndex(0); + ASSERT_TRUE(searched_range); + ASSERT_THAT(start_addr, searched_range->GetRangeBase()); + ASSERT_THAT(superior_region_end, searched_range->GetRangeEnd()); +} + +TEST(CoreFileMemoryRangesTest, SuperiorAndInferiorRanges_DifferentPermissions) { + 
lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t start_addr = 0x1000; + const lldb::addr_t increment_addr = 0x1000; + const lldb::addr_t superior_region_end = start_addr + increment_addr * 10; + llvm::AddressRange range(start_addr, superior_region_end); + const CoreFileMemoryRange core_range = {range, 0x3}; + CoreFileMemoryRanges::Entry entry = { + start_addr, superior_region_end - start_addr, core_range}; + ranges.Append(entry); + const lldb::addr_t inferior_region_end = start_addr + increment_addr; + llvm::AddressRange inferior_range(start_addr, inferior_region_end); + const CoreFileMemoryRange inferior_core_range = {inferior_range, 0x4}; + CoreFileMemoryRanges::Entry inferior_entry = { + start_addr, inferior_region_end - start_addr, inferior_core_range}; + ranges.Append(inferior_entry); + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Fail()); +} + +TEST(CoreFileMemoryRangesTest, NonIntersectingRangesSamePermissions) { + const int permissions = 0x7; + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t region_one_start = 0x1000; + const lldb::addr_t region_one_end = 0x2000; + llvm::AddressRange range_one(region_one_start, region_one_end); + const CoreFileMemoryRange core_range_one = {range_one, permissions}; + CoreFileMemoryRanges::Entry entry_one = { + region_one_start, region_one_end - region_one_start, core_range_one}; + ranges.Append(entry_one); + const lldb::addr_t region_two_start = 0xb000; + const lldb::addr_t region_two_end = 0xc000; + llvm::AddressRange range_two(region_two_start, region_two_end); + const CoreFileMemoryRange core_range_two = {range_two, permissions}; + CoreFileMemoryRanges::Entry entry_two = { + region_two_start, region_two_end - region_two_start, core_range_two}; + ranges.Append(entry_two); + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Success()); + ASSERT_THAT(2UL, ranges.GetSize()); + ASSERT_THAT(region_one_start, 
ranges.GetEntryAtIndex(0)->GetRangeBase()); + ASSERT_THAT(region_two_start, ranges.GetEntryAtIndex(1)->GetRangeBase()); +} + +TEST(CoreFileMemoryRangesTest, PartialOverlapping) { + const int permissions = 0x3; + lldb_private::CoreFileMemoryRanges ranges; + const lldb::addr_t start_addr = 0x1000; + const lldb::addr_t end_addr = 0x2000; + llvm::AddressRange range_one(start_addr, end_addr); + const CoreFileMemoryRange core_range_one = {range_one, permissions}; + CoreFileMemoryRanges::Entry entry_one = {start_addr, end_addr - start_addr, + core_range_one}; + llvm::AddressRange range_two(start_addr / 2, end_addr / 2); + const CoreFileMemoryRange core_range_two = {range_two, permissions}; + CoreFileMemoryRanges::Entry entry_two = { + start_addr / 2, end_addr / 2 - start_addr / 2, core_range_two}; + ranges.Append(entry_one); + ranges.Append(entry_two); + + Status error = ranges.FinalizeCoreFileSaveRanges(); + EXPECT_TRUE(error.Success()); + ASSERT_THAT(1, ranges.GetSize()); + const auto searched_range = ranges.GetEntryAtIndex(0); + ASSERT_TRUE(searched_range); + ASSERT_THAT(start_addr / 2, searched_range->GetRangeBase()); + ASSERT_THAT(end_addr, searched_range->GetRangeEnd()); +} From f02c72f9f996b2ef99886d345d87f6c62a3ee897 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 11 Sep 2024 17:36:02 +0000 Subject: [PATCH 13/94] [gn build] Port 96b7c64b8a87 --- llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn index fe8a3f590dd3c24..acf9e7aa701e7fd 100644 --- a/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Target/BUILD.gn @@ -35,6 +35,7 @@ static_library("Target") { sources = [ "ABI.cpp", "AssertFrameRecognizer.cpp", + "CoreFileMemoryRanges.cpp", "DynamicRegisterInfo.cpp", "ExecutionContext.cpp", "InstrumentationRuntime.cpp", From 
943182e3112756de8982babad6b5c8e74fdf8d02 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 10:49:33 -0700 Subject: [PATCH 14/94] [clang][TableGen] Change comment command emitter to const RecordKeeper (#108199) Change comment command emitter to const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../TableGen/ClangCommentCommandInfoEmitter.cpp | 14 +++++++------- clang/utils/TableGen/TableGenBackends.h | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp index aee7d38786a51c6..1a2503dcf660cfb 100644 --- a/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp +++ b/clang/utils/TableGen/ClangCommentCommandInfoEmitter.cpp @@ -20,16 +20,16 @@ using namespace llvm; -void clang::EmitClangCommentCommandInfo(RecordKeeper &Records, +void clang::EmitClangCommentCommandInfo(const RecordKeeper &Records, raw_ostream &OS) { emitSourceFileHeader("A list of commands useable in documentation comments", OS, Records); OS << "namespace {\n" "const CommandInfo Commands[] = {\n"; - std::vector Tags = Records.getAllDerivedDefinitions("Command"); + ArrayRef Tags = Records.getAllDerivedDefinitions("Command"); for (size_t i = 0, e = Tags.size(); i != e; ++i) { - Record &Tag = *Tags[i]; + const Record &Tag = *Tags[i]; OS << " { " << "\"" << Tag.getValueAsString("Name") << "\", " << "\"" << Tag.getValueAsString("EndCommandName") << "\", " << i << ", " @@ -62,7 +62,7 @@ void clang::EmitClangCommentCommandInfo(RecordKeeper &Records, std::vector Matches; for (size_t i = 0, e = Tags.size(); i != e; ++i) { - Record &Tag = *Tags[i]; + const Record &Tag = *Tags[i]; std::string Name = std::string(Tag.getValueAsString("Name")); std::string Return; raw_string_ostream(Return) << 
"return &Commands[" << i << "];"; @@ -112,7 +112,7 @@ static std::string MangleName(StringRef Str) { return Mangled; } -void clang::EmitClangCommentCommandList(RecordKeeper &Records, +void clang::EmitClangCommentCommandList(const RecordKeeper &Records, raw_ostream &OS) { emitSourceFileHeader("A list of commands useable in documentation comments", OS, Records); @@ -121,9 +121,9 @@ void clang::EmitClangCommentCommandList(RecordKeeper &Records, << "# define COMMENT_COMMAND(NAME)\n" << "#endif\n"; - std::vector Tags = Records.getAllDerivedDefinitions("Command"); + ArrayRef Tags = Records.getAllDerivedDefinitions("Command"); for (size_t i = 0, e = Tags.size(); i != e; ++i) { - Record &Tag = *Tags[i]; + const Record &Tag = *Tags[i]; std::string MangledName = MangleName(Tag.getValueAsString("Name")); OS << "COMMENT_COMMAND(" << MangledName << ")\n"; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 35cc04d6ef31f4b..5b1b0153e8cef92 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -91,9 +91,9 @@ void EmitClangCommentHTMLTagsProperties(llvm::RecordKeeper &Records, void EmitClangCommentHTMLNamedCharacterReferences(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangCommentCommandInfo(llvm::RecordKeeper &Records, +void EmitClangCommentCommandInfo(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangCommentCommandList(llvm::RecordKeeper &Records, +void EmitClangCommentCommandList(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangOpcodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From dca9f21724c2206973b78ddc3ab3327b85f1e3ec Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 10:50:00 -0700 Subject: [PATCH 15/94] [clang][TableGen] Change HTML Emitter to use const RecordKeeper (#108201) Change HTMLNamedCharacterReferenceEmitter to use const RecordKeeper. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- ...CommentHTMLNamedCharacterReferenceEmitter.cpp | 16 ++++++---------- clang/utils/TableGen/TableGenBackends.h | 4 ++-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp index f1cd9af0519d1b8..bd75b3f6b652a16 100644 --- a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp +++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp @@ -46,21 +46,17 @@ static bool translateCodePointToUTF8(unsigned CodePoint, return true; } -void clang::EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records, - raw_ostream &OS) { - std::vector Tags = Records.getAllDerivedDefinitions("NCR"); +void clang::EmitClangCommentHTMLNamedCharacterReferences( + const RecordKeeper &Records, raw_ostream &OS) { std::vector NameToUTF8; SmallString<32> CLiteral; - for (std::vector::iterator I = Tags.begin(), E = Tags.end(); - I != E; ++I) { - Record &Tag = **I; - std::string Spelling = std::string(Tag.getValueAsString("Spelling")); - uint64_t CodePoint = Tag.getValueAsInt("CodePoint"); + for (const Record *Tag : Records.getAllDerivedDefinitions("NCR")) { + std::string Spelling = std::string(Tag->getValueAsString("Spelling")); + uint64_t CodePoint = Tag->getValueAsInt("CodePoint"); CLiteral.clear(); CLiteral.append("return "); if (!translateCodePointToUTF8(CodePoint, CLiteral)) { - SrcMgr.PrintMessage(Tag.getLoc().front(), - SourceMgr::DK_Error, + SrcMgr.PrintMessage(Tag->getLoc().front(), SourceMgr::DK_Error, Twine("invalid code point")); continue; } diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 5b1b0153e8cef92..e8287d5bee08b39 100644 
--- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -88,8 +88,8 @@ void EmitClangCommentHTMLTags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangCommentHTMLTagsProperties(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangCommentHTMLNamedCharacterReferences(llvm::RecordKeeper &Records, - llvm::raw_ostream &OS); +void EmitClangCommentHTMLNamedCharacterReferences( + const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangCommentCommandInfo(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From e382b0c9972b4a3cf6c4bc21be50e12b76a488bd Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 10:50:26 -0700 Subject: [PATCH 16/94] [clang][TableGen] Change HTML Tags emitter to use const RecordKeeper (#108202) Change HTML Tags emitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../utils/TableGen/ClangCommentHTMLTagsEmitter.cpp | 13 +++++++------ clang/utils/TableGen/TableGenBackends.h | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp index 3dc1098753e0bff..a457315bc62c5c7 100644 --- a/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp +++ b/clang/utils/TableGen/ClangCommentHTMLTagsEmitter.cpp @@ -19,10 +19,11 @@ using namespace llvm; -void clang::EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS) { - std::vector Tags = Records.getAllDerivedDefinitions("Tag"); +void clang::EmitClangCommentHTMLTags(const RecordKeeper &Records, + raw_ostream &OS) { + ArrayRef Tags = Records.getAllDerivedDefinitions("Tag"); std::vector Matches; - for (Record *Tag : Tags) { + for (const Record *Tag : Tags) { 
Matches.emplace_back(std::string(Tag->getValueAsString("Spelling")), "return true;"); } @@ -35,12 +36,12 @@ void clang::EmitClangCommentHTMLTags(RecordKeeper &Records, raw_ostream &OS) { << "}\n\n"; } -void clang::EmitClangCommentHTMLTagsProperties(RecordKeeper &Records, +void clang::EmitClangCommentHTMLTagsProperties(const RecordKeeper &Records, raw_ostream &OS) { - std::vector Tags = Records.getAllDerivedDefinitions("Tag"); + ArrayRef Tags = Records.getAllDerivedDefinitions("Tag"); std::vector MatchesEndTagOptional; std::vector MatchesEndTagForbidden; - for (Record *Tag : Tags) { + for (const Record *Tag : Tags) { std::string Spelling = std::string(Tag->getValueAsString("Spelling")); StringMatcher::StringPair Match(Spelling, "return true;"); if (Tag->getValueAsBit("EndTagOptional")) diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index e8287d5bee08b39..503684118278bf0 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -84,9 +84,9 @@ void EmitClangDiagsIndexName(llvm::RecordKeeper &Records, void EmitClangSACheckers(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangCommentHTMLTags(llvm::RecordKeeper &Records, +void EmitClangCommentHTMLTags(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangCommentHTMLTagsProperties(llvm::RecordKeeper &Records, +void EmitClangCommentHTMLTagsProperties(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangCommentHTMLNamedCharacterReferences( const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From 07dc9b838efc32647aeafbf7325e3d710412a0bf Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 10:50:57 -0700 Subject: [PATCH 17/94] [clang][TableGen] Change DataCollector to use const RecordKeeper (#108203) Change DataCollectors Emitter to use const RecordKeeper. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- clang/utils/TableGen/ClangDataCollectorsEmitter.cpp | 2 +- clang/utils/TableGen/TableGenBackends.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp b/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp index 45082935c1f7943..dae6710d752358d 100644 --- a/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp +++ b/clang/utils/TableGen/ClangDataCollectorsEmitter.cpp @@ -4,7 +4,7 @@ using namespace llvm; -void clang::EmitClangDataCollectors(RecordKeeper &RK, raw_ostream &OS) { +void clang::EmitClangDataCollectors(const RecordKeeper &RK, raw_ostream &OS) { const auto &Defs = RK.getClasses(); for (const auto &Entry : Defs) { Record &R = *Entry.second; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 503684118278bf0..699df49bc804661 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -152,7 +152,7 @@ void EmitClangOpenCLBuiltinHeader(llvm::RecordKeeper &Records, void EmitClangOpenCLBuiltinTests(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangDataCollectors(llvm::RecordKeeper &Records, +void EmitClangDataCollectors(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitTestPragmaAttributeSupportedAttributes(llvm::RecordKeeper &Records, From 0767027f363f81157657549a5db9ff9daf9198a5 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 10:51:23 -0700 Subject: [PATCH 18/94] [clang][TableGen] Change Opcode Emitter to use const RecordKeeper (#108211) Change Opcode Emitter to use const RecordKeeper. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- clang/utils/TableGen/ClangOpcodesEmitter.cpp | 6 +++--- clang/utils/TableGen/TableGenBackends.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/utils/TableGen/ClangOpcodesEmitter.cpp b/clang/utils/TableGen/ClangOpcodesEmitter.cpp index 120e1e2efa32b4a..7e426d59359a877 100644 --- a/clang/utils/TableGen/ClangOpcodesEmitter.cpp +++ b/clang/utils/TableGen/ClangOpcodesEmitter.cpp @@ -20,11 +20,11 @@ using namespace llvm; namespace { class ClangOpcodesEmitter { - RecordKeeper &Records; + const RecordKeeper &Records; unsigned NumTypes; public: - ClangOpcodesEmitter(RecordKeeper &R) + ClangOpcodesEmitter(const RecordKeeper &R) : Records(R), NumTypes(Records.getAllDerivedDefinitions("Type").size()) {} void run(raw_ostream &OS); @@ -404,6 +404,6 @@ void ClangOpcodesEmitter::PrintTypes(raw_ostream &OS, OS << ">"; } -void clang::EmitClangOpcodes(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitClangOpcodes(const RecordKeeper &Records, raw_ostream &OS) { ClangOpcodesEmitter(Records).run(OS); } diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 699df49bc804661..a07583691253603 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -95,7 +95,7 @@ void EmitClangCommentCommandInfo(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangCommentCommandList(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangOpcodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitClangOpcodes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangSyntaxNodeList(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From 7c53a7aae7947bd3a400d6b5713fe31dcfb46648 Mon Sep 17 00:00:00 2001 From: 
Rahul Joshi Date: Wed, 11 Sep 2024 10:51:48 -0700 Subject: [PATCH 19/94] [clang][TableGen] Change OpenCL emitter to use const RecordKeeper (#108213) Change OpenCL builtins emitter to use const RecordKeeper This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../TableGen/ClangOpenCLBuiltinEmitter.cpp | 46 ++++++++++--------- clang/utils/TableGen/TableGenBackends.h | 6 +-- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp index 74c3a856ab6937c..d68dcc472a7bdbd 100644 --- a/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp +++ b/clang/utils/TableGen/ClangOpenCLBuiltinEmitter.cpp @@ -87,7 +87,7 @@ struct BuiltinTableEntries { // class BuiltinNameEmitter { public: - BuiltinNameEmitter(RecordKeeper &Records, raw_ostream &OS) + BuiltinNameEmitter(const RecordKeeper &Records, raw_ostream &OS) : Records(Records), OS(OS) {} // Entrypoint to generate the functions and structures for checking @@ -100,7 +100,7 @@ class BuiltinNameEmitter { // Contains OpenCL builtin functions and related information, stored as // Record instances. They are coming from the associated TableGen file. - RecordKeeper &Records; + const RecordKeeper &Records; // The output file. raw_ostream &OS; @@ -113,7 +113,7 @@ class BuiltinNameEmitter { // \param Output (out) String containing the enums to emit in the output file. // \param List (out) List containing the extracted Types, except the Types in // TypesSeen. - void ExtractEnumTypes(std::vector &Types, + void ExtractEnumTypes(ArrayRef Types, StringMap &TypesSeen, std::string &Output, std::vector &List); @@ -237,7 +237,7 @@ class BuiltinNameEmitter { /// Base class for emitting a file (e.g. 
header or test) from OpenCLBuiltins.td class OpenCLBuiltinFileEmitterBase { public: - OpenCLBuiltinFileEmitterBase(RecordKeeper &Records, raw_ostream &OS) + OpenCLBuiltinFileEmitterBase(const RecordKeeper &Records, raw_ostream &OS) : Records(Records), OS(OS) {} virtual ~OpenCLBuiltinFileEmitterBase() = default; @@ -305,7 +305,7 @@ class OpenCLBuiltinFileEmitterBase { // Contains OpenCL builtin functions and related information, stored as // Record instances. They are coming from the associated TableGen file. - RecordKeeper &Records; + const RecordKeeper &Records; // The output file. raw_ostream &OS; @@ -316,7 +316,7 @@ class OpenCLBuiltinFileEmitterBase { // builtin function described in the .td input. class OpenCLBuiltinTestEmitter : public OpenCLBuiltinFileEmitterBase { public: - OpenCLBuiltinTestEmitter(RecordKeeper &Records, raw_ostream &OS) + OpenCLBuiltinTestEmitter(const RecordKeeper &Records, raw_ostream &OS) : OpenCLBuiltinFileEmitterBase(Records, OS) {} // Entrypoint to generate the functions for testing all OpenCL builtin @@ -329,7 +329,7 @@ class OpenCLBuiltinTestEmitter : public OpenCLBuiltinFileEmitterBase { // prototype for each builtin function described in the .td input. class OpenCLBuiltinHeaderEmitter : public OpenCLBuiltinFileEmitterBase { public: - OpenCLBuiltinHeaderEmitter(RecordKeeper &Records, raw_ostream &OS) + OpenCLBuiltinHeaderEmitter(const RecordKeeper &Records, raw_ostream &OS) : OpenCLBuiltinFileEmitterBase(Records, OS) {} // Entrypoint to generate the header. 
@@ -362,7 +362,7 @@ void BuiltinNameEmitter::Emit() { EmitQualTypeFinder(); } -void BuiltinNameEmitter::ExtractEnumTypes(std::vector &Types, +void BuiltinNameEmitter::ExtractEnumTypes(ArrayRef Types, StringMap &TypesSeen, std::string &Output, std::vector &List) { @@ -392,11 +392,11 @@ void BuiltinNameEmitter::EmitDeclarations() { // Extract generic types and non-generic types separately, to keep // gentypes at the end of the enum which simplifies the special handling // for gentypes in SemaLookup. - std::vector GenTypes = + ArrayRef GenTypes = Records.getAllDerivedDefinitions("GenericType"); ExtractEnumTypes(GenTypes, TypesSeen, GenTypeEnums, GenTypeList); - std::vector Types = Records.getAllDerivedDefinitions("Type"); + ArrayRef Types = Records.getAllDerivedDefinitions("Type"); ExtractEnumTypes(Types, TypesSeen, TypeEnums, TypeList); OS << TypeEnums; @@ -499,7 +499,7 @@ static void VerifySignature(const std::vector &Signature, void BuiltinNameEmitter::GetOverloads() { // Populate the TypeMap. - std::vector Types = Records.getAllDerivedDefinitions("Type"); + ArrayRef Types = Records.getAllDerivedDefinitions("Type"); unsigned I = 0; for (const auto &T : Types) { TypeMap.insert(std::make_pair(T, I++)); @@ -507,7 +507,8 @@ void BuiltinNameEmitter::GetOverloads() { // Populate the SignaturesList and the FctOverloadMap. 
unsigned CumulativeSignIndex = 0; - std::vector Builtins = Records.getAllDerivedDefinitions("Builtin"); + ArrayRef Builtins = + Records.getAllDerivedDefinitions("Builtin"); for (const auto *B : Builtins) { StringRef BName = B->getValueAsString("Name"); FctOverloadMap.try_emplace(BName); @@ -535,7 +536,7 @@ void BuiltinNameEmitter::GetOverloads() { void BuiltinNameEmitter::EmitExtensionTable() { OS << "static const char *FunctionExtensionTable[] = {\n"; unsigned Index = 0; - std::vector FuncExtensions = + ArrayRef FuncExtensions = Records.getAllDerivedDefinitions("FunctionExtension"); for (const auto &FE : FuncExtensions) { @@ -804,11 +805,11 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty, OS << "\n switch (Ty.ID) {\n"; // Switch cases for image types (Image2d, Image3d, ...) - std::vector ImageTypes = + ArrayRef ImageTypes = Records.getAllDerivedDefinitions("ImageType"); // Map an image type name to its 3 access-qualified types (RO, WO, RW). - StringMap> ImageTypesMap; + StringMap> ImageTypesMap; for (auto *IT : ImageTypes) ImageTypesMap[IT->getValueAsString("Name")].push_back(IT); @@ -890,7 +891,7 @@ static void OCL2Qual(Sema &S, const OpenCLTypeStruct &Ty, // Switch cases for non generic, non image types (int, int4, float, ...). // Only insert the plain scalar type; vector information and type qualifiers // are added in step 2. - std::vector Types = Records.getAllDerivedDefinitions("Type"); + ArrayRef Types = Records.getAllDerivedDefinitions("Type"); StringMap TypesSeen; for (const auto *T : Types) { @@ -1211,7 +1212,8 @@ void OpenCLBuiltinTestEmitter::emit() { unsigned TestID = 0; // Iterate over all builtins. 
- std::vector Builtins = Records.getAllDerivedDefinitions("Builtin"); + ArrayRef Builtins = + Records.getAllDerivedDefinitions("Builtin"); for (const auto *B : Builtins) { StringRef Name = B->getValueAsString("Name"); @@ -1274,7 +1276,8 @@ void OpenCLBuiltinHeaderEmitter::emit() { )"; // Iterate over all builtins; sort to follow order of definition in .td file. - std::vector Builtins = Records.getAllDerivedDefinitions("Builtin"); + std::vector Builtins = + Records.getAllDerivedDefinitions("Builtin"); llvm::sort(Builtins, LessRecord()); for (const auto *B : Builtins) { @@ -1319,18 +1322,19 @@ void OpenCLBuiltinHeaderEmitter::emit() { "#pragma OPENCL EXTENSION all : disable\n"; } -void clang::EmitClangOpenCLBuiltins(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitClangOpenCLBuiltins(const RecordKeeper &Records, + raw_ostream &OS) { BuiltinNameEmitter NameChecker(Records, OS); NameChecker.Emit(); } -void clang::EmitClangOpenCLBuiltinHeader(RecordKeeper &Records, +void clang::EmitClangOpenCLBuiltinHeader(const RecordKeeper &Records, raw_ostream &OS) { OpenCLBuiltinHeaderEmitter HeaderFileGenerator(Records, OS); HeaderFileGenerator.emit(); } -void clang::EmitClangOpenCLBuiltinTests(RecordKeeper &Records, +void clang::EmitClangOpenCLBuiltinTests(const RecordKeeper &Records, raw_ostream &OS) { OpenCLBuiltinTestEmitter TestFileGenerator(Records, OS); TestFileGenerator.emit(); diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index a07583691253603..d190950c939142d 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -145,11 +145,11 @@ void EmitClangAttrDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangDiagDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangOptDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangOpenCLBuiltins(llvm::RecordKeeper &Records, +void EmitClangOpenCLBuiltins(const llvm::RecordKeeper 
&Records, llvm::raw_ostream &OS); -void EmitClangOpenCLBuiltinHeader(llvm::RecordKeeper &Records, +void EmitClangOpenCLBuiltinHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangOpenCLBuiltinTests(llvm::RecordKeeper &Records, +void EmitClangOpenCLBuiltinTests(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangDataCollectors(const llvm::RecordKeeper &Records, From 8625eb0b87c86d3ef42a365d7593eed664b379e8 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 10:52:15 -0700 Subject: [PATCH 20/94] [clang][TableGen] Change OptionDoc Emitter to use const RecordKeeper (#108216) Change OptionDoc Emitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../utils/TableGen/ClangOptionDocEmitter.cpp | 46 +++++++++---------- clang/utils/TableGen/TableGenBackends.h | 2 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/clang/utils/TableGen/ClangOptionDocEmitter.cpp b/clang/utils/TableGen/ClangOptionDocEmitter.cpp index 86835611b842180..8c32f0218e761b5 100644 --- a/clang/utils/TableGen/ClangOptionDocEmitter.cpp +++ b/clang/utils/TableGen/ClangOptionDocEmitter.cpp @@ -24,8 +24,8 @@ using namespace llvm; namespace { struct DocumentedOption { - Record *Option; - std::vector Aliases; + const Record *Option; + std::vector Aliases; }; struct DocumentedGroup; struct Documentation { @@ -37,7 +37,7 @@ struct Documentation { } }; struct DocumentedGroup : Documentation { - Record *Group; + const Record *Group; }; static bool hasFlag(const Record *Option, StringRef OptionFlag, @@ -63,25 +63,25 @@ static bool isOptionVisible(const Record *Option, const Record *DocInfo) { } // Reorganize the records into a suitable form for emitting documentation. 
-Documentation extractDocumentation(RecordKeeper &Records, +Documentation extractDocumentation(const RecordKeeper &Records, const Record *DocInfo) { Documentation Result; // Build the tree of groups. The root in the tree is the fake option group // (Record*)nullptr, which contains all top-level groups and options. - std::map > OptionsInGroup; - std::map > GroupsInGroup; - std::map > Aliases; + std::map> OptionsInGroup; + std::map> GroupsInGroup; + std::map> Aliases; - std::map OptionsByName; - for (Record *R : Records.getAllDerivedDefinitions("Option")) + std::map OptionsByName; + for (const Record *R : Records.getAllDerivedDefinitions("Option")) OptionsByName[std::string(R->getValueAsString("Name"))] = R; - auto Flatten = [](Record *R) { + auto Flatten = [](const Record *R) { return R->getValue("DocFlatten") && R->getValueAsBit("DocFlatten"); }; - auto SkipFlattened = [&](Record *R) -> Record* { + auto SkipFlattened = [&](const Record *R) -> const Record * { while (R && Flatten(R)) { auto *G = dyn_cast(R->getValueInit("Group")); if (!G) @@ -91,17 +91,17 @@ Documentation extractDocumentation(RecordKeeper &Records, return R; }; - for (Record *R : Records.getAllDerivedDefinitions("OptionGroup")) { + for (const Record *R : Records.getAllDerivedDefinitions("OptionGroup")) { if (Flatten(R)) continue; - Record *Group = nullptr; + const Record *Group = nullptr; if (auto *G = dyn_cast(R->getValueInit("Group"))) Group = SkipFlattened(G->getDef()); GroupsInGroup[Group].push_back(R); } - for (Record *R : Records.getAllDerivedDefinitions("Option")) { + for (const Record *R : Records.getAllDerivedDefinitions("Option")) { if (auto *A = dyn_cast(R->getValueInit("Alias"))) { Aliases[A->getDef()].push_back(R); continue; @@ -120,33 +120,33 @@ Documentation extractDocumentation(RecordKeeper &Records, } } - Record *Group = nullptr; + const Record *Group = nullptr; if (auto *G = dyn_cast(R->getValueInit("Group"))) Group = SkipFlattened(G->getDef()); OptionsInGroup[Group].push_back(R); 
} - auto CompareByName = [](Record *A, Record *B) { + auto CompareByName = [](const Record *A, const Record *B) { return A->getValueAsString("Name") < B->getValueAsString("Name"); }; - auto CompareByLocation = [](Record *A, Record *B) { + auto CompareByLocation = [](const Record *A, const Record *B) { return A->getLoc()[0].getPointer() < B->getLoc()[0].getPointer(); }; - auto DocumentationForOption = [&](Record *R) -> DocumentedOption { + auto DocumentationForOption = [&](const Record *R) -> DocumentedOption { auto &A = Aliases[R]; llvm::sort(A, CompareByName); return {R, std::move(A)}; }; - std::function DocumentationForGroup = - [&](Record *R) -> Documentation { + std::function DocumentationForGroup = + [&](const Record *R) -> Documentation { Documentation D; auto &Groups = GroupsInGroup[R]; llvm::sort(Groups, CompareByLocation); - for (Record *G : Groups) { + for (const Record *G : Groups) { D.Groups.emplace_back(); D.Groups.back().Group = G; Documentation &Base = D.Groups.back(); @@ -157,7 +157,7 @@ Documentation extractDocumentation(RecordKeeper &Records, auto &Options = OptionsInGroup[R]; llvm::sort(Options, CompareByName); - for (Record *O : Options) + for (const Record *O : Options) if (isOptionVisible(O, DocInfo)) D.Options.push_back(DocumentationForOption(O)); @@ -444,7 +444,7 @@ void emitDocumentation(int Depth, const Documentation &Doc, } // namespace -void clang::EmitClangOptDocs(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitClangOptDocs(const RecordKeeper &Records, raw_ostream &OS) { const Record *DocInfo = Records.getDef("GlobalDocumentation"); if (!DocInfo) { PrintFatalError("The GlobalDocumentation top-level definition is missing, " diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index d190950c939142d..fe55ef2f423afeb 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -143,7 +143,7 @@ void EmitCdeBuiltinAliases(llvm::RecordKeeper &Records, 
llvm::raw_ostream &OS); void EmitClangAttrDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangDiagDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangOptDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitClangOptDocs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangOpenCLBuiltins(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From 1896ee38898a73ea9c2894e848884c8999884ab1 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Wed, 11 Sep 2024 14:13:31 -0400 Subject: [PATCH 21/94] [libc] Fix undefined behavior for nan functions. (#106468) Currently the nan* functions use nullptr dereferencing to crash with SIGSEGV if the input is nullptr. Both `nan(nullptr)` and `nullptr` dereferencing are undefined behaviors according to the C standard. Employing `nullptr` dereference in the `nan` function implementation is ok if users only linked against the pre-built library, but it might be completely removed by the compilers' optimizations if it is built from source together with the users' code. See for instance: https://godbolt.org/z/fd8KcM9bx This PR uses volatile load to prevent the undefined behavior if libc is built without sanitizers, and leave the current undefined behavior if libc is built with sanitizers, so that the undefined behavior can be caught for users' codes. 
--- .../modules/LLVMLibCCompileOptionRules.cmake | 4 +++ libc/config/config.json | 6 ++++ libc/docs/configure.rst | 2 ++ libc/src/__support/CMakeLists.txt | 3 ++ libc/src/__support/macros/CMakeLists.txt | 10 ++++++ libc/src/__support/macros/null_check.h | 33 +++++++++++++++++++ libc/src/__support/macros/sanitizer.h | 21 ++++++++++-- libc/src/__support/str_to_float.h | 4 +++ libc/test/src/compiler/CMakeLists.txt | 1 + .../src/compiler/stack_chk_guard_test.cpp | 6 ++-- libc/test/src/math/smoke/CMakeLists.txt | 14 +++++--- libc/test/src/math/smoke/nan_test.cpp | 7 ++-- libc/test/src/math/smoke/nanf128_test.cpp | 7 ++-- libc/test/src/math/smoke/nanf16_test.cpp | 7 ++-- libc/test/src/math/smoke/nanf_test.cpp | 7 ++-- libc/test/src/math/smoke/nanl_test.cpp | 7 ++-- .../llvm-project-overlay/libc/BUILD.bazel | 11 +++++++ 17 files changed, 124 insertions(+), 26 deletions(-) create mode 100644 libc/src/__support/macros/null_check.h diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 45dfe3e63302bfa..8643c9bb48ad41b 100644 --- a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -75,6 +75,10 @@ function(_get_compile_options_from_config output_var) list(APPEND config_options "-DLIBC_TYPES_TIME_T_IS_32_BIT") endif() + if(LIBC_ADD_NULL_CHECKS) + list(APPEND config_options "-DLIBC_ADD_NULL_CHECKS") + endif() + set(${output_var} ${config_options} PARENT_SCOPE) endfunction(_get_compile_options_from_config) diff --git a/libc/config/config.json b/libc/config/config.json index 2e72c0a3fd1d690..7dfbb560a36db37 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -94,5 +94,11 @@ "value": false, "doc": "Force the size of time_t to 64 bits, even on platforms where compatibility considerations would otherwise make it 32-bit." 
} + }, + "general": { + "LIBC_ADD_NULL_CHECKS": { + "value": true, + "doc": "Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior." + } } } diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 54ca5d55d7b2435..86875d4c975c018 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -30,6 +30,8 @@ to learn about the defaults for your platform and target. - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience. * **"errno" options** - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM. +* **"general" options** + - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** - ``LIBC_CONF_MATH_OPTIMIZATIONS``: Configures optimizations for math functions. Values accepted are LIBC_MATH_SKIP_ACCURATE_PASS, LIBC_MATH_SMALL_TABLES, LIBC_MATH_NO_ERRNO, LIBC_MATH_NO_EXCEPT, and LIBC_MATH_FAST. 
* **"printf" options** diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index 9bd1e29081a801f..0302ad64f8b5df6 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -192,6 +192,9 @@ add_header_library( libc.src.__support.CPP.optional libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.config + libc.src.__support.macros.null_check + libc.src.__support.macros.optimization libc.src.errno.errno ) diff --git a/libc/src/__support/macros/CMakeLists.txt b/libc/src/__support/macros/CMakeLists.txt index bcd47c3651bf5d9..99d4f640f283a47 100644 --- a/libc/src/__support/macros/CMakeLists.txt +++ b/libc/src/__support/macros/CMakeLists.txt @@ -27,3 +27,13 @@ add_header_library( DEPENDS libc.src.__support.macros.properties.compiler ) + +add_header_library( + null_check + HDRS + null_check.h + DEPENDS + .config + .optimization + .sanitizer +) diff --git a/libc/src/__support/macros/null_check.h b/libc/src/__support/macros/null_check.h new file mode 100644 index 000000000000000..400f7d809db4fa0 --- /dev/null +++ b/libc/src/__support/macros/null_check.h @@ -0,0 +1,33 @@ +//===-- Safe nullptr check --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H +#define LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/sanitizer.h" + +#if defined(LIBC_ADD_NULL_CHECKS) && !defined(LIBC_HAS_SANITIZER) +// Use volatile to prevent undefined behavior of dereferencing nullptr. +// Intentionally crashing with SIGSEGV. 
+#define LIBC_CRASH_ON_NULLPTR(PTR) \ + do { \ + if (LIBC_UNLIKELY(PTR == nullptr)) { \ + volatile auto *crashing = PTR; \ + [[maybe_unused]] volatile auto crash = *crashing; \ + __builtin_trap(); \ + } \ + } while (0) +#else +#define LIBC_CRASH_ON_NULLPTR(ptr) \ + do { \ + } while (0) +#endif + +#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_NULL_CHECK_H diff --git a/libc/src/__support/macros/sanitizer.h b/libc/src/__support/macros/sanitizer.h index c4f8b5bce39755f..c20412e0f8b69f1 100644 --- a/libc/src/__support/macros/sanitizer.h +++ b/libc/src/__support/macros/sanitizer.h @@ -15,7 +15,25 @@ // Functions to unpoison memory //----------------------------------------------------------------------------- +#if LIBC_HAS_FEATURE(address_sanitizer) || defined(__SANITIZE_ADDRESS__) +#define LIBC_HAS_ADDRESS_SANITIZER +#endif + #if LIBC_HAS_FEATURE(memory_sanitizer) +#define LIBC_HAS_MEMORY_SANITIZER +#endif + +#if LIBC_HAS_FEATURE(undefined_behavior_sanitizer) +#define LIBC_HAS_UNDEFINED_BEHAVIOR_SANITIZER +#endif + +#if defined(LIBC_HAS_ADDRESS_SANITIZER) || \ + defined(LIBC_HAS_MEMORY_SANITIZER) || \ + defined(LIBC_HAS_UNDEFINED_BEHAVIOR_SANITIZER) +#define LIBC_HAS_SANITIZER +#endif + +#ifdef LIBC_HAS_MEMORY_SANITIZER // Only perform MSAN unpoison in non-constexpr context. 
#include #define MSAN_UNPOISON(addr, size) \ @@ -27,8 +45,7 @@ #define MSAN_UNPOISON(ptr, size) #endif -#if LIBC_HAS_FEATURE(address_sanitizer) -#define LIBC_HAVE_ADDRESS_SANITIZER +#ifdef LIBC_HAS_ADDRESS_SANITIZER #include #define ASAN_POISON_MEMORY_REGION(addr, size) \ __asan_poison_memory_region((addr), (size)) diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index ffd6ebf27c77264..a452b3a55fdeb4c 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -20,6 +20,8 @@ #include "src/__support/detailed_powers_of_ten.h" #include "src/__support/high_precision_decimal.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/null_check.h" +#include "src/__support/macros/optimization.h" #include "src/__support/str_to_integer.h" #include "src/__support/str_to_num_result.h" #include "src/__support/uint128.h" @@ -1208,6 +1210,8 @@ template LIBC_INLINE StrToNumResult strtonan(const char *arg) { using FPBits = typename fputil::FPBits; using StorageType = typename FPBits::StorageType; + LIBC_CRASH_ON_NULLPTR(arg); + FPBits result; int error = 0; StorageType nan_mantissa = 0; diff --git a/libc/test/src/compiler/CMakeLists.txt b/libc/test/src/compiler/CMakeLists.txt index 65a9acceb6f7f10..a45fa8c55e51285 100644 --- a/libc/test/src/compiler/CMakeLists.txt +++ b/libc/test/src/compiler/CMakeLists.txt @@ -7,6 +7,7 @@ add_libc_unittest( SRCS stack_chk_guard_test.cpp DEPENDS + libc.hdr.signal_macros libc.src.__support.macros.sanitizer libc.src.compiler.__stack_chk_fail libc.src.string.memset diff --git a/libc/test/src/compiler/stack_chk_guard_test.cpp b/libc/test/src/compiler/stack_chk_guard_test.cpp index 6b71e155fa3e4d0..4ec8398c9fc95dc 100644 --- a/libc/test/src/compiler/stack_chk_guard_test.cpp +++ b/libc/test/src/compiler/stack_chk_guard_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include 
"include/llvm-libc-macros/signal-macros.h" +#include "hdr/signal_macros.h" #include "src/__support/macros/sanitizer.h" #include "src/compiler/__stack_chk_fail.h" #include "src/string/memset.h" @@ -18,7 +18,7 @@ TEST(LlvmLibcStackChkFail, Death) { // Disable the test when asan is enabled so that it doesn't immediately fail // after the memset, but before the stack canary is re-checked. -#ifndef LIBC_HAVE_ADDRESS_SANITIZER +#ifndef LIBC_HAS_ADDRESS_SANITIZER TEST(LlvmLibcStackChkFail, Smash) { EXPECT_DEATH( [] { @@ -27,4 +27,4 @@ TEST(LlvmLibcStackChkFail, Smash) { }, WITH_SIGNAL(SIGABRT)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 7271e933b9311d4..e943d98256a97b2 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -2895,9 +2895,10 @@ add_fp_unittest( SRCS nanf_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nanf libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.sanitizer # FIXME: The nan tests currently have death tests, which aren't supported for # hermetic tests. UNIT_TEST_ONLY @@ -2910,9 +2911,10 @@ add_fp_unittest( SRCS nan_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nan libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.sanitizer # FIXME: The nan tests currently have death tests, which aren't supported for # hermetic tests. UNIT_TEST_ONLY @@ -2925,9 +2927,10 @@ add_fp_unittest( SRCS nanl_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nanl libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.sanitizer # FIXME: The nan tests currently have death tests, which aren't supported for # hermetic tests. 
UNIT_TEST_ONLY @@ -2940,7 +2943,7 @@ add_fp_unittest( SRCS nanf16_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nanf16 libc.src.__support.FPUtil.fp_bits libc.src.__support.macros.sanitizer @@ -2956,9 +2959,10 @@ add_fp_unittest( SRCS nanf128_test.cpp DEPENDS - libc.include.signal + libc.hdr.signal_macros libc.src.math.nanf128 libc.src.__support.FPUtil.fp_bits + libc.src.__support.macros.sanitizer # FIXME: The nan tests currently have death tests, which aren't supported for # hermetic tests. UNIT_TEST_ONLY diff --git a/libc/test/src/math/smoke/nan_test.cpp b/libc/test/src/math/smoke/nan_test.cpp index 68c844181a19469..46b9e9aa9563abf 100644 --- a/libc/test/src/math/smoke/nan_test.cpp +++ b/libc/test/src/math/smoke/nan_test.cpp @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/sanitizer.h" #include "src/math/nan.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include class LlvmLibcNanTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { public: @@ -43,8 +44,8 @@ TEST_F(LlvmLibcNanTest, RandomString) { run_test("123 ", 0x7ff8000000000000); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanTest, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nan(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf128_test.cpp b/libc/test/src/math/smoke/nanf128_test.cpp index 015cc31e4be237e..25dd2ef1d5b1ca7 100644 --- a/libc/test/src/math/smoke/nanf128_test.cpp +++ b/libc/test/src/math/smoke/nanf128_test.cpp @@ -6,7 +6,9 @@ // 
//===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/sanitizer.h" #include "src/__support/uint128.h" #include "src/math/nanf128.h" #include "test/UnitTest/FEnvSafeTest.h" @@ -53,9 +55,8 @@ TEST_F(LlvmLibcNanf128Test, RandomString) { QUIET_NAN); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) -#include +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanf128Test, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nanf128(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf16_test.cpp b/libc/test/src/math/smoke/nanf16_test.cpp index 81b844bf6bb59c4..ec640a3b9eef92b 100644 --- a/libc/test/src/math/smoke/nanf16_test.cpp +++ b/libc/test/src/math/smoke/nanf16_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/macros/sanitizer.h" #include "src/math/nanf16.h" @@ -13,8 +14,6 @@ #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include - class LlvmLibcNanf16Test : public LIBC_NAMESPACE::testing::FEnvSafeTest { public: using StorageType = LIBC_NAMESPACE::fputil::FPBits::StorageType; @@ -44,8 +43,8 @@ TEST_F(LlvmLibcNanf16Test, RandomString) { run_test("123 ", 0x7e00); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanf16Test, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nanf16(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanf_test.cpp 
b/libc/test/src/math/smoke/nanf_test.cpp index ff5823685225cef..dd3124ee9c51124 100644 --- a/libc/test/src/math/smoke/nanf_test.cpp +++ b/libc/test/src/math/smoke/nanf_test.cpp @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/sanitizer.h" #include "src/math/nanf.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include class LlvmLibcNanfTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { public: @@ -42,8 +43,8 @@ TEST_F(LlvmLibcNanfTest, RandomString) { run_test("123 ", 0x7fc00000); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) TEST_F(LlvmLibcNanfTest, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nanf(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/libc/test/src/math/smoke/nanl_test.cpp b/libc/test/src/math/smoke/nanl_test.cpp index de9af05100c10a6..ef3f9c15dafd9f6 100644 --- a/libc/test/src/math/smoke/nanl_test.cpp +++ b/libc/test/src/math/smoke/nanl_test.cpp @@ -6,12 +6,13 @@ // //===----------------------------------------------------------------------===// +#include "hdr/signal_macros.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/macros/sanitizer.h" #include "src/math/nanl.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include #if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64) #define SELECT_LONG_DOUBLE(val, _, __) val @@ -70,8 +71,8 @@ TEST_F(LlvmLibcNanlTest, RandomString) { run_test("123 ", expected); } -#if !defined(LIBC_HAVE_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) +#if !defined(LIBC_HAS_ADDRESS_SANITIZER) && defined(LIBC_TARGET_OS_IS_LINUX) 
TEST_F(LlvmLibcNanlTest, InvalidInput) { EXPECT_DEATH([] { LIBC_NAMESPACE::nanl(nullptr); }, WITH_SIGNAL(SIGSEGV)); } -#endif // LIBC_HAVE_ADDRESS_SANITIZER +#endif // LIBC_HAS_ADDRESS_SANITIZER diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index f3d3c745246af81..b86fcace5703c79 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -272,6 +272,16 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_macros_null_check", + hdrs = ["src/__support/macros/null_check.h"], + deps = [ + ":__support_macros_config", + ":__support_macros_optimization", + ":__support_macros_sanitizer", + ], +) + libc_support_library( name = "__support_common", hdrs = [ @@ -665,6 +675,7 @@ libc_support_library( ":__support_ctype_utils", ":__support_fputil_fp_bits", ":__support_fputil_rounding_mode", + ":__support_macros_null_check", ":__support_str_to_integer", ":__support_str_to_num_result", ":__support_uint128", From be770edee59310b158bf3a30ddc2645007ab8da3 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Wed, 11 Sep 2024 11:22:46 -0700 Subject: [PATCH 22/94] [lld][WebAssembly] Reject shared libraries when `-static`/`-Bstatic` is used (#108263) This matches the behaviour of GNU ld and the ELF version of lld. 
--- lld/test/wasm/static-error.s | 12 ++++++++++++ lld/wasm/Driver.cpp | 10 ++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 lld/test/wasm/static-error.s diff --git a/lld/test/wasm/static-error.s b/lld/test/wasm/static-error.s new file mode 100644 index 000000000000000..3557506a5f07a25 --- /dev/null +++ b/lld/test/wasm/static-error.s @@ -0,0 +1,12 @@ +// RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t.o +// RUN: wasm-ld --experimental-pic -shared -o %t.so %t.o + +// RUN: wasm-ld --experimental-pic -pie -o /dev/null %t.o %t.so +// RUN: not wasm-ld -o /dev/null -static %t.o %t.so 2>&1 | FileCheck %s + +// CHECK: attempted static link of dynamic object + +.global _start +_start: + .functype _start () -> () + end_function diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index cb8fe2534f5fe7c..2de7dcaeb43d47c 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -333,9 +333,15 @@ void LinkerDriver::addFile(StringRef path) { return; } case file_magic::bitcode: - case file_magic::wasm_object: - files.push_back(createObjectFile(mbref, "", 0, inLib)); + case file_magic::wasm_object: { + auto obj = createObjectFile(mbref, "", 0, inLib); + if (config->isStatic && isa(obj)) { + error("attempted static link of dynamic object " + path); + break; + } + files.push_back(obj); break; + } case file_magic::unknown: if (mbref.getBuffer().starts_with("#STUB")) { files.push_back(make(mbref)); From e3f936eb755d9ae37019ffcc7f53d71d2d58d188 Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Wed, 11 Sep 2024 11:34:26 -0700 Subject: [PATCH 23/94] Don't rely on undefined behavior to store how a `User` object's allocation is laid out (#105714) In `User::operator new` a single allocation is created to store the `User` object itself, "intrusive" operands or a pointer for "hung off" operands, and the descriptor. 
After allocation, details about the layout (number of operands, how the operands are stored, if there is a descriptor) are stored in the `User` object by settings its fields. The `Value` and `User` constructors are then very careful not to initialize these fields so that the values set during allocation can be subsequently read. However, when the `User` object is returned from `operator new` [its value is technically "indeterminate" and so reading a field without first initializing it is undefined behavior (and will be erroneous in C++26)](https://en.cppreference.com/w/cpp/language/default_initialization#Indeterminate_and_erroneous_values). We discovered this issue when trying to build LLVM using MSVC's [`/sdl` flag](https://learn.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks?view=msvc-170) which clears class fields after allocation (the docs say that this feature shouldn't be turned on for custom allocators and should only clear pointers, but that doesn't seem to match the implementation). MSVC's behavior both with and without the `/sdl` flag is standards conforming since a program is supposed to initialize storage before reading from it, thus the compiler implementation changing any values will never be observed in a well-formed program. The standard also provides no provisions for making storage bytes not indeterminate by setting them during allocation or `operator new`. The fix for this is to create a set of types that encode the layout and provide these to both `operator new` and the constructor: * The `AllocMarker` types are used to select which `operator new` to use. * `AllocMarker` can then be implicitly converted to a `AllocInfo` which tells the constructor how the type was laid out. 
--- llvm/include/llvm/Analysis/MemorySSA.h | 28 +-- llvm/include/llvm/IR/Constant.h | 4 +- llvm/include/llvm/IR/Constants.h | 35 ++-- llvm/include/llvm/IR/DerivedUser.h | 4 +- llvm/include/llvm/IR/Function.h | 7 +- llvm/include/llvm/IR/GlobalAlias.h | 4 +- llvm/include/llvm/IR/GlobalIFunc.h | 4 +- llvm/include/llvm/IR/GlobalObject.h | 7 +- llvm/include/llvm/IR/GlobalValue.h | 6 +- llvm/include/llvm/IR/GlobalVariable.h | 31 +-- llvm/include/llvm/IR/InstrTypes.h | 22 ++- llvm/include/llvm/IR/Instruction.h | 2 +- llvm/include/llvm/IR/Instructions.h | 262 +++++++++++++++---------- llvm/include/llvm/IR/User.h | 105 +++++++--- llvm/lib/IR/Constants.cpp | 36 ++-- llvm/lib/IR/ConstantsContext.h | 44 +++-- llvm/lib/IR/Function.cpp | 5 +- llvm/lib/IR/Globals.cpp | 11 +- llvm/lib/IR/Instruction.cpp | 4 +- llvm/lib/IR/Instructions.cpp | 235 +++++++++++----------- llvm/lib/IR/User.cpp | 20 +- 21 files changed, 508 insertions(+), 368 deletions(-) diff --git a/llvm/include/llvm/Analysis/MemorySSA.h b/llvm/include/llvm/Analysis/MemorySSA.h index c5eff151ca4180f..09fc34af60dc3c4 100644 --- a/llvm/include/llvm/Analysis/MemorySSA.h +++ b/llvm/include/llvm/Analysis/MemorySSA.h @@ -218,8 +218,8 @@ class MemoryAccess inline unsigned getID() const; MemoryAccess(LLVMContext &C, unsigned Vty, DeleteValueTy DeleteValue, - BasicBlock *BB, unsigned NumOperands) - : DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue), + BasicBlock *BB, AllocInfo AllocInfo) + : DerivedUser(Type::getVoidTy(C), Vty, AllocInfo, DeleteValue), Block(BB) {} // Use deleteValue() to delete a generic MemoryAccess. 
@@ -280,8 +280,8 @@ class MemoryUseOrDef : public MemoryAccess { MemoryUseOrDef(LLVMContext &C, MemoryAccess *DMA, unsigned Vty, DeleteValueTy DeleteValue, Instruction *MI, BasicBlock *BB, - unsigned NumOperands) - : MemoryAccess(C, Vty, DeleteValue, BB, NumOperands), + AllocInfo AllocInfo) + : MemoryAccess(C, Vty, DeleteValue, BB, AllocInfo), MemoryInstruction(MI) { setDefiningAccess(DMA); } @@ -307,15 +307,16 @@ class MemoryUseOrDef : public MemoryAccess { /// MemoryUse's is exactly the set of Instructions for which /// AliasAnalysis::getModRefInfo returns "Ref". class MemoryUse final : public MemoryUseOrDef { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + public: DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess); MemoryUse(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB) - : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB, - /*NumOperands=*/1) {} + : MemoryUseOrDef(C, DMA, MemoryUseVal, deleteMe, MI, BB, AllocMarker) {} // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } static bool classof(const Value *MA) { @@ -367,6 +368,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUse, MemoryAccess) /// associated with them. This use points to the nearest reaching /// MemoryDef/MemoryPhi. 
class MemoryDef final : public MemoryUseOrDef { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: friend class MemorySSA; @@ -374,12 +377,11 @@ class MemoryDef final : public MemoryUseOrDef { MemoryDef(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB, unsigned Ver) - : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB, - /*NumOperands=*/2), + : MemoryUseOrDef(C, DMA, MemoryDefVal, deleteMe, MI, BB, AllocMarker), ID(Ver) {} // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } static bool classof(const Value *MA) { @@ -474,8 +476,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(MemoryUseOrDef, MemoryAccess) /// Because MemoryUse's do not generate new definitions, they do not have this /// issue. class MemoryPhi final : public MemoryAccess { + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } public: void operator delete(void *Ptr) { User::operator delete(Ptr); } @@ -484,7 +488,7 @@ class MemoryPhi final : public MemoryAccess { DECLARE_TRANSPARENT_OPERAND_ACCESSORS(MemoryAccess); MemoryPhi(LLVMContext &C, BasicBlock *BB, unsigned Ver, unsigned NumPreds = 0) - : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, 0), ID(Ver), + : MemoryAccess(C, MemoryPhiVal, deleteMe, BB, AllocMarker), ID(Ver), ReservedSpace(NumPreds) { allocHungoffUses(ReservedSpace); } diff --git a/llvm/include/llvm/IR/Constant.h b/llvm/include/llvm/IR/Constant.h index a82e37b7e2df232..0aefb5ecf6b7f24 100644 --- a/llvm/include/llvm/IR/Constant.h +++ b/llvm/include/llvm/IR/Constant.h @@ -41,8 +41,8 @@ class APInt; /// LLVM Constant Representation class Constant : public User { 
protected: - Constant(Type *ty, ValueTy vty, Use *Ops, unsigned NumOps) - : User(ty, vty, Ops, NumOps) {} + Constant(Type *ty, ValueTy vty, AllocInfo AllocInfo) + : User(ty, vty, AllocInfo) {} ~Constant() = default; diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h index 62ccde96e5397be..3b16aa039a50871 100644 --- a/llvm/include/llvm/IR/Constants.h +++ b/llvm/include/llvm/IR/Constants.h @@ -51,6 +51,8 @@ template struct ConstantAggrKeyType; /// Since they can be in use by unrelated modules (and are never based on /// GlobalValues), it never makes sense to RAUW them. class ConstantData : public Constant { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{0}; + friend class Constant; Value *handleOperandChangeImpl(Value *From, Value *To) { @@ -58,9 +60,9 @@ class ConstantData : public Constant { } protected: - explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, nullptr, 0) {} + explicit ConstantData(Type *Ty, ValueTy VT) : Constant(Ty, VT, AllocMarker) {} - void *operator new(size_t S) { return User::operator new(S, 0); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } public: void operator delete(void *Ptr) { User::operator delete(Ptr); } @@ -399,7 +401,8 @@ class ConstantAggregateZero final : public ConstantData { /// use operands. class ConstantAggregate : public Constant { protected: - ConstantAggregate(Type *T, ValueTy VT, ArrayRef V); + ConstantAggregate(Type *T, ValueTy VT, ArrayRef V, + AllocInfo AllocInfo); public: /// Transparently provide more efficient getOperand methods. 
@@ -425,7 +428,7 @@ class ConstantArray final : public ConstantAggregate { friend struct ConstantAggrKeyType; friend class Constant; - ConstantArray(ArrayType *T, ArrayRef Val); + ConstantArray(ArrayType *T, ArrayRef Val, AllocInfo AllocInfo); void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -457,7 +460,7 @@ class ConstantStruct final : public ConstantAggregate { friend struct ConstantAggrKeyType; friend class Constant; - ConstantStruct(StructType *T, ArrayRef Val); + ConstantStruct(StructType *T, ArrayRef Val, AllocInfo AllocInfo); void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -509,7 +512,7 @@ class ConstantVector final : public ConstantAggregate { friend struct ConstantAggrKeyType; friend class Constant; - ConstantVector(VectorType *T, ArrayRef Val); + ConstantVector(VectorType *T, ArrayRef Val, AllocInfo AllocInfo); void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -890,9 +893,11 @@ class ConstantTargetNone final : public ConstantData { class BlockAddress final : public Constant { friend class Constant; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + BlockAddress(Function *F, BasicBlock *BB); - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -936,9 +941,11 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BlockAddress, Value) class DSOLocalEquivalent final : public Constant { friend class Constant; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + DSOLocalEquivalent(GlobalValue *GV); - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -973,9 +980,11 @@ 
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DSOLocalEquivalent, Value) class NoCFIValue final : public Constant { friend class Constant; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + NoCFIValue(GlobalValue *GV); - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -1013,10 +1022,12 @@ class ConstantPtrAuth final : public Constant { friend struct ConstantPtrAuthKeyType; friend class Constant; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{4}; + ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc, Constant *AddrDisc); - void *operator new(size_t s) { return User::operator new(s, 4); } + void *operator new(size_t s) { return User::operator new(s, AllocMarker); } void destroyConstantImpl(); Value *handleOperandChangeImpl(Value *From, Value *To); @@ -1102,8 +1113,8 @@ class ConstantExpr : public Constant { Value *handleOperandChangeImpl(Value *From, Value *To); protected: - ConstantExpr(Type *ty, unsigned Opcode, Use *Ops, unsigned NumOps) - : Constant(ty, ConstantExprVal, Ops, NumOps) { + ConstantExpr(Type *ty, unsigned Opcode, AllocInfo AllocInfo) + : Constant(ty, ConstantExprVal, AllocInfo) { // Operation type (an Instruction opcode) is stored as the SubclassData. 
setValueSubclassData(Opcode); } diff --git a/llvm/include/llvm/IR/DerivedUser.h b/llvm/include/llvm/IR/DerivedUser.h index a25d316c2d60bc5..a307315864b425a 100644 --- a/llvm/include/llvm/IR/DerivedUser.h +++ b/llvm/include/llvm/IR/DerivedUser.h @@ -34,9 +34,9 @@ class DerivedUser : public User { DeleteValueTy DeleteValue; public: - DerivedUser(Type *Ty, unsigned VK, Use *U, unsigned NumOps, + DerivedUser(Type *Ty, unsigned VK, AllocInfo AllocInfo, DeleteValueTy DeleteValue) - : User(Ty, VK, U, NumOps), DeleteValue(DeleteValue) {} + : User(Ty, VK, AllocInfo), DeleteValue(DeleteValue) {} }; } // end namespace llvm diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index f7e4e976ae4c444..866c68d15e4011d 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -72,6 +72,8 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { using const_arg_iterator = const Argument *; private: + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + // Important things that make up a function! BasicBlockListType BasicBlocks; ///< The basic blocks @@ -171,13 +173,14 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { static Function *Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N = "", Module *M = nullptr) { - return new Function(Ty, Linkage, AddrSpace, N, M); + return new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M); } // TODO: remove this once all users have been updated to pass an AddrSpace static Function *Create(FunctionType *Ty, LinkageTypes Linkage, const Twine &N = "", Module *M = nullptr) { - return new Function(Ty, Linkage, static_cast(-1), N, M); + return new (AllocMarker) + Function(Ty, Linkage, static_cast(-1), N, M); } /// Creates a new function and attaches it to a module. 
diff --git a/llvm/include/llvm/IR/GlobalAlias.h b/llvm/include/llvm/IR/GlobalAlias.h index 583d66e28155d76..3db6984c4a30c34 100644 --- a/llvm/include/llvm/IR/GlobalAlias.h +++ b/llvm/include/llvm/IR/GlobalAlias.h @@ -28,6 +28,8 @@ template class SymbolTableListTraits; class GlobalAlias : public GlobalValue, public ilist_node { friend class SymbolTableListTraits; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Aliasee, Module *Parent); @@ -59,7 +61,7 @@ class GlobalAlias : public GlobalValue, public ilist_node { static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee); // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Provide fast operand accessors diff --git a/llvm/include/llvm/IR/GlobalIFunc.h b/llvm/include/llvm/IR/GlobalIFunc.h index 8935284f32d7598..0d2f152cef403e7 100644 --- a/llvm/include/llvm/IR/GlobalIFunc.h +++ b/llvm/include/llvm/IR/GlobalIFunc.h @@ -34,6 +34,8 @@ template class SymbolTableListTraits; class GlobalIFunc final : public GlobalObject, public ilist_node { friend class SymbolTableListTraits; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Linkage, const Twine &Name, Constant *Resolver, Module *Parent); @@ -48,7 +50,7 @@ class GlobalIFunc final : public GlobalObject, public ilist_node { Constant *Resolver, Module *Parent); // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Provide fast operand accessors diff --git 
a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h index b6a974d8bb9f086..08edc13d81f880a 100644 --- a/llvm/include/llvm/IR/GlobalObject.h +++ b/llvm/include/llvm/IR/GlobalObject.h @@ -40,10 +40,9 @@ class GlobalObject : public GlobalValue { }; protected: - GlobalObject(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps, - LinkageTypes Linkage, const Twine &Name, - unsigned AddressSpace = 0) - : GlobalValue(Ty, VTy, Ops, NumOps, Linkage, Name, AddressSpace) { + GlobalObject(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage, + const Twine &Name, unsigned AddressSpace = 0) + : GlobalValue(Ty, VTy, AllocInfo, Linkage, Name, AddressSpace) { setGlobalValueSubClassData(0); } ~GlobalObject(); diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h index 53eddebdd6ae68c..d9104d7af5f9725 100644 --- a/llvm/include/llvm/IR/GlobalValue.h +++ b/llvm/include/llvm/IR/GlobalValue.h @@ -77,9 +77,9 @@ class GlobalValue : public Constant { }; protected: - GlobalValue(Type *Ty, ValueTy VTy, Use *Ops, unsigned NumOps, - LinkageTypes Linkage, const Twine &Name, unsigned AddressSpace) - : Constant(PointerType::get(Ty, AddressSpace), VTy, Ops, NumOps), + GlobalValue(Type *Ty, ValueTy VTy, AllocInfo AllocInfo, LinkageTypes Linkage, + const Twine &Name, unsigned AddressSpace) + : Constant(PointerType::get(Ty, AddressSpace), VTy, AllocInfo), ValueType(Ty), Visibility(DefaultVisibility), UnnamedAddrVal(unsigned(UnnamedAddr::None)), DllStorageClass(DefaultStorageClass), ThreadLocal(NotThreadLocal), diff --git a/llvm/include/llvm/IR/GlobalVariable.h b/llvm/include/llvm/IR/GlobalVariable.h index 0736c300de72f58..83e484816d7d4c9 100644 --- a/llvm/include/llvm/IR/GlobalVariable.h +++ b/llvm/include/llvm/IR/GlobalVariable.h @@ -39,6 +39,8 @@ class DIGlobalVariableExpression; class GlobalVariable : public GlobalObject, public ilist_node { friend class SymbolTableListTraits; + constexpr static IntrusiveOperandsAllocMarker 
AllocMarker{1}; + AttributeSet Attrs; // Is this a global constant? @@ -70,24 +72,31 @@ class GlobalVariable : public GlobalObject, public ilist_node { GlobalVariable(const GlobalVariable &) = delete; GlobalVariable &operator=(const GlobalVariable &) = delete; +private: + /// Set the number of operands on a GlobalVariable. + /// + /// GlobalVariable always allocates space for a single operands, but + /// doesn't always use it. + void setGlobalVariableNumOperands(unsigned NumOps) { + assert(NumOps <= 1 && "GlobalVariable can only have 0 or 1 operands"); + NumUserOperands = NumOps; + } + +public: ~GlobalVariable() { dropAllReferences(); + + // Number of operands can be set to 0 after construction and initialization. + // Make sure that number of operands is reset to 1, as this is needed in + // User::operator delete + setGlobalVariableNumOperands(1); } // allocate space for exactly one operand - void *operator new(size_t s) { - return User::operator new(s, 1); - } + void *operator new(size_t s) { return User::operator new(s, AllocMarker); } // delete space for exactly one operand as created in the corresponding new operator - void operator delete(void *ptr){ - assert(ptr != nullptr && "must not be nullptr"); - User *Obj = static_cast(ptr); - // Number of operands can be set to 0 after construction and initialization. 
Make sure - // that number of operands is reset to 1, as this is needed in User::operator delete - Obj->setGlobalVariableNumOperands(1); - User::operator delete(Obj); - } + void operator delete(void *ptr) { User::operator delete(ptr); } /// Provide fast operand accessors DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 5ed3ec46dce57db..4720533bac85980 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -55,24 +55,26 @@ typedef unsigned ID; //===----------------------------------------------------------------------===// class UnaryInstruction : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + protected: UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock::iterator IB) - : Instruction(Ty, iType, &Op<0>(), 1, IB) { + : Instruction(Ty, iType, AllocMarker, IB) { Op<0>() = V; } UnaryInstruction(Type *Ty, unsigned iType, Value *V, Instruction *IB = nullptr) - : Instruction(Ty, iType, &Op<0>(), 1, IB) { + : Instruction(Ty, iType, AllocMarker, IB) { Op<0>() = V; } UnaryInstruction(Type *Ty, unsigned iType, Value *V, BasicBlock *IAE) - : Instruction(Ty, iType, &Op<0>(), 1, IAE) { + : Instruction(Ty, iType, AllocMarker, IAE) { Op<0>() = V; } public: // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. 
@@ -186,6 +188,8 @@ class UnaryOperator : public UnaryInstruction { //===----------------------------------------------------------------------===// class BinaryOperator : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + void AssertOK(); protected: @@ -199,7 +203,7 @@ class BinaryOperator : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -745,6 +749,8 @@ class PossiblyNonNegInst : public CastInst { /// This class is the base class for the comparison instructions. /// Abstract base class of comparison instructions. class CmpInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: /// This enumeration lists the possible predicates for CmpInst subclasses. 
/// Values in the range 0-31 are reserved for FCmpInst, while values in the @@ -814,7 +820,7 @@ class CmpInst : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Construct a compare instruction, given the opcode, the predicate and @@ -2416,10 +2422,10 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CallBase, Value) //===----------------------------------------------------------------------===// class FuncletPadInst : public Instruction { private: - FuncletPadInst(const FuncletPadInst &CPI); + FuncletPadInst(const FuncletPadInst &CPI, AllocInfo AllocInfo); explicit FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, - ArrayRef Args, unsigned Values, + ArrayRef Args, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore); void init(Value *ParentPad, ArrayRef Args, const Twine &NameStr); diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index c27572300d50630..a12d5d9d8fe9454 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -1030,7 +1030,7 @@ class Instruction : public User, setValueSubclassData(Storage); } - Instruction(Type *Ty, unsigned iType, Use *Ops, unsigned NumOps, + Instruction(Type *Ty, unsigned iType, AllocInfo AllocInfo, InsertPosition InsertBefore = nullptr); private: diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index ab3321ee755717a..e89739a55526625 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -297,6 +297,8 @@ class StoreInst : public Instruction { void AssertOK(); + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + protected: // Note: Instruction needs to be a friend here to call cloneImpl. 
friend class Instruction; @@ -314,7 +316,7 @@ class StoreInst : public Instruction { InsertPosition InsertBefore = nullptr); // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Return true if this is a store to a volatile memory location. @@ -420,6 +422,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(StoreInst, Value) class FenceInst : public Instruction { using OrderingField = AtomicOrderingBitfieldElementT<0>; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{0}; + void Init(AtomicOrdering Ordering, SyncScope::ID SSID); protected: @@ -436,7 +440,7 @@ class FenceInst : public Instruction { InsertPosition InsertBefore = nullptr); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S, 0); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Returns the ordering constraint of this fence instruction. @@ -502,6 +506,8 @@ class AtomicCmpXchgInst : public Instruction { typename Bitfield::Element; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{3}; + protected: // Note: Instruction needs to be a friend here to call cloneImpl. 
friend class Instruction; @@ -515,7 +521,7 @@ class AtomicCmpXchgInst : public Instruction { InsertPosition InsertBefore = nullptr); // allocate space for exactly three operands - void *operator new(size_t S) { return User::operator new(S, 3); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } using VolatileField = BoolBitfieldElementT<0>; @@ -774,13 +780,15 @@ class AtomicRMWInst : public Instruction { using BinOpBitfieldElement = typename Bitfield::Element; + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, Align Alignment, AtomicOrdering Ordering, SyncScope::ID SSID, InsertPosition InsertBefore = nullptr); // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } using VolatileField = BoolBitfieldElementT<0>; @@ -924,14 +932,14 @@ class GetElementPtrInst : public Instruction { Type *SourceElementType; Type *ResultElementType; - GetElementPtrInst(const GetElementPtrInst &GEPI); + GetElementPtrInst(const GetElementPtrInst &GEPI, AllocInfo AllocInfo); /// Constructors - Create a getelementptr instruction with a base pointer an /// list of indices. The first and second ctor can optionally insert before an /// existing instruction, the third appends the new instruction to the /// specified BasicBlock. 
inline GetElementPtrInst(Type *PointeeType, Value *Ptr, - ArrayRef IdxList, unsigned Values, + ArrayRef IdxList, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore); void init(Value *Ptr, ArrayRef IdxList, const Twine &NameStr); @@ -949,8 +957,9 @@ class GetElementPtrInst : public Instruction { InsertPosition InsertBefore = nullptr) { unsigned Values = 1 + unsigned(IdxList.size()); assert(PointeeType && "Must specify element type"); - return new (Values) GetElementPtrInst(PointeeType, Ptr, IdxList, Values, - NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{Values}; + return new (AllocMarker) GetElementPtrInst( + PointeeType, Ptr, IdxList, AllocMarker, NameStr, InsertBefore); } static GetElementPtrInst *Create(Type *PointeeType, Value *Ptr, @@ -1124,12 +1133,11 @@ struct OperandTraits : public VariadicOperandTraits {}; GetElementPtrInst::GetElementPtrInst(Type *PointeeType, Value *Ptr, - ArrayRef IdxList, unsigned Values, - const Twine &NameStr, + ArrayRef IdxList, + AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr, - OperandTraits::op_end(this) - Values, - Values, InsertBefore), + : Instruction(getGEPReturnType(Ptr, IdxList), GetElementPtr, AllocInfo, + InsertBefore), SourceElementType(PointeeType), ResultElementType(getIndexedType(PointeeType, IdxList)) { init(Ptr, IdxList, NameStr); @@ -1403,26 +1411,29 @@ class FCmpInst: public CmpInst { /// hold the calling convention of the call. 
/// class CallInst : public CallBase { - CallInst(const CallInst &CI); + CallInst(const CallInst &CI, AllocInfo AllocInfo); /// Construct a CallInst from a range of arguments inline CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr, - InsertPosition InsertBefore); + AllocInfo AllocInfo, InsertPosition InsertBefore); inline CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, - const Twine &NameStr, InsertPosition InsertBefore) - : CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore) {} + const Twine &NameStr, AllocInfo AllocInfo, + InsertPosition InsertBefore) + : CallInst(Ty, Func, Args, std::nullopt, NameStr, AllocInfo, + InsertBefore) {} explicit CallInst(FunctionType *Ty, Value *F, const Twine &NameStr, - InsertPosition InsertBefore); + AllocInfo AllocInfo, InsertPosition InsertBefore); void init(FunctionType *FTy, Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr); void init(FunctionType *FTy, Value *Func, const Twine &NameStr); /// Compute the number of operands to allocate. - static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) { + static unsigned ComputeNumOperands(unsigned NumArgs, + unsigned NumBundleInputs = 0) { // We need one operand for the called function, plus the input operand // counts provided. 
return 1 + NumArgs + NumBundleInputs; @@ -1437,26 +1448,29 @@ class CallInst : public CallBase { public: static CallInst *Create(FunctionType *Ty, Value *F, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - return new (ComputeNumOperands(0)) CallInst(Ty, F, NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{ComputeNumOperands(0)}; + return new (AllocMarker) + CallInst(Ty, F, NameStr, AllocMarker, InsertBefore); } static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef Args, const Twine &NameStr, InsertPosition InsertBefore = nullptr) { - return new (ComputeNumOperands(Args.size())) - CallInst(Ty, Func, Args, std::nullopt, NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{ComputeNumOperands(Args.size())}; + return new (AllocMarker) CallInst(Ty, Func, Args, std::nullopt, NameStr, + AllocMarker, InsertBefore); } static CallInst *Create(FunctionType *Ty, Value *Func, ArrayRef Args, ArrayRef Bundles = std::nullopt, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - const int NumOperands = - ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); - const unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + ComputeNumOperands(unsigned(Args.size()), CountBundleInputs(Bundles)), + unsigned(Bundles.size() * sizeof(BundleOpInfo))}; - return new (NumOperands, DescriptorBytes) - CallInst(Ty, Func, Args, Bundles, NameStr, InsertBefore); + return new (AllocMarker) + CallInst(Ty, Func, Args, Bundles, NameStr, AllocMarker, InsertBefore); } static CallInst *Create(FunctionCallee Func, const Twine &NameStr = "", @@ -1561,12 +1575,11 @@ class CallInst : public CallBase { CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr, - InsertPosition InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::Call, - OperandTraits::op_end(this) - - (Args.size() + CountBundleInputs(Bundles) 
+ 1), - unsigned(Args.size() + CountBundleInputs(Bundles) + 1), + AllocInfo AllocInfo, InsertPosition InsertBefore) + : CallBase(Ty->getReturnType(), Instruction::Call, AllocInfo, InsertBefore) { + assert(AllocInfo.NumOps == + unsigned(Args.size() + CountBundleInputs(Bundles) + 1)); init(Ty, Func, Args, Bundles, NameStr); } @@ -1577,10 +1590,11 @@ CallInst::CallInst(FunctionType *Ty, Value *Func, ArrayRef Args, /// This class represents the LLVM 'select' instruction. /// class SelectInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{3}; SelectInst(Value *C, Value *S1, Value *S2, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(S1->getType(), Instruction::Select, &Op<0>(), 3, + : Instruction(S1->getType(), Instruction::Select, AllocMarker, InsertBefore) { init(C, S1, S2); setName(NameStr); @@ -1604,7 +1618,8 @@ class SelectInst : public Instruction { const Twine &NameStr = "", InsertPosition InsertBefore = nullptr, Instruction *MDFrom = nullptr) { - SelectInst *Sel = new(3) SelectInst(C, S1, S2, NameStr, InsertBefore); + SelectInst *Sel = + new (AllocMarker) SelectInst(C, S1, S2, NameStr, InsertBefore); if (MDFrom) Sel->copyMetadata(*MDFrom); return Sel; @@ -1693,6 +1708,8 @@ class VAArgInst : public UnaryInstruction { /// element from a VectorType value /// class ExtractElementInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + ExtractElementInst(Value *Vec, Value *Idx, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr); @@ -1706,7 +1723,8 @@ class ExtractElementInst : public Instruction { static ExtractElementInst *Create(Value *Vec, Value *Idx, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - return new(2) ExtractElementInst(Vec, Idx, NameStr, InsertBefore); + return new (AllocMarker) + ExtractElementInst(Vec, Idx, NameStr, InsertBefore); } /// Return true if an extractelement instruction can be @@ -1749,6 +1767,8 @@ 
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ExtractElementInst, Value) /// element into a VectorType value /// class InsertElementInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{3}; + InsertElementInst(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr); @@ -1763,7 +1783,8 @@ class InsertElementInst : public Instruction { static InsertElementInst *Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - return new(3) InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore); + return new (AllocMarker) + InsertElementInst(Vec, NewElt, Idx, NameStr, InsertBefore); } /// Return true if an insertelement instruction can be @@ -1813,6 +1834,8 @@ constexpr int PoisonMaskElem = -1; /// For scalable vectors, all the elements of the mask must be 0 or -1. This /// requirement may be relaxed in the future. class ShuffleVectorInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + SmallVector ShuffleMask; Constant *ShuffleMaskForBitcode; @@ -1834,7 +1857,7 @@ class ShuffleVectorInst : public Instruction { const Twine &NameStr = "", InsertPosition InsertBefore = nullptr); - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Swap the operands and adjust the mask to preserve the semantics @@ -2395,6 +2418,8 @@ ExtractValueInst::ExtractValueInst(Value *Agg, ArrayRef Idxs, /// value into an aggregate value. 
/// class InsertValueInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + SmallVector Indices; InsertValueInst(const InsertValueInst &IVI); @@ -2423,7 +2448,7 @@ class InsertValueInst : public Instruction { public: // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } static InsertValueInst *Create(Value *Agg, Value *Val, @@ -2493,9 +2518,7 @@ struct OperandTraits : InsertValueInst::InsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(Agg->getType(), InsertValue, - OperandTraits::op_begin(this), 2, - InsertBefore) { + : Instruction(Agg->getType(), InsertValue, AllocMarker, InsertBefore) { init(Agg, Val, Idxs, NameStr); } @@ -2510,6 +2533,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueInst, Value) // scientist's overactive imagination. // class PHINode : public Instruction { + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + /// The number of operands actually allocated. NumOperands is /// the number actually in use. 
unsigned ReservedSpace; @@ -2519,7 +2544,7 @@ class PHINode : public Instruction { explicit PHINode(Type *Ty, unsigned NumReservedValues, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) - : Instruction(Ty, Instruction::PHI, nullptr, 0, InsertBefore), + : Instruction(Ty, Instruction::PHI, AllocMarker, InsertBefore), ReservedSpace(NumReservedValues) { assert(!Ty->isTokenTy() && "PHI nodes cannot have token type!"); setName(NameStr); @@ -2545,7 +2570,8 @@ class PHINode : public Instruction { static PHINode *Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - return new PHINode(Ty, NumReservedValues, NameStr, InsertBefore); + return new (AllocMarker) + PHINode(Ty, NumReservedValues, NameStr, InsertBefore); } /// Provide fast operand accessors @@ -2749,6 +2775,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(PHINode, Value) class LandingPadInst : public Instruction { using CleanupField = BoolBitfieldElementT<0>; + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + /// The number of operands actually allocated. NumOperands is /// the number actually in use. unsigned ReservedSpace; @@ -2763,7 +2791,7 @@ class LandingPadInst : public Instruction { const Twine &NameStr, InsertPosition InsertBefore); // Allocate space for exactly zero operands. - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void growOperands(unsigned Size); void init(unsigned NumReservedValues, const Twine &NameStr); @@ -2843,7 +2871,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(LandingPadInst, Value) /// does not continue in this function any longer. 
/// class ReturnInst : public Instruction { - ReturnInst(const ReturnInst &RI); + ReturnInst(const ReturnInst &RI, AllocInfo AllocInfo); private: // ReturnInst constructors: @@ -2859,8 +2887,8 @@ class ReturnInst : public Instruction { // // NOTE: If the Value* passed is of type void then the constructor behaves as // if it was passed NULL. - explicit ReturnInst(LLVMContext &C, Value *retVal = nullptr, - InsertPosition InsertBefore = nullptr); + explicit ReturnInst(LLVMContext &C, Value *retVal, AllocInfo AllocInfo, + InsertPosition InsertBefore); protected: // Note: Instruction needs to be a friend here to call cloneImpl. @@ -2871,11 +2899,13 @@ class ReturnInst : public Instruction { public: static ReturnInst *Create(LLVMContext &C, Value *retVal = nullptr, InsertPosition InsertBefore = nullptr) { - return new(!!retVal) ReturnInst(C, retVal, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{retVal ? 1U : 0U}; + return new (AllocMarker) ReturnInst(C, retVal, AllocMarker, InsertBefore); } static ReturnInst *Create(LLVMContext &C, BasicBlock *InsertAtEnd) { - return new (0) ReturnInst(C, nullptr, InsertAtEnd); + IntrusiveOperandsAllocMarker AllocMarker{0}; + return new (AllocMarker) ReturnInst(C, nullptr, AllocMarker, InsertAtEnd); } /// Provide fast operand accessors @@ -2923,7 +2953,7 @@ class BranchInst : public Instruction { /// [Cond, FalseDest,] TrueDest. This makes some accessors faster because /// they don't have to check for cond/uncond branchness. These are mostly /// accessed relative from op_end(). 
- BranchInst(const BranchInst &BI); + BranchInst(const BranchInst &BI, AllocInfo AllocInfo); // BranchInst constructors (where {B, T, F} are blocks, and C is a condition): // BranchInst(BB *B) - 'br B' // BranchInst(BB* T, BB *F, Value *C) - 'br C, T, F' @@ -2933,10 +2963,10 @@ class BranchInst : public Instruction { // BranchInst(BB* T, BB *F, Value *C, Inst *I) - 'br C, T, F', insert before I // BranchInst(BB* B, BB *I) - 'br B' insert at end // BranchInst(BB* T, BB *F, Value *C, BB *I) - 'br C, T, F', insert at end - explicit BranchInst(BasicBlock *IfTrue, - InsertPosition InsertBefore = nullptr); + explicit BranchInst(BasicBlock *IfTrue, AllocInfo AllocInfo, + InsertPosition InsertBefore); BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, - InsertPosition InsertBefore = nullptr); + AllocInfo AllocInfo, InsertPosition InsertBefore); void AssertOK(); @@ -2976,13 +3006,16 @@ class BranchInst : public Instruction { static BranchInst *Create(BasicBlock *IfTrue, InsertPosition InsertBefore = nullptr) { - return new(1) BranchInst(IfTrue, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{1}; + return new (AllocMarker) BranchInst(IfTrue, AllocMarker, InsertBefore); } static BranchInst *Create(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, InsertPosition InsertBefore = nullptr) { - return new(3) BranchInst(IfTrue, IfFalse, Cond, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{3}; + return new (AllocMarker) + BranchInst(IfTrue, IfFalse, Cond, AllocMarker, InsertBefore); } /// Transparently provide more efficient getOperand methods. 
@@ -3054,6 +3087,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value) /// Multiway switch /// class SwitchInst : public Instruction { + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + unsigned ReservedSpace; // Operand[0] = Value to switch on @@ -3070,7 +3105,7 @@ class SwitchInst : public Instruction { InsertPosition InsertBefore); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void init(Value *Value, BasicBlock *Default, unsigned NumReserved); void growOperands(); @@ -3442,6 +3477,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value) /// Indirect Branch Instruction. /// class IndirectBrInst : public Instruction { + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + unsigned ReservedSpace; // Operand[0] = Address to jump to @@ -3456,7 +3493,7 @@ class IndirectBrInst : public Instruction { InsertPosition InsertBefore); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void init(Value *Address, unsigned NumDests); void growOperands(); @@ -3576,14 +3613,14 @@ class InvokeInst : public CallBase { /// The index from the end of the operand array to the unwind destination. static constexpr int UnwindDestOpEndIdx = -2; - InvokeInst(const InvokeInst &BI); + InvokeInst(const InvokeInst &BI, AllocInfo AllocInfo); /// Construct an InvokeInst given a range of arguments. 
/// /// Construct an InvokeInst from a range of arguments inline InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, - ArrayRef Bundles, int NumOperands, + ArrayRef Bundles, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore); void init(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, @@ -3591,10 +3628,11 @@ class InvokeInst : public CallBase { ArrayRef Bundles, const Twine &NameStr); /// Compute the number of operands to allocate. - static int ComputeNumOperands(int NumArgs, int NumBundleInputs = 0) { + static unsigned ComputeNumOperands(unsigned NumArgs, + size_t NumBundleInputs = 0) { // We need one operand for the called function, plus our extra operands and // the input operand counts provided. - return 1 + NumExtraOperands + NumArgs + NumBundleInputs; + return 1 + NumExtraOperands + NumArgs + unsigned(NumBundleInputs); } protected: @@ -3608,10 +3646,11 @@ class InvokeInst : public CallBase { BasicBlock *IfException, ArrayRef Args, const Twine &NameStr, InsertPosition InsertBefore = nullptr) { - int NumOperands = ComputeNumOperands(Args.size()); - return new (NumOperands) + IntrusiveOperandsAllocMarker AllocMarker{ + ComputeNumOperands(unsigned(Args.size()))}; + return new (AllocMarker) InvokeInst(Ty, Func, IfNormal, IfException, Args, std::nullopt, - NumOperands, NameStr, InsertBefore); + AllocMarker, NameStr, InsertBefore); } static InvokeInst *Create(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, @@ -3619,12 +3658,12 @@ class InvokeInst : public CallBase { ArrayRef Bundles = std::nullopt, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - int NumOperands = - ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)); - unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)), + unsigned(Bundles.size() * sizeof(BundleOpInfo))}; - 
return new (NumOperands, DescriptorBytes) - InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, NumOperands, + return new (AllocMarker) + InvokeInst(Ty, Func, IfNormal, IfException, Args, Bundles, AllocMarker, NameStr, InsertBefore); } @@ -3709,10 +3748,9 @@ class InvokeInst : public CallBase { InvokeInst::InvokeInst(FunctionType *Ty, Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, - ArrayRef Bundles, int NumOperands, + ArrayRef Bundles, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::Invoke, - OperandTraits::op_end(this) - NumOperands, NumOperands, + : CallBase(Ty->getReturnType(), Instruction::Invoke, AllocInfo, InsertBefore) { init(Ty, Func, IfNormal, IfException, Args, Bundles, NameStr); } @@ -3729,7 +3767,7 @@ class CallBrInst : public CallBase { unsigned NumIndirectDests; - CallBrInst(const CallBrInst &BI); + CallBrInst(const CallBrInst &BI, AllocInfo AllocInfo); /// Construct a CallBrInst given a range of arguments. /// @@ -3737,7 +3775,7 @@ class CallBrInst : public CallBase { inline CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, ArrayRef IndirectDests, ArrayRef Args, ArrayRef Bundles, - int NumOperands, const Twine &NameStr, + AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore); void init(FunctionType *FTy, Value *Func, BasicBlock *DefaultDest, @@ -3745,11 +3783,11 @@ class CallBrInst : public CallBase { ArrayRef Bundles, const Twine &NameStr); /// Compute the number of operands to allocate. - static int ComputeNumOperands(int NumArgs, int NumIndirectDests, - int NumBundleInputs = 0) { + static unsigned ComputeNumOperands(int NumArgs, int NumIndirectDests, + int NumBundleInputs = 0) { // We need one operand for the called function, plus our extra operands and // the input operand counts provided. 
- return 2 + NumIndirectDests + NumArgs + NumBundleInputs; + return unsigned(2 + NumIndirectDests + NumArgs + NumBundleInputs); } protected: @@ -3764,10 +3802,11 @@ class CallBrInst : public CallBase { ArrayRef IndirectDests, ArrayRef Args, const Twine &NameStr, InsertPosition InsertBefore = nullptr) { - int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size()); - return new (NumOperands) + IntrusiveOperandsAllocMarker AllocMarker{ + ComputeNumOperands(Args.size(), IndirectDests.size())}; + return new (AllocMarker) CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, std::nullopt, - NumOperands, NameStr, InsertBefore); + AllocMarker, NameStr, InsertBefore); } static CallBrInst * @@ -3775,13 +3814,14 @@ class CallBrInst : public CallBase { ArrayRef IndirectDests, ArrayRef Args, ArrayRef Bundles = std::nullopt, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - int NumOperands = ComputeNumOperands(Args.size(), IndirectDests.size(), - CountBundleInputs(Bundles)); - unsigned DescriptorBytes = Bundles.size() * sizeof(BundleOpInfo); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + ComputeNumOperands(Args.size(), IndirectDests.size(), + CountBundleInputs(Bundles)), + unsigned(Bundles.size() * sizeof(BundleOpInfo))}; - return new (NumOperands, DescriptorBytes) + return new (AllocMarker) CallBrInst(Ty, Func, DefaultDest, IndirectDests, Args, Bundles, - NumOperands, NameStr, InsertBefore); + AllocMarker, NameStr, InsertBefore); } static CallBrInst *Create(FunctionCallee Func, BasicBlock *DefaultDest, @@ -3881,10 +3921,9 @@ class CallBrInst : public CallBase { CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, ArrayRef IndirectDests, ArrayRef Args, - ArrayRef Bundles, int NumOperands, + ArrayRef Bundles, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::CallBr, - OperandTraits::op_end(this) - NumOperands, NumOperands, + : 
CallBase(Ty->getReturnType(), Instruction::CallBr, AllocInfo, InsertBefore) { init(Ty, Func, DefaultDest, IndirectDests, Args, Bundles, NameStr); } @@ -3897,6 +3936,8 @@ CallBrInst::CallBrInst(FunctionType *Ty, Value *Func, BasicBlock *DefaultDest, /// Resume the propagation of an exception. /// class ResumeInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + ResumeInst(const ResumeInst &RI); explicit ResumeInst(Value *Exn, InsertPosition InsertBefore = nullptr); @@ -3909,7 +3950,7 @@ class ResumeInst : public Instruction { public: static ResumeInst *Create(Value *Exn, InsertPosition InsertBefore = nullptr) { - return new(1) ResumeInst(Exn, InsertBefore); + return new (AllocMarker) ResumeInst(Exn, InsertBefore); } /// Provide fast operand accessors @@ -3951,6 +3992,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value) class CatchSwitchInst : public Instruction { using UnwindDestField = BoolBitfieldElementT<0>; + constexpr static HungOffOperandsAllocMarker AllocMarker{}; + /// The number of operands actually allocated. NumOperands is /// the number actually in use. 
unsigned ReservedSpace; @@ -3969,7 +4012,7 @@ class CatchSwitchInst : public Instruction { InsertPosition InsertBefore); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void init(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReserved); void growOperands(unsigned Size); @@ -4114,9 +4157,9 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CatchSwitchInst, Value) class CleanupPadInst : public FuncletPadInst { private: explicit CleanupPadInst(Value *ParentPad, ArrayRef Args, - unsigned Values, const Twine &NameStr, + AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, Values, + : FuncletPadInst(Instruction::CleanupPad, ParentPad, Args, AllocInfo, NameStr, InsertBefore) {} public: @@ -4124,9 +4167,9 @@ class CleanupPadInst : public FuncletPadInst { ArrayRef Args = std::nullopt, const Twine &NameStr = "", InsertPosition InsertBefore = nullptr) { - unsigned Values = 1 + Args.size(); - return new (Values) - CleanupPadInst(ParentPad, Args, Values, NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{unsigned(1 + Args.size())}; + return new (AllocMarker) + CleanupPadInst(ParentPad, Args, AllocMarker, NameStr, InsertBefore); } /// Methods for support type inquiry through isa, cast, and dyn_cast: @@ -4144,18 +4187,18 @@ class CleanupPadInst : public FuncletPadInst { class CatchPadInst : public FuncletPadInst { private: explicit CatchPadInst(Value *CatchSwitch, ArrayRef Args, - unsigned Values, const Twine &NameStr, + AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, Values, + : FuncletPadInst(Instruction::CatchPad, CatchSwitch, Args, AllocInfo, NameStr, InsertBefore) {} public: static CatchPadInst *Create(Value *CatchSwitch, ArrayRef Args, const Twine &NameStr = "", 
InsertPosition InsertBefore = nullptr) { - unsigned Values = 1 + Args.size(); - return new (Values) - CatchPadInst(CatchSwitch, Args, Values, NameStr, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{unsigned(1 + Args.size())}; + return new (AllocMarker) + CatchPadInst(CatchSwitch, Args, AllocMarker, NameStr, InsertBefore); } /// Convenience accessors @@ -4181,6 +4224,8 @@ class CatchPadInst : public FuncletPadInst { //===----------------------------------------------------------------------===// class CatchReturnInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + CatchReturnInst(const CatchReturnInst &RI); CatchReturnInst(Value *CatchPad, BasicBlock *BB, InsertPosition InsertBefore); @@ -4197,7 +4242,7 @@ class CatchReturnInst : public Instruction { InsertPosition InsertBefore = nullptr) { assert(CatchPad); assert(BB); - return new (2) CatchReturnInst(CatchPad, BB, InsertBefore); + return new (AllocMarker) CatchReturnInst(CatchPad, BB, InsertBefore); } /// Provide fast operand accessors @@ -4257,9 +4302,9 @@ class CleanupReturnInst : public Instruction { using UnwindDestField = BoolBitfieldElementT<0>; private: - CleanupReturnInst(const CleanupReturnInst &RI); - CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, unsigned Values, - InsertPosition InsertBefore = nullptr); + CleanupReturnInst(const CleanupReturnInst &RI, AllocInfo AllocInfo); + CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, + AllocInfo AllocInfo, InsertPosition InsertBefore = nullptr); void init(Value *CleanupPad, BasicBlock *UnwindBB); @@ -4277,8 +4322,9 @@ class CleanupReturnInst : public Instruction { unsigned Values = 1; if (UnwindBB) ++Values; - return new (Values) - CleanupReturnInst(CleanupPad, UnwindBB, Values, InsertBefore); + IntrusiveOperandsAllocMarker AllocMarker{Values}; + return new (AllocMarker) + CleanupReturnInst(CleanupPad, UnwindBB, AllocMarker, InsertBefore); } /// Provide fast operand accessors @@ -4350,6 
+4396,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(CleanupReturnInst, Value) /// end of the block cannot be reached. /// class UnreachableInst : public Instruction { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{0}; + protected: // Note: Instruction needs to be a friend here to call cloneImpl. friend class Instruction; @@ -4361,7 +4409,7 @@ class UnreachableInst : public Instruction { InsertPosition InsertBefore = nullptr); // allocate space for exactly zero operands - void *operator new(size_t S) { return User::operator new(S, 0); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } unsigned getNumSuccessors() const { return 0; } diff --git a/llvm/include/llvm/IR/User.h b/llvm/include/llvm/IR/User.h index 910815f236abea4..39e1314bd8130b2 100644 --- a/llvm/include/llvm/IR/User.h +++ b/llvm/include/llvm/IR/User.h @@ -43,39 +43,86 @@ struct OperandTraits; class User : public Value { friend struct HungoffOperandTraits; + template friend struct ConstantAggrKeyType; LLVM_ATTRIBUTE_ALWAYS_INLINE static void * allocateFixedOperandUser(size_t, unsigned, unsigned); protected: + // Disable the default operator new, as all subclasses must use one of the + // custom operators below depending on how they store their operands. + void *operator new(size_t Size) = delete; + + /// Indicates this User has operands "hung off" in another allocation. + struct HungOffOperandsAllocMarker {}; + + /// Indicates this User has operands co-allocated. + struct IntrusiveOperandsAllocMarker { + /// The number of operands for this User. + const unsigned NumOps; + }; + + /// Indicates this User has operands and a descriptor co-allocated. + struct IntrusiveOperandsAndDescriptorAllocMarker { + /// The number of operands for this User. + const unsigned NumOps; + /// The number of bytes to allocate for the descriptor. Must be divisible by + /// `sizeof(void *)`. 
+ const unsigned DescBytes; + }; + + /// Information about how a User object was allocated, to be passed into the + /// User constructor. + /// + /// DO NOT USE DIRECTLY. Use one of the `AllocMarker` structs instead, they + /// can all be implicitly converted to `AllocInfo`. + struct AllocInfo { + public: + const unsigned NumOps : NumUserOperandsBits; + const bool HasHungOffUses : 1; + const bool HasDescriptor : 1; + + AllocInfo() = delete; + + constexpr AllocInfo(const HungOffOperandsAllocMarker) + : NumOps(0), HasHungOffUses(true), HasDescriptor(false) {} + + constexpr AllocInfo(const IntrusiveOperandsAllocMarker Alloc) + : NumOps(Alloc.NumOps), HasHungOffUses(false), HasDescriptor(false) {} + + constexpr AllocInfo(const IntrusiveOperandsAndDescriptorAllocMarker Alloc) + : NumOps(Alloc.NumOps), HasHungOffUses(false), + HasDescriptor(Alloc.DescBytes != 0) {} + }; + /// Allocate a User with an operand pointer co-allocated. /// /// This is used for subclasses which need to allocate a variable number /// of operands, ie, 'hung off uses'. - void *operator new(size_t Size); + void *operator new(size_t Size, HungOffOperandsAllocMarker); /// Allocate a User with the operands co-allocated. /// /// This is used for subclasses which have a fixed number of operands. - void *operator new(size_t Size, unsigned Us); + void *operator new(size_t Size, IntrusiveOperandsAllocMarker allocTrait); /// Allocate a User with the operands co-allocated. If DescBytes is non-zero /// then allocate an additional DescBytes bytes before the operands. These /// bytes can be accessed by calling getDescriptor. - /// - /// DescBytes needs to be divisible by sizeof(void *). The allocated - /// descriptor, if any, is aligned to sizeof(void *) bytes. - /// - /// This is used for subclasses which have a fixed number of operands. 
- void *operator new(size_t Size, unsigned Us, unsigned DescBytes); - - User(Type *ty, unsigned vty, Use *, unsigned NumOps) - : Value(ty, vty) { - assert(NumOps < (1u << NumUserOperandsBits) && "Too many operands"); - NumUserOperands = NumOps; + void *operator new(size_t Size, + IntrusiveOperandsAndDescriptorAllocMarker allocTrait); + + User(Type *ty, unsigned vty, AllocInfo AllocInfo) : Value(ty, vty) { + assert(AllocInfo.NumOps < (1u << NumUserOperandsBits) && + "Too many operands"); + NumUserOperands = AllocInfo.NumOps; + assert((!AllocInfo.HasDescriptor || !AllocInfo.HasHungOffUses) && + "Cannot have both hung off uses and a descriptor"); + HasHungOffUses = AllocInfo.HasHungOffUses; + HasDescriptor = AllocInfo.HasDescriptor; // If we have hung off uses, then the operand list should initially be // null. - assert((!HasHungOffUses || !getOperandList()) && + assert((!AllocInfo.HasHungOffUses || !getOperandList()) && "Error in initializing hung off uses for User"); } @@ -98,7 +145,20 @@ class User : public Value { /// Free memory allocated for User and Use objects. void operator delete(void *Usr); /// Placement delete - required by std, called if the ctor throws. - void operator delete(void *Usr, unsigned) { + void operator delete(void *Usr, HungOffOperandsAllocMarker) { + // Note: If a subclass manipulates the information which is required to + // calculate the Usr memory pointer, e.g. NumUserOperands, the operator + // delete of that subclass has to restore the changed information to the + // original value, since the dtor of that class is not called if the ctor + // fails. + User::operator delete(Usr); + +#ifndef LLVM_ENABLE_EXCEPTIONS + llvm_unreachable("Constructor throws?"); +#endif + } + /// Placement delete - required by std, called if the ctor throws. + void operator delete(void *Usr, IntrusiveOperandsAllocMarker) { // Note: If a subclass manipulates the information which is required to calculate the // Usr memory pointer, e.g. 
NumUserOperands, the operator delete of that subclass has // to restore the changed information to the original value, since the dtor of that class @@ -110,7 +170,7 @@ class User : public Value { #endif } /// Placement delete - required by std, called if the ctor throws. - void operator delete(void *Usr, unsigned, unsigned) { + void operator delete(void *Usr, IntrusiveOperandsAndDescriptorAllocMarker) { // Note: If a subclass manipulates the information which is required to calculate the // Usr memory pointer, e.g. NumUserOperands, the operator delete of that subclass has // to restore the changed information to the original value, since the dtor of that class @@ -195,19 +255,6 @@ class User : public Value { /// Returns the descriptor co-allocated with this User instance. MutableArrayRef getDescriptor(); - /// Set the number of operands on a GlobalVariable. - /// - /// GlobalVariable always allocates space for a single operands, but - /// doesn't always use it. - /// - /// FIXME: As that the number of operands is used to find the start of - /// the allocated memory in operator delete, we need to always think we have - /// 1 operand before delete. - void setGlobalVariableNumOperands(unsigned NumOps) { - assert(NumOps <= 1 && "GlobalVariable can only have 0 or 1 operands"); - NumUserOperands = NumOps; - } - /// Subclasses with hung off uses need to manage the operand count /// themselves. In these instances, the operand count isn't used to find the /// OperandList, so there's no issue in having the operand count change. 
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index e32a54fa346a9ae..6d035d537329577 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -1267,9 +1267,9 @@ static Constant *getSequenceIfElementsMatch(Constant *C, } ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT, - ArrayRef V) - : Constant(T, VT, OperandTraits::op_end(this) - V.size(), - V.size()) { + ArrayRef V, + AllocInfo AllocInfo) + : Constant(T, VT, AllocInfo) { llvm::copy(V, op_begin()); // Check that types match, unless this is an opaque struct. @@ -1282,8 +1282,9 @@ ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT, } } -ConstantArray::ConstantArray(ArrayType *T, ArrayRef V) - : ConstantAggregate(T, ConstantArrayVal, V) { +ConstantArray::ConstantArray(ArrayType *T, ArrayRef V, + AllocInfo AllocInfo) + : ConstantAggregate(T, ConstantArrayVal, V, AllocInfo) { assert(V.size() == T->getNumElements() && "Invalid initializer for constant array"); } @@ -1346,8 +1347,9 @@ StructType *ConstantStruct::getTypeForElements(ArrayRef V, return getTypeForElements(V[0]->getContext(), V, Packed); } -ConstantStruct::ConstantStruct(StructType *T, ArrayRef V) - : ConstantAggregate(T, ConstantStructVal, V) { +ConstantStruct::ConstantStruct(StructType *T, ArrayRef V, + AllocInfo AllocInfo) + : ConstantAggregate(T, ConstantStructVal, V, AllocInfo) { assert((T->isOpaque() || V.size() == T->getNumElements()) && "Invalid initializer for constant struct"); } @@ -1388,8 +1390,9 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef V) { return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V); } -ConstantVector::ConstantVector(VectorType *T, ArrayRef V) - : ConstantAggregate(T, ConstantVectorVal, V) { +ConstantVector::ConstantVector(VectorType *T, ArrayRef V, + AllocInfo AllocInfo) + : ConstantAggregate(T, ConstantVectorVal, V, AllocInfo) { assert(V.size() == cast(T)->getNumElements() && "Invalid initializer for constant vector"); } @@ -1879,7 +1882,7 @@ 
BlockAddress *BlockAddress::get(Function *F, BasicBlock *BB) { BlockAddress::BlockAddress(Function *F, BasicBlock *BB) : Constant(PointerType::get(F->getContext(), F->getAddressSpace()), - Value::BlockAddressVal, &Op<0>(), 2) { + Value::BlockAddressVal, AllocMarker) { setOperand(0, F); setOperand(1, BB); BB->AdjustBlockAddressRefCount(1); @@ -1951,7 +1954,7 @@ DSOLocalEquivalent *DSOLocalEquivalent::get(GlobalValue *GV) { } DSOLocalEquivalent::DSOLocalEquivalent(GlobalValue *GV) - : Constant(GV->getType(), Value::DSOLocalEquivalentVal, &Op<0>(), 1) { + : Constant(GV->getType(), Value::DSOLocalEquivalentVal, AllocMarker) { setOperand(0, GV); } @@ -2009,7 +2012,7 @@ NoCFIValue *NoCFIValue::get(GlobalValue *GV) { } NoCFIValue::NoCFIValue(GlobalValue *GV) - : Constant(GV->getType(), Value::NoCFIValueVal, &Op<0>(), 1) { + : Constant(GV->getType(), Value::NoCFIValueVal, AllocMarker) { setOperand(0, GV); } @@ -2056,7 +2059,7 @@ ConstantPtrAuth *ConstantPtrAuth::getWithSameSchema(Constant *Pointer) const { ConstantPtrAuth::ConstantPtrAuth(Constant *Ptr, ConstantInt *Key, ConstantInt *Disc, Constant *AddrDisc) - : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, &Op<0>(), 4) { + : Constant(Ptr->getType(), Value::ConstantPtrAuthVal, AllocMarker) { assert(Ptr->getType()->isPointerTy()); assert(Key->getBitWidth() == 32); assert(Disc->getBitWidth() == 64); @@ -2758,11 +2761,8 @@ const char *ConstantExpr::getOpcodeName() const { GetElementPtrConstantExpr::GetElementPtrConstantExpr( Type *SrcElementTy, Constant *C, ArrayRef IdxList, Type *DestTy, - std::optional InRange) - : ConstantExpr(DestTy, Instruction::GetElementPtr, - OperandTraits::op_end(this) - - (IdxList.size() + 1), - IdxList.size() + 1), + std::optional InRange, AllocInfo AllocInfo) + : ConstantExpr(DestTy, Instruction::GetElementPtr, AllocInfo), SrcElementTy(SrcElementTy), ResElementTy(GetElementPtrInst::getIndexedType(SrcElementTy, IdxList)), InRange(std::move(InRange)) { diff --git 
a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h index bd19ec6b9dcac01..6afc86ffc73abc1 100644 --- a/llvm/lib/IR/ConstantsContext.h +++ b/llvm/lib/IR/ConstantsContext.h @@ -44,14 +44,16 @@ namespace llvm { /// CastConstantExpr - This class is private to Constants.cpp, and is used /// behind the scenes to implement cast constant exprs. class CastConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{1}; + public: CastConstantExpr(unsigned Opcode, Constant *C, Type *Ty) - : ConstantExpr(Ty, Opcode, &Op<0>(), 1) { + : ConstantExpr(Ty, Opcode, AllocMarker) { Op<0>() = C; } // allocate space for exactly one operand - void *operator new(size_t S) { return User::operator new(S, 1); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); @@ -67,17 +69,19 @@ class CastConstantExpr final : public ConstantExpr { /// BinaryConstantExpr - This class is private to Constants.cpp, and is used /// behind the scenes to implement binary constant exprs. class BinaryConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2, unsigned Flags) - : ConstantExpr(C1->getType(), Opcode, &Op<0>(), 2) { + : ConstantExpr(C1->getType(), Opcode, AllocMarker) { Op<0>() = C1; Op<1>() = C2; SubclassOptionalData = Flags; } // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. 
@@ -95,16 +99,18 @@ class BinaryConstantExpr final : public ConstantExpr { /// Constants.cpp, and is used behind the scenes to implement /// extractelement constant exprs. class ExtractElementConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: ExtractElementConstantExpr(Constant *C1, Constant *C2) - : ConstantExpr(cast(C1->getType())->getElementType(), - Instruction::ExtractElement, &Op<0>(), 2) { + : ConstantExpr(cast(C1->getType())->getElementType(), + Instruction::ExtractElement, AllocMarker) { Op<0>() = C1; Op<1>() = C2; } // allocate space for exactly two operands - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -122,17 +128,18 @@ class ExtractElementConstantExpr final : public ConstantExpr { /// Constants.cpp, and is used behind the scenes to implement /// insertelement constant exprs. class InsertElementConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{3}; + public: InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3) - : ConstantExpr(C1->getType(), Instruction::InsertElement, - &Op<0>(), 3) { + : ConstantExpr(C1->getType(), Instruction::InsertElement, AllocMarker) { Op<0>() = C1; Op<1>() = C2; Op<2>() = C3; } // allocate space for exactly three operands - void *operator new(size_t S) { return User::operator new(S, 3); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -150,12 +157,14 @@ class InsertElementConstantExpr final : public ConstantExpr { /// Constants.cpp, and is used behind the scenes to implement /// shufflevector constant exprs. 
class ShuffleVectorConstantExpr final : public ConstantExpr { + constexpr static IntrusiveOperandsAllocMarker AllocMarker{2}; + public: ShuffleVectorConstantExpr(Constant *C1, Constant *C2, ArrayRef Mask) : ConstantExpr(VectorType::get( cast(C1->getType())->getElementType(), Mask.size(), isa(C1->getType())), - Instruction::ShuffleVector, &Op<0>(), 2) { + Instruction::ShuffleVector, AllocMarker) { assert(ShuffleVectorInst::isValidOperands(C1, C2, Mask) && "Invalid shuffle vector instruction operands!"); Op<0>() = C1; @@ -168,7 +177,7 @@ class ShuffleVectorConstantExpr final : public ConstantExpr { SmallVector ShuffleMask; Constant *ShuffleMaskForBitcode; - void *operator new(size_t S) { return User::operator new(S, 2); } + void *operator new(size_t S) { return User::operator new(S, AllocMarker); } void operator delete(void *Ptr) { return User::operator delete(Ptr); } /// Transparently provide more efficient getOperand methods. @@ -191,15 +200,17 @@ class GetElementPtrConstantExpr : public ConstantExpr { GetElementPtrConstantExpr(Type *SrcElementTy, Constant *C, ArrayRef IdxList, Type *DestTy, - std::optional InRange); + std::optional InRange, + AllocInfo AllocInfo); public: static GetElementPtrConstantExpr * Create(Type *SrcElementTy, Constant *C, ArrayRef IdxList, Type *DestTy, unsigned Flags, std::optional InRange) { - GetElementPtrConstantExpr *Result = new (IdxList.size() + 1) + IntrusiveOperandsAllocMarker AllocMarker{unsigned(IdxList.size() + 1)}; + GetElementPtrConstantExpr *Result = new (AllocMarker) GetElementPtrConstantExpr(SrcElementTy, C, IdxList, DestTy, - std::move(InRange)); + std::move(InRange), AllocMarker); Result->SubclassOptionalData = Flags; return Result; } @@ -318,7 +329,8 @@ template struct ConstantAggrKeyType { using TypeClass = typename ConstantInfo::TypeClass; ConstantClass *create(TypeClass *Ty) const { - return new (Operands.size()) ConstantClass(Ty, Operands); + User::IntrusiveOperandsAllocMarker AllocMarker{unsigned(Operands.size())}; 
+ return new (AllocMarker) ConstantClass(Ty, Operands, AllocMarker); } }; diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index afef8930669e841..82ff4e1bc7f5c5a 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -402,7 +402,7 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N, Module *M) { - auto *F = new Function(Ty, Linkage, AddrSpace, N, M); + auto *F = new (AllocMarker) Function(Ty, Linkage, AddrSpace, N, M); AttrBuilder B(F->getContext()); UWTableKind UWTable = M->getUwtable(); if (UWTable != UWTableKind::None) @@ -501,8 +501,7 @@ static unsigned computeAddrSpace(unsigned AddrSpace, Module *M) { Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &name, Module *ParentModule) - : GlobalObject(Ty, Value::FunctionVal, - OperandTraits::op_begin(this), 0, Linkage, name, + : GlobalObject(Ty, Value::FunctionVal, AllocMarker, Linkage, name, computeAddrSpace(AddrSpace, ParentModule)), NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) { assert(FunctionType::isValidReturnType(getReturnType()) && diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp index 2bc69cdb712b0ab..99f4fa50e9c4337 100644 --- a/llvm/lib/IR/Globals.cpp +++ b/llvm/lib/IR/Globals.cpp @@ -442,9 +442,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, LinkageTypes Link, Constant *InitVal, const Twine &Name, ThreadLocalMode TLMode, unsigned AddressSpace, bool isExternallyInitialized) - : GlobalObject(Ty, Value::GlobalVariableVal, - OperandTraits::op_begin(this), - InitVal != nullptr, Link, Name, AddressSpace), + : GlobalObject(Ty, Value::GlobalVariableVal, AllocMarker, Link, Name, + AddressSpace), isConstantGlobal(constant), isExternallyInitializedConstant(isExternallyInitialized) { assert(!Ty->isFunctionTy() && PointerType::isValidElementType(Ty) && @@ -454,6 +453,8 @@ GlobalVariable::GlobalVariable(Type *Ty, bool constant, 
LinkageTypes Link, assert(InitVal->getType() == Ty && "Initializer should be the same type as the GlobalVariable!"); Op<0>() = InitVal; + } else { + setGlobalVariableNumOperands(0); } } @@ -540,7 +541,7 @@ void GlobalVariable::setCodeModel(CodeModel::Model CM) { GlobalAlias::GlobalAlias(Type *Ty, unsigned AddressSpace, LinkageTypes Link, const Twine &Name, Constant *Aliasee, Module *ParentModule) - : GlobalValue(Ty, Value::GlobalAliasVal, &Op<0>(), 1, Link, Name, + : GlobalValue(Ty, Value::GlobalAliasVal, AllocMarker, Link, Name, AddressSpace) { setAliasee(Aliasee); if (ParentModule) @@ -597,7 +598,7 @@ const GlobalObject *GlobalAlias::getAliaseeObject() const { GlobalIFunc::GlobalIFunc(Type *Ty, unsigned AddressSpace, LinkageTypes Link, const Twine &Name, Constant *Resolver, Module *ParentModule) - : GlobalObject(Ty, Value::GlobalIFuncVal, &Op<0>(), 1, Link, Name, + : GlobalObject(Ty, Value::GlobalIFuncVal, AllocMarker, Link, Name, AddressSpace) { setResolver(Resolver); if (ParentModule) diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 62d88ce21657b21..b1c2b0200c82696 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -32,9 +32,9 @@ InsertPosition::InsertPosition(Instruction *InsertBefore) InsertPosition::InsertPosition(BasicBlock *InsertAtEnd) : InsertAt(InsertAtEnd ? InsertAtEnd->end() : InstListType::iterator()) {} -Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps, +Instruction::Instruction(Type *ty, unsigned it, AllocInfo AllocInfo, InsertPosition InsertBefore) - : User(ty, Value::InstructionVal + it, Ops, NumOps) { + : User(ty, Value::InstructionVal + it, AllocInfo) { // When called with an iterator, there must be a block to insert into. 
if (InstListType::iterator InsertIt = InsertBefore; InsertIt.isValid()) { BasicBlock *BB = InsertIt.getNodeParent(); diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 19da1f60d424d26..e95b98a6404432e 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -121,8 +121,9 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) { //===----------------------------------------------------------------------===// PHINode::PHINode(const PHINode &PN) - : Instruction(PN.getType(), Instruction::PHI, nullptr, PN.getNumOperands()), + : Instruction(PN.getType(), Instruction::PHI, AllocMarker), ReservedSpace(PN.getNumOperands()) { + NumUserOperands = PN.getNumOperands(); allocHungoffUses(PN.getNumOperands()); std::copy(PN.op_begin(), PN.op_end(), op_begin()); copyIncomingBlocks(make_range(PN.block_begin(), PN.block_end())); @@ -243,14 +244,14 @@ bool PHINode::hasConstantOrUndefValue() const { LandingPadInst::LandingPadInst(Type *RetTy, unsigned NumReservedValues, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(RetTy, Instruction::LandingPad, nullptr, 0, InsertBefore) { + : Instruction(RetTy, Instruction::LandingPad, AllocMarker, InsertBefore) { init(NumReservedValues, NameStr); } LandingPadInst::LandingPadInst(const LandingPadInst &LP) - : Instruction(LP.getType(), Instruction::LandingPad, nullptr, - LP.getNumOperands()), + : Instruction(LP.getType(), Instruction::LandingPad, AllocMarker), ReservedSpace(LP.getNumOperands()) { + NumUserOperands = LP.getNumOperands(); allocHungoffUses(LP.getNumOperands()); Use *OL = getOperandList(); const Use *InOL = LP.getOperandList(); @@ -716,16 +717,16 @@ void CallInst::init(FunctionType *FTy, Value *Func, const Twine &NameStr) { } CallInst::CallInst(FunctionType *Ty, Value *Func, const Twine &Name, - InsertPosition InsertBefore) - : CallBase(Ty->getReturnType(), Instruction::Call, - OperandTraits::op_end(this) - 1, 1, InsertBefore) { + AllocInfo 
AllocInfo, InsertPosition InsertBefore) + : CallBase(Ty->getReturnType(), Instruction::Call, AllocInfo, + InsertBefore) { init(Ty, Func, Name); } -CallInst::CallInst(const CallInst &CI) - : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call, - OperandTraits::op_end(this) - CI.getNumOperands(), - CI.getNumOperands()) { +CallInst::CallInst(const CallInst &CI, AllocInfo AllocInfo) + : CallBase(CI.Attrs, CI.FTy, CI.getType(), Instruction::Call, AllocInfo) { + assert(getNumOperands() == CI.getNumOperands() && + "Wrong number of operands allocated"); setTailCallKind(CI.getTailCallKind()); setCallingConv(CI.getCallingConv()); @@ -774,7 +775,7 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, const Twine &NameStr) { this->FTy = FTy; - assert((int)getNumOperands() == + assert(getNumOperands() == ComputeNumOperands(Args.size(), CountBundleInputs(Bundles)) && "NumOperands not set up?"); @@ -803,10 +804,10 @@ void InvokeInst::init(FunctionType *FTy, Value *Fn, BasicBlock *IfNormal, setName(NameStr); } -InvokeInst::InvokeInst(const InvokeInst &II) - : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke, - OperandTraits::op_end(this) - II.getNumOperands(), - II.getNumOperands()) { +InvokeInst::InvokeInst(const InvokeInst &II, AllocInfo AllocInfo) + : CallBase(II.Attrs, II.FTy, II.getType(), Instruction::Invoke, AllocInfo) { + assert(getNumOperands() == II.getNumOperands() && + "Wrong number of operands allocated"); setCallingConv(II.getCallingConv()); std::copy(II.op_begin(), II.op_end(), op_begin()); std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(), @@ -855,9 +856,9 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough, const Twine &NameStr) { this->FTy = FTy; - assert((int)getNumOperands() == - ComputeNumOperands(Args.size(), IndirectDests.size(), - CountBundleInputs(Bundles)) && + assert(getNumOperands() == ComputeNumOperands(Args.size(), + IndirectDests.size(), + CountBundleInputs(Bundles)) && 
"NumOperands not set up?"); #ifndef NDEBUG @@ -887,10 +888,11 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough, setName(NameStr); } -CallBrInst::CallBrInst(const CallBrInst &CBI) +CallBrInst::CallBrInst(const CallBrInst &CBI, AllocInfo AllocInfo) : CallBase(CBI.Attrs, CBI.FTy, CBI.getType(), Instruction::CallBr, - OperandTraits::op_end(this) - CBI.getNumOperands(), - CBI.getNumOperands()) { + AllocInfo) { + assert(getNumOperands() == CBI.getNumOperands() && + "Wrong number of operands allocated"); setCallingConv(CBI.getCallingConv()); std::copy(CBI.op_begin(), CBI.op_end(), op_begin()); std::copy(CBI.bundle_op_info_begin(), CBI.bundle_op_info_end(), @@ -918,19 +920,19 @@ CallBrInst *CallBrInst::Create(CallBrInst *CBI, ArrayRef OpB, // ReturnInst Implementation //===----------------------------------------------------------------------===// -ReturnInst::ReturnInst(const ReturnInst &RI) +ReturnInst::ReturnInst(const ReturnInst &RI, AllocInfo AllocInfo) : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Ret, - OperandTraits::op_end(this) - RI.getNumOperands(), - RI.getNumOperands()) { + AllocInfo) { + assert(getNumOperands() == RI.getNumOperands() && + "Wrong number of operands allocated"); if (RI.getNumOperands()) Op<0>() = RI.Op<0>(); SubclassOptionalData = RI.SubclassOptionalData; } -ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, +ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, AllocInfo AllocInfo, InsertPosition InsertBefore) - : Instruction(Type::getVoidTy(C), Instruction::Ret, - OperandTraits::op_end(this) - !!retVal, !!retVal, + : Instruction(Type::getVoidTy(C), Instruction::Ret, AllocInfo, InsertBefore) { if (retVal) Op<0>() = retVal; @@ -942,13 +944,13 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, ResumeInst::ResumeInst(const ResumeInst &RI) : Instruction(Type::getVoidTy(RI.getContext()), Instruction::Resume, - OperandTraits::op_begin(this), 1) { + AllocMarker) { Op<0>() = RI.Op<0>(); } 
ResumeInst::ResumeInst(Value *Exn, InsertPosition InsertBefore) : Instruction(Type::getVoidTy(Exn->getContext()), Instruction::Resume, - OperandTraits::op_begin(this), 1, InsertBefore) { + AllocMarker, InsertBefore) { Op<0>() = Exn; } @@ -956,11 +958,11 @@ ResumeInst::ResumeInst(Value *Exn, InsertPosition InsertBefore) // CleanupReturnInst Implementation //===----------------------------------------------------------------------===// -CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI) - : Instruction(CRI.getType(), Instruction::CleanupRet, - OperandTraits::op_end(this) - - CRI.getNumOperands(), - CRI.getNumOperands()) { +CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI, + AllocInfo AllocInfo) + : Instruction(CRI.getType(), Instruction::CleanupRet, AllocInfo) { + assert(getNumOperands() == CRI.getNumOperands() && + "Wrong number of operands allocated"); setSubclassData( CRI.getSubclassData()); Op<0>() = CRI.Op<0>(); @@ -978,12 +980,10 @@ void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) { } CleanupReturnInst::CleanupReturnInst(Value *CleanupPad, BasicBlock *UnwindBB, - unsigned Values, + AllocInfo AllocInfo, InsertPosition InsertBefore) : Instruction(Type::getVoidTy(CleanupPad->getContext()), - Instruction::CleanupRet, - OperandTraits::op_end(this) - Values, - Values, InsertBefore) { + Instruction::CleanupRet, AllocInfo, InsertBefore) { init(CleanupPad, UnwindBB); } @@ -997,7 +997,7 @@ void CatchReturnInst::init(Value *CatchPad, BasicBlock *BB) { CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI) : Instruction(Type::getVoidTy(CRI.getContext()), Instruction::CatchRet, - OperandTraits::op_begin(this), 2) { + AllocMarker) { Op<0>() = CRI.Op<0>(); Op<1>() = CRI.Op<1>(); } @@ -1005,8 +1005,7 @@ CatchReturnInst::CatchReturnInst(const CatchReturnInst &CRI) CatchReturnInst::CatchReturnInst(Value *CatchPad, BasicBlock *BB, InsertPosition InsertBefore) : Instruction(Type::getVoidTy(BB->getContext()), 
Instruction::CatchRet, - OperandTraits::op_begin(this), 2, - InsertBefore) { + AllocMarker, InsertBefore) { init(CatchPad, BB); } @@ -1018,7 +1017,7 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest, unsigned NumReservedValues, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(ParentPad->getType(), Instruction::CatchSwitch, nullptr, 0, + : Instruction(ParentPad->getType(), Instruction::CatchSwitch, AllocMarker, InsertBefore) { if (UnwindDest) ++NumReservedValues; @@ -1027,8 +1026,8 @@ CatchSwitchInst::CatchSwitchInst(Value *ParentPad, BasicBlock *UnwindDest, } CatchSwitchInst::CatchSwitchInst(const CatchSwitchInst &CSI) - : Instruction(CSI.getType(), Instruction::CatchSwitch, nullptr, - CSI.getNumOperands()) { + : Instruction(CSI.getType(), Instruction::CatchSwitch, AllocMarker) { + NumUserOperands = CSI.NumUserOperands; init(CSI.getParentPad(), CSI.getUnwindDest(), CSI.getNumOperands()); setNumHungOffUseOperands(ReservedSpace); Use *OL = getOperandList(); @@ -1093,22 +1092,19 @@ void FuncletPadInst::init(Value *ParentPad, ArrayRef Args, setName(NameStr); } -FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI) - : Instruction(FPI.getType(), FPI.getOpcode(), - OperandTraits::op_end(this) - - FPI.getNumOperands(), - FPI.getNumOperands()) { +FuncletPadInst::FuncletPadInst(const FuncletPadInst &FPI, AllocInfo AllocInfo) + : Instruction(FPI.getType(), FPI.getOpcode(), AllocInfo) { + assert(getNumOperands() == FPI.getNumOperands() && + "Wrong number of operands allocated"); std::copy(FPI.op_begin(), FPI.op_end(), op_begin()); setParentPad(FPI.getParentPad()); } FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, - ArrayRef Args, unsigned Values, + ArrayRef Args, AllocInfo AllocInfo, const Twine &NameStr, InsertPosition InsertBefore) - : Instruction(ParentPad->getType(), Op, - OperandTraits::op_end(this) - Values, Values, - InsertBefore) { + : Instruction(ParentPad->getType(), Op, AllocInfo, 
InsertBefore) { init(ParentPad, Args, NameStr); } @@ -1118,8 +1114,8 @@ FuncletPadInst::FuncletPadInst(Instruction::FuncletPadOps Op, Value *ParentPad, UnreachableInst::UnreachableInst(LLVMContext &Context, InsertPosition InsertBefore) - : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr, - 0, InsertBefore) {} + : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, + AllocMarker, InsertBefore) {} //===----------------------------------------------------------------------===// // BranchInst Implementation @@ -1131,19 +1127,18 @@ void BranchInst::AssertOK() { "May only branch on boolean predicates!"); } -BranchInst::BranchInst(BasicBlock *IfTrue, InsertPosition InsertBefore) +BranchInst::BranchInst(BasicBlock *IfTrue, AllocInfo AllocInfo, + InsertPosition InsertBefore) : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br, - OperandTraits::op_end(this) - 1, 1, - InsertBefore) { + AllocInfo, InsertBefore) { assert(IfTrue && "Branch destination may not be null!"); Op<-1>() = IfTrue; } BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, - InsertPosition InsertBefore) + AllocInfo AllocInfo, InsertPosition InsertBefore) : Instruction(Type::getVoidTy(IfTrue->getContext()), Instruction::Br, - OperandTraits::op_end(this) - 3, 3, - InsertBefore) { + AllocInfo, InsertBefore) { // Assign in order of operand index to make use-list order predictable. 
Op<-3>() = Cond; Op<-2>() = IfFalse; @@ -1153,10 +1148,11 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, #endif } -BranchInst::BranchInst(const BranchInst &BI) +BranchInst::BranchInst(const BranchInst &BI, AllocInfo AllocInfo) : Instruction(Type::getVoidTy(BI.getContext()), Instruction::Br, - OperandTraits::op_end(this) - BI.getNumOperands(), - BI.getNumOperands()) { + AllocInfo) { + assert(getNumOperands() == BI.getNumOperands() && + "Wrong number of operands allocated"); // Assign in order of operand index to make use-list order predictable. if (BI.getNumOperands() != 1) { assert(BI.getNumOperands() == 3 && "BR can have 1 or 3 operands!"); @@ -1313,9 +1309,8 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align, StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align, AtomicOrdering Order, SyncScope::ID SSID, InsertPosition InsertBefore) - : Instruction(Type::getVoidTy(val->getContext()), Store, - OperandTraits::op_begin(this), - OperandTraits::operands(this), InsertBefore) { + : Instruction(Type::getVoidTy(val->getContext()), Store, AllocMarker, + InsertBefore) { Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); @@ -1356,8 +1351,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, InsertPosition InsertBefore) : Instruction( StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())), - AtomicCmpXchg, OperandTraits::op_begin(this), - OperandTraits::operands(this), InsertBefore) { + AtomicCmpXchg, AllocMarker, InsertBefore) { Init(Ptr, Cmp, NewVal, Alignment, SuccessOrdering, FailureOrdering, SSID); } @@ -1389,9 +1383,7 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val, AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val, Align Alignment, AtomicOrdering Ordering, SyncScope::ID SSID, InsertPosition InsertBefore) - : Instruction(Val->getType(), AtomicRMW, - OperandTraits::op_begin(this), - 
OperandTraits::operands(this), InsertBefore) { + : Instruction(Val->getType(), AtomicRMW, AllocMarker, InsertBefore) { Init(Operation, Ptr, Val, Alignment, Ordering, SSID); } @@ -1448,7 +1440,7 @@ StringRef AtomicRMWInst::getOperationName(BinOp Op) { FenceInst::FenceInst(LLVMContext &C, AtomicOrdering Ordering, SyncScope::ID SSID, InsertPosition InsertBefore) - : Instruction(Type::getVoidTy(C), Fence, nullptr, 0, InsertBefore) { + : Instruction(Type::getVoidTy(C), Fence, AllocMarker, InsertBefore) { setOrdering(Ordering); setSyncScopeID(SSID); } @@ -1466,13 +1458,13 @@ void GetElementPtrInst::init(Value *Ptr, ArrayRef IdxList, setName(Name); } -GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI) - : Instruction(GEPI.getType(), GetElementPtr, - OperandTraits::op_end(this) - - GEPI.getNumOperands(), - GEPI.getNumOperands()), +GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI, + AllocInfo AllocInfo) + : Instruction(GEPI.getType(), GetElementPtr, AllocInfo), SourceElementType(GEPI.SourceElementType), ResultElementType(GEPI.ResultElementType) { + assert(getNumOperands() == GEPI.getNumOperands() && + "Wrong number of operands allocated"); std::copy(GEPI.op_begin(), GEPI.op_end(), op_begin()); SubclassOptionalData = GEPI.SubclassOptionalData; } @@ -1606,9 +1598,8 @@ bool GetElementPtrInst::collectOffset( ExtractElementInst::ExtractElementInst(Value *Val, Value *Index, const Twine &Name, InsertPosition InsertBef) - : Instruction( - cast(Val->getType())->getElementType(), ExtractElement, - OperandTraits::op_begin(this), 2, InsertBef) { + : Instruction(cast(Val->getType())->getElementType(), + ExtractElement, AllocMarker, InsertBef) { assert(isValidOperands(Val, Index) && "Invalid extractelement instruction operands!"); Op<0>() = Val; @@ -1629,9 +1620,7 @@ bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) { InsertElementInst::InsertElementInst(Value *Vec, Value *Elt, Value *Index, const Twine &Name, InsertPosition 
InsertBef) - : Instruction(Vec->getType(), InsertElement, - OperandTraits::op_begin(this), 3, - InsertBef) { + : Instruction(Vec->getType(), InsertElement, AllocMarker, InsertBef) { assert(isValidOperands(Vec, Elt, Index) && "Invalid insertelement instruction operands!"); Op<0>() = Vec; @@ -1679,8 +1668,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask, : Instruction( VectorType::get(cast(V1->getType())->getElementType(), cast(Mask->getType())->getElementCount()), - ShuffleVector, OperandTraits::op_begin(this), - OperandTraits::operands(this), InsertBefore) { + ShuffleVector, AllocMarker, InsertBefore) { assert(isValidOperands(V1, V2, Mask) && "Invalid shuffle vector instruction operands!"); @@ -1698,8 +1686,7 @@ ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef Mask, : Instruction( VectorType::get(cast(V1->getType())->getElementType(), Mask.size(), isa(V1->getType())), - ShuffleVector, OperandTraits::op_begin(this), - OperandTraits::operands(this), InsertBefore) { + ShuffleVector, AllocMarker, InsertBefore) { assert(isValidOperands(V1, V2, Mask) && "Invalid shuffle vector instruction operands!"); Op<0>() = V1; @@ -2464,9 +2451,8 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef Idxs, } InsertValueInst::InsertValueInst(const InsertValueInst &IVI) - : Instruction(IVI.getType(), InsertValue, - OperandTraits::op_begin(this), 2), - Indices(IVI.Indices) { + : Instruction(IVI.getType(), InsertValue, AllocMarker), + Indices(IVI.Indices) { Op<0>() = IVI.getOperand(0); Op<1>() = IVI.getOperand(1); SubclassOptionalData = IVI.SubclassOptionalData; @@ -2565,8 +2551,7 @@ void UnaryOperator::AssertOK() { BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty, const Twine &Name, InsertPosition InsertBefore) - : Instruction(Ty, iType, OperandTraits::op_begin(this), - OperandTraits::operands(this), InsertBefore) { + : Instruction(Ty, iType, AllocMarker, InsertBefore) { Op<0>() = S1; Op<1>() = S2; 
setName(Name); @@ -3427,8 +3412,7 @@ AddrSpaceCastInst::AddrSpaceCastInst(Value *S, Type *Ty, const Twine &Name, CmpInst::CmpInst(Type *ty, OtherOps op, Predicate predicate, Value *LHS, Value *RHS, const Twine &Name, InsertPosition InsertBefore, Instruction *FlagsSource) - : Instruction(ty, op, OperandTraits::op_begin(this), - OperandTraits::operands(this), InsertBefore) { + : Instruction(ty, op, AllocMarker, InsertBefore) { Op<0>() = LHS; Op<1>() = RHS; setPredicate((Predicate)predicate); @@ -3918,12 +3902,12 @@ void SwitchInst::init(Value *Value, BasicBlock *Default, unsigned NumReserved) { SwitchInst::SwitchInst(Value *Value, BasicBlock *Default, unsigned NumCases, InsertPosition InsertBefore) : Instruction(Type::getVoidTy(Value->getContext()), Instruction::Switch, - nullptr, 0, InsertBefore) { + AllocMarker, InsertBefore) { init(Value, Default, 2+NumCases*2); } SwitchInst::SwitchInst(const SwitchInst &SI) - : Instruction(SI.getType(), Instruction::Switch, nullptr, 0) { + : Instruction(SI.getType(), Instruction::Switch, AllocMarker) { init(SI.getCondition(), SI.getDefaultDest(), SI.getNumOperands()); setNumHungOffUseOperands(SI.getNumOperands()); Use *OL = getOperandList(); @@ -4125,13 +4109,14 @@ void IndirectBrInst::growOperands() { IndirectBrInst::IndirectBrInst(Value *Address, unsigned NumCases, InsertPosition InsertBefore) : Instruction(Type::getVoidTy(Address->getContext()), - Instruction::IndirectBr, nullptr, 0, InsertBefore) { + Instruction::IndirectBr, AllocMarker, InsertBefore) { init(Address, NumCases); } IndirectBrInst::IndirectBrInst(const IndirectBrInst &IBI) : Instruction(Type::getVoidTy(IBI.getContext()), Instruction::IndirectBr, - nullptr, IBI.getNumOperands()) { + AllocMarker) { + NumUserOperands = IBI.NumUserOperands; allocHungoffUses(IBI.getNumOperands()); Use *OL = getOperandList(); const Use *InOL = IBI.getOperandList(); @@ -4185,7 +4170,8 @@ FreezeInst::FreezeInst(Value *S, const Twine &Name, InsertPosition InsertBefore) // unit that uses 
these classes. GetElementPtrInst *GetElementPtrInst::cloneImpl() const { - return new (getNumOperands()) GetElementPtrInst(*this); + IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()}; + return new (AllocMarker) GetElementPtrInst(*this, AllocMarker); } UnaryOperator *UnaryOperator::cloneImpl() const { @@ -4305,10 +4291,13 @@ AddrSpaceCastInst *AddrSpaceCastInst::cloneImpl() const { CallInst *CallInst::cloneImpl() const { if (hasOperandBundles()) { - unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo); - return new(getNumOperands(), DescriptorBytes) CallInst(*this); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + getNumOperands(), + getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))}; + return new (AllocMarker) CallInst(*this, AllocMarker); } - return new(getNumOperands()) CallInst(*this); + IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()}; + return new (AllocMarker) CallInst(*this, AllocMarker); } SelectInst *SelectInst::cloneImpl() const { @@ -4331,18 +4320,20 @@ ShuffleVectorInst *ShuffleVectorInst::cloneImpl() const { return new ShuffleVectorInst(getOperand(0), getOperand(1), getShuffleMask()); } -PHINode *PHINode::cloneImpl() const { return new PHINode(*this); } +PHINode *PHINode::cloneImpl() const { return new (AllocMarker) PHINode(*this); } LandingPadInst *LandingPadInst::cloneImpl() const { return new LandingPadInst(*this); } ReturnInst *ReturnInst::cloneImpl() const { - return new(getNumOperands()) ReturnInst(*this); + IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()}; + return new (AllocMarker) ReturnInst(*this, AllocMarker); } BranchInst *BranchInst::cloneImpl() const { - return new(getNumOperands()) BranchInst(*this); + IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()}; + return new (AllocMarker) BranchInst(*this, AllocMarker); } SwitchInst *SwitchInst::cloneImpl() const { return new SwitchInst(*this); } @@ -4353,28 +4344,37 @@ IndirectBrInst *IndirectBrInst::cloneImpl() const { 
InvokeInst *InvokeInst::cloneImpl() const { if (hasOperandBundles()) { - unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo); - return new(getNumOperands(), DescriptorBytes) InvokeInst(*this); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + getNumOperands(), + getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))}; + return new (AllocMarker) InvokeInst(*this, AllocMarker); } - return new(getNumOperands()) InvokeInst(*this); + IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()}; + return new (AllocMarker) InvokeInst(*this, AllocMarker); } CallBrInst *CallBrInst::cloneImpl() const { if (hasOperandBundles()) { - unsigned DescriptorBytes = getNumOperandBundles() * sizeof(BundleOpInfo); - return new (getNumOperands(), DescriptorBytes) CallBrInst(*this); + IntrusiveOperandsAndDescriptorAllocMarker AllocMarker{ + getNumOperands(), + getNumOperandBundles() * unsigned(sizeof(BundleOpInfo))}; + return new (AllocMarker) CallBrInst(*this, AllocMarker); } - return new (getNumOperands()) CallBrInst(*this); + IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()}; + return new (AllocMarker) CallBrInst(*this, AllocMarker); } -ResumeInst *ResumeInst::cloneImpl() const { return new (1) ResumeInst(*this); } +ResumeInst *ResumeInst::cloneImpl() const { + return new (AllocMarker) ResumeInst(*this); +} CleanupReturnInst *CleanupReturnInst::cloneImpl() const { - return new (getNumOperands()) CleanupReturnInst(*this); + IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()}; + return new (AllocMarker) CleanupReturnInst(*this, AllocMarker); } CatchReturnInst *CatchReturnInst::cloneImpl() const { - return new (getNumOperands()) CatchReturnInst(*this); + return new (AllocMarker) CatchReturnInst(*this); } CatchSwitchInst *CatchSwitchInst::cloneImpl() const { @@ -4382,7 +4382,8 @@ CatchSwitchInst *CatchSwitchInst::cloneImpl() const { } FuncletPadInst *FuncletPadInst::cloneImpl() const { - return new (getNumOperands()) FuncletPadInst(*this); + 
IntrusiveOperandsAllocMarker AllocMarker{getNumOperands()}; + return new (AllocMarker) FuncletPadInst(*this, AllocMarker); } UnreachableInst *UnreachableInst::cloneImpl() const { diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp index 00dd9c72c469cbf..b0aa785deb9afd8 100644 --- a/llvm/lib/IR/User.cpp +++ b/llvm/lib/IR/User.cpp @@ -145,10 +145,7 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us, ::operator new(Size + sizeof(Use) * Us + DescBytesToAllocate)); Use *Start = reinterpret_cast(Storage + DescBytesToAllocate); Use *End = Start + Us; - User *Obj = reinterpret_cast(End); - Obj->NumUserOperands = Us; - Obj->HasHungOffUses = false; - Obj->HasDescriptor = DescBytes != 0; + User *Obj = reinterpret_cast(End); for (; Start != End; Start++) new (Start) Use(Obj); @@ -160,22 +157,21 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us, return Obj; } -void *User::operator new(size_t Size, unsigned Us) { - return allocateFixedOperandUser(Size, Us, 0); +void *User::operator new(size_t Size, IntrusiveOperandsAllocMarker allocTrait) { + return allocateFixedOperandUser(Size, allocTrait.NumOps, 0); } -void *User::operator new(size_t Size, unsigned Us, unsigned DescBytes) { - return allocateFixedOperandUser(Size, Us, DescBytes); +void *User::operator new(size_t Size, + IntrusiveOperandsAndDescriptorAllocMarker allocTrait) { + return allocateFixedOperandUser(Size, allocTrait.NumOps, + allocTrait.DescBytes); } -void *User::operator new(size_t Size) { +void *User::operator new(size_t Size, HungOffOperandsAllocMarker) { // Allocate space for a single Use* void *Storage = ::operator new(Size + sizeof(Use *)); Use **HungOffOperandList = static_cast(Storage); User *Obj = reinterpret_cast(HungOffOperandList + 1); - Obj->NumUserOperands = 0; - Obj->HasHungOffUses = true; - Obj->HasDescriptor = false; *HungOffOperandList = nullptr; return Obj; } From 666a3f4ed4f62a9b1b732dae6a34a66d31217563 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 11 Sep 
2024 11:36:07 -0700 Subject: [PATCH 24/94] [libc] Stub TLS functions on the GPU temporarily (#108267) Summary: There's an extern weak symbol for this, we should just factor these into a more common interface. Stub them temporarily to make the bots happy. PTXAS does not handle extern weak. --- libc/startup/gpu/amdgpu/start.cpp | 3 +++ libc/startup/gpu/nvptx/start.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index f09541b0d558081..8bd0c3a938d0291 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -17,6 +17,9 @@ extern "C" int main(int argc, char **argv, char **envp); namespace LIBC_NAMESPACE_DECL { +// FIXME: Factor this out into common logic so we don't need to stub it here. +void teardown_main_tls() {} + DataEnvironment app; extern "C" uintptr_t __init_array_start[]; diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index ef1e63e5161a61a..bc529b36f50970b 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -19,6 +19,9 @@ namespace LIBC_NAMESPACE_DECL { DataEnvironment app; +// FIXME: Factor this out into common logic so we don't need to stub it here. +void teardown_main_tls() {} + extern "C" { // Nvidia's 'nvlink' linker does not provide these symbols. We instead need // to manually create them and update the globals in the loader implememtation. From bd4e0dfa945fb7fe73801bcee63c21aa8123b928 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 11 Sep 2024 11:40:16 -0700 Subject: [PATCH 25/94] [SandboxIR][Bench] Benchmark RUOW (#107456) This patch adds a benchmark for ReplaceUsesOfWith(). 
--- llvm/benchmarks/SandboxIRBench.cpp | 42 ++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp index c646ba6290daffe..6cff2eefdb19f2d 100644 --- a/llvm/benchmarks/SandboxIRBench.cpp +++ b/llvm/benchmarks/SandboxIRBench.cpp @@ -134,6 +134,45 @@ template static void RAUW(benchmark::State &State) { } } +static std::string generateRUOWIR(unsigned NumOperands) { + std::stringstream SS; + auto GenOps = [&SS, NumOperands]() { + for (auto Cnt : seq(0, NumOperands)) { + SS << "i8 %arg" << Cnt; + bool IsLast = Cnt + 1 == NumOperands; + if (!IsLast) + SS << ", "; + } + }; + + SS << "define void @foo("; + GenOps(); + SS << ") {\n"; + + SS << " call void @foo("; + GenOps(); + SS << ")\n"; + SS << "ret void"; + SS << "}"; + return SS.str(); +} + +template static void RUOW(benchmark::State &State) { + LLVMContext LLVMCtx; + sandboxir::Context Ctx(LLVMCtx); + std::unique_ptr LLVMM; + unsigned NumOperands = State.range(0); + auto *BB = genIR(LLVMM, LLVMCtx, Ctx, generateRUOWIR, NumOperands); + + auto It = BB->begin(); + auto *F = BB->getParent(); + auto *Arg0 = F->getArg(0); + auto *Arg1 = F->getArg(1); + auto *Call = &*It++; + for (auto _ : State) + Call->replaceUsesOfWith(Arg0, Arg1); +} + BENCHMARK(GetType); BENCHMARK(GetType); @@ -143,4 +182,7 @@ BENCHMARK(BBWalk)->Args({1024}); BENCHMARK(RAUW)->Args({512}); BENCHMARK(RAUW)->Args({512}); +BENCHMARK(RUOW)->Args({4096}); +BENCHMARK(RUOW)->Args({4096}); + BENCHMARK_MAIN(); From c3d39cbb9a5e76f253c865dd544ccdf8eec95029 Mon Sep 17 00:00:00 2001 From: Marc Auberer Date: Wed, 11 Sep 2024 20:55:03 +0200 Subject: [PATCH 26/94] [ADT][NFC] Constexpr-ify if in DenseMap::clear (#108243) Make if constexpr due to constexpr condition. 
--- llvm/include/llvm/ADT/DenseMap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/ADT/DenseMap.h b/llvm/include/llvm/ADT/DenseMap.h index fb267cf5cee1c64..083d5c9388f7c8a 100644 --- a/llvm/include/llvm/ADT/DenseMap.h +++ b/llvm/include/llvm/ADT/DenseMap.h @@ -120,7 +120,7 @@ class DenseMapBase : public DebugEpochBase { } const KeyT EmptyKey = getEmptyKey(); - if (std::is_trivially_destructible::value) { + if constexpr (std::is_trivially_destructible_v) { // Use a simpler loop when values don't need destruction. for (BucketT *P = getBuckets(), *E = getBucketsEnd(); P != E; ++P) P->getFirst() = EmptyKey; From bbff52bfd49336bc0fdc83d8dfc616266bc07cbf Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 11 Sep 2024 14:59:25 -0400 Subject: [PATCH 27/94] [libc++] Guard PSTL headers with >= C++17 (#108234) Otherwise we fail to build with modules in C++03 mode once we migrate to a single top-level module, because those headers get pulled in but they don't compile as C++03. 
--- libcxx/include/__pstl/backend.h | 24 +++++++++++-------- libcxx/include/__pstl/backend_fwd.h | 16 ++++++++----- libcxx/include/__pstl/backends/default.h | 4 ++++ libcxx/include/__pstl/backends/libdispatch.h | 12 ++++++---- libcxx/include/__pstl/backends/serial.h | 4 ++++ libcxx/include/__pstl/backends/std_thread.h | 4 ++++ libcxx/include/__pstl/cpu_algos/any_of.h | 4 ++++ libcxx/include/__pstl/cpu_algos/cpu_traits.h | 4 ++++ libcxx/include/__pstl/cpu_algos/fill.h | 4 ++++ libcxx/include/__pstl/cpu_algos/find_if.h | 4 ++++ libcxx/include/__pstl/cpu_algos/for_each.h | 4 ++++ libcxx/include/__pstl/cpu_algos/merge.h | 4 ++++ libcxx/include/__pstl/cpu_algos/stable_sort.h | 4 ++++ libcxx/include/__pstl/cpu_algos/transform.h | 4 ++++ .../__pstl/cpu_algos/transform_reduce.h | 4 ++++ libcxx/include/__pstl/dispatch.h | 4 ++++ libcxx/include/__pstl/handle_exception.h | 4 ++++ 17 files changed, 88 insertions(+), 20 deletions(-) diff --git a/libcxx/include/__pstl/backend.h b/libcxx/include/__pstl/backend.h index 86d9f28c77fa8c5..5980b0708cd340a 100644 --- a/libcxx/include/__pstl/backend.h +++ b/libcxx/include/__pstl/backend.h @@ -19,16 +19,20 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> -#if defined(_LIBCPP_PSTL_BACKEND_SERIAL) -# include <__pstl/backends/default.h> -# include <__pstl/backends/serial.h> -#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) -# include <__pstl/backends/default.h> -# include <__pstl/backends/std_thread.h> -#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) -# include <__pstl/backends/default.h> -# include <__pstl/backends/libdispatch.h> -#endif +#if _LIBCPP_STD_VER >= 17 + +# if defined(_LIBCPP_PSTL_BACKEND_SERIAL) +# include <__pstl/backends/default.h> +# include <__pstl/backends/serial.h> +# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) +# include <__pstl/backends/default.h> +# include <__pstl/backends/std_thread.h> +# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) +# include <__pstl/backends/default.h> +# include <__pstl/backends/libdispatch.h> 
+# endif + +#endif // _LIBCPP_STD_VER >= 17 _LIBCPP_POP_MACROS diff --git a/libcxx/include/__pstl/backend_fwd.h b/libcxx/include/__pstl/backend_fwd.h index 32c5da576fb3c06..2132e8dbceb3ad2 100644 --- a/libcxx/include/__pstl/backend_fwd.h +++ b/libcxx/include/__pstl/backend_fwd.h @@ -39,6 +39,8 @@ _LIBCPP_PUSH_MACROS // the user. // +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -50,18 +52,18 @@ struct __libdispatch_backend_tag; struct __serial_backend_tag; struct __std_thread_backend_tag; -#if defined(_LIBCPP_PSTL_BACKEND_SERIAL) +# if defined(_LIBCPP_PSTL_BACKEND_SERIAL) using __current_configuration = __backend_configuration<__serial_backend_tag, __default_backend_tag>; -#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) +# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) using __current_configuration = __backend_configuration<__std_thread_backend_tag, __default_backend_tag>; -#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) +# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) using __current_configuration = __backend_configuration<__libdispatch_backend_tag, __default_backend_tag>; -#else +# else // ...New vendors can add parallel backends here... 
-# error "Invalid PSTL backend configuration" -#endif +# error "Invalid PSTL backend configuration" +# endif template struct __find_if; @@ -296,6 +298,8 @@ struct __reduce; } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKEND_FWD_H diff --git a/libcxx/include/__pstl/backends/default.h b/libcxx/include/__pstl/backends/default.h index b655da51fe340b4..3672bbf60a265e1 100644 --- a/libcxx/include/__pstl/backends/default.h +++ b/libcxx/include/__pstl/backends/default.h @@ -33,6 +33,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -498,6 +500,8 @@ struct __rotate_copy<__default_backend_tag, _ExecutionPolicy> { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKENDS_DEFAULT_H diff --git a/libcxx/include/__pstl/backends/libdispatch.h b/libcxx/include/__pstl/backends/libdispatch.h index a0c3ad980ed1b0c..2d6ab49c8f7f3a2 100644 --- a/libcxx/include/__pstl/backends/libdispatch.h +++ b/libcxx/include/__pstl/backends/libdispatch.h @@ -44,6 +44,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -140,15 +142,15 @@ struct __cpu_traits<__libdispatch_backend_tag> { unique_ptr<__merge_range_t[], decltype(__destroy)> __ranges( [&]() -> __merge_range_t* { -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS try { -#endif +# endif return std::allocator<__merge_range_t>().allocate(__n_ranges); -#ifndef _LIBCPP_HAS_NO_EXCEPTIONS +# ifndef _LIBCPP_HAS_NO_EXCEPTIONS } catch (const std::bad_alloc&) { return nullptr; } -#endif +# endif }(), __destroy); @@ -392,6 +394,8 @@ struct __fill<__libdispatch_backend_tag, _ExecutionPolicy> } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // 
_LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H diff --git a/libcxx/include/__pstl/backends/serial.h b/libcxx/include/__pstl/backends/serial.h index 5f24499899bd201..f4142016ccc7927 100644 --- a/libcxx/include/__pstl/backends/serial.h +++ b/libcxx/include/__pstl/backends/serial.h @@ -30,6 +30,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -176,6 +178,8 @@ struct __transform_reduce_binary<__serial_backend_tag, _ExecutionPolicy> { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKENDS_SERIAL_H diff --git a/libcxx/include/__pstl/backends/std_thread.h b/libcxx/include/__pstl/backends/std_thread.h index 49570bd30b0828d..19b985f860a1744 100644 --- a/libcxx/include/__pstl/backends/std_thread.h +++ b/libcxx/include/__pstl/backends/std_thread.h @@ -32,6 +32,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -131,6 +133,8 @@ struct __fill<__std_thread_backend_tag, _ExecutionPolicy> } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_BACKENDS_STD_THREAD_H diff --git a/libcxx/include/__pstl/cpu_algos/any_of.h b/libcxx/include/__pstl/cpu_algos/any_of.h index b33c787a29db265..803db7974eca7b5 100644 --- a/libcxx/include/__pstl/cpu_algos/any_of.h +++ b/libcxx/include/__pstl/cpu_algos/any_of.h @@ -26,6 +26,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -94,6 +96,8 @@ struct __cpu_parallel_any_of { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_ANY_OF_H diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h index 0483d6918fd01d9..5e59752fa5723cc 100644 --- 
a/libcxx/include/__pstl/cpu_algos/cpu_traits.h +++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h @@ -19,6 +19,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -81,6 +83,8 @@ struct __cpu_traits; } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_CPU_TRAITS_H diff --git a/libcxx/include/__pstl/cpu_algos/fill.h b/libcxx/include/__pstl/cpu_algos/fill.h index 4e6d29b30cc69df..3e5936589a6a631 100644 --- a/libcxx/include/__pstl/cpu_algos/fill.h +++ b/libcxx/include/__pstl/cpu_algos/fill.h @@ -23,6 +23,8 @@ # pragma GCC system_header #endif +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -63,4 +65,6 @@ struct __cpu_parallel_fill { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + #endif // _LIBCPP___PSTL_CPU_ALGOS_FILL_H diff --git a/libcxx/include/__pstl/cpu_algos/find_if.h b/libcxx/include/__pstl/cpu_algos/find_if.h index 12b2e88971df7d7..cd92e5a99f12f85 100644 --- a/libcxx/include/__pstl/cpu_algos/find_if.h +++ b/libcxx/include/__pstl/cpu_algos/find_if.h @@ -31,6 +31,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -132,6 +134,8 @@ struct __cpu_parallel_find_if { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_FIND_IF_H diff --git a/libcxx/include/__pstl/cpu_algos/for_each.h b/libcxx/include/__pstl/cpu_algos/for_each.h index d4d7862135ff918..cec719bc47b88cd 100644 --- a/libcxx/include/__pstl/cpu_algos/for_each.h +++ b/libcxx/include/__pstl/cpu_algos/for_each.h @@ -23,6 +23,8 @@ # pragma GCC system_header #endif +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -63,4 +65,6 @@ struct __cpu_parallel_for_each { } // namespace __pstl 
_LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + #endif // _LIBCPP___PSTL_CPU_ALGOS_FOR_EACH_H diff --git a/libcxx/include/__pstl/cpu_algos/merge.h b/libcxx/include/__pstl/cpu_algos/merge.h index dfa4cbf69b14701..a9069ca51de2f55 100644 --- a/libcxx/include/__pstl/cpu_algos/merge.h +++ b/libcxx/include/__pstl/cpu_algos/merge.h @@ -26,6 +26,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -80,6 +82,8 @@ struct __cpu_parallel_merge { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_MERGE_H diff --git a/libcxx/include/__pstl/cpu_algos/stable_sort.h b/libcxx/include/__pstl/cpu_algos/stable_sort.h index 8e64f3e537c0726..5afdd3fd629ba99 100644 --- a/libcxx/include/__pstl/cpu_algos/stable_sort.h +++ b/libcxx/include/__pstl/cpu_algos/stable_sort.h @@ -21,6 +21,8 @@ # pragma GCC system_header #endif +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -44,4 +46,6 @@ struct __cpu_parallel_stable_sort { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + #endif // _LIBCPP___PSTL_CPU_ALGOS_STABLE_SORT_H diff --git a/libcxx/include/__pstl/cpu_algos/transform.h b/libcxx/include/__pstl/cpu_algos/transform.h index 27ce8e27b242af6..979121be8c8c9aa 100644 --- a/libcxx/include/__pstl/cpu_algos/transform.h +++ b/libcxx/include/__pstl/cpu_algos/transform.h @@ -27,6 +27,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -148,6 +150,8 @@ struct __cpu_parallel_transform_binary { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_H diff --git a/libcxx/include/__pstl/cpu_algos/transform_reduce.h b/libcxx/include/__pstl/cpu_algos/transform_reduce.h index 36ac1a9072a89eb..aafbf1ca96b40d2 
100644 --- a/libcxx/include/__pstl/cpu_algos/transform_reduce.h +++ b/libcxx/include/__pstl/cpu_algos/transform_reduce.h @@ -31,6 +31,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -211,6 +213,8 @@ struct __cpu_parallel_transform_reduce { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_CPU_ALGOS_TRANSFORM_REDUCE_H diff --git a/libcxx/include/__pstl/dispatch.h b/libcxx/include/__pstl/dispatch.h index 5e903f7524fe9b2..ea40fa79eb9496c 100644 --- a/libcxx/include/__pstl/dispatch.h +++ b/libcxx/include/__pstl/dispatch.h @@ -23,6 +23,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -61,6 +63,8 @@ using __dispatch = typename __find_first_implemented<_Algorithm, _BackendConfigu } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_DISPATCH_H diff --git a/libcxx/include/__pstl/handle_exception.h b/libcxx/include/__pstl/handle_exception.h index d6270958c3a7c70..57dfcfde4554f81 100644 --- a/libcxx/include/__pstl/handle_exception.h +++ b/libcxx/include/__pstl/handle_exception.h @@ -22,6 +22,8 @@ _LIBCPP_PUSH_MACROS #include <__undef_macros> +#if _LIBCPP_STD_VER >= 17 + _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { @@ -52,6 +54,8 @@ _LIBCPP_HIDE_FROM_ABI auto __handle_exception(_Args&&... __args) { } // namespace __pstl _LIBCPP_END_NAMESPACE_STD +#endif // _LIBCPP_STD_VER >= 17 + _LIBCPP_POP_MACROS #endif // _LIBCPP___PSTL_HANDLE_EXCEPTION_H From 118f120eaab8d763b28c71f0d2e2c1e0c752832b Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 11 Sep 2024 14:59:43 -0400 Subject: [PATCH 28/94] [libc++] Get rid of experimental/__config (#108233) It doesn't serve much of a purpose since we can easily put its contents inside __config. 
Removing it simplifies the modulemap once we are trying to create a single top-level module. --- libcxx/include/CMakeLists.txt | 1 - libcxx/include/__config | 9 ++++ libcxx/include/experimental/__config | 45 ------------------- .../include/experimental/__simd/aligned_tag.h | 2 +- .../include/experimental/__simd/declaration.h | 9 +++- .../include/experimental/__simd/reference.h | 2 +- libcxx/include/experimental/__simd/scalar.h | 2 +- libcxx/include/experimental/__simd/simd.h | 2 +- .../include/experimental/__simd/simd_mask.h | 2 +- libcxx/include/experimental/__simd/traits.h | 2 +- libcxx/include/experimental/__simd/utility.h | 2 +- libcxx/include/experimental/__simd/vec_ext.h | 2 +- libcxx/include/experimental/iterator | 2 +- libcxx/include/experimental/memory | 2 +- libcxx/include/experimental/propagate_const | 2 +- libcxx/include/experimental/simd | 2 +- libcxx/include/experimental/type_traits | 2 +- libcxx/include/experimental/utility | 2 +- libcxx/include/module.modulemap | 4 -- libcxx/src/any.cpp | 2 +- libcxx/src/optional.cpp | 2 +- 21 files changed, 33 insertions(+), 67 deletions(-) delete mode 100644 libcxx/include/experimental/__config diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index a571832ab724d42..ffff8114e5870d4 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -925,7 +925,6 @@ set(files exception execution expected - experimental/__config experimental/__simd/aligned_tag.h experimental/__simd/declaration.h experimental/__simd/reference.h diff --git a/libcxx/include/__config b/libcxx/include/__config index b0a5dda147a6ae3..f0a924352308755 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -591,6 +591,15 @@ typedef __char32_t char32_t; inline namespace _LIBCPP_ABI_NAMESPACE { # define _LIBCPP_END_NAMESPACE_STD }} _LIBCPP_POP_EXTENSION_DIAGNOSTICS +#define _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL namespace std { namespace experimental { +#define 
_LIBCPP_END_NAMESPACE_EXPERIMENTAL }} + +#define _LIBCPP_BEGIN_NAMESPACE_LFTS _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v1 { +#define _LIBCPP_END_NAMESPACE_LFTS } _LIBCPP_END_NAMESPACE_EXPERIMENTAL + +#define _LIBCPP_BEGIN_NAMESPACE_LFTS_V2 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v2 { +#define _LIBCPP_END_NAMESPACE_LFTS_V2 } _LIBCPP_END_NAMESPACE_EXPERIMENTAL + #ifdef _LIBCPP_ABI_NO_FILESYSTEM_INLINE_NAMESPACE # define _LIBCPP_BEGIN_NAMESPACE_FILESYSTEM _LIBCPP_BEGIN_NAMESPACE_STD namespace filesystem { # define _LIBCPP_END_NAMESPACE_FILESYSTEM } _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/experimental/__config b/libcxx/include/experimental/__config deleted file mode 100644 index 7b23791511ceffc..000000000000000 --- a/libcxx/include/experimental/__config +++ /dev/null @@ -1,45 +0,0 @@ -// -*- C++ -*- -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP_EXPERIMENTAL_CONFIG -#define _LIBCPP_EXPERIMENTAL_CONFIG - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#define _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL \ - namespace std { \ - namespace experimental { -#define _LIBCPP_END_NAMESPACE_EXPERIMENTAL \ - } \ - } - -#define _LIBCPP_BEGIN_NAMESPACE_LFTS _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v1 { -#define _LIBCPP_END_NAMESPACE_LFTS \ - } \ - } \ - } - -#define _LIBCPP_BEGIN_NAMESPACE_LFTS_V2 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace fundamentals_v2 { -#define _LIBCPP_END_NAMESPACE_LFTS_V2 \ - } \ - } \ - } - -// TODO: support more targets -#if defined(__AVX__) -# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32 -#else -# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16 -#endif - -#endif diff --git a/libcxx/include/experimental/__simd/aligned_tag.h b/libcxx/include/experimental/__simd/aligned_tag.h index 31d2b50aa1dd368..e364e146a601169 100644 --- a/libcxx/include/experimental/__simd/aligned_tag.h +++ b/libcxx/include/experimental/__simd/aligned_tag.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H #define _LIBCPP_EXPERIMENTAL___SIMD_ALIGNED_TAG_H +#include <__config> #include <__memory/assume_aligned.h> #include <__type_traits/remove_const.h> #include -#include #include #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) diff --git a/libcxx/include/experimental/__simd/declaration.h b/libcxx/include/experimental/__simd/declaration.h index 7b45d035c27121c..2ac7224159cf35d 100644 --- a/libcxx/include/experimental/__simd/declaration.h +++ b/libcxx/include/experimental/__simd/declaration.h @@ -10,11 +10,18 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H #define _LIBCPP_EXPERIMENTAL___SIMD_DECLARATION_H +#include <__config> 
#include -#include #if _LIBCPP_STD_VER >= 17 && defined(_LIBCPP_ENABLE_EXPERIMENTAL) +// TODO: support more targets +# if defined(__AVX__) +# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 32 +# else +# define _LIBCPP_NATIVE_SIMD_WIDTH_IN_BYTES 16 +# endif + _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL inline namespace parallelism_v2 { namespace simd_abi { diff --git a/libcxx/include/experimental/__simd/reference.h b/libcxx/include/experimental/__simd/reference.h index c60c08b0ea459c2..cba460baaa95bbc 100644 --- a/libcxx/include/experimental/__simd/reference.h +++ b/libcxx/include/experimental/__simd/reference.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H #define _LIBCPP_EXPERIMENTAL___SIMD_REFERENCE_H +#include <__config> #include <__type_traits/enable_if.h> #include <__type_traits/is_assignable.h> #include <__type_traits/is_same.h> @@ -17,7 +18,6 @@ #include <__utility/forward.h> #include <__utility/move.h> #include -#include #include _LIBCPP_PUSH_MACROS diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h index a2aeeb5cd0f54fd..a76933e1a5849d6 100644 --- a/libcxx/include/experimental/__simd/scalar.h +++ b/libcxx/include/experimental/__simd/scalar.h @@ -11,9 +11,9 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_SCALAR_H #include <__assert> +#include <__config> #include <__type_traits/integral_constant.h> #include -#include #include #include diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h index db0f9b39d9600c9..2c65d19e67b36fe 100644 --- a/libcxx/include/experimental/__simd/simd.h +++ b/libcxx/include/experimental/__simd/simd.h @@ -10,12 +10,12 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_H +#include <__config> #include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> #include <__type_traits/remove_cvref.h> #include <__utility/forward.h> #include -#include #include #include #include diff --git 
a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h index d54d4898b718af3..552731941531341 100644 --- a/libcxx/include/experimental/__simd/simd_mask.h +++ b/libcxx/include/experimental/__simd/simd_mask.h @@ -10,10 +10,10 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H #define _LIBCPP_EXPERIMENTAL___SIMD_SIMD_MASK_H +#include <__config> #include <__type_traits/enable_if.h> #include <__type_traits/is_same.h> #include -#include #include #include #include diff --git a/libcxx/include/experimental/__simd/traits.h b/libcxx/include/experimental/__simd/traits.h index ec25b4bfa7f95eb..b817df604ef72e5 100644 --- a/libcxx/include/experimental/__simd/traits.h +++ b/libcxx/include/experimental/__simd/traits.h @@ -11,10 +11,10 @@ #define _LIBCPP_EXPERIMENTAL___SIMD_TRAITS_H #include <__bit/bit_ceil.h> +#include <__config> #include <__type_traits/integral_constant.h> #include <__type_traits/is_same.h> #include -#include #include #include diff --git a/libcxx/include/experimental/__simd/utility.h b/libcxx/include/experimental/__simd/utility.h index 708fa3d8f72cef8..0103b06b719532e 100644 --- a/libcxx/include/experimental/__simd/utility.h +++ b/libcxx/include/experimental/__simd/utility.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H #define _LIBCPP_EXPERIMENTAL___SIMD_UTILITY_H +#include <__config> #include <__type_traits/is_arithmetic.h> #include <__type_traits/is_const.h> #include <__type_traits/is_constant_evaluated.h> @@ -22,7 +23,6 @@ #include <__utility/integer_sequence.h> #include #include -#include #include _LIBCPP_PUSH_MACROS diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h index 5787f237bb01ef3..1f707cf3e184242 100644 --- a/libcxx/include/experimental/__simd/vec_ext.h +++ b/libcxx/include/experimental/__simd/vec_ext.h @@ -12,11 +12,11 @@ #include <__assert> #include <__bit/bit_ceil.h> +#include <__config> #include <__type_traits/integral_constant.h> 
#include <__utility/forward.h> #include <__utility/integer_sequence.h> #include -#include #include #include #include diff --git a/libcxx/include/experimental/iterator b/libcxx/include/experimental/iterator index de82da2d3d72bdc..edfe6e707bcec9a 100644 --- a/libcxx/include/experimental/iterator +++ b/libcxx/include/experimental/iterator @@ -52,11 +52,11 @@ namespace std { */ +#include <__config> #include <__memory/addressof.h> #include <__type_traits/decay.h> #include <__utility/forward.h> #include <__utility/move.h> -#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/experimental/memory b/libcxx/include/experimental/memory index e9663d43a8ab739..bf8a154690af019 100644 --- a/libcxx/include/experimental/memory +++ b/libcxx/include/experimental/memory @@ -49,6 +49,7 @@ public: } */ +#include <__config> #include <__functional/hash.h> #include <__functional/operations.h> #include <__type_traits/add_lvalue_reference.h> @@ -57,7 +58,6 @@ public: #include <__type_traits/enable_if.h> #include <__type_traits/is_convertible.h> #include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/experimental/propagate_const b/libcxx/include/experimental/propagate_const index d7a695d8388923f..510d374bb4bf948 100644 --- a/libcxx/include/experimental/propagate_const +++ b/libcxx/include/experimental/propagate_const @@ -107,6 +107,7 @@ */ +#include <__config> #include <__functional/operations.h> #include <__fwd/functional.h> #include <__type_traits/conditional.h> @@ -128,7 +129,6 @@ #include <__utility/move.h> #include <__utility/swap.h> #include -#include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header diff --git a/libcxx/include/experimental/simd b/libcxx/include/experimental/simd index 484543b81daf1f4..35120b4b4aab421 100644 --- a/libcxx/include/experimental/simd +++ b/libcxx/include/experimental/simd @@ -75,7 +75,7 @@ inline namespace parallelism_v2 
{ # pragma GCC system_header #endif -#include +#include <__config> #include #include #include diff --git a/libcxx/include/experimental/type_traits b/libcxx/include/experimental/type_traits index 31b041bc94c43a4..a4bb59afaf4ac27 100644 --- a/libcxx/include/experimental/type_traits +++ b/libcxx/include/experimental/type_traits @@ -68,7 +68,7 @@ inline namespace fundamentals_v1 { */ -#include +#include <__config> #if _LIBCPP_STD_VER >= 14 diff --git a/libcxx/include/experimental/utility b/libcxx/include/experimental/utility index 8bd0a055b7783f4..cbc7ad140e40c11 100644 --- a/libcxx/include/experimental/utility +++ b/libcxx/include/experimental/utility @@ -30,7 +30,7 @@ inline namespace fundamentals_v1 { */ -#include +#include <__config> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 65df579b8d6dd70..add8726dead428f 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -610,10 +610,6 @@ module std_experimental [system] { header "experimental/utility" export * } - module __config { - textual header "experimental/__config" - export * - } } // Convenience method to get all of the above modules in a single import statement. 
diff --git a/libcxx/src/any.cpp b/libcxx/src/any.cpp index b0ad695669dec40..eaca2dd23976505 100644 --- a/libcxx/src/any.cpp +++ b/libcxx/src/any.cpp @@ -12,7 +12,7 @@ namespace std { const char* bad_any_cast::what() const noexcept { return "bad any cast"; } } // namespace std -#include +#include <__config> // Preserve std::experimental::any_bad_cast for ABI compatibility // Even though it no longer exists in a header file diff --git a/libcxx/src/optional.cpp b/libcxx/src/optional.cpp index 62b474a312be2de..4e7e28898f88146 100644 --- a/libcxx/src/optional.cpp +++ b/libcxx/src/optional.cpp @@ -17,7 +17,7 @@ const char* bad_optional_access::what() const noexcept { return "bad_optional_ac } // namespace std -#include +#include <__config> // Preserve std::experimental::bad_optional_access for ABI compatibility // Even though it no longer exists in a header file From 882f21ec87abd960b7ce3e10225f2bfeda3e1f74 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Wed, 11 Sep 2024 12:03:23 -0700 Subject: [PATCH 29/94] [WebKit Checkers] Allow "singleton" suffix to be camelCased. (#108257) We should allow singleton and fooSingleton as singleton function names. 
--- .../StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp | 8 +++----- clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp | 8 ++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 2b9b7883c978bad..42efc0c43766e94 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -231,11 +231,9 @@ bool isSingleton(const FunctionDecl *F) { if (!MethodDecl->isStatic()) return false; } - const auto &Name = safeGetName(F); - std::string SingletonStr = "singleton"; - auto index = Name.find(SingletonStr); - return index != std::string::npos && - index == Name.size() - SingletonStr.size(); + const auto &NameStr = safeGetName(F); + StringRef Name = NameStr; // FIXME: Make safeGetName return StringRef. + return Name == "singleton" || Name.ends_with("Singleton"); } // We only care about statements so let's use the simple diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp index 424ebd349e955a4..97efb354f0371dd 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp @@ -343,6 +343,12 @@ class RefCounted { return s_RefCounted; } + static RefCounted& otherSingleton() { + static RefCounted s_RefCounted; + s_RefCounted.ref(); + return s_RefCounted; + } + Number nonTrivial1() { return Number(3) + Number(4); } Number nonTrivial2() { return Number { 0.3 }; } int nonTrivial3() { return v ? 
otherFunction() : 0; } @@ -512,6 +518,8 @@ class UnrelatedClass { RefCounted::singleton().trivial18(); // no-warning RefCounted::singleton().someFunction(); // no-warning + RefCounted::otherSingleton().trivial18(); // no-warning + RefCounted::otherSingleton().someFunction(); // no-warning getFieldTrivial().recursiveTrivialFunction(7); // no-warning getFieldTrivial().recursiveComplexFunction(9); From aabb0121eece5243aca847cf2962f6464679c3c4 Mon Sep 17 00:00:00 2001 From: Matteo Franciolini Date: Wed, 11 Sep 2024 12:03:49 -0700 Subject: [PATCH 30/94] [mlir][bufferization] Fix OpFilter::denyDialect (#108249) The implementation would crash with unloaded dialects. --- .../mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index 2fda091e412aefe..aceb9d059b95f3a 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -152,7 +152,7 @@ class OpFilter { /// This function adds a DENY entry. void denyDialect(StringRef dialectNamespace) { Entry::FilterFn filterFn = [=](Operation *op) { - return op->getDialect()->getNamespace() == dialectNamespace; + return op->getName().getDialectNamespace() == dialectNamespace; }; entries.push_back(Entry{filterFn, Entry::FilterType::DENY}); } From b06954a5d02a41a38b72f7914c791428ccd95318 Mon Sep 17 00:00:00 2001 From: Ryosuke Niwa Date: Wed, 11 Sep 2024 12:06:04 -0700 Subject: [PATCH 31/94] [alpha.webkit.UncountedCallArgsChecker] Allow protector functions in Objective-C++ (#108184) This PR fixes the bug that WebKit checkers didn't recognize the return value of an Objective-C++ selector which returns Ref or RefPtr to be safe. 
--- .../Checkers/WebKit/PtrTypesSemantics.cpp | 10 +++++++ .../Checkers/WebKit/PtrTypesSemantics.h | 5 ++++ .../WebKit/UncountedCallArgsChecker.cpp | 3 +-- .../Checkers/WebKit/uncounted-obj-arg.mm | 26 +++++++++++++++++++ 4 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp index 42efc0c43766e94..f48b2fd9dca71be 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp @@ -143,6 +143,16 @@ bool isReturnValueRefCounted(const clang::FunctionDecl *F) { return false; } +std::optional isUncounted(const QualType T) { + if (auto *Subst = dyn_cast(T)) { + if (auto *Decl = Subst->getAssociatedDecl()) { + if (isRefType(safeGetName(Decl))) + return false; + } + } + return isUncounted(T->getAsCXXRecordDecl()); +} + std::optional isUncounted(const CXXRecordDecl* Class) { // Keep isRefCounted first as it's cheaper. diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h index ec1db1cc3358073..2932e62ad06e4bd 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h @@ -20,6 +20,7 @@ class CXXMethodDecl; class CXXRecordDecl; class Decl; class FunctionDecl; +class QualType; class Stmt; class Type; @@ -42,6 +43,10 @@ std::optional isRefCountable(const clang::CXXRecordDecl* Class); /// \returns true if \p Class is ref-counted, false if not. bool isRefCounted(const clang::CXXRecordDecl *Class); +/// \returns true if \p Class is ref-countable AND not ref-counted, false if +/// not, std::nullopt if inconclusive. 
+std::optional isUncounted(const clang::QualType T); + /// \returns true if \p Class is ref-countable AND not ref-counted, false if /// not, std::nullopt if inconclusive. std::optional isUncounted(const clang::CXXRecordDecl* Class); diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp index 704c082a4d1d630..81c2434ce64775e 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp @@ -87,8 +87,7 @@ class UncountedCallArgsChecker } auto *E = MemberCallExpr->getImplicitObjectArgument(); QualType ArgType = MemberCallExpr->getObjectType(); - std::optional IsUncounted = - isUncounted(ArgType->getAsCXXRecordDecl()); + std::optional IsUncounted = isUncounted(ArgType); if (IsUncounted && *IsUncounted && !isPtrOriginSafe(E)) reportBugOnThis(E); } diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm new file mode 100644 index 000000000000000..db0c5b19eec5bbb --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.mm @@ -0,0 +1,26 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s +// expected-no-diagnostics + +#import "mock-types.h" +#import "mock-system-header.h" +#import "../../Inputs/system-header-simulator-for-objc-dealloc.h" + +@interface Foo : NSObject + +@property (nonatomic, readonly) RefPtr countable; + +- (void)execute; +- (RefPtr)_protectedRefCountable; +@end + +@implementation Foo + +- (void)execute { + self._protectedRefCountable->method(); +} + +- (RefPtr)_protectedRefCountable { + return _countable; +} + +@end From 0cfa5abd9ddb59bb4dfd8690ba9f8634cfc48e78 Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 11 Sep 2024 12:14:50 -0700 Subject: [PATCH 32/94] [SandboxIR][Bench] Add tests with tracking enabled (#108273) Benchmarks 
RAUW and RUOW when tracking is enabled. --- llvm/benchmarks/SandboxIRBench.cpp | 32 +++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp index 6cff2eefdb19f2d..ca2cab664f91e5b 100644 --- a/llvm/benchmarks/SandboxIRBench.cpp +++ b/llvm/benchmarks/SandboxIRBench.cpp @@ -34,15 +34,19 @@ static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { } enum class IR { - LLVM, - SBox, + LLVM, ///> LLVM IR + SBoxNoTracking, ///> Sandbox IR with tracking disabled + SBoxTracking, ///> Sandbox IR with tracking enabled }; // Traits to get llvm::BasicBlock/sandboxir::BasicBlock from IR::LLVM/IR::SBox. template struct TypeSelect {}; template <> struct TypeSelect { using BasicBlock = llvm::BasicBlock; }; -template <> struct TypeSelect { +template <> struct TypeSelect { + using BasicBlock = sandboxir::BasicBlock; +}; +template <> struct TypeSelect { using BasicBlock = sandboxir::BasicBlock; }; @@ -59,12 +63,22 @@ genIR(std::unique_ptr &LLVMM, LLVMContext &LLVMCtx, sandboxir::Function *F = Ctx.createFunction(LLVMF); sandboxir::BasicBlock *BB = &*F->begin(); + // Start tracking if we are testing with tracking enabled. + if constexpr (IRTy == IR::SBoxTracking) + Ctx.save(); + if constexpr (IRTy == IR::LLVM) return LLVMBB; else return BB; } +template static void finalize(sandboxir::Context &Ctx) { + // Accept changes if we are tracking. 
+ if constexpr (IRTy == IR::SBoxTracking) + Ctx.accept(); +} + static std::string generateBBWalkIR(unsigned Size) { std::stringstream SS; SS << "define void @foo(i32 %v1, i32 %v2) {\n"; @@ -132,6 +146,7 @@ template static void RAUW(benchmark::State &State) { Def1->replaceAllUsesWith(Def2); Def2->replaceAllUsesWith(Def1); } + finalize(Ctx); } static std::string generateRUOWIR(unsigned NumOperands) { @@ -171,18 +186,21 @@ template static void RUOW(benchmark::State &State) { auto *Call = &*It++; for (auto _ : State) Call->replaceUsesOfWith(Arg0, Arg1); + finalize(Ctx); } BENCHMARK(GetType); -BENCHMARK(GetType); +BENCHMARK(GetType); BENCHMARK(BBWalk)->Args({1024}); -BENCHMARK(BBWalk)->Args({1024}); +BENCHMARK(BBWalk)->Args({1024}); BENCHMARK(RAUW)->Args({512}); -BENCHMARK(RAUW)->Args({512}); +BENCHMARK(RAUW)->Args({512}); +BENCHMARK(RAUW)->Args({512}); BENCHMARK(RUOW)->Args({4096}); -BENCHMARK(RUOW)->Args({4096}); +BENCHMARK(RUOW)->Args({4096}); +BENCHMARK(RUOW)->Args({4096}); BENCHMARK_MAIN(); From 4570984e7fe7409cec10d2305fb43c0b52806683 Mon Sep 17 00:00:00 2001 From: Saleem Abdulrasool Date: Wed, 11 Sep 2024 11:58:10 -0700 Subject: [PATCH 33/94] builtins: honour `_M_ARM64` as `__aarch64__` When clang is used as `clang-cl`, we use MSVC style macros. The spelling of `__aarch64__` is converted to `_M_ARM64`. Account for this alternative spelling in the conditional check. While in the area, add a tertiary spelling of `__arm64__` to ensure that we catch more of the variants. 
--- compiler-rt/lib/builtins/cpu_model/aarch64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c index 0dd397783b67f5a..ea2da23a95278fd 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64.c +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c @@ -14,7 +14,7 @@ #include "aarch64.h" -#if !defined(__aarch64__) +#if !defined(__aarch64__) && !defined(__arm64__) && !defined(_M_ARM64) #error This file is intended only for aarch64-based targets #endif From 108ed9d9fe33abc7337350329d048ec3000b1cb6 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Wed, 11 Sep 2024 20:30:14 +0100 Subject: [PATCH 34/94] [flang] Remove a leftover debugging message. (#108175) --- flang/lib/Semantics/expression.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 10aef72dd42cb9d..e94a49f6871db43 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -2744,7 +2744,6 @@ std::pair ExpressionAnalyzer::ResolveGeneric( (!procedure->IsElemental() && nonElemental)) { int d{ComputeCudaMatchingDistance( context_.languageFeatures(), *procedure, localActuals)}; - llvm::errs() << "matching distance: " << d << "\n"; if (d != crtMatchingDistance) { if (d > crtMatchingDistance) { continue; From c31d343857f514dde9146279797ebdcd4010e60b Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com> Date: Wed, 11 Sep 2024 15:02:38 -0500 Subject: [PATCH 35/94] Update legalizations for LowerGpuOpsToROCDLOps (#108266) LLVM::FAbsOp and LLVM::SqrtOp are legal after https://github.com/llvm/llvm-project/pull/102971 --- mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index 
93e8b080a4f6728..29926719129dc53 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -329,10 +329,9 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) { target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); target.addLegalDialect(); target.addIllegalDialect(); - target.addIllegalOp(); + target.addIllegalOp(); // TODO: Remove once we support replacing non-root ops. target.addLegalOp(); From b8239e1201f5871bed5b633b76fa9536672f287f Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Wed, 11 Sep 2024 13:03:00 -0700 Subject: [PATCH 36/94] [HLSL] Add StructuredBuffer to external sema source (#106316) This PR adds `StructuredBuffer` to `HLSLExternalSemaSource.cpp`, by copying the logic from RWBuffer but just replacing the name with StructuredBuffer. The change now allows StructuredBuffers to be defined in HLSL, though they function the same as RWBuffers. Further work to apply the appropriate attributes that distinguish StructuredBuffers from other Buffer types will be deferred. 
This improves our position on https://github.com/llvm/llvm-project/issues/106189 --- clang/lib/Sema/HLSLExternalSemaSource.cpp | 10 +++ clang/test/AST/HLSL/StructuredBuffer-AST.hlsl | 64 +++++++++++++++++++ .../StructuredBuffer-annotations.hlsl | 22 +++++++ .../StructuredBuffer-constructor.hlsl | 12 ++++ .../StructuredBuffer-elementtype.hlsl | 52 +++++++++++++++ .../builtins/StructuredBuffer-subscript.hlsl | 16 +++++ .../SemaHLSL/BuiltIns/StructuredBuffers.hlsl | 19 ++++++ 7 files changed, 195 insertions(+) create mode 100644 clang/test/AST/HLSL/StructuredBuffer-AST.hlsl create mode 100644 clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl create mode 100644 clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl create mode 100644 clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl create mode 100644 clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 071e64fe56d48a5..da7bbf8baa74dfb 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -525,6 +525,16 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .addArraySubscriptOperators() .completeDefinition(); }); + + Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "StructuredBuffer") + .addSimpleTemplateParams(*SemaPtr, {"element_type"}) + .Record; + onCompletion(Decl, [this](CXXRecordDecl *Decl) { + setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, + ResourceKind::TypedBuffer, /*IsROV=*/false) + .addArraySubscriptOperators() + .completeDefinition(); + }); } void HLSLExternalSemaSource::onCompletion(CXXRecordDecl *Record, diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl new file mode 100644 index 000000000000000..42991d8dc9c2e3d --- /dev/null +++ 
b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -0,0 +1,64 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s + + +// This test tests two different AST generations. The "EMPTY" test mode verifies +// the AST generated by forward declaration of the HLSL types which happens on +// initializing the HLSL external AST with an AST Context. + +// The non-empty mode has a use that requires the StructuredBuffer type be complete, +// which results in the AST being populated by the external AST source. That +// case covers the full implementation of the template declaration and the +// instantiated specialization. + +// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer +// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer +// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final + +// There should be no more occurrances of StructuredBuffer +// EMPTY-NOT: StructuredBuffer + +#ifndef EMPTY + +StructuredBuffer Buffer; + +#endif + +// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer +// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition + +// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'element_type *' +// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <> Implicit UAV +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer + +// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' +// CHECK-NEXT: 
CompoundStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer' lvalue implicit this +// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' +// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline + +// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' +// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer' lvalue implicit this +// CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' +// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline + +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer definition + +// CHECK: TemplateArgument type 'float' +// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' +// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float *' +// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <> Implicit UAV +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl new file mode 100644 index 000000000000000..16b7295c985f77b --- /dev/null +++ 
b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s + +StructuredBuffer Buffer1; +StructuredBuffer > BufferArray[4]; + +StructuredBuffer Buffer2 : register(u3); +StructuredBuffer > BufferArray2[4] : register(u4); + +StructuredBuffer Buffer3 : register(u3, space1); +StructuredBuffer > BufferArray3[4] : register(u4, space1); + +[numthreads(1,1,1)] +void main() { +} + +// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]} +// CHECK-DAG: ![[Single]] = !{ptr @"?Buffer1@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[Array]] = !{ptr @"?BufferArray@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 -1, i32 0} +// CHECK-DAG: ![[SingleAllocated]] = !{ptr @"?Buffer2@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 0} +// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @"?BufferArray2@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 0} +// CHECK-DAG: ![[SingleSpace]] = !{ptr @"?Buffer3@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, i1 false, i32 3, i32 1} +// CHECK-DAG: ![[ArraySpace]] = !{ptr @"?BufferArray3@@3PAV?$StructuredBuffer@T?$__vector@M$03@__clang@@@hlsl@@A", i32 10, i32 9, i1 false, i32 4, i32 1} diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl new file mode 100644 index 000000000000000..34019e5b186931a --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV + +StructuredBuffer 
Buf; + +// CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ" +// CHECK-NEXT: entry: + +// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1) +// CHECK: store ptr %[[HandleRes]], ptr %h, align 4 + +// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1) +// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8 diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl new file mode 100644 index 000000000000000..8ddf8a6004403ed --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl @@ -0,0 +1,52 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s + +StructuredBuffer BufI16; +StructuredBuffer BufU16; +StructuredBuffer BufI32; +StructuredBuffer BufU32; +StructuredBuffer BufI64; +StructuredBuffer BufU64; +StructuredBuffer BufF16; +StructuredBuffer BufF32; +StructuredBuffer BufF64; +StructuredBuffer< vector > BufI16x4; +StructuredBuffer< vector > BufU32x3; +StructuredBuffer BufF16x2; +StructuredBuffer BufF32x3; +// TODO: StructuredBuffer BufSNormF16; -> 11 +// TODO: StructuredBuffer BufUNormF16; -> 12 +// TODO: StructuredBuffer BufSNormF32; -> 13 +// TODO: StructuredBuffer BufUNormF32; -> 14 +// TODO: StructuredBuffer BufSNormF64; -> 15 +// TODO: StructuredBuffer BufUNormF64; -> 16 + +[numthreads(1,1,1)] +void main(int GI : SV_GroupIndex) { + BufI16[GI] = 0; + BufU16[GI] = 0; + BufI32[GI] = 0; + BufU32[GI] = 0; + BufI64[GI] = 0; + BufU64[GI] = 0; + BufF16[GI] = 0; + BufF32[GI] = 0; + BufF64[GI] = 0; + BufI16x4[GI] = 0; + BufU32x3[GI] = 0; + BufF16x2[GI] = 0; + BufF32x3[GI] = 0; +} + +// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16@@3V?$StructuredBuffer@F@hlsl@@A", i32 10, i32 2, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufU16@@3V?$StructuredBuffer@G@hlsl@@A", i32 10, i32 3, +// CHECK: !{{[0-9]+}} = !{ptr 
@"?BufI32@@3V?$StructuredBuffer@H@hlsl@@A", i32 10, i32 4, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32@@3V?$StructuredBuffer@I@hlsl@@A", i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufI64@@3V?$StructuredBuffer@J@hlsl@@A", i32 10, i32 6, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufU64@@3V?$StructuredBuffer@K@hlsl@@A", i32 10, i32 7, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16@@3V?$StructuredBuffer@$f16@@hlsl@@A", i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32@@3V?$StructuredBuffer@M@hlsl@@A", i32 10, i32 9, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufF64@@3V?$StructuredBuffer@N@hlsl@@A", i32 10, i32 10, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufI16x4@@3V?$StructuredBuffer@T?$__vector@F$03@__clang@@@hlsl@@A", i32 10, i32 2, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufU32x3@@3V?$StructuredBuffer@T?$__vector@I$02@__clang@@@hlsl@@A", i32 10, i32 5, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufF16x2@@3V?$StructuredBuffer@T?$__vector@$f16@$01@__clang@@@hlsl@@A", i32 10, i32 8, +// CHECK: !{{[0-9]+}} = !{ptr @"?BufF32x3@@3V?$StructuredBuffer@T?$__vector@M$02@__clang@@@hlsl@@A", i32 10, i32 9, diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl new file mode 100644 index 000000000000000..9bd885d94d7e75d --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s + +StructuredBuffer In; +StructuredBuffer Out; + +[numthreads(1,1,1)] +void main(unsigned GI : SV_GroupIndex) { + Out[GI] = In[GI]; +} + +// Even at -O0 the subscript operators get inlined. The -O0 IR is a bit messy +// and confusing to follow so the match here is pretty weak. 
+ +// CHECK: define internal void @"?main@@YAXI@Z" +// CHECK-NOT: call +// CHECK: ret void diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl new file mode 100644 index 000000000000000..2450941f5d9b463 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s + +typedef vector float3; + +StructuredBuffer Buffer; + +// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}} +// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}} +StructuredBuffer BufferErr1; + +// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}} +// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}} +StructuredBuffer<> BufferErr2; + +[numthreads(1,1,1)] +void main() { + (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer >'}} + // expected-note@* {{implicitly declared private here}} +} From ea83e1c05a5adee5d8f9e680356ee57556ba64a1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Sep 2024 21:04:33 +0100 Subject: [PATCH 37/94] [LV] Assign cost to all interleave members when not interleaving. At the moment, the full cost of all interleave group members is assigned to the instruction at the group's insert position, even if the decision was to not form an interleave group. This can lead to inaccurate cost estimates, e.g. if the instruction at the insert position is dead. If the decision is to not vectorize but scalarize or scather/gather, then the cost will be to total cost for all members. In those cases, assign individual the cost per member, to more closely reflect to choice per instruction. This fixes a divergence between legacy and VPlan-based cost model. Fixes https://github.com/llvm/llvm-project/issues/108098. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 16 +- .../X86/interleaved-load-f32-stride-5.ll | 50 +- .../X86/interleaved-load-f32-stride-7.ll | 70 +- .../X86/interleaved-load-f32-stride-8.ll | 64 +- .../X86/interleaved-load-f64-stride-2.ll | 8 +- .../X86/interleaved-load-f64-stride-3.ll | 12 +- .../X86/interleaved-load-f64-stride-4.ll | 16 +- .../X86/interleaved-load-f64-stride-5.ll | 50 +- .../X86/interleaved-load-f64-stride-6.ll | 24 +- .../X86/interleaved-load-f64-stride-7.ll | 70 +- .../X86/interleaved-load-f64-stride-8.ll | 80 +- .../X86/interleaved-load-i16-stride-5.ll | 50 +- .../X86/interleaved-load-i16-stride-7.ll | 70 +- .../X86/interleaved-load-i16-stride-8.ll | 80 +- ...erleaved-load-i32-stride-4-indices-01uu.ll | 4 +- .../X86/interleaved-load-i32-stride-5.ll | 50 +- .../X86/interleaved-load-i32-stride-7.ll | 70 +- .../X86/interleaved-load-i32-stride-8.ll | 64 +- .../X86/interleaved-load-i64-stride-2.ll | 8 +- .../X86/interleaved-load-i64-stride-3.ll | 12 +- .../X86/interleaved-load-i64-stride-4.ll | 16 +- .../X86/interleaved-load-i64-stride-5.ll | 50 +- .../X86/interleaved-load-i64-stride-6.ll | 24 +- .../X86/interleaved-load-i64-stride-7.ll | 70 +- .../X86/interleaved-load-i64-stride-8.ll | 80 +- .../X86/interleaved-load-i8-stride-5.ll | 50 +- .../X86/interleaved-load-i8-stride-7.ll | 70 +- .../X86/interleaved-load-i8-stride-8.ll | 80 +- .../X86/interleaved-store-f64-stride-8.ll | 32 +- .../X86/interleaved-store-i64-stride-8.ll | 32 +- .../X86/masked-interleaved-store-i16.ll | 8 +- .../AArch64/interleaved-vs-scalar.ll | 4 +- .../LoopVectorize/AArch64/interleaved_cost.ll | 4 +- .../LoopVectorize/ARM/mve-interleaved-cost.ll | 696 +++++++++--------- .../LoopVectorize/RISCV/dead-ops-cost.ll | 93 ++- .../SystemZ/mem-interleaving-costs-03.ll | 3 +- 36 files changed, 1141 insertions(+), 1039 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 640a7bf3d672f09..3b6b154b9660cfb 
100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1182,13 +1182,23 @@ class LoopVectorizationCostModel { InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. - /// But the cost will be assigned to one instruction only. + /// When interleaving, the cost will only be assigned one instruction, the + /// insert position. For other cases, add the appropriate fraction of the + /// total cost to each instruction. This ensures accurate costs are used, + /// even if the insert position instruction is not used. + InstructionCost InsertPosCost = Cost; + InstructionCost OtherMemberCost = 0; + if (W != CM_Interleave) + OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers(); + ; for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) { if (auto *I = Grp->getMember(Idx)) { if (Grp->getInsertPos() == I) - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + WideningDecisions[std::make_pair(I, VF)] = + std::make_pair(W, InsertPosCost); else - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); + WideningDecisions[std::make_pair(I, VF)] = + std::make_pair(W, OtherMemberCost); } } } diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll index 29dce5f21173aff..57ae02abc119967 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-5.ll @@ -82,26 +82,26 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load float, ptr %in2, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load float, ptr %in3, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 15 
for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 75 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 150 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For 
instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For 
instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -139,11 +139,11 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load float, ptr %in4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll index 0e7b1c58e587c15..2cc5150f3c887f9 100644 --- 
a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-7.ll @@ -108,34 +108,34 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 105 for 
VF 8 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 210 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For 
instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 15 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For 
instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -180,13 +180,13 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For 
instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll index 8830aff579c3227..1899741b8a3b408 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f32-stride-8.ll @@ -121,22 +121,22 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = 
load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load float, ptr %in3, 
align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v7 = load float, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load float, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load float, ptr %in1, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load float, ptr %in2, align 4 @@ -145,14 +145,14 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 240 for VF 16 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: 
%v0 = load float, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 30 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load float, ptr %in0, align 4 @@ -203,14 +203,14 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load float, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load float, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load float, ptr %in7, align 4 -; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load 
float, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load float, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load float, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load float, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load float, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load float, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load float, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load float, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load float, ptr %in7, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll index cfd3d7841caa2dd..2d4b300a8100a43 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-2.ll @@ -71,10 +71,10 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 64 
For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll index 5ec5b5173138563..5dfb25e25d6e61e 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-3.ll @@ -88,12 +88,12 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found 
an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load double, ptr %in2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll index 450743df723251c..bd88ca810728b18 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-4.ll @@ -107,14 +107,14 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr 
%in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll index 5e5c718dba97d2c..9c0798631fdba5c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-5.ll @@ -72,21 +72,21 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load double, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load double, ptr %in3, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 
= load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 35 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 70 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = 
load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -114,16 +114,16 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, 
ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll index 62541fa2368c6a4..99a735d3f552c4a 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll +++ 
b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-6.ll @@ -133,18 +133,18 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 
16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll index cfed8554b978b83..168e9166ea1dd44 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-7.ll @@ -94,27 +94,27 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 21 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated 
cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 49 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 98 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated 
cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated 
cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -152,20 +152,20 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an 
estimated cost of 0 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load double, ptr %in6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll index 07939b914d0224c..919a17e8729e0ff 100644 --- 
a/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-f64-stride-8.ll @@ -105,30 +105,30 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load double, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 24 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated 
cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 3 for VF 2 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated 
cost of 7 for VF 4 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load double, ptr %in0, align 8 @@ -163,22 +163,22 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load double, ptr %in5, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load double, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 4 For 
instruction: %v7 = load double, ptr %in7, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: 
LV: Found an estimated cost of 10 for VF 8 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v7 = load double, ptr %in7, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load double, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load double, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load double, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load double, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load double, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load double, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load double, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v7 = load double, ptr %in7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll index 964a9b660942e1f..6737c722b46ff9e 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-5.ll @@ -93,31 +93,31 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i16, 
ptr %in2, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i16, ptr %in3, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 165 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost 
of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 330 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, 
ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll index 6653198397dd258..46d56a75f1c4de4 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-7.ll @@ -123,41 +123,41 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i16, ptr %in4, align 2 ; AVX2: LV: Found 
an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr %in5, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = 
load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 231 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 462 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: 
Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For 
instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll index b3a5cbeccc09c4c..4d65abdaf688c98 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i16-stride-8.ll @@ -138,46 +138,46 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i16, ptr 
%in5, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i16, ptr %in6, align 2 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for 
VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 264 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 -; AVX2: LV: Found an estimated cost of 528 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i16, ptr %in3, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i16, ptr 
%in4, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 16 
for VF 8 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v3 = load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 33 for VF 16 For instruction: %v7 = load i16, ptr %in7, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v0 = load i16, ptr %in0, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v1 = load i16, ptr %in1, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v2 = load i16, ptr %in2, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v3 
= load i16, ptr %in3, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v4 = load i16, ptr %in4, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v5 = load i16, ptr %in5, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v6 = load i16, ptr %in6, align 2 +; AVX2: LV: Found an estimated cost of 66 for VF 32 For instruction: %v7 = load i16, ptr %in7, align 2 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, ptr %in0, align 2 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll index c0ea210385dfdae..28a6443efcfb9d3 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-4-indices-01uu.ll @@ -73,8 +73,8 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 ; AVX512: LV: Found an estimated cost of 50 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll index 2a261ca4de4fa49..5cad7bf662c5b31 100644 --- 
a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-5.ll @@ -82,26 +82,26 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i32, ptr %in2, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i32, ptr %in3, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 85 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, 
ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 170 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated 
cost of 17 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 @@ -139,11 +139,11 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 400 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v3 = 
load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v4 = load i32, ptr %in4, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll index 8bf3071d29fbe16..cfb83d4a0236535 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-7.ll @@ -108,34 +108,34 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i32, ptr %in4, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an 
estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 119 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 238 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 
= load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 17 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an 
estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 @@ -180,13 +180,13 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For 
instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll index 3182de2df058a88..775785462de4741 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i32-stride-8.ll @@ -121,22 +121,22 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i32, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i32, ptr %in6, 
align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 64 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 
For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i32, ptr %in7, align 4 ; AVX2: LV: Found an estimated cost of 48 for VF 8 For instruction: %v0 = load i32, ptr %in0, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i32, ptr %in1, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i32, ptr %in2, align 4 @@ -145,14 +145,14 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i32, ptr %in7, align 4 -; AVX2: LV: Found an estimated cost of 272 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For 
instruction: %v7 = load i32, ptr %in7, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX2: LV: Found an estimated cost of 34 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i32, ptr %in0, align 4 @@ -203,14 +203,14 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i32, ptr %in5, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i32, ptr %in6, align 4 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i32, ptr %in7, align 4 -; AVX512: LV: Found an estimated cost of 320 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 -; 
AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i32, ptr %in0, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i32, ptr %in1, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i32, ptr %in2, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i32, ptr %in3, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i32, ptr %in4, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i32, ptr %in5, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i32, ptr %in6, align 4 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v7 = load i32, ptr %in7, align 4 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll index 27e2ee0392615b6..cf350cc9f8307e2 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-2.ll @@ -71,10 +71,10 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 22 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: 
Found an estimated cost of 160 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll index c37723257c1f753..9ca0d8c9d7e33b8 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-3.ll @@ -88,12 +88,12 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 51 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 120 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 240 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an 
estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 80 for VF 64 For instruction: %v2 = load i64, ptr %in2, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll index 2eb7c5e93078f55..86ee6c8b30bda07 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-4.ll @@ -107,14 +107,14 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated 
cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll index c11da4309737d22..f6143d4ae9f3b18 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-5.ll @@ -72,21 +72,21 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i64, ptr %in3, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For 
instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 45 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 90 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: 
Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 @@ -114,16 +114,16 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 100 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 200 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = 
load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll index de57af6ebe39842..43dc53d683de394 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-6.ll @@ -133,18 +133,18 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, 
ptr %in4, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 120 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 240 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load 
i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll index 949c1af1fdad3b4..70ed74dcc26da3c 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-7.ll @@ -94,27 +94,27 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i64, ptr %in4, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: 
Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 63 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 126 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 
= load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 @@ -152,20 +152,20 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For 
instruction: %v5 = load i64, ptr %in5, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 140 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 280 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 
For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %v6 = load i64, ptr %in6, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll index 4388ccfbdcfc4f0..401e4de111d73f7 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i64-stride-8.ll @@ -105,30 +105,30 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i64, ptr %in5, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i64, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = 
load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 72 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8 -; AVX2: LV: Found an estimated cost of 144 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX2: LV: Found an 
estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load i64, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 9 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v0 = load 
i64, ptr %in0, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX2: LV: Found an estimated cost of 18 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8 ; ; AVX512-LABEL: 'test' ; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i64, ptr %in0, align 8 @@ -163,22 +163,22 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i64, ptr %in5, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i64, ptr %in6, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i64, ptr %in7, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 0 
for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %v7 = load i64, ptr %in7, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v0 = load i64, ptr %in0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v1 = load i64, ptr %in1, align 8 +; AVX512: LV: Found an estimated cost of 20 
for VF 16 For instruction: %v2 = load i64, ptr %in2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v3 = load i64, ptr %in3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v4 = load i64, ptr %in4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v5 = load i64, ptr %in5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v6 = load i64, ptr %in6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %v7 = load i64, ptr %in7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll index 6078fb440f9d13f..ef3c80c27550a3e 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-5.ll @@ -93,31 +93,31 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v2 = load i8, ptr %in2, align 1 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v3 = load i8, ptr %in3, align 1 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 20 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 40 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, 
align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 80 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 160 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 325 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For 
instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: 
Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll index 778a4e7dfd7d9df..8e7c316b69b3acc 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-7.ll @@ -123,41 +123,41 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v4 = load i8, ptr %in4, align 1 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 28 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 56 for VF 4 For instruction: %v0 = load i8, 
ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 112 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 224 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 
For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 455 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an 
estimated cost of 8 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = 
load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll index a230b5a0b1f2b74..752cc229922bea4 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-8.ll @@ -138,46 +138,46 @@ define void @test() { ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v5 = load i8, ptr %in5, align 1 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v6 = load i8, ptr %in6, align 1 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction: %v7 = load i8, ptr %in7, align 1 -; AVX2: LV: Found an estimated cost of 32 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 2 For instruction: %v7 = load i8, ptr %in7, align 1 -; AVX2: LV: Found an estimated cost of 64 for 
VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1 -; AVX2: LV: Found an estimated cost of 128 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1 -; AVX2: LV: Found an estimated cost of 256 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: 
Found an estimated cost of 0 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1 -; AVX2: LV: Found an estimated cost of 520 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 -; AVX2: LV: Found an estimated cost of 0 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 4 for VF 2 For instruction: %v7 = load 
i8, ptr %in7, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 8 for VF 4 For instruction: %v7 = load i8, ptr %in7, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 16 for VF 8 For instruction: %v7 = load i8, ptr %in7, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 32 
for VF 16 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 32 for VF 16 For instruction: %v7 = load i8, ptr %in7, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v0 = load i8, ptr %in0, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v1 = load i8, ptr %in1, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v2 = load i8, ptr %in2, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v3 = load i8, ptr %in3, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v4 = load i8, ptr %in4, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v5 = load i8, ptr %in5, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v6 = load i8, ptr %in6, align 1 +; AVX2: LV: Found an estimated cost of 65 for VF 32 For instruction: %v7 = load i8, ptr %in7, align 1 ; ; AVX512DQ-LABEL: 'test' ; AVX512DQ: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i8, ptr %in0, align 1 diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll index c1a66c1a41d74f3..ed2bb3f750b01d6 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-f64-stride-8.ll @@ -163,22 +163,22 @@ define void @test() { ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double %v5, ptr %out5, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store double 
%v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store double %v7, ptr %out7, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v0, ptr %out0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v1, ptr %out1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v2, ptr %out2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v3, ptr %out3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v4, ptr %out4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v5, ptr %out5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store double %v6, ptr %out6, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store double %v7, ptr %out7, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v0, ptr %out0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v1, ptr %out1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v2, ptr %out2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v3, ptr %out3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v4, ptr %out4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v5, ptr %out5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store double %v6, ptr %out6, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated 
cost of 10 for VF 8 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store double %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store double %v7, ptr %out7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll index 7be9577960efe3e..a5398771041291a 100644 --- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll +++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i64-stride-8.ll @@ -163,22 +163,22 @@ define void 
@test() { ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v5, ptr %out5, align 8 ; AVX512: LV: Found an estimated cost of 0 for VF 4 For instruction: store i64 %v6, ptr %out6, align 8 ; AVX512: LV: Found an estimated cost of 46 for VF 4 For instruction: store i64 %v7, ptr %out7, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 -; AVX512: LV: Found an estimated cost of 80 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8 -; AVX512: LV: Found an estimated cost of 0 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8 -; AVX512: LV: Found an estimated cost of 160 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8 +; 
AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: store i64 %v7, ptr %out7, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v0, ptr %out0, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v1, ptr %out1, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v2, ptr %out2, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v3, ptr %out3, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v4, ptr %out4, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v5, ptr %out5, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v6, ptr %out6, align 8 +; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: store i64 %v7, ptr %out7, align 8 ; entry: br label %for.body diff --git a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll index 13a844230f89d94..41dd266d0a87acd 100644 --- a/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll +++ 
b/llvm/test/Analysis/CostModel/X86/masked-interleaved-store-i16.ll @@ -38,8 +38,8 @@ define void @test1(ptr noalias nocapture %points, ptr noalias nocapture readonly ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 12 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 @@ -99,8 +99,8 @@ define void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr noalias no ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %2, ptr %arrayidx7, align 2 -; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 -; 
ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 10 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %0, ptr %arrayidx2, align 2 +; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 5 for VF 2 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction: store i16 %0, ptr %arrayidx2, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction: store i16 %2, ptr %arrayidx7, align 2 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction: store i16 %0, ptr %arrayidx2, align 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll index 430b0db87b88468..d1d1b0ab100fb8b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -7,8 +7,8 @@ target triple = "aarch64--linux-gnu" %pair = type { i8, i8 } ; CHECK-LABEL: test -; CHECK: Found an estimated cost of 16 for VF 2 For instruction: {{.*}} load i8 -; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 8 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 8 for VF 2 For instruction: {{.*}} load i8 ; CHECK-LABEL: entry: ; CHECK-LABEL: vector.body: ; CHECK: [[LOAD1:%.*]] = load i8 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll index 21af9ae801e16ca..dec124b55cd4e0d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -168,8 +168,8 @@ entry: ; gaps. 
; ; VF_2-LABEL: Checking a loop in 'i64_factor_8' -; VF_2: Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_2: Found an estimated cost of 8 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll index c7a04e3669ed627..976c6a9a570af97 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll @@ -15,10 +15,10 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i8_factor_2' -; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp2, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp1, align 1 ; VF_4-LABEL: Checking a loop in 'i8_factor_2' ; VF_4: Found an 
estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, ptr %tmp0, align 1 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp1, align 1 @@ -56,10 +56,10 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i16_factor_2' -; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp2, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp1, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp1, align 2 @@ -97,10 +97,10 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i32_factor_2' -; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 +; 
VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp1, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_2' ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, ptr %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp1, align 4 @@ -138,25 +138,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i64_factor_2' -; VF_2: Found an estimated cost of 44 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 44 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_2' -; VF_4: Found an estimated cost of 88 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 88 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_4: Found an estimated 
cost of 44 for VF 4 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_2' -; VF_8: Found an estimated cost of 176 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 176 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_2' -; VF_16: Found an estimated cost of 352 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 352 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 +; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 +; VF_16-NEXT: Found an 
estimated cost of 176 for VF 16 For instruction: store i64 %tmp2, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.2, ptr %data, i64 %i, i32 0 @@ -179,10 +179,10 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'f16_factor_2' -; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2 +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load half, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp2, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp1, align 2 ; VF_4-LABEL: Checking a loop in 'f16_factor_2' ; VF_4: Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, ptr %tmp1, align 2 @@ -261,25 +261,25 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'f64_factor_2' -; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For 
instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load double, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_2' -; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp2 = load double, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_2' -; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp2 = load double, ptr %tmp0, align 8 +; VF_8-NEXT: Found an 
estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp1, align 8 ; VF_16-LABEL: Checking a loop in 'f64_factor_2' -; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8 +; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp2 = load double, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp2, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.2, ptr %data, i64 %i, i32 0 @@ -306,33 +306,33 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i8_factor_3' -; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_2-NEXT: Found an 
estimated cost of 36 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_4-LABEL: Checking a loop in 'i8_factor_3' -; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost of 72 for VF 4 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr 
%tmp2, align 1 ; VF_8-LABEL: Checking a loop in 'i8_factor_3' -; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1 +; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp2, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_3' -; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 
1 +; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i8, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp2, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp3, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp2, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0 @@ -358,33 +358,33 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i16_factor_3' -; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 
%tmp4, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_3' -; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 72 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_3' -; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, 
ptr %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp2, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_3' -; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2 +; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i16, ptr %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp2, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp3, ptr %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp4, ptr %tmp1, align 2 +; 
VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp2, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.3, ptr %data, i64 %i, i32 0 @@ -410,33 +410,33 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i32_factor_3' -; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 36 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp5, ptr %tmp2, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_3' -; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for 
VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp5, ptr %tmp2, align 4 ; VF_8-LABEL: Checking a loop in 'i32_factor_3' -; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp3, ptr %tmp0, align 4 
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp2, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_3' -; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4 -; VF_16-NEXT: Found an estimated cost of 288 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4 +; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp3 = load i32, ptr %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp2, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp3, ptr %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp2, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.3, ptr %data, i64 %i, i32 0 @@ -462,33 +462,33 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i64_factor_3' -; VF_2: Found an estimated cost of 66 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found 
an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 66 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_3' -; VF_4: Found an estimated cost of 132 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 132 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: 
%tmp5 = load i64, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_3' -; VF_8: Found an estimated cost of 264 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 264 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp2, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_3' -; VF_16: Found an estimated cost of 528 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For 
instruction: %tmp5 = load i64, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 528 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8 +; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp3 = load i64, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp3, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp2, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.3, ptr %data, i64 %i, i32 0 @@ -514,12 +514,12 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'f16_factor_3' -; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2 +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load half, ptr %tmp0, align 2 +; VF_2-NEXT: 
Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load half, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load half, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp3, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp4, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store half %tmp5, ptr %tmp2, align 2 ; VF_4-LABEL: Checking a loop in 'f16_factor_3' ; VF_4: Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, ptr %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, ptr %tmp1, align 2 @@ -573,12 +573,12 @@ entry: ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float %tmp4, ptr %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store float %tmp5, ptr %tmp2, align 4 ; VF_4-LABEL: Checking a loop in 'f32_factor_3' -; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4 +; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp3 = load float, ptr %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load float, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load 
float, ptr %tmp2, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp3, ptr %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp4, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp5, ptr %tmp2, align 4 ; VF_8-LABEL: Checking a loop in 'f32_factor_3' ; VF_8: Found an estimated cost of 64 for VF 8 For instruction: %tmp3 = load float, ptr %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, ptr %tmp1, align 4 @@ -618,33 +618,33 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'f64_factor_3' -; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp3 = load double, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp2, align 8 ; 
VF_4-LABEL: Checking a loop in 'f64_factor_3' -; VF_4: Found an estimated cost of 36 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 36 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp3 = load double, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = load double, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp2, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_3' -; VF_8: Found an estimated cost of 72 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 72 for VF 8 For 
instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp3 = load double, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp2, align 8 ; VF_16-LABEL: Checking a loop in 'f64_factor_3' -; VF_16: Found an estimated cost of 144 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 144 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8 +; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp3 = load double, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp3, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp1, align 8 +; VF_16-NEXT: Found an 
estimated cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp2, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.3, ptr %data, i64 %i, i32 0 @@ -673,41 +673,41 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i8_factor_4' -; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_4-LABEL: 
Checking a loop in 'i8_factor_4' -; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_8-LABEL: Checking a loop in 'i8_factor_4' -; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 
0 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 %tmp7, ptr %tmp3, align 1 ; VF_16-LABEL: Checking a loop in 'i8_factor_4' -; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, 
align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1 -; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1 +; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i8, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i8, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i8, ptr %tmp2, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i8, ptr %tmp3, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp4, ptr %tmp0, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp5, ptr %tmp1, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp6, ptr %tmp2, align 1 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 %tmp7, ptr %tmp3, align 1 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0 @@ -736,41 +736,41 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i16_factor_4' -; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: 
store i16 %tmp5, ptr %tmp1, align 2 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_4-LABEL: Checking a loop in 'i16_factor_4' -; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_4: Found 
an estimated cost of 24 for VF 4 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_8-LABEL: Checking a loop in 'i16_factor_4' -; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For 
instruction: %tmp6 = load i16, ptr %tmp2, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp4, ptr %tmp0, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 %tmp7, ptr %tmp3, align 2 ; VF_16-LABEL: Checking a loop in 'i16_factor_4' -; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp4, ptr %tmp0, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2 +; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i16, ptr %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i16, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i16, ptr %tmp2, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i16, ptr %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store 
i16 %tmp4, ptr %tmp0, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp5, ptr %tmp1, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp6, ptr %tmp2, align 2 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 %tmp7, ptr %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i16.4, ptr %data, i64 %i, i32 0 @@ -799,41 +799,41 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i32_factor_4' -; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_2-NEXT: Found an estimated cost of 48 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 
For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 %tmp7, ptr %tmp3, align 4 ; VF_4-LABEL: Checking a loop in 'i32_factor_4' -; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_4: Found an estimated cost of 8 for VF 4 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store i32 %tmp7, ptr %tmp3, align 4 ; 
VF_8-LABEL: Checking a loop in 'i32_factor_4' -; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 192 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 %tmp7, ptr %tmp3, align 4 ; VF_16-LABEL: Checking a loop in 'i32_factor_4' -; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 
-; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 384 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4 +; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp4 = load i32, ptr %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp5 = load i32, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp6 = load i32, ptr %tmp2, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: %tmp7 = load i32, ptr %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp4, ptr %tmp0, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp5, ptr %tmp1, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp6, ptr %tmp2, align 4 +; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 %tmp7, ptr %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i32.4, ptr %data, i64 %i, i32 0 @@ -862,41 +862,41 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'i64_factor_4' -; VF_2: Found an estimated cost of 88 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For 
instruction: %tmp6 = load i64, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 88 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_2: Found an estimated cost of 22 for VF 2 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 22 for VF 2 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_4-LABEL: Checking a loop in 'i64_factor_4' -; VF_4: Found an estimated cost of 176 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8 
-; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 176 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_4: Found an estimated cost of 44 for VF 4 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 44 for VF 4 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_8-LABEL: Checking a loop in 'i64_factor_4' -; VF_8: Found an estimated cost of 352 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 352 for VF 
8 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_8: Found an estimated cost of 88 for VF 8 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 88 for VF 8 For instruction: store i64 %tmp7, ptr %tmp3, align 8 ; VF_16-LABEL: Checking a loop in 'i64_factor_4' -; VF_16: Found an estimated cost of 704 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 704 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8 +; VF_16: Found an estimated cost of 176 for VF 16 For instruction: %tmp4 = load i64, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp5 = 
load i64, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp6 = load i64, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: %tmp7 = load i64, ptr %tmp3, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp4, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp5, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp6, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 176 for VF 16 For instruction: store i64 %tmp7, ptr %tmp3, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.4, ptr %data, i64 %i, i32 0 @@ -997,14 +997,14 @@ entry: ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float %tmp6, ptr %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 20 for VF 2 For instruction: store float %tmp7, ptr %tmp3, align 4 ; VF_4-LABEL: Checking a loop in 'f32_factor_4' -; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4 -; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4 +; VF_4: Found an estimated cost of 8 for 
VF 4 For instruction: %tmp4 = load float, ptr %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp5 = load float, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp6 = load float, ptr %tmp2, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: %tmp7 = load float, ptr %tmp3, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp4, ptr %tmp0, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp5, ptr %tmp1, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp6, ptr %tmp2, align 4 +; VF_4-NEXT: Found an estimated cost of 8 for VF 4 For instruction: store float %tmp7, ptr %tmp3, align 4 ; VF_8-LABEL: Checking a loop in 'f32_factor_4' ; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp4 = load float, ptr %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, ptr %tmp1, align 4 @@ -1051,41 +1051,41 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in 'f64_factor_4' -; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp4 = load double, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: 
store double %tmp7, ptr %tmp3, align 8 +; VF_2: Found an estimated cost of 6 for VF 2 For instruction: %tmp4 = load double, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp5 = load double, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp6 = load double, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: %tmp7 = load double, ptr %tmp3, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_2-NEXT: Found an estimated cost of 6 for VF 2 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_4-LABEL: Checking a loop in 'f64_factor_4' -; VF_4: Found an estimated cost of 48 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_4-NEXT: Found an estimated cost of 48 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_4: Found an estimated cost of 12 for VF 4 For instruction: %tmp4 = load double, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp5 = 
load double, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp6 = load double, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: %tmp7 = load double, ptr %tmp3, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_4-NEXT: Found an estimated cost of 12 for VF 4 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_8-LABEL: Checking a loop in 'f64_factor_4' -; VF_8: Found an estimated cost of 96 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 96 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_8: Found an estimated cost of 24 for VF 8 For instruction: %tmp4 = load double, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp5 = load double, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: %tmp6 = load double, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: 
%tmp7 = load double, ptr %tmp3, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_8-NEXT: Found an estimated cost of 24 for VF 8 For instruction: store double %tmp7, ptr %tmp3, align 8 ; VF_16-LABEL: Checking a loop in 'f64_factor_4' -; VF_16: Found an estimated cost of 192 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8 -; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 192 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8 +; VF_16: Found an estimated cost of 48 for VF 16 For instruction: %tmp4 = load double, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp5 = load double, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp6 = load double, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: %tmp7 = load double, ptr %tmp3, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp4, ptr %tmp0, align 8 +; VF_16-NEXT: Found an estimated 
cost of 48 for VF 16 For instruction: store double %tmp5, ptr %tmp1, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp6, ptr %tmp2, align 8 +; VF_16-NEXT: Found an estimated cost of 48 for VF 16 For instruction: store double %tmp7, ptr %tmp3, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.4, ptr %data, i64 %i, i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll index 6d309c4453c7e14..df02cb741700e55 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll @@ -176,7 +176,7 @@ declare i16 @llvm.umax.i16(i16, i16) ; Test case for https://github.com/llvm/llvm-project/issues/106780. define i32 @cost_of_exit_branch_and_cond_insts(ptr %a, ptr %b, i1 %c, i16 %x) #0 { ; CHECK-LABEL: define i32 @cost_of_exit_branch_and_cond_insts( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i1 [[C:%.*]], i16 [[X:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i1 [[C:%.*]], i16 [[X:%.*]]) #[[ATTR2:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[X]] to i32 ; CHECK-NEXT: [[UMAX3:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP0]], i32 111) @@ -404,6 +404,95 @@ exit: ret void } +; Test for https://github.com/llvm/llvm-project/issues/108098. 
+define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %src, ptr noalias %dst) #0 { +; CHECK-LABEL: define void @gather_interleave_group_with_dead_insert_pos( +; CHECK-SAME: i64 [[N:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) +; CHECK-NEXT: [[TMP0:%.*]] = add nuw i64 [[SMAX]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 16, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x i8>, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <32 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: 
[[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> poison, <8 x i32> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC2]], <32 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[STRIDED_VEC4]] to <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = zext <8 x i8> [[STRIDED_VEC5]] to <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[VEC_IND]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[DST]], <8 x i64> [[STEP_ADD]] +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP11]], <8 x ptr> [[TMP13]], i32 4, <8 x i1> ) +; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> [[TMP12]], <8 x ptr> [[TMP14]], i32 4, <8 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD]], +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP_SRC_0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L_DEAD:%.*]] = load i8, ptr [[GEP_SRC_0]], align 1 +; CHECK-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]] +; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[L_1]] 
to i32 +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 [[EXT]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 4 +; CHECK-NEXT: [[EC:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %gep.src.0 = getelementptr i8, ptr %src, i64 %iv + %l.dead = load i8, ptr %gep.src.0, align 1 + %iv.1 = add i64 %iv, 1 + %gep.src.1 = getelementptr i8, ptr %src, i64 %iv.1 + %l.1 = load i8, ptr %gep.src.1, align 1 + %ext = zext i8 %l.1 to i32 + %gep.dst = getelementptr i32, ptr %dst, i64 %iv + store i32 %ext, ptr %gep.dst, align 4 + %iv.next = add nsw i64 %iv, 4 + %ec = icmp slt i64 %iv, %N + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+64bit,+v" } + ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -425,4 +514,6 @@ exit: ; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]]} ; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} ; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} +; CHECK: [[LOOP20]] = distinct !{[[LOOP20]], [[META1]], [[META2]]} +; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll index 88eb9c4d27e33a6..13c443c4d579f4c 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs-03.ll @@ -6,7 +6,8 @@ ; Check cost function for <8 x i128> store interleave group. 
; CHECK: LV: Checking a loop in 'fun' -; CHECK: LV: Found an estimated cost of 8 for VF 4 For instruction: store i128 8721036757475490113 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i128 8721036757475490113 +; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction: store i128 8721036757475490113 define noundef i32 @fun(i32 %argc, ptr nocapture readnone %argv) { entry: From 956591bec5ffe5d293c4ac8044686f56ba8c118c Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Sep 2024 13:08:48 -0700 Subject: [PATCH 38/94] [SandboxIR] Add remaining SelectInst methods and track swapValues() (#108114) --- llvm/include/llvm/SandboxIR/SandboxIR.h | 15 ++++++++++++- llvm/lib/SandboxIR/SandboxIR.cpp | 6 +++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 11 +++++++++ llvm/unittests/SandboxIR/TrackerTest.cpp | 26 ++++++++++++++++++++++ 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 2fdbbbd094650f4..88884683f591a47 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -1506,6 +1506,10 @@ class SelectInst : public SingleLLVMInstructionImpl { static Value *create(Value *Cond, Value *True, Value *False, BasicBlock *InsertAtEnd, Context &Ctx, const Twine &Name = ""); + + const Value *getCondition() const { return getOperand(0); } + const Value *getTrueValue() const { return getOperand(1); } + const Value *getFalseValue() const { return getOperand(2); } Value *getCondition() { return getOperand(0); } Value *getTrueValue() { return getOperand(1); } Value *getFalseValue() { return getOperand(2); } @@ -1513,7 +1517,16 @@ class SelectInst : public SingleLLVMInstructionImpl { void setCondition(Value *New) { setOperand(0, New); } void setTrueValue(Value *New) { setOperand(1, New); } void setFalseValue(Value *New) { setOperand(2, New); } - void swapValues() { cast(Val)->swapValues(); } + void 
swapValues(); + + /// Return a string if the specified operands are invalid for a select + /// operation, otherwise return null. + static const char *areInvalidOperands(Value *Cond, Value *True, + Value *False) { + return llvm::SelectInst::areInvalidOperands(Cond->Val, True->Val, + False->Val); + } + /// For isa/dyn_cast. static bool classof(const Value *From); }; diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index 18fdcda15a1a911..df3839518c9d089 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -662,6 +662,12 @@ Value *SelectInst::create(Value *Cond, Value *True, Value *False, return createCommon(Cond, True, False, Name, Builder, Ctx); } +void SelectInst::swapValues() { + Ctx.getTracker().emplaceIfTracking(getOperandUse(1), + getOperandUse(2)); + cast(Val)->swapValues(); +} + bool SelectInst::classof(const Value *From) { return From->getSubclassID() == ClassID::Select; } diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index b76d24dc297b965..148afd9483d5681 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1354,14 +1354,18 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) { auto *BB = &*F->begin(); auto It = BB->begin(); auto *Select = cast(&*It++); + const auto *ConstSelect = Select; // To test the const getters. auto *Ret = &*It++; // Check getCondition(). EXPECT_EQ(Select->getCondition(), Cond0); + EXPECT_EQ(ConstSelect->getCondition(), Cond0); // Check getTrueValue(). EXPECT_EQ(Select->getTrueValue(), V0); + EXPECT_EQ(ConstSelect->getTrueValue(), V0); // Check getFalseValue(). EXPECT_EQ(Select->getFalseValue(), V1); + EXPECT_EQ(ConstSelect->getFalseValue(), V1); // Check setCondition(). Select->setCondition(Cond1); EXPECT_EQ(Select->getCondition(), Cond1); @@ -1371,6 +1375,13 @@ define void @foo(i1 %c0, i8 %v0, i8 %v1, i1 %c1) { // Check setFalseValue(). 
Select->setFalseValue(V0); EXPECT_EQ(Select->getFalseValue(), V0); + // Check swapValues(). + Select->swapValues(); + EXPECT_EQ(Select->getTrueValue(), V0); + EXPECT_EQ(Select->getFalseValue(), V1); + // Check areInvalidOperands. + EXPECT_EQ(sandboxir::SelectInst::areInvalidOperands(Cond0, V0, V1), nullptr); + EXPECT_NE(sandboxir::SelectInst::areInvalidOperands(V0, V1, Cond0), nullptr); { // Check SelectInst::create() InsertBefore. diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp index a1f39fe958e3515..a1a4117b5e97b11 100644 --- a/llvm/unittests/SandboxIR/TrackerTest.cpp +++ b/llvm/unittests/SandboxIR/TrackerTest.cpp @@ -964,6 +964,32 @@ define void @foo(i32 %cond0, i32 %cond1) { EXPECT_EQ(Switch->findCaseDest(BB1), One); } +TEST_F(TrackerTest, SelectInst) { + parseIR(C, R"IR( +define void @foo(i1 %c0, i8 %v0, i8 %v1) { + %sel = select i1 %c0, i8 %v0, i8 %v1 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + sandboxir::Function *F = Ctx.createFunction(LLVMF); + auto *V0 = F->getArg(1); + auto *V1 = F->getArg(2); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Select = cast(&*It++); + + // Check tracking for swapValues. + Ctx.save(); + Select->swapValues(); + EXPECT_EQ(Select->getTrueValue(), V1); + EXPECT_EQ(Select->getFalseValue(), V0); + Ctx.revert(); + EXPECT_EQ(Select->getTrueValue(), V0); + EXPECT_EQ(Select->getFalseValue(), V1); +} + TEST_F(TrackerTest, ShuffleVectorInst) { parseIR(C, R"IR( define void @foo(<2 x i8> %v1, <2 x i8> %v2) { From b3f3c0c63358b412348022d10308b97332d02bcd Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Wed, 11 Sep 2024 11:44:28 -0700 Subject: [PATCH 39/94] [clang][AArch64] Put soft-float ABI checks under isSoftFloat(). 
NFC --- clang/lib/CodeGen/Targets/AArch64.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index 2f119feb93aaf37..ec617eec67192cc 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -500,7 +500,7 @@ bool AArch64SwiftABIInfo::isLegalVectorType(CharUnits VectorSize, bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const { // For the soft-float ABI variant, no types are considered to be homogeneous // aggregates. - if (Kind == AArch64ABIKind::AAPCSSoft) + if (isSoftFloat()) return false; // Homogeneous aggregates for AAPCS64 must have base types of a floating @@ -555,8 +555,8 @@ RValue AArch64ABIInfo::EmitAAPCSVAArg(Address VAListAddr, QualType Ty, BaseTy = ArrTy->getElementType(); NumRegs = ArrTy->getNumElements(); } - bool IsFPR = Kind != AArch64ABIKind::AAPCSSoft && - (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy()); + bool IsFPR = + !isSoftFloat() && (BaseTy->isFloatingPointTy() || BaseTy->isVectorTy()); // The AArch64 va_list type and handling is specified in the Procedure Call // Standard, section B.4: From 9c0ba62010b5850adf6b4c3979128aa6e9189aca Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 11 Sep 2024 13:39:50 -0700 Subject: [PATCH 40/94] [ctx_prof] Relax the "profile use" case around `PGOOpt` (#108265) `PGOOpt` could have a value if, for instance, debug info for profiling is requested. Relaxing the requirement, for now, following that eventually we would factor `PGOOpt` to better capture the supported interplay between the various profiling options. 
--- llvm/lib/Passes/PassBuilderPipelines.cpp | 6 +++--- llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 6ebf262379c2fb1..8f151a99b11709a 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1181,8 +1181,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // Enable contextual profiling instrumentation. const bool IsCtxProfGen = !IsPGOInstrGen && IsPreLink && PGOCtxProfLoweringPass::isCtxIRPGOInstrEnabled(); - const bool IsCtxProfUse = !UseCtxProfile.empty() && !PGOOpt && - Phase == ThinOrFullLTOPhase::ThinLTOPreLink; + const bool IsCtxProfUse = + !UseCtxProfile.empty() && Phase == ThinOrFullLTOPhase::ThinLTOPreLink; if (IsPGOInstrGen || IsPGOInstrUse || IsMemprofUse || IsCtxProfGen || IsCtxProfUse) @@ -1673,7 +1673,7 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level) { // In pre-link, for ctx prof use, we stop here with an instrumented IR. We let // thinlto use the contextual info to perform imports; then use the contextual // profile in the post-thinlink phase. - if (!UseCtxProfile.empty() && !PGOOpt) { + if (!UseCtxProfile.empty()) { addRequiredLTOPreLinkPasses(MPM); return MPM; } diff --git a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll index 7959e4d0760edbe..56ed92ea1b7ffbe 100644 --- a/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll +++ b/llvm/test/Transforms/PGOProfile/ctx-prof-use-prelink.ll @@ -4,6 +4,7 @@ ; There is no scenario currently of doing ctx profile use without thinlto. 
; ; RUN: opt -passes='thinlto-pre-link' -use-ctx-profile=something_that_does_not_exist %s -S | FileCheck %s +; RUN: opt -debug-info-for-profiling -passes='thinlto-pre-link' -use-ctx-profile=something_that_does_not_exist %s -S | FileCheck %s declare void @bar() From c2b93e0671d8cfd6b1a24c6e1d7be290125b8974 Mon Sep 17 00:00:00 2001 From: Xiaofeng Tian <110771974+txff99@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:42:18 -0700 Subject: [PATCH 41/94] update llvm-dis header with available options (#108073) update llvm-dis header options Closes #108069 --- llvm/tools/llvm-dis/llvm-dis.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp index d28af85bc739eb4..a3a62f042ddbd1f 100644 --- a/llvm/tools/llvm-dis/llvm-dis.cpp +++ b/llvm/tools/llvm-dis/llvm-dis.cpp @@ -11,7 +11,23 @@ // llvm-dis [options] x.bc - Read LLVM bitcode from the x.bc file, write asm // to the x.ll file. // Options: -// --help - Output information about command line switches +// +// Color Options: +// --color - Use colors in output (default=autodetect) +// +// Disassembler Options: +// -f - Enable binary output on terminals +// --materialize-metadata - Load module without materializing metadata, +// then materialize only the metadata +// -o - Override output filename +// --show-annotations - Add informational comments to the .ll file +// +// Generic Options: +// --help - Display available options +// (--help-hidden for more) +// --help-list - Display list of available options +// (--help-list-hidden for more) +// --version - Display the version of this program // //===----------------------------------------------------------------------===// From 5c7957dd4f12e7c9128068c5ed92464cdc59947e Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 11 Sep 2024 22:21:52 +0100 Subject: [PATCH 42/94] [AArch64] Allow i16->f64 uitofp tbl shuffles Just as we convert i8->f32 uitofp to tbl to perform the zext, 
we can do the same for i16->f64. --- .../Target/AArch64/AArch64ISelLowering.cpp | 15 +- .../CodeGen/AArch64/fp-conversion-to-tbl.ll | 175 ++++++++++++++++++ 2 files changed, 185 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 516d0cf33aaeb04..47da9d577cd827c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16615,7 +16615,7 @@ bool AArch64TargetLowering::shouldSinkOperands( static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl &Mask) { - if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64) + if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64) return false; assert(DstWidth % SrcWidth == 0 && @@ -16649,7 +16649,7 @@ static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, return nullptr; auto *FirstEltZero = Builder.CreateInsertElement( - PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); + PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0)); Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask); Result = Builder.CreateBitCast(Result, DstTy); if (DstTy != ZExtTy) @@ -16670,7 +16670,7 @@ static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, return nullptr; auto *FirstEltZero = Builder.CreateInsertElement( - PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0)); + PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0)); return Builder.CreateShuffleVector(Op, FirstEltZero, Mask); } @@ -16847,6 +16847,9 @@ bool AArch64TargetLowering::optimizeExtendOrTruncateConversion( return false; } + if (DstTy->getScalarSizeInBits() >= 64) + return false; + IRBuilder<> Builder(ZExt); Value *Result = createTblShuffleForZExt( Builder, ZExt->getOperand(0), cast(ZExt->getType()), @@ -16859,8 +16862,10 @@ bool 
AArch64TargetLowering::optimizeExtendOrTruncateConversion( } auto *UIToFP = dyn_cast(I); - if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) && - DstTy->getElementType()->isFloatTy()) { + if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) && + DstTy->getElementType()->isFloatTy()) || + (SrcTy->getElementType()->isIntegerTy(16) && + DstTy->getElementType()->isDoubleTy()))) { IRBuilder<> Builder(I); Value *ZExt = createTblShuffleForZExt( Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy), diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll index 0a3b9a070c2b321..d9d80f1cb50ee1b 100644 --- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll @@ -648,3 +648,178 @@ loop: exit: ret void } + +define void @uitofp_v8i16_to_v8f64(ptr nocapture noundef readonly %x, ptr nocapture noundef writeonly %y, i32 noundef %n) { +; CHECK-LABEL: uitofp_v8i16_to_v8f64: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh22: +; CHECK-NEXT: adrp x8, lCPI10_0@PAGE +; CHECK-NEXT: Lloh23: +; CHECK-NEXT: adrp x9, lCPI10_1@PAGE +; CHECK-NEXT: Lloh24: +; CHECK-NEXT: adrp x10, lCPI10_2@PAGE +; CHECK-NEXT: Lloh25: +; CHECK-NEXT: ldr q0, [x8, lCPI10_0@PAGEOFF] +; CHECK-NEXT: Lloh26: +; CHECK-NEXT: adrp x8, lCPI10_3@PAGE +; CHECK-NEXT: Lloh27: +; CHECK-NEXT: ldr q1, [x9, lCPI10_1@PAGEOFF] +; CHECK-NEXT: Lloh28: +; CHECK-NEXT: ldr q2, [x10, lCPI10_2@PAGEOFF] +; CHECK-NEXT: Lloh29: +; CHECK-NEXT: ldr q3, [x8, lCPI10_3@PAGEOFF] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB10_1: ; %vector.body +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ldr q4, [x0, x8] +; CHECK-NEXT: add x9, x1, x8 +; CHECK-NEXT: add x8, x8, #64 +; CHECK-NEXT: cmp x8, #2, lsl #12 ; =8192 +; CHECK-NEXT: tbl.16b v5, { v4 }, v0 +; CHECK-NEXT: tbl.16b v6, { v4 }, v1 +; CHECK-NEXT: tbl.16b v7, { v4 }, v2 +; CHECK-NEXT: tbl.16b v4, { v4 }, v3 +; CHECK-NEXT: ucvtf.2d v5, v5 +; CHECK-NEXT: 
ucvtf.2d v6, v6 +; CHECK-NEXT: ucvtf.2d v7, v7 +; CHECK-NEXT: ucvtf.2d v4, v4 +; CHECK-NEXT: stp q6, q5, [x9, #32] +; CHECK-NEXT: stp q4, q7, [x9] +; CHECK-NEXT: b.ne LBB10_1 +; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh26, Lloh29 +; CHECK-NEXT: .loh AdrpLdr Lloh24, Lloh28 +; CHECK-NEXT: .loh AdrpLdr Lloh23, Lloh27 +; CHECK-NEXT: .loh AdrpAdrp Lloh22, Lloh26 +; CHECK-NEXT: .loh AdrpLdr Lloh22, Lloh25 +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %.idx = shl nsw i64 %index, 3 + %g = getelementptr inbounds i8, ptr %x, i64 %.idx + %wide.vec = load <8 x i16>, ptr %g, align 2 + %u = uitofp <8 x i16> %wide.vec to <8 x double> + %h = getelementptr inbounds double, ptr %y, i64 %index + store <8 x double> %u, ptr %h, align 8 + %index.next = add nuw i64 %index, 8 + %c = icmp eq i64 %index.next, 1024 + br i1 %c, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + +define void @uitofp_ld4_v32i16_to_v8f64(ptr nocapture noundef readonly %x, ptr nocapture noundef writeonly %y, i32 noundef %n) { +; CHECK-LABEL: uitofp_ld4_v32i16_to_v8f64: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: Lloh30: +; CHECK-NEXT: adrp x8, lCPI11_0@PAGE +; CHECK-NEXT: Lloh31: +; CHECK-NEXT: adrp x9, lCPI11_1@PAGE +; CHECK-NEXT: Lloh32: +; CHECK-NEXT: adrp x10, lCPI11_2@PAGE +; CHECK-NEXT: Lloh33: +; CHECK-NEXT: ldr q0, [x8, lCPI11_0@PAGEOFF] +; CHECK-NEXT: Lloh34: +; CHECK-NEXT: adrp x8, lCPI11_3@PAGE +; CHECK-NEXT: Lloh35: +; CHECK-NEXT: ldr q1, [x9, lCPI11_1@PAGEOFF] +; CHECK-NEXT: Lloh36: +; CHECK-NEXT: ldr q2, [x10, lCPI11_2@PAGEOFF] +; CHECK-NEXT: Lloh37: +; CHECK-NEXT: ldr q3, [x8, lCPI11_3@PAGEOFF] +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: LBB11_1: ; %vector.body +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add x9, x0, x8 +; CHECK-NEXT: ldp q5, q4, [x9, #32] +; CHECK-NEXT: ldp q7, q6, [x9] +; CHECK-NEXT: add x9, x1, x8 +; CHECK-NEXT: 
add x8, x8, #64 +; CHECK-NEXT: tbl.16b v16, { v4 }, v0 +; CHECK-NEXT: tbl.16b v17, { v5 }, v0 +; CHECK-NEXT: tbl.16b v21, { v4 }, v1 +; CHECK-NEXT: tbl.16b v18, { v6 }, v0 +; CHECK-NEXT: tbl.16b v19, { v7 }, v0 +; CHECK-NEXT: tbl.16b v20, { v7 }, v1 +; CHECK-NEXT: tbl.16b v22, { v5 }, v1 +; CHECK-NEXT: tbl.16b v23, { v5 }, v2 +; CHECK-NEXT: tbl.16b v24, { v4 }, v2 +; CHECK-NEXT: tbl.16b v25, { v7 }, v2 +; CHECK-NEXT: tbl.16b v5, { v5 }, v3 +; CHECK-NEXT: tbl.16b v4, { v4 }, v3 +; CHECK-NEXT: tbl.16b v7, { v7 }, v3 +; CHECK-NEXT: tbl.16b v26, { v6 }, v1 +; CHECK-NEXT: tbl.16b v27, { v6 }, v2 +; CHECK-NEXT: tbl.16b v6, { v6 }, v3 +; CHECK-NEXT: ucvtf.2d v17, v17 +; CHECK-NEXT: ucvtf.2d v16, v16 +; CHECK-NEXT: ucvtf.2d v19, v19 +; CHECK-NEXT: ucvtf.2d v18, v18 +; CHECK-NEXT: ucvtf.2d v22, v22 +; CHECK-NEXT: ucvtf.2d v23, v23 +; CHECK-NEXT: ucvtf.2d v5, v5 +; CHECK-NEXT: ucvtf.2d v21, v21 +; CHECK-NEXT: ucvtf.2d v24, v24 +; CHECK-NEXT: ucvtf.2d v4, v4 +; CHECK-NEXT: cmp x8, #2, lsl #12 ; =8192 +; CHECK-NEXT: ucvtf.2d v20, v20 +; CHECK-NEXT: ucvtf.2d v25, v25 +; CHECK-NEXT: ucvtf.2d v7, v7 +; CHECK-NEXT: ucvtf.2d v26, v26 +; CHECK-NEXT: ucvtf.2d v27, v27 +; CHECK-NEXT: ucvtf.2d v6, v6 +; CHECK-NEXT: fadd.2d v17, v22, v17 +; CHECK-NEXT: fadd.2d v5, v23, v5 +; CHECK-NEXT: fadd.2d v16, v21, v16 +; CHECK-NEXT: fadd.2d v4, v24, v4 +; CHECK-NEXT: fadd.2d v19, v20, v19 +; CHECK-NEXT: fadd.2d v7, v25, v7 +; CHECK-NEXT: fadd.2d v18, v26, v18 +; CHECK-NEXT: fadd.2d v6, v27, v6 +; CHECK-NEXT: fadd.2d v5, v17, v5 +; CHECK-NEXT: fadd.2d v4, v16, v4 +; CHECK-NEXT: fadd.2d v7, v19, v7 +; CHECK-NEXT: fadd.2d v6, v18, v6 +; CHECK-NEXT: stp q5, q4, [x9, #32] +; CHECK-NEXT: stp q7, q6, [x9] +; CHECK-NEXT: b.ne LBB11_1 +; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh34, Lloh37 +; CHECK-NEXT: .loh AdrpLdr Lloh32, Lloh36 +; CHECK-NEXT: .loh AdrpLdr Lloh31, Lloh35 +; CHECK-NEXT: .loh AdrpAdrp Lloh30, Lloh34 +; CHECK-NEXT: .loh AdrpLdr Lloh30, 
Lloh33 +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %.idx = shl nsw i64 %index, 3 + %0 = getelementptr inbounds i8, ptr %x, i64 %.idx + %wide.vec = load <32 x i16>, ptr %0, align 2 + %strided.vec = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> + %strided.vec36 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> + %strided.vec37 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> + %strided.vec38 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> + %1 = uitofp <8 x i16> %strided.vec to <8 x double> + %2 = uitofp <8 x i16> %strided.vec36 to <8 x double> + %3 = fadd fast <8 x double> %2, %1 + %4 = uitofp <8 x i16> %strided.vec37 to <8 x double> + %5 = fadd fast <8 x double> %3, %4 + %6 = uitofp <8 x i16> %strided.vec38 to <8 x double> + %7 = fadd fast <8 x double> %5, %6 + %8 = getelementptr inbounds double, ptr %y, i64 %index + store <8 x double> %7, ptr %8, align 8 + %index.next = add nuw i64 %index, 8 + %9 = icmp eq i64 %index.next, 1024 + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} + From 63d8bd27275458ccd5fd4010671ad781b0a3698c Mon Sep 17 00:00:00 2001 From: ChiaHungDuan Date: Wed, 11 Sep 2024 14:22:47 -0700 Subject: [PATCH 43/94] =?UTF-8?q?[scudo]=20Add=20thread-safety=20annotatio?= =?UTF-8?q?n=20on=20getMemoryGroupFragmentationIn=E2=80=A6=20(#108277)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add thread-safety annotation on getMemoryGroupFragmentationInfoInRegion --- compiler-rt/lib/scudo/standalone/primary64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 045070d0e34de9d..a3b6e309ed3fcea 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -1207,7 +1207,7 @@ 
template class SizeClassAllocator64 { void getMemoryGroupFragmentationInfoInRegion(RegionInfo *Region, uptr ClassId, ScopedString *Str) - REQUIRES(Region->MMLock) { + REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) { const uptr BlockSize = getSizeByClassId(ClassId); const uptr AllocatedUserEnd = Region->MemMapInfo.AllocatedUser + Region->RegionBeg; From 60efbe99cb2ad19373d07de4806472094258508e Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Wed, 11 Sep 2024 15:33:22 -0600 Subject: [PATCH 44/94] [NFC][rtsan] Docs of how to disable rtsan (#107707) --- clang/docs/RealtimeSanitizer.rst | 50 ++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/clang/docs/RealtimeSanitizer.rst b/clang/docs/RealtimeSanitizer.rst index 799cd43509c6e62..5e281a2a3579077 100644 --- a/clang/docs/RealtimeSanitizer.rst +++ b/clang/docs/RealtimeSanitizer.rst @@ -83,3 +83,53 @@ non-zero exit code. #13 0x00010230dd64 in main main.cpp:9 #14 0x0001958960dc () #15 0x2f557ffffffffffc () + +Disabling +--------- + +In some circumstances, you may want to suppress error reporting in a specific scope. + +In C++, this is achieved via ``__rtsan::ScopedDisabler``. Within the scope where the ``ScopedDisabler`` object is instantiated, all sanitizer error reports are suppressed. This suppression applies to the current scope as well as all invoked functions, including any functions called transitively. + +.. code-block:: c++ + + #include + + void process(const std::vector& buffer) [[clang::nonblocking]] { + { + __rtsan::ScopedDisabler d; + ... + } + } + +If RealtimeSanitizer is not enabled at compile time (i.e., the code is not compiled with the ``-fsanitize=realtime`` flag), the ``ScopedDisabler`` is compiled as a no-op. + +In C, you can use the ``__rtsan_disable()`` and ``rtsan_enable()`` functions to manually disable and re-enable RealtimeSanitizer checks. + +.. 
code-block:: c++ + + #include + + int process(const float* buffer) [[clang::nonblocking]] + { + { + __rtsan_disable(); + + ... + + __rtsan_enable(); + } + } + +Each call to ``__rtsan_disable()`` must be paired with a subsequent call to ``__rtsan_enable()`` to restore normal sanitizer functionality. If a corresponding ``rtsan_enable()`` call is not made, the behavior is undefined. + +Compile-time sanitizer detection +-------------------------------- + +Clang provides the pre-processor macro ``__has_feature`` which may be used to detect if RealtimeSanitizer is enabled at compile-time. + +.. code-block:: c++ + + #if defined(__has_feature) && __has_feature(realtime_sanitizer) + ... + #endif From 54c6e1c3f51758469cc06cbcc2ad28af210fc004 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 11 Sep 2024 14:31:42 -0700 Subject: [PATCH 45/94] [SLP] Move a non-power-of-two bailout down slightly The first part of CheckForShuffledLoads isn't doing any subvector analysis, so it's perfectly safe for arbitrary VL. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c89a50fc7bd4297..00d9f2909d71e2e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4819,12 +4819,6 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( // representation is better than just gather. auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment, bool ProfitableGatherPointers) { - // FIXME: The following code has not been updated for non-power-of-2 - // vectors. The splitting logic here does not cover the original - // vector if the vector factor is not a power of two. FIXME - if (!has_single_bit(VL.size())) - return false; - // Compare masked gather cost and loads + insert subvector costs. 
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto [ScalarGEPCost, VectorGEPCost] = @@ -4874,6 +4868,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( constexpr unsigned ListLimit = 4; if (!TryRecursiveCheck || VL.size() < ListLimit) return MaskedGatherCost - GatherCost >= -SLPCostThreshold; + + // FIXME: The following code has not been updated for non-power-of-2 + // vectors. The splitting logic here does not cover the original + // vector if the vector factor is not a power of two. FIXME + if (!has_single_bit(VL.size())) + return false; + unsigned Sz = DL->getTypeSizeInBits(ScalarTy); unsigned MinVF = getMinVF(2 * Sz); DemandedElts.clearAllBits(); From ec7c8cd45fa3ca8cc5584caee4b1eaf843294af5 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Wed, 11 Sep 2024 15:34:45 -0600 Subject: [PATCH 46/94] [compiler-rt][NFC] Add preprocessor definitions for 64 bit file interceptors that were missing (#108059) These are needed in #108057 --- .../lib/sanitizer_common/sanitizer_platform_interceptors.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 7d7ed9bc07ccfea..e71a6bcd6a83715 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -183,6 +183,11 @@ #define SANITIZER_INTERCEPT_FPUTS SI_POSIX #define SANITIZER_INTERCEPT_PUTS SI_POSIX +#define SANITIZER_INTERCEPT_CREAT64 (SI_GLIBC || SI_SOLARIS32) +#define SANITIZER_INTERCEPT_FCNTL64 (SI_GLIBC || SI_SOLARIS32) +#define SANITIZER_INTERCEPT_OPEN64 (SI_GLIBC || SI_SOLARIS32) +#define SANITIZER_INTERCEPT_OPENAT64 (SI_GLIBC || SI_SOLARIS32) + #define SANITIZER_INTERCEPT_PREAD64 (SI_GLIBC || SI_SOLARIS32) #define SANITIZER_INTERCEPT_PWRITE64 (SI_GLIBC || SI_SOLARIS32) From 4618b67b48447ed924bc195cfe735a73841e312c Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 11 
Sep 2024 14:36:10 -0700 Subject: [PATCH 47/94] [libc][bazel] Enable epoll_pwait2 on bazel (#108254) The wrapper for epoll_pwait2 has been difficult to enable since it requires a very new version of the linux kernel (5.11). On cmake we still need to create a mechanism to check if we can build it, but our current bazel users are all on a new enough kernel version we can just enable it. --- .../llvm-project-overlay/libc/BUILD.bazel | 42 +++++++++---------- .../libc/test/src/sys/epoll/BUILD.bazel | 34 +++++++-------- 2 files changed, 36 insertions(+), 40 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index b86fcace5703c79..4be8e17e7df24a6 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -4137,25 +4137,23 @@ libc_function( ], ) -#TODO: Enable once epoll_pwait2 availablilty can be checked first. -# https://github.com/llvm/llvm-project/issues/80060 -# libc_function( -# name = "epoll_pwait2", -# srcs = ["src/sys/epoll/linux/epoll_pwait2.cpp"], -# hdrs = ["src/sys/epoll/epoll_pwait2.h"], -# target_compatible_with = select({ -# "@platforms//os:linux": [], -# "//conditions:default": ["@platforms//:incompatible"], -# }), -# weak = True, -# deps = [ -# ":__support_macros_sanitizer", -# ":__support_osutil_syscall", -# ":errno", -# ":hdr_signal_macros", -# ":hdr_sys_epoll_macros", -# ":types_sigset_t", -# ":types_struct_epoll_event", -# ":types_struct_timespec", -# ], -# ) +libc_function( + name = "epoll_pwait2", + srcs = ["src/sys/epoll/linux/epoll_pwait2.cpp"], + hdrs = ["src/sys/epoll/epoll_pwait2.h"], + target_compatible_with = select({ + "@platforms//os:linux": [], + "//conditions:default": ["@platforms//:incompatible"], + }), + weak = True, + deps = [ + ":__support_macros_sanitizer", + ":__support_osutil_syscall", + ":errno", + ":hdr_signal_macros", + ":hdr_sys_epoll_macros", + ":types_sigset_t", + 
":types_struct_epoll_event", + ":types_struct_timespec", + ], +) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/sys/epoll/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/sys/epoll/BUILD.bazel index b090bde35b88d6e..7fb50403682a745 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/sys/epoll/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/sys/epoll/BUILD.bazel @@ -78,21 +78,19 @@ libc_test( ], ) -#TODO: Enable once epoll_pwait2 availablilty can be checked first. -# https://github.com/llvm/llvm-project/issues/80060 -# libc_test( -# name = "epoll_pwait2_test", -# srcs = ["linux/epoll_pwait2_test.cpp"], -# libc_function_deps = [ -# "//libc:epoll_pwait2", -# "//libc:epoll_create1", -# "//libc:epoll_ctl", -# "//libc:pipe", -# "//libc:close", -# ], -# deps = [ -# "//libc:hdr_sys_epoll_macros", -# "//libc:types_struct_epoll_event", -# "//libc:types_struct_timespec", -# ], -# ) +libc_test( + name = "epoll_pwait2_test", + srcs = ["linux/epoll_pwait2_test.cpp"], + libc_function_deps = [ + "//libc:epoll_pwait2", + "//libc:epoll_create1", + "//libc:epoll_ctl", + "//libc:pipe", + "//libc:close", + ], + deps = [ + "//libc:hdr_sys_epoll_macros", + "//libc:types_struct_epoll_event", + "//libc:types_struct_timespec", + ], +) From a66ce58ac6f338c91cccb9801bca04efae9f3f37 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 11 Sep 2024 14:42:51 -0700 Subject: [PATCH 48/94] [BOLT] Drop suffixes in parsePseudoProbe GUID assignment (#106243) Pseudo probe function records contain GUIDs assigned by the compiler using an IR function name. Thus suffixes added later (e.g. `.llvm.` for internal symbols, `.destroy`/`.resume` for coroutine fragments, and `.cold`/`.warm` for split fragments) cause GUID mismatch. Address that by dropping those suffixes using `getCommonName` which is a parametrized form of `getLTOCommonName`. 
--- bolt/include/bolt/Utils/Utils.h | 5 +++ bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 40 ++++++++++++++++++------ bolt/lib/Utils/Utils.cpp | 12 +++++-- 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/bolt/include/bolt/Utils/Utils.h b/bolt/include/bolt/Utils/Utils.h index 3886c5f8757c08a..9baee7d94066def 100644 --- a/bolt/include/bolt/Utils/Utils.h +++ b/bolt/include/bolt/Utils/Utils.h @@ -41,6 +41,11 @@ std::string getEscapedName(const StringRef &Name); /// Return the unescaped name std::string getUnescapedName(const StringRef &Name); +/// Return a common part for a given \p Name wrt a given \p Suffixes list. +/// Preserve the suffix if \p KeepSuffix is set, only dropping characters +/// following it, otherwise drop the suffix as well. +std::optional getCommonName(const StringRef Name, bool KeepSuffix, + ArrayRef Suffixes); /// LTO-generated function names take a form: /// /// .lto_priv./... diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 4925b4b385d9b1e..e97d522844fc022 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -14,6 +14,7 @@ #include "bolt/Rewrite/MetadataRewriter.h" #include "bolt/Rewrite/MetadataRewriters.h" #include "bolt/Utils/CommandLineOpts.h" +#include "bolt/Utils/Utils.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCPseudoProbe.h" #include "llvm/Support/CommandLine.h" @@ -133,10 +134,19 @@ void PseudoProbeRewriter::parsePseudoProbe() { MCPseudoProbeDecoder::Uint64Set GuidFilter; MCPseudoProbeDecoder::Uint64Map FuncStartAddrs; + SmallVector Suffixes( + {".destroy", ".resume", ".llvm.", ".cold", ".warm"}); for (const BinaryFunction *F : BC.getAllBinaryFunctions()) { for (const MCSymbol *Sym : F->getSymbols()) { - FuncStartAddrs[Function::getGUID(NameResolver::restore(Sym->getName()))] = - F->getAddress(); + StringRef SymName = Sym->getName(); + for (auto Name : {std::optional(NameResolver::restore(SymName)), + 
getCommonName(SymName, false, Suffixes)}) { + if (!Name) + continue; + SymName = *Name; + uint64_t GUID = Function::getGUID(SymName); + FuncStartAddrs[GUID] = F->getAddress(); + } } } Contents = PseudoProbeSection->getContents(); @@ -155,13 +165,25 @@ void PseudoProbeRewriter::parsePseudoProbe() { ProbeDecoder.printProbesForAllAddresses(outs()); } - for (const auto &FuncDesc : ProbeDecoder.getGUID2FuncDescMap()) { - uint64_t GUID = FuncDesc.FuncGUID; - if (!FuncStartAddrs.contains(GUID)) - continue; - BinaryFunction *BF = BC.getBinaryFunctionAtAddress(FuncStartAddrs[GUID]); - assert(BF); - BF->setGUID(GUID); + const GUIDProbeFunctionMap &GUID2Func = ProbeDecoder.getGUID2FuncDescMap(); + // Checks GUID in GUID2Func and returns it if it's present or null otherwise. + auto checkGUID = [&](StringRef SymName) -> uint64_t { + uint64_t GUID = Function::getGUID(SymName); + if (GUID2Func.find(GUID) == GUID2Func.end()) + return 0; + return GUID; + }; + for (BinaryFunction *F : BC.getAllBinaryFunctions()) { + for (const MCSymbol *Sym : F->getSymbols()) { + StringRef SymName = NameResolver::restore(Sym->getName()); + uint64_t GUID = checkGUID(SymName); + std::optional CommonName = + getCommonName(SymName, false, Suffixes); + if (!GUID && CommonName) + GUID = checkGUID(*CommonName); + if (GUID) + F->setGUID(GUID); + } } } diff --git a/bolt/lib/Utils/Utils.cpp b/bolt/lib/Utils/Utils.cpp index 718e97535fd22a2..ecc2f1010a9858c 100644 --- a/bolt/lib/Utils/Utils.cpp +++ b/bolt/lib/Utils/Utils.cpp @@ -66,15 +66,21 @@ std::string getUnescapedName(const StringRef &Name) { return Output; } -std::optional getLTOCommonName(const StringRef Name) { - for (StringRef Suffix : {".__uniq.", ".lto_priv.", ".constprop.", ".llvm."}) { +std::optional getCommonName(const StringRef Name, bool KeepSuffix, + ArrayRef Suffixes) { + for (StringRef Suffix : Suffixes) { size_t LTOSuffixPos = Name.find(Suffix); if (LTOSuffixPos != StringRef::npos) - return Name.substr(0, LTOSuffixPos + Suffix.size()); + 
return Name.substr(0, LTOSuffixPos + (KeepSuffix ? Suffix.size() : 0)); } return std::nullopt; } +std::optional getLTOCommonName(const StringRef Name) { + return getCommonName(Name, true, + {".__uniq.", ".lto_priv.", ".constprop.", ".llvm."}); +} + std::optional readDWARFExpressionTargetReg(StringRef ExprBytes) { uint8_t Opcode = ExprBytes[0]; if (Opcode == dwarf::DW_CFA_def_cfa_expression) From 1797174ea6adab08474658f9c9748991d172321c Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 11 Sep 2024 14:44:06 -0700 Subject: [PATCH 49/94] [NFC][sanitizer] Commit test for #106912 (#108289) Almost all sanitizers already support the test. * Tsan does not use DlsymAlloc yet. * Lsan will support with #106912. memprof,rtsan,nsan are not tested as part of sanitizer_common, but we should keep them here to show up when it happen. --------- Co-authored-by: Xiaofeng Tian <110771974+txff99@users.noreply.github.com> --- .../sanitizer_common/TestCases/dlsym_alloc.c | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c diff --git a/compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c b/compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c new file mode 100644 index 000000000000000..3905ac40ae2dc74 --- /dev/null +++ b/compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c @@ -0,0 +1,61 @@ +// RUN: %clang -O0 %s -o %t && %run %t + +// FIXME: TSAN does not use DlsymAlloc. 
+// UNSUPPORTED: tsan + +// FIXME: https://github.com/llvm/llvm-project/pull/106912 +// XFAIL: lsan + +#include + +const char *test() __attribute__((disable_sanitizer_instrumentation)) { + void *volatile p = malloc(3); + p = realloc(p, 7); + free(p); + + p = calloc(3, 7); + free(p); + + free(NULL); + + return ""; +} + +const char *__asan_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} +const char *__hwasan_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} +const char *__lsan_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} +const char *__memprof_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} +const char *__msan_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} +const char *__nsan_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} +const char *__rtsan_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} +const char *__tsan_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} +const char *__ubsan_default_options() + __attribute__((disable_sanitizer_instrumentation)) { + return test(); +} + +int main(int argc, char **argv) { return 0; } From 3a0ef2a2d3113d162d0133d7384df52abb2e3d92 Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Thu, 12 Sep 2024 05:46:59 +0800 Subject: [PATCH 50/94] [libc++] Reland LWG2921 and LWG2976 (#107960) They were originally implemented in d42db7e083ee0 but reverted later in a2f3c63282330be0. This PR implement both LWG issues again, guarding the removed functions with `_LIBCPP_STD_VER <= 14`, because they should be treated as patches for P0302R1 which was adopted for C++17. 
Fixes #103598 Fixes #103755 --- libcxx/docs/Status/Cxx17Issues.csv | 2 +- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/include/future | 10 ++++++++-- .../ctor2.compile.pass.cpp | 19 ++++++++++++++----- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/libcxx/docs/Status/Cxx17Issues.csv b/libcxx/docs/Status/Cxx17Issues.csv index 7119382eb5cfb43..af3dee9ca50c98d 100644 --- a/libcxx/docs/Status/Cxx17Issues.csv +++ b/libcxx/docs/Status/Cxx17Issues.csv @@ -306,7 +306,7 @@ "`LWG2905 `__","is_constructible_v, P, D const &> should be false when D is not copy constructible","2017-02 (Kona)","|Complete|","","" "`LWG2908 `__","The less-than operator for shared pointers could do more","2017-02 (Kona)","|Complete|","","" "`LWG2911 `__","An is_aggregate type trait is needed","2017-02 (Kona)","|Complete|","","" -"`LWG2921 `__","packaged_task and type-erased allocators","2017-02 (Kona)","|Complete|","","" +"`LWG2921 `__","packaged_task and type-erased allocators","2017-02 (Kona)","|Complete|","20.0","Originally implemented in LLVM 6.0 but reverted later. Old documentation incorrectly said it was implemented." 
"`LWG2934 `__","optional doesn't compare with T","2017-02 (Kona)","|Complete|","","" "","","","","","" "`LWG2901 `__","Variants cannot properly support allocators","2017-07 (Toronto)","|Complete|","","" diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index c79289968811bd1..8e9fc1398bc0950 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -27,7 +27,7 @@ "`LWG2964 `__","Apparently redundant requirement for dynamic_pointer_cast","2017-11 (Albuquerque)","","","" "`LWG2965 `__","Non-existing path::native_string() in filesystem_error::what() specification","2017-11 (Albuquerque)","|Nothing To Do|","","" "`LWG2972 `__","What is ``is_trivially_destructible_v``\ ?","2017-11 (Albuquerque)","|Complete|","","" -"`LWG2976 `__","Dangling uses_allocator specialization for packaged_task","2017-11 (Albuquerque)","|Complete|","","" +"`LWG2976 `__","Dangling uses_allocator specialization for packaged_task","2017-11 (Albuquerque)","|Complete|","20.0","Originally implemented in LLVM 6.0 but reverted later. Old documentation incorrectly said it was implemented." 
"`LWG2977 `__","unordered_meow::merge() has incorrect Throws: clause","2017-11 (Albuquerque)","|Nothing To Do|","","" "`LWG2978 `__","Hash support for pmr::string and friends","2017-11 (Albuquerque)","|Complete|","16.0","" "`LWG2979 `__","aligned_union should require complete object types","2017-11 (Albuquerque)","|Complete|","","" diff --git a/libcxx/include/future b/libcxx/include/future index 9a0eb7971a313d3..9158ea34ee109d9 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -329,7 +329,7 @@ public: template explicit packaged_task(F&& f); template - packaged_task(allocator_arg_t, const Allocator& a, F&& f); + packaged_task(allocator_arg_t, const Allocator& a, F&& f); // removed in C++17 ~packaged_task(); // no copy @@ -356,7 +356,7 @@ public: template void swap(packaged_task&) noexcept; -template struct uses_allocator, Alloc>; +template struct uses_allocator, Alloc>; // removed in C++17 } // std @@ -1606,9 +1606,11 @@ public: template , packaged_task>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI explicit packaged_task(_Fp&& __f) : __f_(std::forward<_Fp>(__f)) {} +# if _LIBCPP_STD_VER <= 14 template , packaged_task>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI packaged_task(allocator_arg_t, const _Allocator& __a, _Fp&& __f) : __f_(allocator_arg_t(), __a, std::forward<_Fp>(__f)), __p_(allocator_arg_t(), __a) {} +# endif // ~packaged_task() = default; // no copy @@ -1696,9 +1698,11 @@ public: _LIBCPP_HIDE_FROM_ABI packaged_task() _NOEXCEPT : __p_(nullptr) {} template , packaged_task>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI explicit packaged_task(_Fp&& __f) : __f_(std::forward<_Fp>(__f)) {} +# if _LIBCPP_STD_VER <= 14 template , packaged_task>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI packaged_task(allocator_arg_t, const _Allocator& __a, _Fp&& __f) : __f_(allocator_arg_t(), __a, std::forward<_Fp>(__f)), __p_(allocator_arg_t(), __a) {} +# endif // ~packaged_task() = default; // no copy @@ -1790,8 +1794,10 @@ swap(packaged_task<_Rp(_ArgTypes...)>& __x, 
packaged_task<_Rp(_ArgTypes...)>& __ __x.swap(__y); } +# if _LIBCPP_STD_VER <= 14 template struct _LIBCPP_TEMPLATE_VIS uses_allocator, _Alloc> : public true_type {}; +# endif template _LIBCPP_HIDE_FROM_ABI future<_Rp> __make_deferred_assoc_state(_Fp&& __f) { diff --git a/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.compile.pass.cpp b/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.compile.pass.cpp index 3ab59909cfafbe3..a3bdd45975c96fd 100644 --- a/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.compile.pass.cpp +++ b/libcxx/test/std/thread/futures/futures.task/futures.task.members/ctor2.compile.pass.cpp @@ -18,11 +18,13 @@ #include #include +#include +#include #include "test_allocator.h" struct A {}; -using PT = std::packaged_task; +using PT = std::packaged_task; using VPT = volatile std::packaged_task; static_assert(!std::is_constructible, VPT>::value, ""); @@ -35,7 +37,14 @@ static_assert(!std::is_constructible static_assert(!std::is_constructible, volatile PA&>::value, ""); static_assert(!std::is_constructible, volatile PA&&>::value, ""); -static_assert( std::is_constructible, const PI&>::value, ""); -static_assert( std::is_constructible, const PI&&>::value, ""); -static_assert( std::is_constructible, volatile PI&>::value, ""); -static_assert( std::is_constructible, volatile PI&&>::value, ""); +#if TEST_STD_VER >= 17 // packaged_task allocator support was removed in C++17 (LWG 2921) +static_assert(!std::is_constructible_v, const PI&>); +static_assert(!std::is_constructible_v, const PI&&>); +static_assert(!std::is_constructible_v, volatile PI&>); +static_assert(!std::is_constructible_v, volatile PI&&>); +#else +static_assert(std::is_constructible, const PI&>::value, ""); +static_assert(std::is_constructible, const PI&&>::value, ""); +static_assert(std::is_constructible, volatile PI&>::value, ""); +static_assert(std::is_constructible, volatile PI&&>::value, ""); +#endif From 
695cb55ccb34a3cf659c12e1cbca1b916372a199 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 11 Sep 2024 17:47:33 -0400 Subject: [PATCH 51/94] [libc++] Remove obsolete header restrictions for _LIBCPP_HAS_NO_THREADS (#107437) The _LIBCPP_HAS_NO_THREADS carve-out does not result in hard errors anymore, but the patch that changed that forgot to update the header restrictions we use to auto-generate several files. We can also remove the restrictions for the no-localization build and no-wide-characters, but doing it is less straightforward so I'm leaving it out of this patch. --- libcxx/include/__std_clang_module | 28 ++++++----------------- libcxx/modules/std.cppm.in | 28 ++++++----------------- libcxx/utils/libcxx/header_information.py | 9 -------- 3 files changed, 14 insertions(+), 51 deletions(-) diff --git a/libcxx/include/__std_clang_module b/libcxx/include/__std_clang_module index 18d6ce6b46c1f6e..572528669a1e154 100644 --- a/libcxx/include/__std_clang_module +++ b/libcxx/include/__std_clang_module @@ -33,9 +33,7 @@ #if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) # include #endif -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #include @@ -101,9 +99,7 @@ # include #endif #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) @@ -120,9 +116,7 @@ # include #endif #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) @@ -153,13 +147,9 @@ # include #endif #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) @@ -175,9 +165,7 @@ #include #include #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include #endif @@ -192,9 +180,7 @@ #endif #include 
#include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #include diff --git a/libcxx/modules/std.cppm.in b/libcxx/modules/std.cppm.in index ad8a639b7f71a1c..653f9933ba67c7a 100644 --- a/libcxx/modules/std.cppm.in +++ b/libcxx/modules/std.cppm.in @@ -23,9 +23,7 @@ module; #if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) # include #endif -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #include @@ -76,9 +74,7 @@ module; # include #endif #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include @@ -94,9 +90,7 @@ module; # include #endif #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) @@ -123,13 +117,9 @@ module; # include #endif #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) @@ -137,9 +127,7 @@ module; #endif #include #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include #endif @@ -152,9 +140,7 @@ module; # include #endif #include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif +#include #include #include #include diff --git a/libcxx/utils/libcxx/header_information.py b/libcxx/utils/libcxx/header_information.py index 166c9a77c08e70d..694402141e1fabb 100644 --- a/libcxx/utils/libcxx/header_information.py +++ b/libcxx/utils/libcxx/header_information.py @@ -31,15 +31,6 @@ "strstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", "syncstream": "!defined(_LIBCPP_HAS_NO_LOCALIZATION)", - # headers with #error directives - "barrier": "!defined(_LIBCPP_HAS_NO_THREADS)", - "future": "!defined(_LIBCPP_HAS_NO_THREADS)", - "latch": "!defined(_LIBCPP_HAS_NO_THREADS)", - "semaphore": 
"!defined(_LIBCPP_HAS_NO_THREADS)", - "shared_mutex": "!defined(_LIBCPP_HAS_NO_THREADS)", - "stop_token": "!defined(_LIBCPP_HAS_NO_THREADS)", - "thread": "!defined(_LIBCPP_HAS_NO_THREADS)", - # headers with #error directives "wchar.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", "wctype.h": "!defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS)", From a85883662ad1904844b1bd5a34bf93d9b383e3b8 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 15:18:58 -0700 Subject: [PATCH 52/94] [clangl[TableGen] Change Diagnostic Emitter to use const RecordKeeper (#108209) Change Diagnostic Emitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- .../TableGen/ClangDiagnosticsEmitter.cpp | 98 ++++++++++--------- clang/utils/TableGen/TableGenBackends.h | 12 ++- 2 files changed, 59 insertions(+), 51 deletions(-) diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index 6ca24a8c74b2ff4..773668caa757479 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -39,12 +39,13 @@ using namespace llvm; namespace { class DiagGroupParentMap { - RecordKeeper &Records; - std::map > Mapping; + const RecordKeeper &Records; + std::map> Mapping; + public: - DiagGroupParentMap(RecordKeeper &records) : Records(records) { - std::vector DiagGroups - = Records.getAllDerivedDefinitions("DiagGroup"); + DiagGroupParentMap(const RecordKeeper &records) : Records(records) { + ArrayRef DiagGroups = + Records.getAllDerivedDefinitions("DiagGroup"); for (unsigned i = 0, e = DiagGroups.size(); i != e; ++i) { std::vector SubGroups = DiagGroups[i]->getValueAsListOfDefs("SubGroups"); @@ -53,7 +54,7 @@ class DiagGroupParentMap { } } - const std::vector &getParents(const Record *Group) { + const std::vector 
&getParents(const Record *Group) { return Mapping[Group]; } }; @@ -68,7 +69,8 @@ getCategoryFromDiagGroup(const Record *Group, // The diag group may the subgroup of one or more other diagnostic groups, // check these for a category as well. - const std::vector &Parents = DiagGroupParents.getParents(Group); + const std::vector &Parents = + DiagGroupParents.getParents(Group); for (unsigned i = 0, e = Parents.size(); i != e; ++i) { CatName = getCategoryFromDiagGroup(Parents[i], DiagGroupParents); if (!CatName.empty()) return CatName; @@ -94,19 +96,19 @@ static std::string getDiagnosticCategory(const Record *R, namespace { class DiagCategoryIDMap { - RecordKeeper &Records; + const RecordKeeper &Records; StringMap CategoryIDs; std::vector CategoryStrings; public: - DiagCategoryIDMap(RecordKeeper &records) : Records(records) { + DiagCategoryIDMap(const RecordKeeper &records) : Records(records) { DiagGroupParentMap ParentInfo(Records); // The zero'th category is "". CategoryStrings.push_back(""); CategoryIDs[""] = 0; - std::vector Diags = - Records.getAllDerivedDefinitions("Diagnostic"); + ArrayRef Diags = + Records.getAllDerivedDefinitions("Diagnostic"); for (unsigned i = 0, e = Diags.size(); i != e; ++i) { std::string Category = getDiagnosticCategory(Diags[i], ParentInfo); if (Category.empty()) continue; // Skip diags with no category. @@ -153,8 +155,8 @@ static bool diagGroupBeforeByName(const Record *LHS, const Record *RHS) { /// Invert the 1-[0/1] mapping of diags to group into a one to many /// mapping of groups to diags in the group. 
-static void groupDiagnostics(const std::vector &Diags, - const std::vector &DiagGroups, +static void groupDiagnostics(ArrayRef Diags, + ArrayRef DiagGroups, std::map &DiagsInGroup) { for (unsigned i = 0, e = Diags.size(); i != e; ++i) { @@ -172,7 +174,7 @@ static void groupDiagnostics(const std::vector &Diags, // Add all DiagGroup's to the DiagsInGroup list to make sure we pick up empty // groups (these are warnings that GCC supports that clang never produces). for (unsigned i = 0, e = DiagGroups.size(); i != e; ++i) { - Record *Group = DiagGroups[i]; + const Record *Group = DiagGroups[i]; GroupInfo &GI = DiagsInGroup[std::string(Group->getValueAsString("GroupName"))]; GI.GroupName = Group->getName(); @@ -255,20 +257,18 @@ class InferPedantic { GMap; DiagGroupParentMap &DiagGroupParents; - const std::vector &Diags; - const std::vector DiagGroups; + ArrayRef Diags; + const std::vector DiagGroups; std::map &DiagsInGroup; llvm::DenseSet DiagsSet; GMap GroupCount; public: InferPedantic(DiagGroupParentMap &DiagGroupParents, - const std::vector &Diags, - const std::vector &DiagGroups, + ArrayRef Diags, + ArrayRef DiagGroups, std::map &DiagsInGroup) - : DiagGroupParents(DiagGroupParents), - Diags(Diags), - DiagGroups(DiagGroups), - DiagsInGroup(DiagsInGroup) {} + : DiagGroupParents(DiagGroupParents), Diags(Diags), + DiagGroups(DiagGroups), DiagsInGroup(DiagsInGroup) {} /// Compute the set of diagnostics and groups that are immediately /// in -Wpedantic. 
@@ -302,7 +302,8 @@ bool InferPedantic::isSubGroupOfGroup(const Record *Group, if (GName == GroupName) return true; - const std::vector &Parents = DiagGroupParents.getParents(Group); + const std::vector &Parents = + DiagGroupParents.getParents(Group); for (unsigned i = 0, e = Parents.size(); i != e; ++i) if (isSubGroupOfGroup(Parents[i], GName)) return true; @@ -347,7 +348,8 @@ void InferPedantic::markGroup(const Record *Group) { // group's count is equal to the number of subgroups and diagnostics in // that group, we can safely add this group to -Wpedantic. if (groupInPedantic(Group, /* increment */ true)) { - const std::vector &Parents = DiagGroupParents.getParents(Group); + const std::vector &Parents = + DiagGroupParents.getParents(Group); for (unsigned i = 0, e = Parents.size(); i != e; ++i) markGroup(Parents[i]); } @@ -359,7 +361,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, // "pedantic" group. For those that aren't explicitly included in -Wpedantic, // mark them for consideration to be included in -Wpedantic directly. for (unsigned i = 0, e = Diags.size(); i != e; ++i) { - Record *R = Diags[i]; + const Record *R = Diags[i]; if (isExtension(R) && isOffByDefault(R)) { DiagsSet.insert(R); if (DefInit *Group = dyn_cast(R->getValueInit("Group"))) { @@ -375,7 +377,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, // march through Diags a second time to ensure the results are emitted // in deterministic order. for (unsigned i = 0, e = Diags.size(); i != e; ++i) { - Record *R = Diags[i]; + const Record *R = Diags[i]; if (!DiagsSet.count(R)) continue; // Check if the group is implicitly in -Wpedantic. If so, @@ -401,13 +403,14 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, // march through the groups to ensure the results are emitted /// in a deterministc order. 
for (unsigned i = 0, ei = DiagGroups.size(); i != ei; ++i) { - Record *Group = DiagGroups[i]; + const Record *Group = DiagGroups[i]; if (!groupInPedantic(Group)) continue; - const std::vector &Parents = DiagGroupParents.getParents(Group); - bool AllParentsInPedantic = - llvm::all_of(Parents, [&](Record *R) { return groupInPedantic(R); }); + const std::vector &Parents = + DiagGroupParents.getParents(Group); + bool AllParentsInPedantic = llvm::all_of( + Parents, [&](const Record *R) { return groupInPedantic(R); }); // If all the parents are in -Wpedantic, this means that this diagnostic // group will be indirectly included by -Wpedantic already. In that // case, do not add it directly to -Wpedantic. If the group has no @@ -583,7 +586,7 @@ struct DiagnosticTextBuilder { DiagnosticTextBuilder(DiagnosticTextBuilder const &) = delete; DiagnosticTextBuilder &operator=(DiagnosticTextBuilder const &) = delete; - DiagnosticTextBuilder(RecordKeeper &Records) { + DiagnosticTextBuilder(const RecordKeeper &Records) { // Build up the list of substitution records. for (auto *S : Records.getAllDerivedDefinitions("TextSubstitution")) { EvaluatingRecordGuard Guard(&EvaluatingRecord, S); @@ -593,7 +596,7 @@ struct DiagnosticTextBuilder { // Check that no diagnostic definitions have the same name as a // substitution. - for (Record *Diag : Records.getAllDerivedDefinitions("Diagnostic")) { + for (const Record *Diag : Records.getAllDerivedDefinitions("Diagnostic")) { StringRef Name = Diag->getName(); if (Substitutions.count(Name)) llvm::PrintFatalError( @@ -1407,7 +1410,7 @@ static void verifyDiagnosticWording(const Record &Diag) { /// ClangDiagsDefsEmitter - The top-level class emits .def files containing /// declarations of Clang diagnostics. 
-void clang::EmitClangDiagsDefs(RecordKeeper &Records, raw_ostream &OS, +void clang::EmitClangDiagsDefs(const RecordKeeper &Records, raw_ostream &OS, const std::string &Component) { // Write the #if guard if (!Component.empty()) { @@ -1421,10 +1424,11 @@ void clang::EmitClangDiagsDefs(RecordKeeper &Records, raw_ostream &OS, DiagnosticTextBuilder DiagTextBuilder(Records); - std::vector Diags = Records.getAllDerivedDefinitions("Diagnostic"); + ArrayRef Diags = + Records.getAllDerivedDefinitions("Diagnostic"); - std::vector DiagGroups - = Records.getAllDerivedDefinitions("DiagGroup"); + ArrayRef DiagGroups = + Records.getAllDerivedDefinitions("DiagGroup"); std::map DiagsInGroup; groupDiagnostics(Diags, DiagGroups, DiagsInGroup); @@ -1764,7 +1768,7 @@ static void emitDiagTable(std::map &DiagsInGroup, /// CATEGORY("Lambda Issue", DiagCat_Lambda_Issue) /// #endif /// \endcode -static void emitCategoryTable(RecordKeeper &Records, raw_ostream &OS) { +static void emitCategoryTable(const RecordKeeper &Records, raw_ostream &OS) { DiagCategoryIDMap CategoriesByID(Records); OS << "\n#ifdef GET_CATEGORY_TABLE\n"; for (auto const &C : CategoriesByID) @@ -1772,13 +1776,14 @@ static void emitCategoryTable(RecordKeeper &Records, raw_ostream &OS) { OS << "#endif // GET_CATEGORY_TABLE\n\n"; } -void clang::EmitClangDiagGroups(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitClangDiagGroups(const RecordKeeper &Records, raw_ostream &OS) { // Compute a mapping from a DiagGroup to all of its parents. DiagGroupParentMap DGParentMap(Records); - std::vector Diags = Records.getAllDerivedDefinitions("Diagnostic"); + ArrayRef Diags = + Records.getAllDerivedDefinitions("Diagnostic"); - std::vector DiagGroups = + ArrayRef DiagGroups = Records.getAllDerivedDefinitions("DiagGroup"); std::map DiagsInGroup; @@ -1824,9 +1829,10 @@ struct RecordIndexElement }; } // end anonymous namespace. 
-void clang::EmitClangDiagsIndexName(RecordKeeper &Records, raw_ostream &OS) { - const std::vector &Diags = - Records.getAllDerivedDefinitions("Diagnostic"); +void clang::EmitClangDiagsIndexName(const RecordKeeper &Records, + raw_ostream &OS) { + ArrayRef Diags = + Records.getAllDerivedDefinitions("Diagnostic"); std::vector Index; Index.reserve(Diags.size()); @@ -1915,7 +1921,7 @@ void writeDiagnosticText(DiagnosticTextBuilder &Builder, const Record *R, } // namespace } // namespace docs -void clang::EmitClangDiagDocs(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitClangDiagDocs(const RecordKeeper &Records, raw_ostream &OS) { using namespace docs; // Get the documentation introduction paragraph. @@ -1930,10 +1936,10 @@ void clang::EmitClangDiagDocs(RecordKeeper &Records, raw_ostream &OS) { DiagnosticTextBuilder Builder(Records); - std::vector Diags = + ArrayRef Diags = Records.getAllDerivedDefinitions("Diagnostic"); - std::vector DiagGroups = + std::vector DiagGroups = Records.getAllDerivedDefinitions("DiagGroup"); llvm::sort(DiagGroups, diagGroupBeforeByName); diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index fe55ef2f423afeb..6b8d7f82ec9845f 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -76,10 +76,11 @@ void EmitClangAttrDocTable(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangBuiltins(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangDiagsDefs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS, - const std::string &Component); -void EmitClangDiagGroups(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangDiagsIndexName(llvm::RecordKeeper &Records, +void EmitClangDiagsDefs(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS, const std::string &Component); +void EmitClangDiagGroups(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitClangDiagsIndexName(const 
llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangSACheckers(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); @@ -142,7 +143,8 @@ void EmitCdeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitCdeBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangDiagDocs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitClangDiagDocs(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangOptDocs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangOpenCLBuiltins(const llvm::RecordKeeper &Records, From a29afb754fb445a2cccc361c556d4e072604b3be Mon Sep 17 00:00:00 2001 From: Chris B Date: Wed, 11 Sep 2024 17:27:09 -0500 Subject: [PATCH 53/94] [HLSL] Allow truncation to scalar (#104844) HLSL allows implicit conversions to truncate vectors to scalar pr-values. These conversions are scored as vector truncations and should warn appropriately. This change allows forming a truncation cast to a pr-value, but not an l-value. Truncating a vector to a scalar is performed by loading the first element of the vector and disregarding the remaining elements. 
Fixes #102964 --- clang/lib/AST/Expr.cpp | 2 +- clang/lib/AST/ExprConstant.cpp | 22 +++++++- clang/lib/CodeGen/CGExprScalar.cpp | 17 ++++--- clang/lib/Sema/SemaExprCXX.cpp | 40 ++++++++------- clang/lib/Sema/SemaOverload.cpp | 50 ++++++++++++------- .../standard_conversion_sequences.hlsl | 24 +++++++++ clang/test/CodeGenHLSL/builtins/dot.hlsl | 12 ----- clang/test/CodeGenHLSL/builtins/lerp.hlsl | 18 ------- clang/test/CodeGenHLSL/builtins/mad.hlsl | 18 ------- .../TruncationOverloadResolution.hlsl | 36 +++++++++++++ .../BuiltinVector/ScalarSwizzleErrors.hlsl | 6 ++- .../BuiltinVector/TruncationConstantExpr.hlsl | 20 ++++++++ 12 files changed, 172 insertions(+), 93 deletions(-) create mode 100644 clang/test/SemaHLSL/Types/BuiltinVector/TruncationConstantExpr.hlsl diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 6545912ed160d96..e10142eff8ec47a 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -1924,7 +1924,6 @@ bool CastExpr::CastConsistency() const { case CK_FixedPointToIntegral: case CK_IntegralToFixedPoint: case CK_MatrixCast: - case CK_HLSLVectorTruncation: assert(!getType()->isBooleanType() && "unheralded conversion to bool"); goto CheckNoBasePath; @@ -1945,6 +1944,7 @@ bool CastExpr::CastConsistency() const { case CK_BuiltinFnToFnPtr: case CK_FixedPointToBoolean: case CK_HLSLArrayRValue: + case CK_HLSLVectorTruncation: CheckNoBasePath: assert(path_empty() && "Cast kind should not have a base path!"); break; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 78d25006360042b..6387e375dda79c0 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -10935,6 +10935,15 @@ bool VectorExprEvaluator::VisitCastExpr(const CastExpr *E) { return true; } + case CK_HLSLVectorTruncation: { + APValue Val; + SmallVector Elements; + if (!EvaluateVector(SE, Val, Info)) + return Error(E); + for (unsigned I = 0; I < NElts; I++) + Elements.push_back(Val.getVectorElt(I)); + return 
Success(Elements, E); + } default: return ExprEvaluatorBaseTy::VisitCastExpr(E); } @@ -14478,7 +14487,6 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) { case CK_FixedPointCast: case CK_IntegralToFixedPoint: case CK_MatrixCast: - case CK_HLSLVectorTruncation: llvm_unreachable("invalid cast kind for integral value"); case CK_BitCast: @@ -14651,6 +14659,12 @@ bool IntExprEvaluator::VisitCastExpr(const CastExpr *E) { return false; return Success(Value, E); } + case CK_HLSLVectorTruncation: { + APValue Val; + if (!EvaluateVector(SubExpr, Val, Info)) + return Error(E); + return Success(Val.getVectorElt(0), E); + } } llvm_unreachable("unknown cast resulting in integral value"); @@ -15177,6 +15191,12 @@ bool FloatExprEvaluator::VisitCastExpr(const CastExpr *E) { Result = V.getComplexFloatReal(); return true; } + case CK_HLSLVectorTruncation: { + APValue Val; + if (!EvaluateVector(SubExpr, Val, Info)) + return Error(E); + return Success(Val.getVectorElt(0), E); + } } } diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 9027bab6b680d42..82caf65ac68d6b1 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2709,14 +2709,19 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { return CGF.CGM.createOpenCLIntToSamplerConversion(E, CGF); case CK_HLSLVectorTruncation: { - assert(DestTy->isVectorType() && "Expected dest type to be vector type"); + assert((DestTy->isVectorType() || DestTy->isBuiltinType()) && + "Destination type must be a vector or builtin type."); Value *Vec = Visit(const_cast(E)); - SmallVector Mask; - unsigned NumElts = DestTy->castAs()->getNumElements(); - for (unsigned I = 0; I != NumElts; ++I) - Mask.push_back(I); + if (auto *VecTy = DestTy->getAs()) { + SmallVector Mask; + unsigned NumElts = VecTy->getNumElements(); + for (unsigned I = 0; I != NumElts; ++I) + Mask.push_back(I); - return Builder.CreateShuffleVector(Vec, Mask, "trunc"); + return 
Builder.CreateShuffleVector(Vec, Mask, "trunc"); + } + llvm::Value *Zero = llvm::Constant::getNullValue(CGF.SizeTy); + return Builder.CreateExtractElement(Vec, Zero, "cast.vtrunc"); } } // end of switch diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index a14a086731c13d6..a33854a211ce83d 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -4313,8 +4313,10 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, // from type to the elements of the to type without resizing the vector. static QualType adjustVectorType(ASTContext &Context, QualType FromTy, QualType ToType, QualType *ElTy = nullptr) { - auto *ToVec = ToType->castAs(); - QualType ElType = ToVec->getElementType(); + QualType ElType = ToType; + if (auto *ToVec = ToType->getAs()) + ElType = ToVec->getElementType(); + if (ElTy) *ElTy = ElType; if (!FromTy->isVectorType()) @@ -4475,7 +4477,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, case ICK_Integral_Conversion: { QualType ElTy = ToType; QualType StepTy = ToType; - if (ToType->isVectorType()) + if (FromType->isVectorType() || ToType->isVectorType()) StepTy = adjustVectorType(Context, FromType, ToType, &ElTy); if (ElTy->isBooleanType()) { assert(FromType->castAs()->getDecl()->isFixed() && @@ -4495,7 +4497,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, case ICK_Floating_Promotion: case ICK_Floating_Conversion: { QualType StepTy = ToType; - if (ToType->isVectorType()) + if (FromType->isVectorType() || ToType->isVectorType()) StepTy = adjustVectorType(Context, FromType, ToType); From = ImpCastExprToType(From, StepTy, CK_FloatingCast, VK_PRValue, /*BasePath=*/nullptr, CCK) @@ -4527,7 +4529,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, case ICK_Floating_Integral: { QualType ElTy = ToType; QualType StepTy = ToType; - if (ToType->isVectorType()) + if (FromType->isVectorType() || ToType->isVectorType()) StepTy = adjustVectorType(Context, 
FromType, ToType, &ElTy); if (ElTy->isRealFloatingType()) From = ImpCastExprToType(From, StepTy, CK_IntegralToFloating, VK_PRValue, @@ -4669,11 +4671,11 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, } QualType ElTy = FromType; QualType StepTy = ToType; - if (FromType->isVectorType()) { - if (getLangOpts().HLSL) - StepTy = adjustVectorType(Context, FromType, ToType); + if (FromType->isVectorType()) ElTy = FromType->castAs()->getElementType(); - } + if (getLangOpts().HLSL && + (FromType->isVectorType() || ToType->isVectorType())) + StepTy = adjustVectorType(Context, FromType, ToType); From = ImpCastExprToType(From, StepTy, ScalarTypeToBooleanCastKind(ElTy), VK_PRValue, @@ -4828,8 +4830,8 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, // TODO: Support HLSL matrices. assert((!From->getType()->isMatrixType() && !ToType->isMatrixType()) && "Dimension conversion for matrix types is not implemented yet."); - assert(ToType->isVectorType() && - "Dimension conversion is only supported for vector types."); + assert((ToType->isVectorType() || ToType->isBuiltinType()) && + "Dimension conversion output must be vector or scalar type."); switch (SCS.Dimension) { case ICK_HLSL_Vector_Splat: { // Vector splat from any arithmetic type to a vector. @@ -4841,18 +4843,18 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, } case ICK_HLSL_Vector_Truncation: { // Note: HLSL built-in vectors are ExtVectors. Since this truncates a - // vector to a smaller vector, this can only operate on arguments where - // the source and destination types are ExtVectors. - assert(From->getType()->isExtVectorType() && ToType->isExtVectorType() && - "HLSL vector truncation should only apply to ExtVectors"); + // vector to a smaller vector or to a scalar, this can only operate on + // arguments where the source type is an ExtVector and the destination + // type is destination type is either an ExtVectorType or a builtin scalar + // type. 
auto *FromVec = From->getType()->castAs(); - auto *ToVec = ToType->castAs(); - QualType ElType = FromVec->getElementType(); - QualType TruncTy = - Context.getExtVectorType(ElType, ToVec->getNumElements()); + QualType TruncTy = FromVec->getElementType(); + if (auto *ToVec = ToType->getAs()) + TruncTy = Context.getExtVectorType(TruncTy, ToVec->getNumElements()); From = ImpCastExprToType(From, TruncTy, CK_HLSLVectorTruncation, From->getValueKind()) .get(); + break; } case ICK_Identity: diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 861b0a91240b3bb..ea72d3f003cbc46 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -2032,26 +2032,42 @@ static bool IsVectorConversion(Sema &S, QualType FromType, QualType ToType, if (S.Context.hasSameUnqualifiedType(FromType, ToType)) return false; + // HLSL allows implicit truncation of vector types. + if (S.getLangOpts().HLSL) { + auto *ToExtType = ToType->getAs(); + auto *FromExtType = FromType->getAs(); + + // If both arguments are vectors, handle possible vector truncation and + // element conversion. + if (ToExtType && FromExtType) { + unsigned FromElts = FromExtType->getNumElements(); + unsigned ToElts = ToExtType->getNumElements(); + if (FromElts < ToElts) + return false; + if (FromElts == ToElts) + ElConv = ICK_Identity; + else + ElConv = ICK_HLSL_Vector_Truncation; + + QualType FromElTy = FromExtType->getElementType(); + QualType ToElTy = ToExtType->getElementType(); + if (S.Context.hasSameUnqualifiedType(FromElTy, ToElTy)) + return true; + return IsVectorElementConversion(S, FromElTy, ToElTy, ICK, From); + } + if (FromExtType && !ToExtType) { + ElConv = ICK_HLSL_Vector_Truncation; + QualType FromElTy = FromExtType->getElementType(); + if (S.Context.hasSameUnqualifiedType(FromElTy, ToType)) + return true; + return IsVectorElementConversion(S, FromElTy, ToType, ICK, From); + } + // Fallthrough for the case where ToType is a vector and FromType is not. 
+ } + // There are no conversions between extended vector types, only identity. if (auto *ToExtType = ToType->getAs()) { if (auto *FromExtType = FromType->getAs()) { - // HLSL allows implicit truncation of vector types. - if (S.getLangOpts().HLSL) { - unsigned FromElts = FromExtType->getNumElements(); - unsigned ToElts = ToExtType->getNumElements(); - if (FromElts < ToElts) - return false; - if (FromElts == ToElts) - ElConv = ICK_Identity; - else - ElConv = ICK_HLSL_Vector_Truncation; - - QualType FromElTy = FromExtType->getElementType(); - QualType ToElTy = ToExtType->getElementType(); - if (S.Context.hasSameUnqualifiedType(FromElTy, ToElTy)) - return true; - return IsVectorElementConversion(S, FromElTy, ToElTy, ICK, From); - } // There are no conversions between extended vector types other than the // identity conversion. return false; diff --git a/clang/test/CodeGenHLSL/BasicFeatures/standard_conversion_sequences.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/standard_conversion_sequences.hlsl index 5d751be6dae0665..6478ea67e32a0df 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/standard_conversion_sequences.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/standard_conversion_sequences.hlsl @@ -117,3 +117,27 @@ void d4_to_b2() { vector d4 = 9.0; vector b2 = d4; } + +// CHECK-LABEL: d4_to_d1 +// CHECK: [[d4:%.*]] = alloca <4 x double> +// CHECK: [[d1:%.*]] = alloca <1 x double> +// CHECK: store <4 x double> , ptr [[d4]] +// CHECK: [[vecd4:%.*]] = load <4 x double>, ptr [[d4]] +// CHECK: [[vecd1:%.*]] = shufflevector <4 x double> [[vecd4]], <4 x double> poison, <1 x i32> zeroinitializer +// CHECK: store <1 x double> [[vecd1]], ptr [[d1:%.*]], align 8 +void d4_to_d1() { + vector d4 = 9.0; + vector d1 = d4; +} + +// CHECK-LABEL: d4_to_dScalar +// CHECK: [[d4:%.*]] = alloca <4 x double> +// CHECK: [[d:%.*]] = alloca double +// CHECK: store <4 x double> , ptr [[d4]] +// CHECK: [[vecd4:%.*]] = load <4 x double>, ptr [[d4]] +// CHECK: [[d4x:%.*]] = extractelement <4 x 
double> [[vecd4]], i32 0 +// CHECK: store double [[d4x]], ptr [[d]] +void d4_to_dScalar() { + vector d4 = 9.0; + double d = d4; +} diff --git a/clang/test/CodeGenHLSL/builtins/dot.hlsl b/clang/test/CodeGenHLSL/builtins/dot.hlsl index 2b76fae61147b4b..3f6be04a595e237 100644 --- a/clang/test/CodeGenHLSL/builtins/dot.hlsl +++ b/clang/test/CodeGenHLSL/builtins/dot.hlsl @@ -155,18 +155,6 @@ float test_dot_float3(float3 p0, float3 p1) { return dot(p0, p1); } // CHECK: ret float %hlsl.dot float test_dot_float4(float4 p0, float4 p1) { return dot(p0, p1); } -// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v2f32(<2 x float> %splat.splat, <2 x float> -// CHECK: ret float %hlsl.dot -float test_dot_float2_splat(float p0, float2 p1) { return dot(p0, p1); } - -// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v3f32(<3 x float> %splat.splat, <3 x float> -// CHECK: ret float %hlsl.dot -float test_dot_float3_splat(float p0, float3 p1) { return dot(p0, p1); } - -// CHECK: %hlsl.dot = call float @llvm.[[ICF]].fdot.v4f32(<4 x float> %splat.splat, <4 x float> -// CHECK: ret float %hlsl.dot -float test_dot_float4_splat(float p0, float4 p1) { return dot(p0, p1); } - // CHECK: %hlsl.dot = fmul double // CHECK: ret double %hlsl.dot double test_dot_double(double p0, double p1) { return dot(p0, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/lerp.hlsl b/clang/test/CodeGenHLSL/builtins/lerp.hlsl index 298d157da00a354..b11046894bd889c 100644 --- a/clang/test/CodeGenHLSL/builtins/lerp.hlsl +++ b/clang/test/CodeGenHLSL/builtins/lerp.hlsl @@ -56,21 +56,3 @@ float3 test_lerp_float3(float3 p0) { return lerp(p0, p0, p0); } // CHECK: %hlsl.lerp = call <4 x float> @llvm.[[TARGET]].lerp.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}) // CHECK: ret <4 x float> %hlsl.lerp float4 test_lerp_float4(float4 p0) { return lerp(p0, p0, p0); } - -// CHECK: %[[b:.*]] = load <2 x float>, ptr %p1.addr, align 8 -// CHECK: %[[c:.*]] = load <2 x float>, ptr %p1.addr, align 8 -// CHECK: 
%hlsl.lerp = call <2 x float> @llvm.[[TARGET]].lerp.v2f32(<2 x float> %splat.splat, <2 x float> %[[b]], <2 x float> %[[c]]) -// CHECK: ret <2 x float> %hlsl.lerp -float2 test_lerp_float2_splat(float p0, float2 p1) { return lerp(p0, p1, p1); } - -// CHECK: %[[b:.*]] = load <3 x float>, ptr %p1.addr, align 16 -// CHECK: %[[c:.*]] = load <3 x float>, ptr %p1.addr, align 16 -// CHECK: %hlsl.lerp = call <3 x float> @llvm.[[TARGET]].lerp.v3f32(<3 x float> %splat.splat, <3 x float> %[[b]], <3 x float> %[[c]]) -// CHECK: ret <3 x float> %hlsl.lerp -float3 test_lerp_float3_splat(float p0, float3 p1) { return lerp(p0, p1, p1); } - -// CHECK: %[[b:.*]] = load <4 x float>, ptr %p1.addr, align 16 -// CHECK: %[[c:.*]] = load <4 x float>, ptr %p1.addr, align 16 -// CHECK: %hlsl.lerp = call <4 x float> @llvm.[[TARGET]].lerp.v4f32(<4 x float> %splat.splat, <4 x float> %[[b]], <4 x float> %[[c]]) -// CHECK: ret <4 x float> %hlsl.lerp -float4 test_lerp_float4_splat(float p0, float4 p1) { return lerp(p0, p1, p1); } diff --git a/clang/test/CodeGenHLSL/builtins/mad.hlsl b/clang/test/CodeGenHLSL/builtins/mad.hlsl index 449a793caf93b7d..265a2552c80fb48 100644 --- a/clang/test/CodeGenHLSL/builtins/mad.hlsl +++ b/clang/test/CodeGenHLSL/builtins/mad.hlsl @@ -263,21 +263,3 @@ uint64_t3 test_mad_uint64_t3(uint64_t3 p0, uint64_t3 p1, uint64_t3 p2) { return // SPIR_CHECK: mul nuw <4 x i64> %{{.*}}, %{{.*}} // SPIR_CHECK: add nuw <4 x i64> %{{.*}}, %{{.*}} uint64_t4 test_mad_uint64_t4(uint64_t4 p0, uint64_t4 p1, uint64_t4 p2) { return mad(p0, p1, p2); } - -// CHECK: %[[p1:.*]] = load <2 x float>, ptr %p1.addr, align 8 -// CHECK: %[[p2:.*]] = load <2 x float>, ptr %p2.addr, align 8 -// CHECK: %hlsl.fmad = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %splat.splat, <2 x float> %[[p1]], <2 x float> %[[p2]]) -// CHECK: ret <2 x float> %hlsl.fmad -float2 test_mad_float2_splat(float p0, float2 p1, float2 p2) { return mad(p0, p1, p2); } - -// CHECK: %[[p1:.*]] = load <3 x float>, ptr %p1.addr, align 
16 -// CHECK: %[[p2:.*]] = load <3 x float>, ptr %p2.addr, align 16 -// CHECK: %hlsl.fmad = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %splat.splat, <3 x float> %[[p1]], <3 x float> %[[p2]]) -// CHECK: ret <3 x float> %hlsl.fmad -float3 test_mad_float3_splat(float p0, float3 p1, float3 p2) { return mad(p0, p1, p2); } - -// CHECK: %[[p1:.*]] = load <4 x float>, ptr %p1.addr, align 16 -// CHECK: %[[p2:.*]] = load <4 x float>, ptr %p2.addr, align 16 -// CHECK: %hlsl.fmad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %splat.splat, <4 x float> %[[p1]], <4 x float> %[[p2]]) -// CHECK: ret <4 x float> %hlsl.fmad -float4 test_mad_float4_splat(float p0, float4 p1, float4 p2) { return mad(p0, p1, p2); } diff --git a/clang/test/SemaHLSL/TruncationOverloadResolution.hlsl b/clang/test/SemaHLSL/TruncationOverloadResolution.hlsl index 2bcb367c5669a3a..0192c27860f1405 100644 --- a/clang/test/SemaHLSL/TruncationOverloadResolution.hlsl +++ b/clang/test/SemaHLSL/TruncationOverloadResolution.hlsl @@ -24,6 +24,42 @@ void Case2(float4 F) { Half2Double2(F); // expected-warning{{implicit conversion truncates vector: 'float4' (aka 'vector') to 'vector' (vector of 2 'double' values)}} } +// Case 3: Allow truncation down to vector or T. 
+void Half(half H); +void Float(float F); +void Double(double D); + +void Half1(half1 H); +void Float1(float1 F); +void Double1(double1 D); + +void Case3(half3 H, float3 F, double3 D) { + Half(H); // expected-warning{{implicit conversion turns vector to scalar: 'half3' (aka 'vector') to 'half'}} + Half(F); // expected-warning{{implicit conversion turns vector to scalar: 'float3' (aka 'vector') to 'half'}} + Half(D); // expected-warning{{implicit conversion turns vector to scalar: 'double3' (aka 'vector') to 'half'}} + + Float(H); // expected-warning{{implicit conversion turns vector to scalar: 'half3' (aka 'vector') to 'float'}} + Float(F); // expected-warning{{implicit conversion turns vector to scalar: 'float3' (aka 'vector') to 'float'}} + Float(D); // expected-warning{{implicit conversion turns vector to scalar: 'double3' (aka 'vector') to 'float'}} + + Double(H); // expected-warning{{implicit conversion turns vector to scalar: 'half3' (aka 'vector') to 'double'}} + Double(F); // expected-warning{{implicit conversion turns vector to scalar: 'float3' (aka 'vector') to 'double'}} + Double(D); // expected-warning{{implicit conversion turns vector to scalar: 'double3' (aka 'vector') to 'double'}} + + Half1(H); // expected-warning{{implicit conversion truncates vector: 'half3' (aka 'vector') to 'vector' (vector of 1 'half' value)}} + Half1(F); // expected-warning{{implicit conversion truncates vector: 'float3' (aka 'vector') to 'vector' (vector of 1 'half' value)}} expected-warning{{implicit conversion loses floating-point precision: 'float3' (aka 'vector') to 'vector' (vector of 1 'half' value)}} + Half1(D); // expected-warning{{implicit conversion truncates vector: 'double3' (aka 'vector') to 'vector' (vector of 1 'half' value)}} expected-warning{{implicit conversion loses floating-point precision: 'double3' (aka 'vector') to 'vector' (vector of 1 'half' value)}} + + Float1(H); // expected-warning{{implicit conversion truncates vector: 'half3' (aka 'vector') to 
'vector' (vector of 1 'float' value)}} + Float1(F); // expected-warning{{implicit conversion truncates vector: 'float3' (aka 'vector') to 'vector' (vector of 1 'float' value)}} + Float1(D); // expected-warning{{implicit conversion truncates vector: 'double3' (aka 'vector') to 'vector' (vector of 1 'float' value)}} expected-warning{{implicit conversion loses floating-point precision: 'double3' (aka 'vector') to 'vector' (vector of 1 'float' value)}} + + Double1(H); // expected-warning{{implicit conversion truncates vector: 'half3' (aka 'vector') to 'vector' (vector of 1 'double' value)}} + Double1(F); // expected-warning{{implicit conversion truncates vector: 'float3' (aka 'vector') to 'vector' (vector of 1 'double' value)}} + Double1(D); // expected-warning{{implicit conversion truncates vector: 'double3' (aka 'vector') to 'vector' (vector of 1 'double' value)}} +} + + #if ERROR // Case 3: Two promotions or two conversions are ambiguous. void Float2Double2(double2 D); // expected-note{{candidate function}} diff --git a/clang/test/SemaHLSL/Types/BuiltinVector/ScalarSwizzleErrors.hlsl b/clang/test/SemaHLSL/Types/BuiltinVector/ScalarSwizzleErrors.hlsl index 5088991f2e28ac5..b1c75acbc16c6f5 100644 --- a/clang/test/SemaHLSL/Types/BuiltinVector/ScalarSwizzleErrors.hlsl +++ b/clang/test/SemaHLSL/Types/BuiltinVector/ScalarSwizzleErrors.hlsl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -x hlsl -finclude-default-header -verify %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s int2 ToTwoInts(int V) { return V.xy; // expected-error{{vector component access exceeds type 'vector' (vector of 1 'int' value)}} @@ -16,6 +16,10 @@ float2 WhatIsHappening(float V) { return V.; // expected-error{{expected unqualified-id}} } +float ScalarLValue(float2 V) { + (float)V = 4.0; // expected-error{{assignment to cast is illegal, lvalue casts are not supported}} +} + // These cases produce no error. 
float2 HowManyFloats(float V) { diff --git a/clang/test/SemaHLSL/Types/BuiltinVector/TruncationConstantExpr.hlsl b/clang/test/SemaHLSL/Types/BuiltinVector/TruncationConstantExpr.hlsl new file mode 100644 index 000000000000000..918daa03d803229 --- /dev/null +++ b/clang/test/SemaHLSL/Types/BuiltinVector/TruncationConstantExpr.hlsl @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -std=hlsl202x -verify %s + +// expected-no-diagnostics + +// Note: these tests are a bit awkward because at time of writing we don't have a +// good way to constexpr `any` for bool vector conditions, and the condition for +// _Static_assert must be an integral constant. +export void fn() { + // This compiling successfully verifies that the vector constant expression + // gets truncated to an integer at compile time for instantiation. + _Static_assert(((int)1.xxxx) + 0 == 1, "Woo!"); + + // This compiling successfully verifies that the vector constant expression + // gets truncated to a float at compile time for instantiation. + _Static_assert(((float)1.0.xxxx) + 0.0 == 1.0, "Woo!"); + + // This compiling successfully verifies that a vector can be truncated to a + // smaller vector, then truncated to a float as a constant expression. + _Static_assert(((float2)float4(6, 5, 4, 3)).x == 6, "Woo!"); +} From 07a7bdc806961ef63b1fd7bdd63f27c6c803aa7c Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Wed, 11 Sep 2024 15:27:38 -0700 Subject: [PATCH 54/94] [WebAssembly] Fix lane index size for f16x8 extract_lane. 
(#108118) --- llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 9be23dacf75013e..b652ee98cef107c 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -684,7 +684,7 @@ defm EXTRACT_LANE_F16x8 : HALF_PRECISION_I<(outs F32:$dst), (ins V128:$vec, vec_i8imm_op:$idx), (outs), (ins vec_i8imm_op:$idx), [(set (f32 F32:$dst), (int_wasm_extract_lane_f16x8 - (v8f16 V128:$vec), (i32 LaneIdx16:$idx)))], + (v8f16 V128:$vec), (i32 LaneIdx8:$idx)))], "f16x8.extract_lane\t$dst, $vec, $idx", "f16x8.extract_lane\t$idx", 0x121>; From 31d48372732bc3fd3606aeb9c5cceb7cce739b4e Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 11 Sep 2024 15:28:37 -0700 Subject: [PATCH 55/94] [SandboxIR][Bench] SandboxIR creation (#108278) Adds a benchmark for the overhead of SandboxIR creation. 
--- llvm/benchmarks/SandboxIRBench.cpp | 35 ++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/llvm/benchmarks/SandboxIRBench.cpp b/llvm/benchmarks/SandboxIRBench.cpp index ca2cab664f91e5b..d4601d5f53d07a3 100644 --- a/llvm/benchmarks/SandboxIRBench.cpp +++ b/llvm/benchmarks/SandboxIRBench.cpp @@ -89,6 +89,31 @@ static std::string generateBBWalkIR(unsigned Size) { return SS.str(); } +template static void SBoxIRCreation(benchmark::State &State) { + static_assert(IRTy != IR::LLVM, "Expected SBoxTracking or SBoxNoTracking"); + LLVMContext LLVMCtx; + unsigned NumInstrs = State.range(0); + std::unique_ptr LLVMM; + std::string IRStr = generateBBWalkIR(NumInstrs); + LLVMM = parseIR(LLVMCtx, IRStr.c_str()); + llvm::Function *LLVMF = &*LLVMM->getFunction("foo"); + + for (auto _ : State) { + State.PauseTiming(); + sandboxir::Context Ctx(LLVMCtx); + if constexpr (IRTy == IR::SBoxTracking) + Ctx.save(); + State.ResumeTiming(); + + sandboxir::Function *F = Ctx.createFunction(LLVMF); + benchmark::DoNotOptimize(F); + State.PauseTiming(); + if constexpr (IRTy == IR::SBoxTracking) + Ctx.accept(); + State.ResumeTiming(); + } +} + template static void BBWalk(benchmark::State &State) { LLVMContext LLVMCtx; sandboxir::Context Ctx(LLVMCtx); @@ -189,6 +214,16 @@ template static void RUOW(benchmark::State &State) { finalize(Ctx); } +// Measure the time it takes to create Sandbox IR without/with tracking. 
+BENCHMARK(SBoxIRCreation) + ->Args({10}) + ->Args({100}) + ->Args({1000}); +BENCHMARK(SBoxIRCreation) + ->Args({10}) + ->Args({100}) + ->Args({1000}); + BENCHMARK(GetType); BENCHMARK(GetType); From c98d6c2e4293a4ab352e08d49dac3c1357cbbc6a Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 11 Sep 2024 15:29:02 -0700 Subject: [PATCH 56/94] [NFC] Reformat ClangASTPropertiesEmitter `ASTPropsEmitter` class (#108275) --- .../TableGen/ClangASTPropertiesEmitter.cpp | 94 +++++++++---------- 1 file changed, 45 insertions(+), 49 deletions(-) diff --git a/clang/utils/TableGen/ClangASTPropertiesEmitter.cpp b/clang/utils/TableGen/ClangASTPropertiesEmitter.cpp index de8dda60681ff87..70005da28559d3d 100644 --- a/clang/utils/TableGen/ClangASTPropertiesEmitter.cpp +++ b/clang/utils/TableGen/ClangASTPropertiesEmitter.cpp @@ -88,99 +88,98 @@ struct CasedTypeInfo { }; class ASTPropsEmitter { - raw_ostream &Out; - RecordKeeper &Records; - std::map NodeInfos; + raw_ostream &Out; + RecordKeeper &Records; + std::map NodeInfos; std::vector AllPropertyTypes; std::map CasedTypeInfos; public: - ASTPropsEmitter(RecordKeeper &records, raw_ostream &out) - : Out(out), Records(records) { - - // Find all the properties. - for (Property property : - records.getAllDerivedDefinitions(PropertyClassName)) { - HasProperties node = property.getClass(); - NodeInfos[node].Properties.push_back(property); - } + ASTPropsEmitter(RecordKeeper &records, raw_ostream &out) + : Out(out), Records(records) { + + // Find all the properties. + for (Property property : + records.getAllDerivedDefinitions(PropertyClassName)) { + HasProperties node = property.getClass(); + NodeInfos[node].Properties.push_back(property); + } // Find all the creation rules. 
for (CreationRule creationRule : - records.getAllDerivedDefinitions(CreationRuleClassName)) { + records.getAllDerivedDefinitions(CreationRuleClassName)) { HasProperties node = creationRule.getClass(); auto &info = NodeInfos[node]; if (info.Creator) { - PrintFatalError(creationRule.getLoc(), - "multiple creator rules for \"" + node.getName() - + "\""); + PrintFatalError(creationRule.getLoc(), "multiple creator rules for \"" + + node.getName() + "\""); } info.Creator = creationRule; } // Find all the override rules. for (OverrideRule overrideRule : - records.getAllDerivedDefinitions(OverrideRuleClassName)) { + records.getAllDerivedDefinitions(OverrideRuleClassName)) { HasProperties node = overrideRule.getClass(); auto &info = NodeInfos[node]; if (info.Override) { PrintFatalError(overrideRule.getLoc(), - "multiple override rules for \"" + node.getName() - + "\""); + "multiple override rules for \"" + node.getName() + + "\""); } info.Override = overrideRule; } // Find all the write helper rules. for (ReadHelperRule helperRule : - records.getAllDerivedDefinitions(ReadHelperRuleClassName)) { + records.getAllDerivedDefinitions(ReadHelperRuleClassName)) { HasProperties node = helperRule.getClass(); auto &info = NodeInfos[node]; if (info.ReadHelper) { PrintFatalError(helperRule.getLoc(), - "multiple write helper rules for \"" + node.getName() - + "\""); + "multiple write helper rules for \"" + node.getName() + + "\""); } info.ReadHelper = helperRule; } // Find all the concrete property types. for (PropertyType type : - records.getAllDerivedDefinitions(PropertyTypeClassName)) { + records.getAllDerivedDefinitions(PropertyTypeClassName)) { // Ignore generic specializations; they're generally not useful when // emitting basic emitters etc. - if (type.isGenericSpecialization()) continue; + if (type.isGenericSpecialization()) + continue; AllPropertyTypes.push_back(type); } // Find all the type kind rules. 
for (TypeKindRule kindRule : - records.getAllDerivedDefinitions(TypeKindClassName)) { + records.getAllDerivedDefinitions(TypeKindClassName)) { PropertyType type = kindRule.getParentType(); auto &info = CasedTypeInfos[type]; if (info.KindRule) { - PrintFatalError(kindRule.getLoc(), - "multiple kind rules for \"" - + type.getCXXTypeName() + "\""); + PrintFatalError(kindRule.getLoc(), "multiple kind rules for \"" + + type.getCXXTypeName() + "\""); } info.KindRule = kindRule; } // Find all the type cases. for (TypeCase typeCase : - records.getAllDerivedDefinitions(TypeCaseClassName)) { + records.getAllDerivedDefinitions(TypeCaseClassName)) { CasedTypeInfos[typeCase.getParentType()].Cases.push_back(typeCase); } Validator(*this).validate(); - } + } void visitAllProperties(HasProperties derived, const NodeInfo &derivedInfo, - function_ref visit) { + function_ref visit) { std::set ignoredProperties; auto overrideRule = derivedInfo.Override; @@ -195,20 +194,19 @@ class ASTPropsEmitter { visitAllNodesWithInfo(derived, derivedInfo, [&](HasProperties node, const NodeInfo &info) { - for (Property prop : info.Properties) { - if (ignoredProperties.count(prop.getName())) - continue; + for (Property prop : info.Properties) { + if (ignoredProperties.count(prop.getName())) + continue; - visit(prop); - } - }); + visit(prop); + } + }); } - void visitAllNodesWithInfo(HasProperties derivedNode, - const NodeInfo &derivedNodeInfo, - llvm::function_ref - visit) { + void visitAllNodesWithInfo( + HasProperties derivedNode, const NodeInfo &derivedNodeInfo, + llvm::function_ref + visit) { visit(derivedNode, derivedNodeInfo); // Also walk the bases if appropriate. @@ -217,7 +215,8 @@ class ASTPropsEmitter { auto it = NodeInfos.find(base); // Ignore intermediate nodes that don't add interesting properties. 
- if (it == NodeInfos.end()) continue; + if (it == NodeInfos.end()) + continue; auto &baseInfo = it->second; visit(base, baseInfo); @@ -225,14 +224,12 @@ class ASTPropsEmitter { } } - template - void emitNodeReaderClass() { + template void emitNodeReaderClass() { auto info = ReaderWriterInfo::forReader(); emitNodeReaderWriterClass(info); } - template - void emitNodeWriterClass() { + template void emitNodeWriterClass() { auto info = ReaderWriterInfo::forWriter(); emitNodeReaderWriterClass(info); } @@ -241,8 +238,7 @@ class ASTPropsEmitter { void emitNodeReaderWriterClass(const ReaderWriterInfo &info); template - void emitNodeReaderWriterMethod(NodeClass node, - const ReaderWriterInfo &info); + void emitNodeReaderWriterMethod(NodeClass node, const ReaderWriterInfo &info); void emitPropertiedReaderWriterBody(HasProperties node, const ReaderWriterInfo &info); From 060137038ab9246b377e190ae3c6f272fa57cbfc Mon Sep 17 00:00:00 2001 From: yronglin Date: Thu, 12 Sep 2024 06:29:48 +0800 Subject: [PATCH 57/94] Reapply "[Clang][CWG1815] Support lifetime extension of temporary created by aggregate initialization using a default member initializer" (#108039) The PR reapply https://github.com/llvm/llvm-project/pull/97308. - Implement [CWG1815](https://wg21.link/CWG1815): Support lifetime extension of temporary created by aggregate initialization using a default member initializer. - Fix crash that introduced in https://github.com/llvm/llvm-project/pull/97308. In `InitListChecker::FillInEmptyInitForField`, when we enter rebuild-default-init context, we copy all the contents of the parent context to the current context, which will cause the `MaybeODRUseExprs` to be lost. But we don't need to copy the entire context, only the `DelayedDefaultInitializationContext` was required, which is used to build `SourceLocExpr`, etc. 
--------- Signed-off-by: yronglin --- clang/docs/ReleaseNotes.rst | 3 + .../clang/Basic/DiagnosticSemaKinds.td | 7 -- clang/include/clang/Sema/Sema.h | 23 ++--- clang/lib/Parse/ParseDecl.cpp | 3 +- clang/lib/Sema/CheckExprLifetime.cpp | 18 +--- clang/lib/Sema/SemaExpr.cpp | 36 +++++-- clang/lib/Sema/SemaExprCXX.cpp | 3 - clang/lib/Sema/SemaInit.cpp | 23 +++-- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 5 +- clang/lib/Sema/TreeTransform.h | 20 ++-- clang/test/AST/ast-dump-default-init-json.cpp | 6 +- clang/test/AST/ast-dump-default-init.cpp | 2 +- .../Analysis/lifetime-extended-regions.cpp | 9 +- clang/test/CXX/drs/cwg16xx.cpp | 23 ++++- clang/test/CXX/drs/cwg18xx.cpp | 19 +++- clang/test/CXX/special/class.temporary/p6.cpp | 34 +++++++ clang/test/SemaCXX/PR97308.cpp | 21 ++++ clang/test/SemaCXX/constexpr-default-arg.cpp | 4 +- .../cxx11-default-member-initializers.cpp | 97 +++++++++++++++++++ clang/test/SemaCXX/eval-crashes.cpp | 6 +- clang/www/cxx_dr_status.html | 2 +- 21 files changed, 277 insertions(+), 87 deletions(-) create mode 100644 clang/test/SemaCXX/PR97308.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 59ccdf1e15cd81c..af6d1c5826a2fcc 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -108,6 +108,9 @@ C++ Language Changes - Allow single element access of GCC vector/ext_vector_type object to be constant expression. Supports the `V.xyzw` syntax and other tidbits as seen in OpenCL. Selecting multiple elements is left as a future work. +- Implement `CWG1815 `_. Support lifetime extension + of temporary created by aggregate initialization using a default member + initializer. - Accept C++26 user-defined ``static_assert`` messages in C++11 as an extension. 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index b160fee827a7509..efdc058edca56d0 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10162,13 +10162,6 @@ def warn_dangling_pointer_assignment : Warning< "will be destroyed at the end of the full-expression">, InGroup; -def warn_unsupported_lifetime_extension : Warning< - "lifetime extension of " - "%select{temporary|backing array of initializer list}0 created " - "by aggregate initialization using a default member initializer " - "is not yet supported; lifetime of %select{temporary|backing array}0 " - "will end at the end of the full-expression">, InGroup; - // For non-floating point, expressions of the form x == x or x != x // should result in a warning, since these always evaluate to a constant. // Array comparisons have similar warnings diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 68c782a15c6f1be..99eef472223a007 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -6403,6 +6403,9 @@ class Sema final : public SemaBase { /// example, in a for-range initializer). bool InLifetimeExtendingContext = false; + /// Whether we should rebuild CXXDefaultArgExpr and CXXDefaultInitExpr. 
+ bool RebuildDefaultArgOrDefaultInit = false; + // When evaluating immediate functions in the initializer of a default // argument or default member initializer, this is the declaration whose // default initializer is being evaluated and the location of the call @@ -7810,9 +7813,11 @@ class Sema final : public SemaBase { } bool isInLifetimeExtendingContext() const { - assert(!ExprEvalContexts.empty() && - "Must be in an expression evaluation context"); - return ExprEvalContexts.back().InLifetimeExtendingContext; + return currentEvaluationContext().InLifetimeExtendingContext; + } + + bool needsRebuildOfDefaultArgOrInit() const { + return currentEvaluationContext().RebuildDefaultArgOrDefaultInit; } bool isCheckingDefaultArgumentOrInitializer() const { @@ -7854,18 +7859,6 @@ class Sema final : public SemaBase { return Res; } - /// keepInLifetimeExtendingContext - Pull down InLifetimeExtendingContext - /// flag from previous context. - void keepInLifetimeExtendingContext() { - if (ExprEvalContexts.size() > 2 && - parentEvaluationContext().InLifetimeExtendingContext) { - auto &LastRecord = ExprEvalContexts.back(); - auto &PrevRecord = parentEvaluationContext(); - LastRecord.InLifetimeExtendingContext = - PrevRecord.InLifetimeExtendingContext; - } - } - DefaultedComparisonKind getDefaultedComparisonKind(const FunctionDecl *FD) { return getDefaultedFunctionKind(FD).asComparison(); } diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 61a1ca3da6bca0a..1f56884be392d64 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -2509,8 +2509,9 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS, // P2718R0 - Lifetime extension in range-based for loops. 
if (getLangOpts().CPlusPlus23) { - auto &LastRecord = Actions.ExprEvalContexts.back(); + auto &LastRecord = Actions.currentEvaluationContext(); LastRecord.InLifetimeExtendingContext = true; + LastRecord.RebuildDefaultArgOrDefaultInit = true; } if (getLangOpts().OpenMP) diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index c8e703036c132cc..77c73f47658fe10 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -896,11 +896,6 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, enum PathLifetimeKind { /// Lifetime-extend along this path. Extend, - /// We should lifetime-extend, but we don't because (due to technical - /// limitations) we can't. This happens for default member initializers, - /// which we don't clone for every use, so we don't have a unique - /// MaterializeTemporaryExpr to update. - ShouldExtend, /// Do not lifetime extend along this path. NoExtend }; @@ -912,7 +907,7 @@ shouldLifetimeExtendThroughPath(const IndirectLocalPath &Path) { PathLifetimeKind Kind = PathLifetimeKind::Extend; for (auto Elem : Path) { if (Elem.Kind == IndirectLocalPathEntry::DefaultInit) - Kind = PathLifetimeKind::ShouldExtend; + return PathLifetimeKind::Extend; else if (Elem.Kind != IndirectLocalPathEntry::LambdaCaptureInit) return PathLifetimeKind::NoExtend; } @@ -1058,17 +1053,6 @@ static void checkExprLifetimeImpl(Sema &SemaRef, // Also visit the temporaries lifetime-extended by this initializer. return true; - case PathLifetimeKind::ShouldExtend: - // We're supposed to lifetime-extend the temporary along this path (per - // the resolution of DR1815), but we don't support that yet. - // - // FIXME: Properly handle this situation. Perhaps the easiest approach - // would be to clone the initializer expression on each use that would - // lifetime extend its temporaries. 
- SemaRef.Diag(DiagLoc, diag::warn_unsupported_lifetime_extension) - << RK << DiagRange; - break; - case PathLifetimeKind::NoExtend: // If the path goes through the initialization of a variable or field, // it can't possibly reach a temporary created in this full-expression. diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 32dac4440fb82a7..8f3e15cc9a9bb72 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -5429,6 +5429,8 @@ struct EnsureImmediateInvocationInDefaultArgs EnsureImmediateInvocationInDefaultArgs(Sema &SemaRef) : TreeTransform(SemaRef) {} + bool AlwaysRebuild() { return true; } + // Lambda can only have immediate invocations in the default // args of their parameters, which is transformed upon calling the closure. // The body is not a subexpression, so we have nothing to do. @@ -5470,7 +5472,7 @@ ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc, assert(Param->hasDefaultArg() && "can't build nonexistent default arg"); bool NestedDefaultChecking = isCheckingDefaultArgumentOrInitializer(); - bool InLifetimeExtendingContext = isInLifetimeExtendingContext(); + bool NeedRebuild = needsRebuildOfDefaultArgOrInit(); std::optional InitializationContext = OutermostDeclarationWithDelayedImmediateInvocations(); @@ -5506,13 +5508,15 @@ ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc, // Rewrite the call argument that was created from the corresponding // parameter's default argument. - if (V.HasImmediateCalls || InLifetimeExtendingContext) { + if (V.HasImmediateCalls || + (NeedRebuild && isa_and_present(Param->getInit()))) { if (V.HasImmediateCalls) ExprEvalContexts.back().DelayedDefaultInitializationContext = { CallLoc, Param, CurContext}; // Pass down lifetime extending flag, and collect temporaries in // CreateMaterializeTemporaryExpr when we rewrite the call argument. 
- keepInLifetimeExtendingContext(); + currentEvaluationContext().InLifetimeExtendingContext = + parentEvaluationContext().InLifetimeExtendingContext; EnsureImmediateInvocationInDefaultArgs Immediate(*this); ExprResult Res; runWithSufficientStackSpace(CallLoc, [&] { @@ -5558,7 +5562,7 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { Expr *Init = nullptr; bool NestedDefaultChecking = isCheckingDefaultArgumentOrInitializer(); - + bool NeedRebuild = needsRebuildOfDefaultArgOrInit(); EnterExpressionEvaluationContext EvalContext( *this, ExpressionEvaluationContext::PotentiallyEvaluated, Field); @@ -5593,12 +5597,27 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { ImmediateCallVisitor V(getASTContext()); if (!NestedDefaultChecking) V.TraverseDecl(Field); - if (V.HasImmediateCalls) { + + // CWG1815 + // Support lifetime extension of temporary created by aggregate + // initialization using a default member initializer. We should rebuild + // the initializer in a lifetime extension context if the initializer + // expression is an ExprWithCleanups. Then make sure the normal lifetime + // extension code recurses into the default initializer and does lifetime + // extension when warranted. + bool ContainsAnyTemporaries = + isa_and_present(Field->getInClassInitializer()); + if (Field->getInClassInitializer() && + !Field->getInClassInitializer()->containsErrors() && + (V.HasImmediateCalls || (NeedRebuild && ContainsAnyTemporaries))) { ExprEvalContexts.back().DelayedDefaultInitializationContext = {Loc, Field, CurContext}; ExprEvalContexts.back().IsCurrentlyCheckingDefaultArgumentOrInitializer = NestedDefaultChecking; - + // Pass down lifetime extending flag, and collect temporaries in + // CreateMaterializeTemporaryExpr when we rewrite the call argument. 
+ currentEvaluationContext().InLifetimeExtendingContext = + parentEvaluationContext().InLifetimeExtendingContext; EnsureImmediateInvocationInDefaultArgs Immediate(*this); ExprResult Res; runWithSufficientStackSpace(Loc, [&] { @@ -17675,11 +17694,10 @@ void Sema::PopExpressionEvaluationContext() { // Append the collected materialized temporaries into previous context before // exit if the previous also is a lifetime extending context. - auto &PrevRecord = parentEvaluationContext(); if (getLangOpts().CPlusPlus23 && Rec.InLifetimeExtendingContext && - PrevRecord.InLifetimeExtendingContext && + parentEvaluationContext().InLifetimeExtendingContext && !Rec.ForRangeLifetimeExtendTemps.empty()) { - PrevRecord.ForRangeLifetimeExtendTemps.append( + parentEvaluationContext().ForRangeLifetimeExtendTemps.append( Rec.ForRangeLifetimeExtendTemps); } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index a33854a211ce83d..ac3fe6ab8f9bd0a 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1540,9 +1540,6 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo, bool ListInitialization) { QualType Ty = TInfo->getType(); SourceLocation TyBeginLoc = TInfo->getTypeLoc().getBeginLoc(); - - assert((!ListInitialization || Exprs.size() == 1) && - "List initialization must have exactly one expression."); SourceRange FullRange = SourceRange(TyBeginLoc, RParenOrBraceLoc); InitializedEntity Entity = diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 7dc17187524621b..d21b8cb8c04e637 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -750,8 +750,21 @@ void InitListChecker::FillInEmptyInitForField(unsigned Init, FieldDecl *Field, if (Field->hasInClassInitializer()) { if (VerifyOnly) return; - - ExprResult DIE = SemaRef.BuildCXXDefaultInitExpr(Loc, Field); + ExprResult DIE; + { + // Enter a default initializer rebuild context, then we can support + // lifetime extension of temporary 
created by aggregate initialization + // using a default member initializer. + // CWG1815 (https://wg21.link/CWG1815). + EnterExpressionEvaluationContext RebuildDefaultInit( + SemaRef, Sema::ExpressionEvaluationContext::PotentiallyEvaluated); + SemaRef.currentEvaluationContext().RebuildDefaultArgOrDefaultInit = + true; + SemaRef.currentEvaluationContext().DelayedDefaultInitializationContext = + SemaRef.parentEvaluationContext() + .DelayedDefaultInitializationContext; + DIE = SemaRef.BuildCXXDefaultInitExpr(Loc, Field); + } if (DIE.isInvalid()) { hadError = true; return; @@ -7521,10 +7534,8 @@ Sema::CreateMaterializeTemporaryExpr(QualType T, Expr *Temporary, // are done in both CreateMaterializeTemporaryExpr and MaybeBindToTemporary, // but there may be a chance to merge them. Cleanup.setExprNeedsCleanups(false); - if (isInLifetimeExtendingContext()) { - auto &Record = ExprEvalContexts.back(); - Record.ForRangeLifetimeExtendTemps.push_back(MTE); - } + if (isInLifetimeExtendingContext()) + currentEvaluationContext().ForRangeLifetimeExtendTemps.push_back(MTE); return MTE; } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 6df412cbb09c83f..bb311e384092804 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -5481,7 +5481,10 @@ void Sema::InstantiateVariableInitializer( EnterExpressionEvaluationContext Evaluated( *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated, Var); - keepInLifetimeExtendingContext(); + currentEvaluationContext().InLifetimeExtendingContext = + parentEvaluationContext().InLifetimeExtendingContext; + currentEvaluationContext().RebuildDefaultArgOrDefaultInit = + parentEvaluationContext().RebuildDefaultArgOrDefaultInit; // Instantiate the initializer. 
ExprResult Init; diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 0daf620b4123e42..4bbc024587915c3 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4254,7 +4254,10 @@ ExprResult TreeTransform::TransformInitializer(Expr *Init, getSema(), EnterExpressionEvaluationContext::InitList, Construct->isListInitialization()); - getSema().keepInLifetimeExtendingContext(); + getSema().currentEvaluationContext().InLifetimeExtendingContext = + getSema().parentEvaluationContext().InLifetimeExtendingContext; + getSema().currentEvaluationContext().RebuildDefaultArgOrDefaultInit = + getSema().parentEvaluationContext().RebuildDefaultArgOrDefaultInit; SmallVector NewArgs; bool ArgChanged = false; if (getDerived().TransformExprs(Construct->getArgs(), Construct->getNumArgs(), @@ -8924,8 +8927,9 @@ TreeTransform::TransformCXXForRangeStmt(CXXForRangeStmt *S) { // P2718R0 - Lifetime extension in range-based for loops. if (getSema().getLangOpts().CPlusPlus23) { - auto &LastRecord = getSema().ExprEvalContexts.back(); + auto &LastRecord = getSema().currentEvaluationContext(); LastRecord.InLifetimeExtendingContext = true; + LastRecord.RebuildDefaultArgOrDefaultInit = true; } StmtResult Init = S->getInit() ? 
getDerived().TransformStmt(S->getInit()) : StmtResult(); @@ -14443,6 +14447,13 @@ TreeTransform::TransformCXXTemporaryObjectExpr( if (TransformExprs(E->getArgs(), E->getNumArgs(), true, Args, &ArgumentChanged)) return ExprError(); + + if (E->isListInitialization() && !E->isStdInitListInitialization()) { + ExprResult Res = RebuildInitList(E->getBeginLoc(), Args, E->getEndLoc()); + if (Res.isInvalid()) + return ExprError(); + Args = {Res.get()}; + } } if (!getDerived().AlwaysRebuild() && @@ -14454,12 +14465,9 @@ TreeTransform::TransformCXXTemporaryObjectExpr( return SemaRef.MaybeBindToTemporary(E); } - // FIXME: We should just pass E->isListInitialization(), but we're not - // prepared to handle list-initialization without a child InitListExpr. SourceLocation LParenLoc = T->getTypeLoc().getEndLoc(); return getDerived().RebuildCXXTemporaryObjectExpr( - T, LParenLoc, Args, E->getEndLoc(), - /*ListInitialization=*/LParenLoc.isInvalid()); + T, LParenLoc, Args, E->getEndLoc(), E->isListInitialization()); } template diff --git a/clang/test/AST/ast-dump-default-init-json.cpp b/clang/test/AST/ast-dump-default-init-json.cpp index 1058b4e3ea4d93d..f4949a9c9eedf4f 100644 --- a/clang/test/AST/ast-dump-default-init-json.cpp +++ b/clang/test/AST/ast-dump-default-init-json.cpp @@ -789,10 +789,10 @@ void test() { // CHECK-NEXT: "valueCategory": "lvalue", // CHECK-NEXT: "extendingDecl": { // CHECK-NEXT: "id": "0x{{.*}}", -// CHECK-NEXT: "kind": "FieldDecl", -// CHECK-NEXT: "name": "a", +// CHECK-NEXT: "kind": "VarDecl", +// CHECK-NEXT: "name": "b", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "const A &" +// CHECK-NEXT: "qualType": "B" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "storageDuration": "automatic", diff --git a/clang/test/AST/ast-dump-default-init.cpp b/clang/test/AST/ast-dump-default-init.cpp index 15b29f04bf21bf6..26864fbf15424dc 100644 --- a/clang/test/AST/ast-dump-default-init.cpp +++ b/clang/test/AST/ast-dump-default-init.cpp @@ -13,7 +13,7 @@ void 
test() { } // CHECK: -CXXDefaultInitExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue has rewritten init // CHECK-NEXT: `-ExprWithCleanups 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue -// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Field 0x{{[^ ]*}} 'a' 'const A &' +// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Var 0x{{[^ ]*}} 'b' 'B' // CHECK-NEXT: `-ImplicitCastExpr 0x{{[^ ]*}} <{{.*}}> 'const A' // CHECK-NEXT: `-CXXFunctionalCastExpr 0x{{[^ ]*}} <{{.*}}> 'A' functional cast to A // CHECK-NEXT: `-InitListExpr 0x{{[^ ]*}} <{{.*}}> 'A' diff --git a/clang/test/Analysis/lifetime-extended-regions.cpp b/clang/test/Analysis/lifetime-extended-regions.cpp index 4e98bd4b0403ebc..4458ad294af7cb0 100644 --- a/clang/test/Analysis/lifetime-extended-regions.cpp +++ b/clang/test/Analysis/lifetime-extended-regions.cpp @@ -120,10 +120,11 @@ void aggregateWithReferences() { clang_analyzer_dump(viaReference); // expected-warning-re {{&lifetime_extended_object{RefAggregate, viaReference, S{{[0-9]+}}} }} clang_analyzer_dump(viaReference.rx); // expected-warning-re {{&lifetime_extended_object{int, viaReference, S{{[0-9]+}}} }} clang_analyzer_dump(viaReference.ry); // expected-warning-re {{&lifetime_extended_object{Composite, viaReference, S{{[0-9]+}}} }} - - // clang does not currently implement extending lifetime of object bound to reference members of aggregates, - // that are created from default member initializer (see `warn_unsupported_lifetime_extension` from `-Wdangling`) - RefAggregate defaultInitExtended{i}; // clang-bug does not extend `Composite` + + // FIXME: clang currently support extending lifetime of object bound to reference members of aggregates, + // that are created from default member initializer. But CFG and ExprEngine need to be updated to address this change. 
+ // The following expect warning: {{&lifetime_extended_object{Composite, defaultInitExtended, S{{[0-9]+}}} }} + RefAggregate defaultInitExtended{i}; clang_analyzer_dump(defaultInitExtended.ry); // expected-warning {{Unknown }} } diff --git a/clang/test/CXX/drs/cwg16xx.cpp b/clang/test/CXX/drs/cwg16xx.cpp index cf6b45ceabf2cc8..95e241f0d03e9b9 100644 --- a/clang/test/CXX/drs/cwg16xx.cpp +++ b/clang/test/CXX/drs/cwg16xx.cpp @@ -449,6 +449,27 @@ namespace cwg1696 { // cwg1696: 7 // since-cxx14-note@-2 {{default member initializer declared here}} }; A a{a, a}; + + struct A1 { + A1() : v(42) {} + // since-cxx14-error@-1 {{reference member 'v' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} + // since-cxx14-note@#cwg1696-A1 {{reference member declared here}} + const int &v; // #cwg1696-A1 + }; + + struct A2 { + A2() = default; + // since-cxx14-error@-1 {{reference member 'v' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} + // since-cxx14-note-re@#cwg1696-A2-b {{in defaulted default constructor for {{.*}} first required here}} + // since-cxx14-note@#cwg1696-A2-a {{initializing field 'v' with default member initializer}} + A2(int v) : v(v) {} + // since-cxx14-warning@-1 {{binding reference member 'v' to stack allocated parameter 'v'}} + // since-cxx14-note@#cwg1696-A2-a {{reference member declared here}} + const int &v = 42; // #cwg1696-A2-a + }; + A2 a1; // #cwg1696-A2-b + + A2 a2(1); // OK, unfortunately #endif } @@ -483,8 +504,6 @@ namespace cwg1696 { // cwg1696: 7 const A &a = A(); // #cwg1696-D1-a }; D1 d1 = {}; // #cwg1696-d1 - // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} - // since-cxx14-note@#cwg1696-D1-a {{initializing field 'a' with default member initializer}} struct D2 { 
const A &a = A(); // #cwg1696-D2-a diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp index 61b7faa96a9fbbb..7f0fb8cf589d48c 100644 --- a/clang/test/CXX/drs/cwg18xx.cpp +++ b/clang/test/CXX/drs/cwg18xx.cpp @@ -206,19 +206,28 @@ namespace cwg1814 { // cwg1814: yes #endif } -namespace cwg1815 { // cwg1815: no +namespace cwg1815 { // cwg1815: 20 #if __cplusplus >= 201402L - // FIXME: needs codegen test - struct A { int &&r = 0; }; // #cwg1815-A + struct A { int &&r = 0; }; A a = {}; - // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} FIXME - // since-cxx14-note@#cwg1815-A {{initializing field 'r' with default member initializer}} struct B { int &&r = 0; }; // #cwg1815-B // since-cxx14-error@-1 {{reference member 'r' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} // since-cxx14-note@#cwg1815-B {{initializing field 'r' with default member initializer}} // since-cxx14-note@#cwg1815-b {{in implicit default constructor for 'cwg1815::B' first required here}} B b; // #cwg1815-b + +#if __cplusplus >= 201703L + struct C { const int &r = 0; }; + constexpr C c = {}; // OK, since cwg1815 + static_assert(c.r == 0); + + constexpr int f() { + A a = {}; // OK, since cwg1815 + return a.r; + } + static_assert(f() == 0); +#endif #endif } diff --git a/clang/test/CXX/special/class.temporary/p6.cpp b/clang/test/CXX/special/class.temporary/p6.cpp index 5554363cc69abb5..a6d2adfd1fd2c5b 100644 --- a/clang/test/CXX/special/class.temporary/p6.cpp +++ b/clang/test/CXX/special/class.temporary/p6.cpp @@ -269,6 +269,40 @@ void init_capture_init_list() { // CHECK: } } +void check_dr1815() { // dr1815: yes +#if __cplusplus >= 201402L + + struct A { + int &&r = 0; + ~A() {} + }; + + struct B { + A &&a = A{}; + ~B() {} + }; + B a = {}; + + // CHECK: 
call {{.*}}block_scope_begin_function + extern void block_scope_begin_function(); + extern void block_scope_end_function(); + block_scope_begin_function(); + { + // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev + // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev + B b = {}; + } + // CHECK: call {{.*}}block_scope_end_function + block_scope_end_function(); + + // CHECK: call {{.*}}some_other_function + extern void some_other_function(); + some_other_function(); + // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev + // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev +#endif +} + namespace P2718R0 { namespace basic { template using T2 = std::list; diff --git a/clang/test/SemaCXX/PR97308.cpp b/clang/test/SemaCXX/PR97308.cpp new file mode 100644 index 000000000000000..7f550bc15d741c6 --- /dev/null +++ b/clang/test/SemaCXX/PR97308.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -o - -emit-llvm -triple x86_64-linux-gnu %s + +// Check there are no crash issue CodeGen action. +// https://github.com/llvm/llvm-project/pull/97308 +struct a { +} constexpr b; +class c { +public: + c(a); +}; +class B { +public: + using d = int; + struct e { + enum { f } g; + int h; + c i; + d j{}; + }; +}; +B::e k{B::e::f, int(), b}; diff --git a/clang/test/SemaCXX/constexpr-default-arg.cpp b/clang/test/SemaCXX/constexpr-default-arg.cpp index ec9b2927880bdfb..901123bfb359ff1 100644 --- a/clang/test/SemaCXX/constexpr-default-arg.cpp +++ b/clang/test/SemaCXX/constexpr-default-arg.cpp @@ -32,8 +32,8 @@ void test_default_arg2() { } // Check that multiple CXXDefaultInitExprs don't cause an assertion failure. 
-struct A { int &&r = 0; }; // expected-note 2{{default member initializer}} +struct A { int &&r = 0; }; struct B { A x, y; }; -B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}} +B b = {}; // expected-no-diagnostics } diff --git a/clang/test/SemaCXX/cxx11-default-member-initializers.cpp b/clang/test/SemaCXX/cxx11-default-member-initializers.cpp index dd8e9c6b7fc11f0..5e26c3a3b82cd55 100644 --- a/clang/test/SemaCXX/cxx11-default-member-initializers.cpp +++ b/clang/test/SemaCXX/cxx11-default-member-initializers.cpp @@ -27,6 +27,103 @@ class MemInit { C m = s; }; +namespace std { +typedef decltype(sizeof(int)) size_t; + +// libc++'s implementation +template class initializer_list { + const _E *__begin_; + size_t __size_; + + initializer_list(const _E *__b, size_t __s) : __begin_(__b), __size_(__s) {} + +public: + typedef _E value_type; + typedef const _E &reference; + typedef const _E &const_reference; + typedef size_t size_type; + + typedef const _E *iterator; + typedef const _E *const_iterator; + + initializer_list() : __begin_(nullptr), __size_(0) {} + + size_t size() const { return __size_; } + const _E *begin() const { return __begin_; } + const _E *end() const { return __begin_ + __size_; } +}; +} // namespace std + +#if __cplusplus >= 201703L + +// Test CXXDefaultInitExpr rebuild issue in +// https://github.com/llvm/llvm-project/pull/87933 +namespace test_rebuild { +template class C { +public: + C(std::initializer_list); +}; + +template using Ptr = __remove_pointer(T) *; +template C(T) -> C, sizeof(T)>; + +class A { +public: + template T1 *some_func(T2 &&); +}; + +struct B : A { + int *ar = some_func(C{some_func(0)}); + B() {} +}; + +int TestBody_got; +template class Vector { +public: + Vector(std::initializer_list); +}; +template Vector(Ts...) 
-> Vector; +class ProgramBuilder { +public: + template int *create(ARGS); +}; + +struct TypeTest : ProgramBuilder { + int *str_f16 = create(Vector{0}); + TypeTest() {} +}; +class TypeTest_Element_Test : TypeTest { + void TestBody(); +}; +void TypeTest_Element_Test::TestBody() { + int *expect = str_f16; + &TestBody_got != expect; // expected-warning {{inequality comparison result unused}} +} +} // namespace test_rebuild + +// Test CXXDefaultInitExpr rebuild issue in +// https://github.com/llvm/llvm-project/pull/92527 +namespace test_rebuild2 { +struct F { + int g; +}; +struct H {}; +struct I { + I(const F &); + I(H); +}; +struct L { + I i = I({.g = 0}); +}; +struct N : L {}; + +void f() { + delete new L; // Ok + delete new N; // Ok +} +} // namespace test_rebuild2 +#endif // __cplusplus >= 201703L + #if __cplusplus >= 202002L // This test ensures cleanup expressions are correctly produced // in the presence of default member initializers. diff --git a/clang/test/SemaCXX/eval-crashes.cpp b/clang/test/SemaCXX/eval-crashes.cpp index 0865dafe4bf92a5..21e05f19be0caa7 100644 --- a/clang/test/SemaCXX/eval-crashes.cpp +++ b/clang/test/SemaCXX/eval-crashes.cpp @@ -25,11 +25,9 @@ namespace pr33140_0b { } namespace pr33140_2 { - // FIXME: The declaration of 'b' below should lifetime-extend two int - // temporaries. - struct A { int &&r = 0; }; // expected-note 2{{initializing field 'r' with default member initializer}} + struct A { int &&r = 0; }; struct B { A x, y; }; - B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}} + B b = {}; } namespace pr33140_3 { diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index b638f0ff30bcceb..f036fc5add24138 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -10717,7 +10717,7 @@

C++ defect report implementation status

1815 CD4 Lifetime extension in aggregate initialization - No + Clang 20 1816 From 0909e3027004bb710b1d761569eb15452ce10346 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 11 Sep 2024 15:29:42 -0700 Subject: [PATCH 58/94] [lldb] Skip checksum-mismatch.test on Windows --- lldb/test/Shell/SymbolFile/checksum-mismatch.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/test/Shell/SymbolFile/checksum-mismatch.test b/lldb/test/Shell/SymbolFile/checksum-mismatch.test index 5db97647c9aa02c..8dc55f80e5c2ef9 100644 --- a/lldb/test/Shell/SymbolFile/checksum-mismatch.test +++ b/lldb/test/Shell/SymbolFile/checksum-mismatch.test @@ -1,3 +1,5 @@ +UNSUPPORTED: system-windows + RUN: mkdir -p %t RUN: cp %S/Inputs/main.c %t/main.c RUN: %clang_host %t/main.c -std=c99 -gdwarf-5 -o %t/main.out From ae0ed3d58600da9ec266bf86d0084775f561ba3a Mon Sep 17 00:00:00 2001 From: tmiasko Date: Thu, 12 Sep 2024 00:37:02 +0200 Subject: [PATCH 59/94] [lsan] Fix free(NULL) interception during initialization (#106912) Previously an attempt to free a null pointer during initialization would fail on ENSURE_LSAN_INITED assertion (since a null pointer is not owned by DlsymAlloc). 
--- compiler-rt/lib/lsan/lsan_interceptors.cpp | 2 ++ compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/compiler-rt/lib/lsan/lsan_interceptors.cpp b/compiler-rt/lib/lsan/lsan_interceptors.cpp index b569c337e976419..efbf2fdfb0ab3ff 100644 --- a/compiler-rt/lib/lsan/lsan_interceptors.cpp +++ b/compiler-rt/lib/lsan/lsan_interceptors.cpp @@ -77,6 +77,8 @@ INTERCEPTOR(void*, malloc, uptr size) { } INTERCEPTOR(void, free, void *p) { + if (UNLIKELY(!p)) + return; if (DlsymAlloc::PointerIsMine(p)) return DlsymAlloc::Free(p); ENSURE_LSAN_INITED; diff --git a/compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c b/compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c index 3905ac40ae2dc74..0228c3bc50dbd95 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c +++ b/compiler-rt/test/sanitizer_common/TestCases/dlsym_alloc.c @@ -3,9 +3,6 @@ // FIXME: TSAN does not use DlsymAlloc. // UNSUPPORTED: tsan -// FIXME: https://github.com/llvm/llvm-project/pull/106912 -// XFAIL: lsan - #include const char *test() __attribute__((disable_sanitizer_instrumentation)) { From 2a9208b0c9ddec4d321bf3af1d06a60210c89da3 Mon Sep 17 00:00:00 2001 From: Brendan Dahl Date: Wed, 11 Sep 2024 15:39:56 -0700 Subject: [PATCH 60/94] [WebAssembly] Change F16x8 extract lane to require constant integer. (#108116) Building with no optimizations resulted in failures since the lane constant wasn't a constant in LLVM IR. 
--- .../clang/Basic/BuiltinsWebAssembly.def | 4 ++-- clang/lib/Headers/wasm_simd128.h | 21 +++++++++---------- clang/test/CodeGen/builtins-wasm.c | 12 +++++------ 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index 2e80eef2c8b9bc9..ad73f031922a0b2 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -209,8 +209,8 @@ TARGET_BUILTIN(__builtin_wasm_relaxed_dot_bf16x8_add_f32_f32x4, "V4fV8UsV8UsV4f" TARGET_BUILTIN(__builtin_wasm_loadf16_f32, "fh*", "nU", "fp16") TARGET_BUILTIN(__builtin_wasm_storef16_f32, "vfh*", "n", "fp16") TARGET_BUILTIN(__builtin_wasm_splat_f16x8, "V8hf", "nc", "fp16") -TARGET_BUILTIN(__builtin_wasm_extract_lane_f16x8, "fV8hi", "nc", "fp16") -TARGET_BUILTIN(__builtin_wasm_replace_lane_f16x8, "V8hV8hif", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_extract_lane_f16x8, "fV8hIi", "nc", "fp16") +TARGET_BUILTIN(__builtin_wasm_replace_lane_f16x8, "V8hV8hIif", "nc", "fp16") // Reference Types builtins // Some builtins are custom type-checked - see 't' as part of the third argument, diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index 67d12f6f2cf4191..14e36e85da8efa2 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -1888,18 +1888,17 @@ static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_splat(float __a) { return (v128_t)__builtin_wasm_splat_f16x8(__a); } -static __inline__ float __FP16_FN_ATTRS wasm_f16x8_extract_lane(v128_t __a, - int __i) - __REQUIRE_CONSTANT(__i) { - return __builtin_wasm_extract_lane_f16x8((__f16x8)__a, __i); -} +#ifdef __wasm_fp16__ +// TODO Replace the following macros with regular C functions and use normal +// target-independent vector code like the other replace/extract instructions. 
-static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_replace_lane(v128_t __a, - int __i, - float __b) - __REQUIRE_CONSTANT(__i) { - return (v128_t)__builtin_wasm_replace_lane_f16x8((__f16x8)__a, __i, __b); -} +#define wasm_f16x8_extract_lane(__a, __i) \ + (__builtin_wasm_extract_lane_f16x8((__f16x8)(__a), __i)) + +#define wasm_f16x8_replace_lane(__a, __i, __b) \ + ((v128_t)__builtin_wasm_replace_lane_f16x8((__f16x8)(__a), __i, __b)) + +#endif static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_abs(v128_t __a) { return (v128_t)__builtin_wasm_abs_f16x8((__f16x8)__a); diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 3010b8954f1c2ef..8943a92faad044c 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -834,16 +834,16 @@ f16x8 splat_f16x8(float a) { return __builtin_wasm_splat_f16x8(a); } -float extract_lane_f16x8(f16x8 a, int i) { - // WEBASSEMBLY: %0 = tail call float @llvm.wasm.extract.lane.f16x8(<8 x half> %a, i32 %i) +float extract_lane_f16x8(f16x8 a) { + // WEBASSEMBLY: %0 = tail call float @llvm.wasm.extract.lane.f16x8(<8 x half> %a, i32 7) // WEBASSEMBLY-NEXT: ret float %0 - return __builtin_wasm_extract_lane_f16x8(a, i); + return __builtin_wasm_extract_lane_f16x8(a, 7); } -f16x8 replace_lane_f16x8(f16x8 a, int i, float v) { - // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.replace.lane.f16x8(<8 x half> %a, i32 %i, float %v) +f16x8 replace_lane_f16x8(f16x8 a, float v) { + // WEBASSEMBLY: %0 = tail call <8 x half> @llvm.wasm.replace.lane.f16x8(<8 x half> %a, i32 7, float %v) // WEBASSEMBLY-NEXT: ret <8 x half> %0 - return __builtin_wasm_replace_lane_f16x8(a, i, v); + return __builtin_wasm_replace_lane_f16x8(a, 7, v); } f16x8 min_f16x8(f16x8 a, f16x8 b) { From b5fd9463a3b9aecfc132828510f7e2a47b581b14 Mon Sep 17 00:00:00 2001 From: jofrn Date: Wed, 11 Sep 2024 18:46:46 -0400 Subject: [PATCH 61/94] [HIP][Clang][CodeGen] Handle hip bin symbols properly. 
(#107458) Remove '_' in fatbin and gpubin symbol suffixes when missing TU hash ID. Internalize gpubin symbol so that it is not unresolved at link-time when symbol is not relocatable. --- clang/lib/CodeGen/CGCUDANV.cpp | 19 +++++++++++-------- clang/test/CodeGenCUDA/device-stub.cu | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 59c5927717933d5..ae14d74f2d91511 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -840,8 +840,10 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { FatBinStr = new llvm::GlobalVariable( CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, - "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr, - llvm::GlobalVariable::NotThreadLocal); + "__hip_fatbin" + (CGM.getLangOpts().CUID.empty() + ? "" + : "_" + CGM.getContext().getCUIDHash()), + nullptr, llvm::GlobalVariable::NotThreadLocal); cast(FatBinStr)->setSection(FatbinConstantName); } @@ -894,8 +896,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { // thread safety of the loaded program. Therefore we can assume sequential // execution of constructor functions here. if (IsHIP) { - auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage - : llvm::GlobalValue::ExternalLinkage; + auto Linkage = RelocatableDeviceCode ? llvm::GlobalValue::ExternalLinkage + : llvm::GlobalValue::InternalLinkage; llvm::BasicBlock *IfBlock = llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); llvm::BasicBlock *ExitBlock = @@ -905,10 +907,11 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { GpuBinaryHandle = new llvm::GlobalVariable( TheModule, PtrTy, /*isConstant=*/false, Linkage, /*Initializer=*/ - CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr, - CudaGpuBinary - ? "__hip_gpubin_handle" - : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash()); + !RelocatableDeviceCode ? 
llvm::ConstantPointerNull::get(PtrTy) + : nullptr, + "__hip_gpubin_handle" + (CGM.getLangOpts().CUID.empty() + ? "" + : "_" + CGM.getContext().getCUIDHash())); GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); // Prevent the weak symbol in different shared libraries being merged. if (Linkage != llvm::GlobalValue::InternalLinkage) diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu index 60304647bd4c54a..8695433f6df10cf 100644 --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -175,7 +175,7 @@ __device__ void device_use() { // HIP-SAME: section ".hipFatBinSegment" // * variable to save GPU binary handle after initialization // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global ptr null -// HIPNEF: @__[[PREFIX]]_gpubin_handle_{{[0-9a-f]+}} = external hidden global ptr, align 8 +// HIPNEF: @__[[PREFIX]]_gpubin_handle_{{[0-9a-f]+}} = internal global ptr null, align 8 // * constant unnamed string with NVModuleID // CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 From 7910812414108ed9085548e2704f3ad5c018e970 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 11 Sep 2024 15:41:35 -0700 Subject: [PATCH 62/94] [SLP] Regen a test to pick up naming changes --- .../SLPVectorizer/RISCV/complex-loads.ll | 214 +++++++++--------- 1 file changed, 107 insertions(+), 107 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 36681ecea4f50fd..01c842edd88e418 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -45,13 +45,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP16]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call <2 x i8> 
@llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> , i32 2) ; CHECK-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP28]] to <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP13]], [[TMP12]] ; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], ; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP25]], [[TMP8]] -; CHECK-NEXT: [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP28]] to <2 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> , i32 2) ; CHECK-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = sub <2 x i32> [[TMP17]], [[TMP19]] @@ -62,16 +62,16 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP22]], [[TMP30]] ; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], ; CHECK-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP37]], [[TMP20]] -; CHECK-NEXT: [[TMP26:%.*]] = add <2 x i32> [[TMP27]], [[TMP15]] -; CHECK-NEXT: [[TMP38:%.*]] = sub <2 x i32> [[TMP15]], [[TMP27]] -; CHECK-NEXT: [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 -; CHECK-NEXT: 
[[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]] -; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]] +; CHECK-NEXT: [[TMP38:%.*]] = add <2 x i32> [[TMP27]], [[TMP15]] +; CHECK-NEXT: [[TMP29:%.*]] = sub <2 x i32> [[TMP15]], [[TMP27]] ; CHECK-NEXT: [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 ; CHECK-NEXT: [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 -; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]] -; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]] +; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]] +; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP29]], i32 1 +; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP34]], [[TMP32]] +; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP32]], [[TMP34]] ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 ; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1 @@ -80,17 +80,17 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5 ; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1 ; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> ; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) ; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP33]], [[TMP39]] +; CHECK-NEXT: 
[[TMP40:%.*]] = sub <2 x i32> [[TMP58]], [[TMP39]] ; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> , i32 2) ; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP42]], [[TMP59]] +; CHECK-NEXT: [[TMP59:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP59]] to <2 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP42]], [[TMP62]] ; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], -; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP40]] +; CHECK-NEXT: [[TMP68:%.*]] = add <2 x i32> [[TMP46]], [[TMP40]] ; CHECK-NEXT: [[TMP48:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> , i32 2) ; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32> ; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> , i32 2) @@ -104,53 +104,53 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]] ; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], ; CHECK-NEXT: [[TMP60:%.*]] = add <2 x i32> [[TMP70]], [[TMP52]] -; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP60]], [[TMP62]] -; CHECK-NEXT: [[TMP47:%.*]] = sub <2 x i32> [[TMP62]], [[TMP60]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 -; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]] -; 
CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP74]], [[TMP75]] +; CHECK-NEXT: [[TMP47:%.*]] = add <2 x i32> [[TMP60]], [[TMP68]] +; CHECK-NEXT: [[TMP33:%.*]] = sub <2 x i32> [[TMP68]], [[TMP60]] ; CHECK-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0 ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1 -; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP79]], [[TMP61]] -; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP61]], [[TMP79]] -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] -; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] +; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP79]], [[TMP61]] +; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP61]], [[TMP79]] ; CHECK-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0 -; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP63]], 15 +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <2 x i32> [[TMP33]], i32 1 +; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP71]], [[TMP63]] +; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP63]], [[TMP71]] +; CHECK-NEXT: [[ADD95:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] +; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0 +; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15 ; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 ; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV]], 15 +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[SUB47_2]], 15 ; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 ; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 ; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]] ; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] ; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15 -; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537 -; CHECK-NEXT: 
[[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535 +; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP107]], 15 +; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 +; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 ; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]] ; CHECK-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]] -; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15 -; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 -; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 -; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]] -; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]] -; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15 +; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV_1]], 15 ; CHECK-NEXT: [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537 ; CHECK-NEXT: [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535 +; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]] +; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15 +; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 +; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 ; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 ; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> ; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP77:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> +; CHECK-NEXT: [[TMP72:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> ; CHECK-NEXT: [[TMP73:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> -; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x 
i1> , i32 2) -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP78]], [[TMP76]] +; CHECK-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> +; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP74]], [[TMP76]] ; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], -; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32> +; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> ; CHECK-NEXT: [[TMP81:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) ; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32> ; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) @@ -158,40 +158,40 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] ; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 -; CHECK-NEXT: [[TMP90:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]] -; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP90]] +; CHECK-NEXT: [[TMP89:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]] +; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP89]] ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; 
CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP86]], [[TMP77]] -; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP98]] +; CHECK-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP86]], [[TMP72]] +; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP100]] ; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> -; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] -; CHECK-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] -; CHECK-NEXT: [[TMP238:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0 -; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1 -; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]] -; CHECK-NEXT: [[SUB51:%.*]] = sub i32 [[TMP238]], [[TMP108]] +; CHECK-NEXT: [[TMP91:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] +; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] ; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 ; CHECK-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1 -; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP94]] -; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP94]], [[SUB47]] -; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15 +; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[SUB47]], [[TMP94]] +; CHECK-NEXT: [[SUB51:%.*]] = sub i32 [[TMP94]], [[SUB47]] +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0 +; CHECK-NEXT: [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1 +; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP98]] +; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP98]], [[TMP99]] +; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[SUB47]], 15 +; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 +; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 +; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15 ; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 ; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; CHECK-NEXT: 
[[SHR_I59_4:%.*]] = lshr i32 [[SUB47]], 15 -; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537 -; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 ; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> -; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> -; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> -; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP118]], [[TMP155]] +; CHECK-NEXT: [[TMP108:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP108]] to <2 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP116:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> +; CHECK-NEXT: [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> +; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP116]], [[TMP118]] ; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], -; CHECK-NEXT: [[TMP156:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x 
i8> [[TMP156]] to <2 x i32> +; CHECK-NEXT: [[TMP121:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP121]] to <2 x i32> ; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25_1]], i64 2, <2 x i1> , i32 2) ; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> ; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) @@ -205,35 +205,35 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP122:%.*]] = sub <2 x i32> [[TMP117]], [[TMP103]] ; CHECK-NEXT: [[TMP123:%.*]] = add <2 x i32> [[TMP125]], [[TMP122]] ; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP120]], [[TMP123]] -; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP120]] +; CHECK-NEXT: [[TMP156:%.*]] = sub <2 x i32> [[TMP123]], [[TMP120]] ; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 ; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]] +; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[TMP146]], [[TMP145]] ; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP145]], [[TMP146]] -; CHECK-NEXT: [[TMP126:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1 -; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP127]], [[TMP126]] -; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP126]], [[TMP127]] +; CHECK-NEXT: [[TMP180:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0 +; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1 +; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP142]], [[TMP180]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP180]], [[TMP142]] ; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 
[[TMP146]], 15 ; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 ; CHECK-NEXT: [[TMP147:%.*]] = lshr <2 x i32> [[TMP110]], ; CHECK-NEXT: [[TMP148:%.*]] = and <2 x i32> [[TMP147]], ; CHECK-NEXT: [[TMP149:%.*]] = mul <2 x i32> [[TMP148]], -; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] -; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] -; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] +; CHECK-NEXT: [[ADD79:%.*]] = add i32 [[ADD94]], [[ADD78]] ; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] -; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] -; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]] +; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD79]] +; CHECK-NEXT: [[SUB105:%.*]] = sub i32 [[ADD79]], [[ADD95]] +; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB104]] +; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB104]], [[SUB102]] ; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]] -; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]] +; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]] ; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]] -; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[SUB47_2]] +; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB105]] ; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]] -; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]] -; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP108]] +; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] +; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[SUB47]] ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] ; CHECK-NEXT: [[ADD113:%.*]] = add i32 
[[ADD112]], [[XOR_I63]] @@ -241,9 +241,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]] ; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] ; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_1]] +; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] ; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]] -; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP121]], <2 x i32> +; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP156]], <2 x i32> ; CHECK-NEXT: [[TMP130:%.*]] = lshr <2 x i32> [[TMP129]], ; CHECK-NEXT: [[TMP131:%.*]] = and <2 x i32> [[TMP130]], ; CHECK-NEXT: [[TMP132:%.*]] = mul <2 x i32> [[TMP131]], @@ -256,13 +256,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP139:%.*]] = shufflevector <2 x i32> [[TMP153]], <2 x i32> [[TMP138]], <2 x i32> ; CHECK-NEXT: [[TMP140:%.*]] = add <2 x i32> [[TMP132]], [[TMP139]] ; CHECK-NEXT: [[TMP141:%.*]] = xor <2 x i32> [[TMP140]], [[TMP129]] -; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]] +; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]] ; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0 -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP142]] -; CHECK-NEXT: [[TMP154:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1 -; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP154]] +; CHECK-NEXT: [[TMP154:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0 +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], 
[[TMP154]] +; CHECK-NEXT: [[TMP155:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1 +; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP155]] ; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] ; CHECK-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_1]], [[SUB51]] ; CHECK-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB51_1]] @@ -270,25 +270,25 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP245:%.*]] = shufflevector <2 x i32> [[TMP244]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP197:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 ; CHECK-NEXT: [[TMP198:%.*]] = shufflevector <2 x i32> [[TMP197]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP216:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]] -; CHECK-NEXT: [[TMP210:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]] -; CHECK-NEXT: [[TMP221:%.*]] = shufflevector <2 x i32> [[TMP216]], <2 x i32> [[TMP210]], <2 x i32> +; CHECK-NEXT: [[TMP207:%.*]] = add <2 x i32> [[TMP245]], [[TMP198]] +; CHECK-NEXT: [[TMP208:%.*]] = sub <2 x i32> [[TMP245]], [[TMP198]] +; CHECK-NEXT: [[TMP209:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP208]], <2 x i32> ; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]] ; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]] -; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_2]] +; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_6]], [[ADD105_2]] ; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; CHECK-NEXT: [[TMP134:%.*]] = add <2 x i32> [[TMP149]], [[TMP221]] +; CHECK-NEXT: [[TMP134:%.*]] = add <2 x i32> [[TMP149]], [[TMP209]] ; CHECK-NEXT: [[TMP213:%.*]] = xor <2 x i32> [[TMP134]], [[TMP110]] -; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP238]], 15 +; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP94]], 15 ; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 ; CHECK-NEXT: [[MUL_I61_2:%.*]] = 
mul i32 [[AND_I60_2]], 65535 ; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP238]] +; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP94]] ; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP237]] -; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1 -; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP218]] +; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0 +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP157]] +; CHECK-NEXT: [[TMP158:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1 +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP158]] ; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] ; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]] ; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]] @@ -301,7 +301,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP220:%.*]] = shufflevector <2 x i32> [[TMP261]], <2 x i32> [[TMP262]], <2 x i32> ; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]] ; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]] -; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_6]], [[ADD105_3]] +; CHECK-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_4]], [[ADD105_3]] ; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV1]] ; CHECK-NEXT: [[TMP230:%.*]] = lshr <2 x i32> [[TMP102]], ; CHECK-NEXT: [[TMP231:%.*]] = and <2 x i32> [[TMP230]], From aa60a3e4d0664dedc8ae0ea005459186fdc1aab9 Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Wed, 11 Sep 2024 17:51:07 -0500 Subject: [PATCH 63/94] [mlir][AMDGPU] Support vector<2xf16> inputs to buffer atomic fadd (#108286) Extend the lowering of atomic.fadd to 
support the v2f16 variant avaliable on some AMDGPU chips. Re-lands #108238 (and addresses review comments from there) Co-authored-by: Giuseppe Rossini --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 6 +++--- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 7 +++++-- .../Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 11 +++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 8a1ef94c853a587..1ec8227e2326376 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -254,7 +254,7 @@ def AMDGPU_RawBufferAtomicCmpswapOp : def AMDGPU_RawBufferAtomicFaddOp : AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>, AttrSizedOperandSegments]>, - Arguments<(ins F32:$value, + Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16]>]>:$value, Arg:$memref, Variadic:$indices, DefaultValuedAttr:$boundsCheck, @@ -405,7 +405,7 @@ def AMDGPU_RawBufferAtomicUminOp : def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm", "The possible permutations for a DPP operation", - [ + [ I32EnumAttrCase<"quad_perm", 0>, I32EnumAttrCase<"row_shl", 1>, I32EnumAttrCase<"row_shr", 2>, @@ -419,7 +419,7 @@ def AMDGPU_DPPPerm : I32EnumAttr<"DPPPerm", I32EnumAttrCase<"row_bcast_15", 10>, I32EnumAttrCase<"row_bcast_31", 11> ]> { - let genSpecializedAttr = 0; + let genSpecializedAttr = 0; let cppNamespace = "::mlir::amdgpu"; } diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 96b433294d258ab..9fb557bc8a65edc 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -115,15 +115,18 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { rewriter.getIntegerType(floatType.getWidth())); } if (auto dataVector = dyn_cast(wantedDataType)) { + uint32_t vecLen = 
dataVector.getNumElements(); uint32_t elemBits = dataVector.getElementTypeBitWidth(); - uint32_t totalBits = elemBits * dataVector.getNumElements(); + uint32_t totalBits = elemBits * vecLen; + bool usePackedFp16 = + isa_and_present(*gpuOp) && vecLen == 2; if (totalBits > maxVectorOpWidth) return gpuOp.emitOpError( "Total width of loads or stores must be no more than " + Twine(maxVectorOpWidth) + " bits, but we call for " + Twine(totalBits) + " bits. This should've been caught in validation"); - if (elemBits < 32) { + if (!usePackedFp16 && elemBits < 32) { if (totalBits > 32) { if (totalBits % 32 != 0) return gpuOp.emitOpError("Load or store of more than 32-bits that " diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index 717667c22af8009..cc51a8c40942f98 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -151,6 +151,17 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, func.return } +// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2f16 +func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: memref<64xf16>, %idx: i32) { + // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32) + // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32) + // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32) + // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]] + // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xf16> + amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xf16> -> memref<64xf16>, i32 + func.return +} + // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32 func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) { // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32) 
From fa8b737a81c310b297d1120dae1f915c63486498 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 11 Sep 2024 15:52:31 -0700 Subject: [PATCH 64/94] [SLP][RISCV] Add test for 3 element build vector feeding reduce Our costs for build vectors are currently a bit off which inhibits vectorization. Fix forthcoming. --- .../SLPVectorizer/RISCV/vec3-base.ll | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index faffe16f8e9cd98..6dd9242989b627a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -793,6 +793,25 @@ entry: ret double %add } +define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) { +; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec( +; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01 +; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01 +; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01 +; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; CHECK-NEXT: ret float [[ADD_1]] +; + %mul.0 = fmul fast float %a, 10.0 + %mul.1 = fmul fast float %b, 10.0 + %mul.2 = fmul fast float %c, 10.0 + + %add.0 = fadd fast float %mul.0, %mul.1 + %add.1 = fadd fast float %add.0, %mul.2 + ret float %add.1 +} + + declare float @llvm.fmuladd.f32(float, float, float) declare double @llvm.fmuladd.f64(double, double, double) From d32982b6b3753091a532530c7a66f9686deb5233 Mon Sep 17 00:00:00 2001 From: Sarah Spall Date: Wed, 11 Sep 2024 16:04:44 -0700 Subject: [PATCH 65/94] [HLSL] fix elementwise bitreverse test (#108128) The test called 'ceil' instead of 'bitreverse', which I assume was a copy paste leftover. 
--- clang/test/Sema/builtins-elementwise-math.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/Sema/builtins-elementwise-math.c b/clang/test/Sema/builtins-elementwise-math.c index 2673f1f519af694..628274380ae5f2b 100644 --- a/clang/test/Sema/builtins-elementwise-math.c +++ b/clang/test/Sema/builtins-elementwise-math.c @@ -275,8 +275,8 @@ void test_builtin_elementwise_min(int i, short s, double d, float4 v, int3 iv, u void test_builtin_elementwise_bitreverse(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) { - struct Foo s = __builtin_elementwise_ceil(f); - // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}} + struct Foo s = __builtin_elementwise_bitreverse(i); + // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'int'}} i = __builtin_elementwise_bitreverse(); // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} From 6e4dcbb21dab47e520f2cd19e7017af27328669e Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Wed, 11 Sep 2024 16:04:56 -0700 Subject: [PATCH 66/94] [clang][deps] Print tracing VFS data (#108056) Clang's `-cc1 -print-stats` shows lots of useful internal data including basic `FileManager` stats. Since this layer caches some results, it is unclear how that information translates to actual filesystem accesses. This PR uses `llvm::vfs::TracingFileSystem` to provide that missing information. Similar mechanism is implemented for `clang-scan-deps`'s verbose mode (`-v`). IO contention proved to be a real bottleneck a couple of times already and this new feature should make those easier to detect in the future. The tracing VFS is inserted below the caching FS and above the real FS. 
--- .../DependencyScanningService.h | 6 +++- .../DependencyScanningTool.h | 2 ++ .../DependencyScanningWorker.h | 2 ++ clang/lib/Basic/FileManager.cpp | 11 +++++++ clang/lib/Frontend/CompilerInstance.cpp | 3 ++ .../DependencyScanningService.cpp | 4 +-- .../DependencyScanningWorker.cpp | 3 ++ clang/test/ClangScanDeps/verbose.test | 28 +++++++++++++++++ clang/test/Misc/print-stats-vfs.test | 17 +++++++++++ clang/tools/clang-scan-deps/ClangScanDeps.cpp | 30 ++++++++++++++++++- 10 files changed, 102 insertions(+), 4 deletions(-) create mode 100644 clang/test/ClangScanDeps/verbose.test create mode 100644 clang/test/Misc/print-stats-vfs.test diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h index 557f0e547ab4a8f..4a343f2872d8d97 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningService.h @@ -76,7 +76,7 @@ class DependencyScanningService { DependencyScanningService( ScanningMode Mode, ScanningOutputFormat Format, ScanningOptimizations OptimizeArgs = ScanningOptimizations::Default, - bool EagerLoadModules = false); + bool EagerLoadModules = false, bool TraceVFS = false); ScanningMode getMode() const { return Mode; } @@ -86,6 +86,8 @@ class DependencyScanningService { bool shouldEagerLoadModules() const { return EagerLoadModules; } + bool shouldTraceVFS() const { return TraceVFS; } + DependencyScanningFilesystemSharedCache &getSharedCache() { return SharedCache; } @@ -97,6 +99,8 @@ class DependencyScanningService { const ScanningOptimizations OptimizeArgs; /// Whether to set up command-lines to load PCM files eagerly. const bool EagerLoadModules; + /// Whether to trace VFS accesses. + const bool TraceVFS; /// The global file system cache. 
DependencyScanningFilesystemSharedCache SharedCache; }; diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h index cb9476d1550df34..012237e0278f4ab 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningTool.h @@ -144,6 +144,8 @@ class DependencyScanningTool { StringRef CWD, const llvm::DenseSet &AlreadySeen, LookupModuleOutputCallback LookupModuleOutput); + llvm::vfs::FileSystem &getWorkerVFS() const { return Worker.getVFS(); } + private: DependencyScanningWorker Worker; }; diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h index 0f607862194b316..da6e0401411a34f 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningWorker.h @@ -104,6 +104,8 @@ class DependencyScanningWorker { bool shouldEagerLoadModules() const { return EagerLoadModules; } + llvm::vfs::FileSystem &getVFS() const { return *BaseFS; } + private: std::shared_ptr PCHContainerOps; /// The file system to be used during the scan. 
diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 4509cee1ca0fedf..6097b85a03064ba 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -692,5 +692,16 @@ void FileManager::PrintStats() const { llvm::errs() << NumFileLookups << " file lookups, " << NumFileCacheMisses << " file cache misses.\n"; + getVirtualFileSystem().visit([](llvm::vfs::FileSystem &VFS) { + if (auto *T = dyn_cast_or_null(&VFS)) + llvm::errs() << "\n*** Virtual File System Stats:\n" + << T->NumStatusCalls << " status() calls\n" + << T->NumOpenFileForReadCalls << " openFileForRead() calls\n" + << T->NumDirBeginCalls << " dir_begin() calls\n" + << T->NumGetRealPathCalls << " getRealPath() calls\n" + << T->NumExistsCalls << " exists() calls\n" + << T->NumIsLocalCalls << " isLocal() calls\n"; + }); + //llvm::errs() << PagesMapped << BytesOfPagesMapped << FSLookups; } diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 1364641a9b71e12..5a273474f1d6b60 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -381,6 +381,9 @@ FileManager *CompilerInstance::createFileManager( : createVFSFromCompilerInvocation(getInvocation(), getDiagnostics()); assert(VFS && "FileManager has no VFS?"); + if (getFrontendOpts().ShowStats) + VFS = + llvm::makeIntrusiveRefCnt(std::move(VFS)); FileMgr = new FileManager(getFileSystemOpts(), std::move(VFS)); return FileMgr.get(); } diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp index 7458ef484b16c40..4fb5977580497cf 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningService.cpp @@ -15,9 +15,9 @@ using namespace dependencies; DependencyScanningService::DependencyScanningService( ScanningMode Mode, ScanningOutputFormat Format, - 
ScanningOptimizations OptimizeArgs, bool EagerLoadModules) + ScanningOptimizations OptimizeArgs, bool EagerLoadModules, bool TraceVFS) : Mode(Mode), Format(Format), OptimizeArgs(OptimizeArgs), - EagerLoadModules(EagerLoadModules) { + EagerLoadModules(EagerLoadModules), TraceVFS(TraceVFS) { // Initialize targets for object file support. llvm::InitializeAllTargets(); llvm::InitializeAllTargetMCs(); diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 09ad5ebc7954cf8..d77187bfb1f2b8b 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -501,6 +501,9 @@ DependencyScanningWorker::DependencyScanningWorker( // The scanner itself writes only raw ast files. PCHContainerOps->registerWriter(std::make_unique()); + if (Service.shouldTraceVFS()) + FS = llvm::makeIntrusiveRefCnt(std::move(FS)); + switch (Service.getMode()) { case ScanningMode::DependencyDirectivesScan: DepFS = diff --git a/clang/test/ClangScanDeps/verbose.test b/clang/test/ClangScanDeps/verbose.test new file mode 100644 index 000000000000000..99c5214c7620188 --- /dev/null +++ b/clang/test/ClangScanDeps/verbose.test @@ -0,0 +1,28 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.in > %t/cdb.json + +// RUN: clang-scan-deps -compilation-database %t/cdb.json -v -o %t/result.json 2>&1 | FileCheck %s +// CHECK: *** Virtual File System Stats: +// CHECK-NEXT: {{[[:digit:]]+}} status() calls +// CHECK-NEXT: {{[[:digit:]]+}} openFileForRead() calls +// CHECK-NEXT: {{[[:digit:]]+}} dir_begin() calls +// CHECK-NEXT: {{[[:digit:]]+}} getRealPath() calls +// CHECK-NEXT: {{[[:digit:]]+}} exists() calls +// CHECK-NEXT: {{[[:digit:]]+}} isLocal() calls + +//--- tu.c + +//--- cdb.json.in +[ + { + "file": "DIR/tu.c" + "directory": "DIR", + "command": "clang -c DIR/tu.c -o DIR/tu.o" + }, + { + 
"file": "DIR/tu.c" + "directory": "DIR", + "command": "clang -c DIR/tu.c -o DIR/tu.o" + } +] diff --git a/clang/test/Misc/print-stats-vfs.test b/clang/test/Misc/print-stats-vfs.test new file mode 100644 index 000000000000000..65446cb7a5077d3 --- /dev/null +++ b/clang/test/Misc/print-stats-vfs.test @@ -0,0 +1,17 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t + +// RUN: %clang_cc1 -fsyntax-only %t/tu.c -I %t/dir1 -I %t/dir2 -print-stats 2>&1 | FileCheck %s + +//--- tu.c +#include "header.h" +//--- dir1/other.h +//--- dir2/header.h + +// CHECK: *** Virtual File System Stats: +// CHECK-NEXT: {{[[:digit:]]+}} status() calls +// CHECK-NEXT: {{[[:digit:]]+}} openFileForRead() calls +// CHECK-NEXT: {{[[:digit:]]+}} dir_begin() calls +// CHECK-NEXT: {{[[:digit:]]+}} getRealPath() calls +// CHECK-NEXT: {{[[:digit:]]+}} exists() calls +// CHECK-NEXT: {{[[:digit:]]+}} isLocal() calls diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index a8f6150dd3493d1..259058c798e5d1b 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -915,6 +915,13 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { if (Format == ScanningOutputFormat::Full) FD.emplace(ModuleName.empty() ? 
Inputs.size() : 0); + std::atomic NumStatusCalls = 0; + std::atomic NumOpenFileForReadCalls = 0; + std::atomic NumDirBeginCalls = 0; + std::atomic NumGetRealPathCalls = 0; + std::atomic NumExistsCalls = 0; + std::atomic NumIsLocalCalls = 0; + auto ScanningTask = [&](DependencyScanningService &Service) { DependencyScanningTool WorkerTool(Service); @@ -999,10 +1006,21 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { HadErrors = true; } } + + WorkerTool.getWorkerVFS().visit([&](llvm::vfs::FileSystem &VFS) { + if (auto *T = dyn_cast_or_null(&VFS)) { + NumStatusCalls += T->NumStatusCalls; + NumOpenFileForReadCalls += T->NumOpenFileForReadCalls; + NumDirBeginCalls += T->NumDirBeginCalls; + NumGetRealPathCalls += T->NumGetRealPathCalls; + NumExistsCalls += T->NumExistsCalls; + NumIsLocalCalls += T->NumIsLocalCalls; + } + }); }; DependencyScanningService Service(ScanMode, Format, OptimizeArgs, - EagerLoadModules); + EagerLoadModules, /*TraceVFS=*/Verbose); llvm::Timer T; T.startTimer(); @@ -1025,6 +1043,16 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { } T.stopTimer(); + + if (Verbose) + llvm::errs() << "\n*** Virtual File System Stats:\n" + << NumStatusCalls << " status() calls\n" + << NumOpenFileForReadCalls << " openFileForRead() calls\n" + << NumDirBeginCalls << " dir_begin() calls\n" + << NumGetRealPathCalls << " getRealPath() calls\n" + << NumExistsCalls << " exists() calls\n" + << NumIsLocalCalls << " isLocal() calls\n"; + if (PrintTiming) llvm::errs() << llvm::format( "clang-scan-deps timing: %0.2fs wall, %0.2fs process\n", From 1b3e64a9d2a85871a28fc98bcca236df640c64e8 Mon Sep 17 00:00:00 2001 From: Elvis Wang Date: Thu, 12 Sep 2024 07:06:36 +0800 Subject: [PATCH 67/94] [RISCV][TTI] Add vp.cmp intrinsic cost with functionalOPC. (#107504) This patch make the instruction cost of VP compare intrinsics as same as their non-VP counterpart. 
--- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 15 + llvm/test/Analysis/CostModel/RISCV/cmp.ll | 660 ++++++++++++++++++ 2 files changed, 675 insertions(+) create mode 100644 llvm/test/Analysis/CostModel/RISCV/cmp.ll diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index e809e15eacf696f..2b5e7c472792847 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1033,6 +1033,21 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::CastContextHint::None, CostKind); break; } + + // vp compare + case Intrinsic::vp_icmp: + case Intrinsic::vp_fcmp: { + Intrinsic::ID IID = ICA.getID(); + std::optional FOp = VPIntrinsic::getFunctionalOpcodeForVP(IID); + auto *UI = dyn_cast(ICA.getInst()); + + // We can only handle vp_cmp intrinsics with underlying instructions. + if (!UI) + break; + assert(FOp); + return getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0], ICA.getReturnType(), + UI->getPredicate(), CostKind); + } } if (ST->hasVInstructions() && RetTy->isVectorTy()) { diff --git a/llvm/test/Analysis/CostModel/RISCV/cmp.ll b/llvm/test/Analysis/CostModel/RISCV/cmp.ll new file mode 100644 index 000000000000000..40938e000b64ece --- /dev/null +++ b/llvm/test/Analysis/CostModel/RISCV/cmp.ll @@ -0,0 +1,660 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+f -passes="print" -cost-kind=throughput --type-based-intrinsic-cost=true 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+f -passes="print" -cost-kind=throughput --type-based-intrinsic-cost=true 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64 + +define void @icmp() { +; RV32-LABEL: 'icmp' +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef +; 
RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = 
call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost 
Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef +; 
RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %51 = icmp slt <64 x i1> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef +; RV32-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp slt <128 x i64> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> 
@llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> 
undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %86 = call @llvm.vp.icmp.nxv1i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %87 = call @llvm.vp.icmp.nxv1i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %88 = call @llvm.vp.icmp.nxv1i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %89 = call @llvm.vp.icmp.nxv1i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %90 = call @llvm.vp.icmp.nxv1i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %92 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %96 = call @llvm.vp.icmp.nxv2i1( undef, undef, metadata !"slt", undef, i32 undef) 
+; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %97 = call @llvm.vp.icmp.nxv2i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %98 = call @llvm.vp.icmp.nxv2i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %99 = call @llvm.vp.icmp.nxv2i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %100 = call @llvm.vp.icmp.nxv2i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %106 = call @llvm.vp.icmp.nxv4i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %107 = call @llvm.vp.icmp.nxv4i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %108 = call @llvm.vp.icmp.nxv4i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %109 = call @llvm.vp.icmp.nxv4i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %110 = call @llvm.vp.icmp.nxv4i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt undef, 
undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %116 = call @llvm.vp.icmp.nxv8i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %117 = call @llvm.vp.icmp.nxv8i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %118 = call @llvm.vp.icmp.nxv8i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %119 = call @llvm.vp.icmp.nxv8i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %120 = call @llvm.vp.icmp.nxv8i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %126 = call @llvm.vp.icmp.nxv16i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %127 = call @llvm.vp.icmp.nxv16i8( undef, undef, metadata !"slt", undef, i32 
undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %128 = call @llvm.vp.icmp.nxv16i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %129 = call @llvm.vp.icmp.nxv16i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %130 = call @llvm.vp.icmp.nxv16i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %136 = call @llvm.vp.icmp.nxv32i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %137 = call @llvm.vp.icmp.nxv32i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %138 = call @llvm.vp.icmp.nxv32i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %139 = call @llvm.vp.icmp.nxv32i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %140 = call @llvm.vp.icmp.nxv32i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt undef, undef +; RV32-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %143 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Invalid cost for instruction: %145 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %146 = call @llvm.vp.icmp.nxv64i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %147 = call @llvm.vp.icmp.nxv64i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %148 = call @llvm.vp.icmp.nxv64i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %149 = call @llvm.vp.icmp.nxv64i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %150 = call @llvm.vp.icmp.nxv64i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Invalid cost for instruction: %155 = icmp slt undef, undef +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %156 = call @llvm.vp.icmp.nxv128i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %157 = call @llvm.vp.icmp.nxv128i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %158 = call @llvm.vp.icmp.nxv128i16( undef, undef, metadata !"slt", undef, i32 undef) 
+; RV32-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %159 = call @llvm.vp.icmp.nxv128i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Invalid cost for instruction: %160 = call @llvm.vp.icmp.nxv128i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV32-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; RV64-LABEL: 'icmp' +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = icmp slt <2 x i1> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = icmp slt <2 x i8> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = icmp slt <2 x i16> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = icmp slt <2 x i32> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = icmp slt <2 x i64> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = icmp slt <4 x i1> undef, undef 
+; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = icmp slt <4 x i8> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = icmp slt <4 x i16> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = icmp slt <4 x i32> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = icmp slt <4 x i64> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = icmp slt <8 x i1> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = icmp slt <8 x i8> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = icmp slt <8 x i16> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %24 = icmp slt <8 x i32> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %25 = icmp slt <8 x i64> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %26 = call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = icmp slt <16 x i1> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = icmp slt <16 x i8> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %33 = icmp slt <16 x i16> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %34 = icmp slt <16 x i32> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %35 = icmp slt <16 x i64> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %38 = call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> 
undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %39 = call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %40 = call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %41 = icmp slt <32 x i1> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %42 = icmp slt <32 x i8> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %43 = icmp slt <32 x i16> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %44 = icmp slt <32 x i32> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %45 = icmp slt <32 x i64> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %46 = call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %47 = call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %48 = call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %49 = call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %50 = call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %51 = icmp slt <64 x i1> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = icmp slt <64 x i8> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %53 = icmp slt <64 x i16> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %54 = icmp slt <64 x i32> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %55 = icmp slt <64 x i64> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %56 = call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %59 = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %60 = call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = icmp slt <128 x i1> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %62 = icmp slt <128 x i8> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %63 = icmp slt <128 x i16> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %64 = icmp slt <128 x i32> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %65 = icmp 
slt <128 x i64> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %66 = call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %67 = call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %68 = call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %69 = call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %70 = call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %71 = icmp slt <256 x i1> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = icmp slt <256 x i8> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %73 = icmp slt <256 x i16> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %74 = icmp slt <256 x i32> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %75 = icmp slt <256 x i64> undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %76 = call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %77 = call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV64-NEXT: 
Cost Model: Found an estimated cost of 32 for instruction: %78 = call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %79 = call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %80 = call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %81 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %82 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %83 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %84 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %85 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %86 = call @llvm.vp.icmp.nxv1i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %87 = call @llvm.vp.icmp.nxv1i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %88 = call @llvm.vp.icmp.nxv1i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %89 = call @llvm.vp.icmp.nxv1i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %90 = call @llvm.vp.icmp.nxv1i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %91 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %92 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %93 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %94 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %95 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %96 = call @llvm.vp.icmp.nxv2i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %97 = call @llvm.vp.icmp.nxv2i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %98 = call @llvm.vp.icmp.nxv2i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %99 = call @llvm.vp.icmp.nxv2i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %100 = call @llvm.vp.icmp.nxv2i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %101 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %102 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %103 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %104 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %105 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %106 = call @llvm.vp.icmp.nxv4i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %107 = call @llvm.vp.icmp.nxv4i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %108 = call @llvm.vp.icmp.nxv4i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %109 = call @llvm.vp.icmp.nxv4i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %110 = call @llvm.vp.icmp.nxv4i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %111 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %112 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %113 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %114 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %115 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %116 = call @llvm.vp.icmp.nxv8i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %117 = call @llvm.vp.icmp.nxv8i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %118 = call @llvm.vp.icmp.nxv8i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %119 = call @llvm.vp.icmp.nxv8i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %120 = call @llvm.vp.icmp.nxv8i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %121 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %122 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %123 = icmp slt 
undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %124 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %125 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %126 = call @llvm.vp.icmp.nxv16i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %127 = call @llvm.vp.icmp.nxv16i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %128 = call @llvm.vp.icmp.nxv16i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %129 = call @llvm.vp.icmp.nxv16i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %130 = call @llvm.vp.icmp.nxv16i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %131 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %132 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %133 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %134 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %135 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %136 = call @llvm.vp.icmp.nxv32i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %137 = call @llvm.vp.icmp.nxv32i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %138 = call @llvm.vp.icmp.nxv32i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost 
Model: Found an estimated cost of 16 for instruction: %139 = call @llvm.vp.icmp.nxv32i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %140 = call @llvm.vp.icmp.nxv32i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %141 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %142 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %143 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %144 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %145 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %146 = call @llvm.vp.icmp.nxv64i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %147 = call @llvm.vp.icmp.nxv64i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %148 = call @llvm.vp.icmp.nxv64i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %149 = call @llvm.vp.icmp.nxv64i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %150 = call @llvm.vp.icmp.nxv64i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %151 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %152 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %153 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %154 = icmp slt undef, undef 
+; RV64-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %155 = icmp slt undef, undef +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %156 = call @llvm.vp.icmp.nxv128i1( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %157 = call @llvm.vp.icmp.nxv128i8( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %158 = call @llvm.vp.icmp.nxv128i16( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %159 = call @llvm.vp.icmp.nxv128i32( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %160 = call @llvm.vp.icmp.nxv128i64( undef, undef, metadata !"slt", undef, i32 undef) +; RV64-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + icmp slt <2 x i1> undef, undef + icmp slt <2 x i8> undef, undef + icmp slt <2 x i16> undef, undef + icmp slt <2 x i32> undef, undef + icmp slt <2 x i64> undef, undef + call <2 x i1> @llvm.vp.icmp.v2i1(<2 x i1> undef, <2 x i1> undef, metadata !"slt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.icmp.v2i8(<2 x i8> undef, <2 x i8> undef, metadata !"slt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.icmp.v2i16(<2 x i16> undef, <2 x i16> undef, metadata !"slt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.icmp.v2i32(<2 x i32> undef, <2 x i32> undef, metadata !"slt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.icmp.v2i64(<2 x i64> undef, <2 x i64> undef, metadata !"slt", <2 x i1> undef, i32 undef) + + icmp slt <4 x i1> undef, undef + icmp slt <4 x i8> undef, undef + icmp slt <4 x i16> undef, undef + icmp slt <4 x i32> undef, undef + icmp slt <4 x i64> undef, undef + call <4 x i1> @llvm.vp.icmp.v4i1(<4 x i1> undef, <4 x i1> undef, metadata !"slt", <4 x i1> undef, i32 undef) + 
call <4 x i1> @llvm.vp.icmp.v4i8(<4 x i8> undef, <4 x i8> undef, metadata !"slt", <4 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.icmp.v4i16(<4 x i16> undef, <4 x i16> undef, metadata !"slt", <4 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.icmp.v4i32(<4 x i32> undef, <4 x i32> undef, metadata !"slt", <4 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.icmp.v4i64(<4 x i64> undef, <4 x i64> undef, metadata !"slt", <4 x i1> undef, i32 undef) + + icmp slt <8 x i1> undef, undef + icmp slt <8 x i8> undef, undef + icmp slt <8 x i16> undef, undef + icmp slt <8 x i32> undef, undef + icmp slt <8 x i64> undef, undef + call <8 x i1> @llvm.vp.icmp.v8i1(<8 x i1> undef, <8 x i1> undef, metadata !"slt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.icmp.v8i8(<8 x i8> undef, <8 x i8> undef, metadata !"slt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.icmp.v8i16(<8 x i16> undef, <8 x i16> undef, metadata !"slt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.icmp.v8i32(<8 x i32> undef, <8 x i32> undef, metadata !"slt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.icmp.v8i64(<8 x i64> undef, <8 x i64> undef, metadata !"slt", <8 x i1> undef, i32 undef) + + icmp slt <16 x i1> undef, undef + icmp slt <16 x i8> undef, undef + icmp slt <16 x i16> undef, undef + icmp slt <16 x i32> undef, undef + icmp slt <16 x i64> undef, undef + call <16 x i1> @llvm.vp.icmp.v16i1(<16 x i1> undef, <16 x i1> undef, metadata !"slt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.icmp.v16i8(<16 x i8> undef, <16 x i8> undef, metadata !"slt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.icmp.v16i16(<16 x i16> undef, <16 x i16> undef, metadata !"slt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.icmp.v16i32(<16 x i32> undef, <16 x i32> undef, metadata !"slt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.icmp.v16i64(<16 x i64> undef, <16 x i64> undef, metadata !"slt", <16 x i1> undef, i32 undef) + + icmp slt <32 x i1> undef, undef + icmp slt <32 
x i8> undef, undef + icmp slt <32 x i16> undef, undef + icmp slt <32 x i32> undef, undef + icmp slt <32 x i64> undef, undef + call <32 x i1> @llvm.vp.icmp.v32i1(<32 x i1> undef, <32 x i1> undef, metadata !"slt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.icmp.v32i8(<32 x i8> undef, <32 x i8> undef, metadata !"slt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.icmp.v32i16(<32 x i16> undef, <32 x i16> undef, metadata !"slt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.icmp.v32i32(<32 x i32> undef, <32 x i32> undef, metadata !"slt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.icmp.v32i64(<32 x i64> undef, <32 x i64> undef, metadata !"slt", <32 x i1> undef, i32 undef) + + icmp slt <64 x i1> undef, undef + icmp slt <64 x i8> undef, undef + icmp slt <64 x i16> undef, undef + icmp slt <64 x i32> undef, undef + icmp slt <64 x i64> undef, undef + call <64 x i1> @llvm.vp.icmp.v64i1(<64 x i1> undef, <64 x i1> undef, metadata !"slt", <64 x i1> undef, i32 undef) + call <64 x i1> @llvm.vp.icmp.v64i8(<64 x i8> undef, <64 x i8> undef, metadata !"slt", <64 x i1> undef, i32 undef) + call <64 x i1> @llvm.vp.icmp.v64i16(<64 x i16> undef, <64 x i16> undef, metadata !"slt", <64 x i1> undef, i32 undef) + call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> undef, <64 x i32> undef, metadata !"slt", <64 x i1> undef, i32 undef) + call <64 x i1> @llvm.vp.icmp.v64i64(<64 x i64> undef, <64 x i64> undef, metadata !"slt", <64 x i1> undef, i32 undef) + + icmp slt <128 x i1> undef, undef + icmp slt <128 x i8> undef, undef + icmp slt <128 x i16> undef, undef + icmp slt <128 x i32> undef, undef + icmp slt <128 x i64> undef, undef + call <128 x i1> @llvm.vp.icmp.v128i1(<128 x i1> undef, <128 x i1> undef, metadata !"slt", <128 x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.icmp.v128i8(<128 x i8> undef, <128 x i8> undef, metadata !"slt", <128 x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.icmp.v128i16(<128 x i16> undef, <128 x i16> undef, metadata !"slt", <128 
x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.icmp.v128i32(<128 x i32> undef, <128 x i32> undef, metadata !"slt", <128 x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.icmp.v128i64(<128 x i64> undef, <128 x i64> undef, metadata !"slt", <128 x i1> undef, i32 undef) + + icmp slt <256 x i1> undef, undef + icmp slt <256 x i8> undef, undef + icmp slt <256 x i16> undef, undef + icmp slt <256 x i32> undef, undef + icmp slt <256 x i64> undef, undef + call <256 x i1> @llvm.vp.icmp.v256i1(<256 x i1> undef, <256 x i1> undef, metadata !"slt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.icmp.v256i8(<256 x i8> undef, <256 x i8> undef, metadata !"slt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.icmp.v256i16(<256 x i16> undef, <256 x i16> undef, metadata !"slt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.icmp.v256i32(<256 x i32> undef, <256 x i32> undef, metadata !"slt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.icmp.v256i64(<256 x i64> undef, <256 x i64> undef, metadata !"slt", <256 x i1> undef, i32 undef) + + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + call @llvm.vp.icmp.nxv1i1( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv1i8( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv1i16( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv1i32( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv1i64( undef, undef, metadata !"slt", undef, i32 undef) + + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + call @llvm.vp.icmp.nxv2i1( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv2i8( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv2i16( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv2i32( undef, undef, metadata !"slt", 
undef, i32 undef) + call @llvm.vp.icmp.nxv2i64( undef, undef, metadata !"slt", undef, i32 undef) + + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + call @llvm.vp.icmp.nxv4i1( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv4i8( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv4i16( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv4i32( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv4i64( undef, undef, metadata !"slt", undef, i32 undef) + + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + call @llvm.vp.icmp.nxv8i1( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv8i8( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv8i16( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv8i32( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv8i64( undef, undef, metadata !"slt", undef, i32 undef) + + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + call @llvm.vp.icmp.nxv16i1( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv16i8( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv16i16( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv16i32( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv16i64( undef, undef, metadata !"slt", undef, i32 undef) + + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + call @llvm.vp.icmp.nxv32i1( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv32i8( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv32i16( undef, undef, metadata !"slt", undef, i32 
undef) + call @llvm.vp.icmp.nxv32i32( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv32i64( undef, undef, metadata !"slt", undef, i32 undef) + + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + call @llvm.vp.icmp.nxv64i1( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv64i8( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv64i16( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv64i32( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv64i64( undef, undef, metadata !"slt", undef, i32 undef) + + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + icmp slt undef, undef + call @llvm.vp.icmp.nxv128i1( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv128i8( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv128i16( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv128i32( undef, undef, metadata !"slt", undef, i32 undef) + call @llvm.vp.icmp.nxv128i64( undef, undef, metadata !"slt", undef, i32 undef) + + ret void +} + +define void @fcmp() { +; CHECK-LABEL: 'fcmp' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = fcmp olt <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = fcmp olt <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <2 x i1> @llvm.vp.fcmp.v2f32(<2 x float> undef, <2 x float> undef, metadata !"olt", <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <2 x i1> @llvm.vp.fcmp.v2f64(<2 x double> undef, <2 x double> undef, metadata !"olt", <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = fcmp olt <4 x float> undef, 
undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %6 = fcmp olt <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float> undef, <4 x float> undef, metadata !"olt", <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <4 x i1> @llvm.vp.fcmp.v4f64(<4 x double> undef, <4 x double> undef, metadata !"olt", <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = fcmp olt <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %10 = fcmp olt <8 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> undef, <8 x float> undef, metadata !"olt", <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> undef, <8 x double> undef, metadata !"olt", <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = fcmp olt <16 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = fcmp olt <16 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %15 = call <16 x i1> @llvm.vp.fcmp.v16f32(<16 x float> undef, <16 x float> undef, metadata !"olt", <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %16 = call <16 x i1> @llvm.vp.fcmp.v16f64(<16 x double> undef, <16 x double> undef, metadata !"olt", <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %17 = fcmp olt <32 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %18 = fcmp olt <32 x double> undef, undef +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %19 = call <32 x i1> @llvm.vp.fcmp.v32f32(<32 x float> undef, <32 x float> undef, metadata !"olt", <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %20 = call <32 x i1> @llvm.vp.fcmp.v32f64(<32 x double> undef, <32 x double> undef, metadata !"olt", <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %21 = fcmp olt <64 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %22 = fcmp olt <64 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %23 = call <64 x i1> @llvm.vp.fcmp.v64f32(<64 x float> undef, <64 x float> undef, metadata !"olt", <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %24 = call <64 x i1> @llvm.vp.fcmp.v64f64(<64 x double> undef, <64 x double> undef, metadata !"olt", <64 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %25 = fcmp olt <128 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %26 = fcmp olt <128 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %27 = call <128 x i1> @llvm.vp.fcmp.v128f32(<128 x float> undef, <128 x float> undef, metadata !"olt", <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %28 = call <128 x i1> @llvm.vp.fcmp.v128f64(<128 x double> undef, <128 x double> undef, metadata !"olt", <128 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %29 = fcmp olt <256 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %30 = fcmp olt <256 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %31 = 
call <256 x i1> @llvm.vp.fcmp.v256f32(<256 x float> undef, <256 x float> undef, metadata !"olt", <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %32 = call <256 x i1> @llvm.vp.fcmp.v256f64(<256 x double> undef, <256 x double> undef, metadata !"olt", <256 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %34 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %35 = call @llvm.vp.fcmp.nxv1f32( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %36 = call @llvm.vp.fcmp.nxv1f64( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %37 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %38 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %39 = call @llvm.vp.fcmp.nxv2f32( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %40 = call @llvm.vp.fcmp.nxv2f64( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %41 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %42 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %43 = call @llvm.vp.fcmp.nxv4f32( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %44 = call @llvm.vp.fcmp.nxv4f64( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %45 = fcmp olt undef, undef +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %46 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %47 = call @llvm.vp.fcmp.nxv8f32( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %48 = call @llvm.vp.fcmp.nxv8f64( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %49 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %50 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %51 = call @llvm.vp.fcmp.nxv16f32( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %52 = call @llvm.vp.fcmp.nxv16f64( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %53 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %54 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %55 = call @llvm.vp.fcmp.nxv32f32( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %56 = call @llvm.vp.fcmp.nxv32f64( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %57 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %58 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %59 = call @llvm.vp.fcmp.nxv64f32( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %60 = call @llvm.vp.fcmp.nxv64f64( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 64 for instruction: %61 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %62 = fcmp olt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %63 = call @llvm.vp.fcmp.nxv128f32( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %64 = call @llvm.vp.fcmp.nxv128f64( undef, undef, metadata !"olt", undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + fcmp olt <2 x float> undef, undef + fcmp olt <2 x double> undef, undef + call <2 x i1> @llvm.vp.fcmp.v2float(<2 x float> undef, <2 x float> undef, metadata !"olt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.fcmp.v2double(<2 x double> undef, <2 x double> undef, metadata !"olt", <2 x i1> undef, i32 undef) + + fcmp olt <4 x float> undef, undef + fcmp olt <4 x double> undef, undef + call <4 x i1> @llvm.vp.fcmp.v4float(<4 x float> undef, <4 x float> undef, metadata !"olt", <4 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.fcmp.v4double(<4 x double> undef, <4 x double> undef, metadata !"olt", <4 x i1> undef, i32 undef) + + fcmp olt <8 x float> undef, undef + fcmp olt <8 x double> undef, undef + call <8 x i1> @llvm.vp.fcmp.v8float(<8 x float> undef, <8 x float> undef, metadata !"olt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.fcmp.v8double(<8 x double> undef, <8 x double> undef, metadata !"olt", <8 x i1> undef, i32 undef) + + fcmp olt <16 x float> undef, undef + fcmp olt <16 x double> undef, undef + call <16 x i1> @llvm.vp.fcmp.v16float(<16 x float> undef, <16 x float> undef, metadata !"olt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.fcmp.v16double(<16 x double> undef, <16 x double> undef, metadata !"olt", <16 x i1> undef, i32 undef) + + fcmp olt <32 x float> undef, undef + fcmp olt <32 x double> undef, undef + call <32 x i1> @llvm.vp.fcmp.v32float(<32 x float> undef, <32 x float> 
undef, metadata !"olt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.fcmp.v32double(<32 x double> undef, <32 x double> undef, metadata !"olt", <32 x i1> undef, i32 undef) + + fcmp olt <64 x float> undef, undef + fcmp olt <64 x double> undef, undef + call <64 x i1> @llvm.vp.fcmp.v64float(<64 x float> undef, <64 x float> undef, metadata !"olt", <64 x i1> undef, i32 undef) + call <64 x i1> @llvm.vp.fcmp.v64double(<64 x double> undef, <64 x double> undef, metadata !"olt", <64 x i1> undef, i32 undef) + + fcmp olt <128 x float> undef, undef + fcmp olt <128 x double> undef, undef + call <128 x i1> @llvm.vp.fcmp.v128float(<128 x float> undef, <128 x float> undef, metadata !"olt", <128 x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.fcmp.v128double(<128 x double> undef, <128 x double> undef, metadata !"olt", <128 x i1> undef, i32 undef) + + fcmp olt <256 x float> undef, undef + fcmp olt <256 x double> undef, undef + call <256 x i1> @llvm.vp.fcmp.v256float(<256 x float> undef, <256 x float> undef, metadata !"olt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.fcmp.v256double(<256 x double> undef, <256 x double> undef, metadata !"olt", <256 x i1> undef, i32 undef) + + fcmp olt undef, undef + fcmp olt undef, undef + call @llvm.vp.fcmp.nxv1float( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv1double( undef, undef, metadata !"olt", undef, i32 undef) + + fcmp olt undef, undef + fcmp olt undef, undef + call @llvm.vp.fcmp.nxv2float( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv2double( undef, undef, metadata !"olt", undef, i32 undef) + + fcmp olt undef, undef + fcmp olt undef, undef + call @llvm.vp.fcmp.nxv4float( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv4double( undef, undef, metadata !"olt", undef, i32 undef) + + fcmp olt undef, undef + fcmp olt undef, undef + call @llvm.vp.fcmp.nxv8float( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv8double( 
undef, undef, metadata !"olt", undef, i32 undef) + + fcmp olt undef, undef + fcmp olt undef, undef + call @llvm.vp.fcmp.nxv16float( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv16double( undef, undef, metadata !"olt", undef, i32 undef) + + fcmp olt undef, undef + fcmp olt undef, undef + call @llvm.vp.fcmp.nxv32float( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv32double( undef, undef, metadata !"olt", undef, i32 undef) + + fcmp olt undef, undef + fcmp olt undef, undef + call @llvm.vp.fcmp.nxv64float( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv64double( undef, undef, metadata !"olt", undef, i32 undef) + + fcmp olt undef, undef + fcmp olt undef, undef + call @llvm.vp.fcmp.nxv128float( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv128double( undef, undef, metadata !"olt", undef, i32 undef) + + ret void +} From b690cae01af03237f6b5304e00d529227137b53d Mon Sep 17 00:00:00 2001 From: Joshua Batista Date: Wed, 11 Sep 2024 16:08:01 -0700 Subject: [PATCH 68/94] Update StructuredBuffer-AST test after removal of HLSLResourceClassAttr (#108292) In a previous PR, the `HLSLResourceClassAttr` attribute was removed from the AST, in favor of using an attributed type that stores the same information instead. This PR fixes test failures that assumed that `HLSLResourceClassAttr` was still in the AST, and adjusts for the new AST representation. 
--- clang/test/AST/HLSL/StructuredBuffer-AST.hlsl | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl index 42991d8dc9c2e3d..11d84ac7b85db2a 100644 --- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -30,8 +30,7 @@ StructuredBuffer Buffer; // CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition // CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'element_type *' -// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <> Implicit UAV +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' @@ -39,7 +38,7 @@ StructuredBuffer Buffer; // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer' lvalue implicit this // CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline @@ -49,7 +48,7 @@ StructuredBuffer Buffer; // CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue -// CHECK-NEXT: 
MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type * {{\[\[}}hlsl::resource_class(UAV)]]':'element_type *' lvalue .h 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer' lvalue implicit this // CHECK-NEXT: DeclRefExpr 0x{{[0-9A-Fa-f]+}} <> 'unsigned int' ParmVar 0x{{[0-9A-Fa-f]+}} 'Idx' 'unsigned int' // CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline @@ -59,6 +58,5 @@ StructuredBuffer Buffer; // CHECK: TemplateArgument type 'float' // CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' // CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float *' -// CHECK-NEXT: HLSLResourceClassAttr 0x{{[0-9A-Fa-f]+}} <> Implicit UAV +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit referenced h 'float * {{\[\[}}hlsl::resource_class(UAV)]]':'float *' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer From 93e45a69dde16e6a3ac0ddbcc596ac3843d59c43 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Wed, 11 Sep 2024 16:09:48 -0700 Subject: [PATCH 69/94] [Dexter] Adapt to upcoming lldb stepping behavior (#108127) lldb will change how it reports stop reasons around breakpoints in the near future. I landed an earlier version of this change and noticed debuginfo test failures on the CI bots due to the changes. I'm addressing the issues found by CI at https://github.com/llvm/llvm-project/pull/105594 and will re-land once I've done all of them. Currently, when lldb stops at a breakpoint instruction -- but has not yet executed the instruction -- it will overwrite the thread's Stop Reason with "breakpoint-hit". This caused bugs when the original stop reason was important to the user - for instance, a watchpoint on an AArch64 system where we have to instruction-step past the watchpoint to find the new value. 
Normally we would instruction step, fetch the new value, then report the user that a watchpoint has been hit with the old and new values. But if the instruction after this access is a breakpoint site, we overwrite the "watchpoint hit" stop reason (and related actions) with "breakpoint hit". dexter sets breakpoints on all source lines, then steps line-to-line, hitting the breakpoints. But with this new behavior, we see two steps per source line: The first step gets us to the start of the next line, with a "step completed" stop reason. Then we step again and we execute the breakpoint instruction, stop with the pc the same, and report "breakpoint hit". Now we can step a second time and move past the breakpoint. I've changed the `step` method in LLDB.py to check if we step to a breakpoint site but have a "step completed" stop reason -- in which case we have this new breakpoint behavior, and we need to step a second time to actually hit the breakpoint like the debuginfo tests expect. --- .../dexter/dex/debugger/lldb/LLDB.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py index 3944c1c4b009dbe..2307550aca047b1 100644 --- a/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py +++ b/cross-project-tests/debuginfo-tests/dexter/dex/debugger/lldb/LLDB.py @@ -206,6 +206,33 @@ def launch(self, cmdline): def step(self): self._thread.StepInto() + stop_reason = self._thread.GetStopReason() + # If we (1) completed a step and (2) are sitting at a breakpoint, + # but (3) the breakpoint is not reported as the stop reason, then + # we'll need to step once more to hit the breakpoint. + # + # dexter sets breakpoints on every source line, then steps + # each source line. 
Older lldb's would overwrite the stop + # reason with "breakpoint hit" when we stopped at a breakpoint, + # even if the breakpoint hadn't been executed yet. One + # step per source line, hitting a breakpoint each time. + # + # But a more accurate behavior is that the step completes + # with step-completed stop reason, then when we step again, + # we execute the breakpoint and stop (with the pc the same) and + # a breakpoint-hit stop reason. So we need to step twice per line. + if stop_reason == self._interface.eStopReasonPlanComplete: + stepped_to_breakpoint = False + pc = self._thread.GetFrameAtIndex(0).GetPC() + for bp in self._target.breakpoints: + for bploc in bp.locations: + if ( + bploc.IsEnabled() + and bploc.GetAddress().GetLoadAddress(self._target) == pc + ): + stepped_to_breakpoint = True + if stepped_to_breakpoint: + self._thread.StepInto() def go(self) -> ReturnCode: self._process.Continue() From 34e20f18f0a77cece57e13e179dcf5b58992a705 Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 11 Sep 2024 16:24:38 -0700 Subject: [PATCH 70/94] [DirectX] Implement typedBufferLoad_checkbit (#108087) This represents a typedBufferLoad that's followed by "CheckAccessFullyMapped". It returns an extra `i1` representing that value. Fixes #108085 --- llvm/docs/DirectX/DXILResources.rst | 6 ++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 3 + llvm/lib/Target/DirectX/DXIL.td | 9 +++ llvm/lib/Target/DirectX/DXILOpLowering.cpp | 67 ++++++++++++++++++---- llvm/test/CodeGen/DirectX/BufferLoad.ll | 22 +++++++ 5 files changed, 96 insertions(+), 11 deletions(-) diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst index a982c3a29fcc3b1..ad8ede9c59fbfa4 100644 --- a/llvm/docs/DirectX/DXILResources.rst +++ b/llvm/docs/DirectX/DXILResources.rst @@ -361,6 +361,12 @@ Examples: - ``i32`` - Index into the buffer +.. 
code-block:: llvm + + %ret = call {<4 x float>, i1} + @llvm.dx.typedBufferLoad.checkbit.v4f32.tdx.TypedBuffer_v4f32_0_0_0t( + target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 %index) + Texture and Typed Buffer Stores ------------------------------- diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index c36e98f040ab816..f1017bdd5124969 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -32,6 +32,9 @@ def int_dx_handle_fromBinding def int_dx_typedBufferLoad : DefaultAttrsIntrinsic<[llvm_any_ty], [llvm_any_ty, llvm_i32_ty]>; +def int_dx_typedBufferLoad_checkbit + : DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty], + [llvm_any_ty, llvm_i32_ty]>; def int_dx_typedBufferStore : DefaultAttrsIntrinsic<[], [llvm_any_ty, llvm_i32_ty, llvm_anyvector_ty]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 759a58ed3930e38..902ab37bf741ed6 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -719,6 +719,15 @@ def BufferStore : DXILOp<69, bufferStore> { let stages = [Stages]; } +def CheckAccessFullyMapped : DXILOp<71, checkAccessFullyMapped> { + let Doc = "checks whether a Sample, Gather, or Load operation " + "accessed mapped tiles in a tiled resource"; + let arguments = [OverloadTy]; + let result = Int1Ty; + let overloads = [Overloads]; + let stages = [Stages]; +} + def ThreadId : DXILOp<93, threadId> { let Doc = "Reads the thread ID"; let LLVMIntrinsic = int_dx_thread_id; diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index f968cab1dccf1ed..d98d0bfde04fc67 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -265,16 +265,50 @@ class OpLowerer { /// Replace uses of \c Intrin with the values in the `dx.ResRet` of \c Op. 
/// Since we expect to be post-scalarization, make an effort to avoid vectors. - Error replaceResRetUses(CallInst *Intrin, CallInst *Op) { + Error replaceResRetUses(CallInst *Intrin, CallInst *Op, bool HasCheckBit) { IRBuilder<> &IRB = OpBuilder.getIRB(); + Instruction *OldResult = Intrin; Type *OldTy = Intrin->getType(); + if (HasCheckBit) { + auto *ST = cast(OldTy); + + Value *CheckOp = nullptr; + Type *Int32Ty = IRB.getInt32Ty(); + for (Use &U : make_early_inc_range(OldResult->uses())) { + if (auto *EVI = dyn_cast(U.getUser())) { + ArrayRef Indices = EVI->getIndices(); + assert(Indices.size() == 1); + // We're only interested in uses of the check bit for now. + if (Indices[0] != 1) + continue; + if (!CheckOp) { + Value *NewEVI = IRB.CreateExtractValue(Op, 4); + Expected OpCall = OpBuilder.tryCreateOp( + OpCode::CheckAccessFullyMapped, {NewEVI}, Int32Ty); + if (Error E = OpCall.takeError()) + return E; + CheckOp = *OpCall; + } + EVI->replaceAllUsesWith(CheckOp); + EVI->eraseFromParent(); + } + } + + OldResult = cast(IRB.CreateExtractValue(Op, 0)); + OldTy = ST->getElementType(0); + } + // For scalars, we just extract the first element. if (!isa(OldTy)) { Value *EVI = IRB.CreateExtractValue(Op, 0); - Intrin->replaceAllUsesWith(EVI); - Intrin->eraseFromParent(); + OldResult->replaceAllUsesWith(EVI); + OldResult->eraseFromParent(); + if (OldResult != Intrin) { + assert(Intrin->use_empty() && "Intrinsic still has uses?"); + Intrin->eraseFromParent(); + } return Error::success(); } @@ -283,7 +317,7 @@ class OpLowerer { // The users of the operation should all be scalarized, so we attempt to // replace the extractelements with extractvalues directly. 
- for (Use &U : make_early_inc_range(Intrin->uses())) { + for (Use &U : make_early_inc_range(OldResult->uses())) { if (auto *EEI = dyn_cast(U.getUser())) { if (auto *IndexOp = dyn_cast(EEI->getIndexOperand())) { size_t IndexVal = IndexOp->getZExtValue(); @@ -331,7 +365,7 @@ class OpLowerer { // If we still have uses, then we're not fully scalarized and need to // recreate the vector. This should only happen for things like exported // functions from libraries. - if (!Intrin->use_empty()) { + if (!OldResult->use_empty()) { for (int I = 0, E = N; I != E; ++I) if (!Extracts[I]) Extracts[I] = IRB.CreateExtractValue(Op, I); @@ -339,14 +373,19 @@ class OpLowerer { Value *Vec = UndefValue::get(OldTy); for (int I = 0, E = N; I != E; ++I) Vec = IRB.CreateInsertElement(Vec, Extracts[I], I); - Intrin->replaceAllUsesWith(Vec); + OldResult->replaceAllUsesWith(Vec); + } + + OldResult->eraseFromParent(); + if (OldResult != Intrin) { + assert(Intrin->use_empty() && "Intrinsic still has uses?"); + Intrin->eraseFromParent(); } - Intrin->eraseFromParent(); return Error::success(); } - [[nodiscard]] bool lowerTypedBufferLoad(Function &F) { + [[nodiscard]] bool lowerTypedBufferLoad(Function &F, bool HasCheckBit) { IRBuilder<> &IRB = OpBuilder.getIRB(); Type *Int32Ty = IRB.getInt32Ty(); @@ -358,14 +397,17 @@ class OpLowerer { Value *Index0 = CI->getArgOperand(1); Value *Index1 = UndefValue::get(Int32Ty); - Type *NewRetTy = OpBuilder.getResRetType(CI->getType()->getScalarType()); + Type *OldTy = CI->getType(); + if (HasCheckBit) + OldTy = cast(OldTy)->getElementType(0); + Type *NewRetTy = OpBuilder.getResRetType(OldTy->getScalarType()); std::array Args{Handle, Index0, Index1}; Expected OpCall = OpBuilder.tryCreateOp(OpCode::BufferLoad, Args, NewRetTy); if (Error E = OpCall.takeError()) return E; - if (Error E = replaceResRetUses(CI, *OpCall)) + if (Error E = replaceResRetUses(CI, *OpCall, HasCheckBit)) return E; return Error::success(); @@ -434,7 +476,10 @@ class OpLowerer { HasErrors |= 
lowerHandleFromBinding(F); break; case Intrinsic::dx_typedBufferLoad: - HasErrors |= lowerTypedBufferLoad(F); + HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/false); + break; + case Intrinsic::dx_typedBufferLoad_checkbit: + HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true); break; case Intrinsic::dx_typedBufferStore: HasErrors |= lowerTypedBufferStore(F); diff --git a/llvm/test/CodeGen/DirectX/BufferLoad.ll b/llvm/test/CodeGen/DirectX/BufferLoad.ll index 4b9fb52f0b52991..e3a4441ad6e8337 100644 --- a/llvm/test/CodeGen/DirectX/BufferLoad.ll +++ b/llvm/test/CodeGen/DirectX/BufferLoad.ll @@ -4,6 +4,7 @@ target triple = "dxil-pc-shadermodel6.6-compute" declare void @scalar_user(float) declare void @vector_user(<4 x float>) +declare void @check_user(i1) define void @loadv4f32() { ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding @@ -128,6 +129,27 @@ define void @loadv2f32() { ret void } +define void @loadv4f32_checkbit() { + ; CHECK: [[BIND:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding + ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]] + %buffer = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0) + @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_0_0_0( + i32 0, i32 0, i32 1, i32 0, i1 false) + + ; CHECK: [[DATA0:%.*]] = call %dx.types.ResRet.f32 @dx.op.bufferLoad.f32(i32 68, %dx.types.Handle [[HANDLE]], i32 0, i32 undef) + %data0 = call {<4 x float>, i1} @llvm.dx.typedBufferLoad.checkbit.f32( + target("dx.TypedBuffer", <4 x float>, 0, 0, 0) %buffer, i32 0) + + ; CHECK: [[STATUS:%.*]] = extractvalue %dx.types.ResRet.f32 [[DATA0]], 4 + ; CHECK: [[MAPPED:%.*]] = call i1 @dx.op.checkAccessFullyMapped.i32(i32 71, i32 [[STATUS]] + %check = extractvalue {<4 x float>, i1} %data0, 1 + + ; CHECK: call void @check_user(i1 [[MAPPED]]) + call void @check_user(i1 %check) + + ret void +} + define void @loadv4i32() { ; CHECK: [[BIND:%.*]] = call %dx.types.Handle 
@dx.op.createHandleFromBinding ; CHECK: [[HANDLE:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BIND]] From c820bd3e33caf8fb8a2ec984c584d54108430b65 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 11 Sep 2024 16:27:33 -0700 Subject: [PATCH 71/94] [BOLT][NFC] Rename profile-use-pseudo-probes The flag currently controls writing of probe information in YAML profile. #99891 adds a separate flag to use probe information for stale profile matching. Thus `profile-use-pseudo-probes` becomes a misnomer and `profile-write-pseudo-probes` better captures the intent. Reviewers: maksfb, WenleiHe, ayermolo, rafaelauler, dcci Reviewed By: rafaelauler Pull Request: https://github.com/llvm/llvm-project/pull/106364 --- bolt/lib/Profile/DataAggregator.cpp | 4 ++-- bolt/lib/Profile/YAMLProfileReader.cpp | 5 ----- bolt/lib/Profile/YAMLProfileWriter.cpp | 11 ++++++++--- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 6 +++--- bolt/test/X86/pseudoprobe-decoding-inline.test | 6 +++--- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 813d825f8b570c3..10d745cc69824b6 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -88,7 +88,7 @@ MaxSamples("max-samples", cl::cat(AggregatorCategory)); extern cl::opt ProfileFormat; -extern cl::opt ProfileUsePseudoProbes; +extern cl::opt ProfileWritePseudoProbes; extern cl::opt SaveProfile; cl::opt ReadPreAggregated( @@ -2300,7 +2300,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, yaml::bolt::BinaryProfile BP; const MCPseudoProbeDecoder *PseudoProbeDecoder = - opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr; + opts::ProfileWritePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr; // Fill out the header info. 
BP.Header.Version = 1; diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 2ab4bf9a4b3e13e..67ed32017667d64 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -49,11 +49,6 @@ llvm::cl::opt llvm::cl::opt ProfileUseDFS("profile-use-dfs", cl::desc("use DFS order for YAML profile"), cl::Hidden, cl::cat(BoltOptCategory)); - -llvm::cl::opt ProfileUsePseudoProbes( - "profile-use-pseudo-probes", - cl::desc("Use pseudo probes for profile generation and matching"), - cl::Hidden, cl::cat(BoltOptCategory)); } // namespace opts namespace llvm { diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index f74cf60e076d0a0..ffbf2388e912fba 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -13,6 +13,7 @@ #include "bolt/Profile/DataAggregator.h" #include "bolt/Profile/ProfileReaderBase.h" #include "bolt/Rewrite/RewriteInstance.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/raw_ostream.h" @@ -21,8 +22,12 @@ #define DEBUG_TYPE "bolt-prof" namespace opts { -extern llvm::cl::opt ProfileUseDFS; -extern llvm::cl::opt ProfileUsePseudoProbes; +using namespace llvm; +extern cl::opt ProfileUseDFS; +cl::opt ProfileWritePseudoProbes( + "profile-write-pseudo-probes", + cl::desc("Use pseudo probes in profile generation"), cl::Hidden, + cl::cat(BoltOptCategory)); } // namespace opts namespace llvm { @@ -59,7 +64,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, yaml::bolt::BinaryFunctionProfile YamlBF; const BinaryContext &BC = BF.getBinaryContext(); const MCPseudoProbeDecoder *PseudoProbeDecoder = - opts::ProfileUsePseudoProbes ? BC.getPseudoProbeDecoder() : nullptr; + opts::ProfileWritePseudoProbes ? 
BC.getPseudoProbeDecoder() : nullptr; const uint16_t LBRProfile = BF.getProfileFlags() & BinaryFunction::PF_LBR; diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index e97d522844fc022..45bd15c2304956f 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -50,7 +50,7 @@ static cl::opt PrintPseudoProbes( clEnumValN(PPP_All, "all", "enable all debugging printout")), cl::Hidden, cl::cat(BoltCategory)); -extern cl::opt ProfileUsePseudoProbes; +extern cl::opt ProfileWritePseudoProbes; } // namespace opts namespace { @@ -91,14 +91,14 @@ class PseudoProbeRewriter final : public MetadataRewriter { }; Error PseudoProbeRewriter::preCFGInitializer() { - if (opts::ProfileUsePseudoProbes) + if (opts::ProfileWritePseudoProbes) parsePseudoProbe(); return Error::success(); } Error PseudoProbeRewriter::postEmitFinalizer() { - if (!opts::ProfileUsePseudoProbes) + if (!opts::ProfileWritePseudoProbes) parsePseudoProbe(); updatePseudoProbes(); diff --git a/bolt/test/X86/pseudoprobe-decoding-inline.test b/bolt/test/X86/pseudoprobe-decoding-inline.test index b361551e5711eaa..1fdd00c7ef6c4b9 100644 --- a/bolt/test/X86/pseudoprobe-decoding-inline.test +++ b/bolt/test/X86/pseudoprobe-decoding-inline.test @@ -6,11 +6,11 @@ # PREAGG: B X:0 #main# 1 0 ## Check pseudo-probes in regular YAML profile (non-BOLTed binary) # RUN: link_fdata %s %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin %t.preagg PREAGG -# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata --profile-use-pseudo-probes +# RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata --profile-write-pseudo-probes # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-YAML ## Check pseudo-probes in BAT YAML profile (BOLTed binary) # RUN: link_fdata 
%s %t.bolt %t.preagg2 PREAGG -# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2 --profile-use-pseudo-probes +# RUN: perf2bolt %t.bolt -p %t.preagg2 --pa -w %t.yaml2 -o %t.fdata2 --profile-write-pseudo-probes # RUN: FileCheck --input-file %t.yaml2 %s --check-prefix CHECK-YAML # CHECK-YAML: name: bar # CHECK-YAML: - bid: 0 @@ -30,7 +30,7 @@ # CHECK-YAML: guid: 0xDB956436E78DD5FA # CHECK-YAML: pseudo_probe_desc_hash: 0x10000FFFFFFFF # -## Check that without --profile-use-pseudo-probes option, no pseudo probes are +## Check that without --profile-write-pseudo-probes option, no pseudo probes are ## generated # RUN: perf2bolt %S/../../../llvm/test/tools/llvm-profgen/Inputs/inline-cs-pseudoprobe.perfbin -p %t.preagg --pa -w %t.yaml -o %t.fdata # RUN: FileCheck --input-file %t.yaml %s --check-prefix CHECK-NO-OPT From 86ec59e2f789ae6469ff434f3b6455f09af5919c Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 11 Sep 2024 16:33:34 -0700 Subject: [PATCH 72/94] [BOLT] Only parse probes for profiled functions in profile-write-pseudo-probes mode (#106365) Implement selective probe parsing for profiled functions only when emitting probe information to YAML profile as suggested in https://github.com/llvm/llvm-project/pull/102904#pullrequestreview-2248714190 For a large binary, this reduces probe parsing - processing time from 10.5925s to 5.6295s, - peak RSS from 10.54 to 7.98 GiB. 
--- bolt/lib/Rewrite/PseudoProbeRewriter.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 45bd15c2304956f..8647df4b0edf82a 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -72,7 +72,8 @@ class PseudoProbeRewriter final : public MetadataRewriter { /// Parse .pseudo_probe_desc section and .pseudo_probe section /// Setup Pseudo probe decoder - void parsePseudoProbe(); + /// If \p ProfiledOnly is set, only parse records for functions with profile. + void parsePseudoProbe(bool ProfiledOnly = false); /// PseudoProbe decoder std::shared_ptr ProbeDecoderPtr; @@ -92,7 +93,7 @@ class PseudoProbeRewriter final : public MetadataRewriter { Error PseudoProbeRewriter::preCFGInitializer() { if (opts::ProfileWritePseudoProbes) - parsePseudoProbe(); + parsePseudoProbe(true); return Error::success(); } @@ -105,7 +106,7 @@ Error PseudoProbeRewriter::postEmitFinalizer() { return Error::success(); } -void PseudoProbeRewriter::parsePseudoProbe() { +void PseudoProbeRewriter::parsePseudoProbe(bool ProfiledOnly) { MCPseudoProbeDecoder &ProbeDecoder(*ProbeDecoderPtr); PseudoProbeDescSection = BC.getUniqueSectionByName(".pseudo_probe_desc"); PseudoProbeSection = BC.getUniqueSectionByName(".pseudo_probe"); @@ -137,6 +138,7 @@ void PseudoProbeRewriter::parsePseudoProbe() { SmallVector Suffixes( {".destroy", ".resume", ".llvm.", ".cold", ".warm"}); for (const BinaryFunction *F : BC.getAllBinaryFunctions()) { + bool HasProfile = F->hasProfileAvailable(); for (const MCSymbol *Sym : F->getSymbols()) { StringRef SymName = Sym->getName(); for (auto Name : {std::optional(NameResolver::restore(SymName)), @@ -146,6 +148,8 @@ void PseudoProbeRewriter::parsePseudoProbe() { SymName = *Name; uint64_t GUID = Function::getGUID(SymName); FuncStartAddrs[GUID] = F->getAddress(); + if (ProfiledOnly && HasProfile) + GuidFilter.insert(GUID); 
} } } From ccc7a072db05592cc42c0caac835b22f9a01a89f Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 11 Sep 2024 16:36:47 -0700 Subject: [PATCH 73/94] [BOLT] Drop blocks without profile in BAT YAML (#107970) Align BAT YAML (DataAggregator) to YAMLProfileWriter which drops blocks without profile: https://github.com/llvm/llvm-project/blob/61372fc5db9b14fd612be8a58a76edd7f0ee38aa/bolt/lib/Profile/YAMLProfileWriter.cpp#L162-L176 Test Plan: NFCI --- bolt/lib/Profile/DataAggregator.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 10d745cc69824b6..4aeeb1daab1b947 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -2427,11 +2427,15 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, } } } - // Drop blocks without a hash, won't be useful for stale matching. - llvm::erase_if(YamlBF.Blocks, - [](const yaml::bolt::BinaryBasicBlockProfile &YamlBB) { - return YamlBB.Hash == (yaml::Hex64)0; - }); + // Skip printing if there's no profile data + llvm::erase_if( + YamlBF.Blocks, [](const yaml::bolt::BinaryBasicBlockProfile &YamlBB) { + auto HasCount = [](const auto &SI) { return SI.Count; }; + bool HasAnyCount = YamlBB.ExecCount || + llvm::any_of(YamlBB.Successors, HasCount) || + llvm::any_of(YamlBB.CallSites, HasCount); + return !HasAnyCount; + }); BP.Functions.emplace_back(YamlBF); } } From 828783177f71d95522763fba92ef1e42dc6101c7 Mon Sep 17 00:00:00 2001 From: "Henrik G. Olsson" Date: Wed, 11 Sep 2024 16:39:28 -0700 Subject: [PATCH 74/94] =?UTF-8?q?Reland=20"[llvm-lit]=20Process=20ANSI=20c?= =?UTF-8?q?olor=20codes=20in=20test=20output=20when=20forma=E2=80=A6=20(#1?= =?UTF-8?q?08107)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …tting" (#108104)" This recommits 0f56ba13bff7ab72bfafcf7c5cf9e5b8bd16d895 (reverted by 6007ad79afeffb1288781b4a7241290386293aff). 
In the original patch llvm/utils/lit/tests/escape-color.py failed on Windows because it diffed llvm-lit output with a file containing '\n' newlines rather than '\r\n'. This issue is avoided by calling 'diff --strip-trailing-cr'. Original description below: Test output that carried color across newlines previously resulted in the formatting around the output also being colored. Detect the current ANSI color and reset it when printing formatting, and then reapply it. As an added bonus an unterminated color code is also detected, preventing it from leaking out into the rest of the terminal. Fixes #106633 --- llvm/utils/lit/lit/TestRunner.py | 28 +++++++++++++++++-- .../Inputs/escape-color/color-escaped.txt | 10 +++++++ .../lit/tests/Inputs/escape-color/color.txt | 6 ++++ .../lit/tests/Inputs/escape-color/lit.cfg | 8 ++++++ llvm/utils/lit/tests/escape-color.py | 4 +++ 5 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/color.txt create mode 100644 llvm/utils/lit/tests/Inputs/escape-color/lit.cfg create mode 100644 llvm/utils/lit/tests/escape-color.py diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 19f35fc7e212f3f..a1785073547ad06 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1017,6 +1017,20 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): return exitCode +def findColor(line, curr_color): + start = line.rfind("\33[") + if start == -1: + return curr_color + end = line.find("m", start + 2) + if end == -1: + return curr_color + match = line[start : end + 1] + # "\33[0m" means "reset all formatting". Sometimes the 0 is skipped. 
+ if match == "\33[m" or match == "\33[0m": + return None + return match + + def formatOutput(title, data, limit=None): if not data.strip(): return "" @@ -1027,8 +1041,18 @@ def formatOutput(title, data, limit=None): msg = "" ndashes = 30 # fmt: off - out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" - out += f"# | " + "\n# | ".join(data.splitlines()) + "\n" + out = f"# .---{title}{'-' * (ndashes - 4 - len(title))}\n" + curr_color = None + for line in data.splitlines(): + if curr_color: + out += "\33[0m" + out += "# | " + if curr_color: + out += curr_color + out += line + "\n" + curr_color = findColor(line, curr_color) + if curr_color: + out += "\33[0m" # prevent unterminated formatting from leaking out += f"# `---{msg}{'-' * (ndashes - 4 - len(msg))}\n" # fmt: on return out diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt b/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt new file mode 100644 index 000000000000000..e7a33e380b351cd --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/color-escaped.txt @@ -0,0 +1,10 @@ +# .---command stdout------------ +# | # RUN: cat %s +# | red +# | still red(B +# | plain +# | green +# | still green (never terminated) +# `----------------------------- + +-- diff --git a/llvm/utils/lit/tests/Inputs/escape-color/color.txt b/llvm/utils/lit/tests/Inputs/escape-color/color.txt new file mode 100644 index 000000000000000..15ffc22d134f0f5 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/color.txt @@ -0,0 +1,6 @@ +# RUN: cat %s +red +still red(B +plain +green +still green (never terminated) diff --git a/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg b/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg new file mode 100644 index 000000000000000..36f4eb69d4858e5 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/escape-color/lit.cfg @@ -0,0 +1,8 @@ +import lit.formats + +config.name = "escape-color" +config.suffixes = [".txt"] +config.test_format = lit.formats.ShTest() 
+config.test_source_root = None +config.test_exec_root = None + diff --git a/llvm/utils/lit/tests/escape-color.py b/llvm/utils/lit/tests/escape-color.py new file mode 100644 index 000000000000000..8fdda3553da3991 --- /dev/null +++ b/llvm/utils/lit/tests/escape-color.py @@ -0,0 +1,4 @@ +# cut off the first 9 lines to avoid absolute file paths in the output +# then keep only the next 10 lines to avoid test timing in the output +# RUN: %{lit} %{inputs}/escape-color/color.txt -a | tail -n +10 | head -n 10 > %t +# RUN: diff --strip-trailing-cr %{inputs}/escape-color/color-escaped.txt %t From 3cd01371e007b2a8fe32e5d8ce1154057e5e1c2e Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Wed, 11 Sep 2024 17:21:42 -0700 Subject: [PATCH 75/94] =?UTF-8?q?Revert=20"[RFC][C++20][Modules]=20Fix=20c?= =?UTF-8?q?rash=20when=20function=20and=20lambda=20insi=E2=80=A6=20(#10831?= =?UTF-8?q?1)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …de loaded from different modules (#104512)" This reverts commit d778689fdc812033e7142ed87e4ee13c4997b3f9. --- clang/include/clang/Serialization/ASTReader.h | 9 --- clang/lib/Serialization/ASTReader.cpp | 8 +- clang/lib/Serialization/ASTReaderDecl.cpp | 10 --- clang/lib/Serialization/ASTWriterDecl.cpp | 42 ---------- ...rash-instantiated-in-scope-cxx-modules.cpp | 76 ------------------- ...ash-instantiated-in-scope-cxx-modules2.cpp | 30 -------- 6 files changed, 1 insertion(+), 174 deletions(-) delete mode 100644 clang/test/Headers/crash-instantiated-in-scope-cxx-modules.cpp delete mode 100644 clang/test/Headers/crash-instantiated-in-scope-cxx-modules2.cpp diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index 7331bcf249266d5..898f4392465fdf3 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -1188,15 +1188,6 @@ class ASTReader /// once recursing loading has been completed. 
llvm::SmallVector PendingOdrMergeChecks; - /// Lambdas that need to be loaded right after the function they belong to. - /// It is required to have canonical declaration for lambda class from the - /// same module as enclosing function. This is required to correctly resolve - /// captured variables in the lambda. Without this, due to lazy - /// deserialization canonical declarations for the function and lambdas can - /// be from different modules and DeclRefExprs may refer to the AST nodes - /// that don't exist in the function. - SmallVector PendingLambdas; - using DataPointers = std::pair; using ObjCInterfaceDataPointers = diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 0ee53e43dff39c5..e5a1e20a265616a 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -9782,8 +9782,7 @@ void ASTReader::finishPendingActions() { !PendingDeducedVarTypes.empty() || !PendingIncompleteDeclChains.empty() || !PendingDeclChains.empty() || !PendingMacroIDs.empty() || !PendingDeclContextInfos.empty() || !PendingUpdateRecords.empty() || - !PendingObjCExtensionIvarRedeclarations.empty() || - !PendingLambdas.empty()) { + !PendingObjCExtensionIvarRedeclarations.empty()) { // If any identifiers with corresponding top-level declarations have // been loaded, load those declarations now. using TopLevelDeclsMap = @@ -9928,11 +9927,6 @@ void ASTReader::finishPendingActions() { } PendingObjCExtensionIvarRedeclarations.pop_back(); } - - // Load any pendiong lambdas. 
- for (auto ID : PendingLambdas) - GetDecl(ID); - PendingLambdas.clear(); } // At this point, all update records for loaded decls are in place, so any diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 20e577404d997df..9272e23c7da3fc6 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -1155,16 +1155,6 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) { for (unsigned I = 0; I != NumParams; ++I) Params.push_back(readDeclAs()); FD->setParams(Reader.getContext(), Params); - - // For the first decl add all lambdas inside for loading them later, - // otherwise skip them. - unsigned NumLambdas = Record.readInt(); - if (FD->isFirstDecl()) { - for (unsigned I = 0; I != NumLambdas; ++I) - Reader.PendingLambdas.push_back(Record.readDeclID()); - } else { - Record.skipInts(NumLambdas); - } } void ASTDeclReader::VisitObjCMethodDecl(ObjCMethodDecl *MD) { diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 732a6e21f340d69..555f6325da646bf 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -18,7 +18,6 @@ #include "clang/AST/Expr.h" #include "clang/AST/OpenMPClause.h" #include "clang/AST/PrettyDeclStackTrace.h" -#include "clang/AST/StmtVisitor.h" #include "clang/Basic/SourceManager.h" #include "clang/Serialization/ASTReader.h" #include "clang/Serialization/ASTRecordWriter.h" @@ -626,33 +625,6 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { : QualType()); } -static llvm::SmallVector collectLambdas(FunctionDecl *D) { - struct LambdaCollector : public ConstStmtVisitor { - llvm::SmallVectorImpl &Lambdas; - - LambdaCollector(llvm::SmallVectorImpl &Lambdas) - : Lambdas(Lambdas) {} - - void VisitLambdaExpr(const LambdaExpr *E) { - VisitStmt(E); - Lambdas.push_back(E->getLambdaClass()); - } - - void VisitStmt(const Stmt *S) { - if (!S) - return; - for 
(const Stmt *Child : S->children()) - if (Child) - Visit(Child); - } - }; - - llvm::SmallVector Lambdas; - if (D->hasBody()) - LambdaCollector(Lambdas).VisitStmt(D->getBody()); - return Lambdas; -} - void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { static_assert(DeclContext::NumFunctionDeclBits == 44, "You need to update the serializer after you change the " @@ -792,19 +764,6 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { Record.push_back(D->param_size()); for (auto *P : D->parameters()) Record.AddDeclRef(P); - - // Store references to all lambda decls inside function to load them - // immediately after loading the function to make sure that canonical - // decls for lambdas will be from the same module. - if (D->isCanonicalDecl()) { - llvm::SmallVector Lambdas = collectLambdas(D); - Record.push_back(Lambdas.size()); - for (const auto *L : Lambdas) - Record.AddDeclRef(L); - } else { - Record.push_back(0); - } - Code = serialization::DECL_FUNCTION; } @@ -2280,7 +2239,6 @@ getFunctionDeclAbbrev(serialization::DeclCode Code) { // // This is: // NumParams and Params[] from FunctionDecl, and - // NumLambdas, Lambdas[] from FunctionDecl, and // NumOverriddenMethods, OverriddenMethods[] from CXXMethodDecl. // // Add an AbbrevOp for 'size then elements' and use it here. 
diff --git a/clang/test/Headers/crash-instantiated-in-scope-cxx-modules.cpp b/clang/test/Headers/crash-instantiated-in-scope-cxx-modules.cpp deleted file mode 100644 index 80844a58ad825a0..000000000000000 --- a/clang/test/Headers/crash-instantiated-in-scope-cxx-modules.cpp +++ /dev/null @@ -1,76 +0,0 @@ -// RUN: rm -fR %t -// RUN: split-file %s %t -// RUN: cd %t -// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header -Werror=uninitialized folly-conv.h -// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header -Werror=uninitialized thrift_cpp2_base.h -// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header -Werror=uninitialized -fmodule-file=folly-conv.pcm -fmodule-file=thrift_cpp2_base.pcm logger_base.h - -//--- Conv.h -#pragma once - -template -_Up __declval(int); - -template -auto declval() noexcept -> decltype(__declval<_Tp>(0)); - -namespace folly { - -template -struct Expected { - template - auto thenOrThrow() -> decltype(declval()) { - return 1; - } -}; - -struct ExpectedHelper { - template - static constexpr Expected return_(T) { - return Expected(); - } - - template - static auto then_(This&&, Fn&&) - -> decltype(T::template return_((declval()(true), 0))) { - return Expected(); - } -}; - -template -inline Expected tryTo() { - Tgt result = 0; - // In build with asserts: - // clang/lib/Sema/SemaTemplateInstantiate.cpp: llvm::PointerUnion *clang::LocalInstantiationScope::findInstantiationOf(const Decl *): Assertion `isa(D) && "declaration not instantiated in this scope"' failed. 
- // In release build compilation error on the line below inside lambda: - // error: variable 'result' is uninitialized when used here [-Werror,-Wuninitialized] - ExpectedHelper::then_(Expected(), [&](bool) { return result; }); - return {}; -} - -} // namespace folly - -inline void bar() { - folly::tryTo(); -} -// expected-no-diagnostics - -//--- folly-conv.h -#pragma once -#include "Conv.h" -// expected-no-diagnostics - -//--- thrift_cpp2_base.h -#pragma once -#include "Conv.h" -// expected-no-diagnostics - -//--- logger_base.h -#pragma once -import "folly-conv.h"; -import "thrift_cpp2_base.h"; - -inline void foo() { - folly::tryTo(); -} -// expected-no-diagnostics diff --git a/clang/test/Headers/crash-instantiated-in-scope-cxx-modules2.cpp b/clang/test/Headers/crash-instantiated-in-scope-cxx-modules2.cpp deleted file mode 100644 index 5b1a904e928a682..000000000000000 --- a/clang/test/Headers/crash-instantiated-in-scope-cxx-modules2.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// RUN: rm -fR %t -// RUN: split-file %s %t -// RUN: cd %t -// RUN: %clang_cc1 -std=c++20 -emit-header-unit -xc++-user-header header.h -// RUN: %clang_cc1 -std=c++20 -fmodule-file=header.pcm main.cpp - -//--- header.h -template -void f(T) {} - -class A { - virtual ~A(); -}; - -inline A::~A() { - f([](){}); -} - -struct B { - void g() { - f([](){ - [](){}; - }); - } -}; -// expected-no-diagnostics - -//--- main.cpp -import "header.h"; -// expected-no-diagnostics From 480f07ff6c7ac2d928b6f1862698dbd51069735c Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 12 Sep 2024 08:41:50 +0800 Subject: [PATCH 76/94] [RISCV] Add fixed length vector patterns for vfwmaccbf16.vv (#108204) This adds VL patterns for vfwmaccbf16.vv so that we can handle fixed length vectors. It does this by teaching combineOp_VLToVWOp_VL to emit RISCVISD::VFWMADD_VL for bf16. The change in getOrCreateExtendedOp is needed because getNarrowType is based off of the bitwidth so returns f16. We need to explicitly check for bf16. 
Note that the .vf patterns don't work yet, since the build_vector splat gets lowered to a (vmv_v_x_vl (fmv_x_anyexth x)) instead of a vfmv.v.f, which SplatFP doesn't pick up, see #106637. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 13 +- .../RISCV/rvv/fixed-vectors-vfwmaccbf16.ll | 467 ++++++++++++++++++ 3 files changed, 485 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4554163d4551dc1..9f0831a1ad92253 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14480,6 +14480,13 @@ struct NodeExtensionHelper { if (Source.getValueType() == NarrowVT) return Source; + // vfmadd_vl -> vfwmadd_vl can take bf16 operands + if (Source.getValueType().getVectorElementType() == MVT::bf16) { + assert(Root->getSimpleValueType(0).getVectorElementType() == MVT::f32 && + Root->getOpcode() == RISCVISD::VFMADD_VL); + return Source; + } + unsigned ExtOpc = getExtOpc(*SupportsExt); // If we need an extension, we should be changing the type. @@ -15731,7 +15738,7 @@ static SDValue performVFMADD_VLCombine(SDNode *N, return V; if (N->getValueType(0).getVectorElementType() == MVT::f32 && - !Subtarget.hasVInstructionsF16()) + !Subtarget.hasVInstructionsF16() && !Subtarget.hasStdExtZvfbfwma()) return SDValue(); // FIXME: Ignore strict opcodes for now. 
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 699536b18696928..9afbe567193607d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2009,13 +2009,18 @@ multiclass VPatWidenFPMulAccVL_VV_VF { } } -multiclass VPatWidenFPMulAccVL_VV_VF_RM { - foreach vtiToWti = AllWidenableFloatVectors in { +multiclass VPatWidenFPMulAccVL_VV_VF_RM vtiToWtis = + AllWidenableFloatVectors> { + foreach vtiToWti = vtiToWtis in { defvar vti = vtiToWti.Vti; defvar wti = vtiToWti.Wti; defvar suffix = vti.LMul.MX # "_E" # vti.SEW; let Predicates = !listconcat(GetVTypePredicates.Predicates, - GetVTypePredicates.Predicates) in { + GetVTypePredicates.Predicates, + !if(!eq(vti.Scalar, bf16), + [HasStdExtZvfbfwma], + [])) in { def : Pat<(vop (vti.Vector vti.RegClass:$rs1), (vti.Vector vti.RegClass:$rs2), (wti.Vector wti.RegClass:$rd), (vti.Mask V0), @@ -2451,6 +2456,8 @@ defm : VPatFPMulAccVL_VV_VF_RM; // 13.7. 
Vector Widening Floating-Point Fused Multiply-Add Instructions defm : VPatWidenFPMulAccVL_VV_VF_RM; +defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; defm : VPatWidenFPMulAccVL_VV_VF_RM; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll new file mode 100644 index 000000000000000..62a479bdedf6493 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll @@ -0,0 +1,467 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefixes=ZVFBFMIN,ZVFBMIN32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefixes=ZVFBFMIN,ZVFBMIN64 + +define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v1f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: addi sp, sp, -16 +; ZVFBFWMA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa0 +; ZVFBFWMA-NEXT: fsw fa5, 8(sp) +; ZVFBFWMA-NEXT: addi a0, sp, 8 +; ZVFBFWMA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFBFWMA-NEXT: vle32.v v9, (a0) +; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa1 +; ZVFBFWMA-NEXT: fsw fa5, 12(sp) +; ZVFBFWMA-NEXT: addi a0, sp, 12 +; ZVFBFWMA-NEXT: vle32.v v10, (a0) +; ZVFBFWMA-NEXT: vfmacc.vv v8, v9, v10 +; ZVFBFWMA-NEXT: addi sp, sp, 16 +; ZVFBFWMA-NEXT: ret +; +; ZVFBMIN32-LABEL: vfwmaccbf16_vv_v1f32: +; ZVFBMIN32: # %bb.0: +; ZVFBMIN32-NEXT: addi sp, sp, -32 +; ZVFBMIN32-NEXT: .cfi_def_cfa_offset 32 +; ZVFBMIN32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; 
ZVFBMIN32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; ZVFBMIN32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill +; ZVFBMIN32-NEXT: .cfi_offset ra, -4 +; ZVFBMIN32-NEXT: .cfi_offset s0, -8 +; ZVFBMIN32-NEXT: .cfi_offset fs0, -16 +; ZVFBMIN32-NEXT: csrr a0, vlenb +; ZVFBMIN32-NEXT: slli a0, a0, 1 +; ZVFBMIN32-NEXT: sub sp, sp, a0 +; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 2 * vlenb +; ZVFBMIN32-NEXT: fmv.s fs0, fa0 +; ZVFBMIN32-NEXT: addi a0, sp, 16 +; ZVFBMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFBMIN32-NEXT: fmv.s fa0, fa1 +; ZVFBMIN32-NEXT: call __truncsfbf2 +; ZVFBMIN32-NEXT: fmv.x.w s0, fa0 +; ZVFBMIN32-NEXT: fmv.s fa0, fs0 +; ZVFBMIN32-NEXT: call __truncsfbf2 +; ZVFBMIN32-NEXT: fmv.x.w a0, fa0 +; ZVFBMIN32-NEXT: slli a0, a0, 16 +; ZVFBMIN32-NEXT: sw a0, 8(sp) +; ZVFBMIN32-NEXT: addi a0, sp, 8 +; ZVFBMIN32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFBMIN32-NEXT: vle32.v v10, (a0) +; ZVFBMIN32-NEXT: slli s0, s0, 16 +; ZVFBMIN32-NEXT: sw s0, 12(sp) +; ZVFBMIN32-NEXT: addi a0, sp, 12 +; ZVFBMIN32-NEXT: vle32.v v9, (a0) +; ZVFBMIN32-NEXT: addi a0, sp, 16 +; ZVFBMIN32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; ZVFBMIN32-NEXT: vfmacc.vv v8, v10, v9 +; ZVFBMIN32-NEXT: csrr a0, vlenb +; ZVFBMIN32-NEXT: slli a0, a0, 1 +; ZVFBMIN32-NEXT: add sp, sp, a0 +; ZVFBMIN32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; ZVFBMIN32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; ZVFBMIN32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload +; ZVFBMIN32-NEXT: addi sp, sp, 32 +; ZVFBMIN32-NEXT: ret +; +; ZVFBMIN64-LABEL: vfwmaccbf16_vv_v1f32: +; ZVFBMIN64: # %bb.0: +; ZVFBMIN64-NEXT: addi sp, sp, -64 +; ZVFBMIN64-NEXT: .cfi_def_cfa_offset 64 +; ZVFBMIN64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill +; ZVFBMIN64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill +; ZVFBMIN64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill +; ZVFBMIN64-NEXT: .cfi_offset ra, -8 +; ZVFBMIN64-NEXT: .cfi_offset s0, 
-16 +; ZVFBMIN64-NEXT: .cfi_offset fs0, -24 +; ZVFBMIN64-NEXT: csrr a0, vlenb +; ZVFBMIN64-NEXT: slli a0, a0, 1 +; ZVFBMIN64-NEXT: sub sp, sp, a0 +; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 2 * vlenb +; ZVFBMIN64-NEXT: fmv.s fs0, fa0 +; ZVFBMIN64-NEXT: addi a0, sp, 32 +; ZVFBMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFBMIN64-NEXT: fmv.s fa0, fa1 +; ZVFBMIN64-NEXT: call __truncsfbf2 +; ZVFBMIN64-NEXT: fmv.x.w s0, fa0 +; ZVFBMIN64-NEXT: fmv.s fa0, fs0 +; ZVFBMIN64-NEXT: call __truncsfbf2 +; ZVFBMIN64-NEXT: fmv.x.w a0, fa0 +; ZVFBMIN64-NEXT: slli a0, a0, 16 +; ZVFBMIN64-NEXT: fmv.w.x fa5, a0 +; ZVFBMIN64-NEXT: fsw fa5, 16(sp) +; ZVFBMIN64-NEXT: addi a0, sp, 16 +; ZVFBMIN64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFBMIN64-NEXT: vle32.v v10, (a0) +; ZVFBMIN64-NEXT: slli s0, s0, 16 +; ZVFBMIN64-NEXT: fmv.w.x fa5, s0 +; ZVFBMIN64-NEXT: fsw fa5, 20(sp) +; ZVFBMIN64-NEXT: addi a0, sp, 20 +; ZVFBMIN64-NEXT: vle32.v v9, (a0) +; ZVFBMIN64-NEXT: addi a0, sp, 32 +; ZVFBMIN64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; ZVFBMIN64-NEXT: vfmacc.vv v8, v10, v9 +; ZVFBMIN64-NEXT: csrr a0, vlenb +; ZVFBMIN64-NEXT: slli a0, a0, 1 +; ZVFBMIN64-NEXT: add sp, sp, a0 +; ZVFBMIN64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload +; ZVFBMIN64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; ZVFBMIN64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload +; ZVFBMIN64-NEXT: addi sp, sp, 64 +; ZVFBMIN64-NEXT: ret + %b.ext = fpext <1 x bfloat> %b to <1 x float> + %c.ext = fpext <1 x bfloat> %c to <1 x float> + %res = call <1 x float> @llvm.fma.v1f32(<1 x float> %b.ext, <1 x float> %c.ext, <1 x float> %a) + ret <1 x float> %res +} + +define <1 x float> @vfwmaccbf16_vf_v1f32(<1 x float> %a, bfloat %b, <1 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v1f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: addi sp, sp, -16 +; ZVFBFWMA-NEXT: .cfi_def_cfa_offset 16 +; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, 
fa0 +; ZVFBFWMA-NEXT: fsw fa5, 8(sp) +; ZVFBFWMA-NEXT: addi a0, sp, 8 +; ZVFBFWMA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFBFWMA-NEXT: vle32.v v9, (a0) +; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa1 +; ZVFBFWMA-NEXT: fsw fa5, 12(sp) +; ZVFBFWMA-NEXT: addi a0, sp, 12 +; ZVFBFWMA-NEXT: vle32.v v10, (a0) +; ZVFBFWMA-NEXT: vfmacc.vv v8, v9, v10 +; ZVFBFWMA-NEXT: addi sp, sp, 16 +; ZVFBFWMA-NEXT: ret +; +; ZVFBMIN32-LABEL: vfwmaccbf16_vf_v1f32: +; ZVFBMIN32: # %bb.0: +; ZVFBMIN32-NEXT: addi sp, sp, -48 +; ZVFBMIN32-NEXT: .cfi_def_cfa_offset 48 +; ZVFBMIN32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill +; ZVFBMIN32-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill +; ZVFBMIN32-NEXT: .cfi_offset ra, -4 +; ZVFBMIN32-NEXT: .cfi_offset fs0, -16 +; ZVFBMIN32-NEXT: csrr a0, vlenb +; ZVFBMIN32-NEXT: slli a0, a0, 1 +; ZVFBMIN32-NEXT: sub sp, sp, a0 +; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; ZVFBMIN32-NEXT: fmv.s fs0, fa0 +; ZVFBMIN32-NEXT: addi a0, sp, 32 +; ZVFBMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFBMIN32-NEXT: fmv.s fa0, fa1 +; ZVFBMIN32-NEXT: call __truncsfbf2 +; ZVFBMIN32-NEXT: fmv.x.w a0, fa0 +; ZVFBMIN32-NEXT: fmv.x.w a1, fs0 +; ZVFBMIN32-NEXT: slli a1, a1, 16 +; ZVFBMIN32-NEXT: sw a1, 8(sp) +; ZVFBMIN32-NEXT: addi a1, sp, 8 +; ZVFBMIN32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFBMIN32-NEXT: vle32.v v10, (a1) +; ZVFBMIN32-NEXT: slli a0, a0, 16 +; ZVFBMIN32-NEXT: sw a0, 12(sp) +; ZVFBMIN32-NEXT: addi a0, sp, 12 +; ZVFBMIN32-NEXT: vle32.v v9, (a0) +; ZVFBMIN32-NEXT: addi a0, sp, 32 +; ZVFBMIN32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; ZVFBMIN32-NEXT: vfmacc.vv v8, v10, v9 +; ZVFBMIN32-NEXT: csrr a0, vlenb +; ZVFBMIN32-NEXT: slli a0, a0, 1 +; ZVFBMIN32-NEXT: add sp, sp, a0 +; ZVFBMIN32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload +; ZVFBMIN32-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload +; ZVFBMIN32-NEXT: addi sp, sp, 48 +; ZVFBMIN32-NEXT: ret +; 
+; ZVFBMIN64-LABEL: vfwmaccbf16_vf_v1f32: +; ZVFBMIN64: # %bb.0: +; ZVFBMIN64-NEXT: addi sp, sp, -48 +; ZVFBMIN64-NEXT: .cfi_def_cfa_offset 48 +; ZVFBMIN64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; ZVFBMIN64-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill +; ZVFBMIN64-NEXT: .cfi_offset ra, -8 +; ZVFBMIN64-NEXT: .cfi_offset fs0, -16 +; ZVFBMIN64-NEXT: csrr a0, vlenb +; ZVFBMIN64-NEXT: slli a0, a0, 1 +; ZVFBMIN64-NEXT: sub sp, sp, a0 +; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb +; ZVFBMIN64-NEXT: fmv.s fs0, fa0 +; ZVFBMIN64-NEXT: addi a0, sp, 32 +; ZVFBMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; ZVFBMIN64-NEXT: fmv.s fa0, fa1 +; ZVFBMIN64-NEXT: call __truncsfbf2 +; ZVFBMIN64-NEXT: fmv.x.w a0, fa0 +; ZVFBMIN64-NEXT: fmv.x.w a1, fs0 +; ZVFBMIN64-NEXT: slli a1, a1, 16 +; ZVFBMIN64-NEXT: fmv.w.x fa5, a1 +; ZVFBMIN64-NEXT: fsw fa5, 24(sp) +; ZVFBMIN64-NEXT: addi a1, sp, 24 +; ZVFBMIN64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; ZVFBMIN64-NEXT: vle32.v v10, (a1) +; ZVFBMIN64-NEXT: slli a0, a0, 16 +; ZVFBMIN64-NEXT: fmv.w.x fa5, a0 +; ZVFBMIN64-NEXT: fsw fa5, 28(sp) +; ZVFBMIN64-NEXT: addi a0, sp, 28 +; ZVFBMIN64-NEXT: vle32.v v9, (a0) +; ZVFBMIN64-NEXT: addi a0, sp, 32 +; ZVFBMIN64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; ZVFBMIN64-NEXT: vfmacc.vv v8, v10, v9 +; ZVFBMIN64-NEXT: csrr a0, vlenb +; ZVFBMIN64-NEXT: slli a0, a0, 1 +; ZVFBMIN64-NEXT: add sp, sp, a0 +; ZVFBMIN64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; ZVFBMIN64-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload +; ZVFBMIN64-NEXT: addi sp, sp, 48 +; ZVFBMIN64-NEXT: ret + %b.head = insertelement <1 x bfloat> poison, bfloat %b, i32 0 + %b.splat = shufflevector <1 x bfloat> %b.head, <1 x bfloat> poison, <1 x i32> zeroinitializer + %b.ext = fpext <1 x bfloat> %b.splat to <1 x float> + %c.ext = fpext <1 x bfloat> %c to <1 x float> + %res = call <1 x float> @llvm.fma.v1f32(<1 x float> %b.ext, <1 x 
float> %c.ext, <1 x float> %a) + ret <1 x float> %res +} + +define <2 x float> @vfwmaccbf16_vv_v2f32(<2 x float> %a, <2 x bfloat> %b, <2 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v2f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v2f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v9 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v9, v10 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v9 +; ZVFBFMIN-NEXT: ret + %b.ext = fpext <2 x bfloat> %b to <2 x float> + %c.ext = fpext <2 x bfloat> %c to <2 x float> + %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %b.ext, <2 x float> %c.ext, <2 x float> %a) + ret <2 x float> %res +} + +define <2 x float> @vfwmaccbf16_vf_v2f32(<2 x float> %a, bfloat %b, <2 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v2f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 +; ZVFBFWMA-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFBFWMA-NEXT: vmv.v.x v10, a0 +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v9 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v2f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v10, a0 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v10 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v10 +; ZVFBFMIN-NEXT: ret + %b.head = insertelement <2 x bfloat> poison, bfloat %b, i32 0 + %b.splat = shufflevector <2 x bfloat> %b.head, <2 x bfloat> poison, <2 x i32> zeroinitializer + %b.ext = fpext <2 x bfloat> %b.splat to <2 x float> + %c.ext = fpext <2 x bfloat> %c to <2 x float> + %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %b.ext, <2 x float> %c.ext, <2 x float> %a) + ret <2 x float> %res +} + 
+define <4 x float> @vfwmaccbf16_vv_v4f32(<4 x float> %a, <4 x bfloat> %b, <4 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v4f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v4f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v9 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v9, v10 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v9 +; ZVFBFMIN-NEXT: ret + %b.ext = fpext <4 x bfloat> %b to <4 x float> + %c.ext = fpext <4 x bfloat> %c to <4 x float> + %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %b.ext, <4 x float> %c.ext, <4 x float> %a) + ret <4 x float> %res +} + +define <4 x float> @vfwmaccbf16_vf_v4f32(<4 x float> %a, bfloat %b, <4 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v4f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 +; ZVFBFWMA-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFBFWMA-NEXT: vmv.v.x v10, a0 +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v9 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v4f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v10, a0 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v10 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v10, v9 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v10 +; ZVFBFMIN-NEXT: ret + %b.head = insertelement <4 x bfloat> poison, bfloat %b, i32 0 + %b.splat = shufflevector <4 x bfloat> %b.head, <4 x bfloat> poison, <4 x i32> zeroinitializer + %b.ext = fpext <4 x bfloat> %b.splat to <4 x float> + %c.ext = fpext <4 x bfloat> %c to <4 x float> + %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %b.ext, <4 x float> %c.ext, <4 x float> %a) + ret <4 x float> %res +} + +define <8 x float> @vfwmaccbf16_vv_v8f32(<8 x float> %a, <8 
x bfloat> %b, <8 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v8f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v11 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v8f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v12, v10 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v14, v11 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v12, v14 +; ZVFBFMIN-NEXT: ret + %b.ext = fpext <8 x bfloat> %b to <8 x float> + %c.ext = fpext <8 x bfloat> %c to <8 x float> + %res = call <8 x float> @llvm.fma.v8f32(<8 x float> %b.ext, <8 x float> %c.ext, <8 x float> %a) + ret <8 x float> %res +} + +define <8 x float> @vfwmaccbf16_vf_v8f32(<8 x float> %a, bfloat %b, <8 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v8f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 +; ZVFBFWMA-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFWMA-NEXT: vmv.v.x v11, a0 +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v11, v10 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v8f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v11, a0 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v12, v11 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v14, v10 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v12, v14 +; ZVFBFMIN-NEXT: ret + %b.head = insertelement <8 x bfloat> poison, bfloat %b, i32 0 + %b.splat = shufflevector <8 x bfloat> %b.head, <8 x bfloat> poison, <8 x i32> zeroinitializer + %b.ext = fpext <8 x bfloat> %b.splat to <8 x float> + %c.ext = fpext <8 x bfloat> %c to <8 x float> + %res = call <8 x float> @llvm.fma.v8f32(<8 x float> %b.ext, <8 x float> %c.ext, <8 x float> %a) + ret <8 x float> %res +} + +define <16 x float> @vfwmaccbf16_vv_v16f32(<16 x float> %a, <16 x bfloat> %b, <16 x bfloat> %c) { +; ZVFBFWMA-LABEL: 
vfwmaccbf16_vv_v16f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v12, v14 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v16f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v16, v12 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v20, v14 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v16, v20 +; ZVFBFMIN-NEXT: ret + %b.ext = fpext <16 x bfloat> %b to <16 x float> + %c.ext = fpext <16 x bfloat> %c to <16 x float> + %res = call <16 x float> @llvm.fma.v16f32(<16 x float> %b.ext, <16 x float> %c.ext, <16 x float> %a) + ret <16 x float> %res +} + +define <16 x float> @vfwmaccbf16_vf_v16f32(<16 x float> %a, bfloat %b, <16 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v16f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 +; ZVFBFWMA-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFWMA-NEXT: vmv.v.x v14, a0 +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v14, v12 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v16f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v14, a0 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v16, v14 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v20, v12 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v16, v20 +; ZVFBFMIN-NEXT: ret + %b.head = insertelement <16 x bfloat> poison, bfloat %b, i32 0 + %b.splat = shufflevector <16 x bfloat> %b.head, <16 x bfloat> poison, <16 x i32> zeroinitializer + %b.ext = fpext <16 x bfloat> %b.splat to <16 x float> + %c.ext = fpext <16 x bfloat> %c to <16 x float> + %res = call <16 x float> @llvm.fma.v16f32(<16 x float> %b.ext, <16 x float> %c.ext, <16 x float> %a) + ret <16 x float> %res +} + +define <32 x float> @vfwmaccbf32_vv_v32f32(<32 x float> %a, <32 x bfloat> %b, <32 x bfloat> %c) { +; ZVFBFWMA-LABEL: 
vfwmaccbf32_vv_v32f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: li a0, 32 +; ZVFBFWMA-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v16, v20 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf32_vv_v32f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: li a0, 32 +; ZVFBFMIN-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v24, v16 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v0, v20 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v24, v0 +; ZVFBFMIN-NEXT: ret + %b.ext = fpext <32 x bfloat> %b to <32 x float> + %c.ext = fpext <32 x bfloat> %c to <32 x float> + %res = call <32 x float> @llvm.fma.v32f32(<32 x float> %b.ext, <32 x float> %c.ext, <32 x float> %a) + ret <32 x float> %res +} + +define <32 x float> @vfwmaccbf32_vf_v32f32(<32 x float> %a, bfloat %b, <32 x bfloat> %c) { +; ZVFBFWMA-LABEL: vfwmaccbf32_vf_v32f32: +; ZVFBFWMA: # %bb.0: +; ZVFBFWMA-NEXT: fmv.x.h a0, fa0 +; ZVFBFWMA-NEXT: li a1, 32 +; ZVFBFWMA-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFBFWMA-NEXT: vmv.v.x v20, a0 +; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v20, v16 +; ZVFBFWMA-NEXT: ret +; +; ZVFBFMIN-LABEL: vfwmaccbf32_vf_v32f32: +; ZVFBFMIN: # %bb.0: +; ZVFBFMIN-NEXT: fmv.x.w a0, fa0 +; ZVFBFMIN-NEXT: li a1, 32 +; ZVFBFMIN-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; ZVFBFMIN-NEXT: vmv.v.x v20, a0 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v24, v20 +; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v0, v16 +; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma +; ZVFBFMIN-NEXT: vfmacc.vv v8, v24, v0 +; ZVFBFMIN-NEXT: ret + %b.head = insertelement <32 x bfloat> poison, bfloat %b, i32 0 + %b.splat = shufflevector <32 x bfloat> %b.head, <32 x bfloat> poison, <32 x i32> zeroinitializer + %b.ext = fpext <32 x bfloat> %b.splat to <32 x float> + %c.ext = fpext <32 x bfloat> %c to <32 x float> + %res = call <32 x float> @llvm.fma.v32f32(<32 x float> %b.ext, <32 x float> %c.ext, <32 x float> %a) + ret <32 x float> %res +} From 
44d122188e0edf4a834bcd97256cf4af0de05890 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 12 Sep 2024 08:42:25 +0800 Subject: [PATCH 77/94] [RISCV] Expand bf16 vector truncstores and extloads (#108235) Previously they were legal by default, so the truncstore/extload test cases would get combined and crash during selection. These are set to expand for f16 so do the same for bf16. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 + llvm/test/CodeGen/RISCV/rvv/load-bf16.ll | 71 +++++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/store-bf16.ll | 71 +++++++++++++++++++++ 3 files changed, 144 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/load-bf16.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/store-bf16.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 9f0831a1ad92253..8112b5eb144da94 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1128,6 +1128,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, continue; SetCommonVFPActions(VT); SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs); + SetCommonVFPExtLoadTruncStoreActions(VT, BF16VecVTs); } } @@ -1137,6 +1138,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, continue; SetCommonVFPActions(VT); SetCommonVFPExtLoadTruncStoreActions(VT, F16VecVTs); + SetCommonVFPExtLoadTruncStoreActions(VT, BF16VecVTs); SetCommonVFPExtLoadTruncStoreActions(VT, F32VecVTs); } } diff --git a/llvm/test/CodeGen/RISCV/rvv/load-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/load-bf16.ll new file mode 100644 index 000000000000000..1108bb16b6712bb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/load-bf16.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin 
-verify-machineinstrs | FileCheck %s + +define @load_nxv1bf16(ptr %p) { +; CHECK-LABEL: load_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret + %x = load , ptr %p + ret %x +} + +define @load_nxv2bf16(ptr %p) { +; CHECK-LABEL: load_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: ret + %x = load , ptr %p + ret %x +} + +define @load_nxv4bf16(ptr %p) { +; CHECK-LABEL: load_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vl1re16.v v8, (a0) +; CHECK-NEXT: ret + %x = load , ptr %p + ret %x +} + +define @load_nxv8bf16(ptr %p) { +; CHECK-LABEL: load_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vl2re16.v v8, (a0) +; CHECK-NEXT: ret + %x = load , ptr %p + ret %x +} + +define @load_nxv16bf16(ptr %p) { +; CHECK-LABEL: load_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vl4re16.v v8, (a0) +; CHECK-NEXT: ret + %x = load , ptr %p + ret %x +} + +define @load_nxv32bf16(ptr %p) { +; CHECK-LABEL: load_nxv32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re16.v v8, (a0) +; CHECK-NEXT: ret + %x = load , ptr %p + ret %x +} + +define @extload(ptr %p) { +; CHECK-LABEL: extload: +; CHECK: # %bb.0: +; CHECK-NEXT: vl1re16.v v10, (a0) +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: ret + %x = load , ptr %p + %y = fpext %x to + ret %y +} diff --git a/llvm/test/CodeGen/RISCV/rvv/store-bf16.ll b/llvm/test/CodeGen/RISCV/rvv/store-bf16.ll new file mode 100644 index 000000000000000..30cbf9a39031906 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/store-bf16.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s + +define void @store_nxv1bf16( %v, ptr %p) { 
+; CHECK-LABEL: store_nxv1bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + store %v, ptr %p + ret void +} + +define void @store_nxv2bf16( %v, ptr %p) { +; CHECK-LABEL: store_nxv2bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + store %v, ptr %p + ret void +} + +define void @store_nxv4bf16( %v, ptr %p) { +; CHECK-LABEL: store_nxv4bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vs1r.v v8, (a0) +; CHECK-NEXT: ret + store %v, ptr %p + ret void +} + +define void @store_nxv8bf16( %v, ptr %p) { +; CHECK-LABEL: store_nxv8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vs2r.v v8, (a0) +; CHECK-NEXT: ret + store %v, ptr %p + ret void +} + +define void @store_nxv16bf16( %v, ptr %p) { +; CHECK-LABEL: store_nxv16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: ret + store %v, ptr %p + ret void +} + +define void @store_nxv32bf32( %v, ptr %p) { +; CHECK-LABEL: store_nxv32bf32: +; CHECK: # %bb.0: +; CHECK-NEXT: vs8r.v v8, (a0) +; CHECK-NEXT: ret + store %v, ptr %p + ret void +} + +define void @truncstore( %v, ptr %p) { +; CHECK-LABEL: truncstore: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vs1r.v v10, (a0) +; CHECK-NEXT: ret + %w = fptrunc %v to + store %w, ptr %p + ret void +} From 1a431bcea7c2606ebaab47b58e5bba082189675c Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Thu, 12 Sep 2024 09:10:16 +0800 Subject: [PATCH 78/94] [mlir][Tosa] Fix attr type of out_shape for `tosa.transpose_conv2d` (#108041) This patch fixes attr type of out_shape, which is i64 dense array attribute with exactly 4 elements. - Fix description of DenseArrayMaxCt - Add DenseArrayMinCt and move it to CommonAttrConstraints.td - Change type of out_shape to Tosa_IntArrayAttr4 Fixes #107804. 
--- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 2 +- mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td | 3 --- mlir/include/mlir/IR/CommonAttrConstraints.td | 8 ++++++++ mlir/test/Dialect/Tosa/invalid.mlir | 9 +++++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 63572f287b7ddec..539b7cd0b74267c 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -347,7 +347,7 @@ def Tosa_TransposeConv2DOp : Tosa_InferShapedTypeOp<"transpose_conv2d"> { Tosa_Tensor1D:$bias, Tosa_IntArrayAttr4:$out_pad, Tosa_IntArrayAttr2:$stride, - Tosa_IntArrayAttrUpto4:$out_shape, + Tosa_IntArrayAttr4:$out_shape, OptionalAttr:$quantization_info, DefaultValuedOptionalAttr:$local_bound ); diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index c3a0128e95a84bb..a4b43d656fe43e5 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -176,9 +176,6 @@ def Tosa_Int8Like : Tosa_TypeLike<[Tosa_Int8], "signless-integer-8-bit-like">; //===----------------------------------------------------------------------===// // Attribute predicates and classes. 
//===----------------------------------------------------------------------===// -class DenseArrayMaxCt : AttrConstraint< - CPred<"::llvm::cast<::mlir::DenseArrayAttr>($_self).size() <= " # n>, - "with at least " # n # " elements">; def Tosa_Fp32ArrayAttr2 : ConfinedAttr]>; def Tosa_Fp32ArrayAttr3 : ConfinedAttr]>; diff --git a/mlir/include/mlir/IR/CommonAttrConstraints.td b/mlir/include/mlir/IR/CommonAttrConstraints.td index 6774a7c568315d8..853fb318c76e71a 100644 --- a/mlir/include/mlir/IR/CommonAttrConstraints.td +++ b/mlir/include/mlir/IR/CommonAttrConstraints.td @@ -789,6 +789,14 @@ class DenseArrayCount : AttrConstraint< CPred<"::llvm::cast<::mlir::DenseArrayAttr>($_self).size() == " #n>, "with exactly " # n # " elements">; +class DenseArrayMaxCt : AttrConstraint< + CPred<"::llvm::cast<::mlir::DenseArrayAttr>($_self).size() <= " # n>, + "with at most " # n # " elements">; + +class DenseArrayMinCt : AttrConstraint< + CPred<"::llvm::cast<::mlir::DenseArrayAttr>($_self).size() >= " # n>, + "with at least " # n # " elements">; + class DenseArrayStrictlyPositive : AttrConstraint< CPred<"::llvm::all_of(::llvm::cast<" # arrayType #">($_self).asArrayRef(), " "[&](auto v) { return v > 0; })">, diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir index 414bcfe237d7535..311fdb1226c523f 100644 --- a/mlir/test/Dialect/Tosa/invalid.mlir +++ b/mlir/test/Dialect/Tosa/invalid.mlir @@ -578,3 +578,12 @@ func.func @test_table_io_shape_mismatch(%arg0: tensor, %arg1: tensor<6 %0 = tosa.table %arg0, %arg1 : (tensor, tensor<6xi16>) -> tensor return } + +// ----- + +// CHECK-LABEL: test_transpose_conv2d_invalid_outshape +func.func @test_transpose_conv2d_invalid_outshape(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>, %arg2: tensor<16xf32>) -> tensor<1x32x32x16xf32> { + // expected-error@+1 {{'tosa.transpose_conv2d' op attribute 'out_shape' failed to satisfy constraint: i64 dense array attribute with exactly 4 elements}} + %0 = 
tosa.transpose_conv2d %arg0, %arg1, %arg2 {out_pad = array, out_shape = array, stride = array} : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<1x32x32x16xf32> + return %0 : tensor<1x32x32x16xf32> +} From 757d8b3efdd82a02973d0ab4ebaa2e05e9ab7ae0 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Thu, 12 Sep 2024 09:11:12 +0800 Subject: [PATCH 79/94] [RISCV] Allow -mcmodel= to accept large for RV64 (#107817) --- clang/docs/ReleaseNotes.rst | 2 ++ clang/lib/Driver/ToolChains/CommonArgs.cpp | 7 ++++++- clang/test/Driver/riscv-mcmodel.c | 9 +++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index af6d1c5826a2fcc..9860b25f2e7fa6f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -460,6 +460,8 @@ LoongArch Support RISC-V Support ^^^^^^^^^^^^^^ +- The option ``-mcmodel=large`` for the large code model is supported. + CUDA/HIP Language Changes ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 2ce6779f4b43e35..f58b816a9709dda 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -2902,11 +2902,16 @@ void tools::addMCModel(const Driver &D, const llvm::opt::ArgList &Args, } else if (Triple.isPPC64() || Triple.isOSAIX()) { Ok = CM == "small" || CM == "medium" || CM == "large"; } else if (Triple.isRISCV()) { + // Large code model is disallowed to be used with PIC code model. 
+ if (CM == "large" && RelocationModel != llvm::Reloc::Static) + D.Diag(diag::err_drv_argument_not_allowed_with) + << A->getAsString(Args) << "-fpic"; if (CM == "medlow") CM = "small"; else if (CM == "medany") CM = "medium"; - Ok = CM == "small" || CM == "medium"; + Ok = CM == "small" || CM == "medium" || + (CM == "large" && Triple.isRISCV64()); } else if (Triple.getArch() == llvm::Triple::x86_64) { Ok = llvm::is_contained({"small", "kernel", "medium", "large", "tiny"}, CM); diff --git a/clang/test/Driver/riscv-mcmodel.c b/clang/test/Driver/riscv-mcmodel.c index 4f5fa95f59b6661..c27d7c63a75a4f4 100644 --- a/clang/test/Driver/riscv-mcmodel.c +++ b/clang/test/Driver/riscv-mcmodel.c @@ -10,5 +10,14 @@ // RUN: %clang --target=riscv32 -### -c -mcmodel=medany %s 2>&1 | FileCheck --check-prefix=MEDIUM %s // RUN: %clang --target=riscv64 -### -c -mcmodel=medany %s 2>&1 | FileCheck --check-prefix=MEDIUM %s +// RUN: not %clang --target=riscv32 -### -c -mcmodel=large %s 2>&1 | FileCheck --check-prefix=ERR-LARGE %s +// RUN: %clang --target=riscv64 -### -c -mcmodel=large %s 2>&1 | FileCheck --check-prefix=LARGE %s + +// RUN: not %clang --target=riscv64 -### -c -mcmodel=large -fpic %s 2>&1 | FileCheck --check-prefix=LARGE %s + // SMALL: "-mcmodel=small" // MEDIUM: "-mcmodel=medium" +// LARGE: "-mcmodel=large" + +// ERR-LARGE: error: unsupported argument 'large' to option '-mcmodel=' for target 'riscv32' +// ERR-PIC-LARGE: error: invalid argument '-mcmodel=large' not allowed with '-fpic' From 3d129016b1a0cb00a26bfab521350ef824d6d76d Mon Sep 17 00:00:00 2001 From: Justin Bogner Date: Wed, 11 Sep 2024 18:36:40 -0700 Subject: [PATCH 80/94] [DirectX] Preserve value names in DXILOpLowering. NFC (#108089) If the value we're replacing has a name, we might as well preserve it. 
--- llvm/lib/Target/DirectX/DXILOpBuilder.cpp | 7 ++--- llvm/lib/Target/DirectX/DXILOpBuilder.h | 3 ++- llvm/lib/Target/DirectX/DXILOpLowering.cpp | 27 +++++++++++-------- .../DirectX/CreateHandleFromBinding.ll | 12 ++++----- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp index 3b2a5f5061eb838..7719d6b1079110b 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.cpp +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.cpp @@ -386,6 +386,7 @@ static Error makeOpError(dxil::OpCode OpCode, Twine Msg) { Expected DXILOpBuilder::tryCreateOp(dxil::OpCode OpCode, ArrayRef Args, + const Twine &Name, Type *RetTy) { const OpCodeProperty *Prop = getOpCodeProperty(OpCode); @@ -451,12 +452,12 @@ Expected DXILOpBuilder::tryCreateOp(dxil::OpCode OpCode, OpArgs.push_back(IRB.getInt32(llvm::to_underlying(OpCode))); OpArgs.append(Args.begin(), Args.end()); - return IRB.CreateCall(DXILFn, OpArgs); + return IRB.CreateCall(DXILFn, OpArgs, Name); } CallInst *DXILOpBuilder::createOp(dxil::OpCode OpCode, ArrayRef Args, - Type *RetTy) { - Expected Result = tryCreateOp(OpCode, Args, RetTy); + const Twine &Name, Type *RetTy) { + Expected Result = tryCreateOp(OpCode, Args, Name, RetTy); if (Error E = Result.takeError()) llvm_unreachable("Invalid arguments for operation"); return *Result; diff --git a/llvm/lib/Target/DirectX/DXILOpBuilder.h b/llvm/lib/Target/DirectX/DXILOpBuilder.h index a68f0c43f67afbf..037ae3822cfb906 100644 --- a/llvm/lib/Target/DirectX/DXILOpBuilder.h +++ b/llvm/lib/Target/DirectX/DXILOpBuilder.h @@ -39,11 +39,12 @@ class DXILOpBuilder { /// Create a call instruction for the given DXIL op. The arguments /// must be valid for an overload of the operation. CallInst *createOp(dxil::OpCode Op, ArrayRef Args, - Type *RetTy = nullptr); + const Twine &Name = "", Type *RetTy = nullptr); /// Try to create a call instruction for the given DXIL op. Fails if the /// overload is invalid. 
Expected tryCreateOp(dxil::OpCode Op, ArrayRef Args, + const Twine &Name = "", Type *RetTy = nullptr); /// Get a `%dx.types.ResRet` type with the given element type. diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index d98d0bfde04fc67..3ee3ee05563c241 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -118,7 +118,7 @@ class OpLowerer { Args.append(CI->arg_begin(), CI->arg_end()); Expected OpCall = - OpBuilder.tryCreateOp(DXILOp, Args, F.getReturnType()); + OpBuilder.tryCreateOp(DXILOp, Args, CI->getName(), F.getReturnType()); if (Error E = OpCall.takeError()) return E; @@ -198,7 +198,7 @@ class OpLowerer { ConstantInt::get(Int32Ty, Binding.RecordID), CI->getArgOperand(3), CI->getArgOperand(4)}; Expected OpCall = - OpBuilder.tryCreateOp(OpCode::CreateHandle, Args); + OpBuilder.tryCreateOp(OpCode::CreateHandle, Args, CI->getName()); if (Error E = OpCall.takeError()) return E; @@ -233,15 +233,16 @@ class OpLowerer { Binding.LowerBound, UpperBound, Binding.Space, RI.getResourceClass()); std::array BindArgs{ResBind, CI->getArgOperand(3), CI->getArgOperand(4)}; - Expected OpBind = - OpBuilder.tryCreateOp(OpCode::CreateHandleFromBinding, BindArgs); + Expected OpBind = OpBuilder.tryCreateOp( + OpCode::CreateHandleFromBinding, BindArgs, CI->getName()); if (Error E = OpBind.takeError()) return E; std::array AnnotateArgs{ *OpBind, OpBuilder.getResProps(Props.first, Props.second)}; - Expected OpAnnotate = - OpBuilder.tryCreateOp(OpCode::AnnotateHandle, AnnotateArgs); + Expected OpAnnotate = OpBuilder.tryCreateOp( + OpCode::AnnotateHandle, AnnotateArgs, + CI->hasName() ? 
CI->getName() + "_annot" : Twine()); if (Error E = OpAnnotate.takeError()) return E; @@ -286,7 +287,10 @@ class OpLowerer { if (!CheckOp) { Value *NewEVI = IRB.CreateExtractValue(Op, 4); Expected OpCall = OpBuilder.tryCreateOp( - OpCode::CheckAccessFullyMapped, {NewEVI}, Int32Ty); + OpCode::CheckAccessFullyMapped, {NewEVI}, + OldResult->hasName() ? OldResult->getName() + "_check" + : Twine(), + Int32Ty); if (Error E = OpCall.takeError()) return E; CheckOp = *OpCall; @@ -296,7 +300,8 @@ class OpLowerer { } } - OldResult = cast(IRB.CreateExtractValue(Op, 0)); + OldResult = cast( + IRB.CreateExtractValue(Op, 0, OldResult->getName())); OldTy = ST->getElementType(0); } @@ -403,8 +408,8 @@ class OpLowerer { Type *NewRetTy = OpBuilder.getResRetType(OldTy->getScalarType()); std::array Args{Handle, Index0, Index1}; - Expected OpCall = - OpBuilder.tryCreateOp(OpCode::BufferLoad, Args, NewRetTy); + Expected OpCall = OpBuilder.tryCreateOp( + OpCode::BufferLoad, Args, CI->getName(), NewRetTy); if (Error E = OpCall.takeError()) return E; if (Error E = replaceResRetUses(CI, *OpCall, HasCheckBit)) @@ -447,7 +452,7 @@ class OpLowerer { std::array Args{Handle, Index0, Index1, Data0, Data1, Data2, Data3, Mask}; Expected OpCall = - OpBuilder.tryCreateOp(OpCode::BufferStore, Args); + OpBuilder.tryCreateOp(OpCode::BufferStore, Args, CI->getName()); if (Error E = OpCall.takeError()) return E; diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll index d0c80c018b8d7eb..dbdd2e61df7a3b9 100644 --- a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll +++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll @@ -19,14 +19,14 @@ define void @test_bindings() { %typed0 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_1_0_0( i32 3, i32 5, i32 1, i32 4, i1 false) - ; CHECK: [[BUF0:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { 
i32 5, i32 5, i32 3, i8 1 }, i32 4, i1 false) + ; CHECK: [[BUF0:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 5, i32 5, i32 3, i8 1 }, i32 4, i1 false) ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF0]], %dx.types.ResourceProperties { i32 4106, i32 1033 }) ; RWBuffer Buf : register(u7, space2) %typed1 = call target("dx.TypedBuffer", i32, 1, 0, 1) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_1_0_0t( i32 2, i32 7, i32 1, i32 6, i1 false) - ; CHECK: [[BUF1:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 7, i32 7, i32 2, i8 1 }, i32 6, i1 false) + ; CHECK: [[BUF1:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 7, i32 7, i32 2, i8 1 }, i32 6, i1 false) ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF1]], %dx.types.ResourceProperties { i32 4106, i32 260 }) ; Buffer Buf[24] : register(t3, space5) @@ -35,7 +35,7 @@ define void @test_bindings() { %typed2 = call target("dx.TypedBuffer", <4 x i32>, 0, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_i32_0_0_0t( i32 5, i32 3, i32 24, i32 7, i1 false) - ; CHECK: [[BUF2:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 3, i32 26, i32 5, i8 0 }, i32 7, i1 false) + ; CHECK: [[BUF2:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 3, i32 26, i32 5, i8 0 }, i32 7, i1 false) ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF2]], %dx.types.ResourceProperties { i32 10, i32 1029 }) ; struct S { float4 a; uint4 b; }; @@ -43,14 +43,14 @@ define void @test_bindings() { %struct0 = call target("dx.RawBuffer", {<4 x float>, <4 x i32>}, 0, 0) @llvm.dx.handle.fromBinding.tdx.RawBuffer_sl_v4f32v4i32s_0_0t( i32 4, i32 2, i32 1, i32 10, i1 true) - ; CHECK: [[BUF3:%[0-9]*]] = call %dx.types.Handle 
@dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 2, i32 2, i32 4, i8 0 }, i32 10, i1 true) + ; CHECK: [[BUF3:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 2, i32 2, i32 4, i8 0 }, i32 10, i1 true) ; CHECK: = call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF3]], %dx.types.ResourceProperties { i32 1036, i32 32 }) ; ByteAddressBuffer Buf : register(t8, space1) %byteaddr0 = call target("dx.RawBuffer", i8, 0, 0) @llvm.dx.handle.fromBinding.tdx.RawBuffer_i8_0_0t( i32 1, i32 8, i32 1, i32 12, i1 false) - ; CHECK: [[BUF4:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 8, i32 8, i32 1, i8 0 }, i32 12, i1 false) + ; CHECK: [[BUF4:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 8, i32 8, i32 1, i8 0 }, i32 12, i1 false) ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF4]], %dx.types.ResourceProperties { i32 11, i32 0 }) ; Buffer Buf[] : register(t0) @@ -59,7 +59,7 @@ define void @test_bindings() { %typed3 = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_v4f32_0_0_0t( i32 0, i32 0, i32 -1, i32 %typed3_ix, i1 false) - ; CHECK: [[BUF5:%[0-9]*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 0, i32 -1, i32 0, i8 0 }, i32 %typed3_ix, i1 false) + ; CHECK: [[BUF5:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 218, %dx.types.ResBind { i32 0, i32 -1, i32 0, i8 0 }, i32 %typed3_ix, i1 false) ; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 217, %dx.types.Handle [[BUF5]], %dx.types.ResourceProperties { i32 10, i32 1033 }) ret void From 39751e7ff998266bdefeaaf3b3bf3cdba26b0322 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Thu, 12 Sep 2024 09:51:38 +0800 Subject: [PATCH 81/94] [clang-tidy][NFC] fix add_new_check python3.8 incompatibility (#107871) Fixes: 
#107846 --- clang-tools-extra/clang-tidy/add_new_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/clang-tidy/add_new_check.py b/clang-tools-extra/clang-tidy/add_new_check.py index d384dbae28abbcf..e366f1005353570 100755 --- a/clang-tools-extra/clang-tidy/add_new_check.py +++ b/clang-tools-extra/clang-tidy/add_new_check.py @@ -17,7 +17,7 @@ import textwrap # FIXME Python 3.9: Replace typing.Tuple with builtins.tuple. -from typing import Optional, Tuple +from typing import Optional, Tuple, Match # Adapts the module's CMakelist file. Returns 'True' if it could add a new @@ -511,7 +511,7 @@ def has_auto_fix(check_name: str) -> str: return "" - def process_doc(doc_file: Tuple[str, str]) -> Tuple[str, Optional[re.Match[str]]]: + def process_doc(doc_file: Tuple[str, str]) -> Tuple[str, Optional[Match[str]]]: check_name = doc_file[0] + "-" + doc_file[1].replace(".rst", "") with io.open(os.path.join(docs_dir, *doc_file), "r", encoding="utf8") as doc: From c9ab69798ff92f1fcd150a0e1988d08fb8c2a59d Mon Sep 17 00:00:00 2001 From: vporpo Date: Wed, 11 Sep 2024 18:55:08 -0700 Subject: [PATCH 82/94] [SandboxIR] Implement ConstantTokenNone (#108106) This patch implements sandboxir::ConstantTokenNone mirroring llvm::ConstantTokenNone. 
--- llvm/include/llvm/SandboxIR/SandboxIR.h | 32 +++++++++++++++++++ .../llvm/SandboxIR/SandboxIRValues.def | 1 + llvm/lib/SandboxIR/SandboxIR.cpp | 9 ++++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 24 ++++++++++++++ 4 files changed, 66 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index 88884683f591a47..95fe239555fb41f 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -124,6 +124,7 @@ class ConstantAggregateZero; class ConstantPointerNull; class PoisonValue; class BlockAddress; +class ConstantTokenNone; class Context; class Function; class Instruction; @@ -1141,6 +1142,37 @@ class BlockAddress final : public Constant { } }; +// TODO: This should inherit from ConstantData. +class ConstantTokenNone final : public Constant { + ConstantTokenNone(llvm::ConstantTokenNone *C, Context &Ctx) + : Constant(ClassID::ConstantTokenNone, C, Ctx) {} + friend class Context; // For constructor. + +public: + /// Return the ConstantTokenNone. + static ConstantTokenNone *get(Context &Ctx); + + /// For isa/dyn_cast. + static bool classof(const sandboxir::Value *From) { + return From->getSubclassID() == ClassID::ConstantTokenNone; + } + + unsigned getUseOperandNo(const Use &Use) const final { + llvm_unreachable("ConstantTokenNone has no operands!"); + } + +#ifndef NDEBUG + void verify() const override { + assert(isa(Val) && + "Expected a ConstantTokenNone!"); + } + void dumpOS(raw_ostream &OS) const override { + dumpCommonPrefix(OS); + dumpCommonSuffix(OS); + } +#endif +}; + /// Iterator for `Instruction`s in a `BasicBlock. /// \Returns an sandboxir::Instruction & when derereferenced. 
class BBIterator { diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def index c29e8be24ea7542..bd2f533e880ac69 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def +++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def @@ -35,6 +35,7 @@ DEF_CONST(ConstantPointerNull, ConstantPointerNull) DEF_CONST(UndefValue, UndefValue) DEF_CONST(PoisonValue, PoisonValue) DEF_CONST(BlockAddress, BlockAddress) +DEF_CONST(ConstantTokenNone, ConstantTokenNone) #ifndef DEF_INSTR #define DEF_INSTR(ID, OPCODE, CLASS) diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp index df3839518c9d089..05d05f7ed10fb93 100644 --- a/llvm/lib/SandboxIR/SandboxIR.cpp +++ b/llvm/lib/SandboxIR/SandboxIR.cpp @@ -2521,6 +2521,11 @@ BasicBlock *BlockAddress::getBasicBlock() const { Ctx.getValue(cast(Val)->getBasicBlock())); } +ConstantTokenNone *ConstantTokenNone::get(Context &Ctx) { + auto *LLVMC = llvm::ConstantTokenNone::get(Ctx.LLVMCtx); + return cast(Ctx.getOrCreateConstant(LLVMC)); +} + FunctionType *Function::getFunctionType() const { return cast( Ctx.getType(cast(Val)->getFunctionType())); @@ -2621,6 +2626,10 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) { It->second = std::unique_ptr( new BlockAddress(cast(C), *this)); return It->second.get(); + case llvm::Value::ConstantTokenNoneVal: + It->second = std::unique_ptr( + new ConstantTokenNone(cast(C), *this)); + return It->second.get(); case llvm::Value::ConstantAggregateZeroVal: { auto *CAZ = cast(C); It->second = std::unique_ptr( diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 148afd9483d5681..6280963d588facd 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -777,6 +777,30 @@ define void @foo(ptr %ptr) { EXPECT_EQ(LookupBB2Addr, nullptr); } +TEST_F(SandboxIRTest, ConstantTokenNone) { + parseIR(C, R"IR( +define 
void @foo(ptr %ptr) { + bb0: + %cs = catchswitch within none [label %handler] unwind to caller + handler: + ret void +} +)IR"); + Function &LLVMF = *M->getFunction("foo"); + sandboxir::Context Ctx(C); + + [[maybe_unused]] auto &F = *Ctx.createFunction(&LLVMF); + auto *BB0 = cast( + Ctx.getValue(getBasicBlockByName(LLVMF, "bb0"))); + auto *CS = cast(&*BB0->begin()); + + // Check classof(), creation, getFunction(), getBasicBlock(). + auto *CTN = cast(CS->getParentPad()); + // Check get(). + auto *NewCTN = sandboxir::ConstantTokenNone::get(Ctx); + EXPECT_EQ(NewCTN, CTN); +} + TEST_F(SandboxIRTest, Use) { parseIR(C, R"IR( define i32 @foo(i32 %v0, i32 %v1) { From 2d4bdfba96d4cf88b12226b2b511bf55ee5e6559 Mon Sep 17 00:00:00 2001 From: Yun-Fly Date: Thu, 12 Sep 2024 10:02:57 +0800 Subject: [PATCH 83/94] [mlir][scf] Extend consumer fuse to single nested `scf.for` (#94190) Refactor current consumer fusion based on `addInitOperandsToLoopNest` to support single nested `scf.for`, E.g. ``` %0 = scf.for() { %1 = scf.for() { tiledProducer } yield %1 } %2 = consumer ins(%0) ``` --- .../SCF/Transforms/TileUsingInterface.cpp | 345 +++++++++--------- .../tile-and-fuse-consumer.mlir | 77 +++- 2 files changed, 241 insertions(+), 181 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index e404c01010a3259..04624638e14c004 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -1481,6 +1481,50 @@ static FailureOr getConsumerFromUses(Value val, return &operand; } +/// Find the perfectly nested loops outside of given loop(included) sorted from +/// outer to inner. +/// +/// E.g. +/// +/// ``` +/// %0 = scf.for() +/// %1 = scf.for() +/// %2 = scf.for() +/// %3 = ... +/// yield %3 +/// yield %2 +/// yield %1 +/// ``` +/// +/// This function will return three perfectly nested loops: %0 + %1 + %2, when +/// target inner loop is %2. 
+static SmallVector +getPerfectlyNestedLoopsOutsideOf(scf::ForOp loop) { + SmallVector nestLoops = {loop}; + auto outerLoop = dyn_cast(loop->getParentOp()); + + // Check if it is the ForOp that yield the result of inner loop. + auto isForOpYieldResultOfInnerLoop = + [](scf::ForOp outerLoop) -> LogicalResult { + Block *body = outerLoop.getBody(); + if (!llvm::hasSingleElement(body->without_terminator())) + return failure(); + auto yieldOp = cast(body->getTerminator()); + auto innerForOp = dyn_cast(body->front()); + if (!innerForOp) + return failure(); + // All of innerForOp results should be yielded. + return success(innerForOp->getNumResults() == yieldOp->getNumOperands()); + }; + + while (outerLoop && succeeded(isForOpYieldResultOfInnerLoop(outerLoop))) { + nestLoops.push_back(outerLoop); + outerLoop = dyn_cast(outerLoop->getParentOp()); + } + // sorted from outer to inner + return {nestLoops.rbegin(), nestLoops.rend()}; +} + /// Fetch the untiled consumer of a scf.for's result which is yielded by a /// tensor.insert_slice. This function makes the following assumptions : /// 1. tensor.insert_slice has scf.yield as its only user. @@ -1498,9 +1542,10 @@ getUntiledConsumerFromSlice(tensor::InsertSliceOp candidateSliceOp) { auto forOp = dyn_cast(containingOp); if (!forOp) return failure(); - Value resultingValue = forOp->getResult(resultNumber); + scf::ForOp topLevelForOp = getPerfectlyNestedLoopsOutsideOf(forOp).front(); + Value resultingValue = topLevelForOp->getResult(resultNumber); - return getConsumerFromUses(resultingValue, containingOp->getBlock()); + return getConsumerFromUses(resultingValue, topLevelForOp->getBlock()); } /// Fetch the first untiled consumer of a scf.forall's result which is yielded @@ -1563,59 +1608,6 @@ static FailureOr getUntiledConsumerFromSlice(Operation *sliceOp) { } } -/// After fusing consumer into scf.for we want to modify the scf.yield operation -/// to reflect the same by returning the values yielded by the tiled consumer. 
-static void -fixTerminatorSCFYield(RewriterBase &rewriter, scf::ForOp newForOp, - TilingResult &tilingResult, - ArrayRef> &resultOffsets, - ArrayRef> &resultSizes, - ArrayRef bbArgs) { - scf::YieldOp oldTerminatorOp = - cast(newForOp.getBody()->getTerminator()); - unsigned totalOldResults = oldTerminatorOp->getNumResults(); - unsigned totalTiledResults = tilingResult.tiledOps[0]->getNumResults(); - SmallVector newYieldOperands; - newYieldOperands.reserve(totalOldResults + totalTiledResults); - for (auto oldResult : oldTerminatorOp.getResults()) { - newYieldOperands.push_back(oldResult); - } - rewriter.setInsertionPointAfter(oldTerminatorOp); - Location loc = newForOp.getLoc(); - for (auto [tiledResult, bbArg, resultOffset, resultSize] : - llvm::zip_equal(tilingResult.tiledOps[0]->getResults(), bbArgs, - resultOffsets, resultSizes)) { - SmallVector strides(resultOffset.size(), - rewriter.getIndexAttr(1)); - Value newInsertSliceOp = rewriter.create( - loc, tiledResult, bbArg, resultOffset, resultSize, strides); - newYieldOperands.push_back(newInsertSliceOp); - } - rewriter.create(loc, newYieldOperands); - rewriter.eraseOp(oldTerminatorOp); -} - -/// After fusing consumer into scf.forall we want to yield each of the resulting -/// values by the tiled consumer within scf.forall.in_parallel region. 
-static void -fixTerminatorSCFInParallel(RewriterBase &rewriter, scf::ForallOp newForallOp, - SmallVector tiledResults, - ArrayRef> &resultOffsets, - ArrayRef> &resultSizes, - ArrayRef bbArgs) { - scf::InParallelOp newTerminatorOp = newForallOp.getTerminator(); - rewriter.setInsertionPointToStart(newTerminatorOp.getBody()); - Location firstYieldOpLoc = - (*(newTerminatorOp.getYieldingOps().begin())).getLoc(); - for (auto [tiledResult, bbArg, resultOffset, resultSize] : - llvm::zip_equal(tiledResults, bbArgs, resultOffsets, resultSizes)) { - SmallVector strides(resultOffset.size(), - rewriter.getIndexAttr(1)); - rewriter.create( - firstYieldOpLoc, tiledResult, bbArg, resultOffset, resultSize, strides); - } -} - /// Implementation of fusing consumer of a single slice by computing the /// slice of the consumer in-place for scf loop. FailureOr @@ -1646,81 +1638,63 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, consumerOp, "consumer op's operand doesn't seem to be an OpResult"); } - Operation *oldLoopOp = nullptr; - SmallVector newOuts; - Block *oldLoopBody = nullptr; - unsigned initSize = 0; - unsigned rank = 1; + // There are two possible cases regarding `oldLoopOp` here: + // 1. single `scf.forall` or `scf.for`. + // 2. inner-most `scf.for` insider nest `scf.loop` structure, where the + // top-level loop is the outer-most one of these nested loops. 
+ LoopLikeOpInterface innerMostLoop = + candidateSliceOp->getParentOfType(); + SmallVector nestedLoops; if (isInsertSliceOp) { - auto forOp = candidateSliceOp->getParentOfType(); - oldLoopOp = forOp; - llvm::append_range(newOuts, forOp.getInits()); - oldLoopBody = forOp.getBody(); - initSize = forOp.getInits().size(); + nestedLoops = llvm::map_to_vector( + getPerfectlyNestedLoopsOutsideOf( + cast(innerMostLoop.getOperation())), + [](scf::ForOp forOp) { + return cast(forOp.getOperation()); + }); } else { - auto forallOp = candidateSliceOp->getParentOfType(); - oldLoopOp = forallOp; - llvm::append_range(newOuts, forallOp.getOutputs()); - oldLoopBody = forallOp.getBody(); - initSize = forallOp.getOutputs().size(); - rank = forallOp.getRank(); + nestedLoops = {innerMostLoop}; } - if (failed(checkAssumptionForLoop(oldLoopOp, consumerOp))) { + LoopLikeOpInterface outerMostLoop = nestedLoops.front(); + + if (failed(checkAssumptionForLoop(outerMostLoop, consumerOp))) { return rewriter.notifyMatchFailure( - oldLoopOp, "containing loop op should either yield just one value or " - "have the consumer op as its first user"); + outerMostLoop, + "containing loop op should either yield just one value or " + "have the consumer op as its first user"); } OpBuilder::InsertionGuard g(rewriter); // 2. Check consumer is not using scf loop's output as init. 
- auto dstOp = cast(consumerOp); + auto dstOp = dyn_cast(consumerOp); + if (!dstOp) + return rewriter.notifyMatchFailure(consumerOp, + "consumer op is not DPS operation"); SmallVector dpsInits = llvm::map_to_vector(dstOp.getDpsInits(), [](Value v) { return v; }); - if (llvm::is_contained(dpsInits, oldLoopOp->getResult(resultNumber))) { + if (llvm::is_contained(dpsInits, outerMostLoop->getResult(resultNumber))) { return rewriter.notifyMatchFailure( consumerOp, "consumer op taking the result of scf.for as init is not supported"); } - newOuts.append(dpsInits); - - Location loc = oldLoopOp->getLoc(); + SmallVector newInits = dpsInits; - // 3. Create new scf loop op. - rewriter.setInsertionPoint(consumerOp); - Operation *newLoopOp = nullptr; - Block *newLoopBody = nullptr; - if (isInsertSliceOp) { - auto forOp = cast(oldLoopOp); - auto newForOp = rewriter.create(loc, forOp.getLowerBound(), - forOp.getUpperBound(), - forOp.getStep(), newOuts); - newLoopOp = newForOp; - newLoopBody = newForOp.getBody(); - } else { - auto forallOp = cast(oldLoopOp); - auto newForallOp = rewriter.create( - loc, forallOp.getMixedLowerBound(), forallOp.getMixedUpperBound(), - forallOp.getMixedStep(), newOuts, forallOp.getMapping()); - newLoopOp = newForallOp; - rewriter.eraseOp(newForallOp.getTerminator()); - newLoopBody = newForallOp.getBody(); - } + Location loc = outerMostLoop->getLoc(); - // 4. Move the loop body to the new op. - unsigned oldNumArguments = oldLoopBody->getNumArguments(); - rewriter.mergeBlocks(oldLoopBody, newLoopBody, - newLoopBody->getArguments().take_front(oldNumArguments)); + // 3. Move the whole loop structure right before consumer Op, the dominance + // should be already ensured by `checkAssumptionForLoop`. + rewriter.moveOpBefore(outerMostLoop, consumerOp); - // 5. Set insertion point before terminator op of the loop and create a new + // 4. Set insertion point before terminator op of the loop and create a new // tensor.insert_slice. 
In the scf.for case this is a clone of the // candidateSliceOp whereas in the scf.forall case this is created from the // operands of tensor.parallel_insert_slice. tensor::InsertSliceOp clonedInsertSliceOp; if (auto sliceOp = dyn_cast(candidateSliceOp)) { - auto newForallOp = cast(newLoopOp); + auto newForallOp = cast(innerMostLoop.getOperation()); rewriter.setInsertionPoint(newForallOp.getTerminator()); clonedInsertSliceOp = rewriter.create( loc, sliceOp.getSource(), sliceOp.getDest(), sliceOp.getMixedOffsets(), @@ -1731,20 +1705,17 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, cast(rewriter.clone(*candidateSliceOp)); } - // 6.a. Clone consumer op. - auto newForOpBlockArgsForConsumerDest = - newLoopBody->getArguments().drop_front(oldNumArguments); - auto clonedConsumerOp = cast(cloneOpAndUpdateDestinationArgs( - rewriter, consumerOp, newForOpBlockArgsForConsumerDest)); + // 5.a. Clone consumer op. + auto clonedConsumerOp = cast(rewriter.clone(*consumerOp)); - // 6.b. Replace all uses of the loop result with the result of the cloned + // 5.b. Replace all uses of the loop result with the result of the cloned // tensor.insert_slice. OpOperand &operandToReplace = clonedConsumerOp->getOpOperand(operandNumber); rewriter.modifyOpInPlace(clonedConsumerOp, [&]() { operandToReplace.set(clonedInsertSliceOp.getResult()); }); - // 7 - Perform tiling of the cloned consumer and replace the operand at + // 6. Perform tiling of the cloned consumer and replace the operand at // `operandNumber` with the source of the cloned tensor.insert_slice op. 
auto ossSliceOp = cast(clonedInsertSliceOp.getOperation()); @@ -1754,79 +1725,105 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, if (failed(tileAndFuseResult)) { return failure(); } - rewriter.replaceAllUsesWith( - tileAndFuseResult->tiledOps[0]->getOperand(operandNumber), - clonedInsertSliceOp.getSource()); - - // 8 - Extract offset/sizes/strides required to create the - // tensor.insert_slice/parallel_insert_slice for each result of the consumer. - SmallVector offsets = ossSliceOp.getMixedOffsets(); - SmallVector sizes = ossSliceOp.getMixedSizes(); - SmallVector strides = ossSliceOp.getMixedStrides(); - - // 9. Check all insert stride is 1. - if (llvm::any_of(strides, [](OpFoldResult stride) { - return !isConstantIntValue(stride, 1); - })) { - return rewriter.notifyMatchFailure( - candidateSliceOp, "containingOp's result yield with stride"); - } + auto tiledConsumerOp = cast(tileAndFuseResult->tiledOps[0]); + rewriter.replaceAllUsesWith(tiledConsumerOp->getOperand(operandNumber), + clonedInsertSliceOp.getSource()); - // 10. Try to get iter domain position from input position. - SmallVector iterDomainOffsets, iterDomainSizes; - if (failed(clonedConsumerOp.getIterationDomainTileFromOperandTile( - rewriter, operandNumber, offsets, sizes, iterDomainOffsets, - iterDomainSizes))) { - return rewriter.notifyMatchFailure( - clonedConsumerOp, "can't get iter domain position from input position"); - } + // 7. Reconstruct [nested] loop with new inits. + YieldTiledValuesFn newYieldValuesFn = + [&](RewriterBase &innerRewriter, Location loc, ValueRange /*ivs*/, + ValueRange newRegionIterArgs, SmallVector &tiledResult, + SmallVector> &tiledOffset, + SmallVector> &tiledSizes) -> LogicalResult { + OpBuilder::InsertionGuard g(innerRewriter); + // 8. Set inner insertPoint right before tiled consumer op. + innerRewriter.setInsertionPoint(tiledConsumerOp); - // 11. Try to fetch the offset and size for all results of the cloned - // consumer. 
This would then be used to form the corresponding - // tensor.insert_slice/parallel_insert_slice later. - unsigned totalNumResultsOfConsumer = clonedConsumerOp->getNumResults(); - SmallVector> resultOffsets( - totalNumResultsOfConsumer); - SmallVector> resultSizes(totalNumResultsOfConsumer); - for (auto [idx, v] : llvm::enumerate(clonedConsumerOp->getResults())) { - if (failed(clonedConsumerOp.getResultTilePosition( - rewriter, idx, iterDomainOffsets, iterDomainSizes, - resultOffsets[idx], resultSizes[idx]))) { + SmallVector offsets = ossSliceOp.getMixedOffsets(); + SmallVector sizes = ossSliceOp.getMixedSizes(); + SmallVector strides = ossSliceOp.getMixedStrides(); + + // 9. Check all insert stride is 1. + if (llvm::any_of(strides, [](OpFoldResult stride) { + return !isConstantIntValue(stride, 1); + })) { return rewriter.notifyMatchFailure( - clonedConsumerOp, - "can't get result domain position from iter domain position"); + candidateSliceOp, "containingOp's result yield with stride"); } - } - auto arrayRefOffsets = ArrayRef>(resultOffsets); - auto arrayRefSizes = ArrayRef>(resultSizes); - if (isInsertSliceOp) { - auto newForOp = cast(newLoopOp); - fixTerminatorSCFYield( - rewriter, newForOp, *tileAndFuseResult, arrayRefOffsets, arrayRefSizes, - newForOp.getBody()->getArguments().drop_front(1 + initSize)); - } else { - auto newForallOp = cast(newLoopOp); - fixTerminatorSCFInParallel( - rewriter, newForallOp, tileAndFuseResult->tiledOps[0]->getResults(), - arrayRefOffsets, arrayRefSizes, - newForallOp.getBody()->getArguments().drop_front(rank + initSize)); - } + // 10. Try to get iter domain position from input position. + SmallVector iterDomainOffsets, iterDomainSizes; + if (failed(tiledConsumerOp.getIterationDomainTileFromOperandTile( + rewriter, operandNumber, offsets, sizes, iterDomainOffsets, + iterDomainSizes))) { + return rewriter.notifyMatchFailure( + tiledConsumerOp, + "can't get iter domain position from input position"); + } - // 12. 
Replace the result of scf loop and consumer op with new loop's results. - for (auto &&[oldResult, newResult] : - llvm::zip_first(oldLoopOp->getResults(), newLoopOp->getResults())) { - rewriter.replaceAllUsesWith(oldResult, newResult); + // 11. Try to fetch the offset and size for all results of the cloned + // consumer. This would then be used to form the corresponding + // tensor.insert_slice/parallel_insert_slice later. + unsigned totalNumResultsOfConsumer = tiledConsumerOp->getNumResults(); + SmallVector> resultOffsets( + totalNumResultsOfConsumer); + SmallVector> resultSizes( + totalNumResultsOfConsumer); + for (auto [idx, v] : llvm::enumerate(tiledConsumerOp->getResults())) { + if (failed(tiledConsumerOp.getResultTilePosition( + rewriter, idx, iterDomainOffsets, iterDomainSizes, + resultOffsets[idx], resultSizes[idx]))) { + return rewriter.notifyMatchFailure( + tiledConsumerOp, + "can't get result domain position from iter domain position"); + } + } + + // 12. Create `extract_slice` for `iter_args` for DPS operation if + // necessary. + if (auto tiledDestStyleOp = dyn_cast( + tiledConsumerOp.getOperation())) { + rewriter.setInsertionPoint(tiledDestStyleOp); + for (const auto &&[index, newRegionArg] : + llvm::enumerate(newRegionIterArgs)) { + auto destSlice = rewriter.create( + loc, newRegionArg, resultOffsets[index], resultSizes[index], + SmallVector(resultOffsets[index].size(), + rewriter.getIndexAttr(1))); + rewriter.modifyOpInPlace(tiledDestStyleOp, [&]() { + tiledDestStyleOp.getDpsInitsMutable()[index].set(destSlice); + }); + } + } + + // 13. Prepare tiled offset and sizes for later `insert_slice` creation by + // caller. 
+ Block *block = rewriter.getInsertionPoint()->getBlock(); + rewriter.setInsertionPoint(block->getTerminator()); + for (const auto &&[index, result] : + llvm::enumerate(tiledConsumerOp->getResults())) { + tiledResult.push_back(result); + tiledOffset.emplace_back(resultOffsets[index]); + tiledSizes.emplace_back(resultSizes[index]); + } + return success(); + }; + // 14. Add new inits to [nested] loops. + if (failed(addInitOperandsToLoopNest(rewriter, nestedLoops, newInits, + newYieldValuesFn))) { + return rewriter.notifyMatchFailure(tiledConsumerOp, + "unable to add new inits to nest loop"); } - for (auto &&[oldResult, newResult] : - llvm::zip(consumerOp->getResults(), - newLoopOp->getResults().drop_front(initSize))) { + // 15. Replace the result of scf loop and consumer op with new loop's results. + + for (auto &&[oldResult, newResult] : llvm::zip( + consumerOp->getResults(), + nestedLoops.front()->getResults().take_back(newInits.size()))) { rewriter.replaceAllUsesWith(oldResult, newResult); } - // 13. Need to erase the old scf loop and the cloned consumer op. - rewriter.eraseOp(oldLoopOp); + // 16. Need to erase the old scf loop and the cloned consumer op. 
rewriter.eraseOp(clonedConsumerOp); return scf::SCFFuseConsumerOfSliceResult{ diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 83c5ec8d7342c85..fdefdcc453ae7aa 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -109,9 +109,9 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: ins(%[[MAT_OUT]], %[[SLICE_OPERAND2]] : // CHECK-SAME: outs(%[[SLICE_OUT]] : // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]] into %[[ELEM_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[MAT_OUT]] into %[[SECOND_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[SECOND_ARG_SLICE]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]] into %[[ELEM_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } // CHECK: return %[[FINAL_RESULT]]#2 : @@ -248,10 +248,10 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: ins(%[[MAT_OUT]], %[[SLICE_OPERAND2]] : // CHECK-SAME: outs(%[[SLICE_OUT_0]], %[[SLICE_OUT_1]] : // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#0 into %[[ELEM_OUT_ARG_0]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] -// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#1 into %[[ELEM_OUT_ARG_1]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[MAT_OUT]] into %[[SECOND_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[SECOND_ARG_SLICE]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#0 into %[[ELEM_OUT_ARG_0]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#1 into 
%[[ELEM_OUT_ARG_1]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } // CHECK: %[[UNPACK:.*]] = tensor.unpack %[[FINAL_RESULT]]#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %{{.*}} : tensor<64x32xf32> -> tensor<2048xf32> @@ -310,8 +310,8 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK_OUT]] into %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [1024] [1] // CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK_OUT]] into %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [1024] [1] // CHECK: } // CHECK: } // CHECK: return %[[FINAL_RESULT]]#1 : @@ -369,8 +369,71 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[TILED_PACK_DEST]] // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] -// CHECK: } +// CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] + +// ----- + +module { + func.func @fuse_add_consumer_into_nested_scf_for(%arg0: tensor<256x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256x256xf32>) -> tensor<256x256xf32> { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index + %cst = arith.constant 0.000000e+00 : f32 + %dest0 = tensor.empty() : tensor<256x256xf32> + %dest1 = linalg.fill ins(%cst : f32) outs(%dest0 : tensor<256x256xf32>) -> 
tensor<256x256xf32> + %1 = scf.for %arg3 = %c0 to %c256 step %c64 iter_args(%arg4 = %dest1) -> (tensor<256x256xf32>) { + %2 = scf.for %arg5 = %c0 to %c256 step %c64 iter_args(%arg6 = %arg4) -> (tensor<256x256xf32>) { + %extracted_slice_1 = tensor.extract_slice %arg6[%arg3, %arg5] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> + %extracted_slice_2 = tensor.extract_slice %arg0[%arg3, 0] [64, 512] [1, 1] : tensor<256x512xf32> to tensor<64x512xf32> + %extracted_slice_3 = tensor.extract_slice %arg1[0, %arg5] [512, 64] [1, 1] : tensor<512x256xf32> to tensor<512x64xf32> + %3 = linalg.matmul ins(%extracted_slice_2, %extracted_slice_3 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> + %insert_slice = tensor.insert_slice %3 into %arg6[%arg3, %arg5] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> + scf.yield %insert_slice : tensor<256x256xf32> + } + scf.yield %2 : tensor<256x256xf32> + } + %4 = linalg.add ins(%1, %arg2 : tensor<256x256xf32>, tensor<256x256xf32>) outs(%dest0 : tensor<256x256xf32>) -> tensor<256x256xf32> + return %4 : tensor<256x256xf32> + } +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: func.func @fuse_add_consumer_into_nested_scf_for( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<256x512xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<512x256xf32> +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<256x256xf32> +// CHECK: %[[dest0:.*]] = tensor.empty() : tensor<256x256xf32> +// CHECK: %[[dest1:.*]] = linalg.fill +// CHECK-SAME: outs(%[[dest0]] : +// CHECK: %[[LOOP_RESULT1:.*]]:2 = scf.for %[[IV1:.*]] = 
%[[C0]] +// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG1:.*]] = %[[dest1]], %[[SECOND_OUT_ARG1:.*]] = %[[dest0]]) +// CHECK-SAME: { +// CHECK: %[[LOOP_RESULT2:.*]]:2 = scf.for %[[IV2:.*]] = %[[C0]] +// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG2:.*]] = %[[FIRST_OUT_ARG1]], %[[SECOND_OUT_ARG2:.*]] = %[[SECOND_OUT_ARG1]]) +// CHECK-SAME: { +// CHECK: %[[MAT_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[INPUT_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[IV1]], 0] [64, 512] [1, 1] +// CHECK: %[[WEIGHT_SLICE:.*]] = tensor.extract_slice %[[ARG1]][0, %[[IV2]]] [512, 64] [1, 1] +// CHECK: %[[TILED_MAT_OUT:.*]] = linalg.matmul +// CHECK-SAME: outs(%[[MAT_OUT_SLICE]] : +// CHECK: %[[INSERT_MAT:.*]] = tensor.insert_slice %[[TILED_MAT_OUT]] into %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[ADD_OPERAND2_SLICE:.*]] = tensor.extract_slice %[[ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[ADD_OUT_SLICE:.*]] = tensor.extract_slice %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[TILED_ADD_OUT:.*]] = linalg.add +// CHECK-SAME: ins(%[[TILED_MAT_OUT]], %[[ADD_OPERAND2_SLICE]] : +// CHECK-SAME: outs(%[[ADD_OUT_SLICE]] : +// CHECK: %[[INSERT_ADD:.*]] = tensor.insert_slice %[[TILED_ADD_OUT]] into %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: scf.yield %[[INSERT_MAT]], %[[INSERT_ADD]] : +// CHECK: } +// CHECK: scf.yield %[[LOOP_RESULT2]]#0, %[[LOOP_RESULT2]]#1 : // CHECK: } -// CHECK: return %[[FINAL_RESULT]]#1 : +// CHECK: return %[[LOOP_RESULT1]]#1 : From 335538c271c9c71ef3f2e23680265e7b77595be0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Sep 2024 19:18:37 -0700 Subject: [PATCH 84/94] Revert "[mlir][scf] Extend consumer fuse to single nested `scf.for` (#94190)" This reverts commit 2d4bdfba96d4cf88b12226b2b511bf55ee5e6559. 
A build breakage is reported at: https://lab.llvm.org/buildbot/#/builders/138/builds/3524 --- .../SCF/Transforms/TileUsingInterface.cpp | 345 +++++++++--------- .../tile-and-fuse-consumer.mlir | 77 +--- 2 files changed, 181 insertions(+), 241 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index 04624638e14c004..e404c01010a3259 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -1481,50 +1481,6 @@ static FailureOr getConsumerFromUses(Value val, return &operand; } -/// Find the perfectly nested loops outside of given loop(included) sorted from -/// outer to inner. -/// -/// E.g. -/// -/// ``` -/// %0 = scf.for() -/// %1 = scf.for() -/// %2 = scf.for() -/// %3 = ... -/// yield %3 -/// yield %2 -/// yield %1 -/// ``` -/// -/// This function will return three perfectly nested loops: %0 + %1 + %2, when -/// target inner loop is %2. -static SmallVector -getPerfectlyNestedLoopsOutsideOf(scf::ForOp loop) { - SmallVector nestLoops = {loop}; - auto outerLoop = dyn_cast(loop->getParentOp()); - - // Check if it is the ForOp that yield the result of inner loop. - auto isForOpYieldResultOfInnerLoop = - [](scf::ForOp outerLoop) -> LogicalResult { - Block *body = outerLoop.getBody(); - if (!llvm::hasSingleElement(body->without_terminator())) - return failure(); - auto yieldOp = cast(body->getTerminator()); - auto innerForOp = dyn_cast(body->front()); - if (!innerForOp) - return failure(); - // All of innerForOp results should be yielded. 
- return success(innerForOp->getNumResults() == yieldOp->getNumOperands()); - }; - - while (outerLoop && succeeded(isForOpYieldResultOfInnerLoop(outerLoop))) { - nestLoops.push_back(outerLoop); - outerLoop = dyn_cast(outerLoop->getParentOp()); - } - // sorted from outer to inner - return {nestLoops.rbegin(), nestLoops.rend()}; -} - /// Fetch the untiled consumer of a scf.for's result which is yielded by a /// tensor.insert_slice. This function makes the following assumptions : /// 1. tensor.insert_slice has scf.yield as its only user. @@ -1542,10 +1498,9 @@ getUntiledConsumerFromSlice(tensor::InsertSliceOp candidateSliceOp) { auto forOp = dyn_cast(containingOp); if (!forOp) return failure(); - scf::ForOp topLevelForOp = getPerfectlyNestedLoopsOutsideOf(forOp).front(); - Value resultingValue = topLevelForOp->getResult(resultNumber); + Value resultingValue = forOp->getResult(resultNumber); - return getConsumerFromUses(resultingValue, topLevelForOp->getBlock()); + return getConsumerFromUses(resultingValue, containingOp->getBlock()); } /// Fetch the first untiled consumer of a scf.forall's result which is yielded @@ -1608,6 +1563,59 @@ static FailureOr getUntiledConsumerFromSlice(Operation *sliceOp) { } } +/// After fusing consumer into scf.for we want to modify the scf.yield operation +/// to reflect the same by returning the values yielded by the tiled consumer. 
+static void +fixTerminatorSCFYield(RewriterBase &rewriter, scf::ForOp newForOp, + TilingResult &tilingResult, + ArrayRef> &resultOffsets, + ArrayRef> &resultSizes, + ArrayRef bbArgs) { + scf::YieldOp oldTerminatorOp = + cast(newForOp.getBody()->getTerminator()); + unsigned totalOldResults = oldTerminatorOp->getNumResults(); + unsigned totalTiledResults = tilingResult.tiledOps[0]->getNumResults(); + SmallVector newYieldOperands; + newYieldOperands.reserve(totalOldResults + totalTiledResults); + for (auto oldResult : oldTerminatorOp.getResults()) { + newYieldOperands.push_back(oldResult); + } + rewriter.setInsertionPointAfter(oldTerminatorOp); + Location loc = newForOp.getLoc(); + for (auto [tiledResult, bbArg, resultOffset, resultSize] : + llvm::zip_equal(tilingResult.tiledOps[0]->getResults(), bbArgs, + resultOffsets, resultSizes)) { + SmallVector strides(resultOffset.size(), + rewriter.getIndexAttr(1)); + Value newInsertSliceOp = rewriter.create( + loc, tiledResult, bbArg, resultOffset, resultSize, strides); + newYieldOperands.push_back(newInsertSliceOp); + } + rewriter.create(loc, newYieldOperands); + rewriter.eraseOp(oldTerminatorOp); +} + +/// After fusing consumer into scf.forall we want to yield each of the resulting +/// values by the tiled consumer within scf.forall.in_parallel region. 
+static void +fixTerminatorSCFInParallel(RewriterBase &rewriter, scf::ForallOp newForallOp, + SmallVector tiledResults, + ArrayRef> &resultOffsets, + ArrayRef> &resultSizes, + ArrayRef bbArgs) { + scf::InParallelOp newTerminatorOp = newForallOp.getTerminator(); + rewriter.setInsertionPointToStart(newTerminatorOp.getBody()); + Location firstYieldOpLoc = + (*(newTerminatorOp.getYieldingOps().begin())).getLoc(); + for (auto [tiledResult, bbArg, resultOffset, resultSize] : + llvm::zip_equal(tiledResults, bbArgs, resultOffsets, resultSizes)) { + SmallVector strides(resultOffset.size(), + rewriter.getIndexAttr(1)); + rewriter.create( + firstYieldOpLoc, tiledResult, bbArg, resultOffset, resultSize, strides); + } +} + /// Implementation of fusing consumer of a single slice by computing the /// slice of the consumer in-place for scf loop. FailureOr @@ -1638,63 +1646,81 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, consumerOp, "consumer op's operand doesn't seem to be an OpResult"); } - // There are two possible cases regarding `oldLoopOp` here: - // 1. single `scf.forall` or `scf.for`. - // 2. inner-most `scf.for` insider nest `scf.loop` structure, where the - // top-level loop is the outer-most one of these nested loops. 
- LoopLikeOpInterface innerMostLoop = - candidateSliceOp->getParentOfType(); - SmallVector nestedLoops; + Operation *oldLoopOp = nullptr; + SmallVector newOuts; + Block *oldLoopBody = nullptr; + unsigned initSize = 0; + unsigned rank = 1; if (isInsertSliceOp) { - nestedLoops = llvm::map_to_vector( - getPerfectlyNestedLoopsOutsideOf( - cast(innerMostLoop.getOperation())), - [](scf::ForOp forOp) { - return cast(forOp.getOperation()); - }); + auto forOp = candidateSliceOp->getParentOfType(); + oldLoopOp = forOp; + llvm::append_range(newOuts, forOp.getInits()); + oldLoopBody = forOp.getBody(); + initSize = forOp.getInits().size(); } else { - nestedLoops = {innerMostLoop}; + auto forallOp = candidateSliceOp->getParentOfType(); + oldLoopOp = forallOp; + llvm::append_range(newOuts, forallOp.getOutputs()); + oldLoopBody = forallOp.getBody(); + initSize = forallOp.getOutputs().size(); + rank = forallOp.getRank(); } - LoopLikeOpInterface outerMostLoop = nestedLoops.front(); - - if (failed(checkAssumptionForLoop(outerMostLoop, consumerOp))) { + if (failed(checkAssumptionForLoop(oldLoopOp, consumerOp))) { return rewriter.notifyMatchFailure( - outerMostLoop, - "containing loop op should either yield just one value or " - "have the consumer op as its first user"); + oldLoopOp, "containing loop op should either yield just one value or " + "have the consumer op as its first user"); } OpBuilder::InsertionGuard g(rewriter); // 2. Check consumer is not using scf loop's output as init. 
- auto dstOp = dyn_cast(consumerOp); - if (!dstOp) - return rewriter.notifyMatchFailure(consumerOp, - "consumer op is not DPS operation"); + auto dstOp = cast(consumerOp); SmallVector dpsInits = llvm::map_to_vector(dstOp.getDpsInits(), [](Value v) { return v; }); - if (llvm::is_contained(dpsInits, outerMostLoop->getResult(resultNumber))) { + if (llvm::is_contained(dpsInits, oldLoopOp->getResult(resultNumber))) { return rewriter.notifyMatchFailure( consumerOp, "consumer op taking the result of scf.for as init is not supported"); } - SmallVector newInits = dpsInits; + newOuts.append(dpsInits); + + Location loc = oldLoopOp->getLoc(); - Location loc = outerMostLoop->getLoc(); + // 3. Create new scf loop op. + rewriter.setInsertionPoint(consumerOp); + Operation *newLoopOp = nullptr; + Block *newLoopBody = nullptr; + if (isInsertSliceOp) { + auto forOp = cast(oldLoopOp); + auto newForOp = rewriter.create(loc, forOp.getLowerBound(), + forOp.getUpperBound(), + forOp.getStep(), newOuts); + newLoopOp = newForOp; + newLoopBody = newForOp.getBody(); + } else { + auto forallOp = cast(oldLoopOp); + auto newForallOp = rewriter.create( + loc, forallOp.getMixedLowerBound(), forallOp.getMixedUpperBound(), + forallOp.getMixedStep(), newOuts, forallOp.getMapping()); + newLoopOp = newForallOp; + rewriter.eraseOp(newForallOp.getTerminator()); + newLoopBody = newForallOp.getBody(); + } - // 3. Move the whole loop structure right before consumer Op, the dominance - // should be already ensured by `checkAssumptionForLoop`. - rewriter.moveOpBefore(outerMostLoop, consumerOp); + // 4. Move the loop body to the new op. + unsigned oldNumArguments = oldLoopBody->getNumArguments(); + rewriter.mergeBlocks(oldLoopBody, newLoopBody, + newLoopBody->getArguments().take_front(oldNumArguments)); - // 4. Set insertion point before terminator op of the loop and create a new + // 5. Set insertion point before terminator op of the loop and create a new // tensor.insert_slice. 
In the scf.for case this is a clone of the // candidateSliceOp whereas in the scf.forall case this is created from the // operands of tensor.parallel_insert_slice. tensor::InsertSliceOp clonedInsertSliceOp; if (auto sliceOp = dyn_cast(candidateSliceOp)) { - auto newForallOp = cast(innerMostLoop.getOperation()); + auto newForallOp = cast(newLoopOp); rewriter.setInsertionPoint(newForallOp.getTerminator()); clonedInsertSliceOp = rewriter.create( loc, sliceOp.getSource(), sliceOp.getDest(), sliceOp.getMixedOffsets(), @@ -1705,17 +1731,20 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, cast(rewriter.clone(*candidateSliceOp)); } - // 5.a. Clone consumer op. - auto clonedConsumerOp = cast(rewriter.clone(*consumerOp)); + // 6.a. Clone consumer op. + auto newForOpBlockArgsForConsumerDest = + newLoopBody->getArguments().drop_front(oldNumArguments); + auto clonedConsumerOp = cast(cloneOpAndUpdateDestinationArgs( + rewriter, consumerOp, newForOpBlockArgsForConsumerDest)); - // 5.b. Replace all uses of the loop result with the result of the cloned + // 6.b. Replace all uses of the loop result with the result of the cloned // tensor.insert_slice. OpOperand &operandToReplace = clonedConsumerOp->getOpOperand(operandNumber); rewriter.modifyOpInPlace(clonedConsumerOp, [&]() { operandToReplace.set(clonedInsertSliceOp.getResult()); }); - // 6. Perform tiling of the cloned consumer and replace the operand at + // 7 - Perform tiling of the cloned consumer and replace the operand at // `operandNumber` with the source of the cloned tensor.insert_slice op. auto ossSliceOp = cast(clonedInsertSliceOp.getOperation()); @@ -1725,105 +1754,79 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, if (failed(tileAndFuseResult)) { return failure(); } - auto tiledConsumerOp = cast(tileAndFuseResult->tiledOps[0]); - rewriter.replaceAllUsesWith(tiledConsumerOp->getOperand(operandNumber), - clonedInsertSliceOp.getSource()); - - // 7. Reconstruct [nested] loop with new inits. 
- YieldTiledValuesFn newYieldValuesFn = - [&](RewriterBase &innerRewriter, Location loc, ValueRange /*ivs*/, - ValueRange newRegionIterArgs, SmallVector &tiledResult, - SmallVector> &tiledOffset, - SmallVector> &tiledSizes) -> LogicalResult { - OpBuilder::InsertionGuard g(innerRewriter); - // 8. Set inner insertPoint right before tiled consumer op. - innerRewriter.setInsertionPoint(tiledConsumerOp); - - SmallVector offsets = ossSliceOp.getMixedOffsets(); - SmallVector sizes = ossSliceOp.getMixedSizes(); - SmallVector strides = ossSliceOp.getMixedStrides(); + rewriter.replaceAllUsesWith( + tileAndFuseResult->tiledOps[0]->getOperand(operandNumber), + clonedInsertSliceOp.getSource()); + + // 8 - Extract offset/sizes/strides required to create the + // tensor.insert_slice/parallel_insert_slice for each result of the consumer. + SmallVector offsets = ossSliceOp.getMixedOffsets(); + SmallVector sizes = ossSliceOp.getMixedSizes(); + SmallVector strides = ossSliceOp.getMixedStrides(); + + // 9. Check all insert stride is 1. + if (llvm::any_of(strides, [](OpFoldResult stride) { + return !isConstantIntValue(stride, 1); + })) { + return rewriter.notifyMatchFailure( + candidateSliceOp, "containingOp's result yield with stride"); + } - // 9. Check all insert stride is 1. - if (llvm::any_of(strides, [](OpFoldResult stride) { - return !isConstantIntValue(stride, 1); - })) { - return rewriter.notifyMatchFailure( - candidateSliceOp, "containingOp's result yield with stride"); - } + // 10. Try to get iter domain position from input position. + SmallVector iterDomainOffsets, iterDomainSizes; + if (failed(clonedConsumerOp.getIterationDomainTileFromOperandTile( + rewriter, operandNumber, offsets, sizes, iterDomainOffsets, + iterDomainSizes))) { + return rewriter.notifyMatchFailure( + clonedConsumerOp, "can't get iter domain position from input position"); + } - // 10. Try to get iter domain position from input position. 
- SmallVector iterDomainOffsets, iterDomainSizes; - if (failed(tiledConsumerOp.getIterationDomainTileFromOperandTile( - rewriter, operandNumber, offsets, sizes, iterDomainOffsets, - iterDomainSizes))) { + // 11. Try to fetch the offset and size for all results of the cloned + // consumer. This would then be used to form the corresponding + // tensor.insert_slice/parallel_insert_slice later. + unsigned totalNumResultsOfConsumer = clonedConsumerOp->getNumResults(); + SmallVector> resultOffsets( + totalNumResultsOfConsumer); + SmallVector> resultSizes(totalNumResultsOfConsumer); + for (auto [idx, v] : llvm::enumerate(clonedConsumerOp->getResults())) { + if (failed(clonedConsumerOp.getResultTilePosition( + rewriter, idx, iterDomainOffsets, iterDomainSizes, + resultOffsets[idx], resultSizes[idx]))) { return rewriter.notifyMatchFailure( - tiledConsumerOp, - "can't get iter domain position from input position"); - } - - // 11. Try to fetch the offset and size for all results of the cloned - // consumer. This would then be used to form the corresponding - // tensor.insert_slice/parallel_insert_slice later. - unsigned totalNumResultsOfConsumer = tiledConsumerOp->getNumResults(); - SmallVector> resultOffsets( - totalNumResultsOfConsumer); - SmallVector> resultSizes( - totalNumResultsOfConsumer); - for (auto [idx, v] : llvm::enumerate(tiledConsumerOp->getResults())) { - if (failed(tiledConsumerOp.getResultTilePosition( - rewriter, idx, iterDomainOffsets, iterDomainSizes, - resultOffsets[idx], resultSizes[idx]))) { - return rewriter.notifyMatchFailure( - tiledConsumerOp, - "can't get result domain position from iter domain position"); - } - } - - // 12. Create `extract_slice` for `iter_args` for DPS operation if - // necessary. 
- if (auto tiledDestStyleOp = dyn_cast( - tiledConsumerOp.getOperation())) { - rewriter.setInsertionPoint(tiledDestStyleOp); - for (const auto &&[index, newRegionArg] : - llvm::enumerate(newRegionIterArgs)) { - auto destSlice = rewriter.create( - loc, newRegionArg, resultOffsets[index], resultSizes[index], - SmallVector(resultOffsets[index].size(), - rewriter.getIndexAttr(1))); - rewriter.modifyOpInPlace(tiledDestStyleOp, [&]() { - tiledDestStyleOp.getDpsInitsMutable()[index].set(destSlice); - }); - } + clonedConsumerOp, + "can't get result domain position from iter domain position"); } + } - // 13. Prepare tiled offset and sizes for later `insert_slice` creation by - // caller. - Block *block = rewriter.getInsertionPoint()->getBlock(); - rewriter.setInsertionPoint(block->getTerminator()); - for (const auto &&[index, result] : - llvm::enumerate(tiledConsumerOp->getResults())) { - tiledResult.push_back(result); - tiledOffset.emplace_back(resultOffsets[index]); - tiledSizes.emplace_back(resultSizes[index]); - } - return success(); - }; - // 14. Add new inits to [nested] loops. - if (failed(addInitOperandsToLoopNest(rewriter, nestedLoops, newInits, - newYieldValuesFn))) { - return rewriter.notifyMatchFailure(tiledConsumerOp, - "unable to add new inits to nest loop"); + auto arrayRefOffsets = ArrayRef>(resultOffsets); + auto arrayRefSizes = ArrayRef>(resultSizes); + if (isInsertSliceOp) { + auto newForOp = cast(newLoopOp); + fixTerminatorSCFYield( + rewriter, newForOp, *tileAndFuseResult, arrayRefOffsets, arrayRefSizes, + newForOp.getBody()->getArguments().drop_front(1 + initSize)); + } else { + auto newForallOp = cast(newLoopOp); + fixTerminatorSCFInParallel( + rewriter, newForallOp, tileAndFuseResult->tiledOps[0]->getResults(), + arrayRefOffsets, arrayRefSizes, + newForallOp.getBody()->getArguments().drop_front(rank + initSize)); } - // 15. Replace the result of scf loop and consumer op with new loop's results. + // 12. 
Replace the result of scf loop and consumer op with new loop's results. + for (auto &&[oldResult, newResult] : + llvm::zip_first(oldLoopOp->getResults(), newLoopOp->getResults())) { + rewriter.replaceAllUsesWith(oldResult, newResult); + } - for (auto &&[oldResult, newResult] : llvm::zip( - consumerOp->getResults(), - nestedLoops.front()->getResults().take_back(newInits.size()))) { + for (auto &&[oldResult, newResult] : + llvm::zip(consumerOp->getResults(), + newLoopOp->getResults().drop_front(initSize))) { rewriter.replaceAllUsesWith(oldResult, newResult); } - // 16. Need to erase the old scf loop and the cloned consumer op. + // 13. Need to erase the old scf loop and the cloned consumer op. + rewriter.eraseOp(oldLoopOp); rewriter.eraseOp(clonedConsumerOp); return scf::SCFFuseConsumerOfSliceResult{ diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index fdefdcc453ae7aa..83c5ec8d7342c85 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -109,9 +109,9 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: ins(%[[MAT_OUT]], %[[SLICE_OPERAND2]] : // CHECK-SAME: outs(%[[SLICE_OUT]] : // CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]] into %[[ELEM_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[MAT_OUT]] into %[[SECOND_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[SECOND_ARG_SLICE]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] -// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]] into %[[ELEM_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } // CHECK: return %[[FINAL_RESULT]]#2 : @@ -248,10 +248,10 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: ins(%[[MAT_OUT]], %[[SLICE_OPERAND2]] : // 
CHECK-SAME: outs(%[[SLICE_OUT_0]], %[[SLICE_OUT_1]] : // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[MAT_OUT]] into %[[SECOND_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] -// CHECK: tensor.parallel_insert_slice %[[SECOND_ARG_SLICE]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#0 into %[[ELEM_OUT_ARG_0]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#1 into %[[ELEM_OUT_ARG_1]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[MAT_OUT]] into %[[SECOND_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[SECOND_ARG_SLICE]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } // CHECK: %[[UNPACK:.*]] = tensor.unpack %[[FINAL_RESULT]]#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %{{.*}} : tensor<64x32xf32> -> tensor<2048xf32> @@ -310,8 +310,8 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK_OUT]] into %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [1024] [1] +// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } // CHECK: return %[[FINAL_RESULT]]#1 : @@ -369,71 +369,8 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[TILED_PACK_DEST]] // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: 
tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] - -// ----- - -module { - func.func @fuse_add_consumer_into_nested_scf_for(%arg0: tensor<256x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256x256xf32>) -> tensor<256x256xf32> { - %c0 = arith.constant 0 : index - %c64 = arith.constant 64 : index - %c256 = arith.constant 256 : index - %cst = arith.constant 0.000000e+00 : f32 - %dest0 = tensor.empty() : tensor<256x256xf32> - %dest1 = linalg.fill ins(%cst : f32) outs(%dest0 : tensor<256x256xf32>) -> tensor<256x256xf32> - %1 = scf.for %arg3 = %c0 to %c256 step %c64 iter_args(%arg4 = %dest1) -> (tensor<256x256xf32>) { - %2 = scf.for %arg5 = %c0 to %c256 step %c64 iter_args(%arg6 = %arg4) -> (tensor<256x256xf32>) { - %extracted_slice_1 = tensor.extract_slice %arg6[%arg3, %arg5] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> - %extracted_slice_2 = tensor.extract_slice %arg0[%arg3, 0] [64, 512] [1, 1] : tensor<256x512xf32> to tensor<64x512xf32> - %extracted_slice_3 = tensor.extract_slice %arg1[0, %arg5] [512, 64] [1, 1] : tensor<512x256xf32> to tensor<512x64xf32> - %3 = linalg.matmul ins(%extracted_slice_2, %extracted_slice_3 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> - %insert_slice = tensor.insert_slice %3 into %arg6[%arg3, %arg5] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> - scf.yield %insert_slice : tensor<256x256xf32> - } - scf.yield %2 : tensor<256x256xf32> - } - %4 = linalg.add ins(%1, %arg2 : tensor<256x256xf32>, tensor<256x256xf32>) outs(%dest0 : tensor<256x256xf32>) -> tensor<256x256xf32> - return %4 : tensor<256x256xf32> - } -} - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { - %slice_op = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 - : (!transform.any_op) -> 
!transform.any_op - %a, %b = transform.test.fuse_consumer %slice_op - : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield - } -} -// CHECK: func.func @fuse_add_consumer_into_nested_scf_for( -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<256x512xf32> -// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<512x256xf32> -// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<256x256xf32> -// CHECK: %[[dest0:.*]] = tensor.empty() : tensor<256x256xf32> -// CHECK: %[[dest1:.*]] = linalg.fill -// CHECK-SAME: outs(%[[dest0]] : -// CHECK: %[[LOOP_RESULT1:.*]]:2 = scf.for %[[IV1:.*]] = %[[C0]] -// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG1:.*]] = %[[dest1]], %[[SECOND_OUT_ARG1:.*]] = %[[dest0]]) -// CHECK-SAME: { -// CHECK: %[[LOOP_RESULT2:.*]]:2 = scf.for %[[IV2:.*]] = %[[C0]] -// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG2:.*]] = %[[FIRST_OUT_ARG1]], %[[SECOND_OUT_ARG2:.*]] = %[[SECOND_OUT_ARG1]]) -// CHECK-SAME: { -// CHECK: %[[MAT_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] -// CHECK: %[[INPUT_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[IV1]], 0] [64, 512] [1, 1] -// CHECK: %[[WEIGHT_SLICE:.*]] = tensor.extract_slice %[[ARG1]][0, %[[IV2]]] [512, 64] [1, 1] -// CHECK: %[[TILED_MAT_OUT:.*]] = linalg.matmul -// CHECK-SAME: outs(%[[MAT_OUT_SLICE]] : -// CHECK: %[[INSERT_MAT:.*]] = tensor.insert_slice %[[TILED_MAT_OUT]] into %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] -// CHECK: %[[ADD_OPERAND2_SLICE:.*]] = tensor.extract_slice %[[ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] -// CHECK: %[[ADD_OUT_SLICE:.*]] = tensor.extract_slice %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] -// CHECK: %[[TILED_ADD_OUT:.*]] = linalg.add -// CHECK-SAME: ins(%[[TILED_MAT_OUT]], %[[ADD_OPERAND2_SLICE]] : -// CHECK-SAME: outs(%[[ADD_OUT_SLICE]] : -// CHECK: %[[INSERT_ADD:.*]] = tensor.insert_slice %[[TILED_ADD_OUT]] into %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] -// CHECK: scf.yield %[[INSERT_MAT]], 
%[[INSERT_ADD]] : -// CHECK: } -// CHECK: scf.yield %[[LOOP_RESULT2]]#0, %[[LOOP_RESULT2]]#1 : +// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: } // CHECK: } -// CHECK: return %[[LOOP_RESULT1]]#1 : +// CHECK: return %[[FINAL_RESULT]]#1 : From 8168088f0a9015bc6d930e8bc1c639dee06ca82c Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Wed, 11 Sep 2024 19:34:54 -0700 Subject: [PATCH 85/94] [clang-format] Fix regressions in BAS_AlwaysBreak (#107506) Fixes #107401. Fixes #107574. --- clang/lib/Format/ContinuationIndenter.cpp | 14 +++++++++++--- clang/unittests/Format/FormatTestJS.cpp | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 5843571718b3a2c..f29f8796ea9290e 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -815,7 +815,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, return Tok.is(tok::l_paren) && Tok.ParameterCount > 0 && Tok.Previous && Tok.Previous->is(tok::identifier); }; - const auto IsInTemplateString = [this](const FormatToken &Tok) { + auto IsInTemplateString = [this](const FormatToken &Tok) { if (!Style.isJavaScript()) return false; for (const auto *Prev = &Tok; Prev; Prev = Prev->Previous) { @@ -827,7 +827,10 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, return false; }; // Identifies simple (no expression) one-argument function calls. - const auto IsSimpleFunction = [&](const FormatToken &Tok) { + auto StartsSimpleOneArgList = [&](const FormatToken &TokAfterLParen) { + assert(TokAfterLParen.isNot(tok::comment) || TokAfterLParen.Next); + const auto &Tok = + TokAfterLParen.is(tok::comment) ? 
*TokAfterLParen.Next : TokAfterLParen; if (!Tok.FakeLParens.empty() && Tok.FakeLParens.back() > prec::Unknown) return false; // Nested calls that involve `new` expressions also look like simple @@ -836,6 +839,11 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // - foo(::new Bar()) if (Tok.is(tok::kw_new) || Tok.startsSequence(tok::coloncolon, tok::kw_new)) return true; + if (Tok.is(TT_UnaryOperator) || + (Style.isJavaScript() && + Tok.isOneOf(tok::ellipsis, Keywords.kw_await))) { + return true; + } const auto *Previous = Tok.Previous; if (!Previous || (!Previous->isOneOf(TT_FunctionDeclarationLParen, TT_LambdaDefinitionLParen) && @@ -861,7 +869,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // or // caaaaaaaaaaaaaaaaaaaaal( // new SomethingElseeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee()); - !IsSimpleFunction(Current)) { + !StartsSimpleOneArgList(Current)) { CurrentState.NoLineBreak = true; } diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp index 4b29ba720f68230..c25228a69a748f2 100644 --- a/clang/unittests/Format/FormatTestJS.cpp +++ b/clang/unittests/Format/FormatTestJS.cpp @@ -2850,5 +2850,22 @@ TEST_F(FormatTestJS, DontBreakFieldsAsGoToLabels) { "};"); } +TEST_F(FormatTestJS, BreakAfterOpenBracket) { + auto Style = getGoogleStyle(FormatStyle::LK_JavaScript); + EXPECT_EQ(Style.AlignAfterOpenBracket, FormatStyle::BAS_AlwaysBreak); + verifyFormat("ctrl.onCopy(/** @type {!WizEvent}*/ (\n" + " {event, targetElement: {el: () => selectedElement}}));", + Style); + verifyFormat("failedUserIds.push(...subscriptioxxxxxxxxxxxxnSubset.map(\n" + " subscxxxxxxxxxxxxription => subscription.getUserId()));", + Style); + verifyFormat("failedUserIds.push(!subscriptioxxxxxxxxxxxxnSubset.map(\n" + " subscxxxxxxxxxxxxription => subscription.getUserId()));", + Style); + verifyFormat("failedUserIds.push(await subscriptioxxxxxxxxxxxxnSubset.map(\n" + " subscxxxxxxxxxxxxription 
=> subscription.getUserId()));", + Style); +} + } // namespace format } // end namespace clang From 5e80fc88f484b471ec61ac28894698a946c4fb89 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 12 Sep 2024 12:48:32 +1000 Subject: [PATCH 86/94] [opt] Fix opt for LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES=Off. Building with -DLLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES=Off should not prevent use of opt plugins. This fix uses the approach implemented in https://github.com/llvm/llvm-project/pull/101741. rdar://135841478 --- llvm/tools/opt/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt index 8d5c9fb62e5bece..6dd74ae1b7f8fe8 100644 --- a/llvm/tools/opt/CMakeLists.txt +++ b/llvm/tools/opt/CMakeLists.txt @@ -46,9 +46,8 @@ add_llvm_tool(opt intrinsics_gen SUPPORT_PLUGINS + EXPORT_SYMBOLS ) target_link_libraries(opt PRIVATE LLVMOptDriver) setup_host_tool(opt OPT opt_exe opt_target) - -export_executable_symbols_for_plugins(opt) From 2740273505ab27c0d8531d35948f0647309842cd Mon Sep 17 00:00:00 2001 From: Amy Wang Date: Wed, 11 Sep 2024 23:22:54 -0400 Subject: [PATCH 87/94] [MLIR][Presburger] Make printing aligned to assist in debugging (#107648) Hello Arjun! Please allow me to contribute this patch as it helps me debugging significantly! When the 1's and 0's don't line up when debugging farkas lemma of numerous polyhedrons using simplex lexmin solver, it is truly straining on the eyes. Hopefully this patch can help others! The unfortunate part is the lack of testcase as I'm not sure how to add testcase for debug dumps. :) However, you can add this testcase to the SimplexTest.cpp to witness the nice printing! ```c++ TEST(SimplexTest, DumpTest) { int COLUMNS = 2; int ROWS = 2; LexSimplex simplex(COLUMNS * 2); IntMatrix m1(ROWS, COLUMNS * 2 + 1); // Adding LHS columns. 
for (int i = 0; i < ROWS; i++) { // an arbitrary formula to test all kinds of integers for (int j = 0; j < COLUMNS; j++) m1(i, j) = i + (2 << (i % 3)) * (-1 * ((i + j) % 2)); } // Adding RHS columns. for (int i = 0; i < ROWS; i++) { for (int j = 0; j < COLUMNS; j++) m1(i, j + COLUMNS) = j - (3 << (j % 4)) * (-1 * ((i + j * 2) % 2)); } for (int i = 0; i < m1.getNumRows(); i++) { ArrayRef curRow = m1.getRow(i); simplex.addInequality(curRow); } IntegerRelation rel = parseRelationFromSet("(x, y, z)[] : (z - x - 17 * y == 0, x - 11 * z >= 1)",2); simplex.dump(); m1.dump(); rel.dump(); } ``` ``` rows = 2, columns = 7 var: c3, c4, c5, c6 con: r0 [>=0], r1 [>=0] r0: -1, r1: -2 c0: denom, c1: const, c2: 2147483647, c3: 0, c4: 1, c5: 2, c6: 3 1 0 1 0 -2 0 1 1 0 -8 -3 1 3 7 0 -2 0 1 0 -3 1 3 7 0 Domain: 2, Range: 1, Symbols: 0, Locals: 0 2 constraints -1 -17 1 0 = 0 1 0 -11 -1 >= 0 ``` --- mlir/include/mlir/Analysis/Presburger/Utils.h | 50 +++++++++++++++++++ .../Analysis/Presburger/IntegerRelation.cpp | 22 +++++--- mlir/lib/Analysis/Presburger/Matrix.cpp | 12 +++-- mlir/lib/Analysis/Presburger/Simplex.cpp | 11 +++- 4 files changed, 84 insertions(+), 11 deletions(-) diff --git a/mlir/include/mlir/Analysis/Presburger/Utils.h b/mlir/include/mlir/Analysis/Presburger/Utils.h index d3c0802c240bc14..69a5ce4e70178f3 100644 --- a/mlir/include/mlir/Analysis/Presburger/Utils.h +++ b/mlir/include/mlir/Analysis/Presburger/Utils.h @@ -17,7 +17,9 @@ #include "llvm/ADT/DynamicAPInt.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" +#include "llvm/Support/raw_ostream.h" #include +#include namespace mlir { namespace presburger { @@ -292,6 +294,54 @@ std::vector multiplyPolynomials(ArrayRef a, bool isRangeZero(ArrayRef arr); +/// Example usage: +/// Print .12, 3.4, 56.7 +/// preAlign = ".", minSpacing = 1, +/// .12 .12 +/// 3.4 3.4 +/// 56.7 56.7 +struct PrintTableMetrics { + // If unknown, set to 0 and pass the struct into updatePrintMetrics. 
+ unsigned maxPreIndent; + unsigned maxPostIndent; + std::string preAlign; +}; + +/// Iterate over each val in the table and update 'm' where +/// .maxPreIndent and .maxPostIndent are initialized to 0. +/// class T is any type that can be handled by llvm::raw_string_ostream. +template +void updatePrintMetrics(T val, PrintTableMetrics &m) { + std::string str; + llvm::raw_string_ostream(str) << val; + if (str.empty()) + return; + unsigned preIndent = str.find(m.preAlign); + preIndent = (preIndent != std::string::npos) ? preIndent + 1 : 0; + m.maxPreIndent = std::max(m.maxPreIndent, preIndent); + m.maxPostIndent = + std::max(m.maxPostIndent, (unsigned int)(str.length() - preIndent)); +} + +/// Print val in the table with metrics specified in 'm'. +template +void printWithPrintMetrics(raw_ostream &os, T val, unsigned minSpacing, + const PrintTableMetrics &m) { + std::string str; + llvm::raw_string_ostream(str) << val; + unsigned preIndent; + if (!str.empty()) { + preIndent = str.find(m.preAlign); + preIndent = (preIndent != std::string::npos) ? 
preIndent + 1 : 0; + } else { + preIndent = 0; + } + for (unsigned i = 0; i < (minSpacing + m.maxPreIndent - preIndent); ++i) + os << " "; + os << str; + for (unsigned i = 0; i < m.maxPostIndent - (str.length() - preIndent); ++i) + os << " "; +} } // namespace presburger } // namespace mlir diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index 94af81f955e5a5b..74cdf567c0e5699 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -32,7 +32,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -2589,19 +2592,26 @@ void IntegerRelation::mergeAndCompose(const IntegerRelation &other) { void IntegerRelation::print(raw_ostream &os) const { assert(hasConsistentState()); printSpace(os); + PrintTableMetrics ptm = {0, 0, "-"}; + for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) + for (unsigned j = 0, f = getNumCols(); j < f; ++j) + updatePrintMetrics(atEq(i, j), ptm); + for (unsigned i = 0, e = getNumInequalities(); i < e; ++i) + for (unsigned j = 0, f = getNumCols(); j < f; ++j) + updatePrintMetrics(atIneq(i, j), ptm); + // Print using PrintMetrics. 
+ unsigned MIN_SPACING = 1; for (unsigned i = 0, e = getNumEqualities(); i < e; ++i) { - os << " "; for (unsigned j = 0, f = getNumCols(); j < f; ++j) { - os << atEq(i, j) << "\t"; + printWithPrintMetrics(os, atEq(i, j), MIN_SPACING, ptm); } - os << "= 0\n"; + os << " = 0\n"; } for (unsigned i = 0, e = getNumInequalities(); i < e; ++i) { - os << " "; for (unsigned j = 0, f = getNumCols(); j < f; ++j) { - os << atIneq(i, j) << "\t"; + printWithPrintMetrics(os, atIneq(i, j), MIN_SPACING, ptm); } - os << ">= 0\n"; + os << " >= 0\n"; } os << '\n'; } diff --git a/mlir/lib/Analysis/Presburger/Matrix.cpp b/mlir/lib/Analysis/Presburger/Matrix.cpp index 110c5df1af37c0f..9fc6205eb5ed52c 100644 --- a/mlir/lib/Analysis/Presburger/Matrix.cpp +++ b/mlir/lib/Analysis/Presburger/Matrix.cpp @@ -398,10 +398,16 @@ Matrix Matrix::getSubMatrix(unsigned fromRow, unsigned toRow, template void Matrix::print(raw_ostream &os) const { - for (unsigned row = 0; row < nRows; ++row) { + PrintTableMetrics ptm = {0, 0, "-"}; + for (unsigned row = 0; row < nRows; ++row) for (unsigned column = 0; column < nColumns; ++column) - os << at(row, column) << ' '; - os << '\n'; + updatePrintMetrics(at(row, column), ptm); + unsigned MIN_SPACING = 1; + for (unsigned row = 0; row < nRows; ++row) { + for (unsigned column = 0; column < nColumns; ++column) { + printWithPrintMetrics(os, at(row, column), MIN_SPACING, ptm); + } + os << "\n"; } } diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp index c78a0723a6c0faf..4ffa2d546af4dd0 100644 --- a/mlir/lib/Analysis/Presburger/Simplex.cpp +++ b/mlir/lib/Analysis/Presburger/Simplex.cpp @@ -2153,9 +2153,16 @@ void SimplexBase::print(raw_ostream &os) const { for (unsigned col = 2, e = getNumColumns(); col < e; ++col) os << ", c" << col << ": " << colUnknown[col]; os << '\n'; - for (unsigned row = 0, numRows = getNumRows(); row < numRows; ++row) { + PrintTableMetrics ptm = {0, 0, "-"}; + for (unsigned row = 0, numRows = 
getNumRows(); row < numRows; ++row) for (unsigned col = 0, numCols = getNumColumns(); col < numCols; ++col) - os << tableau(row, col) << '\t'; + updatePrintMetrics(tableau(row, col), ptm); + unsigned MIN_SPACING = 1; + for (unsigned row = 0, numRows = getNumRows(); row < numRows; ++row) { + for (unsigned col = 0, numCols = getNumColumns(); col < numCols; ++col) { + printWithPrintMetrics(os, tableau(row, col), MIN_SPACING, + ptm); + } os << '\n'; } os << '\n'; From 94698369e9cc211b4d1e666b82dc5848c40ab5ce Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Wed, 11 Sep 2024 20:31:12 -0700 Subject: [PATCH 88/94] [clang-format][NFC] Minor clean of TokenAnnotatorTest --- clang/unittests/Format/TokenAnnotatorTest.cpp | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 36a6db9283893ea..5c28e3a4ea5a1ff 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -2050,7 +2050,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsFunctionDeclarationNames) { EXPECT_TOKEN(Tokens[4], tok::l_paren, TT_FunctionTypeLParen); Tokens = annotate("void instanceof();"); - ASSERT_EQ(Tokens.size(), 6u); + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::identifier, TT_FunctionDeclarationName); EXPECT_TOKEN(Tokens[2], tok::l_paren, TT_FunctionDeclarationLParen); @@ -3365,55 +3365,55 @@ TEST_F(TokenAnnotatorTest, SwitchExpression) { TEST_F(TokenAnnotatorTest, CppAltOperatorKeywords) { auto Tokens = annotate("a = b and c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_BinaryOperator); Tokens = annotate("a = b and_eq c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::ampequal, TT_BinaryOperator); Tokens = annotate("a = b bitand c;"); - ASSERT_EQ(Tokens.size(), 7u); + 
ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::amp, TT_BinaryOperator); Tokens = annotate("a = b bitor c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::pipe, TT_BinaryOperator); Tokens = annotate("a = b compl c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::tilde, TT_UnaryOperator); Tokens = annotate("a = b not c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::exclaim, TT_UnaryOperator); Tokens = annotate("a = b not_eq c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::exclaimequal, TT_BinaryOperator); Tokens = annotate("a = b or c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::pipepipe, TT_BinaryOperator); Tokens = annotate("a = b or_eq c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::pipeequal, TT_BinaryOperator); Tokens = annotate("a = b xor c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::caret, TT_BinaryOperator); Tokens = annotate("a = b xor_eq c;"); - ASSERT_EQ(Tokens.size(), 7u); + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::caretequal, TT_BinaryOperator); Tokens = annotate("xor = foo;"); - ASSERT_EQ(Tokens.size(), 5u); + ASSERT_EQ(Tokens.size(), 5u) << Tokens; EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); Tokens = annotate("int xor = foo;"); - ASSERT_EQ(Tokens.size(), 6u); + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::identifier, TT_StartOfName); } @@ -3423,7 +3423,7 @@ TEST_F(TokenAnnotatorTest, FunctionTryBlock) { " : foo{[] -> std::string { return {}; }(), x}, bar{y} {\n" "} catch (...) 
{\n" "}"); - ASSERT_EQ(Tokens.size(), 45u); + ASSERT_EQ(Tokens.size(), 45u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::identifier, TT_CtorDtorDeclName); EXPECT_TOKEN(Tokens[3], tok::l_paren, TT_FunctionDeclarationLParen); EXPECT_TOKEN(Tokens[11], tok::colon, TT_CtorInitializerColon); @@ -3439,7 +3439,7 @@ TEST_F(TokenAnnotatorTest, TypenameMacro) { Style.TypenameMacros.push_back("STRUCT"); auto Tokens = annotate("STRUCT(T, B) { int i; };", Style); - ASSERT_EQ(Tokens.size(), 13u); + ASSERT_EQ(Tokens.size(), 13u) << Tokens; EXPECT_TOKEN(Tokens[0], tok::identifier, TT_TypenameMacro); EXPECT_TOKEN(Tokens[1], tok::l_paren, TT_TypeDeclarationParen); EXPECT_TOKEN(Tokens[5], tok::r_paren, TT_TypeDeclarationParen); @@ -3451,7 +3451,7 @@ TEST_F(TokenAnnotatorTest, GNULanguageStandard) { EXPECT_EQ(Style.Standard, FormatStyle::LS_Latest); auto Tokens = annotate("return 1 <=> 2;", Style); - ASSERT_EQ(Tokens.size(), 6u); + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::spaceship, TT_BinaryOperator); } From ded080152acceca5d68014d63f5027a6d8266cbb Mon Sep 17 00:00:00 2001 From: Sirui Mu Date: Thu, 12 Sep 2024 11:41:32 +0800 Subject: [PATCH 89/94] [libc] Add osutils for Windows and make libc and its tests build on Windows target (#104676) This PR first adds osutils for Windows, and changes some libc code to make libc and its tests build on the Windows target. It then temporarily disables some libc tests that are currently problematic on Windows. Specifically, the changes besides the addition of osutils include: - Macro `LIBC_TYPES_HAS_FLOAT16` is disabled on Windows. `clang-cl` generates calls to functions in `compiler-rt` to handle float16 arithmetic and these functions are currently not linked in on Windows. - Macro `LIBC_TYPES_HAS_INT128` is disabled on Windows. - The invocation to `::aligned_malloc` is changed to an invocation to `::_aligned_malloc`. 
- The following unit tests are temporarily disabled because they currently fail on Windows: - `test.src.__support.big_int_test` - `test.src.__support.arg_list_test` - `test.src.fenv.getenv_and_setenv_test` - Tests involving `__m128i`, `__m256i`, and `__m512i` in `test.src.string.memory_utils.op_tests.cpp` - `test_range_errors` in `libc/test/src/math/smoke/AddTest.h` and `libc/test/src/math/smoke/SubTest.h` --- .../cmake/modules/LLVMLibCArchitectures.cmake | 7 ++++++ .../include/llvm-libc-macros/float16-macros.h | 3 ++- .../llvm-libc-macros/stdckdint-macros.h | 6 +++-- libc/src/__support/CPP/CMakeLists.txt | 1 + libc/src/__support/CPP/new.cpp | 16 +++++++++++- libc/src/__support/CPP/new.h | 9 +++++++ libc/src/__support/OSUtil/io.h | 2 ++ .../__support/OSUtil/windows/CMakeLists.txt | 10 ++++++++ libc/src/__support/OSUtil/windows/exit.cpp | 23 +++++++++++++++++ libc/src/__support/OSUtil/windows/io.cpp | 25 +++++++++++++++++++ libc/src/__support/OSUtil/windows/io.h | 21 ++++++++++++++++ libc/src/__support/macros/properties/types.h | 2 +- libc/test/src/__support/CMakeLists.txt | 7 +++++- libc/test/src/__support/FPUtil/CMakeLists.txt | 1 + .../test/src/__support/FPUtil/fpbits_test.cpp | 6 ++--- libc/test/src/__support/arg_list_test.cpp | 3 ++- libc/test/src/fenv/CMakeLists.txt | 1 + libc/test/src/fenv/getenv_and_setenv_test.cpp | 5 +++- libc/test/src/math/smoke/AddTest.h | 3 +++ libc/test/src/math/smoke/CMakeLists.txt | 18 +++++++++++++ libc/test/src/math/smoke/SubTest.h | 3 +++ .../src/string/memory_utils/CMakeLists.txt | 1 + .../test/src/string/memory_utils/op_tests.cpp | 3 ++- 23 files changed, 163 insertions(+), 13 deletions(-) create mode 100644 libc/src/__support/OSUtil/windows/CMakeLists.txt create mode 100644 libc/src/__support/OSUtil/windows/exit.cpp create mode 100644 libc/src/__support/OSUtil/windows/io.cpp create mode 100644 libc/src/__support/OSUtil/windows/io.h diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake 
b/libc/cmake/modules/LLVMLibCArchitectures.cmake index d922b4f21a8ac64..7711127c1a81e14 100644 --- a/libc/cmake/modules/LLVMLibCArchitectures.cmake +++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake @@ -206,6 +206,13 @@ if(explicit_target_triple AND endif() endif() + +# Windows does not support full mode build. +if (LIBC_TARGET_OS_IS_WINDOWS AND LLVM_LIBC_FULL_BUILD) + message(FATAL_ERROR "Windows does not support full mode build.") +endif () + + message(STATUS "Building libc for ${LIBC_TARGET_ARCHITECTURE} on ${LIBC_TARGET_OS} with LIBC_COMPILE_OPTIONS_DEFAULT: ${LIBC_COMPILE_OPTIONS_DEFAULT}") diff --git a/libc/include/llvm-libc-macros/float16-macros.h b/libc/include/llvm-libc-macros/float16-macros.h index 9a11ecc49307e24..229e3e62f2aedf5 100644 --- a/libc/include/llvm-libc-macros/float16-macros.h +++ b/libc/include/llvm-libc-macros/float16-macros.h @@ -13,7 +13,8 @@ #if defined(__FLT16_MANT_DIG__) && \ (!defined(__GNUC__) || __GNUC__ >= 13 || defined(__clang__)) && \ - !defined(__arm__) && !defined(_M_ARM) && !defined(__riscv) + !defined(__arm__) && !defined(_M_ARM) && !defined(__riscv) && \ + !defined(_WIN32) #define LIBC_TYPES_HAS_FLOAT16 // TODO: This would no longer be required if HdrGen let us guard function diff --git a/libc/include/llvm-libc-macros/stdckdint-macros.h b/libc/include/llvm-libc-macros/stdckdint-macros.h index 694412290bbca0a..17e4ccdc2d5f8ee 100644 --- a/libc/include/llvm-libc-macros/stdckdint-macros.h +++ b/libc/include/llvm-libc-macros/stdckdint-macros.h @@ -10,8 +10,10 @@ #define LLVM_LIBC_MACROS_STDCKDINT_MACROS_H // We need to use __builtin_*_overflow from GCC/Clang to implement the overflow -// macros. Check __GNUC__ for availability of such builtins. -#ifdef __GNUC__ +// macros. Check __GNUC__ or __clang__ for availability of such builtins. +// Note that clang-cl defines __clang__ only and does not define __GNUC__ so we +// have to check for both. 
+#if defined(__GNUC__) || defined(__clang__) // clang/gcc overlay may provides similar macros, we need to avoid redefining // them. #ifndef __STDC_VERSION_STDCKDINT_H__ diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt index f2e774f166f666f..c1981b827042caf 100644 --- a/libc/src/__support/CPP/CMakeLists.txt +++ b/libc/src/__support/CPP/CMakeLists.txt @@ -199,4 +199,5 @@ add_object_library( DEPENDS libc.include.stdlib libc.src.__support.common + libc.src.__support.macros.properties.os ) diff --git a/libc/src/__support/CPP/new.cpp b/libc/src/__support/CPP/new.cpp index 5a40d4a6d3b272a..88db8377b2fac4c 100644 --- a/libc/src/__support/CPP/new.cpp +++ b/libc/src/__support/CPP/new.cpp @@ -16,15 +16,29 @@ void operator delete(void *mem, std::align_val_t) noexcept { ::free(mem); } void operator delete(void *mem, size_t) noexcept { ::free(mem); } void operator delete(void *mem, size_t, std::align_val_t) noexcept { +#ifdef LIBC_TARGET_OS_IS_WINDOWS + ::_aligned_free(mem); +#else ::free(mem); +#endif } void operator delete[](void *mem) noexcept { ::free(mem); } -void operator delete[](void *mem, std::align_val_t) noexcept { ::free(mem); } +void operator delete[](void *mem, std::align_val_t) noexcept { +#ifdef LIBC_TARGET_OS_IS_WINDOWS + ::_aligned_free(mem); +#else + ::free(mem); +#endif +} void operator delete[](void *mem, size_t) noexcept { ::free(mem); } void operator delete[](void *mem, size_t, std::align_val_t) noexcept { +#ifdef LIBC_TARGET_OS_IS_WINDOWS + ::_aligned_free(mem); +#else ::free(mem); +#endif } diff --git a/libc/src/__support/CPP/new.h b/libc/src/__support/CPP/new.h index 94a8466a39677be..c1b6b95033f84c0 100644 --- a/libc/src/__support/CPP/new.h +++ b/libc/src/__support/CPP/new.h @@ -11,6 +11,7 @@ #include "src/__support/common.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/os.h" #include // For size_t #include // For malloc, free etc. 
@@ -47,7 +48,15 @@ class AllocChecker { LIBC_INLINE static void *aligned_alloc(size_t s, std::align_val_t align, AllocChecker &ac) { +#ifdef LIBC_TARGET_OS_IS_WINDOWS + // std::aligned_alloc is not available on Windows because std::free on + // Windows cannot deallocate any over-aligned memory. Microsoft provides an + // alternative for std::aligned_alloc named _aligned_malloc, but it must be + // paired with _aligned_free instead of std::free. + void *mem = ::_aligned_malloc(static_cast(align), s); +#else void *mem = ::aligned_alloc(static_cast(align), s); +#endif ac = (mem != nullptr); return mem; } diff --git a/libc/src/__support/OSUtil/io.h b/libc/src/__support/OSUtil/io.h index cb7e748fc644264..80119da77fc0275 100644 --- a/libc/src/__support/OSUtil/io.h +++ b/libc/src/__support/OSUtil/io.h @@ -19,6 +19,8 @@ #include "linux/io.h" #elif defined(__Fuchsia__) #include "fuchsia/io.h" +#elif defined(_WIN32) +#include "windows/io.h" #elif defined(__ELF__) // TODO: Ideally we would have LIBC_TARGET_OS_IS_BAREMETAL. #include "baremetal/io.h" diff --git a/libc/src/__support/OSUtil/windows/CMakeLists.txt b/libc/src/__support/OSUtil/windows/CMakeLists.txt new file mode 100644 index 000000000000000..be316d77f5d06a4 --- /dev/null +++ b/libc/src/__support/OSUtil/windows/CMakeLists.txt @@ -0,0 +1,10 @@ +add_object_library( + windows_util + SRCS + exit.cpp + io.cpp + HDRS + io.h + DEPENDS + libc.src.__support.macros.config +) diff --git a/libc/src/__support/OSUtil/windows/exit.cpp b/libc/src/__support/OSUtil/windows/exit.cpp new file mode 100644 index 000000000000000..369b07b848878eb --- /dev/null +++ b/libc/src/__support/OSUtil/windows/exit.cpp @@ -0,0 +1,23 @@ +//===-- Windows implementation of an exit function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/macros/config.h" + +// On Windows we cannot make direct syscalls since Microsoft changes system call +// IDs periodically. We must rely on functions exported from ntdll.dll or +// kernel32.dll to invoke system service procedures. +#define WIN32_LEAN_AND_MEAN +#include + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +[[noreturn]] void exit(int status) { ::ExitProcess(status); } + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/OSUtil/windows/io.cpp b/libc/src/__support/OSUtil/windows/io.cpp new file mode 100644 index 000000000000000..af3d1b9e43976df --- /dev/null +++ b/libc/src/__support/OSUtil/windows/io.cpp @@ -0,0 +1,25 @@ +//===------------- Windows implementation of IO utils -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "io.h" +#include "src/__support/macros/config.h" + +// On Windows we cannot make direct syscalls since Microsoft changes system call +// IDs periodically. We must rely on functions exported from ntdll.dll or +// kernel32.dll to invoke system service procedures. 
+#define WIN32_LEAN_AND_MEAN +#include + +namespace LIBC_NAMESPACE_DECL { + +void write_to_stderr(cpp::string_view msg) { + ::HANDLE stream = ::GetStdHandle(STD_ERROR_HANDLE); + ::WriteFile(stream, msg.data(), msg.size(), nullptr, nullptr); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/OSUtil/windows/io.h b/libc/src/__support/OSUtil/windows/io.h new file mode 100644 index 000000000000000..bafc00254a7cffa --- /dev/null +++ b/libc/src/__support/OSUtil/windows/io.h @@ -0,0 +1,21 @@ +//===------------- Windows implementation of IO utils -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_WINDOWS_IO_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_WINDOWS_IO_H + +#include "src/__support/CPP/string_view.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +void write_to_stderr(cpp::string_view msg); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_WINDOWS_IO_H diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h index 69ddc912238e74c..3ede8a6503d7714 100644 --- a/libc/src/__support/macros/properties/types.h +++ b/libc/src/__support/macros/properties/types.h @@ -35,7 +35,7 @@ #endif // UINT64_MAX // int128 / uint128 support -#if defined(__SIZEOF_INT128__) +#if defined(__SIZEOF_INT128__) && !defined(LIBC_TARGET_OS_IS_WINDOWS) #define LIBC_TYPES_HAS_INT128 #endif // defined(__SIZEOF_INT128__) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 90de520405981b6..7ad262d5f1f3007 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -140,9 +140,14 @@ 
add_libc_test( arg_list_test.cpp DEPENDS libc.src.__support.arg_list + libc.src.__support.macros.properties.os ) -if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) +# TODO: clang-cl generates calls into runtime library functions to +# handle 128-bit integer arithmetics and conversions which are not yet +# available on Windows. Re-enable 128-bit integer support on Windows once +# these functions are ready. +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX AND NOT LIBC_TARGET_OS_IS_WINDOWS) add_libc_test( big_int_test SUITE diff --git a/libc/test/src/__support/FPUtil/CMakeLists.txt b/libc/test/src/__support/FPUtil/CMakeLists.txt index 22fbd2664b546b8..1e64e9ba425a581 100644 --- a/libc/test/src/__support/FPUtil/CMakeLists.txt +++ b/libc/test/src/__support/FPUtil/CMakeLists.txt @@ -25,6 +25,7 @@ add_libc_test( libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fpbits_str libc.src.__support.integer_literals + libc.src.__support.macros.properties.types libc.src.__support.sign ) diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp index 99acc03010344f3..df50d8546f34f26 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -9,6 +9,7 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/fpbits_str.h" #include "src/__support/integer_literals.h" +#include "src/__support/macros/properties/types.h" #include "src/__support/sign.h" // Sign #include "test/UnitTest/Test.h" @@ -425,13 +426,10 @@ TEST(LlvmLibcFPBitsTest, DoubleType) { EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } -#ifdef LIBC_TARGET_ARCH_IS_X86 +#ifdef LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80 TEST(LlvmLibcFPBitsTest, X86LongDoubleType) { using LongDoubleBits = FPBits; - if constexpr (sizeof(long double) == sizeof(double)) - return; // The tests for the "double" type cover for this case. 
- EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::POS)).c_str(), "(+Infinity)"); EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::NEG)).c_str(), diff --git a/libc/test/src/__support/arg_list_test.cpp b/libc/test/src/__support/arg_list_test.cpp index 4f229e2bfe69408..79a715e91068701 100644 --- a/libc/test/src/__support/arg_list_test.cpp +++ b/libc/test/src/__support/arg_list_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/arg_list.h" +#include "src/__support/macros/properties/os.h" #include "test/UnitTest/Test.h" @@ -120,7 +121,7 @@ TEST(LlvmLibcArgListTest, TestStructTypes) { } // Test vector extensions from clang. -#if __has_attribute(ext_vector_type) +#if !defined(LIBC_TARGET_OS_IS_WINDOWS) && __has_attribute(ext_vector_type) using int1 = int __attribute__((ext_vector_type(1))); using int2 = int __attribute__((ext_vector_type(2))); diff --git a/libc/test/src/fenv/CMakeLists.txt b/libc/test/src/fenv/CMakeLists.txt index b776f9a0706e861..d79b4a49a5e4f34 100644 --- a/libc/test/src/fenv/CMakeLists.txt +++ b/libc/test/src/fenv/CMakeLists.txt @@ -41,6 +41,7 @@ add_libc_unittest( libc.src.fenv.fesetenv libc.src.fenv.fesetround libc.src.__support.FPUtil.fenv_impl + libc.src.__support.macros.properties.os LINK_LIBRARIES LibcFPTestHelpers ) diff --git a/libc/test/src/fenv/getenv_and_setenv_test.cpp b/libc/test/src/fenv/getenv_and_setenv_test.cpp index 8fc2787ecb5b1ee..fa4ef662222afa3 100644 --- a/libc/test/src/fenv/getenv_and_setenv_test.cpp +++ b/libc/test/src/fenv/getenv_and_setenv_test.cpp @@ -13,6 +13,7 @@ #include "src/fenv/fesetround.h" #include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/macros/properties/os.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/Test.h" @@ -20,6 +21,7 @@ using LlvmLibcFEnvTest = LIBC_NAMESPACE::testing::FEnvSafeTest; +#ifndef LIBC_TARGET_OS_IS_WINDOWS TEST_F(LlvmLibcFEnvTest, GetEnvAndSetEnv) { // We will 
disable all exceptions to prevent invocation of the exception // handler. @@ -71,8 +73,9 @@ TEST_F(LlvmLibcFEnvTest, Set_FE_DFL_ENV) { int rm = LIBC_NAMESPACE::fegetround(); EXPECT_EQ(rm, FE_TONEAREST); } +#endif -#ifdef _WIN32 +#ifdef LIBC_TARGET_OS_IS_WINDOWS TEST_F(LlvmLibcFEnvTest, Windows_Set_Get_Test) { // If a valid fenv_t is written, then reading it back out should be identical. fenv_t setEnv = {0x7e00053e, 0x0f00000f}; diff --git a/libc/test/src/math/smoke/AddTest.h b/libc/test/src/math/smoke/AddTest.h index 0b7e395a22d4cde..88c2067ca14748c 100644 --- a/libc/test/src/math/smoke/AddTest.h +++ b/libc/test/src/math/smoke/AddTest.h @@ -12,6 +12,7 @@ #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/macros/properties/os.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -53,6 +54,7 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } void test_range_errors(AddFunc func) { +#ifndef LIBC_TARGET_OS_IS_WINDOWS using namespace LIBC_NAMESPACE::fputil::testing; if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { @@ -121,6 +123,7 @@ class AddTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } +#endif } void test_inexact_results(AddFunc func) { diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index e943d98256a97b2..47e16926f10df1f 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -153,6 +153,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.fadd + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -168,6 +169,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.faddl + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -183,6 +185,7 @@ add_fp_unittest( 
libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.faddf128 + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -424,6 +427,7 @@ add_fp_unittest( libc.src.errno.errno libc.hdr.fenv_macros libc.src.math.dsubl + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -438,6 +442,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.dsubf128 + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4173,6 +4178,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.f16add + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4187,6 +4193,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.f16addf + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4201,6 +4208,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.f16addl + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4215,6 +4223,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.f16addf128 + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4229,6 +4238,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.f16sub + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4243,6 +4253,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.f16subf + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4257,6 +4268,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.f16subl + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4271,6 +4283,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.f16subf128 + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4552,6 +4565,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.fsub + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4566,6 +4580,7 @@ add_fp_unittest( libc.hdr.errno_macros 
libc.hdr.fenv_macros libc.src.math.fsubl + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4580,6 +4595,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.fsubf128 + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4753,6 +4769,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.daddl + libc.src.__support.macros.properties.os ) add_fp_unittest( @@ -4767,6 +4784,7 @@ add_fp_unittest( libc.hdr.errno_macros libc.hdr.fenv_macros libc.src.math.daddf128 + libc.src.__support.macros.properties.os ) add_fp_unittest( diff --git a/libc/test/src/math/smoke/SubTest.h b/libc/test/src/math/smoke/SubTest.h index 9ee4220b3820855..99c4b6c760af720 100644 --- a/libc/test/src/math/smoke/SubTest.h +++ b/libc/test/src/math/smoke/SubTest.h @@ -11,6 +11,7 @@ #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" +#include "src/__support/macros/properties/os.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -52,6 +53,7 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { } void test_range_errors(SubFunc func) { +#ifndef LIBC_TARGET_OS_IS_WINDOWS using namespace LIBC_NAMESPACE::fputil::testing; if (ForceRoundingMode r(RoundingMode::Nearest); r.success) { @@ -123,6 +125,7 @@ class SubTest : public LIBC_NAMESPACE::testing::FEnvSafeTest { FE_UNDERFLOW | FE_INEXACT); EXPECT_MATH_ERRNO(ERANGE); } +#endif } void test_inexact_results(SubFunc func) { diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt index a0dddd2f97b585c..8374be4a1d01a33 100644 --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -12,6 +12,7 @@ add_libc_test( libc.src.__support.CPP.array libc.src.__support.CPP.cstddef libc.src.__support.CPP.span + libc.src.__support.macros.properties.os libc.src.__support.macros.properties.types 
libc.src.__support.macros.sanitizer libc.src.string.memory_utils.memory_utils diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp index 978561f31a29619..c6197d1afa266b2 100644 --- a/libc/test/src/string/memory_utils/op_tests.cpp +++ b/libc/test/src/string/memory_utils/op_tests.cpp @@ -8,6 +8,7 @@ #include "memory_check_utils.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/os.h" #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT64 #include "src/string/memory_utils/op_aarch64.h" #include "src/string/memory_utils/op_builtin.h" @@ -294,7 +295,7 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) { #endif // LIBC_TARGET_ARCH_IS_X86_64 using MemcmpImplementations = testing::TypeList< -#ifdef LIBC_TARGET_ARCH_IS_X86_64 +#if defined(LIBC_TARGET_ARCH_IS_X86_64) && !defined(LIBC_TARGET_OS_IS_WINDOWS) #ifdef __SSE2__ generic::Memcmp<__m128i>, // #endif From b7167c784486581dad3f3188232951b79c6d0fd9 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Thu, 12 Sep 2024 11:57:29 +0800 Subject: [PATCH 90/94] [mlir] Fix incorrect comparison due to -Wtautological-constant-out-of-range-compare (NFC) /llvm-project/mlir/include/mlir/Analysis/Presburger/Utils.h:320:26: error: result of comparison of constant 18446744073709551615 with expression of type 'unsigned int' is always true [-Werror,-Wtautological-constant-out-of-range-compare] preIndent = (preIndent != std::string::npos) ? preIndent + 1 : 0; ~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~ /llvm-project/mlir/include/mlir/Analysis/Presburger/Utils.h:335:28: error: result of comparison of constant 18446744073709551615 with expression of type 'unsigned int' is always true [-Werror,-Wtautological-constant-out-of-range-compare] preIndent = (preIndent != std::string::npos) ? preIndent + 1 : 0; ~~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~ 2 errors generated. 
--- mlir/include/mlir/Analysis/Presburger/Utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Analysis/Presburger/Utils.h b/mlir/include/mlir/Analysis/Presburger/Utils.h index 69a5ce4e70178f3..0e6d18279d67ed5 100644 --- a/mlir/include/mlir/Analysis/Presburger/Utils.h +++ b/mlir/include/mlir/Analysis/Presburger/Utils.h @@ -317,7 +317,7 @@ void updatePrintMetrics(T val, PrintTableMetrics &m) { if (str.empty()) return; unsigned preIndent = str.find(m.preAlign); - preIndent = (preIndent != std::string::npos) ? preIndent + 1 : 0; + preIndent = (preIndent != (unsigned)std::string::npos) ? preIndent + 1 : 0; m.maxPreIndent = std::max(m.maxPreIndent, preIndent); m.maxPostIndent = std::max(m.maxPostIndent, (unsigned int)(str.length() - preIndent)); @@ -332,7 +332,7 @@ void printWithPrintMetrics(raw_ostream &os, T val, unsigned minSpacing, unsigned preIndent; if (!str.empty()) { preIndent = str.find(m.preAlign); - preIndent = (preIndent != std::string::npos) ? preIndent + 1 : 0; + preIndent = (preIndent != (unsigned)std::string::npos) ? preIndent + 1 : 0; } else { preIndent = 0; } From a9ba1b6dd5133aa4432759c203e807d8039b4cbd Mon Sep 17 00:00:00 2001 From: Yun-Fly Date: Thu, 12 Sep 2024 12:01:23 +0800 Subject: [PATCH 91/94] [mlir][scf] Extend consumer fuse to single nested `scf.for` (#108318) Refactor current consumer fusion based on `addInitOperandsToLoopNest` to support single nested `scf.for`, E.g. ``` %0 = scf.for() { %1 = scf.for() { tiledProducer } yield %1 } %2 = consumer ins(%0) ``` Compared with #94190, this PR fix build failure by making C++17 happy. 
--- .../SCF/Transforms/TileUsingInterface.cpp | 348 +++++++++--------- .../tile-and-fuse-consumer.mlir | 77 +++- 2 files changed, 244 insertions(+), 181 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp index e404c01010a3259..3650caa681620ad 100644 --- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp @@ -1481,6 +1481,50 @@ static FailureOr getConsumerFromUses(Value val, return &operand; } +/// Find the perfectly nested loops outside of given loop(included) sorted from +/// outer to inner. +/// +/// E.g. +/// +/// ``` +/// %0 = scf.for() +/// %1 = scf.for() +/// %2 = scf.for() +/// %3 = ... +/// yield %3 +/// yield %2 +/// yield %1 +/// ``` +/// +/// This function will return three perfectly nested loops: %0 + %1 + %2, when +/// target inner loop is %2. +static SmallVector +getPerfectlyNestedLoopsOutsideOf(scf::ForOp loop) { + SmallVector nestLoops = {loop}; + auto outerLoop = dyn_cast(loop->getParentOp()); + + // Check if it is the ForOp that yield the result of inner loop. + auto isForOpYieldResultOfInnerLoop = + [](scf::ForOp outerLoop) -> LogicalResult { + Block *body = outerLoop.getBody(); + if (!llvm::hasSingleElement(body->without_terminator())) + return failure(); + auto yieldOp = cast(body->getTerminator()); + auto innerForOp = dyn_cast(body->front()); + if (!innerForOp) + return failure(); + // All of innerForOp results should be yielded. + return success(innerForOp->getNumResults() == yieldOp->getNumOperands()); + }; + + while (outerLoop && succeeded(isForOpYieldResultOfInnerLoop(outerLoop))) { + nestLoops.push_back(outerLoop); + outerLoop = dyn_cast(outerLoop->getParentOp()); + } + // sorted from outer to inner + return {nestLoops.rbegin(), nestLoops.rend()}; +} + /// Fetch the untiled consumer of a scf.for's result which is yielded by a /// tensor.insert_slice. 
This function makes the following assumptions : /// 1. tensor.insert_slice has scf.yield as its only user. @@ -1498,9 +1542,10 @@ getUntiledConsumerFromSlice(tensor::InsertSliceOp candidateSliceOp) { auto forOp = dyn_cast(containingOp); if (!forOp) return failure(); - Value resultingValue = forOp->getResult(resultNumber); + scf::ForOp topLevelForOp = getPerfectlyNestedLoopsOutsideOf(forOp).front(); + Value resultingValue = topLevelForOp->getResult(resultNumber); - return getConsumerFromUses(resultingValue, containingOp->getBlock()); + return getConsumerFromUses(resultingValue, topLevelForOp->getBlock()); } /// Fetch the first untiled consumer of a scf.forall's result which is yielded @@ -1563,59 +1608,6 @@ static FailureOr getUntiledConsumerFromSlice(Operation *sliceOp) { } } -/// After fusing consumer into scf.for we want to modify the scf.yield operation -/// to reflect the same by returning the values yielded by the tiled consumer. -static void -fixTerminatorSCFYield(RewriterBase &rewriter, scf::ForOp newForOp, - TilingResult &tilingResult, - ArrayRef> &resultOffsets, - ArrayRef> &resultSizes, - ArrayRef bbArgs) { - scf::YieldOp oldTerminatorOp = - cast(newForOp.getBody()->getTerminator()); - unsigned totalOldResults = oldTerminatorOp->getNumResults(); - unsigned totalTiledResults = tilingResult.tiledOps[0]->getNumResults(); - SmallVector newYieldOperands; - newYieldOperands.reserve(totalOldResults + totalTiledResults); - for (auto oldResult : oldTerminatorOp.getResults()) { - newYieldOperands.push_back(oldResult); - } - rewriter.setInsertionPointAfter(oldTerminatorOp); - Location loc = newForOp.getLoc(); - for (auto [tiledResult, bbArg, resultOffset, resultSize] : - llvm::zip_equal(tilingResult.tiledOps[0]->getResults(), bbArgs, - resultOffsets, resultSizes)) { - SmallVector strides(resultOffset.size(), - rewriter.getIndexAttr(1)); - Value newInsertSliceOp = rewriter.create( - loc, tiledResult, bbArg, resultOffset, resultSize, strides); - 
newYieldOperands.push_back(newInsertSliceOp); - } - rewriter.create(loc, newYieldOperands); - rewriter.eraseOp(oldTerminatorOp); -} - -/// After fusing consumer into scf.forall we want to yield each of the resulting -/// values by the tiled consumer within scf.forall.in_parallel region. -static void -fixTerminatorSCFInParallel(RewriterBase &rewriter, scf::ForallOp newForallOp, - SmallVector tiledResults, - ArrayRef> &resultOffsets, - ArrayRef> &resultSizes, - ArrayRef bbArgs) { - scf::InParallelOp newTerminatorOp = newForallOp.getTerminator(); - rewriter.setInsertionPointToStart(newTerminatorOp.getBody()); - Location firstYieldOpLoc = - (*(newTerminatorOp.getYieldingOps().begin())).getLoc(); - for (auto [tiledResult, bbArg, resultOffset, resultSize] : - llvm::zip_equal(tiledResults, bbArgs, resultOffsets, resultSizes)) { - SmallVector strides(resultOffset.size(), - rewriter.getIndexAttr(1)); - rewriter.create( - firstYieldOpLoc, tiledResult, bbArg, resultOffset, resultSize, strides); - } -} - /// Implementation of fusing consumer of a single slice by computing the /// slice of the consumer in-place for scf loop. FailureOr @@ -1646,81 +1638,63 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, consumerOp, "consumer op's operand doesn't seem to be an OpResult"); } - Operation *oldLoopOp = nullptr; - SmallVector newOuts; - Block *oldLoopBody = nullptr; - unsigned initSize = 0; - unsigned rank = 1; + // There are two possible cases regarding `oldLoopOp` here: + // 1. single `scf.forall` or `scf.for`. + // 2. inner-most `scf.for` insider nest `scf.loop` structure, where the + // top-level loop is the outer-most one of these nested loops. 
+ LoopLikeOpInterface innerMostLoop = + candidateSliceOp->getParentOfType(); + SmallVector nestedLoops; if (isInsertSliceOp) { - auto forOp = candidateSliceOp->getParentOfType(); - oldLoopOp = forOp; - llvm::append_range(newOuts, forOp.getInits()); - oldLoopBody = forOp.getBody(); - initSize = forOp.getInits().size(); + nestedLoops = llvm::map_to_vector( + getPerfectlyNestedLoopsOutsideOf( + cast(innerMostLoop.getOperation())), + [](scf::ForOp forOp) { + return cast(forOp.getOperation()); + }); } else { - auto forallOp = candidateSliceOp->getParentOfType(); - oldLoopOp = forallOp; - llvm::append_range(newOuts, forallOp.getOutputs()); - oldLoopBody = forallOp.getBody(); - initSize = forallOp.getOutputs().size(); - rank = forallOp.getRank(); + nestedLoops = {innerMostLoop}; } - if (failed(checkAssumptionForLoop(oldLoopOp, consumerOp))) { + LoopLikeOpInterface outerMostLoop = nestedLoops.front(); + + if (failed(checkAssumptionForLoop(outerMostLoop, consumerOp))) { return rewriter.notifyMatchFailure( - oldLoopOp, "containing loop op should either yield just one value or " - "have the consumer op as its first user"); + outerMostLoop, + "containing loop op should either yield just one value or " + "have the consumer op as its first user"); } OpBuilder::InsertionGuard g(rewriter); // 2. Check consumer is not using scf loop's output as init. 
- auto dstOp = cast(consumerOp); + auto dstOp = dyn_cast(consumerOp); + if (!dstOp) + return rewriter.notifyMatchFailure(consumerOp, + "consumer op is not DPS operation"); SmallVector dpsInits = llvm::map_to_vector(dstOp.getDpsInits(), [](Value v) { return v; }); - if (llvm::is_contained(dpsInits, oldLoopOp->getResult(resultNumber))) { + if (llvm::is_contained(dpsInits, outerMostLoop->getResult(resultNumber))) { return rewriter.notifyMatchFailure( consumerOp, "consumer op taking the result of scf.for as init is not supported"); } - newOuts.append(dpsInits); - - Location loc = oldLoopOp->getLoc(); + SmallVector newInits = dpsInits; - // 3. Create new scf loop op. - rewriter.setInsertionPoint(consumerOp); - Operation *newLoopOp = nullptr; - Block *newLoopBody = nullptr; - if (isInsertSliceOp) { - auto forOp = cast(oldLoopOp); - auto newForOp = rewriter.create(loc, forOp.getLowerBound(), - forOp.getUpperBound(), - forOp.getStep(), newOuts); - newLoopOp = newForOp; - newLoopBody = newForOp.getBody(); - } else { - auto forallOp = cast(oldLoopOp); - auto newForallOp = rewriter.create( - loc, forallOp.getMixedLowerBound(), forallOp.getMixedUpperBound(), - forallOp.getMixedStep(), newOuts, forallOp.getMapping()); - newLoopOp = newForallOp; - rewriter.eraseOp(newForallOp.getTerminator()); - newLoopBody = newForallOp.getBody(); - } + Location loc = outerMostLoop->getLoc(); - // 4. Move the loop body to the new op. - unsigned oldNumArguments = oldLoopBody->getNumArguments(); - rewriter.mergeBlocks(oldLoopBody, newLoopBody, - newLoopBody->getArguments().take_front(oldNumArguments)); + // 3. Move the whole loop structure right before consumer Op, the dominance + // should be already ensured by `checkAssumptionForLoop`. + rewriter.moveOpBefore(outerMostLoop, consumerOp); - // 5. Set insertion point before terminator op of the loop and create a new + // 4. Set insertion point before terminator op of the loop and create a new // tensor.insert_slice. 
In the scf.for case this is a clone of the // candidateSliceOp whereas in the scf.forall case this is created from the // operands of tensor.parallel_insert_slice. tensor::InsertSliceOp clonedInsertSliceOp; if (auto sliceOp = dyn_cast(candidateSliceOp)) { - auto newForallOp = cast(newLoopOp); + auto newForallOp = cast(innerMostLoop.getOperation()); rewriter.setInsertionPoint(newForallOp.getTerminator()); clonedInsertSliceOp = rewriter.create( loc, sliceOp.getSource(), sliceOp.getDest(), sliceOp.getMixedOffsets(), @@ -1731,20 +1705,17 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, cast(rewriter.clone(*candidateSliceOp)); } - // 6.a. Clone consumer op. - auto newForOpBlockArgsForConsumerDest = - newLoopBody->getArguments().drop_front(oldNumArguments); - auto clonedConsumerOp = cast(cloneOpAndUpdateDestinationArgs( - rewriter, consumerOp, newForOpBlockArgsForConsumerDest)); + // 5.a. Clone consumer op. + auto clonedConsumerOp = cast(rewriter.clone(*consumerOp)); - // 6.b. Replace all uses of the loop result with the result of the cloned + // 5.b. Replace all uses of the loop result with the result of the cloned // tensor.insert_slice. OpOperand &operandToReplace = clonedConsumerOp->getOpOperand(operandNumber); rewriter.modifyOpInPlace(clonedConsumerOp, [&]() { operandToReplace.set(clonedInsertSliceOp.getResult()); }); - // 7 - Perform tiling of the cloned consumer and replace the operand at + // 6. Perform tiling of the cloned consumer and replace the operand at // `operandNumber` with the source of the cloned tensor.insert_slice op. 
auto ossSliceOp = cast(clonedInsertSliceOp.getOperation()); @@ -1754,79 +1725,108 @@ mlir::scf::tileAndFuseConsumerOfSlice(RewriterBase &rewriter, if (failed(tileAndFuseResult)) { return failure(); } - rewriter.replaceAllUsesWith( - tileAndFuseResult->tiledOps[0]->getOperand(operandNumber), - clonedInsertSliceOp.getSource()); - - // 8 - Extract offset/sizes/strides required to create the - // tensor.insert_slice/parallel_insert_slice for each result of the consumer. - SmallVector offsets = ossSliceOp.getMixedOffsets(); - SmallVector sizes = ossSliceOp.getMixedSizes(); - SmallVector strides = ossSliceOp.getMixedStrides(); - - // 9. Check all insert stride is 1. - if (llvm::any_of(strides, [](OpFoldResult stride) { - return !isConstantIntValue(stride, 1); - })) { - return rewriter.notifyMatchFailure( - candidateSliceOp, "containingOp's result yield with stride"); - } + auto tiledConsumerOp = cast(tileAndFuseResult->tiledOps[0]); + rewriter.replaceAllUsesWith(tiledConsumerOp->getOperand(operandNumber), + clonedInsertSliceOp.getSource()); - // 10. Try to get iter domain position from input position. - SmallVector iterDomainOffsets, iterDomainSizes; - if (failed(clonedConsumerOp.getIterationDomainTileFromOperandTile( - rewriter, operandNumber, offsets, sizes, iterDomainOffsets, - iterDomainSizes))) { - return rewriter.notifyMatchFailure( - clonedConsumerOp, "can't get iter domain position from input position"); - } + // 7. Reconstruct [nested] loop with new inits. + YieldTiledValuesFn newYieldValuesFn = + [&](RewriterBase &innerRewriter, Location loc, ValueRange /*ivs*/, + ValueRange newRegionIterArgs, SmallVector &tiledResult, + SmallVector> &tiledOffset, + SmallVector> &tiledSizes) -> LogicalResult { + OpBuilder::InsertionGuard g(innerRewriter); + // 8. Set inner insertPoint right before tiled consumer op. + innerRewriter.setInsertionPoint(tiledConsumerOp); - // 11. Try to fetch the offset and size for all results of the cloned - // consumer. 
This would then be used to form the corresponding - // tensor.insert_slice/parallel_insert_slice later. - unsigned totalNumResultsOfConsumer = clonedConsumerOp->getNumResults(); - SmallVector> resultOffsets( - totalNumResultsOfConsumer); - SmallVector> resultSizes(totalNumResultsOfConsumer); - for (auto [idx, v] : llvm::enumerate(clonedConsumerOp->getResults())) { - if (failed(clonedConsumerOp.getResultTilePosition( - rewriter, idx, iterDomainOffsets, iterDomainSizes, - resultOffsets[idx], resultSizes[idx]))) { + SmallVector offsets = ossSliceOp.getMixedOffsets(); + SmallVector sizes = ossSliceOp.getMixedSizes(); + SmallVector strides = ossSliceOp.getMixedStrides(); + + // 9. Check all insert stride is 1. + if (llvm::any_of(strides, [](OpFoldResult stride) { + return !isConstantIntValue(stride, 1); + })) { return rewriter.notifyMatchFailure( - clonedConsumerOp, - "can't get result domain position from iter domain position"); + candidateSliceOp, "containingOp's result yield with stride"); } - } - auto arrayRefOffsets = ArrayRef>(resultOffsets); - auto arrayRefSizes = ArrayRef>(resultSizes); - if (isInsertSliceOp) { - auto newForOp = cast(newLoopOp); - fixTerminatorSCFYield( - rewriter, newForOp, *tileAndFuseResult, arrayRefOffsets, arrayRefSizes, - newForOp.getBody()->getArguments().drop_front(1 + initSize)); - } else { - auto newForallOp = cast(newLoopOp); - fixTerminatorSCFInParallel( - rewriter, newForallOp, tileAndFuseResult->tiledOps[0]->getResults(), - arrayRefOffsets, arrayRefSizes, - newForallOp.getBody()->getArguments().drop_front(rank + initSize)); - } + // 10. Try to get iter domain position from input position. + SmallVector iterDomainOffsets, iterDomainSizes; + if (failed(tiledConsumerOp.getIterationDomainTileFromOperandTile( + rewriter, operandNumber, offsets, sizes, iterDomainOffsets, + iterDomainSizes))) { + return rewriter.notifyMatchFailure( + tiledConsumerOp, + "can't get iter domain position from input position"); + } - // 12. 
Replace the result of scf loop and consumer op with new loop's results. - for (auto &&[oldResult, newResult] : - llvm::zip_first(oldLoopOp->getResults(), newLoopOp->getResults())) { - rewriter.replaceAllUsesWith(oldResult, newResult); + // 11. Try to fetch the offset and size for all results of the cloned + // consumer. This would then be used to form the corresponding + // tensor.insert_slice/parallel_insert_slice later. + unsigned totalNumResultsOfConsumer = tiledConsumerOp->getNumResults(); + SmallVector> resultOffsets( + totalNumResultsOfConsumer); + SmallVector> resultSizes( + totalNumResultsOfConsumer); + for (auto [idx, v] : llvm::enumerate(tiledConsumerOp->getResults())) { + if (failed(tiledConsumerOp.getResultTilePosition( + rewriter, idx, iterDomainOffsets, iterDomainSizes, + resultOffsets[idx], resultSizes[idx]))) { + return rewriter.notifyMatchFailure( + tiledConsumerOp, + "can't get result domain position from iter domain position"); + } + } + + // 12. Create `extract_slice` for `iter_args` for DPS operation if + // necessary. + if (auto tiledDestStyleOp = dyn_cast( + tiledConsumerOp.getOperation())) { + rewriter.setInsertionPoint(tiledDestStyleOp); + for (const auto &&[index, newRegionArg] : + llvm::enumerate(newRegionIterArgs)) { + auto destSlice = rewriter.create( + loc, newRegionArg, resultOffsets[index], resultSizes[index], + SmallVector(resultOffsets[index].size(), + rewriter.getIndexAttr(1))); + // Make a copy of index to avoid a capturing structured binding, which + // is a C++20 extension. + auto dstNumber = index; + rewriter.modifyOpInPlace(tiledDestStyleOp, [&]() { + tiledDestStyleOp.getDpsInitsMutable()[dstNumber].set(destSlice); + }); + } + } + + // 13. Prepare tiled offset and sizes for later `insert_slice` creation by + // caller. 
+ Block *block = rewriter.getInsertionPoint()->getBlock(); + rewriter.setInsertionPoint(block->getTerminator()); + for (const auto &&[index, result] : + llvm::enumerate(tiledConsumerOp->getResults())) { + tiledResult.push_back(result); + tiledOffset.emplace_back(resultOffsets[index]); + tiledSizes.emplace_back(resultSizes[index]); + } + return success(); + }; + // 14. Add new inits to [nested] loops. + if (failed(addInitOperandsToLoopNest(rewriter, nestedLoops, newInits, + newYieldValuesFn))) { + return rewriter.notifyMatchFailure(tiledConsumerOp, + "unable to add new inits to nest loop"); } - for (auto &&[oldResult, newResult] : - llvm::zip(consumerOp->getResults(), - newLoopOp->getResults().drop_front(initSize))) { + // 15. Replace the result of scf loop and consumer op with new loop's results. + + for (auto &&[oldResult, newResult] : llvm::zip( + consumerOp->getResults(), + nestedLoops.front()->getResults().take_back(newInits.size()))) { rewriter.replaceAllUsesWith(oldResult, newResult); } - // 13. Need to erase the old scf loop and the cloned consumer op. - rewriter.eraseOp(oldLoopOp); + // 16. Need to erase the old scf loop and the cloned consumer op. 
rewriter.eraseOp(clonedConsumerOp); return scf::SCFFuseConsumerOfSliceResult{ diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 83c5ec8d7342c85..fdefdcc453ae7aa 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -109,9 +109,9 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: ins(%[[MAT_OUT]], %[[SLICE_OPERAND2]] : // CHECK-SAME: outs(%[[SLICE_OUT]] : // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]] into %[[ELEM_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[MAT_OUT]] into %[[SECOND_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[SECOND_ARG_SLICE]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]] into %[[ELEM_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } // CHECK: return %[[FINAL_RESULT]]#2 : @@ -248,10 +248,10 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: ins(%[[MAT_OUT]], %[[SLICE_OPERAND2]] : // CHECK-SAME: outs(%[[SLICE_OUT_0]], %[[SLICE_OUT_1]] : // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#0 into %[[ELEM_OUT_ARG_0]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] -// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#1 into %[[ELEM_OUT_ARG_1]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[MAT_OUT]] into %[[SECOND_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: tensor.parallel_insert_slice %[[SECOND_ARG_SLICE]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#0 into %[[ELEM_OUT_ARG_0]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[ELEM_OUT]]#1 into 
%[[ELEM_OUT_ARG_1]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] // CHECK: } // CHECK: } // CHECK: %[[UNPACK:.*]] = tensor.unpack %[[FINAL_RESULT]]#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %{{.*}} : tensor<64x32xf32> -> tensor<2048xf32> @@ -310,8 +310,8 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] // CHECK-SAME: into %[[TILED_UNPACK_DEST]] // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK_OUT]] into %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [1024] [1] // CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_UNPACK_OUT]] into %[[UNPACK_OUT_ARG]][%[[UNPACK_RESULT_OFFSET]]] [1024] [1] // CHECK: } // CHECK: } // CHECK: return %[[FINAL_RESULT]]#1 : @@ -369,8 +369,71 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: inner_dims_pos = [0] inner_tiles = [16] // CHECK-SAME: into %[[TILED_PACK_DEST]] // CHECK: scf.forall.in_parallel { -// CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] // CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] -// CHECK: } +// CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], %[[IV2]], 0] [2, 32, 16] [1, 1, 1] + +// ----- + +module { + func.func @fuse_add_consumer_into_nested_scf_for(%arg0: tensor<256x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256x256xf32>) -> tensor<256x256xf32> { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %c256 = arith.constant 256 : index + %cst = arith.constant 0.000000e+00 : f32 + %dest0 = tensor.empty() : tensor<256x256xf32> + %dest1 = linalg.fill ins(%cst : f32) outs(%dest0 : tensor<256x256xf32>) -> 
tensor<256x256xf32> + %1 = scf.for %arg3 = %c0 to %c256 step %c64 iter_args(%arg4 = %dest1) -> (tensor<256x256xf32>) { + %2 = scf.for %arg5 = %c0 to %c256 step %c64 iter_args(%arg6 = %arg4) -> (tensor<256x256xf32>) { + %extracted_slice_1 = tensor.extract_slice %arg6[%arg3, %arg5] [64, 64] [1, 1] : tensor<256x256xf32> to tensor<64x64xf32> + %extracted_slice_2 = tensor.extract_slice %arg0[%arg3, 0] [64, 512] [1, 1] : tensor<256x512xf32> to tensor<64x512xf32> + %extracted_slice_3 = tensor.extract_slice %arg1[0, %arg5] [512, 64] [1, 1] : tensor<512x256xf32> to tensor<512x64xf32> + %3 = linalg.matmul ins(%extracted_slice_2, %extracted_slice_3 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%extracted_slice_1 : tensor<64x64xf32>) -> tensor<64x64xf32> + %insert_slice = tensor.insert_slice %3 into %arg6[%arg3, %arg5] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<256x256xf32> + scf.yield %insert_slice : tensor<256x256xf32> + } + scf.yield %2 : tensor<256x256xf32> + } + %4 = linalg.add ins(%1, %arg2 : tensor<256x256xf32>, tensor<256x256xf32>) outs(%dest0 : tensor<256x256xf32>) -> tensor<256x256xf32> + return %4 : tensor<256x256xf32> + } +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match ops{["tensor.insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op + : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: func.func @fuse_add_consumer_into_nested_scf_for( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<256x512xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<512x256xf32> +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<256x256xf32> +// CHECK: %[[dest0:.*]] = tensor.empty() : tensor<256x256xf32> +// CHECK: %[[dest1:.*]] = linalg.fill +// CHECK-SAME: outs(%[[dest0]] : +// CHECK: %[[LOOP_RESULT1:.*]]:2 = scf.for %[[IV1:.*]] = 
%[[C0]] +// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG1:.*]] = %[[dest1]], %[[SECOND_OUT_ARG1:.*]] = %[[dest0]]) +// CHECK-SAME: { +// CHECK: %[[LOOP_RESULT2:.*]]:2 = scf.for %[[IV2:.*]] = %[[C0]] +// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG2:.*]] = %[[FIRST_OUT_ARG1]], %[[SECOND_OUT_ARG2:.*]] = %[[SECOND_OUT_ARG1]]) +// CHECK-SAME: { +// CHECK: %[[MAT_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[INPUT_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[IV1]], 0] [64, 512] [1, 1] +// CHECK: %[[WEIGHT_SLICE:.*]] = tensor.extract_slice %[[ARG1]][0, %[[IV2]]] [512, 64] [1, 1] +// CHECK: %[[TILED_MAT_OUT:.*]] = linalg.matmul +// CHECK-SAME: outs(%[[MAT_OUT_SLICE]] : +// CHECK: %[[INSERT_MAT:.*]] = tensor.insert_slice %[[TILED_MAT_OUT]] into %[[FIRST_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[ADD_OPERAND2_SLICE:.*]] = tensor.extract_slice %[[ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[ADD_OUT_SLICE:.*]] = tensor.extract_slice %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: %[[TILED_ADD_OUT:.*]] = linalg.add +// CHECK-SAME: ins(%[[TILED_MAT_OUT]], %[[ADD_OPERAND2_SLICE]] : +// CHECK-SAME: outs(%[[ADD_OUT_SLICE]] : +// CHECK: %[[INSERT_ADD:.*]] = tensor.insert_slice %[[TILED_ADD_OUT]] into %[[SECOND_OUT_ARG2]][%[[IV1]], %[[IV2]]] [64, 64] [1, 1] +// CHECK: scf.yield %[[INSERT_MAT]], %[[INSERT_ADD]] : +// CHECK: } +// CHECK: scf.yield %[[LOOP_RESULT2]]#0, %[[LOOP_RESULT2]]#1 : // CHECK: } -// CHECK: return %[[FINAL_RESULT]]#1 : +// CHECK: return %[[LOOP_RESULT1]]#1 : From 08740a6157375c4173023f28fc9e90689afee5ba Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Wed, 11 Sep 2024 21:07:03 -0700 Subject: [PATCH 92/94] [CodeGen] Fix documentation for ISD::ATOMIC_STORE. NFC (#108126) Update ISDOpcodes.h documentation according to commit ad9d13d ("SelectionDAG: Swap operands of atomic_store") for less confusion. 
--- llvm/include/llvm/CodeGen/ISDOpcodes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 65514882343dbe7..18ed60ebb124dc8 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1304,7 +1304,7 @@ enum NodeType { /// This corresponds to "load atomic" instruction. ATOMIC_LOAD, - /// OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) + /// OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) /// This corresponds to "store atomic" instruction. ATOMIC_STORE, From 8c17ed1512239a5a9b1320f678a8cd89db8b0981 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 11 Sep 2024 21:13:26 -0700 Subject: [PATCH 93/94] [RISCV] Generalize RISCVDAGToDAGISel::selectFPImm to handle bitcasts from int to FP. (#108284) selectFPImm previously handled cases where an FPImm could be materialized in an integer register. We can generalize this to cases where a value was in an integer register and then copied to a scalar FP register to be used by a vector instruction. In the affected test, the call lowering code used up all of the FP argument registers and started using GPRs. Now we use integer vector instructions to consume those GPRs instead of moving them to scalar FP first. 
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 16 ++- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 2 +- .../Target/RISCV/RISCVInstrInfoVPseudos.td | 3 +- .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 2 +- .../Target/RISCV/RISCVInstrInfoVVLPatterns.td | 6 +- .../RISCV/rvv/fixed-vectors-fp-buildvec.ll | 130 ++++++++---------- 6 files changed, 79 insertions(+), 80 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index ff4c0e9bbd50e72..02585c9f6037364 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3535,7 +3535,21 @@ bool RISCVDAGToDAGISel::selectLow8BitsVSplat(SDValue N, SDValue &SplatVal) { return selectVSplat(N, SplatVal); } -bool RISCVDAGToDAGISel::selectFPImm(SDValue N, SDValue &Imm) { +bool RISCVDAGToDAGISel::selectScalarFPAsInt(SDValue N, SDValue &Imm) { + // Allow bitcasts from XLenVT -> FP. + if (N.getOpcode() == ISD::BITCAST && + N.getOperand(0).getValueType() == Subtarget->getXLenVT()) { + Imm = N.getOperand(0); + return true; + } + // Allow moves from XLenVT to FP. + if (N.getOpcode() == RISCVISD::FMV_H_X || + N.getOpcode() == RISCVISD::FMV_W_X_RV64) { + Imm = N.getOperand(0); + return true; + } + + // Otherwise, look for FP constants that can materialized with scalar int. ConstantFPSDNode *CFP = dyn_cast(N.getNode()); if (!CFP) return false; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index 1d120c13442d51f..2e738d8d25a6dc4 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -140,7 +140,7 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { // Matches the splat of a value which can be extended or truncated, such that // only the bottom 8 bits are preserved. 
bool selectLow8BitsVSplat(SDValue N, SDValue &SplatVal); - bool selectFPImm(SDValue N, SDValue &Imm); + bool selectScalarFPAsInt(SDValue N, SDValue &Imm); bool selectRVVSimm5(SDValue N, unsigned Width, SDValue &Imm); template bool selectRVVSimm5(SDValue N, SDValue &Imm) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 430e09fd834ba7e..fe7de9d7bc79aa6 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -236,7 +236,8 @@ def VLOpFrag : PatFrag<(ops), (XLenVT (VLOp (XLenVT AVL:$vl)))>; // This must be kept in sync with RISCV::VLMaxSentinel. def VLMax : OutPatFrag<(ops), (XLenVT -1)>; -def SelectFPImm : ComplexPattern; +def SelectScalarFPAsInt : ComplexPattern; // List of EEW. defvar EEWList = [8, 16, 32, 64]; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index f12f82cb1595298..b54cdcbd1b0e9c2 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -1374,7 +1374,7 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { fvti.AVL, fvti.Log2SEW)>; def : Pat<(fvti.Vector (vselect (fvti.Mask V0), - (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), + (SplatFPOp (SelectScalarFPAsInt (XLenVT GPR:$imm))), fvti.RegClass:$rs2)), (!cast("PseudoVMERGE_VXM_"#fvti.LMul.MX) (fvti.Vector (IMPLICIT_DEF)), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index 9afbe567193607d..a27c3a416816e2b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -2575,7 +2575,7 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { GPR:$vl, fvti.Log2SEW)>; def : Pat<(fvti.Vector (riscv_vmerge_vl (fvti.Mask V0), - (SplatFPOp (SelectFPImm (XLenVT GPR:$imm))), + (SplatFPOp 
(SelectScalarFPAsInt (XLenVT GPR:$imm))), fvti.RegClass:$rs2, fvti.RegClass:$passthru, VLOpFrag)), @@ -2619,7 +2619,7 @@ foreach fvti = !listconcat(AllFloatVectors, AllBFloatVectors) in { (!cast("PseudoVMV_V_I_"#fvti.LMul.MX) $passthru, 0, GPR:$vl, fvti.Log2SEW, TU_MU)>; def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl - fvti.Vector:$passthru, (fvti.Scalar (SelectFPImm (XLenVT GPR:$imm))), VLOpFrag)), + fvti.Vector:$passthru, (fvti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), VLOpFrag)), (!cast("PseudoVMV_V_X_"#fvti.LMul.MX) $passthru, GPR:$imm, GPR:$vl, fvti.Log2SEW, TU_MU)>; } @@ -2940,7 +2940,7 @@ foreach vti = NoGroupFloatVectors in { VLOpFrag)), (PseudoVMV_S_X $passthru, (XLenVT X0), GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), - (vti.Scalar (SelectFPImm (XLenVT GPR:$imm))), + (vti.Scalar (SelectScalarFPAsInt (XLenVT GPR:$imm))), VLOpFrag)), (PseudoVMV_S_X $passthru, GPR:$imm, GPR:$vl, vti.Log2SEW)>; def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$passthru), diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index e3aabb5de29c28b..b5d3e2cd776f270 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1348,20 +1348,16 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; ; RV64-LABEL: buildvec_v32f64_exact_vlen: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -96 -; RV64-NEXT: .cfi_def_cfa_offset 96 -; RV64-NEXT: fsd fs0, 88(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs1, 80(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs2, 72(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs3, 64(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs4, 56(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs5, 48(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs6, 40(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs7, 32(sp) # 8-byte Folded 
Spill -; RV64-NEXT: fsd fs8, 24(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs9, 16(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs10, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: fsd fs11, 0(sp) # 8-byte Folded Spill +; RV64-NEXT: addi sp, sp, -64 +; RV64-NEXT: .cfi_def_cfa_offset 64 +; RV64-NEXT: fsd fs0, 56(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs1, 48(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs2, 40(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs3, 32(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs4, 24(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs5, 16(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs6, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: fsd fs7, 0(sp) # 8-byte Folded Spill ; RV64-NEXT: .cfi_offset fs0, -8 ; RV64-NEXT: .cfi_offset fs1, -16 ; RV64-NEXT: .cfi_offset fs2, -24 @@ -1370,34 +1366,26 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV64-NEXT: .cfi_offset fs5, -48 ; RV64-NEXT: .cfi_offset fs6, -56 ; RV64-NEXT: .cfi_offset fs7, -64 -; RV64-NEXT: .cfi_offset fs8, -72 -; RV64-NEXT: .cfi_offset fs9, -80 -; RV64-NEXT: .cfi_offset fs10, -88 -; RV64-NEXT: .cfi_offset fs11, -96 ; RV64-NEXT: fmv.d.x ft4, a7 -; RV64-NEXT: fmv.d.x ft5, a6 -; RV64-NEXT: fmv.d.x ft6, a5 -; RV64-NEXT: fmv.d.x ft7, a4 -; RV64-NEXT: fmv.d.x ft8, a3 -; RV64-NEXT: fmv.d.x ft9, a2 -; RV64-NEXT: fmv.d.x ft10, a1 -; RV64-NEXT: fmv.d.x ft11, a0 -; RV64-NEXT: fld ft0, 216(sp) -; RV64-NEXT: fld ft1, 208(sp) -; RV64-NEXT: fld ft2, 200(sp) -; RV64-NEXT: fld ft3, 192(sp) -; RV64-NEXT: fld fs0, 184(sp) -; RV64-NEXT: fld fs1, 176(sp) -; RV64-NEXT: fld fs2, 168(sp) -; RV64-NEXT: fld fs3, 160(sp) -; RV64-NEXT: fld fs4, 152(sp) -; RV64-NEXT: fld fs5, 144(sp) -; RV64-NEXT: fld fs6, 136(sp) -; RV64-NEXT: fld fs7, 128(sp) -; RV64-NEXT: fld fs8, 104(sp) -; RV64-NEXT: fld fs9, 96(sp) -; RV64-NEXT: fld fs10, 120(sp) -; RV64-NEXT: fld fs11, 112(sp) +; RV64-NEXT: fmv.d.x ft5, a5 +; RV64-NEXT: fmv.d.x ft6, a3 +; RV64-NEXT: fmv.d.x ft7, a1 +; RV64-NEXT: fld ft0, 
184(sp) +; RV64-NEXT: fld ft1, 176(sp) +; RV64-NEXT: fld ft2, 168(sp) +; RV64-NEXT: fld ft3, 160(sp) +; RV64-NEXT: fld ft8, 152(sp) +; RV64-NEXT: fld ft9, 144(sp) +; RV64-NEXT: fld ft10, 136(sp) +; RV64-NEXT: fld ft11, 128(sp) +; RV64-NEXT: fld fs0, 120(sp) +; RV64-NEXT: fld fs1, 112(sp) +; RV64-NEXT: fld fs2, 104(sp) +; RV64-NEXT: fld fs3, 96(sp) +; RV64-NEXT: fld fs4, 72(sp) +; RV64-NEXT: fld fs5, 64(sp) +; RV64-NEXT: fld fs6, 88(sp) +; RV64-NEXT: fld fs7, 80(sp) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vfmv.v.f v8, fa2 ; RV64-NEXT: vfslide1down.vf v9, v8, fa3 @@ -1407,43 +1395,39 @@ define <32 x double> @buildvec_v32f64_exact_vlen(double %e0, double %e1, double ; RV64-NEXT: vfslide1down.vf v10, v10, fa5 ; RV64-NEXT: vfmv.v.f v11, fa6 ; RV64-NEXT: vfslide1down.vf v11, v11, fa7 -; RV64-NEXT: vfmv.v.f v12, ft11 -; RV64-NEXT: vfslide1down.vf v12, v12, ft10 -; RV64-NEXT: vfmv.v.f v13, ft9 -; RV64-NEXT: vfslide1down.vf v13, v13, ft8 -; RV64-NEXT: vfmv.v.f v14, ft7 -; RV64-NEXT: vfslide1down.vf v14, v14, ft6 -; RV64-NEXT: vfmv.v.f v15, ft5 +; RV64-NEXT: vmv.v.x v12, a0 +; RV64-NEXT: vfslide1down.vf v12, v12, ft7 +; RV64-NEXT: vmv.v.x v13, a2 +; RV64-NEXT: vfslide1down.vf v13, v13, ft6 +; RV64-NEXT: vmv.v.x v14, a4 +; RV64-NEXT: vfslide1down.vf v14, v14, ft5 +; RV64-NEXT: vmv.v.x v15, a6 ; RV64-NEXT: vfslide1down.vf v15, v15, ft4 -; RV64-NEXT: vfmv.v.f v16, fs11 -; RV64-NEXT: vfslide1down.vf v17, v16, fs10 -; RV64-NEXT: vfmv.v.f v16, fs9 -; RV64-NEXT: vfslide1down.vf v16, v16, fs8 -; RV64-NEXT: vfmv.v.f v18, fs7 -; RV64-NEXT: vfslide1down.vf v18, v18, fs6 -; RV64-NEXT: vfmv.v.f v19, fs5 -; RV64-NEXT: vfslide1down.vf v19, v19, fs4 -; RV64-NEXT: vfmv.v.f v20, fs3 -; RV64-NEXT: vfslide1down.vf v20, v20, fs2 -; RV64-NEXT: vfmv.v.f v21, fs1 -; RV64-NEXT: vfslide1down.vf v21, v21, fs0 +; RV64-NEXT: vfmv.v.f v16, fs7 +; RV64-NEXT: vfslide1down.vf v17, v16, fs6 +; RV64-NEXT: vfmv.v.f v16, fs5 +; RV64-NEXT: vfslide1down.vf v16, v16, fs4 +; RV64-NEXT: vfmv.v.f 
v18, fs3 +; RV64-NEXT: vfslide1down.vf v18, v18, fs2 +; RV64-NEXT: vfmv.v.f v19, fs1 +; RV64-NEXT: vfslide1down.vf v19, v19, fs0 +; RV64-NEXT: vfmv.v.f v20, ft11 +; RV64-NEXT: vfslide1down.vf v20, v20, ft10 +; RV64-NEXT: vfmv.v.f v21, ft9 +; RV64-NEXT: vfslide1down.vf v21, v21, ft8 ; RV64-NEXT: vfmv.v.f v22, ft3 ; RV64-NEXT: vfslide1down.vf v22, v22, ft2 ; RV64-NEXT: vfmv.v.f v23, ft1 ; RV64-NEXT: vfslide1down.vf v23, v23, ft0 -; RV64-NEXT: fld fs0, 88(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs1, 80(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs2, 72(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs3, 64(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs4, 56(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs5, 48(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs6, 40(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs7, 32(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs8, 24(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs9, 16(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs10, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: fld fs11, 0(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 96 +; RV64-NEXT: fld fs0, 56(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs1, 48(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs2, 40(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs3, 32(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs4, 24(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs5, 16(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs6, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: fld fs7, 0(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 64 ; RV64-NEXT: ret %v0 = insertelement <32 x double> poison, double %e0, i64 0 %v1 = insertelement <32 x double> %v0, double %e1, i64 1 From 1211d97922d62470ac8bc658f7bfe57e8b46a107 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Thu, 12 Sep 2024 03:48:27 +0000 Subject: [PATCH 94/94] [X86] Use SWAR techniques for some vector i8 shifts SSE & AVX do not include instructions for shifting i8 vectors. 
Instead, they must be synthesized via other instructions. If pairs of i8 vectors share a shift amount, we can use SWAR techniques to substantially reduce the amount of code generated. Say we were going to execute this shift right: x >> {0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, ...} LLVM would previously generate: vpxor %xmm1, %xmm1, %xmm1 vpunpckhbw %ymm0, %ymm1, %ymm2 vpunpckhbw %ymm1, %ymm0, %ymm3 vpsllw $4, %ymm3, %ymm3 vpblendd $204, %ymm3, %ymm2, %ymm2 vpsrlw $8, %ymm2, %ymm2 vpunpcklbw %ymm0, %ymm1, %ymm3 vpunpcklbw %ymm1, %ymm0, %ymm0 vpsllw $4, %ymm0, %ymm0 vpblendd $204, %ymm0, %ymm3, %ymm0 vpsrlw $8, %ymm0, %ymm0 vpackuswb %ymm2, %ymm0, %ymm0 Instead, we can reinterpret a pair of i8 elements as an i16 and shift using the same shift amount. The only thing we need to do is mask out any bits which crossed the boundary from the top i8 to the bottom i8. This SWAR-style technique achieves: vpsrlw $4, %ymm0, %ymm1 vpblendd $170, %ymm1, %ymm0, %ymm0 vpand .LCPI0_0(%rip), %ymm0, %ymm0 This is implemented for both left and right logical shift operations. Arithmetic shifts are less well behaved here because the shift cannot also perform the sign extension for the lower 8 bits. 
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 95 ++++++++++++++++++- .../test/CodeGen/X86/vector-shift-lshr-128.ll | 61 ++++++++++++ .../test/CodeGen/X86/vector-shift-lshr-256.ll | 66 +++++++++++++ .../test/CodeGen/X86/vector-shift-lshr-512.ll | 23 +++++ 4 files changed, 240 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d0794cb9bfde3db..c3b919921f23b3a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29851,17 +29851,103 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, DAG.getNode(Opc, dl, ExtVT, R, Amt)); } - // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we - // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI. + // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors by using + // vXi16 vector operations. if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) && (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) && !Subtarget.hasXOP()) { int NumElts = VT.getVectorNumElements(); + MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2); + // We can do this extra fast if each pair of i8 elements is shifted by the + // same amount by doing this SWAR style: use a shift to move the valid bits + // to the right position, mask out any bits which crossed from one element + // to the other. + if (Opc == ISD::SRL || Opc == ISD::SHL) { + APInt UndefElts; + SmallVector AmtBits; + if (getTargetConstantBitsFromNode(Amt, /*EltSizeInBits=*/8, UndefElts, + AmtBits, /*AllowWholeUndefs=*/true, + /*AllowPartialUndefs=*/false)) { + // This optimized lowering is only valid if the elements in a pair can + // be treated identically. 
+ bool SameShifts = true; + SmallVector AmtBits16(NumElts / 2); + APInt UndefElts16 = APInt::getZero(AmtBits16.size()); + for (unsigned SrcI = 0, E = AmtBits.size(); SrcI != E; SrcI += 2) { + unsigned DstI = SrcI / 2; + // Both elements are undef? Make a note and keep going. + if (UndefElts[SrcI] && UndefElts[SrcI + 1]) { + AmtBits16[DstI] = APInt::getZero(16); + UndefElts16.setBit(DstI); + continue; + } + // Even element is undef? We will shift it by the same shift amount as + // the odd element. + if (UndefElts[SrcI]) { + AmtBits16[DstI] = AmtBits[SrcI + 1].zext(16); + continue; + } + // Odd element is undef? We will shift it by the same shift amount as + // the even element. + if (UndefElts[SrcI + 1]) { + AmtBits16[DstI] = AmtBits[SrcI].zext(16); + continue; + } + // Both elements are equal. + if (AmtBits[SrcI] == AmtBits[SrcI + 1]) { + AmtBits16[DstI] = AmtBits[SrcI].zext(16); + continue; + } + // One of the provisional i16 elements will not have the same shift + // amount. Let's bail. + SameShifts = false; + break; + } + + // We are only dealing with identical pairs and the operation is a + // logical shift. + if (SameShifts) { + // Cast the operand to vXi16. + SDValue R16 = DAG.getBitcast(VT16, R); + // Create our new vector of shift amounts. + SDValue Amt16 = getConstVector(AmtBits16, UndefElts16, VT16, DAG, dl); + // Perform the actual shift. + SDValue ShiftedR = DAG.getNode(Opc, dl, VT16, R16, Amt16); + // Now we need to construct a mask which will "drop" bits that get + // shifted past the LSB/MSB. For a logical shift left, it will look + // like: + // MaskLowBits = (0xff << Amt16) & 0xff; + // MaskHighBits = MaskLowBits << 8; + // Mask = MaskLowBits | MaskHighBits; + // + // This masking ensures that bits cannot migrate from one i8 to + // another. The construction of this mask will be constant folded. + // The mask for a logical right shift is nearly identical, the only + // difference is that 0xff is shifted right instead of left. 
+ SDValue Cst255 = DAG.getConstant(0xff, dl, MVT::i16); + SDValue Splat255 = DAG.getSplat(VT16, dl, Cst255); + // The mask for the low bits is most simply expressed as an 8-bit + // field of all ones which is shifted in the exact same way the data + // is shifted but masked with 0xff. + SDValue MaskLowBits = DAG.getNode(Opc, dl, VT16, Splat255, Amt16); + MaskLowBits = DAG.getNode(ISD::AND, dl, VT16, MaskLowBits, Splat255); + SDValue Cst8 = DAG.getConstant(8, dl, MVT::i16); + SDValue Splat8 = DAG.getSplat(VT16, dl, Cst8); + // The mask for the high bits is the same as the mask for the low + // bits but shifted up by 8. + SDValue MaskHighBits = DAG.getNode(ISD::SHL, dl, VT16, MaskLowBits, Splat8); + SDValue Mask = DAG.getNode(ISD::OR, dl, VT16, MaskLowBits, MaskHighBits); + // Finally, we mask the shifted vector with the SWAR mask. + SDValue Masked = DAG.getNode(ISD::AND, dl, VT16, ShiftedR, Mask); + return DAG.getBitcast(VT, Masked); + } + } + } SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8); - // Extend constant shift amount to vXi16 (it doesn't matter if the type - // isn't legal). + // Extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI (it + // doesn't matter if the type isn't legal). 
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT); Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt); @@ -29885,7 +29971,6 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, } } - MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2); SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt); SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt); diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 62b95eedc9d4f1b..43c6e4b0db16f29 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1226,6 +1226,67 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ret <8 x i16> %shift } +define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind { +; SSE-LABEL: constant_shift_v16i8_pairs: +; SSE: # %bb.0: +; SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: constant_shift_v16i8_pairs: +; AVX: # %bb.0: +; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096] +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq +; +; XOP-LABEL: constant_shift_v16i8_pairs: +; XOP: # %bb.0: +; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; XOP-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v16i8_pairs: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; 
AVX512DQ-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v16i8_pairs: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [7,7,2,2,4,4,6,6,1,1,2,2,3,3,4,4] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v16i8_pairs: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; AVX512DQVL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQVL-NEXT: vzeroupper +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v16i8_pairs: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq +; +; X86-SSE-LABEL: constant_shift_v16i8_pairs: +; X86-SSE: 
# %bb.0: +; X86-SSE-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096] +; X86-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: retl + %shift = lshr <16 x i8> %a, + ret <16 x i8> %shift +} + define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind { ; SSE2-LABEL: constant_shift_v16i8: ; SSE2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index 0ef5d650535d23d..932f210e239932d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1345,6 +1345,72 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ret <16 x i16> %shift } +define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind { +; AVX1-LABEL: constant_shift_v32i8_pairs: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [512,16384,4096,1024,32768,16384,8192,4096] +; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [257,16191,3855,771,32639,16191,7967,3855] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: constant_shift_v32i8_pairs: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; XOPAVX1-LABEL: constant_shift_v32i8_pairs: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252] +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: 
retq +; +; XOPAVX2-LABEL: constant_shift_v32i8_pairs: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252] +; XOPAVX2-NEXT: vpshlb %xmm2, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq +; +; AVX512DQ-LABEL: constant_shift_v32i8_pairs: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] +; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: retq +; +; AVX512BW-LABEL: constant_shift_v32i8_pairs: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512DQVL-LABEL: constant_shift_v32i8_pairs: +; AVX512DQVL: # %bb.0: +; AVX512DQVL-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] +; AVX512DQVL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512DQVL-NEXT: retq +; +; AVX512BWVL-LABEL: constant_shift_v32i8_pairs: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; AVX512BWVL-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512BWVL-NEXT: retq + %shift = lshr <32 x i8> %a, + ret <32 x i8> %shift +} + define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX1-LABEL: constant_shift_v32i8: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index efd73b4ca132bbf..8b61540081a7c75 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -306,6 +306,29 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ret <32 x i16> %shift } +define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind { +; AVX512DQ-LABEL: constant_shift_v64i8_pairs: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096] +; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1] +; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 +; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855,257,16191,3855,771,32639,16191,7967,3855] +; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; 
AVX512BW-LABEL: constant_shift_v64i8_pairs: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: retq + %shift = lshr <64 x i8> %a, + ret <64 x i8> %shift +} + define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v64i8: ; AVX512DQ: # %bb.0: