[MachineLICM] Correctly Apply Register Masks #95746

Pierre-vh · 2024-06-17T07:40:56Z

Fix regression introduced in d4b8b72

llvmbot · 2024-06-17T07:41:26Z

@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/95746.diff

2 Files Affected:

(modified) llvm/lib/CodeGen/MachineLICM.cpp (+20-25)
(modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+2-2)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 6c5170e918e00..237527a165e85 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -426,38 +426,33 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
 static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
                                                 BitVector &RUs,
                                                 const uint32_t *Mask) {
-  // Iterate over the RegMask raw to avoid constructing a BitVector, which is
-  // expensive as it implies dynamically allocating memory.
-  //
-  // We also work backwards.
+  const unsigned NumRUs = TRI.getNumRegUnits();
+  BitVector RUsInMask(NumRUs);
+  BitVector RUsNotInMask(NumRUs);
   const unsigned NumRegs = TRI.getNumRegs();
   const unsigned MaskWords = (NumRegs + 31) / 32;
   for (unsigned K = 0; K < MaskWords; ++K) {
-    // We want to set the bits that aren't in RegMask, so flip it.
-    uint32_t Word = ~Mask[K];
-
-    // Iterate all set bits, starting from the right.
-    while (Word) {
-      const unsigned SetBitIdx = countr_zero(Word);
-
-      // The bits are numbered from the LSB in each word.
-      const unsigned PhysReg = (K * 32) + SetBitIdx;
-
-      // Clear the bit at SetBitIdx. Doing it this way appears to generate less
-      // instructions on x86. This works because negating a number will flip all
-      // the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
-      // faster.
-      Word ^= Word & -Word;
-
+    uint32_t Word = Mask[K];
+    for (unsigned Bit = 0; Bit < 32; ++Bit) {
+      const unsigned PhysReg = (K * 32) + Bit;
       if (PhysReg == NumRegs)
-        return;
+        break;
 
-      if (PhysReg) {
-        for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
-          RUs.set(*RUI);
-      }
+      if (!PhysReg)
+        continue;
+
+      // Extract the bit and apply it to the appropriate mask.
+      auto &Mask = ((Word >> Bit) & 1) ? RUsInMask : RUsNotInMask;
+      for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
+        Mask.set(*RUI);
     }
   }
+
+  // If a RU needs to be set because it's not in the RegMask, only set it
+  // if all registers from that RU are not in the mask either.
+  RUsNotInMask &= RUsInMask.flip();
+
+  RUs |= RUsNotInMask;
 }
 
 /// Examine the instruction for potentai LICM candidate. Also
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index da8aa54469835..7799b9509ceb0 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s62, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s63, 31
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
+; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]

jayfoad · 2024-06-17T08:45:58Z

You shouldn't need both RUsInMask and RUsNotInMask. Suggestion:

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 237527a165e8..dd71a9a3fc79 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -424,38 +424,33 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
 }
 
 static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
                                                 BitVector &RUs,
                                                 const uint32_t *Mask) {
   const unsigned NumRUs = TRI.getNumRegUnits();
-  BitVector RUsInMask(NumRUs);
-  BitVector RUsNotInMask(NumRUs);
+  BitVector ClobberedRUs(NumRUs, true);
   const unsigned NumRegs = TRI.getNumRegs();
   const unsigned MaskWords = (NumRegs + 31) / 32;
   for (unsigned K = 0; K < MaskWords; ++K) {
     uint32_t Word = Mask[K];
     for (unsigned Bit = 0; Bit < 32; ++Bit) {
       const unsigned PhysReg = (K * 32) + Bit;
       if (PhysReg == NumRegs)
         break;
 
       if (!PhysReg)
         continue;
 
-      // Extract the bit and apply it to the appropriate mask.
-      auto &Mask = ((Word >> Bit) & 1) ? RUsInMask : RUsNotInMask;
-      for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
-        Mask.set(*RUI);
+      if ((Word >> Bit) & 1) {
+        for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
+          ClobberedRUs.reset(*RUI);
+      }
     }
   }
 
-  // If a RU needs to be set because it's not in the RegMask, only set it
-  // if all registers from that RU are not in the mask either.
-  RUsNotInMask &= RUsInMask.flip();
-
-  RUs |= RUsNotInMask;
+  RUs |= ClobberedRUs;
 }
 
 /// Examine the instruction for potentai LICM candidate. Also
 /// gather register def and frame object update information.
 void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
                                 BitVector &RUClobbers,

arsenm

Better to describe the regression in the first line and leave the hash for the body of the message

llvm/lib/CodeGen/MachineLICM.cpp

mstorsjo · 2024-06-18T09:07:13Z

This commit causes miscompilations for aarch64 (where the generated code does the wrong thing). This can be observed in openh264 unit tests, that suddenly are failing where they previously were passing.

To reproduce, run the following sequence on aarch64 linux:

$ git clone https://github.com/cisco/openh264
$ cd openh264
$ make gtest-bootstrap
$ make CC=clang CXX=clang++ -j$(nproc) codec_unittest
$ ./codec_unittest --gtest_filter=McHorVer.16x8_c

For closer investigation of what's happening, the culprit object file for this test is test/encoder/EncUT_MotionCompensation.o. (There are also test failures triggered in the test DecoderDecodeMbAux.IdctResAddPred8x8_c, due to similar miscompilations in test/decoder/DecUT_IdctResAddPred.o.)

If there's no straightforward fix in sight, can we revert this change until the issue is resolved? (As this commit is a fix for a regression, do we need to revert that original change meanwhile as well?)

Pierre-vh · 2024-06-18T09:14:43Z

@mstorsjo Did you verify that reverting this, and only this commit fixes it?

Can you please create a lit test case that shows how codegen goes wrong? I don't have access to a AArch64 machine. I would like to first see what the issue is before deciding whether to revert or not.
If you have a lit test, I can then create a fix and commit the test at the same time.

mstorsjo · 2024-06-18T09:32:28Z

@mstorsjo Did you verify that reverting this, and only this commit fixes it?

I bisected it; with commit 770393b it fails, with the preceding one it works, so it's definitely pinpointed to this one. It doesn't revert cleanly on git main, as 457e895 also touches the same file.

Can you please create a lit test case that shows how codegen goes wrong? I don't have access to a AArch64 machine. I would like to first see what the issue is before deciding whether to revert or not. If you have a lit test, I can then create a fix and commit the test at the same time.

First off - I'm approaching this primarily as a integrator, I continuously build latest llvm/clang and test projects with it, so I'm not very well versed with the LLVM internals. I'm not very familiar with pinpointing the error in codegen... (I've done it some times, but it takes me quite a long time, and I'm doing this as a side hobby essentially.)

I tried adding -mllvm -print-after-all to the compilation command that fails, in order to get a before/after printout and diff them, but it seems to be way too massive to reasonably diff.

I guess I can try to split the source file into minimal functions, if that allows pinpointing the codegen issue better.

Pierre-vh · 2024-06-18T09:37:19Z

I tried adding -mllvm -print-after-all to the compilation command that fails, in order to get a before/after printout and diff them, but it seems to be way too massive to reasonably diff.

You can just do -stop-after=machinelicm. This should give you a MIR file right after MachineLICM.
You can generate one before and after this commit for the offending CU where the miscompile happens and attach them both here, I can then have a look at it.
Attaching a third file generated on either commit using -stop-before=machinelicm would also be helpful as I can use that as a base for a testcase.

Thank you :)

mstorsjo · 2024-06-18T09:47:52Z

Ok, here goes:
before-after.zip

before.mir from the output of -stop-before=machinelicm, and after-good.mir and after-bad.mir from -stop-after=machinelicm. The diff between the two after cases is pretty small and I can't say for certain whether it's faulty or not, hopefully you can.

Otherwise I'm afraid that this actually would be a lingering aarch64 target bug that just happens to appear due to this, otherwise potentially valid, change. For that, we'd have to invoke some aarch64 target maintainer...

Pierre-vh · 2024-06-18T10:00:43Z

Investigating right now. As it doesn't break any CI I won't revert yet if that's okay.

I'll revert if some AArch64 maintainer wants me to (@aemerson / @sjoerdmeijer perhaps?) or if I can't tell what the problem is by tomorrow.

Pierre-vh · 2024-06-18T10:42:02Z

@mstorsjo I must be missing some flags. When I run it locally like this, LICM doesn't kick in.

llc -run-pass=machinelicm before.mir -o after-trunk.mir -mtriple=aarch64-linux-gnu

I also had to add names to the EH_LABEL instructions because they didn't have any and the MIR couldn't be parsed without that.

The only difference is that in the "after" patch, we have

    renamable $q10 = MVNIv4i32 4, 0
    renamable $q11 = MOVIv4i32 2, 8

being hoisted

I wonder if some clobber is missing from an instruction somewhere and this just exposes it?

Just to be sure, you checked latest main too and it still fails over there?

Pierre-vh · 2024-06-18T10:56:58Z

I also observe that some predecessors are added to the IR after it goes through llc locally.
I wonder if this is caused by some pass changing the CFG without invalidating it, and the stale results affect MachineLICM and cause it to miscompile.

In other words, MachineLICM isn't at fault but some earlier pass that changes the CFG is.
Just a guess.

Can you also provide the unoptimized IR? You can get it through clang by disabling all LLVM passes (-Xclang -disable-llvm-passes or something?)

mstorsjo · 2024-06-18T10:59:10Z

@mstorsjo I must be missing some flags. When I run it locally like this, LICM doesn't kick in.
llc -run-pass=machinelicm before.mir -o after-trunk.mir -mtriple=aarch64-linux-gnu

Sorry, no idea about that... In this case, it's triggered with a compilation command like this: clang++ -O3 -fno-strict-aliasing -fPIC -fstack-protector-all -march=armv8-a -c -o test/encoder/EncUT_MotionCompensation2.o test/encoder/EncUT_MotionCompensation2.cpp, not sure if that sets any flags that affect whether this is run or not.

I also had to add names to the EH_LABEL instructions because they didn't have any and the MIR couldn't be parsed without that.

The only difference is that in the "after" patch, we have
    renamable $q10 = MVNIv4i32 4, 0
    renamable $q11 = MOVIv4i32 2, 8
being hoisted

I wonder if some clobber is missing from an instruction somewhere and this just exposes it?

Just to be sure, you checked latest main too and it still fails over there?

Yes, it's still failing with latest main too.

DavidSpickett · 2024-06-18T11:08:56Z

@davemgreen might be able to help.

mstorsjo · 2024-06-18T11:13:04Z

Can you also provide the unoptimized IR? You can get it through clang by disabling all LLVM passes (-Xclang -disable-llvm-passes or something?)

Sure:
mchorver-16x8.zip

With this, I can repro the same issue by compiling this IR as clang mchorver-16x8.ll -O3 -c -o test/encoder/EncUT_MotionCompensation2.o.

Pierre-vh · 2024-06-18T11:54:15Z

I think I have a fix, the issue is with how we apply the regmask indeed so this patch is at fault.
The current logic isn't quite right and the lack of test coverage didn't help. I'll need some time to create a testcase from the reproducer.

[MachineLICM] Fix regression introduced by d4b8b72

63224bb

Pierre-vh requested review from jayfoad and arsenm June 17, 2024 07:40

llvmbot added the backend:AMDGPU label Jun 17, 2024

arsenm reviewed Jun 17, 2024

View reviewed changes

Pierre-vh changed the title ~~[MachineLICM] Fix regression introduced by d4b8b72~~ [MachineLICM] Correctly Apply Register Masks Jun 17, 2024

Comments

1b848f8

jayfoad reviewed Jun 17, 2024

View reviewed changes

llvm/lib/CodeGen/MachineLICM.cpp Show resolved Hide resolved

jayfoad reviewed Jun 17, 2024

View reviewed changes

llvm/lib/CodeGen/MachineLICM.cpp Outdated Show resolved Hide resolved

jayfoad approved these changes Jun 17, 2024

View reviewed changes

skip empty words + add fixme

e3f3dae

Pierre-vh merged commit 770393b into llvm:main Jun 17, 2024
4 of 6 checks passed

Pierre-vh deleted the fix-licm-regr branch June 17, 2024 11:42

jpienaar mentioned this pull request Jun 17, 2024

[mlirc] Add missing extern C #95829

Merged

Pierre-vh mentioned this pull request Jun 20, 2024

Potential Miscompiles with RegUnits-based MachineLICM liveness calculation #96146

Open

mstorsjo mentioned this pull request Jun 20, 2024

[MachineLICM] Work-around Incomplete RegUnits #95926

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[MachineLICM] Correctly Apply Register Masks #95746

[MachineLICM] Correctly Apply Register Masks #95746

Pierre-vh commented Jun 17, 2024 •

edited

Loading

llvmbot commented Jun 17, 2024

jayfoad commented Jun 17, 2024

arsenm left a comment

mstorsjo commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

mstorsjo commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024 •

edited

Loading

mstorsjo commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

mstorsjo commented Jun 18, 2024

DavidSpickett commented Jun 18, 2024

mstorsjo commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

[MachineLICM] Correctly Apply Register Masks #95746

[MachineLICM] Correctly Apply Register Masks #95746

Conversation

Pierre-vh commented Jun 17, 2024 • edited Loading

llvmbot commented Jun 17, 2024

jayfoad commented Jun 17, 2024

arsenm left a comment

Choose a reason for hiding this comment

mstorsjo commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

mstorsjo commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024 • edited Loading

mstorsjo commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

mstorsjo commented Jun 18, 2024

DavidSpickett commented Jun 18, 2024

mstorsjo commented Jun 18, 2024

Pierre-vh commented Jun 18, 2024

Pierre-vh commented Jun 17, 2024 •

edited

Loading

Pierre-vh commented Jun 18, 2024 •

edited

Loading