-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SLP]Fix PR87133: crash because of different altopcodes for cmps after reordering. #87267
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesIf the node has cmp instruction with 3 or more different but swappable Full diff: https://github.com/llvm/llvm-project/pull/87267.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1ffc39a9067431..75bfdac99cb4e9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -658,6 +658,32 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
unsigned AltOpcode = Opcode;
unsigned AltIndex = BaseIndex;
+ bool SwappedPredsCompatible = [&]() {
+ if (!IsCmpOp)
+ return false;
+ SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
+ UniquePreds.insert(BasePred);
+ UniqueNonSwappedPreds.insert(BasePred);
+ for (Value *V : VL) {
+ auto *I = dyn_cast<CmpInst>(V);
+ if (!I) {
+ UniquePreds.clear();
+ UniqueNonSwappedPreds.clear();
+ break;
+ }
+ CmpInst::Predicate CurrentPred = I->getPredicate();
+ CmpInst::Predicate SwappedCurrentPred =
+ CmpInst::getSwappedPredicate(CurrentPred);
+ UniqueNonSwappedPreds.insert(CurrentPred);
+ if (!UniquePreds.contains(CurrentPred) &&
+ !UniquePreds.contains(SwappedCurrentPred))
+ UniquePreds.insert(CurrentPred);
+ }
+ // Total number of predicates > 2, but if consider swapped predicates
+ // compatible only 2, consider swappable predicates as compatible opcodes,
+ // not alternate.
+ return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
+ }();
// Check for one alternate opcode from another BinaryOperator.
// TODO - generalize to support all operators (types, calls etc.).
auto *IBase = cast<Instruction>(VL[BaseIndex]);
@@ -710,7 +736,7 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
CmpInst::Predicate SwappedCurrentPred =
CmpInst::getSwappedPredicate(CurrentPred);
- if (E == 2 &&
+ if ((E == 2 || SwappedPredsCompatible) &&
(BasePred == CurrentPred || BasePred == SwappedCurrentPred))
continue;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
new file mode 100644
index 00000000000000..6b270150985ef3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(ptr %sptr, i64 %0) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[SPTR:%.*]], i64 [[TMP0:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CONV_I:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT: [[IV2:%.*]] = getelementptr i8, ptr [[SPTR]], i64 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IV2]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[CONV_I]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 1>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 1, i32 5, i32 1, i32 poison>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 poison, i32 poison, i32 0>, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> [[TMP7]], <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP10]])
+; CHECK-NEXT: [[AND33:%.*]] = zext i1 [[TMP11]] to i32
+; CHECK-NEXT: ret i32 [[AND33]]
+;
+entry:
+ %conv.i = trunc i64 %0 to i32
+ %iv2 = getelementptr i8, ptr %sptr, i64 4
+ %1 = load i32, ptr %iv2, align 4
+ %cmp11 = icmp slt i32 %1, %conv.i
+ %cmp.i57 = icmp eq i32 %1, 0
+ %or.i5977 = or i1 %cmp.i57, %cmp11
+ %iv4 = getelementptr i8, ptr %sptr, i64 12
+ %2 = load i32, ptr %iv4, align 4
+ %cmp16 = icmp sle i32 %2, %conv.i
+ %cmp.i62 = icmp eq i32 %2, 0
+ %or.i6478 = or i1 %cmp.i62, %cmp16
+ %iv3 = getelementptr i8, ptr %sptr, i64 8
+ %3 = load i32, ptr %iv3, align 8
+ %cmp21 = icmp sgt i32 %3, %conv.i
+ %cmp.i67 = icmp eq i32 %3, 0
+ %or.i6979 = or i1 %cmp.i67, %cmp21
+ %iv5 = getelementptr i8, ptr %sptr, i64 16
+ %4 = load i32, ptr %iv5, align 8
+ %cmp26 = icmp slt i32 %conv.i, 0
+ %cmp.i72 = icmp eq i32 %4, 0
+ %or.i7480 = or i1 %cmp.i72, %cmp26
+ %and3183 = and i1 %or.i5977, %or.i6478
+ %and3284 = and i1 %and3183, %or.i6979
+ %and3385 = and i1 %and3284, %or.i7480
+ %and33 = zext i1 %and3385 to i32
+ ret i32 %and33
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
index b5a3c57414e784..acc04bece598a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll
@@ -94,17 +94,13 @@ define i1 @logical_or_fcmp(<4 x float> %x) {
define i1 @logical_and_icmp_diff_preds(<4 x i32> %x) {
; SSE-LABEL: @logical_and_icmp_diff_preds(
-; SSE-NEXT: [[X0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 0
-; SSE-NEXT: [[X2:%.*]] = extractelement <4 x i32> [[X]], i32 2
-; SSE-NEXT: [[C0:%.*]] = icmp ult i32 [[X0]], 0
-; SSE-NEXT: [[C2:%.*]] = icmp sgt i32 [[X2]], 0
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <2 x i32> <i32 3, i32 1>
-; SSE-NEXT: [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
-; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; SSE-NEXT: [[S1:%.*]] = select i1 [[C0]], i1 [[TMP3]], i1 false
-; SSE-NEXT: [[S2:%.*]] = select i1 [[S1]], i1 [[C2]], i1 false
-; SSE-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; SSE-NEXT: [[S3:%.*]] = select i1 [[S2]], i1 [[TMP4]], i1 false
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 poison>, <4 x i32> <i32 1, i32 3, i32 6, i32 0>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = icmp ult <4 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[TMP6:%.*]] = freeze <4 x i1> [[TMP5]]
+; SSE-NEXT: [[S3:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP6]])
; SSE-NEXT: ret i1 [[S3]]
;
; AVX-LABEL: @logical_and_icmp_diff_preds(
@@ -391,17 +387,28 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) {
}
define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) {
-; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
-; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
-; CHECK-NEXT: ret i1 [[OP_RDX]]
+; SSE-LABEL: @logical_and_icmp_clamp_pred_diff(
+; SSE-NEXT: [[TMP1:%.*]] = icmp slt <4 x i32> [[X:%.*]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP2:%.*]] = icmp ult <4 x i32> [[X]], <i32 42, i32 42, i32 42, i32 42>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[X]], <i32 17, i32 17, i32 17, i32 17>
+; SSE-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
+; SSE-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP5]])
+; SSE-NEXT: [[TMP7:%.*]] = freeze <4 x i1> [[TMP3]]
+; SSE-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP7]])
+; SSE-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP6]], i1 [[TMP8]], i1 false
+; SSE-NEXT: ret i1 [[OP_RDX]]
+;
+; AVX-LABEL: @logical_and_icmp_clamp_pred_diff(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 2, i32 3>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 42, i32 42, i32 42, i32 poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 3>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> <i32 17, i32 17, i32 17, i32 17, i32 poison, i32 poison, i32 poison, i32 42>, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
+; AVX-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]]
+; AVX-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]])
+; AVX-NEXT: ret i1 [[TMP8]]
;
%x0 = extractelement <4 x i32> %x, i32 0
%x1 = extractelement <4 x i32> %x, i32 1
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not really qualified to review this, but I can confirm that this change also fixes my original (non-reduced) test-case.
if (!I) { | ||
UniquePreds.clear(); | ||
UniqueNonSwappedPreds.clear(); | ||
break; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why not just return false?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, remains from the initial version of the patch, will fix.
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
/cherry-pick cd29126 |
Error: Command failed due to missing milestone. |
@AtariDreams #87133 indicates this has been broken since llvm15 - we don't cherry pick to the current release branch after its first release unless the fix is for a regression from the previous release only (17.x for 18.x etc.). |
If the node has cmp instruction with 3 or more different but swappable
predicates, need to keep same kind of main/alternate opcodes to avoid
incorrect detection of opcodes after reordering. Reordering changes the
order and we may erroneously consider swappable opcodes as
non-compatible/alternate, which may lead to a later compiler crash.