-
Notifications
You must be signed in to change notification settings - Fork 12.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Consider histcnt smaller than i32 in the cost model #108521
Conversation
This PR updates the AArch64 cost model to consider the cheaper cost of <i32 histograms, reflecting the improvements from llvm#101017 and llvm#103037. Work by Max Beck-Jones (@DevM-uk).
@llvm/pr-subscribers-llvm-analysis Author: Sam Tebbs (SamTebbs33) Changes: This PR updates the AArch64 cost model to consider the cheaper cost of <i32 histograms, reflecting the improvements from llvm#101017 and llvm#103037. Work by Max Beck-Jones (@DevM-uk). Full diff: https://github.com/llvm/llvm-project/pull/108521.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 58c267f1ce4bd6..83b5344fc8ed24 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -517,25 +517,31 @@ static bool isUnpackedVectorVT(EVT VecVT) {
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
+ unsigned TotalHistCnts = 1;
- // Only allow (32b and 64b) integers or pointers for now...
+ // Only allow (up to 64b) integers or pointers
if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
- (EltTy->getScalarSizeInBits() != 32 &&
- EltTy->getScalarSizeInBits() != 64))
+ EltTy->getScalarSizeInBits() > 64)
return InstructionCost::getInvalid();
- // FIXME: Hacky check for legal vector types. We can promote smaller types
- // but we cannot legalize vectors via splitting for histcnt.
// FIXME: We should be able to generate histcnt for fixed-length vectors
// using ptrue with a specific VL.
- if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
- if ((VTy->getElementCount().getKnownMinValue() != 2 &&
- VTy->getElementCount().getKnownMinValue() != 4) ||
- VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
- !VTy->isScalableTy())
+ if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
+ unsigned EC = VTy->getElementCount().getKnownMinValue();
+ if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
return InstructionCost::getInvalid();
- return InstructionCost(BaseHistCntCost);
+ bool Element64b = EltTy->isIntegerTy(64);
+
+ if (EC == 2 || (!Element64b && EC == 4))
+ return InstructionCost(BaseHistCntCost);
+
+ unsigned NaturalVectorWidth = Element64b ? AArch64::SVEBitsPerBlock / 64
+ : AArch64::SVEBitsPerBlock / 32;
+ TotalHistCnts = EC / NaturalVectorWidth;
+ }
+
+ return InstructionCost(BaseHistCntCost * TotalHistCnts);
}
InstructionCost
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index aede9c89843128..1ecd02e5c124a6 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -971,26 +971,26 @@ define void @histogram_nxv4i32(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %m
ret void
}
-define void @histogram_nxv8i16(<vscale x 8 x ptr> %buckets, <vscale x 8 x i1> %mask) {
+define void @histogram_nxv8i16(<vscale x 8 x ptr> %buckets, <vscale x 8 x i1> %mask) #3 {
; CHECK-LABEL: 'histogram_nxv8i16'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'histogram_nxv8i16'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
ret void
}
-define void @histogram_nxv16i8(<vscale x 16 x ptr> %buckets, <vscale x 16 x i1> %mask) {
+define void @histogram_nxv16i8(<vscale x 16 x ptr> %buckets, <vscale x 16 x i1> %mask) #3 {
; CHECK-LABEL: 'histogram_nxv16i8'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'histogram_nxv16i8'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call void @llvm.experimental.vector.histogram.add.nxv16p0.i64(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
@@ -1049,13 +1049,13 @@ define void @histogram_v16i8(<16 x ptr> %buckets, <16 x i1> %mask) {
ret void
}
-define void @histogram_nxv4i64(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %mask) {
+define void @histogram_nxv4i64(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %mask) #3 {
; CHECK-LABEL: 'histogram_nxv4i64'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'histogram_nxv4i64'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
|
@llvm/pr-subscribers-backend-aarch64 Author: Sam Tebbs (SamTebbs33) Changes: This PR updates the AArch64 cost model to consider the cheaper cost of <i32 histograms, reflecting the improvements from llvm#101017 and llvm#103037. Work by Max Beck-Jones (@DevM-uk). Full diff: https://github.com/llvm/llvm-project/pull/108521.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 58c267f1ce4bd6..83b5344fc8ed24 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -517,25 +517,31 @@ static bool isUnpackedVectorVT(EVT VecVT) {
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA) {
Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
+ unsigned TotalHistCnts = 1;
- // Only allow (32b and 64b) integers or pointers for now...
+ // Only allow (up to 64b) integers or pointers
if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
- (EltTy->getScalarSizeInBits() != 32 &&
- EltTy->getScalarSizeInBits() != 64))
+ EltTy->getScalarSizeInBits() > 64)
return InstructionCost::getInvalid();
- // FIXME: Hacky check for legal vector types. We can promote smaller types
- // but we cannot legalize vectors via splitting for histcnt.
// FIXME: We should be able to generate histcnt for fixed-length vectors
// using ptrue with a specific VL.
- if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
- if ((VTy->getElementCount().getKnownMinValue() != 2 &&
- VTy->getElementCount().getKnownMinValue() != 4) ||
- VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
- !VTy->isScalableTy())
+ if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
+ unsigned EC = VTy->getElementCount().getKnownMinValue();
+ if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
return InstructionCost::getInvalid();
- return InstructionCost(BaseHistCntCost);
+ bool Element64b = EltTy->isIntegerTy(64);
+
+ if (EC == 2 || (!Element64b && EC == 4))
+ return InstructionCost(BaseHistCntCost);
+
+ unsigned NaturalVectorWidth = Element64b ? AArch64::SVEBitsPerBlock / 64
+ : AArch64::SVEBitsPerBlock / 32;
+ TotalHistCnts = EC / NaturalVectorWidth;
+ }
+
+ return InstructionCost(BaseHistCntCost * TotalHistCnts);
}
InstructionCost
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index aede9c89843128..1ecd02e5c124a6 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -971,26 +971,26 @@ define void @histogram_nxv4i32(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %m
ret void
}
-define void @histogram_nxv8i16(<vscale x 8 x ptr> %buckets, <vscale x 8 x i1> %mask) {
+define void @histogram_nxv8i16(<vscale x 8 x ptr> %buckets, <vscale x 8 x i1> %mask) #3 {
; CHECK-LABEL: 'histogram_nxv8i16'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'histogram_nxv8i16'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 1, <vscale x 8 x i1> %mask)
ret void
}
-define void @histogram_nxv16i8(<vscale x 16 x ptr> %buckets, <vscale x 16 x i1> %mask) {
+define void @histogram_nxv16i8(<vscale x 16 x ptr> %buckets, <vscale x 16 x i1> %mask) #3 {
; CHECK-LABEL: 'histogram_nxv16i8'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'histogram_nxv16i8'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call void @llvm.experimental.vector.histogram.add.nxv16p0.i64(<vscale x 16 x ptr> %buckets, i8 1, <vscale x 16 x i1> %mask)
@@ -1049,13 +1049,13 @@ define void @histogram_v16i8(<16 x ptr> %buckets, <16 x i1> %mask) {
ret void
}
-define void @histogram_nxv4i64(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %mask) {
+define void @histogram_nxv4i64(<vscale x 4 x ptr> %buckets, <vscale x 4 x i1> %mask) #3 {
; CHECK-LABEL: 'histogram_nxv4i64'
-; CHECK-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
; TYPE_BASED_ONLY-LABEL: 'histogram_nxv4i64'
-; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
+; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 1, <vscale x 4 x i1> %mask)
|
|
||
// Only allow (32b and 64b) integers or pointers for now... | ||
// Only allow (up to 64b) integers or pointers |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd suggest adding a variable unsigned EltSize = EltTy->getScalarSizeInBits();
and using it both in the initial size check and the total size check below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
return InstructionCost::getInvalid(); | ||
|
||
return InstructionCost(BaseHistCntCost); | ||
bool Element64b = EltTy->isIntegerTy(64); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using the `EltSize` suggestion from above:
bool Element64b = EltTy->isIntegerTy(64); | |
// HistCnt only supports 32b and 64b element types. | |
unsigned LegalEltSize = EltSize <= 32 ? 32 : 64; | |
if (EC == 2 || (LegalEltSize == 32 && EC == 4)) | |
return InstructionCost(BaseHistCntCost); | |
unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize; |
✅ With the latest revision this PR passed the C/C++ code formatter. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, though you'll need to fix the formatting :)
I think this caused a new build failure:
|
Thanks Aaron, I'll commit a fix. |
This fixes a build failure caused by #108521
…08521) This PR updates the AArch64 cost model to consider the cheaper cost of <i32 histograms to reflect the improvements from llvm#101017 and llvm#103037 Work by Max Beck-Jones (@DevM-uk) --------- Co-authored-by: DevM-uk <max.beck-jones@arm.com>
This fixes a build failure caused by llvm#108521
This PR updates the AArch64 cost model to consider the cheaper cost of <i32 histograms, reflecting the improvements from
#101017 and #103037.
Work by Max Beck-Jones (@DevM-uk).