update P7 32-bit partial vector load cost #108261

RolandF77 · 2024-09-11T18:07:34Z

Update cost model to reflect codegen change to use lfiwzx from #104507.

llvmbot · 2024-09-11T18:08:05Z

@llvm/pr-subscribers-llvm-analysis

Author: None (RolandF77)

Changes

Update cost model to reflect codegen change to use lfiwzx from #104507.

Full diff: https://github.com/llvm/llvm-project/pull/108261.diff

2 Files Affected:

(modified) llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp (+10-4)
(modified) llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll (+5-2)

diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b7bdbeb535d526..df0047022a2c04 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -802,12 +802,18 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   // explicitly check this case. There are also corresponding store
   // instructions.
   unsigned MemBytes = Src->getPrimitiveSizeInBits();
-  if (ST->hasVSX() && IsAltivecType &&
-      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
-    return 1;
+  Align AlignBytes = Alignment ? *Alignment : Align(1);
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (ST->hasVSX() && IsAltivecType) {
+    if (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32))
+      return 1;
+    // Use lfiwax/xxspltw
+    if (Opcode == Instruction::Load && MemBytes == 32)
+      if (AlignBytes < SrcBytes || Cost > 2)
+        return 2;
+  }
 
   // Aligned loads and stores are easy.
-  unsigned SrcBytes = LT.second.getStoreSize();
   if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
     return Cost;
 
diff --git a/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll b/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
index 54cafa0ae59f39..0e7e89c18c1cba 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
@@ -1,18 +1,21 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck --check-prefix=P7 %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
 define i32 @loads(i32 %arg) {
   ; CHECK: cost of 1 {{.*}} load
+  ; P7: cost of 2 {{.*}} load
   load <4 x i8>, ptr undef, align 1
 
-  ; CHECK: cost of 1 {{.*}} load
+  ; CHECK, P7: cost of 1 {{.*}} load
   load <8 x i8>, ptr undef, align 1
 
   ; CHECK: cost of 1 {{.*}} load
+  ; P7: cost of 2 {{.*}} load
   load <2 x i16>, ptr undef, align 2
 
-  ; CHECK: cost of 1 {{.*}} load
+  ; CHECK, P7: cost of 1 {{.*}} load
   load <4 x i16>, ptr undef, align 2
 
   ret i32 undef

llvmbot · 2024-09-11T18:08:06Z

@llvm/pr-subscribers-backend-powerpc

Author: None (RolandF77)

Changes

Update cost model to reflect codegen change to use lfiwzx from #104507.

Full diff: https://github.com/llvm/llvm-project/pull/108261.diff

2 Files Affected:

(modified) llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp (+10-4)
(modified) llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll (+5-2)

diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index b7bdbeb535d526..df0047022a2c04 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -802,12 +802,18 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
   // explicitly check this case. There are also corresponding store
   // instructions.
   unsigned MemBytes = Src->getPrimitiveSizeInBits();
-  if (ST->hasVSX() && IsAltivecType &&
-      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
-    return 1;
+  Align AlignBytes = Alignment ? *Alignment : Align(1);
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (ST->hasVSX() && IsAltivecType) {
+    if (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32))
+      return 1;
+    // Use lfiwax/xxspltw
+    if (Opcode == Instruction::Load && MemBytes == 32)
+      if (AlignBytes < SrcBytes || Cost > 2)
+        return 2;
+  }
 
   // Aligned loads and stores are easy.
-  unsigned SrcBytes = LT.second.getStoreSize();
   if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
     return Cost;
 
diff --git a/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll b/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
index 54cafa0ae59f39..0e7e89c18c1cba 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll
@@ -1,18 +1,21 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck --check-prefix=P7 %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
 define i32 @loads(i32 %arg) {
   ; CHECK: cost of 1 {{.*}} load
+  ; P7: cost of 2 {{.*}} load
   load <4 x i8>, ptr undef, align 1
 
-  ; CHECK: cost of 1 {{.*}} load
+  ; CHECK, P7: cost of 1 {{.*}} load
   load <8 x i8>, ptr undef, align 1
 
   ; CHECK: cost of 1 {{.*}} load
+  ; P7: cost of 2 {{.*}} load
   load <2 x i16>, ptr undef, align 2
 
-  ; CHECK: cost of 1 {{.*}} load
+  ; CHECK, P7: cost of 1 {{.*}} load
   load <4 x i16>, ptr undef, align 2
 
   ret i32 undef

chenzheng1030 · 2024-09-13T23:26:14Z

llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

+      return 1;
+    // Use lfiwax/xxspltw
+    if (Opcode == Instruction::Load && MemBytes == 32)
+      if (AlignBytes < SrcBytes || Cost > 2)


The logic `if (Cost > 2) return 2;` does not seem ideal. Can we instead exclude this case at the place where Cost is set to a value greater than 2?

Cost check removed. It was left over from an earlier version of the change.

diggerlin · 2024-09-17T21:02:51Z

llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

-  if (ST->hasVSX() && IsAltivecType &&
-      (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32)))
-    return 1;
+  Align AlignBytes = Alignment ? *Alignment : Align(1);


We can move the definition of the variable so that it sits immediately before the statement
if (Opcode == Instruction::Load && MemBytes == 32 && AlignBytes < SrcBytes)

diggerlin · 2024-09-18T13:31:21Z

llvm/test/Analysis/CostModel/PowerPC/vsr_load_32_64.ll

 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"

 define i32 @loads(i32 %arg) {
  ; CHECK: cost of 1 {{.*}} load
+  ; P7: cost of 2 {{.*}} load


I would change the RUN lines to

opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s -DCOST=1

opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s -DCOST=2

and change

; CHECK: cost of 1 {{.*}} load
; P7: cost of 2 {{.*}} load

to

; CHECK: cost of [[COST]] {{.*}} load

It is only a suggestion; feel free to keep the current form if you do not want to modify it.

diggerlin · 2024-09-18T19:44:21Z

llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

-    return 1;
+  unsigned SrcBytes = LT.second.getStoreSize();
+  if (ST->hasVSX() && IsAltivecType) {
+    if (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32))


nit: this may not be related to the patch.

I think the variable MemBytes should be named MemBits; otherwise it may cause misunderstanding (the condition below looks like a comparison against 64 bytes and 32 bytes):

if (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32))

Could we rename the variable in this patch while we are at it?

diggerlin · 2024-09-18T19:53:11Z

llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp

+
+    // Use lfiwax/xxspltw
+    Align AlignBytes = Alignment ? *Alignment : Align(1);
+    if (Opcode == Instruction::Load && MemBytes == 32 && AlignBytes < SrcBytes)


I am curious: why is the AlignBytes < SrcBytes check needed here?

If a partial vector (< 128 bits) is being loaded with a full vector aligned address (>= 128 bits), the load will be done as a full vector load since we know from alignment that it is safe. Therefore the cost of a partial vector load does not apply.

thanks for explaining.

diggerlin

I do not have any further comments. Thanks for addressing the comments.

amy-kwan

I think overall LGTM. I have one question I was wondering about.

Is it more accurate to update the title so that it doesn't mention v4i8 specifically? For example, doesn't v2i16 also count?

Update cost model to reflect codegen change to use lfiwzx for 32-bit partial vector loads on pwr7 with llvm#104507.

RolandF77 self-assigned this Sep 11, 2024

llvmbot added backend:PowerPC llvm:analysis labels Sep 11, 2024

update P7 v4i8 load cost

3677531

RolandF77 requested review from chenzheng1030, diggerlin, amy-kwan and stefanp-ibm September 11, 2024 21:59

chenzheng1030 reviewed Sep 13, 2024

View reviewed changes

diggerlin reviewed Sep 17, 2024

View reviewed changes

RolandF77 added 2 commits September 17, 2024 21:06

address comments

7103010

move decl

a6bf81b

diggerlin reviewed Sep 18, 2024

View reviewed changes

update test

f004e2b

diggerlin reviewed Sep 18, 2024

View reviewed changes

MemBits

e44e082

diggerlin approved these changes Oct 1, 2024

View reviewed changes

amy-kwan approved these changes Oct 2, 2024

View reviewed changes

RolandF77 changed the title ~~update P7 v4i8 load cost~~ update P7 32-bit partial vector load cost Oct 2, 2024

RolandF77 merged commit 06c8210 into llvm:main Oct 3, 2024
8 checks passed

xgupta pushed a commit to xgupta/llvm-project that referenced this pull request Oct 4, 2024

update P7 32-bit partial vector load cost (llvm#108261)

090aeae

Update cost model to reflect codegen change to use lfiwzx for 32-bit partial vector loads on pwr7 with llvm#104507.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

update P7 32-bit partial vector load cost #108261

update P7 32-bit partial vector load cost #108261

RolandF77 commented Sep 11, 2024

llvmbot commented Sep 11, 2024

llvmbot commented Sep 11, 2024

chenzheng1030 Sep 13, 2024

RolandF77 Sep 17, 2024

diggerlin Sep 17, 2024 •

edited

Loading

diggerlin Sep 18, 2024 •

edited

Loading

diggerlin Sep 18, 2024 •

edited

Loading

diggerlin Sep 18, 2024

RolandF77 Sep 18, 2024 •

edited

Loading

diggerlin Oct 1, 2024

diggerlin left a comment •

edited

Loading

amy-kwan left a comment

update P7 32-bit partial vector load cost #108261

update P7 32-bit partial vector load cost #108261

Conversation

RolandF77 commented Sep 11, 2024

llvmbot commented Sep 11, 2024

llvmbot commented Sep 11, 2024

chenzheng1030 Sep 13, 2024

Choose a reason for hiding this comment

RolandF77 Sep 17, 2024

Choose a reason for hiding this comment

diggerlin Sep 17, 2024 • edited Loading

Choose a reason for hiding this comment

diggerlin Sep 18, 2024 • edited Loading

Choose a reason for hiding this comment

diggerlin Sep 18, 2024 • edited Loading

Choose a reason for hiding this comment

diggerlin Sep 18, 2024

Choose a reason for hiding this comment

RolandF77 Sep 18, 2024 • edited Loading

Choose a reason for hiding this comment

diggerlin Oct 1, 2024

Choose a reason for hiding this comment

diggerlin left a comment • edited Loading

Choose a reason for hiding this comment

amy-kwan left a comment

Choose a reason for hiding this comment

diggerlin Sep 17, 2024 •

edited

Loading

diggerlin Sep 18, 2024 •

edited

Loading

diggerlin Sep 18, 2024 •

edited

Loading

RolandF77 Sep 18, 2024 •

edited

Loading

diggerlin left a comment •

edited

Loading