[NVPTX] Don't use stack memory when bitcasting to/from v2i8 #113928

peterbell10 · 2024-10-28T15:32:12Z

v2i8 is an unsupported type, so we hit the default legalization rules, which perform the bitcast through stack memory; this is very inefficient on GPU.

This adds a custom lowering where we pack v2i8 into i16 and from there use another bitcast node to reach the final desired type. It also adds the inverse: unpacking i16 into v2i8.

`v2i8` is an unsupported type, so we hit the default legalization rules, which perform the bitcast through stack memory; this is very inefficient on GPU. This adds a custom lowering where we pack `v2i8` into `i16` and from there use another bitcast node to reach the final desired type. It also adds the inverse: unpacking `i16` into `v2i8`.

github-actions · 2024-10-28T15:35:29Z

✅ With the latest revision this PR passed the C/C++ code formatter.

llvmbot · 2024-10-29T00:00:37Z

@llvm/pr-subscribers-backend-nvptx

Author: None (peterbell10)

Changes

v2i8 is an unsupported type, so we hit the default legalization rules, which perform the bitcast through stack memory; this is very inefficient on GPU.

This adds a custom lowering where we pack v2i8 into i16 and from there use another bitcast node to reach the final desired type. It also adds the inverse: unpacking i16 into v2i8.

Full diff: https://github.com/llvm/llvm-project/pull/113928.diff

3 Files Affected:

(modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+50)
(modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.h (+2)
(added) llvm/test/CodeGen/NVPTX/i8x2-instructions.ll (+36)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a95cba586b8fc3..050fbcfbcd8165 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -551,6 +551,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);
+
+  // Custom conversions to/from v2i8.
+  setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
+
   // Only logical ops can be done on v4i8 directly, others must be done
   // elementwise.
   setOperationAction(
@@ -2311,6 +2315,47 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
 }
 
+SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
+  // Handle bitcasting to/from v2i8 without hitting the default promotion
+  // strategy which goes through stack memory.
+  SDNode *Node = Op.getNode();
+  SDLoc dl(Node);
+
+  auto maybeBitcast = [&](EVT vt, SDValue val) {
+    if (val->getValueType(0) == vt) {
+      return val;
+    }
+    return DAG.getNode(ISD::BITCAST, dl, vt, val);
+  };
+
+  EVT VT = Op->getValueType(0);
+  EVT fromVT = Op->getOperand(0)->getValueType(0);
+
+  if (VT == MVT::v2i8) {
+    // Bitcast to i16 and unpack elements into a vector
+    SDValue reg = maybeBitcast(MVT::i16, Op->getOperand(0));
+    SDValue v0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, reg);
+    SDValue C8 = DAG.getConstant(8, dl, MVT::i16);
+    SDValue v1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                             DAG.getNode(ISD::SRL, dl, MVT::i16, {reg, C8}));
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i8, {v0, v1});
+  } else if (fromVT == MVT::v2i8) {
+    // Pack vector elements into i16 and bitcast to final type
+    SDValue v0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8,
+                             Op->getOperand(0), DAG.getIntPtrConstant(0, dl));
+    SDValue v1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8,
+                             Op->getOperand(0), DAG.getIntPtrConstant(1, dl));
+    SDValue E0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, v0);
+    SDValue E1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, v1);
+    SDValue C8 = DAG.getConstant(8, dl, MVT::i16);
+    SDValue reg =
+        DAG.getNode(ISD::OR, dl, MVT::i16,
+                    {E0, DAG.getNode(ISD::SHL, dl, MVT::i16, {E1, C8})});
+    return maybeBitcast(VT, reg);
+  }
+  return Op;
+}
+
 // We can init constant f16x2/v2i16/v4i8 with a single .b32 move.  Normally it
 // would get lowered as two constant loads and vector-packing move.
 // Instead we want just a constant move:
@@ -2818,6 +2863,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return Op;
   case ISD::BUILD_VECTOR:
     return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::BITCAST:
+    return LowerBITCAST(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return Op;
   case ISD::EXTRACT_VECTOR_ELT:
@@ -6413,6 +6460,9 @@ void NVPTXTargetLowering::ReplaceNodeResults(
   switch (N->getOpcode()) {
   default:
     report_fatal_error("Unhandled custom legalization");
+  case ISD::BITCAST:
+    Results.push_back(LowerBITCAST(SDValue(N, 0), DAG));
+    return;
   case ISD::LOAD:
     ReplaceLoadVector(N, DAG, Results);
     return;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 824a659671967a..13153f4830b695 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -616,6 +616,8 @@ class NVPTXTargetLowering : public TargetLowering {
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
 
+  SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
new file mode 100644
index 00000000000000..2f5d8cfed2b7b7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck  %s
+; RUN: %if ptxas %{                                                           \
+; RUN:   llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \
+; RUN:   | %ptxas-verify -arch=sm_90                                          \
+; RUN: %}
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_trunc_2xi8(
+; CHECK:      ld.param.u32 [[R1:%r[0-9]+]], [test_trunc_2xi8_param_0];
+; CHECK:      mov.b32 {[[RS1:%rs[0-9]+]], [[RS2:%rs[0-9]+]]}, [[R1]];
+; CHECK:      shl.b16 	[[RS3:%rs[0-9]+]], [[RS2]], 8;
+; CHECK:      and.b16  [[RS4:%rs[0-9]+]], [[RS1]], 255;
+; CHECK:      or.b16   [[RS5:%rs[0-9]+]], [[RS4]], [[RS3]]
+; CHECK:      cvt.u32.u16  [[R2:%r[0-9]]], [[RS5]]
+; CHECK:      st.param.b32  [func_retval0], [[R2]];
+define i16 @test_trunc_2xi8(<2 x i16> %a) #0 {
+  %trunc = trunc <2 x i16> %a to <2 x i8>
+  %res = bitcast <2 x i8> %trunc to i16
+  ret i16 %res
+}
+
+; CHECK-LABEL: test_zext_2xi8(
+; CHECK:      ld.param.u16  [[RS1:%rs[0-9]+]], [test_zext_2xi8_param_0];
+; CHECK:      shr.u16 	[[RS2:%rs[0-9]+]], [[RS1]], 8;
+; CHECK:      mov.b32  [[R1:%r[0-9]+]], {[[RS1]], [[RS2]]}
+; CHECK:      and.b32  [[R2:%r[0-9]+]], [[R1]], 16711935;
+; CHECK:      st.param.b32  [func_retval0], [[R2]];
+define <2 x i16> @test_zext_2xi8(i16 %a) #0 {
+  %vec = bitcast i16 %a to <2 x i8>
+  %ext = zext <2 x i8> %vec to <2 x i16>
+  ret <2 x i16> %ext
+}

justinfargnoli

Overall, thanks for this PR. The current lowering is definitely not what we want!

we hit the default legalization rules which perform the bitcast in stack memory

Do you know where this default legalization rule is implemented?

Note: mov vector-to-scalar (pack) or scalar-to-vector (unpack) doesn't support .b8.

justinfargnoli · 2024-10-29T05:41:15Z

llvm/test/CodeGen/NVPTX/i8x2-instructions.ll

@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \


Since we're already checking most of the PTX that is generated for each function, I'd recommend auto-generating the CHECK statements.

I haven't used that before, but I gave it a shot and it didn't generate any checks at all for some reason; perhaps I was doing something wrong. Not sure.

I think you need to delete all the existing CHECK statements first.

If that doesn't work (and the rest of the MR looks good), I'll just submit a follow-up patch that auto-generates the test.

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

llvm/test/CodeGen/NVPTX/i8x2-instructions.ll

peterbell10 · 2024-10-29T14:11:01Z

we hit the default legalization rules which perform the bitcast in stack memory

Do you know where this default legalization rule is implemented?

For bitcast to v2i8 we hit

llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

Lines 562 to 563 in 87b6ec3

    
           return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, 
        
                              CreateStackStoreLoad(InOp, OutVT));

And for bitcasts from v2i8 we hit

llvm-project/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

Line 2156 in 87b6ec3

return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

justinfargnoli

LGTM, pending the resolution of all conversations!

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Update commit to include llvm/llvm-project#113928

`v2i8` is an unsupported type, so we hit the default legalization rules, which perform the bitcast through stack memory; this is very inefficient on GPU. This adds a custom lowering where we pack `v2i8` into `i16` and from there use another bitcast node to reach the final desired type. It also adds the inverse: unpacking `i16` into `v2i8`.

Update commit to include llvm/llvm-project#113928

peterbell10 changed the title ~~[NVPTX] Don't use stack memory when bitcasting to/from 2xi8~~ [NVPTX] Don't use stack memory when bitcasting to/from v2i8 Oct 28, 2024

peterbell10 added 2 commits October 28, 2024 15:51

Formatting

b0aa1db

Fix lit test

fc9ec25

peterbell10 marked this pull request as ready for review October 29, 2024 00:00

llvmbot added the backend:NVPTX label Oct 29, 2024

justinfargnoli assigned peterbell10 Oct 29, 2024

justinfargnoli self-requested a review October 29, 2024 06:00

justinfargnoli reviewed Oct 29, 2024

View reviewed changes

Address review comments

6e16bb5

peterbell10 force-pushed the nvptx-v2i8-no-stack branch from fea7293 to 6e16bb5 Compare October 29, 2024 14:16

justinfargnoli reviewed Oct 29, 2024

View reviewed changes

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp Outdated Show resolved Hide resolved

Split ReplaceNodeResults path out from LowerBITCAST

b611165

justinfargnoli approved these changes Oct 29, 2024

View reviewed changes

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp Outdated Show resolved Hide resolved

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp Outdated Show resolved Hide resolved

Use guard clauses

be5bc92

peterbell10 mentioned this pull request Oct 31, 2024

Why triton convert to float8e5 will cause local memory read/write triton-lang/triton#4769

Closed

ThomasRaoux merged commit b74e588 into llvm:main Nov 1, 2024
8 checks passed

peterbell10 deleted the nvptx-v2i8-no-stack branch November 1, 2024 15:03

peterbell10 added a commit to triton-lang/triton that referenced this pull request Nov 1, 2024

Update LLVM hash

7c48648

Update commit to include llvm/llvm-project#113928

peterbell10 mentioned this pull request Nov 1, 2024

Update LLVM hash on llvm-head triton-lang/triton#5039

Merged

ThomasRaoux pushed a commit to triton-lang/triton that referenced this pull request Nov 1, 2024

Update LLVM hash on llvm-head (#5039)

d3d80f3

Update commit to include llvm/llvm-project#113928

chsigg pushed a commit to openxla/triton that referenced this pull request Nov 4, 2024

Update LLVM hash on llvm-head (triton-lang#5039)

f5abb2e

Update commit to include llvm/llvm-project#113928

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[NVPTX] Don't use stack memory when bitcasting to/from v2i8 #113928

[NVPTX] Don't use stack memory when bitcasting to/from v2i8 #113928

peterbell10 commented Oct 28, 2024 •

edited

Loading

github-actions bot commented Oct 28, 2024 •

edited

Loading

llvmbot commented Oct 29, 2024

justinfargnoli left a comment

justinfargnoli Oct 29, 2024

peterbell10 Oct 29, 2024

justinfargnoli Oct 29, 2024

peterbell10 commented Oct 29, 2024

justinfargnoli left a comment

		@@ -0,0 +1,36 @@
		; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 -asm-verbose=false \

[NVPTX] Don't use stack memory when bitcasting to/from v2i8 #113928

[NVPTX] Don't use stack memory when bitcasting to/from v2i8 #113928

Conversation

peterbell10 commented Oct 28, 2024 • edited Loading

github-actions bot commented Oct 28, 2024 • edited Loading

llvmbot commented Oct 29, 2024

justinfargnoli left a comment

Choose a reason for hiding this comment

justinfargnoli Oct 29, 2024

Choose a reason for hiding this comment

peterbell10 Oct 29, 2024

Choose a reason for hiding this comment

justinfargnoli Oct 29, 2024

Choose a reason for hiding this comment

peterbell10 commented Oct 29, 2024

justinfargnoli left a comment

Choose a reason for hiding this comment

peterbell10 commented Oct 28, 2024 •

edited

Loading

github-actions bot commented Oct 28, 2024 •

edited

Loading