Skip to content

Commit

Permalink
[AMDGPU][PromoteAlloca] Support memsets to ptr allocas (#80678)
Browse files Browse the repository at this point in the history
Fixes #80366
  • Loading branch information
Pierre-vh authored Feb 5, 2024
1 parent ff9af4c commit 4e958ab
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 4 deletions.
16 changes: 12 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,10 +521,18 @@ static Value *promoteAllocaUserToVector(
// For memset, we don't need to know the previous value because we
// currently only allow memsets that cover the whole alloca.
Value *Elt = MSI->getOperand(1);
if (DL.getTypeStoreSize(VecEltTy) > 1) {
Value *EltBytes =
Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
if (BytesPerElt > 1) {
Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);

// If the element type of the vector is a pointer, we need to first cast
// to an integer, then use an IntToPtr cast (a vector of bytes cannot be
// bitcast directly to a pointer type).
if (VecEltTy->isPointerTy()) {
Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
Elt = Builder.CreateBitCast(EltBytes, PtrInt);
Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
} else
Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
}

return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
Expand Down
54 changes: 54 additions & 0 deletions llvm/test/CodeGen/AMDGPU/promote-alloca-memset.ll
Original file line number Diff line number Diff line change
Expand Up @@ -84,4 +84,58 @@ entry:
ret void
}

; Positive test: a zero memset over an array-of-pointers alloca.
; The input memsets 48 bytes of an addrspace(5) `[6 x ptr]` alloca to 0
; (presumably the whole object, i.e. six 64-bit pointers -- confirm against
; the target data layout), then loads the first i64 and stores it to %out.
; The expected output contains no alloca, memset or load: only a store of
; the constant 0 remains.
define amdgpu_kernel void @memset_array_ptr_alloca(ptr %out) {
; CHECK-LABEL: @memset_array_ptr_alloca(
; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8
; CHECK-NEXT: ret void
;
%alloca = alloca [6 x ptr], align 16, addrspace(5)
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
%load = load i64, ptr addrspace(5) %alloca
store i64 %load, ptr %out
ret void
}

; Positive test: same shape as the array-of-pointers case, but the alloca is
; already a vector type (`<6 x ptr>`). A 48-byte zero memset is followed by
; an i64 load from the start of the alloca and a store to %out.
; The expected output is just a store of the constant 0; the alloca, memset
; and load are all gone.
define amdgpu_kernel void @memset_vector_ptr_alloca(ptr %out) {
; CHECK-LABEL: @memset_vector_ptr_alloca(
; CHECK-NEXT: store i64 0, ptr [[OUT:%.*]], align 8
; CHECK-NEXT: ret void
;
%alloca = alloca <6 x ptr>, align 16, addrspace(5)
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
%load = load i64, ptr addrspace(5) %alloca
store i64 %load, ptr %out
ret void
}

; Negative test: the alloca is a nested array (`[2 x [3 x ptr]]`).
; The expected output is identical to the input -- the alloca, the memset
; and the load all remain -- so this alloca is left untouched.
; NOTE(review): presumably nested aggregates are not eligible for promotion
; to a vector; confirm against the pass's type checks.
define amdgpu_kernel void @memset_array_of_array_ptr_alloca(ptr %out) {
; CHECK-LABEL: @memset_array_of_array_ptr_alloca(
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x [3 x ptr]], align 16, addrspace(5)
; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false)
; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8
; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8
; CHECK-NEXT: ret void
;
%alloca = alloca [2 x [3 x ptr]], align 16, addrspace(5)
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
%load = load i64, ptr addrspace(5) %alloca
store i64 %load, ptr %out
ret void
}

; Negative test: the alloca is an array of pointer vectors
; (`[2 x <3 x ptr>]`). As with the array-of-arrays case, the expected output
; keeps the alloca, memset and load exactly as written in the input.
; NOTE(review): presumably an array whose element is itself a vector is not
; eligible for promotion; confirm against the pass's type checks.
define amdgpu_kernel void @memset_array_of_vec_ptr_alloca(ptr %out) {
; CHECK-LABEL: @memset_array_of_vec_ptr_alloca(
; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x ptr>], align 16, addrspace(5)
; CHECK-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) [[ALLOCA]], i8 0, i64 48, i1 false)
; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr addrspace(5) [[ALLOCA]], align 8
; CHECK-NEXT: store i64 [[LOAD]], ptr [[OUT:%.*]], align 8
; CHECK-NEXT: ret void
;
%alloca = alloca [2 x <3 x ptr>], align 16, addrspace(5)
call void @llvm.memset.p5.i64(ptr addrspace(5) %alloca, i8 0, i64 48, i1 false)
%load = load i64, ptr addrspace(5) %alloca
store i64 %load, ptr %out
ret void
}

; Memset intrinsic called by the tests in this file: destination pointer in
; addrspace(5), i8 fill value, i64 byte count, and an immediate i1 flag.
declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)

0 comments on commit 4e958ab

Please sign in to comment.