[AMDGPU] fix SIPeepholeSDWA optimization for fp16 #109395

PankajDwivedi-25 · 2024-09-20T09:27:27Z

Without SDWA optimization
%124:vgpr_32 = V_LSHRREV_B32_e64 16, %121:vgpr_32, implicit $exec
%126:vgpr_32 = contract nofpexcept V_ADD_F16_e64 0, %121:vgpr_32, 0, %124:vgpr_32, 0, 0, implicit $mode, implicit $exec
%129:vgpr_32 = contract nofpexcept V_SUB_F16_e64 0, %121:vgpr_32, 0, %124:vgpr_32, 0, 0, implicit $mode, implicit $exec

With SDWA optimization
%124:vgpr_32 = V_LSHRREV_B32_e64 16, %121:vgpr_32, implicit $exec
%126:vgpr_32 = contract nofpexcept V_ADD_F16_sdwa 0, %121:vgpr_32, 0, %121:vgpr_32, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec
%129:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %121:vgpr_32, 0, %121:vgpr_32, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec

llvmbot · 2024-09-20T09:27:46Z

@llvm/pr-subscribers-backend-amdgpu

Author: Pankaj Dwivedi (PankajDwivedi-25)

Changes

Without SDWA optimization
%124:vgpr_32 = V_LSHRREV_B32_e64 16, %121:vgpr_32, implicit $exec
%126:vgpr_32 = contract nofpexcept V_ADD_F16_e64 0, %121:vgpr_32, 0, %124:vgpr_32, 0, 0, implicit $mode, implicit $exec
%129:vgpr_32 = contract nofpexcept V_SUB_F16_e64 0, %121:vgpr_32, 0, %124:vgpr_32, 0, 0, implicit $mode, implicit $exec

With SDWA optimization
%124:vgpr_32 = V_LSHRREV_B32_e64 16, %121:vgpr_32, implicit $exec
%126:vgpr_32 = contract nofpexcept V_ADD_F16_sdwa 0, %121:vgpr_32, 0, %121:vgpr_32, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec
%129:vgpr_32 = contract nofpexcept V_SUB_F16_sdwa 0, %121:vgpr_32, 0, %121:vgpr_32, 0, 0, 6, 0, 6, 5, implicit $mode, implicit $exec

Full diff: https://github.com/llvm/llvm-project/pull/109395.diff

1 Files Affected:

(added) llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll (+265)

diff --git a/llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll b/llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll
new file mode 100644
index 00000000000000..ffbb7f89de050b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll
@@ -0,0 +1,265 @@
+%struct.rocfft_complex = type { half, half }
+
+$_Z32real_post_process_kernel_inplaceI14rocfft_complexIDF16_ELb1EEvmmmPT_mPKS2_ = comdat any
+
+; Function Attrs: convergent inlinehint mustprogress nounwind
+define weak_odr hidden void @_Z32real_post_process_kernel_inplaceI14rocfft_complexIDF16_ELb1EEvmmmPT_mPKS2_(i64 noundef %0, i64 noundef %1, i64 noundef %2, ptr noundef %3, i64 noundef %4, ptr noundef %5) #2 comdat {
+  %7 = alloca i64, align 8, addrspace(5)
+  %8 = alloca i64, align 8, addrspace(5)
+  %9 = alloca i64, align 8, addrspace(5)
+  %10 = alloca ptr, align 8, addrspace(5)
+  %11 = alloca i64, align 8, addrspace(5)
+  %12 = alloca ptr, align 8, addrspace(5)
+  %13 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+  %14 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+  %15 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+  %16 = alloca double, align 8, addrspace(5)
+  %17 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+  %18 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+  %19 = alloca double, align 8, addrspace(5)
+  %20 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+  %21 = alloca %struct.rocfft_complex, align 2, addrspace(5)
+  %22 = addrspacecast ptr addrspace(5) %7 to ptr
+  %23 = addrspacecast ptr addrspace(5) %8 to ptr
+  %24 = addrspacecast ptr addrspace(5) %9 to ptr
+  %25 = addrspacecast ptr addrspace(5) %10 to ptr
+  %26 = addrspacecast ptr addrspace(5) %11 to ptr
+  %27 = addrspacecast ptr addrspace(5) %12 to ptr
+  %28 = addrspacecast ptr addrspace(5) %13 to ptr
+  %29 = addrspacecast ptr addrspace(5) %14 to ptr
+  %30 = addrspacecast ptr addrspace(5) %15 to ptr
+  %31 = addrspacecast ptr addrspace(5) %16 to ptr
+  %32 = addrspacecast ptr addrspace(5) %17 to ptr
+  %33 = addrspacecast ptr addrspace(5) %18 to ptr
+  %34 = addrspacecast ptr addrspace(5) %19 to ptr
+  %35 = addrspacecast ptr addrspace(5) %20 to ptr
+  %36 = addrspacecast ptr addrspace(5) %21 to ptr
+  store i64 %0, ptr %22, align 8, !tbaa !6
+  store i64 %1, ptr %23, align 8, !tbaa !6
+  store i64 %2, ptr %24, align 8, !tbaa !6
+  store ptr %3, ptr %25, align 8, !tbaa !10
+  store i64 %4, ptr %26, align 8, !tbaa !6
+  store ptr %5, ptr %27, align 8, !tbaa !10
+  %37 = load i64, ptr %22, align 8, !tbaa !6
+  %38 = load i64, ptr %24, align 8, !tbaa !6
+  br label %40
+
+40:                                               ; preds = %6
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %13) #4
+  %41 = load ptr, ptr %25, align 8, !tbaa !10
+  %42 = load i64, ptr %26, align 8, !tbaa !6
+  %43 = load i64, ptr %22, align 8, !tbaa !6
+  %44 = add i64 %42, %43
+  %45 = getelementptr inbounds %struct.rocfft_complex, ptr %41, i64 %44
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %28, ptr align 2 %45, i64 4, i1 false), !tbaa.struct !12
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %14) #4
+  %46 = load ptr, ptr %25, align 8, !tbaa !10
+  %47 = load i64, ptr %26, align 8, !tbaa !6
+  %48 = load i64, ptr %23, align 8, !tbaa !6
+  %49 = add i64 %47, %48
+  %50 = getelementptr inbounds %struct.rocfft_complex, ptr %46, i64 %49
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %29, ptr align 2 %50, i64 4, i1 false), !tbaa.struct !12
+  %51 = load i64, ptr %22, align 8, !tbaa !6
+  %52 = icmp eq i64 %51, 0
+  br i1 %52, label %53, label %102
+
+53:                                               ; preds = %40
+  %54 = getelementptr inbounds %struct.rocfft_complex, ptr %28, i32 0, i32 0
+  %55 = load half, ptr %54, align 2, !tbaa !15
+  %56 = getelementptr inbounds %struct.rocfft_complex, ptr %28, i32 0, i32 1
+  %57 = load half, ptr %56, align 2, !tbaa !17
+  %58 = fadd contract half %55, %57
+  %59 = load ptr, ptr %25, align 8, !tbaa !10
+  %60 = load i64, ptr %26, align 8, !tbaa !6
+  %61 = load i64, ptr %22, align 8, !tbaa !6
+  %62 = add i64 %60, %61
+  %63 = getelementptr inbounds %struct.rocfft_complex, ptr %59, i64 %62
+  %64 = getelementptr inbounds %struct.rocfft_complex, ptr %63, i32 0, i32 0
+  store half %58, ptr %64, align 2, !tbaa !15
+  %65 = load ptr, ptr %25, align 8, !tbaa !10
+  %66 = load i64, ptr %26, align 8, !tbaa !6
+  %67 = load i64, ptr %22, align 8, !tbaa !6
+  %68 = add i64 %66, %67
+  %69 = getelementptr inbounds %struct.rocfft_complex, ptr %65, i64 %68
+  %70 = getelementptr inbounds %struct.rocfft_complex, ptr %69, i32 0, i32 1
+  store half 0xH0000, ptr %70, align 2, !tbaa !17
+  %71 = getelementptr inbounds %struct.rocfft_complex, ptr %28, i32 0, i32 0
+  %72 = load half, ptr %71, align 2, !tbaa !15
+  %73 = getelementptr inbounds %struct.rocfft_complex, ptr %28, i32 0, i32 1
+  %74 = load half, ptr %73, align 2, !tbaa !17
+  %75 = fsub contract half %72, %74
+  %76 = load ptr, ptr %25, align 8, !tbaa !10
+  %77 = load i64, ptr %26, align 8, !tbaa !6
+  %78 = load i64, ptr %23, align 8, !tbaa !6
+  %79 = add i64 %77, %78
+  %80 = getelementptr inbounds %struct.rocfft_complex, ptr %76, i64 %79
+  %81 = getelementptr inbounds %struct.rocfft_complex, ptr %80, i32 0, i32 0
+  store half %75, ptr %81, align 2, !tbaa !15
+  %82 = load ptr, ptr %25, align 8, !tbaa !10
+  %83 = load i64, ptr %26, align 8, !tbaa !6
+  %84 = load i64, ptr %23, align 8, !tbaa !6
+  %85 = add i64 %83, %84
+  %86 = getelementptr inbounds %struct.rocfft_complex, ptr %82, i64 %85
+  %87 = getelementptr inbounds %struct.rocfft_complex, ptr %86, i32 0, i32 1
+  store half 0xH0000, ptr %87, align 2, !tbaa !17
+  %88 = load ptr, ptr %25, align 8, !tbaa !10
+  %89 = load i64, ptr %26, align 8, !tbaa !6
+  %90 = load i64, ptr %24, align 8, !tbaa !6
+  %91 = add i64 %89, %90
+  %92 = getelementptr inbounds %struct.rocfft_complex, ptr %88, i64 %91
+  %93 = getelementptr inbounds %struct.rocfft_complex, ptr %92, i32 0, i32 1
+  %94 = load half, ptr %93, align 2, !tbaa !17
+  %95 = fneg contract half %94
+  %96 = load ptr, ptr %25, align 8, !tbaa !10
+  %97 = load i64, ptr %26, align 8, !tbaa !6
+  %98 = load i64, ptr %24, align 8, !tbaa !6
+  %99 = add i64 %97, %98
+  %100 = getelementptr inbounds %struct.rocfft_complex, ptr %96, i64 %99
+  %101 = getelementptr inbounds %struct.rocfft_complex, ptr %100, i32 0, i32 1
+  store half %95, ptr %101, align 2, !tbaa !17
+  ret void
+
+102:                                              ; preds = %40
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %15) #4
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %16) #4
+  store double 5.000000e-01, ptr %31, align 8, !tbaa !18
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %17) #4
+  store i32 0, ptr %32, align 2
+  store i32 0, ptr %30, align 2
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %17) #4
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %16) #4
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %18) #4
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %19) #4
+  store double 5.000000e-01, ptr %34, align 8, !tbaa !18
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %20) #4
+  store i32 0, ptr %35, align 2
+  store i32 0, ptr %33, align 2
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %20) #4
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %19) #4
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %21) #4
+  %107 = load ptr, ptr %27, align 8, !tbaa !10
+  %108 = load i64, ptr %22, align 8, !tbaa !6
+  %109 = getelementptr inbounds %struct.rocfft_complex, ptr %107, i64 %108
+  call void @llvm.memcpy.p0.p0.i64(ptr align 2 %36, ptr align 2 %109, i64 4, i1 false), !tbaa.struct !12
+  %110 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 0
+  %111 = load half, ptr %110, align 2, !tbaa !15
+  %112 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 0
+  %113 = load half, ptr %112, align 2, !tbaa !15
+  %114 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 1
+  %115 = load half, ptr %114, align 2, !tbaa !17
+  %116 = fmul contract half %113, %115
+  %117 = fadd contract half %111, %116
+  %118 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 1
+  %119 = load half, ptr %118, align 2, !tbaa !17
+  %120 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 0
+  %121 = load half, ptr %120, align 2, !tbaa !15
+  %122 = fmul contract half %119, %121
+  %123 = fadd contract half %117, %122
+  %124 = load ptr, ptr %25, align 8, !tbaa !10
+  %125 = load i64, ptr %26, align 8, !tbaa !6
+  %126 = load i64, ptr %22, align 8, !tbaa !6
+  %127 = add i64 %125, %126
+  %128 = getelementptr inbounds %struct.rocfft_complex, ptr %124, i64 %127
+  %129 = getelementptr inbounds %struct.rocfft_complex, ptr %128, i32 0, i32 0
+  store half %123, ptr %129, align 2, !tbaa !15
+  %130 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 1
+  %131 = load half, ptr %130, align 2, !tbaa !17
+  %132 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 1
+  %133 = load half, ptr %132, align 2, !tbaa !17
+  %134 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 1
+  %135 = load half, ptr %134, align 2, !tbaa !17
+  %136 = fmul contract half %133, %135
+  %137 = fadd contract half %131, %136
+  %138 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 0
+  %139 = load half, ptr %138, align 2, !tbaa !15
+  %140 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 0
+  %141 = load half, ptr %140, align 2, !tbaa !15
+  %142 = fmul contract half %139, %141
+  %143 = fsub contract half %137, %142
+  %144 = load ptr, ptr %25, align 8, !tbaa !10
+  %145 = load i64, ptr %26, align 8, !tbaa !6
+  %146 = load i64, ptr %22, align 8, !tbaa !6
+  %147 = add i64 %145, %146
+  %148 = getelementptr inbounds %struct.rocfft_complex, ptr %144, i64 %147
+  %149 = getelementptr inbounds %struct.rocfft_complex, ptr %148, i32 0, i32 1
+  store half %143, ptr %149, align 2, !tbaa !17
+  %150 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 0
+  %151 = load half, ptr %150, align 2, !tbaa !15
+  %152 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 0
+  %153 = load half, ptr %152, align 2, !tbaa !15
+  %154 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 1
+  %155 = load half, ptr %154, align 2, !tbaa !17
+  %156 = fmul contract half %153, %155
+  %157 = fsub contract half %151, %156
+  %158 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 1
+  %159 = load half, ptr %158, align 2, !tbaa !17
+  %160 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 0
+  %161 = load half, ptr %160, align 2, !tbaa !15
+  %162 = fmul contract half %159, %161
+  %163 = fsub contract half %157, %162
+  %164 = load ptr, ptr %25, align 8, !tbaa !10
+  %165 = load i64, ptr %26, align 8, !tbaa !6
+  %166 = load i64, ptr %23, align 8, !tbaa !6
+  %167 = add i64 %165, %166
+  %168 = getelementptr inbounds %struct.rocfft_complex, ptr %164, i64 %167
+  %169 = getelementptr inbounds %struct.rocfft_complex, ptr %168, i32 0, i32 0
+  store half %163, ptr %169, align 2, !tbaa !15
+  %170 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 1
+  %171 = load half, ptr %170, align 2, !tbaa !17
+  %172 = fneg contract half %171
+  %173 = getelementptr inbounds %struct.rocfft_complex, ptr %30, i32 0, i32 1
+  %174 = load half, ptr %173, align 2, !tbaa !17
+  %175 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 1
+  %176 = load half, ptr %175, align 2, !tbaa !17
+  %177 = fmul contract half %174, %176
+  %178 = fadd contract half %172, %177
+  %179 = getelementptr inbounds %struct.rocfft_complex, ptr %33, i32 0, i32 0
+  %180 = load half, ptr %179, align 2, !tbaa !15
+  %181 = getelementptr inbounds %struct.rocfft_complex, ptr %36, i32 0, i32 0
+  %182 = load half, ptr %181, align 2, !tbaa !15
+  %183 = fmul contract half %180, %182
+  %184 = fsub contract half %178, %183
+  %185 = load ptr, ptr %25, align 8, !tbaa !10
+  %186 = load i64, ptr %26, align 8, !tbaa !6
+  %187 = load i64, ptr %23, align 8, !tbaa !6
+  %188 = add i64 %186, %187
+  %189 = getelementptr inbounds %struct.rocfft_complex, ptr %185, i64 %188
+  %190 = getelementptr inbounds %struct.rocfft_complex, ptr %189, i32 0, i32 1
+  store half %184, ptr %190, align 2, !tbaa !17
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %21) #4
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %18) #4
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %15) #4
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { convergent inlinehint mustprogress nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+cumode,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+sramecc,+wavefrontsize64,-xnack" }
+attributes #3 = { convergent mustprogress nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx90a" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+cumode,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+sramecc,+wavefrontsize64,-xnack" }
+attributes #4 = { nounwind }
+attributes #5 = { convergent nounwind }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+!llvm.ident = !{!4, !4, !4, !4, !4, !4, !4, !4, !4, !4, !4}
+!opencl.ocl.version = !{!5, !5, !5, !5, !5, !5, !5, !5, !5, !5}
+
+!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+!1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+!2 = !{i32 1, !"wchar_size", i32 4}
+!3 = !{i32 8, !"PIC Level", i32 2}
+!4 = !{!"clang version 19.0.0git (ssh://padivedi@gerrit-git.amd.com:29418/lightning/ec/llvm-project a2421f3d00e8e99003ddde4ce19939737b57d043)"}
+!5 = !{i32 2, i32 0}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"long", !8, i64 0}
+!8 = !{!"omnipotent char", !9, i64 0}
+!9 = !{!"Simple C++ TBAA"}
+!10 = !{!11, !11, i64 0}
+!11 = !{!"any pointer", !8, i64 0}
+!12 = !{i64 0, i64 2, !13, i64 2, i64 2, !13}
+!13 = !{!14, !14, i64 0}
+!14 = !{!"_Float16", !8, i64 0}
+!15 = !{!16, !14, i64 0}
+!16 = !{!"_ZTS14rocfft_complexIDF16_E", !14, i64 0, !14, i64 2}
+!17 = !{!16, !14, i64 2}
+!18 = !{!19, !19, i64 0}
+!19 = !{!"double", !8, i64 0}

arsenm

Patch title should be a description of what this does, the title sounds more like a bug report. Patch part is also missing?

arsenm · 2024-09-20T09:30:26Z

llvm/test/CodeGen/AMDGPU/fix-failure-si-peephole-sdwa.ll

@@ -0,0 +1,265 @@
+%struct.rocfft_complex = type { half, half }


This test is way too big, and is missing run lines

PankajDwivedi-25 · 2024-09-20T10:19:56Z

@arsenm @bfavela Do you see any issues with the two versions of MIRs? mentioned in the description.

arsenm · 2024-09-20T10:22:04Z

@arsenm @bfavela Do you see any issues with the two versions of MIRs? mentioned in the description.

I can't read the raw enum values. What do these print as?

PankajDwivedi-25 · 2024-09-20T10:23:45Z

WORD0,WORD1,DWORD enum values here are 4,5,6

PankajDwivedi-25 · 2024-09-20T10:42:02Z

This was observed after patch #94800.

arsenm · 2024-09-20T10:42:39Z

WORD0,WORD1,DWORD enum values here are 4,5,6

This is still unreadable. I have to figure out which operands are which indexes. What does this print as, in human readable form, out of the asm printer?

PankajDwivedi-25 · 2024-09-20T11:32:55Z

This is what I'm seeing in the assembly
"v_add_f16_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1" which corrosponds to v_add_f16_sdwa

arsenm · 2024-09-20T13:47:59Z

This is what I'm seeing in the assembly "v_add_f16_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1" which corrosponds to v_add_f16_sdwa

This looks different. The result operands are both using the same source register in the MIR, and here they are different. The original MIR looks like it's trying to do an add of the low and high 16-bit halves of the same register.

I'm not sure what the rules are when using the selectors on the same register in multiple operands. Maybe src0_sel should be using WORD_0? I'm not sure how DWORD is interpreted on a 16-bit source

arsenm · 2024-09-20T13:50:09Z

Could #106920 be related?

PankajDwivedi-25 · 2024-09-20T13:53:26Z

This is what I'm seeing in the assembly "v_add_f16_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1" which corrosponds to v_add_f16_sdwa

This looks different. The result operands are both using the same source register in the MIR, and here they are different. The original MIR looks like it's trying to do an add of the low and high 16-bit halves of the same register.

I'm not sure what the rules are when using the selectors on the same register in multiple operands. Maybe src0_sel should be using WORD_0? I'm not sure how DWORD is interpreted on a 16-bit source

yes, the final assembly is having different set of registers. but the subword operands are the same.

From my finding, both are doing the same job as you mentioned. not sure what goes wrong here my be should i use WORD-1 as a first operand?

arsenm · 2024-09-20T13:58:37Z

yes, the final assembly is having different set of registers. but the subword operands are the same.

From my finding, both are doing the same job as you mentioned. not sure what goes wrong here my be should i use WORD-1 as a first operand?

You've still only provided fragments of information, so I have no idea what is going on. Can you reduce this, preferably to just the relevant instructions with runnable output

PankajDwivedi-25 · 2024-09-20T14:18:13Z

Could #106920 be related?

yeah, it talks about the same pass. but that is not the case here.

PankajDwivedi-25 · 2024-09-20T14:19:23Z

yes, the final assembly is having different set of registers. but the subword operands are the same.
From my finding, both are doing the same job as you mentioned. not sure what goes wrong here my be should i use WORD-1 as a first operand?

You've still only provided fragments of information, so I have no idea what is going on. Can you reduce this, preferably to just the relevant instructions with runnable output

sure, let me get minimal MIR for the same.

bfavela · 2024-09-20T15:01:41Z

Two thoughts:

The original title and commit title are about precision - AFAIK, there should be no assumptions of precision with f16. There aren't enough bits to ever guarantee that optimizations won't drift ULP errors. Like Matt has said, though, there is zero information about what the actual error is here.
One observation I can make is that the final assembly has "dst_unused:unused_pad", which means that the upper 16 bits of the result for both ALUs are going to get cleared to 0. The assumption is that this is the expected outcome of any f16 operation, but maybe some other pass is not expecting that?

PankajDwivedi-25 · 2024-09-23T12:33:56Z

Could #106920 be related?

Your suspect was correct @arsenm, Including this patch fixes the issue.

arsenm · 2024-09-23T12:38:27Z

Should this be closed then?

PankajDwivedi-25 · 2024-09-24T06:53:49Z

Should this be closed then?

yes, I am waiting for that PR to be merged.

arsenm · 2024-09-24T07:00:18Z

Should this be closed then?

yes, I am waiting for that PR to be merged.

No point in waiting

PankajDwivedi-25 · 2024-11-29T10:41:06Z

@arsenm @yxsamliu @bfavela There is a similar issue in int8 where some instruction are getting translated into SDWA.

Without SDWA:
v_cmp_eq_u32_e32 vcc, 1, v17
v_bfe_i32 v12, v15, 0, 8
v_cmp_ge_i32_e64 s[8:9], v16, v12
v_cmp_lt_i32_e64 s[4:5], v16, v12

With SDWA:
v_cmp_ge_i32_sdwa s[8:9], v16, sext(v15) src0_sel:DWORD src1_sel:BYTE_0
v_cmp_lt_i32_sdwa s[4:5], v16, sext(v15) src0_sel:DWORD src1_sel:BYTE_0

There are multiple similar instances in the final asm, are you suspecting similar issue with int8 as well?

arsenm · 2024-11-29T15:46:25Z

There are multiple similar instances in the final asm, are you suspecting similar issue with int8 as well?

You're just describing SDWA being used at all. This is not evidence of an issue on its own, it's supposed to be used.

PankajDwivedi-25 · 2024-12-09T10:46:35Z

Right, After Including this patch sorting output for some int8 cases has output values as -1.
These values appear to have been converted to -1 from the original input provided. which originally does not have any -1 in the list.

PankajDwivedi-25 · 2024-12-11T11:15:06Z

I suspect that v_bfe_i32 v12, v15, 0, 8 is not equivalent to sext(v15) src0_sel:DWORD src1_sel:BYTE_0 ?
which is the optimization for folding v_bfe_i32 in SDWA optimization.
If they are not equivalent, then signed byte extract with V_BFE_I32_e64 should be excluded from the SDWA peephole optimization? for above pytorch sorting test failure.

@yxsamliu @arsenm @bfavela

arsenm · 2024-12-11T11:18:20Z

Looks equivalent to me. The BFE is extracting 8 bits at offset 0, and sign extending it. That should be the same thing as sign extending BYTE_0.

PankajDwivedi-25 · 2024-12-11T11:35:34Z

I see two version of v_cmp_lt_i32 having suffix _32 & _64
First case:

v_bfe_i32 v0, v18, 0, 8
v_cmp_ge_i32_e64 s[12:13], v23, v0
v_cmp_lt_i32_e32 vcc, v23, v0

respective sdwa

v_cmp_ge_i32_sdwa s[12:13], v23, sext(v18) src0_sel:DWORD src1_sel:BYTE_0
v_cmp_lt_i32_sdwa s[16:17], v23, sext(v18) src0_sel:DWORD src1_sel:BYTE_0

where v_cmp_lt_i32_e32 vcc, v23, v0 is getting replaced with v_cmp_lt_i32_sdwa s[16:17], v23, sext(v18) src0_sel:DWORD src1_sel:BYTE_0

Second case:

v_bfe_i32 v0, v22, 0, 8
v_cmp_ge_i32_e64 s[28:29], v26, v0
v_cmp_lt_i32_e64 s[4:5], v26, v0

where:
v_cmp_lt_i32_e64 s[4:5], v26, v0 is also getting replace with v_cmp_lt_i32_sdwa s[4:5], v26, sext(v22) src0_sel:DWORD src1_sel:BYTE_0

is both v_cmp_lt_i32 having the same sdwa optimization.

bfavela · 2024-12-16T16:02:39Z

_e32 vs _e64 is just different encodings (1 dword vs 2 dwords of ISA). The first case can use the single dword encoding (VOPC) as "VCC" is the implied destination in that encoding. The second case writes to 2 SGPRs which means it needs to be promoted to a 2-dword VOP3 encoding.
SDWA version is still equivalent in what you've shown here for the reasons arsenm outlined earlier.

PankajDwivedi-25 · 2024-12-17T07:01:04Z

SDWA version is still equivalent in what you've shown here for the reasons arsenm outlined earlier.

Is it expected to be the same? Also, in the first case, I can see the combination of two encodings: 64 in v_cmp_ge_i32_e64 and 32 in v_cmp_lt_i32_e32. Does it work correctly? If so, do you have any idea what wrong could go here?

arsenm · 2024-12-17T07:17:37Z

Is it expected to be the same? Also, in the first case, I can see the combination of two encodings: 64 in v_cmp_ge_i32_e64 and 32 in v_cmp_lt_i32_e32. Does it work correctly? If so, do you have any idea what wrong could go here?

Not sure what the question is. Yes, VOPC instructions are available in a VOP3 form as well.

PankajDwivedi-25 · 2024-12-17T07:27:56Z

Yes, VOPC instructions are available in a VOP3 form as well.

Thank you for your feedback!

It's a weird case here then. I have no clue what is wrong with int8 sorting. Why some of the output values are becoming -1's?
all I can see in the final ASM diff is the folding of bfe into sdwa.

arsenm · 2024-12-17T09:48:22Z

Yes, VOPC instructions are available in a VOP3 form as well.

Thank you for your feedback!

It's a weird case here then. I have no clue what is wrong with int8 sorting. Why some of the output values are becoming -1's? all I can see in the final ASM diff is the folding of bfe into sdwa.

You keep stating random fragments of information and ISA with no context. I have no idea what testcase you're looking at, or how it's related to the issue title of "fp16", int8 sorting, or how you concluded your issue is related to SDWA

[AMDGPU] precision error observed

a9541c7

PankajDwivedi-25 added the backend:AMDGPU label Sep 20, 2024

arsenm reviewed Sep 20, 2024

View reviewed changes

PankajDwivedi-25 changed the title ~~[AMDGPU] precision error observed after SIPeepholeSDWA optimization for fp16~~ [AMDGPU] fix SIPeepholeSDWA optimization for fp16 Sep 20, 2024

arsenm closed this Sep 24, 2024

PankajDwivedi-25 reopened this Nov 29, 2024

arsenm closed this Nov 29, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] fix SIPeepholeSDWA optimization for fp16 #109395

[AMDGPU] fix SIPeepholeSDWA optimization for fp16 #109395

PankajDwivedi-25 commented Sep 20, 2024

llvmbot commented Sep 20, 2024

arsenm left a comment •

edited

Loading

arsenm Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

arsenm commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

arsenm commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024 •

edited

Loading

arsenm commented Sep 20, 2024

arsenm commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

arsenm commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

bfavela commented Sep 20, 2024

PankajDwivedi-25 commented Sep 23, 2024 •

edited

Loading

arsenm commented Sep 23, 2024

PankajDwivedi-25 commented Sep 24, 2024

arsenm commented Sep 24, 2024

PankajDwivedi-25 commented Nov 29, 2024 •

edited

Loading

arsenm commented Nov 29, 2024

PankajDwivedi-25 commented Dec 9, 2024 •

edited

Loading

PankajDwivedi-25 commented Dec 11, 2024

arsenm commented Dec 11, 2024

PankajDwivedi-25 commented Dec 11, 2024 •

edited

Loading

bfavela commented Dec 16, 2024 •

edited

Loading

PankajDwivedi-25 commented Dec 17, 2024

arsenm commented Dec 17, 2024

PankajDwivedi-25 commented Dec 17, 2024

arsenm commented Dec 17, 2024

		@@ -0,0 +1,265 @@
		%struct.rocfft_complex = type { half, half }

[AMDGPU] fix SIPeepholeSDWA optimization for fp16 #109395

[AMDGPU] fix SIPeepholeSDWA optimization for fp16 #109395

Conversation

PankajDwivedi-25 commented Sep 20, 2024

llvmbot commented Sep 20, 2024

arsenm left a comment • edited Loading

Choose a reason for hiding this comment

arsenm Sep 20, 2024

Choose a reason for hiding this comment

PankajDwivedi-25 commented Sep 20, 2024

arsenm commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

arsenm commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024 • edited Loading

arsenm commented Sep 20, 2024

arsenm commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

arsenm commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

PankajDwivedi-25 commented Sep 20, 2024

bfavela commented Sep 20, 2024

PankajDwivedi-25 commented Sep 23, 2024 • edited Loading

arsenm commented Sep 23, 2024

PankajDwivedi-25 commented Sep 24, 2024

arsenm commented Sep 24, 2024

PankajDwivedi-25 commented Nov 29, 2024 • edited Loading

arsenm commented Nov 29, 2024

PankajDwivedi-25 commented Dec 9, 2024 • edited Loading

PankajDwivedi-25 commented Dec 11, 2024

arsenm commented Dec 11, 2024

PankajDwivedi-25 commented Dec 11, 2024 • edited Loading

bfavela commented Dec 16, 2024 • edited Loading

PankajDwivedi-25 commented Dec 17, 2024

arsenm commented Dec 17, 2024

PankajDwivedi-25 commented Dec 17, 2024

arsenm commented Dec 17, 2024

arsenm left a comment •

edited

Loading

PankajDwivedi-25 commented Sep 20, 2024 •

edited

Loading

PankajDwivedi-25 commented Sep 23, 2024 •

edited

Loading

PankajDwivedi-25 commented Nov 29, 2024 •

edited

Loading

PankajDwivedi-25 commented Dec 9, 2024 •

edited

Loading

PankajDwivedi-25 commented Dec 11, 2024 •

edited

Loading

bfavela commented Dec 16, 2024 •

edited

Loading