[RISCV] Add fixed vector coverage for sum-absolute-difference (sad) pattern

This builds on the previously added absolute difference cases, and adds the reduction at the end. This is mostly interesting for examining the impact of extend placement when changing the abdu lowering.
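For context, here is a scalar sketch in C of the sum-absolute-difference (SAD) kernel these tests cover. It is illustrative only, not part of the commit; the function name and the fixed trip count of 16 are assumptions mirroring the 16x8 cases.

#include <stdint.h>
#include <stdlib.h>

/* Widen each u8 element, subtract, take the absolute value, and
   accumulate. Vectorized, this becomes the zext/sub/llvm.abs/
   llvm.vector.reduce.add IR sequence in the tests below. */
int32_t sad_16x8(const uint8_t *a, const uint8_t *b) {
  int32_t sum = 0;
  for (int i = 0; i < 16; i++)
    sum += abs((int32_t)a[i] - (int32_t)b[i]);
  return sum;
}

The extend-placement question the commit message raises is visible in the generated checks: the i32 variants compute the absolute differences at e16 (vwsubu.vv, then negate/max for abs) and widen to e32 only at the final reduction (vwredsumu.vs), rather than extending to i32 before the subtract.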
Showing 1 changed file with 191 additions and 0 deletions.
@@ -0,0 +1,191 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s

define signext i16 @sad_4x8_as_i16(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    vmax.vv v8, v10, v8
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i16>
  %3 = zext <4 x i8> %b to <4 x i16>
  %4 = sub nsw <4 x i16> %1, %3
  %5 = tail call <4 x i16> @llvm.abs.v4i16(<4 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_4x8_as_i32(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: sad_4x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    vmax.vv v8, v10, v8
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <4 x i8> %a to <4 x i32>
  %3 = zext <4 x i8> %b to <4 x i32>
  %4 = sub nsw <4 x i32> %1, %3
  %5 = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
  ret i32 %6
}

define signext i16 @sad_16x8_as_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    vmax.vv v8, v10, v8
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i16>
  %3 = zext <16 x i8> %b to <16 x i16>
  %4 = sub nsw <16 x i16> %1, %3
  %5 = tail call <16 x i16> @llvm.abs.v16i16(<16 x i16> %4, i1 true)
  %6 = tail call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %5)
  ret i16 %6
}

define signext i32 @sad_16x8_as_i32(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sad_16x8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    vmax.vv v8, v10, v8
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %1 = zext <16 x i8> %a to <16 x i32>
  %3 = zext <16 x i8> %b to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  ret i32 %6
}

define signext i32 @sad_2block_16xi8_as_i32(ptr %a, ptr %b, i32 signext %stridea, i32 signext %strideb) {
; CHECK-LABEL: sad_2block_16xi8_as_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    vrsub.vi v12, v10, 0
; CHECK-NEXT:    vmax.vv v12, v10, v12
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v8, v10, 0
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v14, (a0)
; CHECK-NEXT:    vle8.v v15, (a1)
; CHECK-NEXT:    vmax.vv v16, v10, v8
; CHECK-NEXT:    vwaddu.vv v8, v16, v12
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v12, v14, v15
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v14, v12, 0
; CHECK-NEXT:    add a0, a0, a2
; CHECK-NEXT:    add a1, a1, a3
; CHECK-NEXT:    vle8.v v16, (a0)
; CHECK-NEXT:    vle8.v v17, (a1)
; CHECK-NEXT:    vmax.vv v12, v12, v14
; CHECK-NEXT:    vwaddu.wv v8, v8, v12
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwsubu.vv v12, v16, v17
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vrsub.vi v14, v12, 0
; CHECK-NEXT:    vmax.vv v12, v12, v14
; CHECK-NEXT:    vwaddu.wv v8, v8, v12
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %idx.ext8 = sext i32 %strideb to i64
  %idx.ext = sext i32 %stridea to i64
  %0 = load <16 x i8>, ptr %a, align 1
  %1 = zext <16 x i8> %0 to <16 x i32>
  %2 = load <16 x i8>, ptr %b, align 1
  %3 = zext <16 x i8> %2 to <16 x i32>
  %4 = sub nsw <16 x i32> %1, %3
  %5 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %4, i1 true)
  %6 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %5)
  %add.ptr = getelementptr inbounds i8, ptr %a, i64 %idx.ext
  %add.ptr9 = getelementptr inbounds i8, ptr %b, i64 %idx.ext8
  %7 = load <16 x i8>, ptr %add.ptr, align 1
  %8 = zext <16 x i8> %7 to <16 x i32>
  %9 = load <16 x i8>, ptr %add.ptr9, align 1
  %10 = zext <16 x i8> %9 to <16 x i32>
  %11 = sub nsw <16 x i32> %8, %10
  %12 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %11, i1 true)
  %13 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %12)
  %op.rdx.1 = add i32 %13, %6
  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
  %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
  %14 = load <16 x i8>, ptr %add.ptr.1, align 1
  %15 = zext <16 x i8> %14 to <16 x i32>
  %16 = load <16 x i8>, ptr %add.ptr9.1, align 1
  %17 = zext <16 x i8> %16 to <16 x i32>
  %18 = sub nsw <16 x i32> %15, %17
  %19 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %18, i1 true)
  %20 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %19)
  %op.rdx.2 = add i32 %20, %op.rdx.1
  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
  %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
  %21 = load <16 x i8>, ptr %add.ptr.2, align 1
  %22 = zext <16 x i8> %21 to <16 x i32>
  %23 = load <16 x i8>, ptr %add.ptr9.2, align 1
  %24 = zext <16 x i8> %23 to <16 x i32>
  %25 = sub nsw <16 x i32> %22, %24
  %26 = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> %25, i1 true)
  %27 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %26)
  %op.rdx.3 = add i32 %27, %op.rdx.2
  ret i32 %op.rdx.3
}

declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)