From 1cfcf7acd1146f7a166ad520ff64d19e88165ef1 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 7 Mar 2024 13:47:02 -0800 Subject: [PATCH 01/22] arm64: Use different instruction sequence for taking global address with HWASan When dav1d is built with HWASan, the build fails because globals are tagged and the normal adrp/add instruction sequence does not have enough range to take the tagged address. Therefore, use an alternative instruction sequence when HWASan is enabled, which is the same as what the compiler generates. --- src/arm/64/util.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/arm/64/util.S b/src/arm/64/util.S index 64d73e3a5..1b3f319ce 100644 --- a/src/arm/64/util.S +++ b/src/arm/64/util.S @@ -32,6 +32,10 @@ #include "config.h" #include "src/arm/asm.S" +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 @@ -51,6 +55,10 @@ adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif +#elif __has_feature(hwaddress_sanitizer) + adrp \rd, :pg_hi21_nc:\val+(\offset) + movk \rd, #:prel_g3:\val+0x100000000 + add \rd, \rd, :lo12:\val+(\offset) #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) From a18310da554a4b3865c707c86db9ae9a3d781192 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Fri, 22 Mar 2024 16:20:38 +0100 Subject: [PATCH 02/22] AArch64: Add DotProd support for convolutions Add an Armv8.4-A DotProd code path for standard bitdepth convolutions. Only horizontal-vertical (HV) convolutions have 6-tap specialisations of their vertical passes. All other convolutions are 4- or 8-tap filters which fit well with the 4-element SDOT instruction. Benchmarks show up-to 7-29% FPS increase depending on the input video and the CPU used. This patch will increase the .text by around 6.5 KiB. Performance highly depends on the SDOT and MLA throughput ratio, this can be seen on the vertical filter cases. 
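As an illustration (not part of the patch): each indexed SDOT accumulates a
4-element dot product of signed bytes into every 32-bit lane of its
destination register, so the 4-tap filters need a single SDOT per
accumulator and the 8-tap filters need two, as in this pair from the new
vertical pass:

    sdot    v0.4s, v16.16b, v7.4b[0]   // lane i: acc[i] += s0[4i..4i+3] . taps[0..3]
    sdot    v0.4s, v17.16b, v7.4b[1]   // lane i: acc[i] += s1[4i..4i+3] . taps[4..7]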
Small cores are also affected by the TBL execution latencies: Relative performance to the C reference on some CPUs: A76 A78 X1 A55 regular w4 hv neon: 5.52x 5.78x 10.75x 8.27x regular w4 hv dotprod: 7.94x 8.49x 16.84x 8.09x sharp w4 hv neon: 5.27x 5.22x 9.06x 7.87x sharp w4 hv dotprod: 6.61x 6.73x 12.64x 6.89x regular w8 hv neon: 1.95x 2.19x 2.56x 3.16x regular w8 hv dotprod: 3.23x 2.81x 3.20x 3.26x sharp w8 hv neon: 1.61x 1.79x 2.05x 2.72x sharp w8 hv dotprod: 2.72x 2.29x 2.66x 2.76x regular w16 hv neon: 1.63x 2.04x 2.16x 2.73x regular w16 hv dotprod: 2.72x 2.57x 2.67x 2.80x sharp w16 hv neon: 1.33x 1.67x 1.74x 2.34x sharp w16 hv dotprod: 2.31x 2.14x 2.26x 2.39x regular w32 hv neon: 1.48x 1.92x 1.94x 2.51x regular w32 hv dotprod: 2.49x 2.40x 2.33x 2.58x sharp w32 hv neon: 1.21x 1.56x 1.53x 2.14x sharp w32 hv dotprod: 2.12x 2.02x 2.00x 2.22x regular w64 hv neon: 1.42x 1.87x 1.85x 2.40x regular w64 hv dotprod: 2.40x 2.32x 2.21x 2.46x sharp w64 hv neon: 1.16x 1.52x 1.46x 2.04x sharp w64 hv dotprod: 2.02x 1.96x 1.90x 2.11x regular w128 hv neon: 1.39x 1.84x 1.80x 2.27x regular w128 hv dotprod: 2.33x 2.28x 2.14x 2.35x sharp w128 hv neon: 1.14x 1.50x 1.42x 1.94x sharp w128 hv dotprod: 1.98x 1.93x 1.84x 2.03x regular w8 h neon: 2.61x 3.20x 3.51x 3.55x regular w8 h dotprod: 4.43x 5.17x 6.26x 4.30x sharp w8 h neon: 2.01x 2.80x 2.89x 3.12x sharp w8 h dotprod: 4.42x 5.16x 6.27x 4.28x regular w16 h neon: 2.17x 3.13x 2.92x 3.35x regular w16 h dotprod: 4.38x 4.27x 4.53x 3.90x sharp w16 h neon: 1.74x 2.65x 2.48x 2.92x sharp w16 h dotprod: 4.33x 4.27x 4.53x 3.91x regular w64 h neon: 1.92x 2.82x 2.39x 2.96x regular w64 h dotprod: 3.68x 3.60x 3.40x 3.18x sharp w64 h neon: 1.47x 2.33x 2.05x 2.54x sharp w64 h dotprod: 3.68x 3.60x 3.40x 3.17x regular w4 v neon: 5.39x 7.38x 10.27x 11.41x regular w4 v dotprod: 9.46x 14.15x 18.72x 9.84x sharp w4 v neon: 4.51x 6.39x 8.17x 10.70x sharp w4 v dotprod: 9.35x 14.20x 18.63x 9.78x regular w16 v neon: 3.03x 4.03x 4.65x 6.28x regular w16 v dotprod: 4.64x 3.75x 4.78x 3.89x sharp w16 v neon: 2.29x 3.09x 3.44x 5.52x sharp w16 v dotprod: 4.62x 3.74x 4.77x 3.89x regular w64 v neon: 2.17x 3.14x 3.19x 4.46x regular w64 v dotprod: 3.43x 3.00x 3.31x 2.74x sharp w64 v neon: 1.61x 2.42x 2.34x 3.89x sharp w64 v dotprod: 3.38x 3.00x 3.29x 2.73x --- src/arm/64/mc.S | 4 +- src/arm/64/mc_dotprod.S | 1413 +++++++++++++++++++++++++++++++++++++++ src/arm/mc.h | 85 +-- src/meson.build | 1 + 4 files changed, 1461 insertions(+), 42 deletions(-) create mode 100644 src/arm/64/mc_dotprod.S diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 3df0393c3..5b493be82 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -837,7 +837,7 @@ endfunc // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). -function put_neon +function put_neon, export=1 adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw @@ -939,7 +939,7 @@ endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x8 is set to (clz(w)-24), and x7 to w*2. -function prep_neon +function prep_neon, export=1 adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S new file mode 100644 index 000000000..fcf04ee4d --- /dev/null +++ b/src/arm/64/mc_dotprod.S @@ -0,0 +1,1413 @@ +/* + * Copyright © 2024, VideoLAN and dav1d authors + * Copyright © 2024, Janne Grunau + * Copyright © 2024, Martin Storsjo + * Copyright © 2024, Arm Limited + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + + +#if HAVE_DOTPROD +ENABLE_DOTPROD + +// No spaces in these expressions, due to gas-preprocessor. It is translated by +// -1 to save the negative offset at getting the address of `mc_subpel_filters`. +#define REGULAR1 (((0*15-1)<<7)|(3*15-1)) +#define SMOOTH1 (((1*15-1)<<7)|(4*15-1)) +#define SHARP1 (((2*15-1)<<7)|(3*15-1)) + +#define FUNC_ALIGN 2 +#define JUMP_ALIGN 2 +#define LOOP_ALIGN 2 + + +// Lookup table used to help conversion of shifted 32-bit values to 8-bit. + .align 4 +L(hv_tbl_neon_dotprod): + .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + +// Shuffle indices to permute horizontal samples in preparation for input to +// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the +// interval of [-3, 4] relative to the current sample position. We load samples +// from index value -4 to keep loads word aligned, so the shuffle bytes are +// translated by 1 to handle this. + .align 4 +L(h_tbl_neon_dotprod): + .byte 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7 + .byte 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11 + .byte 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15 + .byte 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 + +// Vertical convolutions are also using SDOT instructions, where a 128-bit +// register contains a transposed 4x4 matrix of values. Subsequent iterations of +// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop +// iteration. These shuffle indices shift and merge this 4x4 matrix with the +// values of a new line. 
+ .align 4 +L(v_tbl_neon_dotprod): + .byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28 + .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19 + .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23 + .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27 + .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31 + + +.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1 +function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN + mov x9, \type_h + mov x10, \type_v + .if \jump + b \op\()_8tap_\isa + .endif +endfunc +.endm + +.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd +make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa +make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa +make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa +make_8tap_fn \type, smooth_sharp, SMOOTH1, SHARP1, \isa +make_8tap_fn \type, smooth, SMOOTH1, SMOOTH1, \isa +make_8tap_fn \type, smooth_regular, SMOOTH1, REGULAR1, \isa +make_8tap_fn \type, regular_sharp, REGULAR1, SHARP1, \isa +make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1, \isa +make_8tap_fn \type, regular, REGULAR1, REGULAR1, \isa, jump=0 + +function \type\()_8tap_\isa, align=FUNC_ALIGN + clz w8, \w + mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + sub w8, w8, #24 // for jump tables + movrel x12, X(mc_subpel_filters) + cbnz \mx, L(\type\()_8tap_h_hv_\isa) + cbnz \my, L(\type\()_8tap_v_\isa) +.ifc \type, prep + add \wd_strd, \w, \w // prep_neon needs w * 2 as stride +.endif + b X(\type\()_neon) + + .align JUMP_ALIGN +L(\type\()_8tap_v_\isa): + madd \my, \my, w11, w10 +.ifc \type, prep + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding +.endif + sub \src, \src, \s_strd + ldr q6, L(v_tbl_neon_dotprod) +.ifc \type, prep + dup v4.4s, w8 +.endif + ubfx w11, \my, #7, #7 + and \my, \my, #0x7F + ldr q28, L(v_tbl_neon_dotprod) + 16 + cmp \h, #4 + csel \my, \my, w11, le + sub \src, \src, \s_strd, lsl #1 // src - src_stride * 3 + ldr q29, L(v_tbl_neon_dotprod) + 32 + add \xmy, x12, \xmy, lsl #3 // subpel V filter address + movi v5.16b, #128 + ldr d7, [\xmy] + cmp \w, #8 + b.eq 80f + b.lt 40f + + // .align JUMP_ALIGN // fallthrough +160: // V - 16xN+ + ldr q30, L(v_tbl_neon_dotprod) + 48 + ldr q31, L(v_tbl_neon_dotprod) + 64 +.ifc \type, prep + add \wd_strd, \w, \w +.endif + .align LOOP_ALIGN +161: + mov \lsrc, \src + mov \ldst, \dst + sub w8, \h, #1 + + ldr q16, [\lsrc] + ldr q17, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q18, [\lsrc] + ldr q19, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + + zip1 v0.16b, v16.16b, v17.16b + zip2 v1.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip2 v3.16b, v18.16b, v19.16b + + ldr q20, [\lsrc] + ldr q21, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q22, [\lsrc] + ldr q23, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + + zip1 v18.16b, v20.16b, v21.16b + zip2 v21.16b, v20.16b, v21.16b + zip1 v24.16b, v22.16b, v23.16b + zip2 v27.16b, v22.16b, v23.16b + + zip1 v16.8h, v0.8h, v2.8h + zip2 v19.8h, v0.8h, v2.8h + zip1 v22.8h, v1.8h, v3.8h + zip2 v25.8h, v1.8h, v3.8h + + zip1 v17.8h, v18.8h, v24.8h + zip2 v20.8h, v18.8h, v24.8h + zip1 v23.8h, v21.8h, v27.8h + zip2 v26.8h, v21.8h, v27.8h + + sub v16.16b, v16.16b, v5.16b + sub v19.16b, v19.16b, v5.16b + sub v22.16b, v22.16b, v5.16b + sub v25.16b, v25.16b, v5.16b + + sub v17.16b, v17.16b, v5.16b + sub v20.16b, v20.16b, v5.16b + sub v23.16b, v23.16b, v5.16b + sub v26.16b, v26.16b, v5.16b + + .align LOOP_ALIGN +16: + 
ldr q27, [\lsrc] + add \lsrc, \lsrc, \s_strd +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sub v18.16b, v27.16b, v5.16b + sub v21.16b, v27.16b, v5.16b + sub v24.16b, v27.16b, v5.16b + sub v27.16b, v27.16b, v5.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v2.4s, v22.16b, v7.4b[0] + sdot v3.4s, v25.16b, v7.4b[0] + + tbl v16.16b, {v16.16b, v17.16b}, v6.16b + tbl v19.16b, {v19.16b, v20.16b}, v6.16b + tbl v22.16b, {v22.16b, v23.16b}, v6.16b + tbl v25.16b, {v25.16b, v26.16b}, v6.16b + + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v20.16b, v7.4b[1] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v26.16b, v7.4b[1] + + tbl v17.16b, {v17.16b, v18.16b}, v28.16b + tbl v20.16b, {v20.16b, v21.16b}, v29.16b + tbl v23.16b, {v23.16b, v24.16b}, v30.16b + tbl v26.16b, {v26.16b, v27.16b}, v31.16b + + subs w8, w8, #1 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + st1 {v0.8h, v1.8h}, [\ldst], \d_strd +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun2 v0.16b, v2.8h, #6 + st1 {v0.16b}, [\ldst], \d_strd +.endif + b.gt 16b + +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sdot v0.4s, v16.16b, v7.4b[0] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v2.4s, v22.16b, v7.4b[0] + sdot v3.4s, v25.16b, v7.4b[0] + + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v20.16b, v7.4b[1] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v26.16b, v7.4b[1] + + subs \w, \w, #16 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\ldst] + add \dst, \dst, #32 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun2 v0.16b, v2.8h, #6 + str q0, [\ldst] + add \dst, \dst, #16 +.endif + add \src, \src, #16 + b.gt 161b + ret + + .align JUMP_ALIGN +80: // V - 8xN + ldr d16, [\src] + ldr d17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d18, [\src] + ldr d19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr d20, [\src] + ldr d21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d22, [\src] + ldr d23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 // for prep: sub is enough + + zip1 v0.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip1 v18.16b, v20.16b, v21.16b + zip1 v24.16b, v22.16b, v23.16b + + zip1 v16.8h, v0.8h, v2.8h + zip2 v19.8h, v0.8h, v2.8h + zip1 v17.8h, v18.8h, v24.8h + zip2 v20.8h, v18.8h, v24.8h + + sub v16.16b, v16.16b, v5.16b + sub v19.16b, v19.16b, v5.16b + sub v17.16b, v17.16b, v5.16b + sub v20.16b, v20.16b, v5.16b +.ifc \type, put + b.eq 82f +.endif + + .align LOOP_ALIGN +8: + ldr d21, [\src] + ldr d27, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sub v18.16b, v21.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + sub v24.16b, v27.16b, v5.16b + sub v27.16b, v27.16b, v5.16b + + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + tbl v23.16b, {v17.16b, 
v18.16b}, v28.16b + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + tbl v16.16b, {v22.16b, v23.16b}, v6.16b + tbl v19.16b, {v25.16b, v26.16b}, v6.16b + tbl v17.16b, {v23.16b, v24.16b}, v28.16b + tbl v20.16b, {v26.16b, v27.16b}, v29.16b + + sdot v2.4s, v22.16b, v7.4b[0] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v25.16b, v7.4b[0] + sdot v3.4s, v26.16b, v7.4b[1] + + subs \h, \h, #2 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\dst], #32 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v2.8h, #6 + str d0, [\dst] + str d1, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 8b + +.ifc \type, put + .align JUMP_ALIGN +82: + ldr d21, [\src] + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.else + ldr d21, [\src] + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.endif + sub v18.16b, v21.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + tbl v23.16b, {v17.16b, v18.16b}, v28.16b + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + sdot v2.4s, v22.16b, v7.4b[0] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v25.16b, v7.4b[0] + sdot v3.4s, v26.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\dst] +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v2.8h, #6 + str d0, [\dst] + str d1, [\dst, \d_strd] +.endif + ret + + .align JUMP_ALIGN +40: // V - 4xN or 2xN (put only) +.ifc \type, put + cmp \w, #2 + b.eq 20f +.endif + ldr s16, [\src] + ldr s17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr s18, [\src] + ldr s19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr s20, [\src] + ldr s21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr s22, [\src] + ldr s23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 // for prep: sub is enough + + zip1 v0.8b, v16.8b, v17.8b + zip1 v2.8b, v18.8b, v19.8b + zip1 v18.8b, v20.8b, v21.8b + zip1 v24.8b, v22.8b, v23.8b + + zip1 v16.8h, v0.8h, v2.8h + zip1 v17.8h, v18.8h, v24.8h + + sub v16.16b, v16.16b, v5.16b + sub v17.16b, v17.16b, v5.16b +.ifc \type, put + b.eq 42f +.endif + + .align LOOP_ALIGN +4: + ldr s18, [\src] + ldr s21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 +.endif + sub v18.16b, v18.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + tbl v16.16b, {v19.16b, v20.16b}, v6.16b + tbl v17.16b, {v20.16b, v21.16b}, v28.16b + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] +.ifc \type, prep + subs \h, \h, #2 + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + str q0, [\dst], #16 +.else + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + subs \h, \h, #2 + fmov x8, d0 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 
+.endif + b.gt 4b + +.ifc \type, put + .align JUMP_ALIGN +42: + ldr s18, [\src] + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 +.else + ldr s18, [\src] + mov v0.16b, v4.16b + mov v1.16b, v4.16b +.endif + sub v18.16b, v18.16b, v5.16b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] +.ifc \type, prep + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + str q0, [\dst] + ret +.else + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + fmov x8, d0 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + ret + + .align JUMP_ALIGN +20: // V - 2xN + ldr h16, [\src] + ldr h17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr h18, [\src] + ldr h19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr h20, [\src] + ldr h21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr h22, [\src] + ldr h23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 + + zip1 v0.8b, v16.8b, v17.8b + zip1 v2.8b, v18.8b, v19.8b + zip1 v18.8b, v20.8b, v21.8b + zip1 v24.8b, v22.8b, v23.8b + + zip1 v16.4h, v0.4h, v2.4h + zip1 v17.4h, v18.4h, v24.4h + + sub v16.8b, v16.8b, v5.8b + sub v17.8b, v17.8b, v5.8b + + b.eq 22f + + .align LOOP_ALIGN +2: + ldr h18, [\src] + ldr h21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + + sub v18.8b, v18.8b, v5.8b + sub v21.8b, v21.8b, v5.8b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + tbl v16.16b, {v19.16b, v20.16b}, v6.16b + tbl v17.16b, {v20.16b, v21.16b}, v28.16b + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + + subs \h, \h, #2 + fmov x8, d0 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + b.gt 2b + + .align JUMP_ALIGN +22: + ldr h18, [\src] + + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + + sub v18.8b, v18.8b, v5.8b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + + fmov x8, d0 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + ret +.endif + + .align JUMP_ALIGN +L(\type\()_8tap_h_hv_\isa): + madd \mx, \mx, w11, w9 + madd w14, \my, w11, w10 // for HV + ldr q28, L(h_tbl_neon_dotprod) + mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding + sub \src, \src, #4 // src - 4 + dup v27.4s, w13 + ubfx w9, \mx, #7, #7 + and \mx, \mx, #0x7F + ubfx w11, w14, #7, #7 // for HV + and w14, w14, #0x7F // for HV + cmp \w, #4 + csel \mx, \mx, w9, le + add \xmx, x12, \xmx, lsl #3 // subpel H filter address + movi v24.16b, #128 + cbz \my, L(\type\()_8tap_h_\isa) + + // HV cases + cmp \h, #4 + csel w14, w14, w11, le + sub \src, \src, \s_strd, lsl #1 // src - src_stride * 2 - 4 + add \xmy, x12, x14, lsl #3 // subpel V filter address + mov x15, x30 + ldr d7, [\xmy] +.ifc \type, put + ldr q25, L(hv_tbl_neon_dotprod) +.endif + sxtl v7.8h, v7.8b + cmp w10, SHARP1 + b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 + + // HV 8-tap cases + sub \src, \src, \s_strd // src - src_stride * 3 - 4 + cmp 
\w, #4 + b.eq 40f +.ifc \type, put + b.lt 20f +.endif + + // .align JUMP_ALIGN // fallthrough +80: // HV8 - 8xN+ + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] +.ifc \type, prep + add \wd_strd, \w, \w +.endif + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + + .align LOOP_ALIGN +8: + ldr q23, [\lsrc] + add \lsrc, \lsrc, \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smull2 v1.4s, v16.8h, v7.h[0] + mov v16.16b, v17.16b + + sub v23.16b, v23.16b, v24.16b + + mov v5.16b, v27.16b + mov v6.16b, v27.16b + + smlal v0.4s, v17.4h, v7.h[1] + smlal2 v1.4s, v17.8h, v7.h[1] + mov v17.16b, v18.16b + + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b + tbl v4.16b, {v23.16b}, v30.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal2 v1.4s, v18.8h, v7.h[2] + mov v18.16b, v19.16b + + sdot v5.4s, v2.16b, v26.4b[0] + sdot v6.4s, v3.16b, v26.4b[0] + + smlal v0.4s, v19.4h, v7.h[3] + smlal2 v1.4s, v19.8h, v7.h[3] + mov v19.16b, v20.16b + + sdot v5.4s, v3.16b, v26.4b[1] + sdot v6.4s, v4.16b, v26.4b[1] + + smlal v0.4s, v20.4h, v7.h[4] + smlal2 v1.4s, v20.8h, v7.h[4] + mov v20.16b, v21.16b + + smlal v0.4s, v21.4h, v7.h[5] + smlal2 v1.4s, v21.8h, v7.h[5] +.ifc \type, prep + uzp1 v23.8h, v5.8h, v6.8h +.endif + mov v21.16b, v22.16b + + smlal v0.4s, v22.4h, v7.h[6] + smlal2 v1.4s, v22.8h, v7.h[6] +.ifc \type, prep + sshr v22.8h, v23.8h, #2 + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + subs w8, w8, #1 + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #16 +.else + shrn v22.4h, v5.4s, #2 + shrn2 v22.8h, v6.4s, #2 + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + subs w8, w8, #1 + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 +.endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + .align JUMP_ALIGN +40: // HV8 - 4xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + + .align LOOP_ALIGN +4: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smlal v0.4s, v17.4h, v7.h[1] + mov v16.16b, v17.16b + mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal v0.4s, v19.4h, v7.h[3] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + + smlal v0.4s, v20.4h, v7.h[4] + smlal v0.4s, v21.4h, v7.h[5] + + sdot v5.4s, v2.16b, v26.4b[0] + mov v20.16b, v21.16b + mov v21.16b, v22.16b +.ifc \type, put + subs \h, \h, #1 +.endif + smlal v0.4s, v22.4h, v7.h[6] + shrn v22.4h, v5.4s, #2 + + smlal v0.4s, v22.4h, v7.h[7] +.ifc \type, prep + rshrn v0.4h, v0.4s, #6 + str d0, [\dst], #8 + subs \h, \h, #1 +.else + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, 
v0.8h, #2 + str s0, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 4b + ret x15 + +.ifc \type, put + .align JUMP_ALIGN +20: // HV8 - 2xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + + .align LOOP_ALIGN +2: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smlal v0.4s, v17.4h, v7.h[1] + mov v16.16b, v17.16b + mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal v0.4s, v19.4h, v7.h[3] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + + smlal v0.4s, v20.4h, v7.h[4] + smlal v0.4s, v21.4h, v7.h[5] + + sdot v5.4s, v2.16b, v26.4b[0] + mov v20.16b, v21.16b + mov v21.16b, v22.16b + + subs \h, \h, #1 + smlal v0.4s, v22.4h, v7.h[6] + shrn v22.4h, v5.4s, #2 + + smlal v0.4s, v22.4h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + + str h0, [\dst] + add \dst, \dst, \d_strd + b.gt 2b + ret x15 +.endif + + .align JUMP_ALIGN +L(\type\()_6tap_hv_\isa): + cmp \w, #4 + b.eq 40f +.ifc \type, put + b.lt 20f +.endif + + // .align JUMP_ALIGN // fallthrough +80: // HV6 - 8xN+ + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] +.ifc \type, prep + add \wd_strd, \w, \w +.endif + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +8: + ldr q23, [\xmy] + add \xmy, \xmy, \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smull2 v1.4s, v16.8h, v7.h[1] + sub v23.16b, v23.16b, v24.16b + mov v16.16b, v17.16b + + mov v5.16b, v27.16b + mov v6.16b, v27.16b + + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b + + smlal v0.4s, v17.4h, v7.h[2] + smlal2 v1.4s, v17.8h, v7.h[2] + tbl v4.16b, {v23.16b}, v30.16b + mov v17.16b, v18.16b + + sdot v5.4s, v2.16b, v26.4b[0] + sdot v6.4s, v3.16b, v26.4b[0] + smlal v0.4s, v18.4h, v7.h[3] + smlal2 v1.4s, v18.8h, v7.h[3] + mov v18.16b, v19.16b + + sdot v5.4s, v3.16b, v26.4b[1] + sdot v6.4s, v4.16b, v26.4b[1] + smlal v0.4s, v19.4h, v7.h[4] + smlal2 v1.4s, v19.8h, v7.h[4] + mov v19.16b, v20.16b + uzp1 v23.8h, v5.8h, v6.8h + + smlal v0.4s, v20.4h, v7.h[5] + smlal2 v1.4s, v20.8h, v7.h[5] + sshr v20.8h, v23.8h, #2 +.ifc \type, prep + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + st1 {v0.8h}, [\ldst], \d_strd + subs w8, w8, #1 + b.gt 8b + add \dst, \dst, #16 +.else + subs w8, w8, #1 + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 +.endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + .align FUNC_ALIGN +L(\type\()_hv_filter8_\isa): + ldr q4, [\lsrc] + add \lsrc, \lsrc, \s_strd + sub v4.16b, v4.16b, v24.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + tbl v2.16b, {v4.16b}, v28.16b + tbl v3.16b, 
{v4.16b}, v29.16b + tbl v4.16b, {v4.16b}, v30.16b + sdot v22.4s, v2.16b, v26.4b[0] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v23.4s, v4.16b, v26.4b[1] + shrn v22.4h, v22.4s, #2 + shrn2 v22.8h, v23.4s, #2 + ret + + .align FUNC_ALIGN +L(\type\()_hv_filter4_\isa): + mov v22.16b, v27.16b + ld1 {v4.8b}, [\src], \s_strd + sub v4.16b, v4.16b, v24.16b + tbl v2.16b, {v4.16b}, v28.16b + sdot v22.4s, v2.16b, v26.4b[0] + shrn v22.4h, v22.4s, #2 + ret + + .align JUMP_ALIGN +40: // HV6 - 4xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +4: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smlal v0.4s, v17.4h, v7.h[2] + sub v4.16b, v4.16b, v24.16b + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal v0.4s, v19.4h, v7.h[4] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + sdot v5.4s, v2.16b, v26.4b[0] + + smlal v0.4s, v20.4h, v7.h[5] + shrn v20.4h, v5.4s, #2 +.ifc \type, prep + smlal v0.4s, v20.4h, v7.h[6] + rshrn v0.4h, v0.4s, #6 + str d0, [\dst], #8 + subs \h, \h, #1 +.else + subs \h, \h, #1 + smlal v0.4s, v20.4h, v7.h[6] + tbl v0.16b, {v0.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + str s0, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 4b + ret x15 + +.ifc \type, put + .align JUMP_ALIGN +20: // HV6 - 2xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +2: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smlal v0.4s, v17.4h, v7.h[2] + sub v4.16b, v4.16b, v24.16b + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal v0.4s, v19.4h, v7.h[4] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + sdot v5.4s, v2.16b, v26.4b[0] + + smlal v0.4s, v20.4h, v7.h[5] + shrn v20.4h, v5.4s, #2 + + subs \h, \h, #1 + smlal v0.4s, v20.4h, v7.h[6] + + tbl v0.16b, {v0.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + + str h0, [\dst] + add \dst, \dst, \d_strd + b.gt 2b + ret x15 +.endif + + .align JUMP_ALIGN +L(\type\()_8tap_h_\isa): + adr x9, L(\type\()_8tap_h_\isa\()_tbl) + ldrh w8, [x9, x8, lsl #1] +.ifc \type, put + mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT + dup v27.4s, w10 +.endif + sub x9, x9, x8 + br x9 + +.ifc \type, put + .align JUMP_ALIGN +20: // H - 2xN + AARCH64_VALID_JUMP_TARGET + add \src, \src, #2 + ldr s6, [\xmx, #2] + + .align LOOP_ALIGN +2: + ldr d0, [\src] + ldr d1, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.8b, v0.8b, v24.8b + sub v1.8b, v1.8b, v24.8b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + + tbl v2.16b, {v0.16b}, v28.16b + tbl v3.16b, {v1.16b}, v28.16b + + sdot v4.4s, v2.16b, v6.4b[0] + sdot v5.4s, v3.16b, v6.4b[0] + + uzp1 v4.8h, v4.8h, v5.8h + sqshrun v4.8b, v4.8h, #6 + + subs \h, \h, #2 + fmov x8, d4 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + b.gt 2b + ret + +.endif + + .align 
JUMP_ALIGN +40: // H - 4xN + AARCH64_VALID_JUMP_TARGET + add \src, \src, #2 + ldr s26, [\xmx, #2] + + .align LOOP_ALIGN +4: + ldr d0, [\src] + ldr d1, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.8b, v0.8b, v24.8b + sub v1.8b, v1.8b, v24.8b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + + tbl v2.16b, {v0.16b}, v28.16b + tbl v3.16b, {v1.16b}, v28.16b + + sdot v4.4s, v2.16b, v26.4b[0] + sdot v5.4s, v3.16b, v26.4b[0] +.ifc \type, prep + subs \h, \h, #2 + shrn v4.4h, v4.4s, #2 + shrn2 v4.8h, v5.4s, #2 + str q4, [\dst], #16 +.else + uzp1 v4.8h, v4.8h, v5.8h + sqshrun v4.8b, v4.8h, #6 + subs \h, \h, #2 + fmov x8, d4 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + ret + + .align JUMP_ALIGN +80: // H - 8xN + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] + + .align LOOP_ALIGN +8: + ldr q0, [\src] + ldr q16, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.16b, v0.16b, v24.16b + sub v16.16b, v16.16b, v24.16b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + mov v20.16b, v27.16b + mov v21.16b, v27.16b + + tbl v1.16b, {v0.16b}, v28.16b + tbl v2.16b, {v0.16b}, v29.16b + tbl v3.16b, {v0.16b}, v30.16b + tbl v17.16b, {v16.16b}, v28.16b + tbl v18.16b, {v16.16b}, v29.16b + tbl v19.16b, {v16.16b}, v30.16b + + sdot v4.4s, v1.16b, v26.4b[0] + sdot v5.4s, v2.16b, v26.4b[0] + sdot v20.4s, v17.16b, v26.4b[0] + sdot v21.4s, v18.16b, v26.4b[0] + sdot v4.4s, v2.16b, v26.4b[1] + sdot v5.4s, v3.16b, v26.4b[1] + sdot v20.4s, v18.16b, v26.4b[1] + sdot v21.4s, v19.16b, v26.4b[1] + + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v20.8h, v20.8h, v21.8h +.ifc \type, prep + sshr v4.8h, v4.8h, #2 + sshr v20.8h, v20.8h, #2 + subs \h, \h, #2 + stp q4, q20, [\dst], #32 +.else + sqshrun v4.8b, v4.8h, #6 + sqshrun v20.8b, v20.8h, #6 + subs \h, \h, #2 + str d4, [\dst] + str d20, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 8b + ret + + .align JUMP_ALIGN +160: // H - 16xN + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr q31, L(h_tbl_neon_dotprod) + 48 + ldr d26, [\xmx] + + .align LOOP_ALIGN +16: + ldp q16, q17, [\src] + add \src, \src, \s_strd + + sub v16.16b, v16.16b, v24.16b + sub v17.16b, v17.16b, v24.16b + + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + + tbl v0.16b, {v16.16b}, v28.16b + tbl v1.16b, {v16.16b}, v29.16b + tbl v2.16b, {v16.16b}, v30.16b + tbl v3.16b, {v16.16b, v17.16b}, v31.16b + tbl v4.16b, {v17.16b}, v28.16b + + sdot v6.4s, v0.16b, v26.4b[0] + sdot v7.4s, v1.16b, v26.4b[0] + sdot v22.4s, v2.16b, v26.4b[0] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v6.4s, v1.16b, v26.4b[1] + sdot v7.4s, v2.16b, v26.4b[1] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v4.16b, v26.4b[1] + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h +.ifc \type, prep + sshr v6.8h, v6.8h, #2 + sshr v22.8h, v22.8h, #2 + subs \h, \h, #1 + stp q6, q22, [\dst], #32 +.else + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs \h, \h, #1 + str q6, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 16b + ret + + .align JUMP_ALIGN +320: // H - 32xN+ +640: +1280: + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr q31, L(h_tbl_neon_dotprod) + 48 + ldr d26, [\xmx] +.ifc \type, put + sub \d_strd, \d_strd, \w, uxtw +.endif + sub \s_strd, \s_strd, \w, uxtw + mov w8, \w + + .align LOOP_ALIGN +32: + ldp q16, q17, 
[\src], #16 + + sub v16.16b, v16.16b, v24.16b + sub v17.16b, v17.16b, v24.16b + + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + + tbl v0.16b, {v16.16b}, v28.16b + tbl v1.16b, {v16.16b}, v29.16b + tbl v2.16b, {v16.16b}, v30.16b + tbl v3.16b, {v16.16b, v17.16b}, v31.16b + tbl v4.16b, {v17.16b}, v28.16b + + sdot v6.4s, v0.16b, v26.4b[0] + sdot v7.4s, v1.16b, v26.4b[0] + sdot v22.4s, v2.16b, v26.4b[0] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v6.4s, v1.16b, v26.4b[1] + sdot v7.4s, v2.16b, v26.4b[1] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v4.16b, v26.4b[1] + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h +.ifc \type, prep + sshr v6.8h, v6.8h, #2 + sshr v22.8h, v22.8h, #2 + subs w8, w8, #16 + stp q6, q22, [\dst], #32 +.else + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs w8, w8, #16 + str q6, [\dst], #16 +.endif + b.gt 32b + + add \src, \src, \s_strd +.ifc \type, put + add \dst, \dst, \d_strd +.endif + mov w8, \w + subs \h, \h, #1 + b.gt 32b + ret + +L(\type\()_8tap_h_\isa\()_tbl): + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b) +.ifc \type, put + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b) +.endif +endfunc +.endm + +// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) +// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) +filter_8tap_fn prep, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 + +// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) +// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) +filter_8tap_fn put, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 + +DISABLE_DOTPROD +#endif // HAVE_DOTPROD diff --git a/src/arm/mc.h b/src/arm/mc.h index 06cd533a9..7e57fd37c 100644 --- a/src/arm/mc.h +++ b/src/arm/mc.h @@ -30,26 +30,40 @@ #include "src/mc.h" #include "src/cpu.h" -decl_mc_fn(BF(dav1d_put_8tap_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon)); -decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon)); -decl_mc_fn(BF(dav1d_put_bilin, neon)); +#define decl_8tap_gen(decl_name, fn_name, opt) \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_sharp, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_sharp, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp, opt)) + +#define decl_8tap_fns(opt) \ + decl_8tap_gen(mc, put, opt); \ + decl_8tap_gen(mct, prep, opt) + +#define init_8tap_gen(name, opt) \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, 
opt); \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, opt) + +#define init_8tap_fns(opt) \ + init_8tap_gen(mc, opt); \ + init_8tap_gen(mct, opt) + +decl_8tap_fns(neon); +decl_8tap_fns(neon_dotprod); -decl_mct_fn(BF(dav1d_prep_8tap_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon)); +decl_mc_fn(BF(dav1d_put_bilin, neon)); decl_mct_fn(BF(dav1d_prep_bilin, neon)); decl_avg_fn(BF(dav1d_avg, neon)); @@ -77,27 +91,10 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); - init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); - - init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); - init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); + init_8tap_fns(neon); + + init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); + init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); c->avg = BF(dav1d_avg, neon); c->w_avg = BF(dav1d_w_avg, neon); @@ -111,4 +108,12 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); + +#if ARCH_AARCH64 +#if HAVE_DOTPROD && BITDEPTH == 8 + if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return; + + init_8tap_fns(neon_dotprod); +#endif // HAVE_DOTPROD && BITDEPTH == 8 +#endif // ARCH_AARCH64 } diff --git a/src/meson.build b/src/meson.build index 56daf005c..4a4747c73 100644 --- a/src/meson.build +++ b/src/meson.build @@ -106,6 +106,7 @@ if is_asm_enabled 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', 'arm/64/mc.S', + 'arm/64/mc_dotprod.S', ) endif 
From c7f38b303e7dc707ffcc6ce30594598f83f03254 Mon Sep 17 00:00:00 2001
From: Martin Storsjö
Date: Fri, 19 Apr 2024 00:07:44 +0300
Subject: [PATCH 03/22] aarch64: Avoid unaligned jump tables

Manually add a padding 0 entry to make the odd number of .hword entries
align with the instruction size.

This fixes assembling with GAS with the --gdwarf2 option, which previously
produced the error message "unaligned opcodes detected in executable
segment". The message is slightly misleading: the error is printed even if
no opcodes actually are misaligned, because the jump table is the last
thing within the .text section.

The issue can be reproduced with an input as small as this, assembled
with "as --gdwarf2 -c test.s".

.text
nop
.hword 0

See a6228f47f0eebcdfebb1753a786e3e1654b51ea4 for earlier cases of the same
error - although in those cases, we actually did have more code and labels
following the unaligned jump tables.

This error is present with binutils 2.39 and earlier; in binutils 2.40,
this input is no longer considered an error. It was fixed in
https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=6f6f5b0adc9efd103c434fd316e8c880a259775d.
---
 src/arm/64/mc_dotprod.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S
index fcf04ee4d..051a201dd 100644
--- a/src/arm/64/mc_dotprod.S
+++ b/src/arm/64/mc_dotprod.S
@@ -1397,6 +1397,7 @@ L(\type\()_8tap_h_\isa\()_tbl):
         .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b)
 .ifc \type, put
         .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b)
+        .hword 0
 .endif
 endfunc
 .endm

From 08417d57a400868aa11cb70bae3693b0c60ed992 Mon Sep 17 00:00:00 2001
From: Arpad Panyik
Date: Tue, 23 Apr 2024 16:50:35 +0200
Subject: [PATCH 04/22] AArch64: Add \dot parameter to filter_8tap_fn macro

Add a \dot parameter to the filter_8tap_fn macro in preparation for
extending it with an i8mm code path. This patch also contains string
fixes and some instruction reordering, along with some register renaming,
to make the code more uniform. These changes don't affect performance but
simplify the code a bit.
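As a sketch of the mechanism (not from the patch itself; the dot_acc macro
and its operand names below are made up for illustration): GAS substitutes
macro parameters textually, so a parameter can carry the instruction
mnemonic itself, letting the same macro body later emit the i8mm dot
product (assumed to be usdot) instead of sdot:

.macro dot_acc dot, acc, src, taps
        \dot    \acc\().4s, \src\().16b, \taps\().4b[0]
.endm

        dot_acc sdot, v0, v16, v7   // expands to: sdot v0.4s, v16.16b, v7.4b[0]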
--- src/arm/64/mc_dotprod.S | 239 ++++++++++++++++++++-------------------- 1 file changed, 119 insertions(+), 120 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 051a201dd..c6040145b 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -86,7 +86,7 @@ function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN endfunc .endm -.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd +.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa @@ -112,12 +112,10 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN .align JUMP_ALIGN L(\type\()_8tap_v_\isa): madd \my, \my, w11, w10 -.ifc \type, prep - mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding -.endif - sub \src, \src, \s_strd ldr q6, L(v_tbl_neon_dotprod) + sub \src, \src, \s_strd .ifc \type, prep + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 .endif ubfx w11, \my, #7, #7 @@ -125,9 +123,9 @@ L(\type\()_8tap_v_\isa): ldr q28, L(v_tbl_neon_dotprod) + 16 cmp \h, #4 csel \my, \my, w11, le - sub \src, \src, \s_strd, lsl #1 // src - src_stride * 3 - ldr q29, L(v_tbl_neon_dotprod) + 32 + sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 add \xmy, x12, \xmy, lsl #3 // subpel V filter address + ldr q29, L(v_tbl_neon_dotprod) + 32 movi v5.16b, #128 ldr d7, [\xmy] cmp \w, #8 @@ -211,20 +209,20 @@ L(\type\()_8tap_v_\isa): sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v3.4s, v25.16b, v7.4b[0] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v3.4s, v25.16b, v7.4b[0] tbl v16.16b, {v16.16b, v17.16b}, v6.16b tbl v19.16b, {v19.16b, v20.16b}, v6.16b tbl v22.16b, {v22.16b, v23.16b}, v6.16b tbl v25.16b, {v25.16b, v26.16b}, v6.16b - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v20.16b, v7.4b[1] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v26.16b, v7.4b[1] tbl v17.16b, {v17.16b, v18.16b}, v28.16b tbl v20.16b, {v20.16b, v21.16b}, v29.16b @@ -238,7 +236,7 @@ L(\type\()_8tap_v_\isa): sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 st1 {v0.8h, v1.8h}, [\ldst], \d_strd -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun2 v0.16b, v2.8h, #6 st1 {v0.16b}, [\ldst], \d_strd @@ -256,15 +254,15 @@ L(\type\()_8tap_v_\isa): movi v2.4s, #32, lsl 8 movi v3.4s, #32, lsl 8 .endif - sdot v0.4s, v16.16b, v7.4b[0] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v3.4s, v25.16b, v7.4b[0] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v3.4s, v25.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v20.16b, v7.4b[1] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v26.16b, v7.4b[1] subs \w, \w, #16 uzp1 v0.8h, v0.8h, v1.8h @@ -274,7 +272,7 @@ L(\type\()_8tap_v_\isa): sshr v1.8h, v2.8h, #2 stp q0, q1, [\ldst] add \dst, \dst, #32 -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun2 v0.16b, v2.8h, #6 str q0, [\ldst] @@ -318,7 +316,6 @@ L(\type\()_8tap_v_\isa): 
.ifc \type, put b.eq 82f .endif - .align LOOP_ALIGN 8: ldr d21, [\src] @@ -345,20 +342,20 @@ L(\type\()_8tap_v_\isa): tbl v23.16b, {v17.16b, v18.16b}, v28.16b tbl v26.16b, {v20.16b, v21.16b}, v29.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] tbl v16.16b, {v22.16b, v23.16b}, v6.16b tbl v19.16b, {v25.16b, v26.16b}, v6.16b tbl v17.16b, {v23.16b, v24.16b}, v28.16b tbl v20.16b, {v26.16b, v27.16b}, v29.16b - sdot v2.4s, v22.16b, v7.4b[0] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v25.16b, v7.4b[0] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v25.16b, v7.4b[0] + \dot v3.4s, v26.16b, v7.4b[1] subs \h, \h, #2 uzp1 v0.8h, v0.8h, v1.8h @@ -367,7 +364,7 @@ L(\type\()_8tap_v_\isa): sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 stp q0, q1, [\dst], #32 -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun v1.8b, v2.8h, #6 str d0, [\dst] @@ -399,15 +396,15 @@ L(\type\()_8tap_v_\isa): tbl v23.16b, {v17.16b, v18.16b}, v28.16b tbl v26.16b, {v20.16b, v21.16b}, v29.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v25.16b, v7.4b[0] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v25.16b, v7.4b[0] + \dot v3.4s, v26.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h @@ -415,7 +412,7 @@ L(\type\()_8tap_v_\isa): sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 stp q0, q1, [\dst] -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun v1.8b, v2.8h, #6 str d0, [\dst] @@ -457,7 +454,6 @@ L(\type\()_8tap_v_\isa): .ifc \type, put b.eq 42f .endif - .align LOOP_ALIGN 4: ldr s18, [\src] @@ -476,14 +472,14 @@ L(\type\()_8tap_v_\isa): tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] tbl v16.16b, {v19.16b, v20.16b}, v6.16b tbl v17.16b, {v20.16b, v21.16b}, v28.16b - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep subs \h, \h, #2 shrn v0.4h, v0.4s, #2 @@ -517,16 +513,15 @@ L(\type\()_8tap_v_\isa): tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 str q0, [\dst] - ret .else uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -534,8 +529,10 @@ L(\type\()_8tap_v_\isa): lsr x9, x8, #32 str w8, [\dst] str w9, [\dst, \d_strd] +.endif ret +.ifc \type, put .align JUMP_ALIGN 20: // V - 2xN ldr h16, [\src] @@ -581,14 +578,14 @@ L(\type\()_8tap_v_\isa): tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, 
v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] tbl v16.16b, {v19.16b, v20.16b}, v6.16b tbl v17.16b, {v20.16b, v21.16b}, v28.16b - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -613,11 +610,11 @@ L(\type\()_8tap_v_\isa): tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -635,8 +632,8 @@ L(\type\()_8tap_h_hv_\isa): madd w14, \my, w11, w10 // for HV ldr q28, L(h_tbl_neon_dotprod) mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding + dup v27.4s, w13 // put H overrides this sub \src, \src, #4 // src - 4 - dup v27.4s, w13 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7F ubfx w11, w14, #7, #7 // for HV @@ -650,7 +647,7 @@ L(\type\()_8tap_h_hv_\isa): // HV cases cmp \h, #4 csel w14, w14, w11, le - sub \src, \src, \s_strd, lsl #1 // src - src_stride * 2 - 4 + sub \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 4 add \xmy, x12, x14, lsl #3 // subpel V filter address mov x15, x30 ldr d7, [\xmy] @@ -662,7 +659,7 @@ L(\type\()_8tap_h_hv_\isa): b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 // HV 8-tap cases - sub \src, \src, \s_strd // src - src_stride * 3 - 4 + sub \src, \src, \s_strd // src - s_strd * 3 - 4 cmp \w, #4 b.eq 40f .ifc \type, put @@ -677,7 +674,6 @@ L(\type\()_8tap_h_hv_\isa): .ifc \type, prep add \wd_strd, \w, \w .endif - .align LOOP_ALIGN 81: mov \lsrc, \src @@ -724,15 +720,15 @@ L(\type\()_8tap_h_hv_\isa): smlal2 v1.4s, v18.8h, v7.h[2] mov v18.16b, v19.16b - sdot v5.4s, v2.16b, v26.4b[0] - sdot v6.4s, v3.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v6.4s, v3.16b, v26.4b[0] smlal v0.4s, v19.4h, v7.h[3] smlal2 v1.4s, v19.8h, v7.h[3] mov v19.16b, v20.16b - sdot v5.4s, v3.16b, v26.4b[1] - sdot v6.4s, v4.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v6.4s, v4.16b, v26.4b[1] smlal v0.4s, v20.4h, v7.h[4] smlal2 v1.4s, v20.8h, v7.h[4] @@ -757,7 +753,7 @@ L(\type\()_8tap_h_hv_\isa): st1 {v0.8h}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #16 -.else +.else // put shrn v22.4h, v5.4s, #2 shrn2 v22.8h, v6.4s, #2 smlal v0.4s, v22.4h, v7.h[7] @@ -801,6 +797,7 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b smlal v0.4s, v18.4h, v7.h[2] @@ -814,7 +811,7 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v20.4h, v7.h[4] smlal v0.4s, v21.4h, v7.h[5] - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b .ifc \type, put @@ -865,6 +862,7 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b smlal v0.4s, v18.4h, v7.h[2] @@ -878,7 +876,7 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v20.4h, v7.h[4] smlal v0.4s, v21.4h, v7.h[5] - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b @@ -951,14 +949,16 @@ L(\type\()_6tap_hv_\isa): tbl v4.16b, {v23.16b}, v30.16b mov v17.16b, v18.16b - sdot v5.4s, v2.16b, v26.4b[0] - sdot v6.4s, v3.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v6.4s, v3.16b, v26.4b[0] + smlal v0.4s, v18.4h, v7.h[3] smlal2 v1.4s, v18.8h, v7.h[3] mov v18.16b, 
v19.16b - sdot v5.4s, v3.16b, v26.4b[1] - sdot v6.4s, v4.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v6.4s, v4.16b, v26.4b[1] + smlal v0.4s, v19.4h, v7.h[4] smlal2 v1.4s, v19.8h, v7.h[4] mov v19.16b, v20.16b @@ -1001,10 +1001,10 @@ L(\type\()_hv_filter8_\isa): tbl v2.16b, {v4.16b}, v28.16b tbl v3.16b, {v4.16b}, v29.16b tbl v4.16b, {v4.16b}, v30.16b - sdot v22.4s, v2.16b, v26.4b[0] - sdot v22.4s, v3.16b, v26.4b[1] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v23.4s, v4.16b, v26.4b[1] + \dot v22.4s, v2.16b, v26.4b[0] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v23.4s, v4.16b, v26.4b[1] shrn v22.4h, v22.4s, #2 shrn2 v22.8h, v23.4s, #2 ret @@ -1015,7 +1015,7 @@ L(\type\()_hv_filter4_\isa): ld1 {v4.8b}, [\src], \s_strd sub v4.16b, v4.16b, v24.16b tbl v2.16b, {v4.16b}, v28.16b - sdot v22.4s, v2.16b, v26.4b[0] + \dot v22.4s, v2.16b, v26.4b[0] shrn v22.4h, v22.4s, #2 ret @@ -1052,7 +1052,7 @@ L(\type\()_hv_filter4_\isa): mov v18.16b, v19.16b mov v19.16b, v20.16b - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] shrn v20.4h, v5.4s, #2 @@ -1106,7 +1106,7 @@ L(\type\()_hv_filter4_\isa): mov v18.16b, v19.16b mov v19.16b, v20.16b - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] shrn v20.4h, v5.4s, #2 @@ -1139,7 +1139,7 @@ L(\type\()_8tap_h_\isa): 20: // H - 2xN AARCH64_VALID_JUMP_TARGET add \src, \src, #2 - ldr s6, [\xmx, #2] + ldr s26, [\xmx, #2] .align LOOP_ALIGN 2: @@ -1156,8 +1156,8 @@ L(\type\()_8tap_h_\isa): tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b - sdot v4.4s, v2.16b, v6.4b[0] - sdot v5.4s, v3.16b, v6.4b[0] + \dot v4.4s, v2.16b, v26.4b[0] + \dot v5.4s, v3.16b, v26.4b[0] uzp1 v4.8h, v4.8h, v5.8h sqshrun v4.8b, v4.8h, #6 @@ -1170,7 +1170,6 @@ L(\type\()_8tap_h_\isa): add \dst, \dst, \d_strd, lsl #1 b.gt 2b ret - .endif .align JUMP_ALIGN @@ -1194,14 +1193,14 @@ L(\type\()_8tap_h_\isa): tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b - sdot v4.4s, v2.16b, v26.4b[0] - sdot v5.4s, v3.16b, v26.4b[0] + \dot v4.4s, v2.16b, v26.4b[0] + \dot v5.4s, v3.16b, v26.4b[0] .ifc \type, prep subs \h, \h, #2 shrn v4.4h, v4.4s, #2 shrn2 v4.8h, v5.4s, #2 str q4, [\dst], #16 -.else +.else // put uzp1 v4.8h, v4.8h, v5.8h sqshrun v4.8b, v4.8h, #6 subs \h, \h, #2 @@ -1242,14 +1241,14 @@ L(\type\()_8tap_h_\isa): tbl v18.16b, {v16.16b}, v29.16b tbl v19.16b, {v16.16b}, v30.16b - sdot v4.4s, v1.16b, v26.4b[0] - sdot v5.4s, v2.16b, v26.4b[0] - sdot v20.4s, v17.16b, v26.4b[0] - sdot v21.4s, v18.16b, v26.4b[0] - sdot v4.4s, v2.16b, v26.4b[1] - sdot v5.4s, v3.16b, v26.4b[1] - sdot v20.4s, v18.16b, v26.4b[1] - sdot v21.4s, v19.16b, v26.4b[1] + \dot v4.4s, v1.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v20.4s, v17.16b, v26.4b[0] + \dot v21.4s, v18.16b, v26.4b[0] + \dot v4.4s, v2.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v20.4s, v18.16b, v26.4b[1] + \dot v21.4s, v19.16b, v26.4b[1] uzp1 v4.8h, v4.8h, v5.8h uzp1 v20.8h, v20.8h, v21.8h @@ -1258,7 +1257,7 @@ L(\type\()_8tap_h_\isa): sshr v20.8h, v20.8h, #2 subs \h, \h, #2 stp q4, q20, [\dst], #32 -.else +.else // put sqshrun v4.8b, v4.8h, #6 sqshrun v20.8b, v20.8h, #6 subs \h, \h, #2 @@ -1296,14 +1295,14 @@ L(\type\()_8tap_h_\isa): tbl v3.16b, {v16.16b, v17.16b}, v31.16b tbl v4.16b, {v17.16b}, v28.16b - sdot v6.4s, v0.16b, v26.4b[0] - sdot v7.4s, v1.16b, v26.4b[0] - sdot v22.4s, v2.16b, v26.4b[0] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v6.4s, v1.16b, v26.4b[1] - sdot v7.4s, v2.16b, v26.4b[1] - sdot v22.4s, v3.16b, 
v26.4b[1] - sdot v23.4s, v4.16b, v26.4b[1] + \dot v6.4s, v0.16b, v26.4b[0] + \dot v7.4s, v1.16b, v26.4b[0] + \dot v22.4s, v2.16b, v26.4b[0] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v6.4s, v1.16b, v26.4b[1] + \dot v7.4s, v2.16b, v26.4b[1] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v4.16b, v26.4b[1] uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h @@ -1312,7 +1311,7 @@ L(\type\()_8tap_h_\isa): sshr v22.8h, v22.8h, #2 subs \h, \h, #1 stp q6, q22, [\dst], #32 -.else +.else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs \h, \h, #1 @@ -1355,14 +1354,14 @@ L(\type\()_8tap_h_\isa): tbl v3.16b, {v16.16b, v17.16b}, v31.16b tbl v4.16b, {v17.16b}, v28.16b - sdot v6.4s, v0.16b, v26.4b[0] - sdot v7.4s, v1.16b, v26.4b[0] - sdot v22.4s, v2.16b, v26.4b[0] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v6.4s, v1.16b, v26.4b[1] - sdot v7.4s, v2.16b, v26.4b[1] - sdot v22.4s, v3.16b, v26.4b[1] - sdot v23.4s, v4.16b, v26.4b[1] + \dot v6.4s, v0.16b, v26.4b[0] + \dot v7.4s, v1.16b, v26.4b[0] + \dot v22.4s, v2.16b, v26.4b[0] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v6.4s, v1.16b, v26.4b[1] + \dot v7.4s, v2.16b, v26.4b[1] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v4.16b, v26.4b[1] uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h @@ -1371,7 +1370,7 @@ L(\type\()_8tap_h_\isa): sshr v22.8h, v22.8h, #2 subs w8, w8, #16 stp q6, q22, [\dst], #32 -.else +.else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs w8, w8, #16 @@ -1404,11 +1403,11 @@ endfunc // dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) // xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) -filter_8tap_fn prep, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 +filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) -filter_8tap_fn put, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 +filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 DISABLE_DOTPROD #endif // HAVE_DOTPROD From 7351d94f04ddf76be9e1e6c46768d42109611630 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Tue, 23 Apr 2024 16:52:38 +0200 Subject: [PATCH 05/22] AArch64: Simplify DotProd path of vertical subpel filters Simplify the accumulator initializations of the DotProd code path of vertical subpel filters. This also makes it possible for some CPUs to use zero latency vector register moves. The load is also simplified (ldr + add -> ld1) in the inner loop of vertical filter for block size 16. 
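A single pre-set accumulator register (v4 in the code) works because the value folded into it is loop invariant: the SDOT path biases the unsigned samples by 128 so they fit signed operands, and compensates by starting the accumulator at 128 * sum(coefficients) plus the rounding term (the "FILTER_WEIGHT * 128 + rounding" constant in the code comments). A rough scalar model of that cancellation, with a made-up example filter:

    #include <assert.h>
    #include <stdint.h>

    enum { TAPS = 8, BIAS = 128 };

    /* Accumulator is set up once with the bias compensation plus the
     * rounding term, then accumulates the biased (signed) samples,
     * mirroring what the SDOT instructions see. */
    static int32_t dot_biased(const uint8_t *x, const int8_t *c, int32_t rnd)
    {
        int32_t coef_sum = 0;
        for (int i = 0; i < TAPS; i++)
            coef_sum += c[i];
        int32_t acc = BIAS * coef_sum + rnd;   /* materialized once, outside the loop */
        for (int i = 0; i < TAPS; i++)
            acc += (x[i] - BIAS) * c[i];       /* biased sample * coefficient */
        return acc;
    }

    int main(void)
    {
        const uint8_t x[TAPS] = { 3, 250, 17, 128, 99, 1, 200, 64 };
        const int8_t  c[TAPS] = { -1, 4, -10, 39, 39, -10, 4, -1 }; /* example only */
        int32_t ref = 2;                       /* rounding */
        for (int i = 0; i < TAPS; i++)
            ref += x[i] * c[i];
        assert(dot_biased(x, c, 2) == ref);    /* the bias cancels exactly */
        return 0;
    }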
--- src/arm/64/mc_dotprod.S | 66 +++++++++++++---------------------------- 1 file changed, 21 insertions(+), 45 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index c6040145b..3d397d1ba 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -115,8 +115,10 @@ L(\type\()_8tap_v_\isa): ldr q6, L(v_tbl_neon_dotprod) sub \src, \src, \s_strd .ifc \type, prep - mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 +.else + movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT .endif ubfx w11, \my, #7, #7 and \my, \my, #0x7F @@ -191,19 +193,13 @@ L(\type\()_8tap_v_\isa): .align LOOP_ALIGN 16: - ldr q27, [\lsrc] - add \lsrc, \lsrc, \s_strd -.ifc \type, prep + ld1 {v27.16b}, [\lsrc], \s_strd + mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.endif + sub v18.16b, v27.16b, v5.16b sub v21.16b, v27.16b, v5.16b sub v24.16b, v27.16b, v5.16b @@ -243,17 +239,11 @@ L(\type\()_8tap_v_\isa): .endif b.gt 16b -.ifc \type, prep mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.endif + \dot v0.4s, v16.16b, v7.4b[0] \dot v1.4s, v19.16b, v7.4b[0] \dot v2.4s, v22.16b, v7.4b[0] @@ -321,17 +311,12 @@ L(\type\()_8tap_v_\isa): ldr d21, [\src] ldr d27, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \type, prep + mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.endif + sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b sub v24.16b, v27.16b, v5.16b @@ -376,18 +361,14 @@ L(\type\()_8tap_v_\isa): .ifc \type, put .align JUMP_ALIGN 82: +.endif ldr d21, [\src] - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.else - ldr d21, [\src] + mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.endif + sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b @@ -459,13 +440,10 @@ L(\type\()_8tap_v_\isa): ldr s18, [\src] ldr s21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \type, prep + mov v0.16b, v4.16b mov v1.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 -.endif + sub v18.16b, v18.16b, v5.16b sub v21.16b, v21.16b, v5.16b @@ -500,14 +478,12 @@ L(\type\()_8tap_v_\isa): .ifc \type, put .align JUMP_ALIGN 42: +.endif ldr s18, [\src] - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 -.else - ldr s18, [\src] + mov v0.16b, v4.16b mov v1.16b, v4.16b -.endif + sub v18.16b, v18.16b, v5.16b tbl v19.16b, {v16.16b, v17.16b}, v6.16b @@ -569,8 +545,8 @@ L(\type\()_8tap_v_\isa): ldr h21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 + mov v0.16b, v4.16b + mov v1.16b, v4.16b sub v18.8b, v18.8b, v5.8b sub v21.8b, v21.8b, v5.8b @@ -602,8 +578,8 @@ L(\type\()_8tap_v_\isa): 22: ldr h18, [\src] - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 + mov v0.16b, v4.16b + mov v1.16b, v4.16b sub v18.8b, v18.8b, v5.8b From 417cdc55cc98c07acb8f3a49c69831200d8ab954 Mon Sep 
17 00:00:00 2001 From: Arpad Panyik Date: Tue, 23 Apr 2024 16:55:24 +0200 Subject: [PATCH 06/22] AArch64: Simplify DotProd path of horizontal subpel filters Simplify the inner loops of the DotProd code path of horizontal subpel filters to avoid using 2-register TBL instructions. The store part of block size 16 of the horizontal put case is also simplified (str + add -> st1). This patch can improve performance mostly on small cores like Cortex-A510 and newer. Other CPUs are mostly unaffected. Cortex-A510: mct_8tap_sharp_w16_h_8bpc_dotprod: 2.77x -> 3.13x mct_8tap_sharp_w32_h_8bpc_dotprod: 2.32x -> 2.56x Cortex-A55: mct_8tap_sharp_w16_h_8bpc_dotprod: 3.89x -> 3.89x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.35x -> 3.35x Cortex-A715: mct_8tap_sharp_w16_h_8bpc_dotprod: 3.79x -> 3.78x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.30x -> 3.30x Cortex-A78: mct_8tap_sharp_w16_h_8bpc_dotprod: 4.30x -> 4.31x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.79x -> 3.80x Cortex-X3: mct_8tap_sharp_w16_h_8bpc_dotprod: 4.74x -> 4.75x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.89x -> 3.91x Cortex-X1: mct_8tap_sharp_w16_h_8bpc_dotprod: 4.61x -> 4.62x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.67x -> 3.66x --- src/arm/64/mc_dotprod.S | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 3d397d1ba..0a2dc9f10 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -60,7 +60,6 @@ L(h_tbl_neon_dotprod): .byte 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7 .byte 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11 .byte 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15 - .byte 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 // Vertical convolutions are also using SDOT instructions, where a 128-bit // register contains a transposed 4x4 matrix of values. 
Subsequent iterations of @@ -1249,12 +1248,12 @@ L(\type\()_8tap_h_\isa): AARCH64_VALID_JUMP_TARGET ldr q29, L(h_tbl_neon_dotprod) + 16 ldr q30, L(h_tbl_neon_dotprod) + 32 - ldr q31, L(h_tbl_neon_dotprod) + 48 ldr d26, [\xmx] .align LOOP_ALIGN 16: - ldp q16, q17, [\src] + ldr q16, [\src] + ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd sub v16.16b, v16.16b, v24.16b @@ -1268,8 +1267,8 @@ L(\type\()_8tap_h_\isa): tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b - tbl v3.16b, {v16.16b, v17.16b}, v31.16b - tbl v4.16b, {v17.16b}, v28.16b + tbl v3.16b, {v17.16b}, v28.16b + tbl v4.16b, {v17.16b}, v29.16b \dot v6.4s, v0.16b, v26.4b[0] \dot v7.4s, v1.16b, v26.4b[0] @@ -1291,8 +1290,7 @@ L(\type\()_8tap_h_\isa): sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs \h, \h, #1 - str q6, [\dst] - add \dst, \dst, \d_strd + st1 {v6.16b}, [\dst], \d_strd .endif b.gt 16b ret @@ -1304,7 +1302,6 @@ L(\type\()_8tap_h_\isa): AARCH64_VALID_JUMP_TARGET ldr q29, L(h_tbl_neon_dotprod) + 16 ldr q30, L(h_tbl_neon_dotprod) + 32 - ldr q31, L(h_tbl_neon_dotprod) + 48 ldr d26, [\xmx] .ifc \type, put sub \d_strd, \d_strd, \w, uxtw @@ -1314,7 +1311,9 @@ L(\type\()_8tap_h_\isa): .align LOOP_ALIGN 32: - ldp q16, q17, [\src], #16 + ldr q16, [\src] + ldr q17, [\src, #12] // avoid 2 register TBL for small cores + add \src, \src, #16 sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b @@ -1327,8 +1326,8 @@ L(\type\()_8tap_h_\isa): tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b - tbl v3.16b, {v16.16b, v17.16b}, v31.16b - tbl v4.16b, {v17.16b}, v28.16b + tbl v3.16b, {v17.16b}, v28.16b + tbl v4.16b, {v17.16b}, v29.16b \dot v6.4s, v0.16b, v26.4b[0] \dot v7.4s, v1.16b, v26.4b[0] From e54e6d9f7d174f27704d229bf4822f2f8dca1f01 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Thu, 25 Apr 2024 16:53:04 +0200 Subject: [PATCH 07/22] AArch64: Simplify TBL usage in 2D DotProd filters Simplify the TBL usages in small block size (2, 4) parts of the 2D (horizontal-vertical) put subpel filters. The 2-register TBLs are replaced with the 1-register form because we only need the lower 64-bits of the result and it can be extracted from only one source register. Performance is not affected by this change. 
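A standalone intrinsics sketch of the property relied on here: as long as every index byte used for the low half stays below 16, the low 64 bits of a 2-register TBL match a 1-register TBL on the first table register (index values and table contents below are made up):

    #include <arm_neon.h>
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint8_t buf[32];
        const uint8_t idx8[8] = { 1, 2, 3, 4, 2, 3, 4, 5 };   /* all < 16 */
        for (int i = 0; i < 32; i++)
            buf[i] = (uint8_t)(i * 7 + 3);

        uint8x16x2_t tab2;
        tab2.val[0] = vld1q_u8(buf);
        tab2.val[1] = vld1q_u8(buf + 16);
        uint8x16_t idx16 = vcombine_u8(vld1_u8(idx8), vld1_u8(idx8));

        uint8x16_t r2 = vqtbl2q_u8(tab2, idx16);                 /* TBL, 2 table registers */
        uint8x8_t  r1 = vqtbl1_u8(vld1q_u8(buf), vld1_u8(idx8)); /* TBL, 1 table register  */

        uint8_t a[8], b[8];
        vst1_u8(a, vget_low_u8(r2));
        vst1_u8(b, r1);
        for (int i = 0; i < 8; i++)
            assert(a[i] == b[i]);              /* low halves are identical */
        return 0;
    }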
--- src/arm/64/mc_dotprod.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 0a2dc9f10..e076abf46 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -801,7 +801,7 @@ L(\type\()_8tap_h_hv_\isa): str d0, [\dst], #8 subs \h, \h, #1 .else - tbl v0.16b, {v0.16b, v1.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] add \dst, \dst, \d_strd @@ -860,7 +860,7 @@ L(\type\()_8tap_h_hv_\isa): shrn v22.4h, v5.4s, #2 smlal v0.4s, v22.4h, v7.h[7] - tbl v0.16b, {v0.16b, v1.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str h0, [\dst] @@ -1039,7 +1039,7 @@ L(\type\()_hv_filter4_\isa): .else subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] - tbl v0.16b, {v0.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] add \dst, \dst, \d_strd @@ -1089,7 +1089,7 @@ L(\type\()_hv_filter4_\isa): subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] - tbl v0.16b, {v0.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str h0, [\dst] From 1cdba4879043da2bacf8caa592375311f09df03b Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Thu, 25 Apr 2024 16:54:13 +0200 Subject: [PATCH 08/22] AArch64: Simplify loads in *hv_filter* of DotProd path Simplify the load sequences in *hv_filter* functions (ldr + add -> ld1) to be more uniform and smaller. Performance is not affected. --- src/arm/64/mc_dotprod.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index e076abf46..31abe6235 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -968,8 +968,7 @@ L(\type\()_6tap_hv_\isa): .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): - ldr q4, [\lsrc] - add \lsrc, \lsrc, \s_strd + ld1 {v4.16b}, [\lsrc], \s_strd sub v4.16b, v4.16b, v24.16b mov v22.16b, v27.16b mov v23.16b, v27.16b From 3980f14220b91421e4b958ea3b6abc5837bc47a4 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Tue, 23 Apr 2024 16:58:59 +0200 Subject: [PATCH 09/22] AArch64: Simplify DotProd path of 2D subpel filters Simplify the DotProd code path of the 2D (horizontal-vertical) subpel filters. It contains some instruction reordering and some macro simplifications to be more similar to the upcoming i8mm version. These changes have negligible effect on performance. 
Cortex-A510: mc_8tap_regular_w2_hv_8bpc_dotprod: 8.3769 -> 8.3380 mc_8tap_sharp_w2_hv_8bpc_dotprod: 9.5441 -> 9.5457 mc_8tap_regular_w4_hv_8bpc_dotprod: 8.3422 -> 8.3444 mc_8tap_sharp_w4_hv_8bpc_dotprod: 9.5441 -> 9.5367 mc_8tap_regular_w8_hv_8bpc_dotprod: 9.9852 -> 9.9666 mc_8tap_sharp_w8_hv_8bpc_dotprod: 12.5554 -> 12.5314 Cortex-A55: mc_8tap_regular_w2_hv_8bpc_dotprod: 6.4504 -> 6.4892 mc_8tap_sharp_w2_hv_8bpc_dotprod: 7.5732 -> 7.6078 mc_8tap_regular_w4_hv_8bpc_dotprod: 6.5088 -> 6.4760 mc_8tap_sharp_w4_hv_8bpc_dotprod: 7.5796 -> 7.5763 mc_8tap_regular_w8_hv_8bpc_dotprod: 9.3384 -> 9.3078 mc_8tap_sharp_w8_hv_8bpc_dotprod: 11.1159 -> 11.1401 Cortex-A78: mc_8tap_regular_w2_hv_8bpc_dotprod: 1.4122 -> 1.4250 mc_8tap_sharp_w2_hv_8bpc_dotprod: 1.7696 -> 1.7821 mc_8tap_regular_w4_hv_8bpc_dotprod: 1.4243 -> 1.4243 mc_8tap_sharp_w4_hv_8bpc_dotprod: 1.7866 -> 1.7863 mc_8tap_regular_w8_hv_8bpc_dotprod: 2.5304 -> 2.5171 mc_8tap_sharp_w8_hv_8bpc_dotprod: 3.0815 -> 3.0632 Cortex-X1: mc_8tap_regular_w2_hv_8bpc_dotprod: 0.8195 -> 0.8194 mc_8tap_sharp_w2_hv_8bpc_dotprod: 1.0092 -> 1.0081 mc_8tap_regular_w4_hv_8bpc_dotprod: 0.8197 -> 0.8166 mc_8tap_sharp_w4_hv_8bpc_dotprod: 1.0089 -> 1.0068 mc_8tap_regular_w8_hv_8bpc_dotprod: 1.5230 -> 1.5166 mc_8tap_sharp_w8_hv_8bpc_dotprod: 1.8683 -> 1.8625 --- src/arm/64/mc_dotprod.S | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 31abe6235..04b60aa48 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -724,18 +724,20 @@ L(\type\()_8tap_h_hv_\isa): smlal2 v1.4s, v22.8h, v7.h[7] rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 - subs w8, w8, #1 - st1 {v0.8h}, [\ldst], \d_strd - b.gt 8b - add \dst, \dst, #16 .else // put shrn v22.4h, v5.4s, #2 shrn2 v22.8h, v6.4s, #2 smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] tbl v0.16b, {v0.16b, v1.16b}, v25.16b - subs w8, w8, #1 sqrshrun v0.8b, v0.8h, #2 +.endif + subs w8, w8, #1 +.ifc \type, prep + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #16 +.else st1 {v0.8b}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #8 @@ -789,9 +791,7 @@ L(\type\()_8tap_h_hv_\isa): \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b -.ifc \type, put - subs \h, \h, #1 -.endif + smlal v0.4s, v22.4h, v7.h[6] shrn v22.4h, v5.4s, #2 @@ -801,6 +801,7 @@ L(\type\()_8tap_h_hv_\isa): str d0, [\dst], #8 subs \h, \h, #1 .else + subs \h, \h, #1 tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] @@ -855,11 +856,12 @@ L(\type\()_8tap_h_hv_\isa): mov v20.16b, v21.16b mov v21.16b, v22.16b - subs \h, \h, #1 smlal v0.4s, v22.4h, v7.h[6] shrn v22.4h, v5.4s, #2 smlal v0.4s, v22.4h, v7.h[7] + + subs \h, \h, #1 tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 @@ -942,19 +944,17 @@ L(\type\()_6tap_hv_\isa): smlal v0.4s, v20.4h, v7.h[5] smlal2 v1.4s, v20.8h, v7.h[5] sshr v20.8h, v23.8h, #2 -.ifc \type, prep + + subs w8, w8, #1 smlal v0.4s, v20.4h, v7.h[6] smlal2 v1.4s, v20.8h, v7.h[6] +.ifc \type, prep rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 st1 {v0.8h}, [\ldst], \d_strd - subs w8, w8, #1 b.gt 8b add \dst, \dst, #16 .else - subs w8, w8, #1 - smlal v0.4s, v20.4h, v7.h[6] - smlal2 v1.4s, v20.8h, v7.h[6] tbl v0.16b, {v0.16b, v1.16b}, v25.16b sqrshrun v0.8b, v0.8h, #2 st1 {v0.8b}, [\ldst], \d_strd @@ -985,8 +985,8 @@ L(\type\()_hv_filter8_\isa): .align FUNC_ALIGN L(\type\()_hv_filter4_\isa): - mov v22.16b, v27.16b ld1 {v4.8b}, [\src], \s_strd + mov v22.16b, v27.16b sub v4.16b, 
v4.16b, v24.16b tbl v2.16b, {v4.16b}, v28.16b \dot v22.4s, v2.16b, v26.4b[0] @@ -1030,14 +1030,12 @@ L(\type\()_hv_filter4_\isa): smlal v0.4s, v20.4h, v7.h[5] shrn v20.4h, v5.4s, #2 -.ifc \type, prep + subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] +.ifc \type, prep rshrn v0.4h, v0.4s, #6 str d0, [\dst], #8 - subs \h, \h, #1 .else - subs \h, \h, #1 - smlal v0.4s, v20.4h, v7.h[6] tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] From fb2a00792e0f03b6b617453315e93e3d6f7ff8df Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 17 Apr 2024 20:00:07 +0200 Subject: [PATCH 10/22] AArch64: Add basic i8mm support for convolutions Add an Armv8.6-A i8mm code path for standard bitdepth convolutions. Only horizontal-vertical (HV) convolutions have 6-tap specialisations of their vertical passes. All other convolutions are 4- or 8-tap filters which fit well with the 4-element USDOT instruction. Benchmarks show 4-9% FPS increase relative to the Armv8.4-A code path depending on the input video and the CPU used. This patch will increase the .text by around 5.7 KiB. Relative performance to the C reference on some Cortex CPU cores: Cortex-A715 Cortex-X3 Cortex-A510 regular w4 hv neon: 7.20x 11.20x 4.40x regular w4 hv dotprod: 12.77x 18.35x 6.21x regular w4 hv i8mm: 14.50x 21.42x 6.16x sharp w4 hv neon: 6.24x 9.77x 3.96x sharp w4 hv dotprod: 9.76x 14.02x 5.20x sharp w4 hv i8mm: 10.84x 16.09x 5.42x regular w8 hv neon: 2.17x 2.46x 3.17x regular w8 hv dotprod: 3.04x 3.11x 3.03x regular w8 hv i8mm: 3.57x 3.40x 3.27x sharp w8 hv neon: 1.72x 1.93x 2.75x sharp w8 hv dotprod: 2.49x 2.54x 2.62x sharp w8 hv i8mm: 2.80x 2.79x 2.70x regular w16 hv neon: 1.90x 2.17x 2.02x regular w16 hv dotprod: 2.59x 2.64x 1.93x regular w16 hv i8mm: 3.01x 2.85x 2.05x sharp w16 hv neon: 1.51x 1.72x 1.74x sharp w16 hv dotprod: 2.17x 2.22x 1.70x sharp w16 hv i8mm: 2.42x 2.42x 1.72x regular w32 hv neon: 1.80x 1.96x 1.81x regular w32 hv dotprod: 2.43x 2.36x 1.74x regular w32 hv i8mm: 2.83x 2.51x 1.83x sharp w32 hv neon: 1.42x 1.54x 1.56x sharp w32 hv dotprod: 2.07x 2.00x 1.55x sharp w32 hv i8mm: 2.29x 2.16x 1.55x regular w64 hv neon: 1.82x 1.89x 1.70x regular w64 hv dotprod: 2.43x 2.25x 1.65x regular w64 hv i8mm: 2.84x 2.39x 1.73x sharp w64 hv neon: 1.43x 1.47x 1.49x sharp w64 hv dotprod: 2.08x 1.91x 1.49x sharp w64 hv i8mm: 2.30x 2.07x 1.48x regular w128 hv neon: 1.77x 1.84x 1.75x regular w128 hv dotprod: 2.37x 2.18x 1.70x regular w128 hv i8mm: 2.76x 2.33x 1.78x sharp w128 hv neon: 1.40x 1.45x 1.42x sharp w128 hv dotprod: 2.04x 1.87x 1.43x sharp w128 hv i8mm: 2.24x 2.02x 1.42x regular w8 h neon: 3.16x 3.51x 3.43x regular w8 h dotprod: 4.97x 7.43x 4.95x regular w8 h i8mm: 7.28x 10.38x 5.69x sharp w8 h neon: 2.71x 2.77x 3.10x sharp w8 h dotprod: 4.92x 7.14x 4.94x sharp w8 h i8mm: 7.21x 10.11x 5.70x regular w16 h neon: 2.79x 2.76x 3.53x regular w16 h dotprod: 3.81x 4.77x 3.13x regular w16 h i8mm: 5.21x 6.04x 3.56x sharp w16 h neon: 2.31x 2.38x 3.12x sharp w16 h dotprod: 3.80x 4.74x 3.13x sharp w16 h i8mm: 5.20x 5.98x 3.56x regular w64 h neon: 2.49x 2.46x 2.94x regular w64 h dotprod: 3.17x 3.60x 2.41x regular w64 h i8mm: 4.22x 4.40x 2.72x sharp w64 h neon: 2.07x 2.06x 2.60x sharp w64 h dotprod: 3.16x 3.58x 2.40x sharp w64 h i8mm: 4.20x 4.38x 2.71x regular w8 v neon: 6.11x 8.05x 4.07x regular w8 v dotprod: 5.45x 8.15x 4.01x regular w8 v i8mm: 7.30x 9.46x 4.19x sharp w8 v neon: 4.23x 5.46x 3.09x sharp w8 v dotprod: 5.43x 7.96x 4.01x sharp w8 v i8mm: 7.26x 9.12x 4.19x regular w16 v neon: 3.44x 4.33x 2.40x regular w16 v dotprod: 
3.20x 4.53x 2.85x regular w16 v i8mm: 4.09x 5.27x 2.87x sharp w16 v neon: 2.50x 3.14x 1.82x sharp w16 v dotprod: 3.20x 4.52x 2.86x sharp w16 v i8mm: 4.09x 5.15x 2.86x regular w64 v neon: 2.74x 3.11x 1.53x regular w64 v dotprod: 2.63x 3.30x 1.84x regular w64 v i8mm: 3.31x 3.73x 1.84x sharp w64 v neon: 2.01x 2.29x 1.16x sharp w64 v dotprod: 2.61x 3.27x 1.83x sharp w64 v i8mm: 3.29x 3.68x 1.84x --- src/arm/64/mc_dotprod.S | 131 ++++++++++++++++++++++++++++------------ src/arm/mc.h | 15 +++-- 2 files changed, 104 insertions(+), 42 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 04b60aa48..4671707d0 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -113,11 +113,19 @@ L(\type\()_8tap_v_\isa): madd \my, \my, w11, w10 ldr q6, L(v_tbl_neon_dotprod) sub \src, \src, \s_strd -.ifc \type, prep +.ifc \isa, neon_i8mm + .ifc \type, prep + movi v4.4s, #2 // rounding + .else + movi v4.4s, #0 + .endif +.else // neon_dotprod + .ifc \type, prep mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 -.else + .else movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT + .endif .endif ubfx w11, \my, #7, #7 and \my, \my, #0x7F @@ -127,7 +135,9 @@ L(\type\()_8tap_v_\isa): sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 add \xmy, x12, \xmy, lsl #3 // subpel V filter address ldr q29, L(v_tbl_neon_dotprod) + 32 +.ifc \isa, neon_dotprod movi v5.16b, #128 +.endif ldr d7, [\xmy] cmp \w, #8 b.eq 80f @@ -179,7 +189,7 @@ L(\type\()_8tap_v_\isa): zip2 v20.8h, v18.8h, v24.8h zip1 v23.8h, v21.8h, v27.8h zip2 v26.8h, v21.8h, v27.8h - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v19.16b, v19.16b, v5.16b sub v22.16b, v22.16b, v5.16b @@ -189,7 +199,7 @@ L(\type\()_8tap_v_\isa): sub v20.16b, v20.16b, v5.16b sub v23.16b, v23.16b, v5.16b sub v26.16b, v26.16b, v5.16b - +.endif .align LOOP_ALIGN 16: ld1 {v27.16b}, [\lsrc], \s_strd @@ -198,12 +208,16 @@ L(\type\()_8tap_v_\isa): mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b - +.ifc \isa, neon_i8mm + mov v18.16b, v27.16b + mov v21.16b, v27.16b + mov v24.16b, v27.16b +.else // neon_dotprod sub v18.16b, v27.16b, v5.16b sub v21.16b, v27.16b, v5.16b sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b - +.endif \dot v0.4s, v16.16b, v7.4b[0] \dot v1.4s, v19.16b, v7.4b[0] \dot v2.4s, v22.16b, v7.4b[0] @@ -297,11 +311,12 @@ L(\type\()_8tap_v_\isa): zip2 v19.8h, v0.8h, v2.8h zip1 v17.8h, v18.8h, v24.8h zip2 v20.8h, v18.8h, v24.8h - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v19.16b, v19.16b, v5.16b sub v17.16b, v17.16b, v5.16b sub v20.16b, v20.16b, v5.16b +.endif .ifc \type, put b.eq 82f .endif @@ -315,12 +330,15 @@ L(\type\()_8tap_v_\isa): mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b - +.ifc \isa, neon_i8mm + mov v18.16b, v21.16b + mov v24.16b, v27.16b +.else // neon_dotprod sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b - +.endif tbl v22.16b, {v16.16b, v17.16b}, v6.16b tbl v25.16b, {v19.16b, v20.16b}, v6.16b tbl v23.16b, {v17.16b, v18.16b}, v28.16b @@ -367,10 +385,12 @@ L(\type\()_8tap_v_\isa): mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b - +.ifc \isa, neon_i8mm + mov v18.16b, v21.16b +.else sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b - +.endif tbl v22.16b, {v16.16b, v17.16b}, v6.16b tbl v25.16b, {v19.16b, v20.16b}, v6.16b tbl v23.16b, {v17.16b, v18.16b}, v28.16b @@ -428,9 +448,10 @@ L(\type\()_8tap_v_\isa): zip1 v16.8h, v0.8h, v2.8h zip1 v17.8h, v18.8h, v24.8h - +.ifc \isa, 
neon_dotprod sub v16.16b, v16.16b, v5.16b sub v17.16b, v17.16b, v5.16b +.endif .ifc \type, put b.eq 42f .endif @@ -442,10 +463,10 @@ L(\type\()_8tap_v_\isa): mov v0.16b, v4.16b mov v1.16b, v4.16b - +.ifc \isa, neon_dotprod sub v18.16b, v18.16b, v5.16b sub v21.16b, v21.16b, v5.16b - +.endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b @@ -482,9 +503,9 @@ L(\type\()_8tap_v_\isa): mov v0.16b, v4.16b mov v1.16b, v4.16b - +.ifc \isa, neon_dotprod sub v18.16b, v18.16b, v5.16b - +.endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b @@ -532,10 +553,10 @@ L(\type\()_8tap_v_\isa): zip1 v16.4h, v0.4h, v2.4h zip1 v17.4h, v18.4h, v24.4h - + .ifc \isa, neon_dotprod sub v16.8b, v16.8b, v5.8b sub v17.8b, v17.8b, v5.8b - + .endif b.eq 22f .align LOOP_ALIGN @@ -546,10 +567,10 @@ L(\type\()_8tap_v_\isa): mov v0.16b, v4.16b mov v1.16b, v4.16b - + .ifc \isa, neon_dotprod sub v18.8b, v18.8b, v5.8b sub v21.8b, v21.8b, v5.8b - + .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b @@ -579,9 +600,9 @@ L(\type\()_8tap_v_\isa): mov v0.16b, v4.16b mov v1.16b, v4.16b - + .ifc \isa, neon_dotprod sub v18.8b, v18.8b, v5.8b - + .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b @@ -606,8 +627,12 @@ L(\type\()_8tap_h_hv_\isa): madd \mx, \mx, w11, w9 madd w14, \my, w11, w10 // for HV ldr q28, L(h_tbl_neon_dotprod) +.ifc \isa, neon_i8mm + movi v27.4s, #2 // rounding +.else mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v27.4s, w13 // put H overrides this +.endif sub \src, \src, #4 // src - 4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7F @@ -616,7 +641,9 @@ L(\type\()_8tap_h_hv_\isa): cmp \w, #4 csel \mx, \mx, w9, le add \xmx, x12, \xmx, lsl #3 // subpel H filter address +.ifc \isa, neon_dotprod movi v24.16b, #128 +.endif cbz \my, L(\type\()_8tap_h_\isa) // HV cases @@ -677,9 +704,9 @@ L(\type\()_8tap_h_hv_\isa): smull v0.4s, v16.4h, v7.h[0] smull2 v1.4s, v16.8h, v7.h[0] mov v16.16b, v17.16b - +.ifc \isa, neon_dotprod sub v23.16b, v23.16b, v24.16b - +.endif mov v5.16b, v27.16b mov v6.16b, v27.16b @@ -774,9 +801,9 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b - +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b - +.endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b @@ -838,9 +865,9 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b - +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b - +.endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b @@ -912,7 +939,9 @@ L(\type\()_6tap_hv_\isa): smull v0.4s, v16.4h, v7.h[1] smull2 v1.4s, v16.8h, v7.h[1] +.ifc \isa, neon_dotprod sub v23.16b, v23.16b, v24.16b +.endif mov v16.16b, v17.16b mov v5.16b, v27.16b @@ -969,7 +998,9 @@ L(\type\()_6tap_hv_\isa): .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): ld1 {v4.16b}, [\lsrc], \s_strd +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif mov v22.16b, v27.16b mov v23.16b, v27.16b tbl v2.16b, {v4.16b}, v28.16b @@ -987,7 +1018,9 @@ L(\type\()_hv_filter8_\isa): L(\type\()_hv_filter4_\isa): ld1 {v4.8b}, [\src], \s_strd mov v22.16b, v27.16b +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif tbl v2.16b, {v4.16b}, v28.16b \dot v22.4s, v2.16b, v26.4b[0] shrn v22.4h, v22.4s, #2 @@ -1015,7 +1048,9 @@ L(\type\()_hv_filter4_\isa): smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] +.ifc \isa, 
neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif mov v16.16b, v17.16b mov v17.16b, v18.16b @@ -1067,7 +1102,9 @@ L(\type\()_hv_filter4_\isa): smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif mov v16.16b, v17.16b mov v17.16b, v18.16b @@ -1100,8 +1137,12 @@ L(\type\()_8tap_h_\isa): adr x9, L(\type\()_8tap_h_\isa\()_tbl) ldrh w8, [x9, x8, lsl #1] .ifc \type, put + .ifc \isa, neon_i8mm + movi v27.4s, #34 // special rounding + .else mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT dup v27.4s, w10 + .endif .endif sub x9, x9, x8 br x9 @@ -1118,10 +1159,10 @@ L(\type\()_8tap_h_\isa): ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - + .ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b - + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b @@ -1155,10 +1196,10 @@ L(\type\()_8tap_h_\isa): ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - +.ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b - +.endif mov v4.16b, v27.16b mov v5.16b, v27.16b @@ -1197,10 +1238,10 @@ L(\type\()_8tap_h_\isa): ldr q0, [\src] ldr q16, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - +.ifc \isa, neon_dotprod sub v0.16b, v0.16b, v24.16b sub v16.16b, v16.16b, v24.16b - +.endif mov v4.16b, v27.16b mov v5.16b, v27.16b mov v20.16b, v27.16b @@ -1252,10 +1293,10 @@ L(\type\()_8tap_h_\isa): ldr q16, [\src] ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b - +.endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b @@ -1311,10 +1352,10 @@ L(\type\()_8tap_h_\isa): ldr q16, [\src] ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, #16 - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b - +.endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b @@ -1381,5 +1422,19 @@ filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 +#if HAVE_I8MM +ENABLE_I8MM + +// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) +// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) +filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 + +// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) +// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) +filter_8tap_fn put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 + +DISABLE_I8MM +#endif // HAVE_I8MM + DISABLE_DOTPROD #endif // HAVE_DOTPROD diff --git a/src/arm/mc.h b/src/arm/mc.h index 7e57fd37c..dabdab357 100644 --- a/src/arm/mc.h +++ b/src/arm/mc.h @@ -62,6 +62,7 @@ decl_8tap_fns(neon); decl_8tap_fns(neon_dotprod); +decl_8tap_fns(neon_i8mm); decl_mc_fn(BF(dav1d_put_bilin, neon)); decl_mct_fn(BF(dav1d_prep_bilin, neon)); @@ -109,11 +110,17 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); -#if ARCH_AARCH64 -#if HAVE_DOTPROD && BITDEPTH == 8 +#if ARCH_AARCH64 && BITDEPTH == 8 +#if HAVE_DOTPROD if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return; init_8tap_fns(neon_dotprod); -#endif // HAVE_DOTPROD && BITDEPTH == 8 -#endif // ARCH_AARCH64 +#endif // HAVE_DOTPROD + +#if HAVE_I8MM 
+ if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return; + + init_8tap_fns(neon_i8mm); +#endif // HAVE_I8MM +#endif // ARCH_AARCH64 && BITDEPTH == 8 } From 488a191df8e2bae8ab5681102b8d49dac06f3f89 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Fri, 26 Apr 2024 18:24:29 +0200 Subject: [PATCH 11/22] AArch64: Optimize horizontal i8mm prep filters Replace the accumulator initializations of the horizontal prep filters with register fills by zeros. Most i8mm capable CPUs can do these with zero latency, but we also need to use rounding shifts at the end of the filter. We can see better performance with this change on out-of-order CPUs. Relative performance of micro benchmarks (lower is better): Cortex-X3: mct_8tap_sharp_w32_h_8bpc_i8mm: 0.914x mct_8tap_sharp_w16_h_8bpc_i8mm: 0.906x mct_8tap_sharp_w8_h_8bpc_i8mm: 0.877x Cortex-A715: mct_8tap_sharp_w32_h_8bpc_i8mm: 0.819x mct_8tap_sharp_w16_h_8bpc_i8mm: 0.805x mct_8tap_sharp_w8_h_8bpc_i8mm: 0.779x Cortex-A510: mct_8tap_sharp_w32_h_8bpc_i8mm: 0.999x mct_8tap_sharp_w16_h_8bpc_i8mm: 1.001x mct_8tap_sharp_w8_h_8bpc_i8mm: 0.996x mct_8tap_sharp_w4_h_8bpc_i8mm: 0.915x --- src/arm/64/mc_dotprod.S | 66 +++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 4671707d0..19431abfa 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -1196,13 +1196,17 @@ L(\type\()_8tap_h_\isa): ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \isa, neon_dotprod +.ifc \type\()_\isa, prep_neon_i8mm + movi v4.4s, #0 + movi v5.4s, #0 +.else + .ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b -.endif + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b - +.endif tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b @@ -1210,8 +1214,13 @@ L(\type\()_8tap_h_\isa): \dot v5.4s, v3.16b, v26.4b[0] .ifc \type, prep subs \h, \h, #2 + .ifc \isa, neon_i8mm + uzp1 v4.8h, v4.8h, v5.8h + srshr v4.8h, v4.8h, #2 + .else shrn v4.4h, v4.4s, #2 shrn2 v4.8h, v5.4s, #2 + .endif str q4, [\dst], #16 .else // put uzp1 v4.8h, v4.8h, v5.8h @@ -1238,15 +1247,21 @@ L(\type\()_8tap_h_\isa): ldr q0, [\src] ldr q16, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \isa, neon_dotprod +.ifc \type\()_\isa, prep_neon_i8mm + movi v4.4s, #0 + movi v5.4s, #0 + movi v20.4s, #0 + movi v21.4s, #0 +.else + .ifc \isa, neon_dotprod sub v0.16b, v0.16b, v24.16b sub v16.16b, v16.16b, v24.16b -.endif + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b mov v20.16b, v27.16b mov v21.16b, v27.16b - +.endif tbl v1.16b, {v0.16b}, v28.16b tbl v2.16b, {v0.16b}, v29.16b tbl v3.16b, {v0.16b}, v30.16b @@ -1266,8 +1281,13 @@ L(\type\()_8tap_h_\isa): uzp1 v4.8h, v4.8h, v5.8h uzp1 v20.8h, v20.8h, v21.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v4.8h, v4.8h, #2 + srshr v20.8h, v20.8h, #2 + .else sshr v4.8h, v4.8h, #2 sshr v20.8h, v20.8h, #2 + .endif subs \h, \h, #2 stp q4, q20, [\dst], #32 .else // put @@ -1293,15 +1313,21 @@ L(\type\()_8tap_h_\isa): ldr q16, [\src] ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd -.ifc \isa, neon_dotprod +.ifc \type\()_\isa, prep_neon_i8mm + movi v6.4s, #0 + movi v7.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 +.else + .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b -.endif + .endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b - +.endif tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, 
v30.16b @@ -1320,8 +1346,13 @@ L(\type\()_8tap_h_\isa): uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v6.8h, v6.8h, #2 + srshr v22.8h, v22.8h, #2 + .else sshr v6.8h, v6.8h, #2 sshr v22.8h, v22.8h, #2 + .endif subs \h, \h, #1 stp q6, q22, [\dst], #32 .else // put @@ -1352,15 +1383,21 @@ L(\type\()_8tap_h_\isa): ldr q16, [\src] ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, #16 -.ifc \isa, neon_dotprod +.ifc \type\()_\isa, prep_neon_i8mm + movi v6.4s, #0 + movi v7.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 +.else + .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b -.endif + .endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b - +.endif tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b @@ -1379,8 +1416,13 @@ L(\type\()_8tap_h_\isa): uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v6.8h, v6.8h, #2 + srshr v22.8h, v22.8h, #2 + .else sshr v6.8h, v6.8h, #2 sshr v22.8h, v22.8h, #2 + .endif subs w8, w8, #16 stp q6, q22, [\dst], #32 .else // put From 670e5219b40ee19fba6bed3c84c0aa8e923aa6ad Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Fri, 26 Apr 2024 17:51:35 +0200 Subject: [PATCH 12/22] AArch64: Optimize vertical i8mm subpel filters Replace the accumulator initializations of the vertical subpel filters with register fills by zeros (which are usually zero latency operations in this feature class), this implies the usage of rounding shifts at the end in the prep cases. Out-of-order CPU cores can benefit from this change. The width=16 case uses a simpler register duplication scheme that relies on MOV instructions for the subsequent shuffles. This approach uses a different register to load the data into for better instruction scheduling and data dependency chain. 
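The rounding move is a straight swap: pre-loading the accumulator with the rounding constant and finishing with a plain shift gives the same result as a zero-initialised accumulator followed by a rounding shift, because SRSHR/RSHRN #n add 1 << (n - 1) before shifting. A scalar model of the n = 2 case used by the prep path (assuming the usual arithmetic right shift for negative values):

    #include <assert.h>
    #include <stdint.h>

    static int16_t pre_rounded(int32_t dot)    /* before: acc starts at 2, plain SHRN #2 */
    {
        int32_t acc = 2 + dot;
        return (int16_t)(acc >> 2);
    }

    static int16_t round_shift(int32_t dot)    /* after: acc starts at 0, RSHRN #2 */
    {
        int32_t acc = 0 + dot;
        return (int16_t)((acc + (1 << 1)) >> 2);
    }

    int main(void)
    {
        for (int32_t v = -32768; v <= 32767; v++)
            assert(pre_rounded(v) == round_shift(v));
        return 0;
    }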
Relative performance of micro benchmarks (lower is better): Cortex-X3: mct_8tap_sharp_w16_v_8bpc_i8mm: 0.910x mct_8tap_sharp_w8_v_8bpc_i8mm: 0.986x mc_8tap_sharp_w16_v_8bpc_i8mm: 0.864x mc_8tap_sharp_w8_v_8bpc_i8mm: 0.882x mc_8tap_sharp_w4_v_8bpc_i8mm: 0.933x mc_8tap_sharp_w2_v_8bpc_i8mm: 0.926x Cortex-A715: mct_8tap_sharp_w16_v_8bpc_i8mm: 0.855x mct_8tap_sharp_w8_v_8bpc_i8mm: 0.784x mct_8tap_sharp_w4_v_8bpc_i8mm: 1.069x mc_8tap_sharp_w16_v_8bpc_i8mm: 0.850x mc_8tap_sharp_w8_v_8bpc_i8mm: 0.779x mc_8tap_sharp_w4_v_8bpc_i8mm: 0.971x mc_8tap_sharp_w2_v_8bpc_i8mm: 0.975x Cortex-A510: mct_8tap_sharp_w16_v_8bpc_i8mm: 1.001x mct_8tap_sharp_w8_v_8bpc_i8mm: 0.979x mct_8tap_sharp_w4_v_8bpc_i8mm: 0.998x mc_8tap_sharp_w16_v_8bpc_i8mm: 0.998x mc_8tap_sharp_w8_v_8bpc_i8mm: 1.004x mc_8tap_sharp_w4_v_8bpc_i8mm: 1.003x mc_8tap_sharp_w2_v_8bpc_i8mm: 0.996x --- src/arm/64/mc_dotprod.S | 114 +++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 31 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 19431abfa..b61ee2623 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -113,13 +113,7 @@ L(\type\()_8tap_v_\isa): madd \my, \my, w11, w10 ldr q6, L(v_tbl_neon_dotprod) sub \src, \src, \s_strd -.ifc \isa, neon_i8mm - .ifc \type, prep - movi v4.4s, #2 // rounding - .else - movi v4.4s, #0 - .endif -.else // neon_dotprod +.ifc \isa, neon_dotprod .ifc \type, prep mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 @@ -202,17 +196,21 @@ L(\type\()_8tap_v_\isa): .endif .align LOOP_ALIGN 16: +.ifc \isa, neon_i8mm + ld1 {v18.16b}, [\lsrc], \s_strd + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.16b, v18.16b + mov v24.16b, v18.16b + mov v27.16b, v18.16b +.else // neon_dotprod ld1 {v27.16b}, [\lsrc], \s_strd - mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.ifc \isa, neon_i8mm - mov v18.16b, v27.16b - mov v21.16b, v27.16b - mov v24.16b, v27.16b -.else // neon_dotprod sub v18.16b, v27.16b, v5.16b sub v21.16b, v27.16b, v5.16b sub v24.16b, v27.16b, v5.16b @@ -242,8 +240,13 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif st1 {v0.8h, v1.8h}, [\ldst], \d_strd .else // put sqrshrun v0.8b, v0.8h, #6 @@ -252,11 +255,17 @@ L(\type\()_8tap_v_\isa): .endif b.gt 16b +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b - +.endif \dot v0.4s, v16.16b, v7.4b[0] \dot v1.4s, v19.16b, v7.4b[0] \dot v2.4s, v22.16b, v7.4b[0] @@ -271,8 +280,13 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\ldst] add \dst, \dst, #32 .else // put @@ -322,18 +336,24 @@ L(\type\()_8tap_v_\isa): .endif .align LOOP_ALIGN 8: +.ifc \isa, neon_i8mm + ldr d18, [\src] + movi v0.4s, #0 + movi v1.4s, #0 + ldr d24, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.8b, v18.8b + mov v27.8b, v24.8b +.else // neon_dotprod ldr d21, [\src] ldr d27, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.ifc \isa, neon_i8mm - mov 
v18.16b, v21.16b - mov v24.16b, v27.16b -.else // neon_dotprod sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b sub v24.16b, v27.16b, v5.16b @@ -363,8 +383,13 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\dst], #32 .else // put sqrshrun v0.8b, v0.8h, #6 @@ -379,15 +404,19 @@ L(\type\()_8tap_v_\isa): .align JUMP_ALIGN 82: .endif +.ifc \isa, neon_i8mm + ldr d18, [\src] + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.8b, v18.8b +.else // neon_dotprod ldr d21, [\src] - mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.ifc \isa, neon_i8mm - mov v18.16b, v21.16b -.else sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b .endif @@ -409,8 +438,13 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\dst] .else // put sqrshrun v0.8b, v0.8h, #6 @@ -460,10 +494,12 @@ L(\type\()_8tap_v_\isa): ldr s18, [\src] ldr s21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b -.ifc \isa, neon_dotprod sub v18.16b, v18.16b, v5.16b sub v21.16b, v21.16b, v5.16b .endif @@ -480,8 +516,13 @@ L(\type\()_8tap_v_\isa): \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep subs \h, \h, #2 + .ifc \isa, neon_i8mm + rshrn v0.4h, v0.4s, #2 + rshrn2 v0.8h, v1.4s, #2 + .else shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 + .endif str q0, [\dst], #16 .else uzp1 v0.8h, v0.8h, v1.8h @@ -500,10 +541,12 @@ L(\type\()_8tap_v_\isa): 42: .endif ldr s18, [\src] - +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b -.ifc \isa, neon_dotprod sub v18.16b, v18.16b, v5.16b .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b @@ -515,8 +558,13 @@ L(\type\()_8tap_v_\isa): \dot v1.4s, v19.16b, v7.4b[0] \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep + .ifc \isa, neon_i8mm + rshrn v0.4h, v0.4s, #2 + rshrn2 v0.8h, v1.4s, #2 + .else shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 + .endif str q0, [\dst] .else uzp1 v0.8h, v0.8h, v1.8h @@ -564,10 +612,12 @@ L(\type\()_8tap_v_\isa): ldr h18, [\src] ldr h21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - + .ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + .else // put mov v0.16b, v4.16b mov v1.16b, v4.16b - .ifc \isa, neon_dotprod sub v18.8b, v18.8b, v5.8b sub v21.8b, v21.8b, v5.8b .endif @@ -597,10 +647,12 @@ L(\type\()_8tap_v_\isa): .align JUMP_ALIGN 22: ldr h18, [\src] - + .ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + .else // put mov v0.16b, v4.16b mov v1.16b, v4.16b - .ifc \isa, neon_dotprod sub v18.8b, v18.8b, v5.8b .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b From 346bb04dc2bd762fe07a3ba7aa5e0a2a1d438787 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 21:54:55 +0200 Subject: [PATCH 13/22] AArch64: Optimize 2D i8mm subpel filters Rewrite the accumulator initializations of the horizontal part of the 2D filters with zero register fills. It can improve the performance on out-of-order CPUs which can fill vector registers by zero with zero latency. Zeroed accumulators imply the usage of the rounding shifts at the end of filters. 
The only exception is the very short *hv_filter4*, where the longer latency of rounding shift could decrease the performance. The *filter8* function uses a different (alternating) dot product computation order for DotProd+ feature level, it gives a better overall performance for out-of-order and some in-order CPU cores. The i8mm version does not need to use bias for the loaded samples, so a different instruction scheduling is beneficial mostly affecting the order of TBL instructions in the 8-tap case. Relative performance of micro benchmarks (lower is better): Cortex-X3: mct_8tap_regular_w16_hv_8bpc_i8mm: 0.982x mct_8tap_sharp_w16_hv_8bpc_i8mm: 0.979x mct_8tap_regular_w8_hv_8bpc_i8mm: 0.972x mct_8tap_sharp_w8_hv_8bpc_i8mm: 0.969x mct_8tap_regular_w4_hv_8bpc_i8mm: 0.942x mct_8tap_sharp_w4_hv_8bpc_i8mm: 0.935x mc_8tap_regular_w16_hv_8bpc_i8mm: 0.988x mc_8tap_sharp_w16_hv_8bpc_i8mm: 0.982x mc_8tap_regular_w8_hv_8bpc_i8mm: 0.981x mc_8tap_sharp_w8_hv_8bpc_i8mm: 0.975x mc_8tap_regular_w4_hv_8bpc_i8mm: 0.998x mc_8tap_sharp_w4_hv_8bpc_i8mm: 0.996x mc_8tap_regular_w2_hv_8bpc_i8mm: 1.006x mc_8tap_sharp_w2_hv_8bpc_i8mm: 0.993x Cortex-A715: mct_8tap_regular_w16_hv_8bpc_i8mm: 0.883x mct_8tap_sharp_w16_hv_8bpc_i8mm: 0.931x mct_8tap_regular_w8_hv_8bpc_i8mm: 0.882x mct_8tap_sharp_w8_hv_8bpc_i8mm: 0.928x mct_8tap_regular_w4_hv_8bpc_i8mm: 0.969x mct_8tap_sharp_w4_hv_8bpc_i8mm: 0.934x mc_8tap_regular_w16_hv_8bpc_i8mm: 0.881x mc_8tap_sharp_w16_hv_8bpc_i8mm: 0.925x mc_8tap_regular_w8_hv_8bpc_i8mm: 0.879x mc_8tap_sharp_w8_hv_8bpc_i8mm: 0.925x mc_8tap_regular_w4_hv_8bpc_i8mm: 0.917x mc_8tap_sharp_w4_hv_8bpc_i8mm: 0.976x mc_8tap_regular_w2_hv_8bpc_i8mm: 0.915x mc_8tap_sharp_w2_hv_8bpc_i8mm: 0.972x Cortex-A510: mct_8tap_regular_w16_hv_8bpc_i8mm: 0.994x mct_8tap_sharp_w16_hv_8bpc_i8mm: 0.949x mct_8tap_regular_w8_hv_8bpc_i8mm: 0.987x mct_8tap_sharp_w8_hv_8bpc_i8mm: 0.947x mct_8tap_regular_w4_hv_8bpc_i8mm: 1.002x mct_8tap_sharp_w4_hv_8bpc_i8mm: 0.999x mc_8tap_regular_w16_hv_8bpc_i8mm: 0.989x mc_8tap_sharp_w16_hv_8bpc_i8mm: 1.003x mc_8tap_regular_w8_hv_8bpc_i8mm: 0.986x mc_8tap_sharp_w8_hv_8bpc_i8mm: 1.000x mc_8tap_regular_w4_hv_8bpc_i8mm: 1.007x mc_8tap_sharp_w4_hv_8bpc_i8mm: 1.000x mc_8tap_regular_w2_hv_8bpc_i8mm: 1.005x mc_8tap_sharp_w2_hv_8bpc_i8mm: 1.000x --- src/arm/64/mc_dotprod.S | 114 ++++++++++++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 27 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index b61ee2623..28f8856cc 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -679,9 +679,7 @@ L(\type\()_8tap_h_hv_\isa): madd \mx, \mx, w11, w9 madd w14, \my, w11, w10 // for HV ldr q28, L(h_tbl_neon_dotprod) -.ifc \isa, neon_i8mm - movi v27.4s, #2 // rounding -.else +.ifc \isa, neon_dotprod mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v27.4s, w13 // put H overrides this .endif @@ -756,20 +754,27 @@ L(\type\()_8tap_h_hv_\isa): smull v0.4s, v16.4h, v7.h[0] smull2 v1.4s, v16.8h, v7.h[0] mov v16.16b, v17.16b -.ifc \isa, neon_dotprod +.ifc \isa, neon_i8mm + movi v5.4s, #0 + movi v6.4s, #0 + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b +.else // neon_dotprod sub v23.16b, v23.16b, v24.16b -.endif mov v5.16b, v27.16b mov v6.16b, v27.16b - +.endif smlal v0.4s, v17.4h, v7.h[1] smlal2 v1.4s, v17.8h, v7.h[1] +.ifc \isa, neon_i8mm + tbl v4.16b, {v23.16b}, v30.16b + mov v17.16b, v18.16b +.else // neon_dotprod mov v17.16b, v18.16b - tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b tbl v4.16b, {v23.16b}, v30.16b - +.endif smlal 
v0.4s, v18.4h, v7.h[2] smlal2 v1.4s, v18.8h, v7.h[2] mov v18.16b, v19.16b @@ -794,24 +799,37 @@ L(\type\()_8tap_h_hv_\isa): uzp1 v23.8h, v5.8h, v6.8h .endif mov v21.16b, v22.16b - smlal v0.4s, v22.4h, v7.h[6] smlal2 v1.4s, v22.8h, v7.h[6] +.ifc \isa, neon_i8mm + subs w8, w8, #1 +.endif .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v22.8h, v23.8h, #2 + .else sshr v22.8h, v23.8h, #2 + .endif smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 .else // put + .ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 + rshrn2 v22.8h, v6.4s, #2 + .else shrn v22.4h, v5.4s, #2 shrn2 v22.8h, v6.4s, #2 + .endif smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] tbl v0.16b, {v0.16b, v1.16b}, v25.16b sqrshrun v0.8b, v0.8h, #2 .endif +.ifc \isa, neon_dotprod subs w8, w8, #1 +.endif .ifc \type, prep st1 {v0.8h}, [\ldst], \d_strd b.gt 8b @@ -859,8 +877,11 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b +.ifc \isa, neon_i8mm + movi v5.4s, #0 +.else mov v5.16b, v27.16b - +.endif mov v18.16b, v19.16b mov v19.16b, v20.16b @@ -870,10 +891,12 @@ L(\type\()_8tap_h_hv_\isa): \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b - smlal v0.4s, v22.4h, v7.h[6] +.ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 +.else shrn v22.4h, v5.4s, #2 - +.endif smlal v0.4s, v22.4h, v7.h[7] .ifc \type, prep rshrn v0.4h, v0.4s, #6 @@ -917,14 +940,17 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b -.ifc \isa, neon_dotprod + .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b -.endif + .endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b + .ifc \isa, neon_i8mm + movi v5.4s, #0 + .else mov v5.16b, v27.16b - + .endif mov v18.16b, v19.16b mov v19.16b, v20.16b @@ -936,11 +962,14 @@ L(\type\()_8tap_h_hv_\isa): mov v21.16b, v22.16b smlal v0.4s, v22.4h, v7.h[6] + .ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 + .else shrn v22.4h, v5.4s, #2 - + .endif smlal v0.4s, v22.4h, v7.h[7] - subs \h, \h, #1 + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 @@ -995,10 +1024,13 @@ L(\type\()_6tap_hv_\isa): sub v23.16b, v23.16b, v24.16b .endif mov v16.16b, v17.16b - +.ifc \isa, neon_i8mm + movi v5.4s, #0 + movi v6.4s, #0 +.else mov v5.16b, v27.16b mov v6.16b, v27.16b - +.endif tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b @@ -1024,8 +1056,11 @@ L(\type\()_6tap_hv_\isa): smlal v0.4s, v20.4h, v7.h[5] smlal2 v1.4s, v20.8h, v7.h[5] +.ifc \isa, neon_i8mm + srshr v20.8h, v23.8h, #2 +.else sshr v20.8h, v23.8h, #2 - +.endif subs w8, w8, #1 smlal v0.4s, v20.4h, v7.h[6] smlal2 v1.4s, v20.8h, v7.h[6] @@ -1050,27 +1085,37 @@ L(\type\()_6tap_hv_\isa): .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): ld1 {v4.16b}, [\lsrc], \s_strd -.ifc \isa, neon_dotprod +.ifc \isa, neon_i8mm + movi v22.4s, #0 + movi v23.4s, #0 +.else // neon_dotprod sub v4.16b, v4.16b, v24.16b -.endif mov v22.16b, v27.16b mov v23.16b, v27.16b +.endif tbl v2.16b, {v4.16b}, v28.16b tbl v3.16b, {v4.16b}, v29.16b tbl v4.16b, {v4.16b}, v30.16b \dot v22.4s, v2.16b, v26.4b[0] - \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v3.16b, v26.4b[0] + \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v4.16b, v26.4b[1] +.ifc \isa, neon_i8mm + uzp1 v22.8h, v22.8h, v23.8h + srshr v22.8h, v22.8h, #2 +.else shrn v22.4h, v22.4s, #2 shrn2 v22.8h, v23.4s, #2 +.endif ret .align FUNC_ALIGN L(\type\()_hv_filter4_\isa): ld1 {v4.8b}, [\src], \s_strd +.ifc \isa, neon_i8mm + movi 
v22.4s, #2 +.else mov v22.16b, v27.16b -.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b .endif tbl v2.16b, {v4.16b}, v28.16b @@ -1109,14 +1154,21 @@ L(\type\()_hv_filter4_\isa): smlal v0.4s, v18.4h, v7.h[3] smlal v0.4s, v19.4h, v7.h[4] tbl v2.16b, {v4.16b}, v28.16b +.ifc \isa, neon_i8mm + movi v5.4s, #0 +.else mov v5.16b, v27.16b - +.endif mov v18.16b, v19.16b mov v19.16b, v20.16b \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] +.ifc \isa, neon_i8mm + rshrn v20.4h, v5.4s, #2 +.else shrn v20.4h, v5.4s, #2 +.endif subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] .ifc \type, prep @@ -1154,23 +1206,31 @@ L(\type\()_hv_filter4_\isa): smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] -.ifc \isa, neon_dotprod + .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b -.endif + .endif mov v16.16b, v17.16b mov v17.16b, v18.16b smlal v0.4s, v18.4h, v7.h[3] smlal v0.4s, v19.4h, v7.h[4] tbl v2.16b, {v4.16b}, v28.16b + .ifc \isa, neon_i8mm + movi v5.4s, #0 + .else mov v5.16b, v27.16b + .endif mov v18.16b, v19.16b mov v19.16b, v20.16b \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] + .ifc \isa, neon_i8mm + rshrn v20.4h, v5.4s, #2 + .else shrn v20.4h, v5.4s, #2 + .endif subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] From 51b63abf744b5f054d4ef6ad2d17ef8e1ad95be9 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Thu, 9 May 2024 11:24:30 +0200 Subject: [PATCH 14/22] AArch64: Optimize the init of DotProd+ 2D subpel filters Removed some unnecessary vector register copies from the initial horizontal filter parts of the HV subpel filters. The performance improvements are better for the smaller filter block sizes. The narrowing shifts were also rewritten at the end of the *filter8* because it was only beneficial for the Cortex-A55 among the DotProd capable CPU cores. On other out-of-order or newer CPUs the UZP1+SHRN instruction combination is better. 
Relative performance of micro benchmarks (lower is better): Cortex-A55: mct regular w4: 0.980x mct regular w8: 1.007x mct regular w16: 1.007x mct sharp w4: 0.983x mct sharp w8: 1.012x mct sharp w16: 1.005x Cortex-A510: mct regular w4: 0.935x mct regular w8: 0.984x mct regular w16: 0.986x mct sharp w4: 0.927x mct sharp w8: 0.983x mct sharp w16: 0.987x Cortex-A78: mct regular w4: 0.974x mct regular w8: 0.988x mct regular w16: 0.991x mct sharp w4: 0.971x mct sharp w8: 0.987x mct sharp w16: 0.979x Cortex-715: mct regular w4: 0.958x mct regular w8: 0.993x mct regular w16: 0.998x mct sharp w4: 0.974x mct sharp w8: 0.991x mct sharp w16: 0.997x Cortex-X1: mct regular w4: 0.983x mct regular w8: 0.993x mct regular w16: 0.996x mct sharp w4: 0.974x mct sharp w8: 0.990x mct sharp w16: 0.995x Cortex-X3: mct regular w4: 0.953x mct regular w8: 0.993x mct regular w16: 0.997x mct sharp w4: 0.981x mct sharp w8: 0.993x mct sharp w16: 0.995x --- src/arm/64/mc_dotprod.S | 110 ++++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 28f8856cc..a4f98a2ca 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -731,21 +731,37 @@ L(\type\()_8tap_h_hv_\isa): mov \lsrc, \src mov \ldst, \dst mov w8, \h - +.ifc \isa, neon_i8mm bl L(\type\()_hv_filter8_\isa) - mov v16.16b, v22.16b + srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v17.16b, v22.16b + srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v18.16b, v22.16b + srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v19.16b, v22.16b + srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v20.16b, v22.16b + srshr v20.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v21.16b, v22.16b + srshr v21.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - + srshr v22.8h, v22.8h, #2 +.else + bl L(\type\()_hv_filter8_\isa) + sshr v16.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v17.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v18.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v19.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v20.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v21.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v22.8h, v22.8h, #2 +.endif .align LOOP_ALIGN 8: ldr q23, [\lsrc] @@ -850,18 +866,19 @@ L(\type\()_8tap_h_hv_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v21.16b, v22.16b + shrn v21.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) + shrn v22.4h, v22.4s, #2 .align LOOP_ALIGN 4: @@ -919,18 +936,19 @@ L(\type\()_8tap_h_hv_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v21.16b, v22.16b + shrn v21.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) + 
shrn v22.4h, v22.4s, #2 .align LOOP_ALIGN 2: @@ -1001,18 +1019,29 @@ L(\type\()_6tap_hv_\isa): mov \lsrc, \src mov \ldst, \dst mov w8, \h - +.ifc \isa, neon_i8mm bl L(\type\()_hv_filter8_\isa) - mov v16.16b, v22.16b + srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v17.16b, v22.16b + srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v18.16b, v22.16b + srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v19.16b, v22.16b + srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v20.16b, v22.16b - + srshr v20.8h, v22.8h, #2 +.else + bl L(\type\()_hv_filter8_\isa) + sshr v16.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v17.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v18.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v19.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v20.8h, v22.8h, #2 +.endif .align LOOP_ALIGN 8: ldr q23, [\xmy] @@ -1100,13 +1129,7 @@ L(\type\()_hv_filter8_\isa): \dot v23.4s, v3.16b, v26.4b[0] \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v4.16b, v26.4b[1] -.ifc \isa, neon_i8mm uzp1 v22.8h, v22.8h, v23.8h - srshr v22.8h, v22.8h, #2 -.else - shrn v22.4h, v22.4s, #2 - shrn2 v22.8h, v23.4s, #2 -.endif ret .align FUNC_ALIGN @@ -1120,7 +1143,6 @@ L(\type\()_hv_filter4_\isa): .endif tbl v2.16b, {v4.16b}, v28.16b \dot v22.4s, v2.16b, v26.4b[0] - shrn v22.4h, v22.4s, #2 ret .align JUMP_ALIGN @@ -1129,15 +1151,15 @@ L(\type\()_hv_filter4_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 .align LOOP_ALIGN 4: @@ -1190,15 +1212,15 @@ L(\type\()_hv_filter4_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 .align LOOP_ALIGN 2: From 9b362df677dcbff3cb6ad2da3f5f9ba60235a463 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 16:07:23 +0200 Subject: [PATCH 15/22] AArch64: Optimize BTI landing pads of put_neon Move the BTI landing pads out of the inner loops of put_neon function, the only exception is the width=16 case where it is already outside of the loops. When BTI is enabled, the relative performance of omitting the AARCH64_VALID_JUMP_TARGET from the inner loops on Cortex-A510 (lower is better): w2: 0.981x w4: 0.991x w8: 0.612x w32: 0.687x w64: 0.813x w128: 0.892x Out-of-order CPUs are mostly unaffected. 
--- src/arm/64/mc.S | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 5b493be82..be6a67c71 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -843,8 +843,9 @@ function put_neon, export=1 sub x9, x9, w8, uxtw br x9 -2: +20: AARCH64_VALID_JUMP_TARGET +2: ld1 {v0.h}[0], [x2], x3 ld1 {v1.h}[0], [x2], x3 subs w5, w5, #2 @@ -852,8 +853,9 @@ function put_neon, export=1 st1 {v1.h}[0], [x0], x1 b.gt 2b ret -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 subs w5, w5, #2 @@ -861,8 +863,9 @@ function put_neon, export=1 st1 {v1.s}[0], [x0], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld1 {v0.8b}, [x2], x3 ld1 {v1.8b}, [x2], x3 subs w5, w5, #2 @@ -884,8 +887,9 @@ function put_neon, export=1 st1 {v1.16b}, [x8], x1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -895,8 +899,9 @@ function put_neon, export=1 add x0, x0, x1 b.gt 32b ret -64: +640: AARCH64_VALID_JUMP_TARGET +64: ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -910,8 +915,9 @@ function put_neon, export=1 add x0, x0, x1 b.gt 64b ret -128: +1280: AARCH64_VALID_JUMP_TARGET +128: ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] @@ -927,13 +933,13 @@ function put_neon, export=1 ret L(put_tbl): - .hword L(put_tbl) - 128b - .hword L(put_tbl) - 64b - .hword L(put_tbl) - 32b - .hword L(put_tbl) - 160b - .hword L(put_tbl) - 8b - .hword L(put_tbl) - 4b - .hword L(put_tbl) - 2b + .hword L(put_tbl) - 1280b + .hword L(put_tbl) - 640b + .hword L(put_tbl) - 320b + .hword L(put_tbl) - 160b + .hword L(put_tbl) - 80b + .hword L(put_tbl) - 40b + .hword L(put_tbl) - 20b endfunc From e9f815d473dde2a8f67147c0efe83dc065d08120 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 16:10:54 +0200 Subject: [PATCH 16/22] AArch64: Optimize jump table calculation of put_neon Save a complex arithmetic instruction in the jump table address calculation of put_neon function. --- src/arm/64/mc.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index be6a67c71..fa96f64a7 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -840,7 +840,7 @@ endfunc function put_neon, export=1 adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + sub x9, x9, x8 br x9 20: From 9ee822a85f035b8c5ef8655e1d439892872066f7 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 16:12:19 +0200 Subject: [PATCH 17/22] AArch64: Optimize put_neon function Optimize the copy part of subpel filters (the put_neon function). For small block sizes (<16) the usage of general purpose registers is usually the best way to do the copy. 
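Rows of 2, 4 or 8 pixels fit in a single general purpose register, so the lane-oriented ld1/st1 accesses can become plain ldrh/ldr loads and strh/str stores, two rows per iteration, with one stride update per pointer. As a sketch, the width=8 inner loop in the diff below takes this shape (x2/x3 source and stride, x0/x1 destination and stride, w5 height):

8:
        ldr             x9,  [x2]               // row 0: 8 pixels in one GPR
        ldr             x10, [x2, x3]           // row 1
        add             x2,  x2,  x3,  lsl #1   // advance the source by two rows
        subs            w5,  w5,  #2
        str             x9,  [x0]
        str             x10, [x0, x1]
        add             x0,  x0,  x1,  lsl #1   // advance the destination by two rows
        b.gt            8b
        ret

The wider cases move in the opposite direction: w32 and w64 switch from GPR pairs to paired q-register loads and stores, as the diff shows.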
Relative performance of micro benchmarks (lower is better): Cortex-A55: w2: 0.991x w4: 0.992x w8: 0.999x w16: 0.875x w32: 0.775x w64: 0.914x w128: 0.998x Cortex-A510: w2: 0.159x w4: 0.080x w8: 0.583x w16: 0.588x w32: 0.966x w64: 1.111x w128: 0.957x Cortex-A76: w2: 0.903x w4: 0.683x w8: 0.944x w16: 0.948x w32: 0.919x w64: 0.855x w128: 0.991x Cortex-A78: w32: 0.867x w64: 0.820x w128: 1.011x Cortex-A715: w32: 0.834x w64: 0.778x w128: 1.000x Cortex-X1: w32: 0.809x w64: 0.762x w128: 1.000x Cortex-X3: w32: 0.733x w64: 0.720x w128: 0.999x --- src/arm/64/mc.S | 108 ++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 53 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index fa96f64a7..68dbbe79a 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -846,89 +846,91 @@ function put_neon, export=1 20: AARCH64_VALID_JUMP_TARGET 2: - ld1 {v0.h}[0], [x2], x3 - ld1 {v1.h}[0], [x2], x3 - subs w5, w5, #2 - st1 {v0.h}[0], [x0], x1 - st1 {v1.h}[0], [x0], x1 + ldrh w9, [x2] + ldrh w10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + strh w9, [x0] + strh w10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 2b ret 40: AARCH64_VALID_JUMP_TARGET 4: - ld1 {v0.s}[0], [x2], x3 - ld1 {v1.s}[0], [x2], x3 - subs w5, w5, #2 - st1 {v0.s}[0], [x0], x1 - st1 {v1.s}[0], [x0], x1 + ldr w9, [x2] + ldr w10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str w9, [x0] + str w10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: - ld1 {v0.8b}, [x2], x3 - ld1 {v1.8b}, [x2], x3 - subs w5, w5, #2 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 + ldr x9, [x2] + ldr x10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str x9, [x0] + str x10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET - add x8, x0, x1 - lsl x1, x1, #1 - add x9, x2, x3 - lsl x3, x3, #1 16: - ld1 {v0.16b}, [x2], x3 - ld1 {v1.16b}, [x9], x3 - subs w5, w5, #2 - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x8], x1 + ldr q0, [x2] + ldr q1, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str q0, [x0] + str q1, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET 32: - ldp x6, x7, [x2] - ldp x8, x9, [x2, #16] - stp x6, x7, [x0] - subs w5, w5, #1 - stp x8, x9, [x0, #16] - add x2, x2, x3 - add x0, x0, x1 + ldp q0, q1, [x2] + add x2, x2, x3 + stp q0, q1, [x0] + add x0, x0, x1 + ldp q2, q3, [x2] + add x2, x2, x3 + stp q2, q3, [x0] + subs w5, w5, #2 + add x0, x0, x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET 64: - ldp x6, x7, [x2] - ldp x8, x9, [x2, #16] - stp x6, x7, [x0] - ldp x10, x11, [x2, #32] - stp x8, x9, [x0, #16] - subs w5, w5, #1 - ldp x12, x13, [x2, #48] - stp x10, x11, [x0, #32] - stp x12, x13, [x0, #48] - add x2, x2, x3 - add x0, x0, x1 + ldp q0, q1, [x2] + stp q0, q1, [x0] + ldp q2, q3, [x2, #32] + add x2, x2, x3 + stp q2, q3, [x0, #32] + subs w5, w5, #1 + add x0, x0, x1 b.gt 64b ret 1280: AARCH64_VALID_JUMP_TARGET 128: - ldp q0, q1, [x2] - ldp q2, q3, [x2, #32] - stp q0, q1, [x0] - ldp q4, q5, [x2, #64] - stp q2, q3, [x0, #32] - ldp q6, q7, [x2, #96] - subs w5, w5, #1 - stp q4, q5, [x0, #64] - stp q6, q7, [x0, #96] - add x2, x2, x3 - add x0, x0, x1 + ldp q0, q1, [x2] + stp q0, q1, [x0] + ldp q2, q3, [x2, #32] + stp q2, q3, [x0, #32] + ldp q4, q5, [x2, #64] + stp q4, q5, [x0, #64] + ldp q6, q7, [x2, #96] + add x2, x2, x3 + stp q6, q7, [x0, #96] + subs w5, w5, #1 + add x0, x0, x1 b.gt 128b ret From 03610dfef226888d409503ad509cf88342f78371 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 15:42:24 +0200 Subject: 
[PATCH 18/22] AArch64: Optimize BTI landing pads of prep_neon Move the BTI landing pads out of the inner loops of the prep_neon function. Only the width=4 and width=8 cases are affected. If BTI is enabled, moving the AARCH64_VALID_JUMP_TARGET out of the inner loops gives better execution speed on Cortex-A510 relative to the original (lower is better): w4: 0.969x w8: 0.722x Out-of-order cores are not affected. --- src/arm/64/mc.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 68dbbe79a..1ea8aeab9 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -953,8 +953,9 @@ function prep_neon, export=1 sub x9, x9, w8, uxtw br x9 -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v0.s}[0], [x1], x2 ld1 {v1.s}[0], [x1], x2 subs w4, w4, #2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 st1 {v0.4h, v1.4h}, [x0], #16 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x1], x2 subs w4, w4, #2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 st1 {v0.8h, v1.8h}, [x0], #32 b.gt 8b ret @@ -1071,8 +1073,8 @@ L(prep_tbl): .hword L(prep_tbl) - 640b .hword L(prep_tbl) - 320b .hword L(prep_tbl) - 160b - .hword L(prep_tbl) - 8b - .hword L(prep_tbl) - 4b + .hword L(prep_tbl) - 80b + .hword L(prep_tbl) - 40b endfunc From 465916958d507c430a43ebb1e3611cd8c94597e5 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 15:46:02 +0200 Subject: [PATCH 19/22] AArch64: Optimize jump table calculation of prep_neon Save a complex arithmetic instruction in the jump table address calculation of the prep_neon function. --- src/arm/64/mc.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 1ea8aeab9..02ed1a928 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -950,7 +950,7 @@ function prep_neon, export=1 adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + sub x9, x9, x8 br x9 40: From f3c7b6724497eb9abfb549c6c7a629d3415dd7ed Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 15:50:51 +0200 Subject: [PATCH 20/22] AArch64: Optimize prep_neon function Optimize the widening copy part of subpel filters (the prep_neon function). In this patch we combine widening shifts with widening multiplications in the inner loops to get maximum throughput. The change will increase .text by 36 bytes.
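The combination works because prep widens each 8-bit pixel to 16 bits and scales it by 16, and for these inputs x << 4 and x * 16 give identical results, so a widening shift (ushll/ushll2 by #4) and a widening multiply (umull/umull2 against a vector holding 16) are interchangeable; sending alternate register pairs through each form lets shifts and multiplies issue in parallel on cores where they use different execution pipes. A rough sketch of the pattern used in the inner loops below, with v24 set to 16 once in the prologue and x1/x2 as the source pointer and stride:

        movi            v24.16b, #16             // constant 16, set up once
        ...
        ldp             q4,  q5,  [x1]           // 32 source pixels
        add             x1,  x1,  x2
        ushll           v0.8h, v4.8b,  #4        // widen and shift left by 4
        ushll2          v1.8h, v4.16b, #4
        umull           v2.8h, v5.8b,  v24.8b    // widen and multiply by 16:
        umull2          v3.8h, v5.16b, v24.16b   // same result via the multiplier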
Relative performance of micro benchmarks (lower is better): Cortex-A55: mct_w4: 0.795x mct_w8: 0.913x mct_w16: 0.912x mct_w32: 0.838x mct_w64: 1.025x mct_w128: 1.002x Cortex-A510: mct_w4: 0.760x mct_w8: 0.636x mct_w16: 0.640x mct_w32: 0.854x mct_w64: 0.864x mct_w128: 0.995x Cortex-A72: mct_w4: 0.616x mct_w8: 0.854x mct_w16: 0.756x mct_w32: 1.052x mct_w64: 1.044x mct_w128: 0.702x Cortex-A76: mct_w4: 0.837x mct_w8: 0.797x mct_w16: 0.841x mct_w32: 0.804x mct_w64: 0.948x mct_w128: 0.904x Cortex-A78: mct_w16: 0.542x mct_w32: 0.725x mct_w64: 0.741x mct_w128: 0.745x Cortex-A715: mct_w16: 0.561x mct_w32: 0.720x mct_w64: 0.740x mct_w128: 0.748x Cortex-X1: mct_w32: 0.886x mct_w64: 0.882x mct_w128: 0.917x Cortex-X3: mct_w32: 0.835x mct_w64: 0.803x mct_w128: 0.808x --- src/arm/64/mc.S | 181 +++++++++++++++++++++++++++--------------------- 1 file changed, 103 insertions(+), 78 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 02ed1a928..736b2bb4e 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -950,6 +950,7 @@ endfunc function prep_neon, export=1 adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] + movi v24.16b, #16 sub x9, x9, x8 br x9 @@ -957,114 +958,138 @@ function prep_neon, export=1 AARCH64_VALID_JUMP_TARGET 4: ld1 {v0.s}[0], [x1], x2 + ld1 {v0.s}[1], [x1], x2 ld1 {v1.s}[0], [x1], x2 - subs w4, w4, #2 + ld1 {v1.s}[1], [x1], x2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 - st1 {v0.4h, v1.4h}, [x0], #16 + subs w4, w4, #4 + stp q0, q1, [x0], #32 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: - ld1 {v0.8b}, [x1], x2 - ld1 {v1.8b}, [x1], x2 - subs w4, w4, #2 + ldr d0, [x1] + ldr d1, [x1, x2] + add x1, x1, x2, lsl #1 + ldr d2, [x1] + ldr d3, [x1, x2] + add x1, x1, x2, lsl #1 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 - st1 {v0.8h, v1.8h}, [x0], #32 + umull v2.8h, v2.8b, v24.8b + umull v3.8h, v3.8b, v24.8b + subs w4, w4, #4 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + add x0, x0, #64 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET - add x9, x1, x2 - lsl x2, x2, #1 16: - ld1 {v0.16b}, [x1], x2 - ld1 {v1.16b}, [x9], x2 - subs w4, w4, #2 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + ldr q1, [x1] + ldr q3, [x1, x2] + add x1, x1, x2, lsl #1 + ushll v0.8h, v1.8b, #4 + ushll2 v1.8h, v1.16b, #4 + ldr q5, [x1] + ldr q7, [x1, x2] + add x1, x1, x2, lsl #1 + umull v2.8h, v3.8b, v24.8b + umull2 v3.8h, v3.16b, v24.16b + ushll v4.8h, v5.8b, #4 + ushll2 v5.8h, v5.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #4 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET - add x8, x0, w3, uxtw 32: - ld1 {v0.16b, v1.16b}, [x1], x2 - subs w4, w4, #2 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ld1 {v2.16b, v3.16b}, [x1], x2 - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - ushll v16.8h, v2.8b, #4 - st1 {v4.8h, v5.8h}, [x0], x7 - ushll2 v17.8h, v2.16b, #4 - st1 {v6.8h, v7.8h}, [x8], x7 - ushll v18.8h, v3.8b, #4 - st1 {v16.8h, v17.8h}, [x0], x7 - ushll2 v19.8h, v3.16b, #4 - st1 {v18.8h, v19.8h}, [x8], x7 + ldp q4, q5, [x1] + add x1, x1, x2 + ldp q6, q7, [x1] + add x1, x1, x2 + ushll v0.8h, v4.8b, #4 + ushll2 v1.8h, v4.16b, #4 + umull v2.8h, v5.8b, v24.8b + umull2 v3.8h, v5.16b, v24.16b + ushll v4.8h, v6.8b, #4 + ushll2 v5.8h, v6.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #2 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp 
q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET - add x8, x0, #32 - mov x6, #64 64: - ldp q0, q1, [x1] - subs w4, w4, #1 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ldp q2, q3, [x1, #32] - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - add x1, x1, x2 - ushll v16.8h, v2.8b, #4 - st1 {v4.8h, v5.8h}, [x0], x6 - ushll2 v17.8h, v2.16b, #4 - ushll v18.8h, v3.8b, #4 - st1 {v6.8h, v7.8h}, [x8], x6 - ushll2 v19.8h, v3.16b, #4 - st1 {v16.8h, v17.8h}, [x0], x6 - st1 {v18.8h, v19.8h}, [x8], x6 + ldp q4, q5, [x1] + ldp q6, q7, [x1, #32] + add x1, x1, x2 + ushll v0.8h, v4.8b, #4 + ushll2 v1.8h, v4.16b, #4 + umull v2.8h, v5.8b, v24.8b + umull2 v3.8h, v5.16b, v24.16b + ushll v4.8h, v6.8b, #4 + ushll2 v5.8h, v6.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #1 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 64b ret 1280: AARCH64_VALID_JUMP_TARGET - add x8, x0, #64 - mov x6, #128 128: - ldp q0, q1, [x1] - ldp q2, q3, [x1, #32] - ushll v16.8h, v0.8b, #4 - ushll2 v17.8h, v0.16b, #4 - ushll v18.8h, v1.8b, #4 - ushll2 v19.8h, v1.16b, #4 - ushll v20.8h, v2.8b, #4 - ushll2 v21.8h, v2.16b, #4 - ldp q4, q5, [x1, #64] - st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 - ushll v22.8h, v3.8b, #4 - ushll2 v23.8h, v3.16b, #4 - ushll v24.8h, v4.8b, #4 - ushll2 v25.8h, v4.16b, #4 - ushll v26.8h, v5.8b, #4 - ushll2 v27.8h, v5.16b, #4 - ldp q6, q7, [x1, #96] - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 - ushll v28.8h, v6.8b, #4 - ushll2 v29.8h, v6.16b, #4 - ushll v30.8h, v7.8b, #4 - ushll2 v31.8h, v7.16b, #4 - subs w4, w4, #1 - add x1, x1, x2 - st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 - st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 + ldp q28, q29, [x1] + ldp q30, q31, [x1, #32] + ushll v16.8h, v28.8b, #4 + ushll2 v17.8h, v28.16b, #4 + umull v18.8h, v29.8b, v24.8b + umull2 v19.8h, v29.16b, v24.16b + ushll v20.8h, v30.8b, #4 + ushll2 v21.8h, v30.16b, #4 + umull v22.8h, v31.8b, v24.8b + umull2 v23.8h, v31.16b, v24.16b + ldp q28, q29, [x1, #64] + ldp q30, q31, [x1, #96] + add x1, x1, x2 + stp q16, q17, [x0] + stp q18, q19, [x0, #32] + stp q20, q21, [x0, #64] + stp q22, q23, [x0, #96] + ushll v16.8h, v28.8b, #4 + ushll2 v17.8h, v28.16b, #4 + umull v18.8h, v29.8b, v24.8b + umull2 v19.8h, v29.16b, v24.16b + ushll v20.8h, v30.8b, #4 + ushll2 v21.8h, v30.16b, #4 + umull v22.8h, v31.8b, v24.8b + umull2 v23.8h, v31.16b, v24.16b + subs w4, w4, #1 + stp q16, q17, [x0, #128] + stp q18, q19, [x0, #160] + stp q20, q21, [x0, #192] + stp q22, q23, [x0, #224] + add x0, x0, #256 b.gt 128b ret From 3ae38b3d643cc9bcbb5572b721ee9cf50c8ba0ad Mon Sep 17 00:00:00 2001 From: Frank Bossen Date: Mon, 17 Jun 2024 18:33:41 -0400 Subject: [PATCH 21/22] Port C code changes to Rust --- build.rs | 19 ++++++++++- include/common/bitdepth.rs | 10 ++++-- src/mc.rs | 67 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 3 deletions(-) diff --git a/build.rs b/build.rs index 3062cf902..5f6bc6f81 100644 --- a/build.rs +++ b/build.rs @@ -103,6 +103,13 @@ mod asm { if let Arch::Arm(arch) = arch { define(Define::bool("ARCH_ARM", arch == ArchArm::Arm32)); define(Define::bool("ARCH_AARCH64", arch == ArchArm::Arm64)); + + if arch == ArchArm::Arm64 { + define(Define::bool("HAVE_DOTPROD", features.contains("dotprod"))); + } + if arch == ArchArm::Arm64 { + define(Define::bool("HAVE_I8MM", features.contains("i8mm"))); + } } if let Arch::X86(arch) = arch { @@ -199,6 +206,7 
@@ mod asm { ][..]; let arm_generic = &["itx", "msac", "refmvs", "looprestoration_common"][..]; + let arm_dotprod = &["mc_dotprod"][..]; let arm_bpc8 = &[ "cdef", "filmgrain", @@ -243,11 +251,20 @@ mod asm { #[cfg(feature = "bitdepth_16")] arm_bpc16, ][..]; + let arm64_all = &[ + arm_generic, + arm_dotprod, + #[cfg(feature = "bitdepth_8")] + arm_bpc8, + #[cfg(feature = "bitdepth_16")] + arm_bpc16, + ][..]; let asm_file_names = match arch { Arch::X86(ArchX86::X86_32) => x86_all, Arch::X86(ArchX86::X86_64) => x86_64_all, - Arch::Arm(..) => arm_all, + Arch::Arm(ArchArm::Arm32) => arm_all, + Arch::Arm(ArchArm::Arm64) => arm64_all, }; let asm_file_dir = match arch { diff --git a/include/common/bitdepth.rs b/include/common/bitdepth.rs index 002d4e201..398a29019 100644 --- a/include/common/bitdepth.rs +++ b/include/common/bitdepth.rs @@ -457,7 +457,10 @@ macro_rules! bd_fn { /// /// Similar to [`bd_fn!`] except that it selects which [`BitDepth`] `fn` /// based on `$bpc:literal bpc` instead of `$BD:ty`. -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] +#[cfg(all( + feature = "asm", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] macro_rules! bpc_fn { ($bpc:literal bpc, $name:ident, $asm:ident) => {{ use $crate::include::common::bitdepth::fn_identity; @@ -487,7 +490,10 @@ macro_rules! fn_identity { ))] pub(crate) use bd_fn; -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] +#[cfg(all( + feature = "asm", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] pub(crate) use bpc_fn; #[allow(unused)] diff --git a/src/mc.rs b/src/mc.rs index 7834afad3..bc4db6f4a 100644 --- a/src/mc.rs +++ b/src/mc.rs @@ -46,6 +46,9 @@ use crate::include::common::bitdepth::bd_fn; #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] use crate::include::common::bitdepth::{bpc_fn, BPC}; +#[cfg(all(feature = "asm", target_arch = "aarch64"))] +use crate::include::common::bitdepth::bpc_fn; + #[inline(never)] fn put_rust( dst: Rav1dPictureDataComponentOffset, @@ -2300,6 +2303,70 @@ impl Rav1dMCDSPContext { self.warp8x8t = bd_fn!(warp8x8t::decl_fn, BD, warp_affine_8x8t, neon); self.emu_edge = bd_fn!(emu_edge::decl_fn, BD, emu_edge, neon); + #[cfg(target_feature = "dotprod")] + if BD::BITDEPTH == 8 { + if !flags.contains(CpuFlags::DOTPROD) { + return self; + } + + self.mc = enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular, neon_dotprod), + RegularSmooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular_smooth, neon_dotprod), + RegularSharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular_sharp, neon_dotprod), + SmoothRegular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth_regular, neon_dotprod), + Smooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth, neon_dotprod), + SmoothSharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth_sharp, neon_dotprod), + SharpRegular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp_regular, neon_dotprod), + SharpSmooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp_smooth, neon_dotprod), + Sharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp, neon_dotprod), + Bilinear => bpc_fn!(mc::decl_fn, 8 bpc, put_bilin, neon), + }); + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular, neon_dotprod), + RegularSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_smooth, neon_dotprod), + RegularSharp8Tap => 
bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_sharp, neon_dotprod), + SmoothRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_regular, neon_dotprod), + Smooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth, neon_dotprod), + SmoothSharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_sharp, neon_dotprod), + SharpRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_regular, neon_dotprod), + SharpSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_smooth, neon_dotprod), + Sharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp, neon_dotprod), + Bilinear => bpc_fn!(mct::decl_fn, 8 bpc, prep_bilin, neon), + }); + } + + #[cfg(target_feature = "i8mm")] + if BD::BITDEPTH == 8 { + if !flags.contains(CpuFlags::I8MM) { + return self; + } + + self.mc = enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular, neon_i8mm), + RegularSmooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular_smooth, neon_i8mm), + RegularSharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular_sharp, neon_i8mm), + SmoothRegular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth_regular, neon_i8mm), + Smooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth, neon_i8mm), + SmoothSharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth_sharp, neon_i8mm), + SharpRegular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp_regular, neon_i8mm), + SharpSmooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp_smooth, neon_i8mm), + Sharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp, neon_i8mm), + Bilinear => bpc_fn!(mc::decl_fn, 8 bpc, put_bilin, neon), + }); + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular, neon_i8mm), + RegularSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_smooth, neon_i8mm), + RegularSharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_sharp, neon_i8mm), + SmoothRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_regular, neon_i8mm), + Smooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth, neon_i8mm), + SmoothSharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_sharp, neon_i8mm), + SharpRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_regular, neon_i8mm), + SharpSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_smooth, neon_i8mm), + Sharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp, neon_i8mm), + Bilinear => bpc_fn!(mct::decl_fn, 8 bpc, prep_bilin, neon), + }); + } + self } From 6b87a77489f2c2bfb652f1446c9b1012932ade14 Mon Sep 17 00:00:00 2001 From: Frank Bossen Date: Wed, 17 Jul 2024 18:40:04 -0400 Subject: [PATCH 22/22] Specify `armv8.6-a` architecture when building aarch64 This enables building code requiring `i8mm` ISA extension --- build.rs | 13 +++++++------ src/mc.rs | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/build.rs b/build.rs index 5f6bc6f81..4cb7c48ea 100644 --- a/build.rs +++ b/build.rs @@ -105,10 +105,8 @@ mod asm { define(Define::bool("ARCH_AARCH64", arch == ArchArm::Arm64)); if arch == ArchArm::Arm64 { - define(Define::bool("HAVE_DOTPROD", features.contains("dotprod"))); - } - if arch == ArchArm::Arm64 { - define(Define::bool("HAVE_I8MM", features.contains("i8mm"))); + define(Define::bool("HAVE_DOTPROD", true)); + define(Define::bool("HAVE_I8MM", true)); } } @@ -308,8 +306,11 @@ mod asm { } cc.compile(rav1dasm); } else { - cc::Build::new() - .files(asm_file_paths) + let mut cc = cc::Build::new(); + if arch == Arch::Arm(ArchArm::Arm64) { + 
cc.flag("-march=armv8.6-a"); + } + cc.files(asm_file_paths) .include(".") .include(&out_dir) .debug(cfg!(debug_assertions)) diff --git a/src/mc.rs b/src/mc.rs index bc4db6f4a..538aa7546 100644 --- a/src/mc.rs +++ b/src/mc.rs @@ -2303,7 +2303,7 @@ impl Rav1dMCDSPContext { self.warp8x8t = bd_fn!(warp8x8t::decl_fn, BD, warp_affine_8x8t, neon); self.emu_edge = bd_fn!(emu_edge::decl_fn, BD, emu_edge, neon); - #[cfg(target_feature = "dotprod")] + #[cfg(target_arch = "aarch64")] if BD::BITDEPTH == 8 { if !flags.contains(CpuFlags::DOTPROD) { return self; @@ -2335,7 +2335,7 @@ impl Rav1dMCDSPContext { }); } - #[cfg(target_feature = "i8mm")] + #[cfg(target_arch = "aarch64")] if BD::BITDEPTH == 8 { if !flags.contains(CpuFlags::I8MM) { return self;