From 1cfcf7acd1146f7a166ad520ff64d19e88165ef1 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 7 Mar 2024 13:47:02 -0800 Subject: [PATCH 01/22] arm64: Use different instruction sequence for taking global address with HWASan When dav1d is built with HWASan, the build fails because globals are tagged and the normal adrp/add instruction sequence does not have enough range to take the tagged address. Therefore, use an alternative instruction sequence when HWASan is enabled, which is the same as what the compiler generates. --- src/arm/64/util.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/arm/64/util.S b/src/arm/64/util.S index 64d73e3a5..1b3f319ce 100644 --- a/src/arm/64/util.S +++ b/src/arm/64/util.S @@ -32,6 +32,10 @@ #include "config.h" #include "src/arm/asm.S" +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + .macro movrel rd, val, offset=0 #if defined(__APPLE__) .if \offset < 0 @@ -51,6 +55,10 @@ adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) .endif +#elif __has_feature(hwaddress_sanitizer) + adrp \rd, :pg_hi21_nc:\val+(\offset) + movk \rd, #:prel_g3:\val+0x100000000 + add \rd, \rd, :lo12:\val+(\offset) #elif defined(PIC) adrp \rd, \val+(\offset) add \rd, \rd, :lo12:\val+(\offset) From a18310da554a4b3865c707c86db9ae9a3d781192 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Fri, 22 Mar 2024 16:20:38 +0100 Subject: [PATCH 02/22] AArch64: Add DotProd support for convolutions Add an Armv8.4-A DotProd code path for standard bitdepth convolutions. Only horizontal-vertical (HV) convolutions have 6-tap specialisations of their vertical passes. All other convolutions are 4- or 8-tap filters which fit well with the 4-element SDOT instruction. Benchmarks show up-to 7-29% FPS increase depending on the input video and the CPU used. This patch will increase the .text by around 6.5 KiB. Performance highly depends on the SDOT and MLA throughput ratio, this can be seen on the vertical filter cases. 
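As an illustration (not part of the patch): each indexed SDOT accumulates a
4-element dot product of signed bytes into every 32-bit lane of its
destination register, so the 4-tap filters need a single SDOT per
accumulator and the 8-tap filters need two, as in this pair from the new
vertical pass:

    sdot    v0.4s, v16.16b, v7.4b[0]   // lane i: acc[i] += s0[4i..4i+3] . taps[0..3]
    sdot    v0.4s, v17.16b, v7.4b[1]   // lane i: acc[i] += s1[4i..4i+3] . taps[4..7]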
Small cores are also affected by the TBL execution latencies: Relative performance to the C reference on some CPUs: A76 A78 X1 A55 regular w4 hv neon: 5.52x 5.78x 10.75x 8.27x regular w4 hv dotprod: 7.94x 8.49x 16.84x 8.09x sharp w4 hv neon: 5.27x 5.22x 9.06x 7.87x sharp w4 hv dotprod: 6.61x 6.73x 12.64x 6.89x regular w8 hv neon: 1.95x 2.19x 2.56x 3.16x regular w8 hv dotprod: 3.23x 2.81x 3.20x 3.26x sharp w8 hv neon: 1.61x 1.79x 2.05x 2.72x sharp w8 hv dotprod: 2.72x 2.29x 2.66x 2.76x regular w16 hv neon: 1.63x 2.04x 2.16x 2.73x regular w16 hv dotprod: 2.72x 2.57x 2.67x 2.80x sharp w16 hv neon: 1.33x 1.67x 1.74x 2.34x sharp w16 hv dotprod: 2.31x 2.14x 2.26x 2.39x regular w32 hv neon: 1.48x 1.92x 1.94x 2.51x regular w32 hv dotprod: 2.49x 2.40x 2.33x 2.58x sharp w32 hv neon: 1.21x 1.56x 1.53x 2.14x sharp w32 hv dotprod: 2.12x 2.02x 2.00x 2.22x regular w64 hv neon: 1.42x 1.87x 1.85x 2.40x regular w64 hv dotprod: 2.40x 2.32x 2.21x 2.46x sharp w64 hv neon: 1.16x 1.52x 1.46x 2.04x sharp w64 hv dotprod: 2.02x 1.96x 1.90x 2.11x regular w128 hv neon: 1.39x 1.84x 1.80x 2.27x regular w128 hv dotprod: 2.33x 2.28x 2.14x 2.35x sharp w128 hv neon: 1.14x 1.50x 1.42x 1.94x sharp w128 hv dotprod: 1.98x 1.93x 1.84x 2.03x regular w8 h neon: 2.61x 3.20x 3.51x 3.55x regular w8 h dotprod: 4.43x 5.17x 6.26x 4.30x sharp w8 h neon: 2.01x 2.80x 2.89x 3.12x sharp w8 h dotprod: 4.42x 5.16x 6.27x 4.28x regular w16 h neon: 2.17x 3.13x 2.92x 3.35x regular w16 h dotprod: 4.38x 4.27x 4.53x 3.90x sharp w16 h neon: 1.74x 2.65x 2.48x 2.92x sharp w16 h dotprod: 4.33x 4.27x 4.53x 3.91x regular w64 h neon: 1.92x 2.82x 2.39x 2.96x regular w64 h dotprod: 3.68x 3.60x 3.40x 3.18x sharp w64 h neon: 1.47x 2.33x 2.05x 2.54x sharp w64 h dotprod: 3.68x 3.60x 3.40x 3.17x regular w4 v neon: 5.39x 7.38x 10.27x 11.41x regular w4 v dotprod: 9.46x 14.15x 18.72x 9.84x sharp w4 v neon: 4.51x 6.39x 8.17x 10.70x sharp w4 v dotprod: 9.35x 14.20x 18.63x 9.78x regular w16 v neon: 3.03x 4.03x 4.65x 6.28x regular w16 v dotprod: 4.64x 3.75x 4.78x 3.89x sharp w16 v neon: 2.29x 3.09x 3.44x 5.52x sharp w16 v dotprod: 4.62x 3.74x 4.77x 3.89x regular w64 v neon: 2.17x 3.14x 3.19x 4.46x regular w64 v dotprod: 3.43x 3.00x 3.31x 2.74x sharp w64 v neon: 1.61x 2.42x 2.34x 3.89x sharp w64 v dotprod: 3.38x 3.00x 3.29x 2.73x --- src/arm/64/mc.S | 4 +- src/arm/64/mc_dotprod.S | 1413 +++++++++++++++++++++++++++++++++++++++ src/arm/mc.h | 85 +-- src/meson.build | 1 + 4 files changed, 1461 insertions(+), 42 deletions(-) create mode 100644 src/arm/64/mc_dotprod.S diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 3df0393c3..5b493be82 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -837,7 +837,7 @@ endfunc // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). -function put_neon +function put_neon, export=1 adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw @@ -939,7 +939,7 @@ endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x8 is set to (clz(w)-24), and x7 to w*2. -function prep_neon +function prep_neon, export=1 adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] sub x9, x9, w8, uxtw diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S new file mode 100644 index 000000000..fcf04ee4d --- /dev/null +++ b/src/arm/64/mc_dotprod.S @@ -0,0 +1,1413 @@ +/* + * Copyright © 2024, VideoLAN and dav1d authors + * Copyright © 2024, Janne Grunau + * Copyright © 2024, Martin Storsjo + * Copyright © 2024, Arm Limited + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "src/arm/asm.S" +#include "util.S" + + +#if HAVE_DOTPROD +ENABLE_DOTPROD + +// No spaces in these expressions, due to gas-preprocessor. It is translated by +// -1 to save the negative offset at getting the address of `mc_subpel_filters`. +#define REGULAR1 (((0*15-1)<<7)|(3*15-1)) +#define SMOOTH1 (((1*15-1)<<7)|(4*15-1)) +#define SHARP1 (((2*15-1)<<7)|(3*15-1)) + +#define FUNC_ALIGN 2 +#define JUMP_ALIGN 2 +#define LOOP_ALIGN 2 + + +// Lookup table used to help conversion of shifted 32-bit values to 8-bit. + .align 4 +L(hv_tbl_neon_dotprod): + .byte 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30 + +// Shuffle indices to permute horizontal samples in preparation for input to +// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the +// interval of [-3, 4] relative to the current sample position. We load samples +// from index value -4 to keep loads word aligned, so the shuffle bytes are +// translated by 1 to handle this. + .align 4 +L(h_tbl_neon_dotprod): + .byte 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7 + .byte 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11 + .byte 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15 + .byte 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 + +// Vertical convolutions are also using SDOT instructions, where a 128-bit +// register contains a transposed 4x4 matrix of values. Subsequent iterations of +// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop +// iteration. These shuffle indices shift and merge this 4x4 matrix with the +// values of a new line. 
+ .align 4 +L(v_tbl_neon_dotprod): + .byte 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28 + .byte 1, 2, 3, 16, 5, 6, 7, 17, 9, 10, 11, 18, 13, 14, 15, 19 + .byte 1, 2, 3, 20, 5, 6, 7, 21, 9, 10, 11, 22, 13, 14, 15, 23 + .byte 1, 2, 3, 24, 5, 6, 7, 25, 9, 10, 11, 26, 13, 14, 15, 27 + .byte 1, 2, 3, 28, 5, 6, 7, 29, 9, 10, 11, 30, 13, 14, 15, 31 + + +.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1 +function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN + mov x9, \type_h + mov x10, \type_v + .if \jump + b \op\()_8tap_\isa + .endif +endfunc +.endm + +.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd +make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa +make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa +make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa +make_8tap_fn \type, smooth_sharp, SMOOTH1, SHARP1, \isa +make_8tap_fn \type, smooth, SMOOTH1, SMOOTH1, \isa +make_8tap_fn \type, smooth_regular, SMOOTH1, REGULAR1, \isa +make_8tap_fn \type, regular_sharp, REGULAR1, SHARP1, \isa +make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1, \isa +make_8tap_fn \type, regular, REGULAR1, REGULAR1, \isa, jump=0 + +function \type\()_8tap_\isa, align=FUNC_ALIGN + clz w8, \w + mov w11, #0x4081 // (1 << 14) | (1 << 7) | (1 << 0) + sub w8, w8, #24 // for jump tables + movrel x12, X(mc_subpel_filters) + cbnz \mx, L(\type\()_8tap_h_hv_\isa) + cbnz \my, L(\type\()_8tap_v_\isa) +.ifc \type, prep + add \wd_strd, \w, \w // prep_neon needs w * 2 as stride +.endif + b X(\type\()_neon) + + .align JUMP_ALIGN +L(\type\()_8tap_v_\isa): + madd \my, \my, w11, w10 +.ifc \type, prep + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding +.endif + sub \src, \src, \s_strd + ldr q6, L(v_tbl_neon_dotprod) +.ifc \type, prep + dup v4.4s, w8 +.endif + ubfx w11, \my, #7, #7 + and \my, \my, #0x7F + ldr q28, L(v_tbl_neon_dotprod) + 16 + cmp \h, #4 + csel \my, \my, w11, le + sub \src, \src, \s_strd, lsl #1 // src - src_stride * 3 + ldr q29, L(v_tbl_neon_dotprod) + 32 + add \xmy, x12, \xmy, lsl #3 // subpel V filter address + movi v5.16b, #128 + ldr d7, [\xmy] + cmp \w, #8 + b.eq 80f + b.lt 40f + + // .align JUMP_ALIGN // fallthrough +160: // V - 16xN+ + ldr q30, L(v_tbl_neon_dotprod) + 48 + ldr q31, L(v_tbl_neon_dotprod) + 64 +.ifc \type, prep + add \wd_strd, \w, \w +.endif + .align LOOP_ALIGN +161: + mov \lsrc, \src + mov \ldst, \dst + sub w8, \h, #1 + + ldr q16, [\lsrc] + ldr q17, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q18, [\lsrc] + ldr q19, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + + zip1 v0.16b, v16.16b, v17.16b + zip2 v1.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip2 v3.16b, v18.16b, v19.16b + + ldr q20, [\lsrc] + ldr q21, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + ldr q22, [\lsrc] + ldr q23, [\lsrc, \s_strd] + add \lsrc, \lsrc, \s_strd, lsl #1 + + zip1 v18.16b, v20.16b, v21.16b + zip2 v21.16b, v20.16b, v21.16b + zip1 v24.16b, v22.16b, v23.16b + zip2 v27.16b, v22.16b, v23.16b + + zip1 v16.8h, v0.8h, v2.8h + zip2 v19.8h, v0.8h, v2.8h + zip1 v22.8h, v1.8h, v3.8h + zip2 v25.8h, v1.8h, v3.8h + + zip1 v17.8h, v18.8h, v24.8h + zip2 v20.8h, v18.8h, v24.8h + zip1 v23.8h, v21.8h, v27.8h + zip2 v26.8h, v21.8h, v27.8h + + sub v16.16b, v16.16b, v5.16b + sub v19.16b, v19.16b, v5.16b + sub v22.16b, v22.16b, v5.16b + sub v25.16b, v25.16b, v5.16b + + sub v17.16b, v17.16b, v5.16b + sub v20.16b, v20.16b, v5.16b + sub v23.16b, v23.16b, v5.16b + sub v26.16b, v26.16b, v5.16b + + .align LOOP_ALIGN +16: + 
ldr q27, [\lsrc] + add \lsrc, \lsrc, \s_strd +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sub v18.16b, v27.16b, v5.16b + sub v21.16b, v27.16b, v5.16b + sub v24.16b, v27.16b, v5.16b + sub v27.16b, v27.16b, v5.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v2.4s, v22.16b, v7.4b[0] + sdot v3.4s, v25.16b, v7.4b[0] + + tbl v16.16b, {v16.16b, v17.16b}, v6.16b + tbl v19.16b, {v19.16b, v20.16b}, v6.16b + tbl v22.16b, {v22.16b, v23.16b}, v6.16b + tbl v25.16b, {v25.16b, v26.16b}, v6.16b + + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v20.16b, v7.4b[1] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v26.16b, v7.4b[1] + + tbl v17.16b, {v17.16b, v18.16b}, v28.16b + tbl v20.16b, {v20.16b, v21.16b}, v29.16b + tbl v23.16b, {v23.16b, v24.16b}, v30.16b + tbl v26.16b, {v26.16b, v27.16b}, v31.16b + + subs w8, w8, #1 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + st1 {v0.8h, v1.8h}, [\ldst], \d_strd +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun2 v0.16b, v2.8h, #6 + st1 {v0.16b}, [\ldst], \d_strd +.endif + b.gt 16b + +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sdot v0.4s, v16.16b, v7.4b[0] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v2.4s, v22.16b, v7.4b[0] + sdot v3.4s, v25.16b, v7.4b[0] + + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v20.16b, v7.4b[1] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v26.16b, v7.4b[1] + + subs \w, \w, #16 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\ldst] + add \dst, \dst, #32 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun2 v0.16b, v2.8h, #6 + str q0, [\ldst] + add \dst, \dst, #16 +.endif + add \src, \src, #16 + b.gt 161b + ret + + .align JUMP_ALIGN +80: // V - 8xN + ldr d16, [\src] + ldr d17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d18, [\src] + ldr d19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr d20, [\src] + ldr d21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr d22, [\src] + ldr d23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 // for prep: sub is enough + + zip1 v0.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip1 v18.16b, v20.16b, v21.16b + zip1 v24.16b, v22.16b, v23.16b + + zip1 v16.8h, v0.8h, v2.8h + zip2 v19.8h, v0.8h, v2.8h + zip1 v17.8h, v18.8h, v24.8h + zip2 v20.8h, v18.8h, v24.8h + + sub v16.16b, v16.16b, v5.16b + sub v19.16b, v19.16b, v5.16b + sub v17.16b, v17.16b, v5.16b + sub v20.16b, v20.16b, v5.16b +.ifc \type, put + b.eq 82f +.endif + + .align LOOP_ALIGN +8: + ldr d21, [\src] + ldr d27, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.endif + sub v18.16b, v21.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + sub v24.16b, v27.16b, v5.16b + sub v27.16b, v27.16b, v5.16b + + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + tbl v23.16b, {v17.16b, 
v18.16b}, v28.16b + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + tbl v16.16b, {v22.16b, v23.16b}, v6.16b + tbl v19.16b, {v25.16b, v26.16b}, v6.16b + tbl v17.16b, {v23.16b, v24.16b}, v28.16b + tbl v20.16b, {v26.16b, v27.16b}, v29.16b + + sdot v2.4s, v22.16b, v7.4b[0] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v25.16b, v7.4b[0] + sdot v3.4s, v26.16b, v7.4b[1] + + subs \h, \h, #2 + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\dst], #32 +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v2.8h, #6 + str d0, [\dst] + str d1, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 8b + +.ifc \type, put + .align JUMP_ALIGN +82: + ldr d21, [\src] + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + movi v2.4s, #32, lsl 8 + movi v3.4s, #32, lsl 8 +.else + ldr d21, [\src] + mov v0.16b, v4.16b + mov v1.16b, v4.16b + mov v2.16b, v4.16b + mov v3.16b, v4.16b +.endif + sub v18.16b, v21.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + + tbl v22.16b, {v16.16b, v17.16b}, v6.16b + tbl v25.16b, {v19.16b, v20.16b}, v6.16b + tbl v23.16b, {v17.16b, v18.16b}, v28.16b + tbl v26.16b, {v20.16b, v21.16b}, v29.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + sdot v2.4s, v22.16b, v7.4b[0] + sdot v2.4s, v23.16b, v7.4b[1] + sdot v3.4s, v25.16b, v7.4b[0] + sdot v3.4s, v26.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + uzp1 v2.8h, v2.8h, v3.8h +.ifc \type, prep + sshr v0.8h, v0.8h, #2 + sshr v1.8h, v2.8h, #2 + stp q0, q1, [\dst] +.else + sqrshrun v0.8b, v0.8h, #6 + sqrshrun v1.8b, v2.8h, #6 + str d0, [\dst] + str d1, [\dst, \d_strd] +.endif + ret + + .align JUMP_ALIGN +40: // V - 4xN or 2xN (put only) +.ifc \type, put + cmp \w, #2 + b.eq 20f +.endif + ldr s16, [\src] + ldr s17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr s18, [\src] + ldr s19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr s20, [\src] + ldr s21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr s22, [\src] + ldr s23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 // for prep: sub is enough + + zip1 v0.8b, v16.8b, v17.8b + zip1 v2.8b, v18.8b, v19.8b + zip1 v18.8b, v20.8b, v21.8b + zip1 v24.8b, v22.8b, v23.8b + + zip1 v16.8h, v0.8h, v2.8h + zip1 v17.8h, v18.8h, v24.8h + + sub v16.16b, v16.16b, v5.16b + sub v17.16b, v17.16b, v5.16b +.ifc \type, put + b.eq 42f +.endif + + .align LOOP_ALIGN +4: + ldr s18, [\src] + ldr s21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 +.ifc \type, prep + mov v0.16b, v4.16b + mov v1.16b, v4.16b +.else + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 +.endif + sub v18.16b, v18.16b, v5.16b + sub v21.16b, v21.16b, v5.16b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + tbl v16.16b, {v19.16b, v20.16b}, v6.16b + tbl v17.16b, {v20.16b, v21.16b}, v28.16b + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] +.ifc \type, prep + subs \h, \h, #2 + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + str q0, [\dst], #16 +.else + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + subs \h, \h, #2 + fmov x8, d0 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 
+.endif + b.gt 4b + +.ifc \type, put + .align JUMP_ALIGN +42: + ldr s18, [\src] + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 +.else + ldr s18, [\src] + mov v0.16b, v4.16b + mov v1.16b, v4.16b +.endif + sub v18.16b, v18.16b, v5.16b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] +.ifc \type, prep + shrn v0.4h, v0.4s, #2 + shrn2 v0.8h, v1.4s, #2 + str q0, [\dst] + ret +.else + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + fmov x8, d0 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + ret + + .align JUMP_ALIGN +20: // V - 2xN + ldr h16, [\src] + ldr h17, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr h18, [\src] + ldr h19, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + ldr h20, [\src] + ldr h21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + ldr h22, [\src] + ldr h23, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + subs \h, \h, #2 + + zip1 v0.8b, v16.8b, v17.8b + zip1 v2.8b, v18.8b, v19.8b + zip1 v18.8b, v20.8b, v21.8b + zip1 v24.8b, v22.8b, v23.8b + + zip1 v16.4h, v0.4h, v2.4h + zip1 v17.4h, v18.4h, v24.4h + + sub v16.8b, v16.8b, v5.8b + sub v17.8b, v17.8b, v5.8b + + b.eq 22f + + .align LOOP_ALIGN +2: + ldr h18, [\src] + ldr h21, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + + sub v18.8b, v18.8b, v5.8b + sub v21.8b, v21.8b, v5.8b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + tbl v16.16b, {v19.16b, v20.16b}, v6.16b + tbl v17.16b, {v20.16b, v21.16b}, v28.16b + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + + subs \h, \h, #2 + fmov x8, d0 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + b.gt 2b + + .align JUMP_ALIGN +22: + ldr h18, [\src] + + movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT + movi v1.4s, #32, lsl 8 + + sub v18.8b, v18.8b, v5.8b + + tbl v19.16b, {v16.16b, v17.16b}, v6.16b + tbl v20.16b, {v17.16b, v18.16b}, v28.16b + + sdot v0.4s, v16.16b, v7.4b[0] + sdot v0.4s, v17.16b, v7.4b[1] + + sdot v1.4s, v19.16b, v7.4b[0] + sdot v1.4s, v20.16b, v7.4b[1] + + uzp1 v0.8h, v0.8h, v1.8h + sqrshrun v0.8b, v0.8h, #6 + + fmov x8, d0 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + ret +.endif + + .align JUMP_ALIGN +L(\type\()_8tap_h_hv_\isa): + madd \mx, \mx, w11, w9 + madd w14, \my, w11, w10 // for HV + ldr q28, L(h_tbl_neon_dotprod) + mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding + sub \src, \src, #4 // src - 4 + dup v27.4s, w13 + ubfx w9, \mx, #7, #7 + and \mx, \mx, #0x7F + ubfx w11, w14, #7, #7 // for HV + and w14, w14, #0x7F // for HV + cmp \w, #4 + csel \mx, \mx, w9, le + add \xmx, x12, \xmx, lsl #3 // subpel H filter address + movi v24.16b, #128 + cbz \my, L(\type\()_8tap_h_\isa) + + // HV cases + cmp \h, #4 + csel w14, w14, w11, le + sub \src, \src, \s_strd, lsl #1 // src - src_stride * 2 - 4 + add \xmy, x12, x14, lsl #3 // subpel V filter address + mov x15, x30 + ldr d7, [\xmy] +.ifc \type, put + ldr q25, L(hv_tbl_neon_dotprod) +.endif + sxtl v7.8h, v7.8b + cmp w10, SHARP1 + b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 + + // HV 8-tap cases + sub \src, \src, \s_strd // src - src_stride * 3 - 4 + cmp 
\w, #4 + b.eq 40f +.ifc \type, put + b.lt 20f +.endif + + // .align JUMP_ALIGN // fallthrough +80: // HV8 - 8xN+ + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] +.ifc \type, prep + add \wd_strd, \w, \w +.endif + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + + .align LOOP_ALIGN +8: + ldr q23, [\lsrc] + add \lsrc, \lsrc, \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smull2 v1.4s, v16.8h, v7.h[0] + mov v16.16b, v17.16b + + sub v23.16b, v23.16b, v24.16b + + mov v5.16b, v27.16b + mov v6.16b, v27.16b + + smlal v0.4s, v17.4h, v7.h[1] + smlal2 v1.4s, v17.8h, v7.h[1] + mov v17.16b, v18.16b + + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b + tbl v4.16b, {v23.16b}, v30.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal2 v1.4s, v18.8h, v7.h[2] + mov v18.16b, v19.16b + + sdot v5.4s, v2.16b, v26.4b[0] + sdot v6.4s, v3.16b, v26.4b[0] + + smlal v0.4s, v19.4h, v7.h[3] + smlal2 v1.4s, v19.8h, v7.h[3] + mov v19.16b, v20.16b + + sdot v5.4s, v3.16b, v26.4b[1] + sdot v6.4s, v4.16b, v26.4b[1] + + smlal v0.4s, v20.4h, v7.h[4] + smlal2 v1.4s, v20.8h, v7.h[4] + mov v20.16b, v21.16b + + smlal v0.4s, v21.4h, v7.h[5] + smlal2 v1.4s, v21.8h, v7.h[5] +.ifc \type, prep + uzp1 v23.8h, v5.8h, v6.8h +.endif + mov v21.16b, v22.16b + + smlal v0.4s, v22.4h, v7.h[6] + smlal2 v1.4s, v22.8h, v7.h[6] +.ifc \type, prep + sshr v22.8h, v23.8h, #2 + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + subs w8, w8, #1 + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #16 +.else + shrn v22.4h, v5.4s, #2 + shrn2 v22.8h, v6.4s, #2 + smlal v0.4s, v22.4h, v7.h[7] + smlal2 v1.4s, v22.8h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + subs w8, w8, #1 + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 +.endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + .align JUMP_ALIGN +40: // HV8 - 4xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + + .align LOOP_ALIGN +4: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smlal v0.4s, v17.4h, v7.h[1] + mov v16.16b, v17.16b + mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal v0.4s, v19.4h, v7.h[3] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + + smlal v0.4s, v20.4h, v7.h[4] + smlal v0.4s, v21.4h, v7.h[5] + + sdot v5.4s, v2.16b, v26.4b[0] + mov v20.16b, v21.16b + mov v21.16b, v22.16b +.ifc \type, put + subs \h, \h, #1 +.endif + smlal v0.4s, v22.4h, v7.h[6] + shrn v22.4h, v5.4s, #2 + + smlal v0.4s, v22.4h, v7.h[7] +.ifc \type, prep + rshrn v0.4h, v0.4s, #6 + str d0, [\dst], #8 + subs \h, \h, #1 +.else + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, 
v0.8h, #2 + str s0, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 4b + ret x15 + +.ifc \type, put + .align JUMP_ALIGN +20: // HV8 - 2xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v21.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + + .align LOOP_ALIGN +2: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[0] + smlal v0.4s, v17.4h, v7.h[1] + mov v16.16b, v17.16b + mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b + + smlal v0.4s, v18.4h, v7.h[2] + smlal v0.4s, v19.4h, v7.h[3] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + + smlal v0.4s, v20.4h, v7.h[4] + smlal v0.4s, v21.4h, v7.h[5] + + sdot v5.4s, v2.16b, v26.4b[0] + mov v20.16b, v21.16b + mov v21.16b, v22.16b + + subs \h, \h, #1 + smlal v0.4s, v22.4h, v7.h[6] + shrn v22.4h, v5.4s, #2 + + smlal v0.4s, v22.4h, v7.h[7] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + + str h0, [\dst] + add \dst, \dst, \d_strd + b.gt 2b + ret x15 +.endif + + .align JUMP_ALIGN +L(\type\()_6tap_hv_\isa): + cmp \w, #4 + b.eq 40f +.ifc \type, put + b.lt 20f +.endif + + // .align JUMP_ALIGN // fallthrough +80: // HV6 - 8xN+ + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] +.ifc \type, prep + add \wd_strd, \w, \w +.endif + + .align LOOP_ALIGN +81: + mov \lsrc, \src + mov \ldst, \dst + mov w8, \h + + bl L(\type\()_hv_filter8_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter8_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +8: + ldr q23, [\xmy] + add \xmy, \xmy, \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smull2 v1.4s, v16.8h, v7.h[1] + sub v23.16b, v23.16b, v24.16b + mov v16.16b, v17.16b + + mov v5.16b, v27.16b + mov v6.16b, v27.16b + + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b + + smlal v0.4s, v17.4h, v7.h[2] + smlal2 v1.4s, v17.8h, v7.h[2] + tbl v4.16b, {v23.16b}, v30.16b + mov v17.16b, v18.16b + + sdot v5.4s, v2.16b, v26.4b[0] + sdot v6.4s, v3.16b, v26.4b[0] + smlal v0.4s, v18.4h, v7.h[3] + smlal2 v1.4s, v18.8h, v7.h[3] + mov v18.16b, v19.16b + + sdot v5.4s, v3.16b, v26.4b[1] + sdot v6.4s, v4.16b, v26.4b[1] + smlal v0.4s, v19.4h, v7.h[4] + smlal2 v1.4s, v19.8h, v7.h[4] + mov v19.16b, v20.16b + uzp1 v23.8h, v5.8h, v6.8h + + smlal v0.4s, v20.4h, v7.h[5] + smlal2 v1.4s, v20.8h, v7.h[5] + sshr v20.8h, v23.8h, #2 +.ifc \type, prep + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + rshrn v0.4h, v0.4s, #6 + rshrn2 v0.8h, v1.4s, #6 + st1 {v0.8h}, [\ldst], \d_strd + subs w8, w8, #1 + b.gt 8b + add \dst, \dst, #16 +.else + subs w8, w8, #1 + smlal v0.4s, v20.4h, v7.h[6] + smlal2 v1.4s, v20.8h, v7.h[6] + tbl v0.16b, {v0.16b, v1.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + st1 {v0.8b}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #8 +.endif + add \src, \src, #8 + subs \w, \w, #8 + b.gt 81b + ret x15 + + .align FUNC_ALIGN +L(\type\()_hv_filter8_\isa): + ldr q4, [\lsrc] + add \lsrc, \lsrc, \s_strd + sub v4.16b, v4.16b, v24.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + tbl v2.16b, {v4.16b}, v28.16b + tbl v3.16b, 
{v4.16b}, v29.16b + tbl v4.16b, {v4.16b}, v30.16b + sdot v22.4s, v2.16b, v26.4b[0] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v23.4s, v4.16b, v26.4b[1] + shrn v22.4h, v22.4s, #2 + shrn2 v22.8h, v23.4s, #2 + ret + + .align FUNC_ALIGN +L(\type\()_hv_filter4_\isa): + mov v22.16b, v27.16b + ld1 {v4.8b}, [\src], \s_strd + sub v4.16b, v4.16b, v24.16b + tbl v2.16b, {v4.16b}, v28.16b + sdot v22.4s, v2.16b, v26.4b[0] + shrn v22.4h, v22.4s, #2 + ret + + .align JUMP_ALIGN +40: // HV6 - 4xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +4: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smlal v0.4s, v17.4h, v7.h[2] + sub v4.16b, v4.16b, v24.16b + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal v0.4s, v19.4h, v7.h[4] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + sdot v5.4s, v2.16b, v26.4b[0] + + smlal v0.4s, v20.4h, v7.h[5] + shrn v20.4h, v5.4s, #2 +.ifc \type, prep + smlal v0.4s, v20.4h, v7.h[6] + rshrn v0.4h, v0.4s, #6 + str d0, [\dst], #8 + subs \h, \h, #1 +.else + subs \h, \h, #1 + smlal v0.4s, v20.4h, v7.h[6] + tbl v0.16b, {v0.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + str s0, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 4b + ret x15 + +.ifc \type, put + .align JUMP_ALIGN +20: // HV6 - 2xN + ldr s26, [\xmx, #2] + add \src, \src, #2 + + bl L(\type\()_hv_filter4_\isa) + mov v16.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v17.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v18.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v19.16b, v22.16b + bl L(\type\()_hv_filter4_\isa) + mov v20.16b, v22.16b + + .align LOOP_ALIGN +2: + ld1 {v4.8b}, [\src], \s_strd + + smull v0.4s, v16.4h, v7.h[1] + smlal v0.4s, v17.4h, v7.h[2] + sub v4.16b, v4.16b, v24.16b + mov v16.16b, v17.16b + mov v17.16b, v18.16b + + smlal v0.4s, v18.4h, v7.h[3] + smlal v0.4s, v19.4h, v7.h[4] + tbl v2.16b, {v4.16b}, v28.16b + mov v5.16b, v27.16b + + mov v18.16b, v19.16b + mov v19.16b, v20.16b + sdot v5.4s, v2.16b, v26.4b[0] + + smlal v0.4s, v20.4h, v7.h[5] + shrn v20.4h, v5.4s, #2 + + subs \h, \h, #1 + smlal v0.4s, v20.4h, v7.h[6] + + tbl v0.16b, {v0.16b}, v25.16b + sqrshrun v0.8b, v0.8h, #2 + + str h0, [\dst] + add \dst, \dst, \d_strd + b.gt 2b + ret x15 +.endif + + .align JUMP_ALIGN +L(\type\()_8tap_h_\isa): + adr x9, L(\type\()_8tap_h_\isa\()_tbl) + ldrh w8, [x9, x8, lsl #1] +.ifc \type, put + mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT + dup v27.4s, w10 +.endif + sub x9, x9, x8 + br x9 + +.ifc \type, put + .align JUMP_ALIGN +20: // H - 2xN + AARCH64_VALID_JUMP_TARGET + add \src, \src, #2 + ldr s6, [\xmx, #2] + + .align LOOP_ALIGN +2: + ldr d0, [\src] + ldr d1, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.8b, v0.8b, v24.8b + sub v1.8b, v1.8b, v24.8b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + + tbl v2.16b, {v0.16b}, v28.16b + tbl v3.16b, {v1.16b}, v28.16b + + sdot v4.4s, v2.16b, v6.4b[0] + sdot v5.4s, v3.16b, v6.4b[0] + + uzp1 v4.8h, v4.8h, v5.8h + sqshrun v4.8b, v4.8h, #6 + + subs \h, \h, #2 + fmov x8, d4 + lsr x9, x8, #32 + strh w8, [\dst] + strh w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 + b.gt 2b + ret + +.endif + + .align 
JUMP_ALIGN +40: // H - 4xN + AARCH64_VALID_JUMP_TARGET + add \src, \src, #2 + ldr s26, [\xmx, #2] + + .align LOOP_ALIGN +4: + ldr d0, [\src] + ldr d1, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.8b, v0.8b, v24.8b + sub v1.8b, v1.8b, v24.8b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + + tbl v2.16b, {v0.16b}, v28.16b + tbl v3.16b, {v1.16b}, v28.16b + + sdot v4.4s, v2.16b, v26.4b[0] + sdot v5.4s, v3.16b, v26.4b[0] +.ifc \type, prep + subs \h, \h, #2 + shrn v4.4h, v4.4s, #2 + shrn2 v4.8h, v5.4s, #2 + str q4, [\dst], #16 +.else + uzp1 v4.8h, v4.8h, v5.8h + sqshrun v4.8b, v4.8h, #6 + subs \h, \h, #2 + fmov x8, d4 + lsr x9, x8, #32 + str w8, [\dst] + str w9, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 4b + ret + + .align JUMP_ALIGN +80: // H - 8xN + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr d26, [\xmx] + + .align LOOP_ALIGN +8: + ldr q0, [\src] + ldr q16, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + + sub v0.16b, v0.16b, v24.16b + sub v16.16b, v16.16b, v24.16b + + mov v4.16b, v27.16b + mov v5.16b, v27.16b + mov v20.16b, v27.16b + mov v21.16b, v27.16b + + tbl v1.16b, {v0.16b}, v28.16b + tbl v2.16b, {v0.16b}, v29.16b + tbl v3.16b, {v0.16b}, v30.16b + tbl v17.16b, {v16.16b}, v28.16b + tbl v18.16b, {v16.16b}, v29.16b + tbl v19.16b, {v16.16b}, v30.16b + + sdot v4.4s, v1.16b, v26.4b[0] + sdot v5.4s, v2.16b, v26.4b[0] + sdot v20.4s, v17.16b, v26.4b[0] + sdot v21.4s, v18.16b, v26.4b[0] + sdot v4.4s, v2.16b, v26.4b[1] + sdot v5.4s, v3.16b, v26.4b[1] + sdot v20.4s, v18.16b, v26.4b[1] + sdot v21.4s, v19.16b, v26.4b[1] + + uzp1 v4.8h, v4.8h, v5.8h + uzp1 v20.8h, v20.8h, v21.8h +.ifc \type, prep + sshr v4.8h, v4.8h, #2 + sshr v20.8h, v20.8h, #2 + subs \h, \h, #2 + stp q4, q20, [\dst], #32 +.else + sqshrun v4.8b, v4.8h, #6 + sqshrun v20.8b, v20.8h, #6 + subs \h, \h, #2 + str d4, [\dst] + str d20, [\dst, \d_strd] + add \dst, \dst, \d_strd, lsl #1 +.endif + b.gt 8b + ret + + .align JUMP_ALIGN +160: // H - 16xN + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr q31, L(h_tbl_neon_dotprod) + 48 + ldr d26, [\xmx] + + .align LOOP_ALIGN +16: + ldp q16, q17, [\src] + add \src, \src, \s_strd + + sub v16.16b, v16.16b, v24.16b + sub v17.16b, v17.16b, v24.16b + + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + + tbl v0.16b, {v16.16b}, v28.16b + tbl v1.16b, {v16.16b}, v29.16b + tbl v2.16b, {v16.16b}, v30.16b + tbl v3.16b, {v16.16b, v17.16b}, v31.16b + tbl v4.16b, {v17.16b}, v28.16b + + sdot v6.4s, v0.16b, v26.4b[0] + sdot v7.4s, v1.16b, v26.4b[0] + sdot v22.4s, v2.16b, v26.4b[0] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v6.4s, v1.16b, v26.4b[1] + sdot v7.4s, v2.16b, v26.4b[1] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v4.16b, v26.4b[1] + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h +.ifc \type, prep + sshr v6.8h, v6.8h, #2 + sshr v22.8h, v22.8h, #2 + subs \h, \h, #1 + stp q6, q22, [\dst], #32 +.else + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs \h, \h, #1 + str q6, [\dst] + add \dst, \dst, \d_strd +.endif + b.gt 16b + ret + + .align JUMP_ALIGN +320: // H - 32xN+ +640: +1280: + AARCH64_VALID_JUMP_TARGET + ldr q29, L(h_tbl_neon_dotprod) + 16 + ldr q30, L(h_tbl_neon_dotprod) + 32 + ldr q31, L(h_tbl_neon_dotprod) + 48 + ldr d26, [\xmx] +.ifc \type, put + sub \d_strd, \d_strd, \w, uxtw +.endif + sub \s_strd, \s_strd, \w, uxtw + mov w8, \w + + .align LOOP_ALIGN +32: + ldp q16, q17, 
[\src], #16 + + sub v16.16b, v16.16b, v24.16b + sub v17.16b, v17.16b, v24.16b + + mov v6.16b, v27.16b + mov v7.16b, v27.16b + mov v22.16b, v27.16b + mov v23.16b, v27.16b + + tbl v0.16b, {v16.16b}, v28.16b + tbl v1.16b, {v16.16b}, v29.16b + tbl v2.16b, {v16.16b}, v30.16b + tbl v3.16b, {v16.16b, v17.16b}, v31.16b + tbl v4.16b, {v17.16b}, v28.16b + + sdot v6.4s, v0.16b, v26.4b[0] + sdot v7.4s, v1.16b, v26.4b[0] + sdot v22.4s, v2.16b, v26.4b[0] + sdot v23.4s, v3.16b, v26.4b[0] + sdot v6.4s, v1.16b, v26.4b[1] + sdot v7.4s, v2.16b, v26.4b[1] + sdot v22.4s, v3.16b, v26.4b[1] + sdot v23.4s, v4.16b, v26.4b[1] + + uzp1 v6.8h, v6.8h, v7.8h + uzp1 v22.8h, v22.8h, v23.8h +.ifc \type, prep + sshr v6.8h, v6.8h, #2 + sshr v22.8h, v22.8h, #2 + subs w8, w8, #16 + stp q6, q22, [\dst], #32 +.else + sqshrun v6.8b, v6.8h, #6 + sqshrun2 v6.16b, v22.8h, #6 + subs w8, w8, #16 + str q6, [\dst], #16 +.endif + b.gt 32b + + add \src, \src, \s_strd +.ifc \type, put + add \dst, \dst, \d_strd +.endif + mov w8, \w + subs \h, \h, #1 + b.gt 32b + ret + +L(\type\()_8tap_h_\isa\()_tbl): + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b) + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b) +.ifc \type, put + .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b) +.endif +endfunc +.endm + +// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) +// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) +filter_8tap_fn prep, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 + +// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) +// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) +filter_8tap_fn put, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 + +DISABLE_DOTPROD +#endif // HAVE_DOTPROD diff --git a/src/arm/mc.h b/src/arm/mc.h index 06cd533a9..7e57fd37c 100644 --- a/src/arm/mc.h +++ b/src/arm/mc.h @@ -30,26 +30,40 @@ #include "src/mc.h" #include "src/cpu.h" -decl_mc_fn(BF(dav1d_put_8tap_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon)); -decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon)); -decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon)); -decl_mc_fn(BF(dav1d_put_bilin, neon)); +#define decl_8tap_gen(decl_name, fn_name, opt) \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_sharp, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_sharp, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_regular, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_smooth, opt)); \ + decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp, opt)) + +#define decl_8tap_fns(opt) \ + decl_8tap_gen(mc, put, opt); \ + decl_8tap_gen(mct, prep, opt) + +#define init_8tap_gen(name, opt) \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, 
opt); \ + init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, opt); \ + init_##name##_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, opt) + +#define init_8tap_fns(opt) \ + init_8tap_gen(mc, opt); \ + init_8tap_gen(mct, opt) + +decl_8tap_fns(neon); +decl_8tap_fns(neon_dotprod); -decl_mct_fn(BF(dav1d_prep_8tap_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon)); -decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon)); +decl_mc_fn(BF(dav1d_put_bilin, neon)); decl_mct_fn(BF(dav1d_prep_bilin, neon)); decl_avg_fn(BF(dav1d_avg, neon)); @@ -77,27 +91,10 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; - init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); - init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); - init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); - - init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, neon); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, neon); - init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, neon); - init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); + init_8tap_fns(neon); + + init_mc_fn (FILTER_2D_BILINEAR, bilin, neon); + init_mct_fn(FILTER_2D_BILINEAR, bilin, neon); c->avg = BF(dav1d_avg, neon); c->w_avg = BF(dav1d_w_avg, neon); @@ -111,4 +108,12 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { c->warp8x8 = BF(dav1d_warp_affine_8x8, neon); c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); + +#if ARCH_AARCH64 +#if HAVE_DOTPROD && BITDEPTH == 8 + if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return; + + init_8tap_fns(neon_dotprod); +#endif // HAVE_DOTPROD && BITDEPTH == 8 +#endif // ARCH_AARCH64 } diff --git a/src/meson.build b/src/meson.build index 56daf005c..4a4747c73 100644 --- a/src/meson.build +++ b/src/meson.build @@ -106,6 +106,7 @@ if is_asm_enabled 'arm/64/loopfilter.S', 'arm/64/looprestoration.S', 'arm/64/mc.S', + 'arm/64/mc_dotprod.S', ) endif 
From c7f38b303e7dc707ffcc6ce30594598f83f03254 Mon Sep 17 00:00:00 2001
From: Martin Storsjö
Date: Fri, 19 Apr 2024 00:07:44 +0300
Subject: [PATCH 03/22] aarch64: Avoid unaligned jump tables

Manually add a padding 0 entry to make the odd number of .hword entries
align with the instruction size.

This fixes assembling with GAS with the --gdwarf2 option, which previously
produced the error message "unaligned opcodes detected in executable
segment". The message is slightly misleading: the error is printed even if
no opcodes actually are misaligned, because the jump table is the last
thing within the .text section.

The issue can be reproduced with an input as small as this, assembled
with "as --gdwarf2 -c test.s".

.text
nop
.hword 0

See a6228f47f0eebcdfebb1753a786e3e1654b51ea4 for earlier cases of the same
error - although in those cases, we actually did have more code and labels
following the unaligned jump tables.

This error is present with binutils 2.39 and earlier; in binutils 2.40,
this input is no longer considered an error. It was fixed in
https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=6f6f5b0adc9efd103c434fd316e8c880a259775d.
---
 src/arm/64/mc_dotprod.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S
index fcf04ee4d..051a201dd 100644
--- a/src/arm/64/mc_dotprod.S
+++ b/src/arm/64/mc_dotprod.S
@@ -1397,6 +1397,7 @@ L(\type\()_8tap_h_\isa\()_tbl):
         .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b)
 .ifc \type, put
         .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b)
+        .hword 0
 .endif
 endfunc
 .endm

From 08417d57a400868aa11cb70bae3693b0c60ed992 Mon Sep 17 00:00:00 2001
From: Arpad Panyik
Date: Tue, 23 Apr 2024 16:50:35 +0200
Subject: [PATCH 04/22] AArch64: Add \dot parameter to filter_8tap_fn macro

Add a \dot parameter to the filter_8tap_fn macro in preparation for
extending it with an i8mm code path. This patch also contains string
fixes and some instruction reordering, along with some register renaming,
to make the code more uniform. These changes don't affect performance but
simplify the code a bit.
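As a sketch of the mechanism (not from the patch itself; the dot_acc macro
and its operand names below are made up for illustration): GAS substitutes
macro parameters textually, so a parameter can carry the instruction
mnemonic itself, letting the same macro body later emit the i8mm dot
product (assumed to be usdot) instead of sdot:

.macro dot_acc dot, acc, src, taps
        \dot    \acc\().4s, \src\().16b, \taps\().4b[0]
.endm

        dot_acc sdot, v0, v16, v7   // expands to: sdot v0.4s, v16.16b, v7.4b[0]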
--- src/arm/64/mc_dotprod.S | 239 ++++++++++++++++++++-------------------- 1 file changed, 119 insertions(+), 120 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 051a201dd..c6040145b 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -86,7 +86,7 @@ function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN endfunc .endm -.macro filter_8tap_fn type, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd +.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd make_8tap_fn \type, sharp, SHARP1, SHARP1, \isa make_8tap_fn \type, sharp_smooth, SHARP1, SMOOTH1, \isa make_8tap_fn \type, sharp_regular, SHARP1, REGULAR1, \isa @@ -112,12 +112,10 @@ function \type\()_8tap_\isa, align=FUNC_ALIGN .align JUMP_ALIGN L(\type\()_8tap_v_\isa): madd \my, \my, w11, w10 -.ifc \type, prep - mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding -.endif - sub \src, \src, \s_strd ldr q6, L(v_tbl_neon_dotprod) + sub \src, \src, \s_strd .ifc \type, prep + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 .endif ubfx w11, \my, #7, #7 @@ -125,9 +123,9 @@ L(\type\()_8tap_v_\isa): ldr q28, L(v_tbl_neon_dotprod) + 16 cmp \h, #4 csel \my, \my, w11, le - sub \src, \src, \s_strd, lsl #1 // src - src_stride * 3 - ldr q29, L(v_tbl_neon_dotprod) + 32 + sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 add \xmy, x12, \xmy, lsl #3 // subpel V filter address + ldr q29, L(v_tbl_neon_dotprod) + 32 movi v5.16b, #128 ldr d7, [\xmy] cmp \w, #8 @@ -211,20 +209,20 @@ L(\type\()_8tap_v_\isa): sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v3.4s, v25.16b, v7.4b[0] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v3.4s, v25.16b, v7.4b[0] tbl v16.16b, {v16.16b, v17.16b}, v6.16b tbl v19.16b, {v19.16b, v20.16b}, v6.16b tbl v22.16b, {v22.16b, v23.16b}, v6.16b tbl v25.16b, {v25.16b, v26.16b}, v6.16b - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v20.16b, v7.4b[1] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v26.16b, v7.4b[1] tbl v17.16b, {v17.16b, v18.16b}, v28.16b tbl v20.16b, {v20.16b, v21.16b}, v29.16b @@ -238,7 +236,7 @@ L(\type\()_8tap_v_\isa): sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 st1 {v0.8h, v1.8h}, [\ldst], \d_strd -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun2 v0.16b, v2.8h, #6 st1 {v0.16b}, [\ldst], \d_strd @@ -256,15 +254,15 @@ L(\type\()_8tap_v_\isa): movi v2.4s, #32, lsl 8 movi v3.4s, #32, lsl 8 .endif - sdot v0.4s, v16.16b, v7.4b[0] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v3.4s, v25.16b, v7.4b[0] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v3.4s, v25.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v20.16b, v7.4b[1] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v26.16b, v7.4b[1] subs \w, \w, #16 uzp1 v0.8h, v0.8h, v1.8h @@ -274,7 +272,7 @@ L(\type\()_8tap_v_\isa): sshr v1.8h, v2.8h, #2 stp q0, q1, [\ldst] add \dst, \dst, #32 -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun2 v0.16b, v2.8h, #6 str q0, [\ldst] @@ -318,7 +316,6 @@ L(\type\()_8tap_v_\isa): 
.ifc \type, put b.eq 82f .endif - .align LOOP_ALIGN 8: ldr d21, [\src] @@ -345,20 +342,20 @@ L(\type\()_8tap_v_\isa): tbl v23.16b, {v17.16b, v18.16b}, v28.16b tbl v26.16b, {v20.16b, v21.16b}, v29.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] tbl v16.16b, {v22.16b, v23.16b}, v6.16b tbl v19.16b, {v25.16b, v26.16b}, v6.16b tbl v17.16b, {v23.16b, v24.16b}, v28.16b tbl v20.16b, {v26.16b, v27.16b}, v29.16b - sdot v2.4s, v22.16b, v7.4b[0] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v25.16b, v7.4b[0] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v25.16b, v7.4b[0] + \dot v3.4s, v26.16b, v7.4b[1] subs \h, \h, #2 uzp1 v0.8h, v0.8h, v1.8h @@ -367,7 +364,7 @@ L(\type\()_8tap_v_\isa): sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 stp q0, q1, [\dst], #32 -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun v1.8b, v2.8h, #6 str d0, [\dst] @@ -399,15 +396,15 @@ L(\type\()_8tap_v_\isa): tbl v23.16b, {v17.16b, v18.16b}, v28.16b tbl v26.16b, {v20.16b, v21.16b}, v29.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] - sdot v2.4s, v22.16b, v7.4b[0] - sdot v2.4s, v23.16b, v7.4b[1] - sdot v3.4s, v25.16b, v7.4b[0] - sdot v3.4s, v26.16b, v7.4b[1] + \dot v2.4s, v22.16b, v7.4b[0] + \dot v2.4s, v23.16b, v7.4b[1] + \dot v3.4s, v25.16b, v7.4b[0] + \dot v3.4s, v26.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h @@ -415,7 +412,7 @@ L(\type\()_8tap_v_\isa): sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 stp q0, q1, [\dst] -.else +.else // put sqrshrun v0.8b, v0.8h, #6 sqrshrun v1.8b, v2.8h, #6 str d0, [\dst] @@ -457,7 +454,6 @@ L(\type\()_8tap_v_\isa): .ifc \type, put b.eq 42f .endif - .align LOOP_ALIGN 4: ldr s18, [\src] @@ -476,14 +472,14 @@ L(\type\()_8tap_v_\isa): tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] tbl v16.16b, {v19.16b, v20.16b}, v6.16b tbl v17.16b, {v20.16b, v21.16b}, v28.16b - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep subs \h, \h, #2 shrn v0.4h, v0.4s, #2 @@ -517,16 +513,15 @@ L(\type\()_8tap_v_\isa): tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 str q0, [\dst] - ret .else uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -534,8 +529,10 @@ L(\type\()_8tap_v_\isa): lsr x9, x8, #32 str w8, [\dst] str w9, [\dst, \d_strd] +.endif ret +.ifc \type, put .align JUMP_ALIGN 20: // V - 2xN ldr h16, [\src] @@ -581,14 +578,14 @@ L(\type\()_8tap_v_\isa): tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, 
v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] tbl v16.16b, {v19.16b, v20.16b}, v6.16b tbl v17.16b, {v20.16b, v21.16b}, v28.16b - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -613,11 +610,11 @@ L(\type\()_8tap_v_\isa): tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b - sdot v0.4s, v16.16b, v7.4b[0] - sdot v0.4s, v17.16b, v7.4b[1] + \dot v0.4s, v16.16b, v7.4b[0] + \dot v0.4s, v17.16b, v7.4b[1] - sdot v1.4s, v19.16b, v7.4b[0] - sdot v1.4s, v20.16b, v7.4b[1] + \dot v1.4s, v19.16b, v7.4b[0] + \dot v1.4s, v20.16b, v7.4b[1] uzp1 v0.8h, v0.8h, v1.8h sqrshrun v0.8b, v0.8h, #6 @@ -635,8 +632,8 @@ L(\type\()_8tap_h_hv_\isa): madd w14, \my, w11, w10 // for HV ldr q28, L(h_tbl_neon_dotprod) mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding + dup v27.4s, w13 // put H overrides this sub \src, \src, #4 // src - 4 - dup v27.4s, w13 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7F ubfx w11, w14, #7, #7 // for HV @@ -650,7 +647,7 @@ L(\type\()_8tap_h_hv_\isa): // HV cases cmp \h, #4 csel w14, w14, w11, le - sub \src, \src, \s_strd, lsl #1 // src - src_stride * 2 - 4 + sub \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 4 add \xmy, x12, x14, lsl #3 // subpel V filter address mov x15, x30 ldr d7, [\xmy] @@ -662,7 +659,7 @@ L(\type\()_8tap_h_hv_\isa): b.ne L(\type\()_6tap_hv_\isa) // vertical != SHARP1 // HV 8-tap cases - sub \src, \src, \s_strd // src - src_stride * 3 - 4 + sub \src, \src, \s_strd // src - s_strd * 3 - 4 cmp \w, #4 b.eq 40f .ifc \type, put @@ -677,7 +674,6 @@ L(\type\()_8tap_h_hv_\isa): .ifc \type, prep add \wd_strd, \w, \w .endif - .align LOOP_ALIGN 81: mov \lsrc, \src @@ -724,15 +720,15 @@ L(\type\()_8tap_h_hv_\isa): smlal2 v1.4s, v18.8h, v7.h[2] mov v18.16b, v19.16b - sdot v5.4s, v2.16b, v26.4b[0] - sdot v6.4s, v3.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v6.4s, v3.16b, v26.4b[0] smlal v0.4s, v19.4h, v7.h[3] smlal2 v1.4s, v19.8h, v7.h[3] mov v19.16b, v20.16b - sdot v5.4s, v3.16b, v26.4b[1] - sdot v6.4s, v4.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v6.4s, v4.16b, v26.4b[1] smlal v0.4s, v20.4h, v7.h[4] smlal2 v1.4s, v20.8h, v7.h[4] @@ -757,7 +753,7 @@ L(\type\()_8tap_h_hv_\isa): st1 {v0.8h}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #16 -.else +.else // put shrn v22.4h, v5.4s, #2 shrn2 v22.8h, v6.4s, #2 smlal v0.4s, v22.4h, v7.h[7] @@ -801,6 +797,7 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b smlal v0.4s, v18.4h, v7.h[2] @@ -814,7 +811,7 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v20.4h, v7.h[4] smlal v0.4s, v21.4h, v7.h[5] - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b .ifc \type, put @@ -865,6 +862,7 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b + sub v4.16b, v4.16b, v24.16b smlal v0.4s, v18.4h, v7.h[2] @@ -878,7 +876,7 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v20.4h, v7.h[4] smlal v0.4s, v21.4h, v7.h[5] - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b @@ -951,14 +949,16 @@ L(\type\()_6tap_hv_\isa): tbl v4.16b, {v23.16b}, v30.16b mov v17.16b, v18.16b - sdot v5.4s, v2.16b, v26.4b[0] - sdot v6.4s, v3.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v6.4s, v3.16b, v26.4b[0] + smlal v0.4s, v18.4h, v7.h[3] smlal2 v1.4s, v18.8h, v7.h[3] mov v18.16b, 
v19.16b - sdot v5.4s, v3.16b, v26.4b[1] - sdot v6.4s, v4.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v6.4s, v4.16b, v26.4b[1] + smlal v0.4s, v19.4h, v7.h[4] smlal2 v1.4s, v19.8h, v7.h[4] mov v19.16b, v20.16b @@ -1001,10 +1001,10 @@ L(\type\()_hv_filter8_\isa): tbl v2.16b, {v4.16b}, v28.16b tbl v3.16b, {v4.16b}, v29.16b tbl v4.16b, {v4.16b}, v30.16b - sdot v22.4s, v2.16b, v26.4b[0] - sdot v22.4s, v3.16b, v26.4b[1] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v23.4s, v4.16b, v26.4b[1] + \dot v22.4s, v2.16b, v26.4b[0] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v23.4s, v4.16b, v26.4b[1] shrn v22.4h, v22.4s, #2 shrn2 v22.8h, v23.4s, #2 ret @@ -1015,7 +1015,7 @@ L(\type\()_hv_filter4_\isa): ld1 {v4.8b}, [\src], \s_strd sub v4.16b, v4.16b, v24.16b tbl v2.16b, {v4.16b}, v28.16b - sdot v22.4s, v2.16b, v26.4b[0] + \dot v22.4s, v2.16b, v26.4b[0] shrn v22.4h, v22.4s, #2 ret @@ -1052,7 +1052,7 @@ L(\type\()_hv_filter4_\isa): mov v18.16b, v19.16b mov v19.16b, v20.16b - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] shrn v20.4h, v5.4s, #2 @@ -1106,7 +1106,7 @@ L(\type\()_hv_filter4_\isa): mov v18.16b, v19.16b mov v19.16b, v20.16b - sdot v5.4s, v2.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] shrn v20.4h, v5.4s, #2 @@ -1139,7 +1139,7 @@ L(\type\()_8tap_h_\isa): 20: // H - 2xN AARCH64_VALID_JUMP_TARGET add \src, \src, #2 - ldr s6, [\xmx, #2] + ldr s26, [\xmx, #2] .align LOOP_ALIGN 2: @@ -1156,8 +1156,8 @@ L(\type\()_8tap_h_\isa): tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b - sdot v4.4s, v2.16b, v6.4b[0] - sdot v5.4s, v3.16b, v6.4b[0] + \dot v4.4s, v2.16b, v26.4b[0] + \dot v5.4s, v3.16b, v26.4b[0] uzp1 v4.8h, v4.8h, v5.8h sqshrun v4.8b, v4.8h, #6 @@ -1170,7 +1170,6 @@ L(\type\()_8tap_h_\isa): add \dst, \dst, \d_strd, lsl #1 b.gt 2b ret - .endif .align JUMP_ALIGN @@ -1194,14 +1193,14 @@ L(\type\()_8tap_h_\isa): tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b - sdot v4.4s, v2.16b, v26.4b[0] - sdot v5.4s, v3.16b, v26.4b[0] + \dot v4.4s, v2.16b, v26.4b[0] + \dot v5.4s, v3.16b, v26.4b[0] .ifc \type, prep subs \h, \h, #2 shrn v4.4h, v4.4s, #2 shrn2 v4.8h, v5.4s, #2 str q4, [\dst], #16 -.else +.else // put uzp1 v4.8h, v4.8h, v5.8h sqshrun v4.8b, v4.8h, #6 subs \h, \h, #2 @@ -1242,14 +1241,14 @@ L(\type\()_8tap_h_\isa): tbl v18.16b, {v16.16b}, v29.16b tbl v19.16b, {v16.16b}, v30.16b - sdot v4.4s, v1.16b, v26.4b[0] - sdot v5.4s, v2.16b, v26.4b[0] - sdot v20.4s, v17.16b, v26.4b[0] - sdot v21.4s, v18.16b, v26.4b[0] - sdot v4.4s, v2.16b, v26.4b[1] - sdot v5.4s, v3.16b, v26.4b[1] - sdot v20.4s, v18.16b, v26.4b[1] - sdot v21.4s, v19.16b, v26.4b[1] + \dot v4.4s, v1.16b, v26.4b[0] + \dot v5.4s, v2.16b, v26.4b[0] + \dot v20.4s, v17.16b, v26.4b[0] + \dot v21.4s, v18.16b, v26.4b[0] + \dot v4.4s, v2.16b, v26.4b[1] + \dot v5.4s, v3.16b, v26.4b[1] + \dot v20.4s, v18.16b, v26.4b[1] + \dot v21.4s, v19.16b, v26.4b[1] uzp1 v4.8h, v4.8h, v5.8h uzp1 v20.8h, v20.8h, v21.8h @@ -1258,7 +1257,7 @@ L(\type\()_8tap_h_\isa): sshr v20.8h, v20.8h, #2 subs \h, \h, #2 stp q4, q20, [\dst], #32 -.else +.else // put sqshrun v4.8b, v4.8h, #6 sqshrun v20.8b, v20.8h, #6 subs \h, \h, #2 @@ -1296,14 +1295,14 @@ L(\type\()_8tap_h_\isa): tbl v3.16b, {v16.16b, v17.16b}, v31.16b tbl v4.16b, {v17.16b}, v28.16b - sdot v6.4s, v0.16b, v26.4b[0] - sdot v7.4s, v1.16b, v26.4b[0] - sdot v22.4s, v2.16b, v26.4b[0] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v6.4s, v1.16b, v26.4b[1] - sdot v7.4s, v2.16b, v26.4b[1] - sdot v22.4s, v3.16b, 
v26.4b[1] - sdot v23.4s, v4.16b, v26.4b[1] + \dot v6.4s, v0.16b, v26.4b[0] + \dot v7.4s, v1.16b, v26.4b[0] + \dot v22.4s, v2.16b, v26.4b[0] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v6.4s, v1.16b, v26.4b[1] + \dot v7.4s, v2.16b, v26.4b[1] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v4.16b, v26.4b[1] uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h @@ -1312,7 +1311,7 @@ L(\type\()_8tap_h_\isa): sshr v22.8h, v22.8h, #2 subs \h, \h, #1 stp q6, q22, [\dst], #32 -.else +.else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs \h, \h, #1 @@ -1355,14 +1354,14 @@ L(\type\()_8tap_h_\isa): tbl v3.16b, {v16.16b, v17.16b}, v31.16b tbl v4.16b, {v17.16b}, v28.16b - sdot v6.4s, v0.16b, v26.4b[0] - sdot v7.4s, v1.16b, v26.4b[0] - sdot v22.4s, v2.16b, v26.4b[0] - sdot v23.4s, v3.16b, v26.4b[0] - sdot v6.4s, v1.16b, v26.4b[1] - sdot v7.4s, v2.16b, v26.4b[1] - sdot v22.4s, v3.16b, v26.4b[1] - sdot v23.4s, v4.16b, v26.4b[1] + \dot v6.4s, v0.16b, v26.4b[0] + \dot v7.4s, v1.16b, v26.4b[0] + \dot v22.4s, v2.16b, v26.4b[0] + \dot v23.4s, v3.16b, v26.4b[0] + \dot v6.4s, v1.16b, v26.4b[1] + \dot v7.4s, v2.16b, v26.4b[1] + \dot v22.4s, v3.16b, v26.4b[1] + \dot v23.4s, v4.16b, v26.4b[1] uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h @@ -1371,7 +1370,7 @@ L(\type\()_8tap_h_\isa): sshr v22.8h, v22.8h, #2 subs w8, w8, #16 stp q6, q22, [\dst], #32 -.else +.else // put sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs w8, w8, #16 @@ -1404,11 +1403,11 @@ endfunc // dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) // xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) -filter_8tap_fn prep, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 +filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 // dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) -filter_8tap_fn put, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 +filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 DISABLE_DOTPROD #endif // HAVE_DOTPROD From 7351d94f04ddf76be9e1e6c46768d42109611630 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Tue, 23 Apr 2024 16:52:38 +0200 Subject: [PATCH 05/22] AArch64: Simplify DotProd path of vertical subpel filters Simplify the accumulator initializations of the DotProd code path of vertical subpel filters. This also makes it possible for some CPUs to use zero latency vector register moves. The load is also simplified (ldr + add -> ld1) in the inner loop of vertical filter for block size 16. 
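A single pre-set accumulator register (v4 in the code) works because the value folded into it is loop invariant: the SDOT path biases the unsigned samples by 128 so they fit signed operands, and compensates by starting the accumulator at 128 * sum(coefficients) plus the rounding term (the "FILTER_WEIGHT * 128 + rounding" constant in the code comments). A rough scalar model of that cancellation, with a made-up example filter:

    #include <assert.h>
    #include <stdint.h>

    enum { TAPS = 8, BIAS = 128 };

    /* Accumulator is set up once with the bias compensation plus the
     * rounding term, then accumulates the biased (signed) samples,
     * mirroring what the SDOT instructions see. */
    static int32_t dot_biased(const uint8_t *x, const int8_t *c, int32_t rnd)
    {
        int32_t coef_sum = 0;
        for (int i = 0; i < TAPS; i++)
            coef_sum += c[i];
        int32_t acc = BIAS * coef_sum + rnd;   /* materialized once, outside the loop */
        for (int i = 0; i < TAPS; i++)
            acc += (x[i] - BIAS) * c[i];       /* biased sample * coefficient */
        return acc;
    }

    int main(void)
    {
        const uint8_t x[TAPS] = { 3, 250, 17, 128, 99, 1, 200, 64 };
        const int8_t  c[TAPS] = { -1, 4, -10, 39, 39, -10, 4, -1 }; /* example only */
        int32_t ref = 2;                       /* rounding */
        for (int i = 0; i < TAPS; i++)
            ref += x[i] * c[i];
        assert(dot_biased(x, c, 2) == ref);    /* the bias cancels exactly */
        return 0;
    }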
--- src/arm/64/mc_dotprod.S | 66 +++++++++++++---------------------------- 1 file changed, 21 insertions(+), 45 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index c6040145b..3d397d1ba 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -115,8 +115,10 @@ L(\type\()_8tap_v_\isa): ldr q6, L(v_tbl_neon_dotprod) sub \src, \src, \s_strd .ifc \type, prep - mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding + mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 +.else + movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT .endif ubfx w11, \my, #7, #7 and \my, \my, #0x7F @@ -191,19 +193,13 @@ L(\type\()_8tap_v_\isa): .align LOOP_ALIGN 16: - ldr q27, [\lsrc] - add \lsrc, \lsrc, \s_strd -.ifc \type, prep + ld1 {v27.16b}, [\lsrc], \s_strd + mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.endif + sub v18.16b, v27.16b, v5.16b sub v21.16b, v27.16b, v5.16b sub v24.16b, v27.16b, v5.16b @@ -243,17 +239,11 @@ L(\type\()_8tap_v_\isa): .endif b.gt 16b -.ifc \type, prep mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.endif + \dot v0.4s, v16.16b, v7.4b[0] \dot v1.4s, v19.16b, v7.4b[0] \dot v2.4s, v22.16b, v7.4b[0] @@ -321,17 +311,12 @@ L(\type\()_8tap_v_\isa): ldr d21, [\src] ldr d27, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \type, prep + mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.endif + sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b sub v24.16b, v27.16b, v5.16b @@ -376,18 +361,14 @@ L(\type\()_8tap_v_\isa): .ifc \type, put .align JUMP_ALIGN 82: +.endif ldr d21, [\src] - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 - movi v2.4s, #32, lsl 8 - movi v3.4s, #32, lsl 8 -.else - ldr d21, [\src] + mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.endif + sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b @@ -459,13 +440,10 @@ L(\type\()_8tap_v_\isa): ldr s18, [\src] ldr s21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \type, prep + mov v0.16b, v4.16b mov v1.16b, v4.16b -.else - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 -.endif + sub v18.16b, v18.16b, v5.16b sub v21.16b, v21.16b, v5.16b @@ -500,14 +478,12 @@ L(\type\()_8tap_v_\isa): .ifc \type, put .align JUMP_ALIGN 42: +.endif ldr s18, [\src] - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 -.else - ldr s18, [\src] + mov v0.16b, v4.16b mov v1.16b, v4.16b -.endif + sub v18.16b, v18.16b, v5.16b tbl v19.16b, {v16.16b, v17.16b}, v6.16b @@ -569,8 +545,8 @@ L(\type\()_8tap_v_\isa): ldr h21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 + mov v0.16b, v4.16b + mov v1.16b, v4.16b sub v18.8b, v18.8b, v5.8b sub v21.8b, v21.8b, v5.8b @@ -602,8 +578,8 @@ L(\type\()_8tap_v_\isa): 22: ldr h18, [\src] - movi v0.4s, #32, lsl 8 // 64 * 128, bias for SDOT - movi v1.4s, #32, lsl 8 + mov v0.16b, v4.16b + mov v1.16b, v4.16b sub v18.8b, v18.8b, v5.8b From 417cdc55cc98c07acb8f3a49c69831200d8ab954 Mon Sep 
17 00:00:00 2001 From: Arpad Panyik Date: Tue, 23 Apr 2024 16:55:24 +0200 Subject: [PATCH 06/22] AArch64: Simplify DotProd path of horizontal subpel filters Simplify the inner loops of the DotProd code path of horizontal subpel filters to avoid using 2-register TBL instructions. The store part of block size 16 of the horizontal put case is also simplified (str + add -> st1). This patch can improve performance mostly on small cores like Cortex-A510 and newer. Other CPUs are mostly unaffected. Cortex-A510: mct_8tap_sharp_w16_h_8bpc_dotprod: 2.77x -> 3.13x mct_8tap_sharp_w32_h_8bpc_dotprod: 2.32x -> 2.56x Cortex-A55: mct_8tap_sharp_w16_h_8bpc_dotprod: 3.89x -> 3.89x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.35x -> 3.35x Cortex-A715: mct_8tap_sharp_w16_h_8bpc_dotprod: 3.79x -> 3.78x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.30x -> 3.30x Cortex-A78: mct_8tap_sharp_w16_h_8bpc_dotprod: 4.30x -> 4.31x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.79x -> 3.80x Cortex-X3: mct_8tap_sharp_w16_h_8bpc_dotprod: 4.74x -> 4.75x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.89x -> 3.91x Cortex-X1: mct_8tap_sharp_w16_h_8bpc_dotprod: 4.61x -> 4.62x mct_8tap_sharp_w32_h_8bpc_dotprod: 3.67x -> 3.66x --- src/arm/64/mc_dotprod.S | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 3d397d1ba..0a2dc9f10 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -60,7 +60,6 @@ L(h_tbl_neon_dotprod): .byte 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7 .byte 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11 .byte 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15 - .byte 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18, 16, 17, 18, 19 // Vertical convolutions are also using SDOT instructions, where a 128-bit // register contains a transposed 4x4 matrix of values. 
Subsequent iterations of @@ -1249,12 +1248,12 @@ L(\type\()_8tap_h_\isa): AARCH64_VALID_JUMP_TARGET ldr q29, L(h_tbl_neon_dotprod) + 16 ldr q30, L(h_tbl_neon_dotprod) + 32 - ldr q31, L(h_tbl_neon_dotprod) + 48 ldr d26, [\xmx] .align LOOP_ALIGN 16: - ldp q16, q17, [\src] + ldr q16, [\src] + ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd sub v16.16b, v16.16b, v24.16b @@ -1268,8 +1267,8 @@ L(\type\()_8tap_h_\isa): tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b - tbl v3.16b, {v16.16b, v17.16b}, v31.16b - tbl v4.16b, {v17.16b}, v28.16b + tbl v3.16b, {v17.16b}, v28.16b + tbl v4.16b, {v17.16b}, v29.16b \dot v6.4s, v0.16b, v26.4b[0] \dot v7.4s, v1.16b, v26.4b[0] @@ -1291,8 +1290,7 @@ L(\type\()_8tap_h_\isa): sqshrun v6.8b, v6.8h, #6 sqshrun2 v6.16b, v22.8h, #6 subs \h, \h, #1 - str q6, [\dst] - add \dst, \dst, \d_strd + st1 {v6.16b}, [\dst], \d_strd .endif b.gt 16b ret @@ -1304,7 +1302,6 @@ L(\type\()_8tap_h_\isa): AARCH64_VALID_JUMP_TARGET ldr q29, L(h_tbl_neon_dotprod) + 16 ldr q30, L(h_tbl_neon_dotprod) + 32 - ldr q31, L(h_tbl_neon_dotprod) + 48 ldr d26, [\xmx] .ifc \type, put sub \d_strd, \d_strd, \w, uxtw @@ -1314,7 +1311,9 @@ L(\type\()_8tap_h_\isa): .align LOOP_ALIGN 32: - ldp q16, q17, [\src], #16 + ldr q16, [\src] + ldr q17, [\src, #12] // avoid 2 register TBL for small cores + add \src, \src, #16 sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b @@ -1327,8 +1326,8 @@ L(\type\()_8tap_h_\isa): tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b - tbl v3.16b, {v16.16b, v17.16b}, v31.16b - tbl v4.16b, {v17.16b}, v28.16b + tbl v3.16b, {v17.16b}, v28.16b + tbl v4.16b, {v17.16b}, v29.16b \dot v6.4s, v0.16b, v26.4b[0] \dot v7.4s, v1.16b, v26.4b[0] From e54e6d9f7d174f27704d229bf4822f2f8dca1f01 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Thu, 25 Apr 2024 16:53:04 +0200 Subject: [PATCH 07/22] AArch64: Simplify TBL usage in 2D DotProd filters Simplify the TBL usages in small block size (2, 4) parts of the 2D (horizontal-vertical) put subpel filters. The 2-register TBLs are replaced with the 1-register form because we only need the lower 64-bits of the result and it can be extracted from only one source register. Performance is not affected by this change. 
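A standalone intrinsics sketch of the property relied on here: as long as every index byte used for the low half stays below 16, the low 64 bits of a 2-register TBL match a 1-register TBL on the first table register (index values and table contents below are made up):

    #include <arm_neon.h>
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint8_t buf[32];
        const uint8_t idx8[8] = { 1, 2, 3, 4, 2, 3, 4, 5 };   /* all < 16 */
        for (int i = 0; i < 32; i++)
            buf[i] = (uint8_t)(i * 7 + 3);

        uint8x16x2_t tab2;
        tab2.val[0] = vld1q_u8(buf);
        tab2.val[1] = vld1q_u8(buf + 16);
        uint8x16_t idx16 = vcombine_u8(vld1_u8(idx8), vld1_u8(idx8));

        uint8x16_t r2 = vqtbl2q_u8(tab2, idx16);                 /* TBL, 2 table registers */
        uint8x8_t  r1 = vqtbl1_u8(vld1q_u8(buf), vld1_u8(idx8)); /* TBL, 1 table register  */

        uint8_t a[8], b[8];
        vst1_u8(a, vget_low_u8(r2));
        vst1_u8(b, r1);
        for (int i = 0; i < 8; i++)
            assert(a[i] == b[i]);              /* low halves are identical */
        return 0;
    }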
--- src/arm/64/mc_dotprod.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 0a2dc9f10..e076abf46 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -801,7 +801,7 @@ L(\type\()_8tap_h_hv_\isa): str d0, [\dst], #8 subs \h, \h, #1 .else - tbl v0.16b, {v0.16b, v1.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] add \dst, \dst, \d_strd @@ -860,7 +860,7 @@ L(\type\()_8tap_h_hv_\isa): shrn v22.4h, v5.4s, #2 smlal v0.4s, v22.4h, v7.h[7] - tbl v0.16b, {v0.16b, v1.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str h0, [\dst] @@ -1039,7 +1039,7 @@ L(\type\()_hv_filter4_\isa): .else subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] - tbl v0.16b, {v0.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] add \dst, \dst, \d_strd @@ -1089,7 +1089,7 @@ L(\type\()_hv_filter4_\isa): subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] - tbl v0.16b, {v0.16b}, v25.16b + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str h0, [\dst] From 1cdba4879043da2bacf8caa592375311f09df03b Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Thu, 25 Apr 2024 16:54:13 +0200 Subject: [PATCH 08/22] AArch64: Simplify loads in *hv_filter* of DotProd path Simplify the load sequences in *hv_filter* functions (ldr + add -> ld1) to be more uniform and smaller. Performance is not affected. --- src/arm/64/mc_dotprod.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index e076abf46..31abe6235 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -968,8 +968,7 @@ L(\type\()_6tap_hv_\isa): .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): - ldr q4, [\lsrc] - add \lsrc, \lsrc, \s_strd + ld1 {v4.16b}, [\lsrc], \s_strd sub v4.16b, v4.16b, v24.16b mov v22.16b, v27.16b mov v23.16b, v27.16b From 3980f14220b91421e4b958ea3b6abc5837bc47a4 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Tue, 23 Apr 2024 16:58:59 +0200 Subject: [PATCH 09/22] AArch64: Simplify DotProd path of 2D subpel filters Simplify the DotProd code path of the 2D (horizontal-vertical) subpel filters. It contains some instruction reordering and some macro simplifications to be more similar to the upcoming i8mm version. These changes have negligible effect on performance. 
Cortex-A510: mc_8tap_regular_w2_hv_8bpc_dotprod: 8.3769 -> 8.3380 mc_8tap_sharp_w2_hv_8bpc_dotprod: 9.5441 -> 9.5457 mc_8tap_regular_w4_hv_8bpc_dotprod: 8.3422 -> 8.3444 mc_8tap_sharp_w4_hv_8bpc_dotprod: 9.5441 -> 9.5367 mc_8tap_regular_w8_hv_8bpc_dotprod: 9.9852 -> 9.9666 mc_8tap_sharp_w8_hv_8bpc_dotprod: 12.5554 -> 12.5314 Cortex-A55: mc_8tap_regular_w2_hv_8bpc_dotprod: 6.4504 -> 6.4892 mc_8tap_sharp_w2_hv_8bpc_dotprod: 7.5732 -> 7.6078 mc_8tap_regular_w4_hv_8bpc_dotprod: 6.5088 -> 6.4760 mc_8tap_sharp_w4_hv_8bpc_dotprod: 7.5796 -> 7.5763 mc_8tap_regular_w8_hv_8bpc_dotprod: 9.3384 -> 9.3078 mc_8tap_sharp_w8_hv_8bpc_dotprod: 11.1159 -> 11.1401 Cortex-A78: mc_8tap_regular_w2_hv_8bpc_dotprod: 1.4122 -> 1.4250 mc_8tap_sharp_w2_hv_8bpc_dotprod: 1.7696 -> 1.7821 mc_8tap_regular_w4_hv_8bpc_dotprod: 1.4243 -> 1.4243 mc_8tap_sharp_w4_hv_8bpc_dotprod: 1.7866 -> 1.7863 mc_8tap_regular_w8_hv_8bpc_dotprod: 2.5304 -> 2.5171 mc_8tap_sharp_w8_hv_8bpc_dotprod: 3.0815 -> 3.0632 Cortex-X1: mc_8tap_regular_w2_hv_8bpc_dotprod: 0.8195 -> 0.8194 mc_8tap_sharp_w2_hv_8bpc_dotprod: 1.0092 -> 1.0081 mc_8tap_regular_w4_hv_8bpc_dotprod: 0.8197 -> 0.8166 mc_8tap_sharp_w4_hv_8bpc_dotprod: 1.0089 -> 1.0068 mc_8tap_regular_w8_hv_8bpc_dotprod: 1.5230 -> 1.5166 mc_8tap_sharp_w8_hv_8bpc_dotprod: 1.8683 -> 1.8625 --- src/arm/64/mc_dotprod.S | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 31abe6235..04b60aa48 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -724,18 +724,20 @@ L(\type\()_8tap_h_hv_\isa): smlal2 v1.4s, v22.8h, v7.h[7] rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 - subs w8, w8, #1 - st1 {v0.8h}, [\ldst], \d_strd - b.gt 8b - add \dst, \dst, #16 .else // put shrn v22.4h, v5.4s, #2 shrn2 v22.8h, v6.4s, #2 smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] tbl v0.16b, {v0.16b, v1.16b}, v25.16b - subs w8, w8, #1 sqrshrun v0.8b, v0.8h, #2 +.endif + subs w8, w8, #1 +.ifc \type, prep + st1 {v0.8h}, [\ldst], \d_strd + b.gt 8b + add \dst, \dst, #16 +.else st1 {v0.8b}, [\ldst], \d_strd b.gt 8b add \dst, \dst, #8 @@ -789,9 +791,7 @@ L(\type\()_8tap_h_hv_\isa): \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b -.ifc \type, put - subs \h, \h, #1 -.endif + smlal v0.4s, v22.4h, v7.h[6] shrn v22.4h, v5.4s, #2 @@ -801,6 +801,7 @@ L(\type\()_8tap_h_hv_\isa): str d0, [\dst], #8 subs \h, \h, #1 .else + subs \h, \h, #1 tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] @@ -855,11 +856,12 @@ L(\type\()_8tap_h_hv_\isa): mov v20.16b, v21.16b mov v21.16b, v22.16b - subs \h, \h, #1 smlal v0.4s, v22.4h, v7.h[6] shrn v22.4h, v5.4s, #2 smlal v0.4s, v22.4h, v7.h[7] + + subs \h, \h, #1 tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 @@ -942,19 +944,17 @@ L(\type\()_6tap_hv_\isa): smlal v0.4s, v20.4h, v7.h[5] smlal2 v1.4s, v20.8h, v7.h[5] sshr v20.8h, v23.8h, #2 -.ifc \type, prep + + subs w8, w8, #1 smlal v0.4s, v20.4h, v7.h[6] smlal2 v1.4s, v20.8h, v7.h[6] +.ifc \type, prep rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 st1 {v0.8h}, [\ldst], \d_strd - subs w8, w8, #1 b.gt 8b add \dst, \dst, #16 .else - subs w8, w8, #1 - smlal v0.4s, v20.4h, v7.h[6] - smlal2 v1.4s, v20.8h, v7.h[6] tbl v0.16b, {v0.16b, v1.16b}, v25.16b sqrshrun v0.8b, v0.8h, #2 st1 {v0.8b}, [\ldst], \d_strd @@ -985,8 +985,8 @@ L(\type\()_hv_filter8_\isa): .align FUNC_ALIGN L(\type\()_hv_filter4_\isa): - mov v22.16b, v27.16b ld1 {v4.8b}, [\src], \s_strd + mov v22.16b, v27.16b sub v4.16b, 
v4.16b, v24.16b tbl v2.16b, {v4.16b}, v28.16b \dot v22.4s, v2.16b, v26.4b[0] @@ -1030,14 +1030,12 @@ L(\type\()_hv_filter4_\isa): smlal v0.4s, v20.4h, v7.h[5] shrn v20.4h, v5.4s, #2 -.ifc \type, prep + subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] +.ifc \type, prep rshrn v0.4h, v0.4s, #6 str d0, [\dst], #8 - subs \h, \h, #1 .else - subs \h, \h, #1 - smlal v0.4s, v20.4h, v7.h[6] tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 str s0, [\dst] From fb2a00792e0f03b6b617453315e93e3d6f7ff8df Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 17 Apr 2024 20:00:07 +0200 Subject: [PATCH 10/22] AArch64: Add basic i8mm support for convolutions Add an Armv8.6-A i8mm code path for standard bitdepth convolutions. Only horizontal-vertical (HV) convolutions have 6-tap specialisations of their vertical passes. All other convolutions are 4- or 8-tap filters which fit well with the 4-element USDOT instruction. Benchmarks show 4-9% FPS increase relative to the Armv8.4-A code path depending on the input video and the CPU used. This patch will increase the .text by around 5.7 KiB. Relative performance to the C reference on some Cortex CPU cores: Cortex-A715 Cortex-X3 Cortex-A510 regular w4 hv neon: 7.20x 11.20x 4.40x regular w4 hv dotprod: 12.77x 18.35x 6.21x regular w4 hv i8mm: 14.50x 21.42x 6.16x sharp w4 hv neon: 6.24x 9.77x 3.96x sharp w4 hv dotprod: 9.76x 14.02x 5.20x sharp w4 hv i8mm: 10.84x 16.09x 5.42x regular w8 hv neon: 2.17x 2.46x 3.17x regular w8 hv dotprod: 3.04x 3.11x 3.03x regular w8 hv i8mm: 3.57x 3.40x 3.27x sharp w8 hv neon: 1.72x 1.93x 2.75x sharp w8 hv dotprod: 2.49x 2.54x 2.62x sharp w8 hv i8mm: 2.80x 2.79x 2.70x regular w16 hv neon: 1.90x 2.17x 2.02x regular w16 hv dotprod: 2.59x 2.64x 1.93x regular w16 hv i8mm: 3.01x 2.85x 2.05x sharp w16 hv neon: 1.51x 1.72x 1.74x sharp w16 hv dotprod: 2.17x 2.22x 1.70x sharp w16 hv i8mm: 2.42x 2.42x 1.72x regular w32 hv neon: 1.80x 1.96x 1.81x regular w32 hv dotprod: 2.43x 2.36x 1.74x regular w32 hv i8mm: 2.83x 2.51x 1.83x sharp w32 hv neon: 1.42x 1.54x 1.56x sharp w32 hv dotprod: 2.07x 2.00x 1.55x sharp w32 hv i8mm: 2.29x 2.16x 1.55x regular w64 hv neon: 1.82x 1.89x 1.70x regular w64 hv dotprod: 2.43x 2.25x 1.65x regular w64 hv i8mm: 2.84x 2.39x 1.73x sharp w64 hv neon: 1.43x 1.47x 1.49x sharp w64 hv dotprod: 2.08x 1.91x 1.49x sharp w64 hv i8mm: 2.30x 2.07x 1.48x regular w128 hv neon: 1.77x 1.84x 1.75x regular w128 hv dotprod: 2.37x 2.18x 1.70x regular w128 hv i8mm: 2.76x 2.33x 1.78x sharp w128 hv neon: 1.40x 1.45x 1.42x sharp w128 hv dotprod: 2.04x 1.87x 1.43x sharp w128 hv i8mm: 2.24x 2.02x 1.42x regular w8 h neon: 3.16x 3.51x 3.43x regular w8 h dotprod: 4.97x 7.43x 4.95x regular w8 h i8mm: 7.28x 10.38x 5.69x sharp w8 h neon: 2.71x 2.77x 3.10x sharp w8 h dotprod: 4.92x 7.14x 4.94x sharp w8 h i8mm: 7.21x 10.11x 5.70x regular w16 h neon: 2.79x 2.76x 3.53x regular w16 h dotprod: 3.81x 4.77x 3.13x regular w16 h i8mm: 5.21x 6.04x 3.56x sharp w16 h neon: 2.31x 2.38x 3.12x sharp w16 h dotprod: 3.80x 4.74x 3.13x sharp w16 h i8mm: 5.20x 5.98x 3.56x regular w64 h neon: 2.49x 2.46x 2.94x regular w64 h dotprod: 3.17x 3.60x 2.41x regular w64 h i8mm: 4.22x 4.40x 2.72x sharp w64 h neon: 2.07x 2.06x 2.60x sharp w64 h dotprod: 3.16x 3.58x 2.40x sharp w64 h i8mm: 4.20x 4.38x 2.71x regular w8 v neon: 6.11x 8.05x 4.07x regular w8 v dotprod: 5.45x 8.15x 4.01x regular w8 v i8mm: 7.30x 9.46x 4.19x sharp w8 v neon: 4.23x 5.46x 3.09x sharp w8 v dotprod: 5.43x 7.96x 4.01x sharp w8 v i8mm: 7.26x 9.12x 4.19x regular w16 v neon: 3.44x 4.33x 2.40x regular w16 v dotprod: 
3.20x 4.53x 2.85x regular w16 v i8mm: 4.09x 5.27x 2.87x sharp w16 v neon: 2.50x 3.14x 1.82x sharp w16 v dotprod: 3.20x 4.52x 2.86x sharp w16 v i8mm: 4.09x 5.15x 2.86x regular w64 v neon: 2.74x 3.11x 1.53x regular w64 v dotprod: 2.63x 3.30x 1.84x regular w64 v i8mm: 3.31x 3.73x 1.84x sharp w64 v neon: 2.01x 2.29x 1.16x sharp w64 v dotprod: 2.61x 3.27x 1.83x sharp w64 v i8mm: 3.29x 3.68x 1.84x --- src/arm/64/mc_dotprod.S | 131 ++++++++++++++++++++++++++++------------ src/arm/mc.h | 15 +++-- 2 files changed, 104 insertions(+), 42 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 04b60aa48..4671707d0 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -113,11 +113,19 @@ L(\type\()_8tap_v_\isa): madd \my, \my, w11, w10 ldr q6, L(v_tbl_neon_dotprod) sub \src, \src, \s_strd -.ifc \type, prep +.ifc \isa, neon_i8mm + .ifc \type, prep + movi v4.4s, #2 // rounding + .else + movi v4.4s, #0 + .endif +.else // neon_dotprod + .ifc \type, prep mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 -.else + .else movi v4.4s, #32, lsl 8 // FILTER_WEIGHT * 128, bias for SDOT + .endif .endif ubfx w11, \my, #7, #7 and \my, \my, #0x7F @@ -127,7 +135,9 @@ L(\type\()_8tap_v_\isa): sub \src, \src, \s_strd, lsl #1 // src - s_strd * 3 add \xmy, x12, \xmy, lsl #3 // subpel V filter address ldr q29, L(v_tbl_neon_dotprod) + 32 +.ifc \isa, neon_dotprod movi v5.16b, #128 +.endif ldr d7, [\xmy] cmp \w, #8 b.eq 80f @@ -179,7 +189,7 @@ L(\type\()_8tap_v_\isa): zip2 v20.8h, v18.8h, v24.8h zip1 v23.8h, v21.8h, v27.8h zip2 v26.8h, v21.8h, v27.8h - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v19.16b, v19.16b, v5.16b sub v22.16b, v22.16b, v5.16b @@ -189,7 +199,7 @@ L(\type\()_8tap_v_\isa): sub v20.16b, v20.16b, v5.16b sub v23.16b, v23.16b, v5.16b sub v26.16b, v26.16b, v5.16b - +.endif .align LOOP_ALIGN 16: ld1 {v27.16b}, [\lsrc], \s_strd @@ -198,12 +208,16 @@ L(\type\()_8tap_v_\isa): mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b - +.ifc \isa, neon_i8mm + mov v18.16b, v27.16b + mov v21.16b, v27.16b + mov v24.16b, v27.16b +.else // neon_dotprod sub v18.16b, v27.16b, v5.16b sub v21.16b, v27.16b, v5.16b sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b - +.endif \dot v0.4s, v16.16b, v7.4b[0] \dot v1.4s, v19.16b, v7.4b[0] \dot v2.4s, v22.16b, v7.4b[0] @@ -297,11 +311,12 @@ L(\type\()_8tap_v_\isa): zip2 v19.8h, v0.8h, v2.8h zip1 v17.8h, v18.8h, v24.8h zip2 v20.8h, v18.8h, v24.8h - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v5.16b sub v19.16b, v19.16b, v5.16b sub v17.16b, v17.16b, v5.16b sub v20.16b, v20.16b, v5.16b +.endif .ifc \type, put b.eq 82f .endif @@ -315,12 +330,15 @@ L(\type\()_8tap_v_\isa): mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b - +.ifc \isa, neon_i8mm + mov v18.16b, v21.16b + mov v24.16b, v27.16b +.else // neon_dotprod sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b sub v24.16b, v27.16b, v5.16b sub v27.16b, v27.16b, v5.16b - +.endif tbl v22.16b, {v16.16b, v17.16b}, v6.16b tbl v25.16b, {v19.16b, v20.16b}, v6.16b tbl v23.16b, {v17.16b, v18.16b}, v28.16b @@ -367,10 +385,12 @@ L(\type\()_8tap_v_\isa): mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b - +.ifc \isa, neon_i8mm + mov v18.16b, v21.16b +.else sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b - +.endif tbl v22.16b, {v16.16b, v17.16b}, v6.16b tbl v25.16b, {v19.16b, v20.16b}, v6.16b tbl v23.16b, {v17.16b, v18.16b}, v28.16b @@ -428,9 +448,10 @@ L(\type\()_8tap_v_\isa): zip1 v16.8h, v0.8h, v2.8h zip1 v17.8h, v18.8h, v24.8h - +.ifc \isa, 
neon_dotprod sub v16.16b, v16.16b, v5.16b sub v17.16b, v17.16b, v5.16b +.endif .ifc \type, put b.eq 42f .endif @@ -442,10 +463,10 @@ L(\type\()_8tap_v_\isa): mov v0.16b, v4.16b mov v1.16b, v4.16b - +.ifc \isa, neon_dotprod sub v18.16b, v18.16b, v5.16b sub v21.16b, v21.16b, v5.16b - +.endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b @@ -482,9 +503,9 @@ L(\type\()_8tap_v_\isa): mov v0.16b, v4.16b mov v1.16b, v4.16b - +.ifc \isa, neon_dotprod sub v18.16b, v18.16b, v5.16b - +.endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b @@ -532,10 +553,10 @@ L(\type\()_8tap_v_\isa): zip1 v16.4h, v0.4h, v2.4h zip1 v17.4h, v18.4h, v24.4h - + .ifc \isa, neon_dotprod sub v16.8b, v16.8b, v5.8b sub v17.8b, v17.8b, v5.8b - + .endif b.eq 22f .align LOOP_ALIGN @@ -546,10 +567,10 @@ L(\type\()_8tap_v_\isa): mov v0.16b, v4.16b mov v1.16b, v4.16b - + .ifc \isa, neon_dotprod sub v18.8b, v18.8b, v5.8b sub v21.8b, v21.8b, v5.8b - + .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b @@ -579,9 +600,9 @@ L(\type\()_8tap_v_\isa): mov v0.16b, v4.16b mov v1.16b, v4.16b - + .ifc \isa, neon_dotprod sub v18.8b, v18.8b, v5.8b - + .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b tbl v20.16b, {v17.16b, v18.16b}, v28.16b @@ -606,8 +627,12 @@ L(\type\()_8tap_h_hv_\isa): madd \mx, \mx, w11, w9 madd w14, \my, w11, w10 // for HV ldr q28, L(h_tbl_neon_dotprod) +.ifc \isa, neon_i8mm + movi v27.4s, #2 // rounding +.else mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v27.4s, w13 // put H overrides this +.endif sub \src, \src, #4 // src - 4 ubfx w9, \mx, #7, #7 and \mx, \mx, #0x7F @@ -616,7 +641,9 @@ L(\type\()_8tap_h_hv_\isa): cmp \w, #4 csel \mx, \mx, w9, le add \xmx, x12, \xmx, lsl #3 // subpel H filter address +.ifc \isa, neon_dotprod movi v24.16b, #128 +.endif cbz \my, L(\type\()_8tap_h_\isa) // HV cases @@ -677,9 +704,9 @@ L(\type\()_8tap_h_hv_\isa): smull v0.4s, v16.4h, v7.h[0] smull2 v1.4s, v16.8h, v7.h[0] mov v16.16b, v17.16b - +.ifc \isa, neon_dotprod sub v23.16b, v23.16b, v24.16b - +.endif mov v5.16b, v27.16b mov v6.16b, v27.16b @@ -774,9 +801,9 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b - +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b - +.endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b @@ -838,9 +865,9 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b - +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b - +.endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b @@ -912,7 +939,9 @@ L(\type\()_6tap_hv_\isa): smull v0.4s, v16.4h, v7.h[1] smull2 v1.4s, v16.8h, v7.h[1] +.ifc \isa, neon_dotprod sub v23.16b, v23.16b, v24.16b +.endif mov v16.16b, v17.16b mov v5.16b, v27.16b @@ -969,7 +998,9 @@ L(\type\()_6tap_hv_\isa): .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): ld1 {v4.16b}, [\lsrc], \s_strd +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif mov v22.16b, v27.16b mov v23.16b, v27.16b tbl v2.16b, {v4.16b}, v28.16b @@ -987,7 +1018,9 @@ L(\type\()_hv_filter8_\isa): L(\type\()_hv_filter4_\isa): ld1 {v4.8b}, [\src], \s_strd mov v22.16b, v27.16b +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif tbl v2.16b, {v4.16b}, v28.16b \dot v22.4s, v2.16b, v26.4b[0] shrn v22.4h, v22.4s, #2 @@ -1015,7 +1048,9 @@ L(\type\()_hv_filter4_\isa): smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] +.ifc \isa, 
neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif mov v16.16b, v17.16b mov v17.16b, v18.16b @@ -1067,7 +1102,9 @@ L(\type\()_hv_filter4_\isa): smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] +.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b +.endif mov v16.16b, v17.16b mov v17.16b, v18.16b @@ -1100,8 +1137,12 @@ L(\type\()_8tap_h_\isa): adr x9, L(\type\()_8tap_h_\isa\()_tbl) ldrh w8, [x9, x8, lsl #1] .ifc \type, put + .ifc \isa, neon_i8mm + movi v27.4s, #34 // special rounding + .else mov w10, #0x2022 // 64 * 128 + 34, bias and rounding for SDOT dup v27.4s, w10 + .endif .endif sub x9, x9, x8 br x9 @@ -1118,10 +1159,10 @@ L(\type\()_8tap_h_\isa): ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - + .ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b - + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b @@ -1155,10 +1196,10 @@ L(\type\()_8tap_h_\isa): ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - +.ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b - +.endif mov v4.16b, v27.16b mov v5.16b, v27.16b @@ -1197,10 +1238,10 @@ L(\type\()_8tap_h_\isa): ldr q0, [\src] ldr q16, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - +.ifc \isa, neon_dotprod sub v0.16b, v0.16b, v24.16b sub v16.16b, v16.16b, v24.16b - +.endif mov v4.16b, v27.16b mov v5.16b, v27.16b mov v20.16b, v27.16b @@ -1252,10 +1293,10 @@ L(\type\()_8tap_h_\isa): ldr q16, [\src] ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b - +.endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b @@ -1311,10 +1352,10 @@ L(\type\()_8tap_h_\isa): ldr q16, [\src] ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, #16 - +.ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b - +.endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b @@ -1381,5 +1422,19 @@ filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, // xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) filter_8tap_fn put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 +#if HAVE_I8MM +ENABLE_I8MM + +// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6) +// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7) +filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7 + +// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7) +// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1) +filter_8tap_fn put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1 + +DISABLE_I8MM +#endif // HAVE_I8MM + DISABLE_DOTPROD #endif // HAVE_DOTPROD diff --git a/src/arm/mc.h b/src/arm/mc.h index 7e57fd37c..dabdab357 100644 --- a/src/arm/mc.h +++ b/src/arm/mc.h @@ -62,6 +62,7 @@ decl_8tap_fns(neon); decl_8tap_fns(neon_dotprod); +decl_8tap_fns(neon_i8mm); decl_mc_fn(BF(dav1d_put_bilin, neon)); decl_mct_fn(BF(dav1d_prep_bilin, neon)); @@ -109,11 +110,17 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) { c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon); c->emu_edge = BF(dav1d_emu_edge, neon); -#if ARCH_AARCH64 -#if HAVE_DOTPROD && BITDEPTH == 8 +#if ARCH_AARCH64 && BITDEPTH == 8 +#if HAVE_DOTPROD if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return; init_8tap_fns(neon_dotprod); -#endif // HAVE_DOTPROD && BITDEPTH == 8 -#endif // ARCH_AARCH64 +#endif // HAVE_DOTPROD + +#if HAVE_I8MM 
+ if (!(flags & DAV1D_ARM_CPU_FLAG_I8MM)) return; + + init_8tap_fns(neon_i8mm); +#endif // HAVE_I8MM +#endif // ARCH_AARCH64 && BITDEPTH == 8 } From 488a191df8e2bae8ab5681102b8d49dac06f3f89 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Fri, 26 Apr 2024 18:24:29 +0200 Subject: [PATCH 11/22] AArch64: Optimize horizontal i8mm prep filters Replace the accumulator initializations of the horizontal prep filters with register fills by zeros. Most i8mm capable CPUs can do these with zero latency, but we also need to use rounding shifts at the end of the filter. We can see better performance with this change on out-of-order CPUs. Relative performance of micro benchmarks (lower is better): Cortex-X3: mct_8tap_sharp_w32_h_8bpc_i8mm: 0.914x mct_8tap_sharp_w16_h_8bpc_i8mm: 0.906x mct_8tap_sharp_w8_h_8bpc_i8mm: 0.877x Cortex-A715: mct_8tap_sharp_w32_h_8bpc_i8mm: 0.819x mct_8tap_sharp_w16_h_8bpc_i8mm: 0.805x mct_8tap_sharp_w8_h_8bpc_i8mm: 0.779x Cortex-A510: mct_8tap_sharp_w32_h_8bpc_i8mm: 0.999x mct_8tap_sharp_w16_h_8bpc_i8mm: 1.001x mct_8tap_sharp_w8_h_8bpc_i8mm: 0.996x mct_8tap_sharp_w4_h_8bpc_i8mm: 0.915x --- src/arm/64/mc_dotprod.S | 66 +++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 4671707d0..19431abfa 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -1196,13 +1196,17 @@ L(\type\()_8tap_h_\isa): ldr d0, [\src] ldr d1, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \isa, neon_dotprod +.ifc \type\()_\isa, prep_neon_i8mm + movi v4.4s, #0 + movi v5.4s, #0 +.else + .ifc \isa, neon_dotprod sub v0.8b, v0.8b, v24.8b sub v1.8b, v1.8b, v24.8b -.endif + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b - +.endif tbl v2.16b, {v0.16b}, v28.16b tbl v3.16b, {v1.16b}, v28.16b @@ -1210,8 +1214,13 @@ L(\type\()_8tap_h_\isa): \dot v5.4s, v3.16b, v26.4b[0] .ifc \type, prep subs \h, \h, #2 + .ifc \isa, neon_i8mm + uzp1 v4.8h, v4.8h, v5.8h + srshr v4.8h, v4.8h, #2 + .else shrn v4.4h, v4.4s, #2 shrn2 v4.8h, v5.4s, #2 + .endif str q4, [\dst], #16 .else // put uzp1 v4.8h, v4.8h, v5.8h @@ -1238,15 +1247,21 @@ L(\type\()_8tap_h_\isa): ldr q0, [\src] ldr q16, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 -.ifc \isa, neon_dotprod +.ifc \type\()_\isa, prep_neon_i8mm + movi v4.4s, #0 + movi v5.4s, #0 + movi v20.4s, #0 + movi v21.4s, #0 +.else + .ifc \isa, neon_dotprod sub v0.16b, v0.16b, v24.16b sub v16.16b, v16.16b, v24.16b -.endif + .endif mov v4.16b, v27.16b mov v5.16b, v27.16b mov v20.16b, v27.16b mov v21.16b, v27.16b - +.endif tbl v1.16b, {v0.16b}, v28.16b tbl v2.16b, {v0.16b}, v29.16b tbl v3.16b, {v0.16b}, v30.16b @@ -1266,8 +1281,13 @@ L(\type\()_8tap_h_\isa): uzp1 v4.8h, v4.8h, v5.8h uzp1 v20.8h, v20.8h, v21.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v4.8h, v4.8h, #2 + srshr v20.8h, v20.8h, #2 + .else sshr v4.8h, v4.8h, #2 sshr v20.8h, v20.8h, #2 + .endif subs \h, \h, #2 stp q4, q20, [\dst], #32 .else // put @@ -1293,15 +1313,21 @@ L(\type\()_8tap_h_\isa): ldr q16, [\src] ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, \s_strd -.ifc \isa, neon_dotprod +.ifc \type\()_\isa, prep_neon_i8mm + movi v6.4s, #0 + movi v7.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 +.else + .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b -.endif + .endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b - +.endif tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, 
v30.16b @@ -1320,8 +1346,13 @@ L(\type\()_8tap_h_\isa): uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v6.8h, v6.8h, #2 + srshr v22.8h, v22.8h, #2 + .else sshr v6.8h, v6.8h, #2 sshr v22.8h, v22.8h, #2 + .endif subs \h, \h, #1 stp q6, q22, [\dst], #32 .else // put @@ -1352,15 +1383,21 @@ L(\type\()_8tap_h_\isa): ldr q16, [\src] ldr q17, [\src, #12] // avoid 2 register TBL for small cores add \src, \src, #16 -.ifc \isa, neon_dotprod +.ifc \type\()_\isa, prep_neon_i8mm + movi v6.4s, #0 + movi v7.4s, #0 + movi v22.4s, #0 + movi v23.4s, #0 +.else + .ifc \isa, neon_dotprod sub v16.16b, v16.16b, v24.16b sub v17.16b, v17.16b, v24.16b -.endif + .endif mov v6.16b, v27.16b mov v7.16b, v27.16b mov v22.16b, v27.16b mov v23.16b, v27.16b - +.endif tbl v0.16b, {v16.16b}, v28.16b tbl v1.16b, {v16.16b}, v29.16b tbl v2.16b, {v16.16b}, v30.16b @@ -1379,8 +1416,13 @@ L(\type\()_8tap_h_\isa): uzp1 v6.8h, v6.8h, v7.8h uzp1 v22.8h, v22.8h, v23.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v6.8h, v6.8h, #2 + srshr v22.8h, v22.8h, #2 + .else sshr v6.8h, v6.8h, #2 sshr v22.8h, v22.8h, #2 + .endif subs w8, w8, #16 stp q6, q22, [\dst], #32 .else // put From 670e5219b40ee19fba6bed3c84c0aa8e923aa6ad Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Fri, 26 Apr 2024 17:51:35 +0200 Subject: [PATCH 12/22] AArch64: Optimize vertical i8mm subpel filters Replace the accumulator initializations of the vertical subpel filters with register fills by zeros (which are usually zero latency operations in this feature class), this implies the usage of rounding shifts at the end in the prep cases. Out-of-order CPU cores can benefit from this change. The width=16 case uses a simpler register duplication scheme that relies on MOV instructions for the subsequent shuffles. This approach uses a different register to load the data into for better instruction scheduling and data dependency chain. 
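The rounding move is a straight swap: pre-loading the accumulator with the rounding constant and finishing with a plain shift gives the same result as a zero-initialised accumulator followed by a rounding shift, because SRSHR/RSHRN #n add 1 << (n - 1) before shifting. A scalar model of the n = 2 case used by the prep path (assuming the usual arithmetic right shift for negative values):

    #include <assert.h>
    #include <stdint.h>

    static int16_t pre_rounded(int32_t dot)    /* before: acc starts at 2, plain SHRN #2 */
    {
        int32_t acc = 2 + dot;
        return (int16_t)(acc >> 2);
    }

    static int16_t round_shift(int32_t dot)    /* after: acc starts at 0, RSHRN #2 */
    {
        int32_t acc = 0 + dot;
        return (int16_t)((acc + (1 << 1)) >> 2);
    }

    int main(void)
    {
        for (int32_t v = -32768; v <= 32767; v++)
            assert(pre_rounded(v) == round_shift(v));
        return 0;
    }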
Relative performance of micro benchmarks (lower is better): Cortex-X3: mct_8tap_sharp_w16_v_8bpc_i8mm: 0.910x mct_8tap_sharp_w8_v_8bpc_i8mm: 0.986x mc_8tap_sharp_w16_v_8bpc_i8mm: 0.864x mc_8tap_sharp_w8_v_8bpc_i8mm: 0.882x mc_8tap_sharp_w4_v_8bpc_i8mm: 0.933x mc_8tap_sharp_w2_v_8bpc_i8mm: 0.926x Cortex-A715: mct_8tap_sharp_w16_v_8bpc_i8mm: 0.855x mct_8tap_sharp_w8_v_8bpc_i8mm: 0.784x mct_8tap_sharp_w4_v_8bpc_i8mm: 1.069x mc_8tap_sharp_w16_v_8bpc_i8mm: 0.850x mc_8tap_sharp_w8_v_8bpc_i8mm: 0.779x mc_8tap_sharp_w4_v_8bpc_i8mm: 0.971x mc_8tap_sharp_w2_v_8bpc_i8mm: 0.975x Cortex-A510: mct_8tap_sharp_w16_v_8bpc_i8mm: 1.001x mct_8tap_sharp_w8_v_8bpc_i8mm: 0.979x mct_8tap_sharp_w4_v_8bpc_i8mm: 0.998x mc_8tap_sharp_w16_v_8bpc_i8mm: 0.998x mc_8tap_sharp_w8_v_8bpc_i8mm: 1.004x mc_8tap_sharp_w4_v_8bpc_i8mm: 1.003x mc_8tap_sharp_w2_v_8bpc_i8mm: 0.996x --- src/arm/64/mc_dotprod.S | 114 +++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 31 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 19431abfa..b61ee2623 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -113,13 +113,7 @@ L(\type\()_8tap_v_\isa): madd \my, \my, w11, w10 ldr q6, L(v_tbl_neon_dotprod) sub \src, \src, \s_strd -.ifc \isa, neon_i8mm - .ifc \type, prep - movi v4.4s, #2 // rounding - .else - movi v4.4s, #0 - .endif -.else // neon_dotprod +.ifc \isa, neon_dotprod .ifc \type, prep mov w8, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v4.4s, w8 @@ -202,17 +196,21 @@ L(\type\()_8tap_v_\isa): .endif .align LOOP_ALIGN 16: +.ifc \isa, neon_i8mm + ld1 {v18.16b}, [\lsrc], \s_strd + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.16b, v18.16b + mov v24.16b, v18.16b + mov v27.16b, v18.16b +.else // neon_dotprod ld1 {v27.16b}, [\lsrc], \s_strd - mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.ifc \isa, neon_i8mm - mov v18.16b, v27.16b - mov v21.16b, v27.16b - mov v24.16b, v27.16b -.else // neon_dotprod sub v18.16b, v27.16b, v5.16b sub v21.16b, v27.16b, v5.16b sub v24.16b, v27.16b, v5.16b @@ -242,8 +240,13 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif st1 {v0.8h, v1.8h}, [\ldst], \d_strd .else // put sqrshrun v0.8b, v0.8h, #6 @@ -252,11 +255,17 @@ L(\type\()_8tap_v_\isa): .endif b.gt 16b +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b - +.endif \dot v0.4s, v16.16b, v7.4b[0] \dot v1.4s, v19.16b, v7.4b[0] \dot v2.4s, v22.16b, v7.4b[0] @@ -271,8 +280,13 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\ldst] add \dst, \dst, #32 .else // put @@ -322,18 +336,24 @@ L(\type\()_8tap_v_\isa): .endif .align LOOP_ALIGN 8: +.ifc \isa, neon_i8mm + ldr d18, [\src] + movi v0.4s, #0 + movi v1.4s, #0 + ldr d24, [\src, \s_strd] + add \src, \src, \s_strd, lsl #1 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.8b, v18.8b + mov v27.8b, v24.8b +.else // neon_dotprod ldr d21, [\src] ldr d27, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.ifc \isa, neon_i8mm - mov 
v18.16b, v21.16b - mov v24.16b, v27.16b -.else // neon_dotprod sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b sub v24.16b, v27.16b, v5.16b @@ -363,8 +383,13 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\dst], #32 .else // put sqrshrun v0.8b, v0.8h, #6 @@ -379,15 +404,19 @@ L(\type\()_8tap_v_\isa): .align JUMP_ALIGN 82: .endif +.ifc \isa, neon_i8mm + ldr d18, [\src] + movi v0.4s, #0 + movi v1.4s, #0 + movi v2.4s, #0 + movi v3.4s, #0 + mov v21.8b, v18.8b +.else // neon_dotprod ldr d21, [\src] - mov v0.16b, v4.16b mov v1.16b, v4.16b mov v2.16b, v4.16b mov v3.16b, v4.16b -.ifc \isa, neon_i8mm - mov v18.16b, v21.16b -.else sub v18.16b, v21.16b, v5.16b sub v21.16b, v21.16b, v5.16b .endif @@ -409,8 +438,13 @@ L(\type\()_8tap_v_\isa): uzp1 v0.8h, v0.8h, v1.8h uzp1 v2.8h, v2.8h, v3.8h .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v0.8h, v0.8h, #2 + srshr v1.8h, v2.8h, #2 + .else sshr v0.8h, v0.8h, #2 sshr v1.8h, v2.8h, #2 + .endif stp q0, q1, [\dst] .else // put sqrshrun v0.8b, v0.8h, #6 @@ -460,10 +494,12 @@ L(\type\()_8tap_v_\isa): ldr s18, [\src] ldr s21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b -.ifc \isa, neon_dotprod sub v18.16b, v18.16b, v5.16b sub v21.16b, v21.16b, v5.16b .endif @@ -480,8 +516,13 @@ L(\type\()_8tap_v_\isa): \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep subs \h, \h, #2 + .ifc \isa, neon_i8mm + rshrn v0.4h, v0.4s, #2 + rshrn2 v0.8h, v1.4s, #2 + .else shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 + .endif str q0, [\dst], #16 .else uzp1 v0.8h, v0.8h, v1.8h @@ -500,10 +541,12 @@ L(\type\()_8tap_v_\isa): 42: .endif ldr s18, [\src] - +.ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 +.else // neon_dotprod mov v0.16b, v4.16b mov v1.16b, v4.16b -.ifc \isa, neon_dotprod sub v18.16b, v18.16b, v5.16b .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b @@ -515,8 +558,13 @@ L(\type\()_8tap_v_\isa): \dot v1.4s, v19.16b, v7.4b[0] \dot v1.4s, v20.16b, v7.4b[1] .ifc \type, prep + .ifc \isa, neon_i8mm + rshrn v0.4h, v0.4s, #2 + rshrn2 v0.8h, v1.4s, #2 + .else shrn v0.4h, v0.4s, #2 shrn2 v0.8h, v1.4s, #2 + .endif str q0, [\dst] .else uzp1 v0.8h, v0.8h, v1.8h @@ -564,10 +612,12 @@ L(\type\()_8tap_v_\isa): ldr h18, [\src] ldr h21, [\src, \s_strd] add \src, \src, \s_strd, lsl #1 - + .ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + .else // put mov v0.16b, v4.16b mov v1.16b, v4.16b - .ifc \isa, neon_dotprod sub v18.8b, v18.8b, v5.8b sub v21.8b, v21.8b, v5.8b .endif @@ -597,10 +647,12 @@ L(\type\()_8tap_v_\isa): .align JUMP_ALIGN 22: ldr h18, [\src] - + .ifc \isa, neon_i8mm + movi v0.4s, #0 + movi v1.4s, #0 + .else // put mov v0.16b, v4.16b mov v1.16b, v4.16b - .ifc \isa, neon_dotprod sub v18.8b, v18.8b, v5.8b .endif tbl v19.16b, {v16.16b, v17.16b}, v6.16b From 346bb04dc2bd762fe07a3ba7aa5e0a2a1d438787 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 21:54:55 +0200 Subject: [PATCH 13/22] AArch64: Optimize 2D i8mm subpel filters Rewrite the accumulator initializations of the horizontal part of the 2D filters with zero register fills. It can improve the performance on out-of-order CPUs which can fill vector registers by zero with zero latency. Zeroed accumulators imply the usage of the rounding shifts at the end of filters. 
The only exception is the very short *hv_filter4*, where the longer latency of rounding shift could decrease the performance. The *filter8* function uses a different (alternating) dot product computation order for DotProd+ feature level, it gives a better overall performance for out-of-order and some in-order CPU cores. The i8mm version does not need to use bias for the loaded samples, so a different instruction scheduling is beneficial mostly affecting the order of TBL instructions in the 8-tap case. Relative performance of micro benchmarks (lower is better): Cortex-X3: mct_8tap_regular_w16_hv_8bpc_i8mm: 0.982x mct_8tap_sharp_w16_hv_8bpc_i8mm: 0.979x mct_8tap_regular_w8_hv_8bpc_i8mm: 0.972x mct_8tap_sharp_w8_hv_8bpc_i8mm: 0.969x mct_8tap_regular_w4_hv_8bpc_i8mm: 0.942x mct_8tap_sharp_w4_hv_8bpc_i8mm: 0.935x mc_8tap_regular_w16_hv_8bpc_i8mm: 0.988x mc_8tap_sharp_w16_hv_8bpc_i8mm: 0.982x mc_8tap_regular_w8_hv_8bpc_i8mm: 0.981x mc_8tap_sharp_w8_hv_8bpc_i8mm: 0.975x mc_8tap_regular_w4_hv_8bpc_i8mm: 0.998x mc_8tap_sharp_w4_hv_8bpc_i8mm: 0.996x mc_8tap_regular_w2_hv_8bpc_i8mm: 1.006x mc_8tap_sharp_w2_hv_8bpc_i8mm: 0.993x Cortex-A715: mct_8tap_regular_w16_hv_8bpc_i8mm: 0.883x mct_8tap_sharp_w16_hv_8bpc_i8mm: 0.931x mct_8tap_regular_w8_hv_8bpc_i8mm: 0.882x mct_8tap_sharp_w8_hv_8bpc_i8mm: 0.928x mct_8tap_regular_w4_hv_8bpc_i8mm: 0.969x mct_8tap_sharp_w4_hv_8bpc_i8mm: 0.934x mc_8tap_regular_w16_hv_8bpc_i8mm: 0.881x mc_8tap_sharp_w16_hv_8bpc_i8mm: 0.925x mc_8tap_regular_w8_hv_8bpc_i8mm: 0.879x mc_8tap_sharp_w8_hv_8bpc_i8mm: 0.925x mc_8tap_regular_w4_hv_8bpc_i8mm: 0.917x mc_8tap_sharp_w4_hv_8bpc_i8mm: 0.976x mc_8tap_regular_w2_hv_8bpc_i8mm: 0.915x mc_8tap_sharp_w2_hv_8bpc_i8mm: 0.972x Cortex-A510: mct_8tap_regular_w16_hv_8bpc_i8mm: 0.994x mct_8tap_sharp_w16_hv_8bpc_i8mm: 0.949x mct_8tap_regular_w8_hv_8bpc_i8mm: 0.987x mct_8tap_sharp_w8_hv_8bpc_i8mm: 0.947x mct_8tap_regular_w4_hv_8bpc_i8mm: 1.002x mct_8tap_sharp_w4_hv_8bpc_i8mm: 0.999x mc_8tap_regular_w16_hv_8bpc_i8mm: 0.989x mc_8tap_sharp_w16_hv_8bpc_i8mm: 1.003x mc_8tap_regular_w8_hv_8bpc_i8mm: 0.986x mc_8tap_sharp_w8_hv_8bpc_i8mm: 1.000x mc_8tap_regular_w4_hv_8bpc_i8mm: 1.007x mc_8tap_sharp_w4_hv_8bpc_i8mm: 1.000x mc_8tap_regular_w2_hv_8bpc_i8mm: 1.005x mc_8tap_sharp_w2_hv_8bpc_i8mm: 1.000x --- src/arm/64/mc_dotprod.S | 114 ++++++++++++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 27 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index b61ee2623..28f8856cc 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -679,9 +679,7 @@ L(\type\()_8tap_h_hv_\isa): madd \mx, \mx, w11, w9 madd w14, \my, w11, w10 // for HV ldr q28, L(h_tbl_neon_dotprod) -.ifc \isa, neon_i8mm - movi v27.4s, #2 // rounding -.else +.ifc \isa, neon_dotprod mov w13, 0x2002 // FILTER_WEIGHT * 128 + rounding dup v27.4s, w13 // put H overrides this .endif @@ -756,20 +754,27 @@ L(\type\()_8tap_h_hv_\isa): smull v0.4s, v16.4h, v7.h[0] smull2 v1.4s, v16.8h, v7.h[0] mov v16.16b, v17.16b -.ifc \isa, neon_dotprod +.ifc \isa, neon_i8mm + movi v5.4s, #0 + movi v6.4s, #0 + tbl v2.16b, {v23.16b}, v28.16b + tbl v3.16b, {v23.16b}, v29.16b +.else // neon_dotprod sub v23.16b, v23.16b, v24.16b -.endif mov v5.16b, v27.16b mov v6.16b, v27.16b - +.endif smlal v0.4s, v17.4h, v7.h[1] smlal2 v1.4s, v17.8h, v7.h[1] +.ifc \isa, neon_i8mm + tbl v4.16b, {v23.16b}, v30.16b + mov v17.16b, v18.16b +.else // neon_dotprod mov v17.16b, v18.16b - tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b tbl v4.16b, {v23.16b}, v30.16b - +.endif smlal 
v0.4s, v18.4h, v7.h[2] smlal2 v1.4s, v18.8h, v7.h[2] mov v18.16b, v19.16b @@ -794,24 +799,37 @@ L(\type\()_8tap_h_hv_\isa): uzp1 v23.8h, v5.8h, v6.8h .endif mov v21.16b, v22.16b - smlal v0.4s, v22.4h, v7.h[6] smlal2 v1.4s, v22.8h, v7.h[6] +.ifc \isa, neon_i8mm + subs w8, w8, #1 +.endif .ifc \type, prep + .ifc \isa, neon_i8mm + srshr v22.8h, v23.8h, #2 + .else sshr v22.8h, v23.8h, #2 + .endif smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] rshrn v0.4h, v0.4s, #6 rshrn2 v0.8h, v1.4s, #6 .else // put + .ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 + rshrn2 v22.8h, v6.4s, #2 + .else shrn v22.4h, v5.4s, #2 shrn2 v22.8h, v6.4s, #2 + .endif smlal v0.4s, v22.4h, v7.h[7] smlal2 v1.4s, v22.8h, v7.h[7] tbl v0.16b, {v0.16b, v1.16b}, v25.16b sqrshrun v0.8b, v0.8h, #2 .endif +.ifc \isa, neon_dotprod subs w8, w8, #1 +.endif .ifc \type, prep st1 {v0.8h}, [\ldst], \d_strd b.gt 8b @@ -859,8 +877,11 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b +.ifc \isa, neon_i8mm + movi v5.4s, #0 +.else mov v5.16b, v27.16b - +.endif mov v18.16b, v19.16b mov v19.16b, v20.16b @@ -870,10 +891,12 @@ L(\type\()_8tap_h_hv_\isa): \dot v5.4s, v2.16b, v26.4b[0] mov v20.16b, v21.16b mov v21.16b, v22.16b - smlal v0.4s, v22.4h, v7.h[6] +.ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 +.else shrn v22.4h, v5.4s, #2 - +.endif smlal v0.4s, v22.4h, v7.h[7] .ifc \type, prep rshrn v0.4h, v0.4s, #6 @@ -917,14 +940,17 @@ L(\type\()_8tap_h_hv_\isa): smlal v0.4s, v17.4h, v7.h[1] mov v16.16b, v17.16b mov v17.16b, v18.16b -.ifc \isa, neon_dotprod + .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b -.endif + .endif smlal v0.4s, v18.4h, v7.h[2] smlal v0.4s, v19.4h, v7.h[3] tbl v2.16b, {v4.16b}, v28.16b + .ifc \isa, neon_i8mm + movi v5.4s, #0 + .else mov v5.16b, v27.16b - + .endif mov v18.16b, v19.16b mov v19.16b, v20.16b @@ -936,11 +962,14 @@ L(\type\()_8tap_h_hv_\isa): mov v21.16b, v22.16b smlal v0.4s, v22.4h, v7.h[6] + .ifc \isa, neon_i8mm + rshrn v22.4h, v5.4s, #2 + .else shrn v22.4h, v5.4s, #2 - + .endif smlal v0.4s, v22.4h, v7.h[7] - subs \h, \h, #1 + tbl v0.8b, {v0.16b}, v25.8b sqrshrun v0.8b, v0.8h, #2 @@ -995,10 +1024,13 @@ L(\type\()_6tap_hv_\isa): sub v23.16b, v23.16b, v24.16b .endif mov v16.16b, v17.16b - +.ifc \isa, neon_i8mm + movi v5.4s, #0 + movi v6.4s, #0 +.else mov v5.16b, v27.16b mov v6.16b, v27.16b - +.endif tbl v2.16b, {v23.16b}, v28.16b tbl v3.16b, {v23.16b}, v29.16b @@ -1024,8 +1056,11 @@ L(\type\()_6tap_hv_\isa): smlal v0.4s, v20.4h, v7.h[5] smlal2 v1.4s, v20.8h, v7.h[5] +.ifc \isa, neon_i8mm + srshr v20.8h, v23.8h, #2 +.else sshr v20.8h, v23.8h, #2 - +.endif subs w8, w8, #1 smlal v0.4s, v20.4h, v7.h[6] smlal2 v1.4s, v20.8h, v7.h[6] @@ -1050,27 +1085,37 @@ L(\type\()_6tap_hv_\isa): .align FUNC_ALIGN L(\type\()_hv_filter8_\isa): ld1 {v4.16b}, [\lsrc], \s_strd -.ifc \isa, neon_dotprod +.ifc \isa, neon_i8mm + movi v22.4s, #0 + movi v23.4s, #0 +.else // neon_dotprod sub v4.16b, v4.16b, v24.16b -.endif mov v22.16b, v27.16b mov v23.16b, v27.16b +.endif tbl v2.16b, {v4.16b}, v28.16b tbl v3.16b, {v4.16b}, v29.16b tbl v4.16b, {v4.16b}, v30.16b \dot v22.4s, v2.16b, v26.4b[0] - \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v3.16b, v26.4b[0] + \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v4.16b, v26.4b[1] +.ifc \isa, neon_i8mm + uzp1 v22.8h, v22.8h, v23.8h + srshr v22.8h, v22.8h, #2 +.else shrn v22.4h, v22.4s, #2 shrn2 v22.8h, v23.4s, #2 +.endif ret .align FUNC_ALIGN L(\type\()_hv_filter4_\isa): ld1 {v4.8b}, [\src], \s_strd +.ifc \isa, neon_i8mm + movi 
v22.4s, #2 +.else mov v22.16b, v27.16b -.ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b .endif tbl v2.16b, {v4.16b}, v28.16b @@ -1109,14 +1154,21 @@ L(\type\()_hv_filter4_\isa): smlal v0.4s, v18.4h, v7.h[3] smlal v0.4s, v19.4h, v7.h[4] tbl v2.16b, {v4.16b}, v28.16b +.ifc \isa, neon_i8mm + movi v5.4s, #0 +.else mov v5.16b, v27.16b - +.endif mov v18.16b, v19.16b mov v19.16b, v20.16b \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] +.ifc \isa, neon_i8mm + rshrn v20.4h, v5.4s, #2 +.else shrn v20.4h, v5.4s, #2 +.endif subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] .ifc \type, prep @@ -1154,23 +1206,31 @@ L(\type\()_hv_filter4_\isa): smull v0.4s, v16.4h, v7.h[1] smlal v0.4s, v17.4h, v7.h[2] -.ifc \isa, neon_dotprod + .ifc \isa, neon_dotprod sub v4.16b, v4.16b, v24.16b -.endif + .endif mov v16.16b, v17.16b mov v17.16b, v18.16b smlal v0.4s, v18.4h, v7.h[3] smlal v0.4s, v19.4h, v7.h[4] tbl v2.16b, {v4.16b}, v28.16b + .ifc \isa, neon_i8mm + movi v5.4s, #0 + .else mov v5.16b, v27.16b + .endif mov v18.16b, v19.16b mov v19.16b, v20.16b \dot v5.4s, v2.16b, v26.4b[0] smlal v0.4s, v20.4h, v7.h[5] + .ifc \isa, neon_i8mm + rshrn v20.4h, v5.4s, #2 + .else shrn v20.4h, v5.4s, #2 + .endif subs \h, \h, #1 smlal v0.4s, v20.4h, v7.h[6] From 51b63abf744b5f054d4ef6ad2d17ef8e1ad95be9 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Thu, 9 May 2024 11:24:30 +0200 Subject: [PATCH 14/22] AArch64: Optimize the init of DotProd+ 2D subpel filters Removed some unnecessary vector register copies from the initial horizontal filter parts of the HV subpel filters. The performance improvements are better for the smaller filter block sizes. The narrowing shifts were also rewritten at the end of the *filter8* because it was only beneficial for the Cortex-A55 among the DotProd capable CPU cores. On other out-of-order or newer CPUs the UZP1+SHRN instruction combination is better. 
Relative performance of micro benchmarks (lower is better): Cortex-A55: mct regular w4: 0.980x mct regular w8: 1.007x mct regular w16: 1.007x mct sharp w4: 0.983x mct sharp w8: 1.012x mct sharp w16: 1.005x Cortex-A510: mct regular w4: 0.935x mct regular w8: 0.984x mct regular w16: 0.986x mct sharp w4: 0.927x mct sharp w8: 0.983x mct sharp w16: 0.987x Cortex-A78: mct regular w4: 0.974x mct regular w8: 0.988x mct regular w16: 0.991x mct sharp w4: 0.971x mct sharp w8: 0.987x mct sharp w16: 0.979x Cortex-715: mct regular w4: 0.958x mct regular w8: 0.993x mct regular w16: 0.998x mct sharp w4: 0.974x mct sharp w8: 0.991x mct sharp w16: 0.997x Cortex-X1: mct regular w4: 0.983x mct regular w8: 0.993x mct regular w16: 0.996x mct sharp w4: 0.974x mct sharp w8: 0.990x mct sharp w16: 0.995x Cortex-X3: mct regular w4: 0.953x mct regular w8: 0.993x mct regular w16: 0.997x mct sharp w4: 0.981x mct sharp w8: 0.993x mct sharp w16: 0.995x --- src/arm/64/mc_dotprod.S | 110 ++++++++++++++++++++++++---------------- 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/src/arm/64/mc_dotprod.S b/src/arm/64/mc_dotprod.S index 28f8856cc..a4f98a2ca 100644 --- a/src/arm/64/mc_dotprod.S +++ b/src/arm/64/mc_dotprod.S @@ -731,21 +731,37 @@ L(\type\()_8tap_h_hv_\isa): mov \lsrc, \src mov \ldst, \dst mov w8, \h - +.ifc \isa, neon_i8mm bl L(\type\()_hv_filter8_\isa) - mov v16.16b, v22.16b + srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v17.16b, v22.16b + srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v18.16b, v22.16b + srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v19.16b, v22.16b + srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v20.16b, v22.16b + srshr v20.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v21.16b, v22.16b + srshr v21.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - + srshr v22.8h, v22.8h, #2 +.else + bl L(\type\()_hv_filter8_\isa) + sshr v16.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v17.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v18.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v19.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v20.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v21.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v22.8h, v22.8h, #2 +.endif .align LOOP_ALIGN 8: ldr q23, [\lsrc] @@ -850,18 +866,19 @@ L(\type\()_8tap_h_hv_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v21.16b, v22.16b + shrn v21.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) + shrn v22.4h, v22.4s, #2 .align LOOP_ALIGN 4: @@ -919,18 +936,19 @@ L(\type\()_8tap_h_hv_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v21.16b, v22.16b + shrn v21.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) + 
shrn v22.4h, v22.4s, #2 .align LOOP_ALIGN 2: @@ -1001,18 +1019,29 @@ L(\type\()_6tap_hv_\isa): mov \lsrc, \src mov \ldst, \dst mov w8, \h - +.ifc \isa, neon_i8mm bl L(\type\()_hv_filter8_\isa) - mov v16.16b, v22.16b + srshr v16.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v17.16b, v22.16b + srshr v17.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v18.16b, v22.16b + srshr v18.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v19.16b, v22.16b + srshr v19.8h, v22.8h, #2 bl L(\type\()_hv_filter8_\isa) - mov v20.16b, v22.16b - + srshr v20.8h, v22.8h, #2 +.else + bl L(\type\()_hv_filter8_\isa) + sshr v16.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v17.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v18.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v19.8h, v22.8h, #2 + bl L(\type\()_hv_filter8_\isa) + sshr v20.8h, v22.8h, #2 +.endif .align LOOP_ALIGN 8: ldr q23, [\xmy] @@ -1100,13 +1129,7 @@ L(\type\()_hv_filter8_\isa): \dot v23.4s, v3.16b, v26.4b[0] \dot v22.4s, v3.16b, v26.4b[1] \dot v23.4s, v4.16b, v26.4b[1] -.ifc \isa, neon_i8mm uzp1 v22.8h, v22.8h, v23.8h - srshr v22.8h, v22.8h, #2 -.else - shrn v22.4h, v22.4s, #2 - shrn2 v22.8h, v23.4s, #2 -.endif ret .align FUNC_ALIGN @@ -1120,7 +1143,6 @@ L(\type\()_hv_filter4_\isa): .endif tbl v2.16b, {v4.16b}, v28.16b \dot v22.4s, v2.16b, v26.4b[0] - shrn v22.4h, v22.4s, #2 ret .align JUMP_ALIGN @@ -1129,15 +1151,15 @@ L(\type\()_hv_filter4_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 .align LOOP_ALIGN 4: @@ -1190,15 +1212,15 @@ L(\type\()_hv_filter4_\isa): add \src, \src, #2 bl L(\type\()_hv_filter4_\isa) - mov v16.16b, v22.16b + shrn v16.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v17.16b, v22.16b + shrn v17.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v18.16b, v22.16b + shrn v18.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v19.16b, v22.16b + shrn v19.4h, v22.4s, #2 bl L(\type\()_hv_filter4_\isa) - mov v20.16b, v22.16b + shrn v20.4h, v22.4s, #2 .align LOOP_ALIGN 2: From 9b362df677dcbff3cb6ad2da3f5f9ba60235a463 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 16:07:23 +0200 Subject: [PATCH 15/22] AArch64: Optimize BTI landing pads of put_neon Move the BTI landing pads out of the inner loops of put_neon function, the only exception is the width=16 case where it is already outside of the loops. When BTI is enabled, the relative performance of omitting the AARCH64_VALID_JUMP_TARGET from the inner loops on Cortex-A510 (lower is better): w2: 0.981x w4: 0.991x w8: 0.612x w32: 0.687x w64: 0.813x w128: 0.892x Out-of-order CPUs are mostly unaffected. 
--- src/arm/64/mc.S | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 5b493be82..be6a67c71 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -843,8 +843,9 @@ function put_neon, export=1 sub x9, x9, w8, uxtw br x9 -2: +20: AARCH64_VALID_JUMP_TARGET +2: ld1 {v0.h}[0], [x2], x3 ld1 {v1.h}[0], [x2], x3 subs w5, w5, #2 @@ -852,8 +853,9 @@ function put_neon, export=1 st1 {v1.h}[0], [x0], x1 b.gt 2b ret -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v0.s}[0], [x2], x3 ld1 {v1.s}[0], [x2], x3 subs w5, w5, #2 @@ -861,8 +863,9 @@ function put_neon, export=1 st1 {v1.s}[0], [x0], x1 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld1 {v0.8b}, [x2], x3 ld1 {v1.8b}, [x2], x3 subs w5, w5, #2 @@ -884,8 +887,9 @@ function put_neon, export=1 st1 {v1.16b}, [x8], x1 b.gt 16b ret -32: +320: AARCH64_VALID_JUMP_TARGET +32: ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -895,8 +899,9 @@ function put_neon, export=1 add x0, x0, x1 b.gt 32b ret -64: +640: AARCH64_VALID_JUMP_TARGET +64: ldp x6, x7, [x2] ldp x8, x9, [x2, #16] stp x6, x7, [x0] @@ -910,8 +915,9 @@ function put_neon, export=1 add x0, x0, x1 b.gt 64b ret -128: +1280: AARCH64_VALID_JUMP_TARGET +128: ldp q0, q1, [x2] ldp q2, q3, [x2, #32] stp q0, q1, [x0] @@ -927,13 +933,13 @@ function put_neon, export=1 ret L(put_tbl): - .hword L(put_tbl) - 128b - .hword L(put_tbl) - 64b - .hword L(put_tbl) - 32b - .hword L(put_tbl) - 160b - .hword L(put_tbl) - 8b - .hword L(put_tbl) - 4b - .hword L(put_tbl) - 2b + .hword L(put_tbl) - 1280b + .hword L(put_tbl) - 640b + .hword L(put_tbl) - 320b + .hword L(put_tbl) - 160b + .hword L(put_tbl) - 80b + .hword L(put_tbl) - 40b + .hword L(put_tbl) - 20b endfunc From e9f815d473dde2a8f67147c0efe83dc065d08120 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 16:10:54 +0200 Subject: [PATCH 16/22] AArch64: Optimize jump table calculation of put_neon Save a complex arithmetic instruction in the jump table address calculation of put_neon function. --- src/arm/64/mc.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index be6a67c71..fa96f64a7 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -840,7 +840,7 @@ endfunc function put_neon, export=1 adr x9, L(put_tbl) ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + sub x9, x9, x8 br x9 20: From 9ee822a85f035b8c5ef8655e1d439892872066f7 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 16:12:19 +0200 Subject: [PATCH 17/22] AArch64: Optimize put_neon function Optimize the copy part of subpel filters (the put_neon function). For small block sizes (<16) the usage of general purpose registers is usually the best way to do the copy. 
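Rows of 2, 4 or 8 pixels fit in a single general purpose register, so the lane-oriented ld1/st1 accesses can become plain ldrh/ldr loads and strh/str stores, two rows per iteration, with one stride update per pointer. As a sketch, the width=8 inner loop in the diff below takes this shape (x2/x3 source and stride, x0/x1 destination and stride, w5 height):

8:
        ldr             x9,  [x2]               // row 0: 8 pixels in one GPR
        ldr             x10, [x2, x3]           // row 1
        add             x2,  x2,  x3,  lsl #1   // advance the source by two rows
        subs            w5,  w5,  #2
        str             x9,  [x0]
        str             x10, [x0, x1]
        add             x0,  x0,  x1,  lsl #1   // advance the destination by two rows
        b.gt            8b
        ret

The wider cases move in the opposite direction: w32 and w64 switch from GPR pairs to paired q-register loads and stores, as the diff shows.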
Relative performance of micro benchmarks (lower is better): Cortex-A55: w2: 0.991x w4: 0.992x w8: 0.999x w16: 0.875x w32: 0.775x w64: 0.914x w128: 0.998x Cortex-A510: w2: 0.159x w4: 0.080x w8: 0.583x w16: 0.588x w32: 0.966x w64: 1.111x w128: 0.957x Cortex-A76: w2: 0.903x w4: 0.683x w8: 0.944x w16: 0.948x w32: 0.919x w64: 0.855x w128: 0.991x Cortex-A78: w32: 0.867x w64: 0.820x w128: 1.011x Cortex-A715: w32: 0.834x w64: 0.778x w128: 1.000x Cortex-X1: w32: 0.809x w64: 0.762x w128: 1.000x Cortex-X3: w32: 0.733x w64: 0.720x w128: 0.999x --- src/arm/64/mc.S | 108 ++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 53 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index fa96f64a7..68dbbe79a 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -846,89 +846,91 @@ function put_neon, export=1 20: AARCH64_VALID_JUMP_TARGET 2: - ld1 {v0.h}[0], [x2], x3 - ld1 {v1.h}[0], [x2], x3 - subs w5, w5, #2 - st1 {v0.h}[0], [x0], x1 - st1 {v1.h}[0], [x0], x1 + ldrh w9, [x2] + ldrh w10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + strh w9, [x0] + strh w10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 2b ret 40: AARCH64_VALID_JUMP_TARGET 4: - ld1 {v0.s}[0], [x2], x3 - ld1 {v1.s}[0], [x2], x3 - subs w5, w5, #2 - st1 {v0.s}[0], [x0], x1 - st1 {v1.s}[0], [x0], x1 + ldr w9, [x2] + ldr w10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str w9, [x0] + str w10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: - ld1 {v0.8b}, [x2], x3 - ld1 {v1.8b}, [x2], x3 - subs w5, w5, #2 - st1 {v0.8b}, [x0], x1 - st1 {v1.8b}, [x0], x1 + ldr x9, [x2] + ldr x10, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str x9, [x0] + str x10, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET - add x8, x0, x1 - lsl x1, x1, #1 - add x9, x2, x3 - lsl x3, x3, #1 16: - ld1 {v0.16b}, [x2], x3 - ld1 {v1.16b}, [x9], x3 - subs w5, w5, #2 - st1 {v0.16b}, [x0], x1 - st1 {v1.16b}, [x8], x1 + ldr q0, [x2] + ldr q1, [x2, x3] + add x2, x2, x3, lsl #1 + subs w5, w5, #2 + str q0, [x0] + str q1, [x0, x1] + add x0, x0, x1, lsl #1 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET 32: - ldp x6, x7, [x2] - ldp x8, x9, [x2, #16] - stp x6, x7, [x0] - subs w5, w5, #1 - stp x8, x9, [x0, #16] - add x2, x2, x3 - add x0, x0, x1 + ldp q0, q1, [x2] + add x2, x2, x3 + stp q0, q1, [x0] + add x0, x0, x1 + ldp q2, q3, [x2] + add x2, x2, x3 + stp q2, q3, [x0] + subs w5, w5, #2 + add x0, x0, x1 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET 64: - ldp x6, x7, [x2] - ldp x8, x9, [x2, #16] - stp x6, x7, [x0] - ldp x10, x11, [x2, #32] - stp x8, x9, [x0, #16] - subs w5, w5, #1 - ldp x12, x13, [x2, #48] - stp x10, x11, [x0, #32] - stp x12, x13, [x0, #48] - add x2, x2, x3 - add x0, x0, x1 + ldp q0, q1, [x2] + stp q0, q1, [x0] + ldp q2, q3, [x2, #32] + add x2, x2, x3 + stp q2, q3, [x0, #32] + subs w5, w5, #1 + add x0, x0, x1 b.gt 64b ret 1280: AARCH64_VALID_JUMP_TARGET 128: - ldp q0, q1, [x2] - ldp q2, q3, [x2, #32] - stp q0, q1, [x0] - ldp q4, q5, [x2, #64] - stp q2, q3, [x0, #32] - ldp q6, q7, [x2, #96] - subs w5, w5, #1 - stp q4, q5, [x0, #64] - stp q6, q7, [x0, #96] - add x2, x2, x3 - add x0, x0, x1 + ldp q0, q1, [x2] + stp q0, q1, [x0] + ldp q2, q3, [x2, #32] + stp q2, q3, [x0, #32] + ldp q4, q5, [x2, #64] + stp q4, q5, [x0, #64] + ldp q6, q7, [x2, #96] + add x2, x2, x3 + stp q6, q7, [x0, #96] + subs w5, w5, #1 + add x0, x0, x1 b.gt 128b ret From 03610dfef226888d409503ad509cf88342f78371 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 15:42:24 +0200 Subject: 
[PATCH 18/22] AArch64: Optimize BTI landing pads of prep_neon Move the BTI landing pads out of the inner loops of the prep_neon function. Only the width=4 and width=8 cases are affected. If BTI is enabled, moving the AARCH64_VALID_JUMP_TARGET out of the inner loops gives better execution speed on Cortex-A510 relative to the original (lower is better): w4: 0.969x w8: 0.722x Out-of-order cores are not affected. --- src/arm/64/mc.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 68dbbe79a..1ea8aeab9 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -953,8 +953,9 @@ function prep_neon, export=1 sub x9, x9, w8, uxtw br x9 -4: +40: AARCH64_VALID_JUMP_TARGET +4: ld1 {v0.s}[0], [x1], x2 ld1 {v1.s}[0], [x1], x2 subs w4, w4, #2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 st1 {v0.4h, v1.4h}, [x0], #16 b.gt 4b ret -8: +80: AARCH64_VALID_JUMP_TARGET +8: ld1 {v0.8b}, [x1], x2 ld1 {v1.8b}, [x1], x2 subs w4, w4, #2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 st1 {v0.8h, v1.8h}, [x0], #32 b.gt 8b ret @@ -1071,8 +1073,8 @@ L(prep_tbl): .hword L(prep_tbl) - 640b .hword L(prep_tbl) - 320b .hword L(prep_tbl) - 160b - .hword L(prep_tbl) - 8b - .hword L(prep_tbl) - 4b + .hword L(prep_tbl) - 80b + .hword L(prep_tbl) - 40b endfunc From 465916958d507c430a43ebb1e3611cd8c94597e5 Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 15:46:02 +0200 Subject: [PATCH 19/22] AArch64: Optimize jump table calculation of prep_neon Save a complex arithmetic instruction in the jump table address calculation of the prep_neon function. --- src/arm/64/mc.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 1ea8aeab9..02ed1a928 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -950,7 +950,7 @@ function prep_neon, export=1 adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + sub x9, x9, x8 br x9 40: From f3c7b6724497eb9abfb549c6c7a629d3415dd7ed Mon Sep 17 00:00:00 2001 From: Arpad Panyik Date: Wed, 1 May 2024 15:50:51 +0200 Subject: [PATCH 20/22] AArch64: Optimize prep_neon function Optimize the widening copy part of subpel filters (the prep_neon function). In this patch we combine widening shifts with widening multiplications in the inner loops to get maximum throughput. The change will increase .text by 36 bytes.
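The combination works because prep widens each 8-bit pixel to 16 bits and scales it by 16, and for these inputs x << 4 and x * 16 give identical results, so a widening shift (ushll/ushll2 by #4) and a widening multiply (umull/umull2 against a vector holding 16) are interchangeable; sending alternate register pairs through each form lets shifts and multiplies issue in parallel on cores where they use different execution pipes. A rough sketch of the pattern used in the inner loops below, with v24 set to 16 once in the prologue and x1/x2 as the source pointer and stride:

        movi            v24.16b, #16             // constant 16, set up once
        ...
        ldp             q4,  q5,  [x1]           // 32 source pixels
        add             x1,  x1,  x2
        ushll           v0.8h, v4.8b,  #4        // widen and shift left by 4
        ushll2          v1.8h, v4.16b, #4
        umull           v2.8h, v5.8b,  v24.8b    // widen and multiply by 16:
        umull2          v3.8h, v5.16b, v24.16b   // same result via the multiplier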
Relative performance of micro benchmarks (lower is better): Cortex-A55: mct_w4: 0.795x mct_w8: 0.913x mct_w16: 0.912x mct_w32: 0.838x mct_w64: 1.025x mct_w128: 1.002x Cortex-A510: mct_w4: 0.760x mct_w8: 0.636x mct_w16: 0.640x mct_w32: 0.854x mct_w64: 0.864x mct_w128: 0.995x Cortex-A72: mct_w4: 0.616x mct_w8: 0.854x mct_w16: 0.756x mct_w32: 1.052x mct_w64: 1.044x mct_w128: 0.702x Cortex-A76: mct_w4: 0.837x mct_w8: 0.797x mct_w16: 0.841x mct_w32: 0.804x mct_w64: 0.948x mct_w128: 0.904x Cortex-A78: mct_w16: 0.542x mct_w32: 0.725x mct_w64: 0.741x mct_w128: 0.745x Cortex-A715: mct_w16: 0.561x mct_w32: 0.720x mct_w64: 0.740x mct_w128: 0.748x Cortex-X1: mct_w32: 0.886x mct_w64: 0.882x mct_w128: 0.917x Cortex-X3: mct_w32: 0.835x mct_w64: 0.803x mct_w128: 0.808x --- src/arm/64/mc.S | 181 +++++++++++++++++++++++++++--------------------- 1 file changed, 103 insertions(+), 78 deletions(-) diff --git a/src/arm/64/mc.S b/src/arm/64/mc.S index 02ed1a928..736b2bb4e 100644 --- a/src/arm/64/mc.S +++ b/src/arm/64/mc.S @@ -950,6 +950,7 @@ endfunc function prep_neon, export=1 adr x9, L(prep_tbl) ldrh w8, [x9, x8, lsl #1] + movi v24.16b, #16 sub x9, x9, x8 br x9 @@ -957,114 +958,138 @@ function prep_neon, export=1 AARCH64_VALID_JUMP_TARGET 4: ld1 {v0.s}[0], [x1], x2 + ld1 {v0.s}[1], [x1], x2 ld1 {v1.s}[0], [x1], x2 - subs w4, w4, #2 + ld1 {v1.s}[1], [x1], x2 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 - st1 {v0.4h, v1.4h}, [x0], #16 + subs w4, w4, #4 + stp q0, q1, [x0], #32 b.gt 4b ret 80: AARCH64_VALID_JUMP_TARGET 8: - ld1 {v0.8b}, [x1], x2 - ld1 {v1.8b}, [x1], x2 - subs w4, w4, #2 + ldr d0, [x1] + ldr d1, [x1, x2] + add x1, x1, x2, lsl #1 + ldr d2, [x1] + ldr d3, [x1, x2] + add x1, x1, x2, lsl #1 ushll v0.8h, v0.8b, #4 ushll v1.8h, v1.8b, #4 - st1 {v0.8h, v1.8h}, [x0], #32 + umull v2.8h, v2.8b, v24.8b + umull v3.8h, v3.8b, v24.8b + subs w4, w4, #4 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + add x0, x0, #64 b.gt 8b ret 160: AARCH64_VALID_JUMP_TARGET - add x9, x1, x2 - lsl x2, x2, #1 16: - ld1 {v0.16b}, [x1], x2 - ld1 {v1.16b}, [x9], x2 - subs w4, w4, #2 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - st1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64 + ldr q1, [x1] + ldr q3, [x1, x2] + add x1, x1, x2, lsl #1 + ushll v0.8h, v1.8b, #4 + ushll2 v1.8h, v1.16b, #4 + ldr q5, [x1] + ldr q7, [x1, x2] + add x1, x1, x2, lsl #1 + umull v2.8h, v3.8b, v24.8b + umull2 v3.8h, v3.16b, v24.16b + ushll v4.8h, v5.8b, #4 + ushll2 v5.8h, v5.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #4 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 16b ret 320: AARCH64_VALID_JUMP_TARGET - add x8, x0, w3, uxtw 32: - ld1 {v0.16b, v1.16b}, [x1], x2 - subs w4, w4, #2 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ld1 {v2.16b, v3.16b}, [x1], x2 - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - ushll v16.8h, v2.8b, #4 - st1 {v4.8h, v5.8h}, [x0], x7 - ushll2 v17.8h, v2.16b, #4 - st1 {v6.8h, v7.8h}, [x8], x7 - ushll v18.8h, v3.8b, #4 - st1 {v16.8h, v17.8h}, [x0], x7 - ushll2 v19.8h, v3.16b, #4 - st1 {v18.8h, v19.8h}, [x8], x7 + ldp q4, q5, [x1] + add x1, x1, x2 + ldp q6, q7, [x1] + add x1, x1, x2 + ushll v0.8h, v4.8b, #4 + ushll2 v1.8h, v4.16b, #4 + umull v2.8h, v5.8b, v24.8b + umull2 v3.8h, v5.16b, v24.16b + ushll v4.8h, v6.8b, #4 + ushll2 v5.8h, v6.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #2 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp 
q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 32b ret 640: AARCH64_VALID_JUMP_TARGET - add x8, x0, #32 - mov x6, #64 64: - ldp q0, q1, [x1] - subs w4, w4, #1 - ushll v4.8h, v0.8b, #4 - ushll2 v5.8h, v0.16b, #4 - ldp q2, q3, [x1, #32] - ushll v6.8h, v1.8b, #4 - ushll2 v7.8h, v1.16b, #4 - add x1, x1, x2 - ushll v16.8h, v2.8b, #4 - st1 {v4.8h, v5.8h}, [x0], x6 - ushll2 v17.8h, v2.16b, #4 - ushll v18.8h, v3.8b, #4 - st1 {v6.8h, v7.8h}, [x8], x6 - ushll2 v19.8h, v3.16b, #4 - st1 {v16.8h, v17.8h}, [x0], x6 - st1 {v18.8h, v19.8h}, [x8], x6 + ldp q4, q5, [x1] + ldp q6, q7, [x1, #32] + add x1, x1, x2 + ushll v0.8h, v4.8b, #4 + ushll2 v1.8h, v4.16b, #4 + umull v2.8h, v5.8b, v24.8b + umull2 v3.8h, v5.16b, v24.16b + ushll v4.8h, v6.8b, #4 + ushll2 v5.8h, v6.16b, #4 + umull v6.8h, v7.8b, v24.8b + umull2 v7.8h, v7.16b, v24.16b + subs w4, w4, #1 + stp q0, q1, [x0] + stp q2, q3, [x0, #32] + stp q4, q5, [x0, #64] + stp q6, q7, [x0, #96] + add x0, x0, #128 b.gt 64b ret 1280: AARCH64_VALID_JUMP_TARGET - add x8, x0, #64 - mov x6, #128 128: - ldp q0, q1, [x1] - ldp q2, q3, [x1, #32] - ushll v16.8h, v0.8b, #4 - ushll2 v17.8h, v0.16b, #4 - ushll v18.8h, v1.8b, #4 - ushll2 v19.8h, v1.16b, #4 - ushll v20.8h, v2.8b, #4 - ushll2 v21.8h, v2.16b, #4 - ldp q4, q5, [x1, #64] - st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x6 - ushll v22.8h, v3.8b, #4 - ushll2 v23.8h, v3.16b, #4 - ushll v24.8h, v4.8b, #4 - ushll2 v25.8h, v4.16b, #4 - ushll v26.8h, v5.8b, #4 - ushll2 v27.8h, v5.16b, #4 - ldp q6, q7, [x1, #96] - st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x8], x6 - ushll v28.8h, v6.8b, #4 - ushll2 v29.8h, v6.16b, #4 - ushll v30.8h, v7.8b, #4 - ushll2 v31.8h, v7.16b, #4 - subs w4, w4, #1 - add x1, x1, x2 - st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x6 - st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x8], x6 + ldp q28, q29, [x1] + ldp q30, q31, [x1, #32] + ushll v16.8h, v28.8b, #4 + ushll2 v17.8h, v28.16b, #4 + umull v18.8h, v29.8b, v24.8b + umull2 v19.8h, v29.16b, v24.16b + ushll v20.8h, v30.8b, #4 + ushll2 v21.8h, v30.16b, #4 + umull v22.8h, v31.8b, v24.8b + umull2 v23.8h, v31.16b, v24.16b + ldp q28, q29, [x1, #64] + ldp q30, q31, [x1, #96] + add x1, x1, x2 + stp q16, q17, [x0] + stp q18, q19, [x0, #32] + stp q20, q21, [x0, #64] + stp q22, q23, [x0, #96] + ushll v16.8h, v28.8b, #4 + ushll2 v17.8h, v28.16b, #4 + umull v18.8h, v29.8b, v24.8b + umull2 v19.8h, v29.16b, v24.16b + ushll v20.8h, v30.8b, #4 + ushll2 v21.8h, v30.16b, #4 + umull v22.8h, v31.8b, v24.8b + umull2 v23.8h, v31.16b, v24.16b + subs w4, w4, #1 + stp q16, q17, [x0, #128] + stp q18, q19, [x0, #160] + stp q20, q21, [x0, #192] + stp q22, q23, [x0, #224] + add x0, x0, #256 b.gt 128b ret From 3ae38b3d643cc9bcbb5572b721ee9cf50c8ba0ad Mon Sep 17 00:00:00 2001 From: Frank Bossen Date: Mon, 17 Jun 2024 18:33:41 -0400 Subject: [PATCH 21/22] Port C code changes to Rust --- build.rs | 19 ++++++++++- include/common/bitdepth.rs | 10 ++++-- src/mc.rs | 67 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 3 deletions(-) diff --git a/build.rs b/build.rs index 3062cf902..5f6bc6f81 100644 --- a/build.rs +++ b/build.rs @@ -103,6 +103,13 @@ mod asm { if let Arch::Arm(arch) = arch { define(Define::bool("ARCH_ARM", arch == ArchArm::Arm32)); define(Define::bool("ARCH_AARCH64", arch == ArchArm::Arm64)); + + if arch == ArchArm::Arm64 { + define(Define::bool("HAVE_DOTPROD", features.contains("dotprod"))); + } + if arch == ArchArm::Arm64 { + define(Define::bool("HAVE_I8MM", features.contains("i8mm"))); + } } if let Arch::X86(arch) = arch { @@ -199,6 +206,7 
@@ mod asm { ][..]; let arm_generic = &["itx", "msac", "refmvs", "looprestoration_common"][..]; + let arm_dotprod = &["mc_dotprod"][..]; let arm_bpc8 = &[ "cdef", "filmgrain", @@ -243,11 +251,20 @@ mod asm { #[cfg(feature = "bitdepth_16")] arm_bpc16, ][..]; + let arm64_all = &[ + arm_generic, + arm_dotprod, + #[cfg(feature = "bitdepth_8")] + arm_bpc8, + #[cfg(feature = "bitdepth_16")] + arm_bpc16, + ][..]; let asm_file_names = match arch { Arch::X86(ArchX86::X86_32) => x86_all, Arch::X86(ArchX86::X86_64) => x86_64_all, - Arch::Arm(..) => arm_all, + Arch::Arm(ArchArm::Arm32) => arm_all, + Arch::Arm(ArchArm::Arm64) => arm64_all, }; let asm_file_dir = match arch { diff --git a/include/common/bitdepth.rs b/include/common/bitdepth.rs index 002d4e201..398a29019 100644 --- a/include/common/bitdepth.rs +++ b/include/common/bitdepth.rs @@ -457,7 +457,10 @@ macro_rules! bd_fn { /// /// Similar to [`bd_fn!`] except that it selects which [`BitDepth`] `fn` /// based on `$bpc:literal bpc` instead of `$BD:ty`. -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] +#[cfg(all( + feature = "asm", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] macro_rules! bpc_fn { ($bpc:literal bpc, $name:ident, $asm:ident) => {{ use $crate::include::common::bitdepth::fn_identity; @@ -487,7 +490,10 @@ macro_rules! fn_identity { ))] pub(crate) use bd_fn; -#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] +#[cfg(all( + feature = "asm", + any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64") +))] pub(crate) use bpc_fn; #[allow(unused)] diff --git a/src/mc.rs b/src/mc.rs index 7834afad3..bc4db6f4a 100644 --- a/src/mc.rs +++ b/src/mc.rs @@ -46,6 +46,9 @@ use crate::include::common::bitdepth::bd_fn; #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] use crate::include::common::bitdepth::{bpc_fn, BPC}; +#[cfg(all(feature = "asm", target_arch = "aarch64"))] +use crate::include::common::bitdepth::bpc_fn; + #[inline(never)] fn put_rust( dst: Rav1dPictureDataComponentOffset, @@ -2300,6 +2303,70 @@ impl Rav1dMCDSPContext { self.warp8x8t = bd_fn!(warp8x8t::decl_fn, BD, warp_affine_8x8t, neon); self.emu_edge = bd_fn!(emu_edge::decl_fn, BD, emu_edge, neon); + #[cfg(target_feature = "dotprod")] + if BD::BITDEPTH == 8 { + if !flags.contains(CpuFlags::DOTPROD) { + return self; + } + + self.mc = enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular, neon_dotprod), + RegularSmooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular_smooth, neon_dotprod), + RegularSharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular_sharp, neon_dotprod), + SmoothRegular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth_regular, neon_dotprod), + Smooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth, neon_dotprod), + SmoothSharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth_sharp, neon_dotprod), + SharpRegular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp_regular, neon_dotprod), + SharpSmooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp_smooth, neon_dotprod), + Sharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp, neon_dotprod), + Bilinear => bpc_fn!(mc::decl_fn, 8 bpc, put_bilin, neon), + }); + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular, neon_dotprod), + RegularSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_smooth, neon_dotprod), + RegularSharp8Tap => 
bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_sharp, neon_dotprod), + SmoothRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_regular, neon_dotprod), + Smooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth, neon_dotprod), + SmoothSharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_sharp, neon_dotprod), + SharpRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_regular, neon_dotprod), + SharpSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_smooth, neon_dotprod), + Sharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp, neon_dotprod), + Bilinear => bpc_fn!(mct::decl_fn, 8 bpc, prep_bilin, neon), + }); + } + + #[cfg(target_feature = "i8mm")] + if BD::BITDEPTH == 8 { + if !flags.contains(CpuFlags::I8MM) { + return self; + } + + self.mc = enum_map!(Filter2d => mc::Fn; match key { + Regular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular, neon_i8mm), + RegularSmooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular_smooth, neon_i8mm), + RegularSharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_regular_sharp, neon_i8mm), + SmoothRegular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth_regular, neon_i8mm), + Smooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth, neon_i8mm), + SmoothSharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_smooth_sharp, neon_i8mm), + SharpRegular8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp_regular, neon_i8mm), + SharpSmooth8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp_smooth, neon_i8mm), + Sharp8Tap => bpc_fn!(mc::decl_fn, 8 bpc, put_8tap_sharp, neon_i8mm), + Bilinear => bpc_fn!(mc::decl_fn, 8 bpc, put_bilin, neon), + }); + self.mct = enum_map!(Filter2d => mct::Fn; match key { + Regular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular, neon_i8mm), + RegularSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_smooth, neon_i8mm), + RegularSharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_regular_sharp, neon_i8mm), + SmoothRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_regular, neon_i8mm), + Smooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth, neon_i8mm), + SmoothSharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_smooth_sharp, neon_i8mm), + SharpRegular8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_regular, neon_i8mm), + SharpSmooth8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp_smooth, neon_i8mm), + Sharp8Tap => bpc_fn!(mct::decl_fn, 8 bpc, prep_8tap_sharp, neon_i8mm), + Bilinear => bpc_fn!(mct::decl_fn, 8 bpc, prep_bilin, neon), + }); + } + self } From 6b87a77489f2c2bfb652f1446c9b1012932ade14 Mon Sep 17 00:00:00 2001 From: Frank Bossen Date: Wed, 17 Jul 2024 18:40:04 -0400 Subject: [PATCH 22/22] Specify `armv8.6-a` architecture when building aarch64 This enables building code requiring `i8mm` ISA extension --- build.rs | 13 +++++++------ src/mc.rs | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/build.rs b/build.rs index 5f6bc6f81..4cb7c48ea 100644 --- a/build.rs +++ b/build.rs @@ -105,10 +105,8 @@ mod asm { define(Define::bool("ARCH_AARCH64", arch == ArchArm::Arm64)); if arch == ArchArm::Arm64 { - define(Define::bool("HAVE_DOTPROD", features.contains("dotprod"))); - } - if arch == ArchArm::Arm64 { - define(Define::bool("HAVE_I8MM", features.contains("i8mm"))); + define(Define::bool("HAVE_DOTPROD", true)); + define(Define::bool("HAVE_I8MM", true)); } } @@ -308,8 +306,11 @@ mod asm { } cc.compile(rav1dasm); } else { - cc::Build::new() - .files(asm_file_paths) + let mut cc = cc::Build::new(); + if arch == Arch::Arm(ArchArm::Arm64) { + 
cc.flag("-march=armv8.6-a"); + } + cc.files(asm_file_paths) .include(".") .include(&out_dir) .debug(cfg!(debug_assertions)) diff --git a/src/mc.rs b/src/mc.rs index bc4db6f4a..538aa7546 100644 --- a/src/mc.rs +++ b/src/mc.rs @@ -2303,7 +2303,7 @@ impl Rav1dMCDSPContext { self.warp8x8t = bd_fn!(warp8x8t::decl_fn, BD, warp_affine_8x8t, neon); self.emu_edge = bd_fn!(emu_edge::decl_fn, BD, emu_edge, neon); - #[cfg(target_feature = "dotprod")] + #[cfg(target_arch = "aarch64")] if BD::BITDEPTH == 8 { if !flags.contains(CpuFlags::DOTPROD) { return self; @@ -2335,7 +2335,7 @@ impl Rav1dMCDSPContext { }); } - #[cfg(target_feature = "i8mm")] + #[cfg(target_arch = "aarch64")] if BD::BITDEPTH == 8 { if !flags.contains(CpuFlags::I8MM) { return self;