diff --git a/api/docs/release.dox b/api/docs/release.dox index 96d552fb4a9..cc044060dbd 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -212,12 +212,15 @@ Further non-compatibility-affecting changes include: #dynamorio::drmemtrace::analysis_tool_t to allow the tool to make holistic adjustments to the interval snapshots after all have been generated, and before they are used for merging across shards (potentially), and printing the results. + - Added opnd_is_vector_base_disp() to test if an opnd_t is a base+disp memory operand + that uses a vector register for the base or index register. - Added -abort_on_invariant_error flag that instructs the invariant checker drmemtrace analysis tool to abort trace analysis when a trace invariant error is found. This is set to true by default to match the existing behavior of the invariant checker. - Added a new instr API instr_is_xrstor() that tells whether an instruction is any variant of the x86 xrstor opcode. + **************************************************
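As a usage sketch for the new query (illustrative only, not part of this patch): a client can combine opnd_is_vector_base_disp() with the existing instr_compute_address_ex() loop already used for x86 VSIB to enumerate every address an SVE scatter/gather touches. The helper name and the surrounding instrumentation context below are assumptions; the sketch assumes a client built against dr_api.h, that "instr" is the decoded application instruction (e.g. from a clean call inserted before it executes), and that DR_MC_ALL is requested so DR_MC_MULTIMEDIA (needed for the SVE Z/P state) is included.

/* Hypothetical sketch: walk the addresses a scatter/gather instruction will access. */
static void
print_vector_addresses(void *drcontext, instr_t *instr)
{
    if (!instr_is_gather(instr) && !instr_is_scatter(instr))
        return;
    /* Scatter/gather memory operands are base+disp operands whose base and/or
     * index register is a vector register.
     */
    bool uses_vector_addressing = false;
    for (int i = 0; i < instr_num_srcs(instr); i++) {
        opnd_t op = instr_get_src(instr, i);
        if (opnd_is_memory_reference(op) && opnd_is_vector_base_disp(op))
            uses_vector_addressing = true;
    }
    for (int i = 0; i < instr_num_dsts(instr); i++) {
        opnd_t op = instr_get_dst(instr, i);
        if (opnd_is_memory_reference(op) && opnd_is_vector_base_disp(op))
            uses_vector_addressing = true;
    }
    if (!uses_vector_addressing)
        return;
    dr_mcontext_t mc;
    mc.size = sizeof(mc);
    mc.flags = DR_MC_ALL; /* Includes DR_MC_MULTIMEDIA, required for vector addressing. */
    if (!dr_get_mcontext(drcontext, &mc))
        return;
    /* One address per active element; the call returns false once the index is
     * past the number of addresses this instruction accesses.
     */
    app_pc addr;
    bool is_write;
    for (uint index = 0; instr_compute_address_ex(instr, &mc, index, &addr, &is_write);
         index++) {
        dr_fprintf(STDERR, "element %u: %p (%s)\n", index, addr,
                   is_write ? "write" : "read");
    }
}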
diff --git a/core/arch/arch.h b/core/arch/arch.h index 2de3d2915c9..35f2855a65e 100644 --- a/core/arch/arch.h +++ b/core/arch/arch.h @@ -156,6 +156,9 @@ mixed_mode_enabled(void) # define SCRATCH_REG4_OFFS R4_OFFSET # define SCRATCH_REG5_OFFS R5_OFFSET # define REG_OFFSET(reg) (R0_OFFSET + ((reg)-DR_REG_R0) * sizeof(reg_t)) +# define Z_REG_OFFSET(reg) \ + ((MC_OFFS) + \ + (offsetof(priv_mcontext_t, simd) + ((reg)-DR_REG_Z0) * sizeof(dr_simd_t))) # define CALL_SCRATCH_REG DR_REG_R11 # define MC_IBL_REG r2 # define MC_RETVAL_REG r0 diff --git a/core/ir/aarch64/codec.c b/core/ir/aarch64/codec.c index bd3330f9fcf..ffe676cc09a 100644 --- a/core/ir/aarch64/codec.c +++ b/core/ir/aarch64/codec.c @@ -1032,7 +1032,7 @@ get_elements_in_sve_vector(aarch64_reg_offset element_size) { const uint element_length = opnd_size_in_bits(get_opnd_size_from_offset(element_size)); - return opnd_size_in_bits(OPSZ_SVE_VL_BYTES) / element_length; + return opnd_size_in_bits(OPSZ_SVE_VECLEN_BYTES) / element_length; } /******************************************************************************* @@ -5195,7 +5195,8 @@ decode_opnd_svemem_gpr_simm6_vl(uint enc, int opcode, byte *pc, OUT opnd_t *opnd const int offset = extract_int(enc, 16, 6); IF_RETURN_FALSE(offset < -32 || offset > 31) const reg_id_t rn = decode_reg(extract_uint(enc, 5, 5), true, true); - const opnd_size_t mem_transfer = op_is_prefetch(opcode) ? OPSZ_0 : OPSZ_SVE_VL_BYTES; + const opnd_size_t mem_transfer = + op_is_prefetch(opcode) ? OPSZ_0 : OPSZ_SVE_VECLEN_BYTES; /* As specified in the AArch64 SVE reference manual for contiguous prefetch * instructions, the immediate index value is a vector index into memory, NOT @@ -5214,7 +5215,8 @@ static inline bool encode_opnd_svemem_gpr_simm6_vl(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out) { - const opnd_size_t mem_transfer = op_is_prefetch(opcode) ? OPSZ_0 : OPSZ_SVE_VL_BYTES; + const opnd_size_t mem_transfer = + op_is_prefetch(opcode) ? OPSZ_0 : OPSZ_SVE_VECLEN_BYTES; if (!opnd_is_base_disp(opnd) || opnd_get_index(opnd) != DR_REG_NULL || opnd_get_size(opnd) != mem_transfer) return false; @@ -5344,7 +5346,8 @@ decode_opnd_svemem_gpr_simm9_vl(uint enc, int opcode, byte *pc, OUT opnd_t *opnd bool is_vector = TEST(1u << 14, enc); /* Transfer size depends on whether we are transferring a Z or a P register. */ - opnd_size_t memory_transfer_size = is_vector ? OPSZ_SVE_VL_BYTES : OPSZ_SVE_PL_BYTES; + opnd_size_t memory_transfer_size = + is_vector ? OPSZ_SVE_VECLEN_BYTES : OPSZ_SVE_PREDLEN_BYTES; /* As specified in the AArch64 SVE reference manual for unpredicated vector * register load LDR and store STR instructions, the immediate index value is a @@ -5374,7 +5377,8 @@ encode_opnd_svemem_gpr_simm9_vl(uint enc, int opcode, byte *pc, opnd_t opnd, bool is_vector = TEST(1u << 14, enc); /* Transfer size depends on whether we are transferring a Z or a P register. */ - opnd_size_t memory_transfer_size = is_vector ? OPSZ_SVE_VL_BYTES : OPSZ_SVE_PL_BYTES; + opnd_size_t memory_transfer_size = + is_vector ? 
OPSZ_SVE_VECLEN_BYTES : OPSZ_SVE_PREDLEN_BYTES; if (!opnd_is_base_disp(opnd) || opnd_get_size(opnd) != memory_transfer_size) return false; diff --git a/core/ir/aarch64/codec.h b/core/ir/aarch64/codec.h index 81de59b069f..b5d47b86236 100644 --- a/core/ir/aarch64/codec.h +++ b/core/ir/aarch64/codec.h @@ -57,21 +57,6 @@ encode_common(byte *pc, instr_t *i, decode_info_t *di); #define BITS(_enc, bitmax, bitmin) \ ((((uint32)(_enc)) >> (bitmin)) & (uint32)MASK((bitmax) - (bitmin) + 1)) -#if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) && !defined(BUILD_TESTS) -# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes()) -# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes() / 8) -#else -/* SVE vector length for off-line decoder set using -vl option with drdisas, - * e.g. - * $ drdisas -vl 256 e58057a1 85865e6b - * e58057a1 str %z1 -> +0x05(%x29)[32byte] - * 85865e6b ldr +0x37(%x19)[32byte] -> %z11 - * $ - */ -# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8) -# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8) -#endif - #define RETURN_FALSE \ do { \ CLIENT_ASSERT(false, "Unexpected state in AArch64 codec"); \ diff --git a/core/ir/aarch64/instr.c b/core/ir/aarch64/instr.c index 63a0ec57f2c..80e6917fa65 100644 --- a/core/ir/aarch64/instr.c +++ b/core/ir/aarch64/instr.c @@ -1,6 +1,6 @@ /* ********************************************************** * Copyright (c) 2017-2023 Google, Inc. All rights reserved. - * Copyright (c) 2016 ARM Limited. All rights reserved. + * Copyright (c) 2016-2024 ARM Limited. All rights reserved. * **********************************************************/ /* @@ -37,6 +37,8 @@ #include "encode_api.h" #include "opcode_names.h" +#include + /* XXX i#6690: currently only A64 is supported for instruction encoding. * We want to add support for A64 decoding and synthetic ISA encoding as well. 
* XXX i#1684: move this function to core/ir/instr_shared.c once we can support @@ -447,7 +449,7 @@ reg_is_gpr(reg_id_t reg) bool reg_is_simd(reg_id_t reg) { - return (DR_REG_Q0 <= reg && reg <= DR_REG_B31); + return reg_is_z(reg) || (DR_REG_Q0 <= reg && reg <= DR_REG_B31); } bool @@ -737,3 +739,122 @@ instr_invert_predicate(dr_pred_type_t pred) default: CLIENT_ASSERT(false, "Incorrect predicate value"); return DR_PRED_NONE; } } + +ptr_int_t +d_r_compute_scaled_index_aarch64(opnd_t opnd, reg_t index_val) +{ + bool scaled = false; + uint amount = 0; + dr_extend_type_t type = opnd_get_index_extend(opnd, &scaled, &amount); + reg_t extended = 0; + uint msb = 0; + switch (type) { + default: CLIENT_ASSERT(false, "Unsupported extend type"); return 0; + case DR_EXTEND_UXTW: extended = index_val & 0x00000000ffffffffULL; break; + case DR_EXTEND_SXTW: + extended = index_val & 0x00000000ffffffffULL; + msb = extended >> 31u; + if (msb == 1) { + extended = ((~0ull) << 32u) | extended; + } + break; + case DR_EXTEND_UXTX: + case DR_EXTEND_SXTX: extended = index_val; break; + } + if (scaled) { + return extended << amount; + } else { + return extended; + } +} + +static bool +is_active_in_mask(size_t element, uint64 mask, size_t element_size_bytes) +{ + const uint64 element_flag = 1ull << (element_size_bytes * element); + return TESTALL(element_flag, mask); +} + +bool +instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, + dr_mcontext_flags_t mc_flags, opnd_t curop, uint addr_index, + DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, + DR_PARAM_OUT bool *write) +{ + CLIENT_ASSERT(have_addr != NULL && addr != NULL && mc != NULL && write != NULL, + "SVE address computation: invalid args"); + CLIENT_ASSERT(TEST(DR_MC_MULTIMEDIA, mc_flags), + "dr_mcontext_t.flags must include DR_MC_MULTIMEDIA"); + CLIENT_ASSERT(mc_size >= offsetof(dr_mcontext_t, svep) + sizeof(mc->svep), + "Incompatible client, invalid dr_mcontext_t.size."); + + *write = instr_is_scatter(instr); + ASSERT(*write || instr_is_gather(instr)); + + const size_t vl_bytes = opnd_size_in_bytes(OPSZ_SVE_VECLEN_BYTES); + /* DynamoRIO currently supports up to 512-bit vector registers so a predicate register + * value should be <= 64-bits. + * If DynamoRIO is extended in the future to support large vector lengths this + * function will need to be updated to cope with larger predicate mask values. + */ + ASSERT(vl_bytes / 8 < sizeof(uint64)); + + const reg_t governing_pred = opnd_get_reg(instr_get_src(instr, 1)); + ASSERT(governing_pred >= DR_REG_START_P && governing_pred <= DR_REG_STOP_P); + uint64 mask = mc->svep[governing_pred - DR_REG_START_P].d; + + if (mask == 0) { + return false; + } + + const size_t element_size_bytes = + opnd_size_in_bytes(opnd_get_vector_element_size(curop)); + const size_t num_elements = vl_bytes / element_size_bytes; + + size_t active_elements_found = 0; + for (size_t element = 0; element < num_elements; element++) { + if (is_active_in_mask(element, mask, element_size_bytes)) { + active_elements_found++; + if (active_elements_found - 1 == addr_index) { + const reg_t base_reg = opnd_get_base(curop); + if (reg_is_z(base_reg)) { + /* Vector base: extract the current element. */ + size_t base_reg_num = base_reg - DR_REG_START_Z; + if (element_size_bytes == 4) { + *addr = (app_pc)(reg_t)mc->simd[base_reg_num].u32[element]; + } else { + ASSERT(element_size_bytes == 8); + *addr = (app_pc)mc->simd[base_reg_num].u64[element]; + } + } else { + /* Scalar base. 
*/ + *addr = (app_pc)reg_get_value_priv(base_reg, mc); + } + + const reg_t index_reg = opnd_get_index(curop); + reg_t unscaled_index_val = 0; + if (reg_is_z(index_reg)) { + /* Vector index: extract the current element. */ + size_t index_reg_num = index_reg - DR_REG_START_Z; + if (element_size_bytes == 4) { + unscaled_index_val = mc->simd[index_reg_num].u32[element]; + } else { + ASSERT(element_size_bytes == 8); + unscaled_index_val = mc->simd[index_reg_num].u64[element]; + } + } else { + /* Scalar index or no index. */ + unscaled_index_val = reg_get_value_priv(index_reg, mc); + } + + *have_addr = true; + *addr += d_r_compute_scaled_index_aarch64(curop, unscaled_index_val); + *addr += opnd_get_disp(curop); + + return addr_index < num_elements; + } + } + } + + return false; +} diff --git a/core/ir/aarchxx/opnd.c b/core/ir/aarchxx/opnd.c index d4d121a30b8..934779f6b36 100644 --- a/core/ir/aarchxx/opnd.c +++ b/core/ir/aarchxx/opnd.c @@ -63,6 +63,8 @@ opnd_get_reg_dcontext_offs(reg_id_t reg) return R0_OFFSET + (R1_OFFSET - R0_OFFSET) * (reg - DR_REG_W0); if (reg == DR_REG_XSP || reg == DR_REG_WSP) return XSP_OFFSET; + if (DR_REG_Z0 <= reg && reg <= DR_REG_Z31) + return Z_REG_OFFSET(reg); CLIENT_ASSERT(false, "opnd_get_reg_dcontext_offs: invalid reg"); return -1; #else diff --git a/core/ir/arm/instr.c b/core/ir/arm/instr.c index 08c7e90014f..30994552a45 100644 --- a/core/ir/arm/instr.c +++ b/core/ir/arm/instr.c @@ -909,7 +909,7 @@ DR_API bool instr_is_scatter(instr_t *instr) { - /* XXX i#3837: no scatter-store on ARM? */ + /* No scatter-store on AArch32. */ return false; } @@ -917,6 +917,16 @@ DR_API bool instr_is_gather(instr_t *instr) { - /* XXX i#3837: no gather-load on ARM? */ + /* No gather-load on AArch32. */ + return false; +} + +bool +instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, + dr_mcontext_flags_t mc_flags, opnd_t curop, uint addr_index, + DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, + DR_PARAM_OUT bool *write) +{ + CLIENT_ASSERT(false, "There are no AArch32 instructions that use vector addressing"); return false; } diff --git a/core/ir/instr.h b/core/ir/instr.h index 76dc3a02c82..de35fd1b3d1 100644 --- a/core/ir/instr.h +++ b/core/ir/instr.h @@ -676,11 +676,19 @@ int instr_length_arch(dcontext_t *dcontext, instr_t *instr); bool opc_is_not_a_real_memory_load(int opc); + +/* Compute the index-th address for a memory operand that uses a vector register for the + * base or index register. + * The return value has the same semantics as instr_compute_address_ex(). It returns: + * true if index is in bounds and an address was calculated and returned, + * false if index >= the number of addresses this instruction accesses. + */ bool -instr_compute_address_VSIB(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, - dr_mcontext_flags_t mc_flags, opnd_t curop, uint index, - DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, - DR_PARAM_OUT bool *write); +instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, + dr_mcontext_flags_t mc_flags, opnd_t curop, uint index, + DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, + DR_PARAM_OUT bool *write); + uint instr_branch_type(instr_t *cti_instr); #ifdef AARCH64 diff --git a/core/ir/instr_api.h b/core/ir/instr_api.h index 44068e84c43..b68f9706178 100644 --- a/core/ir/instr_api.h +++ b/core/ir/instr_api.h @@ -1648,7 +1648,8 @@ DR_API * write is returned in \p is_write. Either or both OUT variables can * be NULL. 
* \p mc->flags must include DR_MC_CONTROL and DR_MC_INTEGER. - * For instructions that use vector addressing (VSIB, introduced in AVX2), + * For instructions that use vector addressing (x86 VSIB, introduced in AVX2, or + * AArch64 scatter/gather instructions introduced in SVE/SVE2), * mc->flags must additionally include DR_MC_MULTIMEDIA. * * Like instr_reads_memory(), this routine does not consider diff --git a/core/ir/instr_shared.c b/core/ir/instr_shared.c index f44ed2979ce..c7f76766bd3 100644 --- a/core/ir/instr_shared.c +++ b/core/ir/instr_shared.c @@ -2655,20 +2655,16 @@ instr_compute_address_helper(instr_t *instr, priv_mcontext_t *mc, size_t mc_size for (i = 0; i < instr_num_dsts(instr); i++) { curop = instr_get_dst(instr, i); if (opnd_is_memory_reference(curop)) { - if (opnd_is_vsib(curop)) { -#ifdef X86 - if (instr_compute_address_VSIB(instr, mc, mc_size, mc_flags, curop, index, - &have_addr, addr, &write)) { - CLIENT_ASSERT( - write, - "VSIB found in destination but instruction is not a scatter"); + if (opnd_is_vector_base_disp(curop)) { + if (instr_compute_vector_address(instr, mc, mc_size, mc_flags, curop, + index, &have_addr, addr, &write)) { + CLIENT_ASSERT(write, + "Vector address found in destination but instruction " + "is not a scatter"); break; } else { return false; } -#else - CLIENT_ASSERT(false, "VSIB should be x86-only"); -#endif } memcount++; if (memcount == (int)index) { @@ -2683,16 +2679,12 @@ instr_compute_address_helper(instr_t *instr, priv_mcontext_t *mc, size_t mc_size for (i = 0; i < instr_num_srcs(instr); i++) { curop = instr_get_src(instr, i); if (opnd_is_memory_reference(curop)) { - if (opnd_is_vsib(curop)) { -#ifdef X86 - if (instr_compute_address_VSIB(instr, mc, mc_size, mc_flags, curop, - index, &have_addr, addr, &write)) + if (opnd_is_vector_base_disp(curop)) { + if (instr_compute_vector_address(instr, mc, mc_size, mc_flags, curop, + index, &have_addr, addr, &write)) break; else return false; -#else - CLIENT_ASSERT(false, "VSIB should be x86-only"); -#endif } memcount++; if (memcount == (int)index) diff --git a/core/ir/opnd.h b/core/ir/opnd.h index 4d90ac18369..f440b90e7ac 100644 --- a/core/ir/opnd.h +++ b/core/ir/opnd.h @@ -196,6 +196,12 @@ opnd_compute_address_helper(opnd_t opnd, priv_mcontext_t *mc, ptr_int_t scaled_i bool opnd_is_abs_base_disp(opnd_t opnd); +#if defined(AARCH64) +/* Internal function shared with vector address calculation */ +ptr_int_t +d_r_compute_scaled_index_aarch64(opnd_t opnd, reg_t index_val); +#endif + #ifndef STANDALONE_DECODER opnd_t opnd_create_dcontext_field(dcontext_t *dcontext, int offs); @@ -339,4 +345,29 @@ extern reg_id_t dr_reg_stolen; extern reg_id_t dr_reg_stolen; #endif +#ifdef AARCH64 +# if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) && \ + !defined(BUILD_TESTS) +/* Size of the SVE Z vector registers in bytes. */ +# define OPSZ_SVE_VECLEN_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes()) +/* Size of the SVE P predicate registers in bytes. */ +# define OPSZ_SVE_PREDLEN_BYTES \ + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8) +# else +/* SVE vector length for off-line decoder set using dr_set_sve_vector_length() or -vl + * option with drdisas, + * e.g. + * $ drdisas -vl 256 e58057a1 85865e6b + * e58057a1 str %z1 -> +0x05(%x29)[32byte] + * 85865e6b ldr +0x37(%x19)[32byte] -> %z11 + * $ + */ +/* Size of the SVE Z vector registers in bytes. */ +# define OPSZ_SVE_VECLEN_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8) +/* Size of the SVE P predicate registers in bytes. 
*/ +# define OPSZ_SVE_PREDLEN_BYTES \ + opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8) +# endif +#endif /*AARCH64*/ + #endif /* _OPND_H_ */ diff --git a/core/ir/opnd_api.h b/core/ir/opnd_api.h index b62a5807440..9fd3b19817a 100644 --- a/core/ir/opnd_api.h +++ b/core/ir/opnd_api.h @@ -1119,6 +1119,10 @@ enum { DR_REG_STOP_32 = DR_REG_WSP, /**< End of 32-bit general register enum values */ DR_REG_START_GPR = DR_REG_X0, /**< Start of full-size general-purpose registers */ DR_REG_STOP_GPR = DR_REG_XSP, /**< End of full-size general-purpose registers */ + DR_REG_START_Z = DR_REG_Z0, /**< Start of Z scalable vector registers */ + DR_REG_STOP_Z = DR_REG_Z31, /**< End of Z scalable vector registers */ + DR_REG_START_P = DR_REG_P0, /**< Start of P scalable predicate registers */ + DR_REG_STOP_P = DR_REG_P15, /**< End of P scalable predicate registers */ # else DR_REG_START_32 = DR_REG_R0, /**< Start of 32-bit general register enum values */ DR_REG_STOP_32 = DR_REG_R15, /**< End of 32-bit general register enum values */ @@ -1128,7 +1132,8 @@ enum { DR_NUM_GPR_REGS = DR_REG_STOP_GPR - DR_REG_START_GPR + 1, /**< Count of GPR regs. */ # ifdef AARCH64 - DR_NUM_SIMD_VECTOR_REGS = DR_REG_Z31 - DR_REG_Z0 + 1, /**< Count of SIMD regs. */ + DR_NUM_SIMD_VECTOR_REGS = + DR_REG_STOP_Z - DR_REG_START_Z + 1, /**< Count of SIMD regs. */ # else /* XXX: maybe we want more distinct names that provide counts for 64-bit D or 32-bit * S registers. */ @@ -2604,6 +2609,14 @@ DR_API bool opnd_is_vsib(opnd_t opnd); +DR_API +/** + * Returns true iff \p opnd is a base+disp memory reference operand which uses vector + * registers. + */ +bool +opnd_is_vector_base_disp(opnd_t opnd); + DR_API /** * Returns true iff \p opnd is a (near or far) absolute address operand. */ @@ -3262,7 +3275,13 @@ opnd_is_reg_64bit(opnd_t opnd); DR_API /** * Assumes that \p reg is a DR_REG_ constant. - * Returns true iff it refers to a pointer-sized general-purpose register. + * Returns true iff it refers to a pointer-sized register. \p reg is a general + * purpose register for all architectures apart from AArch64. For AArch64, \p + * reg can also be a scalable vector (SVE) Z register. Although Z registers are + * supported from 128 to 512 bits in length on DynamoRIO, addressing uses 32 or + * 64 bit elements of a vector for scatter/gather instructions, e.g. + * LD1SB Z0.D, P0/Z, [Z1.D]. See also issue + * 6750 */ bool reg_is_pointer_sized(reg_id_t reg); diff --git a/core/ir/opnd_shared.c b/core/ir/opnd_shared.c index f78d56a8d6e..176f060ff15 100644 --- a/core/ir/opnd_shared.c +++ b/core/ir/opnd_shared.c @@ -183,6 +183,13 @@ opnd_is_vsib(opnd_t op) reg_is_strictly_zmm(opnd_get_index(op)))); } +bool +opnd_is_vector_base_disp(opnd_t op) +{ + return opnd_is_base_disp(op) && + (reg_is_simd(opnd_get_base(op)) || reg_is_simd(opnd_get_index(op))); +} + bool opnd_is_reg_32bit(opnd_t opnd) { @@ -232,7 +239,15 @@ bool reg_is_pointer_sized(reg_id_t reg) { #ifdef X64 +# ifdef AARCH64 + /* XXX i#6750: We need to generalize reg_{to,is}_pointer_sized() for non-GPR + * registers. Change names or add new keeping old or update docs?
+ */ + return (reg >= DR_REG_Z0 && reg <= DR_REG_Z31) || + (reg >= REG_START_64 && reg <= REG_STOP_64); +# else return (reg >= REG_START_64 && reg <= REG_STOP_64); +# endif #else return (reg >= REG_START_32 && reg <= REG_STOP_32); #endif @@ -2211,6 +2226,21 @@ reg_get_value_ex(reg_id_t reg, dr_mcontext_t *mc, DR_PARAM_OUT byte *val) reg_t regval = reg_get_value(reg, mc); *(reg_t *)val = regval; } +#elif defined(AARCH64) + if (reg >= DR_REG_START_Z && reg <= DR_REG_STOP_Z) { + if (!TEST(DR_MC_MULTIMEDIA, mc->flags) || mc->size != sizeof(dr_mcontext_t)) + return false; + memcpy(val, &mc->simd[reg - DR_REG_START_Z], + opnd_size_in_bytes(reg_get_size(reg))); + } else if (reg >= DR_REG_START_P && reg <= DR_REG_STOP_P) { + if (!TEST(DR_MC_MULTIMEDIA, mc->flags) || mc->size != sizeof(dr_mcontext_t)) + return false; + memcpy(val, &mc->svep[reg - DR_REG_START_P], + opnd_size_in_bytes(reg_get_size(reg))); + } else { + reg_t regval = reg_get_value(reg, mc); + *(reg_t *)val = regval; + } #else CLIENT_ASSERT(false, "NYI i#1551"); #endif @@ -2334,30 +2364,8 @@ opnd_compute_address_priv(opnd_t opnd, priv_mcontext_t *mc) ptr_int_t scale = opnd_get_scale(opnd); scaled_index = scale * reg_get_value_priv(index, mc); #elif defined(AARCH64) - bool scaled = false; - uint amount = 0; - dr_extend_type_t type = opnd_get_index_extend(opnd, &scaled, &amount); - reg_t index_val = reg_get_value_priv(index, mc); - reg_t extended = 0; - uint msb = 0; - switch (type) { - default: CLIENT_ASSERT(false, "Unsupported extend type"); return NULL; - case DR_EXTEND_UXTW: extended = (index_val << (63u - 31u)) >> (63u - 31u); break; - case DR_EXTEND_SXTW: - extended = (index_val << (63u - 31u)) >> (63u - 31u); - msb = extended >> 31u; - if (msb == 1) { - extended = ((~0ull) << 32u) | extended; - } - break; - case DR_EXTEND_UXTX: - case DR_EXTEND_SXTX: extended = index_val; break; - } - if (scaled) { - scaled_index = extended << amount; - } else { - scaled_index = extended; - } + scaled_index = + d_r_compute_scaled_index_aarch64(opnd, reg_get_value_priv(index, mc)); #elif defined(ARM) uint amount; dr_shift_type_t type = opnd_get_index_shift(opnd, &amount); @@ -2758,14 +2766,10 @@ reg_get_size(reg_id_t reg) if (reg >= DR_REG_MDCCSR_EL0 && reg <= DR_REG_SPSR_FIQ) return OPSZ_8; if (reg >= DR_REG_Z0 && reg <= DR_REG_Z31) { -# if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) - return opnd_size_from_bytes(proc_get_vector_length_bytes()); -# else - return OPSZ_SCALABLE; -# endif + return OPSZ_SVE_VECLEN_BYTES; } if ((reg >= DR_REG_P0 && reg <= DR_REG_P15) || reg == DR_REG_FFR) - return OPSZ_SCALABLE_PRED; + return OPSZ_SVE_PREDLEN_BYTES; if (reg == DR_REG_CNTVCT_EL0) return OPSZ_8; if (reg >= DR_REG_NZCV && reg <= DR_REG_FPSR) diff --git a/core/ir/riscv64/instr.c b/core/ir/riscv64/instr.c index 6c3cf6c22fb..31e3a3882af 100644 --- a/core/ir/riscv64/instr.c +++ b/core/ir/riscv64/instr.c @@ -519,3 +519,14 @@ instr_is_gather(instr_t *instr) ASSERT_NOT_IMPLEMENTED(false); return false; } + +bool +instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, + dr_mcontext_flags_t mc_flags, opnd_t curop, uint addr_index, + DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, + DR_PARAM_OUT bool *write) +{ + /* FIXME i#3544: Not implemented */ + ASSERT_NOT_IMPLEMENTED(false); + return false; +} diff --git a/core/ir/x86/instr.c b/core/ir/x86/instr.c index b9cf49eea69..31438fa047d 100644 --- a/core/ir/x86/instr.c +++ b/core/ir/x86/instr.c @@ -410,10 +410,10 @@ instr_compute_VSIB_index(bool *selected 
DR_PARAM_OUT, app_pc *result DR_PARAM_OU } bool -instr_compute_address_VSIB(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, - dr_mcontext_flags_t mc_flags, opnd_t curop, uint index, - DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, - DR_PARAM_OUT bool *write) +instr_compute_vector_address(instr_t *instr, priv_mcontext_t *mc, size_t mc_size, + dr_mcontext_flags_t mc_flags, opnd_t curop, uint index, + DR_PARAM_OUT bool *have_addr, DR_PARAM_OUT app_pc *addr, + DR_PARAM_OUT bool *write) { /* We assume that any instr w/ a VSIB opnd has no other * memory reference (and the VSIB is a source)! Else we'll diff --git a/core/lib/globals_api.h b/core/lib/globals_api.h index 5988379cb35..659e32d0a2b 100644 --- a/core/lib/globals_api.h +++ b/core/lib/globals_api.h @@ -701,12 +701,15 @@ typedef uint64 dr_opmask_t; */ # ifdef X64 typedef union ALIGN_VAR(16) _dr_simd_t { - byte b; /**< Byte (8 bit, Bn) scalar element of Vn, Zn, or Pn. */ - ushort h; /**< Halfword (16 bit, Hn) scalar element of Vn, Zn and Pn. */ - uint s; /**< Singleword (32 bit, Sn) scalar element of Vn, Zn and Pn. */ - uint64 d; /**< Doubleword (64 bit, Dn) scalar element of Vn, Zn and Pn. */ - uint q[4]; /**< The full 128 bit Vn register, Qn as q[3]:q[2]:q[1]:q[0]. */ - uint u32[16]; /**< The full 512 bit Zn, Pn and FFR registers. */ + byte b; /**< Byte (8 bit, Bn) scalar element of Vn, Zn, or Pn. */ + ushort h; /**< Halfword (16 bit, Hn) scalar element of Vn, Zn and Pn. */ + uint s; /**< Singleword (32 bit, Sn) scalar element of Vn, Zn and Pn. */ + uint64 d; /**< Doubleword (64 bit, Dn) scalar element of Vn, Zn and Pn. */ + uint q[4]; /**< The full 128 bit Vn register, Qn as q[3]:q[2]:q[1]:q[0]. */ + uint u32[16]; /**< The full 512 bit Zn, Pn and FFR registers as Singleword (32-bit) + elements. */ + uint64 u64[8]; /**< The full 512 bit Zn, Pn and FFR registers as Doubleword (64-bit) + elements. */ } dr_simd_t; # else typedef union _dr_simd_t { diff --git a/core/unix/include/sigcontext.h b/core/unix/include/sigcontext.h index b4acbbdcfb0..82ad8f1c5cc 100644 --- a/core/unix/include/sigcontext.h +++ b/core/unix/include/sigcontext.h @@ -332,6 +332,25 @@ typedef struct _kernel_sigcontext_t { unsigned char __reserved[4096] __attribute__((__aligned__(16))); } kernel_sigcontext_t; +/* + * Allocation of 4k bytes of __reserved[]: + * (Note: records do not necessarily occur in the order shown here.) + * + * size description + * + * 528 fpsimd_context + * 16 esr_context (not used in DynamoRIO) + * 16 sve_context + * 32 extra_context + * 16 terminator (null _aarch64_ctx) + * + * 3488 (reserved for future allocation) + * + * The above table documents the maximum set and sizes of records that can be + * generated for userspace. New records which exceed this space will need to + * implement a mechanism to handle expanded signal frames. + */ + /* XXX: These defines come from the system include files for a regular * build (signal.h is included), but for DR_HOST_NOT_TARGET we need * them defined here. Probably what we should do is rename them so @@ -341,8 +360,9 @@ typedef struct _kernel_sigcontext_t { /* * Header to be used at the beginning of structures extending the user * context. Such structures must be placed after the rt_sigframe on the stack - * and be 16-byte aligned. The last structure must be a dummy one with the - * magic and size set to 0. + * and be 16-byte aligned. The last structure must be a null terminator context + * with a magic number and size set to 0. The magic number is one of the + * *_MAGIC constants, see below. 
The null terminator's magic number is 0. */ struct _aarch64_ctx { __u32 magic; @@ -352,26 +372,72 @@ struct _aarch64_ctx { # define FPSIMD_MAGIC 0x46508001 struct fpsimd_context { - struct _aarch64_ctx head; - __u32 fpsr; - __u32 fpcr; - __uint128_t vregs[32]; + struct _aarch64_ctx head; /* 8 bytes */ + __u32 fpsr; /* 4 bytes */ + __u32 fpcr; /* 4 bytes */ + __uint128_t vregs[32]; /* 512 bytes */ }; -/* TODO i#5365: Storage of sve_context in kernel_sigcontext_t.__reserved, see - * above. See also sigcontext_to_mcontext_simd() and - * mcontext_to_sigcontext_simd(). +/* Storage of sve_context in kernel_sigcontext_t.__reserved, see above. See + * also sigcontext_to_mcontext_simd() and mcontext_to_sigcontext_simd(). */ # define SVE_MAGIC 0x53564501 struct sve_context { + struct _aarch64_ctx head; /* 8 bytes */ + __u16 vl; /* 2 bytes */ + __u16 __reserved[3]; /* 6 bytes */ +}; + +/* + * extra_context: describes extra space in the signal frame for + * additional structures that don't fit in sigcontext.__reserved[]. + * + * Note: + * + * 1) fpsimd_context, esr_context and extra_context must be placed in + * sigcontext.__reserved[] if present. They cannot be placed in the + * extra space. Any other record can be placed either in the extra + * space or in sigcontext.__reserved[], unless otherwise specified in + * this file. + * + * 2) There must not be more than one extra_context. + * + * 3) If extra_context is present, it must be followed immediately in + * sigcontext.__reserved[] by the terminating null _aarch64_ctx. + * + * 4) The extra space to which datap points must start at the first + * 16-byte aligned address immediately after the terminating null + * _aarch64_ctx that follows the extra_context structure in + * __reserved[]. The extra space may overrun the end of __reserved[], + * as indicated by a sufficiently large value for the size field. + * + * 5) The extra space must itself be terminated with a null + * _aarch64_ctx. + */ +# define EXTRA_MAGIC 0x45585401 + +struct extra_context { + struct _aarch64_ctx head; /* 8 bytes */ + __u64 datap; /* 8 bytes. 16-byte aligned pointer to extra space cast to __u64 */ + __u32 size; /* 4 bytes. size in bytes of the extra space */ + __u32 __reserved[3]; /* 12 bytes */ +}; + +# define ESR_MAGIC 0x45535201 + +struct esr_context { struct _aarch64_ctx head; - __u16 vl; - __u16 __reserved[3]; + __u64 esr; }; + # endif +/* SVE helper macro. */ +# define BYTES_PER_QUADWORD 16 /* A quadword is 128 bits. */ +# define sve_vecquad_from_veclen(veclen) ((veclen) / BYTES_PER_QUADWORD) + #endif /* AARCH64 */ #ifdef RISCV64 diff --git a/core/unix/signal_linux_aarch64.c b/core/unix/signal_linux_aarch64.c index 585365930a2..abd9fe92b64 100644 --- a/core/unix/signal_linux_aarch64.c +++ b/core/unix/signal_linux_aarch64.c @@ -54,21 +54,162 @@ save_fpstate(dcontext_t *dcontext, sigframe_rt_t *frame) } #ifdef DEBUG +/* Representation of quadwords (128 bits) as 2 doublewords. 
*/ +typedef union { + __uint128_t as_128; + struct { + uint64 lo; + uint64 hi; + } as_2x64; +} reinterpret128_2x64_t; + void dump_sigcontext(dcontext_t *dcontext, sigcontext_t *sc) { +# ifdef DR_HOST_NOT_TARGET + ASSERT_NOT_REACHED(); +# endif + LOG(THREAD, LOG_ASYNCH, 1, "\tSignal context:\n"); int i; for (i = 0; i <= DR_REG_X30 - DR_REG_X0; i++) LOG(THREAD, LOG_ASYNCH, 1, "\tx%-2d = " PFX "\n", i, sc->regs[i]); LOG(THREAD, LOG_ASYNCH, 1, "\tsp = " PFX "\n", sc->sp); LOG(THREAD, LOG_ASYNCH, 1, "\tpc = " PFX "\n", sc->pc); LOG(THREAD, LOG_ASYNCH, 1, "\tpstate = " PFX "\n", sc->pstate); + LOG(THREAD, LOG_ASYNCH, 1, "\n"); + + struct _aarch64_ctx *head = (struct _aarch64_ctx *)sc->__reserved; + ASSERT(head->magic == FPSIMD_MAGIC); + ASSERT(head->size == sizeof(struct fpsimd_context)); + + struct fpsimd_context *fpsimd = (struct fpsimd_context *)sc->__reserved; + LOG(THREAD, LOG_ASYNCH, 2, "\tfpsr 0x%x\n", fpsimd->fpsr); + LOG(THREAD, LOG_ASYNCH, 2, "\tfpcr 0x%x\n", fpsimd->fpcr); + reinterpret128_2x64_t vreg; + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + vreg.as_128 = fpsimd->vregs[i]; + LOG(THREAD, LOG_ASYNCH, 2, "\tq%-2d 0x%016lx %016lx\n", i, vreg.as_2x64.hi, + vreg.as_2x64.lo); + } + LOG(THREAD, LOG_ASYNCH, 2, "\n"); + +# ifndef DR_HOST_NOT_TARGET + if (proc_has_feature(FEATURE_SVE)) { + size_t offset = sizeof(struct fpsimd_context); + struct _aarch64_ctx *next_head = (struct _aarch64_ctx *)(sc->__reserved + offset); + while (next_head->magic != 0) { + switch (next_head->magic) { + case ESR_MAGIC: break; + case EXTRA_MAGIC: break; + case SVE_MAGIC: { + const struct sve_context *sve = (struct sve_context *)(next_head); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE vector length %d bytes\n", sve->vl); + ASSERT(sve->vl == proc_get_vector_length_bytes()); + const unsigned int quads_per_vector = sve_vecquad_from_veclen(sve->vl); + /* The size and offset macros below are used to access register + * state from memory. These are defined in the kernel's + * sigcontext.h. For scalable vectors, we typically deal in + * units of bytes and quadwords (128 bits). All scalable + * vectors are multiples of 128 bits. For the purposes of signal + * context handling, these are the simplest and most consistent + * units for calculating sizes and offsets in order to access + * scalable register state in memory. + */ + LOG(THREAD, LOG_ASYNCH, 2, "\tQuadwords (128 bits) per vector %d\n\n", + quads_per_vector); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_ZREG_SIZE %d\n", + SVE_SIG_ZREG_SIZE(quads_per_vector)); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_PREG_SIZE %d\n", + SVE_SIG_PREG_SIZE(quads_per_vector)); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_FFR_SIZE %d\n", + SVE_SIG_FFR_SIZE(quads_per_vector)); + LOG(THREAD, LOG_ASYNCH, 2, "\tsve->head.size %d\n\n", sve->head.size); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_ZREGS_OFFSET %d\n", + SVE_SIG_ZREGS_OFFSET); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_PREGS_OFFSET %d\n", + SVE_SIG_PREGS_OFFSET(quads_per_vector)); + LOG(THREAD, LOG_ASYNCH, 2, "\tSVE_SIG_FFR_OFFSET %d\n\n", + SVE_SIG_FFR_OFFSET(quads_per_vector)); + + dr_simd_t z; + int boff; /* Byte offset for each doubleword in a vector. */ + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + LOG(THREAD, LOG_ASYNCH, 2, "\tz%-2d 0x", i); + for (boff = ((quads_per_vector * 2) - 1); boff >= 0; boff--) { + /* We access data in the scalable vector using the kernel's + * SVE_SIG_ZREG_OFFSET macro which gives the byte offset into a + * vector based on units of 128 bits (quadwords).
In this loop we + * offset from the start of struct sve_context. We log the data + * as 64 bit ints, so 2 per quadword (2 x 64 bits -> 128 bits). + * + * For example, for a 256 bit vector (2 quadwords (2 x 128 bits), + * 4 doublewords (4 x 64 bits)), the byte offset (boff) for each + * scalable vector register is: + * boff=3 z.u64[3] = sve + SVE_SIG_ZREG_OFFSET + 24 + * boff=2 z.u64[2] = sve + SVE_SIG_ZREG_OFFSET + 16 + * boff=1 z.u64[1] = sve + SVE_SIG_ZREG_OFFSET + 8 + * boff=0 z.u64[0] = sve + SVE_SIG_ZREG_OFFSET + * + * Note that at present we support little endian only. + * All major Linux arm64 kernel distributions are + * little-endian. + */ + z.u64[boff] = *((uint64 *)(( + ((byte *)sve) + (SVE_SIG_ZREG_OFFSET(quads_per_vector, i)) + + (boff * 8)))); + LOG(THREAD, LOG_ASYNCH, 2, "%016lx ", z.u64[boff]); + } + LOG(THREAD, LOG_ASYNCH, 2, "\n"); + } + LOG(THREAD, LOG_ASYNCH, 2, "\n"); + /* We access data in predicate and first-fault registers using + * the kernel's SVE_SIG_PREG_OFFSET and SVE_SIG_FFR_OFFSET + * macros. SVE predicate and FFR registers are an 1/8th the + * size of SVE vector registers (1 bit per byte) and are logged + * as 32 bit ints. + */ + dr_simd_t p; + for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + p.u32[i] = *((uint32 *)((byte *)sve + + SVE_SIG_PREG_OFFSET(quads_per_vector, i))); + LOG(THREAD, LOG_ASYNCH, 2, "\tp%-2d 0x%08lx\n", i, p.u32[i]); + } + LOG(THREAD, LOG_ASYNCH, 2, "\n"); + LOG(THREAD, LOG_ASYNCH, 2, "\tFFR 0x%08lx\n\n", + *((uint32 *)((byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector)))); + break; + } + default: + SYSLOG_INTERNAL_WARNING("%s %d Unknown section found in signal context " + "with magic number 0x%x", + __func__, __LINE__, next_head->magic); + break; + } + offset += next_head->size; + next_head = (struct _aarch64_ctx *)(sc->__reserved + offset); + } + } +# endif } #endif /* DEBUG */ +/* Representation of quadword (128 bits) as 4 words, used for SIMD. */ +typedef union { + __uint128_t as_128; + struct { + uint32 lowest; + uint32 lo; + uint32 hi; + uint32 highest; + } as_4x32; +} reinterpret128_4x32_t; + void sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) { +#ifdef DR_HOST_NOT_TARGET + ASSERT_NOT_REACHED(); +#endif struct fpsimd_context *fpc = (struct fpsimd_context *)sc_full->fp_simd_state; if (fpc == NULL) return; @@ -77,30 +218,116 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) mc->fpsr = fpc->fpsr; mc->fpcr = fpc->fpcr; ASSERT((sizeof(mc->simd->q) * MCXT_NUM_SIMD_SVE_SLOTS) == sizeof(fpc->vregs)); - memcpy(&mc->simd, &fpc->vregs, sizeof(mc->simd)); - /* TODO i#5365: memcpy(&mc->simd->u32,...) - * See also sve_context in core/unix/include/sigcontext.h. - */ + int i; + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + memcpy(&mc->simd[i].q, &fpc->vregs[i], sizeof(mc->simd->q)); + } + +#ifndef DR_HOST_NOT_TARGET + if (proc_has_feature(FEATURE_SVE)) { + size_t offset = sizeof(struct fpsimd_context); + /* fpsimd_context is always the first section. After that the esr_context, + * extra_context and sve_context sections can be in any order. 
+ */ + struct _aarch64_ctx *next_head = + (struct _aarch64_ctx *)(sc_full->sc->__reserved + offset); + while (next_head->magic != 0) { + ASSERT(next_head->magic == ESR_MAGIC || next_head->magic == SVE_MAGIC || + next_head->magic == EXTRA_MAGIC); + switch (next_head->magic) { + case ESR_MAGIC: break; + case EXTRA_MAGIC: break; + case SVE_MAGIC: { + const struct sve_context *sve = (struct sve_context *)(next_head); + ASSERT(sve->vl == proc_get_vector_length_bytes()); + const unsigned int quads_per_vector = sve_vecquad_from_veclen(sve->vl); + if (sve->head.size != sizeof(struct sve_context)) { + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + /* SVE specifies that AArch64's SIMD&FP registers + * (V0-V31) which hold FP scalars and NEON 128-bit + * vectors overlay the bottom 128 bits of the SVE + * registers (Z0-Z31). For backward compatibility + * reasons, bits 0->127 of Z0-Z31 are always restored + * from the corresponding members of fpsimd_context's + * vregs and not from sve_context. + */ + memcpy(&mc->simd[i].u32, + (byte *)sve + SVE_SIG_ZREG_OFFSET(quads_per_vector, i), + sve->vl); + memcpy(&mc->simd[i].q, &fpc->vregs[i], sizeof(mc->simd->q)); + } + for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + memcpy(&mc->svep[i].u32, + (byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i), + sve->vl); + } + memcpy(&mc->ffr, (byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), + sve->vl); + } + break; + } + default: + SYSLOG_INTERNAL_WARNING("%s %d Unhandled section with magic number 0x%x", + __func__, __LINE__, next_head->magic); + } + offset += next_head->size; + next_head = (struct _aarch64_ctx *)(sc_full->sc->__reserved + offset); + } + } +#endif } void mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) { +#ifdef DR_HOST_NOT_TARGET + ASSERT_NOT_REACHED(); +#endif + /* sig_full_initialize() will have set the fp_simd_state pointer in the + * user level machine context's (uc_mcontext) to __reserved. + */ struct fpsimd_context *fpc = (struct fpsimd_context *)sc_full->fp_simd_state; if (fpc == NULL) return; - struct _aarch64_ctx *next = (void *)((char *)fpc + sizeof(struct fpsimd_context)); fpc->head.magic = FPSIMD_MAGIC; fpc->head.size = sizeof(struct fpsimd_context); fpc->fpsr = mc->fpsr; fpc->fpcr = mc->fpcr; ASSERT(sizeof(fpc->vregs) == (sizeof(mc->simd->q) * MCXT_NUM_SIMD_SVE_SLOTS)); - memcpy(&fpc->vregs, &mc->simd, sizeof(fpc->vregs)); - /* TODO i#5365: memcpy(..., &mc->simd->u32) - * See also sve_context in core/unix/include/sigcontext.h. 
- */ - next->magic = 0; - next->size = 0; + int i; + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + memcpy(&fpc->vregs[i], &mc->simd[i].u32[0], sizeof(fpc->vregs[i])); + } + +#ifndef DR_HOST_NOT_TARGET + if (proc_has_feature(FEATURE_SVE)) { + struct _aarch64_ctx *esr = (void *)((byte *)fpc + sizeof(struct fpsimd_context)); + esr->magic = ESR_MAGIC; + esr->size = sizeof(struct esr_context); + + struct sve_context *sve = (void *)((byte *)esr + sizeof(struct esr_context)); + sve->head.magic = SVE_MAGIC; + sve->vl = proc_get_vector_length_bytes(); + const uint quads_per_vector = sve_vecquad_from_veclen(sve->vl); + sve->head.size = ALIGN_FORWARD(SVE_SIG_CONTEXT_SIZE(quads_per_vector), 16); + for (uint i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + memcpy((byte *)sve + SVE_SIG_ZREG_OFFSET(quads_per_vector, i), + &mc->simd[i].u32, sve->vl); + } + for (uint i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + memcpy((byte *)sve + SVE_SIG_PREG_OFFSET(quads_per_vector, i), + &mc->svep[i].u32, sve->vl); + } + memcpy((byte *)sve + SVE_SIG_FFR_OFFSET(quads_per_vector), &mc->ffr, sve->vl); + + size_t offset = (proc_get_vector_length_bytes() * MCXT_NUM_SIMD_SVE_SLOTS) + + ((proc_get_vector_length_bytes() / 8) * MCXT_NUM_SVEP_SLOTS) + 16; + struct _aarch64_ctx *null = + (void *)((byte *)sve + sizeof(struct sve_context) + offset); + null->magic = 0; + null->size = 0; + } +#endif } size_t diff --git a/suite/runsuite_wrapper.pl b/suite/runsuite_wrapper.pl index ac130846498..bd1027ca6a9 100755 --- a/suite/runsuite_wrapper.pl +++ b/suite/runsuite_wrapper.pl @@ -338,6 +338,7 @@ ); # FIXME i#2417: fix flaky/regressed AArch64 tests %ignore_failures_64 = ('code_api|linux.sigsuspend' => 1, + 'code_api|linux.thread-reset' => 1, # i#6741 'code_api|pthreads.pthreads_exit' => 1, 'code_api|tool.histogram.offline' => 1, # i#3980 'code_api|linux.fib-conflict' => 1, diff --git a/suite/tests/api/opnd-a64.c b/suite/tests/api/opnd-a64.c index 5278196e87f..aed49964759 100644 --- a/suite/tests/api/opnd-a64.c +++ b/suite/tests/api/opnd-a64.c @@ -1,5 +1,5 @@ /* ********************************************************** - * Copyright (c) 2018 Arm Limited. All rights reserved. + * Copyright (c) 2018 - 2024 Arm Limited. All rights reserved. * **********************************************************/ /* @@ -35,6 +35,7 @@ #include "configure.h" #include "dr_api.h" #include +#include #define ASSERT(x) \ ((void)((!(x)) ? (fprintf(stderr, "ASSERT FAILURE: %s:%d: %s\n", __FILE__, __LINE__, \ @@ -67,7 +68,8 @@ test_get_size() } } - opnd_size_t opsz_vl = OPSZ_NA; + opnd_size_t opsz_veclen = OPSZ_NA; /* Length of a Z vector register in bytes */ + opnd_size_t opsz_predlen = OPSZ_NA; /* Length of a P predicate register in bytes */ if (proc_has_feature(FEATURE_SVE)) { /* Check sizes of SVE vector and predicate registers. Read vector length * directly from hardware and compare with OPSZ_ value reg_get_size() @@ -80,18 +82,20 @@ test_get_size() : "=r"(vl) : : "x0"); - opsz_vl = opnd_size_from_bytes(vl); + opsz_veclen = opnd_size_from_bytes(vl); + opsz_predlen = opnd_size_from_bytes(vl / 8); } else { /* Set vector length to 256 bits for unit tests on non-SVE hardware. */ - opsz_vl = OPSZ_32; + ASSERT(dr_get_sve_vector_length() == 256); + opsz_veclen = OPSZ_32; + opsz_predlen = OPSZ_4; } for (uint i = 0; i < 32; i++) { - ASSERT(reg_get_size((reg_id_t)DR_REG_Z0 + i) == opsz_vl); + ASSERT(reg_get_size((reg_id_t)DR_REG_Z0 + i) == opsz_veclen); } - /* TODO i#5365: Check sizes of SVE predicate regs. 
*/ for (uint i = 0; i < 16; i++) { - ASSERT(reg_get_size((reg_id_t)DR_REG_P0 + i) == OPSZ_SCALABLE_PRED); + ASSERT(reg_get_size((reg_id_t)DR_REG_P0 + i) == opsz_predlen); } } @@ -303,6 +307,480 @@ test_opnd_invert_immed_int() #endif } +typedef struct _vector_address_test_expectation_t { + app_pc *addresses; + uint num_addresses; + bool is_write; +} vector_address_test_expectation_t; + +void +test_compute_vector_address_helper(void *drcontext, instr_t *instr, dr_mcontext_t *mc, + const vector_address_test_expectation_t *expected, + uint line) +{ + bool printed_instr = false; +#define TEST_FAILED() \ + do { \ + if (!printed_instr) { \ + printf("%s:%u:\n", __FILE__, line); \ + dr_print_instr(drcontext, STDOUT, instr, \ + "Failed to compute addresses for:\n"); \ + printed_instr = true; \ + } \ + } while (0) + +#define EXPECT_CMP(cmp, fmt, a, b) \ + do { \ + if (!(a cmp b)) { \ + TEST_FAILED(); \ + printf("Expected " #a " " #cmp " " #b ":\n " #a " = " fmt "\n " #b \ + " = " fmt "\n", \ + a, b); \ + } \ + } while (0) + +#define EXPECT_EQ(fmt, a, b) EXPECT_CMP(==, fmt, a, b) +#define EXPECT_LT(fmt, a, b) EXPECT_CMP(<, fmt, a, b) + + app_pc addr; + bool is_write; + uint index = 0; + while (instr_compute_address_ex(instr, mc, index, &addr, &is_write)) { + EXPECT_LT("%u", index, expected->num_addresses); + EXPECT_EQ("%p", addr, expected->addresses[index]); + EXPECT_EQ("%u", is_write, expected->is_write); + index++; + } + EXPECT_EQ("%u", index, expected->num_addresses); + +#undef TEST_FAILED +#undef EXPECT_CMP +#undef EXPECT_EQ +#undef EXPECT_LT +} + +/* Used by test_compute_vector_address() to determine whether an instruction reads or + * writes its memory operand and set test expectations. + * This isn't an exhaustive list of opcodes; it just contains the ones used in the test + */ +static bool +op_is_write(int op) +{ + switch (op) { + case OP_ld1b: + case OP_ld1h: + case OP_ld1w: + case OP_ld1d: + case OP_ldnt1b: + case OP_ldnt1h: + case OP_ldnt1w: + case OP_ldnt1d: return false; + case OP_st1b: + case OP_st1h: + case OP_st1w: + case OP_st1d: + case OP_stnt1b: + case OP_stnt1h: + case OP_stnt1w: + case OP_stnt1d: return true; + + default: ASSERT(false); + } +} + +/* Used by test_compute_vector_address() to determine whether an instruction reads or + * writes its memory operand and set test expectations. 
+ * This isn't an exhaustive list of opcodes; it just contains the ones used in the test + */ +static opnd_size_t +op_mem_size(int op) +{ + switch (op) { + case OP_ld1b: + case OP_ldnt1b: + case OP_st1b: + case OP_stnt1b: return OPSZ_1; + case OP_ld1h: + case OP_ldnt1h: + case OP_st1h: + case OP_stnt1h: return OPSZ_2; + case OP_ld1w: + case OP_ldnt1w: + case OP_st1w: + case OP_stnt1w: return OPSZ_4; + case OP_ld1d: + case OP_ldnt1d: + case OP_st1d: + case OP_stnt1d: return OPSZ_8; + + default: ASSERT(false); + } +} + +void +test_compute_vector_address(void *drcontext) +{ + const int original_vector_length = dr_get_sve_vector_length(); + ASSERT(dr_set_sve_vector_length(256)); + +#define SCALAR_BASE_REG 0 + +#define INDEX_REG_D 0 +#define INDEX_REG_S 1 +#define BASE_REG_D 2 +#define BASE_REG_S 3 + + dr_mcontext_t mc = { + .size = sizeof(dr_mcontext_t), + .flags = DR_MC_ALL, + .r0 = 0x8000000000000000, /* SCALAR_BASE_REG */ + .r1 = 1, + .r2 = 2, + .r3 = 3, + .r4 = 4, + .r5 = 5, + .r6 = 6, + .r7 = 7, + .r8 = 0xffffffffffffffff, + .simd[INDEX_REG_D].u64 = { 0x0000000000010000, 0x0000000000020000, + 0xffffffffffff0000, 0xfffffffffffe0000 }, + .simd[INDEX_REG_S].u32 = { 0x00010000, 0x00020000, 0x00030000, 0x00040000, + 0xffff0000, 0xfffd0000, 0xfffc0000, 0xfffb0000 }, + .simd[BASE_REG_D].u64 = { 0x0000000000000000, 0x8000000000000000, + 0xffffffffffffffff, 0x0000000010000000 }, + .simd[BASE_REG_S].u32 = { 0x00000000, 0x80000000, 0xffffffff, 0x00010000, + 0x10000000, 0x20000000, 0x30000000, 0x40000000 }, + }; + + for (size_t i = BASE_REG_S + 1; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + static const uint64 poison[4] = { 0xdeaddeaddeaddead, 0xdeaddeaddeaddead, + 0xdeaddeaddeaddead, 0xdeaddeaddeaddead }; + memcpy(&mc.simd[i].u64[0], poison, sizeof(poison)); + } + for (size_t i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + mc.svep[i].u32[0] = 0xffffffff; + } + +/* Map SVE element sizes to opnd_size_t */ +#define ELSZ_B OPSZ_1 +#define ELSZ_H OPSZ_2 +#define ELSZ_S OPSZ_4 +#define ELSZ_D OPSZ_8 + +/* Declare expected test results.*/ +#define EXPECT(...) \ + app_pc addresses[] = { __VA_ARGS__ }; \ + vector_address_test_expectation_t expected = { \ + addresses, \ + sizeof(addresses) / sizeof(app_pc), \ + false, \ + } + +#define VEC_ADDR_TEST(op, governing_pred_reg, mask, create_mem_opnd, decl_expect) \ + { \ + decl_expect; \ + expected.is_write = op_is_write(OP_##op); \ + mc.svep[governing_pred_reg].u32[0] = mask; \ + opnd_t mem_opnd = create_mem_opnd; \ + opnd_set_size(&mem_opnd, op_mem_size(OP_##op)); \ + instr_t *instr = INSTR_CREATE_##op##_sve_pred( \ + drcontext, \ + opnd_create_reg_element_vector(DR_REG_Z31, \ + opnd_get_vector_element_size(mem_opnd)), \ + opnd_create_predicate_reg(DR_REG_P0 + governing_pred_reg, false), mem_opnd); \ + test_compute_vector_address_helper(drcontext, instr, &mc, &expected, __LINE__); \ + instr_destroy(drcontext, instr); \ + mc.svep[governing_pred_reg].u32[0] = 0xffffffff; \ + } + +#define SCALAR_PLUS_VECTOR(xn, zm, el_size, extend, scale) \ + opnd_create_vector_base_disp_aarch64(DR_REG_X0 + xn, DR_REG_Z0 + zm, el_size, \ + DR_EXTEND_##extend, scale > 0, 0, 0, OPSZ_NA, \ + scale) + + /* Test all the scalar+vector addressing modes. + * The opcode used in the instruction shouldn't make a difference to the address + * calculation, so these tests cover all addressing modes but not all + * (opcode, addressing mode) combinations. 
+ */ + + /* 32-bit scaled offset [, .S, #N] */ + VEC_ADDR_TEST(ld1h, /*governing_pred_reg=*/0, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x8000000000060000, (app_pc)0x8000000000080000, + (app_pc)0x80000001fffe0000, (app_pc)0x80000001fffa0000, + (app_pc)0x80000001fff80000, (app_pc)0x80000001fff60000)); + VEC_ADDR_TEST(st1h, /*governing_pred_reg=*/0, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, SXTW, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x8000000000060000, (app_pc)0x8000000000080000, + (app_pc)0x7ffffffffffe0000, (app_pc)0x7ffffffffffa0000, + (app_pc)0x7ffffffffff80000, (app_pc)0x7ffffffffff60000)); + VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/0, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x80000000000c0000, (app_pc)0x8000000000100000, + (app_pc)0x80000003fffc0000, (app_pc)0x80000003fff40000, + (app_pc)0x80000003fff00000, (app_pc)0x80000003ffec0000)); + VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/0, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, SXTW, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x80000000000c0000, (app_pc)0x8000000000100000, + (app_pc)0x7ffffffffffc0000, (app_pc)0x7ffffffffff40000, + (app_pc)0x7ffffffffff00000, (app_pc)0x7fffffffffec0000)); + + /* 32-bit unscaled offset [, .S, ] */ + VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/1, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x8000000000030000, (app_pc)0x8000000000040000, + (app_pc)0x80000000ffff0000, (app_pc)0x80000000fffd0000, + (app_pc)0x80000000fffc0000, (app_pc)0x80000000fffb0000)); + VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/1, 0x11111111, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, SXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x8000000000030000, (app_pc)0x8000000000040000, + (app_pc)0x7fffffffffff0000, (app_pc)0x7ffffffffffd0000, + (app_pc)0x7ffffffffffc0000, (app_pc)0x7ffffffffffb0000)); + + /* 32-bit unpacked scaled offset [, .D, #N] */ + VEC_ADDR_TEST(ld1h, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x80000001fffe0000, (app_pc)0x80000001fffc0000)); + VEC_ADDR_TEST(st1h, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, SXTW, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x7ffffffffffe0000, (app_pc)0x7ffffffffffc0000)); + VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x80000003fffc0000, (app_pc)0x80000003fff80000)); + VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, SXTW, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x7ffffffffffc0000, (app_pc)0x7ffffffffff80000)); + VEC_ADDR_TEST(ld1d, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 3), + EXPECT((app_pc)0x8000000000080000, 
(app_pc)0x8000000000100000, + (app_pc)0x80000007fff80000, (app_pc)0x80000007fff00000)); + VEC_ADDR_TEST(st1d, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, SXTW, 3), + EXPECT((app_pc)0x8000000000080000, (app_pc)0x8000000000100000, + (app_pc)0x7ffffffffff80000, (app_pc)0x7ffffffffff00000)); + + /* 32-bit unpacked unscaled offset [, .D, ] */ + VEC_ADDR_TEST(ld1d, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x80000000ffff0000, (app_pc)0x80000000fffe0000)); + VEC_ADDR_TEST(st1d, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, SXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x7fffffffffff0000, (app_pc)0x7ffffffffffe0000)); + + /* 64-bit scaled offset [, .D, LSL #N] */ + VEC_ADDR_TEST(ld1h, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTX, 1), + EXPECT((app_pc)0x8000000000020000, (app_pc)0x8000000000040000, + (app_pc)0x7ffffffffffe0000, (app_pc)0x7ffffffffffc0000)); + VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTX, 2), + EXPECT((app_pc)0x8000000000040000, (app_pc)0x8000000000080000, + (app_pc)0x7ffffffffffc0000, (app_pc)0x7ffffffffff80000)); + VEC_ADDR_TEST(ld1d, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTX, 3), + EXPECT((app_pc)0x8000000000080000, (app_pc)0x8000000000100000, + (app_pc)0x7ffffffffff80000, (app_pc)0x7ffffffffff00000)); + + /* 64-bit unscaled offset [, .D] */ + VEC_ADDR_TEST(st1d, /*governing_pred_reg=*/1, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTX, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000020000, + (app_pc)0x7fffffffffff0000, (app_pc)0x7ffffffffffe0000)); + + /* Test predicate handling. 
*/ + + /* Test with all elements inactive */ + VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/2, 0x00000000, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 0), + EXPECT(/*nothing*/)); + VEC_ADDR_TEST(st1d, /*governing_pred_reg=*/3, 0x00000000, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 0), + EXPECT(/*nothing*/)); + + /* Test with every other element active */ + VEC_ADDR_TEST(st1b, /*governing_pred_reg=*/4, 0x01010101, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x8000000000030000, + (app_pc)0x80000000ffff0000, (app_pc)0x80000000fffc0000)); + VEC_ADDR_TEST(st1h, /*governing_pred_reg=*/5, 0x00010001, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 0), + EXPECT((app_pc)0x8000000000010000, (app_pc)0x80000000ffff0000)); + + /* Test with a single element active */ + VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/6, 0x00000010, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_S, ELSZ_S, UXTW, 0), + EXPECT((app_pc)0x8000000000020000)); + VEC_ADDR_TEST(st1d, /*governing_pred_reg=*/7, 0x00000100, + SCALAR_PLUS_VECTOR(SCALAR_BASE_REG, INDEX_REG_D, ELSZ_D, UXTW, 0), + EXPECT((app_pc)0x8000000000020000)); +#undef SCALAR_PLUS_VECTOR + +#define VECTOR_PLUS_IMM(zn, el_size, imm) \ + opnd_create_vector_base_disp_aarch64(DR_REG_Z0 + zn, DR_REG_NULL, el_size, \ + DR_EXTEND_UXTX, 0, imm, 0, OPSZ_NA, 0) + + VEC_ADDR_TEST(ld1b, /*governing_pred_reg=*/0, 0x11111111, + VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 0), + EXPECT((app_pc)0x0000000000000000, (app_pc)0x0000000080000000, + (app_pc)0x00000000ffffffff, (app_pc)0x0000000000010000, + (app_pc)0x0000000010000000, (app_pc)0x0000000020000000, + (app_pc)0x0000000030000000, (app_pc)0x0000000040000000)); + VEC_ADDR_TEST(st1b, /*governing_pred_reg=*/0, 0x11111111, + VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 31), + EXPECT((app_pc)0x000000000000001f, (app_pc)0x000000008000001f, + (app_pc)0x000000010000001e, (app_pc)0x000000000001001f, + (app_pc)0x000000001000001f, (app_pc)0x000000002000001f, + (app_pc)0x000000003000001f, (app_pc)0x000000004000001f)); + VEC_ADDR_TEST(ld1b, /*governing_pred_reg=*/0, 0x01010101, + VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 0), + EXPECT((app_pc)0x0000000000000000, (app_pc)0x8000000000000000, + (app_pc)0xffffffffffffffff, (app_pc)0x0000000010000000)); + VEC_ADDR_TEST(st1b, /*governing_pred_reg=*/0, 0x11111111, + VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 31), + EXPECT((app_pc)0x000000000000001f, (app_pc)0x800000000000001f, + (app_pc)0x000000000000001e, (app_pc)0x000000001000001f)); + + VEC_ADDR_TEST(ld1h, /*governing_pred_reg=*/0, 0x11111111, + VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 62), + EXPECT((app_pc)0x000000000000003e, (app_pc)0x000000008000003e, + (app_pc)0x000000010000003d, (app_pc)0x000000000001003e, + (app_pc)0x000000001000003e, (app_pc)0x000000002000003e, + (app_pc)0x000000003000003e, (app_pc)0x000000004000003e)); + VEC_ADDR_TEST(st1h, /*governing_pred_reg=*/0, 0x11111111, + VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 62), + EXPECT((app_pc)0x000000000000003e, (app_pc)0x800000000000003e, + (app_pc)0x000000000000003d, (app_pc)0x000000001000003e)); + + VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/0, 0x11111111, + VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124), + EXPECT((app_pc)0x000000000000007c, (app_pc)0x000000008000007c, + (app_pc)0x000000010000007b, (app_pc)0x000000000001007c, + (app_pc)0x000000001000007c, (app_pc)0x000000002000007c, + (app_pc)0x000000003000007c, (app_pc)0x000000004000007c)); + VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/0, 0x11111111, + 
+
+    VEC_ADDR_TEST(ld1b, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 0),
+                  EXPECT((app_pc)0x0000000000000000, (app_pc)0x0000000080000000,
+                         (app_pc)0x00000000ffffffff, (app_pc)0x0000000000010000,
+                         (app_pc)0x0000000010000000, (app_pc)0x0000000020000000,
+                         (app_pc)0x0000000030000000, (app_pc)0x0000000040000000));
+    VEC_ADDR_TEST(st1b, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 31),
+                  EXPECT((app_pc)0x000000000000001f, (app_pc)0x000000008000001f,
+                         (app_pc)0x000000010000001e, (app_pc)0x000000000001001f,
+                         (app_pc)0x000000001000001f, (app_pc)0x000000002000001f,
+                         (app_pc)0x000000003000001f, (app_pc)0x000000004000001f));
+    VEC_ADDR_TEST(ld1b, /*governing_pred_reg=*/0, 0x01010101,
+                  VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 0),
+                  EXPECT((app_pc)0x0000000000000000, (app_pc)0x8000000000000000,
+                         (app_pc)0xffffffffffffffff, (app_pc)0x0000000010000000));
+    VEC_ADDR_TEST(st1b, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 31),
+                  EXPECT((app_pc)0x000000000000001f, (app_pc)0x800000000000001f,
+                         (app_pc)0x000000000000001e, (app_pc)0x000000001000001f));
+
+    VEC_ADDR_TEST(ld1h, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 62),
+                  EXPECT((app_pc)0x000000000000003e, (app_pc)0x000000008000003e,
+                         (app_pc)0x000000010000003d, (app_pc)0x000000000001003e,
+                         (app_pc)0x000000001000003e, (app_pc)0x000000002000003e,
+                         (app_pc)0x000000003000003e, (app_pc)0x000000004000003e));
+    VEC_ADDR_TEST(st1h, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 62),
+                  EXPECT((app_pc)0x000000000000003e, (app_pc)0x800000000000003e,
+                         (app_pc)0x000000000000003d, (app_pc)0x000000001000003e));
+
+    VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124),
+                  EXPECT((app_pc)0x000000000000007c, (app_pc)0x000000008000007c,
+                         (app_pc)0x000000010000007b, (app_pc)0x000000000001007c,
+                         (app_pc)0x000000001000007c, (app_pc)0x000000002000007c,
+                         (app_pc)0x000000003000007c, (app_pc)0x000000004000007c));
+    VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 124),
+                  EXPECT((app_pc)0x000000000000007c, (app_pc)0x800000000000007c,
+                         (app_pc)0x000000000000007b, (app_pc)0x000000001000007c));
+
+    VEC_ADDR_TEST(ld1d, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 248),
+                  EXPECT((app_pc)0x00000000000000f8, (app_pc)0x80000000000000f8,
+                         (app_pc)0x00000000000000f7, (app_pc)0x00000000100000f8));
+
+    /* Test with all elements inactive */
+    VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/0, 0x00000000,
+                  VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124), EXPECT(/*nothing*/));
+    VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/0, 0x00000000,
+                  VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 124), EXPECT(/*nothing*/));
+
+    /* Test with every other element active */
+    VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/0, 0x01010101,
+                  VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124),
+                  EXPECT((app_pc)0x000000000000007c, (app_pc)0x000000010000007b,
+                         (app_pc)0x000000001000007c, (app_pc)0x000000003000007c));
+    VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/0, 0x00010001,
+                  VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 124),
+                  EXPECT((app_pc)0x000000000000007c, (app_pc)0x000000000000007b));
+
+    /* Test with a single element active */
+    VEC_ADDR_TEST(ld1w, /*governing_pred_reg=*/0, 0x00000010,
+                  VECTOR_PLUS_IMM(BASE_REG_S, ELSZ_S, 124),
+                  EXPECT((app_pc)0x000000008000007c));
+    VEC_ADDR_TEST(st1w, /*governing_pred_reg=*/0, 0x00000100,
+                  VECTOR_PLUS_IMM(BASE_REG_D, ELSZ_D, 124),
+                  EXPECT((app_pc)0x800000000000007c));
+
+#undef VECTOR_PLUS_IMM
+
+#define VECTOR_PLUS_SCALAR(zn, el_size, xm)                                       \
+    opnd_create_vector_base_disp_aarch64(DR_REG_Z0 + zn, DR_REG_X0 + xm, el_size, \
+                                         DR_EXTEND_UXTX, 0, 0, 0, OPSZ_NA, 0)
+    VEC_ADDR_TEST(ldnt1b, /*governing_pred_reg=*/0, 0x11111111,
+                  VECTOR_PLUS_SCALAR(BASE_REG_S, ELSZ_S, 8),
+                  EXPECT((app_pc)0xffffffffffffffff, (app_pc)0x000000007fffffff,
+                         (app_pc)0x00000000fffffffe, (app_pc)0x000000000000ffff,
+                         (app_pc)0x000000000fffffff, (app_pc)0x000000001fffffff,
+                         (app_pc)0x000000002fffffff, (app_pc)0x000000003fffffff));
+    VEC_ADDR_TEST(stnt1b, /*governing_pred_reg=*/0, 0x01010101,
+                  VECTOR_PLUS_SCALAR(BASE_REG_D, ELSZ_D, 7),
+                  EXPECT((app_pc)0x0000000000000007, (app_pc)0x8000000000000007,
+                         (app_pc)0x0000000000000006, (app_pc)0x0000000010000007));
+
+    /* Test with all elements inactive */
+    VEC_ADDR_TEST(ldnt1h, /*governing_pred_reg=*/0, 0x00000000,
+                  VECTOR_PLUS_SCALAR(BASE_REG_S, ELSZ_S, 6), EXPECT(/*nothing*/));
+    VEC_ADDR_TEST(stnt1h, /*governing_pred_reg=*/0, 0x00000000,
+                  VECTOR_PLUS_SCALAR(BASE_REG_D, ELSZ_D, 5), EXPECT(/*nothing*/));
+
+    /* Test with every other element active */
+    VEC_ADDR_TEST(ldnt1w, /*governing_pred_reg=*/0, 0x01010101,
+                  VECTOR_PLUS_SCALAR(BASE_REG_S, ELSZ_S, 4),
+                  EXPECT((app_pc)0x0000000000000004, (app_pc)0x0000000100000003,
+                         (app_pc)0x0000000010000004, (app_pc)0x0000000030000004));
+    VEC_ADDR_TEST(stnt1w, /*governing_pred_reg=*/0, 0x00010001,
+                  VECTOR_PLUS_SCALAR(BASE_REG_D, ELSZ_D, 3),
+                  EXPECT((app_pc)0x0000000000000003, (app_pc)0x0000000000000002));
+
+    /* Test with a single element active */
+    VEC_ADDR_TEST(ldnt1w, /*governing_pred_reg=*/0, 0x00000010,
+                  VECTOR_PLUS_SCALAR(BASE_REG_S, ELSZ_S, 2),
+                  EXPECT((app_pc)0x0000000080000002));
+    VEC_ADDR_TEST(stnt1d, /*governing_pred_reg=*/0, 0x00000100,
+                  VECTOR_PLUS_SCALAR(BASE_REG_D, ELSZ_D, 1),
+                  EXPECT((app_pc)0x8000000000000001));
+
+#undef VECTOR_PLUS_SCALAR
+
+#undef EXPECT
+#undef VEC_ADDR_TEST
+
+    ASSERT(dr_set_sve_vector_length(original_vector_length));
+}
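The VECTOR_PLUS_SCALAR (non-temporal ldnt1/stnt1) cases above follow the same pattern with a general-purpose register as the offset: each address is the Zn element plus the Xm value, again computed in 64 bits. A one-line model; the Xm contents (e.g. an all-ones value) are inferred from the EXPECT lists rather than shown in this hunk:

#include <stdint.h>

/* Sketch: addr = ZeroExtend64(Zn.element) + Xm. */
static uint64_t
vector_scalar_addr(uint64_t zn_element, uint64_t xm)
{
    return zn_element + xm;
}

For example, a .S element of 0x80000000 plus an assumed Xm of 0xffffffffffffffff gives 0x000000007fffffff, matching the ldnt1b case above.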
+
+void
+test_reg_is_simd()
+{
+    for (reg_id_t reg = DR_REG_START_32; reg <= DR_REG_STOP_32; reg++)
+        ASSERT(!reg_is_simd(reg));
+
+    for (reg_id_t reg = DR_REG_START_64; reg <= DR_REG_STOP_64; reg++)
+        ASSERT(!reg_is_simd(reg));
+
+    for (reg_id_t reg = DR_REG_Q0; reg <= DR_REG_Q0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++)
+        ASSERT(reg_is_simd(reg));
+
+    for (reg_id_t reg = DR_REG_D0; reg <= DR_REG_D0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++)
+        ASSERT(reg_is_simd(reg));
+
+    for (reg_id_t reg = DR_REG_S0; reg <= DR_REG_S0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++)
+        ASSERT(reg_is_simd(reg));
+
+    for (reg_id_t reg = DR_REG_H0; reg <= DR_REG_H0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++)
+        ASSERT(reg_is_simd(reg));
+
+    for (reg_id_t reg = DR_REG_B0; reg <= DR_REG_B0 + DR_NUM_SIMD_VECTOR_REGS - 1; reg++)
+        ASSERT(reg_is_simd(reg));
+
+    for (reg_id_t reg = DR_REG_START_Z; reg <= DR_REG_STOP_Z; reg++)
+        ASSERT(reg_is_simd(reg));
+
+    for (reg_id_t reg = DR_REG_START_P; reg <= DR_REG_STOP_P; reg++)
+        ASSERT(!reg_is_simd(reg));
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -310,7 +788,7 @@ main(int argc, char *argv[])
      * on SVE h/w. This is validated with the direct read of vector length
      * using the SVE RDVL instruction in test_get_size() above.
      */
-    dr_standalone_init();
+    void *drcontext = dr_standalone_init();
 
     test_get_size();
 
@@ -318,6 +796,10 @@ main(int argc, char *argv[])
 
     test_opnd_invert_immed_int();
 
+    test_compute_vector_address(drcontext);
+
+    test_reg_is_simd();
+
     printf("all done\n");
     return 0;
 }
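The main() change above keeps the drcontext returned by dr_standalone_init() so it can be passed to test_compute_vector_address(), which presumably needs it to build IR. A minimal standalone-mode skeleton of that pattern (the nop-building call is only a placeholder):

#include "dr_api.h"

int
main(void)
{
    void *drcontext = dr_standalone_init();
    /* Any IR or operand construction below this point needs the drcontext. */
    instr_t *nop = XINST_CREATE_nop(drcontext);
    instr_destroy(drcontext, nop);
    dr_standalone_exit();
    return 0;
}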
diff --git a/suite/tests/tools.c b/suite/tests/tools.c
index 70d33946549..44560c533f0 100644
--- a/suite/tests/tools.c
+++ b/suite/tests/tools.c
@@ -497,6 +497,114 @@ intercept_signal(int sig, handler_3_t handler, bool sigstack)
     ASSERT_NOERR(rc);
 }
 
+# ifdef AARCH64
+# ifdef DR_HOST_NOT_TARGET
+# define RESERVED __reserved1
+# else
+# define RESERVED __reserved
+# endif
+void
+dump_ucontext(ucontext_t *ucxt, bool is_sve, int vl_bytes)
+{
+    struct _aarch64_ctx *head = (struct _aarch64_ctx *)(ucxt->uc_mcontext.RESERVED);
+    assert(head->magic == FPSIMD_MAGIC);
+    assert(head->size == sizeof(struct fpsimd_context));
+
+    struct fpsimd_context *fpsimd =
+        (struct fpsimd_context *)(ucxt->uc_mcontext.RESERVED);
+    print("\nfpsr 0x%x\n", fpsimd->fpsr);
+    print("fpcr 0x%x\n", fpsimd->fpcr);
+    reinterpret128_2x64_t vreg;
+    int i;
+    for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) {
+        vreg.as_128 = fpsimd->vregs[i];
+        print("q%-2d 0x%016lx %016lx\n", i, vreg.as_2x64.hi, vreg.as_2x64.lo);
+    }
+    print("\n");
+
+# ifndef DR_HOST_NOT_TARGET
+    if (is_sve) {
+        size_t offset = sizeof(struct fpsimd_context);
+        struct _aarch64_ctx *next_head =
+            (struct _aarch64_ctx *)(ucxt->uc_mcontext.RESERVED + offset);
+        while (next_head->magic != 0) {
+            switch (next_head->magic) {
+            case ESR_MAGIC: offset += sizeof(struct esr_context); break;
+            case EXTRA_MAGIC: offset += sizeof(struct extra_context); break;
+            case SVE_MAGIC: {
+                const struct sve_context *sve = (struct sve_context *)(next_head);
+                assert(sve->vl == vl_bytes);
+                const unsigned int vq = sve_vq_from_vl(sve->vl);
+                if (sve->head.size != sizeof(struct sve_context))
+                    assert(sve->head.size ==
+                           ALIGN_FORWARD(SVE_SIG_CONTEXT_SIZE(vq), 16));
+
+                dr_simd_t z;
+                int boff; /* Byte offset for each doubleword in a vector. */
+                for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) {
+                    print("z%-2d 0x", i);
+                    for (boff = ((vq * 2) - 1); boff >= 0; boff--) {
+                        /* We access data in the scalable vector using the
+                         * kernel's SVE_SIG_ZREG_OFFSET macro which gives the
+                         * byte offset into a vector based on units of 128 bits
+                         * (quadwords). In this loop we offset from the start
+                         * of struct sve_context. We print the data as 64-bit
+                         * ints, so 2 per quadword.
+                         *
+                         * For example, for a 256-bit vector (2 quadwords, 4
+                         * doublewords), the byte offset (boff) for each
+                         * scalable vector register is:
+                         * boff=3 vdw=sve+SVE_SIG_ZREG_OFFSET+24
+                         * boff=2 vdw=sve+SVE_SIG_ZREG_OFFSET+16
+                         * boff=1 vdw=sve+SVE_SIG_ZREG_OFFSET+8
+                         * boff=0 vdw=sve+SVE_SIG_ZREG_OFFSET
+                         *
+                         * Note that at present we support little endian only.
+                         * All major Linux arm64 kernel distributions are
+                         * little-endian.
+                         */
+                        z.u64[boff] = *((uint64 *)((
+                            ((byte *)sve) + (SVE_SIG_ZREG_OFFSET(vq, i)) + (boff * 8))));
+                        print("%016lx ", z.u64[boff]);
+                    }
+                    print("\n");
+                }
+
+                print("\n");
+                /* We access data in predicate and first-fault registers using
+                 * the kernel's SVE_SIG_PREG_OFFSET and SVE_SIG_FFR_OFFSET
+                 * macros. SVE predicate and FFR registers are 1/8th the
+                 * size of SVE vector registers (1 bit per byte) and are printed
+                 * as 32-bit ints.
+                 */
+                dr_simd_t p;
+                for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) {
+                    p.u32[i] = *((uint32 *)((byte *)sve + SVE_SIG_PREG_OFFSET(vq, i)));
+                    print("p%-2d 0x%08lx\n", i, p.u32[i]);
+                }
+                print("\n");
+                print("FFR 0x%08lx\n\n",
+                      *((uint32 *)((byte *)sve + SVE_SIG_FFR_OFFSET(vq))));
+
+                if (sve->head.size == sizeof(struct sve_context))
+                    offset += sizeof(struct sve_context);
+                else
+                    // VL / 8 x Zn + ((( VL / 8 / 8) x Pn) + FFR)
+                    offset += sizeof(struct sve_context) +
+                        (vl_bytes * MCXT_NUM_SIMD_SVE_SLOTS) +
+                        ((vl_bytes / 8) * MCXT_NUM_SVEP_SLOTS) + 16;
+                break;
+            }
+            default:
+                print("%s %d Unhandled section with magic number 0x%x", __func__,
+                      __LINE__, next_head->magic);
+                assert(0);
+            }
+            next_head = (struct _aarch64_ctx *)(ucxt->uc_mcontext.RESERVED + offset);
+        }
+    }
+# endif
+}
+# endif
+
 # endif /* UNIX */
 #else /* asm code *************************************************************/
diff --git a/suite/tests/tools.h b/suite/tests/tools.h
index e9ce5ddf8b8..442fbeda987 100644
--- a/suite/tests/tools.h
+++ b/suite/tests/tools.h
@@ -301,6 +301,20 @@ typedef void (*handler_3_t)(int, siginfo_t *, ucontext_t *);
 /* set up signal_handler as the handler for signal "sig" */
 void
 intercept_signal(int sig, handler_3_t handler, bool sigstack);
+
+# ifdef AARCH64
+void
+dump_ucontext(ucontext_t *ucxt, bool is_sve, int vl);
+
+/* Representation of a quadword as 2 doublewords. */
+typedef union {
+    __uint128_t as_128;
+    struct {
+        uint64 lo;
+        uint64 hi;
+    } as_2x64;
+} reinterpret128_2x64_t;
+# endif
 #endif
 
 /* for cross-plaform siglongjmp */
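As a usage sketch for the declarations added here, a test could register a handler through intercept_signal() and dump the Q/Z/P/FFR state carried in the signal frame; the signal number and the source of the vector length below are placeholders, not part of this change:

#include <signal.h>
#include "tools.h"

static int vl_bytes; /* Assumed to be filled in elsewhere, e.g. from an RDVL read. */

static void
handle_signal(int sig, siginfo_t *siginfo, ucontext_t *ucxt)
{
    /* Print the Q registers, and the Z/P/FFR state when SVE is present. */
    dump_ucontext(ucxt, /*is_sve=*/true, vl_bytes);
}

/* Registered from the test's main(), e.g.:
 *     intercept_signal(SIGTRAP, handle_signal, false);
 */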