From 066eba0c64393c1f659187b57d66dee00678584b Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 3 Sep 2018 16:34:22 +0200
Subject: [PATCH 01/31] remove cmsis module; add acle module

ACLE (ARM C Language Extensions) is more general (supports ARMv4 to ARMv8) than
CMSIS (ARMv7-M and ARMv7-R)
---
 crates/core_arch/src/aarch64/mod.rs           |   2 +
 crates/core_arch/src/acle/barrier/common.rs   |  14 +
 crates/core_arch/src/acle/barrier/mod.rs      |  89 +++++
 .../core_arch/src/acle/barrier/not_mclass.rs  |  43 +++
 crates/core_arch/src/acle/barrier/v8.rs       |  23 ++
 crates/core_arch/src/acle/dsp.rs              |  24 ++
 crates/core_arch/src/acle/hints.rs            | 115 ++++++
 crates/core_arch/src/acle/mod.rs              | 134 +++++++
 .../core_arch/src/acle/registers/aarch32.rs   |   4 +
 crates/core_arch/src/acle/registers/mod.rs    | 121 +++++++
 crates/core_arch/src/acle/registers/v6m.rs    |  39 +++
 crates/core_arch/src/acle/registers/v7m.rs    |  17 +
 crates/core_arch/src/acle/simd32.rs           |  60 ++++
 crates/core_arch/src/arm/cmsis.rs             | 330 ------------------
 crates/core_arch/src/arm/mod.rs               |   8 +-
 crates/core_arch/src/mod.rs                   |   3 +
 16 files changed, 691 insertions(+), 335 deletions(-)
 create mode 100644 crates/core_arch/src/acle/barrier/common.rs
 create mode 100644 crates/core_arch/src/acle/barrier/mod.rs
 create mode 100644 crates/core_arch/src/acle/barrier/not_mclass.rs
 create mode 100644 crates/core_arch/src/acle/barrier/v8.rs
 create mode 100644 crates/core_arch/src/acle/dsp.rs
 create mode 100644 crates/core_arch/src/acle/hints.rs
 create mode 100644 crates/core_arch/src/acle/mod.rs
 create mode 100644 crates/core_arch/src/acle/registers/aarch32.rs
 create mode 100644 crates/core_arch/src/acle/registers/mod.rs
 create mode 100644 crates/core_arch/src/acle/registers/v6m.rs
 create mode 100644 crates/core_arch/src/acle/registers/v7m.rs
 create mode 100644 crates/core_arch/src/acle/simd32.rs
 delete mode 100644 crates/core_arch/src/arm/cmsis.rs

diff --git a/crates/core_arch/src/aarch64/mod.rs b/crates/core_arch/src/aarch64/mod.rs
index d573e2c0b8..4821438e9f 100644
--- a/crates/core_arch/src/aarch64/mod.rs
+++ b/crates/core_arch/src/aarch64/mod.rs
@@ -18,6 +18,8 @@ pub use self::crypto::*;
 mod crc;
 pub use self::crc::*;
 
+pub use super::acle::*;
+
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
diff --git a/crates/core_arch/src/acle/barrier/common.rs b/crates/core_arch/src/acle/barrier/common.rs
new file mode 100644
index 0000000000..a1d8c93e8c
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/common.rs
@@ -0,0 +1,14 @@
+//! Access types available on all architectures
+
+/// Full system is the required shareability domain, reads and writes are the
+/// required access types
+pub struct SY;
+
+dmb_dsb!(SY);
+
+impl super::super::sealed::Isb for SY {
+    #[inline(always)]
+    unsafe fn __isb(&self) {
+        asm!("ISB SY" : : : "memory" : "volatile")
+    }
+}
diff --git a/crates/core_arch/src/acle/barrier/mod.rs b/crates/core_arch/src/acle/barrier/mod.rs
new file mode 100644
index 0000000000..3fbf6899ff
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/mod.rs
@@ -0,0 +1,89 @@
+// Reference: Section 7.3 "Memory barriers" of ACLE
+
+macro_rules! dmb_dsb {
+    ($A:ident) => {
+        impl super::super::sealed::Dmb for $A {
+            #[inline(always)]
+            unsafe fn __dmb(&self) {
+                asm!(concat!("DMB ", stringify!($A)) : : : "memory" : "volatile")
+            }
+        }
+
+        impl super::super::sealed::Dsb for $A {
+            #[inline(always)]
+            unsafe fn __dsb(&self) {
+                asm!(concat!("DSB ", stringify!($A)) : : : "memory" : "volatile")
+            }
+        }
+    };
+}
+
+mod common;
+
+pub use self::common::*;
+
+#[cfg(not(target_feature = "mclass"))]
+mod not_mclass;
+
+#[cfg(not(target_feature = "mclass"))]
+pub use self::not_mclass::*;
+
+#[cfg(target_arch = "aarch64")]
+mod v8;
+
+#[cfg(target_arch = "aarch64")]
+pub use self::v8::*;
+
+/// Generates a DMB (data memory barrier) instruction or equivalent CP15 instruction.
+///
+/// DMB ensures the observed ordering of memory accesses. Memory accesses of the specified type
+/// issued before the DMB are guaranteed to be observed (in the specified scope) before memory
+/// accesses issued after the DMB.
+///
+/// For example, DMB should be used between storing data, and updating a flag variable that makes
+/// that data available to another core.
+///
+/// The __dmb() intrinsic also acts as a compiler memory barrier of the appropriate type.
+#[inline(always)]
+pub unsafe fn __dmb<A>(arg: A)
+where
+    A: super::sealed::Dmb,
+{
+    arg.__dmb()
+}
+
+/// Generates a DSB (data synchronization barrier) instruction or equivalent CP15 instruction.
+///
+/// DSB ensures the completion of memory accesses. A DSB behaves as the equivalent DMB and has
+/// additional properties. After a DSB instruction completes, all memory accesses of the specified
+/// type issued before the DSB are guaranteed to have completed.
+///
+/// The __dsb() intrinsic also acts as a compiler memory barrier of the appropriate type.
+#[inline(always)]
+pub unsafe fn __dsb<A>(arg: A)
+where
+    A: super::sealed::Dsb,
+{
+    arg.__dsb()
+}
+
+/// Generates an ISB (instruction synchronization barrier) instruction or equivalent CP15
+/// instruction.
+///
+/// This instruction flushes the processor pipeline fetch buffers, so that following instructions
+/// are fetched from cache or memory.
+///
+/// An ISB is needed after some system maintenance operations. An ISB is also needed before
+/// transferring control to code that has been loaded or modified in memory, for example by an
+/// overlay mechanism or just-in-time code generator.  (Note that if instruction and data caches are
+/// separate, privileged cache maintenance operations would be needed in order to unify the caches.)
+///
+/// The only supported argument for the __isb() intrinsic is `SY` (the value 15 in the C API),
+/// corresponding to the full system scope of the ISB instruction.
+#[inline(always)]
+pub unsafe fn __isb<A>(arg: A)
+where
+    A: super::sealed::Isb,
+{
+    arg.__isb()
+}
diff --git a/crates/core_arch/src/acle/barrier/not_mclass.rs b/crates/core_arch/src/acle/barrier/not_mclass.rs
new file mode 100644
index 0000000000..385e1d5289
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/not_mclass.rs
@@ -0,0 +1,43 @@
+//! Access types available on v7 and v8 but not on v7(E)-M or v8-M
+
+/// Full system is the required shareability domain, writes are the required
+/// access type
+pub struct ST;
+
+dmb_dsb!(ST);
+
+/// Inner Shareable is the required shareability domain, reads and writes are
+/// the required access types
+pub struct ISH;
+
+dmb_dsb!(ISH);
+
+/// Inner Shareable is the required shareability domain, writes are the required
+/// access type
+pub struct ISHST;
+
+dmb_dsb!(ISHST);
+
+/// Non-shareable is the required shareability domain, reads and writes are the
+/// required access types
+pub struct NSH;
+
+dmb_dsb!(NSH);
+
+/// Non-shareable is the required shareability domain, writes are the required
+/// access type
+pub struct NSHST;
+
+dmb_dsb!(NSHST);
+
+/// Outer Shareable is the required shareability domain, reads and writes are
+/// the required access types
+pub struct OSH;
+
+dmb_dsb!(OSH);
+
+/// Outer Shareable is the required shareability domain, writes are the required
+/// access type
+pub struct OSHST;
+
+dmb_dsb!(OSHST);
diff --git a/crates/core_arch/src/acle/barrier/v8.rs b/crates/core_arch/src/acle/barrier/v8.rs
new file mode 100644
index 0000000000..2951a5a670
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/v8.rs
@@ -0,0 +1,23 @@
+/// Full system is the required shareability domain, reads are the required
+/// access type
+pub struct LD;
+
+dmb_dsb!(LD);
+
+/// Inner Shareable is the required shareability domain, reads are the required
+/// access type
+pub struct ISHLD;
+
+dmb_dsb!(ISHLD);
+
+/// Non-shareable is the required shareability domain, reads are the required
+/// access type
+pub struct NSHLD;
+
+dmb_dsb!(NSHLD);
+
+/// Outer Shareable is the required shareability domain, reads are the required
+/// access type
+pub struct OSHLD;
+
+dmb_dsb!(OSHLD);
diff --git a/crates/core_arch/src/acle/dsp.rs b/crates/core_arch/src/acle/dsp.rs
new file mode 100644
index 0000000000..4029e7aaa3
--- /dev/null
+++ b/crates/core_arch/src/acle/dsp.rs
@@ -0,0 +1,24 @@
+//! # References:
+//!
+//! - Section 8.3 "16-bit multiplications"
+//! - Section 8.4 "Saturating intrinsics"
+//!
+//! Intrinsics that could live here:
+//!
+//! - __smulbb
+//! - __smulbt
+//! - __smultb
+//! - __smultt
+//! - __smulwb
+//! - __smulwt
+//! - __ssat
+//! - __usat
+//! - __qadd
+//! - __qsub
+//! - __qdbl
+//! - __smlabb
+//! - __smlabt
+//! - __smlatb
+//! - __smlatt
+//! - __smlawb
+//! - __smlawt
diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/acle/hints.rs
new file mode 100644
index 0000000000..1b77e5e64c
--- /dev/null
+++ b/crates/core_arch/src/acle/hints.rs
@@ -0,0 +1,115 @@
+// # References
+//
+// - Section 7.4 "Hints" of ACLE
+// - Section 7.7 "NOP" of ACLE
+
+/// Generates a WFI (wait for interrupt) hint instruction, or nothing.
+///
+/// The WFI instruction allows (but does not require) the processor to enter a
+/// low-power state until one of a number of asynchronous events occurs.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
+#[inline(always)]
+pub unsafe fn __wfi() {
+    asm!("WFI" : : : : "volatile")
+}
+
+/// Generates a WFE (wait for event) hint instruction, or nothing.
+///
+/// The WFE instruction allows (but does not require) the processor to enter a
+/// low-power state until some event occurs such as a SEV being issued by
+/// another processor.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
+#[inline(always)]
+pub unsafe fn __wfe() {
+    asm!("WFE" : : : : "volatile")
+}
+
+/// Generates a SEV (send a global event) hint instruction.
+///
+/// This causes an event to be signaled to all processors in a multiprocessor
+/// system. It is a NOP on a uniprocessor system.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M, 7-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
+#[inline(always)]
+pub unsafe fn __sev() {
+    asm!("SEV" : : : : "volatile")
+}
+
+/// Generates a SEVL (send a local event) hint instruction.
+///
+/// This causes an event to be signaled to only the processor executing this
+/// instruction. In a multiprocessor system, it is not required to affect the
+/// other processors.
+// LLVM says "instruction requires: armv8"
+#[cfg(target_arch = "aarch64")]
+#[inline(always)]
+pub unsafe fn __sevl() {
+    asm!("SEVL" : : : : "volatile")
+}
+
+/// Generates a YIELD hint instruction.
+///
+/// This enables multithreading software to indicate to the hardware that it is
+/// performing a task, for example a spin-lock, that could be swapped out to
+/// improve overall system performance.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
+#[inline(always)]
+pub unsafe fn __yield() {
+    asm!("YIELD" : : : : "volatile")
+}
+
+/// Generates a DBG instruction.
+///
+/// This provides a hint to debugging and related systems. The argument must be
+/// a constant integer from 0 to 15 inclusive. See implementation documentation
+/// for the effect (if any) of this instruction and the meaning of the
+/// argument. This is available only when compiling for AArch32.
+// Section 10.1 of ACLE says that the supported arches are: 7, 7-M
+// LLVM says "instruction requires: thumb2" OR "instruction requires: armv7"
+#[cfg(target_feature = "v6t2")]
+#[inline(always)]
+#[rustc_args_required_const(0)]
+pub unsafe fn __dbg(imm4: u32) {
+    macro_rules! call {
+        ($imm4:expr) => {
+            asm!(concat!("DBG ", stringify!($imm4)) : : : : "volatile")
+        }
+    }
+
+    match imm4 & 0b1111 {
+        0 => call!(0),
+        1 => call!(1),
+        2 => call!(2),
+        3 => call!(3),
+        4 => call!(4),
+        5 => call!(5),
+        6 => call!(6),
+        7 => call!(7),
+        8 => call!(8),
+        9 => call!(9),
+        10 => call!(10),
+        11 => call!(11),
+        12 => call!(12),
+        13 => call!(13),
+        14 => call!(14),
+        _ => call!(15),
+    }
+}
+
+/// Generates an unspecified no-op instruction.
+///
+/// Note that not all architectures provide a distinguished NOP instruction. On
+/// those that do, it is unspecified whether this intrinsic generates it or
+/// another instruction. It is not guaranteed that inserting this instruction
+/// will increase execution time.
+#[inline(always)]
+pub unsafe fn __nop() {
+    asm!("NOP" : : : : "volatile")
+}
diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
new file mode 100644
index 0000000000..ec7dabd2b3
--- /dev/null
+++ b/crates/core_arch/src/acle/mod.rs
@@ -0,0 +1,134 @@
+//! ARM C Language Extensions (ACLE)
+//!
+//! # Developer notes
+//!
+//! Below is a list of built-in targets that are representative of the different ARM
+//! architectures; the list includes the `target_feature`s they possess.
+//!
+//! - `armv4t-unknown-linux-gnueabi` - **ARMv4** - `+v4t`
+//! - `armv5te-unknown-linux-gnueabi` - **ARMv5TE** - `+v4t +v5te`
+//! - `arm-unknown-linux-gnueabi` - **ARMv6** - `+v4t +v5te +v6`
+//! - `thumbv6m-none-eabi` - **ARMv6-M** - `+v4t +v5te +v6 +thumb-mode +mclass`
+//! - `armv7-unknown-linux-gnueabihf` - **ARMv7-A** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +aclass`
+//! - `armv7r-none-eabi` - **ARMv7-R** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +rclass`
+//! - `thumbv7m-none-eabi` - **ARMv7-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass`
+//! - `thumbv7em-none-eabi` - **ARMv7E-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +thumb-mode +mclass`
+//! - `aarch64-unknown-linux-gnu` - **ARMv8-A (AArch64)** - `+fp +neon`
+//!
+//! Section 10.1 of ACLE says:
+//!
+//! - "In the sequence of Arm architectures { v5, v5TE, v6, v6T2, v7 } each architecture includes
+//! its predecessor instruction set."
+//!
+//! - "In the sequence of Thumb-only architectures { v6-M, v7-M, v7E-M } each architecture includes
+//! its predecessor instruction set."
+//!
+//! From that info and from looking at how LLVM features work (using custom targets) we can identify
+//! features that are subsets of others:
+//!
+//! Legend: `a < b` reads as "`a` is a subset of `b`"; this means that if `b` is enabled then `a` is
+//! enabled as well.
+//!
+//! - `v4t < v5te < v6 < v6k < v6t2 < v7 < v8`
+//! - `v6 < v8m < v6t2`
+//! - `v7 < v8m.main`
+//!
+//! # References
+//!
+//! - [ACLE Q2 2018](https://developer.arm.com/docs/101028/latest)
+
+// Supported arches: 8, 7, 6-M. See Section 10.1 of ACLE (e.g. DMB)
+// But this is further refined within the module
+#[cfg(any(
+    // v8
+    target_arch = "aarch64",
+    // v7
+    target_feature = "v7",
+    // v6-M
+    target_feature = "mclass"
+))]
+mod barrier;
+
+#[cfg(any(
+    target_arch = "aarch64",
+    target_feature = "v7",
+    target_feature = "mclass"
+))]
+pub use self::barrier::*;
+
+mod hints;
+
+pub use self::hints::*;
+
+mod registers;
+
+pub use self::registers::*;
+
+// Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD)
+// But we also exclude the A profile because DSP is deprecated on that profile as of ACLE 2.0 (see
+// section 5.4.7)
+#[cfg(any(
+    // >= v5TE but excludes v7-A
+    all(target_feature = "v5te", not(target_feature = "mclass"), not(target_feature = "aclass")),
+    // v7E-M
+    all(target_feature = "mclass", target_feature = "dsp"),
+))]
+mod dsp;
+
+#[cfg(any(
+    all(
+        target_feature = "v5te",
+        not(target_feature = "mclass"),
+        not(target_feature = "aclass")
+    ),
+    all(target_feature = "mclass", target_feature = "dsp"),
+))]
+pub use dsp::*;
+
+// Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says
+// Section 5.4.9 of ACLE.
+#[cfg(any(
+    // v7-R
+    target_feature = "rclass",
+    // v7E-M
+    all(target_feature = "mclass", target_feature = "dsp")
+))]
+mod simd32;
+
+mod sealed {
+    pub trait Dmb {
+        unsafe fn __dmb(&self);
+    }
+
+    pub trait Dsb {
+        unsafe fn __dsb(&self);
+    }
+
+    pub trait Isb {
+        unsafe fn __isb(&self);
+    }
+
+    pub trait Rsr {
+        unsafe fn __rsr(&self) -> u32;
+    }
+
+    pub trait Rsr64 {
+        unsafe fn __rsr64(&self) -> u64;
+    }
+
+    pub trait Rsrp {
+        unsafe fn __rsrp(&self) -> *const u8;
+    }
+
+    pub trait Wsr {
+        unsafe fn __wsr(&self, value: u32);
+    }
+
+    pub trait Wsr64 {
+        unsafe fn __wsr64(&self, value: u64);
+    }
+
+    pub trait Wsrp {
+        unsafe fn __wsrp(&self, value: *const u8);
+    }
+}
diff --git a/crates/core_arch/src/acle/registers/aarch32.rs b/crates/core_arch/src/acle/registers/aarch32.rs
new file mode 100644
index 0000000000..f59af5d3ae
--- /dev/null
+++ b/crates/core_arch/src/acle/registers/aarch32.rs
@@ -0,0 +1,4 @@
+/// Application Program Status Register
+pub struct APSR;
+
+rsr!(APSR);
diff --git a/crates/core_arch/src/acle/registers/mod.rs b/crates/core_arch/src/acle/registers/mod.rs
new file mode 100644
index 0000000000..73fcc2c7b0
--- /dev/null
+++ b/crates/core_arch/src/acle/registers/mod.rs
@@ -0,0 +1,121 @@
+#[allow(unused_macros)]
+macro_rules! rsr {
+    ($R:ident) => {
+        impl super::super::sealed::Rsr for $R {
+            unsafe fn __rsr(&self) -> u32 {
+                let r: u32;
+                asm!(concat!("mrs $0,", stringify!($R)) : "=r"(r) : : : "volatile");
+                r
+            }
+        }
+    };
+}
+
+#[allow(unused_macros)]
+macro_rules! rsrp {
+    ($R:ident) => {
+        impl super::super::sealed::Rsrp for $R {
+            unsafe fn __rsrp(&self) -> *const u8 {
+                let r: *const u8;
+                asm!(concat!("mrs $0,", stringify!($R)) : "=r"(r) : : : "volatile");
+                r
+            }
+        }
+    };
+}
+
+#[allow(unused_macros)]
+macro_rules! wsr {
+    ($R:ident) => {
+        impl super::super::sealed::Wsr for $R {
+            unsafe fn __wsr(&self, value: u32) {
+                asm!(concat!("msr ", stringify!($R), ",$0") : : "r"(value) : : "volatile");
+            }
+        }
+    };
+}
+
+#[allow(unused_macros)]
+macro_rules! wsrp {
+    ($R:ident) => {
+        impl super::super::sealed::Wsrp for $R {
+            unsafe fn __wsrp(&self, value: *const u8) {
+                asm!(concat!("msr ", stringify!($R), ",$0") : : "r"(value) : : "volatile");
+            }
+        }
+    };
+}
+
+#[cfg(target_feature = "mclass")]
+mod v6m;
+
+#[cfg(target_feature = "mclass")]
+pub use self::v6m::*;
+
+#[cfg(all(target_feature = "v7", target_feature = "mclass"))]
+mod v7m;
+
+#[cfg(all(target_feature = "v7", target_feature = "mclass"))]
+pub use self::v7m::*;
+
+#[cfg(not(target_arch = "aarch64"))]
+mod aarch32;
+
+#[cfg(not(target_arch = "aarch64"))]
+pub use self::aarch32::*;
+
+/// Reads a 32-bit system register
+#[inline(always)]
+pub unsafe fn __rsr<R>(reg: R) -> u32
+where
+    R: super::sealed::Rsr,
+{
+    reg.__rsr()
+}
+
+/// Reads a 64-bit system register
+#[cfg(target_arch = "aarch64")]
+#[inline(always)]
+pub unsafe fn __rsr64<R>(reg: R) -> u64
+where
+    R: super::sealed::Rsr64,
+{
+    reg.__rsr64()
+}
+
+/// Reads a system register containing an address
+#[inline(always)]
+pub unsafe fn __rsrp<R>(reg: R) -> *const u8
+where
+    R: super::sealed::Rsrp,
+{
+    reg.__rsrp()
+}
+
+/// Writes a 32-bit system register
+#[inline(always)]
+pub unsafe fn __wsr<R>(reg: R, value: u32)
+where
+    R: super::sealed::Wsr,
+{
+    reg.__wsr(value)
+}
+
+/// Writes a 64-bit system register
+#[cfg(target_arch = "aarch64")]
+#[inline(always)]
+pub unsafe fn __wsr64<R>(reg: R, value: u64)
+where
+    R: super::sealed::Wsr64,
+{
+    reg.__wsr64(value)
+}
+
+/// Writes a system register containing an address
+#[inline(always)]
+pub unsafe fn __wsrp<R>(reg: R, value: *const u8)
+where
+    R: super::sealed::Wsrp,
+{
+    reg.__wsrp(value)
+}
diff --git a/crates/core_arch/src/acle/registers/v6m.rs b/crates/core_arch/src/acle/registers/v6m.rs
new file mode 100644
index 0000000000..7acc63b6d1
--- /dev/null
+++ b/crates/core_arch/src/acle/registers/v6m.rs
@@ -0,0 +1,39 @@
+/// CONTROL register
+pub struct CONTROL;
+
+rsr!(CONTROL);
+wsr!(CONTROL);
+
+/// Execution Program Status Register
+pub struct EPSR;
+
+rsr!(EPSR);
+
+/// Interrupt Program Status Register
+pub struct IPSR;
+
+rsr!(IPSR);
+
+/// Main Stack Pointer
+pub struct MSP;
+
+rsrp!(MSP);
+wsrp!(MSP);
+
+/// Priority Mask Register
+pub struct PRIMASK;
+
+rsr!(PRIMASK);
+wsr!(PRIMASK);
+
+/// Process Stack Pointer
+pub struct PSP;
+
+rsrp!(PSP);
+wsrp!(PSP);
+
+/// Program Status Register
+#[allow(non_camel_case_types)]
+pub struct xPSR;
+
+rsr!(xPSR);
diff --git a/crates/core_arch/src/acle/registers/v7m.rs b/crates/core_arch/src/acle/registers/v7m.rs
new file mode 100644
index 0000000000..d1b1d474f1
--- /dev/null
+++ b/crates/core_arch/src/acle/registers/v7m.rs
@@ -0,0 +1,17 @@
+/// Base Priority Mask Register
+pub struct BASEPRI;
+
+rsr!(BASEPRI);
+wsr!(BASEPRI);
+
+/// Base Priority Mask Register (conditional write)
+#[allow(non_camel_case_types)]
+pub struct BASEPRI_MAX;
+
+wsr!(BASEPRI_MAX);
+
+/// Fault Mask Register
+pub struct FAULTMASK;
+
+rsr!(FAULTMASK);
+wsr!(FAULTMASK);
diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/acle/simd32.rs
new file mode 100644
index 0000000000..6b28ec88dc
--- /dev/null
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -0,0 +1,60 @@
+//! # References
+//!
+//! - Section 8.5 "32-bit SIMD intrinsics" of ACLE
+//!
+//! Intrinsics that could live here
+//!
+//! - __ssat16
+//! - __usat16
+//! - __sxtab16
+//! - __sxtb16
+//! - __uxtab16
+//! - __uxtb16
+//! - __qsub8
+//! - __sadd8
+//! - __shadd8
+//! - __shsub8
+//! - __ssub8
+//! - __uadd8
+//! - __uhadd8
+//! - __uhsub8
+//! - __uqadd8
+//! - __uqsub8
+//! - __usub8
+//! - __usad8
+//! - __usada8
+//! - __qadd16
+//! - __qasx
+//! - __qsub16
+//! - __sadd16
+//! - __sasx
+//! - __shadd16
+//! - __shasx
+//! - __shsax
+//! - __shsub16
+//! - __ssax
+//! - __ssub16
+//! - __uadd16
+//! - __uasx
+//! - __uhadd16
+//! - __uhasx
+//! - __uhsax
+//! - __uhsub16
+//! - __uqadd16
+//! - __uqasx
+//! - __uqsax
+//! - __uqsub16
+//! - __usax
+//! - __usub16
+//! - __smlad
+//! - __smladx
+//! - __smlald
+//! - __smlaldx
+//! - __smlsd
+//! - __smlsdx
+//! - __smlsld
+//! - __smlsldx
+//! - __smuad
+//! - __smuadx
+//! - __smusd
+//! - __smusdx
diff --git a/crates/core_arch/src/arm/cmsis.rs b/crates/core_arch/src/arm/cmsis.rs
deleted file mode 100644
index bc8509d3e8..0000000000
--- a/crates/core_arch/src/arm/cmsis.rs
+++ /dev/null
@@ -1,330 +0,0 @@
-//! CMSIS: Cortex Microcontroller Software Interface Standard
-//!
-//! The version 5 of the standard can be found at:
-//!
-//! http://arm-software.github.io/CMSIS_5/Core/html/index.html
-//!
-//! The API reference of the standard can be found at:
-//!
-//! - Core function access -- http://arm-software.github.io/CMSIS_5/Core/html/group__Core__Register__gr.html
-//! - Intrinsic functions for CPU instructions -- http://arm-software.github.io/CMSIS_5/Core/html/group__intrinsic__CPU__gr.html
-//!
-//! The reference C implementation used as the base of this Rust port can be
-//! found at
-//!
-//! https://github.com/ARM-software/CMSIS_5/blob/5.3.0/CMSIS/Core/Include/cmsis_gcc.h
-
-#![allow(non_snake_case)]
-
-/* Core function access */
-
-/// Enable IRQ Interrupts
-///
-/// Enables IRQ interrupts by clearing the I-bit in the CPSR. Can only be
-/// executed in Privileged modes.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(cpsie))]
-pub unsafe fn __enable_irq() {
-    asm!("cpsie i" : : : "memory" : "volatile");
-}
-
-/// Disable IRQ Interrupts
-///
-/// Disables IRQ interrupts by setting the I-bit in the CPSR. Can only be
-/// executed in Privileged modes.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(cpsid))]
-pub unsafe fn __disable_irq() {
-    asm!("cpsid i" : : : "memory" : "volatile");
-}
-
-/// Get Control Register
-///
-/// Returns the content of the Control Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_CONTROL() -> u32 {
-    let result: u32;
-    asm!("mrs $0, CONTROL" : "=r"(result) : : : "volatile");
-    result
-}
-
-/// Set Control Register
-///
-/// Writes the given value to the Control Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(msr))]
-pub unsafe fn __set_CONTROL(control: u32) {
-    asm!("msr CONTROL, $0" : : "r"(control) : "memory" : "volatile");
-}
-
-/// Get IPSR Register
-///
-/// Returns the content of the IPSR Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_IPSR() -> u32 {
-    let result: u32;
-    asm!("mrs $0, IPSR" : "=r"(result) : : : "volatile");
-    result
-}
-
-/// Get APSR Register
-///
-/// Returns the content of the APSR Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_APSR() -> u32 {
-    let result: u32;
-    asm!("mrs $0, APSR" : "=r"(result) : : : "volatile");
-    result
-}
-
-/// Get xPSR Register
-///
-/// Returns the content of the xPSR Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_xPSR() -> u32 {
-    let result: u32;
-    asm!("mrs $0, XPSR" : "=r"(result) : : : "volatile");
-    result
-}
-
-/// Get Process Stack Pointer
-///
-/// Returns the current value of the Process Stack Pointer (PSP).
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_PSP() -> u32 {
-    let result: u32;
-    asm!("mrs $0, PSP" : "=r"(result) : : : "volatile");
-    result
-}
-
-/// Set Process Stack Pointer
-///
-/// Assigns the given value to the Process Stack Pointer (PSP).
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(msr))]
-pub unsafe fn __set_PSP(top_of_proc_stack: u32) {
-    asm!("msr PSP, $0" : : "r"(top_of_proc_stack) : : "volatile");
-}
-
-/// Get Main Stack Pointer
-///
-/// Returns the current value of the Main Stack Pointer (MSP).
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_MSP() -> u32 {
-    let result: u32;
-    asm!("mrs $0, MSP" : "=r"(result) : : : "volatile");
-    result
-}
-
-/// Set Main Stack Pointer
-///
-/// Assigns the given value to the Main Stack Pointer (MSP).
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(msr))]
-pub unsafe fn __set_MSP(top_of_main_stack: u32) {
-    asm!("msr MSP, $0" : : "r"(top_of_main_stack) : : "volatile");
-}
-
-/// Get Priority Mask
-///
-/// Returns the current state of the priority mask bit from the Priority Mask
-/// Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_PRIMASK() -> u32 {
-    let result: u32;
-    asm!("mrs $0, PRIMASK" : "=r"(result) : : "memory" : "volatile");
-    result
-}
-
-/// Set Priority Mask
-///
-/// Assigns the given value to the Priority Mask Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(msr))]
-pub unsafe fn __set_PRIMASK(pri_mask: u32) {
-    asm!("msr PRIMASK, $0" : : "r"(pri_mask) : : "volatile");
-}
-
-#[cfg(any(target_feature = "v7", dox))]
-mod v7 {
-    /// Enable FIQ
-    ///
-    /// Enables FIQ interrupts by clearing the F-bit in the CPSR. Can only be
-    /// executed in Privileged modes.
-    #[inline]
-    #[target_feature(enable = "mclass")]
-    #[cfg_attr(test, assert_instr(cpsie))]
-    pub unsafe fn __enable_fault_irq() {
-        asm!("cpsie f" : : : "memory" : "volatile");
-    }
-
-    /// Disable FIQ
-    ///
-    /// Disables FIQ interrupts by setting the F-bit in the CPSR. Can only be
-    /// executed in Privileged modes.
-    #[inline]
-    #[target_feature(enable = "mclass")]
-    #[cfg_attr(test, assert_instr(cpsid))]
-    pub unsafe fn __disable_fault_irq() {
-        asm!("cpsid f" : : : "memory" : "volatile");
-    }
-
-    /// Get Base Priority
-    ///
-    /// Returns the current value of the Base Priority register.
-    #[inline]
-    #[target_feature(enable = "mclass")]
-    #[cfg_attr(test, assert_instr(mrs))]
-    pub unsafe fn __get_BASEPRI() -> u32 {
-        let result: u32;
-        asm!("mrs $0, BASEPRI" : "=r"(result) : : : "volatile");
-        result
-    }
-
-    /// Set Base Priority
-    ///
-    /// Assigns the given value to the Base Priority register.
-    #[inline]
-    #[target_feature(enable = "mclass")]
-    #[cfg_attr(test, assert_instr(msr))]
-    pub unsafe fn __set_BASEPRI(base_pri: u32) {
-        asm!("msr BASEPRI, $0" : : "r"(base_pri) : "memory" : "volatile");
-    }
-
-    /// Set Base Priority with condition
-    ///
-    /// Assigns the given value to the Base Priority register only if BASEPRI
-    /// masking is disabled, or the new value increases the BASEPRI
-    /// priority level.
-    #[inline]
-    #[target_feature(enable = "mclass")]
-    #[cfg_attr(test, assert_instr(mrs))]
-    pub unsafe fn __set_BASEPRI_MAX(base_pri: u32) {
-        asm!("msr BASEPRI_MAX, $0" : : "r"(base_pri) : "memory" : "volatile");
-    }
-
-    /// Get Fault Mask
-    ///
-    /// Returns the current value of the Fault Mask register.
-    #[inline]
-    #[target_feature(enable = "mclass")]
-    #[cfg_attr(test, assert_instr(mrs))]
-    pub unsafe fn __get_FAULTMASK() -> u32 {
-        let result: u32;
-        asm!("mrs $0, FAULTMASK" : "=r"(result) : : : "volatile");
-        result
-    }
-
-    /// Set Fault Mask
-    ///
-    /// Assigns the given value to the Fault Mask register.
-    #[inline]
-    #[target_feature(enable = "mclass")]
-    #[cfg_attr(test, assert_instr(msr))]
-    pub unsafe fn __set_FAULTMASK(fault_mask: u32) {
-        asm!("msr FAULTMASK, $0" : : "r"(fault_mask) : "memory" : "volatile");
-    }
-}
-
-#[cfg(any(target_feature = "v7", dox))]
-pub use self::v7::*;
-
-/* Core instruction access */
-
-/// No Operation
-///
-/// No Operation does nothing. This instruction can be used for code alignment
-/// purposes.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(nop))]
-pub unsafe fn __NOP() {
-    asm!("nop" : : : : "volatile");
-}
-
-/// Wait For Interrupt
-///
-/// Wait For Interrupt is a hint instruction that suspends execution until one
-/// of a number of events occurs.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(wfi))]
-pub unsafe fn __WFI() {
-    asm!("wfi" : : : : "volatile");
-}
-
-/// Wait For Event
-///
-/// Wait For Event is a hint instruction that permits the processor to enter a
-/// low-power state until one of a number of events occurs.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(wfe))]
-pub unsafe fn __WFE() {
-    asm!("wfe" : : : : "volatile");
-}
-
-/// Send Event
-///
-/// Send Event is a hint instruction. It causes an event to be signaled to the
-/// CPU.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(sev))]
-pub unsafe fn __SEV() {
-    asm!("sev" : : : : "volatile");
-}
-
-/// Instruction Synchronization Barrier
-///
-/// Instruction Synchronization Barrier flushes the pipeline in the processor,
-/// so that all instructions following the ISB are fetched from cache or
-/// memory, after the instruction has been completed.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(isb))]
-pub unsafe fn __ISB() {
-    asm!("isb 0xF" : : : "memory" : "volatile");
-}
-
-/// Data Synchronization Barrier
-///
-/// Acts as a special kind of Data Memory Barrier. It completes when all
-/// explicit memory accesses before this instruction complete.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(dsb))]
-pub unsafe fn __DSB() {
-    asm!("dsb 0xF" : : : "memory" : "volatile");
-}
-
-/// Data Memory Barrier
-///
-/// Ensures the apparent order of the explicit memory operations before and
-/// after the instruction, without ensuring their completion.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(dmb))]
-pub unsafe fn __DMB() {
-    asm!("dmb 0xF" : : : "memory" : "volatile");
-}
diff --git a/crates/core_arch/src/arm/mod.rs b/crates/core_arch/src/arm/mod.rs
index 30ff991f8d..dd69b11457 100644
--- a/crates/core_arch/src/arm/mod.rs
+++ b/crates/core_arch/src/arm/mod.rs
@@ -11,11 +11,6 @@ mod armclang;
 
 pub use self::armclang::*;
 
-#[cfg(any(target_feature = "mclass", dox))]
-mod cmsis;
-#[cfg(any(target_feature = "mclass", dox))]
-pub use self::cmsis::*;
-
 mod v6;
 pub use self::v6::*;
 
@@ -24,6 +19,7 @@ mod v7;
 #[cfg(any(target_arch = "aarch64", target_feature = "v7"))]
 pub use self::v7::*;
 
+// TODO move into the `acle::{dsp,simd32}` modules
 #[cfg(any(all(target_feature = "v7", not(target_feature = "mclass")), dox))]
 mod dsp;
 #[cfg(any(all(target_feature = "v7", not(target_feature = "mclass")), dox))]
@@ -44,6 +40,8 @@ mod neon;
 ))]
 pub use self::neon::*;
 
+pub use super::acle::*;
+
 #[cfg(test)]
 use stdsimd_test::assert_instr;
 
diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs
index 9705e091ca..1ca811ee75 100644
--- a/crates/core_arch/src/mod.rs
+++ b/crates/core_arch/src/mod.rs
@@ -3,6 +3,9 @@
 #[macro_use]
 mod macros;
 
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+mod acle;
+
 mod simd;
 
 #[cfg_attr(

From 342786c3e6c9a36a32514ac6724e10f545a30924 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 3 Sep 2018 19:20:33 +0200
Subject: [PATCH 02/31] fix relative import

---
 crates/core_arch/src/acle/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index ec7dabd2b3..3d1b0324d5 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -83,7 +83,7 @@ mod dsp;
     ),
     all(target_feature = "mclass", target_feature = "dsp"),
 ))]
-pub use dsp::*;
+pub use self::dsp::*;
 
 // Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says
 // Section 5.4.9 of ACLE.

From 503f9f7cb28b03cd500be77a50803984747e9409 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 3 Sep 2018 19:21:14 +0200
Subject: [PATCH 03/31] add missing import

---
 crates/core_arch/src/acle/mod.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index 3d1b0324d5..843115a366 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -95,6 +95,12 @@ pub use self::dsp::*;
 ))]
 mod simd32;
 
+#[cfg(any(
+    target_feature = "rclass",
+    all(target_feature = "mclass", target_feature = "dsp")
+))]
+pub use self::simd32::*;
+
 mod sealed {
     pub trait Dmb {
         unsafe fn __dmb(&self);

From b077652204e355e0cc741b0c4005539adbb40b90 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 17:27:48 +0100
Subject: [PATCH 04/31] acle::hints: use llvm.{arm,aarch64.hint}

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255250217
---
 crates/core_arch/src/acle/hints.rs | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/acle/hints.rs
index 1b77e5e64c..78d7834073 100644
--- a/crates/core_arch/src/acle/hints.rs
+++ b/crates/core_arch/src/acle/hints.rs
@@ -12,7 +12,7 @@
 #[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
 #[inline(always)]
 pub unsafe fn __wfi() {
-    asm!("WFI" : : : : "volatile")
+    hint(HINT_WFI);
 }
 
 /// Generates a WFE (wait for event) hint instruction, or nothing.
@@ -25,7 +25,7 @@ pub unsafe fn __wfi() {
 #[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
 #[inline(always)]
 pub unsafe fn __wfe() {
-    asm!("WFE" : : : : "volatile")
+    hint(HINT_WFE);
 }
 
 /// Generates a SEV (send a global event) hint instruction.
@@ -37,7 +37,7 @@ pub unsafe fn __wfe() {
 #[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
 #[inline(always)]
 pub unsafe fn __sev() {
-    asm!("SEV" : : : : "volatile")
+    hint(HINT_SEV);
 }
 
 /// Generates a send a local event hint instruction.
@@ -49,7 +49,7 @@ pub unsafe fn __sev() {
 #[cfg(target_arch = "aarch64")]
 #[inline(always)]
 pub unsafe fn __sevl() {
-    asm!("SEVL" : : : : "volatile")
+    hint(HINT_SEVL);
 }
 
 /// Generates a YIELD hint instruction.
@@ -62,7 +62,7 @@ pub unsafe fn __sevl() {
 #[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
 #[inline(always)]
 pub unsafe fn __yield() {
-    asm!("YIELD" : : : : "volatile")
+    hint(HINT_YIELD);
 }
 
 /// Generates a DBG instruction.
@@ -111,5 +111,19 @@ pub unsafe fn __dbg(imm4: u32) {
 /// will increase execution time.
 #[inline(always)]
 pub unsafe fn __nop() {
-    asm!("NOP" : : : : "volatile")
+    hint(HINT_NOP);
 }
+
+extern "C" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.hint")]
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.hint")]
+    fn hint(_: i32);
+}
+
+// from LLVM 7.0.1's lib/Target/ARM/{ARMInstrThumb,ARMInstrInfo,ARMInstrThumb2}.td
+const HINT_NOP: i32 = 0;
+const HINT_YIELD: i32 = 1;
+const HINT_WFE: i32 = 2;
+const HINT_WFI: i32 = 3;
+const HINT_SEV: i32 = 4;
+const HINT_SEVL: i32 = 5;

From b6672dd6883260c81a8f8bbe32598e9c6dc5cdb5 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 17:37:46 +0100
Subject: [PATCH 05/31] acle/hints: __dbg requires 'v7'

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255250415
---
 crates/core_arch/src/acle/hints.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/acle/hints.rs
index 78d7834073..52d47cc9bf 100644
--- a/crates/core_arch/src/acle/hints.rs
+++ b/crates/core_arch/src/acle/hints.rs
@@ -72,8 +72,11 @@ pub unsafe fn __yield() {
 /// for the effect (if any) of this instruction and the meaning of the
 /// argument. This is available only when compliling for AArch32.
 // Section 10.1 of ACLE says that the supported arches are: 7, 7-M
-// LLVM says "instruction requires: thumb2" OR "instruction requires: armv7"
-#[cfg(target_feature = "v6t2")]
+// "The DBG hint instruction is added in ARMv7. It is UNDEFINED in the ARMv6 base architecture, and
+// executes as a NOP instruction in ARMv6K and ARMv6T2." - ARM Architecture Reference Manual ARMv7-A
+// and ARMv7-R edition (ARM DDI 0406C.c) sections D12.4.1 "ARM instruction set support" and D12.4.2
+// "Thumb instruction set support"
+#[cfg(target_feature = "v7")]
 #[inline(always)]
 #[rustc_args_required_const(0)]
 pub unsafe fn __dbg(imm4: u32) {

From 54223249e3da5ab856a13afd2564eff82d28c5b9 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 17:44:54 +0100
Subject: [PATCH 06/31] acle/hints: use asm! for __nop

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255250890
---
 crates/core_arch/src/acle/hints.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/acle/hints.rs
index 52d47cc9bf..6f9a2dfb21 100644
--- a/crates/core_arch/src/acle/hints.rs
+++ b/crates/core_arch/src/acle/hints.rs
@@ -114,7 +114,7 @@ pub unsafe fn __dbg(imm4: u32) {
 /// will increase execution time.
 #[inline(always)]
 pub unsafe fn __nop() {
-    hint(HINT_NOP);
+    asm!("NOP" : : : : "volatile")
 }
 
 extern "C" {

From 5973346dc772cbdf96f459041bd0c7276279f33c Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 18:29:48 +0100
Subject: [PATCH 07/31] acle/hints: most hints require 'v6' rather than 'v6k'

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255251241
---
 crates/core_arch/src/acle/hints.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/acle/hints.rs
index 6f9a2dfb21..6f33fe3419 100644
--- a/crates/core_arch/src/acle/hints.rs
+++ b/crates/core_arch/src/acle/hints.rs
@@ -9,7 +9,7 @@
 /// low-power state until one of a number of asynchronous events occurs.
 // Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
 // LLVM says "instruction requires: armv6k"
-#[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
+#[cfg(any(target_feature = "v6", target_arch = "aarch64"))]
 #[inline(always)]
 pub unsafe fn __wfi() {
     hint(HINT_WFI);
@@ -22,7 +22,7 @@ pub unsafe fn __wfi() {
 /// another processor.
 // Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
 // LLVM says "instruction requires: armv6k"
-#[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
+#[cfg(any(target_feature = "v6", target_arch = "aarch64"))]
 #[inline(always)]
 pub unsafe fn __wfe() {
     hint(HINT_WFE);
@@ -34,7 +34,7 @@ pub unsafe fn __wfe() {
 /// system. It is a NOP on a uniprocessor system.
 // Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M, 7-M
 // LLVM says "instruction requires: armv6k"
-#[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
+#[cfg(any(target_feature = "v6", target_arch = "aarch64"))]
 #[inline(always)]
 pub unsafe fn __sev() {
     hint(HINT_SEV);
@@ -59,7 +59,7 @@ pub unsafe fn __sevl() {
 /// improve overall system performance.
 // Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
 // LLVM says "instruction requires: armv6k"
-#[cfg(any(target_feature = "v6k", target_arch = "aarch64"))]
+#[cfg(any(target_feature = "v6", target_arch = "aarch64"))]
 #[inline(always)]
 pub unsafe fn __yield() {
     hint(HINT_YIELD);

From b3f0f28a4b7a577b1a2d8ac449f0611d0b68a4ce Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 18:35:26 +0100
Subject: [PATCH 08/31] acle/simd32: also expose on the A profile

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255253933
---
 crates/core_arch/src/acle/mod.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index 843115a366..59197a0df9 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -86,17 +86,17 @@ mod dsp;
 pub use self::dsp::*;
 
 // Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says
-// Section 5.4.9 of ACLE.
+// Section 5.4.9 of ACLE. We'll expose these for the A profile even if deprecated
 #[cfg(any(
-    // v7-R
-    target_feature = "rclass",
+    // v7-A, v7-R
+    all(target_feature = "v6", not(target_feature = "mclass")),
     // v7E-M
     all(target_feature = "mclass", target_feature = "dsp")
 ))]
 mod simd32;
 
 #[cfg(any(
-    target_feature = "rclass",
+    all(target_feature = "v6", not(target_feature = "mclass")),
     all(target_feature = "mclass", target_feature = "dsp")
 ))]
 pub use self::simd32::*;

From 25961be88f8be13a02399b0ed68bf7ba8b6066d2 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 19:06:01 +0100
Subject: [PATCH 09/31] acle/dsp: make available on the A profile

---
 crates/core_arch/src/acle/mod.rs | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index 59197a0df9..4d7e45af0d 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -65,22 +65,18 @@ mod registers;
 pub use self::registers::*;
 
 // Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD)
-// But we also exclude the A profile because DSP is deprecated on that profile as of ACLE 2.0 (see
+// We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see
 // section 5.4.7)
 #[cfg(any(
     // >= v5TE but excludes v7-A
-    all(target_feature = "v5te", not(target_feature = "mclass"), not(target_feature = "aclass")),
+    all(target_feature = "v5te", not(target_feature = "mclass")),
     // v7E-M
     all(target_feature = "mclass", target_feature = "dsp"),
 ))]
 mod dsp;
 
 #[cfg(any(
-    all(
-        target_feature = "v5te",
-        not(target_feature = "mclass"),
-        not(target_feature = "aclass")
-    ),
+    all(target_feature = "v5te", not(target_feature = "mclass")),
     all(target_feature = "mclass", target_feature = "dsp"),
 ))]
 pub use self::dsp::*;

From 74ddada5126a95537c26d74f638bbd075da66b9c Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 19:45:27 +0100
Subject: [PATCH 10/31] acle/barrier: use llvm.{arm,aarch64}.{dmb,dsb,isb}
 instead of asm!

also make these available on architectures that don't have a dedicated DMB / DSB
/ ISB instruction

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255312214
---
 crates/core_arch/src/acle/barrier/common.rs |  2 +-
 crates/core_arch/src/acle/barrier/cp15.rs   | 27 ++++++++
 crates/core_arch/src/acle/barrier/mod.rs    | 74 +++++++++++++++++++--
 crates/core_arch/src/acle/mod.rs            | 12 +---
 4 files changed, 100 insertions(+), 15 deletions(-)
 create mode 100644 crates/core_arch/src/acle/barrier/cp15.rs

diff --git a/crates/core_arch/src/acle/barrier/common.rs b/crates/core_arch/src/acle/barrier/common.rs
index a1d8c93e8c..0fb35534d1 100644
--- a/crates/core_arch/src/acle/barrier/common.rs
+++ b/crates/core_arch/src/acle/barrier/common.rs
@@ -9,6 +9,6 @@ dmb_dsb!(SY);
 impl super::super::sealed::Isb for SY {
     #[inline(always)]
     unsafe fn __isb(&self) {
-        asm!("ISB SY" : : : "memory" : "volatile")
+        super::isb(super::arg::SY)
     }
 }
diff --git a/crates/core_arch/src/acle/barrier/cp15.rs b/crates/core_arch/src/acle/barrier/cp15.rs
new file mode 100644
index 0000000000..7938acbbb4
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/cp15.rs
@@ -0,0 +1,27 @@
+// Reference: ARM11 MPCore Processor Technical Reference Manual (ARM DDI 0360E) Section 3.5 "Summary
+// of CP15 instructions"
+
+/// Full system is the required shareability domain, reads and writes are the
+/// required access types
+pub struct SY;
+
+impl super::super::sealed::Dmb for SY {
+    #[inline(always)]
+    unsafe fn __dmb(&self) {
+        asm!("mcr p15, 0, r0, c7, c10, 5" : : : "memory" : "volatile")
+    }
+}
+
+impl super::super::sealed::Dsb for SY {
+    #[inline(always)]
+    unsafe fn __dsb(&self) {
+        asm!("mcr p15, 0, r0, c7, c10, 4" : : : "memory" : "volatile")
+    }
+}
+
+impl super::super::sealed::Isb for SY {
+    #[inline(always)]
+    unsafe fn __isb(&self) {
+        asm!("mcr p15, 0, r0, c7, c5, 4" : : : "memory" : "volatile")
+    }
+}
diff --git a/crates/core_arch/src/acle/barrier/mod.rs b/crates/core_arch/src/acle/barrier/mod.rs
index 3fbf6899ff..61686895f0 100644
--- a/crates/core_arch/src/acle/barrier/mod.rs
+++ b/crates/core_arch/src/acle/barrier/mod.rs
@@ -1,31 +1,66 @@
 // Reference: Section 7.4 "Hints" of ACLE
 
+// CP15 instruction
+#[cfg(not(any(
+    // v8
+    target_arch = "aarch64",
+    // v7
+    target_feature = "v7",
+    // v6-M
+    target_feature = "mclass"
+)))]
+mod cp15;
+
+#[cfg(not(any(
+    target_arch = "aarch64",
+    target_feature = "v7",
+    target_feature = "mclass"
+)))]
+pub use self::cp15::*;
+
+// Dedicated instructions
 macro_rules! dmb_dsb {
     ($A:ident) => {
         impl super::super::sealed::Dmb for $A {
             #[inline(always)]
             unsafe fn __dmb(&self) {
-                asm!(concat!("DMB ", stringify!($A)) : : : "memory" : "volatile")
+                super::dmb(super::arg::$A)
             }
         }
 
         impl super::super::sealed::Dsb for $A {
             #[inline(always)]
             unsafe fn __dsb(&self) {
-                asm!(concat!("DSB ", stringify!($A)) : : : "memory" : "volatile")
+                super::dsb(super::arg::$A)
             }
         }
     };
 }
 
+#[cfg(any(
+    target_arch = "aarch64",
+    target_feature = "v7",
+    target_feature = "mclass"
+))]
 mod common;
 
+#[cfg(any(
+    target_arch = "aarch64",
+    target_feature = "v7",
+    target_feature = "mclass"
+))]
 pub use self::common::*;
 
-#[cfg(not(target_feature = "mclass"))]
+#[cfg(any(
+    target_arch = "aarch64",
+    target_feature = "v7",
+))]
 mod not_mclass;
 
-#[cfg(not(target_feature = "mclass"))]
+#[cfg(any(
+    target_arch = "aarch64",
+    target_feature = "v7",
+))]
 pub use self::not_mclass::*;
 
 #[cfg(target_arch = "aarch64")]
@@ -87,3 +122,34 @@ where
 {
     arg.__isb()
 }
+
+extern "C" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.dmb")]
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dmb")]
+    fn dmb(_: i32);
+
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.dsb")]
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dsb")]
+    fn dsb(_: i32);
+
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.isb")]
+    #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.isb")]
+    fn isb(_: i32);
+}
+
+// we put these in a module to prevent weirdness with glob re-exports
+mod arg {
+    // See Section 7.3 "Memory barriers" of ACLE
+    pub const SY: i32 = 15;
+    pub const ST: i32 = 14;
+    pub const LD: i32 = 13;
+    pub const ISH: i32 = 11;
+    pub const ISHST: i32 = 10;
+    pub const ISHLD: i32 = 9;
+    pub const NSH: i32 = 7;
+    pub const NSHST: i32 = 6;
+    pub const NSHLD: i32 = 5;
+    pub const OSH: i32 = 3;
+    pub const OSHST: i32 = 2;
+    pub const OSHLD: i32 = 1;
+}
diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index 4d7e45af0d..83aaacb5eb 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -37,16 +37,8 @@
 //!
 //! - [ACLE Q2 2018](https://developer.arm.com/docs/101028/latest)
 
-// Supported arches: 8, 7, 6-M. See Section 10.1 of ACLE (e.g. DMB)
-// But this is further refined within the module
-#[cfg(any(
-    // v8
-    target_arch = "aarch64",
-    // v7
-    target_feature = "v7",
-    // v6-M
-    target_feature = "mclass"
-))]
+// 8, 7 and 6-M are supported via dedicated instructions like DMB. All other arches are supported
+// via CP15 instructions. See Section 10.1 of ACLE
 mod barrier;
 
 #[cfg(any(

From d91daffe239e2079881dbf44825d961b27215d33 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 19:52:15 +0100
Subject: [PATCH 11/31] acle/{simd32,dsp}: not available on aarch64

addresses

https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255312249

https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255312264
---
 crates/core_arch/src/acle/mod.rs | 44 ++++++++++++++++++++------------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index 83aaacb5eb..d173246511 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -59,33 +59,45 @@ pub use self::registers::*;
 // Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD)
 // We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see
 // section 5.4.7)
-#[cfg(any(
-    // >= v5TE but excludes v7-A
-    all(target_feature = "v5te", not(target_feature = "mclass")),
-    // v7E-M
-    all(target_feature = "mclass", target_feature = "dsp"),
+#[cfg(all(
+    not(target_arch = "aarch64"),
+    any(
+        // >= v5TE but excludes v7-A
+        all(target_feature = "v5te", not(target_feature = "mclass")),
+        // v7E-M
+        all(target_feature = "mclass", target_feature = "dsp"),
+    )
 ))]
 mod dsp;
 
-#[cfg(any(
-    all(target_feature = "v5te", not(target_feature = "mclass")),
-    all(target_feature = "mclass", target_feature = "dsp"),
+#[cfg(all(
+    not(target_arch = "aarch64"),
+    any(
+        all(target_feature = "v5te", not(target_feature = "mclass")),
+        all(target_feature = "mclass", target_feature = "dsp"),
+    )
 ))]
 pub use self::dsp::*;
 
 // Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says
 // Section 5.4.9 of ACLE. We'll expose these for the A profile even if deprecated
-#[cfg(any(
-    // v7-A, v7-R
-    all(target_feature = "v6", not(target_feature = "mclass")),
-    // v7E-M
-    all(target_feature = "mclass", target_feature = "dsp")
+#[cfg(all(
+    not(target_arch = "aarch64"),
+    any(
+        // v7-A, v7-R
+        all(target_feature = "v6", not(target_feature = "mclass")),
+        // v7E-M
+        all(target_feature = "mclass", target_feature = "dsp")
+    )
 ))]
 mod simd32;
 
-#[cfg(any(
-    all(target_feature = "v6", not(target_feature = "mclass")),
-    all(target_feature = "mclass", target_feature = "dsp")
+#[cfg(all(
+    not(target_arch = "aarch64"),
+    any(
+        all(target_feature = "v6", not(target_feature = "mclass")),
+        all(target_feature = "mclass", target_feature = "dsp")
+    )
 ))]
 pub use self::simd32::*;
 

From 0d8151926c208e6f0ac4505607efb28b8b633607 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Wed, 13 Feb 2019 20:05:32 +0100
Subject: [PATCH 12/31] acle: move saturating intrinsics into its own module

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255312560
---
 crates/core_arch/src/acle/dsp.rs |  3 ---
 crates/core_arch/src/acle/mod.rs | 13 +++++++++++++
 crates/core_arch/src/acle/sat.rs |  8 ++++++++
 3 files changed, 21 insertions(+), 3 deletions(-)
 create mode 100644 crates/core_arch/src/acle/sat.rs

diff --git a/crates/core_arch/src/acle/dsp.rs b/crates/core_arch/src/acle/dsp.rs
index 4029e7aaa3..31817ea870 100644
--- a/crates/core_arch/src/acle/dsp.rs
+++ b/crates/core_arch/src/acle/dsp.rs
@@ -1,7 +1,6 @@
 //! # References:
 //!
 //! - Section 8.3 "16-bit multiplications"
-//! - Section 8.4 "Saturating intrinsics"
 //!
 //! Intrinsics that could live here:
 //!
@@ -11,8 +10,6 @@
 //! - __smultt
 //! - __smulwb
 //! - __smulwt
-//! - __ssat
-//! - __usat
 //! - __qadd
 //! - __qsub
 //! - __qdbl
diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index d173246511..f1be11b27b 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -79,6 +79,19 @@ mod dsp;
 ))]
 pub use self::dsp::*;
 
+// Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT)
+#[cfg(all(
+    not(target_arch = "aarch64"),
+    target_feature = "v6",
+))]
+mod sat;
+
+#[cfg(all(
+    not(target_arch = "aarch64"),
+    target_feature = "v6",
+))]
+pub use self::sat::*;
+
 // Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says
 // Section 5.4.9 of ACLE. We'll expose these for the A profile even if deprecated
 #[cfg(all(
diff --git a/crates/core_arch/src/acle/sat.rs b/crates/core_arch/src/acle/sat.rs
new file mode 100644
index 0000000000..38c98d7342
--- /dev/null
+++ b/crates/core_arch/src/acle/sat.rs
@@ -0,0 +1,8 @@
+//! # References:
+//!
+//! - Section 8.4 "Saturating intrinsics"
+//!
+//! Intrinsics that could live here:
+//!
+//! - __ssat
+//! - __usat

From 50b51260cd8dcb93e3b2dd07ca8afffce4669637 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Thu, 14 Feb 2019 12:29:46 +0100
Subject: [PATCH 13/31] acle/hints: gate sevl on 'v8' rather than on 'aarch64'

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r256553546
---
 crates/core_arch/src/acle/hints.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/acle/hints.rs
index 6f33fe3419..2288eb3c9e 100644
--- a/crates/core_arch/src/acle/hints.rs
+++ b/crates/core_arch/src/acle/hints.rs
@@ -46,7 +46,7 @@ pub unsafe fn __sev() {
 /// instruction. In a multiprocessor system, it is not required to affect the
 /// other processors.
 // LLVM says "instruction requires: armv8"
-#[cfg(target_arch = "aarch64")]
+#[cfg(target_feature = "v8")]
 #[inline(always)]
 pub unsafe fn __sevl() {
     hint(HINT_SEVL);

From b14287e42a4c263bcda2050747f9a2110aa24f63 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Thu, 14 Feb 2019 12:37:11 +0100
Subject: [PATCH 14/31] acle/barrier: remove cfg from re-export

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r256556043
---
 crates/core_arch/src/acle/mod.rs | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index f1be11b27b..a9b4dbfd72 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -41,11 +41,6 @@
 // via CP15 instructions. See Section 10.1 of ACLE
 mod barrier;
 
-#[cfg(any(
-    target_arch = "aarch64",
-    target_feature = "v7",
-    target_feature = "mclass"
-))]
 pub use self::barrier::*;
 
 mod hints;

From 81ad6208986c73321852e7db2c667506f98218c7 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Thu, 14 Feb 2019 12:38:27 +0100
Subject: [PATCH 15/31] acle/dsp: update comment

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r256556341
---
 crates/core_arch/src/acle/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index a9b4dbfd72..7d281538f5 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -57,7 +57,7 @@ pub use self::registers::*;
 #[cfg(all(
     not(target_arch = "aarch64"),
     any(
-        // >= v5TE but excludes v7-A
+        // >= v5TE but excludes v7-M
         all(target_feature = "v5te", not(target_feature = "mclass")),
         // v7E-M
         all(target_feature = "mclass", target_feature = "dsp"),

From fa0deb38ffe32b0e15e3fba3fe526dc806190d24 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Thu, 14 Feb 2019 12:49:36 +0100
Subject: [PATCH 16/31] acle/dsp: note the difference between LLVM's +dsp and
 ACLE's __ARM_FEATURE_DSP

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r256597576
---
 crates/core_arch/src/acle/mod.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index 7d281538f5..a650173618 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -33,6 +33,14 @@
 //! - `v6 < v8m < v6t2`
 //! - `v7 < v8m.main`
 //!
+//! *NOTE*: Section 5.4.7 of ACLE says:
+//!
+//! - "__ARM_FEATURE_DSP is defined to 1 if the DSP (v5E) instructions are supported and the
+//! intrinsics defined in Saturating intrinsics are available."
+//!
+//! This does *not* match how LLVM uses the '+dsp' feature; this feature is not set for v5te
+//! targets so we have to work around this difference.
+//!
 //! # References
 //!
 //! - [ACLE Q2 2018](https://developer.arm.com/docs/101028/latest)
@@ -54,6 +62,8 @@ pub use self::registers::*;
 // Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD)
 // We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see
 // section 5.4.7)
+// Here we work around the difference between LLVM's +dsp and ACLE's __ARM_FEATURE_DSP by gating on
+// '+v5te' rather than on '+dsp'
 #[cfg(all(
     not(target_arch = "aarch64"),
     any(

From b3822ef729750c96d8bba1b4003e3c65335e3027 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Thu, 14 Feb 2019 15:38:43 +0100
Subject: [PATCH 17/31] acle: add ldrex, clrex and strex

---
 crates/core_arch/src/acle/ex.rs  | 115 +++++++++++++++++++++++++++++++
 crates/core_arch/src/acle/mod.rs |   4 ++
 2 files changed, 119 insertions(+)
 create mode 100644 crates/core_arch/src/acle/ex.rs

diff --git a/crates/core_arch/src/acle/ex.rs b/crates/core_arch/src/acle/ex.rs
new file mode 100644
index 0000000000..c25e0dc37a
--- /dev/null
+++ b/crates/core_arch/src/acle/ex.rs
@@ -0,0 +1,115 @@
+// Reference: Section 5.4.4 "LDREX / STREX" of ACLE
+
+/// Removes the exclusive lock created by LDREX
+// Supported: v6, v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6-M
+#[cfg(any(
+    all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+    all(target_feature = "v7", target_feature = "mclass"), // v7-M
+))]
+pub unsafe fn __clrex() {
+    extern "C" {
+        #[link_name = "llvm.arm.clrex"]
+        fn clrex();
+    }
+
+    clrex()
+}
+
+/// Executes an exclusive LDR instruction for an 8-bit value.
+// Supported: v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6, v6-M
+#[cfg(
+    target_feature = "v6k", // includes v7-M but excludes v6-M
+)]
+pub unsafe fn __ldrexb(p: *const u8) -> u8 {
+    extern "C" {
+        #[link_name = "llvm.arm.ldrex.p0i8"]
+        fn ldrex8(p: *const u8) -> u32;
+    }
+
+    ldrex8(p) as u8
+}
+
+/// Executes an exclusive LDR instruction for a 16-bit value.
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(
+    target_feature = "v6k", // includes v7-M but excludes v6-M
+)]
+pub unsafe fn __ldrexh(p: *const u16) -> u16 {
+    extern "C" {
+        #[link_name = "llvm.arm.ldrex.p0i16"]
+        fn ldrex16(p: *const u16) -> u32;
+    }
+
+    ldrex16(p) as u16
+}
+
+/// Executes an exclusive LDR instruction for a 32-bit value.
+// Supported: v6, v7-M, v6K, v7-A, v7-R, v8
+// Not supported: v5, v6-M
+#[cfg(any(
+    all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+    all(target_feature = "v7", target_feature = "mclass"), // v7-M
+))]
+pub unsafe fn __ldrex(p: *const u32) -> u32 {
+    extern "C" {
+        #[link_name = "llvm.arm.ldrex.p0i32"]
+        fn ldrex32(p: *const u32) -> u32;
+    }
+
+    ldrex32(p)
+}
+
+/// Executes an exclusive STR instruction for 8-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// supported: v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6, v6-M
+#[cfg(
+    target_feature = "v6k", // includes v7-M but excludes v6-M
+)]
+pub unsafe fn __strexb(value: u32, addr: *const u8) -> u32 {
+    extern "C" {
+        #[link_name = "llvm.arm.strex.p0i8"]
+        fn strex8(value: u32, addr: *const u8) -> u32;
+    }
+
+    strex8(value, addr)
+}
+
+/// Executes an exclusive STR instruction for 16-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(
+    target_feature = "v6k", // includes v7-M but excludes v6-M
+)]
+pub unsafe fn __strexh(value: u16, addr: *const u16) -> u32 {
+    extern "C" {
+        #[link_name = "llvm.arm.strex.p0i16"]
+        fn strex16(value: u32, addr: *const u16) -> u32;
+    }
+
+    strex16(value as u32, addr)
+}
+
+/// Executes an exclusive STR instruction for 32-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6, v7-M, v6K, v7-A, v7-R, v8
+// Not supported: v5, v6-M
+#[cfg(any(
+    all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+    all(target_feature = "v7", target_feature = "mclass"), // v7-M
+))]
+pub unsafe fn __strex(value: u32, addr: *const u32) -> u32 {
+    extern "C" {
+        #[link_name = "llvm.arm.strex.p0i32"]
+        fn strex32(value: u32, addr: *const u32) -> u32;
+    }
+
+    strex32(value, addr)
+}
diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index a650173618..068edc8aa2 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -59,6 +59,10 @@ mod registers;
 
 pub use self::registers::*;
 
+mod ex;
+
+pub use self::ex::*;
+
 // Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD)
 // We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see
 // section 5.4.7)

From e67cf235c5404b6bb9ae98e2a21dd7be6466f582 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Thu, 14 Feb 2019 15:51:33 +0100
Subject: [PATCH 18/31] acle/docs: add armv8-m and armv8-r to the list of rustc
 targets & llvm features

---
 crates/core_arch/src/acle/mod.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index 068edc8aa2..1354fe9fae 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -13,6 +13,8 @@
 //! - `armv7r-none-eabi` - **ARMv7-R** - `+v4t +v5te +v6 +v6k +v6t2  +v7 +dsp +thumb2 +rclass`
 //! - `thumbv7m-none-eabi` - **ARMv7-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass`
 //! - `thumbv7em-none-eabi` - **ARMv7E-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +thumb-mode +mclass`
+//! - `thumbv8m.main-none-eabi` - **ARMv8-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass`
+//! - `armv8r-none-eabi` - **ARMv8-R** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +v8 +thumb2 +rclass`
 //! - `aarch64-unknown-linux-gnu` - **ARMv8-A (AArch64)** - `+fp +neon`
 //!
 //! Section 10.1 of ACLE says:

From 53f320e98120dff5b16e05e0bf00c31e31549c07 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Thu, 14 Feb 2019 15:54:08 +0100
Subject: [PATCH 19/31] acle/hints: make sevl truly available on aarch64

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r256864336
---
 crates/core_arch/src/acle/hints.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/acle/hints.rs
index 2288eb3c9e..20faed69cb 100644
--- a/crates/core_arch/src/acle/hints.rs
+++ b/crates/core_arch/src/acle/hints.rs
@@ -46,7 +46,10 @@ pub unsafe fn __sev() {
 /// instruction. In a multiprocessor system, it is not required to affect the
 /// other processors.
 // LLVM says "instruction requires: armv8"
-#[cfg(target_feature = "v8")]
+#[cfg(any(
+    target_feature = "v8", // 32-bit ARMv8
+    target_arch = "aarch64", // AArch64
+))]
 #[inline(always)]
 pub unsafe fn __sevl() {
     hint(HINT_SEVL);

From cbfd8d0e3e36e8b741fa74dd5ff6af9d6c5a0855 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Thu, 14 Feb 2019 17:08:57 +0100
Subject: [PATCH 20/31] acle: move arm/dsp into acle/{dsp,simd32}

addresses https://github.com/rust-lang-nursery/stdsimd/pull/557#discussion_r255312454
---
 crates/core_arch/src/acle/dsp.rs    |  57 ++-
 crates/core_arch/src/acle/simd32.rs | 732 ++++++++++++++++++++++++++--
 crates/core_arch/src/arm/dsp.rs     | 654 -------------------------
 crates/core_arch/src/arm/mod.rs     |   6 -
 4 files changed, 720 insertions(+), 729 deletions(-)
 delete mode 100644 crates/core_arch/src/arm/dsp.rs

diff --git a/crates/core_arch/src/acle/dsp.rs b/crates/core_arch/src/acle/dsp.rs
index 31817ea870..3a71f2c469 100644
--- a/crates/core_arch/src/acle/dsp.rs
+++ b/crates/core_arch/src/acle/dsp.rs
@@ -4,18 +4,45 @@
 //!
 //! Intrinsics that could live here:
 //!
-//! - __smulbb
-//! - __smulbt
-//! - __smultb
-//! - __smultt
-//! - __smulwb
-//! - __smulwt
-//! - __qadd
-//! - __qsub
-//! - __qdbl
-//! - __smlabb
-//! - __smlabt
-//! - __smlatb
-//! - __smlatt
-//! - __smlawb
-//! - __smlawt
+//! - [ ] __smulbb
+//! - [ ] __smulbt
+//! - [ ] __smultb
+//! - [ ] __smultt
+//! - [ ] __smulwb
+//! - [ ] __smulwt
+//! - [x] __qadd
+//! - [x] __qsub
+//! - [ ] __qdbl
+//! - [ ] __smlabb
+//! - [ ] __smlabt
+//! - [ ] __smlatb
+//! - [ ] __smlatt
+//! - [ ] __smlawb
+//! - [ ] __smlawt
+
+extern "C" {
+    #[link_name = "llvm.arm.qadd"]
+    fn arm_qadd(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub"]
+    fn arm_qsub(a: i32, b: i32) -> i32;
+
+}
+
+/// Signed saturating addition
+///
+/// Returns the 32-bit saturating signed equivalent of a + b.
+#[inline]
+#[cfg_attr(test, assert_instr(qadd))]
+pub unsafe fn qadd(a: i32, b: i32) -> i32 {
+    arm_qadd(a, b)
+}
+
+/// Signed saturating subtraction
+///
+/// Returns the 32-bit saturating signed equivalent of a - b.
+#[inline]
+#[cfg_attr(test, assert_instr(qsub))]
+pub unsafe fn qsub(a: i32, b: i32) -> i32 {
+    arm_qsub(a, b)
+}
diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/acle/simd32.rs
index 6b28ec88dc..420ce2b2d0 100644
--- a/crates/core_arch/src/acle/simd32.rs
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -4,57 +4,681 @@
 //!
 //! Intrinsics that could live here
 //!
-//! - __ssat16
-//! - __usat16
-//! - __sxtab16
-//! - __sxtb16
-//! - __uxtab16
-//! - __uxtb16
-//! - __qsub8
-//! - __sadd8
-//! - __shadd8
-//! - __shsub8
-//! - __ssub8
-//! - __uadd8
-//! - __uhadd8
-//! - __uhsub8
-//! - __uqadd8
-//! - __uqsub8
-//! - __usub8
-//! - __usad8
-//! - __usada8
-//! - __qadd16
-//! - __qasx
-//! - __qsub16
-//! - __sadd16
-//! - __sasx
-//! - __shadd16
-//! - __shasx
-//! - __shsax
-//! - __shsub16
-//! - __ssax
-//! - __ssub16
-//! - __uadd16
-//! - __uasx
-//! - __uhadd16
-//! - __uhasx
-//! - __uhsax
-//! - __uhsub16
-//! - __uqadd16
-//! - __uqasx
-//! - __uqsax
-//! - __uqsub16
-//! - __usax
-//! - __usub16
-//! - __smlad
-//! - __smladx
-//! - __smlald
-//! - __smlaldx
-//! - __smlsd
-//! - __smlsdx
-//! - __smlsld
-//! - __smlsldx
-//! - __smuad
-//! - __smuadx
-//! - __smusd
-//! - __smusdx
+//! - [x] __sel
+//! - [ ] __ssat16
+//! - [ ] __usat16
+//! - [ ] __sxtab16
+//! - [ ] __sxtb16
+//! - [ ] __uxtab16
+//! - [ ] __uxtb16
+//! - [x] __qadd8
+//! - [x] __qsub8
+//! - [x] __sadd8
+//! - [x] __shadd8
+//! - [x] __shsub8
+//! - [ ] __ssub8
+//! - [ ] __uadd8
+//! - [ ] __uhadd8
+//! - [ ] __uhsub8
+//! - [ ] __uqadd8
+//! - [ ] __uqsub8
+//! - [ ] __usub8
+//! - [x] __usad8
+//! - [x] __usada8
+//! - [x] __qadd16
+//! - [x] __qasx
+//! - [x] __qsax
+//! - [x] __qsub16
+//! - [x] __sadd16
+//! - [x] __sasx
+//! - [x] __shadd16
+//! - [ ] __shasx
+//! - [ ] __shsax
+//! - [x] __shsub16
+//! - [ ] __ssax
+//! - [ ] __ssub16
+//! - [ ] __uadd16
+//! - [ ] __uasx
+//! - [ ] __uhadd16
+//! - [ ] __uhasx
+//! - [ ] __uhsax
+//! - [ ] __uhsub16
+//! - [ ] __uqadd16
+//! - [ ] __uqasx
+//! - [x] __uqsax
+//! - [ ] __uqsub16
+//! - [ ] __usax
+//! - [ ] __usub16
+//! - [x] __smlad
+//! - [ ] __smladx
+//! - [ ] __smlald
+//! - [ ] __smlaldx
+//! - [x] __smlsd
+//! - [ ] __smlsdx
+//! - [ ] __smlsld
+//! - [ ] __smlsldx
+//! - [x] __smuad
+//! - [x] __smuadx
+//! - [x] __smusd
+//! - [x] __smusdx
+
+types! {
+    /// ARM-specific 32-bit wide vector of four packed `i8`.
+    pub struct int8x4_t(i8, i8, i8, i8);
+    /// ARM-specific 32-bit wide vector of four packed `u8`.
+    pub struct uint8x4_t(u8, u8, u8, u8);
+    /// ARM-specific 32-bit wide vector of two packed `i16`.
+    pub struct int16x2_t(i16, i16);
+    /// ARM-specific 32-bit wide vector of two packed `u16`.
+    pub struct uint16x2_t(u16, u16);
+}
+
+macro_rules! dsp_call {
+    ($name:expr, $a:expr, $b:expr) => {
+        ::mem::transmute($name(::mem::transmute($a), ::mem::transmute($b)))
+    };
+}
+
+extern "C" {
+    #[link_name = "llvm.arm.qadd8"]
+    fn arm_qadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub8"]
+    fn arm_qsub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsub16"]
+    fn arm_qsub16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qadd16"]
+    fn arm_qadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qasx"]
+    fn arm_qasx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.qsax"]
+    fn arm_qsax(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sadd16"]
+    fn arm_sadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sadd8"]
+    fn arm_sadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smlad"]
+    fn arm_smlad(a: i32, b: i32, c: i32) -> i32;
+
+    #[link_name = "llvm.arm.smlsd"]
+    fn arm_smlsd(a: i32, b: i32, c: i32) -> i32;
+
+    #[link_name = "llvm.arm.sasx"]
+    fn arm_sasx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.sel"]
+    fn arm_sel(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shadd8"]
+    fn arm_shadd8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shadd16"]
+    fn arm_shadd16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shsub8"]
+    fn arm_shsub8(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.shsub16"]
+    fn arm_shsub16(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smuad"]
+    fn arm_smuad(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smuadx"]
+    fn arm_smuadx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smusd"]
+    fn arm_smusd(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.smusdx"]
+    fn arm_smusdx(a: i32, b: i32) -> i32;
+
+    #[link_name = "llvm.arm.usad8"]
+    fn arm_usad8(a: i32, b: i32) -> u32;
+}
+
+/// Saturating four 8-bit integer additions
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd8))]
+pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_qadd8, a, b)
+}
+
+/// Saturating four 8-bit integer subtractions
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+/// res\[2\] = a\[2\] - b\[2\]
+/// res\[3\] = a\[3\] - b\[3\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub8))]
+pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_qsub8, a, b)
+}
+
+/// Saturating two 16-bit integer subtractions
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[0\]
+/// res\[1\] = a\[1\] - b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsub16))]
+pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qsub16, a, b)
+}
+
+/// Saturating two 16-bit integer additions
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+#[inline]
+#[cfg_attr(test, assert_instr(qadd16))]
+pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qadd16, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qasx))]
+pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qasx, a, b)
+}
+
+/// Returns the 16-bit signed saturated equivalent of
+///
+/// res\[0\] = a\[0\] + b\[1\]
+/// res\[1\] = a\[1\] - b\[0\]
+#[inline]
+#[cfg_attr(test, assert_instr(qsax))]
+pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_qsax, a, b)
+}
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd16))]
+pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_sadd16, a, b)
+}
+
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] + b\[0\]
+/// res\[1\] = a\[1\] + b\[1\]
+/// res\[2\] = a\[2\] + b\[2\]
+/// res\[3\] = a\[3\] + b\[3\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sadd8))]
+pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_sadd8, a, b)
+}
+
+/// Dual 16-bit Signed Multiply with Addition of products
+/// and 32-bit accumulation.
+///
+/// Returns the 16-bit signed equivalent of
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlad))]
+pub unsafe fn smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+    arm_smlad(::mem::transmute(a), ::mem::transmute(b), c)
+}
+
+/// Dual 16-bit Signed Multiply with Subtraction of products
+/// and 32-bit accumulation and overflow detection.
+///
+/// Returns the 16-bit signed equivalent of
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
+#[inline]
+#[cfg_attr(test, assert_instr(smlsd))]
+pub unsafe fn smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+    arm_smlsd(::mem::transmute(a), ::mem::transmute(b), c)
+}
+
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = a\[0\] - b\[1\]
+/// res\[1\] = a\[1\] + b\[0\]
+///
+/// and the GE bits of the APSR are set.
+#[inline]
+#[cfg_attr(test, assert_instr(sasx))]
+pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_sasx, a, b)
+}
+
+/// Select bytes from each operand according to APSR GE flags
+///
+/// Returns the equivalent of
+///
+/// res\[0\] = GE\[0\] ? a\[0\] : b\[0\]
+/// res\[1\] = GE\[1\] ? a\[1\] : b\[1\]
+/// res\[2\] = GE\[2\] ? a\[2\] : b\[2\]
+/// res\[3\] = GE\[3\] ? a\[3\] : b\[3\]
+///
+/// where GE are bits of APSR
+#[inline]
+#[cfg_attr(test, assert_instr(sel))]
+pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_sel, a, b)
+}
+
+/// Signed halving parallel byte-wise addition.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+/// res\[2\] = (a\[2\] + b\[2\]) / 2
+/// res\[3\] = (a\[3\] + b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd8))]
+pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_shadd8, a, b)
+}
+
+/// Signed halving parallel halfword-wise addition.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] + b\[0\]) / 2
+/// res\[1\] = (a\[1\] + b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shadd16))]
+pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_shadd16, a, b)
+}
+
+/// Signed halving parallel byte-wise subtraction.
+///
+/// Returns the 8-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+/// res\[2\] = (a\[2\] - b\[2\]) / 2
+/// res\[3\] = (a\[3\] - b\[3\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub8))]
+pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+    dsp_call!(arm_shsub8, a, b)
+}
+
+/// Signed halving parallel halfword-wise subtraction.
+///
+/// Returns the 16-bit signed equivalent of
+///
+/// res\[0\] = (a\[0\] - b\[0\]) / 2
+/// res\[1\] = (a\[1\] - b\[1\]) / 2
+#[inline]
+#[cfg_attr(test, assert_instr(shsub16))]
+pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+    dsp_call!(arm_shsub16, a, b)
+}
+
+/// Signed Dual Multiply Add.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuad))]
+pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smuad(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Add Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
+///
+/// and sets the Q flag if overflow occurs on the addition.
+#[inline]
+#[cfg_attr(test, assert_instr(smuadx))]
+pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smuadx(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Subtract.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
+///
+/// The difference of the two products cannot overflow, so the Q flag is unaffected.
+#[inline]
+#[cfg_attr(test, assert_instr(smusd))]
+pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smusd(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Signed Dual Multiply Subtract Reversed.
+///
+/// Returns the equivalent of
+///
+/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
+///
+/// The difference of the two products cannot overflow, so the Q flag is unaffected.
+#[inline]
+#[cfg_attr(test, assert_instr(smusdx))]
+pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
+    arm_smusdx(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Sum of 8-bit absolute differences.
+///
+/// Returns the 8-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+///       abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\])
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn usad8(a: int8x4_t, b: int8x4_t) -> u32 {
+    arm_usad8(::mem::transmute(a), ::mem::transmute(b))
+}
+
+/// Sum of 8-bit absolute differences and constant.
+///
+/// Returns the 8-bit unsigned equivalent of
+///
+/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
+///       abs(a\[2\] - b\[2\]) + abs(a\[3\] - b\[3\]) + c
+#[inline]
+#[cfg_attr(test, assert_instr(usad8))]
+pub unsafe fn usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
+    usad8(a, b) + c
+}
+
+#[cfg(test)]
+mod tests {
+    use core_arch::arm::*;
+    use core_arch::simd::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    #[test]
+    fn qadd() {
+        unsafe {
+            assert_eq!(dsp::qadd(-10, 60), 50);
+            assert_eq!(dsp::qadd(::std::i32::MAX, 10), ::std::i32::MAX);
+            assert_eq!(dsp::qadd(::std::i32::MIN, -10), ::std::i32::MIN);
+        }
+    }
+
+    #[test]
+    fn qsub() {
+        unsafe {
+            assert_eq!(dsp::qsub(10, 60), -50);
+            assert_eq!(dsp::qsub(::std::i32::MAX, -10), ::std::i32::MAX);
+            assert_eq!(dsp::qsub(::std::i32::MIN, 10), ::std::i32::MIN);
+        }
+    }
+
+    #[test]
+    fn qadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(2, -1, 0, 1);
+            let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
+            let r: i8x4 = dsp_call!(dsp::qadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsub8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
+            let b = i8x4::new(2, -1, 0, 1);
+            let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
+            let r: i8x4 = dsp_call!(dsp::qsub8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qadd16() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(2, -1);
+            let c = i16x2::new(3, 1);
+            let r: i16x2 = dsp_call!(dsp::qadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsub16() {
+        unsafe {
+            let a = i16x2::new(10, 20);
+            let b = i16x2::new(20, -10);
+            let c = i16x2::new(-10, 30);
+            let r: i16x2 = dsp_call!(dsp::qsub16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qasx() {
+        unsafe {
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(-1, ::std::i16::MAX);
+            let r: i16x2 = dsp_call!(dsp::qasx, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn qsax() {
+        unsafe {
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(3, ::std::i16::MAX - 2);
+            let r: i16x2 = dsp_call!(dsp::qsax, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sadd16() {
+        unsafe {
+            let a = i16x2::new(1, ::std::i16::MAX);
+            let b = i16x2::new(2, 2);
+            let c = i16x2::new(3, -::std::i16::MAX);
+            let r: i16x2 = dsp_call!(dsp::sadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(4, 3, 2, 2);
+            let c = i8x4::new(5, 5, 5, -::std::i8::MAX);
+            let r: i8x4 = dsp_call!(dsp::sadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn sasx() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(2, 1);
+            let c = i16x2::new(0, 4);
+            let r: i16x2 = dsp_call!(dsp::sasx, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn smlad() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(3, 4);
+            let r = dsp::smlad(::mem::transmute(a), ::mem::transmute(b), 10);
+            assert_eq!(r, (1 * 3) + (2 * 4) + 10);
+        }
+    }
+
+    #[test]
+    fn smlsd() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(3, 4);
+            let r = dsp::smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
+            assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
+        }
+    }
+
+    #[test]
+    fn sel() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let b = i8x4::new(4, 3, 2, 2);
+            // call sadd8() to set GE bits
+            dsp::sadd8(::mem::transmute(a), ::mem::transmute(b));
+            let c = i8x4::new(1, 2, 3, ::std::i8::MAX);
+            let r: i8x4 = dsp_call!(dsp::sel, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shadd8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(5, 4, 3, 2);
+            let c = i8x4::new(3, 3, 3, 3);
+            let r: i8x4 = dsp_call!(dsp::shadd8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shadd16() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let c = i16x2::new(3, 3);
+            let r: i16x2 = dsp_call!(dsp::shadd16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shsub8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(5, 4, 3, 2);
+            let c = i8x4::new(-2, -1, 0, 1);
+            let r: i8x4 = dsp_call!(dsp::shsub8, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn shsub16() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let c = i16x2::new(-2, -1);
+            let r: i16x2 = dsp_call!(dsp::shsub16, a, b);
+            assert_eq!(r, c);
+        }
+    }
+
+    #[test]
+    fn smuad() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smuad(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, 13);
+        }
+    }
+
+    #[test]
+    fn smuadx() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smuadx(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, 14);
+        }
+    }
+
+    #[test]
+    fn smusd() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smusd(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, -3);
+        }
+    }
+
+    #[test]
+    fn smusdx() {
+        unsafe {
+            let a = i16x2::new(1, 2);
+            let b = i16x2::new(5, 4);
+            let r = dsp::smusdx(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, -6);
+        }
+    }
+
+    #[test]
+    fn usad8() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(4, 3, 2, 1);
+            let r = dsp::usad8(::mem::transmute(a), ::mem::transmute(b));
+            assert_eq!(r, 8);
+        }
+    }
+
+    #[test]
+    fn usad8a() {
+        unsafe {
+            let a = i8x4::new(1, 2, 3, 4);
+            let b = i8x4::new(4, 3, 2, 1);
+            let c = 10;
+            let r = dsp::usad8a(::mem::transmute(a), ::mem::transmute(b), c);
+            assert_eq!(r, 8 + c);
+        }
+    }
+}
diff --git a/crates/core_arch/src/arm/dsp.rs b/crates/core_arch/src/arm/dsp.rs
deleted file mode 100644
index 8385e7ed21..0000000000
--- a/crates/core_arch/src/arm/dsp.rs
+++ /dev/null
@@ -1,654 +0,0 @@
-//! ARM DSP Intrinsics.
-//!
-//! Based on "Arm C Language Extensions (ACLE) Version Q2 2018"
-//!
-//! https://developer.arm.com/products/software-development-tools/compilers/arm-compiler-5/docs/101028/0006
-
-#[cfg(test)]
-use stdsimd_test::assert_instr;
-
-types! {
-    /// ARM-specific 32-bit wide vector of four packed `i8`.
-    pub struct int8x4_t(i8, i8, i8, i8);
-    /// ARM-specific 32-bit wide vector of four packed `u8`.
-    pub struct uint8x4_t(u8, u8, u8, u8);
-    /// ARM-specific 32-bit wide vector of two packed `i16`.
-    pub struct int16x2_t(i16, i16);
-    /// ARM-specific 32-bit wide vector of two packed `u16`.
-    pub struct uint16x2_t(u16, u16);
-}
-
-macro_rules! dsp_call {
-    ($name:expr, $a:expr, $b:expr) => {
-        ::mem::transmute($name(::mem::transmute($a), ::mem::transmute($b)))
-    };
-}
-
-extern "C" {
-    #[link_name = "llvm.arm.qadd"]
-    fn arm_qadd(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.qadd16"]
-    fn arm_qadd16(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.qadd8"]
-    fn arm_qadd8(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.qasx"]
-    fn arm_qasx(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.qsax"]
-    fn arm_qsax(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.qsub"]
-    fn arm_qsub(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.qsub8"]
-    fn arm_qsub8(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.qsub16"]
-    fn arm_qsub16(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.sadd16"]
-    fn arm_sadd16(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.sadd8"]
-    fn arm_sadd8(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.sasx"]
-    fn arm_sasx(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.smlad"]
-    fn arm_smlad(a: i32, b: i32, c: i32) -> i32;
-
-    #[link_name = "llvm.arm.smlsd"]
-    fn arm_smlsd(a: i32, b: i32, c: i32) -> i32;
-
-    #[link_name = "llvm.arm.sel"]
-    fn arm_sel(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.shadd8"]
-    fn arm_shadd8(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.shadd16"]
-    fn arm_shadd16(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.shsub8"]
-    fn arm_shsub8(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.shsub16"]
-    fn arm_shsub16(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.smuad"]
-    fn arm_smuad(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.smuadx"]
-    fn arm_smuadx(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.smusd"]
-    fn arm_smusd(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.smusdx"]
-    fn arm_smusdx(a: i32, b: i32) -> i32;
-
-    #[link_name = "llvm.arm.usad8"]
-    fn arm_usad8(a: i32, b: i32) -> u32;
-}
-
-/// Signed saturating addition
-///
-/// Returns the 32-bit saturating signed equivalent of a + b.
-#[inline]
-#[cfg_attr(test, assert_instr(qadd))]
-pub unsafe fn qadd(a: i32, b: i32) -> i32 {
-    arm_qadd(a, b)
-}
-
-/// Signed saturating subtraction
-///
-/// Returns the 32-bit saturating signed equivalent of a - b.
-#[inline]
-#[cfg_attr(test, assert_instr(qsub))]
-pub unsafe fn qsub(a: i32, b: i32) -> i32 {
-    arm_qsub(a, b)
-}
-
-/// Saturating four 8-bit integer additions
-///
-/// Returns the 8-bit signed equivalent of
-///
-/// res\[0\] = a\[0\] + b\[0\]
-/// res\[1\] = a\[1\] + b\[1\]
-/// res\[2\] = a\[2\] + b\[2\]
-/// res\[3\] = a\[3\] + b\[3\]
-#[inline]
-#[cfg_attr(test, assert_instr(qadd8))]
-pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    dsp_call!(arm_qadd8, a, b)
-}
-
-/// Saturating two 8-bit integer subtraction
-///
-/// Returns the 8-bit signed equivalent of
-///
-/// res\[0\] = a\[0\] - b\[0\]
-/// res\[1\] = a\[1\] - b\[1\]
-/// res\[2\] = a\[2\] - b\[2\]
-/// res\[3\] = a\[3\] - b\[3\]
-#[inline]
-#[cfg_attr(test, assert_instr(qsub8))]
-pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    dsp_call!(arm_qsub8, a, b)
-}
-
-/// Saturating two 16-bit integer subtraction
-///
-/// Returns the 16-bit signed equivalent of
-///
-/// res\[0\] = a\[0\] - b\[0\]
-/// res\[1\] = a\[1\] - b\[1\]
-#[inline]
-#[cfg_attr(test, assert_instr(qsub16))]
-pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    dsp_call!(arm_qsub16, a, b)
-}
-
-/// Saturating two 16-bit integer additions
-///
-/// Returns the 16-bit signed equivalent of
-///
-/// res\[0\] = a\[0\] + b\[0\]
-/// res\[1\] = a\[1\] + b\[1\]
-#[inline]
-#[cfg_attr(test, assert_instr(qadd16))]
-pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    dsp_call!(arm_qadd16, a, b)
-}
-
-/// Returns the 16-bit signed saturated equivalent of
-///
-/// res\[0\] = a\[0\] - b\[1\]
-/// res\[1\] = a\[1\] + b\[0\]
-#[inline]
-#[cfg_attr(test, assert_instr(qasx))]
-pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    dsp_call!(arm_qasx, a, b)
-}
-
-/// Returns the 16-bit signed saturated equivalent of
-///
-/// res\[0\] = a\[0\] + b\[1\]
-/// res\[1\] = a\[1\] - b\[0\]
-#[inline]
-#[cfg_attr(test, assert_instr(qsax))]
-pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    dsp_call!(arm_qsax, a, b)
-}
-
-/// Returns the 16-bit signed saturated equivalent of
-///
-/// res\[0\] = a\[0\] + b\[1\]
-/// res\[1\] = a\[1\] + b\[0\]
-///
-/// and the GE bits of the APSR are set.
-#[inline]
-#[cfg_attr(test, assert_instr(sadd16))]
-pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    dsp_call!(arm_sadd16, a, b)
-}
-
-/// Returns the 8-bit signed saturated equivalent of
-///
-/// res\[0\] = a\[0\] + b\[1\]
-/// res\[1\] = a\[1\] + b\[0\]
-/// res\[2\] = a\[2\] + b\[2\]
-/// res\[3\] = a\[3\] + b\[3\]
-///
-/// and the GE bits of the APSR are set.
-#[inline]
-#[cfg_attr(test, assert_instr(sadd8))]
-pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    dsp_call!(arm_sadd8, a, b)
-}
-
-/// Dual 16-bit Signed Multiply with Addition of products
-/// and 32-bit accumulation.
-///
-/// Returns the 16-bit signed equivalent of
-/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
-#[inline]
-#[cfg_attr(test, assert_instr(smlad))]
-pub unsafe fn smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
-    arm_smlad(::mem::transmute(a), ::mem::transmute(b), c)
-}
-
-/// Dual 16-bit Signed Multiply with Subtraction  of products
-/// and 32-bit accumulation and overflow detection.
-///
-/// Returns the 16-bit signed equivalent of
-/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
-#[inline]
-#[cfg_attr(test, assert_instr(smlsd))]
-pub unsafe fn smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
-    arm_smlsd(::mem::transmute(a), ::mem::transmute(b), c)
-}
-
-/// Returns the 16-bit signed equivalent of
-///
-/// res\[0\] = a\[0\] - b\[1\]
-/// res\[1\] = a\[1\] + b\[0\]
-///
-/// and the GE bits of the APSR are set.
-#[inline]
-#[cfg_attr(test, assert_instr(sasx))]
-pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    dsp_call!(arm_sasx, a, b)
-}
-
-/// Select bytes from each operand according to APSR GE flags
-///
-/// Returns the equivalent of
-///
-/// res\[0\] = GE\[0\] ? a\[0\] : b\[0\]
-/// res\[1\] = GE\[1\] ? a\[1\] : b\[1\]
-/// res\[2\] = GE\[2\] ? a\[2\] : b\[2\]
-/// res\[3\] = GE\[3\] ? a\[3\] : b\[3\]
-///
-/// where GE are bits of APSR
-#[inline]
-#[cfg_attr(test, assert_instr(sel))]
-#[cfg(all(not(target_feature = "mclass")))]
-pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    dsp_call!(arm_sel, a, b)
-}
-
-/// Signed halving parallel byte-wise addition.
-///
-/// Returns the 8-bit signed equivalent of
-///
-/// res\[0\] = (a\[0\] + b\[0\]) / 2
-/// res\[1\] = (a\[1\] + b\[1\]) / 2
-/// res\[2\] = (a\[2\] + b\[2\]) / 2
-/// res\[3\] = (a\[3\] + b\[3\]) / 2
-#[inline]
-#[cfg_attr(test, assert_instr(shadd8))]
-pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    dsp_call!(arm_shadd8, a, b)
-}
-
-/// Signed halving parallel halfword-wise addition.
-///
-/// Returns the 16-bit signed equivalent of
-///
-/// res\[0\] = (a\[0\] + b\[0\]) / 2
-/// res\[1\] = (a\[1\] + b\[1\]) / 2
-#[inline]
-#[cfg_attr(test, assert_instr(shadd16))]
-pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    dsp_call!(arm_shadd16, a, b)
-}
-
-/// Signed halving parallel byte-wise subtraction.
-///
-/// Returns the 8-bit signed equivalent of
-///
-/// res\[0\] = (a\[0\] - b\[0\]) / 2
-/// res\[1\] = (a\[1\] - b\[1\]) / 2
-/// res\[2\] = (a\[2\] - b\[2\]) / 2
-/// res\[3\] = (a\[3\] - b\[3\]) / 2
-#[inline]
-#[cfg_attr(test, assert_instr(shsub8))]
-pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
-    dsp_call!(arm_shsub8, a, b)
-}
-
-/// Signed halving parallel halfword-wise subtraction.
-///
-/// Returns the 16-bit signed equivalent of
-///
-/// res\[0\] = (a\[0\] - b\[0\]) / 2
-/// res\[1\] = (a\[1\] - b\[1\]) / 2
-#[inline]
-#[cfg_attr(test, assert_instr(shsub16))]
-pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
-    dsp_call!(arm_shsub16, a, b)
-}
-
-/// Signed Dual Multiply Add.
-///
-/// Returns the equivalent of
-///
-/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\]
-///
-/// and sets the Q flag if overflow occurs on the addition.
-#[inline]
-#[cfg_attr(test, assert_instr(smuad))]
-pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
-    arm_smuad(::mem::transmute(a), ::mem::transmute(b))
-}
-
-/// Signed Dual Multiply Add Reversed.
-///
-/// Returns the equivalent of
-///
-/// res = a\[0\] * b\[1\] + a\[1\] * b\[0\]
-///
-/// and sets the Q flag if overflow occurs on the addition.
-#[inline]
-#[cfg_attr(test, assert_instr(smuadx))]
-pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
-    arm_smuadx(::mem::transmute(a), ::mem::transmute(b))
-}
-
-/// Signed Dual Multiply Subtract.
-///
-/// Returns the equivalent of
-///
-/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\]
-///
-/// and sets the Q flag if overflow occurs on the addition.
-#[inline]
-#[cfg_attr(test, assert_instr(smusd))]
-pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
-    arm_smusd(::mem::transmute(a), ::mem::transmute(b))
-}
-
-/// Signed Dual Multiply Subtract Reversed.
-///
-/// Returns the equivalent of
-///
-/// res = a\[0\] * b\[1\] - a\[1\] * b\[0\]
-///
-/// and sets the Q flag if overflow occurs on the addition.
-#[inline]
-#[cfg_attr(test, assert_instr(smusdx))]
-pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
-    arm_smusdx(::mem::transmute(a), ::mem::transmute(b))
-}
-
-/// Sum of 8-bit absolute differences.
-///
-/// Returns the 8-bit unsigned equivalent of
-///
-/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
-///          (a\[2\] - b\[2\]) + (a\[3\] - b\[3\])
-#[inline]
-#[cfg_attr(test, assert_instr(usad8))]
-pub unsafe fn usad8(a: int8x4_t, b: int8x4_t) -> u32 {
-    arm_usad8(::mem::transmute(a), ::mem::transmute(b))
-}
-
-/// Sum of 8-bit absolute differences and constant.
-///
-/// Returns the 8-bit unsigned equivalent of
-///
-/// res = abs(a\[0\] - b\[0\]) + abs(a\[1\] - b\[1\]) +\
-///          (a\[2\] - b\[2\]) + (a\[3\] - b\[3\]) + c
-#[inline]
-#[cfg_attr(test, assert_instr(usad8))]
-pub unsafe fn usad8a(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
-    usad8(a, b) + c
-}
-
-#[cfg(test)]
-mod tests {
-    use core_arch::arm::*;
-    use core_arch::simd::*;
-    use std::mem;
-    use stdsimd_test::simd_test;
-
-    #[test]
-    fn qadd() {
-        unsafe {
-            assert_eq!(dsp::qadd(-10, 60), 50);
-            assert_eq!(dsp::qadd(::std::i32::MAX, 10), ::std::i32::MAX);
-            assert_eq!(dsp::qadd(::std::i32::MIN, -10), ::std::i32::MIN);
-        }
-    }
-
-    #[test]
-    fn qsub() {
-        unsafe {
-            assert_eq!(dsp::qsub(10, 60), -50);
-            assert_eq!(dsp::qsub(::std::i32::MAX, -10), ::std::i32::MAX);
-            assert_eq!(dsp::qsub(::std::i32::MIN, 10), ::std::i32::MIN);
-        }
-    }
-
-    #[test]
-    fn qadd8() {
-        unsafe {
-            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
-            let b = i8x4::new(2, -1, 0, 1);
-            let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::qadd8, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn qsub8() {
-        unsafe {
-            let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
-            let b = i8x4::new(2, -1, 0, 1);
-            let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
-            let r: i8x4 = dsp_call!(dsp::qsub8, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn qadd16() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(2, -1);
-            let c = i16x2::new(3, 1);
-            let r: i16x2 = dsp_call!(dsp::qadd16, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn qsub16() {
-        unsafe {
-            let a = i16x2::new(10, 20);
-            let b = i16x2::new(20, -10);
-            let c = i16x2::new(-10, 30);
-            let r: i16x2 = dsp_call!(dsp::qsub16, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn qasx() {
-        unsafe {
-            let a = i16x2::new(1, ::std::i16::MAX);
-            let b = i16x2::new(2, 2);
-            let c = i16x2::new(-1, ::std::i16::MAX);
-            let r: i16x2 = dsp_call!(dsp::qasx, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn qsax() {
-        unsafe {
-            let a = i16x2::new(1, ::std::i16::MAX);
-            let b = i16x2::new(2, 2);
-            let c = i16x2::new(3, ::std::i16::MAX - 2);
-            let r: i16x2 = dsp_call!(dsp::qsax, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn sadd16() {
-        unsafe {
-            let a = i16x2::new(1, ::std::i16::MAX);
-            let b = i16x2::new(2, 2);
-            let c = i16x2::new(3, -::std::i16::MAX);
-            let r: i16x2 = dsp_call!(dsp::sadd16, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn sadd8() {
-        unsafe {
-            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
-            let b = i8x4::new(4, 3, 2, 2);
-            let c = i8x4::new(5, 5, 5, -::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::sadd8, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn sasx() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(2, 1);
-            let c = i16x2::new(0, 4);
-            let r: i16x2 = dsp_call!(dsp::sasx, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn smlad() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(3, 4);
-            let r = dsp::smlad(::mem::transmute(a), ::mem::transmute(b), 10);
-            assert_eq!(r, (1 * 3) + (2 * 4) + 10);
-        }
-    }
-
-    #[test]
-    fn smlsd() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(3, 4);
-            let r = dsp::smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
-            assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
-        }
-    }
-
-    #[test]
-    fn sel() {
-        unsafe {
-            let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
-            let b = i8x4::new(4, 3, 2, 2);
-            // call sadd8() to set GE bits
-            dsp::sadd8(::mem::transmute(a), ::mem::transmute(b));
-            let c = i8x4::new(1, 2, 3, ::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::sel, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn shadd8() {
-        unsafe {
-            let a = i8x4::new(1, 2, 3, 4);
-            let b = i8x4::new(5, 4, 3, 2);
-            let c = i8x4::new(3, 3, 3, 3);
-            let r: i8x4 = dsp_call!(dsp::shadd8, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn shadd16() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(5, 4);
-            let c = i16x2::new(3, 3);
-            let r: i16x2 = dsp_call!(dsp::shadd16, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn shsub8() {
-        unsafe {
-            let a = i8x4::new(1, 2, 3, 4);
-            let b = i8x4::new(5, 4, 3, 2);
-            let c = i8x4::new(-2, -1, 0, 1);
-            let r: i8x4 = dsp_call!(dsp::shsub8, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn shsub16() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(5, 4);
-            let c = i16x2::new(-2, -1);
-            let r: i16x2 = dsp_call!(dsp::shsub16, a, b);
-            assert_eq!(r, c);
-        }
-    }
-
-    #[test]
-    fn smuad() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(5, 4);
-            let r = dsp::smuad(::mem::transmute(a), ::mem::transmute(b));
-            assert_eq!(r, 13);
-        }
-    }
-
-    #[test]
-    fn smuadx() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(5, 4);
-            let r = dsp::smuadx(::mem::transmute(a), ::mem::transmute(b));
-            assert_eq!(r, 14);
-        }
-    }
-
-    #[test]
-    fn smusd() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(5, 4);
-            let r = dsp::smusd(::mem::transmute(a), ::mem::transmute(b));
-            assert_eq!(r, -3);
-        }
-    }
-
-    #[test]
-    fn smusdx() {
-        unsafe {
-            let a = i16x2::new(1, 2);
-            let b = i16x2::new(5, 4);
-            let r = dsp::smusdx(::mem::transmute(a), ::mem::transmute(b));
-            assert_eq!(r, -6);
-        }
-    }
-
-    #[test]
-    fn usad8() {
-        unsafe {
-            let a = i8x4::new(1, 2, 3, 4);
-            let b = i8x4::new(4, 3, 2, 1);
-            let r = dsp::usad8(::mem::transmute(a), ::mem::transmute(b));
-            assert_eq!(r, 8);
-        }
-    }
-
-    #[test]
-    fn usad8a() {
-        unsafe {
-            let a = i8x4::new(1, 2, 3, 4);
-            let b = i8x4::new(4, 3, 2, 1);
-            let c = 10;
-            let r = dsp::usad8a(::mem::transmute(a), ::mem::transmute(b), c);
-            assert_eq!(r, 8 + c);
-        }
-    }
-}
diff --git a/crates/core_arch/src/arm/mod.rs b/crates/core_arch/src/arm/mod.rs
index dd69b11457..e5b40c9bc7 100644
--- a/crates/core_arch/src/arm/mod.rs
+++ b/crates/core_arch/src/arm/mod.rs
@@ -19,12 +19,6 @@ mod v7;
 #[cfg(any(target_arch = "aarch64", target_feature = "v7"))]
 pub use self::v7::*;
 
-// TODO move into the `acle::{dsp,simd32}` modules
-#[cfg(any(all(target_feature = "v7", not(target_feature = "mclass")), dox))]
-mod dsp;
-#[cfg(any(all(target_feature = "v7", not(target_feature = "mclass")), dox))]
-pub use self::dsp::*;
-
 // NEON is supported on AArch64, and on ARM when built with the v7 and neon
 // features. Building ARM without neon produces incorrect codegen.
 #[cfg(any(

From 4cb6d8c4cbb35eb64b6b75ba1041a6b2703b7de4 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Fri, 15 Feb 2019 14:47:49 +0100
Subject: [PATCH 21/31] acle/ex: fix raw pointer mutability

---
 crates/core_arch/src/acle/ex.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crates/core_arch/src/acle/ex.rs b/crates/core_arch/src/acle/ex.rs
index c25e0dc37a..c3ebe0bfad 100644
--- a/crates/core_arch/src/acle/ex.rs
+++ b/crates/core_arch/src/acle/ex.rs
@@ -70,10 +70,10 @@ pub unsafe fn __ldrex(p: *const u32) -> u32 {
 #[cfg(
     target_feature = "v6k", // includes v7-M but excludes v6-M
 )]
-pub unsafe fn __strexb(value: u32, addr: *const u8) -> u32 {
+pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 {
     extern "C" {
         #[link_name = "llvm.arm.strex.p0i8"]
-        fn strex8(value: u32, addr: *const u8) -> u32;
+        fn strex8(value: u32, addr: *mut u8) -> u32;
     }
 
     strex8(value, addr)
@@ -87,10 +87,10 @@ pub unsafe fn __strexb(value: u32, addr: *const u8) -> u32 {
 #[cfg(
     target_feature = "v6k", // includes v7-M but excludes v6-M
 )]
-pub unsafe fn __strexh(value: u16, addr: *const u16) -> u32 {
+pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 {
     extern "C" {
         #[link_name = "llvm.arm.strex.p0i16"]
-        fn strex16(value: u32, addr: *const u16) -> u32;
+        fn strex16(value: u32, addr: *mut u16) -> u32;
     }
 
     strex16(value as u32, addr)
@@ -105,10 +105,10 @@ pub unsafe fn __strexh(value: u16, addr: *const u16) -> u32 {
     all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
     all(target_feature = "v7", target_feature = "mclass"), // v7-M
 ))]
-pub unsafe fn __strex(value: u32, addr: *const u32) -> u32 {
+pub unsafe fn __strex(value: u32, addr: *mut u32) -> u32 {
     extern "C" {
         #[link_name = "llvm.arm.strex.p0i32"]
-        fn strex32(value: u32, addr: *const u32) -> u32;
+        fn strex32(value: u32, addr: *mut u32) -> u32;
     }
 
     strex32(value, addr)

From f73e69fd4709fb8f57790e4e0f06e4b0fc410afd Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Fri, 15 Feb 2019 14:50:09 +0100
Subject: [PATCH 22/31] cargo fmt

---
 crates/core_arch/src/acle/barrier/mod.rs | 10 ++--------
 crates/core_arch/src/acle/mod.rs         | 10 ++--------
 2 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/crates/core_arch/src/acle/barrier/mod.rs b/crates/core_arch/src/acle/barrier/mod.rs
index 61686895f0..47ca55cc15 100644
--- a/crates/core_arch/src/acle/barrier/mod.rs
+++ b/crates/core_arch/src/acle/barrier/mod.rs
@@ -51,16 +51,10 @@ mod common;
 ))]
 pub use self::common::*;
 
-#[cfg(any(
-    target_arch = "aarch64",
-    target_feature = "v7",
-))]
+#[cfg(any(target_arch = "aarch64", target_feature = "v7",))]
 mod not_mclass;
 
-#[cfg(any(
-    target_arch = "aarch64",
-    target_feature = "v7",
-))]
+#[cfg(any(target_arch = "aarch64", target_feature = "v7",))]
 pub use self::not_mclass::*;
 
 #[cfg(target_arch = "aarch64")]
diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
index 1354fe9fae..5f29decf5a 100644
--- a/crates/core_arch/src/acle/mod.rs
+++ b/crates/core_arch/src/acle/mod.rs
@@ -91,16 +91,10 @@ mod dsp;
 pub use self::dsp::*;
 
 // Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT)
-#[cfg(all(
-    not(target_arch = "aarch64"),
-    target_feature = "v6",
-))]
+#[cfg(all(not(target_arch = "aarch64"), target_feature = "v6",))]
 mod sat;
 
-#[cfg(all(
-    not(target_arch = "aarch64"),
-    target_feature = "v6",
-))]
+#[cfg(all(not(target_arch = "aarch64"), target_feature = "v6",))]
 pub use self::sat::*;
 
 // Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says

From c2e2edd4c4345459dfdfeda3c55707a8688c950e Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Fri, 15 Feb 2019 14:53:37 +0100
Subject: [PATCH 23/31] add missing imports

---
 crates/core_arch/src/acle/dsp.rs    | 3 +++
 crates/core_arch/src/acle/simd32.rs | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/crates/core_arch/src/acle/dsp.rs b/crates/core_arch/src/acle/dsp.rs
index 3a71f2c469..b021a68042 100644
--- a/crates/core_arch/src/acle/dsp.rs
+++ b/crates/core_arch/src/acle/dsp.rs
@@ -20,6 +20,9 @@
 //! - [ ] __smlawb
 //! - [ ] __smlawt
 
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
 extern "C" {
     #[link_name = "llvm.arm.qadd"]
     fn arm_qadd(a: i32, b: i32) -> i32;
diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/acle/simd32.rs
index 420ce2b2d0..13eda17697 100644
--- a/crates/core_arch/src/acle/simd32.rs
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -62,6 +62,9 @@
 //! - [x] __smusd
 //! - [x] __smusdx
 
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
 types! {
     /// ARM-specific 32-bit wide vector of four packed `i8`.
     pub struct int8x4_t(i8, i8, i8, i8);

From f8ecff97b3eb9deefc4ad28266c0157962e0c085 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Fri, 15 Feb 2019 14:55:35 +0100
Subject: [PATCH 24/31] conditionally declare the dmb_dsb macro

---
 crates/core_arch/src/acle/barrier/mod.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/crates/core_arch/src/acle/barrier/mod.rs b/crates/core_arch/src/acle/barrier/mod.rs
index 47ca55cc15..b3cbf44d27 100644
--- a/crates/core_arch/src/acle/barrier/mod.rs
+++ b/crates/core_arch/src/acle/barrier/mod.rs
@@ -19,6 +19,11 @@ mod cp15;
 pub use self::cp15::*;
 
 // Dedicated instructions
+#[cfg(any(
+    target_arch = "aarch64",
+    target_feature = "v7",
+    target_feature = "mclass"
+))]
 macro_rules! dmb_dsb {
     ($A:ident) => {
         impl super::super::sealed::Dmb for $A {

From b4836bf5df6e0c2b62c13800b7a0e2e27e3cb817 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 18 Feb 2019 13:14:14 +0100
Subject: [PATCH 25/31] acle/{dsp,simd32}: add leading underscores to match
 ACLE spec

---
 crates/core_arch/src/acle/dsp.rs    |  4 +--
 crates/core_arch/src/acle/simd32.rs | 44 ++++++++++++++---------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/crates/core_arch/src/acle/dsp.rs b/crates/core_arch/src/acle/dsp.rs
index b021a68042..f3fe4c437c 100644
--- a/crates/core_arch/src/acle/dsp.rs
+++ b/crates/core_arch/src/acle/dsp.rs
@@ -37,7 +37,7 @@ extern "C" {
 /// Returns the 32-bit saturating signed equivalent of a + b.
 #[inline]
 #[cfg_attr(test, assert_instr(qadd))]
-pub unsafe fn qadd(a: i32, b: i32) -> i32 {
+pub unsafe fn __qadd(a: i32, b: i32) -> i32 {
     arm_qadd(a, b)
 }
 
@@ -46,6 +46,6 @@ pub unsafe fn qadd(a: i32, b: i32) -> i32 {
 /// Returns the 32-bit saturating signed equivalent of a - b.
 #[inline]
 #[cfg_attr(test, assert_instr(qsub))]
-pub unsafe fn qsub(a: i32, b: i32) -> i32 {
+pub unsafe fn __qsub(a: i32, b: i32) -> i32 {
     arm_qsub(a, b)
 }
diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/acle/simd32.rs
index 13eda17697..76dce39f57 100644
--- a/crates/core_arch/src/acle/simd32.rs
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -157,7 +157,7 @@ extern "C" {
 /// res\[3\] = a\[3\] + b\[3\]
 #[inline]
 #[cfg_attr(test, assert_instr(qadd8))]
-pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
     dsp_call!(arm_qadd8, a, b)
 }
 
@@ -171,7 +171,7 @@ pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 /// res\[3\] = a\[3\] - b\[3\]
 #[inline]
 #[cfg_attr(test, assert_instr(qsub8))]
-pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
     dsp_call!(arm_qsub8, a, b)
 }
 
@@ -183,7 +183,7 @@ pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 /// res\[1\] = a\[1\] - b\[1\]
 #[inline]
 #[cfg_attr(test, assert_instr(qsub16))]
-pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
     dsp_call!(arm_qsub16, a, b)
 }
 
@@ -195,7 +195,7 @@ pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 /// res\[1\] = a\[1\] + b\[1\]
 #[inline]
 #[cfg_attr(test, assert_instr(qadd16))]
-pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
     dsp_call!(arm_qadd16, a, b)
 }
 
@@ -205,7 +205,7 @@ pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 /// res\[1\] = a\[1\] + b\[0\]
 #[inline]
 #[cfg_attr(test, assert_instr(qasx))]
-pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
     dsp_call!(arm_qasx, a, b)
 }
 
@@ -215,7 +215,7 @@ pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 /// res\[1\] = a\[1\] - b\[0\]
 #[inline]
 #[cfg_attr(test, assert_instr(qsax))]
-pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
     dsp_call!(arm_qsax, a, b)
 }
 
@@ -227,7 +227,7 @@ pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 /// and the GE bits of the APSR are set.
 #[inline]
 #[cfg_attr(test, assert_instr(sadd16))]
-pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
     dsp_call!(arm_sadd16, a, b)
 }
 
@@ -241,7 +241,7 @@ pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 /// and the GE bits of the APSR are set.
 #[inline]
 #[cfg_attr(test, assert_instr(sadd8))]
-pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
     dsp_call!(arm_sadd8, a, b)
 }
 
@@ -252,7 +252,7 @@ pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 /// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
 #[inline]
 #[cfg_attr(test, assert_instr(smlad))]
-pub unsafe fn smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+pub unsafe fn __smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
     arm_smlad(::mem::transmute(a), ::mem::transmute(b), c)
 }
 
@@ -263,7 +263,7 @@ pub unsafe fn smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
 /// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
 #[inline]
 #[cfg_attr(test, assert_instr(smlsd))]
-pub unsafe fn smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+pub unsafe fn __smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
     arm_smlsd(::mem::transmute(a), ::mem::transmute(b), c)
 }
 
@@ -275,7 +275,7 @@ pub unsafe fn smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
 /// and the GE bits of the APSR are set.
 #[inline]
 #[cfg_attr(test, assert_instr(sasx))]
-pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
     dsp_call!(arm_sasx, a, b)
 }
 
@@ -291,7 +291,7 @@ pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 /// where GE are bits of APSR
 #[inline]
 #[cfg_attr(test, assert_instr(sel))]
-pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
     dsp_call!(arm_sel, a, b)
 }
 
@@ -305,7 +305,7 @@ pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 /// res\[3\] = (a\[3\] + b\[3\]) / 2
 #[inline]
 #[cfg_attr(test, assert_instr(shadd8))]
-pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
     dsp_call!(arm_shadd8, a, b)
 }
 
@@ -317,7 +317,7 @@ pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 /// res\[1\] = (a\[1\] + b\[1\]) / 2
 #[inline]
 #[cfg_attr(test, assert_instr(shadd16))]
-pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
     dsp_call!(arm_shadd16, a, b)
 }
 
@@ -331,7 +331,7 @@ pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 /// res\[3\] = (a\[3\] - b\[3\]) / 2
 #[inline]
 #[cfg_attr(test, assert_instr(shsub8))]
-pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
     dsp_call!(arm_shsub8, a, b)
 }
 
@@ -343,7 +343,7 @@ pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
 /// res\[1\] = (a\[1\] - b\[1\]) / 2
 #[inline]
 #[cfg_attr(test, assert_instr(shsub16))]
-pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
     dsp_call!(arm_shsub16, a, b)
 }
 
@@ -356,7 +356,7 @@ pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
 /// and sets the Q flag if overflow occurs on the addition.
 #[inline]
 #[cfg_attr(test, assert_instr(smuad))]
-pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
+pub unsafe fn __smuad(a: int16x2_t, b: int16x2_t) -> i32 {
     arm_smuad(::mem::transmute(a), ::mem::transmute(b))
 }
 
@@ -369,7 +369,7 @@ pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
 /// and sets the Q flag if overflow occurs on the addition.
 #[inline]
 #[cfg_attr(test, assert_instr(smuadx))]
-pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
+pub unsafe fn __smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
     arm_smuadx(::mem::transmute(a), ::mem::transmute(b))
 }
 
@@ -382,7 +382,7 @@ pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
 /// and sets the Q flag if overflow occurs on the addition.
 #[inline]
 #[cfg_attr(test, assert_instr(smusd))]
-pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
+pub unsafe fn __smusd(a: int16x2_t, b: int16x2_t) -> i32 {
     arm_smusd(::mem::transmute(a), ::mem::transmute(b))
 }
 
@@ -395,7 +395,7 @@ pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
 /// and sets the Q flag if overflow occurs on the addition.
 #[inline]
 #[cfg_attr(test, assert_instr(smusdx))]
-pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
+pub unsafe fn __smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
     arm_smusdx(::mem::transmute(a), ::mem::transmute(b))
 }
 
@@ -407,7 +407,7 @@ pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
 ///          (a\[2\] - b\[2\]) + (a\[3\] - b\[3\])
 #[inline]
 #[cfg_attr(test, assert_instr(usad8))]
-pub unsafe fn usad8(a: int8x4_t, b: int8x4_t) -> u32 {
+pub unsafe fn __usad8(a: int8x4_t, b: int8x4_t) -> u32 {
     arm_usad8(::mem::transmute(a), ::mem::transmute(b))
 }
 
@@ -419,7 +419,7 @@ pub unsafe fn usad8(a: int8x4_t, b: int8x4_t) -> u32 {
 ///          (a\[2\] - b\[2\]) + (a\[3\] - b\[3\]) + c
 #[inline]
 #[cfg_attr(test, assert_instr(usad8))]
-pub unsafe fn usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
+pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
     usad8(a, b) + c
 }
 

From 660ead1a2b260c64cd22a6771df0f9bc9823baf4 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 18 Feb 2019 15:03:13 +0100
Subject: [PATCH 26/31] fix CI

---
 crates/core_arch/src/acle/simd32.rs | 58 ++++++++++++++---------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/acle/simd32.rs
index 76dce39f57..58b47304c4 100644
--- a/crates/core_arch/src/acle/simd32.rs
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -433,18 +433,18 @@ mod tests {
     #[test]
     fn qadd() {
         unsafe {
-            assert_eq!(dsp::qadd(-10, 60), 50);
-            assert_eq!(dsp::qadd(::std::i32::MAX, 10), ::std::i32::MAX);
-            assert_eq!(dsp::qadd(::std::i32::MIN, -10), ::std::i32::MIN);
+            assert_eq!(dsp::__qadd(-10, 60), 50);
+            assert_eq!(dsp::__qadd(::std::i32::MAX, 10), ::std::i32::MAX);
+            assert_eq!(dsp::__qadd(::std::i32::MIN, -10), ::std::i32::MIN);
         }
     }
 
     #[test]
     fn qsub() {
         unsafe {
-            assert_eq!(dsp::qsub(10, 60), -50);
-            assert_eq!(dsp::qsub(::std::i32::MAX, -10), ::std::i32::MAX);
-            assert_eq!(dsp::qsub(::std::i32::MIN, 10), ::std::i32::MIN);
+            assert_eq!(dsp::__qsub(10, 60), -50);
+            assert_eq!(dsp::__qsub(::std::i32::MAX, -10), ::std::i32::MAX);
+            assert_eq!(dsp::__qsub(::std::i32::MIN, 10), ::std::i32::MIN);
         }
     }
 
@@ -454,7 +454,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
             let b = i8x4::new(2, -1, 0, 1);
             let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::qadd8, a, b);
+            let r: i8x4 = dsp_call!(dsp::__qadd8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -465,7 +465,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
             let b = i8x4::new(2, -1, 0, 1);
             let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
-            let r: i8x4 = dsp_call!(dsp::qsub8, a, b);
+            let r: i8x4 = dsp_call!(dsp::__qsub8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -476,7 +476,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(2, -1);
             let c = i16x2::new(3, 1);
-            let r: i16x2 = dsp_call!(dsp::qadd16, a, b);
+            let r: i16x2 = dsp_call!(dsp::__qadd16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -487,7 +487,7 @@ mod tests {
             let a = i16x2::new(10, 20);
             let b = i16x2::new(20, -10);
             let c = i16x2::new(-10, 30);
-            let r: i16x2 = dsp_call!(dsp::qsub16, a, b);
+            let r: i16x2 = dsp_call!(dsp::__qsub16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -498,7 +498,7 @@ mod tests {
             let a = i16x2::new(1, ::std::i16::MAX);
             let b = i16x2::new(2, 2);
             let c = i16x2::new(-1, ::std::i16::MAX);
-            let r: i16x2 = dsp_call!(dsp::qasx, a, b);
+            let r: i16x2 = dsp_call!(dsp::__qasx, a, b);
             assert_eq!(r, c);
         }
     }
@@ -509,7 +509,7 @@ mod tests {
             let a = i16x2::new(1, ::std::i16::MAX);
             let b = i16x2::new(2, 2);
             let c = i16x2::new(3, ::std::i16::MAX - 2);
-            let r: i16x2 = dsp_call!(dsp::qsax, a, b);
+            let r: i16x2 = dsp_call!(dsp::__qsax, a, b);
             assert_eq!(r, c);
         }
     }
@@ -520,7 +520,7 @@ mod tests {
             let a = i16x2::new(1, ::std::i16::MAX);
             let b = i16x2::new(2, 2);
             let c = i16x2::new(3, -::std::i16::MAX);
-            let r: i16x2 = dsp_call!(dsp::sadd16, a, b);
+            let r: i16x2 = dsp_call!(dsp::__sadd16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -531,7 +531,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
             let b = i8x4::new(4, 3, 2, 2);
             let c = i8x4::new(5, 5, 5, -::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::sadd8, a, b);
+            let r: i8x4 = dsp_call!(dsp::__sadd8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -542,7 +542,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(2, 1);
             let c = i16x2::new(0, 4);
-            let r: i16x2 = dsp_call!(dsp::sasx, a, b);
+            let r: i16x2 = dsp_call!(dsp::__sasx, a, b);
             assert_eq!(r, c);
         }
     }
@@ -552,7 +552,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(3, 4);
-            let r = dsp::smlad(::mem::transmute(a), ::mem::transmute(b), 10);
+            let r = dsp::__smlad(::mem::transmute(a), ::mem::transmute(b), 10);
             assert_eq!(r, (1 * 3) + (2 * 4) + 10);
         }
     }
@@ -562,7 +562,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(3, 4);
-            let r = dsp::smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
+            let r = dsp::__smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
             assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
         }
     }
@@ -573,9 +573,9 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
             let b = i8x4::new(4, 3, 2, 2);
             // call sadd8() to set GE bits
-            dsp::sadd8(::mem::transmute(a), ::mem::transmute(b));
+            dsp::__sadd8(::mem::transmute(a), ::mem::transmute(b));
             let c = i8x4::new(1, 2, 3, ::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::sel, a, b);
+            let r: i8x4 = dsp_call!(dsp::__sel, a, b);
             assert_eq!(r, c);
         }
     }
@@ -586,7 +586,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, 4);
             let b = i8x4::new(5, 4, 3, 2);
             let c = i8x4::new(3, 3, 3, 3);
-            let r: i8x4 = dsp_call!(dsp::shadd8, a, b);
+            let r: i8x4 = dsp_call!(dsp::__shadd8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -597,7 +597,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
             let c = i16x2::new(3, 3);
-            let r: i16x2 = dsp_call!(dsp::shadd16, a, b);
+            let r: i16x2 = dsp_call!(dsp::__shadd16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -608,7 +608,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, 4);
             let b = i8x4::new(5, 4, 3, 2);
             let c = i8x4::new(-2, -1, 0, 1);
-            let r: i8x4 = dsp_call!(dsp::shsub8, a, b);
+            let r: i8x4 = dsp_call!(dsp::__shsub8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -619,7 +619,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
             let c = i16x2::new(-2, -1);
-            let r: i16x2 = dsp_call!(dsp::shsub16, a, b);
+            let r: i16x2 = dsp_call!(dsp::__shsub16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -629,7 +629,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
-            let r = dsp::smuad(::mem::transmute(a), ::mem::transmute(b));
+            let r = dsp::__smuad(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, 13);
         }
     }
@@ -639,7 +639,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
-            let r = dsp::smuadx(::mem::transmute(a), ::mem::transmute(b));
+            let r = dsp::__smuadx(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, 14);
         }
     }
@@ -649,7 +649,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
-            let r = dsp::smusd(::mem::transmute(a), ::mem::transmute(b));
+            let r = dsp::__smusd(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, -3);
         }
     }
@@ -659,7 +659,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
-            let r = dsp::smusdx(::mem::transmute(a), ::mem::transmute(b));
+            let r = dsp::__smusdx(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, -6);
         }
     }
@@ -669,7 +669,7 @@ mod tests {
         unsafe {
             let a = i8x4::new(1, 2, 3, 4);
             let b = i8x4::new(4, 3, 2, 1);
-            let r = dsp::usad8(::mem::transmute(a), ::mem::transmute(b));
+            let r = dsp::__usad8(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, 8);
         }
     }
@@ -680,7 +680,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, 4);
             let b = i8x4::new(4, 3, 2, 1);
             let c = 10;
-            let r = dsp::usad8a(::mem::transmute(a), ::mem::transmute(b), c);
+            let r = dsp::__usad8a(::mem::transmute(a), ::mem::transmute(b), c);
             assert_eq!(r, 8 + c);
         }
     }

From 608fdd73cf1aeec1173d6a6378e86a44b47ce1a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mateusz=20Miku=C5=82a?= <mati865@users.noreply.github.com>
Date: Mon, 18 Feb 2019 16:45:02 +0100
Subject: [PATCH 27/31] Update crates/core_arch/src/acle/simd32.rs

Co-Authored-By: japaric <jorge@japaric.io>
---
 crates/core_arch/src/acle/simd32.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/acle/simd32.rs
index 58b47304c4..ae704ef175 100644
--- a/crates/core_arch/src/acle/simd32.rs
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -420,7 +420,7 @@ pub unsafe fn __usad8(a: int8x4_t, b: int8x4_t) -> u32 {
 #[inline]
 #[cfg_attr(test, assert_instr(usad8))]
 pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
-    usad8(a, b) + c
+    __usad8(a, b) + c
 }
 
 #[cfg(test)]

From 3a30a06636ed086b545a5d3468f99a185dbbb38c Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 18 Feb 2019 17:39:29 +0100
Subject: [PATCH 28/31] acle/ex: CLREX requires v6k

---
 crates/core_arch/src/acle/ex.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/crates/core_arch/src/acle/ex.rs b/crates/core_arch/src/acle/ex.rs
index c3ebe0bfad..0426c65186 100644
--- a/crates/core_arch/src/acle/ex.rs
+++ b/crates/core_arch/src/acle/ex.rs
@@ -3,8 +3,10 @@
 /// Removes the exclusive lock created by LDREX
 // Supported: v6, v6K, v7-M, v7-A, v7-R
 // Not supported: v5, v6-M
+// NOTE: v6 before v6K has no dedicated CLREX instruction; to clear the exclusive monitor there,
+// users have to perform a dummy STREX operation
 #[cfg(any(
-    all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+    all(target_feature = "v6k", not(target_feature = "mclass")), // excludes v6-M
     all(target_feature = "v7", target_feature = "mclass"), // v7-M
 ))]
 pub unsafe fn __clrex() {

From f3d8449c41b87535cf4a391b91fc93edcd13b348 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 18 Feb 2019 18:01:37 +0100
Subject: [PATCH 29/31] acle/{dsp,simd32}: fix unit tests

---
 crates/core_arch/src/acle/dsp.rs    | 25 +++++++++++
 crates/core_arch/src/acle/simd32.rs | 67 +++++++++++------------------
 2 files changed, 49 insertions(+), 43 deletions(-)

diff --git a/crates/core_arch/src/acle/dsp.rs b/crates/core_arch/src/acle/dsp.rs
index f3fe4c437c..e929e98e40 100644
--- a/crates/core_arch/src/acle/dsp.rs
+++ b/crates/core_arch/src/acle/dsp.rs
@@ -49,3 +49,28 @@ pub unsafe fn __qadd(a: i32, b: i32) -> i32 {
 pub unsafe fn __qsub(a: i32, b: i32) -> i32 {
     arm_qsub(a, b)
 }
+
+#[cfg(test)]
+mod tests {
+    use core_arch::arm::*;
+    use std::mem;
+    use stdsimd_test::simd_test;
+
+    #[test]
+    fn qadd() {
+        unsafe {
+            assert_eq!(super::__qadd(-10, 60), 50);
+            assert_eq!(super::__qadd(::std::i32::MAX, 10), ::std::i32::MAX);
+            assert_eq!(super::__qadd(::std::i32::MIN, -10), ::std::i32::MIN);
+        }
+    }
+
+    #[test]
+    fn qsub() {
+        unsafe {
+            assert_eq!(super::__qsub(10, 60), -50);
+            assert_eq!(super::__qsub(::std::i32::MAX, -10), ::std::i32::MAX);
+            assert_eq!(super::__qsub(::std::i32::MIN, 10), ::std::i32::MIN);
+        }
+    }
+}
diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/acle/simd32.rs
index ae704ef175..eae0f0b830 100644
--- a/crates/core_arch/src/acle/simd32.rs
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -425,36 +425,17 @@ pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
 
 #[cfg(test)]
 mod tests {
-    use core_arch::arm::*;
-    use core_arch::simd::*;
+    use core_arch::simd::{i8x4, i16x2};
     use std::mem;
     use stdsimd_test::simd_test;
 
-    #[test]
-    fn qadd() {
-        unsafe {
-            assert_eq!(dsp::__qadd(-10, 60), 50);
-            assert_eq!(dsp::__qadd(::std::i32::MAX, 10), ::std::i32::MAX);
-            assert_eq!(dsp::__qadd(::std::i32::MIN, -10), ::std::i32::MIN);
-        }
-    }
-
-    #[test]
-    fn qsub() {
-        unsafe {
-            assert_eq!(dsp::__qsub(10, 60), -50);
-            assert_eq!(dsp::__qsub(::std::i32::MAX, -10), ::std::i32::MAX);
-            assert_eq!(dsp::__qsub(::std::i32::MIN, 10), ::std::i32::MIN);
-        }
-    }
-
     #[test]
     fn qadd8() {
         unsafe {
             let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
             let b = i8x4::new(2, -1, 0, 1);
             let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::__qadd8, a, b);
+            let r: i8x4 = dsp_call!(super::__qadd8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -465,7 +446,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
             let b = i8x4::new(2, -1, 0, 1);
             let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
-            let r: i8x4 = dsp_call!(dsp::__qsub8, a, b);
+            let r: i8x4 = dsp_call!(super::__qsub8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -476,7 +457,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(2, -1);
             let c = i16x2::new(3, 1);
-            let r: i16x2 = dsp_call!(dsp::__qadd16, a, b);
+            let r: i16x2 = dsp_call!(super::__qadd16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -487,7 +468,7 @@ mod tests {
             let a = i16x2::new(10, 20);
             let b = i16x2::new(20, -10);
             let c = i16x2::new(-10, 30);
-            let r: i16x2 = dsp_call!(dsp::__qsub16, a, b);
+            let r: i16x2 = dsp_call!(super::__qsub16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -498,7 +479,7 @@ mod tests {
             let a = i16x2::new(1, ::std::i16::MAX);
             let b = i16x2::new(2, 2);
             let c = i16x2::new(-1, ::std::i16::MAX);
-            let r: i16x2 = dsp_call!(dsp::__qasx, a, b);
+            let r: i16x2 = dsp_call!(super::__qasx, a, b);
             assert_eq!(r, c);
         }
     }
@@ -509,7 +490,7 @@ mod tests {
             let a = i16x2::new(1, ::std::i16::MAX);
             let b = i16x2::new(2, 2);
             let c = i16x2::new(3, ::std::i16::MAX - 2);
-            let r: i16x2 = dsp_call!(dsp::__qsax, a, b);
+            let r: i16x2 = dsp_call!(super::__qsax, a, b);
             assert_eq!(r, c);
         }
     }
@@ -520,7 +501,7 @@ mod tests {
             let a = i16x2::new(1, ::std::i16::MAX);
             let b = i16x2::new(2, 2);
             let c = i16x2::new(3, -::std::i16::MAX);
-            let r: i16x2 = dsp_call!(dsp::__sadd16, a, b);
+            let r: i16x2 = dsp_call!(super::__sadd16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -531,7 +512,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
             let b = i8x4::new(4, 3, 2, 2);
             let c = i8x4::new(5, 5, 5, -::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::__sadd8, a, b);
+            let r: i8x4 = dsp_call!(super::__sadd8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -542,7 +523,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(2, 1);
             let c = i16x2::new(0, 4);
-            let r: i16x2 = dsp_call!(dsp::__sasx, a, b);
+            let r: i16x2 = dsp_call!(super::__sasx, a, b);
             assert_eq!(r, c);
         }
     }
@@ -552,7 +533,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(3, 4);
-            let r = dsp::__smlad(::mem::transmute(a), ::mem::transmute(b), 10);
+            let r = super::__smlad(::mem::transmute(a), ::mem::transmute(b), 10);
             assert_eq!(r, (1 * 3) + (2 * 4) + 10);
         }
     }
@@ -562,7 +543,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(3, 4);
-            let r = dsp::__smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
+            let r = super::__smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
             assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
         }
     }
@@ -573,9 +554,9 @@ mod tests {
             let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
             let b = i8x4::new(4, 3, 2, 2);
             // call sadd8() to set GE bits
-            dsp::__sadd8(::mem::transmute(a), ::mem::transmute(b));
+            super::__sadd8(::mem::transmute(a), ::mem::transmute(b));
             let c = i8x4::new(1, 2, 3, ::std::i8::MAX);
-            let r: i8x4 = dsp_call!(dsp::__sel, a, b);
+            let r: i8x4 = dsp_call!(super::__sel, a, b);
             assert_eq!(r, c);
         }
     }
@@ -586,7 +567,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, 4);
             let b = i8x4::new(5, 4, 3, 2);
             let c = i8x4::new(3, 3, 3, 3);
-            let r: i8x4 = dsp_call!(dsp::__shadd8, a, b);
+            let r: i8x4 = dsp_call!(super::__shadd8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -597,7 +578,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
             let c = i16x2::new(3, 3);
-            let r: i16x2 = dsp_call!(dsp::__shadd16, a, b);
+            let r: i16x2 = dsp_call!(super::__shadd16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -608,7 +589,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, 4);
             let b = i8x4::new(5, 4, 3, 2);
             let c = i8x4::new(-2, -1, 0, 1);
-            let r: i8x4 = dsp_call!(dsp::__shsub8, a, b);
+            let r: i8x4 = dsp_call!(super::__shsub8, a, b);
             assert_eq!(r, c);
         }
     }
@@ -619,7 +600,7 @@ mod tests {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
             let c = i16x2::new(-2, -1);
-            let r: i16x2 = dsp_call!(dsp::__shsub16, a, b);
+            let r: i16x2 = dsp_call!(super::__shsub16, a, b);
             assert_eq!(r, c);
         }
     }
@@ -629,7 +610,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
-            let r = dsp::__smuad(::mem::transmute(a), ::mem::transmute(b));
+            let r = super::__smuad(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, 13);
         }
     }
@@ -639,7 +620,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
-            let r = dsp::__smuadx(::mem::transmute(a), ::mem::transmute(b));
+            let r = super::__smuadx(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, 14);
         }
     }
@@ -649,7 +630,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
-            let r = dsp::__smusd(::mem::transmute(a), ::mem::transmute(b));
+            let r = super::__smusd(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, -3);
         }
     }
@@ -659,7 +640,7 @@ mod tests {
         unsafe {
             let a = i16x2::new(1, 2);
             let b = i16x2::new(5, 4);
-            let r = dsp::__smusdx(::mem::transmute(a), ::mem::transmute(b));
+            let r = super::__smusdx(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, -6);
         }
     }
@@ -669,7 +650,7 @@ mod tests {
         unsafe {
             let a = i8x4::new(1, 2, 3, 4);
             let b = i8x4::new(4, 3, 2, 1);
-            let r = dsp::__usad8(::mem::transmute(a), ::mem::transmute(b));
+            let r = super::__usad8(::mem::transmute(a), ::mem::transmute(b));
             assert_eq!(r, 8);
         }
     }
@@ -680,7 +661,7 @@ mod tests {
             let a = i8x4::new(1, 2, 3, 4);
             let b = i8x4::new(4, 3, 2, 1);
             let c = 10;
-            let r = dsp::__usad8a(::mem::transmute(a), ::mem::transmute(b), c);
+            let r = super::__usada8(::mem::transmute(a), ::mem::transmute(b), c);
             assert_eq!(r, 8 + c);
         }
     }

From 9d1bb44414fa8c9d6d76cea748f45182ab6753ba Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 18 Feb 2019 18:21:08 +0100
Subject: [PATCH 30/31] assert_instr: bump instruction limit for simd32

---
 crates/stdsimd-test/src/lib.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/crates/stdsimd-test/src/lib.rs b/crates/stdsimd-test/src/lib.rs
index dec44401d9..66ee9dd894 100644
--- a/crates/stdsimd-test/src/lib.rs
+++ b/crates/stdsimd-test/src/lib.rs
@@ -153,6 +153,10 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
                 // in some cases exceed the limit.
                 "cvtpi2ps" => 25,
 
+                // core_arch/src/acle/simd32
+                "usad8" => 27,
+                "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" => 29,
+
                 // Original limit was 20 instructions, but ARM DSP Intrinsics
                 // are exactly 20 instructions long. So bump
                 // the limit to 22 instead of adding here a

From a36e20e22d7d34192c9a58408884800dfeeb0b48 Mon Sep 17 00:00:00 2001
From: Jorge Aparicio <jorge@japaric.io>
Date: Mon, 18 Feb 2019 18:59:27 +0100
Subject: [PATCH 31/31] cargo fmt

---
 crates/core_arch/src/acle/simd32.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/core_arch/src/acle/simd32.rs b/crates/core_arch/src/acle/simd32.rs
index eae0f0b830..a259f90d2c 100644
--- a/crates/core_arch/src/acle/simd32.rs
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -425,7 +425,7 @@ pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
 
 #[cfg(test)]
 mod tests {
-    use core_arch::simd::{i8x4, i16x2};
+    use core_arch::simd::{i16x2, i8x4};
     use std::mem;
     use stdsimd_test::simd_test;