Unroll biginteger loops and reduce copies (#205)

* Unroll biginteger loops
* Reduce field arithmetic copies
* Reduce ec arithmetic copies
* Fix
* CHANGELOG and tweaks
* Use intrinsics in `add_nocarry`/`sub_noborrow`

Co-authored-by: Jon Chuang <jon-chuang@users.noreply.github.com>

* Update CHANGELOG
* fmt
* Remove assert
* minor changes for bigint
* minor changes for bigint
* Small clean up

Co-authored-by: Jon Chuang <jon-chuang@users.noreply.github.com>
Co-authored-by: jonch <9093549+jon-chuang@users.noreply.github.com>
arkworks-rs · Feb 6, 2021 · 87e25cb · 87e25cb
1 parent 80ff5ea
commit 87e25cb
Show file tree

Hide file tree

Showing 10 changed files with 160 additions and 114 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -83,6 +83,7 @@ The main features of this release are:
 - #188 (ark-ec) Make Short Weierstrass random sampling result in an element with unknown discrete log
 - #190 (ark-ec) Add curve cycle trait and extended pairing cycle trait for all types of ec cycles.
 - #201 (ark-ec, ark-ff, ark-test-curves, ark-test-templates) Remove the dependency on `rand_xorshift`
+- #205 (ark-ec, ark-ff) Unroll loops and conditionally use intrinsics in `biginteger` arithmetic, and reduce copies in `ff` and `ec` arithmetic.
 
 ### Bug fixes
 - #36 (ark-ec) In Short-Weierstrass curves, include an infinity bit in `ToConstraintField`.

diff --git a/ec/src/models/short_weierstrass_jacobian.rs b/ec/src/models/short_weierstrass_jacobian.rs
@@ -260,7 +260,6 @@ impl<P: Parameters> Default for GroupAffine<P> {
 #[derivative(
     Copy(bound = "P: Parameters"),
     Clone(bound = "P: Parameters"),
-    Eq(bound = "P: Parameters"),
     Debug(bound = "P: Parameters"),
     Hash(bound = "P: Parameters")
 )]
@@ -279,6 +278,7 @@ impl<P: Parameters> Display for GroupProjective<P> {
     }
 }
 
+impl<P: Parameters> Eq for GroupProjective<P> {}
 impl<P: Parameters> PartialEq for GroupProjective<P> {
     fn eq(&self, other: &Self) -> bool {
         if self.is_zero() {
@@ -581,10 +581,9 @@ impl<'a, P: Parameters> Add<&'a Self> for GroupProjective<P> {
     type Output = Self;
 
     #[inline]
-    fn add(self, other: &'a Self) -> Self {
-        let mut copy = self;
-        copy += other;
-        copy
+    fn add(mut self, other: &'a Self) -> Self {
+        self += other;
+        self
     }
 }
 
@@ -657,10 +656,9 @@ impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective<P> {
     type Output = Self;
 
     #[inline]
-    fn sub(self, other: &'a Self) -> Self {
-        let mut copy = self;
-        copy -= other;
-        copy
+    fn sub(mut self, other: &'a Self) -> Self {
+        self -= other;
+        self
     }
 }
 

diff --git a/ec/src/models/twisted_edwards_extended.rs b/ec/src/models/twisted_edwards_extended.rs
@@ -544,10 +544,9 @@ ark_ff::impl_additive_ops_from_ref!(GroupProjective, Parameters);
 
 impl<'a, P: Parameters> Add<&'a Self> for GroupProjective<P> {
     type Output = Self;
-    fn add(self, other: &'a Self) -> Self {
-        let mut copy = self;
-        copy += other;
-        copy
+    fn add(mut self, other: &'a Self) -> Self {
+        self += other;
+        self
     }
 }
 
@@ -597,10 +596,9 @@ impl<'a, P: Parameters> AddAssign<&'a Self> for GroupProjective<P> {
 
 impl<'a, P: Parameters> Sub<&'a Self> for GroupProjective<P> {
     type Output = Self;
-    fn sub(self, other: &'a Self) -> Self {
-        let mut copy = self;
-        copy -= other;
-        copy
+    fn sub(mut self, other: &'a Self) -> Self {
+        self -= other;
+        self
     }
 }
 

diff --git a/ff-asm/src/lib.rs b/ff-asm/src/lib.rs
@@ -88,7 +88,7 @@ pub fn x86_64_asm_mul(input: TokenStream) -> TokenStream {
         let inner_ts: Expr = syn::parse_str(&impl_block).unwrap();
         let ts = quote::quote! {
             let a = &mut #a;
-            let b = #b;
+            let b = &#b;
             #inner_ts
         };
         ts.into()
@@ -290,7 +290,7 @@ fn generate_impl(num_limbs: usize, is_mul: bool) -> String {
     let mut ctx = Context::new();
     ctx.add_declaration("a", "r", "a");
     if is_mul {
-        ctx.add_declaration("b", "r", "&b");
+        ctx.add_declaration("b", "r", "b");
     }
     ctx.add_declaration("modulus", "r", "&P::MODULUS.0");
     ctx.add_declaration("0", "i", "0u64");

diff --git a/ff/src/biginteger/macros.rs b/ff/src/biginteger/macros.rs
@@ -13,39 +13,81 @@ macro_rules! bigint_impl {
             const NUM_LIMBS: usize = $num_limbs;
 
             #[inline]
+            #[ark_ff_asm::unroll_for_loops]
             fn add_nocarry(&mut self, other: &Self) -> bool {
                 let mut carry = 0;
 
-                for (a, b) in self.0.iter_mut().zip(other.0.iter()) {
-                    *a = adc!(*a, *b, &mut carry);
+                for i in 0..$num_limbs {
+                    #[cfg(all(target_arch = "x86_64", feature = "asm"))]
+                    #[allow(unsafe_code)]
+                    unsafe {
+                        use core::arch::x86_64::_addcarry_u64;
+                        carry = _addcarry_u64(carry, self.0[i], other.0[i], &mut self.0[i])
+                    };
+
+                    #[cfg(not(all(target_arch = "x86_64", feature = "asm")))]
+                    {
+                        self.0[i] = adc!(self.0[i], other.0[i], &mut carry);
+                    }
                 }
 
                 carry != 0
             }
 
             #[inline]
+            #[ark_ff_asm::unroll_for_loops]
             fn sub_noborrow(&mut self, other: &Self) -> bool {
                 let mut borrow = 0;
 
-                for (a, b) in self.0.iter_mut().zip(other.0.iter()) {
-                    *a = sbb!(*a, *b, &mut borrow);
+                for i in 0..$num_limbs {
+                    #[cfg(all(target_arch = "x86_64", feature = "asm"))]
+                    #[allow(unsafe_code)]
+                    unsafe {
+                        use core::arch::x86_64::_subborrow_u64;
+                        borrow = _subborrow_u64(borrow, self.0[i], other.0[i], &mut self.0[i])
+                    };
+
+                    #[cfg(not(all(target_arch = "x86_64", feature = "asm")))]
+                    {
+                        self.0[i] = sbb!(self.0[i], other.0[i], &mut borrow);
+                    }
                 }
 
                 borrow != 0
             }
 
             #[inline]
+            #[ark_ff_asm::unroll_for_loops]
+            #[allow(unused)]
             fn mul2(&mut self) {
-                let mut last = 0;
-                for i in &mut self.0 {
-                    let tmp = *i >> 63;
-                    *i <<= 1;
-                    *i |= last;
-                    last = tmp;
+                #[cfg(all(target_arch = "x86_64", feature = "asm"))]
+                #[allow(unsafe_code)]
+                {
+                    let mut carry = 0;
+
+                    for i in 0..$num_limbs {
+                        unsafe {
+                            use core::arch::x86_64::_addcarry_u64;
+                            carry = _addcarry_u64(carry, self.0[i], self.0[i], &mut self.0[i])
+                        };
+                    }
+                }
+
+                #[cfg(not(all(target_arch = "x86_64", feature = "asm")))]
+                {
+                    let mut last = 0;
+                    for i in 0..$num_limbs {
+                        let a = &mut self.0[i];
+                        let tmp = *a >> 63;
+                        *a <<= 1;
+                        *a |= last;
+                        last = tmp;
+                    }
                 }
             }
 
             #[inline]
+            #[ark_ff_asm::unroll_for_loops]
             fn muln(&mut self, mut n: u32) {
                 if n >= 64 * $num_limbs {
                     *self = Self::from(0);
@@ -54,35 +96,41 @@ macro_rules! bigint_impl {
 
                 while n >= 64 {
                     let mut t = 0;
-                    for i in &mut self.0 {
-                        core::mem::swap(&mut t, i);
+                    for i in 0..$num_limbs {
+                        core::mem::swap(&mut t, &mut self.0[i]);
                     }
                     n -= 64;
                 }
 
                 if n > 0 {
                     let mut t = 0;
-                    for i in &mut self.0 {
-                        let t2 = *i >> (64 - n);
-                        *i <<= n;
-                        *i |= t;
+                    #[allow(unused)]
+                    for i in 0..$num_limbs {
+                        let a = &mut self.0[i];
+                        let t2 = *a >> (64 - n);
+                        *a <<= n;
+                        *a |= t;
                         t = t2;
                     }
                 }
             }
 
             #[inline]
+            #[ark_ff_asm::unroll_for_loops]
+            #[allow(unused)]
             fn div2(&mut self) {
                 let mut t = 0;
-                for i in self.0.iter_mut().rev() {
-                    let t2 = *i << 63;
-                    *i >>= 1;
-                    *i |= t;
+                for i in 0..$num_limbs {
+                    let a = &mut self.0[$num_limbs - i - 1];
+                    let t2 = *a << 63;
+                    *a >>= 1;
+                    *a |= t;
                     t = t2;
                 }
             }
 
             #[inline]
+            #[ark_ff_asm::unroll_for_loops]
             fn divn(&mut self, mut n: u32) {
                 if n >= 64 * $num_limbs {
                     *self = Self::from(0);
@@ -91,18 +139,20 @@ macro_rules! bigint_impl {
 
                 while n >= 64 {
                     let mut t = 0;
-                    for i in self.0.iter_mut().rev() {
-                        core::mem::swap(&mut t, i);
+                    for i in 0..$num_limbs {
+                        core::mem::swap(&mut t, &mut self.0[$num_limbs - i - 1]);
                     }
                     n -= 64;
                 }
 
                 if n > 0 {
                     let mut t = 0;
-                    for i in self.0.iter_mut().rev() {
-                        let t2 = *i << (64 - n);
-                        *i >>= n;
-                        *i |= t;
+                    #[allow(unused)]
+                    for i in 0..$num_limbs {
+                        let a = &mut self.0[$num_limbs - i - 1];
+                        let t2 = *a << (64 - n);
+                        *a >>= n;
+                        *a |= t;
                         t = t2;
                     }
                 }
@@ -120,7 +170,12 @@ macro_rules! bigint_impl {
 
             #[inline]
             fn is_zero(&self) -> bool {
-                self.0.iter().all(|&e| e == 0)
+                for i in 0..$num_limbs {
+                    if self.0[i] != 0 {
+                        return false;
+                    }
+                }
+                true
             }
 
             #[inline]
@@ -270,16 +325,19 @@ macro_rules! bigint_impl {
 
         impl Ord for $name {
             #[inline]
+            #[ark_ff_asm::unroll_for_loops]
             fn cmp(&self, other: &Self) -> ::core::cmp::Ordering {
-                for (a, b) in self.0.iter().rev().zip(other.0.iter().rev()) {
+                use core::cmp::Ordering;
+                for i in 0..$num_limbs {
+                    let a = &self.0[$num_limbs - i - 1];
+                    let b = &other.0[$num_limbs - i - 1];
                     if a < b {
-                        return core::cmp::Ordering::Less;
+                        return Ordering::Less;
                     } else if a > b {
-                        return core::cmp::Ordering::Greater;
+                        return Ordering::Greater;
                     }
                 }
-
-                core::cmp::Ordering::Equal
+                Ordering::Equal
             }
         }
 

diff --git a/ff/src/fields/arithmetic.rs b/ff/src/fields/arithmetic.rs
@@ -26,7 +26,6 @@ macro_rules! impl_field_mul_assign {
                 {
                     // Tentatively avoid using assembly for `$limbs == 1`.
                     if $limbs <= 6 && $limbs > 1 {
-                        assert!($limbs <= 6);
                         ark_ff_asm::x86_64_asm_mul!($limbs, (self.0).0, (other.0).0);
                         self.reduce();
                         return;
@@ -104,7 +103,6 @@ macro_rules! impl_field_square_in_place {
                 let _no_carry: bool = !(first_bit_set || all_bits_set);
 
                 if $limbs <= 6 && _no_carry {
-                    assert!($limbs <= 6);
                     ark_ff_asm::x86_64_asm_square!($limbs, (self.0).0);
                     self.reduce();
                     return self;

diff --git a/ff/src/fields/macros.rs b/ff/src/fields/macros.rs
@@ -233,7 +233,7 @@ macro_rules! impl_Fp {
         }
 
         impl<P: $FpParameters> $Fp<P> {
-            #[inline]
+            #[inline(always)]
             pub(crate) fn is_valid(&self) -> bool {
                 self.0 < P::MODULUS
             }
@@ -605,7 +605,7 @@ macro_rules! impl_Fp {
             #[must_use]
             fn neg(self) -> Self {
                 if !self.is_zero() {
-                    let mut tmp = P::MODULUS.clone();
+                    let mut tmp = P::MODULUS;
                     tmp.sub_noborrow(&self.0);
                     $Fp::<P>(tmp, PhantomData)
                 } else {
@@ -618,43 +618,39 @@ macro_rules! impl_Fp {
             type Output = Self;
 
             #[inline]
-            fn add(self, other: &Self) -> Self {
-                let mut result = self.clone();
-                result.add_assign(other);
-                result
+            fn add(mut self, other: &Self) -> Self {
+                self.add_assign(other);
+                self
             }
         }
 
         impl<'a, P: $FpParameters> Sub<&'a $Fp<P>> for $Fp<P> {
             type Output = Self;
 
             #[inline]
-            fn sub(self, other: &Self) -> Self {
-                let mut result = self.clone();
-                result.sub_assign(other);
-                result
+            fn sub(mut self, other: &Self) -> Self {
+                self.sub_assign(other);
+                self
             }
         }
 
         impl<'a, P: $FpParameters> Mul<&'a $Fp<P>> for $Fp<P> {
             type Output = Self;
 
             #[inline]
-            fn mul(self, other: &Self) -> Self {
-                let mut result = self.clone();
-                result.mul_assign(other);
-                result
+            fn mul(mut self, other: &Self) -> Self {
+                self.mul_assign(other);
+                self
             }
         }
 
         impl<'a, P: $FpParameters> Div<&'a $Fp<P>> for $Fp<P> {
             type Output = Self;
 
             #[inline]
-            fn div(self, other: &Self) -> Self {
-                let mut result = self.clone();
-                result.mul_assign(&other.inverse().unwrap());
-                result
+            fn div(mut self, other: &Self) -> Self {
+                self.mul_assign(&other.inverse().unwrap());
+                self
             }
         }