diff --git a/Cargo.toml b/Cargo.toml
index 2f4f8bb0a..ce3b563c7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -45,8 +45,10 @@ serde = { version = "1.0", optional = true }
 
 [dev-dependencies]
 defmac = "0.2"
-quickcheck = { version = "0.7.2", default-features = false }
+quickcheck = { version = "0.8.1", default-features = false }
+quickcheck_macros = "0.8"
 rawpointer = "0.1"
+rand = "0.5.5"
 
 [features]
 # Enable blas usage
diff --git a/benches/numeric.rs b/benches/numeric.rs
index db0252c8a..d94ad0835 100644
--- a/benches/numeric.rs
+++ b/benches/numeric.rs
@@ -1,8 +1,7 @@
 
 #![feature(test)]
-
 extern crate test;
-use test::Bencher;
+use test::{black_box, Bencher};
 
 extern crate ndarray;
 use ndarray::prelude::*;
@@ -25,3 +24,200 @@ fn clip(bench: &mut Bencher)
         })
     });
 }
+
+
+#[bench]
+fn contiguous_sum_1e7(bench: &mut Bencher)
+{
+    let n = 1e7 as usize;
+    let a = Array::linspace(-1e6, 1e6, n);
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
+#[bench]
+fn contiguous_sum_int_1e7(bench: &mut Bencher)
+{
+    let n = 1e7 as usize;
+    let a = Array::from_vec((0..n).collect());
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
+#[bench]
+fn contiguous_sum_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::linspace(-1e6, 1e6, n);
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
+#[bench]
+fn contiguous_sum_int_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::from_vec((0..n).collect());
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
+#[bench]
+fn contiguous_sum_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n);
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
+#[bench]
+fn contiguous_sum_int_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..n).collect());
+    bench.iter(|| {
+        a.sum()
+    });
+}
+
+#[bench]
+fn contiguous_sum_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n * n)
+        .into_shape([n, n, n])
+        .unwrap();
+    bench.iter(|| black_box(&a).sum());
+}
+
+#[bench]
+fn contiguous_sum_int_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..n.pow(3)).collect())
+        .into_shape([n, n, n])
+        .unwrap();
+    bench.iter(|| black_box(&a).sum());
+}
+
+#[bench]
+fn inner_discontiguous_sum_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n * 2*n)
+        .into_shape([n, n, 2*n])
+        .unwrap();
+    let v = a.slice(s![.., .., ..;2]);
+    bench.iter(|| black_box(&v).sum());
+}
+
+#[bench]
+fn inner_discontiguous_sum_int_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..(n.pow(3) * 2)).collect())
+        .into_shape([n, n, 2*n])
+        .unwrap();
+    let v = a.slice(s![.., .., ..;2]);
+    bench.iter(|| black_box(&v).sum());
+}
+
+#[bench]
+fn middle_discontiguous_sum_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * 2*n * n)
+        .into_shape([n, 2*n, n])
+        .unwrap();
+    let v = a.slice(s![.., ..;2, ..]);
+    bench.iter(|| black_box(&v).sum());
+}
+
+#[bench]
+fn middle_discontiguous_sum_int_ix3_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..(n.pow(3) * 2)).collect())
+        .into_shape([n, 2*n, n])
+        .unwrap();
+    let v = a.slice(s![.., ..;2, ..]);
+    bench.iter(|| black_box(&v).sum());
+}
+
+#[bench]
+fn sum_by_row_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n)
+        .into_shape([n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(0))
+    });
+}
+
+#[bench]
+fn sum_by_row_int_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::from_vec((0..n.pow(2)).collect())
+        .into_shape([n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(0))
+    });
+}
+
+#[bench]
+fn sum_by_col_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n)
+        .into_shape([n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(1))
+    });
+}
+
+#[bench]
+fn sum_by_col_int_1e4(bench: &mut Bencher)
+{
+    let n = 1e4 as usize;
+    let a = Array::from_vec((0..n.pow(2)).collect())
+        .into_shape([n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(1))
+    });
+}
+
+#[bench]
+fn sum_by_middle_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::linspace(-1e6, 1e6, n * n * n)
+        .into_shape([n, n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(1))
+    });
+}
+
+#[bench]
+fn sum_by_middle_int_1e2(bench: &mut Bencher)
+{
+    let n = 1e2 as usize;
+    let a = Array::from_vec((0..n.pow(3)).collect())
+        .into_shape([n, n, n])
+        .unwrap();
+    bench.iter(|| {
+        a.sum_axis(Axis(1))
+    });
+}
diff --git a/src/dimension/dimension_trait.rs b/src/dimension/dimension_trait.rs
index f8357f069..038e5473c 100644
--- a/src/dimension/dimension_trait.rs
+++ b/src/dimension/dimension_trait.rs
@@ -291,8 +291,8 @@ pub trait Dimension : Clone + Eq + Debug + Send + Sync + Default +
         indices
     }
 
-    /// Compute the minimum stride axis (absolute value), under the constraint
-    /// that the length of the axis is > 1;
+    /// Compute the minimum stride axis (absolute value), preferring axes with
+    /// length > 1.
     #[doc(hidden)]
     fn min_stride_axis(&self, strides: &Self) -> Axis {
         let n = match self.ndim() {
@@ -301,7 +301,7 @@ pub trait Dimension : Clone + Eq + Debug + Send + Sync + Default +
             n => n,
         };
         axes_of(self, strides)
-            .rev()
+            .filter(|ax| ax.len() > 1)
             .min_by_key(|ax| ax.stride().abs())
             .map_or(Axis(n - 1), |ax| ax.axis())
     }
@@ -588,9 +588,9 @@ impl Dimension for Dim<[Ix; 2]> {
 
     #[inline]
     fn min_stride_axis(&self, strides: &Self) -> Axis {
-        let s = get!(strides, 0) as Ixs;
-        let t = get!(strides, 1) as Ixs;
-        if s.abs() < t.abs() {
+        let s = (get!(strides, 0) as isize).abs();
+        let t = (get!(strides, 1) as isize).abs();
+        if s < t && get!(self, 0) > 1 {
             Axis(0)
         } else {
             Axis(1)
@@ -697,6 +697,23 @@ impl Dimension for Dim<[Ix; 3]> {
         Some(Ix3(i, j, k))
     }
 
+    #[inline]
+    fn min_stride_axis(&self, strides: &Self) -> Axis {
+        let s = (get!(strides, 0) as isize).abs();
+        let t = (get!(strides, 1) as isize).abs();
+        let u = (get!(strides, 2) as isize).abs();
+        let (argmin, min) = if t < u && get!(self, 1) > 1 {
+            (Axis(1), t)
+        } else {
+            (Axis(2), u)
+        };
+        if s < min && get!(self, 0) > 1 {
+            Axis(0)
+        } else {
+            argmin
+        }
+    }
+
     /// Self is an index, return the stride offset
     #[inline]
     fn stride_offset(index: &Self, strides: &Self) -> isize {
diff --git a/src/dimension/mod.rs b/src/dimension/mod.rs
index aebcfc662..719acc678 100644
--- a/src/dimension/mod.rs
+++ b/src/dimension/mod.rs
@@ -629,7 +629,8 @@ mod test {
     use crate::error::{from_kind, ErrorKind};
     use crate::slice::Slice;
     use num_integer::gcd;
-    use quickcheck::{quickcheck, TestResult};
+    use quickcheck::TestResult;
+    use quickcheck_macros::quickcheck;
 
     #[test]
     fn slice_indexing_uncommon_strides() {
@@ -738,30 +739,29 @@ mod test {
         can_index_slice::<(), _>(&[], &Ix2(0, 2), &Ix2(2, 1)).unwrap_err();
     }
 
-    quickcheck! {
-        fn can_index_slice_not_custom_same_as_can_index_slice(data: Vec<u8>, dim: Vec<usize>) -> bool {
-            let dim = IxDyn(&dim);
-            let result = can_index_slice_not_custom(&data, &dim);
-            if dim.size_checked().is_none() {
-                // Avoid overflow `dim.default_strides()` or `dim.fortran_strides()`.
-                result.is_err()
-            } else {
-                result == can_index_slice(&data, &dim, &dim.default_strides()) &&
-                    result == can_index_slice(&data, &dim, &dim.fortran_strides())
-            }
+    #[quickcheck]
+    fn can_index_slice_not_custom_same_as_can_index_slice(data: Vec<u8>, dim: Vec<usize>) -> bool {
+        let dim = IxDyn(&dim);
+        let result = can_index_slice_not_custom(&data, &dim);
+        if dim.size_checked().is_none() {
+            // Avoid overflow `dim.default_strides()` or `dim.fortran_strides()`.
+            result.is_err()
+        } else {
+            result == can_index_slice(&data, &dim, &dim.default_strides()) &&
+                result == can_index_slice(&data, &dim, &dim.fortran_strides())
         }
     }
 
-    quickcheck! {
-        fn extended_gcd_solves_eq(a: isize, b: isize) -> bool {
-            let (g, (x, y)) = extended_gcd(a, b);
-            a * x + b * y == g
-        }
+    #[quickcheck]
+    fn extended_gcd_solves_eq(a: isize, b: isize) -> bool {
+        let (g, (x, y)) = extended_gcd(a, b);
+        a * x + b * y == g
+    }
 
-        fn extended_gcd_correct_gcd(a: isize, b: isize) -> bool {
-            let (g, _) = extended_gcd(a, b);
-            g == gcd(a, b)
-        }
+    #[quickcheck]
+    fn extended_gcd_correct_gcd(a: isize, b: isize) -> bool {
+        let (g, _) = extended_gcd(a, b);
+        g == gcd(a, b)
     }
 
     #[test]
@@ -773,73 +773,72 @@ mod test {
         assert_eq!(extended_gcd(-5, 0), (5, (-1, 0)));
     }
 
-    quickcheck! {
-        fn solve_linear_diophantine_eq_solution_existence(
-            a: isize, b: isize, c: isize
-        ) -> TestResult {
-            if a == 0 || b == 0 {
-                TestResult::discard()
-            } else {
-                TestResult::from_bool(
-                    (c % gcd(a, b) == 0) == solve_linear_diophantine_eq(a, b, c).is_some()
-                )
-            }
+    #[quickcheck]
+    fn solve_linear_diophantine_eq_solution_existence(
+        a: isize, b: isize, c: isize
+    ) -> TestResult {
+        if a == 0 || b == 0 {
+            TestResult::discard()
+        } else {
+            TestResult::from_bool(
+                (c % gcd(a, b) == 0) == solve_linear_diophantine_eq(a, b, c).is_some()
+            )
         }
+    }
 
-        fn solve_linear_diophantine_eq_correct_solution(
-            a: isize, b: isize, c: isize, t: isize
-        ) -> TestResult {
-            if a == 0 || b == 0 {
-                TestResult::discard()
-            } else {
-                match solve_linear_diophantine_eq(a, b, c) {
-                    Some((x0, xd)) => {
-                        let x = x0 + xd * t;
-                        let y = (c - a * x) / b;
-                        TestResult::from_bool(a * x + b * y == c)
-                    }
-                    None => TestResult::discard(),
+    #[quickcheck]
+    fn solve_linear_diophantine_eq_correct_solution(
+        a: isize, b: isize, c: isize, t: isize
+    ) -> TestResult {
+        if a == 0 || b == 0 {
+            TestResult::discard()
+        } else {
+            match solve_linear_diophantine_eq(a, b, c) {
+                Some((x0, xd)) => {
+                    let x = x0 + xd * t;
+                    let y = (c - a * x) / b;
+                    TestResult::from_bool(a * x + b * y == c)
                 }
+                None => TestResult::discard(),
             }
         }
     }
 
-    quickcheck! {
-        fn arith_seq_intersect_correct(
-            first1: isize, len1: isize, step1: isize,
-            first2: isize, len2: isize, step2: isize
-        ) -> TestResult {
-            use std::cmp;
+    #[quickcheck]
+    fn arith_seq_intersect_correct(
+        first1: isize, len1: isize, step1: isize,
+        first2: isize, len2: isize, step2: isize
+    ) -> TestResult {
+        use std::cmp;
 
-            if len1 == 0 || len2 == 0 {
-                // This case is impossible to reach in `arith_seq_intersect()`
-                // because the `min*` and `max*` arguments are inclusive.
-                return TestResult::discard();
-            }
-            let len1 = len1.abs();
-            let len2 = len2.abs();
-
-            // Convert to `min*` and `max*` arguments for `arith_seq_intersect()`.
-            let last1 = first1 + step1 * (len1 - 1);
-            let (min1, max1) = (cmp::min(first1, last1), cmp::max(first1, last1));
-            let last2 = first2 + step2 * (len2 - 1);
-            let (min2, max2) = (cmp::min(first2, last2), cmp::max(first2, last2));
-
-            // Naively determine if the sequences intersect.
-            let seq1: Vec<_> = (0..len1)
-                .map(|n| first1 + step1 * n)
-                .collect();
-            let intersects = (0..len2)
-                .map(|n| first2 + step2 * n)
-                .any(|elem2| seq1.contains(&elem2));
-
-            TestResult::from_bool(
-                arith_seq_intersect(
-                    (min1, max1, if step1 == 0 { 1 } else { step1 }),
-                    (min2, max2, if step2 == 0 { 1 } else { step2 })
-                ) == intersects
-            )
+        if len1 == 0 || len2 == 0 {
+            // This case is impossible to reach in `arith_seq_intersect()`
+            // because the `min*` and `max*` arguments are inclusive.
+            return TestResult::discard();
         }
+        let len1 = len1.abs();
+        let len2 = len2.abs();
+
+        // Convert to `min*` and `max*` arguments for `arith_seq_intersect()`.
+        let last1 = first1 + step1 * (len1 - 1);
+        let (min1, max1) = (cmp::min(first1, last1), cmp::max(first1, last1));
+        let last2 = first2 + step2 * (len2 - 1);
+        let (min2, max2) = (cmp::min(first2, last2), cmp::max(first2, last2));
+
+        // Naively determine if the sequences intersect.
+        let seq1: Vec<_> = (0..len1)
+            .map(|n| first1 + step1 * n)
+            .collect();
+        let intersects = (0..len2)
+            .map(|n| first2 + step2 * n)
+            .any(|elem2| seq1.contains(&elem2));
+
+        TestResult::from_bool(
+            arith_seq_intersect(
+                (min1, max1, if step1 == 0 { 1 } else { step1 }),
+                (min2, max2, if step2 == 0 { 1 } else { step2 })
+            ) == intersects
+        )
     }
 
     #[test]
diff --git a/src/impl_methods.rs b/src/impl_methods.rs
index de74af795..2e2c632e9 100644
--- a/src/impl_methods.rs
+++ b/src/impl_methods.rs
@@ -1619,12 +1619,11 @@ where
         axes_of(&self.dim, &self.strides)
     }
 
-    /*
-    /// Return the axis with the least stride (by absolute value)
-    pub fn min_stride_axis(&self) -> Axis {
+    /// Return the axis with the least stride (by absolute value),
+    /// preferring axes with len > 1.
+    pub(crate) fn min_stride_axis(&self) -> Axis {
         self.dim.min_stride_axis(&self.strides)
     }
-    */
 
     /// Return the axis with the greatest stride (by absolute value),
     /// preferring axes with len > 1.
@@ -1854,25 +1853,11 @@ where
         } else {
             let mut v = self.view();
             // put the narrowest axis at the last position
-            match v.ndim() {
-                0 | 1 => {}
-                2 => {
-                    if self.len_of(Axis(1)) <= 1
-                        || self.len_of(Axis(0)) > 1
-                            && self.stride_of(Axis(0)).abs() < self.stride_of(Axis(1)).abs()
-                    {
-                        v.swap_axes(0, 1);
-                    }
-                }
-                n => {
-                    let last = n - 1;
-                    let narrow_axis = v
-                        .axes()
-                        .filter(|ax| ax.len() > 1)
-                        .min_by_key(|ax| ax.stride().abs())
-                        .map_or(last, |ax| ax.axis().index());
-                    v.swap_axes(last, narrow_axis);
-                }
+            let n = v.ndim();
+            if n > 1 {
+                let last = n - 1;
+                let narrow_axis = self.min_stride_axis();
+                v.swap_axes(last, narrow_axis.index());
             }
             v.into_elements_base().fold(init, f)
         }
@@ -2103,3 +2088,42 @@ where
         })
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use crate::prelude::*;
+
+    #[test]
+    fn min_stride_axis() {
+        let a = Array1::<u8>::zeros(10);
+        assert_eq!(a.min_stride_axis(), Axis(0));
+
+        let a = Array2::<u8>::zeros((3, 3));
+        assert_eq!(a.min_stride_axis(), Axis(1));
+        assert_eq!(a.t().min_stride_axis(), Axis(0));
+
+        let a = ArrayD::<u8>::zeros(vec![3, 3]);
+        assert_eq!(a.min_stride_axis(), Axis(1));
+        assert_eq!(a.t().min_stride_axis(), Axis(0));
+
+        let min_axis = a.axes().min_by_key(|t| t.2.abs()).unwrap().axis();
+        assert_eq!(min_axis, Axis(1));
+
+        let mut b = ArrayD::<u8>::zeros(vec![2, 3, 4, 5]);
+        assert_eq!(b.min_stride_axis(), Axis(3));
+        for ax in 0..3 {
+            b.swap_axes(3, ax);
+            assert_eq!(b.min_stride_axis(), Axis(ax));
+            b.swap_axes(3, ax);
+        }
+        let mut v = b.view();
+        v.collapse_axis(Axis(3), 0);
+        assert_eq!(v.min_stride_axis(), Axis(2));
+
+        let a = Array2::<u8>::zeros((3, 3));
+        let v = a.broadcast((8, 3, 3)).unwrap();
+        assert_eq!(v.min_stride_axis(), Axis(0));
+        let v2 = a.broadcast((1, 3, 3)).unwrap();
+        assert_eq!(v2.min_stride_axis(), Axis(2));
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index d998be49f..39402bf56 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -104,6 +104,10 @@ extern crate num_integer;
 
 #[cfg(test)]
 extern crate quickcheck;
+#[cfg(test)]
+extern crate quickcheck_macros;
+#[cfg(test)]
+extern crate rand;
 
 #[cfg(feature = "docs")]
 pub mod doc;
diff --git a/src/numeric/impl_numeric.rs b/src/numeric/impl_numeric.rs
index 0016e21c5..cf2c0eafd 100644
--- a/src/numeric/impl_numeric.rs
+++ b/src/numeric/impl_numeric.rs
@@ -8,7 +8,6 @@
 
 use std::ops::{Add, Div, Mul};
 use num_traits::{self, Zero, Float, FromPrimitive};
-use itertools::free::enumerate;
 
 use crate::imp_prelude::*;
 use crate::numeric_util;
@@ -33,17 +32,17 @@ impl<A, S, D> ArrayBase<S, D>
         where A: Clone + Add<Output=A> + num_traits::Zero,
     {
         if let Some(slc) = self.as_slice_memory_order() {
-            return numeric_util::unrolled_fold(slc, A::zero, A::add);
+            return numeric_util::pairwise_sum(&slc);
         }
-        let mut sum = A::zero();
-        for row in self.inner_rows() {
-            if let Some(slc) = row.as_slice() {
-                sum = sum + numeric_util::unrolled_fold(slc, A::zero, A::add);
-            } else {
-                sum = sum + row.iter().fold(A::zero(), |acc, elt| acc + elt.clone());
+        if self.ndim() > 1 {
+            let ax = self.dim.min_stride_axis(&self.strides);
+            if self.len_of(ax) >= numeric_util::UNROLL_SIZE && self.stride_of(ax) == 1 {
+                let partial_sums: Vec<_> =
+                    self.lanes(ax).into_iter().map(|lane| lane.sum()).collect();
+                return numeric_util::pure_pairwise_sum(&partial_sums);
             }
         }
-        sum
+        numeric_util::iterator_pairwise_sum(self.iter())
     }
 
     /// Return the sum of all elements in the array.
@@ -104,21 +103,19 @@ impl<A, S, D> ArrayBase<S, D>
               D: RemoveAxis,
     {
         let n = self.len_of(axis);
-        let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
-        let stride = self.strides()[axis.index()];
-        if self.ndim() == 2 && stride == 1 {
+        if self.stride_of(axis) == 1 {
             // contiguous along the axis we are summing
-            let ax = axis.index();
-            for (i, elt) in enumerate(&mut res) {
-                *elt = self.index_axis(Axis(1 - ax), i).sum();
-            }
+            let mut res = Array::zeros(self.raw_dim().remove_axis(axis));
+            Zip::from(&mut res)
+                .and(self.lanes(axis))
+                .apply(|sum, lane| *sum = lane.sum());
+            res
+        } else if n <= numeric_util::NAIVE_SUM_THRESHOLD {
+            self.fold_axis(axis, A::zero(), |acc, x| acc.clone() + x.clone())
         } else {
-            for i in 0..n {
-                let view = self.index_axis(axis, i);
-                res = res + &view;
-            }
+            let (v1, v2) = self.view().split_at(axis, n / 2);
+            v1.sum_axis(axis) + v2.sum_axis(axis)
         }
-        res
     }
 
     /// Return mean along `axis`.
@@ -287,3 +284,150 @@ impl<A, S, D> ArrayBase<S, D>
     }
 }
 
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use super::numeric_util::{NAIVE_SUM_THRESHOLD, UNROLL_SIZE};
+    use self::{Array, s};
+    use quickcheck::{QuickCheck, StdGen, TestResult};
+
+    #[test]
+    fn test_sum_value_does_not_depend_on_axis() {
+        // `size` controls the length of the array of data
+        // We set it to be randomly drawn between 0 and
+        // a number larger than NAIVE_SUM_THRESHOLD * UNROLL_SIZE
+        let rng = StdGen::new(
+            rand::thread_rng(),
+            5* (NAIVE_SUM_THRESHOLD * UNROLL_SIZE).pow(3)
+        );
+        let mut quickcheck = QuickCheck::new().gen(rng).tests(100);
+        quickcheck.quickcheck(
+           _sum_value_does_not_depend_on_axis
+            as fn(
+               Vec<f64>
+            ) -> TestResult,
+        );
+    }
+
+    fn _sum_value_does_not_depend_on_axis(xs: Vec<f64>) -> TestResult {
+        // We want three axis of equal length - we drop some elements
+        // to get the right number
+        let axis_length = (xs.len() as f64).cbrt().floor() as usize;
+        let xs = &xs[..axis_length.pow(3)];
+
+        // We want to check that summing with respect to an axis
+        // is independent from the specific underlying implementation of
+        // pairwise sum, which is itself conditional on the arrangement
+        // in memory of the array elements.
+        // We will thus swap axes and compute the sum, in turn, with respect to
+        // axes 0, 1 and 2, while making sure that mathematically the same
+        // number should be spit out (because we are properly transposing before summing).
+        if axis_length > 0 {
+            let (a, b, c) = equivalent_arrays(xs.to_vec(), axis_length);
+
+            let sum1 = a.sum_axis(Axis(0));
+            let sum2 = b.sum_axis(Axis(1));
+            let sum3 = c.sum_axis(Axis(2));
+
+            let tol = 1e-10;
+            let first = (sum2.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+            let second = (sum3.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+            let third = (sum3.clone() - sum2.clone()).iter().all(|x| x.abs() < tol);
+
+            if first && second && third {
+                TestResult::passed()
+            } else {
+                TestResult::failed()
+            }
+        } else {
+            TestResult::passed()
+        }
+    }
+
+    #[test]
+    fn test_sum_value_does_not_depend_on_axis_with_discontinuous_array() {
+        // `size` controls the length of the array of data
+        // We set it to be randomly drawn between 0 and
+        // a number larger than NAIVE_SUM_THRESHOLD * UNROLL_SIZE
+        let rng = StdGen::new(
+            rand::thread_rng(),
+            5* (NAIVE_SUM_THRESHOLD * UNROLL_SIZE).pow(3)
+        );
+        let mut quickcheck = QuickCheck::new().gen(rng).tests(100);
+        quickcheck.quickcheck(
+            _sum_value_does_not_depend_on_axis_w_discontinuous_array
+                as fn(
+                Vec<f64>
+            ) -> TestResult,
+        );
+    }
+
+    fn _sum_value_does_not_depend_on_axis_w_discontinuous_array(xs: Vec<f64>) -> TestResult {
+        // We want three axis of equal length - we drop some elements
+        // to get the right number
+        let axis_length = (xs.len() as f64).cbrt().floor() as usize;
+        let xs = &xs[..axis_length.pow(3)];
+
+        // We want to check that summing with respect to an axis
+        // is independent from the specific underlying implementation of
+        // pairwise sum, which is itself conditional on the arrangement
+        // in memory of the array elements.
+        // We will thus swap axes and compute the sum, in turn, with respect to
+        // axes 0, 1 and 2, while making sure that mathematically the same
+        // number should be spit out (because we are properly transposing before summing).
+        if axis_length > 0 {
+            let (a, b, c) = equivalent_arrays(xs.to_vec(), axis_length);
+
+            let sum1 = a.slice(s![..;2, .., ..]).sum_axis(Axis(0));
+            let sum2 = b.slice(s![.., ..;2, ..]).sum_axis(Axis(1));
+            let sum3 = c.slice(s![.., .., ..;2]).sum_axis(Axis(2));
+
+            let tol = 1e-10;
+            let first = (sum2.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+            let second = (sum3.clone() - sum1.clone()).iter().all(|x| x.abs() < tol);
+            let third = (sum3.clone() - sum2.clone()).iter().all(|x| x.abs() < tol);
+
+            if first && second && third {
+                TestResult::passed()
+            } else {
+                TestResult::failed()
+            }
+        } else {
+            TestResult::passed()
+        }
+    }
+
+    // Given a vector with axis_length^3 elements, it returns three arrays,
+    // built using the vector elements, such that (mathematically):
+    // a.sum_axis(Axis(0) == b.sum_axis(Axis(1)) == c.sum_axis(Axis(2))
+    fn equivalent_arrays(xs: Vec<f64>, axis_length: usize) -> (Array3<f64>, Array3<f64>, Array3<f64>) {
+        assert!(xs.len() == axis_length.pow(3));
+
+        let a = Array::from_vec(xs)
+            .into_shape((axis_length, axis_length, axis_length))
+            .unwrap();
+        assert!(a.is_standard_layout());
+
+        let mut b = Array::zeros(a.raw_dim());
+        assert!(b.is_standard_layout());
+        for i in 0..axis_length {
+            for j in 0..axis_length {
+                for k in 0..axis_length {
+                    b[(i, j, k)] = a[(j, i, k)].clone();
+                }
+            }
+        }
+
+        let mut c = Array::zeros(a.raw_dim());
+        assert!(c.is_standard_layout());
+        for i in 0..axis_length {
+            for j in 0..axis_length {
+                for k in 0..axis_length {
+                    c[(i, j, k)] = a[(k, i, j)].clone();
+                }
+            }
+        }
+        return (a, b, c)
+    }
+
+}
\ No newline at end of file
diff --git a/src/numeric_util.rs b/src/numeric_util.rs
index b36d11605..f6540fb92 100644
--- a/src/numeric_util.rs
+++ b/src/numeric_util.rs
@@ -6,9 +6,95 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 use std::cmp;
-
+use std::ops::Add;
+use num_traits::{self, Zero};
 use crate::LinalgScalar;
 
+/// Size threshold to switch to naive summation in all implementations of pairwise summation.
+#[cfg(not(test))]
+pub(crate) const NAIVE_SUM_THRESHOLD: usize = 64;
+// Set it to a smaller number for testing purposes
+#[cfg(test)]
+pub(crate) const NAIVE_SUM_THRESHOLD: usize = 2;
+
+/// Number of elements processed by unrolled operators (to leverage SIMD instructions).
+pub(crate) const UNROLL_SIZE: usize = 8;
+
+/// An implementation of pairwise summation for a vector slice.
+///
+/// Pairwise summation compute the sum of a set of *n* numbers by splitting
+/// it recursively in two halves, summing their elements and then adding the respective
+/// sums.
+/// It switches to the naive sum algorithm once the size of the set to be summed
+/// is below a certain pre-defined threshold ([`threshold`]).
+///
+/// Pairwise summation is useful to reduce the accumulated round-off error
+/// when summing floating point numbers.
+/// Pairwise summation provides an asymptotic error bound of *O(ε* log *n)*, where
+/// *ε* is machine precision, compared to *O(εn)* of the naive summation algorithm.
+/// For more details, see [`paper`] or [`Wikipedia`].
+///
+/// [`paper`]: https://epubs.siam.org/doi/10.1137/0914050
+/// [`Wikipedia`]: https://en.wikipedia.org/wiki/Pairwise_summation
+/// [`threshold`]: constant.NAIVE_SUM_THRESHOLD.html
+pub(crate) fn pairwise_sum<A>(v: &[A]) -> A
+where
+    A: Clone + Add<Output=A> + Zero,
+{
+    let n = v.len();
+    if n <= NAIVE_SUM_THRESHOLD * UNROLL_SIZE {
+        return unrolled_fold(v, A::zero, A::add);
+    } else {
+        let mid_index = n / 2;
+        let (v1, v2) = v.split_at(mid_index);
+        pairwise_sum(v1) + pairwise_sum(v2)
+    }
+}
+
+/// An implementation of pairwise summation for an iterator.
+///
+/// See [`pairwise_sum`] for details on the algorithm.
+///
+/// [`pairwise_sum`]: fn.pairwise_sum.html
+pub(crate) fn iterator_pairwise_sum<'a, I, A: 'a>(iter: I) -> A
+where
+    I: Iterator<Item=&'a A>,
+    A: Clone + Add<Output=A> + Zero,
+{
+    let (len, _) = iter.size_hint();
+    let cap = len / NAIVE_SUM_THRESHOLD + if len % NAIVE_SUM_THRESHOLD != 0 { 1 } else { 0 };
+    let mut partial_sums = Vec::with_capacity(cap);
+    let (_, last_sum) = iter.fold((0, A::zero()), |(count, partial_sum), x| {
+        if count < NAIVE_SUM_THRESHOLD {
+            (count + 1, partial_sum + x.clone())
+        } else {
+            partial_sums.push(partial_sum);
+            (1, x.clone())
+        }
+    });
+    partial_sums.push(last_sum);
+
+    pure_pairwise_sum(&partial_sums)
+}
+
+/// An implementation of pairwise summation for a vector slice that never
+/// switches to the naive sum algorithm.
+pub(crate) fn pure_pairwise_sum<A>(v: &[A]) -> A
+    where
+        A: Clone + Add<Output=A> + Zero,
+{
+    let n = v.len();
+    match n {
+        0 => A::zero(),
+        1 => v[0].clone(),
+        n => {
+            let mid_index = n / 2;
+            let (v1, v2) = v.split_at(mid_index);
+            pure_pairwise_sum(v1) + pure_pairwise_sum(v2)
+        }
+    }
+}
+
 /// Fold over the manually unrolled `xs` with `f`
 pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
     where A: Clone,
@@ -17,7 +103,6 @@ pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
 {
     // eightfold unrolled so that floating point can be vectorized
     // (even with strict floating point accuracy semantics)
-    let mut acc = init();
     let (mut p0, mut p1, mut p2, mut p3,
          mut p4, mut p5, mut p6, mut p7) =
         (init(), init(), init(), init(),
@@ -34,18 +119,19 @@ pub fn unrolled_fold<A, I, F>(mut xs: &[A], init: I, f: F) -> A
 
         xs = &xs[8..];
     }
-    acc = f(acc.clone(), f(p0, p4));
-    acc = f(acc.clone(), f(p1, p5));
-    acc = f(acc.clone(), f(p2, p6));
-    acc = f(acc.clone(), f(p3, p7));
+    let (q0, q1, q2, q3) = (f(p0, p4), f(p1, p5), f(p2, p6), f(p3, p7));
+    let (r0, r1) = (f(q0, q2), f(q1, q3));
+    let unrolled = f(r0, r1);
 
     // make it clear to the optimizer that this loop is short
     // and can not be autovectorized.
+    let mut partial = init();
     for i in 0..xs.len() {
         if i >= 7 { break; }
-        acc = f(acc.clone(), xs[i].clone())
+        partial = f(partial.clone(), xs[i].clone())
     }
-    acc
+
+    f(unrolled, partial)
 }
 
 /// Compute the dot product.
@@ -126,3 +212,15 @@ pub fn unrolled_eq<A>(xs: &[A], ys: &[A]) -> bool
 
     true
 }
+
+#[cfg(test)]
+mod tests {
+    use quickcheck_macros::quickcheck;
+    use std::num::Wrapping;
+    use super::iterator_pairwise_sum;
+
+    #[quickcheck]
+    fn iterator_pairwise_sum_is_correct(xs: Vec<Wrapping<i32>>) -> bool {
+        iterator_pairwise_sum(xs.iter()) == xs.iter().sum()
+    }
+}
diff --git a/tests/dimension.rs b/tests/dimension.rs
index c76b8d7ad..11d6b3e0f 100644
--- a/tests/dimension.rs
+++ b/tests/dimension.rs
@@ -132,37 +132,6 @@ fn fastest_varying_order() {
 
 type ArrayF32<D> = Array<f32, D>;
 
-/*
-#[test]
-fn min_stride_axis() {
-    let a = ArrayF32::zeros(10);
-    assert_eq!(a.min_stride_axis(), Axis(0));
-
-    let a = ArrayF32::zeros((3, 3));
-    assert_eq!(a.min_stride_axis(), Axis(1));
-    assert_eq!(a.t().min_stride_axis(), Axis(0));
-
-    let a = ArrayF32::zeros(vec![3, 3]);
-    assert_eq!(a.min_stride_axis(), Axis(1));
-    assert_eq!(a.t().min_stride_axis(), Axis(0));
-
-    let min_axis = a.axes().min_by_key(|t| t.2.abs()).unwrap().axis();
-    assert_eq!(min_axis, Axis(1));
-
-    let mut b = ArrayF32::zeros(vec![2, 3, 4, 5]);
-    assert_eq!(b.min_stride_axis(), Axis(3));
-    for ax in 0..3 {
-        b.swap_axes(3, ax);
-        assert_eq!(b.min_stride_axis(), Axis(ax));
-        b.swap_axes(3, ax);
-    }
-
-    let a = ArrayF32::zeros((3, 3));
-    let v = a.broadcast((8, 3, 3)).unwrap();
-    assert_eq!(v.min_stride_axis(), Axis(0));
-}
-*/
-
 #[test]
 fn max_stride_axis() {
     let a = ArrayF32::zeros(10);