Merge branch 'upstream-eigen3' into update_eigen3.3.8

Changes from last update: ``` git shortlog --no-merges 89449a0821676c5af96cc51973de^..0fd6b4f71dd85b20 | cat ``` ``` Alexander Grund (1): Make relative path variables of type STRING Christoph Hertzberg (7): Remove .hgignore and copy .gitignore from master branch Bug InsightSoftwareConsortium#1788: Fix rule-of-three violations inside the stable modules. This fixes deprecated-copy warnings when compiling with GCC>=9 Also protect some additional Base-constructors from getting called by user code (InsightSoftwareConsortium#1587) Fix some maybe-uninitialized warnings Bug InsightSoftwareConsortium#2036 make sure find_standard_math_library_test_program actually compiles (and is guaranteed to call math functions) bug InsightSoftwareConsortium#1746: Removed implementation of standard copy-constructor and standard copy-assign-operator from PermutationMatrix and Transpositions to allow malloc-less std::move. Added unit-test to rvalue_types Remove unused variable Fix doxygen class block that was wrongly named. 
David Tellenbach (9): Fix StlDeque for GCC 10 Bump to 3.3.8-rc1 Fix failure in GEBP kernel when compiling with OpenMP and FMA Fix undefined behaviour caused by uncaught exceptions in OMP section Bump to 3.3.8 Define coeff-wise binary array operators for base class Mention problems when using potentially throwing scalars and OpenMP Rename test/array.cpp to test/array_cwise.cpp Bump to 3.3.9 Eugene Zhulenev (1): Change typedefs from private to protected to fix MSVC compilation Florian Maurin (1): Fix typo in doc Gael Guennebaud (11): Add missing footer declaration remove piwik tracker add a banner to advertise the survey Fix compilation with AVX512 Fix compilation with AVX512 and AVX/SSE packet-math functions Backport AVX512 implementation from devel branch to 3.3 (the 3.3 version had many issues) Fix InsightSoftwareConsortium#1974: assertion when reserving an empty sparse matrix Relaxed fastmath unit test: if std::foo fails, then let's only trigger a warning if numext::foo fails too. A true error will be triggered only if std::foo works but our numext::foo fails. fix InsightSoftwareConsortium#1901: warning in Mode==(Upper|Lower) relax number of iterations checks to avoid false negatives check two ctors Janek Kozicki (1): Fix Yade high precision Real compilation Jim Lersch (1): Workaround for doxygen class template titles in which the template part of the class signature is lost due to a problem with forward declarations. The problem is probably caused by doxygen bug #7689. It is confirmed to be fixed in doxygen >= 1.8.19. Karl Ljungkvist (1): Fix typo in Tutorial_BlockOperations_block_assignment.cpp Luke Peterson (1): Remove error counting in OpenMP parallelize_gemm Martin Vonheim Larsen (1): Enable MathJax in Doxygen.in Rasmus Munk Larsen (1): Fix incorrect use of std::abs reported in InsightSoftwareConsortium#1823. Simon Pfreundschuh (1): Replaced call to deprecated 'load' function with appropriate call to 'on'. 
Tobias Bosch (1): Include <sstream> explicitly, and don't rely on the implicit include via <complex>. nluehr (1): Fix incorrect integer cast in predux<half2>(). szczepaniak bartek (1): Fix Pardiso. ```
phcerdan · Dec 8, 2020 · 3448164 · 3448164
2 parents 13089e2 + fbe9859
commit 3448164
Show file tree

Hide file tree

Showing 32 changed files with 503 additions and 507 deletions.
diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt b/Modules/ThirdParty/Eigen3/src/itkeigen/CMakeLists.txt
@@ -394,22 +394,27 @@ endif()
 
 if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR)
   set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR}
-      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
+      CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed")
 else()
   set(INCLUDE_INSTALL_DIR
       "${CMAKE_INSTALL_INCLUDEDIR}/eigen3"
-      CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
+      CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed"
       )
 endif()
 set(CMAKEPACKAGE_INSTALL_DIR
     "${CMAKE_INSTALL_DATADIR}/eigen3/cmake"
-    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
+    CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed"
     )
 set(PKGCONFIG_INSTALL_DIR
     "${CMAKE_INSTALL_DATADIR}/pkgconfig"
-    CACHE PATH "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
+    CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed"
     )
 
+foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR)
+  if(IS_ABSOLUTE "${${var}}")
+    message(FATAL_ERROR "${var} must be relative to CMAKE_PREFIX_PATH. Got: ${${var}}")
+  endif()
+endforeach()
 
 # similar to set_target_properties but append the property instead of overwriting it
 macro(ei_add_target_property target prop value)

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/Core b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/Core
@@ -279,7 +279,10 @@
 #include <cmath>
 #include <cassert>
 #include <functional>
-#include <iosfwd>
+#include <sstream>
+#ifndef EIGEN_NO_IO
+  #include <iosfwd>
+#endif
 #include <cstring>
 #include <string>
 #include <limits>
@@ -375,7 +378,9 @@ using std::ptrdiff_t;
 
 #if defined EIGEN_VECTORIZE_AVX512
   #include "src/Core/arch/SSE/PacketMath.h"
+  #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/AVX/PacketMath.h"
+  #include "src/Core/arch/AVX/MathFunctions.h"
   #include "src/Core/arch/AVX512/PacketMath.h"
   #include "src/Core/arch/AVX512/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_AVX

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/ArrayBase.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/ArrayBase.h
@@ -153,8 +153,8 @@ template<typename Derived> class ArrayBase
 //     inline void evalTo(Dest& dst) const { dst = matrix(); }
 
   protected:
-    EIGEN_DEVICE_FUNC
-    ArrayBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
 
   private:
     explicit ArrayBase(Index);

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/CwiseUnaryView.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/CwiseUnaryView.h
@@ -121,6 +121,8 @@ class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
     {
       return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
     }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
 };
 
 } // end namespace Eigen

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/DenseBase.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/DenseBase.h
@@ -587,11 +587,12 @@ template<typename Derived> class DenseBase
     }
 
   protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
     /** Default constructor. Do nothing. */
     EIGEN_DEVICE_FUNC DenseBase()
     {
       /* Just checks for self-consistency of the flags.
-       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
+       * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
        */
 #ifdef EIGEN_INTERNAL_DEBUGGING
       EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/MapBase.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/MapBase.h
@@ -182,6 +182,8 @@ template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
     #endif
 
   protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
 
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -294,6 +296,9 @@ template<typename Derived> class MapBase<Derived, WriteAccessors>
     // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
     // see bugs 821 and 920.
     using ReadOnlyMapBase::Base::operator=;
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
 };
 
 #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/MatrixBase.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/MatrixBase.h
@@ -464,7 +464,8 @@ template<typename Derived> class MatrixBase
     EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex<RealScalar>& p)
 
   protected:
-    EIGEN_DEVICE_FUNC MatrixBase() : Base() {}
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)
 
   private:
     EIGEN_DEVICE_FUNC explicit MatrixBase(int);

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/PermutationMatrix.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/PermutationMatrix.h
@@ -87,17 +87,6 @@ class PermutationBase : public EigenBase<Derived>
       return derived();
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const PermutationBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
-
     /** \returns the number of rows */
     inline Index rows() const { return Index(indices().size()); }
 
@@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
     inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
-    #endif
-
     /** Generic constructor from expression of the indices. The indices
       * array has the meaning that the permutations sends each integer i to indices[i].
       *
@@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
       return Base::operator=(tr.derived());
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    PermutationMatrix& operator=(const PermutationMatrix& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** const version of indices(). */
     const IndicesType& indices() const { return m_indices; }
     /** \returns a reference to the stored array representing the permutation. */

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/Transpose.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/Transpose.h
@@ -146,6 +146,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
     {
       return derived().nestedExpression().coeffRef(index);
     }
+  protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
 };
 
 /** \returns an expression of the transpose of *this.

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/Transpositions.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/Transpositions.h
@@ -33,17 +33,6 @@ class TranspositionsBase
       indices() = other.indices();
       return derived();
     }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const TranspositionsBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
 
     /** \returns the number of transpositions */
     Index size() const { return indices().size(); }
@@ -171,12 +160,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
     inline Transpositions(const TranspositionsBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
-    #endif
-
     /** Generic constructor from expression of the transposition indices. */
     template<typename Other>
     explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
@@ -189,17 +172,6 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
       return Base::operator=(other);
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Transpositions& operator=(const Transpositions& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** Constructs an uninitialized permutation matrix of given size.
       */
     inline Transpositions(Index size) : m_indices(size)
@@ -306,17 +278,6 @@ class TranspositionsWrapper
       return Base::operator=(other);
     }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
     /** const version of indices(). */
     const IndicesType& indices() const { return m_indices; }
 

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/TriangularMatrix.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/TriangularMatrix.h
@@ -217,9 +217,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix)
     {}
 
-    using Base::operator=;
-    TriangularView& operator=(const TriangularView &other)
-    { return Base::operator=(other); }
+    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
 
     /** \copydoc EigenBase::rows() */
     EIGEN_DEVICE_FUNC
@@ -544,6 +542,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
     template<typename ProductType>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta);
+  protected:
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
+
 };
 
 /***************************************************************************

diff --git a/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Modules/ThirdParty/Eigen3/src/itkeigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -29,6 +29,7 @@ namespace internal {
 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
   const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
 
+
 // Natural logarithm
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@@ -47,6 +48,7 @@ plog<Packet16f>(const Packet16f& _x) {
   // The smallest non denormalized float number.
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000);
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000);
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);
 
   // Polynomial coefficients.
@@ -64,11 +66,9 @@ plog<Packet16f>(const Packet16f& _x) {
   _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f);
 
   // invalid_mask is set to true when x is NaN
-  __mmask16 invalid_mask =
-      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
-  __mmask16 iszero_mask =
-      _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ);
-
+  __mmask16 invalid_mask =  _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ);
+  __mmask16 iszero_mask  =  _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ);
+
   // Truncate input values to the minimum positive normal.
   x = pmax(x, p16f_min_norm_pos);
 
@@ -118,11 +118,18 @@ plog<Packet16f>(const Packet16f& _x) {
   x = padd(x, y);
   x = padd(x, y2);
 
-  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+  __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN,
+  //  - 0 will be -INF.
+  //  - +INF will be +INF
   return _mm512_mask_blend_ps(iszero_mask,
-                              _mm512_mask_blend_ps(invalid_mask, x, p16f_nan),
-                              p16f_minus_inf);
+            _mm512_mask_blend_ps(invalid_mask,
+              _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf),
+              p16f_nan),
+            p16f_minus_inf);
 }
+
 #endif
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
@@ -258,48 +265,39 @@ pexp<Packet8d>(const Packet8d& _x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 psqrt<Packet16f>(const Packet16f& _x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
-  _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);
+  Packet16f neg_half = pmul(_x, pset1<Packet16f>(-.5f));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_ps_mask(_x, pset1<Packet16f>((std::numeric_limits<float>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ));
 
-  Packet16f neg_half = pmul(_x, p16f_minus_half);
-
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ);
-  Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_setzero_ps(), _mm512_rsqrt14_ps(_x));
+  Packet16f x = _mm512_rsqrt14_ps(_x);
 
   // Do a single step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet16f>(1.5f)));
 
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  // Flush results for denormals to zero.
+  return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps());
 }
 
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 psqrt<Packet8d>(const Packet8d& _x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
-  _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);
+  Packet8d neg_half = pmul(_x, pset1<Packet8d>(-.5));
+  __mmask16 denormal_mask = _mm512_kand(
+      _mm512_cmp_pd_mask(_x, pset1<Packet8d>((std::numeric_limits<double>::min)()),
+                        _CMP_LT_OQ),
+      _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ));
 
-  Packet8d neg_half = pmul(_x, p8d_minus_half);
+  Packet8d x = _mm512_rsqrt14_pd(_x);
 
-  // select only the inverse sqrt of positive normal inputs (denormals are
-  // flushed to zero and cause infs as well).
-  __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ);
-  Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_setzero_pd(), _mm512_rsqrt14_pd(_x));
-
-  // Do a first step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
 
   // Do a second step of Newton's iteration.
-  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+  x = pmul(x, pmadd(neg_half, pmul(x, x), pset1<Packet8d>(1.5)));
 
-  // Multiply the original _x by it's reciprocal square root to extract the
-  // square root.
-  return pmul(_x, x);
+  return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd());
 }
 #else
 template <>