Update loops in CpuMath to be more efficient (#1177)

* Update loops to be more efficient
dotnet · Oct 31, 2018 · 71c9ff3
1 parent 273e36c
commit 71c9ff3
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 6 deletions.
diff --git a/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs b/src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
@@ -793,6 +793,7 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector256<float> scalarVector256 = Avx.SetAllVector256(scalar);
 
@@ -807,7 +808,7 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
 
                 Vector128<float> scalarVector128 = Sse.SetAllVector128(scalar);
 
-                if (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, scalarVector128);
@@ -956,6 +957,7 @@ public static unsafe void ScaleSrcU(float scale, ReadOnlySpan<float> src, Span<f
                 float* pDstEnd = pdst + count;
                 float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector256<float> scaleVector256 = Avx.SetAllVector256(scale);
 
@@ -971,7 +973,7 @@ public static unsafe void ScaleSrcU(float scale, ReadOnlySpan<float> src, Span<f
 
                 Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);
 
-                if (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Multiply(srcVector, scaleVector128);
@@ -1000,6 +1002,7 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector256<float> a256 = Avx.SetAllVector256(a);
                 Vector256<float> b256 = Avx.SetAllVector256(b);
@@ -1017,7 +1020,7 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
                 Vector128<float> a128 = Sse.SetAllVector128(a);
                 Vector128<float> b128 = Sse.SetAllVector128(b);
 
-                if (pDstCurrent + 4 <= pDstEnd)
+                if (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, b128);

diff --git a/src/Microsoft.ML.CpuMath/SseIntrinsics.cs b/src/Microsoft.ML.CpuMath/SseIntrinsics.cs
@@ -755,10 +755,11 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector128<float> scalarVector = Sse.SetAllVector128(scalar);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                while (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, scalarVector);
@@ -898,10 +899,11 @@ public static unsafe void ScaleSrcU(float scale, ReadOnlySpan<float> src, Span<f
                 float* pDstEnd = pdst + count;
                 float* pSrcCurrent = psrc;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector128<float> scaleVector = Sse.SetAllVector128(scale);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                while (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
                     srcVector = Sse.Multiply(srcVector, scaleVector);
@@ -930,11 +932,12 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
             {
                 float* pDstEnd = pdst + dst.Length;
                 float* pDstCurrent = pdst;
+                float* destinationEnd = pDstEnd - 4;
 
                 Vector128<float> aVector = Sse.SetAllVector128(a);
                 Vector128<float> bVector = Sse.SetAllVector128(b);
 
-                while (pDstCurrent + 4 <= pDstEnd)
+                while (pDstCurrent <= destinationEnd)
                 {
                     Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
                     dstVector = Sse.Add(dstVector, bVector);