Skip to content

Commit

Permalink
Update loops in CpuMath to be more efficient (#1177)
Browse files: browse the repository at this point in the history
* Update loops to be more efficient
  • Loading branch information
jwood803 authored and shauheen committed Oct 31, 2018
1 parent 273e36c commit 71c9ff3
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 6 deletions.
9 changes: 6 additions & 3 deletions src/Microsoft.ML.CpuMath/AvxIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,7 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
{
float* pDstEnd = pdst + dst.Length;
float* pDstCurrent = pdst;
// Highest address from which a full 4-float (128-bit) vector can still be read
// without running past pDstEnd. Computed once so the 128-bit remainder check
// can compare pointers directly instead of re-evaluating pDstCurrent + 4.
// This must be a float*: pointer arithmetic yields a pointer, and there is no
// implicit conversion from float* to int.
float* destinationEnd = pDstEnd - 4;

Vector256<float> scalarVector256 = Avx.SetAllVector256(scalar);

Expand All @@ -807,7 +808,7 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)

Vector128<float> scalarVector128 = Sse.SetAllVector128(scalar);

if (pDstCurrent + 4 <= pDstEnd)
if (pDstCurrent <= destinationEnd)
{
Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
dstVector = Sse.Add(dstVector, scalarVector128);
Expand Down Expand Up @@ -956,6 +957,7 @@ public static unsafe void ScaleSrcU(float scale, ReadOnlySpan<float> src, Span<f
float* pDstEnd = pdst + count;
float* pSrcCurrent = psrc;
float* pDstCurrent = pdst;
// Highest destination address at which a full 4-float (128-bit) vector still
// fits before pDstEnd. Computed once so the 128-bit remainder check can compare
// pointers directly instead of re-evaluating pDstCurrent + 4.
// This must be a float*: pointer arithmetic yields a pointer, and there is no
// implicit conversion from float* to int.
float* destinationEnd = pDstEnd - 4;

Vector256<float> scaleVector256 = Avx.SetAllVector256(scale);

Expand All @@ -971,7 +973,7 @@ public static unsafe void ScaleSrcU(float scale, ReadOnlySpan<float> src, Span<f

Vector128<float> scaleVector128 = Sse.SetAllVector128(scale);

if (pDstCurrent + 4 <= pDstEnd)
if (pDstCurrent <= destinationEnd)
{
Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
srcVector = Sse.Multiply(srcVector, scaleVector128);
Expand Down Expand Up @@ -1000,6 +1002,7 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
{
float* pDstEnd = pdst + dst.Length;
float* pDstCurrent = pdst;
// Highest address from which a full 4-float (128-bit) vector can still be read
// without running past pDstEnd. Computed once so the 128-bit remainder check
// can compare pointers directly instead of re-evaluating pDstCurrent + 4.
// This must be a float*: pointer arithmetic yields a pointer, and there is no
// implicit conversion from float* to int.
float* destinationEnd = pDstEnd - 4;

Vector256<float> a256 = Avx.SetAllVector256(a);
Vector256<float> b256 = Avx.SetAllVector256(b);
Expand All @@ -1017,7 +1020,7 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
Vector128<float> a128 = Sse.SetAllVector128(a);
Vector128<float> b128 = Sse.SetAllVector128(b);

if (pDstCurrent + 4 <= pDstEnd)
if (pDstCurrent <= destinationEnd)
{
Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
dstVector = Sse.Add(dstVector, b128);
Expand Down
9 changes: 6 additions & 3 deletions src/Microsoft.ML.CpuMath/SseIntrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -755,10 +755,11 @@ public static unsafe void AddScalarU(float scalar, Span<float> dst)
{
float* pDstEnd = pdst + dst.Length;
float* pDstCurrent = pdst;
// Highest address from which a full 4-float (128-bit) vector can still be read
// without running past pDstEnd. Hoisted out of the vectorized loop so its
// condition is a plain pointer comparison rather than pDstCurrent + 4 <= pDstEnd.
// This must be a float*: pointer arithmetic yields a pointer, and there is no
// implicit conversion from float* to int.
float* destinationEnd = pDstEnd - 4;

Vector128<float> scalarVector = Sse.SetAllVector128(scalar);

while (pDstCurrent + 4 <= pDstEnd)
while (pDstCurrent <= destinationEnd)
{
Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
dstVector = Sse.Add(dstVector, scalarVector);
Expand Down Expand Up @@ -898,10 +899,11 @@ public static unsafe void ScaleSrcU(float scale, ReadOnlySpan<float> src, Span<f
float* pDstEnd = pdst + count;
float* pSrcCurrent = psrc;
float* pDstCurrent = pdst;
// Highest destination address at which a full 4-float (128-bit) vector still
// fits before pDstEnd. Hoisted out of the vectorized loop so its condition is a
// plain pointer comparison rather than pDstCurrent + 4 <= pDstEnd.
// This must be a float*: pointer arithmetic yields a pointer, and there is no
// implicit conversion from float* to int.
float* destinationEnd = pDstEnd - 4;

Vector128<float> scaleVector = Sse.SetAllVector128(scale);

while (pDstCurrent + 4 <= pDstEnd)
while (pDstCurrent <= destinationEnd)
{
Vector128<float> srcVector = Sse.LoadVector128(pSrcCurrent);
srcVector = Sse.Multiply(srcVector, scaleVector);
Expand Down Expand Up @@ -930,11 +932,12 @@ public static unsafe void ScaleAddU(float a, float b, Span<float> dst)
{
float* pDstEnd = pdst + dst.Length;
float* pDstCurrent = pdst;
// Highest address from which a full 4-float (128-bit) vector can still be read
// without running past pDstEnd. Hoisted out of the vectorized loop so its
// condition is a plain pointer comparison rather than pDstCurrent + 4 <= pDstEnd.
// This must be a float*: pointer arithmetic yields a pointer, and there is no
// implicit conversion from float* to int.
float* destinationEnd = pDstEnd - 4;

Vector128<float> aVector = Sse.SetAllVector128(a);
Vector128<float> bVector = Sse.SetAllVector128(b);

while (pDstCurrent + 4 <= pDstEnd)
while (pDstCurrent <= destinationEnd)
{
Vector128<float> dstVector = Sse.LoadVector128(pDstCurrent);
dstVector = Sse.Add(dstVector, bVector);
Expand Down

0 comments on commit 71c9ff3

Please sign in to comment.