I accidentaly found performance reggresion using"mandelbrot algorithm" #80757

milen-denev · 2023-01-17T23:31:31Z

The place from where I copied the code: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-csharpcore-1.html

Actual Code:

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;

namespace Test;

public static class Program
{
    public static void Main() 
    {
        BenchmarkRunner.Run<Benchy>();
    }

    [MemoryDiagnoser(true)]
    [SimpleJob(RuntimeMoniker.Net70)]
    [SimpleJob(RuntimeMoniker.Net60, baseline: true)]
    public class Benchy
    {
        [Benchmark]
        public void Benchmark1()
        {
            MandelBrot.Default();
        }
    }

    public class MandelBrot
    {
        // x86 version, AVX2
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static byte Process8(double x, double y, double dx)
        {
            // initial x coords
            var x01 = Vector256.Create(x + 0 * dx, x + 1 * dx, x + 2 * dx, x + 3 * dx);
            var x02 = Vector256.Create(x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx);

            // initial y coords
            var y0 = Vector256.Create(y);

            Vector256<double> x1 = x01, y1 = y0; // current iteration 1
            Vector256<double> x2 = x02, y2 = y0; // current iteration 2

            Vector256<double> four = Vector256.Create(4.0); // 4 in each slot        

            var pass = 0;

            // temp space, C# requires init.
            Vector256<double>
                x12 = Vector256<double>.Zero,
                y12 = Vector256<double>.Zero,
                x22 = Vector256<double>.Zero,
                y22 = Vector256<double>.Zero;

            // bit masks for results
            uint res1 = 1, res2 = 1;

            while (pass < 49 && (res1 != 0 || res2 != 0))
            {

                // do several between checks a time like other code
                for (var p = 0; p < 7; ++p)
                {
                    // unroll loop 2x to decrease register stalls

                    // squares x*x and y*y
                    x12 = Avx2.Multiply(x1, x1);
                    y12 = Avx2.Multiply(y1, y1);
                    x22 = Avx2.Multiply(x2, x2);
                    y22 = Avx2.Multiply(y2, y2);

                    // mixed products x*y
                    var xy1 = Avx2.Multiply(x1, y1);
                    var xy2 = Avx2.Multiply(x2, y2);

                    // diff of squares x*x - y*y
                    var ds1 = Avx2.Subtract(x12, y12);
                    var ds2 = Avx2.Subtract(x22, y22);

                    // 2*x*y
                    xy1 = Avx2.Add(xy1, xy1);
                    xy2 = Avx2.Add(xy2, xy2);

                    // next iters
                    y1 = Avx2.Add(xy1, y0);
                    y2 = Avx2.Add(xy2, y0);
                    x1 = Avx2.Add(ds1, x01);
                    x2 = Avx2.Add(ds2, x02);
                }
                pass += 7;

                // numbers overflow, which gives an Infinity or NaN, which, 
                // when compared N < 4, results in false, which is what we want

                // sum of squares x*x + y*y, compare to 4 (escape mandelbrot)
                var ss1 = Avx2.Add(x12, y12);
                var ss2 = Avx2.Add(x22, y22);

                // compare - puts all 0 in reg if false, else all 1 (=NaN bitwise)
                // when each register is 0, then all points escaped, so exit
                var cmp1 = Avx.Compare(ss1, four,
                        FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
                var cmp2 = Avx.Compare(ss2, four,
                        FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);

                // take top bit from each byte
                res1 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp1));
                res2 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp2));
            }

            // can make a mask of bits in any order, which is the +7, +6, .., +1, +0
            res1 &=
                (1 << (0 + 7)) |
                (1 << (8 + 6)) |
                (1 << (16 + 5)) |
                (1 << (24 + 4));
            res2 &=
                (1 << (0 + 3)) |
                (1 << (8 + 2)) |
                (1 << (16 + 1)) |
                (1 << (24 + 0));

            var res = res1 | res2;
            res |= res >> 16;
            res |= res >> 8;
            return (byte)(res);
        }

        public static void MainNew()
        {

            var size = 200;
            var lineLength = size >> 3;
            var data = new byte[size * lineLength];

            // step size
            var delta = 2.0 / size; // (0.5 - (-1.5))/size;

            Parallel.For(0, size, y =>
            {
                var yd = y * delta - 1;
                for (var x = 0; x < lineLength; x++)
                {
                    var xd = (x * 8) * delta - 1.5;
                    data[y * lineLength + x] = Process8(xd, yd, delta);
                }
            });
        }

        public static void Default()
        {
            MainNew();
        }
    }
}

Configuration

BenchmarkDotNet=v0.13.4, OS=Windows 11 (10.0.22621.1105)
Intel Core i5-10400 CPU 2.90GHz, 1 CPU, 12 logical and 6 physical cores
.NET SDK=7.0.102
[Host] : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2
.NET 6.0 : .NET 6.0.13 (6.0.1322.58009), X64 RyuJIT AVX2
.NET 7.0 : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2

   <TargetFrameworks>net6.0;net7.0;</TargetFrameworks>
   <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   <PublishAot>false</PublishAot>
   <ServerGarbageCollection>true</ServerGarbageCollection>
   <TieredPGO>true</TieredPGO>
   <TieredCompilationQuickJitForLoops>true</TieredCompilationQuickJitForLoops>
   <Platforms>AnyCPU;x64</Platforms>

Regression?

The Regression is tested from .NET 6 -> .NET 7

Data

Method	Job	Runtime	Mean	Error	StdDev	Ratio	RatioSD	Gen0	Allocated	Alloc Ratio
Benchmark1	.NET 6.0	.NET 6.0	81.72 us	1.496 us	1.400 us	1.00	0.00	0.1221	8.69 KB	1.00
Benchmark1	.NET 7.0	.NET 7.0	101.81 us	0.699 us	0.654 us	1.25	0.02	-	8.53 KB	0.98

The text was updated successfully, but these errors were encountered:

dotnet-issue-labeler · 2023-01-17T23:31:34Z

I couldn't figure out the best area label to add to this issue. If you have write-permissions please help me learn by adding exactly one area label.

ghost · 2023-01-17T23:36:57Z

Tagging subscribers to this area: @JulieLeeMSFT, @jakobbotsch, @kunalspathak
See info in area-owners.md if you want to be subscribed.

Issue Details

The place from where I copied the code: https://benchmarksgame-team.pages.debian.net/benchmarksgame/program/mandelbrot-csharpcore-1.html

Actual Code:

using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BenchmarkDotNet.Running;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;

namespace Test;

public static class Program
{
    public static void Main() 
    {
        BenchmarkRunner.Run<Benchy>();
    }

    [MemoryDiagnoser(true)]
    [SimpleJob(RuntimeMoniker.Net70)]
    [SimpleJob(RuntimeMoniker.Net60, baseline: true)]
    public class Benchy
    {
        [Benchmark]
        public void Benchmark1()
        {
            MandelBrot.Default();
        }
    }

    public class MandelBrot
    {
        // x86 version, AVX2
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        static byte Process8(double x, double y, double dx)
        {
            // initial x coords
            var x01 = Vector256.Create(x + 0 * dx, x + 1 * dx, x + 2 * dx, x + 3 * dx);
            var x02 = Vector256.Create(x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx);

            // initial y coords
            var y0 = Vector256.Create(y);

            Vector256<double> x1 = x01, y1 = y0; // current iteration 1
            Vector256<double> x2 = x02, y2 = y0; // current iteration 2

            Vector256<double> four = Vector256.Create(4.0); // 4 in each slot        

            var pass = 0;

            // temp space, C# requires init.
            Vector256<double>
                x12 = Vector256<double>.Zero,
                y12 = Vector256<double>.Zero,
                x22 = Vector256<double>.Zero,
                y22 = Vector256<double>.Zero;

            // bit masks for results
            uint res1 = 1, res2 = 1;

            while (pass < 49 && (res1 != 0 || res2 != 0))
            {

                // do several between checks a time like other code
                for (var p = 0; p < 7; ++p)
                {
                    // unroll loop 2x to decrease register stalls

                    // squares x*x and y*y
                    x12 = Avx2.Multiply(x1, x1);
                    y12 = Avx2.Multiply(y1, y1);
                    x22 = Avx2.Multiply(x2, x2);
                    y22 = Avx2.Multiply(y2, y2);

                    // mixed products x*y
                    var xy1 = Avx2.Multiply(x1, y1);
                    var xy2 = Avx2.Multiply(x2, y2);

                    // diff of squares x*x - y*y
                    var ds1 = Avx2.Subtract(x12, y12);
                    var ds2 = Avx2.Subtract(x22, y22);

                    // 2*x*y
                    xy1 = Avx2.Add(xy1, xy1);
                    xy2 = Avx2.Add(xy2, xy2);

                    // next iters
                    y1 = Avx2.Add(xy1, y0);
                    y2 = Avx2.Add(xy2, y0);
                    x1 = Avx2.Add(ds1, x01);
                    x2 = Avx2.Add(ds2, x02);
                }
                pass += 7;

                // numbers overflow, which gives an Infinity or NaN, which, 
                // when compared N < 4, results in false, which is what we want

                // sum of squares x*x + y*y, compare to 4 (escape mandelbrot)
                var ss1 = Avx2.Add(x12, y12);
                var ss2 = Avx2.Add(x22, y22);

                // compare - puts all 0 in reg if false, else all 1 (=NaN bitwise)
                // when each register is 0, then all points escaped, so exit
                var cmp1 = Avx.Compare(ss1, four,
                        FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
                var cmp2 = Avx.Compare(ss2, four,
                        FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);

                // take top bit from each byte
                res1 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp1));
                res2 = (uint)Avx2.MoveMask(Vector256.AsByte(cmp2));
            }

            // can make a mask of bits in any order, which is the +7, +6, .., +1, +0
            res1 &=
                (1 << (0 + 7)) |
                (1 << (8 + 6)) |
                (1 << (16 + 5)) |
                (1 << (24 + 4));
            res2 &=
                (1 << (0 + 3)) |
                (1 << (8 + 2)) |
                (1 << (16 + 1)) |
                (1 << (24 + 0));

            var res = res1 | res2;
            res |= res >> 16;
            res |= res >> 8;
            return (byte)(res);
        }

        public static void MainNew()
        {

            var size = 200;
            var lineLength = size >> 3;
            var data = new byte[size * lineLength];

            // step size
            var delta = 2.0 / size; // (0.5 - (-1.5))/size;

            Parallel.For(0, size, y =>
            {
                var yd = y * delta - 1;
                for (var x = 0; x < lineLength; x++)
                {
                    var xd = (x * 8) * delta - 1.5;
                    data[y * lineLength + x] = Process8(xd, yd, delta);
                }
            });
        }

        public static void Default()
        {
            MainNew();
        }
    }
}

Configuration

BenchmarkDotNet=v0.13.4, OS=Windows 11 (10.0.22621.1105)
Intel Core i5-10400 CPU 2.90GHz, 1 CPU, 12 logical and 6 physical cores
.NET SDK=7.0.102
[Host] : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2
.NET 6.0 : .NET 6.0.13 (6.0.1322.58009), X64 RyuJIT AVX2
.NET 7.0 : .NET 7.0.2 (7.0.222.60605), X64 RyuJIT AVX2

   <TargetFrameworks>net6.0;net7.0;</TargetFrameworks>
   <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
   <PublishAot>false</PublishAot>
   <ServerGarbageCollection>true</ServerGarbageCollection>
   <TieredPGO>true</TieredPGO>
   <TieredCompilationQuickJitForLoops>true</TieredCompilationQuickJitForLoops>
   <Platforms>AnyCPU;x64</Platforms>

Regression?

The Regression is tested from .NET 6 -> .NET 7

Data

Method	Job	Runtime	Mean	Error	StdDev	Ratio	RatioSD	Gen0	Allocated	Alloc Ratio
Benchmark1	.NET 6.0	.NET 6.0	81.72 us	1.496 us	1.400 us	1.00	0.00	0.1221	8.69 KB	1.00
Benchmark1	.NET 7.0	.NET 7.0	101.81 us	0.699 us	0.654 us	1.25	0.02	-	8.53 KB	0.98

Author:	milen-denev
Assignees:	-
Labels:	`tenet-performance`, `area-CodeGen-coreclr`, `untriaged`
Milestone:	-

AndyAyersMS · 2023-01-17T23:56:46Z

Probably measuring OSR codegen (note this is with PGO enabled, so perhaps instrumented OSR code?).

MichalPetryka · 2023-01-18T00:07:27Z

Could you rerun the benchmark with [DisassemblyDiagnoser(maxDepth = 6)] and post the generated files here?

JulieLeeMSFT · 2023-01-19T18:58:40Z

Assigning back to @milen-denev for follow up test request.

Could you rerun the benchmark with [DisassemblyDiagnoser(maxDepth = 6)] and post the generated files here?

milen-denev · 2023-01-19T22:03:07Z

So, I am sorry for my late reply. I tested again with [DisassemblyDiagnoser(maxDepth: 6)] and actually only the second time I runned the benchmark and got a result where .net7.0 was slower:

Results (Same configuration, nothing changed):

Method	Job	Runtime	Mean	Error	StdDev	Ratio	Code Size	Allocated	Alloc Ratio
Benchmark1	.NET 6.0	.NET 6.0	125.2 us	0.73 us	0.69 us	1.00	9,680 B	9.21 KB	1.00
Benchmark1	.NET 7.0	.NET 7.0	130.5 us	1.12 us	1.05 us	1.04	10,393 B	8.97 KB	0.97

Files: https://cdn.denevcloud.net/milen-denev/bin.zip

milen-denev · 2023-01-19T22:14:23Z

I also tried:

	<TieredPGO>false</TieredPGO>
	<TieredCompilationQuickJitForLoops>false</TieredCompilationQuickJitForLoops>

Method	Job	Runtime	Mean	Error	StdDev	Ratio	Code Size	Allocated	Alloc Ratio
Benchmark1	.NET 6.0	.NET 6.0	125.1 us	0.73 us	0.68 us	1.00	9,680 B	9.22 KB	1.00
Benchmark1	.NET 7.0	.NET 7.0	127.4 us	0.28 us	0.27 us	1.02	10,393 B	8.99 KB	0.97

Files: https://cdn.denevcloud.net/milen-denev/bin2.zip

AndyAyersMS · 2023-03-30T23:44:28Z

Tried this on my box with .NET 6, .NET 7, .NET 8 preview2, and a current .NET 8 main build. There seems to be as fair bit of run to run variation here but in the handful of runs I did, .NET 8 main always was fastest.

Possibly a result of #83910, though I have not done proper diligence to prove that's the case.

I'm going to close this. @milen-denev if you get a chance to try a .NET 8 preview and still see a regression then please re-open. Please try preview 4 later so you pick up the change above.

Default

BenchmarkDotNet=v0.13.5, OS=Windows 11 (10.0.22621.1413/22H2/2022Update/SunValley2)
Intel Core i7-8700 CPU 3.20GHz (Coffee Lake), 1 CPU, 12 logical and 6 physical cores
.NET SDK=8.0.100-preview.2.23157.25
[Host] : .NET 8.0.0 (8.0.23.12803), X64 RyuJIT AVX2
Job-XFQKPO : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Job-OPADBB : .NET 6.0.15 (6.0.1523.11507), X64 RyuJIT AVX2
Job-RVDABM : .NET 7.0.4 (7.0.423.11508), X64 RyuJIT AVX2
Job-YLZPTG : .NET 8.0.0 (8.0.23.12803), X64 RyuJIT AVX2

Method	Runtime	Mean	Error	StdDev	Ratio	RatioSD
Benchmark1	.NET 6.0	100.94 us	1.918 us	1.884 us	1.00	0.00
Benchmark1	.NET 7.0	91.47 us	0.445 us	0.417 us	0.91	0.02
Benchmark1	.NET 8.0 p2	93.01 us	0.403 us	0.377 us	0.92	0.02
Benchmark1	.NET 8.0 main	82.95 us	0.646 us	0.604 us	0.82	0.02

TieredPGO

Via <TieredPGO>true</TieredPGO> in the csproj. (Note this has no effect on .NET 6, and only partial effect on .NET 7)

BenchmarkDotNet=v0.13.5, OS=Windows 11 (10.0.22621.1413/22H2/2022Update/SunValley2)
Intel Core i7-8700 CPU 3.20GHz (Coffee Lake), 1 CPU, 12 logical and 6 physical cores
.NET SDK=8.0.100-preview.2.23157.25
[Host] : .NET 8.0.0 (8.0.23.12803), X64 RyuJIT AVX2
Job-XKOTDN : .NET 8.0.0 (42.42.42.42424), X64 RyuJIT AVX2
Job-HITSWH : .NET 6.0.15 (6.0.1523.11507), X64 RyuJIT AVX2
Job-TYYPKW : .NET 7.0.4 (7.0.423.11508), X64 RyuJIT AVX2
Job-ENCIEP : .NET 8.0.0 (8.0.23.12803), X64 RyuJIT AVX2

Method	Runtime	Mean	Error	StdDev	Ratio	RatioSD
Benchmark1	.NET 6.0	91.21 us	1.824 us	4.740 us	1.00	0.00
Benchmark1	.NET 7.0	96.41 us	0.280 us	0.248 us	1.16	0.08
Benchmark1	.NET 8.0 p2	92.35 us	1.478 us	1.383 us	1.11	0.07
Benchmark1	.NET 8.0 main	82.13 us	0.292 us	0.259 us	0.99	0.07

milen-denev added the tenet-performance Performance related issue label Jan 17, 2023

ghost added the untriaged New issue has not been triaged by the area owner label Jan 17, 2023

milen-denev changed the title ~~I accidentaly found performance reggresion in "mandelbrot"~~ I accidentaly found performance reggresion using"mandelbrot algorithm" Jan 17, 2023

EgorBo added the area-CodeGen-coreclr CLR JIT compiler in src/coreclr/src/jit and related components such as SuperPMI label Jan 17, 2023

JulieLeeMSFT assigned milen-denev Jan 19, 2023

JulieLeeMSFT removed the untriaged New issue has not been triaged by the area owner label Jan 19, 2023

JulieLeeMSFT added this to the 8.0.0 milestone Jan 19, 2023

JulieLeeMSFT assigned AndyAyersMS Jan 19, 2023

milen-denev removed their assignment Jan 21, 2023

AndyAyersMS mentioned this issue Mar 30, 2023

On Stack Replacement Next Steps #33658

Open

72 tasks

AndyAyersMS closed this as completed Mar 30, 2023

ghost locked as resolved and limited conversation to collaborators Apr 30, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

I accidentaly found performance reggresion using"mandelbrot algorithm" #80757

I accidentaly found performance reggresion using"mandelbrot algorithm" #80757

milen-denev commented Jan 17, 2023

dotnet-issue-labeler bot commented Jan 17, 2023

ghost commented Jan 17, 2023

Configuration

Regression?

Data

AndyAyersMS commented Jan 17, 2023

MichalPetryka commented Jan 18, 2023 •

edited

Loading

JulieLeeMSFT commented Jan 19, 2023

milen-denev commented Jan 19, 2023 •

edited

Loading

milen-denev commented Jan 19, 2023

AndyAyersMS commented Mar 30, 2023

I accidentaly found performance reggresion using"mandelbrot algorithm" #80757

I accidentaly found performance reggresion using"mandelbrot algorithm" #80757

Comments

milen-denev commented Jan 17, 2023

Configuration

Regression?

Data

dotnet-issue-labeler bot commented Jan 17, 2023

ghost commented Jan 17, 2023

Configuration

Regression?

Data

AndyAyersMS commented Jan 17, 2023

MichalPetryka commented Jan 18, 2023 • edited Loading

JulieLeeMSFT commented Jan 19, 2023

milen-denev commented Jan 19, 2023 • edited Loading

milen-denev commented Jan 19, 2023

AndyAyersMS commented Mar 30, 2023

Default

TieredPGO

MichalPetryka commented Jan 18, 2023 •

edited

Loading

milen-denev commented Jan 19, 2023 •

edited

Loading