-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Performance of Trigonometric math function have unbelievable loss at .NET8 #95954
Comments
Tagging subscribers to this area: @dotnet/area-system-numerics Issue DetailsDescriptionI have a compute program that have intense Trigonometric calculation. I try to compile with .NET 8 . Then run a benchmark. Compute speed drop more than 10 times more. ConfigurationOS: Windows 11 x64 DataAnalysisThe function I use is It's faster than
on .NET 6 & .NET 7 Then I change my code to
Here is benchmark result. I think there are some serious problem with internal implementation of
|
I don't see a performance difference in the following benchmark. Can you share more information about your benchmark?
// BenchmarkDotNet harness comparing two ways of obtaining sin and cos of the
// same angle: two separate Math.Sin / Math.Cos calls versus one Math.SinCos
// call. One SimpleJob attribute is declared per runtime moniker
// (Net60, Net70, Net80).
[SimpleJob(RuntimeMoniker.Net60)]
[SimpleJob(RuntimeMoniker.Net70)]
[SimpleJob(RuntimeMoniker.Net80)]
public class Program
{
// Entry point: hands this class to BenchmarkRunner.
static void Main()
{
BenchmarkRunner.Run<Program>();
}
// Input angle for both benchmarks; the only parameter value is 123.45.
[Params(123.45)]
public double Angle { get; set; }
// Baseline: computes sine and cosine with two independent calls and returns
// their sum so both results are consumed.
[Benchmark(Baseline = true)]
public double Separate()
{
double sin = Math.Sin(Angle);
double cos = Math.Cos(Angle);
return sin + cos;
}
// Candidate: computes both values with a single Math.SinCos call
// (deconstructed from the returned tuple) and returns the same sum.
[Benchmark]
public double SinCos()
{
(double sin, double cos) = Math.SinCos(Angle);
return sin + cos;
}
} |
Here is a performance profiler result for 1000 loops of the hotspot function. In debug mode, .NET 7 and .NET 8 show the same behaviour. .NET 7 DEBUG / .NET 8 DEBUG. But in Release mode, .NET 8 costs many times more than .NET 7. .NET 7 Release / .NET 8 Release. As I mentioned before, changing the code to separate sin & cos calls makes .NET 7 and .NET 8 take the same execution time. So my inference is that |
I went down to the assembly level and what I found blew my mind. .NET 7 RELEASE
.NET 8 RELEASE
The compiler is far beyond my knowledge. I hope this information will be useful. |
This looks like Tier-0 (unoptimized code). For BenchmarkDotNet you can use the |
Please share the full benchmark you're running. |
Check this Repo. VSOP2013 run demo will lead to benchmark result |
Can you add the It's possible this is something like the JCC Erratum that exists on Intel processors. It's also possible there is some subtle codegen difference that is impacting things here. Notably this currently just defers down to the C Runtime and there were no explicit changes to the logic done in the .NET 7/8 timeframe. |
I'm seeing a regression on Intel 11th gen (Tiger Lake) as well, though not of the same magnitude as shown on the hybrid models above Here's the DisassemblyDiagnoser output Edit: Oops, pasted without reading. It's useless... Expand.NET 6.0.25 (6.0.2523.51912), X64 RyuJIT AVX2; Demo.PerfTest.Compute()
; Demo.PerfTest.Compute(): loads two values out of the object passed in rcx,
; sets edx to 4 and tail-jumps to Calculator.GetPlanetPosition.
; Save the incoming rcx in the stack slot at [rsp+8] because rcx is reused next.
; NOTE(review): presumably the caller-provided home slot - confirm.
mov [rsp+8],rcx
; rcx <- value stored at [rcx+8] of the incoming object.
mov rcx,[rcx+8]
; Reload the saved pointer and fetch the value stored at [ptr+10] into r8.
mov r8,[rsp+8]
mov r8,[r8+10]
; edx <- constant 4.
; NOTE(review): presumably the VSOPBody argument - confirm against the C# source.
mov edx,4
; Reads memory through rcx; the flags are not consumed before the jump.
; NOTE(review): presumably an implicit null check on the new rcx - confirm.
cmp [rcx],ecx
; Tail jump (no call/ret pair): the target returns directly to Compute()'s caller.
jmp near ptr VSOP2013.Calculator.GetPlanetPosition(VSOP2013.VSOPBody, VSOP2013.VSOPTime)
; Total bytes of code 30 ; VSOP2013.Calculator.GetPlanetPosition(VSOP2013.VSOPBody, VSOP2013.VSOPTime)
; GetPlanetPosition: builds a closure object, a Double[] and an Action<Int32>
; delegate, runs Parallel.For over them, then builds and returns a
; VSOPResult_ELL object in rax.
; Prologue: save the registers this routine overwrites (restored before ret)
; and reserve stack space.
push rdi
push rsi
push rbp
push rbx
sub rsp,38
; Keep the three incoming argument registers in saved registers so they
; survive the helper calls below.
; NOTE(review): presumably rcx=this, edx=VSOPBody, r8=VSOPTime - confirm.
mov rsi,rcx
mov ebx,edx
mov rdi,r8
; Helper call with the closure class descriptor in rcx; the result (rax) is
; kept in rbp. NOTE(review): presumably allocates the closure instance - confirm.
mov rcx,offset MT_VSOP2013.Calculator+<>c__DisplayClass4_0
call CORINFO_HELP_NEWSFAST
mov rbp,rax
; Store the saved first argument (rsi) into the slot at [rbp+10] via the
; reference-assignment helper (rcx = destination address, rdx = value).
lea rcx,[rbp+10]
mov rdx,rsi
call CORINFO_HELP_ASSIGN_REF
; Store the saved 32-bit second argument directly at [rbp+20].
mov [rbp+20],ebx
; Store the saved third argument (rdi) into the slot at [rbp+18].
lea rcx,[rbp+18]
mov rdx,rdi
call CORINFO_HELP_ASSIGN_REF
; Array helper call with the Double[] descriptor and edx = 6.
; NOTE(review): presumably `new double[6]` - confirm.
mov rcx,offset MT_System.Double[]
mov edx,6
call CORINFO_HELP_NEWARR_1_VC
; Store the returned array reference into the slot at [rbp+8].
lea rcx,[rbp+8]
mov rdx,rax
call CORINFO_HELP_ASSIGN_REF
; Helper call with the Action<Int32> descriptor; the result is kept in rsi.
mov rcx,offset MT_System.Action`1[[System.Int32, System.Private.CoreLib]]
call CORINFO_HELP_NEWSFAST
mov rsi,rax
; Delegate slot [rsi+8] <- closure object (rbp).
lea rcx,[rsi+8]
mov rdx,rbp
call CORINFO_HELP_ASSIGN_REF
; Delegate slot [rsi+18] <- address of the closure's b__0(Int32) method.
mov rcx,offset VSOP2013.Calculator+<>c__DisplayClass4_0.<GetPlanetPosition>b__0(Int32)
mov [rsi+18],rcx
; Call Parallel.For with edx = 0, r8d = 6 and r9 = the delegate; rcx carries
; the address of a stack slot at [rsp+20].
; NOTE(review): presumably rcx is the hidden buffer for the returned struct - confirm.
lea rcx,[rsp+20]
mov r9,rsi
xor edx,edx
mov r8d,6
call System.Threading.Tasks.Parallel.For(Int32, Int32, System.Action`1<Int32>)
; Helper call with the VSOPResult_ELL descriptor; the result is kept in rsi.
mov rcx,offset MT_VSOP2013.VSOPResult_ELL
call CORINFO_HELP_NEWSFAST
mov rsi,rax
; Copy three values out of the closure object into the result object:
; [rbp+20] -> [rsi+18] (32-bit), [rbp+18] -> [rsi+8], [rbp+8] -> [rsi+10].
mov edx,[rbp+20]
mov rax,[rbp+18]
mov rdi,[rbp+8]
mov [rsi+18],edx
lea rcx,[rsi+8]
mov rdx,rax
call CORINFO_HELP_ASSIGN_REF
lea rcx,[rsi+10]
mov rdx,rdi
call CORINFO_HELP_ASSIGN_REF
; Return the result object and undo the prologue in reverse order.
mov rax,rsi
add rsp,38
pop rbx
pop rbp
pop rsi
pop rdi
ret
; Total bytes of code 226 .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX-512F+CD+BW+DQ+VL+VBMI; Demo.PerfTest.Compute()
; Demo.PerfTest.Compute(): loads two values out of the object passed in rcx,
; sets edx to 4 and tail-jumps to Calculator.GetPlanetPosition.
; Keep the incoming pointer in r8 so rcx can be overwritten (no stack spill).
mov r8,rcx
; rcx <- value stored at [r8+8]; r8 <- value stored at [r8+10].
mov rcx,[r8+8]
mov r8,[r8+10]
; edx <- constant 4.
; NOTE(review): presumably the VSOPBody argument - confirm against the C# source.
mov edx,4
; Reads memory through rcx; the flags are not consumed before the jump.
; NOTE(review): presumably an implicit null check on the new rcx - confirm.
cmp [rcx],ecx
; Tail jump through a pointer stored in memory (indirect), rather than to a
; direct code address.
jmp qword ptr [7FFC1FB8F960]; VSOP2013.Calculator.GetPlanetPosition(VSOP2013.VSOPBody, VSOP2013.VSOPTime)
; Total bytes of code 24 ; VSOP2013.Calculator.GetPlanetPosition(VSOP2013.VSOPBody, VSOP2013.VSOPTime)
; GetPlanetPosition: builds a closure object, a Double[] and an Action<Int32>
; delegate, calls Parallel.ForWorker directly with them, then builds and
; returns a VSOPResult_ELL object in rax.
; Prologue: save the registers this routine overwrites (restored before ret),
; reserve stack space and point rbp into the frame.
push rbp
push r14
push rdi
push rsi
push rbx
sub rsp,70
lea rbp,[rsp+90]
; Keep the three incoming argument registers in saved registers so they
; survive the helper calls below.
; NOTE(review): presumably rcx=this, edx=VSOPBody, r8=VSOPTime - confirm.
mov rbx,rcx
mov edi,edx
mov rsi,r8
; Helper call with the closure class descriptor in rcx; the result (rax) is
; kept in r14. NOTE(review): presumably allocates the closure instance - confirm.
mov rcx,offset MT_VSOP2013.Calculator+<>c__DisplayClass4_0
call CORINFO_HELP_NEWSFAST
mov r14,rax
; Store the saved first argument (rbx) into the slot at [r14+10] via the
; reference-assignment helper (rcx = destination address, rdx = value).
lea rcx,[r14+10]
mov rdx,rbx
call CORINFO_HELP_ASSIGN_REF
; Store the saved 32-bit second argument directly at [r14+20].
mov [r14+20],edi
; Store the saved third argument (rsi) into the slot at [r14+18].
lea rcx,[r14+18]
mov rdx,rsi
call CORINFO_HELP_ASSIGN_REF
; Array helper call with the Double[] descriptor and edx = 6.
; NOTE(review): presumably `new double[6]` - confirm.
mov rcx,offset MT_System.Double[]
mov edx,6
call CORINFO_HELP_NEWARR_1_VC
; Store the returned array reference into the slot at [r14+8].
lea rcx,[r14+8]
mov rdx,rax
call CORINFO_HELP_ASSIGN_REF
; Helper call with the Action<Int32> descriptor; the result is kept in rbx.
mov rcx,offset MT_System.Action`1[[System.Int32, System.Private.CoreLib]]
call CORINFO_HELP_NEWSFAST
mov rbx,rax
; Delegate slot [rbx+8] <- closure object (r14).
lea rcx,[rbx+8]
mov rdx,r14
call CORINFO_HELP_ASSIGN_REF
; Delegate slot [rbx+18] <- a raw 64-bit constant; the disassembler did not
; resolve it to a symbol.
; NOTE(review): presumably the address of the closure's b__0(Int32) - confirm.
mov rcx,7FFC200B8888
mov [rbx+18],rcx
; Load a value from a fixed address and place it in the first stack argument
; slot at [rsp+20].
; NOTE(review): by position in the callee signature this is the
; ParallelOptions argument - confirm.
mov rcx,1B07C404478
mov rcx,[rcx]
mov [rsp+20],rcx
; Next stack argument slot <- the Action<Int32> delegate.
mov [rsp+28],rbx
; Remaining four stack argument slots <- 0.
xor ecx,ecx
mov [rsp+30],rcx
mov [rsp+38],rcx
mov [rsp+40],rcx
mov [rsp+48],rcx
; Register arguments: rcx = address of a frame slot at [rbp-38], rdx = the
; ForWorker method descriptor, r8d = 0, r9d = 6.
; NOTE(review): presumably rcx is the hidden buffer for the returned struct and
; rdx the generic instantiation argument - confirm.
lea rcx,[rbp-38]
mov rdx,offset MD_System.Threading.Tasks.Parallel.ForWorker[[System.Object, System.Private.CoreLib],[System.Int32, System.Private.CoreLib]](Int32, Int32, System.Threading.Tasks.ParallelOptions, System.Action`1<Int32>, System.Action`2<Int32,System.Threading.Tasks.ParallelLoopState>, System.Func`4<Int32,System.Threading.Tasks.ParallelLoopState,System.Object,System.Object>, System.Func`1<System.Object>, System.Action`1<System.Object>)
xor r8d,r8d
mov r9d,6
call qword ptr [7FFC1FBE55A8]; System.Threading.Tasks.Parallel.ForWorker[[System.__Canon, System.Private.CoreLib],[System.Int32, System.Private.CoreLib]](Int32, Int32, System.Threading.Tasks.ParallelOptions, System.Action`1<Int32>, System.Action`2<Int32,System.Threading.Tasks.ParallelLoopState>, System.Func`4<Int32,System.Threading.Tasks.ParallelLoopState,System.__Canon,System.__Canon>, System.Func`1<System.__Canon>, System.Action`1<System.__Canon>)
; Helper call with the VSOPResult_ELL descriptor; the result is kept in rbx.
mov rcx,offset MT_VSOP2013.VSOPResult_ELL
call CORINFO_HELP_NEWSFAST
mov rbx,rax
; Copy three values out of the closure object into the result object:
; [r14+20] -> [rbx+18] (32-bit), [r14+18] -> [rbx+8], [r14+8] -> [rbx+10].
; The second value is loaded straight into rdx, the helper's value register.
mov ecx,[r14+20]
mov rdx,[r14+18]
mov rsi,[r14+8]
mov [rbx+18],ecx
lea rcx,[rbx+8]
call CORINFO_HELP_ASSIGN_REF
lea rcx,[rbx+10]
mov rdx,rsi
call CORINFO_HELP_ASSIGN_REF
; Return the result object and undo the prologue in reverse order.
mov rax,rbx
add rsp,70
pop rbx
pop rsi
pop rdi
pop r14
pop rbp
ret
; Total bytes of code 290 |
.NET 6.0.25 (6.0.2523.51912), X64 RyuJIT AVX2; Demo.PerfTest.Compute()
; Demo.PerfTest.Compute(): fetches two values from the object in rcx, loads
; the constant 4 into edx and transfers control to
; Calculator.GetPlanetPosition with a jump instead of a call.
; Spill the incoming rcx to [rsp+8]; rcx is overwritten by the next load.
; NOTE(review): presumably the caller-provided home slot - confirm.
mov [rsp+8],rcx
; rcx <- value stored at [rcx+8].
mov rcx,[rcx+8]
; r8 <- value stored at offset 10 of the spilled pointer.
mov r8,[rsp+8]
mov r8,[r8+10]
; edx <- 4. NOTE(review): presumably a VSOPBody value - confirm in the C# source.
mov edx,4
; Memory read through rcx whose flags go unused before the jump.
; NOTE(review): presumably a null check - confirm.
cmp [rcx],ecx
; Jump (not call): GetPlanetPosition's ret returns to Compute()'s caller.
jmp near ptr VSOP2013.Calculator.GetPlanetPosition(VSOP2013.VSOPBody, VSOP2013.VSOPTime)
; Total bytes of code 30 ; VSOP2013.Calculator.GetPlanetPosition(VSOP2013.VSOPBody, VSOP2013.VSOPTime)
; GetPlanetPosition: creates a closure object, a Double[] and an
; Action<Int32> delegate, invokes Parallel.For, then creates a VSOPResult_ELL
; object, fills it from the closure and returns it in rax.
; Prologue: four register saves (undone before ret) plus a stack reservation.
push rdi
push rsi
push rbp
push rbx
sub rsp,38
; Move the incoming rcx / edx / r8 into saved registers for use after calls.
; NOTE(review): presumably this, VSOPBody and VSOPTime respectively - confirm.
mov rsi,rcx
mov ebx,edx
mov rdi,r8
; rbp <- result of the helper called with the closure class descriptor.
; NOTE(review): presumably a freshly allocated closure object - confirm.
mov rcx,offset MT_VSOP2013.Calculator+<>c__DisplayClass4_0
call CORINFO_HELP_NEWSFAST
mov rbp,rax
; [rbp+10] <- rsi, written through the reference-assignment helper
; (rcx = destination address, rdx = value).
lea rcx,[rbp+10]
mov rdx,rsi
call CORINFO_HELP_ASSIGN_REF
; [rbp+20] <- ebx, a plain 32-bit store.
mov [rbp+20],ebx
; [rbp+18] <- rdi, written through the reference-assignment helper.
lea rcx,[rbp+18]
mov rdx,rdi
call CORINFO_HELP_ASSIGN_REF
; Array helper called with the Double[] descriptor and the count 6 in edx.
; NOTE(review): presumably a six-element double array - confirm.
mov rcx,offset MT_System.Double[]
mov edx,6
call CORINFO_HELP_NEWARR_1_VC
; [rbp+8] <- returned array reference.
lea rcx,[rbp+8]
mov rdx,rax
call CORINFO_HELP_ASSIGN_REF
; rsi <- result of the helper called with the Action<Int32> descriptor.
mov rcx,offset MT_System.Action`1[[System.Int32, System.Private.CoreLib]]
call CORINFO_HELP_NEWSFAST
mov rsi,rax
; [rsi+8] <- closure object.
lea rcx,[rsi+8]
mov rdx,rbp
call CORINFO_HELP_ASSIGN_REF
; [rsi+18] <- address of the closure method b__0(Int32).
mov rcx,offset VSOP2013.Calculator+<>c__DisplayClass4_0.<GetPlanetPosition>b__0(Int32)
mov [rsi+18],rcx
; Parallel.For call: edx = 0, r8d = 6, r9 = delegate, rcx = address of [rsp+20].
; NOTE(review): rcx presumably receives the struct return value - confirm.
lea rcx,[rsp+20]
mov r9,rsi
xor edx,edx
mov r8d,6
call System.Threading.Tasks.Parallel.For(Int32, Int32, System.Action`1<Int32>)
; rsi <- result of the helper called with the VSOPResult_ELL descriptor.
mov rcx,offset MT_VSOP2013.VSOPResult_ELL
call CORINFO_HELP_NEWSFAST
mov rsi,rax
; Populate the result from the closure: [rsi+18] <- [rbp+20] (32-bit),
; [rsi+8] <- [rbp+18], [rsi+10] <- [rbp+8].
mov edx,[rbp+20]
mov rax,[rbp+18]
mov rdi,[rbp+8]
mov [rsi+18],edx
lea rcx,[rsi+8]
mov rdx,rax
call CORINFO_HELP_ASSIGN_REF
lea rcx,[rsi+10]
mov rdx,rdi
call CORINFO_HELP_ASSIGN_REF
; rax <- result object; release the stack and restore the saved registers.
mov rax,rsi
add rsp,38
pop rbx
pop rbp
pop rsi
pop rdi
ret
; Total bytes of code 226 .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2; Demo.PerfTest.Compute()
; Demo.PerfTest.Compute(): fetches two values from the object in rcx, loads
; the constant 4 into edx and transfers control to
; Calculator.GetPlanetPosition with a jump instead of a call.
; Copy the incoming pointer to r8 so rcx can be reused without a stack spill.
mov r8,rcx
; rcx <- value stored at [r8+8]; r8 <- value stored at [r8+10].
mov rcx,[r8+8]
mov r8,[r8+10]
; edx <- 4. NOTE(review): presumably a VSOPBody value - confirm in the C# source.
mov edx,4
; Memory read through rcx whose flags go unused before the jump.
; NOTE(review): presumably a null check - confirm.
cmp [rcx],ecx
; Indirect jump: the destination is read from the quoted memory location.
jmp qword ptr [7FFE58BD7870]; VSOP2013.Calculator.GetPlanetPosition(VSOP2013.VSOPBody, VSOP2013.VSOPTime)
; Total bytes of code 24 ; VSOP2013.Calculator.GetPlanetPosition(VSOP2013.VSOPBody, VSOP2013.VSOPTime)
; GetPlanetPosition: creates a closure object, a Double[] and an
; Action<Int32> delegate, calls Parallel.ForWorker directly, then creates a
; VSOPResult_ELL object, fills it from the closure and returns it in rax.
; Prologue: five register saves (undone before ret), a stack reservation, and
; rbp set to an address inside the frame.
push rbp
push r14
push rdi
push rsi
push rbx
sub rsp,70
lea rbp,[rsp+90]
; Move the incoming rcx / edx / r8 into saved registers for use after calls.
; NOTE(review): presumably this, VSOPBody and VSOPTime respectively - confirm.
mov rbx,rcx
mov edi,edx
mov rsi,r8
; r14 <- result of the helper called with the closure class descriptor.
; NOTE(review): presumably a freshly allocated closure object - confirm.
mov rcx,offset MT_VSOP2013.Calculator+<>c__DisplayClass4_0
call CORINFO_HELP_NEWSFAST
mov r14,rax
; [r14+10] <- rbx, written through the reference-assignment helper
; (rcx = destination address, rdx = value).
lea rcx,[r14+10]
mov rdx,rbx
call CORINFO_HELP_ASSIGN_REF
; [r14+20] <- edi, a plain 32-bit store.
mov [r14+20],edi
; [r14+18] <- rsi, written through the reference-assignment helper.
lea rcx,[r14+18]
mov rdx,rsi
call CORINFO_HELP_ASSIGN_REF
; Array helper called with the Double[] descriptor and the count 6 in edx.
; NOTE(review): presumably a six-element double array - confirm.
mov rcx,offset MT_System.Double[]
mov edx,6
call CORINFO_HELP_NEWARR_1_VC
; [r14+8] <- returned array reference.
lea rcx,[r14+8]
mov rdx,rax
call CORINFO_HELP_ASSIGN_REF
; rbx <- result of the helper called with the Action<Int32> descriptor.
mov rcx,offset MT_System.Action`1[[System.Int32, System.Private.CoreLib]]
call CORINFO_HELP_NEWSFAST
mov rbx,rax
; [rbx+8] <- closure object.
lea rcx,[rbx+8]
mov rdx,r14
call CORINFO_HELP_ASSIGN_REF
; [rbx+18] <- address of the closure method b__0(Int32).
mov rcx,offset VSOP2013.Calculator+<>c__DisplayClass4_0.<GetPlanetPosition>b__0(Int32)
mov [rbx+18],rcx
; Read a value from a fixed address into the first stack argument slot [rsp+20].
; NOTE(review): by position in the callee signature this is the
; ParallelOptions argument - confirm.
mov rcx,15CCB006470
mov rcx,[rcx]
mov [rsp+20],rcx
; [rsp+28] <- the Action<Int32> delegate.
mov [rsp+28],rbx
; [rsp+30] .. [rsp+48] <- 0 for the four remaining stack arguments.
xor ecx,ecx
mov [rsp+30],rcx
mov [rsp+38],rcx
mov [rsp+40],rcx
mov [rsp+48],rcx
; Register arguments: rcx = address of [rbp-38], rdx = ForWorker method
; descriptor, r8d = 0, r9d = 6.
; NOTE(review): rcx presumably receives the struct return value and rdx is the
; generic instantiation argument - confirm.
lea rcx,[rbp-38]
mov rdx,offset MD_System.Threading.Tasks.Parallel.ForWorker[[System.Object, System.Private.CoreLib],[System.Int32, System.Private.CoreLib]](Int32, Int32, System.Threading.Tasks.ParallelOptions, System.Action`1<Int32>, System.Action`2<Int32,System.Threading.Tasks.ParallelLoopState>, System.Func`4<Int32,System.Threading.Tasks.ParallelLoopState,System.Object,System.Object>, System.Func`1<System.Object>, System.Action`1<System.Object>)
xor r8d,r8d
mov r9d,6
call qword ptr [7FFE58BDCED0]; System.Threading.Tasks.Parallel.ForWorker[[System.__Canon, System.Private.CoreLib],[System.Int32, System.Private.CoreLib]](Int32, Int32, System.Threading.Tasks.ParallelOptions, System.Action`1<Int32>, System.Action`2<Int32,System.Threading.Tasks.ParallelLoopState>, System.Func`4<Int32,System.Threading.Tasks.ParallelLoopState,System.__Canon,System.__Canon>, System.Func`1<System.__Canon>, System.Action`1<System.__Canon>)
; rbx <- result of the helper called with the VSOPResult_ELL descriptor.
mov rcx,offset MT_VSOP2013.VSOPResult_ELL
call CORINFO_HELP_NEWSFAST
mov rbx,rax
; Populate the result from the closure: [rbx+18] <- [r14+20] (32-bit),
; [rbx+8] <- [r14+18] (loaded directly into rdx), [rbx+10] <- [r14+8].
mov ecx,[r14+20]
mov rdx,[r14+18]
mov rsi,[r14+8]
mov [rbx+18],ecx
lea rcx,[rbx+8]
call CORINFO_HELP_ASSIGN_REF
lea rcx,[rbx+10]
mov rdx,rsi
call CORINFO_HELP_ASSIGN_REF
; rax <- result object; release the stack and restore the saved registers.
mov rax,rbx
add rsp,70
pop rbx
pop rsi
pop rdi
pop r14
pop rbp
ret
; Total bytes of code 290 |
You point out
|
Inference based on current situation.
|
If not using stackalloc fixes things for you, then great. It would be good to understand what is going on more deeply as it is still a bit mysterious, especially the huge performance differences you see. The only things I know of that cause that magnitude of perf issues are some very rare cases like handling partially initialized vector data. If you can run as admin on your box where you see very slow behavior, can you try and capture the ETW profile like I did above?
BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); instead of using
As part of the run it will print a line like the following:
Share out the ETL files for .NET 6 and .NET 8 and I or somebody else will dig in. Feel free to investigate on your own, if you know your way around perfview. |
etl files here |
I found IL Code of .Net6 and .Net8 are identical.
|
Thanks for the ETL files—is the table just above that text from the same run? When I analyze those files I didn't immediately see the huge slowdown in .NET 8. The raw data shows
and the time for 8.0 if anything is a bit faster. This assumes that Benchmark Dot Net actually did the same amount of work (iterations) per interval. To verify this we need to look at the log—do you happen to still have On my local runs BDN did 256 iterations per interval, but looking at your data above and I suspect it may have only done 1024.
|
Here is the log. It was generated simultaneously with the ETL files. |
When I look at this native assembly text comparison result Further inference: a stackalloc variable causes RyuJIT to think there are "not enough registers". Left .NET 6 / Right .NET 8. Same situation on Left .NET 6 / Right .NET 8. This is the limit of my knowledge. |
Per the log, BDN is NOT doing the same amount of work. It runs 1024 iterations/invocation with .NET 6, but only 64 for .NET 8.
Given that, the reported results are correct:
So the questions are: (1) why does BDN's strategy diverge, and (2) why does this lead to quite different results overall? Not clear yet which is cause and effect, but it seems like the benchmark runs slower so BDN does less work to meet its iteration time goal. In my local profiles I see the memory traffic in |
BDN (by default) does a variable amount of iterations depending on how much time the code takes. |
That's correct. It targets a run time of ~.5s and sets the You can set |
Thanks all, I know all too well how BDN's strategy can shift about. @kingsznhone can you try this experiment (with the unmodified
We suspect maybe we are seeing AVX-SSE transition penalties triggered by the fact that the This fits what we know so far pretty well:
|
@AndyAyersMS Here is the result you need. It's closer to the answer. I attached etl files below btw, I wonder why I would prefer
|
Related: #82132 (comment) |
@tannergooding any thoughts on this? |
The issue here isn't the tuple, that's desirable from a usability perspective and for performance/efficiency on most platforms. The actual issue here ends up being two parts:
For 1, there isn't much we can do. The code quality will ideally improve over time or we may be able to explicitly work around the issue by calling Sin and Cos from Managed for Windows in particular. For 2, this can be closed as a duplicate of #82132 The simple fix is we should be emitting On modern hardware, We have a separate issue (#11496) tracking our existing overuse of |
|
Thanks for your work. Will this fix patch be released with the next .NET8 minor Release? |
@kingsznhone I commented on that here: #98261 (comment) Reposting for convenience:
|
Description
I have a compute program that performs intense trigonometric calculations.
I compiled it with .NET 8 and then ran a benchmark.
Compute speed dropped by more than 10 times.
Configuration
OS: Windows 11 x64
Runtime: .NET 6 7 8
Data
Analysis
The function I use is
(su,cu)= Math.SinCos(u);
It's faster than
on .NET 6 & .NET 7
Then I change my code to
Here is the benchmark result.
I think there is a serious problem with the internal implementation of the
Math.SinCos()
method. The text was updated successfully, but these errors were encountered: